]> git.saurik.com Git - redis.git/blob - redis.c
ecde1939ae470b2901c57e415bdb91dcf985cc70
[redis.git] / redis.c
1 /*
2 * Copyright (c) 2009-2010, Salvatore Sanfilippo <antirez at gmail dot com>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * * Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * * Neither the name of Redis nor the names of its contributors may be used
14 * to endorse or promote products derived from this software without
15 * specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 * POSSIBILITY OF SUCH DAMAGE.
28 */
29
/* Version string of this Redis source tree. */
#define REDIS_VERSION "1.3.8"
31
32 #include "fmacros.h"
33 #include "config.h"
34
35 #include <stdio.h>
36 #include <stdlib.h>
37 #include <string.h>
38 #include <time.h>
39 #include <unistd.h>
40 #define __USE_POSIX199309
41 #define __USE_UNIX98
42 #include <signal.h>
43
44 #ifdef HAVE_BACKTRACE
45 #include <execinfo.h>
46 #include <ucontext.h>
47 #endif /* HAVE_BACKTRACE */
48
49 #include <sys/wait.h>
50 #include <errno.h>
51 #include <assert.h>
52 #include <ctype.h>
53 #include <stdarg.h>
54 #include <inttypes.h>
55 #include <arpa/inet.h>
56 #include <sys/stat.h>
57 #include <fcntl.h>
58 #include <sys/time.h>
59 #include <sys/resource.h>
60 #include <sys/uio.h>
61 #include <limits.h>
62 #include <math.h>
63 #include <pthread.h>
64
65 #if defined(__sun)
66 #include "solarisfixes.h"
67 #endif
68
69 #include "redis.h"
70 #include "ae.h" /* Event driven programming library */
71 #include "sds.h" /* Dynamic safe strings */
72 #include "anet.h" /* Networking the easy way */
73 #include "dict.h" /* Hash tables */
74 #include "adlist.h" /* Linked lists */
75 #include "zmalloc.h" /* total memory usage aware version of malloc/free */
76 #include "lzf.h" /* LZF compression library */
77 #include "pqsort.h" /* Partial qsort for SORT+LIMIT */
78 #include "zipmap.h"
79
/* Error codes.
 * Note that success is 0 and failure is non-zero (-1), so results must be
 * compared against these names instead of being tested for truthiness. */
#define REDIS_OK 0
#define REDIS_ERR -1
83
/* Static server configuration */
#define REDIS_SERVERPORT 6379 /* TCP port */
#define REDIS_MAXIDLETIME (60*5) /* default client timeout */
#define REDIS_IOBUF_LEN 1024
#define REDIS_LOADBUF_LEN 1024
#define REDIS_STATIC_ARGS 8
#define REDIS_DEFAULT_DBNUM 16
#define REDIS_CONFIGLINE_MAX 1024
#define REDIS_OBJFREELIST_MAX 1000000 /* Max number of objects to cache */
#define REDIS_MAX_SYNC_TIME 60 /* Slave can't take more to sync */
#define REDIS_EXPIRELOOKUPS_PER_CRON 10 /* try to expire 10 keys/loop */
#define REDIS_MAX_WRITE_PER_EVENT (1024*64)
#define REDIS_REQUEST_MAX_SIZE (1024*1024*256) /* max bytes in inline command */

/* If more than REDIS_WRITEV_THRESHOLD write packets are pending use writev */
#define REDIS_WRITEV_THRESHOLD 3
/* Max number of iovecs used for each writev call */
#define REDIS_WRITEV_IOVEC_COUNT 256

/* Hash table parameters */
#define REDIS_HT_MINFILL 10 /* Minimal hash table fill 10% */
105
/* Command flags. These are bit values (1, 2, 4) and are OR-ed together in
 * the 'flags' field of the command table entries. */
#define REDIS_CMD_BULK 1 /* Bulk write command */
#define REDIS_CMD_INLINE 2 /* Inline command */
/* REDIS_CMD_DENYOOM deserves a longer comment: all the commands marked with
   this flag will return an error when the 'maxmemory' option is set in the
   config file and the server is using more than maxmemory bytes of memory.
   In short these commands are denied on low memory conditions. */
#define REDIS_CMD_DENYOOM 4
114
/* Object types (values stored in the 'type' field of a redis object) */
#define REDIS_STRING 0
#define REDIS_LIST 1
#define REDIS_SET 2
#define REDIS_ZSET 3
#define REDIS_HASH 4
121
/* Object encodings. Some kinds of objects, like Strings and Hashes, can be
 * internally represented in multiple ways. The 'encoding' field of the
 * object is set to one of the following values. */
#define REDIS_ENCODING_RAW 0    /* Raw representation */
#define REDIS_ENCODING_INT 1    /* Encoded as integer */
#define REDIS_ENCODING_ZIPMAP 2 /* Encoded as zipmap */
#define REDIS_ENCODING_HT 3     /* Encoded as a hash table */

/* Printable name of each encoding, indexed by its REDIS_ENCODING_* value.
 * Every entry is tied explicitly to its constant so the table and the
 * defines above cannot silently drift apart. */
static char *strencoding[] = {
    [REDIS_ENCODING_RAW]    = "raw",
    [REDIS_ENCODING_INT]    = "int",
    [REDIS_ENCODING_ZIPMAP] = "zipmap",
    [REDIS_ENCODING_HT]     = "hashtable"
};
133
/* Object types only used for dumping to disk */
#define REDIS_EXPIRETIME 253
#define REDIS_SELECTDB 254
#define REDIS_EOF 255

/* Defines related to the dump file format. Storing 32 bit lengths for short
 * keys would require a lot of space, so we check the most significant 2 bits
 * of the first byte to interpret the length:
 *
 * 00|000000 => if the two MSB are 00 the len is the 6 bits of this byte
 * 01|000000 00000000 => 01, the len is 14 bits, 6 bits + 8 bits of next byte
 * 10|000000 [32 bit integer] => if it's 10, a full 32 bit len will follow
 * 11|000000 this means: specially encoded object will follow. The six bits
 *           number specifies the kind of object that follows.
 *           See the REDIS_RDB_ENC_* defines.
 *
 * Lengths up to 63 are stored using a single byte, most DB keys, and many
 * values, will fit inside. */
#define REDIS_RDB_6BITLEN 0
#define REDIS_RDB_14BITLEN 1
#define REDIS_RDB_32BITLEN 2
#define REDIS_RDB_ENCVAL 3
#define REDIS_RDB_LENERR UINT_MAX

/* When a length of a string object stored on disk has the first two bits
 * set, the remaining six bits specify a special encoding for the object
 * according to the following defines: */
#define REDIS_RDB_ENC_INT8 0 /* 8 bit signed integer */
#define REDIS_RDB_ENC_INT16 1 /* 16 bit signed integer */
#define REDIS_RDB_ENC_INT32 2 /* 32 bit signed integer */
#define REDIS_RDB_ENC_LZF 3 /* string compressed with LZF (see lzf.h) */
165
/* Virtual memory: values for the 'storage' field of a redis object. */
#define REDIS_VM_MEMORY 0 /* The object is on memory */
#define REDIS_VM_SWAPPED 1 /* The object is on disk */
#define REDIS_VM_SWAPPING 2 /* Redis is swapping this object on disk */
#define REDIS_VM_LOADING 3 /* Redis is loading this object from disk */

/* Virtual memory static configuration stuff.
 * Check vmFindContiguousPages() to know more about these magic numbers. */
#define REDIS_VM_MAX_NEAR_PAGES 65536
#define REDIS_VM_MAX_RANDOM_JUMP 4096
#define REDIS_VM_MAX_THREADS 32
#define REDIS_THREAD_STACK_SIZE (1024*1024*4)
/* The following is the *percentage* of completed I/O jobs to process when the
 * handler is called. While Virtual Memory I/O operations are performed by
 * threads, these operations must be processed by the main thread when
 * completed in order to take effect. */
#define REDIS_MAX_COMPLETED_JOBS_PROCESSED 1
183
/* Client flags. Bit values, combined in the 'flags' field of a client. */
#define REDIS_SLAVE 1 /* This client is a slave server */
#define REDIS_MASTER 2 /* This client is a master server */
#define REDIS_MONITOR 4 /* This client is a slave monitor, see MONITOR */
#define REDIS_MULTI 8 /* This client is in a MULTI context */
#define REDIS_BLOCKED 16 /* The client is waiting in a blocking operation */
#define REDIS_IO_WAIT 32 /* The client is waiting for Virtual Memory I/O */

/* Slave replication state - slave side */
#define REDIS_REPL_NONE 0 /* No active replication */
#define REDIS_REPL_CONNECT 1 /* Must connect to master */
#define REDIS_REPL_CONNECTED 2 /* Connected to master */

/* Slave replication state - from the point of view of master.
 * Note that in SEND_BULK and ONLINE state the slave receives new updates
 * in its output queue. In the WAIT_BGSAVE state instead the server is waiting
 * to start the next background saving in order to send updates to it.
 * The values continue the numbering of the slave-side states above, so the
 * two groups never overlap. */
#define REDIS_REPL_WAIT_BGSAVE_START 3 /* master waits bgsave to start feeding it */
#define REDIS_REPL_WAIT_BGSAVE_END 4 /* master waits bgsave to start bulk DB transmission */
#define REDIS_REPL_SEND_BULK 5 /* master is sending the bulk DB */
#define REDIS_REPL_ONLINE 6 /* bulk DB already transmitted, receive updates */
205
/* List related stuff */
#define REDIS_HEAD 0
#define REDIS_TAIL 1

/* Sort operations */
#define REDIS_SORT_GET 0
#define REDIS_SORT_ASC 1
#define REDIS_SORT_DESC 2
#define REDIS_SORTKEY_MAX 1024

/* Log levels, in increasing order of severity */
#define REDIS_DEBUG 0
#define REDIS_VERBOSE 1
#define REDIS_NOTICE 2
#define REDIS_WARNING 3

/* Anti-warning macro: evaluates V and discards the result, so an otherwise
 * unused variable or parameter counts as used. */
#define REDIS_NOTUSED(V) ((void) V)

#define ZSKIPLIST_MAXLEVEL 32 /* Should be enough for 2^32 elements */
#define ZSKIPLIST_P 0.25 /* Skiplist P = 1/4 */

/* Append only defines */
#define APPENDFSYNC_NO 0
#define APPENDFSYNC_ALWAYS 1
#define APPENDFSYNC_EVERYSEC 2

/* Hashes related defaults */
#define REDIS_HASH_MAX_ZIPMAP_ENTRIES 64
#define REDIS_HASH_MAX_ZIPMAP_VALUE 512
236
/* We can print the stacktrace, so our assert is defined this way:
 * when the expression _e is false, _redisAssert() is called with the
 * stringified expression, the source file and the line number, and then the
 * process terminates via _exit(1). When _e is true the macro is a no-op.
 * The whole macro is a single void expression. */
#define redisAssert(_e) ((_e)?(void)0 : (_redisAssert(#_e,__FILE__,__LINE__),_exit(1)))
static void _redisAssert(char *estr, char *file, int line);
240
241 /*================================= Data types ============================== */
242
/* A redis object, that is a type able to hold a string / list / set /
 * sorted set / hash (see the REDIS_* object types above) */
244
/* The VM object structure: where an object lives in the swap file and when
 * it was last accessed.
 *
 * NOTE(review): the trailing declarator `vm` below does not only declare the
 * struct type, it also defines a file-scope object named `vm` with external
 * linkage. This looks unintentional (struct redisObject has its own `vm`
 * member of this type) -- confirm that nothing references the global before
 * removing it. */
struct redisObjectVM {
    off_t page;         /* the page at which the object is stored on disk */
    off_t usedpages;    /* number of pages used on disk */
    time_t atime;       /* Last access time */
} vm;
251
/* The actual Redis Object */
typedef struct redisObject {
    void *ptr;              /* The object payload */
    unsigned char type;     /* Object type, e.g. REDIS_STRING */
    unsigned char encoding; /* Internal representation, e.g.
                             * REDIS_ENCODING_RAW */
    unsigned char storage;  /* If this object is a key, where is the value?
                             * REDIS_VM_MEMORY, REDIS_VM_SWAPPED, ... */
    unsigned char vtype; /* If this object is a key, and value is swapped out,
                          * this is the type of the swapped out object. */
    int refcount;           /* Reference count */
    /* VM fields, these are only allocated if VM is active, otherwise the
     * object allocation function will just allocate
     * sizeof(robj) minus sizeof(struct redisObjectVM), so using
     * Redis without VM active will not have any overhead.
     * For this reason 'vm' must stay the LAST member of the structure. */
    struct redisObjectVM vm;
} robj;
268
/* Macro used to initialize a Redis string object allocated on the stack.
 * Note that this macro is kept near the structure definition to make sure
 * we'll update it when the structure is changed, to avoid bugs like
 * bug #85 introduced exactly in this way.
 *
 * _var is the robj to initialize (any lvalue expression, e.g. `obj` or
 *      `*objptr`); it is evaluated several times, so it must not have
 *      side effects.
 * _ptr is the value stored in the 'ptr' field.
 *
 * The object gets refcount 1, type REDIS_STRING and encoding
 * REDIS_ENCODING_RAW. The 'storage' field is set to REDIS_VM_MEMORY only
 * when server.vm_enabled is true, otherwise it is left untouched.
 *
 * The expansion intentionally does NOT end with a semicolon: the caller's
 * own `;` completes the do/while statement, so the macro behaves as exactly
 * one statement and is safe inside an unbraced if/else. Arguments are
 * parenthesized so that expressions such as `*p` or `buf+1` expand
 * correctly. */
#define initStaticStringObject(_var,_ptr) do { \
    (_var).refcount = 1; \
    (_var).type = REDIS_STRING; \
    (_var).encoding = REDIS_ENCODING_RAW; \
    (_var).ptr = (_ptr); \
    if (server.vm_enabled) (_var).storage = REDIS_VM_MEMORY; \
} while(0)
280
/* One database: the keyspace plus the auxiliary dictionaries tracking
 * expirations and the keys clients are waiting on. */
typedef struct redisDb {
    dict *dict;                 /* The keyspace for this DB */
    dict *expires;              /* Timeout of keys with a timeout set */
    dict *blockingkeys;         /* Keys with clients waiting for data (BLPOP) */
    dict *io_keys;              /* Keys with clients waiting for VM I/O */
    int id;                     /* presumably the DB number -- TODO confirm */
} redisDb;
288
/* Client MULTI/EXEC state */

/* A single command held in a MULTI block: its argument vector and the
 * command table entry it refers to. */
typedef struct multiCmd {
    robj **argv;
    int argc;
    struct redisCommand *cmd;
} multiCmd;

/* The list of commands accumulated by a client inside a MULTI block. */
typedef struct multiState {
    multiCmd *commands;     /* Array of MULTI commands */
    int count;              /* Total number of MULTI commands */
} multiState;
300
/* With multiplexing we need to keep per-client state.
 * Clients are kept in a linked list. */
typedef struct redisClient {
    int fd;
    redisDb *db;
    int dictid;
    sds querybuf;
    robj **argv, **mbargv;
    int argc, mbargc;
    int bulklen;            /* bulk read len. -1 if not in bulk read mode */
    int multibulk;          /* multi bulk command format active */
    list *reply;
    int sentlen;
    time_t lastinteraction; /* time of the last interaction, used for timeout */
    int flags;              /* REDIS_SLAVE | REDIS_MONITOR | REDIS_MULTI ... */
    int slaveseldb;         /* slave selected db, if this client is a slave */
    int authenticated;      /* when requirepass is non-NULL */
    int replstate;          /* replication state if this is a slave */
    int repldbfd;           /* replication DB file descriptor */
    long repldboff;         /* replication DB file offset */
    off_t repldbsize;       /* replication DB file size */
    multiState mstate;      /* MULTI/EXEC state */
    robj **blockingkeys;    /* The key we are waiting to terminate a blocking
                             * operation such as BLPOP. Otherwise NULL. */
    int blockingkeysnum;    /* Number of blocking keys */
    time_t blockingto;      /* Blocking operation timeout. If UNIX current time
                             * is >= blockingto then the operation timed out. */
    list *io_keys;          /* Keys this client is waiting to be loaded from the
                             * swap file in order to continue. */
    dict *pubsub_classes;   /* Classes a client is interested in (SUBSCRIBE) */
} redisClient;
332
/* NOTE(review): looks like one automatic-save rule (a time window paired with
 * a number of changes); the server keeps an array of these in 'saveparams'.
 * Confirm the exact semantics against the code that reads them. */
struct saveparam {
    time_t seconds;
    int changes;
};
337
/* Global server state structure */
struct redisServer {
    int port;
    int fd;
    redisDb *db;
    dict *sharingpool;          /* Pool used for object sharing */
    unsigned int sharingpoolsize;
    long long dirty;            /* changes to DB from the last save */
    list *clients;
    list *slaves, *monitors;
    char neterr[ANET_ERR_LEN];
    aeEventLoop *el;
    int cronloops;              /* number of times the cron function run */
    list *objfreelist;          /* A list of freed objects to avoid malloc() */
    time_t lastsave;            /* Unix time of last successful save */
    /* Fields used only for stats */
    time_t stat_starttime;          /* server start time */
    long long stat_numcommands;     /* number of processed commands */
    long long stat_numconnections;  /* number of connections received */
    long long stat_expiredkeys;     /* number of expired keys */
    /* Configuration */
    int verbosity;
    int glueoutputbuf;
    int maxidletime;
    int dbnum;
    int daemonize;
    int appendonly;
    int appendfsync;
    time_t lastfsync;
    int appendfd;
    int appendseldb;
    char *pidfile;
    pid_t bgsavechildpid;
    pid_t bgrewritechildpid;
    sds bgrewritebuf; /* buffer taken by parent during append only rewrite */
    struct saveparam *saveparams;
    int saveparamslen;
    char *logfile;
    char *bindaddr;
    char *dbfilename;
    char *appendfilename;
    char *requirepass;
    int shareobjects;
    int rdbcompression;
    /* Replication related */
    int isslave;
    char *masterauth;
    char *masterhost;
    int masterport;
    redisClient *master;    /* client that is master for this slave */
    int replstate;
    unsigned int maxclients;
    unsigned long long maxmemory;
    unsigned int blpop_blocked_clients;
    unsigned int vm_blocked_clients;
    /* Sort parameters - qsort_r() is only available under BSD so we
     * have to keep this state global, in order to pass it to sortCompare() */
    int sort_desc;
    int sort_alpha;
    int sort_bypattern;
    /* Virtual memory configuration */
    int vm_enabled;
    char *vm_swap_file;
    off_t vm_page_size;
    off_t vm_pages;
    unsigned long long vm_max_memory;
    /* Hashes config */
    size_t hash_max_zipmap_entries;
    size_t hash_max_zipmap_value;
    /* Virtual memory state */
    FILE *vm_fp;
    int vm_fd;
    off_t vm_next_page; /* Next probably empty page */
    off_t vm_near_pages; /* Number of pages allocated sequentially */
    unsigned char *vm_bitmap; /* Bitmap of free/used pages */
    time_t unixtime;    /* Unix time sampled every second. */
    /* Virtual memory I/O threads stuff */
    /* An I/O thread processes an element taken from the io_newjobs queue and
     * puts the result of the operation in the io_processed list. While the
     * job is being processed, it's put on the io_processing queue. */
    list *io_newjobs; /* List of VM I/O jobs yet to be processed */
    list *io_processing; /* List of VM I/O jobs being processed */
    list *io_processed; /* List of VM I/O jobs already processed */
    list *io_ready_clients; /* Clients ready to be unblocked. All keys loaded */
    pthread_mutex_t io_mutex; /* lock to access io_jobs/io_done/io_thread_job */
    pthread_mutex_t obj_freelist_mutex; /* safe redis objects creation/free */
    pthread_mutex_t io_swapfile_mutex; /* So we can lseek + write */
    pthread_attr_t io_threads_attr; /* attributes for threads creation */
    int io_active_threads; /* Number of running I/O threads */
    int vm_max_threads; /* Max number of I/O threads running at the same time */
    /* Our main thread is blocked on the event loop, looking for sockets ready
     * to be read or written, so when a threaded I/O operation is ready to be
     * processed by the main thread, the I/O thread will use a unix pipe to
     * awake the main thread. The following are the two pipe FDs. */
    int io_ready_pipe_read;
    int io_ready_pipe_write;
    /* Virtual memory stats */
    unsigned long long vm_stats_used_pages;
    unsigned long long vm_stats_swapped_objects;
    unsigned long long vm_stats_swapouts;
    unsigned long long vm_stats_swapins;
    /* Pubsub */
    dict *pubsub_classes; /* Associate classes to list of subscribed clients */
    /* Misc */
    FILE *devnull;
};
444
/* Signature shared by every command implementation. */
typedef void redisCommandProc(redisClient *c);

/* One entry of the command table.
 * The table initializes these entries positionally, so the member order
 * below must not be changed without updating every table row. */
struct redisCommand {
    char *name;             /* Command name, e.g. "get" */
    redisCommandProc *proc; /* Function implementing the command */
    int arity;              /* Argument count; negative values are also used
                             * in the table (presumably "at least N" --
                             * TODO confirm against the arity check) */
    int flags;              /* OR of REDIS_CMD_* flags */
    /* Use a function to determine which keys need to be loaded
     * in the background prior to executing this command. Takes precedence
     * over vm_firstkey and others, ignored when NULL */
    redisCommandProc *vm_preload_proc;
    /* What keys should be loaded in background when calling this command? */
    int vm_firstkey; /* The first argument that's a key (0 = no keys) */
    int vm_lastkey; /* The last argument that's a key */
    int vm_keystep; /* The step between first and last key */
};
460
/* Pairs a function name with an address stored as an unsigned long.
 * NOTE(review): presumably used to map addresses back to names when printing
 * a stack trace -- confirm against the users of this struct. */
struct redisFunctionSym {
    char *name;
    unsigned long pointer;
};
465
/* An element being sorted: the object itself plus the value it is compared
 * by, which is either a numeric score or another object (the union holds
 * only one of the two at a time). */
typedef struct _redisSortObject {
    robj *obj;
    union {
        double score;
        robj *cmpobj;
    } u;
} redisSortObject;

/* A sort operation: 'type' is presumably one of the REDIS_SORT_* values
 * (TODO confirm), 'pattern' is the pattern object it applies to. */
typedef struct _redisSortOperation {
    int type;
    robj *pattern;
} redisSortOperation;
478
/* ZSETs use a specialized version of Skiplists */

/* A skiplist node: an object with its score. 'forward' and 'span' are
 * pointers to arrays (presumably one slot per level of the node -- TODO
 * confirm), 'backward' is a single pointer to another node. */
typedef struct zskiplistNode {
    struct zskiplistNode **forward;
    struct zskiplistNode *backward;
    unsigned int *span;
    double score;
    robj *obj;
} zskiplistNode;

/* The skiplist itself: its two end nodes, the number of elements and the
 * current level. */
typedef struct zskiplist {
    struct zskiplistNode *header, *tail;
    unsigned long length;
    int level;
} zskiplist;

/* A sorted set: a dictionary paired with a skiplist. */
typedef struct zset {
    dict *dict;
    zskiplist *zsl;
} zset;
499
/* Our shared "common" objects.
 * This declares both the structure type and its single global instance,
 * 'shared'. Every member is a pointer to a redis object. */

struct sharedObjectsStruct {
    robj *crlf, *ok, *err, *emptybulk, *czero, *cone, *pong, *space,
    *colon, *nullbulk, *nullmultibulk, *queued,
    *emptymultibulk, *wrongtypeerr, *nokeyerr, *syntaxerr, *sameobjecterr,
    *outofrangeerr, *plus,
    *select0, *select1, *select2, *select3, *select4,
    *select5, *select6, *select7, *select8, *select9,
    *messagebulk, *subscribebulk, *unsubscribebulk, *mbulk3;
} shared;
511
/* Global vars that are actually used as constants. The following double
 * values are used for double on-disk serialization, and are initialized
 * at runtime to avoid strange compiler optimizations. */

static double R_Zero, R_PosInf, R_NegInf, R_Nan;
517
/* VM threaded I/O request message */
#define REDIS_IOJOB_LOAD 0          /* Load from disk to memory */
#define REDIS_IOJOB_PREPARE_SWAP 1  /* Compute needed pages */
#define REDIS_IOJOB_DO_SWAP 2       /* Swap from memory to disk */
typedef struct iojob {
    int type;   /* Request type, REDIS_IOJOB_* */
    redisDb *db;/* Redis database */
    robj *key;  /* This I/O request is about swapping this key */
    robj *val;  /* the value to swap for REDIS_IOJOB_*_SWAP, otherwise this
                 * field is populated by the I/O thread for REDIS_IOJOB_LOAD. */
    off_t page; /* Swap page where to read/write the object */
    off_t pages; /* Swap pages needed to save object. PREPARE_SWAP return val */
    int canceled; /* True if this command was canceled by blocking side of VM */
    pthread_t thread; /* ID of the thread processing this entry */
} iojob;
533
/*================================ Prototypes =============================== */

/* Forward declarations of file-local (static) helpers, so that they can be
 * referenced before the point where they are defined. */
static void freeStringObject(robj *o);
static void freeListObject(robj *o);
static void freeSetObject(robj *o);
/* Takes a void pointer, unlike incrRefCount(); presumably so that it can be
 * installed as a generic "free" callback -- TODO confirm. */
static void decrRefCount(void *o);
static robj *createObject(int type, void *ptr);
static void freeClient(redisClient *c);
static int rdbLoad(char *filename);
static void addReply(redisClient *c, robj *obj);
static void addReplySds(redisClient *c, sds s);
static void incrRefCount(robj *o);
static int rdbSaveBackground(char *filename);
static robj *createStringObject(char *ptr, size_t len);
static robj *dupStringObject(robj *o);
static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc);
static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc);
static int syncWithMaster(void);
static robj *tryObjectSharing(robj *o);
static int tryObjectEncoding(robj *o);
static robj *getDecodedObject(robj *o);
static int removeExpire(redisDb *db, robj *key);
static int expireIfNeeded(redisDb *db, robj *key);
static int deleteIfVolatile(redisDb *db, robj *key);
static int deleteIfSwapped(redisDb *db, robj *key);
static int deleteKey(redisDb *db, robj *key);
static time_t getExpire(redisDb *db, robj *key);
static int setExpire(redisDb *db, robj *key, time_t when);
static void updateSlavesWaitingBgsave(int bgsaveerr);
static void freeMemoryIfNeeded(void);
static int processCommand(redisClient *c);
static void setupSigSegvAction(void);
static void rdbRemoveTempFile(pid_t childpid);
static void aofRemoveTempFile(pid_t childpid);
static size_t stringObjectLen(robj *o);
static void processInputBuffer(redisClient *c);
static zskiplist *zslCreate(void);
static void zslFree(zskiplist *zsl);
static void zslInsert(zskiplist *zsl, double score, robj *obj);
static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask);
static void initClientMultiState(redisClient *c);
static void freeClientMultiState(redisClient *c);
static void queueMultiCommand(redisClient *c, struct redisCommand *cmd);
static void unblockClientWaitingData(redisClient *c);
static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele);
static void vmInit(void);
static void vmMarkPagesFree(off_t page, off_t count);
static robj *vmLoadObject(robj *key);
static robj *vmPreviewObject(robj *key);
static int vmSwapOneObjectBlocking(void);
static int vmSwapOneObjectThreaded(void);
static int vmCanSwapOut(void);
static int tryFreeOneObjectFromFreelist(void);
static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask);
static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata, int mask);
static void vmCancelThreadedIOJob(robj *o);
static void lockThreadedIO(void);
static void unlockThreadedIO(void);
static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db);
static void freeIOJob(iojob *j);
static void queueIOJob(iojob *j);
static int vmWriteObjectOnSwap(robj *o, off_t page);
static robj *vmReadObjectFromSwap(off_t page, int type);
static void waitEmptyIOJobsQueue(void);
static void vmReopenSwapFile(void);
static int vmFreePage(off_t page);
/* Has the redisCommandProc signature: the command table installs it as the
 * vm_preload_proc of the "zunion" and "zinter" entries. */
static void zunionInterBlockClientOnSwappedKeys(redisClient *c);
static int blockClientOnSwappedKeys(struct redisCommand *cmd, redisClient *c);
static int dontWaitForSwappedKey(redisClient *c, robj *key);
static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key);
static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask);
static struct redisCommand *lookupCommand(char *name);
static void call(redisClient *c, struct redisCommand *cmd);
static void resetClient(redisClient *c);
static void convertToRealHash(robj *o);
static int pubsubUnsubscribeAll(redisClient *c, int notify);
/* Declared with an explicit (void) parameter list so that this is a real
 * prototype: an empty () leaves the parameters unspecified, which disables
 * argument checking at the call sites. */
static void usage(void);
611
/* Command implementations. Every function below has the redisCommandProc
 * signature (void f(redisClient *c)) and is meant to be referenced from the
 * 'proc' field of a command table entry. */
static void authCommand(redisClient *c);
static void pingCommand(redisClient *c);
static void echoCommand(redisClient *c);
static void setCommand(redisClient *c);
static void setnxCommand(redisClient *c);
static void getCommand(redisClient *c);
static void delCommand(redisClient *c);
static void existsCommand(redisClient *c);
static void incrCommand(redisClient *c);
static void decrCommand(redisClient *c);
static void incrbyCommand(redisClient *c);
static void decrbyCommand(redisClient *c);
static void selectCommand(redisClient *c);
static void randomkeyCommand(redisClient *c);
static void keysCommand(redisClient *c);
static void dbsizeCommand(redisClient *c);
static void lastsaveCommand(redisClient *c);
static void saveCommand(redisClient *c);
static void bgsaveCommand(redisClient *c);
static void bgrewriteaofCommand(redisClient *c);
static void shutdownCommand(redisClient *c);
static void moveCommand(redisClient *c);
static void renameCommand(redisClient *c);
static void renamenxCommand(redisClient *c);
static void lpushCommand(redisClient *c);
static void rpushCommand(redisClient *c);
static void lpopCommand(redisClient *c);
static void rpopCommand(redisClient *c);
static void llenCommand(redisClient *c);
static void lindexCommand(redisClient *c);
static void lrangeCommand(redisClient *c);
static void ltrimCommand(redisClient *c);
static void typeCommand(redisClient *c);
static void lsetCommand(redisClient *c);
static void saddCommand(redisClient *c);
static void sremCommand(redisClient *c);
static void smoveCommand(redisClient *c);
static void sismemberCommand(redisClient *c);
static void scardCommand(redisClient *c);
static void spopCommand(redisClient *c);
static void srandmemberCommand(redisClient *c);
/* sinterCommand is shared by two table entries: "sinter" and "smembers". */
static void sinterCommand(redisClient *c);
static void sinterstoreCommand(redisClient *c);
static void sunionCommand(redisClient *c);
static void sunionstoreCommand(redisClient *c);
static void sdiffCommand(redisClient *c);
static void sdiffstoreCommand(redisClient *c);
static void syncCommand(redisClient *c);
static void flushdbCommand(redisClient *c);
static void flushallCommand(redisClient *c);
static void sortCommand(redisClient *c);
static void lremCommand(redisClient *c);
/* NOTE: unlike every other handler this name ends in a lowercase "command".
 * The command table refers to it with this exact spelling, so it cannot be
 * renamed here alone. */
static void rpoplpushcommand(redisClient *c);
static void infoCommand(redisClient *c);
static void mgetCommand(redisClient *c);
static void monitorCommand(redisClient *c);
static void expireCommand(redisClient *c);
static void expireatCommand(redisClient *c);
static void getsetCommand(redisClient *c);
static void ttlCommand(redisClient *c);
static void slaveofCommand(redisClient *c);
static void debugCommand(redisClient *c);
static void msetCommand(redisClient *c);
static void msetnxCommand(redisClient *c);
static void zaddCommand(redisClient *c);
static void zincrbyCommand(redisClient *c);
static void zrangeCommand(redisClient *c);
static void zrangebyscoreCommand(redisClient *c);
static void zcountCommand(redisClient *c);
static void zrevrangeCommand(redisClient *c);
static void zcardCommand(redisClient *c);
static void zremCommand(redisClient *c);
static void zscoreCommand(redisClient *c);
static void zremrangebyscoreCommand(redisClient *c);
static void multiCommand(redisClient *c);
static void execCommand(redisClient *c);
static void discardCommand(redisClient *c);
static void blpopCommand(redisClient *c);
static void brpopCommand(redisClient *c);
static void appendCommand(redisClient *c);
static void substrCommand(redisClient *c);
static void zrankCommand(redisClient *c);
static void zrevrankCommand(redisClient *c);
static void hsetCommand(redisClient *c);
static void hgetCommand(redisClient *c);
static void hdelCommand(redisClient *c);
static void hlenCommand(redisClient *c);
static void zremrangebyrankCommand(redisClient *c);
static void zunionCommand(redisClient *c);
static void zinterCommand(redisClient *c);
static void hkeysCommand(redisClient *c);
static void hvalsCommand(redisClient *c);
static void hgetallCommand(redisClient *c);
static void hexistsCommand(redisClient *c);
static void configCommand(redisClient *c);
static void hincrbyCommand(redisClient *c);
static void subscribeCommand(redisClient *c);
static void unsubscribeCommand(redisClient *c);
static void publishCommand(redisClient *c);
711
712 /*================================= Globals ================================= */
713
/* Global vars */

/* The single, file-local instance of the server state. It is accessed
 * directly by name, including from macros such as initStaticStringObject
 * (which reads server.vm_enabled). */
static struct redisServer server; /* server global state */
716 static struct redisCommand cmdTable[] = {
717 {"get",getCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
718 {"set",setCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
719 {"setnx",setnxCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
720 {"append",appendCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
721 {"substr",substrCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
722 {"del",delCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
723 {"exists",existsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
724 {"incr",incrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
725 {"decr",decrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
726 {"mget",mgetCommand,-2,REDIS_CMD_INLINE,NULL,1,-1,1},
727 {"rpush",rpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
728 {"lpush",lpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
729 {"rpop",rpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
730 {"lpop",lpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
731 {"brpop",brpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
732 {"blpop",blpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
733 {"llen",llenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
734 {"lindex",lindexCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
735 {"lset",lsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
736 {"lrange",lrangeCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
737 {"ltrim",ltrimCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
738 {"lrem",lremCommand,4,REDIS_CMD_BULK,NULL,1,1,1},
739 {"rpoplpush",rpoplpushcommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,2,1},
740 {"sadd",saddCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
741 {"srem",sremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
742 {"smove",smoveCommand,4,REDIS_CMD_BULK,NULL,1,2,1},
743 {"sismember",sismemberCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
744 {"scard",scardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
745 {"spop",spopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
746 {"srandmember",srandmemberCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
747 {"sinter",sinterCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
748 {"sinterstore",sinterstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
749 {"sunion",sunionCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
750 {"sunionstore",sunionstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
751 {"sdiff",sdiffCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
752 {"sdiffstore",sdiffstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
753 {"smembers",sinterCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
754 {"zadd",zaddCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
755 {"zincrby",zincrbyCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
756 {"zrem",zremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
757 {"zremrangebyscore",zremrangebyscoreCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
758 {"zremrangebyrank",zremrangebyrankCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
759 {"zunion",zunionCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
760 {"zinter",zinterCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
761 {"zrange",zrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
762 {"zrangebyscore",zrangebyscoreCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
763 {"zcount",zcountCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
764 {"zrevrange",zrevrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
765 {"zcard",zcardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
766 {"zscore",zscoreCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
767 {"zrank",zrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
768 {"zrevrank",zrevrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
769 {"hset",hsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
770 {"hincrby",hincrbyCommand,4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
771 {"hget",hgetCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
772 {"hdel",hdelCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
773 {"hlen",hlenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
774 {"hkeys",hkeysCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
775 {"hvals",hvalsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
776 {"hgetall",hgetallCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
777 {"hexists",hexistsCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
778 {"incrby",incrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
779 {"decrby",decrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
780 {"getset",getsetCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
781 {"mset",msetCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
782 {"msetnx",msetnxCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
783 {"randomkey",randomkeyCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
784 {"select",selectCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
785 {"move",moveCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
786 {"rename",renameCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
787 {"renamenx",renamenxCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
788 {"expire",expireCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
789 {"expireat",expireatCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
790 {"keys",keysCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
791 {"dbsize",dbsizeCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
792 {"auth",authCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
793 {"ping",pingCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
794 {"echo",echoCommand,2,REDIS_CMD_BULK,NULL,0,0,0},
795 {"save",saveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
796 {"bgsave",bgsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
797 {"bgrewriteaof",bgrewriteaofCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
798 {"shutdown",shutdownCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
799 {"lastsave",lastsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
800 {"type",typeCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
801 {"multi",multiCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
802 {"exec",execCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
803 {"discard",discardCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
804 {"sync",syncCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
805 {"flushdb",flushdbCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
806 {"flushall",flushallCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
807 {"sort",sortCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
808 {"info",infoCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
809 {"monitor",monitorCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
810 {"ttl",ttlCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
811 {"slaveof",slaveofCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
812 {"debug",debugCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
813 {"config",configCommand,-2,REDIS_CMD_BULK,NULL,0,0,0},
814 {"subscribe",subscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
815 {"unsubscribe",unsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
816 {"publish",publishCommand,3,REDIS_CMD_BULK,NULL,0,0,0},
817 {NULL,NULL,0,0,NULL,0,0,0}
818 };
819
820 /*============================ Utility functions ============================ */
821
/* Glob-style pattern matching on length-delimited buffers.
 *
 * Supported syntax:
 *   *        matches any (possibly empty) sequence of characters
 *   ?        matches exactly one character
 *   [abc]    matches one character of the set; a leading '^' negates the
 *            set, 'a-z' denotes a range and '\' escapes the next character
 *   \x       matches the character x literally
 *
 * pattern/patternLen and string/stringLen describe the two buffers; neither
 * needs to be NUL terminated and no byte outside the given lengths is read.
 * If nocase is non-zero literal characters and ranges are compared case
 * insensitively (escaped characters inside a set are always compared
 * exactly). An unterminated '[' set is treated as if it were closed at the
 * end of the pattern.
 *
 * Returns 1 if the whole string matches the whole pattern, 0 otherwise. */
static int stringmatchlen(const char *pattern, int patternLen,
        const char *string, int stringLen, int nocase)
{
    while (patternLen) {
        switch (pattern[0]) {
        case '*':
            /* Collapse runs of '*' without looking past the pattern end. */
            while (patternLen > 1 && pattern[1] == '*') {
                pattern++;
                patternLen--;
            }
            if (patternLen == 1)
                return 1; /* match: trailing '*' swallows everything */
            /* Try to match the rest of the pattern at every position. */
            while (stringLen) {
                if (stringmatchlen(pattern+1, patternLen-1,
                            string, stringLen, nocase))
                    return 1; /* match */
                string++;
                stringLen--;
            }
            return 0; /* no match */
        case '?':
            if (stringLen == 0)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        case '[':
        {
            int not, match;

            /* A set always consumes one character: nothing to consume
             * means no match (and string[0] must not be read). */
            if (stringLen == 0)
                return 0; /* no match */
            pattern++;
            patternLen--;
            not = patternLen > 0 && pattern[0] == '^';
            if (not) {
                pattern++;
                patternLen--;
            }
            match = 0;
            while (1) {
                if (patternLen >= 2 && pattern[0] == '\\') {
                    /* Escaped set member, compared exactly. A backslash
                     * that is the last pattern byte is a plain literal. */
                    pattern++;
                    patternLen--;
                    if (pattern[0] == string[0])
                        match = 1;
                } else if (patternLen == 0) {
                    /* Unterminated set: step back so that the common
                     * advance after the switch lands on the pattern end. */
                    pattern--;
                    patternLen++;
                    break;
                } else if (pattern[0] == ']') {
                    break;
                } else if (patternLen >= 3 && pattern[1] == '-') {
                    /* Bytes are compared as unsigned char so that the
                     * values are always valid arguments for tolower(). */
                    int start = (unsigned char)pattern[0];
                    int end = (unsigned char)pattern[2];
                    int c = (unsigned char)string[0];
                    if (start > end) {
                        int t = start;
                        start = end;
                        end = t;
                    }
                    if (nocase) {
                        start = tolower(start);
                        end = tolower(end);
                        c = tolower(c);
                    }
                    pattern += 2;
                    patternLen -= 2;
                    if (c >= start && c <= end)
                        match = 1;
                } else {
                    if (!nocase) {
                        if (pattern[0] == string[0])
                            match = 1;
                    } else {
                        if (tolower((unsigned char)pattern[0]) ==
                            tolower((unsigned char)string[0]))
                            match = 1;
                    }
                }
                pattern++;
                patternLen--;
            }
            if (not)
                match = !match;
            if (!match)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        }
        case '\\':
            if (patternLen >= 2) {
                pattern++;
                patternLen--;
            }
            /* fall through */
        default:
            /* A literal needs a character to compare against. */
            if (stringLen == 0)
                return 0; /* no match */
            if (!nocase) {
                if (pattern[0] != string[0])
                    return 0; /* no match */
            } else {
                if (tolower((unsigned char)pattern[0]) !=
                    tolower((unsigned char)string[0]))
                    return 0; /* no match */
            }
            string++;
            stringLen--;
            break;
        }
        pattern++;
        patternLen--;
        if (stringLen == 0) {
            /* String exhausted: only trailing '*' may remain. */
            while (patternLen && *pattern == '*') {
                pattern++;
                patternLen--;
            }
            break;
        }
    }
    if (patternLen == 0 && stringLen == 0)
        return 1;
    return 0;
}
944
/* Glob-style match of two NUL terminated C strings: measures both with
 * strlen() and delegates to stringmatchlen(). Returns 1 on match, 0
 * otherwise. */
static int stringmatch(const char *pattern, const char *string, int nocase) {
    int patternLen = strlen(pattern);
    int stringLen = strlen(string);

    return stringmatchlen(pattern,patternLen,string,stringLen,nocase);
}
948
949 static void redisLog(int level, const char *fmt, ...) {
950 va_list ap;
951 FILE *fp;
952
953 fp = (server.logfile == NULL) ? stdout : fopen(server.logfile,"a");
954 if (!fp) return;
955
956 va_start(ap, fmt);
957 if (level >= server.verbosity) {
958 char *c = ".-*#";
959 char buf[64];
960 time_t now;
961
962 now = time(NULL);
963 strftime(buf,64,"%d %b %H:%M:%S",localtime(&now));
964 fprintf(fp,"[%d] %s %c ",(int)getpid(),buf,c[level]);
965 vfprintf(fp, fmt, ap);
966 fprintf(fp,"\n");
967 fflush(fp);
968 }
969 va_end(ap);
970
971 if (server.logfile) fclose(fp);
972 }
973
974 /*====================== Hash table type implementation ==================== */
975
976 /* This is a hash table type that uses the SDS dynamic strings library as
977 * keys and redis objects as values (objects can hold SDS strings,
978 * lists, sets). */
979
/* Dict destructor for entries that are plain heap buffers: the pointer is
 * simply handed to zfree(). privdata is ignored. */
static void dictVanillaFree(void *privdata, void *val) {
    DICT_NOTUSED(privdata);

    zfree(val);
}
985
986 static void dictListDestructor(void *privdata, void *val)
987 {
988 DICT_NOTUSED(privdata);
989 listRelease((list*)val);
990 }
991
992 static int sdsDictKeyCompare(void *privdata, const void *key1,
993 const void *key2)
994 {
995 int l1,l2;
996 DICT_NOTUSED(privdata);
997
998 l1 = sdslen((sds)key1);
999 l2 = sdslen((sds)key2);
1000 if (l1 != l2) return 0;
1001 return memcmp(key1, key2, l1) == 0;
1002 }
1003
/* Dict destructor for entries that are redis objects: drops one reference.
 * NULL is accepted and ignored, since values of swapped out keys are set
 * to NULL. privdata is ignored. */
static void dictRedisObjectDestructor(void *privdata, void *val) {
    DICT_NOTUSED(privdata);

    if (val != NULL) decrRefCount(val);
}
1011
1012 static int dictObjKeyCompare(void *privdata, const void *key1,
1013 const void *key2)
1014 {
1015 const robj *o1 = key1, *o2 = key2;
1016 return sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
1017 }
1018
1019 static unsigned int dictObjHash(const void *key) {
1020 const robj *o = key;
1021 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1022 }
1023
1024 static int dictEncObjKeyCompare(void *privdata, const void *key1,
1025 const void *key2)
1026 {
1027 robj *o1 = (robj*) key1, *o2 = (robj*) key2;
1028 int cmp;
1029
1030 if (o1->encoding == REDIS_ENCODING_INT &&
1031 o2->encoding == REDIS_ENCODING_INT &&
1032 o1->ptr == o2->ptr) return 1;
1033
1034 o1 = getDecodedObject(o1);
1035 o2 = getDecodedObject(o2);
1036 cmp = sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
1037 decrRefCount(o1);
1038 decrRefCount(o2);
1039 return cmp;
1040 }
1041
1042 static unsigned int dictEncObjHash(const void *key) {
1043 robj *o = (robj*) key;
1044
1045 if (o->encoding == REDIS_ENCODING_RAW) {
1046 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1047 } else {
1048 if (o->encoding == REDIS_ENCODING_INT) {
1049 char buf[32];
1050 int len;
1051
1052 len = snprintf(buf,32,"%ld",(long)o->ptr);
1053 return dictGenHashFunction((unsigned char*)buf, len);
1054 } else {
1055 unsigned int hash;
1056
1057 o = getDecodedObject(o);
1058 hash = dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1059 decrRefCount(o);
1060 return hash;
1061 }
1062 }
1063 }
1064
/* The dictType tables below bind hash, compare and destructor callbacks to
 * the different kinds of hash tables used by the server. A NULL slot means
 * that no callback is installed for that operation. */

1065 /* Set type hash table, also used for server.sharingpool: keys are redis objects that may be encoded, there are no values to free. */
1066 static dictType setDictType = {
1067 dictEncObjHash, /* hash function */
1068 NULL, /* key dup */
1069 NULL, /* val dup */
1070 dictEncObjKeyCompare, /* key compare */
1071 dictRedisObjectDestructor, /* key destructor */
1072 NULL /* val destructor */
1073 };
1074
1075 /* Sorted sets hash (note: a skiplist is used in addition to the hash table) */
1076 static dictType zsetDictType = {
1077 dictEncObjHash, /* hash function */
1078 NULL, /* key dup */
1079 NULL, /* val dup */
1080 dictEncObjKeyCompare, /* key compare */
1081 dictRedisObjectDestructor, /* key destructor */
1082 dictVanillaFree /* val destructor of malloc(sizeof(double)) */
1083 };
1084
1085 /* Db->dict: keys are hashed and compared through their SDS string, values are redis objects. */
1086 static dictType dbDictType = {
1087 dictObjHash, /* hash function */
1088 NULL, /* key dup */
1089 NULL, /* val dup */
1090 dictObjKeyCompare, /* key compare */
1091 dictRedisObjectDestructor, /* key destructor */
1092 dictRedisObjectDestructor /* val destructor */
1093 };
1094
1095 /* Db->expires: same key handling as Db->dict, but values are not freed by the dict. */
1096 static dictType keyptrDictType = {
1097 dictObjHash, /* hash function */
1098 NULL, /* key dup */
1099 NULL, /* val dup */
1100 dictObjKeyCompare, /* key compare */
1101 dictRedisObjectDestructor, /* key destructor */
1102 NULL /* val destructor */
1103 };
1104
1105 /* Hash type hash table (note that small hashes are represented with zipmaps) */
1106 static dictType hashDictType = {
1107 dictEncObjHash, /* hash function */
1108 NULL, /* key dup */
1109 NULL, /* val dup */
1110 dictEncObjKeyCompare, /* key compare */
1111 dictRedisObjectDestructor, /* key destructor */
1112 dictRedisObjectDestructor /* val destructor */
1113 };
1114
1115 /* Keylist hash table type has unencoded redis objects as keys and
1116 * lists as values. It's used for blocking operations (BLPOP) and to
1117 * map swapped keys to a list of clients waiting for these keys to be loaded. */
1118 static dictType keylistDictType = {
1119 dictObjHash, /* hash function */
1120 NULL, /* key dup */
1121 NULL, /* val dup */
1122 dictObjKeyCompare, /* key compare */
1123 dictRedisObjectDestructor, /* key destructor */
1124 dictListDestructor /* val destructor */
1125 };
1126
1127 static void version();
1128
1129 /* ========================= Random utility functions ======================= */
1130
1131 /* Redis generally does not try to recover from out of memory conditions
1132 * when allocating objects or strings, it is not clear if it will be possible
1133 * to report this condition to the client since the networking layer itself
1134 * is based on heap allocation for send buffers, so we simply abort.
1135 * At least the code will be simpler to read... */
1136 static void oom(const char *msg) {
1137 redisLog(REDIS_WARNING, "%s: Out of memory\n",msg);
1138 sleep(1);
1139 abort();
1140 }
1141
1142 /* ====================== Redis server networking stuff ===================== */
1143 static void closeTimedoutClients(void) {
1144 redisClient *c;
1145 listNode *ln;
1146 time_t now = time(NULL);
1147 listIter li;
1148
1149 listRewind(server.clients,&li);
1150 while ((ln = listNext(&li)) != NULL) {
1151 c = listNodeValue(ln);
1152 if (server.maxidletime &&
1153 !(c->flags & REDIS_SLAVE) && /* no timeout for slaves */
1154 !(c->flags & REDIS_MASTER) && /* no timeout for masters */
1155 dictSize(c->pubsub_classes) == 0 && /* no timeout for pubsub */
1156 (now - c->lastinteraction > server.maxidletime))
1157 {
1158 redisLog(REDIS_VERBOSE,"Closing idle client");
1159 freeClient(c);
1160 } else if (c->flags & REDIS_BLOCKED) {
1161 if (c->blockingto != 0 && c->blockingto < now) {
1162 addReply(c,shared.nullmultibulk);
1163 unblockClientWaitingData(c);
1164 }
1165 }
1166 }
1167 }
1168
1169 static int htNeedsResize(dict *dict) {
1170 long long size, used;
1171
1172 size = dictSlots(dict);
1173 used = dictSize(dict);
1174 return (size && used && size > DICT_HT_INITIAL_SIZE &&
1175 (used*100/size < REDIS_HT_MINFILL));
1176 }
1177
1178 /* If the percentage of used slots in the HT reaches REDIS_HT_MINFILL
1179 * we resize the hash table to save memory */
1180 static void tryResizeHashTables(void) {
1181 int j;
1182
1183 for (j = 0; j < server.dbnum; j++) {
1184 if (htNeedsResize(server.db[j].dict)) {
1185 redisLog(REDIS_VERBOSE,"The hash table %d is too sparse, resize it...",j);
1186 dictResize(server.db[j].dict);
1187 redisLog(REDIS_VERBOSE,"Hash table %d resized.",j);
1188 }
1189 if (htNeedsResize(server.db[j].expires))
1190 dictResize(server.db[j].expires);
1191 }
1192 }
1193
1194 /* A background saving child (BGSAVE) terminated its work. Handle this. */
1195 void backgroundSaveDoneHandler(int statloc) {
1196 int exitcode = WEXITSTATUS(statloc);
1197 int bysignal = WIFSIGNALED(statloc);
1198
1199 if (!bysignal && exitcode == 0) {
1200 redisLog(REDIS_NOTICE,
1201 "Background saving terminated with success");
1202 server.dirty = 0;
1203 server.lastsave = time(NULL);
1204 } else if (!bysignal && exitcode != 0) {
1205 redisLog(REDIS_WARNING, "Background saving error");
1206 } else {
1207 redisLog(REDIS_WARNING,
1208 "Background saving terminated by signal");
1209 rdbRemoveTempFile(server.bgsavechildpid);
1210 }
1211 server.bgsavechildpid = -1;
1212 /* Possibly there are slaves waiting for a BGSAVE in order to be served
1213 * (the first stage of SYNC is a bulk transfer of dump.rdb) */
1214 updateSlavesWaitingBgsave(exitcode == 0 ? REDIS_OK : REDIS_ERR);
1215 }
1216
1217 /* A background append only file rewriting (BGREWRITEAOF) terminated its work.
1218 * Handle this. */
1219 void backgroundRewriteDoneHandler(int statloc) {
1220 int exitcode = WEXITSTATUS(statloc);
1221 int bysignal = WIFSIGNALED(statloc);
1222
1223 if (!bysignal && exitcode == 0) {
1224 int fd;
1225 char tmpfile[256];
1226
1227 redisLog(REDIS_NOTICE,
1228 "Background append only file rewriting terminated with success");
1229 /* Now it's time to flush the differences accumulated by the parent */
1230 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) server.bgrewritechildpid);
1231 fd = open(tmpfile,O_WRONLY|O_APPEND);
1232 if (fd == -1) {
1233 redisLog(REDIS_WARNING, "Not able to open the temp append only file produced by the child: %s", strerror(errno));
1234 goto cleanup;
1235 }
1236 /* Flush our data... */
1237 if (write(fd,server.bgrewritebuf,sdslen(server.bgrewritebuf)) !=
1238 (signed) sdslen(server.bgrewritebuf)) {
1239 redisLog(REDIS_WARNING, "Error or short write trying to flush the parent diff of the append log file in the child temp file: %s", strerror(errno));
1240 close(fd);
1241 goto cleanup;
1242 }
1243 redisLog(REDIS_NOTICE,"Parent diff flushed into the new append log file with success (%lu bytes)",sdslen(server.bgrewritebuf));
1244 /* Now our work is to rename the temp file into the stable file. And
1245 * switch the file descriptor used by the server for append only. */
1246 if (rename(tmpfile,server.appendfilename) == -1) {
1247 redisLog(REDIS_WARNING,"Can't rename the temp append only file into the stable one: %s", strerror(errno));
1248 close(fd);
1249 goto cleanup;
1250 }
1251 /* Mission completed... almost */
1252 redisLog(REDIS_NOTICE,"Append only file successfully rewritten.");
1253 if (server.appendfd != -1) {
1254 /* If append only is actually enabled... */
1255 close(server.appendfd);
1256 server.appendfd = fd;
1257 fsync(fd);
1258 server.appendseldb = -1; /* Make sure it will issue SELECT */
1259 redisLog(REDIS_NOTICE,"The new append only file was selected for future appends.");
1260 } else {
1261 /* If append only is disabled we just generate a dump in this
1262 * format. Why not? */
1263 close(fd);
1264 }
1265 } else if (!bysignal && exitcode != 0) {
1266 redisLog(REDIS_WARNING, "Background append only file rewriting error");
1267 } else {
1268 redisLog(REDIS_WARNING,
1269 "Background append only file rewriting terminated by signal");
1270 }
1271 cleanup:
1272 sdsfree(server.bgrewritebuf);
1273 server.bgrewritebuf = sdsempty();
1274 aofRemoveTempFile(server.bgrewritechildpid);
1275 server.bgrewritechildpid = -1;
1276 }
1277
1278 static int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
1279 int j, loops = server.cronloops++;
1280 REDIS_NOTUSED(eventLoop);
1281 REDIS_NOTUSED(id);
1282 REDIS_NOTUSED(clientData);
1283
1284 /* We take a cached value of the unix time in the global state because
1285 * with virtual memory and aging there is to store the current time
1286 * in objects at every object access, and accuracy is not needed.
1287 * To access a global var is faster than calling time(NULL) */
1288 server.unixtime = time(NULL);
1289
1290 /* Show some info about non-empty databases */
1291 for (j = 0; j < server.dbnum; j++) {
1292 long long size, used, vkeys;
1293
1294 size = dictSlots(server.db[j].dict);
1295 used = dictSize(server.db[j].dict);
1296 vkeys = dictSize(server.db[j].expires);
1297 if (!(loops % 50) && (used || vkeys)) {
1298 redisLog(REDIS_VERBOSE,"DB %d: %lld keys (%lld volatile) in %lld slots HT.",j,used,vkeys,size);
1299 /* dictPrintStats(server.dict); */
1300 }
1301 }
1302
1303 /* We don't want to resize the hash tables while a bacground saving
1304 * is in progress: the saving child is created using fork() that is
1305 * implemented with a copy-on-write semantic in most modern systems, so
1306 * if we resize the HT while there is the saving child at work actually
1307 * a lot of memory movements in the parent will cause a lot of pages
1308 * copied. */
1309 if (server.bgsavechildpid == -1 && !(loops % 10)) tryResizeHashTables();
1310
1311 /* Show information about connected clients */
1312 if (!(loops % 50)) {
1313 redisLog(REDIS_VERBOSE,"%d clients connected (%d slaves), %zu bytes in use, %d shared objects",
1314 listLength(server.clients)-listLength(server.slaves),
1315 listLength(server.slaves),
1316 zmalloc_used_memory(),
1317 dictSize(server.sharingpool));
1318 }
1319
1320 /* Close connections of timedout clients */
1321 if ((server.maxidletime && !(loops % 100)) || server.blpop_blocked_clients)
1322 closeTimedoutClients();
1323
1324 /* Check if a background saving or AOF rewrite in progress terminated */
1325 if (server.bgsavechildpid != -1 || server.bgrewritechildpid != -1) {
1326 int statloc;
1327 pid_t pid;
1328
1329 if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) {
1330 if (pid == server.bgsavechildpid) {
1331 backgroundSaveDoneHandler(statloc);
1332 } else {
1333 backgroundRewriteDoneHandler(statloc);
1334 }
1335 }
1336 } else {
1337 /* If there is not a background saving in progress check if
1338 * we have to save now */
1339 time_t now = time(NULL);
1340 for (j = 0; j < server.saveparamslen; j++) {
1341 struct saveparam *sp = server.saveparams+j;
1342
1343 if (server.dirty >= sp->changes &&
1344 now-server.lastsave > sp->seconds) {
1345 redisLog(REDIS_NOTICE,"%d changes in %d seconds. Saving...",
1346 sp->changes, sp->seconds);
1347 rdbSaveBackground(server.dbfilename);
1348 break;
1349 }
1350 }
1351 }
1352
1353 /* Try to expire a few timed out keys. The algorithm used is adaptive and
1354 * will use few CPU cycles if there are few expiring keys, otherwise
1355 * it will get more aggressive to avoid that too much memory is used by
1356 * keys that can be removed from the keyspace. */
1357 for (j = 0; j < server.dbnum; j++) {
1358 int expired;
1359 redisDb *db = server.db+j;
1360
1361 /* Continue to expire if at the end of the cycle more than 25%
1362 * of the keys were expired. */
1363 do {
1364 long num = dictSize(db->expires);
1365 time_t now = time(NULL);
1366
1367 expired = 0;
1368 if (num > REDIS_EXPIRELOOKUPS_PER_CRON)
1369 num = REDIS_EXPIRELOOKUPS_PER_CRON;
1370 while (num--) {
1371 dictEntry *de;
1372 time_t t;
1373
1374 if ((de = dictGetRandomKey(db->expires)) == NULL) break;
1375 t = (time_t) dictGetEntryVal(de);
1376 if (now > t) {
1377 deleteKey(db,dictGetEntryKey(de));
1378 expired++;
1379 server.stat_expiredkeys++;
1380 }
1381 }
1382 } while (expired > REDIS_EXPIRELOOKUPS_PER_CRON/4);
1383 }
1384
1385 /* Swap a few keys on disk if we are over the memory limit and VM
1386 * is enbled. Try to free objects from the free list first. */
1387 if (vmCanSwapOut()) {
1388 while (server.vm_enabled && zmalloc_used_memory() >
1389 server.vm_max_memory)
1390 {
1391 int retval;
1392
1393 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
1394 retval = (server.vm_max_threads == 0) ?
1395 vmSwapOneObjectBlocking() :
1396 vmSwapOneObjectThreaded();
1397 if (retval == REDIS_ERR && !(loops % 300) &&
1398 zmalloc_used_memory() >
1399 (server.vm_max_memory+server.vm_max_memory/10))
1400 {
1401 redisLog(REDIS_WARNING,"WARNING: vm-max-memory limit exceeded by more than 10%% but unable to swap more objects out!");
1402 }
1403 /* Note that when using threade I/O we free just one object,
1404 * because anyway when the I/O thread in charge to swap this
1405 * object out will finish, the handler of completed jobs
1406 * will try to swap more objects if we are still out of memory. */
1407 if (retval == REDIS_ERR || server.vm_max_threads > 0) break;
1408 }
1409 }
1410
1411 /* Check if we should connect to a MASTER */
1412 if (server.replstate == REDIS_REPL_CONNECT && !(loops % 10)) {
1413 redisLog(REDIS_NOTICE,"Connecting to MASTER...");
1414 if (syncWithMaster() == REDIS_OK) {
1415 redisLog(REDIS_NOTICE,"MASTER <-> SLAVE sync succeeded");
1416 }
1417 }
1418 return 100;
1419 }
1420
1421 /* This function gets called every time Redis is entering the
1422 * main loop of the event driven library, that is, before to sleep
1423 * for ready file descriptors. */
1424 static void beforeSleep(struct aeEventLoop *eventLoop) {
1425 REDIS_NOTUSED(eventLoop);
1426
1427 if (server.vm_enabled && listLength(server.io_ready_clients)) {
1428 listIter li;
1429 listNode *ln;
1430
1431 listRewind(server.io_ready_clients,&li);
1432 while((ln = listNext(&li))) {
1433 redisClient *c = ln->value;
1434 struct redisCommand *cmd;
1435
1436 /* Resume the client. */
1437 listDelNode(server.io_ready_clients,ln);
1438 c->flags &= (~REDIS_IO_WAIT);
1439 server.vm_blocked_clients--;
1440 aeCreateFileEvent(server.el, c->fd, AE_READABLE,
1441 readQueryFromClient, c);
1442 cmd = lookupCommand(c->argv[0]->ptr);
1443 assert(cmd != NULL);
1444 call(c,cmd);
1445 resetClient(c);
1446 /* There may be more data to process in the input buffer. */
1447 if (c->querybuf && sdslen(c->querybuf) > 0)
1448 processInputBuffer(c);
1449 }
1450 }
1451 }
1452
1453 static void createSharedObjects(void) {
1454 shared.crlf = createObject(REDIS_STRING,sdsnew("\r\n"));
1455 shared.ok = createObject(REDIS_STRING,sdsnew("+OK\r\n"));
1456 shared.err = createObject(REDIS_STRING,sdsnew("-ERR\r\n"));
1457 shared.emptybulk = createObject(REDIS_STRING,sdsnew("$0\r\n\r\n"));
1458 shared.czero = createObject(REDIS_STRING,sdsnew(":0\r\n"));
1459 shared.cone = createObject(REDIS_STRING,sdsnew(":1\r\n"));
1460 shared.nullbulk = createObject(REDIS_STRING,sdsnew("$-1\r\n"));
1461 shared.nullmultibulk = createObject(REDIS_STRING,sdsnew("*-1\r\n"));
1462 shared.emptymultibulk = createObject(REDIS_STRING,sdsnew("*0\r\n"));
1463 shared.pong = createObject(REDIS_STRING,sdsnew("+PONG\r\n"));
1464 shared.queued = createObject(REDIS_STRING,sdsnew("+QUEUED\r\n"));
1465 shared.wrongtypeerr = createObject(REDIS_STRING,sdsnew(
1466 "-ERR Operation against a key holding the wrong kind of value\r\n"));
1467 shared.nokeyerr = createObject(REDIS_STRING,sdsnew(
1468 "-ERR no such key\r\n"));
1469 shared.syntaxerr = createObject(REDIS_STRING,sdsnew(
1470 "-ERR syntax error\r\n"));
1471 shared.sameobjecterr = createObject(REDIS_STRING,sdsnew(
1472 "-ERR source and destination objects are the same\r\n"));
1473 shared.outofrangeerr = createObject(REDIS_STRING,sdsnew(
1474 "-ERR index out of range\r\n"));
1475 shared.space = createObject(REDIS_STRING,sdsnew(" "));
1476 shared.colon = createObject(REDIS_STRING,sdsnew(":"));
1477 shared.plus = createObject(REDIS_STRING,sdsnew("+"));
1478 shared.select0 = createStringObject("select 0\r\n",10);
1479 shared.select1 = createStringObject("select 1\r\n",10);
1480 shared.select2 = createStringObject("select 2\r\n",10);
1481 shared.select3 = createStringObject("select 3\r\n",10);
1482 shared.select4 = createStringObject("select 4\r\n",10);
1483 shared.select5 = createStringObject("select 5\r\n",10);
1484 shared.select6 = createStringObject("select 6\r\n",10);
1485 shared.select7 = createStringObject("select 7\r\n",10);
1486 shared.select8 = createStringObject("select 8\r\n",10);
1487 shared.select9 = createStringObject("select 9\r\n",10);
1488 shared.messagebulk = createStringObject("$7\r\nmessage\r\n",13);
1489 shared.subscribebulk = createStringObject("$9\r\nsubscribe\r\n",15);
1490 shared.unsubscribebulk = createStringObject("$11\r\nunsubscribe\r\n",18);
1491 shared.mbulk3 = createStringObject("*3\r\n",4);
1492 }
1493
1494 static void appendServerSaveParams(time_t seconds, int changes) {
1495 server.saveparams = zrealloc(server.saveparams,sizeof(struct saveparam)*(server.saveparamslen+1));
1496 server.saveparams[server.saveparamslen].seconds = seconds;
1497 server.saveparams[server.saveparamslen].changes = changes;
1498 server.saveparamslen++;
1499 }
1500
1501 static void resetServerSaveParams() {
1502 zfree(server.saveparams);
1503 server.saveparams = NULL;
1504 server.saveparamslen = 0;
1505 }
1506
1507 static void initServerConfig() {
1508 server.dbnum = REDIS_DEFAULT_DBNUM;
1509 server.port = REDIS_SERVERPORT;
1510 server.verbosity = REDIS_VERBOSE;
1511 server.maxidletime = REDIS_MAXIDLETIME;
1512 server.saveparams = NULL;
1513 server.logfile = NULL; /* NULL = log on standard output */
1514 server.bindaddr = NULL;
1515 server.glueoutputbuf = 1;
1516 server.daemonize = 0;
1517 server.appendonly = 0;
1518 server.appendfsync = APPENDFSYNC_ALWAYS;
1519 server.lastfsync = time(NULL);
1520 server.appendfd = -1;
1521 server.appendseldb = -1; /* Make sure the first time will not match */
1522 server.pidfile = zstrdup("/var/run/redis.pid");
1523 server.dbfilename = zstrdup("dump.rdb");
1524 server.appendfilename = zstrdup("appendonly.aof");
1525 server.requirepass = NULL;
1526 server.shareobjects = 0;
1527 server.rdbcompression = 1;
1528 server.sharingpoolsize = 1024;
1529 server.maxclients = 0;
1530 server.blpop_blocked_clients = 0;
1531 server.maxmemory = 0;
1532 server.vm_enabled = 0;
1533 server.vm_swap_file = zstrdup("/tmp/redis-%p.vm");
1534 server.vm_page_size = 256; /* 256 bytes per page */
1535 server.vm_pages = 1024*1024*100; /* 104 millions of pages */
1536 server.vm_max_memory = 1024LL*1024*1024*1; /* 1 GB of RAM */
1537 server.vm_max_threads = 4;
1538 server.vm_blocked_clients = 0;
1539 server.hash_max_zipmap_entries = REDIS_HASH_MAX_ZIPMAP_ENTRIES;
1540 server.hash_max_zipmap_value = REDIS_HASH_MAX_ZIPMAP_VALUE;
1541
1542 resetServerSaveParams();
1543
1544 appendServerSaveParams(60*60,1); /* save after 1 hour and 1 change */
1545 appendServerSaveParams(300,100); /* save after 5 minutes and 100 changes */
1546 appendServerSaveParams(60,10000); /* save after 1 minute and 10000 changes */
1547 /* Replication related */
1548 server.isslave = 0;
1549 server.masterauth = NULL;
1550 server.masterhost = NULL;
1551 server.masterport = 6379;
1552 server.master = NULL;
1553 server.replstate = REDIS_REPL_NONE;
1554
1555 /* Double constants initialization */
1556 R_Zero = 0.0;
1557 R_PosInf = 1.0/R_Zero;
1558 R_NegInf = -1.0/R_Zero;
1559 R_Nan = R_Zero/R_Zero;
1560 }
1561
1562 static void initServer() {
1563 int j;
1564
1565 signal(SIGHUP, SIG_IGN);
1566 signal(SIGPIPE, SIG_IGN);
1567 setupSigSegvAction();
1568
1569 server.devnull = fopen("/dev/null","w");
1570 if (server.devnull == NULL) {
1571 redisLog(REDIS_WARNING, "Can't open /dev/null: %s", server.neterr);
1572 exit(1);
1573 }
1574 server.clients = listCreate();
1575 server.slaves = listCreate();
1576 server.monitors = listCreate();
1577 server.objfreelist = listCreate();
1578 createSharedObjects();
1579 server.el = aeCreateEventLoop();
1580 server.db = zmalloc(sizeof(redisDb)*server.dbnum);
1581 server.sharingpool = dictCreate(&setDictType,NULL);
1582 server.fd = anetTcpServer(server.neterr, server.port, server.bindaddr);
1583 if (server.fd == -1) {
1584 redisLog(REDIS_WARNING, "Opening TCP port: %s", server.neterr);
1585 exit(1);
1586 }
1587 for (j = 0; j < server.dbnum; j++) {
1588 server.db[j].dict = dictCreate(&dbDictType,NULL);
1589 server.db[j].expires = dictCreate(&keyptrDictType,NULL);
1590 server.db[j].blockingkeys = dictCreate(&keylistDictType,NULL);
1591 if (server.vm_enabled)
1592 server.db[j].io_keys = dictCreate(&keylistDictType,NULL);
1593 server.db[j].id = j;
1594 }
1595 server.pubsub_classes = dictCreate(&keylistDictType,NULL);
1596 server.cronloops = 0;
1597 server.bgsavechildpid = -1;
1598 server.bgrewritechildpid = -1;
1599 server.bgrewritebuf = sdsempty();
1600 server.lastsave = time(NULL);
1601 server.dirty = 0;
1602 server.stat_numcommands = 0;
1603 server.stat_numconnections = 0;
1604 server.stat_expiredkeys = 0;
1605 server.stat_starttime = time(NULL);
1606 server.unixtime = time(NULL);
1607 aeCreateTimeEvent(server.el, 1, serverCron, NULL, NULL);
1608 if (aeCreateFileEvent(server.el, server.fd, AE_READABLE,
1609 acceptHandler, NULL) == AE_ERR) oom("creating file event");
1610
1611 if (server.appendonly) {
1612 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
1613 if (server.appendfd == -1) {
1614 redisLog(REDIS_WARNING, "Can't open the append-only file: %s",
1615 strerror(errno));
1616 exit(1);
1617 }
1618 }
1619
1620 if (server.vm_enabled) vmInit();
1621 }
1622
1623 /* Empty the whole database */
1624 static long long emptyDb() {
1625 int j;
1626 long long removed = 0;
1627
1628 for (j = 0; j < server.dbnum; j++) {
1629 removed += dictSize(server.db[j].dict);
1630 dictEmpty(server.db[j].dict);
1631 dictEmpty(server.db[j].expires);
1632 }
1633 return removed;
1634 }
1635
/* Convert a case insensitive "yes" / "no" string into 1 / 0.
 * Any other string yields -1. */
static int yesnotoi(char *s) {
    if (strcasecmp(s,"yes") == 0) return 1;
    if (strcasecmp(s,"no") == 0) return 0;
    return -1;
}
1641
1642 /* I agree, this is a very rudimental way to load a configuration...
1643 will improve later if the config gets more complex */
1644 static void loadServerConfig(char *filename) {
1645 FILE *fp;
1646 char buf[REDIS_CONFIGLINE_MAX+1], *err = NULL;
1647 int linenum = 0;
1648 sds line = NULL;
1649 char *errormsg = "Fatal error, can't open config file '%s'";
1650 char *errorbuf = zmalloc(sizeof(char)*(strlen(errormsg)+strlen(filename)));
1651 sprintf(errorbuf, errormsg, filename);
1652
1653 if (filename[0] == '-' && filename[1] == '\0')
1654 fp = stdin;
1655 else {
1656 if ((fp = fopen(filename,"r")) == NULL) {
1657 redisLog(REDIS_WARNING, errorbuf);
1658 exit(1);
1659 }
1660 }
1661
1662 while(fgets(buf,REDIS_CONFIGLINE_MAX+1,fp) != NULL) {
1663 sds *argv;
1664 int argc, j;
1665
1666 linenum++;
1667 line = sdsnew(buf);
1668 line = sdstrim(line," \t\r\n");
1669
1670 /* Skip comments and blank lines*/
1671 if (line[0] == '#' || line[0] == '\0') {
1672 sdsfree(line);
1673 continue;
1674 }
1675
1676 /* Split into arguments */
1677 argv = sdssplitlen(line,sdslen(line)," ",1,&argc);
1678 sdstolower(argv[0]);
1679
1680 /* Execute config directives */
1681 if (!strcasecmp(argv[0],"timeout") && argc == 2) {
1682 server.maxidletime = atoi(argv[1]);
1683 if (server.maxidletime < 0) {
1684 err = "Invalid timeout value"; goto loaderr;
1685 }
1686 } else if (!strcasecmp(argv[0],"port") && argc == 2) {
1687 server.port = atoi(argv[1]);
1688 if (server.port < 1 || server.port > 65535) {
1689 err = "Invalid port"; goto loaderr;
1690 }
1691 } else if (!strcasecmp(argv[0],"bind") && argc == 2) {
1692 server.bindaddr = zstrdup(argv[1]);
1693 } else if (!strcasecmp(argv[0],"save") && argc == 3) {
1694 int seconds = atoi(argv[1]);
1695 int changes = atoi(argv[2]);
1696 if (seconds < 1 || changes < 0) {
1697 err = "Invalid save parameters"; goto loaderr;
1698 }
1699 appendServerSaveParams(seconds,changes);
1700 } else if (!strcasecmp(argv[0],"dir") && argc == 2) {
1701 if (chdir(argv[1]) == -1) {
1702 redisLog(REDIS_WARNING,"Can't chdir to '%s': %s",
1703 argv[1], strerror(errno));
1704 exit(1);
1705 }
1706 } else if (!strcasecmp(argv[0],"loglevel") && argc == 2) {
1707 if (!strcasecmp(argv[1],"debug")) server.verbosity = REDIS_DEBUG;
1708 else if (!strcasecmp(argv[1],"verbose")) server.verbosity = REDIS_VERBOSE;
1709 else if (!strcasecmp(argv[1],"notice")) server.verbosity = REDIS_NOTICE;
1710 else if (!strcasecmp(argv[1],"warning")) server.verbosity = REDIS_WARNING;
1711 else {
1712 err = "Invalid log level. Must be one of debug, notice, warning";
1713 goto loaderr;
1714 }
1715 } else if (!strcasecmp(argv[0],"logfile") && argc == 2) {
1716 FILE *logfp;
1717
1718 server.logfile = zstrdup(argv[1]);
1719 if (!strcasecmp(server.logfile,"stdout")) {
1720 zfree(server.logfile);
1721 server.logfile = NULL;
1722 }
1723 if (server.logfile) {
1724 /* Test if we are able to open the file. The server will not
1725 * be able to abort just for this problem later... */
1726 logfp = fopen(server.logfile,"a");
1727 if (logfp == NULL) {
1728 err = sdscatprintf(sdsempty(),
1729 "Can't open the log file: %s", strerror(errno));
1730 goto loaderr;
1731 }
1732 fclose(logfp);
1733 }
1734 } else if (!strcasecmp(argv[0],"databases") && argc == 2) {
1735 server.dbnum = atoi(argv[1]);
1736 if (server.dbnum < 1) {
1737 err = "Invalid number of databases"; goto loaderr;
1738 }
1739 } else if (!strcasecmp(argv[0],"include") && argc == 2) {
1740 loadServerConfig(argv[1]);
1741 } else if (!strcasecmp(argv[0],"maxclients") && argc == 2) {
1742 server.maxclients = atoi(argv[1]);
1743 } else if (!strcasecmp(argv[0],"maxmemory") && argc == 2) {
1744 server.maxmemory = strtoll(argv[1], NULL, 10);
1745 } else if (!strcasecmp(argv[0],"slaveof") && argc == 3) {
1746 server.masterhost = sdsnew(argv[1]);
1747 server.masterport = atoi(argv[2]);
1748 server.replstate = REDIS_REPL_CONNECT;
1749 } else if (!strcasecmp(argv[0],"masterauth") && argc == 2) {
1750 server.masterauth = zstrdup(argv[1]);
1751 } else if (!strcasecmp(argv[0],"glueoutputbuf") && argc == 2) {
1752 if ((server.glueoutputbuf = yesnotoi(argv[1])) == -1) {
1753 err = "argument must be 'yes' or 'no'"; goto loaderr;
1754 }
1755 } else if (!strcasecmp(argv[0],"shareobjects") && argc == 2) {
1756 if ((server.shareobjects = yesnotoi(argv[1])) == -1) {
1757 err = "argument must be 'yes' or 'no'"; goto loaderr;
1758 }
1759 } else if (!strcasecmp(argv[0],"rdbcompression") && argc == 2) {
1760 if ((server.rdbcompression = yesnotoi(argv[1])) == -1) {
1761 err = "argument must be 'yes' or 'no'"; goto loaderr;
1762 }
1763 } else if (!strcasecmp(argv[0],"shareobjectspoolsize") && argc == 2) {
1764 server.sharingpoolsize = atoi(argv[1]);
1765 if (server.sharingpoolsize < 1) {
1766 err = "invalid object sharing pool size"; goto loaderr;
1767 }
1768 } else if (!strcasecmp(argv[0],"daemonize") && argc == 2) {
1769 if ((server.daemonize = yesnotoi(argv[1])) == -1) {
1770 err = "argument must be 'yes' or 'no'"; goto loaderr;
1771 }
1772 } else if (!strcasecmp(argv[0],"appendonly") && argc == 2) {
1773 if ((server.appendonly = yesnotoi(argv[1])) == -1) {
1774 err = "argument must be 'yes' or 'no'"; goto loaderr;
1775 }
1776 } else if (!strcasecmp(argv[0],"appendfsync") && argc == 2) {
1777 if (!strcasecmp(argv[1],"no")) {
1778 server.appendfsync = APPENDFSYNC_NO;
1779 } else if (!strcasecmp(argv[1],"always")) {
1780 server.appendfsync = APPENDFSYNC_ALWAYS;
1781 } else if (!strcasecmp(argv[1],"everysec")) {
1782 server.appendfsync = APPENDFSYNC_EVERYSEC;
1783 } else {
1784 err = "argument must be 'no', 'always' or 'everysec'";
1785 goto loaderr;
1786 }
1787 } else if (!strcasecmp(argv[0],"requirepass") && argc == 2) {
1788 server.requirepass = zstrdup(argv[1]);
1789 } else if (!strcasecmp(argv[0],"pidfile") && argc == 2) {
1790 zfree(server.pidfile);
1791 server.pidfile = zstrdup(argv[1]);
1792 } else if (!strcasecmp(argv[0],"dbfilename") && argc == 2) {
1793 zfree(server.dbfilename);
1794 server.dbfilename = zstrdup(argv[1]);
1795 } else if (!strcasecmp(argv[0],"vm-enabled") && argc == 2) {
1796 if ((server.vm_enabled = yesnotoi(argv[1])) == -1) {
1797 err = "argument must be 'yes' or 'no'"; goto loaderr;
1798 }
1799 } else if (!strcasecmp(argv[0],"vm-swap-file") && argc == 2) {
1800 zfree(server.vm_swap_file);
1801 server.vm_swap_file = zstrdup(argv[1]);
1802 } else if (!strcasecmp(argv[0],"vm-max-memory") && argc == 2) {
1803 server.vm_max_memory = strtoll(argv[1], NULL, 10);
1804 } else if (!strcasecmp(argv[0],"vm-page-size") && argc == 2) {
1805 server.vm_page_size = strtoll(argv[1], NULL, 10);
1806 } else if (!strcasecmp(argv[0],"vm-pages") && argc == 2) {
1807 server.vm_pages = strtoll(argv[1], NULL, 10);
1808 } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
1809 server.vm_max_threads = strtoll(argv[1], NULL, 10);
1810 } else if (!strcasecmp(argv[0],"hash-max-zipmap-entries") && argc == 2){
1811 server.hash_max_zipmap_entries = strtol(argv[1], NULL, 10);
1812 } else if (!strcasecmp(argv[0],"hash-max-zipmap-value") && argc == 2){
1813 server.hash_max_zipmap_value = strtol(argv[1], NULL, 10);
1814 } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
1815 server.vm_max_threads = strtoll(argv[1], NULL, 10);
1816 } else {
1817 err = "Bad directive or wrong number of arguments"; goto loaderr;
1818 }
1819 for (j = 0; j < argc; j++)
1820 sdsfree(argv[j]);
1821 zfree(argv);
1822 sdsfree(line);
1823 }
1824 if (fp != stdin) fclose(fp);
1825 return;
1826
1827 loaderr:
1828 fprintf(stderr, "\n*** FATAL CONFIG FILE ERROR ***\n");
1829 fprintf(stderr, "Reading the configuration file, at line %d\n", linenum);
1830 fprintf(stderr, ">>> '%s'\n", line);
1831 fprintf(stderr, "%s\n", err);
1832 exit(1);
1833 }
1834
1835 static void freeClientArgv(redisClient *c) {
1836 int j;
1837
1838 for (j = 0; j < c->argc; j++)
1839 decrRefCount(c->argv[j]);
1840 for (j = 0; j < c->mbargc; j++)
1841 decrRefCount(c->mbargv[j]);
1842 c->argc = 0;
1843 c->mbargc = 0;
1844 }
1845
1846 static void freeClient(redisClient *c) {
1847 listNode *ln;
1848
1849 /* Note that if the client we are freeing is blocked into a blocking
1850 * call, we have to set querybuf to NULL *before* to call
1851 * unblockClientWaitingData() to avoid processInputBuffer() will get
1852 * called. Also it is important to remove the file events after
1853 * this, because this call adds the READABLE event. */
1854 sdsfree(c->querybuf);
1855 c->querybuf = NULL;
1856 if (c->flags & REDIS_BLOCKED)
1857 unblockClientWaitingData(c);
1858
1859 /* Unsubscribe from all the pubsub classes */
1860 pubsubUnsubscribeAll(c,0);
1861 dictRelease(c->pubsub_classes);
1862 /* Obvious cleanup */
1863 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
1864 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
1865 listRelease(c->reply);
1866 freeClientArgv(c);
1867 close(c->fd);
1868 /* Remove from the list of clients */
1869 ln = listSearchKey(server.clients,c);
1870 redisAssert(ln != NULL);
1871 listDelNode(server.clients,ln);
1872 /* Remove from the list of clients waiting for swapped keys */
1873 if (c->flags & REDIS_IO_WAIT && listLength(c->io_keys) == 0) {
1874 ln = listSearchKey(server.io_ready_clients,c);
1875 if (ln) {
1876 listDelNode(server.io_ready_clients,ln);
1877 server.vm_blocked_clients--;
1878 }
1879 }
1880 while (server.vm_enabled && listLength(c->io_keys)) {
1881 ln = listFirst(c->io_keys);
1882 dontWaitForSwappedKey(c,ln->value);
1883 }
1884 listRelease(c->io_keys);
1885 /* Master/slave cleanup */
1886 if (c->flags & REDIS_SLAVE) {
1887 if (c->replstate == REDIS_REPL_SEND_BULK && c->repldbfd != -1)
1888 close(c->repldbfd);
1889 list *l = (c->flags & REDIS_MONITOR) ? server.monitors : server.slaves;
1890 ln = listSearchKey(l,c);
1891 redisAssert(ln != NULL);
1892 listDelNode(l,ln);
1893 }
1894 if (c->flags & REDIS_MASTER) {
1895 server.master = NULL;
1896 server.replstate = REDIS_REPL_CONNECT;
1897 }
1898 /* Release memory */
1899 zfree(c->argv);
1900 zfree(c->mbargv);
1901 freeClientMultiState(c);
1902 zfree(c);
1903 }
1904
1905 #define GLUEREPLY_UP_TO (1024)
1906 static void glueReplyBuffersIfNeeded(redisClient *c) {
1907 int copylen = 0;
1908 char buf[GLUEREPLY_UP_TO];
1909 listNode *ln;
1910 listIter li;
1911 robj *o;
1912
1913 listRewind(c->reply,&li);
1914 while((ln = listNext(&li))) {
1915 int objlen;
1916
1917 o = ln->value;
1918 objlen = sdslen(o->ptr);
1919 if (copylen + objlen <= GLUEREPLY_UP_TO) {
1920 memcpy(buf+copylen,o->ptr,objlen);
1921 copylen += objlen;
1922 listDelNode(c->reply,ln);
1923 } else {
1924 if (copylen == 0) return;
1925 break;
1926 }
1927 }
1928 /* Now the output buffer is empty, add the new single element */
1929 o = createObject(REDIS_STRING,sdsnewlen(buf,copylen));
1930 listAddNodeHead(c->reply,o);
1931 }
1932
1933 static void sendReplyToClient(aeEventLoop *el, int fd, void *privdata, int mask) {
1934 redisClient *c = privdata;
1935 int nwritten = 0, totwritten = 0, objlen;
1936 robj *o;
1937 REDIS_NOTUSED(el);
1938 REDIS_NOTUSED(mask);
1939
1940 /* Use writev() if we have enough buffers to send */
1941 if (!server.glueoutputbuf &&
1942 listLength(c->reply) > REDIS_WRITEV_THRESHOLD &&
1943 !(c->flags & REDIS_MASTER))
1944 {
1945 sendReplyToClientWritev(el, fd, privdata, mask);
1946 return;
1947 }
1948
1949 while(listLength(c->reply)) {
1950 if (server.glueoutputbuf && listLength(c->reply) > 1)
1951 glueReplyBuffersIfNeeded(c);
1952
1953 o = listNodeValue(listFirst(c->reply));
1954 objlen = sdslen(o->ptr);
1955
1956 if (objlen == 0) {
1957 listDelNode(c->reply,listFirst(c->reply));
1958 continue;
1959 }
1960
1961 if (c->flags & REDIS_MASTER) {
1962 /* Don't reply to a master */
1963 nwritten = objlen - c->sentlen;
1964 } else {
1965 nwritten = write(fd, ((char*)o->ptr)+c->sentlen, objlen - c->sentlen);
1966 if (nwritten <= 0) break;
1967 }
1968 c->sentlen += nwritten;
1969 totwritten += nwritten;
1970 /* If we fully sent the object on head go to the next one */
1971 if (c->sentlen == objlen) {
1972 listDelNode(c->reply,listFirst(c->reply));
1973 c->sentlen = 0;
1974 }
1975 /* Note that we avoid to send more thank REDIS_MAX_WRITE_PER_EVENT
1976 * bytes, in a single threaded server it's a good idea to serve
1977 * other clients as well, even if a very large request comes from
1978 * super fast link that is always able to accept data (in real world
1979 * scenario think about 'KEYS *' against the loopback interfae) */
1980 if (totwritten > REDIS_MAX_WRITE_PER_EVENT) break;
1981 }
1982 if (nwritten == -1) {
1983 if (errno == EAGAIN) {
1984 nwritten = 0;
1985 } else {
1986 redisLog(REDIS_VERBOSE,
1987 "Error writing to client: %s", strerror(errno));
1988 freeClient(c);
1989 return;
1990 }
1991 }
1992 if (totwritten > 0) c->lastinteraction = time(NULL);
1993 if (listLength(c->reply) == 0) {
1994 c->sentlen = 0;
1995 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
1996 }
1997 }
1998
1999 static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask)
2000 {
2001 redisClient *c = privdata;
2002 int nwritten = 0, totwritten = 0, objlen, willwrite;
2003 robj *o;
2004 struct iovec iov[REDIS_WRITEV_IOVEC_COUNT];
2005 int offset, ion = 0;
2006 REDIS_NOTUSED(el);
2007 REDIS_NOTUSED(mask);
2008
2009 listNode *node;
2010 while (listLength(c->reply)) {
2011 offset = c->sentlen;
2012 ion = 0;
2013 willwrite = 0;
2014
2015 /* fill-in the iov[] array */
2016 for(node = listFirst(c->reply); node; node = listNextNode(node)) {
2017 o = listNodeValue(node);
2018 objlen = sdslen(o->ptr);
2019
2020 if (totwritten + objlen - offset > REDIS_MAX_WRITE_PER_EVENT)
2021 break;
2022
2023 if(ion == REDIS_WRITEV_IOVEC_COUNT)
2024 break; /* no more iovecs */
2025
2026 iov[ion].iov_base = ((char*)o->ptr) + offset;
2027 iov[ion].iov_len = objlen - offset;
2028 willwrite += objlen - offset;
2029 offset = 0; /* just for the first item */
2030 ion++;
2031 }
2032
2033 if(willwrite == 0)
2034 break;
2035
2036 /* write all collected blocks at once */
2037 if((nwritten = writev(fd, iov, ion)) < 0) {
2038 if (errno != EAGAIN) {
2039 redisLog(REDIS_VERBOSE,
2040 "Error writing to client: %s", strerror(errno));
2041 freeClient(c);
2042 return;
2043 }
2044 break;
2045 }
2046
2047 totwritten += nwritten;
2048 offset = c->sentlen;
2049
2050 /* remove written robjs from c->reply */
2051 while (nwritten && listLength(c->reply)) {
2052 o = listNodeValue(listFirst(c->reply));
2053 objlen = sdslen(o->ptr);
2054
2055 if(nwritten >= objlen - offset) {
2056 listDelNode(c->reply, listFirst(c->reply));
2057 nwritten -= objlen - offset;
2058 c->sentlen = 0;
2059 } else {
2060 /* partial write */
2061 c->sentlen += nwritten;
2062 break;
2063 }
2064 offset = 0;
2065 }
2066 }
2067
2068 if (totwritten > 0)
2069 c->lastinteraction = time(NULL);
2070
2071 if (listLength(c->reply) == 0) {
2072 c->sentlen = 0;
2073 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2074 }
2075 }
2076
2077 static struct redisCommand *lookupCommand(char *name) {
2078 int j = 0;
2079 while(cmdTable[j].name != NULL) {
2080 if (!strcasecmp(name,cmdTable[j].name)) return &cmdTable[j];
2081 j++;
2082 }
2083 return NULL;
2084 }
2085
2086 /* resetClient prepare the client to process the next command */
2087 static void resetClient(redisClient *c) {
2088 freeClientArgv(c);
2089 c->bulklen = -1;
2090 c->multibulk = 0;
2091 }
2092
2093 /* Call() is the core of Redis execution of a command */
2094 static void call(redisClient *c, struct redisCommand *cmd) {
2095 long long dirty;
2096
2097 dirty = server.dirty;
2098 cmd->proc(c);
2099 if (server.appendonly && server.dirty-dirty)
2100 feedAppendOnlyFile(cmd,c->db->id,c->argv,c->argc);
2101 if (server.dirty-dirty && listLength(server.slaves))
2102 replicationFeedSlaves(server.slaves,c->db->id,c->argv,c->argc);
2103 if (listLength(server.monitors))
2104 replicationFeedSlaves(server.monitors,c->db->id,c->argv,c->argc);
2105 server.stat_numcommands++;
2106 }
2107
2108 /* If this function gets called we already read a whole
2109 * command, argments are in the client argv/argc fields.
2110 * processCommand() execute the command or prepare the
2111 * server for a bulk read from the client.
2112 *
2113 * If 1 is returned the client is still alive and valid and
2114 * and other operations can be performed by the caller. Otherwise
2115 * if 0 is returned the client was destroied (i.e. after QUIT). */
2116 static int processCommand(redisClient *c) {
2117 struct redisCommand *cmd;
2118
2119 /* Free some memory if needed (maxmemory setting) */
2120 if (server.maxmemory) freeMemoryIfNeeded();
2121
2122 /* Handle the multi bulk command type. This is an alternative protocol
2123 * supported by Redis in order to receive commands that are composed of
2124 * multiple binary-safe "bulk" arguments. The latency of processing is
2125 * a bit higher but this allows things like multi-sets, so if this
2126 * protocol is used only for MSET and similar commands this is a big win. */
2127 if (c->multibulk == 0 && c->argc == 1 && ((char*)(c->argv[0]->ptr))[0] == '*') {
2128 c->multibulk = atoi(((char*)c->argv[0]->ptr)+1);
2129 if (c->multibulk <= 0) {
2130 resetClient(c);
2131 return 1;
2132 } else {
2133 decrRefCount(c->argv[c->argc-1]);
2134 c->argc--;
2135 return 1;
2136 }
2137 } else if (c->multibulk) {
2138 if (c->bulklen == -1) {
2139 if (((char*)c->argv[0]->ptr)[0] != '$') {
2140 addReplySds(c,sdsnew("-ERR multi bulk protocol error\r\n"));
2141 resetClient(c);
2142 return 1;
2143 } else {
2144 int bulklen = atoi(((char*)c->argv[0]->ptr)+1);
2145 decrRefCount(c->argv[0]);
2146 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2147 c->argc--;
2148 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2149 resetClient(c);
2150 return 1;
2151 }
2152 c->argc--;
2153 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2154 return 1;
2155 }
2156 } else {
2157 c->mbargv = zrealloc(c->mbargv,(sizeof(robj*))*(c->mbargc+1));
2158 c->mbargv[c->mbargc] = c->argv[0];
2159 c->mbargc++;
2160 c->argc--;
2161 c->multibulk--;
2162 if (c->multibulk == 0) {
2163 robj **auxargv;
2164 int auxargc;
2165
2166 /* Here we need to swap the multi-bulk argc/argv with the
2167 * normal argc/argv of the client structure. */
2168 auxargv = c->argv;
2169 c->argv = c->mbargv;
2170 c->mbargv = auxargv;
2171
2172 auxargc = c->argc;
2173 c->argc = c->mbargc;
2174 c->mbargc = auxargc;
2175
2176 /* We need to set bulklen to something different than -1
2177 * in order for the code below to process the command without
2178 * to try to read the last argument of a bulk command as
2179 * a special argument. */
2180 c->bulklen = 0;
2181 /* continue below and process the command */
2182 } else {
2183 c->bulklen = -1;
2184 return 1;
2185 }
2186 }
2187 }
2188 /* -- end of multi bulk commands processing -- */
2189
2190 /* The QUIT command is handled as a special case. Normal command
2191 * procs are unable to close the client connection safely */
2192 if (!strcasecmp(c->argv[0]->ptr,"quit")) {
2193 freeClient(c);
2194 return 0;
2195 }
2196
2197 /* Now lookup the command and check ASAP about trivial error conditions
2198 * such wrong arity, bad command name and so forth. */
2199 cmd = lookupCommand(c->argv[0]->ptr);
2200 if (!cmd) {
2201 addReplySds(c,
2202 sdscatprintf(sdsempty(), "-ERR unknown command '%s'\r\n",
2203 (char*)c->argv[0]->ptr));
2204 resetClient(c);
2205 return 1;
2206 } else if ((cmd->arity > 0 && cmd->arity != c->argc) ||
2207 (c->argc < -cmd->arity)) {
2208 addReplySds(c,
2209 sdscatprintf(sdsempty(),
2210 "-ERR wrong number of arguments for '%s' command\r\n",
2211 cmd->name));
2212 resetClient(c);
2213 return 1;
2214 } else if (cmd->flags & REDIS_CMD_BULK && c->bulklen == -1) {
2215 /* This is a bulk command, we have to read the last argument yet. */
2216 int bulklen = atoi(c->argv[c->argc-1]->ptr);
2217
2218 decrRefCount(c->argv[c->argc-1]);
2219 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2220 c->argc--;
2221 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2222 resetClient(c);
2223 return 1;
2224 }
2225 c->argc--;
2226 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2227 /* It is possible that the bulk read is already in the
2228 * buffer. Check this condition and handle it accordingly.
2229 * This is just a fast path, alternative to call processInputBuffer().
2230 * It's a good idea since the code is small and this condition
2231 * happens most of the times. */
2232 if ((signed)sdslen(c->querybuf) >= c->bulklen) {
2233 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2234 c->argc++;
2235 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2236 } else {
2237 /* Otherwise return... there is to read the last argument
2238 * from the socket. */
2239 return 1;
2240 }
2241 }
2242 /* Let's try to share objects on the command arguments vector */
2243 if (server.shareobjects) {
2244 int j;
2245 for(j = 1; j < c->argc; j++)
2246 c->argv[j] = tryObjectSharing(c->argv[j]);
2247 }
2248 /* Let's try to encode the bulk object to save space. */
2249 if (cmd->flags & REDIS_CMD_BULK)
2250 tryObjectEncoding(c->argv[c->argc-1]);
2251
2252 /* Check if the user is authenticated */
2253 if (server.requirepass && !c->authenticated && cmd->proc != authCommand) {
2254 addReplySds(c,sdsnew("-ERR operation not permitted\r\n"));
2255 resetClient(c);
2256 return 1;
2257 }
2258
2259 /* Handle the maxmemory directive */
2260 if (server.maxmemory && (cmd->flags & REDIS_CMD_DENYOOM) &&
2261 zmalloc_used_memory() > server.maxmemory)
2262 {
2263 addReplySds(c,sdsnew("-ERR command not allowed when used memory > 'maxmemory'\r\n"));
2264 resetClient(c);
2265 return 1;
2266 }
2267
2268 /* Only allow SUBSCRIBE and UNSUBSCRIBE in the context of Pub/Sub */
2269 if (dictSize(c->pubsub_classes) > 0 &&
2270 cmd->proc != subscribeCommand && cmd->proc != unsubscribeCommand) {
2271 addReplySds(c,sdsnew("-ERR only SUBSCRIBE / UNSUBSCRIBE / QUIT allowed in this context\r\n"));
2272 resetClient(c);
2273 return 1;
2274 }
2275
2276 /* Exec the command */
2277 if (c->flags & REDIS_MULTI && cmd->proc != execCommand && cmd->proc != discardCommand) {
2278 queueMultiCommand(c,cmd);
2279 addReply(c,shared.queued);
2280 } else {
2281 if (server.vm_enabled && server.vm_max_threads > 0 &&
2282 blockClientOnSwappedKeys(cmd,c)) return 1;
2283 call(c,cmd);
2284 }
2285
2286 /* Prepare the client for the next command */
2287 resetClient(c);
2288 return 1;
2289 }
2290
2291 static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) {
2292 listNode *ln;
2293 listIter li;
2294 int outc = 0, j;
2295 robj **outv;
2296 /* We need 1+(ARGS*3) objects since commands are using the new protocol
2297 * and we one 1 object for the first "*<count>\r\n" multibulk count, then
2298 * for every additional object we have "$<count>\r\n" + object + "\r\n". */
2299 robj *static_outv[REDIS_STATIC_ARGS*3+1];
2300 robj *lenobj;
2301
2302 if (argc <= REDIS_STATIC_ARGS) {
2303 outv = static_outv;
2304 } else {
2305 outv = zmalloc(sizeof(robj*)*(argc*3+1));
2306 }
2307
2308 lenobj = createObject(REDIS_STRING,
2309 sdscatprintf(sdsempty(), "*%d\r\n", argc));
2310 lenobj->refcount = 0;
2311 outv[outc++] = lenobj;
2312 for (j = 0; j < argc; j++) {
2313 lenobj = createObject(REDIS_STRING,
2314 sdscatprintf(sdsempty(),"$%lu\r\n",
2315 (unsigned long) stringObjectLen(argv[j])));
2316 lenobj->refcount = 0;
2317 outv[outc++] = lenobj;
2318 outv[outc++] = argv[j];
2319 outv[outc++] = shared.crlf;
2320 }
2321
2322 /* Increment all the refcounts at start and decrement at end in order to
2323 * be sure to free objects if there is no slave in a replication state
2324 * able to be feed with commands */
2325 for (j = 0; j < outc; j++) incrRefCount(outv[j]);
2326 listRewind(slaves,&li);
2327 while((ln = listNext(&li))) {
2328 redisClient *slave = ln->value;
2329
2330 /* Don't feed slaves that are still waiting for BGSAVE to start */
2331 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) continue;
2332
2333 /* Feed all the other slaves, MONITORs and so on */
2334 if (slave->slaveseldb != dictid) {
2335 robj *selectcmd;
2336
2337 switch(dictid) {
2338 case 0: selectcmd = shared.select0; break;
2339 case 1: selectcmd = shared.select1; break;
2340 case 2: selectcmd = shared.select2; break;
2341 case 3: selectcmd = shared.select3; break;
2342 case 4: selectcmd = shared.select4; break;
2343 case 5: selectcmd = shared.select5; break;
2344 case 6: selectcmd = shared.select6; break;
2345 case 7: selectcmd = shared.select7; break;
2346 case 8: selectcmd = shared.select8; break;
2347 case 9: selectcmd = shared.select9; break;
2348 default:
2349 selectcmd = createObject(REDIS_STRING,
2350 sdscatprintf(sdsempty(),"select %d\r\n",dictid));
2351 selectcmd->refcount = 0;
2352 break;
2353 }
2354 addReply(slave,selectcmd);
2355 slave->slaveseldb = dictid;
2356 }
2357 for (j = 0; j < outc; j++) addReply(slave,outv[j]);
2358 }
2359 for (j = 0; j < outc; j++) decrRefCount(outv[j]);
2360 if (outv != static_outv) zfree(outv);
2361 }
2362
2363 static void processInputBuffer(redisClient *c) {
2364 again:
2365 /* Before to process the input buffer, make sure the client is not
2366 * waitig for a blocking operation such as BLPOP. Note that the first
2367 * iteration the client is never blocked, otherwise the processInputBuffer
2368 * would not be called at all, but after the execution of the first commands
2369 * in the input buffer the client may be blocked, and the "goto again"
2370 * will try to reiterate. The following line will make it return asap. */
2371 if (c->flags & REDIS_BLOCKED || c->flags & REDIS_IO_WAIT) return;
2372 if (c->bulklen == -1) {
2373 /* Read the first line of the query */
2374 char *p = strchr(c->querybuf,'\n');
2375 size_t querylen;
2376
2377 if (p) {
2378 sds query, *argv;
2379 int argc, j;
2380
2381 query = c->querybuf;
2382 c->querybuf = sdsempty();
2383 querylen = 1+(p-(query));
2384 if (sdslen(query) > querylen) {
2385 /* leave data after the first line of the query in the buffer */
2386 c->querybuf = sdscatlen(c->querybuf,query+querylen,sdslen(query)-querylen);
2387 }
2388 *p = '\0'; /* remove "\n" */
2389 if (*(p-1) == '\r') *(p-1) = '\0'; /* and "\r" if any */
2390 sdsupdatelen(query);
2391
2392 /* Now we can split the query in arguments */
2393 argv = sdssplitlen(query,sdslen(query)," ",1,&argc);
2394 sdsfree(query);
2395
2396 if (c->argv) zfree(c->argv);
2397 c->argv = zmalloc(sizeof(robj*)*argc);
2398
2399 for (j = 0; j < argc; j++) {
2400 if (sdslen(argv[j])) {
2401 c->argv[c->argc] = createObject(REDIS_STRING,argv[j]);
2402 c->argc++;
2403 } else {
2404 sdsfree(argv[j]);
2405 }
2406 }
2407 zfree(argv);
2408 if (c->argc) {
2409 /* Execute the command. If the client is still valid
2410 * after processCommand() return and there is something
2411 * on the query buffer try to process the next command. */
2412 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2413 } else {
2414 /* Nothing to process, argc == 0. Just process the query
2415 * buffer if it's not empty or return to the caller */
2416 if (sdslen(c->querybuf)) goto again;
2417 }
2418 return;
2419 } else if (sdslen(c->querybuf) >= REDIS_REQUEST_MAX_SIZE) {
2420 redisLog(REDIS_VERBOSE, "Client protocol error");
2421 freeClient(c);
2422 return;
2423 }
2424 } else {
2425 /* Bulk read handling. Note that if we are at this point
2426 the client already sent a command terminated with a newline,
2427 we are reading the bulk data that is actually the last
2428 argument of the command. */
2429 int qbl = sdslen(c->querybuf);
2430
2431 if (c->bulklen <= qbl) {
2432 /* Copy everything but the final CRLF as final argument */
2433 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2434 c->argc++;
2435 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2436 /* Process the command. If the client is still valid after
2437 * the processing and there is more data in the buffer
2438 * try to parse it. */
2439 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2440 return;
2441 }
2442 }
2443 }
2444
2445 static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2446 redisClient *c = (redisClient*) privdata;
2447 char buf[REDIS_IOBUF_LEN];
2448 int nread;
2449 REDIS_NOTUSED(el);
2450 REDIS_NOTUSED(mask);
2451
2452 nread = read(fd, buf, REDIS_IOBUF_LEN);
2453 if (nread == -1) {
2454 if (errno == EAGAIN) {
2455 nread = 0;
2456 } else {
2457 redisLog(REDIS_VERBOSE, "Reading from client: %s",strerror(errno));
2458 freeClient(c);
2459 return;
2460 }
2461 } else if (nread == 0) {
2462 redisLog(REDIS_VERBOSE, "Client closed connection");
2463 freeClient(c);
2464 return;
2465 }
2466 if (nread) {
2467 c->querybuf = sdscatlen(c->querybuf, buf, nread);
2468 c->lastinteraction = time(NULL);
2469 } else {
2470 return;
2471 }
2472 processInputBuffer(c);
2473 }
2474
2475 static int selectDb(redisClient *c, int id) {
2476 if (id < 0 || id >= server.dbnum)
2477 return REDIS_ERR;
2478 c->db = &server.db[id];
2479 return REDIS_OK;
2480 }
2481
2482 static void *dupClientReplyValue(void *o) {
2483 incrRefCount((robj*)o);
2484 return o;
2485 }
2486
2487 static redisClient *createClient(int fd) {
2488 redisClient *c = zmalloc(sizeof(*c));
2489
2490 anetNonBlock(NULL,fd);
2491 anetTcpNoDelay(NULL,fd);
2492 if (!c) return NULL;
2493 selectDb(c,0);
2494 c->fd = fd;
2495 c->querybuf = sdsempty();
2496 c->argc = 0;
2497 c->argv = NULL;
2498 c->bulklen = -1;
2499 c->multibulk = 0;
2500 c->mbargc = 0;
2501 c->mbargv = NULL;
2502 c->sentlen = 0;
2503 c->flags = 0;
2504 c->lastinteraction = time(NULL);
2505 c->authenticated = 0;
2506 c->replstate = REDIS_REPL_NONE;
2507 c->reply = listCreate();
2508 listSetFreeMethod(c->reply,decrRefCount);
2509 listSetDupMethod(c->reply,dupClientReplyValue);
2510 c->blockingkeys = NULL;
2511 c->blockingkeysnum = 0;
2512 c->io_keys = listCreate();
2513 c->pubsub_classes = dictCreate(&setDictType,NULL);
2514 listSetFreeMethod(c->io_keys,decrRefCount);
2515 if (aeCreateFileEvent(server.el, c->fd, AE_READABLE,
2516 readQueryFromClient, c) == AE_ERR) {
2517 freeClient(c);
2518 return NULL;
2519 }
2520 listAddNodeTail(server.clients,c);
2521 initClientMultiState(c);
2522 return c;
2523 }
2524
2525 static void addReply(redisClient *c, robj *obj) {
2526 if (listLength(c->reply) == 0 &&
2527 (c->replstate == REDIS_REPL_NONE ||
2528 c->replstate == REDIS_REPL_ONLINE) &&
2529 aeCreateFileEvent(server.el, c->fd, AE_WRITABLE,
2530 sendReplyToClient, c) == AE_ERR) return;
2531
2532 if (server.vm_enabled && obj->storage != REDIS_VM_MEMORY) {
2533 obj = dupStringObject(obj);
2534 obj->refcount = 0; /* getDecodedObject() will increment the refcount */
2535 }
2536 listAddNodeTail(c->reply,getDecodedObject(obj));
2537 }
2538
2539 static void addReplySds(redisClient *c, sds s) {
2540 robj *o = createObject(REDIS_STRING,s);
2541 addReply(c,o);
2542 decrRefCount(o);
2543 }
2544
2545 static void addReplyDouble(redisClient *c, double d) {
2546 char buf[128];
2547
2548 snprintf(buf,sizeof(buf),"%.17g",d);
2549 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n%s\r\n",
2550 (unsigned long) strlen(buf),buf));
2551 }
2552
2553 static void addReplyLong(redisClient *c, long l) {
2554 char buf[128];
2555 size_t len;
2556
2557 if (l == 0) {
2558 addReply(c,shared.czero);
2559 return;
2560 } else if (l == 1) {
2561 addReply(c,shared.cone);
2562 return;
2563 }
2564 len = snprintf(buf,sizeof(buf),":%ld\r\n",l);
2565 addReplySds(c,sdsnewlen(buf,len));
2566 }
2567
2568 static void addReplyUlong(redisClient *c, unsigned long ul) {
2569 char buf[128];
2570 size_t len;
2571
2572 if (ul == 0) {
2573 addReply(c,shared.czero);
2574 return;
2575 } else if (ul == 1) {
2576 addReply(c,shared.cone);
2577 return;
2578 }
2579 len = snprintf(buf,sizeof(buf),":%lu\r\n",ul);
2580 addReplySds(c,sdsnewlen(buf,len));
2581 }
2582
2583 static void addReplyBulkLen(redisClient *c, robj *obj) {
2584 size_t len;
2585
2586 if (obj->encoding == REDIS_ENCODING_RAW) {
2587 len = sdslen(obj->ptr);
2588 } else {
2589 long n = (long)obj->ptr;
2590
2591 /* Compute how many bytes will take this integer as a radix 10 string */
2592 len = 1;
2593 if (n < 0) {
2594 len++;
2595 n = -n;
2596 }
2597 while((n = n/10) != 0) {
2598 len++;
2599 }
2600 }
2601 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",(unsigned long)len));
2602 }
2603
2604 static void addReplyBulk(redisClient *c, robj *obj) {
2605 addReplyBulkLen(c,obj);
2606 addReply(c,obj);
2607 addReply(c,shared.crlf);
2608 }
2609
2610 /* In the CONFIG command we need to add vanilla C string as bulk replies */
2611 static void addReplyBulkCString(redisClient *c, char *s) {
2612 if (s == NULL) {
2613 addReply(c,shared.nullbulk);
2614 } else {
2615 robj *o = createStringObject(s,strlen(s));
2616 addReplyBulk(c,o);
2617 decrRefCount(o);
2618 }
2619 }
2620
2621 static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
2622 int cport, cfd;
2623 char cip[128];
2624 redisClient *c;
2625 REDIS_NOTUSED(el);
2626 REDIS_NOTUSED(mask);
2627 REDIS_NOTUSED(privdata);
2628
2629 cfd = anetAccept(server.neterr, fd, cip, &cport);
2630 if (cfd == AE_ERR) {
2631 redisLog(REDIS_VERBOSE,"Accepting client connection: %s", server.neterr);
2632 return;
2633 }
2634 redisLog(REDIS_VERBOSE,"Accepted %s:%d", cip, cport);
2635 if ((c = createClient(cfd)) == NULL) {
2636 redisLog(REDIS_WARNING,"Error allocating resoures for the client");
2637 close(cfd); /* May be already closed, just ingore errors */
2638 return;
2639 }
2640 /* If maxclient directive is set and this is one client more... close the
2641 * connection. Note that we create the client instead to check before
2642 * for this condition, since now the socket is already set in nonblocking
2643 * mode and we can send an error for free using the Kernel I/O */
2644 if (server.maxclients && listLength(server.clients) > server.maxclients) {
2645 char *err = "-ERR max number of clients reached\r\n";
2646
2647 /* That's a best effort error message, don't check write errors */
2648 if (write(c->fd,err,strlen(err)) == -1) {
2649 /* Nothing to do, Just to avoid the warning... */
2650 }
2651 freeClient(c);
2652 return;
2653 }
2654 server.stat_numconnections++;
2655 }
2656
2657 /* ======================= Redis objects implementation ===================== */
2658
2659 static robj *createObject(int type, void *ptr) {
2660 robj *o;
2661
2662 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
2663 if (listLength(server.objfreelist)) {
2664 listNode *head = listFirst(server.objfreelist);
2665 o = listNodeValue(head);
2666 listDelNode(server.objfreelist,head);
2667 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
2668 } else {
2669 if (server.vm_enabled) {
2670 pthread_mutex_unlock(&server.obj_freelist_mutex);
2671 o = zmalloc(sizeof(*o));
2672 } else {
2673 o = zmalloc(sizeof(*o)-sizeof(struct redisObjectVM));
2674 }
2675 }
2676 o->type = type;
2677 o->encoding = REDIS_ENCODING_RAW;
2678 o->ptr = ptr;
2679 o->refcount = 1;
2680 if (server.vm_enabled) {
2681 /* Note that this code may run in the context of an I/O thread
2682 * and accessing to server.unixtime in theory is an error
2683 * (no locks). But in practice this is safe, and even if we read
2684 * garbage Redis will not fail, as it's just a statistical info */
2685 o->vm.atime = server.unixtime;
2686 o->storage = REDIS_VM_MEMORY;
2687 }
2688 return o;
2689 }
2690
2691 static robj *createStringObject(char *ptr, size_t len) {
2692 return createObject(REDIS_STRING,sdsnewlen(ptr,len));
2693 }
2694
2695 static robj *dupStringObject(robj *o) {
2696 assert(o->encoding == REDIS_ENCODING_RAW);
2697 return createStringObject(o->ptr,sdslen(o->ptr));
2698 }
2699
2700 static robj *createListObject(void) {
2701 list *l = listCreate();
2702
2703 listSetFreeMethod(l,decrRefCount);
2704 return createObject(REDIS_LIST,l);
2705 }
2706
2707 static robj *createSetObject(void) {
2708 dict *d = dictCreate(&setDictType,NULL);
2709 return createObject(REDIS_SET,d);
2710 }
2711
2712 static robj *createHashObject(void) {
2713 /* All the Hashes start as zipmaps. Will be automatically converted
2714 * into hash tables if there are enough elements or big elements
2715 * inside. */
2716 unsigned char *zm = zipmapNew();
2717 robj *o = createObject(REDIS_HASH,zm);
2718 o->encoding = REDIS_ENCODING_ZIPMAP;
2719 return o;
2720 }
2721
2722 static robj *createZsetObject(void) {
2723 zset *zs = zmalloc(sizeof(*zs));
2724
2725 zs->dict = dictCreate(&zsetDictType,NULL);
2726 zs->zsl = zslCreate();
2727 return createObject(REDIS_ZSET,zs);
2728 }
2729
2730 static void freeStringObject(robj *o) {
2731 if (o->encoding == REDIS_ENCODING_RAW) {
2732 sdsfree(o->ptr);
2733 }
2734 }
2735
2736 static void freeListObject(robj *o) {
2737 listRelease((list*) o->ptr);
2738 }
2739
2740 static void freeSetObject(robj *o) {
2741 dictRelease((dict*) o->ptr);
2742 }
2743
2744 static void freeZsetObject(robj *o) {
2745 zset *zs = o->ptr;
2746
2747 dictRelease(zs->dict);
2748 zslFree(zs->zsl);
2749 zfree(zs);
2750 }
2751
2752 static void freeHashObject(robj *o) {
2753 switch (o->encoding) {
2754 case REDIS_ENCODING_HT:
2755 dictRelease((dict*) o->ptr);
2756 break;
2757 case REDIS_ENCODING_ZIPMAP:
2758 zfree(o->ptr);
2759 break;
2760 default:
2761 redisAssert(0);
2762 break;
2763 }
2764 }
2765
2766 static void incrRefCount(robj *o) {
2767 redisAssert(!server.vm_enabled || o->storage == REDIS_VM_MEMORY);
2768 o->refcount++;
2769 }
2770
2771 static void decrRefCount(void *obj) {
2772 robj *o = obj;
2773
2774 /* Object is a key of a swapped out value, or in the process of being
2775 * loaded. */
2776 if (server.vm_enabled &&
2777 (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING))
2778 {
2779 if (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING) {
2780 redisAssert(o->refcount == 1);
2781 }
2782 if (o->storage == REDIS_VM_LOADING) vmCancelThreadedIOJob(obj);
2783 redisAssert(o->type == REDIS_STRING);
2784 freeStringObject(o);
2785 vmMarkPagesFree(o->vm.page,o->vm.usedpages);
2786 pthread_mutex_lock(&server.obj_freelist_mutex);
2787 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
2788 !listAddNodeHead(server.objfreelist,o))
2789 zfree(o);
2790 pthread_mutex_unlock(&server.obj_freelist_mutex);
2791 server.vm_stats_swapped_objects--;
2792 return;
2793 }
2794 /* Object is in memory, or in the process of being swapped out. */
2795 if (--(o->refcount) == 0) {
2796 if (server.vm_enabled && o->storage == REDIS_VM_SWAPPING)
2797 vmCancelThreadedIOJob(obj);
2798 switch(o->type) {
2799 case REDIS_STRING: freeStringObject(o); break;
2800 case REDIS_LIST: freeListObject(o); break;
2801 case REDIS_SET: freeSetObject(o); break;
2802 case REDIS_ZSET: freeZsetObject(o); break;
2803 case REDIS_HASH: freeHashObject(o); break;
2804 default: redisAssert(0); break;
2805 }
2806 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
2807 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
2808 !listAddNodeHead(server.objfreelist,o))
2809 zfree(o);
2810 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
2811 }
2812 }
2813
2814 static robj *lookupKey(redisDb *db, robj *key) {
2815 dictEntry *de = dictFind(db->dict,key);
2816 if (de) {
2817 robj *key = dictGetEntryKey(de);
2818 robj *val = dictGetEntryVal(de);
2819
2820 if (server.vm_enabled) {
2821 if (key->storage == REDIS_VM_MEMORY ||
2822 key->storage == REDIS_VM_SWAPPING)
2823 {
2824 /* If we were swapping the object out, stop it, this key
2825 * was requested. */
2826 if (key->storage == REDIS_VM_SWAPPING)
2827 vmCancelThreadedIOJob(key);
2828 /* Update the access time of the key for the aging algorithm. */
2829 key->vm.atime = server.unixtime;
2830 } else {
2831 int notify = (key->storage == REDIS_VM_LOADING);
2832
2833 /* Our value was swapped on disk. Bring it at home. */
2834 redisAssert(val == NULL);
2835 val = vmLoadObject(key);
2836 dictGetEntryVal(de) = val;
2837
2838 /* Clients blocked by the VM subsystem may be waiting for
2839 * this key... */
2840 if (notify) handleClientsBlockedOnSwappedKey(db,key);
2841 }
2842 }
2843 return val;
2844 } else {
2845 return NULL;
2846 }
2847 }
2848
2849 static robj *lookupKeyRead(redisDb *db, robj *key) {
2850 expireIfNeeded(db,key);
2851 return lookupKey(db,key);
2852 }
2853
2854 static robj *lookupKeyWrite(redisDb *db, robj *key) {
2855 deleteIfVolatile(db,key);
2856 return lookupKey(db,key);
2857 }
2858
2859 static robj *lookupKeyReadOrReply(redisClient *c, robj *key, robj *reply) {
2860 robj *o = lookupKeyRead(c->db, key);
2861 if (!o) addReply(c,reply);
2862 return o;
2863 }
2864
2865 static robj *lookupKeyWriteOrReply(redisClient *c, robj *key, robj *reply) {
2866 robj *o = lookupKeyWrite(c->db, key);
2867 if (!o) addReply(c,reply);
2868 return o;
2869 }
2870
2871 static int checkType(redisClient *c, robj *o, int type) {
2872 if (o->type != type) {
2873 addReply(c,shared.wrongtypeerr);
2874 return 1;
2875 }
2876 return 0;
2877 }
2878
2879 static int deleteKey(redisDb *db, robj *key) {
2880 int retval;
2881
2882 /* We need to protect key from destruction: after the first dictDelete()
2883 * it may happen that 'key' is no longer valid if we don't increment
2884 * it's count. This may happen when we get the object reference directly
2885 * from the hash table with dictRandomKey() or dict iterators */
2886 incrRefCount(key);
2887 if (dictSize(db->expires)) dictDelete(db->expires,key);
2888 retval = dictDelete(db->dict,key);
2889 decrRefCount(key);
2890
2891 return retval == DICT_OK;
2892 }
2893
2894 /* Try to share an object against the shared objects pool */
2895 static robj *tryObjectSharing(robj *o) {
2896 struct dictEntry *de;
2897 unsigned long c;
2898
2899 if (o == NULL || server.shareobjects == 0) return o;
2900
2901 redisAssert(o->type == REDIS_STRING);
2902 de = dictFind(server.sharingpool,o);
2903 if (de) {
2904 robj *shared = dictGetEntryKey(de);
2905
2906 c = ((unsigned long) dictGetEntryVal(de))+1;
2907 dictGetEntryVal(de) = (void*) c;
2908 incrRefCount(shared);
2909 decrRefCount(o);
2910 return shared;
2911 } else {
2912 /* Here we are using a stream algorihtm: Every time an object is
2913 * shared we increment its count, everytime there is a miss we
2914 * recrement the counter of a random object. If this object reaches
2915 * zero we remove the object and put the current object instead. */
2916 if (dictSize(server.sharingpool) >=
2917 server.sharingpoolsize) {
2918 de = dictGetRandomKey(server.sharingpool);
2919 redisAssert(de != NULL);
2920 c = ((unsigned long) dictGetEntryVal(de))-1;
2921 dictGetEntryVal(de) = (void*) c;
2922 if (c == 0) {
2923 dictDelete(server.sharingpool,de->key);
2924 }
2925 } else {
2926 c = 0; /* If the pool is empty we want to add this object */
2927 }
2928 if (c == 0) {
2929 int retval;
2930
2931 retval = dictAdd(server.sharingpool,o,(void*)1);
2932 redisAssert(retval == DICT_OK);
2933 incrRefCount(o);
2934 }
2935 return o;
2936 }
2937 }
2938
2939 /* Check if the nul-terminated string 's' can be represented by a long
2940 * (that is, is a number that fits into long without any other space or
2941 * character before or after the digits).
2942 *
2943 * If so, the function returns REDIS_OK and *longval is set to the value
2944 * of the number. Otherwise REDIS_ERR is returned */
2945 static int isStringRepresentableAsLong(sds s, long *longval) {
2946 char buf[32], *endptr;
2947 long value;
2948 int slen;
2949
2950 value = strtol(s, &endptr, 10);
2951 if (endptr[0] != '\0') return REDIS_ERR;
2952 slen = snprintf(buf,32,"%ld",value);
2953
2954 /* If the number converted back into a string is not identical
2955 * then it's not possible to encode the string as integer */
2956 if (sdslen(s) != (unsigned)slen || memcmp(buf,s,slen)) return REDIS_ERR;
2957 if (longval) *longval = value;
2958 return REDIS_OK;
2959 }
2960
2961 /* Try to encode a string object in order to save space */
2962 static int tryObjectEncoding(robj *o) {
2963 long value;
2964 sds s = o->ptr;
2965
2966 if (o->encoding != REDIS_ENCODING_RAW)
2967 return REDIS_ERR; /* Already encoded */
2968
2969 /* It's not save to encode shared objects: shared objects can be shared
2970 * everywhere in the "object space" of Redis. Encoded objects can only
2971 * appear as "values" (and not, for instance, as keys) */
2972 if (o->refcount > 1) return REDIS_ERR;
2973
2974 /* Currently we try to encode only strings */
2975 redisAssert(o->type == REDIS_STRING);
2976
2977 /* Check if we can represent this string as a long integer */
2978 if (isStringRepresentableAsLong(s,&value) == REDIS_ERR) return REDIS_ERR;
2979
2980 /* Ok, this object can be encoded */
2981 o->encoding = REDIS_ENCODING_INT;
2982 sdsfree(o->ptr);
2983 o->ptr = (void*) value;
2984 return REDIS_OK;
2985 }
2986
2987 /* Get a decoded version of an encoded object (returned as a new object).
2988 * If the object is already raw-encoded just increment the ref count. */
2989 static robj *getDecodedObject(robj *o) {
2990 robj *dec;
2991
2992 if (o->encoding == REDIS_ENCODING_RAW) {
2993 incrRefCount(o);
2994 return o;
2995 }
2996 if (o->type == REDIS_STRING && o->encoding == REDIS_ENCODING_INT) {
2997 char buf[32];
2998
2999 snprintf(buf,32,"%ld",(long)o->ptr);
3000 dec = createStringObject(buf,strlen(buf));
3001 return dec;
3002 } else {
3003 redisAssert(1 != 1);
3004 }
3005 }
3006
3007 /* Compare two string objects via strcmp() or alike.
3008 * Note that the objects may be integer-encoded. In such a case we
3009 * use snprintf() to get a string representation of the numbers on the stack
3010 * and compare the strings, it's much faster than calling getDecodedObject().
3011 *
3012 * Important note: if objects are not integer encoded, but binary-safe strings,
3013 * sdscmp() from sds.c will apply memcmp() so this function ca be considered
3014 * binary safe. */
3015 static int compareStringObjects(robj *a, robj *b) {
3016 redisAssert(a->type == REDIS_STRING && b->type == REDIS_STRING);
3017 char bufa[128], bufb[128], *astr, *bstr;
3018 int bothsds = 1;
3019
3020 if (a == b) return 0;
3021 if (a->encoding != REDIS_ENCODING_RAW) {
3022 snprintf(bufa,sizeof(bufa),"%ld",(long) a->ptr);
3023 astr = bufa;
3024 bothsds = 0;
3025 } else {
3026 astr = a->ptr;
3027 }
3028 if (b->encoding != REDIS_ENCODING_RAW) {
3029 snprintf(bufb,sizeof(bufb),"%ld",(long) b->ptr);
3030 bstr = bufb;
3031 bothsds = 0;
3032 } else {
3033 bstr = b->ptr;
3034 }
3035 return bothsds ? sdscmp(astr,bstr) : strcmp(astr,bstr);
3036 }
3037
3038 static size_t stringObjectLen(robj *o) {
3039 redisAssert(o->type == REDIS_STRING);
3040 if (o->encoding == REDIS_ENCODING_RAW) {
3041 return sdslen(o->ptr);
3042 } else {
3043 char buf[32];
3044
3045 return snprintf(buf,32,"%ld",(long)o->ptr);
3046 }
3047 }
3048
3049 /*============================ RDB saving/loading =========================== */
3050
/* Write the one byte 'type' to 'fp'. Returns 0 on success, -1 when
 * nothing was written. */
static int rdbSaveType(FILE *fp, unsigned char type) {
    return (fwrite(&type,1,1,fp) == 0) ? -1 : 0;
}
3055
/* Write the time 't' to 'fp' as a 32 bit signed integer in the byte order
 * of the host. Returns 0 on success, -1 when nothing was written. */
static int rdbSaveTime(FILE *fp, time_t t) {
    int32_t stamp = (int32_t) t;

    return (fwrite(&stamp,sizeof(stamp),1,fp) == 0) ? -1 : 0;
}
3061
3062 /* check rdbLoadLen() comments for more info */
3063 static int rdbSaveLen(FILE *fp, uint32_t len) {
3064 unsigned char buf[2];
3065
3066 if (len < (1<<6)) {
3067 /* Save a 6 bit len */
3068 buf[0] = (len&0xFF)|(REDIS_RDB_6BITLEN<<6);
3069 if (fwrite(buf,1,1,fp) == 0) return -1;
3070 } else if (len < (1<<14)) {
3071 /* Save a 14 bit len */
3072 buf[0] = ((len>>8)&0xFF)|(REDIS_RDB_14BITLEN<<6);
3073 buf[1] = len&0xFF;
3074 if (fwrite(buf,2,1,fp) == 0) return -1;
3075 } else {
3076 /* Save a 32 bit len */
3077 buf[0] = (REDIS_RDB_32BITLEN<<6);
3078 if (fwrite(buf,1,1,fp) == 0) return -1;
3079 len = htonl(len);
3080 if (fwrite(&len,4,1,fp) == 0) return -1;
3081 }
3082 return 0;
3083 }
3084
3085 /* String objects in the form "2391" "-100" without any space and with a
3086 * range of values that can fit in an 8, 16 or 32 bit signed value can be
3087 * encoded as integers to save space */
3088 static int rdbTryIntegerEncoding(char *s, size_t len, unsigned char *enc) {
3089 long long value;
3090 char *endptr, buf[32];
3091
3092 /* Check if it's possible to encode this value as a number */
3093 value = strtoll(s, &endptr, 10);
3094 if (endptr[0] != '\0') return 0;
3095 snprintf(buf,32,"%lld",value);
3096
3097 /* If the number converted back into a string is not identical
3098 * then it's not possible to encode the string as integer */
3099 if (strlen(buf) != len || memcmp(buf,s,len)) return 0;
3100
3101 /* Finally check if it fits in our ranges */
3102 if (value >= -(1<<7) && value <= (1<<7)-1) {
3103 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT8;
3104 enc[1] = value&0xFF;
3105 return 2;
3106 } else if (value >= -(1<<15) && value <= (1<<15)-1) {
3107 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT16;
3108 enc[1] = value&0xFF;
3109 enc[2] = (value>>8)&0xFF;
3110 return 3;
3111 } else if (value >= -((long long)1<<31) && value <= ((long long)1<<31)-1) {
3112 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT32;
3113 enc[1] = value&0xFF;
3114 enc[2] = (value>>8)&0xFF;
3115 enc[3] = (value>>16)&0xFF;
3116 enc[4] = (value>>24)&0xFF;
3117 return 5;
3118 } else {
3119 return 0;
3120 }
3121 }
3122
3123 static int rdbSaveLzfStringObject(FILE *fp, unsigned char *s, size_t len) {
3124 size_t comprlen, outlen;
3125 unsigned char byte;
3126 void *out;
3127
3128 /* We require at least four bytes compression for this to be worth it */
3129 if (len <= 4) return 0;
3130 outlen = len-4;
3131 if ((out = zmalloc(outlen+1)) == NULL) return 0;
3132 comprlen = lzf_compress(s, len, out, outlen);
3133 if (comprlen == 0) {
3134 zfree(out);
3135 return 0;
3136 }
3137 /* Data compressed! Let's save it on disk */
3138 byte = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_LZF;
3139 if (fwrite(&byte,1,1,fp) == 0) goto writeerr;
3140 if (rdbSaveLen(fp,comprlen) == -1) goto writeerr;
3141 if (rdbSaveLen(fp,len) == -1) goto writeerr;
3142 if (fwrite(out,comprlen,1,fp) == 0) goto writeerr;
3143 zfree(out);
3144 return comprlen;
3145
3146 writeerr:
3147 zfree(out);
3148 return -1;
3149 }
3150
3151 /* Save a string objet as [len][data] on disk. If the object is a string
3152 * representation of an integer value we try to safe it in a special form */
3153 static int rdbSaveRawString(FILE *fp, unsigned char *s, size_t len) {
3154 int enclen;
3155
3156 /* Try integer encoding */
3157 if (len <= 11) {
3158 unsigned char buf[5];
3159 if ((enclen = rdbTryIntegerEncoding((char*)s,len,buf)) > 0) {
3160 if (fwrite(buf,enclen,1,fp) == 0) return -1;
3161 return 0;
3162 }
3163 }
3164
3165 /* Try LZF compression - under 20 bytes it's unable to compress even
3166 * aaaaaaaaaaaaaaaaaa so skip it */
3167 if (server.rdbcompression && len > 20) {
3168 int retval;
3169
3170 retval = rdbSaveLzfStringObject(fp,s,len);
3171 if (retval == -1) return -1;
3172 if (retval > 0) return 0;
3173 /* retval == 0 means data can't be compressed, save the old way */
3174 }
3175
3176 /* Store verbatim */
3177 if (rdbSaveLen(fp,len) == -1) return -1;
3178 if (len && fwrite(s,len,1,fp) == 0) return -1;
3179 return 0;
3180 }
3181
3182 /* Like rdbSaveStringObjectRaw() but handle encoded objects */
3183 static int rdbSaveStringObject(FILE *fp, robj *obj) {
3184 int retval;
3185
3186 /* Avoid incr/decr ref count business when possible.
3187 * This plays well with copy-on-write given that we are probably
3188 * in a child process (BGSAVE). Also this makes sure key objects
3189 * of swapped objects are not incRefCount-ed (an assert does not allow
3190 * this in order to avoid bugs) */
3191 if (obj->encoding != REDIS_ENCODING_RAW) {
3192 obj = getDecodedObject(obj);
3193 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
3194 decrRefCount(obj);
3195 } else {
3196 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
3197 }
3198 return retval;
3199 }
3200
/* Save a double value. Doubles are saved as strings prefixed by an unsigned
 * 8 bit integer specifying the length of the representation.
 * This 8 bit integer has special values in order to specify the following
 * conditions:
 * 253: not a number
 * 254: + inf
 * 255: - inf
 *
 * Returns 0 on success, -1 on write error. */
static int rdbSaveDoubleValue(FILE *fp, double val) {
    unsigned char out[128];
    size_t outlen = 1; /* the prefix byte is always present */

    if (isnan(val)) {
        out[0] = 253;
    } else if (!isfinite(val)) {
        out[0] = (val < 0) ? 255 : 254;
    } else {
        snprintf((char*)out+1,sizeof(out)-1,"%.17g",val);
        out[0] = strlen((char*)out+1);
        outlen += out[0];
    }
    return (fwrite(out,outlen,1,fp) == 0) ? -1 : 0;
}
3227
3228 /* Save a Redis object. */
3229 static int rdbSaveObject(FILE *fp, robj *o) {
3230 if (o->type == REDIS_STRING) {
3231 /* Save a string value */
3232 if (rdbSaveStringObject(fp,o) == -1) return -1;
3233 } else if (o->type == REDIS_LIST) {
3234 /* Save a list value */
3235 list *list = o->ptr;
3236 listIter li;
3237 listNode *ln;
3238
3239 if (rdbSaveLen(fp,listLength(list)) == -1) return -1;
3240 listRewind(list,&li);
3241 while((ln = listNext(&li))) {
3242 robj *eleobj = listNodeValue(ln);
3243
3244 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3245 }
3246 } else if (o->type == REDIS_SET) {
3247 /* Save a set value */
3248 dict *set = o->ptr;
3249 dictIterator *di = dictGetIterator(set);
3250 dictEntry *de;
3251
3252 if (rdbSaveLen(fp,dictSize(set)) == -1) return -1;
3253 while((de = dictNext(di)) != NULL) {
3254 robj *eleobj = dictGetEntryKey(de);
3255
3256 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3257 }
3258 dictReleaseIterator(di);
3259 } else if (o->type == REDIS_ZSET) {
3260 /* Save a set value */
3261 zset *zs = o->ptr;
3262 dictIterator *di = dictGetIterator(zs->dict);
3263 dictEntry *de;
3264
3265 if (rdbSaveLen(fp,dictSize(zs->dict)) == -1) return -1;
3266 while((de = dictNext(di)) != NULL) {
3267 robj *eleobj = dictGetEntryKey(de);
3268 double *score = dictGetEntryVal(de);
3269
3270 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3271 if (rdbSaveDoubleValue(fp,*score) == -1) return -1;
3272 }
3273 dictReleaseIterator(di);
3274 } else if (o->type == REDIS_HASH) {
3275 /* Save a hash value */
3276 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
3277 unsigned char *p = zipmapRewind(o->ptr);
3278 unsigned int count = zipmapLen(o->ptr);
3279 unsigned char *key, *val;
3280 unsigned int klen, vlen;
3281
3282 if (rdbSaveLen(fp,count) == -1) return -1;
3283 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
3284 if (rdbSaveRawString(fp,key,klen) == -1) return -1;
3285 if (rdbSaveRawString(fp,val,vlen) == -1) return -1;
3286 }
3287 } else {
3288 dictIterator *di = dictGetIterator(o->ptr);
3289 dictEntry *de;
3290
3291 if (rdbSaveLen(fp,dictSize((dict*)o->ptr)) == -1) return -1;
3292 while((de = dictNext(di)) != NULL) {
3293 robj *key = dictGetEntryKey(de);
3294 robj *val = dictGetEntryVal(de);
3295
3296 if (rdbSaveStringObject(fp,key) == -1) return -1;
3297 if (rdbSaveStringObject(fp,val) == -1) return -1;
3298 }
3299 dictReleaseIterator(di);
3300 }
3301 } else {
3302 redisAssert(0);
3303 }
3304 return 0;
3305 }
3306
3307 /* Return the length the object will have on disk if saved with
3308 * the rdbSaveObject() function. Currently we use a trick to get
3309 * this length with very little changes to the code. In the future
3310 * we could switch to a faster solution. */
3311 static off_t rdbSavedObjectLen(robj *o, FILE *fp) {
3312 if (fp == NULL) fp = server.devnull;
3313 rewind(fp);
3314 assert(rdbSaveObject(fp,o) != 1);
3315 return ftello(fp);
3316 }
3317
3318 /* Return the number of pages required to save this object in the swap file */
3319 static off_t rdbSavedObjectPages(robj *o, FILE *fp) {
3320 off_t bytes = rdbSavedObjectLen(o,fp);
3321
3322 return (bytes+(server.vm_page_size-1))/server.vm_page_size;
3323 }
3324
3325 /* Save the DB on disk. Return REDIS_ERR on error, REDIS_OK on success */
3326 static int rdbSave(char *filename) {
3327 dictIterator *di = NULL;
3328 dictEntry *de;
3329 FILE *fp;
3330 char tmpfile[256];
3331 int j;
3332 time_t now = time(NULL);
3333
3334 /* Wait for I/O therads to terminate, just in case this is a
3335 * foreground-saving, to avoid seeking the swap file descriptor at the
3336 * same time. */
3337 if (server.vm_enabled)
3338 waitEmptyIOJobsQueue();
3339
3340 snprintf(tmpfile,256,"temp-%d.rdb", (int) getpid());
3341 fp = fopen(tmpfile,"w");
3342 if (!fp) {
3343 redisLog(REDIS_WARNING, "Failed saving the DB: %s", strerror(errno));
3344 return REDIS_ERR;
3345 }
3346 if (fwrite("REDIS0001",9,1,fp) == 0) goto werr;
3347 for (j = 0; j < server.dbnum; j++) {
3348 redisDb *db = server.db+j;
3349 dict *d = db->dict;
3350 if (dictSize(d) == 0) continue;
3351 di = dictGetIterator(d);
3352 if (!di) {
3353 fclose(fp);
3354 return REDIS_ERR;
3355 }
3356
3357 /* Write the SELECT DB opcode */
3358 if (rdbSaveType(fp,REDIS_SELECTDB) == -1) goto werr;
3359 if (rdbSaveLen(fp,j) == -1) goto werr;
3360
3361 /* Iterate this DB writing every entry */
3362 while((de = dictNext(di)) != NULL) {
3363 robj *key = dictGetEntryKey(de);
3364 robj *o = dictGetEntryVal(de);
3365 time_t expiretime = getExpire(db,key);
3366
3367 /* Save the expire time */
3368 if (expiretime != -1) {
3369 /* If this key is already expired skip it */
3370 if (expiretime < now) continue;
3371 if (rdbSaveType(fp,REDIS_EXPIRETIME) == -1) goto werr;
3372 if (rdbSaveTime(fp,expiretime) == -1) goto werr;
3373 }
3374 /* Save the key and associated value. This requires special
3375 * handling if the value is swapped out. */
3376 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
3377 key->storage == REDIS_VM_SWAPPING) {
3378 /* Save type, key, value */
3379 if (rdbSaveType(fp,o->type) == -1) goto werr;
3380 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3381 if (rdbSaveObject(fp,o) == -1) goto werr;
3382 } else {
3383 /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
3384 robj *po;
3385 /* Get a preview of the object in memory */
3386 po = vmPreviewObject(key);
3387 /* Save type, key, value */
3388 if (rdbSaveType(fp,key->vtype) == -1) goto werr;
3389 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3390 if (rdbSaveObject(fp,po) == -1) goto werr;
3391 /* Remove the loaded object from memory */
3392 decrRefCount(po);
3393 }
3394 }
3395 dictReleaseIterator(di);
3396 }
3397 /* EOF opcode */
3398 if (rdbSaveType(fp,REDIS_EOF) == -1) goto werr;
3399
3400 /* Make sure data will not remain on the OS's output buffers */
3401 fflush(fp);
3402 fsync(fileno(fp));
3403 fclose(fp);
3404
3405 /* Use RENAME to make sure the DB file is changed atomically only
3406 * if the generate DB file is ok. */
3407 if (rename(tmpfile,filename) == -1) {
3408 redisLog(REDIS_WARNING,"Error moving temp DB file on the final destination: %s", strerror(errno));
3409 unlink(tmpfile);
3410 return REDIS_ERR;
3411 }
3412 redisLog(REDIS_NOTICE,"DB saved on disk");
3413 server.dirty = 0;
3414 server.lastsave = time(NULL);
3415 return REDIS_OK;
3416
3417 werr:
3418 fclose(fp);
3419 unlink(tmpfile);
3420 redisLog(REDIS_WARNING,"Write error saving DB on disk: %s", strerror(errno));
3421 if (di) dictReleaseIterator(di);
3422 return REDIS_ERR;
3423 }
3424
3425 static int rdbSaveBackground(char *filename) {
3426 pid_t childpid;
3427
3428 if (server.bgsavechildpid != -1) return REDIS_ERR;
3429 if (server.vm_enabled) waitEmptyIOJobsQueue();
3430 if ((childpid = fork()) == 0) {
3431 /* Child */
3432 if (server.vm_enabled) vmReopenSwapFile();
3433 close(server.fd);
3434 if (rdbSave(filename) == REDIS_OK) {
3435 _exit(0);
3436 } else {
3437 _exit(1);
3438 }
3439 } else {
3440 /* Parent */
3441 if (childpid == -1) {
3442 redisLog(REDIS_WARNING,"Can't save in background: fork: %s",
3443 strerror(errno));
3444 return REDIS_ERR;
3445 }
3446 redisLog(REDIS_NOTICE,"Background saving started by pid %d",childpid);
3447 server.bgsavechildpid = childpid;
3448 return REDIS_OK;
3449 }
3450 return REDIS_OK; /* unreached */
3451 }
3452
/* Remove the temporary dump file "temp-<pid>.rdb" of the process
 * 'childpid'. The result of unlink() is not checked. */
static void rdbRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-%d.rdb", (int) childpid);
    unlink(path);
}
3459
/* Read a one byte type from 'fp'. Returns the byte value (0-255),
 * or -1 when nothing could be read. */
static int rdbLoadType(FILE *fp) {
    unsigned char opcode;

    return (fread(&opcode,1,1,fp) == 0) ? -1 : opcode;
}
3465
/* Read a time stored as a 32 bit signed integer (host byte order) from
 * 'fp'. Returns -1 when the value could not be read. */
static time_t rdbLoadTime(FILE *fp) {
    int32_t stamp;

    if (fread(&stamp,sizeof(stamp),1,fp) == 0) return -1;
    return (time_t) stamp;
}
3471
3472 /* Load an encoded length from the DB, see the REDIS_RDB_* defines on the top
3473 * of this file for a description of how this are stored on disk.
3474 *
3475 * isencoded is set to 1 if the readed length is not actually a length but
3476 * an "encoding type", check the above comments for more info */
3477 static uint32_t rdbLoadLen(FILE *fp, int *isencoded) {
3478 unsigned char buf[2];
3479 uint32_t len;
3480 int type;
3481
3482 if (isencoded) *isencoded = 0;
3483 if (fread(buf,1,1,fp) == 0) return REDIS_RDB_LENERR;
3484 type = (buf[0]&0xC0)>>6;
3485 if (type == REDIS_RDB_6BITLEN) {
3486 /* Read a 6 bit len */
3487 return buf[0]&0x3F;
3488 } else if (type == REDIS_RDB_ENCVAL) {
3489 /* Read a 6 bit len encoding type */
3490 if (isencoded) *isencoded = 1;
3491 return buf[0]&0x3F;
3492 } else if (type == REDIS_RDB_14BITLEN) {
3493 /* Read a 14 bit len */
3494 if (fread(buf+1,1,1,fp) == 0) return REDIS_RDB_LENERR;
3495 return ((buf[0]&0x3F)<<8)|buf[1];
3496 } else {
3497 /* Read a 32 bit len */
3498 if (fread(&len,4,1,fp) == 0) return REDIS_RDB_LENERR;
3499 return ntohl(len);
3500 }
3501 }
3502
3503 static robj *rdbLoadIntegerObject(FILE *fp, int enctype) {
3504 unsigned char enc[4];
3505 long long val;
3506
3507 if (enctype == REDIS_RDB_ENC_INT8) {
3508 if (fread(enc,1,1,fp) == 0) return NULL;
3509 val = (signed char)enc[0];
3510 } else if (enctype == REDIS_RDB_ENC_INT16) {
3511 uint16_t v;
3512 if (fread(enc,2,1,fp) == 0) return NULL;
3513 v = enc[0]|(enc[1]<<8);
3514 val = (int16_t)v;
3515 } else if (enctype == REDIS_RDB_ENC_INT32) {
3516 uint32_t v;
3517 if (fread(enc,4,1,fp) == 0) return NULL;
3518 v = enc[0]|(enc[1]<<8)|(enc[2]<<16)|(enc[3]<<24);
3519 val = (int32_t)v;
3520 } else {
3521 val = 0; /* anti-warning */
3522 redisAssert(0);
3523 }
3524 return createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",val));
3525 }
3526
3527 static robj *rdbLoadLzfStringObject(FILE*fp) {
3528 unsigned int len, clen;
3529 unsigned char *c = NULL;
3530 sds val = NULL;
3531
3532 if ((clen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3533 if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3534 if ((c = zmalloc(clen)) == NULL) goto err;
3535 if ((val = sdsnewlen(NULL,len)) == NULL) goto err;
3536 if (fread(c,clen,1,fp) == 0) goto err;
3537 if (lzf_decompress(c,clen,val,len) == 0) goto err;
3538 zfree(c);
3539 return createObject(REDIS_STRING,val);
3540 err:
3541 zfree(c);
3542 sdsfree(val);
3543 return NULL;
3544 }
3545
3546 static robj *rdbLoadStringObject(FILE*fp) {
3547 int isencoded;
3548 uint32_t len;
3549 sds val;
3550
3551 len = rdbLoadLen(fp,&isencoded);
3552 if (isencoded) {
3553 switch(len) {
3554 case REDIS_RDB_ENC_INT8:
3555 case REDIS_RDB_ENC_INT16:
3556 case REDIS_RDB_ENC_INT32:
3557 return tryObjectSharing(rdbLoadIntegerObject(fp,len));
3558 case REDIS_RDB_ENC_LZF:
3559 return tryObjectSharing(rdbLoadLzfStringObject(fp));
3560 default:
3561 redisAssert(0);
3562 }
3563 }
3564
3565 if (len == REDIS_RDB_LENERR) return NULL;
3566 val = sdsnewlen(NULL,len);
3567 if (len && fread(val,len,1,fp) == 0) {
3568 sdsfree(val);
3569 return NULL;
3570 }
3571 return tryObjectSharing(createObject(REDIS_STRING,val));
3572 }
3573
3574 /* For information about double serialization check rdbSaveDoubleValue() */
3575 static int rdbLoadDoubleValue(FILE *fp, double *val) {
3576 char buf[128];
3577 unsigned char len;
3578
3579 if (fread(&len,1,1,fp) == 0) return -1;
3580 switch(len) {
3581 case 255: *val = R_NegInf; return 0;
3582 case 254: *val = R_PosInf; return 0;
3583 case 253: *val = R_Nan; return 0;
3584 default:
3585 if (fread(buf,len,1,fp) == 0) return -1;
3586 buf[len] = '\0';
3587 sscanf(buf, "%lg", val);
3588 return 0;
3589 }
3590 }
3591
3592 /* Load a Redis object of the specified type from the specified file.
3593 * On success a newly allocated object is returned, otherwise NULL. */
3594 static robj *rdbLoadObject(int type, FILE *fp) {
3595 robj *o;
3596
3597 redisLog(REDIS_DEBUG,"LOADING OBJECT %d (at %d)\n",type,ftell(fp));
3598 if (type == REDIS_STRING) {
3599 /* Read string value */
3600 if ((o = rdbLoadStringObject(fp)) == NULL) return NULL;
3601 tryObjectEncoding(o);
3602 } else if (type == REDIS_LIST || type == REDIS_SET) {
3603 /* Read list/set value */
3604 uint32_t listlen;
3605
3606 if ((listlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3607 o = (type == REDIS_LIST) ? createListObject() : createSetObject();
3608 /* It's faster to expand the dict to the right size asap in order
3609 * to avoid rehashing */
3610 if (type == REDIS_SET && listlen > DICT_HT_INITIAL_SIZE)
3611 dictExpand(o->ptr,listlen);
3612 /* Load every single element of the list/set */
3613 while(listlen--) {
3614 robj *ele;
3615
3616 if ((ele = rdbLoadStringObject(fp)) == NULL) return NULL;
3617 tryObjectEncoding(ele);
3618 if (type == REDIS_LIST) {
3619 listAddNodeTail((list*)o->ptr,ele);
3620 } else {
3621 dictAdd((dict*)o->ptr,ele,NULL);
3622 }
3623 }
3624 } else if (type == REDIS_ZSET) {
3625 /* Read list/set value */
3626 size_t zsetlen;
3627 zset *zs;
3628
3629 if ((zsetlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3630 o = createZsetObject();
3631 zs = o->ptr;
3632 /* Load every single element of the list/set */
3633 while(zsetlen--) {
3634 robj *ele;
3635 double *score = zmalloc(sizeof(double));
3636
3637 if ((ele = rdbLoadStringObject(fp)) == NULL) return NULL;
3638 tryObjectEncoding(ele);
3639 if (rdbLoadDoubleValue(fp,score) == -1) return NULL;
3640 dictAdd(zs->dict,ele,score);
3641 zslInsert(zs->zsl,*score,ele);
3642 incrRefCount(ele); /* added to skiplist */
3643 }
3644 } else if (type == REDIS_HASH) {
3645 size_t hashlen;
3646
3647 if ((hashlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3648 o = createHashObject();
3649 /* Too many entries? Use an hash table. */
3650 if (hashlen > server.hash_max_zipmap_entries)
3651 convertToRealHash(o);
3652 /* Load every key/value, then set it into the zipmap or hash
3653 * table, as needed. */
3654 while(hashlen--) {
3655 robj *key, *val;
3656
3657 if ((key = rdbLoadStringObject(fp)) == NULL) return NULL;
3658 if ((val = rdbLoadStringObject(fp)) == NULL) return NULL;
3659 /* If we are using a zipmap and there are too big values
3660 * the object is converted to real hash table encoding. */
3661 if (o->encoding != REDIS_ENCODING_HT &&
3662 (sdslen(key->ptr) > server.hash_max_zipmap_value ||
3663 sdslen(val->ptr) > server.hash_max_zipmap_value))
3664 {
3665 convertToRealHash(o);
3666 }
3667
3668 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
3669 unsigned char *zm = o->ptr;
3670
3671 zm = zipmapSet(zm,key->ptr,sdslen(key->ptr),
3672 val->ptr,sdslen(val->ptr),NULL);
3673 o->ptr = zm;
3674 decrRefCount(key);
3675 decrRefCount(val);
3676 } else {
3677 tryObjectEncoding(key);
3678 tryObjectEncoding(val);
3679 dictAdd((dict*)o->ptr,key,val);
3680 }
3681 }
3682 } else {
3683 redisAssert(0);
3684 }
3685 return o;
3686 }
3687
3688 static int rdbLoad(char *filename) {
3689 FILE *fp;
3690 robj *keyobj = NULL;
3691 uint32_t dbid;
3692 int type, retval, rdbver;
3693 dict *d = server.db[0].dict;
3694 redisDb *db = server.db+0;
3695 char buf[1024];
3696 time_t expiretime = -1, now = time(NULL);
3697 long long loadedkeys = 0;
3698
3699 fp = fopen(filename,"r");
3700 if (!fp) return REDIS_ERR;
3701 if (fread(buf,9,1,fp) == 0) goto eoferr;
3702 buf[9] = '\0';
3703 if (memcmp(buf,"REDIS",5) != 0) {
3704 fclose(fp);
3705 redisLog(REDIS_WARNING,"Wrong signature trying to load DB from file");
3706 return REDIS_ERR;
3707 }
3708 rdbver = atoi(buf+5);
3709 if (rdbver != 1) {
3710 fclose(fp);
3711 redisLog(REDIS_WARNING,"Can't handle RDB format version %d",rdbver);
3712 return REDIS_ERR;
3713 }
3714 while(1) {
3715 robj *o;
3716
3717 /* Read type. */
3718 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
3719 if (type == REDIS_EXPIRETIME) {
3720 if ((expiretime = rdbLoadTime(fp)) == -1) goto eoferr;
3721 /* We read the time so we need to read the object type again */
3722 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
3723 }
3724 if (type == REDIS_EOF) break;
3725 /* Handle SELECT DB opcode as a special case */
3726 if (type == REDIS_SELECTDB) {
3727 if ((dbid = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR)
3728 goto eoferr;
3729 if (dbid >= (unsigned)server.dbnum) {
3730 redisLog(REDIS_WARNING,"FATAL: Data file was created with a Redis server configured to handle more than %d databases. Exiting\n", server.dbnum);
3731 exit(1);
3732 }
3733 db = server.db+dbid;
3734 d = db->dict;
3735 continue;
3736 }
3737 /* Read key */
3738 if ((keyobj = rdbLoadStringObject(fp)) == NULL) goto eoferr;
3739 /* Read value */
3740 if ((o = rdbLoadObject(type,fp)) == NULL) goto eoferr;
3741 /* Add the new object in the hash table */
3742 retval = dictAdd(d,keyobj,o);
3743 if (retval == DICT_ERR) {
3744 redisLog(REDIS_WARNING,"Loading DB, duplicated key (%s) found! Unrecoverable error, exiting now.", keyobj->ptr);
3745 exit(1);
3746 }
3747 /* Set the expire time if needed */
3748 if (expiretime != -1) {
3749 setExpire(db,keyobj,expiretime);
3750 /* Delete this key if already expired */
3751 if (expiretime < now) deleteKey(db,keyobj);
3752 expiretime = -1;
3753 }
3754 keyobj = o = NULL;
3755 /* Handle swapping while loading big datasets when VM is on */
3756 loadedkeys++;
3757 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
3758 while (zmalloc_used_memory() > server.vm_max_memory) {
3759 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
3760 }
3761 }
3762 }
3763 fclose(fp);
3764 return REDIS_OK;
3765
3766 eoferr: /* unexpected end of file is handled here with a fatal exit */
3767 if (keyobj) decrRefCount(keyobj);
3768 redisLog(REDIS_WARNING,"Short read or OOM loading DB. Unrecoverable error, aborting now.");
3769 exit(1);
3770 return REDIS_ERR; /* Just to avoid warning */
3771 }
3772
3773 /*================================== Commands =============================== */
3774
3775 static void authCommand(redisClient *c) {
3776 if (!server.requirepass || !strcmp(c->argv[1]->ptr, server.requirepass)) {
3777 c->authenticated = 1;
3778 addReply(c,shared.ok);
3779 } else {
3780 c->authenticated = 0;
3781 addReplySds(c,sdscatprintf(sdsempty(),"-ERR invalid password\r\n"));
3782 }
3783 }
3784
3785 static void pingCommand(redisClient *c) {
3786 addReply(c,shared.pong);
3787 }
3788
3789 static void echoCommand(redisClient *c) {
3790 addReplyBulk(c,c->argv[1]);
3791 }
3792
3793 /*=================================== Strings =============================== */
3794
3795 static void setGenericCommand(redisClient *c, int nx) {
3796 int retval;
3797
3798 if (nx) deleteIfVolatile(c->db,c->argv[1]);
3799 retval = dictAdd(c->db->dict,c->argv[1],c->argv[2]);
3800 if (retval == DICT_ERR) {
3801 if (!nx) {
3802 /* If the key is about a swapped value, we want a new key object
3803 * to overwrite the old. So we delete the old key in the database.
3804 * This will also make sure that swap pages about the old object
3805 * will be marked as free. */
3806 if (server.vm_enabled && deleteIfSwapped(c->db,c->argv[1]))
3807 incrRefCount(c->argv[1]);
3808 dictReplace(c->db->dict,c->argv[1],c->argv[2]);
3809 incrRefCount(c->argv[2]);
3810 } else {
3811 addReply(c,shared.czero);
3812 return;
3813 }
3814 } else {
3815 incrRefCount(c->argv[1]);
3816 incrRefCount(c->argv[2]);
3817 }
3818 server.dirty++;
3819 removeExpire(c->db,c->argv[1]);
3820 addReply(c, nx ? shared.cone : shared.ok);
3821 }
3822
3823 static void setCommand(redisClient *c) {
3824 setGenericCommand(c,0);
3825 }
3826
3827 static void setnxCommand(redisClient *c) {
3828 setGenericCommand(c,1);
3829 }
3830
3831 static int getGenericCommand(redisClient *c) {
3832 robj *o;
3833
3834 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL)
3835 return REDIS_OK;
3836
3837 if (o->type != REDIS_STRING) {
3838 addReply(c,shared.wrongtypeerr);
3839 return REDIS_ERR;
3840 } else {
3841 addReplyBulk(c,o);
3842 return REDIS_OK;
3843 }
3844 }
3845
3846 static void getCommand(redisClient *c) {
3847 getGenericCommand(c);
3848 }
3849
3850 static void getsetCommand(redisClient *c) {
3851 if (getGenericCommand(c) == REDIS_ERR) return;
3852 if (dictAdd(c->db->dict,c->argv[1],c->argv[2]) == DICT_ERR) {
3853 dictReplace(c->db->dict,c->argv[1],c->argv[2]);
3854 } else {
3855 incrRefCount(c->argv[1]);
3856 }
3857 incrRefCount(c->argv[2]);
3858 server.dirty++;
3859 removeExpire(c->db,c->argv[1]);
3860 }
3861
3862 static void mgetCommand(redisClient *c) {
3863 int j;
3864
3865 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-1));
3866 for (j = 1; j < c->argc; j++) {
3867 robj *o = lookupKeyRead(c->db,c->argv[j]);
3868 if (o == NULL) {
3869 addReply(c,shared.nullbulk);
3870 } else {
3871 if (o->type != REDIS_STRING) {
3872 addReply(c,shared.nullbulk);
3873 } else {
3874 addReplyBulk(c,o);
3875 }
3876 }
3877 }
3878 }
3879
3880 static void msetGenericCommand(redisClient *c, int nx) {
3881 int j, busykeys = 0;
3882
3883 if ((c->argc % 2) == 0) {
3884 addReplySds(c,sdsnew("-ERR wrong number of arguments for MSET\r\n"));
3885 return;
3886 }
3887 /* Handle the NX flag. The MSETNX semantic is to return zero and don't
3888 * set nothing at all if at least one already key exists. */
3889 if (nx) {
3890 for (j = 1; j < c->argc; j += 2) {
3891 if (lookupKeyWrite(c->db,c->argv[j]) != NULL) {
3892 busykeys++;
3893 }
3894 }
3895 }
3896 if (busykeys) {
3897 addReply(c, shared.czero);
3898 return;
3899 }
3900
3901 for (j = 1; j < c->argc; j += 2) {
3902 int retval;
3903
3904 tryObjectEncoding(c->argv[j+1]);
3905 retval = dictAdd(c->db->dict,c->argv[j],c->argv[j+1]);
3906 if (retval == DICT_ERR) {
3907 dictReplace(c->db->dict,c->argv[j],c->argv[j+1]);
3908 incrRefCount(c->argv[j+1]);
3909 } else {
3910 incrRefCount(c->argv[j]);
3911 incrRefCount(c->argv[j+1]);
3912 }
3913 removeExpire(c->db,c->argv[j]);
3914 }
3915 server.dirty += (c->argc-1)/2;
3916 addReply(c, nx ? shared.cone : shared.ok);
3917 }
3918
3919 static void msetCommand(redisClient *c) {
3920 msetGenericCommand(c,0);
3921 }
3922
3923 static void msetnxCommand(redisClient *c) {
3924 msetGenericCommand(c,1);
3925 }
3926
3927 static void incrDecrCommand(redisClient *c, long long incr) {
3928 long long value;
3929 int retval;
3930 robj *o;
3931
3932 o = lookupKeyWrite(c->db,c->argv[1]);
3933 if (o == NULL) {
3934 value = 0;
3935 } else {
3936 if (o->type != REDIS_STRING) {
3937 value = 0;
3938 } else {
3939 char *eptr;
3940
3941 if (o->encoding == REDIS_ENCODING_RAW)
3942 value = strtoll(o->ptr, &eptr, 10);
3943 else if (o->encoding == REDIS_ENCODING_INT)
3944 value = (long)o->ptr;
3945 else
3946 redisAssert(1 != 1);
3947 }
3948 }
3949
3950 value += incr;
3951 o = createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",value));
3952 tryObjectEncoding(o);
3953 retval = dictAdd(c->db->dict,c->argv[1],o);
3954 if (retval == DICT_ERR) {
3955 dictReplace(c->db->dict,c->argv[1],o);
3956 removeExpire(c->db,c->argv[1]);
3957 } else {
3958 incrRefCount(c->argv[1]);
3959 }
3960 server.dirty++;
3961 addReply(c,shared.colon);
3962 addReply(c,o);
3963 addReply(c,shared.crlf);
3964 }
3965
3966 static void incrCommand(redisClient *c) {
3967 incrDecrCommand(c,1);
3968 }
3969
3970 static void decrCommand(redisClient *c) {
3971 incrDecrCommand(c,-1);
3972 }
3973
3974 static void incrbyCommand(redisClient *c) {
3975 long long incr = strtoll(c->argv[2]->ptr, NULL, 10);
3976 incrDecrCommand(c,incr);
3977 }
3978
3979 static void decrbyCommand(redisClient *c) {
3980 long long incr = strtoll(c->argv[2]->ptr, NULL, 10);
3981 incrDecrCommand(c,-incr);
3982 }
3983
3984 static void appendCommand(redisClient *c) {
3985 int retval;
3986 size_t totlen;
3987 robj *o;
3988
3989 o = lookupKeyWrite(c->db,c->argv[1]);
3990 if (o == NULL) {
3991 /* Create the key */
3992 retval = dictAdd(c->db->dict,c->argv[1],c->argv[2]);
3993 incrRefCount(c->argv[1]);
3994 incrRefCount(c->argv[2]);
3995 totlen = stringObjectLen(c->argv[2]);
3996 } else {
3997 dictEntry *de;
3998
3999 de = dictFind(c->db->dict,c->argv[1]);
4000 assert(de != NULL);
4001
4002 o = dictGetEntryVal(de);
4003 if (o->type != REDIS_STRING) {
4004 addReply(c,shared.wrongtypeerr);
4005 return;
4006 }
4007 /* If the object is specially encoded or shared we have to make
4008 * a copy */
4009 if (o->refcount != 1 || o->encoding != REDIS_ENCODING_RAW) {
4010 robj *decoded = getDecodedObject(o);
4011
4012 o = createStringObject(decoded->ptr, sdslen(decoded->ptr));
4013 decrRefCount(decoded);
4014 dictReplace(c->db->dict,c->argv[1],o);
4015 }
4016 /* APPEND! */
4017 if (c->argv[2]->encoding == REDIS_ENCODING_RAW) {
4018 o->ptr = sdscatlen(o->ptr,
4019 c->argv[2]->ptr, sdslen(c->argv[2]->ptr));
4020 } else {
4021 o->ptr = sdscatprintf(o->ptr, "%ld",
4022 (unsigned long) c->argv[2]->ptr);
4023 }
4024 totlen = sdslen(o->ptr);
4025 }
4026 server.dirty++;
4027 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",(unsigned long)totlen));
4028 }
4029
4030 static void substrCommand(redisClient *c) {
4031 robj *o;
4032 long start = atoi(c->argv[2]->ptr);
4033 long end = atoi(c->argv[3]->ptr);
4034 size_t rangelen, strlen;
4035 sds range;
4036
4037 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4038 checkType(c,o,REDIS_STRING)) return;
4039
4040 o = getDecodedObject(o);
4041 strlen = sdslen(o->ptr);
4042
4043 /* convert negative indexes */
4044 if (start < 0) start = strlen+start;
4045 if (end < 0) end = strlen+end;
4046 if (start < 0) start = 0;
4047 if (end < 0) end = 0;
4048
4049 /* indexes sanity checks */
4050 if (start > end || (size_t)start >= strlen) {
4051 /* Out of range start or start > end result in null reply */
4052 addReply(c,shared.nullbulk);
4053 decrRefCount(o);
4054 return;
4055 }
4056 if ((size_t)end >= strlen) end = strlen-1;
4057 rangelen = (end-start)+1;
4058
4059 /* Return the result */
4060 addReplySds(c,sdscatprintf(sdsempty(),"$%zu\r\n",rangelen));
4061 range = sdsnewlen((char*)o->ptr+start,rangelen);
4062 addReplySds(c,range);
4063 addReply(c,shared.crlf);
4064 decrRefCount(o);
4065 }
4066
4067 /* ========================= Type agnostic commands ========================= */
4068
4069 static void delCommand(redisClient *c) {
4070 int deleted = 0, j;
4071
4072 for (j = 1; j < c->argc; j++) {
4073 if (deleteKey(c->db,c->argv[j])) {
4074 server.dirty++;
4075 deleted++;
4076 }
4077 }
4078 addReplyLong(c,deleted);
4079 }
4080
4081 static void existsCommand(redisClient *c) {
4082 addReply(c,lookupKeyRead(c->db,c->argv[1]) ? shared.cone : shared.czero);
4083 }
4084
4085 static void selectCommand(redisClient *c) {
4086 int id = atoi(c->argv[1]->ptr);
4087
4088 if (selectDb(c,id) == REDIS_ERR) {
4089 addReplySds(c,sdsnew("-ERR invalid DB index\r\n"));
4090 } else {
4091 addReply(c,shared.ok);
4092 }
4093 }
4094
4095 static void randomkeyCommand(redisClient *c) {
4096 dictEntry *de;
4097
4098 while(1) {
4099 de = dictGetRandomKey(c->db->dict);
4100 if (!de || expireIfNeeded(c->db,dictGetEntryKey(de)) == 0) break;
4101 }
4102 if (de == NULL) {
4103 addReply(c,shared.plus);
4104 addReply(c,shared.crlf);
4105 } else {
4106 addReply(c,shared.plus);
4107 addReply(c,dictGetEntryKey(de));
4108 addReply(c,shared.crlf);
4109 }
4110 }
4111
4112 static void keysCommand(redisClient *c) {
4113 dictIterator *di;
4114 dictEntry *de;
4115 sds pattern = c->argv[1]->ptr;
4116 int plen = sdslen(pattern);
4117 unsigned long numkeys = 0;
4118 robj *lenobj = createObject(REDIS_STRING,NULL);
4119
4120 di = dictGetIterator(c->db->dict);
4121 addReply(c,lenobj);
4122 decrRefCount(lenobj);
4123 while((de = dictNext(di)) != NULL) {
4124 robj *keyobj = dictGetEntryKey(de);
4125
4126 sds key = keyobj->ptr;
4127 if ((pattern[0] == '*' && pattern[1] == '\0') ||
4128 stringmatchlen(pattern,plen,key,sdslen(key),0)) {
4129 if (expireIfNeeded(c->db,keyobj) == 0) {
4130 addReplyBulk(c,keyobj);
4131 numkeys++;
4132 }
4133 }
4134 }
4135 dictReleaseIterator(di);
4136 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",numkeys);
4137 }
4138
4139 static void dbsizeCommand(redisClient *c) {
4140 addReplySds(c,
4141 sdscatprintf(sdsempty(),":%lu\r\n",dictSize(c->db->dict)));
4142 }
4143
4144 static void lastsaveCommand(redisClient *c) {
4145 addReplySds(c,
4146 sdscatprintf(sdsempty(),":%lu\r\n",server.lastsave));
4147 }
4148
4149 static void typeCommand(redisClient *c) {
4150 robj *o;
4151 char *type;
4152
4153 o = lookupKeyRead(c->db,c->argv[1]);
4154 if (o == NULL) {
4155 type = "+none";
4156 } else {
4157 switch(o->type) {
4158 case REDIS_STRING: type = "+string"; break;
4159 case REDIS_LIST: type = "+list"; break;
4160 case REDIS_SET: type = "+set"; break;
4161 case REDIS_ZSET: type = "+zset"; break;
4162 case REDIS_HASH: type = "+hash"; break;
4163 default: type = "+unknown"; break;
4164 }
4165 }
4166 addReplySds(c,sdsnew(type));
4167 addReply(c,shared.crlf);
4168 }
4169
4170 static void saveCommand(redisClient *c) {
4171 if (server.bgsavechildpid != -1) {
4172 addReplySds(c,sdsnew("-ERR background save in progress\r\n"));
4173 return;
4174 }
4175 if (rdbSave(server.dbfilename) == REDIS_OK) {
4176 addReply(c,shared.ok);
4177 } else {
4178 addReply(c,shared.err);
4179 }
4180 }
4181
4182 static void bgsaveCommand(redisClient *c) {
4183 if (server.bgsavechildpid != -1) {
4184 addReplySds(c,sdsnew("-ERR background save already in progress\r\n"));
4185 return;
4186 }
4187 if (rdbSaveBackground(server.dbfilename) == REDIS_OK) {
4188 char *status = "+Background saving started\r\n";
4189 addReplySds(c,sdsnew(status));
4190 } else {
4191 addReply(c,shared.err);
4192 }
4193 }
4194
4195 static void shutdownCommand(redisClient *c) {
4196 redisLog(REDIS_WARNING,"User requested shutdown, saving DB...");
4197 /* Kill the saving child if there is a background saving in progress.
4198 We want to avoid race conditions, for instance our saving child may
4199 overwrite the synchronous saving did by SHUTDOWN. */
4200 if (server.bgsavechildpid != -1) {
4201 redisLog(REDIS_WARNING,"There is a live saving child. Killing it!");
4202 kill(server.bgsavechildpid,SIGKILL);
4203 rdbRemoveTempFile(server.bgsavechildpid);
4204 }
4205 if (server.appendonly) {
4206 /* Append only file: fsync() the AOF and exit */
4207 fsync(server.appendfd);
4208 if (server.vm_enabled) unlink(server.vm_swap_file);
4209 exit(0);
4210 } else {
4211 /* Snapshotting. Perform a SYNC SAVE and exit */
4212 if (rdbSave(server.dbfilename) == REDIS_OK) {
4213 if (server.daemonize)
4214 unlink(server.pidfile);
4215 redisLog(REDIS_WARNING,"%zu bytes used at exit",zmalloc_used_memory());
4216 redisLog(REDIS_WARNING,"Server exit now, bye bye...");
4217 if (server.vm_enabled) unlink(server.vm_swap_file);
4218 exit(0);
4219 } else {
4220 /* Ooops.. error saving! The best we can do is to continue
4221 * operating. Note that if there was a background saving process,
4222 * in the next cron() Redis will be notified that the background
4223 * saving aborted, handling special stuff like slaves pending for
4224 * synchronization... */
4225 redisLog(REDIS_WARNING,"Error trying to save the DB, can't exit");
4226 addReplySds(c,
4227 sdsnew("-ERR can't quit, problems saving the DB\r\n"));
4228 }
4229 }
4230 }
4231
4232 static void renameGenericCommand(redisClient *c, int nx) {
4233 robj *o;
4234
4235 /* To use the same key as src and dst is probably an error */
4236 if (sdscmp(c->argv[1]->ptr,c->argv[2]->ptr) == 0) {
4237 addReply(c,shared.sameobjecterr);
4238 return;
4239 }
4240
4241 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL)
4242 return;
4243
4244 incrRefCount(o);
4245 deleteIfVolatile(c->db,c->argv[2]);
4246 if (dictAdd(c->db->dict,c->argv[2],o) == DICT_ERR) {
4247 if (nx) {
4248 decrRefCount(o);
4249 addReply(c,shared.czero);
4250 return;
4251 }
4252 dictReplace(c->db->dict,c->argv[2],o);
4253 } else {
4254 incrRefCount(c->argv[2]);
4255 }
4256 deleteKey(c->db,c->argv[1]);
4257 server.dirty++;
4258 addReply(c,nx ? shared.cone : shared.ok);
4259 }
4260
4261 static void renameCommand(redisClient *c) {
4262 renameGenericCommand(c,0);
4263 }
4264
4265 static void renamenxCommand(redisClient *c) {
4266 renameGenericCommand(c,1);
4267 }
4268
4269 static void moveCommand(redisClient *c) {
4270 robj *o;
4271 redisDb *src, *dst;
4272 int srcid;
4273
4274 /* Obtain source and target DB pointers */
4275 src = c->db;
4276 srcid = c->db->id;
4277 if (selectDb(c,atoi(c->argv[2]->ptr)) == REDIS_ERR) {
4278 addReply(c,shared.outofrangeerr);
4279 return;
4280 }
4281 dst = c->db;
4282 selectDb(c,srcid); /* Back to the source DB */
4283
4284 /* If the user is moving using as target the same
4285 * DB as the source DB it is probably an error. */
4286 if (src == dst) {
4287 addReply(c,shared.sameobjecterr);
4288 return;
4289 }
4290
4291 /* Check if the element exists and get a reference */
4292 o = lookupKeyWrite(c->db,c->argv[1]);
4293 if (!o) {
4294 addReply(c,shared.czero);
4295 return;
4296 }
4297
4298 /* Try to add the element to the target DB */
4299 deleteIfVolatile(dst,c->argv[1]);
4300 if (dictAdd(dst->dict,c->argv[1],o) == DICT_ERR) {
4301 addReply(c,shared.czero);
4302 return;
4303 }
4304 incrRefCount(c->argv[1]);
4305 incrRefCount(o);
4306
4307 /* OK! key moved, free the entry in the source DB */
4308 deleteKey(src,c->argv[1]);
4309 server.dirty++;
4310 addReply(c,shared.cone);
4311 }
4312
4313 /* =================================== Lists ================================ */
4314 static void pushGenericCommand(redisClient *c, int where) {
4315 robj *lobj;
4316 list *list;
4317
4318 lobj = lookupKeyWrite(c->db,c->argv[1]);
4319 if (lobj == NULL) {
4320 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
4321 addReply(c,shared.cone);
4322 return;
4323 }
4324 lobj = createListObject();
4325 list = lobj->ptr;
4326 if (where == REDIS_HEAD) {
4327 listAddNodeHead(list,c->argv[2]);
4328 } else {
4329 listAddNodeTail(list,c->argv[2]);
4330 }
4331 dictAdd(c->db->dict,c->argv[1],lobj);
4332 incrRefCount(c->argv[1]);
4333 incrRefCount(c->argv[2]);
4334 } else {
4335 if (lobj->type != REDIS_LIST) {
4336 addReply(c,shared.wrongtypeerr);
4337 return;
4338 }
4339 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
4340 addReply(c,shared.cone);
4341 return;
4342 }
4343 list = lobj->ptr;
4344 if (where == REDIS_HEAD) {
4345 listAddNodeHead(list,c->argv[2]);
4346 } else {
4347 listAddNodeTail(list,c->argv[2]);
4348 }
4349 incrRefCount(c->argv[2]);
4350 }
4351 server.dirty++;
4352 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",listLength(list)));
4353 }
4354
4355 static void lpushCommand(redisClient *c) {
4356 pushGenericCommand(c,REDIS_HEAD);
4357 }
4358
4359 static void rpushCommand(redisClient *c) {
4360 pushGenericCommand(c,REDIS_TAIL);
4361 }
4362
4363 static void llenCommand(redisClient *c) {
4364 robj *o;
4365 list *l;
4366
4367 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4368 checkType(c,o,REDIS_LIST)) return;
4369
4370 l = o->ptr;
4371 addReplyUlong(c,listLength(l));
4372 }
4373
4374 static void lindexCommand(redisClient *c) {
4375 robj *o;
4376 int index = atoi(c->argv[2]->ptr);
4377 list *list;
4378 listNode *ln;
4379
4380 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4381 checkType(c,o,REDIS_LIST)) return;
4382 list = o->ptr;
4383
4384 ln = listIndex(list, index);
4385 if (ln == NULL) {
4386 addReply(c,shared.nullbulk);
4387 } else {
4388 robj *ele = listNodeValue(ln);
4389 addReplyBulk(c,ele);
4390 }
4391 }
4392
4393 static void lsetCommand(redisClient *c) {
4394 robj *o;
4395 int index = atoi(c->argv[2]->ptr);
4396 list *list;
4397 listNode *ln;
4398
4399 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL ||
4400 checkType(c,o,REDIS_LIST)) return;
4401 list = o->ptr;
4402
4403 ln = listIndex(list, index);
4404 if (ln == NULL) {
4405 addReply(c,shared.outofrangeerr);
4406 } else {
4407 robj *ele = listNodeValue(ln);
4408
4409 decrRefCount(ele);
4410 listNodeValue(ln) = c->argv[3];
4411 incrRefCount(c->argv[3]);
4412 addReply(c,shared.ok);
4413 server.dirty++;
4414 }
4415 }
4416
4417 static void popGenericCommand(redisClient *c, int where) {
4418 robj *o;
4419 list *list;
4420 listNode *ln;
4421
4422 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4423 checkType(c,o,REDIS_LIST)) return;
4424 list = o->ptr;
4425
4426 if (where == REDIS_HEAD)
4427 ln = listFirst(list);
4428 else
4429 ln = listLast(list);
4430
4431 if (ln == NULL) {
4432 addReply(c,shared.nullbulk);
4433 } else {
4434 robj *ele = listNodeValue(ln);
4435 addReplyBulk(c,ele);
4436 listDelNode(list,ln);
4437 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
4438 server.dirty++;
4439 }
4440 }
4441
4442 static void lpopCommand(redisClient *c) {
4443 popGenericCommand(c,REDIS_HEAD);
4444 }
4445
4446 static void rpopCommand(redisClient *c) {
4447 popGenericCommand(c,REDIS_TAIL);
4448 }
4449
4450 static void lrangeCommand(redisClient *c) {
4451 robj *o;
4452 int start = atoi(c->argv[2]->ptr);
4453 int end = atoi(c->argv[3]->ptr);
4454 int llen;
4455 int rangelen, j;
4456 list *list;
4457 listNode *ln;
4458 robj *ele;
4459
4460 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullmultibulk)) == NULL ||
4461 checkType(c,o,REDIS_LIST)) return;
4462 list = o->ptr;
4463 llen = listLength(list);
4464
4465 /* convert negative indexes */
4466 if (start < 0) start = llen+start;
4467 if (end < 0) end = llen+end;
4468 if (start < 0) start = 0;
4469 if (end < 0) end = 0;
4470
4471 /* indexes sanity checks */
4472 if (start > end || start >= llen) {
4473 /* Out of range start or start > end result in empty list */
4474 addReply(c,shared.emptymultibulk);
4475 return;
4476 }
4477 if (end >= llen) end = llen-1;
4478 rangelen = (end-start)+1;
4479
4480 /* Return the result in form of a multi-bulk reply */
4481 ln = listIndex(list, start);
4482 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",rangelen));
4483 for (j = 0; j < rangelen; j++) {
4484 ele = listNodeValue(ln);
4485 addReplyBulk(c,ele);
4486 ln = ln->next;
4487 }
4488 }
4489
4490 static void ltrimCommand(redisClient *c) {
4491 robj *o;
4492 int start = atoi(c->argv[2]->ptr);
4493 int end = atoi(c->argv[3]->ptr);
4494 int llen;
4495 int j, ltrim, rtrim;
4496 list *list;
4497 listNode *ln;
4498
4499 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.ok)) == NULL ||
4500 checkType(c,o,REDIS_LIST)) return;
4501 list = o->ptr;
4502 llen = listLength(list);
4503
4504 /* convert negative indexes */
4505 if (start < 0) start = llen+start;
4506 if (end < 0) end = llen+end;
4507 if (start < 0) start = 0;
4508 if (end < 0) end = 0;
4509
4510 /* indexes sanity checks */
4511 if (start > end || start >= llen) {
4512 /* Out of range start or start > end result in empty list */
4513 ltrim = llen;
4514 rtrim = 0;
4515 } else {
4516 if (end >= llen) end = llen-1;
4517 ltrim = start;
4518 rtrim = llen-end-1;
4519 }
4520
4521 /* Remove list elements to perform the trim */
4522 for (j = 0; j < ltrim; j++) {
4523 ln = listFirst(list);
4524 listDelNode(list,ln);
4525 }
4526 for (j = 0; j < rtrim; j++) {
4527 ln = listLast(list);
4528 listDelNode(list,ln);
4529 }
4530 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
4531 server.dirty++;
4532 addReply(c,shared.ok);
4533 }
4534
4535 static void lremCommand(redisClient *c) {
4536 robj *o;
4537 list *list;
4538 listNode *ln, *next;
4539 int toremove = atoi(c->argv[2]->ptr);
4540 int removed = 0;
4541 int fromtail = 0;
4542
4543 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
4544 checkType(c,o,REDIS_LIST)) return;
4545 list = o->ptr;
4546
4547 if (toremove < 0) {
4548 toremove = -toremove;
4549 fromtail = 1;
4550 }
4551 ln = fromtail ? list->tail : list->head;
4552 while (ln) {
4553 robj *ele = listNodeValue(ln);
4554
4555 next = fromtail ? ln->prev : ln->next;
4556 if (compareStringObjects(ele,c->argv[3]) == 0) {
4557 listDelNode(list,ln);
4558 server.dirty++;
4559 removed++;
4560 if (toremove && removed == toremove) break;
4561 }
4562 ln = next;
4563 }
4564 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
4565 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",removed));
4566 }
4567
4568 /* This is the semantic of this command:
4569 * RPOPLPUSH srclist dstlist:
4570 * IF LLEN(srclist) > 0
4571 * element = RPOP srclist
4572 * LPUSH dstlist element
4573 * RETURN element
4574 * ELSE
4575 * RETURN nil
4576 * END
4577 * END
4578 *
4579 * The idea is to be able to get an element from a list in a reliable way
4580 * since the element is not just returned but pushed against another list
4581 * as well. This command was originally proposed by Ezra Zygmuntowicz.
4582 */
4583 static void rpoplpushcommand(redisClient *c) {
4584 robj *sobj;
4585 list *srclist;
4586 listNode *ln;
4587
4588 if ((sobj = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4589 checkType(c,sobj,REDIS_LIST)) return;
4590 srclist = sobj->ptr;
4591 ln = listLast(srclist);
4592
4593 if (ln == NULL) {
4594 addReply(c,shared.nullbulk);
4595 } else {
4596 robj *dobj = lookupKeyWrite(c->db,c->argv[2]);
4597 robj *ele = listNodeValue(ln);
4598 list *dstlist;
4599
4600 if (dobj && dobj->type != REDIS_LIST) {
4601 addReply(c,shared.wrongtypeerr);
4602 return;
4603 }
4604
4605 /* Add the element to the target list (unless it's directly
4606 * passed to some BLPOP-ing client */
4607 if (!handleClientsWaitingListPush(c,c->argv[2],ele)) {
4608 if (dobj == NULL) {
4609 /* Create the list if the key does not exist */
4610 dobj = createListObject();
4611 dictAdd(c->db->dict,c->argv[2],dobj);
4612 incrRefCount(c->argv[2]);
4613 }
4614 dstlist = dobj->ptr;
4615 listAddNodeHead(dstlist,ele);
4616 incrRefCount(ele);
4617 }
4618
4619 /* Send the element to the client as reply as well */
4620 addReplyBulk(c,ele);
4621
4622 /* Finally remove the element from the source list */
4623 listDelNode(srclist,ln);
4624 if (listLength(srclist) == 0) deleteKey(c->db,c->argv[1]);
4625 server.dirty++;
4626 }
4627 }
4628
4629 /* ==================================== Sets ================================ */
4630
4631 static void saddCommand(redisClient *c) {
4632 robj *set;
4633
4634 set = lookupKeyWrite(c->db,c->argv[1]);
4635 if (set == NULL) {
4636 set = createSetObject();
4637 dictAdd(c->db->dict,c->argv[1],set);
4638 incrRefCount(c->argv[1]);
4639 } else {
4640 if (set->type != REDIS_SET) {
4641 addReply(c,shared.wrongtypeerr);
4642 return;
4643 }
4644 }
4645 if (dictAdd(set->ptr,c->argv[2],NULL) == DICT_OK) {
4646 incrRefCount(c->argv[2]);
4647 server.dirty++;
4648 addReply(c,shared.cone);
4649 } else {
4650 addReply(c,shared.czero);
4651 }
4652 }
4653
4654 static void sremCommand(redisClient *c) {
4655 robj *set;
4656
4657 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
4658 checkType(c,set,REDIS_SET)) return;
4659
4660 if (dictDelete(set->ptr,c->argv[2]) == DICT_OK) {
4661 server.dirty++;
4662 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
4663 if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
4664 addReply(c,shared.cone);
4665 } else {
4666 addReply(c,shared.czero);
4667 }
4668 }
4669
4670 static void smoveCommand(redisClient *c) {
4671 robj *srcset, *dstset;
4672
4673 srcset = lookupKeyWrite(c->db,c->argv[1]);
4674 dstset = lookupKeyWrite(c->db,c->argv[2]);
4675
4676 /* If the source key does not exist return 0, if it's of the wrong type
4677 * raise an error */
4678 if (srcset == NULL || srcset->type != REDIS_SET) {
4679 addReply(c, srcset ? shared.wrongtypeerr : shared.czero);
4680 return;
4681 }
4682 /* Error if the destination key is not a set as well */
4683 if (dstset && dstset->type != REDIS_SET) {
4684 addReply(c,shared.wrongtypeerr);
4685 return;
4686 }
4687 /* Remove the element from the source set */
4688 if (dictDelete(srcset->ptr,c->argv[3]) == DICT_ERR) {
4689 /* Key not found in the src set! return zero */
4690 addReply(c,shared.czero);
4691 return;
4692 }
4693 if (dictSize((dict*)srcset->ptr) == 0 && srcset != dstset)
4694 deleteKey(c->db,c->argv[1]);
4695 server.dirty++;
4696 /* Add the element to the destination set */
4697 if (!dstset) {
4698 dstset = createSetObject();
4699 dictAdd(c->db->dict,c->argv[2],dstset);
4700 incrRefCount(c->argv[2]);
4701 }
4702 if (dictAdd(dstset->ptr,c->argv[3],NULL) == DICT_OK)
4703 incrRefCount(c->argv[3]);
4704 addReply(c,shared.cone);
4705 }
4706
4707 static void sismemberCommand(redisClient *c) {
4708 robj *set;
4709
4710 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4711 checkType(c,set,REDIS_SET)) return;
4712
4713 if (dictFind(set->ptr,c->argv[2]))
4714 addReply(c,shared.cone);
4715 else
4716 addReply(c,shared.czero);
4717 }
4718
4719 static void scardCommand(redisClient *c) {
4720 robj *o;
4721 dict *s;
4722
4723 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4724 checkType(c,o,REDIS_SET)) return;
4725
4726 s = o->ptr;
4727 addReplyUlong(c,dictSize(s));
4728 }
4729
4730 static void spopCommand(redisClient *c) {
4731 robj *set;
4732 dictEntry *de;
4733
4734 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4735 checkType(c,set,REDIS_SET)) return;
4736
4737 de = dictGetRandomKey(set->ptr);
4738 if (de == NULL) {
4739 addReply(c,shared.nullbulk);
4740 } else {
4741 robj *ele = dictGetEntryKey(de);
4742
4743 addReplyBulk(c,ele);
4744 dictDelete(set->ptr,ele);
4745 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
4746 if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
4747 server.dirty++;
4748 }
4749 }
4750
4751 static void srandmemberCommand(redisClient *c) {
4752 robj *set;
4753 dictEntry *de;
4754
4755 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4756 checkType(c,set,REDIS_SET)) return;
4757
4758 de = dictGetRandomKey(set->ptr);
4759 if (de == NULL) {
4760 addReply(c,shared.nullbulk);
4761 } else {
4762 robj *ele = dictGetEntryKey(de);
4763
4764 addReplyBulk(c,ele);
4765 }
4766 }
4767
4768 static int qsortCompareSetsByCardinality(const void *s1, const void *s2) {
4769 dict **d1 = (void*) s1, **d2 = (void*) s2;
4770
4771 return dictSize(*d1)-dictSize(*d2);
4772 }
4773
4774 static void sinterGenericCommand(redisClient *c, robj **setskeys, unsigned long setsnum, robj *dstkey) {
4775 dict **dv = zmalloc(sizeof(dict*)*setsnum);
4776 dictIterator *di;
4777 dictEntry *de;
4778 robj *lenobj = NULL, *dstset = NULL;
4779 unsigned long j, cardinality = 0;
4780
4781 for (j = 0; j < setsnum; j++) {
4782 robj *setobj;
4783
4784 setobj = dstkey ?
4785 lookupKeyWrite(c->db,setskeys[j]) :
4786 lookupKeyRead(c->db,setskeys[j]);
4787 if (!setobj) {
4788 zfree(dv);
4789 if (dstkey) {
4790 if (deleteKey(c->db,dstkey))
4791 server.dirty++;
4792 addReply(c,shared.czero);
4793 } else {
4794 addReply(c,shared.nullmultibulk);
4795 }
4796 return;
4797 }
4798 if (setobj->type != REDIS_SET) {
4799 zfree(dv);
4800 addReply(c,shared.wrongtypeerr);
4801 return;
4802 }
4803 dv[j] = setobj->ptr;
4804 }
4805 /* Sort sets from the smallest to largest, this will improve our
4806 * algorithm's performace */
4807 qsort(dv,setsnum,sizeof(dict*),qsortCompareSetsByCardinality);
4808
4809 /* The first thing we should output is the total number of elements...
4810 * since this is a multi-bulk write, but at this stage we don't know
4811 * the intersection set size, so we use a trick, append an empty object
4812 * to the output list and save the pointer to later modify it with the
4813 * right length */
4814 if (!dstkey) {
4815 lenobj = createObject(REDIS_STRING,NULL);
4816 addReply(c,lenobj);
4817 decrRefCount(lenobj);
4818 } else {
4819 /* If we have a target key where to store the resulting set
4820 * create this key with an empty set inside */
4821 dstset = createSetObject();
4822 }
4823
4824 /* Iterate all the elements of the first (smallest) set, and test
4825 * the element against all the other sets, if at least one set does
4826 * not include the element it is discarded */
4827 di = dictGetIterator(dv[0]);
4828
4829 while((de = dictNext(di)) != NULL) {
4830 robj *ele;
4831
4832 for (j = 1; j < setsnum; j++)
4833 if (dictFind(dv[j],dictGetEntryKey(de)) == NULL) break;
4834 if (j != setsnum)
4835 continue; /* at least one set does not contain the member */
4836 ele = dictGetEntryKey(de);
4837 if (!dstkey) {
4838 addReplyBulk(c,ele);
4839 cardinality++;
4840 } else {
4841 dictAdd(dstset->ptr,ele,NULL);
4842 incrRefCount(ele);
4843 }
4844 }
4845 dictReleaseIterator(di);
4846
4847 if (dstkey) {
4848 /* Store the resulting set into the target, if the intersection
4849 * is not an empty set. */
4850 deleteKey(c->db,dstkey);
4851 if (dictSize((dict*)dstset->ptr) > 0) {
4852 dictAdd(c->db->dict,dstkey,dstset);
4853 incrRefCount(dstkey);
4854 addReplyLong(c,dictSize((dict*)dstset->ptr));
4855 } else {
4856 decrRefCount(dstset);
4857 addReply(c,shared.czero);
4858 }
4859 server.dirty++;
4860 } else {
4861 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",cardinality);
4862 }
4863 zfree(dv);
4864 }
4865
4866 static void sinterCommand(redisClient *c) {
4867 sinterGenericCommand(c,c->argv+1,c->argc-1,NULL);
4868 }
4869
4870 static void sinterstoreCommand(redisClient *c) {
4871 sinterGenericCommand(c,c->argv+2,c->argc-2,c->argv[1]);
4872 }
4873
/* Operation selectors passed as the 'op' argument of the generic
 * union/difference/intersection command implementations in this file. */
#define REDIS_OP_UNION 0
#define REDIS_OP_DIFF 1
#define REDIS_OP_INTER 2
4877
4878 static void sunionDiffGenericCommand(redisClient *c, robj **setskeys, int setsnum, robj *dstkey, int op) {
4879 dict **dv = zmalloc(sizeof(dict*)*setsnum);
4880 dictIterator *di;
4881 dictEntry *de;
4882 robj *dstset = NULL;
4883 int j, cardinality = 0;
4884
4885 for (j = 0; j < setsnum; j++) {
4886 robj *setobj;
4887
4888 setobj = dstkey ?
4889 lookupKeyWrite(c->db,setskeys[j]) :
4890 lookupKeyRead(c->db,setskeys[j]);
4891 if (!setobj) {
4892 dv[j] = NULL;
4893 continue;
4894 }
4895 if (setobj->type != REDIS_SET) {
4896 zfree(dv);
4897 addReply(c,shared.wrongtypeerr);
4898 return;
4899 }
4900 dv[j] = setobj->ptr;
4901 }
4902
4903 /* We need a temp set object to store our union. If the dstkey
4904 * is not NULL (that is, we are inside an SUNIONSTORE operation) then
4905 * this set object will be the resulting object to set into the target key*/
4906 dstset = createSetObject();
4907
4908 /* Iterate all the elements of all the sets, add every element a single
4909 * time to the result set */
4910 for (j = 0; j < setsnum; j++) {
4911 if (op == REDIS_OP_DIFF && j == 0 && !dv[j]) break; /* result set is empty */
4912 if (!dv[j]) continue; /* non existing keys are like empty sets */
4913
4914 di = dictGetIterator(dv[j]);
4915
4916 while((de = dictNext(di)) != NULL) {
4917 robj *ele;
4918
4919 /* dictAdd will not add the same element multiple times */
4920 ele = dictGetEntryKey(de);
4921 if (op == REDIS_OP_UNION || j == 0) {
4922 if (dictAdd(dstset->ptr,ele,NULL) == DICT_OK) {
4923 incrRefCount(ele);
4924 cardinality++;
4925 }
4926 } else if (op == REDIS_OP_DIFF) {
4927 if (dictDelete(dstset->ptr,ele) == DICT_OK) {
4928 cardinality--;
4929 }
4930 }
4931 }
4932 dictReleaseIterator(di);
4933
4934 /* result set is empty? Exit asap. */
4935 if (op == REDIS_OP_DIFF && cardinality == 0) break;
4936 }
4937
4938 /* Output the content of the resulting set, if not in STORE mode */
4939 if (!dstkey) {
4940 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",cardinality));
4941 di = dictGetIterator(dstset->ptr);
4942 while((de = dictNext(di)) != NULL) {
4943 robj *ele;
4944
4945 ele = dictGetEntryKey(de);
4946 addReplyBulk(c,ele);
4947 }
4948 dictReleaseIterator(di);
4949 decrRefCount(dstset);
4950 } else {
4951 /* If we have a target key where to store the resulting set
4952 * create this key with the result set inside */
4953 deleteKey(c->db,dstkey);
4954 if (dictSize((dict*)dstset->ptr) > 0) {
4955 dictAdd(c->db->dict,dstkey,dstset);
4956 incrRefCount(dstkey);
4957 addReplyLong(c,dictSize((dict*)dstset->ptr));
4958 } else {
4959 decrRefCount(dstset);
4960 addReply(c,shared.czero);
4961 }
4962 server.dirty++;
4963 }
4964 zfree(dv);
4965 }
4966
4967 static void sunionCommand(redisClient *c) {
4968 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_UNION);
4969 }
4970
4971 static void sunionstoreCommand(redisClient *c) {
4972 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_UNION);
4973 }
4974
4975 static void sdiffCommand(redisClient *c) {
4976 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_DIFF);
4977 }
4978
4979 static void sdiffstoreCommand(redisClient *c) {
4980 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_DIFF);
4981 }
4982
4983 /* ==================================== ZSets =============================== */
4984
4985 /* ZSETs are ordered sets using two data structures to hold the same elements
4986 * in order to get O(log(N)) INSERT and REMOVE operations into a sorted
4987 * data structure.
4988 *
4989 * The elements are added to a hash table mapping Redis objects to scores.
4990 * At the same time the elements are added to a skip list mapping scores
4991 * to Redis objects (so objects are sorted by scores in this "view"). */
4992
4993 /* This skiplist implementation is almost a C translation of the original
4994 * algorithm described by William Pugh in "Skip Lists: A Probabilistic
4995 * Alternative to Balanced Trees", modified in three ways:
4996 * a) this implementation allows for repeated values.
4997 * b) the comparison is not just by key (our 'score') but by satellite data.
4998 * c) there is a back pointer, so it's a doubly linked list with the back
4999 * pointers being only at "level 1". This allows traversing the list
5000 * from tail to head, which is useful for ZREVRANGE. */
5001
5002 static zskiplistNode *zslCreateNode(int level, double score, robj *obj) {
5003 zskiplistNode *zn = zmalloc(sizeof(*zn));
5004
5005 zn->forward = zmalloc(sizeof(zskiplistNode*) * level);
5006 if (level > 0)
5007 zn->span = zmalloc(sizeof(unsigned int) * (level - 1));
5008 zn->score = score;
5009 zn->obj = obj;
5010 return zn;
5011 }
5012
5013 static zskiplist *zslCreate(void) {
5014 int j;
5015 zskiplist *zsl;
5016
5017 zsl = zmalloc(sizeof(*zsl));
5018 zsl->level = 1;
5019 zsl->length = 0;
5020 zsl->header = zslCreateNode(ZSKIPLIST_MAXLEVEL,0,NULL);
5021 for (j = 0; j < ZSKIPLIST_MAXLEVEL; j++) {
5022 zsl->header->forward[j] = NULL;
5023
5024 /* span has space for ZSKIPLIST_MAXLEVEL-1 elements */
5025 if (j < ZSKIPLIST_MAXLEVEL-1)
5026 zsl->header->span[j] = 0;
5027 }
5028 zsl->header->backward = NULL;
5029 zsl->tail = NULL;
5030 return zsl;
5031 }
5032
5033 static void zslFreeNode(zskiplistNode *node) {
5034 decrRefCount(node->obj);
5035 zfree(node->forward);
5036 zfree(node->span);
5037 zfree(node);
5038 }
5039
5040 static void zslFree(zskiplist *zsl) {
5041 zskiplistNode *node = zsl->header->forward[0], *next;
5042
5043 zfree(zsl->header->forward);
5044 zfree(zsl->header->span);
5045 zfree(zsl->header);
5046 while(node) {
5047 next = node->forward[0];
5048 zslFreeNode(node);
5049 node = next;
5050 }
5051 zfree(zsl);
5052 }
5053
5054 static int zslRandomLevel(void) {
5055 int level = 1;
5056 while ((random()&0xFFFF) < (ZSKIPLIST_P * 0xFFFF))
5057 level += 1;
5058 return level;
5059 }
5060
/* Insert a new node holding 'score' and 'obj' into the skiplist, keeping the
 * nodes ordered by score and, for equal scores, by compareStringObjects()
 * on the objects. No reference to 'obj' is taken here.
 *
 * Span bookkeeping: for a level i > 0, span[i-1] is the span of that level's
 * forward pointer; the bottom level has no stored span and counts as 1. */
static void zslInsert(zskiplist *zsl, double score, robj *obj) {
    /* update[i]: last node at level i that sorts before the new element.
     * rank[i]: number of elements crossed to reach update[i]. */
    zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
    unsigned int rank[ZSKIPLIST_MAXLEVEL];
    int i, level;

    x = zsl->header;
    for (i = zsl->level-1; i >= 0; i--) {
        /* store rank that is crossed to reach the insert position */
        rank[i] = i == (zsl->level-1) ? 0 : rank[i+1];

        while (x->forward[i] &&
            (x->forward[i]->score < score ||
                (x->forward[i]->score == score &&
                compareStringObjects(x->forward[i]->obj,obj) < 0))) {
            rank[i] += i > 0 ? x->span[i-1] : 1;
            x = x->forward[i];
        }
        update[i] = x;
    }
    /* we assume the key is not already inside, since we allow duplicated
     * scores, and the re-insertion of score and redis object should never
     * happen since the caller of zslInsert() should test in the hash table
     * if the element is already inside or not. */
    level = zslRandomLevel();
    if (level > zsl->level) {
        /* The new node is taller than the list: the header becomes the
         * predecessor on the new levels, spanning the whole list for now. */
        for (i = zsl->level; i < level; i++) {
            rank[i] = 0;
            update[i] = zsl->header;
            update[i]->span[i-1] = zsl->length;
        }
        zsl->level = level;
    }
    x = zslCreateNode(level,score,obj);
    for (i = 0; i < level; i++) {
        /* splice x in right after update[i] at this level */
        x->forward[i] = update[i]->forward[i];
        update[i]->forward[i] = x;

        /* update span covered by update[i] as x is inserted here:
         * rank[0] - rank[i] elements lie between update[i] and x */
        if (i > 0) {
            x->span[i-1] = update[i]->span[i-1] - (rank[0] - rank[i]);
            update[i]->span[i-1] = (rank[0] - rank[i]) + 1;
        }
    }

    /* increment span for untouched levels */
    for (i = level; i < zsl->level; i++) {
        update[i]->span[i-1]++;
    }

    /* The header is never exposed as a backward pointer; the new node
     * becomes the tail when nothing follows it at the bottom level. */
    x->backward = (update[0] == zsl->header) ? NULL : update[0];
    if (x->forward[0])
        x->forward[0]->backward = x;
    else
        zsl->tail = x;
    zsl->length++;
}
5117
/* Internal function used by zslDelete, zslDeleteRangeByScore and
 * zslDeleteRangeByRank.
 *
 * Unlinks node 'x' from the skiplist and fixes spans, backward pointers,
 * tail, level and length. The node itself is not freed here. 'update' must
 * hold, for every level in use, the last node preceding x at that level
 * (as computed by the search loops of the callers). */
void zslDeleteNode(zskiplist *zsl, zskiplistNode *x, zskiplistNode **update) {
    int i;
    for (i = 0; i < zsl->level; i++) {
        if (update[i]->forward[i] == x) {
            /* x is linked at this level: the predecessor inherits x's span
             * minus the element being removed, then bypasses x. */
            if (i > 0) {
                update[i]->span[i-1] += x->span[i-1] - 1;
            }
            update[i]->forward[i] = x->forward[i];
        } else {
            /* invariant: i > 0, because update[0]->forward[0]
             * is always equal to x */
            update[i]->span[i-1] -= 1;
        }
    }
    if (x->forward[0]) {
        x->forward[0]->backward = x->backward;
    } else {
        /* x was the last element */
        zsl->tail = x->backward;
    }
    /* drop the top levels that no longer link to any node */
    while(zsl->level > 1 && zsl->header->forward[zsl->level-1] == NULL)
        zsl->level--;
    zsl->length--;
}
5142
5143 /* Delete an element with matching score/object from the skiplist. */
5144 static int zslDelete(zskiplist *zsl, double score, robj *obj) {
5145 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5146 int i;
5147
5148 x = zsl->header;
5149 for (i = zsl->level-1; i >= 0; i--) {
5150 while (x->forward[i] &&
5151 (x->forward[i]->score < score ||
5152 (x->forward[i]->score == score &&
5153 compareStringObjects(x->forward[i]->obj,obj) < 0)))
5154 x = x->forward[i];
5155 update[i] = x;
5156 }
5157 /* We may have multiple elements with the same score, what we need
5158 * is to find the element with both the right score and object. */
5159 x = x->forward[0];
5160 if (x && score == x->score && compareStringObjects(x->obj,obj) == 0) {
5161 zslDeleteNode(zsl, x, update);
5162 zslFreeNode(x);
5163 return 1;
5164 } else {
5165 return 0; /* not found */
5166 }
5167 return 0; /* not found */
5168 }
5169
5170 /* Delete all the elements with score between min and max from the skiplist.
5171 * Min and mx are inclusive, so a score >= min || score <= max is deleted.
5172 * Note that this function takes the reference to the hash table view of the
5173 * sorted set, in order to remove the elements from the hash table too. */
5174 static unsigned long zslDeleteRangeByScore(zskiplist *zsl, double min, double max, dict *dict) {
5175 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5176 unsigned long removed = 0;
5177 int i;
5178
5179 x = zsl->header;
5180 for (i = zsl->level-1; i >= 0; i--) {
5181 while (x->forward[i] && x->forward[i]->score < min)
5182 x = x->forward[i];
5183 update[i] = x;
5184 }
5185 /* We may have multiple elements with the same score, what we need
5186 * is to find the element with both the right score and object. */
5187 x = x->forward[0];
5188 while (x && x->score <= max) {
5189 zskiplistNode *next = x->forward[0];
5190 zslDeleteNode(zsl, x, update);
5191 dictDelete(dict,x->obj);
5192 zslFreeNode(x);
5193 removed++;
5194 x = next;
5195 }
5196 return removed; /* not found */
5197 }
5198
5199 /* Delete all the elements with rank between start and end from the skiplist.
5200 * Start and end are inclusive. Note that start and end need to be 1-based */
5201 static unsigned long zslDeleteRangeByRank(zskiplist *zsl, unsigned int start, unsigned int end, dict *dict) {
5202 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5203 unsigned long traversed = 0, removed = 0;
5204 int i;
5205
5206 x = zsl->header;
5207 for (i = zsl->level-1; i >= 0; i--) {
5208 while (x->forward[i] && (traversed + (i > 0 ? x->span[i-1] : 1)) < start) {
5209 traversed += i > 0 ? x->span[i-1] : 1;
5210 x = x->forward[i];
5211 }
5212 update[i] = x;
5213 }
5214
5215 traversed++;
5216 x = x->forward[0];
5217 while (x && traversed <= end) {
5218 zskiplistNode *next = x->forward[0];
5219 zslDeleteNode(zsl, x, update);
5220 dictDelete(dict,x->obj);
5221 zslFreeNode(x);
5222 removed++;
5223 traversed++;
5224 x = next;
5225 }
5226 return removed;
5227 }
5228
5229 /* Find the first node having a score equal or greater than the specified one.
5230 * Returns NULL if there is no match. */
5231 static zskiplistNode *zslFirstWithScore(zskiplist *zsl, double score) {
5232 zskiplistNode *x;
5233 int i;
5234
5235 x = zsl->header;
5236 for (i = zsl->level-1; i >= 0; i--) {
5237 while (x->forward[i] && x->forward[i]->score < score)
5238 x = x->forward[i];
5239 }
5240 /* We may have multiple elements with the same score, what we need
5241 * is to find the element with both the right score and object. */
5242 return x->forward[0];
5243 }
5244
5245 /* Find the rank for an element by both score and key.
5246 * Returns 0 when the element cannot be found, rank otherwise.
5247 * Note that the rank is 1-based due to the span of zsl->header to the
5248 * first element. */
5249 static unsigned long zslGetRank(zskiplist *zsl, double score, robj *o) {
5250 zskiplistNode *x;
5251 unsigned long rank = 0;
5252 int i;
5253
5254 x = zsl->header;
5255 for (i = zsl->level-1; i >= 0; i--) {
5256 while (x->forward[i] &&
5257 (x->forward[i]->score < score ||
5258 (x->forward[i]->score == score &&
5259 compareStringObjects(x->forward[i]->obj,o) <= 0))) {
5260 rank += i > 0 ? x->span[i-1] : 1;
5261 x = x->forward[i];
5262 }
5263
5264 /* x might be equal to zsl->header, so test if obj is non-NULL */
5265 if (x->obj && compareStringObjects(x->obj,o) == 0) {
5266 return rank;
5267 }
5268 }
5269 return 0;
5270 }
5271
5272 /* Finds an element by its rank. The rank argument needs to be 1-based. */
5273 zskiplistNode* zslGetElementByRank(zskiplist *zsl, unsigned long rank) {
5274 zskiplistNode *x;
5275 unsigned long traversed = 0;
5276 int i;
5277
5278 x = zsl->header;
5279 for (i = zsl->level-1; i >= 0; i--) {
5280 while (x->forward[i] && (traversed + (i>0 ? x->span[i-1] : 1)) <= rank)
5281 {
5282 traversed += i > 0 ? x->span[i-1] : 1;
5283 x = x->forward[i];
5284 }
5285 if (traversed == rank) {
5286 return x;
5287 }
5288 }
5289 return NULL;
5290 }
5291
5292 /* The actual Z-commands implementations */
5293
5294 /* This generic command implements both ZADD and ZINCRBY.
5295 * scoreval is the score if the operation is a ZADD (doincrement == 0) or
5296 * the increment if the operation is a ZINCRBY (doincrement == 1). */
5297 static void zaddGenericCommand(redisClient *c, robj *key, robj *ele, double scoreval, int doincrement) {
5298 robj *zsetobj;
5299 zset *zs;
5300 double *score;
5301
5302 zsetobj = lookupKeyWrite(c->db,key);
5303 if (zsetobj == NULL) {
5304 zsetobj = createZsetObject();
5305 dictAdd(c->db->dict,key,zsetobj);
5306 incrRefCount(key);
5307 } else {
5308 if (zsetobj->type != REDIS_ZSET) {
5309 addReply(c,shared.wrongtypeerr);
5310 return;
5311 }
5312 }
5313 zs = zsetobj->ptr;
5314
5315 /* Ok now since we implement both ZADD and ZINCRBY here the code
5316 * needs to handle the two different conditions. It's all about setting
5317 * '*score', that is, the new score to set, to the right value. */
5318 score = zmalloc(sizeof(double));
5319 if (doincrement) {
5320 dictEntry *de;
5321
5322 /* Read the old score. If the element was not present starts from 0 */
5323 de = dictFind(zs->dict,ele);
5324 if (de) {
5325 double *oldscore = dictGetEntryVal(de);
5326 *score = *oldscore + scoreval;
5327 } else {
5328 *score = scoreval;
5329 }
5330 } else {
5331 *score = scoreval;
5332 }
5333
5334 /* What follows is a simple remove and re-insert operation that is common
5335 * to both ZADD and ZINCRBY... */
5336 if (dictAdd(zs->dict,ele,score) == DICT_OK) {
5337 /* case 1: New element */
5338 incrRefCount(ele); /* added to hash */
5339 zslInsert(zs->zsl,*score,ele);
5340 incrRefCount(ele); /* added to skiplist */
5341 server.dirty++;
5342 if (doincrement)
5343 addReplyDouble(c,*score);
5344 else
5345 addReply(c,shared.cone);
5346 } else {
5347 dictEntry *de;
5348 double *oldscore;
5349
5350 /* case 2: Score update operation */
5351 de = dictFind(zs->dict,ele);
5352 redisAssert(de != NULL);
5353 oldscore = dictGetEntryVal(de);
5354 if (*score != *oldscore) {
5355 int deleted;
5356
5357 /* Remove and insert the element in the skip list with new score */
5358 deleted = zslDelete(zs->zsl,*oldscore,ele);
5359 redisAssert(deleted != 0);
5360 zslInsert(zs->zsl,*score,ele);
5361 incrRefCount(ele);
5362 /* Update the score in the hash table */
5363 dictReplace(zs->dict,ele,score);
5364 server.dirty++;
5365 } else {
5366 zfree(score);
5367 }
5368 if (doincrement)
5369 addReplyDouble(c,*score);
5370 else
5371 addReply(c,shared.czero);
5372 }
5373 }
5374
5375 static void zaddCommand(redisClient *c) {
5376 double scoreval;
5377
5378 scoreval = strtod(c->argv[2]->ptr,NULL);
5379 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,0);
5380 }
5381
5382 static void zincrbyCommand(redisClient *c) {
5383 double scoreval;
5384
5385 scoreval = strtod(c->argv[2]->ptr,NULL);
5386 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,1);
5387 }
5388
5389 static void zremCommand(redisClient *c) {
5390 robj *zsetobj;
5391 zset *zs;
5392 dictEntry *de;
5393 double *oldscore;
5394 int deleted;
5395
5396 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5397 checkType(c,zsetobj,REDIS_ZSET)) return;
5398
5399 zs = zsetobj->ptr;
5400 de = dictFind(zs->dict,c->argv[2]);
5401 if (de == NULL) {
5402 addReply(c,shared.czero);
5403 return;
5404 }
5405 /* Delete from the skiplist */
5406 oldscore = dictGetEntryVal(de);
5407 deleted = zslDelete(zs->zsl,*oldscore,c->argv[2]);
5408 redisAssert(deleted != 0);
5409
5410 /* Delete from the hash table */
5411 dictDelete(zs->dict,c->argv[2]);
5412 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5413 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
5414 server.dirty++;
5415 addReply(c,shared.cone);
5416 }
5417
5418 static void zremrangebyscoreCommand(redisClient *c) {
5419 double min = strtod(c->argv[2]->ptr,NULL);
5420 double max = strtod(c->argv[3]->ptr,NULL);
5421 long deleted;
5422 robj *zsetobj;
5423 zset *zs;
5424
5425 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5426 checkType(c,zsetobj,REDIS_ZSET)) return;
5427
5428 zs = zsetobj->ptr;
5429 deleted = zslDeleteRangeByScore(zs->zsl,min,max,zs->dict);
5430 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5431 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
5432 server.dirty += deleted;
5433 addReplyLong(c,deleted);
5434 }
5435
5436 static void zremrangebyrankCommand(redisClient *c) {
5437 int start = atoi(c->argv[2]->ptr);
5438 int end = atoi(c->argv[3]->ptr);
5439 int llen;
5440 long deleted;
5441 robj *zsetobj;
5442 zset *zs;
5443
5444 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5445 checkType(c,zsetobj,REDIS_ZSET)) return;
5446 zs = zsetobj->ptr;
5447 llen = zs->zsl->length;
5448
5449 /* convert negative indexes */
5450 if (start < 0) start = llen+start;
5451 if (end < 0) end = llen+end;
5452 if (start < 0) start = 0;
5453 if (end < 0) end = 0;
5454
5455 /* indexes sanity checks */
5456 if (start > end || start >= llen) {
5457 addReply(c,shared.czero);
5458 return;
5459 }
5460 if (end >= llen) end = llen-1;
5461
5462 /* increment start and end because zsl*Rank functions
5463 * use 1-based rank */
5464 deleted = zslDeleteRangeByRank(zs->zsl,start+1,end+1,zs->dict);
5465 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5466 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
5467 server.dirty += deleted;
5468 addReplyLong(c, deleted);
5469 }
5470
/* One input of a ZUNION/ZINTER operation. */
typedef struct {
    dict *dict; /* hash table (element -> score) of the source sorted set, NULL when the key does not exist */
    double weight; /* factor applied to every score read from this source */
} zsetopsrc;
5475
5476 static int qsortCompareZsetopsrcByCardinality(const void *s1, const void *s2) {
5477 zsetopsrc *d1 = (void*) s1, *d2 = (void*) s2;
5478 unsigned long size1, size2;
5479 size1 = d1->dict ? dictSize(d1->dict) : 0;
5480 size2 = d2->dict ? dictSize(d2->dict) : 0;
5481 return size1 - size2;
5482 }
5483
/* Ways of combining the weighted scores of an element that appears in more
 * than one input of ZUNION/ZINTER (selected with the AGGREGATE option). */
#define REDIS_AGGR_SUM 1
#define REDIS_AGGR_MIN 2
#define REDIS_AGGR_MAX 3
5487
5488 inline static void zunionInterAggregate(double *target, double val, int aggregate) {
5489 if (aggregate == REDIS_AGGR_SUM) {
5490 *target = *target + val;
5491 } else if (aggregate == REDIS_AGGR_MIN) {
5492 *target = val < *target ? val : *target;
5493 } else if (aggregate == REDIS_AGGR_MAX) {
5494 *target = val > *target ? val : *target;
5495 } else {
5496 /* safety net */
5497 redisAssert(0 != 0);
5498 }
5499 }
5500
5501 static void zunionInterGenericCommand(redisClient *c, robj *dstkey, int op) {
5502 int i, j, zsetnum;
5503 int aggregate = REDIS_AGGR_SUM;
5504 zsetopsrc *src;
5505 robj *dstobj;
5506 zset *dstzset;
5507 dictIterator *di;
5508 dictEntry *de;
5509
5510 /* expect zsetnum input keys to be given */
5511 zsetnum = atoi(c->argv[2]->ptr);
5512 if (zsetnum < 1) {
5513 addReplySds(c,sdsnew("-ERR at least 1 input key is needed for ZUNION/ZINTER\r\n"));
5514 return;
5515 }
5516
5517 /* test if the expected number of keys would overflow */
5518 if (3+zsetnum > c->argc) {
5519 addReply(c,shared.syntaxerr);
5520 return;
5521 }
5522
5523 /* read keys to be used for input */
5524 src = zmalloc(sizeof(zsetopsrc) * zsetnum);
5525 for (i = 0, j = 3; i < zsetnum; i++, j++) {
5526 robj *zsetobj = lookupKeyWrite(c->db,c->argv[j]);
5527 if (!zsetobj) {
5528 src[i].dict = NULL;
5529 } else {
5530 if (zsetobj->type != REDIS_ZSET) {
5531 zfree(src);
5532 addReply(c,shared.wrongtypeerr);
5533 return;
5534 }
5535 src[i].dict = ((zset*)zsetobj->ptr)->dict;
5536 }
5537
5538 /* default all weights to 1 */
5539 src[i].weight = 1.0;
5540 }
5541
5542 /* parse optional extra arguments */
5543 if (j < c->argc) {
5544 int remaining = c->argc - j;
5545
5546 while (remaining) {
5547 if (remaining >= (zsetnum + 1) && !strcasecmp(c->argv[j]->ptr,"weights")) {
5548 j++; remaining--;
5549 for (i = 0; i < zsetnum; i++, j++, remaining--) {
5550 src[i].weight = strtod(c->argv[j]->ptr, NULL);
5551 }
5552 } else if (remaining >= 2 && !strcasecmp(c->argv[j]->ptr,"aggregate")) {
5553 j++; remaining--;
5554 if (!strcasecmp(c->argv[j]->ptr,"sum")) {
5555 aggregate = REDIS_AGGR_SUM;
5556 } else if (!strcasecmp(c->argv[j]->ptr,"min")) {
5557 aggregate = REDIS_AGGR_MIN;
5558 } else if (!strcasecmp(c->argv[j]->ptr,"max")) {
5559 aggregate = REDIS_AGGR_MAX;
5560 } else {
5561 zfree(src);
5562 addReply(c,shared.syntaxerr);
5563 return;
5564 }
5565 j++; remaining--;
5566 } else {
5567 zfree(src);
5568 addReply(c,shared.syntaxerr);
5569 return;
5570 }
5571 }
5572 }
5573
5574 /* sort sets from the smallest to largest, this will improve our
5575 * algorithm's performance */
5576 qsort(src,zsetnum,sizeof(zsetopsrc), qsortCompareZsetopsrcByCardinality);
5577
5578 dstobj = createZsetObject();
5579 dstzset = dstobj->ptr;
5580
5581 if (op == REDIS_OP_INTER) {
5582 /* skip going over all entries if the smallest zset is NULL or empty */
5583 if (src[0].dict && dictSize(src[0].dict) > 0) {
5584 /* precondition: as src[0].dict is non-empty and the zsets are ordered
5585 * from small to large, all src[i > 0].dict are non-empty too */
5586 di = dictGetIterator(src[0].dict);
5587 while((de = dictNext(di)) != NULL) {
5588 double *score = zmalloc(sizeof(double)), value;
5589 *score = src[0].weight * (*(double*)dictGetEntryVal(de));
5590
5591 for (j = 1; j < zsetnum; j++) {
5592 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
5593 if (other) {
5594 value = src[j].weight * (*(double*)dictGetEntryVal(other));
5595 zunionInterAggregate(score, value, aggregate);
5596 } else {
5597 break;
5598 }
5599 }
5600
5601 /* skip entry when not present in every source dict */
5602 if (j != zsetnum) {
5603 zfree(score);
5604 } else {
5605 robj *o = dictGetEntryKey(de);
5606 dictAdd(dstzset->dict,o,score);
5607 incrRefCount(o); /* added to dictionary */
5608 zslInsert(dstzset->zsl,*score,o);
5609 incrRefCount(o); /* added to skiplist */
5610 }
5611 }
5612 dictReleaseIterator(di);
5613 }
5614 } else if (op == REDIS_OP_UNION) {
5615 for (i = 0; i < zsetnum; i++) {
5616 if (!src[i].dict) continue;
5617
5618 di = dictGetIterator(src[i].dict);
5619 while((de = dictNext(di)) != NULL) {
5620 /* skip key when already processed */
5621 if (dictFind(dstzset->dict,dictGetEntryKey(de)) != NULL) continue;
5622
5623 double *score = zmalloc(sizeof(double)), value;
5624 *score = src[i].weight * (*(double*)dictGetEntryVal(de));
5625
5626 /* because the zsets are sorted by size, its only possible
5627 * for sets at larger indices to hold this entry */
5628 for (j = (i+1); j < zsetnum; j++) {
5629 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
5630 if (other) {
5631 value = src[j].weight * (*(double*)dictGetEntryVal(other));
5632 zunionInterAggregate(score, value, aggregate);
5633 }
5634 }
5635
5636 robj *o = dictGetEntryKey(de);
5637 dictAdd(dstzset->dict,o,score);
5638 incrRefCount(o); /* added to dictionary */
5639 zslInsert(dstzset->zsl,*score,o);
5640 incrRefCount(o); /* added to skiplist */
5641 }
5642 dictReleaseIterator(di);
5643 }
5644 } else {
5645 /* unknown operator */
5646 redisAssert(op == REDIS_OP_INTER || op == REDIS_OP_UNION);
5647 }
5648
5649 deleteKey(c->db,dstkey);
5650 if (dstzset->zsl->length) {
5651 dictAdd(c->db->dict,dstkey,dstobj);
5652 incrRefCount(dstkey);
5653 addReplyLong(c, dstzset->zsl->length);
5654 server.dirty++;
5655 } else {
5656 decrRefCount(dstzset);
5657 addReply(c, shared.czero);
5658 }
5659 zfree(src);
5660 }
5661
5662 static void zunionCommand(redisClient *c) {
5663 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_UNION);
5664 }
5665
5666 static void zinterCommand(redisClient *c) {
5667 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_INTER);
5668 }
5669
5670 static void zrangeGenericCommand(redisClient *c, int reverse) {
5671 robj *o;
5672 int start = atoi(c->argv[2]->ptr);
5673 int end = atoi(c->argv[3]->ptr);
5674 int withscores = 0;
5675 int llen;
5676 int rangelen, j;
5677 zset *zsetobj;
5678 zskiplist *zsl;
5679 zskiplistNode *ln;
5680 robj *ele;
5681
5682 if (c->argc == 5 && !strcasecmp(c->argv[4]->ptr,"withscores")) {
5683 withscores = 1;
5684 } else if (c->argc >= 5) {
5685 addReply(c,shared.syntaxerr);
5686 return;
5687 }
5688
5689 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullmultibulk)) == NULL ||
5690 checkType(c,o,REDIS_ZSET)) return;
5691 zsetobj = o->ptr;
5692 zsl = zsetobj->zsl;
5693 llen = zsl->length;
5694
5695 /* convert negative indexes */
5696 if (start < 0) start = llen+start;
5697 if (end < 0) end = llen+end;
5698 if (start < 0) start = 0;
5699 if (end < 0) end = 0;
5700
5701 /* indexes sanity checks */
5702 if (start > end || start >= llen) {
5703 /* Out of range start or start > end result in empty list */
5704 addReply(c,shared.emptymultibulk);
5705 return;
5706 }
5707 if (end >= llen) end = llen-1;
5708 rangelen = (end-start)+1;
5709
5710 /* check if starting point is trivial, before searching
5711 * the element in log(N) time */
5712 if (reverse) {
5713 ln = start == 0 ? zsl->tail : zslGetElementByRank(zsl, llen-start);
5714 } else {
5715 ln = start == 0 ?
5716 zsl->header->forward[0] : zslGetElementByRank(zsl, start+1);
5717 }
5718
5719 /* Return the result in form of a multi-bulk reply */
5720 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",
5721 withscores ? (rangelen*2) : rangelen));
5722 for (j = 0; j < rangelen; j++) {
5723 ele = ln->obj;
5724 addReplyBulk(c,ele);
5725 if (withscores)
5726 addReplyDouble(c,ln->score);
5727 ln = reverse ? ln->backward : ln->forward[0];
5728 }
5729 }
5730
5731 static void zrangeCommand(redisClient *c) {
5732 zrangeGenericCommand(c,0);
5733 }
5734
5735 static void zrevrangeCommand(redisClient *c) {
5736 zrangeGenericCommand(c,1);
5737 }
5738
5739 /* This command implements both ZRANGEBYSCORE and ZCOUNT.
5740 * If justcount is non-zero, just the count is returned. */
5741 static void genericZrangebyscoreCommand(redisClient *c, int justcount) {
5742 robj *o;
5743 double min, max;
5744 int minex = 0, maxex = 0; /* are min or max exclusive? */
5745 int offset = 0, limit = -1;
5746 int withscores = 0;
5747 int badsyntax = 0;
5748
5749 /* Parse the min-max interval. If one of the values is prefixed
5750 * by the "(" character, it's considered "open". For instance
5751 * ZRANGEBYSCORE zset (1.5 (2.5 will match min < x < max
5752 * ZRANGEBYSCORE zset 1.5 2.5 will instead match min <= x <= max */
5753 if (((char*)c->argv[2]->ptr)[0] == '(') {
5754 min = strtod((char*)c->argv[2]->ptr+1,NULL);
5755 minex = 1;
5756 } else {
5757 min = strtod(c->argv[2]->ptr,NULL);
5758 }
5759 if (((char*)c->argv[3]->ptr)[0] == '(') {
5760 max = strtod((char*)c->argv[3]->ptr+1,NULL);
5761 maxex = 1;
5762 } else {
5763 max = strtod(c->argv[3]->ptr,NULL);
5764 }
5765
5766 /* Parse "WITHSCORES": note that if the command was called with
5767 * the name ZCOUNT then we are sure that c->argc == 4, so we'll never
5768 * enter the following paths to parse WITHSCORES and LIMIT. */
5769 if (c->argc == 5 || c->argc == 8) {
5770 if (strcasecmp(c->argv[c->argc-1]->ptr,"withscores") == 0)
5771 withscores = 1;
5772 else
5773 badsyntax = 1;
5774 }
5775 if (c->argc != (4 + withscores) && c->argc != (7 + withscores))
5776 badsyntax = 1;
5777 if (badsyntax) {
5778 addReplySds(c,
5779 sdsnew("-ERR wrong number of arguments for ZRANGEBYSCORE\r\n"));
5780 return;
5781 }
5782
5783 /* Parse "LIMIT" */
5784 if (c->argc == (7 + withscores) && strcasecmp(c->argv[4]->ptr,"limit")) {
5785 addReply(c,shared.syntaxerr);
5786 return;
5787 } else if (c->argc == (7 + withscores)) {
5788 offset = atoi(c->argv[5]->ptr);
5789 limit = atoi(c->argv[6]->ptr);
5790 if (offset < 0) offset = 0;
5791 }
5792
5793 /* Ok, lookup the key and get the range */
5794 o = lookupKeyRead(c->db,c->argv[1]);
5795 if (o == NULL) {
5796 addReply(c,justcount ? shared.czero : shared.nullmultibulk);
5797 } else {
5798 if (o->type != REDIS_ZSET) {
5799 addReply(c,shared.wrongtypeerr);
5800 } else {
5801 zset *zsetobj = o->ptr;
5802 zskiplist *zsl = zsetobj->zsl;
5803 zskiplistNode *ln;
5804 robj *ele, *lenobj = NULL;
5805 unsigned long rangelen = 0;
5806
5807 /* Get the first node with the score >= min, or with
5808 * score > min if 'minex' is true. */
5809 ln = zslFirstWithScore(zsl,min);
5810 while (minex && ln && ln->score == min) ln = ln->forward[0];
5811
5812 if (ln == NULL) {
5813 /* No element matching the speciifed interval */
5814 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
5815 return;
5816 }
5817
5818 /* We don't know in advance how many matching elements there
5819 * are in the list, so we push this object that will represent
5820 * the multi-bulk length in the output buffer, and will "fix"
5821 * it later */
5822 if (!justcount) {
5823 lenobj = createObject(REDIS_STRING,NULL);
5824 addReply(c,lenobj);
5825 decrRefCount(lenobj);
5826 }
5827
5828 while(ln && (maxex ? (ln->score < max) : (ln->score <= max))) {
5829 if (offset) {
5830 offset--;
5831 ln = ln->forward[0];
5832 continue;
5833 }
5834 if (limit == 0) break;
5835 if (!justcount) {
5836 ele = ln->obj;
5837 addReplyBulk(c,ele);
5838 if (withscores)
5839 addReplyDouble(c,ln->score);
5840 }
5841 ln = ln->forward[0];
5842 rangelen++;
5843 if (limit > 0) limit--;
5844 }
5845 if (justcount) {
5846 addReplyLong(c,(long)rangelen);
5847 } else {
5848 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",
5849 withscores ? (rangelen*2) : rangelen);
5850 }
5851 }
5852 }
5853 }
5854
5855 static void zrangebyscoreCommand(redisClient *c) {
5856 genericZrangebyscoreCommand(c,0);
5857 }
5858
5859 static void zcountCommand(redisClient *c) {
5860 genericZrangebyscoreCommand(c,1);
5861 }
5862
5863 static void zcardCommand(redisClient *c) {
5864 robj *o;
5865 zset *zs;
5866
5867 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
5868 checkType(c,o,REDIS_ZSET)) return;
5869
5870 zs = o->ptr;
5871 addReplyUlong(c,zs->zsl->length);
5872 }
5873
5874 static void zscoreCommand(redisClient *c) {
5875 robj *o;
5876 zset *zs;
5877 dictEntry *de;
5878
5879 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5880 checkType(c,o,REDIS_ZSET)) return;
5881
5882 zs = o->ptr;
5883 de = dictFind(zs->dict,c->argv[2]);
5884 if (!de) {
5885 addReply(c,shared.nullbulk);
5886 } else {
5887 double *score = dictGetEntryVal(de);
5888
5889 addReplyDouble(c,*score);
5890 }
5891 }
5892
5893 static void zrankGenericCommand(redisClient *c, int reverse) {
5894 robj *o;
5895 zset *zs;
5896 zskiplist *zsl;
5897 dictEntry *de;
5898 unsigned long rank;
5899 double *score;
5900
5901 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5902 checkType(c,o,REDIS_ZSET)) return;
5903
5904 zs = o->ptr;
5905 zsl = zs->zsl;
5906 de = dictFind(zs->dict,c->argv[2]);
5907 if (!de) {
5908 addReply(c,shared.nullbulk);
5909 return;
5910 }
5911
5912 score = dictGetEntryVal(de);
5913 rank = zslGetRank(zsl, *score, c->argv[2]);
5914 if (rank) {
5915 if (reverse) {
5916 addReplyLong(c, zsl->length - rank);
5917 } else {
5918 addReplyLong(c, rank-1);
5919 }
5920 } else {
5921 addReply(c,shared.nullbulk);
5922 }
5923 }
5924
5925 static void zrankCommand(redisClient *c) {
5926 zrankGenericCommand(c, 0);
5927 }
5928
5929 static void zrevrankCommand(redisClient *c) {
5930 zrankGenericCommand(c, 1);
5931 }
5932
5933 /* =================================== Hashes =============================== */
5934 static void hsetCommand(redisClient *c) {
5935 int update = 0;
5936 robj *o = lookupKeyWrite(c->db,c->argv[1]);
5937
5938 if (o == NULL) {
5939 o = createHashObject();
5940 dictAdd(c->db->dict,c->argv[1],o);
5941 incrRefCount(c->argv[1]);
5942 } else {
5943 if (o->type != REDIS_HASH) {
5944 addReply(c,shared.wrongtypeerr);
5945 return;
5946 }
5947 }
5948 /* We want to convert the zipmap into an hash table right now if the
5949 * entry to be added is too big. Note that we check if the object
5950 * is integer encoded before to try fetching the length in the test below.
5951 * This is because integers are small, but currently stringObjectLen()
5952 * performs a slow conversion: not worth it. */
5953 if (o->encoding == REDIS_ENCODING_ZIPMAP &&
5954 ((c->argv[2]->encoding == REDIS_ENCODING_RAW &&
5955 sdslen(c->argv[2]->ptr) > server.hash_max_zipmap_value) ||
5956 (c->argv[3]->encoding == REDIS_ENCODING_RAW &&
5957 sdslen(c->argv[3]->ptr) > server.hash_max_zipmap_value)))
5958 {
5959 convertToRealHash(o);
5960 }
5961
5962 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
5963 unsigned char *zm = o->ptr;
5964 robj *valobj = getDecodedObject(c->argv[3]);
5965
5966 zm = zipmapSet(zm,c->argv[2]->ptr,sdslen(c->argv[2]->ptr),
5967 valobj->ptr,sdslen(valobj->ptr),&update);
5968 decrRefCount(valobj);
5969 o->ptr = zm;
5970
5971 /* And here there is the second check for hash conversion...
5972 * we want to do it only if the operation was not just an update as
5973 * zipmapLen() is O(N). */
5974 if (!update && zipmapLen(zm) > server.hash_max_zipmap_entries)
5975 convertToRealHash(o);
5976 } else {
5977 tryObjectEncoding(c->argv[2]);
5978 /* note that c->argv[3] is already encoded, as the latest arg
5979 * of a bulk command is always integer encoded if possible. */
5980 if (dictReplace(o->ptr,c->argv[2],c->argv[3])) {
5981 incrRefCount(c->argv[2]);
5982 } else {
5983 update = 1;
5984 }
5985 incrRefCount(c->argv[3]);
5986 }
5987 server.dirty++;
5988 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",update == 0));
5989 }
5990
5991 static void hincrbyCommand(redisClient *c) {
5992 int update = 0;
5993 long long value = 0, incr = 0;
5994 robj *o = lookupKeyWrite(c->db,c->argv[1]);
5995
5996 if (o == NULL) {
5997 o = createHashObject();
5998 dictAdd(c->db->dict,c->argv[1],o);
5999 incrRefCount(c->argv[1]);
6000 } else {
6001 if (o->type != REDIS_HASH) {
6002 addReply(c,shared.wrongtypeerr);
6003 return;
6004 }
6005 }
6006
6007 robj *o_incr = getDecodedObject(c->argv[3]);
6008 incr = strtoll(o_incr->ptr, NULL, 10);
6009 decrRefCount(o_incr);
6010
6011 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6012 unsigned char *zm = o->ptr;
6013 unsigned char *zval;
6014 unsigned int zvlen;
6015
6016 /* Find value if already present in hash */
6017 if (zipmapGet(zm,c->argv[2]->ptr,sdslen(c->argv[2]->ptr),
6018 &zval,&zvlen)) {
6019 /* strtoll needs the char* to have a trailing \0, but
6020 * the zipmap doesn't include them. */
6021 sds szval = sdsnewlen(zval, zvlen);
6022 value = strtoll(szval,NULL,10);
6023 sdsfree(szval);
6024 }
6025
6026 value += incr;
6027 sds svalue = sdscatprintf(sdsempty(),"%lld",value);
6028 zm = zipmapSet(zm,c->argv[2]->ptr,sdslen(c->argv[2]->ptr),
6029 (unsigned char*)svalue,sdslen(svalue),&update);
6030 sdsfree(svalue);
6031 o->ptr = zm;
6032
6033 /* Check if the zipmap needs to be converted
6034 * if this was not an update. */
6035 if (!update && zipmapLen(zm) > server.hash_max_zipmap_entries)
6036 convertToRealHash(o);
6037 } else {
6038 robj *hval;
6039 dictEntry *de;
6040
6041 /* Find value if already present in hash */
6042 de = dictFind(o->ptr,c->argv[2]);
6043 if (de != NULL) {
6044 hval = dictGetEntryVal(de);
6045 if (hval->encoding == REDIS_ENCODING_RAW)
6046 value = strtoll(hval->ptr,NULL,10);
6047 else if (hval->encoding == REDIS_ENCODING_INT)
6048 value = (long)hval->ptr;
6049 else
6050 redisAssert(1 != 1);
6051 }
6052
6053 value += incr;
6054 hval = createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",value));
6055 tryObjectEncoding(hval);
6056 if (dictReplace(o->ptr,c->argv[2],hval)) {
6057 incrRefCount(c->argv[2]);
6058 }
6059 }
6060
6061 server.dirty++;
6062 addReplyLong(c, value);
6063 }
6064
6065 static void hgetCommand(redisClient *c) {
6066 robj *o;
6067
6068 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6069 checkType(c,o,REDIS_HASH)) return;
6070
6071 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6072 unsigned char *zm = o->ptr;
6073 unsigned char *val;
6074 unsigned int vlen;
6075 robj *field;
6076
6077 field = getDecodedObject(c->argv[2]);
6078 if (zipmapGet(zm,field->ptr,sdslen(field->ptr), &val,&vlen)) {
6079 addReplySds(c,sdscatprintf(sdsempty(),"$%u\r\n", vlen));
6080 addReplySds(c,sdsnewlen(val,vlen));
6081 addReply(c,shared.crlf);
6082 decrRefCount(field);
6083 return;
6084 } else {
6085 addReply(c,shared.nullbulk);
6086 decrRefCount(field);
6087 return;
6088 }
6089 } else {
6090 struct dictEntry *de;
6091
6092 de = dictFind(o->ptr,c->argv[2]);
6093 if (de == NULL) {
6094 addReply(c,shared.nullbulk);
6095 } else {
6096 robj *e = dictGetEntryVal(de);
6097
6098 addReplyBulk(c,e);
6099 }
6100 }
6101 }
6102
6103 static void hdelCommand(redisClient *c) {
6104 robj *o;
6105 int deleted = 0;
6106
6107 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
6108 checkType(c,o,REDIS_HASH)) return;
6109
6110 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6111 robj *field = getDecodedObject(c->argv[2]);
6112
6113 o->ptr = zipmapDel((unsigned char*) o->ptr,
6114 (unsigned char*) field->ptr,
6115 sdslen(field->ptr), &deleted);
6116 decrRefCount(field);
6117 if (zipmapLen((unsigned char*) o->ptr) == 0)
6118 deleteKey(c->db,c->argv[1]);
6119 } else {
6120 deleted = dictDelete((dict*)o->ptr,c->argv[2]) == DICT_OK;
6121 if (htNeedsResize(o->ptr)) dictResize(o->ptr);
6122 if (dictSize((dict*)o->ptr) == 0) deleteKey(c->db,c->argv[1]);
6123 }
6124 if (deleted) server.dirty++;
6125 addReply(c,deleted ? shared.cone : shared.czero);
6126 }
6127
6128 static void hlenCommand(redisClient *c) {
6129 robj *o;
6130 unsigned long len;
6131
6132 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6133 checkType(c,o,REDIS_HASH)) return;
6134
6135 len = (o->encoding == REDIS_ENCODING_ZIPMAP) ?
6136 zipmapLen((unsigned char*)o->ptr) : dictSize((dict*)o->ptr);
6137 addReplyUlong(c,len);
6138 }
6139
6140 #define REDIS_GETALL_KEYS 1
6141 #define REDIS_GETALL_VALS 2
6142 static void genericHgetallCommand(redisClient *c, int flags) {
6143 robj *o, *lenobj;
6144 unsigned long count = 0;
6145
6146 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullmultibulk)) == NULL
6147 || checkType(c,o,REDIS_HASH)) return;
6148
6149 lenobj = createObject(REDIS_STRING,NULL);
6150 addReply(c,lenobj);
6151 decrRefCount(lenobj);
6152
6153 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6154 unsigned char *p = zipmapRewind(o->ptr);
6155 unsigned char *field, *val;
6156 unsigned int flen, vlen;
6157
6158 while((p = zipmapNext(p,&field,&flen,&val,&vlen)) != NULL) {
6159 robj *aux;
6160
6161 if (flags & REDIS_GETALL_KEYS) {
6162 aux = createStringObject((char*)field,flen);
6163 addReplyBulk(c,aux);
6164 decrRefCount(aux);
6165 count++;
6166 }
6167 if (flags & REDIS_GETALL_VALS) {
6168 aux = createStringObject((char*)val,vlen);
6169 addReplyBulk(c,aux);
6170 decrRefCount(aux);
6171 count++;
6172 }
6173 }
6174 } else {
6175 dictIterator *di = dictGetIterator(o->ptr);
6176 dictEntry *de;
6177
6178 while((de = dictNext(di)) != NULL) {
6179 robj *fieldobj = dictGetEntryKey(de);
6180 robj *valobj = dictGetEntryVal(de);
6181
6182 if (flags & REDIS_GETALL_KEYS) {
6183 addReplyBulk(c,fieldobj);
6184 count++;
6185 }
6186 if (flags & REDIS_GETALL_VALS) {
6187 addReplyBulk(c,valobj);
6188 count++;
6189 }
6190 }
6191 dictReleaseIterator(di);
6192 }
6193 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",count);
6194 }
6195
6196 static void hkeysCommand(redisClient *c) {
6197 genericHgetallCommand(c,REDIS_GETALL_KEYS);
6198 }
6199
6200 static void hvalsCommand(redisClient *c) {
6201 genericHgetallCommand(c,REDIS_GETALL_VALS);
6202 }
6203
6204 static void hgetallCommand(redisClient *c) {
6205 genericHgetallCommand(c,REDIS_GETALL_KEYS|REDIS_GETALL_VALS);
6206 }
6207
6208 static void hexistsCommand(redisClient *c) {
6209 robj *o;
6210 int exists = 0;
6211
6212 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6213 checkType(c,o,REDIS_HASH)) return;
6214
6215 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6216 robj *field;
6217 unsigned char *zm = o->ptr;
6218
6219 field = getDecodedObject(c->argv[2]);
6220 exists = zipmapExists(zm,field->ptr,sdslen(field->ptr));
6221 decrRefCount(field);
6222 } else {
6223 exists = dictFind(o->ptr,c->argv[2]) != NULL;
6224 }
6225 addReply(c,exists ? shared.cone : shared.czero);
6226 }
6227
6228 static void convertToRealHash(robj *o) {
6229 unsigned char *key, *val, *p, *zm = o->ptr;
6230 unsigned int klen, vlen;
6231 dict *dict = dictCreate(&hashDictType,NULL);
6232
6233 assert(o->type == REDIS_HASH && o->encoding != REDIS_ENCODING_HT);
6234 p = zipmapRewind(zm);
6235 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
6236 robj *keyobj, *valobj;
6237
6238 keyobj = createStringObject((char*)key,klen);
6239 valobj = createStringObject((char*)val,vlen);
6240 tryObjectEncoding(keyobj);
6241 tryObjectEncoding(valobj);
6242 dictAdd(dict,keyobj,valobj);
6243 }
6244 o->encoding = REDIS_ENCODING_HT;
6245 o->ptr = dict;
6246 zfree(zm);
6247 }
6248
6249 /* ========================= Non type-specific commands ==================== */
6250
6251 static void flushdbCommand(redisClient *c) {
6252 server.dirty += dictSize(c->db->dict);
6253 dictEmpty(c->db->dict);
6254 dictEmpty(c->db->expires);
6255 addReply(c,shared.ok);
6256 }
6257
6258 static void flushallCommand(redisClient *c) {
6259 server.dirty += emptyDb();
6260 addReply(c,shared.ok);
6261 if (server.bgsavechildpid != -1) {
6262 kill(server.bgsavechildpid,SIGKILL);
6263 rdbRemoveTempFile(server.bgsavechildpid);
6264 }
6265 rdbSave(server.dbfilename);
6266 server.dirty++;
6267 }
6268
6269 static redisSortOperation *createSortOperation(int type, robj *pattern) {
6270 redisSortOperation *so = zmalloc(sizeof(*so));
6271 so->type = type;
6272 so->pattern = pattern;
6273 return so;
6274 }
6275
6276 /* Return the value associated to the key with a name obtained
6277 * substituting the first occurence of '*' in 'pattern' with 'subst' */
6278 static robj *lookupKeyByPattern(redisDb *db, robj *pattern, robj *subst) {
6279 char *p;
6280 sds spat, ssub;
6281 robj keyobj;
6282 int prefixlen, sublen, postfixlen;
6283 /* Expoit the internal sds representation to create a sds string allocated on the stack in order to make this function faster */
6284 struct {
6285 long len;
6286 long free;
6287 char buf[REDIS_SORTKEY_MAX+1];
6288 } keyname;
6289
6290 /* If the pattern is "#" return the substitution object itself in order
6291 * to implement the "SORT ... GET #" feature. */
6292 spat = pattern->ptr;
6293 if (spat[0] == '#' && spat[1] == '\0') {
6294 return subst;
6295 }
6296
6297 /* The substitution object may be specially encoded. If so we create
6298 * a decoded object on the fly. Otherwise getDecodedObject will just
6299 * increment the ref count, that we'll decrement later. */
6300 subst = getDecodedObject(subst);
6301
6302 ssub = subst->ptr;
6303 if (sdslen(spat)+sdslen(ssub)-1 > REDIS_SORTKEY_MAX) return NULL;
6304 p = strchr(spat,'*');
6305 if (!p) {
6306 decrRefCount(subst);
6307 return NULL;
6308 }
6309
6310 prefixlen = p-spat;
6311 sublen = sdslen(ssub);
6312 postfixlen = sdslen(spat)-(prefixlen+1);
6313 memcpy(keyname.buf,spat,prefixlen);
6314 memcpy(keyname.buf+prefixlen,ssub,sublen);
6315 memcpy(keyname.buf+prefixlen+sublen,p+1,postfixlen);
6316 keyname.buf[prefixlen+sublen+postfixlen] = '\0';
6317 keyname.len = prefixlen+sublen+postfixlen;
6318
6319 initStaticStringObject(keyobj,((char*)&keyname)+(sizeof(long)*2))
6320 decrRefCount(subst);
6321
6322 /* printf("lookup '%s' => %p\n", keyname.buf,de); */
6323 return lookupKeyRead(db,&keyobj);
6324 }
6325
6326 /* sortCompare() is used by qsort in sortCommand(). Given that qsort_r with
6327 * the additional parameter is not standard but a BSD-specific we have to
6328 * pass sorting parameters via the global 'server' structure */
6329 static int sortCompare(const void *s1, const void *s2) {
6330 const redisSortObject *so1 = s1, *so2 = s2;
6331 int cmp;
6332
6333 if (!server.sort_alpha) {
6334 /* Numeric sorting. Here it's trivial as we precomputed scores */
6335 if (so1->u.score > so2->u.score) {
6336 cmp = 1;
6337 } else if (so1->u.score < so2->u.score) {
6338 cmp = -1;
6339 } else {
6340 cmp = 0;
6341 }
6342 } else {
6343 /* Alphanumeric sorting */
6344 if (server.sort_bypattern) {
6345 if (!so1->u.cmpobj || !so2->u.cmpobj) {
6346 /* At least one compare object is NULL */
6347 if (so1->u.cmpobj == so2->u.cmpobj)
6348 cmp = 0;
6349 else if (so1->u.cmpobj == NULL)
6350 cmp = -1;
6351 else
6352 cmp = 1;
6353 } else {
6354 /* We have both the objects, use strcoll */
6355 cmp = strcoll(so1->u.cmpobj->ptr,so2->u.cmpobj->ptr);
6356 }
6357 } else {
6358 /* Compare elements directly */
6359 robj *dec1, *dec2;
6360
6361 dec1 = getDecodedObject(so1->obj);
6362 dec2 = getDecodedObject(so2->obj);
6363 cmp = strcoll(dec1->ptr,dec2->ptr);
6364 decrRefCount(dec1);
6365 decrRefCount(dec2);
6366 }
6367 }
6368 return server.sort_desc ? -cmp : cmp;
6369 }
6370
6371 /* The SORT command is the most complex command in Redis. Warning: this code
6372 * is optimized for speed and a bit less for readability */
6373 static void sortCommand(redisClient *c) {
6374 list *operations;
6375 int outputlen = 0;
6376 int desc = 0, alpha = 0;
6377 int limit_start = 0, limit_count = -1, start, end;
6378 int j, dontsort = 0, vectorlen;
6379 int getop = 0; /* GET operation counter */
6380 robj *sortval, *sortby = NULL, *storekey = NULL;
6381 redisSortObject *vector; /* Resulting vector to sort */
6382
6383 /* Lookup the key to sort. It must be of the right types */
6384 sortval = lookupKeyRead(c->db,c->argv[1]);
6385 if (sortval == NULL) {
6386 addReply(c,shared.nullmultibulk);
6387 return;
6388 }
6389 if (sortval->type != REDIS_SET && sortval->type != REDIS_LIST &&
6390 sortval->type != REDIS_ZSET)
6391 {
6392 addReply(c,shared.wrongtypeerr);
6393 return;
6394 }
6395
6396 /* Create a list of operations to perform for every sorted element.
6397 * Operations can be GET/DEL/INCR/DECR */
6398 operations = listCreate();
6399 listSetFreeMethod(operations,zfree);
6400 j = 2;
6401
6402 /* Now we need to protect sortval incrementing its count, in the future
6403 * SORT may have options able to overwrite/delete keys during the sorting
6404 * and the sorted key itself may get destroied */
6405 incrRefCount(sortval);
6406
6407 /* The SORT command has an SQL-alike syntax, parse it */
6408 while(j < c->argc) {
6409 int leftargs = c->argc-j-1;
6410 if (!strcasecmp(c->argv[j]->ptr,"asc")) {
6411 desc = 0;
6412 } else if (!strcasecmp(c->argv[j]->ptr,"desc")) {
6413 desc = 1;
6414 } else if (!strcasecmp(c->argv[j]->ptr,"alpha")) {
6415 alpha = 1;
6416 } else if (!strcasecmp(c->argv[j]->ptr,"limit") && leftargs >= 2) {
6417 limit_start = atoi(c->argv[j+1]->ptr);
6418 limit_count = atoi(c->argv[j+2]->ptr);
6419 j+=2;
6420 } else if (!strcasecmp(c->argv[j]->ptr,"store") && leftargs >= 1) {
6421 storekey = c->argv[j+1];
6422 j++;
6423 } else if (!strcasecmp(c->argv[j]->ptr,"by") && leftargs >= 1) {
6424 sortby = c->argv[j+1];
6425 /* If the BY pattern does not contain '*', i.e. it is constant,
6426 * we don't need to sort nor to lookup the weight keys. */
6427 if (strchr(c->argv[j+1]->ptr,'*') == NULL) dontsort = 1;
6428 j++;
6429 } else if (!strcasecmp(c->argv[j]->ptr,"get") && leftargs >= 1) {
6430 listAddNodeTail(operations,createSortOperation(
6431 REDIS_SORT_GET,c->argv[j+1]));
6432 getop++;
6433 j++;
6434 } else {
6435 decrRefCount(sortval);
6436 listRelease(operations);
6437 addReply(c,shared.syntaxerr);
6438 return;
6439 }
6440 j++;
6441 }
6442
6443 /* Load the sorting vector with all the objects to sort */
6444 switch(sortval->type) {
6445 case REDIS_LIST: vectorlen = listLength((list*)sortval->ptr); break;
6446 case REDIS_SET: vectorlen = dictSize((dict*)sortval->ptr); break;
6447 case REDIS_ZSET: vectorlen = dictSize(((zset*)sortval->ptr)->dict); break;
6448 default: vectorlen = 0; redisAssert(0); /* Avoid GCC warning */
6449 }
6450 vector = zmalloc(sizeof(redisSortObject)*vectorlen);
6451 j = 0;
6452
6453 if (sortval->type == REDIS_LIST) {
6454 list *list = sortval->ptr;
6455 listNode *ln;
6456 listIter li;
6457
6458 listRewind(list,&li);
6459 while((ln = listNext(&li))) {
6460 robj *ele = ln->value;
6461 vector[j].obj = ele;
6462 vector[j].u.score = 0;
6463 vector[j].u.cmpobj = NULL;
6464 j++;
6465 }
6466 } else {
6467 dict *set;
6468 dictIterator *di;
6469 dictEntry *setele;
6470
6471 if (sortval->type == REDIS_SET) {
6472 set = sortval->ptr;
6473 } else {
6474 zset *zs = sortval->ptr;
6475 set = zs->dict;
6476 }
6477
6478 di = dictGetIterator(set);
6479 while((setele = dictNext(di)) != NULL) {
6480 vector[j].obj = dictGetEntryKey(setele);
6481 vector[j].u.score = 0;
6482 vector[j].u.cmpobj = NULL;
6483 j++;
6484 }
6485 dictReleaseIterator(di);
6486 }
6487 redisAssert(j == vectorlen);
6488
6489 /* Now it's time to load the right scores in the sorting vector */
6490 if (dontsort == 0) {
6491 for (j = 0; j < vectorlen; j++) {
6492 if (sortby) {
6493 robj *byval;
6494
6495 byval = lookupKeyByPattern(c->db,sortby,vector[j].obj);
6496 if (!byval || byval->type != REDIS_STRING) continue;
6497 if (alpha) {
6498 vector[j].u.cmpobj = getDecodedObject(byval);
6499 } else {
6500 if (byval->encoding == REDIS_ENCODING_RAW) {
6501 vector[j].u.score = strtod(byval->ptr,NULL);
6502 } else {
6503 /* Don't need to decode the object if it's
6504 * integer-encoded (the only encoding supported) so
6505 * far. We can just cast it */
6506 if (byval->encoding == REDIS_ENCODING_INT) {
6507 vector[j].u.score = (long)byval->ptr;
6508 } else
6509 redisAssert(1 != 1);
6510 }
6511 }
6512 } else {
6513 if (!alpha) {
6514 if (vector[j].obj->encoding == REDIS_ENCODING_RAW)
6515 vector[j].u.score = strtod(vector[j].obj->ptr,NULL);
6516 else {
6517 if (vector[j].obj->encoding == REDIS_ENCODING_INT)
6518 vector[j].u.score = (long) vector[j].obj->ptr;
6519 else
6520 redisAssert(1 != 1);
6521 }
6522 }
6523 }
6524 }
6525 }
6526
6527 /* We are ready to sort the vector... perform a bit of sanity check
6528 * on the LIMIT option too. We'll use a partial version of quicksort. */
6529 start = (limit_start < 0) ? 0 : limit_start;
6530 end = (limit_count < 0) ? vectorlen-1 : start+limit_count-1;
6531 if (start >= vectorlen) {
6532 start = vectorlen-1;
6533 end = vectorlen-2;
6534 }
6535 if (end >= vectorlen) end = vectorlen-1;
6536
6537 if (dontsort == 0) {
6538 server.sort_desc = desc;
6539 server.sort_alpha = alpha;
6540 server.sort_bypattern = sortby ? 1 : 0;
6541 if (sortby && (start != 0 || end != vectorlen-1))
6542 pqsort(vector,vectorlen,sizeof(redisSortObject),sortCompare, start,end);
6543 else
6544 qsort(vector,vectorlen,sizeof(redisSortObject),sortCompare);
6545 }
6546
6547 /* Send command output to the output buffer, performing the specified
6548 * GET/DEL/INCR/DECR operations if any. */
6549 outputlen = getop ? getop*(end-start+1) : end-start+1;
6550 if (storekey == NULL) {
6551 /* STORE option not specified, sent the sorting result to client */
6552 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",outputlen));
6553 for (j = start; j <= end; j++) {
6554 listNode *ln;
6555 listIter li;
6556
6557 if (!getop) addReplyBulk(c,vector[j].obj);
6558 listRewind(operations,&li);
6559 while((ln = listNext(&li))) {
6560 redisSortOperation *sop = ln->value;
6561 robj *val = lookupKeyByPattern(c->db,sop->pattern,
6562 vector[j].obj);
6563
6564 if (sop->type == REDIS_SORT_GET) {
6565 if (!val || val->type != REDIS_STRING) {
6566 addReply(c,shared.nullbulk);
6567 } else {
6568 addReplyBulk(c,val);
6569 }
6570 } else {
6571 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
6572 }
6573 }
6574 }
6575 } else {
6576 robj *listObject = createListObject();
6577 list *listPtr = (list*) listObject->ptr;
6578
6579 /* STORE option specified, set the sorting result as a List object */
6580 for (j = start; j <= end; j++) {
6581 listNode *ln;
6582 listIter li;
6583
6584 if (!getop) {
6585 listAddNodeTail(listPtr,vector[j].obj);
6586 incrRefCount(vector[j].obj);
6587 }
6588 listRewind(operations,&li);
6589 while((ln = listNext(&li))) {
6590 redisSortOperation *sop = ln->value;
6591 robj *val = lookupKeyByPattern(c->db,sop->pattern,
6592 vector[j].obj);
6593
6594 if (sop->type == REDIS_SORT_GET) {
6595 if (!val || val->type != REDIS_STRING) {
6596 listAddNodeTail(listPtr,createStringObject("",0));
6597 } else {
6598 listAddNodeTail(listPtr,val);
6599 incrRefCount(val);
6600 }
6601 } else {
6602 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
6603 }
6604 }
6605 }
6606 if (dictReplace(c->db->dict,storekey,listObject)) {
6607 incrRefCount(storekey);
6608 }
6609 /* Note: we add 1 because the DB is dirty anyway since even if the
6610 * SORT result is empty a new key is set and maybe the old content
6611 * replaced. */
6612 server.dirty += 1+outputlen;
6613 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",outputlen));
6614 }
6615
6616 /* Cleanup */
6617 decrRefCount(sortval);
6618 listRelease(operations);
6619 for (j = 0; j < vectorlen; j++) {
6620 if (sortby && alpha && vector[j].u.cmpobj)
6621 decrRefCount(vector[j].u.cmpobj);
6622 }
6623 zfree(vector);
6624 }
6625
/* Convert an amount of bytes into a human readable string in the form
 * of 100B, 4.00K, 100.00M, 2.00G, and so forth.
 *
 * 's' is the output buffer and always receives a null terminated string.
 * Values below 1024 are printed as an integer number of bytes; larger
 * values are printed with two decimals and a K, M, G, T or P suffix
 * (powers of 1024). Anything that does not fit the largest unit falls
 * back to the plain byte count. The longest possible output is a 20
 * digit number followed by 'B', so the buffer needs at least 22 bytes. */
static void bytesToHuman(char *s, unsigned long long n) {
    double d;

    if (n < 1024) {
        /* Bytes */
        sprintf(s,"%lluB",n);
    } else if (n < (1024*1024)) {
        d = (double)n/(1024);
        sprintf(s,"%.2fK",d);
    } else if (n < (1024ULL*1024*1024)) {
        d = (double)n/(1024*1024);
        sprintf(s,"%.2fM",d);
    } else if (n < (1024ULL*1024*1024*1024)) {
        d = (double)n/(1024ULL*1024*1024);
        sprintf(s,"%.2fG",d);
    } else if (n < (1024ULL*1024*1024*1024*1024)) {
        d = (double)n/(1024ULL*1024*1024*1024);
        sprintf(s,"%.2fT",d);
    } else if (n < (1024ULL*1024*1024*1024*1024*1024)) {
        d = (double)n/(1024ULL*1024*1024*1024*1024);
        sprintf(s,"%.2fP",d);
    } else {
        /* Larger than any unit handled above: never leave the buffer
         * untouched, print the raw number of bytes instead. */
        sprintf(s,"%lluB",n);
    }
}
6646
6647 /* Create the string returned by the INFO command. This is decoupled
6648 * by the INFO command itself as we need to report the same information
6649 * on memory corruption problems. */
6650 static sds genRedisInfoString(void) {
6651 sds info;
6652 time_t uptime = time(NULL)-server.stat_starttime;
6653 int j;
6654 char hmem[64];
6655
6656 bytesToHuman(hmem,zmalloc_used_memory());
6657 info = sdscatprintf(sdsempty(),
6658 "redis_version:%s\r\n"
6659 "arch_bits:%s\r\n"
6660 "multiplexing_api:%s\r\n"
6661 "process_id:%ld\r\n"
6662 "uptime_in_seconds:%ld\r\n"
6663 "uptime_in_days:%ld\r\n"
6664 "connected_clients:%d\r\n"
6665 "connected_slaves:%d\r\n"
6666 "blocked_clients:%d\r\n"
6667 "used_memory:%zu\r\n"
6668 "used_memory_human:%s\r\n"
6669 "changes_since_last_save:%lld\r\n"
6670 "bgsave_in_progress:%d\r\n"
6671 "last_save_time:%ld\r\n"
6672 "bgrewriteaof_in_progress:%d\r\n"
6673 "total_connections_received:%lld\r\n"
6674 "total_commands_processed:%lld\r\n"
6675 "expired_keys:%lld\r\n"
6676 "hash_max_zipmap_entries:%ld\r\n"
6677 "hash_max_zipmap_value:%ld\r\n"
6678 "pubsub_classes:%ld\r\n"
6679 "vm_enabled:%d\r\n"
6680 "role:%s\r\n"
6681 ,REDIS_VERSION,
6682 (sizeof(long) == 8) ? "64" : "32",
6683 aeGetApiName(),
6684 (long) getpid(),
6685 uptime,
6686 uptime/(3600*24),
6687 listLength(server.clients)-listLength(server.slaves),
6688 listLength(server.slaves),
6689 server.blpop_blocked_clients,
6690 zmalloc_used_memory(),
6691 hmem,
6692 server.dirty,
6693 server.bgsavechildpid != -1,
6694 server.lastsave,
6695 server.bgrewritechildpid != -1,
6696 server.stat_numconnections,
6697 server.stat_numcommands,
6698 server.stat_expiredkeys,
6699 server.hash_max_zipmap_entries,
6700 server.hash_max_zipmap_value,
6701 dictSize(server.pubsub_classes),
6702 server.vm_enabled != 0,
6703 server.masterhost == NULL ? "master" : "slave"
6704 );
6705 if (server.masterhost) {
6706 info = sdscatprintf(info,
6707 "master_host:%s\r\n"
6708 "master_port:%d\r\n"
6709 "master_link_status:%s\r\n"
6710 "master_last_io_seconds_ago:%d\r\n"
6711 ,server.masterhost,
6712 server.masterport,
6713 (server.replstate == REDIS_REPL_CONNECTED) ?
6714 "up" : "down",
6715 server.master ? ((int)(time(NULL)-server.master->lastinteraction)) : -1
6716 );
6717 }
6718 if (server.vm_enabled) {
6719 lockThreadedIO();
6720 info = sdscatprintf(info,
6721 "vm_conf_max_memory:%llu\r\n"
6722 "vm_conf_page_size:%llu\r\n"
6723 "vm_conf_pages:%llu\r\n"
6724 "vm_stats_used_pages:%llu\r\n"
6725 "vm_stats_swapped_objects:%llu\r\n"
6726 "vm_stats_swappin_count:%llu\r\n"
6727 "vm_stats_swappout_count:%llu\r\n"
6728 "vm_stats_io_newjobs_len:%lu\r\n"
6729 "vm_stats_io_processing_len:%lu\r\n"
6730 "vm_stats_io_processed_len:%lu\r\n"
6731 "vm_stats_io_active_threads:%lu\r\n"
6732 "vm_stats_blocked_clients:%lu\r\n"
6733 ,(unsigned long long) server.vm_max_memory,
6734 (unsigned long long) server.vm_page_size,
6735 (unsigned long long) server.vm_pages,
6736 (unsigned long long) server.vm_stats_used_pages,
6737 (unsigned long long) server.vm_stats_swapped_objects,
6738 (unsigned long long) server.vm_stats_swapins,
6739 (unsigned long long) server.vm_stats_swapouts,
6740 (unsigned long) listLength(server.io_newjobs),
6741 (unsigned long) listLength(server.io_processing),
6742 (unsigned long) listLength(server.io_processed),
6743 (unsigned long) server.io_active_threads,
6744 (unsigned long) server.vm_blocked_clients
6745 );
6746 unlockThreadedIO();
6747 }
6748 for (j = 0; j < server.dbnum; j++) {
6749 long long keys, vkeys;
6750
6751 keys = dictSize(server.db[j].dict);
6752 vkeys = dictSize(server.db[j].expires);
6753 if (keys || vkeys) {
6754 info = sdscatprintf(info, "db%d:keys=%lld,expires=%lld\r\n",
6755 j, keys, vkeys);
6756 }
6757 }
6758 return info;
6759 }
6760
6761 static void infoCommand(redisClient *c) {
6762 sds info = genRedisInfoString();
6763 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",
6764 (unsigned long)sdslen(info)));
6765 addReplySds(c,info);
6766 addReply(c,shared.crlf);
6767 }
6768
6769 static void monitorCommand(redisClient *c) {
6770 /* ignore MONITOR if aleady slave or in monitor mode */
6771 if (c->flags & REDIS_SLAVE) return;
6772
6773 c->flags |= (REDIS_SLAVE|REDIS_MONITOR);
6774 c->slaveseldb = 0;
6775 listAddNodeTail(server.monitors,c);
6776 addReply(c,shared.ok);
6777 }
6778
6779 /* ================================= Expire ================================= */
6780 static int removeExpire(redisDb *db, robj *key) {
6781 if (dictDelete(db->expires,key) == DICT_OK) {
6782 return 1;
6783 } else {
6784 return 0;
6785 }
6786 }
6787
6788 static int setExpire(redisDb *db, robj *key, time_t when) {
6789 if (dictAdd(db->expires,key,(void*)when) == DICT_ERR) {
6790 return 0;
6791 } else {
6792 incrRefCount(key);
6793 return 1;
6794 }
6795 }
6796
6797 /* Return the expire time of the specified key, or -1 if no expire
6798 * is associated with this key (i.e. the key is non volatile) */
6799 static time_t getExpire(redisDb *db, robj *key) {
6800 dictEntry *de;
6801
6802 /* No expire? return ASAP */
6803 if (dictSize(db->expires) == 0 ||
6804 (de = dictFind(db->expires,key)) == NULL) return -1;
6805
6806 return (time_t) dictGetEntryVal(de);
6807 }
6808
6809 static int expireIfNeeded(redisDb *db, robj *key) {
6810 time_t when;
6811 dictEntry *de;
6812
6813 /* No expire? return ASAP */
6814 if (dictSize(db->expires) == 0 ||
6815 (de = dictFind(db->expires,key)) == NULL) return 0;
6816
6817 /* Lookup the expire */
6818 when = (time_t) dictGetEntryVal(de);
6819 if (time(NULL) <= when) return 0;
6820
6821 /* Delete the key */
6822 dictDelete(db->expires,key);
6823 server.stat_expiredkeys++;
6824 return dictDelete(db->dict,key) == DICT_OK;
6825 }
6826
6827 static int deleteIfVolatile(redisDb *db, robj *key) {
6828 dictEntry *de;
6829
6830 /* No expire? return ASAP */
6831 if (dictSize(db->expires) == 0 ||
6832 (de = dictFind(db->expires,key)) == NULL) return 0;
6833
6834 /* Delete the key */
6835 server.dirty++;
6836 server.stat_expiredkeys++;
6837 dictDelete(db->expires,key);
6838 return dictDelete(db->dict,key) == DICT_OK;
6839 }
6840
6841 static void expireGenericCommand(redisClient *c, robj *key, time_t seconds) {
6842 dictEntry *de;
6843
6844 de = dictFind(c->db->dict,key);
6845 if (de == NULL) {
6846 addReply(c,shared.czero);
6847 return;
6848 }
6849 if (seconds < 0) {
6850 if (deleteKey(c->db,key)) server.dirty++;
6851 addReply(c, shared.cone);
6852 return;
6853 } else {
6854 time_t when = time(NULL)+seconds;
6855 if (setExpire(c->db,key,when)) {
6856 addReply(c,shared.cone);
6857 server.dirty++;
6858 } else {
6859 addReply(c,shared.czero);
6860 }
6861 return;
6862 }
6863 }
6864
6865 static void expireCommand(redisClient *c) {
6866 expireGenericCommand(c,c->argv[1],strtol(c->argv[2]->ptr,NULL,10));
6867 }
6868
6869 static void expireatCommand(redisClient *c) {
6870 expireGenericCommand(c,c->argv[1],strtol(c->argv[2]->ptr,NULL,10)-time(NULL));
6871 }
6872
6873 static void ttlCommand(redisClient *c) {
6874 time_t expire;
6875 int ttl = -1;
6876
6877 expire = getExpire(c->db,c->argv[1]);
6878 if (expire != -1) {
6879 ttl = (int) (expire-time(NULL));
6880 if (ttl < 0) ttl = -1;
6881 }
6882 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",ttl));
6883 }
6884
6885 /* ================================ MULTI/EXEC ============================== */
6886
6887 /* Client state initialization for MULTI/EXEC */
6888 static void initClientMultiState(redisClient *c) {
6889 c->mstate.commands = NULL;
6890 c->mstate.count = 0;
6891 }
6892
6893 /* Release all the resources associated with MULTI/EXEC state */
6894 static void freeClientMultiState(redisClient *c) {
6895 int j;
6896
6897 for (j = 0; j < c->mstate.count; j++) {
6898 int i;
6899 multiCmd *mc = c->mstate.commands+j;
6900
6901 for (i = 0; i < mc->argc; i++)
6902 decrRefCount(mc->argv[i]);
6903 zfree(mc->argv);
6904 }
6905 zfree(c->mstate.commands);
6906 }
6907
6908 /* Add a new command into the MULTI commands queue */
6909 static void queueMultiCommand(redisClient *c, struct redisCommand *cmd) {
6910 multiCmd *mc;
6911 int j;
6912
6913 c->mstate.commands = zrealloc(c->mstate.commands,
6914 sizeof(multiCmd)*(c->mstate.count+1));
6915 mc = c->mstate.commands+c->mstate.count;
6916 mc->cmd = cmd;
6917 mc->argc = c->argc;
6918 mc->argv = zmalloc(sizeof(robj*)*c->argc);
6919 memcpy(mc->argv,c->argv,sizeof(robj*)*c->argc);
6920 for (j = 0; j < c->argc; j++)
6921 incrRefCount(mc->argv[j]);
6922 c->mstate.count++;
6923 }
6924
6925 static void multiCommand(redisClient *c) {
6926 c->flags |= REDIS_MULTI;
6927 addReply(c,shared.ok);
6928 }
6929
6930 static void discardCommand(redisClient *c) {
6931 if (!(c->flags & REDIS_MULTI)) {
6932 addReplySds(c,sdsnew("-ERR DISCARD without MULTI\r\n"));
6933 return;
6934 }
6935
6936 freeClientMultiState(c);
6937 initClientMultiState(c);
6938 c->flags &= (~REDIS_MULTI);
6939 addReply(c,shared.ok);
6940 }
6941
6942 static void execCommand(redisClient *c) {
6943 int j;
6944 robj **orig_argv;
6945 int orig_argc;
6946
6947 if (!(c->flags & REDIS_MULTI)) {
6948 addReplySds(c,sdsnew("-ERR EXEC without MULTI\r\n"));
6949 return;
6950 }
6951
6952 orig_argv = c->argv;
6953 orig_argc = c->argc;
6954 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->mstate.count));
6955 for (j = 0; j < c->mstate.count; j++) {
6956 c->argc = c->mstate.commands[j].argc;
6957 c->argv = c->mstate.commands[j].argv;
6958 call(c,c->mstate.commands[j].cmd);
6959 }
6960 c->argv = orig_argv;
6961 c->argc = orig_argc;
6962 freeClientMultiState(c);
6963 initClientMultiState(c);
6964 c->flags &= (~REDIS_MULTI);
6965 }
6966
6967 /* =========================== Blocking Operations ========================= */
6968
6969 /* Currently Redis blocking operations support is limited to list POP ops,
6970 * so the current implementation is not fully generic, but it is also not
6971 * completely specific so it will not require a rewrite to support new
6972 * kind of blocking operations in the future.
6973 *
6974 * Still it's important to note that list blocking operations can be already
6975 * used as a notification mechanism in order to implement other blocking
6976 * operations at application level, so there must be a very strong evidence
6977 * of usefulness and generality before new blocking operations are implemented.
6978 *
 * This is how the current blocking POP works, we use BLPOP as example:
 * - If the user calls BLPOP and the key exists and contains a non empty list
 *   then LPOP is called instead. So BLPOP is semantically the same as LPOP
 *   if there is no need to block.
 * - If instead BLPOP is called and the key does not exist or the list is
 *   empty we need to block. In order to do so we remove the notification for
 *   new data to read in the client socket (so that we'll not serve new
 *   requests if the blocking request is not served). Also we put the client
 *   in a dictionary (db->blockingkeys) mapping keys to a list of clients
 *   blocking for these keys.
 * - If a PUSH operation against a key with blocked clients waiting is
 *   performed, we serve the first in the list: basically instead of pushing
 *   the new element inside the list we return it to the (first / oldest)
 *   blocking client, unblock the client, and remove it from the list.
6993 *
6994 * The above comment and the source code should be enough in order to understand
6995 * the implementation and modify / fix it later.
6996 */
6997
6998 /* Set a client in blocking mode for the specified key, with the specified
6999 * timeout */
7000 static void blockForKeys(redisClient *c, robj **keys, int numkeys, time_t timeout) {
7001 dictEntry *de;
7002 list *l;
7003 int j;
7004
7005 c->blockingkeys = zmalloc(sizeof(robj*)*numkeys);
7006 c->blockingkeysnum = numkeys;
7007 c->blockingto = timeout;
7008 for (j = 0; j < numkeys; j++) {
7009 /* Add the key in the client structure, to map clients -> keys */
7010 c->blockingkeys[j] = keys[j];
7011 incrRefCount(keys[j]);
7012
7013 /* And in the other "side", to map keys -> clients */
7014 de = dictFind(c->db->blockingkeys,keys[j]);
7015 if (de == NULL) {
7016 int retval;
7017
7018 /* For every key we take a list of clients blocked for it */
7019 l = listCreate();
7020 retval = dictAdd(c->db->blockingkeys,keys[j],l);
7021 incrRefCount(keys[j]);
7022 assert(retval == DICT_OK);
7023 } else {
7024 l = dictGetEntryVal(de);
7025 }
7026 listAddNodeTail(l,c);
7027 }
7028 /* Mark the client as a blocked client */
7029 c->flags |= REDIS_BLOCKED;
7030 server.blpop_blocked_clients++;
7031 }
7032
7033 /* Unblock a client that's waiting in a blocking operation such as BLPOP */
7034 static void unblockClientWaitingData(redisClient *c) {
7035 dictEntry *de;
7036 list *l;
7037 int j;
7038
7039 assert(c->blockingkeys != NULL);
7040 /* The client may wait for multiple keys, so unblock it for every key. */
7041 for (j = 0; j < c->blockingkeysnum; j++) {
7042 /* Remove this client from the list of clients waiting for this key. */
7043 de = dictFind(c->db->blockingkeys,c->blockingkeys[j]);
7044 assert(de != NULL);
7045 l = dictGetEntryVal(de);
7046 listDelNode(l,listSearchKey(l,c));
7047 /* If the list is empty we need to remove it to avoid wasting memory */
7048 if (listLength(l) == 0)
7049 dictDelete(c->db->blockingkeys,c->blockingkeys[j]);
7050 decrRefCount(c->blockingkeys[j]);
7051 }
7052 /* Cleanup the client structure */
7053 zfree(c->blockingkeys);
7054 c->blockingkeys = NULL;
7055 c->flags &= (~REDIS_BLOCKED);
7056 server.blpop_blocked_clients--;
7057 /* We want to process data if there is some command waiting
7058 * in the input buffer. Note that this is safe even if
7059 * unblockClientWaitingData() gets called from freeClient() because
7060 * freeClient() will be smart enough to call this function
7061 * *after* c->querybuf was set to NULL. */
7062 if (c->querybuf && sdslen(c->querybuf) > 0) processInputBuffer(c);
7063 }
7064
7065 /* This should be called from any function PUSHing into lists.
7066 * 'c' is the "pushing client", 'key' is the key it is pushing data against,
7067 * 'ele' is the element pushed.
7068 *
7069 * If the function returns 0 there was no client waiting for a list push
7070 * against this key.
7071 *
7072 * If the function returns 1 there was a client waiting for a list push
7073 * against this key, the element was passed to this client thus it's not
7074 * needed to actually add it to the list and the caller should return asap. */
7075 static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele) {
7076 struct dictEntry *de;
7077 redisClient *receiver;
7078 list *l;
7079 listNode *ln;
7080
7081 de = dictFind(c->db->blockingkeys,key);
7082 if (de == NULL) return 0;
7083 l = dictGetEntryVal(de);
7084 ln = listFirst(l);
7085 assert(ln != NULL);
7086 receiver = ln->value;
7087
7088 addReplySds(receiver,sdsnew("*2\r\n"));
7089 addReplyBulk(receiver,key);
7090 addReplyBulk(receiver,ele);
7091 unblockClientWaitingData(receiver);
7092 return 1;
7093 }
7094
7095 /* Blocking RPOP/LPOP */
7096 static void blockingPopGenericCommand(redisClient *c, int where) {
7097 robj *o;
7098 time_t timeout;
7099 int j;
7100
7101 for (j = 1; j < c->argc-1; j++) {
7102 o = lookupKeyWrite(c->db,c->argv[j]);
7103 if (o != NULL) {
7104 if (o->type != REDIS_LIST) {
7105 addReply(c,shared.wrongtypeerr);
7106 return;
7107 } else {
7108 list *list = o->ptr;
7109 if (listLength(list) != 0) {
7110 /* If the list contains elements fall back to the usual
7111 * non-blocking POP operation */
7112 robj *argv[2], **orig_argv;
7113 int orig_argc;
7114
7115 /* We need to alter the command arguments before to call
7116 * popGenericCommand() as the command takes a single key. */
7117 orig_argv = c->argv;
7118 orig_argc = c->argc;
7119 argv[1] = c->argv[j];
7120 c->argv = argv;
7121 c->argc = 2;
7122
7123 /* Also the return value is different, we need to output
7124 * the multi bulk reply header and the key name. The
7125 * "real" command will add the last element (the value)
7126 * for us. If this souds like an hack to you it's just
7127 * because it is... */
7128 addReplySds(c,sdsnew("*2\r\n"));
7129 addReplyBulk(c,argv[1]);
7130 popGenericCommand(c,where);
7131
7132 /* Fix the client structure with the original stuff */
7133 c->argv = orig_argv;
7134 c->argc = orig_argc;
7135 return;
7136 }
7137 }
7138 }
7139 }
7140 /* If the list is empty or the key does not exists we must block */
7141 timeout = strtol(c->argv[c->argc-1]->ptr,NULL,10);
7142 if (timeout > 0) timeout += time(NULL);
7143 blockForKeys(c,c->argv+1,c->argc-2,timeout);
7144 }
7145
7146 static void blpopCommand(redisClient *c) {
7147 blockingPopGenericCommand(c,REDIS_HEAD);
7148 }
7149
7150 static void brpopCommand(redisClient *c) {
7151 blockingPopGenericCommand(c,REDIS_TAIL);
7152 }
7153
7154 /* =============================== Replication ============================= */
7155
7156 static int syncWrite(int fd, char *ptr, ssize_t size, int timeout) {
7157 ssize_t nwritten, ret = size;
7158 time_t start = time(NULL);
7159
7160 timeout++;
7161 while(size) {
7162 if (aeWait(fd,AE_WRITABLE,1000) & AE_WRITABLE) {
7163 nwritten = write(fd,ptr,size);
7164 if (nwritten == -1) return -1;
7165 ptr += nwritten;
7166 size -= nwritten;
7167 }
7168 if ((time(NULL)-start) > timeout) {
7169 errno = ETIMEDOUT;
7170 return -1;
7171 }
7172 }
7173 return ret;
7174 }
7175
7176 static int syncRead(int fd, char *ptr, ssize_t size, int timeout) {
7177 ssize_t nread, totread = 0;
7178 time_t start = time(NULL);
7179
7180 timeout++;
7181 while(size) {
7182 if (aeWait(fd,AE_READABLE,1000) & AE_READABLE) {
7183 nread = read(fd,ptr,size);
7184 if (nread == -1) return -1;
7185 ptr += nread;
7186 size -= nread;
7187 totread += nread;
7188 }
7189 if ((time(NULL)-start) > timeout) {
7190 errno = ETIMEDOUT;
7191 return -1;
7192 }
7193 }
7194 return totread;
7195 }
7196
/* Read one line from 'fd', a byte at a time through syncRead(), into the
 * buffer 'ptr' of 'size' bytes. 'timeout' is passed to syncRead() for every
 * single byte.
 *
 * The terminating '\n' is not stored, and a '\r' stored right before it is
 * overwritten with the string terminator. At most size-1 bytes are stored,
 * so the buffer is never overrun and is always nul terminated when 'size' is
 * at least 1: a line that does not fit is cut at size-1 bytes and the rest
 * is left unread.
 *
 * Returns the number of bytes stored before the '\n' (the count includes the
 * overwritten '\r', if any), 0 if 'size' leaves no room for data, or -1 as
 * soon as syncRead() fails. */
static int syncReadLine(int fd, char *ptr, ssize_t size, int timeout) {
    ssize_t nread = 0;

    if (size <= 0) return 0; /* not even room for the terminator */
    *ptr = '\0';
    size--; /* reserve one byte for the terminator */
    while(size) {
        char c;

        if (syncRead(fd,&c,1,timeout) == -1) return -1;
        if (c == '\n') {
            *ptr = '\0';
            if (nread && *(ptr-1) == '\r') *(ptr-1) = '\0';
            return nread;
        }
        *ptr++ = c;
        *ptr = '\0';
        nread++;
        size--; /* one less free byte: this is what bounds the loop */
    }
    return nread;
}
7217
7218 static void syncCommand(redisClient *c) {
7219 /* ignore SYNC if aleady slave or in monitor mode */
7220 if (c->flags & REDIS_SLAVE) return;
7221
7222 /* SYNC can't be issued when the server has pending data to send to
7223 * the client about already issued commands. We need a fresh reply
7224 * buffer registering the differences between the BGSAVE and the current
7225 * dataset, so that we can copy to other slaves if needed. */
7226 if (listLength(c->reply) != 0) {
7227 addReplySds(c,sdsnew("-ERR SYNC is invalid with pending input\r\n"));
7228 return;
7229 }
7230
7231 redisLog(REDIS_NOTICE,"Slave ask for synchronization");
7232 /* Here we need to check if there is a background saving operation
7233 * in progress, or if it is required to start one */
7234 if (server.bgsavechildpid != -1) {
7235 /* Ok a background save is in progress. Let's check if it is a good
7236 * one for replication, i.e. if there is another slave that is
7237 * registering differences since the server forked to save */
7238 redisClient *slave;
7239 listNode *ln;
7240 listIter li;
7241
7242 listRewind(server.slaves,&li);
7243 while((ln = listNext(&li))) {
7244 slave = ln->value;
7245 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) break;
7246 }
7247 if (ln) {
7248 /* Perfect, the server is already registering differences for
7249 * another slave. Set the right state, and copy the buffer. */
7250 listRelease(c->reply);
7251 c->reply = listDup(slave->reply);
7252 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7253 redisLog(REDIS_NOTICE,"Waiting for end of BGSAVE for SYNC");
7254 } else {
7255 /* No way, we need to wait for the next BGSAVE in order to
7256 * register differences */
7257 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
7258 redisLog(REDIS_NOTICE,"Waiting for next BGSAVE for SYNC");
7259 }
7260 } else {
7261 /* Ok we don't have a BGSAVE in progress, let's start one */
7262 redisLog(REDIS_NOTICE,"Starting BGSAVE for SYNC");
7263 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
7264 redisLog(REDIS_NOTICE,"Replication failed, can't BGSAVE");
7265 addReplySds(c,sdsnew("-ERR Unalbe to perform background save\r\n"));
7266 return;
7267 }
7268 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7269 }
7270 c->repldbfd = -1;
7271 c->flags |= REDIS_SLAVE;
7272 c->slaveseldb = 0;
7273 listAddNodeTail(server.slaves,c);
7274 return;
7275 }
7276
7277 static void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) {
7278 redisClient *slave = privdata;
7279 REDIS_NOTUSED(el);
7280 REDIS_NOTUSED(mask);
7281 char buf[REDIS_IOBUF_LEN];
7282 ssize_t nwritten, buflen;
7283
7284 if (slave->repldboff == 0) {
7285 /* Write the bulk write count before to transfer the DB. In theory here
7286 * we don't know how much room there is in the output buffer of the
7287 * socket, but in pratice SO_SNDLOWAT (the minimum count for output
7288 * operations) will never be smaller than the few bytes we need. */
7289 sds bulkcount;
7290
7291 bulkcount = sdscatprintf(sdsempty(),"$%lld\r\n",(unsigned long long)
7292 slave->repldbsize);
7293 if (write(fd,bulkcount,sdslen(bulkcount)) != (signed)sdslen(bulkcount))
7294 {
7295 sdsfree(bulkcount);
7296 freeClient(slave);
7297 return;
7298 }
7299 sdsfree(bulkcount);
7300 }
7301 lseek(slave->repldbfd,slave->repldboff,SEEK_SET);
7302 buflen = read(slave->repldbfd,buf,REDIS_IOBUF_LEN);
7303 if (buflen <= 0) {
7304 redisLog(REDIS_WARNING,"Read error sending DB to slave: %s",
7305 (buflen == 0) ? "premature EOF" : strerror(errno));
7306 freeClient(slave);
7307 return;
7308 }
7309 if ((nwritten = write(fd,buf,buflen)) == -1) {
7310 redisLog(REDIS_VERBOSE,"Write error sending DB to slave: %s",
7311 strerror(errno));
7312 freeClient(slave);
7313 return;
7314 }
7315 slave->repldboff += nwritten;
7316 if (slave->repldboff == slave->repldbsize) {
7317 close(slave->repldbfd);
7318 slave->repldbfd = -1;
7319 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
7320 slave->replstate = REDIS_REPL_ONLINE;
7321 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE,
7322 sendReplyToClient, slave) == AE_ERR) {
7323 freeClient(slave);
7324 return;
7325 }
7326 addReplySds(slave,sdsempty());
7327 redisLog(REDIS_NOTICE,"Synchronization with slave succeeded");
7328 }
7329 }
7330
7331 /* This function is called at the end of every backgrond saving.
7332 * The argument bgsaveerr is REDIS_OK if the background saving succeeded
7333 * otherwise REDIS_ERR is passed to the function.
7334 *
7335 * The goal of this function is to handle slaves waiting for a successful
7336 * background saving in order to perform non-blocking synchronization. */
7337 static void updateSlavesWaitingBgsave(int bgsaveerr) {
7338 listNode *ln;
7339 int startbgsave = 0;
7340 listIter li;
7341
7342 listRewind(server.slaves,&li);
7343 while((ln = listNext(&li))) {
7344 redisClient *slave = ln->value;
7345
7346 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) {
7347 startbgsave = 1;
7348 slave->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7349 } else if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) {
7350 struct redis_stat buf;
7351
7352 if (bgsaveerr != REDIS_OK) {
7353 freeClient(slave);
7354 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE child returned an error");
7355 continue;
7356 }
7357 if ((slave->repldbfd = open(server.dbfilename,O_RDONLY)) == -1 ||
7358 redis_fstat(slave->repldbfd,&buf) == -1) {
7359 freeClient(slave);
7360 redisLog(REDIS_WARNING,"SYNC failed. Can't open/stat DB after BGSAVE: %s", strerror(errno));
7361 continue;
7362 }
7363 slave->repldboff = 0;
7364 slave->repldbsize = buf.st_size;
7365 slave->replstate = REDIS_REPL_SEND_BULK;
7366 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
7367 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE, sendBulkToSlave, slave) == AE_ERR) {
7368 freeClient(slave);
7369 continue;
7370 }
7371 }
7372 }
7373 if (startbgsave) {
7374 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
7375 listIter li;
7376
7377 listRewind(server.slaves,&li);
7378 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE failed");
7379 while((ln = listNext(&li))) {
7380 redisClient *slave = ln->value;
7381
7382 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START)
7383 freeClient(slave);
7384 }
7385 }
7386 }
7387 }
7388
7389 static int syncWithMaster(void) {
7390 char buf[1024], tmpfile[256], authcmd[1024];
7391 long dumpsize;
7392 int fd = anetTcpConnect(NULL,server.masterhost,server.masterport);
7393 int dfd, maxtries = 5;
7394
7395 if (fd == -1) {
7396 redisLog(REDIS_WARNING,"Unable to connect to MASTER: %s",
7397 strerror(errno));
7398 return REDIS_ERR;
7399 }
7400
7401 /* AUTH with the master if required. */
7402 if(server.masterauth) {
7403 snprintf(authcmd, 1024, "AUTH %s\r\n", server.masterauth);
7404 if (syncWrite(fd, authcmd, strlen(server.masterauth)+7, 5) == -1) {
7405 close(fd);
7406 redisLog(REDIS_WARNING,"Unable to AUTH to MASTER: %s",
7407 strerror(errno));
7408 return REDIS_ERR;
7409 }
7410 /* Read the AUTH result. */
7411 if (syncReadLine(fd,buf,1024,3600) == -1) {
7412 close(fd);
7413 redisLog(REDIS_WARNING,"I/O error reading auth result from MASTER: %s",
7414 strerror(errno));
7415 return REDIS_ERR;
7416 }
7417 if (buf[0] != '+') {
7418 close(fd);
7419 redisLog(REDIS_WARNING,"Cannot AUTH to MASTER, is the masterauth password correct?");
7420 return REDIS_ERR;
7421 }
7422 }
7423
7424 /* Issue the SYNC command */
7425 if (syncWrite(fd,"SYNC \r\n",7,5) == -1) {
7426 close(fd);
7427 redisLog(REDIS_WARNING,"I/O error writing to MASTER: %s",
7428 strerror(errno));
7429 return REDIS_ERR;
7430 }
7431 /* Read the bulk write count */
7432 if (syncReadLine(fd,buf,1024,3600) == -1) {
7433 close(fd);
7434 redisLog(REDIS_WARNING,"I/O error reading bulk count from MASTER: %s",
7435 strerror(errno));
7436 return REDIS_ERR;
7437 }
7438 if (buf[0] != '$') {
7439 close(fd);
7440 redisLog(REDIS_WARNING,"Bad protocol from MASTER, the first byte is not '$', are you sure the host and port are right?");
7441 return REDIS_ERR;
7442 }
7443 dumpsize = strtol(buf+1,NULL,10);
7444 redisLog(REDIS_NOTICE,"Receiving %ld bytes data dump from MASTER",dumpsize);
7445 /* Read the bulk write data on a temp file */
7446 while(maxtries--) {
7447 snprintf(tmpfile,256,
7448 "temp-%d.%ld.rdb",(int)time(NULL),(long int)getpid());
7449 dfd = open(tmpfile,O_CREAT|O_WRONLY|O_EXCL,0644);
7450 if (dfd != -1) break;
7451 sleep(1);
7452 }
7453 if (dfd == -1) {
7454 close(fd);
7455 redisLog(REDIS_WARNING,"Opening the temp file needed for MASTER <-> SLAVE synchronization: %s",strerror(errno));
7456 return REDIS_ERR;
7457 }
7458 while(dumpsize) {
7459 int nread, nwritten;
7460
7461 nread = read(fd,buf,(dumpsize < 1024)?dumpsize:1024);
7462 if (nread == -1) {
7463 redisLog(REDIS_WARNING,"I/O error trying to sync with MASTER: %s",
7464 strerror(errno));
7465 close(fd);
7466 close(dfd);
7467 return REDIS_ERR;
7468 }
7469 nwritten = write(dfd,buf,nread);
7470 if (nwritten == -1) {
7471 redisLog(REDIS_WARNING,"Write error writing to the DB dump file needed for MASTER <-> SLAVE synchrnonization: %s", strerror(errno));
7472 close(fd);
7473 close(dfd);
7474 return REDIS_ERR;
7475 }
7476 dumpsize -= nread;
7477 }
7478 close(dfd);
7479 if (rename(tmpfile,server.dbfilename) == -1) {
7480 redisLog(REDIS_WARNING,"Failed trying to rename the temp DB into dump.rdb in MASTER <-> SLAVE synchronization: %s", strerror(errno));
7481 unlink(tmpfile);
7482 close(fd);
7483 return REDIS_ERR;
7484 }
7485 emptyDb();
7486 if (rdbLoad(server.dbfilename) != REDIS_OK) {
7487 redisLog(REDIS_WARNING,"Failed trying to load the MASTER synchronization DB from disk");
7488 close(fd);
7489 return REDIS_ERR;
7490 }
7491 server.master = createClient(fd);
7492 server.master->flags |= REDIS_MASTER;
7493 server.master->authenticated = 1;
7494 server.replstate = REDIS_REPL_CONNECTED;
7495 return REDIS_OK;
7496 }
7497
7498 static void slaveofCommand(redisClient *c) {
7499 if (!strcasecmp(c->argv[1]->ptr,"no") &&
7500 !strcasecmp(c->argv[2]->ptr,"one")) {
7501 if (server.masterhost) {
7502 sdsfree(server.masterhost);
7503 server.masterhost = NULL;
7504 if (server.master) freeClient(server.master);
7505 server.replstate = REDIS_REPL_NONE;
7506 redisLog(REDIS_NOTICE,"MASTER MODE enabled (user request)");
7507 }
7508 } else {
7509 sdsfree(server.masterhost);
7510 server.masterhost = sdsdup(c->argv[1]->ptr);
7511 server.masterport = atoi(c->argv[2]->ptr);
7512 if (server.master) freeClient(server.master);
7513 server.replstate = REDIS_REPL_CONNECT;
7514 redisLog(REDIS_NOTICE,"SLAVE OF %s:%d enabled (user request)",
7515 server.masterhost, server.masterport);
7516 }
7517 addReply(c,shared.ok);
7518 }
7519
7520 /* ============================ Maxmemory directive ======================== */
7521
7522 /* Try to free one object form the pre-allocated objects free list.
7523 * This is useful under low mem conditions as by default we take 1 million
7524 * free objects allocated. On success REDIS_OK is returned, otherwise
7525 * REDIS_ERR. */
7526 static int tryFreeOneObjectFromFreelist(void) {
7527 robj *o;
7528
7529 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
7530 if (listLength(server.objfreelist)) {
7531 listNode *head = listFirst(server.objfreelist);
7532 o = listNodeValue(head);
7533 listDelNode(server.objfreelist,head);
7534 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
7535 zfree(o);
7536 return REDIS_OK;
7537 } else {
7538 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
7539 return REDIS_ERR;
7540 }
7541 }
7542
7543 /* This function gets called when 'maxmemory' is set on the config file to limit
7544 * the max memory used by the server, and we are out of memory.
7545 * This function will try to, in order:
7546 *
7547 * - Free objects from the free list
7548 * - Try to remove keys with an EXPIRE set
7549 *
7550 * It is not possible to free enough memory to reach used-memory < maxmemory
7551 * the server will start refusing commands that will enlarge even more the
7552 * memory usage.
7553 */
7554 static void freeMemoryIfNeeded(void) {
7555 while (server.maxmemory && zmalloc_used_memory() > server.maxmemory) {
7556 int j, k, freed = 0;
7557
7558 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
7559 for (j = 0; j < server.dbnum; j++) {
7560 int minttl = -1;
7561 robj *minkey = NULL;
7562 struct dictEntry *de;
7563
7564 if (dictSize(server.db[j].expires)) {
7565 freed = 1;
7566 /* From a sample of three keys drop the one nearest to
7567 * the natural expire */
7568 for (k = 0; k < 3; k++) {
7569 time_t t;
7570
7571 de = dictGetRandomKey(server.db[j].expires);
7572 t = (time_t) dictGetEntryVal(de);
7573 if (minttl == -1 || t < minttl) {
7574 minkey = dictGetEntryKey(de);
7575 minttl = t;
7576 }
7577 }
7578 deleteKey(server.db+j,minkey);
7579 }
7580 }
7581 if (!freed) return; /* nothing to free... */
7582 }
7583 }
7584
7585 /* ============================== Append Only file ========================== */
7586
7587 static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) {
7588 sds buf = sdsempty();
7589 int j;
7590 ssize_t nwritten;
7591 time_t now;
7592 robj *tmpargv[3];
7593
7594 /* The DB this command was targetting is not the same as the last command
7595 * we appendend. To issue a SELECT command is needed. */
7596 if (dictid != server.appendseldb) {
7597 char seldb[64];
7598
7599 snprintf(seldb,sizeof(seldb),"%d",dictid);
7600 buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
7601 (unsigned long)strlen(seldb),seldb);
7602 server.appendseldb = dictid;
7603 }
7604
7605 /* "Fix" the argv vector if the command is EXPIRE. We want to translate
7606 * EXPIREs into EXPIREATs calls */
7607 if (cmd->proc == expireCommand) {
7608 long when;
7609
7610 tmpargv[0] = createStringObject("EXPIREAT",8);
7611 tmpargv[1] = argv[1];
7612 incrRefCount(argv[1]);
7613 when = time(NULL)+strtol(argv[2]->ptr,NULL,10);
7614 tmpargv[2] = createObject(REDIS_STRING,
7615 sdscatprintf(sdsempty(),"%ld",when));
7616 argv = tmpargv;
7617 }
7618
7619 /* Append the actual command */
7620 buf = sdscatprintf(buf,"*%d\r\n",argc);
7621 for (j = 0; j < argc; j++) {
7622 robj *o = argv[j];
7623
7624 o = getDecodedObject(o);
7625 buf = sdscatprintf(buf,"$%lu\r\n",(unsigned long)sdslen(o->ptr));
7626 buf = sdscatlen(buf,o->ptr,sdslen(o->ptr));
7627 buf = sdscatlen(buf,"\r\n",2);
7628 decrRefCount(o);
7629 }
7630
7631 /* Free the objects from the modified argv for EXPIREAT */
7632 if (cmd->proc == expireCommand) {
7633 for (j = 0; j < 3; j++)
7634 decrRefCount(argv[j]);
7635 }
7636
7637 /* We want to perform a single write. This should be guaranteed atomic
7638 * at least if the filesystem we are writing is a real physical one.
7639 * While this will save us against the server being killed I don't think
7640 * there is much to do about the whole server stopping for power problems
7641 * or alike */
7642 nwritten = write(server.appendfd,buf,sdslen(buf));
7643 if (nwritten != (signed)sdslen(buf)) {
7644 /* Ooops, we are in troubles. The best thing to do for now is
7645 * to simply exit instead to give the illusion that everything is
7646 * working as expected. */
7647 if (nwritten == -1) {
7648 redisLog(REDIS_WARNING,"Exiting on error writing to the append-only file: %s",strerror(errno));
7649 } else {
7650 redisLog(REDIS_WARNING,"Exiting on short write while writing to the append-only file: %s",strerror(errno));
7651 }
7652 exit(1);
7653 }
7654 /* If a background append only file rewriting is in progress we want to
7655 * accumulate the differences between the child DB and the current one
7656 * in a buffer, so that when the child process will do its work we
7657 * can append the differences to the new append only file. */
7658 if (server.bgrewritechildpid != -1)
7659 server.bgrewritebuf = sdscatlen(server.bgrewritebuf,buf,sdslen(buf));
7660
7661 sdsfree(buf);
7662 now = time(NULL);
7663 if (server.appendfsync == APPENDFSYNC_ALWAYS ||
7664 (server.appendfsync == APPENDFSYNC_EVERYSEC &&
7665 now-server.lastfsync > 1))
7666 {
7667 fsync(server.appendfd); /* Let's try to get this data on the disk */
7668 server.lastfsync = now;
7669 }
7670 }
7671
7672 /* In Redis commands are always executed in the context of a client, so in
7673 * order to load the append only file we need to create a fake client. */
7674 static struct redisClient *createFakeClient(void) {
7675 struct redisClient *c = zmalloc(sizeof(*c));
7676
7677 selectDb(c,0);
7678 c->fd = -1;
7679 c->querybuf = sdsempty();
7680 c->argc = 0;
7681 c->argv = NULL;
7682 c->flags = 0;
7683 /* We set the fake client as a slave waiting for the synchronization
7684 * so that Redis will not try to send replies to this client. */
7685 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
7686 c->reply = listCreate();
7687 listSetFreeMethod(c->reply,decrRefCount);
7688 listSetDupMethod(c->reply,dupClientReplyValue);
7689 return c;
7690 }
7691
7692 static void freeFakeClient(struct redisClient *c) {
7693 sdsfree(c->querybuf);
7694 listRelease(c->reply);
7695 zfree(c);
7696 }
7697
7698 /* Replay the append log file. On error REDIS_OK is returned. On non fatal
7699 * error (the append only file is zero-length) REDIS_ERR is returned. On
7700 * fatal error an error message is logged and the program exists. */
7701 int loadAppendOnlyFile(char *filename) {
7702 struct redisClient *fakeClient;
7703 FILE *fp = fopen(filename,"r");
7704 struct redis_stat sb;
7705 unsigned long long loadedkeys = 0;
7706
7707 if (redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0)
7708 return REDIS_ERR;
7709
7710 if (fp == NULL) {
7711 redisLog(REDIS_WARNING,"Fatal error: can't open the append log file for reading: %s",strerror(errno));
7712 exit(1);
7713 }
7714
7715 fakeClient = createFakeClient();
7716 while(1) {
7717 int argc, j;
7718 unsigned long len;
7719 robj **argv;
7720 char buf[128];
7721 sds argsds;
7722 struct redisCommand *cmd;
7723
7724 if (fgets(buf,sizeof(buf),fp) == NULL) {
7725 if (feof(fp))
7726 break;
7727 else
7728 goto readerr;
7729 }
7730 if (buf[0] != '*') goto fmterr;
7731 argc = atoi(buf+1);
7732 argv = zmalloc(sizeof(robj*)*argc);
7733 for (j = 0; j < argc; j++) {
7734 if (fgets(buf,sizeof(buf),fp) == NULL) goto readerr;
7735 if (buf[0] != '$') goto fmterr;
7736 len = strtol(buf+1,NULL,10);
7737 argsds = sdsnewlen(NULL,len);
7738 if (len && fread(argsds,len,1,fp) == 0) goto fmterr;
7739 argv[j] = createObject(REDIS_STRING,argsds);
7740 if (fread(buf,2,1,fp) == 0) goto fmterr; /* discard CRLF */
7741 }
7742
7743 /* Command lookup */
7744 cmd = lookupCommand(argv[0]->ptr);
7745 if (!cmd) {
7746 redisLog(REDIS_WARNING,"Unknown command '%s' reading the append only file", argv[0]->ptr);
7747 exit(1);
7748 }
7749 /* Try object sharing and encoding */
7750 if (server.shareobjects) {
7751 int j;
7752 for(j = 1; j < argc; j++)
7753 argv[j] = tryObjectSharing(argv[j]);
7754 }
7755 if (cmd->flags & REDIS_CMD_BULK)
7756 tryObjectEncoding(argv[argc-1]);
7757 /* Run the command in the context of a fake client */
7758 fakeClient->argc = argc;
7759 fakeClient->argv = argv;
7760 cmd->proc(fakeClient);
7761 /* Discard the reply objects list from the fake client */
7762 while(listLength(fakeClient->reply))
7763 listDelNode(fakeClient->reply,listFirst(fakeClient->reply));
7764 /* Clean up, ready for the next command */
7765 for (j = 0; j < argc; j++) decrRefCount(argv[j]);
7766 zfree(argv);
7767 /* Handle swapping while loading big datasets when VM is on */
7768 loadedkeys++;
7769 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
7770 while (zmalloc_used_memory() > server.vm_max_memory) {
7771 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
7772 }
7773 }
7774 }
7775 fclose(fp);
7776 freeFakeClient(fakeClient);
7777 return REDIS_OK;
7778
7779 readerr:
7780 if (feof(fp)) {
7781 redisLog(REDIS_WARNING,"Unexpected end of file reading the append only file");
7782 } else {
7783 redisLog(REDIS_WARNING,"Unrecoverable error reading the append only file: %s", strerror(errno));
7784 }
7785 exit(1);
7786 fmterr:
7787 redisLog(REDIS_WARNING,"Bad file format reading the append only file");
7788 exit(1);
7789 }
7790
7791 /* Write an object into a file in the bulk format $<count>\r\n<payload>\r\n */
7792 static int fwriteBulkObject(FILE *fp, robj *obj) {
7793 char buf[128];
7794 int decrrc = 0;
7795
7796 /* Avoid the incr/decr ref count business if possible to help
7797 * copy-on-write (we are often in a child process when this function
7798 * is called).
7799 * Also makes sure that key objects don't get incrRefCount-ed when VM
7800 * is enabled */
7801 if (obj->encoding != REDIS_ENCODING_RAW) {
7802 obj = getDecodedObject(obj);
7803 decrrc = 1;
7804 }
7805 snprintf(buf,sizeof(buf),"$%ld\r\n",(long)sdslen(obj->ptr));
7806 if (fwrite(buf,strlen(buf),1,fp) == 0) goto err;
7807 if (sdslen(obj->ptr) && fwrite(obj->ptr,sdslen(obj->ptr),1,fp) == 0)
7808 goto err;
7809 if (fwrite("\r\n",2,1,fp) == 0) goto err;
7810 if (decrrc) decrRefCount(obj);
7811 return 1;
7812 err:
7813 if (decrrc) decrRefCount(obj);
7814 return 0;
7815 }
7816
/* Write a binary-safe string into a file in the bulk format
 * $<count>\r\n<payload>\r\n
 *
 * 's' points to 'len' bytes of payload (it does not need to be null
 * terminated). Returns 1 on success, 0 on write error. */
static int fwriteBulkString(FILE *fp, char *s, unsigned long len) {
    char buf[128];

    /* 'len' is unsigned: use the matching %lu conversion. */
    snprintf(buf,sizeof(buf),"$%lu\r\n",len);
    if (fwrite(buf,strlen(buf),1,fp) == 0) return 0;
    /* fwrite() returns 0 for a zero sized item, so skip empty payloads. */
    if (len && fwrite(s,len,1,fp) == 0) return 0;
    if (fwrite("\r\n",2,1,fp) == 0) return 0;
    return 1;
}
7828
/* Emit a double on 'fp' as a bulk item: $<count>\r\n<payload>\r\n
 * The payload is the value formatted with "%.17g".
 * Returns 1 on success, 0 on write error. */
static int fwriteBulkDouble(FILE *fp, double d) {
    char header[128], payload[128];
    size_t paylen;

    /* The payload buffer already carries the trailing CRLF, which is not
     * part of the advertised count: hence the -2. */
    snprintf(payload,sizeof(payload),"%.17g\r\n",d);
    paylen = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)paylen-2);

    if (fwrite(header,strlen(header),1,fp) == 0 ||
        fwrite(payload,paylen,1,fp) == 0)
    {
        return 0;
    }
    return 1;
}
7839
/* Emit a long on 'fp' as a bulk item: $<count>\r\n<payload>\r\n
 * The payload is the decimal representation of the value.
 * Returns 1 on success, 0 on write error. */
static int fwriteBulkLong(FILE *fp, long l) {
    char header[128], payload[128];
    size_t paylen;

    /* The payload buffer already carries the trailing CRLF, which is not
     * part of the advertised count: hence the -2. */
    snprintf(payload,sizeof(payload),"%ld\r\n",l);
    paylen = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)paylen-2);

    if (fwrite(header,strlen(header),1,fp) == 0 ||
        fwrite(payload,paylen,1,fp) == 0)
    {
        return 0;
    }
    return 1;
}
7850
7851 /* Write a sequence of commands able to fully rebuild the dataset into
7852 * "filename". Used both by REWRITEAOF and BGREWRITEAOF. */
7853 static int rewriteAppendOnlyFile(char *filename) {
7854 dictIterator *di = NULL;
7855 dictEntry *de;
7856 FILE *fp;
7857 char tmpfile[256];
7858 int j;
7859 time_t now = time(NULL);
7860
7861 /* Note that we have to use a different temp name here compared to the
7862 * one used by rewriteAppendOnlyFileBackground() function. */
7863 snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());
7864 fp = fopen(tmpfile,"w");
7865 if (!fp) {
7866 redisLog(REDIS_WARNING, "Failed rewriting the append only file: %s", strerror(errno));
7867 return REDIS_ERR;
7868 }
7869 for (j = 0; j < server.dbnum; j++) {
7870 char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n";
7871 redisDb *db = server.db+j;
7872 dict *d = db->dict;
7873 if (dictSize(d) == 0) continue;
7874 di = dictGetIterator(d);
7875 if (!di) {
7876 fclose(fp);
7877 return REDIS_ERR;
7878 }
7879
7880 /* SELECT the new DB */
7881 if (fwrite(selectcmd,sizeof(selectcmd)-1,1,fp) == 0) goto werr;
7882 if (fwriteBulkLong(fp,j) == 0) goto werr;
7883
7884 /* Iterate this DB writing every entry */
7885 while((de = dictNext(di)) != NULL) {
7886 robj *key, *o;
7887 time_t expiretime;
7888 int swapped;
7889
7890 key = dictGetEntryKey(de);
7891 /* If the value for this key is swapped, load a preview in memory.
7892 * We use a "swapped" flag to remember if we need to free the
7893 * value object instead to just increment the ref count anyway
7894 * in order to avoid copy-on-write of pages if we are forked() */
7895 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
7896 key->storage == REDIS_VM_SWAPPING) {
7897 o = dictGetEntryVal(de);
7898 swapped = 0;
7899 } else {
7900 o = vmPreviewObject(key);
7901 swapped = 1;
7902 }
7903 expiretime = getExpire(db,key);
7904
7905 /* Save the key and associated value */
7906 if (o->type == REDIS_STRING) {
7907 /* Emit a SET command */
7908 char cmd[]="*3\r\n$3\r\nSET\r\n";
7909 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7910 /* Key and value */
7911 if (fwriteBulkObject(fp,key) == 0) goto werr;
7912 if (fwriteBulkObject(fp,o) == 0) goto werr;
7913 } else if (o->type == REDIS_LIST) {
7914 /* Emit the RPUSHes needed to rebuild the list */
7915 list *list = o->ptr;
7916 listNode *ln;
7917 listIter li;
7918
7919 listRewind(list,&li);
7920 while((ln = listNext(&li))) {
7921 char cmd[]="*3\r\n$5\r\nRPUSH\r\n";
7922 robj *eleobj = listNodeValue(ln);
7923
7924 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7925 if (fwriteBulkObject(fp,key) == 0) goto werr;
7926 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
7927 }
7928 } else if (o->type == REDIS_SET) {
7929 /* Emit the SADDs needed to rebuild the set */
7930 dict *set = o->ptr;
7931 dictIterator *di = dictGetIterator(set);
7932 dictEntry *de;
7933
7934 while((de = dictNext(di)) != NULL) {
7935 char cmd[]="*3\r\n$4\r\nSADD\r\n";
7936 robj *eleobj = dictGetEntryKey(de);
7937
7938 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7939 if (fwriteBulkObject(fp,key) == 0) goto werr;
7940 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
7941 }
7942 dictReleaseIterator(di);
7943 } else if (o->type == REDIS_ZSET) {
7944 /* Emit the ZADDs needed to rebuild the sorted set */
7945 zset *zs = o->ptr;
7946 dictIterator *di = dictGetIterator(zs->dict);
7947 dictEntry *de;
7948
7949 while((de = dictNext(di)) != NULL) {
7950 char cmd[]="*4\r\n$4\r\nZADD\r\n";
7951 robj *eleobj = dictGetEntryKey(de);
7952 double *score = dictGetEntryVal(de);
7953
7954 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7955 if (fwriteBulkObject(fp,key) == 0) goto werr;
7956 if (fwriteBulkDouble(fp,*score) == 0) goto werr;
7957 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
7958 }
7959 dictReleaseIterator(di);
7960 } else if (o->type == REDIS_HASH) {
7961 char cmd[]="*4\r\n$4\r\nHSET\r\n";
7962
7963 /* Emit the HSETs needed to rebuild the hash */
7964 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
7965 unsigned char *p = zipmapRewind(o->ptr);
7966 unsigned char *field, *val;
7967 unsigned int flen, vlen;
7968
7969 while((p = zipmapNext(p,&field,&flen,&val,&vlen)) != NULL) {
7970 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7971 if (fwriteBulkObject(fp,key) == 0) goto werr;
7972 if (fwriteBulkString(fp,(char*)field,flen) == -1)
7973 return -1;
7974 if (fwriteBulkString(fp,(char*)val,vlen) == -1)
7975 return -1;
7976 }
7977 } else {
7978 dictIterator *di = dictGetIterator(o->ptr);
7979 dictEntry *de;
7980
7981 while((de = dictNext(di)) != NULL) {
7982 robj *field = dictGetEntryKey(de);
7983 robj *val = dictGetEntryVal(de);
7984
7985 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7986 if (fwriteBulkObject(fp,key) == 0) goto werr;
7987 if (fwriteBulkObject(fp,field) == -1) return -1;
7988 if (fwriteBulkObject(fp,val) == -1) return -1;
7989 }
7990 dictReleaseIterator(di);
7991 }
7992 } else {
7993 redisAssert(0);
7994 }
7995 /* Save the expire time */
7996 if (expiretime != -1) {
7997 char cmd[]="*3\r\n$8\r\nEXPIREAT\r\n";
7998 /* If this key is already expired skip it */
7999 if (expiretime < now) continue;
8000 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8001 if (fwriteBulkObject(fp,key) == 0) goto werr;
8002 if (fwriteBulkLong(fp,expiretime) == 0) goto werr;
8003 }
8004 if (swapped) decrRefCount(o);
8005 }
8006 dictReleaseIterator(di);
8007 }
8008
8009 /* Make sure data will not remain on the OS's output buffers */
8010 fflush(fp);
8011 fsync(fileno(fp));
8012 fclose(fp);
8013
8014 /* Use RENAME to make sure the DB file is changed atomically only
8015 * if the generate DB file is ok. */
8016 if (rename(tmpfile,filename) == -1) {
8017 redisLog(REDIS_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno));
8018 unlink(tmpfile);
8019 return REDIS_ERR;
8020 }
8021 redisLog(REDIS_NOTICE,"SYNC append only file rewrite performed");
8022 return REDIS_OK;
8023
8024 werr:
8025 fclose(fp);
8026 unlink(tmpfile);
8027 redisLog(REDIS_WARNING,"Write error writing append only file on disk: %s", strerror(errno));
8028 if (di) dictReleaseIterator(di);
8029 return REDIS_ERR;
8030 }
8031
8032 /* This is how rewriting of the append only file in background works:
8033 *
8034 * 1) The user calls BGREWRITEAOF
8035 * 2) Redis calls this function, that forks():
8036 * 2a) the child rewrite the append only file in a temp file.
8037 * 2b) the parent accumulates differences in server.bgrewritebuf.
8038 * 3) When the child finished '2a' exists.
8039 * 4) The parent will trap the exit code, if it's OK, will append the
8040 * data accumulated into server.bgrewritebuf into the temp file, and
8041 * finally will rename(2) the temp file in the actual file name.
8042 * The the new file is reopened as the new append only file. Profit!
8043 */
8044 static int rewriteAppendOnlyFileBackground(void) {
8045 pid_t childpid;
8046
8047 if (server.bgrewritechildpid != -1) return REDIS_ERR;
8048 if (server.vm_enabled) waitEmptyIOJobsQueue();
8049 if ((childpid = fork()) == 0) {
8050 /* Child */
8051 char tmpfile[256];
8052
8053 if (server.vm_enabled) vmReopenSwapFile();
8054 close(server.fd);
8055 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
8056 if (rewriteAppendOnlyFile(tmpfile) == REDIS_OK) {
8057 _exit(0);
8058 } else {
8059 _exit(1);
8060 }
8061 } else {
8062 /* Parent */
8063 if (childpid == -1) {
8064 redisLog(REDIS_WARNING,
8065 "Can't rewrite append only file in background: fork: %s",
8066 strerror(errno));
8067 return REDIS_ERR;
8068 }
8069 redisLog(REDIS_NOTICE,
8070 "Background append only file rewriting started by pid %d",childpid);
8071 server.bgrewritechildpid = childpid;
8072 /* We set appendseldb to -1 in order to force the next call to the
8073 * feedAppendOnlyFile() to issue a SELECT command, so the differences
8074 * accumulated by the parent into server.bgrewritebuf will start
8075 * with a SELECT statement and it will be safe to merge. */
8076 server.appendseldb = -1;
8077 return REDIS_OK;
8078 }
8079 return REDIS_OK; /* unreached */
8080 }
8081
8082 static void bgrewriteaofCommand(redisClient *c) {
8083 if (server.bgrewritechildpid != -1) {
8084 addReplySds(c,sdsnew("-ERR background append only file rewriting already in progress\r\n"));
8085 return;
8086 }
8087 if (rewriteAppendOnlyFileBackground() == REDIS_OK) {
8088 char *status = "+Background append only file rewriting started\r\n";
8089 addReplySds(c,sdsnew(status));
8090 } else {
8091 addReply(c,shared.err);
8092 }
8093 }
8094
/* Remove the temp file a background rewrite child with pid 'childpid'
 * writes to (same naming scheme used by
 * rewriteAppendOnlyFileBackground()). The unlink(2) result is ignored. */
static void aofRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-rewriteaof-bg-%d.aof", (int) childpid);
    unlink(path);
}
8101
8102 /* Virtual Memory is composed mainly of two subsystems:
8103 * - Blocking Virtual Memory
8104 * - Threaded Virtual Memory I/O
8105 * The two parts are not fully decoupled, but functions are split among two
8106 * different sections of the source code (delimited by comments) in order to
8107 * make more clear what functionality is about the blocking VM and what about
8108 * the threaded (not blocking) VM.
8109 *
8110 * Redis VM design:
8111 *
8112 * Redis VM is a blocking VM (one that blocks reading swapped values from
8113 * disk into memory when a value swapped out is needed in memory) that is made
8114 * unblocking by trying to examine the command argument vector in order to
8115 * load in background values that will likely be needed in order to exec
8116 * the command. The command is executed only once all the relevant keys
8117 * are loaded into memory.
8118 *
8119 * This basically is almost as simple as a blocking VM, but almost as parallel
8120 * as a fully non-blocking VM.
8121 */
8122
8123 /* =================== Virtual Memory - Blocking Side ====================== */
8124
8125 /* substitute the first occurrence of '%p' with the process pid in the
8126 * swap file name. */
8127 static void expandVmSwapFilename(void) {
8128 char *p = strstr(server.vm_swap_file,"%p");
8129 sds new;
8130
8131 if (!p) return;
8132 new = sdsempty();
8133 *p = '\0';
8134 new = sdscat(new,server.vm_swap_file);
8135 new = sdscatprintf(new,"%ld",(long) getpid());
8136 new = sdscat(new,p+2);
8137 zfree(server.vm_swap_file);
8138 server.vm_swap_file = new;
8139 }
8140
8141 static void vmInit(void) {
8142 off_t totsize;
8143 int pipefds[2];
8144 size_t stacksize;
8145
8146 if (server.vm_max_threads != 0)
8147 zmalloc_enable_thread_safeness(); /* we need thread safe zmalloc() */
8148
8149 expandVmSwapFilename();
8150 redisLog(REDIS_NOTICE,"Using '%s' as swap file",server.vm_swap_file);
8151 if ((server.vm_fp = fopen(server.vm_swap_file,"r+b")) == NULL) {
8152 server.vm_fp = fopen(server.vm_swap_file,"w+b");
8153 }
8154 if (server.vm_fp == NULL) {
8155 redisLog(REDIS_WARNING,
8156 "Impossible to open the swap file: %s. Exiting.",
8157 strerror(errno));
8158 exit(1);
8159 }
8160 server.vm_fd = fileno(server.vm_fp);
8161 server.vm_next_page = 0;
8162 server.vm_near_pages = 0;
8163 server.vm_stats_used_pages = 0;
8164 server.vm_stats_swapped_objects = 0;
8165 server.vm_stats_swapouts = 0;
8166 server.vm_stats_swapins = 0;
8167 totsize = server.vm_pages*server.vm_page_size;
8168 redisLog(REDIS_NOTICE,"Allocating %lld bytes of swap file",totsize);
8169 if (ftruncate(server.vm_fd,totsize) == -1) {
8170 redisLog(REDIS_WARNING,"Can't ftruncate swap file: %s. Exiting.",
8171 strerror(errno));
8172 exit(1);
8173 } else {
8174 redisLog(REDIS_NOTICE,"Swap file allocated with success");
8175 }
8176 server.vm_bitmap = zmalloc((server.vm_pages+7)/8);
8177 redisLog(REDIS_VERBOSE,"Allocated %lld bytes page table for %lld pages",
8178 (long long) (server.vm_pages+7)/8, server.vm_pages);
8179 memset(server.vm_bitmap,0,(server.vm_pages+7)/8);
8180
8181 /* Initialize threaded I/O (used by Virtual Memory) */
8182 server.io_newjobs = listCreate();
8183 server.io_processing = listCreate();
8184 server.io_processed = listCreate();
8185 server.io_ready_clients = listCreate();
8186 pthread_mutex_init(&server.io_mutex,NULL);
8187 pthread_mutex_init(&server.obj_freelist_mutex,NULL);
8188 pthread_mutex_init(&server.io_swapfile_mutex,NULL);
8189 server.io_active_threads = 0;
8190 if (pipe(pipefds) == -1) {
8191 redisLog(REDIS_WARNING,"Unable to intialized VM: pipe(2): %s. Exiting."
8192 ,strerror(errno));
8193 exit(1);
8194 }
8195 server.io_ready_pipe_read = pipefds[0];
8196 server.io_ready_pipe_write = pipefds[1];
8197 redisAssert(anetNonBlock(NULL,server.io_ready_pipe_read) != ANET_ERR);
8198 /* LZF requires a lot of stack */
8199 pthread_attr_init(&server.io_threads_attr);
8200 pthread_attr_getstacksize(&server.io_threads_attr, &stacksize);
8201 while (stacksize < REDIS_THREAD_STACK_SIZE) stacksize *= 2;
8202 pthread_attr_setstacksize(&server.io_threads_attr, stacksize);
8203 /* Listen for events in the threaded I/O pipe */
8204 if (aeCreateFileEvent(server.el, server.io_ready_pipe_read, AE_READABLE,
8205 vmThreadedIOCompletedJob, NULL) == AE_ERR)
8206 oom("creating file event");
8207 }
8208
8209 /* Mark the page as used */
8210 static void vmMarkPageUsed(off_t page) {
8211 off_t byte = page/8;
8212 int bit = page&7;
8213 redisAssert(vmFreePage(page) == 1);
8214 server.vm_bitmap[byte] |= 1<<bit;
8215 }
8216
8217 /* Mark N contiguous pages as used, with 'page' being the first. */
8218 static void vmMarkPagesUsed(off_t page, off_t count) {
8219 off_t j;
8220
8221 for (j = 0; j < count; j++)
8222 vmMarkPageUsed(page+j);
8223 server.vm_stats_used_pages += count;
8224 redisLog(REDIS_DEBUG,"Mark USED pages: %lld pages at %lld\n",
8225 (long long)count, (long long)page);
8226 }
8227
8228 /* Mark the page as free */
8229 static void vmMarkPageFree(off_t page) {
8230 off_t byte = page/8;
8231 int bit = page&7;
8232 redisAssert(vmFreePage(page) == 0);
8233 server.vm_bitmap[byte] &= ~(1<<bit);
8234 }
8235
8236 /* Mark N contiguous pages as free, with 'page' being the first. */
8237 static void vmMarkPagesFree(off_t page, off_t count) {
8238 off_t j;
8239
8240 for (j = 0; j < count; j++)
8241 vmMarkPageFree(page+j);
8242 server.vm_stats_used_pages -= count;
8243 redisLog(REDIS_DEBUG,"Mark FREE pages: %lld pages at %lld\n",
8244 (long long)count, (long long)page);
8245 }
8246
8247 /* Test if the page is free */
8248 static int vmFreePage(off_t page) {
8249 off_t byte = page/8;
8250 int bit = page&7;
8251 return (server.vm_bitmap[byte] & (1<<bit)) == 0;
8252 }
8253
/* Find N contiguous free pages storing the first page of the cluster in *first.
 * Returns REDIS_OK if it was able to find N contiguous pages, otherwise
 * REDIS_ERR is returned.
 *
 * The pages are only located, not marked as used: the bitmap is read via
 * vmFreePage() and never modified here. On success server.vm_next_page is
 * moved just after the last page of the cluster found.
 *
 * This function uses a simple algorithm: we try to allocate
 * REDIS_VM_MAX_NEAR_PAGES sequentially, when we reach this limit we start
 * again from the start of the swap file searching for free spaces.
 *
 * If it looks pretty clear that there are no free pages near our offset
 * we try to find less populated places doing a forward jump of
 * REDIS_VM_MAX_RANDOM_JUMP, then we start scanning again a few pages
 * without hurry, and then we jump again and so forth...
 *
 * This function can be improved using a free list to avoid to guess
 * too much, since we could collect data about freed pages.
 *
 * note: I implemented this function just after watching an episode of
 * Battlestar Galactica, where the hybrid was continuing to say "JUMP!"
 */
static int vmFindContiguousPages(off_t *first, off_t n) {
    /* base:       page where this search starts
     * offset:     distance from 'base' of the page being examined
     * since_jump: pages examined since the last random jump
     * numfree:    length of the current run of contiguous free pages */
    off_t base, offset = 0, since_jump = 0, numfree = 0;

    if (server.vm_near_pages == REDIS_VM_MAX_NEAR_PAGES) {
        server.vm_near_pages = 0;
        server.vm_next_page = 0;
    }
    server.vm_near_pages++; /* Yet another try for pages near to the old ones */
    base = server.vm_next_page;

    /* 'offset' only grows, so the loop gives up once the distance covered
     * from 'base' reaches the number of pages in the swap file. */
    while(offset < server.vm_pages) {
        off_t this = base+offset;

        /* If we overflow, restart from page zero */
        if (this >= server.vm_pages) {
            this -= server.vm_pages;
            if (this == 0) {
                /* Just overflowed, what we found on tail is no longer
                 * interesting, as it's no longer contiguous. */
                numfree = 0;
            }
        }
        if (vmFreePage(this)) {
            /* This is a free page */
            numfree++;
            /* Already got N free pages? Return to the caller, with success */
            if (numfree == n) {
                /* 'this' is the last page of the run, so the run starts
                 * n-1 pages before it. */
                *first = this-(n-1);
                server.vm_next_page = this+1;
                redisLog(REDIS_DEBUG, "FOUND CONTIGUOUS PAGES: %lld pages at %lld\n", (long long) n, (long long) *first);
                return REDIS_OK;
            }
        } else {
            /* The current one is not a free page */
            numfree = 0;
        }

        /* Fast-forward if the current page is not free and we already
         * searched enough near this place. */
        since_jump++;
        if (!numfree && since_jump >= REDIS_VM_MAX_RANDOM_JUMP/4) {
            offset += random() % REDIS_VM_MAX_RANDOM_JUMP;
            since_jump = 0;
            /* Note that even if we rewind after the jump, we don't need
             * to make sure numfree is set to zero as we only jump *if* it
             * is set to zero. */
        } else {
            /* Otherwise just check the next page */
            offset++;
        }
    }
    return REDIS_ERR;
}
8326
8327 /* Write the specified object at the specified page of the swap file */
8328 static int vmWriteObjectOnSwap(robj *o, off_t page) {
8329 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
8330 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
8331 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8332 redisLog(REDIS_WARNING,
8333 "Critical VM problem in vmWriteObjectOnSwap(): can't seek: %s",
8334 strerror(errno));
8335 return REDIS_ERR;
8336 }
8337 rdbSaveObject(server.vm_fp,o);
8338 fflush(server.vm_fp);
8339 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8340 return REDIS_OK;
8341 }
8342
8343 /* Swap the 'val' object relative to 'key' into disk. Store all the information
8344 * needed to later retrieve the object into the key object.
8345 * If we can't find enough contiguous empty pages to swap the object on disk
8346 * REDIS_ERR is returned. */
8347 static int vmSwapObjectBlocking(robj *key, robj *val) {
8348 off_t pages = rdbSavedObjectPages(val,NULL);
8349 off_t page;
8350
8351 assert(key->storage == REDIS_VM_MEMORY);
8352 assert(key->refcount == 1);
8353 if (vmFindContiguousPages(&page,pages) == REDIS_ERR) return REDIS_ERR;
8354 if (vmWriteObjectOnSwap(val,page) == REDIS_ERR) return REDIS_ERR;
8355 key->vm.page = page;
8356 key->vm.usedpages = pages;
8357 key->storage = REDIS_VM_SWAPPED;
8358 key->vtype = val->type;
8359 decrRefCount(val); /* Deallocate the object from memory. */
8360 vmMarkPagesUsed(page,pages);
8361 redisLog(REDIS_DEBUG,"VM: object %s swapped out at %lld (%lld pages)",
8362 (unsigned char*) key->ptr,
8363 (unsigned long long) page, (unsigned long long) pages);
8364 server.vm_stats_swapped_objects++;
8365 server.vm_stats_swapouts++;
8366 return REDIS_OK;
8367 }
8368
8369 static robj *vmReadObjectFromSwap(off_t page, int type) {
8370 robj *o;
8371
8372 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
8373 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
8374 redisLog(REDIS_WARNING,
8375 "Unrecoverable VM problem in vmReadObjectFromSwap(): can't seek: %s",
8376 strerror(errno));
8377 _exit(1);
8378 }
8379 o = rdbLoadObject(type,server.vm_fp);
8380 if (o == NULL) {
8381 redisLog(REDIS_WARNING, "Unrecoverable VM problem in vmReadObjectFromSwap(): can't load object from swap file: %s", strerror(errno));
8382 _exit(1);
8383 }
8384 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8385 return o;
8386 }
8387
8388 /* Load the value object relative to the 'key' object from swap to memory.
8389 * The newly allocated object is returned.
8390 *
8391 * If preview is true the unserialized object is returned to the caller but
8392 * no changes are made to the key object, nor the pages are marked as freed */
8393 static robj *vmGenericLoadObject(robj *key, int preview) {
8394 robj *val;
8395
8396 redisAssert(key->storage == REDIS_VM_SWAPPED || key->storage == REDIS_VM_LOADING);
8397 val = vmReadObjectFromSwap(key->vm.page,key->vtype);
8398 if (!preview) {
8399 key->storage = REDIS_VM_MEMORY;
8400 key->vm.atime = server.unixtime;
8401 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
8402 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk",
8403 (unsigned char*) key->ptr);
8404 server.vm_stats_swapped_objects--;
8405 } else {
8406 redisLog(REDIS_DEBUG, "VM: object %s previewed from disk",
8407 (unsigned char*) key->ptr);
8408 }
8409 server.vm_stats_swapins++;
8410 return val;
8411 }
8412
8413 /* Plain object loading, from swap to memory */
8414 static robj *vmLoadObject(robj *key) {
8415 /* If we are loading the object in background, stop it, we
8416 * need to load this object synchronously ASAP. */
8417 if (key->storage == REDIS_VM_LOADING)
8418 vmCancelThreadedIOJob(key);
8419 return vmGenericLoadObject(key,0);
8420 }
8421
8422 /* Just load the value on disk, without to modify the key.
8423 * This is useful when we want to perform some operation on the value
8424 * without to really bring it from swap to memory, like while saving the
8425 * dataset or rewriting the append only log. */
8426 static robj *vmPreviewObject(robj *key) {
8427 return vmGenericLoadObject(key,1);
8428 }
8429
8430 /* How a good candidate is this object for swapping?
8431 * The better candidate it is, the greater the returned value.
8432 *
8433 * Currently we try to perform a fast estimation of the object size in
8434 * memory, and combine it with aging informations.
8435 *
8436 * Basically swappability = idle-time * log(estimated size)
8437 *
8438 * Bigger objects are preferred over smaller objects, but not
8439 * proportionally, this is why we use the logarithm. This algorithm is
8440 * just a first try and will probably be tuned later. */
8441 static double computeObjectSwappability(robj *o) {
8442 time_t age = server.unixtime - o->vm.atime;
8443 long asize = 0;
8444 list *l;
8445 dict *d;
8446 struct dictEntry *de;
8447 int z;
8448
8449 if (age <= 0) return 0;
8450 switch(o->type) {
8451 case REDIS_STRING:
8452 if (o->encoding != REDIS_ENCODING_RAW) {
8453 asize = sizeof(*o);
8454 } else {
8455 asize = sdslen(o->ptr)+sizeof(*o)+sizeof(long)*2;
8456 }
8457 break;
8458 case REDIS_LIST:
8459 l = o->ptr;
8460 listNode *ln = listFirst(l);
8461
8462 asize = sizeof(list);
8463 if (ln) {
8464 robj *ele = ln->value;
8465 long elesize;
8466
8467 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8468 (sizeof(*o)+sdslen(ele->ptr)) :
8469 sizeof(*o);
8470 asize += (sizeof(listNode)+elesize)*listLength(l);
8471 }
8472 break;
8473 case REDIS_SET:
8474 case REDIS_ZSET:
8475 z = (o->type == REDIS_ZSET);
8476 d = z ? ((zset*)o->ptr)->dict : o->ptr;
8477
8478 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
8479 if (z) asize += sizeof(zset)-sizeof(dict);
8480 if (dictSize(d)) {
8481 long elesize;
8482 robj *ele;
8483
8484 de = dictGetRandomKey(d);
8485 ele = dictGetEntryKey(de);
8486 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8487 (sizeof(*o)+sdslen(ele->ptr)) :
8488 sizeof(*o);
8489 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
8490 if (z) asize += sizeof(zskiplistNode)*dictSize(d);
8491 }
8492 break;
8493 case REDIS_HASH:
8494 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
8495 unsigned char *p = zipmapRewind((unsigned char*)o->ptr);
8496 unsigned int len = zipmapLen((unsigned char*)o->ptr);
8497 unsigned int klen, vlen;
8498 unsigned char *key, *val;
8499
8500 if ((p = zipmapNext(p,&key,&klen,&val,&vlen)) == NULL) {
8501 klen = 0;
8502 vlen = 0;
8503 }
8504 asize = len*(klen+vlen+3);
8505 } else if (o->encoding == REDIS_ENCODING_HT) {
8506 d = o->ptr;
8507 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
8508 if (dictSize(d)) {
8509 long elesize;
8510 robj *ele;
8511
8512 de = dictGetRandomKey(d);
8513 ele = dictGetEntryKey(de);
8514 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8515 (sizeof(*o)+sdslen(ele->ptr)) :
8516 sizeof(*o);
8517 ele = dictGetEntryVal(de);
8518 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8519 (sizeof(*o)+sdslen(ele->ptr)) :
8520 sizeof(*o);
8521 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
8522 }
8523 }
8524 break;
8525 }
8526 return (double)age*log(1+asize);
8527 }
8528
/* Try to swap an object that's a good candidate for swapping.
 * Returns REDIS_OK if the object was swapped, REDIS_ERR if it's not possible
 * to swap any object at all.
 *
 * The candidate is selected by sampling random keys from every non empty
 * DB and picking the one with the best computeObjectSwappability() score.
 *
 * If 'usethreads' is true, Redis will try to swap the object in background
 * using I/O threads (REDIS_OK is returned as soon as the swap is handed
 * over to vmSwapObjectThreaded()), otherwise the object is swapped
 * synchronously. */
static int vmSwapOneObject(int usethreads) {
    int j, i;
    struct dictEntry *best = NULL;
    double best_swappability = 0;
    redisDb *best_db = NULL;
    robj *key, *val;

    for (j = 0; j < server.dbnum; j++) {
        redisDb *db = server.db+j;
        /* Why maxtries is set to 100?
         * Because this way (usually) we'll find 1 object even if just 1% - 2%
         * are swappable objects */
        int maxtries = 100;

        if (dictSize(db->dict) == 0) continue;
        /* Up to five swappable keys are sampled per DB. */
        for (i = 0; i < 5; i++) {
            dictEntry *de;
            double swappability;

            if (maxtries) maxtries--;
            de = dictGetRandomKey(db->dict);
            key = dictGetEntryKey(de);
            val = dictGetEntryVal(de);
            /* Only swap objects that are currently in memory.
             *
             * Also don't swap shared objects if threaded VM is on, as we
             * try to ensure that the main thread does not touch the
             * object while the I/O thread is using it, but we can't
             * control other keys without adding additional mutex. */
            if (key->storage != REDIS_VM_MEMORY ||
                (server.vm_max_threads != 0 && val->refcount != 1)) {
                /* Once maxtries drops to zero rejected keys are counted
                 * too, so the loop is guaranteed to end. */
                if (maxtries) i--; /* don't count this try */
                continue;
            }
            swappability = computeObjectSwappability(val);
            if (!best || swappability > best_swappability) {
                best = de;
                best_swappability = swappability;
                best_db = db;
            }
        }
    }
    if (best == NULL) return REDIS_ERR;
    key = dictGetEntryKey(best);
    val = dictGetEntryVal(best);

    redisLog(REDIS_DEBUG,"Key with best swappability: %s, %f",
        key->ptr, best_swappability);

    /* Unshare the key if needed: the swap information is stored into the
     * key object, so the dictionary entry gets its own private copy. */
    if (key->refcount > 1) {
        robj *newkey = dupStringObject(key);
        decrRefCount(key);
        key = dictGetEntryKey(best) = newkey;
    }
    /* Swap it */
    if (usethreads) {
        vmSwapObjectThreaded(key,val,best_db);
        return REDIS_OK;
    } else {
        if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
            /* The blocking swap released the value: the entry must not
             * reference it anymore. */
            dictGetEntryVal(best) = NULL;
            return REDIS_OK;
        } else {
            return REDIS_ERR;
        }
    }
}
8603
/* Swap out one object in the calling thread.
 * Returns the result of vmSwapOneObject() called with threads disabled. */
static int vmSwapOneObjectBlocking(void) {
    return vmSwapOneObject(0);
}
8607
/* Ask the I/O threads to swap out one object in background.
 * Returns the result of vmSwapOneObject() called with threads enabled. */
static int vmSwapOneObjectThreaded(void) {
    return vmSwapOneObject(1);
}
8611
8612 /* Return true if it's safe to swap out objects in a given moment.
8613 * Basically we don't want to swap objects out while there is a BGSAVE
8614 * or a BGAEOREWRITE running in backgroud. */
8615 static int vmCanSwapOut(void) {
8616 return (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1);
8617 }
8618
8619 /* Delete a key if swapped. Returns 1 if the key was found, was swapped
8620 * and was deleted. Otherwise 0 is returned. */
8621 static int deleteIfSwapped(redisDb *db, robj *key) {
8622 dictEntry *de;
8623 robj *foundkey;
8624
8625 if ((de = dictFind(db->dict,key)) == NULL) return 0;
8626 foundkey = dictGetEntryKey(de);
8627 if (foundkey->storage == REDIS_VM_MEMORY) return 0;
8628 deleteKey(db,key);
8629 return 1;
8630 }
8631
8632 /* =================== Virtual Memory - Threaded I/O ======================= */
8633
8634 static void freeIOJob(iojob *j) {
8635 if ((j->type == REDIS_IOJOB_PREPARE_SWAP ||
8636 j->type == REDIS_IOJOB_DO_SWAP ||
8637 j->type == REDIS_IOJOB_LOAD) && j->val != NULL)
8638 decrRefCount(j->val);
8639 decrRefCount(j->key);
8640 zfree(j);
8641 }
8642
8643 /* Every time a thread finished a Job, it writes a byte into the write side
8644 * of an unix pipe in order to "awake" the main thread, and this function
8645 * is called. */
8646 static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata,
8647 int mask)
8648 {
8649 char buf[1];
8650 int retval, processed = 0, toprocess = -1, trytoswap = 1;
8651 REDIS_NOTUSED(el);
8652 REDIS_NOTUSED(mask);
8653 REDIS_NOTUSED(privdata);
8654
8655 /* For every byte we read in the read side of the pipe, there is one
8656 * I/O job completed to process. */
8657 while((retval = read(fd,buf,1)) == 1) {
8658 iojob *j;
8659 listNode *ln;
8660 robj *key;
8661 struct dictEntry *de;
8662
8663 redisLog(REDIS_DEBUG,"Processing I/O completed job");
8664
8665 /* Get the processed element (the oldest one) */
8666 lockThreadedIO();
8667 assert(listLength(server.io_processed) != 0);
8668 if (toprocess == -1) {
8669 toprocess = (listLength(server.io_processed)*REDIS_MAX_COMPLETED_JOBS_PROCESSED)/100;
8670 if (toprocess <= 0) toprocess = 1;
8671 }
8672 ln = listFirst(server.io_processed);
8673 j = ln->value;
8674 listDelNode(server.io_processed,ln);
8675 unlockThreadedIO();
8676 /* If this job is marked as canceled, just ignore it */
8677 if (j->canceled) {
8678 freeIOJob(j);
8679 continue;
8680 }
8681 /* Post process it in the main thread, as there are things we
8682 * can do just here to avoid race conditions and/or invasive locks */
8683 redisLog(REDIS_DEBUG,"Job %p type: %d, key at %p (%s) refcount: %d\n", (void*) j, j->type, (void*)j->key, (char*)j->key->ptr, j->key->refcount);
8684 de = dictFind(j->db->dict,j->key);
8685 assert(de != NULL);
8686 key = dictGetEntryKey(de);
8687 if (j->type == REDIS_IOJOB_LOAD) {
8688 redisDb *db;
8689
8690 /* Key loaded, bring it at home */
8691 key->storage = REDIS_VM_MEMORY;
8692 key->vm.atime = server.unixtime;
8693 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
8694 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk (threaded)",
8695 (unsigned char*) key->ptr);
8696 server.vm_stats_swapped_objects--;
8697 server.vm_stats_swapins++;
8698 dictGetEntryVal(de) = j->val;
8699 incrRefCount(j->val);
8700 db = j->db;
8701 freeIOJob(j);
8702 /* Handle clients waiting for this key to be loaded. */
8703 handleClientsBlockedOnSwappedKey(db,key);
8704 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
8705 /* Now we know the amount of pages required to swap this object.
8706 * Let's find some space for it, and queue this task again
8707 * rebranded as REDIS_IOJOB_DO_SWAP. */
8708 if (!vmCanSwapOut() ||
8709 vmFindContiguousPages(&j->page,j->pages) == REDIS_ERR)
8710 {
8711 /* Ooops... no space or we can't swap as there is
8712 * a fork()ed Redis trying to save stuff on disk. */
8713 freeIOJob(j);
8714 key->storage = REDIS_VM_MEMORY; /* undo operation */
8715 } else {
8716 /* Note that we need to mark this pages as used now,
8717 * if the job will be canceled, we'll mark them as freed
8718 * again. */
8719 vmMarkPagesUsed(j->page,j->pages);
8720 j->type = REDIS_IOJOB_DO_SWAP;
8721 lockThreadedIO();
8722 queueIOJob(j);
8723 unlockThreadedIO();
8724 }
8725 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
8726 robj *val;
8727
8728 /* Key swapped. We can finally free some memory. */
8729 if (key->storage != REDIS_VM_SWAPPING) {
8730 printf("key->storage: %d\n",key->storage);
8731 printf("key->name: %s\n",(char*)key->ptr);
8732 printf("key->refcount: %d\n",key->refcount);
8733 printf("val: %p\n",(void*)j->val);
8734 printf("val->type: %d\n",j->val->type);
8735 printf("val->ptr: %s\n",(char*)j->val->ptr);
8736 }
8737 redisAssert(key->storage == REDIS_VM_SWAPPING);
8738 val = dictGetEntryVal(de);
8739 key->vm.page = j->page;
8740 key->vm.usedpages = j->pages;
8741 key->storage = REDIS_VM_SWAPPED;
8742 key->vtype = j->val->type;
8743 decrRefCount(val); /* Deallocate the object from memory. */
8744 dictGetEntryVal(de) = NULL;
8745 redisLog(REDIS_DEBUG,
8746 "VM: object %s swapped out at %lld (%lld pages) (threaded)",
8747 (unsigned char*) key->ptr,
8748 (unsigned long long) j->page, (unsigned long long) j->pages);
8749 server.vm_stats_swapped_objects++;
8750 server.vm_stats_swapouts++;
8751 freeIOJob(j);
8752 /* Put a few more swap requests in queue if we are still
8753 * out of memory */
8754 if (trytoswap && vmCanSwapOut() &&
8755 zmalloc_used_memory() > server.vm_max_memory)
8756 {
8757 int more = 1;
8758 while(more) {
8759 lockThreadedIO();
8760 more = listLength(server.io_newjobs) <
8761 (unsigned) server.vm_max_threads;
8762 unlockThreadedIO();
8763 /* Don't waste CPU time if swappable objects are rare. */
8764 if (vmSwapOneObjectThreaded() == REDIS_ERR) {
8765 trytoswap = 0;
8766 break;
8767 }
8768 }
8769 }
8770 }
8771 processed++;
8772 if (processed == toprocess) return;
8773 }
8774 if (retval < 0 && errno != EAGAIN) {
8775 redisLog(REDIS_WARNING,
8776 "WARNING: read(2) error in vmThreadedIOCompletedJob() %s",
8777 strerror(errno));
8778 }
8779 }
8780
8781 static void lockThreadedIO(void) {
8782 pthread_mutex_lock(&server.io_mutex);
8783 }
8784
8785 static void unlockThreadedIO(void) {
8786 pthread_mutex_unlock(&server.io_mutex);
8787 }
8788
8789 /* Remove the specified object from the threaded I/O queue if still not
8790 * processed, otherwise make sure to flag it as canceled. */
8791 static void vmCancelThreadedIOJob(robj *o) {
8792 list *lists[3] = {
8793 server.io_newjobs, /* 0 */
8794 server.io_processing, /* 1 */
8795 server.io_processed /* 2 */
8796 };
8797 int i;
8798
8799 assert(o->storage == REDIS_VM_LOADING || o->storage == REDIS_VM_SWAPPING);
8800 again:
8801 lockThreadedIO();
8802 /* Search for a matching key in one of the queues */
8803 for (i = 0; i < 3; i++) {
8804 listNode *ln;
8805 listIter li;
8806
8807 listRewind(lists[i],&li);
8808 while ((ln = listNext(&li)) != NULL) {
8809 iojob *job = ln->value;
8810
8811 if (job->canceled) continue; /* Skip this, already canceled. */
8812 if (compareStringObjects(job->key,o) == 0) {
8813 redisLog(REDIS_DEBUG,"*** CANCELED %p (%s) (type %d) (LIST ID %d)\n",
8814 (void*)job, (char*)o->ptr, job->type, i);
8815 /* Mark the pages as free since the swap didn't happened
8816 * or happened but is now discarded. */
8817 if (i != 1 && job->type == REDIS_IOJOB_DO_SWAP)
8818 vmMarkPagesFree(job->page,job->pages);
8819 /* Cancel the job. It depends on the list the job is
8820 * living in. */
8821 switch(i) {
8822 case 0: /* io_newjobs */
8823 /* If the job was yet not processed the best thing to do
8824 * is to remove it from the queue at all */
8825 freeIOJob(job);
8826 listDelNode(lists[i],ln);
8827 break;
8828 case 1: /* io_processing */
8829 /* Oh Shi- the thread is messing with the Job:
8830 *
8831 * Probably it's accessing the object if this is a
8832 * PREPARE_SWAP or DO_SWAP job.
8833 * If it's a LOAD job it may be reading from disk and
8834 * if we don't wait for the job to terminate before to
8835 * cancel it, maybe in a few microseconds data can be
8836 * corrupted in this pages. So the short story is:
8837 *
8838 * Better to wait for the job to move into the
8839 * next queue (processed)... */
8840
8841 /* We try again and again until the job is completed. */
8842 unlockThreadedIO();
8843 /* But let's wait some time for the I/O thread
8844 * to finish with this job. After all this condition
8845 * should be very rare. */
8846 usleep(1);
8847 goto again;
8848 case 2: /* io_processed */
8849 /* The job was already processed, that's easy...
8850 * just mark it as canceled so that we'll ignore it
8851 * when processing completed jobs. */
8852 job->canceled = 1;
8853 break;
8854 }
8855 /* Finally we have to adjust the storage type of the object
8856 * in order to "UNDO" the operaiton. */
8857 if (o->storage == REDIS_VM_LOADING)
8858 o->storage = REDIS_VM_SWAPPED;
8859 else if (o->storage == REDIS_VM_SWAPPING)
8860 o->storage = REDIS_VM_MEMORY;
8861 unlockThreadedIO();
8862 return;
8863 }
8864 }
8865 }
8866 unlockThreadedIO();
8867 assert(1 != 1); /* We should never reach this */
8868 }
8869
8870 static void *IOThreadEntryPoint(void *arg) {
8871 iojob *j;
8872 listNode *ln;
8873 REDIS_NOTUSED(arg);
8874
8875 pthread_detach(pthread_self());
8876 while(1) {
8877 /* Get a new job to process */
8878 lockThreadedIO();
8879 if (listLength(server.io_newjobs) == 0) {
8880 /* No new jobs in queue, exit. */
8881 redisLog(REDIS_DEBUG,"Thread %ld exiting, nothing to do",
8882 (long) pthread_self());
8883 server.io_active_threads--;
8884 unlockThreadedIO();
8885 return NULL;
8886 }
8887 ln = listFirst(server.io_newjobs);
8888 j = ln->value;
8889 listDelNode(server.io_newjobs,ln);
8890 /* Add the job in the processing queue */
8891 j->thread = pthread_self();
8892 listAddNodeTail(server.io_processing,j);
8893 ln = listLast(server.io_processing); /* We use ln later to remove it */
8894 unlockThreadedIO();
8895 redisLog(REDIS_DEBUG,"Thread %ld got a new job (type %d): %p about key '%s'",
8896 (long) pthread_self(), j->type, (void*)j, (char*)j->key->ptr);
8897
8898 /* Process the Job */
8899 if (j->type == REDIS_IOJOB_LOAD) {
8900 j->val = vmReadObjectFromSwap(j->page,j->key->vtype);
8901 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
8902 FILE *fp = fopen("/dev/null","w+");
8903 j->pages = rdbSavedObjectPages(j->val,fp);
8904 fclose(fp);
8905 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
8906 if (vmWriteObjectOnSwap(j->val,j->page) == REDIS_ERR)
8907 j->canceled = 1;
8908 }
8909
8910 /* Done: insert the job into the processed queue */
8911 redisLog(REDIS_DEBUG,"Thread %ld completed the job: %p (key %s)",
8912 (long) pthread_self(), (void*)j, (char*)j->key->ptr);
8913 lockThreadedIO();
8914 listDelNode(server.io_processing,ln);
8915 listAddNodeTail(server.io_processed,j);
8916 unlockThreadedIO();
8917
8918 /* Signal the main thread there is new stuff to process */
8919 assert(write(server.io_ready_pipe_write,"x",1) == 1);
8920 }
8921 return NULL; /* never reached */
8922 }
8923
8924 static void spawnIOThread(void) {
8925 pthread_t thread;
8926 sigset_t mask, omask;
8927 int err;
8928
8929 sigemptyset(&mask);
8930 sigaddset(&mask,SIGCHLD);
8931 sigaddset(&mask,SIGHUP);
8932 sigaddset(&mask,SIGPIPE);
8933 pthread_sigmask(SIG_SETMASK, &mask, &omask);
8934 while ((err = pthread_create(&thread,&server.io_threads_attr,IOThreadEntryPoint,NULL)) != 0) {
8935 redisLog(REDIS_WARNING,"Unable to spawn an I/O thread: %s",
8936 strerror(err));
8937 usleep(1000000);
8938 }
8939 pthread_sigmask(SIG_SETMASK, &omask, NULL);
8940 server.io_active_threads++;
8941 }
8942
8943 /* We need to wait for the last thread to exit before we are able to
8944 * fork() in order to BGSAVE or BGREWRITEAOF. */
8945 static void waitEmptyIOJobsQueue(void) {
8946 while(1) {
8947 int io_processed_len;
8948
8949 lockThreadedIO();
8950 if (listLength(server.io_newjobs) == 0 &&
8951 listLength(server.io_processing) == 0 &&
8952 server.io_active_threads == 0)
8953 {
8954 unlockThreadedIO();
8955 return;
8956 }
8957 /* While waiting for empty jobs queue condition we post-process some
8958 * finshed job, as I/O threads may be hanging trying to write against
8959 * the io_ready_pipe_write FD but there are so much pending jobs that
8960 * it's blocking. */
8961 io_processed_len = listLength(server.io_processed);
8962 unlockThreadedIO();
8963 if (io_processed_len) {
8964 vmThreadedIOCompletedJob(NULL,server.io_ready_pipe_read,NULL,0);
8965 usleep(1000); /* 1 millisecond */
8966 } else {
8967 usleep(10000); /* 10 milliseconds */
8968 }
8969 }
8970 }
8971
8972 static void vmReopenSwapFile(void) {
8973 /* Note: we don't close the old one as we are in the child process
8974 * and don't want to mess at all with the original file object. */
8975 server.vm_fp = fopen(server.vm_swap_file,"r+b");
8976 if (server.vm_fp == NULL) {
8977 redisLog(REDIS_WARNING,"Can't re-open the VM swap file: %s. Exiting.",
8978 server.vm_swap_file);
8979 _exit(1);
8980 }
8981 server.vm_fd = fileno(server.vm_fp);
8982 }
8983
8984 /* This function must be called while with threaded IO locked */
8985 static void queueIOJob(iojob *j) {
8986 redisLog(REDIS_DEBUG,"Queued IO Job %p type %d about key '%s'\n",
8987 (void*)j, j->type, (char*)j->key->ptr);
8988 listAddNodeTail(server.io_newjobs,j);
8989 if (server.io_active_threads < server.vm_max_threads)
8990 spawnIOThread();
8991 }
8992
8993 static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db) {
8994 iojob *j;
8995
8996 assert(key->storage == REDIS_VM_MEMORY);
8997 assert(key->refcount == 1);
8998
8999 j = zmalloc(sizeof(*j));
9000 j->type = REDIS_IOJOB_PREPARE_SWAP;
9001 j->db = db;
9002 j->key = dupStringObject(key);
9003 j->val = val;
9004 incrRefCount(val);
9005 j->canceled = 0;
9006 j->thread = (pthread_t) -1;
9007 key->storage = REDIS_VM_SWAPPING;
9008
9009 lockThreadedIO();
9010 queueIOJob(j);
9011 unlockThreadedIO();
9012 return REDIS_OK;
9013 }
9014
9015 /* ============ Virtual Memory - Blocking clients on missing keys =========== */
9016
9017 /* This function makes the clinet 'c' waiting for the key 'key' to be loaded.
9018 * If there is not already a job loading the key, it is craeted.
9019 * The key is added to the io_keys list in the client structure, and also
9020 * in the hash table mapping swapped keys to waiting clients, that is,
9021 * server.io_waited_keys. */
9022 static int waitForSwappedKey(redisClient *c, robj *key) {
9023 struct dictEntry *de;
9024 robj *o;
9025 list *l;
9026
9027 /* If the key does not exist or is already in RAM we don't need to
9028 * block the client at all. */
9029 de = dictFind(c->db->dict,key);
9030 if (de == NULL) return 0;
9031 o = dictGetEntryKey(de);
9032 if (o->storage == REDIS_VM_MEMORY) {
9033 return 0;
9034 } else if (o->storage == REDIS_VM_SWAPPING) {
9035 /* We were swapping the key, undo it! */
9036 vmCancelThreadedIOJob(o);
9037 return 0;
9038 }
9039
9040 /* OK: the key is either swapped, or being loaded just now. */
9041
9042 /* Add the key to the list of keys this client is waiting for.
9043 * This maps clients to keys they are waiting for. */
9044 listAddNodeTail(c->io_keys,key);
9045 incrRefCount(key);
9046
9047 /* Add the client to the swapped keys => clients waiting map. */
9048 de = dictFind(c->db->io_keys,key);
9049 if (de == NULL) {
9050 int retval;
9051
9052 /* For every key we take a list of clients blocked for it */
9053 l = listCreate();
9054 retval = dictAdd(c->db->io_keys,key,l);
9055 incrRefCount(key);
9056 assert(retval == DICT_OK);
9057 } else {
9058 l = dictGetEntryVal(de);
9059 }
9060 listAddNodeTail(l,c);
9061
9062 /* Are we already loading the key from disk? If not create a job */
9063 if (o->storage == REDIS_VM_SWAPPED) {
9064 iojob *j;
9065
9066 o->storage = REDIS_VM_LOADING;
9067 j = zmalloc(sizeof(*j));
9068 j->type = REDIS_IOJOB_LOAD;
9069 j->db = c->db;
9070 j->key = dupStringObject(key);
9071 j->key->vtype = o->vtype;
9072 j->page = o->vm.page;
9073 j->val = NULL;
9074 j->canceled = 0;
9075 j->thread = (pthread_t) -1;
9076 lockThreadedIO();
9077 queueIOJob(j);
9078 unlockThreadedIO();
9079 }
9080 return 1;
9081 }
9082
9083 /* Preload keys needed for the ZUNION and ZINTER commands. */
9084 static void zunionInterBlockClientOnSwappedKeys(redisClient *c) {
9085 int i, num;
9086 num = atoi(c->argv[2]->ptr);
9087 for (i = 0; i < num; i++) {
9088 waitForSwappedKey(c,c->argv[3+i]);
9089 }
9090 }
9091
9092 /* Is this client attempting to run a command against swapped keys?
9093 * If so, block it ASAP, load the keys in background, then resume it.
9094 *
9095 * The important idea about this function is that it can fail! If keys will
9096 * still be swapped when the client is resumed, this key lookups will
9097 * just block loading keys from disk. In practical terms this should only
9098 * happen with SORT BY command or if there is a bug in this function.
9099 *
9100 * Return 1 if the client is marked as blocked, 0 if the client can
9101 * continue as the keys it is going to access appear to be in memory. */
9102 static int blockClientOnSwappedKeys(struct redisCommand *cmd, redisClient *c) {
9103 int j, last;
9104
9105 if (cmd->vm_preload_proc != NULL) {
9106 cmd->vm_preload_proc(c);
9107 } else {
9108 if (cmd->vm_firstkey == 0) return 0;
9109 last = cmd->vm_lastkey;
9110 if (last < 0) last = c->argc+last;
9111 for (j = cmd->vm_firstkey; j <= last; j += cmd->vm_keystep)
9112 waitForSwappedKey(c,c->argv[j]);
9113 }
9114
9115 /* If the client was blocked for at least one key, mark it as blocked. */
9116 if (listLength(c->io_keys)) {
9117 c->flags |= REDIS_IO_WAIT;
9118 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
9119 server.vm_blocked_clients++;
9120 return 1;
9121 } else {
9122 return 0;
9123 }
9124 }
9125
9126 /* Remove the 'key' from the list of blocked keys for a given client.
9127 *
9128 * The function returns 1 when there are no longer blocking keys after
9129 * the current one was removed (and the client can be unblocked). */
9130 static int dontWaitForSwappedKey(redisClient *c, robj *key) {
9131 list *l;
9132 listNode *ln;
9133 listIter li;
9134 struct dictEntry *de;
9135
9136 /* Remove the key from the list of keys this client is waiting for. */
9137 listRewind(c->io_keys,&li);
9138 while ((ln = listNext(&li)) != NULL) {
9139 if (compareStringObjects(ln->value,key) == 0) {
9140 listDelNode(c->io_keys,ln);
9141 break;
9142 }
9143 }
9144 assert(ln != NULL);
9145
9146 /* Remove the client form the key => waiting clients map. */
9147 de = dictFind(c->db->io_keys,key);
9148 assert(de != NULL);
9149 l = dictGetEntryVal(de);
9150 ln = listSearchKey(l,c);
9151 assert(ln != NULL);
9152 listDelNode(l,ln);
9153 if (listLength(l) == 0)
9154 dictDelete(c->db->io_keys,key);
9155
9156 return listLength(c->io_keys) == 0;
9157 }
9158
9159 static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key) {
9160 struct dictEntry *de;
9161 list *l;
9162 listNode *ln;
9163 int len;
9164
9165 de = dictFind(db->io_keys,key);
9166 if (!de) return;
9167
9168 l = dictGetEntryVal(de);
9169 len = listLength(l);
9170 /* Note: we can't use something like while(listLength(l)) as the list
9171 * can be freed by the calling function when we remove the last element. */
9172 while (len--) {
9173 ln = listFirst(l);
9174 redisClient *c = ln->value;
9175
9176 if (dontWaitForSwappedKey(c,key)) {
9177 /* Put the client in the list of clients ready to go as we
9178 * loaded all the keys about it. */
9179 listAddNodeTail(server.io_ready_clients,c);
9180 }
9181 }
9182 }
9183
9184 /* =========================== Remote Configuration ========================= */
9185
9186 static void configSetCommand(redisClient *c) {
9187 robj *o = getDecodedObject(c->argv[3]);
9188 if (!strcasecmp(c->argv[2]->ptr,"dbfilename")) {
9189 zfree(server.dbfilename);
9190 server.dbfilename = zstrdup(o->ptr);
9191 } else if (!strcasecmp(c->argv[2]->ptr,"requirepass")) {
9192 zfree(server.requirepass);
9193 server.requirepass = zstrdup(o->ptr);
9194 } else if (!strcasecmp(c->argv[2]->ptr,"masterauth")) {
9195 zfree(server.masterauth);
9196 server.masterauth = zstrdup(o->ptr);
9197 } else if (!strcasecmp(c->argv[2]->ptr,"maxmemory")) {
9198 server.maxmemory = strtoll(o->ptr, NULL, 10);
9199 } else {
9200 addReplySds(c,sdscatprintf(sdsempty(),
9201 "-ERR not supported CONFIG parameter %s\r\n",
9202 (char*)c->argv[2]->ptr));
9203 decrRefCount(o);
9204 return;
9205 }
9206 decrRefCount(o);
9207 addReply(c,shared.ok);
9208 }
9209
9210 static void configGetCommand(redisClient *c) {
9211 robj *o = getDecodedObject(c->argv[2]);
9212 robj *lenobj = createObject(REDIS_STRING,NULL);
9213 char *pattern = o->ptr;
9214 int matches = 0;
9215
9216 addReply(c,lenobj);
9217 decrRefCount(lenobj);
9218
9219 if (stringmatch(pattern,"dbfilename",0)) {
9220 addReplyBulkCString(c,"dbfilename");
9221 addReplyBulkCString(c,server.dbfilename);
9222 matches++;
9223 }
9224 if (stringmatch(pattern,"requirepass",0)) {
9225 addReplyBulkCString(c,"requirepass");
9226 addReplyBulkCString(c,server.requirepass);
9227 matches++;
9228 }
9229 if (stringmatch(pattern,"masterauth",0)) {
9230 addReplyBulkCString(c,"masterauth");
9231 addReplyBulkCString(c,server.masterauth);
9232 matches++;
9233 }
9234 if (stringmatch(pattern,"maxmemory",0)) {
9235 char buf[128];
9236
9237 snprintf(buf,128,"%llu\n",server.maxmemory);
9238 addReplyBulkCString(c,"maxmemory");
9239 addReplyBulkCString(c,buf);
9240 matches++;
9241 }
9242 decrRefCount(o);
9243 lenobj->ptr = sdscatprintf(sdsempty(),"*%d\r\n",matches*2);
9244 }
9245
9246 static void configCommand(redisClient *c) {
9247 if (!strcasecmp(c->argv[1]->ptr,"set")) {
9248 if (c->argc != 4) goto badarity;
9249 configSetCommand(c);
9250 } else if (!strcasecmp(c->argv[1]->ptr,"get")) {
9251 if (c->argc != 3) goto badarity;
9252 configGetCommand(c);
9253 } else if (!strcasecmp(c->argv[1]->ptr,"resetstat")) {
9254 if (c->argc != 2) goto badarity;
9255 server.stat_numcommands = 0;
9256 server.stat_numconnections = 0;
9257 server.stat_expiredkeys = 0;
9258 server.stat_starttime = time(NULL);
9259 addReply(c,shared.ok);
9260 } else {
9261 addReplySds(c,sdscatprintf(sdsempty(),
9262 "-ERR CONFIG subcommand must be one of GET, SET, RESETSTAT\r\n"));
9263 }
9264 return;
9265
9266 badarity:
9267 addReplySds(c,sdscatprintf(sdsempty(),
9268 "-ERR Wrong number of arguments for CONFIG %s\r\n",
9269 (char*) c->argv[1]->ptr));
9270 }
9271
9272 /* =========================== Pubsub implementation ======================== */
9273
9274 /* Subscribe a client to a class. Returns 1 if the operation succeeded, or
9275 * 0 if the client was already subscribed to that class. */
9276 static int pubsubSubscribe(redisClient *c, robj *class) {
9277 struct dictEntry *de;
9278 list *clients = NULL;
9279 int retval = 0;
9280
9281 /* Add the class to the client -> classes hash table */
9282 if (dictAdd(c->pubsub_classes,class,NULL) == DICT_OK) {
9283 retval = 1;
9284 incrRefCount(class);
9285 /* Add the client to the class -> list of clients hash table */
9286 de = dictFind(server.pubsub_classes,class);
9287 if (de == NULL) {
9288 clients = listCreate();
9289 dictAdd(server.pubsub_classes,class,clients);
9290 incrRefCount(class);
9291 } else {
9292 clients = dictGetEntryVal(de);
9293 }
9294 listAddNodeTail(clients,c);
9295 }
9296 /* Notify the client */
9297 addReply(c,shared.mbulk3);
9298 addReply(c,shared.subscribebulk);
9299 addReplyBulk(c,class);
9300 addReplyLong(c,dictSize(c->pubsub_classes));
9301 return retval;
9302 }
9303
9304 /* Unsubscribe a client from a class. Returns 1 if the operation succeeded, or
9305 * 0 if the client was not subscribed to the specified class. */
9306 static int pubsubUnsubscribe(redisClient *c, robj *class, int notify) {
9307 struct dictEntry *de;
9308 list *clients;
9309 listNode *ln;
9310 int retval = 0;
9311
9312 /* Remove the class from the client -> classes hash table */
9313 incrRefCount(class); /* class may be just a pointer to the same object
9314 we have in the hash tables. Protect it... */
9315 if (dictDelete(c->pubsub_classes,class) == DICT_OK) {
9316 retval = 1;
9317 /* Remove the client from the class -> clients list hash table */
9318 de = dictFind(server.pubsub_classes,class);
9319 assert(de != NULL);
9320 clients = dictGetEntryVal(de);
9321 ln = listSearchKey(clients,c);
9322 assert(ln != NULL);
9323 listDelNode(clients,ln);
9324 if (listLength(clients) == 0) {
9325 /* Free the list and associated hash entry at all if this was
9326 * the latest client, so that it will be possible to abuse
9327 * Redis PUBSUB creating millions of classes. */
9328 dictDelete(server.pubsub_classes,class);
9329 }
9330 }
9331 /* Notify the client */
9332 if (notify) {
9333 addReply(c,shared.mbulk3);
9334 addReply(c,shared.unsubscribebulk);
9335 addReplyBulk(c,class);
9336 addReplyLong(c,dictSize(c->pubsub_classes));
9337 }
9338 decrRefCount(class); /* it is finally safe to release it */
9339 return retval;
9340 }
9341
9342 /* Unsubscribe from all the classes. Return the number of classes the
9343 * client was subscribed to. */
9344 static int pubsubUnsubscribeAll(redisClient *c, int notify) {
9345 dictIterator *di = dictGetIterator(c->pubsub_classes);
9346 dictEntry *de;
9347 int count = 0;
9348
9349 while((de = dictNext(di)) != NULL) {
9350 robj *class = dictGetEntryKey(de);
9351
9352 count += pubsubUnsubscribe(c,class,notify);
9353 }
9354 dictReleaseIterator(di);
9355 return count;
9356 }
9357
9358 /* Publish a message */
9359 static int pubsubPublishMessage(robj *class, robj *message) {
9360 int receivers = 0;
9361 struct dictEntry *de;
9362
9363 de = dictFind(server.pubsub_classes,class);
9364 if (de) {
9365 list *list = dictGetEntryVal(de);
9366 listNode *ln;
9367 listIter li;
9368
9369 listRewind(list,&li);
9370 while ((ln = listNext(&li)) != NULL) {
9371 redisClient *c = ln->value;
9372
9373 addReply(c,shared.mbulk3);
9374 addReply(c,shared.messagebulk);
9375 addReplyBulk(c,class);
9376 addReplyBulk(c,message);
9377 receivers++;
9378 }
9379 }
9380 return receivers;
9381 }
9382
9383 static void subscribeCommand(redisClient *c) {
9384 int j;
9385
9386 for (j = 1; j < c->argc; j++)
9387 pubsubSubscribe(c,c->argv[j]);
9388 }
9389
9390 static void unsubscribeCommand(redisClient *c) {
9391 if (c->argc == 1) {
9392 pubsubUnsubscribeAll(c,1);
9393 return;
9394 } else {
9395 int j;
9396
9397 for (j = 1; j < c->argc; j++)
9398 pubsubUnsubscribe(c,c->argv[j],1);
9399 }
9400 }
9401
9402 static void publishCommand(redisClient *c) {
9403 int receivers = pubsubPublishMessage(c->argv[1],c->argv[2]);
9404 addReplyLong(c,receivers);
9405 }
9406
9407 /* ================================= Debugging ============================== */
9408
9409 static void debugCommand(redisClient *c) {
9410 if (!strcasecmp(c->argv[1]->ptr,"segfault")) {
9411 *((char*)-1) = 'x';
9412 } else if (!strcasecmp(c->argv[1]->ptr,"reload")) {
9413 if (rdbSave(server.dbfilename) != REDIS_OK) {
9414 addReply(c,shared.err);
9415 return;
9416 }
9417 emptyDb();
9418 if (rdbLoad(server.dbfilename) != REDIS_OK) {
9419 addReply(c,shared.err);
9420 return;
9421 }
9422 redisLog(REDIS_WARNING,"DB reloaded by DEBUG RELOAD");
9423 addReply(c,shared.ok);
9424 } else if (!strcasecmp(c->argv[1]->ptr,"loadaof")) {
9425 emptyDb();
9426 if (loadAppendOnlyFile(server.appendfilename) != REDIS_OK) {
9427 addReply(c,shared.err);
9428 return;
9429 }
9430 redisLog(REDIS_WARNING,"Append Only File loaded by DEBUG LOADAOF");
9431 addReply(c,shared.ok);
9432 } else if (!strcasecmp(c->argv[1]->ptr,"object") && c->argc == 3) {
9433 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
9434 robj *key, *val;
9435
9436 if (!de) {
9437 addReply(c,shared.nokeyerr);
9438 return;
9439 }
9440 key = dictGetEntryKey(de);
9441 val = dictGetEntryVal(de);
9442 if (!server.vm_enabled || (key->storage == REDIS_VM_MEMORY ||
9443 key->storage == REDIS_VM_SWAPPING)) {
9444 char *strenc;
9445 char buf[128];
9446
9447 if (val->encoding < (sizeof(strencoding)/sizeof(char*))) {
9448 strenc = strencoding[val->encoding];
9449 } else {
9450 snprintf(buf,64,"unknown encoding %d\n", val->encoding);
9451 strenc = buf;
9452 }
9453 addReplySds(c,sdscatprintf(sdsempty(),
9454 "+Key at:%p refcount:%d, value at:%p refcount:%d "
9455 "encoding:%s serializedlength:%lld\r\n",
9456 (void*)key, key->refcount, (void*)val, val->refcount,
9457 strenc, (long long) rdbSavedObjectLen(val,NULL)));
9458 } else {
9459 addReplySds(c,sdscatprintf(sdsempty(),
9460 "+Key at:%p refcount:%d, value swapped at: page %llu "
9461 "using %llu pages\r\n",
9462 (void*)key, key->refcount, (unsigned long long) key->vm.page,
9463 (unsigned long long) key->vm.usedpages));
9464 }
9465 } else if (!strcasecmp(c->argv[1]->ptr,"swapout") && c->argc == 3) {
9466 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
9467 robj *key, *val;
9468
9469 if (!server.vm_enabled) {
9470 addReplySds(c,sdsnew("-ERR Virtual Memory is disabled\r\n"));
9471 return;
9472 }
9473 if (!de) {
9474 addReply(c,shared.nokeyerr);
9475 return;
9476 }
9477 key = dictGetEntryKey(de);
9478 val = dictGetEntryVal(de);
9479 /* If the key is shared we want to create a copy */
9480 if (key->refcount > 1) {
9481 robj *newkey = dupStringObject(key);
9482 decrRefCount(key);
9483 key = dictGetEntryKey(de) = newkey;
9484 }
9485 /* Swap it */
9486 if (key->storage != REDIS_VM_MEMORY) {
9487 addReplySds(c,sdsnew("-ERR This key is not in memory\r\n"));
9488 } else if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
9489 dictGetEntryVal(de) = NULL;
9490 addReply(c,shared.ok);
9491 } else {
9492 addReply(c,shared.err);
9493 }
9494 } else {
9495 addReplySds(c,sdsnew(
9496 "-ERR Syntax error, try DEBUG [SEGFAULT|OBJECT <key>|SWAPOUT <key>|RELOAD]\r\n"));
9497 }
9498 }
9499
9500 static void _redisAssert(char *estr, char *file, int line) {
9501 redisLog(REDIS_WARNING,"=== ASSERTION FAILED ===");
9502 redisLog(REDIS_WARNING,"==> %s:%d '%s' is not true\n",file,line,estr);
9503 #ifdef HAVE_BACKTRACE
9504 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
9505 *((char*)-1) = 'x';
9506 #endif
9507 }
9508
9509 /* =================================== Main! ================================ */
9510
9511 #ifdef __linux__
/* Return the value found in /proc/sys/vm/overcommit_memory, or -1 if the
 * file can't be opened or read, or if its content does not start with a
 * non negative decimal number. */
int linuxOvercommitMemoryValue(void) {
    FILE *fp = fopen("/proc/sys/vm/overcommit_memory","r");
    char buf[64];
    char *end;
    long value;

    if (!fp) return -1;
    if (fgets(buf,sizeof(buf),fp) == NULL) {
        fclose(fp);
        return -1;
    }
    fclose(fp);

    /* Unlike atoi(), strtol() lets us tell "no number found" apart from a
     * real 0, that the caller handles as "overcommit disabled". */
    value = strtol(buf,&end,10);
    if (end == buf || value < 0) return -1;
    return (int) value;
}
9525
9526 void linuxOvercommitMemoryWarning(void) {
9527 if (linuxOvercommitMemoryValue() == 0) {
9528 redisLog(REDIS_WARNING,"WARNING overcommit_memory is set to 0! Background save may fail under low condition memory. To fix this issue add 'vm.overcommit_memory = 1' to /etc/sysctl.conf and then reboot or run the command 'sysctl vm.overcommit_memory=1' for this to take effect.");
9529 }
9530 }
9531 #endif /* __linux__ */
9532
9533 static void daemonize(void) {
9534 int fd;
9535 FILE *fp;
9536
9537 if (fork() != 0) exit(0); /* parent exits */
9538 setsid(); /* create a new session */
9539
9540 /* Every output goes to /dev/null. If Redis is daemonized but
9541 * the 'logfile' is set to 'stdout' in the configuration file
9542 * it will not log at all. */
9543 if ((fd = open("/dev/null", O_RDWR, 0)) != -1) {
9544 dup2(fd, STDIN_FILENO);
9545 dup2(fd, STDOUT_FILENO);
9546 dup2(fd, STDERR_FILENO);
9547 if (fd > STDERR_FILENO) close(fd);
9548 }
9549 /* Try to write the pid file */
9550 fp = fopen(server.pidfile,"w");
9551 if (fp) {
9552 fprintf(fp,"%d\n",getpid());
9553 fclose(fp);
9554 }
9555 }
9556
9557 static void version() {
9558 printf("Redis server version %s\n", REDIS_VERSION);
9559 exit(0);
9560 }
9561
/* Print the command line synopsis on standard error and terminate the
 * process with exit status 1. This function never returns. */
static void usage() {
    fputs("Usage: ./redis-server [/path/to/redis.conf]\n"
          " ./redis-server - (read config from stdin)\n", stderr);
    exit(1);
}
9567
9568 int main(int argc, char **argv) {
9569 time_t start;
9570
9571 initServerConfig();
9572 if (argc == 2) {
9573 if (strcmp(argv[1], "-v") == 0 ||
9574 strcmp(argv[1], "--version") == 0) version();
9575 if (strcmp(argv[1], "--help") == 0) usage();
9576 resetServerSaveParams();
9577 loadServerConfig(argv[1]);
9578 } else if ((argc > 2)) {
9579 usage();
9580 } else {
9581 redisLog(REDIS_WARNING,"Warning: no config file specified, using the default config. In order to specify a config file use 'redis-server /path/to/redis.conf'");
9582 }
9583 if (server.daemonize) daemonize();
9584 initServer();
9585 redisLog(REDIS_NOTICE,"Server started, Redis version " REDIS_VERSION);
9586 #ifdef __linux__
9587 linuxOvercommitMemoryWarning();
9588 #endif
9589 start = time(NULL);
9590 if (server.appendonly) {
9591 if (loadAppendOnlyFile(server.appendfilename) == REDIS_OK)
9592 redisLog(REDIS_NOTICE,"DB loaded from append only file: %ld seconds",time(NULL)-start);
9593 } else {
9594 if (rdbLoad(server.dbfilename) == REDIS_OK)
9595 redisLog(REDIS_NOTICE,"DB loaded from disk: %ld seconds",time(NULL)-start);
9596 }
9597 redisLog(REDIS_NOTICE,"The server is now ready to accept connections on port %d", server.port);
9598 aeSetBeforeSleepProc(server.el,beforeSleep);
9599 aeMain(server.el);
9600 aeDeleteEventLoop(server.el);
9601 return 0;
9602 }
9603
9604 /* ============================= Backtrace support ========================= */
9605
9606 #ifdef HAVE_BACKTRACE
9607 static char *findFuncName(void *pointer, unsigned long *offset);
9608
9609 static void *getMcontextEip(ucontext_t *uc) {
9610 #if defined(__FreeBSD__)
9611 return (void*) uc->uc_mcontext.mc_eip;
9612 #elif defined(__dietlibc__)
9613 return (void*) uc->uc_mcontext.eip;
9614 #elif defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6)
9615 #if __x86_64__
9616 return (void*) uc->uc_mcontext->__ss.__rip;
9617 #else
9618 return (void*) uc->uc_mcontext->__ss.__eip;
9619 #endif
9620 #elif defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)
9621 #if defined(_STRUCT_X86_THREAD_STATE64) && !defined(__i386__)
9622 return (void*) uc->uc_mcontext->__ss.__rip;
9623 #else
9624 return (void*) uc->uc_mcontext->__ss.__eip;
9625 #endif
9626 #elif defined(__i386__) || defined(__X86_64__) || defined(__x86_64__)
9627 return (void*) uc->uc_mcontext.gregs[REG_EIP]; /* Linux 32/64 bit */
9628 #elif defined(__ia64__) /* Linux IA64 */
9629 return (void*) uc->uc_mcontext.sc_ip;
9630 #else
9631 return NULL;
9632 #endif
9633 }
9634
9635 static void segvHandler(int sig, siginfo_t *info, void *secret) {
9636 void *trace[100];
9637 char **messages = NULL;
9638 int i, trace_size = 0;
9639 unsigned long offset=0;
9640 ucontext_t *uc = (ucontext_t*) secret;
9641 sds infostring;
9642 REDIS_NOTUSED(info);
9643
9644 redisLog(REDIS_WARNING,
9645 "======= Ooops! Redis %s got signal: -%d- =======", REDIS_VERSION, sig);
9646 infostring = genRedisInfoString();
9647 redisLog(REDIS_WARNING, "%s",infostring);
9648 /* It's not safe to sdsfree() the returned string under memory
9649 * corruption conditions. Let it leak as we are going to abort */
9650
9651 trace_size = backtrace(trace, 100);
9652 /* overwrite sigaction with caller's address */
9653 if (getMcontextEip(uc) != NULL) {
9654 trace[1] = getMcontextEip(uc);
9655 }
9656 messages = backtrace_symbols(trace, trace_size);
9657
9658 for (i=1; i<trace_size; ++i) {
9659 char *fn = findFuncName(trace[i], &offset), *p;
9660
9661 p = strchr(messages[i],'+');
9662 if (!fn || (p && ((unsigned long)strtol(p+1,NULL,10)) < offset)) {
9663 redisLog(REDIS_WARNING,"%s", messages[i]);
9664 } else {
9665 redisLog(REDIS_WARNING,"%d redis-server %p %s + %d", i, trace[i], fn, (unsigned int)offset);
9666 }
9667 }
9668 /* free(messages); Don't call free() with possibly corrupted memory. */
9669 _exit(0);
9670 }
9671
9672 static void setupSigSegvAction(void) {
9673 struct sigaction act;
9674
9675 sigemptyset (&act.sa_mask);
9676 /* When the SA_SIGINFO flag is set in sa_flags then sa_sigaction
9677 * is used. Otherwise, sa_handler is used */
9678 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND | SA_SIGINFO;
9679 act.sa_sigaction = segvHandler;
9680 sigaction (SIGSEGV, &act, NULL);
9681 sigaction (SIGBUS, &act, NULL);
9682 sigaction (SIGFPE, &act, NULL);
9683 sigaction (SIGILL, &act, NULL);
9684 sigaction (SIGBUS, &act, NULL);
9685 return;
9686 }
9687
9688 #include "staticsymbols.h"
9689 /* This function try to convert a pointer into a function name. It's used in
9690 * oreder to provide a backtrace under segmentation fault that's able to
9691 * display functions declared as static (otherwise the backtrace is useless). */
9692 static char *findFuncName(void *pointer, unsigned long *offset){
9693 int i, ret = -1;
9694 unsigned long off, minoff = 0;
9695
9696 /* Try to match against the Symbol with the smallest offset */
9697 for (i=0; symsTable[i].pointer; i++) {
9698 unsigned long lp = (unsigned long) pointer;
9699
9700 if (lp != (unsigned long)-1 && lp >= symsTable[i].pointer) {
9701 off=lp-symsTable[i].pointer;
9702 if (ret < 0 || off < minoff) {
9703 minoff=off;
9704 ret=i;
9705 }
9706 }
9707 }
9708 if (ret == -1) return NULL;
9709 *offset = minoff;
9710 return symsTable[ret].name;
9711 }
9712 #else /* HAVE_BACKTRACE */
/* Backtrace support is not compiled in: there is no handler to install,
 * so this is intentionally a no-op. */
static void setupSigSegvAction(void) {
    /* Nothing to do. */
}
9715 #endif /* HAVE_BACKTRACE */
9716
9717
9718
9719 /* The End */
9720
9721
9722