]> git.saurik.com Git - redis.git/blob - redis.c
35b9ba252c69070f2abe0966f130828d0460f846
[redis.git] / redis.c
1 /*
2 * Copyright (c) 2009-2010, Salvatore Sanfilippo <antirez at gmail dot com>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * * Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * * Neither the name of Redis nor the names of its contributors may be used
14 * to endorse or promote products derived from this software without
15 * specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 * POSSIBILITY OF SUCH DAMAGE.
28 */
29
30 #define REDIS_VERSION "2.1.1" /* Version string of this Redis source tree */
31
32 #include "fmacros.h"
33 #include "config.h"
34
35 #include <stdio.h>
36 #include <stdlib.h>
37 #include <string.h>
38 #include <time.h>
39 #include <unistd.h>
40 #include <signal.h>
41
42 #ifdef HAVE_BACKTRACE
43 #include <execinfo.h>
44 #include <ucontext.h>
45 #endif /* HAVE_BACKTRACE */
46
47 #include <sys/wait.h>
48 #include <errno.h>
49 #include <assert.h>
50 #include <ctype.h>
51 #include <stdarg.h>
52 #include <inttypes.h>
53 #include <arpa/inet.h>
54 #include <sys/stat.h>
55 #include <fcntl.h>
56 #include <sys/time.h>
57 #include <sys/resource.h>
58 #include <sys/uio.h>
59 #include <limits.h>
60 #include <float.h>
61 #include <math.h>
62 #include <pthread.h>
63
64 #if defined(__sun)
65 #include "solarisfixes.h"
66 #endif
67
68 #include "redis.h"
69 #include "ae.h" /* Event driven programming library */
70 #include "sds.h" /* Dynamic safe strings */
71 #include "anet.h" /* Networking the easy way */
72 #include "dict.h" /* Hash tables */
73 #include "adlist.h" /* Linked lists */
74 #include "zmalloc.h" /* total memory usage aware version of malloc/free */
75 #include "lzf.h" /* LZF compression library */
76 #include "pqsort.h" /* Partial qsort for SORT+LIMIT */
77 #include "zipmap.h" /* Compact dictionary-alike data structure */
78 #include "sha1.h" /* SHA1 is used for DEBUG DIGEST */
79 #include "release.h" /* Release and/or git repository information */
80
81 /* Error codes */
82 #define REDIS_OK 0
83 #define REDIS_ERR -1 /* NOTE(review): expands to an unparenthesized -1 */
84
85 /* Static server configuration */
86 #define REDIS_SERVERPORT 6379 /* TCP port */
87 #define REDIS_MAXIDLETIME (60*5) /* default client timeout (presumably seconds - TODO confirm) */
88 #define REDIS_IOBUF_LEN 1024
89 #define REDIS_LOADBUF_LEN 1024
90 #define REDIS_STATIC_ARGS 8
91 #define REDIS_DEFAULT_DBNUM 16
92 #define REDIS_CONFIGLINE_MAX 1024
93 #define REDIS_OBJFREELIST_MAX 1000000 /* Max number of objects to cache */
94 #define REDIS_MAX_SYNC_TIME 60 /* Slave can't take more to sync */
95 #define REDIS_EXPIRELOOKUPS_PER_CRON 10 /* lookup 10 expires per loop */
96 #define REDIS_MAX_WRITE_PER_EVENT (1024*64)
97 #define REDIS_REQUEST_MAX_SIZE (1024*1024*256) /* max bytes in inline command */
98
99 /* If more than REDIS_WRITEV_THRESHOLD write packets are pending use writev */
100 #define REDIS_WRITEV_THRESHOLD 3
101 /* Max number of iovecs used for each writev call */
102 #define REDIS_WRITEV_IOVEC_COUNT 256
103
104 /* Hash table parameters */
105 #define REDIS_HT_MINFILL 10 /* Minimal hash table fill 10% */
106
107 /* Command flags. The values are distinct powers of two, so they can be
 * OR-ed together in the 'flags' field of struct redisCommand. */
108 #define REDIS_CMD_BULK 1 /* Bulk write command */
109 #define REDIS_CMD_INLINE 2 /* Inline command */
110 /* REDIS_CMD_DENYOOM deserves a longer comment: all the commands marked with
111 this flag will return an error when the 'maxmemory' option is set in the
112 config file and the server is using more than maxmemory bytes of memory.
113 In short these commands are denied on low memory conditions. */
114 #define REDIS_CMD_DENYOOM 4
115 #define REDIS_CMD_FORCE_REPLICATION 8 /* Force replication even if dirty is 0 */
116
117 /* Object types */
118 #define REDIS_STRING 0
119 #define REDIS_LIST 1
120 #define REDIS_SET 2
121 #define REDIS_ZSET 3
122 #define REDIS_HASH 4
123 #define REDIS_VMPOINTER 8
124
125 /* Object encodings. Some kinds of objects, like Strings and Hashes, can be
126 * internally represented in multiple ways. The 'encoding' field of the object
127 * is set to one of these values. */
128 #define REDIS_ENCODING_RAW 0 /* Raw representation */
129 #define REDIS_ENCODING_INT 1 /* Encoded as integer */
130 #define REDIS_ENCODING_ZIPMAP 2 /* Encoded as zipmap */
131 #define REDIS_ENCODING_HT 3 /* Encoded as a hash table */
132
/* Human readable encoding names, listed in the same order as the
 * REDIS_ENCODING_* values above (index 0 is "raw", index 3 is "hashtable"). */
133 static char* strencoding[] = {
134 "raw", "int", "zipmap", "hashtable"
135 };
136
137 /* Object types only used for dumping to disk */
138 #define REDIS_EXPIRETIME 253
139 #define REDIS_SELECTDB 254
140 #define REDIS_EOF 255
141
142 /* Defines related to the dump file format. Storing 32 bit lengths for short
143 * keys requires a lot of space, so we check the most significant 2 bits of
144 * the first byte to interpret the length:
145 *
146 * 00|000000 => if the two MSB are 00 the len is the 6 bits of this byte
147 * 01|000000 00000000 => 01, the len is 14 bits, 6 bits + 8 bits of next byte
148 * 10|000000 [32 bit integer] => if it's 10, a full 32 bit len will follow
149 * 11|000000 this means: specially encoded object will follow. The six bits
150 * number specify the kind of object that follows.
151 * See the REDIS_RDB_ENC_* defines.
152 *
153 * Lengths up to 63 are stored using a single byte, most DB keys, and many
154 * values, will fit inside. */
155 #define REDIS_RDB_6BITLEN 0
156 #define REDIS_RDB_14BITLEN 1
157 #define REDIS_RDB_32BITLEN 2
158 #define REDIS_RDB_ENCVAL 3
159 #define REDIS_RDB_LENERR UINT_MAX
160
161 /* When a length of a string object stored on disk has the first two bits
162 * set, the remaining six bits specify a special encoding for the object
163 * according to the following defines: */
164 #define REDIS_RDB_ENC_INT8 0 /* 8 bit signed integer */
165 #define REDIS_RDB_ENC_INT16 1 /* 16 bit signed integer */
166 #define REDIS_RDB_ENC_INT32 2 /* 32 bit signed integer */
167 #define REDIS_RDB_ENC_LZF 3 /* string compressed with FASTLZ */
168
169 /* Virtual memory: values of the 'storage' bit-field of redisObject and
 * vmPointer. */
170 #define REDIS_VM_MEMORY 0 /* The object is on memory */
171 #define REDIS_VM_SWAPPED 1 /* The object is on disk */
172 #define REDIS_VM_SWAPPING 2 /* Redis is swapping this object on disk */
173 #define REDIS_VM_LOADING 3 /* Redis is loading this object from disk */
174
175 /* Virtual memory static configuration stuff.
176 * Check vmFindContiguousPages() to know more about these magic numbers. */
177 #define REDIS_VM_MAX_NEAR_PAGES 65536
178 #define REDIS_VM_MAX_RANDOM_JUMP 4096
179 #define REDIS_VM_MAX_THREADS 32
180 #define REDIS_THREAD_STACK_SIZE (1024*1024*4)
181 /* The following is the *percentage* of completed I/O jobs to process when the
182 * handler is called. While Virtual Memory I/O operations are performed by
183 * threads, these operations must be processed by the main thread when completed
184 * in order to take effect. */
185 #define REDIS_MAX_COMPLETED_JOBS_PROCESSED 1
186
187 /* Client flags. Distinct powers of two, OR-ed together in redisClient.flags */
188 #define REDIS_SLAVE 1 /* This client is a slave server */
189 #define REDIS_MASTER 2 /* This client is a master server */
190 #define REDIS_MONITOR 4 /* This client is a slave monitor, see MONITOR */
191 #define REDIS_MULTI 8 /* This client is in a MULTI context */
192 #define REDIS_BLOCKED 16 /* The client is waiting in a blocking operation */
193 #define REDIS_IO_WAIT 32 /* The client is waiting for Virtual Memory I/O */
194 #define REDIS_DIRTY_CAS 64 /* Watched keys modified. EXEC will fail. */
195
196 /* Slave replication state - slave side */
197 #define REDIS_REPL_NONE 0 /* No active replication */
198 #define REDIS_REPL_CONNECT 1 /* Must connect to master */
199 #define REDIS_REPL_CONNECTED 2 /* Connected to master */
200
201 /* Slave replication state - from the point of view of master
202 * Note that in SEND_BULK and ONLINE state the slave receives new updates
203 * in its output queue. In the WAIT_BGSAVE state instead the server is waiting
204 * to start the next background saving in order to send updates to it. */
205 #define REDIS_REPL_WAIT_BGSAVE_START 3 /* master waits bgsave to start feeding it */
206 #define REDIS_REPL_WAIT_BGSAVE_END 4 /* master waits bgsave to start bulk DB transmission */
207 #define REDIS_REPL_SEND_BULK 5 /* master is sending the bulk DB */
208 #define REDIS_REPL_ONLINE 6 /* bulk DB already transmitted, receive updates */
209
210 /* List related stuff */
211 #define REDIS_HEAD 0
212 #define REDIS_TAIL 1
213
214 /* Sort operations */
215 #define REDIS_SORT_GET 0
216 #define REDIS_SORT_ASC 1
217 #define REDIS_SORT_DESC 2
218 #define REDIS_SORTKEY_MAX 1024
219
220 /* Log levels, in increasing order of severity */
221 #define REDIS_DEBUG 0
222 #define REDIS_VERBOSE 1
223 #define REDIS_NOTICE 2
224 #define REDIS_WARNING 3
225
226 /* Anti-warning macro: evaluates V and discards the result */
227 #define REDIS_NOTUSED(V) ((void) V)
228
229 #define ZSKIPLIST_MAXLEVEL 32 /* Should be enough for 2^32 elements */
230 #define ZSKIPLIST_P 0.25 /* Skiplist P = 1/4 */
231
232 /* Append only defines */
233 #define APPENDFSYNC_NO 0
234 #define APPENDFSYNC_ALWAYS 1
235 #define APPENDFSYNC_EVERYSEC 2
236
237 /* Hashes related defaults */
238 #define REDIS_HASH_MAX_ZIPMAP_ENTRIES 64
239 #define REDIS_HASH_MAX_ZIPMAP_VALUE 512
240
/* We can print the stacktrace, so our assert is defined this way.
 *
 * redisAssert(_e): if the expression _e is false, report the stringified
 * expression together with the file and line via _redisAssert(), then
 * terminate the process with _exit(1).
 *
 * redisPanic(_e): unconditionally report the stringified argument via
 * _redisPanic(), then terminate the process with _exit(1).
 *
 * Both macros expand to a single fully parenthesized expression of type
 * void, so they behave as one operand wherever they appear. Without the
 * outer parentheses, "c ? x : redisPanic(m)" would parse as
 * "(c ? x : _redisPanic(...)), _exit(1)" and exit on every path. */
#define redisAssert(_e) ((_e)?(void)0 : (_redisAssert(#_e,__FILE__,__LINE__),_exit(1)))
#define redisPanic(_e) (_redisPanic(#_e,__FILE__,__LINE__),_exit(1))
static void _redisAssert(char *estr, char *file, int line);
static void _redisPanic(char *msg, char *file, int line);
246
247 /*================================= Data types ============================== */
248
249 /* A redis object, that is a type able to hold a string / list / set */
250
251 /* The actual Redis Object.
 *
 * The four bit-fields add up to 32 bits (4 + 2 + 4 + 22). The leading
 * 'type' and 'storage' bit-fields are declared identically at the start of
 * struct vmPointer. */
252 typedef struct redisObject {
253 unsigned type:4; /* one of the object type defines, e.g. REDIS_STRING */
254 unsigned storage:2; /* REDIS_VM_MEMORY or REDIS_VM_SWAPPING */
255 unsigned encoding:4; /* one of the REDIS_ENCODING_* defines */
256 unsigned lru:22; /* lru time (relative to server.lruclock) */
257 int refcount;
258 void *ptr;
259 /* VM fields, these are only allocated if VM is active, otherwise the
260 * object allocation function will just allocate
261 * sizeof(redisObject) minus sizeof(redisObjectVM), so using
262 * Redis without VM active will not have any overhead.
 * NOTE(review): no VM fields follow in this definition and no
 * redisObjectVM type is visible here - this comment looks stale. */
263 } robj;
264
265 /* The VM pointer structure - identifies an object in the swap file.
266 *
267 * This object is stored in place of the value
268 * object in the main key->value hash table representing a database.
269 * Note that the first fields (type, storage) are the same as the redisObject
270 * structure so that vmPointer structures can be accessed even when cast
271 * to redisObject structures.
272 *
273 * This is useful as we don't know if a value object is on disk or not, but we
274 * are always able to read obj->storage to check this. For vmPointer
275 * structures "type" is set to REDIS_VMPOINTER (even if without this field
276 * it is still possible to check the kind of object from the value of 'storage').*/
277 typedef struct vmPointer {
278 unsigned type:4;
279 unsigned storage:2; /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
280 unsigned notused:26; /* pads the bit-fields to 32 bits (4 + 2 + 26) */
281 unsigned int vtype; /* type of the object stored in the swap file */
282 off_t page; /* the page at which the object is stored on disk */
283 off_t usedpages; /* number of pages used on disk */
284 } vmpointer;
285
/* Macro used to initialize a Redis object allocated on the stack.
 * Note that this macro is kept near the structure definition to make sure
 * we'll update it when the structure is changed, to avoid bugs like
 * bug #85 introduced exactly in this way.
 *
 * _var is the object itself (an lvalue, not a pointer) and _ptr is the value
 * stored in its 'ptr' field. The object is set up as a raw-encoded string
 * living in memory with a reference count of 1. The 'lru' field is not
 * touched.
 *
 * _var is expanded several times, so pass an expression with no side effects.
 *
 * The expansion is a do { ... } while(0) WITHOUT a trailing semicolon: the
 * caller's own ';' completes the statement, which keeps the macro usable as
 * the body of an if/else without braces. */
#define initStaticStringObject(_var,_ptr) do { \
    (_var).refcount = 1; \
    (_var).type = REDIS_STRING; \
    (_var).encoding = REDIS_ENCODING_RAW; \
    (_var).ptr = (_ptr); \
    (_var).storage = REDIS_VM_MEMORY; \
} while(0)
297
/* One logical database: the keyspace plus the per-key side tables
 * (expires, blocked clients, VM I/O waiters, WATCHed keys). */
298 typedef struct redisDb {
299 dict *dict; /* The keyspace for this DB */
300 dict *expires; /* Timeout of keys with a timeout set */
301 dict *blocking_keys; /* Keys with clients waiting for data (BLPOP) */
302 dict *io_keys; /* Keys with clients waiting for VM I/O */
303 dict *watched_keys; /* WATCHED keys for MULTI/EXEC CAS */
304 int id; /* presumably the index of this DB - TODO confirm */
305 } redisDb;
306
307 /* Client MULTI/EXEC state: one queued command with its argument vector */
308 typedef struct multiCmd {
309 robj **argv; /* arguments of the queued command */
310 int argc; /* number of entries in argv */
311 struct redisCommand *cmd; /* command table entry for this command */
312 } multiCmd;
313
/* The commands queued by a client in a MULTI context */
314 typedef struct multiState {
315 multiCmd *commands; /* Array of MULTI commands */
316 int count; /* Total number of MULTI commands */
317 } multiState;
318
319 /* With multiplexing we need to keep per-client state.
320 * Clients are kept in a linked list. */
321 typedef struct redisClient {
322 int fd;
323 redisDb *db;
324 int dictid;
325 sds querybuf;
326 robj **argv, **mbargv;
327 int argc, mbargc;
328 int bulklen; /* bulk read len. -1 if not in bulk read mode */
329 int multibulk; /* multi bulk command format active */
330 list *reply;
331 int sentlen;
332 time_t lastinteraction; /* time of the last interaction, used for timeout */
333 int flags; /* REDIS_SLAVE | REDIS_MONITOR | REDIS_MULTI ... */
334 int slaveseldb; /* slave selected db, if this client is a slave */
335 int authenticated; /* when requirepass is non-NULL */
336 int replstate; /* replication state if this is a slave */
337 int repldbfd; /* replication DB file descriptor */
338 long repldboff; /* replication DB file offset (NOTE(review): long, while repldbsize is off_t) */
339 off_t repldbsize; /* replication DB file size */
340 multiState mstate; /* MULTI/EXEC state */
341 robj **blocking_keys; /* The keys we are waiting on to terminate a blocking
342 * operation such as BLPOP. Otherwise NULL. */
343 int blocking_keys_num; /* Number of blocking keys */
344 time_t blockingto; /* Blocking operation timeout. If UNIX current time
345 * is >= blockingto then the operation timed out. */
346 list *io_keys; /* Keys this client is waiting to be loaded from the
347 * swap file in order to continue. */
348 list *watched_keys; /* Keys WATCHED for MULTI/EXEC CAS */
349 dict *pubsub_channels; /* channels a client is interested in (SUBSCRIBE) */
350 list *pubsub_patterns; /* patterns a client is interested in (SUBSCRIBE) */
351 } redisClient;
352
/* One save point: a (seconds, changes) pair. redisServer keeps an array of
 * these in 'saveparams'.
 * NOTE(review): presumably "save after 'seconds' if at least 'changes'
 * modifications happened" - confirm against the code that reads it. */
353 struct saveparam {
354 time_t seconds;
355 int changes;
356 };
357
358 /* Global server state structure */
359 struct redisServer {
360 int port;
361 int fd;
362 redisDb *db;
363 long long dirty; /* changes to DB from the last save */
364 list *clients;
365 list *slaves, *monitors;
366 char neterr[ANET_ERR_LEN];
367 aeEventLoop *el;
368 int cronloops; /* number of times the cron function run */
369 list *objfreelist; /* A list of freed objects to avoid malloc() */
370 time_t lastsave; /* Unix time of last successful save */
371 /* Fields used only for stats */
372 time_t stat_starttime; /* server start time */
373 long long stat_numcommands; /* number of processed commands */
374 long long stat_numconnections; /* number of connections received */
375 long long stat_expiredkeys; /* number of expired keys */
376 /* Configuration */
377 int verbosity;
378 int glueoutputbuf;
379 int maxidletime;
380 int dbnum;
381 int daemonize;
382 int appendonly;
383 int appendfsync; /* presumably one of the APPENDFSYNC_* values - TODO confirm */
384 int no_appendfsync_on_rewrite;
385 int shutdown_asap;
386 time_t lastfsync;
387 int appendfd;
388 int appendseldb;
389 char *pidfile;
390 pid_t bgsavechildpid;
391 pid_t bgrewritechildpid;
392 sds bgrewritebuf; /* buffer taken by parent during append only rewrite */
393 sds aofbuf; /* AOF buffer, written before entering the event loop */
394 struct saveparam *saveparams;
395 int saveparamslen;
396 char *logfile;
397 char *bindaddr;
398 char *dbfilename;
399 char *appendfilename;
400 char *requirepass;
401 int rdbcompression;
402 int activerehashing;
403 /* Replication related */
404 int isslave;
405 char *masterauth;
406 char *masterhost;
407 int masterport;
408 redisClient *master; /* client that is master for this slave */
409 int replstate;
410 unsigned int maxclients;
411 unsigned long long maxmemory;
412 unsigned int blpop_blocked_clients;
413 unsigned int vm_blocked_clients;
414 /* Sort parameters - qsort_r() is only available under BSD so we
415 * have to keep this state global, in order to pass it to sortCompare() */
416 int sort_desc;
417 int sort_alpha;
418 int sort_bypattern;
419 /* Virtual memory configuration */
420 int vm_enabled;
421 char *vm_swap_file;
422 off_t vm_page_size;
423 off_t vm_pages;
424 unsigned long long vm_max_memory;
425 /* Hashes config */
426 size_t hash_max_zipmap_entries;
427 size_t hash_max_zipmap_value;
428 /* Virtual memory state */
429 FILE *vm_fp;
430 int vm_fd;
431 off_t vm_next_page; /* Next probably empty page */
432 off_t vm_near_pages; /* Number of pages allocated sequentially */
433 unsigned char *vm_bitmap; /* Bitmap of free/used pages */
434 time_t unixtime; /* Unix time sampled every second. */
435 /* Virtual memory I/O threads stuff */
436 /* An I/O thread processes an element taken from the io_jobs queue and
437 * puts the result of the operation in the io_done list. While the
438 * job is being processed, it's put on the io_processing queue.
 * NOTE(review): the fields below are named io_newjobs / io_processing /
 * io_processed; "io_jobs" and "io_done" presumably refer to io_newjobs
 * and io_processed. */
439 list *io_newjobs; /* List of VM I/O jobs yet to be processed */
440 list *io_processing; /* List of VM I/O jobs being processed */
441 list *io_processed; /* List of VM I/O jobs already processed */
442 list *io_ready_clients; /* Clients ready to be unblocked. All keys loaded */
443 pthread_mutex_t io_mutex; /* lock to access io_jobs/io_done/io_thread_job */
444 pthread_mutex_t obj_freelist_mutex; /* safe redis objects creation/free */
445 pthread_mutex_t io_swapfile_mutex; /* So we can lseek + write */
446 pthread_attr_t io_threads_attr; /* attributes for threads creation */
447 int io_active_threads; /* Number of running I/O threads */
448 int vm_max_threads; /* Max number of I/O threads running at the same time */
449 /* Our main thread is blocked on the event loop, looking for sockets ready
450 * to be read or written, so when a threaded I/O operation is ready to be
451 * processed by the main thread, the I/O thread will use a unix pipe to
452 * awake the main thread. The following are the two pipe FDs. */
453 int io_ready_pipe_read;
454 int io_ready_pipe_write;
455 /* Virtual memory stats */
456 unsigned long long vm_stats_used_pages;
457 unsigned long long vm_stats_swapped_objects;
458 unsigned long long vm_stats_swapouts;
459 unsigned long long vm_stats_swapins;
460 /* Pubsub */
461 dict *pubsub_channels; /* Map channels to list of subscribed clients */
462 list *pubsub_patterns; /* A list of pubsub_patterns */
463 /* Misc */
464 FILE *devnull;
465 unsigned lruclock:22; /* clock incrementing every minute, for LRU */
466 unsigned lruclock_padding:10; /* 22 + 10 = 32 bits; same width as redisObject.lru */
467 };
468
/* A (client, pattern) pair.
 * NOTE(review): presumably the element type stored in
 * server.pubsub_patterns - confirm against the pubsub code. */
469 typedef struct pubsubPattern {
470 redisClient *client;
471 robj *pattern;
472 } pubsubPattern;
473
/* Signature of a command implementation: it receives the calling client. */
474 typedef void redisCommandProc(redisClient *c);
/* Signature of a custom VM key preloading function for a command. */
475 typedef void redisVmPreloadProc(redisClient *c, struct redisCommand *cmd, int argc, robj **argv);
/* One entry of the command table. */
476 struct redisCommand {
477 char *name;
478 redisCommandProc *proc;
479 int arity; /* NOTE(review): negative values appear in the table (e.g. -2); presumably "at least N arguments" - confirm */
480 int flags; /* OR of REDIS_CMD_* flags */
481 /* Use a function to determine which keys need to be loaded
482 * in the background prior to executing this command. Takes precedence
483 * over vm_firstkey and others, ignored when NULL */
484 redisVmPreloadProc *vm_preload_proc;
485 /* What keys should be loaded in background when calling this command? */
486 int vm_firstkey; /* The first argument that's a key (0 = no keys) */
487 int vm_lastkey; /* The last argument that's a key */
488 int vm_keystep; /* The step between first and last key */
489 };
490
/* A (name, address) pair describing a function symbol.
 * NOTE(review): presumably used to resolve addresses to names when printing
 * a stack trace - confirm against its users. */
491 struct redisFunctionSym {
492 char *name;
493 unsigned long pointer;
494 };
495
/* An element being sorted: the object itself plus its sort key, which is
 * either a numeric score or another object to compare (never both, as the
 * two share storage in the union). */
496 typedef struct _redisSortObject {
497 robj *obj;
498 union {
499 double score;
500 robj *cmpobj;
501 } u;
502 } redisSortObject;
503
/* A SORT operation: its kind plus the pattern it applies to. */
504 typedef struct _redisSortOperation {
505 int type; /* presumably one of the REDIS_SORT_* defines - TODO confirm */
506 robj *pattern;
507 } redisSortOperation;
508
509 /* ZSETs use a specialized version of Skiplists */
510
/* A skiplist node. 'forward' and 'span' are pointers to arrays.
 * NOTE(review): presumably one entry per level of the node - confirm in
 * the zsl* functions. */
511 typedef struct zskiplistNode {
512 struct zskiplistNode **forward;
513 struct zskiplistNode *backward;
514 unsigned int *span;
515 double score;
516 robj *obj;
517 } zskiplistNode;
518
/* The skiplist itself: end points, element count and current level. */
519 typedef struct zskiplist {
520 struct zskiplistNode *header, *tail;
521 unsigned long length;
522 int level;
523 } zskiplist;
524
/* A sorted set pairs a dict with a skiplist. */
525 typedef struct zset {
526 dict *dict;
527 zskiplist *zsl;
528 } zset;
529
530 /* Our shared "common" objects: pointers to pre-built robj values, plus an
 * array of REDIS_SHARED_INTEGERS integer objects. */
531
532 #define REDIS_SHARED_INTEGERS 10000
533 struct sharedObjectsStruct {
534 robj *crlf, *ok, *err, *emptybulk, *czero, *cone, *pong, *space,
535 *colon, *nullbulk, *nullmultibulk, *queued,
536 *emptymultibulk, *wrongtypeerr, *nokeyerr, *syntaxerr, *sameobjecterr,
537 *outofrangeerr, *plus,
538 *select0, *select1, *select2, *select3, *select4,
539 *select5, *select6, *select7, *select8, *select9,
540 *messagebulk, *pmessagebulk, *subscribebulk, *unsubscribebulk, *mbulk3,
541 *mbulk4, *psubscribebulk, *punsubscribebulk,
542 *integers[REDIS_SHARED_INTEGERS];
543 } shared;
544
545 /* Global vars that are actually used as constants. The following double
546 * values are used for double on-disk serialization, and are initialized
547 * at runtime to avoid strange compiler optimizations. */
548
549 static double R_Zero, R_PosInf, R_NegInf, R_Nan;
550
551 /* VM threaded I/O request message */
552 #define REDIS_IOJOB_LOAD 0 /* Load from disk to memory */
553 #define REDIS_IOJOB_PREPARE_SWAP 1 /* Compute needed pages */
554 #define REDIS_IOJOB_DO_SWAP 2 /* Swap from memory to disk */
555 typedef struct iojob {
556 int type; /* Request type, REDIS_IOJOB_* */
557 redisDb *db;/* Redis database */
558 robj *key; /* This I/O request is about swapping this key */
559 robj *id; /* Unique identifier of this job:
560 this is the object to swap for REDIS_IOJOB_*_SWAP, or the
561 vmpointer object for REDIS_IOJOB_LOAD. */
562 robj *val; /* the value to swap for REDIS_IOJOB_*_SWAP, otherwise this
563 * field is populated by the I/O thread for REDIS_IOJOB_LOAD. */
564 off_t page; /* Swap page where to read/write the object */
565 off_t pages; /* Swap pages needed to save object. PREPARE_SWAP return val */
566 int canceled; /* True if this command was canceled by blocking side of VM */
567 pthread_t thread; /* ID of the thread processing this entry */
568 } iojob;
569
570 /*================================ Prototypes =============================== */
571
/* Forward declarations of file-local (static) helpers, so that they can be
 * used before their definitions appear later in this file. */
572 static void freeStringObject(robj *o);
573 static void freeListObject(robj *o);
574 static void freeSetObject(robj *o);
575 static void decrRefCount(void *o);
576 static robj *createObject(int type, void *ptr);
577 static void freeClient(redisClient *c);
578 static int rdbLoad(char *filename);
579 static void addReply(redisClient *c, robj *obj);
580 static void addReplySds(redisClient *c, sds s);
581 static void incrRefCount(robj *o);
582 static int rdbSaveBackground(char *filename);
583 static robj *createStringObject(char *ptr, size_t len);
584 static robj *dupStringObject(robj *o);
585 static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc);
586 static void replicationFeedMonitors(list *monitors, int dictid, robj **argv, int argc);
587 static void flushAppendOnlyFile(void);
588 static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc);
589 static int syncWithMaster(void);
590 static robj *tryObjectEncoding(robj *o);
591 static robj *getDecodedObject(robj *o);
592 static int removeExpire(redisDb *db, robj *key);
593 static int expireIfNeeded(redisDb *db, robj *key);
594 static int deleteIfVolatile(redisDb *db, robj *key);
595 static int dbDelete(redisDb *db, robj *key);
596 static time_t getExpire(redisDb *db, robj *key);
597 static int setExpire(redisDb *db, robj *key, time_t when);
598 static void updateSlavesWaitingBgsave(int bgsaveerr);
599 static void freeMemoryIfNeeded(void);
600 static int processCommand(redisClient *c);
601 static void setupSigSegvAction(void);
602 static void rdbRemoveTempFile(pid_t childpid);
603 static void aofRemoveTempFile(pid_t childpid);
604 static size_t stringObjectLen(robj *o);
605 static void processInputBuffer(redisClient *c);
606 static zskiplist *zslCreate(void);
607 static void zslFree(zskiplist *zsl);
608 static void zslInsert(zskiplist *zsl, double score, robj *obj);
609 static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask);
610 static void initClientMultiState(redisClient *c);
611 static void freeClientMultiState(redisClient *c);
612 static void queueMultiCommand(redisClient *c, struct redisCommand *cmd);
613 static void unblockClientWaitingData(redisClient *c);
614 static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele);
/* Virtual memory and threaded I/O helpers */
615 static void vmInit(void);
616 static void vmMarkPagesFree(off_t page, off_t count);
617 static robj *vmLoadObject(robj *o);
618 static robj *vmPreviewObject(robj *o);
619 static int vmSwapOneObjectBlocking(void);
620 static int vmSwapOneObjectThreaded(void);
621 static int vmCanSwapOut(void);
622 static int tryFreeOneObjectFromFreelist(void);
623 static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask);
624 static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata, int mask);
625 static void vmCancelThreadedIOJob(robj *o);
626 static void lockThreadedIO(void);
627 static void unlockThreadedIO(void);
628 static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db);
629 static void freeIOJob(iojob *j);
630 static void queueIOJob(iojob *j);
631 static int vmWriteObjectOnSwap(robj *o, off_t page);
632 static robj *vmReadObjectFromSwap(off_t page, int type);
633 static void waitEmptyIOJobsQueue(void);
634 static void vmReopenSwapFile(void);
635 static int vmFreePage(off_t page);
/* The next two have the redisVmPreloadProc signature */
636 static void zunionInterBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv);
637 static void execBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv);
638 static int blockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd);
639 static int dontWaitForSwappedKey(redisClient *c, robj *key);
640 static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key);
641 static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask);
642 static struct redisCommand *lookupCommand(char *name);
643 static void call(redisClient *c, struct redisCommand *cmd);
644 static void resetClient(redisClient *c);
645 static void convertToRealHash(robj *o);
646 static int pubsubUnsubscribeAllChannels(redisClient *c, int notify);
647 static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify);
648 static void freePubsubPattern(void *p);
649 static int listMatchPubsubPattern(void *a, void *b);
650 static int compareStringObjects(robj *a, robj *b);
651 static int equalStringObjects(robj *a, robj *b);
652 static void usage();
653 static int rewriteAppendOnlyFileBackground(void);
654 static vmpointer *vmSwapObjectBlocking(robj *val);
655 static int prepareForShutdown();
/* WATCH support helpers (see REDIS_DIRTY_CAS and the watched_keys fields) */
656 static void touchWatchedKey(redisDb *db, robj *key);
657 static void touchWatchedKeysOnFlush(int dbid);
658 static void unwatchAllKeys(redisClient *c);
659
/* Command implementations. Every function below has the redisCommandProc
 * signature (void fn(redisClient *c)), so it can be stored in the 'proc'
 * field of a struct redisCommand entry. */
660 static void authCommand(redisClient *c);
661 static void pingCommand(redisClient *c);
662 static void echoCommand(redisClient *c);
663 static void setCommand(redisClient *c);
664 static void setnxCommand(redisClient *c);
665 static void setexCommand(redisClient *c);
666 static void getCommand(redisClient *c);
667 static void delCommand(redisClient *c);
668 static void existsCommand(redisClient *c);
669 static void incrCommand(redisClient *c);
670 static void decrCommand(redisClient *c);
671 static void incrbyCommand(redisClient *c);
672 static void decrbyCommand(redisClient *c);
673 static void selectCommand(redisClient *c);
674 static void randomkeyCommand(redisClient *c);
675 static void keysCommand(redisClient *c);
676 static void dbsizeCommand(redisClient *c);
677 static void lastsaveCommand(redisClient *c);
678 static void saveCommand(redisClient *c);
679 static void bgsaveCommand(redisClient *c);
680 static void bgrewriteaofCommand(redisClient *c);
681 static void shutdownCommand(redisClient *c);
682 static void moveCommand(redisClient *c);
683 static void renameCommand(redisClient *c);
684 static void renamenxCommand(redisClient *c);
685 static void lpushCommand(redisClient *c);
686 static void rpushCommand(redisClient *c);
687 static void lpopCommand(redisClient *c);
688 static void rpopCommand(redisClient *c);
689 static void llenCommand(redisClient *c);
690 static void lindexCommand(redisClient *c);
691 static void lrangeCommand(redisClient *c);
692 static void ltrimCommand(redisClient *c);
693 static void typeCommand(redisClient *c);
694 static void lsetCommand(redisClient *c);
695 static void saddCommand(redisClient *c);
696 static void sremCommand(redisClient *c);
697 static void smoveCommand(redisClient *c);
698 static void sismemberCommand(redisClient *c);
699 static void scardCommand(redisClient *c);
700 static void spopCommand(redisClient *c);
701 static void srandmemberCommand(redisClient *c);
702 static void sinterCommand(redisClient *c);
703 static void sinterstoreCommand(redisClient *c);
704 static void sunionCommand(redisClient *c);
705 static void sunionstoreCommand(redisClient *c);
706 static void sdiffCommand(redisClient *c);
707 static void sdiffstoreCommand(redisClient *c);
708 static void syncCommand(redisClient *c);
709 static void flushdbCommand(redisClient *c);
710 static void flushallCommand(redisClient *c);
711 static void sortCommand(redisClient *c);
712 static void lremCommand(redisClient *c);
/* NOTE(review): lowercase 'c' in "command", unlike every other handler.
 * Left as is because the definition elsewhere in the file must match. */
713 static void rpoplpushcommand(redisClient *c);
714 static void infoCommand(redisClient *c);
715 static void mgetCommand(redisClient *c);
716 static void monitorCommand(redisClient *c);
717 static void expireCommand(redisClient *c);
718 static void expireatCommand(redisClient *c);
719 static void getsetCommand(redisClient *c);
720 static void ttlCommand(redisClient *c);
721 static void slaveofCommand(redisClient *c);
722 static void debugCommand(redisClient *c);
723 static void msetCommand(redisClient *c);
724 static void msetnxCommand(redisClient *c);
725 static void zaddCommand(redisClient *c);
726 static void zincrbyCommand(redisClient *c);
727 static void zrangeCommand(redisClient *c);
728 static void zrangebyscoreCommand(redisClient *c);
729 static void zcountCommand(redisClient *c);
730 static void zrevrangeCommand(redisClient *c);
731 static void zcardCommand(redisClient *c);
732 static void zremCommand(redisClient *c);
733 static void zscoreCommand(redisClient *c);
734 static void zremrangebyscoreCommand(redisClient *c);
735 static void multiCommand(redisClient *c);
736 static void execCommand(redisClient *c);
737 static void discardCommand(redisClient *c);
738 static void blpopCommand(redisClient *c);
739 static void brpopCommand(redisClient *c);
740 static void appendCommand(redisClient *c);
741 static void substrCommand(redisClient *c);
742 static void zrankCommand(redisClient *c);
743 static void zrevrankCommand(redisClient *c);
744 static void hsetCommand(redisClient *c);
745 static void hsetnxCommand(redisClient *c);
746 static void hgetCommand(redisClient *c);
747 static void hmsetCommand(redisClient *c);
748 static void hmgetCommand(redisClient *c);
749 static void hdelCommand(redisClient *c);
750 static void hlenCommand(redisClient *c);
751 static void zremrangebyrankCommand(redisClient *c);
752 static void zunionstoreCommand(redisClient *c);
753 static void zinterstoreCommand(redisClient *c);
754 static void hkeysCommand(redisClient *c);
755 static void hvalsCommand(redisClient *c);
756 static void hgetallCommand(redisClient *c);
757 static void hexistsCommand(redisClient *c);
758 static void configCommand(redisClient *c);
759 static void hincrbyCommand(redisClient *c);
760 static void subscribeCommand(redisClient *c);
761 static void unsubscribeCommand(redisClient *c);
762 static void psubscribeCommand(redisClient *c);
763 static void punsubscribeCommand(redisClient *c);
764 static void publishCommand(redisClient *c);
765 static void watchCommand(redisClient *c);
766 static void unwatchCommand(redisClient *c);
767
768 /*================================= Globals ================================= */
769
770 /* Global vars */
771 static struct redisServer server; /* server global state */
/* Pointer to the command table actually used for lookups. It is only declared
 * here; NOTE(review): presumably initialized at startup from
 * readonlyCommandTable -- confirm where it is assigned. */
772 static struct redisCommand *commandTable;
/* Static description of every command known to the server.
 *
 * Each entry lists, in order:
 *   1. the command name as typed by the client (lowercase),
 *   2. the handler function implementing it,
 *   3. an integer argument count; variadic commands such as "del" or "mget"
 *      use a negative value (NOTE(review): presumably N means "exactly N
 *      arguments including the command name" and -N "at least N" -- confirm
 *      against the code that validates the arity),
 *   4. a bitmask of REDIS_CMD_* flags (INLINE or BULK, optionally DENYOOM or
 *      FORCE_REPLICATION),
 *   5. an optional "...BlockClientOnSwappedKeys" callback, NULL for every
 *      command except zunionstore, zinterstore and exec,
 *   6-8. three integers (NOTE(review): these look like the first key index,
 *      last key index and key step used to locate key arguments, with -1
 *      meaning "up to the last argument" -- confirm against the definition of
 *      struct redisCommand).
 *
 * Note that "smembers" is served by sinterCommand. */
773 static struct redisCommand readonlyCommandTable[] = {
774 {"get",getCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
775 {"set",setCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
776 {"setnx",setnxCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
777 {"setex",setexCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
778 {"append",appendCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
779 {"substr",substrCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
780 {"del",delCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
781 {"exists",existsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
782 {"incr",incrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
783 {"decr",decrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
784 {"mget",mgetCommand,-2,REDIS_CMD_INLINE,NULL,1,-1,1},
785 {"rpush",rpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
786 {"lpush",lpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
787 {"rpop",rpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
788 {"lpop",lpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
789 {"brpop",brpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
790 {"blpop",blpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
791 {"llen",llenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
792 {"lindex",lindexCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
793 {"lset",lsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
794 {"lrange",lrangeCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
795 {"ltrim",ltrimCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
796 {"lrem",lremCommand,4,REDIS_CMD_BULK,NULL,1,1,1},
797 {"rpoplpush",rpoplpushcommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,2,1},
798 {"sadd",saddCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
799 {"srem",sremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
800 {"smove",smoveCommand,4,REDIS_CMD_BULK,NULL,1,2,1},
801 {"sismember",sismemberCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
802 {"scard",scardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
803 {"spop",spopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
804 {"srandmember",srandmemberCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
805 {"sinter",sinterCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
806 {"sinterstore",sinterstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
807 {"sunion",sunionCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
808 {"sunionstore",sunionstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
809 {"sdiff",sdiffCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
810 {"sdiffstore",sdiffstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
811 {"smembers",sinterCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
812 {"zadd",zaddCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
813 {"zincrby",zincrbyCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
814 {"zrem",zremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
815 {"zremrangebyscore",zremrangebyscoreCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
816 {"zremrangebyrank",zremrangebyrankCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
817 {"zunionstore",zunionstoreCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
818 {"zinterstore",zinterstoreCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
819 {"zrange",zrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
820 {"zrangebyscore",zrangebyscoreCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
821 {"zcount",zcountCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
822 {"zrevrange",zrevrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
823 {"zcard",zcardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
824 {"zscore",zscoreCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
825 {"zrank",zrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
826 {"zrevrank",zrevrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
827 {"hset",hsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
828 {"hsetnx",hsetnxCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
829 {"hget",hgetCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
830 {"hmset",hmsetCommand,-4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
831 {"hmget",hmgetCommand,-3,REDIS_CMD_BULK,NULL,1,1,1},
832 {"hincrby",hincrbyCommand,4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
833 {"hdel",hdelCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
834 {"hlen",hlenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
835 {"hkeys",hkeysCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
836 {"hvals",hvalsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
837 {"hgetall",hgetallCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
838 {"hexists",hexistsCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
839 {"incrby",incrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
840 {"decrby",decrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
841 {"getset",getsetCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
842 {"mset",msetCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
843 {"msetnx",msetnxCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
844 {"randomkey",randomkeyCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
845 {"select",selectCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
846 {"move",moveCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
847 {"rename",renameCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
848 {"renamenx",renamenxCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
849 {"expire",expireCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
850 {"expireat",expireatCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
851 {"keys",keysCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
852 {"dbsize",dbsizeCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
853 {"auth",authCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
854 {"ping",pingCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
855 {"echo",echoCommand,2,REDIS_CMD_BULK,NULL,0,0,0},
856 {"save",saveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
857 {"bgsave",bgsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
858 {"bgrewriteaof",bgrewriteaofCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
859 {"shutdown",shutdownCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
860 {"lastsave",lastsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
861 {"type",typeCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
862 {"multi",multiCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
863 {"exec",execCommand,1,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,execBlockClientOnSwappedKeys,0,0,0},
864 {"discard",discardCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
865 {"sync",syncCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
866 {"flushdb",flushdbCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
867 {"flushall",flushallCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
868 {"sort",sortCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
869 {"info",infoCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
870 {"monitor",monitorCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
871 {"ttl",ttlCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
872 {"slaveof",slaveofCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
873 {"debug",debugCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
874 {"config",configCommand,-2,REDIS_CMD_BULK,NULL,0,0,0},
875 {"subscribe",subscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
876 {"unsubscribe",unsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
877 {"psubscribe",psubscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
878 {"punsubscribe",punsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
879 {"publish",publishCommand,3,REDIS_CMD_BULK|REDIS_CMD_FORCE_REPLICATION,NULL,0,0,0},
880 {"watch",watchCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
881 {"unwatch",unwatchCommand,1,REDIS_CMD_INLINE,NULL,0,0,0}
882 };
883
884 /*============================ Utility functions ============================ */
885
/* Glob-style pattern matching over length-delimited buffers.
 *
 * Supported syntax:
 *   *        matches any sequence of characters (including none)
 *   ?        matches exactly one character
 *   [abc]    matches one character in the class; [^abc] negates the class,
 *            [a-z] is an inclusive range, and a backslash inside the class
 *            escapes the next character
 *   \x       matches the character x literally
 *
 * pattern/patternLen and string/stringLen describe the two buffers; neither
 * has to be NUL terminated because every access is bounded by its length.
 * When nocase is non zero, letters are compared case insensitively.
 *
 * Returns 1 if the whole string matches the whole pattern, 0 otherwise. */
static int stringmatchlen(const char *pattern, int patternLen,
        const char *string, int stringLen, int nocase)
{
    while (patternLen) {
        switch (pattern[0]) {
        case '*':
            /* Collapse runs of '*'; only look at pattern[1] if it exists. */
            while (patternLen > 1 && pattern[1] == '*') {
                pattern++;
                patternLen--;
            }
            if (patternLen == 1)
                return 1; /* trailing '*' matches everything left */
            /* Try to match the rest of the pattern at every string offset. */
            while (stringLen) {
                if (stringmatchlen(pattern+1, patternLen-1,
                                   string, stringLen, nocase))
                    return 1; /* match */
                string++;
                stringLen--;
            }
            return 0; /* no match */
        case '?':
            if (stringLen == 0)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        case '[':
        {
            int not, match;

            /* A class always consumes one character of the string. */
            if (stringLen == 0)
                return 0; /* no match */
            pattern++;
            patternLen--;
            not = patternLen > 0 && pattern[0] == '^';
            if (not) {
                pattern++;
                patternLen--;
            }
            match = 0;
            while (1) {
                if (patternLen >= 2 && pattern[0] == '\\') {
                    /* Escaped character: only when something follows the
                     * backslash, otherwise it is handled as a literal. */
                    pattern++;
                    patternLen--;
                    if (pattern[0] == string[0])
                        match = 1;
                } else if (patternLen == 0) {
                    /* Unterminated class: step back so that the common
                     * "pattern++" after the switch lands on the end. */
                    pattern--;
                    patternLen++;
                    break;
                } else if (pattern[0] == ']') {
                    break;
                } else if (patternLen >= 3 && pattern[1] == '-') {
                    int start = pattern[0];
                    int end = pattern[2];
                    int c = string[0];
                    if (start > end) {
                        int t = start;
                        start = end;
                        end = t;
                    }
                    if (nocase) {
                        start = tolower(start);
                        end = tolower(end);
                        c = tolower(c);
                    }
                    pattern += 2;
                    patternLen -= 2;
                    if (c >= start && c <= end)
                        match = 1;
                } else {
                    if (!nocase) {
                        if (pattern[0] == string[0])
                            match = 1;
                    } else {
                        if (tolower((int)pattern[0]) == tolower((int)string[0]))
                            match = 1;
                    }
                }
                pattern++;
                patternLen--;
            }
            if (not)
                match = !match;
            if (!match)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        }
        case '\\':
            /* Skip the backslash unless it is the last pattern character. */
            if (patternLen >= 2) {
                pattern++;
                patternLen--;
            }
            /* fall through */
        default:
            if (stringLen == 0)
                return 0; /* nothing left to compare against */
            if (!nocase) {
                if (pattern[0] != string[0])
                    return 0; /* no match */
            } else {
                if (tolower((int)pattern[0]) != tolower((int)string[0]))
                    return 0; /* no match */
            }
            string++;
            stringLen--;
            break;
        }
        pattern++;
        patternLen--;
        if (stringLen == 0) {
            /* String consumed: only trailing '*' may remain in the pattern. */
            while (patternLen && *pattern == '*') {
                pattern++;
                patternLen--;
            }
            break;
        }
    }
    if (patternLen == 0 && stringLen == 0)
        return 1;
    return 0;
}
1008
/* Glob-style match between two NUL terminated C strings: both lengths are
 * measured with strlen() and the work is delegated to stringmatchlen().
 * Returns 1 on match, 0 otherwise. */
static int stringmatch(const char *pattern, const char *string, int nocase) {
    int patternLen = strlen(pattern);
    int stringLen = strlen(string);

    return stringmatchlen(pattern, patternLen, string, stringLen, nocase);
}
1012
/* Convert a string representing an amount of memory into the number of
 * bytes, so for instance memtoll("1gb") returns 1073741824, that is
 * (1024*1024*1024).
 *
 * The string is an optional '-' sign, decimal digits and an optional unit
 * (matched case insensitively):
 *
 *   (none), b -> 1
 *   k         -> 1000            kb -> 1024
 *   m         -> 1000*1000       mb -> 1024*1024
 *   g         -> 1000*1000*1000  gb -> 1024*1024*1024
 *
 * If err is not NULL, *err is set to 1 on parsing error and to 0 otherwise.
 * An unknown unit is a parsing error: the number is then returned as if no
 * unit was given. When the digits do not fit the internal buffer, or the
 * result does not fit a long long, the error is flagged and the value is
 * saturated to LLONG_MAX (LLONG_MIN for negative overflow). */
static long long memtoll(const char *p, int *err) {
    const char *u;
    char buf[128];
    long mul; /* unit multiplier */
    long long val;
    size_t digits;

    if (err) *err = 0;
    /* Search the first non digit character. The cast is needed because
     * passing a negative char value to isdigit() is undefined. */
    u = p;
    if (*u == '-') u++;
    while (*u && isdigit((unsigned char)*u)) u++;
    if (*u == '\0' || !strcasecmp(u,"b")) {
        mul = 1;
    } else if (!strcasecmp(u,"k")) {
        mul = 1000;
    } else if (!strcasecmp(u,"kb")) {
        mul = 1024;
    } else if (!strcasecmp(u,"m")) {
        mul = 1000*1000;
    } else if (!strcasecmp(u,"mb")) {
        mul = 1024*1024;
    } else if (!strcasecmp(u,"g")) {
        mul = 1000L*1000*1000;
    } else if (!strcasecmp(u,"gb")) {
        mul = 1024L*1024*1024;
    } else {
        if (err) *err = 1;
        mul = 1;
    }
    digits = (size_t)(u-p);
    if (digits >= sizeof(buf)) {
        if (err) *err = 1;
        return LLONG_MAX;
    }
    memcpy(buf,p,digits);
    buf[digits] = '\0';
    val = strtoll(buf,NULL,10);
    /* mul is always >= 1: saturate instead of overflowing the product. */
    if (val > LLONG_MAX/mul) {
        if (err) *err = 1;
        return LLONG_MAX;
    }
    if (val < LLONG_MIN/mul) {
        if (err) *err = 1;
        return LLONG_MIN;
    }
    return val*mul;
}
1059
/* Convert a long long into its decimal string representation.
 *
 * At most len-1 characters are stored into s, followed by a nul terminator,
 * so the output is truncated (keeping the leading characters) when the
 * buffer is too small. Returns the number of characters stored, not counting
 * the terminator; nothing is written and 0 is returned when len is 0. */
static int ll2string(char *s, size_t len, long long value) {
    char buf[32], *p;
    unsigned long long v;
    size_t l;

    if (len == 0) return 0;
    /* Take the absolute value in unsigned arithmetic: negating LLONG_MIN as
     * a signed value would overflow. */
    v = (value < 0) ? 0ULL - (unsigned long long)value
                    : (unsigned long long)value;
    p = buf+31; /* point to the last character */
    /* Emit the digits backward, least significant first. */
    do {
        *p-- = '0'+(v%10);
        v /= 10;
    } while(v);
    if (value < 0) *p-- = '-';
    p++;
    l = 32-(p-buf);
    if (l+1 > len) l = len-1; /* Make sure it fits, including the nul term */
    memcpy(s,p,l);
    s[l] = '\0';
    return l;
}
1083
1084 static void redisLog(int level, const char *fmt, ...) {
1085 va_list ap;
1086 FILE *fp;
1087
1088 fp = (server.logfile == NULL) ? stdout : fopen(server.logfile,"a");
1089 if (!fp) return;
1090
1091 va_start(ap, fmt);
1092 if (level >= server.verbosity) {
1093 char *c = ".-*#";
1094 char buf[64];
1095 time_t now;
1096
1097 now = time(NULL);
1098 strftime(buf,64,"%d %b %H:%M:%S",localtime(&now));
1099 fprintf(fp,"[%d] %s %c ",(int)getpid(),buf,c[level]);
1100 vfprintf(fp, fmt, ap);
1101 fprintf(fp,"\n");
1102 fflush(fp);
1103 }
1104 va_end(ap);
1105
1106 if (server.logfile) fclose(fp);
1107 }
1108
1109 /*====================== Hash table type implementation ==================== */
1110
1111 /* This is a hash table type that uses the SDS dynamic strings library as
1112 * keys and Redis objects as values (objects can hold SDS strings,
1113 * lists, sets). */
1114
/* Dict destructor callback releasing a value with zfree(); privdata is
 * ignored. */
static void dictVanillaFree(void *privdata, void *val) {
    DICT_NOTUSED(privdata);
    zfree(val);
}
1120
1121 static void dictListDestructor(void *privdata, void *val)
1122 {
1123 DICT_NOTUSED(privdata);
1124 listRelease((list*)val);
1125 }
1126
1127 static int dictSdsKeyCompare(void *privdata, const void *key1,
1128 const void *key2)
1129 {
1130 int l1,l2;
1131 DICT_NOTUSED(privdata);
1132
1133 l1 = sdslen((sds)key1);
1134 l2 = sdslen((sds)key2);
1135 if (l1 != l2) return 0;
1136 return memcmp(key1, key2, l1) == 0;
1137 }
1138
/* Dict destructor callback for redis objects: drops one reference with
 * decrRefCount(). NULL is accepted and ignored, since values of swapped out
 * keys are set to NULL. privdata is ignored. */
static void dictRedisObjectDestructor(void *privdata, void *val) {
    DICT_NOTUSED(privdata);

    if (val != NULL) decrRefCount(val);
}
1146
/* Dict destructor callback for sds strings: the string is released with
 * sdsfree(); privdata is ignored. */
static void dictSdsDestructor(void *privdata, void *val) {
    DICT_NOTUSED(privdata);
    sdsfree(val);
}
1153
1154 static int dictObjKeyCompare(void *privdata, const void *key1,
1155 const void *key2)
1156 {
1157 const robj *o1 = key1, *o2 = key2;
1158 return dictSdsKeyCompare(privdata,o1->ptr,o2->ptr);
1159 }
1160
1161 static unsigned int dictObjHash(const void *key) {
1162 const robj *o = key;
1163 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1164 }
1165
/* Dict hash callback for keys that are sds strings: the bytes of the string
 * are hashed with dictGenHashFunction(). */
static unsigned int dictSdsHash(const void *key) {
    return dictGenHashFunction((unsigned char*)key, sdslen((char*)key));
}
1169
1170 static int dictEncObjKeyCompare(void *privdata, const void *key1,
1171 const void *key2)
1172 {
1173 robj *o1 = (robj*) key1, *o2 = (robj*) key2;
1174 int cmp;
1175
1176 if (o1->encoding == REDIS_ENCODING_INT &&
1177 o2->encoding == REDIS_ENCODING_INT)
1178 return o1->ptr == o2->ptr;
1179
1180 o1 = getDecodedObject(o1);
1181 o2 = getDecodedObject(o2);
1182 cmp = dictSdsKeyCompare(privdata,o1->ptr,o2->ptr);
1183 decrRefCount(o1);
1184 decrRefCount(o2);
1185 return cmp;
1186 }
1187
1188 static unsigned int dictEncObjHash(const void *key) {
1189 robj *o = (robj*) key;
1190
1191 if (o->encoding == REDIS_ENCODING_RAW) {
1192 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1193 } else {
1194 if (o->encoding == REDIS_ENCODING_INT) {
1195 char buf[32];
1196 int len;
1197
1198 len = ll2string(buf,32,(long)o->ptr);
1199 return dictGenHashFunction((unsigned char*)buf, len);
1200 } else {
1201 unsigned int hash;
1202
1203 o = getDecodedObject(o);
1204 hash = dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1205 decrRefCount(o);
1206 return hash;
1207 }
1208 }
1209 }
1210
1211 /* Sets type: the elements are the dict keys. Keys are redis objects that
 * may be encoded, hashed with dictEncObjHash() and compared with
 * dictEncObjKeyCompare(); a key is released with decrRefCount() when its
 * entry is destroyed. No value destructor is installed. */
1212 static dictType setDictType = {
1213 dictEncObjHash, /* hash function */
1214 NULL, /* key dup */
1215 NULL, /* val dup */
1216 dictEncObjKeyCompare, /* key compare */
1217 dictRedisObjectDestructor, /* key destructor */
1218 NULL /* val destructor */
1219 };
1220
1221 /* Sorted sets hash (note: a skiplist is used in addition to the hash table).
 * Keys are redis objects that may be encoded, handled like in setDictType.
 * Values are released with dictVanillaFree(), that is zfree(): as noted
 * below each value is a heap allocated double. */
1222 static dictType zsetDictType = {
1223 dictEncObjHash, /* hash function */
1224 NULL, /* key dup */
1225 NULL, /* val dup */
1226 dictEncObjKeyCompare, /* key compare */
1227 dictRedisObjectDestructor, /* key destructor */
1228 dictVanillaFree /* val destructor of malloc(sizeof(double)) */
1229 };
1230
1231 /* Db->dict, keys are sds strings, vals are Redis objects.
 * Keys are hashed with dictSdsHash(), compared by length and content with
 * dictSdsKeyCompare() and released with sdsfree(). Values are released with
 * decrRefCount(); a NULL value (swapped out key) is skipped. */
1232 static dictType dbDictType = {
1233 dictSdsHash, /* hash function */
1234 NULL, /* key dup */
1235 NULL, /* val dup */
1236 dictSdsKeyCompare, /* key compare */
1237 dictSdsDestructor, /* key destructor */
1238 dictRedisObjectDestructor /* val destructor */
1239 };
1240
1241 /* Db->expires: keys are sds strings handled like in dbDictType. No value
 * destructor is installed, so values are not owned by the dict.
 * NOTE(review): keys are released with sdsfree(), so presumably every entry
 * owns its own copy of the key rather than sharing the sds stored in
 * Db->dict -- confirm against the code adding entries to Db->expires. */
1242 static dictType keyptrDictType = {
1243 dictSdsHash, /* hash function */
1244 NULL, /* key dup */
1245 NULL, /* val dup */
1246 dictSdsKeyCompare, /* key compare */
1247 dictSdsDestructor, /* key destructor */
1248 NULL /* val destructor */
1249 };
1250
1251 /* Hash type hash table (note that small hashes are represented with zipmaps).
 * Both fields (keys) and values are redis objects released with
 * decrRefCount(); keys may be encoded and are hashed and compared with
 * dictEncObjHash() and dictEncObjKeyCompare(). */
1252 static dictType hashDictType = {
1253 dictEncObjHash, /* hash function */
1254 NULL, /* key dup */
1255 NULL, /* val dup */
1256 dictEncObjKeyCompare, /* key compare */
1257 dictRedisObjectDestructor, /* key destructor */
1258 dictRedisObjectDestructor /* val destructor */
1259 };
1260
1261 /* Keylist hash table type has unencoded redis objects as keys and
1262 * lists as values. It's used for blocking operations (BLPOP) and to
1263 * map swapped keys to a list of clients waiting for these keys to be loaded.
 * Keys are hashed and compared through their sds string (dictObjHash() and
 * dictObjKeyCompare()) and released with decrRefCount(); values are released
 * with listRelease(). */
1264 static dictType keylistDictType = {
1265 dictObjHash, /* hash function */
1266 NULL, /* key dup */
1267 NULL, /* val dup */
1268 dictObjKeyCompare, /* key compare */
1269 dictRedisObjectDestructor, /* key destructor */
1270 dictListDestructor /* val destructor */
1271 };
1272
/* Forward declaration of version(), defined later in this file. Declared
 * with (void) so that it is a real prototype taking no arguments. */
static void version(void);
1274
1275 /* ========================= Random utility functions ======================= */
1276
1277 /* Redis generally does not try to recover from out of memory conditions
1278 * when allocating objects or strings, it is not clear if it will be possible
1279 * to report this condition to the client since the networking layer itself
1280 * is based on heap allocation for send buffers, so we simply abort.
1281 * At least the code will be simpler to read... */
1282 static void oom(const char *msg) {
1283 redisLog(REDIS_WARNING, "%s: Out of memory\n",msg);
1284 sleep(1);
1285 abort();
1286 }
1287
1288 /* ====================== Redis server networking stuff ===================== */
1289 static void closeTimedoutClients(void) {
1290 redisClient *c;
1291 listNode *ln;
1292 time_t now = time(NULL);
1293 listIter li;
1294
1295 listRewind(server.clients,&li);
1296 while ((ln = listNext(&li)) != NULL) {
1297 c = listNodeValue(ln);
1298 if (server.maxidletime &&
1299 !(c->flags & REDIS_SLAVE) && /* no timeout for slaves */
1300 !(c->flags & REDIS_MASTER) && /* no timeout for masters */
1301 dictSize(c->pubsub_channels) == 0 && /* no timeout for pubsub */
1302 listLength(c->pubsub_patterns) == 0 &&
1303 (now - c->lastinteraction > server.maxidletime))
1304 {
1305 redisLog(REDIS_VERBOSE,"Closing idle client");
1306 freeClient(c);
1307 } else if (c->flags & REDIS_BLOCKED) {
1308 if (c->blockingto != 0 && c->blockingto < now) {
1309 addReply(c,shared.nullmultibulk);
1310 unblockClientWaitingData(c);
1311 }
1312 }
1313 }
1314 }
1315
1316 static int htNeedsResize(dict *dict) {
1317 long long size, used;
1318
1319 size = dictSlots(dict);
1320 used = dictSize(dict);
1321 return (size && used && size > DICT_HT_INITIAL_SIZE &&
1322 (used*100/size < REDIS_HT_MINFILL));
1323 }
1324
1325 /* If the percentage of used slots in the HT reaches REDIS_HT_MINFILL
1326 * we resize the hash table to save memory */
1327 static void tryResizeHashTables(void) {
1328 int j;
1329
1330 for (j = 0; j < server.dbnum; j++) {
1331 if (htNeedsResize(server.db[j].dict))
1332 dictResize(server.db[j].dict);
1333 if (htNeedsResize(server.db[j].expires))
1334 dictResize(server.db[j].expires);
1335 }
1336 }
1337
1338 /* Our hash table implementation performs rehashing incrementally while
1339 * we write/read from the hash table. Still if the server is idle, the hash
1340 * table will use two tables for a long time. So we try to use 1 millisecond
1341 * of CPU time at every serverCron() loop in order to rehash some key. */
1342 static void incrementallyRehash(void) {
1343 int j;
1344
1345 for (j = 0; j < server.dbnum; j++) {
1346 if (dictIsRehashing(server.db[j].dict)) {
1347 dictRehashMilliseconds(server.db[j].dict,1);
1348 break; /* already used our millisecond for this loop... */
1349 }
1350 }
1351 }
1352
1353 /* A background saving child (BGSAVE) terminated its work. Handle this. */
1354 void backgroundSaveDoneHandler(int statloc) {
1355 int exitcode = WEXITSTATUS(statloc);
1356 int bysignal = WIFSIGNALED(statloc);
1357
1358 if (!bysignal && exitcode == 0) {
1359 redisLog(REDIS_NOTICE,
1360 "Background saving terminated with success");
1361 server.dirty = 0;
1362 server.lastsave = time(NULL);
1363 } else if (!bysignal && exitcode != 0) {
1364 redisLog(REDIS_WARNING, "Background saving error");
1365 } else {
1366 redisLog(REDIS_WARNING,
1367 "Background saving terminated by signal %d", WTERMSIG(statloc));
1368 rdbRemoveTempFile(server.bgsavechildpid);
1369 }
1370 server.bgsavechildpid = -1;
1371 /* Possibly there are slaves waiting for a BGSAVE in order to be served
1372 * (the first stage of SYNC is a bulk transfer of dump.rdb) */
1373 updateSlavesWaitingBgsave(exitcode == 0 ? REDIS_OK : REDIS_ERR);
1374 }
1375
1376 /* A background append only file rewriting (BGREWRITEAOF) terminated its work.
1377 * Handle this. */
1378 void backgroundRewriteDoneHandler(int statloc) {
1379 int exitcode = WEXITSTATUS(statloc);
1380 int bysignal = WIFSIGNALED(statloc);
1381
1382 if (!bysignal && exitcode == 0) {
1383 int fd;
1384 char tmpfile[256];
1385
1386 redisLog(REDIS_NOTICE,
1387 "Background append only file rewriting terminated with success");
1388 /* Now it's time to flush the differences accumulated by the parent */
1389 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) server.bgrewritechildpid);
1390 fd = open(tmpfile,O_WRONLY|O_APPEND);
1391 if (fd == -1) {
1392 redisLog(REDIS_WARNING, "Not able to open the temp append only file produced by the child: %s", strerror(errno));
1393 goto cleanup;
1394 }
1395 /* Flush our data... */
1396 if (write(fd,server.bgrewritebuf,sdslen(server.bgrewritebuf)) !=
1397 (signed) sdslen(server.bgrewritebuf)) {
1398 redisLog(REDIS_WARNING, "Error or short write trying to flush the parent diff of the append log file in the child temp file: %s", strerror(errno));
1399 close(fd);
1400 goto cleanup;
1401 }
1402 redisLog(REDIS_NOTICE,"Parent diff flushed into the new append log file with success (%lu bytes)",sdslen(server.bgrewritebuf));
1403 /* Now our work is to rename the temp file into the stable file. And
1404 * switch the file descriptor used by the server for append only. */
1405 if (rename(tmpfile,server.appendfilename) == -1) {
1406 redisLog(REDIS_WARNING,"Can't rename the temp append only file into the stable one: %s", strerror(errno));
1407 close(fd);
1408 goto cleanup;
1409 }
1410 /* Mission completed... almost */
1411 redisLog(REDIS_NOTICE,"Append only file successfully rewritten.");
1412 if (server.appendfd != -1) {
1413 /* If append only is actually enabled... */
1414 close(server.appendfd);
1415 server.appendfd = fd;
1416 if (server.appendfsync != APPENDFSYNC_NO) aof_fsync(fd);
1417 server.appendseldb = -1; /* Make sure it will issue SELECT */
1418 redisLog(REDIS_NOTICE,"The new append only file was selected for future appends.");
1419 } else {
1420 /* If append only is disabled we just generate a dump in this
1421 * format. Why not? */
1422 close(fd);
1423 }
1424 } else if (!bysignal && exitcode != 0) {
1425 redisLog(REDIS_WARNING, "Background append only file rewriting error");
1426 } else {
1427 redisLog(REDIS_WARNING,
1428 "Background append only file rewriting terminated by signal %d",
1429 WTERMSIG(statloc));
1430 }
1431 cleanup:
1432 sdsfree(server.bgrewritebuf);
1433 server.bgrewritebuf = sdsempty();
1434 aofRemoveTempFile(server.bgrewritechildpid);
1435 server.bgrewritechildpid = -1;
1436 }
1437
1438 /* This function is called once a background process of some kind terminates,
1439 * as we want to avoid resizing the hash tables when there is a child in order
1440 * to play well with copy-on-write (otherwise when a resize happens lots of
1441 * memory pages are copied). The goal of this function is to update the ability
1442 * for dict.c to resize the hash tables accordingly to the fact we have o not
1443 * running childs. */
1444 static void updateDictResizePolicy(void) {
1445 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1)
1446 dictEnableResize();
1447 else
1448 dictDisableResize();
1449 }
1450
/* Periodic housekeeping function of the server, written as a time event
 * handler (the three arguments are unused).
 *
 * In order, every call: refreshes the cached unix time and the LRU clock,
 * performs a pending shutdown, logs statistics, resizes / rehashes the
 * hash tables when no child is running, closes timed out clients, reaps a
 * terminated background child or starts a background save if a save point
 * is reached, expires some volatile keys, swaps objects out when over the
 * VM memory limit, and tries to connect to the master if needed.
 *
 * The "loops" counter (value of server.cronloops before the increment) is
 * used to run some of the tasks only once every N calls.
 *
 * Always returns 100. NOTE(review): presumably the number of milliseconds
 * after which the event loop calls this function again -- confirm against
 * the ae time event contract. */
1451 static int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
1452 int j, loops = server.cronloops++;
1453 REDIS_NOTUSED(eventLoop);
1454 REDIS_NOTUSED(id);
1455 REDIS_NOTUSED(clientData);
1456
1457 /* We take a cached value of the unix time in the global state because
1458 * with virtual memory and aging there is the need to store the current time
1459 * in objects at every object access, and accuracy is not needed.
1460 * To access a global var is faster than calling time(NULL) */
1461 server.unixtime = time(NULL);
1462 /* We have just 21 bits per object for LRU information.
1463 * So we use an (eventually wrapping) LRU clock with minutes resolution.
1464 *
1465 * When we need to select what object to swap, we compute the minimum
1466 * time distance between the current lruclock and the object last access
1467 * lruclock info. Even if clocks will wrap on overflow, there is
1468 * the interesting property that we are sure that at least
1469 * ABS(A-B) minutes passed between current time and timestamp B.
1470 *
1471 * This is not precise but we don't need at all precision, but just
1472 * something statistically reasonable.
1473 */
1474 server.lruclock = (time(NULL)/60)&((1<<21)-1);
1475
1476 /* We received a SIGTERM, shutting down here in a safe way, as it is
1477 * not ok doing so inside the signal handler. */
1478 if (server.shutdown_asap) {
1479 if (prepareForShutdown() == REDIS_OK) exit(0);
1480 redisLog(REDIS_WARNING,"SIGTERM received but errors trying to shut down the server, check the logs for more information");
1481 }
1482
1483 /* Show some info about non-empty databases (once every 50 calls) */
1484 for (j = 0; j < server.dbnum; j++) {
1485 long long size, used, vkeys;
1486
1487 size = dictSlots(server.db[j].dict);
1488 used = dictSize(server.db[j].dict);
1489 vkeys = dictSize(server.db[j].expires);
1490 if (!(loops % 50) && (used || vkeys)) {
1491 redisLog(REDIS_VERBOSE,"DB %d: %lld keys (%lld volatile) in %lld slots HT.",j,used,vkeys,size);
1492 /* dictPrintStats(server.dict); */
1493 }
1494 }
1495
1496 /* We don't want to resize the hash tables while a background saving
1497 * is in progress: the saving child is created using fork() that is
1498 * implemented with a copy-on-write semantic in most modern systems, so
1499 * if we resize the HT while there is the saving child at work actually
1500 * a lot of memory movements in the parent will cause a lot of pages
1501 * copied. */
1502 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1) {
1503 if (!(loops % 10)) tryResizeHashTables();
1504 if (server.activerehashing) incrementallyRehash();
1505 }
1506
1507 /* Show information about connected clients (once every 50 calls) */
1508 if (!(loops % 50)) {
1509 redisLog(REDIS_VERBOSE,"%d clients connected (%d slaves), %zu bytes in use",
1510 listLength(server.clients)-listLength(server.slaves),
1511 listLength(server.slaves),
1512 zmalloc_used_memory());
1513 }
1514
1515 /* Close connections of timedout clients. The scan runs once every 100
 * calls when a max idle time is set, and at every call while
 * server.blpop_blocked_clients is non zero. */
1516 if ((server.maxidletime && !(loops % 100)) || server.blpop_blocked_clients)
1517 closeTimedoutClients();
1518
1519 /* Check if a background saving or AOF rewrite in progress terminated */
1520 if (server.bgsavechildpid != -1 || server.bgrewritechildpid != -1) {
1521 int statloc;
1522 pid_t pid;
1523
/* Non blocking wait: 0 means that no child changed state yet. Any non zero
 * pid different from the BGSAVE child is handled as the AOF rewrite child.
 * NOTE(review): this includes -1, returned by wait3() on error -- confirm
 * this is intended. */
1524 if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) {
1525 if (pid == server.bgsavechildpid) {
1526 backgroundSaveDoneHandler(statloc);
1527 } else {
1528 backgroundRewriteDoneHandler(statloc);
1529 }
1530 updateDictResizePolicy();
1531 }
1532 } else {
1533 /* If there is not a background saving in progress check if
1534 * we have to save now: the first save point with enough changes and
 * enough elapsed seconds triggers a background save. */
1535 time_t now = time(NULL);
1536 for (j = 0; j < server.saveparamslen; j++) {
1537 struct saveparam *sp = server.saveparams+j;
1538
1539 if (server.dirty >= sp->changes &&
1540 now-server.lastsave > sp->seconds) {
1541 redisLog(REDIS_NOTICE,"%d changes in %d seconds. Saving...",
1542 sp->changes, sp->seconds);
1543 rdbSaveBackground(server.dbfilename);
1544 break;
1545 }
1546 }
1547 }
1548
1549 /* Try to expire a few timed out keys. The algorithm used is adaptive and
1550 * will use few CPU cycles if there are few expiring keys, otherwise
1551 * it will get more aggressive to avoid that too much memory is used by
1552 * keys that can be removed from the keyspace. */
1553 for (j = 0; j < server.dbnum; j++) {
1554 int expired;
1555 redisDb *db = server.db+j;
1556
1557 /* Continue to expire if at the end of the cycle more than 25%
1558 * of the keys were expired. */
1559 do {
1560 long num = dictSize(db->expires);
1561 time_t now = time(NULL);
1562
1563 expired = 0;
1564 if (num > REDIS_EXPIRELOOKUPS_PER_CRON)
1565 num = REDIS_EXPIRELOOKUPS_PER_CRON;
/* Sample up to num random keys with an expire set: the entry value is read
 * back as the expire time, and keys already expired are deleted. */
1566 while (num--) {
1567 dictEntry *de;
1568 time_t t;
1569
1570 if ((de = dictGetRandomKey(db->expires)) == NULL) break;
1571 t = (time_t) dictGetEntryVal(de);
1572 if (now > t) {
1573 sds key = dictGetEntryKey(de);
1574 robj *keyobj = createStringObject(key,sdslen(key));
1575
1576 dbDelete(db,keyobj);
1577 decrRefCount(keyobj);
1578 expired++;
1579 server.stat_expiredkeys++;
1580 }
1581 }
1582 } while (expired > REDIS_EXPIRELOOKUPS_PER_CRON/4);
1583 }
1584
1585 /* Swap a few keys on disk if we are over the memory limit and VM
1586 * is enabled. Try to free objects from the free list first. */
1587 if (vmCanSwapOut()) {
1588 while (server.vm_enabled && zmalloc_used_memory() >
1589 server.vm_max_memory)
1590 {
1591 int retval;
1592
1593 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
1594 retval = (server.vm_max_threads == 0) ?
1595 vmSwapOneObjectBlocking() :
1596 vmSwapOneObjectThreaded();
1597 if (retval == REDIS_ERR && !(loops % 300) &&
1598 zmalloc_used_memory() >
1599 (server.vm_max_memory+server.vm_max_memory/10))
1600 {
1601 redisLog(REDIS_WARNING,"WARNING: vm-max-memory limit exceeded by more than 10%% but unable to swap more objects out!");
1602 }
1603 /* Note that when using threaded I/O we free just one object,
1604 * because anyway when the I/O thread in charge to swap this
1605 * object out will finish, the handler of completed jobs
1606 * will try to swap more objects if we are still out of memory. */
1607 if (retval == REDIS_ERR || server.vm_max_threads > 0) break;
1608 }
1609 }
1610
1611 /* Check if we should connect to a MASTER (once every 10 calls) */
1612 if (server.replstate == REDIS_REPL_CONNECT && !(loops % 10)) {
1613 redisLog(REDIS_NOTICE,"Connecting to MASTER...");
1614 if (syncWithMaster() == REDIS_OK) {
1615 redisLog(REDIS_NOTICE,"MASTER <-> SLAVE sync succeeded");
1616 if (server.appendonly) rewriteAppendOnlyFileBackground();
1617 }
1618 }
1619 return 100;
1620 }
1621
1622 /* This function gets called every time Redis is entering the
1623 * main loop of the event driven library, that is, before to sleep
1624 * for ready file descriptors. */
1625 static void beforeSleep(struct aeEventLoop *eventLoop) {
1626 REDIS_NOTUSED(eventLoop);
1627
1628 /* Awake clients that got all the swapped keys they requested */
1629 if (server.vm_enabled && listLength(server.io_ready_clients)) {
1630 listIter li;
1631 listNode *ln;
1632
1633 listRewind(server.io_ready_clients,&li);
1634 while((ln = listNext(&li))) {
1635 redisClient *c = ln->value;
1636 struct redisCommand *cmd;
1637
1638 /* Resume the client. */
1639 listDelNode(server.io_ready_clients,ln);
1640 c->flags &= (~REDIS_IO_WAIT);
1641 server.vm_blocked_clients--;
1642 aeCreateFileEvent(server.el, c->fd, AE_READABLE,
1643 readQueryFromClient, c);
1644 cmd = lookupCommand(c->argv[0]->ptr);
1645 assert(cmd != NULL);
1646 call(c,cmd);
1647 resetClient(c);
1648 /* There may be more data to process in the input buffer. */
1649 if (c->querybuf && sdslen(c->querybuf) > 0)
1650 processInputBuffer(c);
1651 }
1652 }
1653 /* Write the AOF buffer on disk */
1654 flushAppendOnlyFile();
1655 }
1656
1657 static void createSharedObjects(void) {
1658 int j;
1659
1660 shared.crlf = createObject(REDIS_STRING,sdsnew("\r\n"));
1661 shared.ok = createObject(REDIS_STRING,sdsnew("+OK\r\n"));
1662 shared.err = createObject(REDIS_STRING,sdsnew("-ERR\r\n"));
1663 shared.emptybulk = createObject(REDIS_STRING,sdsnew("$0\r\n\r\n"));
1664 shared.czero = createObject(REDIS_STRING,sdsnew(":0\r\n"));
1665 shared.cone = createObject(REDIS_STRING,sdsnew(":1\r\n"));
1666 shared.nullbulk = createObject(REDIS_STRING,sdsnew("$-1\r\n"));
1667 shared.nullmultibulk = createObject(REDIS_STRING,sdsnew("*-1\r\n"));
1668 shared.emptymultibulk = createObject(REDIS_STRING,sdsnew("*0\r\n"));
1669 shared.pong = createObject(REDIS_STRING,sdsnew("+PONG\r\n"));
1670 shared.queued = createObject(REDIS_STRING,sdsnew("+QUEUED\r\n"));
1671 shared.wrongtypeerr = createObject(REDIS_STRING,sdsnew(
1672 "-ERR Operation against a key holding the wrong kind of value\r\n"));
1673 shared.nokeyerr = createObject(REDIS_STRING,sdsnew(
1674 "-ERR no such key\r\n"));
1675 shared.syntaxerr = createObject(REDIS_STRING,sdsnew(
1676 "-ERR syntax error\r\n"));
1677 shared.sameobjecterr = createObject(REDIS_STRING,sdsnew(
1678 "-ERR source and destination objects are the same\r\n"));
1679 shared.outofrangeerr = createObject(REDIS_STRING,sdsnew(
1680 "-ERR index out of range\r\n"));
1681 shared.space = createObject(REDIS_STRING,sdsnew(" "));
1682 shared.colon = createObject(REDIS_STRING,sdsnew(":"));
1683 shared.plus = createObject(REDIS_STRING,sdsnew("+"));
1684 shared.select0 = createStringObject("select 0\r\n",10);
1685 shared.select1 = createStringObject("select 1\r\n",10);
1686 shared.select2 = createStringObject("select 2\r\n",10);
1687 shared.select3 = createStringObject("select 3\r\n",10);
1688 shared.select4 = createStringObject("select 4\r\n",10);
1689 shared.select5 = createStringObject("select 5\r\n",10);
1690 shared.select6 = createStringObject("select 6\r\n",10);
1691 shared.select7 = createStringObject("select 7\r\n",10);
1692 shared.select8 = createStringObject("select 8\r\n",10);
1693 shared.select9 = createStringObject("select 9\r\n",10);
1694 shared.messagebulk = createStringObject("$7\r\nmessage\r\n",13);
1695 shared.pmessagebulk = createStringObject("$8\r\npmessage\r\n",14);
1696 shared.subscribebulk = createStringObject("$9\r\nsubscribe\r\n",15);
1697 shared.unsubscribebulk = createStringObject("$11\r\nunsubscribe\r\n",18);
1698 shared.psubscribebulk = createStringObject("$10\r\npsubscribe\r\n",17);
1699 shared.punsubscribebulk = createStringObject("$12\r\npunsubscribe\r\n",19);
1700 shared.mbulk3 = createStringObject("*3\r\n",4);
1701 shared.mbulk4 = createStringObject("*4\r\n",4);
1702 for (j = 0; j < REDIS_SHARED_INTEGERS; j++) {
1703 shared.integers[j] = createObject(REDIS_STRING,(void*)(long)j);
1704 shared.integers[j]->encoding = REDIS_ENCODING_INT;
1705 }
1706 }
1707
1708 static void appendServerSaveParams(time_t seconds, int changes) {
1709 server.saveparams = zrealloc(server.saveparams,sizeof(struct saveparam)*(server.saveparamslen+1));
1710 server.saveparams[server.saveparamslen].seconds = seconds;
1711 server.saveparams[server.saveparamslen].changes = changes;
1712 server.saveparamslen++;
1713 }
1714
1715 static void resetServerSaveParams() {
1716 zfree(server.saveparams);
1717 server.saveparams = NULL;
1718 server.saveparamslen = 0;
1719 }
1720
1721 static void initServerConfig() {
1722 server.dbnum = REDIS_DEFAULT_DBNUM;
1723 server.port = REDIS_SERVERPORT;
1724 server.verbosity = REDIS_VERBOSE;
1725 server.maxidletime = REDIS_MAXIDLETIME;
1726 server.saveparams = NULL;
1727 server.logfile = NULL; /* NULL = log on standard output */
1728 server.bindaddr = NULL;
1729 server.glueoutputbuf = 1;
1730 server.daemonize = 0;
1731 server.appendonly = 0;
1732 server.appendfsync = APPENDFSYNC_EVERYSEC;
1733 server.no_appendfsync_on_rewrite = 0;
1734 server.lastfsync = time(NULL);
1735 server.appendfd = -1;
1736 server.appendseldb = -1; /* Make sure the first time will not match */
1737 server.pidfile = zstrdup("/var/run/redis.pid");
1738 server.dbfilename = zstrdup("dump.rdb");
1739 server.appendfilename = zstrdup("appendonly.aof");
1740 server.requirepass = NULL;
1741 server.rdbcompression = 1;
1742 server.activerehashing = 1;
1743 server.maxclients = 0;
1744 server.blpop_blocked_clients = 0;
1745 server.maxmemory = 0;
1746 server.vm_enabled = 0;
1747 server.vm_swap_file = zstrdup("/tmp/redis-%p.vm");
1748 server.vm_page_size = 256; /* 256 bytes per page */
1749 server.vm_pages = 1024*1024*100; /* 104 millions of pages */
1750 server.vm_max_memory = 1024LL*1024*1024*1; /* 1 GB of RAM */
1751 server.vm_max_threads = 4;
1752 server.vm_blocked_clients = 0;
1753 server.hash_max_zipmap_entries = REDIS_HASH_MAX_ZIPMAP_ENTRIES;
1754 server.hash_max_zipmap_value = REDIS_HASH_MAX_ZIPMAP_VALUE;
1755 server.shutdown_asap = 0;
1756
1757 resetServerSaveParams();
1758
1759 appendServerSaveParams(60*60,1); /* save after 1 hour and 1 change */
1760 appendServerSaveParams(300,100); /* save after 5 minutes and 100 changes */
1761 appendServerSaveParams(60,10000); /* save after 1 minute and 10000 changes */
1762 /* Replication related */
1763 server.isslave = 0;
1764 server.masterauth = NULL;
1765 server.masterhost = NULL;
1766 server.masterport = 6379;
1767 server.master = NULL;
1768 server.replstate = REDIS_REPL_NONE;
1769
1770 /* Double constants initialization */
1771 R_Zero = 0.0;
1772 R_PosInf = 1.0/R_Zero;
1773 R_NegInf = -1.0/R_Zero;
1774 R_Nan = R_Zero/R_Zero;
1775 }
1776
1777 static void initServer() {
1778 int j;
1779
1780 signal(SIGHUP, SIG_IGN);
1781 signal(SIGPIPE, SIG_IGN);
1782 setupSigSegvAction();
1783
1784 server.devnull = fopen("/dev/null","w");
1785 if (server.devnull == NULL) {
1786 redisLog(REDIS_WARNING, "Can't open /dev/null: %s", server.neterr);
1787 exit(1);
1788 }
1789 server.clients = listCreate();
1790 server.slaves = listCreate();
1791 server.monitors = listCreate();
1792 server.objfreelist = listCreate();
1793 createSharedObjects();
1794 server.el = aeCreateEventLoop();
1795 server.db = zmalloc(sizeof(redisDb)*server.dbnum);
1796 server.fd = anetTcpServer(server.neterr, server.port, server.bindaddr);
1797 if (server.fd == -1) {
1798 redisLog(REDIS_WARNING, "Opening TCP port: %s", server.neterr);
1799 exit(1);
1800 }
1801 for (j = 0; j < server.dbnum; j++) {
1802 server.db[j].dict = dictCreate(&dbDictType,NULL);
1803 server.db[j].expires = dictCreate(&keyptrDictType,NULL);
1804 server.db[j].blocking_keys = dictCreate(&keylistDictType,NULL);
1805 server.db[j].watched_keys = dictCreate(&keylistDictType,NULL);
1806 if (server.vm_enabled)
1807 server.db[j].io_keys = dictCreate(&keylistDictType,NULL);
1808 server.db[j].id = j;
1809 }
1810 server.pubsub_channels = dictCreate(&keylistDictType,NULL);
1811 server.pubsub_patterns = listCreate();
1812 listSetFreeMethod(server.pubsub_patterns,freePubsubPattern);
1813 listSetMatchMethod(server.pubsub_patterns,listMatchPubsubPattern);
1814 server.cronloops = 0;
1815 server.bgsavechildpid = -1;
1816 server.bgrewritechildpid = -1;
1817 server.bgrewritebuf = sdsempty();
1818 server.aofbuf = sdsempty();
1819 server.lastsave = time(NULL);
1820 server.dirty = 0;
1821 server.stat_numcommands = 0;
1822 server.stat_numconnections = 0;
1823 server.stat_expiredkeys = 0;
1824 server.stat_starttime = time(NULL);
1825 server.unixtime = time(NULL);
1826 aeCreateTimeEvent(server.el, 1, serverCron, NULL, NULL);
1827 if (aeCreateFileEvent(server.el, server.fd, AE_READABLE,
1828 acceptHandler, NULL) == AE_ERR) oom("creating file event");
1829
1830 if (server.appendonly) {
1831 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
1832 if (server.appendfd == -1) {
1833 redisLog(REDIS_WARNING, "Can't open the append-only file: %s",
1834 strerror(errno));
1835 exit(1);
1836 }
1837 }
1838
1839 if (server.vm_enabled) vmInit();
1840 }
1841
1842 /* Empty the whole database */
1843 static long long emptyDb() {
1844 int j;
1845 long long removed = 0;
1846
1847 for (j = 0; j < server.dbnum; j++) {
1848 removed += dictSize(server.db[j].dict);
1849 dictEmpty(server.db[j].dict);
1850 dictEmpty(server.db[j].expires);
1851 }
1852 return removed;
1853 }
1854
/* Translate a "yes"/"no" string (compared case-insensitively) into a
 * boolean: 1 for "yes", 0 for "no", -1 for anything else. */
static int yesnotoi(char *s) {
    if (strcasecmp(s, "yes") == 0) {
        return 1;
    }
    if (strcasecmp(s, "no") == 0) {
        return 0;
    }
    return -1;
}
1860
1861 /* I agree, this is a very rudimental way to load a configuration...
1862 will improve later if the config gets more complex */
1863 static void loadServerConfig(char *filename) {
1864 FILE *fp;
1865 char buf[REDIS_CONFIGLINE_MAX+1], *err = NULL;
1866 int linenum = 0;
1867 sds line = NULL;
1868
1869 if (filename[0] == '-' && filename[1] == '\0')
1870 fp = stdin;
1871 else {
1872 if ((fp = fopen(filename,"r")) == NULL) {
1873 redisLog(REDIS_WARNING, "Fatal error, can't open config file '%s'", filename);
1874 exit(1);
1875 }
1876 }
1877
1878 while(fgets(buf,REDIS_CONFIGLINE_MAX+1,fp) != NULL) {
1879 sds *argv;
1880 int argc, j;
1881
1882 linenum++;
1883 line = sdsnew(buf);
1884 line = sdstrim(line," \t\r\n");
1885
1886 /* Skip comments and blank lines*/
1887 if (line[0] == '#' || line[0] == '\0') {
1888 sdsfree(line);
1889 continue;
1890 }
1891
1892 /* Split into arguments */
1893 argv = sdssplitlen(line,sdslen(line)," ",1,&argc);
1894 sdstolower(argv[0]);
1895
1896 /* Execute config directives */
1897 if (!strcasecmp(argv[0],"timeout") && argc == 2) {
1898 server.maxidletime = atoi(argv[1]);
1899 if (server.maxidletime < 0) {
1900 err = "Invalid timeout value"; goto loaderr;
1901 }
1902 } else if (!strcasecmp(argv[0],"port") && argc == 2) {
1903 server.port = atoi(argv[1]);
1904 if (server.port < 1 || server.port > 65535) {
1905 err = "Invalid port"; goto loaderr;
1906 }
1907 } else if (!strcasecmp(argv[0],"bind") && argc == 2) {
1908 server.bindaddr = zstrdup(argv[1]);
1909 } else if (!strcasecmp(argv[0],"save") && argc == 3) {
1910 int seconds = atoi(argv[1]);
1911 int changes = atoi(argv[2]);
1912 if (seconds < 1 || changes < 0) {
1913 err = "Invalid save parameters"; goto loaderr;
1914 }
1915 appendServerSaveParams(seconds,changes);
1916 } else if (!strcasecmp(argv[0],"dir") && argc == 2) {
1917 if (chdir(argv[1]) == -1) {
1918 redisLog(REDIS_WARNING,"Can't chdir to '%s': %s",
1919 argv[1], strerror(errno));
1920 exit(1);
1921 }
1922 } else if (!strcasecmp(argv[0],"loglevel") && argc == 2) {
1923 if (!strcasecmp(argv[1],"debug")) server.verbosity = REDIS_DEBUG;
1924 else if (!strcasecmp(argv[1],"verbose")) server.verbosity = REDIS_VERBOSE;
1925 else if (!strcasecmp(argv[1],"notice")) server.verbosity = REDIS_NOTICE;
1926 else if (!strcasecmp(argv[1],"warning")) server.verbosity = REDIS_WARNING;
1927 else {
1928 err = "Invalid log level. Must be one of debug, notice, warning";
1929 goto loaderr;
1930 }
1931 } else if (!strcasecmp(argv[0],"logfile") && argc == 2) {
1932 FILE *logfp;
1933
1934 server.logfile = zstrdup(argv[1]);
1935 if (!strcasecmp(server.logfile,"stdout")) {
1936 zfree(server.logfile);
1937 server.logfile = NULL;
1938 }
1939 if (server.logfile) {
1940 /* Test if we are able to open the file. The server will not
1941 * be able to abort just for this problem later... */
1942 logfp = fopen(server.logfile,"a");
1943 if (logfp == NULL) {
1944 err = sdscatprintf(sdsempty(),
1945 "Can't open the log file: %s", strerror(errno));
1946 goto loaderr;
1947 }
1948 fclose(logfp);
1949 }
1950 } else if (!strcasecmp(argv[0],"databases") && argc == 2) {
1951 server.dbnum = atoi(argv[1]);
1952 if (server.dbnum < 1) {
1953 err = "Invalid number of databases"; goto loaderr;
1954 }
1955 } else if (!strcasecmp(argv[0],"include") && argc == 2) {
1956 loadServerConfig(argv[1]);
1957 } else if (!strcasecmp(argv[0],"maxclients") && argc == 2) {
1958 server.maxclients = atoi(argv[1]);
1959 } else if (!strcasecmp(argv[0],"maxmemory") && argc == 2) {
1960 server.maxmemory = memtoll(argv[1],NULL);
1961 } else if (!strcasecmp(argv[0],"slaveof") && argc == 3) {
1962 server.masterhost = sdsnew(argv[1]);
1963 server.masterport = atoi(argv[2]);
1964 server.replstate = REDIS_REPL_CONNECT;
1965 } else if (!strcasecmp(argv[0],"masterauth") && argc == 2) {
1966 server.masterauth = zstrdup(argv[1]);
1967 } else if (!strcasecmp(argv[0],"glueoutputbuf") && argc == 2) {
1968 if ((server.glueoutputbuf = yesnotoi(argv[1])) == -1) {
1969 err = "argument must be 'yes' or 'no'"; goto loaderr;
1970 }
1971 } else if (!strcasecmp(argv[0],"rdbcompression") && argc == 2) {
1972 if ((server.rdbcompression = yesnotoi(argv[1])) == -1) {
1973 err = "argument must be 'yes' or 'no'"; goto loaderr;
1974 }
1975 } else if (!strcasecmp(argv[0],"activerehashing") && argc == 2) {
1976 if ((server.activerehashing = yesnotoi(argv[1])) == -1) {
1977 err = "argument must be 'yes' or 'no'"; goto loaderr;
1978 }
1979 } else if (!strcasecmp(argv[0],"daemonize") && argc == 2) {
1980 if ((server.daemonize = yesnotoi(argv[1])) == -1) {
1981 err = "argument must be 'yes' or 'no'"; goto loaderr;
1982 }
1983 } else if (!strcasecmp(argv[0],"appendonly") && argc == 2) {
1984 if ((server.appendonly = yesnotoi(argv[1])) == -1) {
1985 err = "argument must be 'yes' or 'no'"; goto loaderr;
1986 }
1987 } else if (!strcasecmp(argv[0],"appendfilename") && argc == 2) {
1988 zfree(server.appendfilename);
1989 server.appendfilename = zstrdup(argv[1]);
1990 } else if (!strcasecmp(argv[0],"no-appendfsync-on-rewrite")
1991 && argc == 2) {
1992 if ((server.no_appendfsync_on_rewrite= yesnotoi(argv[1])) == -1) {
1993 err = "argument must be 'yes' or 'no'"; goto loaderr;
1994 }
1995 } else if (!strcasecmp(argv[0],"appendfsync") && argc == 2) {
1996 if (!strcasecmp(argv[1],"no")) {
1997 server.appendfsync = APPENDFSYNC_NO;
1998 } else if (!strcasecmp(argv[1],"always")) {
1999 server.appendfsync = APPENDFSYNC_ALWAYS;
2000 } else if (!strcasecmp(argv[1],"everysec")) {
2001 server.appendfsync = APPENDFSYNC_EVERYSEC;
2002 } else {
2003 err = "argument must be 'no', 'always' or 'everysec'";
2004 goto loaderr;
2005 }
2006 } else if (!strcasecmp(argv[0],"requirepass") && argc == 2) {
2007 server.requirepass = zstrdup(argv[1]);
2008 } else if (!strcasecmp(argv[0],"pidfile") && argc == 2) {
2009 zfree(server.pidfile);
2010 server.pidfile = zstrdup(argv[1]);
2011 } else if (!strcasecmp(argv[0],"dbfilename") && argc == 2) {
2012 zfree(server.dbfilename);
2013 server.dbfilename = zstrdup(argv[1]);
2014 } else if (!strcasecmp(argv[0],"vm-enabled") && argc == 2) {
2015 if ((server.vm_enabled = yesnotoi(argv[1])) == -1) {
2016 err = "argument must be 'yes' or 'no'"; goto loaderr;
2017 }
2018 } else if (!strcasecmp(argv[0],"vm-swap-file") && argc == 2) {
2019 zfree(server.vm_swap_file);
2020 server.vm_swap_file = zstrdup(argv[1]);
2021 } else if (!strcasecmp(argv[0],"vm-max-memory") && argc == 2) {
2022 server.vm_max_memory = memtoll(argv[1],NULL);
2023 } else if (!strcasecmp(argv[0],"vm-page-size") && argc == 2) {
2024 server.vm_page_size = memtoll(argv[1], NULL);
2025 } else if (!strcasecmp(argv[0],"vm-pages") && argc == 2) {
2026 server.vm_pages = memtoll(argv[1], NULL);
2027 } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
2028 server.vm_max_threads = strtoll(argv[1], NULL, 10);
2029 } else if (!strcasecmp(argv[0],"hash-max-zipmap-entries") && argc == 2){
2030 server.hash_max_zipmap_entries = memtoll(argv[1], NULL);
2031 } else if (!strcasecmp(argv[0],"hash-max-zipmap-value") && argc == 2){
2032 server.hash_max_zipmap_value = memtoll(argv[1], NULL);
2033 } else {
2034 err = "Bad directive or wrong number of arguments"; goto loaderr;
2035 }
2036 for (j = 0; j < argc; j++)
2037 sdsfree(argv[j]);
2038 zfree(argv);
2039 sdsfree(line);
2040 }
2041 if (fp != stdin) fclose(fp);
2042 return;
2043
2044 loaderr:
2045 fprintf(stderr, "\n*** FATAL CONFIG FILE ERROR ***\n");
2046 fprintf(stderr, "Reading the configuration file, at line %d\n", linenum);
2047 fprintf(stderr, ">>> '%s'\n", line);
2048 fprintf(stderr, "%s\n", err);
2049 exit(1);
2050 }
2051
2052 static void freeClientArgv(redisClient *c) {
2053 int j;
2054
2055 for (j = 0; j < c->argc; j++)
2056 decrRefCount(c->argv[j]);
2057 for (j = 0; j < c->mbargc; j++)
2058 decrRefCount(c->mbargv[j]);
2059 c->argc = 0;
2060 c->mbargc = 0;
2061 }
2062
2063 static void freeClient(redisClient *c) {
2064 listNode *ln;
2065
2066 /* Note that if the client we are freeing is blocked into a blocking
2067 * call, we have to set querybuf to NULL *before* to call
2068 * unblockClientWaitingData() to avoid processInputBuffer() will get
2069 * called. Also it is important to remove the file events after
2070 * this, because this call adds the READABLE event. */
2071 sdsfree(c->querybuf);
2072 c->querybuf = NULL;
2073 if (c->flags & REDIS_BLOCKED)
2074 unblockClientWaitingData(c);
2075
2076 /* UNWATCH all the keys */
2077 unwatchAllKeys(c);
2078 listRelease(c->watched_keys);
2079 /* Unsubscribe from all the pubsub channels */
2080 pubsubUnsubscribeAllChannels(c,0);
2081 pubsubUnsubscribeAllPatterns(c,0);
2082 dictRelease(c->pubsub_channels);
2083 listRelease(c->pubsub_patterns);
2084 /* Obvious cleanup */
2085 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
2086 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2087 listRelease(c->reply);
2088 freeClientArgv(c);
2089 close(c->fd);
2090 /* Remove from the list of clients */
2091 ln = listSearchKey(server.clients,c);
2092 redisAssert(ln != NULL);
2093 listDelNode(server.clients,ln);
2094 /* Remove from the list of clients that are now ready to be restarted
2095 * after waiting for swapped keys */
2096 if (c->flags & REDIS_IO_WAIT && listLength(c->io_keys) == 0) {
2097 ln = listSearchKey(server.io_ready_clients,c);
2098 if (ln) {
2099 listDelNode(server.io_ready_clients,ln);
2100 server.vm_blocked_clients--;
2101 }
2102 }
2103 /* Remove from the list of clients waiting for swapped keys */
2104 while (server.vm_enabled && listLength(c->io_keys)) {
2105 ln = listFirst(c->io_keys);
2106 dontWaitForSwappedKey(c,ln->value);
2107 }
2108 listRelease(c->io_keys);
2109 /* Master/slave cleanup */
2110 if (c->flags & REDIS_SLAVE) {
2111 if (c->replstate == REDIS_REPL_SEND_BULK && c->repldbfd != -1)
2112 close(c->repldbfd);
2113 list *l = (c->flags & REDIS_MONITOR) ? server.monitors : server.slaves;
2114 ln = listSearchKey(l,c);
2115 redisAssert(ln != NULL);
2116 listDelNode(l,ln);
2117 }
2118 if (c->flags & REDIS_MASTER) {
2119 server.master = NULL;
2120 server.replstate = REDIS_REPL_CONNECT;
2121 }
2122 /* Release memory */
2123 zfree(c->argv);
2124 zfree(c->mbargv);
2125 freeClientMultiState(c);
2126 zfree(c);
2127 }
2128
2129 #define GLUEREPLY_UP_TO (1024)
2130 static void glueReplyBuffersIfNeeded(redisClient *c) {
2131 int copylen = 0;
2132 char buf[GLUEREPLY_UP_TO];
2133 listNode *ln;
2134 listIter li;
2135 robj *o;
2136
2137 listRewind(c->reply,&li);
2138 while((ln = listNext(&li))) {
2139 int objlen;
2140
2141 o = ln->value;
2142 objlen = sdslen(o->ptr);
2143 if (copylen + objlen <= GLUEREPLY_UP_TO) {
2144 memcpy(buf+copylen,o->ptr,objlen);
2145 copylen += objlen;
2146 listDelNode(c->reply,ln);
2147 } else {
2148 if (copylen == 0) return;
2149 break;
2150 }
2151 }
2152 /* Now the output buffer is empty, add the new single element */
2153 o = createObject(REDIS_STRING,sdsnewlen(buf,copylen));
2154 listAddNodeHead(c->reply,o);
2155 }
2156
2157 static void sendReplyToClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2158 redisClient *c = privdata;
2159 int nwritten = 0, totwritten = 0, objlen;
2160 robj *o;
2161 REDIS_NOTUSED(el);
2162 REDIS_NOTUSED(mask);
2163
2164 /* Use writev() if we have enough buffers to send */
2165 if (!server.glueoutputbuf &&
2166 listLength(c->reply) > REDIS_WRITEV_THRESHOLD &&
2167 !(c->flags & REDIS_MASTER))
2168 {
2169 sendReplyToClientWritev(el, fd, privdata, mask);
2170 return;
2171 }
2172
2173 while(listLength(c->reply)) {
2174 if (server.glueoutputbuf && listLength(c->reply) > 1)
2175 glueReplyBuffersIfNeeded(c);
2176
2177 o = listNodeValue(listFirst(c->reply));
2178 objlen = sdslen(o->ptr);
2179
2180 if (objlen == 0) {
2181 listDelNode(c->reply,listFirst(c->reply));
2182 continue;
2183 }
2184
2185 if (c->flags & REDIS_MASTER) {
2186 /* Don't reply to a master */
2187 nwritten = objlen - c->sentlen;
2188 } else {
2189 nwritten = write(fd, ((char*)o->ptr)+c->sentlen, objlen - c->sentlen);
2190 if (nwritten <= 0) break;
2191 }
2192 c->sentlen += nwritten;
2193 totwritten += nwritten;
2194 /* If we fully sent the object on head go to the next one */
2195 if (c->sentlen == objlen) {
2196 listDelNode(c->reply,listFirst(c->reply));
2197 c->sentlen = 0;
2198 }
2199 /* Note that we avoid to send more thank REDIS_MAX_WRITE_PER_EVENT
2200 * bytes, in a single threaded server it's a good idea to serve
2201 * other clients as well, even if a very large request comes from
2202 * super fast link that is always able to accept data (in real world
2203 * scenario think about 'KEYS *' against the loopback interfae) */
2204 if (totwritten > REDIS_MAX_WRITE_PER_EVENT) break;
2205 }
2206 if (nwritten == -1) {
2207 if (errno == EAGAIN) {
2208 nwritten = 0;
2209 } else {
2210 redisLog(REDIS_VERBOSE,
2211 "Error writing to client: %s", strerror(errno));
2212 freeClient(c);
2213 return;
2214 }
2215 }
2216 if (totwritten > 0) c->lastinteraction = time(NULL);
2217 if (listLength(c->reply) == 0) {
2218 c->sentlen = 0;
2219 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2220 }
2221 }
2222
2223 static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask)
2224 {
2225 redisClient *c = privdata;
2226 int nwritten = 0, totwritten = 0, objlen, willwrite;
2227 robj *o;
2228 struct iovec iov[REDIS_WRITEV_IOVEC_COUNT];
2229 int offset, ion = 0;
2230 REDIS_NOTUSED(el);
2231 REDIS_NOTUSED(mask);
2232
2233 listNode *node;
2234 while (listLength(c->reply)) {
2235 offset = c->sentlen;
2236 ion = 0;
2237 willwrite = 0;
2238
2239 /* fill-in the iov[] array */
2240 for(node = listFirst(c->reply); node; node = listNextNode(node)) {
2241 o = listNodeValue(node);
2242 objlen = sdslen(o->ptr);
2243
2244 if (totwritten + objlen - offset > REDIS_MAX_WRITE_PER_EVENT)
2245 break;
2246
2247 if(ion == REDIS_WRITEV_IOVEC_COUNT)
2248 break; /* no more iovecs */
2249
2250 iov[ion].iov_base = ((char*)o->ptr) + offset;
2251 iov[ion].iov_len = objlen - offset;
2252 willwrite += objlen - offset;
2253 offset = 0; /* just for the first item */
2254 ion++;
2255 }
2256
2257 if(willwrite == 0)
2258 break;
2259
2260 /* write all collected blocks at once */
2261 if((nwritten = writev(fd, iov, ion)) < 0) {
2262 if (errno != EAGAIN) {
2263 redisLog(REDIS_VERBOSE,
2264 "Error writing to client: %s", strerror(errno));
2265 freeClient(c);
2266 return;
2267 }
2268 break;
2269 }
2270
2271 totwritten += nwritten;
2272 offset = c->sentlen;
2273
2274 /* remove written robjs from c->reply */
2275 while (nwritten && listLength(c->reply)) {
2276 o = listNodeValue(listFirst(c->reply));
2277 objlen = sdslen(o->ptr);
2278
2279 if(nwritten >= objlen - offset) {
2280 listDelNode(c->reply, listFirst(c->reply));
2281 nwritten -= objlen - offset;
2282 c->sentlen = 0;
2283 } else {
2284 /* partial write */
2285 c->sentlen += nwritten;
2286 break;
2287 }
2288 offset = 0;
2289 }
2290 }
2291
2292 if (totwritten > 0)
2293 c->lastinteraction = time(NULL);
2294
2295 if (listLength(c->reply) == 0) {
2296 c->sentlen = 0;
2297 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2298 }
2299 }
2300
2301 static int qsortRedisCommands(const void *r1, const void *r2) {
2302 return strcasecmp(
2303 ((struct redisCommand*)r1)->name,
2304 ((struct redisCommand*)r2)->name);
2305 }
2306
2307 static void sortCommandTable() {
2308 /* Copy and sort the read-only version of the command table */
2309 commandTable = (struct redisCommand*)malloc(sizeof(readonlyCommandTable));
2310 memcpy(commandTable,readonlyCommandTable,sizeof(readonlyCommandTable));
2311 qsort(commandTable,
2312 sizeof(readonlyCommandTable)/sizeof(struct redisCommand),
2313 sizeof(struct redisCommand),qsortRedisCommands);
2314 }
2315
2316 static struct redisCommand *lookupCommand(char *name) {
2317 struct redisCommand tmp = {name,NULL,0,0,NULL,0,0,0};
2318 return bsearch(
2319 &tmp,
2320 commandTable,
2321 sizeof(readonlyCommandTable)/sizeof(struct redisCommand),
2322 sizeof(struct redisCommand),
2323 qsortRedisCommands);
2324 }
2325
2326 /* resetClient prepare the client to process the next command */
2327 static void resetClient(redisClient *c) {
2328 freeClientArgv(c);
2329 c->bulklen = -1;
2330 c->multibulk = 0;
2331 }
2332
2333 /* Call() is the core of Redis execution of a command */
2334 static void call(redisClient *c, struct redisCommand *cmd) {
2335 long long dirty;
2336
2337 dirty = server.dirty;
2338 cmd->proc(c);
2339 dirty = server.dirty-dirty;
2340
2341 if (server.appendonly && dirty)
2342 feedAppendOnlyFile(cmd,c->db->id,c->argv,c->argc);
2343 if ((dirty || cmd->flags & REDIS_CMD_FORCE_REPLICATION) &&
2344 listLength(server.slaves))
2345 replicationFeedSlaves(server.slaves,c->db->id,c->argv,c->argc);
2346 if (listLength(server.monitors))
2347 replicationFeedMonitors(server.monitors,c->db->id,c->argv,c->argc);
2348 server.stat_numcommands++;
2349 }
2350
2351 /* If this function gets called we already read a whole
2352 * command, argments are in the client argv/argc fields.
2353 * processCommand() execute the command or prepare the
2354 * server for a bulk read from the client.
2355 *
2356 * If 1 is returned the client is still alive and valid and
2357 * and other operations can be performed by the caller. Otherwise
2358 * if 0 is returned the client was destroied (i.e. after QUIT). */
2359 static int processCommand(redisClient *c) {
2360 struct redisCommand *cmd;
2361
2362 /* Free some memory if needed (maxmemory setting) */
2363 if (server.maxmemory) freeMemoryIfNeeded();
2364
2365 /* Handle the multi bulk command type. This is an alternative protocol
2366 * supported by Redis in order to receive commands that are composed of
2367 * multiple binary-safe "bulk" arguments. The latency of processing is
2368 * a bit higher but this allows things like multi-sets, so if this
2369 * protocol is used only for MSET and similar commands this is a big win. */
2370 if (c->multibulk == 0 && c->argc == 1 && ((char*)(c->argv[0]->ptr))[0] == '*') {
2371 c->multibulk = atoi(((char*)c->argv[0]->ptr)+1);
2372 if (c->multibulk <= 0) {
2373 resetClient(c);
2374 return 1;
2375 } else {
2376 decrRefCount(c->argv[c->argc-1]);
2377 c->argc--;
2378 return 1;
2379 }
2380 } else if (c->multibulk) {
2381 if (c->bulklen == -1) {
2382 if (((char*)c->argv[0]->ptr)[0] != '$') {
2383 addReplySds(c,sdsnew("-ERR multi bulk protocol error\r\n"));
2384 resetClient(c);
2385 return 1;
2386 } else {
2387 int bulklen = atoi(((char*)c->argv[0]->ptr)+1);
2388 decrRefCount(c->argv[0]);
2389 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2390 c->argc--;
2391 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2392 resetClient(c);
2393 return 1;
2394 }
2395 c->argc--;
2396 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2397 return 1;
2398 }
2399 } else {
2400 c->mbargv = zrealloc(c->mbargv,(sizeof(robj*))*(c->mbargc+1));
2401 c->mbargv[c->mbargc] = c->argv[0];
2402 c->mbargc++;
2403 c->argc--;
2404 c->multibulk--;
2405 if (c->multibulk == 0) {
2406 robj **auxargv;
2407 int auxargc;
2408
2409 /* Here we need to swap the multi-bulk argc/argv with the
2410 * normal argc/argv of the client structure. */
2411 auxargv = c->argv;
2412 c->argv = c->mbargv;
2413 c->mbargv = auxargv;
2414
2415 auxargc = c->argc;
2416 c->argc = c->mbargc;
2417 c->mbargc = auxargc;
2418
2419 /* We need to set bulklen to something different than -1
2420 * in order for the code below to process the command without
2421 * to try to read the last argument of a bulk command as
2422 * a special argument. */
2423 c->bulklen = 0;
2424 /* continue below and process the command */
2425 } else {
2426 c->bulklen = -1;
2427 return 1;
2428 }
2429 }
2430 }
2431 /* -- end of multi bulk commands processing -- */
2432
2433 /* The QUIT command is handled as a special case. Normal command
2434 * procs are unable to close the client connection safely */
2435 if (!strcasecmp(c->argv[0]->ptr,"quit")) {
2436 freeClient(c);
2437 return 0;
2438 }
2439
2440 /* Now lookup the command and check ASAP about trivial error conditions
2441 * such wrong arity, bad command name and so forth. */
2442 cmd = lookupCommand(c->argv[0]->ptr);
2443 if (!cmd) {
2444 addReplySds(c,
2445 sdscatprintf(sdsempty(), "-ERR unknown command '%s'\r\n",
2446 (char*)c->argv[0]->ptr));
2447 resetClient(c);
2448 return 1;
2449 } else if ((cmd->arity > 0 && cmd->arity != c->argc) ||
2450 (c->argc < -cmd->arity)) {
2451 addReplySds(c,
2452 sdscatprintf(sdsempty(),
2453 "-ERR wrong number of arguments for '%s' command\r\n",
2454 cmd->name));
2455 resetClient(c);
2456 return 1;
2457 } else if (cmd->flags & REDIS_CMD_BULK && c->bulklen == -1) {
2458 /* This is a bulk command, we have to read the last argument yet. */
2459 int bulklen = atoi(c->argv[c->argc-1]->ptr);
2460
2461 decrRefCount(c->argv[c->argc-1]);
2462 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2463 c->argc--;
2464 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2465 resetClient(c);
2466 return 1;
2467 }
2468 c->argc--;
2469 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2470 /* It is possible that the bulk read is already in the
2471 * buffer. Check this condition and handle it accordingly.
2472 * This is just a fast path, alternative to call processInputBuffer().
2473 * It's a good idea since the code is small and this condition
2474 * happens most of the times. */
2475 if ((signed)sdslen(c->querybuf) >= c->bulklen) {
2476 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2477 c->argc++;
2478 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2479 } else {
2480 /* Otherwise return... there is to read the last argument
2481 * from the socket. */
2482 return 1;
2483 }
2484 }
2485 /* Let's try to encode the bulk object to save space. */
2486 if (cmd->flags & REDIS_CMD_BULK)
2487 c->argv[c->argc-1] = tryObjectEncoding(c->argv[c->argc-1]);
2488
2489 /* Check if the user is authenticated */
2490 if (server.requirepass && !c->authenticated && cmd->proc != authCommand) {
2491 addReplySds(c,sdsnew("-ERR operation not permitted\r\n"));
2492 resetClient(c);
2493 return 1;
2494 }
2495
2496 /* Handle the maxmemory directive */
2497 if (server.maxmemory && (cmd->flags & REDIS_CMD_DENYOOM) &&
2498 zmalloc_used_memory() > server.maxmemory)
2499 {
2500 addReplySds(c,sdsnew("-ERR command not allowed when used memory > 'maxmemory'\r\n"));
2501 resetClient(c);
2502 return 1;
2503 }
2504
2505 /* Only allow SUBSCRIBE and UNSUBSCRIBE in the context of Pub/Sub */
2506 if ((dictSize(c->pubsub_channels) > 0 || listLength(c->pubsub_patterns) > 0)
2507 &&
2508 cmd->proc != subscribeCommand && cmd->proc != unsubscribeCommand &&
2509 cmd->proc != psubscribeCommand && cmd->proc != punsubscribeCommand) {
2510 addReplySds(c,sdsnew("-ERR only (P)SUBSCRIBE / (P)UNSUBSCRIBE / QUIT allowed in this context\r\n"));
2511 resetClient(c);
2512 return 1;
2513 }
2514
2515 /* Exec the command */
2516 if (c->flags & REDIS_MULTI &&
2517 cmd->proc != execCommand && cmd->proc != discardCommand &&
2518 cmd->proc != multiCommand && cmd->proc != watchCommand)
2519 {
2520 queueMultiCommand(c,cmd);
2521 addReply(c,shared.queued);
2522 } else {
2523 if (server.vm_enabled && server.vm_max_threads > 0 &&
2524 blockClientOnSwappedKeys(c,cmd)) return 1;
2525 call(c,cmd);
2526 }
2527
2528 /* Prepare the client for the next command */
2529 resetClient(c);
2530 return 1;
2531 }
2532
2533 static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) {
2534 listNode *ln;
2535 listIter li;
2536 int outc = 0, j;
2537 robj **outv;
2538 /* We need 1+(ARGS*3) objects since commands are using the new protocol
2539 * and we one 1 object for the first "*<count>\r\n" multibulk count, then
2540 * for every additional object we have "$<count>\r\n" + object + "\r\n". */
2541 robj *static_outv[REDIS_STATIC_ARGS*3+1];
2542 robj *lenobj;
2543
2544 if (argc <= REDIS_STATIC_ARGS) {
2545 outv = static_outv;
2546 } else {
2547 outv = zmalloc(sizeof(robj*)*(argc*3+1));
2548 }
2549
2550 lenobj = createObject(REDIS_STRING,
2551 sdscatprintf(sdsempty(), "*%d\r\n", argc));
2552 lenobj->refcount = 0;
2553 outv[outc++] = lenobj;
2554 for (j = 0; j < argc; j++) {
2555 lenobj = createObject(REDIS_STRING,
2556 sdscatprintf(sdsempty(),"$%lu\r\n",
2557 (unsigned long) stringObjectLen(argv[j])));
2558 lenobj->refcount = 0;
2559 outv[outc++] = lenobj;
2560 outv[outc++] = argv[j];
2561 outv[outc++] = shared.crlf;
2562 }
2563
2564 /* Increment all the refcounts at start and decrement at end in order to
2565 * be sure to free objects if there is no slave in a replication state
2566 * able to be feed with commands */
2567 for (j = 0; j < outc; j++) incrRefCount(outv[j]);
2568 listRewind(slaves,&li);
2569 while((ln = listNext(&li))) {
2570 redisClient *slave = ln->value;
2571
2572 /* Don't feed slaves that are still waiting for BGSAVE to start */
2573 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) continue;
2574
2575 /* Feed all the other slaves, MONITORs and so on */
2576 if (slave->slaveseldb != dictid) {
2577 robj *selectcmd;
2578
2579 switch(dictid) {
2580 case 0: selectcmd = shared.select0; break;
2581 case 1: selectcmd = shared.select1; break;
2582 case 2: selectcmd = shared.select2; break;
2583 case 3: selectcmd = shared.select3; break;
2584 case 4: selectcmd = shared.select4; break;
2585 case 5: selectcmd = shared.select5; break;
2586 case 6: selectcmd = shared.select6; break;
2587 case 7: selectcmd = shared.select7; break;
2588 case 8: selectcmd = shared.select8; break;
2589 case 9: selectcmd = shared.select9; break;
2590 default:
2591 selectcmd = createObject(REDIS_STRING,
2592 sdscatprintf(sdsempty(),"select %d\r\n",dictid));
2593 selectcmd->refcount = 0;
2594 break;
2595 }
2596 addReply(slave,selectcmd);
2597 slave->slaveseldb = dictid;
2598 }
2599 for (j = 0; j < outc; j++) addReply(slave,outv[j]);
2600 }
2601 for (j = 0; j < outc; j++) decrRefCount(outv[j]);
2602 if (outv != static_outv) zfree(outv);
2603 }
2604
2605 static sds sdscatrepr(sds s, char *p, size_t len) {
2606 s = sdscatlen(s,"\"",1);
2607 while(len--) {
2608 switch(*p) {
2609 case '\\':
2610 case '"':
2611 s = sdscatprintf(s,"\\%c",*p);
2612 break;
2613 case '\n': s = sdscatlen(s,"\\n",1); break;
2614 case '\r': s = sdscatlen(s,"\\r",1); break;
2615 case '\t': s = sdscatlen(s,"\\t",1); break;
2616 case '\a': s = sdscatlen(s,"\\a",1); break;
2617 case '\b': s = sdscatlen(s,"\\b",1); break;
2618 default:
2619 if (isprint(*p))
2620 s = sdscatprintf(s,"%c",*p);
2621 else
2622 s = sdscatprintf(s,"\\x%02x",(unsigned char)*p);
2623 break;
2624 }
2625 p++;
2626 }
2627 return sdscatlen(s,"\"",1);
2628 }
2629
2630 static void replicationFeedMonitors(list *monitors, int dictid, robj **argv, int argc) {
2631 listNode *ln;
2632 listIter li;
2633 int j;
2634 sds cmdrepr = sdsnew("+");
2635 robj *cmdobj;
2636 struct timeval tv;
2637
2638 gettimeofday(&tv,NULL);
2639 cmdrepr = sdscatprintf(cmdrepr,"%ld.%ld ",(long)tv.tv_sec,(long)tv.tv_usec);
2640 if (dictid != 0) cmdrepr = sdscatprintf(cmdrepr,"(db %d) ", dictid);
2641
2642 for (j = 0; j < argc; j++) {
2643 if (argv[j]->encoding == REDIS_ENCODING_INT) {
2644 cmdrepr = sdscatprintf(cmdrepr, "%ld", (long)argv[j]->ptr);
2645 } else {
2646 cmdrepr = sdscatrepr(cmdrepr,(char*)argv[j]->ptr,
2647 sdslen(argv[j]->ptr));
2648 }
2649 if (j != argc-1)
2650 cmdrepr = sdscatlen(cmdrepr," ",1);
2651 }
2652 cmdrepr = sdscatlen(cmdrepr,"\r\n",2);
2653 cmdobj = createObject(REDIS_STRING,cmdrepr);
2654
2655 listRewind(monitors,&li);
2656 while((ln = listNext(&li))) {
2657 redisClient *monitor = ln->value;
2658 addReply(monitor,cmdobj);
2659 }
2660 decrRefCount(cmdobj);
2661 }
2662
2663 static void processInputBuffer(redisClient *c) {
2664 again:
2665 /* Before to process the input buffer, make sure the client is not
2666 * waitig for a blocking operation such as BLPOP. Note that the first
2667 * iteration the client is never blocked, otherwise the processInputBuffer
2668 * would not be called at all, but after the execution of the first commands
2669 * in the input buffer the client may be blocked, and the "goto again"
2670 * will try to reiterate. The following line will make it return asap. */
2671 if (c->flags & REDIS_BLOCKED || c->flags & REDIS_IO_WAIT) return;
2672 if (c->bulklen == -1) {
2673 /* Read the first line of the query */
2674 char *p = strchr(c->querybuf,'\n');
2675 size_t querylen;
2676
2677 if (p) {
2678 sds query, *argv;
2679 int argc, j;
2680
2681 query = c->querybuf;
2682 c->querybuf = sdsempty();
2683 querylen = 1+(p-(query));
2684 if (sdslen(query) > querylen) {
2685 /* leave data after the first line of the query in the buffer */
2686 c->querybuf = sdscatlen(c->querybuf,query+querylen,sdslen(query)-querylen);
2687 }
2688 *p = '\0'; /* remove "\n" */
2689 if (*(p-1) == '\r') *(p-1) = '\0'; /* and "\r" if any */
2690 sdsupdatelen(query);
2691
2692 /* Now we can split the query in arguments */
2693 argv = sdssplitlen(query,sdslen(query)," ",1,&argc);
2694 sdsfree(query);
2695
2696 if (c->argv) zfree(c->argv);
2697 c->argv = zmalloc(sizeof(robj*)*argc);
2698
2699 for (j = 0; j < argc; j++) {
2700 if (sdslen(argv[j])) {
2701 c->argv[c->argc] = createObject(REDIS_STRING,argv[j]);
2702 c->argc++;
2703 } else {
2704 sdsfree(argv[j]);
2705 }
2706 }
2707 zfree(argv);
2708 if (c->argc) {
2709 /* Execute the command. If the client is still valid
2710 * after processCommand() return and there is something
2711 * on the query buffer try to process the next command. */
2712 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2713 } else {
2714 /* Nothing to process, argc == 0. Just process the query
2715 * buffer if it's not empty or return to the caller */
2716 if (sdslen(c->querybuf)) goto again;
2717 }
2718 return;
2719 } else if (sdslen(c->querybuf) >= REDIS_REQUEST_MAX_SIZE) {
2720 redisLog(REDIS_VERBOSE, "Client protocol error");
2721 freeClient(c);
2722 return;
2723 }
2724 } else {
2725 /* Bulk read handling. Note that if we are at this point
2726 the client already sent a command terminated with a newline,
2727 we are reading the bulk data that is actually the last
2728 argument of the command. */
2729 int qbl = sdslen(c->querybuf);
2730
2731 if (c->bulklen <= qbl) {
2732 /* Copy everything but the final CRLF as final argument */
2733 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2734 c->argc++;
2735 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2736 /* Process the command. If the client is still valid after
2737 * the processing and there is more data in the buffer
2738 * try to parse it. */
2739 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2740 return;
2741 }
2742 }
2743 }
2744
2745 static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2746 redisClient *c = (redisClient*) privdata;
2747 char buf[REDIS_IOBUF_LEN];
2748 int nread;
2749 REDIS_NOTUSED(el);
2750 REDIS_NOTUSED(mask);
2751
2752 nread = read(fd, buf, REDIS_IOBUF_LEN);
2753 if (nread == -1) {
2754 if (errno == EAGAIN) {
2755 nread = 0;
2756 } else {
2757 redisLog(REDIS_VERBOSE, "Reading from client: %s",strerror(errno));
2758 freeClient(c);
2759 return;
2760 }
2761 } else if (nread == 0) {
2762 redisLog(REDIS_VERBOSE, "Client closed connection");
2763 freeClient(c);
2764 return;
2765 }
2766 if (nread) {
2767 c->querybuf = sdscatlen(c->querybuf, buf, nread);
2768 c->lastinteraction = time(NULL);
2769 } else {
2770 return;
2771 }
2772 processInputBuffer(c);
2773 }
2774
2775 static int selectDb(redisClient *c, int id) {
2776 if (id < 0 || id >= server.dbnum)
2777 return REDIS_ERR;
2778 c->db = &server.db[id];
2779 return REDIS_OK;
2780 }
2781
2782 static void *dupClientReplyValue(void *o) {
2783 incrRefCount((robj*)o);
2784 return o;
2785 }
2786
2787 static int listMatchObjects(void *a, void *b) {
2788 return equalStringObjects(a,b);
2789 }
2790
2791 static redisClient *createClient(int fd) {
2792 redisClient *c = zmalloc(sizeof(*c));
2793
2794 anetNonBlock(NULL,fd);
2795 anetTcpNoDelay(NULL,fd);
2796 if (!c) return NULL;
2797 selectDb(c,0);
2798 c->fd = fd;
2799 c->querybuf = sdsempty();
2800 c->argc = 0;
2801 c->argv = NULL;
2802 c->bulklen = -1;
2803 c->multibulk = 0;
2804 c->mbargc = 0;
2805 c->mbargv = NULL;
2806 c->sentlen = 0;
2807 c->flags = 0;
2808 c->lastinteraction = time(NULL);
2809 c->authenticated = 0;
2810 c->replstate = REDIS_REPL_NONE;
2811 c->reply = listCreate();
2812 listSetFreeMethod(c->reply,decrRefCount);
2813 listSetDupMethod(c->reply,dupClientReplyValue);
2814 c->blocking_keys = NULL;
2815 c->blocking_keys_num = 0;
2816 c->io_keys = listCreate();
2817 c->watched_keys = listCreate();
2818 listSetFreeMethod(c->io_keys,decrRefCount);
2819 c->pubsub_channels = dictCreate(&setDictType,NULL);
2820 c->pubsub_patterns = listCreate();
2821 listSetFreeMethod(c->pubsub_patterns,decrRefCount);
2822 listSetMatchMethod(c->pubsub_patterns,listMatchObjects);
2823 if (aeCreateFileEvent(server.el, c->fd, AE_READABLE,
2824 readQueryFromClient, c) == AE_ERR) {
2825 freeClient(c);
2826 return NULL;
2827 }
2828 listAddNodeTail(server.clients,c);
2829 initClientMultiState(c);
2830 return c;
2831 }
2832
2833 static void addReply(redisClient *c, robj *obj) {
2834 if (listLength(c->reply) == 0 &&
2835 (c->replstate == REDIS_REPL_NONE ||
2836 c->replstate == REDIS_REPL_ONLINE) &&
2837 aeCreateFileEvent(server.el, c->fd, AE_WRITABLE,
2838 sendReplyToClient, c) == AE_ERR) return;
2839
2840 if (server.vm_enabled && obj->storage != REDIS_VM_MEMORY) {
2841 obj = dupStringObject(obj);
2842 obj->refcount = 0; /* getDecodedObject() will increment the refcount */
2843 }
2844 listAddNodeTail(c->reply,getDecodedObject(obj));
2845 }
2846
2847 static void addReplySds(redisClient *c, sds s) {
2848 robj *o = createObject(REDIS_STRING,s);
2849 addReply(c,o);
2850 decrRefCount(o);
2851 }
2852
2853 static void addReplyDouble(redisClient *c, double d) {
2854 char buf[128];
2855
2856 snprintf(buf,sizeof(buf),"%.17g",d);
2857 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n%s\r\n",
2858 (unsigned long) strlen(buf),buf));
2859 }
2860
2861 static void addReplyLongLong(redisClient *c, long long ll) {
2862 char buf[128];
2863 size_t len;
2864
2865 if (ll == 0) {
2866 addReply(c,shared.czero);
2867 return;
2868 } else if (ll == 1) {
2869 addReply(c,shared.cone);
2870 return;
2871 }
2872 buf[0] = ':';
2873 len = ll2string(buf+1,sizeof(buf)-1,ll);
2874 buf[len+1] = '\r';
2875 buf[len+2] = '\n';
2876 addReplySds(c,sdsnewlen(buf,len+3));
2877 }
2878
2879 static void addReplyUlong(redisClient *c, unsigned long ul) {
2880 char buf[128];
2881 size_t len;
2882
2883 if (ul == 0) {
2884 addReply(c,shared.czero);
2885 return;
2886 } else if (ul == 1) {
2887 addReply(c,shared.cone);
2888 return;
2889 }
2890 len = snprintf(buf,sizeof(buf),":%lu\r\n",ul);
2891 addReplySds(c,sdsnewlen(buf,len));
2892 }
2893
2894 static void addReplyBulkLen(redisClient *c, robj *obj) {
2895 size_t len, intlen;
2896 char buf[128];
2897
2898 if (obj->encoding == REDIS_ENCODING_RAW) {
2899 len = sdslen(obj->ptr);
2900 } else {
2901 long n = (long)obj->ptr;
2902
2903 /* Compute how many bytes will take this integer as a radix 10 string */
2904 len = 1;
2905 if (n < 0) {
2906 len++;
2907 n = -n;
2908 }
2909 while((n = n/10) != 0) {
2910 len++;
2911 }
2912 }
2913 buf[0] = '$';
2914 intlen = ll2string(buf+1,sizeof(buf)-1,(long long)len);
2915 buf[intlen+1] = '\r';
2916 buf[intlen+2] = '\n';
2917 addReplySds(c,sdsnewlen(buf,intlen+3));
2918 }
2919
2920 static void addReplyBulk(redisClient *c, robj *obj) {
2921 addReplyBulkLen(c,obj);
2922 addReply(c,obj);
2923 addReply(c,shared.crlf);
2924 }
2925
2926 static void addReplyBulkSds(redisClient *c, sds s) {
2927 robj *o = createStringObject(s, sdslen(s));
2928 addReplyBulk(c,o);
2929 decrRefCount(o);
2930 }
2931
2932 /* In the CONFIG command we need to add vanilla C string as bulk replies */
2933 static void addReplyBulkCString(redisClient *c, char *s) {
2934 if (s == NULL) {
2935 addReply(c,shared.nullbulk);
2936 } else {
2937 robj *o = createStringObject(s,strlen(s));
2938 addReplyBulk(c,o);
2939 decrRefCount(o);
2940 }
2941 }
2942
2943 static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
2944 int cport, cfd;
2945 char cip[128];
2946 redisClient *c;
2947 REDIS_NOTUSED(el);
2948 REDIS_NOTUSED(mask);
2949 REDIS_NOTUSED(privdata);
2950
2951 cfd = anetAccept(server.neterr, fd, cip, &cport);
2952 if (cfd == AE_ERR) {
2953 redisLog(REDIS_VERBOSE,"Accepting client connection: %s", server.neterr);
2954 return;
2955 }
2956 redisLog(REDIS_VERBOSE,"Accepted %s:%d", cip, cport);
2957 if ((c = createClient(cfd)) == NULL) {
2958 redisLog(REDIS_WARNING,"Error allocating resoures for the client");
2959 close(cfd); /* May be already closed, just ingore errors */
2960 return;
2961 }
2962 /* If maxclient directive is set and this is one client more... close the
2963 * connection. Note that we create the client instead to check before
2964 * for this condition, since now the socket is already set in nonblocking
2965 * mode and we can send an error for free using the Kernel I/O */
2966 if (server.maxclients && listLength(server.clients) > server.maxclients) {
2967 char *err = "-ERR max number of clients reached\r\n";
2968
2969 /* That's a best effort error message, don't check write errors */
2970 if (write(c->fd,err,strlen(err)) == -1) {
2971 /* Nothing to do, Just to avoid the warning... */
2972 }
2973 freeClient(c);
2974 return;
2975 }
2976 server.stat_numconnections++;
2977 }
2978
2979 /* ======================= Redis objects implementation ===================== */
2980
2981 static robj *createObject(int type, void *ptr) {
2982 robj *o;
2983
2984 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
2985 if (listLength(server.objfreelist)) {
2986 listNode *head = listFirst(server.objfreelist);
2987 o = listNodeValue(head);
2988 listDelNode(server.objfreelist,head);
2989 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
2990 } else {
2991 if (server.vm_enabled)
2992 pthread_mutex_unlock(&server.obj_freelist_mutex);
2993 o = zmalloc(sizeof(*o));
2994 }
2995 o->type = type;
2996 o->encoding = REDIS_ENCODING_RAW;
2997 o->ptr = ptr;
2998 o->refcount = 1;
2999 if (server.vm_enabled) {
3000 /* Note that this code may run in the context of an I/O thread
3001 * and accessing server.lruclock in theory is an error
3002 * (no locks). But in practice this is safe, and even if we read
3003 * garbage Redis will not fail. */
3004 o->lru = server.lruclock;
3005 o->storage = REDIS_VM_MEMORY;
3006 }
3007 return o;
3008 }
3009
3010 static robj *createStringObject(char *ptr, size_t len) {
3011 return createObject(REDIS_STRING,sdsnewlen(ptr,len));
3012 }
3013
3014 static robj *createStringObjectFromLongLong(long long value) {
3015 robj *o;
3016 if (value >= 0 && value < REDIS_SHARED_INTEGERS) {
3017 incrRefCount(shared.integers[value]);
3018 o = shared.integers[value];
3019 } else {
3020 if (value >= LONG_MIN && value <= LONG_MAX) {
3021 o = createObject(REDIS_STRING, NULL);
3022 o->encoding = REDIS_ENCODING_INT;
3023 o->ptr = (void*)((long)value);
3024 } else {
3025 o = createObject(REDIS_STRING,sdsfromlonglong(value));
3026 }
3027 }
3028 return o;
3029 }
3030
3031 static robj *dupStringObject(robj *o) {
3032 assert(o->encoding == REDIS_ENCODING_RAW);
3033 return createStringObject(o->ptr,sdslen(o->ptr));
3034 }
3035
3036 static robj *createListObject(void) {
3037 list *l = listCreate();
3038
3039 listSetFreeMethod(l,decrRefCount);
3040 return createObject(REDIS_LIST,l);
3041 }
3042
3043 static robj *createSetObject(void) {
3044 dict *d = dictCreate(&setDictType,NULL);
3045 return createObject(REDIS_SET,d);
3046 }
3047
3048 static robj *createHashObject(void) {
3049 /* All the Hashes start as zipmaps. Will be automatically converted
3050 * into hash tables if there are enough elements or big elements
3051 * inside. */
3052 unsigned char *zm = zipmapNew();
3053 robj *o = createObject(REDIS_HASH,zm);
3054 o->encoding = REDIS_ENCODING_ZIPMAP;
3055 return o;
3056 }
3057
3058 static robj *createZsetObject(void) {
3059 zset *zs = zmalloc(sizeof(*zs));
3060
3061 zs->dict = dictCreate(&zsetDictType,NULL);
3062 zs->zsl = zslCreate();
3063 return createObject(REDIS_ZSET,zs);
3064 }
3065
3066 static void freeStringObject(robj *o) {
3067 if (o->encoding == REDIS_ENCODING_RAW) {
3068 sdsfree(o->ptr);
3069 }
3070 }
3071
3072 static void freeListObject(robj *o) {
3073 listRelease((list*) o->ptr);
3074 }
3075
3076 static void freeSetObject(robj *o) {
3077 dictRelease((dict*) o->ptr);
3078 }
3079
3080 static void freeZsetObject(robj *o) {
3081 zset *zs = o->ptr;
3082
3083 dictRelease(zs->dict);
3084 zslFree(zs->zsl);
3085 zfree(zs);
3086 }
3087
3088 static void freeHashObject(robj *o) {
3089 switch (o->encoding) {
3090 case REDIS_ENCODING_HT:
3091 dictRelease((dict*) o->ptr);
3092 break;
3093 case REDIS_ENCODING_ZIPMAP:
3094 zfree(o->ptr);
3095 break;
3096 default:
3097 redisPanic("Unknown hash encoding type");
3098 break;
3099 }
3100 }
3101
3102 static void incrRefCount(robj *o) {
3103 o->refcount++;
3104 }
3105
3106 static void decrRefCount(void *obj) {
3107 robj *o = obj;
3108
3109 /* Object is a swapped out value, or in the process of being loaded. */
3110 if (server.vm_enabled &&
3111 (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING))
3112 {
3113 vmpointer *vp = obj;
3114 if (o->storage == REDIS_VM_LOADING) vmCancelThreadedIOJob(o);
3115 vmMarkPagesFree(vp->page,vp->usedpages);
3116 server.vm_stats_swapped_objects--;
3117 zfree(vp);
3118 return;
3119 }
3120
3121 if (o->refcount <= 0) redisPanic("decrRefCount against refcount <= 0");
3122 /* Object is in memory, or in the process of being swapped out.
3123 *
3124 * If the object is being swapped out, abort the operation on
3125 * decrRefCount even if the refcount does not drop to 0: the object
3126 * is referenced at least two times, as value of the key AND as
3127 * job->val in the iojob. So if we don't invalidate the iojob, when it is
3128 * done but the relevant key was removed in the meantime, the
3129 * complete jobs handler will not find the key about the job and the
3130 * assert will fail. */
3131 if (server.vm_enabled && o->storage == REDIS_VM_SWAPPING)
3132 vmCancelThreadedIOJob(o);
3133 if (--(o->refcount) == 0) {
3134 switch(o->type) {
3135 case REDIS_STRING: freeStringObject(o); break;
3136 case REDIS_LIST: freeListObject(o); break;
3137 case REDIS_SET: freeSetObject(o); break;
3138 case REDIS_ZSET: freeZsetObject(o); break;
3139 case REDIS_HASH: freeHashObject(o); break;
3140 default: redisPanic("Unknown object type"); break;
3141 }
3142 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
3143 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
3144 !listAddNodeHead(server.objfreelist,o))
3145 zfree(o);
3146 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
3147 }
3148 }
3149
3150 static int checkType(redisClient *c, robj *o, int type) {
3151 if (o->type != type) {
3152 addReply(c,shared.wrongtypeerr);
3153 return 1;
3154 }
3155 return 0;
3156 }
3157
3158 /* Check if the nul-terminated string 's' can be represented by a long
3159 * (that is, is a number that fits into long without any other space or
3160 * character before or after the digits).
3161 *
3162 * If so, the function returns REDIS_OK and *longval is set to the value
3163 * of the number. Otherwise REDIS_ERR is returned */
3164 static int isStringRepresentableAsLong(sds s, long *longval) {
3165 char buf[32], *endptr;
3166 long value;
3167 int slen;
3168
3169 value = strtol(s, &endptr, 10);
3170 if (endptr[0] != '\0') return REDIS_ERR;
3171 slen = ll2string(buf,32,value);
3172
3173 /* If the number converted back into a string is not identical
3174 * then it's not possible to encode the string as integer */
3175 if (sdslen(s) != (unsigned)slen || memcmp(buf,s,slen)) return REDIS_ERR;
3176 if (longval) *longval = value;
3177 return REDIS_OK;
3178 }
3179
3180 /* Try to encode a string object in order to save space */
3181 static robj *tryObjectEncoding(robj *o) {
3182 long value;
3183 sds s = o->ptr;
3184
3185 if (o->encoding != REDIS_ENCODING_RAW)
3186 return o; /* Already encoded */
3187
3188 /* It's not safe to encode shared objects: shared objects can be shared
3189 * everywhere in the "object space" of Redis. Encoded objects can only
3190 * appear as "values" (and not, for instance, as keys) */
3191 if (o->refcount > 1) return o;
3192
3193 /* Currently we try to encode only strings */
3194 redisAssert(o->type == REDIS_STRING);
3195
3196 /* Check if we can represent this string as a long integer */
3197 if (isStringRepresentableAsLong(s,&value) == REDIS_ERR) return o;
3198
3199 /* Ok, this object can be encoded */
3200 if (value >= 0 && value < REDIS_SHARED_INTEGERS) {
3201 decrRefCount(o);
3202 incrRefCount(shared.integers[value]);
3203 return shared.integers[value];
3204 } else {
3205 o->encoding = REDIS_ENCODING_INT;
3206 sdsfree(o->ptr);
3207 o->ptr = (void*) value;
3208 return o;
3209 }
3210 }
3211
3212 /* Get a decoded version of an encoded object (returned as a new object).
3213 * If the object is already raw-encoded just increment the ref count. */
3214 static robj *getDecodedObject(robj *o) {
3215 robj *dec;
3216
3217 if (o->encoding == REDIS_ENCODING_RAW) {
3218 incrRefCount(o);
3219 return o;
3220 }
3221 if (o->type == REDIS_STRING && o->encoding == REDIS_ENCODING_INT) {
3222 char buf[32];
3223
3224 ll2string(buf,32,(long)o->ptr);
3225 dec = createStringObject(buf,strlen(buf));
3226 return dec;
3227 } else {
3228 redisPanic("Unknown encoding type");
3229 }
3230 }
3231
3232 /* Compare two string objects via strcmp() or alike.
3233 * Note that the objects may be integer-encoded. In such a case we
3234 * use ll2string() to get a string representation of the numbers on the stack
3235 * and compare the strings, it's much faster than calling getDecodedObject().
3236 *
3237 * Important note: if objects are not integer encoded, but binary-safe strings,
3238 * sdscmp() from sds.c will apply memcmp() so this function ca be considered
3239 * binary safe. */
3240 static int compareStringObjects(robj *a, robj *b) {
3241 redisAssert(a->type == REDIS_STRING && b->type == REDIS_STRING);
3242 char bufa[128], bufb[128], *astr, *bstr;
3243 int bothsds = 1;
3244
3245 if (a == b) return 0;
3246 if (a->encoding != REDIS_ENCODING_RAW) {
3247 ll2string(bufa,sizeof(bufa),(long) a->ptr);
3248 astr = bufa;
3249 bothsds = 0;
3250 } else {
3251 astr = a->ptr;
3252 }
3253 if (b->encoding != REDIS_ENCODING_RAW) {
3254 ll2string(bufb,sizeof(bufb),(long) b->ptr);
3255 bstr = bufb;
3256 bothsds = 0;
3257 } else {
3258 bstr = b->ptr;
3259 }
3260 return bothsds ? sdscmp(astr,bstr) : strcmp(astr,bstr);
3261 }
3262
3263 /* Equal string objects return 1 if the two objects are the same from the
3264 * point of view of a string comparison, otherwise 0 is returned. Note that
3265 * this function is faster then checking for (compareStringObject(a,b) == 0)
3266 * because it can perform some more optimization. */
3267 static int equalStringObjects(robj *a, robj *b) {
3268 if (a->encoding != REDIS_ENCODING_RAW && b->encoding != REDIS_ENCODING_RAW){
3269 return a->ptr == b->ptr;
3270 } else {
3271 return compareStringObjects(a,b) == 0;
3272 }
3273 }
3274
3275 static size_t stringObjectLen(robj *o) {
3276 redisAssert(o->type == REDIS_STRING);
3277 if (o->encoding == REDIS_ENCODING_RAW) {
3278 return sdslen(o->ptr);
3279 } else {
3280 char buf[32];
3281
3282 return ll2string(buf,32,(long)o->ptr);
3283 }
3284 }
3285
3286 static int getDoubleFromObject(robj *o, double *target) {
3287 double value;
3288 char *eptr;
3289
3290 if (o == NULL) {
3291 value = 0;
3292 } else {
3293 redisAssert(o->type == REDIS_STRING);
3294 if (o->encoding == REDIS_ENCODING_RAW) {
3295 value = strtod(o->ptr, &eptr);
3296 if (eptr[0] != '\0') return REDIS_ERR;
3297 } else if (o->encoding == REDIS_ENCODING_INT) {
3298 value = (long)o->ptr;
3299 } else {
3300 redisPanic("Unknown string encoding");
3301 }
3302 }
3303
3304 *target = value;
3305 return REDIS_OK;
3306 }
3307
3308 static int getDoubleFromObjectOrReply(redisClient *c, robj *o, double *target, const char *msg) {
3309 double value;
3310 if (getDoubleFromObject(o, &value) != REDIS_OK) {
3311 if (msg != NULL) {
3312 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3313 } else {
3314 addReplySds(c, sdsnew("-ERR value is not a double\r\n"));
3315 }
3316 return REDIS_ERR;
3317 }
3318
3319 *target = value;
3320 return REDIS_OK;
3321 }
3322
3323 static int getLongLongFromObject(robj *o, long long *target) {
3324 long long value;
3325 char *eptr;
3326
3327 if (o == NULL) {
3328 value = 0;
3329 } else {
3330 redisAssert(o->type == REDIS_STRING);
3331 if (o->encoding == REDIS_ENCODING_RAW) {
3332 value = strtoll(o->ptr, &eptr, 10);
3333 if (eptr[0] != '\0') return REDIS_ERR;
3334 } else if (o->encoding == REDIS_ENCODING_INT) {
3335 value = (long)o->ptr;
3336 } else {
3337 redisPanic("Unknown string encoding");
3338 }
3339 }
3340
3341 *target = value;
3342 return REDIS_OK;
3343 }
3344
3345 static int getLongLongFromObjectOrReply(redisClient *c, robj *o, long long *target, const char *msg) {
3346 long long value;
3347 if (getLongLongFromObject(o, &value) != REDIS_OK) {
3348 if (msg != NULL) {
3349 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3350 } else {
3351 addReplySds(c, sdsnew("-ERR value is not an integer\r\n"));
3352 }
3353 return REDIS_ERR;
3354 }
3355
3356 *target = value;
3357 return REDIS_OK;
3358 }
3359
3360 static int getLongFromObjectOrReply(redisClient *c, robj *o, long *target, const char *msg) {
3361 long long value;
3362
3363 if (getLongLongFromObjectOrReply(c, o, &value, msg) != REDIS_OK) return REDIS_ERR;
3364 if (value < LONG_MIN || value > LONG_MAX) {
3365 if (msg != NULL) {
3366 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3367 } else {
3368 addReplySds(c, sdsnew("-ERR value is out of range\r\n"));
3369 }
3370 return REDIS_ERR;
3371 }
3372
3373 *target = value;
3374 return REDIS_OK;
3375 }
3376
3377 /* =========================== Keyspace access API ========================== */
3378
3379 static robj *lookupKey(redisDb *db, robj *key) {
3380 dictEntry *de = dictFind(db->dict,key->ptr);
3381 if (de) {
3382 robj *val = dictGetEntryVal(de);
3383
3384 if (server.vm_enabled) {
3385 if (val->storage == REDIS_VM_MEMORY ||
3386 val->storage == REDIS_VM_SWAPPING)
3387 {
3388 /* If we were swapping the object out, cancel the operation */
3389 if (val->storage == REDIS_VM_SWAPPING)
3390 vmCancelThreadedIOJob(val);
3391 /* Update the access time for the aging algorithm. */
3392 val->lru = server.lruclock;
3393 } else {
3394 int notify = (val->storage == REDIS_VM_LOADING);
3395
3396 /* Our value was swapped on disk. Bring it at home. */
3397 redisAssert(val->type == REDIS_VMPOINTER);
3398 val = vmLoadObject(val);
3399 dictGetEntryVal(de) = val;
3400
3401 /* Clients blocked by the VM subsystem may be waiting for
3402 * this key... */
3403 if (notify) handleClientsBlockedOnSwappedKey(db,key);
3404 }
3405 }
3406 return val;
3407 } else {
3408 return NULL;
3409 }
3410 }
3411
3412 static robj *lookupKeyRead(redisDb *db, robj *key) {
3413 expireIfNeeded(db,key);
3414 return lookupKey(db,key);
3415 }
3416
3417 static robj *lookupKeyWrite(redisDb *db, robj *key) {
3418 deleteIfVolatile(db,key);
3419 touchWatchedKey(db,key);
3420 return lookupKey(db,key);
3421 }
3422
3423 static robj *lookupKeyReadOrReply(redisClient *c, robj *key, robj *reply) {
3424 robj *o = lookupKeyRead(c->db, key);
3425 if (!o) addReply(c,reply);
3426 return o;
3427 }
3428
3429 static robj *lookupKeyWriteOrReply(redisClient *c, robj *key, robj *reply) {
3430 robj *o = lookupKeyWrite(c->db, key);
3431 if (!o) addReply(c,reply);
3432 return o;
3433 }
3434
3435 /* Add the key to the DB. If the key already exists REDIS_ERR is returned,
3436 * otherwise REDIS_OK is returned, and the caller should increment the
3437 * refcount of 'val'. */
3438 static int dbAdd(redisDb *db, robj *key, robj *val) {
3439 /* Perform a lookup before adding the key, as we need to copy the
3440 * key value. */
3441 if (dictFind(db->dict, key->ptr) != NULL) {
3442 return REDIS_ERR;
3443 } else {
3444 sds copy = sdsdup(key->ptr);
3445 dictAdd(db->dict, copy, val);
3446 return REDIS_OK;
3447 }
3448 }
3449
3450 /* If the key does not exist, this is just like dbAdd(). Otherwise
3451 * the value associated to the key is replaced with the new one.
3452 *
3453 * On update (key already existed) 0 is returned. Otherwise 1. */
3454 static int dbReplace(redisDb *db, robj *key, robj *val) {
3455 if (dictFind(db->dict,key->ptr) == NULL) {
3456 sds copy = sdsdup(key->ptr);
3457 dictAdd(db->dict, copy, val);
3458 return 1;
3459 } else {
3460 dictReplace(db->dict, key->ptr, val);
3461 return 0;
3462 }
3463 }
3464
3465 static int dbExists(redisDb *db, robj *key) {
3466 return dictFind(db->dict,key->ptr) != NULL;
3467 }
3468
3469 /* Return a random key, in form of a Redis object.
3470 * If there are no keys, NULL is returned.
3471 *
3472 * The function makes sure to return keys not already expired. */
3473 static robj *dbRandomKey(redisDb *db) {
3474 struct dictEntry *de;
3475
3476 while(1) {
3477 sds key;
3478 robj *keyobj;
3479
3480 de = dictGetRandomKey(db->dict);
3481 if (de == NULL) return NULL;
3482
3483 key = dictGetEntryKey(de);
3484 keyobj = createStringObject(key,sdslen(key));
3485 if (dictFind(db->expires,key)) {
3486 if (expireIfNeeded(db,keyobj)) {
3487 decrRefCount(keyobj);
3488 continue; /* search for another key. This expired. */
3489 }
3490 }
3491 return keyobj;
3492 }
3493 }
3494
3495 /* Delete a key, value, and associated expiration entry if any, from the DB */
3496 static int dbDelete(redisDb *db, robj *key) {
3497 int retval;
3498
3499 if (dictSize(db->expires)) dictDelete(db->expires,key->ptr);
3500 retval = dictDelete(db->dict,key->ptr);
3501
3502 return retval == DICT_OK;
3503 }
3504
3505 /*============================ RDB saving/loading =========================== */
3506
/* Write the one-byte 'type' to 'fp'.
 * Returns 0 on success, -1 if the byte could not be written. */
static int rdbSaveType(FILE *fp, unsigned char type) {
    return (fwrite(&type,1,1,fp) == 0) ? -1 : 0;
}
3511
/* Write 't', truncated to a 32 bit signed integer, to 'fp' as 4 raw bytes
 * in host byte order. Returns 0 on success, -1 on write error. */
static int rdbSaveTime(FILE *fp, time_t t) {
    int32_t secs = (int32_t) t;

    return (fwrite(&secs,sizeof(secs),1,fp) == 0) ? -1 : 0;
}
3517
3518 /* check rdbLoadLen() comments for more info */
3519 static int rdbSaveLen(FILE *fp, uint32_t len) {
3520 unsigned char buf[2];
3521
3522 if (len < (1<<6)) {
3523 /* Save a 6 bit len */
3524 buf[0] = (len&0xFF)|(REDIS_RDB_6BITLEN<<6);
3525 if (fwrite(buf,1,1,fp) == 0) return -1;
3526 } else if (len < (1<<14)) {
3527 /* Save a 14 bit len */
3528 buf[0] = ((len>>8)&0xFF)|(REDIS_RDB_14BITLEN<<6);
3529 buf[1] = len&0xFF;
3530 if (fwrite(buf,2,1,fp) == 0) return -1;
3531 } else {
3532 /* Save a 32 bit len */
3533 buf[0] = (REDIS_RDB_32BITLEN<<6);
3534 if (fwrite(buf,1,1,fp) == 0) return -1;
3535 len = htonl(len);
3536 if (fwrite(&len,4,1,fp) == 0) return -1;
3537 }
3538 return 0;
3539 }
3540
3541 /* Encode 'value' as an integer if possible (if integer will fit the
3542 * supported range). If the function sucessful encoded the integer
3543 * then the (up to 5 bytes) encoded representation is written in the
3544 * string pointed by 'enc' and the length is returned. Otherwise
3545 * 0 is returned. */
3546 static int rdbEncodeInteger(long long value, unsigned char *enc) {
3547 /* Finally check if it fits in our ranges */
3548 if (value >= -(1<<7) && value <= (1<<7)-1) {
3549 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT8;
3550 enc[1] = value&0xFF;
3551 return 2;
3552 } else if (value >= -(1<<15) && value <= (1<<15)-1) {
3553 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT16;
3554 enc[1] = value&0xFF;
3555 enc[2] = (value>>8)&0xFF;
3556 return 3;
3557 } else if (value >= -((long long)1<<31) && value <= ((long long)1<<31)-1) {
3558 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT32;
3559 enc[1] = value&0xFF;
3560 enc[2] = (value>>8)&0xFF;
3561 enc[3] = (value>>16)&0xFF;
3562 enc[4] = (value>>24)&0xFF;
3563 return 5;
3564 } else {
3565 return 0;
3566 }
3567 }
3568
/* Strings such as "2391" or "-100", with no spaces or other decoration and
 * a value that fits in an 8, 16 or 32 bit signed integer, can be stored as
 * integers to save space.
 *
 * 's' points to 'len' bytes and is NOT required to be NUL terminated:
 * rdbSaveRawString() forwards (pointer,length) pairs, so the bytes are
 * copied into a terminated local buffer before being parsed.
 *
 * On success the encoded form is written to 'enc' (up to 5 bytes) and its
 * length is returned. 0 is returned when the string can't be encoded. */
static int rdbTryIntegerEncoding(char *s, size_t len, unsigned char *enc) {
    long long value;
    char *endptr, buf[32], tmp[32];

    /* An empty string, or one longer than any printed long long, can never
     * survive the round trip check below. */
    if (len == 0 || len >= sizeof(tmp)) return 0;
    memcpy(tmp,s,len);
    tmp[len] = '\0';

    /* Check if it's possible to encode this value as a number */
    value = strtoll(tmp, &endptr, 10);
    if (endptr[0] != '\0') return 0;
    ll2string(buf,32,value);

    /* If the number converted back into a string is not identical
     * then it's not possible to encode the string as integer */
    if (strlen(buf) != len || memcmp(buf,s,len)) return 0;

    return rdbEncodeInteger(value,enc);
}
3587
3588 static int rdbSaveLzfStringObject(FILE *fp, unsigned char *s, size_t len) {
3589 size_t comprlen, outlen;
3590 unsigned char byte;
3591 void *out;
3592
3593 /* We require at least four bytes compression for this to be worth it */
3594 if (len <= 4) return 0;
3595 outlen = len-4;
3596 if ((out = zmalloc(outlen+1)) == NULL) return 0;
3597 comprlen = lzf_compress(s, len, out, outlen);
3598 if (comprlen == 0) {
3599 zfree(out);
3600 return 0;
3601 }
3602 /* Data compressed! Let's save it on disk */
3603 byte = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_LZF;
3604 if (fwrite(&byte,1,1,fp) == 0) goto writeerr;
3605 if (rdbSaveLen(fp,comprlen) == -1) goto writeerr;
3606 if (rdbSaveLen(fp,len) == -1) goto writeerr;
3607 if (fwrite(out,comprlen,1,fp) == 0) goto writeerr;
3608 zfree(out);
3609 return comprlen;
3610
3611 writeerr:
3612 zfree(out);
3613 return -1;
3614 }
3615
3616 /* Save a string objet as [len][data] on disk. If the object is a string
3617 * representation of an integer value we try to safe it in a special form */
3618 static int rdbSaveRawString(FILE *fp, unsigned char *s, size_t len) {
3619 int enclen;
3620
3621 /* Try integer encoding */
3622 if (len <= 11) {
3623 unsigned char buf[5];
3624 if ((enclen = rdbTryIntegerEncoding((char*)s,len,buf)) > 0) {
3625 if (fwrite(buf,enclen,1,fp) == 0) return -1;
3626 return 0;
3627 }
3628 }
3629
3630 /* Try LZF compression - under 20 bytes it's unable to compress even
3631 * aaaaaaaaaaaaaaaaaa so skip it */
3632 if (server.rdbcompression && len > 20) {
3633 int retval;
3634
3635 retval = rdbSaveLzfStringObject(fp,s,len);
3636 if (retval == -1) return -1;
3637 if (retval > 0) return 0;
3638 /* retval == 0 means data can't be compressed, save the old way */
3639 }
3640
3641 /* Store verbatim */
3642 if (rdbSaveLen(fp,len) == -1) return -1;
3643 if (len && fwrite(s,len,1,fp) == 0) return -1;
3644 return 0;
3645 }
3646
3647 /* Like rdbSaveStringObjectRaw() but handle encoded objects */
3648 static int rdbSaveStringObject(FILE *fp, robj *obj) {
3649 int retval;
3650
3651 /* Avoid to decode the object, then encode it again, if the
3652 * object is alrady integer encoded. */
3653 if (obj->encoding == REDIS_ENCODING_INT) {
3654 long val = (long) obj->ptr;
3655 unsigned char buf[5];
3656 int enclen;
3657
3658 if ((enclen = rdbEncodeInteger(val,buf)) > 0) {
3659 if (fwrite(buf,enclen,1,fp) == 0) return -1;
3660 return 0;
3661 }
3662 /* otherwise... fall throught and continue with the usual
3663 * code path. */
3664 }
3665
3666 /* Avoid incr/decr ref count business when possible.
3667 * This plays well with copy-on-write given that we are probably
3668 * in a child process (BGSAVE). Also this makes sure key objects
3669 * of swapped objects are not incRefCount-ed (an assert does not allow
3670 * this in order to avoid bugs) */
3671 if (obj->encoding != REDIS_ENCODING_RAW) {
3672 obj = getDecodedObject(obj);
3673 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
3674 decrRefCount(obj);
3675 } else {
3676 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
3677 }
3678 return retval;
3679 }
3680
/* Save a double value. Doubles are saved as strings prefixed by an unsigned
 * 8 bit integer specifying the length of the representation.
 * This 8 bit integer has special values in order to specify the following
 * conditions:
 * 253: not a number
 * 254: + inf
 * 255: - inf
 *
 * Returns 0 on success, -1 on write error.
 */
static int rdbSaveDoubleValue(FILE *fp, double val) {
    unsigned char buf[128];
    int len;

    if (isnan(val)) {
        buf[0] = 253;
        len = 1;
    } else if (!isfinite(val)) {
        len = 1;
        buf[0] = (val < 0) ? 255 : 254;
    } else {
#if (DBL_MANT_DIG >= 52) && (LLONG_MAX == 0x7fffffffffffffffLL)
        /* Check if the float is in a safe range to be casted into a
         * long long. We are assuming that long long is 64 bit here.
         * Also we are assuming that there are no implementations around where
         * double has precision < 52 bit.
         *
         * Under this assumptions we test if a double is inside an interval
         * where casting to long long is safe. Then using two castings we
         * make sure the decimal part is zero. If all this is true we use
         * integer printing function that is much faster. */
        double min = -4503599627370495; /* -((2^52)-1) */
        double max = 4503599627370496; /* 2^52 */
        if (val > min && val < max && val == ((double)((long long)val)))
            /* The text starts at buf+1, so only sizeof(buf)-1 bytes are
             * available to ll2string(). */
            ll2string((char*)buf+1,sizeof(buf)-1,(long long)val);
        else
#endif
            snprintf((char*)buf+1,sizeof(buf)-1,"%.17g",val);
        buf[0] = strlen((char*)buf+1);
        len = buf[0]+1;
    }
    if (fwrite(buf,len,1,fp) == 0) return -1;
    return 0;
}
3723
3724 /* Save a Redis object. */
3725 static int rdbSaveObject(FILE *fp, robj *o) {
3726 if (o->type == REDIS_STRING) {
3727 /* Save a string value */
3728 if (rdbSaveStringObject(fp,o) == -1) return -1;
3729 } else if (o->type == REDIS_LIST) {
3730 /* Save a list value */
3731 list *list = o->ptr;
3732 listIter li;
3733 listNode *ln;
3734
3735 if (rdbSaveLen(fp,listLength(list)) == -1) return -1;
3736 listRewind(list,&li);
3737 while((ln = listNext(&li))) {
3738 robj *eleobj = listNodeValue(ln);
3739
3740 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3741 }
3742 } else if (o->type == REDIS_SET) {
3743 /* Save a set value */
3744 dict *set = o->ptr;
3745 dictIterator *di = dictGetIterator(set);
3746 dictEntry *de;
3747
3748 if (rdbSaveLen(fp,dictSize(set)) == -1) return -1;
3749 while((de = dictNext(di)) != NULL) {
3750 robj *eleobj = dictGetEntryKey(de);
3751
3752 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3753 }
3754 dictReleaseIterator(di);
3755 } else if (o->type == REDIS_ZSET) {
3756 /* Save a set value */
3757 zset *zs = o->ptr;
3758 dictIterator *di = dictGetIterator(zs->dict);
3759 dictEntry *de;
3760
3761 if (rdbSaveLen(fp,dictSize(zs->dict)) == -1) return -1;
3762 while((de = dictNext(di)) != NULL) {
3763 robj *eleobj = dictGetEntryKey(de);
3764 double *score = dictGetEntryVal(de);
3765
3766 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3767 if (rdbSaveDoubleValue(fp,*score) == -1) return -1;
3768 }
3769 dictReleaseIterator(di);
3770 } else if (o->type == REDIS_HASH) {
3771 /* Save a hash value */
3772 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
3773 unsigned char *p = zipmapRewind(o->ptr);
3774 unsigned int count = zipmapLen(o->ptr);
3775 unsigned char *key, *val;
3776 unsigned int klen, vlen;
3777
3778 if (rdbSaveLen(fp,count) == -1) return -1;
3779 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
3780 if (rdbSaveRawString(fp,key,klen) == -1) return -1;
3781 if (rdbSaveRawString(fp,val,vlen) == -1) return -1;
3782 }
3783 } else {
3784 dictIterator *di = dictGetIterator(o->ptr);
3785 dictEntry *de;
3786
3787 if (rdbSaveLen(fp,dictSize((dict*)o->ptr)) == -1) return -1;
3788 while((de = dictNext(di)) != NULL) {
3789 robj *key = dictGetEntryKey(de);
3790 robj *val = dictGetEntryVal(de);
3791
3792 if (rdbSaveStringObject(fp,key) == -1) return -1;
3793 if (rdbSaveStringObject(fp,val) == -1) return -1;
3794 }
3795 dictReleaseIterator(di);
3796 }
3797 } else {
3798 redisPanic("Unknown object type");
3799 }
3800 return 0;
3801 }
3802
3803 /* Return the length the object will have on disk if saved with
3804 * the rdbSaveObject() function. Currently we use a trick to get
3805 * this length with very little changes to the code. In the future
3806 * we could switch to a faster solution. */
3807 static off_t rdbSavedObjectLen(robj *o, FILE *fp) {
3808 if (fp == NULL) fp = server.devnull;
3809 rewind(fp);
3810 assert(rdbSaveObject(fp,o) != 1);
3811 return ftello(fp);
3812 }
3813
3814 /* Return the number of pages required to save this object in the swap file */
3815 static off_t rdbSavedObjectPages(robj *o, FILE *fp) {
3816 off_t bytes = rdbSavedObjectLen(o,fp);
3817
3818 return (bytes+(server.vm_page_size-1))/server.vm_page_size;
3819 }
3820
3821 /* Save the DB on disk. Return REDIS_ERR on error, REDIS_OK on success */
3822 static int rdbSave(char *filename) {
3823 dictIterator *di = NULL;
3824 dictEntry *de;
3825 FILE *fp;
3826 char tmpfile[256];
3827 int j;
3828 time_t now = time(NULL);
3829
3830 /* Wait for I/O therads to terminate, just in case this is a
3831 * foreground-saving, to avoid seeking the swap file descriptor at the
3832 * same time. */
3833 if (server.vm_enabled)
3834 waitEmptyIOJobsQueue();
3835
3836 snprintf(tmpfile,256,"temp-%d.rdb", (int) getpid());
3837 fp = fopen(tmpfile,"w");
3838 if (!fp) {
3839 redisLog(REDIS_WARNING, "Failed saving the DB: %s", strerror(errno));
3840 return REDIS_ERR;
3841 }
3842 if (fwrite("REDIS0001",9,1,fp) == 0) goto werr;
3843 for (j = 0; j < server.dbnum; j++) {
3844 redisDb *db = server.db+j;
3845 dict *d = db->dict;
3846 if (dictSize(d) == 0) continue;
3847 di = dictGetIterator(d);
3848 if (!di) {
3849 fclose(fp);
3850 return REDIS_ERR;
3851 }
3852
3853 /* Write the SELECT DB opcode */
3854 if (rdbSaveType(fp,REDIS_SELECTDB) == -1) goto werr;
3855 if (rdbSaveLen(fp,j) == -1) goto werr;
3856
3857 /* Iterate this DB writing every entry */
3858 while((de = dictNext(di)) != NULL) {
3859 sds keystr = dictGetEntryKey(de);
3860 robj key, *o = dictGetEntryVal(de);
3861 time_t expiretime;
3862
3863 initStaticStringObject(key,keystr);
3864 expiretime = getExpire(db,&key);
3865
3866 /* Save the expire time */
3867 if (expiretime != -1) {
3868 /* If this key is already expired skip it */
3869 if (expiretime < now) continue;
3870 if (rdbSaveType(fp,REDIS_EXPIRETIME) == -1) goto werr;
3871 if (rdbSaveTime(fp,expiretime) == -1) goto werr;
3872 }
3873 /* Save the key and associated value. This requires special
3874 * handling if the value is swapped out. */
3875 if (!server.vm_enabled || o->storage == REDIS_VM_MEMORY ||
3876 o->storage == REDIS_VM_SWAPPING) {
3877 /* Save type, key, value */
3878 if (rdbSaveType(fp,o->type) == -1) goto werr;
3879 if (rdbSaveStringObject(fp,&key) == -1) goto werr;
3880 if (rdbSaveObject(fp,o) == -1) goto werr;
3881 } else {
3882 /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
3883 robj *po;
3884 /* Get a preview of the object in memory */
3885 po = vmPreviewObject(o);
3886 /* Save type, key, value */
3887 if (rdbSaveType(fp,po->type) == -1) goto werr;
3888 if (rdbSaveStringObject(fp,&key) == -1) goto werr;
3889 if (rdbSaveObject(fp,po) == -1) goto werr;
3890 /* Remove the loaded object from memory */
3891 decrRefCount(po);
3892 }
3893 }
3894 dictReleaseIterator(di);
3895 }
3896 /* EOF opcode */
3897 if (rdbSaveType(fp,REDIS_EOF) == -1) goto werr;
3898
3899 /* Make sure data will not remain on the OS's output buffers */
3900 fflush(fp);
3901 fsync(fileno(fp));
3902 fclose(fp);
3903
3904 /* Use RENAME to make sure the DB file is changed atomically only
3905 * if the generate DB file is ok. */
3906 if (rename(tmpfile,filename) == -1) {
3907 redisLog(REDIS_WARNING,"Error moving temp DB file on the final destination: %s", strerror(errno));
3908 unlink(tmpfile);
3909 return REDIS_ERR;
3910 }
3911 redisLog(REDIS_NOTICE,"DB saved on disk");
3912 server.dirty = 0;
3913 server.lastsave = time(NULL);
3914 return REDIS_OK;
3915
3916 werr:
3917 fclose(fp);
3918 unlink(tmpfile);
3919 redisLog(REDIS_WARNING,"Write error saving DB on disk: %s", strerror(errno));
3920 if (di) dictReleaseIterator(di);
3921 return REDIS_ERR;
3922 }
3923
3924 static int rdbSaveBackground(char *filename) {
3925 pid_t childpid;
3926
3927 if (server.bgsavechildpid != -1) return REDIS_ERR;
3928 if (server.vm_enabled) waitEmptyIOJobsQueue();
3929 if ((childpid = fork()) == 0) {
3930 /* Child */
3931 if (server.vm_enabled) vmReopenSwapFile();
3932 close(server.fd);
3933 if (rdbSave(filename) == REDIS_OK) {
3934 _exit(0);
3935 } else {
3936 _exit(1);
3937 }
3938 } else {
3939 /* Parent */
3940 if (childpid == -1) {
3941 redisLog(REDIS_WARNING,"Can't save in background: fork: %s",
3942 strerror(errno));
3943 return REDIS_ERR;
3944 }
3945 redisLog(REDIS_NOTICE,"Background saving started by pid %d",childpid);
3946 server.bgsavechildpid = childpid;
3947 updateDictResizePolicy();
3948 return REDIS_OK;
3949 }
3950 return REDIS_OK; /* unreached */
3951 }
3952
/* Remove the "temp-<pid>.rdb" file that a saving process with pid
 * 'childpid' writes to. The result of unlink() is ignored. */
static void rdbRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-%d.rdb", (int) childpid);
    unlink(path);
}
3959
/* Read one byte from 'fp' and return it as a value in the 0-255 range,
 * or -1 if nothing could be read. */
static int rdbLoadType(FILE *fp) {
    unsigned char type;

    return (fread(&type,1,1,fp) == 0) ? -1 : type;
}
3965
/* Read a time stored by rdbSaveTime(): 4 raw bytes holding a signed 32 bit
 * integer in host byte order. Returns the value as a time_t, or -1 when
 * the 4 bytes could not be read. */
static time_t rdbLoadTime(FILE *fp) {
    int32_t secs;

    if (fread(&secs,sizeof(secs),1,fp) == 0) return -1;
    return (time_t) secs;
}
3971
3972 /* Load an encoded length from the DB, see the REDIS_RDB_* defines on the top
3973 * of this file for a description of how this are stored on disk.
3974 *
3975 * isencoded is set to 1 if the readed length is not actually a length but
3976 * an "encoding type", check the above comments for more info */
3977 static uint32_t rdbLoadLen(FILE *fp, int *isencoded) {
3978 unsigned char buf[2];
3979 uint32_t len;
3980 int type;
3981
3982 if (isencoded) *isencoded = 0;
3983 if (fread(buf,1,1,fp) == 0) return REDIS_RDB_LENERR;
3984 type = (buf[0]&0xC0)>>6;
3985 if (type == REDIS_RDB_6BITLEN) {
3986 /* Read a 6 bit len */
3987 return buf[0]&0x3F;
3988 } else if (type == REDIS_RDB_ENCVAL) {
3989 /* Read a 6 bit len encoding type */
3990 if (isencoded) *isencoded = 1;
3991 return buf[0]&0x3F;
3992 } else if (type == REDIS_RDB_14BITLEN) {
3993 /* Read a 14 bit len */
3994 if (fread(buf+1,1,1,fp) == 0) return REDIS_RDB_LENERR;
3995 return ((buf[0]&0x3F)<<8)|buf[1];
3996 } else {
3997 /* Read a 32 bit len */
3998 if (fread(&len,4,1,fp) == 0) return REDIS_RDB_LENERR;
3999 return ntohl(len);
4000 }
4001 }
4002
4003 /* Load an integer-encoded object from file 'fp', with the specified
4004 * encoding type 'enctype'. If encode is true the function may return
4005 * an integer-encoded object as reply, otherwise the returned object
4006 * will always be encoded as a raw string. */
4007 static robj *rdbLoadIntegerObject(FILE *fp, int enctype, int encode) {
4008 unsigned char enc[4];
4009 long long val;
4010
4011 if (enctype == REDIS_RDB_ENC_INT8) {
4012 if (fread(enc,1,1,fp) == 0) return NULL;
4013 val = (signed char)enc[0];
4014 } else if (enctype == REDIS_RDB_ENC_INT16) {
4015 uint16_t v;
4016 if (fread(enc,2,1,fp) == 0) return NULL;
4017 v = enc[0]|(enc[1]<<8);
4018 val = (int16_t)v;
4019 } else if (enctype == REDIS_RDB_ENC_INT32) {
4020 uint32_t v;
4021 if (fread(enc,4,1,fp) == 0) return NULL;
4022 v = enc[0]|(enc[1]<<8)|(enc[2]<<16)|(enc[3]<<24);
4023 val = (int32_t)v;
4024 } else {
4025 val = 0; /* anti-warning */
4026 redisPanic("Unknown RDB integer encoding type");
4027 }
4028 if (encode)
4029 return createStringObjectFromLongLong(val);
4030 else
4031 return createObject(REDIS_STRING,sdsfromlonglong(val));
4032 }
4033
4034 static robj *rdbLoadLzfStringObject(FILE*fp) {
4035 unsigned int len, clen;
4036 unsigned char *c = NULL;
4037 sds val = NULL;
4038
4039 if ((clen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4040 if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4041 if ((c = zmalloc(clen)) == NULL) goto err;
4042 if ((val = sdsnewlen(NULL,len)) == NULL) goto err;
4043 if (fread(c,clen,1,fp) == 0) goto err;
4044 if (lzf_decompress(c,clen,val,len) == 0) goto err;
4045 zfree(c);
4046 return createObject(REDIS_STRING,val);
4047 err:
4048 zfree(c);
4049 sdsfree(val);
4050 return NULL;
4051 }
4052
4053 static robj *rdbGenericLoadStringObject(FILE*fp, int encode) {
4054 int isencoded;
4055 uint32_t len;
4056 sds val;
4057
4058 len = rdbLoadLen(fp,&isencoded);
4059 if (isencoded) {
4060 switch(len) {
4061 case REDIS_RDB_ENC_INT8:
4062 case REDIS_RDB_ENC_INT16:
4063 case REDIS_RDB_ENC_INT32:
4064 return rdbLoadIntegerObject(fp,len,encode);
4065 case REDIS_RDB_ENC_LZF:
4066 return rdbLoadLzfStringObject(fp);
4067 default:
4068 redisPanic("Unknown RDB encoding type");
4069 }
4070 }
4071
4072 if (len == REDIS_RDB_LENERR) return NULL;
4073 val = sdsnewlen(NULL,len);
4074 if (len && fread(val,len,1,fp) == 0) {
4075 sdsfree(val);
4076 return NULL;
4077 }
4078 return createObject(REDIS_STRING,val);
4079 }
4080
4081 static robj *rdbLoadStringObject(FILE *fp) {
4082 return rdbGenericLoadStringObject(fp,0);
4083 }
4084
4085 static robj *rdbLoadEncodedStringObject(FILE *fp) {
4086 return rdbGenericLoadStringObject(fp,1);
4087 }
4088
4089 /* For information about double serialization check rdbSaveDoubleValue() */
4090 static int rdbLoadDoubleValue(FILE *fp, double *val) {
4091 char buf[128];
4092 unsigned char len;
4093
4094 if (fread(&len,1,1,fp) == 0) return -1;
4095 switch(len) {
4096 case 255: *val = R_NegInf; return 0;
4097 case 254: *val = R_PosInf; return 0;
4098 case 253: *val = R_Nan; return 0;
4099 default:
4100 if (fread(buf,len,1,fp) == 0) return -1;
4101 buf[len] = '\0';
4102 sscanf(buf, "%lg", val);
4103 return 0;
4104 }
4105 }
4106
4107 /* Load a Redis object of the specified type from the specified file.
4108 * On success a newly allocated object is returned, otherwise NULL. */
4109 static robj *rdbLoadObject(int type, FILE *fp) {
4110 robj *o;
4111
4112 redisLog(REDIS_DEBUG,"LOADING OBJECT %d (at %d)\n",type,ftell(fp));
4113 if (type == REDIS_STRING) {
4114 /* Read string value */
4115 if ((o = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
4116 o = tryObjectEncoding(o);
4117 } else if (type == REDIS_LIST || type == REDIS_SET) {
4118 /* Read list/set value */
4119 uint32_t listlen;
4120
4121 if ((listlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4122 o = (type == REDIS_LIST) ? createListObject() : createSetObject();
4123 /* It's faster to expand the dict to the right size asap in order
4124 * to avoid rehashing */
4125 if (type == REDIS_SET && listlen > DICT_HT_INITIAL_SIZE)
4126 dictExpand(o->ptr,listlen);
4127 /* Load every single element of the list/set */
4128 while(listlen--) {
4129 robj *ele;
4130
4131 if ((ele = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
4132 ele = tryObjectEncoding(ele);
4133 if (type == REDIS_LIST) {
4134 listAddNodeTail((list*)o->ptr,ele);
4135 } else {
4136 dictAdd((dict*)o->ptr,ele,NULL);
4137 }
4138 }
4139 } else if (type == REDIS_ZSET) {
4140 /* Read list/set value */
4141 size_t zsetlen;
4142 zset *zs;
4143
4144 if ((zsetlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4145 o = createZsetObject();
4146 zs = o->ptr;
4147 /* Load every single element of the list/set */
4148 while(zsetlen--) {
4149 robj *ele;
4150 double *score = zmalloc(sizeof(double));
4151
4152 if ((ele = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
4153 ele = tryObjectEncoding(ele);
4154 if (rdbLoadDoubleValue(fp,score) == -1) return NULL;
4155 dictAdd(zs->dict,ele,score);
4156 zslInsert(zs->zsl,*score,ele);
4157 incrRefCount(ele); /* added to skiplist */
4158 }
4159 } else if (type == REDIS_HASH) {
4160 size_t hashlen;
4161
4162 if ((hashlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4163 o = createHashObject();
4164 /* Too many entries? Use an hash table. */
4165 if (hashlen > server.hash_max_zipmap_entries)
4166 convertToRealHash(o);
4167 /* Load every key/value, then set it into the zipmap or hash
4168 * table, as needed. */
4169 while(hashlen--) {
4170 robj *key, *val;
4171
4172 if ((key = rdbLoadStringObject(fp)) == NULL) return NULL;
4173 if ((val = rdbLoadStringObject(fp)) == NULL) return NULL;
4174 /* If we are using a zipmap and there are too big values
4175 * the object is converted to real hash table encoding. */
4176 if (o->encoding != REDIS_ENCODING_HT &&
4177 (sdslen(key->ptr) > server.hash_max_zipmap_value ||
4178 sdslen(val->ptr) > server.hash_max_zipmap_value))
4179 {
4180 convertToRealHash(o);
4181 }
4182
4183 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
4184 unsigned char *zm = o->ptr;
4185
4186 zm = zipmapSet(zm,key->ptr,sdslen(key->ptr),
4187 val->ptr,sdslen(val->ptr),NULL);
4188 o->ptr = zm;
4189 decrRefCount(key);
4190 decrRefCount(val);
4191 } else {
4192 key = tryObjectEncoding(key);
4193 val = tryObjectEncoding(val);
4194 dictAdd((dict*)o->ptr,key,val);
4195 }
4196 }
4197 } else {
4198 redisPanic("Unknown object type");
4199 }
4200 return o;
4201 }
4202
4203 static int rdbLoad(char *filename) {
4204 FILE *fp;
4205 uint32_t dbid;
4206 int type, retval, rdbver;
4207 int swap_all_values = 0;
4208 redisDb *db = server.db+0;
4209 char buf[1024];
4210 time_t expiretime, now = time(NULL);
4211
4212 fp = fopen(filename,"r");
4213 if (!fp) return REDIS_ERR;
4214 if (fread(buf,9,1,fp) == 0) goto eoferr;
4215 buf[9] = '\0';
4216 if (memcmp(buf,"REDIS",5) != 0) {
4217 fclose(fp);
4218 redisLog(REDIS_WARNING,"Wrong signature trying to load DB from file");
4219 return REDIS_ERR;
4220 }
4221 rdbver = atoi(buf+5);
4222 if (rdbver != 1) {
4223 fclose(fp);
4224 redisLog(REDIS_WARNING,"Can't handle RDB format version %d",rdbver);
4225 return REDIS_ERR;
4226 }
4227 while(1) {
4228 robj *key, *val;
4229 int force_swapout;
4230
4231 expiretime = -1;
4232 /* Read type. */
4233 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
4234 if (type == REDIS_EXPIRETIME) {
4235 if ((expiretime = rdbLoadTime(fp)) == -1) goto eoferr;
4236 /* We read the time so we need to read the object type again */
4237 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
4238 }
4239 if (type == REDIS_EOF) break;
4240 /* Handle SELECT DB opcode as a special case */
4241 if (type == REDIS_SELECTDB) {
4242 if ((dbid = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR)
4243 goto eoferr;
4244 if (dbid >= (unsigned)server.dbnum) {
4245 redisLog(REDIS_WARNING,"FATAL: Data file was created with a Redis server configured to handle more than %d databases. Exiting\n", server.dbnum);
4246 exit(1);
4247 }
4248 db = server.db+dbid;
4249 continue;
4250 }
4251 /* Read key */
4252 if ((key = rdbLoadStringObject(fp)) == NULL) goto eoferr;
4253 /* Read value */
4254 if ((val = rdbLoadObject(type,fp)) == NULL) goto eoferr;
4255 /* Check if the key already expired */
4256 if (expiretime != -1 && expiretime < now) {
4257 decrRefCount(key);
4258 decrRefCount(val);
4259 continue;
4260 }
4261 /* Add the new object in the hash table */
4262 retval = dbAdd(db,key,val);
4263 if (retval == REDIS_ERR) {
4264 redisLog(REDIS_WARNING,"Loading DB, duplicated key (%s) found! Unrecoverable error, exiting now.", key->ptr);
4265 exit(1);
4266 }
4267 /* Set the expire time if needed */
4268 if (expiretime != -1) setExpire(db,key,expiretime);
4269
4270 /* Handle swapping while loading big datasets when VM is on */
4271
4272 /* If we detecter we are hopeless about fitting something in memory
4273 * we just swap every new key on disk. Directly...
4274 * Note that's important to check for this condition before resorting
4275 * to random sampling, otherwise we may try to swap already
4276 * swapped keys. */
4277 if (swap_all_values) {
4278 dictEntry *de = dictFind(db->dict,key->ptr);
4279
4280 /* de may be NULL since the key already expired */
4281 if (de) {
4282 vmpointer *vp;
4283 val = dictGetEntryVal(de);
4284
4285 if (val->refcount == 1 &&
4286 (vp = vmSwapObjectBlocking(val)) != NULL)
4287 dictGetEntryVal(de) = vp;
4288 }
4289 decrRefCount(key);
4290 continue;
4291 }
4292 decrRefCount(key);
4293
4294 /* Flush data on disk once 32 MB of additional RAM are used... */
4295 force_swapout = 0;
4296 if ((zmalloc_used_memory() - server.vm_max_memory) > 1024*1024*32)
4297 force_swapout = 1;
4298
4299 /* If we have still some hope of having some value fitting memory
4300 * then we try random sampling. */
4301 if (!swap_all_values && server.vm_enabled && force_swapout) {
4302 while (zmalloc_used_memory() > server.vm_max_memory) {
4303 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
4304 }
4305 if (zmalloc_used_memory() > server.vm_max_memory)
4306 swap_all_values = 1; /* We are already using too much mem */
4307 }
4308 }
4309 fclose(fp);
4310 return REDIS_OK;
4311
4312 eoferr: /* unexpected end of file is handled here with a fatal exit */
4313 redisLog(REDIS_WARNING,"Short read or OOM loading DB. Unrecoverable error, aborting now.");
4314 exit(1);
4315 return REDIS_ERR; /* Just to avoid warning */
4316 }
4317
4318 /*================================== Shutdown =============================== */
4319 static int prepareForShutdown() {
4320 redisLog(REDIS_WARNING,"User requested shutdown, saving DB...");
4321 /* Kill the saving child if there is a background saving in progress.
4322 We want to avoid race conditions, for instance our saving child may
4323 overwrite the synchronous saving did by SHUTDOWN. */
4324 if (server.bgsavechildpid != -1) {
4325 redisLog(REDIS_WARNING,"There is a live saving child. Killing it!");
4326 kill(server.bgsavechildpid,SIGKILL);
4327 rdbRemoveTempFile(server.bgsavechildpid);
4328 }
4329 if (server.appendonly) {
4330 /* Append only file: fsync() the AOF and exit */
4331 aof_fsync(server.appendfd);
4332 if (server.vm_enabled) unlink(server.vm_swap_file);
4333 } else {
4334 /* Snapshotting. Perform a SYNC SAVE and exit */
4335 if (rdbSave(server.dbfilename) == REDIS_OK) {
4336 if (server.daemonize)
4337 unlink(server.pidfile);
4338 redisLog(REDIS_WARNING,"%zu bytes used at exit",zmalloc_used_memory());
4339 } else {
4340 /* Ooops.. error saving! The best we can do is to continue
4341 * operating. Note that if there was a background saving process,
4342 * in the next cron() Redis will be notified that the background
4343 * saving aborted, handling special stuff like slaves pending for
4344 * synchronization... */
4345 redisLog(REDIS_WARNING,"Error trying to save the DB, can't exit");
4346 return REDIS_ERR;
4347 }
4348 }
4349 redisLog(REDIS_WARNING,"Server exit now, bye bye...");
4350 return REDIS_OK;
4351 }
4352
4353 /*================================== Commands =============================== */
4354
4355 static void authCommand(redisClient *c) {
4356 if (!server.requirepass || !strcmp(c->argv[1]->ptr, server.requirepass)) {
4357 c->authenticated = 1;
4358 addReply(c,shared.ok);
4359 } else {
4360 c->authenticated = 0;
4361 addReplySds(c,sdscatprintf(sdsempty(),"-ERR invalid password\r\n"));
4362 }
4363 }
4364
4365 static void pingCommand(redisClient *c) {
4366 addReply(c,shared.pong);
4367 }
4368
4369 static void echoCommand(redisClient *c) {
4370 addReplyBulk(c,c->argv[1]);
4371 }
4372
4373 /*=================================== Strings =============================== */
4374
4375 static void setGenericCommand(redisClient *c, int nx, robj *key, robj *val, robj *expire) {
4376 int retval;
4377 long seconds = 0; /* initialized to avoid an harmness warning */
4378
4379 if (expire) {
4380 if (getLongFromObjectOrReply(c, expire, &seconds, NULL) != REDIS_OK)
4381 return;
4382 if (seconds <= 0) {
4383 addReplySds(c,sdsnew("-ERR invalid expire time in SETEX\r\n"));
4384 return;
4385 }
4386 }
4387
4388 touchWatchedKey(c->db,key);
4389 if (nx) deleteIfVolatile(c->db,key);
4390 retval = dbAdd(c->db,key,val);
4391 if (retval == REDIS_ERR) {
4392 if (!nx) {
4393 dbReplace(c->db,key,val);
4394 incrRefCount(val);
4395 } else {
4396 addReply(c,shared.czero);
4397 return;
4398 }
4399 } else {
4400 incrRefCount(val);
4401 }
4402 server.dirty++;
4403 removeExpire(c->db,key);
4404 if (expire) setExpire(c->db,key,time(NULL)+seconds);
4405 addReply(c, nx ? shared.cone : shared.ok);
4406 }
4407
4408 static void setCommand(redisClient *c) {
4409 setGenericCommand(c,0,c->argv[1],c->argv[2],NULL);
4410 }
4411
4412 static void setnxCommand(redisClient *c) {
4413 setGenericCommand(c,1,c->argv[1],c->argv[2],NULL);
4414 }
4415
4416 static void setexCommand(redisClient *c) {
4417 setGenericCommand(c,0,c->argv[1],c->argv[3],c->argv[2]);
4418 }
4419
4420 static int getGenericCommand(redisClient *c) {
4421 robj *o;
4422
4423 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL)
4424 return REDIS_OK;
4425
4426 if (o->type != REDIS_STRING) {
4427 addReply(c,shared.wrongtypeerr);
4428 return REDIS_ERR;
4429 } else {
4430 addReplyBulk(c,o);
4431 return REDIS_OK;
4432 }
4433 }
4434
4435 static void getCommand(redisClient *c) {
4436 getGenericCommand(c);
4437 }
4438
4439 static void getsetCommand(redisClient *c) {
4440 if (getGenericCommand(c) == REDIS_ERR) return;
4441 dbReplace(c->db,c->argv[1],c->argv[2]);
4442 incrRefCount(c->argv[2]);
4443 server.dirty++;
4444 removeExpire(c->db,c->argv[1]);
4445 }
4446
4447 static void mgetCommand(redisClient *c) {
4448 int j;
4449
4450 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-1));
4451 for (j = 1; j < c->argc; j++) {
4452 robj *o = lookupKeyRead(c->db,c->argv[j]);
4453 if (o == NULL) {
4454 addReply(c,shared.nullbulk);
4455 } else {
4456 if (o->type != REDIS_STRING) {
4457 addReply(c,shared.nullbulk);
4458 } else {
4459 addReplyBulk(c,o);
4460 }
4461 }
4462 }
4463 }
4464
4465 static void msetGenericCommand(redisClient *c, int nx) {
4466 int j, busykeys = 0;
4467
4468 if ((c->argc % 2) == 0) {
4469 addReplySds(c,sdsnew("-ERR wrong number of arguments for MSET\r\n"));
4470 return;
4471 }
4472 /* Handle the NX flag. The MSETNX semantic is to return zero and don't
4473 * set nothing at all if at least one already key exists. */
4474 if (nx) {
4475 for (j = 1; j < c->argc; j += 2) {
4476 if (lookupKeyWrite(c->db,c->argv[j]) != NULL) {
4477 busykeys++;
4478 }
4479 }
4480 }
4481 if (busykeys) {
4482 addReply(c, shared.czero);
4483 return;
4484 }
4485
4486 for (j = 1; j < c->argc; j += 2) {
4487 c->argv[j+1] = tryObjectEncoding(c->argv[j+1]);
4488 dbReplace(c->db,c->argv[j],c->argv[j+1]);
4489 incrRefCount(c->argv[j+1]);
4490 removeExpire(c->db,c->argv[j]);
4491 }
4492 server.dirty += (c->argc-1)/2;
4493 addReply(c, nx ? shared.cone : shared.ok);
4494 }
4495
4496 static void msetCommand(redisClient *c) {
4497 msetGenericCommand(c,0);
4498 }
4499
4500 static void msetnxCommand(redisClient *c) {
4501 msetGenericCommand(c,1);
4502 }
4503
4504 static void incrDecrCommand(redisClient *c, long long incr) {
4505 long long value;
4506 robj *o;
4507
4508 o = lookupKeyWrite(c->db,c->argv[1]);
4509 if (o != NULL && checkType(c,o,REDIS_STRING)) return;
4510 if (getLongLongFromObjectOrReply(c,o,&value,NULL) != REDIS_OK) return;
4511
4512 value += incr;
4513 o = createStringObjectFromLongLong(value);
4514 dbReplace(c->db,c->argv[1],o);
4515 server.dirty++;
4516 addReply(c,shared.colon);
4517 addReply(c,o);
4518 addReply(c,shared.crlf);
4519 }
4520
4521 static void incrCommand(redisClient *c) {
4522 incrDecrCommand(c,1);
4523 }
4524
4525 static void decrCommand(redisClient *c) {
4526 incrDecrCommand(c,-1);
4527 }
4528
4529 static void incrbyCommand(redisClient *c) {
4530 long long incr;
4531
4532 if (getLongLongFromObjectOrReply(c, c->argv[2], &incr, NULL) != REDIS_OK) return;
4533 incrDecrCommand(c,incr);
4534 }
4535
4536 static void decrbyCommand(redisClient *c) {
4537 long long incr;
4538
4539 if (getLongLongFromObjectOrReply(c, c->argv[2], &incr, NULL) != REDIS_OK) return;
4540 incrDecrCommand(c,-incr);
4541 }
4542
4543 static void appendCommand(redisClient *c) {
4544 int retval;
4545 size_t totlen;
4546 robj *o;
4547
4548 o = lookupKeyWrite(c->db,c->argv[1]);
4549 if (o == NULL) {
4550 /* Create the key */
4551 retval = dbAdd(c->db,c->argv[1],c->argv[2]);
4552 incrRefCount(c->argv[2]);
4553 totlen = stringObjectLen(c->argv[2]);
4554 } else {
4555 if (o->type != REDIS_STRING) {
4556 addReply(c,shared.wrongtypeerr);
4557 return;
4558 }
4559 /* If the object is specially encoded or shared we have to make
4560 * a copy */
4561 if (o->refcount != 1 || o->encoding != REDIS_ENCODING_RAW) {
4562 robj *decoded = getDecodedObject(o);
4563
4564 o = createStringObject(decoded->ptr, sdslen(decoded->ptr));
4565 decrRefCount(decoded);
4566 dbReplace(c->db,c->argv[1],o);
4567 }
4568 /* APPEND! */
4569 if (c->argv[2]->encoding == REDIS_ENCODING_RAW) {
4570 o->ptr = sdscatlen(o->ptr,
4571 c->argv[2]->ptr, sdslen(c->argv[2]->ptr));
4572 } else {
4573 o->ptr = sdscatprintf(o->ptr, "%ld",
4574 (unsigned long) c->argv[2]->ptr);
4575 }
4576 totlen = sdslen(o->ptr);
4577 }
4578 server.dirty++;
4579 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",(unsigned long)totlen));
4580 }
4581
4582 static void substrCommand(redisClient *c) {
4583 robj *o;
4584 long start = atoi(c->argv[2]->ptr);
4585 long end = atoi(c->argv[3]->ptr);
4586 size_t rangelen, strlen;
4587 sds range;
4588
4589 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4590 checkType(c,o,REDIS_STRING)) return;
4591
4592 o = getDecodedObject(o);
4593 strlen = sdslen(o->ptr);
4594
4595 /* convert negative indexes */
4596 if (start < 0) start = strlen+start;
4597 if (end < 0) end = strlen+end;
4598 if (start < 0) start = 0;
4599 if (end < 0) end = 0;
4600
4601 /* indexes sanity checks */
4602 if (start > end || (size_t)start >= strlen) {
4603 /* Out of range start or start > end result in null reply */
4604 addReply(c,shared.nullbulk);
4605 decrRefCount(o);
4606 return;
4607 }
4608 if ((size_t)end >= strlen) end = strlen-1;
4609 rangelen = (end-start)+1;
4610
4611 /* Return the result */
4612 addReplySds(c,sdscatprintf(sdsempty(),"$%zu\r\n",rangelen));
4613 range = sdsnewlen((char*)o->ptr+start,rangelen);
4614 addReplySds(c,range);
4615 addReply(c,shared.crlf);
4616 decrRefCount(o);
4617 }
4618
4619 /* ========================= Type agnostic commands ========================= */
4620
4621 static void delCommand(redisClient *c) {
4622 int deleted = 0, j;
4623
4624 for (j = 1; j < c->argc; j++) {
4625 if (dbDelete(c->db,c->argv[j])) {
4626 touchWatchedKey(c->db,c->argv[j]);
4627 server.dirty++;
4628 deleted++;
4629 }
4630 }
4631 addReplyLongLong(c,deleted);
4632 }
4633
4634 static void existsCommand(redisClient *c) {
4635 expireIfNeeded(c->db,c->argv[1]);
4636 if (dbExists(c->db,c->argv[1])) {
4637 addReply(c, shared.cone);
4638 } else {
4639 addReply(c, shared.czero);
4640 }
4641 }
4642
4643 static void selectCommand(redisClient *c) {
4644 int id = atoi(c->argv[1]->ptr);
4645
4646 if (selectDb(c,id) == REDIS_ERR) {
4647 addReplySds(c,sdsnew("-ERR invalid DB index\r\n"));
4648 } else {
4649 addReply(c,shared.ok);
4650 }
4651 }
4652
4653 static void randomkeyCommand(redisClient *c) {
4654 robj *key;
4655
4656 if ((key = dbRandomKey(c->db)) == NULL) {
4657 addReply(c,shared.nullbulk);
4658 return;
4659 }
4660
4661 addReplyBulk(c,key);
4662 decrRefCount(key);
4663 }
4664
4665 static void keysCommand(redisClient *c) {
4666 dictIterator *di;
4667 dictEntry *de;
4668 sds pattern = c->argv[1]->ptr;
4669 int plen = sdslen(pattern);
4670 unsigned long numkeys = 0;
4671 robj *lenobj = createObject(REDIS_STRING,NULL);
4672
4673 di = dictGetIterator(c->db->dict);
4674 addReply(c,lenobj);
4675 decrRefCount(lenobj);
4676 while((de = dictNext(di)) != NULL) {
4677 sds key = dictGetEntryKey(de);
4678 robj *keyobj;
4679
4680 if ((pattern[0] == '*' && pattern[1] == '\0') ||
4681 stringmatchlen(pattern,plen,key,sdslen(key),0)) {
4682 keyobj = createStringObject(key,sdslen(key));
4683 if (expireIfNeeded(c->db,keyobj) == 0) {
4684 addReplyBulk(c,keyobj);
4685 numkeys++;
4686 }
4687 decrRefCount(keyobj);
4688 }
4689 }
4690 dictReleaseIterator(di);
4691 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",numkeys);
4692 }
4693
4694 static void dbsizeCommand(redisClient *c) {
4695 addReplySds(c,
4696 sdscatprintf(sdsempty(),":%lu\r\n",dictSize(c->db->dict)));
4697 }
4698
4699 static void lastsaveCommand(redisClient *c) {
4700 addReplySds(c,
4701 sdscatprintf(sdsempty(),":%lu\r\n",server.lastsave));
4702 }
4703
4704 static void typeCommand(redisClient *c) {
4705 robj *o;
4706 char *type;
4707
4708 o = lookupKeyRead(c->db,c->argv[1]);
4709 if (o == NULL) {
4710 type = "+none";
4711 } else {
4712 switch(o->type) {
4713 case REDIS_STRING: type = "+string"; break;
4714 case REDIS_LIST: type = "+list"; break;
4715 case REDIS_SET: type = "+set"; break;
4716 case REDIS_ZSET: type = "+zset"; break;
4717 case REDIS_HASH: type = "+hash"; break;
4718 default: type = "+unknown"; break;
4719 }
4720 }
4721 addReplySds(c,sdsnew(type));
4722 addReply(c,shared.crlf);
4723 }
4724
4725 static void saveCommand(redisClient *c) {
4726 if (server.bgsavechildpid != -1) {
4727 addReplySds(c,sdsnew("-ERR background save in progress\r\n"));
4728 return;
4729 }
4730 if (rdbSave(server.dbfilename) == REDIS_OK) {
4731 addReply(c,shared.ok);
4732 } else {
4733 addReply(c,shared.err);
4734 }
4735 }
4736
4737 static void bgsaveCommand(redisClient *c) {
4738 if (server.bgsavechildpid != -1) {
4739 addReplySds(c,sdsnew("-ERR background save already in progress\r\n"));
4740 return;
4741 }
4742 if (rdbSaveBackground(server.dbfilename) == REDIS_OK) {
4743 char *status = "+Background saving started\r\n";
4744 addReplySds(c,sdsnew(status));
4745 } else {
4746 addReply(c,shared.err);
4747 }
4748 }
4749
4750 static void shutdownCommand(redisClient *c) {
4751 if (prepareForShutdown() == REDIS_OK)
4752 exit(0);
4753 addReplySds(c, sdsnew("-ERR Errors trying to SHUTDOWN. Check logs.\r\n"));
4754 }
4755
4756 static void renameGenericCommand(redisClient *c, int nx) {
4757 robj *o;
4758
4759 /* To use the same key as src and dst is probably an error */
4760 if (sdscmp(c->argv[1]->ptr,c->argv[2]->ptr) == 0) {
4761 addReply(c,shared.sameobjecterr);
4762 return;
4763 }
4764
4765 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL)
4766 return;
4767
4768 incrRefCount(o);
4769 deleteIfVolatile(c->db,c->argv[2]);
4770 if (dbAdd(c->db,c->argv[2],o) == REDIS_ERR) {
4771 if (nx) {
4772 decrRefCount(o);
4773 addReply(c,shared.czero);
4774 return;
4775 }
4776 dbReplace(c->db,c->argv[2],o);
4777 }
4778 dbDelete(c->db,c->argv[1]);
4779 touchWatchedKey(c->db,c->argv[2]);
4780 server.dirty++;
4781 addReply(c,nx ? shared.cone : shared.ok);
4782 }
4783
4784 static void renameCommand(redisClient *c) {
4785 renameGenericCommand(c,0);
4786 }
4787
4788 static void renamenxCommand(redisClient *c) {
4789 renameGenericCommand(c,1);
4790 }
4791
4792 static void moveCommand(redisClient *c) {
4793 robj *o;
4794 redisDb *src, *dst;
4795 int srcid;
4796
4797 /* Obtain source and target DB pointers */
4798 src = c->db;
4799 srcid = c->db->id;
4800 if (selectDb(c,atoi(c->argv[2]->ptr)) == REDIS_ERR) {
4801 addReply(c,shared.outofrangeerr);
4802 return;
4803 }
4804 dst = c->db;
4805 selectDb(c,srcid); /* Back to the source DB */
4806
4807 /* If the user is moving using as target the same
4808 * DB as the source DB it is probably an error. */
4809 if (src == dst) {
4810 addReply(c,shared.sameobjecterr);
4811 return;
4812 }
4813
4814 /* Check if the element exists and get a reference */
4815 o = lookupKeyWrite(c->db,c->argv[1]);
4816 if (!o) {
4817 addReply(c,shared.czero);
4818 return;
4819 }
4820
4821 /* Try to add the element to the target DB */
4822 deleteIfVolatile(dst,c->argv[1]);
4823 if (dbAdd(dst,c->argv[1],o) == REDIS_ERR) {
4824 addReply(c,shared.czero);
4825 return;
4826 }
4827 incrRefCount(o);
4828
4829 /* OK! key moved, free the entry in the source DB */
4830 dbDelete(src,c->argv[1]);
4831 server.dirty++;
4832 addReply(c,shared.cone);
4833 }
4834
4835 /* =================================== Lists ================================ */
4836 static void pushGenericCommand(redisClient *c, int where) {
4837 robj *lobj;
4838 list *list;
4839
4840 lobj = lookupKeyWrite(c->db,c->argv[1]);
4841 if (lobj == NULL) {
4842 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
4843 addReply(c,shared.cone);
4844 return;
4845 }
4846 lobj = createListObject();
4847 list = lobj->ptr;
4848 if (where == REDIS_HEAD) {
4849 listAddNodeHead(list,c->argv[2]);
4850 } else {
4851 listAddNodeTail(list,c->argv[2]);
4852 }
4853 incrRefCount(c->argv[2]);
4854 dbAdd(c->db,c->argv[1],lobj);
4855 } else {
4856 if (lobj->type != REDIS_LIST) {
4857 addReply(c,shared.wrongtypeerr);
4858 return;
4859 }
4860 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
4861 addReply(c,shared.cone);
4862 return;
4863 }
4864 list = lobj->ptr;
4865 if (where == REDIS_HEAD) {
4866 listAddNodeHead(list,c->argv[2]);
4867 } else {
4868 listAddNodeTail(list,c->argv[2]);
4869 }
4870 incrRefCount(c->argv[2]);
4871 }
4872 server.dirty++;
4873 addReplyLongLong(c,listLength(list));
4874 }
4875
4876 static void lpushCommand(redisClient *c) {
4877 pushGenericCommand(c,REDIS_HEAD);
4878 }
4879
4880 static void rpushCommand(redisClient *c) {
4881 pushGenericCommand(c,REDIS_TAIL);
4882 }
4883
4884 static void llenCommand(redisClient *c) {
4885 robj *o;
4886 list *l;
4887
4888 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4889 checkType(c,o,REDIS_LIST)) return;
4890
4891 l = o->ptr;
4892 addReplyUlong(c,listLength(l));
4893 }
4894
4895 static void lindexCommand(redisClient *c) {
4896 robj *o;
4897 int index = atoi(c->argv[2]->ptr);
4898 list *list;
4899 listNode *ln;
4900
4901 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4902 checkType(c,o,REDIS_LIST)) return;
4903 list = o->ptr;
4904
4905 ln = listIndex(list, index);
4906 if (ln == NULL) {
4907 addReply(c,shared.nullbulk);
4908 } else {
4909 robj *ele = listNodeValue(ln);
4910 addReplyBulk(c,ele);
4911 }
4912 }
4913
4914 static void lsetCommand(redisClient *c) {
4915 robj *o;
4916 int index = atoi(c->argv[2]->ptr);
4917 list *list;
4918 listNode *ln;
4919
4920 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL ||
4921 checkType(c,o,REDIS_LIST)) return;
4922 list = o->ptr;
4923
4924 ln = listIndex(list, index);
4925 if (ln == NULL) {
4926 addReply(c,shared.outofrangeerr);
4927 } else {
4928 robj *ele = listNodeValue(ln);
4929
4930 decrRefCount(ele);
4931 listNodeValue(ln) = c->argv[3];
4932 incrRefCount(c->argv[3]);
4933 addReply(c,shared.ok);
4934 server.dirty++;
4935 }
4936 }
4937
4938 static void popGenericCommand(redisClient *c, int where) {
4939 robj *o;
4940 list *list;
4941 listNode *ln;
4942
4943 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4944 checkType(c,o,REDIS_LIST)) return;
4945 list = o->ptr;
4946
4947 if (where == REDIS_HEAD)
4948 ln = listFirst(list);
4949 else
4950 ln = listLast(list);
4951
4952 if (ln == NULL) {
4953 addReply(c,shared.nullbulk);
4954 } else {
4955 robj *ele = listNodeValue(ln);
4956 addReplyBulk(c,ele);
4957 listDelNode(list,ln);
4958 if (listLength(list) == 0) dbDelete(c->db,c->argv[1]);
4959 server.dirty++;
4960 }
4961 }
4962
4963 static void lpopCommand(redisClient *c) {
4964 popGenericCommand(c,REDIS_HEAD);
4965 }
4966
4967 static void rpopCommand(redisClient *c) {
4968 popGenericCommand(c,REDIS_TAIL);
4969 }
4970
4971 static void lrangeCommand(redisClient *c) {
4972 robj *o;
4973 int start = atoi(c->argv[2]->ptr);
4974 int end = atoi(c->argv[3]->ptr);
4975 int llen;
4976 int rangelen, j;
4977 list *list;
4978 listNode *ln;
4979 robj *ele;
4980
4981 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
4982 || checkType(c,o,REDIS_LIST)) return;
4983 list = o->ptr;
4984 llen = listLength(list);
4985
4986 /* convert negative indexes */
4987 if (start < 0) start = llen+start;
4988 if (end < 0) end = llen+end;
4989 if (start < 0) start = 0;
4990 if (end < 0) end = 0;
4991
4992 /* indexes sanity checks */
4993 if (start > end || start >= llen) {
4994 /* Out of range start or start > end result in empty list */
4995 addReply(c,shared.emptymultibulk);
4996 return;
4997 }
4998 if (end >= llen) end = llen-1;
4999 rangelen = (end-start)+1;
5000
5001 /* Return the result in form of a multi-bulk reply */
5002 ln = listIndex(list, start);
5003 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",rangelen));
5004 for (j = 0; j < rangelen; j++) {
5005 ele = listNodeValue(ln);
5006 addReplyBulk(c,ele);
5007 ln = ln->next;
5008 }
5009 }
5010
5011 static void ltrimCommand(redisClient *c) {
5012 robj *o;
5013 int start = atoi(c->argv[2]->ptr);
5014 int end = atoi(c->argv[3]->ptr);
5015 int llen;
5016 int j, ltrim, rtrim;
5017 list *list;
5018 listNode *ln;
5019
5020 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.ok)) == NULL ||
5021 checkType(c,o,REDIS_LIST)) return;
5022 list = o->ptr;
5023 llen = listLength(list);
5024
5025 /* convert negative indexes */
5026 if (start < 0) start = llen+start;
5027 if (end < 0) end = llen+end;
5028 if (start < 0) start = 0;
5029 if (end < 0) end = 0;
5030
5031 /* indexes sanity checks */
5032 if (start > end || start >= llen) {
5033 /* Out of range start or start > end result in empty list */
5034 ltrim = llen;
5035 rtrim = 0;
5036 } else {
5037 if (end >= llen) end = llen-1;
5038 ltrim = start;
5039 rtrim = llen-end-1;
5040 }
5041
5042 /* Remove list elements to perform the trim */
5043 for (j = 0; j < ltrim; j++) {
5044 ln = listFirst(list);
5045 listDelNode(list,ln);
5046 }
5047 for (j = 0; j < rtrim; j++) {
5048 ln = listLast(list);
5049 listDelNode(list,ln);
5050 }
5051 if (listLength(list) == 0) dbDelete(c->db,c->argv[1]);
5052 server.dirty++;
5053 addReply(c,shared.ok);
5054 }
5055
5056 static void lremCommand(redisClient *c) {
5057 robj *o;
5058 list *list;
5059 listNode *ln, *next;
5060 int toremove = atoi(c->argv[2]->ptr);
5061 int removed = 0;
5062 int fromtail = 0;
5063
5064 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5065 checkType(c,o,REDIS_LIST)) return;
5066 list = o->ptr;
5067
5068 if (toremove < 0) {
5069 toremove = -toremove;
5070 fromtail = 1;
5071 }
5072 ln = fromtail ? list->tail : list->head;
5073 while (ln) {
5074 robj *ele = listNodeValue(ln);
5075
5076 next = fromtail ? ln->prev : ln->next;
5077 if (equalStringObjects(ele,c->argv[3])) {
5078 listDelNode(list,ln);
5079 server.dirty++;
5080 removed++;
5081 if (toremove && removed == toremove) break;
5082 }
5083 ln = next;
5084 }
5085 if (listLength(list) == 0) dbDelete(c->db,c->argv[1]);
5086 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",removed));
5087 }
5088
5089 /* This is the semantic of this command:
5090 * RPOPLPUSH srclist dstlist:
5091 * IF LLEN(srclist) > 0
5092 * element = RPOP srclist
5093 * LPUSH dstlist element
5094 * RETURN element
5095 * ELSE
5096 * RETURN nil
5097 * END
5098 * END
5099 *
5100 * The idea is to be able to get an element from a list in a reliable way
5101 * since the element is not just returned but pushed against another list
5102 * as well. This command was originally proposed by Ezra Zygmuntowicz.
5103 */
5104 static void rpoplpushcommand(redisClient *c) {
5105 robj *sobj;
5106 list *srclist;
5107 listNode *ln;
5108
5109 if ((sobj = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5110 checkType(c,sobj,REDIS_LIST)) return;
5111 srclist = sobj->ptr;
5112 ln = listLast(srclist);
5113
5114 if (ln == NULL) {
5115 addReply(c,shared.nullbulk);
5116 } else {
5117 robj *dobj = lookupKeyWrite(c->db,c->argv[2]);
5118 robj *ele = listNodeValue(ln);
5119 list *dstlist;
5120
5121 if (dobj && dobj->type != REDIS_LIST) {
5122 addReply(c,shared.wrongtypeerr);
5123 return;
5124 }
5125
5126 /* Add the element to the target list (unless it's directly
5127 * passed to some BLPOP-ing client */
5128 if (!handleClientsWaitingListPush(c,c->argv[2],ele)) {
5129 if (dobj == NULL) {
5130 /* Create the list if the key does not exist */
5131 dobj = createListObject();
5132 dbAdd(c->db,c->argv[2],dobj);
5133 }
5134 dstlist = dobj->ptr;
5135 listAddNodeHead(dstlist,ele);
5136 incrRefCount(ele);
5137 }
5138
5139 /* Send the element to the client as reply as well */
5140 addReplyBulk(c,ele);
5141
5142 /* Finally remove the element from the source list */
5143 listDelNode(srclist,ln);
5144 if (listLength(srclist) == 0) dbDelete(c->db,c->argv[1]);
5145 server.dirty++;
5146 }
5147 }
5148
5149 /* ==================================== Sets ================================ */
5150
5151 static void saddCommand(redisClient *c) {
5152 robj *set;
5153
5154 set = lookupKeyWrite(c->db,c->argv[1]);
5155 if (set == NULL) {
5156 set = createSetObject();
5157 dbAdd(c->db,c->argv[1],set);
5158 } else {
5159 if (set->type != REDIS_SET) {
5160 addReply(c,shared.wrongtypeerr);
5161 return;
5162 }
5163 }
5164 if (dictAdd(set->ptr,c->argv[2],NULL) == DICT_OK) {
5165 incrRefCount(c->argv[2]);
5166 server.dirty++;
5167 addReply(c,shared.cone);
5168 } else {
5169 addReply(c,shared.czero);
5170 }
5171 }
5172
5173 static void sremCommand(redisClient *c) {
5174 robj *set;
5175
5176 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5177 checkType(c,set,REDIS_SET)) return;
5178
5179 if (dictDelete(set->ptr,c->argv[2]) == DICT_OK) {
5180 server.dirty++;
5181 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
5182 if (dictSize((dict*)set->ptr) == 0) dbDelete(c->db,c->argv[1]);
5183 addReply(c,shared.cone);
5184 } else {
5185 addReply(c,shared.czero);
5186 }
5187 }
5188
5189 static void smoveCommand(redisClient *c) {
5190 robj *srcset, *dstset;
5191
5192 srcset = lookupKeyWrite(c->db,c->argv[1]);
5193 dstset = lookupKeyWrite(c->db,c->argv[2]);
5194
5195 /* If the source key does not exist return 0, if it's of the wrong type
5196 * raise an error */
5197 if (srcset == NULL || srcset->type != REDIS_SET) {
5198 addReply(c, srcset ? shared.wrongtypeerr : shared.czero);
5199 return;
5200 }
5201 /* Error if the destination key is not a set as well */
5202 if (dstset && dstset->type != REDIS_SET) {
5203 addReply(c,shared.wrongtypeerr);
5204 return;
5205 }
5206 /* Remove the element from the source set */
5207 if (dictDelete(srcset->ptr,c->argv[3]) == DICT_ERR) {
5208 /* Key not found in the src set! return zero */
5209 addReply(c,shared.czero);
5210 return;
5211 }
5212 if (dictSize((dict*)srcset->ptr) == 0 && srcset != dstset)
5213 dbDelete(c->db,c->argv[1]);
5214 server.dirty++;
5215 /* Add the element to the destination set */
5216 if (!dstset) {
5217 dstset = createSetObject();
5218 dbAdd(c->db,c->argv[2],dstset);
5219 }
5220 if (dictAdd(dstset->ptr,c->argv[3],NULL) == DICT_OK)
5221 incrRefCount(c->argv[3]);
5222 addReply(c,shared.cone);
5223 }
5224
5225 static void sismemberCommand(redisClient *c) {
5226 robj *set;
5227
5228 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
5229 checkType(c,set,REDIS_SET)) return;
5230
5231 if (dictFind(set->ptr,c->argv[2]))
5232 addReply(c,shared.cone);
5233 else
5234 addReply(c,shared.czero);
5235 }
5236
5237 static void scardCommand(redisClient *c) {
5238 robj *o;
5239 dict *s;
5240
5241 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
5242 checkType(c,o,REDIS_SET)) return;
5243
5244 s = o->ptr;
5245 addReplyUlong(c,dictSize(s));
5246 }
5247
5248 static void spopCommand(redisClient *c) {
5249 robj *set;
5250 dictEntry *de;
5251
5252 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5253 checkType(c,set,REDIS_SET)) return;
5254
5255 de = dictGetRandomKey(set->ptr);
5256 if (de == NULL) {
5257 addReply(c,shared.nullbulk);
5258 } else {
5259 robj *ele = dictGetEntryKey(de);
5260
5261 addReplyBulk(c,ele);
5262 dictDelete(set->ptr,ele);
5263 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
5264 if (dictSize((dict*)set->ptr) == 0) dbDelete(c->db,c->argv[1]);
5265 server.dirty++;
5266 }
5267 }
5268
5269 static void srandmemberCommand(redisClient *c) {
5270 robj *set;
5271 dictEntry *de;
5272
5273 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5274 checkType(c,set,REDIS_SET)) return;
5275
5276 de = dictGetRandomKey(set->ptr);
5277 if (de == NULL) {
5278 addReply(c,shared.nullbulk);
5279 } else {
5280 robj *ele = dictGetEntryKey(de);
5281
5282 addReplyBulk(c,ele);
5283 }
5284 }
5285
5286 static int qsortCompareSetsByCardinality(const void *s1, const void *s2) {
5287 dict **d1 = (void*) s1, **d2 = (void*) s2;
5288
5289 return dictSize(*d1)-dictSize(*d2);
5290 }
5291
5292 static void sinterGenericCommand(redisClient *c, robj **setskeys, unsigned long setsnum, robj *dstkey) {
5293 dict **dv = zmalloc(sizeof(dict*)*setsnum);
5294 dictIterator *di;
5295 dictEntry *de;
5296 robj *lenobj = NULL, *dstset = NULL;
5297 unsigned long j, cardinality = 0;
5298
5299 for (j = 0; j < setsnum; j++) {
5300 robj *setobj;
5301
5302 setobj = dstkey ?
5303 lookupKeyWrite(c->db,setskeys[j]) :
5304 lookupKeyRead(c->db,setskeys[j]);
5305 if (!setobj) {
5306 zfree(dv);
5307 if (dstkey) {
5308 if (dbDelete(c->db,dstkey))
5309 server.dirty++;
5310 addReply(c,shared.czero);
5311 } else {
5312 addReply(c,shared.emptymultibulk);
5313 }
5314 return;
5315 }
5316 if (setobj->type != REDIS_SET) {
5317 zfree(dv);
5318 addReply(c,shared.wrongtypeerr);
5319 return;
5320 }
5321 dv[j] = setobj->ptr;
5322 }
5323 /* Sort sets from the smallest to largest, this will improve our
5324 * algorithm's performace */
5325 qsort(dv,setsnum,sizeof(dict*),qsortCompareSetsByCardinality);
5326
5327 /* The first thing we should output is the total number of elements...
5328 * since this is a multi-bulk write, but at this stage we don't know
5329 * the intersection set size, so we use a trick, append an empty object
5330 * to the output list and save the pointer to later modify it with the
5331 * right length */
5332 if (!dstkey) {
5333 lenobj = createObject(REDIS_STRING,NULL);
5334 addReply(c,lenobj);
5335 decrRefCount(lenobj);
5336 } else {
5337 /* If we have a target key where to store the resulting set
5338 * create this key with an empty set inside */
5339 dstset = createSetObject();
5340 }
5341
5342 /* Iterate all the elements of the first (smallest) set, and test
5343 * the element against all the other sets, if at least one set does
5344 * not include the element it is discarded */
5345 di = dictGetIterator(dv[0]);
5346
5347 while((de = dictNext(di)) != NULL) {
5348 robj *ele;
5349
5350 for (j = 1; j < setsnum; j++)
5351 if (dictFind(dv[j],dictGetEntryKey(de)) == NULL) break;
5352 if (j != setsnum)
5353 continue; /* at least one set does not contain the member */
5354 ele = dictGetEntryKey(de);
5355 if (!dstkey) {
5356 addReplyBulk(c,ele);
5357 cardinality++;
5358 } else {
5359 dictAdd(dstset->ptr,ele,NULL);
5360 incrRefCount(ele);
5361 }
5362 }
5363 dictReleaseIterator(di);
5364
5365 if (dstkey) {
5366 /* Store the resulting set into the target, if the intersection
5367 * is not an empty set. */
5368 dbDelete(c->db,dstkey);
5369 if (dictSize((dict*)dstset->ptr) > 0) {
5370 dbAdd(c->db,dstkey,dstset);
5371 addReplyLongLong(c,dictSize((dict*)dstset->ptr));
5372 } else {
5373 decrRefCount(dstset);
5374 addReply(c,shared.czero);
5375 }
5376 server.dirty++;
5377 } else {
5378 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",cardinality);
5379 }
5380 zfree(dv);
5381 }
5382
5383 static void sinterCommand(redisClient *c) {
5384 sinterGenericCommand(c,c->argv+1,c->argc-1,NULL);
5385 }
5386
5387 static void sinterstoreCommand(redisClient *c) {
5388 sinterGenericCommand(c,c->argv+2,c->argc-2,c->argv[1]);
5389 }
5390
5391 #define REDIS_OP_UNION 0
5392 #define REDIS_OP_DIFF 1
5393 #define REDIS_OP_INTER 2
5394
5395 static void sunionDiffGenericCommand(redisClient *c, robj **setskeys, int setsnum, robj *dstkey, int op) {
5396 dict **dv = zmalloc(sizeof(dict*)*setsnum);
5397 dictIterator *di;
5398 dictEntry *de;
5399 robj *dstset = NULL;
5400 int j, cardinality = 0;
5401
5402 for (j = 0; j < setsnum; j++) {
5403 robj *setobj;
5404
5405 setobj = dstkey ?
5406 lookupKeyWrite(c->db,setskeys[j]) :
5407 lookupKeyRead(c->db,setskeys[j]);
5408 if (!setobj) {
5409 dv[j] = NULL;
5410 continue;
5411 }
5412 if (setobj->type != REDIS_SET) {
5413 zfree(dv);
5414 addReply(c,shared.wrongtypeerr);
5415 return;
5416 }
5417 dv[j] = setobj->ptr;
5418 }
5419
5420 /* We need a temp set object to store our union. If the dstkey
5421 * is not NULL (that is, we are inside an SUNIONSTORE operation) then
5422 * this set object will be the resulting object to set into the target key*/
5423 dstset = createSetObject();
5424
5425 /* Iterate all the elements of all the sets, add every element a single
5426 * time to the result set */
5427 for (j = 0; j < setsnum; j++) {
5428 if (op == REDIS_OP_DIFF && j == 0 && !dv[j]) break; /* result set is empty */
5429 if (!dv[j]) continue; /* non existing keys are like empty sets */
5430
5431 di = dictGetIterator(dv[j]);
5432
5433 while((de = dictNext(di)) != NULL) {
5434 robj *ele;
5435
5436 /* dictAdd will not add the same element multiple times */
5437 ele = dictGetEntryKey(de);
5438 if (op == REDIS_OP_UNION || j == 0) {
5439 if (dictAdd(dstset->ptr,ele,NULL) == DICT_OK) {
5440 incrRefCount(ele);
5441 cardinality++;
5442 }
5443 } else if (op == REDIS_OP_DIFF) {
5444 if (dictDelete(dstset->ptr,ele) == DICT_OK) {
5445 cardinality--;
5446 }
5447 }
5448 }
5449 dictReleaseIterator(di);
5450
5451 /* result set is empty? Exit asap. */
5452 if (op == REDIS_OP_DIFF && cardinality == 0) break;
5453 }
5454
5455 /* Output the content of the resulting set, if not in STORE mode */
5456 if (!dstkey) {
5457 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",cardinality));
5458 di = dictGetIterator(dstset->ptr);
5459 while((de = dictNext(di)) != NULL) {
5460 robj *ele;
5461
5462 ele = dictGetEntryKey(de);
5463 addReplyBulk(c,ele);
5464 }
5465 dictReleaseIterator(di);
5466 decrRefCount(dstset);
5467 } else {
5468 /* If we have a target key where to store the resulting set
5469 * create this key with the result set inside */
5470 dbDelete(c->db,dstkey);
5471 if (dictSize((dict*)dstset->ptr) > 0) {
5472 dbAdd(c->db,dstkey,dstset);
5473 addReplyLongLong(c,dictSize((dict*)dstset->ptr));
5474 } else {
5475 decrRefCount(dstset);
5476 addReply(c,shared.czero);
5477 }
5478 server.dirty++;
5479 }
5480 zfree(dv);
5481 }
5482
5483 static void sunionCommand(redisClient *c) {
5484 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_UNION);
5485 }
5486
5487 static void sunionstoreCommand(redisClient *c) {
5488 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_UNION);
5489 }
5490
5491 static void sdiffCommand(redisClient *c) {
5492 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_DIFF);
5493 }
5494
5495 static void sdiffstoreCommand(redisClient *c) {
5496 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_DIFF);
5497 }
5498
5499 /* ==================================== ZSets =============================== */
5500
/* ZSETs are ordered sets using two data structures to hold the same elements
 * in order to get O(log(N)) INSERT and REMOVE operations into a sorted
 * data structure.
 *
 * The elements are added to a hash table mapping Redis objects to scores.
 * At the same time the elements are added to a skip list mapping scores
 * to Redis objects (so objects are sorted by scores in this "view"). */
5508
5509 /* This skiplist implementation is almost a C translation of the original
5510 * algorithm described by William Pugh in "Skip Lists: A Probabilistic
5511 * Alternative to Balanced Trees", modified in three ways:
5512 * a) this implementation allows for repeated values.
5513 * b) the comparison is not just by key (our 'score') but by satellite data.
5514 * c) there is a back pointer, so it's a doubly linked list with the back
5515 * pointers being only at "level 1". This allows to traverse the list
5516 * from tail to head, useful for ZREVRANGE. */
5517
5518 static zskiplistNode *zslCreateNode(int level, double score, robj *obj) {
5519 zskiplistNode *zn = zmalloc(sizeof(*zn));
5520
5521 zn->forward = zmalloc(sizeof(zskiplistNode*) * level);
5522 if (level > 1)
5523 zn->span = zmalloc(sizeof(unsigned int) * (level - 1));
5524 else
5525 zn->span = NULL;
5526 zn->score = score;
5527 zn->obj = obj;
5528 return zn;
5529 }
5530
5531 static zskiplist *zslCreate(void) {
5532 int j;
5533 zskiplist *zsl;
5534
5535 zsl = zmalloc(sizeof(*zsl));
5536 zsl->level = 1;
5537 zsl->length = 0;
5538 zsl->header = zslCreateNode(ZSKIPLIST_MAXLEVEL,0,NULL);
5539 for (j = 0; j < ZSKIPLIST_MAXLEVEL; j++) {
5540 zsl->header->forward[j] = NULL;
5541
5542 /* span has space for ZSKIPLIST_MAXLEVEL-1 elements */
5543 if (j < ZSKIPLIST_MAXLEVEL-1)
5544 zsl->header->span[j] = 0;
5545 }
5546 zsl->header->backward = NULL;
5547 zsl->tail = NULL;
5548 return zsl;
5549 }
5550
5551 static void zslFreeNode(zskiplistNode *node) {
5552 decrRefCount(node->obj);
5553 zfree(node->forward);
5554 zfree(node->span);
5555 zfree(node);
5556 }
5557
5558 static void zslFree(zskiplist *zsl) {
5559 zskiplistNode *node = zsl->header->forward[0], *next;
5560
5561 zfree(zsl->header->forward);
5562 zfree(zsl->header->span);
5563 zfree(zsl->header);
5564 while(node) {
5565 next = node->forward[0];
5566 zslFreeNode(node);
5567 node = next;
5568 }
5569 zfree(zsl);
5570 }
5571
5572 static int zslRandomLevel(void) {
5573 int level = 1;
5574 while ((random()&0xFFFF) < (ZSKIPLIST_P * 0xFFFF))
5575 level += 1;
5576 return (level<ZSKIPLIST_MAXLEVEL) ? level : ZSKIPLIST_MAXLEVEL;
5577 }
5578
5579 static void zslInsert(zskiplist *zsl, double score, robj *obj) {
5580 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5581 unsigned int rank[ZSKIPLIST_MAXLEVEL];
5582 int i, level;
5583
5584 x = zsl->header;
5585 for (i = zsl->level-1; i >= 0; i--) {
5586 /* store rank that is crossed to reach the insert position */
5587 rank[i] = i == (zsl->level-1) ? 0 : rank[i+1];
5588
5589 while (x->forward[i] &&
5590 (x->forward[i]->score < score ||
5591 (x->forward[i]->score == score &&
5592 compareStringObjects(x->forward[i]->obj,obj) < 0))) {
5593 rank[i] += i > 0 ? x->span[i-1] : 1;
5594 x = x->forward[i];
5595 }
5596 update[i] = x;
5597 }
5598 /* we assume the key is not already inside, since we allow duplicated
5599 * scores, and the re-insertion of score and redis object should never
5600 * happpen since the caller of zslInsert() should test in the hash table
5601 * if the element is already inside or not. */
5602 level = zslRandomLevel();
5603 if (level > zsl->level) {
5604 for (i = zsl->level; i < level; i++) {
5605 rank[i] = 0;
5606 update[i] = zsl->header;
5607 update[i]->span[i-1] = zsl->length;
5608 }
5609 zsl->level = level;
5610 }
5611 x = zslCreateNode(level,score,obj);
5612 for (i = 0; i < level; i++) {
5613 x->forward[i] = update[i]->forward[i];
5614 update[i]->forward[i] = x;
5615
5616 /* update span covered by update[i] as x is inserted here */
5617 if (i > 0) {
5618 x->span[i-1] = update[i]->span[i-1] - (rank[0] - rank[i]);
5619 update[i]->span[i-1] = (rank[0] - rank[i]) + 1;
5620 }
5621 }
5622
5623 /* increment span for untouched levels */
5624 for (i = level; i < zsl->level; i++) {
5625 update[i]->span[i-1]++;
5626 }
5627
5628 x->backward = (update[0] == zsl->header) ? NULL : update[0];
5629 if (x->forward[0])
5630 x->forward[0]->backward = x;
5631 else
5632 zsl->tail = x;
5633 zsl->length++;
5634 }
5635
5636 /* Internal function used by zslDelete, zslDeleteByScore and zslDeleteByRank */
5637 void zslDeleteNode(zskiplist *zsl, zskiplistNode *x, zskiplistNode **update) {
5638 int i;
5639 for (i = 0; i < zsl->level; i++) {
5640 if (update[i]->forward[i] == x) {
5641 if (i > 0) {
5642 update[i]->span[i-1] += x->span[i-1] - 1;
5643 }
5644 update[i]->forward[i] = x->forward[i];
5645 } else {
5646 /* invariant: i > 0, because update[0]->forward[0]
5647 * is always equal to x */
5648 update[i]->span[i-1] -= 1;
5649 }
5650 }
5651 if (x->forward[0]) {
5652 x->forward[0]->backward = x->backward;
5653 } else {
5654 zsl->tail = x->backward;
5655 }
5656 while(zsl->level > 1 && zsl->header->forward[zsl->level-1] == NULL)
5657 zsl->level--;
5658 zsl->length--;
5659 }
5660
5661 /* Delete an element with matching score/object from the skiplist. */
5662 static int zslDelete(zskiplist *zsl, double score, robj *obj) {
5663 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5664 int i;
5665
5666 x = zsl->header;
5667 for (i = zsl->level-1; i >= 0; i--) {
5668 while (x->forward[i] &&
5669 (x->forward[i]->score < score ||
5670 (x->forward[i]->score == score &&
5671 compareStringObjects(x->forward[i]->obj,obj) < 0)))
5672 x = x->forward[i];
5673 update[i] = x;
5674 }
5675 /* We may have multiple elements with the same score, what we need
5676 * is to find the element with both the right score and object. */
5677 x = x->forward[0];
5678 if (x && score == x->score && equalStringObjects(x->obj,obj)) {
5679 zslDeleteNode(zsl, x, update);
5680 zslFreeNode(x);
5681 return 1;
5682 } else {
5683 return 0; /* not found */
5684 }
5685 return 0; /* not found */
5686 }
5687
5688 /* Delete all the elements with score between min and max from the skiplist.
5689 * Min and mx are inclusive, so a score >= min || score <= max is deleted.
5690 * Note that this function takes the reference to the hash table view of the
5691 * sorted set, in order to remove the elements from the hash table too. */
5692 static unsigned long zslDeleteRangeByScore(zskiplist *zsl, double min, double max, dict *dict) {
5693 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5694 unsigned long removed = 0;
5695 int i;
5696
5697 x = zsl->header;
5698 for (i = zsl->level-1; i >= 0; i--) {
5699 while (x->forward[i] && x->forward[i]->score < min)
5700 x = x->forward[i];
5701 update[i] = x;
5702 }
5703 /* We may have multiple elements with the same score, what we need
5704 * is to find the element with both the right score and object. */
5705 x = x->forward[0];
5706 while (x && x->score <= max) {
5707 zskiplistNode *next = x->forward[0];
5708 zslDeleteNode(zsl, x, update);
5709 dictDelete(dict,x->obj);
5710 zslFreeNode(x);
5711 removed++;
5712 x = next;
5713 }
5714 return removed; /* not found */
5715 }
5716
5717 /* Delete all the elements with rank between start and end from the skiplist.
5718 * Start and end are inclusive. Note that start and end need to be 1-based */
5719 static unsigned long zslDeleteRangeByRank(zskiplist *zsl, unsigned int start, unsigned int end, dict *dict) {
5720 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5721 unsigned long traversed = 0, removed = 0;
5722 int i;
5723
5724 x = zsl->header;
5725 for (i = zsl->level-1; i >= 0; i--) {
5726 while (x->forward[i] && (traversed + (i > 0 ? x->span[i-1] : 1)) < start) {
5727 traversed += i > 0 ? x->span[i-1] : 1;
5728 x = x->forward[i];
5729 }
5730 update[i] = x;
5731 }
5732
5733 traversed++;
5734 x = x->forward[0];
5735 while (x && traversed <= end) {
5736 zskiplistNode *next = x->forward[0];
5737 zslDeleteNode(zsl, x, update);
5738 dictDelete(dict,x->obj);
5739 zslFreeNode(x);
5740 removed++;
5741 traversed++;
5742 x = next;
5743 }
5744 return removed;
5745 }
5746
5747 /* Find the first node having a score equal or greater than the specified one.
5748 * Returns NULL if there is no match. */
5749 static zskiplistNode *zslFirstWithScore(zskiplist *zsl, double score) {
5750 zskiplistNode *x;
5751 int i;
5752
5753 x = zsl->header;
5754 for (i = zsl->level-1; i >= 0; i--) {
5755 while (x->forward[i] && x->forward[i]->score < score)
5756 x = x->forward[i];
5757 }
5758 /* We may have multiple elements with the same score, what we need
5759 * is to find the element with both the right score and object. */
5760 return x->forward[0];
5761 }
5762
5763 /* Find the rank for an element by both score and key.
5764 * Returns 0 when the element cannot be found, rank otherwise.
5765 * Note that the rank is 1-based due to the span of zsl->header to the
5766 * first element. */
5767 static unsigned long zslGetRank(zskiplist *zsl, double score, robj *o) {
5768 zskiplistNode *x;
5769 unsigned long rank = 0;
5770 int i;
5771
5772 x = zsl->header;
5773 for (i = zsl->level-1; i >= 0; i--) {
5774 while (x->forward[i] &&
5775 (x->forward[i]->score < score ||
5776 (x->forward[i]->score == score &&
5777 compareStringObjects(x->forward[i]->obj,o) <= 0))) {
5778 rank += i > 0 ? x->span[i-1] : 1;
5779 x = x->forward[i];
5780 }
5781
5782 /* x might be equal to zsl->header, so test if obj is non-NULL */
5783 if (x->obj && equalStringObjects(x->obj,o)) {
5784 return rank;
5785 }
5786 }
5787 return 0;
5788 }
5789
5790 /* Finds an element by its rank. The rank argument needs to be 1-based. */
5791 zskiplistNode* zslGetElementByRank(zskiplist *zsl, unsigned long rank) {
5792 zskiplistNode *x;
5793 unsigned long traversed = 0;
5794 int i;
5795
5796 x = zsl->header;
5797 for (i = zsl->level-1; i >= 0; i--) {
5798 while (x->forward[i] && (traversed + (i>0 ? x->span[i-1] : 1)) <= rank)
5799 {
5800 traversed += i > 0 ? x->span[i-1] : 1;
5801 x = x->forward[i];
5802 }
5803 if (traversed == rank) {
5804 return x;
5805 }
5806 }
5807 return NULL;
5808 }
5809
5810 /* The actual Z-commands implementations */
5811
5812 /* This generic command implements both ZADD and ZINCRBY.
5813 * scoreval is the score if the operation is a ZADD (doincrement == 0) or
5814 * the increment if the operation is a ZINCRBY (doincrement == 1). */
5815 static void zaddGenericCommand(redisClient *c, robj *key, robj *ele, double scoreval, int doincrement) {
5816 robj *zsetobj;
5817 zset *zs;
5818 double *score;
5819
5820 if (isnan(scoreval)) {
5821 addReplySds(c,sdsnew("-ERR provide score is Not A Number (nan)\r\n"));
5822 return;
5823 }
5824
5825 zsetobj = lookupKeyWrite(c->db,key);
5826 if (zsetobj == NULL) {
5827 zsetobj = createZsetObject();
5828 dbAdd(c->db,key,zsetobj);
5829 } else {
5830 if (zsetobj->type != REDIS_ZSET) {
5831 addReply(c,shared.wrongtypeerr);
5832 return;
5833 }
5834 }
5835 zs = zsetobj->ptr;
5836
5837 /* Ok now since we implement both ZADD and ZINCRBY here the code
5838 * needs to handle the two different conditions. It's all about setting
5839 * '*score', that is, the new score to set, to the right value. */
5840 score = zmalloc(sizeof(double));
5841 if (doincrement) {
5842 dictEntry *de;
5843
5844 /* Read the old score. If the element was not present starts from 0 */
5845 de = dictFind(zs->dict,ele);
5846 if (de) {
5847 double *oldscore = dictGetEntryVal(de);
5848 *score = *oldscore + scoreval;
5849 } else {
5850 *score = scoreval;
5851 }
5852 if (isnan(*score)) {
5853 addReplySds(c,
5854 sdsnew("-ERR resulting score is Not A Number (nan)\r\n"));
5855 zfree(score);
5856 /* Note that we don't need to check if the zset may be empty and
5857 * should be removed here, as we can only obtain Nan as score if
5858 * there was already an element in the sorted set. */
5859 return;
5860 }
5861 } else {
5862 *score = scoreval;
5863 }
5864
5865 /* What follows is a simple remove and re-insert operation that is common
5866 * to both ZADD and ZINCRBY... */
5867 if (dictAdd(zs->dict,ele,score) == DICT_OK) {
5868 /* case 1: New element */
5869 incrRefCount(ele); /* added to hash */
5870 zslInsert(zs->zsl,*score,ele);
5871 incrRefCount(ele); /* added to skiplist */
5872 server.dirty++;
5873 if (doincrement)
5874 addReplyDouble(c,*score);
5875 else
5876 addReply(c,shared.cone);
5877 } else {
5878 dictEntry *de;
5879 double *oldscore;
5880
5881 /* case 2: Score update operation */
5882 de = dictFind(zs->dict,ele);
5883 redisAssert(de != NULL);
5884 oldscore = dictGetEntryVal(de);
5885 if (*score != *oldscore) {
5886 int deleted;
5887
5888 /* Remove and insert the element in the skip list with new score */
5889 deleted = zslDelete(zs->zsl,*oldscore,ele);
5890 redisAssert(deleted != 0);
5891 zslInsert(zs->zsl,*score,ele);
5892 incrRefCount(ele);
5893 /* Update the score in the hash table */
5894 dictReplace(zs->dict,ele,score);
5895 server.dirty++;
5896 } else {
5897 zfree(score);
5898 }
5899 if (doincrement)
5900 addReplyDouble(c,*score);
5901 else
5902 addReply(c,shared.czero);
5903 }
5904 }
5905
5906 static void zaddCommand(redisClient *c) {
5907 double scoreval;
5908
5909 if (getDoubleFromObjectOrReply(c, c->argv[2], &scoreval, NULL) != REDIS_OK) return;
5910 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,0);
5911 }
5912
5913 static void zincrbyCommand(redisClient *c) {
5914 double scoreval;
5915
5916 if (getDoubleFromObjectOrReply(c, c->argv[2], &scoreval, NULL) != REDIS_OK) return;
5917 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,1);
5918 }
5919
5920 static void zremCommand(redisClient *c) {
5921 robj *zsetobj;
5922 zset *zs;
5923 dictEntry *de;
5924 double *oldscore;
5925 int deleted;
5926
5927 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5928 checkType(c,zsetobj,REDIS_ZSET)) return;
5929
5930 zs = zsetobj->ptr;
5931 de = dictFind(zs->dict,c->argv[2]);
5932 if (de == NULL) {
5933 addReply(c,shared.czero);
5934 return;
5935 }
5936 /* Delete from the skiplist */
5937 oldscore = dictGetEntryVal(de);
5938 deleted = zslDelete(zs->zsl,*oldscore,c->argv[2]);
5939 redisAssert(deleted != 0);
5940
5941 /* Delete from the hash table */
5942 dictDelete(zs->dict,c->argv[2]);
5943 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5944 if (dictSize(zs->dict) == 0) dbDelete(c->db,c->argv[1]);
5945 server.dirty++;
5946 addReply(c,shared.cone);
5947 }
5948
5949 static void zremrangebyscoreCommand(redisClient *c) {
5950 double min;
5951 double max;
5952 long deleted;
5953 robj *zsetobj;
5954 zset *zs;
5955
5956 if ((getDoubleFromObjectOrReply(c, c->argv[2], &min, NULL) != REDIS_OK) ||
5957 (getDoubleFromObjectOrReply(c, c->argv[3], &max, NULL) != REDIS_OK)) return;
5958
5959 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5960 checkType(c,zsetobj,REDIS_ZSET)) return;
5961
5962 zs = zsetobj->ptr;
5963 deleted = zslDeleteRangeByScore(zs->zsl,min,max,zs->dict);
5964 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5965 if (dictSize(zs->dict) == 0) dbDelete(c->db,c->argv[1]);
5966 server.dirty += deleted;
5967 addReplyLongLong(c,deleted);
5968 }
5969
5970 static void zremrangebyrankCommand(redisClient *c) {
5971 long start;
5972 long end;
5973 int llen;
5974 long deleted;
5975 robj *zsetobj;
5976 zset *zs;
5977
5978 if ((getLongFromObjectOrReply(c, c->argv[2], &start, NULL) != REDIS_OK) ||
5979 (getLongFromObjectOrReply(c, c->argv[3], &end, NULL) != REDIS_OK)) return;
5980
5981 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5982 checkType(c,zsetobj,REDIS_ZSET)) return;
5983 zs = zsetobj->ptr;
5984 llen = zs->zsl->length;
5985
5986 /* convert negative indexes */
5987 if (start < 0) start = llen+start;
5988 if (end < 0) end = llen+end;
5989 if (start < 0) start = 0;
5990 if (end < 0) end = 0;
5991
5992 /* indexes sanity checks */
5993 if (start > end || start >= llen) {
5994 addReply(c,shared.czero);
5995 return;
5996 }
5997 if (end >= llen) end = llen-1;
5998
5999 /* increment start and end because zsl*Rank functions
6000 * use 1-based rank */
6001 deleted = zslDeleteRangeByRank(zs->zsl,start+1,end+1,zs->dict);
6002 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
6003 if (dictSize(zs->dict) == 0) dbDelete(c->db,c->argv[1]);
6004 server.dirty += deleted;
6005 addReplyLongLong(c, deleted);
6006 }
6007
6008 typedef struct {
6009 dict *dict;
6010 double weight;
6011 } zsetopsrc;
6012
6013 static int qsortCompareZsetopsrcByCardinality(const void *s1, const void *s2) {
6014 zsetopsrc *d1 = (void*) s1, *d2 = (void*) s2;
6015 unsigned long size1, size2;
6016 size1 = d1->dict ? dictSize(d1->dict) : 0;
6017 size2 = d2->dict ? dictSize(d2->dict) : 0;
6018 return size1 - size2;
6019 }
6020
/* How scores of the same element coming from different inputs are
 * combined by zunionInterAggregate(). */
#define REDIS_AGGR_SUM 1    /* add the scores together */
#define REDIS_AGGR_MIN 2    /* keep the smallest score */
#define REDIS_AGGR_MAX 3    /* keep the largest score */

/* Score carried by a dict entry: the double its value points to, or 1.0
 * when the entry has no value. Note that _e is evaluated twice. */
#define zunionInterDictValue(_e) \
    (dictGetEntryVal(_e) == NULL ? 1.0 : *(double*)dictGetEntryVal(_e))
6025
6026 inline static void zunionInterAggregate(double *target, double val, int aggregate) {
6027 if (aggregate == REDIS_AGGR_SUM) {
6028 *target = *target + val;
6029 } else if (aggregate == REDIS_AGGR_MIN) {
6030 *target = val < *target ? val : *target;
6031 } else if (aggregate == REDIS_AGGR_MAX) {
6032 *target = val > *target ? val : *target;
6033 } else {
6034 /* safety net */
6035 redisPanic("Unknown ZUNION/INTER aggregate type");
6036 }
6037 }
6038
6039 static void zunionInterGenericCommand(redisClient *c, robj *dstkey, int op) {
6040 int i, j, setnum;
6041 int aggregate = REDIS_AGGR_SUM;
6042 zsetopsrc *src;
6043 robj *dstobj;
6044 zset *dstzset;
6045 dictIterator *di;
6046 dictEntry *de;
6047
6048 /* expect setnum input keys to be given */
6049 setnum = atoi(c->argv[2]->ptr);
6050 if (setnum < 1) {
6051 addReplySds(c,sdsnew("-ERR at least 1 input key is needed for ZUNIONSTORE/ZINTERSTORE\r\n"));
6052 return;
6053 }
6054
6055 /* test if the expected number of keys would overflow */
6056 if (3+setnum > c->argc) {
6057 addReply(c,shared.syntaxerr);
6058 return;
6059 }
6060
6061 /* read keys to be used for input */
6062 src = zmalloc(sizeof(zsetopsrc) * setnum);
6063 for (i = 0, j = 3; i < setnum; i++, j++) {
6064 robj *obj = lookupKeyWrite(c->db,c->argv[j]);
6065 if (!obj) {
6066 src[i].dict = NULL;
6067 } else {
6068 if (obj->type == REDIS_ZSET) {
6069 src[i].dict = ((zset*)obj->ptr)->dict;
6070 } else if (obj->type == REDIS_SET) {
6071 src[i].dict = (obj->ptr);
6072 } else {
6073 zfree(src);
6074 addReply(c,shared.wrongtypeerr);
6075 return;
6076 }
6077 }
6078
6079 /* default all weights to 1 */
6080 src[i].weight = 1.0;
6081 }
6082
6083 /* parse optional extra arguments */
6084 if (j < c->argc) {
6085 int remaining = c->argc - j;
6086
6087 while (remaining) {
6088 if (remaining >= (setnum + 1) && !strcasecmp(c->argv[j]->ptr,"weights")) {
6089 j++; remaining--;
6090 for (i = 0; i < setnum; i++, j++, remaining--) {
6091 if (getDoubleFromObjectOrReply(c, c->argv[j], &src[i].weight, NULL) != REDIS_OK)
6092 return;
6093 }
6094 } else if (remaining >= 2 && !strcasecmp(c->argv[j]->ptr,"aggregate")) {
6095 j++; remaining--;
6096 if (!strcasecmp(c->argv[j]->ptr,"sum")) {
6097 aggregate = REDIS_AGGR_SUM;
6098 } else if (!strcasecmp(c->argv[j]->ptr,"min")) {
6099 aggregate = REDIS_AGGR_MIN;
6100 } else if (!strcasecmp(c->argv[j]->ptr,"max")) {
6101 aggregate = REDIS_AGGR_MAX;
6102 } else {
6103 zfree(src);
6104 addReply(c,shared.syntaxerr);
6105 return;
6106 }
6107 j++; remaining--;
6108 } else {
6109 zfree(src);
6110 addReply(c,shared.syntaxerr);
6111 return;
6112 }
6113 }
6114 }
6115
6116 /* sort sets from the smallest to largest, this will improve our
6117 * algorithm's performance */
6118 qsort(src,setnum,sizeof(zsetopsrc),qsortCompareZsetopsrcByCardinality);
6119
6120 dstobj = createZsetObject();
6121 dstzset = dstobj->ptr;
6122
6123 if (op == REDIS_OP_INTER) {
6124 /* skip going over all entries if the smallest zset is NULL or empty */
6125 if (src[0].dict && dictSize(src[0].dict) > 0) {
6126 /* precondition: as src[0].dict is non-empty and the zsets are ordered
6127 * from small to large, all src[i > 0].dict are non-empty too */
6128 di = dictGetIterator(src[0].dict);
6129 while((de = dictNext(di)) != NULL) {
6130 double *score = zmalloc(sizeof(double)), value;
6131 *score = src[0].weight * zunionInterDictValue(de);
6132
6133 for (j = 1; j < setnum; j++) {
6134 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
6135 if (other) {
6136 value = src[j].weight * zunionInterDictValue(other);
6137 zunionInterAggregate(score, value, aggregate);
6138 } else {
6139 break;
6140 }
6141 }
6142
6143 /* skip entry when not present in every source dict */
6144 if (j != setnum) {
6145 zfree(score);
6146 } else {
6147 robj *o = dictGetEntryKey(de);
6148 dictAdd(dstzset->dict,o,score);
6149 incrRefCount(o); /* added to dictionary */
6150 zslInsert(dstzset->zsl,*score,o);
6151 incrRefCount(o); /* added to skiplist */
6152 }
6153 }
6154 dictReleaseIterator(di);
6155 }
6156 } else if (op == REDIS_OP_UNION) {
6157 for (i = 0; i < setnum; i++) {
6158 if (!src[i].dict) continue;
6159
6160 di = dictGetIterator(src[i].dict);
6161 while((de = dictNext(di)) != NULL) {
6162 /* skip key when already processed */
6163 if (dictFind(dstzset->dict,dictGetEntryKey(de)) != NULL) continue;
6164
6165 double *score = zmalloc(sizeof(double)), value;
6166 *score = src[i].weight * zunionInterDictValue(de);
6167
6168 /* because the zsets are sorted by size, its only possible
6169 * for sets at larger indices to hold this entry */
6170 for (j = (i+1); j < setnum; j++) {
6171 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
6172 if (other) {
6173 value = src[j].weight * zunionInterDictValue(other);
6174 zunionInterAggregate(score, value, aggregate);
6175 }
6176 }
6177
6178 robj *o = dictGetEntryKey(de);
6179 dictAdd(dstzset->dict,o,score);
6180 incrRefCount(o); /* added to dictionary */
6181 zslInsert(dstzset->zsl,*score,o);
6182 incrRefCount(o); /* added to skiplist */
6183 }
6184 dictReleaseIterator(di);
6185 }
6186 } else {
6187 /* unknown operator */
6188 redisAssert(op == REDIS_OP_INTER || op == REDIS_OP_UNION);
6189 }
6190
6191 dbDelete(c->db,dstkey);
6192 if (dstzset->zsl->length) {
6193 dbAdd(c->db,dstkey,dstobj);
6194 addReplyLongLong(c, dstzset->zsl->length);
6195 server.dirty++;
6196 } else {
6197 decrRefCount(dstobj);
6198 addReply(c, shared.czero);
6199 }
6200 zfree(src);
6201 }
6202
6203 static void zunionstoreCommand(redisClient *c) {
6204 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_UNION);
6205 }
6206
6207 static void zinterstoreCommand(redisClient *c) {
6208 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_INTER);
6209 }
6210
6211 static void zrangeGenericCommand(redisClient *c, int reverse) {
6212 robj *o;
6213 long start;
6214 long end;
6215 int withscores = 0;
6216 int llen;
6217 int rangelen, j;
6218 zset *zsetobj;
6219 zskiplist *zsl;
6220 zskiplistNode *ln;
6221 robj *ele;
6222
6223 if ((getLongFromObjectOrReply(c, c->argv[2], &start, NULL) != REDIS_OK) ||
6224 (getLongFromObjectOrReply(c, c->argv[3], &end, NULL) != REDIS_OK)) return;
6225
6226 if (c->argc == 5 && !strcasecmp(c->argv[4]->ptr,"withscores")) {
6227 withscores = 1;
6228 } else if (c->argc >= 5) {
6229 addReply(c,shared.syntaxerr);
6230 return;
6231 }
6232
6233 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
6234 || checkType(c,o,REDIS_ZSET)) return;
6235 zsetobj = o->ptr;
6236 zsl = zsetobj->zsl;
6237 llen = zsl->length;
6238
6239 /* convert negative indexes */
6240 if (start < 0) start = llen+start;
6241 if (end < 0) end = llen+end;
6242 if (start < 0) start = 0;
6243 if (end < 0) end = 0;
6244
6245 /* indexes sanity checks */
6246 if (start > end || start >= llen) {
6247 /* Out of range start or start > end result in empty list */
6248 addReply(c,shared.emptymultibulk);
6249 return;
6250 }
6251 if (end >= llen) end = llen-1;
6252 rangelen = (end-start)+1;
6253
6254 /* check if starting point is trivial, before searching
6255 * the element in log(N) time */
6256 if (reverse) {
6257 ln = start == 0 ? zsl->tail : zslGetElementByRank(zsl, llen-start);
6258 } else {
6259 ln = start == 0 ?
6260 zsl->header->forward[0] : zslGetElementByRank(zsl, start+1);
6261 }
6262
6263 /* Return the result in form of a multi-bulk reply */
6264 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",
6265 withscores ? (rangelen*2) : rangelen));
6266 for (j = 0; j < rangelen; j++) {
6267 ele = ln->obj;
6268 addReplyBulk(c,ele);
6269 if (withscores)
6270 addReplyDouble(c,ln->score);
6271 ln = reverse ? ln->backward : ln->forward[0];
6272 }
6273 }
6274
6275 static void zrangeCommand(redisClient *c) {
6276 zrangeGenericCommand(c,0);
6277 }
6278
6279 static void zrevrangeCommand(redisClient *c) {
6280 zrangeGenericCommand(c,1);
6281 }
6282
6283 /* This command implements both ZRANGEBYSCORE and ZCOUNT.
6284 * If justcount is non-zero, just the count is returned. */
6285 static void genericZrangebyscoreCommand(redisClient *c, int justcount) {
6286 robj *o;
6287 double min, max;
6288 int minex = 0, maxex = 0; /* are min or max exclusive? */
6289 int offset = 0, limit = -1;
6290 int withscores = 0;
6291 int badsyntax = 0;
6292
6293 /* Parse the min-max interval. If one of the values is prefixed
6294 * by the "(" character, it's considered "open". For instance
6295 * ZRANGEBYSCORE zset (1.5 (2.5 will match min < x < max
6296 * ZRANGEBYSCORE zset 1.5 2.5 will instead match min <= x <= max */
6297 if (((char*)c->argv[2]->ptr)[0] == '(') {
6298 min = strtod((char*)c->argv[2]->ptr+1,NULL);
6299 minex = 1;
6300 } else {
6301 min = strtod(c->argv[2]->ptr,NULL);
6302 }
6303 if (((char*)c->argv[3]->ptr)[0] == '(') {
6304 max = strtod((char*)c->argv[3]->ptr+1,NULL);
6305 maxex = 1;
6306 } else {
6307 max = strtod(c->argv[3]->ptr,NULL);
6308 }
6309
6310 /* Parse "WITHSCORES": note that if the command was called with
6311 * the name ZCOUNT then we are sure that c->argc == 4, so we'll never
6312 * enter the following paths to parse WITHSCORES and LIMIT. */
6313 if (c->argc == 5 || c->argc == 8) {
6314 if (strcasecmp(c->argv[c->argc-1]->ptr,"withscores") == 0)
6315 withscores = 1;
6316 else
6317 badsyntax = 1;
6318 }
6319 if (c->argc != (4 + withscores) && c->argc != (7 + withscores))
6320 badsyntax = 1;
6321 if (badsyntax) {
6322 addReplySds(c,
6323 sdsnew("-ERR wrong number of arguments for ZRANGEBYSCORE\r\n"));
6324 return;
6325 }
6326
6327 /* Parse "LIMIT" */
6328 if (c->argc == (7 + withscores) && strcasecmp(c->argv[4]->ptr,"limit")) {
6329 addReply(c,shared.syntaxerr);
6330 return;
6331 } else if (c->argc == (7 + withscores)) {
6332 offset = atoi(c->argv[5]->ptr);
6333 limit = atoi(c->argv[6]->ptr);
6334 if (offset < 0) offset = 0;
6335 }
6336
6337 /* Ok, lookup the key and get the range */
6338 o = lookupKeyRead(c->db,c->argv[1]);
6339 if (o == NULL) {
6340 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
6341 } else {
6342 if (o->type != REDIS_ZSET) {
6343 addReply(c,shared.wrongtypeerr);
6344 } else {
6345 zset *zsetobj = o->ptr;
6346 zskiplist *zsl = zsetobj->zsl;
6347 zskiplistNode *ln;
6348 robj *ele, *lenobj = NULL;
6349 unsigned long rangelen = 0;
6350
6351 /* Get the first node with the score >= min, or with
6352 * score > min if 'minex' is true. */
6353 ln = zslFirstWithScore(zsl,min);
6354 while (minex && ln && ln->score == min) ln = ln->forward[0];
6355
6356 if (ln == NULL) {
6357 /* No element matching the speciifed interval */
6358 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
6359 return;
6360 }
6361
6362 /* We don't know in advance how many matching elements there
6363 * are in the list, so we push this object that will represent
6364 * the multi-bulk length in the output buffer, and will "fix"
6365 * it later */
6366 if (!justcount) {
6367 lenobj = createObject(REDIS_STRING,NULL);
6368 addReply(c,lenobj);
6369 decrRefCount(lenobj);
6370 }
6371
6372 while(ln && (maxex ? (ln->score < max) : (ln->score <= max))) {
6373 if (offset) {
6374 offset--;
6375 ln = ln->forward[0];
6376 continue;
6377 }
6378 if (limit == 0) break;
6379 if (!justcount) {
6380 ele = ln->obj;
6381 addReplyBulk(c,ele);
6382 if (withscores)
6383 addReplyDouble(c,ln->score);
6384 }
6385 ln = ln->forward[0];
6386 rangelen++;
6387 if (limit > 0) limit--;
6388 }
6389 if (justcount) {
6390 addReplyLongLong(c,(long)rangelen);
6391 } else {
6392 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",
6393 withscores ? (rangelen*2) : rangelen);
6394 }
6395 }
6396 }
6397 }
6398
6399 static void zrangebyscoreCommand(redisClient *c) {
6400 genericZrangebyscoreCommand(c,0);
6401 }
6402
6403 static void zcountCommand(redisClient *c) {
6404 genericZrangebyscoreCommand(c,1);
6405 }
6406
6407 static void zcardCommand(redisClient *c) {
6408 robj *o;
6409 zset *zs;
6410
6411 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6412 checkType(c,o,REDIS_ZSET)) return;
6413
6414 zs = o->ptr;
6415 addReplyUlong(c,zs->zsl->length);
6416 }
6417
6418 static void zscoreCommand(redisClient *c) {
6419 robj *o;
6420 zset *zs;
6421 dictEntry *de;
6422
6423 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6424 checkType(c,o,REDIS_ZSET)) return;
6425
6426 zs = o->ptr;
6427 de = dictFind(zs->dict,c->argv[2]);
6428 if (!de) {
6429 addReply(c,shared.nullbulk);
6430 } else {
6431 double *score = dictGetEntryVal(de);
6432
6433 addReplyDouble(c,*score);
6434 }
6435 }
6436
6437 static void zrankGenericCommand(redisClient *c, int reverse) {
6438 robj *o;
6439 zset *zs;
6440 zskiplist *zsl;
6441 dictEntry *de;
6442 unsigned long rank;
6443 double *score;
6444
6445 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6446 checkType(c,o,REDIS_ZSET)) return;
6447
6448 zs = o->ptr;
6449 zsl = zs->zsl;
6450 de = dictFind(zs->dict,c->argv[2]);
6451 if (!de) {
6452 addReply(c,shared.nullbulk);
6453 return;
6454 }
6455
6456 score = dictGetEntryVal(de);
6457 rank = zslGetRank(zsl, *score, c->argv[2]);
6458 if (rank) {
6459 if (reverse) {
6460 addReplyLongLong(c, zsl->length - rank);
6461 } else {
6462 addReplyLongLong(c, rank-1);
6463 }
6464 } else {
6465 addReply(c,shared.nullbulk);
6466 }
6467 }
6468
6469 static void zrankCommand(redisClient *c) {
6470 zrankGenericCommand(c, 0);
6471 }
6472
6473 static void zrevrankCommand(redisClient *c) {
6474 zrankGenericCommand(c, 1);
6475 }
6476
6477 /* ========================= Hashes utility functions ======================= */
/* Selectors for hashCurrent(): which half of the current entry to get */
#define REDIS_HASH_KEY   1  /* the field name */
#define REDIS_HASH_VALUE 2  /* the field value */
6480
6481 /* Check the length of a number of objects to see if we need to convert a
6482 * zipmap to a real hash. Note that we only check string encoded objects
6483 * as their string length can be queried in constant time. */
6484 static void hashTryConversion(robj *subject, robj **argv, int start, int end) {
6485 int i;
6486 if (subject->encoding != REDIS_ENCODING_ZIPMAP) return;
6487
6488 for (i = start; i <= end; i++) {
6489 if (argv[i]->encoding == REDIS_ENCODING_RAW &&
6490 sdslen(argv[i]->ptr) > server.hash_max_zipmap_value)
6491 {
6492 convertToRealHash(subject);
6493 return;
6494 }
6495 }
6496 }
6497
6498 /* Encode given objects in-place when the hash uses a dict. */
6499 static void hashTryObjectEncoding(robj *subject, robj **o1, robj **o2) {
6500 if (subject->encoding == REDIS_ENCODING_HT) {
6501 if (o1) *o1 = tryObjectEncoding(*o1);
6502 if (o2) *o2 = tryObjectEncoding(*o2);
6503 }
6504 }
6505
6506 /* Get the value from a hash identified by key. Returns either a string
6507 * object or NULL if the value cannot be found. The refcount of the object
6508 * is always increased by 1 when the value was found. */
6509 static robj *hashGet(robj *o, robj *key) {
6510 robj *value = NULL;
6511 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6512 unsigned char *v;
6513 unsigned int vlen;
6514 key = getDecodedObject(key);
6515 if (zipmapGet(o->ptr,key->ptr,sdslen(key->ptr),&v,&vlen)) {
6516 value = createStringObject((char*)v,vlen);
6517 }
6518 decrRefCount(key);
6519 } else {
6520 dictEntry *de = dictFind(o->ptr,key);
6521 if (de != NULL) {
6522 value = dictGetEntryVal(de);
6523 incrRefCount(value);
6524 }
6525 }
6526 return value;
6527 }
6528
6529 /* Test if the key exists in the given hash. Returns 1 if the key
6530 * exists and 0 when it doesn't. */
6531 static int hashExists(robj *o, robj *key) {
6532 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6533 key = getDecodedObject(key);
6534 if (zipmapExists(o->ptr,key->ptr,sdslen(key->ptr))) {
6535 decrRefCount(key);
6536 return 1;
6537 }
6538 decrRefCount(key);
6539 } else {
6540 if (dictFind(o->ptr,key) != NULL) {
6541 return 1;
6542 }
6543 }
6544 return 0;
6545 }
6546
6547 /* Add an element, discard the old if the key already exists.
6548 * Return 0 on insert and 1 on update. */
6549 static int hashSet(robj *o, robj *key, robj *value) {
6550 int update = 0;
6551 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6552 key = getDecodedObject(key);
6553 value = getDecodedObject(value);
6554 o->ptr = zipmapSet(o->ptr,
6555 key->ptr,sdslen(key->ptr),
6556 value->ptr,sdslen(value->ptr), &update);
6557 decrRefCount(key);
6558 decrRefCount(value);
6559
6560 /* Check if the zipmap needs to be upgraded to a real hash table */
6561 if (zipmapLen(o->ptr) > server.hash_max_zipmap_entries)
6562 convertToRealHash(o);
6563 } else {
6564 if (dictReplace(o->ptr,key,value)) {
6565 /* Insert */
6566 incrRefCount(key);
6567 } else {
6568 /* Update */
6569 update = 1;
6570 }
6571 incrRefCount(value);
6572 }
6573 return update;
6574 }
6575
6576 /* Delete an element from a hash.
6577 * Return 1 on deleted and 0 on not found. */
6578 static int hashDelete(robj *o, robj *key) {
6579 int deleted = 0;
6580 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6581 key = getDecodedObject(key);
6582 o->ptr = zipmapDel(o->ptr,key->ptr,sdslen(key->ptr), &deleted);
6583 decrRefCount(key);
6584 } else {
6585 deleted = dictDelete((dict*)o->ptr,key) == DICT_OK;
6586 /* Always check if the dictionary needs a resize after a delete. */
6587 if (deleted && htNeedsResize(o->ptr)) dictResize(o->ptr);
6588 }
6589 return deleted;
6590 }
6591
6592 /* Return the number of elements in a hash. */
6593 static unsigned long hashLength(robj *o) {
6594 return (o->encoding == REDIS_ENCODING_ZIPMAP) ?
6595 zipmapLen((unsigned char*)o->ptr) : dictSize((dict*)o->ptr);
6596 }
6597
6598 /* Structure to hold hash iteration abstration. Note that iteration over
6599 * hashes involves both fields and values. Because it is possible that
6600 * not both are required, store pointers in the iterator to avoid
6601 * unnecessary memory allocation for fields/values. */
6602 typedef struct {
6603 int encoding;
6604 unsigned char *zi;
6605 unsigned char *zk, *zv;
6606 unsigned int zklen, zvlen;
6607
6608 dictIterator *di;
6609 dictEntry *de;
6610 } hashIterator;
6611
6612 static hashIterator *hashInitIterator(robj *subject) {
6613 hashIterator *hi = zmalloc(sizeof(hashIterator));
6614 hi->encoding = subject->encoding;
6615 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6616 hi->zi = zipmapRewind(subject->ptr);
6617 } else if (hi->encoding == REDIS_ENCODING_HT) {
6618 hi->di = dictGetIterator(subject->ptr);
6619 } else {
6620 redisAssert(NULL);
6621 }
6622 return hi;
6623 }
6624
6625 static void hashReleaseIterator(hashIterator *hi) {
6626 if (hi->encoding == REDIS_ENCODING_HT) {
6627 dictReleaseIterator(hi->di);
6628 }
6629 zfree(hi);
6630 }
6631
6632 /* Move to the next entry in the hash. Return REDIS_OK when the next entry
6633 * could be found and REDIS_ERR when the iterator reaches the end. */
6634 static int hashNext(hashIterator *hi) {
6635 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6636 if ((hi->zi = zipmapNext(hi->zi, &hi->zk, &hi->zklen,
6637 &hi->zv, &hi->zvlen)) == NULL) return REDIS_ERR;
6638 } else {
6639 if ((hi->de = dictNext(hi->di)) == NULL) return REDIS_ERR;
6640 }
6641 return REDIS_OK;
6642 }
6643
6644 /* Get key or value object at current iteration position.
6645 * This increases the refcount of the field object by 1. */
6646 static robj *hashCurrent(hashIterator *hi, int what) {
6647 robj *o;
6648 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6649 if (what & REDIS_HASH_KEY) {
6650 o = createStringObject((char*)hi->zk,hi->zklen);
6651 } else {
6652 o = createStringObject((char*)hi->zv,hi->zvlen);
6653 }
6654 } else {
6655 if (what & REDIS_HASH_KEY) {
6656 o = dictGetEntryKey(hi->de);
6657 } else {
6658 o = dictGetEntryVal(hi->de);
6659 }
6660 incrRefCount(o);
6661 }
6662 return o;
6663 }
6664
6665 static robj *hashLookupWriteOrCreate(redisClient *c, robj *key) {
6666 robj *o = lookupKeyWrite(c->db,key);
6667 if (o == NULL) {
6668 o = createHashObject();
6669 dbAdd(c->db,key,o);
6670 } else {
6671 if (o->type != REDIS_HASH) {
6672 addReply(c,shared.wrongtypeerr);
6673 return NULL;
6674 }
6675 }
6676 return o;
6677 }
6678
6679 /* ============================= Hash commands ============================== */
6680 static void hsetCommand(redisClient *c) {
6681 int update;
6682 robj *o;
6683
6684 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6685 hashTryConversion(o,c->argv,2,3);
6686 hashTryObjectEncoding(o,&c->argv[2], &c->argv[3]);
6687 update = hashSet(o,c->argv[2],c->argv[3]);
6688 addReply(c, update ? shared.czero : shared.cone);
6689 server.dirty++;
6690 }
6691
6692 static void hsetnxCommand(redisClient *c) {
6693 robj *o;
6694 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6695 hashTryConversion(o,c->argv,2,3);
6696
6697 if (hashExists(o, c->argv[2])) {
6698 addReply(c, shared.czero);
6699 } else {
6700 hashTryObjectEncoding(o,&c->argv[2], &c->argv[3]);
6701 hashSet(o,c->argv[2],c->argv[3]);
6702 addReply(c, shared.cone);
6703 server.dirty++;
6704 }
6705 }
6706
6707 static void hmsetCommand(redisClient *c) {
6708 int i;
6709 robj *o;
6710
6711 if ((c->argc % 2) == 1) {
6712 addReplySds(c,sdsnew("-ERR wrong number of arguments for HMSET\r\n"));
6713 return;
6714 }
6715
6716 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6717 hashTryConversion(o,c->argv,2,c->argc-1);
6718 for (i = 2; i < c->argc; i += 2) {
6719 hashTryObjectEncoding(o,&c->argv[i], &c->argv[i+1]);
6720 hashSet(o,c->argv[i],c->argv[i+1]);
6721 }
6722 addReply(c, shared.ok);
6723 server.dirty++;
6724 }
6725
6726 static void hincrbyCommand(redisClient *c) {
6727 long long value, incr;
6728 robj *o, *current, *new;
6729
6730 if (getLongLongFromObjectOrReply(c,c->argv[3],&incr,NULL) != REDIS_OK) return;
6731 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6732 if ((current = hashGet(o,c->argv[2])) != NULL) {
6733 if (getLongLongFromObjectOrReply(c,current,&value,
6734 "hash value is not an integer") != REDIS_OK) {
6735 decrRefCount(current);
6736 return;
6737 }
6738 decrRefCount(current);
6739 } else {
6740 value = 0;
6741 }
6742
6743 value += incr;
6744 new = createStringObjectFromLongLong(value);
6745 hashTryObjectEncoding(o,&c->argv[2],NULL);
6746 hashSet(o,c->argv[2],new);
6747 decrRefCount(new);
6748 addReplyLongLong(c,value);
6749 server.dirty++;
6750 }
6751
6752 static void hgetCommand(redisClient *c) {
6753 robj *o, *value;
6754 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6755 checkType(c,o,REDIS_HASH)) return;
6756
6757 if ((value = hashGet(o,c->argv[2])) != NULL) {
6758 addReplyBulk(c,value);
6759 decrRefCount(value);
6760 } else {
6761 addReply(c,shared.nullbulk);
6762 }
6763 }
6764
6765 static void hmgetCommand(redisClient *c) {
6766 int i;
6767 robj *o, *value;
6768 o = lookupKeyRead(c->db,c->argv[1]);
6769 if (o != NULL && o->type != REDIS_HASH) {
6770 addReply(c,shared.wrongtypeerr);
6771 }
6772
6773 /* Note the check for o != NULL happens inside the loop. This is
6774 * done because objects that cannot be found are considered to be
6775 * an empty hash. The reply should then be a series of NULLs. */
6776 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-2));
6777 for (i = 2; i < c->argc; i++) {
6778 if (o != NULL && (value = hashGet(o,c->argv[i])) != NULL) {
6779 addReplyBulk(c,value);
6780 decrRefCount(value);
6781 } else {
6782 addReply(c,shared.nullbulk);
6783 }
6784 }
6785 }
6786
6787 static void hdelCommand(redisClient *c) {
6788 robj *o;
6789 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
6790 checkType(c,o,REDIS_HASH)) return;
6791
6792 if (hashDelete(o,c->argv[2])) {
6793 if (hashLength(o) == 0) dbDelete(c->db,c->argv[1]);
6794 addReply(c,shared.cone);
6795 server.dirty++;
6796 } else {
6797 addReply(c,shared.czero);
6798 }
6799 }
6800
6801 static void hlenCommand(redisClient *c) {
6802 robj *o;
6803 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6804 checkType(c,o,REDIS_HASH)) return;
6805
6806 addReplyUlong(c,hashLength(o));
6807 }
6808
6809 static void genericHgetallCommand(redisClient *c, int flags) {
6810 robj *o, *lenobj, *obj;
6811 unsigned long count = 0;
6812 hashIterator *hi;
6813
6814 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
6815 || checkType(c,o,REDIS_HASH)) return;
6816
6817 lenobj = createObject(REDIS_STRING,NULL);
6818 addReply(c,lenobj);
6819 decrRefCount(lenobj);
6820
6821 hi = hashInitIterator(o);
6822 while (hashNext(hi) != REDIS_ERR) {
6823 if (flags & REDIS_HASH_KEY) {
6824 obj = hashCurrent(hi,REDIS_HASH_KEY);
6825 addReplyBulk(c,obj);
6826 decrRefCount(obj);
6827 count++;
6828 }
6829 if (flags & REDIS_HASH_VALUE) {
6830 obj = hashCurrent(hi,REDIS_HASH_VALUE);
6831 addReplyBulk(c,obj);
6832 decrRefCount(obj);
6833 count++;
6834 }
6835 }
6836 hashReleaseIterator(hi);
6837
6838 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",count);
6839 }
6840
6841 static void hkeysCommand(redisClient *c) {
6842 genericHgetallCommand(c,REDIS_HASH_KEY);
6843 }
6844
6845 static void hvalsCommand(redisClient *c) {
6846 genericHgetallCommand(c,REDIS_HASH_VALUE);
6847 }
6848
6849 static void hgetallCommand(redisClient *c) {
6850 genericHgetallCommand(c,REDIS_HASH_KEY|REDIS_HASH_VALUE);
6851 }
6852
6853 static void hexistsCommand(redisClient *c) {
6854 robj *o;
6855 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6856 checkType(c,o,REDIS_HASH)) return;
6857
6858 addReply(c, hashExists(o,c->argv[2]) ? shared.cone : shared.czero);
6859 }
6860
6861 static void convertToRealHash(robj *o) {
6862 unsigned char *key, *val, *p, *zm = o->ptr;
6863 unsigned int klen, vlen;
6864 dict *dict = dictCreate(&hashDictType,NULL);
6865
6866 assert(o->type == REDIS_HASH && o->encoding != REDIS_ENCODING_HT);
6867 p = zipmapRewind(zm);
6868 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
6869 robj *keyobj, *valobj;
6870
6871 keyobj = createStringObject((char*)key,klen);
6872 valobj = createStringObject((char*)val,vlen);
6873 keyobj = tryObjectEncoding(keyobj);
6874 valobj = tryObjectEncoding(valobj);
6875 dictAdd(dict,keyobj,valobj);
6876 }
6877 o->encoding = REDIS_ENCODING_HT;
6878 o->ptr = dict;
6879 zfree(zm);
6880 }
6881
6882 /* ========================= Non type-specific commands ==================== */
6883
6884 static void flushdbCommand(redisClient *c) {
6885 server.dirty += dictSize(c->db->dict);
6886 touchWatchedKeysOnFlush(c->db->id);
6887 dictEmpty(c->db->dict);
6888 dictEmpty(c->db->expires);
6889 addReply(c,shared.ok);
6890 }
6891
6892 static void flushallCommand(redisClient *c) {
6893 touchWatchedKeysOnFlush(-1);
6894 server.dirty += emptyDb();
6895 addReply(c,shared.ok);
6896 if (server.bgsavechildpid != -1) {
6897 kill(server.bgsavechildpid,SIGKILL);
6898 rdbRemoveTempFile(server.bgsavechildpid);
6899 }
6900 rdbSave(server.dbfilename);
6901 server.dirty++;
6902 }
6903
6904 static redisSortOperation *createSortOperation(int type, robj *pattern) {
6905 redisSortOperation *so = zmalloc(sizeof(*so));
6906 so->type = type;
6907 so->pattern = pattern;
6908 return so;
6909 }
6910
6911 /* Return the value associated to the key with a name obtained
6912 * substituting the first occurence of '*' in 'pattern' with 'subst'.
6913 * The returned object will always have its refcount increased by 1
6914 * when it is non-NULL. */
6915 static robj *lookupKeyByPattern(redisDb *db, robj *pattern, robj *subst) {
6916 char *p, *f;
6917 sds spat, ssub;
6918 robj keyobj, fieldobj, *o;
6919 int prefixlen, sublen, postfixlen, fieldlen;
6920 /* Expoit the internal sds representation to create a sds string allocated on the stack in order to make this function faster */
6921 struct {
6922 long len;
6923 long free;
6924 char buf[REDIS_SORTKEY_MAX+1];
6925 } keyname, fieldname;
6926
6927 /* If the pattern is "#" return the substitution object itself in order
6928 * to implement the "SORT ... GET #" feature. */
6929 spat = pattern->ptr;
6930 if (spat[0] == '#' && spat[1] == '\0') {
6931 incrRefCount(subst);
6932 return subst;
6933 }
6934
6935 /* The substitution object may be specially encoded. If so we create
6936 * a decoded object on the fly. Otherwise getDecodedObject will just
6937 * increment the ref count, that we'll decrement later. */
6938 subst = getDecodedObject(subst);
6939
6940 ssub = subst->ptr;
6941 if (sdslen(spat)+sdslen(ssub)-1 > REDIS_SORTKEY_MAX) return NULL;
6942 p = strchr(spat,'*');
6943 if (!p) {
6944 decrRefCount(subst);
6945 return NULL;
6946 }
6947
6948 /* Find out if we're dealing with a hash dereference. */
6949 if ((f = strstr(p+1, "->")) != NULL) {
6950 fieldlen = sdslen(spat)-(f-spat);
6951 /* this also copies \0 character */
6952 memcpy(fieldname.buf,f+2,fieldlen-1);
6953 fieldname.len = fieldlen-2;
6954 } else {
6955 fieldlen = 0;
6956 }
6957
6958 prefixlen = p-spat;
6959 sublen = sdslen(ssub);
6960 postfixlen = sdslen(spat)-(prefixlen+1)-fieldlen;
6961 memcpy(keyname.buf,spat,prefixlen);
6962 memcpy(keyname.buf+prefixlen,ssub,sublen);
6963 memcpy(keyname.buf+prefixlen+sublen,p+1,postfixlen);
6964 keyname.buf[prefixlen+sublen+postfixlen] = '\0';
6965 keyname.len = prefixlen+sublen+postfixlen;
6966 decrRefCount(subst);
6967
6968 /* Lookup substituted key */
6969 initStaticStringObject(keyobj,((char*)&keyname)+(sizeof(long)*2));
6970 o = lookupKeyRead(db,&keyobj);
6971 if (o == NULL) return NULL;
6972
6973 if (fieldlen > 0) {
6974 if (o->type != REDIS_HASH || fieldname.len < 1) return NULL;
6975
6976 /* Retrieve value from hash by the field name. This operation
6977 * already increases the refcount of the returned object. */
6978 initStaticStringObject(fieldobj,((char*)&fieldname)+(sizeof(long)*2));
6979 o = hashGet(o, &fieldobj);
6980 } else {
6981 if (o->type != REDIS_STRING) return NULL;
6982
6983 /* Every object that this function returns needs to have its refcount
6984 * increased. sortCommand decreases it again. */
6985 incrRefCount(o);
6986 }
6987
6988 return o;
6989 }
6990
6991 /* sortCompare() is used by qsort in sortCommand(). Given that qsort_r with
6992 * the additional parameter is not standard but a BSD-specific we have to
6993 * pass sorting parameters via the global 'server' structure */
6994 static int sortCompare(const void *s1, const void *s2) {
6995 const redisSortObject *so1 = s1, *so2 = s2;
6996 int cmp;
6997
6998 if (!server.sort_alpha) {
6999 /* Numeric sorting. Here it's trivial as we precomputed scores */
7000 if (so1->u.score > so2->u.score) {
7001 cmp = 1;
7002 } else if (so1->u.score < so2->u.score) {
7003 cmp = -1;
7004 } else {
7005 cmp = 0;
7006 }
7007 } else {
7008 /* Alphanumeric sorting */
7009 if (server.sort_bypattern) {
7010 if (!so1->u.cmpobj || !so2->u.cmpobj) {
7011 /* At least one compare object is NULL */
7012 if (so1->u.cmpobj == so2->u.cmpobj)
7013 cmp = 0;
7014 else if (so1->u.cmpobj == NULL)
7015 cmp = -1;
7016 else
7017 cmp = 1;
7018 } else {
7019 /* We have both the objects, use strcoll */
7020 cmp = strcoll(so1->u.cmpobj->ptr,so2->u.cmpobj->ptr);
7021 }
7022 } else {
7023 /* Compare elements directly. */
7024 cmp = compareStringObjects(so1->obj,so2->obj);
7025 }
7026 }
7027 return server.sort_desc ? -cmp : cmp;
7028 }
7029
7030 /* The SORT command is the most complex command in Redis. Warning: this code
7031 * is optimized for speed and a bit less for readability */
7032 static void sortCommand(redisClient *c) {
7033 list *operations;
7034 int outputlen = 0;
7035 int desc = 0, alpha = 0;
7036 int limit_start = 0, limit_count = -1, start, end;
7037 int j, dontsort = 0, vectorlen;
7038 int getop = 0; /* GET operation counter */
7039 robj *sortval, *sortby = NULL, *storekey = NULL;
7040 redisSortObject *vector; /* Resulting vector to sort */
7041
7042 /* Lookup the key to sort. It must be of the right types */
7043 sortval = lookupKeyRead(c->db,c->argv[1]);
7044 if (sortval == NULL) {
7045 addReply(c,shared.emptymultibulk);
7046 return;
7047 }
7048 if (sortval->type != REDIS_SET && sortval->type != REDIS_LIST &&
7049 sortval->type != REDIS_ZSET)
7050 {
7051 addReply(c,shared.wrongtypeerr);
7052 return;
7053 }
7054
7055 /* Create a list of operations to perform for every sorted element.
7056 * Operations can be GET/DEL/INCR/DECR */
7057 operations = listCreate();
7058 listSetFreeMethod(operations,zfree);
7059 j = 2;
7060
7061 /* Now we need to protect sortval incrementing its count, in the future
7062 * SORT may have options able to overwrite/delete keys during the sorting
7063 * and the sorted key itself may get destroied */
7064 incrRefCount(sortval);
7065
7066 /* The SORT command has an SQL-alike syntax, parse it */
7067 while(j < c->argc) {
7068 int leftargs = c->argc-j-1;
7069 if (!strcasecmp(c->argv[j]->ptr,"asc")) {
7070 desc = 0;
7071 } else if (!strcasecmp(c->argv[j]->ptr,"desc")) {
7072 desc = 1;
7073 } else if (!strcasecmp(c->argv[j]->ptr,"alpha")) {
7074 alpha = 1;
7075 } else if (!strcasecmp(c->argv[j]->ptr,"limit") && leftargs >= 2) {
7076 limit_start = atoi(c->argv[j+1]->ptr);
7077 limit_count = atoi(c->argv[j+2]->ptr);
7078 j+=2;
7079 } else if (!strcasecmp(c->argv[j]->ptr,"store") && leftargs >= 1) {
7080 storekey = c->argv[j+1];
7081 j++;
7082 } else if (!strcasecmp(c->argv[j]->ptr,"by") && leftargs >= 1) {
7083 sortby = c->argv[j+1];
7084 /* If the BY pattern does not contain '*', i.e. it is constant,
7085 * we don't need to sort nor to lookup the weight keys. */
7086 if (strchr(c->argv[j+1]->ptr,'*') == NULL) dontsort = 1;
7087 j++;
7088 } else if (!strcasecmp(c->argv[j]->ptr,"get") && leftargs >= 1) {
7089 listAddNodeTail(operations,createSortOperation(
7090 REDIS_SORT_GET,c->argv[j+1]));
7091 getop++;
7092 j++;
7093 } else {
7094 decrRefCount(sortval);
7095 listRelease(operations);
7096 addReply(c,shared.syntaxerr);
7097 return;
7098 }
7099 j++;
7100 }
7101
7102 /* Load the sorting vector with all the objects to sort */
7103 switch(sortval->type) {
7104 case REDIS_LIST: vectorlen = listLength((list*)sortval->ptr); break;
7105 case REDIS_SET: vectorlen = dictSize((dict*)sortval->ptr); break;
7106 case REDIS_ZSET: vectorlen = dictSize(((zset*)sortval->ptr)->dict); break;
7107 default: vectorlen = 0; redisPanic("Bad SORT type"); /* Avoid GCC warning */
7108 }
7109 vector = zmalloc(sizeof(redisSortObject)*vectorlen);
7110 j = 0;
7111
7112 if (sortval->type == REDIS_LIST) {
7113 list *list = sortval->ptr;
7114 listNode *ln;
7115 listIter li;
7116
7117 listRewind(list,&li);
7118 while((ln = listNext(&li))) {
7119 robj *ele = ln->value;
7120 vector[j].obj = ele;
7121 vector[j].u.score = 0;
7122 vector[j].u.cmpobj = NULL;
7123 j++;
7124 }
7125 } else {
7126 dict *set;
7127 dictIterator *di;
7128 dictEntry *setele;
7129
7130 if (sortval->type == REDIS_SET) {
7131 set = sortval->ptr;
7132 } else {
7133 zset *zs = sortval->ptr;
7134 set = zs->dict;
7135 }
7136
7137 di = dictGetIterator(set);
7138 while((setele = dictNext(di)) != NULL) {
7139 vector[j].obj = dictGetEntryKey(setele);
7140 vector[j].u.score = 0;
7141 vector[j].u.cmpobj = NULL;
7142 j++;
7143 }
7144 dictReleaseIterator(di);
7145 }
7146 redisAssert(j == vectorlen);
7147
7148 /* Now it's time to load the right scores in the sorting vector */
7149 if (dontsort == 0) {
7150 for (j = 0; j < vectorlen; j++) {
7151 robj *byval;
7152 if (sortby) {
7153 /* lookup value to sort by */
7154 byval = lookupKeyByPattern(c->db,sortby,vector[j].obj);
7155 if (!byval) continue;
7156 } else {
7157 /* use object itself to sort by */
7158 byval = vector[j].obj;
7159 }
7160
7161 if (alpha) {
7162 if (sortby) vector[j].u.cmpobj = getDecodedObject(byval);
7163 } else {
7164 if (byval->encoding == REDIS_ENCODING_RAW) {
7165 vector[j].u.score = strtod(byval->ptr,NULL);
7166 } else if (byval->encoding == REDIS_ENCODING_INT) {
7167 /* Don't need to decode the object if it's
7168 * integer-encoded (the only encoding supported) so
7169 * far. We can just cast it */
7170 vector[j].u.score = (long)byval->ptr;
7171 } else {
7172 redisAssert(1 != 1);
7173 }
7174 }
7175
7176 /* when the object was retrieved using lookupKeyByPattern,
7177 * its refcount needs to be decreased. */
7178 if (sortby) {
7179 decrRefCount(byval);
7180 }
7181 }
7182 }
7183
7184 /* We are ready to sort the vector... perform a bit of sanity check
7185 * on the LIMIT option too. We'll use a partial version of quicksort. */
7186 start = (limit_start < 0) ? 0 : limit_start;
7187 end = (limit_count < 0) ? vectorlen-1 : start+limit_count-1;
7188 if (start >= vectorlen) {
7189 start = vectorlen-1;
7190 end = vectorlen-2;
7191 }
7192 if (end >= vectorlen) end = vectorlen-1;
7193
7194 if (dontsort == 0) {
7195 server.sort_desc = desc;
7196 server.sort_alpha = alpha;
7197 server.sort_bypattern = sortby ? 1 : 0;
7198 if (sortby && (start != 0 || end != vectorlen-1))
7199 pqsort(vector,vectorlen,sizeof(redisSortObject),sortCompare, start,end);
7200 else
7201 qsort(vector,vectorlen,sizeof(redisSortObject),sortCompare);
7202 }
7203
7204 /* Send command output to the output buffer, performing the specified
7205 * GET/DEL/INCR/DECR operations if any. */
7206 outputlen = getop ? getop*(end-start+1) : end-start+1;
7207 if (storekey == NULL) {
7208 /* STORE option not specified, sent the sorting result to client */
7209 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",outputlen));
7210 for (j = start; j <= end; j++) {
7211 listNode *ln;
7212 listIter li;
7213
7214 if (!getop) addReplyBulk(c,vector[j].obj);
7215 listRewind(operations,&li);
7216 while((ln = listNext(&li))) {
7217 redisSortOperation *sop = ln->value;
7218 robj *val = lookupKeyByPattern(c->db,sop->pattern,
7219 vector[j].obj);
7220
7221 if (sop->type == REDIS_SORT_GET) {
7222 if (!val) {
7223 addReply(c,shared.nullbulk);
7224 } else {
7225 addReplyBulk(c,val);
7226 decrRefCount(val);
7227 }
7228 } else {
7229 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
7230 }
7231 }
7232 }
7233 } else {
7234 robj *listObject = createListObject();
7235 list *listPtr = (list*) listObject->ptr;
7236
7237 /* STORE option specified, set the sorting result as a List object */
7238 for (j = start; j <= end; j++) {
7239 listNode *ln;
7240 listIter li;
7241
7242 if (!getop) {
7243 listAddNodeTail(listPtr,vector[j].obj);
7244 incrRefCount(vector[j].obj);
7245 }
7246 listRewind(operations,&li);
7247 while((ln = listNext(&li))) {
7248 redisSortOperation *sop = ln->value;
7249 robj *val = lookupKeyByPattern(c->db,sop->pattern,
7250 vector[j].obj);
7251
7252 if (sop->type == REDIS_SORT_GET) {
7253 if (!val) {
7254 listAddNodeTail(listPtr,createStringObject("",0));
7255 } else {
7256 /* We should do a incrRefCount on val because it is
7257 * added to the list, but also a decrRefCount because
7258 * it is returned by lookupKeyByPattern. This results
7259 * in doing nothing at all. */
7260 listAddNodeTail(listPtr,val);
7261 }
7262 } else {
7263 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
7264 }
7265 }
7266 }
7267 dbReplace(c->db,storekey,listObject);
7268 /* Note: we add 1 because the DB is dirty anyway since even if the
7269 * SORT result is empty a new key is set and maybe the old content
7270 * replaced. */
7271 server.dirty += 1+outputlen;
7272 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",outputlen));
7273 }
7274
7275 /* Cleanup */
7276 decrRefCount(sortval);
7277 listRelease(operations);
7278 for (j = 0; j < vectorlen; j++) {
7279 if (alpha && vector[j].u.cmpobj)
7280 decrRefCount(vector[j].u.cmpobj);
7281 }
7282 zfree(vector);
7283 }
7284
/* Convert an amount of bytes into a human readable string in the form
 * of 100B, 4K, 100M, 2G, 3T, and so forth.
 *
 * 's' is the output buffer: it must be large enough for the formatted
 * value plus the terminating NUL (no size is passed, so nothing is
 * checked here). 'n' is the number of bytes.
 *
 * Values below 1024 are printed as an integer number of bytes; larger
 * values are printed with two decimals in the largest fitting unit.
 * Everything from 1024^4 bytes upwards is expressed in terabytes, so the
 * buffer is always written. */
static void bytesToHuman(char *s, unsigned long long n) {
    double d;

    if (n < 1024) {
        /* Bytes */
        sprintf(s,"%lluB",n);
    } else if (n < (1024*1024)) {
        d = (double)n/(1024);
        sprintf(s,"%.2fK",d);
    } else if (n < (1024LL*1024*1024)) {
        d = (double)n/(1024*1024);
        sprintf(s,"%.2fM",d);
    } else if (n < (1024LL*1024*1024*1024)) {
        d = (double)n/(1024LL*1024*1024);
        sprintf(s,"%.2fG",d);
    } else {
        /* 1 TiB and above: without this branch 's' was left untouched
         * and the caller would print an uninitialized buffer. */
        d = (double)n/(1024LL*1024*1024*1024);
        sprintf(s,"%.2fT",d);
    }
}
7305
7306 /* Create the string returned by the INFO command. This is decoupled
7307 * by the INFO command itself as we need to report the same information
7308 * on memory corruption problems. */
7309 static sds genRedisInfoString(void) {
7310 sds info;
7311 time_t uptime = time(NULL)-server.stat_starttime;
7312 int j;
7313 char hmem[64];
7314
7315 bytesToHuman(hmem,zmalloc_used_memory());
7316 info = sdscatprintf(sdsempty(),
7317 "redis_version:%s\r\n"
7318 "redis_git_sha1:%s\r\n"
7319 "redis_git_dirty:%d\r\n"
7320 "arch_bits:%s\r\n"
7321 "multiplexing_api:%s\r\n"
7322 "process_id:%ld\r\n"
7323 "uptime_in_seconds:%ld\r\n"
7324 "uptime_in_days:%ld\r\n"
7325 "connected_clients:%d\r\n"
7326 "connected_slaves:%d\r\n"
7327 "blocked_clients:%d\r\n"
7328 "used_memory:%zu\r\n"
7329 "used_memory_human:%s\r\n"
7330 "changes_since_last_save:%lld\r\n"
7331 "bgsave_in_progress:%d\r\n"
7332 "last_save_time:%ld\r\n"
7333 "bgrewriteaof_in_progress:%d\r\n"
7334 "total_connections_received:%lld\r\n"
7335 "total_commands_processed:%lld\r\n"
7336 "expired_keys:%lld\r\n"
7337 "hash_max_zipmap_entries:%zu\r\n"
7338 "hash_max_zipmap_value:%zu\r\n"
7339 "pubsub_channels:%ld\r\n"
7340 "pubsub_patterns:%u\r\n"
7341 "vm_enabled:%d\r\n"
7342 "role:%s\r\n"
7343 ,REDIS_VERSION,
7344 REDIS_GIT_SHA1,
7345 strtol(REDIS_GIT_DIRTY,NULL,10) > 0,
7346 (sizeof(long) == 8) ? "64" : "32",
7347 aeGetApiName(),
7348 (long) getpid(),
7349 uptime,
7350 uptime/(3600*24),
7351 listLength(server.clients)-listLength(server.slaves),
7352 listLength(server.slaves),
7353 server.blpop_blocked_clients,
7354 zmalloc_used_memory(),
7355 hmem,
7356 server.dirty,
7357 server.bgsavechildpid != -1,
7358 server.lastsave,
7359 server.bgrewritechildpid != -1,
7360 server.stat_numconnections,
7361 server.stat_numcommands,
7362 server.stat_expiredkeys,
7363 server.hash_max_zipmap_entries,
7364 server.hash_max_zipmap_value,
7365 dictSize(server.pubsub_channels),
7366 listLength(server.pubsub_patterns),
7367 server.vm_enabled != 0,
7368 server.masterhost == NULL ? "master" : "slave"
7369 );
7370 if (server.masterhost) {
7371 info = sdscatprintf(info,
7372 "master_host:%s\r\n"
7373 "master_port:%d\r\n"
7374 "master_link_status:%s\r\n"
7375 "master_last_io_seconds_ago:%d\r\n"
7376 ,server.masterhost,
7377 server.masterport,
7378 (server.replstate == REDIS_REPL_CONNECTED) ?
7379 "up" : "down",
7380 server.master ? ((int)(time(NULL)-server.master->lastinteraction)) : -1
7381 );
7382 }
7383 if (server.vm_enabled) {
7384 lockThreadedIO();
7385 info = sdscatprintf(info,
7386 "vm_conf_max_memory:%llu\r\n"
7387 "vm_conf_page_size:%llu\r\n"
7388 "vm_conf_pages:%llu\r\n"
7389 "vm_stats_used_pages:%llu\r\n"
7390 "vm_stats_swapped_objects:%llu\r\n"
7391 "vm_stats_swappin_count:%llu\r\n"
7392 "vm_stats_swappout_count:%llu\r\n"
7393 "vm_stats_io_newjobs_len:%lu\r\n"
7394 "vm_stats_io_processing_len:%lu\r\n"
7395 "vm_stats_io_processed_len:%lu\r\n"
7396 "vm_stats_io_active_threads:%lu\r\n"
7397 "vm_stats_blocked_clients:%lu\r\n"
7398 ,(unsigned long long) server.vm_max_memory,
7399 (unsigned long long) server.vm_page_size,
7400 (unsigned long long) server.vm_pages,
7401 (unsigned long long) server.vm_stats_used_pages,
7402 (unsigned long long) server.vm_stats_swapped_objects,
7403 (unsigned long long) server.vm_stats_swapins,
7404 (unsigned long long) server.vm_stats_swapouts,
7405 (unsigned long) listLength(server.io_newjobs),
7406 (unsigned long) listLength(server.io_processing),
7407 (unsigned long) listLength(server.io_processed),
7408 (unsigned long) server.io_active_threads,
7409 (unsigned long) server.vm_blocked_clients
7410 );
7411 unlockThreadedIO();
7412 }
7413 for (j = 0; j < server.dbnum; j++) {
7414 long long keys, vkeys;
7415
7416 keys = dictSize(server.db[j].dict);
7417 vkeys = dictSize(server.db[j].expires);
7418 if (keys || vkeys) {
7419 info = sdscatprintf(info, "db%d:keys=%lld,expires=%lld\r\n",
7420 j, keys, vkeys);
7421 }
7422 }
7423 return info;
7424 }
7425
7426 static void infoCommand(redisClient *c) {
7427 sds info = genRedisInfoString();
7428 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",
7429 (unsigned long)sdslen(info)));
7430 addReplySds(c,info);
7431 addReply(c,shared.crlf);
7432 }
7433
7434 static void monitorCommand(redisClient *c) {
7435 /* ignore MONITOR if aleady slave or in monitor mode */
7436 if (c->flags & REDIS_SLAVE) return;
7437
7438 c->flags |= (REDIS_SLAVE|REDIS_MONITOR);
7439 c->slaveseldb = 0;
7440 listAddNodeTail(server.monitors,c);
7441 addReply(c,shared.ok);
7442 }
7443
7444 /* ================================= Expire ================================= */
7445 static int removeExpire(redisDb *db, robj *key) {
7446 if (dictDelete(db->expires,key->ptr) == DICT_OK) {
7447 return 1;
7448 } else {
7449 return 0;
7450 }
7451 }
7452
7453 static int setExpire(redisDb *db, robj *key, time_t when) {
7454 sds copy = sdsdup(key->ptr);
7455 if (dictAdd(db->expires,copy,(void*)when) == DICT_ERR) {
7456 sdsfree(copy);
7457 return 0;
7458 } else {
7459 return 1;
7460 }
7461 }
7462
7463 /* Return the expire time of the specified key, or -1 if no expire
7464 * is associated with this key (i.e. the key is non volatile) */
7465 static time_t getExpire(redisDb *db, robj *key) {
7466 dictEntry *de;
7467
7468 /* No expire? return ASAP */
7469 if (dictSize(db->expires) == 0 ||
7470 (de = dictFind(db->expires,key->ptr)) == NULL) return -1;
7471
7472 return (time_t) dictGetEntryVal(de);
7473 }
7474
7475 static int expireIfNeeded(redisDb *db, robj *key) {
7476 time_t when;
7477 dictEntry *de;
7478
7479 /* No expire? return ASAP */
7480 if (dictSize(db->expires) == 0 ||
7481 (de = dictFind(db->expires,key->ptr)) == NULL) return 0;
7482
7483 /* Lookup the expire */
7484 when = (time_t) dictGetEntryVal(de);
7485 if (time(NULL) <= when) return 0;
7486
7487 /* Delete the key */
7488 dbDelete(db,key);
7489 server.stat_expiredkeys++;
7490 return 1;
7491 }
7492
7493 static int deleteIfVolatile(redisDb *db, robj *key) {
7494 dictEntry *de;
7495
7496 /* No expire? return ASAP */
7497 if (dictSize(db->expires) == 0 ||
7498 (de = dictFind(db->expires,key->ptr)) == NULL) return 0;
7499
7500 /* Delete the key */
7501 server.dirty++;
7502 server.stat_expiredkeys++;
7503 dictDelete(db->expires,key->ptr);
7504 return dictDelete(db->dict,key->ptr) == DICT_OK;
7505 }
7506
7507 static void expireGenericCommand(redisClient *c, robj *key, robj *param, long offset) {
7508 dictEntry *de;
7509 time_t seconds;
7510
7511 if (getLongFromObjectOrReply(c, param, &seconds, NULL) != REDIS_OK) return;
7512
7513 seconds -= offset;
7514
7515 de = dictFind(c->db->dict,key->ptr);
7516 if (de == NULL) {
7517 addReply(c,shared.czero);
7518 return;
7519 }
7520 if (seconds <= 0) {
7521 if (dbDelete(c->db,key)) server.dirty++;
7522 addReply(c, shared.cone);
7523 return;
7524 } else {
7525 time_t when = time(NULL)+seconds;
7526 if (setExpire(c->db,key,when)) {
7527 addReply(c,shared.cone);
7528 server.dirty++;
7529 } else {
7530 addReply(c,shared.czero);
7531 }
7532 return;
7533 }
7534 }
7535
7536 static void expireCommand(redisClient *c) {
7537 expireGenericCommand(c,c->argv[1],c->argv[2],0);
7538 }
7539
7540 static void expireatCommand(redisClient *c) {
7541 expireGenericCommand(c,c->argv[1],c->argv[2],time(NULL));
7542 }
7543
7544 static void ttlCommand(redisClient *c) {
7545 time_t expire;
7546 int ttl = -1;
7547
7548 expire = getExpire(c->db,c->argv[1]);
7549 if (expire != -1) {
7550 ttl = (int) (expire-time(NULL));
7551 if (ttl < 0) ttl = -1;
7552 }
7553 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",ttl));
7554 }
7555
7556 /* ================================ MULTI/EXEC ============================== */
7557
7558 /* Client state initialization for MULTI/EXEC */
7559 static void initClientMultiState(redisClient *c) {
7560 c->mstate.commands = NULL;
7561 c->mstate.count = 0;
7562 }
7563
7564 /* Release all the resources associated with MULTI/EXEC state */
7565 static void freeClientMultiState(redisClient *c) {
7566 int j;
7567
7568 for (j = 0; j < c->mstate.count; j++) {
7569 int i;
7570 multiCmd *mc = c->mstate.commands+j;
7571
7572 for (i = 0; i < mc->argc; i++)
7573 decrRefCount(mc->argv[i]);
7574 zfree(mc->argv);
7575 }
7576 zfree(c->mstate.commands);
7577 }
7578
7579 /* Add a new command into the MULTI commands queue */
7580 static void queueMultiCommand(redisClient *c, struct redisCommand *cmd) {
7581 multiCmd *mc;
7582 int j;
7583
7584 c->mstate.commands = zrealloc(c->mstate.commands,
7585 sizeof(multiCmd)*(c->mstate.count+1));
7586 mc = c->mstate.commands+c->mstate.count;
7587 mc->cmd = cmd;
7588 mc->argc = c->argc;
7589 mc->argv = zmalloc(sizeof(robj*)*c->argc);
7590 memcpy(mc->argv,c->argv,sizeof(robj*)*c->argc);
7591 for (j = 0; j < c->argc; j++)
7592 incrRefCount(mc->argv[j]);
7593 c->mstate.count++;
7594 }
7595
7596 static void multiCommand(redisClient *c) {
7597 if (c->flags & REDIS_MULTI) {
7598 addReplySds(c,sdsnew("-ERR MULTI calls can not be nested\r\n"));
7599 return;
7600 }
7601 c->flags |= REDIS_MULTI;
7602 addReply(c,shared.ok);
7603 }
7604
7605 static void discardCommand(redisClient *c) {
7606 if (!(c->flags & REDIS_MULTI)) {
7607 addReplySds(c,sdsnew("-ERR DISCARD without MULTI\r\n"));
7608 return;
7609 }
7610
7611 freeClientMultiState(c);
7612 initClientMultiState(c);
7613 c->flags &= (~REDIS_MULTI);
7614 addReply(c,shared.ok);
7615 }
7616
7617 /* Send a MULTI command to all the slaves and AOF file. Check the execCommand
7618 * implememntation for more information. */
7619 static void execCommandReplicateMulti(redisClient *c) {
7620 struct redisCommand *cmd;
7621 robj *multistring = createStringObject("MULTI",5);
7622
7623 cmd = lookupCommand("multi");
7624 if (server.appendonly)
7625 feedAppendOnlyFile(cmd,c->db->id,&multistring,1);
7626 if (listLength(server.slaves))
7627 replicationFeedSlaves(server.slaves,c->db->id,&multistring,1);
7628 decrRefCount(multistring);
7629 }
7630
7631 static void execCommand(redisClient *c) {
7632 int j;
7633 robj **orig_argv;
7634 int orig_argc;
7635
7636 if (!(c->flags & REDIS_MULTI)) {
7637 addReplySds(c,sdsnew("-ERR EXEC without MULTI\r\n"));
7638 return;
7639 }
7640
7641 /* Check if we need to abort the EXEC if some WATCHed key was touched.
7642 * A failed EXEC will return a multi bulk nil object. */
7643 if (c->flags & REDIS_DIRTY_CAS) {
7644 freeClientMultiState(c);
7645 initClientMultiState(c);
7646 c->flags &= ~(REDIS_MULTI|REDIS_DIRTY_CAS);
7647 unwatchAllKeys(c);
7648 addReply(c,shared.nullmultibulk);
7649 return;
7650 }
7651
7652 /* Replicate a MULTI request now that we are sure the block is executed.
7653 * This way we'll deliver the MULTI/..../EXEC block as a whole and
7654 * both the AOF and the replication link will have the same consistency
7655 * and atomicity guarantees. */
7656 execCommandReplicateMulti(c);
7657
7658 /* Exec all the queued commands */
7659 unwatchAllKeys(c); /* Unwatch ASAP otherwise we'll waste CPU cycles */
7660 orig_argv = c->argv;
7661 orig_argc = c->argc;
7662 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->mstate.count));
7663 for (j = 0; j < c->mstate.count; j++) {
7664 c->argc = c->mstate.commands[j].argc;
7665 c->argv = c->mstate.commands[j].argv;
7666 call(c,c->mstate.commands[j].cmd);
7667 }
7668 c->argv = orig_argv;
7669 c->argc = orig_argc;
7670 freeClientMultiState(c);
7671 initClientMultiState(c);
7672 c->flags &= ~(REDIS_MULTI|REDIS_DIRTY_CAS);
7673 /* Make sure the EXEC command is always replicated / AOF, since we
7674 * always send the MULTI command (we can't know beforehand if the
7675 * next operations will contain at least a modification to the DB). */
7676 server.dirty++;
7677 }
7678
7679 /* =========================== Blocking Operations ========================= */
7680
/* Currently Redis blocking operations support is limited to list POP ops,
 * so the current implementation is not fully generic, but it is also not
 * completely specific, so it will not require a rewrite to support new
 * kinds of blocking operations in the future.
 *
 * Still it's important to note that list blocking operations can already be
 * used as a notification mechanism in order to implement other blocking
 * operations at application level, so there must be very strong evidence
 * of usefulness and generality before new blocking operations are implemented.
 *
 * This is how the current blocking POP works, we use BLPOP as example:
 * - If the user calls BLPOP and the key exists and contains a non empty list
 *   then LPOP is called instead. So BLPOP is semantically the same as LPOP
 *   if there is no need to block.
 * - If instead BLPOP is called and the key does not exist or the list is
 *   empty we need to block. In order to do so we remove the notification for
 *   new data to read in the client socket (so that we'll not serve new
 *   requests if the blocking request is not served). Also we put the client
 *   in a dictionary (db->blocking_keys) mapping keys to a list of clients
 *   blocking for these keys.
 * - If a PUSH operation against a key with blocked clients waiting is
 *   performed, we serve the first in the list: basically instead of pushing
 *   the new element inside the list we return it to the (first / oldest)
 *   blocking client, unblock the client, and remove it from the list.
 *
 * The above comment and the source code should be enough in order to understand
 * the implementation and modify / fix it later.
 */
7709
7710 /* Set a client in blocking mode for the specified key, with the specified
7711 * timeout */
7712 static void blockForKeys(redisClient *c, robj **keys, int numkeys, time_t timeout) {
7713 dictEntry *de;
7714 list *l;
7715 int j;
7716
7717 c->blocking_keys = zmalloc(sizeof(robj*)*numkeys);
7718 c->blocking_keys_num = numkeys;
7719 c->blockingto = timeout;
7720 for (j = 0; j < numkeys; j++) {
7721 /* Add the key in the client structure, to map clients -> keys */
7722 c->blocking_keys[j] = keys[j];
7723 incrRefCount(keys[j]);
7724
7725 /* And in the other "side", to map keys -> clients */
7726 de = dictFind(c->db->blocking_keys,keys[j]);
7727 if (de == NULL) {
7728 int retval;
7729
7730 /* For every key we take a list of clients blocked for it */
7731 l = listCreate();
7732 retval = dictAdd(c->db->blocking_keys,keys[j],l);
7733 incrRefCount(keys[j]);
7734 assert(retval == DICT_OK);
7735 } else {
7736 l = dictGetEntryVal(de);
7737 }
7738 listAddNodeTail(l,c);
7739 }
7740 /* Mark the client as a blocked client */
7741 c->flags |= REDIS_BLOCKED;
7742 server.blpop_blocked_clients++;
7743 }
7744
7745 /* Unblock a client that's waiting in a blocking operation such as BLPOP */
7746 static void unblockClientWaitingData(redisClient *c) {
7747 dictEntry *de;
7748 list *l;
7749 int j;
7750
7751 assert(c->blocking_keys != NULL);
7752 /* The client may wait for multiple keys, so unblock it for every key. */
7753 for (j = 0; j < c->blocking_keys_num; j++) {
7754 /* Remove this client from the list of clients waiting for this key. */
7755 de = dictFind(c->db->blocking_keys,c->blocking_keys[j]);
7756 assert(de != NULL);
7757 l = dictGetEntryVal(de);
7758 listDelNode(l,listSearchKey(l,c));
7759 /* If the list is empty we need to remove it to avoid wasting memory */
7760 if (listLength(l) == 0)
7761 dictDelete(c->db->blocking_keys,c->blocking_keys[j]);
7762 decrRefCount(c->blocking_keys[j]);
7763 }
7764 /* Cleanup the client structure */
7765 zfree(c->blocking_keys);
7766 c->blocking_keys = NULL;
7767 c->flags &= (~REDIS_BLOCKED);
7768 server.blpop_blocked_clients--;
7769 /* We want to process data if there is some command waiting
7770 * in the input buffer. Note that this is safe even if
7771 * unblockClientWaitingData() gets called from freeClient() because
7772 * freeClient() will be smart enough to call this function
7773 * *after* c->querybuf was set to NULL. */
7774 if (c->querybuf && sdslen(c->querybuf) > 0) processInputBuffer(c);
7775 }
7776
7777 /* This should be called from any function PUSHing into lists.
7778 * 'c' is the "pushing client", 'key' is the key it is pushing data against,
7779 * 'ele' is the element pushed.
7780 *
7781 * If the function returns 0 there was no client waiting for a list push
7782 * against this key.
7783 *
7784 * If the function returns 1 there was a client waiting for a list push
7785 * against this key, the element was passed to this client thus it's not
7786 * needed to actually add it to the list and the caller should return asap. */
7787 static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele) {
7788 struct dictEntry *de;
7789 redisClient *receiver;
7790 list *l;
7791 listNode *ln;
7792
7793 de = dictFind(c->db->blocking_keys,key);
7794 if (de == NULL) return 0;
7795 l = dictGetEntryVal(de);
7796 ln = listFirst(l);
7797 assert(ln != NULL);
7798 receiver = ln->value;
7799
7800 addReplySds(receiver,sdsnew("*2\r\n"));
7801 addReplyBulk(receiver,key);
7802 addReplyBulk(receiver,ele);
7803 unblockClientWaitingData(receiver);
7804 return 1;
7805 }
7806
7807 /* Blocking RPOP/LPOP */
7808 static void blockingPopGenericCommand(redisClient *c, int where) {
7809 robj *o;
7810 time_t timeout;
7811 int j;
7812
7813 for (j = 1; j < c->argc-1; j++) {
7814 o = lookupKeyWrite(c->db,c->argv[j]);
7815 if (o != NULL) {
7816 if (o->type != REDIS_LIST) {
7817 addReply(c,shared.wrongtypeerr);
7818 return;
7819 } else {
7820 list *list = o->ptr;
7821 if (listLength(list) != 0) {
7822 /* If the list contains elements fall back to the usual
7823 * non-blocking POP operation */
7824 robj *argv[2], **orig_argv;
7825 int orig_argc;
7826
7827 /* We need to alter the command arguments before to call
7828 * popGenericCommand() as the command takes a single key. */
7829 orig_argv = c->argv;
7830 orig_argc = c->argc;
7831 argv[1] = c->argv[j];
7832 c->argv = argv;
7833 c->argc = 2;
7834
7835 /* Also the return value is different, we need to output
7836 * the multi bulk reply header and the key name. The
7837 * "real" command will add the last element (the value)
7838 * for us. If this souds like an hack to you it's just
7839 * because it is... */
7840 addReplySds(c,sdsnew("*2\r\n"));
7841 addReplyBulk(c,argv[1]);
7842 popGenericCommand(c,where);
7843
7844 /* Fix the client structure with the original stuff */
7845 c->argv = orig_argv;
7846 c->argc = orig_argc;
7847 return;
7848 }
7849 }
7850 }
7851 }
7852 /* If the list is empty or the key does not exists we must block */
7853 timeout = strtol(c->argv[c->argc-1]->ptr,NULL,10);
7854 if (timeout > 0) timeout += time(NULL);
7855 blockForKeys(c,c->argv+1,c->argc-2,timeout);
7856 }
7857
7858 static void blpopCommand(redisClient *c) {
7859 blockingPopGenericCommand(c,REDIS_HEAD);
7860 }
7861
7862 static void brpopCommand(redisClient *c) {
7863 blockingPopGenericCommand(c,REDIS_TAIL);
7864 }
7865
7866 /* =============================== Replication ============================= */
7867
7868 static int syncWrite(int fd, char *ptr, ssize_t size, int timeout) {
7869 ssize_t nwritten, ret = size;
7870 time_t start = time(NULL);
7871
7872 timeout++;
7873 while(size) {
7874 if (aeWait(fd,AE_WRITABLE,1000) & AE_WRITABLE) {
7875 nwritten = write(fd,ptr,size);
7876 if (nwritten == -1) return -1;
7877 ptr += nwritten;
7878 size -= nwritten;
7879 }
7880 if ((time(NULL)-start) > timeout) {
7881 errno = ETIMEDOUT;
7882 return -1;
7883 }
7884 }
7885 return ret;
7886 }
7887
7888 static int syncRead(int fd, char *ptr, ssize_t size, int timeout) {
7889 ssize_t nread, totread = 0;
7890 time_t start = time(NULL);
7891
7892 timeout++;
7893 while(size) {
7894 if (aeWait(fd,AE_READABLE,1000) & AE_READABLE) {
7895 nread = read(fd,ptr,size);
7896 if (nread == -1) return -1;
7897 ptr += nread;
7898 size -= nread;
7899 totread += nread;
7900 }
7901 if ((time(NULL)-start) > timeout) {
7902 errno = ETIMEDOUT;
7903 return -1;
7904 }
7905 }
7906 return totread;
7907 }
7908
/* Read a line from 'fd' one byte at a time into 'ptr', a buffer of 'size'
 * bytes. Reading stops at the first '\n' or when the buffer is full, and
 * the result is always null terminated when size > 0. The '\n' is not
 * stored, and a '\r' right before it is replaced by the terminator.
 *
 * Returns the number of bytes stored before the '\n' (a trailing '\r' is
 * included in the count), or -1 if syncRead() fails. 'timeout' is passed to
 * syncRead() for every single byte. */
static int syncReadLine(int fd, char *ptr, ssize_t size, int timeout) {
    ssize_t nread = 0;

    /* Hand back an empty string even when there is no room for data. */
    if (size > 0) *ptr = '\0';

    size--; /* Reserve one byte for the null terminator. */
    while(size > 0) {
        char c;

        if (syncRead(fd,&c,1,timeout) == -1) return -1;
        if (c == '\n') {
            *ptr = '\0';
            if (nread && *(ptr-1) == '\r') *(ptr-1) = '\0';
            return nread;
        } else {
            *ptr++ = c;
            *ptr = '\0';
            nread++;
        }
        /* Account for the byte just stored: without this a line longer than
         * the buffer would be written past its end. */
        size--;
    }
    return nread;
}
7929
7930 static void syncCommand(redisClient *c) {
7931 /* ignore SYNC if aleady slave or in monitor mode */
7932 if (c->flags & REDIS_SLAVE) return;
7933
7934 /* SYNC can't be issued when the server has pending data to send to
7935 * the client about already issued commands. We need a fresh reply
7936 * buffer registering the differences between the BGSAVE and the current
7937 * dataset, so that we can copy to other slaves if needed. */
7938 if (listLength(c->reply) != 0) {
7939 addReplySds(c,sdsnew("-ERR SYNC is invalid with pending input\r\n"));
7940 return;
7941 }
7942
7943 redisLog(REDIS_NOTICE,"Slave ask for synchronization");
7944 /* Here we need to check if there is a background saving operation
7945 * in progress, or if it is required to start one */
7946 if (server.bgsavechildpid != -1) {
7947 /* Ok a background save is in progress. Let's check if it is a good
7948 * one for replication, i.e. if there is another slave that is
7949 * registering differences since the server forked to save */
7950 redisClient *slave;
7951 listNode *ln;
7952 listIter li;
7953
7954 listRewind(server.slaves,&li);
7955 while((ln = listNext(&li))) {
7956 slave = ln->value;
7957 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) break;
7958 }
7959 if (ln) {
7960 /* Perfect, the server is already registering differences for
7961 * another slave. Set the right state, and copy the buffer. */
7962 listRelease(c->reply);
7963 c->reply = listDup(slave->reply);
7964 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7965 redisLog(REDIS_NOTICE,"Waiting for end of BGSAVE for SYNC");
7966 } else {
7967 /* No way, we need to wait for the next BGSAVE in order to
7968 * register differences */
7969 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
7970 redisLog(REDIS_NOTICE,"Waiting for next BGSAVE for SYNC");
7971 }
7972 } else {
7973 /* Ok we don't have a BGSAVE in progress, let's start one */
7974 redisLog(REDIS_NOTICE,"Starting BGSAVE for SYNC");
7975 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
7976 redisLog(REDIS_NOTICE,"Replication failed, can't BGSAVE");
7977 addReplySds(c,sdsnew("-ERR Unalbe to perform background save\r\n"));
7978 return;
7979 }
7980 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7981 }
7982 c->repldbfd = -1;
7983 c->flags |= REDIS_SLAVE;
7984 c->slaveseldb = 0;
7985 listAddNodeTail(server.slaves,c);
7986 return;
7987 }
7988
7989 static void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) {
7990 redisClient *slave = privdata;
7991 REDIS_NOTUSED(el);
7992 REDIS_NOTUSED(mask);
7993 char buf[REDIS_IOBUF_LEN];
7994 ssize_t nwritten, buflen;
7995
7996 if (slave->repldboff == 0) {
7997 /* Write the bulk write count before to transfer the DB. In theory here
7998 * we don't know how much room there is in the output buffer of the
7999 * socket, but in pratice SO_SNDLOWAT (the minimum count for output
8000 * operations) will never be smaller than the few bytes we need. */
8001 sds bulkcount;
8002
8003 bulkcount = sdscatprintf(sdsempty(),"$%lld\r\n",(unsigned long long)
8004 slave->repldbsize);
8005 if (write(fd,bulkcount,sdslen(bulkcount)) != (signed)sdslen(bulkcount))
8006 {
8007 sdsfree(bulkcount);
8008 freeClient(slave);
8009 return;
8010 }
8011 sdsfree(bulkcount);
8012 }
8013 lseek(slave->repldbfd,slave->repldboff,SEEK_SET);
8014 buflen = read(slave->repldbfd,buf,REDIS_IOBUF_LEN);
8015 if (buflen <= 0) {
8016 redisLog(REDIS_WARNING,"Read error sending DB to slave: %s",
8017 (buflen == 0) ? "premature EOF" : strerror(errno));
8018 freeClient(slave);
8019 return;
8020 }
8021 if ((nwritten = write(fd,buf,buflen)) == -1) {
8022 redisLog(REDIS_VERBOSE,"Write error sending DB to slave: %s",
8023 strerror(errno));
8024 freeClient(slave);
8025 return;
8026 }
8027 slave->repldboff += nwritten;
8028 if (slave->repldboff == slave->repldbsize) {
8029 close(slave->repldbfd);
8030 slave->repldbfd = -1;
8031 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
8032 slave->replstate = REDIS_REPL_ONLINE;
8033 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE,
8034 sendReplyToClient, slave) == AE_ERR) {
8035 freeClient(slave);
8036 return;
8037 }
8038 addReplySds(slave,sdsempty());
8039 redisLog(REDIS_NOTICE,"Synchronization with slave succeeded");
8040 }
8041 }
8042
8043 /* This function is called at the end of every backgrond saving.
8044 * The argument bgsaveerr is REDIS_OK if the background saving succeeded
8045 * otherwise REDIS_ERR is passed to the function.
8046 *
8047 * The goal of this function is to handle slaves waiting for a successful
8048 * background saving in order to perform non-blocking synchronization. */
8049 static void updateSlavesWaitingBgsave(int bgsaveerr) {
8050 listNode *ln;
8051 int startbgsave = 0;
8052 listIter li;
8053
8054 listRewind(server.slaves,&li);
8055 while((ln = listNext(&li))) {
8056 redisClient *slave = ln->value;
8057
8058 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) {
8059 startbgsave = 1;
8060 slave->replstate = REDIS_REPL_WAIT_BGSAVE_END;
8061 } else if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) {
8062 struct redis_stat buf;
8063
8064 if (bgsaveerr != REDIS_OK) {
8065 freeClient(slave);
8066 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE child returned an error");
8067 continue;
8068 }
8069 if ((slave->repldbfd = open(server.dbfilename,O_RDONLY)) == -1 ||
8070 redis_fstat(slave->repldbfd,&buf) == -1) {
8071 freeClient(slave);
8072 redisLog(REDIS_WARNING,"SYNC failed. Can't open/stat DB after BGSAVE: %s", strerror(errno));
8073 continue;
8074 }
8075 slave->repldboff = 0;
8076 slave->repldbsize = buf.st_size;
8077 slave->replstate = REDIS_REPL_SEND_BULK;
8078 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
8079 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE, sendBulkToSlave, slave) == AE_ERR) {
8080 freeClient(slave);
8081 continue;
8082 }
8083 }
8084 }
8085 if (startbgsave) {
8086 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
8087 listIter li;
8088
8089 listRewind(server.slaves,&li);
8090 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE failed");
8091 while((ln = listNext(&li))) {
8092 redisClient *slave = ln->value;
8093
8094 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START)
8095 freeClient(slave);
8096 }
8097 }
8098 }
8099 }
8100
8101 static int syncWithMaster(void) {
8102 char buf[1024], tmpfile[256], authcmd[1024];
8103 long dumpsize;
8104 int fd = anetTcpConnect(NULL,server.masterhost,server.masterport);
8105 int dfd, maxtries = 5;
8106
8107 if (fd == -1) {
8108 redisLog(REDIS_WARNING,"Unable to connect to MASTER: %s",
8109 strerror(errno));
8110 return REDIS_ERR;
8111 }
8112
8113 /* AUTH with the master if required. */
8114 if(server.masterauth) {
8115 snprintf(authcmd, 1024, "AUTH %s\r\n", server.masterauth);
8116 if (syncWrite(fd, authcmd, strlen(server.masterauth)+7, 5) == -1) {
8117 close(fd);
8118 redisLog(REDIS_WARNING,"Unable to AUTH to MASTER: %s",
8119 strerror(errno));
8120 return REDIS_ERR;
8121 }
8122 /* Read the AUTH result. */
8123 if (syncReadLine(fd,buf,1024,3600) == -1) {
8124 close(fd);
8125 redisLog(REDIS_WARNING,"I/O error reading auth result from MASTER: %s",
8126 strerror(errno));
8127 return REDIS_ERR;
8128 }
8129 if (buf[0] != '+') {
8130 close(fd);
8131 redisLog(REDIS_WARNING,"Cannot AUTH to MASTER, is the masterauth password correct?");
8132 return REDIS_ERR;
8133 }
8134 }
8135
8136 /* Issue the SYNC command */
8137 if (syncWrite(fd,"SYNC \r\n",7,5) == -1) {
8138 close(fd);
8139 redisLog(REDIS_WARNING,"I/O error writing to MASTER: %s",
8140 strerror(errno));
8141 return REDIS_ERR;
8142 }
8143 /* Read the bulk write count */
8144 if (syncReadLine(fd,buf,1024,3600) == -1) {
8145 close(fd);
8146 redisLog(REDIS_WARNING,"I/O error reading bulk count from MASTER: %s",
8147 strerror(errno));
8148 return REDIS_ERR;
8149 }
8150 if (buf[0] != '$') {
8151 close(fd);
8152 redisLog(REDIS_WARNING,"Bad protocol from MASTER, the first byte is not '$', are you sure the host and port are right?");
8153 return REDIS_ERR;
8154 }
8155 dumpsize = strtol(buf+1,NULL,10);
8156 redisLog(REDIS_NOTICE,"Receiving %ld bytes data dump from MASTER",dumpsize);
8157 /* Read the bulk write data on a temp file */
8158 while(maxtries--) {
8159 snprintf(tmpfile,256,
8160 "temp-%d.%ld.rdb",(int)time(NULL),(long int)getpid());
8161 dfd = open(tmpfile,O_CREAT|O_WRONLY|O_EXCL,0644);
8162 if (dfd != -1) break;
8163 sleep(1);
8164 }
8165 if (dfd == -1) {
8166 close(fd);
8167 redisLog(REDIS_WARNING,"Opening the temp file needed for MASTER <-> SLAVE synchronization: %s",strerror(errno));
8168 return REDIS_ERR;
8169 }
8170 while(dumpsize) {
8171 int nread, nwritten;
8172
8173 nread = read(fd,buf,(dumpsize < 1024)?dumpsize:1024);
8174 if (nread == -1) {
8175 redisLog(REDIS_WARNING,"I/O error trying to sync with MASTER: %s",
8176 strerror(errno));
8177 close(fd);
8178 close(dfd);
8179 return REDIS_ERR;
8180 }
8181 nwritten = write(dfd,buf,nread);
8182 if (nwritten == -1) {
8183 redisLog(REDIS_WARNING,"Write error writing to the DB dump file needed for MASTER <-> SLAVE synchrnonization: %s", strerror(errno));
8184 close(fd);
8185 close(dfd);
8186 return REDIS_ERR;
8187 }
8188 dumpsize -= nread;
8189 }
8190 close(dfd);
8191 if (rename(tmpfile,server.dbfilename) == -1) {
8192 redisLog(REDIS_WARNING,"Failed trying to rename the temp DB into dump.rdb in MASTER <-> SLAVE synchronization: %s", strerror(errno));
8193 unlink(tmpfile);
8194 close(fd);
8195 return REDIS_ERR;
8196 }
8197 emptyDb();
8198 if (rdbLoad(server.dbfilename) != REDIS_OK) {
8199 redisLog(REDIS_WARNING,"Failed trying to load the MASTER synchronization DB from disk");
8200 close(fd);
8201 return REDIS_ERR;
8202 }
8203 server.master = createClient(fd);
8204 server.master->flags |= REDIS_MASTER;
8205 server.master->authenticated = 1;
8206 server.replstate = REDIS_REPL_CONNECTED;
8207 return REDIS_OK;
8208 }
8209
8210 static void slaveofCommand(redisClient *c) {
8211 if (!strcasecmp(c->argv[1]->ptr,"no") &&
8212 !strcasecmp(c->argv[2]->ptr,"one")) {
8213 if (server.masterhost) {
8214 sdsfree(server.masterhost);
8215 server.masterhost = NULL;
8216 if (server.master) freeClient(server.master);
8217 server.replstate = REDIS_REPL_NONE;
8218 redisLog(REDIS_NOTICE,"MASTER MODE enabled (user request)");
8219 }
8220 } else {
8221 sdsfree(server.masterhost);
8222 server.masterhost = sdsdup(c->argv[1]->ptr);
8223 server.masterport = atoi(c->argv[2]->ptr);
8224 if (server.master) freeClient(server.master);
8225 server.replstate = REDIS_REPL_CONNECT;
8226 redisLog(REDIS_NOTICE,"SLAVE OF %s:%d enabled (user request)",
8227 server.masterhost, server.masterport);
8228 }
8229 addReply(c,shared.ok);
8230 }
8231
8232 /* ============================ Maxmemory directive ======================== */
8233
8234 /* Try to free one object form the pre-allocated objects free list.
8235 * This is useful under low mem conditions as by default we take 1 million
8236 * free objects allocated. On success REDIS_OK is returned, otherwise
8237 * REDIS_ERR. */
8238 static int tryFreeOneObjectFromFreelist(void) {
8239 robj *o;
8240
8241 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
8242 if (listLength(server.objfreelist)) {
8243 listNode *head = listFirst(server.objfreelist);
8244 o = listNodeValue(head);
8245 listDelNode(server.objfreelist,head);
8246 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
8247 zfree(o);
8248 return REDIS_OK;
8249 } else {
8250 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
8251 return REDIS_ERR;
8252 }
8253 }
8254
8255 /* This function gets called when 'maxmemory' is set on the config file to limit
8256 * the max memory used by the server, and we are out of memory.
8257 * This function will try to, in order:
8258 *
8259 * - Free objects from the free list
8260 * - Try to remove keys with an EXPIRE set
8261 *
8262 * It is not possible to free enough memory to reach used-memory < maxmemory
8263 * the server will start refusing commands that will enlarge even more the
8264 * memory usage.
8265 */
8266 static void freeMemoryIfNeeded(void) {
8267 while (server.maxmemory && zmalloc_used_memory() > server.maxmemory) {
8268 int j, k, freed = 0;
8269
8270 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
8271 for (j = 0; j < server.dbnum; j++) {
8272 int minttl = -1;
8273 robj *minkey = NULL;
8274 struct dictEntry *de;
8275
8276 if (dictSize(server.db[j].expires)) {
8277 freed = 1;
8278 /* From a sample of three keys drop the one nearest to
8279 * the natural expire */
8280 for (k = 0; k < 3; k++) {
8281 time_t t;
8282
8283 de = dictGetRandomKey(server.db[j].expires);
8284 t = (time_t) dictGetEntryVal(de);
8285 if (minttl == -1 || t < minttl) {
8286 minkey = dictGetEntryKey(de);
8287 minttl = t;
8288 }
8289 }
8290 dbDelete(server.db+j,minkey);
8291 }
8292 }
8293 if (!freed) return; /* nothing to free... */
8294 }
8295 }
8296
8297 /* ============================== Append Only file ========================== */
8298
8299 /* Called when the user switches from "appendonly yes" to "appendonly no"
8300 * at runtime using the CONFIG command. */
8301 static void stopAppendOnly(void) {
8302 flushAppendOnlyFile();
8303 aof_fsync(server.appendfd);
8304 close(server.appendfd);
8305
8306 server.appendfd = -1;
8307 server.appendseldb = -1;
8308 server.appendonly = 0;
8309 /* rewrite operation in progress? kill it, wait child exit */
8310 if (server.bgsavechildpid != -1) {
8311 int statloc;
8312
8313 if (kill(server.bgsavechildpid,SIGKILL) != -1)
8314 wait3(&statloc,0,NULL);
8315 /* reset the buffer accumulating changes while the child saves */
8316 sdsfree(server.bgrewritebuf);
8317 server.bgrewritebuf = sdsempty();
8318 server.bgsavechildpid = -1;
8319 }
8320 }
8321
8322 /* Called when the user switches from "appendonly no" to "appendonly yes"
8323 * at runtime using the CONFIG command. */
8324 static int startAppendOnly(void) {
8325 server.appendonly = 1;
8326 server.lastfsync = time(NULL);
8327 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
8328 if (server.appendfd == -1) {
8329 redisLog(REDIS_WARNING,"Used tried to switch on AOF via CONFIG, but I can't open the AOF file: %s",strerror(errno));
8330 return REDIS_ERR;
8331 }
8332 if (rewriteAppendOnlyFileBackground() == REDIS_ERR) {
8333 server.appendonly = 0;
8334 close(server.appendfd);
8335 redisLog(REDIS_WARNING,"Used tried to switch on AOF via CONFIG, I can't trigger a background AOF rewrite operation. Check the above logs for more info about the error.",strerror(errno));
8336 return REDIS_ERR;
8337 }
8338 return REDIS_OK;
8339 }
8340
8341 /* Write the append only file buffer on disk.
8342 *
8343 * Since we are required to write the AOF before replying to the client,
8344 * and the only way the client socket can get a write is entering when the
8345 * the event loop, we accumulate all the AOF writes in a memory
8346 * buffer and write it on disk using this function just before entering
8347 * the event loop again. */
8348 static void flushAppendOnlyFile(void) {
8349 time_t now;
8350 ssize_t nwritten;
8351
8352 if (sdslen(server.aofbuf) == 0) return;
8353
8354 /* We want to perform a single write. This should be guaranteed atomic
8355 * at least if the filesystem we are writing is a real physical one.
8356 * While this will save us against the server being killed I don't think
8357 * there is much to do about the whole server stopping for power problems
8358 * or alike */
8359 nwritten = write(server.appendfd,server.aofbuf,sdslen(server.aofbuf));
8360 if (nwritten != (signed)sdslen(server.aofbuf)) {
8361 /* Ooops, we are in troubles. The best thing to do for now is
8362 * aborting instead of giving the illusion that everything is
8363 * working as expected. */
8364 if (nwritten == -1) {
8365 redisLog(REDIS_WARNING,"Exiting on error writing to the append-only file: %s",strerror(errno));
8366 } else {
8367 redisLog(REDIS_WARNING,"Exiting on short write while writing to the append-only file: %s",strerror(errno));
8368 }
8369 exit(1);
8370 }
8371 sdsfree(server.aofbuf);
8372 server.aofbuf = sdsempty();
8373
8374 /* Don't Fsync if no-appendfsync-on-rewrite is set to yes and we have
8375 * childs performing heavy I/O on disk. */
8376 if (server.no_appendfsync_on_rewrite &&
8377 (server.bgrewritechildpid != -1 || server.bgsavechildpid != -1))
8378 return;
8379 /* Fsync if needed */
8380 now = time(NULL);
8381 if (server.appendfsync == APPENDFSYNC_ALWAYS ||
8382 (server.appendfsync == APPENDFSYNC_EVERYSEC &&
8383 now-server.lastfsync > 1))
8384 {
8385 /* aof_fsync is defined as fdatasync() for Linux in order to avoid
8386 * flushing metadata. */
8387 aof_fsync(server.appendfd); /* Let's try to get this data on the disk */
8388 server.lastfsync = now;
8389 }
8390 }
8391
8392 static sds catAppendOnlyGenericCommand(sds buf, int argc, robj **argv) {
8393 int j;
8394 buf = sdscatprintf(buf,"*%d\r\n",argc);
8395 for (j = 0; j < argc; j++) {
8396 robj *o = getDecodedObject(argv[j]);
8397 buf = sdscatprintf(buf,"$%lu\r\n",(unsigned long)sdslen(o->ptr));
8398 buf = sdscatlen(buf,o->ptr,sdslen(o->ptr));
8399 buf = sdscatlen(buf,"\r\n",2);
8400 decrRefCount(o);
8401 }
8402 return buf;
8403 }
8404
8405 static sds catAppendOnlyExpireAtCommand(sds buf, robj *key, robj *seconds) {
8406 int argc = 3;
8407 long when;
8408 robj *argv[3];
8409
8410 /* Make sure we can use strtol */
8411 seconds = getDecodedObject(seconds);
8412 when = time(NULL)+strtol(seconds->ptr,NULL,10);
8413 decrRefCount(seconds);
8414
8415 argv[0] = createStringObject("EXPIREAT",8);
8416 argv[1] = key;
8417 argv[2] = createObject(REDIS_STRING,
8418 sdscatprintf(sdsempty(),"%ld",when));
8419 buf = catAppendOnlyGenericCommand(buf, argc, argv);
8420 decrRefCount(argv[0]);
8421 decrRefCount(argv[2]);
8422 return buf;
8423 }
8424
8425 static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) {
8426 sds buf = sdsempty();
8427 robj *tmpargv[3];
8428
8429 /* The DB this command was targetting is not the same as the last command
8430 * we appendend. To issue a SELECT command is needed. */
8431 if (dictid != server.appendseldb) {
8432 char seldb[64];
8433
8434 snprintf(seldb,sizeof(seldb),"%d",dictid);
8435 buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
8436 (unsigned long)strlen(seldb),seldb);
8437 server.appendseldb = dictid;
8438 }
8439
8440 if (cmd->proc == expireCommand) {
8441 /* Translate EXPIRE into EXPIREAT */
8442 buf = catAppendOnlyExpireAtCommand(buf,argv[1],argv[2]);
8443 } else if (cmd->proc == setexCommand) {
8444 /* Translate SETEX to SET and EXPIREAT */
8445 tmpargv[0] = createStringObject("SET",3);
8446 tmpargv[1] = argv[1];
8447 tmpargv[2] = argv[3];
8448 buf = catAppendOnlyGenericCommand(buf,3,tmpargv);
8449 decrRefCount(tmpargv[0]);
8450 buf = catAppendOnlyExpireAtCommand(buf,argv[1],argv[2]);
8451 } else {
8452 buf = catAppendOnlyGenericCommand(buf,argc,argv);
8453 }
8454
8455 /* Append to the AOF buffer. This will be flushed on disk just before
8456 * of re-entering the event loop, so before the client will get a
8457 * positive reply about the operation performed. */
8458 server.aofbuf = sdscatlen(server.aofbuf,buf,sdslen(buf));
8459
8460 /* If a background append only file rewriting is in progress we want to
8461 * accumulate the differences between the child DB and the current one
8462 * in a buffer, so that when the child process will do its work we
8463 * can append the differences to the new append only file. */
8464 if (server.bgrewritechildpid != -1)
8465 server.bgrewritebuf = sdscatlen(server.bgrewritebuf,buf,sdslen(buf));
8466
8467 sdsfree(buf);
8468 }
8469
8470 /* In Redis commands are always executed in the context of a client, so in
8471 * order to load the append only file we need to create a fake client. */
8472 static struct redisClient *createFakeClient(void) {
8473 struct redisClient *c = zmalloc(sizeof(*c));
8474
8475 selectDb(c,0);
8476 c->fd = -1;
8477 c->querybuf = sdsempty();
8478 c->argc = 0;
8479 c->argv = NULL;
8480 c->flags = 0;
8481 /* We set the fake client as a slave waiting for the synchronization
8482 * so that Redis will not try to send replies to this client. */
8483 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
8484 c->reply = listCreate();
8485 listSetFreeMethod(c->reply,decrRefCount);
8486 listSetDupMethod(c->reply,dupClientReplyValue);
8487 initClientMultiState(c);
8488 return c;
8489 }
8490
8491 static void freeFakeClient(struct redisClient *c) {
8492 sdsfree(c->querybuf);
8493 listRelease(c->reply);
8494 freeClientMultiState(c);
8495 zfree(c);
8496 }
8497
8498 /* Replay the append log file. On error REDIS_OK is returned. On non fatal
8499 * error (the append only file is zero-length) REDIS_ERR is returned. On
8500 * fatal error an error message is logged and the program exists. */
8501 int loadAppendOnlyFile(char *filename) {
8502 struct redisClient *fakeClient;
8503 FILE *fp = fopen(filename,"r");
8504 struct redis_stat sb;
8505 int appendonly = server.appendonly;
8506
8507 if (redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0)
8508 return REDIS_ERR;
8509
8510 if (fp == NULL) {
8511 redisLog(REDIS_WARNING,"Fatal error: can't open the append log file for reading: %s",strerror(errno));
8512 exit(1);
8513 }
8514
8515 /* Temporarily disable AOF, to prevent EXEC from feeding a MULTI
8516 * to the same file we're about to read. */
8517 server.appendonly = 0;
8518
8519 fakeClient = createFakeClient();
8520 while(1) {
8521 int argc, j;
8522 unsigned long len;
8523 robj **argv;
8524 char buf[128];
8525 sds argsds;
8526 struct redisCommand *cmd;
8527 int force_swapout;
8528
8529 if (fgets(buf,sizeof(buf),fp) == NULL) {
8530 if (feof(fp))
8531 break;
8532 else
8533 goto readerr;
8534 }
8535 if (buf[0] != '*') goto fmterr;
8536 argc = atoi(buf+1);
8537 argv = zmalloc(sizeof(robj*)*argc);
8538 for (j = 0; j < argc; j++) {
8539 if (fgets(buf,sizeof(buf),fp) == NULL) goto readerr;
8540 if (buf[0] != '$') goto fmterr;
8541 len = strtol(buf+1,NULL,10);
8542 argsds = sdsnewlen(NULL,len);
8543 if (len && fread(argsds,len,1,fp) == 0) goto fmterr;
8544 argv[j] = createObject(REDIS_STRING,argsds);
8545 if (fread(buf,2,1,fp) == 0) goto fmterr; /* discard CRLF */
8546 }
8547
8548 /* Command lookup */
8549 cmd = lookupCommand(argv[0]->ptr);
8550 if (!cmd) {
8551 redisLog(REDIS_WARNING,"Unknown command '%s' reading the append only file", argv[0]->ptr);
8552 exit(1);
8553 }
8554 /* Try object encoding */
8555 if (cmd->flags & REDIS_CMD_BULK)
8556 argv[argc-1] = tryObjectEncoding(argv[argc-1]);
8557 /* Run the command in the context of a fake client */
8558 fakeClient->argc = argc;
8559 fakeClient->argv = argv;
8560 cmd->proc(fakeClient);
8561 /* Discard the reply objects list from the fake client */
8562 while(listLength(fakeClient->reply))
8563 listDelNode(fakeClient->reply,listFirst(fakeClient->reply));
8564 /* Clean up, ready for the next command */
8565 for (j = 0; j < argc; j++) decrRefCount(argv[j]);
8566 zfree(argv);
8567 /* Handle swapping while loading big datasets when VM is on */
8568 force_swapout = 0;
8569 if ((zmalloc_used_memory() - server.vm_max_memory) > 1024*1024*32)
8570 force_swapout = 1;
8571
8572 if (server.vm_enabled && force_swapout) {
8573 while (zmalloc_used_memory() > server.vm_max_memory) {
8574 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
8575 }
8576 }
8577 }
8578
8579 /* This point can only be reached when EOF is reached without errors.
8580 * If the client is in the middle of a MULTI/EXEC, log error and quit. */
8581 if (fakeClient->flags & REDIS_MULTI) goto readerr;
8582
8583 fclose(fp);
8584 freeFakeClient(fakeClient);
8585 server.appendonly = appendonly;
8586 return REDIS_OK;
8587
8588 readerr:
8589 if (feof(fp)) {
8590 redisLog(REDIS_WARNING,"Unexpected end of file reading the append only file");
8591 } else {
8592 redisLog(REDIS_WARNING,"Unrecoverable error reading the append only file: %s", strerror(errno));
8593 }
8594 exit(1);
8595 fmterr:
8596 redisLog(REDIS_WARNING,"Bad file format reading the append only file");
8597 exit(1);
8598 }
8599
8600 /* Write an object into a file in the bulk format $<count>\r\n<payload>\r\n */
8601 static int fwriteBulkObject(FILE *fp, robj *obj) {
8602 char buf[128];
8603 int decrrc = 0;
8604
8605 /* Avoid the incr/decr ref count business if possible to help
8606 * copy-on-write (we are often in a child process when this function
8607 * is called).
8608 * Also makes sure that key objects don't get incrRefCount-ed when VM
8609 * is enabled */
8610 if (obj->encoding != REDIS_ENCODING_RAW) {
8611 obj = getDecodedObject(obj);
8612 decrrc = 1;
8613 }
8614 snprintf(buf,sizeof(buf),"$%ld\r\n",(long)sdslen(obj->ptr));
8615 if (fwrite(buf,strlen(buf),1,fp) == 0) goto err;
8616 if (sdslen(obj->ptr) && fwrite(obj->ptr,sdslen(obj->ptr),1,fp) == 0)
8617 goto err;
8618 if (fwrite("\r\n",2,1,fp) == 0) goto err;
8619 if (decrrc) decrRefCount(obj);
8620 return 1;
8621 err:
8622 if (decrrc) decrRefCount(obj);
8623 return 0;
8624 }
8625
/* Write the binary-safe string 's' of 'len' bytes into a file using the
 * bulk format: $<count>\r\n<payload>\r\n
 *
 * Returns 1 on success, 0 on write error. */
static int fwriteBulkString(FILE *fp, char *s, unsigned long len) {
    char buf[128];

    /* 'len' is unsigned, so it must be formatted with %lu. */
    snprintf(buf,sizeof(buf),"$%lu\r\n",len);
    if (fwrite(buf,strlen(buf),1,fp) == 0) return 0;
    /* An empty payload is legal: skip the zero-sized write. */
    if (len && fwrite(s,len,1,fp) == 0) return 0;
    if (fwrite("\r\n",2,1,fp) == 0) return 0;
    return 1;
}
8637
/* Emit the double 'd' on 'fp' as a bulk: $<count>\r\n<payload>\r\n
 * The payload is the number printed with the "%.17g" format.
 *
 * Returns 1 on success, 0 on write error. */
static int fwriteBulkDouble(FILE *fp, double d) {
    char payload[128], header[128];
    size_t paylen;

    /* The payload is built with its trailing CRLF already appended, so the
     * announced count is the payload length minus those two bytes. */
    snprintf(payload,sizeof(payload),"%.17g\r\n",d);
    paylen = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)paylen-2);

    if (fwrite(header,strlen(header),1,fp) == 0 ||
        fwrite(payload,paylen,1,fp) == 0)
    {
        return 0;
    }
    return 1;
}
8648
/* Emit the long 'l' on 'fp' as a bulk: $<count>\r\n<payload>\r\n
 * The payload is the number printed in decimal.
 *
 * Returns 1 on success, 0 on write error. */
static int fwriteBulkLong(FILE *fp, long l) {
    char payload[128], header[128];
    size_t paylen;

    /* The payload is built with its trailing CRLF already appended, so the
     * announced count is the payload length minus those two bytes. */
    snprintf(payload,sizeof(payload),"%ld\r\n",l);
    paylen = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)paylen-2);

    if (fwrite(header,strlen(header),1,fp) == 0 ||
        fwrite(payload,paylen,1,fp) == 0)
    {
        return 0;
    }
    return 1;
}
8659
8660 /* Write a sequence of commands able to fully rebuild the dataset into
8661 * "filename". Used both by REWRITEAOF and BGREWRITEAOF. */
8662 static int rewriteAppendOnlyFile(char *filename) {
8663 dictIterator *di = NULL;
8664 dictEntry *de;
8665 FILE *fp;
8666 char tmpfile[256];
8667 int j;
8668 time_t now = time(NULL);
8669
8670 /* Note that we have to use a different temp name here compared to the
8671 * one used by rewriteAppendOnlyFileBackground() function. */
8672 snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());
8673 fp = fopen(tmpfile,"w");
8674 if (!fp) {
8675 redisLog(REDIS_WARNING, "Failed rewriting the append only file: %s", strerror(errno));
8676 return REDIS_ERR;
8677 }
8678 for (j = 0; j < server.dbnum; j++) {
8679 char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n";
8680 redisDb *db = server.db+j;
8681 dict *d = db->dict;
8682 if (dictSize(d) == 0) continue;
8683 di = dictGetIterator(d);
8684 if (!di) {
8685 fclose(fp);
8686 return REDIS_ERR;
8687 }
8688
8689 /* SELECT the new DB */
8690 if (fwrite(selectcmd,sizeof(selectcmd)-1,1,fp) == 0) goto werr;
8691 if (fwriteBulkLong(fp,j) == 0) goto werr;
8692
8693 /* Iterate this DB writing every entry */
8694 while((de = dictNext(di)) != NULL) {
8695 sds keystr = dictGetEntryKey(de);
8696 robj key, *o;
8697 time_t expiretime;
8698 int swapped;
8699
8700 keystr = dictGetEntryKey(de);
8701 o = dictGetEntryVal(de);
8702 initStaticStringObject(key,keystr);
8703 /* If the value for this key is swapped, load a preview in memory.
8704 * We use a "swapped" flag to remember if we need to free the
8705 * value object instead to just increment the ref count anyway
8706 * in order to avoid copy-on-write of pages if we are forked() */
8707 if (!server.vm_enabled || o->storage == REDIS_VM_MEMORY ||
8708 o->storage == REDIS_VM_SWAPPING) {
8709 swapped = 0;
8710 } else {
8711 o = vmPreviewObject(o);
8712 swapped = 1;
8713 }
8714 expiretime = getExpire(db,&key);
8715
8716 /* Save the key and associated value */
8717 if (o->type == REDIS_STRING) {
8718 /* Emit a SET command */
8719 char cmd[]="*3\r\n$3\r\nSET\r\n";
8720 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8721 /* Key and value */
8722 if (fwriteBulkObject(fp,&key) == 0) goto werr;
8723 if (fwriteBulkObject(fp,o) == 0) goto werr;
8724 } else if (o->type == REDIS_LIST) {
8725 /* Emit the RPUSHes needed to rebuild the list */
8726 list *list = o->ptr;
8727 listNode *ln;
8728 listIter li;
8729
8730 listRewind(list,&li);
8731 while((ln = listNext(&li))) {
8732 char cmd[]="*3\r\n$5\r\nRPUSH\r\n";
8733 robj *eleobj = listNodeValue(ln);
8734
8735 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8736 if (fwriteBulkObject(fp,&key) == 0) goto werr;
8737 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
8738 }
8739 } else if (o->type == REDIS_SET) {
8740 /* Emit the SADDs needed to rebuild the set */
8741 dict *set = o->ptr;
8742 dictIterator *di = dictGetIterator(set);
8743 dictEntry *de;
8744
8745 while((de = dictNext(di)) != NULL) {
8746 char cmd[]="*3\r\n$4\r\nSADD\r\n";
8747 robj *eleobj = dictGetEntryKey(de);
8748
8749 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8750 if (fwriteBulkObject(fp,&key) == 0) goto werr;
8751 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
8752 }
8753 dictReleaseIterator(di);
8754 } else if (o->type == REDIS_ZSET) {
8755 /* Emit the ZADDs needed to rebuild the sorted set */
8756 zset *zs = o->ptr;
8757 dictIterator *di = dictGetIterator(zs->dict);
8758 dictEntry *de;
8759
8760 while((de = dictNext(di)) != NULL) {
8761 char cmd[]="*4\r\n$4\r\nZADD\r\n";
8762 robj *eleobj = dictGetEntryKey(de);
8763 double *score = dictGetEntryVal(de);
8764
8765 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8766 if (fwriteBulkObject(fp,&key) == 0) goto werr;
8767 if (fwriteBulkDouble(fp,*score) == 0) goto werr;
8768 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
8769 }
8770 dictReleaseIterator(di);
8771 } else if (o->type == REDIS_HASH) {
8772 char cmd[]="*4\r\n$4\r\nHSET\r\n";
8773
8774 /* Emit the HSETs needed to rebuild the hash */
8775 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
8776 unsigned char *p = zipmapRewind(o->ptr);
8777 unsigned char *field, *val;
8778 unsigned int flen, vlen;
8779
8780 while((p = zipmapNext(p,&field,&flen,&val,&vlen)) != NULL) {
8781 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8782 if (fwriteBulkObject(fp,&key) == 0) goto werr;
8783 if (fwriteBulkString(fp,(char*)field,flen) == -1)
8784 return -1;
8785 if (fwriteBulkString(fp,(char*)val,vlen) == -1)
8786 return -1;
8787 }
8788 } else {
8789 dictIterator *di = dictGetIterator(o->ptr);
8790 dictEntry *de;
8791
8792 while((de = dictNext(di)) != NULL) {
8793 robj *field = dictGetEntryKey(de);
8794 robj *val = dictGetEntryVal(de);
8795
8796 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8797 if (fwriteBulkObject(fp,&key) == 0) goto werr;
8798 if (fwriteBulkObject(fp,field) == -1) return -1;
8799 if (fwriteBulkObject(fp,val) == -1) return -1;
8800 }
8801 dictReleaseIterator(di);
8802 }
8803 } else {
8804 redisPanic("Unknown object type");
8805 }
8806 /* Save the expire time */
8807 if (expiretime != -1) {
8808 char cmd[]="*3\r\n$8\r\nEXPIREAT\r\n";
8809 /* If this key is already expired skip it */
8810 if (expiretime < now) continue;
8811 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8812 if (fwriteBulkObject(fp,&key) == 0) goto werr;
8813 if (fwriteBulkLong(fp,expiretime) == 0) goto werr;
8814 }
8815 if (swapped) decrRefCount(o);
8816 }
8817 dictReleaseIterator(di);
8818 }
8819
8820 /* Make sure data will not remain on the OS's output buffers */
8821 fflush(fp);
8822 aof_fsync(fileno(fp));
8823 fclose(fp);
8824
8825 /* Use RENAME to make sure the DB file is changed atomically only
8826 * if the generate DB file is ok. */
8827 if (rename(tmpfile,filename) == -1) {
8828 redisLog(REDIS_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno));
8829 unlink(tmpfile);
8830 return REDIS_ERR;
8831 }
8832 redisLog(REDIS_NOTICE,"SYNC append only file rewrite performed");
8833 return REDIS_OK;
8834
8835 werr:
8836 fclose(fp);
8837 unlink(tmpfile);
8838 redisLog(REDIS_WARNING,"Write error writing append only file on disk: %s", strerror(errno));
8839 if (di) dictReleaseIterator(di);
8840 return REDIS_ERR;
8841 }
8842
8843 /* This is how rewriting of the append only file in background works:
8844 *
8845 * 1) The user calls BGREWRITEAOF
8846 * 2) Redis calls this function, that forks():
8847 * 2a) the child rewrite the append only file in a temp file.
8848 * 2b) the parent accumulates differences in server.bgrewritebuf.
8849 * 3) When the child finished '2a' exists.
8850 * 4) The parent will trap the exit code, if it's OK, will append the
8851 * data accumulated into server.bgrewritebuf into the temp file, and
8852 * finally will rename(2) the temp file in the actual file name.
8853 * The the new file is reopened as the new append only file. Profit!
8854 */
8855 static int rewriteAppendOnlyFileBackground(void) {
8856 pid_t childpid;
8857
8858 if (server.bgrewritechildpid != -1) return REDIS_ERR;
8859 if (server.vm_enabled) waitEmptyIOJobsQueue();
8860 if ((childpid = fork()) == 0) {
8861 /* Child */
8862 char tmpfile[256];
8863
8864 if (server.vm_enabled) vmReopenSwapFile();
8865 close(server.fd);
8866 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
8867 if (rewriteAppendOnlyFile(tmpfile) == REDIS_OK) {
8868 _exit(0);
8869 } else {
8870 _exit(1);
8871 }
8872 } else {
8873 /* Parent */
8874 if (childpid == -1) {
8875 redisLog(REDIS_WARNING,
8876 "Can't rewrite append only file in background: fork: %s",
8877 strerror(errno));
8878 return REDIS_ERR;
8879 }
8880 redisLog(REDIS_NOTICE,
8881 "Background append only file rewriting started by pid %d",childpid);
8882 server.bgrewritechildpid = childpid;
8883 updateDictResizePolicy();
8884 /* We set appendseldb to -1 in order to force the next call to the
8885 * feedAppendOnlyFile() to issue a SELECT command, so the differences
8886 * accumulated by the parent into server.bgrewritebuf will start
8887 * with a SELECT statement and it will be safe to merge. */
8888 server.appendseldb = -1;
8889 return REDIS_OK;
8890 }
8891 return REDIS_OK; /* unreached */
8892 }
8893
8894 static void bgrewriteaofCommand(redisClient *c) {
8895 if (server.bgrewritechildpid != -1) {
8896 addReplySds(c,sdsnew("-ERR background append only file rewriting already in progress\r\n"));
8897 return;
8898 }
8899 if (rewriteAppendOnlyFileBackground() == REDIS_OK) {
8900 char *status = "+Background append only file rewriting started\r\n";
8901 addReplySds(c,sdsnew(status));
8902 } else {
8903 addReply(c,shared.err);
8904 }
8905 }
8906
/* Remove the temp file "temp-rewriteaof-bg-<childpid>.aof", that is the
 * name used by the background rewriting child with the given pid.
 * The result of unlink(2) is not checked. */
static void aofRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-rewriteaof-bg-%d.aof", (int) childpid);
    unlink(path);
}
8913
8914 /* Virtual Memory is composed mainly of two subsystems:
8915 * - Blocking Virtual Memory
8916 * - Threaded Virtual Memory I/O
8917 * The two parts are not fully decoupled, but functions are split among two
8918 * different sections of the source code (delimited by comments) in order to
8919 * make more clear what functionality is about the blocking VM and what about
8920 * the threaded (not blocking) VM.
8921 *
8922 * Redis VM design:
8923 *
8924 * Redis VM is a blocking VM (one that blocks reading swapped values from
8925 * disk into memory when a value swapped out is needed in memory) that is made
8926 * unblocking by trying to examine the command argument vector in order to
8927 * load in background values that will likely be needed in order to exec
8928 * the command. The command is executed only once all the relevant keys
8929 * are loaded into memory.
8930 *
8931 * This basically is almost as simple as a blocking VM, but almost as parallel
8932 * as a fully non-blocking VM.
8933 */
8934
8935 /* =================== Virtual Memory - Blocking Side ====================== */
8936
8937 /* Create a VM pointer object. This kind of objects are used in place of
8938 * values in the key -> value hash table, for swapped out objects. */
8939 static vmpointer *createVmPointer(int vtype) {
8940 vmpointer *vp = zmalloc(sizeof(vmpointer));
8941
8942 vp->type = REDIS_VMPOINTER;
8943 vp->storage = REDIS_VM_SWAPPED;
8944 vp->vtype = vtype;
8945 return vp;
8946 }
8947
8948 static void vmInit(void) {
8949 off_t totsize;
8950 int pipefds[2];
8951 size_t stacksize;
8952 struct flock fl;
8953
8954 if (server.vm_max_threads != 0)
8955 zmalloc_enable_thread_safeness(); /* we need thread safe zmalloc() */
8956
8957 redisLog(REDIS_NOTICE,"Using '%s' as swap file",server.vm_swap_file);
8958 /* Try to open the old swap file, otherwise create it */
8959 if ((server.vm_fp = fopen(server.vm_swap_file,"r+b")) == NULL) {
8960 server.vm_fp = fopen(server.vm_swap_file,"w+b");
8961 }
8962 if (server.vm_fp == NULL) {
8963 redisLog(REDIS_WARNING,
8964 "Can't open the swap file: %s. Exiting.",
8965 strerror(errno));
8966 exit(1);
8967 }
8968 server.vm_fd = fileno(server.vm_fp);
8969 /* Lock the swap file for writing, this is useful in order to avoid
8970 * another instance to use the same swap file for a config error. */
8971 fl.l_type = F_WRLCK;
8972 fl.l_whence = SEEK_SET;
8973 fl.l_start = fl.l_len = 0;
8974 if (fcntl(server.vm_fd,F_SETLK,&fl) == -1) {
8975 redisLog(REDIS_WARNING,
8976 "Can't lock the swap file at '%s': %s. Make sure it is not used by another Redis instance.", server.vm_swap_file, strerror(errno));
8977 exit(1);
8978 }
8979 /* Initialize */
8980 server.vm_next_page = 0;
8981 server.vm_near_pages = 0;
8982 server.vm_stats_used_pages = 0;
8983 server.vm_stats_swapped_objects = 0;
8984 server.vm_stats_swapouts = 0;
8985 server.vm_stats_swapins = 0;
8986 totsize = server.vm_pages*server.vm_page_size;
8987 redisLog(REDIS_NOTICE,"Allocating %lld bytes of swap file",totsize);
8988 if (ftruncate(server.vm_fd,totsize) == -1) {
8989 redisLog(REDIS_WARNING,"Can't ftruncate swap file: %s. Exiting.",
8990 strerror(errno));
8991 exit(1);
8992 } else {
8993 redisLog(REDIS_NOTICE,"Swap file allocated with success");
8994 }
8995 server.vm_bitmap = zmalloc((server.vm_pages+7)/8);
8996 redisLog(REDIS_VERBOSE,"Allocated %lld bytes page table for %lld pages",
8997 (long long) (server.vm_pages+7)/8, server.vm_pages);
8998 memset(server.vm_bitmap,0,(server.vm_pages+7)/8);
8999
9000 /* Initialize threaded I/O (used by Virtual Memory) */
9001 server.io_newjobs = listCreate();
9002 server.io_processing = listCreate();
9003 server.io_processed = listCreate();
9004 server.io_ready_clients = listCreate();
9005 pthread_mutex_init(&server.io_mutex,NULL);
9006 pthread_mutex_init(&server.obj_freelist_mutex,NULL);
9007 pthread_mutex_init(&server.io_swapfile_mutex,NULL);
9008 server.io_active_threads = 0;
9009 if (pipe(pipefds) == -1) {
9010 redisLog(REDIS_WARNING,"Unable to intialized VM: pipe(2): %s. Exiting."
9011 ,strerror(errno));
9012 exit(1);
9013 }
9014 server.io_ready_pipe_read = pipefds[0];
9015 server.io_ready_pipe_write = pipefds[1];
9016 redisAssert(anetNonBlock(NULL,server.io_ready_pipe_read) != ANET_ERR);
9017 /* LZF requires a lot of stack */
9018 pthread_attr_init(&server.io_threads_attr);
9019 pthread_attr_getstacksize(&server.io_threads_attr, &stacksize);
9020 while (stacksize < REDIS_THREAD_STACK_SIZE) stacksize *= 2;
9021 pthread_attr_setstacksize(&server.io_threads_attr, stacksize);
9022 /* Listen for events in the threaded I/O pipe */
9023 if (aeCreateFileEvent(server.el, server.io_ready_pipe_read, AE_READABLE,
9024 vmThreadedIOCompletedJob, NULL) == AE_ERR)
9025 oom("creating file event");
9026 }
9027
9028 /* Mark the page as used */
9029 static void vmMarkPageUsed(off_t page) {
9030 off_t byte = page/8;
9031 int bit = page&7;
9032 redisAssert(vmFreePage(page) == 1);
9033 server.vm_bitmap[byte] |= 1<<bit;
9034 }
9035
9036 /* Mark N contiguous pages as used, with 'page' being the first. */
9037 static void vmMarkPagesUsed(off_t page, off_t count) {
9038 off_t j;
9039
9040 for (j = 0; j < count; j++)
9041 vmMarkPageUsed(page+j);
9042 server.vm_stats_used_pages += count;
9043 redisLog(REDIS_DEBUG,"Mark USED pages: %lld pages at %lld\n",
9044 (long long)count, (long long)page);
9045 }
9046
9047 /* Mark the page as free */
9048 static void vmMarkPageFree(off_t page) {
9049 off_t byte = page/8;
9050 int bit = page&7;
9051 redisAssert(vmFreePage(page) == 0);
9052 server.vm_bitmap[byte] &= ~(1<<bit);
9053 }
9054
9055 /* Mark N contiguous pages as free, with 'page' being the first. */
9056 static void vmMarkPagesFree(off_t page, off_t count) {
9057 off_t j;
9058
9059 for (j = 0; j < count; j++)
9060 vmMarkPageFree(page+j);
9061 server.vm_stats_used_pages -= count;
9062 redisLog(REDIS_DEBUG,"Mark FREE pages: %lld pages at %lld\n",
9063 (long long)count, (long long)page);
9064 }
9065
9066 /* Test if the page is free */
9067 static int vmFreePage(off_t page) {
9068 off_t byte = page/8;
9069 int bit = page&7;
9070 return (server.vm_bitmap[byte] & (1<<bit)) == 0;
9071 }
9072
9073 /* Find N contiguous free pages storing the first page of the cluster in *first.
9074 * Returns REDIS_OK if it was able to find N contiguous pages, otherwise
9075 * REDIS_ERR is returned.
9076 *
9077 * This function uses a simple algorithm: we try to allocate
9078 * REDIS_VM_MAX_NEAR_PAGES sequentially, when we reach this limit we start
9079 * again from the start of the swap file searching for free spaces.
9080 *
9081 * If it looks pretty clear that there are no free pages near our offset
9082 * we try to find less populated places doing a forward jump of
9083 * REDIS_VM_MAX_RANDOM_JUMP, then we start scanning again a few pages
9084 * without hurry, and then we jump again and so forth...
9085 *
9086 * This function can be improved using a free list to avoid to guess
9087 * too much, since we could collect data about freed pages.
9088 *
9089 * note: I implemented this function just after watching an episode of
9090 * Battlestar Galactica, where the hybrid was continuing to say "JUMP!"
9091 */
9092 static int vmFindContiguousPages(off_t *first, off_t n) {
9093 off_t base, offset = 0, since_jump = 0, numfree = 0;
9094
9095 if (server.vm_near_pages == REDIS_VM_MAX_NEAR_PAGES) {
9096 server.vm_near_pages = 0;
9097 server.vm_next_page = 0;
9098 }
9099 server.vm_near_pages++; /* Yet another try for pages near to the old ones */
9100 base = server.vm_next_page;
9101
9102 while(offset < server.vm_pages) {
9103 off_t this = base+offset;
9104
9105 /* If we overflow, restart from page zero */
9106 if (this >= server.vm_pages) {
9107 this -= server.vm_pages;
9108 if (this == 0) {
9109 /* Just overflowed, what we found on tail is no longer
9110 * interesting, as it's no longer contiguous. */
9111 numfree = 0;
9112 }
9113 }
9114 if (vmFreePage(this)) {
9115 /* This is a free page */
9116 numfree++;
9117 /* Already got N free pages? Return to the caller, with success */
9118 if (numfree == n) {
9119 *first = this-(n-1);
9120 server.vm_next_page = this+1;
9121 redisLog(REDIS_DEBUG, "FOUND CONTIGUOUS PAGES: %lld pages at %lld\n", (long long) n, (long long) *first);
9122 return REDIS_OK;
9123 }
9124 } else {
9125 /* The current one is not a free page */
9126 numfree = 0;
9127 }
9128
9129 /* Fast-forward if the current page is not free and we already
9130 * searched enough near this place. */
9131 since_jump++;
9132 if (!numfree && since_jump >= REDIS_VM_MAX_RANDOM_JUMP/4) {
9133 offset += random() % REDIS_VM_MAX_RANDOM_JUMP;
9134 since_jump = 0;
9135 /* Note that even if we rewind after the jump, we are don't need
9136 * to make sure numfree is set to zero as we only jump *if* it
9137 * is set to zero. */
9138 } else {
9139 /* Otherwise just check the next page */
9140 offset++;
9141 }
9142 }
9143 return REDIS_ERR;
9144 }
9145
9146 /* Write the specified object at the specified page of the swap file */
9147 static int vmWriteObjectOnSwap(robj *o, off_t page) {
9148 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
9149 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
9150 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
9151 redisLog(REDIS_WARNING,
9152 "Critical VM problem in vmWriteObjectOnSwap(): can't seek: %s",
9153 strerror(errno));
9154 return REDIS_ERR;
9155 }
9156 rdbSaveObject(server.vm_fp,o);
9157 fflush(server.vm_fp);
9158 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
9159 return REDIS_OK;
9160 }
9161
9162 /* Transfers the 'val' object to disk. Store all the information
9163 * a 'vmpointer' object containing all the information needed to load the
9164 * object back later is returned.
9165 *
9166 * If we can't find enough contiguous empty pages to swap the object on disk
9167 * NULL is returned. */
9168 static vmpointer *vmSwapObjectBlocking(robj *val) {
9169 off_t pages = rdbSavedObjectPages(val,NULL);
9170 off_t page;
9171 vmpointer *vp;
9172
9173 assert(val->storage == REDIS_VM_MEMORY);
9174 assert(val->refcount == 1);
9175 if (vmFindContiguousPages(&page,pages) == REDIS_ERR) return NULL;
9176 if (vmWriteObjectOnSwap(val,page) == REDIS_ERR) return NULL;
9177
9178 vp = createVmPointer(val->type);
9179 vp->page = page;
9180 vp->usedpages = pages;
9181 decrRefCount(val); /* Deallocate the object from memory. */
9182 vmMarkPagesUsed(page,pages);
9183 redisLog(REDIS_DEBUG,"VM: object %p swapped out at %lld (%lld pages)",
9184 (void*) val,
9185 (unsigned long long) page, (unsigned long long) pages);
9186 server.vm_stats_swapped_objects++;
9187 server.vm_stats_swapouts++;
9188 return vp;
9189 }
9190
9191 static robj *vmReadObjectFromSwap(off_t page, int type) {
9192 robj *o;
9193
9194 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
9195 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
9196 redisLog(REDIS_WARNING,
9197 "Unrecoverable VM problem in vmReadObjectFromSwap(): can't seek: %s",
9198 strerror(errno));
9199 _exit(1);
9200 }
9201 o = rdbLoadObject(type,server.vm_fp);
9202 if (o == NULL) {
9203 redisLog(REDIS_WARNING, "Unrecoverable VM problem in vmReadObjectFromSwap(): can't load object from swap file: %s", strerror(errno));
9204 _exit(1);
9205 }
9206 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
9207 return o;
9208 }
9209
9210 /* Load the specified object from swap to memory.
9211 * The newly allocated object is returned.
9212 *
9213 * If preview is true the unserialized object is returned to the caller but
9214 * the pages are not marked as freed, nor the vp object is freed. */
9215 static robj *vmGenericLoadObject(vmpointer *vp, int preview) {
9216 robj *val;
9217
9218 redisAssert(vp->type == REDIS_VMPOINTER &&
9219 (vp->storage == REDIS_VM_SWAPPED || vp->storage == REDIS_VM_LOADING));
9220 val = vmReadObjectFromSwap(vp->page,vp->vtype);
9221 if (!preview) {
9222 redisLog(REDIS_DEBUG, "VM: object %p loaded from disk", (void*)vp);
9223 vmMarkPagesFree(vp->page,vp->usedpages);
9224 zfree(vp);
9225 server.vm_stats_swapped_objects--;
9226 } else {
9227 redisLog(REDIS_DEBUG, "VM: object %p previewed from disk", (void*)vp);
9228 }
9229 server.vm_stats_swapins++;
9230 return val;
9231 }
9232
9233 /* Plain object loading, from swap to memory.
9234 *
9235 * 'o' is actually a redisVmPointer structure that will be freed by the call.
9236 * The return value is the loaded object. */
9237 static robj *vmLoadObject(robj *o) {
9238 /* If we are loading the object in background, stop it, we
9239 * need to load this object synchronously ASAP. */
9240 if (o->storage == REDIS_VM_LOADING)
9241 vmCancelThreadedIOJob(o);
9242 return vmGenericLoadObject((vmpointer*)o,0);
9243 }
9244
9245 /* Just load the value on disk, without to modify the key.
9246 * This is useful when we want to perform some operation on the value
9247 * without to really bring it from swap to memory, like while saving the
9248 * dataset or rewriting the append only log. */
9249 static robj *vmPreviewObject(robj *o) {
9250 return vmGenericLoadObject((vmpointer*)o,1);
9251 }
9252
9253 /* How a good candidate is this object for swapping?
9254 * The better candidate it is, the greater the returned value.
9255 *
9256 * Currently we try to perform a fast estimation of the object size in
9257 * memory, and combine it with aging informations.
9258 *
9259 * Basically swappability = idle-time * log(estimated size)
9260 *
9261 * Bigger objects are preferred over smaller objects, but not
9262 * proportionally, this is why we use the logarithm. This algorithm is
9263 * just a first try and will probably be tuned later. */
9264 static double computeObjectSwappability(robj *o) {
9265 /* actual age can be >= minage, but not < minage. As we use wrapping
9266 * 21 bit clocks with minutes resolution for the LRU. */
9267 time_t minage = abs(server.lruclock - o->lru);
9268 long asize = 0;
9269 list *l;
9270 dict *d;
9271 struct dictEntry *de;
9272 int z;
9273
9274 if (minage <= 0) return 0;
9275 switch(o->type) {
9276 case REDIS_STRING:
9277 if (o->encoding != REDIS_ENCODING_RAW) {
9278 asize = sizeof(*o);
9279 } else {
9280 asize = sdslen(o->ptr)+sizeof(*o)+sizeof(long)*2;
9281 }
9282 break;
9283 case REDIS_LIST:
9284 l = o->ptr;
9285 listNode *ln = listFirst(l);
9286
9287 asize = sizeof(list);
9288 if (ln) {
9289 robj *ele = ln->value;
9290 long elesize;
9291
9292 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9293 (sizeof(*o)+sdslen(ele->ptr)) : sizeof(*o);
9294 asize += (sizeof(listNode)+elesize)*listLength(l);
9295 }
9296 break;
9297 case REDIS_SET:
9298 case REDIS_ZSET:
9299 z = (o->type == REDIS_ZSET);
9300 d = z ? ((zset*)o->ptr)->dict : o->ptr;
9301
9302 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
9303 if (z) asize += sizeof(zset)-sizeof(dict);
9304 if (dictSize(d)) {
9305 long elesize;
9306 robj *ele;
9307
9308 de = dictGetRandomKey(d);
9309 ele = dictGetEntryKey(de);
9310 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9311 (sizeof(*o)+sdslen(ele->ptr)) : sizeof(*o);
9312 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
9313 if (z) asize += sizeof(zskiplistNode)*dictSize(d);
9314 }
9315 break;
9316 case REDIS_HASH:
9317 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
9318 unsigned char *p = zipmapRewind((unsigned char*)o->ptr);
9319 unsigned int len = zipmapLen((unsigned char*)o->ptr);
9320 unsigned int klen, vlen;
9321 unsigned char *key, *val;
9322
9323 if ((p = zipmapNext(p,&key,&klen,&val,&vlen)) == NULL) {
9324 klen = 0;
9325 vlen = 0;
9326 }
9327 asize = len*(klen+vlen+3);
9328 } else if (o->encoding == REDIS_ENCODING_HT) {
9329 d = o->ptr;
9330 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
9331 if (dictSize(d)) {
9332 long elesize;
9333 robj *ele;
9334
9335 de = dictGetRandomKey(d);
9336 ele = dictGetEntryKey(de);
9337 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9338 (sizeof(*o)+sdslen(ele->ptr)) : sizeof(*o);
9339 ele = dictGetEntryVal(de);
9340 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9341 (sizeof(*o)+sdslen(ele->ptr)) : sizeof(*o);
9342 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
9343 }
9344 }
9345 break;
9346 }
9347 return (double)minage*log(1+asize);
9348 }
9349
9350 /* Try to swap an object that's a good candidate for swapping.
9351 * Returns REDIS_OK if the object was swapped, REDIS_ERR if it's not possible
9352 * to swap any object at all.
9353 *
9354 * If 'usethreaded' is true, Redis will try to swap the object in background
9355 * using I/O threads. */
9356 static int vmSwapOneObject(int usethreads) {
9357 int j, i;
9358 struct dictEntry *best = NULL;
9359 double best_swappability = 0;
9360 redisDb *best_db = NULL;
9361 robj *val;
9362 sds key;
9363
9364 for (j = 0; j < server.dbnum; j++) {
9365 redisDb *db = server.db+j;
9366 /* Why maxtries is set to 100?
9367 * Because this way (usually) we'll find 1 object even if just 1% - 2%
9368 * are swappable objects */
9369 int maxtries = 100;
9370
9371 if (dictSize(db->dict) == 0) continue;
9372 for (i = 0; i < 5; i++) {
9373 dictEntry *de;
9374 double swappability;
9375
9376 if (maxtries) maxtries--;
9377 de = dictGetRandomKey(db->dict);
9378 val = dictGetEntryVal(de);
9379 /* Only swap objects that are currently in memory.
9380 *
9381 * Also don't swap shared objects: not a good idea in general and
9382 * we need to ensure that the main thread does not touch the
9383 * object while the I/O thread is using it, but we can't
9384 * control other keys without adding additional mutex. */
9385 if (val->storage != REDIS_VM_MEMORY || val->refcount != 1) {
9386 if (maxtries) i--; /* don't count this try */
9387 continue;
9388 }
9389 swappability = computeObjectSwappability(val);
9390 if (!best || swappability > best_swappability) {
9391 best = de;
9392 best_swappability = swappability;
9393 best_db = db;
9394 }
9395 }
9396 }
9397 if (best == NULL) return REDIS_ERR;
9398 key = dictGetEntryKey(best);
9399 val = dictGetEntryVal(best);
9400
9401 redisLog(REDIS_DEBUG,"Key with best swappability: %s, %f",
9402 key, best_swappability);
9403
9404 /* Swap it */
9405 if (usethreads) {
9406 vmSwapObjectThreaded(createStringObject(key,sdslen(key)),val,best_db);
9407 return REDIS_OK;
9408 } else {
9409 vmpointer *vp;
9410
9411 if ((vp = vmSwapObjectBlocking(val)) != NULL) {
9412 dictGetEntryVal(best) = vp;
9413 return REDIS_OK;
9414 } else {
9415 return REDIS_ERR;
9416 }
9417 }
9418 }
9419
/* Swap one object out without using the I/O threads: this is just
 * vmSwapOneObject() called with 'usethreads' set to zero. */
static int vmSwapOneObjectBlocking() {
    int retval = vmSwapOneObject(0);

    return retval;
}
9423
/* Swap one object out using the I/O threads: this is just
 * vmSwapOneObject() called with 'usethreads' set to one. */
static int vmSwapOneObjectThreaded() {
    int retval = vmSwapOneObject(1);

    return retval;
}
9427
9428 /* Return true if it's safe to swap out objects in a given moment.
9429 * Basically we don't want to swap objects out while there is a BGSAVE
9430 * or a BGAEOREWRITE running in backgroud. */
9431 static int vmCanSwapOut(void) {
9432 return (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1);
9433 }
9434
9435 /* =================== Virtual Memory - Threaded I/O ======================= */
9436
9437 static void freeIOJob(iojob *j) {
9438 if ((j->type == REDIS_IOJOB_PREPARE_SWAP ||
9439 j->type == REDIS_IOJOB_DO_SWAP ||
9440 j->type == REDIS_IOJOB_LOAD) && j->val != NULL)
9441 {
9442 /* we fix the storage type, otherwise decrRefCount() will try to
9443 * kill the I/O thread Job (that does no longer exists). */
9444 if (j->val->storage == REDIS_VM_SWAPPING)
9445 j->val->storage = REDIS_VM_MEMORY;
9446 decrRefCount(j->val);
9447 }
9448 decrRefCount(j->key);
9449 zfree(j);
9450 }
9451
9452 /* Every time a thread finished a Job, it writes a byte into the write side
9453 * of an unix pipe in order to "awake" the main thread, and this function
9454 * is called. */
9455 static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata,
9456 int mask)
9457 {
9458 char buf[1];
9459 int retval, processed = 0, toprocess = -1, trytoswap = 1;
9460 REDIS_NOTUSED(el);
9461 REDIS_NOTUSED(mask);
9462 REDIS_NOTUSED(privdata);
9463
9464 /* For every byte we read in the read side of the pipe, there is one
9465 * I/O job completed to process. */
9466 while((retval = read(fd,buf,1)) == 1) {
9467 iojob *j;
9468 listNode *ln;
9469 struct dictEntry *de;
9470
9471 redisLog(REDIS_DEBUG,"Processing I/O completed job");
9472
9473 /* Get the processed element (the oldest one) */
9474 lockThreadedIO();
9475 assert(listLength(server.io_processed) != 0);
9476 if (toprocess == -1) {
9477 toprocess = (listLength(server.io_processed)*REDIS_MAX_COMPLETED_JOBS_PROCESSED)/100;
9478 if (toprocess <= 0) toprocess = 1;
9479 }
9480 ln = listFirst(server.io_processed);
9481 j = ln->value;
9482 listDelNode(server.io_processed,ln);
9483 unlockThreadedIO();
9484 /* If this job is marked as canceled, just ignore it */
9485 if (j->canceled) {
9486 freeIOJob(j);
9487 continue;
9488 }
9489 /* Post process it in the main thread, as there are things we
9490 * can do just here to avoid race conditions and/or invasive locks */
9491 redisLog(REDIS_DEBUG,"COMPLETED Job type: %d, ID %p, key: %s", j->type, (void*)j->id, (unsigned char*)j->key->ptr);
9492 de = dictFind(j->db->dict,j->key->ptr);
9493 redisAssert(de != NULL);
9494 if (j->type == REDIS_IOJOB_LOAD) {
9495 redisDb *db;
9496 vmpointer *vp = dictGetEntryVal(de);
9497
9498 /* Key loaded, bring it at home */
9499 vmMarkPagesFree(vp->page,vp->usedpages);
9500 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk (threaded)",
9501 (unsigned char*) j->key->ptr);
9502 server.vm_stats_swapped_objects--;
9503 server.vm_stats_swapins++;
9504 dictGetEntryVal(de) = j->val;
9505 incrRefCount(j->val);
9506 db = j->db;
9507 /* Handle clients waiting for this key to be loaded. */
9508 handleClientsBlockedOnSwappedKey(db,j->key);
9509 freeIOJob(j);
9510 zfree(vp);
9511 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
9512 /* Now we know the amount of pages required to swap this object.
9513 * Let's find some space for it, and queue this task again
9514 * rebranded as REDIS_IOJOB_DO_SWAP. */
9515 if (!vmCanSwapOut() ||
9516 vmFindContiguousPages(&j->page,j->pages) == REDIS_ERR)
9517 {
9518 /* Ooops... no space or we can't swap as there is
9519 * a fork()ed Redis trying to save stuff on disk. */
9520 j->val->storage = REDIS_VM_MEMORY; /* undo operation */
9521 freeIOJob(j);
9522 } else {
9523 /* Note that we need to mark this pages as used now,
9524 * if the job will be canceled, we'll mark them as freed
9525 * again. */
9526 vmMarkPagesUsed(j->page,j->pages);
9527 j->type = REDIS_IOJOB_DO_SWAP;
9528 lockThreadedIO();
9529 queueIOJob(j);
9530 unlockThreadedIO();
9531 }
9532 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
9533 vmpointer *vp;
9534
9535 /* Key swapped. We can finally free some memory. */
9536 if (j->val->storage != REDIS_VM_SWAPPING) {
9537 vmpointer *vp = (vmpointer*) j->id;
9538 printf("storage: %d\n",vp->storage);
9539 printf("key->name: %s\n",(char*)j->key->ptr);
9540 printf("val: %p\n",(void*)j->val);
9541 printf("val->type: %d\n",j->val->type);
9542 printf("val->ptr: %s\n",(char*)j->val->ptr);
9543 }
9544 redisAssert(j->val->storage == REDIS_VM_SWAPPING);
9545 vp = createVmPointer(j->val->type);
9546 vp->page = j->page;
9547 vp->usedpages = j->pages;
9548 dictGetEntryVal(de) = vp;
9549 /* Fix the storage otherwise decrRefCount will attempt to
9550 * remove the associated I/O job */
9551 j->val->storage = REDIS_VM_MEMORY;
9552 decrRefCount(j->val);
9553 redisLog(REDIS_DEBUG,
9554 "VM: object %s swapped out at %lld (%lld pages) (threaded)",
9555 (unsigned char*) j->key->ptr,
9556 (unsigned long long) j->page, (unsigned long long) j->pages);
9557 server.vm_stats_swapped_objects++;
9558 server.vm_stats_swapouts++;
9559 freeIOJob(j);
9560 /* Put a few more swap requests in queue if we are still
9561 * out of memory */
9562 if (trytoswap && vmCanSwapOut() &&
9563 zmalloc_used_memory() > server.vm_max_memory)
9564 {
9565 int more = 1;
9566 while(more) {
9567 lockThreadedIO();
9568 more = listLength(server.io_newjobs) <
9569 (unsigned) server.vm_max_threads;
9570 unlockThreadedIO();
9571 /* Don't waste CPU time if swappable objects are rare. */
9572 if (vmSwapOneObjectThreaded() == REDIS_ERR) {
9573 trytoswap = 0;
9574 break;
9575 }
9576 }
9577 }
9578 }
9579 processed++;
9580 if (processed == toprocess) return;
9581 }
9582 if (retval < 0 && errno != EAGAIN) {
9583 redisLog(REDIS_WARNING,
9584 "WARNING: read(2) error in vmThreadedIOCompletedJob() %s",
9585 strerror(errno));
9586 }
9587 }
9588
9589 static void lockThreadedIO(void) {
9590 pthread_mutex_lock(&server.io_mutex);
9591 }
9592
9593 static void unlockThreadedIO(void) {
9594 pthread_mutex_unlock(&server.io_mutex);
9595 }
9596
9597 /* Remove the specified object from the threaded I/O queue if still not
9598 * processed, otherwise make sure to flag it as canceled. */
9599 static void vmCancelThreadedIOJob(robj *o) {
9600 list *lists[3] = {
9601 server.io_newjobs, /* 0 */
9602 server.io_processing, /* 1 */
9603 server.io_processed /* 2 */
9604 };
9605 int i;
9606
9607 assert(o->storage == REDIS_VM_LOADING || o->storage == REDIS_VM_SWAPPING);
9608 again:
9609 lockThreadedIO();
9610 /* Search for a matching object in one of the queues */
9611 for (i = 0; i < 3; i++) {
9612 listNode *ln;
9613 listIter li;
9614
9615 listRewind(lists[i],&li);
9616 while ((ln = listNext(&li)) != NULL) {
9617 iojob *job = ln->value;
9618
9619 if (job->canceled) continue; /* Skip this, already canceled. */
9620 if (job->id == o) {
9621 redisLog(REDIS_DEBUG,"*** CANCELED %p (key %s) (type %d) (LIST ID %d)\n",
9622 (void*)job, (char*)job->key->ptr, job->type, i);
9623 /* Mark the pages as free since the swap didn't happened
9624 * or happened but is now discarded. */
9625 if (i != 1 && job->type == REDIS_IOJOB_DO_SWAP)
9626 vmMarkPagesFree(job->page,job->pages);
9627 /* Cancel the job. It depends on the list the job is
9628 * living in. */
9629 switch(i) {
9630 case 0: /* io_newjobs */
9631 /* If the job was yet not processed the best thing to do
9632 * is to remove it from the queue at all */
9633 freeIOJob(job);
9634 listDelNode(lists[i],ln);
9635 break;
9636 case 1: /* io_processing */
9637 /* Oh Shi- the thread is messing with the Job:
9638 *
9639 * Probably it's accessing the object if this is a
9640 * PREPARE_SWAP or DO_SWAP job.
9641 * If it's a LOAD job it may be reading from disk and
9642 * if we don't wait for the job to terminate before to
9643 * cancel it, maybe in a few microseconds data can be
9644 * corrupted in this pages. So the short story is:
9645 *
9646 * Better to wait for the job to move into the
9647 * next queue (processed)... */
9648
9649 /* We try again and again until the job is completed. */
9650 unlockThreadedIO();
9651 /* But let's wait some time for the I/O thread
9652 * to finish with this job. After all this condition
9653 * should be very rare. */
9654 usleep(1);
9655 goto again;
9656 case 2: /* io_processed */
9657 /* The job was already processed, that's easy...
9658 * just mark it as canceled so that we'll ignore it
9659 * when processing completed jobs. */
9660 job->canceled = 1;
9661 break;
9662 }
9663 /* Finally we have to adjust the storage type of the object
9664 * in order to "UNDO" the operaiton. */
9665 if (o->storage == REDIS_VM_LOADING)
9666 o->storage = REDIS_VM_SWAPPED;
9667 else if (o->storage == REDIS_VM_SWAPPING)
9668 o->storage = REDIS_VM_MEMORY;
9669 unlockThreadedIO();
9670 redisLog(REDIS_DEBUG,"*** DONE");
9671 return;
9672 }
9673 }
9674 }
9675 unlockThreadedIO();
9676 printf("Not found: %p\n", (void*)o);
9677 redisAssert(1 != 1); /* We should never reach this */
9678 }
9679
9680 static void *IOThreadEntryPoint(void *arg) {
9681 iojob *j;
9682 listNode *ln;
9683 REDIS_NOTUSED(arg);
9684
9685 pthread_detach(pthread_self());
9686 while(1) {
9687 /* Get a new job to process */
9688 lockThreadedIO();
9689 if (listLength(server.io_newjobs) == 0) {
9690 /* No new jobs in queue, exit. */
9691 redisLog(REDIS_DEBUG,"Thread %ld exiting, nothing to do",
9692 (long) pthread_self());
9693 server.io_active_threads--;
9694 unlockThreadedIO();
9695 return NULL;
9696 }
9697 ln = listFirst(server.io_newjobs);
9698 j = ln->value;
9699 listDelNode(server.io_newjobs,ln);
9700 /* Add the job in the processing queue */
9701 j->thread = pthread_self();
9702 listAddNodeTail(server.io_processing,j);
9703 ln = listLast(server.io_processing); /* We use ln later to remove it */
9704 unlockThreadedIO();
9705 redisLog(REDIS_DEBUG,"Thread %ld got a new job (type %d): %p about key '%s'",
9706 (long) pthread_self(), j->type, (void*)j, (char*)j->key->ptr);
9707
9708 /* Process the Job */
9709 if (j->type == REDIS_IOJOB_LOAD) {
9710 vmpointer *vp = (vmpointer*)j->id;
9711 j->val = vmReadObjectFromSwap(j->page,vp->vtype);
9712 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
9713 FILE *fp = fopen("/dev/null","w+");
9714 j->pages = rdbSavedObjectPages(j->val,fp);
9715 fclose(fp);
9716 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
9717 if (vmWriteObjectOnSwap(j->val,j->page) == REDIS_ERR)
9718 j->canceled = 1;
9719 }
9720
9721 /* Done: insert the job into the processed queue */
9722 redisLog(REDIS_DEBUG,"Thread %ld completed the job: %p (key %s)",
9723 (long) pthread_self(), (void*)j, (char*)j->key->ptr);
9724 lockThreadedIO();
9725 listDelNode(server.io_processing,ln);
9726 listAddNodeTail(server.io_processed,j);
9727 unlockThreadedIO();
9728
9729 /* Signal the main thread there is new stuff to process */
9730 assert(write(server.io_ready_pipe_write,"x",1) == 1);
9731 }
9732 return NULL; /* never reached */
9733 }
9734
9735 static void spawnIOThread(void) {
9736 pthread_t thread;
9737 sigset_t mask, omask;
9738 int err;
9739
9740 sigemptyset(&mask);
9741 sigaddset(&mask,SIGCHLD);
9742 sigaddset(&mask,SIGHUP);
9743 sigaddset(&mask,SIGPIPE);
9744 pthread_sigmask(SIG_SETMASK, &mask, &omask);
9745 while ((err = pthread_create(&thread,&server.io_threads_attr,IOThreadEntryPoint,NULL)) != 0) {
9746 redisLog(REDIS_WARNING,"Unable to spawn an I/O thread: %s",
9747 strerror(err));
9748 usleep(1000000);
9749 }
9750 pthread_sigmask(SIG_SETMASK, &omask, NULL);
9751 server.io_active_threads++;
9752 }
9753
9754 /* We need to wait for the last thread to exit before we are able to
9755 * fork() in order to BGSAVE or BGREWRITEAOF. */
9756 static void waitEmptyIOJobsQueue(void) {
9757 while(1) {
9758 int io_processed_len;
9759
9760 lockThreadedIO();
9761 if (listLength(server.io_newjobs) == 0 &&
9762 listLength(server.io_processing) == 0 &&
9763 server.io_active_threads == 0)
9764 {
9765 unlockThreadedIO();
9766 return;
9767 }
9768 /* While waiting for empty jobs queue condition we post-process some
9769 * finshed job, as I/O threads may be hanging trying to write against
9770 * the io_ready_pipe_write FD but there are so much pending jobs that
9771 * it's blocking. */
9772 io_processed_len = listLength(server.io_processed);
9773 unlockThreadedIO();
9774 if (io_processed_len) {
9775 vmThreadedIOCompletedJob(NULL,server.io_ready_pipe_read,NULL,0);
9776 usleep(1000); /* 1 millisecond */
9777 } else {
9778 usleep(10000); /* 10 milliseconds */
9779 }
9780 }
9781 }
9782
9783 static void vmReopenSwapFile(void) {
9784 /* Note: we don't close the old one as we are in the child process
9785 * and don't want to mess at all with the original file object. */
9786 server.vm_fp = fopen(server.vm_swap_file,"r+b");
9787 if (server.vm_fp == NULL) {
9788 redisLog(REDIS_WARNING,"Can't re-open the VM swap file: %s. Exiting.",
9789 server.vm_swap_file);
9790 _exit(1);
9791 }
9792 server.vm_fd = fileno(server.vm_fp);
9793 }
9794
9795 /* This function must be called while with threaded IO locked */
9796 static void queueIOJob(iojob *j) {
9797 redisLog(REDIS_DEBUG,"Queued IO Job %p type %d about key '%s'\n",
9798 (void*)j, j->type, (char*)j->key->ptr);
9799 listAddNodeTail(server.io_newjobs,j);
9800 if (server.io_active_threads < server.vm_max_threads)
9801 spawnIOThread();
9802 }
9803
9804 static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db) {
9805 iojob *j;
9806
9807 j = zmalloc(sizeof(*j));
9808 j->type = REDIS_IOJOB_PREPARE_SWAP;
9809 j->db = db;
9810 j->key = key;
9811 incrRefCount(key);
9812 j->id = j->val = val;
9813 incrRefCount(val);
9814 j->canceled = 0;
9815 j->thread = (pthread_t) -1;
9816 val->storage = REDIS_VM_SWAPPING;
9817
9818 lockThreadedIO();
9819 queueIOJob(j);
9820 unlockThreadedIO();
9821 return REDIS_OK;
9822 }
9823
9824 /* ============ Virtual Memory - Blocking clients on missing keys =========== */
9825
9826 /* This function makes the clinet 'c' waiting for the key 'key' to be loaded.
9827 * If there is not already a job loading the key, it is craeted.
9828 * The key is added to the io_keys list in the client structure, and also
9829 * in the hash table mapping swapped keys to waiting clients, that is,
9830 * server.io_waited_keys. */
9831 static int waitForSwappedKey(redisClient *c, robj *key) {
9832 struct dictEntry *de;
9833 robj *o;
9834 list *l;
9835
9836 /* If the key does not exist or is already in RAM we don't need to
9837 * block the client at all. */
9838 de = dictFind(c->db->dict,key->ptr);
9839 if (de == NULL) return 0;
9840 o = dictGetEntryVal(de);
9841 if (o->storage == REDIS_VM_MEMORY) {
9842 return 0;
9843 } else if (o->storage == REDIS_VM_SWAPPING) {
9844 /* We were swapping the key, undo it! */
9845 vmCancelThreadedIOJob(o);
9846 return 0;
9847 }
9848
9849 /* OK: the key is either swapped, or being loaded just now. */
9850
9851 /* Add the key to the list of keys this client is waiting for.
9852 * This maps clients to keys they are waiting for. */
9853 listAddNodeTail(c->io_keys,key);
9854 incrRefCount(key);
9855
9856 /* Add the client to the swapped keys => clients waiting map. */
9857 de = dictFind(c->db->io_keys,key);
9858 if (de == NULL) {
9859 int retval;
9860
9861 /* For every key we take a list of clients blocked for it */
9862 l = listCreate();
9863 retval = dictAdd(c->db->io_keys,key,l);
9864 incrRefCount(key);
9865 assert(retval == DICT_OK);
9866 } else {
9867 l = dictGetEntryVal(de);
9868 }
9869 listAddNodeTail(l,c);
9870
9871 /* Are we already loading the key from disk? If not create a job */
9872 if (o->storage == REDIS_VM_SWAPPED) {
9873 iojob *j;
9874 vmpointer *vp = (vmpointer*)o;
9875
9876 o->storage = REDIS_VM_LOADING;
9877 j = zmalloc(sizeof(*j));
9878 j->type = REDIS_IOJOB_LOAD;
9879 j->db = c->db;
9880 j->id = (robj*)vp;
9881 j->key = key;
9882 incrRefCount(key);
9883 j->page = vp->page;
9884 j->val = NULL;
9885 j->canceled = 0;
9886 j->thread = (pthread_t) -1;
9887 lockThreadedIO();
9888 queueIOJob(j);
9889 unlockThreadedIO();
9890 }
9891 return 1;
9892 }
9893
9894 /* Preload keys for any command with first, last and step values for
9895 * the command keys prototype, as defined in the command table. */
9896 static void waitForMultipleSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) {
9897 int j, last;
9898 if (cmd->vm_firstkey == 0) return;
9899 last = cmd->vm_lastkey;
9900 if (last < 0) last = argc+last;
9901 for (j = cmd->vm_firstkey; j <= last; j += cmd->vm_keystep) {
9902 redisAssert(j < argc);
9903 waitForSwappedKey(c,argv[j]);
9904 }
9905 }
9906
9907 /* Preload keys needed for the ZUNIONSTORE and ZINTERSTORE commands.
9908 * Note that the number of keys to preload is user-defined, so we need to
9909 * apply a sanity check against argc. */
9910 static void zunionInterBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) {
9911 int i, num;
9912 REDIS_NOTUSED(cmd);
9913
9914 num = atoi(argv[2]->ptr);
9915 if (num > (argc-3)) return;
9916 for (i = 0; i < num; i++) {
9917 waitForSwappedKey(c,argv[3+i]);
9918 }
9919 }
9920
9921 /* Preload keys needed to execute the entire MULTI/EXEC block.
9922 *
9923 * This function is called by blockClientOnSwappedKeys when EXEC is issued,
9924 * and will block the client when any command requires a swapped out value. */
9925 static void execBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) {
9926 int i, margc;
9927 struct redisCommand *mcmd;
9928 robj **margv;
9929 REDIS_NOTUSED(cmd);
9930 REDIS_NOTUSED(argc);
9931 REDIS_NOTUSED(argv);
9932
9933 if (!(c->flags & REDIS_MULTI)) return;
9934 for (i = 0; i < c->mstate.count; i++) {
9935 mcmd = c->mstate.commands[i].cmd;
9936 margc = c->mstate.commands[i].argc;
9937 margv = c->mstate.commands[i].argv;
9938
9939 if (mcmd->vm_preload_proc != NULL) {
9940 mcmd->vm_preload_proc(c,mcmd,margc,margv);
9941 } else {
9942 waitForMultipleSwappedKeys(c,mcmd,margc,margv);
9943 }
9944 }
9945 }
9946
9947 /* Is this client attempting to run a command against swapped keys?
9948 * If so, block it ASAP, load the keys in background, then resume it.
9949 *
9950 * The important idea about this function is that it can fail! If keys will
9951 * still be swapped when the client is resumed, this key lookups will
9952 * just block loading keys from disk. In practical terms this should only
9953 * happen with SORT BY command or if there is a bug in this function.
9954 *
9955 * Return 1 if the client is marked as blocked, 0 if the client can
9956 * continue as the keys it is going to access appear to be in memory. */
9957 static int blockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd) {
9958 if (cmd->vm_preload_proc != NULL) {
9959 cmd->vm_preload_proc(c,cmd,c->argc,c->argv);
9960 } else {
9961 waitForMultipleSwappedKeys(c,cmd,c->argc,c->argv);
9962 }
9963
9964 /* If the client was blocked for at least one key, mark it as blocked. */
9965 if (listLength(c->io_keys)) {
9966 c->flags |= REDIS_IO_WAIT;
9967 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
9968 server.vm_blocked_clients++;
9969 return 1;
9970 } else {
9971 return 0;
9972 }
9973 }
9974
9975 /* Remove the 'key' from the list of blocked keys for a given client.
9976 *
9977 * The function returns 1 when there are no longer blocking keys after
9978 * the current one was removed (and the client can be unblocked). */
9979 static int dontWaitForSwappedKey(redisClient *c, robj *key) {
9980 list *l;
9981 listNode *ln;
9982 listIter li;
9983 struct dictEntry *de;
9984
9985 /* Remove the key from the list of keys this client is waiting for. */
9986 listRewind(c->io_keys,&li);
9987 while ((ln = listNext(&li)) != NULL) {
9988 if (equalStringObjects(ln->value,key)) {
9989 listDelNode(c->io_keys,ln);
9990 break;
9991 }
9992 }
9993 assert(ln != NULL);
9994
9995 /* Remove the client form the key => waiting clients map. */
9996 de = dictFind(c->db->io_keys,key);
9997 assert(de != NULL);
9998 l = dictGetEntryVal(de);
9999 ln = listSearchKey(l,c);
10000 assert(ln != NULL);
10001 listDelNode(l,ln);
10002 if (listLength(l) == 0)
10003 dictDelete(c->db->io_keys,key);
10004
10005 return listLength(c->io_keys) == 0;
10006 }
10007
10008 /* Every time we now a key was loaded back in memory, we handle clients
10009 * waiting for this key if any. */
10010 static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key) {
10011 struct dictEntry *de;
10012 list *l;
10013 listNode *ln;
10014 int len;
10015
10016 de = dictFind(db->io_keys,key);
10017 if (!de) return;
10018
10019 l = dictGetEntryVal(de);
10020 len = listLength(l);
10021 /* Note: we can't use something like while(listLength(l)) as the list
10022 * can be freed by the calling function when we remove the last element. */
10023 while (len--) {
10024 ln = listFirst(l);
10025 redisClient *c = ln->value;
10026
10027 if (dontWaitForSwappedKey(c,key)) {
10028 /* Put the client in the list of clients ready to go as we
10029 * loaded all the keys about it. */
10030 listAddNodeTail(server.io_ready_clients,c);
10031 }
10032 }
10033 }
10034
10035 /* =========================== Remote Configuration ========================= */
10036
10037 static void configSetCommand(redisClient *c) {
10038 robj *o = getDecodedObject(c->argv[3]);
10039 long long ll;
10040
10041 if (!strcasecmp(c->argv[2]->ptr,"dbfilename")) {
10042 zfree(server.dbfilename);
10043 server.dbfilename = zstrdup(o->ptr);
10044 } else if (!strcasecmp(c->argv[2]->ptr,"requirepass")) {
10045 zfree(server.requirepass);
10046 server.requirepass = zstrdup(o->ptr);
10047 } else if (!strcasecmp(c->argv[2]->ptr,"masterauth")) {
10048 zfree(server.masterauth);
10049 server.masterauth = zstrdup(o->ptr);
10050 } else if (!strcasecmp(c->argv[2]->ptr,"maxmemory")) {
10051 if (getLongLongFromObject(o,&ll) == REDIS_ERR ||
10052 ll < 0) goto badfmt;
10053 server.maxmemory = ll;
10054 } else if (!strcasecmp(c->argv[2]->ptr,"timeout")) {
10055 if (getLongLongFromObject(o,&ll) == REDIS_ERR ||
10056 ll < 0 || ll > LONG_MAX) goto badfmt;
10057 server.maxidletime = ll;
10058 } else if (!strcasecmp(c->argv[2]->ptr,"appendfsync")) {
10059 if (!strcasecmp(o->ptr,"no")) {
10060 server.appendfsync = APPENDFSYNC_NO;
10061 } else if (!strcasecmp(o->ptr,"everysec")) {
10062 server.appendfsync = APPENDFSYNC_EVERYSEC;
10063 } else if (!strcasecmp(o->ptr,"always")) {
10064 server.appendfsync = APPENDFSYNC_ALWAYS;
10065 } else {
10066 goto badfmt;
10067 }
10068 } else if (!strcasecmp(c->argv[2]->ptr,"no-appendfsync-on-rewrite")) {
10069 int yn = yesnotoi(o->ptr);
10070
10071 if (yn == -1) goto badfmt;
10072 server.no_appendfsync_on_rewrite = yn;
10073 } else if (!strcasecmp(c->argv[2]->ptr,"appendonly")) {
10074 int old = server.appendonly;
10075 int new = yesnotoi(o->ptr);
10076
10077 if (new == -1) goto badfmt;
10078 if (old != new) {
10079 if (new == 0) {
10080 stopAppendOnly();
10081 } else {
10082 if (startAppendOnly() == REDIS_ERR) {
10083 addReplySds(c,sdscatprintf(sdsempty(),
10084 "-ERR Unable to turn on AOF. Check server logs.\r\n"));
10085 decrRefCount(o);
10086 return;
10087 }
10088 }
10089 }
10090 } else if (!strcasecmp(c->argv[2]->ptr,"save")) {
10091 int vlen, j;
10092 sds *v = sdssplitlen(o->ptr,sdslen(o->ptr)," ",1,&vlen);
10093
10094 /* Perform sanity check before setting the new config:
10095 * - Even number of args
10096 * - Seconds >= 1, changes >= 0 */
10097 if (vlen & 1) {
10098 sdsfreesplitres(v,vlen);
10099 goto badfmt;
10100 }
10101 for (j = 0; j < vlen; j++) {
10102 char *eptr;
10103 long val;
10104
10105 val = strtoll(v[j], &eptr, 10);
10106 if (eptr[0] != '\0' ||
10107 ((j & 1) == 0 && val < 1) ||
10108 ((j & 1) == 1 && val < 0)) {
10109 sdsfreesplitres(v,vlen);
10110 goto badfmt;
10111 }
10112 }
10113 /* Finally set the new config */
10114 resetServerSaveParams();
10115 for (j = 0; j < vlen; j += 2) {
10116 time_t seconds;
10117 int changes;
10118
10119 seconds = strtoll(v[j],NULL,10);
10120 changes = strtoll(v[j+1],NULL,10);
10121 appendServerSaveParams(seconds, changes);
10122 }
10123 sdsfreesplitres(v,vlen);
10124 } else {
10125 addReplySds(c,sdscatprintf(sdsempty(),
10126 "-ERR not supported CONFIG parameter %s\r\n",
10127 (char*)c->argv[2]->ptr));
10128 decrRefCount(o);
10129 return;
10130 }
10131 decrRefCount(o);
10132 addReply(c,shared.ok);
10133 return;
10134
10135 badfmt: /* Bad format errors */
10136 addReplySds(c,sdscatprintf(sdsempty(),
10137 "-ERR invalid argument '%s' for CONFIG SET '%s'\r\n",
10138 (char*)o->ptr,
10139 (char*)c->argv[2]->ptr));
10140 decrRefCount(o);
10141 }
10142
10143 static void configGetCommand(redisClient *c) {
10144 robj *o = getDecodedObject(c->argv[2]);
10145 robj *lenobj = createObject(REDIS_STRING,NULL);
10146 char *pattern = o->ptr;
10147 int matches = 0;
10148
10149 addReply(c,lenobj);
10150 decrRefCount(lenobj);
10151
10152 if (stringmatch(pattern,"dbfilename",0)) {
10153 addReplyBulkCString(c,"dbfilename");
10154 addReplyBulkCString(c,server.dbfilename);
10155 matches++;
10156 }
10157 if (stringmatch(pattern,"requirepass",0)) {
10158 addReplyBulkCString(c,"requirepass");
10159 addReplyBulkCString(c,server.requirepass);
10160 matches++;
10161 }
10162 if (stringmatch(pattern,"masterauth",0)) {
10163 addReplyBulkCString(c,"masterauth");
10164 addReplyBulkCString(c,server.masterauth);
10165 matches++;
10166 }
10167 if (stringmatch(pattern,"maxmemory",0)) {
10168 char buf[128];
10169
10170 ll2string(buf,128,server.maxmemory);
10171 addReplyBulkCString(c,"maxmemory");
10172 addReplyBulkCString(c,buf);
10173 matches++;
10174 }
10175 if (stringmatch(pattern,"timeout",0)) {
10176 char buf[128];
10177
10178 ll2string(buf,128,server.maxidletime);
10179 addReplyBulkCString(c,"timeout");
10180 addReplyBulkCString(c,buf);
10181 matches++;
10182 }
10183 if (stringmatch(pattern,"appendonly",0)) {
10184 addReplyBulkCString(c,"appendonly");
10185 addReplyBulkCString(c,server.appendonly ? "yes" : "no");
10186 matches++;
10187 }
10188 if (stringmatch(pattern,"no-appendfsync-on-rewrite",0)) {
10189 addReplyBulkCString(c,"no-appendfsync-on-rewrite");
10190 addReplyBulkCString(c,server.no_appendfsync_on_rewrite ? "yes" : "no");
10191 matches++;
10192 }
10193 if (stringmatch(pattern,"appendfsync",0)) {
10194 char *policy;
10195
10196 switch(server.appendfsync) {
10197 case APPENDFSYNC_NO: policy = "no"; break;
10198 case APPENDFSYNC_EVERYSEC: policy = "everysec"; break;
10199 case APPENDFSYNC_ALWAYS: policy = "always"; break;
10200 default: policy = "unknown"; break; /* too harmless to panic */
10201 }
10202 addReplyBulkCString(c,"appendfsync");
10203 addReplyBulkCString(c,policy);
10204 matches++;
10205 }
10206 if (stringmatch(pattern,"save",0)) {
10207 sds buf = sdsempty();
10208 int j;
10209
10210 for (j = 0; j < server.saveparamslen; j++) {
10211 buf = sdscatprintf(buf,"%ld %d",
10212 server.saveparams[j].seconds,
10213 server.saveparams[j].changes);
10214 if (j != server.saveparamslen-1)
10215 buf = sdscatlen(buf," ",1);
10216 }
10217 addReplyBulkCString(c,"save");
10218 addReplyBulkCString(c,buf);
10219 sdsfree(buf);
10220 matches++;
10221 }
10222 decrRefCount(o);
10223 lenobj->ptr = sdscatprintf(sdsempty(),"*%d\r\n",matches*2);
10224 }
10225
10226 static void configCommand(redisClient *c) {
10227 if (!strcasecmp(c->argv[1]->ptr,"set")) {
10228 if (c->argc != 4) goto badarity;
10229 configSetCommand(c);
10230 } else if (!strcasecmp(c->argv[1]->ptr,"get")) {
10231 if (c->argc != 3) goto badarity;
10232 configGetCommand(c);
10233 } else if (!strcasecmp(c->argv[1]->ptr,"resetstat")) {
10234 if (c->argc != 2) goto badarity;
10235 server.stat_numcommands = 0;
10236 server.stat_numconnections = 0;
10237 server.stat_expiredkeys = 0;
10238 server.stat_starttime = time(NULL);
10239 addReply(c,shared.ok);
10240 } else {
10241 addReplySds(c,sdscatprintf(sdsempty(),
10242 "-ERR CONFIG subcommand must be one of GET, SET, RESETSTAT\r\n"));
10243 }
10244 return;
10245
10246 badarity:
10247 addReplySds(c,sdscatprintf(sdsempty(),
10248 "-ERR Wrong number of arguments for CONFIG %s\r\n",
10249 (char*) c->argv[1]->ptr));
10250 }
10251
10252 /* =========================== Pubsub implementation ======================== */
10253
10254 static void freePubsubPattern(void *p) {
10255 pubsubPattern *pat = p;
10256
10257 decrRefCount(pat->pattern);
10258 zfree(pat);
10259 }
10260
10261 static int listMatchPubsubPattern(void *a, void *b) {
10262 pubsubPattern *pa = a, *pb = b;
10263
10264 return (pa->client == pb->client) &&
10265 (equalStringObjects(pa->pattern,pb->pattern));
10266 }
10267
10268 /* Subscribe a client to a channel. Returns 1 if the operation succeeded, or
10269 * 0 if the client was already subscribed to that channel. */
10270 static int pubsubSubscribeChannel(redisClient *c, robj *channel) {
10271 struct dictEntry *de;
10272 list *clients = NULL;
10273 int retval = 0;
10274
10275 /* Add the channel to the client -> channels hash table */
10276 if (dictAdd(c->pubsub_channels,channel,NULL) == DICT_OK) {
10277 retval = 1;
10278 incrRefCount(channel);
10279 /* Add the client to the channel -> list of clients hash table */
10280 de = dictFind(server.pubsub_channels,channel);
10281 if (de == NULL) {
10282 clients = listCreate();
10283 dictAdd(server.pubsub_channels,channel,clients);
10284 incrRefCount(channel);
10285 } else {
10286 clients = dictGetEntryVal(de);
10287 }
10288 listAddNodeTail(clients,c);
10289 }
10290 /* Notify the client */
10291 addReply(c,shared.mbulk3);
10292 addReply(c,shared.subscribebulk);
10293 addReplyBulk(c,channel);
10294 addReplyLongLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
10295 return retval;
10296 }
10297
10298 /* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
10299 * 0 if the client was not subscribed to the specified channel. */
10300 static int pubsubUnsubscribeChannel(redisClient *c, robj *channel, int notify) {
10301 struct dictEntry *de;
10302 list *clients;
10303 listNode *ln;
10304 int retval = 0;
10305
10306 /* Remove the channel from the client -> channels hash table */
10307 incrRefCount(channel); /* channel may be just a pointer to the same object
10308 we have in the hash tables. Protect it... */
10309 if (dictDelete(c->pubsub_channels,channel) == DICT_OK) {
10310 retval = 1;
10311 /* Remove the client from the channel -> clients list hash table */
10312 de = dictFind(server.pubsub_channels,channel);
10313 assert(de != NULL);
10314 clients = dictGetEntryVal(de);
10315 ln = listSearchKey(clients,c);
10316 assert(ln != NULL);
10317 listDelNode(clients,ln);
10318 if (listLength(clients) == 0) {
10319 /* Free the list and associated hash entry at all if this was
10320 * the latest client, so that it will be possible to abuse
10321 * Redis PUBSUB creating millions of channels. */
10322 dictDelete(server.pubsub_channels,channel);
10323 }
10324 }
10325 /* Notify the client */
10326 if (notify) {
10327 addReply(c,shared.mbulk3);
10328 addReply(c,shared.unsubscribebulk);
10329 addReplyBulk(c,channel);
10330 addReplyLongLong(c,dictSize(c->pubsub_channels)+
10331 listLength(c->pubsub_patterns));
10332
10333 }
10334 decrRefCount(channel); /* it is finally safe to release it */
10335 return retval;
10336 }
10337
10338 /* Subscribe a client to a pattern. Returns 1 if the operation succeeded, or 0 if the clinet was already subscribed to that pattern. */
10339 static int pubsubSubscribePattern(redisClient *c, robj *pattern) {
10340 int retval = 0;
10341
10342 if (listSearchKey(c->pubsub_patterns,pattern) == NULL) {
10343 retval = 1;
10344 pubsubPattern *pat;
10345 listAddNodeTail(c->pubsub_patterns,pattern);
10346 incrRefCount(pattern);
10347 pat = zmalloc(sizeof(*pat));
10348 pat->pattern = getDecodedObject(pattern);
10349 pat->client = c;
10350 listAddNodeTail(server.pubsub_patterns,pat);
10351 }
10352 /* Notify the client */
10353 addReply(c,shared.mbulk3);
10354 addReply(c,shared.psubscribebulk);
10355 addReplyBulk(c,pattern);
10356 addReplyLongLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
10357 return retval;
10358 }
10359
10360 /* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
10361 * 0 if the client was not subscribed to the specified channel. */
10362 static int pubsubUnsubscribePattern(redisClient *c, robj *pattern, int notify) {
10363 listNode *ln;
10364 pubsubPattern pat;
10365 int retval = 0;
10366
10367 incrRefCount(pattern); /* Protect the object. May be the same we remove */
10368 if ((ln = listSearchKey(c->pubsub_patterns,pattern)) != NULL) {
10369 retval = 1;
10370 listDelNode(c->pubsub_patterns,ln);
10371 pat.client = c;
10372 pat.pattern = pattern;
10373 ln = listSearchKey(server.pubsub_patterns,&pat);
10374 listDelNode(server.pubsub_patterns,ln);
10375 }
10376 /* Notify the client */
10377 if (notify) {
10378 addReply(c,shared.mbulk3);
10379 addReply(c,shared.punsubscribebulk);
10380 addReplyBulk(c,pattern);
10381 addReplyLongLong(c,dictSize(c->pubsub_channels)+
10382 listLength(c->pubsub_patterns));
10383 }
10384 decrRefCount(pattern);
10385 return retval;
10386 }
10387
10388 /* Unsubscribe from all the channels. Return the number of channels the
10389 * client was subscribed from. */
10390 static int pubsubUnsubscribeAllChannels(redisClient *c, int notify) {
10391 dictIterator *di = dictGetIterator(c->pubsub_channels);
10392 dictEntry *de;
10393 int count = 0;
10394
10395 while((de = dictNext(di)) != NULL) {
10396 robj *channel = dictGetEntryKey(de);
10397
10398 count += pubsubUnsubscribeChannel(c,channel,notify);
10399 }
10400 dictReleaseIterator(di);
10401 return count;
10402 }
10403
10404 /* Unsubscribe from all the patterns. Return the number of patterns the
10405 * client was subscribed from. */
10406 static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify) {
10407 listNode *ln;
10408 listIter li;
10409 int count = 0;
10410
10411 listRewind(c->pubsub_patterns,&li);
10412 while ((ln = listNext(&li)) != NULL) {
10413 robj *pattern = ln->value;
10414
10415 count += pubsubUnsubscribePattern(c,pattern,notify);
10416 }
10417 return count;
10418 }
10419
10420 /* Publish a message */
10421 static int pubsubPublishMessage(robj *channel, robj *message) {
10422 int receivers = 0;
10423 struct dictEntry *de;
10424 listNode *ln;
10425 listIter li;
10426
10427 /* Send to clients listening for that channel */
10428 de = dictFind(server.pubsub_channels,channel);
10429 if (de) {
10430 list *list = dictGetEntryVal(de);
10431 listNode *ln;
10432 listIter li;
10433
10434 listRewind(list,&li);
10435 while ((ln = listNext(&li)) != NULL) {
10436 redisClient *c = ln->value;
10437
10438 addReply(c,shared.mbulk3);
10439 addReply(c,shared.messagebulk);
10440 addReplyBulk(c,channel);
10441 addReplyBulk(c,message);
10442 receivers++;
10443 }
10444 }
10445 /* Send to clients listening to matching channels */
10446 if (listLength(server.pubsub_patterns)) {
10447 listRewind(server.pubsub_patterns,&li);
10448 channel = getDecodedObject(channel);
10449 while ((ln = listNext(&li)) != NULL) {
10450 pubsubPattern *pat = ln->value;
10451
10452 if (stringmatchlen((char*)pat->pattern->ptr,
10453 sdslen(pat->pattern->ptr),
10454 (char*)channel->ptr,
10455 sdslen(channel->ptr),0)) {
10456 addReply(pat->client,shared.mbulk4);
10457 addReply(pat->client,shared.pmessagebulk);
10458 addReplyBulk(pat->client,pat->pattern);
10459 addReplyBulk(pat->client,channel);
10460 addReplyBulk(pat->client,message);
10461 receivers++;
10462 }
10463 }
10464 decrRefCount(channel);
10465 }
10466 return receivers;
10467 }
10468
10469 static void subscribeCommand(redisClient *c) {
10470 int j;
10471
10472 for (j = 1; j < c->argc; j++)
10473 pubsubSubscribeChannel(c,c->argv[j]);
10474 }
10475
10476 static void unsubscribeCommand(redisClient *c) {
10477 if (c->argc == 1) {
10478 pubsubUnsubscribeAllChannels(c,1);
10479 return;
10480 } else {
10481 int j;
10482
10483 for (j = 1; j < c->argc; j++)
10484 pubsubUnsubscribeChannel(c,c->argv[j],1);
10485 }
10486 }
10487
10488 static void psubscribeCommand(redisClient *c) {
10489 int j;
10490
10491 for (j = 1; j < c->argc; j++)
10492 pubsubSubscribePattern(c,c->argv[j]);
10493 }
10494
10495 static void punsubscribeCommand(redisClient *c) {
10496 if (c->argc == 1) {
10497 pubsubUnsubscribeAllPatterns(c,1);
10498 return;
10499 } else {
10500 int j;
10501
10502 for (j = 1; j < c->argc; j++)
10503 pubsubUnsubscribePattern(c,c->argv[j],1);
10504 }
10505 }
10506
10507 static void publishCommand(redisClient *c) {
10508 int receivers = pubsubPublishMessage(c->argv[1],c->argv[2]);
10509 addReplyLongLong(c,receivers);
10510 }
10511
10512 /* ===================== WATCH (CAS alike for MULTI/EXEC) ===================
10513 *
10514 * The implementation uses a per-DB hash table mapping keys to list of clients
10515 * WATCHing those keys, so that given a key that is going to be modified
10516 * we can mark all the associated clients as dirty.
10517 *
 * Also every client contains a list of WATCHed keys so that it is possible to
 * un-watch such keys when the client is freed or when UNWATCH is called. */
10520
10521 /* In the client->watched_keys list we need to use watchedKey structures
10522 * as in order to identify a key in Redis we need both the key name and the
10523 * DB */
10524 typedef struct watchedKey {
10525 robj *key;
10526 redisDb *db;
10527 } watchedKey;
10528
10529 /* Watch for the specified key */
10530 static void watchForKey(redisClient *c, robj *key) {
10531 list *clients = NULL;
10532 listIter li;
10533 listNode *ln;
10534 watchedKey *wk;
10535
10536 /* Check if we are already watching for this key */
10537 listRewind(c->watched_keys,&li);
10538 while((ln = listNext(&li))) {
10539 wk = listNodeValue(ln);
10540 if (wk->db == c->db && equalStringObjects(key,wk->key))
10541 return; /* Key already watched */
10542 }
10543 /* This key is not already watched in this DB. Let's add it */
10544 clients = dictFetchValue(c->db->watched_keys,key);
10545 if (!clients) {
10546 clients = listCreate();
10547 dictAdd(c->db->watched_keys,key,clients);
10548 incrRefCount(key);
10549 }
10550 listAddNodeTail(clients,c);
10551 /* Add the new key to the lits of keys watched by this client */
10552 wk = zmalloc(sizeof(*wk));
10553 wk->key = key;
10554 wk->db = c->db;
10555 incrRefCount(key);
10556 listAddNodeTail(c->watched_keys,wk);
10557 }
10558
10559 /* Unwatch all the keys watched by this client. To clean the EXEC dirty
10560 * flag is up to the caller. */
10561 static void unwatchAllKeys(redisClient *c) {
10562 listIter li;
10563 listNode *ln;
10564
10565 if (listLength(c->watched_keys) == 0) return;
10566 listRewind(c->watched_keys,&li);
10567 while((ln = listNext(&li))) {
10568 list *clients;
10569 watchedKey *wk;
10570
10571 /* Lookup the watched key -> clients list and remove the client
10572 * from the list */
10573 wk = listNodeValue(ln);
10574 clients = dictFetchValue(wk->db->watched_keys, wk->key);
10575 assert(clients != NULL);
10576 listDelNode(clients,listSearchKey(clients,c));
10577 /* Kill the entry at all if this was the only client */
10578 if (listLength(clients) == 0)
10579 dictDelete(wk->db->watched_keys, wk->key);
10580 /* Remove this watched key from the client->watched list */
10581 listDelNode(c->watched_keys,ln);
10582 decrRefCount(wk->key);
10583 zfree(wk);
10584 }
10585 }
10586
10587 /* "Touch" a key, so that if this key is being WATCHed by some client the
10588 * next EXEC will fail. */
10589 static void touchWatchedKey(redisDb *db, robj *key) {
10590 list *clients;
10591 listIter li;
10592 listNode *ln;
10593
10594 if (dictSize(db->watched_keys) == 0) return;
10595 clients = dictFetchValue(db->watched_keys, key);
10596 if (!clients) return;
10597
10598 /* Mark all the clients watching this key as REDIS_DIRTY_CAS */
10599 /* Check if we are already watching for this key */
10600 listRewind(clients,&li);
10601 while((ln = listNext(&li))) {
10602 redisClient *c = listNodeValue(ln);
10603
10604 c->flags |= REDIS_DIRTY_CAS;
10605 }
10606 }
10607
10608 /* On FLUSHDB or FLUSHALL all the watched keys that are present before the
10609 * flush but will be deleted as effect of the flushing operation should
10610 * be touched. "dbid" is the DB that's getting the flush. -1 if it is
10611 * a FLUSHALL operation (all the DBs flushed). */
10612 static void touchWatchedKeysOnFlush(int dbid) {
10613 listIter li1, li2;
10614 listNode *ln;
10615
10616 /* For every client, check all the waited keys */
10617 listRewind(server.clients,&li1);
10618 while((ln = listNext(&li1))) {
10619 redisClient *c = listNodeValue(ln);
10620 listRewind(c->watched_keys,&li2);
10621 while((ln = listNext(&li2))) {
10622 watchedKey *wk = listNodeValue(ln);
10623
10624 /* For every watched key matching the specified DB, if the
10625 * key exists, mark the client as dirty, as the key will be
10626 * removed. */
10627 if (dbid == -1 || wk->db->id == dbid) {
10628 if (dictFind(wk->db->dict, wk->key->ptr) != NULL)
10629 c->flags |= REDIS_DIRTY_CAS;
10630 }
10631 }
10632 }
10633 }
10634
10635 static void watchCommand(redisClient *c) {
10636 int j;
10637
10638 if (c->flags & REDIS_MULTI) {
10639 addReplySds(c,sdsnew("-ERR WATCH inside MULTI is not allowed\r\n"));
10640 return;
10641 }
10642 for (j = 1; j < c->argc; j++)
10643 watchForKey(c,c->argv[j]);
10644 addReply(c,shared.ok);
10645 }
10646
10647 static void unwatchCommand(redisClient *c) {
10648 unwatchAllKeys(c);
10649 c->flags &= (~REDIS_DIRTY_CAS);
10650 addReply(c,shared.ok);
10651 }
10652
10653 /* ================================= Debugging ============================== */
10654
10655 /* Compute the sha1 of string at 's' with 'len' bytes long.
10656 * The SHA1 is then xored againt the string pointed by digest.
10657 * Since xor is commutative, this operation is used in order to
10658 * "add" digests relative to unordered elements.
10659 *
10660 * So digest(a,b,c,d) will be the same of digest(b,a,c,d) */
10661 static void xorDigest(unsigned char *digest, void *ptr, size_t len) {
10662 SHA1_CTX ctx;
10663 unsigned char hash[20], *s = ptr;
10664 int j;
10665
10666 SHA1Init(&ctx);
10667 SHA1Update(&ctx,s,len);
10668 SHA1Final(hash,&ctx);
10669
10670 for (j = 0; j < 20; j++)
10671 digest[j] ^= hash[j];
10672 }
10673
10674 static void xorObjectDigest(unsigned char *digest, robj *o) {
10675 o = getDecodedObject(o);
10676 xorDigest(digest,o->ptr,sdslen(o->ptr));
10677 decrRefCount(o);
10678 }
10679
10680 /* This function instead of just computing the SHA1 and xoring it
10681 * against diget, also perform the digest of "digest" itself and
10682 * replace the old value with the new one.
10683 *
10684 * So the final digest will be:
10685 *
10686 * digest = SHA1(digest xor SHA1(data))
10687 *
10688 * This function is used every time we want to preserve the order so
10689 * that digest(a,b,c,d) will be different than digest(b,c,d,a)
10690 *
10691 * Also note that mixdigest("foo") followed by mixdigest("bar")
10692 * will lead to a different digest compared to "fo", "obar".
10693 */
10694 static void mixDigest(unsigned char *digest, void *ptr, size_t len) {
10695 SHA1_CTX ctx;
10696 char *s = ptr;
10697
10698 xorDigest(digest,s,len);
10699 SHA1Init(&ctx);
10700 SHA1Update(&ctx,digest,20);
10701 SHA1Final(digest,&ctx);
10702 }
10703
10704 static void mixObjectDigest(unsigned char *digest, robj *o) {
10705 o = getDecodedObject(o);
10706 mixDigest(digest,o->ptr,sdslen(o->ptr));
10707 decrRefCount(o);
10708 }
10709
10710 /* Compute the dataset digest. Since keys, sets elements, hashes elements
10711 * are not ordered, we use a trick: every aggregate digest is the xor
10712 * of the digests of their elements. This way the order will not change
10713 * the result. For list instead we use a feedback entering the output digest
10714 * as input in order to ensure that a different ordered list will result in
10715 * a different digest. */
10716 static void computeDatasetDigest(unsigned char *final) {
10717 unsigned char digest[20];
10718 char buf[128];
10719 dictIterator *di = NULL;
10720 dictEntry *de;
10721 int j;
10722 uint32_t aux;
10723
10724 memset(final,0,20); /* Start with a clean result */
10725
10726 for (j = 0; j < server.dbnum; j++) {
10727 redisDb *db = server.db+j;
10728
10729 if (dictSize(db->dict) == 0) continue;
10730 di = dictGetIterator(db->dict);
10731
10732 /* hash the DB id, so the same dataset moved in a different
10733 * DB will lead to a different digest */
10734 aux = htonl(j);
10735 mixDigest(final,&aux,sizeof(aux));
10736
10737 /* Iterate this DB writing every entry */
10738 while((de = dictNext(di)) != NULL) {
10739 sds key;
10740 robj *keyobj, *o;
10741 time_t expiretime;
10742
10743 memset(digest,0,20); /* This key-val digest */
10744 key = dictGetEntryKey(de);
10745 keyobj = createStringObject(key,sdslen(key));
10746
10747 mixDigest(digest,key,sdslen(key));
10748
10749 /* Make sure the key is loaded if VM is active */
10750 o = lookupKeyRead(db,keyobj);
10751
10752 aux = htonl(o->type);
10753 mixDigest(digest,&aux,sizeof(aux));
10754 expiretime = getExpire(db,keyobj);
10755
10756 /* Save the key and associated value */
10757 if (o->type == REDIS_STRING) {
10758 mixObjectDigest(digest,o);
10759 } else if (o->type == REDIS_LIST) {
10760 list *list = o->ptr;
10761 listNode *ln;
10762 listIter li;
10763
10764 listRewind(list,&li);
10765 while((ln = listNext(&li))) {
10766 robj *eleobj = listNodeValue(ln);
10767
10768 mixObjectDigest(digest,eleobj);
10769 }
10770 } else if (o->type == REDIS_SET) {
10771 dict *set = o->ptr;
10772 dictIterator *di = dictGetIterator(set);
10773 dictEntry *de;
10774
10775 while((de = dictNext(di)) != NULL) {
10776 robj *eleobj = dictGetEntryKey(de);
10777
10778 xorObjectDigest(digest,eleobj);
10779 }
10780 dictReleaseIterator(di);
10781 } else if (o->type == REDIS_ZSET) {
10782 zset *zs = o->ptr;
10783 dictIterator *di = dictGetIterator(zs->dict);
10784 dictEntry *de;
10785
10786 while((de = dictNext(di)) != NULL) {
10787 robj *eleobj = dictGetEntryKey(de);
10788 double *score = dictGetEntryVal(de);
10789 unsigned char eledigest[20];
10790
10791 snprintf(buf,sizeof(buf),"%.17g",*score);
10792 memset(eledigest,0,20);
10793 mixObjectDigest(eledigest,eleobj);
10794 mixDigest(eledigest,buf,strlen(buf));
10795 xorDigest(digest,eledigest,20);
10796 }
10797 dictReleaseIterator(di);
10798 } else if (o->type == REDIS_HASH) {
10799 hashIterator *hi;
10800 robj *obj;
10801
10802 hi = hashInitIterator(o);
10803 while (hashNext(hi) != REDIS_ERR) {
10804 unsigned char eledigest[20];
10805
10806 memset(eledigest,0,20);
10807 obj = hashCurrent(hi,REDIS_HASH_KEY);
10808 mixObjectDigest(eledigest,obj);
10809 decrRefCount(obj);
10810 obj = hashCurrent(hi,REDIS_HASH_VALUE);
10811 mixObjectDigest(eledigest,obj);
10812 decrRefCount(obj);
10813 xorDigest(digest,eledigest,20);
10814 }
10815 hashReleaseIterator(hi);
10816 } else {
10817 redisPanic("Unknown object type");
10818 }
10819 /* If the key has an expire, add it to the mix */
10820 if (expiretime != -1) xorDigest(digest,"!!expire!!",10);
10821 /* We can finally xor the key-val digest to the final digest */
10822 xorDigest(final,digest,20);
10823 decrRefCount(keyobj);
10824 }
10825 dictReleaseIterator(di);
10826 }
10827 }
10828
10829 static void debugCommand(redisClient *c) {
10830 if (!strcasecmp(c->argv[1]->ptr,"segfault")) {
10831 *((char*)-1) = 'x';
10832 } else if (!strcasecmp(c->argv[1]->ptr,"reload")) {
10833 if (rdbSave(server.dbfilename) != REDIS_OK) {
10834 addReply(c,shared.err);
10835 return;
10836 }
10837 emptyDb();
10838 if (rdbLoad(server.dbfilename) != REDIS_OK) {
10839 addReply(c,shared.err);
10840 return;
10841 }
10842 redisLog(REDIS_WARNING,"DB reloaded by DEBUG RELOAD");
10843 addReply(c,shared.ok);
10844 } else if (!strcasecmp(c->argv[1]->ptr,"loadaof")) {
10845 emptyDb();
10846 if (loadAppendOnlyFile(server.appendfilename) != REDIS_OK) {
10847 addReply(c,shared.err);
10848 return;
10849 }
10850 redisLog(REDIS_WARNING,"Append Only File loaded by DEBUG LOADAOF");
10851 addReply(c,shared.ok);
10852 } else if (!strcasecmp(c->argv[1]->ptr,"object") && c->argc == 3) {
10853 dictEntry *de = dictFind(c->db->dict,c->argv[2]->ptr);
10854 robj *val;
10855
10856 if (!de) {
10857 addReply(c,shared.nokeyerr);
10858 return;
10859 }
10860 val = dictGetEntryVal(de);
10861 if (!server.vm_enabled || (val->storage == REDIS_VM_MEMORY ||
10862 val->storage == REDIS_VM_SWAPPING)) {
10863 char *strenc;
10864 char buf[128];
10865
10866 if (val->encoding < (sizeof(strencoding)/sizeof(char*))) {
10867 strenc = strencoding[val->encoding];
10868 } else {
10869 snprintf(buf,64,"unknown encoding %d\n", val->encoding);
10870 strenc = buf;
10871 }
10872 addReplySds(c,sdscatprintf(sdsempty(),
10873 "+Value at:%p refcount:%d "
10874 "encoding:%s serializedlength:%lld\r\n",
10875 (void*)val, val->refcount,
10876 strenc, (long long) rdbSavedObjectLen(val,NULL)));
10877 } else {
10878 vmpointer *vp = (vmpointer*) val;
10879 addReplySds(c,sdscatprintf(sdsempty(),
10880 "+Value swapped at: page %llu "
10881 "using %llu pages\r\n",
10882 (unsigned long long) vp->page,
10883 (unsigned long long) vp->usedpages));
10884 }
10885 } else if (!strcasecmp(c->argv[1]->ptr,"swapin") && c->argc == 3) {
10886 lookupKeyRead(c->db,c->argv[2]);
10887 addReply(c,shared.ok);
10888 } else if (!strcasecmp(c->argv[1]->ptr,"swapout") && c->argc == 3) {
10889 dictEntry *de = dictFind(c->db->dict,c->argv[2]->ptr);
10890 robj *val;
10891 vmpointer *vp;
10892
10893 if (!server.vm_enabled) {
10894 addReplySds(c,sdsnew("-ERR Virtual Memory is disabled\r\n"));
10895 return;
10896 }
10897 if (!de) {
10898 addReply(c,shared.nokeyerr);
10899 return;
10900 }
10901 val = dictGetEntryVal(de);
10902 /* Swap it */
10903 if (val->storage != REDIS_VM_MEMORY) {
10904 addReplySds(c,sdsnew("-ERR This key is not in memory\r\n"));
10905 } else if (val->refcount != 1) {
10906 addReplySds(c,sdsnew("-ERR Object is shared\r\n"));
10907 } else if ((vp = vmSwapObjectBlocking(val)) != NULL) {
10908 dictGetEntryVal(de) = vp;
10909 addReply(c,shared.ok);
10910 } else {
10911 addReply(c,shared.err);
10912 }
10913 } else if (!strcasecmp(c->argv[1]->ptr,"populate") && c->argc == 3) {
10914 long keys, j;
10915 robj *key, *val;
10916 char buf[128];
10917
10918 if (getLongFromObjectOrReply(c, c->argv[2], &keys, NULL) != REDIS_OK)
10919 return;
10920 for (j = 0; j < keys; j++) {
10921 snprintf(buf,sizeof(buf),"key:%lu",j);
10922 key = createStringObject(buf,strlen(buf));
10923 if (lookupKeyRead(c->db,key) != NULL) {
10924 decrRefCount(key);
10925 continue;
10926 }
10927 snprintf(buf,sizeof(buf),"value:%lu",j);
10928 val = createStringObject(buf,strlen(buf));
10929 dbAdd(c->db,key,val);
10930 decrRefCount(key);
10931 }
10932 addReply(c,shared.ok);
10933 } else if (!strcasecmp(c->argv[1]->ptr,"digest") && c->argc == 2) {
10934 unsigned char digest[20];
10935 sds d = sdsnew("+");
10936 int j;
10937
10938 computeDatasetDigest(digest);
10939 for (j = 0; j < 20; j++)
10940 d = sdscatprintf(d, "%02x",digest[j]);
10941
10942 d = sdscatlen(d,"\r\n",2);
10943 addReplySds(c,d);
10944 } else {
10945 addReplySds(c,sdsnew(
10946 "-ERR Syntax error, try DEBUG [SEGFAULT|OBJECT <key>|SWAPIN <key>|SWAPOUT <key>|RELOAD]\r\n"));
10947 }
10948 }
10949
10950 static void _redisAssert(char *estr, char *file, int line) {
10951 redisLog(REDIS_WARNING,"=== ASSERTION FAILED ===");
10952 redisLog(REDIS_WARNING,"==> %s:%d '%s' is not true",file,line,estr);
10953 #ifdef HAVE_BACKTRACE
10954 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
10955 *((char*)-1) = 'x';
10956 #endif
10957 }
10958
10959 static void _redisPanic(char *msg, char *file, int line) {
10960 redisLog(REDIS_WARNING,"!!! Software Failure. Press left mouse button to continue");
10961 redisLog(REDIS_WARNING,"Guru Meditation: %s #%s:%d",msg,file,line);
10962 #ifdef HAVE_BACKTRACE
10963 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
10964 *((char*)-1) = 'x';
10965 #endif
10966 }
10967
10968 /* =================================== Main! ================================ */
10969
10970 #ifdef __linux__
/* Read the kernel overcommit policy from /proc/sys/vm/overcommit_memory.
 * Returns the value found in the file converted with atoi(), or -1 if the
 * file cannot be opened or read. */
int linuxOvercommitMemoryValue(void) {
    char line[64];
    char *got;
    FILE *procfile = fopen("/proc/sys/vm/overcommit_memory","r");

    if (procfile == NULL) return -1;

    /* Read first, close unconditionally, then decide what to return. */
    got = fgets(line,64,procfile);
    fclose(procfile);

    return (got == NULL) ? -1 : atoi(line);
}
10984
10985 void linuxOvercommitMemoryWarning(void) {
10986 if (linuxOvercommitMemoryValue() == 0) {
10987 redisLog(REDIS_WARNING,"WARNING overcommit_memory is set to 0! Background save may fail under low memory condition. To fix this issue add 'vm.overcommit_memory = 1' to /etc/sysctl.conf and then reboot or run the command 'sysctl vm.overcommit_memory=1' for this to take effect.");
10988 }
10989 }
10990 #endif /* __linux__ */
10991
10992 static void daemonize(void) {
10993 int fd;
10994 FILE *fp;
10995
10996 if (fork() != 0) exit(0); /* parent exits */
10997 setsid(); /* create a new session */
10998
10999 /* Every output goes to /dev/null. If Redis is daemonized but
11000 * the 'logfile' is set to 'stdout' in the configuration file
11001 * it will not log at all. */
11002 if ((fd = open("/dev/null", O_RDWR, 0)) != -1) {
11003 dup2(fd, STDIN_FILENO);
11004 dup2(fd, STDOUT_FILENO);
11005 dup2(fd, STDERR_FILENO);
11006 if (fd > STDERR_FILENO) close(fd);
11007 }
11008 /* Try to write the pid file */
11009 fp = fopen(server.pidfile,"w");
11010 if (fp) {
11011 fprintf(fp,"%d\n",getpid());
11012 fclose(fp);
11013 }
11014 }
11015
11016 static void version() {
11017 printf("Redis server version %s (%s:%d)\n", REDIS_VERSION,
11018 REDIS_GIT_SHA1, atoi(REDIS_GIT_DIRTY) > 0);
11019 exit(0);
11020 }
11021
/* Print the command line synopsis on stderr and exit with status 1. */
static void usage() {
    fputs("Usage: ./redis-server [/path/to/redis.conf]\n", stderr);
    fputs(" ./redis-server - (read config from stdin)\n", stderr);
    exit(1);
}
11027
11028 int main(int argc, char **argv) {
11029 time_t start;
11030
11031 initServerConfig();
11032 sortCommandTable();
11033 if (argc == 2) {
11034 if (strcmp(argv[1], "-v") == 0 ||
11035 strcmp(argv[1], "--version") == 0) version();
11036 if (strcmp(argv[1], "--help") == 0) usage();
11037 resetServerSaveParams();
11038 loadServerConfig(argv[1]);
11039 } else if ((argc > 2)) {
11040 usage();
11041 } else {
11042 redisLog(REDIS_WARNING,"Warning: no config file specified, using the default config. In order to specify a config file use 'redis-server /path/to/redis.conf'");
11043 }
11044 if (server.daemonize) daemonize();
11045 initServer();
11046 redisLog(REDIS_NOTICE,"Server started, Redis version " REDIS_VERSION);
11047 #ifdef __linux__
11048 linuxOvercommitMemoryWarning();
11049 #endif
11050 start = time(NULL);
11051 if (server.appendonly) {
11052 if (loadAppendOnlyFile(server.appendfilename) == REDIS_OK)
11053 redisLog(REDIS_NOTICE,"DB loaded from append only file: %ld seconds",time(NULL)-start);
11054 } else {
11055 if (rdbLoad(server.dbfilename) == REDIS_OK)
11056 redisLog(REDIS_NOTICE,"DB loaded from disk: %ld seconds",time(NULL)-start);
11057 }
11058 redisLog(REDIS_NOTICE,"The server is now ready to accept connections on port %d", server.port);
11059 aeSetBeforeSleepProc(server.el,beforeSleep);
11060 aeMain(server.el);
11061 aeDeleteEventLoop(server.el);
11062 return 0;
11063 }
11064
11065 /* ============================= Backtrace support ========================= */
11066
11067 #ifdef HAVE_BACKTRACE
11068 static char *findFuncName(void *pointer, unsigned long *offset);
11069
11070 static void *getMcontextEip(ucontext_t *uc) {
11071 #if defined(__FreeBSD__)
11072 return (void*) uc->uc_mcontext.mc_eip;
11073 #elif defined(__dietlibc__)
11074 return (void*) uc->uc_mcontext.eip;
11075 #elif defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6)
11076 #if __x86_64__
11077 return (void*) uc->uc_mcontext->__ss.__rip;
11078 #else
11079 return (void*) uc->uc_mcontext->__ss.__eip;
11080 #endif
11081 #elif defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)
11082 #if defined(_STRUCT_X86_THREAD_STATE64) && !defined(__i386__)
11083 return (void*) uc->uc_mcontext->__ss.__rip;
11084 #else
11085 return (void*) uc->uc_mcontext->__ss.__eip;
11086 #endif
11087 #elif defined(__i386__) || defined(__X86_64__) || defined(__x86_64__)
11088 return (void*) uc->uc_mcontext.gregs[REG_EIP]; /* Linux 32/64 bit */
11089 #elif defined(__ia64__) /* Linux IA64 */
11090 return (void*) uc->uc_mcontext.sc_ip;
11091 #else
11092 return NULL;
11093 #endif
11094 }
11095
11096 static void segvHandler(int sig, siginfo_t *info, void *secret) {
11097 void *trace[100];
11098 char **messages = NULL;
11099 int i, trace_size = 0;
11100 unsigned long offset=0;
11101 ucontext_t *uc = (ucontext_t*) secret;
11102 sds infostring;
11103 REDIS_NOTUSED(info);
11104
11105 redisLog(REDIS_WARNING,
11106 "======= Ooops! Redis %s got signal: -%d- =======", REDIS_VERSION, sig);
11107 infostring = genRedisInfoString();
11108 redisLog(REDIS_WARNING, "%s",infostring);
11109 /* It's not safe to sdsfree() the returned string under memory
11110 * corruption conditions. Let it leak as we are going to abort */
11111
11112 trace_size = backtrace(trace, 100);
11113 /* overwrite sigaction with caller's address */
11114 if (getMcontextEip(uc) != NULL) {
11115 trace[1] = getMcontextEip(uc);
11116 }
11117 messages = backtrace_symbols(trace, trace_size);
11118
11119 for (i=1; i<trace_size; ++i) {
11120 char *fn = findFuncName(trace[i], &offset), *p;
11121
11122 p = strchr(messages[i],'+');
11123 if (!fn || (p && ((unsigned long)strtol(p+1,NULL,10)) < offset)) {
11124 redisLog(REDIS_WARNING,"%s", messages[i]);
11125 } else {
11126 redisLog(REDIS_WARNING,"%d redis-server %p %s + %d", i, trace[i], fn, (unsigned int)offset);
11127 }
11128 }
11129 /* free(messages); Don't call free() with possibly corrupted memory. */
11130 _exit(0);
11131 }
11132
11133 static void sigtermHandler(int sig) {
11134 REDIS_NOTUSED(sig);
11135
11136 redisLog(REDIS_WARNING,"SIGTERM received, scheduling shutting down...");
11137 server.shutdown_asap = 1;
11138 }
11139
11140 static void setupSigSegvAction(void) {
11141 struct sigaction act;
11142
11143 sigemptyset (&act.sa_mask);
11144 /* When the SA_SIGINFO flag is set in sa_flags then sa_sigaction
11145 * is used. Otherwise, sa_handler is used */
11146 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND | SA_SIGINFO;
11147 act.sa_sigaction = segvHandler;
11148 sigaction (SIGSEGV, &act, NULL);
11149 sigaction (SIGBUS, &act, NULL);
11150 sigaction (SIGFPE, &act, NULL);
11151 sigaction (SIGILL, &act, NULL);
11152 sigaction (SIGBUS, &act, NULL);
11153
11154 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND;
11155 act.sa_handler = sigtermHandler;
11156 sigaction (SIGTERM, &act, NULL);
11157 return;
11158 }
11159
11160 #include "staticsymbols.h"
11161 /* This function try to convert a pointer into a function name. It's used in
11162 * oreder to provide a backtrace under segmentation fault that's able to
11163 * display functions declared as static (otherwise the backtrace is useless). */
11164 static char *findFuncName(void *pointer, unsigned long *offset){
11165 int i, ret = -1;
11166 unsigned long off, minoff = 0;
11167
11168 /* Try to match against the Symbol with the smallest offset */
11169 for (i=0; symsTable[i].pointer; i++) {
11170 unsigned long lp = (unsigned long) pointer;
11171
11172 if (lp != (unsigned long)-1 && lp >= symsTable[i].pointer) {
11173 off=lp-symsTable[i].pointer;
11174 if (ret < 0 || off < minoff) {
11175 minoff=off;
11176 ret=i;
11177 }
11178 }
11179 }
11180 if (ret == -1) return NULL;
11181 *offset = minoff;
11182 return symsTable[ret].name;
11183 }
11184 #else /* HAVE_BACKTRACE */
/* Backtrace support is not compiled in: there is no handler to install, so
 * this is a no-op kept only to let callers invoke it unconditionally. */
static void setupSigSegvAction(void) {
    /* Intentionally empty. */
}
11187 #endif /* HAVE_BACKTRACE */
11188
11189
11190
11191 /* The End */
11192
11193
11194