]> git.saurik.com Git - redis.git/blob - redis.c
ca4d9f870e79b213fc44b144124361b1f4cd2f8f
[redis.git] / redis.c
1 /*
2 * Copyright (c) 2009-2010, Salvatore Sanfilippo <antirez at gmail dot com>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * * Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * * Neither the name of Redis nor the names of its contributors may be used
14 * to endorse or promote products derived from this software without
15 * specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 * POSSIBILITY OF SUCH DAMAGE.
28 */
29
30 #define REDIS_VERSION "2.1.1"
31
32 #include "fmacros.h"
33 #include "config.h"
34
35 #include <stdio.h>
36 #include <stdlib.h>
37 #include <string.h>
38 #include <time.h>
39 #include <unistd.h>
40 #include <signal.h>
41
42 #ifdef HAVE_BACKTRACE
43 #include <execinfo.h>
44 #include <ucontext.h>
45 #endif /* HAVE_BACKTRACE */
46
47 #include <sys/wait.h>
48 #include <errno.h>
49 #include <assert.h>
50 #include <ctype.h>
51 #include <stdarg.h>
52 #include <inttypes.h>
53 #include <arpa/inet.h>
54 #include <sys/stat.h>
55 #include <fcntl.h>
56 #include <sys/time.h>
57 #include <sys/resource.h>
58 #include <sys/uio.h>
59 #include <limits.h>
60 #include <float.h>
61 #include <math.h>
62 #include <pthread.h>
63
64 #if defined(__sun)
65 #include "solarisfixes.h"
66 #endif
67
68 #include "redis.h"
69 #include "ae.h" /* Event driven programming library */
70 #include "sds.h" /* Dynamic safe strings */
71 #include "anet.h" /* Networking the easy way */
72 #include "dict.h" /* Hash tables */
73 #include "adlist.h" /* Linked lists */
74 #include "zmalloc.h" /* total memory usage aware version of malloc/free */
75 #include "lzf.h" /* LZF compression library */
76 #include "pqsort.h" /* Partial qsort for SORT+LIMIT */
77 #include "zipmap.h" /* Compact dictionary-alike data structure */
78 #include "ziplist.h" /* Compact list data structure */
79 #include "sha1.h" /* SHA1 is used for DEBUG DIGEST */
80 #include "release.h" /* Release and/or git repository information */
81
82 /* Error codes */
83 #define REDIS_OK 0
84 #define REDIS_ERR -1
85
86 /* Static server configuration: compile-time defaults and limits */
87 #define REDIS_SERVERPORT 6379 /* TCP port */
88 #define REDIS_MAXIDLETIME (60*5) /* default client timeout */
89 #define REDIS_IOBUF_LEN 1024
90 #define REDIS_LOADBUF_LEN 1024
91 #define REDIS_STATIC_ARGS 8
92 #define REDIS_DEFAULT_DBNUM 16
93 #define REDIS_CONFIGLINE_MAX 1024
94 #define REDIS_OBJFREELIST_MAX 1000000 /* Max number of objects to cache */
95 #define REDIS_MAX_SYNC_TIME 60 /* Slave can't take more to sync */
96 #define REDIS_EXPIRELOOKUPS_PER_CRON 10 /* lookup 10 expires per loop */
97 #define REDIS_MAX_WRITE_PER_EVENT (1024*64)
98 #define REDIS_REQUEST_MAX_SIZE (1024*1024*256) /* max bytes in inline command */
99
100 /* If more than REDIS_WRITEV_THRESHOLD write packets are pending use writev */
101 #define REDIS_WRITEV_THRESHOLD 3
102 /* Max number of iovecs used for each writev call */
103 #define REDIS_WRITEV_IOVEC_COUNT 256
104
105 /* Hash table parameters */
106 #define REDIS_HT_MINFILL 10 /* Minimal hash table fill 10% */
107
108 /* Command flags. Each flag is a distinct bit (1, 2, 4, 8). */
109 #define REDIS_CMD_BULK 1 /* Bulk write command */
110 #define REDIS_CMD_INLINE 2 /* Inline command */
111 /* REDIS_CMD_DENYOOM deserves a longer comment: all the commands marked with
112 this flag will return an error when the 'maxmemory' option is set in the
113 config file and the server is using more than maxmemory bytes of memory.
114 In short these commands are denied on low memory conditions. */
115 #define REDIS_CMD_DENYOOM 4
116 #define REDIS_CMD_FORCE_REPLICATION 8 /* Force replication even if dirty is 0 */
117
118 /* Object types */
119 #define REDIS_STRING 0
120 #define REDIS_LIST 1
121 #define REDIS_SET 2
122 #define REDIS_ZSET 3
123 #define REDIS_HASH 4
124 #define REDIS_VMPOINTER 8 /* 'type' value of vmPointer structures (see struct vmPointer) */
125
126 /* Objects encoding. Some kind of objects like Strings and Hashes can be
127 * internally represented in multiple ways. The 'encoding' field of the object
128 * is set to one of these values for this object. */
129 #define REDIS_ENCODING_RAW 0 /* Raw representation */
130 #define REDIS_ENCODING_INT 1 /* Encoded as integer */
131 #define REDIS_ENCODING_HT 2 /* Encoded as hash table */
132 #define REDIS_ENCODING_ZIPMAP 3 /* Encoded as zipmap */
133 #define REDIS_ENCODING_LIST 4 /* Encoded as linked list */
134 #define REDIS_ENCODING_ZIPLIST 5 /* Encoded as ziplist */
135 /* Encoding names, listed in the same order as the REDIS_ENCODING_* values
 * above, so strencoding[REDIS_ENCODING_RAW] is "raw" and so on. Keep the
 * two lists in sync when adding an encoding. */
136 static char* strencoding[] = {
137 "raw", "int", "hashtable", "zipmap", "list", "ziplist"
138 };
139
140 /* Object types only used for dumping to disk */
141 #define REDIS_EXPIRETIME 253
142 #define REDIS_SELECTDB 254
143 #define REDIS_EOF 255
144
145 /* Defines related to the dump file format. To store 32 bits lengths for short
146 * keys requires a lot of space, so we check the most significant 2 bits of
147 * the first byte to interpret the length:
148 *
149 * 00|000000 => if the two MSB are 00 the len is the 6 bits of this byte
150 * 01|000000 00000000 => 01, the len is 14 bits, 6 bits + 8 bits of next byte
151 * 10|000000 [32 bit integer] => if it's 10, a full 32 bit len will follow
152 * 11|000000 this means: specially encoded object will follow. The six bits
153 * number specify the kind of object that follows.
154 * See the REDIS_RDB_ENC_* defines.
155 *
156 * Lengths up to 63 are stored using a single byte, most DB keys, and many
157 * values, will fit inside. */
158 #define REDIS_RDB_6BITLEN 0
159 #define REDIS_RDB_14BITLEN 1
160 #define REDIS_RDB_32BITLEN 2
161 #define REDIS_RDB_ENCVAL 3
162 #define REDIS_RDB_LENERR UINT_MAX
163
164 /* When a length of a string object stored on disk has the first two bits
165 * set, the remaining six bits specify a special encoding for the object
166 * according to the following defines: */
167 #define REDIS_RDB_ENC_INT8 0 /* 8 bit signed integer */
168 #define REDIS_RDB_ENC_INT16 1 /* 16 bit signed integer */
169 #define REDIS_RDB_ENC_INT32 2 /* 32 bit signed integer */
170 #define REDIS_RDB_ENC_LZF 3 /* string compressed with LZF (see lzf.h) */
171
172 /* Virtual memory: values of the 'storage' field of redisObject / vmPointer. */
173 #define REDIS_VM_MEMORY 0 /* The object is on memory */
174 #define REDIS_VM_SWAPPED 1 /* The object is on disk */
175 #define REDIS_VM_SWAPPING 2 /* Redis is swapping this object on disk */
176 #define REDIS_VM_LOADING 3 /* Redis is loading this object from disk */
177
178 /* Virtual memory static configuration stuff.
179 * Check vmFindContiguousPages() to know more about these magic numbers. */
180 #define REDIS_VM_MAX_NEAR_PAGES 65536
181 #define REDIS_VM_MAX_RANDOM_JUMP 4096
182 #define REDIS_VM_MAX_THREADS 32
183 #define REDIS_THREAD_STACK_SIZE (1024*1024*4)
184 /* The following is the *percentage* of completed I/O jobs to process when the
185 * handler is called. While Virtual Memory I/O operations are performed by
186 * threads, these operations must be processed by the main thread when completed
187 * in order to take effect. */
188 #define REDIS_MAX_COMPLETED_JOBS_PROCESSED 1
189
190 /* Client flags. Distinct bits, combined in redisClient.flags. */
191 #define REDIS_SLAVE 1 /* This client is a slave server */
192 #define REDIS_MASTER 2 /* This client is a master server */
193 #define REDIS_MONITOR 4 /* This client is a slave monitor, see MONITOR */
194 #define REDIS_MULTI 8 /* This client is in a MULTI context */
195 #define REDIS_BLOCKED 16 /* The client is waiting in a blocking operation */
196 #define REDIS_IO_WAIT 32 /* The client is waiting for Virtual Memory I/O */
197 #define REDIS_DIRTY_CAS 64 /* Watched keys modified. EXEC will fail. */
198
199 /* Slave replication state - slave side */
200 #define REDIS_REPL_NONE 0 /* No active replication */
201 #define REDIS_REPL_CONNECT 1 /* Must connect to master */
202 #define REDIS_REPL_CONNECTED 2 /* Connected to master */
203
204 /* Slave replication state - from the point of view of master
205 * Note that in SEND_BULK and ONLINE state the slave receives new updates
206 * in its output queue. In the WAIT_BGSAVE state instead the server is waiting
207 * to start the next background saving in order to send updates to it. */
208 #define REDIS_REPL_WAIT_BGSAVE_START 3 /* master waits bgsave to start feeding it */
209 #define REDIS_REPL_WAIT_BGSAVE_END 4 /* master waits bgsave to end, to start bulk DB transmission */
210 #define REDIS_REPL_SEND_BULK 5 /* master is sending the bulk DB */
211 #define REDIS_REPL_ONLINE 6 /* bulk DB already transmitted, receive updates */
212
213 /* List related stuff: which end of a list an operation refers to */
214 #define REDIS_HEAD 0
215 #define REDIS_TAIL 1
216
217 /* Sort operations */
218 #define REDIS_SORT_GET 0
219 #define REDIS_SORT_ASC 1
220 #define REDIS_SORT_DESC 2
221 #define REDIS_SORTKEY_MAX 1024
222
223 /* Log levels, from the most verbose (DEBUG) to the least (WARNING) */
224 #define REDIS_DEBUG 0
225 #define REDIS_VERBOSE 1
226 #define REDIS_NOTICE 2
227 #define REDIS_WARNING 3
228
229 /* Anti-warning macro: evaluates V and discards it, to silence
 * "unused parameter/variable" compiler warnings. */
230 #define REDIS_NOTUSED(V) ((void) V)
231
232 #define ZSKIPLIST_MAXLEVEL 32 /* Should be enough for 2^32 elements */
233 #define ZSKIPLIST_P 0.25 /* Skiplist P = 1/4 */
234
235 /* Append only defines: possible fsync policies (see server.appendfsync) */
236 #define APPENDFSYNC_NO 0
237 #define APPENDFSYNC_ALWAYS 1
238 #define APPENDFSYNC_EVERYSEC 2
239
240 /* Zip structure related defaults (see the "Zip structure config" fields
 * of struct redisServer) */
241 #define REDIS_HASH_MAX_ZIPMAP_ENTRIES 64
242 #define REDIS_HASH_MAX_ZIPMAP_VALUE 512
243 #define REDIS_LIST_MAX_ZIPLIST_ENTRIES 1024
244 #define REDIS_LIST_MAX_ZIPLIST_VALUE 32
245
/* We can print the stacktrace, so our assert is defined this way.
 *
 * redisAssert(_e): when the expression _e is false, report it through
 * _redisAssert() (stringified expression, file and line) and then terminate
 * the process with _exit(1). When _e is true nothing happens.
 *
 * redisPanic(_e): unconditionally report through _redisPanic() and then
 * terminate with _exit(1). The argument is stringified with '#', so a call
 * such as redisPanic("msg") reports the text including its double quotes.
 *
 * Both macros expand to a single, fully parenthesized expression of type
 * void, so they can be used safely wherever an expression is allowed
 * (for instance as an operand of ?: or of the comma operator). */
#define redisAssert(_e) ((_e)?(void)0 : (_redisAssert(#_e,__FILE__,__LINE__),_exit(1)))
#define redisPanic(_e) (_redisPanic(#_e,__FILE__,__LINE__),_exit(1))
static void _redisAssert(char *estr, char *file, int line);
static void _redisPanic(char *msg, char *file, int line);
251
252 /*================================= Data types ============================== */
253
254 /* A redis object, that is a type able to hold a string / list / set */
255
256 /* The actual Redis Object. The four bit-fields below add up to 32 bits. */
257 typedef struct redisObject {
258 unsigned type:4; /* object type, e.g. REDIS_STRING (see the "Object types" defines) */
259 unsigned storage:2; /* REDIS_VM_MEMORY or REDIS_VM_SWAPPING */
260 unsigned encoding:4; /* one of the REDIS_ENCODING_* values */
261 unsigned lru:22; /* lru time (relative to server.lruclock) */
262 int refcount; /* reference count; initStaticStringObject() sets it to 1 */
263 void *ptr; /* the payload; what it points to presumably depends on type/encoding -- TODO confirm */
264 /* VM fields are only allocated if VM is active, otherwise the
265 * object allocation function will just allocate
266 * sizeof(redisObject) minus sizeof(redisObjectVM), so using
267 * Redis without VM active will not have any overhead.
 *
 * NOTE(review): no VM fields are declared in this struct any more (swap
 * information lives in struct vmPointer), so the paragraph above looks
 * stale -- confirm and remove. */
268 } robj;
269
270 /* The VM pointer structure - identifies an object in the swap file.
271 *
272 * This object is stored in place of the value
273 * object in the main key->value hash table representing a database.
274 * Note that the first fields (type, storage) are the same as the redisObject
275 * structure so that vmPointer structures can be accessed even when cast
276 * to redisObject structures.
277 *
278 * This is useful as we don't know if a value object is or not on disk, but we
279 * are always able to read obj->storage to check this. For vmPointer
280 * structures "type" is set to REDIS_VMPOINTER (even if without this field
281 * it is still possible to check the kind of object from the value of 'storage').*/
282 typedef struct vmPointer {
283 unsigned type:4; /* same position and width as redisObject.type */
284 unsigned storage:2; /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
285 unsigned notused:26; /* pads the bit-fields to 32 bits (4 + 2 + 26) */
286 unsigned int vtype; /* type of the object stored in the swap file */
287 off_t page; /* the page at which the object is stored on disk */
288 off_t usedpages; /* number of pages used on disk */
289 } vmpointer;
290
/* Macro used to initialize a Redis object allocated on the stack.
 * Note that this macro is kept near the structure definition to make sure
 * we'll update it when the structure is changed, to avoid bugs like
 * bug #85 introduced exactly in this way.
 *
 * _var: an lvalue designating the object itself (not a pointer to it);
 *       any lvalue expression works, e.g. *p or arr[i].
 * _ptr: the value to store in the object's 'ptr' field.
 *
 * The macro sets refcount to 1, type to REDIS_STRING, encoding to
 * REDIS_ENCODING_RAW and storage to REDIS_VM_MEMORY. The 'lru' field is
 * left untouched.
 *
 * The expansion intentionally does NOT end with a semicolon: the caller
 * writes it, so the macro behaves as exactly one statement and can be used
 * as the body of an unbraced if/else. */
#define initStaticStringObject(_var,_ptr) do { \
    (_var).refcount = 1; \
    (_var).type = REDIS_STRING; \
    (_var).encoding = REDIS_ENCODING_RAW; \
    (_var).ptr = (_ptr); \
    (_var).storage = REDIS_VM_MEMORY; \
} while(0)
302
/* Per-database state: the keyspace plus the auxiliary dictionaries used for
 * expires, blocking operations, VM I/O waits and WATCH. */
303 typedef struct redisDb {
304 dict *dict; /* The keyspace for this DB */
305 dict *expires; /* Timeout of keys with a timeout set */
306 dict *blocking_keys; /* Keys with clients waiting for data (BLPOP) */
307 dict *io_keys; /* Keys with clients waiting for VM I/O */
308 dict *watched_keys; /* WATCHED keys for MULTI/EXEC CAS */
309 int id; /* presumably the database number -- TODO confirm */
310 } redisDb;
311
312 /* Client MULTI/EXEC state */
/* One command captured while the client is in a MULTI context: its argument
 * vector, argument count and the command table entry. */
313 typedef struct multiCmd {
314 robj **argv;
315 int argc;
316 struct redisCommand *cmd;
317 } multiCmd;
318
/* The per-client collection of multiCmd entries (see redisClient.mstate). */
319 typedef struct multiState {
320 multiCmd *commands; /* Array of MULTI commands */
321 int count; /* Total number of MULTI commands */
322 } multiState;
323
324 /* With multiplexing we need to take per-client state.
325 * Clients are taken in a linked list. */
326 typedef struct redisClient {
327 int fd; /* presumably the client socket descriptor -- TODO confirm */
328 redisDb *db;
329 int dictid;
330 sds querybuf;
331 robj **argv, **mbargv;
332 int argc, mbargc;
333 int bulklen; /* bulk read len. -1 if not in bulk read mode */
334 int multibulk; /* multi bulk command format active */
335 list *reply;
336 int sentlen;
337 time_t lastinteraction; /* time of the last interaction, used for timeout */
338 int flags; /* REDIS_SLAVE | REDIS_MONITOR | REDIS_MULTI ... */
339 int slaveseldb; /* slave selected db, if this client is a slave */
340 int authenticated; /* when requirepass is non-NULL */
341 int replstate; /* replication state if this is a slave */
342 int repldbfd; /* replication DB file descriptor */
343 long repldboff; /* replication DB file offset. NOTE(review): 'long' here, while repldbsize is off_t */
344 off_t repldbsize; /* replication DB file size */
345 multiState mstate; /* MULTI/EXEC state */
346 robj **blocking_keys; /* The keys we are waiting on to terminate a blocking
347 * operation such as BLPOP. Otherwise NULL. */
348 int blocking_keys_num; /* Number of blocking keys */
349 time_t blockingto; /* Blocking operation timeout. If UNIX current time
350 * is >= blockingto then the operation timed out. */
351 list *io_keys; /* Keys this client is waiting to be loaded from the
352 * swap file in order to continue. */
353 list *watched_keys; /* Keys WATCHED for MULTI/EXEC CAS */
354 dict *pubsub_channels; /* channels a client is interested in (SUBSCRIBE) */
355 list *pubsub_patterns; /* patterns a client is interested in (SUBSCRIBE) */
356 } redisClient;
357
/* A (seconds, changes) pair; struct redisServer keeps them in 'saveparams'
 * together with 'saveparamslen'.
 * NOTE(review): presumably "save when at least 'changes' changes happened
 * within 'seconds' seconds" -- confirm against the code that reads it. */
358 struct saveparam {
359 time_t seconds;
360 int changes;
361 };
362
363 /* Global server state structure. A single instance exists, the file-scope
 * variable 'server'. */
364 struct redisServer {
365 int port;
366 int fd;
367 redisDb *db;
368 long long dirty; /* changes to DB from the last save */
369 list *clients;
370 list *slaves, *monitors;
371 char neterr[ANET_ERR_LEN];
372 aeEventLoop *el;
373 int cronloops; /* number of times the cron function run */
374 list *objfreelist; /* A list of freed objects to avoid malloc() */
375 time_t lastsave; /* Unix time of last successful save */
376 /* Fields used only for stats */
377 time_t stat_starttime; /* server start time */
378 long long stat_numcommands; /* number of processed commands */
379 long long stat_numconnections; /* number of connections received */
380 long long stat_expiredkeys; /* number of expired keys */
381 /* Configuration */
382 int verbosity;
383 int glueoutputbuf;
384 int maxidletime;
385 int dbnum;
386 int daemonize;
387 int appendonly;
388 int appendfsync; /* presumably one of the APPENDFSYNC_* values -- TODO confirm */
389 int no_appendfsync_on_rewrite;
390 int shutdown_asap;
391 time_t lastfsync;
392 int appendfd;
393 int appendseldb;
394 char *pidfile;
395 pid_t bgsavechildpid;
396 pid_t bgrewritechildpid;
397 sds bgrewritebuf; /* buffer taken by parent during append only rewrite */
398 sds aofbuf; /* AOF buffer, written before entering the event loop */
399 struct saveparam *saveparams;
400 int saveparamslen;
401 char *logfile;
402 char *bindaddr;
403 char *dbfilename;
404 char *appendfilename;
405 char *requirepass;
406 int rdbcompression;
407 int activerehashing;
408 /* Replication related */
409 int isslave;
410 char *masterauth;
411 char *masterhost;
412 int masterport;
413 redisClient *master; /* client that is master for this slave */
414 int replstate;
415 unsigned int maxclients;
416 unsigned long long maxmemory;
417 unsigned int blpop_blocked_clients;
418 unsigned int vm_blocked_clients;
419 /* Sort parameters - qsort_r() is only available under BSD so we
420 * have to take this state global, in order to pass it to sortCompare() */
421 int sort_desc;
422 int sort_alpha;
423 int sort_bypattern;
424 /* Virtual memory configuration */
425 int vm_enabled;
426 char *vm_swap_file;
427 off_t vm_page_size;
428 off_t vm_pages;
429 unsigned long long vm_max_memory;
430 /* Zip structure config */
431 size_t hash_max_zipmap_entries;
432 size_t hash_max_zipmap_value;
433 size_t list_max_ziplist_entries;
434 size_t list_max_ziplist_value;
435 /* Virtual memory state */
436 FILE *vm_fp;
437 int vm_fd;
438 off_t vm_next_page; /* Next probably empty page */
439 off_t vm_near_pages; /* Number of pages allocated sequentially */
440 unsigned char *vm_bitmap; /* Bitmap of free/used pages */
441 time_t unixtime; /* Unix time sampled every second. */
442 /* Virtual memory I/O threads stuff */
443 /* An I/O thread processes an element taken from the io_jobs queue and
444 * puts the result of the operation in the io_done list. While the
445 * job is being processed, it's put on the io_processing queue.
 * NOTE(review): "io_jobs" and "io_done" do not match the field names
 * below (io_newjobs, io_processed) -- presumably older names; confirm. */
446 list *io_newjobs; /* List of VM I/O jobs yet to be processed */
447 list *io_processing; /* List of VM I/O jobs being processed */
448 list *io_processed; /* List of VM I/O jobs already processed */
449 list *io_ready_clients; /* Clients ready to be unblocked. All keys loaded */
450 pthread_mutex_t io_mutex; /* lock to access io_jobs/io_done/io_thread_job */
451 pthread_mutex_t obj_freelist_mutex; /* safe redis objects creation/free */
452 pthread_mutex_t io_swapfile_mutex; /* So we can lseek + write */
453 pthread_attr_t io_threads_attr; /* attributes for threads creation */
454 int io_active_threads; /* Number of running I/O threads */
455 int vm_max_threads; /* Max number of I/O threads running at the same time */
456 /* Our main thread is blocked on the event loop, looking for sockets ready
457 * to be read or written, so when a threaded I/O operation is ready to be
458 * processed by the main thread, the I/O thread will use a unix pipe to
459 * awake the main thread. The followings are the two pipe FDs. */
460 int io_ready_pipe_read;
461 int io_ready_pipe_write;
462 /* Virtual memory stats */
463 unsigned long long vm_stats_used_pages;
464 unsigned long long vm_stats_swapped_objects;
465 unsigned long long vm_stats_swapouts;
466 unsigned long long vm_stats_swapins;
467 /* Pubsub */
468 dict *pubsub_channels; /* Map channels to list of subscribed clients */
469 list *pubsub_patterns; /* A list of pubsub_patterns */
470 /* Misc */
471 FILE *devnull;
472 unsigned lruclock:22; /* clock incrementing every minute, for LRU */
473 unsigned lruclock_padding:10; /* pads lruclock to 32 bits (22 + 10) */
474 };
475
/* A (client, pattern) pair. Both redisClient and redisServer have a
 * 'pubsub_patterns' list; presumably the server-wide list holds these
 * entries -- TODO confirm. */
476 typedef struct pubsubPattern {
477 redisClient *client;
478 robj *pattern;
479 } pubsubPattern;
480
/* Signature shared by every command implementation (the *Command functions
 * declared further below). */
481 typedef void redisCommandProc(redisClient *c);
/* Signature of the optional per-command VM preload hook. */
482 typedef void redisVmPreloadProc(redisClient *c, struct redisCommand *cmd, int argc, robj **argv);
/* Entry describing one command. */
483 struct redisCommand {
484 char *name;
485 redisCommandProc *proc;
486 int arity;
487 int flags; /* presumably an OR of the REDIS_CMD_* flags -- TODO confirm */
488 /* Use a function to determine which keys need to be loaded
489 * in the background prior to executing this command. Takes precedence
490 * over vm_firstkey and others, ignored when NULL */
491 redisVmPreloadProc *vm_preload_proc;
492 /* What keys should be loaded in background when calling this command? */
493 int vm_firstkey; /* The first argument that's a key (0 = no keys) */
494 int vm_lastkey; /* The last argument that's a key */
495 int vm_keystep; /* The step between first and last key */
496 };
497
/* A (name, address) pair, with the address stored as an unsigned long.
 * NOTE(review): presumably used to map addresses back to function names
 * when printing a stack trace -- confirm against the users of this type. */
498 struct redisFunctionSym {
499 char *name;
500 unsigned long pointer;
501 };
502
/* One element being sorted: the object plus the value it is compared by,
 * which is either a numeric score or another object. Which union member is
 * valid is not recorded here; presumably it follows the server.sort_*
 * settings -- TODO confirm. */
503 typedef struct _redisSortObject {
504 robj *obj;
505 union {
506 double score;
507 robj *cmpobj;
508 } u;
509 } redisSortObject;
510
/* A sort operation: its kind (see the "Sort operations" defines, e.g.
 * REDIS_SORT_GET) and the pattern it applies to. */
511 typedef struct _redisSortOperation {
512 int type;
513 robj *pattern;
514 } redisSortOperation;
515
516 /* ZSETs use a specialized version of Skiplists */
517
/* A skiplist node. 'forward' and 'span' are pointers to separately stored
 * arrays; presumably one entry per level of the node -- TODO confirm. */
518 typedef struct zskiplistNode {
519 struct zskiplistNode **forward;
520 struct zskiplistNode *backward;
521 unsigned int *span;
522 double score;
523 robj *obj;
524 } zskiplistNode;
525
/* The skiplist itself: header and tail nodes, element count and level.
 * ZSKIPLIST_MAXLEVEL presumably bounds 'level' -- TODO confirm. */
526 typedef struct zskiplist {
527 struct zskiplistNode *header, *tail;
528 unsigned long length;
529 int level;
530 } zskiplist;
531
/* A sorted set pairs a dict with a skiplist. */
532 typedef struct zset {
533 dict *dict;
534 zskiplist *zsl;
535 } zset;
536
537 /* Our shared "common" objects */
538
539 #define REDIS_SHARED_INTEGERS 10000
/* Pointers to the shared objects, grouped in the single global 'shared'.
 * 'integers' has REDIS_SHARED_INTEGERS slots; presumably slot i holds the
 * object for the integer i -- TODO confirm against the initialization code.
 * Note that 'shared' is defined here without 'static'. */
540 struct sharedObjectsStruct {
541 robj *crlf, *ok, *err, *emptybulk, *czero, *cone, *pong, *space,
542 *colon, *nullbulk, *nullmultibulk, *queued,
543 *emptymultibulk, *wrongtypeerr, *nokeyerr, *syntaxerr, *sameobjecterr,
544 *outofrangeerr, *plus,
545 *select0, *select1, *select2, *select3, *select4,
546 *select5, *select6, *select7, *select8, *select9,
547 *messagebulk, *pmessagebulk, *subscribebulk, *unsubscribebulk, *mbulk3,
548 *mbulk4, *psubscribebulk, *punsubscribebulk,
549 *integers[REDIS_SHARED_INTEGERS];
550 } shared;
551
552 /* Global vars that are actually used as constants. The following double
553 * values are used for double on-disk serialization, and are initialized
554 * at runtime to avoid strange compiler optimizations. */
555
556 static double R_Zero, R_PosInf, R_NegInf, R_Nan;
557
558 /* VM threaded I/O request message */
559 #define REDIS_IOJOB_LOAD 0 /* Load from disk to memory */
560 #define REDIS_IOJOB_PREPARE_SWAP 1 /* Compute needed pages */
561 #define REDIS_IOJOB_DO_SWAP 2 /* Swap from memory to disk */
562 typedef struct iojob {
563 int type; /* Request type, REDIS_IOJOB_* */
564 redisDb *db;/* Redis database */
565 robj *key; /* This I/O request is about swapping this key */
566 robj *id; /* Unique identifier of this job:
567 this is the object to swap for REDIS_IOJOB_*_SWAP, or the
568 vmpointer object for REDIS_IOJOB_LOAD. */
569 robj *val; /* the value to swap for REDIS_IOJOB_*_SWAP, otherwise this
570 * field is populated by the I/O thread for REDIS_IOJOB_LOAD. */
571 off_t page; /* Swap page where to read/write the object */
572 off_t pages; /* Swap pages needed to save object. PREPARE_SWAP return val */
573 int canceled; /* True if this command was canceled by blocking side of VM */
574 pthread_t thread; /* ID of the thread processing this entry */
575 } iojob;
576
577 /*================================ Prototypes =============================== */
578 /* Forward declarations of file-local (static) helper functions, so that
 * they can be referenced before their definitions. */
579 static void freeStringObject(robj *o);
580 static void freeListObject(robj *o);
581 static void freeSetObject(robj *o);
582 static void decrRefCount(void *o);
583 static robj *createObject(int type, void *ptr);
584 static void freeClient(redisClient *c);
585 static int rdbLoad(char *filename);
586 static void addReply(redisClient *c, robj *obj);
587 static void addReplySds(redisClient *c, sds s);
588 static void incrRefCount(robj *o);
589 static int rdbSaveBackground(char *filename);
590 static robj *createStringObject(char *ptr, size_t len);
591 static robj *dupStringObject(robj *o);
592 static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc);
593 static void replicationFeedMonitors(list *monitors, int dictid, robj **argv, int argc);
594 static void flushAppendOnlyFile(void);
595 static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc);
596 static int syncWithMaster(void);
597 static robj *tryObjectEncoding(robj *o);
598 static robj *getDecodedObject(robj *o);
599 static int removeExpire(redisDb *db, robj *key);
600 static int expireIfNeeded(redisDb *db, robj *key);
601 static int deleteIfVolatile(redisDb *db, robj *key);
602 static int dbDelete(redisDb *db, robj *key);
603 static time_t getExpire(redisDb *db, robj *key);
604 static int setExpire(redisDb *db, robj *key, time_t when);
605 static void updateSlavesWaitingBgsave(int bgsaveerr);
606 static void freeMemoryIfNeeded(void);
607 static int processCommand(redisClient *c);
608 static void setupSigSegvAction(void);
609 static void rdbRemoveTempFile(pid_t childpid);
610 static void aofRemoveTempFile(pid_t childpid);
611 static size_t stringObjectLen(robj *o);
612 static void processInputBuffer(redisClient *c);
613 static zskiplist *zslCreate(void);
614 static void zslFree(zskiplist *zsl);
615 static void zslInsert(zskiplist *zsl, double score, robj *obj);
616 static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask);
617 static void initClientMultiState(redisClient *c);
618 static void freeClientMultiState(redisClient *c);
619 static void queueMultiCommand(redisClient *c, struct redisCommand *cmd);
620 static void unblockClientWaitingData(redisClient *c);
621 static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele);
622 static void vmInit(void);
623 static void vmMarkPagesFree(off_t page, off_t count);
624 static robj *vmLoadObject(robj *o);
625 static robj *vmPreviewObject(robj *o);
626 static int vmSwapOneObjectBlocking(void);
627 static int vmSwapOneObjectThreaded(void);
628 static int vmCanSwapOut(void);
629 static int tryFreeOneObjectFromFreelist(void);
630 static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask);
631 static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata, int mask);
632 static void vmCancelThreadedIOJob(robj *o);
633 static void lockThreadedIO(void);
634 static void unlockThreadedIO(void);
635 static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db);
636 static void freeIOJob(iojob *j);
637 static void queueIOJob(iojob *j);
638 static int vmWriteObjectOnSwap(robj *o, off_t page);
639 static robj *vmReadObjectFromSwap(off_t page, int type);
640 static void waitEmptyIOJobsQueue(void);
641 static void vmReopenSwapFile(void);
642 static int vmFreePage(off_t page);
/* The next two match the redisVmPreloadProc signature. */
643 static void zunionInterBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv);
644 static void execBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv);
645 static int blockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd);
646 static int dontWaitForSwappedKey(redisClient *c, robj *key);
647 static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key);
648 static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask);
649 static struct redisCommand *lookupCommand(char *name);
650 static void call(redisClient *c, struct redisCommand *cmd);
651 static void resetClient(redisClient *c);
652 static void convertToRealHash(robj *o);
653 static void listTypeConvert(robj *o, int enc);
654 static int pubsubUnsubscribeAllChannels(redisClient *c, int notify);
655 static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify);
656 static void freePubsubPattern(void *p);
657 static int listMatchPubsubPattern(void *a, void *b);
658 static int compareStringObjects(robj *a, robj *b);
659 static int equalStringObjects(robj *a, robj *b);
660 static void usage();
661 static int rewriteAppendOnlyFileBackground(void);
662 static vmpointer *vmSwapObjectBlocking(robj *val);
663 static int prepareForShutdown();
664 static void touchWatchedKey(redisDb *db, robj *key);
665 static void touchWatchedKeysOnFlush(int dbid);
666 static void unwatchAllKeys(redisClient *c);
667 /* Command implementations. Every function below has the redisCommandProc
 * signature: it takes the client and returns nothing. */
668 static void authCommand(redisClient *c);
669 static void pingCommand(redisClient *c);
670 static void echoCommand(redisClient *c);
671 static void setCommand(redisClient *c);
672 static void setnxCommand(redisClient *c);
673 static void setexCommand(redisClient *c);
674 static void getCommand(redisClient *c);
675 static void delCommand(redisClient *c);
676 static void existsCommand(redisClient *c);
677 static void incrCommand(redisClient *c);
678 static void decrCommand(redisClient *c);
679 static void incrbyCommand(redisClient *c);
680 static void decrbyCommand(redisClient *c);
681 static void selectCommand(redisClient *c);
682 static void randomkeyCommand(redisClient *c);
683 static void keysCommand(redisClient *c);
684 static void dbsizeCommand(redisClient *c);
685 static void lastsaveCommand(redisClient *c);
686 static void saveCommand(redisClient *c);
687 static void bgsaveCommand(redisClient *c);
688 static void bgrewriteaofCommand(redisClient *c);
689 static void shutdownCommand(redisClient *c);
690 static void moveCommand(redisClient *c);
691 static void renameCommand(redisClient *c);
692 static void renamenxCommand(redisClient *c);
693 static void lpushCommand(redisClient *c);
694 static void rpushCommand(redisClient *c);
695 static void lpushxCommand(redisClient *c);
696 static void rpushxCommand(redisClient *c);
697 static void linsertCommand(redisClient *c);
698 static void lpopCommand(redisClient *c);
699 static void rpopCommand(redisClient *c);
700 static void llenCommand(redisClient *c);
701 static void lindexCommand(redisClient *c);
702 static void lrangeCommand(redisClient *c);
703 static void ltrimCommand(redisClient *c);
704 static void typeCommand(redisClient *c);
705 static void lsetCommand(redisClient *c);
706 static void saddCommand(redisClient *c);
707 static void sremCommand(redisClient *c);
708 static void smoveCommand(redisClient *c);
709 static void sismemberCommand(redisClient *c);
710 static void scardCommand(redisClient *c);
711 static void spopCommand(redisClient *c);
712 static void srandmemberCommand(redisClient *c);
713 static void sinterCommand(redisClient *c);
714 static void sinterstoreCommand(redisClient *c);
715 static void sunionCommand(redisClient *c);
716 static void sunionstoreCommand(redisClient *c);
717 static void sdiffCommand(redisClient *c);
718 static void sdiffstoreCommand(redisClient *c);
719 static void syncCommand(redisClient *c);
720 static void flushdbCommand(redisClient *c);
721 static void flushallCommand(redisClient *c);
722 static void sortCommand(redisClient *c);
723 static void lremCommand(redisClient *c);
/* NOTE(review): lower-case 'c' in "command", unlike every other handler
 * name in this list; renaming requires changing the definition and all
 * references as well. */
724 static void rpoplpushcommand(redisClient *c);
725 static void infoCommand(redisClient *c);
726 static void mgetCommand(redisClient *c);
727 static void monitorCommand(redisClient *c);
728 static void expireCommand(redisClient *c);
729 static void expireatCommand(redisClient *c);
730 static void getsetCommand(redisClient *c);
731 static void ttlCommand(redisClient *c);
732 static void slaveofCommand(redisClient *c);
733 static void debugCommand(redisClient *c);
734 static void msetCommand(redisClient *c);
735 static void msetnxCommand(redisClient *c);
736 static void zaddCommand(redisClient *c);
737 static void zincrbyCommand(redisClient *c);
738 static void zrangeCommand(redisClient *c);
739 static void zrangebyscoreCommand(redisClient *c);
740 static void zcountCommand(redisClient *c);
741 static void zrevrangeCommand(redisClient *c);
742 static void zcardCommand(redisClient *c);
743 static void zremCommand(redisClient *c);
744 static void zscoreCommand(redisClient *c);
745 static void zremrangebyscoreCommand(redisClient *c);
746 static void multiCommand(redisClient *c);
747 static void execCommand(redisClient *c);
748 static void discardCommand(redisClient *c);
749 static void blpopCommand(redisClient *c);
750 static void brpopCommand(redisClient *c);
751 static void appendCommand(redisClient *c);
752 static void substrCommand(redisClient *c);
753 static void zrankCommand(redisClient *c);
754 static void zrevrankCommand(redisClient *c);
755 static void hsetCommand(redisClient *c);
756 static void hsetnxCommand(redisClient *c);
757 static void hgetCommand(redisClient *c);
758 static void hmsetCommand(redisClient *c);
759 static void hmgetCommand(redisClient *c);
760 static void hdelCommand(redisClient *c);
761 static void hlenCommand(redisClient *c);
762 static void zremrangebyrankCommand(redisClient *c);
763 static void zunionstoreCommand(redisClient *c);
764 static void zinterstoreCommand(redisClient *c);
765 static void hkeysCommand(redisClient *c);
766 static void hvalsCommand(redisClient *c);
767 static void hgetallCommand(redisClient *c);
768 static void hexistsCommand(redisClient *c);
769 static void configCommand(redisClient *c);
770 static void hincrbyCommand(redisClient *c);
771 static void subscribeCommand(redisClient *c);
772 static void unsubscribeCommand(redisClient *c);
773 static void psubscribeCommand(redisClient *c);
774 static void punsubscribeCommand(redisClient *c);
775 static void publishCommand(redisClient *c);
776 static void watchCommand(redisClient *c);
777 static void unwatchCommand(redisClient *c);
778
779 /*================================= Globals ================================= */
780
/* Global vars */
static struct redisServer server; /* server global state */
/* Pointer to a command table. It is only declared at this point (no
 * initializer); where it gets assigned is not visible in this part of the
 * file. */
static struct redisCommand *commandTable;
/* Static table with one row per command known to the server.
 *
 * The layout of struct redisCommand is not visible in this part of the file;
 * judging from the rows the columns are:
 *   1. the command name (lowercase string),
 *   2. the C function implementing the command,
 *   3. an integer arity -- NOTE(review): presumably the number of arguments
 *      including the command name, a negative value -N meaning "N or more";
 *      confirm against the struct definition,
 *   4. REDIS_CMD_* flags OR-ed together,
 *   5. an optional function pointer, NULL in most rows; only zunionstore,
 *      zinterstore and exec provide a *BlockClientOnSwappedKeys function,
 *   6-8. three integers -- NOTE(review): presumably first key position, last
 *      key position and key step used to locate the key arguments; confirm.
 *
 * NOTE(review): "zscore" carries REDIS_CMD_DENYOOM while the other read-style
 * sorted set rows (zrank, zrevrank, zcard, zcount) do not; check whether this
 * is intended. */
static struct redisCommand readonlyCommandTable[] = {
    {"get",getCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"set",setCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
    {"setnx",setnxCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
    {"setex",setexCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
    {"append",appendCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"substr",substrCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
    {"del",delCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
    {"exists",existsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"incr",incrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"decr",decrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"mget",mgetCommand,-2,REDIS_CMD_INLINE,NULL,1,-1,1},
    {"rpush",rpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"lpush",lpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"rpushx",rpushxCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"lpushx",lpushxCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"linsert",linsertCommand,5,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"rpop",rpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"lpop",lpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"brpop",brpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
    {"blpop",blpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
    {"llen",llenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"lindex",lindexCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
    {"lset",lsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"lrange",lrangeCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
    {"ltrim",ltrimCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
    {"lrem",lremCommand,4,REDIS_CMD_BULK,NULL,1,1,1},
    {"rpoplpush",rpoplpushcommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,2,1},
    {"sadd",saddCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"srem",sremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
    {"smove",smoveCommand,4,REDIS_CMD_BULK,NULL,1,2,1},
    {"sismember",sismemberCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
    {"scard",scardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"spop",spopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"srandmember",srandmemberCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"sinter",sinterCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
    {"sinterstore",sinterstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
    {"sunion",sunionCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
    {"sunionstore",sunionstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
    {"sdiff",sdiffCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
    {"sdiffstore",sdiffstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
    {"smembers",sinterCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"zadd",zaddCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"zincrby",zincrbyCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"zrem",zremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
    {"zremrangebyscore",zremrangebyscoreCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
    {"zremrangebyrank",zremrangebyrankCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
    {"zunionstore",zunionstoreCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
    {"zinterstore",zinterstoreCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
    {"zrange",zrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
    {"zrangebyscore",zrangebyscoreCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
    {"zcount",zcountCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
    {"zrevrange",zrevrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
    {"zcard",zcardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"zscore",zscoreCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"zrank",zrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
    {"zrevrank",zrevrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
    {"hset",hsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"hsetnx",hsetnxCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"hget",hgetCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
    {"hmset",hmsetCommand,-4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"hmget",hmgetCommand,-3,REDIS_CMD_BULK,NULL,1,1,1},
    {"hincrby",hincrbyCommand,4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"hdel",hdelCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
    {"hlen",hlenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"hkeys",hkeysCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"hvals",hvalsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"hgetall",hgetallCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"hexists",hexistsCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
    {"incrby",incrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"decrby",decrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"getset",getsetCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"mset",msetCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
    {"msetnx",msetnxCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
    {"randomkey",randomkeyCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"select",selectCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
    {"move",moveCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
    {"rename",renameCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
    {"renamenx",renamenxCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
    {"expire",expireCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
    {"expireat",expireatCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
    {"keys",keysCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
    {"dbsize",dbsizeCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"auth",authCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
    {"ping",pingCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"echo",echoCommand,2,REDIS_CMD_BULK,NULL,0,0,0},
    {"save",saveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"bgsave",bgsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"bgrewriteaof",bgrewriteaofCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"shutdown",shutdownCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"lastsave",lastsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"type",typeCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"multi",multiCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"exec",execCommand,1,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,execBlockClientOnSwappedKeys,0,0,0},
    {"discard",discardCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"sync",syncCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"flushdb",flushdbCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"flushall",flushallCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"sort",sortCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"info",infoCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"monitor",monitorCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"ttl",ttlCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"slaveof",slaveofCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
    {"debug",debugCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
    {"config",configCommand,-2,REDIS_CMD_BULK,NULL,0,0,0},
    {"subscribe",subscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
    {"unsubscribe",unsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"psubscribe",psubscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
    {"punsubscribe",punsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"publish",publishCommand,3,REDIS_CMD_BULK|REDIS_CMD_FORCE_REPLICATION,NULL,0,0,0},
    {"watch",watchCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
    {"unwatch",unwatchCommand,1,REDIS_CMD_INLINE,NULL,0,0,0}
};
897
898 /*============================ Utility functions ============================ */
899
/* Glob-style pattern matching.
 *
 * Matches the first 'stringLen' bytes of 'string' against the first
 * 'patternLen' bytes of 'pattern'. Supported pattern syntax:
 *
 *   *       any sequence of bytes, including the empty one
 *   ?       exactly one byte
 *   [abc]   one byte out of the listed ones; a leading '^' negates the
 *           class, "a-z" is an inclusive range, and a backslash inside the
 *           brackets takes the next byte literally
 *   \x      the byte x taken literally
 *
 * When 'nocase' is non zero literal bytes and ranges are compared after
 * tolower(); escaped bytes inside a bracket class are always compared as is.
 *
 * Returns 1 if the whole string matches the whole pattern, 0 otherwise.
 *
 * Neither buffer is read past its stated length, and a pattern element that
 * needs one byte of input never matches when the string is exhausted (so
 * stringLen never becomes negative). */
static int stringmatchlen(const char *pattern, int patternLen,
        const char *string, int stringLen, int nocase)
{
    while(patternLen) {
        switch(pattern[0]) {
        case '*':
            /* Consecutive stars are equivalent to a single one. */
            while (patternLen > 1 && pattern[1] == '*') {
                pattern++;
                patternLen--;
            }
            if (patternLen == 1)
                return 1; /* trailing star matches whatever is left */
            /* Try to match the rest of the pattern at every position. */
            while(stringLen) {
                if (stringmatchlen(pattern+1, patternLen-1,
                            string, stringLen, nocase))
                    return 1; /* match */
                string++;
                stringLen--;
            }
            return 0; /* no match */
        case '?':
            if (stringLen == 0)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        case '[':
        {
            int not, match;

            /* A class always consumes one byte: nothing to consume means
             * no match, even for a negated class. */
            if (stringLen == 0)
                return 0; /* no match */
            pattern++;
            patternLen--;
            not = patternLen > 0 && pattern[0] == '^';
            if (not) {
                pattern++;
                patternLen--;
            }
            match = 0;
            while(1) {
                if (patternLen == 0) {
                    /* Unterminated class: step back by one so that the
                     * pattern++ shared by all the cases below leaves us
                     * exactly at the end of the pattern. */
                    pattern--;
                    patternLen++;
                    break;
                } else if (pattern[0] == '\\' && patternLen >= 2) {
                    pattern++;
                    patternLen--;
                    if (pattern[0] == string[0])
                        match = 1;
                } else if (pattern[0] == ']') {
                    break;
                } else if (patternLen >= 3 && pattern[1] == '-') {
                    int start = pattern[0];
                    int end = pattern[2];
                    int c = string[0];
                    if (start > end) {
                        int t = start;
                        start = end;
                        end = t;
                    }
                    if (nocase) {
                        start = tolower(start);
                        end = tolower(end);
                        c = tolower(c);
                    }
                    pattern += 2;
                    patternLen -= 2;
                    if (c >= start && c <= end)
                        match = 1;
                } else {
                    if (!nocase) {
                        if (pattern[0] == string[0])
                            match = 1;
                    } else {
                        if (tolower((int)pattern[0]) == tolower((int)string[0]))
                            match = 1;
                    }
                }
                pattern++;
                patternLen--;
            }
            if (not)
                match = !match;
            if (!match)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        }
        case '\\':
            /* A trailing backslash stands for itself. */
            if (patternLen >= 2) {
                pattern++;
                patternLen--;
            }
            /* fall through */
        default:
            if (stringLen == 0)
                return 0; /* a literal byte needs a byte to match */
            if (!nocase) {
                if (pattern[0] != string[0])
                    return 0; /* no match */
            } else {
                if (tolower((int)pattern[0]) != tolower((int)string[0]))
                    return 0; /* no match */
            }
            string++;
            stringLen--;
            break;
        }
        pattern++;
        patternLen--;
        if (stringLen == 0) {
            /* String consumed: only trailing stars may still match. */
            while(patternLen > 0 && *pattern == '*') {
                pattern++;
                patternLen--;
            }
            break;
        }
    }
    if (patternLen == 0 && stringLen == 0)
        return 1;
    return 0;
}
1022
/* Convenience wrapper around stringmatchlen() for nul terminated C strings:
 * both lengths are obtained with strlen(). Returns what stringmatchlen()
 * returns. */
static int stringmatch(const char *pattern, const char *string, int nocase) {
    int plen = strlen(pattern);
    int slen = strlen(string);

    return stringmatchlen(pattern, plen, string, slen, nocase);
}
1026
/* Convert a string representing an amount of memory into the number of
 * bytes, so for instance memtoll("1gb") will return 1073741824 that is
 * (1024*1024*1024).
 *
 * The string is an optional '-', a run of decimal digits and an optional
 * unit, matched case insensitively:
 *   (none) or "b" -> 1          "k" -> 1000        "kb" -> 1024
 *   "m" -> 1000*1000            "mb" -> 1024*1024
 *   "g" -> 1000*1000*1000       "gb" -> 1024*1024*1024
 *
 * If 'err' is not NULL, *err is set to 0 on success and to 1 on error.
 * Error cases:
 *   - unknown unit: the numeric part is returned unscaled;
 *   - numeric part longer than the internal buffer: LLONG_MAX is returned;
 *   - the scaled value does not fit a long long: LLONG_MAX or LLONG_MIN is
 *     returned depending on the sign. */
static long long memtoll(const char *p, int *err) {
    const char *u;
    char buf[128];
    long long mul; /* unit multiplier, always >= 1 */
    long long val;
    unsigned int digits;

    if (err) *err = 0;
    /* Search the first non digit character. */
    u = p;
    if (*u == '-') u++;
    /* isdigit() is only defined for unsigned char values and EOF. */
    while(*u && isdigit((unsigned char)*u)) u++;
    if (*u == '\0' || !strcasecmp(u,"b")) {
        mul = 1;
    } else if (!strcasecmp(u,"k")) {
        mul = 1000;
    } else if (!strcasecmp(u,"kb")) {
        mul = 1024;
    } else if (!strcasecmp(u,"m")) {
        mul = 1000*1000;
    } else if (!strcasecmp(u,"mb")) {
        mul = 1024*1024;
    } else if (!strcasecmp(u,"g")) {
        mul = 1000L*1000*1000;
    } else if (!strcasecmp(u,"gb")) {
        mul = 1024L*1024*1024;
    } else {
        if (err) *err = 1;
        mul = 1;
    }
    /* Copy sign and digits (without the unit) so strtoll() can parse them. */
    digits = u-p;
    if (digits >= sizeof(buf)) {
        if (err) *err = 1;
        return LLONG_MAX;
    }
    memcpy(buf,p,digits);
    buf[digits] = '\0';
    val = strtoll(buf,NULL,10);
    /* Signed overflow is undefined behavior: check before multiplying. */
    if (val > LLONG_MAX/mul) {
        if (err) *err = 1;
        return LLONG_MAX;
    }
    if (val < LLONG_MIN/mul) {
        if (err) *err = 1;
        return LLONG_MIN;
    }
    return val*mul;
}
1073
/* Convert a long long into a string.
 *
 * The decimal representation of 'value' is stored into 's', a buffer of
 * 'len' bytes, always followed by a nul terminator. If the buffer is too
 * small the representation is truncated on the right so that it fits.
 *
 * Returns the number of characters stored, not counting the terminator
 * (0 when 'len' is 0, in which case 's' is not touched). */
static int ll2string(char *s, size_t len, long long value) {
    char buf[32], *p;
    unsigned long long v;
    size_t l;

    if (len == 0) return 0;
    /* Take the absolute value in unsigned arithmetic: -value would overflow
     * (undefined behavior) when value is LLONG_MIN. */
    v = (value < 0) ? 0ULL-(unsigned long long)value
                    : (unsigned long long)value;
    p = buf+31; /* point to the last character */
    /* Emit the digits right to left. */
    do {
        *p-- = '0'+(v%10);
        v /= 10;
    } while(v);
    if (value < 0) *p-- = '-';
    p++; /* p now points to the first character of the number */
    l = 32-(p-buf);
    if (l+1 > len) l = len-1; /* Make sure it fits, including the nul term */
    memcpy(s,p,l);
    s[l] = '\0';
    return l;
}
1097
1098 static void redisLog(int level, const char *fmt, ...) {
1099 va_list ap;
1100 FILE *fp;
1101
1102 fp = (server.logfile == NULL) ? stdout : fopen(server.logfile,"a");
1103 if (!fp) return;
1104
1105 va_start(ap, fmt);
1106 if (level >= server.verbosity) {
1107 char *c = ".-*#";
1108 char buf[64];
1109 time_t now;
1110
1111 now = time(NULL);
1112 strftime(buf,64,"%d %b %H:%M:%S",localtime(&now));
1113 fprintf(fp,"[%d] %s %c ",(int)getpid(),buf,c[level]);
1114 vfprintf(fp, fmt, ap);
1115 fprintf(fp,"\n");
1116 fflush(fp);
1117 }
1118 va_end(ap);
1119
1120 if (server.logfile) fclose(fp);
1121 }
1122
1123 /*====================== Hash table type implementation ==================== */
1124
/* This is a hash table type that uses the SDS dynamic strings library as
 * keys and Redis objects as values (objects can hold SDS strings,
 * lists, sets). */
1128
/* Destructor callback releasing the pointer with a plain zfree().
 * 'privdata' is not used. */
static void dictVanillaFree(void *privdata, void *val) {
    DICT_NOTUSED(privdata);

    zfree(val);
}
1134
1135 static void dictListDestructor(void *privdata, void *val)
1136 {
1137 DICT_NOTUSED(privdata);
1138 listRelease((list*)val);
1139 }
1140
1141 static int dictSdsKeyCompare(void *privdata, const void *key1,
1142 const void *key2)
1143 {
1144 int l1,l2;
1145 DICT_NOTUSED(privdata);
1146
1147 l1 = sdslen((sds)key1);
1148 l2 = sdslen((sds)key2);
1149 if (l1 != l2) return 0;
1150 return memcmp(key1, key2, l1) == 0;
1151 }
1152
/* Destructor callback for Redis objects: drops one reference with
 * decrRefCount(). NULL is accepted and ignored, since values of swapped out
 * keys are set to NULL. 'privdata' is not used. */
static void dictRedisObjectDestructor(void *privdata, void *val) {
    DICT_NOTUSED(privdata);

    if (val != NULL) decrRefCount(val);
}
1160
/* Destructor callback for sds strings: the string is released with
 * sdsfree(). 'privdata' is not used. */
static void dictSdsDestructor(void *privdata, void *val) {
    DICT_NOTUSED(privdata);
    sdsfree(val);
}
1167
1168 static int dictObjKeyCompare(void *privdata, const void *key1,
1169 const void *key2)
1170 {
1171 const robj *o1 = key1, *o2 = key2;
1172 return dictSdsKeyCompare(privdata,o1->ptr,o2->ptr);
1173 }
1174
1175 static unsigned int dictObjHash(const void *key) {
1176 const robj *o = key;
1177 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1178 }
1179
/* Hash callback for keys that are sds strings: hashes all the sdslen()
 * bytes of the string. */
static unsigned int dictSdsHash(const void *key) {
    unsigned char *bytes = (unsigned char*)key;

    return dictGenHashFunction(bytes, sdslen((char*)key));
}
1183
1184 static int dictEncObjKeyCompare(void *privdata, const void *key1,
1185 const void *key2)
1186 {
1187 robj *o1 = (robj*) key1, *o2 = (robj*) key2;
1188 int cmp;
1189
1190 if (o1->encoding == REDIS_ENCODING_INT &&
1191 o2->encoding == REDIS_ENCODING_INT)
1192 return o1->ptr == o2->ptr;
1193
1194 o1 = getDecodedObject(o1);
1195 o2 = getDecodedObject(o2);
1196 cmp = dictSdsKeyCompare(privdata,o1->ptr,o2->ptr);
1197 decrRefCount(o1);
1198 decrRefCount(o2);
1199 return cmp;
1200 }
1201
1202 static unsigned int dictEncObjHash(const void *key) {
1203 robj *o = (robj*) key;
1204
1205 if (o->encoding == REDIS_ENCODING_RAW) {
1206 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1207 } else {
1208 if (o->encoding == REDIS_ENCODING_INT) {
1209 char buf[32];
1210 int len;
1211
1212 len = ll2string(buf,32,(long)o->ptr);
1213 return dictGenHashFunction((unsigned char*)buf, len);
1214 } else {
1215 unsigned int hash;
1216
1217 o = getDecodedObject(o);
1218 hash = dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1219 decrRefCount(o);
1220 return hash;
1221 }
1222 }
1223 }
1224
/* Sets type: keys are Redis objects, possibly encoded (the hash and compare
 * callbacks are the encoding aware ones), released with decrRefCount() by
 * the key destructor. No value destructor is installed. */
static dictType setDictType = {
    dictEncObjHash,            /* hash function */
    NULL,                      /* key dup */
    NULL,                      /* val dup */
    dictEncObjKeyCompare,      /* key compare */
    dictRedisObjectDestructor, /* key destructor */
    NULL                       /* val destructor */
};
1234
/* Sorted sets hash (note: a skiplist is used in addition to the hash table).
 * Keys are possibly encoded Redis objects released with decrRefCount();
 * values are released with a plain zfree(). */
static dictType zsetDictType = {
    dictEncObjHash,            /* hash function */
    NULL,                      /* key dup */
    NULL,                      /* val dup */
    dictEncObjKeyCompare,      /* key compare */
    dictRedisObjectDestructor, /* key destructor */
    dictVanillaFree            /* val destructor of malloc(sizeof(double)) */
};
1244
/* Db->dict, keys are sds strings, vals are Redis objects.
 * Keys are released with sdsfree(), values with decrRefCount() (a NULL
 * value, used for swapped out keys, is ignored by the destructor). */
static dictType dbDictType = {
    dictSdsHash,               /* hash function */
    NULL,                      /* key dup */
    NULL,                      /* val dup */
    dictSdsKeyCompare,         /* key compare */
    dictSdsDestructor,         /* key destructor */
    dictRedisObjectDestructor  /* val destructor */
};
1254
/* Db->expires: keys are sds strings released with sdsfree(). There is no
 * value destructor: serverCron() reads the values back with a cast to
 * time_t, so nothing needs to be freed. */
static dictType keyptrDictType = {
    dictSdsHash,               /* hash function */
    NULL,                      /* key dup */
    NULL,                      /* val dup */
    dictSdsKeyCompare,         /* key compare */
    dictSdsDestructor,         /* key destructor */
    NULL                       /* val destructor */
};
1264
/* Hash type hash table (note that small hashes are represented with zipmaps).
 * Both keys and values are Redis objects released with decrRefCount(); keys
 * may be encoded, so the encoding aware hash and compare callbacks are used. */
static dictType hashDictType = {
    dictEncObjHash,             /* hash function */
    NULL,                       /* key dup */
    NULL,                       /* val dup */
    dictEncObjKeyCompare,       /* key compare */
    dictRedisObjectDestructor,  /* key destructor */
    dictRedisObjectDestructor   /* val destructor */
};
1274
/* Keylist hash table type has unencoded redis objects as keys and
 * lists as values. It's used for blocking operations (BLPOP) and to
 * map swapped keys to a list of clients waiting for these keys to be loaded.
 * Keys are released with decrRefCount(), values with listRelease(). */
static dictType keylistDictType = {
    dictObjHash,                /* hash function */
    NULL,                       /* key dup */
    NULL,                       /* val dup */
    dictObjKeyCompare,          /* key compare */
    dictRedisObjectDestructor,  /* key destructor */
    dictListDestructor          /* val destructor */
};
1286
1287 static void version();
1288
1289 /* ========================= Random utility functions ======================= */
1290
1291 /* Redis generally does not try to recover from out of memory conditions
1292 * when allocating objects or strings, it is not clear if it will be possible
1293 * to report this condition to the client since the networking layer itself
1294 * is based on heap allocation for send buffers, so we simply abort.
1295 * At least the code will be simpler to read... */
1296 static void oom(const char *msg) {
1297 redisLog(REDIS_WARNING, "%s: Out of memory\n",msg);
1298 sleep(1);
1299 abort();
1300 }
1301
1302 /* ====================== Redis server networking stuff ===================== */
1303 static void closeTimedoutClients(void) {
1304 redisClient *c;
1305 listNode *ln;
1306 time_t now = time(NULL);
1307 listIter li;
1308
1309 listRewind(server.clients,&li);
1310 while ((ln = listNext(&li)) != NULL) {
1311 c = listNodeValue(ln);
1312 if (server.maxidletime &&
1313 !(c->flags & REDIS_SLAVE) && /* no timeout for slaves */
1314 !(c->flags & REDIS_MASTER) && /* no timeout for masters */
1315 dictSize(c->pubsub_channels) == 0 && /* no timeout for pubsub */
1316 listLength(c->pubsub_patterns) == 0 &&
1317 (now - c->lastinteraction > server.maxidletime))
1318 {
1319 redisLog(REDIS_VERBOSE,"Closing idle client");
1320 freeClient(c);
1321 } else if (c->flags & REDIS_BLOCKED) {
1322 if (c->blockingto != 0 && c->blockingto < now) {
1323 addReply(c,shared.nullmultibulk);
1324 unblockClientWaitingData(c);
1325 }
1326 }
1327 }
1328 }
1329
1330 static int htNeedsResize(dict *dict) {
1331 long long size, used;
1332
1333 size = dictSlots(dict);
1334 used = dictSize(dict);
1335 return (size && used && size > DICT_HT_INITIAL_SIZE &&
1336 (used*100/size < REDIS_HT_MINFILL));
1337 }
1338
1339 /* If the percentage of used slots in the HT reaches REDIS_HT_MINFILL
1340 * we resize the hash table to save memory */
1341 static void tryResizeHashTables(void) {
1342 int j;
1343
1344 for (j = 0; j < server.dbnum; j++) {
1345 if (htNeedsResize(server.db[j].dict))
1346 dictResize(server.db[j].dict);
1347 if (htNeedsResize(server.db[j].expires))
1348 dictResize(server.db[j].expires);
1349 }
1350 }
1351
1352 /* Our hash table implementation performs rehashing incrementally while
1353 * we write/read from the hash table. Still if the server is idle, the hash
1354 * table will use two tables for a long time. So we try to use 1 millisecond
1355 * of CPU time at every serverCron() loop in order to rehash some key. */
1356 static void incrementallyRehash(void) {
1357 int j;
1358
1359 for (j = 0; j < server.dbnum; j++) {
1360 if (dictIsRehashing(server.db[j].dict)) {
1361 dictRehashMilliseconds(server.db[j].dict,1);
1362 break; /* already used our millisecond for this loop... */
1363 }
1364 }
1365 }
1366
1367 /* A background saving child (BGSAVE) terminated its work. Handle this. */
1368 void backgroundSaveDoneHandler(int statloc) {
1369 int exitcode = WEXITSTATUS(statloc);
1370 int bysignal = WIFSIGNALED(statloc);
1371
1372 if (!bysignal && exitcode == 0) {
1373 redisLog(REDIS_NOTICE,
1374 "Background saving terminated with success");
1375 server.dirty = 0;
1376 server.lastsave = time(NULL);
1377 } else if (!bysignal && exitcode != 0) {
1378 redisLog(REDIS_WARNING, "Background saving error");
1379 } else {
1380 redisLog(REDIS_WARNING,
1381 "Background saving terminated by signal %d", WTERMSIG(statloc));
1382 rdbRemoveTempFile(server.bgsavechildpid);
1383 }
1384 server.bgsavechildpid = -1;
1385 /* Possibly there are slaves waiting for a BGSAVE in order to be served
1386 * (the first stage of SYNC is a bulk transfer of dump.rdb) */
1387 updateSlavesWaitingBgsave(exitcode == 0 ? REDIS_OK : REDIS_ERR);
1388 }
1389
1390 /* A background append only file rewriting (BGREWRITEAOF) terminated its work.
1391 * Handle this. */
1392 void backgroundRewriteDoneHandler(int statloc) {
1393 int exitcode = WEXITSTATUS(statloc);
1394 int bysignal = WIFSIGNALED(statloc);
1395
1396 if (!bysignal && exitcode == 0) {
1397 int fd;
1398 char tmpfile[256];
1399
1400 redisLog(REDIS_NOTICE,
1401 "Background append only file rewriting terminated with success");
1402 /* Now it's time to flush the differences accumulated by the parent */
1403 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) server.bgrewritechildpid);
1404 fd = open(tmpfile,O_WRONLY|O_APPEND);
1405 if (fd == -1) {
1406 redisLog(REDIS_WARNING, "Not able to open the temp append only file produced by the child: %s", strerror(errno));
1407 goto cleanup;
1408 }
1409 /* Flush our data... */
1410 if (write(fd,server.bgrewritebuf,sdslen(server.bgrewritebuf)) !=
1411 (signed) sdslen(server.bgrewritebuf)) {
1412 redisLog(REDIS_WARNING, "Error or short write trying to flush the parent diff of the append log file in the child temp file: %s", strerror(errno));
1413 close(fd);
1414 goto cleanup;
1415 }
1416 redisLog(REDIS_NOTICE,"Parent diff flushed into the new append log file with success (%lu bytes)",sdslen(server.bgrewritebuf));
1417 /* Now our work is to rename the temp file into the stable file. And
1418 * switch the file descriptor used by the server for append only. */
1419 if (rename(tmpfile,server.appendfilename) == -1) {
1420 redisLog(REDIS_WARNING,"Can't rename the temp append only file into the stable one: %s", strerror(errno));
1421 close(fd);
1422 goto cleanup;
1423 }
1424 /* Mission completed... almost */
1425 redisLog(REDIS_NOTICE,"Append only file successfully rewritten.");
1426 if (server.appendfd != -1) {
1427 /* If append only is actually enabled... */
1428 close(server.appendfd);
1429 server.appendfd = fd;
1430 if (server.appendfsync != APPENDFSYNC_NO) aof_fsync(fd);
1431 server.appendseldb = -1; /* Make sure it will issue SELECT */
1432 redisLog(REDIS_NOTICE,"The new append only file was selected for future appends.");
1433 } else {
1434 /* If append only is disabled we just generate a dump in this
1435 * format. Why not? */
1436 close(fd);
1437 }
1438 } else if (!bysignal && exitcode != 0) {
1439 redisLog(REDIS_WARNING, "Background append only file rewriting error");
1440 } else {
1441 redisLog(REDIS_WARNING,
1442 "Background append only file rewriting terminated by signal %d",
1443 WTERMSIG(statloc));
1444 }
1445 cleanup:
1446 sdsfree(server.bgrewritebuf);
1447 server.bgrewritebuf = sdsempty();
1448 aofRemoveTempFile(server.bgrewritechildpid);
1449 server.bgrewritechildpid = -1;
1450 }
1451
1452 /* This function is called once a background process of some kind terminates,
1453 * as we want to avoid resizing the hash tables when there is a child in order
1454 * to play well with copy-on-write (otherwise when a resize happens lots of
1455 * memory pages are copied). The goal of this function is to update the ability
1456 * for dict.c to resize the hash tables accordingly to the fact we have o not
1457 * running childs. */
1458 static void updateDictResizePolicy(void) {
1459 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1)
1460 dictEnableResize();
1461 else
1462 dictDisableResize();
1463 }
1464
1465 static int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
1466 int j, loops = server.cronloops++;
1467 REDIS_NOTUSED(eventLoop);
1468 REDIS_NOTUSED(id);
1469 REDIS_NOTUSED(clientData);
1470
1471 /* We take a cached value of the unix time in the global state because
1472 * with virtual memory and aging there is to store the current time
1473 * in objects at every object access, and accuracy is not needed.
1474 * To access a global var is faster than calling time(NULL) */
1475 server.unixtime = time(NULL);
1476 /* We have just 21 bits per object for LRU information.
1477 * So we use an (eventually wrapping) LRU clock with minutes resolution.
1478 *
1479 * When we need to select what object to swap, we compute the minimum
1480 * time distance between the current lruclock and the object last access
1481 * lruclock info. Even if clocks will wrap on overflow, there is
1482 * the interesting property that we are sure that at least
1483 * ABS(A-B) minutes passed between current time and timestamp B.
1484 *
1485 * This is not precise but we don't need at all precision, but just
1486 * something statistically reasonable.
1487 */
1488 server.lruclock = (time(NULL)/60)&((1<<21)-1);
1489
1490 /* We received a SIGTERM, shutting down here in a safe way, as it is
1491 * not ok doing so inside the signal handler. */
1492 if (server.shutdown_asap) {
1493 if (prepareForShutdown() == REDIS_OK) exit(0);
1494 redisLog(REDIS_WARNING,"SIGTERM received but errors trying to shut down the server, check the logs for more information");
1495 }
1496
1497 /* Show some info about non-empty databases */
1498 for (j = 0; j < server.dbnum; j++) {
1499 long long size, used, vkeys;
1500
1501 size = dictSlots(server.db[j].dict);
1502 used = dictSize(server.db[j].dict);
1503 vkeys = dictSize(server.db[j].expires);
1504 if (!(loops % 50) && (used || vkeys)) {
1505 redisLog(REDIS_VERBOSE,"DB %d: %lld keys (%lld volatile) in %lld slots HT.",j,used,vkeys,size);
1506 /* dictPrintStats(server.dict); */
1507 }
1508 }
1509
1510 /* We don't want to resize the hash tables while a bacground saving
1511 * is in progress: the saving child is created using fork() that is
1512 * implemented with a copy-on-write semantic in most modern systems, so
1513 * if we resize the HT while there is the saving child at work actually
1514 * a lot of memory movements in the parent will cause a lot of pages
1515 * copied. */
1516 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1) {
1517 if (!(loops % 10)) tryResizeHashTables();
1518 if (server.activerehashing) incrementallyRehash();
1519 }
1520
1521 /* Show information about connected clients */
1522 if (!(loops % 50)) {
1523 redisLog(REDIS_VERBOSE,"%d clients connected (%d slaves), %zu bytes in use",
1524 listLength(server.clients)-listLength(server.slaves),
1525 listLength(server.slaves),
1526 zmalloc_used_memory());
1527 }
1528
1529 /* Close connections of timedout clients */
1530 if ((server.maxidletime && !(loops % 100)) || server.blpop_blocked_clients)
1531 closeTimedoutClients();
1532
1533 /* Check if a background saving or AOF rewrite in progress terminated */
1534 if (server.bgsavechildpid != -1 || server.bgrewritechildpid != -1) {
1535 int statloc;
1536 pid_t pid;
1537
1538 if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) {
1539 if (pid == server.bgsavechildpid) {
1540 backgroundSaveDoneHandler(statloc);
1541 } else {
1542 backgroundRewriteDoneHandler(statloc);
1543 }
1544 updateDictResizePolicy();
1545 }
1546 } else {
1547 /* If there is not a background saving in progress check if
1548 * we have to save now */
1549 time_t now = time(NULL);
1550 for (j = 0; j < server.saveparamslen; j++) {
1551 struct saveparam *sp = server.saveparams+j;
1552
1553 if (server.dirty >= sp->changes &&
1554 now-server.lastsave > sp->seconds) {
1555 redisLog(REDIS_NOTICE,"%d changes in %d seconds. Saving...",
1556 sp->changes, sp->seconds);
1557 rdbSaveBackground(server.dbfilename);
1558 break;
1559 }
1560 }
1561 }
1562
1563 /* Try to expire a few timed out keys. The algorithm used is adaptive and
1564 * will use few CPU cycles if there are few expiring keys, otherwise
1565 * it will get more aggressive to avoid that too much memory is used by
1566 * keys that can be removed from the keyspace. */
1567 for (j = 0; j < server.dbnum; j++) {
1568 int expired;
1569 redisDb *db = server.db+j;
1570
1571 /* Continue to expire if at the end of the cycle more than 25%
1572 * of the keys were expired. */
1573 do {
1574 long num = dictSize(db->expires);
1575 time_t now = time(NULL);
1576
1577 expired = 0;
1578 if (num > REDIS_EXPIRELOOKUPS_PER_CRON)
1579 num = REDIS_EXPIRELOOKUPS_PER_CRON;
1580 while (num--) {
1581 dictEntry *de;
1582 time_t t;
1583
1584 if ((de = dictGetRandomKey(db->expires)) == NULL) break;
1585 t = (time_t) dictGetEntryVal(de);
1586 if (now > t) {
1587 sds key = dictGetEntryKey(de);
1588 robj *keyobj = createStringObject(key,sdslen(key));
1589
1590 dbDelete(db,keyobj);
1591 decrRefCount(keyobj);
1592 expired++;
1593 server.stat_expiredkeys++;
1594 }
1595 }
1596 } while (expired > REDIS_EXPIRELOOKUPS_PER_CRON/4);
1597 }
1598
1599 /* Swap a few keys on disk if we are over the memory limit and VM
1600 * is enbled. Try to free objects from the free list first. */
1601 if (vmCanSwapOut()) {
1602 while (server.vm_enabled && zmalloc_used_memory() >
1603 server.vm_max_memory)
1604 {
1605 int retval;
1606
1607 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
1608 retval = (server.vm_max_threads == 0) ?
1609 vmSwapOneObjectBlocking() :
1610 vmSwapOneObjectThreaded();
1611 if (retval == REDIS_ERR && !(loops % 300) &&
1612 zmalloc_used_memory() >
1613 (server.vm_max_memory+server.vm_max_memory/10))
1614 {
1615 redisLog(REDIS_WARNING,"WARNING: vm-max-memory limit exceeded by more than 10%% but unable to swap more objects out!");
1616 }
1617 /* Note that when using threade I/O we free just one object,
1618 * because anyway when the I/O thread in charge to swap this
1619 * object out will finish, the handler of completed jobs
1620 * will try to swap more objects if we are still out of memory. */
1621 if (retval == REDIS_ERR || server.vm_max_threads > 0) break;
1622 }
1623 }
1624
1625 /* Check if we should connect to a MASTER */
1626 if (server.replstate == REDIS_REPL_CONNECT && !(loops % 10)) {
1627 redisLog(REDIS_NOTICE,"Connecting to MASTER...");
1628 if (syncWithMaster() == REDIS_OK) {
1629 redisLog(REDIS_NOTICE,"MASTER <-> SLAVE sync succeeded");
1630 if (server.appendonly) rewriteAppendOnlyFileBackground();
1631 }
1632 }
1633 return 100;
1634 }
1635
1636 /* This function gets called every time Redis is entering the
1637 * main loop of the event driven library, that is, before to sleep
1638 * for ready file descriptors. */
1639 static void beforeSleep(struct aeEventLoop *eventLoop) {
1640 REDIS_NOTUSED(eventLoop);
1641
1642 /* Awake clients that got all the swapped keys they requested */
1643 if (server.vm_enabled && listLength(server.io_ready_clients)) {
1644 listIter li;
1645 listNode *ln;
1646
1647 listRewind(server.io_ready_clients,&li);
1648 while((ln = listNext(&li))) {
1649 redisClient *c = ln->value;
1650 struct redisCommand *cmd;
1651
1652 /* Resume the client. */
1653 listDelNode(server.io_ready_clients,ln);
1654 c->flags &= (~REDIS_IO_WAIT);
1655 server.vm_blocked_clients--;
1656 aeCreateFileEvent(server.el, c->fd, AE_READABLE,
1657 readQueryFromClient, c);
1658 cmd = lookupCommand(c->argv[0]->ptr);
1659 assert(cmd != NULL);
1660 call(c,cmd);
1661 resetClient(c);
1662 /* There may be more data to process in the input buffer. */
1663 if (c->querybuf && sdslen(c->querybuf) > 0)
1664 processInputBuffer(c);
1665 }
1666 }
1667 /* Write the AOF buffer on disk */
1668 flushAppendOnlyFile();
1669 }
1670
1671 static void createSharedObjects(void) {
1672 int j;
1673
1674 shared.crlf = createObject(REDIS_STRING,sdsnew("\r\n"));
1675 shared.ok = createObject(REDIS_STRING,sdsnew("+OK\r\n"));
1676 shared.err = createObject(REDIS_STRING,sdsnew("-ERR\r\n"));
1677 shared.emptybulk = createObject(REDIS_STRING,sdsnew("$0\r\n\r\n"));
1678 shared.czero = createObject(REDIS_STRING,sdsnew(":0\r\n"));
1679 shared.cone = createObject(REDIS_STRING,sdsnew(":1\r\n"));
1680 shared.nullbulk = createObject(REDIS_STRING,sdsnew("$-1\r\n"));
1681 shared.nullmultibulk = createObject(REDIS_STRING,sdsnew("*-1\r\n"));
1682 shared.emptymultibulk = createObject(REDIS_STRING,sdsnew("*0\r\n"));
1683 shared.pong = createObject(REDIS_STRING,sdsnew("+PONG\r\n"));
1684 shared.queued = createObject(REDIS_STRING,sdsnew("+QUEUED\r\n"));
1685 shared.wrongtypeerr = createObject(REDIS_STRING,sdsnew(
1686 "-ERR Operation against a key holding the wrong kind of value\r\n"));
1687 shared.nokeyerr = createObject(REDIS_STRING,sdsnew(
1688 "-ERR no such key\r\n"));
1689 shared.syntaxerr = createObject(REDIS_STRING,sdsnew(
1690 "-ERR syntax error\r\n"));
1691 shared.sameobjecterr = createObject(REDIS_STRING,sdsnew(
1692 "-ERR source and destination objects are the same\r\n"));
1693 shared.outofrangeerr = createObject(REDIS_STRING,sdsnew(
1694 "-ERR index out of range\r\n"));
1695 shared.space = createObject(REDIS_STRING,sdsnew(" "));
1696 shared.colon = createObject(REDIS_STRING,sdsnew(":"));
1697 shared.plus = createObject(REDIS_STRING,sdsnew("+"));
1698 shared.select0 = createStringObject("select 0\r\n",10);
1699 shared.select1 = createStringObject("select 1\r\n",10);
1700 shared.select2 = createStringObject("select 2\r\n",10);
1701 shared.select3 = createStringObject("select 3\r\n",10);
1702 shared.select4 = createStringObject("select 4\r\n",10);
1703 shared.select5 = createStringObject("select 5\r\n",10);
1704 shared.select6 = createStringObject("select 6\r\n",10);
1705 shared.select7 = createStringObject("select 7\r\n",10);
1706 shared.select8 = createStringObject("select 8\r\n",10);
1707 shared.select9 = createStringObject("select 9\r\n",10);
1708 shared.messagebulk = createStringObject("$7\r\nmessage\r\n",13);
1709 shared.pmessagebulk = createStringObject("$8\r\npmessage\r\n",14);
1710 shared.subscribebulk = createStringObject("$9\r\nsubscribe\r\n",15);
1711 shared.unsubscribebulk = createStringObject("$11\r\nunsubscribe\r\n",18);
1712 shared.psubscribebulk = createStringObject("$10\r\npsubscribe\r\n",17);
1713 shared.punsubscribebulk = createStringObject("$12\r\npunsubscribe\r\n",19);
1714 shared.mbulk3 = createStringObject("*3\r\n",4);
1715 shared.mbulk4 = createStringObject("*4\r\n",4);
1716 for (j = 0; j < REDIS_SHARED_INTEGERS; j++) {
1717 shared.integers[j] = createObject(REDIS_STRING,(void*)(long)j);
1718 shared.integers[j]->encoding = REDIS_ENCODING_INT;
1719 }
1720 }
1721
1722 static void appendServerSaveParams(time_t seconds, int changes) {
1723 server.saveparams = zrealloc(server.saveparams,sizeof(struct saveparam)*(server.saveparamslen+1));
1724 server.saveparams[server.saveparamslen].seconds = seconds;
1725 server.saveparams[server.saveparamslen].changes = changes;
1726 server.saveparamslen++;
1727 }
1728
1729 static void resetServerSaveParams() {
1730 zfree(server.saveparams);
1731 server.saveparams = NULL;
1732 server.saveparamslen = 0;
1733 }
1734
1735 static void initServerConfig() {
1736 server.dbnum = REDIS_DEFAULT_DBNUM;
1737 server.port = REDIS_SERVERPORT;
1738 server.verbosity = REDIS_VERBOSE;
1739 server.maxidletime = REDIS_MAXIDLETIME;
1740 server.saveparams = NULL;
1741 server.logfile = NULL; /* NULL = log on standard output */
1742 server.bindaddr = NULL;
1743 server.glueoutputbuf = 1;
1744 server.daemonize = 0;
1745 server.appendonly = 0;
1746 server.appendfsync = APPENDFSYNC_EVERYSEC;
1747 server.no_appendfsync_on_rewrite = 0;
1748 server.lastfsync = time(NULL);
1749 server.appendfd = -1;
1750 server.appendseldb = -1; /* Make sure the first time will not match */
1751 server.pidfile = zstrdup("/var/run/redis.pid");
1752 server.dbfilename = zstrdup("dump.rdb");
1753 server.appendfilename = zstrdup("appendonly.aof");
1754 server.requirepass = NULL;
1755 server.rdbcompression = 1;
1756 server.activerehashing = 1;
1757 server.maxclients = 0;
1758 server.blpop_blocked_clients = 0;
1759 server.maxmemory = 0;
1760 server.vm_enabled = 0;
1761 server.vm_swap_file = zstrdup("/tmp/redis-%p.vm");
1762 server.vm_page_size = 256; /* 256 bytes per page */
1763 server.vm_pages = 1024*1024*100; /* 104 millions of pages */
1764 server.vm_max_memory = 1024LL*1024*1024*1; /* 1 GB of RAM */
1765 server.vm_max_threads = 4;
1766 server.vm_blocked_clients = 0;
1767 server.hash_max_zipmap_entries = REDIS_HASH_MAX_ZIPMAP_ENTRIES;
1768 server.hash_max_zipmap_value = REDIS_HASH_MAX_ZIPMAP_VALUE;
1769 server.list_max_ziplist_entries = REDIS_LIST_MAX_ZIPLIST_ENTRIES;
1770 server.list_max_ziplist_value = REDIS_LIST_MAX_ZIPLIST_VALUE;
1771 server.shutdown_asap = 0;
1772
1773 resetServerSaveParams();
1774
1775 appendServerSaveParams(60*60,1); /* save after 1 hour and 1 change */
1776 appendServerSaveParams(300,100); /* save after 5 minutes and 100 changes */
1777 appendServerSaveParams(60,10000); /* save after 1 minute and 10000 changes */
1778 /* Replication related */
1779 server.isslave = 0;
1780 server.masterauth = NULL;
1781 server.masterhost = NULL;
1782 server.masterport = 6379;
1783 server.master = NULL;
1784 server.replstate = REDIS_REPL_NONE;
1785
1786 /* Double constants initialization */
1787 R_Zero = 0.0;
1788 R_PosInf = 1.0/R_Zero;
1789 R_NegInf = -1.0/R_Zero;
1790 R_Nan = R_Zero/R_Zero;
1791 }
1792
1793 static void initServer() {
1794 int j;
1795
1796 signal(SIGHUP, SIG_IGN);
1797 signal(SIGPIPE, SIG_IGN);
1798 setupSigSegvAction();
1799
1800 server.devnull = fopen("/dev/null","w");
1801 if (server.devnull == NULL) {
1802 redisLog(REDIS_WARNING, "Can't open /dev/null: %s", server.neterr);
1803 exit(1);
1804 }
1805 server.clients = listCreate();
1806 server.slaves = listCreate();
1807 server.monitors = listCreate();
1808 server.objfreelist = listCreate();
1809 createSharedObjects();
1810 server.el = aeCreateEventLoop();
1811 server.db = zmalloc(sizeof(redisDb)*server.dbnum);
1812 server.fd = anetTcpServer(server.neterr, server.port, server.bindaddr);
1813 if (server.fd == -1) {
1814 redisLog(REDIS_WARNING, "Opening TCP port: %s", server.neterr);
1815 exit(1);
1816 }
1817 for (j = 0; j < server.dbnum; j++) {
1818 server.db[j].dict = dictCreate(&dbDictType,NULL);
1819 server.db[j].expires = dictCreate(&keyptrDictType,NULL);
1820 server.db[j].blocking_keys = dictCreate(&keylistDictType,NULL);
1821 server.db[j].watched_keys = dictCreate(&keylistDictType,NULL);
1822 if (server.vm_enabled)
1823 server.db[j].io_keys = dictCreate(&keylistDictType,NULL);
1824 server.db[j].id = j;
1825 }
1826 server.pubsub_channels = dictCreate(&keylistDictType,NULL);
1827 server.pubsub_patterns = listCreate();
1828 listSetFreeMethod(server.pubsub_patterns,freePubsubPattern);
1829 listSetMatchMethod(server.pubsub_patterns,listMatchPubsubPattern);
1830 server.cronloops = 0;
1831 server.bgsavechildpid = -1;
1832 server.bgrewritechildpid = -1;
1833 server.bgrewritebuf = sdsempty();
1834 server.aofbuf = sdsempty();
1835 server.lastsave = time(NULL);
1836 server.dirty = 0;
1837 server.stat_numcommands = 0;
1838 server.stat_numconnections = 0;
1839 server.stat_expiredkeys = 0;
1840 server.stat_starttime = time(NULL);
1841 server.unixtime = time(NULL);
1842 aeCreateTimeEvent(server.el, 1, serverCron, NULL, NULL);
1843 if (aeCreateFileEvent(server.el, server.fd, AE_READABLE,
1844 acceptHandler, NULL) == AE_ERR) oom("creating file event");
1845
1846 if (server.appendonly) {
1847 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
1848 if (server.appendfd == -1) {
1849 redisLog(REDIS_WARNING, "Can't open the append-only file: %s",
1850 strerror(errno));
1851 exit(1);
1852 }
1853 }
1854
1855 if (server.vm_enabled) vmInit();
1856 }
1857
1858 /* Empty the whole database */
1859 static long long emptyDb() {
1860 int j;
1861 long long removed = 0;
1862
1863 for (j = 0; j < server.dbnum; j++) {
1864 removed += dictSize(server.db[j].dict);
1865 dictEmpty(server.db[j].dict);
1866 dictEmpty(server.db[j].expires);
1867 }
1868 return removed;
1869 }
1870
/* Convert a "yes"/"no" string (compared case insensitively) into an
 * integer: 1 for "yes", 0 for "no", -1 for anything else. */
static int yesnotoi(char *s) {
    if (strcasecmp(s,"yes") == 0) return 1;
    if (strcasecmp(s,"no") == 0) return 0;
    return -1;
}
1876
1877 /* I agree, this is a very rudimental way to load a configuration...
1878 will improve later if the config gets more complex */
1879 static void loadServerConfig(char *filename) {
1880 FILE *fp;
1881 char buf[REDIS_CONFIGLINE_MAX+1], *err = NULL;
1882 int linenum = 0;
1883 sds line = NULL;
1884
1885 if (filename[0] == '-' && filename[1] == '\0')
1886 fp = stdin;
1887 else {
1888 if ((fp = fopen(filename,"r")) == NULL) {
1889 redisLog(REDIS_WARNING, "Fatal error, can't open config file '%s'", filename);
1890 exit(1);
1891 }
1892 }
1893
1894 while(fgets(buf,REDIS_CONFIGLINE_MAX+1,fp) != NULL) {
1895 sds *argv;
1896 int argc, j;
1897
1898 linenum++;
1899 line = sdsnew(buf);
1900 line = sdstrim(line," \t\r\n");
1901
1902 /* Skip comments and blank lines*/
1903 if (line[0] == '#' || line[0] == '\0') {
1904 sdsfree(line);
1905 continue;
1906 }
1907
1908 /* Split into arguments */
1909 argv = sdssplitlen(line,sdslen(line)," ",1,&argc);
1910 sdstolower(argv[0]);
1911
1912 /* Execute config directives */
1913 if (!strcasecmp(argv[0],"timeout") && argc == 2) {
1914 server.maxidletime = atoi(argv[1]);
1915 if (server.maxidletime < 0) {
1916 err = "Invalid timeout value"; goto loaderr;
1917 }
1918 } else if (!strcasecmp(argv[0],"port") && argc == 2) {
1919 server.port = atoi(argv[1]);
1920 if (server.port < 1 || server.port > 65535) {
1921 err = "Invalid port"; goto loaderr;
1922 }
1923 } else if (!strcasecmp(argv[0],"bind") && argc == 2) {
1924 server.bindaddr = zstrdup(argv[1]);
1925 } else if (!strcasecmp(argv[0],"save") && argc == 3) {
1926 int seconds = atoi(argv[1]);
1927 int changes = atoi(argv[2]);
1928 if (seconds < 1 || changes < 0) {
1929 err = "Invalid save parameters"; goto loaderr;
1930 }
1931 appendServerSaveParams(seconds,changes);
1932 } else if (!strcasecmp(argv[0],"dir") && argc == 2) {
1933 if (chdir(argv[1]) == -1) {
1934 redisLog(REDIS_WARNING,"Can't chdir to '%s': %s",
1935 argv[1], strerror(errno));
1936 exit(1);
1937 }
1938 } else if (!strcasecmp(argv[0],"loglevel") && argc == 2) {
1939 if (!strcasecmp(argv[1],"debug")) server.verbosity = REDIS_DEBUG;
1940 else if (!strcasecmp(argv[1],"verbose")) server.verbosity = REDIS_VERBOSE;
1941 else if (!strcasecmp(argv[1],"notice")) server.verbosity = REDIS_NOTICE;
1942 else if (!strcasecmp(argv[1],"warning")) server.verbosity = REDIS_WARNING;
1943 else {
1944 err = "Invalid log level. Must be one of debug, notice, warning";
1945 goto loaderr;
1946 }
1947 } else if (!strcasecmp(argv[0],"logfile") && argc == 2) {
1948 FILE *logfp;
1949
1950 server.logfile = zstrdup(argv[1]);
1951 if (!strcasecmp(server.logfile,"stdout")) {
1952 zfree(server.logfile);
1953 server.logfile = NULL;
1954 }
1955 if (server.logfile) {
1956 /* Test if we are able to open the file. The server will not
1957 * be able to abort just for this problem later... */
1958 logfp = fopen(server.logfile,"a");
1959 if (logfp == NULL) {
1960 err = sdscatprintf(sdsempty(),
1961 "Can't open the log file: %s", strerror(errno));
1962 goto loaderr;
1963 }
1964 fclose(logfp);
1965 }
1966 } else if (!strcasecmp(argv[0],"databases") && argc == 2) {
1967 server.dbnum = atoi(argv[1]);
1968 if (server.dbnum < 1) {
1969 err = "Invalid number of databases"; goto loaderr;
1970 }
1971 } else if (!strcasecmp(argv[0],"include") && argc == 2) {
1972 loadServerConfig(argv[1]);
1973 } else if (!strcasecmp(argv[0],"maxclients") && argc == 2) {
1974 server.maxclients = atoi(argv[1]);
1975 } else if (!strcasecmp(argv[0],"maxmemory") && argc == 2) {
1976 server.maxmemory = memtoll(argv[1],NULL);
1977 } else if (!strcasecmp(argv[0],"slaveof") && argc == 3) {
1978 server.masterhost = sdsnew(argv[1]);
1979 server.masterport = atoi(argv[2]);
1980 server.replstate = REDIS_REPL_CONNECT;
1981 } else if (!strcasecmp(argv[0],"masterauth") && argc == 2) {
1982 server.masterauth = zstrdup(argv[1]);
1983 } else if (!strcasecmp(argv[0],"glueoutputbuf") && argc == 2) {
1984 if ((server.glueoutputbuf = yesnotoi(argv[1])) == -1) {
1985 err = "argument must be 'yes' or 'no'"; goto loaderr;
1986 }
1987 } else if (!strcasecmp(argv[0],"rdbcompression") && argc == 2) {
1988 if ((server.rdbcompression = yesnotoi(argv[1])) == -1) {
1989 err = "argument must be 'yes' or 'no'"; goto loaderr;
1990 }
1991 } else if (!strcasecmp(argv[0],"activerehashing") && argc == 2) {
1992 if ((server.activerehashing = yesnotoi(argv[1])) == -1) {
1993 err = "argument must be 'yes' or 'no'"; goto loaderr;
1994 }
1995 } else if (!strcasecmp(argv[0],"daemonize") && argc == 2) {
1996 if ((server.daemonize = yesnotoi(argv[1])) == -1) {
1997 err = "argument must be 'yes' or 'no'"; goto loaderr;
1998 }
1999 } else if (!strcasecmp(argv[0],"appendonly") && argc == 2) {
2000 if ((server.appendonly = yesnotoi(argv[1])) == -1) {
2001 err = "argument must be 'yes' or 'no'"; goto loaderr;
2002 }
2003 } else if (!strcasecmp(argv[0],"appendfilename") && argc == 2) {
2004 zfree(server.appendfilename);
2005 server.appendfilename = zstrdup(argv[1]);
2006 } else if (!strcasecmp(argv[0],"no-appendfsync-on-rewrite")
2007 && argc == 2) {
2008 if ((server.no_appendfsync_on_rewrite= yesnotoi(argv[1])) == -1) {
2009 err = "argument must be 'yes' or 'no'"; goto loaderr;
2010 }
2011 } else if (!strcasecmp(argv[0],"appendfsync") && argc == 2) {
2012 if (!strcasecmp(argv[1],"no")) {
2013 server.appendfsync = APPENDFSYNC_NO;
2014 } else if (!strcasecmp(argv[1],"always")) {
2015 server.appendfsync = APPENDFSYNC_ALWAYS;
2016 } else if (!strcasecmp(argv[1],"everysec")) {
2017 server.appendfsync = APPENDFSYNC_EVERYSEC;
2018 } else {
2019 err = "argument must be 'no', 'always' or 'everysec'";
2020 goto loaderr;
2021 }
2022 } else if (!strcasecmp(argv[0],"requirepass") && argc == 2) {
2023 server.requirepass = zstrdup(argv[1]);
2024 } else if (!strcasecmp(argv[0],"pidfile") && argc == 2) {
2025 zfree(server.pidfile);
2026 server.pidfile = zstrdup(argv[1]);
2027 } else if (!strcasecmp(argv[0],"dbfilename") && argc == 2) {
2028 zfree(server.dbfilename);
2029 server.dbfilename = zstrdup(argv[1]);
2030 } else if (!strcasecmp(argv[0],"vm-enabled") && argc == 2) {
2031 if ((server.vm_enabled = yesnotoi(argv[1])) == -1) {
2032 err = "argument must be 'yes' or 'no'"; goto loaderr;
2033 }
2034 } else if (!strcasecmp(argv[0],"vm-swap-file") && argc == 2) {
2035 zfree(server.vm_swap_file);
2036 server.vm_swap_file = zstrdup(argv[1]);
2037 } else if (!strcasecmp(argv[0],"vm-max-memory") && argc == 2) {
2038 server.vm_max_memory = memtoll(argv[1],NULL);
2039 } else if (!strcasecmp(argv[0],"vm-page-size") && argc == 2) {
2040 server.vm_page_size = memtoll(argv[1], NULL);
2041 } else if (!strcasecmp(argv[0],"vm-pages") && argc == 2) {
2042 server.vm_pages = memtoll(argv[1], NULL);
2043 } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
2044 server.vm_max_threads = strtoll(argv[1], NULL, 10);
2045 } else if (!strcasecmp(argv[0],"hash-max-zipmap-entries") && argc == 2){
2046 server.hash_max_zipmap_entries = memtoll(argv[1], NULL);
2047 } else if (!strcasecmp(argv[0],"hash-max-zipmap-value") && argc == 2){
2048 server.hash_max_zipmap_value = memtoll(argv[1], NULL);
2049 } else if (!strcasecmp(argv[0],"list-max-ziplist-entries") && argc == 2){
2050 server.list_max_ziplist_entries = memtoll(argv[1], NULL);
2051 } else if (!strcasecmp(argv[0],"list-max-ziplist-value") && argc == 2){
2052 server.list_max_ziplist_value = memtoll(argv[1], NULL);
2053 } else {
2054 err = "Bad directive or wrong number of arguments"; goto loaderr;
2055 }
2056 for (j = 0; j < argc; j++)
2057 sdsfree(argv[j]);
2058 zfree(argv);
2059 sdsfree(line);
2060 }
2061 if (fp != stdin) fclose(fp);
2062 return;
2063
2064 loaderr:
2065 fprintf(stderr, "\n*** FATAL CONFIG FILE ERROR ***\n");
2066 fprintf(stderr, "Reading the configuration file, at line %d\n", linenum);
2067 fprintf(stderr, ">>> '%s'\n", line);
2068 fprintf(stderr, "%s\n", err);
2069 exit(1);
2070 }
2071
2072 static void freeClientArgv(redisClient *c) {
2073 int j;
2074
2075 for (j = 0; j < c->argc; j++)
2076 decrRefCount(c->argv[j]);
2077 for (j = 0; j < c->mbargc; j++)
2078 decrRefCount(c->mbargv[j]);
2079 c->argc = 0;
2080 c->mbargc = 0;
2081 }
2082
2083 static void freeClient(redisClient *c) {
2084 listNode *ln;
2085
2086 /* Note that if the client we are freeing is blocked into a blocking
2087 * call, we have to set querybuf to NULL *before* to call
2088 * unblockClientWaitingData() to avoid processInputBuffer() will get
2089 * called. Also it is important to remove the file events after
2090 * this, because this call adds the READABLE event. */
2091 sdsfree(c->querybuf);
2092 c->querybuf = NULL;
2093 if (c->flags & REDIS_BLOCKED)
2094 unblockClientWaitingData(c);
2095
2096 /* UNWATCH all the keys */
2097 unwatchAllKeys(c);
2098 listRelease(c->watched_keys);
2099 /* Unsubscribe from all the pubsub channels */
2100 pubsubUnsubscribeAllChannels(c,0);
2101 pubsubUnsubscribeAllPatterns(c,0);
2102 dictRelease(c->pubsub_channels);
2103 listRelease(c->pubsub_patterns);
2104 /* Obvious cleanup */
2105 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
2106 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2107 listRelease(c->reply);
2108 freeClientArgv(c);
2109 close(c->fd);
2110 /* Remove from the list of clients */
2111 ln = listSearchKey(server.clients,c);
2112 redisAssert(ln != NULL);
2113 listDelNode(server.clients,ln);
2114 /* Remove from the list of clients that are now ready to be restarted
2115 * after waiting for swapped keys */
2116 if (c->flags & REDIS_IO_WAIT && listLength(c->io_keys) == 0) {
2117 ln = listSearchKey(server.io_ready_clients,c);
2118 if (ln) {
2119 listDelNode(server.io_ready_clients,ln);
2120 server.vm_blocked_clients--;
2121 }
2122 }
2123 /* Remove from the list of clients waiting for swapped keys */
2124 while (server.vm_enabled && listLength(c->io_keys)) {
2125 ln = listFirst(c->io_keys);
2126 dontWaitForSwappedKey(c,ln->value);
2127 }
2128 listRelease(c->io_keys);
2129 /* Master/slave cleanup */
2130 if (c->flags & REDIS_SLAVE) {
2131 if (c->replstate == REDIS_REPL_SEND_BULK && c->repldbfd != -1)
2132 close(c->repldbfd);
2133 list *l = (c->flags & REDIS_MONITOR) ? server.monitors : server.slaves;
2134 ln = listSearchKey(l,c);
2135 redisAssert(ln != NULL);
2136 listDelNode(l,ln);
2137 }
2138 if (c->flags & REDIS_MASTER) {
2139 server.master = NULL;
2140 server.replstate = REDIS_REPL_CONNECT;
2141 }
2142 /* Release memory */
2143 zfree(c->argv);
2144 zfree(c->mbargv);
2145 freeClientMultiState(c);
2146 zfree(c);
2147 }
2148
2149 #define GLUEREPLY_UP_TO (1024)
2150 static void glueReplyBuffersIfNeeded(redisClient *c) {
2151 int copylen = 0;
2152 char buf[GLUEREPLY_UP_TO];
2153 listNode *ln;
2154 listIter li;
2155 robj *o;
2156
2157 listRewind(c->reply,&li);
2158 while((ln = listNext(&li))) {
2159 int objlen;
2160
2161 o = ln->value;
2162 objlen = sdslen(o->ptr);
2163 if (copylen + objlen <= GLUEREPLY_UP_TO) {
2164 memcpy(buf+copylen,o->ptr,objlen);
2165 copylen += objlen;
2166 listDelNode(c->reply,ln);
2167 } else {
2168 if (copylen == 0) return;
2169 break;
2170 }
2171 }
2172 /* Now the output buffer is empty, add the new single element */
2173 o = createObject(REDIS_STRING,sdsnewlen(buf,copylen));
2174 listAddNodeHead(c->reply,o);
2175 }
2176
2177 static void sendReplyToClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2178 redisClient *c = privdata;
2179 int nwritten = 0, totwritten = 0, objlen;
2180 robj *o;
2181 REDIS_NOTUSED(el);
2182 REDIS_NOTUSED(mask);
2183
2184 /* Use writev() if we have enough buffers to send */
2185 if (!server.glueoutputbuf &&
2186 listLength(c->reply) > REDIS_WRITEV_THRESHOLD &&
2187 !(c->flags & REDIS_MASTER))
2188 {
2189 sendReplyToClientWritev(el, fd, privdata, mask);
2190 return;
2191 }
2192
2193 while(listLength(c->reply)) {
2194 if (server.glueoutputbuf && listLength(c->reply) > 1)
2195 glueReplyBuffersIfNeeded(c);
2196
2197 o = listNodeValue(listFirst(c->reply));
2198 objlen = sdslen(o->ptr);
2199
2200 if (objlen == 0) {
2201 listDelNode(c->reply,listFirst(c->reply));
2202 continue;
2203 }
2204
2205 if (c->flags & REDIS_MASTER) {
2206 /* Don't reply to a master */
2207 nwritten = objlen - c->sentlen;
2208 } else {
2209 nwritten = write(fd, ((char*)o->ptr)+c->sentlen, objlen - c->sentlen);
2210 if (nwritten <= 0) break;
2211 }
2212 c->sentlen += nwritten;
2213 totwritten += nwritten;
2214 /* If we fully sent the object on head go to the next one */
2215 if (c->sentlen == objlen) {
2216 listDelNode(c->reply,listFirst(c->reply));
2217 c->sentlen = 0;
2218 }
2219 /* Note that we avoid to send more thank REDIS_MAX_WRITE_PER_EVENT
2220 * bytes, in a single threaded server it's a good idea to serve
2221 * other clients as well, even if a very large request comes from
2222 * super fast link that is always able to accept data (in real world
2223 * scenario think about 'KEYS *' against the loopback interfae) */
2224 if (totwritten > REDIS_MAX_WRITE_PER_EVENT) break;
2225 }
2226 if (nwritten == -1) {
2227 if (errno == EAGAIN) {
2228 nwritten = 0;
2229 } else {
2230 redisLog(REDIS_VERBOSE,
2231 "Error writing to client: %s", strerror(errno));
2232 freeClient(c);
2233 return;
2234 }
2235 }
2236 if (totwritten > 0) c->lastinteraction = time(NULL);
2237 if (listLength(c->reply) == 0) {
2238 c->sentlen = 0;
2239 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2240 }
2241 }
2242
2243 static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask)
2244 {
2245 redisClient *c = privdata;
2246 int nwritten = 0, totwritten = 0, objlen, willwrite;
2247 robj *o;
2248 struct iovec iov[REDIS_WRITEV_IOVEC_COUNT];
2249 int offset, ion = 0;
2250 REDIS_NOTUSED(el);
2251 REDIS_NOTUSED(mask);
2252
2253 listNode *node;
2254 while (listLength(c->reply)) {
2255 offset = c->sentlen;
2256 ion = 0;
2257 willwrite = 0;
2258
2259 /* fill-in the iov[] array */
2260 for(node = listFirst(c->reply); node; node = listNextNode(node)) {
2261 o = listNodeValue(node);
2262 objlen = sdslen(o->ptr);
2263
2264 if (totwritten + objlen - offset > REDIS_MAX_WRITE_PER_EVENT)
2265 break;
2266
2267 if(ion == REDIS_WRITEV_IOVEC_COUNT)
2268 break; /* no more iovecs */
2269
2270 iov[ion].iov_base = ((char*)o->ptr) + offset;
2271 iov[ion].iov_len = objlen - offset;
2272 willwrite += objlen - offset;
2273 offset = 0; /* just for the first item */
2274 ion++;
2275 }
2276
2277 if(willwrite == 0)
2278 break;
2279
2280 /* write all collected blocks at once */
2281 if((nwritten = writev(fd, iov, ion)) < 0) {
2282 if (errno != EAGAIN) {
2283 redisLog(REDIS_VERBOSE,
2284 "Error writing to client: %s", strerror(errno));
2285 freeClient(c);
2286 return;
2287 }
2288 break;
2289 }
2290
2291 totwritten += nwritten;
2292 offset = c->sentlen;
2293
2294 /* remove written robjs from c->reply */
2295 while (nwritten && listLength(c->reply)) {
2296 o = listNodeValue(listFirst(c->reply));
2297 objlen = sdslen(o->ptr);
2298
2299 if(nwritten >= objlen - offset) {
2300 listDelNode(c->reply, listFirst(c->reply));
2301 nwritten -= objlen - offset;
2302 c->sentlen = 0;
2303 } else {
2304 /* partial write */
2305 c->sentlen += nwritten;
2306 break;
2307 }
2308 offset = 0;
2309 }
2310 }
2311
2312 if (totwritten > 0)
2313 c->lastinteraction = time(NULL);
2314
2315 if (listLength(c->reply) == 0) {
2316 c->sentlen = 0;
2317 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2318 }
2319 }
2320
2321 static int qsortRedisCommands(const void *r1, const void *r2) {
2322 return strcasecmp(
2323 ((struct redisCommand*)r1)->name,
2324 ((struct redisCommand*)r2)->name);
2325 }
2326
2327 static void sortCommandTable() {
2328 /* Copy and sort the read-only version of the command table */
2329 commandTable = (struct redisCommand*)malloc(sizeof(readonlyCommandTable));
2330 memcpy(commandTable,readonlyCommandTable,sizeof(readonlyCommandTable));
2331 qsort(commandTable,
2332 sizeof(readonlyCommandTable)/sizeof(struct redisCommand),
2333 sizeof(struct redisCommand),qsortRedisCommands);
2334 }
2335
2336 static struct redisCommand *lookupCommand(char *name) {
2337 struct redisCommand tmp = {name,NULL,0,0,NULL,0,0,0};
2338 return bsearch(
2339 &tmp,
2340 commandTable,
2341 sizeof(readonlyCommandTable)/sizeof(struct redisCommand),
2342 sizeof(struct redisCommand),
2343 qsortRedisCommands);
2344 }
2345
2346 /* resetClient prepare the client to process the next command */
2347 static void resetClient(redisClient *c) {
2348 freeClientArgv(c);
2349 c->bulklen = -1;
2350 c->multibulk = 0;
2351 }
2352
2353 /* Call() is the core of Redis execution of a command */
2354 static void call(redisClient *c, struct redisCommand *cmd) {
2355 long long dirty;
2356
2357 dirty = server.dirty;
2358 cmd->proc(c);
2359 dirty = server.dirty-dirty;
2360
2361 if (server.appendonly && dirty)
2362 feedAppendOnlyFile(cmd,c->db->id,c->argv,c->argc);
2363 if ((dirty || cmd->flags & REDIS_CMD_FORCE_REPLICATION) &&
2364 listLength(server.slaves))
2365 replicationFeedSlaves(server.slaves,c->db->id,c->argv,c->argc);
2366 if (listLength(server.monitors))
2367 replicationFeedMonitors(server.monitors,c->db->id,c->argv,c->argc);
2368 server.stat_numcommands++;
2369 }
2370
2371 /* If this function gets called we already read a whole
2372 * command, arguments are in the client argv/argc fields.
2373 * processCommand() executes the command or prepares the
2374 * server for a bulk read from the client.
2375 *
2376 * If 1 is returned the client is still alive and valid and
2377 * other operations can be performed by the caller. Otherwise
2378 * if 0 is returned the client was destroyed (i.e. after QUIT). */
2379 static int processCommand(redisClient *c) {
2380 struct redisCommand *cmd;
2381
2382 /* Free some memory if needed (maxmemory setting) */
2383 if (server.maxmemory) freeMemoryIfNeeded();
2384
2385 /* Handle the multi bulk command type. This is an alternative protocol
2386 * supported by Redis in order to receive commands that are composed of
2387 * multiple binary-safe "bulk" arguments. The latency of processing is
2388 * a bit higher but this allows things like multi-sets, so if this
2389 * protocol is used only for MSET and similar commands this is a big win. */
2390 if (c->multibulk == 0 && c->argc == 1 && ((char*)(c->argv[0]->ptr))[0] == '*') {
2391 c->multibulk = atoi(((char*)c->argv[0]->ptr)+1);
2392 if (c->multibulk <= 0) {
2393 resetClient(c);
2394 return 1;
2395 } else {
2396 decrRefCount(c->argv[c->argc-1]);
2397 c->argc--;
2398 return 1;
2399 }
2400 } else if (c->multibulk) {
2401 if (c->bulklen == -1) {
2402 if (((char*)c->argv[0]->ptr)[0] != '$') {
2403 addReplySds(c,sdsnew("-ERR multi bulk protocol error\r\n"));
2404 resetClient(c);
2405 return 1;
2406 } else {
2407 int bulklen = atoi(((char*)c->argv[0]->ptr)+1);
2408 decrRefCount(c->argv[0]);
2409 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2410 c->argc--;
2411 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2412 resetClient(c);
2413 return 1;
2414 }
2415 c->argc--;
2416 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2417 return 1;
2418 }
2419 } else {
2420 c->mbargv = zrealloc(c->mbargv,(sizeof(robj*))*(c->mbargc+1));
2421 c->mbargv[c->mbargc] = c->argv[0];
2422 c->mbargc++;
2423 c->argc--;
2424 c->multibulk--;
2425 if (c->multibulk == 0) {
2426 robj **auxargv;
2427 int auxargc;
2428
2429 /* Here we need to swap the multi-bulk argc/argv with the
2430 * normal argc/argv of the client structure. */
2431 auxargv = c->argv;
2432 c->argv = c->mbargv;
2433 c->mbargv = auxargv;
2434
2435 auxargc = c->argc;
2436 c->argc = c->mbargc;
2437 c->mbargc = auxargc;
2438
2439 /* We need to set bulklen to something different than -1
2440 * in order for the code below to process the command without
2441 * to try to read the last argument of a bulk command as
2442 * a special argument. */
2443 c->bulklen = 0;
2444 /* continue below and process the command */
2445 } else {
2446 c->bulklen = -1;
2447 return 1;
2448 }
2449 }
2450 }
2451 /* -- end of multi bulk commands processing -- */
2452
2453 /* The QUIT command is handled as a special case. Normal command
2454 * procs are unable to close the client connection safely */
2455 if (!strcasecmp(c->argv[0]->ptr,"quit")) {
2456 freeClient(c);
2457 return 0;
2458 }
2459
2460 /* Now lookup the command and check ASAP about trivial error conditions
2461 * such wrong arity, bad command name and so forth. */
2462 cmd = lookupCommand(c->argv[0]->ptr);
2463 if (!cmd) {
2464 addReplySds(c,
2465 sdscatprintf(sdsempty(), "-ERR unknown command '%s'\r\n",
2466 (char*)c->argv[0]->ptr));
2467 resetClient(c);
2468 return 1;
2469 } else if ((cmd->arity > 0 && cmd->arity != c->argc) ||
2470 (c->argc < -cmd->arity)) {
2471 addReplySds(c,
2472 sdscatprintf(sdsempty(),
2473 "-ERR wrong number of arguments for '%s' command\r\n",
2474 cmd->name));
2475 resetClient(c);
2476 return 1;
2477 } else if (cmd->flags & REDIS_CMD_BULK && c->bulklen == -1) {
2478 /* This is a bulk command, we have to read the last argument yet. */
2479 int bulklen = atoi(c->argv[c->argc-1]->ptr);
2480
2481 decrRefCount(c->argv[c->argc-1]);
2482 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2483 c->argc--;
2484 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2485 resetClient(c);
2486 return 1;
2487 }
2488 c->argc--;
2489 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2490 /* It is possible that the bulk read is already in the
2491 * buffer. Check this condition and handle it accordingly.
2492 * This is just a fast path, alternative to call processInputBuffer().
2493 * It's a good idea since the code is small and this condition
2494 * happens most of the times. */
2495 if ((signed)sdslen(c->querybuf) >= c->bulklen) {
2496 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2497 c->argc++;
2498 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2499 } else {
2500 /* Otherwise return... there is to read the last argument
2501 * from the socket. */
2502 return 1;
2503 }
2504 }
2505 /* Let's try to encode the bulk object to save space. */
2506 if (cmd->flags & REDIS_CMD_BULK)
2507 c->argv[c->argc-1] = tryObjectEncoding(c->argv[c->argc-1]);
2508
2509 /* Check if the user is authenticated */
2510 if (server.requirepass && !c->authenticated && cmd->proc != authCommand) {
2511 addReplySds(c,sdsnew("-ERR operation not permitted\r\n"));
2512 resetClient(c);
2513 return 1;
2514 }
2515
2516 /* Handle the maxmemory directive */
2517 if (server.maxmemory && (cmd->flags & REDIS_CMD_DENYOOM) &&
2518 zmalloc_used_memory() > server.maxmemory)
2519 {
2520 addReplySds(c,sdsnew("-ERR command not allowed when used memory > 'maxmemory'\r\n"));
2521 resetClient(c);
2522 return 1;
2523 }
2524
2525 /* Only allow SUBSCRIBE and UNSUBSCRIBE in the context of Pub/Sub */
2526 if ((dictSize(c->pubsub_channels) > 0 || listLength(c->pubsub_patterns) > 0)
2527 &&
2528 cmd->proc != subscribeCommand && cmd->proc != unsubscribeCommand &&
2529 cmd->proc != psubscribeCommand && cmd->proc != punsubscribeCommand) {
2530 addReplySds(c,sdsnew("-ERR only (P)SUBSCRIBE / (P)UNSUBSCRIBE / QUIT allowed in this context\r\n"));
2531 resetClient(c);
2532 return 1;
2533 }
2534
2535 /* Exec the command */
2536 if (c->flags & REDIS_MULTI &&
2537 cmd->proc != execCommand && cmd->proc != discardCommand &&
2538 cmd->proc != multiCommand && cmd->proc != watchCommand)
2539 {
2540 queueMultiCommand(c,cmd);
2541 addReply(c,shared.queued);
2542 } else {
2543 if (server.vm_enabled && server.vm_max_threads > 0 &&
2544 blockClientOnSwappedKeys(c,cmd)) return 1;
2545 call(c,cmd);
2546 }
2547
2548 /* Prepare the client for the next command */
2549 resetClient(c);
2550 return 1;
2551 }
2552
2553 static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) {
2554 listNode *ln;
2555 listIter li;
2556 int outc = 0, j;
2557 robj **outv;
2558 /* We need 1+(ARGS*3) objects since commands are using the new protocol
2559 * and we one 1 object for the first "*<count>\r\n" multibulk count, then
2560 * for every additional object we have "$<count>\r\n" + object + "\r\n". */
2561 robj *static_outv[REDIS_STATIC_ARGS*3+1];
2562 robj *lenobj;
2563
2564 if (argc <= REDIS_STATIC_ARGS) {
2565 outv = static_outv;
2566 } else {
2567 outv = zmalloc(sizeof(robj*)*(argc*3+1));
2568 }
2569
2570 lenobj = createObject(REDIS_STRING,
2571 sdscatprintf(sdsempty(), "*%d\r\n", argc));
2572 lenobj->refcount = 0;
2573 outv[outc++] = lenobj;
2574 for (j = 0; j < argc; j++) {
2575 lenobj = createObject(REDIS_STRING,
2576 sdscatprintf(sdsempty(),"$%lu\r\n",
2577 (unsigned long) stringObjectLen(argv[j])));
2578 lenobj->refcount = 0;
2579 outv[outc++] = lenobj;
2580 outv[outc++] = argv[j];
2581 outv[outc++] = shared.crlf;
2582 }
2583
2584 /* Increment all the refcounts at start and decrement at end in order to
2585 * be sure to free objects if there is no slave in a replication state
2586 * able to be feed with commands */
2587 for (j = 0; j < outc; j++) incrRefCount(outv[j]);
2588 listRewind(slaves,&li);
2589 while((ln = listNext(&li))) {
2590 redisClient *slave = ln->value;
2591
2592 /* Don't feed slaves that are still waiting for BGSAVE to start */
2593 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) continue;
2594
2595 /* Feed all the other slaves, MONITORs and so on */
2596 if (slave->slaveseldb != dictid) {
2597 robj *selectcmd;
2598
2599 switch(dictid) {
2600 case 0: selectcmd = shared.select0; break;
2601 case 1: selectcmd = shared.select1; break;
2602 case 2: selectcmd = shared.select2; break;
2603 case 3: selectcmd = shared.select3; break;
2604 case 4: selectcmd = shared.select4; break;
2605 case 5: selectcmd = shared.select5; break;
2606 case 6: selectcmd = shared.select6; break;
2607 case 7: selectcmd = shared.select7; break;
2608 case 8: selectcmd = shared.select8; break;
2609 case 9: selectcmd = shared.select9; break;
2610 default:
2611 selectcmd = createObject(REDIS_STRING,
2612 sdscatprintf(sdsempty(),"select %d\r\n",dictid));
2613 selectcmd->refcount = 0;
2614 break;
2615 }
2616 addReply(slave,selectcmd);
2617 slave->slaveseldb = dictid;
2618 }
2619 for (j = 0; j < outc; j++) addReply(slave,outv[j]);
2620 }
2621 for (j = 0; j < outc; j++) decrRefCount(outv[j]);
2622 if (outv != static_outv) zfree(outv);
2623 }
2624
2625 static sds sdscatrepr(sds s, char *p, size_t len) {
2626 s = sdscatlen(s,"\"",1);
2627 while(len--) {
2628 switch(*p) {
2629 case '\\':
2630 case '"':
2631 s = sdscatprintf(s,"\\%c",*p);
2632 break;
2633 case '\n': s = sdscatlen(s,"\\n",1); break;
2634 case '\r': s = sdscatlen(s,"\\r",1); break;
2635 case '\t': s = sdscatlen(s,"\\t",1); break;
2636 case '\a': s = sdscatlen(s,"\\a",1); break;
2637 case '\b': s = sdscatlen(s,"\\b",1); break;
2638 default:
2639 if (isprint(*p))
2640 s = sdscatprintf(s,"%c",*p);
2641 else
2642 s = sdscatprintf(s,"\\x%02x",(unsigned char)*p);
2643 break;
2644 }
2645 p++;
2646 }
2647 return sdscatlen(s,"\"",1);
2648 }
2649
2650 static void replicationFeedMonitors(list *monitors, int dictid, robj **argv, int argc) {
2651 listNode *ln;
2652 listIter li;
2653 int j;
2654 sds cmdrepr = sdsnew("+");
2655 robj *cmdobj;
2656 struct timeval tv;
2657
2658 gettimeofday(&tv,NULL);
2659 cmdrepr = sdscatprintf(cmdrepr,"%ld.%ld ",(long)tv.tv_sec,(long)tv.tv_usec);
2660 if (dictid != 0) cmdrepr = sdscatprintf(cmdrepr,"(db %d) ", dictid);
2661
2662 for (j = 0; j < argc; j++) {
2663 if (argv[j]->encoding == REDIS_ENCODING_INT) {
2664 cmdrepr = sdscatprintf(cmdrepr, "%ld", (long)argv[j]->ptr);
2665 } else {
2666 cmdrepr = sdscatrepr(cmdrepr,(char*)argv[j]->ptr,
2667 sdslen(argv[j]->ptr));
2668 }
2669 if (j != argc-1)
2670 cmdrepr = sdscatlen(cmdrepr," ",1);
2671 }
2672 cmdrepr = sdscatlen(cmdrepr,"\r\n",2);
2673 cmdobj = createObject(REDIS_STRING,cmdrepr);
2674
2675 listRewind(monitors,&li);
2676 while((ln = listNext(&li))) {
2677 redisClient *monitor = ln->value;
2678 addReply(monitor,cmdobj);
2679 }
2680 decrRefCount(cmdobj);
2681 }
2682
2683 static void processInputBuffer(redisClient *c) {
2684 again:
2685 /* Before to process the input buffer, make sure the client is not
2686 * waitig for a blocking operation such as BLPOP. Note that the first
2687 * iteration the client is never blocked, otherwise the processInputBuffer
2688 * would not be called at all, but after the execution of the first commands
2689 * in the input buffer the client may be blocked, and the "goto again"
2690 * will try to reiterate. The following line will make it return asap. */
2691 if (c->flags & REDIS_BLOCKED || c->flags & REDIS_IO_WAIT) return;
2692 if (c->bulklen == -1) {
2693 /* Read the first line of the query */
2694 char *p = strchr(c->querybuf,'\n');
2695 size_t querylen;
2696
2697 if (p) {
2698 sds query, *argv;
2699 int argc, j;
2700
2701 query = c->querybuf;
2702 c->querybuf = sdsempty();
2703 querylen = 1+(p-(query));
2704 if (sdslen(query) > querylen) {
2705 /* leave data after the first line of the query in the buffer */
2706 c->querybuf = sdscatlen(c->querybuf,query+querylen,sdslen(query)-querylen);
2707 }
2708 *p = '\0'; /* remove "\n" */
2709 if (*(p-1) == '\r') *(p-1) = '\0'; /* and "\r" if any */
2710 sdsupdatelen(query);
2711
2712 /* Now we can split the query in arguments */
2713 argv = sdssplitlen(query,sdslen(query)," ",1,&argc);
2714 sdsfree(query);
2715
2716 if (c->argv) zfree(c->argv);
2717 c->argv = zmalloc(sizeof(robj*)*argc);
2718
2719 for (j = 0; j < argc; j++) {
2720 if (sdslen(argv[j])) {
2721 c->argv[c->argc] = createObject(REDIS_STRING,argv[j]);
2722 c->argc++;
2723 } else {
2724 sdsfree(argv[j]);
2725 }
2726 }
2727 zfree(argv);
2728 if (c->argc) {
2729 /* Execute the command. If the client is still valid
2730 * after processCommand() return and there is something
2731 * on the query buffer try to process the next command. */
2732 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2733 } else {
2734 /* Nothing to process, argc == 0. Just process the query
2735 * buffer if it's not empty or return to the caller */
2736 if (sdslen(c->querybuf)) goto again;
2737 }
2738 return;
2739 } else if (sdslen(c->querybuf) >= REDIS_REQUEST_MAX_SIZE) {
2740 redisLog(REDIS_VERBOSE, "Client protocol error");
2741 freeClient(c);
2742 return;
2743 }
2744 } else {
2745 /* Bulk read handling. Note that if we are at this point
2746 the client already sent a command terminated with a newline,
2747 we are reading the bulk data that is actually the last
2748 argument of the command. */
2749 int qbl = sdslen(c->querybuf);
2750
2751 if (c->bulklen <= qbl) {
2752 /* Copy everything but the final CRLF as final argument */
2753 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2754 c->argc++;
2755 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2756 /* Process the command. If the client is still valid after
2757 * the processing and there is more data in the buffer
2758 * try to parse it. */
2759 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2760 return;
2761 }
2762 }
2763 }
2764
2765 static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2766 redisClient *c = (redisClient*) privdata;
2767 char buf[REDIS_IOBUF_LEN];
2768 int nread;
2769 REDIS_NOTUSED(el);
2770 REDIS_NOTUSED(mask);
2771
2772 nread = read(fd, buf, REDIS_IOBUF_LEN);
2773 if (nread == -1) {
2774 if (errno == EAGAIN) {
2775 nread = 0;
2776 } else {
2777 redisLog(REDIS_VERBOSE, "Reading from client: %s",strerror(errno));
2778 freeClient(c);
2779 return;
2780 }
2781 } else if (nread == 0) {
2782 redisLog(REDIS_VERBOSE, "Client closed connection");
2783 freeClient(c);
2784 return;
2785 }
2786 if (nread) {
2787 c->querybuf = sdscatlen(c->querybuf, buf, nread);
2788 c->lastinteraction = time(NULL);
2789 } else {
2790 return;
2791 }
2792 processInputBuffer(c);
2793 }
2794
2795 static int selectDb(redisClient *c, int id) {
2796 if (id < 0 || id >= server.dbnum)
2797 return REDIS_ERR;
2798 c->db = &server.db[id];
2799 return REDIS_OK;
2800 }
2801
2802 static void *dupClientReplyValue(void *o) {
2803 incrRefCount((robj*)o);
2804 return o;
2805 }
2806
2807 static int listMatchObjects(void *a, void *b) {
2808 return equalStringObjects(a,b);
2809 }
2810
2811 static redisClient *createClient(int fd) {
2812 redisClient *c = zmalloc(sizeof(*c));
2813
2814 anetNonBlock(NULL,fd);
2815 anetTcpNoDelay(NULL,fd);
2816 if (!c) return NULL;
2817 selectDb(c,0);
2818 c->fd = fd;
2819 c->querybuf = sdsempty();
2820 c->argc = 0;
2821 c->argv = NULL;
2822 c->bulklen = -1;
2823 c->multibulk = 0;
2824 c->mbargc = 0;
2825 c->mbargv = NULL;
2826 c->sentlen = 0;
2827 c->flags = 0;
2828 c->lastinteraction = time(NULL);
2829 c->authenticated = 0;
2830 c->replstate = REDIS_REPL_NONE;
2831 c->reply = listCreate();
2832 listSetFreeMethod(c->reply,decrRefCount);
2833 listSetDupMethod(c->reply,dupClientReplyValue);
2834 c->blocking_keys = NULL;
2835 c->blocking_keys_num = 0;
2836 c->io_keys = listCreate();
2837 c->watched_keys = listCreate();
2838 listSetFreeMethod(c->io_keys,decrRefCount);
2839 c->pubsub_channels = dictCreate(&setDictType,NULL);
2840 c->pubsub_patterns = listCreate();
2841 listSetFreeMethod(c->pubsub_patterns,decrRefCount);
2842 listSetMatchMethod(c->pubsub_patterns,listMatchObjects);
2843 if (aeCreateFileEvent(server.el, c->fd, AE_READABLE,
2844 readQueryFromClient, c) == AE_ERR) {
2845 freeClient(c);
2846 return NULL;
2847 }
2848 listAddNodeTail(server.clients,c);
2849 initClientMultiState(c);
2850 return c;
2851 }
2852
2853 static void addReply(redisClient *c, robj *obj) {
2854 if (listLength(c->reply) == 0 &&
2855 (c->replstate == REDIS_REPL_NONE ||
2856 c->replstate == REDIS_REPL_ONLINE) &&
2857 aeCreateFileEvent(server.el, c->fd, AE_WRITABLE,
2858 sendReplyToClient, c) == AE_ERR) return;
2859
2860 if (server.vm_enabled && obj->storage != REDIS_VM_MEMORY) {
2861 obj = dupStringObject(obj);
2862 obj->refcount = 0; /* getDecodedObject() will increment the refcount */
2863 }
2864 listAddNodeTail(c->reply,getDecodedObject(obj));
2865 }
2866
2867 static void addReplySds(redisClient *c, sds s) {
2868 robj *o = createObject(REDIS_STRING,s);
2869 addReply(c,o);
2870 decrRefCount(o);
2871 }
2872
2873 static void addReplyDouble(redisClient *c, double d) {
2874 char buf[128];
2875
2876 snprintf(buf,sizeof(buf),"%.17g",d);
2877 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n%s\r\n",
2878 (unsigned long) strlen(buf),buf));
2879 }
2880
2881 static void addReplyLongLong(redisClient *c, long long ll) {
2882 char buf[128];
2883 size_t len;
2884
2885 if (ll == 0) {
2886 addReply(c,shared.czero);
2887 return;
2888 } else if (ll == 1) {
2889 addReply(c,shared.cone);
2890 return;
2891 }
2892 buf[0] = ':';
2893 len = ll2string(buf+1,sizeof(buf)-1,ll);
2894 buf[len+1] = '\r';
2895 buf[len+2] = '\n';
2896 addReplySds(c,sdsnewlen(buf,len+3));
2897 }
2898
2899 static void addReplyUlong(redisClient *c, unsigned long ul) {
2900 char buf[128];
2901 size_t len;
2902
2903 if (ul == 0) {
2904 addReply(c,shared.czero);
2905 return;
2906 } else if (ul == 1) {
2907 addReply(c,shared.cone);
2908 return;
2909 }
2910 len = snprintf(buf,sizeof(buf),":%lu\r\n",ul);
2911 addReplySds(c,sdsnewlen(buf,len));
2912 }
2913
2914 static void addReplyBulkLen(redisClient *c, robj *obj) {
2915 size_t len, intlen;
2916 char buf[128];
2917
2918 if (obj->encoding == REDIS_ENCODING_RAW) {
2919 len = sdslen(obj->ptr);
2920 } else {
2921 long n = (long)obj->ptr;
2922
2923 /* Compute how many bytes will take this integer as a radix 10 string */
2924 len = 1;
2925 if (n < 0) {
2926 len++;
2927 n = -n;
2928 }
2929 while((n = n/10) != 0) {
2930 len++;
2931 }
2932 }
2933 buf[0] = '$';
2934 intlen = ll2string(buf+1,sizeof(buf)-1,(long long)len);
2935 buf[intlen+1] = '\r';
2936 buf[intlen+2] = '\n';
2937 addReplySds(c,sdsnewlen(buf,intlen+3));
2938 }
2939
2940 static void addReplyBulk(redisClient *c, robj *obj) {
2941 addReplyBulkLen(c,obj);
2942 addReply(c,obj);
2943 addReply(c,shared.crlf);
2944 }
2945
2946 static void addReplyBulkSds(redisClient *c, sds s) {
2947 robj *o = createStringObject(s, sdslen(s));
2948 addReplyBulk(c,o);
2949 decrRefCount(o);
2950 }
2951
2952 /* In the CONFIG command we need to add vanilla C string as bulk replies */
2953 static void addReplyBulkCString(redisClient *c, char *s) {
2954 if (s == NULL) {
2955 addReply(c,shared.nullbulk);
2956 } else {
2957 robj *o = createStringObject(s,strlen(s));
2958 addReplyBulk(c,o);
2959 decrRefCount(o);
2960 }
2961 }
2962
2963 static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
2964 int cport, cfd;
2965 char cip[128];
2966 redisClient *c;
2967 REDIS_NOTUSED(el);
2968 REDIS_NOTUSED(mask);
2969 REDIS_NOTUSED(privdata);
2970
2971 cfd = anetAccept(server.neterr, fd, cip, &cport);
2972 if (cfd == AE_ERR) {
2973 redisLog(REDIS_VERBOSE,"Accepting client connection: %s", server.neterr);
2974 return;
2975 }
2976 redisLog(REDIS_VERBOSE,"Accepted %s:%d", cip, cport);
2977 if ((c = createClient(cfd)) == NULL) {
2978 redisLog(REDIS_WARNING,"Error allocating resoures for the client");
2979 close(cfd); /* May be already closed, just ingore errors */
2980 return;
2981 }
2982 /* If maxclient directive is set and this is one client more... close the
2983 * connection. Note that we create the client instead to check before
2984 * for this condition, since now the socket is already set in nonblocking
2985 * mode and we can send an error for free using the Kernel I/O */
2986 if (server.maxclients && listLength(server.clients) > server.maxclients) {
2987 char *err = "-ERR max number of clients reached\r\n";
2988
2989 /* That's a best effort error message, don't check write errors */
2990 if (write(c->fd,err,strlen(err)) == -1) {
2991 /* Nothing to do, Just to avoid the warning... */
2992 }
2993 freeClient(c);
2994 return;
2995 }
2996 server.stat_numconnections++;
2997 }
2998
2999 /* ======================= Redis objects implementation ===================== */
3000
3001 static robj *createObject(int type, void *ptr) {
3002 robj *o;
3003
3004 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
3005 if (listLength(server.objfreelist)) {
3006 listNode *head = listFirst(server.objfreelist);
3007 o = listNodeValue(head);
3008 listDelNode(server.objfreelist,head);
3009 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
3010 } else {
3011 if (server.vm_enabled)
3012 pthread_mutex_unlock(&server.obj_freelist_mutex);
3013 o = zmalloc(sizeof(*o));
3014 }
3015 o->type = type;
3016 o->encoding = REDIS_ENCODING_RAW;
3017 o->ptr = ptr;
3018 o->refcount = 1;
3019 if (server.vm_enabled) {
3020 /* Note that this code may run in the context of an I/O thread
3021 * and accessing server.lruclock in theory is an error
3022 * (no locks). But in practice this is safe, and even if we read
3023 * garbage Redis will not fail. */
3024 o->lru = server.lruclock;
3025 o->storage = REDIS_VM_MEMORY;
3026 }
3027 return o;
3028 }
3029
3030 static robj *createStringObject(char *ptr, size_t len) {
3031 return createObject(REDIS_STRING,sdsnewlen(ptr,len));
3032 }
3033
3034 static robj *createStringObjectFromLongLong(long long value) {
3035 robj *o;
3036 if (value >= 0 && value < REDIS_SHARED_INTEGERS) {
3037 incrRefCount(shared.integers[value]);
3038 o = shared.integers[value];
3039 } else {
3040 if (value >= LONG_MIN && value <= LONG_MAX) {
3041 o = createObject(REDIS_STRING, NULL);
3042 o->encoding = REDIS_ENCODING_INT;
3043 o->ptr = (void*)((long)value);
3044 } else {
3045 o = createObject(REDIS_STRING,sdsfromlonglong(value));
3046 }
3047 }
3048 return o;
3049 }
3050
3051 static robj *dupStringObject(robj *o) {
3052 assert(o->encoding == REDIS_ENCODING_RAW);
3053 return createStringObject(o->ptr,sdslen(o->ptr));
3054 }
3055
3056 static robj *createListObject(void) {
3057 list *l = listCreate();
3058 robj *o = createObject(REDIS_LIST,l);
3059 listSetFreeMethod(l,decrRefCount);
3060 o->encoding = REDIS_ENCODING_LIST;
3061 return o;
3062 }
3063
3064 static robj *createZiplistObject(void) {
3065 unsigned char *zl = ziplistNew();
3066 robj *o = createObject(REDIS_LIST,zl);
3067 o->encoding = REDIS_ENCODING_ZIPLIST;
3068 return o;
3069 }
3070
3071 static robj *createSetObject(void) {
3072 dict *d = dictCreate(&setDictType,NULL);
3073 return createObject(REDIS_SET,d);
3074 }
3075
3076 static robj *createHashObject(void) {
3077 /* All the Hashes start as zipmaps. Will be automatically converted
3078 * into hash tables if there are enough elements or big elements
3079 * inside. */
3080 unsigned char *zm = zipmapNew();
3081 robj *o = createObject(REDIS_HASH,zm);
3082 o->encoding = REDIS_ENCODING_ZIPMAP;
3083 return o;
3084 }
3085
3086 static robj *createZsetObject(void) {
3087 zset *zs = zmalloc(sizeof(*zs));
3088
3089 zs->dict = dictCreate(&zsetDictType,NULL);
3090 zs->zsl = zslCreate();
3091 return createObject(REDIS_ZSET,zs);
3092 }
3093
3094 static void freeStringObject(robj *o) {
3095 if (o->encoding == REDIS_ENCODING_RAW) {
3096 sdsfree(o->ptr);
3097 }
3098 }
3099
3100 static void freeListObject(robj *o) {
3101 switch (o->encoding) {
3102 case REDIS_ENCODING_LIST:
3103 listRelease((list*) o->ptr);
3104 break;
3105 case REDIS_ENCODING_ZIPLIST:
3106 zfree(o->ptr);
3107 break;
3108 default:
3109 redisPanic("Unknown list encoding type");
3110 }
3111 }
3112
3113 static void freeSetObject(robj *o) {
3114 dictRelease((dict*) o->ptr);
3115 }
3116
3117 static void freeZsetObject(robj *o) {
3118 zset *zs = o->ptr;
3119
3120 dictRelease(zs->dict);
3121 zslFree(zs->zsl);
3122 zfree(zs);
3123 }
3124
3125 static void freeHashObject(robj *o) {
3126 switch (o->encoding) {
3127 case REDIS_ENCODING_HT:
3128 dictRelease((dict*) o->ptr);
3129 break;
3130 case REDIS_ENCODING_ZIPMAP:
3131 zfree(o->ptr);
3132 break;
3133 default:
3134 redisPanic("Unknown hash encoding type");
3135 break;
3136 }
3137 }
3138
3139 static void incrRefCount(robj *o) {
3140 o->refcount++;
3141 }
3142
3143 static void decrRefCount(void *obj) {
3144 robj *o = obj;
3145
3146 /* Object is a swapped out value, or in the process of being loaded. */
3147 if (server.vm_enabled &&
3148 (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING))
3149 {
3150 vmpointer *vp = obj;
3151 if (o->storage == REDIS_VM_LOADING) vmCancelThreadedIOJob(o);
3152 vmMarkPagesFree(vp->page,vp->usedpages);
3153 server.vm_stats_swapped_objects--;
3154 zfree(vp);
3155 return;
3156 }
3157
3158 if (o->refcount <= 0) redisPanic("decrRefCount against refcount <= 0");
3159 /* Object is in memory, or in the process of being swapped out.
3160 *
3161 * If the object is being swapped out, abort the operation on
3162 * decrRefCount even if the refcount does not drop to 0: the object
3163 * is referenced at least two times, as value of the key AND as
3164 * job->val in the iojob. So if we don't invalidate the iojob, when it is
3165 * done but the relevant key was removed in the meantime, the
3166 * complete jobs handler will not find the key about the job and the
3167 * assert will fail. */
3168 if (server.vm_enabled && o->storage == REDIS_VM_SWAPPING)
3169 vmCancelThreadedIOJob(o);
3170 if (--(o->refcount) == 0) {
3171 switch(o->type) {
3172 case REDIS_STRING: freeStringObject(o); break;
3173 case REDIS_LIST: freeListObject(o); break;
3174 case REDIS_SET: freeSetObject(o); break;
3175 case REDIS_ZSET: freeZsetObject(o); break;
3176 case REDIS_HASH: freeHashObject(o); break;
3177 default: redisPanic("Unknown object type"); break;
3178 }
3179 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
3180 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
3181 !listAddNodeHead(server.objfreelist,o))
3182 zfree(o);
3183 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
3184 }
3185 }
3186
3187 static int checkType(redisClient *c, robj *o, int type) {
3188 if (o->type != type) {
3189 addReply(c,shared.wrongtypeerr);
3190 return 1;
3191 }
3192 return 0;
3193 }
3194
3195 /* Check if the nul-terminated string 's' can be represented by a long
3196 * (that is, is a number that fits into long without any other space or
3197 * character before or after the digits).
3198 *
3199 * If so, the function returns REDIS_OK and *longval is set to the value
3200 * of the number. Otherwise REDIS_ERR is returned */
3201 static int isStringRepresentableAsLong(sds s, long *longval) {
3202 char buf[32], *endptr;
3203 long value;
3204 int slen;
3205
3206 value = strtol(s, &endptr, 10);
3207 if (endptr[0] != '\0') return REDIS_ERR;
3208 slen = ll2string(buf,32,value);
3209
3210 /* If the number converted back into a string is not identical
3211 * then it's not possible to encode the string as integer */
3212 if (sdslen(s) != (unsigned)slen || memcmp(buf,s,slen)) return REDIS_ERR;
3213 if (longval) *longval = value;
3214 return REDIS_OK;
3215 }
3216
3217 /* Try to encode a string object in order to save space */
3218 static robj *tryObjectEncoding(robj *o) {
3219 long value;
3220 sds s = o->ptr;
3221
3222 if (o->encoding != REDIS_ENCODING_RAW)
3223 return o; /* Already encoded */
3224
3225 /* It's not safe to encode shared objects: shared objects can be shared
3226 * everywhere in the "object space" of Redis. Encoded objects can only
3227 * appear as "values" (and not, for instance, as keys) */
3228 if (o->refcount > 1) return o;
3229
3230 /* Currently we try to encode only strings */
3231 redisAssert(o->type == REDIS_STRING);
3232
3233 /* Check if we can represent this string as a long integer */
3234 if (isStringRepresentableAsLong(s,&value) == REDIS_ERR) return o;
3235
3236 /* Ok, this object can be encoded */
3237 if (value >= 0 && value < REDIS_SHARED_INTEGERS) {
3238 decrRefCount(o);
3239 incrRefCount(shared.integers[value]);
3240 return shared.integers[value];
3241 } else {
3242 o->encoding = REDIS_ENCODING_INT;
3243 sdsfree(o->ptr);
3244 o->ptr = (void*) value;
3245 return o;
3246 }
3247 }
3248
3249 /* Get a decoded version of an encoded object (returned as a new object).
3250 * If the object is already raw-encoded just increment the ref count. */
3251 static robj *getDecodedObject(robj *o) {
3252 robj *dec;
3253
3254 if (o->encoding == REDIS_ENCODING_RAW) {
3255 incrRefCount(o);
3256 return o;
3257 }
3258 if (o->type == REDIS_STRING && o->encoding == REDIS_ENCODING_INT) {
3259 char buf[32];
3260
3261 ll2string(buf,32,(long)o->ptr);
3262 dec = createStringObject(buf,strlen(buf));
3263 return dec;
3264 } else {
3265 redisPanic("Unknown encoding type");
3266 }
3267 }
3268
3269 /* Compare two string objects via strcmp() or alike.
3270 * Note that the objects may be integer-encoded. In such a case we
3271 * use ll2string() to get a string representation of the numbers on the stack
3272 * and compare the strings, it's much faster than calling getDecodedObject().
3273 *
3274 * Important note: if objects are not integer encoded, but binary-safe strings,
3275 * sdscmp() from sds.c will apply memcmp() so this function ca be considered
3276 * binary safe. */
3277 static int compareStringObjects(robj *a, robj *b) {
3278 redisAssert(a->type == REDIS_STRING && b->type == REDIS_STRING);
3279 char bufa[128], bufb[128], *astr, *bstr;
3280 int bothsds = 1;
3281
3282 if (a == b) return 0;
3283 if (a->encoding != REDIS_ENCODING_RAW) {
3284 ll2string(bufa,sizeof(bufa),(long) a->ptr);
3285 astr = bufa;
3286 bothsds = 0;
3287 } else {
3288 astr = a->ptr;
3289 }
3290 if (b->encoding != REDIS_ENCODING_RAW) {
3291 ll2string(bufb,sizeof(bufb),(long) b->ptr);
3292 bstr = bufb;
3293 bothsds = 0;
3294 } else {
3295 bstr = b->ptr;
3296 }
3297 return bothsds ? sdscmp(astr,bstr) : strcmp(astr,bstr);
3298 }
3299
3300 /* Equal string objects return 1 if the two objects are the same from the
3301 * point of view of a string comparison, otherwise 0 is returned. Note that
3302 * this function is faster then checking for (compareStringObject(a,b) == 0)
3303 * because it can perform some more optimization. */
3304 static int equalStringObjects(robj *a, robj *b) {
3305 if (a->encoding != REDIS_ENCODING_RAW && b->encoding != REDIS_ENCODING_RAW){
3306 return a->ptr == b->ptr;
3307 } else {
3308 return compareStringObjects(a,b) == 0;
3309 }
3310 }
3311
3312 static size_t stringObjectLen(robj *o) {
3313 redisAssert(o->type == REDIS_STRING);
3314 if (o->encoding == REDIS_ENCODING_RAW) {
3315 return sdslen(o->ptr);
3316 } else {
3317 char buf[32];
3318
3319 return ll2string(buf,32,(long)o->ptr);
3320 }
3321 }
3322
3323 static int getDoubleFromObject(robj *o, double *target) {
3324 double value;
3325 char *eptr;
3326
3327 if (o == NULL) {
3328 value = 0;
3329 } else {
3330 redisAssert(o->type == REDIS_STRING);
3331 if (o->encoding == REDIS_ENCODING_RAW) {
3332 value = strtod(o->ptr, &eptr);
3333 if (eptr[0] != '\0') return REDIS_ERR;
3334 } else if (o->encoding == REDIS_ENCODING_INT) {
3335 value = (long)o->ptr;
3336 } else {
3337 redisPanic("Unknown string encoding");
3338 }
3339 }
3340
3341 *target = value;
3342 return REDIS_OK;
3343 }
3344
3345 static int getDoubleFromObjectOrReply(redisClient *c, robj *o, double *target, const char *msg) {
3346 double value;
3347 if (getDoubleFromObject(o, &value) != REDIS_OK) {
3348 if (msg != NULL) {
3349 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3350 } else {
3351 addReplySds(c, sdsnew("-ERR value is not a double\r\n"));
3352 }
3353 return REDIS_ERR;
3354 }
3355
3356 *target = value;
3357 return REDIS_OK;
3358 }
3359
3360 static int getLongLongFromObject(robj *o, long long *target) {
3361 long long value;
3362 char *eptr;
3363
3364 if (o == NULL) {
3365 value = 0;
3366 } else {
3367 redisAssert(o->type == REDIS_STRING);
3368 if (o->encoding == REDIS_ENCODING_RAW) {
3369 value = strtoll(o->ptr, &eptr, 10);
3370 if (eptr[0] != '\0') return REDIS_ERR;
3371 } else if (o->encoding == REDIS_ENCODING_INT) {
3372 value = (long)o->ptr;
3373 } else {
3374 redisPanic("Unknown string encoding");
3375 }
3376 }
3377
3378 *target = value;
3379 return REDIS_OK;
3380 }
3381
3382 static int getLongLongFromObjectOrReply(redisClient *c, robj *o, long long *target, const char *msg) {
3383 long long value;
3384 if (getLongLongFromObject(o, &value) != REDIS_OK) {
3385 if (msg != NULL) {
3386 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3387 } else {
3388 addReplySds(c, sdsnew("-ERR value is not an integer\r\n"));
3389 }
3390 return REDIS_ERR;
3391 }
3392
3393 *target = value;
3394 return REDIS_OK;
3395 }
3396
3397 static int getLongFromObjectOrReply(redisClient *c, robj *o, long *target, const char *msg) {
3398 long long value;
3399
3400 if (getLongLongFromObjectOrReply(c, o, &value, msg) != REDIS_OK) return REDIS_ERR;
3401 if (value < LONG_MIN || value > LONG_MAX) {
3402 if (msg != NULL) {
3403 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3404 } else {
3405 addReplySds(c, sdsnew("-ERR value is out of range\r\n"));
3406 }
3407 return REDIS_ERR;
3408 }
3409
3410 *target = value;
3411 return REDIS_OK;
3412 }
3413
3414 /* =========================== Keyspace access API ========================== */
3415
3416 static robj *lookupKey(redisDb *db, robj *key) {
3417 dictEntry *de = dictFind(db->dict,key->ptr);
3418 if (de) {
3419 robj *val = dictGetEntryVal(de);
3420
3421 if (server.vm_enabled) {
3422 if (val->storage == REDIS_VM_MEMORY ||
3423 val->storage == REDIS_VM_SWAPPING)
3424 {
3425 /* If we were swapping the object out, cancel the operation */
3426 if (val->storage == REDIS_VM_SWAPPING)
3427 vmCancelThreadedIOJob(val);
3428 /* Update the access time for the aging algorithm. */
3429 val->lru = server.lruclock;
3430 } else {
3431 int notify = (val->storage == REDIS_VM_LOADING);
3432
3433 /* Our value was swapped on disk. Bring it at home. */
3434 redisAssert(val->type == REDIS_VMPOINTER);
3435 val = vmLoadObject(val);
3436 dictGetEntryVal(de) = val;
3437
3438 /* Clients blocked by the VM subsystem may be waiting for
3439 * this key... */
3440 if (notify) handleClientsBlockedOnSwappedKey(db,key);
3441 }
3442 }
3443 return val;
3444 } else {
3445 return NULL;
3446 }
3447 }
3448
3449 static robj *lookupKeyRead(redisDb *db, robj *key) {
3450 expireIfNeeded(db,key);
3451 return lookupKey(db,key);
3452 }
3453
3454 static robj *lookupKeyWrite(redisDb *db, robj *key) {
3455 deleteIfVolatile(db,key);
3456 touchWatchedKey(db,key);
3457 return lookupKey(db,key);
3458 }
3459
3460 static robj *lookupKeyReadOrReply(redisClient *c, robj *key, robj *reply) {
3461 robj *o = lookupKeyRead(c->db, key);
3462 if (!o) addReply(c,reply);
3463 return o;
3464 }
3465
3466 static robj *lookupKeyWriteOrReply(redisClient *c, robj *key, robj *reply) {
3467 robj *o = lookupKeyWrite(c->db, key);
3468 if (!o) addReply(c,reply);
3469 return o;
3470 }
3471
3472 /* Add the key to the DB. If the key already exists REDIS_ERR is returned,
3473 * otherwise REDIS_OK is returned, and the caller should increment the
3474 * refcount of 'val'. */
3475 static int dbAdd(redisDb *db, robj *key, robj *val) {
3476 /* Perform a lookup before adding the key, as we need to copy the
3477 * key value. */
3478 if (dictFind(db->dict, key->ptr) != NULL) {
3479 return REDIS_ERR;
3480 } else {
3481 sds copy = sdsdup(key->ptr);
3482 dictAdd(db->dict, copy, val);
3483 return REDIS_OK;
3484 }
3485 }
3486
3487 /* If the key does not exist, this is just like dbAdd(). Otherwise
3488 * the value associated to the key is replaced with the new one.
3489 *
3490 * On update (key already existed) 0 is returned. Otherwise 1. */
3491 static int dbReplace(redisDb *db, robj *key, robj *val) {
3492 if (dictFind(db->dict,key->ptr) == NULL) {
3493 sds copy = sdsdup(key->ptr);
3494 dictAdd(db->dict, copy, val);
3495 return 1;
3496 } else {
3497 dictReplace(db->dict, key->ptr, val);
3498 return 0;
3499 }
3500 }
3501
3502 static int dbExists(redisDb *db, robj *key) {
3503 return dictFind(db->dict,key->ptr) != NULL;
3504 }
3505
3506 /* Return a random key, in form of a Redis object.
3507 * If there are no keys, NULL is returned.
3508 *
3509 * The function makes sure to return keys not already expired. */
3510 static robj *dbRandomKey(redisDb *db) {
3511 struct dictEntry *de;
3512
3513 while(1) {
3514 sds key;
3515 robj *keyobj;
3516
3517 de = dictGetRandomKey(db->dict);
3518 if (de == NULL) return NULL;
3519
3520 key = dictGetEntryKey(de);
3521 keyobj = createStringObject(key,sdslen(key));
3522 if (dictFind(db->expires,key)) {
3523 if (expireIfNeeded(db,keyobj)) {
3524 decrRefCount(keyobj);
3525 continue; /* search for another key. This expired. */
3526 }
3527 }
3528 return keyobj;
3529 }
3530 }
3531
3532 /* Delete a key, value, and associated expiration entry if any, from the DB */
3533 static int dbDelete(redisDb *db, robj *key) {
3534 int retval;
3535
3536 if (dictSize(db->expires)) dictDelete(db->expires,key->ptr);
3537 retval = dictDelete(db->dict,key->ptr);
3538
3539 return retval == DICT_OK;
3540 }
3541
3542 /*============================ RDB saving/loading =========================== */
3543
/* Write the single byte 'type' to 'fp'.
 * Returns 0 on success, -1 if the write failed. */
static int rdbSaveType(FILE *fp, unsigned char type) {
    return (fwrite(&type,1,1,fp) == 0) ? -1 : 0;
}
3548
/* Write 't' to 'fp' as a 4 byte signed integer in host byte order (the
 * value is truncated to 32 bits).
 * Returns 0 on success, -1 if the write failed. */
static int rdbSaveTime(FILE *fp, time_t t) {
    int32_t seconds = (int32_t) t;

    return (fwrite(&seconds,sizeof(seconds),1,fp) == 0) ? -1 : 0;
}
3554
3555 /* check rdbLoadLen() comments for more info */
3556 static int rdbSaveLen(FILE *fp, uint32_t len) {
3557 unsigned char buf[2];
3558
3559 if (len < (1<<6)) {
3560 /* Save a 6 bit len */
3561 buf[0] = (len&0xFF)|(REDIS_RDB_6BITLEN<<6);
3562 if (fwrite(buf,1,1,fp) == 0) return -1;
3563 } else if (len < (1<<14)) {
3564 /* Save a 14 bit len */
3565 buf[0] = ((len>>8)&0xFF)|(REDIS_RDB_14BITLEN<<6);
3566 buf[1] = len&0xFF;
3567 if (fwrite(buf,2,1,fp) == 0) return -1;
3568 } else {
3569 /* Save a 32 bit len */
3570 buf[0] = (REDIS_RDB_32BITLEN<<6);
3571 if (fwrite(buf,1,1,fp) == 0) return -1;
3572 len = htonl(len);
3573 if (fwrite(&len,4,1,fp) == 0) return -1;
3574 }
3575 return 0;
3576 }
3577
3578 /* Encode 'value' as an integer if possible (if integer will fit the
3579 * supported range). If the function sucessful encoded the integer
3580 * then the (up to 5 bytes) encoded representation is written in the
3581 * string pointed by 'enc' and the length is returned. Otherwise
3582 * 0 is returned. */
3583 static int rdbEncodeInteger(long long value, unsigned char *enc) {
3584 /* Finally check if it fits in our ranges */
3585 if (value >= -(1<<7) && value <= (1<<7)-1) {
3586 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT8;
3587 enc[1] = value&0xFF;
3588 return 2;
3589 } else if (value >= -(1<<15) && value <= (1<<15)-1) {
3590 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT16;
3591 enc[1] = value&0xFF;
3592 enc[2] = (value>>8)&0xFF;
3593 return 3;
3594 } else if (value >= -((long long)1<<31) && value <= ((long long)1<<31)-1) {
3595 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT32;
3596 enc[1] = value&0xFF;
3597 enc[2] = (value>>8)&0xFF;
3598 enc[3] = (value>>16)&0xFF;
3599 enc[4] = (value>>24)&0xFF;
3600 return 5;
3601 } else {
3602 return 0;
3603 }
3604 }
3605
/* String objects in the form "2391" "-100" without any space and with a
 * range of values that can fit in an 8, 16 or 32 bit signed value can be
 * encoded as integers to save space.
 *
 * 's' points to 'len' bytes that are NOT required to be NUL terminated
 * (callers pass pointers into ziplist / zipmap payloads), so the bytes are
 * copied into a local terminated buffer before being parsed.
 *
 * Returns the length of the encoding written into 'enc', or 0 if the string
 * can't be represented as an encoded integer. */
static int rdbTryIntegerEncoding(char *s, size_t len, unsigned char *enc) {
    long long value;
    char *endptr, buf[32], in[32];

    /* An empty string, or one longer than any decimal 64 bit integer, can
     * never round-trip, so there is no need to parse it. */
    if (len == 0 || len >= sizeof(in)) return 0;
    memcpy(in,s,len);
    in[len] = '\0';

    /* Check if it's possible to encode this value as a number */
    value = strtoll(in, &endptr, 10);
    if (endptr[0] != '\0') return 0;
    ll2string(buf,32,value);

    /* If the number converted back into a string is not identical
     * then it's not possible to encode the string as integer */
    if (strlen(buf) != len || memcmp(buf,s,len)) return 0;

    return rdbEncodeInteger(value,enc);
}
3624
3625 static int rdbSaveLzfStringObject(FILE *fp, unsigned char *s, size_t len) {
3626 size_t comprlen, outlen;
3627 unsigned char byte;
3628 void *out;
3629
3630 /* We require at least four bytes compression for this to be worth it */
3631 if (len <= 4) return 0;
3632 outlen = len-4;
3633 if ((out = zmalloc(outlen+1)) == NULL) return 0;
3634 comprlen = lzf_compress(s, len, out, outlen);
3635 if (comprlen == 0) {
3636 zfree(out);
3637 return 0;
3638 }
3639 /* Data compressed! Let's save it on disk */
3640 byte = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_LZF;
3641 if (fwrite(&byte,1,1,fp) == 0) goto writeerr;
3642 if (rdbSaveLen(fp,comprlen) == -1) goto writeerr;
3643 if (rdbSaveLen(fp,len) == -1) goto writeerr;
3644 if (fwrite(out,comprlen,1,fp) == 0) goto writeerr;
3645 zfree(out);
3646 return comprlen;
3647
3648 writeerr:
3649 zfree(out);
3650 return -1;
3651 }
3652
3653 /* Save a string objet as [len][data] on disk. If the object is a string
3654 * representation of an integer value we try to safe it in a special form */
3655 static int rdbSaveRawString(FILE *fp, unsigned char *s, size_t len) {
3656 int enclen;
3657
3658 /* Try integer encoding */
3659 if (len <= 11) {
3660 unsigned char buf[5];
3661 if ((enclen = rdbTryIntegerEncoding((char*)s,len,buf)) > 0) {
3662 if (fwrite(buf,enclen,1,fp) == 0) return -1;
3663 return 0;
3664 }
3665 }
3666
3667 /* Try LZF compression - under 20 bytes it's unable to compress even
3668 * aaaaaaaaaaaaaaaaaa so skip it */
3669 if (server.rdbcompression && len > 20) {
3670 int retval;
3671
3672 retval = rdbSaveLzfStringObject(fp,s,len);
3673 if (retval == -1) return -1;
3674 if (retval > 0) return 0;
3675 /* retval == 0 means data can't be compressed, save the old way */
3676 }
3677
3678 /* Store verbatim */
3679 if (rdbSaveLen(fp,len) == -1) return -1;
3680 if (len && fwrite(s,len,1,fp) == 0) return -1;
3681 return 0;
3682 }
3683
/* Save a long long value either as an encoded integer or, when it does not
 * fit the encodable range, as a length-prefixed decimal string.
 * Returns 0 on success, -1 on write error. */
static int rdbSaveLongLongAsStringObject(FILE *fp, long long value) {
    unsigned char buf[32];
    int enclen = rdbEncodeInteger(value,buf);

    if (enclen <= 0) {
        /* Out of range: use the string form, which needs a length prefix */
        enclen = ll2string((char*)buf,32,value);
        redisAssert(enclen < 32);
        if (rdbSaveLen(fp,enclen) == -1) return -1;
    }
    /* In both cases 'buf' now holds 'enclen' bytes of payload. */
    if (fwrite(buf,enclen,1,fp) == 0) return -1;
    return 0;
}
3699
3700 /* Like rdbSaveStringObjectRaw() but handle encoded objects */
3701 static int rdbSaveStringObject(FILE *fp, robj *obj) {
3702 /* Avoid to decode the object, then encode it again, if the
3703 * object is alrady integer encoded. */
3704 if (obj->encoding == REDIS_ENCODING_INT) {
3705 return rdbSaveLongLongAsStringObject(fp,(long)obj->ptr);
3706 } else {
3707 redisAssert(obj->encoding == REDIS_ENCODING_RAW);
3708 return rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
3709 }
3710 }
3711
/* Save a double value. Doubles are saved as strings prefixed by an unsigned
 * 8 bit integer specifying the length of the representation.
 * This 8 bit integer has special values in order to specify the following
 * conditions:
 * 253: not a number
 * 254: + inf
 * 255: - inf
 *
 * Returns 0 on success, -1 on write error. */
static int rdbSaveDoubleValue(FILE *fp, double val) {
    unsigned char buf[128];
    int len;

    if (isnan(val)) {
        buf[0] = 253;
        len = 1;
    } else if (!isfinite(val)) {
        len = 1;
        buf[0] = (val < 0) ? 255 : 254;
    } else {
#if (DBL_MANT_DIG >= 52) && (LLONG_MAX == 0x7fffffffffffffffLL)
        /* Check if the float is in a safe range to be casted into a
         * long long. We are assuming that long long is 64 bit here.
         * Also we are assuming that there are no implementations around where
         * double has precision < 52 bit.
         *
         * Under this assumptions we test if a double is inside an interval
         * where casting to long long is safe. Then using two castings we
         * make sure the decimal part is zero. If all this is true we use
         * integer printing function that is much faster. */
        double min = -4503599627370495; /* -((2^52)-1) */
        double max = 4503599627370496; /* 2^52 */
        if (val > min && val < max && val == ((double)((long long)val)))
            /* The text starts at buf+1, so one byte less is available. */
            ll2string((char*)buf+1,sizeof(buf)-1,(long long)val);
        else
#endif
            snprintf((char*)buf+1,sizeof(buf)-1,"%.17g",val);
        buf[0] = strlen((char*)buf+1);
        len = buf[0]+1;
    }
    if (fwrite(buf,len,1,fp) == 0) return -1;
    return 0;
}
3754
3755 /* Save a Redis object. */
3756 static int rdbSaveObject(FILE *fp, robj *o) {
3757 if (o->type == REDIS_STRING) {
3758 /* Save a string value */
3759 if (rdbSaveStringObject(fp,o) == -1) return -1;
3760 } else if (o->type == REDIS_LIST) {
3761 /* Save a list value */
3762 if (o->encoding == REDIS_ENCODING_ZIPLIST) {
3763 unsigned char *p;
3764 unsigned char *vstr;
3765 unsigned int vlen;
3766 long long vlong;
3767
3768 if (rdbSaveLen(fp,ziplistLen(o->ptr)) == -1) return -1;
3769 p = ziplistIndex(o->ptr,0);
3770 while(ziplistGet(p,&vstr,&vlen,&vlong)) {
3771 if (vstr) {
3772 if (rdbSaveRawString(fp,vstr,vlen) == -1)
3773 return -1;
3774 } else {
3775 if (rdbSaveLongLongAsStringObject(fp,vlong) == -1)
3776 return -1;
3777 }
3778 p = ziplistNext(o->ptr,p);
3779 }
3780 } else if (o->encoding == REDIS_ENCODING_LIST) {
3781 list *list = o->ptr;
3782 listIter li;
3783 listNode *ln;
3784
3785 if (rdbSaveLen(fp,listLength(list)) == -1) return -1;
3786 listRewind(list,&li);
3787 while((ln = listNext(&li))) {
3788 robj *eleobj = listNodeValue(ln);
3789 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3790 }
3791 } else {
3792 redisPanic("Unknown list encoding");
3793 }
3794 } else if (o->type == REDIS_SET) {
3795 /* Save a set value */
3796 dict *set = o->ptr;
3797 dictIterator *di = dictGetIterator(set);
3798 dictEntry *de;
3799
3800 if (rdbSaveLen(fp,dictSize(set)) == -1) return -1;
3801 while((de = dictNext(di)) != NULL) {
3802 robj *eleobj = dictGetEntryKey(de);
3803
3804 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3805 }
3806 dictReleaseIterator(di);
3807 } else if (o->type == REDIS_ZSET) {
3808 /* Save a set value */
3809 zset *zs = o->ptr;
3810 dictIterator *di = dictGetIterator(zs->dict);
3811 dictEntry *de;
3812
3813 if (rdbSaveLen(fp,dictSize(zs->dict)) == -1) return -1;
3814 while((de = dictNext(di)) != NULL) {
3815 robj *eleobj = dictGetEntryKey(de);
3816 double *score = dictGetEntryVal(de);
3817
3818 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3819 if (rdbSaveDoubleValue(fp,*score) == -1) return -1;
3820 }
3821 dictReleaseIterator(di);
3822 } else if (o->type == REDIS_HASH) {
3823 /* Save a hash value */
3824 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
3825 unsigned char *p = zipmapRewind(o->ptr);
3826 unsigned int count = zipmapLen(o->ptr);
3827 unsigned char *key, *val;
3828 unsigned int klen, vlen;
3829
3830 if (rdbSaveLen(fp,count) == -1) return -1;
3831 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
3832 if (rdbSaveRawString(fp,key,klen) == -1) return -1;
3833 if (rdbSaveRawString(fp,val,vlen) == -1) return -1;
3834 }
3835 } else {
3836 dictIterator *di = dictGetIterator(o->ptr);
3837 dictEntry *de;
3838
3839 if (rdbSaveLen(fp,dictSize((dict*)o->ptr)) == -1) return -1;
3840 while((de = dictNext(di)) != NULL) {
3841 robj *key = dictGetEntryKey(de);
3842 robj *val = dictGetEntryVal(de);
3843
3844 if (rdbSaveStringObject(fp,key) == -1) return -1;
3845 if (rdbSaveStringObject(fp,val) == -1) return -1;
3846 }
3847 dictReleaseIterator(di);
3848 }
3849 } else {
3850 redisPanic("Unknown object type");
3851 }
3852 return 0;
3853 }
3854
3855 /* Return the length the object will have on disk if saved with
3856 * the rdbSaveObject() function. Currently we use a trick to get
3857 * this length with very little changes to the code. In the future
3858 * we could switch to a faster solution. */
3859 static off_t rdbSavedObjectLen(robj *o, FILE *fp) {
3860 if (fp == NULL) fp = server.devnull;
3861 rewind(fp);
3862 assert(rdbSaveObject(fp,o) != 1);
3863 return ftello(fp);
3864 }
3865
3866 /* Return the number of pages required to save this object in the swap file */
3867 static off_t rdbSavedObjectPages(robj *o, FILE *fp) {
3868 off_t bytes = rdbSavedObjectLen(o,fp);
3869
3870 return (bytes+(server.vm_page_size-1))/server.vm_page_size;
3871 }
3872
3873 /* Save the DB on disk. Return REDIS_ERR on error, REDIS_OK on success */
3874 static int rdbSave(char *filename) {
3875 dictIterator *di = NULL;
3876 dictEntry *de;
3877 FILE *fp;
3878 char tmpfile[256];
3879 int j;
3880 time_t now = time(NULL);
3881
3882 /* Wait for I/O therads to terminate, just in case this is a
3883 * foreground-saving, to avoid seeking the swap file descriptor at the
3884 * same time. */
3885 if (server.vm_enabled)
3886 waitEmptyIOJobsQueue();
3887
3888 snprintf(tmpfile,256,"temp-%d.rdb", (int) getpid());
3889 fp = fopen(tmpfile,"w");
3890 if (!fp) {
3891 redisLog(REDIS_WARNING, "Failed saving the DB: %s", strerror(errno));
3892 return REDIS_ERR;
3893 }
3894 if (fwrite("REDIS0001",9,1,fp) == 0) goto werr;
3895 for (j = 0; j < server.dbnum; j++) {
3896 redisDb *db = server.db+j;
3897 dict *d = db->dict;
3898 if (dictSize(d) == 0) continue;
3899 di = dictGetIterator(d);
3900 if (!di) {
3901 fclose(fp);
3902 return REDIS_ERR;
3903 }
3904
3905 /* Write the SELECT DB opcode */
3906 if (rdbSaveType(fp,REDIS_SELECTDB) == -1) goto werr;
3907 if (rdbSaveLen(fp,j) == -1) goto werr;
3908
3909 /* Iterate this DB writing every entry */
3910 while((de = dictNext(di)) != NULL) {
3911 sds keystr = dictGetEntryKey(de);
3912 robj key, *o = dictGetEntryVal(de);
3913 time_t expiretime;
3914
3915 initStaticStringObject(key,keystr);
3916 expiretime = getExpire(db,&key);
3917
3918 /* Save the expire time */
3919 if (expiretime != -1) {
3920 /* If this key is already expired skip it */
3921 if (expiretime < now) continue;
3922 if (rdbSaveType(fp,REDIS_EXPIRETIME) == -1) goto werr;
3923 if (rdbSaveTime(fp,expiretime) == -1) goto werr;
3924 }
3925 /* Save the key and associated value. This requires special
3926 * handling if the value is swapped out. */
3927 if (!server.vm_enabled || o->storage == REDIS_VM_MEMORY ||
3928 o->storage == REDIS_VM_SWAPPING) {
3929 /* Save type, key, value */
3930 if (rdbSaveType(fp,o->type) == -1) goto werr;
3931 if (rdbSaveStringObject(fp,&key) == -1) goto werr;
3932 if (rdbSaveObject(fp,o) == -1) goto werr;
3933 } else {
3934 /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
3935 robj *po;
3936 /* Get a preview of the object in memory */
3937 po = vmPreviewObject(o);
3938 /* Save type, key, value */
3939 if (rdbSaveType(fp,po->type) == -1) goto werr;
3940 if (rdbSaveStringObject(fp,&key) == -1) goto werr;
3941 if (rdbSaveObject(fp,po) == -1) goto werr;
3942 /* Remove the loaded object from memory */
3943 decrRefCount(po);
3944 }
3945 }
3946 dictReleaseIterator(di);
3947 }
3948 /* EOF opcode */
3949 if (rdbSaveType(fp,REDIS_EOF) == -1) goto werr;
3950
3951 /* Make sure data will not remain on the OS's output buffers */
3952 fflush(fp);
3953 fsync(fileno(fp));
3954 fclose(fp);
3955
3956 /* Use RENAME to make sure the DB file is changed atomically only
3957 * if the generate DB file is ok. */
3958 if (rename(tmpfile,filename) == -1) {
3959 redisLog(REDIS_WARNING,"Error moving temp DB file on the final destination: %s", strerror(errno));
3960 unlink(tmpfile);
3961 return REDIS_ERR;
3962 }
3963 redisLog(REDIS_NOTICE,"DB saved on disk");
3964 server.dirty = 0;
3965 server.lastsave = time(NULL);
3966 return REDIS_OK;
3967
3968 werr:
3969 fclose(fp);
3970 unlink(tmpfile);
3971 redisLog(REDIS_WARNING,"Write error saving DB on disk: %s", strerror(errno));
3972 if (di) dictReleaseIterator(di);
3973 return REDIS_ERR;
3974 }
3975
3976 static int rdbSaveBackground(char *filename) {
3977 pid_t childpid;
3978
3979 if (server.bgsavechildpid != -1) return REDIS_ERR;
3980 if (server.vm_enabled) waitEmptyIOJobsQueue();
3981 if ((childpid = fork()) == 0) {
3982 /* Child */
3983 if (server.vm_enabled) vmReopenSwapFile();
3984 close(server.fd);
3985 if (rdbSave(filename) == REDIS_OK) {
3986 _exit(0);
3987 } else {
3988 _exit(1);
3989 }
3990 } else {
3991 /* Parent */
3992 if (childpid == -1) {
3993 redisLog(REDIS_WARNING,"Can't save in background: fork: %s",
3994 strerror(errno));
3995 return REDIS_ERR;
3996 }
3997 redisLog(REDIS_NOTICE,"Background saving started by pid %d",childpid);
3998 server.bgsavechildpid = childpid;
3999 updateDictResizePolicy();
4000 return REDIS_OK;
4001 }
4002 return REDIS_OK; /* unreached */
4003 }
4004
/* Unlink the temporary dump file "temp-<pid>.rdb" belonging to the process
 * 'childpid'. The result of unlink() is ignored. */
static void rdbRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-%d.rdb", (int) childpid);
    unlink(path);
}
4011
/* Read a single type byte from 'fp'.
 * Returns the byte as a value in the 0-255 range, or -1 if nothing could
 * be read. */
static int rdbLoadType(FILE *fp) {
    unsigned char byte;

    return (fread(&byte,1,1,fp) == 0) ? -1 : byte;
}
4017
/* Read a time value stored by rdbSaveTime(): a 4 byte signed integer in
 * host byte order. Returns -1 if the four bytes could not be read. */
static time_t rdbLoadTime(FILE *fp) {
    int32_t seconds;

    if (fread(&seconds,sizeof(seconds),1,fp) == 0) return -1;
    return (time_t) seconds;
}
4023
4024 /* Load an encoded length from the DB, see the REDIS_RDB_* defines on the top
4025 * of this file for a description of how this are stored on disk.
4026 *
4027 * isencoded is set to 1 if the readed length is not actually a length but
4028 * an "encoding type", check the above comments for more info */
4029 static uint32_t rdbLoadLen(FILE *fp, int *isencoded) {
4030 unsigned char buf[2];
4031 uint32_t len;
4032 int type;
4033
4034 if (isencoded) *isencoded = 0;
4035 if (fread(buf,1,1,fp) == 0) return REDIS_RDB_LENERR;
4036 type = (buf[0]&0xC0)>>6;
4037 if (type == REDIS_RDB_6BITLEN) {
4038 /* Read a 6 bit len */
4039 return buf[0]&0x3F;
4040 } else if (type == REDIS_RDB_ENCVAL) {
4041 /* Read a 6 bit len encoding type */
4042 if (isencoded) *isencoded = 1;
4043 return buf[0]&0x3F;
4044 } else if (type == REDIS_RDB_14BITLEN) {
4045 /* Read a 14 bit len */
4046 if (fread(buf+1,1,1,fp) == 0) return REDIS_RDB_LENERR;
4047 return ((buf[0]&0x3F)<<8)|buf[1];
4048 } else {
4049 /* Read a 32 bit len */
4050 if (fread(&len,4,1,fp) == 0) return REDIS_RDB_LENERR;
4051 return ntohl(len);
4052 }
4053 }
4054
4055 /* Load an integer-encoded object from file 'fp', with the specified
4056 * encoding type 'enctype'. If encode is true the function may return
4057 * an integer-encoded object as reply, otherwise the returned object
4058 * will always be encoded as a raw string. */
4059 static robj *rdbLoadIntegerObject(FILE *fp, int enctype, int encode) {
4060 unsigned char enc[4];
4061 long long val;
4062
4063 if (enctype == REDIS_RDB_ENC_INT8) {
4064 if (fread(enc,1,1,fp) == 0) return NULL;
4065 val = (signed char)enc[0];
4066 } else if (enctype == REDIS_RDB_ENC_INT16) {
4067 uint16_t v;
4068 if (fread(enc,2,1,fp) == 0) return NULL;
4069 v = enc[0]|(enc[1]<<8);
4070 val = (int16_t)v;
4071 } else if (enctype == REDIS_RDB_ENC_INT32) {
4072 uint32_t v;
4073 if (fread(enc,4,1,fp) == 0) return NULL;
4074 v = enc[0]|(enc[1]<<8)|(enc[2]<<16)|(enc[3]<<24);
4075 val = (int32_t)v;
4076 } else {
4077 val = 0; /* anti-warning */
4078 redisPanic("Unknown RDB integer encoding type");
4079 }
4080 if (encode)
4081 return createStringObjectFromLongLong(val);
4082 else
4083 return createObject(REDIS_STRING,sdsfromlonglong(val));
4084 }
4085
4086 static robj *rdbLoadLzfStringObject(FILE*fp) {
4087 unsigned int len, clen;
4088 unsigned char *c = NULL;
4089 sds val = NULL;
4090
4091 if ((clen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4092 if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4093 if ((c = zmalloc(clen)) == NULL) goto err;
4094 if ((val = sdsnewlen(NULL,len)) == NULL) goto err;
4095 if (fread(c,clen,1,fp) == 0) goto err;
4096 if (lzf_decompress(c,clen,val,len) == 0) goto err;
4097 zfree(c);
4098 return createObject(REDIS_STRING,val);
4099 err:
4100 zfree(c);
4101 sdsfree(val);
4102 return NULL;
4103 }
4104
4105 static robj *rdbGenericLoadStringObject(FILE*fp, int encode) {
4106 int isencoded;
4107 uint32_t len;
4108 sds val;
4109
4110 len = rdbLoadLen(fp,&isencoded);
4111 if (isencoded) {
4112 switch(len) {
4113 case REDIS_RDB_ENC_INT8:
4114 case REDIS_RDB_ENC_INT16:
4115 case REDIS_RDB_ENC_INT32:
4116 return rdbLoadIntegerObject(fp,len,encode);
4117 case REDIS_RDB_ENC_LZF:
4118 return rdbLoadLzfStringObject(fp);
4119 default:
4120 redisPanic("Unknown RDB encoding type");
4121 }
4122 }
4123
4124 if (len == REDIS_RDB_LENERR) return NULL;
4125 val = sdsnewlen(NULL,len);
4126 if (len && fread(val,len,1,fp) == 0) {
4127 sdsfree(val);
4128 return NULL;
4129 }
4130 return createObject(REDIS_STRING,val);
4131 }
4132
4133 static robj *rdbLoadStringObject(FILE *fp) {
4134 return rdbGenericLoadStringObject(fp,0);
4135 }
4136
4137 static robj *rdbLoadEncodedStringObject(FILE *fp) {
4138 return rdbGenericLoadStringObject(fp,1);
4139 }
4140
4141 /* For information about double serialization check rdbSaveDoubleValue() */
4142 static int rdbLoadDoubleValue(FILE *fp, double *val) {
4143 char buf[128];
4144 unsigned char len;
4145
4146 if (fread(&len,1,1,fp) == 0) return -1;
4147 switch(len) {
4148 case 255: *val = R_NegInf; return 0;
4149 case 254: *val = R_PosInf; return 0;
4150 case 253: *val = R_Nan; return 0;
4151 default:
4152 if (fread(buf,len,1,fp) == 0) return -1;
4153 buf[len] = '\0';
4154 sscanf(buf, "%lg", val);
4155 return 0;
4156 }
4157 }
4158
4159 /* Load a Redis object of the specified type from the specified file.
4160 * On success a newly allocated object is returned, otherwise NULL. */
4161 static robj *rdbLoadObject(int type, FILE *fp) {
4162 robj *o, *ele, *dec;
4163 size_t len;
4164
4165 redisLog(REDIS_DEBUG,"LOADING OBJECT %d (at %d)\n",type,ftell(fp));
4166 if (type == REDIS_STRING) {
4167 /* Read string value */
4168 if ((o = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
4169 o = tryObjectEncoding(o);
4170 } else if (type == REDIS_LIST) {
4171 /* Read list value */
4172 if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4173
4174 /* Use a real list when there are too many entries */
4175 if (len > server.list_max_ziplist_entries) {
4176 o = createListObject();
4177 } else {
4178 o = createZiplistObject();
4179 }
4180
4181 /* Load every single element of the list */
4182 while(len--) {
4183 if ((ele = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
4184
4185 /* If we are using a ziplist and the value is too big, convert
4186 * the object to a real list. */
4187 if (o->encoding == REDIS_ENCODING_ZIPLIST &&
4188 ele->encoding == REDIS_ENCODING_RAW &&
4189 sdslen(ele->ptr) > server.list_max_ziplist_value)
4190 listTypeConvert(o,REDIS_ENCODING_LIST);
4191
4192 if (o->encoding == REDIS_ENCODING_ZIPLIST) {
4193 dec = getDecodedObject(ele);
4194 o->ptr = ziplistPush(o->ptr,dec->ptr,sdslen(dec->ptr),REDIS_TAIL);
4195 decrRefCount(dec);
4196 decrRefCount(ele);
4197 } else {
4198 ele = tryObjectEncoding(ele);
4199 listAddNodeTail(o->ptr,ele);
4200 }
4201 }
4202 } else if (type == REDIS_SET) {
4203 /* Read list/set value */
4204 if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4205 o = createSetObject();
4206 /* It's faster to expand the dict to the right size asap in order
4207 * to avoid rehashing */
4208 if (len > DICT_HT_INITIAL_SIZE)
4209 dictExpand(o->ptr,len);
4210 /* Load every single element of the list/set */
4211 while(len--) {
4212 if ((ele = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
4213 ele = tryObjectEncoding(ele);
4214 dictAdd((dict*)o->ptr,ele,NULL);
4215 }
4216 } else if (type == REDIS_ZSET) {
4217 /* Read list/set value */
4218 size_t zsetlen;
4219 zset *zs;
4220
4221 if ((zsetlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4222 o = createZsetObject();
4223 zs = o->ptr;
4224 /* Load every single element of the list/set */
4225 while(zsetlen--) {
4226 robj *ele;
4227 double *score = zmalloc(sizeof(double));
4228
4229 if ((ele = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
4230 ele = tryObjectEncoding(ele);
4231 if (rdbLoadDoubleValue(fp,score) == -1) return NULL;
4232 dictAdd(zs->dict,ele,score);
4233 zslInsert(zs->zsl,*score,ele);
4234 incrRefCount(ele); /* added to skiplist */
4235 }
4236 } else if (type == REDIS_HASH) {
4237 size_t hashlen;
4238
4239 if ((hashlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4240 o = createHashObject();
4241 /* Too many entries? Use an hash table. */
4242 if (hashlen > server.hash_max_zipmap_entries)
4243 convertToRealHash(o);
4244 /* Load every key/value, then set it into the zipmap or hash
4245 * table, as needed. */
4246 while(hashlen--) {
4247 robj *key, *val;
4248
4249 if ((key = rdbLoadStringObject(fp)) == NULL) return NULL;
4250 if ((val = rdbLoadStringObject(fp)) == NULL) return NULL;
4251 /* If we are using a zipmap and there are too big values
4252 * the object is converted to real hash table encoding. */
4253 if (o->encoding != REDIS_ENCODING_HT &&
4254 (sdslen(key->ptr) > server.hash_max_zipmap_value ||
4255 sdslen(val->ptr) > server.hash_max_zipmap_value))
4256 {
4257 convertToRealHash(o);
4258 }
4259
4260 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
4261 unsigned char *zm = o->ptr;
4262
4263 zm = zipmapSet(zm,key->ptr,sdslen(key->ptr),
4264 val->ptr,sdslen(val->ptr),NULL);
4265 o->ptr = zm;
4266 decrRefCount(key);
4267 decrRefCount(val);
4268 } else {
4269 key = tryObjectEncoding(key);
4270 val = tryObjectEncoding(val);
4271 dictAdd((dict*)o->ptr,key,val);
4272 }
4273 }
4274 } else {
4275 redisPanic("Unknown object type");
4276 }
4277 return o;
4278 }
4279
4280 static int rdbLoad(char *filename) {
4281 FILE *fp;
4282 uint32_t dbid;
4283 int type, retval, rdbver;
4284 int swap_all_values = 0;
4285 redisDb *db = server.db+0;
4286 char buf[1024];
4287 time_t expiretime, now = time(NULL);
4288
4289 fp = fopen(filename,"r");
4290 if (!fp) return REDIS_ERR;
4291 if (fread(buf,9,1,fp) == 0) goto eoferr;
4292 buf[9] = '\0';
4293 if (memcmp(buf,"REDIS",5) != 0) {
4294 fclose(fp);
4295 redisLog(REDIS_WARNING,"Wrong signature trying to load DB from file");
4296 return REDIS_ERR;
4297 }
4298 rdbver = atoi(buf+5);
4299 if (rdbver != 1) {
4300 fclose(fp);
4301 redisLog(REDIS_WARNING,"Can't handle RDB format version %d",rdbver);
4302 return REDIS_ERR;
4303 }
4304 while(1) {
4305 robj *key, *val;
4306 int force_swapout;
4307
4308 expiretime = -1;
4309 /* Read type. */
4310 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
4311 if (type == REDIS_EXPIRETIME) {
4312 if ((expiretime = rdbLoadTime(fp)) == -1) goto eoferr;
4313 /* We read the time so we need to read the object type again */
4314 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
4315 }
4316 if (type == REDIS_EOF) break;
4317 /* Handle SELECT DB opcode as a special case */
4318 if (type == REDIS_SELECTDB) {
4319 if ((dbid = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR)
4320 goto eoferr;
4321 if (dbid >= (unsigned)server.dbnum) {
4322 redisLog(REDIS_WARNING,"FATAL: Data file was created with a Redis server configured to handle more than %d databases. Exiting\n", server.dbnum);
4323 exit(1);
4324 }
4325 db = server.db+dbid;
4326 continue;
4327 }
4328 /* Read key */
4329 if ((key = rdbLoadStringObject(fp)) == NULL) goto eoferr;
4330 /* Read value */
4331 if ((val = rdbLoadObject(type,fp)) == NULL) goto eoferr;
4332 /* Check if the key already expired */
4333 if (expiretime != -1 && expiretime < now) {
4334 decrRefCount(key);
4335 decrRefCount(val);
4336 continue;
4337 }
4338 /* Add the new object in the hash table */
4339 retval = dbAdd(db,key,val);
4340 if (retval == REDIS_ERR) {
4341 redisLog(REDIS_WARNING,"Loading DB, duplicated key (%s) found! Unrecoverable error, exiting now.", key->ptr);
4342 exit(1);
4343 }
4344 /* Set the expire time if needed */
4345 if (expiretime != -1) setExpire(db,key,expiretime);
4346
4347 /* Handle swapping while loading big datasets when VM is on */
4348
4349 /* If we detecter we are hopeless about fitting something in memory
4350 * we just swap every new key on disk. Directly...
4351 * Note that's important to check for this condition before resorting
4352 * to random sampling, otherwise we may try to swap already
4353 * swapped keys. */
4354 if (swap_all_values) {
4355 dictEntry *de = dictFind(db->dict,key->ptr);
4356
4357 /* de may be NULL since the key already expired */
4358 if (de) {
4359 vmpointer *vp;
4360 val = dictGetEntryVal(de);
4361
4362 if (val->refcount == 1 &&
4363 (vp = vmSwapObjectBlocking(val)) != NULL)
4364 dictGetEntryVal(de) = vp;
4365 }
4366 decrRefCount(key);
4367 continue;
4368 }
4369 decrRefCount(key);
4370
4371 /* Flush data on disk once 32 MB of additional RAM are used... */
4372 force_swapout = 0;
4373 if ((zmalloc_used_memory() - server.vm_max_memory) > 1024*1024*32)
4374 force_swapout = 1;
4375
4376 /* If we have still some hope of having some value fitting memory
4377 * then we try random sampling. */
4378 if (!swap_all_values && server.vm_enabled && force_swapout) {
4379 while (zmalloc_used_memory() > server.vm_max_memory) {
4380 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
4381 }
4382 if (zmalloc_used_memory() > server.vm_max_memory)
4383 swap_all_values = 1; /* We are already using too much mem */
4384 }
4385 }
4386 fclose(fp);
4387 return REDIS_OK;
4388
4389 eoferr: /* unexpected end of file is handled here with a fatal exit */
4390 redisLog(REDIS_WARNING,"Short read or OOM loading DB. Unrecoverable error, aborting now.");
4391 exit(1);
4392 return REDIS_ERR; /* Just to avoid warning */
4393 }
4394
4395 /*================================== Shutdown =============================== */
4396 static int prepareForShutdown() {
4397 redisLog(REDIS_WARNING,"User requested shutdown, saving DB...");
4398 /* Kill the saving child if there is a background saving in progress.
4399 We want to avoid race conditions, for instance our saving child may
4400 overwrite the synchronous saving did by SHUTDOWN. */
4401 if (server.bgsavechildpid != -1) {
4402 redisLog(REDIS_WARNING,"There is a live saving child. Killing it!");
4403 kill(server.bgsavechildpid,SIGKILL);
4404 rdbRemoveTempFile(server.bgsavechildpid);
4405 }
4406 if (server.appendonly) {
4407 /* Append only file: fsync() the AOF and exit */
4408 aof_fsync(server.appendfd);
4409 if (server.vm_enabled) unlink(server.vm_swap_file);
4410 } else {
4411 /* Snapshotting. Perform a SYNC SAVE and exit */
4412 if (rdbSave(server.dbfilename) == REDIS_OK) {
4413 if (server.daemonize)
4414 unlink(server.pidfile);
4415 redisLog(REDIS_WARNING,"%zu bytes used at exit",zmalloc_used_memory());
4416 } else {
4417 /* Ooops.. error saving! The best we can do is to continue
4418 * operating. Note that if there was a background saving process,
4419 * in the next cron() Redis will be notified that the background
4420 * saving aborted, handling special stuff like slaves pending for
4421 * synchronization... */
4422 redisLog(REDIS_WARNING,"Error trying to save the DB, can't exit");
4423 return REDIS_ERR;
4424 }
4425 }
4426 redisLog(REDIS_WARNING,"Server exit now, bye bye...");
4427 return REDIS_OK;
4428 }
4429
4430 /*================================== Commands =============================== */
4431
4432 static void authCommand(redisClient *c) {
4433 if (!server.requirepass || !strcmp(c->argv[1]->ptr, server.requirepass)) {
4434 c->authenticated = 1;
4435 addReply(c,shared.ok);
4436 } else {
4437 c->authenticated = 0;
4438 addReplySds(c,sdscatprintf(sdsempty(),"-ERR invalid password\r\n"));
4439 }
4440 }
4441
4442 static void pingCommand(redisClient *c) {
4443 addReply(c,shared.pong);
4444 }
4445
4446 static void echoCommand(redisClient *c) {
4447 addReplyBulk(c,c->argv[1]);
4448 }
4449
4450 /*=================================== Strings =============================== */
4451
4452 static void setGenericCommand(redisClient *c, int nx, robj *key, robj *val, robj *expire) {
4453 int retval;
4454 long seconds = 0; /* initialized to avoid an harmness warning */
4455
4456 if (expire) {
4457 if (getLongFromObjectOrReply(c, expire, &seconds, NULL) != REDIS_OK)
4458 return;
4459 if (seconds <= 0) {
4460 addReplySds(c,sdsnew("-ERR invalid expire time in SETEX\r\n"));
4461 return;
4462 }
4463 }
4464
4465 touchWatchedKey(c->db,key);
4466 if (nx) deleteIfVolatile(c->db,key);
4467 retval = dbAdd(c->db,key,val);
4468 if (retval == REDIS_ERR) {
4469 if (!nx) {
4470 dbReplace(c->db,key,val);
4471 incrRefCount(val);
4472 } else {
4473 addReply(c,shared.czero);
4474 return;
4475 }
4476 } else {
4477 incrRefCount(val);
4478 }
4479 server.dirty++;
4480 removeExpire(c->db,key);
4481 if (expire) setExpire(c->db,key,time(NULL)+seconds);
4482 addReply(c, nx ? shared.cone : shared.ok);
4483 }
4484
4485 static void setCommand(redisClient *c) {
4486 setGenericCommand(c,0,c->argv[1],c->argv[2],NULL);
4487 }
4488
4489 static void setnxCommand(redisClient *c) {
4490 setGenericCommand(c,1,c->argv[1],c->argv[2],NULL);
4491 }
4492
4493 static void setexCommand(redisClient *c) {
4494 setGenericCommand(c,0,c->argv[1],c->argv[3],c->argv[2]);
4495 }
4496
4497 static int getGenericCommand(redisClient *c) {
4498 robj *o;
4499
4500 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL)
4501 return REDIS_OK;
4502
4503 if (o->type != REDIS_STRING) {
4504 addReply(c,shared.wrongtypeerr);
4505 return REDIS_ERR;
4506 } else {
4507 addReplyBulk(c,o);
4508 return REDIS_OK;
4509 }
4510 }
4511
4512 static void getCommand(redisClient *c) {
4513 getGenericCommand(c);
4514 }
4515
4516 static void getsetCommand(redisClient *c) {
4517 if (getGenericCommand(c) == REDIS_ERR) return;
4518 dbReplace(c->db,c->argv[1],c->argv[2]);
4519 incrRefCount(c->argv[2]);
4520 server.dirty++;
4521 removeExpire(c->db,c->argv[1]);
4522 }
4523
4524 static void mgetCommand(redisClient *c) {
4525 int j;
4526
4527 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-1));
4528 for (j = 1; j < c->argc; j++) {
4529 robj *o = lookupKeyRead(c->db,c->argv[j]);
4530 if (o == NULL) {
4531 addReply(c,shared.nullbulk);
4532 } else {
4533 if (o->type != REDIS_STRING) {
4534 addReply(c,shared.nullbulk);
4535 } else {
4536 addReplyBulk(c,o);
4537 }
4538 }
4539 }
4540 }
4541
4542 static void msetGenericCommand(redisClient *c, int nx) {
4543 int j, busykeys = 0;
4544
4545 if ((c->argc % 2) == 0) {
4546 addReplySds(c,sdsnew("-ERR wrong number of arguments for MSET\r\n"));
4547 return;
4548 }
4549 /* Handle the NX flag. The MSETNX semantic is to return zero and don't
4550 * set nothing at all if at least one already key exists. */
4551 if (nx) {
4552 for (j = 1; j < c->argc; j += 2) {
4553 if (lookupKeyWrite(c->db,c->argv[j]) != NULL) {
4554 busykeys++;
4555 }
4556 }
4557 }
4558 if (busykeys) {
4559 addReply(c, shared.czero);
4560 return;
4561 }
4562
4563 for (j = 1; j < c->argc; j += 2) {
4564 c->argv[j+1] = tryObjectEncoding(c->argv[j+1]);
4565 dbReplace(c->db,c->argv[j],c->argv[j+1]);
4566 incrRefCount(c->argv[j+1]);
4567 removeExpire(c->db,c->argv[j]);
4568 }
4569 server.dirty += (c->argc-1)/2;
4570 addReply(c, nx ? shared.cone : shared.ok);
4571 }
4572
4573 static void msetCommand(redisClient *c) {
4574 msetGenericCommand(c,0);
4575 }
4576
4577 static void msetnxCommand(redisClient *c) {
4578 msetGenericCommand(c,1);
4579 }
4580
4581 static void incrDecrCommand(redisClient *c, long long incr) {
4582 long long value;
4583 robj *o;
4584
4585 o = lookupKeyWrite(c->db,c->argv[1]);
4586 if (o != NULL && checkType(c,o,REDIS_STRING)) return;
4587 if (getLongLongFromObjectOrReply(c,o,&value,NULL) != REDIS_OK) return;
4588
4589 value += incr;
4590 o = createStringObjectFromLongLong(value);
4591 dbReplace(c->db,c->argv[1],o);
4592 server.dirty++;
4593 addReply(c,shared.colon);
4594 addReply(c,o);
4595 addReply(c,shared.crlf);
4596 }
4597
4598 static void incrCommand(redisClient *c) {
4599 incrDecrCommand(c,1);
4600 }
4601
4602 static void decrCommand(redisClient *c) {
4603 incrDecrCommand(c,-1);
4604 }
4605
4606 static void incrbyCommand(redisClient *c) {
4607 long long incr;
4608
4609 if (getLongLongFromObjectOrReply(c, c->argv[2], &incr, NULL) != REDIS_OK) return;
4610 incrDecrCommand(c,incr);
4611 }
4612
4613 static void decrbyCommand(redisClient *c) {
4614 long long incr;
4615
4616 if (getLongLongFromObjectOrReply(c, c->argv[2], &incr, NULL) != REDIS_OK) return;
4617 incrDecrCommand(c,-incr);
4618 }
4619
4620 static void appendCommand(redisClient *c) {
4621 int retval;
4622 size_t totlen;
4623 robj *o;
4624
4625 o = lookupKeyWrite(c->db,c->argv[1]);
4626 if (o == NULL) {
4627 /* Create the key */
4628 retval = dbAdd(c->db,c->argv[1],c->argv[2]);
4629 incrRefCount(c->argv[2]);
4630 totlen = stringObjectLen(c->argv[2]);
4631 } else {
4632 if (o->type != REDIS_STRING) {
4633 addReply(c,shared.wrongtypeerr);
4634 return;
4635 }
4636 /* If the object is specially encoded or shared we have to make
4637 * a copy */
4638 if (o->refcount != 1 || o->encoding != REDIS_ENCODING_RAW) {
4639 robj *decoded = getDecodedObject(o);
4640
4641 o = createStringObject(decoded->ptr, sdslen(decoded->ptr));
4642 decrRefCount(decoded);
4643 dbReplace(c->db,c->argv[1],o);
4644 }
4645 /* APPEND! */
4646 if (c->argv[2]->encoding == REDIS_ENCODING_RAW) {
4647 o->ptr = sdscatlen(o->ptr,
4648 c->argv[2]->ptr, sdslen(c->argv[2]->ptr));
4649 } else {
4650 o->ptr = sdscatprintf(o->ptr, "%ld",
4651 (unsigned long) c->argv[2]->ptr);
4652 }
4653 totlen = sdslen(o->ptr);
4654 }
4655 server.dirty++;
4656 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",(unsigned long)totlen));
4657 }
4658
4659 static void substrCommand(redisClient *c) {
4660 robj *o;
4661 long start = atoi(c->argv[2]->ptr);
4662 long end = atoi(c->argv[3]->ptr);
4663 size_t rangelen, strlen;
4664 sds range;
4665
4666 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4667 checkType(c,o,REDIS_STRING)) return;
4668
4669 o = getDecodedObject(o);
4670 strlen = sdslen(o->ptr);
4671
4672 /* convert negative indexes */
4673 if (start < 0) start = strlen+start;
4674 if (end < 0) end = strlen+end;
4675 if (start < 0) start = 0;
4676 if (end < 0) end = 0;
4677
4678 /* indexes sanity checks */
4679 if (start > end || (size_t)start >= strlen) {
4680 /* Out of range start or start > end result in null reply */
4681 addReply(c,shared.nullbulk);
4682 decrRefCount(o);
4683 return;
4684 }
4685 if ((size_t)end >= strlen) end = strlen-1;
4686 rangelen = (end-start)+1;
4687
4688 /* Return the result */
4689 addReplySds(c,sdscatprintf(sdsempty(),"$%zu\r\n",rangelen));
4690 range = sdsnewlen((char*)o->ptr+start,rangelen);
4691 addReplySds(c,range);
4692 addReply(c,shared.crlf);
4693 decrRefCount(o);
4694 }
4695
4696 /* ========================= Type agnostic commands ========================= */
4697
4698 static void delCommand(redisClient *c) {
4699 int deleted = 0, j;
4700
4701 for (j = 1; j < c->argc; j++) {
4702 if (dbDelete(c->db,c->argv[j])) {
4703 touchWatchedKey(c->db,c->argv[j]);
4704 server.dirty++;
4705 deleted++;
4706 }
4707 }
4708 addReplyLongLong(c,deleted);
4709 }
4710
4711 static void existsCommand(redisClient *c) {
4712 expireIfNeeded(c->db,c->argv[1]);
4713 if (dbExists(c->db,c->argv[1])) {
4714 addReply(c, shared.cone);
4715 } else {
4716 addReply(c, shared.czero);
4717 }
4718 }
4719
4720 static void selectCommand(redisClient *c) {
4721 int id = atoi(c->argv[1]->ptr);
4722
4723 if (selectDb(c,id) == REDIS_ERR) {
4724 addReplySds(c,sdsnew("-ERR invalid DB index\r\n"));
4725 } else {
4726 addReply(c,shared.ok);
4727 }
4728 }
4729
4730 static void randomkeyCommand(redisClient *c) {
4731 robj *key;
4732
4733 if ((key = dbRandomKey(c->db)) == NULL) {
4734 addReply(c,shared.nullbulk);
4735 return;
4736 }
4737
4738 addReplyBulk(c,key);
4739 decrRefCount(key);
4740 }
4741
4742 static void keysCommand(redisClient *c) {
4743 dictIterator *di;
4744 dictEntry *de;
4745 sds pattern = c->argv[1]->ptr;
4746 int plen = sdslen(pattern);
4747 unsigned long numkeys = 0;
4748 robj *lenobj = createObject(REDIS_STRING,NULL);
4749
4750 di = dictGetIterator(c->db->dict);
4751 addReply(c,lenobj);
4752 decrRefCount(lenobj);
4753 while((de = dictNext(di)) != NULL) {
4754 sds key = dictGetEntryKey(de);
4755 robj *keyobj;
4756
4757 if ((pattern[0] == '*' && pattern[1] == '\0') ||
4758 stringmatchlen(pattern,plen,key,sdslen(key),0)) {
4759 keyobj = createStringObject(key,sdslen(key));
4760 if (expireIfNeeded(c->db,keyobj) == 0) {
4761 addReplyBulk(c,keyobj);
4762 numkeys++;
4763 }
4764 decrRefCount(keyobj);
4765 }
4766 }
4767 dictReleaseIterator(di);
4768 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",numkeys);
4769 }
4770
4771 static void dbsizeCommand(redisClient *c) {
4772 addReplySds(c,
4773 sdscatprintf(sdsempty(),":%lu\r\n",dictSize(c->db->dict)));
4774 }
4775
4776 static void lastsaveCommand(redisClient *c) {
4777 addReplySds(c,
4778 sdscatprintf(sdsempty(),":%lu\r\n",server.lastsave));
4779 }
4780
4781 static void typeCommand(redisClient *c) {
4782 robj *o;
4783 char *type;
4784
4785 o = lookupKeyRead(c->db,c->argv[1]);
4786 if (o == NULL) {
4787 type = "+none";
4788 } else {
4789 switch(o->type) {
4790 case REDIS_STRING: type = "+string"; break;
4791 case REDIS_LIST: type = "+list"; break;
4792 case REDIS_SET: type = "+set"; break;
4793 case REDIS_ZSET: type = "+zset"; break;
4794 case REDIS_HASH: type = "+hash"; break;
4795 default: type = "+unknown"; break;
4796 }
4797 }
4798 addReplySds(c,sdsnew(type));
4799 addReply(c,shared.crlf);
4800 }
4801
4802 static void saveCommand(redisClient *c) {
4803 if (server.bgsavechildpid != -1) {
4804 addReplySds(c,sdsnew("-ERR background save in progress\r\n"));
4805 return;
4806 }
4807 if (rdbSave(server.dbfilename) == REDIS_OK) {
4808 addReply(c,shared.ok);
4809 } else {
4810 addReply(c,shared.err);
4811 }
4812 }
4813
4814 static void bgsaveCommand(redisClient *c) {
4815 if (server.bgsavechildpid != -1) {
4816 addReplySds(c,sdsnew("-ERR background save already in progress\r\n"));
4817 return;
4818 }
4819 if (rdbSaveBackground(server.dbfilename) == REDIS_OK) {
4820 char *status = "+Background saving started\r\n";
4821 addReplySds(c,sdsnew(status));
4822 } else {
4823 addReply(c,shared.err);
4824 }
4825 }
4826
4827 static void shutdownCommand(redisClient *c) {
4828 if (prepareForShutdown() == REDIS_OK)
4829 exit(0);
4830 addReplySds(c, sdsnew("-ERR Errors trying to SHUTDOWN. Check logs.\r\n"));
4831 }
4832
4833 static void renameGenericCommand(redisClient *c, int nx) {
4834 robj *o;
4835
4836 /* To use the same key as src and dst is probably an error */
4837 if (sdscmp(c->argv[1]->ptr,c->argv[2]->ptr) == 0) {
4838 addReply(c,shared.sameobjecterr);
4839 return;
4840 }
4841
4842 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL)
4843 return;
4844
4845 incrRefCount(o);
4846 deleteIfVolatile(c->db,c->argv[2]);
4847 if (dbAdd(c->db,c->argv[2],o) == REDIS_ERR) {
4848 if (nx) {
4849 decrRefCount(o);
4850 addReply(c,shared.czero);
4851 return;
4852 }
4853 dbReplace(c->db,c->argv[2],o);
4854 }
4855 dbDelete(c->db,c->argv[1]);
4856 touchWatchedKey(c->db,c->argv[2]);
4857 server.dirty++;
4858 addReply(c,nx ? shared.cone : shared.ok);
4859 }
4860
4861 static void renameCommand(redisClient *c) {
4862 renameGenericCommand(c,0);
4863 }
4864
4865 static void renamenxCommand(redisClient *c) {
4866 renameGenericCommand(c,1);
4867 }
4868
4869 static void moveCommand(redisClient *c) {
4870 robj *o;
4871 redisDb *src, *dst;
4872 int srcid;
4873
4874 /* Obtain source and target DB pointers */
4875 src = c->db;
4876 srcid = c->db->id;
4877 if (selectDb(c,atoi(c->argv[2]->ptr)) == REDIS_ERR) {
4878 addReply(c,shared.outofrangeerr);
4879 return;
4880 }
4881 dst = c->db;
4882 selectDb(c,srcid); /* Back to the source DB */
4883
4884 /* If the user is moving using as target the same
4885 * DB as the source DB it is probably an error. */
4886 if (src == dst) {
4887 addReply(c,shared.sameobjecterr);
4888 return;
4889 }
4890
4891 /* Check if the element exists and get a reference */
4892 o = lookupKeyWrite(c->db,c->argv[1]);
4893 if (!o) {
4894 addReply(c,shared.czero);
4895 return;
4896 }
4897
4898 /* Try to add the element to the target DB */
4899 deleteIfVolatile(dst,c->argv[1]);
4900 if (dbAdd(dst,c->argv[1],o) == REDIS_ERR) {
4901 addReply(c,shared.czero);
4902 return;
4903 }
4904 incrRefCount(o);
4905
4906 /* OK! key moved, free the entry in the source DB */
4907 dbDelete(src,c->argv[1]);
4908 server.dirty++;
4909 addReply(c,shared.cone);
4910 }
4911
4912 /* =================================== Lists ================================ */
4913
4914
4915 /* Check the argument length to see if it requires us to convert the ziplist
4916 * to a real list. Only check raw-encoded objects because integer encoded
4917 * objects are never too long. */
4918 static void listTypeTryConversion(robj *subject, robj *value) {
4919 if (subject->encoding != REDIS_ENCODING_ZIPLIST) return;
4920 if (value->encoding == REDIS_ENCODING_RAW &&
4921 sdslen(value->ptr) > server.list_max_ziplist_value)
4922 listTypeConvert(subject,REDIS_ENCODING_LIST);
4923 }
4924
4925 static void listTypePush(robj *subject, robj *value, int where) {
4926 /* Check if we need to convert the ziplist */
4927 listTypeTryConversion(subject,value);
4928 if (subject->encoding == REDIS_ENCODING_ZIPLIST &&
4929 ziplistLen(subject->ptr) > server.list_max_ziplist_entries)
4930 listTypeConvert(subject,REDIS_ENCODING_LIST);
4931
4932 if (subject->encoding == REDIS_ENCODING_ZIPLIST) {
4933 int pos = (where == REDIS_HEAD) ? ZIPLIST_HEAD : ZIPLIST_TAIL;
4934 value = getDecodedObject(value);
4935 subject->ptr = ziplistPush(subject->ptr,value->ptr,sdslen(value->ptr),pos);
4936 decrRefCount(value);
4937 } else if (subject->encoding == REDIS_ENCODING_LIST) {
4938 if (where == REDIS_HEAD) {
4939 listAddNodeHead(subject->ptr,value);
4940 } else {
4941 listAddNodeTail(subject->ptr,value);
4942 }
4943 incrRefCount(value);
4944 } else {
4945 redisPanic("Unknown list encoding");
4946 }
4947 }
4948
4949 static robj *listTypePop(robj *subject, int where) {
4950 robj *value = NULL;
4951 if (subject->encoding == REDIS_ENCODING_ZIPLIST) {
4952 unsigned char *p;
4953 unsigned char *vstr;
4954 unsigned int vlen;
4955 long long vlong;
4956 int pos = (where == REDIS_HEAD) ? 0 : -1;
4957 p = ziplistIndex(subject->ptr,pos);
4958 if (ziplistGet(p,&vstr,&vlen,&vlong)) {
4959 if (vstr) {
4960 value = createStringObject((char*)vstr,vlen);
4961 } else {
4962 value = createStringObjectFromLongLong(vlong);
4963 }
4964 /* We only need to delete an element when it exists */
4965 subject->ptr = ziplistDelete(subject->ptr,&p);
4966 }
4967 } else if (subject->encoding == REDIS_ENCODING_LIST) {
4968 list *list = subject->ptr;
4969 listNode *ln;
4970 if (where == REDIS_HEAD) {
4971 ln = listFirst(list);
4972 } else {
4973 ln = listLast(list);
4974 }
4975 if (ln != NULL) {
4976 value = listNodeValue(ln);
4977 incrRefCount(value);
4978 listDelNode(list,ln);
4979 }
4980 } else {
4981 redisPanic("Unknown list encoding");
4982 }
4983 return value;
4984 }
4985
4986 static unsigned long listTypeLength(robj *subject) {
4987 if (subject->encoding == REDIS_ENCODING_ZIPLIST) {
4988 return ziplistLen(subject->ptr);
4989 } else if (subject->encoding == REDIS_ENCODING_LIST) {
4990 return listLength((list*)subject->ptr);
4991 } else {
4992 redisPanic("Unknown list encoding");
4993 }
4994 }
4995
4996 /* Structure to hold the list iteration abstraction. */
4997 typedef struct {
4998 robj *subject; /* List object being iterated */
4999 unsigned char encoding; /* Encoding of subject when the iterator was created */
5000 unsigned char direction; /* Iteration direction */
5001 unsigned char *zi; /* Current position when subject is a ziplist */
5002 listNode *ln; /* Current position when subject is a linked list */
5003 } listTypeIterator;
5004
5005 /* Structure for an entry while iterating over a list. */
5006 typedef struct {
5007 listTypeIterator *li; /* Iterator that produced this entry */
5008 unsigned char *zi; /* Entry in ziplist */
5009 listNode *ln; /* Entry in linked list */
5010 } listTypeEntry;
5011
5012 /* Initialize an iterator at the specified index. */
5013 static listTypeIterator *listTypeInitIterator(robj *subject, int index, unsigned char direction) {
5014 listTypeIterator *li = zmalloc(sizeof(listTypeIterator));
5015 li->subject = subject;
5016 li->encoding = subject->encoding;
5017 li->direction = direction;
5018 if (li->encoding == REDIS_ENCODING_ZIPLIST) {
5019 li->zi = ziplistIndex(subject->ptr,index);
5020 } else if (li->encoding == REDIS_ENCODING_LIST) {
5021 li->ln = listIndex(subject->ptr,index);
5022 } else {
5023 redisPanic("Unknown list encoding");
5024 }
5025 return li;
5026 }
5027
5028 /* Clean up the iterator. */
5029 static void listTypeReleaseIterator(listTypeIterator *li) {
5030 zfree(li);
5031 }
5032
5033 /* Stores pointer to current the entry in the provided entry structure
5034 * and advances the position of the iterator. Returns 1 when the current
5035 * entry is in fact an entry, 0 otherwise. */
5036 static int listTypeNext(listTypeIterator *li, listTypeEntry *entry) {
5037 /* Protect from converting when iterating */
5038 redisAssert(li->subject->encoding == li->encoding);
5039
5040 entry->li = li;
5041 if (li->encoding == REDIS_ENCODING_ZIPLIST) {
5042 entry->zi = li->zi;
5043 if (entry->zi != NULL) {
5044 if (li->direction == REDIS_TAIL)
5045 li->zi = ziplistNext(li->subject->ptr,li->zi);
5046 else
5047 li->zi = ziplistPrev(li->subject->ptr,li->zi);
5048 return 1;
5049 }
5050 } else if (li->encoding == REDIS_ENCODING_LIST) {
5051 entry->ln = li->ln;
5052 if (entry->ln != NULL) {
5053 if (li->direction == REDIS_TAIL)
5054 li->ln = li->ln->next;
5055 else
5056 li->ln = li->ln->prev;
5057 return 1;
5058 }
5059 } else {
5060 redisPanic("Unknown list encoding");
5061 }
5062 return 0;
5063 }
5064
5065 /* Return entry or NULL at the current position of the iterator. */
5066 static robj *listTypeGet(listTypeEntry *entry) {
5067 listTypeIterator *li = entry->li;
5068 robj *value = NULL;
5069 if (li->encoding == REDIS_ENCODING_ZIPLIST) {
5070 unsigned char *vstr;
5071 unsigned int vlen;
5072 long long vlong;
5073 redisAssert(entry->zi != NULL);
5074 if (ziplistGet(entry->zi,&vstr,&vlen,&vlong)) {
5075 if (vstr) {
5076 value = createStringObject((char*)vstr,vlen);
5077 } else {
5078 value = createStringObjectFromLongLong(vlong);
5079 }
5080 }
5081 } else if (li->encoding == REDIS_ENCODING_LIST) {
5082 redisAssert(entry->ln != NULL);
5083 value = listNodeValue(entry->ln);
5084 incrRefCount(value);
5085 } else {
5086 redisPanic("Unknown list encoding");
5087 }
5088 return value;
5089 }
5090
5091 static void listTypeInsert(listTypeEntry *entry, robj *value, int where) {
5092 robj *subject = entry->li->subject;
5093 if (entry->li->encoding == REDIS_ENCODING_ZIPLIST) {
5094 value = getDecodedObject(value);
5095 if (where == REDIS_TAIL) {
5096 unsigned char *next = ziplistNext(subject->ptr,entry->zi);
5097
5098 /* When we insert after the current element, but the current element
5099 * is the tail of the list, we need to do a push. */
5100 if (next == NULL) {
5101 subject->ptr = ziplistPush(subject->ptr,value->ptr,sdslen(value->ptr),REDIS_TAIL);
5102 } else {
5103 subject->ptr = ziplistInsert(subject->ptr,next,value->ptr,sdslen(value->ptr));
5104 }
5105 } else {
5106 subject->ptr = ziplistInsert(subject->ptr,entry->zi,value->ptr,sdslen(value->ptr));
5107 }
5108 decrRefCount(value);
5109 } else if (entry->li->encoding == REDIS_ENCODING_LIST) {
5110 if (where == REDIS_TAIL) {
5111 listInsertNode(subject->ptr,entry->ln,value,AL_START_TAIL);
5112 } else {
5113 listInsertNode(subject->ptr,entry->ln,value,AL_START_HEAD);
5114 }
5115 incrRefCount(value);
5116 } else {
5117 redisPanic("Unknown list encoding");
5118 }
5119 }
5120
5121 /* Compare the given object with the entry at the current position. */
5122 static int listTypeEqual(listTypeEntry *entry, robj *o) {
5123 listTypeIterator *li = entry->li;
5124 if (li->encoding == REDIS_ENCODING_ZIPLIST) {
5125 redisAssert(o->encoding == REDIS_ENCODING_RAW);
5126 return ziplistCompare(entry->zi,o->ptr,sdslen(o->ptr));
5127 } else if (li->encoding == REDIS_ENCODING_LIST) {
5128 return equalStringObjects(o,listNodeValue(entry->ln));
5129 } else {
5130 redisPanic("Unknown list encoding");
5131 }
5132 }
5133
5134 /* Delete the element pointed to. */
5135 static void listTypeDelete(listTypeEntry *entry) {
5136 listTypeIterator *li = entry->li;
5137 if (li->encoding == REDIS_ENCODING_ZIPLIST) {
5138 unsigned char *p = entry->zi;
5139 li->subject->ptr = ziplistDelete(li->subject->ptr,&p);
5140
5141 /* Update position of the iterator depending on the direction */
5142 if (li->direction == REDIS_TAIL)
5143 li->zi = p;
5144 else
5145 li->zi = ziplistPrev(li->subject->ptr,p);
5146 } else if (entry->li->encoding == REDIS_ENCODING_LIST) {
5147 listNode *next;
5148 if (li->direction == REDIS_TAIL)
5149 next = entry->ln->next;
5150 else
5151 next = entry->ln->prev;
5152 listDelNode(li->subject->ptr,entry->ln);
5153 li->ln = next;
5154 } else {
5155 redisPanic("Unknown list encoding");
5156 }
5157 }
5158
5159 static void listTypeConvert(robj *subject, int enc) {
5160 listTypeIterator *li;
5161 listTypeEntry entry;
5162 redisAssert(subject->type == REDIS_LIST);
5163
5164 if (enc == REDIS_ENCODING_LIST) {
5165 list *l = listCreate();
5166 listSetFreeMethod(l,decrRefCount);
5167
5168 /* listTypeGet returns a robj with incremented refcount */
5169 li = listTypeInitIterator(subject,0,REDIS_TAIL);
5170 while (listTypeNext(li,&entry)) listAddNodeTail(l,listTypeGet(&entry));
5171 listTypeReleaseIterator(li);
5172
5173 subject->encoding = REDIS_ENCODING_LIST;
5174 zfree(subject->ptr);
5175 subject->ptr = l;
5176 } else {
5177 redisPanic("Unsupported list conversion");
5178 }
5179 }
5180
5181 static void pushGenericCommand(redisClient *c, int where) {
5182 robj *lobj = lookupKeyWrite(c->db,c->argv[1]);
5183 if (lobj == NULL) {
5184 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
5185 addReply(c,shared.cone);
5186 return;
5187 }
5188 lobj = createZiplistObject();
5189 dbAdd(c->db,c->argv[1],lobj);
5190 } else {
5191 if (lobj->type != REDIS_LIST) {
5192 addReply(c,shared.wrongtypeerr);
5193 return;
5194 }
5195 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
5196 addReply(c,shared.cone);
5197 return;
5198 }
5199 }
5200 listTypePush(lobj,c->argv[2],where);
5201 addReplyLongLong(c,listTypeLength(lobj));
5202 server.dirty++;
5203 }
5204
5205 static void lpushCommand(redisClient *c) {
5206 pushGenericCommand(c,REDIS_HEAD);
5207 }
5208
5209 static void rpushCommand(redisClient *c) {
5210 pushGenericCommand(c,REDIS_TAIL);
5211 }
5212
5213 static void pushxGenericCommand(redisClient *c, robj *refval, robj *val, int where) {
5214 robj *subject;
5215 listTypeIterator *iter;
5216 listTypeEntry entry;
5217
5218 if ((subject = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
5219 checkType(c,subject,REDIS_LIST)) return;
5220 if (handleClientsWaitingListPush(c,c->argv[1],val)) {
5221 addReply(c,shared.cone);
5222 return;
5223 }
5224
5225 if (refval != NULL) {
5226 /* Note: we expect refval to be string-encoded because it is *not* the
5227 * last argument of the multi-bulk LINSERT. */
5228 redisAssert(refval->encoding == REDIS_ENCODING_RAW);
5229
5230 /* Seek refval from head to tail */
5231 iter = listTypeInitIterator(subject,0,REDIS_TAIL);
5232 while (listTypeNext(iter,&entry)) {
5233 if (listTypeEqual(&entry,refval)) {
5234 listTypeInsert(&entry,val,where);
5235 break;
5236 }
5237 }
5238 listTypeReleaseIterator(iter);
5239 } else {
5240 listTypePush(subject,val,where);
5241 }
5242
5243 server.dirty++;
5244 addReplyUlong(c,listTypeLength(subject));
5245 }
5246
5247 static void lpushxCommand(redisClient *c) {
5248 pushxGenericCommand(c,NULL,c->argv[2],REDIS_HEAD);
5249 }
5250
5251 static void rpushxCommand(redisClient *c) {
5252 pushxGenericCommand(c,NULL,c->argv[2],REDIS_TAIL);
5253 }
5254
5255 static void linsertCommand(redisClient *c) {
5256 if (strcasecmp(c->argv[2]->ptr,"after") == 0) {
5257 pushxGenericCommand(c,c->argv[3],c->argv[4],REDIS_TAIL);
5258 } else if (strcasecmp(c->argv[2]->ptr,"before") == 0) {
5259 pushxGenericCommand(c,c->argv[3],c->argv[4],REDIS_HEAD);
5260 } else {
5261 addReply(c,shared.syntaxerr);
5262 }
5263 }
5264
5265 static void llenCommand(redisClient *c) {
5266 robj *o = lookupKeyReadOrReply(c,c->argv[1],shared.czero);
5267 if (o == NULL || checkType(c,o,REDIS_LIST)) return;
5268 addReplyUlong(c,listTypeLength(o));
5269 }
5270
5271 static void lindexCommand(redisClient *c) {
5272 robj *o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk);
5273 if (o == NULL || checkType(c,o,REDIS_LIST)) return;
5274 int index = atoi(c->argv[2]->ptr);
5275 robj *value = NULL;
5276
5277 if (o->encoding == REDIS_ENCODING_ZIPLIST) {
5278 unsigned char *p;
5279 unsigned char *vstr;
5280 unsigned int vlen;
5281 long long vlong;
5282 p = ziplistIndex(o->ptr,index);
5283 if (ziplistGet(p,&vstr,&vlen,&vlong)) {
5284 if (vstr) {
5285 value = createStringObject((char*)vstr,vlen);
5286 } else {
5287 value = createStringObjectFromLongLong(vlong);
5288 }
5289 addReplyBulk(c,value);
5290 decrRefCount(value);
5291 } else {
5292 addReply(c,shared.nullbulk);
5293 }
5294 } else if (o->encoding == REDIS_ENCODING_LIST) {
5295 listNode *ln = listIndex(o->ptr,index);
5296 if (ln != NULL) {
5297 value = listNodeValue(ln);
5298 addReplyBulk(c,value);
5299 } else {
5300 addReply(c,shared.nullbulk);
5301 }
5302 } else {
5303 redisPanic("Unknown list encoding");
5304 }
5305 }
5306
5307 static void lsetCommand(redisClient *c) {
5308 robj *o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr);
5309 if (o == NULL || checkType(c,o,REDIS_LIST)) return;
5310 int index = atoi(c->argv[2]->ptr);
5311 robj *value = c->argv[3];
5312
5313 listTypeTryConversion(o,value);
5314 if (o->encoding == REDIS_ENCODING_ZIPLIST) {
5315 unsigned char *p, *zl = o->ptr;
5316 p = ziplistIndex(zl,index);
5317 if (p == NULL) {
5318 addReply(c,shared.outofrangeerr);
5319 } else {
5320 o->ptr = ziplistDelete(o->ptr,&p);
5321 value = getDecodedObject(value);
5322 o->ptr = ziplistInsert(o->ptr,p,value->ptr,sdslen(value->ptr));
5323 decrRefCount(value);
5324 addReply(c,shared.ok);
5325 server.dirty++;
5326 }
5327 } else if (o->encoding == REDIS_ENCODING_LIST) {
5328 listNode *ln = listIndex(o->ptr,index);
5329 if (ln == NULL) {
5330 addReply(c,shared.outofrangeerr);
5331 } else {
5332 decrRefCount((robj*)listNodeValue(ln));
5333 listNodeValue(ln) = value;
5334 incrRefCount(value);
5335 addReply(c,shared.ok);
5336 server.dirty++;
5337 }
5338 } else {
5339 redisPanic("Unknown list encoding");
5340 }
5341 }
5342
5343 static void popGenericCommand(redisClient *c, int where) {
5344 robj *o = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk);
5345 if (o == NULL || checkType(c,o,REDIS_LIST)) return;
5346
5347 robj *value = listTypePop(o,where);
5348 if (value == NULL) {
5349 addReply(c,shared.nullbulk);
5350 } else {
5351 addReplyBulk(c,value);
5352 decrRefCount(value);
5353 if (listTypeLength(o) == 0) dbDelete(c->db,c->argv[1]);
5354 server.dirty++;
5355 }
5356 }
5357
5358 static void lpopCommand(redisClient *c) {
5359 popGenericCommand(c,REDIS_HEAD);
5360 }
5361
5362 static void rpopCommand(redisClient *c) {
5363 popGenericCommand(c,REDIS_TAIL);
5364 }
5365
5366 static void lrangeCommand(redisClient *c) {
5367 robj *o, *value;
5368 int start = atoi(c->argv[2]->ptr);
5369 int end = atoi(c->argv[3]->ptr);
5370 int llen;
5371 int rangelen, j;
5372 listTypeEntry entry;
5373
5374 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
5375 || checkType(c,o,REDIS_LIST)) return;
5376 llen = listTypeLength(o);
5377
5378 /* convert negative indexes */
5379 if (start < 0) start = llen+start;
5380 if (end < 0) end = llen+end;
5381 if (start < 0) start = 0;
5382 if (end < 0) end = 0;
5383
5384 /* indexes sanity checks */
5385 if (start > end || start >= llen) {
5386 /* Out of range start or start > end result in empty list */
5387 addReply(c,shared.emptymultibulk);
5388 return;
5389 }
5390 if (end >= llen) end = llen-1;
5391 rangelen = (end-start)+1;
5392
5393 /* Return the result in form of a multi-bulk reply */
5394 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",rangelen));
5395 listTypeIterator *li = listTypeInitIterator(o,start,REDIS_TAIL);
5396 for (j = 0; j < rangelen; j++) {
5397 redisAssert(listTypeNext(li,&entry));
5398 value = listTypeGet(&entry);
5399 addReplyBulk(c,value);
5400 decrRefCount(value);
5401 }
5402 listTypeReleaseIterator(li);
5403 }
5404
5405 static void ltrimCommand(redisClient *c) {
5406 robj *o;
5407 int start = atoi(c->argv[2]->ptr);
5408 int end = atoi(c->argv[3]->ptr);
5409 int llen;
5410 int j, ltrim, rtrim;
5411 list *list;
5412 listNode *ln;
5413
5414 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.ok)) == NULL ||
5415 checkType(c,o,REDIS_LIST)) return;
5416 llen = listTypeLength(o);
5417
5418 /* convert negative indexes */
5419 if (start < 0) start = llen+start;
5420 if (end < 0) end = llen+end;
5421 if (start < 0) start = 0;
5422 if (end < 0) end = 0;
5423
5424 /* indexes sanity checks */
5425 if (start > end || start >= llen) {
5426 /* Out of range start or start > end result in empty list */
5427 ltrim = llen;
5428 rtrim = 0;
5429 } else {
5430 if (end >= llen) end = llen-1;
5431 ltrim = start;
5432 rtrim = llen-end-1;
5433 }
5434
5435 /* Remove list elements to perform the trim */
5436 if (o->encoding == REDIS_ENCODING_ZIPLIST) {
5437 o->ptr = ziplistDeleteRange(o->ptr,0,ltrim);
5438 o->ptr = ziplistDeleteRange(o->ptr,-rtrim,rtrim);
5439 } else if (o->encoding == REDIS_ENCODING_LIST) {
5440 list = o->ptr;
5441 for (j = 0; j < ltrim; j++) {
5442 ln = listFirst(list);
5443 listDelNode(list,ln);
5444 }
5445 for (j = 0; j < rtrim; j++) {
5446 ln = listLast(list);
5447 listDelNode(list,ln);
5448 }
5449 } else {
5450 redisPanic("Unknown list encoding");
5451 }
5452 if (listTypeLength(o) == 0) dbDelete(c->db,c->argv[1]);
5453 server.dirty++;
5454 addReply(c,shared.ok);
5455 }
5456
5457 static void lremCommand(redisClient *c) {
5458 robj *subject, *obj = c->argv[3];
5459 int toremove = atoi(c->argv[2]->ptr);
5460 int removed = 0;
5461 listTypeEntry entry;
5462
5463 subject = lookupKeyWriteOrReply(c,c->argv[1],shared.czero);
5464 if (subject == NULL || checkType(c,subject,REDIS_LIST)) return;
5465
5466 /* Make sure obj is raw when we're dealing with a ziplist */
5467 if (subject->encoding == REDIS_ENCODING_ZIPLIST)
5468 obj = getDecodedObject(obj);
5469
5470 listTypeIterator *li;
5471 if (toremove < 0) {
5472 toremove = -toremove;
5473 li = listTypeInitIterator(subject,-1,REDIS_HEAD);
5474 } else {
5475 li = listTypeInitIterator(subject,0,REDIS_TAIL);
5476 }
5477
5478 while (listTypeNext(li,&entry)) {
5479 if (listTypeEqual(&entry,obj)) {
5480 listTypeDelete(&entry);
5481 server.dirty++;
5482 removed++;
5483 if (toremove && removed == toremove) break;
5484 }
5485 }
5486 listTypeReleaseIterator(li);
5487
5488 /* Clean up raw encoded object */
5489 if (subject->encoding == REDIS_ENCODING_ZIPLIST)
5490 decrRefCount(obj);
5491
5492 if (listTypeLength(subject) == 0) dbDelete(c->db,c->argv[1]);
5493 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",removed));
5494 }
5495
5496 /* This is the semantic of this command:
5497 * RPOPLPUSH srclist dstlist:
5498 * IF LLEN(srclist) > 0
5499 * element = RPOP srclist
5500 * LPUSH dstlist element
5501 * RETURN element
5502 * ELSE
5503 * RETURN nil
5504 * END
5505 * END
5506 *
5507 * The idea is to be able to get an element from a list in a reliable way
5508 * since the element is not just returned but pushed against another list
5509 * as well. This command was originally proposed by Ezra Zygmuntowicz.
5510 */
5511 static void rpoplpushcommand(redisClient *c) {
5512 robj *sobj, *value;
5513 if ((sobj = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5514 checkType(c,sobj,REDIS_LIST)) return;
5515
5516 if (listTypeLength(sobj) == 0) {
5517 addReply(c,shared.nullbulk);
5518 } else {
5519 robj *dobj = lookupKeyWrite(c->db,c->argv[2]);
5520 if (dobj && checkType(c,dobj,REDIS_LIST)) return;
5521 value = listTypePop(sobj,REDIS_TAIL);
5522
5523 /* Add the element to the target list (unless it's directly
5524 * passed to some BLPOP-ing client */
5525 if (!handleClientsWaitingListPush(c,c->argv[2],value)) {
5526 /* Create the list if the key does not exist */
5527 if (!dobj) {
5528 dobj = createZiplistObject();
5529 dbAdd(c->db,c->argv[2],dobj);
5530 }
5531 listTypePush(dobj,value,REDIS_HEAD);
5532 }
5533
5534 /* Send the element to the client as reply as well */
5535 addReplyBulk(c,value);
5536
5537 /* listTypePop returns an object with its refcount incremented */
5538 decrRefCount(value);
5539
5540 /* Delete the source list when it is empty */
5541 if (listTypeLength(sobj) == 0) dbDelete(c->db,c->argv[1]);
5542 server.dirty++;
5543 }
5544 }
5545
5546 /* ==================================== Sets ================================ */
5547
5548 static void saddCommand(redisClient *c) {
5549 robj *set;
5550
5551 set = lookupKeyWrite(c->db,c->argv[1]);
5552 if (set == NULL) {
5553 set = createSetObject();
5554 dbAdd(c->db,c->argv[1],set);
5555 } else {
5556 if (set->type != REDIS_SET) {
5557 addReply(c,shared.wrongtypeerr);
5558 return;
5559 }
5560 }
5561 if (dictAdd(set->ptr,c->argv[2],NULL) == DICT_OK) {
5562 incrRefCount(c->argv[2]);
5563 server.dirty++;
5564 addReply(c,shared.cone);
5565 } else {
5566 addReply(c,shared.czero);
5567 }
5568 }
5569
5570 static void sremCommand(redisClient *c) {
5571 robj *set;
5572
5573 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5574 checkType(c,set,REDIS_SET)) return;
5575
5576 if (dictDelete(set->ptr,c->argv[2]) == DICT_OK) {
5577 server.dirty++;
5578 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
5579 if (dictSize((dict*)set->ptr) == 0) dbDelete(c->db,c->argv[1]);
5580 addReply(c,shared.cone);
5581 } else {
5582 addReply(c,shared.czero);
5583 }
5584 }
5585
5586 static void smoveCommand(redisClient *c) {
5587 robj *srcset, *dstset;
5588
5589 srcset = lookupKeyWrite(c->db,c->argv[1]);
5590 dstset = lookupKeyWrite(c->db,c->argv[2]);
5591
5592 /* If the source key does not exist return 0, if it's of the wrong type
5593 * raise an error */
5594 if (srcset == NULL || srcset->type != REDIS_SET) {
5595 addReply(c, srcset ? shared.wrongtypeerr : shared.czero);
5596 return;
5597 }
5598 /* Error if the destination key is not a set as well */
5599 if (dstset && dstset->type != REDIS_SET) {
5600 addReply(c,shared.wrongtypeerr);
5601 return;
5602 }
5603 /* Remove the element from the source set */
5604 if (dictDelete(srcset->ptr,c->argv[3]) == DICT_ERR) {
5605 /* Key not found in the src set! return zero */
5606 addReply(c,shared.czero);
5607 return;
5608 }
5609 if (dictSize((dict*)srcset->ptr) == 0 && srcset != dstset)
5610 dbDelete(c->db,c->argv[1]);
5611 server.dirty++;
5612 /* Add the element to the destination set */
5613 if (!dstset) {
5614 dstset = createSetObject();
5615 dbAdd(c->db,c->argv[2],dstset);
5616 }
5617 if (dictAdd(dstset->ptr,c->argv[3],NULL) == DICT_OK)
5618 incrRefCount(c->argv[3]);
5619 addReply(c,shared.cone);
5620 }
5621
5622 static void sismemberCommand(redisClient *c) {
5623 robj *set;
5624
5625 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
5626 checkType(c,set,REDIS_SET)) return;
5627
5628 if (dictFind(set->ptr,c->argv[2]))
5629 addReply(c,shared.cone);
5630 else
5631 addReply(c,shared.czero);
5632 }
5633
5634 static void scardCommand(redisClient *c) {
5635 robj *o;
5636 dict *s;
5637
5638 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
5639 checkType(c,o,REDIS_SET)) return;
5640
5641 s = o->ptr;
5642 addReplyUlong(c,dictSize(s));
5643 }
5644
5645 static void spopCommand(redisClient *c) {
5646 robj *set;
5647 dictEntry *de;
5648
5649 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5650 checkType(c,set,REDIS_SET)) return;
5651
5652 de = dictGetRandomKey(set->ptr);
5653 if (de == NULL) {
5654 addReply(c,shared.nullbulk);
5655 } else {
5656 robj *ele = dictGetEntryKey(de);
5657
5658 addReplyBulk(c,ele);
5659 dictDelete(set->ptr,ele);
5660 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
5661 if (dictSize((dict*)set->ptr) == 0) dbDelete(c->db,c->argv[1]);
5662 server.dirty++;
5663 }
5664 }
5665
5666 static void srandmemberCommand(redisClient *c) {
5667 robj *set;
5668 dictEntry *de;
5669
5670 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5671 checkType(c,set,REDIS_SET)) return;
5672
5673 de = dictGetRandomKey(set->ptr);
5674 if (de == NULL) {
5675 addReply(c,shared.nullbulk);
5676 } else {
5677 robj *ele = dictGetEntryKey(de);
5678
5679 addReplyBulk(c,ele);
5680 }
5681 }
5682
5683 static int qsortCompareSetsByCardinality(const void *s1, const void *s2) {
5684 dict **d1 = (void*) s1, **d2 = (void*) s2;
5685
5686 return dictSize(*d1)-dictSize(*d2);
5687 }
5688
5689 static void sinterGenericCommand(redisClient *c, robj **setskeys, unsigned long setsnum, robj *dstkey) {
5690 dict **dv = zmalloc(sizeof(dict*)*setsnum);
5691 dictIterator *di;
5692 dictEntry *de;
5693 robj *lenobj = NULL, *dstset = NULL;
5694 unsigned long j, cardinality = 0;
5695
5696 for (j = 0; j < setsnum; j++) {
5697 robj *setobj;
5698
5699 setobj = dstkey ?
5700 lookupKeyWrite(c->db,setskeys[j]) :
5701 lookupKeyRead(c->db,setskeys[j]);
5702 if (!setobj) {
5703 zfree(dv);
5704 if (dstkey) {
5705 if (dbDelete(c->db,dstkey))
5706 server.dirty++;
5707 addReply(c,shared.czero);
5708 } else {
5709 addReply(c,shared.emptymultibulk);
5710 }
5711 return;
5712 }
5713 if (setobj->type != REDIS_SET) {
5714 zfree(dv);
5715 addReply(c,shared.wrongtypeerr);
5716 return;
5717 }
5718 dv[j] = setobj->ptr;
5719 }
5720 /* Sort sets from the smallest to largest, this will improve our
5721 * algorithm's performace */
5722 qsort(dv,setsnum,sizeof(dict*),qsortCompareSetsByCardinality);
5723
5724 /* The first thing we should output is the total number of elements...
5725 * since this is a multi-bulk write, but at this stage we don't know
5726 * the intersection set size, so we use a trick, append an empty object
5727 * to the output list and save the pointer to later modify it with the
5728 * right length */
5729 if (!dstkey) {
5730 lenobj = createObject(REDIS_STRING,NULL);
5731 addReply(c,lenobj);
5732 decrRefCount(lenobj);
5733 } else {
5734 /* If we have a target key where to store the resulting set
5735 * create this key with an empty set inside */
5736 dstset = createSetObject();
5737 }
5738
5739 /* Iterate all the elements of the first (smallest) set, and test
5740 * the element against all the other sets, if at least one set does
5741 * not include the element it is discarded */
5742 di = dictGetIterator(dv[0]);
5743
5744 while((de = dictNext(di)) != NULL) {
5745 robj *ele;
5746
5747 for (j = 1; j < setsnum; j++)
5748 if (dictFind(dv[j],dictGetEntryKey(de)) == NULL) break;
5749 if (j != setsnum)
5750 continue; /* at least one set does not contain the member */
5751 ele = dictGetEntryKey(de);
5752 if (!dstkey) {
5753 addReplyBulk(c,ele);
5754 cardinality++;
5755 } else {
5756 dictAdd(dstset->ptr,ele,NULL);
5757 incrRefCount(ele);
5758 }
5759 }
5760 dictReleaseIterator(di);
5761
5762 if (dstkey) {
5763 /* Store the resulting set into the target, if the intersection
5764 * is not an empty set. */
5765 dbDelete(c->db,dstkey);
5766 if (dictSize((dict*)dstset->ptr) > 0) {
5767 dbAdd(c->db,dstkey,dstset);
5768 addReplyLongLong(c,dictSize((dict*)dstset->ptr));
5769 } else {
5770 decrRefCount(dstset);
5771 addReply(c,shared.czero);
5772 }
5773 server.dirty++;
5774 } else {
5775 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",cardinality);
5776 }
5777 zfree(dv);
5778 }
5779
5780 static void sinterCommand(redisClient *c) {
5781 sinterGenericCommand(c,c->argv+1,c->argc-1,NULL);
5782 }
5783
5784 static void sinterstoreCommand(redisClient *c) {
5785 sinterGenericCommand(c,c->argv+2,c->argc-2,c->argv[1]);
5786 }
5787
5788 #define REDIS_OP_UNION 0
5789 #define REDIS_OP_DIFF 1
5790 #define REDIS_OP_INTER 2
5791
5792 static void sunionDiffGenericCommand(redisClient *c, robj **setskeys, int setsnum, robj *dstkey, int op) {
5793 dict **dv = zmalloc(sizeof(dict*)*setsnum);
5794 dictIterator *di;
5795 dictEntry *de;
5796 robj *dstset = NULL;
5797 int j, cardinality = 0;
5798
5799 for (j = 0; j < setsnum; j++) {
5800 robj *setobj;
5801
5802 setobj = dstkey ?
5803 lookupKeyWrite(c->db,setskeys[j]) :
5804 lookupKeyRead(c->db,setskeys[j]);
5805 if (!setobj) {
5806 dv[j] = NULL;
5807 continue;
5808 }
5809 if (setobj->type != REDIS_SET) {
5810 zfree(dv);
5811 addReply(c,shared.wrongtypeerr);
5812 return;
5813 }
5814 dv[j] = setobj->ptr;
5815 }
5816
5817 /* We need a temp set object to store our union. If the dstkey
5818 * is not NULL (that is, we are inside an SUNIONSTORE operation) then
5819 * this set object will be the resulting object to set into the target key*/
5820 dstset = createSetObject();
5821
5822 /* Iterate all the elements of all the sets, add every element a single
5823 * time to the result set */
5824 for (j = 0; j < setsnum; j++) {
5825 if (op == REDIS_OP_DIFF && j == 0 && !dv[j]) break; /* result set is empty */
5826 if (!dv[j]) continue; /* non existing keys are like empty sets */
5827
5828 di = dictGetIterator(dv[j]);
5829
5830 while((de = dictNext(di)) != NULL) {
5831 robj *ele;
5832
5833 /* dictAdd will not add the same element multiple times */
5834 ele = dictGetEntryKey(de);
5835 if (op == REDIS_OP_UNION || j == 0) {
5836 if (dictAdd(dstset->ptr,ele,NULL) == DICT_OK) {
5837 incrRefCount(ele);
5838 cardinality++;
5839 }
5840 } else if (op == REDIS_OP_DIFF) {
5841 if (dictDelete(dstset->ptr,ele) == DICT_OK) {
5842 cardinality--;
5843 }
5844 }
5845 }
5846 dictReleaseIterator(di);
5847
5848 /* result set is empty? Exit asap. */
5849 if (op == REDIS_OP_DIFF && cardinality == 0) break;
5850 }
5851
5852 /* Output the content of the resulting set, if not in STORE mode */
5853 if (!dstkey) {
5854 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",cardinality));
5855 di = dictGetIterator(dstset->ptr);
5856 while((de = dictNext(di)) != NULL) {
5857 robj *ele;
5858
5859 ele = dictGetEntryKey(de);
5860 addReplyBulk(c,ele);
5861 }
5862 dictReleaseIterator(di);
5863 decrRefCount(dstset);
5864 } else {
5865 /* If we have a target key where to store the resulting set
5866 * create this key with the result set inside */
5867 dbDelete(c->db,dstkey);
5868 if (dictSize((dict*)dstset->ptr) > 0) {
5869 dbAdd(c->db,dstkey,dstset);
5870 addReplyLongLong(c,dictSize((dict*)dstset->ptr));
5871 } else {
5872 decrRefCount(dstset);
5873 addReply(c,shared.czero);
5874 }
5875 server.dirty++;
5876 }
5877 zfree(dv);
5878 }
5879
5880 static void sunionCommand(redisClient *c) {
5881 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_UNION);
5882 }
5883
5884 static void sunionstoreCommand(redisClient *c) {
5885 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_UNION);
5886 }
5887
5888 static void sdiffCommand(redisClient *c) {
5889 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_DIFF);
5890 }
5891
5892 static void sdiffstoreCommand(redisClient *c) {
5893 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_DIFF);
5894 }
5895
5896 /* ==================================== ZSets =============================== */
5897
5898 /* ZSETs are ordered sets using two data structures to hold the same elements
5899 * in order to get O(log(N)) INSERT and REMOVE operations into a sorted
5900 * data structure.
5901 *
5902 * The elements are added to a hash table mapping Redis objects to scores.
5903 * At the same time the elements are added to a skip list mapping scores
5904 * to Redis objects (so objects are sorted by scores in this "view"). */
5905
5906 /* This skiplist implementation is almost a C translation of the original
5907 * algorithm described by William Pugh in "Skip Lists: A Probabilistic
5908 * Alternative to Balanced Trees", modified in three ways:
5909 * a) this implementation allows for repeated values.
5910 * b) the comparison is not just by key (our 'score') but by satellite data.
5911 * c) there is a back pointer, so it's a doubly linked list with the back
5912 * pointers being only at "level 1". This allows to traverse the list
5913 * from tail to head, useful for ZREVRANGE. */
5914
5915 static zskiplistNode *zslCreateNode(int level, double score, robj *obj) {
5916 zskiplistNode *zn = zmalloc(sizeof(*zn));
5917
5918 zn->forward = zmalloc(sizeof(zskiplistNode*) * level);
5919 if (level > 1)
5920 zn->span = zmalloc(sizeof(unsigned int) * (level - 1));
5921 else
5922 zn->span = NULL;
5923 zn->score = score;
5924 zn->obj = obj;
5925 return zn;
5926 }
5927
5928 static zskiplist *zslCreate(void) {
5929 int j;
5930 zskiplist *zsl;
5931
5932 zsl = zmalloc(sizeof(*zsl));
5933 zsl->level = 1;
5934 zsl->length = 0;
5935 zsl->header = zslCreateNode(ZSKIPLIST_MAXLEVEL,0,NULL);
5936 for (j = 0; j < ZSKIPLIST_MAXLEVEL; j++) {
5937 zsl->header->forward[j] = NULL;
5938
5939 /* span has space for ZSKIPLIST_MAXLEVEL-1 elements */
5940 if (j < ZSKIPLIST_MAXLEVEL-1)
5941 zsl->header->span[j] = 0;
5942 }
5943 zsl->header->backward = NULL;
5944 zsl->tail = NULL;
5945 return zsl;
5946 }
5947
5948 static void zslFreeNode(zskiplistNode *node) {
5949 decrRefCount(node->obj);
5950 zfree(node->forward);
5951 zfree(node->span);
5952 zfree(node);
5953 }
5954
5955 static void zslFree(zskiplist *zsl) {
5956 zskiplistNode *node = zsl->header->forward[0], *next;
5957
5958 zfree(zsl->header->forward);
5959 zfree(zsl->header->span);
5960 zfree(zsl->header);
5961 while(node) {
5962 next = node->forward[0];
5963 zslFreeNode(node);
5964 node = next;
5965 }
5966 zfree(zsl);
5967 }
5968
5969 static int zslRandomLevel(void) {
5970 int level = 1;
5971 while ((random()&0xFFFF) < (ZSKIPLIST_P * 0xFFFF))
5972 level += 1;
5973 return (level<ZSKIPLIST_MAXLEVEL) ? level : ZSKIPLIST_MAXLEVEL;
5974 }
5975
5976 static void zslInsert(zskiplist *zsl, double score, robj *obj) {
5977 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5978 unsigned int rank[ZSKIPLIST_MAXLEVEL];
5979 int i, level;
5980
5981 x = zsl->header;
5982 for (i = zsl->level-1; i >= 0; i--) {
5983 /* store rank that is crossed to reach the insert position */
5984 rank[i] = i == (zsl->level-1) ? 0 : rank[i+1];
5985
5986 while (x->forward[i] &&
5987 (x->forward[i]->score < score ||
5988 (x->forward[i]->score == score &&
5989 compareStringObjects(x->forward[i]->obj,obj) < 0))) {
5990 rank[i] += i > 0 ? x->span[i-1] : 1;
5991 x = x->forward[i];
5992 }
5993 update[i] = x;
5994 }
5995 /* we assume the key is not already inside, since we allow duplicated
5996 * scores, and the re-insertion of score and redis object should never
5997 * happpen since the caller of zslInsert() should test in the hash table
5998 * if the element is already inside or not. */
5999 level = zslRandomLevel();
6000 if (level > zsl->level) {
6001 for (i = zsl->level; i < level; i++) {
6002 rank[i] = 0;
6003 update[i] = zsl->header;
6004 update[i]->span[i-1] = zsl->length;
6005 }
6006 zsl->level = level;
6007 }
6008 x = zslCreateNode(level,score,obj);
6009 for (i = 0; i < level; i++) {
6010 x->forward[i] = update[i]->forward[i];
6011 update[i]->forward[i] = x;
6012
6013 /* update span covered by update[i] as x is inserted here */
6014 if (i > 0) {
6015 x->span[i-1] = update[i]->span[i-1] - (rank[0] - rank[i]);
6016 update[i]->span[i-1] = (rank[0] - rank[i]) + 1;
6017 }
6018 }
6019
6020 /* increment span for untouched levels */
6021 for (i = level; i < zsl->level; i++) {
6022 update[i]->span[i-1]++;
6023 }
6024
6025 x->backward = (update[0] == zsl->header) ? NULL : update[0];
6026 if (x->forward[0])
6027 x->forward[0]->backward = x;
6028 else
6029 zsl->tail = x;
6030 zsl->length++;
6031 }
6032
6033 /* Internal function used by zslDelete, zslDeleteByScore and zslDeleteByRank */
6034 void zslDeleteNode(zskiplist *zsl, zskiplistNode *x, zskiplistNode **update) {
6035 int i;
6036 for (i = 0; i < zsl->level; i++) {
6037 if (update[i]->forward[i] == x) {
6038 if (i > 0) {
6039 update[i]->span[i-1] += x->span[i-1] - 1;
6040 }
6041 update[i]->forward[i] = x->forward[i];
6042 } else {
6043 /* invariant: i > 0, because update[0]->forward[0]
6044 * is always equal to x */
6045 update[i]->span[i-1] -= 1;
6046 }
6047 }
6048 if (x->forward[0]) {
6049 x->forward[0]->backward = x->backward;
6050 } else {
6051 zsl->tail = x->backward;
6052 }
6053 while(zsl->level > 1 && zsl->header->forward[zsl->level-1] == NULL)
6054 zsl->level--;
6055 zsl->length--;
6056 }
6057
6058 /* Delete an element with matching score/object from the skiplist. */
6059 static int zslDelete(zskiplist *zsl, double score, robj *obj) {
6060 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
6061 int i;
6062
6063 x = zsl->header;
6064 for (i = zsl->level-1; i >= 0; i--) {
6065 while (x->forward[i] &&
6066 (x->forward[i]->score < score ||
6067 (x->forward[i]->score == score &&
6068 compareStringObjects(x->forward[i]->obj,obj) < 0)))
6069 x = x->forward[i];
6070 update[i] = x;
6071 }
6072 /* We may have multiple elements with the same score, what we need
6073 * is to find the element with both the right score and object. */
6074 x = x->forward[0];
6075 if (x && score == x->score && equalStringObjects(x->obj,obj)) {
6076 zslDeleteNode(zsl, x, update);
6077 zslFreeNode(x);
6078 return 1;
6079 } else {
6080 return 0; /* not found */
6081 }
6082 return 0; /* not found */
6083 }
6084
6085 /* Delete all the elements with score between min and max from the skiplist.
6086 * Min and mx are inclusive, so a score >= min || score <= max is deleted.
6087 * Note that this function takes the reference to the hash table view of the
6088 * sorted set, in order to remove the elements from the hash table too. */
6089 static unsigned long zslDeleteRangeByScore(zskiplist *zsl, double min, double max, dict *dict) {
6090 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
6091 unsigned long removed = 0;
6092 int i;
6093
6094 x = zsl->header;
6095 for (i = zsl->level-1; i >= 0; i--) {
6096 while (x->forward[i] && x->forward[i]->score < min)
6097 x = x->forward[i];
6098 update[i] = x;
6099 }
6100 /* We may have multiple elements with the same score, what we need
6101 * is to find the element with both the right score and object. */
6102 x = x->forward[0];
6103 while (x && x->score <= max) {
6104 zskiplistNode *next = x->forward[0];
6105 zslDeleteNode(zsl, x, update);
6106 dictDelete(dict,x->obj);
6107 zslFreeNode(x);
6108 removed++;
6109 x = next;
6110 }
6111 return removed; /* not found */
6112 }
6113
6114 /* Delete all the elements with rank between start and end from the skiplist.
6115 * Start and end are inclusive. Note that start and end need to be 1-based */
6116 static unsigned long zslDeleteRangeByRank(zskiplist *zsl, unsigned int start, unsigned int end, dict *dict) {
6117 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
6118 unsigned long traversed = 0, removed = 0;
6119 int i;
6120
6121 x = zsl->header;
6122 for (i = zsl->level-1; i >= 0; i--) {
6123 while (x->forward[i] && (traversed + (i > 0 ? x->span[i-1] : 1)) < start) {
6124 traversed += i > 0 ? x->span[i-1] : 1;
6125 x = x->forward[i];
6126 }
6127 update[i] = x;
6128 }
6129
6130 traversed++;
6131 x = x->forward[0];
6132 while (x && traversed <= end) {
6133 zskiplistNode *next = x->forward[0];
6134 zslDeleteNode(zsl, x, update);
6135 dictDelete(dict,x->obj);
6136 zslFreeNode(x);
6137 removed++;
6138 traversed++;
6139 x = next;
6140 }
6141 return removed;
6142 }
6143
6144 /* Find the first node having a score equal or greater than the specified one.
6145 * Returns NULL if there is no match. */
6146 static zskiplistNode *zslFirstWithScore(zskiplist *zsl, double score) {
6147 zskiplistNode *x;
6148 int i;
6149
6150 x = zsl->header;
6151 for (i = zsl->level-1; i >= 0; i--) {
6152 while (x->forward[i] && x->forward[i]->score < score)
6153 x = x->forward[i];
6154 }
6155 /* We may have multiple elements with the same score, what we need
6156 * is to find the element with both the right score and object. */
6157 return x->forward[0];
6158 }
6159
6160 /* Find the rank for an element by both score and key.
6161 * Returns 0 when the element cannot be found, rank otherwise.
6162 * Note that the rank is 1-based due to the span of zsl->header to the
6163 * first element. */
6164 static unsigned long zslistTypeGetRank(zskiplist *zsl, double score, robj *o) {
6165 zskiplistNode *x;
6166 unsigned long rank = 0;
6167 int i;
6168
6169 x = zsl->header;
6170 for (i = zsl->level-1; i >= 0; i--) {
6171 while (x->forward[i] &&
6172 (x->forward[i]->score < score ||
6173 (x->forward[i]->score == score &&
6174 compareStringObjects(x->forward[i]->obj,o) <= 0))) {
6175 rank += i > 0 ? x->span[i-1] : 1;
6176 x = x->forward[i];
6177 }
6178
6179 /* x might be equal to zsl->header, so test if obj is non-NULL */
6180 if (x->obj && equalStringObjects(x->obj,o)) {
6181 return rank;
6182 }
6183 }
6184 return 0;
6185 }
6186
6187 /* Finds an element by its rank. The rank argument needs to be 1-based. */
6188 zskiplistNode* zslistTypeGetElementByRank(zskiplist *zsl, unsigned long rank) {
6189 zskiplistNode *x;
6190 unsigned long traversed = 0;
6191 int i;
6192
6193 x = zsl->header;
6194 for (i = zsl->level-1; i >= 0; i--) {
6195 while (x->forward[i] && (traversed + (i>0 ? x->span[i-1] : 1)) <= rank)
6196 {
6197 traversed += i > 0 ? x->span[i-1] : 1;
6198 x = x->forward[i];
6199 }
6200 if (traversed == rank) {
6201 return x;
6202 }
6203 }
6204 return NULL;
6205 }
6206
6207 /* The actual Z-commands implementations */
6208
6209 /* This generic command implements both ZADD and ZINCRBY.
6210 * scoreval is the score if the operation is a ZADD (doincrement == 0) or
6211 * the increment if the operation is a ZINCRBY (doincrement == 1). */
6212 static void zaddGenericCommand(redisClient *c, robj *key, robj *ele, double scoreval, int doincrement) {
6213 robj *zsetobj;
6214 zset *zs;
6215 double *score;
6216
6217 if (isnan(scoreval)) {
6218 addReplySds(c,sdsnew("-ERR provide score is Not A Number (nan)\r\n"));
6219 return;
6220 }
6221
6222 zsetobj = lookupKeyWrite(c->db,key);
6223 if (zsetobj == NULL) {
6224 zsetobj = createZsetObject();
6225 dbAdd(c->db,key,zsetobj);
6226 } else {
6227 if (zsetobj->type != REDIS_ZSET) {
6228 addReply(c,shared.wrongtypeerr);
6229 return;
6230 }
6231 }
6232 zs = zsetobj->ptr;
6233
6234 /* Ok now since we implement both ZADD and ZINCRBY here the code
6235 * needs to handle the two different conditions. It's all about setting
6236 * '*score', that is, the new score to set, to the right value. */
6237 score = zmalloc(sizeof(double));
6238 if (doincrement) {
6239 dictEntry *de;
6240
6241 /* Read the old score. If the element was not present starts from 0 */
6242 de = dictFind(zs->dict,ele);
6243 if (de) {
6244 double *oldscore = dictGetEntryVal(de);
6245 *score = *oldscore + scoreval;
6246 } else {
6247 *score = scoreval;
6248 }
6249 if (isnan(*score)) {
6250 addReplySds(c,
6251 sdsnew("-ERR resulting score is Not A Number (nan)\r\n"));
6252 zfree(score);
6253 /* Note that we don't need to check if the zset may be empty and
6254 * should be removed here, as we can only obtain Nan as score if
6255 * there was already an element in the sorted set. */
6256 return;
6257 }
6258 } else {
6259 *score = scoreval;
6260 }
6261
6262 /* What follows is a simple remove and re-insert operation that is common
6263 * to both ZADD and ZINCRBY... */
6264 if (dictAdd(zs->dict,ele,score) == DICT_OK) {
6265 /* case 1: New element */
6266 incrRefCount(ele); /* added to hash */
6267 zslInsert(zs->zsl,*score,ele);
6268 incrRefCount(ele); /* added to skiplist */
6269 server.dirty++;
6270 if (doincrement)
6271 addReplyDouble(c,*score);
6272 else
6273 addReply(c,shared.cone);
6274 } else {
6275 dictEntry *de;
6276 double *oldscore;
6277
6278 /* case 2: Score update operation */
6279 de = dictFind(zs->dict,ele);
6280 redisAssert(de != NULL);
6281 oldscore = dictGetEntryVal(de);
6282 if (*score != *oldscore) {
6283 int deleted;
6284
6285 /* Remove and insert the element in the skip list with new score */
6286 deleted = zslDelete(zs->zsl,*oldscore,ele);
6287 redisAssert(deleted != 0);
6288 zslInsert(zs->zsl,*score,ele);
6289 incrRefCount(ele);
6290 /* Update the score in the hash table */
6291 dictReplace(zs->dict,ele,score);
6292 server.dirty++;
6293 } else {
6294 zfree(score);
6295 }
6296 if (doincrement)
6297 addReplyDouble(c,*score);
6298 else
6299 addReply(c,shared.czero);
6300 }
6301 }
6302
6303 static void zaddCommand(redisClient *c) {
6304 double scoreval;
6305
6306 if (getDoubleFromObjectOrReply(c, c->argv[2], &scoreval, NULL) != REDIS_OK) return;
6307 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,0);
6308 }
6309
6310 static void zincrbyCommand(redisClient *c) {
6311 double scoreval;
6312
6313 if (getDoubleFromObjectOrReply(c, c->argv[2], &scoreval, NULL) != REDIS_OK) return;
6314 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,1);
6315 }
6316
6317 static void zremCommand(redisClient *c) {
6318 robj *zsetobj;
6319 zset *zs;
6320 dictEntry *de;
6321 double *oldscore;
6322 int deleted;
6323
6324 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
6325 checkType(c,zsetobj,REDIS_ZSET)) return;
6326
6327 zs = zsetobj->ptr;
6328 de = dictFind(zs->dict,c->argv[2]);
6329 if (de == NULL) {
6330 addReply(c,shared.czero);
6331 return;
6332 }
6333 /* Delete from the skiplist */
6334 oldscore = dictGetEntryVal(de);
6335 deleted = zslDelete(zs->zsl,*oldscore,c->argv[2]);
6336 redisAssert(deleted != 0);
6337
6338 /* Delete from the hash table */
6339 dictDelete(zs->dict,c->argv[2]);
6340 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
6341 if (dictSize(zs->dict) == 0) dbDelete(c->db,c->argv[1]);
6342 server.dirty++;
6343 addReply(c,shared.cone);
6344 }
6345
6346 static void zremrangebyscoreCommand(redisClient *c) {
6347 double min;
6348 double max;
6349 long deleted;
6350 robj *zsetobj;
6351 zset *zs;
6352
6353 if ((getDoubleFromObjectOrReply(c, c->argv[2], &min, NULL) != REDIS_OK) ||
6354 (getDoubleFromObjectOrReply(c, c->argv[3], &max, NULL) != REDIS_OK)) return;
6355
6356 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
6357 checkType(c,zsetobj,REDIS_ZSET)) return;
6358
6359 zs = zsetobj->ptr;
6360 deleted = zslDeleteRangeByScore(zs->zsl,min,max,zs->dict);
6361 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
6362 if (dictSize(zs->dict) == 0) dbDelete(c->db,c->argv[1]);
6363 server.dirty += deleted;
6364 addReplyLongLong(c,deleted);
6365 }
6366
6367 static void zremrangebyrankCommand(redisClient *c) {
6368 long start;
6369 long end;
6370 int llen;
6371 long deleted;
6372 robj *zsetobj;
6373 zset *zs;
6374
6375 if ((getLongFromObjectOrReply(c, c->argv[2], &start, NULL) != REDIS_OK) ||
6376 (getLongFromObjectOrReply(c, c->argv[3], &end, NULL) != REDIS_OK)) return;
6377
6378 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
6379 checkType(c,zsetobj,REDIS_ZSET)) return;
6380 zs = zsetobj->ptr;
6381 llen = zs->zsl->length;
6382
6383 /* convert negative indexes */
6384 if (start < 0) start = llen+start;
6385 if (end < 0) end = llen+end;
6386 if (start < 0) start = 0;
6387 if (end < 0) end = 0;
6388
6389 /* indexes sanity checks */
6390 if (start > end || start >= llen) {
6391 addReply(c,shared.czero);
6392 return;
6393 }
6394 if (end >= llen) end = llen-1;
6395
6396 /* increment start and end because zsl*Rank functions
6397 * use 1-based rank */
6398 deleted = zslDeleteRangeByRank(zs->zsl,start+1,end+1,zs->dict);
6399 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
6400 if (dictSize(zs->dict) == 0) dbDelete(c->db,c->argv[1]);
6401 server.dirty += deleted;
6402 addReplyLongLong(c, deleted);
6403 }
6404
6405 typedef struct {
6406 dict *dict;
6407 double weight;
6408 } zsetopsrc;
6409
6410 static int qsortCompareZsetopsrcByCardinality(const void *s1, const void *s2) {
6411 zsetopsrc *d1 = (void*) s1, *d2 = (void*) s2;
6412 unsigned long size1, size2;
6413 size1 = d1->dict ? dictSize(d1->dict) : 0;
6414 size2 = d2->dict ? dictSize(d2->dict) : 0;
6415 return size1 - size2;
6416 }
6417
/* Aggregation modes accepted by zunionInterAggregate(). */
#define REDIS_AGGR_SUM 1
#define REDIS_AGGR_MIN 2
#define REDIS_AGGR_MAX 3
/* Score stored in a dict entry, or 1.0 when the entry has a NULL value. */
#define zunionInterDictValue(_e) (dictGetEntryVal(_e) == NULL ? 1.0 : *(double*)dictGetEntryVal(_e))

/* Fold 'val' into '*target' according to 'aggregate': add it for
 * REDIS_AGGR_SUM, keep the smaller of the two for REDIS_AGGR_MIN, keep the
 * larger for REDIS_AGGR_MAX. Any other mode triggers a panic. */
inline static void zunionInterAggregate(double *target, double val, int aggregate) {
    switch (aggregate) {
    case REDIS_AGGR_SUM:
        *target = *target + val;
        break;
    case REDIS_AGGR_MIN:
        if (val < *target) *target = val;
        break;
    case REDIS_AGGR_MAX:
        if (val > *target) *target = val;
        break;
    default:
        /* safety net */
        redisPanic("Unknown ZUNION/INTER aggregate type");
    }
}
6435
6436 static void zunionInterGenericCommand(redisClient *c, robj *dstkey, int op) {
6437 int i, j, setnum;
6438 int aggregate = REDIS_AGGR_SUM;
6439 zsetopsrc *src;
6440 robj *dstobj;
6441 zset *dstzset;
6442 dictIterator *di;
6443 dictEntry *de;
6444
6445 /* expect setnum input keys to be given */
6446 setnum = atoi(c->argv[2]->ptr);
6447 if (setnum < 1) {
6448 addReplySds(c,sdsnew("-ERR at least 1 input key is needed for ZUNIONSTORE/ZINTERSTORE\r\n"));
6449 return;
6450 }
6451
6452 /* test if the expected number of keys would overflow */
6453 if (3+setnum > c->argc) {
6454 addReply(c,shared.syntaxerr);
6455 return;
6456 }
6457
6458 /* read keys to be used for input */
6459 src = zmalloc(sizeof(zsetopsrc) * setnum);
6460 for (i = 0, j = 3; i < setnum; i++, j++) {
6461 robj *obj = lookupKeyWrite(c->db,c->argv[j]);
6462 if (!obj) {
6463 src[i].dict = NULL;
6464 } else {
6465 if (obj->type == REDIS_ZSET) {
6466 src[i].dict = ((zset*)obj->ptr)->dict;
6467 } else if (obj->type == REDIS_SET) {
6468 src[i].dict = (obj->ptr);
6469 } else {
6470 zfree(src);
6471 addReply(c,shared.wrongtypeerr);
6472 return;
6473 }
6474 }
6475
6476 /* default all weights to 1 */
6477 src[i].weight = 1.0;
6478 }
6479
6480 /* parse optional extra arguments */
6481 if (j < c->argc) {
6482 int remaining = c->argc - j;
6483
6484 while (remaining) {
6485 if (remaining >= (setnum + 1) && !strcasecmp(c->argv[j]->ptr,"weights")) {
6486 j++; remaining--;
6487 for (i = 0; i < setnum; i++, j++, remaining--) {
6488 if (getDoubleFromObjectOrReply(c, c->argv[j], &src[i].weight, NULL) != REDIS_OK)
6489 return;
6490 }
6491 } else if (remaining >= 2 && !strcasecmp(c->argv[j]->ptr,"aggregate")) {
6492 j++; remaining--;
6493 if (!strcasecmp(c->argv[j]->ptr,"sum")) {
6494 aggregate = REDIS_AGGR_SUM;
6495 } else if (!strcasecmp(c->argv[j]->ptr,"min")) {
6496 aggregate = REDIS_AGGR_MIN;
6497 } else if (!strcasecmp(c->argv[j]->ptr,"max")) {
6498 aggregate = REDIS_AGGR_MAX;
6499 } else {
6500 zfree(src);
6501 addReply(c,shared.syntaxerr);
6502 return;
6503 }
6504 j++; remaining--;
6505 } else {
6506 zfree(src);
6507 addReply(c,shared.syntaxerr);
6508 return;
6509 }
6510 }
6511 }
6512
6513 /* sort sets from the smallest to largest, this will improve our
6514 * algorithm's performance */
6515 qsort(src,setnum,sizeof(zsetopsrc),qsortCompareZsetopsrcByCardinality);
6516
6517 dstobj = createZsetObject();
6518 dstzset = dstobj->ptr;
6519
6520 if (op == REDIS_OP_INTER) {
6521 /* skip going over all entries if the smallest zset is NULL or empty */
6522 if (src[0].dict && dictSize(src[0].dict) > 0) {
6523 /* precondition: as src[0].dict is non-empty and the zsets are ordered
6524 * from small to large, all src[i > 0].dict are non-empty too */
6525 di = dictGetIterator(src[0].dict);
6526 while((de = dictNext(di)) != NULL) {
6527 double *score = zmalloc(sizeof(double)), value;
6528 *score = src[0].weight * zunionInterDictValue(de);
6529
6530 for (j = 1; j < setnum; j++) {
6531 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
6532 if (other) {
6533 value = src[j].weight * zunionInterDictValue(other);
6534 zunionInterAggregate(score, value, aggregate);
6535 } else {
6536 break;
6537 }
6538 }
6539
6540 /* skip entry when not present in every source dict */
6541 if (j != setnum) {
6542 zfree(score);
6543 } else {
6544 robj *o = dictGetEntryKey(de);
6545 dictAdd(dstzset->dict,o,score);
6546 incrRefCount(o); /* added to dictionary */
6547 zslInsert(dstzset->zsl,*score,o);
6548 incrRefCount(o); /* added to skiplist */
6549 }
6550 }
6551 dictReleaseIterator(di);
6552 }
6553 } else if (op == REDIS_OP_UNION) {
6554 for (i = 0; i < setnum; i++) {
6555 if (!src[i].dict) continue;
6556
6557 di = dictGetIterator(src[i].dict);
6558 while((de = dictNext(di)) != NULL) {
6559 /* skip key when already processed */
6560 if (dictFind(dstzset->dict,dictGetEntryKey(de)) != NULL) continue;
6561
6562 double *score = zmalloc(sizeof(double)), value;
6563 *score = src[i].weight * zunionInterDictValue(de);
6564
6565 /* because the zsets are sorted by size, its only possible
6566 * for sets at larger indices to hold this entry */
6567 for (j = (i+1); j < setnum; j++) {
6568 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
6569 if (other) {
6570 value = src[j].weight * zunionInterDictValue(other);
6571 zunionInterAggregate(score, value, aggregate);
6572 }
6573 }
6574
6575 robj *o = dictGetEntryKey(de);
6576 dictAdd(dstzset->dict,o,score);
6577 incrRefCount(o); /* added to dictionary */
6578 zslInsert(dstzset->zsl,*score,o);
6579 incrRefCount(o); /* added to skiplist */
6580 }
6581 dictReleaseIterator(di);
6582 }
6583 } else {
6584 /* unknown operator */
6585 redisAssert(op == REDIS_OP_INTER || op == REDIS_OP_UNION);
6586 }
6587
6588 dbDelete(c->db,dstkey);
6589 if (dstzset->zsl->length) {
6590 dbAdd(c->db,dstkey,dstobj);
6591 addReplyLongLong(c, dstzset->zsl->length);
6592 server.dirty++;
6593 } else {
6594 decrRefCount(dstobj);
6595 addReply(c, shared.czero);
6596 }
6597 zfree(src);
6598 }
6599
6600 static void zunionstoreCommand(redisClient *c) {
6601 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_UNION);
6602 }
6603
6604 static void zinterstoreCommand(redisClient *c) {
6605 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_INTER);
6606 }
6607
6608 static void zrangeGenericCommand(redisClient *c, int reverse) {
6609 robj *o;
6610 long start;
6611 long end;
6612 int withscores = 0;
6613 int llen;
6614 int rangelen, j;
6615 zset *zsetobj;
6616 zskiplist *zsl;
6617 zskiplistNode *ln;
6618 robj *ele;
6619
6620 if ((getLongFromObjectOrReply(c, c->argv[2], &start, NULL) != REDIS_OK) ||
6621 (getLongFromObjectOrReply(c, c->argv[3], &end, NULL) != REDIS_OK)) return;
6622
6623 if (c->argc == 5 && !strcasecmp(c->argv[4]->ptr,"withscores")) {
6624 withscores = 1;
6625 } else if (c->argc >= 5) {
6626 addReply(c,shared.syntaxerr);
6627 return;
6628 }
6629
6630 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
6631 || checkType(c,o,REDIS_ZSET)) return;
6632 zsetobj = o->ptr;
6633 zsl = zsetobj->zsl;
6634 llen = zsl->length;
6635
6636 /* convert negative indexes */
6637 if (start < 0) start = llen+start;
6638 if (end < 0) end = llen+end;
6639 if (start < 0) start = 0;
6640 if (end < 0) end = 0;
6641
6642 /* indexes sanity checks */
6643 if (start > end || start >= llen) {
6644 /* Out of range start or start > end result in empty list */
6645 addReply(c,shared.emptymultibulk);
6646 return;
6647 }
6648 if (end >= llen) end = llen-1;
6649 rangelen = (end-start)+1;
6650
6651 /* check if starting point is trivial, before searching
6652 * the element in log(N) time */
6653 if (reverse) {
6654 ln = start == 0 ? zsl->tail : zslistTypeGetElementByRank(zsl, llen-start);
6655 } else {
6656 ln = start == 0 ?
6657 zsl->header->forward[0] : zslistTypeGetElementByRank(zsl, start+1);
6658 }
6659
6660 /* Return the result in form of a multi-bulk reply */
6661 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",
6662 withscores ? (rangelen*2) : rangelen));
6663 for (j = 0; j < rangelen; j++) {
6664 ele = ln->obj;
6665 addReplyBulk(c,ele);
6666 if (withscores)
6667 addReplyDouble(c,ln->score);
6668 ln = reverse ? ln->backward : ln->forward[0];
6669 }
6670 }
6671
6672 static void zrangeCommand(redisClient *c) {
6673 zrangeGenericCommand(c,0);
6674 }
6675
6676 static void zrevrangeCommand(redisClient *c) {
6677 zrangeGenericCommand(c,1);
6678 }
6679
6680 /* This command implements both ZRANGEBYSCORE and ZCOUNT.
6681 * If justcount is non-zero, just the count is returned. */
6682 static void genericZrangebyscoreCommand(redisClient *c, int justcount) {
6683 robj *o;
6684 double min, max;
6685 int minex = 0, maxex = 0; /* are min or max exclusive? */
6686 int offset = 0, limit = -1;
6687 int withscores = 0;
6688 int badsyntax = 0;
6689
6690 /* Parse the min-max interval. If one of the values is prefixed
6691 * by the "(" character, it's considered "open". For instance
6692 * ZRANGEBYSCORE zset (1.5 (2.5 will match min < x < max
6693 * ZRANGEBYSCORE zset 1.5 2.5 will instead match min <= x <= max */
6694 if (((char*)c->argv[2]->ptr)[0] == '(') {
6695 min = strtod((char*)c->argv[2]->ptr+1,NULL);
6696 minex = 1;
6697 } else {
6698 min = strtod(c->argv[2]->ptr,NULL);
6699 }
6700 if (((char*)c->argv[3]->ptr)[0] == '(') {
6701 max = strtod((char*)c->argv[3]->ptr+1,NULL);
6702 maxex = 1;
6703 } else {
6704 max = strtod(c->argv[3]->ptr,NULL);
6705 }
6706
6707 /* Parse "WITHSCORES": note that if the command was called with
6708 * the name ZCOUNT then we are sure that c->argc == 4, so we'll never
6709 * enter the following paths to parse WITHSCORES and LIMIT. */
6710 if (c->argc == 5 || c->argc == 8) {
6711 if (strcasecmp(c->argv[c->argc-1]->ptr,"withscores") == 0)
6712 withscores = 1;
6713 else
6714 badsyntax = 1;
6715 }
6716 if (c->argc != (4 + withscores) && c->argc != (7 + withscores))
6717 badsyntax = 1;
6718 if (badsyntax) {
6719 addReplySds(c,
6720 sdsnew("-ERR wrong number of arguments for ZRANGEBYSCORE\r\n"));
6721 return;
6722 }
6723
6724 /* Parse "LIMIT" */
6725 if (c->argc == (7 + withscores) && strcasecmp(c->argv[4]->ptr,"limit")) {
6726 addReply(c,shared.syntaxerr);
6727 return;
6728 } else if (c->argc == (7 + withscores)) {
6729 offset = atoi(c->argv[5]->ptr);
6730 limit = atoi(c->argv[6]->ptr);
6731 if (offset < 0) offset = 0;
6732 }
6733
6734 /* Ok, lookup the key and get the range */
6735 o = lookupKeyRead(c->db,c->argv[1]);
6736 if (o == NULL) {
6737 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
6738 } else {
6739 if (o->type != REDIS_ZSET) {
6740 addReply(c,shared.wrongtypeerr);
6741 } else {
6742 zset *zsetobj = o->ptr;
6743 zskiplist *zsl = zsetobj->zsl;
6744 zskiplistNode *ln;
6745 robj *ele, *lenobj = NULL;
6746 unsigned long rangelen = 0;
6747
6748 /* Get the first node with the score >= min, or with
6749 * score > min if 'minex' is true. */
6750 ln = zslFirstWithScore(zsl,min);
6751 while (minex && ln && ln->score == min) ln = ln->forward[0];
6752
6753 if (ln == NULL) {
6754 /* No element matching the speciifed interval */
6755 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
6756 return;
6757 }
6758
6759 /* We don't know in advance how many matching elements there
6760 * are in the list, so we push this object that will represent
6761 * the multi-bulk length in the output buffer, and will "fix"
6762 * it later */
6763 if (!justcount) {
6764 lenobj = createObject(REDIS_STRING,NULL);
6765 addReply(c,lenobj);
6766 decrRefCount(lenobj);
6767 }
6768
6769 while(ln && (maxex ? (ln->score < max) : (ln->score <= max))) {
6770 if (offset) {
6771 offset--;
6772 ln = ln->forward[0];
6773 continue;
6774 }
6775 if (limit == 0) break;
6776 if (!justcount) {
6777 ele = ln->obj;
6778 addReplyBulk(c,ele);
6779 if (withscores)
6780 addReplyDouble(c,ln->score);
6781 }
6782 ln = ln->forward[0];
6783 rangelen++;
6784 if (limit > 0) limit--;
6785 }
6786 if (justcount) {
6787 addReplyLongLong(c,(long)rangelen);
6788 } else {
6789 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",
6790 withscores ? (rangelen*2) : rangelen);
6791 }
6792 }
6793 }
6794 }
6795
6796 static void zrangebyscoreCommand(redisClient *c) {
6797 genericZrangebyscoreCommand(c,0);
6798 }
6799
6800 static void zcountCommand(redisClient *c) {
6801 genericZrangebyscoreCommand(c,1);
6802 }
6803
6804 static void zcardCommand(redisClient *c) {
6805 robj *o;
6806 zset *zs;
6807
6808 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6809 checkType(c,o,REDIS_ZSET)) return;
6810
6811 zs = o->ptr;
6812 addReplyUlong(c,zs->zsl->length);
6813 }
6814
6815 static void zscoreCommand(redisClient *c) {
6816 robj *o;
6817 zset *zs;
6818 dictEntry *de;
6819
6820 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6821 checkType(c,o,REDIS_ZSET)) return;
6822
6823 zs = o->ptr;
6824 de = dictFind(zs->dict,c->argv[2]);
6825 if (!de) {
6826 addReply(c,shared.nullbulk);
6827 } else {
6828 double *score = dictGetEntryVal(de);
6829
6830 addReplyDouble(c,*score);
6831 }
6832 }
6833
6834 static void zrankGenericCommand(redisClient *c, int reverse) {
6835 robj *o;
6836 zset *zs;
6837 zskiplist *zsl;
6838 dictEntry *de;
6839 unsigned long rank;
6840 double *score;
6841
6842 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6843 checkType(c,o,REDIS_ZSET)) return;
6844
6845 zs = o->ptr;
6846 zsl = zs->zsl;
6847 de = dictFind(zs->dict,c->argv[2]);
6848 if (!de) {
6849 addReply(c,shared.nullbulk);
6850 return;
6851 }
6852
6853 score = dictGetEntryVal(de);
6854 rank = zslistTypeGetRank(zsl, *score, c->argv[2]);
6855 if (rank) {
6856 if (reverse) {
6857 addReplyLongLong(c, zsl->length - rank);
6858 } else {
6859 addReplyLongLong(c, rank-1);
6860 }
6861 } else {
6862 addReply(c,shared.nullbulk);
6863 }
6864 }
6865
6866 static void zrankCommand(redisClient *c) {
6867 zrankGenericCommand(c, 0);
6868 }
6869
6870 static void zrevrankCommand(redisClient *c) {
6871 zrankGenericCommand(c, 1);
6872 }
6873
6874 /* ========================= Hashes utility functions ======================= */
/* Selectors naming which part of a hash entry is wanted. They are distinct
 * bits: hashTypeCurrent() tests them with '&' and HGETALL passes both of
 * them OR-ed together. */
#define REDIS_HASH_KEY 1
#define REDIS_HASH_VALUE 2
6877
6878 /* Check the length of a number of objects to see if we need to convert a
6879 * zipmap to a real hash. Note that we only check string encoded objects
6880 * as their string length can be queried in constant time. */
6881 static void hashTypeTryConversion(robj *subject, robj **argv, int start, int end) {
6882 int i;
6883 if (subject->encoding != REDIS_ENCODING_ZIPMAP) return;
6884
6885 for (i = start; i <= end; i++) {
6886 if (argv[i]->encoding == REDIS_ENCODING_RAW &&
6887 sdslen(argv[i]->ptr) > server.hash_max_zipmap_value)
6888 {
6889 convertToRealHash(subject);
6890 return;
6891 }
6892 }
6893 }
6894
6895 /* Encode given objects in-place when the hash uses a dict. */
6896 static void hashTypeTryObjectEncoding(robj *subject, robj **o1, robj **o2) {
6897 if (subject->encoding == REDIS_ENCODING_HT) {
6898 if (o1) *o1 = tryObjectEncoding(*o1);
6899 if (o2) *o2 = tryObjectEncoding(*o2);
6900 }
6901 }
6902
6903 /* Get the value from a hash identified by key. Returns either a string
6904 * object or NULL if the value cannot be found. The refcount of the object
6905 * is always increased by 1 when the value was found. */
6906 static robj *hashTypeGet(robj *o, robj *key) {
6907 robj *value = NULL;
6908 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6909 unsigned char *v;
6910 unsigned int vlen;
6911 key = getDecodedObject(key);
6912 if (zipmapGet(o->ptr,key->ptr,sdslen(key->ptr),&v,&vlen)) {
6913 value = createStringObject((char*)v,vlen);
6914 }
6915 decrRefCount(key);
6916 } else {
6917 dictEntry *de = dictFind(o->ptr,key);
6918 if (de != NULL) {
6919 value = dictGetEntryVal(de);
6920 incrRefCount(value);
6921 }
6922 }
6923 return value;
6924 }
6925
6926 /* Test if the key exists in the given hash. Returns 1 if the key
6927 * exists and 0 when it doesn't. */
6928 static int hashTypeExists(robj *o, robj *key) {
6929 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6930 key = getDecodedObject(key);
6931 if (zipmapExists(o->ptr,key->ptr,sdslen(key->ptr))) {
6932 decrRefCount(key);
6933 return 1;
6934 }
6935 decrRefCount(key);
6936 } else {
6937 if (dictFind(o->ptr,key) != NULL) {
6938 return 1;
6939 }
6940 }
6941 return 0;
6942 }
6943
6944 /* Add an element, discard the old if the key already exists.
6945 * Return 0 on insert and 1 on update. */
6946 static int hashTypeSet(robj *o, robj *key, robj *value) {
6947 int update = 0;
6948 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6949 key = getDecodedObject(key);
6950 value = getDecodedObject(value);
6951 o->ptr = zipmapSet(o->ptr,
6952 key->ptr,sdslen(key->ptr),
6953 value->ptr,sdslen(value->ptr), &update);
6954 decrRefCount(key);
6955 decrRefCount(value);
6956
6957 /* Check if the zipmap needs to be upgraded to a real hash table */
6958 if (zipmapLen(o->ptr) > server.hash_max_zipmap_entries)
6959 convertToRealHash(o);
6960 } else {
6961 if (dictReplace(o->ptr,key,value)) {
6962 /* Insert */
6963 incrRefCount(key);
6964 } else {
6965 /* Update */
6966 update = 1;
6967 }
6968 incrRefCount(value);
6969 }
6970 return update;
6971 }
6972
6973 /* Delete an element from a hash.
6974 * Return 1 on deleted and 0 on not found. */
6975 static int hashTypeDelete(robj *o, robj *key) {
6976 int deleted = 0;
6977 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6978 key = getDecodedObject(key);
6979 o->ptr = zipmapDel(o->ptr,key->ptr,sdslen(key->ptr), &deleted);
6980 decrRefCount(key);
6981 } else {
6982 deleted = dictDelete((dict*)o->ptr,key) == DICT_OK;
6983 /* Always check if the dictionary needs a resize after a delete. */
6984 if (deleted && htNeedsResize(o->ptr)) dictResize(o->ptr);
6985 }
6986 return deleted;
6987 }
6988
6989 /* Return the number of elements in a hash. */
6990 static unsigned long hashTypeLength(robj *o) {
6991 return (o->encoding == REDIS_ENCODING_ZIPMAP) ?
6992 zipmapLen((unsigned char*)o->ptr) : dictSize((dict*)o->ptr);
6993 }
6994
6995 /* Structure to hold hash iteration abstration. Note that iteration over
6996 * hashes involves both fields and values. Because it is possible that
6997 * not both are required, store pointers in the iterator to avoid
6998 * unnecessary memory allocation for fields/values. */
6999 typedef struct {
7000 int encoding;
7001 unsigned char *zi;
7002 unsigned char *zk, *zv;
7003 unsigned int zklen, zvlen;
7004
7005 dictIterator *di;
7006 dictEntry *de;
7007 } hashTypeIterator;
7008
7009 static hashTypeIterator *hashTypeInitIterator(robj *subject) {
7010 hashTypeIterator *hi = zmalloc(sizeof(hashTypeIterator));
7011 hi->encoding = subject->encoding;
7012 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
7013 hi->zi = zipmapRewind(subject->ptr);
7014 } else if (hi->encoding == REDIS_ENCODING_HT) {
7015 hi->di = dictGetIterator(subject->ptr);
7016 } else {
7017 redisAssert(NULL);
7018 }
7019 return hi;
7020 }
7021
7022 static void hashTypeReleaseIterator(hashTypeIterator *hi) {
7023 if (hi->encoding == REDIS_ENCODING_HT) {
7024 dictReleaseIterator(hi->di);
7025 }
7026 zfree(hi);
7027 }
7028
7029 /* Move to the next entry in the hash. Return REDIS_OK when the next entry
7030 * could be found and REDIS_ERR when the iterator reaches the end. */
7031 static int hashTypeNext(hashTypeIterator *hi) {
7032 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
7033 if ((hi->zi = zipmapNext(hi->zi, &hi->zk, &hi->zklen,
7034 &hi->zv, &hi->zvlen)) == NULL) return REDIS_ERR;
7035 } else {
7036 if ((hi->de = dictNext(hi->di)) == NULL) return REDIS_ERR;
7037 }
7038 return REDIS_OK;
7039 }
7040
7041 /* Get key or value object at current iteration position.
7042 * This increases the refcount of the field object by 1. */
7043 static robj *hashTypeCurrent(hashTypeIterator *hi, int what) {
7044 robj *o;
7045 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
7046 if (what & REDIS_HASH_KEY) {
7047 o = createStringObject((char*)hi->zk,hi->zklen);
7048 } else {
7049 o = createStringObject((char*)hi->zv,hi->zvlen);
7050 }
7051 } else {
7052 if (what & REDIS_HASH_KEY) {
7053 o = dictGetEntryKey(hi->de);
7054 } else {
7055 o = dictGetEntryVal(hi->de);
7056 }
7057 incrRefCount(o);
7058 }
7059 return o;
7060 }
7061
7062 static robj *hashTypeLookupWriteOrCreate(redisClient *c, robj *key) {
7063 robj *o = lookupKeyWrite(c->db,key);
7064 if (o == NULL) {
7065 o = createHashObject();
7066 dbAdd(c->db,key,o);
7067 } else {
7068 if (o->type != REDIS_HASH) {
7069 addReply(c,shared.wrongtypeerr);
7070 return NULL;
7071 }
7072 }
7073 return o;
7074 }
7075
7076 /* ============================= Hash commands ============================== */
7077 static void hsetCommand(redisClient *c) {
7078 int update;
7079 robj *o;
7080
7081 if ((o = hashTypeLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
7082 hashTypeTryConversion(o,c->argv,2,3);
7083 hashTypeTryObjectEncoding(o,&c->argv[2], &c->argv[3]);
7084 update = hashTypeSet(o,c->argv[2],c->argv[3]);
7085 addReply(c, update ? shared.czero : shared.cone);
7086 server.dirty++;
7087 }
7088
7089 static void hsetnxCommand(redisClient *c) {
7090 robj *o;
7091 if ((o = hashTypeLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
7092 hashTypeTryConversion(o,c->argv,2,3);
7093
7094 if (hashTypeExists(o, c->argv[2])) {
7095 addReply(c, shared.czero);
7096 } else {
7097 hashTypeTryObjectEncoding(o,&c->argv[2], &c->argv[3]);
7098 hashTypeSet(o,c->argv[2],c->argv[3]);
7099 addReply(c, shared.cone);
7100 server.dirty++;
7101 }
7102 }
7103
7104 static void hmsetCommand(redisClient *c) {
7105 int i;
7106 robj *o;
7107
7108 if ((c->argc % 2) == 1) {
7109 addReplySds(c,sdsnew("-ERR wrong number of arguments for HMSET\r\n"));
7110 return;
7111 }
7112
7113 if ((o = hashTypeLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
7114 hashTypeTryConversion(o,c->argv,2,c->argc-1);
7115 for (i = 2; i < c->argc; i += 2) {
7116 hashTypeTryObjectEncoding(o,&c->argv[i], &c->argv[i+1]);
7117 hashTypeSet(o,c->argv[i],c->argv[i+1]);
7118 }
7119 addReply(c, shared.ok);
7120 server.dirty++;
7121 }
7122
7123 static void hincrbyCommand(redisClient *c) {
7124 long long value, incr;
7125 robj *o, *current, *new;
7126
7127 if (getLongLongFromObjectOrReply(c,c->argv[3],&incr,NULL) != REDIS_OK) return;
7128 if ((o = hashTypeLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
7129 if ((current = hashTypeGet(o,c->argv[2])) != NULL) {
7130 if (getLongLongFromObjectOrReply(c,current,&value,
7131 "hash value is not an integer") != REDIS_OK) {
7132 decrRefCount(current);
7133 return;
7134 }
7135 decrRefCount(current);
7136 } else {
7137 value = 0;
7138 }
7139
7140 value += incr;
7141 new = createStringObjectFromLongLong(value);
7142 hashTypeTryObjectEncoding(o,&c->argv[2],NULL);
7143 hashTypeSet(o,c->argv[2],new);
7144 decrRefCount(new);
7145 addReplyLongLong(c,value);
7146 server.dirty++;
7147 }
7148
7149 static void hgetCommand(redisClient *c) {
7150 robj *o, *value;
7151 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
7152 checkType(c,o,REDIS_HASH)) return;
7153
7154 if ((value = hashTypeGet(o,c->argv[2])) != NULL) {
7155 addReplyBulk(c,value);
7156 decrRefCount(value);
7157 } else {
7158 addReply(c,shared.nullbulk);
7159 }
7160 }
7161
7162 static void hmgetCommand(redisClient *c) {
7163 int i;
7164 robj *o, *value;
7165 o = lookupKeyRead(c->db,c->argv[1]);
7166 if (o != NULL && o->type != REDIS_HASH) {
7167 addReply(c,shared.wrongtypeerr);
7168 }
7169
7170 /* Note the check for o != NULL happens inside the loop. This is
7171 * done because objects that cannot be found are considered to be
7172 * an empty hash. The reply should then be a series of NULLs. */
7173 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-2));
7174 for (i = 2; i < c->argc; i++) {
7175 if (o != NULL && (value = hashTypeGet(o,c->argv[i])) != NULL) {
7176 addReplyBulk(c,value);
7177 decrRefCount(value);
7178 } else {
7179 addReply(c,shared.nullbulk);
7180 }
7181 }
7182 }
7183
7184 static void hdelCommand(redisClient *c) {
7185 robj *o;
7186 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
7187 checkType(c,o,REDIS_HASH)) return;
7188
7189 if (hashTypeDelete(o,c->argv[2])) {
7190 if (hashTypeLength(o) == 0) dbDelete(c->db,c->argv[1]);
7191 addReply(c,shared.cone);
7192 server.dirty++;
7193 } else {
7194 addReply(c,shared.czero);
7195 }
7196 }
7197
7198 static void hlenCommand(redisClient *c) {
7199 robj *o;
7200 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
7201 checkType(c,o,REDIS_HASH)) return;
7202
7203 addReplyUlong(c,hashTypeLength(o));
7204 }
7205
7206 static void genericHgetallCommand(redisClient *c, int flags) {
7207 robj *o, *lenobj, *obj;
7208 unsigned long count = 0;
7209 hashTypeIterator *hi;
7210
7211 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
7212 || checkType(c,o,REDIS_HASH)) return;
7213
7214 lenobj = createObject(REDIS_STRING,NULL);
7215 addReply(c,lenobj);
7216 decrRefCount(lenobj);
7217
7218 hi = hashTypeInitIterator(o);
7219 while (hashTypeNext(hi) != REDIS_ERR) {
7220 if (flags & REDIS_HASH_KEY) {
7221 obj = hashTypeCurrent(hi,REDIS_HASH_KEY);
7222 addReplyBulk(c,obj);
7223 decrRefCount(obj);
7224 count++;
7225 }
7226 if (flags & REDIS_HASH_VALUE) {
7227 obj = hashTypeCurrent(hi,REDIS_HASH_VALUE);
7228 addReplyBulk(c,obj);
7229 decrRefCount(obj);
7230 count++;
7231 }
7232 }
7233 hashTypeReleaseIterator(hi);
7234
7235 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",count);
7236 }
7237
7238 static void hkeysCommand(redisClient *c) {
7239 genericHgetallCommand(c,REDIS_HASH_KEY);
7240 }
7241
7242 static void hvalsCommand(redisClient *c) {
7243 genericHgetallCommand(c,REDIS_HASH_VALUE);
7244 }
7245
7246 static void hgetallCommand(redisClient *c) {
7247 genericHgetallCommand(c,REDIS_HASH_KEY|REDIS_HASH_VALUE);
7248 }
7249
7250 static void hexistsCommand(redisClient *c) {
7251 robj *o;
7252 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
7253 checkType(c,o,REDIS_HASH)) return;
7254
7255 addReply(c, hashTypeExists(o,c->argv[2]) ? shared.cone : shared.czero);
7256 }
7257
7258 static void convertToRealHash(robj *o) {
7259 unsigned char *key, *val, *p, *zm = o->ptr;
7260 unsigned int klen, vlen;
7261 dict *dict = dictCreate(&hashDictType,NULL);
7262
7263 assert(o->type == REDIS_HASH && o->encoding != REDIS_ENCODING_HT);
7264 p = zipmapRewind(zm);
7265 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
7266 robj *keyobj, *valobj;
7267
7268 keyobj = createStringObject((char*)key,klen);
7269 valobj = createStringObject((char*)val,vlen);
7270 keyobj = tryObjectEncoding(keyobj);
7271 valobj = tryObjectEncoding(valobj);
7272 dictAdd(dict,keyobj,valobj);
7273 }
7274 o->encoding = REDIS_ENCODING_HT;
7275 o->ptr = dict;
7276 zfree(zm);
7277 }
7278
7279 /* ========================= Non type-specific commands ==================== */
7280
7281 static void flushdbCommand(redisClient *c) {
7282 server.dirty += dictSize(c->db->dict);
7283 touchWatchedKeysOnFlush(c->db->id);
7284 dictEmpty(c->db->dict);
7285 dictEmpty(c->db->expires);
7286 addReply(c,shared.ok);
7287 }
7288
7289 static void flushallCommand(redisClient *c) {
7290 touchWatchedKeysOnFlush(-1);
7291 server.dirty += emptyDb();
7292 addReply(c,shared.ok);
7293 if (server.bgsavechildpid != -1) {
7294 kill(server.bgsavechildpid,SIGKILL);
7295 rdbRemoveTempFile(server.bgsavechildpid);
7296 }
7297 rdbSave(server.dbfilename);
7298 server.dirty++;
7299 }
7300
7301 static redisSortOperation *createSortOperation(int type, robj *pattern) {
7302 redisSortOperation *so = zmalloc(sizeof(*so));
7303 so->type = type;
7304 so->pattern = pattern;
7305 return so;
7306 }
7307
7308 /* Return the value associated to the key with a name obtained
7309 * substituting the first occurence of '*' in 'pattern' with 'subst'.
7310 * The returned object will always have its refcount increased by 1
7311 * when it is non-NULL. */
7312 static robj *lookupKeyByPattern(redisDb *db, robj *pattern, robj *subst) {
7313 char *p, *f;
7314 sds spat, ssub;
7315 robj keyobj, fieldobj, *o;
7316 int prefixlen, sublen, postfixlen, fieldlen;
7317 /* Expoit the internal sds representation to create a sds string allocated on the stack in order to make this function faster */
7318 struct {
7319 long len;
7320 long free;
7321 char buf[REDIS_SORTKEY_MAX+1];
7322 } keyname, fieldname;
7323
7324 /* If the pattern is "#" return the substitution object itself in order
7325 * to implement the "SORT ... GET #" feature. */
7326 spat = pattern->ptr;
7327 if (spat[0] == '#' && spat[1] == '\0') {
7328 incrRefCount(subst);
7329 return subst;
7330 }
7331
7332 /* The substitution object may be specially encoded. If so we create
7333 * a decoded object on the fly. Otherwise getDecodedObject will just
7334 * increment the ref count, that we'll decrement later. */
7335 subst = getDecodedObject(subst);
7336
7337 ssub = subst->ptr;
7338 if (sdslen(spat)+sdslen(ssub)-1 > REDIS_SORTKEY_MAX) return NULL;
7339 p = strchr(spat,'*');
7340 if (!p) {
7341 decrRefCount(subst);
7342 return NULL;
7343 }
7344
7345 /* Find out if we're dealing with a hash dereference. */
7346 if ((f = strstr(p+1, "->")) != NULL) {
7347 fieldlen = sdslen(spat)-(f-spat);
7348 /* this also copies \0 character */
7349 memcpy(fieldname.buf,f+2,fieldlen-1);
7350 fieldname.len = fieldlen-2;
7351 } else {
7352 fieldlen = 0;
7353 }
7354
7355 prefixlen = p-spat;
7356 sublen = sdslen(ssub);
7357 postfixlen = sdslen(spat)-(prefixlen+1)-fieldlen;
7358 memcpy(keyname.buf,spat,prefixlen);
7359 memcpy(keyname.buf+prefixlen,ssub,sublen);
7360 memcpy(keyname.buf+prefixlen+sublen,p+1,postfixlen);
7361 keyname.buf[prefixlen+sublen+postfixlen] = '\0';
7362 keyname.len = prefixlen+sublen+postfixlen;
7363 decrRefCount(subst);
7364
7365 /* Lookup substituted key */
7366 initStaticStringObject(keyobj,((char*)&keyname)+(sizeof(long)*2));
7367 o = lookupKeyRead(db,&keyobj);
7368 if (o == NULL) return NULL;
7369
7370 if (fieldlen > 0) {
7371 if (o->type != REDIS_HASH || fieldname.len < 1) return NULL;
7372
7373 /* Retrieve value from hash by the field name. This operation
7374 * already increases the refcount of the returned object. */
7375 initStaticStringObject(fieldobj,((char*)&fieldname)+(sizeof(long)*2));
7376 o = hashTypeGet(o, &fieldobj);
7377 } else {
7378 if (o->type != REDIS_STRING) return NULL;
7379
7380 /* Every object that this function returns needs to have its refcount
7381 * increased. sortCommand decreases it again. */
7382 incrRefCount(o);
7383 }
7384
7385 return o;
7386 }
7387
7388 /* sortCompare() is used by qsort in sortCommand(). Given that qsort_r with
7389 * the additional parameter is not standard but a BSD-specific we have to
7390 * pass sorting parameters via the global 'server' structure */
7391 static int sortCompare(const void *s1, const void *s2) {
7392 const redisSortObject *so1 = s1, *so2 = s2;
7393 int cmp;
7394
7395 if (!server.sort_alpha) {
7396 /* Numeric sorting. Here it's trivial as we precomputed scores */
7397 if (so1->u.score > so2->u.score) {
7398 cmp = 1;
7399 } else if (so1->u.score < so2->u.score) {
7400 cmp = -1;
7401 } else {
7402 cmp = 0;
7403 }
7404 } else {
7405 /* Alphanumeric sorting */
7406 if (server.sort_bypattern) {
7407 if (!so1->u.cmpobj || !so2->u.cmpobj) {
7408 /* At least one compare object is NULL */
7409 if (so1->u.cmpobj == so2->u.cmpobj)
7410 cmp = 0;
7411 else if (so1->u.cmpobj == NULL)
7412 cmp = -1;
7413 else
7414 cmp = 1;
7415 } else {
7416 /* We have both the objects, use strcoll */
7417 cmp = strcoll(so1->u.cmpobj->ptr,so2->u.cmpobj->ptr);
7418 }
7419 } else {
7420 /* Compare elements directly. */
7421 cmp = compareStringObjects(so1->obj,so2->obj);
7422 }
7423 }
7424 return server.sort_desc ? -cmp : cmp;
7425 }
7426
7427 /* The SORT command is the most complex command in Redis. Warning: this code
7428 * is optimized for speed and a bit less for readability */
7429 static void sortCommand(redisClient *c) {
7430 list *operations;
7431 unsigned int outputlen = 0;
7432 int desc = 0, alpha = 0;
7433 int limit_start = 0, limit_count = -1, start, end;
7434 int j, dontsort = 0, vectorlen;
7435 int getop = 0; /* GET operation counter */
7436 robj *sortval, *sortby = NULL, *storekey = NULL;
7437 redisSortObject *vector; /* Resulting vector to sort */
7438
7439 /* Lookup the key to sort. It must be of the right types */
7440 sortval = lookupKeyRead(c->db,c->argv[1]);
7441 if (sortval == NULL) {
7442 addReply(c,shared.emptymultibulk);
7443 return;
7444 }
7445 if (sortval->type != REDIS_SET && sortval->type != REDIS_LIST &&
7446 sortval->type != REDIS_ZSET)
7447 {
7448 addReply(c,shared.wrongtypeerr);
7449 return;
7450 }
7451
7452 /* Create a list of operations to perform for every sorted element.
7453 * Operations can be GET/DEL/INCR/DECR */
7454 operations = listCreate();
7455 listSetFreeMethod(operations,zfree);
7456 j = 2;
7457
7458 /* Now we need to protect sortval incrementing its count, in the future
7459 * SORT may have options able to overwrite/delete keys during the sorting
7460 * and the sorted key itself may get destroied */
7461 incrRefCount(sortval);
7462
7463 /* The SORT command has an SQL-alike syntax, parse it */
7464 while(j < c->argc) {
7465 int leftargs = c->argc-j-1;
7466 if (!strcasecmp(c->argv[j]->ptr,"asc")) {
7467 desc = 0;
7468 } else if (!strcasecmp(c->argv[j]->ptr,"desc")) {
7469 desc = 1;
7470 } else if (!strcasecmp(c->argv[j]->ptr,"alpha")) {
7471 alpha = 1;
7472 } else if (!strcasecmp(c->argv[j]->ptr,"limit") && leftargs >= 2) {
7473 limit_start = atoi(c->argv[j+1]->ptr);
7474 limit_count = atoi(c->argv[j+2]->ptr);
7475 j+=2;
7476 } else if (!strcasecmp(c->argv[j]->ptr,"store") && leftargs >= 1) {
7477 storekey = c->argv[j+1];
7478 j++;
7479 } else if (!strcasecmp(c->argv[j]->ptr,"by") && leftargs >= 1) {
7480 sortby = c->argv[j+1];
7481 /* If the BY pattern does not contain '*', i.e. it is constant,
7482 * we don't need to sort nor to lookup the weight keys. */
7483 if (strchr(c->argv[j+1]->ptr,'*') == NULL) dontsort = 1;
7484 j++;
7485 } else if (!strcasecmp(c->argv[j]->ptr,"get") && leftargs >= 1) {
7486 listAddNodeTail(operations,createSortOperation(
7487 REDIS_SORT_GET,c->argv[j+1]));
7488 getop++;
7489 j++;
7490 } else {
7491 decrRefCount(sortval);
7492 listRelease(operations);
7493 addReply(c,shared.syntaxerr);
7494 return;
7495 }
7496 j++;
7497 }
7498
7499 /* Load the sorting vector with all the objects to sort */
7500 switch(sortval->type) {
7501 case REDIS_LIST: vectorlen = listTypeLength(sortval); break;
7502 case REDIS_SET: vectorlen = dictSize((dict*)sortval->ptr); break;
7503 case REDIS_ZSET: vectorlen = dictSize(((zset*)sortval->ptr)->dict); break;
7504 default: vectorlen = 0; redisPanic("Bad SORT type"); /* Avoid GCC warning */
7505 }
7506 vector = zmalloc(sizeof(redisSortObject)*vectorlen);
7507 j = 0;
7508
7509 if (sortval->type == REDIS_LIST) {
7510 listTypeIterator *li = listTypeInitIterator(sortval,0,REDIS_TAIL);
7511 listTypeEntry entry;
7512 while(listTypeNext(li,&entry)) {
7513 vector[j].obj = listTypeGet(&entry);
7514 vector[j].u.score = 0;
7515 vector[j].u.cmpobj = NULL;
7516 j++;
7517 }
7518 listTypeReleaseIterator(li);
7519 } else {
7520 dict *set;
7521 dictIterator *di;
7522 dictEntry *setele;
7523
7524 if (sortval->type == REDIS_SET) {
7525 set = sortval->ptr;
7526 } else {
7527 zset *zs = sortval->ptr;
7528 set = zs->dict;
7529 }
7530
7531 di = dictGetIterator(set);
7532 while((setele = dictNext(di)) != NULL) {
7533 vector[j].obj = dictGetEntryKey(setele);
7534 vector[j].u.score = 0;
7535 vector[j].u.cmpobj = NULL;
7536 j++;
7537 }
7538 dictReleaseIterator(di);
7539 }
7540 redisAssert(j == vectorlen);
7541
7542 /* Now it's time to load the right scores in the sorting vector */
7543 if (dontsort == 0) {
7544 for (j = 0; j < vectorlen; j++) {
7545 robj *byval;
7546 if (sortby) {
7547 /* lookup value to sort by */
7548 byval = lookupKeyByPattern(c->db,sortby,vector[j].obj);
7549 if (!byval) continue;
7550 } else {
7551 /* use object itself to sort by */
7552 byval = vector[j].obj;
7553 }
7554
7555 if (alpha) {
7556 if (sortby) vector[j].u.cmpobj = getDecodedObject(byval);
7557 } else {
7558 if (byval->encoding == REDIS_ENCODING_RAW) {
7559 vector[j].u.score = strtod(byval->ptr,NULL);
7560 } else if (byval->encoding == REDIS_ENCODING_INT) {
7561 /* Don't need to decode the object if it's
7562 * integer-encoded (the only encoding supported) so
7563 * far. We can just cast it */
7564 vector[j].u.score = (long)byval->ptr;
7565 } else {
7566 redisAssert(1 != 1);
7567 }
7568 }
7569
7570 /* when the object was retrieved using lookupKeyByPattern,
7571 * its refcount needs to be decreased. */
7572 if (sortby) {
7573 decrRefCount(byval);
7574 }
7575 }
7576 }
7577
7578 /* We are ready to sort the vector... perform a bit of sanity check
7579 * on the LIMIT option too. We'll use a partial version of quicksort. */
7580 start = (limit_start < 0) ? 0 : limit_start;
7581 end = (limit_count < 0) ? vectorlen-1 : start+limit_count-1;
7582 if (start >= vectorlen) {
7583 start = vectorlen-1;
7584 end = vectorlen-2;
7585 }
7586 if (end >= vectorlen) end = vectorlen-1;
7587
7588 if (dontsort == 0) {
7589 server.sort_desc = desc;
7590 server.sort_alpha = alpha;
7591 server.sort_bypattern = sortby ? 1 : 0;
7592 if (sortby && (start != 0 || end != vectorlen-1))
7593 pqsort(vector,vectorlen,sizeof(redisSortObject),sortCompare, start,end);
7594 else
7595 qsort(vector,vectorlen,sizeof(redisSortObject),sortCompare);
7596 }
7597
7598 /* Send command output to the output buffer, performing the specified
7599 * GET/DEL/INCR/DECR operations if any. */
7600 outputlen = getop ? getop*(end-start+1) : end-start+1;
7601 if (storekey == NULL) {
7602 /* STORE option not specified, sent the sorting result to client */
7603 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",outputlen));
7604 for (j = start; j <= end; j++) {
7605 listNode *ln;
7606 listIter li;
7607
7608 if (!getop) addReplyBulk(c,vector[j].obj);
7609 listRewind(operations,&li);
7610 while((ln = listNext(&li))) {
7611 redisSortOperation *sop = ln->value;
7612 robj *val = lookupKeyByPattern(c->db,sop->pattern,
7613 vector[j].obj);
7614
7615 if (sop->type == REDIS_SORT_GET) {
7616 if (!val) {
7617 addReply(c,shared.nullbulk);
7618 } else {
7619 addReplyBulk(c,val);
7620 decrRefCount(val);
7621 }
7622 } else {
7623 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
7624 }
7625 }
7626 }
7627 } else {
7628 robj *sobj = createZiplistObject();
7629
7630 /* STORE option specified, set the sorting result as a List object */
7631 for (j = start; j <= end; j++) {
7632 listNode *ln;
7633 listIter li;
7634
7635 if (!getop) {
7636 listTypePush(sobj,vector[j].obj,REDIS_TAIL);
7637 } else {
7638 listRewind(operations,&li);
7639 while((ln = listNext(&li))) {
7640 redisSortOperation *sop = ln->value;
7641 robj *val = lookupKeyByPattern(c->db,sop->pattern,
7642 vector[j].obj);
7643
7644 if (sop->type == REDIS_SORT_GET) {
7645 if (!val) val = createStringObject("",0);
7646
7647 /* listTypePush does an incrRefCount, so we should take care
7648 * care of the incremented refcount caused by either
7649 * lookupKeyByPattern or createStringObject("",0) */
7650 listTypePush(sobj,val,REDIS_TAIL);
7651 decrRefCount(val);
7652 } else {
7653 /* always fails */
7654 redisAssert(sop->type == REDIS_SORT_GET);
7655 }
7656 }
7657 }
7658 }
7659 dbReplace(c->db,storekey,sobj);
7660 /* Note: we add 1 because the DB is dirty anyway since even if the
7661 * SORT result is empty a new key is set and maybe the old content
7662 * replaced. */
7663 server.dirty += 1+outputlen;
7664 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",outputlen));
7665 }
7666
7667 /* Cleanup */
7668 if (sortval->type == REDIS_LIST)
7669 for (j = 0; j < vectorlen; j++)
7670 decrRefCount(vector[j].obj);
7671 decrRefCount(sortval);
7672 listRelease(operations);
7673 for (j = 0; j < vectorlen; j++) {
7674 if (alpha && vector[j].u.cmpobj)
7675 decrRefCount(vector[j].u.cmpobj);
7676 }
7677 zfree(vector);
7678 }
7679
/* Convert an amount of bytes into a human readable string in the form
 * of 100B, 2G, 100M, 4K, and so forth.
 *
 * 's' is the destination buffer, 'n' the number of bytes to describe.
 * Values below 1024 are printed as an integer followed by 'B'; larger
 * values are printed with two decimals and a K, M, G or T suffix
 * (powers of 1024). The caller must supply a buffer large enough for the
 * result; the longest possible output is well below 32 bytes. */
static void bytesToHuman(char *s, unsigned long long n) {
    double d;

    if (n < 1024) {
        /* Bytes */
        sprintf(s,"%lluB",n);
    } else if (n < (1024*1024)) {
        d = (double)n/(1024);
        sprintf(s,"%.2fK",d);
    } else if (n < (1024LL*1024*1024)) {
        d = (double)n/(1024*1024);
        sprintf(s,"%.2fM",d);
    } else if (n < (1024LL*1024*1024*1024)) {
        d = (double)n/(1024LL*1024*1024);
        sprintf(s,"%.2fG",d);
    } else {
        /* One terabyte or more. Without this branch the buffer would be
         * left untouched and the caller would print uninitialized data. */
        d = (double)n/(1024LL*1024*1024*1024);
        sprintf(s,"%.2fT",d);
    }
}
7700
7701 /* Create the string returned by the INFO command. This is decoupled
7702 * by the INFO command itself as we need to report the same information
7703 * on memory corruption problems. */
7704 static sds genRedisInfoString(void) {
7705 sds info;
7706 time_t uptime = time(NULL)-server.stat_starttime;
7707 int j;
7708 char hmem[64];
7709
7710 bytesToHuman(hmem,zmalloc_used_memory());
7711 info = sdscatprintf(sdsempty(),
7712 "redis_version:%s\r\n"
7713 "redis_git_sha1:%s\r\n"
7714 "redis_git_dirty:%d\r\n"
7715 "arch_bits:%s\r\n"
7716 "multiplexing_api:%s\r\n"
7717 "process_id:%ld\r\n"
7718 "uptime_in_seconds:%ld\r\n"
7719 "uptime_in_days:%ld\r\n"
7720 "connected_clients:%d\r\n"
7721 "connected_slaves:%d\r\n"
7722 "blocked_clients:%d\r\n"
7723 "used_memory:%zu\r\n"
7724 "used_memory_human:%s\r\n"
7725 "changes_since_last_save:%lld\r\n"
7726 "bgsave_in_progress:%d\r\n"
7727 "last_save_time:%ld\r\n"
7728 "bgrewriteaof_in_progress:%d\r\n"
7729 "total_connections_received:%lld\r\n"
7730 "total_commands_processed:%lld\r\n"
7731 "expired_keys:%lld\r\n"
7732 "hash_max_zipmap_entries:%zu\r\n"
7733 "hash_max_zipmap_value:%zu\r\n"
7734 "pubsub_channels:%ld\r\n"
7735 "pubsub_patterns:%u\r\n"
7736 "vm_enabled:%d\r\n"
7737 "role:%s\r\n"
7738 ,REDIS_VERSION,
7739 REDIS_GIT_SHA1,
7740 strtol(REDIS_GIT_DIRTY,NULL,10) > 0,
7741 (sizeof(long) == 8) ? "64" : "32",
7742 aeGetApiName(),
7743 (long) getpid(),
7744 uptime,
7745 uptime/(3600*24),
7746 listLength(server.clients)-listLength(server.slaves),
7747 listLength(server.slaves),
7748 server.blpop_blocked_clients,
7749 zmalloc_used_memory(),
7750 hmem,
7751 server.dirty,
7752 server.bgsavechildpid != -1,
7753 server.lastsave,
7754 server.bgrewritechildpid != -1,
7755 server.stat_numconnections,
7756 server.stat_numcommands,
7757 server.stat_expiredkeys,
7758 server.hash_max_zipmap_entries,
7759 server.hash_max_zipmap_value,
7760 dictSize(server.pubsub_channels),
7761 listLength(server.pubsub_patterns),
7762 server.vm_enabled != 0,
7763 server.masterhost == NULL ? "master" : "slave"
7764 );
7765 if (server.masterhost) {
7766 info = sdscatprintf(info,
7767 "master_host:%s\r\n"
7768 "master_port:%d\r\n"
7769 "master_link_status:%s\r\n"
7770 "master_last_io_seconds_ago:%d\r\n"
7771 ,server.masterhost,
7772 server.masterport,
7773 (server.replstate == REDIS_REPL_CONNECTED) ?
7774 "up" : "down",
7775 server.master ? ((int)(time(NULL)-server.master->lastinteraction)) : -1
7776 );
7777 }
7778 if (server.vm_enabled) {
7779 lockThreadedIO();
7780 info = sdscatprintf(info,
7781 "vm_conf_max_memory:%llu\r\n"
7782 "vm_conf_page_size:%llu\r\n"
7783 "vm_conf_pages:%llu\r\n"
7784 "vm_stats_used_pages:%llu\r\n"
7785 "vm_stats_swapped_objects:%llu\r\n"
7786 "vm_stats_swappin_count:%llu\r\n"
7787 "vm_stats_swappout_count:%llu\r\n"
7788 "vm_stats_io_newjobs_len:%lu\r\n"
7789 "vm_stats_io_processing_len:%lu\r\n"
7790 "vm_stats_io_processed_len:%lu\r\n"
7791 "vm_stats_io_active_threads:%lu\r\n"
7792 "vm_stats_blocked_clients:%lu\r\n"
7793 ,(unsigned long long) server.vm_max_memory,
7794 (unsigned long long) server.vm_page_size,
7795 (unsigned long long) server.vm_pages,
7796 (unsigned long long) server.vm_stats_used_pages,
7797 (unsigned long long) server.vm_stats_swapped_objects,
7798 (unsigned long long) server.vm_stats_swapins,
7799 (unsigned long long) server.vm_stats_swapouts,
7800 (unsigned long) listLength(server.io_newjobs),
7801 (unsigned long) listLength(server.io_processing),
7802 (unsigned long) listLength(server.io_processed),
7803 (unsigned long) server.io_active_threads,
7804 (unsigned long) server.vm_blocked_clients
7805 );
7806 unlockThreadedIO();
7807 }
7808 for (j = 0; j < server.dbnum; j++) {
7809 long long keys, vkeys;
7810
7811 keys = dictSize(server.db[j].dict);
7812 vkeys = dictSize(server.db[j].expires);
7813 if (keys || vkeys) {
7814 info = sdscatprintf(info, "db%d:keys=%lld,expires=%lld\r\n",
7815 j, keys, vkeys);
7816 }
7817 }
7818 return info;
7819 }
7820
7821 static void infoCommand(redisClient *c) {
7822 sds info = genRedisInfoString();
7823 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",
7824 (unsigned long)sdslen(info)));
7825 addReplySds(c,info);
7826 addReply(c,shared.crlf);
7827 }
7828
7829 static void monitorCommand(redisClient *c) {
7830 /* ignore MONITOR if aleady slave or in monitor mode */
7831 if (c->flags & REDIS_SLAVE) return;
7832
7833 c->flags |= (REDIS_SLAVE|REDIS_MONITOR);
7834 c->slaveseldb = 0;
7835 listAddNodeTail(server.monitors,c);
7836 addReply(c,shared.ok);
7837 }
7838
7839 /* ================================= Expire ================================= */
7840 static int removeExpire(redisDb *db, robj *key) {
7841 if (dictDelete(db->expires,key->ptr) == DICT_OK) {
7842 return 1;
7843 } else {
7844 return 0;
7845 }
7846 }
7847
7848 static int setExpire(redisDb *db, robj *key, time_t when) {
7849 sds copy = sdsdup(key->ptr);
7850 if (dictAdd(db->expires,copy,(void*)when) == DICT_ERR) {
7851 sdsfree(copy);
7852 return 0;
7853 } else {
7854 return 1;
7855 }
7856 }
7857
7858 /* Return the expire time of the specified key, or -1 if no expire
7859 * is associated with this key (i.e. the key is non volatile) */
7860 static time_t getExpire(redisDb *db, robj *key) {
7861 dictEntry *de;
7862
7863 /* No expire? return ASAP */
7864 if (dictSize(db->expires) == 0 ||
7865 (de = dictFind(db->expires,key->ptr)) == NULL) return -1;
7866
7867 return (time_t) dictGetEntryVal(de);
7868 }
7869
7870 static int expireIfNeeded(redisDb *db, robj *key) {
7871 time_t when;
7872 dictEntry *de;
7873
7874 /* No expire? return ASAP */
7875 if (dictSize(db->expires) == 0 ||
7876 (de = dictFind(db->expires,key->ptr)) == NULL) return 0;
7877
7878 /* Lookup the expire */
7879 when = (time_t) dictGetEntryVal(de);
7880 if (time(NULL) <= when) return 0;
7881
7882 /* Delete the key */
7883 dbDelete(db,key);
7884 server.stat_expiredkeys++;
7885 return 1;
7886 }
7887
7888 static int deleteIfVolatile(redisDb *db, robj *key) {
7889 dictEntry *de;
7890
7891 /* No expire? return ASAP */
7892 if (dictSize(db->expires) == 0 ||
7893 (de = dictFind(db->expires,key->ptr)) == NULL) return 0;
7894
7895 /* Delete the key */
7896 server.dirty++;
7897 server.stat_expiredkeys++;
7898 dictDelete(db->expires,key->ptr);
7899 return dictDelete(db->dict,key->ptr) == DICT_OK;
7900 }
7901
7902 static void expireGenericCommand(redisClient *c, robj *key, robj *param, long offset) {
7903 dictEntry *de;
7904 time_t seconds;
7905
7906 if (getLongFromObjectOrReply(c, param, &seconds, NULL) != REDIS_OK) return;
7907
7908 seconds -= offset;
7909
7910 de = dictFind(c->db->dict,key->ptr);
7911 if (de == NULL) {
7912 addReply(c,shared.czero);
7913 return;
7914 }
7915 if (seconds <= 0) {
7916 if (dbDelete(c->db,key)) server.dirty++;
7917 addReply(c, shared.cone);
7918 return;
7919 } else {
7920 time_t when = time(NULL)+seconds;
7921 if (setExpire(c->db,key,when)) {
7922 addReply(c,shared.cone);
7923 server.dirty++;
7924 } else {
7925 addReply(c,shared.czero);
7926 }
7927 return;
7928 }
7929 }
7930
7931 static void expireCommand(redisClient *c) {
7932 expireGenericCommand(c,c->argv[1],c->argv[2],0);
7933 }
7934
7935 static void expireatCommand(redisClient *c) {
7936 expireGenericCommand(c,c->argv[1],c->argv[2],time(NULL));
7937 }
7938
7939 static void ttlCommand(redisClient *c) {
7940 time_t expire;
7941 int ttl = -1;
7942
7943 expire = getExpire(c->db,c->argv[1]);
7944 if (expire != -1) {
7945 ttl = (int) (expire-time(NULL));
7946 if (ttl < 0) ttl = -1;
7947 }
7948 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",ttl));
7949 }
7950
7951 /* ================================ MULTI/EXEC ============================== */
7952
7953 /* Client state initialization for MULTI/EXEC */
7954 static void initClientMultiState(redisClient *c) {
7955 c->mstate.commands = NULL;
7956 c->mstate.count = 0;
7957 }
7958
7959 /* Release all the resources associated with MULTI/EXEC state */
7960 static void freeClientMultiState(redisClient *c) {
7961 int j;
7962
7963 for (j = 0; j < c->mstate.count; j++) {
7964 int i;
7965 multiCmd *mc = c->mstate.commands+j;
7966
7967 for (i = 0; i < mc->argc; i++)
7968 decrRefCount(mc->argv[i]);
7969 zfree(mc->argv);
7970 }
7971 zfree(c->mstate.commands);
7972 }
7973
7974 /* Add a new command into the MULTI commands queue */
7975 static void queueMultiCommand(redisClient *c, struct redisCommand *cmd) {
7976 multiCmd *mc;
7977 int j;
7978
7979 c->mstate.commands = zrealloc(c->mstate.commands,
7980 sizeof(multiCmd)*(c->mstate.count+1));
7981 mc = c->mstate.commands+c->mstate.count;
7982 mc->cmd = cmd;
7983 mc->argc = c->argc;
7984 mc->argv = zmalloc(sizeof(robj*)*c->argc);
7985 memcpy(mc->argv,c->argv,sizeof(robj*)*c->argc);
7986 for (j = 0; j < c->argc; j++)
7987 incrRefCount(mc->argv[j]);
7988 c->mstate.count++;
7989 }
7990
7991 static void multiCommand(redisClient *c) {
7992 if (c->flags & REDIS_MULTI) {
7993 addReplySds(c,sdsnew("-ERR MULTI calls can not be nested\r\n"));
7994 return;
7995 }
7996 c->flags |= REDIS_MULTI;
7997 addReply(c,shared.ok);
7998 }
7999
8000 static void discardCommand(redisClient *c) {
8001 if (!(c->flags & REDIS_MULTI)) {
8002 addReplySds(c,sdsnew("-ERR DISCARD without MULTI\r\n"));
8003 return;
8004 }
8005
8006 freeClientMultiState(c);
8007 initClientMultiState(c);
8008 c->flags &= (~REDIS_MULTI);
8009 addReply(c,shared.ok);
8010 }
8011
8012 /* Send a MULTI command to all the slaves and AOF file. Check the execCommand
8013 * implememntation for more information. */
8014 static void execCommandReplicateMulti(redisClient *c) {
8015 struct redisCommand *cmd;
8016 robj *multistring = createStringObject("MULTI",5);
8017
8018 cmd = lookupCommand("multi");
8019 if (server.appendonly)
8020 feedAppendOnlyFile(cmd,c->db->id,&multistring,1);
8021 if (listLength(server.slaves))
8022 replicationFeedSlaves(server.slaves,c->db->id,&multistring,1);
8023 decrRefCount(multistring);
8024 }
8025
8026 static void execCommand(redisClient *c) {
8027 int j;
8028 robj **orig_argv;
8029 int orig_argc;
8030
8031 if (!(c->flags & REDIS_MULTI)) {
8032 addReplySds(c,sdsnew("-ERR EXEC without MULTI\r\n"));
8033 return;
8034 }
8035
8036 /* Check if we need to abort the EXEC if some WATCHed key was touched.
8037 * A failed EXEC will return a multi bulk nil object. */
8038 if (c->flags & REDIS_DIRTY_CAS) {
8039 freeClientMultiState(c);
8040 initClientMultiState(c);
8041 c->flags &= ~(REDIS_MULTI|REDIS_DIRTY_CAS);
8042 unwatchAllKeys(c);
8043 addReply(c,shared.nullmultibulk);
8044 return;
8045 }
8046
8047 /* Replicate a MULTI request now that we are sure the block is executed.
8048 * This way we'll deliver the MULTI/..../EXEC block as a whole and
8049 * both the AOF and the replication link will have the same consistency
8050 * and atomicity guarantees. */
8051 execCommandReplicateMulti(c);
8052
8053 /* Exec all the queued commands */
8054 unwatchAllKeys(c); /* Unwatch ASAP otherwise we'll waste CPU cycles */
8055 orig_argv = c->argv;
8056 orig_argc = c->argc;
8057 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->mstate.count));
8058 for (j = 0; j < c->mstate.count; j++) {
8059 c->argc = c->mstate.commands[j].argc;
8060 c->argv = c->mstate.commands[j].argv;
8061 call(c,c->mstate.commands[j].cmd);
8062 }
8063 c->argv = orig_argv;
8064 c->argc = orig_argc;
8065 freeClientMultiState(c);
8066 initClientMultiState(c);
8067 c->flags &= ~(REDIS_MULTI|REDIS_DIRTY_CAS);
8068 /* Make sure the EXEC command is always replicated / AOF, since we
8069 * always send the MULTI command (we can't know beforehand if the
8070 * next operations will contain at least a modification to the DB). */
8071 server.dirty++;
8072 }
8073
8074 /* =========================== Blocking Operations ========================= */
8075
/* Currently Redis blocking operations support is limited to list POP ops,
 * so the current implementation is not fully generic, but it is also not
 * completely specific, so it will not require a rewrite to support new
 * kinds of blocking operations in the future.
 *
 * Still it's important to note that list blocking operations can already be
 * used as a notification mechanism in order to implement other blocking
 * operations at application level, so there must be very strong evidence
 * of usefulness and generality before new blocking operations are implemented.
 *
 * This is how the current blocking POP works, using BLPOP as an example:
 * - If the user calls BLPOP and the key exists and contains a non empty list
 *   then LPOP is called instead. So BLPOP is semantically the same as LPOP
 *   if there is no need to block.
 * - If instead BLPOP is called and the key does not exist or the list is
 *   empty we need to block. In order to do so we remove the notification for
 *   new data to read in the client socket (so that we'll not serve new
 *   requests if the blocking request is not served). Also we put the client
 *   in a dictionary (db->blocking_keys) mapping keys to a list of clients
 *   blocking for these keys.
 * - If a PUSH operation against a key with blocked clients waiting is
 *   performed, we serve the first in the list: basically instead of pushing
 *   the new element inside the list we return it to the (first / oldest)
 *   blocking client, unblock the client, and remove it from the list.
 *
 * The above comment and the source code should be enough in order to understand
 * the implementation and modify / fix it later.
 */
8104
8105 /* Set a client in blocking mode for the specified key, with the specified
8106 * timeout */
8107 static void blockForKeys(redisClient *c, robj **keys, int numkeys, time_t timeout) {
8108 dictEntry *de;
8109 list *l;
8110 int j;
8111
8112 c->blocking_keys = zmalloc(sizeof(robj*)*numkeys);
8113 c->blocking_keys_num = numkeys;
8114 c->blockingto = timeout;
8115 for (j = 0; j < numkeys; j++) {
8116 /* Add the key in the client structure, to map clients -> keys */
8117 c->blocking_keys[j] = keys[j];
8118 incrRefCount(keys[j]);
8119
8120 /* And in the other "side", to map keys -> clients */
8121 de = dictFind(c->db->blocking_keys,keys[j]);
8122 if (de == NULL) {
8123 int retval;
8124
8125 /* For every key we take a list of clients blocked for it */
8126 l = listCreate();
8127 retval = dictAdd(c->db->blocking_keys,keys[j],l);
8128 incrRefCount(keys[j]);
8129 assert(retval == DICT_OK);
8130 } else {
8131 l = dictGetEntryVal(de);
8132 }
8133 listAddNodeTail(l,c);
8134 }
8135 /* Mark the client as a blocked client */
8136 c->flags |= REDIS_BLOCKED;
8137 server.blpop_blocked_clients++;
8138 }
8139
8140 /* Unblock a client that's waiting in a blocking operation such as BLPOP */
8141 static void unblockClientWaitingData(redisClient *c) {
8142 dictEntry *de;
8143 list *l;
8144 int j;
8145
8146 assert(c->blocking_keys != NULL);
8147 /* The client may wait for multiple keys, so unblock it for every key. */
8148 for (j = 0; j < c->blocking_keys_num; j++) {
8149 /* Remove this client from the list of clients waiting for this key. */
8150 de = dictFind(c->db->blocking_keys,c->blocking_keys[j]);
8151 assert(de != NULL);
8152 l = dictGetEntryVal(de);
8153 listDelNode(l,listSearchKey(l,c));
8154 /* If the list is empty we need to remove it to avoid wasting memory */
8155 if (listLength(l) == 0)
8156 dictDelete(c->db->blocking_keys,c->blocking_keys[j]);
8157 decrRefCount(c->blocking_keys[j]);
8158 }
8159 /* Cleanup the client structure */
8160 zfree(c->blocking_keys);
8161 c->blocking_keys = NULL;
8162 c->flags &= (~REDIS_BLOCKED);
8163 server.blpop_blocked_clients--;
8164 /* We want to process data if there is some command waiting
8165 * in the input buffer. Note that this is safe even if
8166 * unblockClientWaitingData() gets called from freeClient() because
8167 * freeClient() will be smart enough to call this function
8168 * *after* c->querybuf was set to NULL. */
8169 if (c->querybuf && sdslen(c->querybuf) > 0) processInputBuffer(c);
8170 }
8171
8172 /* This should be called from any function PUSHing into lists.
8173 * 'c' is the "pushing client", 'key' is the key it is pushing data against,
8174 * 'ele' is the element pushed.
8175 *
8176 * If the function returns 0 there was no client waiting for a list push
8177 * against this key.
8178 *
8179 * If the function returns 1 there was a client waiting for a list push
8180 * against this key, the element was passed to this client thus it's not
8181 * needed to actually add it to the list and the caller should return asap. */
8182 static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele) {
8183 struct dictEntry *de;
8184 redisClient *receiver;
8185 list *l;
8186 listNode *ln;
8187
8188 de = dictFind(c->db->blocking_keys,key);
8189 if (de == NULL) return 0;
8190 l = dictGetEntryVal(de);
8191 ln = listFirst(l);
8192 assert(ln != NULL);
8193 receiver = ln->value;
8194
8195 addReplySds(receiver,sdsnew("*2\r\n"));
8196 addReplyBulk(receiver,key);
8197 addReplyBulk(receiver,ele);
8198 unblockClientWaitingData(receiver);
8199 return 1;
8200 }
8201
8202 /* Blocking RPOP/LPOP */
8203 static void blockingPopGenericCommand(redisClient *c, int where) {
8204 robj *o;
8205 time_t timeout;
8206 int j;
8207
8208 for (j = 1; j < c->argc-1; j++) {
8209 o = lookupKeyWrite(c->db,c->argv[j]);
8210 if (o != NULL) {
8211 if (o->type != REDIS_LIST) {
8212 addReply(c,shared.wrongtypeerr);
8213 return;
8214 } else {
8215 list *list = o->ptr;
8216 if (listLength(list) != 0) {
8217 /* If the list contains elements fall back to the usual
8218 * non-blocking POP operation */
8219 robj *argv[2], **orig_argv;
8220 int orig_argc;
8221
8222 /* We need to alter the command arguments before to call
8223 * popGenericCommand() as the command takes a single key. */
8224 orig_argv = c->argv;
8225 orig_argc = c->argc;
8226 argv[1] = c->argv[j];
8227 c->argv = argv;
8228 c->argc = 2;
8229
8230 /* Also the return value is different, we need to output
8231 * the multi bulk reply header and the key name. The
8232 * "real" command will add the last element (the value)
8233 * for us. If this souds like an hack to you it's just
8234 * because it is... */
8235 addReplySds(c,sdsnew("*2\r\n"));
8236 addReplyBulk(c,argv[1]);
8237 popGenericCommand(c,where);
8238
8239 /* Fix the client structure with the original stuff */
8240 c->argv = orig_argv;
8241 c->argc = orig_argc;
8242 return;
8243 }
8244 }
8245 }
8246 }
8247 /* If the list is empty or the key does not exists we must block */
8248 timeout = strtol(c->argv[c->argc-1]->ptr,NULL,10);
8249 if (timeout > 0) timeout += time(NULL);
8250 blockForKeys(c,c->argv+1,c->argc-2,timeout);
8251 }
8252
8253 static void blpopCommand(redisClient *c) {
8254 blockingPopGenericCommand(c,REDIS_HEAD);
8255 }
8256
8257 static void brpopCommand(redisClient *c) {
8258 blockingPopGenericCommand(c,REDIS_TAIL);
8259 }
8260
8261 /* =============================== Replication ============================= */
8262
8263 static int syncWrite(int fd, char *ptr, ssize_t size, int timeout) {
8264 ssize_t nwritten, ret = size;
8265 time_t start = time(NULL);
8266
8267 timeout++;
8268 while(size) {
8269 if (aeWait(fd,AE_WRITABLE,1000) & AE_WRITABLE) {
8270 nwritten = write(fd,ptr,size);
8271 if (nwritten == -1) return -1;
8272 ptr += nwritten;
8273 size -= nwritten;
8274 }
8275 if ((time(NULL)-start) > timeout) {
8276 errno = ETIMEDOUT;
8277 return -1;
8278 }
8279 }
8280 return ret;
8281 }
8282
8283 static int syncRead(int fd, char *ptr, ssize_t size, int timeout) {
8284 ssize_t nread, totread = 0;
8285 time_t start = time(NULL);
8286
8287 timeout++;
8288 while(size) {
8289 if (aeWait(fd,AE_READABLE,1000) & AE_READABLE) {
8290 nread = read(fd,ptr,size);
8291 if (nread == -1) return -1;
8292 ptr += nread;
8293 size -= nread;
8294 totread += nread;
8295 }
8296 if ((time(NULL)-start) > timeout) {
8297 errno = ETIMEDOUT;
8298 return -1;
8299 }
8300 }
8301 return totread;
8302 }
8303
/* Read a line from 'fd', one byte at a time, into the buffer 'ptr' of
 * 'size' bytes. Reading stops at the first '\n', which is not stored; a
 * '\r' right before it is stripped as well. The buffer is always
 * null terminated.
 *
 * 'timeout' is the timeout in seconds passed to syncRead() for every
 * single byte.
 *
 * Returns the number of bytes read before the newline (including the
 * stripped '\r', if any). If the line does not fit, reading stops after
 * size-1 bytes and that count is returned. Returns -1 if syncRead()
 * fails, or if 'size' is not positive (errno set to EINVAL). */
static int syncReadLine(int fd, char *ptr, ssize_t size, int timeout) {
    ssize_t nread = 0;

    if (size <= 0) {
        /* No room even for the null terminator. */
        errno = EINVAL;
        return -1;
    }
    *ptr = '\0';

    size--; /* Reserve one byte for the null terminator. */
    while(size) {
        char c;

        if (syncRead(fd,&c,1,timeout) == -1) return -1;
        if (c == '\n') {
            *ptr = '\0';
            if (nread && *(ptr-1) == '\r') *(ptr-1) = '\0';
            return nread;
        } else {
            *ptr++ = c;
            *ptr = '\0';
            nread++;
        }
        /* One byte of the buffer was consumed: without this the loop
         * would keep writing past the end of the buffer on long lines. */
        size--;
    }
    return nread;
}
8324
8325 static void syncCommand(redisClient *c) {
8326 /* ignore SYNC if aleady slave or in monitor mode */
8327 if (c->flags & REDIS_SLAVE) return;
8328
8329 /* SYNC can't be issued when the server has pending data to send to
8330 * the client about already issued commands. We need a fresh reply
8331 * buffer registering the differences between the BGSAVE and the current
8332 * dataset, so that we can copy to other slaves if needed. */
8333 if (listLength(c->reply) != 0) {
8334 addReplySds(c,sdsnew("-ERR SYNC is invalid with pending input\r\n"));
8335 return;
8336 }
8337
8338 redisLog(REDIS_NOTICE,"Slave ask for synchronization");
8339 /* Here we need to check if there is a background saving operation
8340 * in progress, or if it is required to start one */
8341 if (server.bgsavechildpid != -1) {
8342 /* Ok a background save is in progress. Let's check if it is a good
8343 * one for replication, i.e. if there is another slave that is
8344 * registering differences since the server forked to save */
8345 redisClient *slave;
8346 listNode *ln;
8347 listIter li;
8348
8349 listRewind(server.slaves,&li);
8350 while((ln = listNext(&li))) {
8351 slave = ln->value;
8352 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) break;
8353 }
8354 if (ln) {
8355 /* Perfect, the server is already registering differences for
8356 * another slave. Set the right state, and copy the buffer. */
8357 listRelease(c->reply);
8358 c->reply = listDup(slave->reply);
8359 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
8360 redisLog(REDIS_NOTICE,"Waiting for end of BGSAVE for SYNC");
8361 } else {
8362 /* No way, we need to wait for the next BGSAVE in order to
8363 * register differences */
8364 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
8365 redisLog(REDIS_NOTICE,"Waiting for next BGSAVE for SYNC");
8366 }
8367 } else {
8368 /* Ok we don't have a BGSAVE in progress, let's start one */
8369 redisLog(REDIS_NOTICE,"Starting BGSAVE for SYNC");
8370 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
8371 redisLog(REDIS_NOTICE,"Replication failed, can't BGSAVE");
8372 addReplySds(c,sdsnew("-ERR Unalbe to perform background save\r\n"));
8373 return;
8374 }
8375 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
8376 }
8377 c->repldbfd = -1;
8378 c->flags |= REDIS_SLAVE;
8379 c->slaveseldb = 0;
8380 listAddNodeTail(server.slaves,c);
8381 return;
8382 }
8383
8384 static void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) {
8385 redisClient *slave = privdata;
8386 REDIS_NOTUSED(el);
8387 REDIS_NOTUSED(mask);
8388 char buf[REDIS_IOBUF_LEN];
8389 ssize_t nwritten, buflen;
8390
8391 if (slave->repldboff == 0) {
8392 /* Write the bulk write count before to transfer the DB. In theory here
8393 * we don't know how much room there is in the output buffer of the
8394 * socket, but in pratice SO_SNDLOWAT (the minimum count for output
8395 * operations) will never be smaller than the few bytes we need. */
8396 sds bulkcount;
8397
8398 bulkcount = sdscatprintf(sdsempty(),"$%lld\r\n",(unsigned long long)
8399 slave->repldbsize);
8400 if (write(fd,bulkcount,sdslen(bulkcount)) != (signed)sdslen(bulkcount))
8401 {
8402 sdsfree(bulkcount);
8403 freeClient(slave);
8404 return;
8405 }
8406 sdsfree(bulkcount);
8407 }
8408 lseek(slave->repldbfd,slave->repldboff,SEEK_SET);
8409 buflen = read(slave->repldbfd,buf,REDIS_IOBUF_LEN);
8410 if (buflen <= 0) {
8411 redisLog(REDIS_WARNING,"Read error sending DB to slave: %s",
8412 (buflen == 0) ? "premature EOF" : strerror(errno));
8413 freeClient(slave);
8414 return;
8415 }
8416 if ((nwritten = write(fd,buf,buflen)) == -1) {
8417 redisLog(REDIS_VERBOSE,"Write error sending DB to slave: %s",
8418 strerror(errno));
8419 freeClient(slave);
8420 return;
8421 }
8422 slave->repldboff += nwritten;
8423 if (slave->repldboff == slave->repldbsize) {
8424 close(slave->repldbfd);
8425 slave->repldbfd = -1;
8426 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
8427 slave->replstate = REDIS_REPL_ONLINE;
8428 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE,
8429 sendReplyToClient, slave) == AE_ERR) {
8430 freeClient(slave);
8431 return;
8432 }
8433 addReplySds(slave,sdsempty());
8434 redisLog(REDIS_NOTICE,"Synchronization with slave succeeded");
8435 }
8436 }
8437
8438 /* This function is called at the end of every backgrond saving.
8439 * The argument bgsaveerr is REDIS_OK if the background saving succeeded
8440 * otherwise REDIS_ERR is passed to the function.
8441 *
8442 * The goal of this function is to handle slaves waiting for a successful
8443 * background saving in order to perform non-blocking synchronization. */
8444 static void updateSlavesWaitingBgsave(int bgsaveerr) {
8445 listNode *ln;
8446 int startbgsave = 0;
8447 listIter li;
8448
8449 listRewind(server.slaves,&li);
8450 while((ln = listNext(&li))) {
8451 redisClient *slave = ln->value;
8452
8453 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) {
8454 startbgsave = 1;
8455 slave->replstate = REDIS_REPL_WAIT_BGSAVE_END;
8456 } else if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) {
8457 struct redis_stat buf;
8458
8459 if (bgsaveerr != REDIS_OK) {
8460 freeClient(slave);
8461 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE child returned an error");
8462 continue;
8463 }
8464 if ((slave->repldbfd = open(server.dbfilename,O_RDONLY)) == -1 ||
8465 redis_fstat(slave->repldbfd,&buf) == -1) {
8466 freeClient(slave);
8467 redisLog(REDIS_WARNING,"SYNC failed. Can't open/stat DB after BGSAVE: %s", strerror(errno));
8468 continue;
8469 }
8470 slave->repldboff = 0;
8471 slave->repldbsize = buf.st_size;
8472 slave->replstate = REDIS_REPL_SEND_BULK;
8473 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
8474 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE, sendBulkToSlave, slave) == AE_ERR) {
8475 freeClient(slave);
8476 continue;
8477 }
8478 }
8479 }
8480 if (startbgsave) {
8481 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
8482 listIter li;
8483
8484 listRewind(server.slaves,&li);
8485 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE failed");
8486 while((ln = listNext(&li))) {
8487 redisClient *slave = ln->value;
8488
8489 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START)
8490 freeClient(slave);
8491 }
8492 }
8493 }
8494 }
8495
8496 static int syncWithMaster(void) {
8497 char buf[1024], tmpfile[256], authcmd[1024];
8498 long dumpsize;
8499 int fd = anetTcpConnect(NULL,server.masterhost,server.masterport);
8500 int dfd, maxtries = 5;
8501
8502 if (fd == -1) {
8503 redisLog(REDIS_WARNING,"Unable to connect to MASTER: %s",
8504 strerror(errno));
8505 return REDIS_ERR;
8506 }
8507
8508 /* AUTH with the master if required. */
8509 if(server.masterauth) {
8510 snprintf(authcmd, 1024, "AUTH %s\r\n", server.masterauth);
8511 if (syncWrite(fd, authcmd, strlen(server.masterauth)+7, 5) == -1) {
8512 close(fd);
8513 redisLog(REDIS_WARNING,"Unable to AUTH to MASTER: %s",
8514 strerror(errno));
8515 return REDIS_ERR;
8516 }
8517 /* Read the AUTH result. */
8518 if (syncReadLine(fd,buf,1024,3600) == -1) {
8519 close(fd);
8520 redisLog(REDIS_WARNING,"I/O error reading auth result from MASTER: %s",
8521 strerror(errno));
8522 return REDIS_ERR;
8523 }
8524 if (buf[0] != '+') {
8525 close(fd);
8526 redisLog(REDIS_WARNING,"Cannot AUTH to MASTER, is the masterauth password correct?");
8527 return REDIS_ERR;
8528 }
8529 }
8530
8531 /* Issue the SYNC command */
8532 if (syncWrite(fd,"SYNC \r\n",7,5) == -1) {
8533 close(fd);
8534 redisLog(REDIS_WARNING,"I/O error writing to MASTER: %s",
8535 strerror(errno));
8536 return REDIS_ERR;
8537 }
8538 /* Read the bulk write count */
8539 if (syncReadLine(fd,buf,1024,3600) == -1) {
8540 close(fd);
8541 redisLog(REDIS_WARNING,"I/O error reading bulk count from MASTER: %s",
8542 strerror(errno));
8543 return REDIS_ERR;
8544 }
8545 if (buf[0] != '$') {
8546 close(fd);
8547 redisLog(REDIS_WARNING,"Bad protocol from MASTER, the first byte is not '$', are you sure the host and port are right?");
8548 return REDIS_ERR;
8549 }
8550 dumpsize = strtol(buf+1,NULL,10);
8551 redisLog(REDIS_NOTICE,"Receiving %ld bytes data dump from MASTER",dumpsize);
8552 /* Read the bulk write data on a temp file */
8553 while(maxtries--) {
8554 snprintf(tmpfile,256,
8555 "temp-%d.%ld.rdb",(int)time(NULL),(long int)getpid());
8556 dfd = open(tmpfile,O_CREAT|O_WRONLY|O_EXCL,0644);
8557 if (dfd != -1) break;
8558 sleep(1);
8559 }
8560 if (dfd == -1) {
8561 close(fd);
8562 redisLog(REDIS_WARNING,"Opening the temp file needed for MASTER <-> SLAVE synchronization: %s",strerror(errno));
8563 return REDIS_ERR;
8564 }
8565 while(dumpsize) {
8566 int nread, nwritten;
8567
8568 nread = read(fd,buf,(dumpsize < 1024)?dumpsize:1024);
8569 if (nread == -1) {
8570 redisLog(REDIS_WARNING,"I/O error trying to sync with MASTER: %s",
8571 strerror(errno));
8572 close(fd);
8573 close(dfd);
8574 return REDIS_ERR;
8575 }
8576 nwritten = write(dfd,buf,nread);
8577 if (nwritten == -1) {
8578 redisLog(REDIS_WARNING,"Write error writing to the DB dump file needed for MASTER <-> SLAVE synchrnonization: %s", strerror(errno));
8579 close(fd);
8580 close(dfd);
8581 return REDIS_ERR;
8582 }
8583 dumpsize -= nread;
8584 }
8585 close(dfd);
8586 if (rename(tmpfile,server.dbfilename) == -1) {
8587 redisLog(REDIS_WARNING,"Failed trying to rename the temp DB into dump.rdb in MASTER <-> SLAVE synchronization: %s", strerror(errno));
8588 unlink(tmpfile);
8589 close(fd);
8590 return REDIS_ERR;
8591 }
8592 emptyDb();
8593 if (rdbLoad(server.dbfilename) != REDIS_OK) {
8594 redisLog(REDIS_WARNING,"Failed trying to load the MASTER synchronization DB from disk");
8595 close(fd);
8596 return REDIS_ERR;
8597 }
8598 server.master = createClient(fd);
8599 server.master->flags |= REDIS_MASTER;
8600 server.master->authenticated = 1;
8601 server.replstate = REDIS_REPL_CONNECTED;
8602 return REDIS_OK;
8603 }
8604
8605 static void slaveofCommand(redisClient *c) {
8606 if (!strcasecmp(c->argv[1]->ptr,"no") &&
8607 !strcasecmp(c->argv[2]->ptr,"one")) {
8608 if (server.masterhost) {
8609 sdsfree(server.masterhost);
8610 server.masterhost = NULL;
8611 if (server.master) freeClient(server.master);
8612 server.replstate = REDIS_REPL_NONE;
8613 redisLog(REDIS_NOTICE,"MASTER MODE enabled (user request)");
8614 }
8615 } else {
8616 sdsfree(server.masterhost);
8617 server.masterhost = sdsdup(c->argv[1]->ptr);
8618 server.masterport = atoi(c->argv[2]->ptr);
8619 if (server.master) freeClient(server.master);
8620 server.replstate = REDIS_REPL_CONNECT;
8621 redisLog(REDIS_NOTICE,"SLAVE OF %s:%d enabled (user request)",
8622 server.masterhost, server.masterport);
8623 }
8624 addReply(c,shared.ok);
8625 }
8626
8627 /* ============================ Maxmemory directive ======================== */
8628
8629 /* Try to free one object form the pre-allocated objects free list.
8630 * This is useful under low mem conditions as by default we take 1 million
8631 * free objects allocated. On success REDIS_OK is returned, otherwise
8632 * REDIS_ERR. */
8633 static int tryFreeOneObjectFromFreelist(void) {
8634 robj *o;
8635
8636 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
8637 if (listLength(server.objfreelist)) {
8638 listNode *head = listFirst(server.objfreelist);
8639 o = listNodeValue(head);
8640 listDelNode(server.objfreelist,head);
8641 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
8642 zfree(o);
8643 return REDIS_OK;
8644 } else {
8645 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
8646 return REDIS_ERR;
8647 }
8648 }
8649
8650 /* This function gets called when 'maxmemory' is set on the config file to limit
8651 * the max memory used by the server, and we are out of memory.
8652 * This function will try to, in order:
8653 *
8654 * - Free objects from the free list
8655 * - Try to remove keys with an EXPIRE set
8656 *
8657 * It is not possible to free enough memory to reach used-memory < maxmemory
8658 * the server will start refusing commands that will enlarge even more the
8659 * memory usage.
8660 */
8661 static void freeMemoryIfNeeded(void) {
8662 while (server.maxmemory && zmalloc_used_memory() > server.maxmemory) {
8663 int j, k, freed = 0;
8664
8665 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
8666 for (j = 0; j < server.dbnum; j++) {
8667 int minttl = -1;
8668 robj *minkey = NULL;
8669 struct dictEntry *de;
8670
8671 if (dictSize(server.db[j].expires)) {
8672 freed = 1;
8673 /* From a sample of three keys drop the one nearest to
8674 * the natural expire */
8675 for (k = 0; k < 3; k++) {
8676 time_t t;
8677
8678 de = dictGetRandomKey(server.db[j].expires);
8679 t = (time_t) dictGetEntryVal(de);
8680 if (minttl == -1 || t < minttl) {
8681 minkey = dictGetEntryKey(de);
8682 minttl = t;
8683 }
8684 }
8685 dbDelete(server.db+j,minkey);
8686 }
8687 }
8688 if (!freed) return; /* nothing to free... */
8689 }
8690 }
8691
8692 /* ============================== Append Only file ========================== */
8693
8694 /* Called when the user switches from "appendonly yes" to "appendonly no"
8695 * at runtime using the CONFIG command. */
8696 static void stopAppendOnly(void) {
8697 flushAppendOnlyFile();
8698 aof_fsync(server.appendfd);
8699 close(server.appendfd);
8700
8701 server.appendfd = -1;
8702 server.appendseldb = -1;
8703 server.appendonly = 0;
8704 /* rewrite operation in progress? kill it, wait child exit */
8705 if (server.bgsavechildpid != -1) {
8706 int statloc;
8707
8708 if (kill(server.bgsavechildpid,SIGKILL) != -1)
8709 wait3(&statloc,0,NULL);
8710 /* reset the buffer accumulating changes while the child saves */
8711 sdsfree(server.bgrewritebuf);
8712 server.bgrewritebuf = sdsempty();
8713 server.bgsavechildpid = -1;
8714 }
8715 }
8716
8717 /* Called when the user switches from "appendonly no" to "appendonly yes"
8718 * at runtime using the CONFIG command. */
8719 static int startAppendOnly(void) {
8720 server.appendonly = 1;
8721 server.lastfsync = time(NULL);
8722 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
8723 if (server.appendfd == -1) {
8724 redisLog(REDIS_WARNING,"Used tried to switch on AOF via CONFIG, but I can't open the AOF file: %s",strerror(errno));
8725 return REDIS_ERR;
8726 }
8727 if (rewriteAppendOnlyFileBackground() == REDIS_ERR) {
8728 server.appendonly = 0;
8729 close(server.appendfd);
8730 redisLog(REDIS_WARNING,"Used tried to switch on AOF via CONFIG, I can't trigger a background AOF rewrite operation. Check the above logs for more info about the error.",strerror(errno));
8731 return REDIS_ERR;
8732 }
8733 return REDIS_OK;
8734 }
8735
8736 /* Write the append only file buffer on disk.
8737 *
8738 * Since we are required to write the AOF before replying to the client,
8739 * and the only way the client socket can get a write is entering when the
8740 * the event loop, we accumulate all the AOF writes in a memory
8741 * buffer and write it on disk using this function just before entering
8742 * the event loop again. */
8743 static void flushAppendOnlyFile(void) {
8744 time_t now;
8745 ssize_t nwritten;
8746
8747 if (sdslen(server.aofbuf) == 0) return;
8748
8749 /* We want to perform a single write. This should be guaranteed atomic
8750 * at least if the filesystem we are writing is a real physical one.
8751 * While this will save us against the server being killed I don't think
8752 * there is much to do about the whole server stopping for power problems
8753 * or alike */
8754 nwritten = write(server.appendfd,server.aofbuf,sdslen(server.aofbuf));
8755 if (nwritten != (signed)sdslen(server.aofbuf)) {
8756 /* Ooops, we are in troubles. The best thing to do for now is
8757 * aborting instead of giving the illusion that everything is
8758 * working as expected. */
8759 if (nwritten == -1) {
8760 redisLog(REDIS_WARNING,"Exiting on error writing to the append-only file: %s",strerror(errno));
8761 } else {
8762 redisLog(REDIS_WARNING,"Exiting on short write while writing to the append-only file: %s",strerror(errno));
8763 }
8764 exit(1);
8765 }
8766 sdsfree(server.aofbuf);
8767 server.aofbuf = sdsempty();
8768
8769 /* Don't Fsync if no-appendfsync-on-rewrite is set to yes and we have
8770 * childs performing heavy I/O on disk. */
8771 if (server.no_appendfsync_on_rewrite &&
8772 (server.bgrewritechildpid != -1 || server.bgsavechildpid != -1))
8773 return;
8774 /* Fsync if needed */
8775 now = time(NULL);
8776 if (server.appendfsync == APPENDFSYNC_ALWAYS ||
8777 (server.appendfsync == APPENDFSYNC_EVERYSEC &&
8778 now-server.lastfsync > 1))
8779 {
8780 /* aof_fsync is defined as fdatasync() for Linux in order to avoid
8781 * flushing metadata. */
8782 aof_fsync(server.appendfd); /* Let's try to get this data on the disk */
8783 server.lastfsync = now;
8784 }
8785 }
8786
8787 static sds catAppendOnlyGenericCommand(sds buf, int argc, robj **argv) {
8788 int j;
8789 buf = sdscatprintf(buf,"*%d\r\n",argc);
8790 for (j = 0; j < argc; j++) {
8791 robj *o = getDecodedObject(argv[j]);
8792 buf = sdscatprintf(buf,"$%lu\r\n",(unsigned long)sdslen(o->ptr));
8793 buf = sdscatlen(buf,o->ptr,sdslen(o->ptr));
8794 buf = sdscatlen(buf,"\r\n",2);
8795 decrRefCount(o);
8796 }
8797 return buf;
8798 }
8799
8800 static sds catAppendOnlyExpireAtCommand(sds buf, robj *key, robj *seconds) {
8801 int argc = 3;
8802 long when;
8803 robj *argv[3];
8804
8805 /* Make sure we can use strtol */
8806 seconds = getDecodedObject(seconds);
8807 when = time(NULL)+strtol(seconds->ptr,NULL,10);
8808 decrRefCount(seconds);
8809
8810 argv[0] = createStringObject("EXPIREAT",8);
8811 argv[1] = key;
8812 argv[2] = createObject(REDIS_STRING,
8813 sdscatprintf(sdsempty(),"%ld",when));
8814 buf = catAppendOnlyGenericCommand(buf, argc, argv);
8815 decrRefCount(argv[0]);
8816 decrRefCount(argv[2]);
8817 return buf;
8818 }
8819
8820 static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) {
8821 sds buf = sdsempty();
8822 robj *tmpargv[3];
8823
8824 /* The DB this command was targetting is not the same as the last command
8825 * we appendend. To issue a SELECT command is needed. */
8826 if (dictid != server.appendseldb) {
8827 char seldb[64];
8828
8829 snprintf(seldb,sizeof(seldb),"%d",dictid);
8830 buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
8831 (unsigned long)strlen(seldb),seldb);
8832 server.appendseldb = dictid;
8833 }
8834
8835 if (cmd->proc == expireCommand) {
8836 /* Translate EXPIRE into EXPIREAT */
8837 buf = catAppendOnlyExpireAtCommand(buf,argv[1],argv[2]);
8838 } else if (cmd->proc == setexCommand) {
8839 /* Translate SETEX to SET and EXPIREAT */
8840 tmpargv[0] = createStringObject("SET",3);
8841 tmpargv[1] = argv[1];
8842 tmpargv[2] = argv[3];
8843 buf = catAppendOnlyGenericCommand(buf,3,tmpargv);
8844 decrRefCount(tmpargv[0]);
8845 buf = catAppendOnlyExpireAtCommand(buf,argv[1],argv[2]);
8846 } else {
8847 buf = catAppendOnlyGenericCommand(buf,argc,argv);
8848 }
8849
8850 /* Append to the AOF buffer. This will be flushed on disk just before
8851 * of re-entering the event loop, so before the client will get a
8852 * positive reply about the operation performed. */
8853 server.aofbuf = sdscatlen(server.aofbuf,buf,sdslen(buf));
8854
8855 /* If a background append only file rewriting is in progress we want to
8856 * accumulate the differences between the child DB and the current one
8857 * in a buffer, so that when the child process will do its work we
8858 * can append the differences to the new append only file. */
8859 if (server.bgrewritechildpid != -1)
8860 server.bgrewritebuf = sdscatlen(server.bgrewritebuf,buf,sdslen(buf));
8861
8862 sdsfree(buf);
8863 }
8864
8865 /* In Redis commands are always executed in the context of a client, so in
8866 * order to load the append only file we need to create a fake client. */
8867 static struct redisClient *createFakeClient(void) {
8868 struct redisClient *c = zmalloc(sizeof(*c));
8869
8870 selectDb(c,0);
8871 c->fd = -1;
8872 c->querybuf = sdsempty();
8873 c->argc = 0;
8874 c->argv = NULL;
8875 c->flags = 0;
8876 /* We set the fake client as a slave waiting for the synchronization
8877 * so that Redis will not try to send replies to this client. */
8878 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
8879 c->reply = listCreate();
8880 listSetFreeMethod(c->reply,decrRefCount);
8881 listSetDupMethod(c->reply,dupClientReplyValue);
8882 initClientMultiState(c);
8883 return c;
8884 }
8885
8886 static void freeFakeClient(struct redisClient *c) {
8887 sdsfree(c->querybuf);
8888 listRelease(c->reply);
8889 freeClientMultiState(c);
8890 zfree(c);
8891 }
8892
8893 /* Replay the append log file. On error REDIS_OK is returned. On non fatal
8894 * error (the append only file is zero-length) REDIS_ERR is returned. On
8895 * fatal error an error message is logged and the program exists. */
8896 int loadAppendOnlyFile(char *filename) {
8897 struct redisClient *fakeClient;
8898 FILE *fp = fopen(filename,"r");
8899 struct redis_stat sb;
8900 int appendonly = server.appendonly;
8901
8902 if (redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0)
8903 return REDIS_ERR;
8904
8905 if (fp == NULL) {
8906 redisLog(REDIS_WARNING,"Fatal error: can't open the append log file for reading: %s",strerror(errno));
8907 exit(1);
8908 }
8909
8910 /* Temporarily disable AOF, to prevent EXEC from feeding a MULTI
8911 * to the same file we're about to read. */
8912 server.appendonly = 0;
8913
8914 fakeClient = createFakeClient();
8915 while(1) {
8916 int argc, j;
8917 unsigned long len;
8918 robj **argv;
8919 char buf[128];
8920 sds argsds;
8921 struct redisCommand *cmd;
8922 int force_swapout;
8923
8924 if (fgets(buf,sizeof(buf),fp) == NULL) {
8925 if (feof(fp))
8926 break;
8927 else
8928 goto readerr;
8929 }
8930 if (buf[0] != '*') goto fmterr;
8931 argc = atoi(buf+1);
8932 argv = zmalloc(sizeof(robj*)*argc);
8933 for (j = 0; j < argc; j++) {
8934 if (fgets(buf,sizeof(buf),fp) == NULL) goto readerr;
8935 if (buf[0] != '$') goto fmterr;
8936 len = strtol(buf+1,NULL,10);
8937 argsds = sdsnewlen(NULL,len);
8938 if (len && fread(argsds,len,1,fp) == 0) goto fmterr;
8939 argv[j] = createObject(REDIS_STRING,argsds);
8940 if (fread(buf,2,1,fp) == 0) goto fmterr; /* discard CRLF */
8941 }
8942
8943 /* Command lookup */
8944 cmd = lookupCommand(argv[0]->ptr);
8945 if (!cmd) {
8946 redisLog(REDIS_WARNING,"Unknown command '%s' reading the append only file", argv[0]->ptr);
8947 exit(1);
8948 }
8949 /* Try object encoding */
8950 if (cmd->flags & REDIS_CMD_BULK)
8951 argv[argc-1] = tryObjectEncoding(argv[argc-1]);
8952 /* Run the command in the context of a fake client */
8953 fakeClient->argc = argc;
8954 fakeClient->argv = argv;
8955 cmd->proc(fakeClient);
8956 /* Discard the reply objects list from the fake client */
8957 while(listLength(fakeClient->reply))
8958 listDelNode(fakeClient->reply,listFirst(fakeClient->reply));
8959 /* Clean up, ready for the next command */
8960 for (j = 0; j < argc; j++) decrRefCount(argv[j]);
8961 zfree(argv);
8962 /* Handle swapping while loading big datasets when VM is on */
8963 force_swapout = 0;
8964 if ((zmalloc_used_memory() - server.vm_max_memory) > 1024*1024*32)
8965 force_swapout = 1;
8966
8967 if (server.vm_enabled && force_swapout) {
8968 while (zmalloc_used_memory() > server.vm_max_memory) {
8969 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
8970 }
8971 }
8972 }
8973
8974 /* This point can only be reached when EOF is reached without errors.
8975 * If the client is in the middle of a MULTI/EXEC, log error and quit. */
8976 if (fakeClient->flags & REDIS_MULTI) goto readerr;
8977
8978 fclose(fp);
8979 freeFakeClient(fakeClient);
8980 server.appendonly = appendonly;
8981 return REDIS_OK;
8982
8983 readerr:
8984 if (feof(fp)) {
8985 redisLog(REDIS_WARNING,"Unexpected end of file reading the append only file");
8986 } else {
8987 redisLog(REDIS_WARNING,"Unrecoverable error reading the append only file: %s", strerror(errno));
8988 }
8989 exit(1);
8990 fmterr:
8991 redisLog(REDIS_WARNING,"Bad file format reading the append only file");
8992 exit(1);
8993 }
8994
/* Write the binary-safe string 's' of 'len' bytes into 'fp' using the bulk
 * format: $<count>\r\n<payload>\r\n
 * Returns 1 on success, 0 on write error. */
static int fwriteBulkString(FILE *fp, char *s, unsigned long len) {
    char hdr[128];
    int hlen;

    /* Build the "$<count>\r\n" header. */
    hdr[0] = '$';
    hlen = 1+ll2string(hdr+1,sizeof(hdr)-1,len);
    hdr[hlen] = '\r';
    hdr[hlen+1] = '\n';
    hlen += 2;

    if (fwrite(hdr,hlen,1,fp) == 0) return 0;
    /* An empty payload is not written at all. */
    if (len > 0 && fwrite(s,len,1,fp) == 0) return 0;
    return fwrite("\r\n",2,1,fp) != 0;
}
9009
/* Write the double 'd', formatted with "%.17g", into 'fp' using the bulk
 * format: $<count>\r\n<payload>\r\n
 * Returns 1 on success, 0 on write error. */
static int fwriteBulkDouble(FILE *fp, double d) {
    char payload[128], header[128];
    size_t plen;

    /* The payload already includes the trailing CRLF, which is not part
     * of the advertised count. */
    snprintf(payload,sizeof(payload),"%.17g\r\n",d);
    plen = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)plen-2);

    if (fwrite(header,strlen(header),1,fp) == 0) return 0;
    return fwrite(payload,plen,1,fp) != 0;
}
9020
/* Write the integer 'l' into 'fp' using the bulk format:
 * $<count>\r\n<payload>\r\n
 * Header and payload are assembled in memory and emitted with one fwrite.
 * Returns 1 on success, 0 on write error. */
static int fwriteBulkLongLong(FILE *fp, long long l) {
    char digits[128], out[128];
    unsigned int ndigits, outlen;

    ndigits = ll2string(digits,32,l);
    outlen = snprintf(out,sizeof(out),"$%u\r\n%s\r\n",ndigits,digits);
    return fwrite(out,outlen,1,fp) != 0;
}
9030
9031 /* Delegate writing an object to writing a bulk string or bulk long long. */
9032 static int fwriteBulkObject(FILE *fp, robj *obj) {
9033 /* Avoid using getDecodedObject to help copy-on-write (we are often
9034 * in a child process when this function is called). */
9035 if (obj->encoding == REDIS_ENCODING_INT) {
9036 return fwriteBulkLongLong(fp,(long)obj->ptr);
9037 } else if (obj->encoding == REDIS_ENCODING_RAW) {
9038 return fwriteBulkString(fp,obj->ptr,sdslen(obj->ptr));
9039 } else {
9040 redisPanic("Unknown string encoding");
9041 }
9042 }
9043
9044 /* Write a sequence of commands able to fully rebuild the dataset into
9045 * "filename". Used both by REWRITEAOF and BGREWRITEAOF. */
9046 static int rewriteAppendOnlyFile(char *filename) {
9047 dictIterator *di = NULL;
9048 dictEntry *de;
9049 FILE *fp;
9050 char tmpfile[256];
9051 int j;
9052 time_t now = time(NULL);
9053
9054 /* Note that we have to use a different temp name here compared to the
9055 * one used by rewriteAppendOnlyFileBackground() function. */
9056 snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());
9057 fp = fopen(tmpfile,"w");
9058 if (!fp) {
9059 redisLog(REDIS_WARNING, "Failed rewriting the append only file: %s", strerror(errno));
9060 return REDIS_ERR;
9061 }
9062 for (j = 0; j < server.dbnum; j++) {
9063 char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n";
9064 redisDb *db = server.db+j;
9065 dict *d = db->dict;
9066 if (dictSize(d) == 0) continue;
9067 di = dictGetIterator(d);
9068 if (!di) {
9069 fclose(fp);
9070 return REDIS_ERR;
9071 }
9072
9073 /* SELECT the new DB */
9074 if (fwrite(selectcmd,sizeof(selectcmd)-1,1,fp) == 0) goto werr;
9075 if (fwriteBulkLongLong(fp,j) == 0) goto werr;
9076
9077 /* Iterate this DB writing every entry */
9078 while((de = dictNext(di)) != NULL) {
9079 sds keystr = dictGetEntryKey(de);
9080 robj key, *o;
9081 time_t expiretime;
9082 int swapped;
9083
9084 keystr = dictGetEntryKey(de);
9085 o = dictGetEntryVal(de);
9086 initStaticStringObject(key,keystr);
9087 /* If the value for this key is swapped, load a preview in memory.
9088 * We use a "swapped" flag to remember if we need to free the
9089 * value object instead to just increment the ref count anyway
9090 * in order to avoid copy-on-write of pages if we are forked() */
9091 if (!server.vm_enabled || o->storage == REDIS_VM_MEMORY ||
9092 o->storage == REDIS_VM_SWAPPING) {
9093 swapped = 0;
9094 } else {
9095 o = vmPreviewObject(o);
9096 swapped = 1;
9097 }
9098 expiretime = getExpire(db,&key);
9099
9100 /* Save the key and associated value */
9101 if (o->type == REDIS_STRING) {
9102 /* Emit a SET command */
9103 char cmd[]="*3\r\n$3\r\nSET\r\n";
9104 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9105 /* Key and value */
9106 if (fwriteBulkObject(fp,&key) == 0) goto werr;
9107 if (fwriteBulkObject(fp,o) == 0) goto werr;
9108 } else if (o->type == REDIS_LIST) {
9109 /* Emit the RPUSHes needed to rebuild the list */
9110 char cmd[]="*3\r\n$5\r\nRPUSH\r\n";
9111 if (o->encoding == REDIS_ENCODING_ZIPLIST) {
9112 unsigned char *zl = o->ptr;
9113 unsigned char *p = ziplistIndex(zl,0);
9114 unsigned char *vstr;
9115 unsigned int vlen;
9116 long long vlong;
9117
9118 while(ziplistGet(p,&vstr,&vlen,&vlong)) {
9119 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9120 if (fwriteBulkObject(fp,&key) == 0) goto werr;
9121 if (vstr) {
9122 if (fwriteBulkString(fp,(char*)vstr,vlen) == 0)
9123 goto werr;
9124 } else {
9125 if (fwriteBulkLongLong(fp,vlong) == 0)
9126 goto werr;
9127 }
9128 p = ziplistNext(zl,p);
9129 }
9130 } else if (o->encoding == REDIS_ENCODING_LIST) {
9131 list *list = o->ptr;
9132 listNode *ln;
9133 listIter li;
9134
9135 listRewind(list,&li);
9136 while((ln = listNext(&li))) {
9137 robj *eleobj = listNodeValue(ln);
9138
9139 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9140 if (fwriteBulkObject(fp,&key) == 0) goto werr;
9141 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
9142 }
9143 } else {
9144 redisPanic("Unknown list encoding");
9145 }
9146 } else if (o->type == REDIS_SET) {
9147 /* Emit the SADDs needed to rebuild the set */
9148 dict *set = o->ptr;
9149 dictIterator *di = dictGetIterator(set);
9150 dictEntry *de;
9151
9152 while((de = dictNext(di)) != NULL) {
9153 char cmd[]="*3\r\n$4\r\nSADD\r\n";
9154 robj *eleobj = dictGetEntryKey(de);
9155
9156 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9157 if (fwriteBulkObject(fp,&key) == 0) goto werr;
9158 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
9159 }
9160 dictReleaseIterator(di);
9161 } else if (o->type == REDIS_ZSET) {
9162 /* Emit the ZADDs needed to rebuild the sorted set */
9163 zset *zs = o->ptr;
9164 dictIterator *di = dictGetIterator(zs->dict);
9165 dictEntry *de;
9166
9167 while((de = dictNext(di)) != NULL) {
9168 char cmd[]="*4\r\n$4\r\nZADD\r\n";
9169 robj *eleobj = dictGetEntryKey(de);
9170 double *score = dictGetEntryVal(de);
9171
9172 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9173 if (fwriteBulkObject(fp,&key) == 0) goto werr;
9174 if (fwriteBulkDouble(fp,*score) == 0) goto werr;
9175 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
9176 }
9177 dictReleaseIterator(di);
9178 } else if (o->type == REDIS_HASH) {
9179 char cmd[]="*4\r\n$4\r\nHSET\r\n";
9180
9181 /* Emit the HSETs needed to rebuild the hash */
9182 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
9183 unsigned char *p = zipmapRewind(o->ptr);
9184 unsigned char *field, *val;
9185 unsigned int flen, vlen;
9186
9187 while((p = zipmapNext(p,&field,&flen,&val,&vlen)) != NULL) {
9188 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9189 if (fwriteBulkObject(fp,&key) == 0) goto werr;
9190 if (fwriteBulkString(fp,(char*)field,flen) == -1)
9191 return -1;
9192 if (fwriteBulkString(fp,(char*)val,vlen) == -1)
9193 return -1;
9194 }
9195 } else {
9196 dictIterator *di = dictGetIterator(o->ptr);
9197 dictEntry *de;
9198
9199 while((de = dictNext(di)) != NULL) {
9200 robj *field = dictGetEntryKey(de);
9201 robj *val = dictGetEntryVal(de);
9202
9203 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9204 if (fwriteBulkObject(fp,&key) == 0) goto werr;
9205 if (fwriteBulkObject(fp,field) == -1) return -1;
9206 if (fwriteBulkObject(fp,val) == -1) return -1;
9207 }
9208 dictReleaseIterator(di);
9209 }
9210 } else {
9211 redisPanic("Unknown object type");
9212 }
9213 /* Save the expire time */
9214 if (expiretime != -1) {
9215 char cmd[]="*3\r\n$8\r\nEXPIREAT\r\n";
9216 /* If this key is already expired skip it */
9217 if (expiretime < now) continue;
9218 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9219 if (fwriteBulkObject(fp,&key) == 0) goto werr;
9220 if (fwriteBulkLongLong(fp,expiretime) == 0) goto werr;
9221 }
9222 if (swapped) decrRefCount(o);
9223 }
9224 dictReleaseIterator(di);
9225 }
9226
9227 /* Make sure data will not remain on the OS's output buffers */
9228 fflush(fp);
9229 aof_fsync(fileno(fp));
9230 fclose(fp);
9231
9232 /* Use RENAME to make sure the DB file is changed atomically only
9233 * if the generate DB file is ok. */
9234 if (rename(tmpfile,filename) == -1) {
9235 redisLog(REDIS_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno));
9236 unlink(tmpfile);
9237 return REDIS_ERR;
9238 }
9239 redisLog(REDIS_NOTICE,"SYNC append only file rewrite performed");
9240 return REDIS_OK;
9241
9242 werr:
9243 fclose(fp);
9244 unlink(tmpfile);
9245 redisLog(REDIS_WARNING,"Write error writing append only file on disk: %s", strerror(errno));
9246 if (di) dictReleaseIterator(di);
9247 return REDIS_ERR;
9248 }
9249
9250 /* This is how rewriting of the append only file in background works:
9251 *
9252 * 1) The user calls BGREWRITEAOF
9253 * 2) Redis calls this function, that forks():
9254 * 2a) the child rewrite the append only file in a temp file.
9255 * 2b) the parent accumulates differences in server.bgrewritebuf.
9256 * 3) When the child finished '2a' exists.
9257 * 4) The parent will trap the exit code, if it's OK, will append the
9258 * data accumulated into server.bgrewritebuf into the temp file, and
9259 * finally will rename(2) the temp file in the actual file name.
9260 * The the new file is reopened as the new append only file. Profit!
9261 */
9262 static int rewriteAppendOnlyFileBackground(void) {
9263 pid_t childpid;
9264
9265 if (server.bgrewritechildpid != -1) return REDIS_ERR;
9266 if (server.vm_enabled) waitEmptyIOJobsQueue();
9267 if ((childpid = fork()) == 0) {
9268 /* Child */
9269 char tmpfile[256];
9270
9271 if (server.vm_enabled) vmReopenSwapFile();
9272 close(server.fd);
9273 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
9274 if (rewriteAppendOnlyFile(tmpfile) == REDIS_OK) {
9275 _exit(0);
9276 } else {
9277 _exit(1);
9278 }
9279 } else {
9280 /* Parent */
9281 if (childpid == -1) {
9282 redisLog(REDIS_WARNING,
9283 "Can't rewrite append only file in background: fork: %s",
9284 strerror(errno));
9285 return REDIS_ERR;
9286 }
9287 redisLog(REDIS_NOTICE,
9288 "Background append only file rewriting started by pid %d",childpid);
9289 server.bgrewritechildpid = childpid;
9290 updateDictResizePolicy();
9291 /* We set appendseldb to -1 in order to force the next call to the
9292 * feedAppendOnlyFile() to issue a SELECT command, so the differences
9293 * accumulated by the parent into server.bgrewritebuf will start
9294 * with a SELECT statement and it will be safe to merge. */
9295 server.appendseldb = -1;
9296 return REDIS_OK;
9297 }
9298 return REDIS_OK; /* unreached */
9299 }
9300
9301 static void bgrewriteaofCommand(redisClient *c) {
9302 if (server.bgrewritechildpid != -1) {
9303 addReplySds(c,sdsnew("-ERR background append only file rewriting already in progress\r\n"));
9304 return;
9305 }
9306 if (rewriteAppendOnlyFileBackground() == REDIS_OK) {
9307 char *status = "+Background append only file rewriting started\r\n";
9308 addReplySds(c,sdsnew(status));
9309 } else {
9310 addReply(c,shared.err);
9311 }
9312 }
9313
/* Remove the temp file used by the background AOF rewriting child with
 * pid 'childpid' (same name built by rewriteAppendOnlyFileBackground()). */
static void aofRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-rewriteaof-bg-%d.aof",(int)childpid);
    unlink(path);
}
9320
9321 /* Virtual Memory is composed mainly of two subsystems:
9322 * - Blocking Virtual Memory
9323 * - Threaded Virtual Memory I/O
9324 * The two parts are not fully decoupled, but functions are split among two
9325 * different sections of the source code (delimited by comments) in order to
9326 * make more clear what functionality is about the blocking VM and what about
9327 * the threaded (not blocking) VM.
9328 *
9329 * Redis VM design:
9330 *
9331 * Redis VM is a blocking VM (one that blocks reading swapped values from
9332 * disk into memory when a value swapped out is needed in memory) that is made
9333 * unblocking by trying to examine the command argument vector in order to
9334 * load in background values that will likely be needed in order to exec
9335 * the command. The command is executed only once all the relevant keys
9336 * are loaded into memory.
9337 *
9338 * This is basically almost as simple as a blocking VM, but almost as parallel
9339 * as a fully non-blocking VM.
9340 */
9341
9342 /* =================== Virtual Memory - Blocking Side ====================== */
9343
9344 /* Create a VM pointer object. This kind of objects are used in place of
9345 * values in the key -> value hash table, for swapped out objects. */
9346 static vmpointer *createVmPointer(int vtype) {
9347 vmpointer *vp = zmalloc(sizeof(vmpointer));
9348
9349 vp->type = REDIS_VMPOINTER;
9350 vp->storage = REDIS_VM_SWAPPED;
9351 vp->vtype = vtype;
9352 return vp;
9353 }
9354
9355 static void vmInit(void) {
9356 off_t totsize;
9357 int pipefds[2];
9358 size_t stacksize;
9359 struct flock fl;
9360
9361 if (server.vm_max_threads != 0)
9362 zmalloc_enable_thread_safeness(); /* we need thread safe zmalloc() */
9363
9364 redisLog(REDIS_NOTICE,"Using '%s' as swap file",server.vm_swap_file);
9365 /* Try to open the old swap file, otherwise create it */
9366 if ((server.vm_fp = fopen(server.vm_swap_file,"r+b")) == NULL) {
9367 server.vm_fp = fopen(server.vm_swap_file,"w+b");
9368 }
9369 if (server.vm_fp == NULL) {
9370 redisLog(REDIS_WARNING,
9371 "Can't open the swap file: %s. Exiting.",
9372 strerror(errno));
9373 exit(1);
9374 }
9375 server.vm_fd = fileno(server.vm_fp);
9376 /* Lock the swap file for writing, this is useful in order to avoid
9377 * another instance to use the same swap file for a config error. */
9378 fl.l_type = F_WRLCK;
9379 fl.l_whence = SEEK_SET;
9380 fl.l_start = fl.l_len = 0;
9381 if (fcntl(server.vm_fd,F_SETLK,&fl) == -1) {
9382 redisLog(REDIS_WARNING,
9383 "Can't lock the swap file at '%s': %s. Make sure it is not used by another Redis instance.", server.vm_swap_file, strerror(errno));
9384 exit(1);
9385 }
9386 /* Initialize */
9387 server.vm_next_page = 0;
9388 server.vm_near_pages = 0;
9389 server.vm_stats_used_pages = 0;
9390 server.vm_stats_swapped_objects = 0;
9391 server.vm_stats_swapouts = 0;
9392 server.vm_stats_swapins = 0;
9393 totsize = server.vm_pages*server.vm_page_size;
9394 redisLog(REDIS_NOTICE,"Allocating %lld bytes of swap file",totsize);
9395 if (ftruncate(server.vm_fd,totsize) == -1) {
9396 redisLog(REDIS_WARNING,"Can't ftruncate swap file: %s. Exiting.",
9397 strerror(errno));
9398 exit(1);
9399 } else {
9400 redisLog(REDIS_NOTICE,"Swap file allocated with success");
9401 }
9402 server.vm_bitmap = zmalloc((server.vm_pages+7)/8);
9403 redisLog(REDIS_VERBOSE,"Allocated %lld bytes page table for %lld pages",
9404 (long long) (server.vm_pages+7)/8, server.vm_pages);
9405 memset(server.vm_bitmap,0,(server.vm_pages+7)/8);
9406
9407 /* Initialize threaded I/O (used by Virtual Memory) */
9408 server.io_newjobs = listCreate();
9409 server.io_processing = listCreate();
9410 server.io_processed = listCreate();
9411 server.io_ready_clients = listCreate();
9412 pthread_mutex_init(&server.io_mutex,NULL);
9413 pthread_mutex_init(&server.obj_freelist_mutex,NULL);
9414 pthread_mutex_init(&server.io_swapfile_mutex,NULL);
9415 server.io_active_threads = 0;
9416 if (pipe(pipefds) == -1) {
9417 redisLog(REDIS_WARNING,"Unable to intialized VM: pipe(2): %s. Exiting."
9418 ,strerror(errno));
9419 exit(1);
9420 }
9421 server.io_ready_pipe_read = pipefds[0];
9422 server.io_ready_pipe_write = pipefds[1];
9423 redisAssert(anetNonBlock(NULL,server.io_ready_pipe_read) != ANET_ERR);
9424 /* LZF requires a lot of stack */
9425 pthread_attr_init(&server.io_threads_attr);
9426 pthread_attr_getstacksize(&server.io_threads_attr, &stacksize);
9427 while (stacksize < REDIS_THREAD_STACK_SIZE) stacksize *= 2;
9428 pthread_attr_setstacksize(&server.io_threads_attr, stacksize);
9429 /* Listen for events in the threaded I/O pipe */
9430 if (aeCreateFileEvent(server.el, server.io_ready_pipe_read, AE_READABLE,
9431 vmThreadedIOCompletedJob, NULL) == AE_ERR)
9432 oom("creating file event");
9433 }
9434
9435 /* Mark the page as used */
9436 static void vmMarkPageUsed(off_t page) {
9437 off_t byte = page/8;
9438 int bit = page&7;
9439 redisAssert(vmFreePage(page) == 1);
9440 server.vm_bitmap[byte] |= 1<<bit;
9441 }
9442
9443 /* Mark N contiguous pages as used, with 'page' being the first. */
9444 static void vmMarkPagesUsed(off_t page, off_t count) {
9445 off_t j;
9446
9447 for (j = 0; j < count; j++)
9448 vmMarkPageUsed(page+j);
9449 server.vm_stats_used_pages += count;
9450 redisLog(REDIS_DEBUG,"Mark USED pages: %lld pages at %lld\n",
9451 (long long)count, (long long)page);
9452 }
9453
9454 /* Mark the page as free */
9455 static void vmMarkPageFree(off_t page) {
9456 off_t byte = page/8;
9457 int bit = page&7;
9458 redisAssert(vmFreePage(page) == 0);
9459 server.vm_bitmap[byte] &= ~(1<<bit);
9460 }
9461
9462 /* Mark N contiguous pages as free, with 'page' being the first. */
9463 static void vmMarkPagesFree(off_t page, off_t count) {
9464 off_t j;
9465
9466 for (j = 0; j < count; j++)
9467 vmMarkPageFree(page+j);
9468 server.vm_stats_used_pages -= count;
9469 redisLog(REDIS_DEBUG,"Mark FREE pages: %lld pages at %lld\n",
9470 (long long)count, (long long)page);
9471 }
9472
9473 /* Test if the page is free */
9474 static int vmFreePage(off_t page) {
9475 off_t byte = page/8;
9476 int bit = page&7;
9477 return (server.vm_bitmap[byte] & (1<<bit)) == 0;
9478 }
9479
9480 /* Find N contiguous free pages storing the first page of the cluster in *first.
9481 * Returns REDIS_OK if it was able to find N contiguous pages, otherwise
9482 * REDIS_ERR is returned.
9483 *
9484 * This function uses a simple algorithm: we try to allocate
9485 * REDIS_VM_MAX_NEAR_PAGES sequentially, when we reach this limit we start
9486 * again from the start of the swap file searching for free spaces.
9487 *
9488 * If it looks pretty clear that there are no free pages near our offset
9489 * we try to find less populated places doing a forward jump of
9490 * REDIS_VM_MAX_RANDOM_JUMP, then we start scanning again a few pages
9491 * without hurry, and then we jump again and so forth...
9492 *
9493 * This function can be improved using a free list to avoid to guess
9494 * too much, since we could collect data about freed pages.
9495 *
9496 * note: I implemented this function just after watching an episode of
9497 * Battlestar Galactica, where the hybrid was continuing to say "JUMP!"
9498 */
9499 static int vmFindContiguousPages(off_t *first, off_t n) {
9500 off_t base, offset = 0, since_jump = 0, numfree = 0;
9501
9502 if (server.vm_near_pages == REDIS_VM_MAX_NEAR_PAGES) {
9503 server.vm_near_pages = 0;
9504 server.vm_next_page = 0;
9505 }
9506 server.vm_near_pages++; /* Yet another try for pages near to the old ones */
9507 base = server.vm_next_page;
9508
9509 while(offset < server.vm_pages) {
9510 off_t this = base+offset;
9511
9512 /* If we overflow, restart from page zero */
9513 if (this >= server.vm_pages) {
9514 this -= server.vm_pages;
9515 if (this == 0) {
9516 /* Just overflowed, what we found on tail is no longer
9517 * interesting, as it's no longer contiguous. */
9518 numfree = 0;
9519 }
9520 }
9521 if (vmFreePage(this)) {
9522 /* This is a free page */
9523 numfree++;
9524 /* Already got N free pages? Return to the caller, with success */
9525 if (numfree == n) {
9526 *first = this-(n-1);
9527 server.vm_next_page = this+1;
9528 redisLog(REDIS_DEBUG, "FOUND CONTIGUOUS PAGES: %lld pages at %lld\n", (long long) n, (long long) *first);
9529 return REDIS_OK;
9530 }
9531 } else {
9532 /* The current one is not a free page */
9533 numfree = 0;
9534 }
9535
9536 /* Fast-forward if the current page is not free and we already
9537 * searched enough near this place. */
9538 since_jump++;
9539 if (!numfree && since_jump >= REDIS_VM_MAX_RANDOM_JUMP/4) {
9540 offset += random() % REDIS_VM_MAX_RANDOM_JUMP;
9541 since_jump = 0;
9542 /* Note that even if we rewind after the jump, we are don't need
9543 * to make sure numfree is set to zero as we only jump *if* it
9544 * is set to zero. */
9545 } else {
9546 /* Otherwise just check the next page */
9547 offset++;
9548 }
9549 }
9550 return REDIS_ERR;
9551 }
9552
9553 /* Write the specified object at the specified page of the swap file */
9554 static int vmWriteObjectOnSwap(robj *o, off_t page) {
9555 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
9556 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
9557 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
9558 redisLog(REDIS_WARNING,
9559 "Critical VM problem in vmWriteObjectOnSwap(): can't seek: %s",
9560 strerror(errno));
9561 return REDIS_ERR;
9562 }
9563 rdbSaveObject(server.vm_fp,o);
9564 fflush(server.vm_fp);
9565 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
9566 return REDIS_OK;
9567 }
9568
9569 /* Transfers the 'val' object to disk. Store all the information
9570 * a 'vmpointer' object containing all the information needed to load the
9571 * object back later is returned.
9572 *
9573 * If we can't find enough contiguous empty pages to swap the object on disk
9574 * NULL is returned. */
9575 static vmpointer *vmSwapObjectBlocking(robj *val) {
9576 off_t pages = rdbSavedObjectPages(val,NULL);
9577 off_t page;
9578 vmpointer *vp;
9579
9580 assert(val->storage == REDIS_VM_MEMORY);
9581 assert(val->refcount == 1);
9582 if (vmFindContiguousPages(&page,pages) == REDIS_ERR) return NULL;
9583 if (vmWriteObjectOnSwap(val,page) == REDIS_ERR) return NULL;
9584
9585 vp = createVmPointer(val->type);
9586 vp->page = page;
9587 vp->usedpages = pages;
9588 decrRefCount(val); /* Deallocate the object from memory. */
9589 vmMarkPagesUsed(page,pages);
9590 redisLog(REDIS_DEBUG,"VM: object %p swapped out at %lld (%lld pages)",
9591 (void*) val,
9592 (unsigned long long) page, (unsigned long long) pages);
9593 server.vm_stats_swapped_objects++;
9594 server.vm_stats_swapouts++;
9595 return vp;
9596 }
9597
9598 static robj *vmReadObjectFromSwap(off_t page, int type) {
9599 robj *o;
9600
9601 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
9602 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
9603 redisLog(REDIS_WARNING,
9604 "Unrecoverable VM problem in vmReadObjectFromSwap(): can't seek: %s",
9605 strerror(errno));
9606 _exit(1);
9607 }
9608 o = rdbLoadObject(type,server.vm_fp);
9609 if (o == NULL) {
9610 redisLog(REDIS_WARNING, "Unrecoverable VM problem in vmReadObjectFromSwap(): can't load object from swap file: %s", strerror(errno));
9611 _exit(1);
9612 }
9613 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
9614 return o;
9615 }
9616
9617 /* Load the specified object from swap to memory.
9618 * The newly allocated object is returned.
9619 *
9620 * If preview is true the unserialized object is returned to the caller but
9621 * the pages are not marked as freed, nor the vp object is freed. */
9622 static robj *vmGenericLoadObject(vmpointer *vp, int preview) {
9623 robj *val;
9624
9625 redisAssert(vp->type == REDIS_VMPOINTER &&
9626 (vp->storage == REDIS_VM_SWAPPED || vp->storage == REDIS_VM_LOADING));
9627 val = vmReadObjectFromSwap(vp->page,vp->vtype);
9628 if (!preview) {
9629 redisLog(REDIS_DEBUG, "VM: object %p loaded from disk", (void*)vp);
9630 vmMarkPagesFree(vp->page,vp->usedpages);
9631 zfree(vp);
9632 server.vm_stats_swapped_objects--;
9633 } else {
9634 redisLog(REDIS_DEBUG, "VM: object %p previewed from disk", (void*)vp);
9635 }
9636 server.vm_stats_swapins++;
9637 return val;
9638 }
9639
9640 /* Plain object loading, from swap to memory.
9641 *
9642 * 'o' is actually a redisVmPointer structure that will be freed by the call.
9643 * The return value is the loaded object. */
9644 static robj *vmLoadObject(robj *o) {
9645 /* If we are loading the object in background, stop it, we
9646 * need to load this object synchronously ASAP. */
9647 if (o->storage == REDIS_VM_LOADING)
9648 vmCancelThreadedIOJob(o);
9649 return vmGenericLoadObject((vmpointer*)o,0);
9650 }
9651
9652 /* Just load the value on disk, without to modify the key.
9653 * This is useful when we want to perform some operation on the value
9654 * without to really bring it from swap to memory, like while saving the
9655 * dataset or rewriting the append only log. */
9656 static robj *vmPreviewObject(robj *o) {
9657 return vmGenericLoadObject((vmpointer*)o,1);
9658 }
9659
9660 /* How a good candidate is this object for swapping?
9661 * The better candidate it is, the greater the returned value.
9662 *
9663 * Currently we try to perform a fast estimation of the object size in
9664 * memory, and combine it with aging informations.
9665 *
9666 * Basically swappability = idle-time * log(estimated size)
9667 *
9668 * Bigger objects are preferred over smaller objects, but not
9669 * proportionally, this is why we use the logarithm. This algorithm is
9670 * just a first try and will probably be tuned later. */
9671 static double computeObjectSwappability(robj *o) {
9672 /* actual age can be >= minage, but not < minage. As we use wrapping
9673 * 21 bit clocks with minutes resolution for the LRU. */
9674 time_t minage = abs(server.lruclock - o->lru);
9675 long asize = 0, elesize;
9676 robj *ele;
9677 list *l;
9678 listNode *ln;
9679 dict *d;
9680 struct dictEntry *de;
9681 int z;
9682
9683 if (minage <= 0) return 0;
9684 switch(o->type) {
9685 case REDIS_STRING:
9686 if (o->encoding != REDIS_ENCODING_RAW) {
9687 asize = sizeof(*o);
9688 } else {
9689 asize = sdslen(o->ptr)+sizeof(*o)+sizeof(long)*2;
9690 }
9691 break;
9692 case REDIS_LIST:
9693 if (o->encoding == REDIS_ENCODING_ZIPLIST) {
9694 asize = sizeof(*o)+ziplistSize(o->ptr);
9695 } else {
9696 l = o->ptr;
9697 ln = listFirst(l);
9698 asize = sizeof(list);
9699 if (ln) {
9700 ele = ln->value;
9701 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9702 (sizeof(*o)+sdslen(ele->ptr)) : sizeof(*o);
9703 asize += (sizeof(listNode)+elesize)*listLength(l);
9704 }
9705 }
9706 break;
9707 case REDIS_SET:
9708 case REDIS_ZSET:
9709 z = (o->type == REDIS_ZSET);
9710 d = z ? ((zset*)o->ptr)->dict : o->ptr;
9711
9712 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
9713 if (z) asize += sizeof(zset)-sizeof(dict);
9714 if (dictSize(d)) {
9715 de = dictGetRandomKey(d);
9716 ele = dictGetEntryKey(de);
9717 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9718 (sizeof(*o)+sdslen(ele->ptr)) : sizeof(*o);
9719 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
9720 if (z) asize += sizeof(zskiplistNode)*dictSize(d);
9721 }
9722 break;
9723 case REDIS_HASH:
9724 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
9725 unsigned char *p = zipmapRewind((unsigned char*)o->ptr);
9726 unsigned int len = zipmapLen((unsigned char*)o->ptr);
9727 unsigned int klen, vlen;
9728 unsigned char *key, *val;
9729
9730 if ((p = zipmapNext(p,&key,&klen,&val,&vlen)) == NULL) {
9731 klen = 0;
9732 vlen = 0;
9733 }
9734 asize = len*(klen+vlen+3);
9735 } else if (o->encoding == REDIS_ENCODING_HT) {
9736 d = o->ptr;
9737 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
9738 if (dictSize(d)) {
9739 de = dictGetRandomKey(d);
9740 ele = dictGetEntryKey(de);
9741 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9742 (sizeof(*o)+sdslen(ele->ptr)) : sizeof(*o);
9743 ele = dictGetEntryVal(de);
9744 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9745 (sizeof(*o)+sdslen(ele->ptr)) : sizeof(*o);
9746 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
9747 }
9748 }
9749 break;
9750 }
9751 return (double)minage*log(1+asize);
9752 }
9753
9754 /* Try to swap an object that's a good candidate for swapping.
9755 * Returns REDIS_OK if the object was swapped, REDIS_ERR if it's not possible
9756 * to swap any object at all.
9757 *
9758 * If 'usethreaded' is true, Redis will try to swap the object in background
9759 * using I/O threads. */
9760 static int vmSwapOneObject(int usethreads) {
9761 int j, i;
9762 struct dictEntry *best = NULL;
9763 double best_swappability = 0;
9764 redisDb *best_db = NULL;
9765 robj *val;
9766 sds key;
9767
9768 for (j = 0; j < server.dbnum; j++) {
9769 redisDb *db = server.db+j;
9770 /* Why maxtries is set to 100?
9771 * Because this way (usually) we'll find 1 object even if just 1% - 2%
9772 * are swappable objects */
9773 int maxtries = 100;
9774
9775 if (dictSize(db->dict) == 0) continue;
9776 for (i = 0; i < 5; i++) {
9777 dictEntry *de;
9778 double swappability;
9779
9780 if (maxtries) maxtries--;
9781 de = dictGetRandomKey(db->dict);
9782 val = dictGetEntryVal(de);
9783 /* Only swap objects that are currently in memory.
9784 *
9785 * Also don't swap shared objects: not a good idea in general and
9786 * we need to ensure that the main thread does not touch the
9787 * object while the I/O thread is using it, but we can't
9788 * control other keys without adding additional mutex. */
9789 if (val->storage != REDIS_VM_MEMORY || val->refcount != 1) {
9790 if (maxtries) i--; /* don't count this try */
9791 continue;
9792 }
9793 swappability = computeObjectSwappability(val);
9794 if (!best || swappability > best_swappability) {
9795 best = de;
9796 best_swappability = swappability;
9797 best_db = db;
9798 }
9799 }
9800 }
9801 if (best == NULL) return REDIS_ERR;
9802 key = dictGetEntryKey(best);
9803 val = dictGetEntryVal(best);
9804
9805 redisLog(REDIS_DEBUG,"Key with best swappability: %s, %f",
9806 key, best_swappability);
9807
9808 /* Swap it */
9809 if (usethreads) {
9810 robj *keyobj = createStringObject(key,sdslen(key));
9811 vmSwapObjectThreaded(keyobj,val,best_db);
9812 decrRefCount(keyobj);
9813 return REDIS_OK;
9814 } else {
9815 vmpointer *vp;
9816
9817 if ((vp = vmSwapObjectBlocking(val)) != NULL) {
9818 dictGetEntryVal(best) = vp;
9819 return REDIS_OK;
9820 } else {
9821 return REDIS_ERR;
9822 }
9823 }
9824 }
9825
/* Swap out one object synchronously.
 * Returns the value returned by vmSwapOneObject() called with
 * 'usethreads' set to 0. */
static int vmSwapOneObjectBlocking(void) {
    return vmSwapOneObject(0);
}
9829
/* Swap out one object using the threaded I/O.
 * Returns the value returned by vmSwapOneObject() called with
 * 'usethreads' set to 1. */
static int vmSwapOneObjectThreaded(void) {
    return vmSwapOneObject(1);
}
9833
9834 /* Return true if it's safe to swap out objects in a given moment.
9835 * Basically we don't want to swap objects out while there is a BGSAVE
9836 * or a BGAEOREWRITE running in backgroud. */
9837 static int vmCanSwapOut(void) {
9838 return (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1);
9839 }
9840
9841 /* =================== Virtual Memory - Threaded I/O ======================= */
9842
9843 static void freeIOJob(iojob *j) {
9844 if ((j->type == REDIS_IOJOB_PREPARE_SWAP ||
9845 j->type == REDIS_IOJOB_DO_SWAP ||
9846 j->type == REDIS_IOJOB_LOAD) && j->val != NULL)
9847 {
9848 /* we fix the storage type, otherwise decrRefCount() will try to
9849 * kill the I/O thread Job (that does no longer exists). */
9850 if (j->val->storage == REDIS_VM_SWAPPING)
9851 j->val->storage = REDIS_VM_MEMORY;
9852 decrRefCount(j->val);
9853 }
9854 decrRefCount(j->key);
9855 zfree(j);
9856 }
9857
9858 /* Every time a thread finished a Job, it writes a byte into the write side
9859 * of an unix pipe in order to "awake" the main thread, and this function
9860 * is called. */
9861 static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata,
9862 int mask)
9863 {
9864 char buf[1];
9865 int retval, processed = 0, toprocess = -1, trytoswap = 1;
9866 REDIS_NOTUSED(el);
9867 REDIS_NOTUSED(mask);
9868 REDIS_NOTUSED(privdata);
9869
9870 /* For every byte we read in the read side of the pipe, there is one
9871 * I/O job completed to process. */
9872 while((retval = read(fd,buf,1)) == 1) {
9873 iojob *j;
9874 listNode *ln;
9875 struct dictEntry *de;
9876
9877 redisLog(REDIS_DEBUG,"Processing I/O completed job");
9878
9879 /* Get the processed element (the oldest one) */
9880 lockThreadedIO();
9881 assert(listLength(server.io_processed) != 0);
9882 if (toprocess == -1) {
9883 toprocess = (listLength(server.io_processed)*REDIS_MAX_COMPLETED_JOBS_PROCESSED)/100;
9884 if (toprocess <= 0) toprocess = 1;
9885 }
9886 ln = listFirst(server.io_processed);
9887 j = ln->value;
9888 listDelNode(server.io_processed,ln);
9889 unlockThreadedIO();
9890 /* If this job is marked as canceled, just ignore it */
9891 if (j->canceled) {
9892 freeIOJob(j);
9893 continue;
9894 }
9895 /* Post process it in the main thread, as there are things we
9896 * can do just here to avoid race conditions and/or invasive locks */
9897 redisLog(REDIS_DEBUG,"COMPLETED Job type: %d, ID %p, key: %s", j->type, (void*)j->id, (unsigned char*)j->key->ptr);
9898 de = dictFind(j->db->dict,j->key->ptr);
9899 redisAssert(de != NULL);
9900 if (j->type == REDIS_IOJOB_LOAD) {
9901 redisDb *db;
9902 vmpointer *vp = dictGetEntryVal(de);
9903
9904 /* Key loaded, bring it at home */
9905 vmMarkPagesFree(vp->page,vp->usedpages);
9906 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk (threaded)",
9907 (unsigned char*) j->key->ptr);
9908 server.vm_stats_swapped_objects--;
9909 server.vm_stats_swapins++;
9910 dictGetEntryVal(de) = j->val;
9911 incrRefCount(j->val);
9912 db = j->db;
9913 /* Handle clients waiting for this key to be loaded. */
9914 handleClientsBlockedOnSwappedKey(db,j->key);
9915 freeIOJob(j);
9916 zfree(vp);
9917 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
9918 /* Now we know the amount of pages required to swap this object.
9919 * Let's find some space for it, and queue this task again
9920 * rebranded as REDIS_IOJOB_DO_SWAP. */
9921 if (!vmCanSwapOut() ||
9922 vmFindContiguousPages(&j->page,j->pages) == REDIS_ERR)
9923 {
9924 /* Ooops... no space or we can't swap as there is
9925 * a fork()ed Redis trying to save stuff on disk. */
9926 j->val->storage = REDIS_VM_MEMORY; /* undo operation */
9927 freeIOJob(j);
9928 } else {
9929 /* Note that we need to mark this pages as used now,
9930 * if the job will be canceled, we'll mark them as freed
9931 * again. */
9932 vmMarkPagesUsed(j->page,j->pages);
9933 j->type = REDIS_IOJOB_DO_SWAP;
9934 lockThreadedIO();
9935 queueIOJob(j);
9936 unlockThreadedIO();
9937 }
9938 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
9939 vmpointer *vp;
9940
9941 /* Key swapped. We can finally free some memory. */
9942 if (j->val->storage != REDIS_VM_SWAPPING) {
9943 vmpointer *vp = (vmpointer*) j->id;
9944 printf("storage: %d\n",vp->storage);
9945 printf("key->name: %s\n",(char*)j->key->ptr);
9946 printf("val: %p\n",(void*)j->val);
9947 printf("val->type: %d\n",j->val->type);
9948 printf("val->ptr: %s\n",(char*)j->val->ptr);
9949 }
9950 redisAssert(j->val->storage == REDIS_VM_SWAPPING);
9951 vp = createVmPointer(j->val->type);
9952 vp->page = j->page;
9953 vp->usedpages = j->pages;
9954 dictGetEntryVal(de) = vp;
9955 /* Fix the storage otherwise decrRefCount will attempt to
9956 * remove the associated I/O job */
9957 j->val->storage = REDIS_VM_MEMORY;
9958 decrRefCount(j->val);
9959 redisLog(REDIS_DEBUG,
9960 "VM: object %s swapped out at %lld (%lld pages) (threaded)",
9961 (unsigned char*) j->key->ptr,
9962 (unsigned long long) j->page, (unsigned long long) j->pages);
9963 server.vm_stats_swapped_objects++;
9964 server.vm_stats_swapouts++;
9965 freeIOJob(j);
9966 /* Put a few more swap requests in queue if we are still
9967 * out of memory */
9968 if (trytoswap && vmCanSwapOut() &&
9969 zmalloc_used_memory() > server.vm_max_memory)
9970 {
9971 int more = 1;
9972 while(more) {
9973 lockThreadedIO();
9974 more = listLength(server.io_newjobs) <
9975 (unsigned) server.vm_max_threads;
9976 unlockThreadedIO();
9977 /* Don't waste CPU time if swappable objects are rare. */
9978 if (vmSwapOneObjectThreaded() == REDIS_ERR) {
9979 trytoswap = 0;
9980 break;
9981 }
9982 }
9983 }
9984 }
9985 processed++;
9986 if (processed == toprocess) return;
9987 }
9988 if (retval < 0 && errno != EAGAIN) {
9989 redisLog(REDIS_WARNING,
9990 "WARNING: read(2) error in vmThreadedIOCompletedJob() %s",
9991 strerror(errno));
9992 }
9993 }
9994
9995 static void lockThreadedIO(void) {
9996 pthread_mutex_lock(&server.io_mutex);
9997 }
9998
9999 static void unlockThreadedIO(void) {
10000 pthread_mutex_unlock(&server.io_mutex);
10001 }
10002
10003 /* Remove the specified object from the threaded I/O queue if still not
10004 * processed, otherwise make sure to flag it as canceled. */
10005 static void vmCancelThreadedIOJob(robj *o) {
10006 list *lists[3] = {
10007 server.io_newjobs, /* 0 */
10008 server.io_processing, /* 1 */
10009 server.io_processed /* 2 */
10010 };
10011 int i;
10012
10013 assert(o->storage == REDIS_VM_LOADING || o->storage == REDIS_VM_SWAPPING);
10014 again:
10015 lockThreadedIO();
10016 /* Search for a matching object in one of the queues */
10017 for (i = 0; i < 3; i++) {
10018 listNode *ln;
10019 listIter li;
10020
10021 listRewind(lists[i],&li);
10022 while ((ln = listNext(&li)) != NULL) {
10023 iojob *job = ln->value;
10024
10025 if (job->canceled) continue; /* Skip this, already canceled. */
10026 if (job->id == o) {
10027 redisLog(REDIS_DEBUG,"*** CANCELED %p (key %s) (type %d) (LIST ID %d)\n",
10028 (void*)job, (char*)job->key->ptr, job->type, i);
10029 /* Mark the pages as free since the swap didn't happened
10030 * or happened but is now discarded. */
10031 if (i != 1 && job->type == REDIS_IOJOB_DO_SWAP)
10032 vmMarkPagesFree(job->page,job->pages);
10033 /* Cancel the job. It depends on the list the job is
10034 * living in. */
10035 switch(i) {
10036 case 0: /* io_newjobs */
10037 /* If the job was yet not processed the best thing to do
10038 * is to remove it from the queue at all */
10039 freeIOJob(job);
10040 listDelNode(lists[i],ln);
10041 break;
10042 case 1: /* io_processing */
10043 /* Oh Shi- the thread is messing with the Job:
10044 *
10045 * Probably it's accessing the object if this is a
10046 * PREPARE_SWAP or DO_SWAP job.
10047 * If it's a LOAD job it may be reading from disk and
10048 * if we don't wait for the job to terminate before to
10049 * cancel it, maybe in a few microseconds data can be
10050 * corrupted in this pages. So the short story is:
10051 *
10052 * Better to wait for the job to move into the
10053 * next queue (processed)... */
10054
10055 /* We try again and again until the job is completed. */
10056 unlockThreadedIO();
10057 /* But let's wait some time for the I/O thread
10058 * to finish with this job. After all this condition
10059 * should be very rare. */
10060 usleep(1);
10061 goto again;
10062 case 2: /* io_processed */
10063 /* The job was already processed, that's easy...
10064 * just mark it as canceled so that we'll ignore it
10065 * when processing completed jobs. */
10066 job->canceled = 1;
10067 break;
10068 }
10069 /* Finally we have to adjust the storage type of the object
10070 * in order to "UNDO" the operaiton. */
10071 if (o->storage == REDIS_VM_LOADING)
10072 o->storage = REDIS_VM_SWAPPED;
10073 else if (o->storage == REDIS_VM_SWAPPING)
10074 o->storage = REDIS_VM_MEMORY;
10075 unlockThreadedIO();
10076 redisLog(REDIS_DEBUG,"*** DONE");
10077 return;
10078 }
10079 }
10080 }
10081 unlockThreadedIO();
10082 printf("Not found: %p\n", (void*)o);
10083 redisAssert(1 != 1); /* We should never reach this */
10084 }
10085
10086 static void *IOThreadEntryPoint(void *arg) {
10087 iojob *j;
10088 listNode *ln;
10089 REDIS_NOTUSED(arg);
10090
10091 pthread_detach(pthread_self());
10092 while(1) {
10093 /* Get a new job to process */
10094 lockThreadedIO();
10095 if (listLength(server.io_newjobs) == 0) {
10096 /* No new jobs in queue, exit. */
10097 redisLog(REDIS_DEBUG,"Thread %ld exiting, nothing to do",
10098 (long) pthread_self());
10099 server.io_active_threads--;
10100 unlockThreadedIO();
10101 return NULL;
10102 }
10103 ln = listFirst(server.io_newjobs);
10104 j = ln->value;
10105 listDelNode(server.io_newjobs,ln);
10106 /* Add the job in the processing queue */
10107 j->thread = pthread_self();
10108 listAddNodeTail(server.io_processing,j);
10109 ln = listLast(server.io_processing); /* We use ln later to remove it */
10110 unlockThreadedIO();
10111 redisLog(REDIS_DEBUG,"Thread %ld got a new job (type %d): %p about key '%s'",
10112 (long) pthread_self(), j->type, (void*)j, (char*)j->key->ptr);
10113
10114 /* Process the Job */
10115 if (j->type == REDIS_IOJOB_LOAD) {
10116 vmpointer *vp = (vmpointer*)j->id;
10117 j->val = vmReadObjectFromSwap(j->page,vp->vtype);
10118 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
10119 FILE *fp = fopen("/dev/null","w+");
10120 j->pages = rdbSavedObjectPages(j->val,fp);
10121 fclose(fp);
10122 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
10123 if (vmWriteObjectOnSwap(j->val,j->page) == REDIS_ERR)
10124 j->canceled = 1;
10125 }
10126
10127 /* Done: insert the job into the processed queue */
10128 redisLog(REDIS_DEBUG,"Thread %ld completed the job: %p (key %s)",
10129 (long) pthread_self(), (void*)j, (char*)j->key->ptr);
10130 lockThreadedIO();
10131 listDelNode(server.io_processing,ln);
10132 listAddNodeTail(server.io_processed,j);
10133 unlockThreadedIO();
10134
10135 /* Signal the main thread there is new stuff to process */
10136 assert(write(server.io_ready_pipe_write,"x",1) == 1);
10137 }
10138 return NULL; /* never reached */
10139 }
10140
10141 static void spawnIOThread(void) {
10142 pthread_t thread;
10143 sigset_t mask, omask;
10144 int err;
10145
10146 sigemptyset(&mask);
10147 sigaddset(&mask,SIGCHLD);
10148 sigaddset(&mask,SIGHUP);
10149 sigaddset(&mask,SIGPIPE);
10150 pthread_sigmask(SIG_SETMASK, &mask, &omask);
10151 while ((err = pthread_create(&thread,&server.io_threads_attr,IOThreadEntryPoint,NULL)) != 0) {
10152 redisLog(REDIS_WARNING,"Unable to spawn an I/O thread: %s",
10153 strerror(err));
10154 usleep(1000000);
10155 }
10156 pthread_sigmask(SIG_SETMASK, &omask, NULL);
10157 server.io_active_threads++;
10158 }
10159
10160 /* We need to wait for the last thread to exit before we are able to
10161 * fork() in order to BGSAVE or BGREWRITEAOF. */
10162 static void waitEmptyIOJobsQueue(void) {
10163 while(1) {
10164 int io_processed_len;
10165
10166 lockThreadedIO();
10167 if (listLength(server.io_newjobs) == 0 &&
10168 listLength(server.io_processing) == 0 &&
10169 server.io_active_threads == 0)
10170 {
10171 unlockThreadedIO();
10172 return;
10173 }
10174 /* While waiting for empty jobs queue condition we post-process some
10175 * finshed job, as I/O threads may be hanging trying to write against
10176 * the io_ready_pipe_write FD but there are so much pending jobs that
10177 * it's blocking. */
10178 io_processed_len = listLength(server.io_processed);
10179 unlockThreadedIO();
10180 if (io_processed_len) {
10181 vmThreadedIOCompletedJob(NULL,server.io_ready_pipe_read,NULL,0);
10182 usleep(1000); /* 1 millisecond */
10183 } else {
10184 usleep(10000); /* 10 milliseconds */
10185 }
10186 }
10187 }
10188
10189 static void vmReopenSwapFile(void) {
10190 /* Note: we don't close the old one as we are in the child process
10191 * and don't want to mess at all with the original file object. */
10192 server.vm_fp = fopen(server.vm_swap_file,"r+b");
10193 if (server.vm_fp == NULL) {
10194 redisLog(REDIS_WARNING,"Can't re-open the VM swap file: %s. Exiting.",
10195 server.vm_swap_file);
10196 _exit(1);
10197 }
10198 server.vm_fd = fileno(server.vm_fp);
10199 }
10200
10201 /* This function must be called while with threaded IO locked */
10202 static void queueIOJob(iojob *j) {
10203 redisLog(REDIS_DEBUG,"Queued IO Job %p type %d about key '%s'\n",
10204 (void*)j, j->type, (char*)j->key->ptr);
10205 listAddNodeTail(server.io_newjobs,j);
10206 if (server.io_active_threads < server.vm_max_threads)
10207 spawnIOThread();
10208 }
10209
10210 static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db) {
10211 iojob *j;
10212
10213 j = zmalloc(sizeof(*j));
10214 j->type = REDIS_IOJOB_PREPARE_SWAP;
10215 j->db = db;
10216 j->key = key;
10217 incrRefCount(key);
10218 j->id = j->val = val;
10219 incrRefCount(val);
10220 j->canceled = 0;
10221 j->thread = (pthread_t) -1;
10222 val->storage = REDIS_VM_SWAPPING;
10223
10224 lockThreadedIO();
10225 queueIOJob(j);
10226 unlockThreadedIO();
10227 return REDIS_OK;
10228 }
10229
10230 /* ============ Virtual Memory - Blocking clients on missing keys =========== */
10231
10232 /* This function makes the clinet 'c' waiting for the key 'key' to be loaded.
10233 * If there is not already a job loading the key, it is craeted.
10234 * The key is added to the io_keys list in the client structure, and also
10235 * in the hash table mapping swapped keys to waiting clients, that is,
10236 * server.io_waited_keys. */
10237 static int waitForSwappedKey(redisClient *c, robj *key) {
10238 struct dictEntry *de;
10239 robj *o;
10240 list *l;
10241
10242 /* If the key does not exist or is already in RAM we don't need to
10243 * block the client at all. */
10244 de = dictFind(c->db->dict,key->ptr);
10245 if (de == NULL) return 0;
10246 o = dictGetEntryVal(de);
10247 if (o->storage == REDIS_VM_MEMORY) {
10248 return 0;
10249 } else if (o->storage == REDIS_VM_SWAPPING) {
10250 /* We were swapping the key, undo it! */
10251 vmCancelThreadedIOJob(o);
10252 return 0;
10253 }
10254
10255 /* OK: the key is either swapped, or being loaded just now. */
10256
10257 /* Add the key to the list of keys this client is waiting for.
10258 * This maps clients to keys they are waiting for. */
10259 listAddNodeTail(c->io_keys,key);
10260 incrRefCount(key);
10261
10262 /* Add the client to the swapped keys => clients waiting map. */
10263 de = dictFind(c->db->io_keys,key);
10264 if (de == NULL) {
10265 int retval;
10266
10267 /* For every key we take a list of clients blocked for it */
10268 l = listCreate();
10269 retval = dictAdd(c->db->io_keys,key,l);
10270 incrRefCount(key);
10271 assert(retval == DICT_OK);
10272 } else {
10273 l = dictGetEntryVal(de);
10274 }
10275 listAddNodeTail(l,c);
10276
10277 /* Are we already loading the key from disk? If not create a job */
10278 if (o->storage == REDIS_VM_SWAPPED) {
10279 iojob *j;
10280 vmpointer *vp = (vmpointer*)o;
10281
10282 o->storage = REDIS_VM_LOADING;
10283 j = zmalloc(sizeof(*j));
10284 j->type = REDIS_IOJOB_LOAD;
10285 j->db = c->db;
10286 j->id = (robj*)vp;
10287 j->key = key;
10288 incrRefCount(key);
10289 j->page = vp->page;
10290 j->val = NULL;
10291 j->canceled = 0;
10292 j->thread = (pthread_t) -1;
10293 lockThreadedIO();
10294 queueIOJob(j);
10295 unlockThreadedIO();
10296 }
10297 return 1;
10298 }
10299
10300 /* Preload keys for any command with first, last and step values for
10301 * the command keys prototype, as defined in the command table. */
10302 static void waitForMultipleSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) {
10303 int j, last;
10304 if (cmd->vm_firstkey == 0) return;
10305 last = cmd->vm_lastkey;
10306 if (last < 0) last = argc+last;
10307 for (j = cmd->vm_firstkey; j <= last; j += cmd->vm_keystep) {
10308 redisAssert(j < argc);
10309 waitForSwappedKey(c,argv[j]);
10310 }
10311 }
10312
10313 /* Preload keys needed for the ZUNIONSTORE and ZINTERSTORE commands.
10314 * Note that the number of keys to preload is user-defined, so we need to
10315 * apply a sanity check against argc. */
10316 static void zunionInterBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) {
10317 int i, num;
10318 REDIS_NOTUSED(cmd);
10319
10320 num = atoi(argv[2]->ptr);
10321 if (num > (argc-3)) return;
10322 for (i = 0; i < num; i++) {
10323 waitForSwappedKey(c,argv[3+i]);
10324 }
10325 }
10326
10327 /* Preload keys needed to execute the entire MULTI/EXEC block.
10328 *
10329 * This function is called by blockClientOnSwappedKeys when EXEC is issued,
10330 * and will block the client when any command requires a swapped out value. */
10331 static void execBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) {
10332 int i, margc;
10333 struct redisCommand *mcmd;
10334 robj **margv;
10335 REDIS_NOTUSED(cmd);
10336 REDIS_NOTUSED(argc);
10337 REDIS_NOTUSED(argv);
10338
10339 if (!(c->flags & REDIS_MULTI)) return;
10340 for (i = 0; i < c->mstate.count; i++) {
10341 mcmd = c->mstate.commands[i].cmd;
10342 margc = c->mstate.commands[i].argc;
10343 margv = c->mstate.commands[i].argv;
10344
10345 if (mcmd->vm_preload_proc != NULL) {
10346 mcmd->vm_preload_proc(c,mcmd,margc,margv);
10347 } else {
10348 waitForMultipleSwappedKeys(c,mcmd,margc,margv);
10349 }
10350 }
10351 }
10352
10353 /* Is this client attempting to run a command against swapped keys?
10354 * If so, block it ASAP, load the keys in background, then resume it.
10355 *
10356 * The important idea about this function is that it can fail! If keys will
10357 * still be swapped when the client is resumed, this key lookups will
10358 * just block loading keys from disk. In practical terms this should only
10359 * happen with SORT BY command or if there is a bug in this function.
10360 *
10361 * Return 1 if the client is marked as blocked, 0 if the client can
10362 * continue as the keys it is going to access appear to be in memory. */
10363 static int blockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd) {
10364 if (cmd->vm_preload_proc != NULL) {
10365 cmd->vm_preload_proc(c,cmd,c->argc,c->argv);
10366 } else {
10367 waitForMultipleSwappedKeys(c,cmd,c->argc,c->argv);
10368 }
10369
10370 /* If the client was blocked for at least one key, mark it as blocked. */
10371 if (listLength(c->io_keys)) {
10372 c->flags |= REDIS_IO_WAIT;
10373 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
10374 server.vm_blocked_clients++;
10375 return 1;
10376 } else {
10377 return 0;
10378 }
10379 }
10380
10381 /* Remove the 'key' from the list of blocked keys for a given client.
10382 *
10383 * The function returns 1 when there are no longer blocking keys after
10384 * the current one was removed (and the client can be unblocked). */
10385 static int dontWaitForSwappedKey(redisClient *c, robj *key) {
10386 list *l;
10387 listNode *ln;
10388 listIter li;
10389 struct dictEntry *de;
10390
10391 /* Remove the key from the list of keys this client is waiting for. */
10392 listRewind(c->io_keys,&li);
10393 while ((ln = listNext(&li)) != NULL) {
10394 if (equalStringObjects(ln->value,key)) {
10395 listDelNode(c->io_keys,ln);
10396 break;
10397 }
10398 }
10399 assert(ln != NULL);
10400
10401 /* Remove the client form the key => waiting clients map. */
10402 de = dictFind(c->db->io_keys,key);
10403 assert(de != NULL);
10404 l = dictGetEntryVal(de);
10405 ln = listSearchKey(l,c);
10406 assert(ln != NULL);
10407 listDelNode(l,ln);
10408 if (listLength(l) == 0)
10409 dictDelete(c->db->io_keys,key);
10410
10411 return listLength(c->io_keys) == 0;
10412 }
10413
10414 /* Every time we now a key was loaded back in memory, we handle clients
10415 * waiting for this key if any. */
10416 static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key) {
10417 struct dictEntry *de;
10418 list *l;
10419 listNode *ln;
10420 int len;
10421
10422 de = dictFind(db->io_keys,key);
10423 if (!de) return;
10424
10425 l = dictGetEntryVal(de);
10426 len = listLength(l);
10427 /* Note: we can't use something like while(listLength(l)) as the list
10428 * can be freed by the calling function when we remove the last element. */
10429 while (len--) {
10430 ln = listFirst(l);
10431 redisClient *c = ln->value;
10432
10433 if (dontWaitForSwappedKey(c,key)) {
10434 /* Put the client in the list of clients ready to go as we
10435 * loaded all the keys about it. */
10436 listAddNodeTail(server.io_ready_clients,c);
10437 }
10438 }
10439 }
10440
10441 /* =========================== Remote Configuration ========================= */
10442
10443 static void configSetCommand(redisClient *c) {
10444 robj *o = getDecodedObject(c->argv[3]);
10445 long long ll;
10446
10447 if (!strcasecmp(c->argv[2]->ptr,"dbfilename")) {
10448 zfree(server.dbfilename);
10449 server.dbfilename = zstrdup(o->ptr);
10450 } else if (!strcasecmp(c->argv[2]->ptr,"requirepass")) {
10451 zfree(server.requirepass);
10452 server.requirepass = zstrdup(o->ptr);
10453 } else if (!strcasecmp(c->argv[2]->ptr,"masterauth")) {
10454 zfree(server.masterauth);
10455 server.masterauth = zstrdup(o->ptr);
10456 } else if (!strcasecmp(c->argv[2]->ptr,"maxmemory")) {
10457 if (getLongLongFromObject(o,&ll) == REDIS_ERR ||
10458 ll < 0) goto badfmt;
10459 server.maxmemory = ll;
10460 } else if (!strcasecmp(c->argv[2]->ptr,"timeout")) {
10461 if (getLongLongFromObject(o,&ll) == REDIS_ERR ||
10462 ll < 0 || ll > LONG_MAX) goto badfmt;
10463 server.maxidletime = ll;
10464 } else if (!strcasecmp(c->argv[2]->ptr,"appendfsync")) {
10465 if (!strcasecmp(o->ptr,"no")) {
10466 server.appendfsync = APPENDFSYNC_NO;
10467 } else if (!strcasecmp(o->ptr,"everysec")) {
10468 server.appendfsync = APPENDFSYNC_EVERYSEC;
10469 } else if (!strcasecmp(o->ptr,"always")) {
10470 server.appendfsync = APPENDFSYNC_ALWAYS;
10471 } else {
10472 goto badfmt;
10473 }
10474 } else if (!strcasecmp(c->argv[2]->ptr,"no-appendfsync-on-rewrite")) {
10475 int yn = yesnotoi(o->ptr);
10476
10477 if (yn == -1) goto badfmt;
10478 server.no_appendfsync_on_rewrite = yn;
10479 } else if (!strcasecmp(c->argv[2]->ptr,"appendonly")) {
10480 int old = server.appendonly;
10481 int new = yesnotoi(o->ptr);
10482
10483 if (new == -1) goto badfmt;
10484 if (old != new) {
10485 if (new == 0) {
10486 stopAppendOnly();
10487 } else {
10488 if (startAppendOnly() == REDIS_ERR) {
10489 addReplySds(c,sdscatprintf(sdsempty(),
10490 "-ERR Unable to turn on AOF. Check server logs.\r\n"));
10491 decrRefCount(o);
10492 return;
10493 }
10494 }
10495 }
10496 } else if (!strcasecmp(c->argv[2]->ptr,"save")) {
10497 int vlen, j;
10498 sds *v = sdssplitlen(o->ptr,sdslen(o->ptr)," ",1,&vlen);
10499
10500 /* Perform sanity check before setting the new config:
10501 * - Even number of args
10502 * - Seconds >= 1, changes >= 0 */
10503 if (vlen & 1) {
10504 sdsfreesplitres(v,vlen);
10505 goto badfmt;
10506 }
10507 for (j = 0; j < vlen; j++) {
10508 char *eptr;
10509 long val;
10510
10511 val = strtoll(v[j], &eptr, 10);
10512 if (eptr[0] != '\0' ||
10513 ((j & 1) == 0 && val < 1) ||
10514 ((j & 1) == 1 && val < 0)) {
10515 sdsfreesplitres(v,vlen);
10516 goto badfmt;
10517 }
10518 }
10519 /* Finally set the new config */
10520 resetServerSaveParams();
10521 for (j = 0; j < vlen; j += 2) {
10522 time_t seconds;
10523 int changes;
10524
10525 seconds = strtoll(v[j],NULL,10);
10526 changes = strtoll(v[j+1],NULL,10);
10527 appendServerSaveParams(seconds, changes);
10528 }
10529 sdsfreesplitres(v,vlen);
10530 } else {
10531 addReplySds(c,sdscatprintf(sdsempty(),
10532 "-ERR not supported CONFIG parameter %s\r\n",
10533 (char*)c->argv[2]->ptr));
10534 decrRefCount(o);
10535 return;
10536 }
10537 decrRefCount(o);
10538 addReply(c,shared.ok);
10539 return;
10540
10541 badfmt: /* Bad format errors */
10542 addReplySds(c,sdscatprintf(sdsempty(),
10543 "-ERR invalid argument '%s' for CONFIG SET '%s'\r\n",
10544 (char*)o->ptr,
10545 (char*)c->argv[2]->ptr));
10546 decrRefCount(o);
10547 }
10548
10549 static void configGetCommand(redisClient *c) {
10550 robj *o = getDecodedObject(c->argv[2]);
10551 robj *lenobj = createObject(REDIS_STRING,NULL);
10552 char *pattern = o->ptr;
10553 int matches = 0;
10554
10555 addReply(c,lenobj);
10556 decrRefCount(lenobj);
10557
10558 if (stringmatch(pattern,"dbfilename",0)) {
10559 addReplyBulkCString(c,"dbfilename");
10560 addReplyBulkCString(c,server.dbfilename);
10561 matches++;
10562 }
10563 if (stringmatch(pattern,"requirepass",0)) {
10564 addReplyBulkCString(c,"requirepass");
10565 addReplyBulkCString(c,server.requirepass);
10566 matches++;
10567 }
10568 if (stringmatch(pattern,"masterauth",0)) {
10569 addReplyBulkCString(c,"masterauth");
10570 addReplyBulkCString(c,server.masterauth);
10571 matches++;
10572 }
10573 if (stringmatch(pattern,"maxmemory",0)) {
10574 char buf[128];
10575
10576 ll2string(buf,128,server.maxmemory);
10577 addReplyBulkCString(c,"maxmemory");
10578 addReplyBulkCString(c,buf);
10579 matches++;
10580 }
10581 if (stringmatch(pattern,"timeout",0)) {
10582 char buf[128];
10583
10584 ll2string(buf,128,server.maxidletime);
10585 addReplyBulkCString(c,"timeout");
10586 addReplyBulkCString(c,buf);
10587 matches++;
10588 }
10589 if (stringmatch(pattern,"appendonly",0)) {
10590 addReplyBulkCString(c,"appendonly");
10591 addReplyBulkCString(c,server.appendonly ? "yes" : "no");
10592 matches++;
10593 }
10594 if (stringmatch(pattern,"no-appendfsync-on-rewrite",0)) {
10595 addReplyBulkCString(c,"no-appendfsync-on-rewrite");
10596 addReplyBulkCString(c,server.no_appendfsync_on_rewrite ? "yes" : "no");
10597 matches++;
10598 }
10599 if (stringmatch(pattern,"appendfsync",0)) {
10600 char *policy;
10601
10602 switch(server.appendfsync) {
10603 case APPENDFSYNC_NO: policy = "no"; break;
10604 case APPENDFSYNC_EVERYSEC: policy = "everysec"; break;
10605 case APPENDFSYNC_ALWAYS: policy = "always"; break;
10606 default: policy = "unknown"; break; /* too harmless to panic */
10607 }
10608 addReplyBulkCString(c,"appendfsync");
10609 addReplyBulkCString(c,policy);
10610 matches++;
10611 }
10612 if (stringmatch(pattern,"save",0)) {
10613 sds buf = sdsempty();
10614 int j;
10615
10616 for (j = 0; j < server.saveparamslen; j++) {
10617 buf = sdscatprintf(buf,"%ld %d",
10618 server.saveparams[j].seconds,
10619 server.saveparams[j].changes);
10620 if (j != server.saveparamslen-1)
10621 buf = sdscatlen(buf," ",1);
10622 }
10623 addReplyBulkCString(c,"save");
10624 addReplyBulkCString(c,buf);
10625 sdsfree(buf);
10626 matches++;
10627 }
10628 decrRefCount(o);
10629 lenobj->ptr = sdscatprintf(sdsempty(),"*%d\r\n",matches*2);
10630 }
10631
10632 static void configCommand(redisClient *c) {
10633 if (!strcasecmp(c->argv[1]->ptr,"set")) {
10634 if (c->argc != 4) goto badarity;
10635 configSetCommand(c);
10636 } else if (!strcasecmp(c->argv[1]->ptr,"get")) {
10637 if (c->argc != 3) goto badarity;
10638 configGetCommand(c);
10639 } else if (!strcasecmp(c->argv[1]->ptr,"resetstat")) {
10640 if (c->argc != 2) goto badarity;
10641 server.stat_numcommands = 0;
10642 server.stat_numconnections = 0;
10643 server.stat_expiredkeys = 0;
10644 server.stat_starttime = time(NULL);
10645 addReply(c,shared.ok);
10646 } else {
10647 addReplySds(c,sdscatprintf(sdsempty(),
10648 "-ERR CONFIG subcommand must be one of GET, SET, RESETSTAT\r\n"));
10649 }
10650 return;
10651
10652 badarity:
10653 addReplySds(c,sdscatprintf(sdsempty(),
10654 "-ERR Wrong number of arguments for CONFIG %s\r\n",
10655 (char*) c->argv[1]->ptr));
10656 }
10657
10658 /* =========================== Pubsub implementation ======================== */
10659
10660 static void freePubsubPattern(void *p) {
10661 pubsubPattern *pat = p;
10662
10663 decrRefCount(pat->pattern);
10664 zfree(pat);
10665 }
10666
10667 static int listMatchPubsubPattern(void *a, void *b) {
10668 pubsubPattern *pa = a, *pb = b;
10669
10670 return (pa->client == pb->client) &&
10671 (equalStringObjects(pa->pattern,pb->pattern));
10672 }
10673
10674 /* Subscribe a client to a channel. Returns 1 if the operation succeeded, or
10675 * 0 if the client was already subscribed to that channel. */
10676 static int pubsubSubscribeChannel(redisClient *c, robj *channel) {
10677 struct dictEntry *de;
10678 list *clients = NULL;
10679 int retval = 0;
10680
10681 /* Add the channel to the client -> channels hash table */
10682 if (dictAdd(c->pubsub_channels,channel,NULL) == DICT_OK) {
10683 retval = 1;
10684 incrRefCount(channel);
10685 /* Add the client to the channel -> list of clients hash table */
10686 de = dictFind(server.pubsub_channels,channel);
10687 if (de == NULL) {
10688 clients = listCreate();
10689 dictAdd(server.pubsub_channels,channel,clients);
10690 incrRefCount(channel);
10691 } else {
10692 clients = dictGetEntryVal(de);
10693 }
10694 listAddNodeTail(clients,c);
10695 }
10696 /* Notify the client */
10697 addReply(c,shared.mbulk3);
10698 addReply(c,shared.subscribebulk);
10699 addReplyBulk(c,channel);
10700 addReplyLongLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
10701 return retval;
10702 }
10703
10704 /* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
10705 * 0 if the client was not subscribed to the specified channel. */
10706 static int pubsubUnsubscribeChannel(redisClient *c, robj *channel, int notify) {
10707 struct dictEntry *de;
10708 list *clients;
10709 listNode *ln;
10710 int retval = 0;
10711
10712 /* Remove the channel from the client -> channels hash table */
10713 incrRefCount(channel); /* channel may be just a pointer to the same object
10714 we have in the hash tables. Protect it... */
10715 if (dictDelete(c->pubsub_channels,channel) == DICT_OK) {
10716 retval = 1;
10717 /* Remove the client from the channel -> clients list hash table */
10718 de = dictFind(server.pubsub_channels,channel);
10719 assert(de != NULL);
10720 clients = dictGetEntryVal(de);
10721 ln = listSearchKey(clients,c);
10722 assert(ln != NULL);
10723 listDelNode(clients,ln);
10724 if (listLength(clients) == 0) {
10725 /* Free the list and associated hash entry at all if this was
10726 * the latest client, so that it will be possible to abuse
10727 * Redis PUBSUB creating millions of channels. */
10728 dictDelete(server.pubsub_channels,channel);
10729 }
10730 }
10731 /* Notify the client */
10732 if (notify) {
10733 addReply(c,shared.mbulk3);
10734 addReply(c,shared.unsubscribebulk);
10735 addReplyBulk(c,channel);
10736 addReplyLongLong(c,dictSize(c->pubsub_channels)+
10737 listLength(c->pubsub_patterns));
10738
10739 }
10740 decrRefCount(channel); /* it is finally safe to release it */
10741 return retval;
10742 }
10743
10744 /* Subscribe a client to a pattern. Returns 1 if the operation succeeded, or 0 if the clinet was already subscribed to that pattern. */
10745 static int pubsubSubscribePattern(redisClient *c, robj *pattern) {
10746 int retval = 0;
10747
10748 if (listSearchKey(c->pubsub_patterns,pattern) == NULL) {
10749 retval = 1;
10750 pubsubPattern *pat;
10751 listAddNodeTail(c->pubsub_patterns,pattern);
10752 incrRefCount(pattern);
10753 pat = zmalloc(sizeof(*pat));
10754 pat->pattern = getDecodedObject(pattern);
10755 pat->client = c;
10756 listAddNodeTail(server.pubsub_patterns,pat);
10757 }
10758 /* Notify the client */
10759 addReply(c,shared.mbulk3);
10760 addReply(c,shared.psubscribebulk);
10761 addReplyBulk(c,pattern);
10762 addReplyLongLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
10763 return retval;
10764 }
10765
10766 /* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
10767 * 0 if the client was not subscribed to the specified channel. */
10768 static int pubsubUnsubscribePattern(redisClient *c, robj *pattern, int notify) {
10769 listNode *ln;
10770 pubsubPattern pat;
10771 int retval = 0;
10772
10773 incrRefCount(pattern); /* Protect the object. May be the same we remove */
10774 if ((ln = listSearchKey(c->pubsub_patterns,pattern)) != NULL) {
10775 retval = 1;
10776 listDelNode(c->pubsub_patterns,ln);
10777 pat.client = c;
10778 pat.pattern = pattern;
10779 ln = listSearchKey(server.pubsub_patterns,&pat);
10780 listDelNode(server.pubsub_patterns,ln);
10781 }
10782 /* Notify the client */
10783 if (notify) {
10784 addReply(c,shared.mbulk3);
10785 addReply(c,shared.punsubscribebulk);
10786 addReplyBulk(c,pattern);
10787 addReplyLongLong(c,dictSize(c->pubsub_channels)+
10788 listLength(c->pubsub_patterns));
10789 }
10790 decrRefCount(pattern);
10791 return retval;
10792 }
10793
10794 /* Unsubscribe from all the channels. Return the number of channels the
10795 * client was subscribed from. */
10796 static int pubsubUnsubscribeAllChannels(redisClient *c, int notify) {
10797 dictIterator *di = dictGetIterator(c->pubsub_channels);
10798 dictEntry *de;
10799 int count = 0;
10800
10801 while((de = dictNext(di)) != NULL) {
10802 robj *channel = dictGetEntryKey(de);
10803
10804 count += pubsubUnsubscribeChannel(c,channel,notify);
10805 }
10806 dictReleaseIterator(di);
10807 return count;
10808 }
10809
10810 /* Unsubscribe from all the patterns. Return the number of patterns the
10811 * client was subscribed from. */
10812 static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify) {
10813 listNode *ln;
10814 listIter li;
10815 int count = 0;
10816
10817 listRewind(c->pubsub_patterns,&li);
10818 while ((ln = listNext(&li)) != NULL) {
10819 robj *pattern = ln->value;
10820
10821 count += pubsubUnsubscribePattern(c,pattern,notify);
10822 }
10823 return count;
10824 }
10825
10826 /* Publish a message */
10827 static int pubsubPublishMessage(robj *channel, robj *message) {
10828 int receivers = 0;
10829 struct dictEntry *de;
10830 listNode *ln;
10831 listIter li;
10832
10833 /* Send to clients listening for that channel */
10834 de = dictFind(server.pubsub_channels,channel);
10835 if (de) {
10836 list *list = dictGetEntryVal(de);
10837 listNode *ln;
10838 listIter li;
10839
10840 listRewind(list,&li);
10841 while ((ln = listNext(&li)) != NULL) {
10842 redisClient *c = ln->value;
10843
10844 addReply(c,shared.mbulk3);
10845 addReply(c,shared.messagebulk);
10846 addReplyBulk(c,channel);
10847 addReplyBulk(c,message);
10848 receivers++;
10849 }
10850 }
10851 /* Send to clients listening to matching channels */
10852 if (listLength(server.pubsub_patterns)) {
10853 listRewind(server.pubsub_patterns,&li);
10854 channel = getDecodedObject(channel);
10855 while ((ln = listNext(&li)) != NULL) {
10856 pubsubPattern *pat = ln->value;
10857
10858 if (stringmatchlen((char*)pat->pattern->ptr,
10859 sdslen(pat->pattern->ptr),
10860 (char*)channel->ptr,
10861 sdslen(channel->ptr),0)) {
10862 addReply(pat->client,shared.mbulk4);
10863 addReply(pat->client,shared.pmessagebulk);
10864 addReplyBulk(pat->client,pat->pattern);
10865 addReplyBulk(pat->client,channel);
10866 addReplyBulk(pat->client,message);
10867 receivers++;
10868 }
10869 }
10870 decrRefCount(channel);
10871 }
10872 return receivers;
10873 }
10874
10875 static void subscribeCommand(redisClient *c) {
10876 int j;
10877
10878 for (j = 1; j < c->argc; j++)
10879 pubsubSubscribeChannel(c,c->argv[j]);
10880 }
10881
10882 static void unsubscribeCommand(redisClient *c) {
10883 if (c->argc == 1) {
10884 pubsubUnsubscribeAllChannels(c,1);
10885 return;
10886 } else {
10887 int j;
10888
10889 for (j = 1; j < c->argc; j++)
10890 pubsubUnsubscribeChannel(c,c->argv[j],1);
10891 }
10892 }
10893
10894 static void psubscribeCommand(redisClient *c) {
10895 int j;
10896
10897 for (j = 1; j < c->argc; j++)
10898 pubsubSubscribePattern(c,c->argv[j]);
10899 }
10900
10901 static void punsubscribeCommand(redisClient *c) {
10902 if (c->argc == 1) {
10903 pubsubUnsubscribeAllPatterns(c,1);
10904 return;
10905 } else {
10906 int j;
10907
10908 for (j = 1; j < c->argc; j++)
10909 pubsubUnsubscribePattern(c,c->argv[j],1);
10910 }
10911 }
10912
10913 static void publishCommand(redisClient *c) {
10914 int receivers = pubsubPublishMessage(c->argv[1],c->argv[2]);
10915 addReplyLongLong(c,receivers);
10916 }
10917
/* ===================== WATCH (CAS alike for MULTI/EXEC) ===================
 *
 * The implementation uses a per-DB hash table mapping keys to the list of
 * clients WATCHing those keys, so that given a key that is going to be
 * modified we can mark all the associated clients as dirty.
 *
 * Also every client contains a list of WATCHed keys, so that it is possible
 * to un-watch such keys when the client is freed or when UNWATCH is called. */
10926
/* Element type of the client->watched_keys list. A watchedKey structure
 * is used because in order to identify a key in Redis we need both the
 * key name and the DB. */
typedef struct watchedKey {
    robj *key;      /* Name of the watched key. */
    redisDb *db;    /* DB the key was watched in. */
} watchedKey;
10934
10935 /* Watch for the specified key */
10936 static void watchForKey(redisClient *c, robj *key) {
10937 list *clients = NULL;
10938 listIter li;
10939 listNode *ln;
10940 watchedKey *wk;
10941
10942 /* Check if we are already watching for this key */
10943 listRewind(c->watched_keys,&li);
10944 while((ln = listNext(&li))) {
10945 wk = listNodeValue(ln);
10946 if (wk->db == c->db && equalStringObjects(key,wk->key))
10947 return; /* Key already watched */
10948 }
10949 /* This key is not already watched in this DB. Let's add it */
10950 clients = dictFetchValue(c->db->watched_keys,key);
10951 if (!clients) {
10952 clients = listCreate();
10953 dictAdd(c->db->watched_keys,key,clients);
10954 incrRefCount(key);
10955 }
10956 listAddNodeTail(clients,c);
10957 /* Add the new key to the lits of keys watched by this client */
10958 wk = zmalloc(sizeof(*wk));
10959 wk->key = key;
10960 wk->db = c->db;
10961 incrRefCount(key);
10962 listAddNodeTail(c->watched_keys,wk);
10963 }
10964
10965 /* Unwatch all the keys watched by this client. To clean the EXEC dirty
10966 * flag is up to the caller. */
10967 static void unwatchAllKeys(redisClient *c) {
10968 listIter li;
10969 listNode *ln;
10970
10971 if (listLength(c->watched_keys) == 0) return;
10972 listRewind(c->watched_keys,&li);
10973 while((ln = listNext(&li))) {
10974 list *clients;
10975 watchedKey *wk;
10976
10977 /* Lookup the watched key -> clients list and remove the client
10978 * from the list */
10979 wk = listNodeValue(ln);
10980 clients = dictFetchValue(wk->db->watched_keys, wk->key);
10981 assert(clients != NULL);
10982 listDelNode(clients,listSearchKey(clients,c));
10983 /* Kill the entry at all if this was the only client */
10984 if (listLength(clients) == 0)
10985 dictDelete(wk->db->watched_keys, wk->key);
10986 /* Remove this watched key from the client->watched list */
10987 listDelNode(c->watched_keys,ln);
10988 decrRefCount(wk->key);
10989 zfree(wk);
10990 }
10991 }
10992
10993 /* "Touch" a key, so that if this key is being WATCHed by some client the
10994 * next EXEC will fail. */
10995 static void touchWatchedKey(redisDb *db, robj *key) {
10996 list *clients;
10997 listIter li;
10998 listNode *ln;
10999
11000 if (dictSize(db->watched_keys) == 0) return;
11001 clients = dictFetchValue(db->watched_keys, key);
11002 if (!clients) return;
11003
11004 /* Mark all the clients watching this key as REDIS_DIRTY_CAS */
11005 /* Check if we are already watching for this key */
11006 listRewind(clients,&li);
11007 while((ln = listNext(&li))) {
11008 redisClient *c = listNodeValue(ln);
11009
11010 c->flags |= REDIS_DIRTY_CAS;
11011 }
11012 }
11013
11014 /* On FLUSHDB or FLUSHALL all the watched keys that are present before the
11015 * flush but will be deleted as effect of the flushing operation should
11016 * be touched. "dbid" is the DB that's getting the flush. -1 if it is
11017 * a FLUSHALL operation (all the DBs flushed). */
11018 static void touchWatchedKeysOnFlush(int dbid) {
11019 listIter li1, li2;
11020 listNode *ln;
11021
11022 /* For every client, check all the waited keys */
11023 listRewind(server.clients,&li1);
11024 while((ln = listNext(&li1))) {
11025 redisClient *c = listNodeValue(ln);
11026 listRewind(c->watched_keys,&li2);
11027 while((ln = listNext(&li2))) {
11028 watchedKey *wk = listNodeValue(ln);
11029
11030 /* For every watched key matching the specified DB, if the
11031 * key exists, mark the client as dirty, as the key will be
11032 * removed. */
11033 if (dbid == -1 || wk->db->id == dbid) {
11034 if (dictFind(wk->db->dict, wk->key->ptr) != NULL)
11035 c->flags |= REDIS_DIRTY_CAS;
11036 }
11037 }
11038 }
11039 }
11040
11041 static void watchCommand(redisClient *c) {
11042 int j;
11043
11044 if (c->flags & REDIS_MULTI) {
11045 addReplySds(c,sdsnew("-ERR WATCH inside MULTI is not allowed\r\n"));
11046 return;
11047 }
11048 for (j = 1; j < c->argc; j++)
11049 watchForKey(c,c->argv[j]);
11050 addReply(c,shared.ok);
11051 }
11052
11053 static void unwatchCommand(redisClient *c) {
11054 unwatchAllKeys(c);
11055 c->flags &= (~REDIS_DIRTY_CAS);
11056 addReply(c,shared.ok);
11057 }
11058
11059 /* ================================= Debugging ============================== */
11060
11061 /* Compute the sha1 of string at 's' with 'len' bytes long.
11062 * The SHA1 is then xored againt the string pointed by digest.
11063 * Since xor is commutative, this operation is used in order to
11064 * "add" digests relative to unordered elements.
11065 *
11066 * So digest(a,b,c,d) will be the same of digest(b,a,c,d) */
11067 static void xorDigest(unsigned char *digest, void *ptr, size_t len) {
11068 SHA1_CTX ctx;
11069 unsigned char hash[20], *s = ptr;
11070 int j;
11071
11072 SHA1Init(&ctx);
11073 SHA1Update(&ctx,s,len);
11074 SHA1Final(hash,&ctx);
11075
11076 for (j = 0; j < 20; j++)
11077 digest[j] ^= hash[j];
11078 }
11079
11080 static void xorObjectDigest(unsigned char *digest, robj *o) {
11081 o = getDecodedObject(o);
11082 xorDigest(digest,o->ptr,sdslen(o->ptr));
11083 decrRefCount(o);
11084 }
11085
11086 /* This function instead of just computing the SHA1 and xoring it
11087 * against diget, also perform the digest of "digest" itself and
11088 * replace the old value with the new one.
11089 *
11090 * So the final digest will be:
11091 *
11092 * digest = SHA1(digest xor SHA1(data))
11093 *
11094 * This function is used every time we want to preserve the order so
11095 * that digest(a,b,c,d) will be different than digest(b,c,d,a)
11096 *
11097 * Also note that mixdigest("foo") followed by mixdigest("bar")
11098 * will lead to a different digest compared to "fo", "obar".
11099 */
11100 static void mixDigest(unsigned char *digest, void *ptr, size_t len) {
11101 SHA1_CTX ctx;
11102 char *s = ptr;
11103
11104 xorDigest(digest,s,len);
11105 SHA1Init(&ctx);
11106 SHA1Update(&ctx,digest,20);
11107 SHA1Final(digest,&ctx);
11108 }
11109
11110 static void mixObjectDigest(unsigned char *digest, robj *o) {
11111 o = getDecodedObject(o);
11112 mixDigest(digest,o->ptr,sdslen(o->ptr));
11113 decrRefCount(o);
11114 }
11115
11116 /* Compute the dataset digest. Since keys, sets elements, hashes elements
11117 * are not ordered, we use a trick: every aggregate digest is the xor
11118 * of the digests of their elements. This way the order will not change
11119 * the result. For list instead we use a feedback entering the output digest
11120 * as input in order to ensure that a different ordered list will result in
11121 * a different digest. */
11122 static void computeDatasetDigest(unsigned char *final) {
11123 unsigned char digest[20];
11124 char buf[128];
11125 dictIterator *di = NULL;
11126 dictEntry *de;
11127 int j;
11128 uint32_t aux;
11129
11130 memset(final,0,20); /* Start with a clean result */
11131
11132 for (j = 0; j < server.dbnum; j++) {
11133 redisDb *db = server.db+j;
11134
11135 if (dictSize(db->dict) == 0) continue;
11136 di = dictGetIterator(db->dict);
11137
11138 /* hash the DB id, so the same dataset moved in a different
11139 * DB will lead to a different digest */
11140 aux = htonl(j);
11141 mixDigest(final,&aux,sizeof(aux));
11142
11143 /* Iterate this DB writing every entry */
11144 while((de = dictNext(di)) != NULL) {
11145 sds key;
11146 robj *keyobj, *o;
11147 time_t expiretime;
11148
11149 memset(digest,0,20); /* This key-val digest */
11150 key = dictGetEntryKey(de);
11151 keyobj = createStringObject(key,sdslen(key));
11152
11153 mixDigest(digest,key,sdslen(key));
11154
11155 /* Make sure the key is loaded if VM is active */
11156 o = lookupKeyRead(db,keyobj);
11157
11158 aux = htonl(o->type);
11159 mixDigest(digest,&aux,sizeof(aux));
11160 expiretime = getExpire(db,keyobj);
11161
11162 /* Save the key and associated value */
11163 if (o->type == REDIS_STRING) {
11164 mixObjectDigest(digest,o);
11165 } else if (o->type == REDIS_LIST) {
11166 listTypeIterator *li = listTypeInitIterator(o,0,REDIS_TAIL);
11167 listTypeEntry entry;
11168 while(listTypeNext(li,&entry)) {
11169 robj *eleobj = listTypeGet(&entry);
11170 mixObjectDigest(digest,eleobj);
11171 decrRefCount(eleobj);
11172 }
11173 listTypeReleaseIterator(li);
11174 } else if (o->type == REDIS_SET) {
11175 dict *set = o->ptr;
11176 dictIterator *di = dictGetIterator(set);
11177 dictEntry *de;
11178
11179 while((de = dictNext(di)) != NULL) {
11180 robj *eleobj = dictGetEntryKey(de);
11181
11182 xorObjectDigest(digest,eleobj);
11183 }
11184 dictReleaseIterator(di);
11185 } else if (o->type == REDIS_ZSET) {
11186 zset *zs = o->ptr;
11187 dictIterator *di = dictGetIterator(zs->dict);
11188 dictEntry *de;
11189
11190 while((de = dictNext(di)) != NULL) {
11191 robj *eleobj = dictGetEntryKey(de);
11192 double *score = dictGetEntryVal(de);
11193 unsigned char eledigest[20];
11194
11195 snprintf(buf,sizeof(buf),"%.17g",*score);
11196 memset(eledigest,0,20);
11197 mixObjectDigest(eledigest,eleobj);
11198 mixDigest(eledigest,buf,strlen(buf));
11199 xorDigest(digest,eledigest,20);
11200 }
11201 dictReleaseIterator(di);
11202 } else if (o->type == REDIS_HASH) {
11203 hashTypeIterator *hi;
11204 robj *obj;
11205
11206 hi = hashTypeInitIterator(o);
11207 while (hashTypeNext(hi) != REDIS_ERR) {
11208 unsigned char eledigest[20];
11209
11210 memset(eledigest,0,20);
11211 obj = hashTypeCurrent(hi,REDIS_HASH_KEY);
11212 mixObjectDigest(eledigest,obj);
11213 decrRefCount(obj);
11214 obj = hashTypeCurrent(hi,REDIS_HASH_VALUE);
11215 mixObjectDigest(eledigest,obj);
11216 decrRefCount(obj);
11217 xorDigest(digest,eledigest,20);
11218 }
11219 hashTypeReleaseIterator(hi);
11220 } else {
11221 redisPanic("Unknown object type");
11222 }
11223 /* If the key has an expire, add it to the mix */
11224 if (expiretime != -1) xorDigest(digest,"!!expire!!",10);
11225 /* We can finally xor the key-val digest to the final digest */
11226 xorDigest(final,digest,20);
11227 decrRefCount(keyobj);
11228 }
11229 dictReleaseIterator(di);
11230 }
11231 }
11232
11233 static void debugCommand(redisClient *c) {
11234 if (!strcasecmp(c->argv[1]->ptr,"segfault")) {
11235 *((char*)-1) = 'x';
11236 } else if (!strcasecmp(c->argv[1]->ptr,"reload")) {
11237 if (rdbSave(server.dbfilename) != REDIS_OK) {
11238 addReply(c,shared.err);
11239 return;
11240 }
11241 emptyDb();
11242 if (rdbLoad(server.dbfilename) != REDIS_OK) {
11243 addReply(c,shared.err);
11244 return;
11245 }
11246 redisLog(REDIS_WARNING,"DB reloaded by DEBUG RELOAD");
11247 addReply(c,shared.ok);
11248 } else if (!strcasecmp(c->argv[1]->ptr,"loadaof")) {
11249 emptyDb();
11250 if (loadAppendOnlyFile(server.appendfilename) != REDIS_OK) {
11251 addReply(c,shared.err);
11252 return;
11253 }
11254 redisLog(REDIS_WARNING,"Append Only File loaded by DEBUG LOADAOF");
11255 addReply(c,shared.ok);
11256 } else if (!strcasecmp(c->argv[1]->ptr,"object") && c->argc == 3) {
11257 dictEntry *de = dictFind(c->db->dict,c->argv[2]->ptr);
11258 robj *val;
11259
11260 if (!de) {
11261 addReply(c,shared.nokeyerr);
11262 return;
11263 }
11264 val = dictGetEntryVal(de);
11265 if (!server.vm_enabled || (val->storage == REDIS_VM_MEMORY ||
11266 val->storage == REDIS_VM_SWAPPING)) {
11267 char *strenc;
11268 char buf[128];
11269
11270 if (val->encoding < (sizeof(strencoding)/sizeof(char*))) {
11271 strenc = strencoding[val->encoding];
11272 } else {
11273 snprintf(buf,64,"unknown encoding %d\n", val->encoding);
11274 strenc = buf;
11275 }
11276 addReplySds(c,sdscatprintf(sdsempty(),
11277 "+Value at:%p refcount:%d "
11278 "encoding:%s serializedlength:%lld\r\n",
11279 (void*)val, val->refcount,
11280 strenc, (long long) rdbSavedObjectLen(val,NULL)));
11281 } else {
11282 vmpointer *vp = (vmpointer*) val;
11283 addReplySds(c,sdscatprintf(sdsempty(),
11284 "+Value swapped at: page %llu "
11285 "using %llu pages\r\n",
11286 (unsigned long long) vp->page,
11287 (unsigned long long) vp->usedpages));
11288 }
11289 } else if (!strcasecmp(c->argv[1]->ptr,"swapin") && c->argc == 3) {
11290 lookupKeyRead(c->db,c->argv[2]);
11291 addReply(c,shared.ok);
11292 } else if (!strcasecmp(c->argv[1]->ptr,"swapout") && c->argc == 3) {
11293 dictEntry *de = dictFind(c->db->dict,c->argv[2]->ptr);
11294 robj *val;
11295 vmpointer *vp;
11296
11297 if (!server.vm_enabled) {
11298 addReplySds(c,sdsnew("-ERR Virtual Memory is disabled\r\n"));
11299 return;
11300 }
11301 if (!de) {
11302 addReply(c,shared.nokeyerr);
11303 return;
11304 }
11305 val = dictGetEntryVal(de);
11306 /* Swap it */
11307 if (val->storage != REDIS_VM_MEMORY) {
11308 addReplySds(c,sdsnew("-ERR This key is not in memory\r\n"));
11309 } else if (val->refcount != 1) {
11310 addReplySds(c,sdsnew("-ERR Object is shared\r\n"));
11311 } else if ((vp = vmSwapObjectBlocking(val)) != NULL) {
11312 dictGetEntryVal(de) = vp;
11313 addReply(c,shared.ok);
11314 } else {
11315 addReply(c,shared.err);
11316 }
11317 } else if (!strcasecmp(c->argv[1]->ptr,"populate") && c->argc == 3) {
11318 long keys, j;
11319 robj *key, *val;
11320 char buf[128];
11321
11322 if (getLongFromObjectOrReply(c, c->argv[2], &keys, NULL) != REDIS_OK)
11323 return;
11324 for (j = 0; j < keys; j++) {
11325 snprintf(buf,sizeof(buf),"key:%lu",j);
11326 key = createStringObject(buf,strlen(buf));
11327 if (lookupKeyRead(c->db,key) != NULL) {
11328 decrRefCount(key);
11329 continue;
11330 }
11331 snprintf(buf,sizeof(buf),"value:%lu",j);
11332 val = createStringObject(buf,strlen(buf));
11333 dbAdd(c->db,key,val);
11334 decrRefCount(key);
11335 }
11336 addReply(c,shared.ok);
11337 } else if (!strcasecmp(c->argv[1]->ptr,"digest") && c->argc == 2) {
11338 unsigned char digest[20];
11339 sds d = sdsnew("+");
11340 int j;
11341
11342 computeDatasetDigest(digest);
11343 for (j = 0; j < 20; j++)
11344 d = sdscatprintf(d, "%02x",digest[j]);
11345
11346 d = sdscatlen(d,"\r\n",2);
11347 addReplySds(c,d);
11348 } else {
11349 addReplySds(c,sdsnew(
11350 "-ERR Syntax error, try DEBUG [SEGFAULT|OBJECT <key>|SWAPIN <key>|SWAPOUT <key>|RELOAD]\r\n"));
11351 }
11352 }
11353
11354 static void _redisAssert(char *estr, char *file, int line) {
11355 redisLog(REDIS_WARNING,"=== ASSERTION FAILED ===");
11356 redisLog(REDIS_WARNING,"==> %s:%d '%s' is not true",file,line,estr);
11357 #ifdef HAVE_BACKTRACE
11358 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
11359 *((char*)-1) = 'x';
11360 #endif
11361 }
11362
11363 static void _redisPanic(char *msg, char *file, int line) {
11364 redisLog(REDIS_WARNING,"!!! Software Failure. Press left mouse button to continue");
11365 redisLog(REDIS_WARNING,"Guru Meditation: %s #%s:%d",msg,file,line);
11366 #ifdef HAVE_BACKTRACE
11367 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
11368 *((char*)-1) = 'x';
11369 #endif
11370 }
11371
11372 /* =================================== Main! ================================ */
11373
11374 #ifdef __linux__
/* Read /proc/sys/vm/overcommit_memory and return its first line converted
 * with atoi(). Returns -1 if the file can't be opened or no line can be
 * read from it. */
int linuxOvercommitMemoryValue(void) {
    char line[64];
    char *got;
    FILE *procfile;

    procfile = fopen("/proc/sys/vm/overcommit_memory","r");
    if (procfile == NULL) return -1;

    /* Read first, close unconditionally, then decide what to return. */
    got = fgets(line,sizeof(line),procfile);
    fclose(procfile);

    return (got != NULL) ? atoi(line) : -1;
}
11388
11389 void linuxOvercommitMemoryWarning(void) {
11390 if (linuxOvercommitMemoryValue() == 0) {
11391 redisLog(REDIS_WARNING,"WARNING overcommit_memory is set to 0! Background save may fail under low memory condition. To fix this issue add 'vm.overcommit_memory = 1' to /etc/sysctl.conf and then reboot or run the command 'sysctl vm.overcommit_memory=1' for this to take effect.");
11392 }
11393 }
11394 #endif /* __linux__ */
11395
11396 static void daemonize(void) {
11397 int fd;
11398 FILE *fp;
11399
11400 if (fork() != 0) exit(0); /* parent exits */
11401 setsid(); /* create a new session */
11402
11403 /* Every output goes to /dev/null. If Redis is daemonized but
11404 * the 'logfile' is set to 'stdout' in the configuration file
11405 * it will not log at all. */
11406 if ((fd = open("/dev/null", O_RDWR, 0)) != -1) {
11407 dup2(fd, STDIN_FILENO);
11408 dup2(fd, STDOUT_FILENO);
11409 dup2(fd, STDERR_FILENO);
11410 if (fd > STDERR_FILENO) close(fd);
11411 }
11412 /* Try to write the pid file */
11413 fp = fopen(server.pidfile,"w");
11414 if (fp) {
11415 fprintf(fp,"%d\n",getpid());
11416 fclose(fp);
11417 }
11418 }
11419
11420 static void version() {
11421 printf("Redis server version %s (%s:%d)\n", REDIS_VERSION,
11422 REDIS_GIT_SHA1, atoi(REDIS_GIT_DIRTY) > 0);
11423 exit(0);
11424 }
11425
/* Print the command line synopsis on standard error and exit with
 * status 1. */
static void usage() {
    fputs("Usage: ./redis-server [/path/to/redis.conf]\n",stderr);
    fputs(" ./redis-server - (read config from stdin)\n",stderr);
    exit(1);
}
11431
11432 int main(int argc, char **argv) {
11433 time_t start;
11434
11435 initServerConfig();
11436 sortCommandTable();
11437 if (argc == 2) {
11438 if (strcmp(argv[1], "-v") == 0 ||
11439 strcmp(argv[1], "--version") == 0) version();
11440 if (strcmp(argv[1], "--help") == 0) usage();
11441 resetServerSaveParams();
11442 loadServerConfig(argv[1]);
11443 } else if ((argc > 2)) {
11444 usage();
11445 } else {
11446 redisLog(REDIS_WARNING,"Warning: no config file specified, using the default config. In order to specify a config file use 'redis-server /path/to/redis.conf'");
11447 }
11448 if (server.daemonize) daemonize();
11449 initServer();
11450 redisLog(REDIS_NOTICE,"Server started, Redis version " REDIS_VERSION);
11451 #ifdef __linux__
11452 linuxOvercommitMemoryWarning();
11453 #endif
11454 start = time(NULL);
11455 if (server.appendonly) {
11456 if (loadAppendOnlyFile(server.appendfilename) == REDIS_OK)
11457 redisLog(REDIS_NOTICE,"DB loaded from append only file: %ld seconds",time(NULL)-start);
11458 } else {
11459 if (rdbLoad(server.dbfilename) == REDIS_OK)
11460 redisLog(REDIS_NOTICE,"DB loaded from disk: %ld seconds",time(NULL)-start);
11461 }
11462 redisLog(REDIS_NOTICE,"The server is now ready to accept connections on port %d", server.port);
11463 aeSetBeforeSleepProc(server.el,beforeSleep);
11464 aeMain(server.el);
11465 aeDeleteEventLoop(server.el);
11466 return 0;
11467 }
11468
11469 /* ============================= Backtrace support ========================= */
11470
11471 #ifdef HAVE_BACKTRACE
11472 static char *findFuncName(void *pointer, unsigned long *offset);
11473
11474 static void *getMcontextEip(ucontext_t *uc) {
11475 #if defined(__FreeBSD__)
11476 return (void*) uc->uc_mcontext.mc_eip;
11477 #elif defined(__dietlibc__)
11478 return (void*) uc->uc_mcontext.eip;
11479 #elif defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6)
11480 #if __x86_64__
11481 return (void*) uc->uc_mcontext->__ss.__rip;
11482 #else
11483 return (void*) uc->uc_mcontext->__ss.__eip;
11484 #endif
11485 #elif defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)
11486 #if defined(_STRUCT_X86_THREAD_STATE64) && !defined(__i386__)
11487 return (void*) uc->uc_mcontext->__ss.__rip;
11488 #else
11489 return (void*) uc->uc_mcontext->__ss.__eip;
11490 #endif
11491 #elif defined(__i386__) || defined(__X86_64__) || defined(__x86_64__)
11492 return (void*) uc->uc_mcontext.gregs[REG_EIP]; /* Linux 32/64 bit */
11493 #elif defined(__ia64__) /* Linux IA64 */
11494 return (void*) uc->uc_mcontext.sc_ip;
11495 #else
11496 return NULL;
11497 #endif
11498 }
11499
11500 static void segvHandler(int sig, siginfo_t *info, void *secret) {
11501 void *trace[100];
11502 char **messages = NULL;
11503 int i, trace_size = 0;
11504 unsigned long offset=0;
11505 ucontext_t *uc = (ucontext_t*) secret;
11506 sds infostring;
11507 REDIS_NOTUSED(info);
11508
11509 redisLog(REDIS_WARNING,
11510 "======= Ooops! Redis %s got signal: -%d- =======", REDIS_VERSION, sig);
11511 infostring = genRedisInfoString();
11512 redisLog(REDIS_WARNING, "%s",infostring);
11513 /* It's not safe to sdsfree() the returned string under memory
11514 * corruption conditions. Let it leak as we are going to abort */
11515
11516 trace_size = backtrace(trace, 100);
11517 /* overwrite sigaction with caller's address */
11518 if (getMcontextEip(uc) != NULL) {
11519 trace[1] = getMcontextEip(uc);
11520 }
11521 messages = backtrace_symbols(trace, trace_size);
11522
11523 for (i=1; i<trace_size; ++i) {
11524 char *fn = findFuncName(trace[i], &offset), *p;
11525
11526 p = strchr(messages[i],'+');
11527 if (!fn || (p && ((unsigned long)strtol(p+1,NULL,10)) < offset)) {
11528 redisLog(REDIS_WARNING,"%s", messages[i]);
11529 } else {
11530 redisLog(REDIS_WARNING,"%d redis-server %p %s + %d", i, trace[i], fn, (unsigned int)offset);
11531 }
11532 }
11533 /* free(messages); Don't call free() with possibly corrupted memory. */
11534 _exit(0);
11535 }
11536
11537 static void sigtermHandler(int sig) {
11538 REDIS_NOTUSED(sig);
11539
11540 redisLog(REDIS_WARNING,"SIGTERM received, scheduling shutting down...");
11541 server.shutdown_asap = 1;
11542 }
11543
11544 static void setupSigSegvAction(void) {
11545 struct sigaction act;
11546
11547 sigemptyset (&act.sa_mask);
11548 /* When the SA_SIGINFO flag is set in sa_flags then sa_sigaction
11549 * is used. Otherwise, sa_handler is used */
11550 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND | SA_SIGINFO;
11551 act.sa_sigaction = segvHandler;
11552 sigaction (SIGSEGV, &act, NULL);
11553 sigaction (SIGBUS, &act, NULL);
11554 sigaction (SIGFPE, &act, NULL);
11555 sigaction (SIGILL, &act, NULL);
11556 sigaction (SIGBUS, &act, NULL);
11557
11558 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND;
11559 act.sa_handler = sigtermHandler;
11560 sigaction (SIGTERM, &act, NULL);
11561 return;
11562 }
11563
11564 #include "staticsymbols.h"
11565 /* This function try to convert a pointer into a function name. It's used in
11566 * oreder to provide a backtrace under segmentation fault that's able to
11567 * display functions declared as static (otherwise the backtrace is useless). */
11568 static char *findFuncName(void *pointer, unsigned long *offset){
11569 int i, ret = -1;
11570 unsigned long off, minoff = 0;
11571
11572 /* Try to match against the Symbol with the smallest offset */
11573 for (i=0; symsTable[i].pointer; i++) {
11574 unsigned long lp = (unsigned long) pointer;
11575
11576 if (lp != (unsigned long)-1 && lp >= symsTable[i].pointer) {
11577 off=lp-symsTable[i].pointer;
11578 if (ret < 0 || off < minoff) {
11579 minoff=off;
11580 ret=i;
11581 }
11582 }
11583 }
11584 if (ret == -1) return NULL;
11585 *offset = minoff;
11586 return symsTable[ret].name;
11587 }
11588 #else /* HAVE_BACKTRACE */
/* Variant compiled when HAVE_BACKTRACE is not defined: does nothing. */
static void setupSigSegvAction(void) {
    /* intentionally empty */
}
11591 #endif /* HAVE_BACKTRACE */
11592
11593
11594
11595 /* The End */
11596
11597
11598