]> git.saurik.com Git - redis.git/blob - redis.c
9253ed351c480538823726d5e2987e2bdf39dabc
[redis.git] / redis.c
1 /*
2 * Copyright (c) 2009-2010, Salvatore Sanfilippo <antirez at gmail dot com>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * * Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * * Neither the name of Redis nor the names of its contributors may be used
14 * to endorse or promote products derived from this software without
15 * specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 * POSSIBILITY OF SUCH DAMAGE.
28 */
29
/* Version string of this Redis source tree. */
#define REDIS_VERSION "2.1.1"
31
32 #include "fmacros.h"
33 #include "config.h"
34
35 #include <stdio.h>
36 #include <stdlib.h>
37 #include <string.h>
38 #include <time.h>
39 #include <unistd.h>
40 #include <signal.h>
41
42 #ifdef HAVE_BACKTRACE
43 #include <execinfo.h>
44 #include <ucontext.h>
45 #endif /* HAVE_BACKTRACE */
46
47 #include <sys/wait.h>
48 #include <errno.h>
49 #include <assert.h>
50 #include <ctype.h>
51 #include <stdarg.h>
52 #include <inttypes.h>
53 #include <arpa/inet.h>
54 #include <sys/stat.h>
55 #include <fcntl.h>
56 #include <sys/time.h>
57 #include <sys/resource.h>
58 #include <sys/uio.h>
59 #include <limits.h>
60 #include <float.h>
61 #include <math.h>
62 #include <pthread.h>
63
64 #if defined(__sun)
65 #include "solarisfixes.h"
66 #endif
67
68 #include "redis.h"
69 #include "ae.h" /* Event driven programming library */
70 #include "sds.h" /* Dynamic safe strings */
71 #include "anet.h" /* Networking the easy way */
72 #include "dict.h" /* Hash tables */
73 #include "adlist.h" /* Linked lists */
74 #include "zmalloc.h" /* total memory usage aware version of malloc/free */
75 #include "lzf.h" /* LZF compression library */
76 #include "pqsort.h" /* Partial qsort for SORT+LIMIT */
77 #include "zipmap.h" /* Compact dictionary-alike data structure */
78 #include "ziplist.h" /* Compact list data structure */
79 #include "sha1.h" /* SHA1 is used for DEBUG DIGEST */
80 #include "release.h" /* Release and/or git repository information */
81
/* Error codes */
#define REDIS_OK                        0
#define REDIS_ERR                       -1

/* Static server configuration */
#define REDIS_SERVERPORT                6379    /* TCP port */
#define REDIS_MAXIDLETIME               (60*5)  /* default client timeout */
#define REDIS_IOBUF_LEN                 1024
#define REDIS_LOADBUF_LEN               1024
#define REDIS_STATIC_ARGS               8
#define REDIS_DEFAULT_DBNUM             16
#define REDIS_CONFIGLINE_MAX            1024
#define REDIS_OBJFREELIST_MAX           1000000 /* Max number of objects to cache */
#define REDIS_MAX_SYNC_TIME             60      /* Slave can't take more to sync */
#define REDIS_EXPIRELOOKUPS_PER_CRON    10      /* lookup 10 expires per loop */
#define REDIS_MAX_WRITE_PER_EVENT       (1024*64)
#define REDIS_REQUEST_MAX_SIZE          (1024*1024*256) /* max bytes in inline command */

/* If more than REDIS_WRITEV_THRESHOLD write packets are pending, use writev */
#define REDIS_WRITEV_THRESHOLD          3
/* Max number of iovecs used for each writev call */
#define REDIS_WRITEV_IOVEC_COUNT        256
104
/* Hash table parameters */
#define REDIS_HT_MINFILL                10  /* Minimal hash table fill 10% */

/* Command flags */
#define REDIS_CMD_BULK                  1   /* Bulk write command */
#define REDIS_CMD_INLINE                2   /* Inline command */
/* REDIS_CMD_DENYOOM deserves a longer comment: every command marked with
 * this flag returns an error when the 'maxmemory' option is set in the
 * config file and the server is using more than maxmemory bytes of memory.
 * In short, these commands are denied on low memory conditions. */
#define REDIS_CMD_DENYOOM               4
#define REDIS_CMD_FORCE_REPLICATION     8   /* Force replication even if dirty is 0 */
117
/* Object types */
#define REDIS_STRING            0
#define REDIS_LIST              1
#define REDIS_SET               2
#define REDIS_ZSET              3
#define REDIS_HASH              4
#define REDIS_VMPOINTER         8

/* Object encodings. Some kinds of objects, like Strings and Hashes, can be
 * internally represented in multiple ways. The 'encoding' field of the object
 * is set to one of these values. */
#define REDIS_ENCODING_RAW      0   /* Raw representation */
#define REDIS_ENCODING_INT      1   /* Encoded as integer */
#define REDIS_ENCODING_HT       2   /* Encoded as hash table */
#define REDIS_ENCODING_ZIPMAP   3   /* Encoded as zipmap */
#define REDIS_ENCODING_LIST     4   /* Encoded as list (presumably the adlist linked list) */
#define REDIS_ENCODING_ZIPLIST  5   /* Encoded as ziplist */

/* Printable encoding names, indexed by the REDIS_ENCODING_* value. */
static char *strencoding[] = {
    "raw",
    "int",
    "hashtable",
    "zipmap",
    "list",
    "ziplist"
};
139
/* Object types only used for dumping to disk */
#define REDIS_EXPIRETIME        253
#define REDIS_SELECTDB          254
#define REDIS_EOF               255

/* Defines related to the dump file format. Storing a 32 bit length for every
 * short key would take a lot of space, so the two most significant bits of
 * the first byte tell how to interpret the length:
 *
 * 00|000000 => the len is the remaining 6 bits of this byte
 * 01|000000 00000000 => the len is 14 bits: 6 bits of this byte plus the
 *                       8 bits of the next byte
 * 10|000000 [32 bit integer] => a full 32 bit len follows
 * 11|000000 => a specially encoded object follows. The remaining six bits
 *              specify the kind of object that follows.
 *              See the REDIS_RDB_ENC_* defines.
 *
 * Lengths up to 63 are stored using a single byte; most DB keys, and many
 * values, will fit inside. */
#define REDIS_RDB_6BITLEN       0
#define REDIS_RDB_14BITLEN      1
#define REDIS_RDB_32BITLEN      2
#define REDIS_RDB_ENCVAL        3
#define REDIS_RDB_LENERR        UINT_MAX

/* When the length of a string object stored on disk has the first two bits
 * set, the remaining bits specify a special encoding for the object,
 * according to the following defines: */
#define REDIS_RDB_ENC_INT8      0   /* 8 bit signed integer */
#define REDIS_RDB_ENC_INT16     1   /* 16 bit signed integer */
#define REDIS_RDB_ENC_INT32     2   /* 32 bit signed integer */
#define REDIS_RDB_ENC_LZF       3   /* string compressed with FASTLZ */
171
/* Virtual memory: values of the object 'storage' field. */
#define REDIS_VM_MEMORY             0   /* The object is in memory */
#define REDIS_VM_SWAPPED            1   /* The object is on disk */
#define REDIS_VM_SWAPPING           2   /* Redis is swapping this object to disk */
#define REDIS_VM_LOADING            3   /* Redis is loading this object from disk */

/* Virtual memory static configuration.
 * Check vmFindContiguousPages() to know more about these magic numbers. */
#define REDIS_VM_MAX_NEAR_PAGES     65536
#define REDIS_VM_MAX_RANDOM_JUMP    4096
#define REDIS_VM_MAX_THREADS        32
#define REDIS_THREAD_STACK_SIZE     (1024*1024*4)
/* The following is the *percentage* of completed I/O jobs to process when the
 * handler is called. Virtual Memory I/O operations are performed by threads,
 * but once completed they must be processed by the main thread in order to
 * take effect. */
#define REDIS_MAX_COMPLETED_JOBS_PROCESSED 1
189
/* Client flags */
#define REDIS_SLAVE         1   /* This client is a slave server */
#define REDIS_MASTER        2   /* This client is a master server */
#define REDIS_MONITOR       4   /* This client is a slave monitor, see MONITOR */
#define REDIS_MULTI         8   /* This client is in a MULTI context */
#define REDIS_BLOCKED       16  /* The client is waiting in a blocking operation */
#define REDIS_IO_WAIT       32  /* The client is waiting for Virtual Memory I/O */
#define REDIS_DIRTY_CAS     64  /* Watched keys modified. EXEC will fail. */

/* Slave replication state - slave side */
#define REDIS_REPL_NONE         0   /* No active replication */
#define REDIS_REPL_CONNECT      1   /* Must connect to master */
#define REDIS_REPL_CONNECTED    2   /* Connected to master */

/* Slave replication state - from the point of view of the master.
 * Note that in the SEND_BULK and ONLINE states the slave receives new updates
 * in its output queue. In the WAIT_BGSAVE state instead the server is waiting
 * to start the next background saving in order to send updates to it. */
#define REDIS_REPL_WAIT_BGSAVE_START    3   /* master waits bgsave to start feeding it */
#define REDIS_REPL_WAIT_BGSAVE_END      4   /* master waits bgsave (presumably its end) to start bulk DB transmission */
#define REDIS_REPL_SEND_BULK            5   /* master is sending the bulk DB */
#define REDIS_REPL_ONLINE               6   /* bulk DB already transmitted, receive updates */
212
/* List related stuff */
#define REDIS_HEAD          0
#define REDIS_TAIL          1

/* Sort operations */
#define REDIS_SORT_GET      0
#define REDIS_SORT_ASC      1
#define REDIS_SORT_DESC     2
#define REDIS_SORTKEY_MAX   1024

/* Log levels */
#define REDIS_DEBUG         0
#define REDIS_VERBOSE       1
#define REDIS_NOTICE        2
#define REDIS_WARNING       3
228
/* Anti-warning macro: evaluates V and discards the result.
 * The argument is parenthesized so that any expression (for example
 * REDIS_NOTUSED(a + b)) is cast to void as a whole instead of only its
 * first operand. */
#define REDIS_NOTUSED(V) ((void)(V))
231
/* Skiplist parameters */
#define ZSKIPLIST_MAXLEVEL              32      /* Should be enough for 2^32 elements */
#define ZSKIPLIST_P                     0.25    /* Skiplist P = 1/4 */

/* Append only defines */
#define APPENDFSYNC_NO                  0
#define APPENDFSYNC_ALWAYS              1
#define APPENDFSYNC_EVERYSEC            2

/* Zip structure related defaults */
#define REDIS_HASH_MAX_ZIPMAP_ENTRIES   64
#define REDIS_HASH_MAX_ZIPMAP_VALUE     512
#define REDIS_LIST_MAX_ZIPLIST_ENTRIES  1024
#define REDIS_LIST_MAX_ZIPLIST_VALUE    32
245
/* We can print the stacktrace, so our assert is defined this way.
 *
 * redisAssert(_e): evaluates _e once; when it is false, calls _redisAssert()
 * with the stringified expression, the file name and the line number, and
 * then terminates the process with _exit(1).
 *
 * redisPanic(_e): unconditionally calls _redisPanic() with the stringified
 * argument, the file name and the line number, then terminates the process
 * with _exit(1). The whole expansion is parenthesized so that the macro
 * behaves as a single expression wherever it is used. */
#define redisAssert(_e) ((_e)?(void)0 : (_redisAssert(#_e,__FILE__,__LINE__),_exit(1)))
#define redisPanic(_e) (_redisPanic(#_e,__FILE__,__LINE__),_exit(1))
static void _redisAssert(char *estr, char *file, int line);
static void _redisPanic(char *msg, char *file, int line);
251
/*================================= Data types ============================== */

/* A redis object, that is a type able to hold a string / list / set */

/* The actual Redis Object.
 *
 * The first 32 bits are bit-fields; 'refcount' and 'ptr' follow.
 * NOTE(review): no VM-specific fields are declared in this structure;
 * values that live in the swap file appear to be described by the separate
 * vmPointer structure instead - confirm. */
typedef struct redisObject {
    unsigned int type:4;
    unsigned int storage:2;     /* REDIS_VM_MEMORY or REDIS_VM_SWAPPING */
    unsigned int encoding:4;
    unsigned int lru:22;        /* lru time (relative to server.lruclock) */
    int refcount;
    void *ptr;
} robj;
269
/* The VM pointer structure - identifies an object in the swap file.
 *
 * This object is stored in place of the value object in the main
 * key->value hash table representing a database.
 * Note that the first fields (type, storage) are the same as in the
 * redisObject structure, so that vmPointer structures can be accessed even
 * when cast to redisObject structures.
 *
 * This is useful as we don't know whether a value object is on disk or not,
 * but we are always able to read obj->storage to check this. For vmPointer
 * structures "type" is set to REDIS_VMPOINTER (even though, without this
 * field, it is still possible to tell the kind of object from the value of
 * 'storage'). */
typedef struct vmPointer {
    unsigned int type:4;
    unsigned int storage:2;     /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
    unsigned int notused:26;
    unsigned int vtype;         /* type of the object stored in the swap file */
    off_t page;                 /* the page at which the object is stored on disk */
    off_t usedpages;            /* number of pages used on disk */
} vmpointer;
290
/* Macro used to initialize a Redis object allocated on the stack.
 *
 * _var is the object itself (not a pointer to it); _ptr is the value stored
 * in its 'ptr' field. The macro sets refcount to 1, type to REDIS_STRING,
 * encoding to REDIS_ENCODING_RAW and storage to REDIS_VM_MEMORY. The 'lru'
 * field is not touched.
 *
 * The macro is kept near the structure definition to make sure it gets
 * updated when the structure changes, to avoid bugs like bug #85, which was
 * introduced exactly in this way.
 *
 * The expansion deliberately does NOT end with a semicolon: the caller
 * supplies it, so the macro can be used as the body of an if/else branch. */
#define initStaticStringObject(_var,_ptr) do { \
    (_var).refcount = 1; \
    (_var).type = REDIS_STRING; \
    (_var).encoding = REDIS_ENCODING_RAW; \
    (_var).ptr = (_ptr); \
    (_var).storage = REDIS_VM_MEMORY; \
} while(0)
302
303 typedef struct redisDb {
304 dict *dict; /* The keyspace for this DB */
305 dict *expires; /* Timeout of keys with a timeout set */
306 dict *blocking_keys; /* Keys with clients waiting for data (BLPOP) */
307 dict *io_keys; /* Keys with clients waiting for VM I/O */
308 dict *watched_keys; /* WATCHED keys for MULTI/EXEC CAS */
309 int id;
310 } redisDb;
311
312 /* Client MULTI/EXEC state */
313 typedef struct multiCmd {
314 robj **argv;
315 int argc;
316 struct redisCommand *cmd;
317 } multiCmd;
318
319 typedef struct multiState {
320 multiCmd *commands; /* Array of MULTI commands */
321 int count; /* Total number of MULTI commands */
322 } multiState;
323
324 /* With multiplexing we need to take per-clinet state.
325 * Clients are taken in a liked list. */
326 typedef struct redisClient {
327 int fd;
328 redisDb *db;
329 int dictid;
330 sds querybuf;
331 robj **argv, **mbargv;
332 int argc, mbargc;
333 int bulklen; /* bulk read len. -1 if not in bulk read mode */
334 int multibulk; /* multi bulk command format active */
335 list *reply;
336 int sentlen;
337 time_t lastinteraction; /* time of the last interaction, used for timeout */
338 int flags; /* REDIS_SLAVE | REDIS_MONITOR | REDIS_MULTI ... */
339 int slaveseldb; /* slave selected db, if this client is a slave */
340 int authenticated; /* when requirepass is non-NULL */
341 int replstate; /* replication state if this is a slave */
342 int repldbfd; /* replication DB file descriptor */
343 long repldboff; /* replication DB file offset */
344 off_t repldbsize; /* replication DB file size */
345 multiState mstate; /* MULTI/EXEC state */
346 robj **blocking_keys; /* The key we are waiting to terminate a blocking
347 * operation such as BLPOP. Otherwise NULL. */
348 int blocking_keys_num; /* Number of blocking keys */
349 time_t blockingto; /* Blocking operation timeout. If UNIX current time
350 * is >= blockingto then the operation timed out. */
351 list *io_keys; /* Keys this client is waiting to be loaded from the
352 * swap file in order to continue. */
353 list *watched_keys; /* Keys WATCHED for MULTI/EXEC CAS */
354 dict *pubsub_channels; /* channels a client is interested in (SUBSCRIBE) */
355 list *pubsub_patterns; /* patterns a client is interested in (SUBSCRIBE) */
356 } redisClient;
357
/* One save parameter: a number of seconds paired with a number of changes. */
struct saveparam {
    time_t seconds;
    int changes;
};
362
363 /* Global server state structure */
364 struct redisServer {
365 int port;
366 int fd;
367 redisDb *db;
368 long long dirty; /* changes to DB from the last save */
369 list *clients;
370 list *slaves, *monitors;
371 char neterr[ANET_ERR_LEN];
372 aeEventLoop *el;
373 int cronloops; /* number of times the cron function run */
374 list *objfreelist; /* A list of freed objects to avoid malloc() */
375 time_t lastsave; /* Unix time of last save succeeede */
376 /* Fields used only for stats */
377 time_t stat_starttime; /* server start time */
378 long long stat_numcommands; /* number of processed commands */
379 long long stat_numconnections; /* number of connections received */
380 long long stat_expiredkeys; /* number of expired keys */
381 /* Configuration */
382 int verbosity;
383 int glueoutputbuf;
384 int maxidletime;
385 int dbnum;
386 int daemonize;
387 int appendonly;
388 int appendfsync;
389 int no_appendfsync_on_rewrite;
390 int shutdown_asap;
391 time_t lastfsync;
392 int appendfd;
393 int appendseldb;
394 char *pidfile;
395 pid_t bgsavechildpid;
396 pid_t bgrewritechildpid;
397 sds bgrewritebuf; /* buffer taken by parent during oppend only rewrite */
398 sds aofbuf; /* AOF buffer, written before entering the event loop */
399 struct saveparam *saveparams;
400 int saveparamslen;
401 char *logfile;
402 char *bindaddr;
403 char *dbfilename;
404 char *appendfilename;
405 char *requirepass;
406 int rdbcompression;
407 int activerehashing;
408 /* Replication related */
409 int isslave;
410 char *masterauth;
411 char *masterhost;
412 int masterport;
413 redisClient *master; /* client that is master for this slave */
414 int replstate;
415 unsigned int maxclients;
416 unsigned long long maxmemory;
417 unsigned int blpop_blocked_clients;
418 unsigned int vm_blocked_clients;
419 /* Sort parameters - qsort_r() is only available under BSD so we
420 * have to take this state global, in order to pass it to sortCompare() */
421 int sort_desc;
422 int sort_alpha;
423 int sort_bypattern;
424 /* Virtual memory configuration */
425 int vm_enabled;
426 char *vm_swap_file;
427 off_t vm_page_size;
428 off_t vm_pages;
429 unsigned long long vm_max_memory;
430 /* Zip structure config */
431 size_t hash_max_zipmap_entries;
432 size_t hash_max_zipmap_value;
433 size_t list_max_ziplist_entries;
434 size_t list_max_ziplist_value;
435 /* Virtual memory state */
436 FILE *vm_fp;
437 int vm_fd;
438 off_t vm_next_page; /* Next probably empty page */
439 off_t vm_near_pages; /* Number of pages allocated sequentially */
440 unsigned char *vm_bitmap; /* Bitmap of free/used pages */
441 time_t unixtime; /* Unix time sampled every second. */
442 /* Virtual memory I/O threads stuff */
443 /* An I/O thread process an element taken from the io_jobs queue and
444 * put the result of the operation in the io_done list. While the
445 * job is being processed, it's put on io_processing queue. */
446 list *io_newjobs; /* List of VM I/O jobs yet to be processed */
447 list *io_processing; /* List of VM I/O jobs being processed */
448 list *io_processed; /* List of VM I/O jobs already processed */
449 list *io_ready_clients; /* Clients ready to be unblocked. All keys loaded */
450 pthread_mutex_t io_mutex; /* lock to access io_jobs/io_done/io_thread_job */
451 pthread_mutex_t obj_freelist_mutex; /* safe redis objects creation/free */
452 pthread_mutex_t io_swapfile_mutex; /* So we can lseek + write */
453 pthread_attr_t io_threads_attr; /* attributes for threads creation */
454 int io_active_threads; /* Number of running I/O threads */
455 int vm_max_threads; /* Max number of I/O threads running at the same time */
456 /* Our main thread is blocked on the event loop, locking for sockets ready
457 * to be read or written, so when a threaded I/O operation is ready to be
458 * processed by the main thread, the I/O thread will use a unix pipe to
459 * awake the main thread. The followings are the two pipe FDs. */
460 int io_ready_pipe_read;
461 int io_ready_pipe_write;
462 /* Virtual memory stats */
463 unsigned long long vm_stats_used_pages;
464 unsigned long long vm_stats_swapped_objects;
465 unsigned long long vm_stats_swapouts;
466 unsigned long long vm_stats_swapins;
467 /* Pubsub */
468 dict *pubsub_channels; /* Map channels to list of subscribed clients */
469 list *pubsub_patterns; /* A list of pubsub_patterns */
470 /* Misc */
471 FILE *devnull;
472 unsigned lruclock:22; /* clock incrementing every minute, for LRU */
473 unsigned lruclock_padding:10;
474 };
475
476 typedef struct pubsubPattern {
477 redisClient *client;
478 robj *pattern;
479 } pubsubPattern;
480
481 typedef void redisCommandProc(redisClient *c);
482 typedef void redisVmPreloadProc(redisClient *c, struct redisCommand *cmd, int argc, robj **argv);
483 struct redisCommand {
484 char *name;
485 redisCommandProc *proc;
486 int arity;
487 int flags;
488 /* Use a function to determine which keys need to be loaded
489 * in the background prior to executing this command. Takes precedence
490 * over vm_firstkey and others, ignored when NULL */
491 redisVmPreloadProc *vm_preload_proc;
492 /* What keys should be loaded in background when calling this command? */
493 int vm_firstkey; /* The first argument that's a key (0 = no keys) */
494 int vm_lastkey; /* THe last argument that's a key */
495 int vm_keystep; /* The step between first and last key */
496 };
497
/* Associates a name with a pointer value stored as an unsigned long. */
struct redisFunctionSym {
    char *name;
    unsigned long pointer;
};
502
503 typedef struct _redisSortObject {
504 robj *obj;
505 union {
506 double score;
507 robj *cmpobj;
508 } u;
509 } redisSortObject;
510
511 typedef struct _redisSortOperation {
512 int type;
513 robj *pattern;
514 } redisSortOperation;
515
516 /* ZSETs use a specialized version of Skiplists */
517
518 typedef struct zskiplistNode {
519 struct zskiplistNode **forward;
520 struct zskiplistNode *backward;
521 unsigned int *span;
522 double score;
523 robj *obj;
524 } zskiplistNode;
525
526 typedef struct zskiplist {
527 struct zskiplistNode *header, *tail;
528 unsigned long length;
529 int level;
530 } zskiplist;
531
532 typedef struct zset {
533 dict *dict;
534 zskiplist *zsl;
535 } zset;
536
537 /* Our shared "common" objects */
538
539 #define REDIS_SHARED_INTEGERS 10000
540 struct sharedObjectsStruct {
541 robj *crlf, *ok, *err, *emptybulk, *czero, *cone, *cnegone, *pong, *space,
542 *colon, *nullbulk, *nullmultibulk, *queued,
543 *emptymultibulk, *wrongtypeerr, *nokeyerr, *syntaxerr, *sameobjecterr,
544 *outofrangeerr, *plus,
545 *select0, *select1, *select2, *select3, *select4,
546 *select5, *select6, *select7, *select8, *select9,
547 *messagebulk, *pmessagebulk, *subscribebulk, *unsubscribebulk, *mbulk3,
548 *mbulk4, *psubscribebulk, *punsubscribebulk,
549 *integers[REDIS_SHARED_INTEGERS];
550 } shared;
551
/* Global vars that are actually used as constants. The following double
 * values are used for double on-disk serialization, and are initialized
 * at runtime to avoid strange compiler optimizations. */

static double R_Zero;
static double R_PosInf;
static double R_NegInf;
static double R_Nan;
557
558 /* VM threaded I/O request message */
559 #define REDIS_IOJOB_LOAD 0 /* Load from disk to memory */
560 #define REDIS_IOJOB_PREPARE_SWAP 1 /* Compute needed pages */
561 #define REDIS_IOJOB_DO_SWAP 2 /* Swap from memory to disk */
562 typedef struct iojob {
563 int type; /* Request type, REDIS_IOJOB_* */
564 redisDb *db;/* Redis database */
565 robj *key; /* This I/O request is about swapping this key */
566 robj *id; /* Unique identifier of this job:
567 this is the object to swap for REDIS_IOREQ_*_SWAP, or the
568 vmpointer objct for REDIS_IOREQ_LOAD. */
569 robj *val; /* the value to swap for REDIS_IOREQ_*_SWAP, otherwise this
570 * field is populated by the I/O thread for REDIS_IOREQ_LOAD. */
571 off_t page; /* Swap page where to read/write the object */
572 off_t pages; /* Swap pages needed to save object. PREPARE_SWAP return val */
573 int canceled; /* True if this command was canceled by blocking side of VM */
574 pthread_t thread; /* ID of the thread processing this entry */
575 } iojob;
576
577 /*================================ Prototypes =============================== */
578
579 static void freeStringObject(robj *o);
580 static void freeListObject(robj *o);
581 static void freeSetObject(robj *o);
582 static void decrRefCount(void *o);
583 static robj *createObject(int type, void *ptr);
584 static void freeClient(redisClient *c);
585 static int rdbLoad(char *filename);
586 static void addReply(redisClient *c, robj *obj);
587 static void addReplySds(redisClient *c, sds s);
588 static void incrRefCount(robj *o);
589 static int rdbSaveBackground(char *filename);
590 static robj *createStringObject(char *ptr, size_t len);
591 static robj *dupStringObject(robj *o);
592 static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc);
593 static void replicationFeedMonitors(list *monitors, int dictid, robj **argv, int argc);
594 static void flushAppendOnlyFile(void);
595 static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc);
596 static int syncWithMaster(void);
597 static robj *tryObjectEncoding(robj *o);
598 static robj *getDecodedObject(robj *o);
599 static int removeExpire(redisDb *db, robj *key);
600 static int expireIfNeeded(redisDb *db, robj *key);
601 static int deleteIfVolatile(redisDb *db, robj *key);
602 static int dbDelete(redisDb *db, robj *key);
603 static time_t getExpire(redisDb *db, robj *key);
604 static int setExpire(redisDb *db, robj *key, time_t when);
605 static void updateSlavesWaitingBgsave(int bgsaveerr);
606 static void freeMemoryIfNeeded(void);
607 static int processCommand(redisClient *c);
608 static void setupSigSegvAction(void);
609 static void rdbRemoveTempFile(pid_t childpid);
610 static void aofRemoveTempFile(pid_t childpid);
611 static size_t stringObjectLen(robj *o);
612 static void processInputBuffer(redisClient *c);
613 static zskiplist *zslCreate(void);
614 static void zslFree(zskiplist *zsl);
615 static void zslInsert(zskiplist *zsl, double score, robj *obj);
616 static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask);
617 static void initClientMultiState(redisClient *c);
618 static void freeClientMultiState(redisClient *c);
619 static void queueMultiCommand(redisClient *c, struct redisCommand *cmd);
620 static void unblockClientWaitingData(redisClient *c);
621 static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele);
622 static void vmInit(void);
623 static void vmMarkPagesFree(off_t page, off_t count);
624 static robj *vmLoadObject(robj *o);
625 static robj *vmPreviewObject(robj *o);
626 static int vmSwapOneObjectBlocking(void);
627 static int vmSwapOneObjectThreaded(void);
628 static int vmCanSwapOut(void);
629 static int tryFreeOneObjectFromFreelist(void);
630 static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask);
631 static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata, int mask);
632 static void vmCancelThreadedIOJob(robj *o);
633 static void lockThreadedIO(void);
634 static void unlockThreadedIO(void);
635 static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db);
636 static void freeIOJob(iojob *j);
637 static void queueIOJob(iojob *j);
638 static int vmWriteObjectOnSwap(robj *o, off_t page);
639 static robj *vmReadObjectFromSwap(off_t page, int type);
640 static void waitEmptyIOJobsQueue(void);
641 static void vmReopenSwapFile(void);
642 static int vmFreePage(off_t page);
643 static void zunionInterBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv);
644 static void execBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv);
645 static int blockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd);
646 static int dontWaitForSwappedKey(redisClient *c, robj *key);
647 static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key);
648 static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask);
649 static struct redisCommand *lookupCommand(char *name);
650 static void call(redisClient *c, struct redisCommand *cmd);
651 static void resetClient(redisClient *c);
652 static void convertToRealHash(robj *o);
653 static void listTypeConvert(robj *o, int enc);
654 static int pubsubUnsubscribeAllChannels(redisClient *c, int notify);
655 static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify);
656 static void freePubsubPattern(void *p);
657 static int listMatchPubsubPattern(void *a, void *b);
658 static int compareStringObjects(robj *a, robj *b);
659 static int equalStringObjects(robj *a, robj *b);
660 static void usage();
661 static int rewriteAppendOnlyFileBackground(void);
662 static vmpointer *vmSwapObjectBlocking(robj *val);
663 static int prepareForShutdown();
664 static void touchWatchedKey(redisDb *db, robj *key);
665 static void touchWatchedKeysOnFlush(int dbid);
666 static void unwatchAllKeys(redisClient *c);
667
668 static void authCommand(redisClient *c);
669 static void pingCommand(redisClient *c);
670 static void echoCommand(redisClient *c);
671 static void setCommand(redisClient *c);
672 static void setnxCommand(redisClient *c);
673 static void setexCommand(redisClient *c);
674 static void getCommand(redisClient *c);
675 static void delCommand(redisClient *c);
676 static void existsCommand(redisClient *c);
677 static void incrCommand(redisClient *c);
678 static void decrCommand(redisClient *c);
679 static void incrbyCommand(redisClient *c);
680 static void decrbyCommand(redisClient *c);
681 static void selectCommand(redisClient *c);
682 static void randomkeyCommand(redisClient *c);
683 static void keysCommand(redisClient *c);
684 static void dbsizeCommand(redisClient *c);
685 static void lastsaveCommand(redisClient *c);
686 static void saveCommand(redisClient *c);
687 static void bgsaveCommand(redisClient *c);
688 static void bgrewriteaofCommand(redisClient *c);
689 static void shutdownCommand(redisClient *c);
690 static void moveCommand(redisClient *c);
691 static void renameCommand(redisClient *c);
692 static void renamenxCommand(redisClient *c);
693 static void lpushCommand(redisClient *c);
694 static void rpushCommand(redisClient *c);
695 static void lpushxCommand(redisClient *c);
696 static void rpushxCommand(redisClient *c);
697 static void linsertCommand(redisClient *c);
698 static void lpopCommand(redisClient *c);
699 static void rpopCommand(redisClient *c);
700 static void llenCommand(redisClient *c);
701 static void lindexCommand(redisClient *c);
702 static void lrangeCommand(redisClient *c);
703 static void ltrimCommand(redisClient *c);
704 static void typeCommand(redisClient *c);
705 static void lsetCommand(redisClient *c);
706 static void saddCommand(redisClient *c);
707 static void sremCommand(redisClient *c);
708 static void smoveCommand(redisClient *c);
709 static void sismemberCommand(redisClient *c);
710 static void scardCommand(redisClient *c);
711 static void spopCommand(redisClient *c);
712 static void srandmemberCommand(redisClient *c);
713 static void sinterCommand(redisClient *c);
714 static void sinterstoreCommand(redisClient *c);
715 static void sunionCommand(redisClient *c);
716 static void sunionstoreCommand(redisClient *c);
717 static void sdiffCommand(redisClient *c);
718 static void sdiffstoreCommand(redisClient *c);
719 static void syncCommand(redisClient *c);
720 static void flushdbCommand(redisClient *c);
721 static void flushallCommand(redisClient *c);
722 static void sortCommand(redisClient *c);
723 static void lremCommand(redisClient *c);
724 static void rpoplpushcommand(redisClient *c);
725 static void infoCommand(redisClient *c);
726 static void mgetCommand(redisClient *c);
727 static void monitorCommand(redisClient *c);
728 static void expireCommand(redisClient *c);
729 static void expireatCommand(redisClient *c);
730 static void getsetCommand(redisClient *c);
731 static void ttlCommand(redisClient *c);
732 static void slaveofCommand(redisClient *c);
733 static void debugCommand(redisClient *c);
734 static void msetCommand(redisClient *c);
735 static void msetnxCommand(redisClient *c);
736 static void zaddCommand(redisClient *c);
737 static void zincrbyCommand(redisClient *c);
738 static void zrangeCommand(redisClient *c);
739 static void zrangebyscoreCommand(redisClient *c);
740 static void zcountCommand(redisClient *c);
741 static void zrevrangeCommand(redisClient *c);
742 static void zcardCommand(redisClient *c);
743 static void zremCommand(redisClient *c);
744 static void zscoreCommand(redisClient *c);
745 static void zremrangebyscoreCommand(redisClient *c);
746 static void multiCommand(redisClient *c);
747 static void execCommand(redisClient *c);
748 static void discardCommand(redisClient *c);
749 static void blpopCommand(redisClient *c);
750 static void brpopCommand(redisClient *c);
751 static void appendCommand(redisClient *c);
752 static void substrCommand(redisClient *c);
753 static void zrankCommand(redisClient *c);
754 static void zrevrankCommand(redisClient *c);
755 static void hsetCommand(redisClient *c);
756 static void hsetnxCommand(redisClient *c);
757 static void hgetCommand(redisClient *c);
758 static void hmsetCommand(redisClient *c);
759 static void hmgetCommand(redisClient *c);
760 static void hdelCommand(redisClient *c);
761 static void hlenCommand(redisClient *c);
762 static void zremrangebyrankCommand(redisClient *c);
763 static void zunionstoreCommand(redisClient *c);
764 static void zinterstoreCommand(redisClient *c);
765 static void hkeysCommand(redisClient *c);
766 static void hvalsCommand(redisClient *c);
767 static void hgetallCommand(redisClient *c);
768 static void hexistsCommand(redisClient *c);
769 static void configCommand(redisClient *c);
770 static void hincrbyCommand(redisClient *c);
771 static void subscribeCommand(redisClient *c);
772 static void unsubscribeCommand(redisClient *c);
773 static void psubscribeCommand(redisClient *c);
774 static void punsubscribeCommand(redisClient *c);
775 static void publishCommand(redisClient *c);
776 static void watchCommand(redisClient *c);
777 static void unwatchCommand(redisClient *c);
778
779 /*================================= Globals ================================= */
780
/* Global vars */
static struct redisServer server; /* server global state */
/* Pointer to the command table actually consulted at runtime.
 * NOTE(review): presumably set up from readonlyCommandTable during
 * initialization; the assignment is not in this part of the file - confirm. */
static struct redisCommand *commandTable;
/* Static description of every command known to the server, one entry per
 * command. Each initializer lists, in order:
 *
 *   - the command name as typed by clients,
 *   - the C function implementing it,
 *   - an integer (presumably the arity, with negative values meaning
 *     "at least N arguments" - TODO confirm against struct redisCommand),
 *   - a bitwise OR of REDIS_CMD_* flags,
 *   - a function pointer that is NULL for most commands and names a
 *     *BlockClientOnSwappedKeys function for zunionstore/zinterstore/exec,
 *   - three integers (presumably first key, last key and key step used to
 *     locate key arguments - TODO confirm against struct redisCommand).
 */
static struct redisCommand readonlyCommandTable[] = {
    {"get",getCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"set",setCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
    {"setnx",setnxCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
    {"setex",setexCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
    {"append",appendCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"substr",substrCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
    {"del",delCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
    {"exists",existsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"incr",incrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"decr",decrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"mget",mgetCommand,-2,REDIS_CMD_INLINE,NULL,1,-1,1},
    {"rpush",rpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"lpush",lpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"rpushx",rpushxCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"lpushx",lpushxCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"linsert",linsertCommand,5,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"rpop",rpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"lpop",lpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"brpop",brpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
    {"blpop",blpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
    {"llen",llenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"lindex",lindexCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
    {"lset",lsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"lrange",lrangeCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
    {"ltrim",ltrimCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
    {"lrem",lremCommand,4,REDIS_CMD_BULK,NULL,1,1,1},
    {"rpoplpush",rpoplpushcommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,2,1},
    {"sadd",saddCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"srem",sremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
    {"smove",smoveCommand,4,REDIS_CMD_BULK,NULL,1,2,1},
    {"sismember",sismemberCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
    {"scard",scardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"spop",spopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"srandmember",srandmemberCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"sinter",sinterCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
    {"sinterstore",sinterstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
    {"sunion",sunionCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
    {"sunionstore",sunionstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
    {"sdiff",sdiffCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
    {"sdiffstore",sdiffstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
    {"smembers",sinterCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"zadd",zaddCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"zincrby",zincrbyCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"zrem",zremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
    {"zremrangebyscore",zremrangebyscoreCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
    {"zremrangebyrank",zremrangebyrankCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
    {"zunionstore",zunionstoreCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
    {"zinterstore",zinterstoreCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
    {"zrange",zrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
    {"zrangebyscore",zrangebyscoreCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
    {"zcount",zcountCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
    {"zrevrange",zrevrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
    {"zcard",zcardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"zscore",zscoreCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"zrank",zrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
    {"zrevrank",zrevrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
    {"hset",hsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"hsetnx",hsetnxCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"hget",hgetCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
    {"hmset",hmsetCommand,-4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"hmget",hmgetCommand,-3,REDIS_CMD_BULK,NULL,1,1,1},
    {"hincrby",hincrbyCommand,4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"hdel",hdelCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
    {"hlen",hlenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"hkeys",hkeysCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"hvals",hvalsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"hgetall",hgetallCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"hexists",hexistsCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
    {"incrby",incrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"decrby",decrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"getset",getsetCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"mset",msetCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
    {"msetnx",msetnxCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
    {"randomkey",randomkeyCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"select",selectCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
    {"move",moveCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
    {"rename",renameCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
    {"renamenx",renamenxCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
    {"expire",expireCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
    {"expireat",expireatCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
    {"keys",keysCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
    {"dbsize",dbsizeCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"auth",authCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
    {"ping",pingCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"echo",echoCommand,2,REDIS_CMD_BULK,NULL,0,0,0},
    {"save",saveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"bgsave",bgsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"bgrewriteaof",bgrewriteaofCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"shutdown",shutdownCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"lastsave",lastsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"type",typeCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"multi",multiCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"exec",execCommand,1,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,execBlockClientOnSwappedKeys,0,0,0},
    {"discard",discardCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"sync",syncCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"flushdb",flushdbCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"flushall",flushallCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"sort",sortCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"info",infoCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"monitor",monitorCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"ttl",ttlCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"slaveof",slaveofCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
    {"debug",debugCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
    {"config",configCommand,-2,REDIS_CMD_BULK,NULL,0,0,0},
    {"subscribe",subscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
    {"unsubscribe",unsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"psubscribe",psubscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
    {"punsubscribe",punsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"publish",publishCommand,3,REDIS_CMD_BULK|REDIS_CMD_FORCE_REPLICATION,NULL,0,0,0},
    {"watch",watchCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
    {"unwatch",unwatchCommand,1,REDIS_CMD_INLINE,NULL,0,0,0}
};
897
898 /*============================ Utility functions ============================ */
899
/* Glob-style pattern matching.
 *
 * Matches the first 'stringLen' bytes of 'string' against the first
 * 'patternLen' bytes of 'pattern'. Neither buffer needs to be nul
 * terminated: the function never reads past the given lengths.
 *
 * Supported syntax:
 *   *        any sequence of characters (including none)
 *   ?        exactly one character
 *   [abc]    one character out of the set; [^abc] negates the set,
 *            a-z denotes a range, \x takes x literally inside the set
 *   \x       the character x taken literally
 *
 * If 'nocase' is non zero characters are compared case insensitively.
 * Returns 1 when the whole string matches the whole pattern, 0 otherwise. */
static int stringmatchlen(const char *pattern, int patternLen,
        const char *string, int stringLen, int nocase)
{
    while(patternLen) {
        switch(pattern[0]) {
        case '*':
            /* Collapse a run of '*' into one, staying inside the pattern. */
            while (patternLen > 1 && pattern[1] == '*') {
                pattern++;
                patternLen--;
            }
            if (patternLen == 1)
                return 1; /* match: a trailing '*' swallows the rest */
            /* Try to match the remaining pattern at every suffix. */
            while(stringLen) {
                if (stringmatchlen(pattern+1, patternLen-1,
                            string, stringLen, nocase))
                    return 1; /* match */
                string++;
                stringLen--;
            }
            return 0; /* no match */
        case '?':
            if (stringLen == 0)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        case '[':
        {
            int not, match;

            /* A bracket expression always consumes one character. */
            if (stringLen == 0)
                return 0; /* no match */
            pattern++;
            patternLen--;
            not = patternLen && pattern[0] == '^';
            if (not) {
                pattern++;
                patternLen--;
            }
            match = 0;
            while(1) {
                if (patternLen == 0) {
                    /* Unterminated set: step back so that the common
                     * advance after the switch lands on the pattern end. */
                    pattern--;
                    patternLen++;
                    break;
                } else if (patternLen >= 2 && pattern[0] == '\\') {
                    pattern++;
                    patternLen--;
                    if (pattern[0] == string[0])
                        match = 1;
                } else if (pattern[0] == ']') {
                    break;
                } else if (patternLen >= 3 && pattern[1] == '-') {
                    int start = pattern[0];
                    int end = pattern[2];
                    int c = string[0];
                    if (start > end) {
                        int t = start;
                        start = end;
                        end = t;
                    }
                    if (nocase) {
                        start = tolower(start);
                        end = tolower(end);
                        c = tolower(c);
                    }
                    pattern += 2;
                    patternLen -= 2;
                    if (c >= start && c <= end)
                        match = 1;
                } else {
                    if (!nocase) {
                        if (pattern[0] == string[0])
                            match = 1;
                    } else {
                        if (tolower((unsigned char)pattern[0]) ==
                            tolower((unsigned char)string[0]))
                            match = 1;
                    }
                }
                pattern++;
                patternLen--;
            }
            if (not)
                match = !match;
            if (!match)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        }
        case '\\':
            /* A trailing backslash is taken literally. */
            if (patternLen >= 2) {
                pattern++;
                patternLen--;
            }
            /* fall through */
        default:
            if (stringLen == 0)
                return 0; /* no match: a literal needs a character */
            if (!nocase) {
                if (pattern[0] != string[0])
                    return 0; /* no match */
            } else {
                if (tolower((unsigned char)pattern[0]) !=
                    tolower((unsigned char)string[0]))
                    return 0; /* no match */
            }
            string++;
            stringLen--;
            break;
        }
        pattern++;
        patternLen--;
        if (stringLen == 0) {
            /* String exhausted: only trailing '*' may remain. */
            while(patternLen && *pattern == '*') {
                pattern++;
                patternLen--;
            }
            break;
        }
    }
    if (patternLen == 0 && stringLen == 0)
        return 1;
    return 0;
}
1022
/* Convenience front end to stringmatchlen() for nul terminated C strings:
 * both lengths are taken with strlen(). */
static int stringmatch(const char *pattern, const char *string, int nocase) {
    int plen = strlen(pattern);
    int slen = strlen(string);

    return stringmatchlen(pattern,plen,string,slen,nocase);
}
1026
/* Convert a string representing an amount of memory into the number of
 * bytes, so for instance memtoll("1gb") will return 1073741824 that is
 * (1024*1024*1024).
 *
 * The string is an optional '-' sign, a run of decimal digits and an
 * optional case insensitive unit:
 *   (none) or b  -> 1
 *   k / kb       -> 1000 / 1024
 *   m / mb       -> 1000*1000 / 1024*1024
 *   g / gb       -> 1000*1000*1000 / 1024*1024*1024
 *
 * If 'err' is not NULL, *err is set to 0 on success and to 1 on error.
 * Errors are:
 *   - unknown unit: the bare number is returned (multiplier 1);
 *   - number too long for the internal buffer: LLONG_MAX is returned;
 *   - result not representable in a long long: the value is clamped to
 *     LLONG_MAX or LLONG_MIN. */
static long long memtoll(const char *p, int *err) {
    const char *u;
    char buf[128];
    long mul; /* unit multiplier */
    long long val;
    unsigned int digits;

    if (err) *err = 0;
    /* Search the first non digit character. The cast keeps isdigit()
     * defined for bytes with the high bit set. */
    u = p;
    if (*u == '-') u++;
    while(*u && isdigit((unsigned char)*u)) u++;
    if (*u == '\0' || !strcasecmp(u,"b")) {
        mul = 1;
    } else if (!strcasecmp(u,"k")) {
        mul = 1000;
    } else if (!strcasecmp(u,"kb")) {
        mul = 1024;
    } else if (!strcasecmp(u,"m")) {
        mul = 1000*1000;
    } else if (!strcasecmp(u,"mb")) {
        mul = 1024*1024;
    } else if (!strcasecmp(u,"g")) {
        mul = 1000L*1000*1000;
    } else if (!strcasecmp(u,"gb")) {
        mul = 1024L*1024*1024;
    } else {
        if (err) *err = 1;
        mul = 1;
    }
    digits = u-p;
    if (digits >= sizeof(buf)) {
        if (err) *err = 1;
        return LLONG_MAX;
    }
    memcpy(buf,p,digits);
    buf[digits] = '\0';
    val = strtoll(buf,NULL,10);
    /* mul is always >= 1: refuse to multiply when the product would
     * overflow (signed overflow is undefined behavior). */
    if (val > 0 && val > LLONG_MAX/mul) {
        if (err) *err = 1;
        return LLONG_MAX;
    }
    if (val < 0 && val < LLONG_MIN/mul) {
        if (err) *err = 1;
        return LLONG_MIN;
    }
    return val*mul;
}
1073
/* Convert a long long into a string.
 *
 * At most len-1 characters are stored in 's', always followed by a nul
 * term. If the buffer is too small the textual representation is truncated
 * on the right. Returns the number of characters stored, not counting the
 * nul term; returns 0 without touching 's' when len is 0. */
static int ll2string(char *s, size_t len, long long value) {
    char buf[32], *p;
    unsigned long long v;
    size_t l;

    if (len == 0) return 0;
    /* Negate in the unsigned domain: -value overflows for LLONG_MIN. */
    v = (value < 0) ? 0ULL-(unsigned long long)value
                    : (unsigned long long)value;
    p = buf+31; /* point to the last character */
    do {
        *p-- = '0'+(v%10);
        v /= 10;
    } while(v);
    if (value < 0) *p-- = '-';
    p++;
    l = 32-(p-buf);
    if (l+1 > len) l = len-1; /* Make sure it fits, including the nul term */
    memcpy(s,p,l);
    s[l] = '\0';
    return l;
}
1097
1098 static void redisLog(int level, const char *fmt, ...) {
1099 va_list ap;
1100 FILE *fp;
1101
1102 fp = (server.logfile == NULL) ? stdout : fopen(server.logfile,"a");
1103 if (!fp) return;
1104
1105 va_start(ap, fmt);
1106 if (level >= server.verbosity) {
1107 char *c = ".-*#";
1108 char buf[64];
1109 time_t now;
1110
1111 now = time(NULL);
1112 strftime(buf,64,"%d %b %H:%M:%S",localtime(&now));
1113 fprintf(fp,"[%d] %s %c ",(int)getpid(),buf,c[level]);
1114 vfprintf(fp, fmt, ap);
1115 fprintf(fp,"\n");
1116 fflush(fp);
1117 }
1118 va_end(ap);
1119
1120 if (server.logfile) fclose(fp);
1121 }
1122
1123 /*====================== Hash table type implementation ==================== */
1124
/* This is a hash table type that uses the SDS dynamic strings library as
 * keys and Redis objects as values (objects can hold SDS strings,
 * lists, sets). */
1128
/* Generic destructor callback: releases 'val' with zfree().
 * 'privdata' is not used. */
static void dictVanillaFree(void *privdata, void *val)
{
    DICT_NOTUSED(privdata);
    zfree(val);
}
1134
/* Destructor callback for entries holding a list: 'val' is treated as a
 * list pointer and released with listRelease(). 'privdata' is not used. */
static void dictListDestructor(void *privdata, void *val)
{
    DICT_NOTUSED(privdata);
    listRelease((list*)val);
}
1140
1141 static int dictSdsKeyCompare(void *privdata, const void *key1,
1142 const void *key2)
1143 {
1144 int l1,l2;
1145 DICT_NOTUSED(privdata);
1146
1147 l1 = sdslen((sds)key1);
1148 l2 = sdslen((sds)key2);
1149 if (l1 != l2) return 0;
1150 return memcmp(key1, key2, l1) == 0;
1151 }
1152
/* Destructor callback for Redis objects: drops one reference from 'val'.
 * A NULL value is ignored, since values of swapped out keys are set to
 * NULL. 'privdata' is not used. */
static void dictRedisObjectDestructor(void *privdata, void *val)
{
    DICT_NOTUSED(privdata);

    if (val != NULL) decrRefCount(val);
}
1160
/* Destructor callback for sds strings: releases 'val' with sdsfree().
 * 'privdata' is not used. */
static void dictSdsDestructor(void *privdata, void *val)
{
    DICT_NOTUSED(privdata);

    sdsfree(val);
}
1167
/* Key comparison callback for keys that are robj pointers: compares the
 * two objects' ->ptr fields as sds strings via dictSdsKeyCompare(), so it
 * returns 1 when the strings are equal and 0 otherwise. The ->ptr fields
 * are used as they are, without decoding the objects first. */
static int dictObjKeyCompare(void *privdata, const void *key1,
        const void *key2)
{
    const robj *o1 = key1, *o2 = key2;
    return dictSdsKeyCompare(privdata,o1->ptr,o2->ptr);
}
1174
/* Hash callback for keys that are robj pointers: hashes the sds string
 * stored in the object's ->ptr field with dictGenHashFunction(). */
static unsigned int dictObjHash(const void *key) {
    const robj *o = key;
    return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
}
1179
/* Hash callback for keys that are sds strings: hashes the sdslen() bytes
 * of the key with dictGenHashFunction(). */
static unsigned int dictSdsHash(const void *key) {
    return dictGenHashFunction((unsigned char*)key, sdslen((char*)key));
}
1183
1184 static int dictEncObjKeyCompare(void *privdata, const void *key1,
1185 const void *key2)
1186 {
1187 robj *o1 = (robj*) key1, *o2 = (robj*) key2;
1188 int cmp;
1189
1190 if (o1->encoding == REDIS_ENCODING_INT &&
1191 o2->encoding == REDIS_ENCODING_INT)
1192 return o1->ptr == o2->ptr;
1193
1194 o1 = getDecodedObject(o1);
1195 o2 = getDecodedObject(o2);
1196 cmp = dictSdsKeyCompare(privdata,o1->ptr,o2->ptr);
1197 decrRefCount(o1);
1198 decrRefCount(o2);
1199 return cmp;
1200 }
1201
1202 static unsigned int dictEncObjHash(const void *key) {
1203 robj *o = (robj*) key;
1204
1205 if (o->encoding == REDIS_ENCODING_RAW) {
1206 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1207 } else {
1208 if (o->encoding == REDIS_ENCODING_INT) {
1209 char buf[32];
1210 int len;
1211
1212 len = ll2string(buf,32,(long)o->ptr);
1213 return dictGenHashFunction((unsigned char*)buf, len);
1214 } else {
1215 unsigned int hash;
1216
1217 o = getDecodedObject(o);
1218 hash = dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1219 decrRefCount(o);
1220 return hash;
1221 }
1222 }
1223 }
1224
/* Sets type: keys are (possibly encoded) Redis objects, released with
 * dictRedisObjectDestructor; no value destructor is installed. */
static dictType setDictType = {
    dictEncObjHash,            /* hash function */
    NULL,                      /* key dup */
    NULL,                      /* val dup */
    dictEncObjKeyCompare,      /* key compare */
    dictRedisObjectDestructor, /* key destructor */
    NULL                       /* val destructor */
};
1234
/* Sorted sets hash (note: a skiplist is used in addition to the hash table).
 * Keys are (possibly encoded) Redis objects; values are released with
 * dictVanillaFree, i.e. plain zfree(). */
static dictType zsetDictType = {
    dictEncObjHash,            /* hash function */
    NULL,                      /* key dup */
    NULL,                      /* val dup */
    dictEncObjKeyCompare,      /* key compare */
    dictRedisObjectDestructor, /* key destructor */
    dictVanillaFree            /* val destructor of malloc(sizeof(double)) */
};
1244
/* Db->dict, keys are sds strings, vals are Redis objects.
 * Both keys and values are released when an entry is removed. */
static dictType dbDictType = {
    dictSdsHash,               /* hash function */
    NULL,                      /* key dup */
    NULL,                      /* val dup */
    dictSdsKeyCompare,         /* key compare */
    dictSdsDestructor,         /* key destructor */
    dictRedisObjectDestructor  /* val destructor */
};
1254
/* Db->expires: keys are hashed and compared as sds strings. No destructors
 * are installed, so removing an entry frees neither key nor value.
 * NOTE(review): presumably the key pointers are shared with the main
 * Db->dict, which owns them - confirm against the code adding expires. */
static dictType keyptrDictType = {
    dictSdsHash,               /* hash function */
    NULL,                      /* key dup */
    NULL,                      /* val dup */
    dictSdsKeyCompare,         /* key compare */
    NULL,                      /* key destructor */
    NULL                       /* val destructor */
};
1264
/* Hash type hash table (note that small hashes are represented with zipmaps).
 * Both keys and values are Redis objects released on entry removal. */
static dictType hashDictType = {
    dictEncObjHash,             /* hash function */
    NULL,                       /* key dup */
    NULL,                       /* val dup */
    dictEncObjKeyCompare,       /* key compare */
    dictRedisObjectDestructor,  /* key destructor */
    dictRedisObjectDestructor   /* val destructor */
};
1274
/* Keylist hash table type has unencoded redis objects as keys and
 * lists as values. It's used for blocking operations (BLPOP) and to
 * map swapped keys to a list of clients waiting for these keys to be
 * loaded. */
static dictType keylistDictType = {
    dictObjHash,                /* hash function */
    NULL,                       /* key dup */
    NULL,                       /* val dup */
    dictObjKeyCompare,          /* key compare */
    dictRedisObjectDestructor,  /* key destructor */
    dictListDestructor          /* val destructor */
};
1286
1287 static void version();
1288
1289 /* ========================= Random utility functions ======================= */
1290
/* Redis generally does not try to recover from out of memory conditions
 * when allocating objects or strings, it is not clear if it will be possible
 * to report this condition to the client since the networking layer itself
 * is based on heap allocation for send buffers, so we simply abort.
 * At least the code will be simpler to read...
 *
 * Logs "<msg>: Out of memory" at REDIS_WARNING level, sleeps for one
 * second and then calls abort(), so this function never returns.
 * NOTE(review): the format string ends in '\n' and redisLog() appends a
 * newline of its own, so the log shows an empty line after the message. */
static void oom(const char *msg) {
    redisLog(REDIS_WARNING, "%s: Out of memory\n",msg);
    sleep(1);
    abort();
}
1301
1302 /* ====================== Redis server networking stuff ===================== */
1303 static void closeTimedoutClients(void) {
1304 redisClient *c;
1305 listNode *ln;
1306 time_t now = time(NULL);
1307 listIter li;
1308
1309 listRewind(server.clients,&li);
1310 while ((ln = listNext(&li)) != NULL) {
1311 c = listNodeValue(ln);
1312 if (server.maxidletime &&
1313 !(c->flags & REDIS_SLAVE) && /* no timeout for slaves */
1314 !(c->flags & REDIS_MASTER) && /* no timeout for masters */
1315 dictSize(c->pubsub_channels) == 0 && /* no timeout for pubsub */
1316 listLength(c->pubsub_patterns) == 0 &&
1317 (now - c->lastinteraction > server.maxidletime))
1318 {
1319 redisLog(REDIS_VERBOSE,"Closing idle client");
1320 freeClient(c);
1321 } else if (c->flags & REDIS_BLOCKED) {
1322 if (c->blockingto != 0 && c->blockingto < now) {
1323 addReply(c,shared.nullmultibulk);
1324 unblockClientWaitingData(c);
1325 }
1326 }
1327 }
1328 }
1329
1330 static int htNeedsResize(dict *dict) {
1331 long long size, used;
1332
1333 size = dictSlots(dict);
1334 used = dictSize(dict);
1335 return (size && used && size > DICT_HT_INITIAL_SIZE &&
1336 (used*100/size < REDIS_HT_MINFILL));
1337 }
1338
1339 /* If the percentage of used slots in the HT reaches REDIS_HT_MINFILL
1340 * we resize the hash table to save memory */
1341 static void tryResizeHashTables(void) {
1342 int j;
1343
1344 for (j = 0; j < server.dbnum; j++) {
1345 if (htNeedsResize(server.db[j].dict))
1346 dictResize(server.db[j].dict);
1347 if (htNeedsResize(server.db[j].expires))
1348 dictResize(server.db[j].expires);
1349 }
1350 }
1351
1352 /* Our hash table implementation performs rehashing incrementally while
1353 * we write/read from the hash table. Still if the server is idle, the hash
1354 * table will use two tables for a long time. So we try to use 1 millisecond
1355 * of CPU time at every serverCron() loop in order to rehash some key. */
1356 static void incrementallyRehash(void) {
1357 int j;
1358
1359 for (j = 0; j < server.dbnum; j++) {
1360 if (dictIsRehashing(server.db[j].dict)) {
1361 dictRehashMilliseconds(server.db[j].dict,1);
1362 break; /* already used our millisecond for this loop... */
1363 }
1364 }
1365 }
1366
1367 /* A background saving child (BGSAVE) terminated its work. Handle this. */
1368 void backgroundSaveDoneHandler(int statloc) {
1369 int exitcode = WEXITSTATUS(statloc);
1370 int bysignal = WIFSIGNALED(statloc);
1371
1372 if (!bysignal && exitcode == 0) {
1373 redisLog(REDIS_NOTICE,
1374 "Background saving terminated with success");
1375 server.dirty = 0;
1376 server.lastsave = time(NULL);
1377 } else if (!bysignal && exitcode != 0) {
1378 redisLog(REDIS_WARNING, "Background saving error");
1379 } else {
1380 redisLog(REDIS_WARNING,
1381 "Background saving terminated by signal %d", WTERMSIG(statloc));
1382 rdbRemoveTempFile(server.bgsavechildpid);
1383 }
1384 server.bgsavechildpid = -1;
1385 /* Possibly there are slaves waiting for a BGSAVE in order to be served
1386 * (the first stage of SYNC is a bulk transfer of dump.rdb) */
1387 updateSlavesWaitingBgsave(exitcode == 0 ? REDIS_OK : REDIS_ERR);
1388 }
1389
1390 /* A background append only file rewriting (BGREWRITEAOF) terminated its work.
1391 * Handle this. */
1392 void backgroundRewriteDoneHandler(int statloc) {
1393 int exitcode = WEXITSTATUS(statloc);
1394 int bysignal = WIFSIGNALED(statloc);
1395
1396 if (!bysignal && exitcode == 0) {
1397 int fd;
1398 char tmpfile[256];
1399
1400 redisLog(REDIS_NOTICE,
1401 "Background append only file rewriting terminated with success");
1402 /* Now it's time to flush the differences accumulated by the parent */
1403 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) server.bgrewritechildpid);
1404 fd = open(tmpfile,O_WRONLY|O_APPEND);
1405 if (fd == -1) {
1406 redisLog(REDIS_WARNING, "Not able to open the temp append only file produced by the child: %s", strerror(errno));
1407 goto cleanup;
1408 }
1409 /* Flush our data... */
1410 if (write(fd,server.bgrewritebuf,sdslen(server.bgrewritebuf)) !=
1411 (signed) sdslen(server.bgrewritebuf)) {
1412 redisLog(REDIS_WARNING, "Error or short write trying to flush the parent diff of the append log file in the child temp file: %s", strerror(errno));
1413 close(fd);
1414 goto cleanup;
1415 }
1416 redisLog(REDIS_NOTICE,"Parent diff flushed into the new append log file with success (%lu bytes)",sdslen(server.bgrewritebuf));
1417 /* Now our work is to rename the temp file into the stable file. And
1418 * switch the file descriptor used by the server for append only. */
1419 if (rename(tmpfile,server.appendfilename) == -1) {
1420 redisLog(REDIS_WARNING,"Can't rename the temp append only file into the stable one: %s", strerror(errno));
1421 close(fd);
1422 goto cleanup;
1423 }
1424 /* Mission completed... almost */
1425 redisLog(REDIS_NOTICE,"Append only file successfully rewritten.");
1426 if (server.appendfd != -1) {
1427 /* If append only is actually enabled... */
1428 close(server.appendfd);
1429 server.appendfd = fd;
1430 if (server.appendfsync != APPENDFSYNC_NO) aof_fsync(fd);
1431 server.appendseldb = -1; /* Make sure it will issue SELECT */
1432 redisLog(REDIS_NOTICE,"The new append only file was selected for future appends.");
1433 } else {
1434 /* If append only is disabled we just generate a dump in this
1435 * format. Why not? */
1436 close(fd);
1437 }
1438 } else if (!bysignal && exitcode != 0) {
1439 redisLog(REDIS_WARNING, "Background append only file rewriting error");
1440 } else {
1441 redisLog(REDIS_WARNING,
1442 "Background append only file rewriting terminated by signal %d",
1443 WTERMSIG(statloc));
1444 }
1445 cleanup:
1446 sdsfree(server.bgrewritebuf);
1447 server.bgrewritebuf = sdsempty();
1448 aofRemoveTempFile(server.bgrewritechildpid);
1449 server.bgrewritechildpid = -1;
1450 }
1451
1452 /* This function is called once a background process of some kind terminates,
1453 * as we want to avoid resizing the hash tables when there is a child in order
1454 * to play well with copy-on-write (otherwise when a resize happens lots of
1455 * memory pages are copied). The goal of this function is to update the ability
1456 * for dict.c to resize the hash tables accordingly to the fact we have o not
1457 * running childs. */
1458 static void updateDictResizePolicy(void) {
1459 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1)
1460 dictEnableResize();
1461 else
1462 dictDisableResize();
1463 }
1464
1465 static int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
1466 int j, loops = server.cronloops++;
1467 REDIS_NOTUSED(eventLoop);
1468 REDIS_NOTUSED(id);
1469 REDIS_NOTUSED(clientData);
1470
1471 /* We take a cached value of the unix time in the global state because
1472 * with virtual memory and aging there is to store the current time
1473 * in objects at every object access, and accuracy is not needed.
1474 * To access a global var is faster than calling time(NULL) */
1475 server.unixtime = time(NULL);
1476 /* We have just 21 bits per object for LRU information.
1477 * So we use an (eventually wrapping) LRU clock with minutes resolution.
1478 *
1479 * When we need to select what object to swap, we compute the minimum
1480 * time distance between the current lruclock and the object last access
1481 * lruclock info. Even if clocks will wrap on overflow, there is
1482 * the interesting property that we are sure that at least
1483 * ABS(A-B) minutes passed between current time and timestamp B.
1484 *
1485 * This is not precise but we don't need at all precision, but just
1486 * something statistically reasonable.
1487 */
1488 server.lruclock = (time(NULL)/60)&((1<<21)-1);
1489
1490 /* We received a SIGTERM, shutting down here in a safe way, as it is
1491 * not ok doing so inside the signal handler. */
1492 if (server.shutdown_asap) {
1493 if (prepareForShutdown() == REDIS_OK) exit(0);
1494 redisLog(REDIS_WARNING,"SIGTERM received but errors trying to shut down the server, check the logs for more information");
1495 }
1496
1497 /* Show some info about non-empty databases */
1498 for (j = 0; j < server.dbnum; j++) {
1499 long long size, used, vkeys;
1500
1501 size = dictSlots(server.db[j].dict);
1502 used = dictSize(server.db[j].dict);
1503 vkeys = dictSize(server.db[j].expires);
1504 if (!(loops % 50) && (used || vkeys)) {
1505 redisLog(REDIS_VERBOSE,"DB %d: %lld keys (%lld volatile) in %lld slots HT.",j,used,vkeys,size);
1506 /* dictPrintStats(server.dict); */
1507 }
1508 }
1509
1510 /* We don't want to resize the hash tables while a bacground saving
1511 * is in progress: the saving child is created using fork() that is
1512 * implemented with a copy-on-write semantic in most modern systems, so
1513 * if we resize the HT while there is the saving child at work actually
1514 * a lot of memory movements in the parent will cause a lot of pages
1515 * copied. */
1516 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1) {
1517 if (!(loops % 10)) tryResizeHashTables();
1518 if (server.activerehashing) incrementallyRehash();
1519 }
1520
1521 /* Show information about connected clients */
1522 if (!(loops % 50)) {
1523 redisLog(REDIS_VERBOSE,"%d clients connected (%d slaves), %zu bytes in use",
1524 listLength(server.clients)-listLength(server.slaves),
1525 listLength(server.slaves),
1526 zmalloc_used_memory());
1527 }
1528
1529 /* Close connections of timedout clients */
1530 if ((server.maxidletime && !(loops % 100)) || server.blpop_blocked_clients)
1531 closeTimedoutClients();
1532
1533 /* Check if a background saving or AOF rewrite in progress terminated */
1534 if (server.bgsavechildpid != -1 || server.bgrewritechildpid != -1) {
1535 int statloc;
1536 pid_t pid;
1537
1538 if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) {
1539 if (pid == server.bgsavechildpid) {
1540 backgroundSaveDoneHandler(statloc);
1541 } else {
1542 backgroundRewriteDoneHandler(statloc);
1543 }
1544 updateDictResizePolicy();
1545 }
1546 } else {
1547 /* If there is not a background saving in progress check if
1548 * we have to save now */
1549 time_t now = time(NULL);
1550 for (j = 0; j < server.saveparamslen; j++) {
1551 struct saveparam *sp = server.saveparams+j;
1552
1553 if (server.dirty >= sp->changes &&
1554 now-server.lastsave > sp->seconds) {
1555 redisLog(REDIS_NOTICE,"%d changes in %d seconds. Saving...",
1556 sp->changes, sp->seconds);
1557 rdbSaveBackground(server.dbfilename);
1558 break;
1559 }
1560 }
1561 }
1562
1563 /* Try to expire a few timed out keys. The algorithm used is adaptive and
1564 * will use few CPU cycles if there are few expiring keys, otherwise
1565 * it will get more aggressive to avoid that too much memory is used by
1566 * keys that can be removed from the keyspace. */
1567 for (j = 0; j < server.dbnum; j++) {
1568 int expired;
1569 redisDb *db = server.db+j;
1570
1571 /* Continue to expire if at the end of the cycle more than 25%
1572 * of the keys were expired. */
1573 do {
1574 long num = dictSize(db->expires);
1575 time_t now = time(NULL);
1576
1577 expired = 0;
1578 if (num > REDIS_EXPIRELOOKUPS_PER_CRON)
1579 num = REDIS_EXPIRELOOKUPS_PER_CRON;
1580 while (num--) {
1581 dictEntry *de;
1582 time_t t;
1583
1584 if ((de = dictGetRandomKey(db->expires)) == NULL) break;
1585 t = (time_t) dictGetEntryVal(de);
1586 if (now > t) {
1587 sds key = dictGetEntryKey(de);
1588 robj *keyobj = createStringObject(key,sdslen(key));
1589
1590 dbDelete(db,keyobj);
1591 decrRefCount(keyobj);
1592 expired++;
1593 server.stat_expiredkeys++;
1594 }
1595 }
1596 } while (expired > REDIS_EXPIRELOOKUPS_PER_CRON/4);
1597 }
1598
1599 /* Swap a few keys on disk if we are over the memory limit and VM
1600 * is enbled. Try to free objects from the free list first. */
1601 if (vmCanSwapOut()) {
1602 while (server.vm_enabled && zmalloc_used_memory() >
1603 server.vm_max_memory)
1604 {
1605 int retval;
1606
1607 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
1608 retval = (server.vm_max_threads == 0) ?
1609 vmSwapOneObjectBlocking() :
1610 vmSwapOneObjectThreaded();
1611 if (retval == REDIS_ERR && !(loops % 300) &&
1612 zmalloc_used_memory() >
1613 (server.vm_max_memory+server.vm_max_memory/10))
1614 {
1615 redisLog(REDIS_WARNING,"WARNING: vm-max-memory limit exceeded by more than 10%% but unable to swap more objects out!");
1616 }
1617 /* Note that when using threade I/O we free just one object,
1618 * because anyway when the I/O thread in charge to swap this
1619 * object out will finish, the handler of completed jobs
1620 * will try to swap more objects if we are still out of memory. */
1621 if (retval == REDIS_ERR || server.vm_max_threads > 0) break;
1622 }
1623 }
1624
1625 /* Check if we should connect to a MASTER */
1626 if (server.replstate == REDIS_REPL_CONNECT && !(loops % 10)) {
1627 redisLog(REDIS_NOTICE,"Connecting to MASTER...");
1628 if (syncWithMaster() == REDIS_OK) {
1629 redisLog(REDIS_NOTICE,"MASTER <-> SLAVE sync succeeded");
1630 if (server.appendonly) rewriteAppendOnlyFileBackground();
1631 }
1632 }
1633 return 100;
1634 }
1635
1636 /* This function gets called every time Redis is entering the
1637 * main loop of the event driven library, that is, before to sleep
1638 * for ready file descriptors. */
1639 static void beforeSleep(struct aeEventLoop *eventLoop) {
1640 REDIS_NOTUSED(eventLoop);
1641
1642 /* Awake clients that got all the swapped keys they requested */
1643 if (server.vm_enabled && listLength(server.io_ready_clients)) {
1644 listIter li;
1645 listNode *ln;
1646
1647 listRewind(server.io_ready_clients,&li);
1648 while((ln = listNext(&li))) {
1649 redisClient *c = ln->value;
1650 struct redisCommand *cmd;
1651
1652 /* Resume the client. */
1653 listDelNode(server.io_ready_clients,ln);
1654 c->flags &= (~REDIS_IO_WAIT);
1655 server.vm_blocked_clients--;
1656 aeCreateFileEvent(server.el, c->fd, AE_READABLE,
1657 readQueryFromClient, c);
1658 cmd = lookupCommand(c->argv[0]->ptr);
1659 assert(cmd != NULL);
1660 call(c,cmd);
1661 resetClient(c);
1662 /* There may be more data to process in the input buffer. */
1663 if (c->querybuf && sdslen(c->querybuf) > 0)
1664 processInputBuffer(c);
1665 }
1666 }
1667 /* Write the AOF buffer on disk */
1668 flushAppendOnlyFile();
1669 }
1670
1671 static void createSharedObjects(void) {
1672 int j;
1673
1674 shared.crlf = createObject(REDIS_STRING,sdsnew("\r\n"));
1675 shared.ok = createObject(REDIS_STRING,sdsnew("+OK\r\n"));
1676 shared.err = createObject(REDIS_STRING,sdsnew("-ERR\r\n"));
1677 shared.emptybulk = createObject(REDIS_STRING,sdsnew("$0\r\n\r\n"));
1678 shared.czero = createObject(REDIS_STRING,sdsnew(":0\r\n"));
1679 shared.cone = createObject(REDIS_STRING,sdsnew(":1\r\n"));
1680 shared.cnegone = createObject(REDIS_STRING,sdsnew(":-1\r\n"));
1681 shared.nullbulk = createObject(REDIS_STRING,sdsnew("$-1\r\n"));
1682 shared.nullmultibulk = createObject(REDIS_STRING,sdsnew("*-1\r\n"));
1683 shared.emptymultibulk = createObject(REDIS_STRING,sdsnew("*0\r\n"));
1684 shared.pong = createObject(REDIS_STRING,sdsnew("+PONG\r\n"));
1685 shared.queued = createObject(REDIS_STRING,sdsnew("+QUEUED\r\n"));
1686 shared.wrongtypeerr = createObject(REDIS_STRING,sdsnew(
1687 "-ERR Operation against a key holding the wrong kind of value\r\n"));
1688 shared.nokeyerr = createObject(REDIS_STRING,sdsnew(
1689 "-ERR no such key\r\n"));
1690 shared.syntaxerr = createObject(REDIS_STRING,sdsnew(
1691 "-ERR syntax error\r\n"));
1692 shared.sameobjecterr = createObject(REDIS_STRING,sdsnew(
1693 "-ERR source and destination objects are the same\r\n"));
1694 shared.outofrangeerr = createObject(REDIS_STRING,sdsnew(
1695 "-ERR index out of range\r\n"));
1696 shared.space = createObject(REDIS_STRING,sdsnew(" "));
1697 shared.colon = createObject(REDIS_STRING,sdsnew(":"));
1698 shared.plus = createObject(REDIS_STRING,sdsnew("+"));
1699 shared.select0 = createStringObject("select 0\r\n",10);
1700 shared.select1 = createStringObject("select 1\r\n",10);
1701 shared.select2 = createStringObject("select 2\r\n",10);
1702 shared.select3 = createStringObject("select 3\r\n",10);
1703 shared.select4 = createStringObject("select 4\r\n",10);
1704 shared.select5 = createStringObject("select 5\r\n",10);
1705 shared.select6 = createStringObject("select 6\r\n",10);
1706 shared.select7 = createStringObject("select 7\r\n",10);
1707 shared.select8 = createStringObject("select 8\r\n",10);
1708 shared.select9 = createStringObject("select 9\r\n",10);
1709 shared.messagebulk = createStringObject("$7\r\nmessage\r\n",13);
1710 shared.pmessagebulk = createStringObject("$8\r\npmessage\r\n",14);
1711 shared.subscribebulk = createStringObject("$9\r\nsubscribe\r\n",15);
1712 shared.unsubscribebulk = createStringObject("$11\r\nunsubscribe\r\n",18);
1713 shared.psubscribebulk = createStringObject("$10\r\npsubscribe\r\n",17);
1714 shared.punsubscribebulk = createStringObject("$12\r\npunsubscribe\r\n",19);
1715 shared.mbulk3 = createStringObject("*3\r\n",4);
1716 shared.mbulk4 = createStringObject("*4\r\n",4);
1717 for (j = 0; j < REDIS_SHARED_INTEGERS; j++) {
1718 shared.integers[j] = createObject(REDIS_STRING,(void*)(long)j);
1719 shared.integers[j]->encoding = REDIS_ENCODING_INT;
1720 }
1721 }
1722
1723 static void appendServerSaveParams(time_t seconds, int changes) {
1724 server.saveparams = zrealloc(server.saveparams,sizeof(struct saveparam)*(server.saveparamslen+1));
1725 server.saveparams[server.saveparamslen].seconds = seconds;
1726 server.saveparams[server.saveparamslen].changes = changes;
1727 server.saveparamslen++;
1728 }
1729
1730 static void resetServerSaveParams() {
1731 zfree(server.saveparams);
1732 server.saveparams = NULL;
1733 server.saveparamslen = 0;
1734 }
1735
1736 static void initServerConfig() {
1737 server.dbnum = REDIS_DEFAULT_DBNUM;
1738 server.port = REDIS_SERVERPORT;
1739 server.verbosity = REDIS_VERBOSE;
1740 server.maxidletime = REDIS_MAXIDLETIME;
1741 server.saveparams = NULL;
1742 server.logfile = NULL; /* NULL = log on standard output */
1743 server.bindaddr = NULL;
1744 server.glueoutputbuf = 1;
1745 server.daemonize = 0;
1746 server.appendonly = 0;
1747 server.appendfsync = APPENDFSYNC_EVERYSEC;
1748 server.no_appendfsync_on_rewrite = 0;
1749 server.lastfsync = time(NULL);
1750 server.appendfd = -1;
1751 server.appendseldb = -1; /* Make sure the first time will not match */
1752 server.pidfile = zstrdup("/var/run/redis.pid");
1753 server.dbfilename = zstrdup("dump.rdb");
1754 server.appendfilename = zstrdup("appendonly.aof");
1755 server.requirepass = NULL;
1756 server.rdbcompression = 1;
1757 server.activerehashing = 1;
1758 server.maxclients = 0;
1759 server.blpop_blocked_clients = 0;
1760 server.maxmemory = 0;
1761 server.vm_enabled = 0;
1762 server.vm_swap_file = zstrdup("/tmp/redis-%p.vm");
1763 server.vm_page_size = 256; /* 256 bytes per page */
1764 server.vm_pages = 1024*1024*100; /* 104 millions of pages */
1765 server.vm_max_memory = 1024LL*1024*1024*1; /* 1 GB of RAM */
1766 server.vm_max_threads = 4;
1767 server.vm_blocked_clients = 0;
1768 server.hash_max_zipmap_entries = REDIS_HASH_MAX_ZIPMAP_ENTRIES;
1769 server.hash_max_zipmap_value = REDIS_HASH_MAX_ZIPMAP_VALUE;
1770 server.list_max_ziplist_entries = REDIS_LIST_MAX_ZIPLIST_ENTRIES;
1771 server.list_max_ziplist_value = REDIS_LIST_MAX_ZIPLIST_VALUE;
1772 server.shutdown_asap = 0;
1773
1774 resetServerSaveParams();
1775
1776 appendServerSaveParams(60*60,1); /* save after 1 hour and 1 change */
1777 appendServerSaveParams(300,100); /* save after 5 minutes and 100 changes */
1778 appendServerSaveParams(60,10000); /* save after 1 minute and 10000 changes */
1779 /* Replication related */
1780 server.isslave = 0;
1781 server.masterauth = NULL;
1782 server.masterhost = NULL;
1783 server.masterport = 6379;
1784 server.master = NULL;
1785 server.replstate = REDIS_REPL_NONE;
1786
1787 /* Double constants initialization */
1788 R_Zero = 0.0;
1789 R_PosInf = 1.0/R_Zero;
1790 R_NegInf = -1.0/R_Zero;
1791 R_Nan = R_Zero/R_Zero;
1792 }
1793
1794 static void initServer() {
1795 int j;
1796
1797 signal(SIGHUP, SIG_IGN);
1798 signal(SIGPIPE, SIG_IGN);
1799 setupSigSegvAction();
1800
1801 server.devnull = fopen("/dev/null","w");
1802 if (server.devnull == NULL) {
1803 redisLog(REDIS_WARNING, "Can't open /dev/null: %s", server.neterr);
1804 exit(1);
1805 }
1806 server.clients = listCreate();
1807 server.slaves = listCreate();
1808 server.monitors = listCreate();
1809 server.objfreelist = listCreate();
1810 createSharedObjects();
1811 server.el = aeCreateEventLoop();
1812 server.db = zmalloc(sizeof(redisDb)*server.dbnum);
1813 server.fd = anetTcpServer(server.neterr, server.port, server.bindaddr);
1814 if (server.fd == -1) {
1815 redisLog(REDIS_WARNING, "Opening TCP port: %s", server.neterr);
1816 exit(1);
1817 }
1818 for (j = 0; j < server.dbnum; j++) {
1819 server.db[j].dict = dictCreate(&dbDictType,NULL);
1820 server.db[j].expires = dictCreate(&keyptrDictType,NULL);
1821 server.db[j].blocking_keys = dictCreate(&keylistDictType,NULL);
1822 server.db[j].watched_keys = dictCreate(&keylistDictType,NULL);
1823 if (server.vm_enabled)
1824 server.db[j].io_keys = dictCreate(&keylistDictType,NULL);
1825 server.db[j].id = j;
1826 }
1827 server.pubsub_channels = dictCreate(&keylistDictType,NULL);
1828 server.pubsub_patterns = listCreate();
1829 listSetFreeMethod(server.pubsub_patterns,freePubsubPattern);
1830 listSetMatchMethod(server.pubsub_patterns,listMatchPubsubPattern);
1831 server.cronloops = 0;
1832 server.bgsavechildpid = -1;
1833 server.bgrewritechildpid = -1;
1834 server.bgrewritebuf = sdsempty();
1835 server.aofbuf = sdsempty();
1836 server.lastsave = time(NULL);
1837 server.dirty = 0;
1838 server.stat_numcommands = 0;
1839 server.stat_numconnections = 0;
1840 server.stat_expiredkeys = 0;
1841 server.stat_starttime = time(NULL);
1842 server.unixtime = time(NULL);
1843 aeCreateTimeEvent(server.el, 1, serverCron, NULL, NULL);
1844 if (aeCreateFileEvent(server.el, server.fd, AE_READABLE,
1845 acceptHandler, NULL) == AE_ERR) oom("creating file event");
1846
1847 if (server.appendonly) {
1848 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
1849 if (server.appendfd == -1) {
1850 redisLog(REDIS_WARNING, "Can't open the append-only file: %s",
1851 strerror(errno));
1852 exit(1);
1853 }
1854 }
1855
1856 if (server.vm_enabled) vmInit();
1857 }
1858
1859 /* Empty the whole database */
1860 static long long emptyDb() {
1861 int j;
1862 long long removed = 0;
1863
1864 for (j = 0; j < server.dbnum; j++) {
1865 removed += dictSize(server.db[j].dict);
1866 dictEmpty(server.db[j].dict);
1867 dictEmpty(server.db[j].expires);
1868 }
1869 return removed;
1870 }
1871
/* Convert a case-insensitive "yes" / "no" string into 1 / 0.
 * Any other string yields -1. */
static int yesnotoi(char *s) {
    if (strcasecmp(s,"yes") == 0) return 1;
    if (strcasecmp(s,"no") == 0) return 0;
    return -1;
}
1877
1878 /* I agree, this is a very rudimental way to load a configuration...
1879 will improve later if the config gets more complex */
1880 static void loadServerConfig(char *filename) {
1881 FILE *fp;
1882 char buf[REDIS_CONFIGLINE_MAX+1], *err = NULL;
1883 int linenum = 0;
1884 sds line = NULL;
1885
1886 if (filename[0] == '-' && filename[1] == '\0')
1887 fp = stdin;
1888 else {
1889 if ((fp = fopen(filename,"r")) == NULL) {
1890 redisLog(REDIS_WARNING, "Fatal error, can't open config file '%s'", filename);
1891 exit(1);
1892 }
1893 }
1894
1895 while(fgets(buf,REDIS_CONFIGLINE_MAX+1,fp) != NULL) {
1896 sds *argv;
1897 int argc, j;
1898
1899 linenum++;
1900 line = sdsnew(buf);
1901 line = sdstrim(line," \t\r\n");
1902
1903 /* Skip comments and blank lines*/
1904 if (line[0] == '#' || line[0] == '\0') {
1905 sdsfree(line);
1906 continue;
1907 }
1908
1909 /* Split into arguments */
1910 argv = sdssplitlen(line,sdslen(line)," ",1,&argc);
1911 sdstolower(argv[0]);
1912
1913 /* Execute config directives */
1914 if (!strcasecmp(argv[0],"timeout") && argc == 2) {
1915 server.maxidletime = atoi(argv[1]);
1916 if (server.maxidletime < 0) {
1917 err = "Invalid timeout value"; goto loaderr;
1918 }
1919 } else if (!strcasecmp(argv[0],"port") && argc == 2) {
1920 server.port = atoi(argv[1]);
1921 if (server.port < 1 || server.port > 65535) {
1922 err = "Invalid port"; goto loaderr;
1923 }
1924 } else if (!strcasecmp(argv[0],"bind") && argc == 2) {
1925 server.bindaddr = zstrdup(argv[1]);
1926 } else if (!strcasecmp(argv[0],"save") && argc == 3) {
1927 int seconds = atoi(argv[1]);
1928 int changes = atoi(argv[2]);
1929 if (seconds < 1 || changes < 0) {
1930 err = "Invalid save parameters"; goto loaderr;
1931 }
1932 appendServerSaveParams(seconds,changes);
1933 } else if (!strcasecmp(argv[0],"dir") && argc == 2) {
1934 if (chdir(argv[1]) == -1) {
1935 redisLog(REDIS_WARNING,"Can't chdir to '%s': %s",
1936 argv[1], strerror(errno));
1937 exit(1);
1938 }
1939 } else if (!strcasecmp(argv[0],"loglevel") && argc == 2) {
1940 if (!strcasecmp(argv[1],"debug")) server.verbosity = REDIS_DEBUG;
1941 else if (!strcasecmp(argv[1],"verbose")) server.verbosity = REDIS_VERBOSE;
1942 else if (!strcasecmp(argv[1],"notice")) server.verbosity = REDIS_NOTICE;
1943 else if (!strcasecmp(argv[1],"warning")) server.verbosity = REDIS_WARNING;
1944 else {
1945 err = "Invalid log level. Must be one of debug, notice, warning";
1946 goto loaderr;
1947 }
1948 } else if (!strcasecmp(argv[0],"logfile") && argc == 2) {
1949 FILE *logfp;
1950
1951 server.logfile = zstrdup(argv[1]);
1952 if (!strcasecmp(server.logfile,"stdout")) {
1953 zfree(server.logfile);
1954 server.logfile = NULL;
1955 }
1956 if (server.logfile) {
1957 /* Test if we are able to open the file. The server will not
1958 * be able to abort just for this problem later... */
1959 logfp = fopen(server.logfile,"a");
1960 if (logfp == NULL) {
1961 err = sdscatprintf(sdsempty(),
1962 "Can't open the log file: %s", strerror(errno));
1963 goto loaderr;
1964 }
1965 fclose(logfp);
1966 }
1967 } else if (!strcasecmp(argv[0],"databases") && argc == 2) {
1968 server.dbnum = atoi(argv[1]);
1969 if (server.dbnum < 1) {
1970 err = "Invalid number of databases"; goto loaderr;
1971 }
1972 } else if (!strcasecmp(argv[0],"include") && argc == 2) {
1973 loadServerConfig(argv[1]);
1974 } else if (!strcasecmp(argv[0],"maxclients") && argc == 2) {
1975 server.maxclients = atoi(argv[1]);
1976 } else if (!strcasecmp(argv[0],"maxmemory") && argc == 2) {
1977 server.maxmemory = memtoll(argv[1],NULL);
1978 } else if (!strcasecmp(argv[0],"slaveof") && argc == 3) {
1979 server.masterhost = sdsnew(argv[1]);
1980 server.masterport = atoi(argv[2]);
1981 server.replstate = REDIS_REPL_CONNECT;
1982 } else if (!strcasecmp(argv[0],"masterauth") && argc == 2) {
1983 server.masterauth = zstrdup(argv[1]);
1984 } else if (!strcasecmp(argv[0],"glueoutputbuf") && argc == 2) {
1985 if ((server.glueoutputbuf = yesnotoi(argv[1])) == -1) {
1986 err = "argument must be 'yes' or 'no'"; goto loaderr;
1987 }
1988 } else if (!strcasecmp(argv[0],"rdbcompression") && argc == 2) {
1989 if ((server.rdbcompression = yesnotoi(argv[1])) == -1) {
1990 err = "argument must be 'yes' or 'no'"; goto loaderr;
1991 }
1992 } else if (!strcasecmp(argv[0],"activerehashing") && argc == 2) {
1993 if ((server.activerehashing = yesnotoi(argv[1])) == -1) {
1994 err = "argument must be 'yes' or 'no'"; goto loaderr;
1995 }
1996 } else if (!strcasecmp(argv[0],"daemonize") && argc == 2) {
1997 if ((server.daemonize = yesnotoi(argv[1])) == -1) {
1998 err = "argument must be 'yes' or 'no'"; goto loaderr;
1999 }
2000 } else if (!strcasecmp(argv[0],"appendonly") && argc == 2) {
2001 if ((server.appendonly = yesnotoi(argv[1])) == -1) {
2002 err = "argument must be 'yes' or 'no'"; goto loaderr;
2003 }
2004 } else if (!strcasecmp(argv[0],"appendfilename") && argc == 2) {
2005 zfree(server.appendfilename);
2006 server.appendfilename = zstrdup(argv[1]);
2007 } else if (!strcasecmp(argv[0],"no-appendfsync-on-rewrite")
2008 && argc == 2) {
2009 if ((server.no_appendfsync_on_rewrite= yesnotoi(argv[1])) == -1) {
2010 err = "argument must be 'yes' or 'no'"; goto loaderr;
2011 }
2012 } else if (!strcasecmp(argv[0],"appendfsync") && argc == 2) {
2013 if (!strcasecmp(argv[1],"no")) {
2014 server.appendfsync = APPENDFSYNC_NO;
2015 } else if (!strcasecmp(argv[1],"always")) {
2016 server.appendfsync = APPENDFSYNC_ALWAYS;
2017 } else if (!strcasecmp(argv[1],"everysec")) {
2018 server.appendfsync = APPENDFSYNC_EVERYSEC;
2019 } else {
2020 err = "argument must be 'no', 'always' or 'everysec'";
2021 goto loaderr;
2022 }
2023 } else if (!strcasecmp(argv[0],"requirepass") && argc == 2) {
2024 server.requirepass = zstrdup(argv[1]);
2025 } else if (!strcasecmp(argv[0],"pidfile") && argc == 2) {
2026 zfree(server.pidfile);
2027 server.pidfile = zstrdup(argv[1]);
2028 } else if (!strcasecmp(argv[0],"dbfilename") && argc == 2) {
2029 zfree(server.dbfilename);
2030 server.dbfilename = zstrdup(argv[1]);
2031 } else if (!strcasecmp(argv[0],"vm-enabled") && argc == 2) {
2032 if ((server.vm_enabled = yesnotoi(argv[1])) == -1) {
2033 err = "argument must be 'yes' or 'no'"; goto loaderr;
2034 }
2035 } else if (!strcasecmp(argv[0],"vm-swap-file") && argc == 2) {
2036 zfree(server.vm_swap_file);
2037 server.vm_swap_file = zstrdup(argv[1]);
2038 } else if (!strcasecmp(argv[0],"vm-max-memory") && argc == 2) {
2039 server.vm_max_memory = memtoll(argv[1],NULL);
2040 } else if (!strcasecmp(argv[0],"vm-page-size") && argc == 2) {
2041 server.vm_page_size = memtoll(argv[1], NULL);
2042 } else if (!strcasecmp(argv[0],"vm-pages") && argc == 2) {
2043 server.vm_pages = memtoll(argv[1], NULL);
2044 } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
2045 server.vm_max_threads = strtoll(argv[1], NULL, 10);
2046 } else if (!strcasecmp(argv[0],"hash-max-zipmap-entries") && argc == 2){
2047 server.hash_max_zipmap_entries = memtoll(argv[1], NULL);
2048 } else if (!strcasecmp(argv[0],"hash-max-zipmap-value") && argc == 2){
2049 server.hash_max_zipmap_value = memtoll(argv[1], NULL);
2050 } else if (!strcasecmp(argv[0],"list-max-ziplist-entries") && argc == 2){
2051 server.list_max_ziplist_entries = memtoll(argv[1], NULL);
2052 } else if (!strcasecmp(argv[0],"list-max-ziplist-value") && argc == 2){
2053 server.list_max_ziplist_value = memtoll(argv[1], NULL);
2054 } else {
2055 err = "Bad directive or wrong number of arguments"; goto loaderr;
2056 }
2057 for (j = 0; j < argc; j++)
2058 sdsfree(argv[j]);
2059 zfree(argv);
2060 sdsfree(line);
2061 }
2062 if (fp != stdin) fclose(fp);
2063 return;
2064
2065 loaderr:
2066 fprintf(stderr, "\n*** FATAL CONFIG FILE ERROR ***\n");
2067 fprintf(stderr, "Reading the configuration file, at line %d\n", linenum);
2068 fprintf(stderr, ">>> '%s'\n", line);
2069 fprintf(stderr, "%s\n", err);
2070 exit(1);
2071 }
2072
2073 static void freeClientArgv(redisClient *c) {
2074 int j;
2075
2076 for (j = 0; j < c->argc; j++)
2077 decrRefCount(c->argv[j]);
2078 for (j = 0; j < c->mbargc; j++)
2079 decrRefCount(c->mbargv[j]);
2080 c->argc = 0;
2081 c->mbargc = 0;
2082 }
2083
2084 static void freeClient(redisClient *c) {
2085 listNode *ln;
2086
2087 /* Note that if the client we are freeing is blocked into a blocking
2088 * call, we have to set querybuf to NULL *before* to call
2089 * unblockClientWaitingData() to avoid processInputBuffer() will get
2090 * called. Also it is important to remove the file events after
2091 * this, because this call adds the READABLE event. */
2092 sdsfree(c->querybuf);
2093 c->querybuf = NULL;
2094 if (c->flags & REDIS_BLOCKED)
2095 unblockClientWaitingData(c);
2096
2097 /* UNWATCH all the keys */
2098 unwatchAllKeys(c);
2099 listRelease(c->watched_keys);
2100 /* Unsubscribe from all the pubsub channels */
2101 pubsubUnsubscribeAllChannels(c,0);
2102 pubsubUnsubscribeAllPatterns(c,0);
2103 dictRelease(c->pubsub_channels);
2104 listRelease(c->pubsub_patterns);
2105 /* Obvious cleanup */
2106 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
2107 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2108 listRelease(c->reply);
2109 freeClientArgv(c);
2110 close(c->fd);
2111 /* Remove from the list of clients */
2112 ln = listSearchKey(server.clients,c);
2113 redisAssert(ln != NULL);
2114 listDelNode(server.clients,ln);
2115 /* Remove from the list of clients that are now ready to be restarted
2116 * after waiting for swapped keys */
2117 if (c->flags & REDIS_IO_WAIT && listLength(c->io_keys) == 0) {
2118 ln = listSearchKey(server.io_ready_clients,c);
2119 if (ln) {
2120 listDelNode(server.io_ready_clients,ln);
2121 server.vm_blocked_clients--;
2122 }
2123 }
2124 /* Remove from the list of clients waiting for swapped keys */
2125 while (server.vm_enabled && listLength(c->io_keys)) {
2126 ln = listFirst(c->io_keys);
2127 dontWaitForSwappedKey(c,ln->value);
2128 }
2129 listRelease(c->io_keys);
2130 /* Master/slave cleanup */
2131 if (c->flags & REDIS_SLAVE) {
2132 if (c->replstate == REDIS_REPL_SEND_BULK && c->repldbfd != -1)
2133 close(c->repldbfd);
2134 list *l = (c->flags & REDIS_MONITOR) ? server.monitors : server.slaves;
2135 ln = listSearchKey(l,c);
2136 redisAssert(ln != NULL);
2137 listDelNode(l,ln);
2138 }
2139 if (c->flags & REDIS_MASTER) {
2140 server.master = NULL;
2141 server.replstate = REDIS_REPL_CONNECT;
2142 }
2143 /* Release memory */
2144 zfree(c->argv);
2145 zfree(c->mbargv);
2146 freeClientMultiState(c);
2147 zfree(c);
2148 }
2149
2150 #define GLUEREPLY_UP_TO (1024)
2151 static void glueReplyBuffersIfNeeded(redisClient *c) {
2152 int copylen = 0;
2153 char buf[GLUEREPLY_UP_TO];
2154 listNode *ln;
2155 listIter li;
2156 robj *o;
2157
2158 listRewind(c->reply,&li);
2159 while((ln = listNext(&li))) {
2160 int objlen;
2161
2162 o = ln->value;
2163 objlen = sdslen(o->ptr);
2164 if (copylen + objlen <= GLUEREPLY_UP_TO) {
2165 memcpy(buf+copylen,o->ptr,objlen);
2166 copylen += objlen;
2167 listDelNode(c->reply,ln);
2168 } else {
2169 if (copylen == 0) return;
2170 break;
2171 }
2172 }
2173 /* Now the output buffer is empty, add the new single element */
2174 o = createObject(REDIS_STRING,sdsnewlen(buf,copylen));
2175 listAddNodeHead(c->reply,o);
2176 }
2177
2178 static void sendReplyToClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2179 redisClient *c = privdata;
2180 int nwritten = 0, totwritten = 0, objlen;
2181 robj *o;
2182 REDIS_NOTUSED(el);
2183 REDIS_NOTUSED(mask);
2184
2185 /* Use writev() if we have enough buffers to send */
2186 if (!server.glueoutputbuf &&
2187 listLength(c->reply) > REDIS_WRITEV_THRESHOLD &&
2188 !(c->flags & REDIS_MASTER))
2189 {
2190 sendReplyToClientWritev(el, fd, privdata, mask);
2191 return;
2192 }
2193
2194 while(listLength(c->reply)) {
2195 if (server.glueoutputbuf && listLength(c->reply) > 1)
2196 glueReplyBuffersIfNeeded(c);
2197
2198 o = listNodeValue(listFirst(c->reply));
2199 objlen = sdslen(o->ptr);
2200
2201 if (objlen == 0) {
2202 listDelNode(c->reply,listFirst(c->reply));
2203 continue;
2204 }
2205
2206 if (c->flags & REDIS_MASTER) {
2207 /* Don't reply to a master */
2208 nwritten = objlen - c->sentlen;
2209 } else {
2210 nwritten = write(fd, ((char*)o->ptr)+c->sentlen, objlen - c->sentlen);
2211 if (nwritten <= 0) break;
2212 }
2213 c->sentlen += nwritten;
2214 totwritten += nwritten;
2215 /* If we fully sent the object on head go to the next one */
2216 if (c->sentlen == objlen) {
2217 listDelNode(c->reply,listFirst(c->reply));
2218 c->sentlen = 0;
2219 }
2220 /* Note that we avoid to send more thank REDIS_MAX_WRITE_PER_EVENT
2221 * bytes, in a single threaded server it's a good idea to serve
2222 * other clients as well, even if a very large request comes from
2223 * super fast link that is always able to accept data (in real world
2224 * scenario think about 'KEYS *' against the loopback interfae) */
2225 if (totwritten > REDIS_MAX_WRITE_PER_EVENT) break;
2226 }
2227 if (nwritten == -1) {
2228 if (errno == EAGAIN) {
2229 nwritten = 0;
2230 } else {
2231 redisLog(REDIS_VERBOSE,
2232 "Error writing to client: %s", strerror(errno));
2233 freeClient(c);
2234 return;
2235 }
2236 }
2237 if (totwritten > 0) c->lastinteraction = time(NULL);
2238 if (listLength(c->reply) == 0) {
2239 c->sentlen = 0;
2240 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2241 }
2242 }
2243
2244 static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask)
2245 {
2246 redisClient *c = privdata;
2247 int nwritten = 0, totwritten = 0, objlen, willwrite;
2248 robj *o;
2249 struct iovec iov[REDIS_WRITEV_IOVEC_COUNT];
2250 int offset, ion = 0;
2251 REDIS_NOTUSED(el);
2252 REDIS_NOTUSED(mask);
2253
2254 listNode *node;
2255 while (listLength(c->reply)) {
2256 offset = c->sentlen;
2257 ion = 0;
2258 willwrite = 0;
2259
2260 /* fill-in the iov[] array */
2261 for(node = listFirst(c->reply); node; node = listNextNode(node)) {
2262 o = listNodeValue(node);
2263 objlen = sdslen(o->ptr);
2264
2265 if (totwritten + objlen - offset > REDIS_MAX_WRITE_PER_EVENT)
2266 break;
2267
2268 if(ion == REDIS_WRITEV_IOVEC_COUNT)
2269 break; /* no more iovecs */
2270
2271 iov[ion].iov_base = ((char*)o->ptr) + offset;
2272 iov[ion].iov_len = objlen - offset;
2273 willwrite += objlen - offset;
2274 offset = 0; /* just for the first item */
2275 ion++;
2276 }
2277
2278 if(willwrite == 0)
2279 break;
2280
2281 /* write all collected blocks at once */
2282 if((nwritten = writev(fd, iov, ion)) < 0) {
2283 if (errno != EAGAIN) {
2284 redisLog(REDIS_VERBOSE,
2285 "Error writing to client: %s", strerror(errno));
2286 freeClient(c);
2287 return;
2288 }
2289 break;
2290 }
2291
2292 totwritten += nwritten;
2293 offset = c->sentlen;
2294
2295 /* remove written robjs from c->reply */
2296 while (nwritten && listLength(c->reply)) {
2297 o = listNodeValue(listFirst(c->reply));
2298 objlen = sdslen(o->ptr);
2299
2300 if(nwritten >= objlen - offset) {
2301 listDelNode(c->reply, listFirst(c->reply));
2302 nwritten -= objlen - offset;
2303 c->sentlen = 0;
2304 } else {
2305 /* partial write */
2306 c->sentlen += nwritten;
2307 break;
2308 }
2309 offset = 0;
2310 }
2311 }
2312
2313 if (totwritten > 0)
2314 c->lastinteraction = time(NULL);
2315
2316 if (listLength(c->reply) == 0) {
2317 c->sentlen = 0;
2318 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2319 }
2320 }
2321
2322 static int qsortRedisCommands(const void *r1, const void *r2) {
2323 return strcasecmp(
2324 ((struct redisCommand*)r1)->name,
2325 ((struct redisCommand*)r2)->name);
2326 }
2327
2328 static void sortCommandTable() {
2329 /* Copy and sort the read-only version of the command table */
2330 commandTable = (struct redisCommand*)malloc(sizeof(readonlyCommandTable));
2331 memcpy(commandTable,readonlyCommandTable,sizeof(readonlyCommandTable));
2332 qsort(commandTable,
2333 sizeof(readonlyCommandTable)/sizeof(struct redisCommand),
2334 sizeof(struct redisCommand),qsortRedisCommands);
2335 }
2336
2337 static struct redisCommand *lookupCommand(char *name) {
2338 struct redisCommand tmp = {name,NULL,0,0,NULL,0,0,0};
2339 return bsearch(
2340 &tmp,
2341 commandTable,
2342 sizeof(readonlyCommandTable)/sizeof(struct redisCommand),
2343 sizeof(struct redisCommand),
2344 qsortRedisCommands);
2345 }
2346
2347 /* resetClient prepare the client to process the next command */
2348 static void resetClient(redisClient *c) {
2349 freeClientArgv(c);
2350 c->bulklen = -1;
2351 c->multibulk = 0;
2352 }
2353
2354 /* Call() is the core of Redis execution of a command */
2355 static void call(redisClient *c, struct redisCommand *cmd) {
2356 long long dirty;
2357
2358 dirty = server.dirty;
2359 cmd->proc(c);
2360 dirty = server.dirty-dirty;
2361
2362 if (server.appendonly && dirty)
2363 feedAppendOnlyFile(cmd,c->db->id,c->argv,c->argc);
2364 if ((dirty || cmd->flags & REDIS_CMD_FORCE_REPLICATION) &&
2365 listLength(server.slaves))
2366 replicationFeedSlaves(server.slaves,c->db->id,c->argv,c->argc);
2367 if (listLength(server.monitors))
2368 replicationFeedMonitors(server.monitors,c->db->id,c->argv,c->argc);
2369 server.stat_numcommands++;
2370 }
2371
/* If this function gets called we already read a whole
 * command, arguments are in the client argv/argc fields.
 * processCommand() executes the command or prepares the
 * server for a bulk read from the client.
 *
 * If 1 is returned the client is still alive and valid and
 * other operations can be performed by the caller. Otherwise
 * if 0 is returned the client was destroyed (i.e. after QUIT). */
2380 static int processCommand(redisClient *c) {
2381 struct redisCommand *cmd;
2382
2383 /* Free some memory if needed (maxmemory setting) */
2384 if (server.maxmemory) freeMemoryIfNeeded();
2385
2386 /* Handle the multi bulk command type. This is an alternative protocol
2387 * supported by Redis in order to receive commands that are composed of
2388 * multiple binary-safe "bulk" arguments. The latency of processing is
2389 * a bit higher but this allows things like multi-sets, so if this
2390 * protocol is used only for MSET and similar commands this is a big win. */
2391 if (c->multibulk == 0 && c->argc == 1 && ((char*)(c->argv[0]->ptr))[0] == '*') {
2392 c->multibulk = atoi(((char*)c->argv[0]->ptr)+1);
2393 if (c->multibulk <= 0) {
2394 resetClient(c);
2395 return 1;
2396 } else {
2397 decrRefCount(c->argv[c->argc-1]);
2398 c->argc--;
2399 return 1;
2400 }
2401 } else if (c->multibulk) {
2402 if (c->bulklen == -1) {
2403 if (((char*)c->argv[0]->ptr)[0] != '$') {
2404 addReplySds(c,sdsnew("-ERR multi bulk protocol error\r\n"));
2405 resetClient(c);
2406 return 1;
2407 } else {
2408 int bulklen = atoi(((char*)c->argv[0]->ptr)+1);
2409 decrRefCount(c->argv[0]);
2410 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2411 c->argc--;
2412 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2413 resetClient(c);
2414 return 1;
2415 }
2416 c->argc--;
2417 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2418 return 1;
2419 }
2420 } else {
2421 c->mbargv = zrealloc(c->mbargv,(sizeof(robj*))*(c->mbargc+1));
2422 c->mbargv[c->mbargc] = c->argv[0];
2423 c->mbargc++;
2424 c->argc--;
2425 c->multibulk--;
2426 if (c->multibulk == 0) {
2427 robj **auxargv;
2428 int auxargc;
2429
2430 /* Here we need to swap the multi-bulk argc/argv with the
2431 * normal argc/argv of the client structure. */
2432 auxargv = c->argv;
2433 c->argv = c->mbargv;
2434 c->mbargv = auxargv;
2435
2436 auxargc = c->argc;
2437 c->argc = c->mbargc;
2438 c->mbargc = auxargc;
2439
2440 /* We need to set bulklen to something different than -1
2441 * in order for the code below to process the command without
2442 * to try to read the last argument of a bulk command as
2443 * a special argument. */
2444 c->bulklen = 0;
2445 /* continue below and process the command */
2446 } else {
2447 c->bulklen = -1;
2448 return 1;
2449 }
2450 }
2451 }
2452 /* -- end of multi bulk commands processing -- */
2453
2454 /* The QUIT command is handled as a special case. Normal command
2455 * procs are unable to close the client connection safely */
2456 if (!strcasecmp(c->argv[0]->ptr,"quit")) {
2457 freeClient(c);
2458 return 0;
2459 }
2460
2461 /* Now lookup the command and check ASAP about trivial error conditions
2462 * such wrong arity, bad command name and so forth. */
2463 cmd = lookupCommand(c->argv[0]->ptr);
2464 if (!cmd) {
2465 addReplySds(c,
2466 sdscatprintf(sdsempty(), "-ERR unknown command '%s'\r\n",
2467 (char*)c->argv[0]->ptr));
2468 resetClient(c);
2469 return 1;
2470 } else if ((cmd->arity > 0 && cmd->arity != c->argc) ||
2471 (c->argc < -cmd->arity)) {
2472 addReplySds(c,
2473 sdscatprintf(sdsempty(),
2474 "-ERR wrong number of arguments for '%s' command\r\n",
2475 cmd->name));
2476 resetClient(c);
2477 return 1;
2478 } else if (cmd->flags & REDIS_CMD_BULK && c->bulklen == -1) {
2479 /* This is a bulk command, we have to read the last argument yet. */
2480 int bulklen = atoi(c->argv[c->argc-1]->ptr);
2481
2482 decrRefCount(c->argv[c->argc-1]);
2483 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2484 c->argc--;
2485 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2486 resetClient(c);
2487 return 1;
2488 }
2489 c->argc--;
2490 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2491 /* It is possible that the bulk read is already in the
2492 * buffer. Check this condition and handle it accordingly.
2493 * This is just a fast path, alternative to call processInputBuffer().
2494 * It's a good idea since the code is small and this condition
2495 * happens most of the times. */
2496 if ((signed)sdslen(c->querybuf) >= c->bulklen) {
2497 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2498 c->argc++;
2499 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2500 } else {
2501 /* Otherwise return... there is to read the last argument
2502 * from the socket. */
2503 return 1;
2504 }
2505 }
2506 /* Let's try to encode the bulk object to save space. */
2507 if (cmd->flags & REDIS_CMD_BULK)
2508 c->argv[c->argc-1] = tryObjectEncoding(c->argv[c->argc-1]);
2509
2510 /* Check if the user is authenticated */
2511 if (server.requirepass && !c->authenticated && cmd->proc != authCommand) {
2512 addReplySds(c,sdsnew("-ERR operation not permitted\r\n"));
2513 resetClient(c);
2514 return 1;
2515 }
2516
2517 /* Handle the maxmemory directive */
2518 if (server.maxmemory && (cmd->flags & REDIS_CMD_DENYOOM) &&
2519 zmalloc_used_memory() > server.maxmemory)
2520 {
2521 addReplySds(c,sdsnew("-ERR command not allowed when used memory > 'maxmemory'\r\n"));
2522 resetClient(c);
2523 return 1;
2524 }
2525
2526 /* Only allow SUBSCRIBE and UNSUBSCRIBE in the context of Pub/Sub */
2527 if ((dictSize(c->pubsub_channels) > 0 || listLength(c->pubsub_patterns) > 0)
2528 &&
2529 cmd->proc != subscribeCommand && cmd->proc != unsubscribeCommand &&
2530 cmd->proc != psubscribeCommand && cmd->proc != punsubscribeCommand) {
2531 addReplySds(c,sdsnew("-ERR only (P)SUBSCRIBE / (P)UNSUBSCRIBE / QUIT allowed in this context\r\n"));
2532 resetClient(c);
2533 return 1;
2534 }
2535
2536 /* Exec the command */
2537 if (c->flags & REDIS_MULTI &&
2538 cmd->proc != execCommand && cmd->proc != discardCommand &&
2539 cmd->proc != multiCommand && cmd->proc != watchCommand)
2540 {
2541 queueMultiCommand(c,cmd);
2542 addReply(c,shared.queued);
2543 } else {
2544 if (server.vm_enabled && server.vm_max_threads > 0 &&
2545 blockClientOnSwappedKeys(c,cmd)) return 1;
2546 call(c,cmd);
2547 }
2548
2549 /* Prepare the client for the next command */
2550 resetClient(c);
2551 return 1;
2552 }
2553
2554 static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) {
2555 listNode *ln;
2556 listIter li;
2557 int outc = 0, j;
2558 robj **outv;
2559 /* We need 1+(ARGS*3) objects since commands are using the new protocol
2560 * and we one 1 object for the first "*<count>\r\n" multibulk count, then
2561 * for every additional object we have "$<count>\r\n" + object + "\r\n". */
2562 robj *static_outv[REDIS_STATIC_ARGS*3+1];
2563 robj *lenobj;
2564
2565 if (argc <= REDIS_STATIC_ARGS) {
2566 outv = static_outv;
2567 } else {
2568 outv = zmalloc(sizeof(robj*)*(argc*3+1));
2569 }
2570
2571 lenobj = createObject(REDIS_STRING,
2572 sdscatprintf(sdsempty(), "*%d\r\n", argc));
2573 lenobj->refcount = 0;
2574 outv[outc++] = lenobj;
2575 for (j = 0; j < argc; j++) {
2576 lenobj = createObject(REDIS_STRING,
2577 sdscatprintf(sdsempty(),"$%lu\r\n",
2578 (unsigned long) stringObjectLen(argv[j])));
2579 lenobj->refcount = 0;
2580 outv[outc++] = lenobj;
2581 outv[outc++] = argv[j];
2582 outv[outc++] = shared.crlf;
2583 }
2584
2585 /* Increment all the refcounts at start and decrement at end in order to
2586 * be sure to free objects if there is no slave in a replication state
2587 * able to be feed with commands */
2588 for (j = 0; j < outc; j++) incrRefCount(outv[j]);
2589 listRewind(slaves,&li);
2590 while((ln = listNext(&li))) {
2591 redisClient *slave = ln->value;
2592
2593 /* Don't feed slaves that are still waiting for BGSAVE to start */
2594 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) continue;
2595
2596 /* Feed all the other slaves, MONITORs and so on */
2597 if (slave->slaveseldb != dictid) {
2598 robj *selectcmd;
2599
2600 switch(dictid) {
2601 case 0: selectcmd = shared.select0; break;
2602 case 1: selectcmd = shared.select1; break;
2603 case 2: selectcmd = shared.select2; break;
2604 case 3: selectcmd = shared.select3; break;
2605 case 4: selectcmd = shared.select4; break;
2606 case 5: selectcmd = shared.select5; break;
2607 case 6: selectcmd = shared.select6; break;
2608 case 7: selectcmd = shared.select7; break;
2609 case 8: selectcmd = shared.select8; break;
2610 case 9: selectcmd = shared.select9; break;
2611 default:
2612 selectcmd = createObject(REDIS_STRING,
2613 sdscatprintf(sdsempty(),"select %d\r\n",dictid));
2614 selectcmd->refcount = 0;
2615 break;
2616 }
2617 addReply(slave,selectcmd);
2618 slave->slaveseldb = dictid;
2619 }
2620 for (j = 0; j < outc; j++) addReply(slave,outv[j]);
2621 }
2622 for (j = 0; j < outc; j++) decrRefCount(outv[j]);
2623 if (outv != static_outv) zfree(outv);
2624 }
2625
2626 static sds sdscatrepr(sds s, char *p, size_t len) {
2627 s = sdscatlen(s,"\"",1);
2628 while(len--) {
2629 switch(*p) {
2630 case '\\':
2631 case '"':
2632 s = sdscatprintf(s,"\\%c",*p);
2633 break;
2634 case '\n': s = sdscatlen(s,"\\n",1); break;
2635 case '\r': s = sdscatlen(s,"\\r",1); break;
2636 case '\t': s = sdscatlen(s,"\\t",1); break;
2637 case '\a': s = sdscatlen(s,"\\a",1); break;
2638 case '\b': s = sdscatlen(s,"\\b",1); break;
2639 default:
2640 if (isprint(*p))
2641 s = sdscatprintf(s,"%c",*p);
2642 else
2643 s = sdscatprintf(s,"\\x%02x",(unsigned char)*p);
2644 break;
2645 }
2646 p++;
2647 }
2648 return sdscatlen(s,"\"",1);
2649 }
2650
2651 static void replicationFeedMonitors(list *monitors, int dictid, robj **argv, int argc) {
2652 listNode *ln;
2653 listIter li;
2654 int j;
2655 sds cmdrepr = sdsnew("+");
2656 robj *cmdobj;
2657 struct timeval tv;
2658
2659 gettimeofday(&tv,NULL);
2660 cmdrepr = sdscatprintf(cmdrepr,"%ld.%ld ",(long)tv.tv_sec,(long)tv.tv_usec);
2661 if (dictid != 0) cmdrepr = sdscatprintf(cmdrepr,"(db %d) ", dictid);
2662
2663 for (j = 0; j < argc; j++) {
2664 if (argv[j]->encoding == REDIS_ENCODING_INT) {
2665 cmdrepr = sdscatprintf(cmdrepr, "%ld", (long)argv[j]->ptr);
2666 } else {
2667 cmdrepr = sdscatrepr(cmdrepr,(char*)argv[j]->ptr,
2668 sdslen(argv[j]->ptr));
2669 }
2670 if (j != argc-1)
2671 cmdrepr = sdscatlen(cmdrepr," ",1);
2672 }
2673 cmdrepr = sdscatlen(cmdrepr,"\r\n",2);
2674 cmdobj = createObject(REDIS_STRING,cmdrepr);
2675
2676 listRewind(monitors,&li);
2677 while((ln = listNext(&li))) {
2678 redisClient *monitor = ln->value;
2679 addReply(monitor,cmdobj);
2680 }
2681 decrRefCount(cmdobj);
2682 }
2683
2684 static void processInputBuffer(redisClient *c) {
2685 again:
2686 /* Before to process the input buffer, make sure the client is not
2687 * waitig for a blocking operation such as BLPOP. Note that the first
2688 * iteration the client is never blocked, otherwise the processInputBuffer
2689 * would not be called at all, but after the execution of the first commands
2690 * in the input buffer the client may be blocked, and the "goto again"
2691 * will try to reiterate. The following line will make it return asap. */
2692 if (c->flags & REDIS_BLOCKED || c->flags & REDIS_IO_WAIT) return;
2693 if (c->bulklen == -1) {
2694 /* Read the first line of the query */
2695 char *p = strchr(c->querybuf,'\n');
2696 size_t querylen;
2697
2698 if (p) {
2699 sds query, *argv;
2700 int argc, j;
2701
2702 query = c->querybuf;
2703 c->querybuf = sdsempty();
2704 querylen = 1+(p-(query));
2705 if (sdslen(query) > querylen) {
2706 /* leave data after the first line of the query in the buffer */
2707 c->querybuf = sdscatlen(c->querybuf,query+querylen,sdslen(query)-querylen);
2708 }
2709 *p = '\0'; /* remove "\n" */
2710 if (*(p-1) == '\r') *(p-1) = '\0'; /* and "\r" if any */
2711 sdsupdatelen(query);
2712
2713 /* Now we can split the query in arguments */
2714 argv = sdssplitlen(query,sdslen(query)," ",1,&argc);
2715 sdsfree(query);
2716
2717 if (c->argv) zfree(c->argv);
2718 c->argv = zmalloc(sizeof(robj*)*argc);
2719
2720 for (j = 0; j < argc; j++) {
2721 if (sdslen(argv[j])) {
2722 c->argv[c->argc] = createObject(REDIS_STRING,argv[j]);
2723 c->argc++;
2724 } else {
2725 sdsfree(argv[j]);
2726 }
2727 }
2728 zfree(argv);
2729 if (c->argc) {
2730 /* Execute the command. If the client is still valid
2731 * after processCommand() return and there is something
2732 * on the query buffer try to process the next command. */
2733 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2734 } else {
2735 /* Nothing to process, argc == 0. Just process the query
2736 * buffer if it's not empty or return to the caller */
2737 if (sdslen(c->querybuf)) goto again;
2738 }
2739 return;
2740 } else if (sdslen(c->querybuf) >= REDIS_REQUEST_MAX_SIZE) {
2741 redisLog(REDIS_VERBOSE, "Client protocol error");
2742 freeClient(c);
2743 return;
2744 }
2745 } else {
2746 /* Bulk read handling. Note that if we are at this point
2747 the client already sent a command terminated with a newline,
2748 we are reading the bulk data that is actually the last
2749 argument of the command. */
2750 int qbl = sdslen(c->querybuf);
2751
2752 if (c->bulklen <= qbl) {
2753 /* Copy everything but the final CRLF as final argument */
2754 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2755 c->argc++;
2756 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2757 /* Process the command. If the client is still valid after
2758 * the processing and there is more data in the buffer
2759 * try to parse it. */
2760 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2761 return;
2762 }
2763 }
2764 }
2765
2766 static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2767 redisClient *c = (redisClient*) privdata;
2768 char buf[REDIS_IOBUF_LEN];
2769 int nread;
2770 REDIS_NOTUSED(el);
2771 REDIS_NOTUSED(mask);
2772
2773 nread = read(fd, buf, REDIS_IOBUF_LEN);
2774 if (nread == -1) {
2775 if (errno == EAGAIN) {
2776 nread = 0;
2777 } else {
2778 redisLog(REDIS_VERBOSE, "Reading from client: %s",strerror(errno));
2779 freeClient(c);
2780 return;
2781 }
2782 } else if (nread == 0) {
2783 redisLog(REDIS_VERBOSE, "Client closed connection");
2784 freeClient(c);
2785 return;
2786 }
2787 if (nread) {
2788 c->querybuf = sdscatlen(c->querybuf, buf, nread);
2789 c->lastinteraction = time(NULL);
2790 } else {
2791 return;
2792 }
2793 processInputBuffer(c);
2794 }
2795
2796 static int selectDb(redisClient *c, int id) {
2797 if (id < 0 || id >= server.dbnum)
2798 return REDIS_ERR;
2799 c->db = &server.db[id];
2800 return REDIS_OK;
2801 }
2802
2803 static void *dupClientReplyValue(void *o) {
2804 incrRefCount((robj*)o);
2805 return o;
2806 }
2807
/* Match method for lists of string objects: forwards both elements to
 * equalStringObjects() and returns its result. */
static int listMatchObjects(void *a, void *b) {
    int same = equalStringObjects(a,b);

    return same;
}
2811
2812 static redisClient *createClient(int fd) {
2813 redisClient *c = zmalloc(sizeof(*c));
2814
2815 anetNonBlock(NULL,fd);
2816 anetTcpNoDelay(NULL,fd);
2817 if (!c) return NULL;
2818 selectDb(c,0);
2819 c->fd = fd;
2820 c->querybuf = sdsempty();
2821 c->argc = 0;
2822 c->argv = NULL;
2823 c->bulklen = -1;
2824 c->multibulk = 0;
2825 c->mbargc = 0;
2826 c->mbargv = NULL;
2827 c->sentlen = 0;
2828 c->flags = 0;
2829 c->lastinteraction = time(NULL);
2830 c->authenticated = 0;
2831 c->replstate = REDIS_REPL_NONE;
2832 c->reply = listCreate();
2833 listSetFreeMethod(c->reply,decrRefCount);
2834 listSetDupMethod(c->reply,dupClientReplyValue);
2835 c->blocking_keys = NULL;
2836 c->blocking_keys_num = 0;
2837 c->io_keys = listCreate();
2838 c->watched_keys = listCreate();
2839 listSetFreeMethod(c->io_keys,decrRefCount);
2840 c->pubsub_channels = dictCreate(&setDictType,NULL);
2841 c->pubsub_patterns = listCreate();
2842 listSetFreeMethod(c->pubsub_patterns,decrRefCount);
2843 listSetMatchMethod(c->pubsub_patterns,listMatchObjects);
2844 if (aeCreateFileEvent(server.el, c->fd, AE_READABLE,
2845 readQueryFromClient, c) == AE_ERR) {
2846 freeClient(c);
2847 return NULL;
2848 }
2849 listAddNodeTail(server.clients,c);
2850 initClientMultiState(c);
2851 return c;
2852 }
2853
2854 static void addReply(redisClient *c, robj *obj) {
2855 if (listLength(c->reply) == 0 &&
2856 (c->replstate == REDIS_REPL_NONE ||
2857 c->replstate == REDIS_REPL_ONLINE) &&
2858 aeCreateFileEvent(server.el, c->fd, AE_WRITABLE,
2859 sendReplyToClient, c) == AE_ERR) return;
2860
2861 if (server.vm_enabled && obj->storage != REDIS_VM_MEMORY) {
2862 obj = dupStringObject(obj);
2863 obj->refcount = 0; /* getDecodedObject() will increment the refcount */
2864 }
2865 listAddNodeTail(c->reply,getDecodedObject(obj));
2866 }
2867
2868 static void addReplySds(redisClient *c, sds s) {
2869 robj *o = createObject(REDIS_STRING,s);
2870 addReply(c,o);
2871 decrRefCount(o);
2872 }
2873
2874 static void addReplyDouble(redisClient *c, double d) {
2875 char buf[128];
2876
2877 snprintf(buf,sizeof(buf),"%.17g",d);
2878 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n%s\r\n",
2879 (unsigned long) strlen(buf),buf));
2880 }
2881
2882 static void addReplyLongLong(redisClient *c, long long ll) {
2883 char buf[128];
2884 size_t len;
2885
2886 if (ll == 0) {
2887 addReply(c,shared.czero);
2888 return;
2889 } else if (ll == 1) {
2890 addReply(c,shared.cone);
2891 return;
2892 }
2893 buf[0] = ':';
2894 len = ll2string(buf+1,sizeof(buf)-1,ll);
2895 buf[len+1] = '\r';
2896 buf[len+2] = '\n';
2897 addReplySds(c,sdsnewlen(buf,len+3));
2898 }
2899
2900 static void addReplyUlong(redisClient *c, unsigned long ul) {
2901 char buf[128];
2902 size_t len;
2903
2904 if (ul == 0) {
2905 addReply(c,shared.czero);
2906 return;
2907 } else if (ul == 1) {
2908 addReply(c,shared.cone);
2909 return;
2910 }
2911 len = snprintf(buf,sizeof(buf),":%lu\r\n",ul);
2912 addReplySds(c,sdsnewlen(buf,len));
2913 }
2914
2915 static void addReplyBulkLen(redisClient *c, robj *obj) {
2916 size_t len, intlen;
2917 char buf[128];
2918
2919 if (obj->encoding == REDIS_ENCODING_RAW) {
2920 len = sdslen(obj->ptr);
2921 } else {
2922 long n = (long)obj->ptr;
2923
2924 /* Compute how many bytes will take this integer as a radix 10 string */
2925 len = 1;
2926 if (n < 0) {
2927 len++;
2928 n = -n;
2929 }
2930 while((n = n/10) != 0) {
2931 len++;
2932 }
2933 }
2934 buf[0] = '$';
2935 intlen = ll2string(buf+1,sizeof(buf)-1,(long long)len);
2936 buf[intlen+1] = '\r';
2937 buf[intlen+2] = '\n';
2938 addReplySds(c,sdsnewlen(buf,intlen+3));
2939 }
2940
2941 static void addReplyBulk(redisClient *c, robj *obj) {
2942 addReplyBulkLen(c,obj);
2943 addReply(c,obj);
2944 addReply(c,shared.crlf);
2945 }
2946
2947 static void addReplyBulkSds(redisClient *c, sds s) {
2948 robj *o = createStringObject(s, sdslen(s));
2949 addReplyBulk(c,o);
2950 decrRefCount(o);
2951 }
2952
2953 /* In the CONFIG command we need to add vanilla C string as bulk replies */
2954 static void addReplyBulkCString(redisClient *c, char *s) {
2955 if (s == NULL) {
2956 addReply(c,shared.nullbulk);
2957 } else {
2958 robj *o = createStringObject(s,strlen(s));
2959 addReplyBulk(c,o);
2960 decrRefCount(o);
2961 }
2962 }
2963
2964 static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
2965 int cport, cfd;
2966 char cip[128];
2967 redisClient *c;
2968 REDIS_NOTUSED(el);
2969 REDIS_NOTUSED(mask);
2970 REDIS_NOTUSED(privdata);
2971
2972 cfd = anetAccept(server.neterr, fd, cip, &cport);
2973 if (cfd == AE_ERR) {
2974 redisLog(REDIS_VERBOSE,"Accepting client connection: %s", server.neterr);
2975 return;
2976 }
2977 redisLog(REDIS_VERBOSE,"Accepted %s:%d", cip, cport);
2978 if ((c = createClient(cfd)) == NULL) {
2979 redisLog(REDIS_WARNING,"Error allocating resoures for the client");
2980 close(cfd); /* May be already closed, just ingore errors */
2981 return;
2982 }
2983 /* If maxclient directive is set and this is one client more... close the
2984 * connection. Note that we create the client instead to check before
2985 * for this condition, since now the socket is already set in nonblocking
2986 * mode and we can send an error for free using the Kernel I/O */
2987 if (server.maxclients && listLength(server.clients) > server.maxclients) {
2988 char *err = "-ERR max number of clients reached\r\n";
2989
2990 /* That's a best effort error message, don't check write errors */
2991 if (write(c->fd,err,strlen(err)) == -1) {
2992 /* Nothing to do, Just to avoid the warning... */
2993 }
2994 freeClient(c);
2995 return;
2996 }
2997 server.stat_numconnections++;
2998 }
2999
3000 /* ======================= Redis objects implementation ===================== */
3001
3002 static robj *createObject(int type, void *ptr) {
3003 robj *o;
3004
3005 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
3006 if (listLength(server.objfreelist)) {
3007 listNode *head = listFirst(server.objfreelist);
3008 o = listNodeValue(head);
3009 listDelNode(server.objfreelist,head);
3010 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
3011 } else {
3012 if (server.vm_enabled)
3013 pthread_mutex_unlock(&server.obj_freelist_mutex);
3014 o = zmalloc(sizeof(*o));
3015 }
3016 o->type = type;
3017 o->encoding = REDIS_ENCODING_RAW;
3018 o->ptr = ptr;
3019 o->refcount = 1;
3020 if (server.vm_enabled) {
3021 /* Note that this code may run in the context of an I/O thread
3022 * and accessing server.lruclock in theory is an error
3023 * (no locks). But in practice this is safe, and even if we read
3024 * garbage Redis will not fail. */
3025 o->lru = server.lruclock;
3026 o->storage = REDIS_VM_MEMORY;
3027 }
3028 return o;
3029 }
3030
3031 static robj *createStringObject(char *ptr, size_t len) {
3032 return createObject(REDIS_STRING,sdsnewlen(ptr,len));
3033 }
3034
3035 static robj *createStringObjectFromLongLong(long long value) {
3036 robj *o;
3037 if (value >= 0 && value < REDIS_SHARED_INTEGERS) {
3038 incrRefCount(shared.integers[value]);
3039 o = shared.integers[value];
3040 } else {
3041 if (value >= LONG_MIN && value <= LONG_MAX) {
3042 o = createObject(REDIS_STRING, NULL);
3043 o->encoding = REDIS_ENCODING_INT;
3044 o->ptr = (void*)((long)value);
3045 } else {
3046 o = createObject(REDIS_STRING,sdsfromlonglong(value));
3047 }
3048 }
3049 return o;
3050 }
3051
3052 static robj *dupStringObject(robj *o) {
3053 assert(o->encoding == REDIS_ENCODING_RAW);
3054 return createStringObject(o->ptr,sdslen(o->ptr));
3055 }
3056
3057 static robj *createListObject(void) {
3058 list *l = listCreate();
3059 robj *o = createObject(REDIS_LIST,l);
3060 listSetFreeMethod(l,decrRefCount);
3061 o->encoding = REDIS_ENCODING_LIST;
3062 return o;
3063 }
3064
3065 static robj *createZiplistObject(void) {
3066 unsigned char *zl = ziplistNew();
3067 robj *o = createObject(REDIS_LIST,zl);
3068 o->encoding = REDIS_ENCODING_ZIPLIST;
3069 return o;
3070 }
3071
3072 static robj *createSetObject(void) {
3073 dict *d = dictCreate(&setDictType,NULL);
3074 return createObject(REDIS_SET,d);
3075 }
3076
3077 static robj *createHashObject(void) {
3078 /* All the Hashes start as zipmaps. Will be automatically converted
3079 * into hash tables if there are enough elements or big elements
3080 * inside. */
3081 unsigned char *zm = zipmapNew();
3082 robj *o = createObject(REDIS_HASH,zm);
3083 o->encoding = REDIS_ENCODING_ZIPMAP;
3084 return o;
3085 }
3086
3087 static robj *createZsetObject(void) {
3088 zset *zs = zmalloc(sizeof(*zs));
3089
3090 zs->dict = dictCreate(&zsetDictType,NULL);
3091 zs->zsl = zslCreate();
3092 return createObject(REDIS_ZSET,zs);
3093 }
3094
3095 static void freeStringObject(robj *o) {
3096 if (o->encoding == REDIS_ENCODING_RAW) {
3097 sdsfree(o->ptr);
3098 }
3099 }
3100
3101 static void freeListObject(robj *o) {
3102 switch (o->encoding) {
3103 case REDIS_ENCODING_LIST:
3104 listRelease((list*) o->ptr);
3105 break;
3106 case REDIS_ENCODING_ZIPLIST:
3107 zfree(o->ptr);
3108 break;
3109 default:
3110 redisPanic("Unknown list encoding type");
3111 }
3112 }
3113
3114 static void freeSetObject(robj *o) {
3115 dictRelease((dict*) o->ptr);
3116 }
3117
3118 static void freeZsetObject(robj *o) {
3119 zset *zs = o->ptr;
3120
3121 dictRelease(zs->dict);
3122 zslFree(zs->zsl);
3123 zfree(zs);
3124 }
3125
3126 static void freeHashObject(robj *o) {
3127 switch (o->encoding) {
3128 case REDIS_ENCODING_HT:
3129 dictRelease((dict*) o->ptr);
3130 break;
3131 case REDIS_ENCODING_ZIPMAP:
3132 zfree(o->ptr);
3133 break;
3134 default:
3135 redisPanic("Unknown hash encoding type");
3136 break;
3137 }
3138 }
3139
3140 static void incrRefCount(robj *o) {
3141 o->refcount++;
3142 }
3143
3144 static void decrRefCount(void *obj) {
3145 robj *o = obj;
3146
3147 /* Object is a swapped out value, or in the process of being loaded. */
3148 if (server.vm_enabled &&
3149 (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING))
3150 {
3151 vmpointer *vp = obj;
3152 if (o->storage == REDIS_VM_LOADING) vmCancelThreadedIOJob(o);
3153 vmMarkPagesFree(vp->page,vp->usedpages);
3154 server.vm_stats_swapped_objects--;
3155 zfree(vp);
3156 return;
3157 }
3158
3159 if (o->refcount <= 0) redisPanic("decrRefCount against refcount <= 0");
3160 /* Object is in memory, or in the process of being swapped out.
3161 *
3162 * If the object is being swapped out, abort the operation on
3163 * decrRefCount even if the refcount does not drop to 0: the object
3164 * is referenced at least two times, as value of the key AND as
3165 * job->val in the iojob. So if we don't invalidate the iojob, when it is
3166 * done but the relevant key was removed in the meantime, the
3167 * complete jobs handler will not find the key about the job and the
3168 * assert will fail. */
3169 if (server.vm_enabled && o->storage == REDIS_VM_SWAPPING)
3170 vmCancelThreadedIOJob(o);
3171 if (--(o->refcount) == 0) {
3172 switch(o->type) {
3173 case REDIS_STRING: freeStringObject(o); break;
3174 case REDIS_LIST: freeListObject(o); break;
3175 case REDIS_SET: freeSetObject(o); break;
3176 case REDIS_ZSET: freeZsetObject(o); break;
3177 case REDIS_HASH: freeHashObject(o); break;
3178 default: redisPanic("Unknown object type"); break;
3179 }
3180 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
3181 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
3182 !listAddNodeHead(server.objfreelist,o))
3183 zfree(o);
3184 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
3185 }
3186 }
3187
3188 static int checkType(redisClient *c, robj *o, int type) {
3189 if (o->type != type) {
3190 addReply(c,shared.wrongtypeerr);
3191 return 1;
3192 }
3193 return 0;
3194 }
3195
3196 /* Check if the nul-terminated string 's' can be represented by a long
3197 * (that is, is a number that fits into long without any other space or
3198 * character before or after the digits).
3199 *
3200 * If so, the function returns REDIS_OK and *longval is set to the value
3201 * of the number. Otherwise REDIS_ERR is returned */
3202 static int isStringRepresentableAsLong(sds s, long *longval) {
3203 char buf[32], *endptr;
3204 long value;
3205 int slen;
3206
3207 value = strtol(s, &endptr, 10);
3208 if (endptr[0] != '\0') return REDIS_ERR;
3209 slen = ll2string(buf,32,value);
3210
3211 /* If the number converted back into a string is not identical
3212 * then it's not possible to encode the string as integer */
3213 if (sdslen(s) != (unsigned)slen || memcmp(buf,s,slen)) return REDIS_ERR;
3214 if (longval) *longval = value;
3215 return REDIS_OK;
3216 }
3217
3218 /* Try to encode a string object in order to save space */
3219 static robj *tryObjectEncoding(robj *o) {
3220 long value;
3221 sds s = o->ptr;
3222
3223 if (o->encoding != REDIS_ENCODING_RAW)
3224 return o; /* Already encoded */
3225
3226 /* It's not safe to encode shared objects: shared objects can be shared
3227 * everywhere in the "object space" of Redis. Encoded objects can only
3228 * appear as "values" (and not, for instance, as keys) */
3229 if (o->refcount > 1) return o;
3230
3231 /* Currently we try to encode only strings */
3232 redisAssert(o->type == REDIS_STRING);
3233
3234 /* Check if we can represent this string as a long integer */
3235 if (isStringRepresentableAsLong(s,&value) == REDIS_ERR) return o;
3236
3237 /* Ok, this object can be encoded */
3238 if (value >= 0 && value < REDIS_SHARED_INTEGERS) {
3239 decrRefCount(o);
3240 incrRefCount(shared.integers[value]);
3241 return shared.integers[value];
3242 } else {
3243 o->encoding = REDIS_ENCODING_INT;
3244 sdsfree(o->ptr);
3245 o->ptr = (void*) value;
3246 return o;
3247 }
3248 }
3249
3250 /* Get a decoded version of an encoded object (returned as a new object).
3251 * If the object is already raw-encoded just increment the ref count. */
3252 static robj *getDecodedObject(robj *o) {
3253 robj *dec;
3254
3255 if (o->encoding == REDIS_ENCODING_RAW) {
3256 incrRefCount(o);
3257 return o;
3258 }
3259 if (o->type == REDIS_STRING && o->encoding == REDIS_ENCODING_INT) {
3260 char buf[32];
3261
3262 ll2string(buf,32,(long)o->ptr);
3263 dec = createStringObject(buf,strlen(buf));
3264 return dec;
3265 } else {
3266 redisPanic("Unknown encoding type");
3267 }
3268 }
3269
3270 /* Compare two string objects via strcmp() or alike.
3271 * Note that the objects may be integer-encoded. In such a case we
3272 * use ll2string() to get a string representation of the numbers on the stack
3273 * and compare the strings, it's much faster than calling getDecodedObject().
3274 *
3275 * Important note: if objects are not integer encoded, but binary-safe strings,
3276 * sdscmp() from sds.c will apply memcmp() so this function ca be considered
3277 * binary safe. */
3278 static int compareStringObjects(robj *a, robj *b) {
3279 redisAssert(a->type == REDIS_STRING && b->type == REDIS_STRING);
3280 char bufa[128], bufb[128], *astr, *bstr;
3281 int bothsds = 1;
3282
3283 if (a == b) return 0;
3284 if (a->encoding != REDIS_ENCODING_RAW) {
3285 ll2string(bufa,sizeof(bufa),(long) a->ptr);
3286 astr = bufa;
3287 bothsds = 0;
3288 } else {
3289 astr = a->ptr;
3290 }
3291 if (b->encoding != REDIS_ENCODING_RAW) {
3292 ll2string(bufb,sizeof(bufb),(long) b->ptr);
3293 bstr = bufb;
3294 bothsds = 0;
3295 } else {
3296 bstr = b->ptr;
3297 }
3298 return bothsds ? sdscmp(astr,bstr) : strcmp(astr,bstr);
3299 }
3300
3301 /* Equal string objects return 1 if the two objects are the same from the
3302 * point of view of a string comparison, otherwise 0 is returned. Note that
3303 * this function is faster then checking for (compareStringObject(a,b) == 0)
3304 * because it can perform some more optimization. */
3305 static int equalStringObjects(robj *a, robj *b) {
3306 if (a->encoding != REDIS_ENCODING_RAW && b->encoding != REDIS_ENCODING_RAW){
3307 return a->ptr == b->ptr;
3308 } else {
3309 return compareStringObjects(a,b) == 0;
3310 }
3311 }
3312
3313 static size_t stringObjectLen(robj *o) {
3314 redisAssert(o->type == REDIS_STRING);
3315 if (o->encoding == REDIS_ENCODING_RAW) {
3316 return sdslen(o->ptr);
3317 } else {
3318 char buf[32];
3319
3320 return ll2string(buf,32,(long)o->ptr);
3321 }
3322 }
3323
3324 static int getDoubleFromObject(robj *o, double *target) {
3325 double value;
3326 char *eptr;
3327
3328 if (o == NULL) {
3329 value = 0;
3330 } else {
3331 redisAssert(o->type == REDIS_STRING);
3332 if (o->encoding == REDIS_ENCODING_RAW) {
3333 value = strtod(o->ptr, &eptr);
3334 if (eptr[0] != '\0') return REDIS_ERR;
3335 } else if (o->encoding == REDIS_ENCODING_INT) {
3336 value = (long)o->ptr;
3337 } else {
3338 redisPanic("Unknown string encoding");
3339 }
3340 }
3341
3342 *target = value;
3343 return REDIS_OK;
3344 }
3345
3346 static int getDoubleFromObjectOrReply(redisClient *c, robj *o, double *target, const char *msg) {
3347 double value;
3348 if (getDoubleFromObject(o, &value) != REDIS_OK) {
3349 if (msg != NULL) {
3350 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3351 } else {
3352 addReplySds(c, sdsnew("-ERR value is not a double\r\n"));
3353 }
3354 return REDIS_ERR;
3355 }
3356
3357 *target = value;
3358 return REDIS_OK;
3359 }
3360
3361 static int getLongLongFromObject(robj *o, long long *target) {
3362 long long value;
3363 char *eptr;
3364
3365 if (o == NULL) {
3366 value = 0;
3367 } else {
3368 redisAssert(o->type == REDIS_STRING);
3369 if (o->encoding == REDIS_ENCODING_RAW) {
3370 value = strtoll(o->ptr, &eptr, 10);
3371 if (eptr[0] != '\0') return REDIS_ERR;
3372 } else if (o->encoding == REDIS_ENCODING_INT) {
3373 value = (long)o->ptr;
3374 } else {
3375 redisPanic("Unknown string encoding");
3376 }
3377 }
3378
3379 *target = value;
3380 return REDIS_OK;
3381 }
3382
3383 static int getLongLongFromObjectOrReply(redisClient *c, robj *o, long long *target, const char *msg) {
3384 long long value;
3385 if (getLongLongFromObject(o, &value) != REDIS_OK) {
3386 if (msg != NULL) {
3387 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3388 } else {
3389 addReplySds(c, sdsnew("-ERR value is not an integer\r\n"));
3390 }
3391 return REDIS_ERR;
3392 }
3393
3394 *target = value;
3395 return REDIS_OK;
3396 }
3397
3398 static int getLongFromObjectOrReply(redisClient *c, robj *o, long *target, const char *msg) {
3399 long long value;
3400
3401 if (getLongLongFromObjectOrReply(c, o, &value, msg) != REDIS_OK) return REDIS_ERR;
3402 if (value < LONG_MIN || value > LONG_MAX) {
3403 if (msg != NULL) {
3404 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3405 } else {
3406 addReplySds(c, sdsnew("-ERR value is out of range\r\n"));
3407 }
3408 return REDIS_ERR;
3409 }
3410
3411 *target = value;
3412 return REDIS_OK;
3413 }
3414
3415 /* =========================== Keyspace access API ========================== */
3416
3417 static robj *lookupKey(redisDb *db, robj *key) {
3418 dictEntry *de = dictFind(db->dict,key->ptr);
3419 if (de) {
3420 robj *val = dictGetEntryVal(de);
3421
3422 if (server.vm_enabled) {
3423 if (val->storage == REDIS_VM_MEMORY ||
3424 val->storage == REDIS_VM_SWAPPING)
3425 {
3426 /* If we were swapping the object out, cancel the operation */
3427 if (val->storage == REDIS_VM_SWAPPING)
3428 vmCancelThreadedIOJob(val);
3429 /* Update the access time for the aging algorithm. */
3430 val->lru = server.lruclock;
3431 } else {
3432 int notify = (val->storage == REDIS_VM_LOADING);
3433
3434 /* Our value was swapped on disk. Bring it at home. */
3435 redisAssert(val->type == REDIS_VMPOINTER);
3436 val = vmLoadObject(val);
3437 dictGetEntryVal(de) = val;
3438
3439 /* Clients blocked by the VM subsystem may be waiting for
3440 * this key... */
3441 if (notify) handleClientsBlockedOnSwappedKey(db,key);
3442 }
3443 }
3444 return val;
3445 } else {
3446 return NULL;
3447 }
3448 }
3449
3450 static robj *lookupKeyRead(redisDb *db, robj *key) {
3451 expireIfNeeded(db,key);
3452 return lookupKey(db,key);
3453 }
3454
3455 static robj *lookupKeyWrite(redisDb *db, robj *key) {
3456 deleteIfVolatile(db,key);
3457 touchWatchedKey(db,key);
3458 return lookupKey(db,key);
3459 }
3460
3461 static robj *lookupKeyReadOrReply(redisClient *c, robj *key, robj *reply) {
3462 robj *o = lookupKeyRead(c->db, key);
3463 if (!o) addReply(c,reply);
3464 return o;
3465 }
3466
3467 static robj *lookupKeyWriteOrReply(redisClient *c, robj *key, robj *reply) {
3468 robj *o = lookupKeyWrite(c->db, key);
3469 if (!o) addReply(c,reply);
3470 return o;
3471 }
3472
3473 /* Add the key to the DB. If the key already exists REDIS_ERR is returned,
3474 * otherwise REDIS_OK is returned, and the caller should increment the
3475 * refcount of 'val'. */
3476 static int dbAdd(redisDb *db, robj *key, robj *val) {
3477 /* Perform a lookup before adding the key, as we need to copy the
3478 * key value. */
3479 if (dictFind(db->dict, key->ptr) != NULL) {
3480 return REDIS_ERR;
3481 } else {
3482 sds copy = sdsdup(key->ptr);
3483 dictAdd(db->dict, copy, val);
3484 return REDIS_OK;
3485 }
3486 }
3487
3488 /* If the key does not exist, this is just like dbAdd(). Otherwise
3489 * the value associated to the key is replaced with the new one.
3490 *
3491 * On update (key already existed) 0 is returned. Otherwise 1. */
3492 static int dbReplace(redisDb *db, robj *key, robj *val) {
3493 if (dictFind(db->dict,key->ptr) == NULL) {
3494 sds copy = sdsdup(key->ptr);
3495 dictAdd(db->dict, copy, val);
3496 return 1;
3497 } else {
3498 dictReplace(db->dict, key->ptr, val);
3499 return 0;
3500 }
3501 }
3502
3503 static int dbExists(redisDb *db, robj *key) {
3504 return dictFind(db->dict,key->ptr) != NULL;
3505 }
3506
3507 /* Return a random key, in form of a Redis object.
3508 * If there are no keys, NULL is returned.
3509 *
3510 * The function makes sure to return keys not already expired. */
3511 static robj *dbRandomKey(redisDb *db) {
3512 struct dictEntry *de;
3513
3514 while(1) {
3515 sds key;
3516 robj *keyobj;
3517
3518 de = dictGetRandomKey(db->dict);
3519 if (de == NULL) return NULL;
3520
3521 key = dictGetEntryKey(de);
3522 keyobj = createStringObject(key,sdslen(key));
3523 if (dictFind(db->expires,key)) {
3524 if (expireIfNeeded(db,keyobj)) {
3525 decrRefCount(keyobj);
3526 continue; /* search for another key. This expired. */
3527 }
3528 }
3529 return keyobj;
3530 }
3531 }
3532
3533 /* Delete a key, value, and associated expiration entry if any, from the DB */
3534 static int dbDelete(redisDb *db, robj *key) {
3535 /* Deleting an entry from the expires dict will not free the sds of
3536 * the key, because it is shared with the main dictionary. */
3537 if (dictSize(db->expires) > 0) dictDelete(db->expires,key->ptr);
3538 return dictDelete(db->dict,key->ptr) == DICT_OK;
3539 }
3540
3541 /*============================ RDB saving/loading =========================== */
3542
/* Write the single byte 'type' to 'fp'.
 * Returns 0 on success, -1 on write error. */
static int rdbSaveType(FILE *fp, unsigned char type) {
    size_t written = fwrite(&type,1,1,fp);

    return (written == 0) ? -1 : 0;
}
3547
/* Write 't' to 'fp' as a 32 bit signed integer in host byte order.
 * Returns 0 on success, -1 on write error. */
static int rdbSaveTime(FILE *fp, time_t t) {
    int32_t stamp = (int32_t) t;

    if (fwrite(&stamp,4,1,fp) != 0) return 0;
    return -1;
}
3553
3554 /* check rdbLoadLen() comments for more info */
3555 static int rdbSaveLen(FILE *fp, uint32_t len) {
3556 unsigned char buf[2];
3557
3558 if (len < (1<<6)) {
3559 /* Save a 6 bit len */
3560 buf[0] = (len&0xFF)|(REDIS_RDB_6BITLEN<<6);
3561 if (fwrite(buf,1,1,fp) == 0) return -1;
3562 } else if (len < (1<<14)) {
3563 /* Save a 14 bit len */
3564 buf[0] = ((len>>8)&0xFF)|(REDIS_RDB_14BITLEN<<6);
3565 buf[1] = len&0xFF;
3566 if (fwrite(buf,2,1,fp) == 0) return -1;
3567 } else {
3568 /* Save a 32 bit len */
3569 buf[0] = (REDIS_RDB_32BITLEN<<6);
3570 if (fwrite(buf,1,1,fp) == 0) return -1;
3571 len = htonl(len);
3572 if (fwrite(&len,4,1,fp) == 0) return -1;
3573 }
3574 return 0;
3575 }
3576
3577 /* Encode 'value' as an integer if possible (if integer will fit the
3578 * supported range). If the function sucessful encoded the integer
3579 * then the (up to 5 bytes) encoded representation is written in the
3580 * string pointed by 'enc' and the length is returned. Otherwise
3581 * 0 is returned. */
3582 static int rdbEncodeInteger(long long value, unsigned char *enc) {
3583 /* Finally check if it fits in our ranges */
3584 if (value >= -(1<<7) && value <= (1<<7)-1) {
3585 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT8;
3586 enc[1] = value&0xFF;
3587 return 2;
3588 } else if (value >= -(1<<15) && value <= (1<<15)-1) {
3589 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT16;
3590 enc[1] = value&0xFF;
3591 enc[2] = (value>>8)&0xFF;
3592 return 3;
3593 } else if (value >= -((long long)1<<31) && value <= ((long long)1<<31)-1) {
3594 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT32;
3595 enc[1] = value&0xFF;
3596 enc[2] = (value>>8)&0xFF;
3597 enc[3] = (value>>16)&0xFF;
3598 enc[4] = (value>>24)&0xFF;
3599 return 5;
3600 } else {
3601 return 0;
3602 }
3603 }
3604
/* String objects in the form "2391" "-100" without any space and with a
 * range of values that can fit in an 8, 16 or 32 bit signed value can be
 * encoded as integers to save space.
 *
 * 's' points to 'len' bytes and does NOT need to be NUL terminated: the
 * bytes are copied into a local terminated buffer before parsing, so
 * strtoll() never reads past 'len'.
 *
 * Returns the encoded length written into 'enc' (see rdbEncodeInteger()),
 * or 0 if the string can't be represented as an encoded integer. */
static int rdbTryIntegerEncoding(char *s, size_t len, unsigned char *enc) {
    long long value;
    char *endptr, copy[32], buf[32];

    /* An empty string is not a number, and nothing ll2string() produces
     * can be 32 bytes or longer. */
    if (len == 0 || len >= sizeof(copy)) return 0;
    memcpy(copy,s,len);
    copy[len] = '\0';

    /* Check if it's possible to encode this value as a number */
    value = strtoll(copy, &endptr, 10);
    if (endptr[0] != '\0') return 0;
    ll2string(buf,32,value);

    /* If the number converted back into a string is not identical
     * then it's not possible to encode the string as integer */
    if (strlen(buf) != len || memcmp(buf,s,len)) return 0;

    return rdbEncodeInteger(value,enc);
}
3623
3624 static int rdbSaveLzfStringObject(FILE *fp, unsigned char *s, size_t len) {
3625 size_t comprlen, outlen;
3626 unsigned char byte;
3627 void *out;
3628
3629 /* We require at least four bytes compression for this to be worth it */
3630 if (len <= 4) return 0;
3631 outlen = len-4;
3632 if ((out = zmalloc(outlen+1)) == NULL) return 0;
3633 comprlen = lzf_compress(s, len, out, outlen);
3634 if (comprlen == 0) {
3635 zfree(out);
3636 return 0;
3637 }
3638 /* Data compressed! Let's save it on disk */
3639 byte = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_LZF;
3640 if (fwrite(&byte,1,1,fp) == 0) goto writeerr;
3641 if (rdbSaveLen(fp,comprlen) == -1) goto writeerr;
3642 if (rdbSaveLen(fp,len) == -1) goto writeerr;
3643 if (fwrite(out,comprlen,1,fp) == 0) goto writeerr;
3644 zfree(out);
3645 return comprlen;
3646
3647 writeerr:
3648 zfree(out);
3649 return -1;
3650 }
3651
3652 /* Save a string objet as [len][data] on disk. If the object is a string
3653 * representation of an integer value we try to safe it in a special form */
3654 static int rdbSaveRawString(FILE *fp, unsigned char *s, size_t len) {
3655 int enclen;
3656
3657 /* Try integer encoding */
3658 if (len <= 11) {
3659 unsigned char buf[5];
3660 if ((enclen = rdbTryIntegerEncoding((char*)s,len,buf)) > 0) {
3661 if (fwrite(buf,enclen,1,fp) == 0) return -1;
3662 return 0;
3663 }
3664 }
3665
3666 /* Try LZF compression - under 20 bytes it's unable to compress even
3667 * aaaaaaaaaaaaaaaaaa so skip it */
3668 if (server.rdbcompression && len > 20) {
3669 int retval;
3670
3671 retval = rdbSaveLzfStringObject(fp,s,len);
3672 if (retval == -1) return -1;
3673 if (retval > 0) return 0;
3674 /* retval == 0 means data can't be compressed, save the old way */
3675 }
3676
3677 /* Store verbatim */
3678 if (rdbSaveLen(fp,len) == -1) return -1;
3679 if (len && fwrite(s,len,1,fp) == 0) return -1;
3680 return 0;
3681 }
3682
/* Save a long long value as an RDB string: integer encoded when the value
 * fits rdbEncodeInteger(), otherwise as the [len][decimal digits] form.
 * Returns 0 on success, -1 on write error. */
static int rdbSaveLongLongAsStringObject(FILE *fp, long long value) {
    unsigned char buf[32];
    int n = rdbEncodeInteger(value,buf);

    if (n <= 0) {
        /* Out of the encodable range: emit the decimal representation,
         * prefixed by its length. */
        n = ll2string((char*)buf,32,value);
        redisAssert(n < 32);
        if (rdbSaveLen(fp,n) == -1) return -1;
    }
    return (fwrite(buf,n,1,fp) == 0) ? -1 : 0;
}
3698
3699 /* Like rdbSaveStringObjectRaw() but handle encoded objects */
3700 static int rdbSaveStringObject(FILE *fp, robj *obj) {
3701 /* Avoid to decode the object, then encode it again, if the
3702 * object is alrady integer encoded. */
3703 if (obj->encoding == REDIS_ENCODING_INT) {
3704 return rdbSaveLongLongAsStringObject(fp,(long)obj->ptr);
3705 } else {
3706 redisAssert(obj->encoding == REDIS_ENCODING_RAW);
3707 return rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
3708 }
3709 }
3710
/* Save a double value. Doubles are saved as strings prefixed by an unsigned
 * 8 bit integer specifying the length of the representation.
 * This 8 bit integer has special values in order to specify the following
 * conditions:
 * 253: not a number
 * 254: + inf
 * 255: - inf
 *
 * Returns 0 on success, -1 on write error.
 */
static int rdbSaveDoubleValue(FILE *fp, double val) {
    unsigned char buf[128];
    int len;

    if (isnan(val)) {
        buf[0] = 253;
        len = 1;
    } else if (!isfinite(val)) {
        len = 1;
        buf[0] = (val < 0) ? 255 : 254;
    } else {
#if (DBL_MANT_DIG >= 52) && (LLONG_MAX == 0x7fffffffffffffffLL)
        /* Check if the float is in a safe range to be casted into a
         * long long. We are assuming that long long is 64 bit here.
         * Also we are assuming that there are no implementations around where
         * double has precision < 52 bit.
         *
         * Under this assumptions we test if a double is inside an interval
         * where casting to long long is safe. Then using two castings we
         * make sure the decimal part is zero. If all this is true we use
         * integer printing function that is much faster. */
        double min = -4503599627370495; /* -((2^52)-1) */
        double max = 4503599627370496; /* 2^52 */
        if (val > min && val < max && val == ((double)((long long)val)))
            /* The text starts at buf+1, so only sizeof(buf)-1 bytes are
             * available to the formatter. */
            ll2string((char*)buf+1,sizeof(buf)-1,(long long)val);
        else
#endif
            snprintf((char*)buf+1,sizeof(buf)-1,"%.17g",val);
        buf[0] = strlen((char*)buf+1);
        len = buf[0]+1;
    }
    if (fwrite(buf,len,1,fp) == 0) return -1;
    return 0;
}
3753
3754 /* Save a Redis object. */
3755 static int rdbSaveObject(FILE *fp, robj *o) {
3756 if (o->type == REDIS_STRING) {
3757 /* Save a string value */
3758 if (rdbSaveStringObject(fp,o) == -1) return -1;
3759 } else if (o->type == REDIS_LIST) {
3760 /* Save a list value */
3761 if (o->encoding == REDIS_ENCODING_ZIPLIST) {
3762 unsigned char *p;
3763 unsigned char *vstr;
3764 unsigned int vlen;
3765 long long vlong;
3766
3767 if (rdbSaveLen(fp,ziplistLen(o->ptr)) == -1) return -1;
3768 p = ziplistIndex(o->ptr,0);
3769 while(ziplistGet(p,&vstr,&vlen,&vlong)) {
3770 if (vstr) {
3771 if (rdbSaveRawString(fp,vstr,vlen) == -1)
3772 return -1;
3773 } else {
3774 if (rdbSaveLongLongAsStringObject(fp,vlong) == -1)
3775 return -1;
3776 }
3777 p = ziplistNext(o->ptr,p);
3778 }
3779 } else if (o->encoding == REDIS_ENCODING_LIST) {
3780 list *list = o->ptr;
3781 listIter li;
3782 listNode *ln;
3783
3784 if (rdbSaveLen(fp,listLength(list)) == -1) return -1;
3785 listRewind(list,&li);
3786 while((ln = listNext(&li))) {
3787 robj *eleobj = listNodeValue(ln);
3788 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3789 }
3790 } else {
3791 redisPanic("Unknown list encoding");
3792 }
3793 } else if (o->type == REDIS_SET) {
3794 /* Save a set value */
3795 dict *set = o->ptr;
3796 dictIterator *di = dictGetIterator(set);
3797 dictEntry *de;
3798
3799 if (rdbSaveLen(fp,dictSize(set)) == -1) return -1;
3800 while((de = dictNext(di)) != NULL) {
3801 robj *eleobj = dictGetEntryKey(de);
3802
3803 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3804 }
3805 dictReleaseIterator(di);
3806 } else if (o->type == REDIS_ZSET) {
3807 /* Save a set value */
3808 zset *zs = o->ptr;
3809 dictIterator *di = dictGetIterator(zs->dict);
3810 dictEntry *de;
3811
3812 if (rdbSaveLen(fp,dictSize(zs->dict)) == -1) return -1;
3813 while((de = dictNext(di)) != NULL) {
3814 robj *eleobj = dictGetEntryKey(de);
3815 double *score = dictGetEntryVal(de);
3816
3817 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3818 if (rdbSaveDoubleValue(fp,*score) == -1) return -1;
3819 }
3820 dictReleaseIterator(di);
3821 } else if (o->type == REDIS_HASH) {
3822 /* Save a hash value */
3823 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
3824 unsigned char *p = zipmapRewind(o->ptr);
3825 unsigned int count = zipmapLen(o->ptr);
3826 unsigned char *key, *val;
3827 unsigned int klen, vlen;
3828
3829 if (rdbSaveLen(fp,count) == -1) return -1;
3830 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
3831 if (rdbSaveRawString(fp,key,klen) == -1) return -1;
3832 if (rdbSaveRawString(fp,val,vlen) == -1) return -1;
3833 }
3834 } else {
3835 dictIterator *di = dictGetIterator(o->ptr);
3836 dictEntry *de;
3837
3838 if (rdbSaveLen(fp,dictSize((dict*)o->ptr)) == -1) return -1;
3839 while((de = dictNext(di)) != NULL) {
3840 robj *key = dictGetEntryKey(de);
3841 robj *val = dictGetEntryVal(de);
3842
3843 if (rdbSaveStringObject(fp,key) == -1) return -1;
3844 if (rdbSaveStringObject(fp,val) == -1) return -1;
3845 }
3846 dictReleaseIterator(di);
3847 }
3848 } else {
3849 redisPanic("Unknown object type");
3850 }
3851 return 0;
3852 }
3853
3854 /* Return the length the object will have on disk if saved with
3855 * the rdbSaveObject() function. Currently we use a trick to get
3856 * this length with very little changes to the code. In the future
3857 * we could switch to a faster solution. */
3858 static off_t rdbSavedObjectLen(robj *o, FILE *fp) {
3859 if (fp == NULL) fp = server.devnull;
3860 rewind(fp);
3861 assert(rdbSaveObject(fp,o) != 1);
3862 return ftello(fp);
3863 }
3864
3865 /* Return the number of pages required to save this object in the swap file */
3866 static off_t rdbSavedObjectPages(robj *o, FILE *fp) {
3867 off_t bytes = rdbSavedObjectLen(o,fp);
3868
3869 return (bytes+(server.vm_page_size-1))/server.vm_page_size;
3870 }
3871
3872 /* Save the DB on disk. Return REDIS_ERR on error, REDIS_OK on success */
3873 static int rdbSave(char *filename) {
3874 dictIterator *di = NULL;
3875 dictEntry *de;
3876 FILE *fp;
3877 char tmpfile[256];
3878 int j;
3879 time_t now = time(NULL);
3880
3881 /* Wait for I/O therads to terminate, just in case this is a
3882 * foreground-saving, to avoid seeking the swap file descriptor at the
3883 * same time. */
3884 if (server.vm_enabled)
3885 waitEmptyIOJobsQueue();
3886
3887 snprintf(tmpfile,256,"temp-%d.rdb", (int) getpid());
3888 fp = fopen(tmpfile,"w");
3889 if (!fp) {
3890 redisLog(REDIS_WARNING, "Failed saving the DB: %s", strerror(errno));
3891 return REDIS_ERR;
3892 }
3893 if (fwrite("REDIS0001",9,1,fp) == 0) goto werr;
3894 for (j = 0; j < server.dbnum; j++) {
3895 redisDb *db = server.db+j;
3896 dict *d = db->dict;
3897 if (dictSize(d) == 0) continue;
3898 di = dictGetIterator(d);
3899 if (!di) {
3900 fclose(fp);
3901 return REDIS_ERR;
3902 }
3903
3904 /* Write the SELECT DB opcode */
3905 if (rdbSaveType(fp,REDIS_SELECTDB) == -1) goto werr;
3906 if (rdbSaveLen(fp,j) == -1) goto werr;
3907
3908 /* Iterate this DB writing every entry */
3909 while((de = dictNext(di)) != NULL) {
3910 sds keystr = dictGetEntryKey(de);
3911 robj key, *o = dictGetEntryVal(de);
3912 time_t expiretime;
3913
3914 initStaticStringObject(key,keystr);
3915 expiretime = getExpire(db,&key);
3916
3917 /* Save the expire time */
3918 if (expiretime != -1) {
3919 /* If this key is already expired skip it */
3920 if (expiretime < now) continue;
3921 if (rdbSaveType(fp,REDIS_EXPIRETIME) == -1) goto werr;
3922 if (rdbSaveTime(fp,expiretime) == -1) goto werr;
3923 }
3924 /* Save the key and associated value. This requires special
3925 * handling if the value is swapped out. */
3926 if (!server.vm_enabled || o->storage == REDIS_VM_MEMORY ||
3927 o->storage == REDIS_VM_SWAPPING) {
3928 /* Save type, key, value */
3929 if (rdbSaveType(fp,o->type) == -1) goto werr;
3930 if (rdbSaveStringObject(fp,&key) == -1) goto werr;
3931 if (rdbSaveObject(fp,o) == -1) goto werr;
3932 } else {
3933 /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
3934 robj *po;
3935 /* Get a preview of the object in memory */
3936 po = vmPreviewObject(o);
3937 /* Save type, key, value */
3938 if (rdbSaveType(fp,po->type) == -1) goto werr;
3939 if (rdbSaveStringObject(fp,&key) == -1) goto werr;
3940 if (rdbSaveObject(fp,po) == -1) goto werr;
3941 /* Remove the loaded object from memory */
3942 decrRefCount(po);
3943 }
3944 }
3945 dictReleaseIterator(di);
3946 }
3947 /* EOF opcode */
3948 if (rdbSaveType(fp,REDIS_EOF) == -1) goto werr;
3949
3950 /* Make sure data will not remain on the OS's output buffers */
3951 fflush(fp);
3952 fsync(fileno(fp));
3953 fclose(fp);
3954
3955 /* Use RENAME to make sure the DB file is changed atomically only
3956 * if the generate DB file is ok. */
3957 if (rename(tmpfile,filename) == -1) {
3958 redisLog(REDIS_WARNING,"Error moving temp DB file on the final destination: %s", strerror(errno));
3959 unlink(tmpfile);
3960 return REDIS_ERR;
3961 }
3962 redisLog(REDIS_NOTICE,"DB saved on disk");
3963 server.dirty = 0;
3964 server.lastsave = time(NULL);
3965 return REDIS_OK;
3966
3967 werr:
3968 fclose(fp);
3969 unlink(tmpfile);
3970 redisLog(REDIS_WARNING,"Write error saving DB on disk: %s", strerror(errno));
3971 if (di) dictReleaseIterator(di);
3972 return REDIS_ERR;
3973 }
3974
3975 static int rdbSaveBackground(char *filename) {
3976 pid_t childpid;
3977
3978 if (server.bgsavechildpid != -1) return REDIS_ERR;
3979 if (server.vm_enabled) waitEmptyIOJobsQueue();
3980 if ((childpid = fork()) == 0) {
3981 /* Child */
3982 if (server.vm_enabled) vmReopenSwapFile();
3983 close(server.fd);
3984 if (rdbSave(filename) == REDIS_OK) {
3985 _exit(0);
3986 } else {
3987 _exit(1);
3988 }
3989 } else {
3990 /* Parent */
3991 if (childpid == -1) {
3992 redisLog(REDIS_WARNING,"Can't save in background: fork: %s",
3993 strerror(errno));
3994 return REDIS_ERR;
3995 }
3996 redisLog(REDIS_NOTICE,"Background saving started by pid %d",childpid);
3997 server.bgsavechildpid = childpid;
3998 updateDictResizePolicy();
3999 return REDIS_OK;
4000 }
4001 return REDIS_OK; /* unreached */
4002 }
4003
/* Remove the temporary dump file "temp-<childpid>.rdb" from the current
 * directory. The result of unlink() is ignored. */
static void rdbRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-%d.rdb", (int) childpid);
    unlink(path);
}
4010
/* Read a one byte type/opcode from 'fp'.
 * Returns the byte value (0-255), or -1 if nothing could be read. */
static int rdbLoadType(FILE *fp) {
    unsigned char byte;

    if (fread(&byte,1,1,fp) != 0) return byte;
    return -1;
}
4016
/* Read a time saved by rdbSaveTime(): a 32 bit signed integer in host
 * byte order. Returns the time, or -1 if it could not be read. */
static time_t rdbLoadTime(FILE *fp) {
    int32_t stamp;

    if (fread(&stamp,4,1,fp) != 0) return (time_t) stamp;
    return -1;
}
4022
4023 /* Load an encoded length from the DB, see the REDIS_RDB_* defines on the top
4024 * of this file for a description of how this are stored on disk.
4025 *
4026 * isencoded is set to 1 if the readed length is not actually a length but
4027 * an "encoding type", check the above comments for more info */
4028 static uint32_t rdbLoadLen(FILE *fp, int *isencoded) {
4029 unsigned char buf[2];
4030 uint32_t len;
4031 int type;
4032
4033 if (isencoded) *isencoded = 0;
4034 if (fread(buf,1,1,fp) == 0) return REDIS_RDB_LENERR;
4035 type = (buf[0]&0xC0)>>6;
4036 if (type == REDIS_RDB_6BITLEN) {
4037 /* Read a 6 bit len */
4038 return buf[0]&0x3F;
4039 } else if (type == REDIS_RDB_ENCVAL) {
4040 /* Read a 6 bit len encoding type */
4041 if (isencoded) *isencoded = 1;
4042 return buf[0]&0x3F;
4043 } else if (type == REDIS_RDB_14BITLEN) {
4044 /* Read a 14 bit len */
4045 if (fread(buf+1,1,1,fp) == 0) return REDIS_RDB_LENERR;
4046 return ((buf[0]&0x3F)<<8)|buf[1];
4047 } else {
4048 /* Read a 32 bit len */
4049 if (fread(&len,4,1,fp) == 0) return REDIS_RDB_LENERR;
4050 return ntohl(len);
4051 }
4052 }
4053
4054 /* Load an integer-encoded object from file 'fp', with the specified
4055 * encoding type 'enctype'. If encode is true the function may return
4056 * an integer-encoded object as reply, otherwise the returned object
4057 * will always be encoded as a raw string. */
4058 static robj *rdbLoadIntegerObject(FILE *fp, int enctype, int encode) {
4059 unsigned char enc[4];
4060 long long val;
4061
4062 if (enctype == REDIS_RDB_ENC_INT8) {
4063 if (fread(enc,1,1,fp) == 0) return NULL;
4064 val = (signed char)enc[0];
4065 } else if (enctype == REDIS_RDB_ENC_INT16) {
4066 uint16_t v;
4067 if (fread(enc,2,1,fp) == 0) return NULL;
4068 v = enc[0]|(enc[1]<<8);
4069 val = (int16_t)v;
4070 } else if (enctype == REDIS_RDB_ENC_INT32) {
4071 uint32_t v;
4072 if (fread(enc,4,1,fp) == 0) return NULL;
4073 v = enc[0]|(enc[1]<<8)|(enc[2]<<16)|(enc[3]<<24);
4074 val = (int32_t)v;
4075 } else {
4076 val = 0; /* anti-warning */
4077 redisPanic("Unknown RDB integer encoding type");
4078 }
4079 if (encode)
4080 return createStringObjectFromLongLong(val);
4081 else
4082 return createObject(REDIS_STRING,sdsfromlonglong(val));
4083 }
4084
4085 static robj *rdbLoadLzfStringObject(FILE*fp) {
4086 unsigned int len, clen;
4087 unsigned char *c = NULL;
4088 sds val = NULL;
4089
4090 if ((clen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4091 if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4092 if ((c = zmalloc(clen)) == NULL) goto err;
4093 if ((val = sdsnewlen(NULL,len)) == NULL) goto err;
4094 if (fread(c,clen,1,fp) == 0) goto err;
4095 if (lzf_decompress(c,clen,val,len) == 0) goto err;
4096 zfree(c);
4097 return createObject(REDIS_STRING,val);
4098 err:
4099 zfree(c);
4100 sdsfree(val);
4101 return NULL;
4102 }
4103
4104 static robj *rdbGenericLoadStringObject(FILE*fp, int encode) {
4105 int isencoded;
4106 uint32_t len;
4107 sds val;
4108
4109 len = rdbLoadLen(fp,&isencoded);
4110 if (isencoded) {
4111 switch(len) {
4112 case REDIS_RDB_ENC_INT8:
4113 case REDIS_RDB_ENC_INT16:
4114 case REDIS_RDB_ENC_INT32:
4115 return rdbLoadIntegerObject(fp,len,encode);
4116 case REDIS_RDB_ENC_LZF:
4117 return rdbLoadLzfStringObject(fp);
4118 default:
4119 redisPanic("Unknown RDB encoding type");
4120 }
4121 }
4122
4123 if (len == REDIS_RDB_LENERR) return NULL;
4124 val = sdsnewlen(NULL,len);
4125 if (len && fread(val,len,1,fp) == 0) {
4126 sdsfree(val);
4127 return NULL;
4128 }
4129 return createObject(REDIS_STRING,val);
4130 }
4131
4132 static robj *rdbLoadStringObject(FILE *fp) {
4133 return rdbGenericLoadStringObject(fp,0);
4134 }
4135
4136 static robj *rdbLoadEncodedStringObject(FILE *fp) {
4137 return rdbGenericLoadStringObject(fp,1);
4138 }
4139
4140 /* For information about double serialization check rdbSaveDoubleValue() */
4141 static int rdbLoadDoubleValue(FILE *fp, double *val) {
4142 char buf[128];
4143 unsigned char len;
4144
4145 if (fread(&len,1,1,fp) == 0) return -1;
4146 switch(len) {
4147 case 255: *val = R_NegInf; return 0;
4148 case 254: *val = R_PosInf; return 0;
4149 case 253: *val = R_Nan; return 0;
4150 default:
4151 if (fread(buf,len,1,fp) == 0) return -1;
4152 buf[len] = '\0';
4153 sscanf(buf, "%lg", val);
4154 return 0;
4155 }
4156 }
4157
4158 /* Load a Redis object of the specified type from the specified file.
4159 * On success a newly allocated object is returned, otherwise NULL. */
4160 static robj *rdbLoadObject(int type, FILE *fp) {
4161 robj *o, *ele, *dec;
4162 size_t len;
4163
4164 redisLog(REDIS_DEBUG,"LOADING OBJECT %d (at %d)\n",type,ftell(fp));
4165 if (type == REDIS_STRING) {
4166 /* Read string value */
4167 if ((o = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
4168 o = tryObjectEncoding(o);
4169 } else if (type == REDIS_LIST) {
4170 /* Read list value */
4171 if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4172
4173 /* Use a real list when there are too many entries */
4174 if (len > server.list_max_ziplist_entries) {
4175 o = createListObject();
4176 } else {
4177 o = createZiplistObject();
4178 }
4179
4180 /* Load every single element of the list */
4181 while(len--) {
4182 if ((ele = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
4183
4184 /* If we are using a ziplist and the value is too big, convert
4185 * the object to a real list. */
4186 if (o->encoding == REDIS_ENCODING_ZIPLIST &&
4187 ele->encoding == REDIS_ENCODING_RAW &&
4188 sdslen(ele->ptr) > server.list_max_ziplist_value)
4189 listTypeConvert(o,REDIS_ENCODING_LIST);
4190
4191 if (o->encoding == REDIS_ENCODING_ZIPLIST) {
4192 dec = getDecodedObject(ele);
4193 o->ptr = ziplistPush(o->ptr,dec->ptr,sdslen(dec->ptr),REDIS_TAIL);
4194 decrRefCount(dec);
4195 decrRefCount(ele);
4196 } else {
4197 ele = tryObjectEncoding(ele);
4198 listAddNodeTail(o->ptr,ele);
4199 }
4200 }
4201 } else if (type == REDIS_SET) {
4202 /* Read list/set value */
4203 if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4204 o = createSetObject();
4205 /* It's faster to expand the dict to the right size asap in order
4206 * to avoid rehashing */
4207 if (len > DICT_HT_INITIAL_SIZE)
4208 dictExpand(o->ptr,len);
4209 /* Load every single element of the list/set */
4210 while(len--) {
4211 if ((ele = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
4212 ele = tryObjectEncoding(ele);
4213 dictAdd((dict*)o->ptr,ele,NULL);
4214 }
4215 } else if (type == REDIS_ZSET) {
4216 /* Read list/set value */
4217 size_t zsetlen;
4218 zset *zs;
4219
4220 if ((zsetlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4221 o = createZsetObject();
4222 zs = o->ptr;
4223 /* Load every single element of the list/set */
4224 while(zsetlen--) {
4225 robj *ele;
4226 double *score = zmalloc(sizeof(double));
4227
4228 if ((ele = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
4229 ele = tryObjectEncoding(ele);
4230 if (rdbLoadDoubleValue(fp,score) == -1) return NULL;
4231 dictAdd(zs->dict,ele,score);
4232 zslInsert(zs->zsl,*score,ele);
4233 incrRefCount(ele); /* added to skiplist */
4234 }
4235 } else if (type == REDIS_HASH) {
4236 size_t hashlen;
4237
4238 if ((hashlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4239 o = createHashObject();
4240 /* Too many entries? Use an hash table. */
4241 if (hashlen > server.hash_max_zipmap_entries)
4242 convertToRealHash(o);
4243 /* Load every key/value, then set it into the zipmap or hash
4244 * table, as needed. */
4245 while(hashlen--) {
4246 robj *key, *val;
4247
4248 if ((key = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
4249 if ((val = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
4250 /* If we are using a zipmap and there are too big values
4251 * the object is converted to real hash table encoding. */
4252 if (o->encoding != REDIS_ENCODING_HT &&
4253 (sdslen(key->ptr) > server.hash_max_zipmap_value ||
4254 sdslen(val->ptr) > server.hash_max_zipmap_value))
4255 {
4256 convertToRealHash(o);
4257 }
4258
4259 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
4260 unsigned char *zm = o->ptr;
4261
4262 zm = zipmapSet(zm,key->ptr,sdslen(key->ptr),
4263 val->ptr,sdslen(val->ptr),NULL);
4264 o->ptr = zm;
4265 decrRefCount(key);
4266 decrRefCount(val);
4267 } else {
4268 key = tryObjectEncoding(key);
4269 val = tryObjectEncoding(val);
4270 dictAdd((dict*)o->ptr,key,val);
4271 }
4272 }
4273 } else {
4274 redisPanic("Unknown object type");
4275 }
4276 return o;
4277 }
4278
4279 static int rdbLoad(char *filename) {
4280 FILE *fp;
4281 uint32_t dbid;
4282 int type, retval, rdbver;
4283 int swap_all_values = 0;
4284 redisDb *db = server.db+0;
4285 char buf[1024];
4286 time_t expiretime, now = time(NULL);
4287
4288 fp = fopen(filename,"r");
4289 if (!fp) return REDIS_ERR;
4290 if (fread(buf,9,1,fp) == 0) goto eoferr;
4291 buf[9] = '\0';
4292 if (memcmp(buf,"REDIS",5) != 0) {
4293 fclose(fp);
4294 redisLog(REDIS_WARNING,"Wrong signature trying to load DB from file");
4295 return REDIS_ERR;
4296 }
4297 rdbver = atoi(buf+5);
4298 if (rdbver != 1) {
4299 fclose(fp);
4300 redisLog(REDIS_WARNING,"Can't handle RDB format version %d",rdbver);
4301 return REDIS_ERR;
4302 }
4303 while(1) {
4304 robj *key, *val;
4305 int force_swapout;
4306
4307 expiretime = -1;
4308 /* Read type. */
4309 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
4310 if (type == REDIS_EXPIRETIME) {
4311 if ((expiretime = rdbLoadTime(fp)) == -1) goto eoferr;
4312 /* We read the time so we need to read the object type again */
4313 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
4314 }
4315 if (type == REDIS_EOF) break;
4316 /* Handle SELECT DB opcode as a special case */
4317 if (type == REDIS_SELECTDB) {
4318 if ((dbid = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR)
4319 goto eoferr;
4320 if (dbid >= (unsigned)server.dbnum) {
4321 redisLog(REDIS_WARNING,"FATAL: Data file was created with a Redis server configured to handle more than %d databases. Exiting\n", server.dbnum);
4322 exit(1);
4323 }
4324 db = server.db+dbid;
4325 continue;
4326 }
4327 /* Read key */
4328 if ((key = rdbLoadStringObject(fp)) == NULL) goto eoferr;
4329 /* Read value */
4330 if ((val = rdbLoadObject(type,fp)) == NULL) goto eoferr;
4331 /* Check if the key already expired */
4332 if (expiretime != -1 && expiretime < now) {
4333 decrRefCount(key);
4334 decrRefCount(val);
4335 continue;
4336 }
4337 /* Add the new object in the hash table */
4338 retval = dbAdd(db,key,val);
4339 if (retval == REDIS_ERR) {
4340 redisLog(REDIS_WARNING,"Loading DB, duplicated key (%s) found! Unrecoverable error, exiting now.", key->ptr);
4341 exit(1);
4342 }
4343 /* Set the expire time if needed */
4344 if (expiretime != -1) setExpire(db,key,expiretime);
4345
4346 /* Handle swapping while loading big datasets when VM is on */
4347
4348 /* If we detecter we are hopeless about fitting something in memory
4349 * we just swap every new key on disk. Directly...
4350 * Note that's important to check for this condition before resorting
4351 * to random sampling, otherwise we may try to swap already
4352 * swapped keys. */
4353 if (swap_all_values) {
4354 dictEntry *de = dictFind(db->dict,key->ptr);
4355
4356 /* de may be NULL since the key already expired */
4357 if (de) {
4358 vmpointer *vp;
4359 val = dictGetEntryVal(de);
4360
4361 if (val->refcount == 1 &&
4362 (vp = vmSwapObjectBlocking(val)) != NULL)
4363 dictGetEntryVal(de) = vp;
4364 }
4365 decrRefCount(key);
4366 continue;
4367 }
4368 decrRefCount(key);
4369
4370 /* Flush data on disk once 32 MB of additional RAM are used... */
4371 force_swapout = 0;
4372 if ((zmalloc_used_memory() - server.vm_max_memory) > 1024*1024*32)
4373 force_swapout = 1;
4374
4375 /* If we have still some hope of having some value fitting memory
4376 * then we try random sampling. */
4377 if (!swap_all_values && server.vm_enabled && force_swapout) {
4378 while (zmalloc_used_memory() > server.vm_max_memory) {
4379 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
4380 }
4381 if (zmalloc_used_memory() > server.vm_max_memory)
4382 swap_all_values = 1; /* We are already using too much mem */
4383 }
4384 }
4385 fclose(fp);
4386 return REDIS_OK;
4387
4388 eoferr: /* unexpected end of file is handled here with a fatal exit */
4389 redisLog(REDIS_WARNING,"Short read or OOM loading DB. Unrecoverable error, aborting now.");
4390 exit(1);
4391 return REDIS_ERR; /* Just to avoid warning */
4392 }
4393
4394 /*================================== Shutdown =============================== */
4395 static int prepareForShutdown() {
4396 redisLog(REDIS_WARNING,"User requested shutdown, saving DB...");
4397 /* Kill the saving child if there is a background saving in progress.
4398 We want to avoid race conditions, for instance our saving child may
4399 overwrite the synchronous saving did by SHUTDOWN. */
4400 if (server.bgsavechildpid != -1) {
4401 redisLog(REDIS_WARNING,"There is a live saving child. Killing it!");
4402 kill(server.bgsavechildpid,SIGKILL);
4403 rdbRemoveTempFile(server.bgsavechildpid);
4404 }
4405 if (server.appendonly) {
4406 /* Append only file: fsync() the AOF and exit */
4407 aof_fsync(server.appendfd);
4408 if (server.vm_enabled) unlink(server.vm_swap_file);
4409 } else {
4410 /* Snapshotting. Perform a SYNC SAVE and exit */
4411 if (rdbSave(server.dbfilename) == REDIS_OK) {
4412 if (server.daemonize)
4413 unlink(server.pidfile);
4414 redisLog(REDIS_WARNING,"%zu bytes used at exit",zmalloc_used_memory());
4415 } else {
4416 /* Ooops.. error saving! The best we can do is to continue
4417 * operating. Note that if there was a background saving process,
4418 * in the next cron() Redis will be notified that the background
4419 * saving aborted, handling special stuff like slaves pending for
4420 * synchronization... */
4421 redisLog(REDIS_WARNING,"Error trying to save the DB, can't exit");
4422 return REDIS_ERR;
4423 }
4424 }
4425 redisLog(REDIS_WARNING,"Server exit now, bye bye...");
4426 return REDIS_OK;
4427 }
4428
4429 /*================================== Commands =============================== */
4430
4431 static void authCommand(redisClient *c) {
4432 if (!server.requirepass || !strcmp(c->argv[1]->ptr, server.requirepass)) {
4433 c->authenticated = 1;
4434 addReply(c,shared.ok);
4435 } else {
4436 c->authenticated = 0;
4437 addReplySds(c,sdscatprintf(sdsempty(),"-ERR invalid password\r\n"));
4438 }
4439 }
4440
4441 static void pingCommand(redisClient *c) {
4442 addReply(c,shared.pong);
4443 }
4444
4445 static void echoCommand(redisClient *c) {
4446 addReplyBulk(c,c->argv[1]);
4447 }
4448
4449 /*=================================== Strings =============================== */
4450
4451 static void setGenericCommand(redisClient *c, int nx, robj *key, robj *val, robj *expire) {
4452 int retval;
4453 long seconds = 0; /* initialized to avoid an harmness warning */
4454
4455 if (expire) {
4456 if (getLongFromObjectOrReply(c, expire, &seconds, NULL) != REDIS_OK)
4457 return;
4458 if (seconds <= 0) {
4459 addReplySds(c,sdsnew("-ERR invalid expire time in SETEX\r\n"));
4460 return;
4461 }
4462 }
4463
4464 touchWatchedKey(c->db,key);
4465 if (nx) deleteIfVolatile(c->db,key);
4466 retval = dbAdd(c->db,key,val);
4467 if (retval == REDIS_ERR) {
4468 if (!nx) {
4469 dbReplace(c->db,key,val);
4470 incrRefCount(val);
4471 } else {
4472 addReply(c,shared.czero);
4473 return;
4474 }
4475 } else {
4476 incrRefCount(val);
4477 }
4478 server.dirty++;
4479 removeExpire(c->db,key);
4480 if (expire) setExpire(c->db,key,time(NULL)+seconds);
4481 addReply(c, nx ? shared.cone : shared.ok);
4482 }
4483
4484 static void setCommand(redisClient *c) {
4485 setGenericCommand(c,0,c->argv[1],c->argv[2],NULL);
4486 }
4487
4488 static void setnxCommand(redisClient *c) {
4489 setGenericCommand(c,1,c->argv[1],c->argv[2],NULL);
4490 }
4491
4492 static void setexCommand(redisClient *c) {
4493 setGenericCommand(c,0,c->argv[1],c->argv[3],c->argv[2]);
4494 }
4495
4496 static int getGenericCommand(redisClient *c) {
4497 robj *o;
4498
4499 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL)
4500 return REDIS_OK;
4501
4502 if (o->type != REDIS_STRING) {
4503 addReply(c,shared.wrongtypeerr);
4504 return REDIS_ERR;
4505 } else {
4506 addReplyBulk(c,o);
4507 return REDIS_OK;
4508 }
4509 }
4510
4511 static void getCommand(redisClient *c) {
4512 getGenericCommand(c);
4513 }
4514
4515 static void getsetCommand(redisClient *c) {
4516 if (getGenericCommand(c) == REDIS_ERR) return;
4517 dbReplace(c->db,c->argv[1],c->argv[2]);
4518 incrRefCount(c->argv[2]);
4519 server.dirty++;
4520 removeExpire(c->db,c->argv[1]);
4521 }
4522
4523 static void mgetCommand(redisClient *c) {
4524 int j;
4525
4526 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-1));
4527 for (j = 1; j < c->argc; j++) {
4528 robj *o = lookupKeyRead(c->db,c->argv[j]);
4529 if (o == NULL) {
4530 addReply(c,shared.nullbulk);
4531 } else {
4532 if (o->type != REDIS_STRING) {
4533 addReply(c,shared.nullbulk);
4534 } else {
4535 addReplyBulk(c,o);
4536 }
4537 }
4538 }
4539 }
4540
4541 static void msetGenericCommand(redisClient *c, int nx) {
4542 int j, busykeys = 0;
4543
4544 if ((c->argc % 2) == 0) {
4545 addReplySds(c,sdsnew("-ERR wrong number of arguments for MSET\r\n"));
4546 return;
4547 }
4548 /* Handle the NX flag. The MSETNX semantic is to return zero and don't
4549 * set nothing at all if at least one already key exists. */
4550 if (nx) {
4551 for (j = 1; j < c->argc; j += 2) {
4552 if (lookupKeyWrite(c->db,c->argv[j]) != NULL) {
4553 busykeys++;
4554 }
4555 }
4556 }
4557 if (busykeys) {
4558 addReply(c, shared.czero);
4559 return;
4560 }
4561
4562 for (j = 1; j < c->argc; j += 2) {
4563 c->argv[j+1] = tryObjectEncoding(c->argv[j+1]);
4564 dbReplace(c->db,c->argv[j],c->argv[j+1]);
4565 incrRefCount(c->argv[j+1]);
4566 removeExpire(c->db,c->argv[j]);
4567 }
4568 server.dirty += (c->argc-1)/2;
4569 addReply(c, nx ? shared.cone : shared.ok);
4570 }
4571
4572 static void msetCommand(redisClient *c) {
4573 msetGenericCommand(c,0);
4574 }
4575
4576 static void msetnxCommand(redisClient *c) {
4577 msetGenericCommand(c,1);
4578 }
4579
4580 static void incrDecrCommand(redisClient *c, long long incr) {
4581 long long value;
4582 robj *o;
4583
4584 o = lookupKeyWrite(c->db,c->argv[1]);
4585 if (o != NULL && checkType(c,o,REDIS_STRING)) return;
4586 if (getLongLongFromObjectOrReply(c,o,&value,NULL) != REDIS_OK) return;
4587
4588 value += incr;
4589 o = createStringObjectFromLongLong(value);
4590 dbReplace(c->db,c->argv[1],o);
4591 server.dirty++;
4592 addReply(c,shared.colon);
4593 addReply(c,o);
4594 addReply(c,shared.crlf);
4595 }
4596
4597 static void incrCommand(redisClient *c) {
4598 incrDecrCommand(c,1);
4599 }
4600
4601 static void decrCommand(redisClient *c) {
4602 incrDecrCommand(c,-1);
4603 }
4604
4605 static void incrbyCommand(redisClient *c) {
4606 long long incr;
4607
4608 if (getLongLongFromObjectOrReply(c, c->argv[2], &incr, NULL) != REDIS_OK) return;
4609 incrDecrCommand(c,incr);
4610 }
4611
4612 static void decrbyCommand(redisClient *c) {
4613 long long incr;
4614
4615 if (getLongLongFromObjectOrReply(c, c->argv[2], &incr, NULL) != REDIS_OK) return;
4616 incrDecrCommand(c,-incr);
4617 }
4618
4619 static void appendCommand(redisClient *c) {
4620 int retval;
4621 size_t totlen;
4622 robj *o;
4623
4624 o = lookupKeyWrite(c->db,c->argv[1]);
4625 if (o == NULL) {
4626 /* Create the key */
4627 retval = dbAdd(c->db,c->argv[1],c->argv[2]);
4628 incrRefCount(c->argv[2]);
4629 totlen = stringObjectLen(c->argv[2]);
4630 } else {
4631 if (o->type != REDIS_STRING) {
4632 addReply(c,shared.wrongtypeerr);
4633 return;
4634 }
4635 /* If the object is specially encoded or shared we have to make
4636 * a copy */
4637 if (o->refcount != 1 || o->encoding != REDIS_ENCODING_RAW) {
4638 robj *decoded = getDecodedObject(o);
4639
4640 o = createStringObject(decoded->ptr, sdslen(decoded->ptr));
4641 decrRefCount(decoded);
4642 dbReplace(c->db,c->argv[1],o);
4643 }
4644 /* APPEND! */
4645 if (c->argv[2]->encoding == REDIS_ENCODING_RAW) {
4646 o->ptr = sdscatlen(o->ptr,
4647 c->argv[2]->ptr, sdslen(c->argv[2]->ptr));
4648 } else {
4649 o->ptr = sdscatprintf(o->ptr, "%ld",
4650 (unsigned long) c->argv[2]->ptr);
4651 }
4652 totlen = sdslen(o->ptr);
4653 }
4654 server.dirty++;
4655 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",(unsigned long)totlen));
4656 }
4657
4658 static void substrCommand(redisClient *c) {
4659 robj *o;
4660 long start = atoi(c->argv[2]->ptr);
4661 long end = atoi(c->argv[3]->ptr);
4662 size_t rangelen, strlen;
4663 sds range;
4664
4665 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4666 checkType(c,o,REDIS_STRING)) return;
4667
4668 o = getDecodedObject(o);
4669 strlen = sdslen(o->ptr);
4670
4671 /* convert negative indexes */
4672 if (start < 0) start = strlen+start;
4673 if (end < 0) end = strlen+end;
4674 if (start < 0) start = 0;
4675 if (end < 0) end = 0;
4676
4677 /* indexes sanity checks */
4678 if (start > end || (size_t)start >= strlen) {
4679 /* Out of range start or start > end result in null reply */
4680 addReply(c,shared.nullbulk);
4681 decrRefCount(o);
4682 return;
4683 }
4684 if ((size_t)end >= strlen) end = strlen-1;
4685 rangelen = (end-start)+1;
4686
4687 /* Return the result */
4688 addReplySds(c,sdscatprintf(sdsempty(),"$%zu\r\n",rangelen));
4689 range = sdsnewlen((char*)o->ptr+start,rangelen);
4690 addReplySds(c,range);
4691 addReply(c,shared.crlf);
4692 decrRefCount(o);
4693 }
4694
4695 /* ========================= Type agnostic commands ========================= */
4696
4697 static void delCommand(redisClient *c) {
4698 int deleted = 0, j;
4699
4700 for (j = 1; j < c->argc; j++) {
4701 if (dbDelete(c->db,c->argv[j])) {
4702 touchWatchedKey(c->db,c->argv[j]);
4703 server.dirty++;
4704 deleted++;
4705 }
4706 }
4707 addReplyLongLong(c,deleted);
4708 }
4709
4710 static void existsCommand(redisClient *c) {
4711 expireIfNeeded(c->db,c->argv[1]);
4712 if (dbExists(c->db,c->argv[1])) {
4713 addReply(c, shared.cone);
4714 } else {
4715 addReply(c, shared.czero);
4716 }
4717 }
4718
4719 static void selectCommand(redisClient *c) {
4720 int id = atoi(c->argv[1]->ptr);
4721
4722 if (selectDb(c,id) == REDIS_ERR) {
4723 addReplySds(c,sdsnew("-ERR invalid DB index\r\n"));
4724 } else {
4725 addReply(c,shared.ok);
4726 }
4727 }
4728
4729 static void randomkeyCommand(redisClient *c) {
4730 robj *key;
4731
4732 if ((key = dbRandomKey(c->db)) == NULL) {
4733 addReply(c,shared.nullbulk);
4734 return;
4735 }
4736
4737 addReplyBulk(c,key);
4738 decrRefCount(key);
4739 }
4740
4741 static void keysCommand(redisClient *c) {
4742 dictIterator *di;
4743 dictEntry *de;
4744 sds pattern = c->argv[1]->ptr;
4745 int plen = sdslen(pattern);
4746 unsigned long numkeys = 0;
4747 robj *lenobj = createObject(REDIS_STRING,NULL);
4748
4749 di = dictGetIterator(c->db->dict);
4750 addReply(c,lenobj);
4751 decrRefCount(lenobj);
4752 while((de = dictNext(di)) != NULL) {
4753 sds key = dictGetEntryKey(de);
4754 robj *keyobj;
4755
4756 if ((pattern[0] == '*' && pattern[1] == '\0') ||
4757 stringmatchlen(pattern,plen,key,sdslen(key),0)) {
4758 keyobj = createStringObject(key,sdslen(key));
4759 if (expireIfNeeded(c->db,keyobj) == 0) {
4760 addReplyBulk(c,keyobj);
4761 numkeys++;
4762 }
4763 decrRefCount(keyobj);
4764 }
4765 }
4766 dictReleaseIterator(di);
4767 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",numkeys);
4768 }
4769
4770 static void dbsizeCommand(redisClient *c) {
4771 addReplySds(c,
4772 sdscatprintf(sdsempty(),":%lu\r\n",dictSize(c->db->dict)));
4773 }
4774
4775 static void lastsaveCommand(redisClient *c) {
4776 addReplySds(c,
4777 sdscatprintf(sdsempty(),":%lu\r\n",server.lastsave));
4778 }
4779
4780 static void typeCommand(redisClient *c) {
4781 robj *o;
4782 char *type;
4783
4784 o = lookupKeyRead(c->db,c->argv[1]);
4785 if (o == NULL) {
4786 type = "+none";
4787 } else {
4788 switch(o->type) {
4789 case REDIS_STRING: type = "+string"; break;
4790 case REDIS_LIST: type = "+list"; break;
4791 case REDIS_SET: type = "+set"; break;
4792 case REDIS_ZSET: type = "+zset"; break;
4793 case REDIS_HASH: type = "+hash"; break;
4794 default: type = "+unknown"; break;
4795 }
4796 }
4797 addReplySds(c,sdsnew(type));
4798 addReply(c,shared.crlf);
4799 }
4800
4801 static void saveCommand(redisClient *c) {
4802 if (server.bgsavechildpid != -1) {
4803 addReplySds(c,sdsnew("-ERR background save in progress\r\n"));
4804 return;
4805 }
4806 if (rdbSave(server.dbfilename) == REDIS_OK) {
4807 addReply(c,shared.ok);
4808 } else {
4809 addReply(c,shared.err);
4810 }
4811 }
4812
4813 static void bgsaveCommand(redisClient *c) {
4814 if (server.bgsavechildpid != -1) {
4815 addReplySds(c,sdsnew("-ERR background save already in progress\r\n"));
4816 return;
4817 }
4818 if (rdbSaveBackground(server.dbfilename) == REDIS_OK) {
4819 char *status = "+Background saving started\r\n";
4820 addReplySds(c,sdsnew(status));
4821 } else {
4822 addReply(c,shared.err);
4823 }
4824 }
4825
4826 static void shutdownCommand(redisClient *c) {
4827 if (prepareForShutdown() == REDIS_OK)
4828 exit(0);
4829 addReplySds(c, sdsnew("-ERR Errors trying to SHUTDOWN. Check logs.\r\n"));
4830 }
4831
4832 static void renameGenericCommand(redisClient *c, int nx) {
4833 robj *o;
4834
4835 /* To use the same key as src and dst is probably an error */
4836 if (sdscmp(c->argv[1]->ptr,c->argv[2]->ptr) == 0) {
4837 addReply(c,shared.sameobjecterr);
4838 return;
4839 }
4840
4841 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL)
4842 return;
4843
4844 incrRefCount(o);
4845 deleteIfVolatile(c->db,c->argv[2]);
4846 if (dbAdd(c->db,c->argv[2],o) == REDIS_ERR) {
4847 if (nx) {
4848 decrRefCount(o);
4849 addReply(c,shared.czero);
4850 return;
4851 }
4852 dbReplace(c->db,c->argv[2],o);
4853 }
4854 dbDelete(c->db,c->argv[1]);
4855 touchWatchedKey(c->db,c->argv[2]);
4856 server.dirty++;
4857 addReply(c,nx ? shared.cone : shared.ok);
4858 }
4859
4860 static void renameCommand(redisClient *c) {
4861 renameGenericCommand(c,0);
4862 }
4863
4864 static void renamenxCommand(redisClient *c) {
4865 renameGenericCommand(c,1);
4866 }
4867
4868 static void moveCommand(redisClient *c) {
4869 robj *o;
4870 redisDb *src, *dst;
4871 int srcid;
4872
4873 /* Obtain source and target DB pointers */
4874 src = c->db;
4875 srcid = c->db->id;
4876 if (selectDb(c,atoi(c->argv[2]->ptr)) == REDIS_ERR) {
4877 addReply(c,shared.outofrangeerr);
4878 return;
4879 }
4880 dst = c->db;
4881 selectDb(c,srcid); /* Back to the source DB */
4882
4883 /* If the user is moving using as target the same
4884 * DB as the source DB it is probably an error. */
4885 if (src == dst) {
4886 addReply(c,shared.sameobjecterr);
4887 return;
4888 }
4889
4890 /* Check if the element exists and get a reference */
4891 o = lookupKeyWrite(c->db,c->argv[1]);
4892 if (!o) {
4893 addReply(c,shared.czero);
4894 return;
4895 }
4896
4897 /* Try to add the element to the target DB */
4898 deleteIfVolatile(dst,c->argv[1]);
4899 if (dbAdd(dst,c->argv[1],o) == REDIS_ERR) {
4900 addReply(c,shared.czero);
4901 return;
4902 }
4903 incrRefCount(o);
4904
4905 /* OK! key moved, free the entry in the source DB */
4906 dbDelete(src,c->argv[1]);
4907 server.dirty++;
4908 addReply(c,shared.cone);
4909 }
4910
4911 /* =================================== Lists ================================ */
4912
4913
4914 /* Check the argument length to see if it requires us to convert the ziplist
4915 * to a real list. Only check raw-encoded objects because integer encoded
4916 * objects are never too long. */
4917 static void listTypeTryConversion(robj *subject, robj *value) {
4918 if (subject->encoding != REDIS_ENCODING_ZIPLIST) return;
4919 if (value->encoding == REDIS_ENCODING_RAW &&
4920 sdslen(value->ptr) > server.list_max_ziplist_value)
4921 listTypeConvert(subject,REDIS_ENCODING_LIST);
4922 }
4923
4924 static void listTypePush(robj *subject, robj *value, int where) {
4925 /* Check if we need to convert the ziplist */
4926 listTypeTryConversion(subject,value);
4927 if (subject->encoding == REDIS_ENCODING_ZIPLIST &&
4928 ziplistLen(subject->ptr) >= server.list_max_ziplist_entries)
4929 listTypeConvert(subject,REDIS_ENCODING_LIST);
4930
4931 if (subject->encoding == REDIS_ENCODING_ZIPLIST) {
4932 int pos = (where == REDIS_HEAD) ? ZIPLIST_HEAD : ZIPLIST_TAIL;
4933 value = getDecodedObject(value);
4934 subject->ptr = ziplistPush(subject->ptr,value->ptr,sdslen(value->ptr),pos);
4935 decrRefCount(value);
4936 } else if (subject->encoding == REDIS_ENCODING_LIST) {
4937 if (where == REDIS_HEAD) {
4938 listAddNodeHead(subject->ptr,value);
4939 } else {
4940 listAddNodeTail(subject->ptr,value);
4941 }
4942 incrRefCount(value);
4943 } else {
4944 redisPanic("Unknown list encoding");
4945 }
4946 }
4947
4948 static robj *listTypePop(robj *subject, int where) {
4949 robj *value = NULL;
4950 if (subject->encoding == REDIS_ENCODING_ZIPLIST) {
4951 unsigned char *p;
4952 unsigned char *vstr;
4953 unsigned int vlen;
4954 long long vlong;
4955 int pos = (where == REDIS_HEAD) ? 0 : -1;
4956 p = ziplistIndex(subject->ptr,pos);
4957 if (ziplistGet(p,&vstr,&vlen,&vlong)) {
4958 if (vstr) {
4959 value = createStringObject((char*)vstr,vlen);
4960 } else {
4961 value = createStringObjectFromLongLong(vlong);
4962 }
4963 /* We only need to delete an element when it exists */
4964 subject->ptr = ziplistDelete(subject->ptr,&p);
4965 }
4966 } else if (subject->encoding == REDIS_ENCODING_LIST) {
4967 list *list = subject->ptr;
4968 listNode *ln;
4969 if (where == REDIS_HEAD) {
4970 ln = listFirst(list);
4971 } else {
4972 ln = listLast(list);
4973 }
4974 if (ln != NULL) {
4975 value = listNodeValue(ln);
4976 incrRefCount(value);
4977 listDelNode(list,ln);
4978 }
4979 } else {
4980 redisPanic("Unknown list encoding");
4981 }
4982 return value;
4983 }
4984
4985 static unsigned long listTypeLength(robj *subject) {
4986 if (subject->encoding == REDIS_ENCODING_ZIPLIST) {
4987 return ziplistLen(subject->ptr);
4988 } else if (subject->encoding == REDIS_ENCODING_LIST) {
4989 return listLength((list*)subject->ptr);
4990 } else {
4991 redisPanic("Unknown list encoding");
4992 }
4993 }
4994
4995 /* Structure to hold set iteration abstraction. */
4996 typedef struct {
4997 robj *subject;
4998 unsigned char encoding;
4999 unsigned char direction; /* Iteration direction */
5000 unsigned char *zi;
5001 listNode *ln;
5002 } listTypeIterator;
5003
5004 /* Structure for an entry while iterating over a list. */
5005 typedef struct {
5006 listTypeIterator *li;
5007 unsigned char *zi; /* Entry in ziplist */
5008 listNode *ln; /* Entry in linked list */
5009 } listTypeEntry;
5010
5011 /* Initialize an iterator at the specified index. */
5012 static listTypeIterator *listTypeInitIterator(robj *subject, int index, unsigned char direction) {
5013 listTypeIterator *li = zmalloc(sizeof(listTypeIterator));
5014 li->subject = subject;
5015 li->encoding = subject->encoding;
5016 li->direction = direction;
5017 if (li->encoding == REDIS_ENCODING_ZIPLIST) {
5018 li->zi = ziplistIndex(subject->ptr,index);
5019 } else if (li->encoding == REDIS_ENCODING_LIST) {
5020 li->ln = listIndex(subject->ptr,index);
5021 } else {
5022 redisPanic("Unknown list encoding");
5023 }
5024 return li;
5025 }
5026
5027 /* Clean up the iterator. */
5028 static void listTypeReleaseIterator(listTypeIterator *li) {
5029 zfree(li);
5030 }
5031
5032 /* Stores pointer to current the entry in the provided entry structure
5033 * and advances the position of the iterator. Returns 1 when the current
5034 * entry is in fact an entry, 0 otherwise. */
5035 static int listTypeNext(listTypeIterator *li, listTypeEntry *entry) {
5036 /* Protect from converting when iterating */
5037 redisAssert(li->subject->encoding == li->encoding);
5038
5039 entry->li = li;
5040 if (li->encoding == REDIS_ENCODING_ZIPLIST) {
5041 entry->zi = li->zi;
5042 if (entry->zi != NULL) {
5043 if (li->direction == REDIS_TAIL)
5044 li->zi = ziplistNext(li->subject->ptr,li->zi);
5045 else
5046 li->zi = ziplistPrev(li->subject->ptr,li->zi);
5047 return 1;
5048 }
5049 } else if (li->encoding == REDIS_ENCODING_LIST) {
5050 entry->ln = li->ln;
5051 if (entry->ln != NULL) {
5052 if (li->direction == REDIS_TAIL)
5053 li->ln = li->ln->next;
5054 else
5055 li->ln = li->ln->prev;
5056 return 1;
5057 }
5058 } else {
5059 redisPanic("Unknown list encoding");
5060 }
5061 return 0;
5062 }
5063
5064 /* Return entry or NULL at the current position of the iterator. */
5065 static robj *listTypeGet(listTypeEntry *entry) {
5066 listTypeIterator *li = entry->li;
5067 robj *value = NULL;
5068 if (li->encoding == REDIS_ENCODING_ZIPLIST) {
5069 unsigned char *vstr;
5070 unsigned int vlen;
5071 long long vlong;
5072 redisAssert(entry->zi != NULL);
5073 if (ziplistGet(entry->zi,&vstr,&vlen,&vlong)) {
5074 if (vstr) {
5075 value = createStringObject((char*)vstr,vlen);
5076 } else {
5077 value = createStringObjectFromLongLong(vlong);
5078 }
5079 }
5080 } else if (li->encoding == REDIS_ENCODING_LIST) {
5081 redisAssert(entry->ln != NULL);
5082 value = listNodeValue(entry->ln);
5083 incrRefCount(value);
5084 } else {
5085 redisPanic("Unknown list encoding");
5086 }
5087 return value;
5088 }
5089
5090 static void listTypeInsert(listTypeEntry *entry, robj *value, int where) {
5091 robj *subject = entry->li->subject;
5092 if (entry->li->encoding == REDIS_ENCODING_ZIPLIST) {
5093 value = getDecodedObject(value);
5094 if (where == REDIS_TAIL) {
5095 unsigned char *next = ziplistNext(subject->ptr,entry->zi);
5096
5097 /* When we insert after the current element, but the current element
5098 * is the tail of the list, we need to do a push. */
5099 if (next == NULL) {
5100 subject->ptr = ziplistPush(subject->ptr,value->ptr,sdslen(value->ptr),REDIS_TAIL);
5101 } else {
5102 subject->ptr = ziplistInsert(subject->ptr,next,value->ptr,sdslen(value->ptr));
5103 }
5104 } else {
5105 subject->ptr = ziplistInsert(subject->ptr,entry->zi,value->ptr,sdslen(value->ptr));
5106 }
5107 decrRefCount(value);
5108 } else if (entry->li->encoding == REDIS_ENCODING_LIST) {
5109 if (where == REDIS_TAIL) {
5110 listInsertNode(subject->ptr,entry->ln,value,AL_START_TAIL);
5111 } else {
5112 listInsertNode(subject->ptr,entry->ln,value,AL_START_HEAD);
5113 }
5114 incrRefCount(value);
5115 } else {
5116 redisPanic("Unknown list encoding");
5117 }
5118 }
5119
5120 /* Compare the given object with the entry at the current position. */
5121 static int listTypeEqual(listTypeEntry *entry, robj *o) {
5122 listTypeIterator *li = entry->li;
5123 if (li->encoding == REDIS_ENCODING_ZIPLIST) {
5124 redisAssert(o->encoding == REDIS_ENCODING_RAW);
5125 return ziplistCompare(entry->zi,o->ptr,sdslen(o->ptr));
5126 } else if (li->encoding == REDIS_ENCODING_LIST) {
5127 return equalStringObjects(o,listNodeValue(entry->ln));
5128 } else {
5129 redisPanic("Unknown list encoding");
5130 }
5131 }
5132
5133 /* Delete the element pointed to. */
5134 static void listTypeDelete(listTypeEntry *entry) {
5135 listTypeIterator *li = entry->li;
5136 if (li->encoding == REDIS_ENCODING_ZIPLIST) {
5137 unsigned char *p = entry->zi;
5138 li->subject->ptr = ziplistDelete(li->subject->ptr,&p);
5139
5140 /* Update position of the iterator depending on the direction */
5141 if (li->direction == REDIS_TAIL)
5142 li->zi = p;
5143 else
5144 li->zi = ziplistPrev(li->subject->ptr,p);
5145 } else if (entry->li->encoding == REDIS_ENCODING_LIST) {
5146 listNode *next;
5147 if (li->direction == REDIS_TAIL)
5148 next = entry->ln->next;
5149 else
5150 next = entry->ln->prev;
5151 listDelNode(li->subject->ptr,entry->ln);
5152 li->ln = next;
5153 } else {
5154 redisPanic("Unknown list encoding");
5155 }
5156 }
5157
5158 static void listTypeConvert(robj *subject, int enc) {
5159 listTypeIterator *li;
5160 listTypeEntry entry;
5161 redisAssert(subject->type == REDIS_LIST);
5162
5163 if (enc == REDIS_ENCODING_LIST) {
5164 list *l = listCreate();
5165 listSetFreeMethod(l,decrRefCount);
5166
5167 /* listTypeGet returns a robj with incremented refcount */
5168 li = listTypeInitIterator(subject,0,REDIS_TAIL);
5169 while (listTypeNext(li,&entry)) listAddNodeTail(l,listTypeGet(&entry));
5170 listTypeReleaseIterator(li);
5171
5172 subject->encoding = REDIS_ENCODING_LIST;
5173 zfree(subject->ptr);
5174 subject->ptr = l;
5175 } else {
5176 redisPanic("Unsupported list conversion");
5177 }
5178 }
5179
5180 static void pushGenericCommand(redisClient *c, int where) {
5181 robj *lobj = lookupKeyWrite(c->db,c->argv[1]);
5182 if (lobj == NULL) {
5183 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
5184 addReply(c,shared.cone);
5185 return;
5186 }
5187 lobj = createZiplistObject();
5188 dbAdd(c->db,c->argv[1],lobj);
5189 } else {
5190 if (lobj->type != REDIS_LIST) {
5191 addReply(c,shared.wrongtypeerr);
5192 return;
5193 }
5194 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
5195 addReply(c,shared.cone);
5196 return;
5197 }
5198 }
5199 listTypePush(lobj,c->argv[2],where);
5200 addReplyLongLong(c,listTypeLength(lobj));
5201 server.dirty++;
5202 }
5203
5204 static void lpushCommand(redisClient *c) {
5205 pushGenericCommand(c,REDIS_HEAD);
5206 }
5207
5208 static void rpushCommand(redisClient *c) {
5209 pushGenericCommand(c,REDIS_TAIL);
5210 }
5211
5212 static void pushxGenericCommand(redisClient *c, robj *refval, robj *val, int where) {
5213 robj *subject;
5214 listTypeIterator *iter;
5215 listTypeEntry entry;
5216 int inserted = 0;
5217
5218 if ((subject = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
5219 checkType(c,subject,REDIS_LIST)) return;
5220
5221 if (refval != NULL) {
5222 /* Note: we expect refval to be string-encoded because it is *not* the
5223 * last argument of the multi-bulk LINSERT. */
5224 redisAssert(refval->encoding == REDIS_ENCODING_RAW);
5225
5226 /* We're not sure if this value can be inserted yet, but we cannot
5227 * convert the list inside the iterator. We don't want to loop over
5228 * the list twice (once to see if the value can be inserted and once
5229 * to do the actual insert), so we assume this value can be inserted
5230 * and convert the ziplist to a regular list if necessary. */
5231 listTypeTryConversion(subject,val);
5232
5233 /* Seek refval from head to tail */
5234 iter = listTypeInitIterator(subject,0,REDIS_TAIL);
5235 while (listTypeNext(iter,&entry)) {
5236 if (listTypeEqual(&entry,refval)) {
5237 listTypeInsert(&entry,val,where);
5238 inserted = 1;
5239 break;
5240 }
5241 }
5242 listTypeReleaseIterator(iter);
5243
5244 if (inserted) {
5245 /* Check if the length exceeds the ziplist length threshold. */
5246 if (subject->encoding == REDIS_ENCODING_ZIPLIST &&
5247 ziplistLen(subject->ptr) > server.list_max_ziplist_entries)
5248 listTypeConvert(subject,REDIS_ENCODING_LIST);
5249 server.dirty++;
5250 } else {
5251 /* Notify client of a failed insert */
5252 addReply(c,shared.cnegone);
5253 return;
5254 }
5255 } else {
5256 listTypePush(subject,val,where);
5257 server.dirty++;
5258 }
5259
5260 addReplyUlong(c,listTypeLength(subject));
5261 }
5262
5263 static void lpushxCommand(redisClient *c) {
5264 pushxGenericCommand(c,NULL,c->argv[2],REDIS_HEAD);
5265 }
5266
5267 static void rpushxCommand(redisClient *c) {
5268 pushxGenericCommand(c,NULL,c->argv[2],REDIS_TAIL);
5269 }
5270
5271 static void linsertCommand(redisClient *c) {
5272 if (strcasecmp(c->argv[2]->ptr,"after") == 0) {
5273 pushxGenericCommand(c,c->argv[3],c->argv[4],REDIS_TAIL);
5274 } else if (strcasecmp(c->argv[2]->ptr,"before") == 0) {
5275 pushxGenericCommand(c,c->argv[3],c->argv[4],REDIS_HEAD);
5276 } else {
5277 addReply(c,shared.syntaxerr);
5278 }
5279 }
5280
5281 static void llenCommand(redisClient *c) {
5282 robj *o = lookupKeyReadOrReply(c,c->argv[1],shared.czero);
5283 if (o == NULL || checkType(c,o,REDIS_LIST)) return;
5284 addReplyUlong(c,listTypeLength(o));
5285 }
5286
5287 static void lindexCommand(redisClient *c) {
5288 robj *o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk);
5289 if (o == NULL || checkType(c,o,REDIS_LIST)) return;
5290 int index = atoi(c->argv[2]->ptr);
5291 robj *value = NULL;
5292
5293 if (o->encoding == REDIS_ENCODING_ZIPLIST) {
5294 unsigned char *p;
5295 unsigned char *vstr;
5296 unsigned int vlen;
5297 long long vlong;
5298 p = ziplistIndex(o->ptr,index);
5299 if (ziplistGet(p,&vstr,&vlen,&vlong)) {
5300 if (vstr) {
5301 value = createStringObject((char*)vstr,vlen);
5302 } else {
5303 value = createStringObjectFromLongLong(vlong);
5304 }
5305 addReplyBulk(c,value);
5306 decrRefCount(value);
5307 } else {
5308 addReply(c,shared.nullbulk);
5309 }
5310 } else if (o->encoding == REDIS_ENCODING_LIST) {
5311 listNode *ln = listIndex(o->ptr,index);
5312 if (ln != NULL) {
5313 value = listNodeValue(ln);
5314 addReplyBulk(c,value);
5315 } else {
5316 addReply(c,shared.nullbulk);
5317 }
5318 } else {
5319 redisPanic("Unknown list encoding");
5320 }
5321 }
5322
5323 static void lsetCommand(redisClient *c) {
5324 robj *o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr);
5325 if (o == NULL || checkType(c,o,REDIS_LIST)) return;
5326 int index = atoi(c->argv[2]->ptr);
5327 robj *value = c->argv[3];
5328
5329 listTypeTryConversion(o,value);
5330 if (o->encoding == REDIS_ENCODING_ZIPLIST) {
5331 unsigned char *p, *zl = o->ptr;
5332 p = ziplistIndex(zl,index);
5333 if (p == NULL) {
5334 addReply(c,shared.outofrangeerr);
5335 } else {
5336 o->ptr = ziplistDelete(o->ptr,&p);
5337 value = getDecodedObject(value);
5338 o->ptr = ziplistInsert(o->ptr,p,value->ptr,sdslen(value->ptr));
5339 decrRefCount(value);
5340 addReply(c,shared.ok);
5341 server.dirty++;
5342 }
5343 } else if (o->encoding == REDIS_ENCODING_LIST) {
5344 listNode *ln = listIndex(o->ptr,index);
5345 if (ln == NULL) {
5346 addReply(c,shared.outofrangeerr);
5347 } else {
5348 decrRefCount((robj*)listNodeValue(ln));
5349 listNodeValue(ln) = value;
5350 incrRefCount(value);
5351 addReply(c,shared.ok);
5352 server.dirty++;
5353 }
5354 } else {
5355 redisPanic("Unknown list encoding");
5356 }
5357 }
5358
5359 static void popGenericCommand(redisClient *c, int where) {
5360 robj *o = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk);
5361 if (o == NULL || checkType(c,o,REDIS_LIST)) return;
5362
5363 robj *value = listTypePop(o,where);
5364 if (value == NULL) {
5365 addReply(c,shared.nullbulk);
5366 } else {
5367 addReplyBulk(c,value);
5368 decrRefCount(value);
5369 if (listTypeLength(o) == 0) dbDelete(c->db,c->argv[1]);
5370 server.dirty++;
5371 }
5372 }
5373
5374 static void lpopCommand(redisClient *c) {
5375 popGenericCommand(c,REDIS_HEAD);
5376 }
5377
5378 static void rpopCommand(redisClient *c) {
5379 popGenericCommand(c,REDIS_TAIL);
5380 }
5381
5382 static void lrangeCommand(redisClient *c) {
5383 robj *o, *value;
5384 int start = atoi(c->argv[2]->ptr);
5385 int end = atoi(c->argv[3]->ptr);
5386 int llen;
5387 int rangelen, j;
5388 listTypeEntry entry;
5389
5390 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
5391 || checkType(c,o,REDIS_LIST)) return;
5392 llen = listTypeLength(o);
5393
5394 /* convert negative indexes */
5395 if (start < 0) start = llen+start;
5396 if (end < 0) end = llen+end;
5397 if (start < 0) start = 0;
5398 if (end < 0) end = 0;
5399
5400 /* indexes sanity checks */
5401 if (start > end || start >= llen) {
5402 /* Out of range start or start > end result in empty list */
5403 addReply(c,shared.emptymultibulk);
5404 return;
5405 }
5406 if (end >= llen) end = llen-1;
5407 rangelen = (end-start)+1;
5408
5409 /* Return the result in form of a multi-bulk reply */
5410 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",rangelen));
5411 listTypeIterator *li = listTypeInitIterator(o,start,REDIS_TAIL);
5412 for (j = 0; j < rangelen; j++) {
5413 redisAssert(listTypeNext(li,&entry));
5414 value = listTypeGet(&entry);
5415 addReplyBulk(c,value);
5416 decrRefCount(value);
5417 }
5418 listTypeReleaseIterator(li);
5419 }
5420
5421 static void ltrimCommand(redisClient *c) {
5422 robj *o;
5423 int start = atoi(c->argv[2]->ptr);
5424 int end = atoi(c->argv[3]->ptr);
5425 int llen;
5426 int j, ltrim, rtrim;
5427 list *list;
5428 listNode *ln;
5429
5430 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.ok)) == NULL ||
5431 checkType(c,o,REDIS_LIST)) return;
5432 llen = listTypeLength(o);
5433
5434 /* convert negative indexes */
5435 if (start < 0) start = llen+start;
5436 if (end < 0) end = llen+end;
5437 if (start < 0) start = 0;
5438 if (end < 0) end = 0;
5439
5440 /* indexes sanity checks */
5441 if (start > end || start >= llen) {
5442 /* Out of range start or start > end result in empty list */
5443 ltrim = llen;
5444 rtrim = 0;
5445 } else {
5446 if (end >= llen) end = llen-1;
5447 ltrim = start;
5448 rtrim = llen-end-1;
5449 }
5450
5451 /* Remove list elements to perform the trim */
5452 if (o->encoding == REDIS_ENCODING_ZIPLIST) {
5453 o->ptr = ziplistDeleteRange(o->ptr,0,ltrim);
5454 o->ptr = ziplistDeleteRange(o->ptr,-rtrim,rtrim);
5455 } else if (o->encoding == REDIS_ENCODING_LIST) {
5456 list = o->ptr;
5457 for (j = 0; j < ltrim; j++) {
5458 ln = listFirst(list);
5459 listDelNode(list,ln);
5460 }
5461 for (j = 0; j < rtrim; j++) {
5462 ln = listLast(list);
5463 listDelNode(list,ln);
5464 }
5465 } else {
5466 redisPanic("Unknown list encoding");
5467 }
5468 if (listTypeLength(o) == 0) dbDelete(c->db,c->argv[1]);
5469 server.dirty++;
5470 addReply(c,shared.ok);
5471 }
5472
5473 static void lremCommand(redisClient *c) {
5474 robj *subject, *obj = c->argv[3];
5475 int toremove = atoi(c->argv[2]->ptr);
5476 int removed = 0;
5477 listTypeEntry entry;
5478
5479 subject = lookupKeyWriteOrReply(c,c->argv[1],shared.czero);
5480 if (subject == NULL || checkType(c,subject,REDIS_LIST)) return;
5481
5482 /* Make sure obj is raw when we're dealing with a ziplist */
5483 if (subject->encoding == REDIS_ENCODING_ZIPLIST)
5484 obj = getDecodedObject(obj);
5485
5486 listTypeIterator *li;
5487 if (toremove < 0) {
5488 toremove = -toremove;
5489 li = listTypeInitIterator(subject,-1,REDIS_HEAD);
5490 } else {
5491 li = listTypeInitIterator(subject,0,REDIS_TAIL);
5492 }
5493
5494 while (listTypeNext(li,&entry)) {
5495 if (listTypeEqual(&entry,obj)) {
5496 listTypeDelete(&entry);
5497 server.dirty++;
5498 removed++;
5499 if (toremove && removed == toremove) break;
5500 }
5501 }
5502 listTypeReleaseIterator(li);
5503
5504 /* Clean up raw encoded object */
5505 if (subject->encoding == REDIS_ENCODING_ZIPLIST)
5506 decrRefCount(obj);
5507
5508 if (listTypeLength(subject) == 0) dbDelete(c->db,c->argv[1]);
5509 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",removed));
5510 }
5511
5512 /* This is the semantic of this command:
5513 * RPOPLPUSH srclist dstlist:
5514 * IF LLEN(srclist) > 0
5515 * element = RPOP srclist
5516 * LPUSH dstlist element
5517 * RETURN element
5518 * ELSE
5519 * RETURN nil
5520 * END
5521 * END
5522 *
5523 * The idea is to be able to get an element from a list in a reliable way
5524 * since the element is not just returned but pushed against another list
5525 * as well. This command was originally proposed by Ezra Zygmuntowicz.
5526 */
5527 static void rpoplpushcommand(redisClient *c) {
5528 robj *sobj, *value;
5529 if ((sobj = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5530 checkType(c,sobj,REDIS_LIST)) return;
5531
5532 if (listTypeLength(sobj) == 0) {
5533 addReply(c,shared.nullbulk);
5534 } else {
5535 robj *dobj = lookupKeyWrite(c->db,c->argv[2]);
5536 if (dobj && checkType(c,dobj,REDIS_LIST)) return;
5537 value = listTypePop(sobj,REDIS_TAIL);
5538
5539 /* Add the element to the target list (unless it's directly
5540 * passed to some BLPOP-ing client */
5541 if (!handleClientsWaitingListPush(c,c->argv[2],value)) {
5542 /* Create the list if the key does not exist */
5543 if (!dobj) {
5544 dobj = createZiplistObject();
5545 dbAdd(c->db,c->argv[2],dobj);
5546 }
5547 listTypePush(dobj,value,REDIS_HEAD);
5548 }
5549
5550 /* Send the element to the client as reply as well */
5551 addReplyBulk(c,value);
5552
5553 /* listTypePop returns an object with its refcount incremented */
5554 decrRefCount(value);
5555
5556 /* Delete the source list when it is empty */
5557 if (listTypeLength(sobj) == 0) dbDelete(c->db,c->argv[1]);
5558 server.dirty++;
5559 }
5560 }
5561
5562 /* ==================================== Sets ================================ */
5563
5564 static void saddCommand(redisClient *c) {
5565 robj *set;
5566
5567 set = lookupKeyWrite(c->db,c->argv[1]);
5568 if (set == NULL) {
5569 set = createSetObject();
5570 dbAdd(c->db,c->argv[1],set);
5571 } else {
5572 if (set->type != REDIS_SET) {
5573 addReply(c,shared.wrongtypeerr);
5574 return;
5575 }
5576 }
5577 if (dictAdd(set->ptr,c->argv[2],NULL) == DICT_OK) {
5578 incrRefCount(c->argv[2]);
5579 server.dirty++;
5580 addReply(c,shared.cone);
5581 } else {
5582 addReply(c,shared.czero);
5583 }
5584 }
5585
5586 static void sremCommand(redisClient *c) {
5587 robj *set;
5588
5589 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5590 checkType(c,set,REDIS_SET)) return;
5591
5592 if (dictDelete(set->ptr,c->argv[2]) == DICT_OK) {
5593 server.dirty++;
5594 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
5595 if (dictSize((dict*)set->ptr) == 0) dbDelete(c->db,c->argv[1]);
5596 addReply(c,shared.cone);
5597 } else {
5598 addReply(c,shared.czero);
5599 }
5600 }
5601
5602 static void smoveCommand(redisClient *c) {
5603 robj *srcset, *dstset;
5604
5605 srcset = lookupKeyWrite(c->db,c->argv[1]);
5606 dstset = lookupKeyWrite(c->db,c->argv[2]);
5607
5608 /* If the source key does not exist return 0, if it's of the wrong type
5609 * raise an error */
5610 if (srcset == NULL || srcset->type != REDIS_SET) {
5611 addReply(c, srcset ? shared.wrongtypeerr : shared.czero);
5612 return;
5613 }
5614 /* Error if the destination key is not a set as well */
5615 if (dstset && dstset->type != REDIS_SET) {
5616 addReply(c,shared.wrongtypeerr);
5617 return;
5618 }
5619 /* Remove the element from the source set */
5620 if (dictDelete(srcset->ptr,c->argv[3]) == DICT_ERR) {
5621 /* Key not found in the src set! return zero */
5622 addReply(c,shared.czero);
5623 return;
5624 }
5625 if (dictSize((dict*)srcset->ptr) == 0 && srcset != dstset)
5626 dbDelete(c->db,c->argv[1]);
5627 server.dirty++;
5628 /* Add the element to the destination set */
5629 if (!dstset) {
5630 dstset = createSetObject();
5631 dbAdd(c->db,c->argv[2],dstset);
5632 }
5633 if (dictAdd(dstset->ptr,c->argv[3],NULL) == DICT_OK)
5634 incrRefCount(c->argv[3]);
5635 addReply(c,shared.cone);
5636 }
5637
5638 static void sismemberCommand(redisClient *c) {
5639 robj *set;
5640
5641 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
5642 checkType(c,set,REDIS_SET)) return;
5643
5644 if (dictFind(set->ptr,c->argv[2]))
5645 addReply(c,shared.cone);
5646 else
5647 addReply(c,shared.czero);
5648 }
5649
5650 static void scardCommand(redisClient *c) {
5651 robj *o;
5652 dict *s;
5653
5654 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
5655 checkType(c,o,REDIS_SET)) return;
5656
5657 s = o->ptr;
5658 addReplyUlong(c,dictSize(s));
5659 }
5660
5661 static void spopCommand(redisClient *c) {
5662 robj *set;
5663 dictEntry *de;
5664
5665 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5666 checkType(c,set,REDIS_SET)) return;
5667
5668 de = dictGetRandomKey(set->ptr);
5669 if (de == NULL) {
5670 addReply(c,shared.nullbulk);
5671 } else {
5672 robj *ele = dictGetEntryKey(de);
5673
5674 addReplyBulk(c,ele);
5675 dictDelete(set->ptr,ele);
5676 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
5677 if (dictSize((dict*)set->ptr) == 0) dbDelete(c->db,c->argv[1]);
5678 server.dirty++;
5679 }
5680 }
5681
5682 static void srandmemberCommand(redisClient *c) {
5683 robj *set;
5684 dictEntry *de;
5685
5686 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5687 checkType(c,set,REDIS_SET)) return;
5688
5689 de = dictGetRandomKey(set->ptr);
5690 if (de == NULL) {
5691 addReply(c,shared.nullbulk);
5692 } else {
5693 robj *ele = dictGetEntryKey(de);
5694
5695 addReplyBulk(c,ele);
5696 }
5697 }
5698
5699 static int qsortCompareSetsByCardinality(const void *s1, const void *s2) {
5700 dict **d1 = (void*) s1, **d2 = (void*) s2;
5701
5702 return dictSize(*d1)-dictSize(*d2);
5703 }
5704
5705 static void sinterGenericCommand(redisClient *c, robj **setskeys, unsigned long setsnum, robj *dstkey) {
5706 dict **dv = zmalloc(sizeof(dict*)*setsnum);
5707 dictIterator *di;
5708 dictEntry *de;
5709 robj *lenobj = NULL, *dstset = NULL;
5710 unsigned long j, cardinality = 0;
5711
5712 for (j = 0; j < setsnum; j++) {
5713 robj *setobj;
5714
5715 setobj = dstkey ?
5716 lookupKeyWrite(c->db,setskeys[j]) :
5717 lookupKeyRead(c->db,setskeys[j]);
5718 if (!setobj) {
5719 zfree(dv);
5720 if (dstkey) {
5721 if (dbDelete(c->db,dstkey))
5722 server.dirty++;
5723 addReply(c,shared.czero);
5724 } else {
5725 addReply(c,shared.emptymultibulk);
5726 }
5727 return;
5728 }
5729 if (setobj->type != REDIS_SET) {
5730 zfree(dv);
5731 addReply(c,shared.wrongtypeerr);
5732 return;
5733 }
5734 dv[j] = setobj->ptr;
5735 }
5736 /* Sort sets from the smallest to largest, this will improve our
5737 * algorithm's performace */
5738 qsort(dv,setsnum,sizeof(dict*),qsortCompareSetsByCardinality);
5739
5740 /* The first thing we should output is the total number of elements...
5741 * since this is a multi-bulk write, but at this stage we don't know
5742 * the intersection set size, so we use a trick, append an empty object
5743 * to the output list and save the pointer to later modify it with the
5744 * right length */
5745 if (!dstkey) {
5746 lenobj = createObject(REDIS_STRING,NULL);
5747 addReply(c,lenobj);
5748 decrRefCount(lenobj);
5749 } else {
5750 /* If we have a target key where to store the resulting set
5751 * create this key with an empty set inside */
5752 dstset = createSetObject();
5753 }
5754
5755 /* Iterate all the elements of the first (smallest) set, and test
5756 * the element against all the other sets, if at least one set does
5757 * not include the element it is discarded */
5758 di = dictGetIterator(dv[0]);
5759
5760 while((de = dictNext(di)) != NULL) {
5761 robj *ele;
5762
5763 for (j = 1; j < setsnum; j++)
5764 if (dictFind(dv[j],dictGetEntryKey(de)) == NULL) break;
5765 if (j != setsnum)
5766 continue; /* at least one set does not contain the member */
5767 ele = dictGetEntryKey(de);
5768 if (!dstkey) {
5769 addReplyBulk(c,ele);
5770 cardinality++;
5771 } else {
5772 dictAdd(dstset->ptr,ele,NULL);
5773 incrRefCount(ele);
5774 }
5775 }
5776 dictReleaseIterator(di);
5777
5778 if (dstkey) {
5779 /* Store the resulting set into the target, if the intersection
5780 * is not an empty set. */
5781 dbDelete(c->db,dstkey);
5782 if (dictSize((dict*)dstset->ptr) > 0) {
5783 dbAdd(c->db,dstkey,dstset);
5784 addReplyLongLong(c,dictSize((dict*)dstset->ptr));
5785 } else {
5786 decrRefCount(dstset);
5787 addReply(c,shared.czero);
5788 }
5789 server.dirty++;
5790 } else {
5791 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",cardinality);
5792 }
5793 zfree(dv);
5794 }
5795
5796 static void sinterCommand(redisClient *c) {
5797 sinterGenericCommand(c,c->argv+1,c->argc-1,NULL);
5798 }
5799
5800 static void sinterstoreCommand(redisClient *c) {
5801 sinterGenericCommand(c,c->argv+2,c->argc-2,c->argv[1]);
5802 }
5803
5804 #define REDIS_OP_UNION 0
5805 #define REDIS_OP_DIFF 1
5806 #define REDIS_OP_INTER 2
5807
5808 static void sunionDiffGenericCommand(redisClient *c, robj **setskeys, int setsnum, robj *dstkey, int op) {
5809 dict **dv = zmalloc(sizeof(dict*)*setsnum);
5810 dictIterator *di;
5811 dictEntry *de;
5812 robj *dstset = NULL;
5813 int j, cardinality = 0;
5814
5815 for (j = 0; j < setsnum; j++) {
5816 robj *setobj;
5817
5818 setobj = dstkey ?
5819 lookupKeyWrite(c->db,setskeys[j]) :
5820 lookupKeyRead(c->db,setskeys[j]);
5821 if (!setobj) {
5822 dv[j] = NULL;
5823 continue;
5824 }
5825 if (setobj->type != REDIS_SET) {
5826 zfree(dv);
5827 addReply(c,shared.wrongtypeerr);
5828 return;
5829 }
5830 dv[j] = setobj->ptr;
5831 }
5832
5833 /* We need a temp set object to store our union. If the dstkey
5834 * is not NULL (that is, we are inside an SUNIONSTORE operation) then
5835 * this set object will be the resulting object to set into the target key*/
5836 dstset = createSetObject();
5837
5838 /* Iterate all the elements of all the sets, add every element a single
5839 * time to the result set */
5840 for (j = 0; j < setsnum; j++) {
5841 if (op == REDIS_OP_DIFF && j == 0 && !dv[j]) break; /* result set is empty */
5842 if (!dv[j]) continue; /* non existing keys are like empty sets */
5843
5844 di = dictGetIterator(dv[j]);
5845
5846 while((de = dictNext(di)) != NULL) {
5847 robj *ele;
5848
5849 /* dictAdd will not add the same element multiple times */
5850 ele = dictGetEntryKey(de);
5851 if (op == REDIS_OP_UNION || j == 0) {
5852 if (dictAdd(dstset->ptr,ele,NULL) == DICT_OK) {
5853 incrRefCount(ele);
5854 cardinality++;
5855 }
5856 } else if (op == REDIS_OP_DIFF) {
5857 if (dictDelete(dstset->ptr,ele) == DICT_OK) {
5858 cardinality--;
5859 }
5860 }
5861 }
5862 dictReleaseIterator(di);
5863
5864 /* result set is empty? Exit asap. */
5865 if (op == REDIS_OP_DIFF && cardinality == 0) break;
5866 }
5867
5868 /* Output the content of the resulting set, if not in STORE mode */
5869 if (!dstkey) {
5870 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",cardinality));
5871 di = dictGetIterator(dstset->ptr);
5872 while((de = dictNext(di)) != NULL) {
5873 robj *ele;
5874
5875 ele = dictGetEntryKey(de);
5876 addReplyBulk(c,ele);
5877 }
5878 dictReleaseIterator(di);
5879 decrRefCount(dstset);
5880 } else {
5881 /* If we have a target key where to store the resulting set
5882 * create this key with the result set inside */
5883 dbDelete(c->db,dstkey);
5884 if (dictSize((dict*)dstset->ptr) > 0) {
5885 dbAdd(c->db,dstkey,dstset);
5886 addReplyLongLong(c,dictSize((dict*)dstset->ptr));
5887 } else {
5888 decrRefCount(dstset);
5889 addReply(c,shared.czero);
5890 }
5891 server.dirty++;
5892 }
5893 zfree(dv);
5894 }
5895
5896 static void sunionCommand(redisClient *c) {
5897 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_UNION);
5898 }
5899
5900 static void sunionstoreCommand(redisClient *c) {
5901 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_UNION);
5902 }
5903
5904 static void sdiffCommand(redisClient *c) {
5905 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_DIFF);
5906 }
5907
5908 static void sdiffstoreCommand(redisClient *c) {
5909 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_DIFF);
5910 }
5911
5912 /* ==================================== ZSets =============================== */
5913
/* ZSETs are ordered sets using two data structures to hold the same elements
 * in order to get O(log(N)) INSERT and REMOVE operations into a sorted
 * data structure.
 *
 * The elements are added to a hash table mapping Redis objects to scores.
 * At the same time the elements are added to a skip list mapping scores
 * to Redis objects (so objects are sorted by scores in this "view"). */
5921
/* This skiplist implementation is almost a C translation of the original
 * algorithm described by William Pugh in "Skip Lists: A Probabilistic
 * Alternative to Balanced Trees", modified in three ways:
 * a) this implementation allows for repeated values.
 * b) the comparison is not just by key (our 'score') but by satellite data.
 * c) there is a back pointer, so it's a doubly linked list with the back
 * pointers being only at "level 1". This makes it possible to traverse the
 * list from tail to head, useful for ZREVRANGE. */
5930
5931 static zskiplistNode *zslCreateNode(int level, double score, robj *obj) {
5932 zskiplistNode *zn = zmalloc(sizeof(*zn));
5933
5934 zn->forward = zmalloc(sizeof(zskiplistNode*) * level);
5935 if (level > 1)
5936 zn->span = zmalloc(sizeof(unsigned int) * (level - 1));
5937 else
5938 zn->span = NULL;
5939 zn->score = score;
5940 zn->obj = obj;
5941 return zn;
5942 }
5943
5944 static zskiplist *zslCreate(void) {
5945 int j;
5946 zskiplist *zsl;
5947
5948 zsl = zmalloc(sizeof(*zsl));
5949 zsl->level = 1;
5950 zsl->length = 0;
5951 zsl->header = zslCreateNode(ZSKIPLIST_MAXLEVEL,0,NULL);
5952 for (j = 0; j < ZSKIPLIST_MAXLEVEL; j++) {
5953 zsl->header->forward[j] = NULL;
5954
5955 /* span has space for ZSKIPLIST_MAXLEVEL-1 elements */
5956 if (j < ZSKIPLIST_MAXLEVEL-1)
5957 zsl->header->span[j] = 0;
5958 }
5959 zsl->header->backward = NULL;
5960 zsl->tail = NULL;
5961 return zsl;
5962 }
5963
5964 static void zslFreeNode(zskiplistNode *node) {
5965 decrRefCount(node->obj);
5966 zfree(node->forward);
5967 zfree(node->span);
5968 zfree(node);
5969 }
5970
5971 static void zslFree(zskiplist *zsl) {
5972 zskiplistNode *node = zsl->header->forward[0], *next;
5973
5974 zfree(zsl->header->forward);
5975 zfree(zsl->header->span);
5976 zfree(zsl->header);
5977 while(node) {
5978 next = node->forward[0];
5979 zslFreeNode(node);
5980 node = next;
5981 }
5982 zfree(zsl);
5983 }
5984
5985 static int zslRandomLevel(void) {
5986 int level = 1;
5987 while ((random()&0xFFFF) < (ZSKIPLIST_P * 0xFFFF))
5988 level += 1;
5989 return (level<ZSKIPLIST_MAXLEVEL) ? level : ZSKIPLIST_MAXLEVEL;
5990 }
5991
5992 static void zslInsert(zskiplist *zsl, double score, robj *obj) {
5993 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5994 unsigned int rank[ZSKIPLIST_MAXLEVEL];
5995 int i, level;
5996
5997 x = zsl->header;
5998 for (i = zsl->level-1; i >= 0; i--) {
5999 /* store rank that is crossed to reach the insert position */
6000 rank[i] = i == (zsl->level-1) ? 0 : rank[i+1];
6001
6002 while (x->forward[i] &&
6003 (x->forward[i]->score < score ||
6004 (x->forward[i]->score == score &&
6005 compareStringObjects(x->forward[i]->obj,obj) < 0))) {
6006 rank[i] += i > 0 ? x->span[i-1] : 1;
6007 x = x->forward[i];
6008 }
6009 update[i] = x;
6010 }
6011 /* we assume the key is not already inside, since we allow duplicated
6012 * scores, and the re-insertion of score and redis object should never
6013 * happpen since the caller of zslInsert() should test in the hash table
6014 * if the element is already inside or not. */
6015 level = zslRandomLevel();
6016 if (level > zsl->level) {
6017 for (i = zsl->level; i < level; i++) {
6018 rank[i] = 0;
6019 update[i] = zsl->header;
6020 update[i]->span[i-1] = zsl->length;
6021 }
6022 zsl->level = level;
6023 }
6024 x = zslCreateNode(level,score,obj);
6025 for (i = 0; i < level; i++) {
6026 x->forward[i] = update[i]->forward[i];
6027 update[i]->forward[i] = x;
6028
6029 /* update span covered by update[i] as x is inserted here */
6030 if (i > 0) {
6031 x->span[i-1] = update[i]->span[i-1] - (rank[0] - rank[i]);
6032 update[i]->span[i-1] = (rank[0] - rank[i]) + 1;
6033 }
6034 }
6035
6036 /* increment span for untouched levels */
6037 for (i = level; i < zsl->level; i++) {
6038 update[i]->span[i-1]++;
6039 }
6040
6041 x->backward = (update[0] == zsl->header) ? NULL : update[0];
6042 if (x->forward[0])
6043 x->forward[0]->backward = x;
6044 else
6045 zsl->tail = x;
6046 zsl->length++;
6047 }
6048
6049 /* Internal function used by zslDelete, zslDeleteByScore and zslDeleteByRank */
6050 void zslDeleteNode(zskiplist *zsl, zskiplistNode *x, zskiplistNode **update) {
6051 int i;
6052 for (i = 0; i < zsl->level; i++) {
6053 if (update[i]->forward[i] == x) {
6054 if (i > 0) {
6055 update[i]->span[i-1] += x->span[i-1] - 1;
6056 }
6057 update[i]->forward[i] = x->forward[i];
6058 } else {
6059 /* invariant: i > 0, because update[0]->forward[0]
6060 * is always equal to x */
6061 update[i]->span[i-1] -= 1;
6062 }
6063 }
6064 if (x->forward[0]) {
6065 x->forward[0]->backward = x->backward;
6066 } else {
6067 zsl->tail = x->backward;
6068 }
6069 while(zsl->level > 1 && zsl->header->forward[zsl->level-1] == NULL)
6070 zsl->level--;
6071 zsl->length--;
6072 }
6073
6074 /* Delete an element with matching score/object from the skiplist. */
6075 static int zslDelete(zskiplist *zsl, double score, robj *obj) {
6076 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
6077 int i;
6078
6079 x = zsl->header;
6080 for (i = zsl->level-1; i >= 0; i--) {
6081 while (x->forward[i] &&
6082 (x->forward[i]->score < score ||
6083 (x->forward[i]->score == score &&
6084 compareStringObjects(x->forward[i]->obj,obj) < 0)))
6085 x = x->forward[i];
6086 update[i] = x;
6087 }
6088 /* We may have multiple elements with the same score, what we need
6089 * is to find the element with both the right score and object. */
6090 x = x->forward[0];
6091 if (x && score == x->score && equalStringObjects(x->obj,obj)) {
6092 zslDeleteNode(zsl, x, update);
6093 zslFreeNode(x);
6094 return 1;
6095 } else {
6096 return 0; /* not found */
6097 }
6098 return 0; /* not found */
6099 }
6100
6101 /* Delete all the elements with score between min and max from the skiplist.
6102 * Min and mx are inclusive, so a score >= min || score <= max is deleted.
6103 * Note that this function takes the reference to the hash table view of the
6104 * sorted set, in order to remove the elements from the hash table too. */
6105 static unsigned long zslDeleteRangeByScore(zskiplist *zsl, double min, double max, dict *dict) {
6106 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
6107 unsigned long removed = 0;
6108 int i;
6109
6110 x = zsl->header;
6111 for (i = zsl->level-1; i >= 0; i--) {
6112 while (x->forward[i] && x->forward[i]->score < min)
6113 x = x->forward[i];
6114 update[i] = x;
6115 }
6116 /* We may have multiple elements with the same score, what we need
6117 * is to find the element with both the right score and object. */
6118 x = x->forward[0];
6119 while (x && x->score <= max) {
6120 zskiplistNode *next = x->forward[0];
6121 zslDeleteNode(zsl, x, update);
6122 dictDelete(dict,x->obj);
6123 zslFreeNode(x);
6124 removed++;
6125 x = next;
6126 }
6127 return removed; /* not found */
6128 }
6129
6130 /* Delete all the elements with rank between start and end from the skiplist.
6131 * Start and end are inclusive. Note that start and end need to be 1-based */
6132 static unsigned long zslDeleteRangeByRank(zskiplist *zsl, unsigned int start, unsigned int end, dict *dict) {
6133 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
6134 unsigned long traversed = 0, removed = 0;
6135 int i;
6136
6137 x = zsl->header;
6138 for (i = zsl->level-1; i >= 0; i--) {
6139 while (x->forward[i] && (traversed + (i > 0 ? x->span[i-1] : 1)) < start) {
6140 traversed += i > 0 ? x->span[i-1] : 1;
6141 x = x->forward[i];
6142 }
6143 update[i] = x;
6144 }
6145
6146 traversed++;
6147 x = x->forward[0];
6148 while (x && traversed <= end) {
6149 zskiplistNode *next = x->forward[0];
6150 zslDeleteNode(zsl, x, update);
6151 dictDelete(dict,x->obj);
6152 zslFreeNode(x);
6153 removed++;
6154 traversed++;
6155 x = next;
6156 }
6157 return removed;
6158 }
6159
6160 /* Find the first node having a score equal or greater than the specified one.
6161 * Returns NULL if there is no match. */
6162 static zskiplistNode *zslFirstWithScore(zskiplist *zsl, double score) {
6163 zskiplistNode *x;
6164 int i;
6165
6166 x = zsl->header;
6167 for (i = zsl->level-1; i >= 0; i--) {
6168 while (x->forward[i] && x->forward[i]->score < score)
6169 x = x->forward[i];
6170 }
6171 /* We may have multiple elements with the same score, what we need
6172 * is to find the element with both the right score and object. */
6173 return x->forward[0];
6174 }
6175
6176 /* Find the rank for an element by both score and key.
6177 * Returns 0 when the element cannot be found, rank otherwise.
6178 * Note that the rank is 1-based due to the span of zsl->header to the
6179 * first element. */
6180 static unsigned long zslistTypeGetRank(zskiplist *zsl, double score, robj *o) {
6181 zskiplistNode *x;
6182 unsigned long rank = 0;
6183 int i;
6184
6185 x = zsl->header;
6186 for (i = zsl->level-1; i >= 0; i--) {
6187 while (x->forward[i] &&
6188 (x->forward[i]->score < score ||
6189 (x->forward[i]->score == score &&
6190 compareStringObjects(x->forward[i]->obj,o) <= 0))) {
6191 rank += i > 0 ? x->span[i-1] : 1;
6192 x = x->forward[i];
6193 }
6194
6195 /* x might be equal to zsl->header, so test if obj is non-NULL */
6196 if (x->obj && equalStringObjects(x->obj,o)) {
6197 return rank;
6198 }
6199 }
6200 return 0;
6201 }
6202
6203 /* Finds an element by its rank. The rank argument needs to be 1-based. */
6204 zskiplistNode* zslistTypeGetElementByRank(zskiplist *zsl, unsigned long rank) {
6205 zskiplistNode *x;
6206 unsigned long traversed = 0;
6207 int i;
6208
6209 x = zsl->header;
6210 for (i = zsl->level-1; i >= 0; i--) {
6211 while (x->forward[i] && (traversed + (i>0 ? x->span[i-1] : 1)) <= rank)
6212 {
6213 traversed += i > 0 ? x->span[i-1] : 1;
6214 x = x->forward[i];
6215 }
6216 if (traversed == rank) {
6217 return x;
6218 }
6219 }
6220 return NULL;
6221 }
6222
6223 /* The actual Z-commands implementations */
6224
6225 /* This generic command implements both ZADD and ZINCRBY.
6226 * scoreval is the score if the operation is a ZADD (doincrement == 0) or
6227 * the increment if the operation is a ZINCRBY (doincrement == 1). */
6228 static void zaddGenericCommand(redisClient *c, robj *key, robj *ele, double scoreval, int doincrement) {
6229 robj *zsetobj;
6230 zset *zs;
6231 double *score;
6232
6233 if (isnan(scoreval)) {
6234 addReplySds(c,sdsnew("-ERR provide score is Not A Number (nan)\r\n"));
6235 return;
6236 }
6237
6238 zsetobj = lookupKeyWrite(c->db,key);
6239 if (zsetobj == NULL) {
6240 zsetobj = createZsetObject();
6241 dbAdd(c->db,key,zsetobj);
6242 } else {
6243 if (zsetobj->type != REDIS_ZSET) {
6244 addReply(c,shared.wrongtypeerr);
6245 return;
6246 }
6247 }
6248 zs = zsetobj->ptr;
6249
6250 /* Ok now since we implement both ZADD and ZINCRBY here the code
6251 * needs to handle the two different conditions. It's all about setting
6252 * '*score', that is, the new score to set, to the right value. */
6253 score = zmalloc(sizeof(double));
6254 if (doincrement) {
6255 dictEntry *de;
6256
6257 /* Read the old score. If the element was not present starts from 0 */
6258 de = dictFind(zs->dict,ele);
6259 if (de) {
6260 double *oldscore = dictGetEntryVal(de);
6261 *score = *oldscore + scoreval;
6262 } else {
6263 *score = scoreval;
6264 }
6265 if (isnan(*score)) {
6266 addReplySds(c,
6267 sdsnew("-ERR resulting score is Not A Number (nan)\r\n"));
6268 zfree(score);
6269 /* Note that we don't need to check if the zset may be empty and
6270 * should be removed here, as we can only obtain Nan as score if
6271 * there was already an element in the sorted set. */
6272 return;
6273 }
6274 } else {
6275 *score = scoreval;
6276 }
6277
6278 /* What follows is a simple remove and re-insert operation that is common
6279 * to both ZADD and ZINCRBY... */
6280 if (dictAdd(zs->dict,ele,score) == DICT_OK) {
6281 /* case 1: New element */
6282 incrRefCount(ele); /* added to hash */
6283 zslInsert(zs->zsl,*score,ele);
6284 incrRefCount(ele); /* added to skiplist */
6285 server.dirty++;
6286 if (doincrement)
6287 addReplyDouble(c,*score);
6288 else
6289 addReply(c,shared.cone);
6290 } else {
6291 dictEntry *de;
6292 double *oldscore;
6293
6294 /* case 2: Score update operation */
6295 de = dictFind(zs->dict,ele);
6296 redisAssert(de != NULL);
6297 oldscore = dictGetEntryVal(de);
6298 if (*score != *oldscore) {
6299 int deleted;
6300
6301 /* Remove and insert the element in the skip list with new score */
6302 deleted = zslDelete(zs->zsl,*oldscore,ele);
6303 redisAssert(deleted != 0);
6304 zslInsert(zs->zsl,*score,ele);
6305 incrRefCount(ele);
6306 /* Update the score in the hash table */
6307 dictReplace(zs->dict,ele,score);
6308 server.dirty++;
6309 } else {
6310 zfree(score);
6311 }
6312 if (doincrement)
6313 addReplyDouble(c,*score);
6314 else
6315 addReply(c,shared.czero);
6316 }
6317 }
6318
6319 static void zaddCommand(redisClient *c) {
6320 double scoreval;
6321
6322 if (getDoubleFromObjectOrReply(c, c->argv[2], &scoreval, NULL) != REDIS_OK) return;
6323 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,0);
6324 }
6325
6326 static void zincrbyCommand(redisClient *c) {
6327 double scoreval;
6328
6329 if (getDoubleFromObjectOrReply(c, c->argv[2], &scoreval, NULL) != REDIS_OK) return;
6330 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,1);
6331 }
6332
6333 static void zremCommand(redisClient *c) {
6334 robj *zsetobj;
6335 zset *zs;
6336 dictEntry *de;
6337 double *oldscore;
6338 int deleted;
6339
6340 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
6341 checkType(c,zsetobj,REDIS_ZSET)) return;
6342
6343 zs = zsetobj->ptr;
6344 de = dictFind(zs->dict,c->argv[2]);
6345 if (de == NULL) {
6346 addReply(c,shared.czero);
6347 return;
6348 }
6349 /* Delete from the skiplist */
6350 oldscore = dictGetEntryVal(de);
6351 deleted = zslDelete(zs->zsl,*oldscore,c->argv[2]);
6352 redisAssert(deleted != 0);
6353
6354 /* Delete from the hash table */
6355 dictDelete(zs->dict,c->argv[2]);
6356 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
6357 if (dictSize(zs->dict) == 0) dbDelete(c->db,c->argv[1]);
6358 server.dirty++;
6359 addReply(c,shared.cone);
6360 }
6361
6362 static void zremrangebyscoreCommand(redisClient *c) {
6363 double min;
6364 double max;
6365 long deleted;
6366 robj *zsetobj;
6367 zset *zs;
6368
6369 if ((getDoubleFromObjectOrReply(c, c->argv[2], &min, NULL) != REDIS_OK) ||
6370 (getDoubleFromObjectOrReply(c, c->argv[3], &max, NULL) != REDIS_OK)) return;
6371
6372 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
6373 checkType(c,zsetobj,REDIS_ZSET)) return;
6374
6375 zs = zsetobj->ptr;
6376 deleted = zslDeleteRangeByScore(zs->zsl,min,max,zs->dict);
6377 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
6378 if (dictSize(zs->dict) == 0) dbDelete(c->db,c->argv[1]);
6379 server.dirty += deleted;
6380 addReplyLongLong(c,deleted);
6381 }
6382
6383 static void zremrangebyrankCommand(redisClient *c) {
6384 long start;
6385 long end;
6386 int llen;
6387 long deleted;
6388 robj *zsetobj;
6389 zset *zs;
6390
6391 if ((getLongFromObjectOrReply(c, c->argv[2], &start, NULL) != REDIS_OK) ||
6392 (getLongFromObjectOrReply(c, c->argv[3], &end, NULL) != REDIS_OK)) return;
6393
6394 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
6395 checkType(c,zsetobj,REDIS_ZSET)) return;
6396 zs = zsetobj->ptr;
6397 llen = zs->zsl->length;
6398
6399 /* convert negative indexes */
6400 if (start < 0) start = llen+start;
6401 if (end < 0) end = llen+end;
6402 if (start < 0) start = 0;
6403 if (end < 0) end = 0;
6404
6405 /* indexes sanity checks */
6406 if (start > end || start >= llen) {
6407 addReply(c,shared.czero);
6408 return;
6409 }
6410 if (end >= llen) end = llen-1;
6411
6412 /* increment start and end because zsl*Rank functions
6413 * use 1-based rank */
6414 deleted = zslDeleteRangeByRank(zs->zsl,start+1,end+1,zs->dict);
6415 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
6416 if (dictSize(zs->dict) == 0) dbDelete(c->db,c->argv[1]);
6417 server.dirty += deleted;
6418 addReplyLongLong(c, deleted);
6419 }
6420
6421 typedef struct {
6422 dict *dict;
6423 double weight;
6424 } zsetopsrc;
6425
6426 static int qsortCompareZsetopsrcByCardinality(const void *s1, const void *s2) {
6427 zsetopsrc *d1 = (void*) s1, *d2 = (void*) s2;
6428 unsigned long size1, size2;
6429 size1 = d1->dict ? dictSize(d1->dict) : 0;
6430 size2 = d2->dict ? dictSize(d2->dict) : 0;
6431 return size1 - size2;
6432 }
6433
/* Aggregation modes of ZUNIONSTORE/ZINTERSTORE */
#define REDIS_AGGR_SUM 1
#define REDIS_AGGR_MIN 2
#define REDIS_AGGR_MAX 3
/* Score of a dict entry: entries without a value count as 1.0 */
#define zunionInterDictValue(_e) (dictGetEntryVal(_e) == NULL ? 1.0 : *(double*)dictGetEntryVal(_e))

/* Fold 'val' into '*target' according to 'aggregate': add it for
 * REDIS_AGGR_SUM, keep the smaller value for REDIS_AGGR_MIN, keep the
 * larger value for REDIS_AGGR_MAX. Any other mode triggers redisPanic(). */
static inline void zunionInterAggregate(double *target, double val, int aggregate) {
    switch (aggregate) {
    case REDIS_AGGR_SUM:
        *target = *target + val;
        break;
    case REDIS_AGGR_MIN:
        if (val < *target) *target = val;
        break;
    case REDIS_AGGR_MAX:
        if (val > *target) *target = val;
        break;
    default:
        /* safety net */
        redisPanic("Unknown ZUNION/INTER aggregate type");
    }
}
6451
6452 static void zunionInterGenericCommand(redisClient *c, robj *dstkey, int op) {
6453 int i, j, setnum;
6454 int aggregate = REDIS_AGGR_SUM;
6455 zsetopsrc *src;
6456 robj *dstobj;
6457 zset *dstzset;
6458 dictIterator *di;
6459 dictEntry *de;
6460
6461 /* expect setnum input keys to be given */
6462 setnum = atoi(c->argv[2]->ptr);
6463 if (setnum < 1) {
6464 addReplySds(c,sdsnew("-ERR at least 1 input key is needed for ZUNIONSTORE/ZINTERSTORE\r\n"));
6465 return;
6466 }
6467
6468 /* test if the expected number of keys would overflow */
6469 if (3+setnum > c->argc) {
6470 addReply(c,shared.syntaxerr);
6471 return;
6472 }
6473
6474 /* read keys to be used for input */
6475 src = zmalloc(sizeof(zsetopsrc) * setnum);
6476 for (i = 0, j = 3; i < setnum; i++, j++) {
6477 robj *obj = lookupKeyWrite(c->db,c->argv[j]);
6478 if (!obj) {
6479 src[i].dict = NULL;
6480 } else {
6481 if (obj->type == REDIS_ZSET) {
6482 src[i].dict = ((zset*)obj->ptr)->dict;
6483 } else if (obj->type == REDIS_SET) {
6484 src[i].dict = (obj->ptr);
6485 } else {
6486 zfree(src);
6487 addReply(c,shared.wrongtypeerr);
6488 return;
6489 }
6490 }
6491
6492 /* default all weights to 1 */
6493 src[i].weight = 1.0;
6494 }
6495
6496 /* parse optional extra arguments */
6497 if (j < c->argc) {
6498 int remaining = c->argc - j;
6499
6500 while (remaining) {
6501 if (remaining >= (setnum + 1) && !strcasecmp(c->argv[j]->ptr,"weights")) {
6502 j++; remaining--;
6503 for (i = 0; i < setnum; i++, j++, remaining--) {
6504 if (getDoubleFromObjectOrReply(c, c->argv[j], &src[i].weight, NULL) != REDIS_OK)
6505 return;
6506 }
6507 } else if (remaining >= 2 && !strcasecmp(c->argv[j]->ptr,"aggregate")) {
6508 j++; remaining--;
6509 if (!strcasecmp(c->argv[j]->ptr,"sum")) {
6510 aggregate = REDIS_AGGR_SUM;
6511 } else if (!strcasecmp(c->argv[j]->ptr,"min")) {
6512 aggregate = REDIS_AGGR_MIN;
6513 } else if (!strcasecmp(c->argv[j]->ptr,"max")) {
6514 aggregate = REDIS_AGGR_MAX;
6515 } else {
6516 zfree(src);
6517 addReply(c,shared.syntaxerr);
6518 return;
6519 }
6520 j++; remaining--;
6521 } else {
6522 zfree(src);
6523 addReply(c,shared.syntaxerr);
6524 return;
6525 }
6526 }
6527 }
6528
6529 /* sort sets from the smallest to largest, this will improve our
6530 * algorithm's performance */
6531 qsort(src,setnum,sizeof(zsetopsrc),qsortCompareZsetopsrcByCardinality);
6532
6533 dstobj = createZsetObject();
6534 dstzset = dstobj->ptr;
6535
6536 if (op == REDIS_OP_INTER) {
6537 /* skip going over all entries if the smallest zset is NULL or empty */
6538 if (src[0].dict && dictSize(src[0].dict) > 0) {
6539 /* precondition: as src[0].dict is non-empty and the zsets are ordered
6540 * from small to large, all src[i > 0].dict are non-empty too */
6541 di = dictGetIterator(src[0].dict);
6542 while((de = dictNext(di)) != NULL) {
6543 double *score = zmalloc(sizeof(double)), value;
6544 *score = src[0].weight * zunionInterDictValue(de);
6545
6546 for (j = 1; j < setnum; j++) {
6547 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
6548 if (other) {
6549 value = src[j].weight * zunionInterDictValue(other);
6550 zunionInterAggregate(score, value, aggregate);
6551 } else {
6552 break;
6553 }
6554 }
6555
6556 /* skip entry when not present in every source dict */
6557 if (j != setnum) {
6558 zfree(score);
6559 } else {
6560 robj *o = dictGetEntryKey(de);
6561 dictAdd(dstzset->dict,o,score);
6562 incrRefCount(o); /* added to dictionary */
6563 zslInsert(dstzset->zsl,*score,o);
6564 incrRefCount(o); /* added to skiplist */
6565 }
6566 }
6567 dictReleaseIterator(di);
6568 }
6569 } else if (op == REDIS_OP_UNION) {
6570 for (i = 0; i < setnum; i++) {
6571 if (!src[i].dict) continue;
6572
6573 di = dictGetIterator(src[i].dict);
6574 while((de = dictNext(di)) != NULL) {
6575 /* skip key when already processed */
6576 if (dictFind(dstzset->dict,dictGetEntryKey(de)) != NULL) continue;
6577
6578 double *score = zmalloc(sizeof(double)), value;
6579 *score = src[i].weight * zunionInterDictValue(de);
6580
6581 /* because the zsets are sorted by size, its only possible
6582 * for sets at larger indices to hold this entry */
6583 for (j = (i+1); j < setnum; j++) {
6584 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
6585 if (other) {
6586 value = src[j].weight * zunionInterDictValue(other);
6587 zunionInterAggregate(score, value, aggregate);
6588 }
6589 }
6590
6591 robj *o = dictGetEntryKey(de);
6592 dictAdd(dstzset->dict,o,score);
6593 incrRefCount(o); /* added to dictionary */
6594 zslInsert(dstzset->zsl,*score,o);
6595 incrRefCount(o); /* added to skiplist */
6596 }
6597 dictReleaseIterator(di);
6598 }
6599 } else {
6600 /* unknown operator */
6601 redisAssert(op == REDIS_OP_INTER || op == REDIS_OP_UNION);
6602 }
6603
6604 dbDelete(c->db,dstkey);
6605 if (dstzset->zsl->length) {
6606 dbAdd(c->db,dstkey,dstobj);
6607 addReplyLongLong(c, dstzset->zsl->length);
6608 server.dirty++;
6609 } else {
6610 decrRefCount(dstobj);
6611 addReply(c, shared.czero);
6612 }
6613 zfree(src);
6614 }
6615
6616 static void zunionstoreCommand(redisClient *c) {
6617 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_UNION);
6618 }
6619
6620 static void zinterstoreCommand(redisClient *c) {
6621 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_INTER);
6622 }
6623
6624 static void zrangeGenericCommand(redisClient *c, int reverse) {
6625 robj *o;
6626 long start;
6627 long end;
6628 int withscores = 0;
6629 int llen;
6630 int rangelen, j;
6631 zset *zsetobj;
6632 zskiplist *zsl;
6633 zskiplistNode *ln;
6634 robj *ele;
6635
6636 if ((getLongFromObjectOrReply(c, c->argv[2], &start, NULL) != REDIS_OK) ||
6637 (getLongFromObjectOrReply(c, c->argv[3], &end, NULL) != REDIS_OK)) return;
6638
6639 if (c->argc == 5 && !strcasecmp(c->argv[4]->ptr,"withscores")) {
6640 withscores = 1;
6641 } else if (c->argc >= 5) {
6642 addReply(c,shared.syntaxerr);
6643 return;
6644 }
6645
6646 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
6647 || checkType(c,o,REDIS_ZSET)) return;
6648 zsetobj = o->ptr;
6649 zsl = zsetobj->zsl;
6650 llen = zsl->length;
6651
6652 /* convert negative indexes */
6653 if (start < 0) start = llen+start;
6654 if (end < 0) end = llen+end;
6655 if (start < 0) start = 0;
6656 if (end < 0) end = 0;
6657
6658 /* indexes sanity checks */
6659 if (start > end || start >= llen) {
6660 /* Out of range start or start > end result in empty list */
6661 addReply(c,shared.emptymultibulk);
6662 return;
6663 }
6664 if (end >= llen) end = llen-1;
6665 rangelen = (end-start)+1;
6666
6667 /* check if starting point is trivial, before searching
6668 * the element in log(N) time */
6669 if (reverse) {
6670 ln = start == 0 ? zsl->tail : zslistTypeGetElementByRank(zsl, llen-start);
6671 } else {
6672 ln = start == 0 ?
6673 zsl->header->forward[0] : zslistTypeGetElementByRank(zsl, start+1);
6674 }
6675
6676 /* Return the result in form of a multi-bulk reply */
6677 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",
6678 withscores ? (rangelen*2) : rangelen));
6679 for (j = 0; j < rangelen; j++) {
6680 ele = ln->obj;
6681 addReplyBulk(c,ele);
6682 if (withscores)
6683 addReplyDouble(c,ln->score);
6684 ln = reverse ? ln->backward : ln->forward[0];
6685 }
6686 }
6687
6688 static void zrangeCommand(redisClient *c) {
6689 zrangeGenericCommand(c,0);
6690 }
6691
6692 static void zrevrangeCommand(redisClient *c) {
6693 zrangeGenericCommand(c,1);
6694 }
6695
6696 /* This command implements both ZRANGEBYSCORE and ZCOUNT.
6697 * If justcount is non-zero, just the count is returned. */
6698 static void genericZrangebyscoreCommand(redisClient *c, int justcount) {
6699 robj *o;
6700 double min, max;
6701 int minex = 0, maxex = 0; /* are min or max exclusive? */
6702 int offset = 0, limit = -1;
6703 int withscores = 0;
6704 int badsyntax = 0;
6705
6706 /* Parse the min-max interval. If one of the values is prefixed
6707 * by the "(" character, it's considered "open". For instance
6708 * ZRANGEBYSCORE zset (1.5 (2.5 will match min < x < max
6709 * ZRANGEBYSCORE zset 1.5 2.5 will instead match min <= x <= max */
6710 if (((char*)c->argv[2]->ptr)[0] == '(') {
6711 min = strtod((char*)c->argv[2]->ptr+1,NULL);
6712 minex = 1;
6713 } else {
6714 min = strtod(c->argv[2]->ptr,NULL);
6715 }
6716 if (((char*)c->argv[3]->ptr)[0] == '(') {
6717 max = strtod((char*)c->argv[3]->ptr+1,NULL);
6718 maxex = 1;
6719 } else {
6720 max = strtod(c->argv[3]->ptr,NULL);
6721 }
6722
6723 /* Parse "WITHSCORES": note that if the command was called with
6724 * the name ZCOUNT then we are sure that c->argc == 4, so we'll never
6725 * enter the following paths to parse WITHSCORES and LIMIT. */
6726 if (c->argc == 5 || c->argc == 8) {
6727 if (strcasecmp(c->argv[c->argc-1]->ptr,"withscores") == 0)
6728 withscores = 1;
6729 else
6730 badsyntax = 1;
6731 }
6732 if (c->argc != (4 + withscores) && c->argc != (7 + withscores))
6733 badsyntax = 1;
6734 if (badsyntax) {
6735 addReplySds(c,
6736 sdsnew("-ERR wrong number of arguments for ZRANGEBYSCORE\r\n"));
6737 return;
6738 }
6739
6740 /* Parse "LIMIT" */
6741 if (c->argc == (7 + withscores) && strcasecmp(c->argv[4]->ptr,"limit")) {
6742 addReply(c,shared.syntaxerr);
6743 return;
6744 } else if (c->argc == (7 + withscores)) {
6745 offset = atoi(c->argv[5]->ptr);
6746 limit = atoi(c->argv[6]->ptr);
6747 if (offset < 0) offset = 0;
6748 }
6749
6750 /* Ok, lookup the key and get the range */
6751 o = lookupKeyRead(c->db,c->argv[1]);
6752 if (o == NULL) {
6753 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
6754 } else {
6755 if (o->type != REDIS_ZSET) {
6756 addReply(c,shared.wrongtypeerr);
6757 } else {
6758 zset *zsetobj = o->ptr;
6759 zskiplist *zsl = zsetobj->zsl;
6760 zskiplistNode *ln;
6761 robj *ele, *lenobj = NULL;
6762 unsigned long rangelen = 0;
6763
6764 /* Get the first node with the score >= min, or with
6765 * score > min if 'minex' is true. */
6766 ln = zslFirstWithScore(zsl,min);
6767 while (minex && ln && ln->score == min) ln = ln->forward[0];
6768
6769 if (ln == NULL) {
6770 /* No element matching the speciifed interval */
6771 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
6772 return;
6773 }
6774
6775 /* We don't know in advance how many matching elements there
6776 * are in the list, so we push this object that will represent
6777 * the multi-bulk length in the output buffer, and will "fix"
6778 * it later */
6779 if (!justcount) {
6780 lenobj = createObject(REDIS_STRING,NULL);
6781 addReply(c,lenobj);
6782 decrRefCount(lenobj);
6783 }
6784
6785 while(ln && (maxex ? (ln->score < max) : (ln->score <= max))) {
6786 if (offset) {
6787 offset--;
6788 ln = ln->forward[0];
6789 continue;
6790 }
6791 if (limit == 0) break;
6792 if (!justcount) {
6793 ele = ln->obj;
6794 addReplyBulk(c,ele);
6795 if (withscores)
6796 addReplyDouble(c,ln->score);
6797 }
6798 ln = ln->forward[0];
6799 rangelen++;
6800 if (limit > 0) limit--;
6801 }
6802 if (justcount) {
6803 addReplyLongLong(c,(long)rangelen);
6804 } else {
6805 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",
6806 withscores ? (rangelen*2) : rangelen);
6807 }
6808 }
6809 }
6810 }
6811
6812 static void zrangebyscoreCommand(redisClient *c) {
6813 genericZrangebyscoreCommand(c,0);
6814 }
6815
6816 static void zcountCommand(redisClient *c) {
6817 genericZrangebyscoreCommand(c,1);
6818 }
6819
6820 static void zcardCommand(redisClient *c) {
6821 robj *o;
6822 zset *zs;
6823
6824 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6825 checkType(c,o,REDIS_ZSET)) return;
6826
6827 zs = o->ptr;
6828 addReplyUlong(c,zs->zsl->length);
6829 }
6830
6831 static void zscoreCommand(redisClient *c) {
6832 robj *o;
6833 zset *zs;
6834 dictEntry *de;
6835
6836 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6837 checkType(c,o,REDIS_ZSET)) return;
6838
6839 zs = o->ptr;
6840 de = dictFind(zs->dict,c->argv[2]);
6841 if (!de) {
6842 addReply(c,shared.nullbulk);
6843 } else {
6844 double *score = dictGetEntryVal(de);
6845
6846 addReplyDouble(c,*score);
6847 }
6848 }
6849
6850 static void zrankGenericCommand(redisClient *c, int reverse) {
6851 robj *o;
6852 zset *zs;
6853 zskiplist *zsl;
6854 dictEntry *de;
6855 unsigned long rank;
6856 double *score;
6857
6858 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6859 checkType(c,o,REDIS_ZSET)) return;
6860
6861 zs = o->ptr;
6862 zsl = zs->zsl;
6863 de = dictFind(zs->dict,c->argv[2]);
6864 if (!de) {
6865 addReply(c,shared.nullbulk);
6866 return;
6867 }
6868
6869 score = dictGetEntryVal(de);
6870 rank = zslistTypeGetRank(zsl, *score, c->argv[2]);
6871 if (rank) {
6872 if (reverse) {
6873 addReplyLongLong(c, zsl->length - rank);
6874 } else {
6875 addReplyLongLong(c, rank-1);
6876 }
6877 } else {
6878 addReply(c,shared.nullbulk);
6879 }
6880 }
6881
6882 static void zrankCommand(redisClient *c) {
6883 zrankGenericCommand(c, 0);
6884 }
6885
6886 static void zrevrankCommand(redisClient *c) {
6887 zrankGenericCommand(c, 1);
6888 }
6889
6890 /* ========================= Hashes utility functions ======================= */
/* Bit flags selecting which part of a hash entry an operation refers to;
 * they can be OR-ed together to request both. */
#define REDIS_HASH_KEY   (1<<0)
#define REDIS_HASH_VALUE (1<<1)
6893
6894 /* Check the length of a number of objects to see if we need to convert a
6895 * zipmap to a real hash. Note that we only check string encoded objects
6896 * as their string length can be queried in constant time. */
6897 static void hashTypeTryConversion(robj *subject, robj **argv, int start, int end) {
6898 int i;
6899 if (subject->encoding != REDIS_ENCODING_ZIPMAP) return;
6900
6901 for (i = start; i <= end; i++) {
6902 if (argv[i]->encoding == REDIS_ENCODING_RAW &&
6903 sdslen(argv[i]->ptr) > server.hash_max_zipmap_value)
6904 {
6905 convertToRealHash(subject);
6906 return;
6907 }
6908 }
6909 }
6910
6911 /* Encode given objects in-place when the hash uses a dict. */
6912 static void hashTypeTryObjectEncoding(robj *subject, robj **o1, robj **o2) {
6913 if (subject->encoding == REDIS_ENCODING_HT) {
6914 if (o1) *o1 = tryObjectEncoding(*o1);
6915 if (o2) *o2 = tryObjectEncoding(*o2);
6916 }
6917 }
6918
6919 /* Get the value from a hash identified by key. Returns either a string
6920 * object or NULL if the value cannot be found. The refcount of the object
6921 * is always increased by 1 when the value was found. */
6922 static robj *hashTypeGet(robj *o, robj *key) {
6923 robj *value = NULL;
6924 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6925 unsigned char *v;
6926 unsigned int vlen;
6927 key = getDecodedObject(key);
6928 if (zipmapGet(o->ptr,key->ptr,sdslen(key->ptr),&v,&vlen)) {
6929 value = createStringObject((char*)v,vlen);
6930 }
6931 decrRefCount(key);
6932 } else {
6933 dictEntry *de = dictFind(o->ptr,key);
6934 if (de != NULL) {
6935 value = dictGetEntryVal(de);
6936 incrRefCount(value);
6937 }
6938 }
6939 return value;
6940 }
6941
6942 /* Test if the key exists in the given hash. Returns 1 if the key
6943 * exists and 0 when it doesn't. */
6944 static int hashTypeExists(robj *o, robj *key) {
6945 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6946 key = getDecodedObject(key);
6947 if (zipmapExists(o->ptr,key->ptr,sdslen(key->ptr))) {
6948 decrRefCount(key);
6949 return 1;
6950 }
6951 decrRefCount(key);
6952 } else {
6953 if (dictFind(o->ptr,key) != NULL) {
6954 return 1;
6955 }
6956 }
6957 return 0;
6958 }
6959
6960 /* Add an element, discard the old if the key already exists.
6961 * Return 0 on insert and 1 on update. */
6962 static int hashTypeSet(robj *o, robj *key, robj *value) {
6963 int update = 0;
6964 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6965 key = getDecodedObject(key);
6966 value = getDecodedObject(value);
6967 o->ptr = zipmapSet(o->ptr,
6968 key->ptr,sdslen(key->ptr),
6969 value->ptr,sdslen(value->ptr), &update);
6970 decrRefCount(key);
6971 decrRefCount(value);
6972
6973 /* Check if the zipmap needs to be upgraded to a real hash table */
6974 if (zipmapLen(o->ptr) > server.hash_max_zipmap_entries)
6975 convertToRealHash(o);
6976 } else {
6977 if (dictReplace(o->ptr,key,value)) {
6978 /* Insert */
6979 incrRefCount(key);
6980 } else {
6981 /* Update */
6982 update = 1;
6983 }
6984 incrRefCount(value);
6985 }
6986 return update;
6987 }
6988
6989 /* Delete an element from a hash.
6990 * Return 1 on deleted and 0 on not found. */
6991 static int hashTypeDelete(robj *o, robj *key) {
6992 int deleted = 0;
6993 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6994 key = getDecodedObject(key);
6995 o->ptr = zipmapDel(o->ptr,key->ptr,sdslen(key->ptr), &deleted);
6996 decrRefCount(key);
6997 } else {
6998 deleted = dictDelete((dict*)o->ptr,key) == DICT_OK;
6999 /* Always check if the dictionary needs a resize after a delete. */
7000 if (deleted && htNeedsResize(o->ptr)) dictResize(o->ptr);
7001 }
7002 return deleted;
7003 }
7004
7005 /* Return the number of elements in a hash. */
7006 static unsigned long hashTypeLength(robj *o) {
7007 return (o->encoding == REDIS_ENCODING_ZIPMAP) ?
7008 zipmapLen((unsigned char*)o->ptr) : dictSize((dict*)o->ptr);
7009 }
7010
7011 /* Structure to hold hash iteration abstration. Note that iteration over
7012 * hashes involves both fields and values. Because it is possible that
7013 * not both are required, store pointers in the iterator to avoid
7014 * unnecessary memory allocation for fields/values. */
7015 typedef struct {
7016 int encoding;
7017 unsigned char *zi;
7018 unsigned char *zk, *zv;
7019 unsigned int zklen, zvlen;
7020
7021 dictIterator *di;
7022 dictEntry *de;
7023 } hashTypeIterator;
7024
7025 static hashTypeIterator *hashTypeInitIterator(robj *subject) {
7026 hashTypeIterator *hi = zmalloc(sizeof(hashTypeIterator));
7027 hi->encoding = subject->encoding;
7028 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
7029 hi->zi = zipmapRewind(subject->ptr);
7030 } else if (hi->encoding == REDIS_ENCODING_HT) {
7031 hi->di = dictGetIterator(subject->ptr);
7032 } else {
7033 redisAssert(NULL);
7034 }
7035 return hi;
7036 }
7037
7038 static void hashTypeReleaseIterator(hashTypeIterator *hi) {
7039 if (hi->encoding == REDIS_ENCODING_HT) {
7040 dictReleaseIterator(hi->di);
7041 }
7042 zfree(hi);
7043 }
7044
7045 /* Move to the next entry in the hash. Return REDIS_OK when the next entry
7046 * could be found and REDIS_ERR when the iterator reaches the end. */
7047 static int hashTypeNext(hashTypeIterator *hi) {
7048 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
7049 if ((hi->zi = zipmapNext(hi->zi, &hi->zk, &hi->zklen,
7050 &hi->zv, &hi->zvlen)) == NULL) return REDIS_ERR;
7051 } else {
7052 if ((hi->de = dictNext(hi->di)) == NULL) return REDIS_ERR;
7053 }
7054 return REDIS_OK;
7055 }
7056
7057 /* Get key or value object at current iteration position.
7058 * This increases the refcount of the field object by 1. */
7059 static robj *hashTypeCurrent(hashTypeIterator *hi, int what) {
7060 robj *o;
7061 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
7062 if (what & REDIS_HASH_KEY) {
7063 o = createStringObject((char*)hi->zk,hi->zklen);
7064 } else {
7065 o = createStringObject((char*)hi->zv,hi->zvlen);
7066 }
7067 } else {
7068 if (what & REDIS_HASH_KEY) {
7069 o = dictGetEntryKey(hi->de);
7070 } else {
7071 o = dictGetEntryVal(hi->de);
7072 }
7073 incrRefCount(o);
7074 }
7075 return o;
7076 }
7077
7078 static robj *hashTypeLookupWriteOrCreate(redisClient *c, robj *key) {
7079 robj *o = lookupKeyWrite(c->db,key);
7080 if (o == NULL) {
7081 o = createHashObject();
7082 dbAdd(c->db,key,o);
7083 } else {
7084 if (o->type != REDIS_HASH) {
7085 addReply(c,shared.wrongtypeerr);
7086 return NULL;
7087 }
7088 }
7089 return o;
7090 }
7091
7092 /* ============================= Hash commands ============================== */
7093 static void hsetCommand(redisClient *c) {
7094 int update;
7095 robj *o;
7096
7097 if ((o = hashTypeLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
7098 hashTypeTryConversion(o,c->argv,2,3);
7099 hashTypeTryObjectEncoding(o,&c->argv[2], &c->argv[3]);
7100 update = hashTypeSet(o,c->argv[2],c->argv[3]);
7101 addReply(c, update ? shared.czero : shared.cone);
7102 server.dirty++;
7103 }
7104
7105 static void hsetnxCommand(redisClient *c) {
7106 robj *o;
7107 if ((o = hashTypeLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
7108 hashTypeTryConversion(o,c->argv,2,3);
7109
7110 if (hashTypeExists(o, c->argv[2])) {
7111 addReply(c, shared.czero);
7112 } else {
7113 hashTypeTryObjectEncoding(o,&c->argv[2], &c->argv[3]);
7114 hashTypeSet(o,c->argv[2],c->argv[3]);
7115 addReply(c, shared.cone);
7116 server.dirty++;
7117 }
7118 }
7119
7120 static void hmsetCommand(redisClient *c) {
7121 int i;
7122 robj *o;
7123
7124 if ((c->argc % 2) == 1) {
7125 addReplySds(c,sdsnew("-ERR wrong number of arguments for HMSET\r\n"));
7126 return;
7127 }
7128
7129 if ((o = hashTypeLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
7130 hashTypeTryConversion(o,c->argv,2,c->argc-1);
7131 for (i = 2; i < c->argc; i += 2) {
7132 hashTypeTryObjectEncoding(o,&c->argv[i], &c->argv[i+1]);
7133 hashTypeSet(o,c->argv[i],c->argv[i+1]);
7134 }
7135 addReply(c, shared.ok);
7136 server.dirty++;
7137 }
7138
7139 static void hincrbyCommand(redisClient *c) {
7140 long long value, incr;
7141 robj *o, *current, *new;
7142
7143 if (getLongLongFromObjectOrReply(c,c->argv[3],&incr,NULL) != REDIS_OK) return;
7144 if ((o = hashTypeLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
7145 if ((current = hashTypeGet(o,c->argv[2])) != NULL) {
7146 if (getLongLongFromObjectOrReply(c,current,&value,
7147 "hash value is not an integer") != REDIS_OK) {
7148 decrRefCount(current);
7149 return;
7150 }
7151 decrRefCount(current);
7152 } else {
7153 value = 0;
7154 }
7155
7156 value += incr;
7157 new = createStringObjectFromLongLong(value);
7158 hashTypeTryObjectEncoding(o,&c->argv[2],NULL);
7159 hashTypeSet(o,c->argv[2],new);
7160 decrRefCount(new);
7161 addReplyLongLong(c,value);
7162 server.dirty++;
7163 }
7164
7165 static void hgetCommand(redisClient *c) {
7166 robj *o, *value;
7167 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
7168 checkType(c,o,REDIS_HASH)) return;
7169
7170 if ((value = hashTypeGet(o,c->argv[2])) != NULL) {
7171 addReplyBulk(c,value);
7172 decrRefCount(value);
7173 } else {
7174 addReply(c,shared.nullbulk);
7175 }
7176 }
7177
7178 static void hmgetCommand(redisClient *c) {
7179 int i;
7180 robj *o, *value;
7181 o = lookupKeyRead(c->db,c->argv[1]);
7182 if (o != NULL && o->type != REDIS_HASH) {
7183 addReply(c,shared.wrongtypeerr);
7184 }
7185
7186 /* Note the check for o != NULL happens inside the loop. This is
7187 * done because objects that cannot be found are considered to be
7188 * an empty hash. The reply should then be a series of NULLs. */
7189 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-2));
7190 for (i = 2; i < c->argc; i++) {
7191 if (o != NULL && (value = hashTypeGet(o,c->argv[i])) != NULL) {
7192 addReplyBulk(c,value);
7193 decrRefCount(value);
7194 } else {
7195 addReply(c,shared.nullbulk);
7196 }
7197 }
7198 }
7199
7200 static void hdelCommand(redisClient *c) {
7201 robj *o;
7202 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
7203 checkType(c,o,REDIS_HASH)) return;
7204
7205 if (hashTypeDelete(o,c->argv[2])) {
7206 if (hashTypeLength(o) == 0) dbDelete(c->db,c->argv[1]);
7207 addReply(c,shared.cone);
7208 server.dirty++;
7209 } else {
7210 addReply(c,shared.czero);
7211 }
7212 }
7213
7214 static void hlenCommand(redisClient *c) {
7215 robj *o;
7216 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
7217 checkType(c,o,REDIS_HASH)) return;
7218
7219 addReplyUlong(c,hashTypeLength(o));
7220 }
7221
7222 static void genericHgetallCommand(redisClient *c, int flags) {
7223 robj *o, *lenobj, *obj;
7224 unsigned long count = 0;
7225 hashTypeIterator *hi;
7226
7227 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
7228 || checkType(c,o,REDIS_HASH)) return;
7229
7230 lenobj = createObject(REDIS_STRING,NULL);
7231 addReply(c,lenobj);
7232 decrRefCount(lenobj);
7233
7234 hi = hashTypeInitIterator(o);
7235 while (hashTypeNext(hi) != REDIS_ERR) {
7236 if (flags & REDIS_HASH_KEY) {
7237 obj = hashTypeCurrent(hi,REDIS_HASH_KEY);
7238 addReplyBulk(c,obj);
7239 decrRefCount(obj);
7240 count++;
7241 }
7242 if (flags & REDIS_HASH_VALUE) {
7243 obj = hashTypeCurrent(hi,REDIS_HASH_VALUE);
7244 addReplyBulk(c,obj);
7245 decrRefCount(obj);
7246 count++;
7247 }
7248 }
7249 hashTypeReleaseIterator(hi);
7250
7251 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",count);
7252 }
7253
7254 static void hkeysCommand(redisClient *c) {
7255 genericHgetallCommand(c,REDIS_HASH_KEY);
7256 }
7257
7258 static void hvalsCommand(redisClient *c) {
7259 genericHgetallCommand(c,REDIS_HASH_VALUE);
7260 }
7261
7262 static void hgetallCommand(redisClient *c) {
7263 genericHgetallCommand(c,REDIS_HASH_KEY|REDIS_HASH_VALUE);
7264 }
7265
7266 static void hexistsCommand(redisClient *c) {
7267 robj *o;
7268 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
7269 checkType(c,o,REDIS_HASH)) return;
7270
7271 addReply(c, hashTypeExists(o,c->argv[2]) ? shared.cone : shared.czero);
7272 }
7273
7274 static void convertToRealHash(robj *o) {
7275 unsigned char *key, *val, *p, *zm = o->ptr;
7276 unsigned int klen, vlen;
7277 dict *dict = dictCreate(&hashDictType,NULL);
7278
7279 assert(o->type == REDIS_HASH && o->encoding != REDIS_ENCODING_HT);
7280 p = zipmapRewind(zm);
7281 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
7282 robj *keyobj, *valobj;
7283
7284 keyobj = createStringObject((char*)key,klen);
7285 valobj = createStringObject((char*)val,vlen);
7286 keyobj = tryObjectEncoding(keyobj);
7287 valobj = tryObjectEncoding(valobj);
7288 dictAdd(dict,keyobj,valobj);
7289 }
7290 o->encoding = REDIS_ENCODING_HT;
7291 o->ptr = dict;
7292 zfree(zm);
7293 }
7294
7295 /* ========================= Non type-specific commands ==================== */
7296
7297 static void flushdbCommand(redisClient *c) {
7298 server.dirty += dictSize(c->db->dict);
7299 touchWatchedKeysOnFlush(c->db->id);
7300 dictEmpty(c->db->dict);
7301 dictEmpty(c->db->expires);
7302 addReply(c,shared.ok);
7303 }
7304
7305 static void flushallCommand(redisClient *c) {
7306 touchWatchedKeysOnFlush(-1);
7307 server.dirty += emptyDb();
7308 addReply(c,shared.ok);
7309 if (server.bgsavechildpid != -1) {
7310 kill(server.bgsavechildpid,SIGKILL);
7311 rdbRemoveTempFile(server.bgsavechildpid);
7312 }
7313 rdbSave(server.dbfilename);
7314 server.dirty++;
7315 }
7316
7317 static redisSortOperation *createSortOperation(int type, robj *pattern) {
7318 redisSortOperation *so = zmalloc(sizeof(*so));
7319 so->type = type;
7320 so->pattern = pattern;
7321 return so;
7322 }
7323
7324 /* Return the value associated to the key with a name obtained
7325 * substituting the first occurence of '*' in 'pattern' with 'subst'.
7326 * The returned object will always have its refcount increased by 1
7327 * when it is non-NULL. */
7328 static robj *lookupKeyByPattern(redisDb *db, robj *pattern, robj *subst) {
7329 char *p, *f;
7330 sds spat, ssub;
7331 robj keyobj, fieldobj, *o;
7332 int prefixlen, sublen, postfixlen, fieldlen;
7333 /* Expoit the internal sds representation to create a sds string allocated on the stack in order to make this function faster */
7334 struct {
7335 long len;
7336 long free;
7337 char buf[REDIS_SORTKEY_MAX+1];
7338 } keyname, fieldname;
7339
7340 /* If the pattern is "#" return the substitution object itself in order
7341 * to implement the "SORT ... GET #" feature. */
7342 spat = pattern->ptr;
7343 if (spat[0] == '#' && spat[1] == '\0') {
7344 incrRefCount(subst);
7345 return subst;
7346 }
7347
7348 /* The substitution object may be specially encoded. If so we create
7349 * a decoded object on the fly. Otherwise getDecodedObject will just
7350 * increment the ref count, that we'll decrement later. */
7351 subst = getDecodedObject(subst);
7352
7353 ssub = subst->ptr;
7354 if (sdslen(spat)+sdslen(ssub)-1 > REDIS_SORTKEY_MAX) return NULL;
7355 p = strchr(spat,'*');
7356 if (!p) {
7357 decrRefCount(subst);
7358 return NULL;
7359 }
7360
7361 /* Find out if we're dealing with a hash dereference. */
7362 if ((f = strstr(p+1, "->")) != NULL) {
7363 fieldlen = sdslen(spat)-(f-spat);
7364 /* this also copies \0 character */
7365 memcpy(fieldname.buf,f+2,fieldlen-1);
7366 fieldname.len = fieldlen-2;
7367 } else {
7368 fieldlen = 0;
7369 }
7370
7371 prefixlen = p-spat;
7372 sublen = sdslen(ssub);
7373 postfixlen = sdslen(spat)-(prefixlen+1)-fieldlen;
7374 memcpy(keyname.buf,spat,prefixlen);
7375 memcpy(keyname.buf+prefixlen,ssub,sublen);
7376 memcpy(keyname.buf+prefixlen+sublen,p+1,postfixlen);
7377 keyname.buf[prefixlen+sublen+postfixlen] = '\0';
7378 keyname.len = prefixlen+sublen+postfixlen;
7379 decrRefCount(subst);
7380
7381 /* Lookup substituted key */
7382 initStaticStringObject(keyobj,((char*)&keyname)+(sizeof(long)*2));
7383 o = lookupKeyRead(db,&keyobj);
7384 if (o == NULL) return NULL;
7385
7386 if (fieldlen > 0) {
7387 if (o->type != REDIS_HASH || fieldname.len < 1) return NULL;
7388
7389 /* Retrieve value from hash by the field name. This operation
7390 * already increases the refcount of the returned object. */
7391 initStaticStringObject(fieldobj,((char*)&fieldname)+(sizeof(long)*2));
7392 o = hashTypeGet(o, &fieldobj);
7393 } else {
7394 if (o->type != REDIS_STRING) return NULL;
7395
7396 /* Every object that this function returns needs to have its refcount
7397 * increased. sortCommand decreases it again. */
7398 incrRefCount(o);
7399 }
7400
7401 return o;
7402 }
7403
7404 /* sortCompare() is used by qsort in sortCommand(). Given that qsort_r with
7405 * the additional parameter is not standard but a BSD-specific we have to
7406 * pass sorting parameters via the global 'server' structure */
7407 static int sortCompare(const void *s1, const void *s2) {
7408 const redisSortObject *so1 = s1, *so2 = s2;
7409 int cmp;
7410
7411 if (!server.sort_alpha) {
7412 /* Numeric sorting. Here it's trivial as we precomputed scores */
7413 if (so1->u.score > so2->u.score) {
7414 cmp = 1;
7415 } else if (so1->u.score < so2->u.score) {
7416 cmp = -1;
7417 } else {
7418 cmp = 0;
7419 }
7420 } else {
7421 /* Alphanumeric sorting */
7422 if (server.sort_bypattern) {
7423 if (!so1->u.cmpobj || !so2->u.cmpobj) {
7424 /* At least one compare object is NULL */
7425 if (so1->u.cmpobj == so2->u.cmpobj)
7426 cmp = 0;
7427 else if (so1->u.cmpobj == NULL)
7428 cmp = -1;
7429 else
7430 cmp = 1;
7431 } else {
7432 /* We have both the objects, use strcoll */
7433 cmp = strcoll(so1->u.cmpobj->ptr,so2->u.cmpobj->ptr);
7434 }
7435 } else {
7436 /* Compare elements directly. */
7437 cmp = compareStringObjects(so1->obj,so2->obj);
7438 }
7439 }
7440 return server.sort_desc ? -cmp : cmp;
7441 }
7442
7443 /* The SORT command is the most complex command in Redis. Warning: this code
7444 * is optimized for speed and a bit less for readability */
7445 static void sortCommand(redisClient *c) {
7446 list *operations;
7447 unsigned int outputlen = 0;
7448 int desc = 0, alpha = 0;
7449 int limit_start = 0, limit_count = -1, start, end;
7450 int j, dontsort = 0, vectorlen;
7451 int getop = 0; /* GET operation counter */
7452 robj *sortval, *sortby = NULL, *storekey = NULL;
7453 redisSortObject *vector; /* Resulting vector to sort */
7454
7455 /* Lookup the key to sort. It must be of the right types */
7456 sortval = lookupKeyRead(c->db,c->argv[1]);
7457 if (sortval == NULL) {
7458 addReply(c,shared.emptymultibulk);
7459 return;
7460 }
7461 if (sortval->type != REDIS_SET && sortval->type != REDIS_LIST &&
7462 sortval->type != REDIS_ZSET)
7463 {
7464 addReply(c,shared.wrongtypeerr);
7465 return;
7466 }
7467
7468 /* Create a list of operations to perform for every sorted element.
7469 * Operations can be GET/DEL/INCR/DECR */
7470 operations = listCreate();
7471 listSetFreeMethod(operations,zfree);
7472 j = 2;
7473
7474 /* Now we need to protect sortval incrementing its count, in the future
7475 * SORT may have options able to overwrite/delete keys during the sorting
7476 * and the sorted key itself may get destroyed */
7477 incrRefCount(sortval);
7478
7479 /* The SORT command has an SQL-alike syntax, parse it */
7480 while(j < c->argc) {
7481 int leftargs = c->argc-j-1;
7482 if (!strcasecmp(c->argv[j]->ptr,"asc")) {
7483 desc = 0;
7484 } else if (!strcasecmp(c->argv[j]->ptr,"desc")) {
7485 desc = 1;
7486 } else if (!strcasecmp(c->argv[j]->ptr,"alpha")) {
7487 alpha = 1;
7488 } else if (!strcasecmp(c->argv[j]->ptr,"limit") && leftargs >= 2) {
7489 limit_start = atoi(c->argv[j+1]->ptr);
7490 limit_count = atoi(c->argv[j+2]->ptr);
7491 j+=2;
7492 } else if (!strcasecmp(c->argv[j]->ptr,"store") && leftargs >= 1) {
7493 storekey = c->argv[j+1];
7494 j++;
7495 } else if (!strcasecmp(c->argv[j]->ptr,"by") && leftargs >= 1) {
7496 sortby = c->argv[j+1];
7497 /* If the BY pattern does not contain '*', i.e. it is constant,
7498 * we don't need to sort nor to lookup the weight keys. */
7499 if (strchr(c->argv[j+1]->ptr,'*') == NULL) dontsort = 1;
7500 j++;
7501 } else if (!strcasecmp(c->argv[j]->ptr,"get") && leftargs >= 1) {
7502 listAddNodeTail(operations,createSortOperation(
7503 REDIS_SORT_GET,c->argv[j+1]));
7504 getop++;
7505 j++;
7506 } else {
7507 decrRefCount(sortval);
7508 listRelease(operations);
7509 addReply(c,shared.syntaxerr);
7510 return;
7511 }
7512 j++;
7513 }
7514
7515 /* Load the sorting vector with all the objects to sort */
7516 switch(sortval->type) {
7517 case REDIS_LIST: vectorlen = listTypeLength(sortval); break;
7518 case REDIS_SET: vectorlen = dictSize((dict*)sortval->ptr); break;
7519 case REDIS_ZSET: vectorlen = dictSize(((zset*)sortval->ptr)->dict); break;
7520 default: vectorlen = 0; redisPanic("Bad SORT type"); /* Avoid GCC warning */
7521 }
7522 vector = zmalloc(sizeof(redisSortObject)*vectorlen);
7523 j = 0;
7524
7525 if (sortval->type == REDIS_LIST) {
7526 listTypeIterator *li = listTypeInitIterator(sortval,0,REDIS_TAIL);
7527 listTypeEntry entry;
7528 while(listTypeNext(li,&entry)) {
7529 vector[j].obj = listTypeGet(&entry);
7530 vector[j].u.score = 0;
7531 vector[j].u.cmpobj = NULL;
7532 j++;
7533 }
7534 listTypeReleaseIterator(li);
7535 } else {
7536 dict *set;
7537 dictIterator *di;
7538 dictEntry *setele;
7539
7540 if (sortval->type == REDIS_SET) {
7541 set = sortval->ptr;
7542 } else {
7543 zset *zs = sortval->ptr;
7544 set = zs->dict;
7545 }
7546
7547 di = dictGetIterator(set);
7548 while((setele = dictNext(di)) != NULL) {
7549 vector[j].obj = dictGetEntryKey(setele);
7550 vector[j].u.score = 0;
7551 vector[j].u.cmpobj = NULL;
7552 j++;
7553 }
7554 dictReleaseIterator(di);
7555 }
7556 redisAssert(j == vectorlen);
7557
7558 /* Now it's time to load the right scores in the sorting vector */
7559 if (dontsort == 0) {
7560 for (j = 0; j < vectorlen; j++) {
7561 robj *byval;
7562 if (sortby) {
7563 /* lookup value to sort by */
7564 byval = lookupKeyByPattern(c->db,sortby,vector[j].obj);
7565 if (!byval) continue;
7566 } else {
7567 /* use object itself to sort by */
7568 byval = vector[j].obj;
7569 }
7570
7571 if (alpha) {
7572 if (sortby) vector[j].u.cmpobj = getDecodedObject(byval);
7573 } else {
7574 if (byval->encoding == REDIS_ENCODING_RAW) {
7575 vector[j].u.score = strtod(byval->ptr,NULL);
7576 } else if (byval->encoding == REDIS_ENCODING_INT) {
7577 /* Don't need to decode the object if it's
7578 * integer-encoded (the only encoding supported) so
7579 * far. We can just cast it */
7580 vector[j].u.score = (long)byval->ptr;
7581 } else {
7582 redisAssert(1 != 1);
7583 }
7584 }
7585
7586 /* when the object was retrieved using lookupKeyByPattern,
7587 * its refcount needs to be decreased. */
7588 if (sortby) {
7589 decrRefCount(byval);
7590 }
7591 }
7592 }
7593
7594 /* We are ready to sort the vector... perform a bit of sanity check
7595 * on the LIMIT option too. We'll use a partial version of quicksort. */
7596 start = (limit_start < 0) ? 0 : limit_start;
7597 end = (limit_count < 0) ? vectorlen-1 : start+limit_count-1;
7598 if (start >= vectorlen) {
7599 start = vectorlen-1;
7600 end = vectorlen-2;
7601 }
7602 if (end >= vectorlen) end = vectorlen-1;
7603
7604 if (dontsort == 0) {
7605 server.sort_desc = desc;
7606 server.sort_alpha = alpha;
7607 server.sort_bypattern = sortby ? 1 : 0;
7608 if (sortby && (start != 0 || end != vectorlen-1))
7609 pqsort(vector,vectorlen,sizeof(redisSortObject),sortCompare, start,end);
7610 else
7611 qsort(vector,vectorlen,sizeof(redisSortObject),sortCompare);
7612 }
7613
7614 /* Send command output to the output buffer, performing the specified
7615 * GET/DEL/INCR/DECR operations if any. */
7616 outputlen = getop ? getop*(end-start+1) : end-start+1;
7617 if (storekey == NULL) {
7618 /* STORE option not specified, sent the sorting result to client */
7619 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",outputlen));
7620 for (j = start; j <= end; j++) {
7621 listNode *ln;
7622 listIter li;
7623
7624 if (!getop) addReplyBulk(c,vector[j].obj);
7625 listRewind(operations,&li);
7626 while((ln = listNext(&li))) {
7627 redisSortOperation *sop = ln->value;
7628 robj *val = lookupKeyByPattern(c->db,sop->pattern,
7629 vector[j].obj);
7630
7631 if (sop->type == REDIS_SORT_GET) {
7632 if (!val) {
7633 addReply(c,shared.nullbulk);
7634 } else {
7635 addReplyBulk(c,val);
7636 decrRefCount(val);
7637 }
7638 } else {
7639 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
7640 }
7641 }
7642 }
7643 } else {
7644 robj *sobj = createZiplistObject();
7645
7646 /* STORE option specified, set the sorting result as a List object */
7647 for (j = start; j <= end; j++) {
7648 listNode *ln;
7649 listIter li;
7650
7651 if (!getop) {
7652 listTypePush(sobj,vector[j].obj,REDIS_TAIL);
7653 } else {
7654 listRewind(operations,&li);
7655 while((ln = listNext(&li))) {
7656 redisSortOperation *sop = ln->value;
7657 robj *val = lookupKeyByPattern(c->db,sop->pattern,
7658 vector[j].obj);
7659
7660 if (sop->type == REDIS_SORT_GET) {
7661 if (!val) val = createStringObject("",0);
7662
7663 /* listTypePush does an incrRefCount, so we should take care
7664 * care of the incremented refcount caused by either
7665 * lookupKeyByPattern or createStringObject("",0) */
7666 listTypePush(sobj,val,REDIS_TAIL);
7667 decrRefCount(val);
7668 } else {
7669 /* always fails */
7670 redisAssert(sop->type == REDIS_SORT_GET);
7671 }
7672 }
7673 }
7674 }
7675 dbReplace(c->db,storekey,sobj);
7676 /* Note: we add 1 because the DB is dirty anyway since even if the
7677 * SORT result is empty a new key is set and maybe the old content
7678 * replaced. */
7679 server.dirty += 1+outputlen;
7680 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",outputlen));
7681 }
7682
7683 /* Cleanup */
7684 if (sortval->type == REDIS_LIST)
7685 for (j = 0; j < vectorlen; j++)
7686 decrRefCount(vector[j].obj);
7687 decrRefCount(sortval);
7688 listRelease(operations);
7689 for (j = 0; j < vectorlen; j++) {
7690 if (alpha && vector[j].u.cmpobj)
7691 decrRefCount(vector[j].u.cmpobj);
7692 }
7693 zfree(vector);
7694 }
7695
/* Convert an amount of bytes into a human readable string in the form
 * of 100B, 4.00K, 100.00M, 2.00G, 1.50T, and so forth.
 *
 * 's' is the destination buffer, 'n' the number of bytes to describe.
 * Values below 1024 are printed as an integer followed by 'B'; larger values
 * are printed with two decimals and the K/M/G/T unit (powers of 1024).
 * Everything from one terabyte up is expressed in T, so 's' is always
 * written, whatever the value of 'n'. */
static void bytesToHuman(char *s, unsigned long long n) {
    double d;

    if (n < 1024) {
        /* Bytes */
        sprintf(s,"%lluB",n);
    } else if (n < (1024*1024)) {
        d = (double)n/(1024);
        sprintf(s,"%.2fK",d);
    } else if (n < (1024LL*1024*1024)) {
        d = (double)n/(1024*1024);
        sprintf(s,"%.2fM",d);
    } else if (n < (1024LL*1024*1024*1024)) {
        d = (double)n/(1024LL*1024*1024);
        sprintf(s,"%.2fG",d);
    } else {
        /* One terabyte or more: without this branch 's' was left untouched
         * and the caller ended up printing an uninitialized buffer. */
        d = (double)n/(1024LL*1024*1024*1024);
        sprintf(s,"%.2fT",d);
    }
}
7716
7717 /* Create the string returned by the INFO command. This is decoupled
7718 * by the INFO command itself as we need to report the same information
7719 * on memory corruption problems. */
7720 static sds genRedisInfoString(void) {
7721 sds info;
7722 time_t uptime = time(NULL)-server.stat_starttime;
7723 int j;
7724 char hmem[64];
7725
7726 bytesToHuman(hmem,zmalloc_used_memory());
7727 info = sdscatprintf(sdsempty(),
7728 "redis_version:%s\r\n"
7729 "redis_git_sha1:%s\r\n"
7730 "redis_git_dirty:%d\r\n"
7731 "arch_bits:%s\r\n"
7732 "multiplexing_api:%s\r\n"
7733 "process_id:%ld\r\n"
7734 "uptime_in_seconds:%ld\r\n"
7735 "uptime_in_days:%ld\r\n"
7736 "connected_clients:%d\r\n"
7737 "connected_slaves:%d\r\n"
7738 "blocked_clients:%d\r\n"
7739 "used_memory:%zu\r\n"
7740 "used_memory_human:%s\r\n"
7741 "changes_since_last_save:%lld\r\n"
7742 "bgsave_in_progress:%d\r\n"
7743 "last_save_time:%ld\r\n"
7744 "bgrewriteaof_in_progress:%d\r\n"
7745 "total_connections_received:%lld\r\n"
7746 "total_commands_processed:%lld\r\n"
7747 "expired_keys:%lld\r\n"
7748 "hash_max_zipmap_entries:%zu\r\n"
7749 "hash_max_zipmap_value:%zu\r\n"
7750 "pubsub_channels:%ld\r\n"
7751 "pubsub_patterns:%u\r\n"
7752 "vm_enabled:%d\r\n"
7753 "role:%s\r\n"
7754 ,REDIS_VERSION,
7755 REDIS_GIT_SHA1,
7756 strtol(REDIS_GIT_DIRTY,NULL,10) > 0,
7757 (sizeof(long) == 8) ? "64" : "32",
7758 aeGetApiName(),
7759 (long) getpid(),
7760 uptime,
7761 uptime/(3600*24),
7762 listLength(server.clients)-listLength(server.slaves),
7763 listLength(server.slaves),
7764 server.blpop_blocked_clients,
7765 zmalloc_used_memory(),
7766 hmem,
7767 server.dirty,
7768 server.bgsavechildpid != -1,
7769 server.lastsave,
7770 server.bgrewritechildpid != -1,
7771 server.stat_numconnections,
7772 server.stat_numcommands,
7773 server.stat_expiredkeys,
7774 server.hash_max_zipmap_entries,
7775 server.hash_max_zipmap_value,
7776 dictSize(server.pubsub_channels),
7777 listLength(server.pubsub_patterns),
7778 server.vm_enabled != 0,
7779 server.masterhost == NULL ? "master" : "slave"
7780 );
7781 if (server.masterhost) {
7782 info = sdscatprintf(info,
7783 "master_host:%s\r\n"
7784 "master_port:%d\r\n"
7785 "master_link_status:%s\r\n"
7786 "master_last_io_seconds_ago:%d\r\n"
7787 ,server.masterhost,
7788 server.masterport,
7789 (server.replstate == REDIS_REPL_CONNECTED) ?
7790 "up" : "down",
7791 server.master ? ((int)(time(NULL)-server.master->lastinteraction)) : -1
7792 );
7793 }
7794 if (server.vm_enabled) {
7795 lockThreadedIO();
7796 info = sdscatprintf(info,
7797 "vm_conf_max_memory:%llu\r\n"
7798 "vm_conf_page_size:%llu\r\n"
7799 "vm_conf_pages:%llu\r\n"
7800 "vm_stats_used_pages:%llu\r\n"
7801 "vm_stats_swapped_objects:%llu\r\n"
7802 "vm_stats_swappin_count:%llu\r\n"
7803 "vm_stats_swappout_count:%llu\r\n"
7804 "vm_stats_io_newjobs_len:%lu\r\n"
7805 "vm_stats_io_processing_len:%lu\r\n"
7806 "vm_stats_io_processed_len:%lu\r\n"
7807 "vm_stats_io_active_threads:%lu\r\n"
7808 "vm_stats_blocked_clients:%lu\r\n"
7809 ,(unsigned long long) server.vm_max_memory,
7810 (unsigned long long) server.vm_page_size,
7811 (unsigned long long) server.vm_pages,
7812 (unsigned long long) server.vm_stats_used_pages,
7813 (unsigned long long) server.vm_stats_swapped_objects,
7814 (unsigned long long) server.vm_stats_swapins,
7815 (unsigned long long) server.vm_stats_swapouts,
7816 (unsigned long) listLength(server.io_newjobs),
7817 (unsigned long) listLength(server.io_processing),
7818 (unsigned long) listLength(server.io_processed),
7819 (unsigned long) server.io_active_threads,
7820 (unsigned long) server.vm_blocked_clients
7821 );
7822 unlockThreadedIO();
7823 }
7824 for (j = 0; j < server.dbnum; j++) {
7825 long long keys, vkeys;
7826
7827 keys = dictSize(server.db[j].dict);
7828 vkeys = dictSize(server.db[j].expires);
7829 if (keys || vkeys) {
7830 info = sdscatprintf(info, "db%d:keys=%lld,expires=%lld\r\n",
7831 j, keys, vkeys);
7832 }
7833 }
7834 return info;
7835 }
7836
7837 static void infoCommand(redisClient *c) {
7838 sds info = genRedisInfoString();
7839 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",
7840 (unsigned long)sdslen(info)));
7841 addReplySds(c,info);
7842 addReply(c,shared.crlf);
7843 }
7844
7845 static void monitorCommand(redisClient *c) {
7846 /* ignore MONITOR if aleady slave or in monitor mode */
7847 if (c->flags & REDIS_SLAVE) return;
7848
7849 c->flags |= (REDIS_SLAVE|REDIS_MONITOR);
7850 c->slaveseldb = 0;
7851 listAddNodeTail(server.monitors,c);
7852 addReply(c,shared.ok);
7853 }
7854
7855 /* ================================= Expire ================================= */
7856 static int removeExpire(redisDb *db, robj *key) {
7857 /* An expire may only be removed if there is a corresponding entry in the
7858 * main dict. Otherwise, the key will never be freed. */
7859 redisAssert(dictFind(db->dict,key->ptr) != NULL);
7860 if (dictDelete(db->expires,key->ptr) == DICT_OK) {
7861 return 1;
7862 } else {
7863 return 0;
7864 }
7865 }
7866
7867 static int setExpire(redisDb *db, robj *key, time_t when) {
7868 dictEntry *de;
7869
7870 /* Reuse the sds from the main dict in the expire dict */
7871 redisAssert((de = dictFind(db->dict,key->ptr)) != NULL);
7872 if (dictAdd(db->expires,dictGetEntryKey(de),(void*)when) == DICT_ERR) {
7873 return 0;
7874 } else {
7875 return 1;
7876 }
7877 }
7878
7879 /* Return the expire time of the specified key, or -1 if no expire
7880 * is associated with this key (i.e. the key is non volatile) */
7881 static time_t getExpire(redisDb *db, robj *key) {
7882 dictEntry *de;
7883
7884 /* No expire? return ASAP */
7885 if (dictSize(db->expires) == 0 ||
7886 (de = dictFind(db->expires,key->ptr)) == NULL) return -1;
7887
7888 /* The entry was found in the expire dict, this means it should also
7889 * be present in the main dict (safety check). */
7890 redisAssert(dictFind(db->dict,key->ptr) != NULL);
7891 return (time_t) dictGetEntryVal(de);
7892 }
7893
7894 static int expireIfNeeded(redisDb *db, robj *key) {
7895 time_t when = getExpire(db,key);
7896 if (when < 0) return 0;
7897
7898 /* Return when this key has not expired */
7899 if (time(NULL) <= when) return 0;
7900
7901 /* Delete the key */
7902 server.stat_expiredkeys++;
7903 server.dirty++;
7904 return dbDelete(db,key);
7905 }
7906
7907 static int deleteIfVolatile(redisDb *db, robj *key) {
7908 if (getExpire(db,key) < 0) return 0;
7909
7910 /* Delete the key */
7911 server.stat_expiredkeys++;
7912 server.dirty++;
7913 return dbDelete(db,key);
7914 }
7915
7916 static void expireGenericCommand(redisClient *c, robj *key, robj *param, long offset) {
7917 dictEntry *de;
7918 time_t seconds;
7919
7920 if (getLongFromObjectOrReply(c, param, &seconds, NULL) != REDIS_OK) return;
7921
7922 seconds -= offset;
7923
7924 de = dictFind(c->db->dict,key->ptr);
7925 if (de == NULL) {
7926 addReply(c,shared.czero);
7927 return;
7928 }
7929 if (seconds <= 0) {
7930 if (dbDelete(c->db,key)) server.dirty++;
7931 addReply(c, shared.cone);
7932 return;
7933 } else {
7934 time_t when = time(NULL)+seconds;
7935 if (setExpire(c->db,key,when)) {
7936 addReply(c,shared.cone);
7937 server.dirty++;
7938 } else {
7939 addReply(c,shared.czero);
7940 }
7941 return;
7942 }
7943 }
7944
7945 static void expireCommand(redisClient *c) {
7946 expireGenericCommand(c,c->argv[1],c->argv[2],0);
7947 }
7948
7949 static void expireatCommand(redisClient *c) {
7950 expireGenericCommand(c,c->argv[1],c->argv[2],time(NULL));
7951 }
7952
7953 static void ttlCommand(redisClient *c) {
7954 time_t expire;
7955 int ttl = -1;
7956
7957 expire = getExpire(c->db,c->argv[1]);
7958 if (expire != -1) {
7959 ttl = (int) (expire-time(NULL));
7960 if (ttl < 0) ttl = -1;
7961 }
7962 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",ttl));
7963 }
7964
7965 /* ================================ MULTI/EXEC ============================== */
7966
7967 /* Client state initialization for MULTI/EXEC */
7968 static void initClientMultiState(redisClient *c) {
7969 c->mstate.commands = NULL;
7970 c->mstate.count = 0;
7971 }
7972
7973 /* Release all the resources associated with MULTI/EXEC state */
7974 static void freeClientMultiState(redisClient *c) {
7975 int j;
7976
7977 for (j = 0; j < c->mstate.count; j++) {
7978 int i;
7979 multiCmd *mc = c->mstate.commands+j;
7980
7981 for (i = 0; i < mc->argc; i++)
7982 decrRefCount(mc->argv[i]);
7983 zfree(mc->argv);
7984 }
7985 zfree(c->mstate.commands);
7986 }
7987
7988 /* Add a new command into the MULTI commands queue */
7989 static void queueMultiCommand(redisClient *c, struct redisCommand *cmd) {
7990 multiCmd *mc;
7991 int j;
7992
7993 c->mstate.commands = zrealloc(c->mstate.commands,
7994 sizeof(multiCmd)*(c->mstate.count+1));
7995 mc = c->mstate.commands+c->mstate.count;
7996 mc->cmd = cmd;
7997 mc->argc = c->argc;
7998 mc->argv = zmalloc(sizeof(robj*)*c->argc);
7999 memcpy(mc->argv,c->argv,sizeof(robj*)*c->argc);
8000 for (j = 0; j < c->argc; j++)
8001 incrRefCount(mc->argv[j]);
8002 c->mstate.count++;
8003 }
8004
8005 static void multiCommand(redisClient *c) {
8006 if (c->flags & REDIS_MULTI) {
8007 addReplySds(c,sdsnew("-ERR MULTI calls can not be nested\r\n"));
8008 return;
8009 }
8010 c->flags |= REDIS_MULTI;
8011 addReply(c,shared.ok);
8012 }
8013
8014 static void discardCommand(redisClient *c) {
8015 if (!(c->flags & REDIS_MULTI)) {
8016 addReplySds(c,sdsnew("-ERR DISCARD without MULTI\r\n"));
8017 return;
8018 }
8019
8020 freeClientMultiState(c);
8021 initClientMultiState(c);
8022 c->flags &= (~REDIS_MULTI);
8023 unwatchAllKeys(c);
8024 addReply(c,shared.ok);
8025 }
8026
8027 /* Send a MULTI command to all the slaves and AOF file. Check the execCommand
8028 * implememntation for more information. */
8029 static void execCommandReplicateMulti(redisClient *c) {
8030 struct redisCommand *cmd;
8031 robj *multistring = createStringObject("MULTI",5);
8032
8033 cmd = lookupCommand("multi");
8034 if (server.appendonly)
8035 feedAppendOnlyFile(cmd,c->db->id,&multistring,1);
8036 if (listLength(server.slaves))
8037 replicationFeedSlaves(server.slaves,c->db->id,&multistring,1);
8038 decrRefCount(multistring);
8039 }
8040
8041 static void execCommand(redisClient *c) {
8042 int j;
8043 robj **orig_argv;
8044 int orig_argc;
8045
8046 if (!(c->flags & REDIS_MULTI)) {
8047 addReplySds(c,sdsnew("-ERR EXEC without MULTI\r\n"));
8048 return;
8049 }
8050
8051 /* Check if we need to abort the EXEC if some WATCHed key was touched.
8052 * A failed EXEC will return a multi bulk nil object. */
8053 if (c->flags & REDIS_DIRTY_CAS) {
8054 freeClientMultiState(c);
8055 initClientMultiState(c);
8056 c->flags &= ~(REDIS_MULTI|REDIS_DIRTY_CAS);
8057 unwatchAllKeys(c);
8058 addReply(c,shared.nullmultibulk);
8059 return;
8060 }
8061
8062 /* Replicate a MULTI request now that we are sure the block is executed.
8063 * This way we'll deliver the MULTI/..../EXEC block as a whole and
8064 * both the AOF and the replication link will have the same consistency
8065 * and atomicity guarantees. */
8066 execCommandReplicateMulti(c);
8067
8068 /* Exec all the queued commands */
8069 unwatchAllKeys(c); /* Unwatch ASAP otherwise we'll waste CPU cycles */
8070 orig_argv = c->argv;
8071 orig_argc = c->argc;
8072 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->mstate.count));
8073 for (j = 0; j < c->mstate.count; j++) {
8074 c->argc = c->mstate.commands[j].argc;
8075 c->argv = c->mstate.commands[j].argv;
8076 call(c,c->mstate.commands[j].cmd);
8077 }
8078 c->argv = orig_argv;
8079 c->argc = orig_argc;
8080 freeClientMultiState(c);
8081 initClientMultiState(c);
8082 c->flags &= ~(REDIS_MULTI|REDIS_DIRTY_CAS);
8083 /* Make sure the EXEC command is always replicated / AOF, since we
8084 * always send the MULTI command (we can't know beforehand if the
8085 * next operations will contain at least a modification to the DB). */
8086 server.dirty++;
8087 }
8088
8089 /* =========================== Blocking Operations ========================= */
8090
8091 /* Currently Redis blocking operations support is limited to list POP ops,
8092 * so the current implementation is not fully generic, but it is also not
8093 * completely specific so it will not require a rewrite to support new
8094 * kind of blocking operations in the future.
8095 *
8096 * Still it's important to note that list blocking operations can be already
8097 * used as a notification mechanism in order to implement other blocking
8098 * operations at application level, so there must be a very strong evidence
8099 * of usefulness and generality before new blocking operations are implemented.
8100 *
8101 * This is how the current blocking POP works, we use BLPOP as example:
 * - If the user calls BLPOP and the key exists and contains a non empty list
 *   then LPOP is called instead. So BLPOP is semantically the same as LPOP
 *   if there is no need to block.
 * - If instead BLPOP is called and the key does not exist or the list is
 *   empty we need to block. In order to do so we remove the notification for
 *   new data to read in the client socket (so that we'll not serve new
 *   requests if the blocking request is not served). Also we put the client
 *   in a dictionary (db->blocking_keys) mapping keys to a list of clients
 *   blocking for these keys.
 * - If a PUSH operation against a key with blocked clients waiting is
 *   performed, we serve the first in the list: basically instead of pushing
 *   the new element inside the list we return it to the (first / oldest)
 *   blocking client, unblock the client, and remove it from the list.
8115 *
8116 * The above comment and the source code should be enough in order to understand
8117 * the implementation and modify / fix it later.
8118 */
8119
8120 /* Set a client in blocking mode for the specified key, with the specified
8121 * timeout */
8122 static void blockForKeys(redisClient *c, robj **keys, int numkeys, time_t timeout) {
8123 dictEntry *de;
8124 list *l;
8125 int j;
8126
8127 c->blocking_keys = zmalloc(sizeof(robj*)*numkeys);
8128 c->blocking_keys_num = numkeys;
8129 c->blockingto = timeout;
8130 for (j = 0; j < numkeys; j++) {
8131 /* Add the key in the client structure, to map clients -> keys */
8132 c->blocking_keys[j] = keys[j];
8133 incrRefCount(keys[j]);
8134
8135 /* And in the other "side", to map keys -> clients */
8136 de = dictFind(c->db->blocking_keys,keys[j]);
8137 if (de == NULL) {
8138 int retval;
8139
8140 /* For every key we take a list of clients blocked for it */
8141 l = listCreate();
8142 retval = dictAdd(c->db->blocking_keys,keys[j],l);
8143 incrRefCount(keys[j]);
8144 assert(retval == DICT_OK);
8145 } else {
8146 l = dictGetEntryVal(de);
8147 }
8148 listAddNodeTail(l,c);
8149 }
8150 /* Mark the client as a blocked client */
8151 c->flags |= REDIS_BLOCKED;
8152 server.blpop_blocked_clients++;
8153 }
8154
8155 /* Unblock a client that's waiting in a blocking operation such as BLPOP */
8156 static void unblockClientWaitingData(redisClient *c) {
8157 dictEntry *de;
8158 list *l;
8159 int j;
8160
8161 assert(c->blocking_keys != NULL);
8162 /* The client may wait for multiple keys, so unblock it for every key. */
8163 for (j = 0; j < c->blocking_keys_num; j++) {
8164 /* Remove this client from the list of clients waiting for this key. */
8165 de = dictFind(c->db->blocking_keys,c->blocking_keys[j]);
8166 assert(de != NULL);
8167 l = dictGetEntryVal(de);
8168 listDelNode(l,listSearchKey(l,c));
8169 /* If the list is empty we need to remove it to avoid wasting memory */
8170 if (listLength(l) == 0)
8171 dictDelete(c->db->blocking_keys,c->blocking_keys[j]);
8172 decrRefCount(c->blocking_keys[j]);
8173 }
8174 /* Cleanup the client structure */
8175 zfree(c->blocking_keys);
8176 c->blocking_keys = NULL;
8177 c->flags &= (~REDIS_BLOCKED);
8178 server.blpop_blocked_clients--;
8179 /* We want to process data if there is some command waiting
8180 * in the input buffer. Note that this is safe even if
8181 * unblockClientWaitingData() gets called from freeClient() because
8182 * freeClient() will be smart enough to call this function
8183 * *after* c->querybuf was set to NULL. */
8184 if (c->querybuf && sdslen(c->querybuf) > 0) processInputBuffer(c);
8185 }
8186
8187 /* This should be called from any function PUSHing into lists.
8188 * 'c' is the "pushing client", 'key' is the key it is pushing data against,
8189 * 'ele' is the element pushed.
8190 *
8191 * If the function returns 0 there was no client waiting for a list push
8192 * against this key.
8193 *
8194 * If the function returns 1 there was a client waiting for a list push
8195 * against this key, the element was passed to this client thus it's not
8196 * needed to actually add it to the list and the caller should return asap. */
8197 static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele) {
8198 struct dictEntry *de;
8199 redisClient *receiver;
8200 list *l;
8201 listNode *ln;
8202
8203 de = dictFind(c->db->blocking_keys,key);
8204 if (de == NULL) return 0;
8205 l = dictGetEntryVal(de);
8206 ln = listFirst(l);
8207 assert(ln != NULL);
8208 receiver = ln->value;
8209
8210 addReplySds(receiver,sdsnew("*2\r\n"));
8211 addReplyBulk(receiver,key);
8212 addReplyBulk(receiver,ele);
8213 unblockClientWaitingData(receiver);
8214 return 1;
8215 }
8216
8217 /* Blocking RPOP/LPOP */
8218 static void blockingPopGenericCommand(redisClient *c, int where) {
8219 robj *o;
8220 time_t timeout;
8221 int j;
8222
8223 for (j = 1; j < c->argc-1; j++) {
8224 o = lookupKeyWrite(c->db,c->argv[j]);
8225 if (o != NULL) {
8226 if (o->type != REDIS_LIST) {
8227 addReply(c,shared.wrongtypeerr);
8228 return;
8229 } else {
8230 list *list = o->ptr;
8231 if (listLength(list) != 0) {
8232 /* If the list contains elements fall back to the usual
8233 * non-blocking POP operation */
8234 robj *argv[2], **orig_argv;
8235 int orig_argc;
8236
8237 /* We need to alter the command arguments before to call
8238 * popGenericCommand() as the command takes a single key. */
8239 orig_argv = c->argv;
8240 orig_argc = c->argc;
8241 argv[1] = c->argv[j];
8242 c->argv = argv;
8243 c->argc = 2;
8244
8245 /* Also the return value is different, we need to output
8246 * the multi bulk reply header and the key name. The
8247 * "real" command will add the last element (the value)
8248 * for us. If this souds like an hack to you it's just
8249 * because it is... */
8250 addReplySds(c,sdsnew("*2\r\n"));
8251 addReplyBulk(c,argv[1]);
8252 popGenericCommand(c,where);
8253
8254 /* Fix the client structure with the original stuff */
8255 c->argv = orig_argv;
8256 c->argc = orig_argc;
8257 return;
8258 }
8259 }
8260 }
8261 }
8262 /* If the list is empty or the key does not exists we must block */
8263 timeout = strtol(c->argv[c->argc-1]->ptr,NULL,10);
8264 if (timeout > 0) timeout += time(NULL);
8265 blockForKeys(c,c->argv+1,c->argc-2,timeout);
8266 }
8267
8268 static void blpopCommand(redisClient *c) {
8269 blockingPopGenericCommand(c,REDIS_HEAD);
8270 }
8271
8272 static void brpopCommand(redisClient *c) {
8273 blockingPopGenericCommand(c,REDIS_TAIL);
8274 }
8275
8276 /* =============================== Replication ============================= */
8277
8278 static int syncWrite(int fd, char *ptr, ssize_t size, int timeout) {
8279 ssize_t nwritten, ret = size;
8280 time_t start = time(NULL);
8281
8282 timeout++;
8283 while(size) {
8284 if (aeWait(fd,AE_WRITABLE,1000) & AE_WRITABLE) {
8285 nwritten = write(fd,ptr,size);
8286 if (nwritten == -1) return -1;
8287 ptr += nwritten;
8288 size -= nwritten;
8289 }
8290 if ((time(NULL)-start) > timeout) {
8291 errno = ETIMEDOUT;
8292 return -1;
8293 }
8294 }
8295 return ret;
8296 }
8297
8298 static int syncRead(int fd, char *ptr, ssize_t size, int timeout) {
8299 ssize_t nread, totread = 0;
8300 time_t start = time(NULL);
8301
8302 timeout++;
8303 while(size) {
8304 if (aeWait(fd,AE_READABLE,1000) & AE_READABLE) {
8305 nread = read(fd,ptr,size);
8306 if (nread == -1) return -1;
8307 ptr += nread;
8308 size -= nread;
8309 totread += nread;
8310 }
8311 if ((time(NULL)-start) > timeout) {
8312 errno = ETIMEDOUT;
8313 return -1;
8314 }
8315 }
8316 return totread;
8317 }
8318
/* Read a line from 'fd' into the 'size' bytes buffer pointed to by 'ptr',
 * one byte at a time using syncRead() with the given timeout.
 *
 * Reading stops at the first '\n', which is not stored; a '\r' right before
 * it is stripped as well. The result is always null terminated once at
 * least one byte was read. At most size-1 characters are stored: when the
 * buffer fills up before a newline is seen the function returns what was
 * read so far.
 *
 * Returns the number of characters stored, or -1 if syncRead() fails. */
static int syncReadLine(int fd, char *ptr, ssize_t size, int timeout) {
    ssize_t nread = 0;

    size--; /* Reserve room for the null term. */
    while(size > 0) {
        char c;

        if (syncRead(fd,&c,1,timeout) == -1) return -1;
        if (c == '\n') {
            *ptr = '\0';
            if (nread && *(ptr-1) == '\r') *(ptr-1) = '\0';
            return nread;
        } else {
            *ptr++ = c;
            *ptr = '\0';
            nread++;
        }
        /* One byte of the buffer was used: without this the loop would
         * keep writing past the end of the buffer on overlong lines. */
        size--;
    }
    return nread;
}
8339
8340 static void syncCommand(redisClient *c) {
8341 /* ignore SYNC if aleady slave or in monitor mode */
8342 if (c->flags & REDIS_SLAVE) return;
8343
8344 /* SYNC can't be issued when the server has pending data to send to
8345 * the client about already issued commands. We need a fresh reply
8346 * buffer registering the differences between the BGSAVE and the current
8347 * dataset, so that we can copy to other slaves if needed. */
8348 if (listLength(c->reply) != 0) {
8349 addReplySds(c,sdsnew("-ERR SYNC is invalid with pending input\r\n"));
8350 return;
8351 }
8352
8353 redisLog(REDIS_NOTICE,"Slave ask for synchronization");
8354 /* Here we need to check if there is a background saving operation
8355 * in progress, or if it is required to start one */
8356 if (server.bgsavechildpid != -1) {
8357 /* Ok a background save is in progress. Let's check if it is a good
8358 * one for replication, i.e. if there is another slave that is
8359 * registering differences since the server forked to save */
8360 redisClient *slave;
8361 listNode *ln;
8362 listIter li;
8363
8364 listRewind(server.slaves,&li);
8365 while((ln = listNext(&li))) {
8366 slave = ln->value;
8367 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) break;
8368 }
8369 if (ln) {
8370 /* Perfect, the server is already registering differences for
8371 * another slave. Set the right state, and copy the buffer. */
8372 listRelease(c->reply);
8373 c->reply = listDup(slave->reply);
8374 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
8375 redisLog(REDIS_NOTICE,"Waiting for end of BGSAVE for SYNC");
8376 } else {
8377 /* No way, we need to wait for the next BGSAVE in order to
8378 * register differences */
8379 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
8380 redisLog(REDIS_NOTICE,"Waiting for next BGSAVE for SYNC");
8381 }
8382 } else {
8383 /* Ok we don't have a BGSAVE in progress, let's start one */
8384 redisLog(REDIS_NOTICE,"Starting BGSAVE for SYNC");
8385 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
8386 redisLog(REDIS_NOTICE,"Replication failed, can't BGSAVE");
8387 addReplySds(c,sdsnew("-ERR Unalbe to perform background save\r\n"));
8388 return;
8389 }
8390 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
8391 }
8392 c->repldbfd = -1;
8393 c->flags |= REDIS_SLAVE;
8394 c->slaveseldb = 0;
8395 listAddNodeTail(server.slaves,c);
8396 return;
8397 }
8398
8399 static void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) {
8400 redisClient *slave = privdata;
8401 REDIS_NOTUSED(el);
8402 REDIS_NOTUSED(mask);
8403 char buf[REDIS_IOBUF_LEN];
8404 ssize_t nwritten, buflen;
8405
8406 if (slave->repldboff == 0) {
8407 /* Write the bulk write count before to transfer the DB. In theory here
8408 * we don't know how much room there is in the output buffer of the
8409 * socket, but in pratice SO_SNDLOWAT (the minimum count for output
8410 * operations) will never be smaller than the few bytes we need. */
8411 sds bulkcount;
8412
8413 bulkcount = sdscatprintf(sdsempty(),"$%lld\r\n",(unsigned long long)
8414 slave->repldbsize);
8415 if (write(fd,bulkcount,sdslen(bulkcount)) != (signed)sdslen(bulkcount))
8416 {
8417 sdsfree(bulkcount);
8418 freeClient(slave);
8419 return;
8420 }
8421 sdsfree(bulkcount);
8422 }
8423 lseek(slave->repldbfd,slave->repldboff,SEEK_SET);
8424 buflen = read(slave->repldbfd,buf,REDIS_IOBUF_LEN);
8425 if (buflen <= 0) {
8426 redisLog(REDIS_WARNING,"Read error sending DB to slave: %s",
8427 (buflen == 0) ? "premature EOF" : strerror(errno));
8428 freeClient(slave);
8429 return;
8430 }
8431 if ((nwritten = write(fd,buf,buflen)) == -1) {
8432 redisLog(REDIS_VERBOSE,"Write error sending DB to slave: %s",
8433 strerror(errno));
8434 freeClient(slave);
8435 return;
8436 }
8437 slave->repldboff += nwritten;
8438 if (slave->repldboff == slave->repldbsize) {
8439 close(slave->repldbfd);
8440 slave->repldbfd = -1;
8441 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
8442 slave->replstate = REDIS_REPL_ONLINE;
8443 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE,
8444 sendReplyToClient, slave) == AE_ERR) {
8445 freeClient(slave);
8446 return;
8447 }
8448 addReplySds(slave,sdsempty());
8449 redisLog(REDIS_NOTICE,"Synchronization with slave succeeded");
8450 }
8451 }
8452
8453 /* This function is called at the end of every backgrond saving.
8454 * The argument bgsaveerr is REDIS_OK if the background saving succeeded
8455 * otherwise REDIS_ERR is passed to the function.
8456 *
8457 * The goal of this function is to handle slaves waiting for a successful
8458 * background saving in order to perform non-blocking synchronization. */
8459 static void updateSlavesWaitingBgsave(int bgsaveerr) {
8460 listNode *ln;
8461 int startbgsave = 0;
8462 listIter li;
8463
8464 listRewind(server.slaves,&li);
8465 while((ln = listNext(&li))) {
8466 redisClient *slave = ln->value;
8467
8468 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) {
8469 startbgsave = 1;
8470 slave->replstate = REDIS_REPL_WAIT_BGSAVE_END;
8471 } else if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) {
8472 struct redis_stat buf;
8473
8474 if (bgsaveerr != REDIS_OK) {
8475 freeClient(slave);
8476 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE child returned an error");
8477 continue;
8478 }
8479 if ((slave->repldbfd = open(server.dbfilename,O_RDONLY)) == -1 ||
8480 redis_fstat(slave->repldbfd,&buf) == -1) {
8481 freeClient(slave);
8482 redisLog(REDIS_WARNING,"SYNC failed. Can't open/stat DB after BGSAVE: %s", strerror(errno));
8483 continue;
8484 }
8485 slave->repldboff = 0;
8486 slave->repldbsize = buf.st_size;
8487 slave->replstate = REDIS_REPL_SEND_BULK;
8488 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
8489 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE, sendBulkToSlave, slave) == AE_ERR) {
8490 freeClient(slave);
8491 continue;
8492 }
8493 }
8494 }
8495 if (startbgsave) {
8496 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
8497 listIter li;
8498
8499 listRewind(server.slaves,&li);
8500 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE failed");
8501 while((ln = listNext(&li))) {
8502 redisClient *slave = ln->value;
8503
8504 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START)
8505 freeClient(slave);
8506 }
8507 }
8508 }
8509 }
8510
8511 static int syncWithMaster(void) {
8512 char buf[1024], tmpfile[256], authcmd[1024];
8513 long dumpsize;
8514 int fd = anetTcpConnect(NULL,server.masterhost,server.masterport);
8515 int dfd, maxtries = 5;
8516
8517 if (fd == -1) {
8518 redisLog(REDIS_WARNING,"Unable to connect to MASTER: %s",
8519 strerror(errno));
8520 return REDIS_ERR;
8521 }
8522
8523 /* AUTH with the master if required. */
8524 if(server.masterauth) {
8525 snprintf(authcmd, 1024, "AUTH %s\r\n", server.masterauth);
8526 if (syncWrite(fd, authcmd, strlen(server.masterauth)+7, 5) == -1) {
8527 close(fd);
8528 redisLog(REDIS_WARNING,"Unable to AUTH to MASTER: %s",
8529 strerror(errno));
8530 return REDIS_ERR;
8531 }
8532 /* Read the AUTH result. */
8533 if (syncReadLine(fd,buf,1024,3600) == -1) {
8534 close(fd);
8535 redisLog(REDIS_WARNING,"I/O error reading auth result from MASTER: %s",
8536 strerror(errno));
8537 return REDIS_ERR;
8538 }
8539 if (buf[0] != '+') {
8540 close(fd);
8541 redisLog(REDIS_WARNING,"Cannot AUTH to MASTER, is the masterauth password correct?");
8542 return REDIS_ERR;
8543 }
8544 }
8545
8546 /* Issue the SYNC command */
8547 if (syncWrite(fd,"SYNC \r\n",7,5) == -1) {
8548 close(fd);
8549 redisLog(REDIS_WARNING,"I/O error writing to MASTER: %s",
8550 strerror(errno));
8551 return REDIS_ERR;
8552 }
8553 /* Read the bulk write count */
8554 if (syncReadLine(fd,buf,1024,3600) == -1) {
8555 close(fd);
8556 redisLog(REDIS_WARNING,"I/O error reading bulk count from MASTER: %s",
8557 strerror(errno));
8558 return REDIS_ERR;
8559 }
8560 if (buf[0] != '$') {
8561 close(fd);
8562 redisLog(REDIS_WARNING,"Bad protocol from MASTER, the first byte is not '$', are you sure the host and port are right?");
8563 return REDIS_ERR;
8564 }
8565 dumpsize = strtol(buf+1,NULL,10);
8566 redisLog(REDIS_NOTICE,"Receiving %ld bytes data dump from MASTER",dumpsize);
8567 /* Read the bulk write data on a temp file */
8568 while(maxtries--) {
8569 snprintf(tmpfile,256,
8570 "temp-%d.%ld.rdb",(int)time(NULL),(long int)getpid());
8571 dfd = open(tmpfile,O_CREAT|O_WRONLY|O_EXCL,0644);
8572 if (dfd != -1) break;
8573 sleep(1);
8574 }
8575 if (dfd == -1) {
8576 close(fd);
8577 redisLog(REDIS_WARNING,"Opening the temp file needed for MASTER <-> SLAVE synchronization: %s",strerror(errno));
8578 return REDIS_ERR;
8579 }
8580 while(dumpsize) {
8581 int nread, nwritten;
8582
8583 nread = read(fd,buf,(dumpsize < 1024)?dumpsize:1024);
8584 if (nread == -1) {
8585 redisLog(REDIS_WARNING,"I/O error trying to sync with MASTER: %s",
8586 strerror(errno));
8587 close(fd);
8588 close(dfd);
8589 return REDIS_ERR;
8590 }
8591 nwritten = write(dfd,buf,nread);
8592 if (nwritten == -1) {
8593 redisLog(REDIS_WARNING,"Write error writing to the DB dump file needed for MASTER <-> SLAVE synchrnonization: %s", strerror(errno));
8594 close(fd);
8595 close(dfd);
8596 return REDIS_ERR;
8597 }
8598 dumpsize -= nread;
8599 }
8600 close(dfd);
8601 if (rename(tmpfile,server.dbfilename) == -1) {
8602 redisLog(REDIS_WARNING,"Failed trying to rename the temp DB into dump.rdb in MASTER <-> SLAVE synchronization: %s", strerror(errno));
8603 unlink(tmpfile);
8604 close(fd);
8605 return REDIS_ERR;
8606 }
8607 emptyDb();
8608 if (rdbLoad(server.dbfilename) != REDIS_OK) {
8609 redisLog(REDIS_WARNING,"Failed trying to load the MASTER synchronization DB from disk");
8610 close(fd);
8611 return REDIS_ERR;
8612 }
8613 server.master = createClient(fd);
8614 server.master->flags |= REDIS_MASTER;
8615 server.master->authenticated = 1;
8616 server.replstate = REDIS_REPL_CONNECTED;
8617 return REDIS_OK;
8618 }
8619
8620 static void slaveofCommand(redisClient *c) {
8621 if (!strcasecmp(c->argv[1]->ptr,"no") &&
8622 !strcasecmp(c->argv[2]->ptr,"one")) {
8623 if (server.masterhost) {
8624 sdsfree(server.masterhost);
8625 server.masterhost = NULL;
8626 if (server.master) freeClient(server.master);
8627 server.replstate = REDIS_REPL_NONE;
8628 redisLog(REDIS_NOTICE,"MASTER MODE enabled (user request)");
8629 }
8630 } else {
8631 sdsfree(server.masterhost);
8632 server.masterhost = sdsdup(c->argv[1]->ptr);
8633 server.masterport = atoi(c->argv[2]->ptr);
8634 if (server.master) freeClient(server.master);
8635 server.replstate = REDIS_REPL_CONNECT;
8636 redisLog(REDIS_NOTICE,"SLAVE OF %s:%d enabled (user request)",
8637 server.masterhost, server.masterport);
8638 }
8639 addReply(c,shared.ok);
8640 }
8641
8642 /* ============================ Maxmemory directive ======================== */
8643
8644 /* Try to free one object form the pre-allocated objects free list.
8645 * This is useful under low mem conditions as by default we take 1 million
8646 * free objects allocated. On success REDIS_OK is returned, otherwise
8647 * REDIS_ERR. */
8648 static int tryFreeOneObjectFromFreelist(void) {
8649 robj *o;
8650
8651 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
8652 if (listLength(server.objfreelist)) {
8653 listNode *head = listFirst(server.objfreelist);
8654 o = listNodeValue(head);
8655 listDelNode(server.objfreelist,head);
8656 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
8657 zfree(o);
8658 return REDIS_OK;
8659 } else {
8660 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
8661 return REDIS_ERR;
8662 }
8663 }
8664
8665 /* This function gets called when 'maxmemory' is set on the config file to limit
8666 * the max memory used by the server, and we are out of memory.
8667 * This function will try to, in order:
8668 *
8669 * - Free objects from the free list
8670 * - Try to remove keys with an EXPIRE set
8671 *
8672 * It is not possible to free enough memory to reach used-memory < maxmemory
8673 * the server will start refusing commands that will enlarge even more the
8674 * memory usage.
8675 */
8676 static void freeMemoryIfNeeded(void) {
8677 while (server.maxmemory && zmalloc_used_memory() > server.maxmemory) {
8678 int j, k, freed = 0;
8679
8680 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
8681 for (j = 0; j < server.dbnum; j++) {
8682 int minttl = -1;
8683 robj *minkey = NULL;
8684 struct dictEntry *de;
8685
8686 if (dictSize(server.db[j].expires)) {
8687 freed = 1;
8688 /* From a sample of three keys drop the one nearest to
8689 * the natural expire */
8690 for (k = 0; k < 3; k++) {
8691 time_t t;
8692
8693 de = dictGetRandomKey(server.db[j].expires);
8694 t = (time_t) dictGetEntryVal(de);
8695 if (minttl == -1 || t < minttl) {
8696 minkey = dictGetEntryKey(de);
8697 minttl = t;
8698 }
8699 }
8700 dbDelete(server.db+j,minkey);
8701 }
8702 }
8703 if (!freed) return; /* nothing to free... */
8704 }
8705 }
8706
8707 /* ============================== Append Only file ========================== */
8708
8709 /* Called when the user switches from "appendonly yes" to "appendonly no"
8710 * at runtime using the CONFIG command. */
8711 static void stopAppendOnly(void) {
8712 flushAppendOnlyFile();
8713 aof_fsync(server.appendfd);
8714 close(server.appendfd);
8715
8716 server.appendfd = -1;
8717 server.appendseldb = -1;
8718 server.appendonly = 0;
8719 /* rewrite operation in progress? kill it, wait child exit */
8720 if (server.bgsavechildpid != -1) {
8721 int statloc;
8722
8723 if (kill(server.bgsavechildpid,SIGKILL) != -1)
8724 wait3(&statloc,0,NULL);
8725 /* reset the buffer accumulating changes while the child saves */
8726 sdsfree(server.bgrewritebuf);
8727 server.bgrewritebuf = sdsempty();
8728 server.bgsavechildpid = -1;
8729 }
8730 }
8731
8732 /* Called when the user switches from "appendonly no" to "appendonly yes"
8733 * at runtime using the CONFIG command. */
8734 static int startAppendOnly(void) {
8735 server.appendonly = 1;
8736 server.lastfsync = time(NULL);
8737 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
8738 if (server.appendfd == -1) {
8739 redisLog(REDIS_WARNING,"Used tried to switch on AOF via CONFIG, but I can't open the AOF file: %s",strerror(errno));
8740 return REDIS_ERR;
8741 }
8742 if (rewriteAppendOnlyFileBackground() == REDIS_ERR) {
8743 server.appendonly = 0;
8744 close(server.appendfd);
8745 redisLog(REDIS_WARNING,"Used tried to switch on AOF via CONFIG, I can't trigger a background AOF rewrite operation. Check the above logs for more info about the error.",strerror(errno));
8746 return REDIS_ERR;
8747 }
8748 return REDIS_OK;
8749 }
8750
8751 /* Write the append only file buffer on disk.
8752 *
8753 * Since we are required to write the AOF before replying to the client,
8754 * and the only way the client socket can get a write is entering when the
8755 * the event loop, we accumulate all the AOF writes in a memory
8756 * buffer and write it on disk using this function just before entering
8757 * the event loop again. */
8758 static void flushAppendOnlyFile(void) {
8759 time_t now;
8760 ssize_t nwritten;
8761
8762 if (sdslen(server.aofbuf) == 0) return;
8763
8764 /* We want to perform a single write. This should be guaranteed atomic
8765 * at least if the filesystem we are writing is a real physical one.
8766 * While this will save us against the server being killed I don't think
8767 * there is much to do about the whole server stopping for power problems
8768 * or alike */
8769 nwritten = write(server.appendfd,server.aofbuf,sdslen(server.aofbuf));
8770 if (nwritten != (signed)sdslen(server.aofbuf)) {
8771 /* Ooops, we are in troubles. The best thing to do for now is
8772 * aborting instead of giving the illusion that everything is
8773 * working as expected. */
8774 if (nwritten == -1) {
8775 redisLog(REDIS_WARNING,"Exiting on error writing to the append-only file: %s",strerror(errno));
8776 } else {
8777 redisLog(REDIS_WARNING,"Exiting on short write while writing to the append-only file: %s",strerror(errno));
8778 }
8779 exit(1);
8780 }
8781 sdsfree(server.aofbuf);
8782 server.aofbuf = sdsempty();
8783
8784 /* Don't Fsync if no-appendfsync-on-rewrite is set to yes and we have
8785 * childs performing heavy I/O on disk. */
8786 if (server.no_appendfsync_on_rewrite &&
8787 (server.bgrewritechildpid != -1 || server.bgsavechildpid != -1))
8788 return;
8789 /* Fsync if needed */
8790 now = time(NULL);
8791 if (server.appendfsync == APPENDFSYNC_ALWAYS ||
8792 (server.appendfsync == APPENDFSYNC_EVERYSEC &&
8793 now-server.lastfsync > 1))
8794 {
8795 /* aof_fsync is defined as fdatasync() for Linux in order to avoid
8796 * flushing metadata. */
8797 aof_fsync(server.appendfd); /* Let's try to get this data on the disk */
8798 server.lastfsync = now;
8799 }
8800 }
8801
8802 static sds catAppendOnlyGenericCommand(sds buf, int argc, robj **argv) {
8803 int j;
8804 buf = sdscatprintf(buf,"*%d\r\n",argc);
8805 for (j = 0; j < argc; j++) {
8806 robj *o = getDecodedObject(argv[j]);
8807 buf = sdscatprintf(buf,"$%lu\r\n",(unsigned long)sdslen(o->ptr));
8808 buf = sdscatlen(buf,o->ptr,sdslen(o->ptr));
8809 buf = sdscatlen(buf,"\r\n",2);
8810 decrRefCount(o);
8811 }
8812 return buf;
8813 }
8814
8815 static sds catAppendOnlyExpireAtCommand(sds buf, robj *key, robj *seconds) {
8816 int argc = 3;
8817 long when;
8818 robj *argv[3];
8819
8820 /* Make sure we can use strtol */
8821 seconds = getDecodedObject(seconds);
8822 when = time(NULL)+strtol(seconds->ptr,NULL,10);
8823 decrRefCount(seconds);
8824
8825 argv[0] = createStringObject("EXPIREAT",8);
8826 argv[1] = key;
8827 argv[2] = createObject(REDIS_STRING,
8828 sdscatprintf(sdsempty(),"%ld",when));
8829 buf = catAppendOnlyGenericCommand(buf, argc, argv);
8830 decrRefCount(argv[0]);
8831 decrRefCount(argv[2]);
8832 return buf;
8833 }
8834
8835 static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) {
8836 sds buf = sdsempty();
8837 robj *tmpargv[3];
8838
8839 /* The DB this command was targetting is not the same as the last command
8840 * we appendend. To issue a SELECT command is needed. */
8841 if (dictid != server.appendseldb) {
8842 char seldb[64];
8843
8844 snprintf(seldb,sizeof(seldb),"%d",dictid);
8845 buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
8846 (unsigned long)strlen(seldb),seldb);
8847 server.appendseldb = dictid;
8848 }
8849
8850 if (cmd->proc == expireCommand) {
8851 /* Translate EXPIRE into EXPIREAT */
8852 buf = catAppendOnlyExpireAtCommand(buf,argv[1],argv[2]);
8853 } else if (cmd->proc == setexCommand) {
8854 /* Translate SETEX to SET and EXPIREAT */
8855 tmpargv[0] = createStringObject("SET",3);
8856 tmpargv[1] = argv[1];
8857 tmpargv[2] = argv[3];
8858 buf = catAppendOnlyGenericCommand(buf,3,tmpargv);
8859 decrRefCount(tmpargv[0]);
8860 buf = catAppendOnlyExpireAtCommand(buf,argv[1],argv[2]);
8861 } else {
8862 buf = catAppendOnlyGenericCommand(buf,argc,argv);
8863 }
8864
8865 /* Append to the AOF buffer. This will be flushed on disk just before
8866 * of re-entering the event loop, so before the client will get a
8867 * positive reply about the operation performed. */
8868 server.aofbuf = sdscatlen(server.aofbuf,buf,sdslen(buf));
8869
8870 /* If a background append only file rewriting is in progress we want to
8871 * accumulate the differences between the child DB and the current one
8872 * in a buffer, so that when the child process will do its work we
8873 * can append the differences to the new append only file. */
8874 if (server.bgrewritechildpid != -1)
8875 server.bgrewritebuf = sdscatlen(server.bgrewritebuf,buf,sdslen(buf));
8876
8877 sdsfree(buf);
8878 }
8879
8880 /* In Redis commands are always executed in the context of a client, so in
8881 * order to load the append only file we need to create a fake client. */
8882 static struct redisClient *createFakeClient(void) {
8883 struct redisClient *c = zmalloc(sizeof(*c));
8884
8885 selectDb(c,0);
8886 c->fd = -1;
8887 c->querybuf = sdsempty();
8888 c->argc = 0;
8889 c->argv = NULL;
8890 c->flags = 0;
8891 /* We set the fake client as a slave waiting for the synchronization
8892 * so that Redis will not try to send replies to this client. */
8893 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
8894 c->reply = listCreate();
8895 listSetFreeMethod(c->reply,decrRefCount);
8896 listSetDupMethod(c->reply,dupClientReplyValue);
8897 initClientMultiState(c);
8898 return c;
8899 }
8900
8901 static void freeFakeClient(struct redisClient *c) {
8902 sdsfree(c->querybuf);
8903 listRelease(c->reply);
8904 freeClientMultiState(c);
8905 zfree(c);
8906 }
8907
8908 /* Replay the append log file. On error REDIS_OK is returned. On non fatal
8909 * error (the append only file is zero-length) REDIS_ERR is returned. On
8910 * fatal error an error message is logged and the program exists. */
8911 int loadAppendOnlyFile(char *filename) {
8912 struct redisClient *fakeClient;
8913 FILE *fp = fopen(filename,"r");
8914 struct redis_stat sb;
8915 int appendonly = server.appendonly;
8916
8917 if (redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0)
8918 return REDIS_ERR;
8919
8920 if (fp == NULL) {
8921 redisLog(REDIS_WARNING,"Fatal error: can't open the append log file for reading: %s",strerror(errno));
8922 exit(1);
8923 }
8924
8925 /* Temporarily disable AOF, to prevent EXEC from feeding a MULTI
8926 * to the same file we're about to read. */
8927 server.appendonly = 0;
8928
8929 fakeClient = createFakeClient();
8930 while(1) {
8931 int argc, j;
8932 unsigned long len;
8933 robj **argv;
8934 char buf[128];
8935 sds argsds;
8936 struct redisCommand *cmd;
8937 int force_swapout;
8938
8939 if (fgets(buf,sizeof(buf),fp) == NULL) {
8940 if (feof(fp))
8941 break;
8942 else
8943 goto readerr;
8944 }
8945 if (buf[0] != '*') goto fmterr;
8946 argc = atoi(buf+1);
8947 argv = zmalloc(sizeof(robj*)*argc);
8948 for (j = 0; j < argc; j++) {
8949 if (fgets(buf,sizeof(buf),fp) == NULL) goto readerr;
8950 if (buf[0] != '$') goto fmterr;
8951 len = strtol(buf+1,NULL,10);
8952 argsds = sdsnewlen(NULL,len);
8953 if (len && fread(argsds,len,1,fp) == 0) goto fmterr;
8954 argv[j] = createObject(REDIS_STRING,argsds);
8955 if (fread(buf,2,1,fp) == 0) goto fmterr; /* discard CRLF */
8956 }
8957
8958 /* Command lookup */
8959 cmd = lookupCommand(argv[0]->ptr);
8960 if (!cmd) {
8961 redisLog(REDIS_WARNING,"Unknown command '%s' reading the append only file", argv[0]->ptr);
8962 exit(1);
8963 }
8964 /* Try object encoding */
8965 if (cmd->flags & REDIS_CMD_BULK)
8966 argv[argc-1] = tryObjectEncoding(argv[argc-1]);
8967 /* Run the command in the context of a fake client */
8968 fakeClient->argc = argc;
8969 fakeClient->argv = argv;
8970 cmd->proc(fakeClient);
8971 /* Discard the reply objects list from the fake client */
8972 while(listLength(fakeClient->reply))
8973 listDelNode(fakeClient->reply,listFirst(fakeClient->reply));
8974 /* Clean up, ready for the next command */
8975 for (j = 0; j < argc; j++) decrRefCount(argv[j]);
8976 zfree(argv);
8977 /* Handle swapping while loading big datasets when VM is on */
8978 force_swapout = 0;
8979 if ((zmalloc_used_memory() - server.vm_max_memory) > 1024*1024*32)
8980 force_swapout = 1;
8981
8982 if (server.vm_enabled && force_swapout) {
8983 while (zmalloc_used_memory() > server.vm_max_memory) {
8984 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
8985 }
8986 }
8987 }
8988
8989 /* This point can only be reached when EOF is reached without errors.
8990 * If the client is in the middle of a MULTI/EXEC, log error and quit. */
8991 if (fakeClient->flags & REDIS_MULTI) goto readerr;
8992
8993 fclose(fp);
8994 freeFakeClient(fakeClient);
8995 server.appendonly = appendonly;
8996 return REDIS_OK;
8997
8998 readerr:
8999 if (feof(fp)) {
9000 redisLog(REDIS_WARNING,"Unexpected end of file reading the append only file");
9001 } else {
9002 redisLog(REDIS_WARNING,"Unrecoverable error reading the append only file: %s", strerror(errno));
9003 }
9004 exit(1);
9005 fmterr:
9006 redisLog(REDIS_WARNING,"Bad file format reading the append only file");
9007 exit(1);
9008 }
9009
/* Write the 'len' bytes at 's' into 'fp' as a binary-safe bulk string:
 * $<count>\r\n<payload>\r\n
 * Returns 1 on success, 0 if one of the writes failed. */
static int fwriteBulkString(FILE *fp, char *s, unsigned long len) {
    char header[128];
    int hlen;

    /* Build the "$<count>\r\n" prefix. */
    header[0] = '$';
    hlen = 1+ll2string(header+1,sizeof(header)-1,len);
    header[hlen] = '\r';
    header[hlen+1] = '\n';
    hlen += 2;

    if (fwrite(header,hlen,1,fp) == 0) return 0;
    /* An empty payload is not written at all. */
    if (len > 0 && fwrite(s,len,1,fp) == 0) return 0;
    return fwrite("\r\n",2,1,fp) != 0;
}
9024
/* Write the double 'd', formatted with "%.17g", into 'fp' in bulk format:
 * $<count>\r\n<payload>\r\n
 * Returns 1 on success, 0 if one of the writes failed. */
static int fwriteBulkDouble(FILE *fp, double d) {
    char payload[128], header[128];
    int plen, hlen;

    /* The payload already carries its trailing CRLF, which is not part of
     * the announced count. */
    plen = snprintf(payload,sizeof(payload),"%.17g\r\n",d);
    hlen = snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)plen-2);
    if (fwrite(header,hlen,1,fp) == 0) return 0;
    return (fwrite(payload,plen,1,fp) == 0) ? 0 : 1;
}
9035
/* Write the integer 'l', in decimal form, into 'fp' in bulk format:
 * $<count>\r\n<payload>\r\n
 * The whole reply is built in memory and emitted with a single fwrite().
 * Returns 1 on success, 0 if the write failed. */
static int fwriteBulkLongLong(FILE *fp, long long l) {
    char digits[128], out[128];
    unsigned int ndigits, total;

    ndigits = ll2string(digits,32,l);
    total = snprintf(out,sizeof(out),"$%u\r\n%s\r\n",ndigits,digits);
    return (fwrite(out,total,1,fp) == 0) ? 0 : 1;
}
9045
9046 /* Delegate writing an object to writing a bulk string or bulk long long. */
9047 static int fwriteBulkObject(FILE *fp, robj *obj) {
9048 /* Avoid using getDecodedObject to help copy-on-write (we are often
9049 * in a child process when this function is called). */
9050 if (obj->encoding == REDIS_ENCODING_INT) {
9051 return fwriteBulkLongLong(fp,(long)obj->ptr);
9052 } else if (obj->encoding == REDIS_ENCODING_RAW) {
9053 return fwriteBulkString(fp,obj->ptr,sdslen(obj->ptr));
9054 } else {
9055 redisPanic("Unknown string encoding");
9056 }
9057 }
9058
9059 /* Write a sequence of commands able to fully rebuild the dataset into
9060 * "filename". Used both by REWRITEAOF and BGREWRITEAOF. */
9061 static int rewriteAppendOnlyFile(char *filename) {
9062 dictIterator *di = NULL;
9063 dictEntry *de;
9064 FILE *fp;
9065 char tmpfile[256];
9066 int j;
9067 time_t now = time(NULL);
9068
9069 /* Note that we have to use a different temp name here compared to the
9070 * one used by rewriteAppendOnlyFileBackground() function. */
9071 snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());
9072 fp = fopen(tmpfile,"w");
9073 if (!fp) {
9074 redisLog(REDIS_WARNING, "Failed rewriting the append only file: %s", strerror(errno));
9075 return REDIS_ERR;
9076 }
9077 for (j = 0; j < server.dbnum; j++) {
9078 char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n";
9079 redisDb *db = server.db+j;
9080 dict *d = db->dict;
9081 if (dictSize(d) == 0) continue;
9082 di = dictGetIterator(d);
9083 if (!di) {
9084 fclose(fp);
9085 return REDIS_ERR;
9086 }
9087
9088 /* SELECT the new DB */
9089 if (fwrite(selectcmd,sizeof(selectcmd)-1,1,fp) == 0) goto werr;
9090 if (fwriteBulkLongLong(fp,j) == 0) goto werr;
9091
9092 /* Iterate this DB writing every entry */
9093 while((de = dictNext(di)) != NULL) {
9094 sds keystr = dictGetEntryKey(de);
9095 robj key, *o;
9096 time_t expiretime;
9097 int swapped;
9098
9099 keystr = dictGetEntryKey(de);
9100 o = dictGetEntryVal(de);
9101 initStaticStringObject(key,keystr);
9102 /* If the value for this key is swapped, load a preview in memory.
9103 * We use a "swapped" flag to remember if we need to free the
9104 * value object instead to just increment the ref count anyway
9105 * in order to avoid copy-on-write of pages if we are forked() */
9106 if (!server.vm_enabled || o->storage == REDIS_VM_MEMORY ||
9107 o->storage == REDIS_VM_SWAPPING) {
9108 swapped = 0;
9109 } else {
9110 o = vmPreviewObject(o);
9111 swapped = 1;
9112 }
9113 expiretime = getExpire(db,&key);
9114
9115 /* Save the key and associated value */
9116 if (o->type == REDIS_STRING) {
9117 /* Emit a SET command */
9118 char cmd[]="*3\r\n$3\r\nSET\r\n";
9119 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9120 /* Key and value */
9121 if (fwriteBulkObject(fp,&key) == 0) goto werr;
9122 if (fwriteBulkObject(fp,o) == 0) goto werr;
9123 } else if (o->type == REDIS_LIST) {
9124 /* Emit the RPUSHes needed to rebuild the list */
9125 char cmd[]="*3\r\n$5\r\nRPUSH\r\n";
9126 if (o->encoding == REDIS_ENCODING_ZIPLIST) {
9127 unsigned char *zl = o->ptr;
9128 unsigned char *p = ziplistIndex(zl,0);
9129 unsigned char *vstr;
9130 unsigned int vlen;
9131 long long vlong;
9132
9133 while(ziplistGet(p,&vstr,&vlen,&vlong)) {
9134 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9135 if (fwriteBulkObject(fp,&key) == 0) goto werr;
9136 if (vstr) {
9137 if (fwriteBulkString(fp,(char*)vstr,vlen) == 0)
9138 goto werr;
9139 } else {
9140 if (fwriteBulkLongLong(fp,vlong) == 0)
9141 goto werr;
9142 }
9143 p = ziplistNext(zl,p);
9144 }
9145 } else if (o->encoding == REDIS_ENCODING_LIST) {
9146 list *list = o->ptr;
9147 listNode *ln;
9148 listIter li;
9149
9150 listRewind(list,&li);
9151 while((ln = listNext(&li))) {
9152 robj *eleobj = listNodeValue(ln);
9153
9154 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9155 if (fwriteBulkObject(fp,&key) == 0) goto werr;
9156 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
9157 }
9158 } else {
9159 redisPanic("Unknown list encoding");
9160 }
9161 } else if (o->type == REDIS_SET) {
9162 /* Emit the SADDs needed to rebuild the set */
9163 dict *set = o->ptr;
9164 dictIterator *di = dictGetIterator(set);
9165 dictEntry *de;
9166
9167 while((de = dictNext(di)) != NULL) {
9168 char cmd[]="*3\r\n$4\r\nSADD\r\n";
9169 robj *eleobj = dictGetEntryKey(de);
9170
9171 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9172 if (fwriteBulkObject(fp,&key) == 0) goto werr;
9173 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
9174 }
9175 dictReleaseIterator(di);
9176 } else if (o->type == REDIS_ZSET) {
9177 /* Emit the ZADDs needed to rebuild the sorted set */
9178 zset *zs = o->ptr;
9179 dictIterator *di = dictGetIterator(zs->dict);
9180 dictEntry *de;
9181
9182 while((de = dictNext(di)) != NULL) {
9183 char cmd[]="*4\r\n$4\r\nZADD\r\n";
9184 robj *eleobj = dictGetEntryKey(de);
9185 double *score = dictGetEntryVal(de);
9186
9187 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9188 if (fwriteBulkObject(fp,&key) == 0) goto werr;
9189 if (fwriteBulkDouble(fp,*score) == 0) goto werr;
9190 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
9191 }
9192 dictReleaseIterator(di);
9193 } else if (o->type == REDIS_HASH) {
9194 char cmd[]="*4\r\n$4\r\nHSET\r\n";
9195
9196 /* Emit the HSETs needed to rebuild the hash */
9197 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
9198 unsigned char *p = zipmapRewind(o->ptr);
9199 unsigned char *field, *val;
9200 unsigned int flen, vlen;
9201
9202 while((p = zipmapNext(p,&field,&flen,&val,&vlen)) != NULL) {
9203 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9204 if (fwriteBulkObject(fp,&key) == 0) goto werr;
9205 if (fwriteBulkString(fp,(char*)field,flen) == -1)
9206 return -1;
9207 if (fwriteBulkString(fp,(char*)val,vlen) == -1)
9208 return -1;
9209 }
9210 } else {
9211 dictIterator *di = dictGetIterator(o->ptr);
9212 dictEntry *de;
9213
9214 while((de = dictNext(di)) != NULL) {
9215 robj *field = dictGetEntryKey(de);
9216 robj *val = dictGetEntryVal(de);
9217
9218 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9219 if (fwriteBulkObject(fp,&key) == 0) goto werr;
9220 if (fwriteBulkObject(fp,field) == -1) return -1;
9221 if (fwriteBulkObject(fp,val) == -1) return -1;
9222 }
9223 dictReleaseIterator(di);
9224 }
9225 } else {
9226 redisPanic("Unknown object type");
9227 }
9228 /* Save the expire time */
9229 if (expiretime != -1) {
9230 char cmd[]="*3\r\n$8\r\nEXPIREAT\r\n";
9231 /* If this key is already expired skip it */
9232 if (expiretime < now) continue;
9233 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9234 if (fwriteBulkObject(fp,&key) == 0) goto werr;
9235 if (fwriteBulkLongLong(fp,expiretime) == 0) goto werr;
9236 }
9237 if (swapped) decrRefCount(o);
9238 }
9239 dictReleaseIterator(di);
9240 }
9241
9242 /* Make sure data will not remain on the OS's output buffers */
9243 fflush(fp);
9244 aof_fsync(fileno(fp));
9245 fclose(fp);
9246
9247 /* Use RENAME to make sure the DB file is changed atomically only
9248 * if the generate DB file is ok. */
9249 if (rename(tmpfile,filename) == -1) {
9250 redisLog(REDIS_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno));
9251 unlink(tmpfile);
9252 return REDIS_ERR;
9253 }
9254 redisLog(REDIS_NOTICE,"SYNC append only file rewrite performed");
9255 return REDIS_OK;
9256
9257 werr:
9258 fclose(fp);
9259 unlink(tmpfile);
9260 redisLog(REDIS_WARNING,"Write error writing append only file on disk: %s", strerror(errno));
9261 if (di) dictReleaseIterator(di);
9262 return REDIS_ERR;
9263 }
9264
9265 /* This is how rewriting of the append only file in background works:
9266 *
9267 * 1) The user calls BGREWRITEAOF
9268 * 2) Redis calls this function, that forks():
9269 * 2a) the child rewrite the append only file in a temp file.
9270 * 2b) the parent accumulates differences in server.bgrewritebuf.
9271 * 3) When the child finished '2a' exists.
9272 * 4) The parent will trap the exit code, if it's OK, will append the
9273 * data accumulated into server.bgrewritebuf into the temp file, and
9274 * finally will rename(2) the temp file in the actual file name.
9275 * The the new file is reopened as the new append only file. Profit!
9276 */
9277 static int rewriteAppendOnlyFileBackground(void) {
9278 pid_t childpid;
9279
9280 if (server.bgrewritechildpid != -1) return REDIS_ERR;
9281 if (server.vm_enabled) waitEmptyIOJobsQueue();
9282 if ((childpid = fork()) == 0) {
9283 /* Child */
9284 char tmpfile[256];
9285
9286 if (server.vm_enabled) vmReopenSwapFile();
9287 close(server.fd);
9288 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
9289 if (rewriteAppendOnlyFile(tmpfile) == REDIS_OK) {
9290 _exit(0);
9291 } else {
9292 _exit(1);
9293 }
9294 } else {
9295 /* Parent */
9296 if (childpid == -1) {
9297 redisLog(REDIS_WARNING,
9298 "Can't rewrite append only file in background: fork: %s",
9299 strerror(errno));
9300 return REDIS_ERR;
9301 }
9302 redisLog(REDIS_NOTICE,
9303 "Background append only file rewriting started by pid %d",childpid);
9304 server.bgrewritechildpid = childpid;
9305 updateDictResizePolicy();
9306 /* We set appendseldb to -1 in order to force the next call to the
9307 * feedAppendOnlyFile() to issue a SELECT command, so the differences
9308 * accumulated by the parent into server.bgrewritebuf will start
9309 * with a SELECT statement and it will be safe to merge. */
9310 server.appendseldb = -1;
9311 return REDIS_OK;
9312 }
9313 return REDIS_OK; /* unreached */
9314 }
9315
9316 static void bgrewriteaofCommand(redisClient *c) {
9317 if (server.bgrewritechildpid != -1) {
9318 addReplySds(c,sdsnew("-ERR background append only file rewriting already in progress\r\n"));
9319 return;
9320 }
9321 if (rewriteAppendOnlyFileBackground() == REDIS_OK) {
9322 char *status = "+Background append only file rewriting started\r\n";
9323 addReplySds(c,sdsnew(status));
9324 } else {
9325 addReply(c,shared.err);
9326 }
9327 }
9328
/* Remove the temp file "temp-rewriteaof-bg-<pid>.aof" (the name used by the
 * background AOF rewrite child) for the given child pid. The result of
 * unlink(2) is not checked. */
static void aofRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-rewriteaof-bg-%d.aof",(int)childpid);
    unlink(path);
}
9335
9336 /* Virtual Memory is composed mainly of two subsystems:
9337 * - Blocking Virtual Memory
9338 * - Threaded Virtual Memory I/O
9339 * The two parts are not fully decoupled, but functions are split among two
9340 * different sections of the source code (delimited by comments) in order to
9341 * make more clear what functionality is about the blocking VM and what about
9342 * the threaded (not blocking) VM.
9343 *
9344 * Redis VM design:
9345 *
9346 * Redis VM is a blocking VM (one that blocks reading swapped values from
9347 * disk into memory when a value swapped out is needed in memory) that is made
9348 * unblocking by trying to examine the command argument vector in order to
9349 * load in background values that will likely be needed in order to exec
9350 * the command. The command is executed only once all the relevant keys
9351 * are loaded into memory.
9352 *
9353 * This is basically almost as simple as a blocking VM, but almost as parallel
9354 * as a fully non-blocking VM.
9355 */
9356
9357 /* =================== Virtual Memory - Blocking Side ====================== */
9358
9359 /* Create a VM pointer object. This kind of objects are used in place of
9360 * values in the key -> value hash table, for swapped out objects. */
9361 static vmpointer *createVmPointer(int vtype) {
9362 vmpointer *vp = zmalloc(sizeof(vmpointer));
9363
9364 vp->type = REDIS_VMPOINTER;
9365 vp->storage = REDIS_VM_SWAPPED;
9366 vp->vtype = vtype;
9367 return vp;
9368 }
9369
9370 static void vmInit(void) {
9371 off_t totsize;
9372 int pipefds[2];
9373 size_t stacksize;
9374 struct flock fl;
9375
9376 if (server.vm_max_threads != 0)
9377 zmalloc_enable_thread_safeness(); /* we need thread safe zmalloc() */
9378
9379 redisLog(REDIS_NOTICE,"Using '%s' as swap file",server.vm_swap_file);
9380 /* Try to open the old swap file, otherwise create it */
9381 if ((server.vm_fp = fopen(server.vm_swap_file,"r+b")) == NULL) {
9382 server.vm_fp = fopen(server.vm_swap_file,"w+b");
9383 }
9384 if (server.vm_fp == NULL) {
9385 redisLog(REDIS_WARNING,
9386 "Can't open the swap file: %s. Exiting.",
9387 strerror(errno));
9388 exit(1);
9389 }
9390 server.vm_fd = fileno(server.vm_fp);
9391 /* Lock the swap file for writing, this is useful in order to avoid
9392 * another instance to use the same swap file for a config error. */
9393 fl.l_type = F_WRLCK;
9394 fl.l_whence = SEEK_SET;
9395 fl.l_start = fl.l_len = 0;
9396 if (fcntl(server.vm_fd,F_SETLK,&fl) == -1) {
9397 redisLog(REDIS_WARNING,
9398 "Can't lock the swap file at '%s': %s. Make sure it is not used by another Redis instance.", server.vm_swap_file, strerror(errno));
9399 exit(1);
9400 }
9401 /* Initialize */
9402 server.vm_next_page = 0;
9403 server.vm_near_pages = 0;
9404 server.vm_stats_used_pages = 0;
9405 server.vm_stats_swapped_objects = 0;
9406 server.vm_stats_swapouts = 0;
9407 server.vm_stats_swapins = 0;
9408 totsize = server.vm_pages*server.vm_page_size;
9409 redisLog(REDIS_NOTICE,"Allocating %lld bytes of swap file",totsize);
9410 if (ftruncate(server.vm_fd,totsize) == -1) {
9411 redisLog(REDIS_WARNING,"Can't ftruncate swap file: %s. Exiting.",
9412 strerror(errno));
9413 exit(1);
9414 } else {
9415 redisLog(REDIS_NOTICE,"Swap file allocated with success");
9416 }
9417 server.vm_bitmap = zmalloc((server.vm_pages+7)/8);
9418 redisLog(REDIS_VERBOSE,"Allocated %lld bytes page table for %lld pages",
9419 (long long) (server.vm_pages+7)/8, server.vm_pages);
9420 memset(server.vm_bitmap,0,(server.vm_pages+7)/8);
9421
9422 /* Initialize threaded I/O (used by Virtual Memory) */
9423 server.io_newjobs = listCreate();
9424 server.io_processing = listCreate();
9425 server.io_processed = listCreate();
9426 server.io_ready_clients = listCreate();
9427 pthread_mutex_init(&server.io_mutex,NULL);
9428 pthread_mutex_init(&server.obj_freelist_mutex,NULL);
9429 pthread_mutex_init(&server.io_swapfile_mutex,NULL);
9430 server.io_active_threads = 0;
9431 if (pipe(pipefds) == -1) {
9432 redisLog(REDIS_WARNING,"Unable to intialized VM: pipe(2): %s. Exiting."
9433 ,strerror(errno));
9434 exit(1);
9435 }
9436 server.io_ready_pipe_read = pipefds[0];
9437 server.io_ready_pipe_write = pipefds[1];
9438 redisAssert(anetNonBlock(NULL,server.io_ready_pipe_read) != ANET_ERR);
9439 /* LZF requires a lot of stack */
9440 pthread_attr_init(&server.io_threads_attr);
9441 pthread_attr_getstacksize(&server.io_threads_attr, &stacksize);
9442 while (stacksize < REDIS_THREAD_STACK_SIZE) stacksize *= 2;
9443 pthread_attr_setstacksize(&server.io_threads_attr, stacksize);
9444 /* Listen for events in the threaded I/O pipe */
9445 if (aeCreateFileEvent(server.el, server.io_ready_pipe_read, AE_READABLE,
9446 vmThreadedIOCompletedJob, NULL) == AE_ERR)
9447 oom("creating file event");
9448 }
9449
9450 /* Mark the page as used */
9451 static void vmMarkPageUsed(off_t page) {
9452 off_t byte = page/8;
9453 int bit = page&7;
9454 redisAssert(vmFreePage(page) == 1);
9455 server.vm_bitmap[byte] |= 1<<bit;
9456 }
9457
9458 /* Mark N contiguous pages as used, with 'page' being the first. */
9459 static void vmMarkPagesUsed(off_t page, off_t count) {
9460 off_t j;
9461
9462 for (j = 0; j < count; j++)
9463 vmMarkPageUsed(page+j);
9464 server.vm_stats_used_pages += count;
9465 redisLog(REDIS_DEBUG,"Mark USED pages: %lld pages at %lld\n",
9466 (long long)count, (long long)page);
9467 }
9468
9469 /* Mark the page as free */
9470 static void vmMarkPageFree(off_t page) {
9471 off_t byte = page/8;
9472 int bit = page&7;
9473 redisAssert(vmFreePage(page) == 0);
9474 server.vm_bitmap[byte] &= ~(1<<bit);
9475 }
9476
9477 /* Mark N contiguous pages as free, with 'page' being the first. */
9478 static void vmMarkPagesFree(off_t page, off_t count) {
9479 off_t j;
9480
9481 for (j = 0; j < count; j++)
9482 vmMarkPageFree(page+j);
9483 server.vm_stats_used_pages -= count;
9484 redisLog(REDIS_DEBUG,"Mark FREE pages: %lld pages at %lld\n",
9485 (long long)count, (long long)page);
9486 }
9487
9488 /* Test if the page is free */
9489 static int vmFreePage(off_t page) {
9490 off_t byte = page/8;
9491 int bit = page&7;
9492 return (server.vm_bitmap[byte] & (1<<bit)) == 0;
9493 }
9494
9495 /* Find N contiguous free pages storing the first page of the cluster in *first.
9496 * Returns REDIS_OK if it was able to find N contiguous pages, otherwise
9497 * REDIS_ERR is returned.
9498 *
9499 * This function uses a simple algorithm: we try to allocate
9500 * REDIS_VM_MAX_NEAR_PAGES sequentially, when we reach this limit we start
9501 * again from the start of the swap file searching for free spaces.
9502 *
9503 * If it looks pretty clear that there are no free pages near our offset
9504 * we try to find less populated places doing a forward jump of
9505 * REDIS_VM_MAX_RANDOM_JUMP, then we start scanning again a few pages
9506 * without hurry, and then we jump again and so forth...
9507 *
9508 * This function can be improved using a free list to avoid to guess
9509 * too much, since we could collect data about freed pages.
9510 *
9511 * note: I implemented this function just after watching an episode of
9512 * Battlestar Galactica, where the hybrid was continuing to say "JUMP!"
9513 */
9514 static int vmFindContiguousPages(off_t *first, off_t n) {
9515 off_t base, offset = 0, since_jump = 0, numfree = 0;
9516
9517 if (server.vm_near_pages == REDIS_VM_MAX_NEAR_PAGES) {
9518 server.vm_near_pages = 0;
9519 server.vm_next_page = 0;
9520 }
9521 server.vm_near_pages++; /* Yet another try for pages near to the old ones */
9522 base = server.vm_next_page;
9523
9524 while(offset < server.vm_pages) {
9525 off_t this = base+offset;
9526
9527 /* If we overflow, restart from page zero */
9528 if (this >= server.vm_pages) {
9529 this -= server.vm_pages;
9530 if (this == 0) {
9531 /* Just overflowed, what we found on tail is no longer
9532 * interesting, as it's no longer contiguous. */
9533 numfree = 0;
9534 }
9535 }
9536 if (vmFreePage(this)) {
9537 /* This is a free page */
9538 numfree++;
9539 /* Already got N free pages? Return to the caller, with success */
9540 if (numfree == n) {
9541 *first = this-(n-1);
9542 server.vm_next_page = this+1;
9543 redisLog(REDIS_DEBUG, "FOUND CONTIGUOUS PAGES: %lld pages at %lld\n", (long long) n, (long long) *first);
9544 return REDIS_OK;
9545 }
9546 } else {
9547 /* The current one is not a free page */
9548 numfree = 0;
9549 }
9550
9551 /* Fast-forward if the current page is not free and we already
9552 * searched enough near this place. */
9553 since_jump++;
9554 if (!numfree && since_jump >= REDIS_VM_MAX_RANDOM_JUMP/4) {
9555 offset += random() % REDIS_VM_MAX_RANDOM_JUMP;
9556 since_jump = 0;
9557 /* Note that even if we rewind after the jump, we are don't need
9558 * to make sure numfree is set to zero as we only jump *if* it
9559 * is set to zero. */
9560 } else {
9561 /* Otherwise just check the next page */
9562 offset++;
9563 }
9564 }
9565 return REDIS_ERR;
9566 }
9567
9568 /* Write the specified object at the specified page of the swap file */
9569 static int vmWriteObjectOnSwap(robj *o, off_t page) {
9570 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
9571 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
9572 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
9573 redisLog(REDIS_WARNING,
9574 "Critical VM problem in vmWriteObjectOnSwap(): can't seek: %s",
9575 strerror(errno));
9576 return REDIS_ERR;
9577 }
9578 rdbSaveObject(server.vm_fp,o);
9579 fflush(server.vm_fp);
9580 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
9581 return REDIS_OK;
9582 }
9583
9584 /* Transfers the 'val' object to disk. Store all the information
9585 * a 'vmpointer' object containing all the information needed to load the
9586 * object back later is returned.
9587 *
9588 * If we can't find enough contiguous empty pages to swap the object on disk
9589 * NULL is returned. */
9590 static vmpointer *vmSwapObjectBlocking(robj *val) {
9591 off_t pages = rdbSavedObjectPages(val,NULL);
9592 off_t page;
9593 vmpointer *vp;
9594
9595 assert(val->storage == REDIS_VM_MEMORY);
9596 assert(val->refcount == 1);
9597 if (vmFindContiguousPages(&page,pages) == REDIS_ERR) return NULL;
9598 if (vmWriteObjectOnSwap(val,page) == REDIS_ERR) return NULL;
9599
9600 vp = createVmPointer(val->type);
9601 vp->page = page;
9602 vp->usedpages = pages;
9603 decrRefCount(val); /* Deallocate the object from memory. */
9604 vmMarkPagesUsed(page,pages);
9605 redisLog(REDIS_DEBUG,"VM: object %p swapped out at %lld (%lld pages)",
9606 (void*) val,
9607 (unsigned long long) page, (unsigned long long) pages);
9608 server.vm_stats_swapped_objects++;
9609 server.vm_stats_swapouts++;
9610 return vp;
9611 }
9612
9613 static robj *vmReadObjectFromSwap(off_t page, int type) {
9614 robj *o;
9615
9616 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
9617 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
9618 redisLog(REDIS_WARNING,
9619 "Unrecoverable VM problem in vmReadObjectFromSwap(): can't seek: %s",
9620 strerror(errno));
9621 _exit(1);
9622 }
9623 o = rdbLoadObject(type,server.vm_fp);
9624 if (o == NULL) {
9625 redisLog(REDIS_WARNING, "Unrecoverable VM problem in vmReadObjectFromSwap(): can't load object from swap file: %s", strerror(errno));
9626 _exit(1);
9627 }
9628 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
9629 return o;
9630 }
9631
9632 /* Load the specified object from swap to memory.
9633 * The newly allocated object is returned.
9634 *
9635 * If preview is true the unserialized object is returned to the caller but
9636 * the pages are not marked as freed, nor the vp object is freed. */
9637 static robj *vmGenericLoadObject(vmpointer *vp, int preview) {
9638 robj *val;
9639
9640 redisAssert(vp->type == REDIS_VMPOINTER &&
9641 (vp->storage == REDIS_VM_SWAPPED || vp->storage == REDIS_VM_LOADING));
9642 val = vmReadObjectFromSwap(vp->page,vp->vtype);
9643 if (!preview) {
9644 redisLog(REDIS_DEBUG, "VM: object %p loaded from disk", (void*)vp);
9645 vmMarkPagesFree(vp->page,vp->usedpages);
9646 zfree(vp);
9647 server.vm_stats_swapped_objects--;
9648 } else {
9649 redisLog(REDIS_DEBUG, "VM: object %p previewed from disk", (void*)vp);
9650 }
9651 server.vm_stats_swapins++;
9652 return val;
9653 }
9654
9655 /* Plain object loading, from swap to memory.
9656 *
9657 * 'o' is actually a redisVmPointer structure that will be freed by the call.
9658 * The return value is the loaded object. */
9659 static robj *vmLoadObject(robj *o) {
9660 /* If we are loading the object in background, stop it, we
9661 * need to load this object synchronously ASAP. */
9662 if (o->storage == REDIS_VM_LOADING)
9663 vmCancelThreadedIOJob(o);
9664 return vmGenericLoadObject((vmpointer*)o,0);
9665 }
9666
9667 /* Just load the value on disk, without to modify the key.
9668 * This is useful when we want to perform some operation on the value
9669 * without to really bring it from swap to memory, like while saving the
9670 * dataset or rewriting the append only log. */
9671 static robj *vmPreviewObject(robj *o) {
9672 return vmGenericLoadObject((vmpointer*)o,1);
9673 }
9674
9675 /* How a good candidate is this object for swapping?
9676 * The better candidate it is, the greater the returned value.
9677 *
9678 * Currently we try to perform a fast estimation of the object size in
9679 * memory, and combine it with aging informations.
9680 *
9681 * Basically swappability = idle-time * log(estimated size)
9682 *
9683 * Bigger objects are preferred over smaller objects, but not
9684 * proportionally, this is why we use the logarithm. This algorithm is
9685 * just a first try and will probably be tuned later. */
9686 static double computeObjectSwappability(robj *o) {
9687 /* actual age can be >= minage, but not < minage. As we use wrapping
9688 * 21 bit clocks with minutes resolution for the LRU. */
9689 time_t minage = abs(server.lruclock - o->lru);
9690 long asize = 0, elesize;
9691 robj *ele;
9692 list *l;
9693 listNode *ln;
9694 dict *d;
9695 struct dictEntry *de;
9696 int z;
9697
9698 if (minage <= 0) return 0;
9699 switch(o->type) {
9700 case REDIS_STRING:
9701 if (o->encoding != REDIS_ENCODING_RAW) {
9702 asize = sizeof(*o);
9703 } else {
9704 asize = sdslen(o->ptr)+sizeof(*o)+sizeof(long)*2;
9705 }
9706 break;
9707 case REDIS_LIST:
9708 if (o->encoding == REDIS_ENCODING_ZIPLIST) {
9709 asize = sizeof(*o)+ziplistSize(o->ptr);
9710 } else {
9711 l = o->ptr;
9712 ln = listFirst(l);
9713 asize = sizeof(list);
9714 if (ln) {
9715 ele = ln->value;
9716 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9717 (sizeof(*o)+sdslen(ele->ptr)) : sizeof(*o);
9718 asize += (sizeof(listNode)+elesize)*listLength(l);
9719 }
9720 }
9721 break;
9722 case REDIS_SET:
9723 case REDIS_ZSET:
9724 z = (o->type == REDIS_ZSET);
9725 d = z ? ((zset*)o->ptr)->dict : o->ptr;
9726
9727 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
9728 if (z) asize += sizeof(zset)-sizeof(dict);
9729 if (dictSize(d)) {
9730 de = dictGetRandomKey(d);
9731 ele = dictGetEntryKey(de);
9732 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9733 (sizeof(*o)+sdslen(ele->ptr)) : sizeof(*o);
9734 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
9735 if (z) asize += sizeof(zskiplistNode)*dictSize(d);
9736 }
9737 break;
9738 case REDIS_HASH:
9739 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
9740 unsigned char *p = zipmapRewind((unsigned char*)o->ptr);
9741 unsigned int len = zipmapLen((unsigned char*)o->ptr);
9742 unsigned int klen, vlen;
9743 unsigned char *key, *val;
9744
9745 if ((p = zipmapNext(p,&key,&klen,&val,&vlen)) == NULL) {
9746 klen = 0;
9747 vlen = 0;
9748 }
9749 asize = len*(klen+vlen+3);
9750 } else if (o->encoding == REDIS_ENCODING_HT) {
9751 d = o->ptr;
9752 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
9753 if (dictSize(d)) {
9754 de = dictGetRandomKey(d);
9755 ele = dictGetEntryKey(de);
9756 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9757 (sizeof(*o)+sdslen(ele->ptr)) : sizeof(*o);
9758 ele = dictGetEntryVal(de);
9759 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9760 (sizeof(*o)+sdslen(ele->ptr)) : sizeof(*o);
9761 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
9762 }
9763 }
9764 break;
9765 }
9766 return (double)minage*log(1+asize);
9767 }
9768
9769 /* Try to swap an object that's a good candidate for swapping.
9770 * Returns REDIS_OK if the object was swapped, REDIS_ERR if it's not possible
9771 * to swap any object at all.
9772 *
9773 * If 'usethreaded' is true, Redis will try to swap the object in background
9774 * using I/O threads. */
9775 static int vmSwapOneObject(int usethreads) {
9776 int j, i;
9777 struct dictEntry *best = NULL;
9778 double best_swappability = 0;
9779 redisDb *best_db = NULL;
9780 robj *val;
9781 sds key;
9782
9783 for (j = 0; j < server.dbnum; j++) {
9784 redisDb *db = server.db+j;
9785 /* Why maxtries is set to 100?
9786 * Because this way (usually) we'll find 1 object even if just 1% - 2%
9787 * are swappable objects */
9788 int maxtries = 100;
9789
9790 if (dictSize(db->dict) == 0) continue;
9791 for (i = 0; i < 5; i++) {
9792 dictEntry *de;
9793 double swappability;
9794
9795 if (maxtries) maxtries--;
9796 de = dictGetRandomKey(db->dict);
9797 val = dictGetEntryVal(de);
9798 /* Only swap objects that are currently in memory.
9799 *
9800 * Also don't swap shared objects: not a good idea in general and
9801 * we need to ensure that the main thread does not touch the
9802 * object while the I/O thread is using it, but we can't
9803 * control other keys without adding additional mutex. */
9804 if (val->storage != REDIS_VM_MEMORY || val->refcount != 1) {
9805 if (maxtries) i--; /* don't count this try */
9806 continue;
9807 }
9808 swappability = computeObjectSwappability(val);
9809 if (!best || swappability > best_swappability) {
9810 best = de;
9811 best_swappability = swappability;
9812 best_db = db;
9813 }
9814 }
9815 }
9816 if (best == NULL) return REDIS_ERR;
9817 key = dictGetEntryKey(best);
9818 val = dictGetEntryVal(best);
9819
9820 redisLog(REDIS_DEBUG,"Key with best swappability: %s, %f",
9821 key, best_swappability);
9822
9823 /* Swap it */
9824 if (usethreads) {
9825 robj *keyobj = createStringObject(key,sdslen(key));
9826 vmSwapObjectThreaded(keyobj,val,best_db);
9827 decrRefCount(keyobj);
9828 return REDIS_OK;
9829 } else {
9830 vmpointer *vp;
9831
9832 if ((vp = vmSwapObjectBlocking(val)) != NULL) {
9833 dictGetEntryVal(best) = vp;
9834 return REDIS_OK;
9835 } else {
9836 return REDIS_ERR;
9837 }
9838 }
9839 }
9840
/* Swap out one object synchronously. Returns what vmSwapOneObject()
 * returns when called without threads. */
static int vmSwapOneObjectBlocking(void) {
    return vmSwapOneObject(0);
}
9844
/* Queue one object to be swapped out by the I/O threads. Returns what
 * vmSwapOneObject() returns when called with threads enabled. */
static int vmSwapOneObjectThreaded(void) {
    return vmSwapOneObject(1);
}
9848
9849 /* Return true if it's safe to swap out objects in a given moment.
9850 * Basically we don't want to swap objects out while there is a BGSAVE
9851 * or a BGAEOREWRITE running in backgroud. */
9852 static int vmCanSwapOut(void) {
9853 return (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1);
9854 }
9855
9856 /* =================== Virtual Memory - Threaded I/O ======================= */
9857
9858 static void freeIOJob(iojob *j) {
9859 if ((j->type == REDIS_IOJOB_PREPARE_SWAP ||
9860 j->type == REDIS_IOJOB_DO_SWAP ||
9861 j->type == REDIS_IOJOB_LOAD) && j->val != NULL)
9862 {
9863 /* we fix the storage type, otherwise decrRefCount() will try to
9864 * kill the I/O thread Job (that does no longer exists). */
9865 if (j->val->storage == REDIS_VM_SWAPPING)
9866 j->val->storage = REDIS_VM_MEMORY;
9867 decrRefCount(j->val);
9868 }
9869 decrRefCount(j->key);
9870 zfree(j);
9871 }
9872
/* Every time a thread finishes a job, it writes a byte into the write side
 * of a unix pipe in order to "awake" the main thread, and this function
 * is called (it is registered as the handler of the pipe read side).
 *
 * For every byte read from 'fd' the oldest job in server.io_processed is
 * removed from the queue and post-processed according to its type:
 *
 *  - REDIS_IOJOB_LOAD: the loaded value replaces the vmpointer in the
 *    keyspace, the swap pages are freed and the clients waiting for the
 *    key are handled.
 *  - REDIS_IOJOB_PREPARE_SWAP: space is searched in the swap file and the
 *    job is queued again as REDIS_IOJOB_DO_SWAP, or the swap is undone.
 *  - REDIS_IOJOB_DO_SWAP: a vmpointer replaces the value in the keyspace
 *    and, if still needed, more swap jobs are queued.
 *
 * Canceled jobs are just freed. At most a percentage of the jobs found in
 * the processed queue at the first iteration (at least one job) is handled
 * by a single call; canceled jobs do not count against this limit.
 *
 * 'el', 'privdata' and 'mask' are not used. */
static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata,
            int mask)
{
    char buf[1];
    int retval, processed = 0, toprocess = -1, trytoswap = 1;
    REDIS_NOTUSED(el);
    REDIS_NOTUSED(mask);
    REDIS_NOTUSED(privdata);

    /* For every byte we read in the read side of the pipe, there is one
     * I/O job completed to process. */
    while((retval = read(fd,buf,1)) == 1) {
        iojob *j;
        listNode *ln;
        struct dictEntry *de;

        redisLog(REDIS_DEBUG,"Processing I/O completed job");

        /* Get the processed element (the oldest one) */
        lockThreadedIO();
        assert(listLength(server.io_processed) != 0);
        /* The amount of jobs to process is computed only once per call,
         * as a percentage of the current length of the processed queue. */
        if (toprocess == -1) {
            toprocess = (listLength(server.io_processed)*REDIS_MAX_COMPLETED_JOBS_PROCESSED)/100;
            if (toprocess <= 0) toprocess = 1;
        }
        ln = listFirst(server.io_processed);
        j = ln->value;
        listDelNode(server.io_processed,ln);
        unlockThreadedIO();
        /* If this job is marked as canceled, just ignore it */
        if (j->canceled) {
            freeIOJob(j);
            continue;
        }
        /* Post process it in the main thread, as there are things we
         * can do just here to avoid race conditions and/or invasive locks */
        redisLog(REDIS_DEBUG,"COMPLETED Job type: %d, ID %p, key: %s", j->type, (void*)j->id, (unsigned char*)j->key->ptr);
        de = dictFind(j->db->dict,j->key->ptr);
        redisAssert(de != NULL);
        if (j->type == REDIS_IOJOB_LOAD) {
            redisDb *db;
            vmpointer *vp = dictGetEntryVal(de);

            /* Key loaded, bring it at home */
            vmMarkPagesFree(vp->page,vp->usedpages);
            redisLog(REDIS_DEBUG, "VM: object %s loaded from disk (threaded)",
                (unsigned char*) j->key->ptr);
            server.vm_stats_swapped_objects--;
            server.vm_stats_swapins++;
            dictGetEntryVal(de) = j->val;
            /* The keyspace takes its own reference to the value: the one
             * held by the job is released by freeIOJob() below. */
            incrRefCount(j->val);
            db = j->db;
            /* Handle clients waiting for this key to be loaded. */
            handleClientsBlockedOnSwappedKey(db,j->key);
            freeIOJob(j);
            zfree(vp);
        } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
            /* Now we know the amount of pages required to swap this object.
             * Let's find some space for it, and queue this task again
             * rebranded as REDIS_IOJOB_DO_SWAP. */
            if (!vmCanSwapOut() ||
                vmFindContiguousPages(&j->page,j->pages) == REDIS_ERR)
            {
                /* Ooops... no space or we can't swap as there is
                 * a fork()ed Redis trying to save stuff on disk. */
                j->val->storage = REDIS_VM_MEMORY; /* undo operation */
                freeIOJob(j);
            } else {
                /* Note that we need to mark this pages as used now,
                 * if the job will be canceled, we'll mark them as freed
                 * again. */
                vmMarkPagesUsed(j->page,j->pages);
                j->type = REDIS_IOJOB_DO_SWAP;
                lockThreadedIO();
                queueIOJob(j);
                unlockThreadedIO();
            }
        } else if (j->type == REDIS_IOJOB_DO_SWAP) {
            vmpointer *vp;

            /* Key swapped. We can finally free some memory. */
            /* Debugging aid: dump the job state on stdout just before the
             * assertion below fails.
             * NOTE(review): the inner 'vp' shadows the outer one, and
             * j->id is printed through a vmpointer cast and j->val->ptr as
             * a C string regardless of the value type -- confirm this is
             * intended for every kind of value. */
            if (j->val->storage != REDIS_VM_SWAPPING) {
                vmpointer *vp = (vmpointer*) j->id;
                printf("storage: %d\n",vp->storage);
                printf("key->name: %s\n",(char*)j->key->ptr);
                printf("val: %p\n",(void*)j->val);
                printf("val->type: %d\n",j->val->type);
                printf("val->ptr: %s\n",(char*)j->val->ptr);
            }
            redisAssert(j->val->storage == REDIS_VM_SWAPPING);
            vp = createVmPointer(j->val->type);
            vp->page = j->page;
            vp->usedpages = j->pages;
            dictGetEntryVal(de) = vp;
            /* Fix the storage otherwise decrRefCount will attempt to
             * remove the associated I/O job */
            j->val->storage = REDIS_VM_MEMORY;
            decrRefCount(j->val);
            redisLog(REDIS_DEBUG,
                "VM: object %s swapped out at %lld (%lld pages) (threaded)",
                (unsigned char*) j->key->ptr,
                (unsigned long long) j->page, (unsigned long long) j->pages);
            server.vm_stats_swapped_objects++;
            server.vm_stats_swapouts++;
            freeIOJob(j);
            /* Put a few more swap requests in queue if we are still
             * out of memory */
            if (trytoswap && vmCanSwapOut() &&
                zmalloc_used_memory() > server.vm_max_memory)
            {
                int more = 1;
                while(more) {
                    /* Keep queueing while there are less new jobs than
                     * the configured max number of threads. */
                    lockThreadedIO();
                    more = listLength(server.io_newjobs) <
                            (unsigned) server.vm_max_threads;
                    unlockThreadedIO();
                    /* Don't waste CPU time if swappable objects are rare. */
                    if (vmSwapOneObjectThreaded() == REDIS_ERR) {
                        trytoswap = 0;
                        break;
                    }
                }
            }
        }
        processed++;
        if (processed == toprocess) return;
    }
    /* The pipe read side is non blocking (see vmInit): EAGAIN just means
     * there are no more completed jobs to process. */
    if (retval < 0 && errno != EAGAIN) {
        redisLog(REDIS_WARNING,
            "WARNING: read(2) error in vmThreadedIOCompletedJob() %s",
            strerror(errno));
    }
}
10009
/* Acquire server.io_mutex. Callers in this file take it around every
 * access to the I/O job queues. */
static void lockThreadedIO(void) {
    pthread_mutex_lock(&server.io_mutex);
}
10013
/* Release server.io_mutex, previously acquired with lockThreadedIO(). */
static void unlockThreadedIO(void) {
    pthread_mutex_unlock(&server.io_mutex);
}
10017
/* Cancel the I/O job associated with the object 'o' (jobs are matched
 * comparing job->id with 'o').
 *
 * 'o' must be either in REDIS_VM_LOADING or REDIS_VM_SWAPPING state. The
 * three job queues are searched in order, and what happens depends on the
 * queue the job is found in:
 *
 *  - io_newjobs: the job is freed and removed from the queue.
 *  - io_processing: an I/O thread is working on it, so we release the
 *    lock, sleep a bit and retry from scratch until the job moves on.
 *  - io_processed: the job is flagged as canceled, and will be discarded
 *    when completed jobs are processed.
 *
 * Finally the storage type of 'o' is reverted (LOADING -> SWAPPED,
 * SWAPPING -> MEMORY). Not finding a matching job is a fatal error. */
static void vmCancelThreadedIOJob(robj *o) {
    list *lists[3] = {
        server.io_newjobs,      /* 0 */
        server.io_processing,   /* 1 */
        server.io_processed     /* 2 */
    };
    int i;

    assert(o->storage == REDIS_VM_LOADING || o->storage == REDIS_VM_SWAPPING);
again:
    lockThreadedIO();
    /* Search for a matching object in one of the queues */
    for (i = 0; i < 3; i++) {
        listNode *ln;
        listIter li;

        listRewind(lists[i],&li);
        while ((ln = listNext(&li)) != NULL) {
            iojob *job = ln->value;

            if (job->canceled) continue; /* Skip this, already canceled. */
            if (job->id == o) {
                redisLog(REDIS_DEBUG,"*** CANCELED %p (key %s) (type %d) (LIST ID %d)\n",
                    (void*)job, (char*)job->key->ptr, job->type, i);
                /* Mark the pages as free since the swap didn't happen
                 * or happened but is now discarded. This is skipped for
                 * jobs in the processing queue, as in that case we retry
                 * later and will find the job in a different queue. */
                if (i != 1 && job->type == REDIS_IOJOB_DO_SWAP)
                    vmMarkPagesFree(job->page,job->pages);
                /* Cancel the job. It depends on the list the job is
                 * living in. */
                switch(i) {
                case 0: /* io_newjobs */
                    /* If the job was yet not processed the best thing to do
                     * is to remove it from the queue at all */
                    freeIOJob(job);
                    listDelNode(lists[i],ln);
                    break;
                case 1: /* io_processing */
                    /* Oh Shi- the thread is messing with the Job:
                     *
                     * Probably it's accessing the object if this is a
                     * PREPARE_SWAP or DO_SWAP job.
                     * If it's a LOAD job it may be reading from disk and
                     * if we don't wait for the job to terminate before to
                     * cancel it, maybe in a few microseconds data can be
                     * corrupted in this pages. So the short story is:
                     *
                     * Better to wait for the job to move into the
                     * next queue (processed)... */

                    /* We try again and again until the job is completed. */
                    unlockThreadedIO();
                    /* But let's wait some time for the I/O thread
                     * to finish with this job. After all this condition
                     * should be very rare. */
                    usleep(1);
                    goto again;
                case 2: /* io_processed */
                    /* The job was already processed, that's easy...
                     * just mark it as canceled so that we'll ignore it
                     * when processing completed jobs. */
                    job->canceled = 1;
                    break;
                }
                /* Finally we have to adjust the storage type of the object
                 * in order to "UNDO" the operation. */
                if (o->storage == REDIS_VM_LOADING)
                    o->storage = REDIS_VM_SWAPPED;
                else if (o->storage == REDIS_VM_SWAPPING)
                    o->storage = REDIS_VM_MEMORY;
                unlockThreadedIO();
                redisLog(REDIS_DEBUG,"*** DONE");
                return;
            }
        }
    }
    unlockThreadedIO();
    printf("Not found: %p\n", (void*)o);
    redisAssert(1 != 1); /* We should never reach this */
}
10100
10101 static void *IOThreadEntryPoint(void *arg) {
10102 iojob *j;
10103 listNode *ln;
10104 REDIS_NOTUSED(arg);
10105
10106 pthread_detach(pthread_self());
10107 while(1) {
10108 /* Get a new job to process */
10109 lockThreadedIO();
10110 if (listLength(server.io_newjobs) == 0) {
10111 /* No new jobs in queue, exit. */
10112 redisLog(REDIS_DEBUG,"Thread %ld exiting, nothing to do",
10113 (long) pthread_self());
10114 server.io_active_threads--;
10115 unlockThreadedIO();
10116 return NULL;
10117 }
10118 ln = listFirst(server.io_newjobs);
10119 j = ln->value;
10120 listDelNode(server.io_newjobs,ln);
10121 /* Add the job in the processing queue */
10122 j->thread = pthread_self();
10123 listAddNodeTail(server.io_processing,j);
10124 ln = listLast(server.io_processing); /* We use ln later to remove it */
10125 unlockThreadedIO();
10126 redisLog(REDIS_DEBUG,"Thread %ld got a new job (type %d): %p about key '%s'",
10127 (long) pthread_self(), j->type, (void*)j, (char*)j->key->ptr);
10128
10129 /* Process the Job */
10130 if (j->type == REDIS_IOJOB_LOAD) {
10131 vmpointer *vp = (vmpointer*)j->id;
10132 j->val = vmReadObjectFromSwap(j->page,vp->vtype);
10133 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
10134 FILE *fp = fopen("/dev/null","w+");
10135 j->pages = rdbSavedObjectPages(j->val,fp);
10136 fclose(fp);
10137 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
10138 if (vmWriteObjectOnSwap(j->val,j->page) == REDIS_ERR)
10139 j->canceled = 1;
10140 }
10141
10142 /* Done: insert the job into the processed queue */
10143 redisLog(REDIS_DEBUG,"Thread %ld completed the job: %p (key %s)",
10144 (long) pthread_self(), (void*)j, (char*)j->key->ptr);
10145 lockThreadedIO();
10146 listDelNode(server.io_processing,ln);
10147 listAddNodeTail(server.io_processed,j);
10148 unlockThreadedIO();
10149
10150 /* Signal the main thread there is new stuff to process */
10151 assert(write(server.io_ready_pipe_write,"x",1) == 1);
10152 }
10153 return NULL; /* never reached */
10154 }
10155
10156 static void spawnIOThread(void) {
10157 pthread_t thread;
10158 sigset_t mask, omask;
10159 int err;
10160
10161 sigemptyset(&mask);
10162 sigaddset(&mask,SIGCHLD);
10163 sigaddset(&mask,SIGHUP);
10164 sigaddset(&mask,SIGPIPE);
10165 pthread_sigmask(SIG_SETMASK, &mask, &omask);
10166 while ((err = pthread_create(&thread,&server.io_threads_attr,IOThreadEntryPoint,NULL)) != 0) {
10167 redisLog(REDIS_WARNING,"Unable to spawn an I/O thread: %s",
10168 strerror(err));
10169 usleep(1000000);
10170 }
10171 pthread_sigmask(SIG_SETMASK, &omask, NULL);
10172 server.io_active_threads++;
10173 }
10174
10175 /* We need to wait for the last thread to exit before we are able to
10176 * fork() in order to BGSAVE or BGREWRITEAOF. */
10177 static void waitEmptyIOJobsQueue(void) {
10178 while(1) {
10179 int io_processed_len;
10180
10181 lockThreadedIO();
10182 if (listLength(server.io_newjobs) == 0 &&
10183 listLength(server.io_processing) == 0 &&
10184 server.io_active_threads == 0)
10185 {
10186 unlockThreadedIO();
10187 return;
10188 }
10189 /* While waiting for empty jobs queue condition we post-process some
10190 * finshed job, as I/O threads may be hanging trying to write against
10191 * the io_ready_pipe_write FD but there are so much pending jobs that
10192 * it's blocking. */
10193 io_processed_len = listLength(server.io_processed);
10194 unlockThreadedIO();
10195 if (io_processed_len) {
10196 vmThreadedIOCompletedJob(NULL,server.io_ready_pipe_read,NULL,0);
10197 usleep(1000); /* 1 millisecond */
10198 } else {
10199 usleep(10000); /* 10 milliseconds */
10200 }
10201 }
10202 }
10203
10204 static void vmReopenSwapFile(void) {
10205 /* Note: we don't close the old one as we are in the child process
10206 * and don't want to mess at all with the original file object. */
10207 server.vm_fp = fopen(server.vm_swap_file,"r+b");
10208 if (server.vm_fp == NULL) {
10209 redisLog(REDIS_WARNING,"Can't re-open the VM swap file: %s. Exiting.",
10210 server.vm_swap_file);
10211 _exit(1);
10212 }
10213 server.vm_fd = fileno(server.vm_fp);
10214 }
10215
10216 /* This function must be called while with threaded IO locked */
10217 static void queueIOJob(iojob *j) {
10218 redisLog(REDIS_DEBUG,"Queued IO Job %p type %d about key '%s'\n",
10219 (void*)j, j->type, (char*)j->key->ptr);
10220 listAddNodeTail(server.io_newjobs,j);
10221 if (server.io_active_threads < server.vm_max_threads)
10222 spawnIOThread();
10223 }
10224
10225 static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db) {
10226 iojob *j;
10227
10228 j = zmalloc(sizeof(*j));
10229 j->type = REDIS_IOJOB_PREPARE_SWAP;
10230 j->db = db;
10231 j->key = key;
10232 incrRefCount(key);
10233 j->id = j->val = val;
10234 incrRefCount(val);
10235 j->canceled = 0;
10236 j->thread = (pthread_t) -1;
10237 val->storage = REDIS_VM_SWAPPING;
10238
10239 lockThreadedIO();
10240 queueIOJob(j);
10241 unlockThreadedIO();
10242 return REDIS_OK;
10243 }
10244
10245 /* ============ Virtual Memory - Blocking clients on missing keys =========== */
10246
10247 /* This function makes the clinet 'c' waiting for the key 'key' to be loaded.
10248 * If there is not already a job loading the key, it is craeted.
10249 * The key is added to the io_keys list in the client structure, and also
10250 * in the hash table mapping swapped keys to waiting clients, that is,
10251 * server.io_waited_keys. */
10252 static int waitForSwappedKey(redisClient *c, robj *key) {
10253 struct dictEntry *de;
10254 robj *o;
10255 list *l;
10256
10257 /* If the key does not exist or is already in RAM we don't need to
10258 * block the client at all. */
10259 de = dictFind(c->db->dict,key->ptr);
10260 if (de == NULL) return 0;
10261 o = dictGetEntryVal(de);
10262 if (o->storage == REDIS_VM_MEMORY) {
10263 return 0;
10264 } else if (o->storage == REDIS_VM_SWAPPING) {
10265 /* We were swapping the key, undo it! */
10266 vmCancelThreadedIOJob(o);
10267 return 0;
10268 }
10269
10270 /* OK: the key is either swapped, or being loaded just now. */
10271
10272 /* Add the key to the list of keys this client is waiting for.
10273 * This maps clients to keys they are waiting for. */
10274 listAddNodeTail(c->io_keys,key);
10275 incrRefCount(key);
10276
10277 /* Add the client to the swapped keys => clients waiting map. */
10278 de = dictFind(c->db->io_keys,key);
10279 if (de == NULL) {
10280 int retval;
10281
10282 /* For every key we take a list of clients blocked for it */
10283 l = listCreate();
10284 retval = dictAdd(c->db->io_keys,key,l);
10285 incrRefCount(key);
10286 assert(retval == DICT_OK);
10287 } else {
10288 l = dictGetEntryVal(de);
10289 }
10290 listAddNodeTail(l,c);
10291
10292 /* Are we already loading the key from disk? If not create a job */
10293 if (o->storage == REDIS_VM_SWAPPED) {
10294 iojob *j;
10295 vmpointer *vp = (vmpointer*)o;
10296
10297 o->storage = REDIS_VM_LOADING;
10298 j = zmalloc(sizeof(*j));
10299 j->type = REDIS_IOJOB_LOAD;
10300 j->db = c->db;
10301 j->id = (robj*)vp;
10302 j->key = key;
10303 incrRefCount(key);
10304 j->page = vp->page;
10305 j->val = NULL;
10306 j->canceled = 0;
10307 j->thread = (pthread_t) -1;
10308 lockThreadedIO();
10309 queueIOJob(j);
10310 unlockThreadedIO();
10311 }
10312 return 1;
10313 }
10314
10315 /* Preload keys for any command with first, last and step values for
10316 * the command keys prototype, as defined in the command table. */
10317 static void waitForMultipleSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) {
10318 int j, last;
10319 if (cmd->vm_firstkey == 0) return;
10320 last = cmd->vm_lastkey;
10321 if (last < 0) last = argc+last;
10322 for (j = cmd->vm_firstkey; j <= last; j += cmd->vm_keystep) {
10323 redisAssert(j < argc);
10324 waitForSwappedKey(c,argv[j]);
10325 }
10326 }
10327
10328 /* Preload keys needed for the ZUNIONSTORE and ZINTERSTORE commands.
10329 * Note that the number of keys to preload is user-defined, so we need to
10330 * apply a sanity check against argc. */
10331 static void zunionInterBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) {
10332 int i, num;
10333 REDIS_NOTUSED(cmd);
10334
10335 num = atoi(argv[2]->ptr);
10336 if (num > (argc-3)) return;
10337 for (i = 0; i < num; i++) {
10338 waitForSwappedKey(c,argv[3+i]);
10339 }
10340 }
10341
10342 /* Preload keys needed to execute the entire MULTI/EXEC block.
10343 *
10344 * This function is called by blockClientOnSwappedKeys when EXEC is issued,
10345 * and will block the client when any command requires a swapped out value. */
10346 static void execBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) {
10347 int i, margc;
10348 struct redisCommand *mcmd;
10349 robj **margv;
10350 REDIS_NOTUSED(cmd);
10351 REDIS_NOTUSED(argc);
10352 REDIS_NOTUSED(argv);
10353
10354 if (!(c->flags & REDIS_MULTI)) return;
10355 for (i = 0; i < c->mstate.count; i++) {
10356 mcmd = c->mstate.commands[i].cmd;
10357 margc = c->mstate.commands[i].argc;
10358 margv = c->mstate.commands[i].argv;
10359
10360 if (mcmd->vm_preload_proc != NULL) {
10361 mcmd->vm_preload_proc(c,mcmd,margc,margv);
10362 } else {
10363 waitForMultipleSwappedKeys(c,mcmd,margc,margv);
10364 }
10365 }
10366 }
10367
10368 /* Is this client attempting to run a command against swapped keys?
10369 * If so, block it ASAP, load the keys in background, then resume it.
10370 *
10371 * The important idea about this function is that it can fail! If keys will
10372 * still be swapped when the client is resumed, this key lookups will
10373 * just block loading keys from disk. In practical terms this should only
10374 * happen with SORT BY command or if there is a bug in this function.
10375 *
10376 * Return 1 if the client is marked as blocked, 0 if the client can
10377 * continue as the keys it is going to access appear to be in memory. */
10378 static int blockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd) {
10379 if (cmd->vm_preload_proc != NULL) {
10380 cmd->vm_preload_proc(c,cmd,c->argc,c->argv);
10381 } else {
10382 waitForMultipleSwappedKeys(c,cmd,c->argc,c->argv);
10383 }
10384
10385 /* If the client was blocked for at least one key, mark it as blocked. */
10386 if (listLength(c->io_keys)) {
10387 c->flags |= REDIS_IO_WAIT;
10388 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
10389 server.vm_blocked_clients++;
10390 return 1;
10391 } else {
10392 return 0;
10393 }
10394 }
10395
10396 /* Remove the 'key' from the list of blocked keys for a given client.
10397 *
10398 * The function returns 1 when there are no longer blocking keys after
10399 * the current one was removed (and the client can be unblocked). */
10400 static int dontWaitForSwappedKey(redisClient *c, robj *key) {
10401 list *l;
10402 listNode *ln;
10403 listIter li;
10404 struct dictEntry *de;
10405
10406 /* Remove the key from the list of keys this client is waiting for. */
10407 listRewind(c->io_keys,&li);
10408 while ((ln = listNext(&li)) != NULL) {
10409 if (equalStringObjects(ln->value,key)) {
10410 listDelNode(c->io_keys,ln);
10411 break;
10412 }
10413 }
10414 assert(ln != NULL);
10415
10416 /* Remove the client form the key => waiting clients map. */
10417 de = dictFind(c->db->io_keys,key);
10418 assert(de != NULL);
10419 l = dictGetEntryVal(de);
10420 ln = listSearchKey(l,c);
10421 assert(ln != NULL);
10422 listDelNode(l,ln);
10423 if (listLength(l) == 0)
10424 dictDelete(c->db->io_keys,key);
10425
10426 return listLength(c->io_keys) == 0;
10427 }
10428
10429 /* Every time we now a key was loaded back in memory, we handle clients
10430 * waiting for this key if any. */
10431 static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key) {
10432 struct dictEntry *de;
10433 list *l;
10434 listNode *ln;
10435 int len;
10436
10437 de = dictFind(db->io_keys,key);
10438 if (!de) return;
10439
10440 l = dictGetEntryVal(de);
10441 len = listLength(l);
10442 /* Note: we can't use something like while(listLength(l)) as the list
10443 * can be freed by the calling function when we remove the last element. */
10444 while (len--) {
10445 ln = listFirst(l);
10446 redisClient *c = ln->value;
10447
10448 if (dontWaitForSwappedKey(c,key)) {
10449 /* Put the client in the list of clients ready to go as we
10450 * loaded all the keys about it. */
10451 listAddNodeTail(server.io_ready_clients,c);
10452 }
10453 }
10454 }
10455
10456 /* =========================== Remote Configuration ========================= */
10457
10458 static void configSetCommand(redisClient *c) {
10459 robj *o = getDecodedObject(c->argv[3]);
10460 long long ll;
10461
10462 if (!strcasecmp(c->argv[2]->ptr,"dbfilename")) {
10463 zfree(server.dbfilename);
10464 server.dbfilename = zstrdup(o->ptr);
10465 } else if (!strcasecmp(c->argv[2]->ptr,"requirepass")) {
10466 zfree(server.requirepass);
10467 server.requirepass = zstrdup(o->ptr);
10468 } else if (!strcasecmp(c->argv[2]->ptr,"masterauth")) {
10469 zfree(server.masterauth);
10470 server.masterauth = zstrdup(o->ptr);
10471 } else if (!strcasecmp(c->argv[2]->ptr,"maxmemory")) {
10472 if (getLongLongFromObject(o,&ll) == REDIS_ERR ||
10473 ll < 0) goto badfmt;
10474 server.maxmemory = ll;
10475 } else if (!strcasecmp(c->argv[2]->ptr,"timeout")) {
10476 if (getLongLongFromObject(o,&ll) == REDIS_ERR ||
10477 ll < 0 || ll > LONG_MAX) goto badfmt;
10478 server.maxidletime = ll;
10479 } else if (!strcasecmp(c->argv[2]->ptr,"appendfsync")) {
10480 if (!strcasecmp(o->ptr,"no")) {
10481 server.appendfsync = APPENDFSYNC_NO;
10482 } else if (!strcasecmp(o->ptr,"everysec")) {
10483 server.appendfsync = APPENDFSYNC_EVERYSEC;
10484 } else if (!strcasecmp(o->ptr,"always")) {
10485 server.appendfsync = APPENDFSYNC_ALWAYS;
10486 } else {
10487 goto badfmt;
10488 }
10489 } else if (!strcasecmp(c->argv[2]->ptr,"no-appendfsync-on-rewrite")) {
10490 int yn = yesnotoi(o->ptr);
10491
10492 if (yn == -1) goto badfmt;
10493 server.no_appendfsync_on_rewrite = yn;
10494 } else if (!strcasecmp(c->argv[2]->ptr,"appendonly")) {
10495 int old = server.appendonly;
10496 int new = yesnotoi(o->ptr);
10497
10498 if (new == -1) goto badfmt;
10499 if (old != new) {
10500 if (new == 0) {
10501 stopAppendOnly();
10502 } else {
10503 if (startAppendOnly() == REDIS_ERR) {
10504 addReplySds(c,sdscatprintf(sdsempty(),
10505 "-ERR Unable to turn on AOF. Check server logs.\r\n"));
10506 decrRefCount(o);
10507 return;
10508 }
10509 }
10510 }
10511 } else if (!strcasecmp(c->argv[2]->ptr,"save")) {
10512 int vlen, j;
10513 sds *v = sdssplitlen(o->ptr,sdslen(o->ptr)," ",1,&vlen);
10514
10515 /* Perform sanity check before setting the new config:
10516 * - Even number of args
10517 * - Seconds >= 1, changes >= 0 */
10518 if (vlen & 1) {
10519 sdsfreesplitres(v,vlen);
10520 goto badfmt;
10521 }
10522 for (j = 0; j < vlen; j++) {
10523 char *eptr;
10524 long val;
10525
10526 val = strtoll(v[j], &eptr, 10);
10527 if (eptr[0] != '\0' ||
10528 ((j & 1) == 0 && val < 1) ||
10529 ((j & 1) == 1 && val < 0)) {
10530 sdsfreesplitres(v,vlen);
10531 goto badfmt;
10532 }
10533 }
10534 /* Finally set the new config */
10535 resetServerSaveParams();
10536 for (j = 0; j < vlen; j += 2) {
10537 time_t seconds;
10538 int changes;
10539
10540 seconds = strtoll(v[j],NULL,10);
10541 changes = strtoll(v[j+1],NULL,10);
10542 appendServerSaveParams(seconds, changes);
10543 }
10544 sdsfreesplitres(v,vlen);
10545 } else {
10546 addReplySds(c,sdscatprintf(sdsempty(),
10547 "-ERR not supported CONFIG parameter %s\r\n",
10548 (char*)c->argv[2]->ptr));
10549 decrRefCount(o);
10550 return;
10551 }
10552 decrRefCount(o);
10553 addReply(c,shared.ok);
10554 return;
10555
10556 badfmt: /* Bad format errors */
10557 addReplySds(c,sdscatprintf(sdsempty(),
10558 "-ERR invalid argument '%s' for CONFIG SET '%s'\r\n",
10559 (char*)o->ptr,
10560 (char*)c->argv[2]->ptr));
10561 decrRefCount(o);
10562 }
10563
10564 static void configGetCommand(redisClient *c) {
10565 robj *o = getDecodedObject(c->argv[2]);
10566 robj *lenobj = createObject(REDIS_STRING,NULL);
10567 char *pattern = o->ptr;
10568 int matches = 0;
10569
10570 addReply(c,lenobj);
10571 decrRefCount(lenobj);
10572
10573 if (stringmatch(pattern,"dbfilename",0)) {
10574 addReplyBulkCString(c,"dbfilename");
10575 addReplyBulkCString(c,server.dbfilename);
10576 matches++;
10577 }
10578 if (stringmatch(pattern,"requirepass",0)) {
10579 addReplyBulkCString(c,"requirepass");
10580 addReplyBulkCString(c,server.requirepass);
10581 matches++;
10582 }
10583 if (stringmatch(pattern,"masterauth",0)) {
10584 addReplyBulkCString(c,"masterauth");
10585 addReplyBulkCString(c,server.masterauth);
10586 matches++;
10587 }
10588 if (stringmatch(pattern,"maxmemory",0)) {
10589 char buf[128];
10590
10591 ll2string(buf,128,server.maxmemory);
10592 addReplyBulkCString(c,"maxmemory");
10593 addReplyBulkCString(c,buf);
10594 matches++;
10595 }
10596 if (stringmatch(pattern,"timeout",0)) {
10597 char buf[128];
10598
10599 ll2string(buf,128,server.maxidletime);
10600 addReplyBulkCString(c,"timeout");
10601 addReplyBulkCString(c,buf);
10602 matches++;
10603 }
10604 if (stringmatch(pattern,"appendonly",0)) {
10605 addReplyBulkCString(c,"appendonly");
10606 addReplyBulkCString(c,server.appendonly ? "yes" : "no");
10607 matches++;
10608 }
10609 if (stringmatch(pattern,"no-appendfsync-on-rewrite",0)) {
10610 addReplyBulkCString(c,"no-appendfsync-on-rewrite");
10611 addReplyBulkCString(c,server.no_appendfsync_on_rewrite ? "yes" : "no");
10612 matches++;
10613 }
10614 if (stringmatch(pattern,"appendfsync",0)) {
10615 char *policy;
10616
10617 switch(server.appendfsync) {
10618 case APPENDFSYNC_NO: policy = "no"; break;
10619 case APPENDFSYNC_EVERYSEC: policy = "everysec"; break;
10620 case APPENDFSYNC_ALWAYS: policy = "always"; break;
10621 default: policy = "unknown"; break; /* too harmless to panic */
10622 }
10623 addReplyBulkCString(c,"appendfsync");
10624 addReplyBulkCString(c,policy);
10625 matches++;
10626 }
10627 if (stringmatch(pattern,"save",0)) {
10628 sds buf = sdsempty();
10629 int j;
10630
10631 for (j = 0; j < server.saveparamslen; j++) {
10632 buf = sdscatprintf(buf,"%ld %d",
10633 server.saveparams[j].seconds,
10634 server.saveparams[j].changes);
10635 if (j != server.saveparamslen-1)
10636 buf = sdscatlen(buf," ",1);
10637 }
10638 addReplyBulkCString(c,"save");
10639 addReplyBulkCString(c,buf);
10640 sdsfree(buf);
10641 matches++;
10642 }
10643 decrRefCount(o);
10644 lenobj->ptr = sdscatprintf(sdsempty(),"*%d\r\n",matches*2);
10645 }
10646
10647 static void configCommand(redisClient *c) {
10648 if (!strcasecmp(c->argv[1]->ptr,"set")) {
10649 if (c->argc != 4) goto badarity;
10650 configSetCommand(c);
10651 } else if (!strcasecmp(c->argv[1]->ptr,"get")) {
10652 if (c->argc != 3) goto badarity;
10653 configGetCommand(c);
10654 } else if (!strcasecmp(c->argv[1]->ptr,"resetstat")) {
10655 if (c->argc != 2) goto badarity;
10656 server.stat_numcommands = 0;
10657 server.stat_numconnections = 0;
10658 server.stat_expiredkeys = 0;
10659 server.stat_starttime = time(NULL);
10660 addReply(c,shared.ok);
10661 } else {
10662 addReplySds(c,sdscatprintf(sdsempty(),
10663 "-ERR CONFIG subcommand must be one of GET, SET, RESETSTAT\r\n"));
10664 }
10665 return;
10666
10667 badarity:
10668 addReplySds(c,sdscatprintf(sdsempty(),
10669 "-ERR Wrong number of arguments for CONFIG %s\r\n",
10670 (char*) c->argv[1]->ptr));
10671 }
10672
10673 /* =========================== Pubsub implementation ======================== */
10674
10675 static void freePubsubPattern(void *p) {
10676 pubsubPattern *pat = p;
10677
10678 decrRefCount(pat->pattern);
10679 zfree(pat);
10680 }
10681
10682 static int listMatchPubsubPattern(void *a, void *b) {
10683 pubsubPattern *pa = a, *pb = b;
10684
10685 return (pa->client == pb->client) &&
10686 (equalStringObjects(pa->pattern,pb->pattern));
10687 }
10688
10689 /* Subscribe a client to a channel. Returns 1 if the operation succeeded, or
10690 * 0 if the client was already subscribed to that channel. */
10691 static int pubsubSubscribeChannel(redisClient *c, robj *channel) {
10692 struct dictEntry *de;
10693 list *clients = NULL;
10694 int retval = 0;
10695
10696 /* Add the channel to the client -> channels hash table */
10697 if (dictAdd(c->pubsub_channels,channel,NULL) == DICT_OK) {
10698 retval = 1;
10699 incrRefCount(channel);
10700 /* Add the client to the channel -> list of clients hash table */
10701 de = dictFind(server.pubsub_channels,channel);
10702 if (de == NULL) {
10703 clients = listCreate();
10704 dictAdd(server.pubsub_channels,channel,clients);
10705 incrRefCount(channel);
10706 } else {
10707 clients = dictGetEntryVal(de);
10708 }
10709 listAddNodeTail(clients,c);
10710 }
10711 /* Notify the client */
10712 addReply(c,shared.mbulk3);
10713 addReply(c,shared.subscribebulk);
10714 addReplyBulk(c,channel);
10715 addReplyLongLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
10716 return retval;
10717 }
10718
10719 /* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
10720 * 0 if the client was not subscribed to the specified channel. */
10721 static int pubsubUnsubscribeChannel(redisClient *c, robj *channel, int notify) {
10722 struct dictEntry *de;
10723 list *clients;
10724 listNode *ln;
10725 int retval = 0;
10726
10727 /* Remove the channel from the client -> channels hash table */
10728 incrRefCount(channel); /* channel may be just a pointer to the same object
10729 we have in the hash tables. Protect it... */
10730 if (dictDelete(c->pubsub_channels,channel) == DICT_OK) {
10731 retval = 1;
10732 /* Remove the client from the channel -> clients list hash table */
10733 de = dictFind(server.pubsub_channels,channel);
10734 assert(de != NULL);
10735 clients = dictGetEntryVal(de);
10736 ln = listSearchKey(clients,c);
10737 assert(ln != NULL);
10738 listDelNode(clients,ln);
10739 if (listLength(clients) == 0) {
10740 /* Free the list and associated hash entry at all if this was
10741 * the latest client, so that it will be possible to abuse
10742 * Redis PUBSUB creating millions of channels. */
10743 dictDelete(server.pubsub_channels,channel);
10744 }
10745 }
10746 /* Notify the client */
10747 if (notify) {
10748 addReply(c,shared.mbulk3);
10749 addReply(c,shared.unsubscribebulk);
10750 addReplyBulk(c,channel);
10751 addReplyLongLong(c,dictSize(c->pubsub_channels)+
10752 listLength(c->pubsub_patterns));
10753
10754 }
10755 decrRefCount(channel); /* it is finally safe to release it */
10756 return retval;
10757 }
10758
10759 /* Subscribe a client to a pattern. Returns 1 if the operation succeeded, or 0 if the clinet was already subscribed to that pattern. */
10760 static int pubsubSubscribePattern(redisClient *c, robj *pattern) {
10761 int retval = 0;
10762
10763 if (listSearchKey(c->pubsub_patterns,pattern) == NULL) {
10764 retval = 1;
10765 pubsubPattern *pat;
10766 listAddNodeTail(c->pubsub_patterns,pattern);
10767 incrRefCount(pattern);
10768 pat = zmalloc(sizeof(*pat));
10769 pat->pattern = getDecodedObject(pattern);
10770 pat->client = c;
10771 listAddNodeTail(server.pubsub_patterns,pat);
10772 }
10773 /* Notify the client */
10774 addReply(c,shared.mbulk3);
10775 addReply(c,shared.psubscribebulk);
10776 addReplyBulk(c,pattern);
10777 addReplyLongLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
10778 return retval;
10779 }
10780
10781 /* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
10782 * 0 if the client was not subscribed to the specified channel. */
10783 static int pubsubUnsubscribePattern(redisClient *c, robj *pattern, int notify) {
10784 listNode *ln;
10785 pubsubPattern pat;
10786 int retval = 0;
10787
10788 incrRefCount(pattern); /* Protect the object. May be the same we remove */
10789 if ((ln = listSearchKey(c->pubsub_patterns,pattern)) != NULL) {
10790 retval = 1;
10791 listDelNode(c->pubsub_patterns,ln);
10792 pat.client = c;
10793 pat.pattern = pattern;
10794 ln = listSearchKey(server.pubsub_patterns,&pat);
10795 listDelNode(server.pubsub_patterns,ln);
10796 }
10797 /* Notify the client */
10798 if (notify) {
10799 addReply(c,shared.mbulk3);
10800 addReply(c,shared.punsubscribebulk);
10801 addReplyBulk(c,pattern);
10802 addReplyLongLong(c,dictSize(c->pubsub_channels)+
10803 listLength(c->pubsub_patterns));
10804 }
10805 decrRefCount(pattern);
10806 return retval;
10807 }
10808
10809 /* Unsubscribe from all the channels. Return the number of channels the
10810 * client was subscribed from. */
10811 static int pubsubUnsubscribeAllChannels(redisClient *c, int notify) {
10812 dictIterator *di = dictGetIterator(c->pubsub_channels);
10813 dictEntry *de;
10814 int count = 0;
10815
10816 while((de = dictNext(di)) != NULL) {
10817 robj *channel = dictGetEntryKey(de);
10818
10819 count += pubsubUnsubscribeChannel(c,channel,notify);
10820 }
10821 dictReleaseIterator(di);
10822 return count;
10823 }
10824
10825 /* Unsubscribe from all the patterns. Return the number of patterns the
10826 * client was subscribed from. */
10827 static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify) {
10828 listNode *ln;
10829 listIter li;
10830 int count = 0;
10831
10832 listRewind(c->pubsub_patterns,&li);
10833 while ((ln = listNext(&li)) != NULL) {
10834 robj *pattern = ln->value;
10835
10836 count += pubsubUnsubscribePattern(c,pattern,notify);
10837 }
10838 return count;
10839 }
10840
10841 /* Publish a message */
10842 static int pubsubPublishMessage(robj *channel, robj *message) {
10843 int receivers = 0;
10844 struct dictEntry *de;
10845 listNode *ln;
10846 listIter li;
10847
10848 /* Send to clients listening for that channel */
10849 de = dictFind(server.pubsub_channels,channel);
10850 if (de) {
10851 list *list = dictGetEntryVal(de);
10852 listNode *ln;
10853 listIter li;
10854
10855 listRewind(list,&li);
10856 while ((ln = listNext(&li)) != NULL) {
10857 redisClient *c = ln->value;
10858
10859 addReply(c,shared.mbulk3);
10860 addReply(c,shared.messagebulk);
10861 addReplyBulk(c,channel);
10862 addReplyBulk(c,message);
10863 receivers++;
10864 }
10865 }
10866 /* Send to clients listening to matching channels */
10867 if (listLength(server.pubsub_patterns)) {
10868 listRewind(server.pubsub_patterns,&li);
10869 channel = getDecodedObject(channel);
10870 while ((ln = listNext(&li)) != NULL) {
10871 pubsubPattern *pat = ln->value;
10872
10873 if (stringmatchlen((char*)pat->pattern->ptr,
10874 sdslen(pat->pattern->ptr),
10875 (char*)channel->ptr,
10876 sdslen(channel->ptr),0)) {
10877 addReply(pat->client,shared.mbulk4);
10878 addReply(pat->client,shared.pmessagebulk);
10879 addReplyBulk(pat->client,pat->pattern);
10880 addReplyBulk(pat->client,channel);
10881 addReplyBulk(pat->client,message);
10882 receivers++;
10883 }
10884 }
10885 decrRefCount(channel);
10886 }
10887 return receivers;
10888 }
10889
10890 static void subscribeCommand(redisClient *c) {
10891 int j;
10892
10893 for (j = 1; j < c->argc; j++)
10894 pubsubSubscribeChannel(c,c->argv[j]);
10895 }
10896
10897 static void unsubscribeCommand(redisClient *c) {
10898 if (c->argc == 1) {
10899 pubsubUnsubscribeAllChannels(c,1);
10900 return;
10901 } else {
10902 int j;
10903
10904 for (j = 1; j < c->argc; j++)
10905 pubsubUnsubscribeChannel(c,c->argv[j],1);
10906 }
10907 }
10908
10909 static void psubscribeCommand(redisClient *c) {
10910 int j;
10911
10912 for (j = 1; j < c->argc; j++)
10913 pubsubSubscribePattern(c,c->argv[j]);
10914 }
10915
10916 static void punsubscribeCommand(redisClient *c) {
10917 if (c->argc == 1) {
10918 pubsubUnsubscribeAllPatterns(c,1);
10919 return;
10920 } else {
10921 int j;
10922
10923 for (j = 1; j < c->argc; j++)
10924 pubsubUnsubscribePattern(c,c->argv[j],1);
10925 }
10926 }
10927
10928 static void publishCommand(redisClient *c) {
10929 int receivers = pubsubPublishMessage(c->argv[1],c->argv[2]);
10930 addReplyLongLong(c,receivers);
10931 }
10932
10933 /* ===================== WATCH (CAS alike for MULTI/EXEC) ===================
10934 *
10935 * The implementation uses a per-DB hash table mapping keys to list of clients
10936 * WATCHing those keys, so that given a key that is going to be modified
10937 * we can mark all the associated clients as dirty.
10938 *
10939 * Also every client contains a list of WATCHed keys so that's possible to
10940 * un-watch such keys when the client is freed or when UNWATCH is called. */
10941
10942 /* In the client->watched_keys list we need to use watchedKey structures
10943 * as in order to identify a key in Redis we need both the key name and the
10944 * DB */
10945 typedef struct watchedKey {
10946 robj *key;
10947 redisDb *db;
10948 } watchedKey;
10949
10950 /* Watch for the specified key */
10951 static void watchForKey(redisClient *c, robj *key) {
10952 list *clients = NULL;
10953 listIter li;
10954 listNode *ln;
10955 watchedKey *wk;
10956
10957 /* Check if we are already watching for this key */
10958 listRewind(c->watched_keys,&li);
10959 while((ln = listNext(&li))) {
10960 wk = listNodeValue(ln);
10961 if (wk->db == c->db && equalStringObjects(key,wk->key))
10962 return; /* Key already watched */
10963 }
10964 /* This key is not already watched in this DB. Let's add it */
10965 clients = dictFetchValue(c->db->watched_keys,key);
10966 if (!clients) {
10967 clients = listCreate();
10968 dictAdd(c->db->watched_keys,key,clients);
10969 incrRefCount(key);
10970 }
10971 listAddNodeTail(clients,c);
10972 /* Add the new key to the lits of keys watched by this client */
10973 wk = zmalloc(sizeof(*wk));
10974 wk->key = key;
10975 wk->db = c->db;
10976 incrRefCount(key);
10977 listAddNodeTail(c->watched_keys,wk);
10978 }
10979
10980 /* Unwatch all the keys watched by this client. To clean the EXEC dirty
10981 * flag is up to the caller. */
10982 static void unwatchAllKeys(redisClient *c) {
10983 listIter li;
10984 listNode *ln;
10985
10986 if (listLength(c->watched_keys) == 0) return;
10987 listRewind(c->watched_keys,&li);
10988 while((ln = listNext(&li))) {
10989 list *clients;
10990 watchedKey *wk;
10991
10992 /* Lookup the watched key -> clients list and remove the client
10993 * from the list */
10994 wk = listNodeValue(ln);
10995 clients = dictFetchValue(wk->db->watched_keys, wk->key);
10996 assert(clients != NULL);
10997 listDelNode(clients,listSearchKey(clients,c));
10998 /* Kill the entry at all if this was the only client */
10999 if (listLength(clients) == 0)
11000 dictDelete(wk->db->watched_keys, wk->key);
11001 /* Remove this watched key from the client->watched list */
11002 listDelNode(c->watched_keys,ln);
11003 decrRefCount(wk->key);
11004 zfree(wk);
11005 }
11006 }
11007
11008 /* "Touch" a key, so that if this key is being WATCHed by some client the
11009 * next EXEC will fail. */
11010 static void touchWatchedKey(redisDb *db, robj *key) {
11011 list *clients;
11012 listIter li;
11013 listNode *ln;
11014
11015 if (dictSize(db->watched_keys) == 0) return;
11016 clients = dictFetchValue(db->watched_keys, key);
11017 if (!clients) return;
11018
11019 /* Mark all the clients watching this key as REDIS_DIRTY_CAS */
11020 /* Check if we are already watching for this key */
11021 listRewind(clients,&li);
11022 while((ln = listNext(&li))) {
11023 redisClient *c = listNodeValue(ln);
11024
11025 c->flags |= REDIS_DIRTY_CAS;
11026 }
11027 }
11028
11029 /* On FLUSHDB or FLUSHALL all the watched keys that are present before the
11030 * flush but will be deleted as effect of the flushing operation should
11031 * be touched. "dbid" is the DB that's getting the flush. -1 if it is
11032 * a FLUSHALL operation (all the DBs flushed). */
11033 static void touchWatchedKeysOnFlush(int dbid) {
11034 listIter li1, li2;
11035 listNode *ln;
11036
11037 /* For every client, check all the waited keys */
11038 listRewind(server.clients,&li1);
11039 while((ln = listNext(&li1))) {
11040 redisClient *c = listNodeValue(ln);
11041 listRewind(c->watched_keys,&li2);
11042 while((ln = listNext(&li2))) {
11043 watchedKey *wk = listNodeValue(ln);
11044
11045 /* For every watched key matching the specified DB, if the
11046 * key exists, mark the client as dirty, as the key will be
11047 * removed. */
11048 if (dbid == -1 || wk->db->id == dbid) {
11049 if (dictFind(wk->db->dict, wk->key->ptr) != NULL)
11050 c->flags |= REDIS_DIRTY_CAS;
11051 }
11052 }
11053 }
11054 }
11055
11056 static void watchCommand(redisClient *c) {
11057 int j;
11058
11059 if (c->flags & REDIS_MULTI) {
11060 addReplySds(c,sdsnew("-ERR WATCH inside MULTI is not allowed\r\n"));
11061 return;
11062 }
11063 for (j = 1; j < c->argc; j++)
11064 watchForKey(c,c->argv[j]);
11065 addReply(c,shared.ok);
11066 }
11067
11068 static void unwatchCommand(redisClient *c) {
11069 unwatchAllKeys(c);
11070 c->flags &= (~REDIS_DIRTY_CAS);
11071 addReply(c,shared.ok);
11072 }
11073
11074 /* ================================= Debugging ============================== */
11075
11076 /* Compute the sha1 of string at 's' with 'len' bytes long.
11077 * The SHA1 is then xored againt the string pointed by digest.
11078 * Since xor is commutative, this operation is used in order to
11079 * "add" digests relative to unordered elements.
11080 *
11081 * So digest(a,b,c,d) will be the same of digest(b,a,c,d) */
11082 static void xorDigest(unsigned char *digest, void *ptr, size_t len) {
11083 SHA1_CTX ctx;
11084 unsigned char hash[20], *s = ptr;
11085 int j;
11086
11087 SHA1Init(&ctx);
11088 SHA1Update(&ctx,s,len);
11089 SHA1Final(hash,&ctx);
11090
11091 for (j = 0; j < 20; j++)
11092 digest[j] ^= hash[j];
11093 }
11094
11095 static void xorObjectDigest(unsigned char *digest, robj *o) {
11096 o = getDecodedObject(o);
11097 xorDigest(digest,o->ptr,sdslen(o->ptr));
11098 decrRefCount(o);
11099 }
11100
11101 /* This function instead of just computing the SHA1 and xoring it
11102 * against diget, also perform the digest of "digest" itself and
11103 * replace the old value with the new one.
11104 *
11105 * So the final digest will be:
11106 *
11107 * digest = SHA1(digest xor SHA1(data))
11108 *
11109 * This function is used every time we want to preserve the order so
11110 * that digest(a,b,c,d) will be different than digest(b,c,d,a)
11111 *
11112 * Also note that mixdigest("foo") followed by mixdigest("bar")
11113 * will lead to a different digest compared to "fo", "obar".
11114 */
11115 static void mixDigest(unsigned char *digest, void *ptr, size_t len) {
11116 SHA1_CTX ctx;
11117 char *s = ptr;
11118
11119 xorDigest(digest,s,len);
11120 SHA1Init(&ctx);
11121 SHA1Update(&ctx,digest,20);
11122 SHA1Final(digest,&ctx);
11123 }
11124
11125 static void mixObjectDigest(unsigned char *digest, robj *o) {
11126 o = getDecodedObject(o);
11127 mixDigest(digest,o->ptr,sdslen(o->ptr));
11128 decrRefCount(o);
11129 }
11130
11131 /* Compute the dataset digest. Since keys, sets elements, hashes elements
11132 * are not ordered, we use a trick: every aggregate digest is the xor
11133 * of the digests of their elements. This way the order will not change
11134 * the result. For list instead we use a feedback entering the output digest
11135 * as input in order to ensure that a different ordered list will result in
11136 * a different digest. */
11137 static void computeDatasetDigest(unsigned char *final) {
11138 unsigned char digest[20];
11139 char buf[128];
11140 dictIterator *di = NULL;
11141 dictEntry *de;
11142 int j;
11143 uint32_t aux;
11144
11145 memset(final,0,20); /* Start with a clean result */
11146
11147 for (j = 0; j < server.dbnum; j++) {
11148 redisDb *db = server.db+j;
11149
11150 if (dictSize(db->dict) == 0) continue;
11151 di = dictGetIterator(db->dict);
11152
11153 /* hash the DB id, so the same dataset moved in a different
11154 * DB will lead to a different digest */
11155 aux = htonl(j);
11156 mixDigest(final,&aux,sizeof(aux));
11157
11158 /* Iterate this DB writing every entry */
11159 while((de = dictNext(di)) != NULL) {
11160 sds key;
11161 robj *keyobj, *o;
11162 time_t expiretime;
11163
11164 memset(digest,0,20); /* This key-val digest */
11165 key = dictGetEntryKey(de);
11166 keyobj = createStringObject(key,sdslen(key));
11167
11168 mixDigest(digest,key,sdslen(key));
11169
11170 /* Make sure the key is loaded if VM is active */
11171 o = lookupKeyRead(db,keyobj);
11172
11173 aux = htonl(o->type);
11174 mixDigest(digest,&aux,sizeof(aux));
11175 expiretime = getExpire(db,keyobj);
11176
11177 /* Save the key and associated value */
11178 if (o->type == REDIS_STRING) {
11179 mixObjectDigest(digest,o);
11180 } else if (o->type == REDIS_LIST) {
11181 listTypeIterator *li = listTypeInitIterator(o,0,REDIS_TAIL);
11182 listTypeEntry entry;
11183 while(listTypeNext(li,&entry)) {
11184 robj *eleobj = listTypeGet(&entry);
11185 mixObjectDigest(digest,eleobj);
11186 decrRefCount(eleobj);
11187 }
11188 listTypeReleaseIterator(li);
11189 } else if (o->type == REDIS_SET) {
11190 dict *set = o->ptr;
11191 dictIterator *di = dictGetIterator(set);
11192 dictEntry *de;
11193
11194 while((de = dictNext(di)) != NULL) {
11195 robj *eleobj = dictGetEntryKey(de);
11196
11197 xorObjectDigest(digest,eleobj);
11198 }
11199 dictReleaseIterator(di);
11200 } else if (o->type == REDIS_ZSET) {
11201 zset *zs = o->ptr;
11202 dictIterator *di = dictGetIterator(zs->dict);
11203 dictEntry *de;
11204
11205 while((de = dictNext(di)) != NULL) {
11206 robj *eleobj = dictGetEntryKey(de);
11207 double *score = dictGetEntryVal(de);
11208 unsigned char eledigest[20];
11209
11210 snprintf(buf,sizeof(buf),"%.17g",*score);
11211 memset(eledigest,0,20);
11212 mixObjectDigest(eledigest,eleobj);
11213 mixDigest(eledigest,buf,strlen(buf));
11214 xorDigest(digest,eledigest,20);
11215 }
11216 dictReleaseIterator(di);
11217 } else if (o->type == REDIS_HASH) {
11218 hashTypeIterator *hi;
11219 robj *obj;
11220
11221 hi = hashTypeInitIterator(o);
11222 while (hashTypeNext(hi) != REDIS_ERR) {
11223 unsigned char eledigest[20];
11224
11225 memset(eledigest,0,20);
11226 obj = hashTypeCurrent(hi,REDIS_HASH_KEY);
11227 mixObjectDigest(eledigest,obj);
11228 decrRefCount(obj);
11229 obj = hashTypeCurrent(hi,REDIS_HASH_VALUE);
11230 mixObjectDigest(eledigest,obj);
11231 decrRefCount(obj);
11232 xorDigest(digest,eledigest,20);
11233 }
11234 hashTypeReleaseIterator(hi);
11235 } else {
11236 redisPanic("Unknown object type");
11237 }
11238 /* If the key has an expire, add it to the mix */
11239 if (expiretime != -1) xorDigest(digest,"!!expire!!",10);
11240 /* We can finally xor the key-val digest to the final digest */
11241 xorDigest(final,digest,20);
11242 decrRefCount(keyobj);
11243 }
11244 dictReleaseIterator(di);
11245 }
11246 }
11247
11248 static void debugCommand(redisClient *c) {
11249 if (!strcasecmp(c->argv[1]->ptr,"segfault")) {
11250 *((char*)-1) = 'x';
11251 } else if (!strcasecmp(c->argv[1]->ptr,"reload")) {
11252 if (rdbSave(server.dbfilename) != REDIS_OK) {
11253 addReply(c,shared.err);
11254 return;
11255 }
11256 emptyDb();
11257 if (rdbLoad(server.dbfilename) != REDIS_OK) {
11258 addReply(c,shared.err);
11259 return;
11260 }
11261 redisLog(REDIS_WARNING,"DB reloaded by DEBUG RELOAD");
11262 addReply(c,shared.ok);
11263 } else if (!strcasecmp(c->argv[1]->ptr,"loadaof")) {
11264 emptyDb();
11265 if (loadAppendOnlyFile(server.appendfilename) != REDIS_OK) {
11266 addReply(c,shared.err);
11267 return;
11268 }
11269 redisLog(REDIS_WARNING,"Append Only File loaded by DEBUG LOADAOF");
11270 addReply(c,shared.ok);
11271 } else if (!strcasecmp(c->argv[1]->ptr,"object") && c->argc == 3) {
11272 dictEntry *de = dictFind(c->db->dict,c->argv[2]->ptr);
11273 robj *val;
11274
11275 if (!de) {
11276 addReply(c,shared.nokeyerr);
11277 return;
11278 }
11279 val = dictGetEntryVal(de);
11280 if (!server.vm_enabled || (val->storage == REDIS_VM_MEMORY ||
11281 val->storage == REDIS_VM_SWAPPING)) {
11282 char *strenc;
11283 char buf[128];
11284
11285 if (val->encoding < (sizeof(strencoding)/sizeof(char*))) {
11286 strenc = strencoding[val->encoding];
11287 } else {
11288 snprintf(buf,64,"unknown encoding %d\n", val->encoding);
11289 strenc = buf;
11290 }
11291 addReplySds(c,sdscatprintf(sdsempty(),
11292 "+Value at:%p refcount:%d "
11293 "encoding:%s serializedlength:%lld\r\n",
11294 (void*)val, val->refcount,
11295 strenc, (long long) rdbSavedObjectLen(val,NULL)));
11296 } else {
11297 vmpointer *vp = (vmpointer*) val;
11298 addReplySds(c,sdscatprintf(sdsempty(),
11299 "+Value swapped at: page %llu "
11300 "using %llu pages\r\n",
11301 (unsigned long long) vp->page,
11302 (unsigned long long) vp->usedpages));
11303 }
11304 } else if (!strcasecmp(c->argv[1]->ptr,"swapin") && c->argc == 3) {
11305 lookupKeyRead(c->db,c->argv[2]);
11306 addReply(c,shared.ok);
11307 } else if (!strcasecmp(c->argv[1]->ptr,"swapout") && c->argc == 3) {
11308 dictEntry *de = dictFind(c->db->dict,c->argv[2]->ptr);
11309 robj *val;
11310 vmpointer *vp;
11311
11312 if (!server.vm_enabled) {
11313 addReplySds(c,sdsnew("-ERR Virtual Memory is disabled\r\n"));
11314 return;
11315 }
11316 if (!de) {
11317 addReply(c,shared.nokeyerr);
11318 return;
11319 }
11320 val = dictGetEntryVal(de);
11321 /* Swap it */
11322 if (val->storage != REDIS_VM_MEMORY) {
11323 addReplySds(c,sdsnew("-ERR This key is not in memory\r\n"));
11324 } else if (val->refcount != 1) {
11325 addReplySds(c,sdsnew("-ERR Object is shared\r\n"));
11326 } else if ((vp = vmSwapObjectBlocking(val)) != NULL) {
11327 dictGetEntryVal(de) = vp;
11328 addReply(c,shared.ok);
11329 } else {
11330 addReply(c,shared.err);
11331 }
11332 } else if (!strcasecmp(c->argv[1]->ptr,"populate") && c->argc == 3) {
11333 long keys, j;
11334 robj *key, *val;
11335 char buf[128];
11336
11337 if (getLongFromObjectOrReply(c, c->argv[2], &keys, NULL) != REDIS_OK)
11338 return;
11339 for (j = 0; j < keys; j++) {
11340 snprintf(buf,sizeof(buf),"key:%lu",j);
11341 key = createStringObject(buf,strlen(buf));
11342 if (lookupKeyRead(c->db,key) != NULL) {
11343 decrRefCount(key);
11344 continue;
11345 }
11346 snprintf(buf,sizeof(buf),"value:%lu",j);
11347 val = createStringObject(buf,strlen(buf));
11348 dbAdd(c->db,key,val);
11349 decrRefCount(key);
11350 }
11351 addReply(c,shared.ok);
11352 } else if (!strcasecmp(c->argv[1]->ptr,"digest") && c->argc == 2) {
11353 unsigned char digest[20];
11354 sds d = sdsnew("+");
11355 int j;
11356
11357 computeDatasetDigest(digest);
11358 for (j = 0; j < 20; j++)
11359 d = sdscatprintf(d, "%02x",digest[j]);
11360
11361 d = sdscatlen(d,"\r\n",2);
11362 addReplySds(c,d);
11363 } else {
11364 addReplySds(c,sdsnew(
11365 "-ERR Syntax error, try DEBUG [SEGFAULT|OBJECT <key>|SWAPIN <key>|SWAPOUT <key>|RELOAD]\r\n"));
11366 }
11367 }
11368
11369 static void _redisAssert(char *estr, char *file, int line) {
11370 redisLog(REDIS_WARNING,"=== ASSERTION FAILED ===");
11371 redisLog(REDIS_WARNING,"==> %s:%d '%s' is not true",file,line,estr);
11372 #ifdef HAVE_BACKTRACE
11373 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
11374 *((char*)-1) = 'x';
11375 #endif
11376 }
11377
11378 static void _redisPanic(char *msg, char *file, int line) {
11379 redisLog(REDIS_WARNING,"!!! Software Failure. Press left mouse button to continue");
11380 redisLog(REDIS_WARNING,"Guru Meditation: %s #%s:%d",msg,file,line);
11381 #ifdef HAVE_BACKTRACE
11382 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
11383 *((char*)-1) = 'x';
11384 #endif
11385 }
11386
11387 /* =================================== Main! ================================ */
11388
11389 #ifdef __linux__
/* Read /proc/sys/vm/overcommit_memory and return its content as an integer.
 *
 * Returns -1 when the file cannot be opened or read, or when its first line
 * does not start with a number, so that an unreadable or unparsable value is
 * never confused with a real 0. */
int linuxOvercommitMemoryValue(void) {
    char buf[64];
    char *end;
    long value;
    FILE *fp = fopen("/proc/sys/vm/overcommit_memory","r");

    if (fp == NULL) return -1;
    if (fgets(buf,sizeof(buf),fp) == NULL) {
        fclose(fp);
        return -1;
    }
    fclose(fp);

    value = strtol(buf,&end,10);
    if (end == buf) return -1; /* no digits were consumed */
    return (int)value;
}
11403
11404 void linuxOvercommitMemoryWarning(void) {
11405 if (linuxOvercommitMemoryValue() == 0) {
11406 redisLog(REDIS_WARNING,"WARNING overcommit_memory is set to 0! Background save may fail under low memory condition. To fix this issue add 'vm.overcommit_memory = 1' to /etc/sysctl.conf and then reboot or run the command 'sysctl vm.overcommit_memory=1' for this to take effect.");
11407 }
11408 }
11409 #endif /* __linux__ */
11410
11411 static void daemonize(void) {
11412 int fd;
11413 FILE *fp;
11414
11415 if (fork() != 0) exit(0); /* parent exits */
11416 setsid(); /* create a new session */
11417
11418 /* Every output goes to /dev/null. If Redis is daemonized but
11419 * the 'logfile' is set to 'stdout' in the configuration file
11420 * it will not log at all. */
11421 if ((fd = open("/dev/null", O_RDWR, 0)) != -1) {
11422 dup2(fd, STDIN_FILENO);
11423 dup2(fd, STDOUT_FILENO);
11424 dup2(fd, STDERR_FILENO);
11425 if (fd > STDERR_FILENO) close(fd);
11426 }
11427 /* Try to write the pid file */
11428 fp = fopen(server.pidfile,"w");
11429 if (fp) {
11430 fprintf(fp,"%d\n",getpid());
11431 fclose(fp);
11432 }
11433 }
11434
11435 static void version() {
11436 printf("Redis server version %s (%s:%d)\n", REDIS_VERSION,
11437 REDIS_GIT_SHA1, atoi(REDIS_GIT_DIRTY) > 0);
11438 exit(0);
11439 }
11440
/* Print the command line synopsis on stderr and exit with status 1. */
static void usage() {
    fputs("Usage: ./redis-server [/path/to/redis.conf]\n", stderr);
    fputs(" ./redis-server - (read config from stdin)\n", stderr);
    exit(1);
}
11446
11447 int main(int argc, char **argv) {
11448 time_t start;
11449
11450 initServerConfig();
11451 sortCommandTable();
11452 if (argc == 2) {
11453 if (strcmp(argv[1], "-v") == 0 ||
11454 strcmp(argv[1], "--version") == 0) version();
11455 if (strcmp(argv[1], "--help") == 0) usage();
11456 resetServerSaveParams();
11457 loadServerConfig(argv[1]);
11458 } else if ((argc > 2)) {
11459 usage();
11460 } else {
11461 redisLog(REDIS_WARNING,"Warning: no config file specified, using the default config. In order to specify a config file use 'redis-server /path/to/redis.conf'");
11462 }
11463 if (server.daemonize) daemonize();
11464 initServer();
11465 redisLog(REDIS_NOTICE,"Server started, Redis version " REDIS_VERSION);
11466 #ifdef __linux__
11467 linuxOvercommitMemoryWarning();
11468 #endif
11469 start = time(NULL);
11470 if (server.appendonly) {
11471 if (loadAppendOnlyFile(server.appendfilename) == REDIS_OK)
11472 redisLog(REDIS_NOTICE,"DB loaded from append only file: %ld seconds",time(NULL)-start);
11473 } else {
11474 if (rdbLoad(server.dbfilename) == REDIS_OK)
11475 redisLog(REDIS_NOTICE,"DB loaded from disk: %ld seconds",time(NULL)-start);
11476 }
11477 redisLog(REDIS_NOTICE,"The server is now ready to accept connections on port %d", server.port);
11478 aeSetBeforeSleepProc(server.el,beforeSleep);
11479 aeMain(server.el);
11480 aeDeleteEventLoop(server.el);
11481 return 0;
11482 }
11483
11484 /* ============================= Backtrace support ========================= */
11485
11486 #ifdef HAVE_BACKTRACE
11487 static char *findFuncName(void *pointer, unsigned long *offset);
11488
11489 static void *getMcontextEip(ucontext_t *uc) {
11490 #if defined(__FreeBSD__)
11491 return (void*) uc->uc_mcontext.mc_eip;
11492 #elif defined(__dietlibc__)
11493 return (void*) uc->uc_mcontext.eip;
11494 #elif defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6)
11495 #if __x86_64__
11496 return (void*) uc->uc_mcontext->__ss.__rip;
11497 #else
11498 return (void*) uc->uc_mcontext->__ss.__eip;
11499 #endif
11500 #elif defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)
11501 #if defined(_STRUCT_X86_THREAD_STATE64) && !defined(__i386__)
11502 return (void*) uc->uc_mcontext->__ss.__rip;
11503 #else
11504 return (void*) uc->uc_mcontext->__ss.__eip;
11505 #endif
11506 #elif defined(__i386__) || defined(__X86_64__) || defined(__x86_64__)
11507 return (void*) uc->uc_mcontext.gregs[REG_EIP]; /* Linux 32/64 bit */
11508 #elif defined(__ia64__) /* Linux IA64 */
11509 return (void*) uc->uc_mcontext.sc_ip;
11510 #else
11511 return NULL;
11512 #endif
11513 }
11514
11515 static void segvHandler(int sig, siginfo_t *info, void *secret) {
11516 void *trace[100];
11517 char **messages = NULL;
11518 int i, trace_size = 0;
11519 unsigned long offset=0;
11520 ucontext_t *uc = (ucontext_t*) secret;
11521 sds infostring;
11522 REDIS_NOTUSED(info);
11523
11524 redisLog(REDIS_WARNING,
11525 "======= Ooops! Redis %s got signal: -%d- =======", REDIS_VERSION, sig);
11526 infostring = genRedisInfoString();
11527 redisLog(REDIS_WARNING, "%s",infostring);
11528 /* It's not safe to sdsfree() the returned string under memory
11529 * corruption conditions. Let it leak as we are going to abort */
11530
11531 trace_size = backtrace(trace, 100);
11532 /* overwrite sigaction with caller's address */
11533 if (getMcontextEip(uc) != NULL) {
11534 trace[1] = getMcontextEip(uc);
11535 }
11536 messages = backtrace_symbols(trace, trace_size);
11537
11538 for (i=1; i<trace_size; ++i) {
11539 char *fn = findFuncName(trace[i], &offset), *p;
11540
11541 p = strchr(messages[i],'+');
11542 if (!fn || (p && ((unsigned long)strtol(p+1,NULL,10)) < offset)) {
11543 redisLog(REDIS_WARNING,"%s", messages[i]);
11544 } else {
11545 redisLog(REDIS_WARNING,"%d redis-server %p %s + %d", i, trace[i], fn, (unsigned int)offset);
11546 }
11547 }
11548 /* free(messages); Don't call free() with possibly corrupted memory. */
11549 _exit(0);
11550 }
11551
11552 static void sigtermHandler(int sig) {
11553 REDIS_NOTUSED(sig);
11554
11555 redisLog(REDIS_WARNING,"SIGTERM received, scheduling shutting down...");
11556 server.shutdown_asap = 1;
11557 }
11558
11559 static void setupSigSegvAction(void) {
11560 struct sigaction act;
11561
11562 sigemptyset (&act.sa_mask);
11563 /* When the SA_SIGINFO flag is set in sa_flags then sa_sigaction
11564 * is used. Otherwise, sa_handler is used */
11565 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND | SA_SIGINFO;
11566 act.sa_sigaction = segvHandler;
11567 sigaction (SIGSEGV, &act, NULL);
11568 sigaction (SIGBUS, &act, NULL);
11569 sigaction (SIGFPE, &act, NULL);
11570 sigaction (SIGILL, &act, NULL);
11571 sigaction (SIGBUS, &act, NULL);
11572
11573 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND;
11574 act.sa_handler = sigtermHandler;
11575 sigaction (SIGTERM, &act, NULL);
11576 return;
11577 }
11578
11579 #include "staticsymbols.h"
11580 /* This function try to convert a pointer into a function name. It's used in
11581 * oreder to provide a backtrace under segmentation fault that's able to
11582 * display functions declared as static (otherwise the backtrace is useless). */
11583 static char *findFuncName(void *pointer, unsigned long *offset){
11584 int i, ret = -1;
11585 unsigned long off, minoff = 0;
11586
11587 /* Try to match against the Symbol with the smallest offset */
11588 for (i=0; symsTable[i].pointer; i++) {
11589 unsigned long lp = (unsigned long) pointer;
11590
11591 if (lp != (unsigned long)-1 && lp >= symsTable[i].pointer) {
11592 off=lp-symsTable[i].pointer;
11593 if (ret < 0 || off < minoff) {
11594 minoff=off;
11595 ret=i;
11596 }
11597 }
11598 }
11599 if (ret == -1) return NULL;
11600 *offset = minoff;
11601 return symsTable[ret].name;
11602 }
11603 #else /* HAVE_BACKTRACE */
/* Variant used when HAVE_BACKTRACE is not defined: it does nothing, so no
 * signal handler is installed. */
static void setupSigSegvAction(void) {
    return;
}
11606 #endif /* HAVE_BACKTRACE */
11607
11608
11609
11610 /* The End */
11611
11612
11613