]> git.saurik.com Git - redis.git/blob - redis.c
527b11ea05f54f93a843aefa3d779cdef8327d1e
[redis.git] / redis.c
1 /*
2 * Copyright (c) 2009-2010, Salvatore Sanfilippo <antirez at gmail dot com>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * * Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * * Neither the name of Redis nor the names of its contributors may be used
14 * to endorse or promote products derived from this software without
15 * specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 * POSSIBILITY OF SUCH DAMAGE.
28 */
29
30 #define REDIS_VERSION "1.3.7"
31
32 #include "fmacros.h"
33 #include "config.h"
34
35 #include <stdio.h>
36 #include <stdlib.h>
37 #include <string.h>
38 #include <time.h>
39 #include <unistd.h>
40 #define __USE_POSIX199309
41 #define __USE_UNIX98
42 #include <signal.h>
43
44 #ifdef HAVE_BACKTRACE
45 #include <execinfo.h>
46 #include <ucontext.h>
47 #endif /* HAVE_BACKTRACE */
48
49 #include <sys/wait.h>
50 #include <errno.h>
51 #include <assert.h>
52 #include <ctype.h>
53 #include <stdarg.h>
54 #include <inttypes.h>
55 #include <arpa/inet.h>
56 #include <sys/stat.h>
57 #include <fcntl.h>
58 #include <sys/time.h>
59 #include <sys/resource.h>
60 #include <sys/uio.h>
61 #include <limits.h>
62 #include <math.h>
63 #include <pthread.h>
64
65 #if defined(__sun)
66 #include "solarisfixes.h"
67 #endif
68
69 #include "redis.h"
70 #include "ae.h" /* Event driven programming library */
71 #include "sds.h" /* Dynamic safe strings */
72 #include "anet.h" /* Networking the easy way */
73 #include "dict.h" /* Hash tables */
74 #include "adlist.h" /* Linked lists */
75 #include "zmalloc.h" /* total memory usage aware version of malloc/free */
76 #include "lzf.h" /* LZF compression library */
77 #include "pqsort.h" /* Partial qsort for SORT+LIMIT */
78 #include "zipmap.h"
79
80 /* Error codes */
81 #define REDIS_OK 0
82 #define REDIS_ERR -1
83
84 /* Static server configuration */
85 #define REDIS_SERVERPORT 6379 /* TCP port */
86 #define REDIS_MAXIDLETIME (60*5) /* default client timeout */
87 #define REDIS_IOBUF_LEN 1024
88 #define REDIS_LOADBUF_LEN 1024
89 #define REDIS_STATIC_ARGS 4
90 #define REDIS_DEFAULT_DBNUM 16
91 #define REDIS_CONFIGLINE_MAX 1024
92 #define REDIS_OBJFREELIST_MAX 1000000 /* Max number of objects to cache */
93 #define REDIS_MAX_SYNC_TIME 60 /* Slave can't take more to sync */
94 #define REDIS_EXPIRELOOKUPS_PER_CRON 10 /* try to expire 10 keys/loop */
95 #define REDIS_MAX_WRITE_PER_EVENT (1024*64)
96 #define REDIS_REQUEST_MAX_SIZE (1024*1024*256) /* max bytes in inline command */
97
98 /* If more than REDIS_WRITEV_THRESHOLD write packets are pending use writev */
99 #define REDIS_WRITEV_THRESHOLD 3
100 /* Max number of iovecs used for each writev call */
101 #define REDIS_WRITEV_IOVEC_COUNT 256
102
103 /* Hash table parameters */
104 #define REDIS_HT_MINFILL 10 /* Minimal hash table fill 10% */
105
106 /* Command flags */
107 #define REDIS_CMD_BULK 1 /* Bulk write command */
108 #define REDIS_CMD_INLINE 2 /* Inline command */
109 /* REDIS_CMD_DENYOOM deserves a longer comment: all the commands marked with
110 this flag will return an error when the 'maxmemory' option is set in the
111 config file and the server is using more than maxmemory bytes of memory.
112 In short these commands are denied on low memory conditions. */
113 #define REDIS_CMD_DENYOOM 4
114
115 /* Object types */
116 #define REDIS_STRING 0
117 #define REDIS_LIST 1
118 #define REDIS_SET 2
119 #define REDIS_ZSET 3
120 #define REDIS_HASH 4
121
/* Object encodings. Some kinds of objects, like Strings and Hashes, can be
 * internally represented in multiple ways. The 'encoding' field of the object
 * is set to one of the following values. */
#define REDIS_ENCODING_RAW 0    /* Raw representation */
#define REDIS_ENCODING_INT 1    /* Encoded as integer */
#define REDIS_ENCODING_ZIPMAP 2 /* Encoded as zipmap */
#define REDIS_ENCODING_HT 3     /* Encoded as a hash table */

/* Human readable name of every encoding, indexed by REDIS_ENCODING_*.
 * Designated initializers tie each name to its constant, so the table cannot
 * silently get out of sync if the constants are renumbered or reordered. */
static char *strencoding[] = {
    [REDIS_ENCODING_RAW] = "raw",
    [REDIS_ENCODING_INT] = "int",
    [REDIS_ENCODING_ZIPMAP] = "zipmap",
    [REDIS_ENCODING_HT] = "hashtable"
};
133
134 /* Object types only used for dumping to disk */
135 #define REDIS_EXPIRETIME 253
136 #define REDIS_SELECTDB 254
137 #define REDIS_EOF 255
138
139 /* Defines related to the dump file format. To store 32 bits lengths for short
140 * keys requires a lot of space, so we check the most significant 2 bits of
141 * the first byte to interpret the length:
142 *
143 * 00|000000 => if the two MSB are 00 the len is the 6 bits of this byte
144 * 01|000000 00000000 => 01, the len is 14 bits, 6 bits + 8 bits of next byte
145 * 10|000000 [32 bit integer] => if it's 10, a full 32 bit len will follow
146 * 11|000000 this means: specially encoded object will follow. The six bits
147 * number specify the kind of object that follows.
148 * See the REDIS_RDB_ENC_* defines.
149 *
150 * Lengths up to 63 are stored using a single byte, most DB keys, and many
151 * values, will fit inside. */
152 #define REDIS_RDB_6BITLEN 0
153 #define REDIS_RDB_14BITLEN 1
154 #define REDIS_RDB_32BITLEN 2
155 #define REDIS_RDB_ENCVAL 3
156 #define REDIS_RDB_LENERR UINT_MAX
157
158 /* When a length of a string object stored on disk has the first two bits
159 * set, the remaining six bits specify a special encoding for the object
160 * according to the following defines: */
161 #define REDIS_RDB_ENC_INT8 0 /* 8 bit signed integer */
162 #define REDIS_RDB_ENC_INT16 1 /* 16 bit signed integer */
163 #define REDIS_RDB_ENC_INT32 2 /* 32 bit signed integer */
164 #define REDIS_RDB_ENC_LZF 3 /* string compressed with FASTLZ */
165
166 /* Virtual memory object->where field. */
167 #define REDIS_VM_MEMORY 0 /* The object is on memory */
168 #define REDIS_VM_SWAPPED 1 /* The object is on disk */
169 #define REDIS_VM_SWAPPING 2 /* Redis is swapping this object on disk */
170 #define REDIS_VM_LOADING 3 /* Redis is loading this object from disk */
171
172 /* Virtual memory static configuration stuff.
173 * Check vmFindContiguousPages() to know more about these magic numbers. */
174 #define REDIS_VM_MAX_NEAR_PAGES 65536
175 #define REDIS_VM_MAX_RANDOM_JUMP 4096
176 #define REDIS_VM_MAX_THREADS 32
177 #define REDIS_THREAD_STACK_SIZE (1024*1024*4)
178 /* The following is the *percentage* of completed I/O jobs to process when the
179 * handler is called. While Virtual Memory I/O operations are performed by
180 * threads, these operations must be processed by the main thread when completed
181 * in order to take effect. */
182 #define REDIS_MAX_COMPLETED_JOBS_PROCESSED 1
183
184 /* Client flags */
185 #define REDIS_SLAVE 1 /* This client is a slave server */
186 #define REDIS_MASTER 2 /* This client is a master server */
187 #define REDIS_MONITOR 4 /* This client is a slave monitor, see MONITOR */
188 #define REDIS_MULTI 8 /* This client is in a MULTI context */
189 #define REDIS_BLOCKED 16 /* The client is waiting in a blocking operation */
190 #define REDIS_IO_WAIT 32 /* The client is waiting for Virtual Memory I/O */
191
192 /* Slave replication state - slave side */
193 #define REDIS_REPL_NONE 0 /* No active replication */
194 #define REDIS_REPL_CONNECT 1 /* Must connect to master */
195 #define REDIS_REPL_CONNECTED 2 /* Connected to master */
196
197 /* Slave replication state - from the point of view of master
198 * Note that in SEND_BULK and ONLINE state the slave receives new updates
199 * in its output queue. In the WAIT_BGSAVE state instead the server is waiting
200 * to start the next background saving in order to send updates to it. */
201 #define REDIS_REPL_WAIT_BGSAVE_START 3 /* master waits bgsave to start feeding it */
202 #define REDIS_REPL_WAIT_BGSAVE_END 4 /* master waits bgsave to start bulk DB transmission */
203 #define REDIS_REPL_SEND_BULK 5 /* master is sending the bulk DB */
204 #define REDIS_REPL_ONLINE 6 /* bulk DB already transmitted, receive updates */
205
206 /* List related stuff */
207 #define REDIS_HEAD 0
208 #define REDIS_TAIL 1
209
210 /* Sort operations */
211 #define REDIS_SORT_GET 0
212 #define REDIS_SORT_ASC 1
213 #define REDIS_SORT_DESC 2
214 #define REDIS_SORTKEY_MAX 1024
215
216 /* Log levels */
217 #define REDIS_DEBUG 0
218 #define REDIS_VERBOSE 1
219 #define REDIS_NOTICE 2
220 #define REDIS_WARNING 3
221
/* Anti-warning macro: evaluates V and discards the result, so that an
 * otherwise unused parameter or variable does not trigger a compiler warning.
 * The argument is parenthesized so the cast covers the whole expression: with
 * a bare "(void) V", REDIS_NOTUSED(a + b) would expand to "(void) a + b". */
#define REDIS_NOTUSED(V) ((void)(V))
224
225 #define ZSKIPLIST_MAXLEVEL 32 /* Should be enough for 2^32 elements */
226 #define ZSKIPLIST_P 0.25 /* Skiplist P = 1/4 */
227
228 /* Append only defines */
229 #define APPENDFSYNC_NO 0
230 #define APPENDFSYNC_ALWAYS 1
231 #define APPENDFSYNC_EVERYSEC 2
232
233 /* Hashes related defaults */
234 #define REDIS_HASH_MAX_ZIPMAP_ENTRIES 64
235 #define REDIS_HASH_MAX_ZIPMAP_VALUE 512
236
/* Custom assert. When the expression is false, the stringified expression,
 * the file name and the line number are handed to _redisAssert(), then the
 * process is terminated with _exit(1). When it is true nothing happens.
 * (Per the original note, this form is used so a stack trace can be
 * printed.) */
static void _redisAssert(char *estr, char *file, int line);
#define redisAssert(_e) \
    ((_e) ? (void)0 : (_redisAssert(#_e, __FILE__, __LINE__), _exit(1)))
240
241 /*================================= Data types ============================== */
242
243 /* A redis object, that is a type able to hold a string / list / set */
244
/* The VM object structure: swap-file location of an object and its last
 * access time. It is embedded as the 'vm' member of struct redisObject.
 *
 * Only the type is declared here. The declaration used to end with a
 * declarator ("} vm;"), which additionally defined a non-static global
 * variable called 'vm' that had nothing to do with any object. */
struct redisObjectVM {
    off_t page;      /* the page at which the object is stored on disk */
    off_t usedpages; /* number of pages used on disk */
    time_t atime;    /* Last access time */
};
251
252 /* The actual Redis Object */
253 typedef struct redisObject {
254 void *ptr;
255 unsigned char type;
256 unsigned char encoding;
257 unsigned char storage; /* If this object is a key, where is the value?
258 * REDIS_VM_MEMORY, REDIS_VM_SWAPPED, ... */
259 unsigned char vtype; /* If this object is a key, and value is swapped out,
260 * this is the type of the swapped out object. */
261 int refcount;
262 /* VM fields, this are only allocated if VM is active, otherwise the
263 * object allocation function will just allocate
264 * sizeof(redisObjct) minus sizeof(redisObjectVM), so using
265 * Redis without VM active will not have any overhead. */
266 struct redisObjectVM vm;
267 } robj;
268
/* Macro used to initialize a Redis object allocated on the stack as a raw
 * string object pointing to _ptr, with a reference count of 1. The 'storage'
 * field is only set (to REDIS_VM_MEMORY) when server.vm_enabled is true.
 *
 * Note that this macro is kept near the structure definition to make sure
 * we'll update it when the structure is changed, to avoid bugs like
 * bug #85 introduced exactly in this way.
 *
 * _var must be an lvalue of struct type (not a pointer); it is evaluated
 * several times, so it must not have side effects.
 *
 * The expansion is a do { ... } while(0) WITHOUT a trailing semicolon: the
 * caller supplies it. With the semicolon inside the macro, a call such as
 * "if (x) initStaticStringObject(o,p); else ..." expanded to two statements
 * and the 'else' no longer compiled. Both arguments are parenthesized so
 * that arbitrary expressions can be passed. */
#define initStaticStringObject(_var,_ptr) do { \
    (_var).refcount = 1; \
    (_var).type = REDIS_STRING; \
    (_var).encoding = REDIS_ENCODING_RAW; \
    (_var).ptr = (_ptr); \
    if (server.vm_enabled) (_var).storage = REDIS_VM_MEMORY; \
} while(0)
280
281 typedef struct redisDb {
282 dict *dict; /* The keyspace for this DB */
283 dict *expires; /* Timeout of keys with a timeout set */
284 dict *blockingkeys; /* Keys with clients waiting for data (BLPOP) */
285 dict *io_keys; /* Keys with clients waiting for VM I/O */
286 int id;
287 } redisDb;
288
289 /* Client MULTI/EXEC state */
290 typedef struct multiCmd {
291 robj **argv;
292 int argc;
293 struct redisCommand *cmd;
294 } multiCmd;
295
296 typedef struct multiState {
297 multiCmd *commands; /* Array of MULTI commands */
298 int count; /* Total number of MULTI commands */
299 } multiState;
300
301 /* With multiplexing we need to take per-clinet state.
302 * Clients are taken in a liked list. */
303 typedef struct redisClient {
304 int fd;
305 redisDb *db;
306 int dictid;
307 sds querybuf;
308 robj **argv, **mbargv;
309 int argc, mbargc;
310 int bulklen; /* bulk read len. -1 if not in bulk read mode */
311 int multibulk; /* multi bulk command format active */
312 list *reply;
313 int sentlen;
314 time_t lastinteraction; /* time of the last interaction, used for timeout */
315 int flags; /* REDIS_SLAVE | REDIS_MONITOR | REDIS_MULTI ... */
316 int slaveseldb; /* slave selected db, if this client is a slave */
317 int authenticated; /* when requirepass is non-NULL */
318 int replstate; /* replication state if this is a slave */
319 int repldbfd; /* replication DB file descriptor */
320 long repldboff; /* replication DB file offset */
321 off_t repldbsize; /* replication DB file size */
322 multiState mstate; /* MULTI/EXEC state */
323 robj **blockingkeys; /* The key we are waiting to terminate a blocking
324 * operation such as BLPOP. Otherwise NULL. */
325 int blockingkeysnum; /* Number of blocking keys */
326 time_t blockingto; /* Blocking operation timeout. If UNIX current time
327 * is >= blockingto then the operation timed out. */
328 list *io_keys; /* Keys this client is waiting to be loaded from the
329 * swap file in order to continue. */
330 } redisClient;
331
/* One (seconds, changes) pair; the server keeps an array of these in its
 * 'saveparams' field. */
struct saveparam {
    time_t seconds; /* number of seconds */
    int changes;    /* number of changes */
};
336
337 /* Global server state structure */
338 struct redisServer {
339 int port;
340 int fd;
341 redisDb *db;
342 dict *sharingpool; /* Poll used for object sharing */
343 unsigned int sharingpoolsize;
344 long long dirty; /* changes to DB from the last save */
345 list *clients;
346 list *slaves, *monitors;
347 char neterr[ANET_ERR_LEN];
348 aeEventLoop *el;
349 int cronloops; /* number of times the cron function run */
350 list *objfreelist; /* A list of freed objects to avoid malloc() */
351 time_t lastsave; /* Unix time of last save succeeede */
352 /* Fields used only for stats */
353 time_t stat_starttime; /* server start time */
354 long long stat_numcommands; /* number of processed commands */
355 long long stat_numconnections; /* number of connections received */
356 long long stat_expiredkeys; /* number of expired keys */
357 /* Configuration */
358 int verbosity;
359 int glueoutputbuf;
360 int maxidletime;
361 int dbnum;
362 int daemonize;
363 int appendonly;
364 int appendfsync;
365 time_t lastfsync;
366 int appendfd;
367 int appendseldb;
368 char *pidfile;
369 pid_t bgsavechildpid;
370 pid_t bgrewritechildpid;
371 sds bgrewritebuf; /* buffer taken by parent during oppend only rewrite */
372 struct saveparam *saveparams;
373 int saveparamslen;
374 char *logfile;
375 char *bindaddr;
376 char *dbfilename;
377 char *appendfilename;
378 char *requirepass;
379 int shareobjects;
380 int rdbcompression;
381 /* Replication related */
382 int isslave;
383 char *masterauth;
384 char *masterhost;
385 int masterport;
386 redisClient *master; /* client that is master for this slave */
387 int replstate;
388 unsigned int maxclients;
389 unsigned long long maxmemory;
390 unsigned int blpop_blocked_clients;
391 unsigned int vm_blocked_clients;
392 /* Sort parameters - qsort_r() is only available under BSD so we
393 * have to take this state global, in order to pass it to sortCompare() */
394 int sort_desc;
395 int sort_alpha;
396 int sort_bypattern;
397 /* Virtual memory configuration */
398 int vm_enabled;
399 char *vm_swap_file;
400 off_t vm_page_size;
401 off_t vm_pages;
402 unsigned long long vm_max_memory;
403 /* Hashes config */
404 size_t hash_max_zipmap_entries;
405 size_t hash_max_zipmap_value;
406 /* Virtual memory state */
407 FILE *vm_fp;
408 int vm_fd;
409 off_t vm_next_page; /* Next probably empty page */
410 off_t vm_near_pages; /* Number of pages allocated sequentially */
411 unsigned char *vm_bitmap; /* Bitmap of free/used pages */
412 time_t unixtime; /* Unix time sampled every second. */
413 /* Virtual memory I/O threads stuff */
414 /* An I/O thread process an element taken from the io_jobs queue and
415 * put the result of the operation in the io_done list. While the
416 * job is being processed, it's put on io_processing queue. */
417 list *io_newjobs; /* List of VM I/O jobs yet to be processed */
418 list *io_processing; /* List of VM I/O jobs being processed */
419 list *io_processed; /* List of VM I/O jobs already processed */
420 list *io_ready_clients; /* Clients ready to be unblocked. All keys loaded */
421 pthread_mutex_t io_mutex; /* lock to access io_jobs/io_done/io_thread_job */
422 pthread_mutex_t obj_freelist_mutex; /* safe redis objects creation/free */
423 pthread_mutex_t io_swapfile_mutex; /* So we can lseek + write */
424 pthread_attr_t io_threads_attr; /* attributes for threads creation */
425 int io_active_threads; /* Number of running I/O threads */
426 int vm_max_threads; /* Max number of I/O threads running at the same time */
427 /* Our main thread is blocked on the event loop, locking for sockets ready
428 * to be read or written, so when a threaded I/O operation is ready to be
429 * processed by the main thread, the I/O thread will use a unix pipe to
430 * awake the main thread. The followings are the two pipe FDs. */
431 int io_ready_pipe_read;
432 int io_ready_pipe_write;
433 /* Virtual memory stats */
434 unsigned long long vm_stats_used_pages;
435 unsigned long long vm_stats_swapped_objects;
436 unsigned long long vm_stats_swapouts;
437 unsigned long long vm_stats_swapins;
438 FILE *devnull;
439 };
440
441 typedef void redisCommandProc(redisClient *c);
442 struct redisCommand {
443 char *name;
444 redisCommandProc *proc;
445 int arity;
446 int flags;
447 /* Use a function to determine which keys need to be loaded
448 * in the background prior to executing this command. Takes precedence
449 * over vm_firstkey and others, ignored when NULL */
450 redisCommandProc *vm_preload_proc;
451 /* What keys should be loaded in background when calling this command? */
452 int vm_firstkey; /* The first argument that's a key (0 = no keys) */
453 int vm_lastkey; /* THe last argument that's a key */
454 int vm_keystep; /* The step between first and last key */
455 };
456
/* A name paired with a numeric 'pointer' value.
 * NOTE(review): presumably a function symbol and its address - confirm
 * against the code that fills this structure. */
struct redisFunctionSym {
    char *name;
    unsigned long pointer;
};
461
462 typedef struct _redisSortObject {
463 robj *obj;
464 union {
465 double score;
466 robj *cmpobj;
467 } u;
468 } redisSortObject;
469
470 typedef struct _redisSortOperation {
471 int type;
472 robj *pattern;
473 } redisSortOperation;
474
475 /* ZSETs use a specialized version of Skiplists */
476
477 typedef struct zskiplistNode {
478 struct zskiplistNode **forward;
479 struct zskiplistNode *backward;
480 unsigned int *span;
481 double score;
482 robj *obj;
483 } zskiplistNode;
484
/* The skiplist itself: header and tail nodes, length and level. */
typedef struct zskiplist {
    struct zskiplistNode *header;
    struct zskiplistNode *tail;
    unsigned long length;
    int level;
} zskiplist;
490
491 typedef struct zset {
492 dict *dict;
493 zskiplist *zsl;
494 } zset;
495
496 /* Our shared "common" objects */
497
498 struct sharedObjectsStruct {
499 robj *crlf, *ok, *err, *emptybulk, *czero, *cone, *pong, *space,
500 *colon, *nullbulk, *nullmultibulk, *queued,
501 *emptymultibulk, *wrongtypeerr, *nokeyerr, *syntaxerr, *sameobjecterr,
502 *outofrangeerr, *plus,
503 *select0, *select1, *select2, *select3, *select4,
504 *select5, *select6, *select7, *select8, *select9;
505 } shared;
506
/* Global vars that are actually used as constants. The following double
 * values are used for double on-disk serialization, and are initialized
 * at runtime to avoid strange compiler optimizations. */

static double R_Zero;
static double R_PosInf;
static double R_NegInf;
static double R_Nan;
512
513 /* VM threaded I/O request message */
514 #define REDIS_IOJOB_LOAD 0 /* Load from disk to memory */
515 #define REDIS_IOJOB_PREPARE_SWAP 1 /* Compute needed pages */
516 #define REDIS_IOJOB_DO_SWAP 2 /* Swap from memory to disk */
517 typedef struct iojob {
518 int type; /* Request type, REDIS_IOJOB_* */
519 redisDb *db;/* Redis database */
520 robj *key; /* This I/O request is about swapping this key */
521 robj *val; /* the value to swap for REDIS_IOREQ_*_SWAP, otherwise this
522 * field is populated by the I/O thread for REDIS_IOREQ_LOAD. */
523 off_t page; /* Swap page where to read/write the object */
524 off_t pages; /* Swap pages needed to safe object. PREPARE_SWAP return val */
525 int canceled; /* True if this command was canceled by blocking side of VM */
526 pthread_t thread; /* ID of the thread processing this entry */
527 } iojob;
528
529 /*================================ Prototypes =============================== */
530
531 static void freeStringObject(robj *o);
532 static void freeListObject(robj *o);
533 static void freeSetObject(robj *o);
534 static void decrRefCount(void *o);
535 static robj *createObject(int type, void *ptr);
536 static void freeClient(redisClient *c);
537 static int rdbLoad(char *filename);
538 static void addReply(redisClient *c, robj *obj);
539 static void addReplySds(redisClient *c, sds s);
540 static void incrRefCount(robj *o);
541 static int rdbSaveBackground(char *filename);
542 static robj *createStringObject(char *ptr, size_t len);
543 static robj *dupStringObject(robj *o);
544 static void replicationFeedSlaves(list *slaves, struct redisCommand *cmd, int dictid, robj **argv, int argc);
545 static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc);
546 static int syncWithMaster(void);
547 static robj *tryObjectSharing(robj *o);
548 static int tryObjectEncoding(robj *o);
549 static robj *getDecodedObject(robj *o);
550 static int removeExpire(redisDb *db, robj *key);
551 static int expireIfNeeded(redisDb *db, robj *key);
552 static int deleteIfVolatile(redisDb *db, robj *key);
553 static int deleteIfSwapped(redisDb *db, robj *key);
554 static int deleteKey(redisDb *db, robj *key);
555 static time_t getExpire(redisDb *db, robj *key);
556 static int setExpire(redisDb *db, robj *key, time_t when);
557 static void updateSlavesWaitingBgsave(int bgsaveerr);
558 static void freeMemoryIfNeeded(void);
559 static int processCommand(redisClient *c);
560 static void setupSigSegvAction(void);
561 static void rdbRemoveTempFile(pid_t childpid);
562 static void aofRemoveTempFile(pid_t childpid);
563 static size_t stringObjectLen(robj *o);
564 static void processInputBuffer(redisClient *c);
565 static zskiplist *zslCreate(void);
566 static void zslFree(zskiplist *zsl);
567 static void zslInsert(zskiplist *zsl, double score, robj *obj);
568 static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask);
569 static void initClientMultiState(redisClient *c);
570 static void freeClientMultiState(redisClient *c);
571 static void queueMultiCommand(redisClient *c, struct redisCommand *cmd);
572 static void unblockClientWaitingData(redisClient *c);
573 static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele);
574 static void vmInit(void);
575 static void vmMarkPagesFree(off_t page, off_t count);
576 static robj *vmLoadObject(robj *key);
577 static robj *vmPreviewObject(robj *key);
578 static int vmSwapOneObjectBlocking(void);
579 static int vmSwapOneObjectThreaded(void);
580 static int vmCanSwapOut(void);
581 static int tryFreeOneObjectFromFreelist(void);
582 static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask);
583 static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata, int mask);
584 static void vmCancelThreadedIOJob(robj *o);
585 static void lockThreadedIO(void);
586 static void unlockThreadedIO(void);
587 static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db);
588 static void freeIOJob(iojob *j);
589 static void queueIOJob(iojob *j);
590 static int vmWriteObjectOnSwap(robj *o, off_t page);
591 static robj *vmReadObjectFromSwap(off_t page, int type);
592 static void waitEmptyIOJobsQueue(void);
593 static void vmReopenSwapFile(void);
594 static int vmFreePage(off_t page);
595 static void zunionInterBlockClientOnSwappedKeys(redisClient *c);
596 static int blockClientOnSwappedKeys(struct redisCommand *cmd, redisClient *c);
597 static int dontWaitForSwappedKey(redisClient *c, robj *key);
598 static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key);
599 static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask);
600 static struct redisCommand *lookupCommand(char *name);
601 static void call(redisClient *c, struct redisCommand *cmd);
602 static void resetClient(redisClient *c);
603 static void convertToRealHash(robj *o);
604
605 static void authCommand(redisClient *c);
606 static void pingCommand(redisClient *c);
607 static void echoCommand(redisClient *c);
608 static void setCommand(redisClient *c);
609 static void setnxCommand(redisClient *c);
610 static void getCommand(redisClient *c);
611 static void delCommand(redisClient *c);
612 static void existsCommand(redisClient *c);
613 static void incrCommand(redisClient *c);
614 static void decrCommand(redisClient *c);
615 static void incrbyCommand(redisClient *c);
616 static void decrbyCommand(redisClient *c);
617 static void selectCommand(redisClient *c);
618 static void randomkeyCommand(redisClient *c);
619 static void keysCommand(redisClient *c);
620 static void dbsizeCommand(redisClient *c);
621 static void lastsaveCommand(redisClient *c);
622 static void saveCommand(redisClient *c);
623 static void bgsaveCommand(redisClient *c);
624 static void bgrewriteaofCommand(redisClient *c);
625 static void shutdownCommand(redisClient *c);
626 static void moveCommand(redisClient *c);
627 static void renameCommand(redisClient *c);
628 static void renamenxCommand(redisClient *c);
629 static void lpushCommand(redisClient *c);
630 static void rpushCommand(redisClient *c);
631 static void lpopCommand(redisClient *c);
632 static void rpopCommand(redisClient *c);
633 static void llenCommand(redisClient *c);
634 static void lindexCommand(redisClient *c);
635 static void lrangeCommand(redisClient *c);
636 static void ltrimCommand(redisClient *c);
637 static void typeCommand(redisClient *c);
638 static void lsetCommand(redisClient *c);
639 static void saddCommand(redisClient *c);
640 static void sremCommand(redisClient *c);
641 static void smoveCommand(redisClient *c);
642 static void sismemberCommand(redisClient *c);
643 static void scardCommand(redisClient *c);
644 static void spopCommand(redisClient *c);
645 static void srandmemberCommand(redisClient *c);
646 static void sinterCommand(redisClient *c);
647 static void sinterstoreCommand(redisClient *c);
648 static void sunionCommand(redisClient *c);
649 static void sunionstoreCommand(redisClient *c);
650 static void sdiffCommand(redisClient *c);
651 static void sdiffstoreCommand(redisClient *c);
652 static void syncCommand(redisClient *c);
653 static void flushdbCommand(redisClient *c);
654 static void flushallCommand(redisClient *c);
655 static void sortCommand(redisClient *c);
656 static void lremCommand(redisClient *c);
657 static void rpoplpushcommand(redisClient *c);
658 static void infoCommand(redisClient *c);
659 static void mgetCommand(redisClient *c);
660 static void monitorCommand(redisClient *c);
661 static void expireCommand(redisClient *c);
662 static void expireatCommand(redisClient *c);
663 static void getsetCommand(redisClient *c);
664 static void ttlCommand(redisClient *c);
665 static void slaveofCommand(redisClient *c);
666 static void debugCommand(redisClient *c);
667 static void msetCommand(redisClient *c);
668 static void msetnxCommand(redisClient *c);
669 static void zaddCommand(redisClient *c);
670 static void zincrbyCommand(redisClient *c);
671 static void zrangeCommand(redisClient *c);
672 static void zrangebyscoreCommand(redisClient *c);
673 static void zcountCommand(redisClient *c);
674 static void zrevrangeCommand(redisClient *c);
675 static void zcardCommand(redisClient *c);
676 static void zremCommand(redisClient *c);
677 static void zscoreCommand(redisClient *c);
678 static void zremrangebyscoreCommand(redisClient *c);
679 static void multiCommand(redisClient *c);
680 static void execCommand(redisClient *c);
681 static void discardCommand(redisClient *c);
682 static void blpopCommand(redisClient *c);
683 static void brpopCommand(redisClient *c);
684 static void appendCommand(redisClient *c);
685 static void substrCommand(redisClient *c);
686 static void zrankCommand(redisClient *c);
687 static void zrevrankCommand(redisClient *c);
688 static void hsetCommand(redisClient *c);
689 static void hgetCommand(redisClient *c);
690 static void hdelCommand(redisClient *c);
691 static void hlenCommand(redisClient *c);
692 static void zremrangebyrankCommand(redisClient *c);
693 static void zunionCommand(redisClient *c);
694 static void zinterCommand(redisClient *c);
695 static void hkeysCommand(redisClient *c);
696 static void hvalsCommand(redisClient *c);
697 static void hgetallCommand(redisClient *c);
698 static void hexistsCommand(redisClient *c);
699 static void configCommand(redisClient *c);
700 static void hincrbyCommand(redisClient *c);
701
702 /*================================= Globals ================================= */
703
704 /* Global vars */
705 static struct redisServer server; /* server global state */
706 static struct redisCommand cmdTable[] = {
707 {"get",getCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
708 {"set",setCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
709 {"setnx",setnxCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
710 {"append",appendCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
711 {"substr",substrCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
712 {"del",delCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
713 {"exists",existsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
714 {"incr",incrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
715 {"decr",decrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
716 {"mget",mgetCommand,-2,REDIS_CMD_INLINE,NULL,1,-1,1},
717 {"rpush",rpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
718 {"lpush",lpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
719 {"rpop",rpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
720 {"lpop",lpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
721 {"brpop",brpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
722 {"blpop",blpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
723 {"llen",llenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
724 {"lindex",lindexCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
725 {"lset",lsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
726 {"lrange",lrangeCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
727 {"ltrim",ltrimCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
728 {"lrem",lremCommand,4,REDIS_CMD_BULK,NULL,1,1,1},
729 {"rpoplpush",rpoplpushcommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,2,1},
730 {"sadd",saddCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
731 {"srem",sremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
732 {"smove",smoveCommand,4,REDIS_CMD_BULK,NULL,1,2,1},
733 {"sismember",sismemberCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
734 {"scard",scardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
735 {"spop",spopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
736 {"srandmember",srandmemberCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
737 {"sinter",sinterCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
738 {"sinterstore",sinterstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
739 {"sunion",sunionCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
740 {"sunionstore",sunionstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
741 {"sdiff",sdiffCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
742 {"sdiffstore",sdiffstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
743 {"smembers",sinterCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
744 {"zadd",zaddCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
745 {"zincrby",zincrbyCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
746 {"zrem",zremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
747 {"zremrangebyscore",zremrangebyscoreCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
748 {"zremrangebyrank",zremrangebyrankCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
749 {"zunion",zunionCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
750 {"zinter",zinterCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
751 {"zrange",zrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
752 {"zrangebyscore",zrangebyscoreCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
753 {"zcount",zcountCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
754 {"zrevrange",zrevrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
755 {"zcard",zcardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
756 {"zscore",zscoreCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
757 {"zrank",zrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
758 {"zrevrank",zrevrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
759 {"hset",hsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
760 {"hincrby",hincrbyCommand,4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
761 {"hget",hgetCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
762 {"hdel",hdelCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
763 {"hlen",hlenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
764 {"hkeys",hkeysCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
765 {"hvals",hvalsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
766 {"hgetall",hgetallCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
767 {"hexists",hexistsCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
768 {"incrby",incrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
769 {"decrby",decrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
770 {"getset",getsetCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
771 {"mset",msetCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
772 {"msetnx",msetnxCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
773 {"randomkey",randomkeyCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
774 {"select",selectCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
775 {"move",moveCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
776 {"rename",renameCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
777 {"renamenx",renamenxCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
778 {"expire",expireCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
779 {"expireat",expireatCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
780 {"keys",keysCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
781 {"dbsize",dbsizeCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
782 {"auth",authCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
783 {"ping",pingCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
784 {"echo",echoCommand,2,REDIS_CMD_BULK,NULL,0,0,0},
785 {"save",saveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
786 {"bgsave",bgsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
787 {"bgrewriteaof",bgrewriteaofCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
788 {"shutdown",shutdownCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
789 {"lastsave",lastsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
790 {"type",typeCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
791 {"multi",multiCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
792 {"exec",execCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
793 {"discard",discardCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
794 {"sync",syncCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
795 {"flushdb",flushdbCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
796 {"flushall",flushallCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
797 {"sort",sortCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
798 {"info",infoCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
799 {"monitor",monitorCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
800 {"ttl",ttlCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
801 {"slaveof",slaveofCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
802 {"debug",debugCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
803 {"config",configCommand,-2,REDIS_CMD_BULK,NULL,0,0,0},
804 {NULL,NULL,0,0,NULL,0,0,0}
805 };
806
807 static void usage();
808
809 /*============================ Utility functions ============================ */
810
/* Glob-style pattern matching.
 *
 * Matches the first 'stringLen' bytes of 'string' against the first
 * 'patternLen' bytes of 'pattern'. Supported syntax:
 *
 *   *       any sequence of bytes, including the empty one
 *   ?       exactly one byte
 *   [abc]   one byte out of a set; [^abc] negates the set, a-z is a range
 *           and \x inside the set takes the byte x literally
 *   \x      the byte x taken literally
 *
 * When 'nocase' is non-zero, literal bytes and ranges are compared case
 * insensitively (an escaped byte inside a set is always compared exactly).
 * Bytes are compared as unsigned char so that the result does not depend on
 * the signedness of plain char.
 *
 * Returns 1 on match, 0 otherwise. Both lengths must be non-negative; no
 * byte at or past the given lengths is ever read, so the buffers do not
 * need to be NUL terminated. */
static int stringmatchlen(const char *pattern, int patternLen,
        const char *string, int stringLen, int nocase)
{
    while (patternLen) {
        switch (pattern[0]) {
        case '*':
            /* A run of consecutive stars is equivalent to a single one. */
            while (patternLen > 1 && pattern[1] == '*') {
                pattern++;
                patternLen--;
            }
            if (patternLen == 1)
                return 1; /* match: trailing star eats the rest */
            /* Try to match the rest of the pattern at every offset. */
            while (stringLen) {
                if (stringmatchlen(pattern+1, patternLen-1,
                            string, stringLen, nocase))
                    return 1; /* match */
                string++;
                stringLen--;
            }
            return 0; /* no match */
        case '?':
            if (stringLen == 0)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        case '[':
        {
            int not, match;

            /* A set always consumes one byte: without this check a negated
             * set would "match" the end of the string and drive stringLen
             * below zero. */
            if (stringLen == 0)
                return 0; /* no match */
            pattern++;
            patternLen--;
            not = patternLen > 0 && pattern[0] == '^';
            if (not) {
                pattern++;
                patternLen--;
            }
            match = 0;
            while (1) {
                if (patternLen == 0) {
                    /* Unterminated set: step back so that the common
                     * advance after the switch lands exactly on the end. */
                    pattern--;
                    patternLen++;
                    break;
                } else if (pattern[0] == '\\' && patternLen >= 2) {
                    pattern++;
                    patternLen--;
                    if (pattern[0] == string[0])
                        match = 1;
                } else if (pattern[0] == ']') {
                    break;
                } else if (patternLen >= 3 && pattern[1] == '-') {
                    int start = (unsigned char)pattern[0];
                    int end = (unsigned char)pattern[2];
                    int c = (unsigned char)string[0];

                    if (start > end) {
                        int t = start;
                        start = end;
                        end = t;
                    }
                    if (nocase) {
                        start = tolower(start);
                        end = tolower(end);
                        c = tolower(c);
                    }
                    pattern += 2;
                    patternLen -= 2;
                    if (c >= start && c <= end)
                        match = 1;
                } else {
                    if (!nocase) {
                        if (pattern[0] == string[0])
                            match = 1;
                    } else {
                        if (tolower((unsigned char)pattern[0]) ==
                            tolower((unsigned char)string[0]))
                            match = 1;
                    }
                }
                pattern++;
                patternLen--;
            }
            if (not)
                match = !match;
            if (!match)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        }
        case '\\':
            /* A trailing backslash stands for itself. */
            if (patternLen >= 2) {
                pattern++;
                patternLen--;
            }
            /* fall through */
        default:
            if (stringLen == 0)
                return 0; /* no match: a literal needs one byte */
            if (!nocase) {
                if (pattern[0] != string[0])
                    return 0; /* no match */
            } else {
                if (tolower((unsigned char)pattern[0]) !=
                    tolower((unsigned char)string[0]))
                    return 0; /* no match */
            }
            string++;
            stringLen--;
            break;
        }
        pattern++;
        patternLen--;
        if (stringLen == 0) {
            /* The string is over: only trailing stars may remain. */
            while (patternLen > 0 && *pattern == '*') {
                pattern++;
                patternLen--;
            }
            break;
        }
    }
    if (patternLen == 0 && stringLen == 0)
        return 1;
    return 0;
}
933
/* Glob-style matching for NUL terminated C strings: measures both arguments
 * with strlen() and hands them to stringmatchlen(), returning its result. */
static int stringmatch(const char *pattern, const char *string, int nocase) {
    int patternLen = strlen(pattern);
    int stringLen = strlen(string);

    return stringmatchlen(pattern, patternLen, string, stringLen, nocase);
}
937
938 static void redisLog(int level, const char *fmt, ...) {
939 va_list ap;
940 FILE *fp;
941
942 fp = (server.logfile == NULL) ? stdout : fopen(server.logfile,"a");
943 if (!fp) return;
944
945 va_start(ap, fmt);
946 if (level >= server.verbosity) {
947 char *c = ".-*#";
948 char buf[64];
949 time_t now;
950
951 now = time(NULL);
952 strftime(buf,64,"%d %b %H:%M:%S",localtime(&now));
953 fprintf(fp,"[%d] %s %c ",(int)getpid(),buf,c[level]);
954 vfprintf(fp, fmt, ap);
955 fprintf(fp,"\n");
956 fflush(fp);
957 }
958 va_end(ap);
959
960 if (server.logfile) fclose(fp);
961 }
962
963 /*====================== Hash table type implementation ==================== */
964
/* This is a hash table type that uses the SDS dynamic strings library as
 * keys and redis objects as values (objects can hold SDS strings,
 * lists, sets). */
968
/* Dictionary destructor callback that releases 'val' with zfree().
 * 'privdata' is not used. */
static void dictVanillaFree(void *privdata, void *val) {
    DICT_NOTUSED(privdata);

    zfree(val);
}
974
975 static void dictListDestructor(void *privdata, void *val)
976 {
977 DICT_NOTUSED(privdata);
978 listRelease((list*)val);
979 }
980
981 static int sdsDictKeyCompare(void *privdata, const void *key1,
982 const void *key2)
983 {
984 int l1,l2;
985 DICT_NOTUSED(privdata);
986
987 l1 = sdslen((sds)key1);
988 l2 = sdslen((sds)key2);
989 if (l1 != l2) return 0;
990 return memcmp(key1, key2, l1) == 0;
991 }
992
/* Dictionary destructor callback for redis objects: drops one reference
 * with decrRefCount(). 'privdata' is not used. */
static void dictRedisObjectDestructor(void *privdata, void *val) {
    DICT_NOTUSED(privdata);

    /* Values of swapped out keys are set to NULL: nothing to release. */
    if (val != NULL) decrRefCount(val);
}
1000
1001 static int dictObjKeyCompare(void *privdata, const void *key1,
1002 const void *key2)
1003 {
1004 const robj *o1 = key1, *o2 = key2;
1005 return sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
1006 }
1007
1008 static unsigned int dictObjHash(const void *key) {
1009 const robj *o = key;
1010 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1011 }
1012
1013 static int dictEncObjKeyCompare(void *privdata, const void *key1,
1014 const void *key2)
1015 {
1016 robj *o1 = (robj*) key1, *o2 = (robj*) key2;
1017 int cmp;
1018
1019 if (o1->encoding == REDIS_ENCODING_INT &&
1020 o2->encoding == REDIS_ENCODING_INT &&
1021 o1->ptr == o2->ptr) return 1;
1022
1023 o1 = getDecodedObject(o1);
1024 o2 = getDecodedObject(o2);
1025 cmp = sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
1026 decrRefCount(o1);
1027 decrRefCount(o2);
1028 return cmp;
1029 }
1030
1031 static unsigned int dictEncObjHash(const void *key) {
1032 robj *o = (robj*) key;
1033
1034 if (o->encoding == REDIS_ENCODING_RAW) {
1035 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1036 } else {
1037 if (o->encoding == REDIS_ENCODING_INT) {
1038 char buf[32];
1039 int len;
1040
1041 len = snprintf(buf,32,"%ld",(long)o->ptr);
1042 return dictGenHashFunction((unsigned char*)buf, len);
1043 } else {
1044 unsigned int hash;
1045
1046 o = getDecodedObject(o);
1047 hash = dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1048 decrRefCount(o);
1049 return hash;
1050 }
1051 }
1052 }
1053
1054 /* Sets type and expires */
1055 static dictType setDictType = {
1056 dictEncObjHash, /* hash function */
1057 NULL, /* key dup */
1058 NULL, /* val dup */
1059 dictEncObjKeyCompare, /* key compare */
1060 dictRedisObjectDestructor, /* key destructor */
1061 NULL /* val destructor */
1062 };
1063
1064 /* Sorted sets hash (note: a skiplist is used in addition to the hash table) */
1065 static dictType zsetDictType = {
1066 dictEncObjHash, /* hash function */
1067 NULL, /* key dup */
1068 NULL, /* val dup */
1069 dictEncObjKeyCompare, /* key compare */
1070 dictRedisObjectDestructor, /* key destructor */
1071 dictVanillaFree /* val destructor of malloc(sizeof(double)) */
1072 };
1073
1074 /* Db->dict */
1075 static dictType dbDictType = {
1076 dictObjHash, /* hash function */
1077 NULL, /* key dup */
1078 NULL, /* val dup */
1079 dictObjKeyCompare, /* key compare */
1080 dictRedisObjectDestructor, /* key destructor */
1081 dictRedisObjectDestructor /* val destructor */
1082 };
1083
1084 /* Db->expires */
1085 static dictType keyptrDictType = {
1086 dictObjHash, /* hash function */
1087 NULL, /* key dup */
1088 NULL, /* val dup */
1089 dictObjKeyCompare, /* key compare */
1090 dictRedisObjectDestructor, /* key destructor */
1091 NULL /* val destructor */
1092 };
1093
1094 /* Hash type hash table (note that small hashes are represented with zimpaps) */
1095 static dictType hashDictType = {
1096 dictEncObjHash, /* hash function */
1097 NULL, /* key dup */
1098 NULL, /* val dup */
1099 dictEncObjKeyCompare, /* key compare */
1100 dictRedisObjectDestructor, /* key destructor */
1101 dictRedisObjectDestructor /* val destructor */
1102 };
1103
1104 /* Keylist hash table type has unencoded redis objects as keys and
1105 * lists as values. It's used for blocking operations (BLPOP) and to
1106 * map swapped keys to a list of clients waiting for this keys to be loaded. */
1107 static dictType keylistDictType = {
1108 dictObjHash, /* hash function */
1109 NULL, /* key dup */
1110 NULL, /* val dup */
1111 dictObjKeyCompare, /* key compare */
1112 dictRedisObjectDestructor, /* key destructor */
1113 dictListDestructor /* val destructor */
1114 };
1115
1116 static void version();
1117
1118 /* ========================= Random utility functions ======================= */
1119
1120 /* Redis generally does not try to recover from out of memory conditions
1121 * when allocating objects or strings, it is not clear if it will be possible
1122 * to report this condition to the client since the networking layer itself
1123 * is based on heap allocation for send buffers, so we simply abort.
1124 * At least the code will be simpler to read... */
1125 static void oom(const char *msg) {
1126 redisLog(REDIS_WARNING, "%s: Out of memory\n",msg);
1127 sleep(1);
1128 abort();
1129 }
1130
1131 /* ====================== Redis server networking stuff ===================== */
1132 static void closeTimedoutClients(void) {
1133 redisClient *c;
1134 listNode *ln;
1135 time_t now = time(NULL);
1136 listIter li;
1137
1138 listRewind(server.clients,&li);
1139 while ((ln = listNext(&li)) != NULL) {
1140 c = listNodeValue(ln);
1141 if (server.maxidletime &&
1142 !(c->flags & REDIS_SLAVE) && /* no timeout for slaves */
1143 !(c->flags & REDIS_MASTER) && /* no timeout for masters */
1144 (now - c->lastinteraction > server.maxidletime))
1145 {
1146 redisLog(REDIS_VERBOSE,"Closing idle client");
1147 freeClient(c);
1148 } else if (c->flags & REDIS_BLOCKED) {
1149 if (c->blockingto != 0 && c->blockingto < now) {
1150 addReply(c,shared.nullmultibulk);
1151 unblockClientWaitingData(c);
1152 }
1153 }
1154 }
1155 }
1156
1157 static int htNeedsResize(dict *dict) {
1158 long long size, used;
1159
1160 size = dictSlots(dict);
1161 used = dictSize(dict);
1162 return (size && used && size > DICT_HT_INITIAL_SIZE &&
1163 (used*100/size < REDIS_HT_MINFILL));
1164 }
1165
1166 /* If the percentage of used slots in the HT reaches REDIS_HT_MINFILL
1167 * we resize the hash table to save memory */
1168 static void tryResizeHashTables(void) {
1169 int j;
1170
1171 for (j = 0; j < server.dbnum; j++) {
1172 if (htNeedsResize(server.db[j].dict)) {
1173 redisLog(REDIS_VERBOSE,"The hash table %d is too sparse, resize it...",j);
1174 dictResize(server.db[j].dict);
1175 redisLog(REDIS_VERBOSE,"Hash table %d resized.",j);
1176 }
1177 if (htNeedsResize(server.db[j].expires))
1178 dictResize(server.db[j].expires);
1179 }
1180 }
1181
1182 /* A background saving child (BGSAVE) terminated its work. Handle this. */
1183 void backgroundSaveDoneHandler(int statloc) {
1184 int exitcode = WEXITSTATUS(statloc);
1185 int bysignal = WIFSIGNALED(statloc);
1186
1187 if (!bysignal && exitcode == 0) {
1188 redisLog(REDIS_NOTICE,
1189 "Background saving terminated with success");
1190 server.dirty = 0;
1191 server.lastsave = time(NULL);
1192 } else if (!bysignal && exitcode != 0) {
1193 redisLog(REDIS_WARNING, "Background saving error");
1194 } else {
1195 redisLog(REDIS_WARNING,
1196 "Background saving terminated by signal");
1197 rdbRemoveTempFile(server.bgsavechildpid);
1198 }
1199 server.bgsavechildpid = -1;
1200 /* Possibly there are slaves waiting for a BGSAVE in order to be served
1201 * (the first stage of SYNC is a bulk transfer of dump.rdb) */
1202 updateSlavesWaitingBgsave(exitcode == 0 ? REDIS_OK : REDIS_ERR);
1203 }
1204
1205 /* A background append only file rewriting (BGREWRITEAOF) terminated its work.
1206 * Handle this. */
1207 void backgroundRewriteDoneHandler(int statloc) {
1208 int exitcode = WEXITSTATUS(statloc);
1209 int bysignal = WIFSIGNALED(statloc);
1210
1211 if (!bysignal && exitcode == 0) {
1212 int fd;
1213 char tmpfile[256];
1214
1215 redisLog(REDIS_NOTICE,
1216 "Background append only file rewriting terminated with success");
1217 /* Now it's time to flush the differences accumulated by the parent */
1218 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) server.bgrewritechildpid);
1219 fd = open(tmpfile,O_WRONLY|O_APPEND);
1220 if (fd == -1) {
1221 redisLog(REDIS_WARNING, "Not able to open the temp append only file produced by the child: %s", strerror(errno));
1222 goto cleanup;
1223 }
1224 /* Flush our data... */
1225 if (write(fd,server.bgrewritebuf,sdslen(server.bgrewritebuf)) !=
1226 (signed) sdslen(server.bgrewritebuf)) {
1227 redisLog(REDIS_WARNING, "Error or short write trying to flush the parent diff of the append log file in the child temp file: %s", strerror(errno));
1228 close(fd);
1229 goto cleanup;
1230 }
1231 redisLog(REDIS_NOTICE,"Parent diff flushed into the new append log file with success (%lu bytes)",sdslen(server.bgrewritebuf));
1232 /* Now our work is to rename the temp file into the stable file. And
1233 * switch the file descriptor used by the server for append only. */
1234 if (rename(tmpfile,server.appendfilename) == -1) {
1235 redisLog(REDIS_WARNING,"Can't rename the temp append only file into the stable one: %s", strerror(errno));
1236 close(fd);
1237 goto cleanup;
1238 }
1239 /* Mission completed... almost */
1240 redisLog(REDIS_NOTICE,"Append only file successfully rewritten.");
1241 if (server.appendfd != -1) {
1242 /* If append only is actually enabled... */
1243 close(server.appendfd);
1244 server.appendfd = fd;
1245 fsync(fd);
1246 server.appendseldb = -1; /* Make sure it will issue SELECT */
1247 redisLog(REDIS_NOTICE,"The new append only file was selected for future appends.");
1248 } else {
1249 /* If append only is disabled we just generate a dump in this
1250 * format. Why not? */
1251 close(fd);
1252 }
1253 } else if (!bysignal && exitcode != 0) {
1254 redisLog(REDIS_WARNING, "Background append only file rewriting error");
1255 } else {
1256 redisLog(REDIS_WARNING,
1257 "Background append only file rewriting terminated by signal");
1258 }
1259 cleanup:
1260 sdsfree(server.bgrewritebuf);
1261 server.bgrewritebuf = sdsempty();
1262 aofRemoveTempFile(server.bgrewritechildpid);
1263 server.bgrewritechildpid = -1;
1264 }
1265
1266 static int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
1267 int j, loops = server.cronloops++;
1268 REDIS_NOTUSED(eventLoop);
1269 REDIS_NOTUSED(id);
1270 REDIS_NOTUSED(clientData);
1271
1272 /* We take a cached value of the unix time in the global state because
1273 * with virtual memory and aging there is to store the current time
1274 * in objects at every object access, and accuracy is not needed.
1275 * To access a global var is faster than calling time(NULL) */
1276 server.unixtime = time(NULL);
1277
1278 /* Show some info about non-empty databases */
1279 for (j = 0; j < server.dbnum; j++) {
1280 long long size, used, vkeys;
1281
1282 size = dictSlots(server.db[j].dict);
1283 used = dictSize(server.db[j].dict);
1284 vkeys = dictSize(server.db[j].expires);
1285 if (!(loops % 50) && (used || vkeys)) {
1286 redisLog(REDIS_VERBOSE,"DB %d: %lld keys (%lld volatile) in %lld slots HT.",j,used,vkeys,size);
1287 /* dictPrintStats(server.dict); */
1288 }
1289 }
1290
1291 /* We don't want to resize the hash tables while a bacground saving
1292 * is in progress: the saving child is created using fork() that is
1293 * implemented with a copy-on-write semantic in most modern systems, so
1294 * if we resize the HT while there is the saving child at work actually
1295 * a lot of memory movements in the parent will cause a lot of pages
1296 * copied. */
1297 if (server.bgsavechildpid == -1 && !(loops % 10)) tryResizeHashTables();
1298
1299 /* Show information about connected clients */
1300 if (!(loops % 50)) {
1301 redisLog(REDIS_VERBOSE,"%d clients connected (%d slaves), %zu bytes in use, %d shared objects",
1302 listLength(server.clients)-listLength(server.slaves),
1303 listLength(server.slaves),
1304 zmalloc_used_memory(),
1305 dictSize(server.sharingpool));
1306 }
1307
1308 /* Close connections of timedout clients */
1309 if ((server.maxidletime && !(loops % 100)) || server.blpop_blocked_clients)
1310 closeTimedoutClients();
1311
1312 /* Check if a background saving or AOF rewrite in progress terminated */
1313 if (server.bgsavechildpid != -1 || server.bgrewritechildpid != -1) {
1314 int statloc;
1315 pid_t pid;
1316
1317 if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) {
1318 if (pid == server.bgsavechildpid) {
1319 backgroundSaveDoneHandler(statloc);
1320 } else {
1321 backgroundRewriteDoneHandler(statloc);
1322 }
1323 }
1324 } else {
1325 /* If there is not a background saving in progress check if
1326 * we have to save now */
1327 time_t now = time(NULL);
1328 for (j = 0; j < server.saveparamslen; j++) {
1329 struct saveparam *sp = server.saveparams+j;
1330
1331 if (server.dirty >= sp->changes &&
1332 now-server.lastsave > sp->seconds) {
1333 redisLog(REDIS_NOTICE,"%d changes in %d seconds. Saving...",
1334 sp->changes, sp->seconds);
1335 rdbSaveBackground(server.dbfilename);
1336 break;
1337 }
1338 }
1339 }
1340
1341 /* Try to expire a few timed out keys. The algorithm used is adaptive and
1342 * will use few CPU cycles if there are few expiring keys, otherwise
1343 * it will get more aggressive to avoid that too much memory is used by
1344 * keys that can be removed from the keyspace. */
1345 for (j = 0; j < server.dbnum; j++) {
1346 int expired;
1347 redisDb *db = server.db+j;
1348
1349 /* Continue to expire if at the end of the cycle more than 25%
1350 * of the keys were expired. */
1351 do {
1352 long num = dictSize(db->expires);
1353 time_t now = time(NULL);
1354
1355 expired = 0;
1356 if (num > REDIS_EXPIRELOOKUPS_PER_CRON)
1357 num = REDIS_EXPIRELOOKUPS_PER_CRON;
1358 while (num--) {
1359 dictEntry *de;
1360 time_t t;
1361
1362 if ((de = dictGetRandomKey(db->expires)) == NULL) break;
1363 t = (time_t) dictGetEntryVal(de);
1364 if (now > t) {
1365 deleteKey(db,dictGetEntryKey(de));
1366 expired++;
1367 server.stat_expiredkeys++;
1368 }
1369 }
1370 } while (expired > REDIS_EXPIRELOOKUPS_PER_CRON/4);
1371 }
1372
1373 /* Swap a few keys on disk if we are over the memory limit and VM
1374 * is enbled. Try to free objects from the free list first. */
1375 if (vmCanSwapOut()) {
1376 while (server.vm_enabled && zmalloc_used_memory() >
1377 server.vm_max_memory)
1378 {
1379 int retval;
1380
1381 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
1382 retval = (server.vm_max_threads == 0) ?
1383 vmSwapOneObjectBlocking() :
1384 vmSwapOneObjectThreaded();
1385 if (retval == REDIS_ERR && !(loops % 300) &&
1386 zmalloc_used_memory() >
1387 (server.vm_max_memory+server.vm_max_memory/10))
1388 {
1389 redisLog(REDIS_WARNING,"WARNING: vm-max-memory limit exceeded by more than 10%% but unable to swap more objects out!");
1390 }
1391 /* Note that when using threade I/O we free just one object,
1392 * because anyway when the I/O thread in charge to swap this
1393 * object out will finish, the handler of completed jobs
1394 * will try to swap more objects if we are still out of memory. */
1395 if (retval == REDIS_ERR || server.vm_max_threads > 0) break;
1396 }
1397 }
1398
1399 /* Check if we should connect to a MASTER */
1400 if (server.replstate == REDIS_REPL_CONNECT && !(loops % 10)) {
1401 redisLog(REDIS_NOTICE,"Connecting to MASTER...");
1402 if (syncWithMaster() == REDIS_OK) {
1403 redisLog(REDIS_NOTICE,"MASTER <-> SLAVE sync succeeded");
1404 }
1405 }
1406 return 100;
1407 }
1408
1409 /* This function gets called every time Redis is entering the
1410 * main loop of the event driven library, that is, before to sleep
1411 * for ready file descriptors. */
1412 static void beforeSleep(struct aeEventLoop *eventLoop) {
1413 REDIS_NOTUSED(eventLoop);
1414
1415 if (server.vm_enabled && listLength(server.io_ready_clients)) {
1416 listIter li;
1417 listNode *ln;
1418
1419 listRewind(server.io_ready_clients,&li);
1420 while((ln = listNext(&li))) {
1421 redisClient *c = ln->value;
1422 struct redisCommand *cmd;
1423
1424 /* Resume the client. */
1425 listDelNode(server.io_ready_clients,ln);
1426 c->flags &= (~REDIS_IO_WAIT);
1427 server.vm_blocked_clients--;
1428 aeCreateFileEvent(server.el, c->fd, AE_READABLE,
1429 readQueryFromClient, c);
1430 cmd = lookupCommand(c->argv[0]->ptr);
1431 assert(cmd != NULL);
1432 call(c,cmd);
1433 resetClient(c);
1434 /* There may be more data to process in the input buffer. */
1435 if (c->querybuf && sdslen(c->querybuf) > 0)
1436 processInputBuffer(c);
1437 }
1438 }
1439 }
1440
1441 static void createSharedObjects(void) {
1442 shared.crlf = createObject(REDIS_STRING,sdsnew("\r\n"));
1443 shared.ok = createObject(REDIS_STRING,sdsnew("+OK\r\n"));
1444 shared.err = createObject(REDIS_STRING,sdsnew("-ERR\r\n"));
1445 shared.emptybulk = createObject(REDIS_STRING,sdsnew("$0\r\n\r\n"));
1446 shared.czero = createObject(REDIS_STRING,sdsnew(":0\r\n"));
1447 shared.cone = createObject(REDIS_STRING,sdsnew(":1\r\n"));
1448 shared.nullbulk = createObject(REDIS_STRING,sdsnew("$-1\r\n"));
1449 shared.nullmultibulk = createObject(REDIS_STRING,sdsnew("*-1\r\n"));
1450 shared.emptymultibulk = createObject(REDIS_STRING,sdsnew("*0\r\n"));
1451 shared.pong = createObject(REDIS_STRING,sdsnew("+PONG\r\n"));
1452 shared.queued = createObject(REDIS_STRING,sdsnew("+QUEUED\r\n"));
1453 shared.wrongtypeerr = createObject(REDIS_STRING,sdsnew(
1454 "-ERR Operation against a key holding the wrong kind of value\r\n"));
1455 shared.nokeyerr = createObject(REDIS_STRING,sdsnew(
1456 "-ERR no such key\r\n"));
1457 shared.syntaxerr = createObject(REDIS_STRING,sdsnew(
1458 "-ERR syntax error\r\n"));
1459 shared.sameobjecterr = createObject(REDIS_STRING,sdsnew(
1460 "-ERR source and destination objects are the same\r\n"));
1461 shared.outofrangeerr = createObject(REDIS_STRING,sdsnew(
1462 "-ERR index out of range\r\n"));
1463 shared.space = createObject(REDIS_STRING,sdsnew(" "));
1464 shared.colon = createObject(REDIS_STRING,sdsnew(":"));
1465 shared.plus = createObject(REDIS_STRING,sdsnew("+"));
1466 shared.select0 = createStringObject("select 0\r\n",10);
1467 shared.select1 = createStringObject("select 1\r\n",10);
1468 shared.select2 = createStringObject("select 2\r\n",10);
1469 shared.select3 = createStringObject("select 3\r\n",10);
1470 shared.select4 = createStringObject("select 4\r\n",10);
1471 shared.select5 = createStringObject("select 5\r\n",10);
1472 shared.select6 = createStringObject("select 6\r\n",10);
1473 shared.select7 = createStringObject("select 7\r\n",10);
1474 shared.select8 = createStringObject("select 8\r\n",10);
1475 shared.select9 = createStringObject("select 9\r\n",10);
1476 }
1477
1478 static void appendServerSaveParams(time_t seconds, int changes) {
1479 server.saveparams = zrealloc(server.saveparams,sizeof(struct saveparam)*(server.saveparamslen+1));
1480 server.saveparams[server.saveparamslen].seconds = seconds;
1481 server.saveparams[server.saveparamslen].changes = changes;
1482 server.saveparamslen++;
1483 }
1484
1485 static void resetServerSaveParams() {
1486 zfree(server.saveparams);
1487 server.saveparams = NULL;
1488 server.saveparamslen = 0;
1489 }
1490
1491 static void initServerConfig() {
1492 server.dbnum = REDIS_DEFAULT_DBNUM;
1493 server.port = REDIS_SERVERPORT;
1494 server.verbosity = REDIS_VERBOSE;
1495 server.maxidletime = REDIS_MAXIDLETIME;
1496 server.saveparams = NULL;
1497 server.logfile = NULL; /* NULL = log on standard output */
1498 server.bindaddr = NULL;
1499 server.glueoutputbuf = 1;
1500 server.daemonize = 0;
1501 server.appendonly = 0;
1502 server.appendfsync = APPENDFSYNC_ALWAYS;
1503 server.lastfsync = time(NULL);
1504 server.appendfd = -1;
1505 server.appendseldb = -1; /* Make sure the first time will not match */
1506 server.pidfile = zstrdup("/var/run/redis.pid");
1507 server.dbfilename = zstrdup("dump.rdb");
1508 server.appendfilename = zstrdup("appendonly.aof");
1509 server.requirepass = NULL;
1510 server.shareobjects = 0;
1511 server.rdbcompression = 1;
1512 server.sharingpoolsize = 1024;
1513 server.maxclients = 0;
1514 server.blpop_blocked_clients = 0;
1515 server.maxmemory = 0;
1516 server.vm_enabled = 0;
1517 server.vm_swap_file = zstrdup("/tmp/redis-%p.vm");
1518 server.vm_page_size = 256; /* 256 bytes per page */
1519 server.vm_pages = 1024*1024*100; /* 104 millions of pages */
1520 server.vm_max_memory = 1024LL*1024*1024*1; /* 1 GB of RAM */
1521 server.vm_max_threads = 4;
1522 server.vm_blocked_clients = 0;
1523 server.hash_max_zipmap_entries = REDIS_HASH_MAX_ZIPMAP_ENTRIES;
1524 server.hash_max_zipmap_value = REDIS_HASH_MAX_ZIPMAP_VALUE;
1525
1526 resetServerSaveParams();
1527
1528 appendServerSaveParams(60*60,1); /* save after 1 hour and 1 change */
1529 appendServerSaveParams(300,100); /* save after 5 minutes and 100 changes */
1530 appendServerSaveParams(60,10000); /* save after 1 minute and 10000 changes */
1531 /* Replication related */
1532 server.isslave = 0;
1533 server.masterauth = NULL;
1534 server.masterhost = NULL;
1535 server.masterport = 6379;
1536 server.master = NULL;
1537 server.replstate = REDIS_REPL_NONE;
1538
1539 /* Double constants initialization */
1540 R_Zero = 0.0;
1541 R_PosInf = 1.0/R_Zero;
1542 R_NegInf = -1.0/R_Zero;
1543 R_Nan = R_Zero/R_Zero;
1544 }
1545
1546 static void initServer() {
1547 int j;
1548
1549 signal(SIGHUP, SIG_IGN);
1550 signal(SIGPIPE, SIG_IGN);
1551 setupSigSegvAction();
1552
1553 server.devnull = fopen("/dev/null","w");
1554 if (server.devnull == NULL) {
1555 redisLog(REDIS_WARNING, "Can't open /dev/null: %s", server.neterr);
1556 exit(1);
1557 }
1558 server.clients = listCreate();
1559 server.slaves = listCreate();
1560 server.monitors = listCreate();
1561 server.objfreelist = listCreate();
1562 createSharedObjects();
1563 server.el = aeCreateEventLoop();
1564 server.db = zmalloc(sizeof(redisDb)*server.dbnum);
1565 server.sharingpool = dictCreate(&setDictType,NULL);
1566 server.fd = anetTcpServer(server.neterr, server.port, server.bindaddr);
1567 if (server.fd == -1) {
1568 redisLog(REDIS_WARNING, "Opening TCP port: %s", server.neterr);
1569 exit(1);
1570 }
1571 for (j = 0; j < server.dbnum; j++) {
1572 server.db[j].dict = dictCreate(&dbDictType,NULL);
1573 server.db[j].expires = dictCreate(&keyptrDictType,NULL);
1574 server.db[j].blockingkeys = dictCreate(&keylistDictType,NULL);
1575 if (server.vm_enabled)
1576 server.db[j].io_keys = dictCreate(&keylistDictType,NULL);
1577 server.db[j].id = j;
1578 }
1579 server.cronloops = 0;
1580 server.bgsavechildpid = -1;
1581 server.bgrewritechildpid = -1;
1582 server.bgrewritebuf = sdsempty();
1583 server.lastsave = time(NULL);
1584 server.dirty = 0;
1585 server.stat_numcommands = 0;
1586 server.stat_numconnections = 0;
1587 server.stat_expiredkeys = 0;
1588 server.stat_starttime = time(NULL);
1589 server.unixtime = time(NULL);
1590 aeCreateTimeEvent(server.el, 1, serverCron, NULL, NULL);
1591 if (aeCreateFileEvent(server.el, server.fd, AE_READABLE,
1592 acceptHandler, NULL) == AE_ERR) oom("creating file event");
1593
1594 if (server.appendonly) {
1595 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
1596 if (server.appendfd == -1) {
1597 redisLog(REDIS_WARNING, "Can't open the append-only file: %s",
1598 strerror(errno));
1599 exit(1);
1600 }
1601 }
1602
1603 if (server.vm_enabled) vmInit();
1604 }
1605
1606 /* Empty the whole database */
1607 static long long emptyDb() {
1608 int j;
1609 long long removed = 0;
1610
1611 for (j = 0; j < server.dbnum; j++) {
1612 removed += dictSize(server.db[j].dict);
1613 dictEmpty(server.db[j].dict);
1614 dictEmpty(server.db[j].expires);
1615 }
1616 return removed;
1617 }
1618
/* Convert a "yes" / "no" string (compared case insensitively) into 1 / 0.
 * Any other string yields -1. */
static int yesnotoi(char *s) {
    if (strcasecmp(s,"yes") == 0) return 1;
    if (strcasecmp(s,"no") == 0) return 0;
    return -1;
}
1624
1625 /* I agree, this is a very rudimental way to load a configuration...
1626 will improve later if the config gets more complex */
1627 static void loadServerConfig(char *filename) {
1628 FILE *fp;
1629 char buf[REDIS_CONFIGLINE_MAX+1], *err = NULL;
1630 int linenum = 0;
1631 sds line = NULL;
1632 char *errormsg = "Fatal error, can't open config file '%s'";
1633 char *errorbuf = zmalloc(sizeof(char)*(strlen(errormsg)+strlen(filename)));
1634 sprintf(errorbuf, errormsg, filename);
1635
1636 if (filename[0] == '-' && filename[1] == '\0')
1637 fp = stdin;
1638 else {
1639 if ((fp = fopen(filename,"r")) == NULL) {
1640 redisLog(REDIS_WARNING, errorbuf);
1641 exit(1);
1642 }
1643 }
1644
1645 while(fgets(buf,REDIS_CONFIGLINE_MAX+1,fp) != NULL) {
1646 sds *argv;
1647 int argc, j;
1648
1649 linenum++;
1650 line = sdsnew(buf);
1651 line = sdstrim(line," \t\r\n");
1652
1653 /* Skip comments and blank lines*/
1654 if (line[0] == '#' || line[0] == '\0') {
1655 sdsfree(line);
1656 continue;
1657 }
1658
1659 /* Split into arguments */
1660 argv = sdssplitlen(line,sdslen(line)," ",1,&argc);
1661 sdstolower(argv[0]);
1662
1663 /* Execute config directives */
1664 if (!strcasecmp(argv[0],"timeout") && argc == 2) {
1665 server.maxidletime = atoi(argv[1]);
1666 if (server.maxidletime < 0) {
1667 err = "Invalid timeout value"; goto loaderr;
1668 }
1669 } else if (!strcasecmp(argv[0],"port") && argc == 2) {
1670 server.port = atoi(argv[1]);
1671 if (server.port < 1 || server.port > 65535) {
1672 err = "Invalid port"; goto loaderr;
1673 }
1674 } else if (!strcasecmp(argv[0],"bind") && argc == 2) {
1675 server.bindaddr = zstrdup(argv[1]);
1676 } else if (!strcasecmp(argv[0],"save") && argc == 3) {
1677 int seconds = atoi(argv[1]);
1678 int changes = atoi(argv[2]);
1679 if (seconds < 1 || changes < 0) {
1680 err = "Invalid save parameters"; goto loaderr;
1681 }
1682 appendServerSaveParams(seconds,changes);
1683 } else if (!strcasecmp(argv[0],"dir") && argc == 2) {
1684 if (chdir(argv[1]) == -1) {
1685 redisLog(REDIS_WARNING,"Can't chdir to '%s': %s",
1686 argv[1], strerror(errno));
1687 exit(1);
1688 }
1689 } else if (!strcasecmp(argv[0],"loglevel") && argc == 2) {
1690 if (!strcasecmp(argv[1],"debug")) server.verbosity = REDIS_DEBUG;
1691 else if (!strcasecmp(argv[1],"verbose")) server.verbosity = REDIS_VERBOSE;
1692 else if (!strcasecmp(argv[1],"notice")) server.verbosity = REDIS_NOTICE;
1693 else if (!strcasecmp(argv[1],"warning")) server.verbosity = REDIS_WARNING;
1694 else {
1695 err = "Invalid log level. Must be one of debug, notice, warning";
1696 goto loaderr;
1697 }
1698 } else if (!strcasecmp(argv[0],"logfile") && argc == 2) {
1699 FILE *logfp;
1700
1701 server.logfile = zstrdup(argv[1]);
1702 if (!strcasecmp(server.logfile,"stdout")) {
1703 zfree(server.logfile);
1704 server.logfile = NULL;
1705 }
1706 if (server.logfile) {
1707 /* Test if we are able to open the file. The server will not
1708 * be able to abort just for this problem later... */
1709 logfp = fopen(server.logfile,"a");
1710 if (logfp == NULL) {
1711 err = sdscatprintf(sdsempty(),
1712 "Can't open the log file: %s", strerror(errno));
1713 goto loaderr;
1714 }
1715 fclose(logfp);
1716 }
1717 } else if (!strcasecmp(argv[0],"databases") && argc == 2) {
1718 server.dbnum = atoi(argv[1]);
1719 if (server.dbnum < 1) {
1720 err = "Invalid number of databases"; goto loaderr;
1721 }
1722 } else if (!strcasecmp(argv[0],"include") && argc == 2) {
1723 loadServerConfig(argv[1]);
1724 } else if (!strcasecmp(argv[0],"maxclients") && argc == 2) {
1725 server.maxclients = atoi(argv[1]);
1726 } else if (!strcasecmp(argv[0],"maxmemory") && argc == 2) {
1727 server.maxmemory = strtoll(argv[1], NULL, 10);
1728 } else if (!strcasecmp(argv[0],"slaveof") && argc == 3) {
1729 server.masterhost = sdsnew(argv[1]);
1730 server.masterport = atoi(argv[2]);
1731 server.replstate = REDIS_REPL_CONNECT;
1732 } else if (!strcasecmp(argv[0],"masterauth") && argc == 2) {
1733 server.masterauth = zstrdup(argv[1]);
1734 } else if (!strcasecmp(argv[0],"glueoutputbuf") && argc == 2) {
1735 if ((server.glueoutputbuf = yesnotoi(argv[1])) == -1) {
1736 err = "argument must be 'yes' or 'no'"; goto loaderr;
1737 }
1738 } else if (!strcasecmp(argv[0],"shareobjects") && argc == 2) {
1739 if ((server.shareobjects = yesnotoi(argv[1])) == -1) {
1740 err = "argument must be 'yes' or 'no'"; goto loaderr;
1741 }
1742 } else if (!strcasecmp(argv[0],"rdbcompression") && argc == 2) {
1743 if ((server.rdbcompression = yesnotoi(argv[1])) == -1) {
1744 err = "argument must be 'yes' or 'no'"; goto loaderr;
1745 }
1746 } else if (!strcasecmp(argv[0],"shareobjectspoolsize") && argc == 2) {
1747 server.sharingpoolsize = atoi(argv[1]);
1748 if (server.sharingpoolsize < 1) {
1749 err = "invalid object sharing pool size"; goto loaderr;
1750 }
1751 } else if (!strcasecmp(argv[0],"daemonize") && argc == 2) {
1752 if ((server.daemonize = yesnotoi(argv[1])) == -1) {
1753 err = "argument must be 'yes' or 'no'"; goto loaderr;
1754 }
1755 } else if (!strcasecmp(argv[0],"appendonly") && argc == 2) {
1756 if ((server.appendonly = yesnotoi(argv[1])) == -1) {
1757 err = "argument must be 'yes' or 'no'"; goto loaderr;
1758 }
1759 } else if (!strcasecmp(argv[0],"appendfsync") && argc == 2) {
1760 if (!strcasecmp(argv[1],"no")) {
1761 server.appendfsync = APPENDFSYNC_NO;
1762 } else if (!strcasecmp(argv[1],"always")) {
1763 server.appendfsync = APPENDFSYNC_ALWAYS;
1764 } else if (!strcasecmp(argv[1],"everysec")) {
1765 server.appendfsync = APPENDFSYNC_EVERYSEC;
1766 } else {
1767 err = "argument must be 'no', 'always' or 'everysec'";
1768 goto loaderr;
1769 }
1770 } else if (!strcasecmp(argv[0],"requirepass") && argc == 2) {
1771 server.requirepass = zstrdup(argv[1]);
1772 } else if (!strcasecmp(argv[0],"pidfile") && argc == 2) {
1773 zfree(server.pidfile);
1774 server.pidfile = zstrdup(argv[1]);
1775 } else if (!strcasecmp(argv[0],"dbfilename") && argc == 2) {
1776 zfree(server.dbfilename);
1777 server.dbfilename = zstrdup(argv[1]);
1778 } else if (!strcasecmp(argv[0],"vm-enabled") && argc == 2) {
1779 if ((server.vm_enabled = yesnotoi(argv[1])) == -1) {
1780 err = "argument must be 'yes' or 'no'"; goto loaderr;
1781 }
1782 } else if (!strcasecmp(argv[0],"vm-swap-file") && argc == 2) {
1783 zfree(server.vm_swap_file);
1784 server.vm_swap_file = zstrdup(argv[1]);
1785 } else if (!strcasecmp(argv[0],"vm-max-memory") && argc == 2) {
1786 server.vm_max_memory = strtoll(argv[1], NULL, 10);
1787 } else if (!strcasecmp(argv[0],"vm-page-size") && argc == 2) {
1788 server.vm_page_size = strtoll(argv[1], NULL, 10);
1789 } else if (!strcasecmp(argv[0],"vm-pages") && argc == 2) {
1790 server.vm_pages = strtoll(argv[1], NULL, 10);
1791 } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
1792 server.vm_max_threads = strtoll(argv[1], NULL, 10);
1793 } else if (!strcasecmp(argv[0],"hash-max-zipmap-entries") && argc == 2){
1794 server.hash_max_zipmap_entries = strtol(argv[1], NULL, 10);
1795 } else if (!strcasecmp(argv[0],"hash-max-zipmap-value") && argc == 2){
1796 server.hash_max_zipmap_value = strtol(argv[1], NULL, 10);
1797 } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
1798 server.vm_max_threads = strtoll(argv[1], NULL, 10);
1799 } else {
1800 err = "Bad directive or wrong number of arguments"; goto loaderr;
1801 }
1802 for (j = 0; j < argc; j++)
1803 sdsfree(argv[j]);
1804 zfree(argv);
1805 sdsfree(line);
1806 }
1807 if (fp != stdin) fclose(fp);
1808 return;
1809
1810 loaderr:
1811 fprintf(stderr, "\n*** FATAL CONFIG FILE ERROR ***\n");
1812 fprintf(stderr, "Reading the configuration file, at line %d\n", linenum);
1813 fprintf(stderr, ">>> '%s'\n", line);
1814 fprintf(stderr, "%s\n", err);
1815 exit(1);
1816 }
1817
1818 static void freeClientArgv(redisClient *c) {
1819 int j;
1820
1821 for (j = 0; j < c->argc; j++)
1822 decrRefCount(c->argv[j]);
1823 for (j = 0; j < c->mbargc; j++)
1824 decrRefCount(c->mbargv[j]);
1825 c->argc = 0;
1826 c->mbargc = 0;
1827 }
1828
1829 static void freeClient(redisClient *c) {
1830 listNode *ln;
1831
1832 /* Note that if the client we are freeing is blocked into a blocking
1833 * call, we have to set querybuf to NULL *before* to call
1834 * unblockClientWaitingData() to avoid processInputBuffer() will get
1835 * called. Also it is important to remove the file events after
1836 * this, because this call adds the READABLE event. */
1837 sdsfree(c->querybuf);
1838 c->querybuf = NULL;
1839 if (c->flags & REDIS_BLOCKED)
1840 unblockClientWaitingData(c);
1841
1842 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
1843 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
1844 listRelease(c->reply);
1845 freeClientArgv(c);
1846 close(c->fd);
1847 /* Remove from the list of clients */
1848 ln = listSearchKey(server.clients,c);
1849 redisAssert(ln != NULL);
1850 listDelNode(server.clients,ln);
1851 /* Remove from the list of clients waiting for swapped keys */
1852 if (c->flags & REDIS_IO_WAIT && listLength(c->io_keys) == 0) {
1853 ln = listSearchKey(server.io_ready_clients,c);
1854 if (ln) {
1855 listDelNode(server.io_ready_clients,ln);
1856 server.vm_blocked_clients--;
1857 }
1858 }
1859 while (server.vm_enabled && listLength(c->io_keys)) {
1860 ln = listFirst(c->io_keys);
1861 dontWaitForSwappedKey(c,ln->value);
1862 }
1863 listRelease(c->io_keys);
1864 /* Other cleanup */
1865 if (c->flags & REDIS_SLAVE) {
1866 if (c->replstate == REDIS_REPL_SEND_BULK && c->repldbfd != -1)
1867 close(c->repldbfd);
1868 list *l = (c->flags & REDIS_MONITOR) ? server.monitors : server.slaves;
1869 ln = listSearchKey(l,c);
1870 redisAssert(ln != NULL);
1871 listDelNode(l,ln);
1872 }
1873 if (c->flags & REDIS_MASTER) {
1874 server.master = NULL;
1875 server.replstate = REDIS_REPL_CONNECT;
1876 }
1877 zfree(c->argv);
1878 zfree(c->mbargv);
1879 freeClientMultiState(c);
1880 zfree(c);
1881 }
1882
1883 #define GLUEREPLY_UP_TO (1024)
1884 static void glueReplyBuffersIfNeeded(redisClient *c) {
1885 int copylen = 0;
1886 char buf[GLUEREPLY_UP_TO];
1887 listNode *ln;
1888 listIter li;
1889 robj *o;
1890
1891 listRewind(c->reply,&li);
1892 while((ln = listNext(&li))) {
1893 int objlen;
1894
1895 o = ln->value;
1896 objlen = sdslen(o->ptr);
1897 if (copylen + objlen <= GLUEREPLY_UP_TO) {
1898 memcpy(buf+copylen,o->ptr,objlen);
1899 copylen += objlen;
1900 listDelNode(c->reply,ln);
1901 } else {
1902 if (copylen == 0) return;
1903 break;
1904 }
1905 }
1906 /* Now the output buffer is empty, add the new single element */
1907 o = createObject(REDIS_STRING,sdsnewlen(buf,copylen));
1908 listAddNodeHead(c->reply,o);
1909 }
1910
1911 static void sendReplyToClient(aeEventLoop *el, int fd, void *privdata, int mask) {
1912 redisClient *c = privdata;
1913 int nwritten = 0, totwritten = 0, objlen;
1914 robj *o;
1915 REDIS_NOTUSED(el);
1916 REDIS_NOTUSED(mask);
1917
1918 /* Use writev() if we have enough buffers to send */
1919 if (!server.glueoutputbuf &&
1920 listLength(c->reply) > REDIS_WRITEV_THRESHOLD &&
1921 !(c->flags & REDIS_MASTER))
1922 {
1923 sendReplyToClientWritev(el, fd, privdata, mask);
1924 return;
1925 }
1926
1927 while(listLength(c->reply)) {
1928 if (server.glueoutputbuf && listLength(c->reply) > 1)
1929 glueReplyBuffersIfNeeded(c);
1930
1931 o = listNodeValue(listFirst(c->reply));
1932 objlen = sdslen(o->ptr);
1933
1934 if (objlen == 0) {
1935 listDelNode(c->reply,listFirst(c->reply));
1936 continue;
1937 }
1938
1939 if (c->flags & REDIS_MASTER) {
1940 /* Don't reply to a master */
1941 nwritten = objlen - c->sentlen;
1942 } else {
1943 nwritten = write(fd, ((char*)o->ptr)+c->sentlen, objlen - c->sentlen);
1944 if (nwritten <= 0) break;
1945 }
1946 c->sentlen += nwritten;
1947 totwritten += nwritten;
1948 /* If we fully sent the object on head go to the next one */
1949 if (c->sentlen == objlen) {
1950 listDelNode(c->reply,listFirst(c->reply));
1951 c->sentlen = 0;
1952 }
1953 /* Note that we avoid to send more thank REDIS_MAX_WRITE_PER_EVENT
1954 * bytes, in a single threaded server it's a good idea to serve
1955 * other clients as well, even if a very large request comes from
1956 * super fast link that is always able to accept data (in real world
1957 * scenario think about 'KEYS *' against the loopback interfae) */
1958 if (totwritten > REDIS_MAX_WRITE_PER_EVENT) break;
1959 }
1960 if (nwritten == -1) {
1961 if (errno == EAGAIN) {
1962 nwritten = 0;
1963 } else {
1964 redisLog(REDIS_VERBOSE,
1965 "Error writing to client: %s", strerror(errno));
1966 freeClient(c);
1967 return;
1968 }
1969 }
1970 if (totwritten > 0) c->lastinteraction = time(NULL);
1971 if (listLength(c->reply) == 0) {
1972 c->sentlen = 0;
1973 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
1974 }
1975 }
1976
1977 static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask)
1978 {
1979 redisClient *c = privdata;
1980 int nwritten = 0, totwritten = 0, objlen, willwrite;
1981 robj *o;
1982 struct iovec iov[REDIS_WRITEV_IOVEC_COUNT];
1983 int offset, ion = 0;
1984 REDIS_NOTUSED(el);
1985 REDIS_NOTUSED(mask);
1986
1987 listNode *node;
1988 while (listLength(c->reply)) {
1989 offset = c->sentlen;
1990 ion = 0;
1991 willwrite = 0;
1992
1993 /* fill-in the iov[] array */
1994 for(node = listFirst(c->reply); node; node = listNextNode(node)) {
1995 o = listNodeValue(node);
1996 objlen = sdslen(o->ptr);
1997
1998 if (totwritten + objlen - offset > REDIS_MAX_WRITE_PER_EVENT)
1999 break;
2000
2001 if(ion == REDIS_WRITEV_IOVEC_COUNT)
2002 break; /* no more iovecs */
2003
2004 iov[ion].iov_base = ((char*)o->ptr) + offset;
2005 iov[ion].iov_len = objlen - offset;
2006 willwrite += objlen - offset;
2007 offset = 0; /* just for the first item */
2008 ion++;
2009 }
2010
2011 if(willwrite == 0)
2012 break;
2013
2014 /* write all collected blocks at once */
2015 if((nwritten = writev(fd, iov, ion)) < 0) {
2016 if (errno != EAGAIN) {
2017 redisLog(REDIS_VERBOSE,
2018 "Error writing to client: %s", strerror(errno));
2019 freeClient(c);
2020 return;
2021 }
2022 break;
2023 }
2024
2025 totwritten += nwritten;
2026 offset = c->sentlen;
2027
2028 /* remove written robjs from c->reply */
2029 while (nwritten && listLength(c->reply)) {
2030 o = listNodeValue(listFirst(c->reply));
2031 objlen = sdslen(o->ptr);
2032
2033 if(nwritten >= objlen - offset) {
2034 listDelNode(c->reply, listFirst(c->reply));
2035 nwritten -= objlen - offset;
2036 c->sentlen = 0;
2037 } else {
2038 /* partial write */
2039 c->sentlen += nwritten;
2040 break;
2041 }
2042 offset = 0;
2043 }
2044 }
2045
2046 if (totwritten > 0)
2047 c->lastinteraction = time(NULL);
2048
2049 if (listLength(c->reply) == 0) {
2050 c->sentlen = 0;
2051 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2052 }
2053 }
2054
2055 static struct redisCommand *lookupCommand(char *name) {
2056 int j = 0;
2057 while(cmdTable[j].name != NULL) {
2058 if (!strcasecmp(name,cmdTable[j].name)) return &cmdTable[j];
2059 j++;
2060 }
2061 return NULL;
2062 }
2063
2064 /* resetClient prepare the client to process the next command */
2065 static void resetClient(redisClient *c) {
2066 freeClientArgv(c);
2067 c->bulklen = -1;
2068 c->multibulk = 0;
2069 }
2070
2071 /* Call() is the core of Redis execution of a command */
2072 static void call(redisClient *c, struct redisCommand *cmd) {
2073 long long dirty;
2074
2075 dirty = server.dirty;
2076 cmd->proc(c);
2077 if (server.appendonly && server.dirty-dirty)
2078 feedAppendOnlyFile(cmd,c->db->id,c->argv,c->argc);
2079 if (server.dirty-dirty && listLength(server.slaves))
2080 replicationFeedSlaves(server.slaves,cmd,c->db->id,c->argv,c->argc);
2081 if (listLength(server.monitors))
2082 replicationFeedSlaves(server.monitors,cmd,c->db->id,c->argv,c->argc);
2083 server.stat_numcommands++;
2084 }
2085
2086 /* If this function gets called we already read a whole
2087 * command, argments are in the client argv/argc fields.
2088 * processCommand() execute the command or prepare the
2089 * server for a bulk read from the client.
2090 *
2091 * If 1 is returned the client is still alive and valid and
2092 * and other operations can be performed by the caller. Otherwise
2093 * if 0 is returned the client was destroied (i.e. after QUIT). */
2094 static int processCommand(redisClient *c) {
2095 struct redisCommand *cmd;
2096
2097 /* Free some memory if needed (maxmemory setting) */
2098 if (server.maxmemory) freeMemoryIfNeeded();
2099
2100 /* Handle the multi bulk command type. This is an alternative protocol
2101 * supported by Redis in order to receive commands that are composed of
2102 * multiple binary-safe "bulk" arguments. The latency of processing is
2103 * a bit higher but this allows things like multi-sets, so if this
2104 * protocol is used only for MSET and similar commands this is a big win. */
2105 if (c->multibulk == 0 && c->argc == 1 && ((char*)(c->argv[0]->ptr))[0] == '*') {
2106 c->multibulk = atoi(((char*)c->argv[0]->ptr)+1);
2107 if (c->multibulk <= 0) {
2108 resetClient(c);
2109 return 1;
2110 } else {
2111 decrRefCount(c->argv[c->argc-1]);
2112 c->argc--;
2113 return 1;
2114 }
2115 } else if (c->multibulk) {
2116 if (c->bulklen == -1) {
2117 if (((char*)c->argv[0]->ptr)[0] != '$') {
2118 addReplySds(c,sdsnew("-ERR multi bulk protocol error\r\n"));
2119 resetClient(c);
2120 return 1;
2121 } else {
2122 int bulklen = atoi(((char*)c->argv[0]->ptr)+1);
2123 decrRefCount(c->argv[0]);
2124 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2125 c->argc--;
2126 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2127 resetClient(c);
2128 return 1;
2129 }
2130 c->argc--;
2131 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2132 return 1;
2133 }
2134 } else {
2135 c->mbargv = zrealloc(c->mbargv,(sizeof(robj*))*(c->mbargc+1));
2136 c->mbargv[c->mbargc] = c->argv[0];
2137 c->mbargc++;
2138 c->argc--;
2139 c->multibulk--;
2140 if (c->multibulk == 0) {
2141 robj **auxargv;
2142 int auxargc;
2143
2144 /* Here we need to swap the multi-bulk argc/argv with the
2145 * normal argc/argv of the client structure. */
2146 auxargv = c->argv;
2147 c->argv = c->mbargv;
2148 c->mbargv = auxargv;
2149
2150 auxargc = c->argc;
2151 c->argc = c->mbargc;
2152 c->mbargc = auxargc;
2153
2154 /* We need to set bulklen to something different than -1
2155 * in order for the code below to process the command without
2156 * to try to read the last argument of a bulk command as
2157 * a special argument. */
2158 c->bulklen = 0;
2159 /* continue below and process the command */
2160 } else {
2161 c->bulklen = -1;
2162 return 1;
2163 }
2164 }
2165 }
2166 /* -- end of multi bulk commands processing -- */
2167
2168 /* The QUIT command is handled as a special case. Normal command
2169 * procs are unable to close the client connection safely */
2170 if (!strcasecmp(c->argv[0]->ptr,"quit")) {
2171 freeClient(c);
2172 return 0;
2173 }
2174
2175 /* Now lookup the command and check ASAP about trivial error conditions
2176 * such wrong arity, bad command name and so forth. */
2177 cmd = lookupCommand(c->argv[0]->ptr);
2178 if (!cmd) {
2179 addReplySds(c,
2180 sdscatprintf(sdsempty(), "-ERR unknown command '%s'\r\n",
2181 (char*)c->argv[0]->ptr));
2182 resetClient(c);
2183 return 1;
2184 } else if ((cmd->arity > 0 && cmd->arity != c->argc) ||
2185 (c->argc < -cmd->arity)) {
2186 addReplySds(c,
2187 sdscatprintf(sdsempty(),
2188 "-ERR wrong number of arguments for '%s' command\r\n",
2189 cmd->name));
2190 resetClient(c);
2191 return 1;
2192 } else if (cmd->flags & REDIS_CMD_BULK && c->bulklen == -1) {
2193 /* This is a bulk command, we have to read the last argument yet. */
2194 int bulklen = atoi(c->argv[c->argc-1]->ptr);
2195
2196 decrRefCount(c->argv[c->argc-1]);
2197 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2198 c->argc--;
2199 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2200 resetClient(c);
2201 return 1;
2202 }
2203 c->argc--;
2204 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2205 /* It is possible that the bulk read is already in the
2206 * buffer. Check this condition and handle it accordingly.
2207 * This is just a fast path, alternative to call processInputBuffer().
2208 * It's a good idea since the code is small and this condition
2209 * happens most of the times. */
2210 if ((signed)sdslen(c->querybuf) >= c->bulklen) {
2211 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2212 c->argc++;
2213 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2214 } else {
2215 /* Otherwise return... there is to read the last argument
2216 * from the socket. */
2217 return 1;
2218 }
2219 }
2220 /* Let's try to share objects on the command arguments vector */
2221 if (server.shareobjects) {
2222 int j;
2223 for(j = 1; j < c->argc; j++)
2224 c->argv[j] = tryObjectSharing(c->argv[j]);
2225 }
2226 /* Let's try to encode the bulk object to save space. */
2227 if (cmd->flags & REDIS_CMD_BULK)
2228 tryObjectEncoding(c->argv[c->argc-1]);
2229
2230 /* Check if the user is authenticated */
2231 if (server.requirepass && !c->authenticated && cmd->proc != authCommand) {
2232 addReplySds(c,sdsnew("-ERR operation not permitted\r\n"));
2233 resetClient(c);
2234 return 1;
2235 }
2236
2237 /* Handle the maxmemory directive */
2238 if (server.maxmemory && (cmd->flags & REDIS_CMD_DENYOOM) &&
2239 zmalloc_used_memory() > server.maxmemory)
2240 {
2241 addReplySds(c,sdsnew("-ERR command not allowed when used memory > 'maxmemory'\r\n"));
2242 resetClient(c);
2243 return 1;
2244 }
2245
2246 /* Exec the command */
2247 if (c->flags & REDIS_MULTI && cmd->proc != execCommand && cmd->proc != discardCommand) {
2248 queueMultiCommand(c,cmd);
2249 addReply(c,shared.queued);
2250 } else {
2251 if (server.vm_enabled && server.vm_max_threads > 0 &&
2252 blockClientOnSwappedKeys(cmd,c)) return 1;
2253 call(c,cmd);
2254 }
2255
2256 /* Prepare the client for the next command */
2257 resetClient(c);
2258 return 1;
2259 }
2260
2261 static void replicationFeedSlaves(list *slaves, struct redisCommand *cmd, int dictid, robj **argv, int argc) {
2262 listNode *ln;
2263 listIter li;
2264 int outc = 0, j;
2265 robj **outv;
2266 /* (args*2)+1 is enough room for args, spaces, newlines */
2267 robj *static_outv[REDIS_STATIC_ARGS*2+1];
2268
2269 if (argc <= REDIS_STATIC_ARGS) {
2270 outv = static_outv;
2271 } else {
2272 outv = zmalloc(sizeof(robj*)*(argc*2+1));
2273 }
2274
2275 for (j = 0; j < argc; j++) {
2276 if (j != 0) outv[outc++] = shared.space;
2277 if ((cmd->flags & REDIS_CMD_BULK) && j == argc-1) {
2278 robj *lenobj;
2279
2280 lenobj = createObject(REDIS_STRING,
2281 sdscatprintf(sdsempty(),"%lu\r\n",
2282 (unsigned long) stringObjectLen(argv[j])));
2283 lenobj->refcount = 0;
2284 outv[outc++] = lenobj;
2285 }
2286 outv[outc++] = argv[j];
2287 }
2288 outv[outc++] = shared.crlf;
2289
2290 /* Increment all the refcounts at start and decrement at end in order to
2291 * be sure to free objects if there is no slave in a replication state
2292 * able to be feed with commands */
2293 for (j = 0; j < outc; j++) incrRefCount(outv[j]);
2294 listRewind(slaves,&li);
2295 while((ln = listNext(&li))) {
2296 redisClient *slave = ln->value;
2297
2298 /* Don't feed slaves that are still waiting for BGSAVE to start */
2299 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) continue;
2300
2301 /* Feed all the other slaves, MONITORs and so on */
2302 if (slave->slaveseldb != dictid) {
2303 robj *selectcmd;
2304
2305 switch(dictid) {
2306 case 0: selectcmd = shared.select0; break;
2307 case 1: selectcmd = shared.select1; break;
2308 case 2: selectcmd = shared.select2; break;
2309 case 3: selectcmd = shared.select3; break;
2310 case 4: selectcmd = shared.select4; break;
2311 case 5: selectcmd = shared.select5; break;
2312 case 6: selectcmd = shared.select6; break;
2313 case 7: selectcmd = shared.select7; break;
2314 case 8: selectcmd = shared.select8; break;
2315 case 9: selectcmd = shared.select9; break;
2316 default:
2317 selectcmd = createObject(REDIS_STRING,
2318 sdscatprintf(sdsempty(),"select %d\r\n",dictid));
2319 selectcmd->refcount = 0;
2320 break;
2321 }
2322 addReply(slave,selectcmd);
2323 slave->slaveseldb = dictid;
2324 }
2325 for (j = 0; j < outc; j++) addReply(slave,outv[j]);
2326 }
2327 for (j = 0; j < outc; j++) decrRefCount(outv[j]);
2328 if (outv != static_outv) zfree(outv);
2329 }
2330
2331 static void processInputBuffer(redisClient *c) {
2332 again:
2333 /* Before to process the input buffer, make sure the client is not
2334 * waitig for a blocking operation such as BLPOP. Note that the first
2335 * iteration the client is never blocked, otherwise the processInputBuffer
2336 * would not be called at all, but after the execution of the first commands
2337 * in the input buffer the client may be blocked, and the "goto again"
2338 * will try to reiterate. The following line will make it return asap. */
2339 if (c->flags & REDIS_BLOCKED || c->flags & REDIS_IO_WAIT) return;
2340 if (c->bulklen == -1) {
2341 /* Read the first line of the query */
2342 char *p = strchr(c->querybuf,'\n');
2343 size_t querylen;
2344
2345 if (p) {
2346 sds query, *argv;
2347 int argc, j;
2348
2349 query = c->querybuf;
2350 c->querybuf = sdsempty();
2351 querylen = 1+(p-(query));
2352 if (sdslen(query) > querylen) {
2353 /* leave data after the first line of the query in the buffer */
2354 c->querybuf = sdscatlen(c->querybuf,query+querylen,sdslen(query)-querylen);
2355 }
2356 *p = '\0'; /* remove "\n" */
2357 if (*(p-1) == '\r') *(p-1) = '\0'; /* and "\r" if any */
2358 sdsupdatelen(query);
2359
2360 /* Now we can split the query in arguments */
2361 argv = sdssplitlen(query,sdslen(query)," ",1,&argc);
2362 sdsfree(query);
2363
2364 if (c->argv) zfree(c->argv);
2365 c->argv = zmalloc(sizeof(robj*)*argc);
2366
2367 for (j = 0; j < argc; j++) {
2368 if (sdslen(argv[j])) {
2369 c->argv[c->argc] = createObject(REDIS_STRING,argv[j]);
2370 c->argc++;
2371 } else {
2372 sdsfree(argv[j]);
2373 }
2374 }
2375 zfree(argv);
2376 if (c->argc) {
2377 /* Execute the command. If the client is still valid
2378 * after processCommand() return and there is something
2379 * on the query buffer try to process the next command. */
2380 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2381 } else {
2382 /* Nothing to process, argc == 0. Just process the query
2383 * buffer if it's not empty or return to the caller */
2384 if (sdslen(c->querybuf)) goto again;
2385 }
2386 return;
2387 } else if (sdslen(c->querybuf) >= REDIS_REQUEST_MAX_SIZE) {
2388 redisLog(REDIS_VERBOSE, "Client protocol error");
2389 freeClient(c);
2390 return;
2391 }
2392 } else {
2393 /* Bulk read handling. Note that if we are at this point
2394 the client already sent a command terminated with a newline,
2395 we are reading the bulk data that is actually the last
2396 argument of the command. */
2397 int qbl = sdslen(c->querybuf);
2398
2399 if (c->bulklen <= qbl) {
2400 /* Copy everything but the final CRLF as final argument */
2401 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2402 c->argc++;
2403 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2404 /* Process the command. If the client is still valid after
2405 * the processing and there is more data in the buffer
2406 * try to parse it. */
2407 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2408 return;
2409 }
2410 }
2411 }
2412
2413 static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2414 redisClient *c = (redisClient*) privdata;
2415 char buf[REDIS_IOBUF_LEN];
2416 int nread;
2417 REDIS_NOTUSED(el);
2418 REDIS_NOTUSED(mask);
2419
2420 nread = read(fd, buf, REDIS_IOBUF_LEN);
2421 if (nread == -1) {
2422 if (errno == EAGAIN) {
2423 nread = 0;
2424 } else {
2425 redisLog(REDIS_VERBOSE, "Reading from client: %s",strerror(errno));
2426 freeClient(c);
2427 return;
2428 }
2429 } else if (nread == 0) {
2430 redisLog(REDIS_VERBOSE, "Client closed connection");
2431 freeClient(c);
2432 return;
2433 }
2434 if (nread) {
2435 c->querybuf = sdscatlen(c->querybuf, buf, nread);
2436 c->lastinteraction = time(NULL);
2437 } else {
2438 return;
2439 }
2440 if (!(c->flags & REDIS_BLOCKED))
2441 processInputBuffer(c);
2442 }
2443
2444 static int selectDb(redisClient *c, int id) {
2445 if (id < 0 || id >= server.dbnum)
2446 return REDIS_ERR;
2447 c->db = &server.db[id];
2448 return REDIS_OK;
2449 }
2450
2451 static void *dupClientReplyValue(void *o) {
2452 incrRefCount((robj*)o);
2453 return o;
2454 }
2455
2456 static redisClient *createClient(int fd) {
2457 redisClient *c = zmalloc(sizeof(*c));
2458
2459 anetNonBlock(NULL,fd);
2460 anetTcpNoDelay(NULL,fd);
2461 if (!c) return NULL;
2462 selectDb(c,0);
2463 c->fd = fd;
2464 c->querybuf = sdsempty();
2465 c->argc = 0;
2466 c->argv = NULL;
2467 c->bulklen = -1;
2468 c->multibulk = 0;
2469 c->mbargc = 0;
2470 c->mbargv = NULL;
2471 c->sentlen = 0;
2472 c->flags = 0;
2473 c->lastinteraction = time(NULL);
2474 c->authenticated = 0;
2475 c->replstate = REDIS_REPL_NONE;
2476 c->reply = listCreate();
2477 listSetFreeMethod(c->reply,decrRefCount);
2478 listSetDupMethod(c->reply,dupClientReplyValue);
2479 c->blockingkeys = NULL;
2480 c->blockingkeysnum = 0;
2481 c->io_keys = listCreate();
2482 listSetFreeMethod(c->io_keys,decrRefCount);
2483 if (aeCreateFileEvent(server.el, c->fd, AE_READABLE,
2484 readQueryFromClient, c) == AE_ERR) {
2485 freeClient(c);
2486 return NULL;
2487 }
2488 listAddNodeTail(server.clients,c);
2489 initClientMultiState(c);
2490 return c;
2491 }
2492
2493 static void addReply(redisClient *c, robj *obj) {
2494 if (listLength(c->reply) == 0 &&
2495 (c->replstate == REDIS_REPL_NONE ||
2496 c->replstate == REDIS_REPL_ONLINE) &&
2497 aeCreateFileEvent(server.el, c->fd, AE_WRITABLE,
2498 sendReplyToClient, c) == AE_ERR) return;
2499
2500 if (server.vm_enabled && obj->storage != REDIS_VM_MEMORY) {
2501 obj = dupStringObject(obj);
2502 obj->refcount = 0; /* getDecodedObject() will increment the refcount */
2503 }
2504 listAddNodeTail(c->reply,getDecodedObject(obj));
2505 }
2506
2507 static void addReplySds(redisClient *c, sds s) {
2508 robj *o = createObject(REDIS_STRING,s);
2509 addReply(c,o);
2510 decrRefCount(o);
2511 }
2512
2513 static void addReplyDouble(redisClient *c, double d) {
2514 char buf[128];
2515
2516 snprintf(buf,sizeof(buf),"%.17g",d);
2517 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n%s\r\n",
2518 (unsigned long) strlen(buf),buf));
2519 }
2520
2521 static void addReplyLong(redisClient *c, long l) {
2522 char buf[128];
2523 size_t len;
2524
2525 if (l == 0) {
2526 addReply(c,shared.czero);
2527 return;
2528 } else if (l == 1) {
2529 addReply(c,shared.cone);
2530 return;
2531 }
2532 len = snprintf(buf,sizeof(buf),":%ld\r\n",l);
2533 addReplySds(c,sdsnewlen(buf,len));
2534 }
2535
2536 static void addReplyUlong(redisClient *c, unsigned long ul) {
2537 char buf[128];
2538 size_t len;
2539
2540 if (ul == 0) {
2541 addReply(c,shared.czero);
2542 return;
2543 } else if (ul == 1) {
2544 addReply(c,shared.cone);
2545 return;
2546 }
2547 len = snprintf(buf,sizeof(buf),":%lu\r\n",ul);
2548 addReplySds(c,sdsnewlen(buf,len));
2549 }
2550
2551 static void addReplyBulkLen(redisClient *c, robj *obj) {
2552 size_t len;
2553
2554 if (obj->encoding == REDIS_ENCODING_RAW) {
2555 len = sdslen(obj->ptr);
2556 } else {
2557 long n = (long)obj->ptr;
2558
2559 /* Compute how many bytes will take this integer as a radix 10 string */
2560 len = 1;
2561 if (n < 0) {
2562 len++;
2563 n = -n;
2564 }
2565 while((n = n/10) != 0) {
2566 len++;
2567 }
2568 }
2569 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",(unsigned long)len));
2570 }
2571
2572 static void addReplyBulk(redisClient *c, robj *obj) {
2573 addReplyBulkLen(c,obj);
2574 addReply(c,obj);
2575 addReply(c,shared.crlf);
2576 }
2577
2578 /* In the CONFIG command we need to add vanilla C string as bulk replies */
2579 static void addReplyBulkCString(redisClient *c, char *s) {
2580 if (s == NULL) {
2581 addReply(c,shared.nullbulk);
2582 } else {
2583 robj *o = createStringObject(s,strlen(s));
2584 addReplyBulk(c,o);
2585 decrRefCount(o);
2586 }
2587 }
2588
2589 static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
2590 int cport, cfd;
2591 char cip[128];
2592 redisClient *c;
2593 REDIS_NOTUSED(el);
2594 REDIS_NOTUSED(mask);
2595 REDIS_NOTUSED(privdata);
2596
2597 cfd = anetAccept(server.neterr, fd, cip, &cport);
2598 if (cfd == AE_ERR) {
2599 redisLog(REDIS_VERBOSE,"Accepting client connection: %s", server.neterr);
2600 return;
2601 }
2602 redisLog(REDIS_VERBOSE,"Accepted %s:%d", cip, cport);
2603 if ((c = createClient(cfd)) == NULL) {
2604 redisLog(REDIS_WARNING,"Error allocating resoures for the client");
2605 close(cfd); /* May be already closed, just ingore errors */
2606 return;
2607 }
2608 /* If maxclient directive is set and this is one client more... close the
2609 * connection. Note that we create the client instead to check before
2610 * for this condition, since now the socket is already set in nonblocking
2611 * mode and we can send an error for free using the Kernel I/O */
2612 if (server.maxclients && listLength(server.clients) > server.maxclients) {
2613 char *err = "-ERR max number of clients reached\r\n";
2614
2615 /* That's a best effort error message, don't check write errors */
2616 if (write(c->fd,err,strlen(err)) == -1) {
2617 /* Nothing to do, Just to avoid the warning... */
2618 }
2619 freeClient(c);
2620 return;
2621 }
2622 server.stat_numconnections++;
2623 }
2624
2625 /* ======================= Redis objects implementation ===================== */
2626
2627 static robj *createObject(int type, void *ptr) {
2628 robj *o;
2629
2630 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
2631 if (listLength(server.objfreelist)) {
2632 listNode *head = listFirst(server.objfreelist);
2633 o = listNodeValue(head);
2634 listDelNode(server.objfreelist,head);
2635 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
2636 } else {
2637 if (server.vm_enabled) {
2638 pthread_mutex_unlock(&server.obj_freelist_mutex);
2639 o = zmalloc(sizeof(*o));
2640 } else {
2641 o = zmalloc(sizeof(*o)-sizeof(struct redisObjectVM));
2642 }
2643 }
2644 o->type = type;
2645 o->encoding = REDIS_ENCODING_RAW;
2646 o->ptr = ptr;
2647 o->refcount = 1;
2648 if (server.vm_enabled) {
2649 /* Note that this code may run in the context of an I/O thread
2650 * and accessing to server.unixtime in theory is an error
2651 * (no locks). But in practice this is safe, and even if we read
2652 * garbage Redis will not fail, as it's just a statistical info */
2653 o->vm.atime = server.unixtime;
2654 o->storage = REDIS_VM_MEMORY;
2655 }
2656 return o;
2657 }
2658
2659 static robj *createStringObject(char *ptr, size_t len) {
2660 return createObject(REDIS_STRING,sdsnewlen(ptr,len));
2661 }
2662
2663 static robj *dupStringObject(robj *o) {
2664 assert(o->encoding == REDIS_ENCODING_RAW);
2665 return createStringObject(o->ptr,sdslen(o->ptr));
2666 }
2667
2668 static robj *createListObject(void) {
2669 list *l = listCreate();
2670
2671 listSetFreeMethod(l,decrRefCount);
2672 return createObject(REDIS_LIST,l);
2673 }
2674
2675 static robj *createSetObject(void) {
2676 dict *d = dictCreate(&setDictType,NULL);
2677 return createObject(REDIS_SET,d);
2678 }
2679
2680 static robj *createHashObject(void) {
2681 /* All the Hashes start as zipmaps. Will be automatically converted
2682 * into hash tables if there are enough elements or big elements
2683 * inside. */
2684 unsigned char *zm = zipmapNew();
2685 robj *o = createObject(REDIS_HASH,zm);
2686 o->encoding = REDIS_ENCODING_ZIPMAP;
2687 return o;
2688 }
2689
2690 static robj *createZsetObject(void) {
2691 zset *zs = zmalloc(sizeof(*zs));
2692
2693 zs->dict = dictCreate(&zsetDictType,NULL);
2694 zs->zsl = zslCreate();
2695 return createObject(REDIS_ZSET,zs);
2696 }
2697
2698 static void freeStringObject(robj *o) {
2699 if (o->encoding == REDIS_ENCODING_RAW) {
2700 sdsfree(o->ptr);
2701 }
2702 }
2703
2704 static void freeListObject(robj *o) {
2705 listRelease((list*) o->ptr);
2706 }
2707
2708 static void freeSetObject(robj *o) {
2709 dictRelease((dict*) o->ptr);
2710 }
2711
2712 static void freeZsetObject(robj *o) {
2713 zset *zs = o->ptr;
2714
2715 dictRelease(zs->dict);
2716 zslFree(zs->zsl);
2717 zfree(zs);
2718 }
2719
2720 static void freeHashObject(robj *o) {
2721 switch (o->encoding) {
2722 case REDIS_ENCODING_HT:
2723 dictRelease((dict*) o->ptr);
2724 break;
2725 case REDIS_ENCODING_ZIPMAP:
2726 zfree(o->ptr);
2727 break;
2728 default:
2729 redisAssert(0);
2730 break;
2731 }
2732 }
2733
2734 static void incrRefCount(robj *o) {
2735 redisAssert(!server.vm_enabled || o->storage == REDIS_VM_MEMORY);
2736 o->refcount++;
2737 }
2738
2739 static void decrRefCount(void *obj) {
2740 robj *o = obj;
2741
2742 /* Object is a key of a swapped out value, or in the process of being
2743 * loaded. */
2744 if (server.vm_enabled &&
2745 (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING))
2746 {
2747 if (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING) {
2748 redisAssert(o->refcount == 1);
2749 }
2750 if (o->storage == REDIS_VM_LOADING) vmCancelThreadedIOJob(obj);
2751 redisAssert(o->type == REDIS_STRING);
2752 freeStringObject(o);
2753 vmMarkPagesFree(o->vm.page,o->vm.usedpages);
2754 pthread_mutex_lock(&server.obj_freelist_mutex);
2755 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
2756 !listAddNodeHead(server.objfreelist,o))
2757 zfree(o);
2758 pthread_mutex_unlock(&server.obj_freelist_mutex);
2759 server.vm_stats_swapped_objects--;
2760 return;
2761 }
2762 /* Object is in memory, or in the process of being swapped out. */
2763 if (--(o->refcount) == 0) {
2764 if (server.vm_enabled && o->storage == REDIS_VM_SWAPPING)
2765 vmCancelThreadedIOJob(obj);
2766 switch(o->type) {
2767 case REDIS_STRING: freeStringObject(o); break;
2768 case REDIS_LIST: freeListObject(o); break;
2769 case REDIS_SET: freeSetObject(o); break;
2770 case REDIS_ZSET: freeZsetObject(o); break;
2771 case REDIS_HASH: freeHashObject(o); break;
2772 default: redisAssert(0); break;
2773 }
2774 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
2775 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
2776 !listAddNodeHead(server.objfreelist,o))
2777 zfree(o);
2778 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
2779 }
2780 }
2781
2782 static robj *lookupKey(redisDb *db, robj *key) {
2783 dictEntry *de = dictFind(db->dict,key);
2784 if (de) {
2785 robj *key = dictGetEntryKey(de);
2786 robj *val = dictGetEntryVal(de);
2787
2788 if (server.vm_enabled) {
2789 if (key->storage == REDIS_VM_MEMORY ||
2790 key->storage == REDIS_VM_SWAPPING)
2791 {
2792 /* If we were swapping the object out, stop it, this key
2793 * was requested. */
2794 if (key->storage == REDIS_VM_SWAPPING)
2795 vmCancelThreadedIOJob(key);
2796 /* Update the access time of the key for the aging algorithm. */
2797 key->vm.atime = server.unixtime;
2798 } else {
2799 int notify = (key->storage == REDIS_VM_LOADING);
2800
2801 /* Our value was swapped on disk. Bring it at home. */
2802 redisAssert(val == NULL);
2803 val = vmLoadObject(key);
2804 dictGetEntryVal(de) = val;
2805
2806 /* Clients blocked by the VM subsystem may be waiting for
2807 * this key... */
2808 if (notify) handleClientsBlockedOnSwappedKey(db,key);
2809 }
2810 }
2811 return val;
2812 } else {
2813 return NULL;
2814 }
2815 }
2816
2817 static robj *lookupKeyRead(redisDb *db, robj *key) {
2818 expireIfNeeded(db,key);
2819 return lookupKey(db,key);
2820 }
2821
2822 static robj *lookupKeyWrite(redisDb *db, robj *key) {
2823 deleteIfVolatile(db,key);
2824 return lookupKey(db,key);
2825 }
2826
2827 static robj *lookupKeyReadOrReply(redisClient *c, robj *key, robj *reply) {
2828 robj *o = lookupKeyRead(c->db, key);
2829 if (!o) addReply(c,reply);
2830 return o;
2831 }
2832
2833 static robj *lookupKeyWriteOrReply(redisClient *c, robj *key, robj *reply) {
2834 robj *o = lookupKeyWrite(c->db, key);
2835 if (!o) addReply(c,reply);
2836 return o;
2837 }
2838
2839 static int checkType(redisClient *c, robj *o, int type) {
2840 if (o->type != type) {
2841 addReply(c,shared.wrongtypeerr);
2842 return 1;
2843 }
2844 return 0;
2845 }
2846
2847 static int deleteKey(redisDb *db, robj *key) {
2848 int retval;
2849
2850 /* We need to protect key from destruction: after the first dictDelete()
2851 * it may happen that 'key' is no longer valid if we don't increment
2852 * it's count. This may happen when we get the object reference directly
2853 * from the hash table with dictRandomKey() or dict iterators */
2854 incrRefCount(key);
2855 if (dictSize(db->expires)) dictDelete(db->expires,key);
2856 retval = dictDelete(db->dict,key);
2857 decrRefCount(key);
2858
2859 return retval == DICT_OK;
2860 }
2861
2862 /* Try to share an object against the shared objects pool */
2863 static robj *tryObjectSharing(robj *o) {
2864 struct dictEntry *de;
2865 unsigned long c;
2866
2867 if (o == NULL || server.shareobjects == 0) return o;
2868
2869 redisAssert(o->type == REDIS_STRING);
2870 de = dictFind(server.sharingpool,o);
2871 if (de) {
2872 robj *shared = dictGetEntryKey(de);
2873
2874 c = ((unsigned long) dictGetEntryVal(de))+1;
2875 dictGetEntryVal(de) = (void*) c;
2876 incrRefCount(shared);
2877 decrRefCount(o);
2878 return shared;
2879 } else {
2880 /* Here we are using a stream algorihtm: Every time an object is
2881 * shared we increment its count, everytime there is a miss we
2882 * recrement the counter of a random object. If this object reaches
2883 * zero we remove the object and put the current object instead. */
2884 if (dictSize(server.sharingpool) >=
2885 server.sharingpoolsize) {
2886 de = dictGetRandomKey(server.sharingpool);
2887 redisAssert(de != NULL);
2888 c = ((unsigned long) dictGetEntryVal(de))-1;
2889 dictGetEntryVal(de) = (void*) c;
2890 if (c == 0) {
2891 dictDelete(server.sharingpool,de->key);
2892 }
2893 } else {
2894 c = 0; /* If the pool is empty we want to add this object */
2895 }
2896 if (c == 0) {
2897 int retval;
2898
2899 retval = dictAdd(server.sharingpool,o,(void*)1);
2900 redisAssert(retval == DICT_OK);
2901 incrRefCount(o);
2902 }
2903 return o;
2904 }
2905 }
2906
2907 /* Check if the nul-terminated string 's' can be represented by a long
2908 * (that is, is a number that fits into long without any other space or
2909 * character before or after the digits).
2910 *
2911 * If so, the function returns REDIS_OK and *longval is set to the value
2912 * of the number. Otherwise REDIS_ERR is returned */
2913 static int isStringRepresentableAsLong(sds s, long *longval) {
2914 char buf[32], *endptr;
2915 long value;
2916 int slen;
2917
2918 value = strtol(s, &endptr, 10);
2919 if (endptr[0] != '\0') return REDIS_ERR;
2920 slen = snprintf(buf,32,"%ld",value);
2921
2922 /* If the number converted back into a string is not identical
2923 * then it's not possible to encode the string as integer */
2924 if (sdslen(s) != (unsigned)slen || memcmp(buf,s,slen)) return REDIS_ERR;
2925 if (longval) *longval = value;
2926 return REDIS_OK;
2927 }
2928
2929 /* Try to encode a string object in order to save space */
2930 static int tryObjectEncoding(robj *o) {
2931 long value;
2932 sds s = o->ptr;
2933
2934 if (o->encoding != REDIS_ENCODING_RAW)
2935 return REDIS_ERR; /* Already encoded */
2936
2937 /* It's not save to encode shared objects: shared objects can be shared
2938 * everywhere in the "object space" of Redis. Encoded objects can only
2939 * appear as "values" (and not, for instance, as keys) */
2940 if (o->refcount > 1) return REDIS_ERR;
2941
2942 /* Currently we try to encode only strings */
2943 redisAssert(o->type == REDIS_STRING);
2944
2945 /* Check if we can represent this string as a long integer */
2946 if (isStringRepresentableAsLong(s,&value) == REDIS_ERR) return REDIS_ERR;
2947
2948 /* Ok, this object can be encoded */
2949 o->encoding = REDIS_ENCODING_INT;
2950 sdsfree(o->ptr);
2951 o->ptr = (void*) value;
2952 return REDIS_OK;
2953 }
2954
2955 /* Get a decoded version of an encoded object (returned as a new object).
2956 * If the object is already raw-encoded just increment the ref count. */
2957 static robj *getDecodedObject(robj *o) {
2958 robj *dec;
2959
2960 if (o->encoding == REDIS_ENCODING_RAW) {
2961 incrRefCount(o);
2962 return o;
2963 }
2964 if (o->type == REDIS_STRING && o->encoding == REDIS_ENCODING_INT) {
2965 char buf[32];
2966
2967 snprintf(buf,32,"%ld",(long)o->ptr);
2968 dec = createStringObject(buf,strlen(buf));
2969 return dec;
2970 } else {
2971 redisAssert(1 != 1);
2972 }
2973 }
2974
2975 /* Compare two string objects via strcmp() or alike.
2976 * Note that the objects may be integer-encoded. In such a case we
2977 * use snprintf() to get a string representation of the numbers on the stack
2978 * and compare the strings, it's much faster than calling getDecodedObject().
2979 *
2980 * Important note: if objects are not integer encoded, but binary-safe strings,
2981 * sdscmp() from sds.c will apply memcmp() so this function ca be considered
2982 * binary safe. */
2983 static int compareStringObjects(robj *a, robj *b) {
2984 redisAssert(a->type == REDIS_STRING && b->type == REDIS_STRING);
2985 char bufa[128], bufb[128], *astr, *bstr;
2986 int bothsds = 1;
2987
2988 if (a == b) return 0;
2989 if (a->encoding != REDIS_ENCODING_RAW) {
2990 snprintf(bufa,sizeof(bufa),"%ld",(long) a->ptr);
2991 astr = bufa;
2992 bothsds = 0;
2993 } else {
2994 astr = a->ptr;
2995 }
2996 if (b->encoding != REDIS_ENCODING_RAW) {
2997 snprintf(bufb,sizeof(bufb),"%ld",(long) b->ptr);
2998 bstr = bufb;
2999 bothsds = 0;
3000 } else {
3001 bstr = b->ptr;
3002 }
3003 return bothsds ? sdscmp(astr,bstr) : strcmp(astr,bstr);
3004 }
3005
3006 static size_t stringObjectLen(robj *o) {
3007 redisAssert(o->type == REDIS_STRING);
3008 if (o->encoding == REDIS_ENCODING_RAW) {
3009 return sdslen(o->ptr);
3010 } else {
3011 char buf[32];
3012
3013 return snprintf(buf,32,"%ld",(long)o->ptr);
3014 }
3015 }
3016
3017 /*============================ RDB saving/loading =========================== */
3018
/* Write the single byte 'type' to 'fp'. Returns 0 on success, -1 if the
 * byte could not be written. */
static int rdbSaveType(FILE *fp, unsigned char type) {
    return (fwrite(&type,1,1,fp) == 0) ? -1 : 0;
}
3023
/* Write the time 't' to 'fp' as a 32 bit signed integer, in the byte
 * order of the host. Returns 0 on success, -1 on write error. */
static int rdbSaveTime(FILE *fp, time_t t) {
    int32_t t32 = (int32_t) t;

    return (fwrite(&t32,sizeof(t32),1,fp) == 0) ? -1 : 0;
}
3029
3030 /* check rdbLoadLen() comments for more info */
3031 static int rdbSaveLen(FILE *fp, uint32_t len) {
3032 unsigned char buf[2];
3033
3034 if (len < (1<<6)) {
3035 /* Save a 6 bit len */
3036 buf[0] = (len&0xFF)|(REDIS_RDB_6BITLEN<<6);
3037 if (fwrite(buf,1,1,fp) == 0) return -1;
3038 } else if (len < (1<<14)) {
3039 /* Save a 14 bit len */
3040 buf[0] = ((len>>8)&0xFF)|(REDIS_RDB_14BITLEN<<6);
3041 buf[1] = len&0xFF;
3042 if (fwrite(buf,2,1,fp) == 0) return -1;
3043 } else {
3044 /* Save a 32 bit len */
3045 buf[0] = (REDIS_RDB_32BITLEN<<6);
3046 if (fwrite(buf,1,1,fp) == 0) return -1;
3047 len = htonl(len);
3048 if (fwrite(&len,4,1,fp) == 0) return -1;
3049 }
3050 return 0;
3051 }
3052
3053 /* String objects in the form "2391" "-100" without any space and with a
3054 * range of values that can fit in an 8, 16 or 32 bit signed value can be
3055 * encoded as integers to save space */
3056 static int rdbTryIntegerEncoding(char *s, size_t len, unsigned char *enc) {
3057 long long value;
3058 char *endptr, buf[32];
3059
3060 /* Check if it's possible to encode this value as a number */
3061 value = strtoll(s, &endptr, 10);
3062 if (endptr[0] != '\0') return 0;
3063 snprintf(buf,32,"%lld",value);
3064
3065 /* If the number converted back into a string is not identical
3066 * then it's not possible to encode the string as integer */
3067 if (strlen(buf) != len || memcmp(buf,s,len)) return 0;
3068
3069 /* Finally check if it fits in our ranges */
3070 if (value >= -(1<<7) && value <= (1<<7)-1) {
3071 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT8;
3072 enc[1] = value&0xFF;
3073 return 2;
3074 } else if (value >= -(1<<15) && value <= (1<<15)-1) {
3075 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT16;
3076 enc[1] = value&0xFF;
3077 enc[2] = (value>>8)&0xFF;
3078 return 3;
3079 } else if (value >= -((long long)1<<31) && value <= ((long long)1<<31)-1) {
3080 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT32;
3081 enc[1] = value&0xFF;
3082 enc[2] = (value>>8)&0xFF;
3083 enc[3] = (value>>16)&0xFF;
3084 enc[4] = (value>>24)&0xFF;
3085 return 5;
3086 } else {
3087 return 0;
3088 }
3089 }
3090
3091 static int rdbSaveLzfStringObject(FILE *fp, unsigned char *s, size_t len) {
3092 size_t comprlen, outlen;
3093 unsigned char byte;
3094 void *out;
3095
3096 /* We require at least four bytes compression for this to be worth it */
3097 if (len <= 4) return 0;
3098 outlen = len-4;
3099 if ((out = zmalloc(outlen+1)) == NULL) return 0;
3100 comprlen = lzf_compress(s, len, out, outlen);
3101 if (comprlen == 0) {
3102 zfree(out);
3103 return 0;
3104 }
3105 /* Data compressed! Let's save it on disk */
3106 byte = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_LZF;
3107 if (fwrite(&byte,1,1,fp) == 0) goto writeerr;
3108 if (rdbSaveLen(fp,comprlen) == -1) goto writeerr;
3109 if (rdbSaveLen(fp,len) == -1) goto writeerr;
3110 if (fwrite(out,comprlen,1,fp) == 0) goto writeerr;
3111 zfree(out);
3112 return comprlen;
3113
3114 writeerr:
3115 zfree(out);
3116 return -1;
3117 }
3118
3119 /* Save a string objet as [len][data] on disk. If the object is a string
3120 * representation of an integer value we try to safe it in a special form */
3121 static int rdbSaveRawString(FILE *fp, unsigned char *s, size_t len) {
3122 int enclen;
3123
3124 /* Try integer encoding */
3125 if (len <= 11) {
3126 unsigned char buf[5];
3127 if ((enclen = rdbTryIntegerEncoding((char*)s,len,buf)) > 0) {
3128 if (fwrite(buf,enclen,1,fp) == 0) return -1;
3129 return 0;
3130 }
3131 }
3132
3133 /* Try LZF compression - under 20 bytes it's unable to compress even
3134 * aaaaaaaaaaaaaaaaaa so skip it */
3135 if (server.rdbcompression && len > 20) {
3136 int retval;
3137
3138 retval = rdbSaveLzfStringObject(fp,s,len);
3139 if (retval == -1) return -1;
3140 if (retval > 0) return 0;
3141 /* retval == 0 means data can't be compressed, save the old way */
3142 }
3143
3144 /* Store verbatim */
3145 if (rdbSaveLen(fp,len) == -1) return -1;
3146 if (len && fwrite(s,len,1,fp) == 0) return -1;
3147 return 0;
3148 }
3149
3150 /* Like rdbSaveStringObjectRaw() but handle encoded objects */
3151 static int rdbSaveStringObject(FILE *fp, robj *obj) {
3152 int retval;
3153
3154 /* Avoid incr/decr ref count business when possible.
3155 * This plays well with copy-on-write given that we are probably
3156 * in a child process (BGSAVE). Also this makes sure key objects
3157 * of swapped objects are not incRefCount-ed (an assert does not allow
3158 * this in order to avoid bugs) */
3159 if (obj->encoding != REDIS_ENCODING_RAW) {
3160 obj = getDecodedObject(obj);
3161 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
3162 decrRefCount(obj);
3163 } else {
3164 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
3165 }
3166 return retval;
3167 }
3168
/* Save a double value. Doubles are saved as strings prefixed by an unsigned
 * 8 bit integer specifying the length of the representation.
 * This 8 bit integer has special values in order to specify the following
 * conditions:
 * 253: not a number
 * 254: + inf
 * 255: - inf
 *
 * Returns 0 on success, -1 on write error.
 */
static int rdbSaveDoubleValue(FILE *fp, double val) {
    unsigned char buf[128];
    char *repr = (char*)buf+1;
    int len = 1;

    if (isnan(val)) {
        buf[0] = 253;
    } else if (!isfinite(val)) {
        buf[0] = (val < 0) ? 255 : 254;
    } else {
        /* The length prefix is followed by the "%.17g" representation. */
        snprintf(repr,sizeof(buf)-1,"%.17g",val);
        buf[0] = strlen(repr);
        len += buf[0];
    }
    return (fwrite(buf,len,1,fp) == 0) ? -1 : 0;
}
3195
3196 /* Save a Redis object. */
3197 static int rdbSaveObject(FILE *fp, robj *o) {
3198 if (o->type == REDIS_STRING) {
3199 /* Save a string value */
3200 if (rdbSaveStringObject(fp,o) == -1) return -1;
3201 } else if (o->type == REDIS_LIST) {
3202 /* Save a list value */
3203 list *list = o->ptr;
3204 listIter li;
3205 listNode *ln;
3206
3207 if (rdbSaveLen(fp,listLength(list)) == -1) return -1;
3208 listRewind(list,&li);
3209 while((ln = listNext(&li))) {
3210 robj *eleobj = listNodeValue(ln);
3211
3212 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3213 }
3214 } else if (o->type == REDIS_SET) {
3215 /* Save a set value */
3216 dict *set = o->ptr;
3217 dictIterator *di = dictGetIterator(set);
3218 dictEntry *de;
3219
3220 if (rdbSaveLen(fp,dictSize(set)) == -1) return -1;
3221 while((de = dictNext(di)) != NULL) {
3222 robj *eleobj = dictGetEntryKey(de);
3223
3224 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3225 }
3226 dictReleaseIterator(di);
3227 } else if (o->type == REDIS_ZSET) {
3228 /* Save a set value */
3229 zset *zs = o->ptr;
3230 dictIterator *di = dictGetIterator(zs->dict);
3231 dictEntry *de;
3232
3233 if (rdbSaveLen(fp,dictSize(zs->dict)) == -1) return -1;
3234 while((de = dictNext(di)) != NULL) {
3235 robj *eleobj = dictGetEntryKey(de);
3236 double *score = dictGetEntryVal(de);
3237
3238 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3239 if (rdbSaveDoubleValue(fp,*score) == -1) return -1;
3240 }
3241 dictReleaseIterator(di);
3242 } else if (o->type == REDIS_HASH) {
3243 /* Save a hash value */
3244 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
3245 unsigned char *p = zipmapRewind(o->ptr);
3246 unsigned int count = zipmapLen(o->ptr);
3247 unsigned char *key, *val;
3248 unsigned int klen, vlen;
3249
3250 if (rdbSaveLen(fp,count) == -1) return -1;
3251 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
3252 if (rdbSaveRawString(fp,key,klen) == -1) return -1;
3253 if (rdbSaveRawString(fp,val,vlen) == -1) return -1;
3254 }
3255 } else {
3256 dictIterator *di = dictGetIterator(o->ptr);
3257 dictEntry *de;
3258
3259 if (rdbSaveLen(fp,dictSize((dict*)o->ptr)) == -1) return -1;
3260 while((de = dictNext(di)) != NULL) {
3261 robj *key = dictGetEntryKey(de);
3262 robj *val = dictGetEntryVal(de);
3263
3264 if (rdbSaveStringObject(fp,key) == -1) return -1;
3265 if (rdbSaveStringObject(fp,val) == -1) return -1;
3266 }
3267 dictReleaseIterator(di);
3268 }
3269 } else {
3270 redisAssert(0);
3271 }
3272 return 0;
3273 }
3274
3275 /* Return the length the object will have on disk if saved with
3276 * the rdbSaveObject() function. Currently we use a trick to get
3277 * this length with very little changes to the code. In the future
3278 * we could switch to a faster solution. */
3279 static off_t rdbSavedObjectLen(robj *o, FILE *fp) {
3280 if (fp == NULL) fp = server.devnull;
3281 rewind(fp);
3282 assert(rdbSaveObject(fp,o) != 1);
3283 return ftello(fp);
3284 }
3285
3286 /* Return the number of pages required to save this object in the swap file */
3287 static off_t rdbSavedObjectPages(robj *o, FILE *fp) {
3288 off_t bytes = rdbSavedObjectLen(o,fp);
3289
3290 return (bytes+(server.vm_page_size-1))/server.vm_page_size;
3291 }
3292
3293 /* Save the DB on disk. Return REDIS_ERR on error, REDIS_OK on success */
3294 static int rdbSave(char *filename) {
3295 dictIterator *di = NULL;
3296 dictEntry *de;
3297 FILE *fp;
3298 char tmpfile[256];
3299 int j;
3300 time_t now = time(NULL);
3301
3302 /* Wait for I/O therads to terminate, just in case this is a
3303 * foreground-saving, to avoid seeking the swap file descriptor at the
3304 * same time. */
3305 if (server.vm_enabled)
3306 waitEmptyIOJobsQueue();
3307
3308 snprintf(tmpfile,256,"temp-%d.rdb", (int) getpid());
3309 fp = fopen(tmpfile,"w");
3310 if (!fp) {
3311 redisLog(REDIS_WARNING, "Failed saving the DB: %s", strerror(errno));
3312 return REDIS_ERR;
3313 }
3314 if (fwrite("REDIS0001",9,1,fp) == 0) goto werr;
3315 for (j = 0; j < server.dbnum; j++) {
3316 redisDb *db = server.db+j;
3317 dict *d = db->dict;
3318 if (dictSize(d) == 0) continue;
3319 di = dictGetIterator(d);
3320 if (!di) {
3321 fclose(fp);
3322 return REDIS_ERR;
3323 }
3324
3325 /* Write the SELECT DB opcode */
3326 if (rdbSaveType(fp,REDIS_SELECTDB) == -1) goto werr;
3327 if (rdbSaveLen(fp,j) == -1) goto werr;
3328
3329 /* Iterate this DB writing every entry */
3330 while((de = dictNext(di)) != NULL) {
3331 robj *key = dictGetEntryKey(de);
3332 robj *o = dictGetEntryVal(de);
3333 time_t expiretime = getExpire(db,key);
3334
3335 /* Save the expire time */
3336 if (expiretime != -1) {
3337 /* If this key is already expired skip it */
3338 if (expiretime < now) continue;
3339 if (rdbSaveType(fp,REDIS_EXPIRETIME) == -1) goto werr;
3340 if (rdbSaveTime(fp,expiretime) == -1) goto werr;
3341 }
3342 /* Save the key and associated value. This requires special
3343 * handling if the value is swapped out. */
3344 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
3345 key->storage == REDIS_VM_SWAPPING) {
3346 /* Save type, key, value */
3347 if (rdbSaveType(fp,o->type) == -1) goto werr;
3348 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3349 if (rdbSaveObject(fp,o) == -1) goto werr;
3350 } else {
3351 /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
3352 robj *po;
3353 /* Get a preview of the object in memory */
3354 po = vmPreviewObject(key);
3355 /* Save type, key, value */
3356 if (rdbSaveType(fp,key->vtype) == -1) goto werr;
3357 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3358 if (rdbSaveObject(fp,po) == -1) goto werr;
3359 /* Remove the loaded object from memory */
3360 decrRefCount(po);
3361 }
3362 }
3363 dictReleaseIterator(di);
3364 }
3365 /* EOF opcode */
3366 if (rdbSaveType(fp,REDIS_EOF) == -1) goto werr;
3367
3368 /* Make sure data will not remain on the OS's output buffers */
3369 fflush(fp);
3370 fsync(fileno(fp));
3371 fclose(fp);
3372
3373 /* Use RENAME to make sure the DB file is changed atomically only
3374 * if the generate DB file is ok. */
3375 if (rename(tmpfile,filename) == -1) {
3376 redisLog(REDIS_WARNING,"Error moving temp DB file on the final destination: %s", strerror(errno));
3377 unlink(tmpfile);
3378 return REDIS_ERR;
3379 }
3380 redisLog(REDIS_NOTICE,"DB saved on disk");
3381 server.dirty = 0;
3382 server.lastsave = time(NULL);
3383 return REDIS_OK;
3384
3385 werr:
3386 fclose(fp);
3387 unlink(tmpfile);
3388 redisLog(REDIS_WARNING,"Write error saving DB on disk: %s", strerror(errno));
3389 if (di) dictReleaseIterator(di);
3390 return REDIS_ERR;
3391 }
3392
3393 static int rdbSaveBackground(char *filename) {
3394 pid_t childpid;
3395
3396 if (server.bgsavechildpid != -1) return REDIS_ERR;
3397 if (server.vm_enabled) waitEmptyIOJobsQueue();
3398 if ((childpid = fork()) == 0) {
3399 /* Child */
3400 if (server.vm_enabled) vmReopenSwapFile();
3401 close(server.fd);
3402 if (rdbSave(filename) == REDIS_OK) {
3403 _exit(0);
3404 } else {
3405 _exit(1);
3406 }
3407 } else {
3408 /* Parent */
3409 if (childpid == -1) {
3410 redisLog(REDIS_WARNING,"Can't save in background: fork: %s",
3411 strerror(errno));
3412 return REDIS_ERR;
3413 }
3414 redisLog(REDIS_NOTICE,"Background saving started by pid %d",childpid);
3415 server.bgsavechildpid = childpid;
3416 return REDIS_OK;
3417 }
3418 return REDIS_OK; /* unreached */
3419 }
3420
/* Remove the temporary file "temp-<pid>.rdb" named after 'childpid'.
 * This is the same name rdbSave() builds from getpid(). Errors removing
 * the file are ignored. */
static void rdbRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-%d.rdb", (int) childpid);
    unlink(path);
}
3427
/* Read a single byte from 'fp' and return it as a value between 0 and
 * 255, or -1 if the byte can't be read. */
static int rdbLoadType(FILE *fp) {
    unsigned char type;

    return (fread(&type,1,1,fp) == 0) ? -1 : type;
}
3433
/* Read a time stored as a 32 bit signed integer (see rdbSaveTime()) from
 * 'fp'. Returns the time, or -1 if the four bytes can't be read. */
static time_t rdbLoadTime(FILE *fp) {
    int32_t t32;

    if (fread(&t32,sizeof(t32),1,fp) == 0)
        return (time_t) -1;
    return (time_t) t32;
}
3439
3440 /* Load an encoded length from the DB, see the REDIS_RDB_* defines on the top
3441 * of this file for a description of how this are stored on disk.
3442 *
3443 * isencoded is set to 1 if the readed length is not actually a length but
3444 * an "encoding type", check the above comments for more info */
3445 static uint32_t rdbLoadLen(FILE *fp, int *isencoded) {
3446 unsigned char buf[2];
3447 uint32_t len;
3448 int type;
3449
3450 if (isencoded) *isencoded = 0;
3451 if (fread(buf,1,1,fp) == 0) return REDIS_RDB_LENERR;
3452 type = (buf[0]&0xC0)>>6;
3453 if (type == REDIS_RDB_6BITLEN) {
3454 /* Read a 6 bit len */
3455 return buf[0]&0x3F;
3456 } else if (type == REDIS_RDB_ENCVAL) {
3457 /* Read a 6 bit len encoding type */
3458 if (isencoded) *isencoded = 1;
3459 return buf[0]&0x3F;
3460 } else if (type == REDIS_RDB_14BITLEN) {
3461 /* Read a 14 bit len */
3462 if (fread(buf+1,1,1,fp) == 0) return REDIS_RDB_LENERR;
3463 return ((buf[0]&0x3F)<<8)|buf[1];
3464 } else {
3465 /* Read a 32 bit len */
3466 if (fread(&len,4,1,fp) == 0) return REDIS_RDB_LENERR;
3467 return ntohl(len);
3468 }
3469 }
3470
3471 static robj *rdbLoadIntegerObject(FILE *fp, int enctype) {
3472 unsigned char enc[4];
3473 long long val;
3474
3475 if (enctype == REDIS_RDB_ENC_INT8) {
3476 if (fread(enc,1,1,fp) == 0) return NULL;
3477 val = (signed char)enc[0];
3478 } else if (enctype == REDIS_RDB_ENC_INT16) {
3479 uint16_t v;
3480 if (fread(enc,2,1,fp) == 0) return NULL;
3481 v = enc[0]|(enc[1]<<8);
3482 val = (int16_t)v;
3483 } else if (enctype == REDIS_RDB_ENC_INT32) {
3484 uint32_t v;
3485 if (fread(enc,4,1,fp) == 0) return NULL;
3486 v = enc[0]|(enc[1]<<8)|(enc[2]<<16)|(enc[3]<<24);
3487 val = (int32_t)v;
3488 } else {
3489 val = 0; /* anti-warning */
3490 redisAssert(0);
3491 }
3492 return createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",val));
3493 }
3494
3495 static robj *rdbLoadLzfStringObject(FILE*fp) {
3496 unsigned int len, clen;
3497 unsigned char *c = NULL;
3498 sds val = NULL;
3499
3500 if ((clen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3501 if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3502 if ((c = zmalloc(clen)) == NULL) goto err;
3503 if ((val = sdsnewlen(NULL,len)) == NULL) goto err;
3504 if (fread(c,clen,1,fp) == 0) goto err;
3505 if (lzf_decompress(c,clen,val,len) == 0) goto err;
3506 zfree(c);
3507 return createObject(REDIS_STRING,val);
3508 err:
3509 zfree(c);
3510 sdsfree(val);
3511 return NULL;
3512 }
3513
3514 static robj *rdbLoadStringObject(FILE*fp) {
3515 int isencoded;
3516 uint32_t len;
3517 sds val;
3518
3519 len = rdbLoadLen(fp,&isencoded);
3520 if (isencoded) {
3521 switch(len) {
3522 case REDIS_RDB_ENC_INT8:
3523 case REDIS_RDB_ENC_INT16:
3524 case REDIS_RDB_ENC_INT32:
3525 return tryObjectSharing(rdbLoadIntegerObject(fp,len));
3526 case REDIS_RDB_ENC_LZF:
3527 return tryObjectSharing(rdbLoadLzfStringObject(fp));
3528 default:
3529 redisAssert(0);
3530 }
3531 }
3532
3533 if (len == REDIS_RDB_LENERR) return NULL;
3534 val = sdsnewlen(NULL,len);
3535 if (len && fread(val,len,1,fp) == 0) {
3536 sdsfree(val);
3537 return NULL;
3538 }
3539 return tryObjectSharing(createObject(REDIS_STRING,val));
3540 }
3541
3542 /* For information about double serialization check rdbSaveDoubleValue() */
3543 static int rdbLoadDoubleValue(FILE *fp, double *val) {
3544 char buf[128];
3545 unsigned char len;
3546
3547 if (fread(&len,1,1,fp) == 0) return -1;
3548 switch(len) {
3549 case 255: *val = R_NegInf; return 0;
3550 case 254: *val = R_PosInf; return 0;
3551 case 253: *val = R_Nan; return 0;
3552 default:
3553 if (fread(buf,len,1,fp) == 0) return -1;
3554 buf[len] = '\0';
3555 sscanf(buf, "%lg", val);
3556 return 0;
3557 }
3558 }
3559
3560 /* Load a Redis object of the specified type from the specified file.
3561 * On success a newly allocated object is returned, otherwise NULL. */
3562 static robj *rdbLoadObject(int type, FILE *fp) {
3563 robj *o;
3564
3565 redisLog(REDIS_DEBUG,"LOADING OBJECT %d (at %d)\n",type,ftell(fp));
3566 if (type == REDIS_STRING) {
3567 /* Read string value */
3568 if ((o = rdbLoadStringObject(fp)) == NULL) return NULL;
3569 tryObjectEncoding(o);
3570 } else if (type == REDIS_LIST || type == REDIS_SET) {
3571 /* Read list/set value */
3572 uint32_t listlen;
3573
3574 if ((listlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3575 o = (type == REDIS_LIST) ? createListObject() : createSetObject();
3576 /* It's faster to expand the dict to the right size asap in order
3577 * to avoid rehashing */
3578 if (type == REDIS_SET && listlen > DICT_HT_INITIAL_SIZE)
3579 dictExpand(o->ptr,listlen);
3580 /* Load every single element of the list/set */
3581 while(listlen--) {
3582 robj *ele;
3583
3584 if ((ele = rdbLoadStringObject(fp)) == NULL) return NULL;
3585 tryObjectEncoding(ele);
3586 if (type == REDIS_LIST) {
3587 listAddNodeTail((list*)o->ptr,ele);
3588 } else {
3589 dictAdd((dict*)o->ptr,ele,NULL);
3590 }
3591 }
3592 } else if (type == REDIS_ZSET) {
3593 /* Read list/set value */
3594 size_t zsetlen;
3595 zset *zs;
3596
3597 if ((zsetlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3598 o = createZsetObject();
3599 zs = o->ptr;
3600 /* Load every single element of the list/set */
3601 while(zsetlen--) {
3602 robj *ele;
3603 double *score = zmalloc(sizeof(double));
3604
3605 if ((ele = rdbLoadStringObject(fp)) == NULL) return NULL;
3606 tryObjectEncoding(ele);
3607 if (rdbLoadDoubleValue(fp,score) == -1) return NULL;
3608 dictAdd(zs->dict,ele,score);
3609 zslInsert(zs->zsl,*score,ele);
3610 incrRefCount(ele); /* added to skiplist */
3611 }
3612 } else if (type == REDIS_HASH) {
3613 size_t hashlen;
3614
3615 if ((hashlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3616 o = createHashObject();
3617 /* Too many entries? Use an hash table. */
3618 if (hashlen > server.hash_max_zipmap_entries)
3619 convertToRealHash(o);
3620 /* Load every key/value, then set it into the zipmap or hash
3621 * table, as needed. */
3622 while(hashlen--) {
3623 robj *key, *val;
3624
3625 if ((key = rdbLoadStringObject(fp)) == NULL) return NULL;
3626 if ((val = rdbLoadStringObject(fp)) == NULL) return NULL;
3627 /* If we are using a zipmap and there are too big values
3628 * the object is converted to real hash table encoding. */
3629 if (o->encoding != REDIS_ENCODING_HT &&
3630 (sdslen(key->ptr) > server.hash_max_zipmap_value ||
3631 sdslen(val->ptr) > server.hash_max_zipmap_value))
3632 {
3633 convertToRealHash(o);
3634 }
3635
3636 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
3637 unsigned char *zm = o->ptr;
3638
3639 zm = zipmapSet(zm,key->ptr,sdslen(key->ptr),
3640 val->ptr,sdslen(val->ptr),NULL);
3641 o->ptr = zm;
3642 decrRefCount(key);
3643 decrRefCount(val);
3644 } else {
3645 tryObjectEncoding(key);
3646 tryObjectEncoding(val);
3647 dictAdd((dict*)o->ptr,key,val);
3648 }
3649 }
3650 } else {
3651 redisAssert(0);
3652 }
3653 return o;
3654 }
3655
3656 static int rdbLoad(char *filename) {
3657 FILE *fp;
3658 robj *keyobj = NULL;
3659 uint32_t dbid;
3660 int type, retval, rdbver;
3661 dict *d = server.db[0].dict;
3662 redisDb *db = server.db+0;
3663 char buf[1024];
3664 time_t expiretime = -1, now = time(NULL);
3665 long long loadedkeys = 0;
3666
3667 fp = fopen(filename,"r");
3668 if (!fp) return REDIS_ERR;
3669 if (fread(buf,9,1,fp) == 0) goto eoferr;
3670 buf[9] = '\0';
3671 if (memcmp(buf,"REDIS",5) != 0) {
3672 fclose(fp);
3673 redisLog(REDIS_WARNING,"Wrong signature trying to load DB from file");
3674 return REDIS_ERR;
3675 }
3676 rdbver = atoi(buf+5);
3677 if (rdbver != 1) {
3678 fclose(fp);
3679 redisLog(REDIS_WARNING,"Can't handle RDB format version %d",rdbver);
3680 return REDIS_ERR;
3681 }
3682 while(1) {
3683 robj *o;
3684
3685 /* Read type. */
3686 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
3687 if (type == REDIS_EXPIRETIME) {
3688 if ((expiretime = rdbLoadTime(fp)) == -1) goto eoferr;
3689 /* We read the time so we need to read the object type again */
3690 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
3691 }
3692 if (type == REDIS_EOF) break;
3693 /* Handle SELECT DB opcode as a special case */
3694 if (type == REDIS_SELECTDB) {
3695 if ((dbid = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR)
3696 goto eoferr;
3697 if (dbid >= (unsigned)server.dbnum) {
3698 redisLog(REDIS_WARNING,"FATAL: Data file was created with a Redis server configured to handle more than %d databases. Exiting\n", server.dbnum);
3699 exit(1);
3700 }
3701 db = server.db+dbid;
3702 d = db->dict;
3703 continue;
3704 }
3705 /* Read key */
3706 if ((keyobj = rdbLoadStringObject(fp)) == NULL) goto eoferr;
3707 /* Read value */
3708 if ((o = rdbLoadObject(type,fp)) == NULL) goto eoferr;
3709 /* Add the new object in the hash table */
3710 retval = dictAdd(d,keyobj,o);
3711 if (retval == DICT_ERR) {
3712 redisLog(REDIS_WARNING,"Loading DB, duplicated key (%s) found! Unrecoverable error, exiting now.", keyobj->ptr);
3713 exit(1);
3714 }
3715 /* Set the expire time if needed */
3716 if (expiretime != -1) {
3717 setExpire(db,keyobj,expiretime);
3718 /* Delete this key if already expired */
3719 if (expiretime < now) deleteKey(db,keyobj);
3720 expiretime = -1;
3721 }
3722 keyobj = o = NULL;
3723 /* Handle swapping while loading big datasets when VM is on */
3724 loadedkeys++;
3725 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
3726 while (zmalloc_used_memory() > server.vm_max_memory) {
3727 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
3728 }
3729 }
3730 }
3731 fclose(fp);
3732 return REDIS_OK;
3733
3734 eoferr: /* unexpected end of file is handled here with a fatal exit */
3735 if (keyobj) decrRefCount(keyobj);
3736 redisLog(REDIS_WARNING,"Short read or OOM loading DB. Unrecoverable error, aborting now.");
3737 exit(1);
3738 return REDIS_ERR; /* Just to avoid warning */
3739 }
3740
3741 /*================================== Commands =============================== */
3742
3743 static void authCommand(redisClient *c) {
3744 if (!server.requirepass || !strcmp(c->argv[1]->ptr, server.requirepass)) {
3745 c->authenticated = 1;
3746 addReply(c,shared.ok);
3747 } else {
3748 c->authenticated = 0;
3749 addReplySds(c,sdscatprintf(sdsempty(),"-ERR invalid password\r\n"));
3750 }
3751 }
3752
3753 static void pingCommand(redisClient *c) {
3754 addReply(c,shared.pong);
3755 }
3756
3757 static void echoCommand(redisClient *c) {
3758 addReplyBulk(c,c->argv[1]);
3759 }
3760
3761 /*=================================== Strings =============================== */
3762
3763 static void setGenericCommand(redisClient *c, int nx) {
3764 int retval;
3765
3766 if (nx) deleteIfVolatile(c->db,c->argv[1]);
3767 retval = dictAdd(c->db->dict,c->argv[1],c->argv[2]);
3768 if (retval == DICT_ERR) {
3769 if (!nx) {
3770 /* If the key is about a swapped value, we want a new key object
3771 * to overwrite the old. So we delete the old key in the database.
3772 * This will also make sure that swap pages about the old object
3773 * will be marked as free. */
3774 if (server.vm_enabled && deleteIfSwapped(c->db,c->argv[1]))
3775 incrRefCount(c->argv[1]);
3776 dictReplace(c->db->dict,c->argv[1],c->argv[2]);
3777 incrRefCount(c->argv[2]);
3778 } else {
3779 addReply(c,shared.czero);
3780 return;
3781 }
3782 } else {
3783 incrRefCount(c->argv[1]);
3784 incrRefCount(c->argv[2]);
3785 }
3786 server.dirty++;
3787 removeExpire(c->db,c->argv[1]);
3788 addReply(c, nx ? shared.cone : shared.ok);
3789 }
3790
3791 static void setCommand(redisClient *c) {
3792 setGenericCommand(c,0);
3793 }
3794
3795 static void setnxCommand(redisClient *c) {
3796 setGenericCommand(c,1);
3797 }
3798
3799 static int getGenericCommand(redisClient *c) {
3800 robj *o;
3801
3802 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL)
3803 return REDIS_OK;
3804
3805 if (o->type != REDIS_STRING) {
3806 addReply(c,shared.wrongtypeerr);
3807 return REDIS_ERR;
3808 } else {
3809 addReplyBulk(c,o);
3810 return REDIS_OK;
3811 }
3812 }
3813
3814 static void getCommand(redisClient *c) {
3815 getGenericCommand(c);
3816 }
3817
3818 static void getsetCommand(redisClient *c) {
3819 if (getGenericCommand(c) == REDIS_ERR) return;
3820 if (dictAdd(c->db->dict,c->argv[1],c->argv[2]) == DICT_ERR) {
3821 dictReplace(c->db->dict,c->argv[1],c->argv[2]);
3822 } else {
3823 incrRefCount(c->argv[1]);
3824 }
3825 incrRefCount(c->argv[2]);
3826 server.dirty++;
3827 removeExpire(c->db,c->argv[1]);
3828 }
3829
3830 static void mgetCommand(redisClient *c) {
3831 int j;
3832
3833 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-1));
3834 for (j = 1; j < c->argc; j++) {
3835 robj *o = lookupKeyRead(c->db,c->argv[j]);
3836 if (o == NULL) {
3837 addReply(c,shared.nullbulk);
3838 } else {
3839 if (o->type != REDIS_STRING) {
3840 addReply(c,shared.nullbulk);
3841 } else {
3842 addReplyBulk(c,o);
3843 }
3844 }
3845 }
3846 }
3847
3848 static void msetGenericCommand(redisClient *c, int nx) {
3849 int j, busykeys = 0;
3850
3851 if ((c->argc % 2) == 0) {
3852 addReplySds(c,sdsnew("-ERR wrong number of arguments for MSET\r\n"));
3853 return;
3854 }
3855 /* Handle the NX flag. The MSETNX semantic is to return zero and don't
3856 * set nothing at all if at least one already key exists. */
3857 if (nx) {
3858 for (j = 1; j < c->argc; j += 2) {
3859 if (lookupKeyWrite(c->db,c->argv[j]) != NULL) {
3860 busykeys++;
3861 }
3862 }
3863 }
3864 if (busykeys) {
3865 addReply(c, shared.czero);
3866 return;
3867 }
3868
3869 for (j = 1; j < c->argc; j += 2) {
3870 int retval;
3871
3872 tryObjectEncoding(c->argv[j+1]);
3873 retval = dictAdd(c->db->dict,c->argv[j],c->argv[j+1]);
3874 if (retval == DICT_ERR) {
3875 dictReplace(c->db->dict,c->argv[j],c->argv[j+1]);
3876 incrRefCount(c->argv[j+1]);
3877 } else {
3878 incrRefCount(c->argv[j]);
3879 incrRefCount(c->argv[j+1]);
3880 }
3881 removeExpire(c->db,c->argv[j]);
3882 }
3883 server.dirty += (c->argc-1)/2;
3884 addReply(c, nx ? shared.cone : shared.ok);
3885 }
3886
3887 static void msetCommand(redisClient *c) {
3888 msetGenericCommand(c,0);
3889 }
3890
3891 static void msetnxCommand(redisClient *c) {
3892 msetGenericCommand(c,1);
3893 }
3894
3895 static void incrDecrCommand(redisClient *c, long long incr) {
3896 long long value;
3897 int retval;
3898 robj *o;
3899
3900 o = lookupKeyWrite(c->db,c->argv[1]);
3901 if (o == NULL) {
3902 value = 0;
3903 } else {
3904 if (o->type != REDIS_STRING) {
3905 value = 0;
3906 } else {
3907 char *eptr;
3908
3909 if (o->encoding == REDIS_ENCODING_RAW)
3910 value = strtoll(o->ptr, &eptr, 10);
3911 else if (o->encoding == REDIS_ENCODING_INT)
3912 value = (long)o->ptr;
3913 else
3914 redisAssert(1 != 1);
3915 }
3916 }
3917
3918 value += incr;
3919 o = createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",value));
3920 tryObjectEncoding(o);
3921 retval = dictAdd(c->db->dict,c->argv[1],o);
3922 if (retval == DICT_ERR) {
3923 dictReplace(c->db->dict,c->argv[1],o);
3924 removeExpire(c->db,c->argv[1]);
3925 } else {
3926 incrRefCount(c->argv[1]);
3927 }
3928 server.dirty++;
3929 addReply(c,shared.colon);
3930 addReply(c,o);
3931 addReply(c,shared.crlf);
3932 }
3933
3934 static void incrCommand(redisClient *c) {
3935 incrDecrCommand(c,1);
3936 }
3937
3938 static void decrCommand(redisClient *c) {
3939 incrDecrCommand(c,-1);
3940 }
3941
3942 static void incrbyCommand(redisClient *c) {
3943 long long incr = strtoll(c->argv[2]->ptr, NULL, 10);
3944 incrDecrCommand(c,incr);
3945 }
3946
3947 static void decrbyCommand(redisClient *c) {
3948 long long incr = strtoll(c->argv[2]->ptr, NULL, 10);
3949 incrDecrCommand(c,-incr);
3950 }
3951
3952 static void appendCommand(redisClient *c) {
3953 int retval;
3954 size_t totlen;
3955 robj *o;
3956
3957 o = lookupKeyWrite(c->db,c->argv[1]);
3958 if (o == NULL) {
3959 /* Create the key */
3960 retval = dictAdd(c->db->dict,c->argv[1],c->argv[2]);
3961 incrRefCount(c->argv[1]);
3962 incrRefCount(c->argv[2]);
3963 totlen = stringObjectLen(c->argv[2]);
3964 } else {
3965 dictEntry *de;
3966
3967 de = dictFind(c->db->dict,c->argv[1]);
3968 assert(de != NULL);
3969
3970 o = dictGetEntryVal(de);
3971 if (o->type != REDIS_STRING) {
3972 addReply(c,shared.wrongtypeerr);
3973 return;
3974 }
3975 /* If the object is specially encoded or shared we have to make
3976 * a copy */
3977 if (o->refcount != 1 || o->encoding != REDIS_ENCODING_RAW) {
3978 robj *decoded = getDecodedObject(o);
3979
3980 o = createStringObject(decoded->ptr, sdslen(decoded->ptr));
3981 decrRefCount(decoded);
3982 dictReplace(c->db->dict,c->argv[1],o);
3983 }
3984 /* APPEND! */
3985 if (c->argv[2]->encoding == REDIS_ENCODING_RAW) {
3986 o->ptr = sdscatlen(o->ptr,
3987 c->argv[2]->ptr, sdslen(c->argv[2]->ptr));
3988 } else {
3989 o->ptr = sdscatprintf(o->ptr, "%ld",
3990 (unsigned long) c->argv[2]->ptr);
3991 }
3992 totlen = sdslen(o->ptr);
3993 }
3994 server.dirty++;
3995 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",(unsigned long)totlen));
3996 }
3997
3998 static void substrCommand(redisClient *c) {
3999 robj *o;
4000 long start = atoi(c->argv[2]->ptr);
4001 long end = atoi(c->argv[3]->ptr);
4002 size_t rangelen, strlen;
4003 sds range;
4004
4005 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4006 checkType(c,o,REDIS_STRING)) return;
4007
4008 o = getDecodedObject(o);
4009 strlen = sdslen(o->ptr);
4010
4011 /* convert negative indexes */
4012 if (start < 0) start = strlen+start;
4013 if (end < 0) end = strlen+end;
4014 if (start < 0) start = 0;
4015 if (end < 0) end = 0;
4016
4017 /* indexes sanity checks */
4018 if (start > end || (size_t)start >= strlen) {
4019 /* Out of range start or start > end result in null reply */
4020 addReply(c,shared.nullbulk);
4021 decrRefCount(o);
4022 return;
4023 }
4024 if ((size_t)end >= strlen) end = strlen-1;
4025 rangelen = (end-start)+1;
4026
4027 /* Return the result */
4028 addReplySds(c,sdscatprintf(sdsempty(),"$%zu\r\n",rangelen));
4029 range = sdsnewlen((char*)o->ptr+start,rangelen);
4030 addReplySds(c,range);
4031 addReply(c,shared.crlf);
4032 decrRefCount(o);
4033 }
4034
4035 /* ========================= Type agnostic commands ========================= */
4036
4037 static void delCommand(redisClient *c) {
4038 int deleted = 0, j;
4039
4040 for (j = 1; j < c->argc; j++) {
4041 if (deleteKey(c->db,c->argv[j])) {
4042 server.dirty++;
4043 deleted++;
4044 }
4045 }
4046 addReplyLong(c,deleted);
4047 }
4048
4049 static void existsCommand(redisClient *c) {
4050 addReply(c,lookupKeyRead(c->db,c->argv[1]) ? shared.cone : shared.czero);
4051 }
4052
4053 static void selectCommand(redisClient *c) {
4054 int id = atoi(c->argv[1]->ptr);
4055
4056 if (selectDb(c,id) == REDIS_ERR) {
4057 addReplySds(c,sdsnew("-ERR invalid DB index\r\n"));
4058 } else {
4059 addReply(c,shared.ok);
4060 }
4061 }
4062
4063 static void randomkeyCommand(redisClient *c) {
4064 dictEntry *de;
4065
4066 while(1) {
4067 de = dictGetRandomKey(c->db->dict);
4068 if (!de || expireIfNeeded(c->db,dictGetEntryKey(de)) == 0) break;
4069 }
4070 if (de == NULL) {
4071 addReply(c,shared.plus);
4072 addReply(c,shared.crlf);
4073 } else {
4074 addReply(c,shared.plus);
4075 addReply(c,dictGetEntryKey(de));
4076 addReply(c,shared.crlf);
4077 }
4078 }
4079
4080 static void keysCommand(redisClient *c) {
4081 dictIterator *di;
4082 dictEntry *de;
4083 sds pattern = c->argv[1]->ptr;
4084 int plen = sdslen(pattern);
4085 unsigned long numkeys = 0;
4086 robj *lenobj = createObject(REDIS_STRING,NULL);
4087
4088 di = dictGetIterator(c->db->dict);
4089 addReply(c,lenobj);
4090 decrRefCount(lenobj);
4091 while((de = dictNext(di)) != NULL) {
4092 robj *keyobj = dictGetEntryKey(de);
4093
4094 sds key = keyobj->ptr;
4095 if ((pattern[0] == '*' && pattern[1] == '\0') ||
4096 stringmatchlen(pattern,plen,key,sdslen(key),0)) {
4097 if (expireIfNeeded(c->db,keyobj) == 0) {
4098 addReplyBulk(c,keyobj);
4099 numkeys++;
4100 }
4101 }
4102 }
4103 dictReleaseIterator(di);
4104 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",numkeys);
4105 }
4106
4107 static void dbsizeCommand(redisClient *c) {
4108 addReplySds(c,
4109 sdscatprintf(sdsempty(),":%lu\r\n",dictSize(c->db->dict)));
4110 }
4111
4112 static void lastsaveCommand(redisClient *c) {
4113 addReplySds(c,
4114 sdscatprintf(sdsempty(),":%lu\r\n",server.lastsave));
4115 }
4116
4117 static void typeCommand(redisClient *c) {
4118 robj *o;
4119 char *type;
4120
4121 o = lookupKeyRead(c->db,c->argv[1]);
4122 if (o == NULL) {
4123 type = "+none";
4124 } else {
4125 switch(o->type) {
4126 case REDIS_STRING: type = "+string"; break;
4127 case REDIS_LIST: type = "+list"; break;
4128 case REDIS_SET: type = "+set"; break;
4129 case REDIS_ZSET: type = "+zset"; break;
4130 case REDIS_HASH: type = "+hash"; break;
4131 default: type = "+unknown"; break;
4132 }
4133 }
4134 addReplySds(c,sdsnew(type));
4135 addReply(c,shared.crlf);
4136 }
4137
4138 static void saveCommand(redisClient *c) {
4139 if (server.bgsavechildpid != -1) {
4140 addReplySds(c,sdsnew("-ERR background save in progress\r\n"));
4141 return;
4142 }
4143 if (rdbSave(server.dbfilename) == REDIS_OK) {
4144 addReply(c,shared.ok);
4145 } else {
4146 addReply(c,shared.err);
4147 }
4148 }
4149
4150 static void bgsaveCommand(redisClient *c) {
4151 if (server.bgsavechildpid != -1) {
4152 addReplySds(c,sdsnew("-ERR background save already in progress\r\n"));
4153 return;
4154 }
4155 if (rdbSaveBackground(server.dbfilename) == REDIS_OK) {
4156 char *status = "+Background saving started\r\n";
4157 addReplySds(c,sdsnew(status));
4158 } else {
4159 addReply(c,shared.err);
4160 }
4161 }
4162
4163 static void shutdownCommand(redisClient *c) {
4164 redisLog(REDIS_WARNING,"User requested shutdown, saving DB...");
4165 /* Kill the saving child if there is a background saving in progress.
4166 We want to avoid race conditions, for instance our saving child may
4167 overwrite the synchronous saving did by SHUTDOWN. */
4168 if (server.bgsavechildpid != -1) {
4169 redisLog(REDIS_WARNING,"There is a live saving child. Killing it!");
4170 kill(server.bgsavechildpid,SIGKILL);
4171 rdbRemoveTempFile(server.bgsavechildpid);
4172 }
4173 if (server.appendonly) {
4174 /* Append only file: fsync() the AOF and exit */
4175 fsync(server.appendfd);
4176 if (server.vm_enabled) unlink(server.vm_swap_file);
4177 exit(0);
4178 } else {
4179 /* Snapshotting. Perform a SYNC SAVE and exit */
4180 if (rdbSave(server.dbfilename) == REDIS_OK) {
4181 if (server.daemonize)
4182 unlink(server.pidfile);
4183 redisLog(REDIS_WARNING,"%zu bytes used at exit",zmalloc_used_memory());
4184 redisLog(REDIS_WARNING,"Server exit now, bye bye...");
4185 if (server.vm_enabled) unlink(server.vm_swap_file);
4186 exit(0);
4187 } else {
4188 /* Ooops.. error saving! The best we can do is to continue
4189 * operating. Note that if there was a background saving process,
4190 * in the next cron() Redis will be notified that the background
4191 * saving aborted, handling special stuff like slaves pending for
4192 * synchronization... */
4193 redisLog(REDIS_WARNING,"Error trying to save the DB, can't exit");
4194 addReplySds(c,
4195 sdsnew("-ERR can't quit, problems saving the DB\r\n"));
4196 }
4197 }
4198 }
4199
4200 static void renameGenericCommand(redisClient *c, int nx) {
4201 robj *o;
4202
4203 /* To use the same key as src and dst is probably an error */
4204 if (sdscmp(c->argv[1]->ptr,c->argv[2]->ptr) == 0) {
4205 addReply(c,shared.sameobjecterr);
4206 return;
4207 }
4208
4209 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL)
4210 return;
4211
4212 incrRefCount(o);
4213 deleteIfVolatile(c->db,c->argv[2]);
4214 if (dictAdd(c->db->dict,c->argv[2],o) == DICT_ERR) {
4215 if (nx) {
4216 decrRefCount(o);
4217 addReply(c,shared.czero);
4218 return;
4219 }
4220 dictReplace(c->db->dict,c->argv[2],o);
4221 } else {
4222 incrRefCount(c->argv[2]);
4223 }
4224 deleteKey(c->db,c->argv[1]);
4225 server.dirty++;
4226 addReply(c,nx ? shared.cone : shared.ok);
4227 }
4228
4229 static void renameCommand(redisClient *c) {
4230 renameGenericCommand(c,0);
4231 }
4232
4233 static void renamenxCommand(redisClient *c) {
4234 renameGenericCommand(c,1);
4235 }
4236
4237 static void moveCommand(redisClient *c) {
4238 robj *o;
4239 redisDb *src, *dst;
4240 int srcid;
4241
4242 /* Obtain source and target DB pointers */
4243 src = c->db;
4244 srcid = c->db->id;
4245 if (selectDb(c,atoi(c->argv[2]->ptr)) == REDIS_ERR) {
4246 addReply(c,shared.outofrangeerr);
4247 return;
4248 }
4249 dst = c->db;
4250 selectDb(c,srcid); /* Back to the source DB */
4251
4252 /* If the user is moving using as target the same
4253 * DB as the source DB it is probably an error. */
4254 if (src == dst) {
4255 addReply(c,shared.sameobjecterr);
4256 return;
4257 }
4258
4259 /* Check if the element exists and get a reference */
4260 o = lookupKeyWrite(c->db,c->argv[1]);
4261 if (!o) {
4262 addReply(c,shared.czero);
4263 return;
4264 }
4265
4266 /* Try to add the element to the target DB */
4267 deleteIfVolatile(dst,c->argv[1]);
4268 if (dictAdd(dst->dict,c->argv[1],o) == DICT_ERR) {
4269 addReply(c,shared.czero);
4270 return;
4271 }
4272 incrRefCount(c->argv[1]);
4273 incrRefCount(o);
4274
4275 /* OK! key moved, free the entry in the source DB */
4276 deleteKey(src,c->argv[1]);
4277 server.dirty++;
4278 addReply(c,shared.cone);
4279 }
4280
4281 /* =================================== Lists ================================ */
4282 static void pushGenericCommand(redisClient *c, int where) {
4283 robj *lobj;
4284 list *list;
4285
4286 lobj = lookupKeyWrite(c->db,c->argv[1]);
4287 if (lobj == NULL) {
4288 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
4289 addReply(c,shared.cone);
4290 return;
4291 }
4292 lobj = createListObject();
4293 list = lobj->ptr;
4294 if (where == REDIS_HEAD) {
4295 listAddNodeHead(list,c->argv[2]);
4296 } else {
4297 listAddNodeTail(list,c->argv[2]);
4298 }
4299 dictAdd(c->db->dict,c->argv[1],lobj);
4300 incrRefCount(c->argv[1]);
4301 incrRefCount(c->argv[2]);
4302 } else {
4303 if (lobj->type != REDIS_LIST) {
4304 addReply(c,shared.wrongtypeerr);
4305 return;
4306 }
4307 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
4308 addReply(c,shared.cone);
4309 return;
4310 }
4311 list = lobj->ptr;
4312 if (where == REDIS_HEAD) {
4313 listAddNodeHead(list,c->argv[2]);
4314 } else {
4315 listAddNodeTail(list,c->argv[2]);
4316 }
4317 incrRefCount(c->argv[2]);
4318 }
4319 server.dirty++;
4320 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",listLength(list)));
4321 }
4322
4323 static void lpushCommand(redisClient *c) {
4324 pushGenericCommand(c,REDIS_HEAD);
4325 }
4326
4327 static void rpushCommand(redisClient *c) {
4328 pushGenericCommand(c,REDIS_TAIL);
4329 }
4330
4331 static void llenCommand(redisClient *c) {
4332 robj *o;
4333 list *l;
4334
4335 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4336 checkType(c,o,REDIS_LIST)) return;
4337
4338 l = o->ptr;
4339 addReplyUlong(c,listLength(l));
4340 }
4341
4342 static void lindexCommand(redisClient *c) {
4343 robj *o;
4344 int index = atoi(c->argv[2]->ptr);
4345 list *list;
4346 listNode *ln;
4347
4348 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4349 checkType(c,o,REDIS_LIST)) return;
4350 list = o->ptr;
4351
4352 ln = listIndex(list, index);
4353 if (ln == NULL) {
4354 addReply(c,shared.nullbulk);
4355 } else {
4356 robj *ele = listNodeValue(ln);
4357 addReplyBulk(c,ele);
4358 }
4359 }
4360
4361 static void lsetCommand(redisClient *c) {
4362 robj *o;
4363 int index = atoi(c->argv[2]->ptr);
4364 list *list;
4365 listNode *ln;
4366
4367 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL ||
4368 checkType(c,o,REDIS_LIST)) return;
4369 list = o->ptr;
4370
4371 ln = listIndex(list, index);
4372 if (ln == NULL) {
4373 addReply(c,shared.outofrangeerr);
4374 } else {
4375 robj *ele = listNodeValue(ln);
4376
4377 decrRefCount(ele);
4378 listNodeValue(ln) = c->argv[3];
4379 incrRefCount(c->argv[3]);
4380 addReply(c,shared.ok);
4381 server.dirty++;
4382 }
4383 }
4384
4385 static void popGenericCommand(redisClient *c, int where) {
4386 robj *o;
4387 list *list;
4388 listNode *ln;
4389
4390 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4391 checkType(c,o,REDIS_LIST)) return;
4392 list = o->ptr;
4393
4394 if (where == REDIS_HEAD)
4395 ln = listFirst(list);
4396 else
4397 ln = listLast(list);
4398
4399 if (ln == NULL) {
4400 addReply(c,shared.nullbulk);
4401 } else {
4402 robj *ele = listNodeValue(ln);
4403 addReplyBulk(c,ele);
4404 listDelNode(list,ln);
4405 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
4406 server.dirty++;
4407 }
4408 }
4409
4410 static void lpopCommand(redisClient *c) {
4411 popGenericCommand(c,REDIS_HEAD);
4412 }
4413
4414 static void rpopCommand(redisClient *c) {
4415 popGenericCommand(c,REDIS_TAIL);
4416 }
4417
4418 static void lrangeCommand(redisClient *c) {
4419 robj *o;
4420 int start = atoi(c->argv[2]->ptr);
4421 int end = atoi(c->argv[3]->ptr);
4422 int llen;
4423 int rangelen, j;
4424 list *list;
4425 listNode *ln;
4426 robj *ele;
4427
4428 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullmultibulk)) == NULL ||
4429 checkType(c,o,REDIS_LIST)) return;
4430 list = o->ptr;
4431 llen = listLength(list);
4432
4433 /* convert negative indexes */
4434 if (start < 0) start = llen+start;
4435 if (end < 0) end = llen+end;
4436 if (start < 0) start = 0;
4437 if (end < 0) end = 0;
4438
4439 /* indexes sanity checks */
4440 if (start > end || start >= llen) {
4441 /* Out of range start or start > end result in empty list */
4442 addReply(c,shared.emptymultibulk);
4443 return;
4444 }
4445 if (end >= llen) end = llen-1;
4446 rangelen = (end-start)+1;
4447
4448 /* Return the result in form of a multi-bulk reply */
4449 ln = listIndex(list, start);
4450 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",rangelen));
4451 for (j = 0; j < rangelen; j++) {
4452 ele = listNodeValue(ln);
4453 addReplyBulk(c,ele);
4454 ln = ln->next;
4455 }
4456 }
4457
4458 static void ltrimCommand(redisClient *c) {
4459 robj *o;
4460 int start = atoi(c->argv[2]->ptr);
4461 int end = atoi(c->argv[3]->ptr);
4462 int llen;
4463 int j, ltrim, rtrim;
4464 list *list;
4465 listNode *ln;
4466
4467 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.ok)) == NULL ||
4468 checkType(c,o,REDIS_LIST)) return;
4469 list = o->ptr;
4470 llen = listLength(list);
4471
4472 /* convert negative indexes */
4473 if (start < 0) start = llen+start;
4474 if (end < 0) end = llen+end;
4475 if (start < 0) start = 0;
4476 if (end < 0) end = 0;
4477
4478 /* indexes sanity checks */
4479 if (start > end || start >= llen) {
4480 /* Out of range start or start > end result in empty list */
4481 ltrim = llen;
4482 rtrim = 0;
4483 } else {
4484 if (end >= llen) end = llen-1;
4485 ltrim = start;
4486 rtrim = llen-end-1;
4487 }
4488
4489 /* Remove list elements to perform the trim */
4490 for (j = 0; j < ltrim; j++) {
4491 ln = listFirst(list);
4492 listDelNode(list,ln);
4493 }
4494 for (j = 0; j < rtrim; j++) {
4495 ln = listLast(list);
4496 listDelNode(list,ln);
4497 }
4498 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
4499 server.dirty++;
4500 addReply(c,shared.ok);
4501 }
4502
4503 static void lremCommand(redisClient *c) {
4504 robj *o;
4505 list *list;
4506 listNode *ln, *next;
4507 int toremove = atoi(c->argv[2]->ptr);
4508 int removed = 0;
4509 int fromtail = 0;
4510
4511 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
4512 checkType(c,o,REDIS_LIST)) return;
4513 list = o->ptr;
4514
4515 if (toremove < 0) {
4516 toremove = -toremove;
4517 fromtail = 1;
4518 }
4519 ln = fromtail ? list->tail : list->head;
4520 while (ln) {
4521 robj *ele = listNodeValue(ln);
4522
4523 next = fromtail ? ln->prev : ln->next;
4524 if (compareStringObjects(ele,c->argv[3]) == 0) {
4525 listDelNode(list,ln);
4526 server.dirty++;
4527 removed++;
4528 if (toremove && removed == toremove) break;
4529 }
4530 ln = next;
4531 }
4532 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
4533 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",removed));
4534 }
4535
4536 /* This is the semantic of this command:
4537 * RPOPLPUSH srclist dstlist:
4538 * IF LLEN(srclist) > 0
4539 * element = RPOP srclist
4540 * LPUSH dstlist element
4541 * RETURN element
4542 * ELSE
4543 * RETURN nil
4544 * END
4545 * END
4546 *
4547 * The idea is to be able to get an element from a list in a reliable way
4548 * since the element is not just returned but pushed against another list
4549 * as well. This command was originally proposed by Ezra Zygmuntowicz.
4550 */
4551 static void rpoplpushcommand(redisClient *c) {
4552 robj *sobj;
4553 list *srclist;
4554 listNode *ln;
4555
4556 if ((sobj = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4557 checkType(c,sobj,REDIS_LIST)) return;
4558 srclist = sobj->ptr;
4559 ln = listLast(srclist);
4560
4561 if (ln == NULL) {
4562 addReply(c,shared.nullbulk);
4563 } else {
4564 robj *dobj = lookupKeyWrite(c->db,c->argv[2]);
4565 robj *ele = listNodeValue(ln);
4566 list *dstlist;
4567
4568 if (dobj && dobj->type != REDIS_LIST) {
4569 addReply(c,shared.wrongtypeerr);
4570 return;
4571 }
4572
4573 /* Add the element to the target list (unless it's directly
4574 * passed to some BLPOP-ing client */
4575 if (!handleClientsWaitingListPush(c,c->argv[2],ele)) {
4576 if (dobj == NULL) {
4577 /* Create the list if the key does not exist */
4578 dobj = createListObject();
4579 dictAdd(c->db->dict,c->argv[2],dobj);
4580 incrRefCount(c->argv[2]);
4581 }
4582 dstlist = dobj->ptr;
4583 listAddNodeHead(dstlist,ele);
4584 incrRefCount(ele);
4585 }
4586
4587 /* Send the element to the client as reply as well */
4588 addReplyBulk(c,ele);
4589
4590 /* Finally remove the element from the source list */
4591 listDelNode(srclist,ln);
4592 if (listLength(srclist) == 0) deleteKey(c->db,c->argv[1]);
4593 server.dirty++;
4594 }
4595 }
4596
4597 /* ==================================== Sets ================================ */
4598
4599 static void saddCommand(redisClient *c) {
4600 robj *set;
4601
4602 set = lookupKeyWrite(c->db,c->argv[1]);
4603 if (set == NULL) {
4604 set = createSetObject();
4605 dictAdd(c->db->dict,c->argv[1],set);
4606 incrRefCount(c->argv[1]);
4607 } else {
4608 if (set->type != REDIS_SET) {
4609 addReply(c,shared.wrongtypeerr);
4610 return;
4611 }
4612 }
4613 if (dictAdd(set->ptr,c->argv[2],NULL) == DICT_OK) {
4614 incrRefCount(c->argv[2]);
4615 server.dirty++;
4616 addReply(c,shared.cone);
4617 } else {
4618 addReply(c,shared.czero);
4619 }
4620 }
4621
4622 static void sremCommand(redisClient *c) {
4623 robj *set;
4624
4625 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
4626 checkType(c,set,REDIS_SET)) return;
4627
4628 if (dictDelete(set->ptr,c->argv[2]) == DICT_OK) {
4629 server.dirty++;
4630 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
4631 if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
4632 addReply(c,shared.cone);
4633 } else {
4634 addReply(c,shared.czero);
4635 }
4636 }
4637
4638 static void smoveCommand(redisClient *c) {
4639 robj *srcset, *dstset;
4640
4641 srcset = lookupKeyWrite(c->db,c->argv[1]);
4642 dstset = lookupKeyWrite(c->db,c->argv[2]);
4643
4644 /* If the source key does not exist return 0, if it's of the wrong type
4645 * raise an error */
4646 if (srcset == NULL || srcset->type != REDIS_SET) {
4647 addReply(c, srcset ? shared.wrongtypeerr : shared.czero);
4648 return;
4649 }
4650 /* Error if the destination key is not a set as well */
4651 if (dstset && dstset->type != REDIS_SET) {
4652 addReply(c,shared.wrongtypeerr);
4653 return;
4654 }
4655 /* Remove the element from the source set */
4656 if (dictDelete(srcset->ptr,c->argv[3]) == DICT_ERR) {
4657 /* Key not found in the src set! return zero */
4658 addReply(c,shared.czero);
4659 return;
4660 }
4661 if (dictSize((dict*)srcset->ptr) == 0 && srcset != dstset)
4662 deleteKey(c->db,c->argv[1]);
4663 server.dirty++;
4664 /* Add the element to the destination set */
4665 if (!dstset) {
4666 dstset = createSetObject();
4667 dictAdd(c->db->dict,c->argv[2],dstset);
4668 incrRefCount(c->argv[2]);
4669 }
4670 if (dictAdd(dstset->ptr,c->argv[3],NULL) == DICT_OK)
4671 incrRefCount(c->argv[3]);
4672 addReply(c,shared.cone);
4673 }
4674
4675 static void sismemberCommand(redisClient *c) {
4676 robj *set;
4677
4678 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4679 checkType(c,set,REDIS_SET)) return;
4680
4681 if (dictFind(set->ptr,c->argv[2]))
4682 addReply(c,shared.cone);
4683 else
4684 addReply(c,shared.czero);
4685 }
4686
4687 static void scardCommand(redisClient *c) {
4688 robj *o;
4689 dict *s;
4690
4691 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4692 checkType(c,o,REDIS_SET)) return;
4693
4694 s = o->ptr;
4695 addReplyUlong(c,dictSize(s));
4696 }
4697
4698 static void spopCommand(redisClient *c) {
4699 robj *set;
4700 dictEntry *de;
4701
4702 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4703 checkType(c,set,REDIS_SET)) return;
4704
4705 de = dictGetRandomKey(set->ptr);
4706 if (de == NULL) {
4707 addReply(c,shared.nullbulk);
4708 } else {
4709 robj *ele = dictGetEntryKey(de);
4710
4711 addReplyBulk(c,ele);
4712 dictDelete(set->ptr,ele);
4713 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
4714 if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
4715 server.dirty++;
4716 }
4717 }
4718
4719 static void srandmemberCommand(redisClient *c) {
4720 robj *set;
4721 dictEntry *de;
4722
4723 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4724 checkType(c,set,REDIS_SET)) return;
4725
4726 de = dictGetRandomKey(set->ptr);
4727 if (de == NULL) {
4728 addReply(c,shared.nullbulk);
4729 } else {
4730 robj *ele = dictGetEntryKey(de);
4731
4732 addReplyBulk(c,ele);
4733 }
4734 }
4735
4736 static int qsortCompareSetsByCardinality(const void *s1, const void *s2) {
4737 dict **d1 = (void*) s1, **d2 = (void*) s2;
4738
4739 return dictSize(*d1)-dictSize(*d2);
4740 }
4741
4742 static void sinterGenericCommand(redisClient *c, robj **setskeys, unsigned long setsnum, robj *dstkey) {
4743 dict **dv = zmalloc(sizeof(dict*)*setsnum);
4744 dictIterator *di;
4745 dictEntry *de;
4746 robj *lenobj = NULL, *dstset = NULL;
4747 unsigned long j, cardinality = 0;
4748
4749 for (j = 0; j < setsnum; j++) {
4750 robj *setobj;
4751
4752 setobj = dstkey ?
4753 lookupKeyWrite(c->db,setskeys[j]) :
4754 lookupKeyRead(c->db,setskeys[j]);
4755 if (!setobj) {
4756 zfree(dv);
4757 if (dstkey) {
4758 if (deleteKey(c->db,dstkey))
4759 server.dirty++;
4760 addReply(c,shared.czero);
4761 } else {
4762 addReply(c,shared.nullmultibulk);
4763 }
4764 return;
4765 }
4766 if (setobj->type != REDIS_SET) {
4767 zfree(dv);
4768 addReply(c,shared.wrongtypeerr);
4769 return;
4770 }
4771 dv[j] = setobj->ptr;
4772 }
4773 /* Sort sets from the smallest to largest, this will improve our
4774 * algorithm's performace */
4775 qsort(dv,setsnum,sizeof(dict*),qsortCompareSetsByCardinality);
4776
4777 /* The first thing we should output is the total number of elements...
4778 * since this is a multi-bulk write, but at this stage we don't know
4779 * the intersection set size, so we use a trick, append an empty object
4780 * to the output list and save the pointer to later modify it with the
4781 * right length */
4782 if (!dstkey) {
4783 lenobj = createObject(REDIS_STRING,NULL);
4784 addReply(c,lenobj);
4785 decrRefCount(lenobj);
4786 } else {
4787 /* If we have a target key where to store the resulting set
4788 * create this key with an empty set inside */
4789 dstset = createSetObject();
4790 }
4791
4792 /* Iterate all the elements of the first (smallest) set, and test
4793 * the element against all the other sets, if at least one set does
4794 * not include the element it is discarded */
4795 di = dictGetIterator(dv[0]);
4796
4797 while((de = dictNext(di)) != NULL) {
4798 robj *ele;
4799
4800 for (j = 1; j < setsnum; j++)
4801 if (dictFind(dv[j],dictGetEntryKey(de)) == NULL) break;
4802 if (j != setsnum)
4803 continue; /* at least one set does not contain the member */
4804 ele = dictGetEntryKey(de);
4805 if (!dstkey) {
4806 addReplyBulk(c,ele);
4807 cardinality++;
4808 } else {
4809 dictAdd(dstset->ptr,ele,NULL);
4810 incrRefCount(ele);
4811 }
4812 }
4813 dictReleaseIterator(di);
4814
4815 if (dstkey) {
4816 /* Store the resulting set into the target, if the intersection
4817 * is not an empty set. */
4818 deleteKey(c->db,dstkey);
4819 if (dictSize((dict*)dstset->ptr) > 0) {
4820 dictAdd(c->db->dict,dstkey,dstset);
4821 incrRefCount(dstkey);
4822 addReplyLong(c,dictSize((dict*)dstset->ptr));
4823 } else {
4824 decrRefCount(dstset);
4825 addReply(c,shared.czero);
4826 }
4827 server.dirty++;
4828 } else {
4829 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",cardinality);
4830 }
4831 zfree(dv);
4832 }
4833
4834 static void sinterCommand(redisClient *c) {
4835 sinterGenericCommand(c,c->argv+1,c->argc-1,NULL);
4836 }
4837
4838 static void sinterstoreCommand(redisClient *c) {
4839 sinterGenericCommand(c,c->argv+2,c->argc-2,c->argv[1]);
4840 }
4841
4842 #define REDIS_OP_UNION 0
4843 #define REDIS_OP_DIFF 1
4844 #define REDIS_OP_INTER 2
4845
4846 static void sunionDiffGenericCommand(redisClient *c, robj **setskeys, int setsnum, robj *dstkey, int op) {
4847 dict **dv = zmalloc(sizeof(dict*)*setsnum);
4848 dictIterator *di;
4849 dictEntry *de;
4850 robj *dstset = NULL;
4851 int j, cardinality = 0;
4852
4853 for (j = 0; j < setsnum; j++) {
4854 robj *setobj;
4855
4856 setobj = dstkey ?
4857 lookupKeyWrite(c->db,setskeys[j]) :
4858 lookupKeyRead(c->db,setskeys[j]);
4859 if (!setobj) {
4860 dv[j] = NULL;
4861 continue;
4862 }
4863 if (setobj->type != REDIS_SET) {
4864 zfree(dv);
4865 addReply(c,shared.wrongtypeerr);
4866 return;
4867 }
4868 dv[j] = setobj->ptr;
4869 }
4870
4871 /* We need a temp set object to store our union. If the dstkey
4872 * is not NULL (that is, we are inside an SUNIONSTORE operation) then
4873 * this set object will be the resulting object to set into the target key*/
4874 dstset = createSetObject();
4875
4876 /* Iterate all the elements of all the sets, add every element a single
4877 * time to the result set */
4878 for (j = 0; j < setsnum; j++) {
4879 if (op == REDIS_OP_DIFF && j == 0 && !dv[j]) break; /* result set is empty */
4880 if (!dv[j]) continue; /* non existing keys are like empty sets */
4881
4882 di = dictGetIterator(dv[j]);
4883
4884 while((de = dictNext(di)) != NULL) {
4885 robj *ele;
4886
4887 /* dictAdd will not add the same element multiple times */
4888 ele = dictGetEntryKey(de);
4889 if (op == REDIS_OP_UNION || j == 0) {
4890 if (dictAdd(dstset->ptr,ele,NULL) == DICT_OK) {
4891 incrRefCount(ele);
4892 cardinality++;
4893 }
4894 } else if (op == REDIS_OP_DIFF) {
4895 if (dictDelete(dstset->ptr,ele) == DICT_OK) {
4896 cardinality--;
4897 }
4898 }
4899 }
4900 dictReleaseIterator(di);
4901
4902 /* result set is empty? Exit asap. */
4903 if (op == REDIS_OP_DIFF && cardinality == 0) break;
4904 }
4905
4906 /* Output the content of the resulting set, if not in STORE mode */
4907 if (!dstkey) {
4908 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",cardinality));
4909 di = dictGetIterator(dstset->ptr);
4910 while((de = dictNext(di)) != NULL) {
4911 robj *ele;
4912
4913 ele = dictGetEntryKey(de);
4914 addReplyBulk(c,ele);
4915 }
4916 dictReleaseIterator(di);
4917 decrRefCount(dstset);
4918 } else {
4919 /* If we have a target key where to store the resulting set
4920 * create this key with the result set inside */
4921 deleteKey(c->db,dstkey);
4922 if (dictSize((dict*)dstset->ptr) > 0) {
4923 dictAdd(c->db->dict,dstkey,dstset);
4924 incrRefCount(dstkey);
4925 addReplyLong(c,dictSize((dict*)dstset->ptr));
4926 } else {
4927 decrRefCount(dstset);
4928 addReply(c,shared.czero);
4929 }
4930 server.dirty++;
4931 }
4932 zfree(dv);
4933 }
4934
4935 static void sunionCommand(redisClient *c) {
4936 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_UNION);
4937 }
4938
4939 static void sunionstoreCommand(redisClient *c) {
4940 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_UNION);
4941 }
4942
4943 static void sdiffCommand(redisClient *c) {
4944 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_DIFF);
4945 }
4946
4947 static void sdiffstoreCommand(redisClient *c) {
4948 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_DIFF);
4949 }
4950
4951 /* ==================================== ZSets =============================== */
4952
/* ZSETs are ordered sets using two data structures to hold the same elements
 * in order to get O(log(N)) INSERT and REMOVE operations into a sorted
 * data structure.
 *
 * The elements are added to a hash table mapping Redis objects to scores.
 * At the same time the elements are added to a skip list mapping scores
 * to Redis objects (so objects are sorted by scores in this "view"). */
4960
4961 /* This skiplist implementation is almost a C translation of the original
4962 * algorithm described by William Pugh in "Skip Lists: A Probabilistic
4963 * Alternative to Balanced Trees", modified in three ways:
4964 * a) this implementation allows for repeated values.
4965 * b) the comparison is not just by key (our 'score') but by satellite data.
4966 * c) there is a back pointer, so it's a doubly linked list with the back
4967 * pointers being only at "level 1". This allows to traverse the list
4968 * from tail to head, useful for ZREVRANGE. */
4969
4970 static zskiplistNode *zslCreateNode(int level, double score, robj *obj) {
4971 zskiplistNode *zn = zmalloc(sizeof(*zn));
4972
4973 zn->forward = zmalloc(sizeof(zskiplistNode*) * level);
4974 if (level > 0)
4975 zn->span = zmalloc(sizeof(unsigned int) * (level - 1));
4976 zn->score = score;
4977 zn->obj = obj;
4978 return zn;
4979 }
4980
4981 static zskiplist *zslCreate(void) {
4982 int j;
4983 zskiplist *zsl;
4984
4985 zsl = zmalloc(sizeof(*zsl));
4986 zsl->level = 1;
4987 zsl->length = 0;
4988 zsl->header = zslCreateNode(ZSKIPLIST_MAXLEVEL,0,NULL);
4989 for (j = 0; j < ZSKIPLIST_MAXLEVEL; j++) {
4990 zsl->header->forward[j] = NULL;
4991
4992 /* span has space for ZSKIPLIST_MAXLEVEL-1 elements */
4993 if (j < ZSKIPLIST_MAXLEVEL-1)
4994 zsl->header->span[j] = 0;
4995 }
4996 zsl->header->backward = NULL;
4997 zsl->tail = NULL;
4998 return zsl;
4999 }
5000
5001 static void zslFreeNode(zskiplistNode *node) {
5002 decrRefCount(node->obj);
5003 zfree(node->forward);
5004 zfree(node->span);
5005 zfree(node);
5006 }
5007
5008 static void zslFree(zskiplist *zsl) {
5009 zskiplistNode *node = zsl->header->forward[0], *next;
5010
5011 zfree(zsl->header->forward);
5012 zfree(zsl->header->span);
5013 zfree(zsl->header);
5014 while(node) {
5015 next = node->forward[0];
5016 zslFreeNode(node);
5017 node = next;
5018 }
5019 zfree(zsl);
5020 }
5021
5022 static int zslRandomLevel(void) {
5023 int level = 1;
5024 while ((random()&0xFFFF) < (ZSKIPLIST_P * 0xFFFF))
5025 level += 1;
5026 return level;
5027 }
5028
5029 static void zslInsert(zskiplist *zsl, double score, robj *obj) {
5030 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5031 unsigned int rank[ZSKIPLIST_MAXLEVEL];
5032 int i, level;
5033
5034 x = zsl->header;
5035 for (i = zsl->level-1; i >= 0; i--) {
5036 /* store rank that is crossed to reach the insert position */
5037 rank[i] = i == (zsl->level-1) ? 0 : rank[i+1];
5038
5039 while (x->forward[i] &&
5040 (x->forward[i]->score < score ||
5041 (x->forward[i]->score == score &&
5042 compareStringObjects(x->forward[i]->obj,obj) < 0))) {
5043 rank[i] += i > 0 ? x->span[i-1] : 1;
5044 x = x->forward[i];
5045 }
5046 update[i] = x;
5047 }
5048 /* we assume the key is not already inside, since we allow duplicated
5049 * scores, and the re-insertion of score and redis object should never
5050 * happpen since the caller of zslInsert() should test in the hash table
5051 * if the element is already inside or not. */
5052 level = zslRandomLevel();
5053 if (level > zsl->level) {
5054 for (i = zsl->level; i < level; i++) {
5055 rank[i] = 0;
5056 update[i] = zsl->header;
5057 update[i]->span[i-1] = zsl->length;
5058 }
5059 zsl->level = level;
5060 }
5061 x = zslCreateNode(level,score,obj);
5062 for (i = 0; i < level; i++) {
5063 x->forward[i] = update[i]->forward[i];
5064 update[i]->forward[i] = x;
5065
5066 /* update span covered by update[i] as x is inserted here */
5067 if (i > 0) {
5068 x->span[i-1] = update[i]->span[i-1] - (rank[0] - rank[i]);
5069 update[i]->span[i-1] = (rank[0] - rank[i]) + 1;
5070 }
5071 }
5072
5073 /* increment span for untouched levels */
5074 for (i = level; i < zsl->level; i++) {
5075 update[i]->span[i-1]++;
5076 }
5077
5078 x->backward = (update[0] == zsl->header) ? NULL : update[0];
5079 if (x->forward[0])
5080 x->forward[0]->backward = x;
5081 else
5082 zsl->tail = x;
5083 zsl->length++;
5084 }
5085
5086 /* Internal function used by zslDelete, zslDeleteByScore and zslDeleteByRank */
5087 void zslDeleteNode(zskiplist *zsl, zskiplistNode *x, zskiplistNode **update) {
5088 int i;
5089 for (i = 0; i < zsl->level; i++) {
5090 if (update[i]->forward[i] == x) {
5091 if (i > 0) {
5092 update[i]->span[i-1] += x->span[i-1] - 1;
5093 }
5094 update[i]->forward[i] = x->forward[i];
5095 } else {
5096 /* invariant: i > 0, because update[0]->forward[0]
5097 * is always equal to x */
5098 update[i]->span[i-1] -= 1;
5099 }
5100 }
5101 if (x->forward[0]) {
5102 x->forward[0]->backward = x->backward;
5103 } else {
5104 zsl->tail = x->backward;
5105 }
5106 while(zsl->level > 1 && zsl->header->forward[zsl->level-1] == NULL)
5107 zsl->level--;
5108 zsl->length--;
5109 }
5110
5111 /* Delete an element with matching score/object from the skiplist. */
5112 static int zslDelete(zskiplist *zsl, double score, robj *obj) {
5113 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5114 int i;
5115
5116 x = zsl->header;
5117 for (i = zsl->level-1; i >= 0; i--) {
5118 while (x->forward[i] &&
5119 (x->forward[i]->score < score ||
5120 (x->forward[i]->score == score &&
5121 compareStringObjects(x->forward[i]->obj,obj) < 0)))
5122 x = x->forward[i];
5123 update[i] = x;
5124 }
5125 /* We may have multiple elements with the same score, what we need
5126 * is to find the element with both the right score and object. */
5127 x = x->forward[0];
5128 if (x && score == x->score && compareStringObjects(x->obj,obj) == 0) {
5129 zslDeleteNode(zsl, x, update);
5130 zslFreeNode(x);
5131 return 1;
5132 } else {
5133 return 0; /* not found */
5134 }
5135 return 0; /* not found */
5136 }
5137
5138 /* Delete all the elements with score between min and max from the skiplist.
5139 * Min and mx are inclusive, so a score >= min || score <= max is deleted.
5140 * Note that this function takes the reference to the hash table view of the
5141 * sorted set, in order to remove the elements from the hash table too. */
5142 static unsigned long zslDeleteRangeByScore(zskiplist *zsl, double min, double max, dict *dict) {
5143 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5144 unsigned long removed = 0;
5145 int i;
5146
5147 x = zsl->header;
5148 for (i = zsl->level-1; i >= 0; i--) {
5149 while (x->forward[i] && x->forward[i]->score < min)
5150 x = x->forward[i];
5151 update[i] = x;
5152 }
5153 /* We may have multiple elements with the same score, what we need
5154 * is to find the element with both the right score and object. */
5155 x = x->forward[0];
5156 while (x && x->score <= max) {
5157 zskiplistNode *next = x->forward[0];
5158 zslDeleteNode(zsl, x, update);
5159 dictDelete(dict,x->obj);
5160 zslFreeNode(x);
5161 removed++;
5162 x = next;
5163 }
5164 return removed; /* not found */
5165 }
5166
5167 /* Delete all the elements with rank between start and end from the skiplist.
5168 * Start and end are inclusive. Note that start and end need to be 1-based */
5169 static unsigned long zslDeleteRangeByRank(zskiplist *zsl, unsigned int start, unsigned int end, dict *dict) {
5170 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5171 unsigned long traversed = 0, removed = 0;
5172 int i;
5173
5174 x = zsl->header;
5175 for (i = zsl->level-1; i >= 0; i--) {
5176 while (x->forward[i] && (traversed + (i > 0 ? x->span[i-1] : 1)) < start) {
5177 traversed += i > 0 ? x->span[i-1] : 1;
5178 x = x->forward[i];
5179 }
5180 update[i] = x;
5181 }
5182
5183 traversed++;
5184 x = x->forward[0];
5185 while (x && traversed <= end) {
5186 zskiplistNode *next = x->forward[0];
5187 zslDeleteNode(zsl, x, update);
5188 dictDelete(dict,x->obj);
5189 zslFreeNode(x);
5190 removed++;
5191 traversed++;
5192 x = next;
5193 }
5194 return removed;
5195 }
5196
5197 /* Find the first node having a score equal or greater than the specified one.
5198 * Returns NULL if there is no match. */
5199 static zskiplistNode *zslFirstWithScore(zskiplist *zsl, double score) {
5200 zskiplistNode *x;
5201 int i;
5202
5203 x = zsl->header;
5204 for (i = zsl->level-1; i >= 0; i--) {
5205 while (x->forward[i] && x->forward[i]->score < score)
5206 x = x->forward[i];
5207 }
5208 /* We may have multiple elements with the same score, what we need
5209 * is to find the element with both the right score and object. */
5210 return x->forward[0];
5211 }
5212
5213 /* Find the rank for an element by both score and key.
5214 * Returns 0 when the element cannot be found, rank otherwise.
5215 * Note that the rank is 1-based due to the span of zsl->header to the
5216 * first element. */
5217 static unsigned long zslGetRank(zskiplist *zsl, double score, robj *o) {
5218 zskiplistNode *x;
5219 unsigned long rank = 0;
5220 int i;
5221
5222 x = zsl->header;
5223 for (i = zsl->level-1; i >= 0; i--) {
5224 while (x->forward[i] &&
5225 (x->forward[i]->score < score ||
5226 (x->forward[i]->score == score &&
5227 compareStringObjects(x->forward[i]->obj,o) <= 0))) {
5228 rank += i > 0 ? x->span[i-1] : 1;
5229 x = x->forward[i];
5230 }
5231
5232 /* x might be equal to zsl->header, so test if obj is non-NULL */
5233 if (x->obj && compareStringObjects(x->obj,o) == 0) {
5234 return rank;
5235 }
5236 }
5237 return 0;
5238 }
5239
5240 /* Finds an element by its rank. The rank argument needs to be 1-based. */
5241 zskiplistNode* zslGetElementByRank(zskiplist *zsl, unsigned long rank) {
5242 zskiplistNode *x;
5243 unsigned long traversed = 0;
5244 int i;
5245
5246 x = zsl->header;
5247 for (i = zsl->level-1; i >= 0; i--) {
5248 while (x->forward[i] && (traversed + (i>0 ? x->span[i-1] : 1)) <= rank)
5249 {
5250 traversed += i > 0 ? x->span[i-1] : 1;
5251 x = x->forward[i];
5252 }
5253 if (traversed == rank) {
5254 return x;
5255 }
5256 }
5257 return NULL;
5258 }
5259
5260 /* The actual Z-commands implementations */
5261
5262 /* This generic command implements both ZADD and ZINCRBY.
5263 * scoreval is the score if the operation is a ZADD (doincrement == 0) or
5264 * the increment if the operation is a ZINCRBY (doincrement == 1). */
5265 static void zaddGenericCommand(redisClient *c, robj *key, robj *ele, double scoreval, int doincrement) {
5266 robj *zsetobj;
5267 zset *zs;
5268 double *score;
5269
5270 zsetobj = lookupKeyWrite(c->db,key);
5271 if (zsetobj == NULL) {
5272 zsetobj = createZsetObject();
5273 dictAdd(c->db->dict,key,zsetobj);
5274 incrRefCount(key);
5275 } else {
5276 if (zsetobj->type != REDIS_ZSET) {
5277 addReply(c,shared.wrongtypeerr);
5278 return;
5279 }
5280 }
5281 zs = zsetobj->ptr;
5282
5283 /* Ok now since we implement both ZADD and ZINCRBY here the code
5284 * needs to handle the two different conditions. It's all about setting
5285 * '*score', that is, the new score to set, to the right value. */
5286 score = zmalloc(sizeof(double));
5287 if (doincrement) {
5288 dictEntry *de;
5289
5290 /* Read the old score. If the element was not present starts from 0 */
5291 de = dictFind(zs->dict,ele);
5292 if (de) {
5293 double *oldscore = dictGetEntryVal(de);
5294 *score = *oldscore + scoreval;
5295 } else {
5296 *score = scoreval;
5297 }
5298 } else {
5299 *score = scoreval;
5300 }
5301
5302 /* What follows is a simple remove and re-insert operation that is common
5303 * to both ZADD and ZINCRBY... */
5304 if (dictAdd(zs->dict,ele,score) == DICT_OK) {
5305 /* case 1: New element */
5306 incrRefCount(ele); /* added to hash */
5307 zslInsert(zs->zsl,*score,ele);
5308 incrRefCount(ele); /* added to skiplist */
5309 server.dirty++;
5310 if (doincrement)
5311 addReplyDouble(c,*score);
5312 else
5313 addReply(c,shared.cone);
5314 } else {
5315 dictEntry *de;
5316 double *oldscore;
5317
5318 /* case 2: Score update operation */
5319 de = dictFind(zs->dict,ele);
5320 redisAssert(de != NULL);
5321 oldscore = dictGetEntryVal(de);
5322 if (*score != *oldscore) {
5323 int deleted;
5324
5325 /* Remove and insert the element in the skip list with new score */
5326 deleted = zslDelete(zs->zsl,*oldscore,ele);
5327 redisAssert(deleted != 0);
5328 zslInsert(zs->zsl,*score,ele);
5329 incrRefCount(ele);
5330 /* Update the score in the hash table */
5331 dictReplace(zs->dict,ele,score);
5332 server.dirty++;
5333 } else {
5334 zfree(score);
5335 }
5336 if (doincrement)
5337 addReplyDouble(c,*score);
5338 else
5339 addReply(c,shared.czero);
5340 }
5341 }
5342
5343 static void zaddCommand(redisClient *c) {
5344 double scoreval;
5345
5346 scoreval = strtod(c->argv[2]->ptr,NULL);
5347 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,0);
5348 }
5349
5350 static void zincrbyCommand(redisClient *c) {
5351 double scoreval;
5352
5353 scoreval = strtod(c->argv[2]->ptr,NULL);
5354 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,1);
5355 }
5356
5357 static void zremCommand(redisClient *c) {
5358 robj *zsetobj;
5359 zset *zs;
5360 dictEntry *de;
5361 double *oldscore;
5362 int deleted;
5363
5364 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5365 checkType(c,zsetobj,REDIS_ZSET)) return;
5366
5367 zs = zsetobj->ptr;
5368 de = dictFind(zs->dict,c->argv[2]);
5369 if (de == NULL) {
5370 addReply(c,shared.czero);
5371 return;
5372 }
5373 /* Delete from the skiplist */
5374 oldscore = dictGetEntryVal(de);
5375 deleted = zslDelete(zs->zsl,*oldscore,c->argv[2]);
5376 redisAssert(deleted != 0);
5377
5378 /* Delete from the hash table */
5379 dictDelete(zs->dict,c->argv[2]);
5380 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5381 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
5382 server.dirty++;
5383 addReply(c,shared.cone);
5384 }
5385
5386 static void zremrangebyscoreCommand(redisClient *c) {
5387 double min = strtod(c->argv[2]->ptr,NULL);
5388 double max = strtod(c->argv[3]->ptr,NULL);
5389 long deleted;
5390 robj *zsetobj;
5391 zset *zs;
5392
5393 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5394 checkType(c,zsetobj,REDIS_ZSET)) return;
5395
5396 zs = zsetobj->ptr;
5397 deleted = zslDeleteRangeByScore(zs->zsl,min,max,zs->dict);
5398 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5399 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
5400 server.dirty += deleted;
5401 addReplyLong(c,deleted);
5402 }
5403
5404 static void zremrangebyrankCommand(redisClient *c) {
5405 int start = atoi(c->argv[2]->ptr);
5406 int end = atoi(c->argv[3]->ptr);
5407 int llen;
5408 long deleted;
5409 robj *zsetobj;
5410 zset *zs;
5411
5412 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5413 checkType(c,zsetobj,REDIS_ZSET)) return;
5414 zs = zsetobj->ptr;
5415 llen = zs->zsl->length;
5416
5417 /* convert negative indexes */
5418 if (start < 0) start = llen+start;
5419 if (end < 0) end = llen+end;
5420 if (start < 0) start = 0;
5421 if (end < 0) end = 0;
5422
5423 /* indexes sanity checks */
5424 if (start > end || start >= llen) {
5425 addReply(c,shared.czero);
5426 return;
5427 }
5428 if (end >= llen) end = llen-1;
5429
5430 /* increment start and end because zsl*Rank functions
5431 * use 1-based rank */
5432 deleted = zslDeleteRangeByRank(zs->zsl,start+1,end+1,zs->dict);
5433 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5434 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
5435 server.dirty += deleted;
5436 addReplyLong(c, deleted);
5437 }
5438
5439 typedef struct {
5440 dict *dict;
5441 double weight;
5442 } zsetopsrc;
5443
5444 static int qsortCompareZsetopsrcByCardinality(const void *s1, const void *s2) {
5445 zsetopsrc *d1 = (void*) s1, *d2 = (void*) s2;
5446 unsigned long size1, size2;
5447 size1 = d1->dict ? dictSize(d1->dict) : 0;
5448 size2 = d2->dict ? dictSize(d2->dict) : 0;
5449 return size1 - size2;
5450 }
5451
/* How the scores of an element present in multiple inputs are combined. */
#define REDIS_AGGR_SUM 1
#define REDIS_AGGR_MIN 2
#define REDIS_AGGR_MAX 3

/* Combine 'val' into '*target' according to 'aggregate': add it for
 * REDIS_AGGR_SUM, keep the smaller of the two for REDIS_AGGR_MIN, the
 * larger for REDIS_AGGR_MAX. Any other value of 'aggregate' triggers an
 * assertion. */
inline static void zunionInterAggregate(double *target, double val, int aggregate) {
    switch (aggregate) {
    case REDIS_AGGR_SUM:
        *target = *target + val;
        break;
    case REDIS_AGGR_MIN:
        if (val < *target) *target = val;
        break;
    case REDIS_AGGR_MAX:
        if (val > *target) *target = val;
        break;
    default:
        /* safety net */
        redisAssert(0 != 0);
        break;
    }
}
5468
5469 static void zunionInterGenericCommand(redisClient *c, robj *dstkey, int op) {
5470 int i, j, zsetnum;
5471 int aggregate = REDIS_AGGR_SUM;
5472 zsetopsrc *src;
5473 robj *dstobj;
5474 zset *dstzset;
5475 dictIterator *di;
5476 dictEntry *de;
5477
5478 /* expect zsetnum input keys to be given */
5479 zsetnum = atoi(c->argv[2]->ptr);
5480 if (zsetnum < 1) {
5481 addReplySds(c,sdsnew("-ERR at least 1 input key is needed for ZUNION/ZINTER\r\n"));
5482 return;
5483 }
5484
5485 /* test if the expected number of keys would overflow */
5486 if (3+zsetnum > c->argc) {
5487 addReply(c,shared.syntaxerr);
5488 return;
5489 }
5490
5491 /* read keys to be used for input */
5492 src = zmalloc(sizeof(zsetopsrc) * zsetnum);
5493 for (i = 0, j = 3; i < zsetnum; i++, j++) {
5494 robj *zsetobj = lookupKeyWrite(c->db,c->argv[j]);
5495 if (!zsetobj) {
5496 src[i].dict = NULL;
5497 } else {
5498 if (zsetobj->type != REDIS_ZSET) {
5499 zfree(src);
5500 addReply(c,shared.wrongtypeerr);
5501 return;
5502 }
5503 src[i].dict = ((zset*)zsetobj->ptr)->dict;
5504 }
5505
5506 /* default all weights to 1 */
5507 src[i].weight = 1.0;
5508 }
5509
5510 /* parse optional extra arguments */
5511 if (j < c->argc) {
5512 int remaining = c->argc - j;
5513
5514 while (remaining) {
5515 if (remaining >= (zsetnum + 1) && !strcasecmp(c->argv[j]->ptr,"weights")) {
5516 j++; remaining--;
5517 for (i = 0; i < zsetnum; i++, j++, remaining--) {
5518 src[i].weight = strtod(c->argv[j]->ptr, NULL);
5519 }
5520 } else if (remaining >= 2 && !strcasecmp(c->argv[j]->ptr,"aggregate")) {
5521 j++; remaining--;
5522 if (!strcasecmp(c->argv[j]->ptr,"sum")) {
5523 aggregate = REDIS_AGGR_SUM;
5524 } else if (!strcasecmp(c->argv[j]->ptr,"min")) {
5525 aggregate = REDIS_AGGR_MIN;
5526 } else if (!strcasecmp(c->argv[j]->ptr,"max")) {
5527 aggregate = REDIS_AGGR_MAX;
5528 } else {
5529 zfree(src);
5530 addReply(c,shared.syntaxerr);
5531 return;
5532 }
5533 j++; remaining--;
5534 } else {
5535 zfree(src);
5536 addReply(c,shared.syntaxerr);
5537 return;
5538 }
5539 }
5540 }
5541
5542 /* sort sets from the smallest to largest, this will improve our
5543 * algorithm's performance */
5544 qsort(src,zsetnum,sizeof(zsetopsrc), qsortCompareZsetopsrcByCardinality);
5545
5546 dstobj = createZsetObject();
5547 dstzset = dstobj->ptr;
5548
5549 if (op == REDIS_OP_INTER) {
5550 /* skip going over all entries if the smallest zset is NULL or empty */
5551 if (src[0].dict && dictSize(src[0].dict) > 0) {
5552 /* precondition: as src[0].dict is non-empty and the zsets are ordered
5553 * from small to large, all src[i > 0].dict are non-empty too */
5554 di = dictGetIterator(src[0].dict);
5555 while((de = dictNext(di)) != NULL) {
5556 double *score = zmalloc(sizeof(double)), value;
5557 *score = src[0].weight * (*(double*)dictGetEntryVal(de));
5558
5559 for (j = 1; j < zsetnum; j++) {
5560 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
5561 if (other) {
5562 value = src[j].weight * (*(double*)dictGetEntryVal(other));
5563 zunionInterAggregate(score, value, aggregate);
5564 } else {
5565 break;
5566 }
5567 }
5568
5569 /* skip entry when not present in every source dict */
5570 if (j != zsetnum) {
5571 zfree(score);
5572 } else {
5573 robj *o = dictGetEntryKey(de);
5574 dictAdd(dstzset->dict,o,score);
5575 incrRefCount(o); /* added to dictionary */
5576 zslInsert(dstzset->zsl,*score,o);
5577 incrRefCount(o); /* added to skiplist */
5578 }
5579 }
5580 dictReleaseIterator(di);
5581 }
5582 } else if (op == REDIS_OP_UNION) {
5583 for (i = 0; i < zsetnum; i++) {
5584 if (!src[i].dict) continue;
5585
5586 di = dictGetIterator(src[i].dict);
5587 while((de = dictNext(di)) != NULL) {
5588 /* skip key when already processed */
5589 if (dictFind(dstzset->dict,dictGetEntryKey(de)) != NULL) continue;
5590
5591 double *score = zmalloc(sizeof(double)), value;
5592 *score = src[i].weight * (*(double*)dictGetEntryVal(de));
5593
5594 /* because the zsets are sorted by size, its only possible
5595 * for sets at larger indices to hold this entry */
5596 for (j = (i+1); j < zsetnum; j++) {
5597 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
5598 if (other) {
5599 value = src[j].weight * (*(double*)dictGetEntryVal(other));
5600 zunionInterAggregate(score, value, aggregate);
5601 }
5602 }
5603
5604 robj *o = dictGetEntryKey(de);
5605 dictAdd(dstzset->dict,o,score);
5606 incrRefCount(o); /* added to dictionary */
5607 zslInsert(dstzset->zsl,*score,o);
5608 incrRefCount(o); /* added to skiplist */
5609 }
5610 dictReleaseIterator(di);
5611 }
5612 } else {
5613 /* unknown operator */
5614 redisAssert(op == REDIS_OP_INTER || op == REDIS_OP_UNION);
5615 }
5616
5617 deleteKey(c->db,dstkey);
5618 if (dstzset->zsl->length) {
5619 dictAdd(c->db->dict,dstkey,dstobj);
5620 incrRefCount(dstkey);
5621 addReplyLong(c, dstzset->zsl->length);
5622 server.dirty++;
5623 } else {
5624 decrRefCount(dstzset);
5625 addReply(c, shared.czero);
5626 }
5627 zfree(src);
5628 }
5629
5630 static void zunionCommand(redisClient *c) {
5631 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_UNION);
5632 }
5633
5634 static void zinterCommand(redisClient *c) {
5635 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_INTER);
5636 }
5637
5638 static void zrangeGenericCommand(redisClient *c, int reverse) {
5639 robj *o;
5640 int start = atoi(c->argv[2]->ptr);
5641 int end = atoi(c->argv[3]->ptr);
5642 int withscores = 0;
5643 int llen;
5644 int rangelen, j;
5645 zset *zsetobj;
5646 zskiplist *zsl;
5647 zskiplistNode *ln;
5648 robj *ele;
5649
5650 if (c->argc == 5 && !strcasecmp(c->argv[4]->ptr,"withscores")) {
5651 withscores = 1;
5652 } else if (c->argc >= 5) {
5653 addReply(c,shared.syntaxerr);
5654 return;
5655 }
5656
5657 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullmultibulk)) == NULL ||
5658 checkType(c,o,REDIS_ZSET)) return;
5659 zsetobj = o->ptr;
5660 zsl = zsetobj->zsl;
5661 llen = zsl->length;
5662
5663 /* convert negative indexes */
5664 if (start < 0) start = llen+start;
5665 if (end < 0) end = llen+end;
5666 if (start < 0) start = 0;
5667 if (end < 0) end = 0;
5668
5669 /* indexes sanity checks */
5670 if (start > end || start >= llen) {
5671 /* Out of range start or start > end result in empty list */
5672 addReply(c,shared.emptymultibulk);
5673 return;
5674 }
5675 if (end >= llen) end = llen-1;
5676 rangelen = (end-start)+1;
5677
5678 /* check if starting point is trivial, before searching
5679 * the element in log(N) time */
5680 if (reverse) {
5681 ln = start == 0 ? zsl->tail : zslGetElementByRank(zsl, llen-start);
5682 } else {
5683 ln = start == 0 ?
5684 zsl->header->forward[0] : zslGetElementByRank(zsl, start+1);
5685 }
5686
5687 /* Return the result in form of a multi-bulk reply */
5688 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",
5689 withscores ? (rangelen*2) : rangelen));
5690 for (j = 0; j < rangelen; j++) {
5691 ele = ln->obj;
5692 addReplyBulk(c,ele);
5693 if (withscores)
5694 addReplyDouble(c,ln->score);
5695 ln = reverse ? ln->backward : ln->forward[0];
5696 }
5697 }
5698
5699 static void zrangeCommand(redisClient *c) {
5700 zrangeGenericCommand(c,0);
5701 }
5702
5703 static void zrevrangeCommand(redisClient *c) {
5704 zrangeGenericCommand(c,1);
5705 }
5706
5707 /* This command implements both ZRANGEBYSCORE and ZCOUNT.
5708 * If justcount is non-zero, just the count is returned. */
5709 static void genericZrangebyscoreCommand(redisClient *c, int justcount) {
5710 robj *o;
5711 double min, max;
5712 int minex = 0, maxex = 0; /* are min or max exclusive? */
5713 int offset = 0, limit = -1;
5714 int withscores = 0;
5715 int badsyntax = 0;
5716
5717 /* Parse the min-max interval. If one of the values is prefixed
5718 * by the "(" character, it's considered "open". For instance
5719 * ZRANGEBYSCORE zset (1.5 (2.5 will match min < x < max
5720 * ZRANGEBYSCORE zset 1.5 2.5 will instead match min <= x <= max */
5721 if (((char*)c->argv[2]->ptr)[0] == '(') {
5722 min = strtod((char*)c->argv[2]->ptr+1,NULL);
5723 minex = 1;
5724 } else {
5725 min = strtod(c->argv[2]->ptr,NULL);
5726 }
5727 if (((char*)c->argv[3]->ptr)[0] == '(') {
5728 max = strtod((char*)c->argv[3]->ptr+1,NULL);
5729 maxex = 1;
5730 } else {
5731 max = strtod(c->argv[3]->ptr,NULL);
5732 }
5733
5734 /* Parse "WITHSCORES": note that if the command was called with
5735 * the name ZCOUNT then we are sure that c->argc == 4, so we'll never
5736 * enter the following paths to parse WITHSCORES and LIMIT. */
5737 if (c->argc == 5 || c->argc == 8) {
5738 if (strcasecmp(c->argv[c->argc-1]->ptr,"withscores") == 0)
5739 withscores = 1;
5740 else
5741 badsyntax = 1;
5742 }
5743 if (c->argc != (4 + withscores) && c->argc != (7 + withscores))
5744 badsyntax = 1;
5745 if (badsyntax) {
5746 addReplySds(c,
5747 sdsnew("-ERR wrong number of arguments for ZRANGEBYSCORE\r\n"));
5748 return;
5749 }
5750
5751 /* Parse "LIMIT" */
5752 if (c->argc == (7 + withscores) && strcasecmp(c->argv[4]->ptr,"limit")) {
5753 addReply(c,shared.syntaxerr);
5754 return;
5755 } else if (c->argc == (7 + withscores)) {
5756 offset = atoi(c->argv[5]->ptr);
5757 limit = atoi(c->argv[6]->ptr);
5758 if (offset < 0) offset = 0;
5759 }
5760
5761 /* Ok, lookup the key and get the range */
5762 o = lookupKeyRead(c->db,c->argv[1]);
5763 if (o == NULL) {
5764 addReply(c,justcount ? shared.czero : shared.nullmultibulk);
5765 } else {
5766 if (o->type != REDIS_ZSET) {
5767 addReply(c,shared.wrongtypeerr);
5768 } else {
5769 zset *zsetobj = o->ptr;
5770 zskiplist *zsl = zsetobj->zsl;
5771 zskiplistNode *ln;
5772 robj *ele, *lenobj = NULL;
5773 unsigned long rangelen = 0;
5774
5775 /* Get the first node with the score >= min, or with
5776 * score > min if 'minex' is true. */
5777 ln = zslFirstWithScore(zsl,min);
5778 while (minex && ln && ln->score == min) ln = ln->forward[0];
5779
5780 if (ln == NULL) {
5781 /* No element matching the speciifed interval */
5782 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
5783 return;
5784 }
5785
5786 /* We don't know in advance how many matching elements there
5787 * are in the list, so we push this object that will represent
5788 * the multi-bulk length in the output buffer, and will "fix"
5789 * it later */
5790 if (!justcount) {
5791 lenobj = createObject(REDIS_STRING,NULL);
5792 addReply(c,lenobj);
5793 decrRefCount(lenobj);
5794 }
5795
5796 while(ln && (maxex ? (ln->score < max) : (ln->score <= max))) {
5797 if (offset) {
5798 offset--;
5799 ln = ln->forward[0];
5800 continue;
5801 }
5802 if (limit == 0) break;
5803 if (!justcount) {
5804 ele = ln->obj;
5805 addReplyBulk(c,ele);
5806 if (withscores)
5807 addReplyDouble(c,ln->score);
5808 }
5809 ln = ln->forward[0];
5810 rangelen++;
5811 if (limit > 0) limit--;
5812 }
5813 if (justcount) {
5814 addReplyLong(c,(long)rangelen);
5815 } else {
5816 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",
5817 withscores ? (rangelen*2) : rangelen);
5818 }
5819 }
5820 }
5821 }
5822
5823 static void zrangebyscoreCommand(redisClient *c) {
5824 genericZrangebyscoreCommand(c,0);
5825 }
5826
5827 static void zcountCommand(redisClient *c) {
5828 genericZrangebyscoreCommand(c,1);
5829 }
5830
5831 static void zcardCommand(redisClient *c) {
5832 robj *o;
5833 zset *zs;
5834
5835 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
5836 checkType(c,o,REDIS_ZSET)) return;
5837
5838 zs = o->ptr;
5839 addReplyUlong(c,zs->zsl->length);
5840 }
5841
5842 static void zscoreCommand(redisClient *c) {
5843 robj *o;
5844 zset *zs;
5845 dictEntry *de;
5846
5847 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5848 checkType(c,o,REDIS_ZSET)) return;
5849
5850 zs = o->ptr;
5851 de = dictFind(zs->dict,c->argv[2]);
5852 if (!de) {
5853 addReply(c,shared.nullbulk);
5854 } else {
5855 double *score = dictGetEntryVal(de);
5856
5857 addReplyDouble(c,*score);
5858 }
5859 }
5860
5861 static void zrankGenericCommand(redisClient *c, int reverse) {
5862 robj *o;
5863 zset *zs;
5864 zskiplist *zsl;
5865 dictEntry *de;
5866 unsigned long rank;
5867 double *score;
5868
5869 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5870 checkType(c,o,REDIS_ZSET)) return;
5871
5872 zs = o->ptr;
5873 zsl = zs->zsl;
5874 de = dictFind(zs->dict,c->argv[2]);
5875 if (!de) {
5876 addReply(c,shared.nullbulk);
5877 return;
5878 }
5879
5880 score = dictGetEntryVal(de);
5881 rank = zslGetRank(zsl, *score, c->argv[2]);
5882 if (rank) {
5883 if (reverse) {
5884 addReplyLong(c, zsl->length - rank);
5885 } else {
5886 addReplyLong(c, rank-1);
5887 }
5888 } else {
5889 addReply(c,shared.nullbulk);
5890 }
5891 }
5892
5893 static void zrankCommand(redisClient *c) {
5894 zrankGenericCommand(c, 0);
5895 }
5896
5897 static void zrevrankCommand(redisClient *c) {
5898 zrankGenericCommand(c, 1);
5899 }
5900
5901 /* =================================== Hashes =============================== */
5902 static void hsetCommand(redisClient *c) {
5903 int update = 0;
5904 robj *o = lookupKeyWrite(c->db,c->argv[1]);
5905
5906 if (o == NULL) {
5907 o = createHashObject();
5908 dictAdd(c->db->dict,c->argv[1],o);
5909 incrRefCount(c->argv[1]);
5910 } else {
5911 if (o->type != REDIS_HASH) {
5912 addReply(c,shared.wrongtypeerr);
5913 return;
5914 }
5915 }
5916 /* We want to convert the zipmap into an hash table right now if the
5917 * entry to be added is too big. Note that we check if the object
5918 * is integer encoded before to try fetching the length in the test below.
5919 * This is because integers are small, but currently stringObjectLen()
5920 * performs a slow conversion: not worth it. */
5921 if (o->encoding == REDIS_ENCODING_ZIPMAP &&
5922 ((c->argv[2]->encoding == REDIS_ENCODING_RAW &&
5923 sdslen(c->argv[2]->ptr) > server.hash_max_zipmap_value) ||
5924 (c->argv[3]->encoding == REDIS_ENCODING_RAW &&
5925 sdslen(c->argv[3]->ptr) > server.hash_max_zipmap_value)))
5926 {
5927 convertToRealHash(o);
5928 }
5929
5930 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
5931 unsigned char *zm = o->ptr;
5932 robj *valobj = getDecodedObject(c->argv[3]);
5933
5934 zm = zipmapSet(zm,c->argv[2]->ptr,sdslen(c->argv[2]->ptr),
5935 valobj->ptr,sdslen(valobj->ptr),&update);
5936 decrRefCount(valobj);
5937 o->ptr = zm;
5938
5939 /* And here there is the second check for hash conversion...
5940 * we want to do it only if the operation was not just an update as
5941 * zipmapLen() is O(N). */
5942 if (!update && zipmapLen(zm) > server.hash_max_zipmap_entries)
5943 convertToRealHash(o);
5944 } else {
5945 tryObjectEncoding(c->argv[2]);
5946 /* note that c->argv[3] is already encoded, as the latest arg
5947 * of a bulk command is always integer encoded if possible. */
5948 if (dictReplace(o->ptr,c->argv[2],c->argv[3])) {
5949 incrRefCount(c->argv[2]);
5950 } else {
5951 update = 1;
5952 }
5953 incrRefCount(c->argv[3]);
5954 }
5955 server.dirty++;
5956 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",update == 0));
5957 }
5958
5959 static void hincrbyCommand(redisClient *c) {
5960 int update = 0;
5961 long long value = 0, incr = 0;
5962 robj *o = lookupKeyWrite(c->db,c->argv[1]);
5963
5964 if (o == NULL) {
5965 o = createHashObject();
5966 dictAdd(c->db->dict,c->argv[1],o);
5967 incrRefCount(c->argv[1]);
5968 } else {
5969 if (o->type != REDIS_HASH) {
5970 addReply(c,shared.wrongtypeerr);
5971 return;
5972 }
5973 }
5974
5975 robj *o_incr = getDecodedObject(c->argv[3]);
5976 incr = strtoll(o_incr->ptr, NULL, 10);
5977 decrRefCount(o_incr);
5978
5979 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
5980 unsigned char *zm = o->ptr;
5981 unsigned char *zval;
5982 unsigned int zvlen;
5983
5984 /* Find value if already present in hash */
5985 if (zipmapGet(zm,c->argv[2]->ptr,sdslen(c->argv[2]->ptr),
5986 &zval,&zvlen)) {
5987 /* strtoll needs the char* to have a trailing \0, but
5988 * the zipmap doesn't include them. */
5989 sds szval = sdsnewlen(zval, zvlen);
5990 value = strtoll(szval,NULL,10);
5991 sdsfree(szval);
5992 }
5993
5994 value += incr;
5995 sds svalue = sdscatprintf(sdsempty(),"%lld",value);
5996 zm = zipmapSet(zm,c->argv[2]->ptr,sdslen(c->argv[2]->ptr),
5997 (unsigned char*)svalue,sdslen(svalue),&update);
5998 sdsfree(svalue);
5999 o->ptr = zm;
6000
6001 /* Check if the zipmap needs to be converted
6002 * if this was not an update. */
6003 if (!update && zipmapLen(zm) > server.hash_max_zipmap_entries)
6004 convertToRealHash(o);
6005 } else {
6006 robj *hval;
6007 dictEntry *de;
6008
6009 /* Find value if already present in hash */
6010 de = dictFind(o->ptr,c->argv[2]);
6011 if (de != NULL) {
6012 hval = dictGetEntryVal(de);
6013 if (hval->encoding == REDIS_ENCODING_RAW)
6014 value = strtoll(hval->ptr,NULL,10);
6015 else if (hval->encoding == REDIS_ENCODING_INT)
6016 value = (long)hval->ptr;
6017 else
6018 redisAssert(1 != 1);
6019 }
6020
6021 value += incr;
6022 hval = createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",value));
6023 tryObjectEncoding(hval);
6024 incrRefCount(hval);
6025
6026 if (dictReplace(o->ptr,c->argv[2],hval)) {
6027 incrRefCount(c->argv[2]);
6028 }
6029 }
6030
6031 server.dirty++;
6032 addReplyLong(c, value);
6033 }
6034
6035 static void hgetCommand(redisClient *c) {
6036 robj *o;
6037
6038 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6039 checkType(c,o,REDIS_HASH)) return;
6040
6041 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6042 unsigned char *zm = o->ptr;
6043 unsigned char *val;
6044 unsigned int vlen;
6045 robj *field;
6046
6047 field = getDecodedObject(c->argv[2]);
6048 if (zipmapGet(zm,field->ptr,sdslen(field->ptr), &val,&vlen)) {
6049 addReplySds(c,sdscatprintf(sdsempty(),"$%u\r\n", vlen));
6050 addReplySds(c,sdsnewlen(val,vlen));
6051 addReply(c,shared.crlf);
6052 decrRefCount(field);
6053 return;
6054 } else {
6055 addReply(c,shared.nullbulk);
6056 decrRefCount(field);
6057 return;
6058 }
6059 } else {
6060 struct dictEntry *de;
6061
6062 de = dictFind(o->ptr,c->argv[2]);
6063 if (de == NULL) {
6064 addReply(c,shared.nullbulk);
6065 } else {
6066 robj *e = dictGetEntryVal(de);
6067
6068 addReplyBulk(c,e);
6069 }
6070 }
6071 }
6072
6073 static void hdelCommand(redisClient *c) {
6074 robj *o;
6075 int deleted = 0;
6076
6077 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
6078 checkType(c,o,REDIS_HASH)) return;
6079
6080 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6081 robj *field = getDecodedObject(c->argv[2]);
6082
6083 o->ptr = zipmapDel((unsigned char*) o->ptr,
6084 (unsigned char*) field->ptr,
6085 sdslen(field->ptr), &deleted);
6086 decrRefCount(field);
6087 if (zipmapLen((unsigned char*) o->ptr) == 0)
6088 deleteKey(c->db,c->argv[1]);
6089 } else {
6090 deleted = dictDelete((dict*)o->ptr,c->argv[2]) == DICT_OK;
6091 if (htNeedsResize(o->ptr)) dictResize(o->ptr);
6092 if (dictSize((dict*)o->ptr) == 0) deleteKey(c->db,c->argv[1]);
6093 }
6094 if (deleted) server.dirty++;
6095 addReply(c,deleted ? shared.cone : shared.czero);
6096 }
6097
6098 static void hlenCommand(redisClient *c) {
6099 robj *o;
6100 unsigned long len;
6101
6102 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6103 checkType(c,o,REDIS_HASH)) return;
6104
6105 len = (o->encoding == REDIS_ENCODING_ZIPMAP) ?
6106 zipmapLen((unsigned char*)o->ptr) : dictSize((dict*)o->ptr);
6107 addReplyUlong(c,len);
6108 }
6109
6110 #define REDIS_GETALL_KEYS 1
6111 #define REDIS_GETALL_VALS 2
6112 static void genericHgetallCommand(redisClient *c, int flags) {
6113 robj *o, *lenobj;
6114 unsigned long count = 0;
6115
6116 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullmultibulk)) == NULL
6117 || checkType(c,o,REDIS_HASH)) return;
6118
6119 lenobj = createObject(REDIS_STRING,NULL);
6120 addReply(c,lenobj);
6121 decrRefCount(lenobj);
6122
6123 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6124 unsigned char *p = zipmapRewind(o->ptr);
6125 unsigned char *field, *val;
6126 unsigned int flen, vlen;
6127
6128 while((p = zipmapNext(p,&field,&flen,&val,&vlen)) != NULL) {
6129 robj *aux;
6130
6131 if (flags & REDIS_GETALL_KEYS) {
6132 aux = createStringObject((char*)field,flen);
6133 addReplyBulk(c,aux);
6134 decrRefCount(aux);
6135 count++;
6136 }
6137 if (flags & REDIS_GETALL_VALS) {
6138 aux = createStringObject((char*)val,vlen);
6139 addReplyBulk(c,aux);
6140 decrRefCount(aux);
6141 count++;
6142 }
6143 }
6144 } else {
6145 dictIterator *di = dictGetIterator(o->ptr);
6146 dictEntry *de;
6147
6148 while((de = dictNext(di)) != NULL) {
6149 robj *fieldobj = dictGetEntryKey(de);
6150 robj *valobj = dictGetEntryVal(de);
6151
6152 if (flags & REDIS_GETALL_KEYS) {
6153 addReplyBulk(c,fieldobj);
6154 count++;
6155 }
6156 if (flags & REDIS_GETALL_VALS) {
6157 addReplyBulk(c,valobj);
6158 count++;
6159 }
6160 }
6161 dictReleaseIterator(di);
6162 }
6163 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",count);
6164 }
6165
6166 static void hkeysCommand(redisClient *c) {
6167 genericHgetallCommand(c,REDIS_GETALL_KEYS);
6168 }
6169
6170 static void hvalsCommand(redisClient *c) {
6171 genericHgetallCommand(c,REDIS_GETALL_VALS);
6172 }
6173
6174 static void hgetallCommand(redisClient *c) {
6175 genericHgetallCommand(c,REDIS_GETALL_KEYS|REDIS_GETALL_VALS);
6176 }
6177
6178 static void hexistsCommand(redisClient *c) {
6179 robj *o;
6180 int exists = 0;
6181
6182 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6183 checkType(c,o,REDIS_HASH)) return;
6184
6185 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6186 robj *field;
6187 unsigned char *zm = o->ptr;
6188
6189 field = getDecodedObject(c->argv[2]);
6190 exists = zipmapExists(zm,field->ptr,sdslen(field->ptr));
6191 decrRefCount(field);
6192 } else {
6193 exists = dictFind(o->ptr,c->argv[2]) != NULL;
6194 }
6195 addReply(c,exists ? shared.cone : shared.czero);
6196 }
6197
6198 static void convertToRealHash(robj *o) {
6199 unsigned char *key, *val, *p, *zm = o->ptr;
6200 unsigned int klen, vlen;
6201 dict *dict = dictCreate(&hashDictType,NULL);
6202
6203 assert(o->type == REDIS_HASH && o->encoding != REDIS_ENCODING_HT);
6204 p = zipmapRewind(zm);
6205 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
6206 robj *keyobj, *valobj;
6207
6208 keyobj = createStringObject((char*)key,klen);
6209 valobj = createStringObject((char*)val,vlen);
6210 tryObjectEncoding(keyobj);
6211 tryObjectEncoding(valobj);
6212 dictAdd(dict,keyobj,valobj);
6213 }
6214 o->encoding = REDIS_ENCODING_HT;
6215 o->ptr = dict;
6216 zfree(zm);
6217 }
6218
6219 /* ========================= Non type-specific commands ==================== */
6220
6221 static void flushdbCommand(redisClient *c) {
6222 server.dirty += dictSize(c->db->dict);
6223 dictEmpty(c->db->dict);
6224 dictEmpty(c->db->expires);
6225 addReply(c,shared.ok);
6226 }
6227
6228 static void flushallCommand(redisClient *c) {
6229 server.dirty += emptyDb();
6230 addReply(c,shared.ok);
6231 if (server.bgsavechildpid != -1) {
6232 kill(server.bgsavechildpid,SIGKILL);
6233 rdbRemoveTempFile(server.bgsavechildpid);
6234 }
6235 rdbSave(server.dbfilename);
6236 server.dirty++;
6237 }
6238
6239 static redisSortOperation *createSortOperation(int type, robj *pattern) {
6240 redisSortOperation *so = zmalloc(sizeof(*so));
6241 so->type = type;
6242 so->pattern = pattern;
6243 return so;
6244 }
6245
6246 /* Return the value associated to the key with a name obtained
6247 * substituting the first occurence of '*' in 'pattern' with 'subst' */
6248 static robj *lookupKeyByPattern(redisDb *db, robj *pattern, robj *subst) {
6249 char *p;
6250 sds spat, ssub;
6251 robj keyobj;
6252 int prefixlen, sublen, postfixlen;
6253 /* Expoit the internal sds representation to create a sds string allocated on the stack in order to make this function faster */
6254 struct {
6255 long len;
6256 long free;
6257 char buf[REDIS_SORTKEY_MAX+1];
6258 } keyname;
6259
6260 /* If the pattern is "#" return the substitution object itself in order
6261 * to implement the "SORT ... GET #" feature. */
6262 spat = pattern->ptr;
6263 if (spat[0] == '#' && spat[1] == '\0') {
6264 return subst;
6265 }
6266
6267 /* The substitution object may be specially encoded. If so we create
6268 * a decoded object on the fly. Otherwise getDecodedObject will just
6269 * increment the ref count, that we'll decrement later. */
6270 subst = getDecodedObject(subst);
6271
6272 ssub = subst->ptr;
6273 if (sdslen(spat)+sdslen(ssub)-1 > REDIS_SORTKEY_MAX) return NULL;
6274 p = strchr(spat,'*');
6275 if (!p) {
6276 decrRefCount(subst);
6277 return NULL;
6278 }
6279
6280 prefixlen = p-spat;
6281 sublen = sdslen(ssub);
6282 postfixlen = sdslen(spat)-(prefixlen+1);
6283 memcpy(keyname.buf,spat,prefixlen);
6284 memcpy(keyname.buf+prefixlen,ssub,sublen);
6285 memcpy(keyname.buf+prefixlen+sublen,p+1,postfixlen);
6286 keyname.buf[prefixlen+sublen+postfixlen] = '\0';
6287 keyname.len = prefixlen+sublen+postfixlen;
6288
6289 initStaticStringObject(keyobj,((char*)&keyname)+(sizeof(long)*2))
6290 decrRefCount(subst);
6291
6292 /* printf("lookup '%s' => %p\n", keyname.buf,de); */
6293 return lookupKeyRead(db,&keyobj);
6294 }
6295
6296 /* sortCompare() is used by qsort in sortCommand(). Given that qsort_r with
6297 * the additional parameter is not standard but a BSD-specific we have to
6298 * pass sorting parameters via the global 'server' structure */
6299 static int sortCompare(const void *s1, const void *s2) {
6300 const redisSortObject *so1 = s1, *so2 = s2;
6301 int cmp;
6302
6303 if (!server.sort_alpha) {
6304 /* Numeric sorting. Here it's trivial as we precomputed scores */
6305 if (so1->u.score > so2->u.score) {
6306 cmp = 1;
6307 } else if (so1->u.score < so2->u.score) {
6308 cmp = -1;
6309 } else {
6310 cmp = 0;
6311 }
6312 } else {
6313 /* Alphanumeric sorting */
6314 if (server.sort_bypattern) {
6315 if (!so1->u.cmpobj || !so2->u.cmpobj) {
6316 /* At least one compare object is NULL */
6317 if (so1->u.cmpobj == so2->u.cmpobj)
6318 cmp = 0;
6319 else if (so1->u.cmpobj == NULL)
6320 cmp = -1;
6321 else
6322 cmp = 1;
6323 } else {
6324 /* We have both the objects, use strcoll */
6325 cmp = strcoll(so1->u.cmpobj->ptr,so2->u.cmpobj->ptr);
6326 }
6327 } else {
6328 /* Compare elements directly */
6329 robj *dec1, *dec2;
6330
6331 dec1 = getDecodedObject(so1->obj);
6332 dec2 = getDecodedObject(so2->obj);
6333 cmp = strcoll(dec1->ptr,dec2->ptr);
6334 decrRefCount(dec1);
6335 decrRefCount(dec2);
6336 }
6337 }
6338 return server.sort_desc ? -cmp : cmp;
6339 }
6340
6341 /* The SORT command is the most complex command in Redis. Warning: this code
6342 * is optimized for speed and a bit less for readability */
6343 static void sortCommand(redisClient *c) {
6344 list *operations;
6345 int outputlen = 0;
6346 int desc = 0, alpha = 0;
6347 int limit_start = 0, limit_count = -1, start, end;
6348 int j, dontsort = 0, vectorlen;
6349 int getop = 0; /* GET operation counter */
6350 robj *sortval, *sortby = NULL, *storekey = NULL;
6351 redisSortObject *vector; /* Resulting vector to sort */
6352
6353 /* Lookup the key to sort. It must be of the right types */
6354 sortval = lookupKeyRead(c->db,c->argv[1]);
6355 if (sortval == NULL) {
6356 addReply(c,shared.nullmultibulk);
6357 return;
6358 }
6359 if (sortval->type != REDIS_SET && sortval->type != REDIS_LIST &&
6360 sortval->type != REDIS_ZSET)
6361 {
6362 addReply(c,shared.wrongtypeerr);
6363 return;
6364 }
6365
6366 /* Create a list of operations to perform for every sorted element.
6367 * Operations can be GET/DEL/INCR/DECR */
6368 operations = listCreate();
6369 listSetFreeMethod(operations,zfree);
6370 j = 2;
6371
6372 /* Now we need to protect sortval incrementing its count, in the future
6373 * SORT may have options able to overwrite/delete keys during the sorting
6374 * and the sorted key itself may get destroied */
6375 incrRefCount(sortval);
6376
6377 /* The SORT command has an SQL-alike syntax, parse it */
6378 while(j < c->argc) {
6379 int leftargs = c->argc-j-1;
6380 if (!strcasecmp(c->argv[j]->ptr,"asc")) {
6381 desc = 0;
6382 } else if (!strcasecmp(c->argv[j]->ptr,"desc")) {
6383 desc = 1;
6384 } else if (!strcasecmp(c->argv[j]->ptr,"alpha")) {
6385 alpha = 1;
6386 } else if (!strcasecmp(c->argv[j]->ptr,"limit") && leftargs >= 2) {
6387 limit_start = atoi(c->argv[j+1]->ptr);
6388 limit_count = atoi(c->argv[j+2]->ptr);
6389 j+=2;
6390 } else if (!strcasecmp(c->argv[j]->ptr,"store") && leftargs >= 1) {
6391 storekey = c->argv[j+1];
6392 j++;
6393 } else if (!strcasecmp(c->argv[j]->ptr,"by") && leftargs >= 1) {
6394 sortby = c->argv[j+1];
6395 /* If the BY pattern does not contain '*', i.e. it is constant,
6396 * we don't need to sort nor to lookup the weight keys. */
6397 if (strchr(c->argv[j+1]->ptr,'*') == NULL) dontsort = 1;
6398 j++;
6399 } else if (!strcasecmp(c->argv[j]->ptr,"get") && leftargs >= 1) {
6400 listAddNodeTail(operations,createSortOperation(
6401 REDIS_SORT_GET,c->argv[j+1]));
6402 getop++;
6403 j++;
6404 } else {
6405 decrRefCount(sortval);
6406 listRelease(operations);
6407 addReply(c,shared.syntaxerr);
6408 return;
6409 }
6410 j++;
6411 }
6412
6413 /* Load the sorting vector with all the objects to sort */
6414 switch(sortval->type) {
6415 case REDIS_LIST: vectorlen = listLength((list*)sortval->ptr); break;
6416 case REDIS_SET: vectorlen = dictSize((dict*)sortval->ptr); break;
6417 case REDIS_ZSET: vectorlen = dictSize(((zset*)sortval->ptr)->dict); break;
6418 default: vectorlen = 0; redisAssert(0); /* Avoid GCC warning */
6419 }
6420 vector = zmalloc(sizeof(redisSortObject)*vectorlen);
6421 j = 0;
6422
6423 if (sortval->type == REDIS_LIST) {
6424 list *list = sortval->ptr;
6425 listNode *ln;
6426 listIter li;
6427
6428 listRewind(list,&li);
6429 while((ln = listNext(&li))) {
6430 robj *ele = ln->value;
6431 vector[j].obj = ele;
6432 vector[j].u.score = 0;
6433 vector[j].u.cmpobj = NULL;
6434 j++;
6435 }
6436 } else {
6437 dict *set;
6438 dictIterator *di;
6439 dictEntry *setele;
6440
6441 if (sortval->type == REDIS_SET) {
6442 set = sortval->ptr;
6443 } else {
6444 zset *zs = sortval->ptr;
6445 set = zs->dict;
6446 }
6447
6448 di = dictGetIterator(set);
6449 while((setele = dictNext(di)) != NULL) {
6450 vector[j].obj = dictGetEntryKey(setele);
6451 vector[j].u.score = 0;
6452 vector[j].u.cmpobj = NULL;
6453 j++;
6454 }
6455 dictReleaseIterator(di);
6456 }
6457 redisAssert(j == vectorlen);
6458
6459 /* Now it's time to load the right scores in the sorting vector */
6460 if (dontsort == 0) {
6461 for (j = 0; j < vectorlen; j++) {
6462 if (sortby) {
6463 robj *byval;
6464
6465 byval = lookupKeyByPattern(c->db,sortby,vector[j].obj);
6466 if (!byval || byval->type != REDIS_STRING) continue;
6467 if (alpha) {
6468 vector[j].u.cmpobj = getDecodedObject(byval);
6469 } else {
6470 if (byval->encoding == REDIS_ENCODING_RAW) {
6471 vector[j].u.score = strtod(byval->ptr,NULL);
6472 } else {
6473 /* Don't need to decode the object if it's
6474 * integer-encoded (the only encoding supported) so
6475 * far. We can just cast it */
6476 if (byval->encoding == REDIS_ENCODING_INT) {
6477 vector[j].u.score = (long)byval->ptr;
6478 } else
6479 redisAssert(1 != 1);
6480 }
6481 }
6482 } else {
6483 if (!alpha) {
6484 if (vector[j].obj->encoding == REDIS_ENCODING_RAW)
6485 vector[j].u.score = strtod(vector[j].obj->ptr,NULL);
6486 else {
6487 if (vector[j].obj->encoding == REDIS_ENCODING_INT)
6488 vector[j].u.score = (long) vector[j].obj->ptr;
6489 else
6490 redisAssert(1 != 1);
6491 }
6492 }
6493 }
6494 }
6495 }
6496
6497 /* We are ready to sort the vector... perform a bit of sanity check
6498 * on the LIMIT option too. We'll use a partial version of quicksort. */
6499 start = (limit_start < 0) ? 0 : limit_start;
6500 end = (limit_count < 0) ? vectorlen-1 : start+limit_count-1;
6501 if (start >= vectorlen) {
6502 start = vectorlen-1;
6503 end = vectorlen-2;
6504 }
6505 if (end >= vectorlen) end = vectorlen-1;
6506
6507 if (dontsort == 0) {
6508 server.sort_desc = desc;
6509 server.sort_alpha = alpha;
6510 server.sort_bypattern = sortby ? 1 : 0;
6511 if (sortby && (start != 0 || end != vectorlen-1))
6512 pqsort(vector,vectorlen,sizeof(redisSortObject),sortCompare, start,end);
6513 else
6514 qsort(vector,vectorlen,sizeof(redisSortObject),sortCompare);
6515 }
6516
6517 /* Send command output to the output buffer, performing the specified
6518 * GET/DEL/INCR/DECR operations if any. */
6519 outputlen = getop ? getop*(end-start+1) : end-start+1;
6520 if (storekey == NULL) {
6521 /* STORE option not specified, sent the sorting result to client */
6522 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",outputlen));
6523 for (j = start; j <= end; j++) {
6524 listNode *ln;
6525 listIter li;
6526
6527 if (!getop) addReplyBulk(c,vector[j].obj);
6528 listRewind(operations,&li);
6529 while((ln = listNext(&li))) {
6530 redisSortOperation *sop = ln->value;
6531 robj *val = lookupKeyByPattern(c->db,sop->pattern,
6532 vector[j].obj);
6533
6534 if (sop->type == REDIS_SORT_GET) {
6535 if (!val || val->type != REDIS_STRING) {
6536 addReply(c,shared.nullbulk);
6537 } else {
6538 addReplyBulk(c,val);
6539 }
6540 } else {
6541 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
6542 }
6543 }
6544 }
6545 } else {
6546 robj *listObject = createListObject();
6547 list *listPtr = (list*) listObject->ptr;
6548
6549 /* STORE option specified, set the sorting result as a List object */
6550 for (j = start; j <= end; j++) {
6551 listNode *ln;
6552 listIter li;
6553
6554 if (!getop) {
6555 listAddNodeTail(listPtr,vector[j].obj);
6556 incrRefCount(vector[j].obj);
6557 }
6558 listRewind(operations,&li);
6559 while((ln = listNext(&li))) {
6560 redisSortOperation *sop = ln->value;
6561 robj *val = lookupKeyByPattern(c->db,sop->pattern,
6562 vector[j].obj);
6563
6564 if (sop->type == REDIS_SORT_GET) {
6565 if (!val || val->type != REDIS_STRING) {
6566 listAddNodeTail(listPtr,createStringObject("",0));
6567 } else {
6568 listAddNodeTail(listPtr,val);
6569 incrRefCount(val);
6570 }
6571 } else {
6572 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
6573 }
6574 }
6575 }
6576 if (dictReplace(c->db->dict,storekey,listObject)) {
6577 incrRefCount(storekey);
6578 }
6579 /* Note: we add 1 because the DB is dirty anyway since even if the
6580 * SORT result is empty a new key is set and maybe the old content
6581 * replaced. */
6582 server.dirty += 1+outputlen;
6583 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",outputlen));
6584 }
6585
6586 /* Cleanup */
6587 decrRefCount(sortval);
6588 listRelease(operations);
6589 for (j = 0; j < vectorlen; j++) {
6590 if (sortby && alpha && vector[j].u.cmpobj)
6591 decrRefCount(vector[j].u.cmpobj);
6592 }
6593 zfree(vector);
6594 }
6595
/* Convert an amount of bytes into a human readable string in the form
 * of 100B, 2G, 100M, 4K, and so forth.
 *
 * 's' is the output buffer and is always written: values below 1024 are
 * printed as plain bytes, larger ones with two decimals and a K, M, G, T
 * or P suffix (powers of 1024). Anything at or above 1024^6 falls back to
 * plain bytes so that the buffer is never left uninitialized.
 *
 * The buffer size is not passed: the caller must provide room for at
 * least 22 bytes, the longest output ("%lluB" of a 64 bit value plus the
 * terminator). */
static void bytesToHuman(char *s, unsigned long long n) {
    double d;

    if (n < 1024) {
        /* Bytes */
        sprintf(s,"%lluB",n);
    } else if (n < (1024*1024)) {
        d = (double)n/(1024);
        sprintf(s,"%.2fK",d);
    } else if (n < (1024LL*1024*1024)) {
        d = (double)n/(1024*1024);
        sprintf(s,"%.2fM",d);
    } else if (n < (1024LL*1024*1024*1024)) {
        d = (double)n/(1024LL*1024*1024);
        sprintf(s,"%.2fG",d);
    } else if (n < (1024LL*1024*1024*1024*1024)) {
        d = (double)n/(1024LL*1024*1024*1024);
        sprintf(s,"%.2fT",d);
    } else if (n < (1024LL*1024*1024*1024*1024*1024)) {
        d = (double)n/(1024LL*1024*1024*1024*1024);
        sprintf(s,"%.2fP",d);
    } else {
        /* Too big for the units above: print the raw byte count. */
        sprintf(s,"%lluB",n);
    }
}
6616
6617 /* Create the string returned by the INFO command. This is decoupled
6618 * by the INFO command itself as we need to report the same information
6619 * on memory corruption problems. */
6620 static sds genRedisInfoString(void) {
6621 sds info;
6622 time_t uptime = time(NULL)-server.stat_starttime;
6623 int j;
6624 char hmem[64];
6625
6626 bytesToHuman(hmem,zmalloc_used_memory());
6627 info = sdscatprintf(sdsempty(),
6628 "redis_version:%s\r\n"
6629 "arch_bits:%s\r\n"
6630 "multiplexing_api:%s\r\n"
6631 "process_id:%ld\r\n"
6632 "uptime_in_seconds:%ld\r\n"
6633 "uptime_in_days:%ld\r\n"
6634 "connected_clients:%d\r\n"
6635 "connected_slaves:%d\r\n"
6636 "blocked_clients:%d\r\n"
6637 "used_memory:%zu\r\n"
6638 "used_memory_human:%s\r\n"
6639 "changes_since_last_save:%lld\r\n"
6640 "bgsave_in_progress:%d\r\n"
6641 "last_save_time:%ld\r\n"
6642 "bgrewriteaof_in_progress:%d\r\n"
6643 "total_connections_received:%lld\r\n"
6644 "total_commands_processed:%lld\r\n"
6645 "expired_keys:%lld\r\n"
6646 "hash_max_zipmap_entries:%ld\r\n"
6647 "hash_max_zipmap_value:%ld\r\n"
6648 "vm_enabled:%d\r\n"
6649 "role:%s\r\n"
6650 ,REDIS_VERSION,
6651 (sizeof(long) == 8) ? "64" : "32",
6652 aeGetApiName(),
6653 (long) getpid(),
6654 uptime,
6655 uptime/(3600*24),
6656 listLength(server.clients)-listLength(server.slaves),
6657 listLength(server.slaves),
6658 server.blpop_blocked_clients,
6659 zmalloc_used_memory(),
6660 hmem,
6661 server.dirty,
6662 server.bgsavechildpid != -1,
6663 server.lastsave,
6664 server.bgrewritechildpid != -1,
6665 server.stat_numconnections,
6666 server.stat_numcommands,
6667 server.stat_expiredkeys,
6668 server.hash_max_zipmap_entries,
6669 server.hash_max_zipmap_value,
6670 server.vm_enabled != 0,
6671 server.masterhost == NULL ? "master" : "slave"
6672 );
6673 if (server.masterhost) {
6674 info = sdscatprintf(info,
6675 "master_host:%s\r\n"
6676 "master_port:%d\r\n"
6677 "master_link_status:%s\r\n"
6678 "master_last_io_seconds_ago:%d\r\n"
6679 ,server.masterhost,
6680 server.masterport,
6681 (server.replstate == REDIS_REPL_CONNECTED) ?
6682 "up" : "down",
6683 server.master ? ((int)(time(NULL)-server.master->lastinteraction)) : -1
6684 );
6685 }
6686 if (server.vm_enabled) {
6687 lockThreadedIO();
6688 info = sdscatprintf(info,
6689 "vm_conf_max_memory:%llu\r\n"
6690 "vm_conf_page_size:%llu\r\n"
6691 "vm_conf_pages:%llu\r\n"
6692 "vm_stats_used_pages:%llu\r\n"
6693 "vm_stats_swapped_objects:%llu\r\n"
6694 "vm_stats_swappin_count:%llu\r\n"
6695 "vm_stats_swappout_count:%llu\r\n"
6696 "vm_stats_io_newjobs_len:%lu\r\n"
6697 "vm_stats_io_processing_len:%lu\r\n"
6698 "vm_stats_io_processed_len:%lu\r\n"
6699 "vm_stats_io_active_threads:%lu\r\n"
6700 "vm_stats_blocked_clients:%lu\r\n"
6701 ,(unsigned long long) server.vm_max_memory,
6702 (unsigned long long) server.vm_page_size,
6703 (unsigned long long) server.vm_pages,
6704 (unsigned long long) server.vm_stats_used_pages,
6705 (unsigned long long) server.vm_stats_swapped_objects,
6706 (unsigned long long) server.vm_stats_swapins,
6707 (unsigned long long) server.vm_stats_swapouts,
6708 (unsigned long) listLength(server.io_newjobs),
6709 (unsigned long) listLength(server.io_processing),
6710 (unsigned long) listLength(server.io_processed),
6711 (unsigned long) server.io_active_threads,
6712 (unsigned long) server.vm_blocked_clients
6713 );
6714 unlockThreadedIO();
6715 }
6716 for (j = 0; j < server.dbnum; j++) {
6717 long long keys, vkeys;
6718
6719 keys = dictSize(server.db[j].dict);
6720 vkeys = dictSize(server.db[j].expires);
6721 if (keys || vkeys) {
6722 info = sdscatprintf(info, "db%d:keys=%lld,expires=%lld\r\n",
6723 j, keys, vkeys);
6724 }
6725 }
6726 return info;
6727 }
6728
6729 static void infoCommand(redisClient *c) {
6730 sds info = genRedisInfoString();
6731 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",
6732 (unsigned long)sdslen(info)));
6733 addReplySds(c,info);
6734 addReply(c,shared.crlf);
6735 }
6736
6737 static void monitorCommand(redisClient *c) {
6738 /* ignore MONITOR if aleady slave or in monitor mode */
6739 if (c->flags & REDIS_SLAVE) return;
6740
6741 c->flags |= (REDIS_SLAVE|REDIS_MONITOR);
6742 c->slaveseldb = 0;
6743 listAddNodeTail(server.monitors,c);
6744 addReply(c,shared.ok);
6745 }
6746
6747 /* ================================= Expire ================================= */
6748 static int removeExpire(redisDb *db, robj *key) {
6749 if (dictDelete(db->expires,key) == DICT_OK) {
6750 return 1;
6751 } else {
6752 return 0;
6753 }
6754 }
6755
6756 static int setExpire(redisDb *db, robj *key, time_t when) {
6757 if (dictAdd(db->expires,key,(void*)when) == DICT_ERR) {
6758 return 0;
6759 } else {
6760 incrRefCount(key);
6761 return 1;
6762 }
6763 }
6764
6765 /* Return the expire time of the specified key, or -1 if no expire
6766 * is associated with this key (i.e. the key is non volatile) */
6767 static time_t getExpire(redisDb *db, robj *key) {
6768 dictEntry *de;
6769
6770 /* No expire? return ASAP */
6771 if (dictSize(db->expires) == 0 ||
6772 (de = dictFind(db->expires,key)) == NULL) return -1;
6773
6774 return (time_t) dictGetEntryVal(de);
6775 }
6776
6777 static int expireIfNeeded(redisDb *db, robj *key) {
6778 time_t when;
6779 dictEntry *de;
6780
6781 /* No expire? return ASAP */
6782 if (dictSize(db->expires) == 0 ||
6783 (de = dictFind(db->expires,key)) == NULL) return 0;
6784
6785 /* Lookup the expire */
6786 when = (time_t) dictGetEntryVal(de);
6787 if (time(NULL) <= when) return 0;
6788
6789 /* Delete the key */
6790 dictDelete(db->expires,key);
6791 server.stat_expiredkeys++;
6792 return dictDelete(db->dict,key) == DICT_OK;
6793 }
6794
6795 static int deleteIfVolatile(redisDb *db, robj *key) {
6796 dictEntry *de;
6797
6798 /* No expire? return ASAP */
6799 if (dictSize(db->expires) == 0 ||
6800 (de = dictFind(db->expires,key)) == NULL) return 0;
6801
6802 /* Delete the key */
6803 server.dirty++;
6804 server.stat_expiredkeys++;
6805 dictDelete(db->expires,key);
6806 return dictDelete(db->dict,key) == DICT_OK;
6807 }
6808
6809 static void expireGenericCommand(redisClient *c, robj *key, time_t seconds) {
6810 dictEntry *de;
6811
6812 de = dictFind(c->db->dict,key);
6813 if (de == NULL) {
6814 addReply(c,shared.czero);
6815 return;
6816 }
6817 if (seconds < 0) {
6818 if (deleteKey(c->db,key)) server.dirty++;
6819 addReply(c, shared.cone);
6820 return;
6821 } else {
6822 time_t when = time(NULL)+seconds;
6823 if (setExpire(c->db,key,when)) {
6824 addReply(c,shared.cone);
6825 server.dirty++;
6826 } else {
6827 addReply(c,shared.czero);
6828 }
6829 return;
6830 }
6831 }
6832
6833 static void expireCommand(redisClient *c) {
6834 expireGenericCommand(c,c->argv[1],strtol(c->argv[2]->ptr,NULL,10));
6835 }
6836
6837 static void expireatCommand(redisClient *c) {
6838 expireGenericCommand(c,c->argv[1],strtol(c->argv[2]->ptr,NULL,10)-time(NULL));
6839 }
6840
6841 static void ttlCommand(redisClient *c) {
6842 time_t expire;
6843 int ttl = -1;
6844
6845 expire = getExpire(c->db,c->argv[1]);
6846 if (expire != -1) {
6847 ttl = (int) (expire-time(NULL));
6848 if (ttl < 0) ttl = -1;
6849 }
6850 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",ttl));
6851 }
6852
6853 /* ================================ MULTI/EXEC ============================== */
6854
6855 /* Client state initialization for MULTI/EXEC */
6856 static void initClientMultiState(redisClient *c) {
6857 c->mstate.commands = NULL;
6858 c->mstate.count = 0;
6859 }
6860
6861 /* Release all the resources associated with MULTI/EXEC state */
6862 static void freeClientMultiState(redisClient *c) {
6863 int j;
6864
6865 for (j = 0; j < c->mstate.count; j++) {
6866 int i;
6867 multiCmd *mc = c->mstate.commands+j;
6868
6869 for (i = 0; i < mc->argc; i++)
6870 decrRefCount(mc->argv[i]);
6871 zfree(mc->argv);
6872 }
6873 zfree(c->mstate.commands);
6874 }
6875
6876 /* Add a new command into the MULTI commands queue */
6877 static void queueMultiCommand(redisClient *c, struct redisCommand *cmd) {
6878 multiCmd *mc;
6879 int j;
6880
6881 c->mstate.commands = zrealloc(c->mstate.commands,
6882 sizeof(multiCmd)*(c->mstate.count+1));
6883 mc = c->mstate.commands+c->mstate.count;
6884 mc->cmd = cmd;
6885 mc->argc = c->argc;
6886 mc->argv = zmalloc(sizeof(robj*)*c->argc);
6887 memcpy(mc->argv,c->argv,sizeof(robj*)*c->argc);
6888 for (j = 0; j < c->argc; j++)
6889 incrRefCount(mc->argv[j]);
6890 c->mstate.count++;
6891 }
6892
6893 static void multiCommand(redisClient *c) {
6894 c->flags |= REDIS_MULTI;
6895 addReply(c,shared.ok);
6896 }
6897
6898 static void discardCommand(redisClient *c) {
6899 if (!(c->flags & REDIS_MULTI)) {
6900 addReplySds(c,sdsnew("-ERR DISCARD without MULTI\r\n"));
6901 return;
6902 }
6903
6904 freeClientMultiState(c);
6905 initClientMultiState(c);
6906 c->flags &= (~REDIS_MULTI);
6907 addReply(c,shared.ok);
6908 }
6909
6910 static void execCommand(redisClient *c) {
6911 int j;
6912 robj **orig_argv;
6913 int orig_argc;
6914
6915 if (!(c->flags & REDIS_MULTI)) {
6916 addReplySds(c,sdsnew("-ERR EXEC without MULTI\r\n"));
6917 return;
6918 }
6919
6920 orig_argv = c->argv;
6921 orig_argc = c->argc;
6922 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->mstate.count));
6923 for (j = 0; j < c->mstate.count; j++) {
6924 c->argc = c->mstate.commands[j].argc;
6925 c->argv = c->mstate.commands[j].argv;
6926 call(c,c->mstate.commands[j].cmd);
6927 }
6928 c->argv = orig_argv;
6929 c->argc = orig_argc;
6930 freeClientMultiState(c);
6931 initClientMultiState(c);
6932 c->flags &= (~REDIS_MULTI);
6933 }
6934
6935 /* =========================== Blocking Operations ========================= */
6936
/* Currently Redis blocking operations support is limited to list POP ops,
 * so the current implementation is not fully generic, but it is also not
 * completely specific, so it will not require a rewrite to support new
 * kinds of blocking operations in the future.
 *
 * Still it's important to note that list blocking operations can already be
 * used as a notification mechanism in order to implement other blocking
 * operations at application level, so there must be very strong evidence
 * of usefulness and generality before new blocking operations are implemented.
 *
 * This is how the current blocking POP works, using BLPOP as an example:
 * - If the user calls BLPOP and the key exists and contains a non-empty list
 *   then LPOP is called instead. So BLPOP is semantically the same as LPOP
 *   when there is no need to block.
 * - If instead BLPOP is called and the key does not exist or the list is
 *   empty we need to block. In order to do so we remove the notification for
 *   new data to read in the client socket (so that we'll not serve new
 *   requests if the blocking request is not served). Also we put the client
 *   in a dictionary (db->blockingkeys) mapping keys to a list of clients
 *   blocking for these keys.
 * - If a PUSH operation against a key with blocked clients waiting is
 *   performed, we serve the first in the list: basically instead of pushing
 *   the new element inside the list we return it to the (first / oldest)
 *   blocking client, unblock the client, and remove it from the list.
 *
 * The above comment and the source code should be enough in order to understand
 * the implementation and modify / fix it later.
 */
6965
6966 /* Set a client in blocking mode for the specified key, with the specified
6967 * timeout */
6968 static void blockForKeys(redisClient *c, robj **keys, int numkeys, time_t timeout) {
6969 dictEntry *de;
6970 list *l;
6971 int j;
6972
6973 c->blockingkeys = zmalloc(sizeof(robj*)*numkeys);
6974 c->blockingkeysnum = numkeys;
6975 c->blockingto = timeout;
6976 for (j = 0; j < numkeys; j++) {
6977 /* Add the key in the client structure, to map clients -> keys */
6978 c->blockingkeys[j] = keys[j];
6979 incrRefCount(keys[j]);
6980
6981 /* And in the other "side", to map keys -> clients */
6982 de = dictFind(c->db->blockingkeys,keys[j]);
6983 if (de == NULL) {
6984 int retval;
6985
6986 /* For every key we take a list of clients blocked for it */
6987 l = listCreate();
6988 retval = dictAdd(c->db->blockingkeys,keys[j],l);
6989 incrRefCount(keys[j]);
6990 assert(retval == DICT_OK);
6991 } else {
6992 l = dictGetEntryVal(de);
6993 }
6994 listAddNodeTail(l,c);
6995 }
6996 /* Mark the client as a blocked client */
6997 c->flags |= REDIS_BLOCKED;
6998 server.blpop_blocked_clients++;
6999 }
7000
7001 /* Unblock a client that's waiting in a blocking operation such as BLPOP */
7002 static void unblockClientWaitingData(redisClient *c) {
7003 dictEntry *de;
7004 list *l;
7005 int j;
7006
7007 assert(c->blockingkeys != NULL);
7008 /* The client may wait for multiple keys, so unblock it for every key. */
7009 for (j = 0; j < c->blockingkeysnum; j++) {
7010 /* Remove this client from the list of clients waiting for this key. */
7011 de = dictFind(c->db->blockingkeys,c->blockingkeys[j]);
7012 assert(de != NULL);
7013 l = dictGetEntryVal(de);
7014 listDelNode(l,listSearchKey(l,c));
7015 /* If the list is empty we need to remove it to avoid wasting memory */
7016 if (listLength(l) == 0)
7017 dictDelete(c->db->blockingkeys,c->blockingkeys[j]);
7018 decrRefCount(c->blockingkeys[j]);
7019 }
7020 /* Cleanup the client structure */
7021 zfree(c->blockingkeys);
7022 c->blockingkeys = NULL;
7023 c->flags &= (~REDIS_BLOCKED);
7024 server.blpop_blocked_clients--;
7025 /* We want to process data if there is some command waiting
7026 * in the input buffer. Note that this is safe even if
7027 * unblockClientWaitingData() gets called from freeClient() because
7028 * freeClient() will be smart enough to call this function
7029 * *after* c->querybuf was set to NULL. */
7030 if (c->querybuf && sdslen(c->querybuf) > 0) processInputBuffer(c);
7031 }
7032
7033 /* This should be called from any function PUSHing into lists.
7034 * 'c' is the "pushing client", 'key' is the key it is pushing data against,
7035 * 'ele' is the element pushed.
7036 *
7037 * If the function returns 0 there was no client waiting for a list push
7038 * against this key.
7039 *
7040 * If the function returns 1 there was a client waiting for a list push
7041 * against this key, the element was passed to this client thus it's not
7042 * needed to actually add it to the list and the caller should return asap. */
7043 static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele) {
7044 struct dictEntry *de;
7045 redisClient *receiver;
7046 list *l;
7047 listNode *ln;
7048
7049 de = dictFind(c->db->blockingkeys,key);
7050 if (de == NULL) return 0;
7051 l = dictGetEntryVal(de);
7052 ln = listFirst(l);
7053 assert(ln != NULL);
7054 receiver = ln->value;
7055
7056 addReplySds(receiver,sdsnew("*2\r\n"));
7057 addReplyBulk(receiver,key);
7058 addReplyBulk(receiver,ele);
7059 unblockClientWaitingData(receiver);
7060 return 1;
7061 }
7062
7063 /* Blocking RPOP/LPOP */
7064 static void blockingPopGenericCommand(redisClient *c, int where) {
7065 robj *o;
7066 time_t timeout;
7067 int j;
7068
7069 for (j = 1; j < c->argc-1; j++) {
7070 o = lookupKeyWrite(c->db,c->argv[j]);
7071 if (o != NULL) {
7072 if (o->type != REDIS_LIST) {
7073 addReply(c,shared.wrongtypeerr);
7074 return;
7075 } else {
7076 list *list = o->ptr;
7077 if (listLength(list) != 0) {
7078 /* If the list contains elements fall back to the usual
7079 * non-blocking POP operation */
7080 robj *argv[2], **orig_argv;
7081 int orig_argc;
7082
7083 /* We need to alter the command arguments before to call
7084 * popGenericCommand() as the command takes a single key. */
7085 orig_argv = c->argv;
7086 orig_argc = c->argc;
7087 argv[1] = c->argv[j];
7088 c->argv = argv;
7089 c->argc = 2;
7090
7091 /* Also the return value is different, we need to output
7092 * the multi bulk reply header and the key name. The
7093 * "real" command will add the last element (the value)
7094 * for us. If this souds like an hack to you it's just
7095 * because it is... */
7096 addReplySds(c,sdsnew("*2\r\n"));
7097 addReplyBulk(c,argv[1]);
7098 popGenericCommand(c,where);
7099
7100 /* Fix the client structure with the original stuff */
7101 c->argv = orig_argv;
7102 c->argc = orig_argc;
7103 return;
7104 }
7105 }
7106 }
7107 }
7108 /* If the list is empty or the key does not exists we must block */
7109 timeout = strtol(c->argv[c->argc-1]->ptr,NULL,10);
7110 if (timeout > 0) timeout += time(NULL);
7111 blockForKeys(c,c->argv+1,c->argc-2,timeout);
7112 }
7113
7114 static void blpopCommand(redisClient *c) {
7115 blockingPopGenericCommand(c,REDIS_HEAD);
7116 }
7117
7118 static void brpopCommand(redisClient *c) {
7119 blockingPopGenericCommand(c,REDIS_TAIL);
7120 }
7121
7122 /* =============================== Replication ============================= */
7123
7124 static int syncWrite(int fd, char *ptr, ssize_t size, int timeout) {
7125 ssize_t nwritten, ret = size;
7126 time_t start = time(NULL);
7127
7128 timeout++;
7129 while(size) {
7130 if (aeWait(fd,AE_WRITABLE,1000) & AE_WRITABLE) {
7131 nwritten = write(fd,ptr,size);
7132 if (nwritten == -1) return -1;
7133 ptr += nwritten;
7134 size -= nwritten;
7135 }
7136 if ((time(NULL)-start) > timeout) {
7137 errno = ETIMEDOUT;
7138 return -1;
7139 }
7140 }
7141 return ret;
7142 }
7143
7144 static int syncRead(int fd, char *ptr, ssize_t size, int timeout) {
7145 ssize_t nread, totread = 0;
7146 time_t start = time(NULL);
7147
7148 timeout++;
7149 while(size) {
7150 if (aeWait(fd,AE_READABLE,1000) & AE_READABLE) {
7151 nread = read(fd,ptr,size);
7152 if (nread == -1) return -1;
7153 ptr += nread;
7154 size -= nread;
7155 totread += nread;
7156 }
7157 if ((time(NULL)-start) > timeout) {
7158 errno = ETIMEDOUT;
7159 return -1;
7160 }
7161 }
7162 return totread;
7163 }
7164
/* Read a line from 'fd' one byte at a time using syncRead() (every byte is
 * read with the given 'timeout') and store it in 'ptr', a buffer of 'size'
 * bytes.
 *
 * The trailing "\n" (and the "\r" before it, if any) is not stored and the
 * result is always nul terminated when at least one byte is processed.
 * At most size-1 characters are stored: reading stops early, without
 * consuming the rest of the line, when the buffer is full.
 *
 * Returns the number of characters read before the newline (including a
 * stripped "\r"), or -1 if syncRead() fails. */
static int syncReadLine(int fd, char *ptr, ssize_t size, int timeout) {
    ssize_t nread = 0;

    size--; /* reserve room for the nul terminator */
    while(size) {
        char c;

        if (syncRead(fd,&c,1,timeout) == -1) return -1;
        if (c == '\n') {
            *ptr = '\0';
            if (nread && *(ptr-1) == '\r') *(ptr-1) = '\0';
            return nread;
        } else {
            *ptr++ = c;
            *ptr = '\0';
            nread++;
        }
        /* One byte of the buffer was consumed: without this the loop never
         * terminates on long lines and writes past the end of 'ptr'. */
        size--;
    }
    return nread;
}
7185
7186 static void syncCommand(redisClient *c) {
7187 /* ignore SYNC if aleady slave or in monitor mode */
7188 if (c->flags & REDIS_SLAVE) return;
7189
7190 /* SYNC can't be issued when the server has pending data to send to
7191 * the client about already issued commands. We need a fresh reply
7192 * buffer registering the differences between the BGSAVE and the current
7193 * dataset, so that we can copy to other slaves if needed. */
7194 if (listLength(c->reply) != 0) {
7195 addReplySds(c,sdsnew("-ERR SYNC is invalid with pending input\r\n"));
7196 return;
7197 }
7198
7199 redisLog(REDIS_NOTICE,"Slave ask for synchronization");
7200 /* Here we need to check if there is a background saving operation
7201 * in progress, or if it is required to start one */
7202 if (server.bgsavechildpid != -1) {
7203 /* Ok a background save is in progress. Let's check if it is a good
7204 * one for replication, i.e. if there is another slave that is
7205 * registering differences since the server forked to save */
7206 redisClient *slave;
7207 listNode *ln;
7208 listIter li;
7209
7210 listRewind(server.slaves,&li);
7211 while((ln = listNext(&li))) {
7212 slave = ln->value;
7213 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) break;
7214 }
7215 if (ln) {
7216 /* Perfect, the server is already registering differences for
7217 * another slave. Set the right state, and copy the buffer. */
7218 listRelease(c->reply);
7219 c->reply = listDup(slave->reply);
7220 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7221 redisLog(REDIS_NOTICE,"Waiting for end of BGSAVE for SYNC");
7222 } else {
7223 /* No way, we need to wait for the next BGSAVE in order to
7224 * register differences */
7225 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
7226 redisLog(REDIS_NOTICE,"Waiting for next BGSAVE for SYNC");
7227 }
7228 } else {
7229 /* Ok we don't have a BGSAVE in progress, let's start one */
7230 redisLog(REDIS_NOTICE,"Starting BGSAVE for SYNC");
7231 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
7232 redisLog(REDIS_NOTICE,"Replication failed, can't BGSAVE");
7233 addReplySds(c,sdsnew("-ERR Unalbe to perform background save\r\n"));
7234 return;
7235 }
7236 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7237 }
7238 c->repldbfd = -1;
7239 c->flags |= REDIS_SLAVE;
7240 c->slaveseldb = 0;
7241 listAddNodeTail(server.slaves,c);
7242 return;
7243 }
7244
7245 static void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) {
7246 redisClient *slave = privdata;
7247 REDIS_NOTUSED(el);
7248 REDIS_NOTUSED(mask);
7249 char buf[REDIS_IOBUF_LEN];
7250 ssize_t nwritten, buflen;
7251
7252 if (slave->repldboff == 0) {
7253 /* Write the bulk write count before to transfer the DB. In theory here
7254 * we don't know how much room there is in the output buffer of the
7255 * socket, but in pratice SO_SNDLOWAT (the minimum count for output
7256 * operations) will never be smaller than the few bytes we need. */
7257 sds bulkcount;
7258
7259 bulkcount = sdscatprintf(sdsempty(),"$%lld\r\n",(unsigned long long)
7260 slave->repldbsize);
7261 if (write(fd,bulkcount,sdslen(bulkcount)) != (signed)sdslen(bulkcount))
7262 {
7263 sdsfree(bulkcount);
7264 freeClient(slave);
7265 return;
7266 }
7267 sdsfree(bulkcount);
7268 }
7269 lseek(slave->repldbfd,slave->repldboff,SEEK_SET);
7270 buflen = read(slave->repldbfd,buf,REDIS_IOBUF_LEN);
7271 if (buflen <= 0) {
7272 redisLog(REDIS_WARNING,"Read error sending DB to slave: %s",
7273 (buflen == 0) ? "premature EOF" : strerror(errno));
7274 freeClient(slave);
7275 return;
7276 }
7277 if ((nwritten = write(fd,buf,buflen)) == -1) {
7278 redisLog(REDIS_VERBOSE,"Write error sending DB to slave: %s",
7279 strerror(errno));
7280 freeClient(slave);
7281 return;
7282 }
7283 slave->repldboff += nwritten;
7284 if (slave->repldboff == slave->repldbsize) {
7285 close(slave->repldbfd);
7286 slave->repldbfd = -1;
7287 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
7288 slave->replstate = REDIS_REPL_ONLINE;
7289 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE,
7290 sendReplyToClient, slave) == AE_ERR) {
7291 freeClient(slave);
7292 return;
7293 }
7294 addReplySds(slave,sdsempty());
7295 redisLog(REDIS_NOTICE,"Synchronization with slave succeeded");
7296 }
7297 }
7298
7299 /* This function is called at the end of every backgrond saving.
7300 * The argument bgsaveerr is REDIS_OK if the background saving succeeded
7301 * otherwise REDIS_ERR is passed to the function.
7302 *
7303 * The goal of this function is to handle slaves waiting for a successful
7304 * background saving in order to perform non-blocking synchronization. */
7305 static void updateSlavesWaitingBgsave(int bgsaveerr) {
7306 listNode *ln;
7307 int startbgsave = 0;
7308 listIter li;
7309
7310 listRewind(server.slaves,&li);
7311 while((ln = listNext(&li))) {
7312 redisClient *slave = ln->value;
7313
7314 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) {
7315 startbgsave = 1;
7316 slave->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7317 } else if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) {
7318 struct redis_stat buf;
7319
7320 if (bgsaveerr != REDIS_OK) {
7321 freeClient(slave);
7322 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE child returned an error");
7323 continue;
7324 }
7325 if ((slave->repldbfd = open(server.dbfilename,O_RDONLY)) == -1 ||
7326 redis_fstat(slave->repldbfd,&buf) == -1) {
7327 freeClient(slave);
7328 redisLog(REDIS_WARNING,"SYNC failed. Can't open/stat DB after BGSAVE: %s", strerror(errno));
7329 continue;
7330 }
7331 slave->repldboff = 0;
7332 slave->repldbsize = buf.st_size;
7333 slave->replstate = REDIS_REPL_SEND_BULK;
7334 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
7335 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE, sendBulkToSlave, slave) == AE_ERR) {
7336 freeClient(slave);
7337 continue;
7338 }
7339 }
7340 }
7341 if (startbgsave) {
7342 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
7343 listIter li;
7344
7345 listRewind(server.slaves,&li);
7346 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE failed");
7347 while((ln = listNext(&li))) {
7348 redisClient *slave = ln->value;
7349
7350 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START)
7351 freeClient(slave);
7352 }
7353 }
7354 }
7355 }
7356
7357 static int syncWithMaster(void) {
7358 char buf[1024], tmpfile[256], authcmd[1024];
7359 long dumpsize;
7360 int fd = anetTcpConnect(NULL,server.masterhost,server.masterport);
7361 int dfd, maxtries = 5;
7362
7363 if (fd == -1) {
7364 redisLog(REDIS_WARNING,"Unable to connect to MASTER: %s",
7365 strerror(errno));
7366 return REDIS_ERR;
7367 }
7368
7369 /* AUTH with the master if required. */
7370 if(server.masterauth) {
7371 snprintf(authcmd, 1024, "AUTH %s\r\n", server.masterauth);
7372 if (syncWrite(fd, authcmd, strlen(server.masterauth)+7, 5) == -1) {
7373 close(fd);
7374 redisLog(REDIS_WARNING,"Unable to AUTH to MASTER: %s",
7375 strerror(errno));
7376 return REDIS_ERR;
7377 }
7378 /* Read the AUTH result. */
7379 if (syncReadLine(fd,buf,1024,3600) == -1) {
7380 close(fd);
7381 redisLog(REDIS_WARNING,"I/O error reading auth result from MASTER: %s",
7382 strerror(errno));
7383 return REDIS_ERR;
7384 }
7385 if (buf[0] != '+') {
7386 close(fd);
7387 redisLog(REDIS_WARNING,"Cannot AUTH to MASTER, is the masterauth password correct?");
7388 return REDIS_ERR;
7389 }
7390 }
7391
7392 /* Issue the SYNC command */
7393 if (syncWrite(fd,"SYNC \r\n",7,5) == -1) {
7394 close(fd);
7395 redisLog(REDIS_WARNING,"I/O error writing to MASTER: %s",
7396 strerror(errno));
7397 return REDIS_ERR;
7398 }
7399 /* Read the bulk write count */
7400 if (syncReadLine(fd,buf,1024,3600) == -1) {
7401 close(fd);
7402 redisLog(REDIS_WARNING,"I/O error reading bulk count from MASTER: %s",
7403 strerror(errno));
7404 return REDIS_ERR;
7405 }
7406 if (buf[0] != '$') {
7407 close(fd);
7408 redisLog(REDIS_WARNING,"Bad protocol from MASTER, the first byte is not '$', are you sure the host and port are right?");
7409 return REDIS_ERR;
7410 }
7411 dumpsize = strtol(buf+1,NULL,10);
7412 redisLog(REDIS_NOTICE,"Receiving %ld bytes data dump from MASTER",dumpsize);
7413 /* Read the bulk write data on a temp file */
7414 while(maxtries--) {
7415 snprintf(tmpfile,256,
7416 "temp-%d.%ld.rdb",(int)time(NULL),(long int)getpid());
7417 dfd = open(tmpfile,O_CREAT|O_WRONLY|O_EXCL,0644);
7418 if (dfd != -1) break;
7419 sleep(1);
7420 }
7421 if (dfd == -1) {
7422 close(fd);
7423 redisLog(REDIS_WARNING,"Opening the temp file needed for MASTER <-> SLAVE synchronization: %s",strerror(errno));
7424 return REDIS_ERR;
7425 }
7426 while(dumpsize) {
7427 int nread, nwritten;
7428
7429 nread = read(fd,buf,(dumpsize < 1024)?dumpsize:1024);
7430 if (nread == -1) {
7431 redisLog(REDIS_WARNING,"I/O error trying to sync with MASTER: %s",
7432 strerror(errno));
7433 close(fd);
7434 close(dfd);
7435 return REDIS_ERR;
7436 }
7437 nwritten = write(dfd,buf,nread);
7438 if (nwritten == -1) {
7439 redisLog(REDIS_WARNING,"Write error writing to the DB dump file needed for MASTER <-> SLAVE synchrnonization: %s", strerror(errno));
7440 close(fd);
7441 close(dfd);
7442 return REDIS_ERR;
7443 }
7444 dumpsize -= nread;
7445 }
7446 close(dfd);
7447 if (rename(tmpfile,server.dbfilename) == -1) {
7448 redisLog(REDIS_WARNING,"Failed trying to rename the temp DB into dump.rdb in MASTER <-> SLAVE synchronization: %s", strerror(errno));
7449 unlink(tmpfile);
7450 close(fd);
7451 return REDIS_ERR;
7452 }
7453 emptyDb();
7454 if (rdbLoad(server.dbfilename) != REDIS_OK) {
7455 redisLog(REDIS_WARNING,"Failed trying to load the MASTER synchronization DB from disk");
7456 close(fd);
7457 return REDIS_ERR;
7458 }
7459 server.master = createClient(fd);
7460 server.master->flags |= REDIS_MASTER;
7461 server.master->authenticated = 1;
7462 server.replstate = REDIS_REPL_CONNECTED;
7463 return REDIS_OK;
7464 }
7465
7466 static void slaveofCommand(redisClient *c) {
7467 if (!strcasecmp(c->argv[1]->ptr,"no") &&
7468 !strcasecmp(c->argv[2]->ptr,"one")) {
7469 if (server.masterhost) {
7470 sdsfree(server.masterhost);
7471 server.masterhost = NULL;
7472 if (server.master) freeClient(server.master);
7473 server.replstate = REDIS_REPL_NONE;
7474 redisLog(REDIS_NOTICE,"MASTER MODE enabled (user request)");
7475 }
7476 } else {
7477 sdsfree(server.masterhost);
7478 server.masterhost = sdsdup(c->argv[1]->ptr);
7479 server.masterport = atoi(c->argv[2]->ptr);
7480 if (server.master) freeClient(server.master);
7481 server.replstate = REDIS_REPL_CONNECT;
7482 redisLog(REDIS_NOTICE,"SLAVE OF %s:%d enabled (user request)",
7483 server.masterhost, server.masterport);
7484 }
7485 addReply(c,shared.ok);
7486 }
7487
7488 /* ============================ Maxmemory directive ======================== */
7489
7490 /* Try to free one object form the pre-allocated objects free list.
7491 * This is useful under low mem conditions as by default we take 1 million
7492 * free objects allocated. On success REDIS_OK is returned, otherwise
7493 * REDIS_ERR. */
7494 static int tryFreeOneObjectFromFreelist(void) {
7495 robj *o;
7496
7497 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
7498 if (listLength(server.objfreelist)) {
7499 listNode *head = listFirst(server.objfreelist);
7500 o = listNodeValue(head);
7501 listDelNode(server.objfreelist,head);
7502 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
7503 zfree(o);
7504 return REDIS_OK;
7505 } else {
7506 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
7507 return REDIS_ERR;
7508 }
7509 }
7510
7511 /* This function gets called when 'maxmemory' is set on the config file to limit
7512 * the max memory used by the server, and we are out of memory.
7513 * This function will try to, in order:
7514 *
7515 * - Free objects from the free list
7516 * - Try to remove keys with an EXPIRE set
7517 *
7518 * It is not possible to free enough memory to reach used-memory < maxmemory
7519 * the server will start refusing commands that will enlarge even more the
7520 * memory usage.
7521 */
7522 static void freeMemoryIfNeeded(void) {
7523 while (server.maxmemory && zmalloc_used_memory() > server.maxmemory) {
7524 int j, k, freed = 0;
7525
7526 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
7527 for (j = 0; j < server.dbnum; j++) {
7528 int minttl = -1;
7529 robj *minkey = NULL;
7530 struct dictEntry *de;
7531
7532 if (dictSize(server.db[j].expires)) {
7533 freed = 1;
7534 /* From a sample of three keys drop the one nearest to
7535 * the natural expire */
7536 for (k = 0; k < 3; k++) {
7537 time_t t;
7538
7539 de = dictGetRandomKey(server.db[j].expires);
7540 t = (time_t) dictGetEntryVal(de);
7541 if (minttl == -1 || t < minttl) {
7542 minkey = dictGetEntryKey(de);
7543 minttl = t;
7544 }
7545 }
7546 deleteKey(server.db+j,minkey);
7547 }
7548 }
7549 if (!freed) return; /* nothing to free... */
7550 }
7551 }
7552
7553 /* ============================== Append Only file ========================== */
7554
7555 static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) {
7556 sds buf = sdsempty();
7557 int j;
7558 ssize_t nwritten;
7559 time_t now;
7560 robj *tmpargv[3];
7561
7562 /* The DB this command was targetting is not the same as the last command
7563 * we appendend. To issue a SELECT command is needed. */
7564 if (dictid != server.appendseldb) {
7565 char seldb[64];
7566
7567 snprintf(seldb,sizeof(seldb),"%d",dictid);
7568 buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
7569 (unsigned long)strlen(seldb),seldb);
7570 server.appendseldb = dictid;
7571 }
7572
7573 /* "Fix" the argv vector if the command is EXPIRE. We want to translate
7574 * EXPIREs into EXPIREATs calls */
7575 if (cmd->proc == expireCommand) {
7576 long when;
7577
7578 tmpargv[0] = createStringObject("EXPIREAT",8);
7579 tmpargv[1] = argv[1];
7580 incrRefCount(argv[1]);
7581 when = time(NULL)+strtol(argv[2]->ptr,NULL,10);
7582 tmpargv[2] = createObject(REDIS_STRING,
7583 sdscatprintf(sdsempty(),"%ld",when));
7584 argv = tmpargv;
7585 }
7586
7587 /* Append the actual command */
7588 buf = sdscatprintf(buf,"*%d\r\n",argc);
7589 for (j = 0; j < argc; j++) {
7590 robj *o = argv[j];
7591
7592 o = getDecodedObject(o);
7593 buf = sdscatprintf(buf,"$%lu\r\n",(unsigned long)sdslen(o->ptr));
7594 buf = sdscatlen(buf,o->ptr,sdslen(o->ptr));
7595 buf = sdscatlen(buf,"\r\n",2);
7596 decrRefCount(o);
7597 }
7598
7599 /* Free the objects from the modified argv for EXPIREAT */
7600 if (cmd->proc == expireCommand) {
7601 for (j = 0; j < 3; j++)
7602 decrRefCount(argv[j]);
7603 }
7604
7605 /* We want to perform a single write. This should be guaranteed atomic
7606 * at least if the filesystem we are writing is a real physical one.
7607 * While this will save us against the server being killed I don't think
7608 * there is much to do about the whole server stopping for power problems
7609 * or alike */
7610 nwritten = write(server.appendfd,buf,sdslen(buf));
7611 if (nwritten != (signed)sdslen(buf)) {
7612 /* Ooops, we are in troubles. The best thing to do for now is
7613 * to simply exit instead to give the illusion that everything is
7614 * working as expected. */
7615 if (nwritten == -1) {
7616 redisLog(REDIS_WARNING,"Exiting on error writing to the append-only file: %s",strerror(errno));
7617 } else {
7618 redisLog(REDIS_WARNING,"Exiting on short write while writing to the append-only file: %s",strerror(errno));
7619 }
7620 exit(1);
7621 }
7622 /* If a background append only file rewriting is in progress we want to
7623 * accumulate the differences between the child DB and the current one
7624 * in a buffer, so that when the child process will do its work we
7625 * can append the differences to the new append only file. */
7626 if (server.bgrewritechildpid != -1)
7627 server.bgrewritebuf = sdscatlen(server.bgrewritebuf,buf,sdslen(buf));
7628
7629 sdsfree(buf);
7630 now = time(NULL);
7631 if (server.appendfsync == APPENDFSYNC_ALWAYS ||
7632 (server.appendfsync == APPENDFSYNC_EVERYSEC &&
7633 now-server.lastfsync > 1))
7634 {
7635 fsync(server.appendfd); /* Let's try to get this data on the disk */
7636 server.lastfsync = now;
7637 }
7638 }
7639
7640 /* In Redis commands are always executed in the context of a client, so in
7641 * order to load the append only file we need to create a fake client. */
7642 static struct redisClient *createFakeClient(void) {
7643 struct redisClient *c = zmalloc(sizeof(*c));
7644
7645 selectDb(c,0);
7646 c->fd = -1;
7647 c->querybuf = sdsempty();
7648 c->argc = 0;
7649 c->argv = NULL;
7650 c->flags = 0;
7651 /* We set the fake client as a slave waiting for the synchronization
7652 * so that Redis will not try to send replies to this client. */
7653 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
7654 c->reply = listCreate();
7655 listSetFreeMethod(c->reply,decrRefCount);
7656 listSetDupMethod(c->reply,dupClientReplyValue);
7657 return c;
7658 }
7659
7660 static void freeFakeClient(struct redisClient *c) {
7661 sdsfree(c->querybuf);
7662 listRelease(c->reply);
7663 zfree(c);
7664 }
7665
7666 /* Replay the append log file. On error REDIS_OK is returned. On non fatal
7667 * error (the append only file is zero-length) REDIS_ERR is returned. On
7668 * fatal error an error message is logged and the program exists. */
7669 int loadAppendOnlyFile(char *filename) {
7670 struct redisClient *fakeClient;
7671 FILE *fp = fopen(filename,"r");
7672 struct redis_stat sb;
7673 unsigned long long loadedkeys = 0;
7674
7675 if (redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0)
7676 return REDIS_ERR;
7677
7678 if (fp == NULL) {
7679 redisLog(REDIS_WARNING,"Fatal error: can't open the append log file for reading: %s",strerror(errno));
7680 exit(1);
7681 }
7682
7683 fakeClient = createFakeClient();
7684 while(1) {
7685 int argc, j;
7686 unsigned long len;
7687 robj **argv;
7688 char buf[128];
7689 sds argsds;
7690 struct redisCommand *cmd;
7691
7692 if (fgets(buf,sizeof(buf),fp) == NULL) {
7693 if (feof(fp))
7694 break;
7695 else
7696 goto readerr;
7697 }
7698 if (buf[0] != '*') goto fmterr;
7699 argc = atoi(buf+1);
7700 argv = zmalloc(sizeof(robj*)*argc);
7701 for (j = 0; j < argc; j++) {
7702 if (fgets(buf,sizeof(buf),fp) == NULL) goto readerr;
7703 if (buf[0] != '$') goto fmterr;
7704 len = strtol(buf+1,NULL,10);
7705 argsds = sdsnewlen(NULL,len);
7706 if (len && fread(argsds,len,1,fp) == 0) goto fmterr;
7707 argv[j] = createObject(REDIS_STRING,argsds);
7708 if (fread(buf,2,1,fp) == 0) goto fmterr; /* discard CRLF */
7709 }
7710
7711 /* Command lookup */
7712 cmd = lookupCommand(argv[0]->ptr);
7713 if (!cmd) {
7714 redisLog(REDIS_WARNING,"Unknown command '%s' reading the append only file", argv[0]->ptr);
7715 exit(1);
7716 }
7717 /* Try object sharing and encoding */
7718 if (server.shareobjects) {
7719 int j;
7720 for(j = 1; j < argc; j++)
7721 argv[j] = tryObjectSharing(argv[j]);
7722 }
7723 if (cmd->flags & REDIS_CMD_BULK)
7724 tryObjectEncoding(argv[argc-1]);
7725 /* Run the command in the context of a fake client */
7726 fakeClient->argc = argc;
7727 fakeClient->argv = argv;
7728 cmd->proc(fakeClient);
7729 /* Discard the reply objects list from the fake client */
7730 while(listLength(fakeClient->reply))
7731 listDelNode(fakeClient->reply,listFirst(fakeClient->reply));
7732 /* Clean up, ready for the next command */
7733 for (j = 0; j < argc; j++) decrRefCount(argv[j]);
7734 zfree(argv);
7735 /* Handle swapping while loading big datasets when VM is on */
7736 loadedkeys++;
7737 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
7738 while (zmalloc_used_memory() > server.vm_max_memory) {
7739 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
7740 }
7741 }
7742 }
7743 fclose(fp);
7744 freeFakeClient(fakeClient);
7745 return REDIS_OK;
7746
7747 readerr:
7748 if (feof(fp)) {
7749 redisLog(REDIS_WARNING,"Unexpected end of file reading the append only file");
7750 } else {
7751 redisLog(REDIS_WARNING,"Unrecoverable error reading the append only file: %s", strerror(errno));
7752 }
7753 exit(1);
7754 fmterr:
7755 redisLog(REDIS_WARNING,"Bad file format reading the append only file");
7756 exit(1);
7757 }
7758
7759 /* Write an object into a file in the bulk format $<count>\r\n<payload>\r\n */
7760 static int fwriteBulkObject(FILE *fp, robj *obj) {
7761 char buf[128];
7762 int decrrc = 0;
7763
7764 /* Avoid the incr/decr ref count business if possible to help
7765 * copy-on-write (we are often in a child process when this function
7766 * is called).
7767 * Also makes sure that key objects don't get incrRefCount-ed when VM
7768 * is enabled */
7769 if (obj->encoding != REDIS_ENCODING_RAW) {
7770 obj = getDecodedObject(obj);
7771 decrrc = 1;
7772 }
7773 snprintf(buf,sizeof(buf),"$%ld\r\n",(long)sdslen(obj->ptr));
7774 if (fwrite(buf,strlen(buf),1,fp) == 0) goto err;
7775 if (sdslen(obj->ptr) && fwrite(obj->ptr,sdslen(obj->ptr),1,fp) == 0)
7776 goto err;
7777 if (fwrite("\r\n",2,1,fp) == 0) goto err;
7778 if (decrrc) decrRefCount(obj);
7779 return 1;
7780 err:
7781 if (decrrc) decrRefCount(obj);
7782 return 0;
7783 }
7784
/* Write a binary-safe string into a file in the bulk format
 * $<count>\r\n<payload>\r\n
 *
 * 's' points to 'len' bytes of payload (it may contain NUL bytes and is
 * not required to be NUL terminated). Returns 1 on success, 0 if any of
 * the writes failed. */
static int fwriteBulkString(FILE *fp, char *s, unsigned long len) {
    char buf[128];

    /* 'len' is unsigned: the conversion specifier must be %lu, not %ld */
    snprintf(buf,sizeof(buf),"$%lu\r\n",len);
    if (fwrite(buf,strlen(buf),1,fp) == 0) return 0;
    /* An empty payload is legal: nothing to write between the CRLFs */
    if (len && fwrite(s,len,1,fp) == 0) return 0;
    if (fwrite("\r\n",2,1,fp) == 0) return 0;
    return 1;
}
7796
/* Emit the double 'd', formatted with "%.17g", on 'fp' using the bulk format
 * $<count>\r\n<payload>\r\n. Returns 1 on success, 0 on write error. */
static int fwriteBulkDouble(FILE *fp, double d) {
    char payload[128], header[128];
    unsigned long count;

    /* The payload buffer already carries the trailing CRLF, which is not
     * part of the advertised count. */
    snprintf(payload,sizeof(payload),"%.17g\r\n",d);
    count = (unsigned long)strlen(payload)-2;
    snprintf(header,sizeof(header),"$%lu\r\n",count);
    if (fwrite(header,strlen(header),1,fp) == 0 ||
        fwrite(payload,strlen(payload),1,fp) == 0)
    {
        return 0;
    }
    return 1;
}
7807
/* Emit the long 'l', in decimal, on 'fp' using the bulk format
 * $<count>\r\n<payload>\r\n. Returns 1 on success, 0 on write error. */
static int fwriteBulkLong(FILE *fp, long l) {
    char payload[128], header[128];
    unsigned long count;

    /* The payload buffer already carries the trailing CRLF, which is not
     * part of the advertised count. */
    snprintf(payload,sizeof(payload),"%ld\r\n",l);
    count = (unsigned long)strlen(payload)-2;
    snprintf(header,sizeof(header),"$%lu\r\n",count);
    if (fwrite(header,strlen(header),1,fp) == 0 ||
        fwrite(payload,strlen(payload),1,fp) == 0)
    {
        return 0;
    }
    return 1;
}
7818
7819 /* Write a sequence of commands able to fully rebuild the dataset into
7820 * "filename". Used both by REWRITEAOF and BGREWRITEAOF. */
7821 static int rewriteAppendOnlyFile(char *filename) {
7822 dictIterator *di = NULL;
7823 dictEntry *de;
7824 FILE *fp;
7825 char tmpfile[256];
7826 int j;
7827 time_t now = time(NULL);
7828
7829 /* Note that we have to use a different temp name here compared to the
7830 * one used by rewriteAppendOnlyFileBackground() function. */
7831 snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());
7832 fp = fopen(tmpfile,"w");
7833 if (!fp) {
7834 redisLog(REDIS_WARNING, "Failed rewriting the append only file: %s", strerror(errno));
7835 return REDIS_ERR;
7836 }
7837 for (j = 0; j < server.dbnum; j++) {
7838 char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n";
7839 redisDb *db = server.db+j;
7840 dict *d = db->dict;
7841 if (dictSize(d) == 0) continue;
7842 di = dictGetIterator(d);
7843 if (!di) {
7844 fclose(fp);
7845 return REDIS_ERR;
7846 }
7847
7848 /* SELECT the new DB */
7849 if (fwrite(selectcmd,sizeof(selectcmd)-1,1,fp) == 0) goto werr;
7850 if (fwriteBulkLong(fp,j) == 0) goto werr;
7851
7852 /* Iterate this DB writing every entry */
7853 while((de = dictNext(di)) != NULL) {
7854 robj *key, *o;
7855 time_t expiretime;
7856 int swapped;
7857
7858 key = dictGetEntryKey(de);
7859 /* If the value for this key is swapped, load a preview in memory.
7860 * We use a "swapped" flag to remember if we need to free the
7861 * value object instead to just increment the ref count anyway
7862 * in order to avoid copy-on-write of pages if we are forked() */
7863 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
7864 key->storage == REDIS_VM_SWAPPING) {
7865 o = dictGetEntryVal(de);
7866 swapped = 0;
7867 } else {
7868 o = vmPreviewObject(key);
7869 swapped = 1;
7870 }
7871 expiretime = getExpire(db,key);
7872
7873 /* Save the key and associated value */
7874 if (o->type == REDIS_STRING) {
7875 /* Emit a SET command */
7876 char cmd[]="*3\r\n$3\r\nSET\r\n";
7877 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7878 /* Key and value */
7879 if (fwriteBulkObject(fp,key) == 0) goto werr;
7880 if (fwriteBulkObject(fp,o) == 0) goto werr;
7881 } else if (o->type == REDIS_LIST) {
7882 /* Emit the RPUSHes needed to rebuild the list */
7883 list *list = o->ptr;
7884 listNode *ln;
7885 listIter li;
7886
7887 listRewind(list,&li);
7888 while((ln = listNext(&li))) {
7889 char cmd[]="*3\r\n$5\r\nRPUSH\r\n";
7890 robj *eleobj = listNodeValue(ln);
7891
7892 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7893 if (fwriteBulkObject(fp,key) == 0) goto werr;
7894 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
7895 }
7896 } else if (o->type == REDIS_SET) {
7897 /* Emit the SADDs needed to rebuild the set */
7898 dict *set = o->ptr;
7899 dictIterator *di = dictGetIterator(set);
7900 dictEntry *de;
7901
7902 while((de = dictNext(di)) != NULL) {
7903 char cmd[]="*3\r\n$4\r\nSADD\r\n";
7904 robj *eleobj = dictGetEntryKey(de);
7905
7906 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7907 if (fwriteBulkObject(fp,key) == 0) goto werr;
7908 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
7909 }
7910 dictReleaseIterator(di);
7911 } else if (o->type == REDIS_ZSET) {
7912 /* Emit the ZADDs needed to rebuild the sorted set */
7913 zset *zs = o->ptr;
7914 dictIterator *di = dictGetIterator(zs->dict);
7915 dictEntry *de;
7916
7917 while((de = dictNext(di)) != NULL) {
7918 char cmd[]="*4\r\n$4\r\nZADD\r\n";
7919 robj *eleobj = dictGetEntryKey(de);
7920 double *score = dictGetEntryVal(de);
7921
7922 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7923 if (fwriteBulkObject(fp,key) == 0) goto werr;
7924 if (fwriteBulkDouble(fp,*score) == 0) goto werr;
7925 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
7926 }
7927 dictReleaseIterator(di);
7928 } else if (o->type == REDIS_HASH) {
7929 char cmd[]="*4\r\n$4\r\nHSET\r\n";
7930
7931 /* Emit the HSETs needed to rebuild the hash */
7932 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
7933 unsigned char *p = zipmapRewind(o->ptr);
7934 unsigned char *field, *val;
7935 unsigned int flen, vlen;
7936
7937 while((p = zipmapNext(p,&field,&flen,&val,&vlen)) != NULL) {
7938 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7939 if (fwriteBulkObject(fp,key) == 0) goto werr;
7940 if (fwriteBulkString(fp,(char*)field,flen) == -1)
7941 return -1;
7942 if (fwriteBulkString(fp,(char*)val,vlen) == -1)
7943 return -1;
7944 }
7945 } else {
7946 dictIterator *di = dictGetIterator(o->ptr);
7947 dictEntry *de;
7948
7949 while((de = dictNext(di)) != NULL) {
7950 robj *field = dictGetEntryKey(de);
7951 robj *val = dictGetEntryVal(de);
7952
7953 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7954 if (fwriteBulkObject(fp,key) == 0) goto werr;
7955 if (fwriteBulkObject(fp,field) == -1) return -1;
7956 if (fwriteBulkObject(fp,val) == -1) return -1;
7957 }
7958 dictReleaseIterator(di);
7959 }
7960 } else {
7961 redisAssert(0);
7962 }
7963 /* Save the expire time */
7964 if (expiretime != -1) {
7965 char cmd[]="*3\r\n$8\r\nEXPIREAT\r\n";
7966 /* If this key is already expired skip it */
7967 if (expiretime < now) continue;
7968 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7969 if (fwriteBulkObject(fp,key) == 0) goto werr;
7970 if (fwriteBulkLong(fp,expiretime) == 0) goto werr;
7971 }
7972 if (swapped) decrRefCount(o);
7973 }
7974 dictReleaseIterator(di);
7975 }
7976
7977 /* Make sure data will not remain on the OS's output buffers */
7978 fflush(fp);
7979 fsync(fileno(fp));
7980 fclose(fp);
7981
7982 /* Use RENAME to make sure the DB file is changed atomically only
7983 * if the generate DB file is ok. */
7984 if (rename(tmpfile,filename) == -1) {
7985 redisLog(REDIS_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno));
7986 unlink(tmpfile);
7987 return REDIS_ERR;
7988 }
7989 redisLog(REDIS_NOTICE,"SYNC append only file rewrite performed");
7990 return REDIS_OK;
7991
7992 werr:
7993 fclose(fp);
7994 unlink(tmpfile);
7995 redisLog(REDIS_WARNING,"Write error writing append only file on disk: %s", strerror(errno));
7996 if (di) dictReleaseIterator(di);
7997 return REDIS_ERR;
7998 }
7999
8000 /* This is how rewriting of the append only file in background works:
8001 *
8002 * 1) The user calls BGREWRITEAOF
8003 * 2) Redis calls this function, that forks():
8004 * 2a) the child rewrite the append only file in a temp file.
8005 * 2b) the parent accumulates differences in server.bgrewritebuf.
8006 * 3) When the child finished '2a' exists.
8007 * 4) The parent will trap the exit code, if it's OK, will append the
8008 * data accumulated into server.bgrewritebuf into the temp file, and
8009 * finally will rename(2) the temp file in the actual file name.
8010 * The the new file is reopened as the new append only file. Profit!
8011 */
8012 static int rewriteAppendOnlyFileBackground(void) {
8013 pid_t childpid;
8014
8015 if (server.bgrewritechildpid != -1) return REDIS_ERR;
8016 if (server.vm_enabled) waitEmptyIOJobsQueue();
8017 if ((childpid = fork()) == 0) {
8018 /* Child */
8019 char tmpfile[256];
8020
8021 if (server.vm_enabled) vmReopenSwapFile();
8022 close(server.fd);
8023 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
8024 if (rewriteAppendOnlyFile(tmpfile) == REDIS_OK) {
8025 _exit(0);
8026 } else {
8027 _exit(1);
8028 }
8029 } else {
8030 /* Parent */
8031 if (childpid == -1) {
8032 redisLog(REDIS_WARNING,
8033 "Can't rewrite append only file in background: fork: %s",
8034 strerror(errno));
8035 return REDIS_ERR;
8036 }
8037 redisLog(REDIS_NOTICE,
8038 "Background append only file rewriting started by pid %d",childpid);
8039 server.bgrewritechildpid = childpid;
8040 /* We set appendseldb to -1 in order to force the next call to the
8041 * feedAppendOnlyFile() to issue a SELECT command, so the differences
8042 * accumulated by the parent into server.bgrewritebuf will start
8043 * with a SELECT statement and it will be safe to merge. */
8044 server.appendseldb = -1;
8045 return REDIS_OK;
8046 }
8047 return REDIS_OK; /* unreached */
8048 }
8049
8050 static void bgrewriteaofCommand(redisClient *c) {
8051 if (server.bgrewritechildpid != -1) {
8052 addReplySds(c,sdsnew("-ERR background append only file rewriting already in progress\r\n"));
8053 return;
8054 }
8055 if (rewriteAppendOnlyFileBackground() == REDIS_OK) {
8056 char *status = "+Background append only file rewriting started\r\n";
8057 addReplySds(c,sdsnew(status));
8058 } else {
8059 addReply(c,shared.err);
8060 }
8061 }
8062
/* Remove the temp file used by the background rewrite child with the
 * specified pid. The result of unlink() is not checked. */
static void aofRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-rewriteaof-bg-%d.aof", (int) childpid);
    unlink(path);
}
8069
8070 /* Virtual Memory is composed mainly of two subsystems:
 * - Blocking Virtual Memory
8072 * - Threaded Virtual Memory I/O
8073 * The two parts are not fully decoupled, but functions are split among two
8074 * different sections of the source code (delimited by comments) in order to
8075 * make more clear what functionality is about the blocking VM and what about
8076 * the threaded (not blocking) VM.
8077 *
8078 * Redis VM design:
8079 *
8080 * Redis VM is a blocking VM (one that blocks reading swapped values from
8081 * disk into memory when a value swapped out is needed in memory) that is made
8082 * unblocking by trying to examine the command argument vector in order to
8083 * load in background values that will likely be needed in order to exec
8084 * the command. The command is executed only once all the relevant keys
8085 * are loaded into memory.
8086 *
 * This is basically almost as simple as a blocking VM, but almost as parallel
 * as a fully non-blocking VM.
8089 */
8090
8091 /* =================== Virtual Memory - Blocking Side ====================== */
8092
8093 /* substitute the first occurrence of '%p' with the process pid in the
8094 * swap file name. */
8095 static void expandVmSwapFilename(void) {
8096 char *p = strstr(server.vm_swap_file,"%p");
8097 sds new;
8098
8099 if (!p) return;
8100 new = sdsempty();
8101 *p = '\0';
8102 new = sdscat(new,server.vm_swap_file);
8103 new = sdscatprintf(new,"%ld",(long) getpid());
8104 new = sdscat(new,p+2);
8105 zfree(server.vm_swap_file);
8106 server.vm_swap_file = new;
8107 }
8108
8109 static void vmInit(void) {
8110 off_t totsize;
8111 int pipefds[2];
8112 size_t stacksize;
8113
8114 if (server.vm_max_threads != 0)
8115 zmalloc_enable_thread_safeness(); /* we need thread safe zmalloc() */
8116
8117 expandVmSwapFilename();
8118 redisLog(REDIS_NOTICE,"Using '%s' as swap file",server.vm_swap_file);
8119 if ((server.vm_fp = fopen(server.vm_swap_file,"r+b")) == NULL) {
8120 server.vm_fp = fopen(server.vm_swap_file,"w+b");
8121 }
8122 if (server.vm_fp == NULL) {
8123 redisLog(REDIS_WARNING,
8124 "Impossible to open the swap file: %s. Exiting.",
8125 strerror(errno));
8126 exit(1);
8127 }
8128 server.vm_fd = fileno(server.vm_fp);
8129 server.vm_next_page = 0;
8130 server.vm_near_pages = 0;
8131 server.vm_stats_used_pages = 0;
8132 server.vm_stats_swapped_objects = 0;
8133 server.vm_stats_swapouts = 0;
8134 server.vm_stats_swapins = 0;
8135 totsize = server.vm_pages*server.vm_page_size;
8136 redisLog(REDIS_NOTICE,"Allocating %lld bytes of swap file",totsize);
8137 if (ftruncate(server.vm_fd,totsize) == -1) {
8138 redisLog(REDIS_WARNING,"Can't ftruncate swap file: %s. Exiting.",
8139 strerror(errno));
8140 exit(1);
8141 } else {
8142 redisLog(REDIS_NOTICE,"Swap file allocated with success");
8143 }
8144 server.vm_bitmap = zmalloc((server.vm_pages+7)/8);
8145 redisLog(REDIS_VERBOSE,"Allocated %lld bytes page table for %lld pages",
8146 (long long) (server.vm_pages+7)/8, server.vm_pages);
8147 memset(server.vm_bitmap,0,(server.vm_pages+7)/8);
8148
8149 /* Initialize threaded I/O (used by Virtual Memory) */
8150 server.io_newjobs = listCreate();
8151 server.io_processing = listCreate();
8152 server.io_processed = listCreate();
8153 server.io_ready_clients = listCreate();
8154 pthread_mutex_init(&server.io_mutex,NULL);
8155 pthread_mutex_init(&server.obj_freelist_mutex,NULL);
8156 pthread_mutex_init(&server.io_swapfile_mutex,NULL);
8157 server.io_active_threads = 0;
8158 if (pipe(pipefds) == -1) {
8159 redisLog(REDIS_WARNING,"Unable to intialized VM: pipe(2): %s. Exiting."
8160 ,strerror(errno));
8161 exit(1);
8162 }
8163 server.io_ready_pipe_read = pipefds[0];
8164 server.io_ready_pipe_write = pipefds[1];
8165 redisAssert(anetNonBlock(NULL,server.io_ready_pipe_read) != ANET_ERR);
8166 /* LZF requires a lot of stack */
8167 pthread_attr_init(&server.io_threads_attr);
8168 pthread_attr_getstacksize(&server.io_threads_attr, &stacksize);
8169 while (stacksize < REDIS_THREAD_STACK_SIZE) stacksize *= 2;
8170 pthread_attr_setstacksize(&server.io_threads_attr, stacksize);
8171 /* Listen for events in the threaded I/O pipe */
8172 if (aeCreateFileEvent(server.el, server.io_ready_pipe_read, AE_READABLE,
8173 vmThreadedIOCompletedJob, NULL) == AE_ERR)
8174 oom("creating file event");
8175 }
8176
8177 /* Mark the page as used */
8178 static void vmMarkPageUsed(off_t page) {
8179 off_t byte = page/8;
8180 int bit = page&7;
8181 redisAssert(vmFreePage(page) == 1);
8182 server.vm_bitmap[byte] |= 1<<bit;
8183 }
8184
8185 /* Mark N contiguous pages as used, with 'page' being the first. */
8186 static void vmMarkPagesUsed(off_t page, off_t count) {
8187 off_t j;
8188
8189 for (j = 0; j < count; j++)
8190 vmMarkPageUsed(page+j);
8191 server.vm_stats_used_pages += count;
8192 redisLog(REDIS_DEBUG,"Mark USED pages: %lld pages at %lld\n",
8193 (long long)count, (long long)page);
8194 }
8195
8196 /* Mark the page as free */
8197 static void vmMarkPageFree(off_t page) {
8198 off_t byte = page/8;
8199 int bit = page&7;
8200 redisAssert(vmFreePage(page) == 0);
8201 server.vm_bitmap[byte] &= ~(1<<bit);
8202 }
8203
8204 /* Mark N contiguous pages as free, with 'page' being the first. */
8205 static void vmMarkPagesFree(off_t page, off_t count) {
8206 off_t j;
8207
8208 for (j = 0; j < count; j++)
8209 vmMarkPageFree(page+j);
8210 server.vm_stats_used_pages -= count;
8211 redisLog(REDIS_DEBUG,"Mark FREE pages: %lld pages at %lld\n",
8212 (long long)count, (long long)page);
8213 }
8214
8215 /* Test if the page is free */
8216 static int vmFreePage(off_t page) {
8217 off_t byte = page/8;
8218 int bit = page&7;
8219 return (server.vm_bitmap[byte] & (1<<bit)) == 0;
8220 }
8221
8222 /* Find N contiguous free pages storing the first page of the cluster in *first.
8223 * Returns REDIS_OK if it was able to find N contiguous pages, otherwise
8224 * REDIS_ERR is returned.
8225 *
8226 * This function uses a simple algorithm: we try to allocate
8227 * REDIS_VM_MAX_NEAR_PAGES sequentially, when we reach this limit we start
8228 * again from the start of the swap file searching for free spaces.
8229 *
8230 * If it looks pretty clear that there are no free pages near our offset
8231 * we try to find less populated places doing a forward jump of
8232 * REDIS_VM_MAX_RANDOM_JUMP, then we start scanning again a few pages
8233 * without hurry, and then we jump again and so forth...
8234 *
8235 * This function can be improved using a free list to avoid to guess
8236 * too much, since we could collect data about freed pages.
8237 *
8238 * note: I implemented this function just after watching an episode of
8239 * Battlestar Galactica, where the hybrid was continuing to say "JUMP!"
8240 */
8241 static int vmFindContiguousPages(off_t *first, off_t n) {
8242 off_t base, offset = 0, since_jump = 0, numfree = 0;
8243
8244 if (server.vm_near_pages == REDIS_VM_MAX_NEAR_PAGES) {
8245 server.vm_near_pages = 0;
8246 server.vm_next_page = 0;
8247 }
8248 server.vm_near_pages++; /* Yet another try for pages near to the old ones */
8249 base = server.vm_next_page;
8250
8251 while(offset < server.vm_pages) {
8252 off_t this = base+offset;
8253
8254 /* If we overflow, restart from page zero */
8255 if (this >= server.vm_pages) {
8256 this -= server.vm_pages;
8257 if (this == 0) {
8258 /* Just overflowed, what we found on tail is no longer
8259 * interesting, as it's no longer contiguous. */
8260 numfree = 0;
8261 }
8262 }
8263 if (vmFreePage(this)) {
8264 /* This is a free page */
8265 numfree++;
8266 /* Already got N free pages? Return to the caller, with success */
8267 if (numfree == n) {
8268 *first = this-(n-1);
8269 server.vm_next_page = this+1;
8270 redisLog(REDIS_DEBUG, "FOUND CONTIGUOUS PAGES: %lld pages at %lld\n", (long long) n, (long long) *first);
8271 return REDIS_OK;
8272 }
8273 } else {
8274 /* The current one is not a free page */
8275 numfree = 0;
8276 }
8277
8278 /* Fast-forward if the current page is not free and we already
8279 * searched enough near this place. */
8280 since_jump++;
8281 if (!numfree && since_jump >= REDIS_VM_MAX_RANDOM_JUMP/4) {
8282 offset += random() % REDIS_VM_MAX_RANDOM_JUMP;
8283 since_jump = 0;
8284 /* Note that even if we rewind after the jump, we are don't need
8285 * to make sure numfree is set to zero as we only jump *if* it
8286 * is set to zero. */
8287 } else {
8288 /* Otherwise just check the next page */
8289 offset++;
8290 }
8291 }
8292 return REDIS_ERR;
8293 }
8294
8295 /* Write the specified object at the specified page of the swap file */
8296 static int vmWriteObjectOnSwap(robj *o, off_t page) {
8297 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
8298 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
8299 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8300 redisLog(REDIS_WARNING,
8301 "Critical VM problem in vmWriteObjectOnSwap(): can't seek: %s",
8302 strerror(errno));
8303 return REDIS_ERR;
8304 }
8305 rdbSaveObject(server.vm_fp,o);
8306 fflush(server.vm_fp);
8307 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8308 return REDIS_OK;
8309 }
8310
8311 /* Swap the 'val' object relative to 'key' into disk. Store all the information
8312 * needed to later retrieve the object into the key object.
8313 * If we can't find enough contiguous empty pages to swap the object on disk
8314 * REDIS_ERR is returned. */
8315 static int vmSwapObjectBlocking(robj *key, robj *val) {
8316 off_t pages = rdbSavedObjectPages(val,NULL);
8317 off_t page;
8318
8319 assert(key->storage == REDIS_VM_MEMORY);
8320 assert(key->refcount == 1);
8321 if (vmFindContiguousPages(&page,pages) == REDIS_ERR) return REDIS_ERR;
8322 if (vmWriteObjectOnSwap(val,page) == REDIS_ERR) return REDIS_ERR;
8323 key->vm.page = page;
8324 key->vm.usedpages = pages;
8325 key->storage = REDIS_VM_SWAPPED;
8326 key->vtype = val->type;
8327 decrRefCount(val); /* Deallocate the object from memory. */
8328 vmMarkPagesUsed(page,pages);
8329 redisLog(REDIS_DEBUG,"VM: object %s swapped out at %lld (%lld pages)",
8330 (unsigned char*) key->ptr,
8331 (unsigned long long) page, (unsigned long long) pages);
8332 server.vm_stats_swapped_objects++;
8333 server.vm_stats_swapouts++;
8334 return REDIS_OK;
8335 }
8336
8337 static robj *vmReadObjectFromSwap(off_t page, int type) {
8338 robj *o;
8339
8340 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
8341 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
8342 redisLog(REDIS_WARNING,
8343 "Unrecoverable VM problem in vmReadObjectFromSwap(): can't seek: %s",
8344 strerror(errno));
8345 _exit(1);
8346 }
8347 o = rdbLoadObject(type,server.vm_fp);
8348 if (o == NULL) {
8349 redisLog(REDIS_WARNING, "Unrecoverable VM problem in vmReadObjectFromSwap(): can't load object from swap file: %s", strerror(errno));
8350 _exit(1);
8351 }
8352 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8353 return o;
8354 }
8355
8356 /* Load the value object relative to the 'key' object from swap to memory.
8357 * The newly allocated object is returned.
8358 *
8359 * If preview is true the unserialized object is returned to the caller but
8360 * no changes are made to the key object, nor the pages are marked as freed */
8361 static robj *vmGenericLoadObject(robj *key, int preview) {
8362 robj *val;
8363
8364 redisAssert(key->storage == REDIS_VM_SWAPPED || key->storage == REDIS_VM_LOADING);
8365 val = vmReadObjectFromSwap(key->vm.page,key->vtype);
8366 if (!preview) {
8367 key->storage = REDIS_VM_MEMORY;
8368 key->vm.atime = server.unixtime;
8369 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
8370 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk",
8371 (unsigned char*) key->ptr);
8372 server.vm_stats_swapped_objects--;
8373 } else {
8374 redisLog(REDIS_DEBUG, "VM: object %s previewed from disk",
8375 (unsigned char*) key->ptr);
8376 }
8377 server.vm_stats_swapins++;
8378 return val;
8379 }
8380
8381 /* Plain object loading, from swap to memory */
8382 static robj *vmLoadObject(robj *key) {
8383 /* If we are loading the object in background, stop it, we
8384 * need to load this object synchronously ASAP. */
8385 if (key->storage == REDIS_VM_LOADING)
8386 vmCancelThreadedIOJob(key);
8387 return vmGenericLoadObject(key,0);
8388 }
8389
8390 /* Just load the value on disk, without to modify the key.
8391 * This is useful when we want to perform some operation on the value
8392 * without to really bring it from swap to memory, like while saving the
8393 * dataset or rewriting the append only log. */
8394 static robj *vmPreviewObject(robj *key) {
8395 return vmGenericLoadObject(key,1);
8396 }
8397
8398 /* How a good candidate is this object for swapping?
8399 * The better candidate it is, the greater the returned value.
8400 *
8401 * Currently we try to perform a fast estimation of the object size in
8402 * memory, and combine it with aging informations.
8403 *
8404 * Basically swappability = idle-time * log(estimated size)
8405 *
8406 * Bigger objects are preferred over smaller objects, but not
8407 * proportionally, this is why we use the logarithm. This algorithm is
8408 * just a first try and will probably be tuned later. */
8409 static double computeObjectSwappability(robj *o) {
8410 time_t age = server.unixtime - o->vm.atime;
8411 long asize = 0;
8412 list *l;
8413 dict *d;
8414 struct dictEntry *de;
8415 int z;
8416
8417 if (age <= 0) return 0;
8418 switch(o->type) {
8419 case REDIS_STRING:
8420 if (o->encoding != REDIS_ENCODING_RAW) {
8421 asize = sizeof(*o);
8422 } else {
8423 asize = sdslen(o->ptr)+sizeof(*o)+sizeof(long)*2;
8424 }
8425 break;
8426 case REDIS_LIST:
8427 l = o->ptr;
8428 listNode *ln = listFirst(l);
8429
8430 asize = sizeof(list);
8431 if (ln) {
8432 robj *ele = ln->value;
8433 long elesize;
8434
8435 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8436 (sizeof(*o)+sdslen(ele->ptr)) :
8437 sizeof(*o);
8438 asize += (sizeof(listNode)+elesize)*listLength(l);
8439 }
8440 break;
8441 case REDIS_SET:
8442 case REDIS_ZSET:
8443 z = (o->type == REDIS_ZSET);
8444 d = z ? ((zset*)o->ptr)->dict : o->ptr;
8445
8446 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
8447 if (z) asize += sizeof(zset)-sizeof(dict);
8448 if (dictSize(d)) {
8449 long elesize;
8450 robj *ele;
8451
8452 de = dictGetRandomKey(d);
8453 ele = dictGetEntryKey(de);
8454 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8455 (sizeof(*o)+sdslen(ele->ptr)) :
8456 sizeof(*o);
8457 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
8458 if (z) asize += sizeof(zskiplistNode)*dictSize(d);
8459 }
8460 break;
8461 case REDIS_HASH:
8462 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
8463 unsigned char *p = zipmapRewind((unsigned char*)o->ptr);
8464 unsigned int len = zipmapLen((unsigned char*)o->ptr);
8465 unsigned int klen, vlen;
8466 unsigned char *key, *val;
8467
8468 if ((p = zipmapNext(p,&key,&klen,&val,&vlen)) == NULL) {
8469 klen = 0;
8470 vlen = 0;
8471 }
8472 asize = len*(klen+vlen+3);
8473 } else if (o->encoding == REDIS_ENCODING_HT) {
8474 d = o->ptr;
8475 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
8476 if (dictSize(d)) {
8477 long elesize;
8478 robj *ele;
8479
8480 de = dictGetRandomKey(d);
8481 ele = dictGetEntryKey(de);
8482 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8483 (sizeof(*o)+sdslen(ele->ptr)) :
8484 sizeof(*o);
8485 ele = dictGetEntryVal(de);
8486 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8487 (sizeof(*o)+sdslen(ele->ptr)) :
8488 sizeof(*o);
8489 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
8490 }
8491 }
8492 break;
8493 }
8494 return (double)age*log(1+asize);
8495 }
8496
8497 /* Try to swap an object that's a good candidate for swapping.
8498 * Returns REDIS_OK if the object was swapped, REDIS_ERR if it's not possible
8499 * to swap any object at all.
8500 *
8501 * If 'usethreaded' is true, Redis will try to swap the object in background
8502 * using I/O threads. */
8503 static int vmSwapOneObject(int usethreads) {
8504 int j, i;
8505 struct dictEntry *best = NULL;
8506 double best_swappability = 0;
8507 redisDb *best_db = NULL;
8508 robj *key, *val;
8509
8510 for (j = 0; j < server.dbnum; j++) {
8511 redisDb *db = server.db+j;
8512 /* Why maxtries is set to 100?
8513 * Because this way (usually) we'll find 1 object even if just 1% - 2%
8514 * are swappable objects */
8515 int maxtries = 100;
8516
8517 if (dictSize(db->dict) == 0) continue;
8518 for (i = 0; i < 5; i++) {
8519 dictEntry *de;
8520 double swappability;
8521
8522 if (maxtries) maxtries--;
8523 de = dictGetRandomKey(db->dict);
8524 key = dictGetEntryKey(de);
8525 val = dictGetEntryVal(de);
8526 /* Only swap objects that are currently in memory.
8527 *
8528 * Also don't swap shared objects if threaded VM is on, as we
8529 * try to ensure that the main thread does not touch the
8530 * object while the I/O thread is using it, but we can't
8531 * control other keys without adding additional mutex. */
8532 if (key->storage != REDIS_VM_MEMORY ||
8533 (server.vm_max_threads != 0 && val->refcount != 1)) {
8534 if (maxtries) i--; /* don't count this try */
8535 continue;
8536 }
8537 swappability = computeObjectSwappability(val);
8538 if (!best || swappability > best_swappability) {
8539 best = de;
8540 best_swappability = swappability;
8541 best_db = db;
8542 }
8543 }
8544 }
8545 if (best == NULL) return REDIS_ERR;
8546 key = dictGetEntryKey(best);
8547 val = dictGetEntryVal(best);
8548
8549 redisLog(REDIS_DEBUG,"Key with best swappability: %s, %f",
8550 key->ptr, best_swappability);
8551
8552 /* Unshare the key if needed */
8553 if (key->refcount > 1) {
8554 robj *newkey = dupStringObject(key);
8555 decrRefCount(key);
8556 key = dictGetEntryKey(best) = newkey;
8557 }
8558 /* Swap it */
8559 if (usethreads) {
8560 vmSwapObjectThreaded(key,val,best_db);
8561 return REDIS_OK;
8562 } else {
8563 if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
8564 dictGetEntryVal(best) = NULL;
8565 return REDIS_OK;
8566 } else {
8567 return REDIS_ERR;
8568 }
8569 }
8570 }
8571
/* Swap one object out without using the I/O threads.
 * Same return value as vmSwapOneObject(). */
static int vmSwapOneObjectBlocking() {
    const int usethreads = 0;

    return vmSwapOneObject(usethreads);
}
8575
/* Swap one object out using the I/O threads.
 * Same return value as vmSwapOneObject(). */
static int vmSwapOneObjectThreaded() {
    const int usethreads = 1;

    return vmSwapOneObject(usethreads);
}
8579
8580 /* Return true if it's safe to swap out objects in a given moment.
8581 * Basically we don't want to swap objects out while there is a BGSAVE
8582 * or a BGAEOREWRITE running in backgroud. */
8583 static int vmCanSwapOut(void) {
8584 return (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1);
8585 }
8586
8587 /* Delete a key if swapped. Returns 1 if the key was found, was swapped
8588 * and was deleted. Otherwise 0 is returned. */
8589 static int deleteIfSwapped(redisDb *db, robj *key) {
8590 dictEntry *de;
8591 robj *foundkey;
8592
8593 if ((de = dictFind(db->dict,key)) == NULL) return 0;
8594 foundkey = dictGetEntryKey(de);
8595 if (foundkey->storage == REDIS_VM_MEMORY) return 0;
8596 deleteKey(db,key);
8597 return 1;
8598 }
8599
8600 /* =================== Virtual Memory - Threaded I/O ======================= */
8601
8602 static void freeIOJob(iojob *j) {
8603 if ((j->type == REDIS_IOJOB_PREPARE_SWAP ||
8604 j->type == REDIS_IOJOB_DO_SWAP ||
8605 j->type == REDIS_IOJOB_LOAD) && j->val != NULL)
8606 decrRefCount(j->val);
8607 decrRefCount(j->key);
8608 zfree(j);
8609 }
8610
8611 /* Every time a thread finished a Job, it writes a byte into the write side
8612 * of an unix pipe in order to "awake" the main thread, and this function
8613 * is called. */
8614 static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata,
8615 int mask)
8616 {
8617 char buf[1];
8618 int retval, processed = 0, toprocess = -1, trytoswap = 1;
8619 REDIS_NOTUSED(el);
8620 REDIS_NOTUSED(mask);
8621 REDIS_NOTUSED(privdata);
8622
8623 /* For every byte we read in the read side of the pipe, there is one
8624 * I/O job completed to process. */
8625 while((retval = read(fd,buf,1)) == 1) {
8626 iojob *j;
8627 listNode *ln;
8628 robj *key;
8629 struct dictEntry *de;
8630
8631 redisLog(REDIS_DEBUG,"Processing I/O completed job");
8632
8633 /* Get the processed element (the oldest one) */
8634 lockThreadedIO();
8635 assert(listLength(server.io_processed) != 0);
8636 if (toprocess == -1) {
8637 toprocess = (listLength(server.io_processed)*REDIS_MAX_COMPLETED_JOBS_PROCESSED)/100;
8638 if (toprocess <= 0) toprocess = 1;
8639 }
8640 ln = listFirst(server.io_processed);
8641 j = ln->value;
8642 listDelNode(server.io_processed,ln);
8643 unlockThreadedIO();
8644 /* If this job is marked as canceled, just ignore it */
8645 if (j->canceled) {
8646 freeIOJob(j);
8647 continue;
8648 }
8649 /* Post process it in the main thread, as there are things we
8650 * can do just here to avoid race conditions and/or invasive locks */
8651 redisLog(REDIS_DEBUG,"Job %p type: %d, key at %p (%s) refcount: %d\n", (void*) j, j->type, (void*)j->key, (char*)j->key->ptr, j->key->refcount);
8652 de = dictFind(j->db->dict,j->key);
8653 assert(de != NULL);
8654 key = dictGetEntryKey(de);
8655 if (j->type == REDIS_IOJOB_LOAD) {
8656 redisDb *db;
8657
8658 /* Key loaded, bring it at home */
8659 key->storage = REDIS_VM_MEMORY;
8660 key->vm.atime = server.unixtime;
8661 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
8662 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk (threaded)",
8663 (unsigned char*) key->ptr);
8664 server.vm_stats_swapped_objects--;
8665 server.vm_stats_swapins++;
8666 dictGetEntryVal(de) = j->val;
8667 incrRefCount(j->val);
8668 db = j->db;
8669 freeIOJob(j);
8670 /* Handle clients waiting for this key to be loaded. */
8671 handleClientsBlockedOnSwappedKey(db,key);
8672 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
8673 /* Now we know the amount of pages required to swap this object.
8674 * Let's find some space for it, and queue this task again
8675 * rebranded as REDIS_IOJOB_DO_SWAP. */
8676 if (!vmCanSwapOut() ||
8677 vmFindContiguousPages(&j->page,j->pages) == REDIS_ERR)
8678 {
8679 /* Ooops... no space or we can't swap as there is
8680 * a fork()ed Redis trying to save stuff on disk. */
8681 freeIOJob(j);
8682 key->storage = REDIS_VM_MEMORY; /* undo operation */
8683 } else {
8684 /* Note that we need to mark this pages as used now,
8685 * if the job will be canceled, we'll mark them as freed
8686 * again. */
8687 vmMarkPagesUsed(j->page,j->pages);
8688 j->type = REDIS_IOJOB_DO_SWAP;
8689 lockThreadedIO();
8690 queueIOJob(j);
8691 unlockThreadedIO();
8692 }
8693 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
8694 robj *val;
8695
8696 /* Key swapped. We can finally free some memory. */
8697 if (key->storage != REDIS_VM_SWAPPING) {
8698 printf("key->storage: %d\n",key->storage);
8699 printf("key->name: %s\n",(char*)key->ptr);
8700 printf("key->refcount: %d\n",key->refcount);
8701 printf("val: %p\n",(void*)j->val);
8702 printf("val->type: %d\n",j->val->type);
8703 printf("val->ptr: %s\n",(char*)j->val->ptr);
8704 }
8705 redisAssert(key->storage == REDIS_VM_SWAPPING);
8706 val = dictGetEntryVal(de);
8707 key->vm.page = j->page;
8708 key->vm.usedpages = j->pages;
8709 key->storage = REDIS_VM_SWAPPED;
8710 key->vtype = j->val->type;
8711 decrRefCount(val); /* Deallocate the object from memory. */
8712 dictGetEntryVal(de) = NULL;
8713 redisLog(REDIS_DEBUG,
8714 "VM: object %s swapped out at %lld (%lld pages) (threaded)",
8715 (unsigned char*) key->ptr,
8716 (unsigned long long) j->page, (unsigned long long) j->pages);
8717 server.vm_stats_swapped_objects++;
8718 server.vm_stats_swapouts++;
8719 freeIOJob(j);
8720 /* Put a few more swap requests in queue if we are still
8721 * out of memory */
8722 if (trytoswap && vmCanSwapOut() &&
8723 zmalloc_used_memory() > server.vm_max_memory)
8724 {
8725 int more = 1;
8726 while(more) {
8727 lockThreadedIO();
8728 more = listLength(server.io_newjobs) <
8729 (unsigned) server.vm_max_threads;
8730 unlockThreadedIO();
8731 /* Don't waste CPU time if swappable objects are rare. */
8732 if (vmSwapOneObjectThreaded() == REDIS_ERR) {
8733 trytoswap = 0;
8734 break;
8735 }
8736 }
8737 }
8738 }
8739 processed++;
8740 if (processed == toprocess) return;
8741 }
8742 if (retval < 0 && errno != EAGAIN) {
8743 redisLog(REDIS_WARNING,
8744 "WARNING: read(2) error in vmThreadedIOCompletedJob() %s",
8745 strerror(errno));
8746 }
8747 }
8748
8749 static void lockThreadedIO(void) {
8750 pthread_mutex_lock(&server.io_mutex);
8751 }
8752
8753 static void unlockThreadedIO(void) {
8754 pthread_mutex_unlock(&server.io_mutex);
8755 }
8756
8757 /* Remove the specified object from the threaded I/O queue if still not
8758 * processed, otherwise make sure to flag it as canceled. */
8759 static void vmCancelThreadedIOJob(robj *o) {
8760 list *lists[3] = {
8761 server.io_newjobs, /* 0 */
8762 server.io_processing, /* 1 */
8763 server.io_processed /* 2 */
8764 };
8765 int i;
8766
8767 assert(o->storage == REDIS_VM_LOADING || o->storage == REDIS_VM_SWAPPING);
8768 again:
8769 lockThreadedIO();
8770 /* Search for a matching key in one of the queues */
8771 for (i = 0; i < 3; i++) {
8772 listNode *ln;
8773 listIter li;
8774
8775 listRewind(lists[i],&li);
8776 while ((ln = listNext(&li)) != NULL) {
8777 iojob *job = ln->value;
8778
8779 if (job->canceled) continue; /* Skip this, already canceled. */
8780 if (compareStringObjects(job->key,o) == 0) {
8781 redisLog(REDIS_DEBUG,"*** CANCELED %p (%s) (type %d) (LIST ID %d)\n",
8782 (void*)job, (char*)o->ptr, job->type, i);
8783 /* Mark the pages as free since the swap didn't happened
8784 * or happened but is now discarded. */
8785 if (i != 1 && job->type == REDIS_IOJOB_DO_SWAP)
8786 vmMarkPagesFree(job->page,job->pages);
8787 /* Cancel the job. It depends on the list the job is
8788 * living in. */
8789 switch(i) {
8790 case 0: /* io_newjobs */
8791 /* If the job was yet not processed the best thing to do
8792 * is to remove it from the queue at all */
8793 freeIOJob(job);
8794 listDelNode(lists[i],ln);
8795 break;
8796 case 1: /* io_processing */
8797 /* Oh Shi- the thread is messing with the Job:
8798 *
8799 * Probably it's accessing the object if this is a
8800 * PREPARE_SWAP or DO_SWAP job.
8801 * If it's a LOAD job it may be reading from disk and
8802 * if we don't wait for the job to terminate before to
8803 * cancel it, maybe in a few microseconds data can be
8804 * corrupted in this pages. So the short story is:
8805 *
8806 * Better to wait for the job to move into the
8807 * next queue (processed)... */
8808
8809 /* We try again and again until the job is completed. */
8810 unlockThreadedIO();
8811 /* But let's wait some time for the I/O thread
8812 * to finish with this job. After all this condition
8813 * should be very rare. */
8814 usleep(1);
8815 goto again;
8816 case 2: /* io_processed */
8817 /* The job was already processed, that's easy...
8818 * just mark it as canceled so that we'll ignore it
8819 * when processing completed jobs. */
8820 job->canceled = 1;
8821 break;
8822 }
8823 /* Finally we have to adjust the storage type of the object
8824 * in order to "UNDO" the operaiton. */
8825 if (o->storage == REDIS_VM_LOADING)
8826 o->storage = REDIS_VM_SWAPPED;
8827 else if (o->storage == REDIS_VM_SWAPPING)
8828 o->storage = REDIS_VM_MEMORY;
8829 unlockThreadedIO();
8830 return;
8831 }
8832 }
8833 }
8834 unlockThreadedIO();
8835 assert(1 != 1); /* We should never reach this */
8836 }
8837
8838 static void *IOThreadEntryPoint(void *arg) {
8839 iojob *j;
8840 listNode *ln;
8841 REDIS_NOTUSED(arg);
8842
8843 pthread_detach(pthread_self());
8844 while(1) {
8845 /* Get a new job to process */
8846 lockThreadedIO();
8847 if (listLength(server.io_newjobs) == 0) {
8848 /* No new jobs in queue, exit. */
8849 redisLog(REDIS_DEBUG,"Thread %ld exiting, nothing to do",
8850 (long) pthread_self());
8851 server.io_active_threads--;
8852 unlockThreadedIO();
8853 return NULL;
8854 }
8855 ln = listFirst(server.io_newjobs);
8856 j = ln->value;
8857 listDelNode(server.io_newjobs,ln);
8858 /* Add the job in the processing queue */
8859 j->thread = pthread_self();
8860 listAddNodeTail(server.io_processing,j);
8861 ln = listLast(server.io_processing); /* We use ln later to remove it */
8862 unlockThreadedIO();
8863 redisLog(REDIS_DEBUG,"Thread %ld got a new job (type %d): %p about key '%s'",
8864 (long) pthread_self(), j->type, (void*)j, (char*)j->key->ptr);
8865
8866 /* Process the Job */
8867 if (j->type == REDIS_IOJOB_LOAD) {
8868 j->val = vmReadObjectFromSwap(j->page,j->key->vtype);
8869 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
8870 FILE *fp = fopen("/dev/null","w+");
8871 j->pages = rdbSavedObjectPages(j->val,fp);
8872 fclose(fp);
8873 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
8874 if (vmWriteObjectOnSwap(j->val,j->page) == REDIS_ERR)
8875 j->canceled = 1;
8876 }
8877
8878 /* Done: insert the job into the processed queue */
8879 redisLog(REDIS_DEBUG,"Thread %ld completed the job: %p (key %s)",
8880 (long) pthread_self(), (void*)j, (char*)j->key->ptr);
8881 lockThreadedIO();
8882 listDelNode(server.io_processing,ln);
8883 listAddNodeTail(server.io_processed,j);
8884 unlockThreadedIO();
8885
8886 /* Signal the main thread there is new stuff to process */
8887 assert(write(server.io_ready_pipe_write,"x",1) == 1);
8888 }
8889 return NULL; /* never reached */
8890 }
8891
8892 static void spawnIOThread(void) {
8893 pthread_t thread;
8894 sigset_t mask, omask;
8895 int err;
8896
8897 sigemptyset(&mask);
8898 sigaddset(&mask,SIGCHLD);
8899 sigaddset(&mask,SIGHUP);
8900 sigaddset(&mask,SIGPIPE);
8901 pthread_sigmask(SIG_SETMASK, &mask, &omask);
8902 while ((err = pthread_create(&thread,&server.io_threads_attr,IOThreadEntryPoint,NULL)) != 0) {
8903 redisLog(REDIS_WARNING,"Unable to spawn an I/O thread: %s",
8904 strerror(err));
8905 usleep(1000000);
8906 }
8907 pthread_sigmask(SIG_SETMASK, &omask, NULL);
8908 server.io_active_threads++;
8909 }
8910
8911 /* We need to wait for the last thread to exit before we are able to
8912 * fork() in order to BGSAVE or BGREWRITEAOF. */
8913 static void waitEmptyIOJobsQueue(void) {
8914 while(1) {
8915 int io_processed_len;
8916
8917 lockThreadedIO();
8918 if (listLength(server.io_newjobs) == 0 &&
8919 listLength(server.io_processing) == 0 &&
8920 server.io_active_threads == 0)
8921 {
8922 unlockThreadedIO();
8923 return;
8924 }
8925 /* While waiting for empty jobs queue condition we post-process some
8926 * finshed job, as I/O threads may be hanging trying to write against
8927 * the io_ready_pipe_write FD but there are so much pending jobs that
8928 * it's blocking. */
8929 io_processed_len = listLength(server.io_processed);
8930 unlockThreadedIO();
8931 if (io_processed_len) {
8932 vmThreadedIOCompletedJob(NULL,server.io_ready_pipe_read,NULL,0);
8933 usleep(1000); /* 1 millisecond */
8934 } else {
8935 usleep(10000); /* 10 milliseconds */
8936 }
8937 }
8938 }
8939
8940 static void vmReopenSwapFile(void) {
8941 /* Note: we don't close the old one as we are in the child process
8942 * and don't want to mess at all with the original file object. */
8943 server.vm_fp = fopen(server.vm_swap_file,"r+b");
8944 if (server.vm_fp == NULL) {
8945 redisLog(REDIS_WARNING,"Can't re-open the VM swap file: %s. Exiting.",
8946 server.vm_swap_file);
8947 _exit(1);
8948 }
8949 server.vm_fd = fileno(server.vm_fp);
8950 }
8951
8952 /* This function must be called while with threaded IO locked */
8953 static void queueIOJob(iojob *j) {
8954 redisLog(REDIS_DEBUG,"Queued IO Job %p type %d about key '%s'\n",
8955 (void*)j, j->type, (char*)j->key->ptr);
8956 listAddNodeTail(server.io_newjobs,j);
8957 if (server.io_active_threads < server.vm_max_threads)
8958 spawnIOThread();
8959 }
8960
8961 static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db) {
8962 iojob *j;
8963
8964 assert(key->storage == REDIS_VM_MEMORY);
8965 assert(key->refcount == 1);
8966
8967 j = zmalloc(sizeof(*j));
8968 j->type = REDIS_IOJOB_PREPARE_SWAP;
8969 j->db = db;
8970 j->key = dupStringObject(key);
8971 j->val = val;
8972 incrRefCount(val);
8973 j->canceled = 0;
8974 j->thread = (pthread_t) -1;
8975 key->storage = REDIS_VM_SWAPPING;
8976
8977 lockThreadedIO();
8978 queueIOJob(j);
8979 unlockThreadedIO();
8980 return REDIS_OK;
8981 }
8982
8983 /* ============ Virtual Memory - Blocking clients on missing keys =========== */
8984
8985 /* This function makes the clinet 'c' waiting for the key 'key' to be loaded.
8986 * If there is not already a job loading the key, it is craeted.
8987 * The key is added to the io_keys list in the client structure, and also
8988 * in the hash table mapping swapped keys to waiting clients, that is,
8989 * server.io_waited_keys. */
8990 static int waitForSwappedKey(redisClient *c, robj *key) {
8991 struct dictEntry *de;
8992 robj *o;
8993 list *l;
8994
8995 /* If the key does not exist or is already in RAM we don't need to
8996 * block the client at all. */
8997 de = dictFind(c->db->dict,key);
8998 if (de == NULL) return 0;
8999 o = dictGetEntryKey(de);
9000 if (o->storage == REDIS_VM_MEMORY) {
9001 return 0;
9002 } else if (o->storage == REDIS_VM_SWAPPING) {
9003 /* We were swapping the key, undo it! */
9004 vmCancelThreadedIOJob(o);
9005 return 0;
9006 }
9007
9008 /* OK: the key is either swapped, or being loaded just now. */
9009
9010 /* Add the key to the list of keys this client is waiting for.
9011 * This maps clients to keys they are waiting for. */
9012 listAddNodeTail(c->io_keys,key);
9013 incrRefCount(key);
9014
9015 /* Add the client to the swapped keys => clients waiting map. */
9016 de = dictFind(c->db->io_keys,key);
9017 if (de == NULL) {
9018 int retval;
9019
9020 /* For every key we take a list of clients blocked for it */
9021 l = listCreate();
9022 retval = dictAdd(c->db->io_keys,key,l);
9023 incrRefCount(key);
9024 assert(retval == DICT_OK);
9025 } else {
9026 l = dictGetEntryVal(de);
9027 }
9028 listAddNodeTail(l,c);
9029
9030 /* Are we already loading the key from disk? If not create a job */
9031 if (o->storage == REDIS_VM_SWAPPED) {
9032 iojob *j;
9033
9034 o->storage = REDIS_VM_LOADING;
9035 j = zmalloc(sizeof(*j));
9036 j->type = REDIS_IOJOB_LOAD;
9037 j->db = c->db;
9038 j->key = dupStringObject(key);
9039 j->key->vtype = o->vtype;
9040 j->page = o->vm.page;
9041 j->val = NULL;
9042 j->canceled = 0;
9043 j->thread = (pthread_t) -1;
9044 lockThreadedIO();
9045 queueIOJob(j);
9046 unlockThreadedIO();
9047 }
9048 return 1;
9049 }
9050
9051 /* Preload keys needed for the ZUNION and ZINTER commands. */
9052 static void zunionInterBlockClientOnSwappedKeys(redisClient *c) {
9053 int i, num;
9054 num = atoi(c->argv[2]->ptr);
9055 for (i = 0; i < num; i++) {
9056 waitForSwappedKey(c,c->argv[3+i]);
9057 }
9058 }
9059
9060 /* Is this client attempting to run a command against swapped keys?
9061 * If so, block it ASAP, load the keys in background, then resume it.
9062 *
9063 * The important idea about this function is that it can fail! If keys will
9064 * still be swapped when the client is resumed, this key lookups will
9065 * just block loading keys from disk. In practical terms this should only
9066 * happen with SORT BY command or if there is a bug in this function.
9067 *
9068 * Return 1 if the client is marked as blocked, 0 if the client can
9069 * continue as the keys it is going to access appear to be in memory. */
9070 static int blockClientOnSwappedKeys(struct redisCommand *cmd, redisClient *c) {
9071 int j, last;
9072
9073 if (cmd->vm_preload_proc != NULL) {
9074 cmd->vm_preload_proc(c);
9075 } else {
9076 if (cmd->vm_firstkey == 0) return 0;
9077 last = cmd->vm_lastkey;
9078 if (last < 0) last = c->argc+last;
9079 for (j = cmd->vm_firstkey; j <= last; j += cmd->vm_keystep)
9080 waitForSwappedKey(c,c->argv[j]);
9081 }
9082
9083 /* If the client was blocked for at least one key, mark it as blocked. */
9084 if (listLength(c->io_keys)) {
9085 c->flags |= REDIS_IO_WAIT;
9086 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
9087 server.vm_blocked_clients++;
9088 return 1;
9089 } else {
9090 return 0;
9091 }
9092 }
9093
9094 /* Remove the 'key' from the list of blocked keys for a given client.
9095 *
9096 * The function returns 1 when there are no longer blocking keys after
9097 * the current one was removed (and the client can be unblocked). */
9098 static int dontWaitForSwappedKey(redisClient *c, robj *key) {
9099 list *l;
9100 listNode *ln;
9101 listIter li;
9102 struct dictEntry *de;
9103
9104 /* Remove the key from the list of keys this client is waiting for. */
9105 listRewind(c->io_keys,&li);
9106 while ((ln = listNext(&li)) != NULL) {
9107 if (compareStringObjects(ln->value,key) == 0) {
9108 listDelNode(c->io_keys,ln);
9109 break;
9110 }
9111 }
9112 assert(ln != NULL);
9113
9114 /* Remove the client form the key => waiting clients map. */
9115 de = dictFind(c->db->io_keys,key);
9116 assert(de != NULL);
9117 l = dictGetEntryVal(de);
9118 ln = listSearchKey(l,c);
9119 assert(ln != NULL);
9120 listDelNode(l,ln);
9121 if (listLength(l) == 0)
9122 dictDelete(c->db->io_keys,key);
9123
9124 return listLength(c->io_keys) == 0;
9125 }
9126
9127 static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key) {
9128 struct dictEntry *de;
9129 list *l;
9130 listNode *ln;
9131 int len;
9132
9133 de = dictFind(db->io_keys,key);
9134 if (!de) return;
9135
9136 l = dictGetEntryVal(de);
9137 len = listLength(l);
9138 /* Note: we can't use something like while(listLength(l)) as the list
9139 * can be freed by the calling function when we remove the last element. */
9140 while (len--) {
9141 ln = listFirst(l);
9142 redisClient *c = ln->value;
9143
9144 if (dontWaitForSwappedKey(c,key)) {
9145 /* Put the client in the list of clients ready to go as we
9146 * loaded all the keys about it. */
9147 listAddNodeTail(server.io_ready_clients,c);
9148 }
9149 }
9150 }
9151
9152 /* =========================== Remote Configuration ========================= */
9153
9154 static void configSetCommand(redisClient *c) {
9155 robj *o = getDecodedObject(c->argv[3]);
9156 if (!strcasecmp(c->argv[2]->ptr,"dbfilename")) {
9157 zfree(server.dbfilename);
9158 server.dbfilename = zstrdup(o->ptr);
9159 } else if (!strcasecmp(c->argv[2]->ptr,"requirepass")) {
9160 zfree(server.requirepass);
9161 server.requirepass = zstrdup(o->ptr);
9162 } else if (!strcasecmp(c->argv[2]->ptr,"masterauth")) {
9163 zfree(server.masterauth);
9164 server.masterauth = zstrdup(o->ptr);
9165 } else if (!strcasecmp(c->argv[2]->ptr,"maxmemory")) {
9166 server.maxmemory = strtoll(o->ptr, NULL, 10);
9167 } else {
9168 addReplySds(c,sdscatprintf(sdsempty(),
9169 "-ERR not supported CONFIG parameter %s\r\n",
9170 (char*)c->argv[2]->ptr));
9171 decrRefCount(o);
9172 return;
9173 }
9174 decrRefCount(o);
9175 addReply(c,shared.ok);
9176 }
9177
9178 static void configGetCommand(redisClient *c) {
9179 robj *o = getDecodedObject(c->argv[2]);
9180 robj *lenobj = createObject(REDIS_STRING,NULL);
9181 char *pattern = o->ptr;
9182 int matches = 0;
9183
9184 addReply(c,lenobj);
9185 decrRefCount(lenobj);
9186
9187 if (stringmatch(pattern,"dbfilename",0)) {
9188 addReplyBulkCString(c,"dbfilename");
9189 addReplyBulkCString(c,server.dbfilename);
9190 matches++;
9191 }
9192 if (stringmatch(pattern,"requirepass",0)) {
9193 addReplyBulkCString(c,"requirepass");
9194 addReplyBulkCString(c,server.requirepass);
9195 matches++;
9196 }
9197 if (stringmatch(pattern,"masterauth",0)) {
9198 addReplyBulkCString(c,"masterauth");
9199 addReplyBulkCString(c,server.masterauth);
9200 matches++;
9201 }
9202 if (stringmatch(pattern,"maxmemory",0)) {
9203 char buf[128];
9204
9205 snprintf(buf,128,"%llu\n",server.maxmemory);
9206 addReplyBulkCString(c,"maxmemory");
9207 addReplyBulkCString(c,buf);
9208 matches++;
9209 }
9210 decrRefCount(o);
9211 lenobj->ptr = sdscatprintf(sdsempty(),"*%d\r\n",matches*2);
9212 }
9213
9214 static void configCommand(redisClient *c) {
9215 if (!strcasecmp(c->argv[1]->ptr,"set")) {
9216 if (c->argc != 4) goto badarity;
9217 configSetCommand(c);
9218 } else if (!strcasecmp(c->argv[1]->ptr,"get")) {
9219 if (c->argc != 3) goto badarity;
9220 configGetCommand(c);
9221 } else if (!strcasecmp(c->argv[1]->ptr,"resetstat")) {
9222 if (c->argc != 2) goto badarity;
9223 server.stat_numcommands = 0;
9224 server.stat_numconnections = 0;
9225 server.stat_expiredkeys = 0;
9226 server.stat_starttime = time(NULL);
9227 addReply(c,shared.ok);
9228 } else {
9229 addReplySds(c,sdscatprintf(sdsempty(),
9230 "-ERR CONFIG subcommand must be one of GET, SET, RESETSTAT\r\n"));
9231 }
9232 return;
9233
9234 badarity:
9235 addReplySds(c,sdscatprintf(sdsempty(),
9236 "-ERR Wrong number of arguments for CONFIG %s\r\n",
9237 (char*) c->argv[1]->ptr));
9238 }
9239
9240 /* ================================= Debugging ============================== */
9241
9242 static void debugCommand(redisClient *c) {
9243 if (!strcasecmp(c->argv[1]->ptr,"segfault")) {
9244 *((char*)-1) = 'x';
9245 } else if (!strcasecmp(c->argv[1]->ptr,"reload")) {
9246 if (rdbSave(server.dbfilename) != REDIS_OK) {
9247 addReply(c,shared.err);
9248 return;
9249 }
9250 emptyDb();
9251 if (rdbLoad(server.dbfilename) != REDIS_OK) {
9252 addReply(c,shared.err);
9253 return;
9254 }
9255 redisLog(REDIS_WARNING,"DB reloaded by DEBUG RELOAD");
9256 addReply(c,shared.ok);
9257 } else if (!strcasecmp(c->argv[1]->ptr,"loadaof")) {
9258 emptyDb();
9259 if (loadAppendOnlyFile(server.appendfilename) != REDIS_OK) {
9260 addReply(c,shared.err);
9261 return;
9262 }
9263 redisLog(REDIS_WARNING,"Append Only File loaded by DEBUG LOADAOF");
9264 addReply(c,shared.ok);
9265 } else if (!strcasecmp(c->argv[1]->ptr,"object") && c->argc == 3) {
9266 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
9267 robj *key, *val;
9268
9269 if (!de) {
9270 addReply(c,shared.nokeyerr);
9271 return;
9272 }
9273 key = dictGetEntryKey(de);
9274 val = dictGetEntryVal(de);
9275 if (!server.vm_enabled || (key->storage == REDIS_VM_MEMORY ||
9276 key->storage == REDIS_VM_SWAPPING)) {
9277 char *strenc;
9278 char buf[128];
9279
9280 if (val->encoding < (sizeof(strencoding)/sizeof(char*))) {
9281 strenc = strencoding[val->encoding];
9282 } else {
9283 snprintf(buf,64,"unknown encoding %d\n", val->encoding);
9284 strenc = buf;
9285 }
9286 addReplySds(c,sdscatprintf(sdsempty(),
9287 "+Key at:%p refcount:%d, value at:%p refcount:%d "
9288 "encoding:%s serializedlength:%lld\r\n",
9289 (void*)key, key->refcount, (void*)val, val->refcount,
9290 strenc, (long long) rdbSavedObjectLen(val,NULL)));
9291 } else {
9292 addReplySds(c,sdscatprintf(sdsempty(),
9293 "+Key at:%p refcount:%d, value swapped at: page %llu "
9294 "using %llu pages\r\n",
9295 (void*)key, key->refcount, (unsigned long long) key->vm.page,
9296 (unsigned long long) key->vm.usedpages));
9297 }
9298 } else if (!strcasecmp(c->argv[1]->ptr,"swapout") && c->argc == 3) {
9299 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
9300 robj *key, *val;
9301
9302 if (!server.vm_enabled) {
9303 addReplySds(c,sdsnew("-ERR Virtual Memory is disabled\r\n"));
9304 return;
9305 }
9306 if (!de) {
9307 addReply(c,shared.nokeyerr);
9308 return;
9309 }
9310 key = dictGetEntryKey(de);
9311 val = dictGetEntryVal(de);
9312 /* If the key is shared we want to create a copy */
9313 if (key->refcount > 1) {
9314 robj *newkey = dupStringObject(key);
9315 decrRefCount(key);
9316 key = dictGetEntryKey(de) = newkey;
9317 }
9318 /* Swap it */
9319 if (key->storage != REDIS_VM_MEMORY) {
9320 addReplySds(c,sdsnew("-ERR This key is not in memory\r\n"));
9321 } else if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
9322 dictGetEntryVal(de) = NULL;
9323 addReply(c,shared.ok);
9324 } else {
9325 addReply(c,shared.err);
9326 }
9327 } else {
9328 addReplySds(c,sdsnew(
9329 "-ERR Syntax error, try DEBUG [SEGFAULT|OBJECT <key>|SWAPOUT <key>|RELOAD]\r\n"));
9330 }
9331 }
9332
9333 static void _redisAssert(char *estr, char *file, int line) {
9334 redisLog(REDIS_WARNING,"=== ASSERTION FAILED ===");
9335 redisLog(REDIS_WARNING,"==> %s:%d '%s' is not true\n",file,line,estr);
9336 #ifdef HAVE_BACKTRACE
9337 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
9338 *((char*)-1) = 'x';
9339 #endif
9340 }
9341
9342 /* =================================== Main! ================================ */
9343
9344 #ifdef __linux__
/* Read the first line of /proc/sys/vm/overcommit_memory and return it
 * converted with atoi(). Returns -1 when the file cannot be opened or
 * nothing can be read from it. */
int linuxOvercommitMemoryValue(void) {
    char line[64];
    int value = -1;
    FILE *fp = fopen("/proc/sys/vm/overcommit_memory","r");

    if (fp == NULL) return -1;
    if (fgets(line,64,fp) != NULL) value = atoi(line);
    fclose(fp);
    return value;
}
9358
9359 void linuxOvercommitMemoryWarning(void) {
9360 if (linuxOvercommitMemoryValue() == 0) {
9361 redisLog(REDIS_WARNING,"WARNING overcommit_memory is set to 0! Background save may fail under low condition memory. To fix this issue add 'vm.overcommit_memory = 1' to /etc/sysctl.conf and then reboot or run the command 'sysctl vm.overcommit_memory=1' for this to take effect.");
9362 }
9363 }
9364 #endif /* __linux__ */
9365
9366 static void daemonize(void) {
9367 int fd;
9368 FILE *fp;
9369
9370 if (fork() != 0) exit(0); /* parent exits */
9371 setsid(); /* create a new session */
9372
9373 /* Every output goes to /dev/null. If Redis is daemonized but
9374 * the 'logfile' is set to 'stdout' in the configuration file
9375 * it will not log at all. */
9376 if ((fd = open("/dev/null", O_RDWR, 0)) != -1) {
9377 dup2(fd, STDIN_FILENO);
9378 dup2(fd, STDOUT_FILENO);
9379 dup2(fd, STDERR_FILENO);
9380 if (fd > STDERR_FILENO) close(fd);
9381 }
9382 /* Try to write the pid file */
9383 fp = fopen(server.pidfile,"w");
9384 if (fp) {
9385 fprintf(fp,"%d\n",getpid());
9386 fclose(fp);
9387 }
9388 }
9389
9390 static void version() {
9391 printf("Redis server version %s\n", REDIS_VERSION);
9392 exit(0);
9393 }
9394
/* Print the command line usage on standard error and exit with status 1. */
static void usage()
{
    fputs("Usage: ./redis-server [/path/to/redis.conf]\n",stderr);
    fputs("       ./redis-server - (read config from stdin)\n",stderr);
    exit(1);
}
9400
9401 int main(int argc, char **argv) {
9402 time_t start;
9403
9404 initServerConfig();
9405 if (argc == 2) {
9406 if (strcmp(argv[1], "-v") == 0 ||
9407 strcmp(argv[1], "--version") == 0) version();
9408 if (strcmp(argv[1], "--help") == 0) usage();
9409 resetServerSaveParams();
9410 loadServerConfig(argv[1]);
9411 } else if ((argc > 2)) {
9412 usage();
9413 } else {
9414 redisLog(REDIS_WARNING,"Warning: no config file specified, using the default config. In order to specify a config file use 'redis-server /path/to/redis.conf'");
9415 }
9416 if (server.daemonize) daemonize();
9417 initServer();
9418 redisLog(REDIS_NOTICE,"Server started, Redis version " REDIS_VERSION);
9419 #ifdef __linux__
9420 linuxOvercommitMemoryWarning();
9421 #endif
9422 start = time(NULL);
9423 if (server.appendonly) {
9424 if (loadAppendOnlyFile(server.appendfilename) == REDIS_OK)
9425 redisLog(REDIS_NOTICE,"DB loaded from append only file: %ld seconds",time(NULL)-start);
9426 } else {
9427 if (rdbLoad(server.dbfilename) == REDIS_OK)
9428 redisLog(REDIS_NOTICE,"DB loaded from disk: %ld seconds",time(NULL)-start);
9429 }
9430 redisLog(REDIS_NOTICE,"The server is now ready to accept connections on port %d", server.port);
9431 aeSetBeforeSleepProc(server.el,beforeSleep);
9432 aeMain(server.el);
9433 aeDeleteEventLoop(server.el);
9434 return 0;
9435 }
9436
9437 /* ============================= Backtrace support ========================= */
9438
9439 #ifdef HAVE_BACKTRACE
9440 static char *findFuncName(void *pointer, unsigned long *offset);
9441
9442 static void *getMcontextEip(ucontext_t *uc) {
9443 #if defined(__FreeBSD__)
9444 return (void*) uc->uc_mcontext.mc_eip;
9445 #elif defined(__dietlibc__)
9446 return (void*) uc->uc_mcontext.eip;
9447 #elif defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6)
9448 #if __x86_64__
9449 return (void*) uc->uc_mcontext->__ss.__rip;
9450 #else
9451 return (void*) uc->uc_mcontext->__ss.__eip;
9452 #endif
9453 #elif defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)
9454 #if defined(_STRUCT_X86_THREAD_STATE64) && !defined(__i386__)
9455 return (void*) uc->uc_mcontext->__ss.__rip;
9456 #else
9457 return (void*) uc->uc_mcontext->__ss.__eip;
9458 #endif
9459 #elif defined(__i386__) || defined(__X86_64__) || defined(__x86_64__)
9460 return (void*) uc->uc_mcontext.gregs[REG_EIP]; /* Linux 32/64 bit */
9461 #elif defined(__ia64__) /* Linux IA64 */
9462 return (void*) uc->uc_mcontext.sc_ip;
9463 #else
9464 return NULL;
9465 #endif
9466 }
9467
9468 static void segvHandler(int sig, siginfo_t *info, void *secret) {
9469 void *trace[100];
9470 char **messages = NULL;
9471 int i, trace_size = 0;
9472 unsigned long offset=0;
9473 ucontext_t *uc = (ucontext_t*) secret;
9474 sds infostring;
9475 REDIS_NOTUSED(info);
9476
9477 redisLog(REDIS_WARNING,
9478 "======= Ooops! Redis %s got signal: -%d- =======", REDIS_VERSION, sig);
9479 infostring = genRedisInfoString();
9480 redisLog(REDIS_WARNING, "%s",infostring);
9481 /* It's not safe to sdsfree() the returned string under memory
9482 * corruption conditions. Let it leak as we are going to abort */
9483
9484 trace_size = backtrace(trace, 100);
9485 /* overwrite sigaction with caller's address */
9486 if (getMcontextEip(uc) != NULL) {
9487 trace[1] = getMcontextEip(uc);
9488 }
9489 messages = backtrace_symbols(trace, trace_size);
9490
9491 for (i=1; i<trace_size; ++i) {
9492 char *fn = findFuncName(trace[i], &offset), *p;
9493
9494 p = strchr(messages[i],'+');
9495 if (!fn || (p && ((unsigned long)strtol(p+1,NULL,10)) < offset)) {
9496 redisLog(REDIS_WARNING,"%s", messages[i]);
9497 } else {
9498 redisLog(REDIS_WARNING,"%d redis-server %p %s + %d", i, trace[i], fn, (unsigned int)offset);
9499 }
9500 }
9501 /* free(messages); Don't call free() with possibly corrupted memory. */
9502 _exit(0);
9503 }
9504
9505 static void setupSigSegvAction(void) {
9506 struct sigaction act;
9507
9508 sigemptyset (&act.sa_mask);
9509 /* When the SA_SIGINFO flag is set in sa_flags then sa_sigaction
9510 * is used. Otherwise, sa_handler is used */
9511 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND | SA_SIGINFO;
9512 act.sa_sigaction = segvHandler;
9513 sigaction (SIGSEGV, &act, NULL);
9514 sigaction (SIGBUS, &act, NULL);
9515 sigaction (SIGFPE, &act, NULL);
9516 sigaction (SIGILL, &act, NULL);
9517 sigaction (SIGBUS, &act, NULL);
9518 return;
9519 }
9520
9521 #include "staticsymbols.h"
9522 /* This function try to convert a pointer into a function name. It's used in
9523 * oreder to provide a backtrace under segmentation fault that's able to
9524 * display functions declared as static (otherwise the backtrace is useless). */
9525 static char *findFuncName(void *pointer, unsigned long *offset){
9526 int i, ret = -1;
9527 unsigned long off, minoff = 0;
9528
9529 /* Try to match against the Symbol with the smallest offset */
9530 for (i=0; symsTable[i].pointer; i++) {
9531 unsigned long lp = (unsigned long) pointer;
9532
9533 if (lp != (unsigned long)-1 && lp >= symsTable[i].pointer) {
9534 off=lp-symsTable[i].pointer;
9535 if (ret < 0 || off < minoff) {
9536 minoff=off;
9537 ret=i;
9538 }
9539 }
9540 }
9541 if (ret == -1) return NULL;
9542 *offset = minoff;
9543 return symsTable[ret].name;
9544 }
9545 #else /* HAVE_BACKTRACE */
/* Without backtrace support there is no handler to install: this version
 * does nothing. */
static void setupSigSegvAction(void)
{
}
9548 #endif /* HAVE_BACKTRACE */
9549
9550
9551
9552 /* The End */
9553
9554
9555