1 /* diskstore.c implements a very simple disk backed key-value store used
2 * by Redis for the "disk" backend. This implementation uses the filesystem
3 * to store key/value pairs. Every file represents a given key.
5 * The key path is calculated using the SHA1 of the key name. For instance
6 * the key "foo" is stored as a file name called:
8 * /0b/ee/0beec7b5ea3f0fdbc95d0dd47f3c5bc275da8a33
10 * The first two pairs of characters from the hex output of SHA1 are also
11 * used to locate two levels of directories to store the file (as most
12 * filesystems are not able to handle too many files in a single dir).
14 * In the end there are 65536 final directories (256 directories inside
15 * each of the 256 top level directories), so that with 1 billion files every
16 * directory will contain on average 15258 entries, which is ok for
17 * most filesystem implementations.
19 * Note that since Redis supports multiple databases, the actual key name
22 * /0b/ee/<dbid>_0beec7b5ea3f0fdbc95d0dd47f3c5bc275da8a33
24 * so for instance if the key is inside DB 0:
26 * /0b/ee/0_0beec7b5ea3f0fdbc95d0dd47f3c5bc275da8a33
28 * The actual implementation of this disk store is highly dependent on the
29 * filesystem implementation itself. This implementation may be replaced by
30 * a B+TREE implementation in future versions.
32 * Data of every key is serialized using the same format used for .rdb
33 * serialization. Everything is serialized on every entry: key name,
34 * ttl information in case of keys with an associated expire time, and the
35 * serialized value itself.
37 * Because the format is the same as that of the .rdb files it is trivial to
38 * create an .rdb file starting from this format just by means of scanning the
39 * directories and concatenating entries, with the sole addition of an
40 * .rdb header at the start and the end-of-db opcode at the end.
42 * -------------------------------------------------------------------------
44 * Copyright (c) 2010-2011, Salvatore Sanfilippo <antirez at gmail dot com>
45 * All rights reserved.
47 * Redistribution and use in source and binary forms, with or without
48 * modification, are permitted provided that the following conditions are met:
50 * * Redistributions of source code must retain the above copyright notice,
51 * this list of conditions and the following disclaimer.
52 * * Redistributions in binary form must reproduce the above copyright
53 * notice, this list of conditions and the following disclaimer in the
54 * documentation and/or other materials provided with the distribution.
55 * * Neither the name of Redis nor the names of its contributors may be used
56 * to endorse or promote products derived from this software without
57 * specific prior written permission.
59 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
60 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
61 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
62 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
63 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
64 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
65 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
66 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
67 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
68 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
69 * POSSIBILITY OF SUCH DAMAGE.
/* Create the 256 directories <prefix>00 .. <prefix>ff: two lowercase hex
 * digits are appended directly to 'prefix' and each directory is created
 * with mode 0755.
 *
 * 'prefix' is used verbatim, so it must already end with '/' when it names
 * a parent directory (the visible callers pass "<path>/" and "<path>/xx/").
 * A mkdir() failure is logged at REDIS_WARNING level; callers compare the
 * result against REDIS_ERR.
 * NOTE(review): the local declarations and the return statements are not
 * visible in this excerpt. */
79 int create256dir(char *prefix
) {
83 for (j
= 0; j
< 256; j
++) {
84 snprintf(buf
,sizeof(buf
),"%s%02x",prefix
,j
);
85 if (mkdir(buf
,0755) == -1) {
86 redisLog(REDIS_WARNING
,"Error creating dir %s for diskstore: %s",
/* Disk store open/initialization logic (the enclosing function header is
 * not visible in this excerpt). It stat()s server.ds_path and then:
 *  - logs a warning when stat() fails for a reason other than ENOENT,
 *  - accepts an already existing directory as is,
 *  - rejects an existing path that is not a directory,
 *  - otherwise creates the root directory and the 256x256 nested
 *    directories, returning REDIS_ERR if create256dir() fails. */
97 char *path
= server
.ds_path
;
/* NOTE(review): because '==' binds tighter than '=', retval receives the
 * result of the comparison (1 when stat() failed, 0 when it succeeded),
 * not stat()'s return value. The "retval == 0" tests below still mean
 * "stat() succeeded", so the logic holds, but the intent was probably
 * (retval = stat(path,&sb)) == -1. */
100 if ((retval
= stat(path
,&sb
) == -1) && errno
!= ENOENT
) {
101 redisLog(REDIS_WARNING
, "Error opening disk store at %s: %s",
102 path
, strerror(errno
));
106 /* Directory already in place. Assume everything is ok. */
107 if (retval
== 0 && S_ISDIR(sb
.st_mode
)) {
108 redisLog(REDIS_NOTICE
,"Disk store %s exists", path
);
112 /* File exists but it's not a directory */
113 if (retval
== 0 && !S_ISDIR(sb
.st_mode
)) {
114 redisLog(REDIS_WARNING
,"Disk store at %s is not a directory", path
);
118 /* New disk store, create the directory structure now, as creating
119 * them in a lazy way is not a good idea, after very few insertions
120 * we'll need most of the 65536 directories anyway. */
121 redisLog(REDIS_NOTICE
,"Disk store %s does not exist: creating", path
);
122 if (mkdir(path
,0755) == -1) {
123 redisLog(REDIS_WARNING
,"Disk store init failed creating dir %s: %s",
124 path
, strerror(errno
));
127 /* Create the top level 256 directories */
128 snprintf(buf
,sizeof(buf
),"%s/",path
);
129 if (create256dir(buf
) == REDIS_ERR
) return REDIS_ERR
;
131 /* For every 256 top level dir, create 256 nested dirs */
132 for (j
= 0; j
< 256; j
++) {
133 snprintf(buf
,sizeof(buf
),"%s/%02x/",path
,j
);
134 if (create256dir(buf
) == REDIS_ERR
) return REDIS_ERR
;
143 /* Convert key into full path for this object. Dirty but hopefully
144 * is fast enough. Returns the length of the returned path.
 *
 * The path written into 'buf' starts with server.ds_path and ends with the
 * decimal database id (db->id), '_' and the 40 character lowercase hex
 * SHA1 of the key bytes (key->ptr, sdslen(key->ptr) bytes).
 * NOTE(review): no buffer size is passed in; dsSet() passes a char[1024].
 * 'buf' presumably has to be large enough for server.ds_path plus the
 * fixed-size suffix -- confirm against all callers. */
145 int dsKeyToPath(redisDb
*db
, char *buf
, robj
*key
) {
147 unsigned char hash
[20];
/* hex holds exactly 40 characters and is never NUL terminated; it is only
 * copied with memcpy() below. */
148 char hex
[40], digits
[] = "0123456789abcdef";
153 SHA1Update(&ctx
,key
->ptr
,sdslen(key
->ptr
));
154 SHA1Final(hash
,&ctx
);
156 /* Convert the hash into hex format */
157 for (j
= 0; j
< 20; j
++) {
158 hex
[j
*2] = digits
[(hash
[j
]&0xF0)>>4];
159 hex
[(j
*2)+1] = digits
[hash
[j
]&0x0F];
162 /* Create the object path. Start with server.ds_path that's the root dir */
163 l
= sdslen(server
.ds_path
);
164 memcpy(buf
,server
.ds_path
,l
);
168 /* Then add xx/yy/ that is the two level directories */
177 /* Add the database number followed by _ and finally the SHA1 hex */
178 l
= ll2string(buf
,64,db
->id
);
/* buf+1 leaves one byte before the hex digest (for the '_' separator
 * mentioned above; the store to that byte is not visible here). */
181 memcpy(buf
+1,hex
,40);
/* 41 = one separator byte + 40 hex characters past the current 'buf'. */
183 return (buf
-origbuf
)+41;
/* Store 'val' under 'key' of database 'db' in the disk store, together
 * with its expire time.
 *
 * The entry is first serialized with rdbSaveKeyValuePair() into a
 * temporary file whose name is the final path followed by
 * "-<unix time>-<val pointer as long>", and is then moved over the final
 * path with rename(2) so that the update is atomic.
 * Failing to open the temporary file is retried in a loop: ENOSPC only
 * logs a warning, any other error logs and panics. A rename(2) failure
 * also panics.
 * NOTE(review): the return statements are not visible in this excerpt. */
186 int dsSet(redisDb
*db
, robj
*key
, robj
*val
, time_t expire
) {
187 char buf
[1024], buf2
[1024];
192 len
= dsKeyToPath(db
,buf
,key
);
193 memcpy(buf2
,buf
,len
);
/* The pointer value of 'val' is presumably used only to make the temp
 * file name unique -- NOTE(review): confirm. */
194 snprintf(buf2
+len
,sizeof(buf2
)-len
,"-%ld-%ld",(long)time(NULL
),(long)val
);
195 while ((fp
= fopen(buf2
,"w")) == NULL
) {
196 if (errno
== ENOSPC
) {
197 redisLog(REDIS_WARNING
,"Diskstore: No space left on device. Please make room and wait 30 seconds for Redis to continue.");
200 redisLog(REDIS_WARNING
,"diskstore error opening %s: %s",
201 buf2
, strerror(errno
));
202 redisPanic("Unrecoverable diskstore error. Exiting.");
206 rdb
= rioInitWithFile(fp
);
207 if ((retval
= rdbSaveKeyValuePair(&rdb
,key
,val
,expire
,time(NULL
))) == -1)
211 /* Expired key. Unlink failing not critical */
215 /* Use rename for atomic update of value */
216 if (rename(buf2
,buf
) == -1) {
217 redisLog(REDIS_WARNING
,"rename(2) returned an error: %s",
219 redisPanic("Unrecoverable diskstore error. Exiting.");
/* Load the value stored for 'key' of database 'db' from the disk store.
 *
 * Returns NULL when the key file does not exist (ENOENT). On success the
 * expire time read from the file is stored into '*expire' (-1 when the
 * entry has no expire). The entry layout read here is: an optional
 * REDIS_EXPIRETIME type followed by the time, the value type, the key as a
 * string object and finally the serialized value.
 * An entry whose expire time is already in the past is unlinked.
 * Any other open error, and any read error, is logged and ends in a panic.
 * NOTE(review): the fopen() call and the success/expired return statements
 * are not visible in this excerpt. */
225 robj
*dsGet(redisDb
*db
, robj
*key
, time_t *expire
) {
228 time_t expiretime
= -1; /* -1 means: no expire */
229 robj
*dskey
; /* Key as loaded from disk. */
234 dsKeyToPath(db
,buf
,key
);
236 if (fp
== NULL
&& errno
== ENOENT
) return NULL
; /* No such key */
238 redisLog(REDIS_WARNING
,"Disk store failed opening %s: %s",
239 buf
, strerror(errno
));
243 rdb
= rioInitWithFile(fp
);
244 if ((type
= rdbLoadType(&rdb
)) == -1) goto readerr
;
245 if (type
== REDIS_EXPIRETIME
) {
246 if ((expiretime
= rdbLoadTime(&rdb
)) == -1) goto readerr
;
247 /* We read the time so we need to read the object type again */
248 if ((type
= rdbLoadType(&rdb
)) == -1) goto readerr
;
251 if ((dskey
= rdbLoadStringObject(&rdb
)) == NULL
) goto readerr
;
253 if ((val
= rdbLoadObject(type
,&rdb
)) == NULL
) goto readerr
;
256 /* The key we asked, and the key returned, must be the same */
257 redisAssert(equalStringObjects(key
,dskey
));
259 /* Check if the key already expired */
261 if (expiretime
!= -1 && expiretime
< time(NULL
)) {
263 unlink(buf
); /* This failing is non critical here */
267 /* Everything ok... */
268 *expire
= expiretime
;
272 redisLog(REDIS_WARNING
,"Read error reading reading %s. Corrupted key?",
274 redisPanic("Unrecoverable error reading from disk store");
275 return NULL
; /* unreached */
/* Remove the file holding 'key' of database 'db' from the disk store.
 *
 * unlink(2) failing with ENOENT is handled separately from other errors
 * (that branch's body is not visible in this excerpt); any other failure
 * is logged and ends in a panic, so the REDIS_ERR return after it is never
 * reached. */
278 int dsDel(redisDb
*db
, robj
*key
) {
281 dsKeyToPath(db
,buf
,key
);
282 if (unlink(buf
) == -1) {
283 if (errno
== ENOENT
) {
286 redisLog(REDIS_WARNING
,"Disk store can't remove %s: %s",
287 buf
, strerror(errno
));
288 redisPanic("Unrecoverable Disk store errore. Existing.");
289 return REDIS_ERR
; /* unreached */
/* Return 1 if the file holding 'key' of database 'db' is readable by the
 * process, 0 otherwise.
 *
 * The check is access(path, R_OK), so every access(2) failure (not only a
 * missing file, but e.g. a permission error too) is reported as 0. */
296 int dsExists(redisDb
*db
, robj
*key
) {
299 dsKeyToPath(db
,buf
,key
);
300 return access(buf
,R_OK
) == 0;
/* Extract the database id from a disk store file name of the form
 * "<dbid>_<sha1 hex>": the id is the text that precedes the first '_'.
 *
 * Asserts that a '_' is present and that the prefix is shorter than 64
 * characters.
 * NOTE(review): the conversion of the prefix to an int and the return
 * statement are not visible in this excerpt. */
303 int dsGetDbidFromFilename(char *path
) {
305 char *p
= strchr(path
,'_');
/* NOTE(review): 'p - path' is evaluated before the NULL assertion below;
 * when '_' is missing this subtracts from a NULL pointer. The value is not
 * used if the assertion fires, but computing it first is fragile. */
306 int len
= (p
- path
);
308 redisAssert(p
!= NULL
&& len
< 64);
/* Remove the key files contained in the single directory 'path'.
 *
 * When 'dbid' is -1 every entry is removed, otherwise only the entries
 * whose file name carries that database id (see dsGetDbidFromFilename()).
 * Entries whose name starts with '.' are always skipped.
 * Failing to open the directory or to unlink an entry is logged and ends
 * in a panic. */
314 void dsFlushOneDir(char *path
, int dbid
) {
316 struct dirent
*dp
, de
;
320 redisLog(REDIS_WARNING
,"Disk store can't open dir %s: %s",
321 path
, strerror(errno
));
322 redisPanic("Unrecoverable Disk store errore. Existing.");
327 readdir_r(dir
,&de
,&dp
);
328 if (dp
== NULL
) break;
329 if (dp
->d_name
[0] == '.') continue;
331 /* Check if we need to remove this entry according to the DB number:
 * when a specific DB is being flushed, entries that belong to any other
 * DB must be left alone. The id has to be compared with 'dbid'; testing
 * it for truthiness would keep the files of every DB != 0 and delete the
 * files of DB 0 regardless of which DB was requested. */
333 if (dbid
!= -1 && dsGetDbidFromFilename(dp
->d_name
) != dbid
) continue;
335 /* Finally unlink the file */
336 snprintf(buf
,1024,"%s/%s",path
,dp
->d_name
);
337 if (unlink(buf
) == -1) {
338 redisLog(REDIS_WARNING
,
339 "Can't unlink %s: %s", buf
, strerror(errno
));
340 redisPanic("Unrecoverable Disk store errore. Existing.");
/* Flush database 'dbid' from the disk store by calling dsFlushOneDir() on
 * each of the 256x256 directories below server.ds_path.
 *
 * 'dbid' is passed through unchanged; dsFlushOneDir() treats -1 as "do not
 * filter by database". */
346 void dsFlushDb(int dbid
) {
350 redisLog(REDIS_NOTICE
,"Flushing diskstore DB (%d)",dbid
);
351 for (j
= 0; j
< 256; j
++) {
352 for (i
= 0; i
< 256; i
++) {
353 snprintf(buf
,1024,"%s/%02x/%02x",server
.ds_path
,j
,i
);
354 dsFlushOneDir(buf
,dbid
);
/* Set server.bgsavethread_state to 'state' while holding
 * server.bgsavethread_mutex, the same mutex dsRdbSave() takes to read the
 * state back. */
359 void dsRdbSaveSetState(int state
) {
360 pthread_mutex_lock(&server
.bgsavethread_mutex
);
361 server
.bgsavethread_state
= state
;
362 pthread_mutex_unlock(&server
.bgsavethread_mutex
);
/* Thread entry point that builds an .rdb file out of the disk store.
 *
 * 'arg' is the target file name as a char pointer (dsRdbSaveBackground()
 * passes a zstrdup()ed copy). The thread:
 *  1. sets the state to REDIS_BGSAVE_THREAD_ACTIVE,
 *  2. writes the "REDIS0001" header into temp-<pid>.rdb,
 *  3. scans the 256x256 directories, skipping entries starting with '.'
 *     and temp files (names containing '-'), emits a REDIS_SELECTDB opcode
 *     plus the db id when the id differs from last_dbid, and copies the
 *     raw content of every entry file into the .rdb,
 *  4. writes the REDIS_EOF opcode and rename(2)s the temp file over the
 *     target file name.
 * The final state is REDIS_BGSAVE_THREAD_DONE_OK or
 * REDIS_BGSAVE_THREAD_DONE_ERR.
 * NOTE(review): cleanup (fclose/closedir/free of 'arg'), the update of
 * last_dbid and the return statements are not visible in this excerpt. */
365 void *dsRdbSave_thread(void *arg
) {
366 char tmpfile
[256], *filename
= (char*)arg
;
367 struct dirent
*dp
, de
;
368 int j
, i
, last_dbid
= -1;
372 /* Change state to ACTIVE, to signal there is a saving thread working. */
373 redisLog(REDIS_NOTICE
,"Diskstore BGSAVE thread started");
374 dsRdbSaveSetState(REDIS_BGSAVE_THREAD_ACTIVE
);
376 snprintf(tmpfile
,256,"temp-%d.rdb", (int) getpid());
377 fp
= fopen(tmpfile
,"w");
379 redisLog(REDIS_WARNING
, "Failed opening .rdb for saving: %s",
381 dsRdbSaveSetState(REDIS_BGSAVE_THREAD_DONE_ERR
);
385 rdb
= rioInitWithFile(fp
);
386 if (rioWrite(&rdb
,"REDIS0001",9) == 0) goto werr
;
390 /* Scan all diskstore dirs looking for keys */
391 for (j
= 0; j
< 256; j
++) {
392 for (i
= 0; i
< 256; i
++) {
396 /* For every directory, collect all the keys */
397 snprintf(buf
,sizeof(buf
),"%s/%02x/%02x",server
.ds_path
,j
,i
);
398 if ((dir
= opendir(buf
)) == NULL
) {
399 redisLog(REDIS_WARNING
,"Disk store can't open dir %s: %s",
400 buf
, strerror(errno
));
/* NOTE(review): the return value of readdir_r() is ignored; only 'dp'
 * being NULL ends the scan, so a read error is treated like end of
 * directory. */
409 readdir_r(dir
,&de
,&dp
);
410 if (dp
== NULL
) break;
411 if (dp
->d_name
[0] == '.') continue;
412 /* If there is a '-' char in the file name, it's a temp file */
413 if (strchr(dp
->d_name
,'-') != NULL
) continue;
415 /* Emit the SELECT DB opcode if needed. */
416 dbid
= dsGetDbidFromFilename(dp
->d_name
);
417 if (dbid
!= last_dbid
) {
419 if (rdbSaveType(&rdb
,REDIS_SELECTDB
) == -1) goto werr
;
420 if (rdbSaveLen(&rdb
,dbid
) == -1) goto werr
;
423 /* Let's copy this file into the target .rdb */
424 snprintf(buf
,sizeof(buf
),"%s/%02x/%02x/%s",
425 server
.ds_path
,j
,i
,dp
->d_name
);
426 if ((entryfp
= fopen(buf
,"r")) == NULL
) {
427 redisLog(REDIS_WARNING
,"Can't open %s: %s",
428 buf
,strerror(errno
));
/* 'buf' is reused here as the copy buffer, so from this point on it no
 * longer contains the entry path. */
433 size_t nread
= fread(buf
,1,sizeof(buf
),entryfp
);
436 if (ferror(entryfp
)) {
437 redisLog(REDIS_WARNING
,"Error reading from file entry while performing BGSAVE for diskstore: %s", strerror(errno
));
444 if (rioWrite(&rdb
,buf
,nread
) == 0) {
455 /* Output the end of file opcode */
456 if (rdbSaveType(&rdb
,REDIS_EOF
) == -1) goto werr
;
458 /* Make sure data will not remain on the OS's output buffers */
464 /* Use RENAME to make sure the DB file is changed atomically only
465 * if the generate DB file is ok. */
466 if (rename(tmpfile
,filename
) == -1) {
467 redisLog(REDIS_WARNING
,"Error moving temp DB file on the final destination: %s (diskstore)", strerror(errno
));
469 dsRdbSaveSetState(REDIS_BGSAVE_THREAD_DONE_ERR
);
472 redisLog(REDIS_NOTICE
,"DB saved on disk by diskstore thread");
473 dsRdbSaveSetState(REDIS_BGSAVE_THREAD_DONE_OK
);
/* NOTE(review): the state is set before redisLog() formats
 * strerror(errno); an intervening call could change errno -- confirm the
 * logged error is still the write error. */
480 dsRdbSaveSetState(REDIS_BGSAVE_THREAD_DONE_ERR
);
481 redisLog(REDIS_WARNING
,"Write error saving DB on disk: %s", strerror(errno
));
/* Start dsRdbSave_thread() in a new thread to save the disk store into
 * the .rdb file 'filename'.
 *
 * The thread receives a zstrdup()ed copy of 'filename'. A pthread_create()
 * failure is logged at REDIS_WARNING level; on success the thread handle
 * is stored in server.bgsavethread. dsRdbSave() compares the result
 * against REDIS_ERR.
 * NOTE(review): who frees the duplicated file name (in particular when
 * pthread_create() fails) is not visible in this excerpt. */
485 int dsRdbSaveBackground(char *filename
) {
488 if (pthread_create(&thread
,NULL
,dsRdbSave_thread
,zstrdup(filename
)) != 0) {
489 redisLog(REDIS_WARNING
,"Can't create diskstore BGSAVE thread: %s",
493 server
.bgsavethread
= thread
;
498 int dsRdbSave(char *filename
) {
499 /* A blocking save is actually a non blocking save... just we wait
500 * for it to terminate in a non-busy loop. */
502 redisLog(REDIS_NOTICE
,"Starting a blocking SAVE (BGSAVE + blocking wait)");
503 server
.dirty_before_bgsave
= server
.dirty
;
504 if (dsRdbSaveBackground(filename
) == REDIS_ERR
) return REDIS_ERR
;
509 pthread_mutex_lock(&server
.bgsavethread_mutex
);
510 state
= server
.bgsavethread_state
;
511 pthread_mutex_unlock(&server
.bgsavethread_mutex
);
513 if (state
== REDIS_BGSAVE_THREAD_DONE_OK
||
514 state
== REDIS_BGSAVE_THREAD_DONE_ERR
) break;