1 /* diskstore.c implements a very simple disk backed key-value store used 
   2  * by Redis for the "disk" backend. This implementation uses the filesystem 
   3  * to store key/value pairs. Every file represents a given key. 
   5  * The key path is calculated using the SHA1 of the key name. For instance 
   6  * the key "foo" is stored as a file name called: 
   8  *  /0b/ee/0beec7b5ea3f0fdbc95d0dd47f3c5bc275da8a33 
  10  * The couples of characters from the hex output of SHA1 are also used 
  11  * to locate two two levels of directories to store the file (as most 
  12  * filesystems are not able to handle too many files in a single dir). 
  14  * In the end there are 65536 final directories (256 directories inside 
  15  * every 256 top level directories), so that with 1 billion of files every 
  16  * directory will contain in the average 15258 entires, that is ok with 
  17  * most filesystems implementation. 
  19  * Note that since Redis supports multiple databases, the actual key name 
  22  *  /0b/ee/<dbid>_0beec7b5ea3f0fdbc95d0dd47f3c5bc275da8a33 
  24  *  so for instance if the key is inside DB 0: 
  26  *  /0b/ee/0_0beec7b5ea3f0fdbc95d0dd47f3c5bc275da8a33 
  28  * The actaul implementation of this disk store is highly dependant to the 
  29  * filesystem implementation itself. This implementation may be replaced by 
  30  * a B+TREE implementation in future implementations. 
  32  * Data ok every key is serialized using the same format used for .rdb 
  33  * serialization. Everything is serialized on every entry: key name, 
  34  * ttl information in case of keys with an associated expire time, and the 
  35  * serialized value itself. 
  37  * Because the format is the same of the .rdb files it is trivial to create 
  38  * an .rdb file starting from this format just by mean of scanning the 
  39  * directories and concatenating entries, with the sole addition of an 
  40  * .rdb header at the start and the end-of-db opcode at the end. 
  42  * ------------------------------------------------------------------------- 
  44  * Copyright (c) 2010-2011, Salvatore Sanfilippo <antirez at gmail dot com> 
  45  * All rights reserved. 
  47  * Redistribution and use in source and binary forms, with or without 
  48  * modification, are permitted provided that the following conditions are met: 
  50  *   * Redistributions of source code must retain the above copyright notice, 
  51  *     this list of conditions and the following disclaimer. 
  52  *   * Redistributions in binary form must reproduce the above copyright 
  53  *     notice, this list of conditions and the following disclaimer in the 
  54  *     documentation and/or other materials provided with the distribution. 
  55  *   * Neither the name of Redis nor the names of its contributors may be used 
  56  *     to endorse or promote products derived from this software without 
  57  *     specific prior written permission. 
  59  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
  60  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
  61  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
  62  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
  63  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
  64  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
  65  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
  66  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
  67  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
  68  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
  69  * POSSIBILITY OF SUCH DAMAGE. 
  79 int create256dir(char *prefix
) { 
  83     for (j 
= 0; j 
< 256; j
++) { 
  84         snprintf(buf
,sizeof(buf
),"%s%02x",prefix
,j
); 
  85         if (mkdir(buf
,0755) == -1) { 
  86             redisLog(REDIS_WARNING
,"Error creating dir %s for diskstore: %s", 
  97     char *path 
= server
.ds_path
; 
 100     if ((retval 
= stat(path
,&sb
) == -1) && errno 
!= ENOENT
) { 
 101         redisLog(REDIS_WARNING
, "Error opening disk store at %s: %s", 
 102                 path
, strerror(errno
)); 
 106     /* Directory already in place. Assume everything is ok. */ 
 107     if (retval 
== 0 && S_ISDIR(sb
.st_mode
)) { 
 108         redisLog(REDIS_NOTICE
,"Disk store %s exists", path
); 
 112     /* File exists but it's not a directory */ 
 113     if (retval 
== 0 && !S_ISDIR(sb
.st_mode
)) { 
 114         redisLog(REDIS_WARNING
,"Disk store at %s is not a directory", path
); 
 118     /* New disk store, create the directory structure now, as creating 
 119      * them in a lazy way is not a good idea, after very few insertions 
 120      * we'll need most of the 65536 directories anyway. */ 
 121     redisLog(REDIS_NOTICE
,"Disk store %s does not exist: creating", path
); 
 122     if (mkdir(path
,0755) == -1) { 
 123         redisLog(REDIS_WARNING
,"Disk store init failed creating dir %s: %s", 
 124             path
, strerror(errno
)); 
 127     /* Create the top level 256 directories */ 
 128     snprintf(buf
,sizeof(buf
),"%s/",path
); 
 129     if (create256dir(buf
) == REDIS_ERR
) return REDIS_ERR
; 
 131     /* For every 256 top level dir, create 256 nested dirs */ 
 132     for (j 
= 0; j 
< 256; j
++) { 
 133         snprintf(buf
,sizeof(buf
),"%s/%02x/",path
,j
); 
 134         if (create256dir(buf
) == REDIS_ERR
) return REDIS_ERR
; 
 143 /* Convert key into full path for this object. Dirty but hopefully 
 144  * is fast enough. Returns the length of the returned path. */ 
 145 int dsKeyToPath(redisDb 
*db
, char *buf
, robj 
*key
) { 
 147     unsigned char hash
[20]; 
 148     char hex
[40], digits
[] = "0123456789abcdef"; 
 153     SHA1Update(&ctx
,key
->ptr
,sdslen(key
->ptr
)); 
 154     SHA1Final(hash
,&ctx
); 
 156     /* Convert the hash into hex format */ 
 157     for (j 
= 0; j 
< 20; j
++) { 
 158         hex
[j
*2] = digits
[(hash
[j
]&0xF0)>>4]; 
 159         hex
[(j
*2)+1] = digits
[hash
[j
]&0x0F]; 
 162     /* Create the object path. Start with server.ds_path that's the root dir */ 
 163     l 
= sdslen(server
.ds_path
); 
 164     memcpy(buf
,server
.ds_path
,l
); 
 168     /* Then add xx/yy/ that is the two level directories */ 
 177     /* Add the database number followed by _ and finall the SHA1 hex */ 
 178     l 
= ll2string(buf
,64,db
->id
); 
 181     memcpy(buf
+1,hex
,40); 
 183     return (buf
-origbuf
)+41; 
 186 int dsSet(redisDb 
*db
, robj 
*key
, robj 
*val
, time_t expire
) { 
 187     char buf
[1024], buf2
[1024]; 
 191     len 
= dsKeyToPath(db
,buf
,key
); 
 192     memcpy(buf2
,buf
,len
); 
 193     snprintf(buf2
+len
,sizeof(buf2
)-len
,"-%ld-%ld",(long)time(NULL
),(long)val
); 
 194     while ((fp 
= fopen(buf2
,"w")) == NULL
) { 
 195         if (errno 
== ENOSPC
) { 
 196             redisLog(REDIS_WARNING
,"Diskstore: No space left on device. Please make room and wait 30 seconds for Redis to continue."); 
 199             redisLog(REDIS_WARNING
,"diskstore error opening %s: %s", 
 200                 buf2
, strerror(errno
)); 
 201             redisPanic("Unrecoverable diskstore error. Exiting."); 
 204     if ((retval 
= rdbSaveKeyValuePair(fp
,key
,val
,expire
,time(NULL
))) == -1) 
 208         /* Expired key. Unlink failing not critical */ 
 212         /* Use rename for atomic updadte of value */ 
 213         if (rename(buf2
,buf
) == -1) { 
 214             redisLog(REDIS_WARNING
,"rename(2) returned an error: %s", 
 216             redisPanic("Unrecoverable diskstore error. Exiting."); 
 222 robj 
*dsGet(redisDb 
*db
, robj 
*key
, time_t *expire
) { 
 225     time_t expiretime 
= -1; /* -1 means: no expire */ 
 226     robj 
*dskey
; /* Key as loaded from disk. */ 
 230     dsKeyToPath(db
,buf
,key
); 
 232     if (fp 
== NULL 
&& errno 
== ENOENT
) return NULL
; /* No such key */ 
 234         redisLog(REDIS_WARNING
,"Disk store failed opening %s: %s", 
 235             buf
, strerror(errno
)); 
 239     if ((type 
= rdbLoadType(fp
)) == -1) goto readerr
; 
 240     if (type 
== REDIS_EXPIRETIME
) { 
 241         if ((expiretime 
= rdbLoadTime(fp
)) == -1) goto readerr
; 
 242         /* We read the time so we need to read the object type again */ 
 243         if ((type 
= rdbLoadType(fp
)) == -1) goto readerr
; 
 246     if ((dskey 
= rdbLoadStringObject(fp
)) == NULL
) goto readerr
; 
 248     if ((val 
= rdbLoadObject(type
,fp
)) == NULL
) goto readerr
; 
 251     /* The key we asked, and the key returned, must be the same */ 
 252     redisAssert(equalStringObjects(key
,dskey
)); 
 254     /* Check if the key already expired */ 
 256     if (expiretime 
!= -1 && expiretime 
< time(NULL
)) { 
 258         unlink(buf
); /* This failing is non critical here */ 
 262     /* Everything ok... */ 
 263     *expire 
= expiretime
; 
 267     redisLog(REDIS_WARNING
,"Read error reading reading %s. Corrupted key?", 
 269     redisPanic("Unrecoverable error reading from disk store"); 
 270     return NULL
; /* unreached */ 
 273 int dsDel(redisDb 
*db
, robj 
*key
) { 
 276     dsKeyToPath(db
,buf
,key
); 
 277     if (unlink(buf
) == -1) { 
 278         if (errno 
== ENOENT
) { 
 281             redisLog(REDIS_WARNING
,"Disk store can't remove %s: %s", 
 282                 buf
, strerror(errno
)); 
 283             redisPanic("Unrecoverable Disk store errore. Existing."); 
 284             return REDIS_ERR
; /* unreached */ 
 291 int dsExists(redisDb 
*db
, robj 
*key
) { 
 294     dsKeyToPath(db
,buf
,key
); 
 295     return access(buf
,R_OK
) == 0; 
 298 int dsGetDbidFromFilename(char *path
) { 
 300     char *p 
= strchr(path
,'_'); 
 301     int len 
= (p 
- path
); 
 303     redisAssert(p 
!= NULL 
&& len 
< 64); 
 309 void dsFlushOneDir(char *path
, int dbid
) { 
 311     struct dirent 
*dp
, de
; 
 315         redisLog(REDIS_WARNING
,"Disk store can't open dir %s: %s", 
 316             path
, strerror(errno
)); 
 317         redisPanic("Unrecoverable Disk store errore. Existing."); 
 322         readdir_r(dir
,&de
,&dp
); 
 323         if (dp 
== NULL
) break; 
 324         if (dp
->d_name
[0] == '.') continue; 
 326         /* Check if we need to remove this entry accordingly to the 
 328         if (dbid 
!= -1 && dsGetDbidFromFilename(dp
->d_name
)) continue; 
 330         /* Finally unlink the file */ 
 331         snprintf(buf
,1024,"%s/%s",path
,dp
->d_name
); 
 332         if (unlink(buf
) == -1) { 
 333             redisLog(REDIS_WARNING
, 
 334                 "Can't unlink %s: %s", buf
, strerror(errno
)); 
 335             redisPanic("Unrecoverable Disk store errore. Existing."); 
 341 void dsFlushDb(int dbid
) { 
 345     redisLog(REDIS_NOTICE
,"Flushing diskstore DB (%d)",dbid
); 
 346     for (j 
= 0; j 
< 256; j
++) { 
 347         for (i 
= 0; i 
< 256; i
++) { 
 348             snprintf(buf
,1024,"%s/%02x/%02x",server
.ds_path
,j
,i
); 
 349             dsFlushOneDir(buf
,dbid
); 
 354 void dsRdbSaveSetState(int state
) { 
 355     pthread_mutex_lock(&server
.bgsavethread_mutex
); 
 356     server
.bgsavethread_state 
= state
; 
 357     pthread_mutex_unlock(&server
.bgsavethread_mutex
); 
 360 void *dsRdbSave_thread(void *arg
) { 
 361     char tmpfile
[256], *filename 
= (char*)arg
; 
 362     struct dirent 
*dp
, de
; 
 363     int j
, i
, last_dbid 
= -1; 
 366     /* Change state to ACTIVE, to signal there is a saving thead working. */ 
 367     redisLog(REDIS_NOTICE
,"Diskstore BGSAVE thread started"); 
 368     dsRdbSaveSetState(REDIS_BGSAVE_THREAD_ACTIVE
); 
 370     snprintf(tmpfile
,256,"temp-%d.rdb", (int) getpid()); 
 371     fp 
= fopen(tmpfile
,"w"); 
 373         redisLog(REDIS_WARNING
, "Failed opening .rdb for saving: %s", 
 375         dsRdbSaveSetState(REDIS_BGSAVE_THREAD_DONE_ERR
); 
 378     if (fwrite("REDIS0001",9,1,fp
) == 0) goto werr
; 
 382     /* Scan all diskstore dirs looking for keys */ 
 383     for (j 
= 0; j 
< 256; j
++) { 
 384         for (i 
= 0; i 
< 256; i
++) { 
 388             /* For every directory, collect all the keys */ 
 389             snprintf(buf
,sizeof(buf
),"%s/%02x/%02x",server
.ds_path
,j
,i
); 
 390             if ((dir 
= opendir(buf
)) == NULL
) { 
 391                 redisLog(REDIS_WARNING
,"Disk store can't open dir %s: %s", 
 392                     buf
, strerror(errno
)); 
 401                 readdir_r(dir
,&de
,&dp
); 
 402                 if (dp 
== NULL
) break; 
 403                 if (dp
->d_name
[0] == '.') continue; 
 404                 /* If there is a '-' char in the file name, it's a temp file */ 
 405                 if (strchr(dp
->d_name
,'-') != NULL
) continue; 
 407                 /* Emit the SELECT DB opcode if needed. */ 
 408                 dbid 
= dsGetDbidFromFilename(dp
->d_name
); 
 409                 if (dbid 
!= last_dbid
) { 
 411                     if (rdbSaveType(fp
,REDIS_SELECTDB
) == -1) goto werr
; 
 412                     if (rdbSaveLen(fp
,dbid
) == -1) goto werr
; 
 415                 /* Let's copy this file into the target .rdb */ 
 416                 snprintf(buf
,sizeof(buf
),"%s/%02x/%02x/%s", 
 417                     server
.ds_path
,j
,i
,dp
->d_name
); 
 418                 if ((entryfp 
= fopen(buf
,"r")) == NULL
) { 
 419                     redisLog(REDIS_WARNING
,"Can't open %s: %s", 
 420                         buf
,strerror(errno
)); 
 425                     int nread 
= fread(buf
,1,sizeof(buf
),entryfp
); 
 428                         if (ferror(entryfp
)) { 
 429                             redisLog(REDIS_WARNING
,"Error reading from file entry while performing BGSAVE for diskstore: %s", strerror(errno
)); 
 436                     if (fwrite(buf
,1,nread
,fp
) != (unsigned)nread
) { 
 447     /* Output the end of file opcode */ 
 448     if (rdbSaveType(fp
,REDIS_EOF
) == -1) goto werr
; 
 450     /* Make sure data will not remain on the OS's output buffers */ 
 456     /* Use RENAME to make sure the DB file is changed atomically only 
 457      * if the generate DB file is ok. */ 
 458     if (rename(tmpfile
,filename
) == -1) { 
 459         redisLog(REDIS_WARNING
,"Error moving temp DB file on the final destination: %s (diskstore)", strerror(errno
)); 
 461         dsRdbSaveSetState(REDIS_BGSAVE_THREAD_DONE_ERR
); 
 464     redisLog(REDIS_NOTICE
,"DB saved on disk by diskstore thread"); 
 465     dsRdbSaveSetState(REDIS_BGSAVE_THREAD_DONE_OK
); 
 472     dsRdbSaveSetState(REDIS_BGSAVE_THREAD_DONE_ERR
); 
 473     redisLog(REDIS_WARNING
,"Write error saving DB on disk: %s", strerror(errno
)); 
 477 int dsRdbSaveBackground(char *filename
) { 
 480     if (pthread_create(&thread
,NULL
,dsRdbSave_thread
,zstrdup(filename
)) != 0) { 
 481         redisLog(REDIS_WARNING
,"Can't create diskstore BGSAVE thread: %s", 
 485         server
.bgsavethread 
= thread
; 
 490 int dsRdbSave(char *filename
) { 
 491     /* A blocking save is actually a non blocking save... just we wait 
 492      * for it to terminate in a non-busy loop. */ 
 494     redisLog(REDIS_NOTICE
,"Starting a blocking SAVE (BGSAVE + blocking wait)"); 
 495     server
.dirty_before_bgsave 
= server
.dirty
; 
 496     if (dsRdbSaveBackground(filename
) == REDIS_ERR
) return REDIS_ERR
; 
 501         pthread_mutex_lock(&server
.bgsavethread_mutex
); 
 502         state 
= server
.bgsavethread_state
; 
 503         pthread_mutex_unlock(&server
.bgsavethread_mutex
); 
 505         if (state 
== REDIS_BGSAVE_THREAD_DONE_OK 
|| 
 506             state 
== REDIS_BGSAVE_THREAD_DONE_ERR
) break;