]> git.saurik.com Git - redis.git/blame - src/diskstore.c
serious performance enhancement of diskstore
[redis.git] / src / diskstore.c
CommitLineData
52970711 1/* diskstore.c implements a very simple disk backed key-value store used
2 * by Redis for the "disk" backend. This implementation uses the filesystem
3 * to store key/value pairs. Every file represents a given key.
4 *
5 * The key path is calculated using the SHA1 of the key name. For instance
6 * the key "foo" is stored as a file name called:
7 *
8 * /0b/ee/0beec7b5ea3f0fdbc95d0dd47f3c5bc275da8a33
9 *
10 * The couples of characters from the hex output of SHA1 are also used
11 * to locate two two levels of directories to store the file (as most
12 * filesystems are not able to handle too many files in a single dir).
13 *
14 * In the end there are 65536 final directories (256 directories inside
15 * every 256 top level directories), so that with 1 billion of files every
16 * directory will contain in the average 15258 entires, that is ok with
17 * most filesystems implementation.
18 *
697af434 19 * Note that since Redis supports multiple databases, the actual key name
20 * is:
21 *
22 * /0b/ee/<dbid>_0beec7b5ea3f0fdbc95d0dd47f3c5bc275da8a33
23 *
24 * so for instance if the key is inside DB 0:
25 *
26 * /0b/ee/0_0beec7b5ea3f0fdbc95d0dd47f3c5bc275da8a33
27 *
28 * The actaul implementation of this disk store is highly dependant to the
29 * filesystem implementation itself. This implementation may be replaced by
52970711 30 * a B+TREE implementation in future implementations.
31 *
32 * Data ok every key is serialized using the same format used for .rdb
33 * serialization. Everything is serialized on every entry: key name,
34 * ttl information in case of keys with an associated expire time, and the
35 * serialized value itself.
36 *
37 * Because the format is the same of the .rdb files it is trivial to create
38 * an .rdb file starting from this format just by mean of scanning the
39 * directories and concatenating entries, with the sole addition of an
40 * .rdb header at the start and the end-of-db opcode at the end.
41 *
42 * -------------------------------------------------------------------------
43 *
44 * Copyright (c) 2010-2011, Salvatore Sanfilippo <antirez at gmail dot com>
45 * All rights reserved.
46 *
47 * Redistribution and use in source and binary forms, with or without
48 * modification, are permitted provided that the following conditions are met:
49 *
50 * * Redistributions of source code must retain the above copyright notice,
51 * this list of conditions and the following disclaimer.
52 * * Redistributions in binary form must reproduce the above copyright
53 * notice, this list of conditions and the following disclaimer in the
54 * documentation and/or other materials provided with the distribution.
55 * * Neither the name of Redis nor the names of its contributors may be used
56 * to endorse or promote products derived from this software without
57 * specific prior written permission.
58 *
59 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
60 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
61 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
62 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
63 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
64 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
65 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
66 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
67 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
68 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
69 * POSSIBILITY OF SUCH DAMAGE.
70 */
71
72#include "redis.h"
4ab98823 73#include "sha1.h"
52970711 74
75#include <fcntl.h>
76#include <sys/stat.h>
120b9ba8 77#include <dirent.h>
52970711 78
ddbc81af 79int create256dir(char *prefix) {
80 char buf[1024];
81 int j;
82
83 for (j = 0; j < 256; j++) {
84 snprintf(buf,sizeof(buf),"%s%02x",prefix,j);
85 if (mkdir(buf,0755) == -1) {
86 redisLog(REDIS_WARNING,"Error creating dir %s for diskstore: %s",
87 buf,strerror(errno));
88 return REDIS_ERR;
89 }
90 }
91 return REDIS_OK;
92}
93
52970711 94int dsOpen(void) {
95 struct stat sb;
ddbc81af 96 int retval, j;
697af434 97 char *path = server.ds_path;
ddbc81af 98 char buf[1024];
52970711 99
100 if ((retval = stat(path,&sb) == -1) && errno != ENOENT) {
101 redisLog(REDIS_WARNING, "Error opening disk store at %s: %s",
102 path, strerror(errno));
103 return REDIS_ERR;
104 }
105
106 /* Directory already in place. Assume everything is ok. */
67b0b41c 107 if (retval == 0 && S_ISDIR(sb.st_mode)) {
108 redisLog(REDIS_NOTICE,"Disk store %s exists", path);
109 return REDIS_OK;
110 }
52970711 111
112 /* File exists but it's not a directory */
113 if (retval == 0 && !S_ISDIR(sb.st_mode)) {
114 redisLog(REDIS_WARNING,"Disk store at %s is not a directory", path);
115 return REDIS_ERR;
116 }
117
118 /* New disk store, create the directory structure now, as creating
119 * them in a lazy way is not a good idea, after very few insertions
120 * we'll need most of the 65536 directories anyway. */
67b0b41c 121 redisLog(REDIS_NOTICE,"Disk store %s does not exist: creating", path);
ddbc81af 122 if (mkdir(path,0755) == -1) {
52970711 123 redisLog(REDIS_WARNING,"Disk store init failed creating dir %s: %s",
124 path, strerror(errno));
125 return REDIS_ERR;
126 }
ddbc81af 127 /* Create the top level 256 directories */
128 snprintf(buf,sizeof(buf),"%s/",path);
129 if (create256dir(buf) == REDIS_ERR) return REDIS_ERR;
130
131 /* For every 256 top level dir, create 256 nested dirs */
132 for (j = 0; j < 256; j++) {
133 snprintf(buf,sizeof(buf),"%s/%02x/",path,j);
134 if (create256dir(buf) == REDIS_ERR) return REDIS_ERR;
135 }
52970711 136 return REDIS_OK;
137}
138
139int dsClose(void) {
140 return REDIS_OK;
141}
142
4ab98823 143/* Convert key into full path for this object. Dirty but hopefully
144 * is fast enough. */
1fce3201 145void dsKeyToPath(redisDb *db, char *buf, robj *key) {
4ab98823 146 SHA1_CTX ctx;
147 unsigned char hash[20];
1fce3201 148 char hex[40], digits[] = "0123456789abcdef";
4ab98823 149 int j, l;
150
151 SHA1Init(&ctx);
152 SHA1Update(&ctx,key->ptr,sdslen(key->ptr));
153 SHA1Final(hash,&ctx);
154
155 /* Convert the hash into hex format */
156 for (j = 0; j < 20; j++) {
157 hex[j*2] = digits[(hash[j]&0xF0)>>4];
158 hex[(j*2)+1] = digits[hash[j]&0x0F];
159 }
160
161 /* Create the object path. Start with server.ds_path that's the root dir */
162 l = sdslen(server.ds_path);
163 memcpy(buf,server.ds_path,l);
164 buf += l;
165 *buf++ = '/';
166
167 /* Then add xx/yy/ that is the two level directories */
168 buf[0] = hex[0];
169 buf[1] = hex[1];
170 buf[2] = '/';
171 buf[3] = hex[2];
172 buf[4] = hex[3];
173 buf[5] = '/';
174 buf += 6;
175
176 /* Add the database number followed by _ and finall the SHA1 hex */
177 l = ll2string(buf,64,db->id);
178 buf += l;
179 buf[0] = '_';
180 memcpy(buf+1,hex,40);
181 buf[41] = '\0';
182}
183
52970711 184int dsSet(redisDb *db, robj *key, robj *val) {
4ab98823 185 char buf[1024];
186 FILE *fp;
187 int retval;
188
1fce3201 189 dsKeyToPath(db,buf,key);
4ab98823 190 fp = fopen(buf,"w");
191 if ((retval = rdbSaveKeyValuePair(fp,db,key,val,time(NULL))) == -1)
192 return REDIS_ERR;
193 fclose(fp);
1fce3201 194 if (retval == 0) unlink(buf); /* Expired key. Unlink failing not critical */
4ab98823 195 return REDIS_OK;
52970711 196}
197
4ab98823 198robj *dsGet(redisDb *db, robj *key, time_t *expire) {
1fce3201 199 char buf[1024];
200 int type;
201 time_t expiretime = -1; /* -1 means: no expire */
202 robj *dskey; /* Key as loaded from disk. */
203 robj *val;
204 FILE *fp;
205
206 dsKeyToPath(db,buf,key);
207 fp = fopen(buf,"r");
208 if (fp == NULL && errno == ENOENT) return NULL; /* No such key */
209 if (fp == NULL) {
210 redisLog(REDIS_WARNING,"Disk store failed opening %s: %s",
211 buf, strerror(errno));
212 goto readerr;
213 }
214
215 if ((type = rdbLoadType(fp)) == -1) goto readerr;
216 if (type == REDIS_EXPIRETIME) {
217 if ((expiretime = rdbLoadTime(fp)) == -1) goto readerr;
218 /* We read the time so we need to read the object type again */
219 if ((type = rdbLoadType(fp)) == -1) goto readerr;
220 }
221 /* Read key */
222 if ((dskey = rdbLoadStringObject(fp)) == NULL) goto readerr;
223 /* Read value */
224 if ((val = rdbLoadObject(type,fp)) == NULL) goto readerr;
225 fclose(fp);
226
227 /* The key we asked, and the key returned, must be the same */
228 redisAssert(equalStringObjects(key,dskey));
229
230 /* Check if the key already expired */
231 decrRefCount(dskey);
232 if (expiretime != -1 && expiretime < time(NULL)) {
233 decrRefCount(val);
234 unlink(buf); /* This failing is non critical here */
235 return NULL;
236 }
237
238 /* Everything ok... */
239 *expire = expiretime;
240 return val;
241
242readerr:
c4b64a13 243 redisLog(REDIS_WARNING,"Read error reading reading %s. Corrupted key?",
244 buf);
1fce3201 245 redisPanic("Unrecoverable error reading from disk store");
246 return NULL; /* unreached */
52970711 247}
248
5ef64098 249int dsDel(redisDb *db, robj *key) {
1fce3201 250 char buf[1024];
251
252 dsKeyToPath(db,buf,key);
253 if (unlink(buf) == -1) {
254 if (errno == ENOENT) {
255 return REDIS_ERR;
256 } else {
257 redisLog(REDIS_WARNING,"Disk store can't remove %s: %s",
258 buf, strerror(errno));
259 redisPanic("Unrecoverable Disk store errore. Existing.");
260 return REDIS_ERR; /* unreached */
261 }
262 } else {
263 return REDIS_OK;
264 }
5ef64098 265}
266
52970711 267int dsExists(redisDb *db, robj *key) {
31222292 268 char buf[1024];
269
270 dsKeyToPath(db,buf,key);
271 return access(buf,R_OK) == 0;
52970711 272}
cea8c5cd 273
120b9ba8 274void dsFlushOneDir(char *path, int dbid) {
275 DIR *dir;
276 struct dirent *dp, de;
277
278 dir = opendir(path);
279 if (dir == NULL) {
280 redisLog(REDIS_WARNING,"Disk store can't open dir %s: %s",
281 path, strerror(errno));
282 redisPanic("Unrecoverable Disk store errore. Existing.");
283 }
284 while(1) {
0b305fcf 285 char buf[1024];
286
120b9ba8 287 readdir_r(dir,&de,&dp);
288 if (dp == NULL) break;
289 if (dp->d_name[0] == '.') continue;
290
291 /* Check if we need to remove this entry accordingly to the
292 * DB number */
293 if (dbid != -1) {
294 char id[64];
295 char *p = strchr(dp->d_name,'_');
296 int len = (p - dp->d_name);
297
298 redisAssert(p != NULL && len < 64);
299 memcpy(id,dp->d_name,len);
300 id[len] = '\0';
301 if (atoi(id) != dbid) continue; /* skip this file */
302 }
0b305fcf 303
304 /* Finally unlink the file */
305 snprintf(buf,1024,"%s/%s",path,dp->d_name);
306 if (unlink(buf) == -1) {
120b9ba8 307 redisLog(REDIS_WARNING,
0b305fcf 308 "Can't unlink %s: %s", buf, strerror(errno));
120b9ba8 309 redisPanic("Unrecoverable Disk store errore. Existing.");
310 }
311 }
312 closedir(dir);
313}
314
315void dsFlushDb(int dbid) {
316 char buf[1024];
317 int j, i;
318
319 redisLog(REDIS_NOTICE,"Flushing diskstore DB (%d)",dbid);
320 for (j = 0; j < 256; j++) {
321 for (i = 0; i < 256; i++) {
322 snprintf(buf,1024,"%s/%02x/%02x",server.ds_path,j,i);
323 dsFlushOneDir(buf,dbid);
324 }
325 }
cea8c5cd 326}