]>
Commit | Line | Data |
---|---|---|
52970711 | 1 | /* diskstore.c implements a very simple disk backed key-value store used |
2 | * by Redis for the "disk" backend. This implementation uses the filesystem | |
3 | * to store key/value pairs. Every file represents a given key. | |
4 | * | |
5 | * The key path is calculated using the SHA1 of the key name. For instance | |
6 | * the key "foo" is stored as a file name called: | |
7 | * | |
8 | * /0b/ee/0beec7b5ea3f0fdbc95d0dd47f3c5bc275da8a33 | |
9 | * | |
10 | * The couples of characters from the hex output of SHA1 are also used | |
11 | * to locate two two levels of directories to store the file (as most | |
12 | * filesystems are not able to handle too many files in a single dir). | |
13 | * | |
14 | * In the end there are 65536 final directories (256 directories inside | |
15 | * every 256 top level directories), so that with 1 billion of files every | |
16 | * directory will contain in the average 15258 entires, that is ok with | |
17 | * most filesystems implementation. | |
18 | * | |
697af434 | 19 | * Note that since Redis supports multiple databases, the actual key name |
20 | * is: | |
21 | * | |
22 | * /0b/ee/<dbid>_0beec7b5ea3f0fdbc95d0dd47f3c5bc275da8a33 | |
23 | * | |
24 | * so for instance if the key is inside DB 0: | |
25 | * | |
26 | * /0b/ee/0_0beec7b5ea3f0fdbc95d0dd47f3c5bc275da8a33 | |
27 | * | |
28 | * The actaul implementation of this disk store is highly dependant to the | |
29 | * filesystem implementation itself. This implementation may be replaced by | |
52970711 | 30 | * a B+TREE implementation in future implementations. |
31 | * | |
32 | * Data ok every key is serialized using the same format used for .rdb | |
33 | * serialization. Everything is serialized on every entry: key name, | |
34 | * ttl information in case of keys with an associated expire time, and the | |
35 | * serialized value itself. | |
36 | * | |
37 | * Because the format is the same of the .rdb files it is trivial to create | |
38 | * an .rdb file starting from this format just by mean of scanning the | |
39 | * directories and concatenating entries, with the sole addition of an | |
40 | * .rdb header at the start and the end-of-db opcode at the end. | |
41 | * | |
42 | * ------------------------------------------------------------------------- | |
43 | * | |
44 | * Copyright (c) 2010-2011, Salvatore Sanfilippo <antirez at gmail dot com> | |
45 | * All rights reserved. | |
46 | * | |
47 | * Redistribution and use in source and binary forms, with or without | |
48 | * modification, are permitted provided that the following conditions are met: | |
49 | * | |
50 | * * Redistributions of source code must retain the above copyright notice, | |
51 | * this list of conditions and the following disclaimer. | |
52 | * * Redistributions in binary form must reproduce the above copyright | |
53 | * notice, this list of conditions and the following disclaimer in the | |
54 | * documentation and/or other materials provided with the distribution. | |
55 | * * Neither the name of Redis nor the names of its contributors may be used | |
56 | * to endorse or promote products derived from this software without | |
57 | * specific prior written permission. | |
58 | * | |
59 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |
60 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
61 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
62 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |
63 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |
64 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | |
65 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | |
66 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | |
67 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |
68 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | |
69 | * POSSIBILITY OF SUCH DAMAGE. | |
70 | */ | |
71 | ||
72 | #include "redis.h" | |
4ab98823 | 73 | #include "sha1.h" |
52970711 | 74 | |
75 | #include <fcntl.h> | |
76 | #include <sys/stat.h> | |
120b9ba8 | 77 | #include <dirent.h> |
52970711 | 78 | |
ddbc81af | 79 | int create256dir(char *prefix) { |
80 | char buf[1024]; | |
81 | int j; | |
82 | ||
83 | for (j = 0; j < 256; j++) { | |
84 | snprintf(buf,sizeof(buf),"%s%02x",prefix,j); | |
85 | if (mkdir(buf,0755) == -1) { | |
86 | redisLog(REDIS_WARNING,"Error creating dir %s for diskstore: %s", | |
87 | buf,strerror(errno)); | |
88 | return REDIS_ERR; | |
89 | } | |
90 | } | |
91 | return REDIS_OK; | |
92 | } | |
93 | ||
52970711 | 94 | int dsOpen(void) { |
95 | struct stat sb; | |
ddbc81af | 96 | int retval, j; |
697af434 | 97 | char *path = server.ds_path; |
ddbc81af | 98 | char buf[1024]; |
52970711 | 99 | |
100 | if ((retval = stat(path,&sb) == -1) && errno != ENOENT) { | |
101 | redisLog(REDIS_WARNING, "Error opening disk store at %s: %s", | |
102 | path, strerror(errno)); | |
103 | return REDIS_ERR; | |
104 | } | |
105 | ||
106 | /* Directory already in place. Assume everything is ok. */ | |
67b0b41c | 107 | if (retval == 0 && S_ISDIR(sb.st_mode)) { |
108 | redisLog(REDIS_NOTICE,"Disk store %s exists", path); | |
109 | return REDIS_OK; | |
110 | } | |
52970711 | 111 | |
112 | /* File exists but it's not a directory */ | |
113 | if (retval == 0 && !S_ISDIR(sb.st_mode)) { | |
114 | redisLog(REDIS_WARNING,"Disk store at %s is not a directory", path); | |
115 | return REDIS_ERR; | |
116 | } | |
117 | ||
118 | /* New disk store, create the directory structure now, as creating | |
119 | * them in a lazy way is not a good idea, after very few insertions | |
120 | * we'll need most of the 65536 directories anyway. */ | |
67b0b41c | 121 | redisLog(REDIS_NOTICE,"Disk store %s does not exist: creating", path); |
ddbc81af | 122 | if (mkdir(path,0755) == -1) { |
52970711 | 123 | redisLog(REDIS_WARNING,"Disk store init failed creating dir %s: %s", |
124 | path, strerror(errno)); | |
125 | return REDIS_ERR; | |
126 | } | |
ddbc81af | 127 | /* Create the top level 256 directories */ |
128 | snprintf(buf,sizeof(buf),"%s/",path); | |
129 | if (create256dir(buf) == REDIS_ERR) return REDIS_ERR; | |
130 | ||
131 | /* For every 256 top level dir, create 256 nested dirs */ | |
132 | for (j = 0; j < 256; j++) { | |
133 | snprintf(buf,sizeof(buf),"%s/%02x/",path,j); | |
134 | if (create256dir(buf) == REDIS_ERR) return REDIS_ERR; | |
135 | } | |
52970711 | 136 | return REDIS_OK; |
137 | } | |
138 | ||
139 | int dsClose(void) { | |
140 | return REDIS_OK; | |
141 | } | |
142 | ||
4ab98823 | 143 | /* Convert key into full path for this object. Dirty but hopefully |
779fa2af | 144 | * is fast enough. Returns the length of the returned path. */ |
145 | int dsKeyToPath(redisDb *db, char *buf, robj *key) { | |
4ab98823 | 146 | SHA1_CTX ctx; |
147 | unsigned char hash[20]; | |
1fce3201 | 148 | char hex[40], digits[] = "0123456789abcdef"; |
4ab98823 | 149 | int j, l; |
779fa2af | 150 | char *origbuf = buf; |
4ab98823 | 151 | |
152 | SHA1Init(&ctx); | |
153 | SHA1Update(&ctx,key->ptr,sdslen(key->ptr)); | |
154 | SHA1Final(hash,&ctx); | |
155 | ||
156 | /* Convert the hash into hex format */ | |
157 | for (j = 0; j < 20; j++) { | |
158 | hex[j*2] = digits[(hash[j]&0xF0)>>4]; | |
159 | hex[(j*2)+1] = digits[hash[j]&0x0F]; | |
160 | } | |
161 | ||
162 | /* Create the object path. Start with server.ds_path that's the root dir */ | |
163 | l = sdslen(server.ds_path); | |
164 | memcpy(buf,server.ds_path,l); | |
165 | buf += l; | |
166 | *buf++ = '/'; | |
167 | ||
168 | /* Then add xx/yy/ that is the two level directories */ | |
169 | buf[0] = hex[0]; | |
170 | buf[1] = hex[1]; | |
171 | buf[2] = '/'; | |
172 | buf[3] = hex[2]; | |
173 | buf[4] = hex[3]; | |
174 | buf[5] = '/'; | |
175 | buf += 6; | |
176 | ||
177 | /* Add the database number followed by _ and finall the SHA1 hex */ | |
178 | l = ll2string(buf,64,db->id); | |
179 | buf += l; | |
180 | buf[0] = '_'; | |
181 | memcpy(buf+1,hex,40); | |
182 | buf[41] = '\0'; | |
779fa2af | 183 | return (buf-origbuf)+41; |
4ab98823 | 184 | } |
185 | ||
52970711 | 186 | int dsSet(redisDb *db, robj *key, robj *val) { |
779fa2af | 187 | char buf[1024], buf2[1024]; |
4ab98823 | 188 | FILE *fp; |
779fa2af | 189 | int retval, len; |
4ab98823 | 190 | |
779fa2af | 191 | len = dsKeyToPath(db,buf,key); |
192 | memcpy(buf2,buf,len); | |
418d5eaf | 193 | snprintf(buf2+len,sizeof(buf2)-len,"_%ld_%ld",(long)time(NULL),(long)val); |
1190c6cb | 194 | while ((fp = fopen(buf2,"w")) == NULL) { |
195 | if (errno == ENOSPC) { | |
196 | redisLog(REDIS_WARNING,"Diskstore: No space left on device. Please make room and wait 30 seconds for Redis to continue."); | |
197 | sleep(30); | |
198 | } else { | |
199 | redisLog(REDIS_WARNING,"diskstore error opening %s: %s", | |
200 | buf2, strerror(errno)); | |
201 | redisPanic("Unrecoverable diskstore error. Exiting."); | |
202 | } | |
203 | } | |
4ab98823 | 204 | if ((retval = rdbSaveKeyValuePair(fp,db,key,val,time(NULL))) == -1) |
205 | return REDIS_ERR; | |
206 | fclose(fp); | |
779fa2af | 207 | if (retval == 0) { |
208 | /* Expired key. Unlink failing not critical */ | |
209 | unlink(buf); | |
210 | unlink(buf2); | |
211 | } else { | |
212 | /* Use rename for atomic updadte of value */ | |
213 | if (rename(buf2,buf) == -1) { | |
214 | redisLog(REDIS_WARNING,"rename(2) returned an error: %s", | |
215 | strerror(errno)); | |
216 | redisPanic("Unrecoverable diskstore error. Exiting."); | |
217 | } | |
218 | } | |
4ab98823 | 219 | return REDIS_OK; |
52970711 | 220 | } |
221 | ||
4ab98823 | 222 | robj *dsGet(redisDb *db, robj *key, time_t *expire) { |
1fce3201 | 223 | char buf[1024]; |
224 | int type; | |
225 | time_t expiretime = -1; /* -1 means: no expire */ | |
226 | robj *dskey; /* Key as loaded from disk. */ | |
227 | robj *val; | |
228 | FILE *fp; | |
229 | ||
230 | dsKeyToPath(db,buf,key); | |
231 | fp = fopen(buf,"r"); | |
232 | if (fp == NULL && errno == ENOENT) return NULL; /* No such key */ | |
233 | if (fp == NULL) { | |
234 | redisLog(REDIS_WARNING,"Disk store failed opening %s: %s", | |
235 | buf, strerror(errno)); | |
236 | goto readerr; | |
237 | } | |
238 | ||
239 | if ((type = rdbLoadType(fp)) == -1) goto readerr; | |
240 | if (type == REDIS_EXPIRETIME) { | |
241 | if ((expiretime = rdbLoadTime(fp)) == -1) goto readerr; | |
242 | /* We read the time so we need to read the object type again */ | |
243 | if ((type = rdbLoadType(fp)) == -1) goto readerr; | |
244 | } | |
245 | /* Read key */ | |
246 | if ((dskey = rdbLoadStringObject(fp)) == NULL) goto readerr; | |
247 | /* Read value */ | |
248 | if ((val = rdbLoadObject(type,fp)) == NULL) goto readerr; | |
249 | fclose(fp); | |
250 | ||
251 | /* The key we asked, and the key returned, must be the same */ | |
252 | redisAssert(equalStringObjects(key,dskey)); | |
253 | ||
254 | /* Check if the key already expired */ | |
255 | decrRefCount(dskey); | |
256 | if (expiretime != -1 && expiretime < time(NULL)) { | |
257 | decrRefCount(val); | |
258 | unlink(buf); /* This failing is non critical here */ | |
259 | return NULL; | |
260 | } | |
261 | ||
262 | /* Everything ok... */ | |
263 | *expire = expiretime; | |
264 | return val; | |
265 | ||
266 | readerr: | |
c4b64a13 | 267 | redisLog(REDIS_WARNING,"Read error reading reading %s. Corrupted key?", |
268 | buf); | |
1fce3201 | 269 | redisPanic("Unrecoverable error reading from disk store"); |
270 | return NULL; /* unreached */ | |
52970711 | 271 | } |
272 | ||
5ef64098 | 273 | int dsDel(redisDb *db, robj *key) { |
1fce3201 | 274 | char buf[1024]; |
275 | ||
276 | dsKeyToPath(db,buf,key); | |
277 | if (unlink(buf) == -1) { | |
278 | if (errno == ENOENT) { | |
279 | return REDIS_ERR; | |
280 | } else { | |
281 | redisLog(REDIS_WARNING,"Disk store can't remove %s: %s", | |
282 | buf, strerror(errno)); | |
283 | redisPanic("Unrecoverable Disk store errore. Existing."); | |
284 | return REDIS_ERR; /* unreached */ | |
285 | } | |
286 | } else { | |
287 | return REDIS_OK; | |
288 | } | |
5ef64098 | 289 | } |
290 | ||
52970711 | 291 | int dsExists(redisDb *db, robj *key) { |
31222292 | 292 | char buf[1024]; |
293 | ||
294 | dsKeyToPath(db,buf,key); | |
295 | return access(buf,R_OK) == 0; | |
52970711 | 296 | } |
cea8c5cd | 297 | |
120b9ba8 | 298 | void dsFlushOneDir(char *path, int dbid) { |
299 | DIR *dir; | |
300 | struct dirent *dp, de; | |
301 | ||
302 | dir = opendir(path); | |
303 | if (dir == NULL) { | |
304 | redisLog(REDIS_WARNING,"Disk store can't open dir %s: %s", | |
305 | path, strerror(errno)); | |
306 | redisPanic("Unrecoverable Disk store errore. Existing."); | |
307 | } | |
308 | while(1) { | |
0b305fcf | 309 | char buf[1024]; |
310 | ||
120b9ba8 | 311 | readdir_r(dir,&de,&dp); |
312 | if (dp == NULL) break; | |
313 | if (dp->d_name[0] == '.') continue; | |
314 | ||
315 | /* Check if we need to remove this entry accordingly to the | |
316 | * DB number */ | |
317 | if (dbid != -1) { | |
318 | char id[64]; | |
319 | char *p = strchr(dp->d_name,'_'); | |
320 | int len = (p - dp->d_name); | |
321 | ||
322 | redisAssert(p != NULL && len < 64); | |
323 | memcpy(id,dp->d_name,len); | |
324 | id[len] = '\0'; | |
325 | if (atoi(id) != dbid) continue; /* skip this file */ | |
326 | } | |
0b305fcf | 327 | |
328 | /* Finally unlink the file */ | |
329 | snprintf(buf,1024,"%s/%s",path,dp->d_name); | |
330 | if (unlink(buf) == -1) { | |
120b9ba8 | 331 | redisLog(REDIS_WARNING, |
0b305fcf | 332 | "Can't unlink %s: %s", buf, strerror(errno)); |
120b9ba8 | 333 | redisPanic("Unrecoverable Disk store errore. Existing."); |
334 | } | |
335 | } | |
336 | closedir(dir); | |
337 | } | |
338 | ||
339 | void dsFlushDb(int dbid) { | |
340 | char buf[1024]; | |
341 | int j, i; | |
342 | ||
343 | redisLog(REDIS_NOTICE,"Flushing diskstore DB (%d)",dbid); | |
344 | for (j = 0; j < 256; j++) { | |
345 | for (i = 0; i < 256; i++) { | |
346 | snprintf(buf,1024,"%s/%02x/%02x",server.ds_path,j,i); | |
347 | dsFlushOneDir(buf,dbid); | |
348 | } | |
349 | } | |
cea8c5cd | 350 | } |