]>
Commit | Line | Data |
---|---|---|
52970711 | 1 | /* diskstore.c implements a very simple disk backed key-value store used |
2 | * by Redis for the "disk" backend. This implementation uses the filesystem | |
3 | * to store key/value pairs. Every file represents a given key. | |
4 | * | |
5 | * The key path is calculated using the SHA1 of the key name. For instance | |
6 | * the key "foo" is stored as a file name called: | |
7 | * | |
8 | * /0b/ee/0beec7b5ea3f0fdbc95d0dd47f3c5bc275da8a33 | |
9 | * | |
10 | * The couples of characters from the hex output of SHA1 are also used | |
11 | * to locate two two levels of directories to store the file (as most | |
12 | * filesystems are not able to handle too many files in a single dir). | |
13 | * | |
14 | * In the end there are 65536 final directories (256 directories inside | |
15 | * every 256 top level directories), so that with 1 billion of files every | |
16 | * directory will contain in the average 15258 entires, that is ok with | |
17 | * most filesystems implementation. | |
18 | * | |
697af434 | 19 | * Note that since Redis supports multiple databases, the actual key name |
20 | * is: | |
21 | * | |
22 | * /0b/ee/<dbid>_0beec7b5ea3f0fdbc95d0dd47f3c5bc275da8a33 | |
23 | * | |
24 | * so for instance if the key is inside DB 0: | |
25 | * | |
26 | * /0b/ee/0_0beec7b5ea3f0fdbc95d0dd47f3c5bc275da8a33 | |
27 | * | |
28 | * The actaul implementation of this disk store is highly dependant to the | |
29 | * filesystem implementation itself. This implementation may be replaced by | |
52970711 | 30 | * a B+TREE implementation in future implementations. |
31 | * | |
32 | * Data ok every key is serialized using the same format used for .rdb | |
33 | * serialization. Everything is serialized on every entry: key name, | |
34 | * ttl information in case of keys with an associated expire time, and the | |
35 | * serialized value itself. | |
36 | * | |
37 | * Because the format is the same of the .rdb files it is trivial to create | |
38 | * an .rdb file starting from this format just by mean of scanning the | |
39 | * directories and concatenating entries, with the sole addition of an | |
40 | * .rdb header at the start and the end-of-db opcode at the end. | |
41 | * | |
42 | * ------------------------------------------------------------------------- | |
43 | * | |
44 | * Copyright (c) 2010-2011, Salvatore Sanfilippo <antirez at gmail dot com> | |
45 | * All rights reserved. | |
46 | * | |
47 | * Redistribution and use in source and binary forms, with or without | |
48 | * modification, are permitted provided that the following conditions are met: | |
49 | * | |
50 | * * Redistributions of source code must retain the above copyright notice, | |
51 | * this list of conditions and the following disclaimer. | |
52 | * * Redistributions in binary form must reproduce the above copyright | |
53 | * notice, this list of conditions and the following disclaimer in the | |
54 | * documentation and/or other materials provided with the distribution. | |
55 | * * Neither the name of Redis nor the names of its contributors may be used | |
56 | * to endorse or promote products derived from this software without | |
57 | * specific prior written permission. | |
58 | * | |
59 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |
60 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
61 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
62 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |
63 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |
64 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | |
65 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | |
66 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | |
67 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |
68 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | |
69 | * POSSIBILITY OF SUCH DAMAGE. | |
70 | */ | |
71 | ||
72 | #include "redis.h" | |
4ab98823 | 73 | #include "sha1.h" |
52970711 | 74 | |
75 | #include <fcntl.h> | |
76 | #include <sys/stat.h> | |
120b9ba8 | 77 | #include <dirent.h> |
52970711 | 78 | |
ddbc81af | 79 | int create256dir(char *prefix) { |
80 | char buf[1024]; | |
81 | int j; | |
82 | ||
83 | for (j = 0; j < 256; j++) { | |
84 | snprintf(buf,sizeof(buf),"%s%02x",prefix,j); | |
85 | if (mkdir(buf,0755) == -1) { | |
86 | redisLog(REDIS_WARNING,"Error creating dir %s for diskstore: %s", | |
87 | buf,strerror(errno)); | |
88 | return REDIS_ERR; | |
89 | } | |
90 | } | |
91 | return REDIS_OK; | |
92 | } | |
93 | ||
52970711 | 94 | int dsOpen(void) { |
95 | struct stat sb; | |
ddbc81af | 96 | int retval, j; |
697af434 | 97 | char *path = server.ds_path; |
ddbc81af | 98 | char buf[1024]; |
52970711 | 99 | |
100 | if ((retval = stat(path,&sb) == -1) && errno != ENOENT) { | |
101 | redisLog(REDIS_WARNING, "Error opening disk store at %s: %s", | |
102 | path, strerror(errno)); | |
103 | return REDIS_ERR; | |
104 | } | |
105 | ||
106 | /* Directory already in place. Assume everything is ok. */ | |
67b0b41c | 107 | if (retval == 0 && S_ISDIR(sb.st_mode)) { |
108 | redisLog(REDIS_NOTICE,"Disk store %s exists", path); | |
109 | return REDIS_OK; | |
110 | } | |
52970711 | 111 | |
112 | /* File exists but it's not a directory */ | |
113 | if (retval == 0 && !S_ISDIR(sb.st_mode)) { | |
114 | redisLog(REDIS_WARNING,"Disk store at %s is not a directory", path); | |
115 | return REDIS_ERR; | |
116 | } | |
117 | ||
118 | /* New disk store, create the directory structure now, as creating | |
119 | * them in a lazy way is not a good idea, after very few insertions | |
120 | * we'll need most of the 65536 directories anyway. */ | |
67b0b41c | 121 | redisLog(REDIS_NOTICE,"Disk store %s does not exist: creating", path); |
ddbc81af | 122 | if (mkdir(path,0755) == -1) { |
52970711 | 123 | redisLog(REDIS_WARNING,"Disk store init failed creating dir %s: %s", |
124 | path, strerror(errno)); | |
125 | return REDIS_ERR; | |
126 | } | |
ddbc81af | 127 | /* Create the top level 256 directories */ |
128 | snprintf(buf,sizeof(buf),"%s/",path); | |
129 | if (create256dir(buf) == REDIS_ERR) return REDIS_ERR; | |
130 | ||
131 | /* For every 256 top level dir, create 256 nested dirs */ | |
132 | for (j = 0; j < 256; j++) { | |
133 | snprintf(buf,sizeof(buf),"%s/%02x/",path,j); | |
134 | if (create256dir(buf) == REDIS_ERR) return REDIS_ERR; | |
135 | } | |
52970711 | 136 | return REDIS_OK; |
137 | } | |
138 | ||
139 | int dsClose(void) { | |
140 | return REDIS_OK; | |
141 | } | |
142 | ||
4ab98823 | 143 | /* Convert key into full path for this object. Dirty but hopefully |
144 | * is fast enough. */ | |
1fce3201 | 145 | void dsKeyToPath(redisDb *db, char *buf, robj *key) { |
4ab98823 | 146 | SHA1_CTX ctx; |
147 | unsigned char hash[20]; | |
1fce3201 | 148 | char hex[40], digits[] = "0123456789abcdef"; |
4ab98823 | 149 | int j, l; |
150 | ||
151 | SHA1Init(&ctx); | |
152 | SHA1Update(&ctx,key->ptr,sdslen(key->ptr)); | |
153 | SHA1Final(hash,&ctx); | |
154 | ||
155 | /* Convert the hash into hex format */ | |
156 | for (j = 0; j < 20; j++) { | |
157 | hex[j*2] = digits[(hash[j]&0xF0)>>4]; | |
158 | hex[(j*2)+1] = digits[hash[j]&0x0F]; | |
159 | } | |
160 | ||
161 | /* Create the object path. Start with server.ds_path that's the root dir */ | |
162 | l = sdslen(server.ds_path); | |
163 | memcpy(buf,server.ds_path,l); | |
164 | buf += l; | |
165 | *buf++ = '/'; | |
166 | ||
167 | /* Then add xx/yy/ that is the two level directories */ | |
168 | buf[0] = hex[0]; | |
169 | buf[1] = hex[1]; | |
170 | buf[2] = '/'; | |
171 | buf[3] = hex[2]; | |
172 | buf[4] = hex[3]; | |
173 | buf[5] = '/'; | |
174 | buf += 6; | |
175 | ||
176 | /* Add the database number followed by _ and finall the SHA1 hex */ | |
177 | l = ll2string(buf,64,db->id); | |
178 | buf += l; | |
179 | buf[0] = '_'; | |
180 | memcpy(buf+1,hex,40); | |
181 | buf[41] = '\0'; | |
182 | } | |
183 | ||
52970711 | 184 | int dsSet(redisDb *db, robj *key, robj *val) { |
4ab98823 | 185 | char buf[1024]; |
186 | FILE *fp; | |
187 | int retval; | |
188 | ||
1fce3201 | 189 | dsKeyToPath(db,buf,key); |
4ab98823 | 190 | fp = fopen(buf,"w"); |
191 | if ((retval = rdbSaveKeyValuePair(fp,db,key,val,time(NULL))) == -1) | |
192 | return REDIS_ERR; | |
193 | fclose(fp); | |
1fce3201 | 194 | if (retval == 0) unlink(buf); /* Expired key. Unlink failing not critical */ |
4ab98823 | 195 | return REDIS_OK; |
52970711 | 196 | } |
197 | ||
4ab98823 | 198 | robj *dsGet(redisDb *db, robj *key, time_t *expire) { |
1fce3201 | 199 | char buf[1024]; |
200 | int type; | |
201 | time_t expiretime = -1; /* -1 means: no expire */ | |
202 | robj *dskey; /* Key as loaded from disk. */ | |
203 | robj *val; | |
204 | FILE *fp; | |
205 | ||
206 | dsKeyToPath(db,buf,key); | |
207 | fp = fopen(buf,"r"); | |
208 | if (fp == NULL && errno == ENOENT) return NULL; /* No such key */ | |
209 | if (fp == NULL) { | |
210 | redisLog(REDIS_WARNING,"Disk store failed opening %s: %s", | |
211 | buf, strerror(errno)); | |
212 | goto readerr; | |
213 | } | |
214 | ||
215 | if ((type = rdbLoadType(fp)) == -1) goto readerr; | |
216 | if (type == REDIS_EXPIRETIME) { | |
217 | if ((expiretime = rdbLoadTime(fp)) == -1) goto readerr; | |
218 | /* We read the time so we need to read the object type again */ | |
219 | if ((type = rdbLoadType(fp)) == -1) goto readerr; | |
220 | } | |
221 | /* Read key */ | |
222 | if ((dskey = rdbLoadStringObject(fp)) == NULL) goto readerr; | |
223 | /* Read value */ | |
224 | if ((val = rdbLoadObject(type,fp)) == NULL) goto readerr; | |
225 | fclose(fp); | |
226 | ||
227 | /* The key we asked, and the key returned, must be the same */ | |
228 | redisAssert(equalStringObjects(key,dskey)); | |
229 | ||
230 | /* Check if the key already expired */ | |
231 | decrRefCount(dskey); | |
232 | if (expiretime != -1 && expiretime < time(NULL)) { | |
233 | decrRefCount(val); | |
234 | unlink(buf); /* This failing is non critical here */ | |
235 | return NULL; | |
236 | } | |
237 | ||
238 | /* Everything ok... */ | |
239 | *expire = expiretime; | |
240 | return val; | |
241 | ||
242 | readerr: | |
c4b64a13 | 243 | redisLog(REDIS_WARNING,"Read error reading reading %s. Corrupted key?", |
244 | buf); | |
1fce3201 | 245 | redisPanic("Unrecoverable error reading from disk store"); |
246 | return NULL; /* unreached */ | |
52970711 | 247 | } |
248 | ||
5ef64098 | 249 | int dsDel(redisDb *db, robj *key) { |
1fce3201 | 250 | char buf[1024]; |
251 | ||
252 | dsKeyToPath(db,buf,key); | |
253 | if (unlink(buf) == -1) { | |
254 | if (errno == ENOENT) { | |
255 | return REDIS_ERR; | |
256 | } else { | |
257 | redisLog(REDIS_WARNING,"Disk store can't remove %s: %s", | |
258 | buf, strerror(errno)); | |
259 | redisPanic("Unrecoverable Disk store errore. Existing."); | |
260 | return REDIS_ERR; /* unreached */ | |
261 | } | |
262 | } else { | |
263 | return REDIS_OK; | |
264 | } | |
5ef64098 | 265 | } |
266 | ||
52970711 | 267 | int dsExists(redisDb *db, robj *key) { |
31222292 | 268 | char buf[1024]; |
269 | ||
270 | dsKeyToPath(db,buf,key); | |
271 | return access(buf,R_OK) == 0; | |
52970711 | 272 | } |
cea8c5cd | 273 | |
120b9ba8 | 274 | void dsFlushOneDir(char *path, int dbid) { |
275 | DIR *dir; | |
276 | struct dirent *dp, de; | |
277 | ||
278 | dir = opendir(path); | |
279 | if (dir == NULL) { | |
280 | redisLog(REDIS_WARNING,"Disk store can't open dir %s: %s", | |
281 | path, strerror(errno)); | |
282 | redisPanic("Unrecoverable Disk store errore. Existing."); | |
283 | } | |
284 | while(1) { | |
0b305fcf | 285 | char buf[1024]; |
286 | ||
120b9ba8 | 287 | readdir_r(dir,&de,&dp); |
288 | if (dp == NULL) break; | |
289 | if (dp->d_name[0] == '.') continue; | |
290 | ||
291 | /* Check if we need to remove this entry accordingly to the | |
292 | * DB number */ | |
293 | if (dbid != -1) { | |
294 | char id[64]; | |
295 | char *p = strchr(dp->d_name,'_'); | |
296 | int len = (p - dp->d_name); | |
297 | ||
298 | redisAssert(p != NULL && len < 64); | |
299 | memcpy(id,dp->d_name,len); | |
300 | id[len] = '\0'; | |
301 | if (atoi(id) != dbid) continue; /* skip this file */ | |
302 | } | |
0b305fcf | 303 | |
304 | /* Finally unlink the file */ | |
305 | snprintf(buf,1024,"%s/%s",path,dp->d_name); | |
306 | if (unlink(buf) == -1) { | |
120b9ba8 | 307 | redisLog(REDIS_WARNING, |
0b305fcf | 308 | "Can't unlink %s: %s", buf, strerror(errno)); |
120b9ba8 | 309 | redisPanic("Unrecoverable Disk store errore. Existing."); |
310 | } | |
311 | } | |
312 | closedir(dir); | |
313 | } | |
314 | ||
315 | void dsFlushDb(int dbid) { | |
316 | char buf[1024]; | |
317 | int j, i; | |
318 | ||
319 | redisLog(REDIS_NOTICE,"Flushing diskstore DB (%d)",dbid); | |
320 | for (j = 0; j < 256; j++) { | |
321 | for (i = 0; i < 256; i++) { | |
322 | snprintf(buf,1024,"%s/%02x/%02x",server.ds_path,j,i); | |
323 | dsFlushOneDir(buf,dbid); | |
324 | } | |
325 | } | |
cea8c5cd | 326 | } |