]>
Commit | Line | Data |
---|---|---|
52970711 | 1 | /* diskstore.c implements a very simple disk backed key-value store used |
2 | * by Redis for the "disk" backend. This implementation uses the filesystem | |
3 | * to store key/value pairs. Every file represents a given key. | |
4 | * | |
5 | * The key path is calculated using the SHA1 of the key name. For instance | |
6 | * the key "foo" is stored as a file name called: | |
7 | * | |
8 | * /0b/ee/0beec7b5ea3f0fdbc95d0dd47f3c5bc275da8a33 | |
9 | * | |
10 | * The couples of characters from the hex output of SHA1 are also used | |
11 | * to locate two two levels of directories to store the file (as most | |
12 | * filesystems are not able to handle too many files in a single dir). | |
13 | * | |
14 | * In the end there are 65536 final directories (256 directories inside | |
15 | * every 256 top level directories), so that with 1 billion of files every | |
16 | * directory will contain in the average 15258 entires, that is ok with | |
17 | * most filesystems implementation. | |
18 | * | |
697af434 | 19 | * Note that since Redis supports multiple databases, the actual key name |
20 | * is: | |
21 | * | |
22 | * /0b/ee/<dbid>_0beec7b5ea3f0fdbc95d0dd47f3c5bc275da8a33 | |
23 | * | |
24 | * so for instance if the key is inside DB 0: | |
25 | * | |
26 | * /0b/ee/0_0beec7b5ea3f0fdbc95d0dd47f3c5bc275da8a33 | |
27 | * | |
28 | * The actaul implementation of this disk store is highly dependant to the | |
29 | * filesystem implementation itself. This implementation may be replaced by | |
52970711 | 30 | * a B+TREE implementation in future implementations. |
31 | * | |
32 | * Data ok every key is serialized using the same format used for .rdb | |
33 | * serialization. Everything is serialized on every entry: key name, | |
34 | * ttl information in case of keys with an associated expire time, and the | |
35 | * serialized value itself. | |
36 | * | |
37 | * Because the format is the same of the .rdb files it is trivial to create | |
38 | * an .rdb file starting from this format just by mean of scanning the | |
39 | * directories and concatenating entries, with the sole addition of an | |
40 | * .rdb header at the start and the end-of-db opcode at the end. | |
41 | * | |
42 | * ------------------------------------------------------------------------- | |
43 | * | |
44 | * Copyright (c) 2010-2011, Salvatore Sanfilippo <antirez at gmail dot com> | |
45 | * All rights reserved. | |
46 | * | |
47 | * Redistribution and use in source and binary forms, with or without | |
48 | * modification, are permitted provided that the following conditions are met: | |
49 | * | |
50 | * * Redistributions of source code must retain the above copyright notice, | |
51 | * this list of conditions and the following disclaimer. | |
52 | * * Redistributions in binary form must reproduce the above copyright | |
53 | * notice, this list of conditions and the following disclaimer in the | |
54 | * documentation and/or other materials provided with the distribution. | |
55 | * * Neither the name of Redis nor the names of its contributors may be used | |
56 | * to endorse or promote products derived from this software without | |
57 | * specific prior written permission. | |
58 | * | |
59 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |
60 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
61 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
62 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |
63 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |
64 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | |
65 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | |
66 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | |
67 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |
68 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | |
69 | * POSSIBILITY OF SUCH DAMAGE. | |
70 | */ | |
71 | ||
72 | #include "redis.h" | |
4ab98823 | 73 | #include "sha1.h" |
52970711 | 74 | |
75 | #include <fcntl.h> | |
76 | #include <sys/stat.h> | |
77 | ||
ddbc81af | 78 | int create256dir(char *prefix) { |
79 | char buf[1024]; | |
80 | int j; | |
81 | ||
82 | for (j = 0; j < 256; j++) { | |
83 | snprintf(buf,sizeof(buf),"%s%02x",prefix,j); | |
84 | if (mkdir(buf,0755) == -1) { | |
85 | redisLog(REDIS_WARNING,"Error creating dir %s for diskstore: %s", | |
86 | buf,strerror(errno)); | |
87 | return REDIS_ERR; | |
88 | } | |
89 | } | |
90 | return REDIS_OK; | |
91 | } | |
92 | ||
52970711 | 93 | int dsOpen(void) { |
94 | struct stat sb; | |
ddbc81af | 95 | int retval, j; |
697af434 | 96 | char *path = server.ds_path; |
ddbc81af | 97 | char buf[1024]; |
52970711 | 98 | |
99 | if ((retval = stat(path,&sb) == -1) && errno != ENOENT) { | |
100 | redisLog(REDIS_WARNING, "Error opening disk store at %s: %s", | |
101 | path, strerror(errno)); | |
102 | return REDIS_ERR; | |
103 | } | |
104 | ||
105 | /* Directory already in place. Assume everything is ok. */ | |
67b0b41c | 106 | if (retval == 0 && S_ISDIR(sb.st_mode)) { |
107 | redisLog(REDIS_NOTICE,"Disk store %s exists", path); | |
108 | return REDIS_OK; | |
109 | } | |
52970711 | 110 | |
111 | /* File exists but it's not a directory */ | |
112 | if (retval == 0 && !S_ISDIR(sb.st_mode)) { | |
113 | redisLog(REDIS_WARNING,"Disk store at %s is not a directory", path); | |
114 | return REDIS_ERR; | |
115 | } | |
116 | ||
117 | /* New disk store, create the directory structure now, as creating | |
118 | * them in a lazy way is not a good idea, after very few insertions | |
119 | * we'll need most of the 65536 directories anyway. */ | |
67b0b41c | 120 | redisLog(REDIS_NOTICE,"Disk store %s does not exist: creating", path); |
ddbc81af | 121 | if (mkdir(path,0755) == -1) { |
52970711 | 122 | redisLog(REDIS_WARNING,"Disk store init failed creating dir %s: %s", |
123 | path, strerror(errno)); | |
124 | return REDIS_ERR; | |
125 | } | |
ddbc81af | 126 | /* Create the top level 256 directories */ |
127 | snprintf(buf,sizeof(buf),"%s/",path); | |
128 | if (create256dir(buf) == REDIS_ERR) return REDIS_ERR; | |
129 | ||
130 | /* For every 256 top level dir, create 256 nested dirs */ | |
131 | for (j = 0; j < 256; j++) { | |
132 | snprintf(buf,sizeof(buf),"%s/%02x/",path,j); | |
133 | if (create256dir(buf) == REDIS_ERR) return REDIS_ERR; | |
134 | } | |
52970711 | 135 | return REDIS_OK; |
136 | } | |
137 | ||
138 | int dsClose(void) { | |
139 | return REDIS_OK; | |
140 | } | |
141 | ||
4ab98823 | 142 | /* Convert key into full path for this object. Dirty but hopefully |
143 | * is fast enough. */ | |
1fce3201 | 144 | void dsKeyToPath(redisDb *db, char *buf, robj *key) { |
4ab98823 | 145 | SHA1_CTX ctx; |
146 | unsigned char hash[20]; | |
1fce3201 | 147 | char hex[40], digits[] = "0123456789abcdef"; |
4ab98823 | 148 | int j, l; |
149 | ||
150 | SHA1Init(&ctx); | |
151 | SHA1Update(&ctx,key->ptr,sdslen(key->ptr)); | |
152 | SHA1Final(hash,&ctx); | |
153 | ||
154 | /* Convert the hash into hex format */ | |
155 | for (j = 0; j < 20; j++) { | |
156 | hex[j*2] = digits[(hash[j]&0xF0)>>4]; | |
157 | hex[(j*2)+1] = digits[hash[j]&0x0F]; | |
158 | } | |
159 | ||
160 | /* Create the object path. Start with server.ds_path that's the root dir */ | |
161 | l = sdslen(server.ds_path); | |
162 | memcpy(buf,server.ds_path,l); | |
163 | buf += l; | |
164 | *buf++ = '/'; | |
165 | ||
166 | /* Then add xx/yy/ that is the two level directories */ | |
167 | buf[0] = hex[0]; | |
168 | buf[1] = hex[1]; | |
169 | buf[2] = '/'; | |
170 | buf[3] = hex[2]; | |
171 | buf[4] = hex[3]; | |
172 | buf[5] = '/'; | |
173 | buf += 6; | |
174 | ||
175 | /* Add the database number followed by _ and finall the SHA1 hex */ | |
176 | l = ll2string(buf,64,db->id); | |
177 | buf += l; | |
178 | buf[0] = '_'; | |
179 | memcpy(buf+1,hex,40); | |
180 | buf[41] = '\0'; | |
181 | } | |
182 | ||
52970711 | 183 | int dsSet(redisDb *db, robj *key, robj *val) { |
4ab98823 | 184 | char buf[1024]; |
185 | FILE *fp; | |
186 | int retval; | |
187 | ||
1fce3201 | 188 | dsKeyToPath(db,buf,key); |
4ab98823 | 189 | fp = fopen(buf,"w"); |
190 | if ((retval = rdbSaveKeyValuePair(fp,db,key,val,time(NULL))) == -1) | |
191 | return REDIS_ERR; | |
192 | fclose(fp); | |
1fce3201 | 193 | if (retval == 0) unlink(buf); /* Expired key. Unlink failing not critical */ |
4ab98823 | 194 | return REDIS_OK; |
52970711 | 195 | } |
196 | ||
4ab98823 | 197 | robj *dsGet(redisDb *db, robj *key, time_t *expire) { |
1fce3201 | 198 | char buf[1024]; |
199 | int type; | |
200 | time_t expiretime = -1; /* -1 means: no expire */ | |
201 | robj *dskey; /* Key as loaded from disk. */ | |
202 | robj *val; | |
203 | FILE *fp; | |
204 | ||
205 | dsKeyToPath(db,buf,key); | |
206 | fp = fopen(buf,"r"); | |
207 | if (fp == NULL && errno == ENOENT) return NULL; /* No such key */ | |
208 | if (fp == NULL) { | |
209 | redisLog(REDIS_WARNING,"Disk store failed opening %s: %s", | |
210 | buf, strerror(errno)); | |
211 | goto readerr; | |
212 | } | |
213 | ||
214 | if ((type = rdbLoadType(fp)) == -1) goto readerr; | |
215 | if (type == REDIS_EXPIRETIME) { | |
216 | if ((expiretime = rdbLoadTime(fp)) == -1) goto readerr; | |
217 | /* We read the time so we need to read the object type again */ | |
218 | if ((type = rdbLoadType(fp)) == -1) goto readerr; | |
219 | } | |
220 | /* Read key */ | |
221 | if ((dskey = rdbLoadStringObject(fp)) == NULL) goto readerr; | |
222 | /* Read value */ | |
223 | if ((val = rdbLoadObject(type,fp)) == NULL) goto readerr; | |
224 | fclose(fp); | |
225 | ||
226 | /* The key we asked, and the key returned, must be the same */ | |
227 | redisAssert(equalStringObjects(key,dskey)); | |
228 | ||
229 | /* Check if the key already expired */ | |
230 | decrRefCount(dskey); | |
231 | if (expiretime != -1 && expiretime < time(NULL)) { | |
232 | decrRefCount(val); | |
233 | unlink(buf); /* This failing is non critical here */ | |
234 | return NULL; | |
235 | } | |
236 | ||
237 | /* Everything ok... */ | |
238 | *expire = expiretime; | |
239 | return val; | |
240 | ||
241 | readerr: | |
c4b64a13 | 242 | redisLog(REDIS_WARNING,"Read error reading reading %s. Corrupted key?", |
243 | buf); | |
1fce3201 | 244 | redisPanic("Unrecoverable error reading from disk store"); |
245 | return NULL; /* unreached */ | |
52970711 | 246 | } |
247 | ||
5ef64098 | 248 | int dsDel(redisDb *db, robj *key) { |
1fce3201 | 249 | char buf[1024]; |
250 | ||
251 | dsKeyToPath(db,buf,key); | |
252 | if (unlink(buf) == -1) { | |
253 | if (errno == ENOENT) { | |
254 | return REDIS_ERR; | |
255 | } else { | |
256 | redisLog(REDIS_WARNING,"Disk store can't remove %s: %s", | |
257 | buf, strerror(errno)); | |
258 | redisPanic("Unrecoverable Disk store errore. Existing."); | |
259 | return REDIS_ERR; /* unreached */ | |
260 | } | |
261 | } else { | |
262 | return REDIS_OK; | |
263 | } | |
5ef64098 | 264 | } |
265 | ||
52970711 | 266 | int dsExists(redisDb *db, robj *key) { |
31222292 | 267 | char buf[1024]; |
268 | ||
269 | dsKeyToPath(db,buf,key); | |
270 | return access(buf,R_OK) == 0; | |
52970711 | 271 | } |
cea8c5cd | 272 | |
273 | int dsFlushDb(int dbid) { | |
274 | } |