]>
Commit | Line | Data |
---|---|---|
1 | /* diskstore.c implements a very simple disk backed key-value store used | |
2 | * by Redis for the "disk" backend. This implementation uses the filesystem | |
3 | * to store key/value pairs. Every file represents a given key. | |
4 | * | |
5 | * The key path is calculated using the SHA1 of the key name. For instance | |
6 | * the key "foo" is stored as a file name called: | |
7 | * | |
8 | * /0b/ee/0beec7b5ea3f0fdbc95d0dd47f3c5bc275da8a33 | |
9 | * | |
10 | * The couples of characters from the hex output of SHA1 are also used | |
11 | * to locate two two levels of directories to store the file (as most | |
12 | * filesystems are not able to handle too many files in a single dir). | |
13 | * | |
14 | * In the end there are 65536 final directories (256 directories inside | |
15 | * every 256 top level directories), so that with 1 billion of files every | |
16 | * directory will contain in the average 15258 entires, that is ok with | |
17 | * most filesystems implementation. | |
18 | * | |
19 | * Note that since Redis supports multiple databases, the actual key name | |
20 | * is: | |
21 | * | |
22 | * /0b/ee/<dbid>_0beec7b5ea3f0fdbc95d0dd47f3c5bc275da8a33 | |
23 | * | |
24 | * so for instance if the key is inside DB 0: | |
25 | * | |
26 | * /0b/ee/0_0beec7b5ea3f0fdbc95d0dd47f3c5bc275da8a33 | |
27 | * | |
28 | * The actaul implementation of this disk store is highly dependant to the | |
29 | * filesystem implementation itself. This implementation may be replaced by | |
30 | * a B+TREE implementation in future implementations. | |
31 | * | |
32 | * Data ok every key is serialized using the same format used for .rdb | |
33 | * serialization. Everything is serialized on every entry: key name, | |
34 | * ttl information in case of keys with an associated expire time, and the | |
35 | * serialized value itself. | |
36 | * | |
37 | * Because the format is the same of the .rdb files it is trivial to create | |
38 | * an .rdb file starting from this format just by mean of scanning the | |
39 | * directories and concatenating entries, with the sole addition of an | |
40 | * .rdb header at the start and the end-of-db opcode at the end. | |
41 | * | |
42 | * ------------------------------------------------------------------------- | |
43 | * | |
44 | * Copyright (c) 2010-2011, Salvatore Sanfilippo <antirez at gmail dot com> | |
45 | * All rights reserved. | |
46 | * | |
47 | * Redistribution and use in source and binary forms, with or without | |
48 | * modification, are permitted provided that the following conditions are met: | |
49 | * | |
50 | * * Redistributions of source code must retain the above copyright notice, | |
51 | * this list of conditions and the following disclaimer. | |
52 | * * Redistributions in binary form must reproduce the above copyright | |
53 | * notice, this list of conditions and the following disclaimer in the | |
54 | * documentation and/or other materials provided with the distribution. | |
55 | * * Neither the name of Redis nor the names of its contributors may be used | |
56 | * to endorse or promote products derived from this software without | |
57 | * specific prior written permission. | |
58 | * | |
59 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |
60 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
61 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
62 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |
63 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |
64 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | |
65 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | |
66 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | |
67 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |
68 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | |
69 | * POSSIBILITY OF SUCH DAMAGE. | |
70 | */ | |
71 | ||
72 | #include "redis.h" | |
73 | #include "sha1.h" | |
74 | ||
75 | #include <fcntl.h> | |
76 | #include <sys/stat.h> | |
77 | #include <dirent.h> | |
78 | ||
79 | int create256dir(char *prefix) { | |
80 | char buf[1024]; | |
81 | int j; | |
82 | ||
83 | for (j = 0; j < 256; j++) { | |
84 | snprintf(buf,sizeof(buf),"%s%02x",prefix,j); | |
85 | if (mkdir(buf,0755) == -1) { | |
86 | redisLog(REDIS_WARNING,"Error creating dir %s for diskstore: %s", | |
87 | buf,strerror(errno)); | |
88 | return REDIS_ERR; | |
89 | } | |
90 | } | |
91 | return REDIS_OK; | |
92 | } | |
93 | ||
94 | int dsOpen(void) { | |
95 | struct stat sb; | |
96 | int retval, j; | |
97 | char *path = server.ds_path; | |
98 | char buf[1024]; | |
99 | ||
100 | if ((retval = stat(path,&sb) == -1) && errno != ENOENT) { | |
101 | redisLog(REDIS_WARNING, "Error opening disk store at %s: %s", | |
102 | path, strerror(errno)); | |
103 | return REDIS_ERR; | |
104 | } | |
105 | ||
106 | /* Directory already in place. Assume everything is ok. */ | |
107 | if (retval == 0 && S_ISDIR(sb.st_mode)) { | |
108 | redisLog(REDIS_NOTICE,"Disk store %s exists", path); | |
109 | return REDIS_OK; | |
110 | } | |
111 | ||
112 | /* File exists but it's not a directory */ | |
113 | if (retval == 0 && !S_ISDIR(sb.st_mode)) { | |
114 | redisLog(REDIS_WARNING,"Disk store at %s is not a directory", path); | |
115 | return REDIS_ERR; | |
116 | } | |
117 | ||
118 | /* New disk store, create the directory structure now, as creating | |
119 | * them in a lazy way is not a good idea, after very few insertions | |
120 | * we'll need most of the 65536 directories anyway. */ | |
121 | redisLog(REDIS_NOTICE,"Disk store %s does not exist: creating", path); | |
122 | if (mkdir(path,0755) == -1) { | |
123 | redisLog(REDIS_WARNING,"Disk store init failed creating dir %s: %s", | |
124 | path, strerror(errno)); | |
125 | return REDIS_ERR; | |
126 | } | |
127 | /* Create the top level 256 directories */ | |
128 | snprintf(buf,sizeof(buf),"%s/",path); | |
129 | if (create256dir(buf) == REDIS_ERR) return REDIS_ERR; | |
130 | ||
131 | /* For every 256 top level dir, create 256 nested dirs */ | |
132 | for (j = 0; j < 256; j++) { | |
133 | snprintf(buf,sizeof(buf),"%s/%02x/",path,j); | |
134 | if (create256dir(buf) == REDIS_ERR) return REDIS_ERR; | |
135 | } | |
136 | return REDIS_OK; | |
137 | } | |
138 | ||
139 | int dsClose(void) { | |
140 | return REDIS_OK; | |
141 | } | |
142 | ||
143 | /* Convert key into full path for this object. Dirty but hopefully | |
144 | * is fast enough. Returns the length of the returned path. */ | |
145 | int dsKeyToPath(redisDb *db, char *buf, robj *key) { | |
146 | SHA1_CTX ctx; | |
147 | unsigned char hash[20]; | |
148 | char hex[40], digits[] = "0123456789abcdef"; | |
149 | int j, l; | |
150 | char *origbuf = buf; | |
151 | ||
152 | SHA1Init(&ctx); | |
153 | SHA1Update(&ctx,key->ptr,sdslen(key->ptr)); | |
154 | SHA1Final(hash,&ctx); | |
155 | ||
156 | /* Convert the hash into hex format */ | |
157 | for (j = 0; j < 20; j++) { | |
158 | hex[j*2] = digits[(hash[j]&0xF0)>>4]; | |
159 | hex[(j*2)+1] = digits[hash[j]&0x0F]; | |
160 | } | |
161 | ||
162 | /* Create the object path. Start with server.ds_path that's the root dir */ | |
163 | l = sdslen(server.ds_path); | |
164 | memcpy(buf,server.ds_path,l); | |
165 | buf += l; | |
166 | *buf++ = '/'; | |
167 | ||
168 | /* Then add xx/yy/ that is the two level directories */ | |
169 | buf[0] = hex[0]; | |
170 | buf[1] = hex[1]; | |
171 | buf[2] = '/'; | |
172 | buf[3] = hex[2]; | |
173 | buf[4] = hex[3]; | |
174 | buf[5] = '/'; | |
175 | buf += 6; | |
176 | ||
177 | /* Add the database number followed by _ and finall the SHA1 hex */ | |
178 | l = ll2string(buf,64,db->id); | |
179 | buf += l; | |
180 | buf[0] = '_'; | |
181 | memcpy(buf+1,hex,40); | |
182 | buf[41] = '\0'; | |
183 | return (buf-origbuf)+41; | |
184 | } | |
185 | ||
186 | int dsSet(redisDb *db, robj *key, robj *val, time_t expire) { | |
187 | char buf[1024], buf2[1024]; | |
188 | FILE *fp; | |
189 | int retval, len; | |
190 | ||
191 | len = dsKeyToPath(db,buf,key); | |
192 | memcpy(buf2,buf,len); | |
193 | snprintf(buf2+len,sizeof(buf2)-len,"-%ld-%ld",(long)time(NULL),(long)val); | |
194 | while ((fp = fopen(buf2,"w")) == NULL) { | |
195 | if (errno == ENOSPC) { | |
196 | redisLog(REDIS_WARNING,"Diskstore: No space left on device. Please make room and wait 30 seconds for Redis to continue."); | |
197 | sleep(30); | |
198 | } else { | |
199 | redisLog(REDIS_WARNING,"diskstore error opening %s: %s", | |
200 | buf2, strerror(errno)); | |
201 | redisPanic("Unrecoverable diskstore error. Exiting."); | |
202 | } | |
203 | } | |
204 | if ((retval = rdbSaveKeyValuePair(fp,key,val,expire,time(NULL))) == -1) | |
205 | return REDIS_ERR; | |
206 | fclose(fp); | |
207 | if (retval == 0) { | |
208 | /* Expired key. Unlink failing not critical */ | |
209 | unlink(buf); | |
210 | unlink(buf2); | |
211 | } else { | |
212 | /* Use rename for atomic updadte of value */ | |
213 | if (rename(buf2,buf) == -1) { | |
214 | redisLog(REDIS_WARNING,"rename(2) returned an error: %s", | |
215 | strerror(errno)); | |
216 | redisPanic("Unrecoverable diskstore error. Exiting."); | |
217 | } | |
218 | } | |
219 | return REDIS_OK; | |
220 | } | |
221 | ||
222 | robj *dsGet(redisDb *db, robj *key, time_t *expire) { | |
223 | char buf[1024]; | |
224 | int type; | |
225 | time_t expiretime = -1; /* -1 means: no expire */ | |
226 | robj *dskey; /* Key as loaded from disk. */ | |
227 | robj *val; | |
228 | FILE *fp; | |
229 | ||
230 | dsKeyToPath(db,buf,key); | |
231 | fp = fopen(buf,"r"); | |
232 | if (fp == NULL && errno == ENOENT) return NULL; /* No such key */ | |
233 | if (fp == NULL) { | |
234 | redisLog(REDIS_WARNING,"Disk store failed opening %s: %s", | |
235 | buf, strerror(errno)); | |
236 | goto readerr; | |
237 | } | |
238 | ||
239 | if ((type = rdbLoadType(fp)) == -1) goto readerr; | |
240 | if (type == REDIS_EXPIRETIME) { | |
241 | if ((expiretime = rdbLoadTime(fp)) == -1) goto readerr; | |
242 | /* We read the time so we need to read the object type again */ | |
243 | if ((type = rdbLoadType(fp)) == -1) goto readerr; | |
244 | } | |
245 | /* Read key */ | |
246 | if ((dskey = rdbLoadStringObject(fp)) == NULL) goto readerr; | |
247 | /* Read value */ | |
248 | if ((val = rdbLoadObject(type,fp)) == NULL) goto readerr; | |
249 | fclose(fp); | |
250 | ||
251 | /* The key we asked, and the key returned, must be the same */ | |
252 | redisAssert(equalStringObjects(key,dskey)); | |
253 | ||
254 | /* Check if the key already expired */ | |
255 | decrRefCount(dskey); | |
256 | if (expiretime != -1 && expiretime < time(NULL)) { | |
257 | decrRefCount(val); | |
258 | unlink(buf); /* This failing is non critical here */ | |
259 | return NULL; | |
260 | } | |
261 | ||
262 | /* Everything ok... */ | |
263 | *expire = expiretime; | |
264 | return val; | |
265 | ||
266 | readerr: | |
267 | redisLog(REDIS_WARNING,"Read error reading reading %s. Corrupted key?", | |
268 | buf); | |
269 | redisPanic("Unrecoverable error reading from disk store"); | |
270 | return NULL; /* unreached */ | |
271 | } | |
272 | ||
273 | int dsDel(redisDb *db, robj *key) { | |
274 | char buf[1024]; | |
275 | ||
276 | dsKeyToPath(db,buf,key); | |
277 | if (unlink(buf) == -1) { | |
278 | if (errno == ENOENT) { | |
279 | return REDIS_ERR; | |
280 | } else { | |
281 | redisLog(REDIS_WARNING,"Disk store can't remove %s: %s", | |
282 | buf, strerror(errno)); | |
283 | redisPanic("Unrecoverable Disk store errore. Existing."); | |
284 | return REDIS_ERR; /* unreached */ | |
285 | } | |
286 | } else { | |
287 | return REDIS_OK; | |
288 | } | |
289 | } | |
290 | ||
291 | int dsExists(redisDb *db, robj *key) { | |
292 | char buf[1024]; | |
293 | ||
294 | dsKeyToPath(db,buf,key); | |
295 | return access(buf,R_OK) == 0; | |
296 | } | |
297 | ||
298 | int dsGetDbidFromFilename(char *path) { | |
299 | char id[64]; | |
300 | char *p = strchr(path,'_'); | |
301 | int len = (p - path); | |
302 | ||
303 | redisAssert(p != NULL && len < 64); | |
304 | memcpy(id,path,len); | |
305 | id[len] = '\0'; | |
306 | return atoi(id); | |
307 | } | |
308 | ||
309 | void dsFlushOneDir(char *path, int dbid) { | |
310 | DIR *dir; | |
311 | struct dirent *dp, de; | |
312 | ||
313 | dir = opendir(path); | |
314 | if (dir == NULL) { | |
315 | redisLog(REDIS_WARNING,"Disk store can't open dir %s: %s", | |
316 | path, strerror(errno)); | |
317 | redisPanic("Unrecoverable Disk store errore. Existing."); | |
318 | } | |
319 | while(1) { | |
320 | char buf[1024]; | |
321 | ||
322 | readdir_r(dir,&de,&dp); | |
323 | if (dp == NULL) break; | |
324 | if (dp->d_name[0] == '.') continue; | |
325 | ||
326 | /* Check if we need to remove this entry accordingly to the | |
327 | * DB number. */ | |
328 | if (dbid != -1 && dsGetDbidFromFilename(dp->d_name)) continue; | |
329 | ||
330 | /* Finally unlink the file */ | |
331 | snprintf(buf,1024,"%s/%s",path,dp->d_name); | |
332 | if (unlink(buf) == -1) { | |
333 | redisLog(REDIS_WARNING, | |
334 | "Can't unlink %s: %s", buf, strerror(errno)); | |
335 | redisPanic("Unrecoverable Disk store errore. Existing."); | |
336 | } | |
337 | } | |
338 | closedir(dir); | |
339 | } | |
340 | ||
341 | void dsFlushDb(int dbid) { | |
342 | char buf[1024]; | |
343 | int j, i; | |
344 | ||
345 | redisLog(REDIS_NOTICE,"Flushing diskstore DB (%d)",dbid); | |
346 | for (j = 0; j < 256; j++) { | |
347 | for (i = 0; i < 256; i++) { | |
348 | snprintf(buf,1024,"%s/%02x/%02x",server.ds_path,j,i); | |
349 | dsFlushOneDir(buf,dbid); | |
350 | } | |
351 | } | |
352 | } | |
353 | ||
354 | void dsRdbSaveSetState(int state) { | |
355 | pthread_mutex_lock(&server.bgsavethread_mutex); | |
356 | server.bgsavethread_state = state; | |
357 | pthread_mutex_unlock(&server.bgsavethread_mutex); | |
358 | } | |
359 | ||
360 | void *dsRdbSave_thread(void *arg) { | |
361 | char tmpfile[256], *filename = (char*)arg; | |
362 | struct dirent *dp, de; | |
363 | int j, i, last_dbid = -1; | |
364 | FILE *fp; | |
365 | ||
366 | /* Change state to ACTIVE, to signal there is a saving thead working. */ | |
367 | redisLog(REDIS_NOTICE,"Diskstore BGSAVE thread started"); | |
368 | dsRdbSaveSetState(REDIS_BGSAVE_THREAD_ACTIVE); | |
369 | ||
370 | snprintf(tmpfile,256,"temp-%d.rdb", (int) getpid()); | |
371 | fp = fopen(tmpfile,"w"); | |
372 | if (!fp) { | |
373 | redisLog(REDIS_WARNING, "Failed opening .rdb for saving: %s", | |
374 | strerror(errno)); | |
375 | dsRdbSaveSetState(REDIS_BGSAVE_THREAD_DONE_ERR); | |
376 | return NULL; | |
377 | } | |
378 | if (fwrite("REDIS0001",9,1,fp) == 0) goto werr; | |
379 | ||
380 | sleep(5); | |
381 | ||
382 | /* Scan all diskstore dirs looking for keys */ | |
383 | for (j = 0; j < 256; j++) { | |
384 | for (i = 0; i < 256; i++) { | |
385 | DIR *dir; | |
386 | char buf[1024]; | |
387 | ||
388 | /* For every directory, collect all the keys */ | |
389 | snprintf(buf,sizeof(buf),"%s/%02x/%02x",server.ds_path,j,i); | |
390 | if ((dir = opendir(buf)) == NULL) { | |
391 | redisLog(REDIS_WARNING,"Disk store can't open dir %s: %s", | |
392 | buf, strerror(errno)); | |
393 | goto werr; | |
394 | } | |
395 | ||
396 | while(1) { | |
397 | char buf[1024]; | |
398 | int dbid; | |
399 | FILE *entryfp; | |
400 | ||
401 | readdir_r(dir,&de,&dp); | |
402 | if (dp == NULL) break; | |
403 | if (dp->d_name[0] == '.') continue; | |
404 | /* If there is a '-' char in the file name, it's a temp file */ | |
405 | if (strchr(dp->d_name,'-') != NULL) continue; | |
406 | ||
407 | /* Emit the SELECT DB opcode if needed. */ | |
408 | dbid = dsGetDbidFromFilename(dp->d_name); | |
409 | if (dbid != last_dbid) { | |
410 | last_dbid = dbid; | |
411 | if (rdbSaveType(fp,REDIS_SELECTDB) == -1) goto werr; | |
412 | if (rdbSaveLen(fp,dbid) == -1) goto werr; | |
413 | } | |
414 | ||
415 | /* Let's copy this file into the target .rdb */ | |
416 | snprintf(buf,sizeof(buf),"%s/%02x/%02x/%s", | |
417 | server.ds_path,j,i,dp->d_name); | |
418 | if ((entryfp = fopen(buf,"r")) == NULL) { | |
419 | redisLog(REDIS_WARNING,"Can't open %s: %s", | |
420 | buf,strerror(errno)); | |
421 | closedir(dir); | |
422 | goto werr; | |
423 | } | |
424 | while(1) { | |
425 | int nread = fread(buf,1,sizeof(buf),entryfp); | |
426 | ||
427 | if (nread == 0) { | |
428 | if (ferror(entryfp)) { | |
429 | redisLog(REDIS_WARNING,"Error reading from file entry while performing BGSAVE for diskstore: %s", strerror(errno)); | |
430 | closedir(dir); | |
431 | goto werr; | |
432 | } else { | |
433 | break; | |
434 | } | |
435 | } | |
436 | if (fwrite(buf,1,nread,fp) != (unsigned)nread) { | |
437 | closedir(dir); | |
438 | goto werr; | |
439 | } | |
440 | } | |
441 | fclose(entryfp); | |
442 | } | |
443 | closedir(dir); | |
444 | } | |
445 | } | |
446 | ||
447 | /* Output the end of file opcode */ | |
448 | if (rdbSaveType(fp,REDIS_EOF) == -1) goto werr; | |
449 | ||
450 | /* Make sure data will not remain on the OS's output buffers */ | |
451 | fflush(fp); | |
452 | fsync(fileno(fp)); | |
453 | fclose(fp); | |
454 | zfree(filename); | |
455 | ||
456 | /* Use RENAME to make sure the DB file is changed atomically only | |
457 | * if the generate DB file is ok. */ | |
458 | if (rename(tmpfile,filename) == -1) { | |
459 | redisLog(REDIS_WARNING,"Error moving temp DB file on the final destination: %s (diskstore)", strerror(errno)); | |
460 | unlink(tmpfile); | |
461 | dsRdbSaveSetState(REDIS_BGSAVE_THREAD_DONE_ERR); | |
462 | return NULL; | |
463 | } | |
464 | redisLog(REDIS_NOTICE,"DB saved on disk by diskstore thread"); | |
465 | dsRdbSaveSetState(REDIS_BGSAVE_THREAD_DONE_OK); | |
466 | return NULL; | |
467 | ||
468 | werr: | |
469 | zfree(filename); | |
470 | fclose(fp); | |
471 | unlink(tmpfile); | |
472 | dsRdbSaveSetState(REDIS_BGSAVE_THREAD_DONE_ERR); | |
473 | redisLog(REDIS_WARNING,"Write error saving DB on disk: %s", strerror(errno)); | |
474 | return NULL; | |
475 | } | |
476 | ||
477 | int dsRdbSaveBackground(char *filename) { | |
478 | pthread_t thread; | |
479 | ||
480 | if (pthread_create(&thread,NULL,dsRdbSave_thread,zstrdup(filename)) != 0) { | |
481 | redisLog(REDIS_WARNING,"Can't create diskstore BGSAVE thread: %s", | |
482 | strerror(errno)); | |
483 | return REDIS_ERR; | |
484 | } else { | |
485 | server.bgsavethread = thread; | |
486 | return REDIS_OK; | |
487 | } | |
488 | } | |
489 | ||
490 | int dsRdbSave(char *filename) { | |
491 | /* A blocking save is actually a non blocking save... just we wait | |
492 | * for it to terminate in a non-busy loop. */ | |
493 | ||
494 | redisLog(REDIS_NOTICE,"Starting a blocking SAVE (BGSAVE + blocking wait)"); | |
495 | server.dirty_before_bgsave = server.dirty; | |
496 | if (dsRdbSaveBackground(filename) == REDIS_ERR) return REDIS_ERR; | |
497 | while(1) { | |
498 | usleep(1000); | |
499 | int state; | |
500 | ||
501 | pthread_mutex_lock(&server.bgsavethread_mutex); | |
502 | state = server.bgsavethread_state; | |
503 | pthread_mutex_unlock(&server.bgsavethread_mutex); | |
504 | ||
505 | if (state == REDIS_BGSAVE_THREAD_DONE_OK || | |
506 | state == REDIS_BGSAVE_THREAD_DONE_ERR) break; | |
507 | } | |
508 | return REDIS_OK; | |
509 | } |