52970711 |
1 | /* diskstore.c implements a very simple disk backed key-value store used |
2 | * by Redis for the "disk" backend. This implementation uses the filesystem |
3 | * to store key/value pairs. Every file represents a given key. |
4 | * |
5 | * The key path is calculated using the SHA1 of the key name. For instance |
6 | * the key "foo" is stored as a file name called: |
7 | * |
8 | * /0b/ee/0beec7b5ea3f0fdbc95d0dd47f3c5bc275da8a33 |
9 | * |
10 | * The couples of characters from the hex output of SHA1 are also used |
11 | * to locate two two levels of directories to store the file (as most |
12 | * filesystems are not able to handle too many files in a single dir). |
13 | * |
14 | * In the end there are 65536 final directories (256 directories inside |
15 | * every 256 top level directories), so that with 1 billion of files every |
16 | * directory will contain in the average 15258 entires, that is ok with |
17 | * most filesystems implementation. |
18 | * |
697af434 |
19 | * Note that since Redis supports multiple databases, the actual key name |
20 | * is: |
21 | * |
22 | * /0b/ee/<dbid>_0beec7b5ea3f0fdbc95d0dd47f3c5bc275da8a33 |
23 | * |
24 | * so for instance if the key is inside DB 0: |
25 | * |
26 | * /0b/ee/0_0beec7b5ea3f0fdbc95d0dd47f3c5bc275da8a33 |
27 | * |
28 | * The actaul implementation of this disk store is highly dependant to the |
29 | * filesystem implementation itself. This implementation may be replaced by |
52970711 |
30 | * a B+TREE implementation in future implementations. |
31 | * |
32 | * Data ok every key is serialized using the same format used for .rdb |
33 | * serialization. Everything is serialized on every entry: key name, |
34 | * ttl information in case of keys with an associated expire time, and the |
35 | * serialized value itself. |
36 | * |
37 | * Because the format is the same of the .rdb files it is trivial to create |
38 | * an .rdb file starting from this format just by mean of scanning the |
39 | * directories and concatenating entries, with the sole addition of an |
40 | * .rdb header at the start and the end-of-db opcode at the end. |
41 | * |
42 | * ------------------------------------------------------------------------- |
43 | * |
44 | * Copyright (c) 2010-2011, Salvatore Sanfilippo <antirez at gmail dot com> |
45 | * All rights reserved. |
46 | * |
47 | * Redistribution and use in source and binary forms, with or without |
48 | * modification, are permitted provided that the following conditions are met: |
49 | * |
50 | * * Redistributions of source code must retain the above copyright notice, |
51 | * this list of conditions and the following disclaimer. |
52 | * * Redistributions in binary form must reproduce the above copyright |
53 | * notice, this list of conditions and the following disclaimer in the |
54 | * documentation and/or other materials provided with the distribution. |
55 | * * Neither the name of Redis nor the names of its contributors may be used |
56 | * to endorse or promote products derived from this software without |
57 | * specific prior written permission. |
58 | * |
59 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
60 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
61 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
62 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
63 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
64 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
65 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
66 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
67 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
68 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
69 | * POSSIBILITY OF SUCH DAMAGE. |
70 | */ |
71 | |
72 | #include "redis.h" |
4ab98823 |
73 | #include "sha1.h" |
52970711 |
74 | |
75 | #include <fcntl.h> |
76 | #include <sys/stat.h> |
120b9ba8 |
77 | #include <dirent.h> |
52970711 |
78 | |
ddbc81af |
79 | int create256dir(char *prefix) { |
80 | char buf[1024]; |
81 | int j; |
82 | |
83 | for (j = 0; j < 256; j++) { |
84 | snprintf(buf,sizeof(buf),"%s%02x",prefix,j); |
85 | if (mkdir(buf,0755) == -1) { |
86 | redisLog(REDIS_WARNING,"Error creating dir %s for diskstore: %s", |
87 | buf,strerror(errno)); |
88 | return REDIS_ERR; |
89 | } |
90 | } |
91 | return REDIS_OK; |
92 | } |
93 | |
52970711 |
94 | int dsOpen(void) { |
95 | struct stat sb; |
ddbc81af |
96 | int retval, j; |
697af434 |
97 | char *path = server.ds_path; |
ddbc81af |
98 | char buf[1024]; |
52970711 |
99 | |
100 | if ((retval = stat(path,&sb) == -1) && errno != ENOENT) { |
101 | redisLog(REDIS_WARNING, "Error opening disk store at %s: %s", |
102 | path, strerror(errno)); |
103 | return REDIS_ERR; |
104 | } |
105 | |
106 | /* Directory already in place. Assume everything is ok. */ |
67b0b41c |
107 | if (retval == 0 && S_ISDIR(sb.st_mode)) { |
108 | redisLog(REDIS_NOTICE,"Disk store %s exists", path); |
109 | return REDIS_OK; |
110 | } |
52970711 |
111 | |
112 | /* File exists but it's not a directory */ |
113 | if (retval == 0 && !S_ISDIR(sb.st_mode)) { |
114 | redisLog(REDIS_WARNING,"Disk store at %s is not a directory", path); |
115 | return REDIS_ERR; |
116 | } |
117 | |
118 | /* New disk store, create the directory structure now, as creating |
119 | * them in a lazy way is not a good idea, after very few insertions |
120 | * we'll need most of the 65536 directories anyway. */ |
67b0b41c |
121 | redisLog(REDIS_NOTICE,"Disk store %s does not exist: creating", path); |
ddbc81af |
122 | if (mkdir(path,0755) == -1) { |
52970711 |
123 | redisLog(REDIS_WARNING,"Disk store init failed creating dir %s: %s", |
124 | path, strerror(errno)); |
125 | return REDIS_ERR; |
126 | } |
ddbc81af |
127 | /* Create the top level 256 directories */ |
128 | snprintf(buf,sizeof(buf),"%s/",path); |
129 | if (create256dir(buf) == REDIS_ERR) return REDIS_ERR; |
130 | |
131 | /* For every 256 top level dir, create 256 nested dirs */ |
132 | for (j = 0; j < 256; j++) { |
133 | snprintf(buf,sizeof(buf),"%s/%02x/",path,j); |
134 | if (create256dir(buf) == REDIS_ERR) return REDIS_ERR; |
135 | } |
52970711 |
136 | return REDIS_OK; |
137 | } |
138 | |
139 | int dsClose(void) { |
140 | return REDIS_OK; |
141 | } |
142 | |
4ab98823 |
143 | /* Convert key into full path for this object. Dirty but hopefully |
779fa2af |
144 | * is fast enough. Returns the length of the returned path. */ |
145 | int dsKeyToPath(redisDb *db, char *buf, robj *key) { |
4ab98823 |
146 | SHA1_CTX ctx; |
147 | unsigned char hash[20]; |
1fce3201 |
148 | char hex[40], digits[] = "0123456789abcdef"; |
4ab98823 |
149 | int j, l; |
779fa2af |
150 | char *origbuf = buf; |
4ab98823 |
151 | |
152 | SHA1Init(&ctx); |
153 | SHA1Update(&ctx,key->ptr,sdslen(key->ptr)); |
154 | SHA1Final(hash,&ctx); |
155 | |
156 | /* Convert the hash into hex format */ |
157 | for (j = 0; j < 20; j++) { |
158 | hex[j*2] = digits[(hash[j]&0xF0)>>4]; |
159 | hex[(j*2)+1] = digits[hash[j]&0x0F]; |
160 | } |
161 | |
162 | /* Create the object path. Start with server.ds_path that's the root dir */ |
163 | l = sdslen(server.ds_path); |
164 | memcpy(buf,server.ds_path,l); |
165 | buf += l; |
166 | *buf++ = '/'; |
167 | |
168 | /* Then add xx/yy/ that is the two level directories */ |
169 | buf[0] = hex[0]; |
170 | buf[1] = hex[1]; |
171 | buf[2] = '/'; |
172 | buf[3] = hex[2]; |
173 | buf[4] = hex[3]; |
174 | buf[5] = '/'; |
175 | buf += 6; |
176 | |
177 | /* Add the database number followed by _ and finall the SHA1 hex */ |
178 | l = ll2string(buf,64,db->id); |
179 | buf += l; |
180 | buf[0] = '_'; |
181 | memcpy(buf+1,hex,40); |
182 | buf[41] = '\0'; |
779fa2af |
183 | return (buf-origbuf)+41; |
4ab98823 |
184 | } |
185 | |
05600eb8 |
186 | int dsSet(redisDb *db, robj *key, robj *val, time_t expire) { |
779fa2af |
187 | char buf[1024], buf2[1024]; |
4ab98823 |
188 | FILE *fp; |
779fa2af |
189 | int retval, len; |
4ab98823 |
190 | |
779fa2af |
191 | len = dsKeyToPath(db,buf,key); |
192 | memcpy(buf2,buf,len); |
43574a72 |
193 | snprintf(buf2+len,sizeof(buf2)-len,"-%ld-%ld",(long)time(NULL),(long)val); |
1190c6cb |
194 | while ((fp = fopen(buf2,"w")) == NULL) { |
195 | if (errno == ENOSPC) { |
196 | redisLog(REDIS_WARNING,"Diskstore: No space left on device. Please make room and wait 30 seconds for Redis to continue."); |
197 | sleep(30); |
198 | } else { |
199 | redisLog(REDIS_WARNING,"diskstore error opening %s: %s", |
200 | buf2, strerror(errno)); |
201 | redisPanic("Unrecoverable diskstore error. Exiting."); |
202 | } |
203 | } |
05600eb8 |
204 | if ((retval = rdbSaveKeyValuePair(fp,key,val,expire,time(NULL))) == -1) |
4ab98823 |
205 | return REDIS_ERR; |
206 | fclose(fp); |
779fa2af |
207 | if (retval == 0) { |
208 | /* Expired key. Unlink failing not critical */ |
209 | unlink(buf); |
210 | unlink(buf2); |
211 | } else { |
212 | /* Use rename for atomic updadte of value */ |
213 | if (rename(buf2,buf) == -1) { |
214 | redisLog(REDIS_WARNING,"rename(2) returned an error: %s", |
215 | strerror(errno)); |
216 | redisPanic("Unrecoverable diskstore error. Exiting."); |
217 | } |
218 | } |
4ab98823 |
219 | return REDIS_OK; |
52970711 |
220 | } |
221 | |
4ab98823 |
222 | robj *dsGet(redisDb *db, robj *key, time_t *expire) { |
1fce3201 |
223 | char buf[1024]; |
224 | int type; |
225 | time_t expiretime = -1; /* -1 means: no expire */ |
226 | robj *dskey; /* Key as loaded from disk. */ |
227 | robj *val; |
228 | FILE *fp; |
229 | |
230 | dsKeyToPath(db,buf,key); |
231 | fp = fopen(buf,"r"); |
232 | if (fp == NULL && errno == ENOENT) return NULL; /* No such key */ |
233 | if (fp == NULL) { |
234 | redisLog(REDIS_WARNING,"Disk store failed opening %s: %s", |
235 | buf, strerror(errno)); |
236 | goto readerr; |
237 | } |
238 | |
239 | if ((type = rdbLoadType(fp)) == -1) goto readerr; |
240 | if (type == REDIS_EXPIRETIME) { |
241 | if ((expiretime = rdbLoadTime(fp)) == -1) goto readerr; |
242 | /* We read the time so we need to read the object type again */ |
243 | if ((type = rdbLoadType(fp)) == -1) goto readerr; |
244 | } |
245 | /* Read key */ |
246 | if ((dskey = rdbLoadStringObject(fp)) == NULL) goto readerr; |
247 | /* Read value */ |
248 | if ((val = rdbLoadObject(type,fp)) == NULL) goto readerr; |
249 | fclose(fp); |
250 | |
251 | /* The key we asked, and the key returned, must be the same */ |
252 | redisAssert(equalStringObjects(key,dskey)); |
253 | |
254 | /* Check if the key already expired */ |
255 | decrRefCount(dskey); |
256 | if (expiretime != -1 && expiretime < time(NULL)) { |
257 | decrRefCount(val); |
258 | unlink(buf); /* This failing is non critical here */ |
259 | return NULL; |
260 | } |
261 | |
262 | /* Everything ok... */ |
263 | *expire = expiretime; |
264 | return val; |
265 | |
266 | readerr: |
c4b64a13 |
267 | redisLog(REDIS_WARNING,"Read error reading reading %s. Corrupted key?", |
268 | buf); |
1fce3201 |
269 | redisPanic("Unrecoverable error reading from disk store"); |
270 | return NULL; /* unreached */ |
52970711 |
271 | } |
272 | |
5ef64098 |
273 | int dsDel(redisDb *db, robj *key) { |
1fce3201 |
274 | char buf[1024]; |
275 | |
276 | dsKeyToPath(db,buf,key); |
277 | if (unlink(buf) == -1) { |
278 | if (errno == ENOENT) { |
279 | return REDIS_ERR; |
280 | } else { |
281 | redisLog(REDIS_WARNING,"Disk store can't remove %s: %s", |
282 | buf, strerror(errno)); |
283 | redisPanic("Unrecoverable Disk store errore. Existing."); |
284 | return REDIS_ERR; /* unreached */ |
285 | } |
286 | } else { |
287 | return REDIS_OK; |
288 | } |
5ef64098 |
289 | } |
290 | |
52970711 |
291 | int dsExists(redisDb *db, robj *key) { |
31222292 |
292 | char buf[1024]; |
293 | |
294 | dsKeyToPath(db,buf,key); |
295 | return access(buf,R_OK) == 0; |
52970711 |
296 | } |
cea8c5cd |
297 | |
f03fe802 |
298 | int dsGetDbidFromFilename(char *path) { |
299 | char id[64]; |
300 | char *p = strchr(path,'_'); |
301 | int len = (p - path); |
302 | |
303 | redisAssert(p != NULL && len < 64); |
304 | memcpy(id,path,len); |
305 | id[len] = '\0'; |
306 | return atoi(id); |
307 | } |
308 | |
120b9ba8 |
309 | void dsFlushOneDir(char *path, int dbid) { |
310 | DIR *dir; |
311 | struct dirent *dp, de; |
312 | |
313 | dir = opendir(path); |
314 | if (dir == NULL) { |
315 | redisLog(REDIS_WARNING,"Disk store can't open dir %s: %s", |
316 | path, strerror(errno)); |
317 | redisPanic("Unrecoverable Disk store errore. Existing."); |
318 | } |
319 | while(1) { |
0b305fcf |
320 | char buf[1024]; |
321 | |
120b9ba8 |
322 | readdir_r(dir,&de,&dp); |
323 | if (dp == NULL) break; |
324 | if (dp->d_name[0] == '.') continue; |
325 | |
326 | /* Check if we need to remove this entry accordingly to the |
f03fe802 |
327 | * DB number. */ |
328 | if (dbid != -1 && dsGetDbidFromFilename(dp->d_name)) continue; |
0b305fcf |
329 | |
330 | /* Finally unlink the file */ |
331 | snprintf(buf,1024,"%s/%s",path,dp->d_name); |
332 | if (unlink(buf) == -1) { |
120b9ba8 |
333 | redisLog(REDIS_WARNING, |
0b305fcf |
334 | "Can't unlink %s: %s", buf, strerror(errno)); |
120b9ba8 |
335 | redisPanic("Unrecoverable Disk store errore. Existing."); |
336 | } |
337 | } |
338 | closedir(dir); |
339 | } |
340 | |
341 | void dsFlushDb(int dbid) { |
342 | char buf[1024]; |
343 | int j, i; |
344 | |
345 | redisLog(REDIS_NOTICE,"Flushing diskstore DB (%d)",dbid); |
346 | for (j = 0; j < 256; j++) { |
347 | for (i = 0; i < 256; i++) { |
348 | snprintf(buf,1024,"%s/%02x/%02x",server.ds_path,j,i); |
349 | dsFlushOneDir(buf,dbid); |
350 | } |
351 | } |
cea8c5cd |
352 | } |
249ad25f |
353 | |
5b8ce853 |
354 | void dsRdbSaveSetState(int state) { |
355 | pthread_mutex_lock(&server.bgsavethread_mutex); |
356 | server.bgsavethread_state = state; |
357 | pthread_mutex_unlock(&server.bgsavethread_mutex); |
358 | } |
359 | |
36c17a53 |
360 | void *dsRdbSave_thread(void *arg) { |
361 | char tmpfile[256], *filename = (char*)arg; |
f03fe802 |
362 | struct dirent *dp, de; |
363 | int j, i, last_dbid = -1; |
5b8ce853 |
364 | FILE *fp; |
249ad25f |
365 | |
36c17a53 |
366 | /* Change state to ACTIVE, to signal there is a saving thead working. */ |
f03fe802 |
367 | redisLog(REDIS_NOTICE,"Diskstore BGSAVE thread started"); |
368 | dsRdbSaveSetState(REDIS_BGSAVE_THREAD_ACTIVE); |
36c17a53 |
369 | |
249ad25f |
370 | snprintf(tmpfile,256,"temp-%d.rdb", (int) getpid()); |
371 | fp = fopen(tmpfile,"w"); |
372 | if (!fp) { |
5b8ce853 |
373 | redisLog(REDIS_WARNING, "Failed opening .rdb for saving: %s", |
374 | strerror(errno)); |
375 | dsRdbSaveSetState(REDIS_BGSAVE_THREAD_DONE_ERR); |
376 | return NULL; |
249ad25f |
377 | } |
378 | if (fwrite("REDIS0001",9,1,fp) == 0) goto werr; |
379 | |
5b8ce853 |
380 | sleep(5); |
381 | |
249ad25f |
382 | /* Scan all diskstore dirs looking for keys */ |
383 | for (j = 0; j < 256; j++) { |
384 | for (i = 0; i < 256; i++) { |
f03fe802 |
385 | DIR *dir; |
386 | char buf[1024]; |
387 | |
388 | /* For every directory, collect all the keys */ |
389 | snprintf(buf,sizeof(buf),"%s/%02x/%02x",server.ds_path,j,i); |
390 | if ((dir = opendir(buf)) == NULL) { |
391 | redisLog(REDIS_WARNING,"Disk store can't open dir %s: %s", |
392 | buf, strerror(errno)); |
393 | goto werr; |
394 | } |
395 | |
396 | while(1) { |
397 | char buf[1024]; |
398 | int dbid; |
399 | FILE *entryfp; |
400 | |
401 | readdir_r(dir,&de,&dp); |
402 | if (dp == NULL) break; |
403 | if (dp->d_name[0] == '.') continue; |
43574a72 |
404 | /* If there is a '-' char in the file name, it's a temp file */ |
405 | if (strchr(dp->d_name,'-') != NULL) continue; |
f03fe802 |
406 | |
407 | /* Emit the SELECT DB opcode if needed. */ |
408 | dbid = dsGetDbidFromFilename(dp->d_name); |
409 | if (dbid != last_dbid) { |
410 | last_dbid = dbid; |
411 | if (rdbSaveType(fp,REDIS_SELECTDB) == -1) goto werr; |
412 | if (rdbSaveLen(fp,dbid) == -1) goto werr; |
413 | } |
414 | |
415 | /* Let's copy this file into the target .rdb */ |
416 | snprintf(buf,sizeof(buf),"%s/%02x/%02x/%s", |
417 | server.ds_path,j,i,dp->d_name); |
418 | if ((entryfp = fopen(buf,"r")) == NULL) { |
419 | redisLog(REDIS_WARNING,"Can't open %s: %s", |
420 | buf,strerror(errno)); |
421 | closedir(dir); |
422 | goto werr; |
423 | } |
424 | while(1) { |
425 | int nread = fread(buf,1,sizeof(buf),entryfp); |
426 | |
427 | if (nread == 0) { |
428 | if (ferror(entryfp)) { |
429 | redisLog(REDIS_WARNING,"Error reading from file entry while performing BGSAVE for diskstore: %s", strerror(errno)); |
430 | closedir(dir); |
431 | goto werr; |
432 | } else { |
433 | break; |
434 | } |
435 | } |
436 | if (fwrite(buf,1,nread,fp) != (unsigned)nread) { |
437 | closedir(dir); |
438 | goto werr; |
439 | } |
440 | } |
441 | fclose(entryfp); |
442 | } |
443 | closedir(dir); |
249ad25f |
444 | } |
445 | } |
f03fe802 |
446 | |
447 | /* Output the end of file opcode */ |
448 | if (rdbSaveType(fp,REDIS_EOF) == -1) goto werr; |
249ad25f |
449 | |
450 | /* Make sure data will not remain on the OS's output buffers */ |
451 | fflush(fp); |
452 | fsync(fileno(fp)); |
453 | fclose(fp); |
36c17a53 |
454 | zfree(filename); |
249ad25f |
455 | |
456 | /* Use RENAME to make sure the DB file is changed atomically only |
457 | * if the generate DB file is ok. */ |
458 | if (rename(tmpfile,filename) == -1) { |
69bfffb4 |
459 | redisLog(REDIS_WARNING,"Error moving temp DB file on the final destination: %s (diskstore)", strerror(errno)); |
249ad25f |
460 | unlink(tmpfile); |
5b8ce853 |
461 | dsRdbSaveSetState(REDIS_BGSAVE_THREAD_DONE_ERR); |
462 | return NULL; |
249ad25f |
463 | } |
f03fe802 |
464 | redisLog(REDIS_NOTICE,"DB saved on disk by diskstore thread"); |
5b8ce853 |
465 | dsRdbSaveSetState(REDIS_BGSAVE_THREAD_DONE_OK); |
466 | return NULL; |
249ad25f |
467 | |
468 | werr: |
36c17a53 |
469 | zfree(filename); |
249ad25f |
470 | fclose(fp); |
471 | unlink(tmpfile); |
5b8ce853 |
472 | dsRdbSaveSetState(REDIS_BGSAVE_THREAD_DONE_ERR); |
249ad25f |
473 | redisLog(REDIS_WARNING,"Write error saving DB on disk: %s", strerror(errno)); |
5b8ce853 |
474 | return NULL; |
249ad25f |
475 | } |
36c17a53 |
476 | |
cc275067 |
477 | int dsRdbSaveBackground(char *filename) { |
36c17a53 |
478 | pthread_t thread; |
479 | |
480 | if (pthread_create(&thread,NULL,dsRdbSave_thread,zstrdup(filename)) != 0) { |
481 | redisLog(REDIS_WARNING,"Can't create diskstore BGSAVE thread: %s", |
482 | strerror(errno)); |
483 | return REDIS_ERR; |
484 | } else { |
485 | server.bgsavethread = thread; |
486 | return REDIS_OK; |
487 | } |
488 | } |
cc275067 |
489 | |
490 | int dsRdbSave(char *filename) { |
491 | /* A blocking save is actually a non blocking save... just we wait |
492 | * for it to terminate in a non-busy loop. */ |
493 | |
494 | redisLog(REDIS_NOTICE,"Starting a blocking SAVE (BGSAVE + blocking wait)"); |
495 | server.dirty_before_bgsave = server.dirty; |
496 | if (dsRdbSaveBackground(filename) == REDIS_ERR) return REDIS_ERR; |
497 | while(1) { |
498 | usleep(1000); |
499 | int state; |
500 | |
501 | pthread_mutex_lock(&server.bgsavethread_mutex); |
502 | state = server.bgsavethread_state; |
503 | pthread_mutex_unlock(&server.bgsavethread_mutex); |
504 | |
505 | if (state == REDIS_BGSAVE_THREAD_DONE_OK || |
506 | state == REDIS_BGSAVE_THREAD_DONE_ERR) break; |
507 | } |
508 | return REDIS_OK; |
509 | } |