]> git.saurik.com Git - redis.git/blob - src/diskstore.c
9e45305f8eb1a7daebd947c36139f773dd207162
[redis.git] / src / diskstore.c
1 /* diskstore.c implements a very simple disk backed key-value store used
2 * by Redis for the "disk" backend. This implementation uses the filesystem
3 * to store key/value pairs. Every file represents a given key.
4 *
5 * The key path is calculated using the SHA1 of the key name. For instance
6 * the key "foo" is stored as a file name called:
7 *
8 * /0b/ee/0beec7b5ea3f0fdbc95d0dd47f3c5bc275da8a33
9 *
10 * The first two pairs of characters from the hex output of SHA1 are also
11 * used to locate the two levels of directories storing the file (as most
12 * filesystems are not able to handle too many files in a single dir).
13 *
14 * In the end there are 65536 final directories (256 directories inside
15 * each of the 256 top level directories), so that with 1 billion files every
16 * directory will contain on average 15258 entries, which is ok with
17 * most filesystem implementations.
18 *
19 * Note that since Redis supports multiple databases, the actual key name
20 * is:
21 *
22 * /0b/ee/<dbid>_0beec7b5ea3f0fdbc95d0dd47f3c5bc275da8a33
23 *
24 * so for instance if the key is inside DB 0:
25 *
26 * /0b/ee/0_0beec7b5ea3f0fdbc95d0dd47f3c5bc275da8a33
27 *
28 * The actual implementation of this disk store is highly dependent on the
29 * filesystem implementation itself. This implementation may be replaced by
30 * a B+TREE implementation in future versions.
31 *
32 * Data of every key is serialized using the same format used for .rdb
33 * serialization. Everything is serialized on every entry: key name,
34 * ttl information in case of keys with an associated expire time, and the
35 * serialized value itself.
36 *
37 * Because the format is the same as that of .rdb files it is trivial to create
38 * an .rdb file starting from this format just by means of scanning the
39 * directories and concatenating entries, with the sole addition of an
40 * .rdb header at the start and the end-of-db opcode at the end.
41 *
42 * -------------------------------------------------------------------------
43 *
44 * Copyright (c) 2010-2011, Salvatore Sanfilippo <antirez at gmail dot com>
45 * All rights reserved.
46 *
47 * Redistribution and use in source and binary forms, with or without
48 * modification, are permitted provided that the following conditions are met:
49 *
50 * * Redistributions of source code must retain the above copyright notice,
51 * this list of conditions and the following disclaimer.
52 * * Redistributions in binary form must reproduce the above copyright
53 * notice, this list of conditions and the following disclaimer in the
54 * documentation and/or other materials provided with the distribution.
55 * * Neither the name of Redis nor the names of its contributors may be used
56 * to endorse or promote products derived from this software without
57 * specific prior written permission.
58 *
59 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
60 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
61 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
62 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
63 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
64 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
65 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
66 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
67 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
68 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
69 * POSSIBILITY OF SUCH DAMAGE.
70 */
71
72 #include "redis.h"
73 #include "sha1.h"
74
75 #include <fcntl.h>
76 #include <sys/stat.h>
77 #include <dirent.h>
78
79 int create256dir(char *prefix) {
80 char buf[1024];
81 int j;
82
83 for (j = 0; j < 256; j++) {
84 snprintf(buf,sizeof(buf),"%s%02x",prefix,j);
85 if (mkdir(buf,0755) == -1) {
86 redisLog(REDIS_WARNING,"Error creating dir %s for diskstore: %s",
87 buf,strerror(errno));
88 return REDIS_ERR;
89 }
90 }
91 return REDIS_OK;
92 }
93
94 int dsOpen(void) {
95 struct stat sb;
96 int retval, j;
97 char *path = server.ds_path;
98 char buf[1024];
99
100 if ((retval = stat(path,&sb) == -1) && errno != ENOENT) {
101 redisLog(REDIS_WARNING, "Error opening disk store at %s: %s",
102 path, strerror(errno));
103 return REDIS_ERR;
104 }
105
106 /* Directory already in place. Assume everything is ok. */
107 if (retval == 0 && S_ISDIR(sb.st_mode)) {
108 redisLog(REDIS_NOTICE,"Disk store %s exists", path);
109 return REDIS_OK;
110 }
111
112 /* File exists but it's not a directory */
113 if (retval == 0 && !S_ISDIR(sb.st_mode)) {
114 redisLog(REDIS_WARNING,"Disk store at %s is not a directory", path);
115 return REDIS_ERR;
116 }
117
118 /* New disk store, create the directory structure now, as creating
119 * them in a lazy way is not a good idea, after very few insertions
120 * we'll need most of the 65536 directories anyway. */
121 redisLog(REDIS_NOTICE,"Disk store %s does not exist: creating", path);
122 if (mkdir(path,0755) == -1) {
123 redisLog(REDIS_WARNING,"Disk store init failed creating dir %s: %s",
124 path, strerror(errno));
125 return REDIS_ERR;
126 }
127 /* Create the top level 256 directories */
128 snprintf(buf,sizeof(buf),"%s/",path);
129 if (create256dir(buf) == REDIS_ERR) return REDIS_ERR;
130
131 /* For every 256 top level dir, create 256 nested dirs */
132 for (j = 0; j < 256; j++) {
133 snprintf(buf,sizeof(buf),"%s/%02x/",path,j);
134 if (create256dir(buf) == REDIS_ERR) return REDIS_ERR;
135 }
136 return REDIS_OK;
137 }
138
139 int dsClose(void) {
140 return REDIS_OK;
141 }
142
143 /* Convert key into full path for this object. Dirty but hopefully
144 * is fast enough. Returns the length of the returned path. */
145 int dsKeyToPath(redisDb *db, char *buf, robj *key) {
146 SHA1_CTX ctx;
147 unsigned char hash[20];
148 char hex[40], digits[] = "0123456789abcdef";
149 int j, l;
150 char *origbuf = buf;
151
152 SHA1Init(&ctx);
153 SHA1Update(&ctx,key->ptr,sdslen(key->ptr));
154 SHA1Final(hash,&ctx);
155
156 /* Convert the hash into hex format */
157 for (j = 0; j < 20; j++) {
158 hex[j*2] = digits[(hash[j]&0xF0)>>4];
159 hex[(j*2)+1] = digits[hash[j]&0x0F];
160 }
161
162 /* Create the object path. Start with server.ds_path that's the root dir */
163 l = sdslen(server.ds_path);
164 memcpy(buf,server.ds_path,l);
165 buf += l;
166 *buf++ = '/';
167
168 /* Then add xx/yy/ that is the two level directories */
169 buf[0] = hex[0];
170 buf[1] = hex[1];
171 buf[2] = '/';
172 buf[3] = hex[2];
173 buf[4] = hex[3];
174 buf[5] = '/';
175 buf += 6;
176
177 /* Add the database number followed by _ and finall the SHA1 hex */
178 l = ll2string(buf,64,db->id);
179 buf += l;
180 buf[0] = '_';
181 memcpy(buf+1,hex,40);
182 buf[41] = '\0';
183 return (buf-origbuf)+41;
184 }
185
186 int dsSet(redisDb *db, robj *key, robj *val, time_t expire) {
187 char buf[1024], buf2[1024];
188 int retval, len;
189 FILE *fp;
190 rio rdb;
191
192 len = dsKeyToPath(db,buf,key);
193 memcpy(buf2,buf,len);
194 snprintf(buf2+len,sizeof(buf2)-len,"-%ld-%ld",(long)time(NULL),(long)val);
195 while ((fp = fopen(buf2,"w")) == NULL) {
196 if (errno == ENOSPC) {
197 redisLog(REDIS_WARNING,"Diskstore: No space left on device. Please make room and wait 30 seconds for Redis to continue.");
198 sleep(30);
199 } else {
200 redisLog(REDIS_WARNING,"diskstore error opening %s: %s",
201 buf2, strerror(errno));
202 redisPanic("Unrecoverable diskstore error. Exiting.");
203 }
204 }
205
206 rdb = rioInitWithFile(fp);
207 if ((retval = rdbSaveKeyValuePair(&rdb,key,val,expire,time(NULL))) == -1)
208 return REDIS_ERR;
209 fclose(fp);
210 if (retval == 0) {
211 /* Expired key. Unlink failing not critical */
212 unlink(buf);
213 unlink(buf2);
214 } else {
215 /* Use rename for atomic updadte of value */
216 if (rename(buf2,buf) == -1) {
217 redisLog(REDIS_WARNING,"rename(2) returned an error: %s",
218 strerror(errno));
219 redisPanic("Unrecoverable diskstore error. Exiting.");
220 }
221 }
222 return REDIS_OK;
223 }
224
225 robj *dsGet(redisDb *db, robj *key, time_t *expire) {
226 char buf[1024];
227 int type;
228 time_t expiretime = -1; /* -1 means: no expire */
229 robj *dskey; /* Key as loaded from disk. */
230 robj *val;
231 FILE *fp;
232 rio rdb;
233
234 dsKeyToPath(db,buf,key);
235 fp = fopen(buf,"r");
236 if (fp == NULL && errno == ENOENT) return NULL; /* No such key */
237 if (fp == NULL) {
238 redisLog(REDIS_WARNING,"Disk store failed opening %s: %s",
239 buf, strerror(errno));
240 goto readerr;
241 }
242
243 rdb = rioInitWithFile(fp);
244 if ((type = rdbLoadType(&rdb)) == -1) goto readerr;
245 if (type == REDIS_EXPIRETIME) {
246 if ((expiretime = rdbLoadTime(&rdb)) == -1) goto readerr;
247 /* We read the time so we need to read the object type again */
248 if ((type = rdbLoadType(&rdb)) == -1) goto readerr;
249 }
250 /* Read key */
251 if ((dskey = rdbLoadStringObject(&rdb)) == NULL) goto readerr;
252 /* Read value */
253 if ((val = rdbLoadObject(type,&rdb)) == NULL) goto readerr;
254 fclose(fp);
255
256 /* The key we asked, and the key returned, must be the same */
257 redisAssert(equalStringObjects(key,dskey));
258
259 /* Check if the key already expired */
260 decrRefCount(dskey);
261 if (expiretime != -1 && expiretime < time(NULL)) {
262 decrRefCount(val);
263 unlink(buf); /* This failing is non critical here */
264 return NULL;
265 }
266
267 /* Everything ok... */
268 *expire = expiretime;
269 return val;
270
271 readerr:
272 redisLog(REDIS_WARNING,"Read error reading reading %s. Corrupted key?",
273 buf);
274 redisPanic("Unrecoverable error reading from disk store");
275 return NULL; /* unreached */
276 }
277
278 int dsDel(redisDb *db, robj *key) {
279 char buf[1024];
280
281 dsKeyToPath(db,buf,key);
282 if (unlink(buf) == -1) {
283 if (errno == ENOENT) {
284 return REDIS_ERR;
285 } else {
286 redisLog(REDIS_WARNING,"Disk store can't remove %s: %s",
287 buf, strerror(errno));
288 redisPanic("Unrecoverable Disk store errore. Existing.");
289 return REDIS_ERR; /* unreached */
290 }
291 } else {
292 return REDIS_OK;
293 }
294 }
295
296 int dsExists(redisDb *db, robj *key) {
297 char buf[1024];
298
299 dsKeyToPath(db,buf,key);
300 return access(buf,R_OK) == 0;
301 }
302
/* Extract the database number from a key file name of the form
 * "<dbid>_<sha1 hex>": the characters before the first '_' are parsed
 * as a decimal integer with atoi().
 *
 * 'path' must contain an '_' within its first 64 characters, otherwise
 * redisAssert() fails. */
int dsGetDbidFromFilename(char *path) {
    char id[64];
    char *p = strchr(path,'_');
    int len;

    /* Validate 'p' before using it in pointer arithmetic: subtracting
     * from a NULL pointer is undefined behavior. */
    redisAssert(p != NULL);
    len = (int)(p - path);
    redisAssert(len < 64);
    memcpy(id,path,len);
    id[len] = '\0';
    return atoi(id);
}
313
314 void dsFlushOneDir(char *path, int dbid) {
315 DIR *dir;
316 struct dirent *dp, de;
317
318 dir = opendir(path);
319 if (dir == NULL) {
320 redisLog(REDIS_WARNING,"Disk store can't open dir %s: %s",
321 path, strerror(errno));
322 redisPanic("Unrecoverable Disk store errore. Existing.");
323 }
324 while(1) {
325 char buf[1024];
326
327 readdir_r(dir,&de,&dp);
328 if (dp == NULL) break;
329 if (dp->d_name[0] == '.') continue;
330
331 /* Check if we need to remove this entry accordingly to the
332 * DB number. */
333 if (dbid != -1 && dsGetDbidFromFilename(dp->d_name)) continue;
334
335 /* Finally unlink the file */
336 snprintf(buf,1024,"%s/%s",path,dp->d_name);
337 if (unlink(buf) == -1) {
338 redisLog(REDIS_WARNING,
339 "Can't unlink %s: %s", buf, strerror(errno));
340 redisPanic("Unrecoverable Disk store errore. Existing.");
341 }
342 }
343 closedir(dir);
344 }
345
346 void dsFlushDb(int dbid) {
347 char buf[1024];
348 int j, i;
349
350 redisLog(REDIS_NOTICE,"Flushing diskstore DB (%d)",dbid);
351 for (j = 0; j < 256; j++) {
352 for (i = 0; i < 256; i++) {
353 snprintf(buf,1024,"%s/%02x/%02x",server.ds_path,j,i);
354 dsFlushOneDir(buf,dbid);
355 }
356 }
357 }
358
359 void dsRdbSaveSetState(int state) {
360 pthread_mutex_lock(&server.bgsavethread_mutex);
361 server.bgsavethread_state = state;
362 pthread_mutex_unlock(&server.bgsavethread_mutex);
363 }
364
365 void *dsRdbSave_thread(void *arg) {
366 char tmpfile[256], *filename = (char*)arg;
367 struct dirent *dp, de;
368 int j, i, last_dbid = -1;
369 FILE *fp;
370 rio rdb;
371
372 /* Change state to ACTIVE, to signal there is a saving thead working. */
373 redisLog(REDIS_NOTICE,"Diskstore BGSAVE thread started");
374 dsRdbSaveSetState(REDIS_BGSAVE_THREAD_ACTIVE);
375
376 snprintf(tmpfile,256,"temp-%d.rdb", (int) getpid());
377 fp = fopen(tmpfile,"w");
378 if (!fp) {
379 redisLog(REDIS_WARNING, "Failed opening .rdb for saving: %s",
380 strerror(errno));
381 dsRdbSaveSetState(REDIS_BGSAVE_THREAD_DONE_ERR);
382 return NULL;
383 }
384
385 rdb = rioInitWithFile(fp);
386 if (rioWrite(&rdb,"REDIS0001",9) == 0) goto werr;
387
388 sleep(5);
389
390 /* Scan all diskstore dirs looking for keys */
391 for (j = 0; j < 256; j++) {
392 for (i = 0; i < 256; i++) {
393 DIR *dir;
394 char buf[1024];
395
396 /* For every directory, collect all the keys */
397 snprintf(buf,sizeof(buf),"%s/%02x/%02x",server.ds_path,j,i);
398 if ((dir = opendir(buf)) == NULL) {
399 redisLog(REDIS_WARNING,"Disk store can't open dir %s: %s",
400 buf, strerror(errno));
401 goto werr;
402 }
403
404 while(1) {
405 char buf[1024];
406 int dbid;
407 FILE *entryfp;
408
409 readdir_r(dir,&de,&dp);
410 if (dp == NULL) break;
411 if (dp->d_name[0] == '.') continue;
412 /* If there is a '-' char in the file name, it's a temp file */
413 if (strchr(dp->d_name,'-') != NULL) continue;
414
415 /* Emit the SELECT DB opcode if needed. */
416 dbid = dsGetDbidFromFilename(dp->d_name);
417 if (dbid != last_dbid) {
418 last_dbid = dbid;
419 if (rdbSaveType(&rdb,REDIS_SELECTDB) == -1) goto werr;
420 if (rdbSaveLen(&rdb,dbid) == -1) goto werr;
421 }
422
423 /* Let's copy this file into the target .rdb */
424 snprintf(buf,sizeof(buf),"%s/%02x/%02x/%s",
425 server.ds_path,j,i,dp->d_name);
426 if ((entryfp = fopen(buf,"r")) == NULL) {
427 redisLog(REDIS_WARNING,"Can't open %s: %s",
428 buf,strerror(errno));
429 closedir(dir);
430 goto werr;
431 }
432 while(1) {
433 size_t nread = fread(buf,1,sizeof(buf),entryfp);
434
435 if (nread == 0) {
436 if (ferror(entryfp)) {
437 redisLog(REDIS_WARNING,"Error reading from file entry while performing BGSAVE for diskstore: %s", strerror(errno));
438 closedir(dir);
439 goto werr;
440 } else {
441 break;
442 }
443 }
444 if (rioWrite(&rdb,buf,nread) == 0) {
445 closedir(dir);
446 goto werr;
447 }
448 }
449 fclose(entryfp);
450 }
451 closedir(dir);
452 }
453 }
454
455 /* Output the end of file opcode */
456 if (rdbSaveType(&rdb,REDIS_EOF) == -1) goto werr;
457
458 /* Make sure data will not remain on the OS's output buffers */
459 fflush(fp);
460 fsync(fileno(fp));
461 fclose(fp);
462 zfree(filename);
463
464 /* Use RENAME to make sure the DB file is changed atomically only
465 * if the generate DB file is ok. */
466 if (rename(tmpfile,filename) == -1) {
467 redisLog(REDIS_WARNING,"Error moving temp DB file on the final destination: %s (diskstore)", strerror(errno));
468 unlink(tmpfile);
469 dsRdbSaveSetState(REDIS_BGSAVE_THREAD_DONE_ERR);
470 return NULL;
471 }
472 redisLog(REDIS_NOTICE,"DB saved on disk by diskstore thread");
473 dsRdbSaveSetState(REDIS_BGSAVE_THREAD_DONE_OK);
474 return NULL;
475
476 werr:
477 zfree(filename);
478 fclose(fp);
479 unlink(tmpfile);
480 dsRdbSaveSetState(REDIS_BGSAVE_THREAD_DONE_ERR);
481 redisLog(REDIS_WARNING,"Write error saving DB on disk: %s", strerror(errno));
482 return NULL;
483 }
484
485 int dsRdbSaveBackground(char *filename) {
486 pthread_t thread;
487
488 if (pthread_create(&thread,NULL,dsRdbSave_thread,zstrdup(filename)) != 0) {
489 redisLog(REDIS_WARNING,"Can't create diskstore BGSAVE thread: %s",
490 strerror(errno));
491 return REDIS_ERR;
492 } else {
493 server.bgsavethread = thread;
494 return REDIS_OK;
495 }
496 }
497
498 int dsRdbSave(char *filename) {
499 /* A blocking save is actually a non blocking save... just we wait
500 * for it to terminate in a non-busy loop. */
501
502 redisLog(REDIS_NOTICE,"Starting a blocking SAVE (BGSAVE + blocking wait)");
503 server.dirty_before_bgsave = server.dirty;
504 if (dsRdbSaveBackground(filename) == REDIS_ERR) return REDIS_ERR;
505 while(1) {
506 usleep(1000);
507 int state;
508
509 pthread_mutex_lock(&server.bgsavethread_mutex);
510 state = server.bgsavethread_state;
511 pthread_mutex_unlock(&server.bgsavethread_mutex);
512
513 if (state == REDIS_BGSAVE_THREAD_DONE_OK ||
514 state == REDIS_BGSAVE_THREAD_DONE_ERR) break;
515 }
516 return REDIS_OK;
517 }