]> git.saurik.com Git - apple/hfs.git/blob - fsck_hfs/dfalib/fsck_journal.c
hfs-556.100.11.tar.gz
[apple/hfs.git] / fsck_hfs / dfalib / fsck_journal.c
1 /*
2 * Copyright (c) 2010-2012 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 #include <stdio.h>
30 #include <stddef.h>
31 #include <stdlib.h>
32 #include <string.h>
33 #include <limits.h>
34 #include <err.h>
35 #include <errno.h>
36 #include <fcntl.h>
37 #include <unistd.h>
38 #include <stdarg.h>
39 #include <sys/types.h>
40 #include <sys/param.h>
41 #include <sys/stat.h>
42 #include <sys/ioctl.h>
43 #include <sys/disk.h>
44 #include <sys/param.h>
45
46 #include "../fsck_hfs.h"
47 #include "fsck_journal.h"
48
49 #define DEBUG_JOURNAL 0
50
51 extern char debug;
52
53 #include <hfs/hfs_format.h>
54 #include <libkern/OSByteOrder.h>
55
/*
 * A named set of conversion operations for 16-, 32- and 64-bit values.
 * Two instances are defined below: nativeEndian returns its argument
 * unchanged, and swappedEndian byte-swaps it.
 */
typedef struct SwapType {
	const char *name;		// Human-readable description of this swapper
	uint16_t (^swap16)(uint16_t);	// Convert a 16-bit value
	uint32_t (^swap32)(uint32_t);	// Convert a 32-bit value
	uint64_t (^swap64)(uint64_t);	// Convert a 64-bit value
} swapper_t;
62
63 static swapper_t nativeEndian = {
64 "native endian",
65 ^(uint16_t x) { return x; },
66 ^(uint32_t x) { return x; },
67 ^(uint64_t x) { return x; }
68 };
69
70 static swapper_t swappedEndian = {
71 "swapped endian",
72 ^(uint16_t x) { return OSSwapInt16(x); },
73 ^(uint32_t x) { return OSSwapInt32(x); },
74 ^(uint64_t x) { return OSSwapInt64(x); }
75 };
76
/*
 * Block type used to write out one replayed journal block.  As invoked by
 * replayTransaction(), the arguments are the destination byte offset, a
 * pointer to the data, and the data length in bytes; returning -1 stops
 * the replay.
 */
typedef int (^journal_write_block_t)(off_t, void *, size_t);
78
//
// Simple (admittedly weak) checksum over <len> bytes starting at <ptr>.
// Each byte is folded into a 32-bit accumulator, and the bitwise complement
// of the accumulator is returned.  A non-positive <len> yields ~0.
// It is used to checksum the journal header and the block list headers
// that are at the start of each transaction.
//
static uint32_t
calc_checksum(char *ptr, int len)
{
	const unsigned char *cursor = (const unsigned char *)ptr;
	uint32_t sum = 0;
	int remaining;

	for (remaining = len; remaining > 0; remaining--) {
		sum = (sum << 8) ^ (sum + *cursor);
		cursor++;
	}

	return ~sum;
}
97
/*
 * State used by journalRead() to read from the journal's wrapped buffer.
 * The offsets are byte offsets handed to pread() on jfd (or compared
 * against such offsets).
 */
typedef struct JournalIOInfo {
	int		jfd;		// File descriptor for journal buffer
	int		wrapCount;	// Incremented when it wraps around.
	size_t		bSize;		// Block size.  I/O needs to be done in that amount.
	uint64_t	base;		// Base offset of journal buffer, past the header
	uint64_t	size;		// Size of the journal, minus the header size
	uint64_t	end;		// End of the journal (initially the "end" field from the journal header)
	uint64_t	current;	// Current offset; starts at "start"
} JournalIOInfo_t;
107
108 /*
109 * Attempt to read <length> bytes from the journal buffer.
110 * Since this is a wrapped buffer, it may have to start at the
111 * beginning. info->{base, size, end} are read-only; info->current
112 * is updated with the current offset. It returns the number of bytes
113 * it read, or -1 on error.
114 */
115 static ssize_t
116 journalRead(JournalIOInfo_t *info, uint8_t *buffer, size_t length)
117 {
118 size_t nread = 0;
119 uint8_t *ptr = buffer;
120
121 // fprintf(stderr, "%s(%p, %p, %zu)\n", __FUNCTION__, info, buffer, length);
122 if (info->wrapCount > 1) {
123 fplog(stderr, "%s(%p, %p, %zu): journal buffer wrap count = %d\n", __FUNCTION__, info, buffer, length, info->wrapCount);
124 return -1;
125 }
126 while (nread < length) {
127 off_t end;
128 size_t amt;
129 ssize_t n;
130
131 if (info->end < info->current) {
132 // It wraps, so we max out at bse+size
133 end = info->base + info->size;
134 } else {
135 end = info->end;
136 }
137 amt = MIN((length - nread), (end - info->current));
138 if (amt == 0) {
139 if (debug) {
140 fplog(stderr, "Journal read amount is 0, is that right?\n");
141 }
142 goto done;
143 }
144
145 n = pread(info->jfd, ptr, amt, info->current);
146 if (n == -1) {
147 warn("pread(%d, %p, %zu, %llu)", info->jfd, ptr, amt, info->current);
148 goto done;
149 }
150 if (n != amt) {
151 if (debug) {
152 fplog(stderr, "%s(%d): Wanted to read %zu, but only read %zd\n", __FUNCTION__, __LINE__, amt, n);
153 }
154 }
155 nread += n;
156 ptr += n;
157 info->current += n;
158 if (info->current == (info->base + info->size)) {
159 info->current = info->base;
160 info->wrapCount++;
161 }
162 }
163 done:
164 return nread;
165 }
166
167 /*
168 * Read a transaction from the journal buffer.
169 * A transaction is a list of block_list_headers, and their
170 * associated data. It needs to read all of the block_lists in
171 * a transaction, or it fails. It returns NULL if there are
172 * no transactions, and on error. (Maybe that should change?)
173 */
174 static block_list_header *
175 getJournalTransaction(JournalIOInfo_t *jinfo, swapper_t *swap)
176 {
177 block_list_header *retval = NULL;
178 uint8_t block[jinfo->bSize];
179 block_list_header *hdr = (void*)&block;
180 ssize_t nread;
181 ssize_t amt;
182
183 memset(block, 0, sizeof(block));
184 nread = journalRead(jinfo, block, sizeof(block));
185 if (nread == -1 ||
186 (size_t)nread != sizeof(block)) {
187 if (debug)
188 plog("%s: wanted %zd, got %zd\n", __FUNCTION__, sizeof(block), nread);
189 return NULL;
190 }
191 if (swap->swap32(hdr->num_blocks) == 0) {
192 /*
193 * Either there really are no blocks, or this is not a valid
194 * transaction. Either way, there's nothing for us to do here.
195 */
196 #if DEBUG_JOURNAL
197 if (debug)
198 fplog(stderr, "%s(%d): hdr->num_blocks == 0\n", __FUNCTION__, __LINE__);
199 #endif
200 return NULL;
201 }
202 /*
203 * Now we check the checksum to see if this is a valid header.
204 * Note that we verify the checksum before reading any more -- if
205 * it's not a valid header, we don't want to read more than a block
206 * size.
207 */
208 uint32_t tmpChecksum = swap->swap32(hdr->checksum);
209 uint32_t compChecksum;
210 hdr->checksum = 0;
211 compChecksum = calc_checksum((void*)hdr, sizeof(*hdr));
212 hdr->checksum = swap->swap32(tmpChecksum);
213
214 if (compChecksum != tmpChecksum) {
215 if (debug)
216 fplog(stderr, "%s(%d): hdr has bad checksum, returning NULL\n", __FUNCTION__, __LINE__);
217 return NULL;
218 }
219
220 if (swap->swap32(hdr->bytes_used) < sizeof(block)) {
221 #if DEBUG_JOURNAL
222 if (debug) {
223 fplog(stderr, "%s(%d): hdr has bytes_used (%u) less than sizeof block (%zd)\n",
224 __FUNCTION__, __LINE__, swap->swap32(hdr->bytes_used), sizeof(block));
225 }
226 #endif
227 return NULL;
228 }
229
230 retval = malloc(swap->swap32(hdr->bytes_used));
231 if (retval == NULL)
232 return NULL;
233
234 memset(retval, 0, swap->swap32(hdr->bytes_used));
235 memcpy(retval, block, sizeof(block));
236 amt = swap->swap32(hdr->bytes_used) - sizeof(block);
237 nread = journalRead(jinfo, ((uint8_t*)retval) + sizeof(block), amt);
238 if (nread != amt) {
239 free(retval);
240 return NULL;
241 }
242
243 return retval;
244 }
245
246 /*
247 * Replay a transaction.
248 * Transactions have a blockListSize amount of block_list_header, and
249 * are then followed by data. We read it in, verify the checksum, and
250 * if it's good, we call the block that was passed in to do something
251 * with it. Maybe write it out. Maybe laugh about it.
252 *
253 * It returns -1 if there was an error before it wrote anything out,
254 * and -2 if there was an error after it wrote something out.
255 *
256 * The arguments are:
257 * txn -- a block_list_header pointer, which has the description and data
258 * to be replayed.
259 * blSize -- the size of the block_list for this journal. (The data
260 * are after the block_list, but part of the same buffer.)
261 * blkSize -- The block size used to convert block numbers to offsets. This
262 * is defined to be the size of the journal header.
263 * swap -- A pointer to a swapper_t used to swap journal data structure elements.
264 * writer -- A block-of-code that does writing.
265 *
266 * "writer" should return -1 to stop the replay (this propagates an error up).
267 */
268 static int
269 replayTransaction(block_list_header *txn, size_t blSize, size_t blkSize, swapper_t *swap, journal_write_block_t writer)
270 {
271 uint32_t i;
272 uint8_t *endPtr = ((uint8_t*)txn) + swap->swap32(txn->bytes_used);
273 uint8_t *dataPtr = ((uint8_t*)txn) + blSize;
274 int retval = -1;
275 for (i = 1; i < swap->swap32(txn->num_blocks); i++) {
276 #if DEBUG_JOURNAL
277 if (debug)
278 plog("\tBlock %d: blkNum %llu, size %u, data offset = %zd\n", i, swap->swap64(txn->binfo[i].bnum), swap->swap32(txn->binfo[i].bsize), dataPtr - (uint8_t*)txn);
279 #endif
280 /*
281 * XXX
282 * Check with security types on these checks. Need to ensure
283 * that the fields don't take us off into the dark scary woods.
284 * It's mostly the second one that I am unsure about.
285 */
286 if (dataPtr > endPtr) {
287 if (debug)
288 plog("\tData out of range for block_list_header\n");
289 return retval;
290 }
291 if ((endPtr - dataPtr) < swap->swap32(txn->binfo[i].bsize)) {
292 if (debug)
293 plog("\tData size for block %d out of range for block_list_header\n", i);
294 return retval;
295 }
296 if ((dataPtr + swap->swap32(txn->binfo[i].bsize)) > endPtr) {
297 if (debug)
298 plog("\tData end out of range for block_list_header\n");
299 return retval;
300 }
301 #if DEBUG_JOURNAL
302 // Just for debugging
303 if (debug) {
304 if (swap->swap64(txn->binfo[i].bnum) == 2) {
305 HFSPlusVolumeHeader *vp = (void*)dataPtr;
306 plog("vp->signature = %#x, version = %#x\n", vp->signature, vp->version);
307 }
308 }
309 #endif
310 // It's in the spec, and I saw it come up once on a live volume.
311 if (swap->swap64(txn->binfo[i].bnum) == ~(uint64_t)0) {
312 #if DEBUG_JOURNAL
313 if (debug)
314 plog("\tSkipping this block due to magic skip number\n");
315 #endif
316 } else {
317 // Should we set retval to -2 here?
318 if (writer) {
319 if ((writer)(swap->swap64(txn->binfo[i].bnum) * blkSize, dataPtr, swap->swap32(txn->binfo[i].bsize)) == -1)
320 return retval;
321 }
322 }
323 dataPtr += swap->swap32(txn->binfo[i].bsize);
324 retval = -2;
325 }
326 return 0;
327 }
328
329 /*
330 * Read a journal header in from the journal device.
331 */
332 static int
333 loadJournalHeader(int jfd, off_t offset, size_t blockSize, journal_header *jhp)
334 {
335 uint8_t buffer[blockSize];
336 ssize_t nread;
337
338 nread = pread(jfd, buffer, sizeof(buffer), offset);
339 if (nread == -1 ||
340 (size_t)nread != sizeof(buffer)) {
341 warn("tried to read %zu for journal header buffer, got %zd", sizeof(buffer), nread);
342 return -1;
343 }
344 *jhp = *(journal_header*)buffer;
345 return 0;
346 }
347
348 /*
349 * Replay a journal (called "journal_open" because you have to
350 * to replay it as part of opening it). At this point, all it
351 * is useful for is replaying the journal.
352 *
353 * It is passed in:
354 * jfd -- file descriptor for the journal device
355 * offset -- offset (in bytes) of the journal on the journal device
356 * journal_size -- size of the jorunal (in bytes)
357 * min_fs_blksize -- Blocksize of the data filesystem
358 * flags -- unused for now
359 * jdev_name -- string name for the journal device. used for logging.
360 * do_write_b -- a block which does the actual writing.
361 *
362 * Currently, for fsck_hfs, the do_write_b block writes to the cache. It could also
363 * just print out the block numbers, or just check their integrity, as much as is
364 * possible.
365 *
366 * The function works by loading the journal header. From there, it then starts
367 * loading transactions, via block_list_header groups. When it gets to the end
368 * of the journal, it tries continuing, in case there were transactions that
369 * didn't get updated in the header (this apparently happens).
370 *
371 * It returns 0 on success, and -1 on error. Note that there's not a lot
372 * fsck_hfs can probably do in the event of error.
373 *
374 */
375 int
376 journal_open(int jfd,
377 off_t offset, // Offset of journal
378 off_t journal_size, // Size, in bytes, of the entire journal
379 size_t min_fs_blksize, // Blocksize of the data filesystem, journal blocksize must be at least this size
380 uint32_t flags __unused, // Not used in this implementation
381 const char *jdev_name, // The name of the journal device, for logging
382 int (^do_write_b)(off_t, void*, size_t))
383 {
384 journal_header jhdr = { 0 };
385 swapper_t *jnlSwap; // Used to swap fields of the journal
386 uint32_t tempCksum; // Temporary checksum value
387 uint32_t jBlkSize = 0;
388
389 if (ioctl(jfd, DKIOCGETBLOCKSIZE, &jBlkSize) == -1) {
390 jBlkSize = (uint32_t)min_fs_blksize;
391 } else {
392 if (jBlkSize < min_fs_blksize) {
393 fplog(stderr, "%s: journal block size %u < min block size %zu for %s\n", __FUNCTION__, jBlkSize, min_fs_blksize, jdev_name);
394 return -1;
395 }
396 if ((jBlkSize % min_fs_blksize) != 0) {
397 fplog(stderr, "%s: journal block size %u is not a multiple of fs block size %zu for %s\n", __FUNCTION__, jBlkSize, min_fs_blksize, jdev_name);
398 return -1;
399 }
400 }
401 if (loadJournalHeader(jfd, offset, jBlkSize, &jhdr) != 0) {
402 fplog(stderr, "%s: unable to load journal header from %s\n", __FUNCTION__, jdev_name);
403 return -1;
404 }
405
406 /*
407 * Unlike the rest of the filesystem, the journal can be in native or
408 * non-native byte order. Barring moving a filesystem from one host
409 * to another, it'll almost always be in native byte order.
410 */
411 if (jhdr.endian == ENDIAN_MAGIC) {
412 jnlSwap = &nativeEndian;
413 } else if (OSSwapInt32(jhdr.endian) == ENDIAN_MAGIC) {
414 jnlSwap = &swappedEndian;
415 } else {
416 fplog(stderr, "%s: Unknown journal endian magic number %#x from %s\n", __FUNCTION__, jhdr.endian, jdev_name);
417 return -1;
418 }
419 /*
420 * Two different magic numbers are valid.
421 * Do they mean different thigs, though?
422 */
423 if (jnlSwap->swap32(jhdr.magic) != JOURNAL_HEADER_MAGIC &&
424 jnlSwap->swap32(jhdr.magic) != OLD_JOURNAL_HEADER_MAGIC) {
425 fplog(stderr, "%s: Unknown journal header magic number %#x from %s\n", __FUNCTION__, jhdr.magic, jdev_name);
426 return -1;
427 }
428
429 /*
430 * Checksums have to be done with the checksum field set to 0.
431 * So we have to stash it aside for a bit, and set the field to
432 * 0, before we can compare. Afterwards, if it compares correctly,
433 * we put the original (swapped, if necessary) value back, just
434 * in case.
435 */
436 tempCksum = jnlSwap->swap32(jhdr.checksum);
437 jhdr.checksum = 0;
438 if (jnlSwap->swap32(jhdr.magic) == JOURNAL_HEADER_MAGIC &&
439 (calc_checksum((void*)&jhdr, JOURNAL_HEADER_CKSUM_SIZE) != tempCksum)) {
440 fplog(stderr, "%s: Invalid journal checksum from %s\n", __FUNCTION__, jdev_name);
441 return -1;
442 }
443 jhdr.checksum = jnlSwap->swap32(tempCksum);
444
445 /*
446 * Set up information about the journal which we use to do the I/O.
447 * The journal is a circular buffer. However, the start of the journal
448 * buffer is past the journal header. See the JournalIOInfo structure above.
449 */
450 off_t startOffset = jnlSwap->swap64(jhdr.start);
451 off_t endOffset =jnlSwap->swap64(jhdr.end);
452 off_t journalStart = offset + jnlSwap->swap32(jhdr.jhdr_size);
453
454 /*
455 * The journal code was updated to be able to read past the "end" of the journal,
456 * to see if there were any valid transactions there. If we are peeking past the
457 * end, we don't care if we have checksum errors -- that just means they're not
458 * valid transactions.
459 *
460 */
461 int into_the_weeds = 0;
462 uint32_t last_sequence_number = 0;
463
464 JournalIOInfo_t jinfo = { 0 };
465
466 #if DEBUG_JOURNAL
467 if (debug)
468 plog("Journal start sequence number = %u\n", jnlSwap->swap32(jhdr.sequence_num));
469 #endif
470
471 /*
472 * Now set up the JournalIOInfo object with the file descriptor,
473 * the block size, start and end of the journal buffer, and where
474 * the journal pointer currently is.
475 */
476 jinfo.jfd = jfd;
477 jinfo.bSize = jnlSwap->swap32(jhdr.jhdr_size);
478 jinfo.base = journalStart;
479 jinfo.size = journal_size - jinfo.bSize;
480 jinfo.end = offset + endOffset;
481 jinfo.current = offset + startOffset;
482
483 const char *state = "";
484 int bad_journal = 0;
485 block_list_header *txn = NULL;
486
487 /*
488 * Loop while getting transactions. We exit when we hit a checksum
489 * error, or when the sequence number for a transaction doesn't match
490 * what we expect it to. (That's the trickiest part -- the into_the_weeds
491 * portion of the code. It doesn't match the TN11150 documentation, so
492 * I've had to go by both my experience with real-world journals and by
493 * looking at the kernel code.)
494 */
495 while (1) {
496 int rv;
497
498 if (jinfo.current == jinfo.end && into_the_weeds == 0) {
499 /*
500 * This is a bit weird, but it works: if current == end, but gone_into_weeds is 1,
501 * then this code will not execute. If it does execute, it'll go to get a transaction.
502 * That will put the pointer past end.
503 */
504 if (jhdr.sequence_num == 0) {
505 /*
506 * XXX
507 * I am not sure about this; this behaviour is not in TN1150 at all,
508 * but I _think_ this is what the kernel is doing.
509 */
510 #if DEBUG_JOURNAL
511 if (debug)
512 plog("Journal sequence number is 0, is going into the end okay?\n");
513 #endif
514 }
515 into_the_weeds = 1;
516 #if DEBUG_JOURNAL
517 if (debug)
518 plog("Attempting to read past stated end of journal\n");
519 #endif
520 state = "tentative ";
521 jinfo.end = (jinfo.base + startOffset - jinfo.bSize);
522 continue;
523 }
524 #if DEBUG_JOURNAL
525 if (debug)
526 plog("Before getting %stransaction: jinfo.current = %llu\n", state, jinfo.current);
527 #endif
528 /*
529 * Note that getJournalTransaction verifies the checksum on the block_list_header, so
530 * if it's bad, it'll return NULL.
531 */
532 txn = getJournalTransaction(&jinfo, jnlSwap);
533 if (txn == NULL) {
534 #if DEBUG_JOURNAL
535 if (debug)
536 plog("txn is NULL, jinfo.current = %llu\n", jinfo.current);
537 #endif
538 if (into_the_weeds) {
539 #if DEBUG_JOURNAL
540 if (debug)
541 plog("\tBut we do not care, since it is past the end of the journal\n");
542 #endif
543 } else {
544 bad_journal = 1;
545 }
546 break;
547 }
548 #if DEBUG_JOURNAL
549 if (debug) {
550 plog("After getting %stransaction: jinfo.current = %llu\n", state, jinfo.current);
551 plog("%stxn = { %u max_blocks, %u num_blocks, %u bytes_used, binfo[0].next = %u }\n", state, jnlSwap->swap32(txn->max_blocks), jnlSwap->swap32(txn->num_blocks), jnlSwap->swap32(txn->bytes_used), jnlSwap->swap32(txn->binfo[0].next));
552 }
553 #endif
554 if (into_the_weeds) {
555 /*
556 * This seems to be what the kernel was checking: if the
557 * last_sequence_number was set, and the txn sequence number
558 * is set, and the txn sequence number doesn't match either
559 * last_sequence_number _or_ an incremented version of it, then
560 * the transaction isn't worth looking at, and we've reached
561 * the end of the journal.
562 */
563 if (last_sequence_number != 0 &&
564 txn->binfo[0].next != 0 &&
565 jnlSwap->swap32(txn->binfo[0].next) != last_sequence_number &&
566 jnlSwap->swap32(txn->binfo[0].next) != (last_sequence_number + 1)) {
567 // Probably not a valid transaction
568 #if DEBUG_JOURNAL
569 if (debug)
570 plog("\tTentative txn sequence %u is not expected %u, stopping journal replay\n", jnlSwap->swap32(txn->binfo[0].next), last_sequence_number + 1);
571 #endif
572 break;
573 }
574 }
575 /*
576 * If we've got a valid transaction, then we replay it.
577 * If there was an error, we're done with the journal replay.
578 * (If the error occurred after the "end," then we don't care,
579 * and it's not a bad journal.)
580 */
581 rv = replayTransaction(txn,
582 jnlSwap->swap32(jhdr.blhdr_size),
583 jnlSwap->swap32(jhdr.jhdr_size),
584 jnlSwap,
585 do_write_b);
586
587 if (rv < 0) {
588 if (debug)
589 plog("\tTransaction replay failed, returned %d\n", rv);
590 if (into_the_weeds) {
591 if (debug)
592 plog("\t\tAnd we don't care\n");
593 } else {
594 bad_journal = 1;
595 }
596 break;
597 }
598 last_sequence_number = jnlSwap->swap32(txn->binfo[0].next);
599 free(txn);
600 txn = NULL;
601 }
602 if (txn)
603 free(txn);
604 if (bad_journal) {
605 if (debug)
606 plog("Journal was bad, stopped replaying\n");
607 return -1;
608 }
609
610 return 0;
611 }