2 * Copyright (c) 2010-2012 Apple Inc. All rights reserved.
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
39 #include <sys/types.h>
40 #include <sys/param.h>
42 #include <sys/ioctl.h>
44 #include <sys/param.h>
46 #include "../fsck_hfs.h"
47 #include "fsck_journal.h"
51 #include <hfs/hfs_format.h>
52 #include <libkern/OSByteOrder.h>
54 typedef struct SwapType
{
56 uint16_t (^swap16
)(uint16_t);
57 uint32_t (^swap32
)(uint32_t);
58 uint64_t (^swap64
)(uint64_t);
61 static swapper_t nativeEndian
= {
63 ^(uint16_t x
) { return x
; },
64 ^(uint32_t x
) { return x
; },
65 ^(uint64_t x
) { return x
; }
68 static swapper_t swappedEndian
= {
70 ^(uint16_t x
) { return OSSwapInt16(x
); },
71 ^(uint32_t x
) { return OSSwapInt32(x
); },
72 ^(uint64_t x
) { return OSSwapInt64(x
); }
75 typedef int (^journal_write_block_t
)(off_t
, void *, size_t);
78 // this isn't a great checksum routine but it will do for now.
79 // we use it to checksum the journal header and the block list
80 // headers that are at the start of each transaction.
83 calc_checksum(char *ptr
, int len
)
88 // this is a lame checksum but for now it'll do
89 for(i
= 0; i
< len
; i
++, ptr
++) {
90 cksum
= (cksum
<< 8) ^ (cksum
+ *(unsigned char *)ptr
);
96 typedef struct JournalIOInfo
{
97 int jfd
; // File descriptor for journal buffer
98 int wrapCount
; // Incremented when it wraps around.
99 size_t bSize
; // Block size. I/O needs to be done in that amount.
100 uint64_t base
; // Base offset of journal buffer, past the header
101 uint64_t size
; // Size of the journal, minus the header size
102 uint64_t end
; // End of the journal (initially the "end" field from the journal header)
103 uint64_t current
; // Current offset; starts at "start"
107 * Attempt to read <length> bytes from the journal buffer.
108 * Since this is a wrapped buffer, it may have to start at the
109 * beginning. info->{base, size, end} are read-only; info->current
110 * is updated with the current offset. It returns the number of bytes
111 * it read, or -1 on error.
114 journalRead(JournalIOInfo_t
*info
, uint8_t *buffer
, size_t length
)
117 uint8_t *ptr
= buffer
;
119 // fprintf(stderr, "%s(%p, %p, %zu)\n", __FUNCTION__, info, buffer, length);
120 if (info
->wrapCount
> 1) {
121 fplog(stderr
, "%s(%p, %p, %zu): journal buffer wrap count = %d\n", __FUNCTION__
, info
, buffer
, length
, info
->wrapCount
);
124 while (nread
< length
) {
129 if (info
->end
< info
->current
) {
130 // It wraps, so we max out at base+size
131 end
= info
->base
+ info
->size
;
135 amt
= MIN((length
- nread
), (end
- info
->current
));
138 fplog(stderr
, "Journal read amount is 0, is that right?\n");
143 n
= pread(info
->jfd
, ptr
, amt
, info
->current
);
145 warn("pread(%d, %p, %zu, %llu)", info
->jfd
, ptr
, amt
, info
->current
);
150 fplog(stderr
, "%s(%d): Wanted to read %zu, but only read %zd\n", __FUNCTION__
, __LINE__
, amt
, n
);
156 if (info
->current
== (info
->base
+ info
->size
)) {
157 info
->current
= info
->base
;
166 * Read a transaction from the journal buffer.
167 * A transaction is a list of block_list_headers, and their
168 * associated data. It needs to read all of the block_lists in
169 * a transaction, or it fails. It returns NULL if there are
170 * no transactions, and on error. (Maybe that should change?)
172 static block_list_header
*
173 getJournalTransaction(JournalIOInfo_t
*jinfo
, swapper_t
*swap
)
175 block_list_header
*retval
= NULL
;
176 uint8_t block
[jinfo
->bSize
];
177 block_list_header
*hdr
= (void*)&block
;
181 memset(block
, 0, sizeof(block
));
182 nread
= journalRead(jinfo
, block
, sizeof(block
));
184 (size_t)nread
!= sizeof(block
)) {
186 plog("%s: wanted %zd, got %zd\n", __FUNCTION__
, sizeof(block
), nread
);
189 if (swap
->swap32(hdr
->num_blocks
) == 0) {
191 * Either there really are no blocks, or this is not a valid
192 * transaction. Either way, there's nothing for us to do here.
195 fplog(stderr
, "%s(%d): hdr->num_blocks == 0\n", __FUNCTION__
, __LINE__
);
199 * Now we check the checksum to see if this is a valid header.
200 * Note that we verify the checksum before reading any more -- if
201 * it's not a valid header, we don't want to read more than a block
204 uint32_t tmpChecksum
= swap
->swap32(hdr
->checksum
);
205 uint32_t compChecksum
;
207 compChecksum
= calc_checksum((void*)hdr
, sizeof(*hdr
));
208 hdr
->checksum
= swap
->swap32(tmpChecksum
);
210 if (compChecksum
!= tmpChecksum
) {
212 fplog(stderr
, "%s(%d): hdr has bad checksum, returning NULL\n", __FUNCTION__
, __LINE__
);
216 if (swap
->swap32(hdr
->bytes_used
) < sizeof(block
)) {
218 fplog(stderr
, "%s(%d): hdr has bytes_used (%u) less than sizeof block (%zd)\n",
219 __FUNCTION__
, __LINE__
, swap
->swap32(hdr
->bytes_used
), sizeof(block
));
224 retval
= malloc(swap
->swap32(hdr
->bytes_used
));
228 memset(retval
, 0, swap
->swap32(hdr
->bytes_used
));
229 memcpy(retval
, block
, sizeof(block
));
230 amt
= swap
->swap32(hdr
->bytes_used
) - sizeof(block
);
231 nread
= journalRead(jinfo
, ((uint8_t*)retval
) + sizeof(block
), amt
);
241 * Replay a transaction.
242 * Transactions have a blockListSize amount of block_list_header, and
243 * are then followed by data. We read it in, verify the checksum, and
244 * if it's good, we call the block that was passed in to do something
245 * with it. Maybe write it out. Maybe laugh about it.
247 * It returns -1 if there was an error before it wrote anything out,
248 * and -2 if there was an error after it wrote something out.
251 * txn -- a block_list_header pointer, which has the description and data
253 * blSize -- the size of the block_list for this journal. (The data
254 * are after the block_list, but part of the same buffer.)
255 * blkSize -- The block size used to convert block numbers to offsets. This
256 * is defined to be the size of the journal header.
257 * swap -- A pointer to a swapper_t used to swap journal data structure elements.
258 * writer -- A block-of-code that does writing.
260 * "writer" should return -1 to stop the replay (this propagates an error up).
263 replayTransaction(block_list_header
*txn
, size_t blSize
, size_t blkSize
, swapper_t
*swap
, journal_write_block_t writer
)
266 uint8_t *endPtr
= ((uint8_t*)txn
) + swap
->swap32(txn
->bytes_used
);
267 uint8_t *dataPtr
= ((uint8_t*)txn
) + blSize
;
269 for (i
= 1; i
< swap
->swap32(txn
->num_blocks
); i
++) {
271 plog("\tBlock %d: blkNum %llu, size %u, data offset = %zd\n", i
, swap
->swap64(txn
->binfo
[i
].bnum
), swap
->swap32(txn
->binfo
[i
].bsize
), dataPtr
- (uint8_t*)txn
);
274 * Check with security types on these checks. Need to ensure
275 * that the fields don't take us off into the dark scary woods.
276 * It's mostly the second one that I am unsure about.
278 if (dataPtr
> endPtr
) {
280 plog("\tData out of range for block_list_header\n");
283 if ((endPtr
- dataPtr
) < swap
->swap32(txn
->binfo
[i
].bsize
)) {
285 plog("\tData size for block %d out of range for block_list_header\n", i
);
288 if ((dataPtr
+ swap
->swap32(txn
->binfo
[i
].bsize
)) > endPtr
) {
290 plog("\tData end out of range for block_list_header\n");
293 // Just for debugging
295 if (swap
->swap64(txn
->binfo
[i
].bnum
) == 2) {
296 HFSPlusVolumeHeader
*vp
= (void*)dataPtr
;
297 plog("vp->signature = %#x, version = %#x\n", vp
->signature
, vp
->version
);
300 // It's in the spec, and I saw it come up once on a live volume.
301 if (swap
->swap64(txn
->binfo
[i
].bnum
) == ~(uint64_t)0) {
303 plog("\tSkipping this block due to magic skip number\n");
305 // Should we set retval to -2 here?
307 if ((writer
)(swap
->swap64(txn
->binfo
[i
].bnum
) * blkSize
, dataPtr
, swap
->swap32(txn
->binfo
[i
].bsize
)) == -1)
311 dataPtr
+= swap
->swap32(txn
->binfo
[i
].bsize
);
318 * Read a journal header in from the journal device.
321 loadJournalHeader(int jfd
, off_t offset
, size_t blockSize
, journal_header
*jhp
)
323 uint8_t buffer
[blockSize
];
326 nread
= pread(jfd
, buffer
, sizeof(buffer
), offset
);
328 (size_t)nread
!= sizeof(buffer
)) {
329 warn("tried to read %zu for journal header buffer, got %zd", sizeof(buffer
), nread
);
332 *jhp
= *(journal_header
*)buffer
;
337 * Replay a journal (called "journal_open" because you have to
338 * replay it as part of opening it). At this point, all it
339 * is useful for is replaying the journal.
342 * jfd -- file descriptor for the journal device
343 * offset -- offset (in bytes) of the journal on the journal device
344 * journal_size -- size of the journal (in bytes)
345 * min_fs_blksize -- Blocksize of the data filesystem
346 * flags -- unused for now
347 * jdev_name -- string name for the journal device. used for logging.
348 * do_write_b -- a block which does the actual writing.
350 * Currently, for fsck_hfs, the do_write_b block writes to the cache. It could also
351 * just print out the block numbers, or just check their integrity, as much as is
354 * The function works by loading the journal header. From there, it then starts
355 * loading transactions, via block_list_header groups. When it gets to the end
356 * of the journal, it tries continuing, in case there were transactions that
357 * didn't get updated in the header (this apparently happens).
359 * It returns 0 on success, and -1 on error. Note that there's not a lot
360 * fsck_hfs can probably do in the event of error.
364 journal_open(int jfd
,
365 off_t offset
, // Offset of journal
366 off_t journal_size
, // Size, in bytes, of the entire journal
367 size_t min_fs_blksize
, // Blocksize of the data filesystem, journal blocksize must be at least this size
368 uint32_t flags __unused
, // Not used in this implementation
369 const char *jdev_name
, // The name of the journal device, for logging
370 int (^do_write_b
)(off_t
, void*, size_t))
372 journal_header jhdr
= { 0 };
373 swapper_t
*jnlSwap
; // Used to swap fields of the journal
374 uint32_t tempCksum
; // Temporary checksum value
375 uint32_t jBlkSize
= 0;
377 if (ioctl(jfd
, DKIOCGETBLOCKSIZE
, &jBlkSize
) == -1) {
378 jBlkSize
= min_fs_blksize
;
380 if (jBlkSize
< min_fs_blksize
) {
381 fplog(stderr
, "%s: journal block size %u < min block size %zu for %s\n", __FUNCTION__
, jBlkSize
, min_fs_blksize
, jdev_name
);
384 if ((jBlkSize
% min_fs_blksize
) != 0) {
385 fplog(stderr
, "%s: journal block size %u is not a multiple of fs block size %zu for %s\n", __FUNCTION__
, jBlkSize
, min_fs_blksize
, jdev_name
);
389 if (loadJournalHeader(jfd
, offset
, jBlkSize
, &jhdr
) != 0) {
390 fplog(stderr
, "%s: unable to load journal header from %s\n", __FUNCTION__
, jdev_name
);
395 * Unlike the rest of the filesystem, the journal can be in native or
396 * non-native byte order. Barring moving a filesystem from one host
397 * to another, it'll almost always be in native byte order.
399 if (jhdr
.endian
== ENDIAN_MAGIC
) {
400 jnlSwap
= &nativeEndian
;
401 } else if (OSSwapInt32(jhdr
.endian
) == ENDIAN_MAGIC
) {
402 jnlSwap
= &swappedEndian
;
404 fplog(stderr
, "%s: Unknown journal endian magic number %#x from %s\n", __FUNCTION__
, jhdr
.endian
, jdev_name
);
408 * Two different magic numbers are valid.
409 * Do they mean different things, though?
411 if (jnlSwap
->swap32(jhdr
.magic
) != JOURNAL_HEADER_MAGIC
&&
412 jnlSwap
->swap32(jhdr
.magic
) != OLD_JOURNAL_HEADER_MAGIC
) {
413 fplog(stderr
, "%s: Unknown journal header magic number %#x from %s\n", __FUNCTION__
, jhdr
.magic
, jdev_name
);
418 * Checksums have to be done with the checksum field set to 0.
419 * So we have to stash it aside for a bit, and set the field to
420 * 0, before we can compare. Afterwards, if it compares correctly,
421 * we put the original (swapped, if necessary) value back, just
424 tempCksum
= jnlSwap
->swap32(jhdr
.checksum
);
426 if (jnlSwap
->swap32(jhdr
.magic
) == JOURNAL_HEADER_MAGIC
&&
427 (calc_checksum((void*)&jhdr
, JOURNAL_HEADER_CKSUM_SIZE
) != tempCksum
)) {
428 fplog(stderr
, "%s: Invalid journal checksum from %s\n", __FUNCTION__
, jdev_name
);
431 jhdr
.checksum
= jnlSwap
->swap32(tempCksum
);
434 * Set up information about the journal which we use to do the I/O.
435 * The journal is a circular buffer. However, the start of the journal
436 * buffer is past the journal header. See the JournalIOInfo structure above.
438 off_t startOffset
= jnlSwap
->swap64(jhdr
.start
);
439 off_t endOffset
=jnlSwap
->swap64(jhdr
.end
);
440 off_t journalStart
= offset
+ jnlSwap
->swap32(jhdr
.jhdr_size
);
443 * The journal code was updated to be able to read past the "end" of the journal,
444 * to see if there were any valid transactions there. If we are peeking past the
445 * end, we don't care if we have checksum errors -- that just means they're not
446 * valid transactions.
449 int into_the_weeds
= 0;
450 uint32_t last_sequence_number
= 0;
452 JournalIOInfo_t jinfo
= { 0 };
455 plog("Journal start sequence number = %u\n", jnlSwap
->swap32(jhdr
.sequence_num
));
458 * Now set up the JournalIOInfo object with the file descriptor,
459 * the block size, start and end of the journal buffer, and where
460 * the journal pointer currently is.
463 jinfo
.bSize
= jnlSwap
->swap32(jhdr
.jhdr_size
);
464 jinfo
.base
= journalStart
;
465 jinfo
.size
= journal_size
- jinfo
.bSize
;
466 jinfo
.end
= offset
+ endOffset
;
467 jinfo
.current
= offset
+ startOffset
;
469 const char *state
= "";
471 block_list_header
*txn
= NULL
;
474 * Loop while getting transactions. We exit when we hit a checksum
475 * error, or when the sequence number for a transaction doesn't match
476 * what we expect it to. (That's the trickiest part -- the into_the_weeds
477 * portion of the code. It doesn't match the TN1150 documentation, so
478 * I've had to go by both my experience with real-world journals and by
479 * looking at the kernel code.)
484 if (jinfo
.current
== jinfo
.end
&& into_the_weeds
== 0) {
486 * This is a bit weird, but it works: if current == end, but into_the_weeds is 1,
487 * then this code will not execute. If it does execute, it'll go to get a transaction.
488 * That will put the pointer past end.
490 if (jhdr
.sequence_num
== 0) {
493 * I am not sure about this; this behaviour is not in TN1150 at all,
494 * but I _think_ this is what the kernel is doing.
496 plog("Journal sequence number is 0, is going into the end okay?\n");
500 plog("Attempting to read past stated end of journal\n");
501 state
= "tentative ";
502 jinfo
.end
= (jinfo
.base
+ startOffset
- jinfo
.bSize
);
506 plog("Before getting %stransaction: jinfo.current = %llu\n", state
, jinfo
.current
);
508 * Note that getJournalTransaction verifies the checksum on the block_list_header, so
509 * if it's bad, it'll return NULL.
511 txn
= getJournalTransaction(&jinfo
, jnlSwap
);
514 plog("txn is NULL, jinfo.current = %llu\n", jinfo
.current
);
515 if (into_the_weeds
) {
517 plog("\tBut we do not care, since it is past the end of the journal\n");
524 plog("After getting %stransaction: jinfo.current = %llu\n", state
, jinfo
.current
);
525 plog("%stxn = { %u max_blocks, %u num_blocks, %u bytes_used, binfo[0].next = %u }\n", state
, jnlSwap
->swap32(txn
->max_blocks
), jnlSwap
->swap32(txn
->num_blocks
), jnlSwap
->swap32(txn
->bytes_used
), jnlSwap
->swap32(txn
->binfo
[0].next
));
527 if (into_the_weeds
) {
529 * This seems to be what the kernel was checking: if the
530 * last_sequence_number was set, and the txn sequence number
531 * is set, and the txn sequence number doesn't match either
532 * last_sequence_number _or_ an incremented version of it, then
533 * the transaction isn't worth looking at, and we've reached
534 * the end of the journal.
536 if (last_sequence_number
!= 0 &&
537 txn
->binfo
[0].next
!= 0 &&
538 jnlSwap
->swap32(txn
->binfo
[0].next
) != last_sequence_number
&&
539 jnlSwap
->swap32(txn
->binfo
[0].next
) != (last_sequence_number
+ 1)) {
540 // Probably not a valid transaction
542 plog("\tTentative txn sequence %u is not expected %u, stopping journal replay\n", jnlSwap
->swap32(txn
->binfo
[0].next
), last_sequence_number
+ 1);
547 * If we've got a valid transaction, then we replay it.
548 * If there was an error, we're done with the journal replay.
549 * (If the error occurred after the "end," then we don't care,
550 * and it's not a bad journal.)
552 rv
= replayTransaction(txn
,
553 jnlSwap
->swap32(jhdr
.blhdr_size
),
554 jnlSwap
->swap32(jhdr
.jhdr_size
),
560 plog("\tTransaction replay failed, returned %d\n", rv
);
561 if (into_the_weeds
) {
563 plog("\t\tAnd we don't care\n");
569 last_sequence_number
= jnlSwap
->swap32(txn
->binfo
[0].next
);
577 plog("Journal was bad, stopped replaying\n");