2 * Copyright (c) 2010-2012 Apple Inc. All rights reserved.
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
39 #include <sys/types.h>
40 #include <sys/param.h>
42 #include <sys/ioctl.h>
44 #include <sys/param.h>
46 #include "../fsck_hfs.h"
47 #include "fsck_journal.h"
49 #define DEBUG_JOURNAL 0
53 #include <hfs/hfs_format.h>
54 #include <libkern/OSByteOrder.h>
56 typedef struct SwapType
{
58 uint16_t (^swap16
)(uint16_t);
59 uint32_t (^swap32
)(uint32_t);
60 uint64_t (^swap64
)(uint64_t);
63 static swapper_t nativeEndian
= {
65 ^(uint16_t x
) { return x
; },
66 ^(uint32_t x
) { return x
; },
67 ^(uint64_t x
) { return x
; }
70 static swapper_t swappedEndian
= {
72 ^(uint16_t x
) { return OSSwapInt16(x
); },
73 ^(uint32_t x
) { return OSSwapInt32(x
); },
74 ^(uint64_t x
) { return OSSwapInt64(x
); }
77 typedef int (^journal_write_block_t
)(off_t
, void *, size_t);
80 // this isn't a great checksum routine but it will do for now.
81 // we use it to checksum the journal header and the block list
82 // headers that are at the start of each transaction.
85 calc_checksum(char *ptr
, int len
)
90 // this is a lame checksum but for now it'll do
91 for(i
= 0; i
< len
; i
++, ptr
++) {
92 cksum
= (cksum
<< 8) ^ (cksum
+ *(unsigned char *)ptr
);
98 typedef struct JournalIOInfo
{
99 int jfd
; // File descriptor for journal buffer
100 int wrapCount
; // Incremented when it wraps around.
101 size_t bSize
; // Block size. I/O needs to be done in that amount.
102 uint64_t base
; // Base offset of journal buffer, past the header
103 uint64_t size
; // Size of the journal, minus the header size
104 uint64_t end
; // End of the journal (initially the "end" field from the journal header)
105 uint64_t current
; // Current offset; starts at "start"
109 * Attempt to read <length> bytes from the journal buffer.
110 * Since this is a wrapped buffer, it may have to start at the
111 * beginning. info->{base, size, end} are read-only; info->current
112 * is updated with the current offset. It returns the number of bytes
113 * it read, or -1 on error.
116 journalRead(JournalIOInfo_t
*info
, uint8_t *buffer
, size_t length
)
119 uint8_t *ptr
= buffer
;
121 // fprintf(stderr, "%s(%p, %p, %zu)\n", __FUNCTION__, info, buffer, length);
122 if (info
->wrapCount
> 1) {
123 fplog(stderr
, "%s(%p, %p, %zu): journal buffer wrap count = %d\n", __FUNCTION__
, info
, buffer
, length
, info
->wrapCount
);
126 while (nread
< length
) {
131 if (info
->end
< info
->current
) {
132 // It wraps, so we max out at base+size
133 end
= info
->base
+ info
->size
;
137 amt
= MIN((length
- nread
), (end
- info
->current
));
140 fplog(stderr
, "Journal read amount is 0, is that right?\n");
145 n
= pread(info
->jfd
, ptr
, amt
, info
->current
);
147 warn("pread(%d, %p, %zu, %llu)", info
->jfd
, ptr
, amt
, info
->current
);
152 fplog(stderr
, "%s(%d): Wanted to read %zu, but only read %zd\n", __FUNCTION__
, __LINE__
, amt
, n
);
158 if (info
->current
== (info
->base
+ info
->size
)) {
159 info
->current
= info
->base
;
168 * Read a transaction from the journal buffer.
169 * A transaction is a list of block_list_headers, and their
170 * associated data. It needs to read all of the block_lists in
171 * a transaction, or it fails. It returns NULL if there are
172 * no transactions, and on error. (Maybe that should change?)
174 static block_list_header
*
175 getJournalTransaction(JournalIOInfo_t
*jinfo
, swapper_t
*swap
)
177 block_list_header
*retval
= NULL
;
178 uint8_t block
[jinfo
->bSize
];
179 block_list_header
*hdr
= (void*)&block
;
183 memset(block
, 0, sizeof(block
));
184 nread
= journalRead(jinfo
, block
, sizeof(block
));
186 (size_t)nread
!= sizeof(block
)) {
188 plog("%s: wanted %zd, got %zd\n", __FUNCTION__
, sizeof(block
), nread
);
191 if (swap
->swap32(hdr
->num_blocks
) == 0) {
193 * Either there really are no blocks, or this is not a valid
194 * transaction. Either way, there's nothing for us to do here.
198 fplog(stderr
, "%s(%d): hdr->num_blocks == 0\n", __FUNCTION__
, __LINE__
);
203 * Now we check the checksum to see if this is a valid header.
204 * Note that we verify the checksum before reading any more -- if
205 * it's not a valid header, we don't want to read more than a block
208 uint32_t tmpChecksum
= swap
->swap32(hdr
->checksum
);
209 uint32_t compChecksum
;
211 compChecksum
= calc_checksum((void*)hdr
, sizeof(*hdr
));
212 hdr
->checksum
= swap
->swap32(tmpChecksum
);
214 if (compChecksum
!= tmpChecksum
) {
216 fplog(stderr
, "%s(%d): hdr has bad checksum, returning NULL\n", __FUNCTION__
, __LINE__
);
220 if (swap
->swap32(hdr
->bytes_used
) < sizeof(block
)) {
223 fplog(stderr
, "%s(%d): hdr has bytes_used (%u) less than sizeof block (%zd)\n",
224 __FUNCTION__
, __LINE__
, swap
->swap32(hdr
->bytes_used
), sizeof(block
));
230 retval
= malloc(swap
->swap32(hdr
->bytes_used
));
234 memset(retval
, 0, swap
->swap32(hdr
->bytes_used
));
235 memcpy(retval
, block
, sizeof(block
));
236 amt
= swap
->swap32(hdr
->bytes_used
) - sizeof(block
);
237 nread
= journalRead(jinfo
, ((uint8_t*)retval
) + sizeof(block
), amt
);
247 * Replay a transaction.
248 * Transactions have a blockListSize amount of block_list_header, and
249 * are then followed by data. We read it in, verify the checksum, and
250 * if it's good, we call the block that was passed in to do something
251 * with it. Maybe write it out. Maybe laugh about it.
253 * It returns -1 if there was an error before it wrote anything out,
254 * and -2 if there was an error after it wrote something out.
257 * txn -- a block_list_header pointer, which has the description and data
259 * blSize -- the size of the block_list for this journal. (The data
260 * are after the block_list, but part of the same buffer.)
261 * blkSize -- The block size used to convert block numbers to offsets. This
262 * is defined to be the size of the journal header.
263 * swap -- A pointer to a swapper_t used to swap journal data structure elements.
264 * writer -- A block-of-code that does writing.
266 * "writer" should return -1 to stop the replay (this propagates an error up).
269 replayTransaction(block_list_header
*txn
, size_t blSize
, size_t blkSize
, swapper_t
*swap
, journal_write_block_t writer
)
272 uint8_t *endPtr
= ((uint8_t*)txn
) + swap
->swap32(txn
->bytes_used
);
273 uint8_t *dataPtr
= ((uint8_t*)txn
) + blSize
;
275 for (i
= 1; i
< swap
->swap32(txn
->num_blocks
); i
++) {
278 plog("\tBlock %d: blkNum %llu, size %u, data offset = %zd\n", i
, swap
->swap64(txn
->binfo
[i
].bnum
), swap
->swap32(txn
->binfo
[i
].bsize
), dataPtr
- (uint8_t*)txn
);
282 * Check with security types on these checks. Need to ensure
283 * that the fields don't take us off into the dark scary woods.
284 * It's mostly the second one that I am unsure about.
286 if (dataPtr
> endPtr
) {
288 plog("\tData out of range for block_list_header\n");
291 if ((endPtr
- dataPtr
) < swap
->swap32(txn
->binfo
[i
].bsize
)) {
293 plog("\tData size for block %d out of range for block_list_header\n", i
);
296 if ((dataPtr
+ swap
->swap32(txn
->binfo
[i
].bsize
)) > endPtr
) {
298 plog("\tData end out of range for block_list_header\n");
302 // Just for debugging
304 if (swap
->swap64(txn
->binfo
[i
].bnum
) == 2) {
305 HFSPlusVolumeHeader
*vp
= (void*)dataPtr
;
306 plog("vp->signature = %#x, version = %#x\n", vp
->signature
, vp
->version
);
310 // It's in the spec, and I saw it come up once on a live volume.
311 if (swap
->swap64(txn
->binfo
[i
].bnum
) == ~(uint64_t)0) {
314 plog("\tSkipping this block due to magic skip number\n");
317 // Should we set retval to -2 here?
319 if ((writer
)(swap
->swap64(txn
->binfo
[i
].bnum
) * blkSize
, dataPtr
, swap
->swap32(txn
->binfo
[i
].bsize
)) == -1)
323 dataPtr
+= swap
->swap32(txn
->binfo
[i
].bsize
);
330 * Read a journal header in from the journal device.
333 loadJournalHeader(int jfd
, off_t offset
, size_t blockSize
, journal_header
*jhp
)
335 uint8_t buffer
[blockSize
];
338 nread
= pread(jfd
, buffer
, sizeof(buffer
), offset
);
340 (size_t)nread
!= sizeof(buffer
)) {
341 warn("tried to read %zu for journal header buffer, got %zd", sizeof(buffer
), nread
);
344 *jhp
= *(journal_header
*)buffer
;
349 * Replay a journal (called "journal_open" because you have to
350 * replay it as part of opening it). At this point, all it
351 * is useful for is replaying the journal.
354 * jfd -- file descriptor for the journal device
355 * offset -- offset (in bytes) of the journal on the journal device
356 * journal_size -- size of the journal (in bytes)
357 * min_fs_blksize -- Blocksize of the data filesystem
358 * flags -- unused for now
359 * jdev_name -- string name for the journal device. used for logging.
360 * do_write_b -- a block which does the actual writing.
362 * Currently, for fsck_hfs, the do_write_b block writes to the cache. It could also
363 * just print out the block numbers, or just check their integrity, as much as is
366 * The function works by loading the journal header. From there, it then starts
367 * loading transactions, via block_list_header groups. When it gets to the end
368 * of the journal, it tries continuing, in case there were transactions that
369 * didn't get updated in the header (this apparently happens).
371 * It returns 0 on success, and -1 on error. Note that there's not a lot
372 * fsck_hfs can probably do in the event of error.
376 journal_open(int jfd
,
377 off_t offset
, // Offset of journal
378 off_t journal_size
, // Size, in bytes, of the entire journal
379 size_t min_fs_blksize
, // Blocksize of the data filesystem, journal blocksize must be at least this size
380 uint32_t flags __unused
, // Not used in this implementation
381 const char *jdev_name
, // The name of the journal device, for logging
382 int (^do_write_b
)(off_t
, void*, size_t))
384 journal_header jhdr
= { 0 };
385 swapper_t
*jnlSwap
; // Used to swap fields of the journal
386 uint32_t tempCksum
; // Temporary checksum value
387 uint32_t jBlkSize
= 0;
389 if (ioctl(jfd
, DKIOCGETBLOCKSIZE
, &jBlkSize
) == -1) {
390 jBlkSize
= min_fs_blksize
;
392 if (jBlkSize
< min_fs_blksize
) {
393 fplog(stderr
, "%s: journal block size %u < min block size %zu for %s\n", __FUNCTION__
, jBlkSize
, min_fs_blksize
, jdev_name
);
396 if ((jBlkSize
% min_fs_blksize
) != 0) {
397 fplog(stderr
, "%s: journal block size %u is not a multiple of fs block size %zu for %s\n", __FUNCTION__
, jBlkSize
, min_fs_blksize
, jdev_name
);
401 if (loadJournalHeader(jfd
, offset
, jBlkSize
, &jhdr
) != 0) {
402 fplog(stderr
, "%s: unable to load journal header from %s\n", __FUNCTION__
, jdev_name
);
407 * Unlike the rest of the filesystem, the journal can be in native or
408 * non-native byte order. Barring moving a filesystem from one host
409 * to another, it'll almost always be in native byte order.
411 if (jhdr
.endian
== ENDIAN_MAGIC
) {
412 jnlSwap
= &nativeEndian
;
413 } else if (OSSwapInt32(jhdr
.endian
) == ENDIAN_MAGIC
) {
414 jnlSwap
= &swappedEndian
;
416 fplog(stderr
, "%s: Unknown journal endian magic number %#x from %s\n", __FUNCTION__
, jhdr
.endian
, jdev_name
);
420 * Two different magic numbers are valid.
421 * Do they mean different things, though?
423 if (jnlSwap
->swap32(jhdr
.magic
) != JOURNAL_HEADER_MAGIC
&&
424 jnlSwap
->swap32(jhdr
.magic
) != OLD_JOURNAL_HEADER_MAGIC
) {
425 fplog(stderr
, "%s: Unknown journal header magic number %#x from %s\n", __FUNCTION__
, jhdr
.magic
, jdev_name
);
430 * Checksums have to be done with the checksum field set to 0.
431 * So we have to stash it aside for a bit, and set the field to
432 * 0, before we can compare. Afterwards, if it compares correctly,
433 * we put the original (swapped, if necessary) value back, just
436 tempCksum
= jnlSwap
->swap32(jhdr
.checksum
);
438 if (jnlSwap
->swap32(jhdr
.magic
) == JOURNAL_HEADER_MAGIC
&&
439 (calc_checksum((void*)&jhdr
, JOURNAL_HEADER_CKSUM_SIZE
) != tempCksum
)) {
440 fplog(stderr
, "%s: Invalid journal checksum from %s\n", __FUNCTION__
, jdev_name
);
443 jhdr
.checksum
= jnlSwap
->swap32(tempCksum
);
446 * Set up information about the journal which we use to do the I/O.
447 * The journal is a circular buffer. However, the start of the journal
448 * buffer is past the journal header. See the JournalIOInfo structure above.
450 off_t startOffset
= jnlSwap
->swap64(jhdr
.start
);
451 off_t endOffset
=jnlSwap
->swap64(jhdr
.end
);
452 off_t journalStart
= offset
+ jnlSwap
->swap32(jhdr
.jhdr_size
);
455 * The journal code was updated to be able to read past the "end" of the journal,
456 * to see if there were any valid transactions there. If we are peeking past the
457 * end, we don't care if we have checksum errors -- that just means they're not
458 * valid transactions.
461 int into_the_weeds
= 0;
462 uint32_t last_sequence_number
= 0;
464 JournalIOInfo_t jinfo
= { 0 };
468 plog("Journal start sequence number = %u\n", jnlSwap
->swap32(jhdr
.sequence_num
));
472 * Now set up the JournalIOInfo object with the file descriptor,
473 * the block size, start and end of the journal buffer, and where
474 * the journal pointer currently is.
477 jinfo
.bSize
= jnlSwap
->swap32(jhdr
.jhdr_size
);
478 jinfo
.base
= journalStart
;
479 jinfo
.size
= journal_size
- jinfo
.bSize
;
480 jinfo
.end
= offset
+ endOffset
;
481 jinfo
.current
= offset
+ startOffset
;
483 const char *state
= "";
485 block_list_header
*txn
= NULL
;
488 * Loop while getting transactions. We exit when we hit a checksum
489 * error, or when the sequence number for a transaction doesn't match
490 * what we expect it to. (That's the trickiest part -- the into_the_weeds
491 * portion of the code. It doesn't match the TN1150 documentation, so
492 * I've had to go by both my experience with real-world journals and by
493 * looking at the kernel code.)
498 if (jinfo
.current
== jinfo
.end
&& into_the_weeds
== 0) {
500 * This is a bit weird, but it works: if current == end, but into_the_weeds is 1,
501 * then this code will not execute. If it does execute, it'll go to get a transaction.
502 * That will put the pointer past end.
504 if (jhdr
.sequence_num
== 0) {
507 * I am not sure about this; this behaviour is not in TN1150 at all,
508 * but I _think_ this is what the kernel is doing.
510 plog("Journal sequence number is 0, is going into the end okay?\n");
515 plog("Attempting to read past stated end of journal\n");
517 state
= "tentative ";
518 jinfo
.end
= (jinfo
.base
+ startOffset
- jinfo
.bSize
);
523 plog("Before getting %stransaction: jinfo.current = %llu\n", state
, jinfo
.current
);
526 * Note that getJournalTransaction verifies the checksum on the block_list_header, so
527 * if it's bad, it'll return NULL.
529 txn
= getJournalTransaction(&jinfo
, jnlSwap
);
533 plog("txn is NULL, jinfo.current = %llu\n", jinfo
.current
);
535 if (into_the_weeds
) {
538 plog("\tBut we do not care, since it is past the end of the journal\n");
547 plog("After getting %stransaction: jinfo.current = %llu\n", state
, jinfo
.current
);
548 plog("%stxn = { %u max_blocks, %u num_blocks, %u bytes_used, binfo[0].next = %u }\n", state
, jnlSwap
->swap32(txn
->max_blocks
), jnlSwap
->swap32(txn
->num_blocks
), jnlSwap
->swap32(txn
->bytes_used
), jnlSwap
->swap32(txn
->binfo
[0].next
));
551 if (into_the_weeds
) {
553 * This seems to be what the kernel was checking: if the
554 * last_sequence_number was set, and the txn sequence number
555 * is set, and the txn sequence number doesn't match either
556 * last_sequence_number _or_ an incremented version of it, then
557 * the transaction isn't worth looking at, and we've reached
558 * the end of the journal.
560 if (last_sequence_number
!= 0 &&
561 txn
->binfo
[0].next
!= 0 &&
562 jnlSwap
->swap32(txn
->binfo
[0].next
) != last_sequence_number
&&
563 jnlSwap
->swap32(txn
->binfo
[0].next
) != (last_sequence_number
+ 1)) {
564 // Probably not a valid transaction
567 plog("\tTentative txn sequence %u is not expected %u, stopping journal replay\n", jnlSwap
->swap32(txn
->binfo
[0].next
), last_sequence_number
+ 1);
573 * If we've got a valid transaction, then we replay it.
574 * If there was an error, we're done with the journal replay.
575 * (If the error occurred after the "end," then we don't care,
576 * and it's not a bad journal.)
578 rv
= replayTransaction(txn
,
579 jnlSwap
->swap32(jhdr
.blhdr_size
),
580 jnlSwap
->swap32(jhdr
.jhdr_size
),
586 plog("\tTransaction replay failed, returned %d\n", rv
);
587 if (into_the_weeds
) {
589 plog("\t\tAnd we don't care\n");
595 last_sequence_number
= jnlSwap
->swap32(txn
->binfo
[0].next
);
603 plog("Journal was bad, stopped replaying\n");