2 * Copyright (c) 2010-2012 Apple Inc. All rights reserved.
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
39 #include <sys/types.h>
40 #include <sys/param.h>
42 #include <sys/ioctl.h>
44 #include <sys/param.h>
46 #include "../fsck_hfs.h"
47 #include "fsck_journal.h"
51 #include <hfs/hfs_format.h>
52 #include <libkern/OSByteOrder.h>
// Table of byte-order conversion callbacks, one per integer width.
// Journal fields are read through these (for example
// swap->swap32(hdr->num_blocks)), so the same code can handle a journal
// stored in either byte order.
// NOTE(review): the closing line of this typedef is not visible in this
// excerpt; the type is presumably named swapper_t, as used elsewhere in
// this file -- confirm.
54 typedef struct SwapType
{
// Converts a 16-bit journal field.
56 uint16_t (*swap16
)(uint16_t);
// Converts a 32-bit journal field.
57 uint32_t (*swap32
)(uint32_t);
// Converts a 64-bit journal field.
58 uint64_t (*swap64
)(uint64_t);
/*
 * Identity conversion for 16-bit values: returns x unchanged.
 * The signature matches the swap16 member of struct SwapType, so this
 * can be installed where no byte swapping is wanted.
 */
uint16_t ident16(uint16_t x)
{
	return x;
}
/*
 * Identity conversion for 32-bit values: returns x unchanged.
 * The signature matches the swap32 member of struct SwapType, so this
 * can be installed where no byte swapping is wanted.
 */
uint32_t ident32(uint32_t x)
{
	return x;
}
/*
 * Identity conversion for 64-bit values: returns x unchanged.
 * The signature matches the swap64 member of struct SwapType, so this
 * can be installed where no byte swapping is wanted.
 */
uint64_t ident64(uint64_t x)
{
	return x;
}
// Swapper table that journal_open selects when the endian field of the
// journal header equals ENDIAN_MAGIC as read.
// NOTE(review): the initializer entries are not visible in this excerpt;
// presumably the ident16/ident32/ident64 functions -- confirm.
65 static swapper_t nativeEndian
= {
// Swapper table that journal_open selects when the endian field of the
// journal header equals ENDIAN_MAGIC only after OSSwapInt32, i.e. the
// journal was written in the opposite byte order.
// NOTE(review): the initializer entries are not visible in this excerpt;
// presumably byte-swapping functions -- confirm.
72 static swapper_t swappedEndian
= {
// Callback type for writing one replayed journal block.  It takes an
// off_t, a data pointer and a size_t, and returns int.
// replayTransaction calls it as writer(bnum * blkSize, dataPtr, bsize)
// and compares the result with -1.
// NOTE(review): __lambda_ is not defined anywhere in this excerpt;
// presumably it stands in for a block (closure) declarator -- confirm.
79 typedef int (__lambda_ journal_write_block_t
)(off_t
, void *, size_t);
82 // this isn't a great checksum routine but it will do for now.
83 // we use it to checksum the journal header and the block list
84 // headers that are at the start of each transaction.
//
// ptr -- first byte of the region to checksum
// len -- number of bytes to fold into the checksum
// NOTE(review): the return type, local declarations and return statement
// are not visible in this excerpt.
87 calc_checksum(char *ptr
, int len
)
92 // this is a lame checksum but for now it'll do
// For each byte: shift the running value left by 8 bits and XOR that with
// (running value + byte).  The byte is read through an unsigned char
// pointer, so bytes above 0x7f are not sign-extended.
93 for(i
= 0; i
< len
; i
++, ptr
++) {
94 cksum
= (cksum
<< 8) ^ (cksum
+ *(unsigned char *)ptr
);
// Read state for the journal, which is treated as a circular buffer.
// journalRead reads through jfd at `current` and resets `current` to
// `base` when it reaches base+size.
// NOTE(review): the closing line of this typedef is not visible in this
// excerpt; the type is presumably named JournalIOInfo_t -- confirm.
100 typedef struct JournalIOInfo
{
101 int jfd
; // File descriptor for journal buffer
102 int wrapCount
; // Incremented when it wraps around.
103 size_t bSize
; // Block size. I/O needs to be done in that amount.
104 uint64_t base
; // Base offset of journal buffer, past the header
105 uint64_t size
; // Size of the journal, minus the header size
106 uint64_t end
; // End of the journal (initially the "end" field from the journal header)
107 uint64_t current
; // Current offset; starts at "start"
111 * Attempt to read <length> bytes from the journal buffer.
112 * Since this is a wrapped buffer, it may have to start at the
113 * beginning. info->{base, size, end} are read-only; info->current
114 * is updated with the current offset. It returns the number of bytes
115 * it read, or -1 on error.
// Copy up to `length` bytes from the circular journal buffer into `buffer`,
// starting at info->current and using pread on info->jfd.
//
// info   -- journal read state; info->current is advanced/wrapped here
// buffer -- destination for the bytes read
// length -- number of bytes wanted
//
// NOTE(review): the return type, several local declarations (nread, end,
// amt, n) and the return/error statements are not visible in this excerpt.
118 journalRead(JournalIOInfo_t
*info
, uint8_t *buffer
, size_t length
)
// ptr is the write cursor into the caller-supplied buffer.
121 uint8_t *ptr
= buffer
;
123 // fprintf(stderr, "%s(%p, %p, %zu)\n", __FUNCTION__, info, buffer, length);
// Report when the journal buffer has been wrapped more than once.
124 if (info
->wrapCount
> 1) {
125 fplog(stderr
, "%s(%p, %p, %zu): journal buffer wrap count = %d\n", __FUNCTION__
, info
, buffer
, length
, info
->wrapCount
);
// Keep reading until the requested number of bytes has been gathered.
128 while (nread
< length
) {
// When the logical end lies behind the current offset, the readable
// region wraps; this pass is then limited by the physical end of the
// buffer rather than by info->end.
133 if (info
->end
< info
->current
) {
134 // It wraps, so we max out at base+size
135 end
= info
->base
+ info
->size
;
// Read no more than what is still wanted and no more than what lies
// between the current offset and the limit chosen for this pass.
139 amt
= MIN((length
- nread
), (end
- info
->current
));
142 fplog(stderr
, "Journal read amount is 0, is that right?\n");
// Positional read at the current journal offset; the file position of
// jfd is not used.
147 n
= pread(info
->jfd
, ptr
, amt
, info
->current
);
149 warn("pread(%d, %p, %zu, %"PRIu64
")", info
->jfd
, ptr
, amt
, info
->current
);
154 fplog(stderr
, "%s(%d): Wanted to read %zu, but only read %zd\n", __FUNCTION__
, __LINE__
, amt
, n
);
// Reaching base+size means the physical end of the buffer was hit:
// continue from the base of the buffer.
160 if (info
->current
== (info
->base
+ info
->size
)) {
161 info
->current
= info
->base
;
170 * Read a transaction from the journal buffer.
171 * A transaction is a list of block_list_headers, and their
172 * associated data. It needs to read all of the block_lists in
173 * a transaction, or it fails. It returns NULL if there are
174 * no transactions, and on error. (Maybe that should change?)
// Read one transaction from the journal, starting at jinfo->current.
//
// Steps visible in this excerpt:
//   1. Read one block of jinfo->bSize bytes into a stack buffer and view
//      it as a block_list_header.
//   2. Log when num_blocks is 0 (nothing to replay).
//   3. Compare the stored checksum with calc_checksum over the header,
//      then put the stored checksum back into the header.
//   4. Log when bytes_used is smaller than one block.
//   5. malloc bytes_used bytes, zero them, copy the first block in, and
//      read the remaining (bytes_used - block size) bytes directly after it.
//
// jinfo -- journal read state, advanced by journalRead
// swap  -- byte-order callbacks; header fields go through swap->swap32
//
// NOTE(review): the result buffer comes from malloc, so presumably the
// caller frees it -- confirm.  The return statements and error paths are
// not visible in this excerpt.
176 static block_list_header
*
177 getJournalTransaction(JournalIOInfo_t
*jinfo
, swapper_t
*swap
)
179 block_list_header
*retval
= NULL
;
// Variable-length stack buffer holding exactly one journal block.
180 uint8_t block
[jinfo
->bSize
];
// hdr views the start of that buffer as a block_list_header.
181 block_list_header
*hdr
= (void*)&block
;
185 memset(block
, 0, sizeof(block
));
// Read the first block of the transaction; it carries the header.
186 nread
= journalRead(jinfo
, block
, sizeof(block
));
188 (size_t)nread
!= sizeof(block
)) {
190 plog("%s: wanted %zd, got %zd\n", __FUNCTION__
, sizeof(block
), nread
);
// num_blocks is stored in journal byte order, hence the swap32 call.
193 if (swap
->swap32(hdr
->num_blocks
) == 0) {
195 * Either there really are no blocks, or this is not a valid
196 * transaction. Either way, there's nothing for us to do here.
199 fplog(stderr
, "%s(%d): hdr->num_blocks == 0\n", __FUNCTION__
, __LINE__
);
203 * Now we check the checksum to see if this is a valid header.
204 * Note that we verify the checksum before reading any more -- if
205 * it's not a valid header, we don't want to read more than a block
208 uint32_t tmpChecksum
= swap
->swap32(hdr
->checksum
);
209 uint32_t compChecksum
;
211 compChecksum
= calc_checksum((void*)hdr
, sizeof(*hdr
));
212 hdr
->checksum
= swap
->swap32(tmpChecksum
);
214 if (compChecksum
!= tmpChecksum
) {
216 fplog(stderr
, "%s(%d): hdr has bad checksum, returning NULL\n", __FUNCTION__
, __LINE__
);
220 if (swap
->swap32(hdr
->bytes_used
) < sizeof(block
)) {
222 fplog(stderr
, "%s(%d): hdr has bytes_used (%u) less than sizeof block (%zd)\n",
223 __FUNCTION__
, __LINE__
, swap
->swap32(hdr
->bytes_used
), sizeof(block
));
228 retval
= malloc(swap
->swap32(hdr
->bytes_used
));
232 memset(retval
, 0, swap
->swap32(hdr
->bytes_used
));
233 memcpy(retval
, block
, sizeof(block
));
234 amt
= swap
->swap32(hdr
->bytes_used
) - sizeof(block
);
235 nread
= journalRead(jinfo
, ((uint8_t*)retval
) + sizeof(block
), amt
);
245 * Replay a transaction.
246 * Transactions have a blockListSize amount of block_list_header, and
247 * are then followed by data. We read it in, verify the checksum, and
248 * if it's good, we call the block that was passed in to do something
249 * with it. Maybe write it out. Maybe laugh about it.
251 * It returns -1 if there was an error before it wrote anything out,
252 * and -2 if there was an error after it wrote something out.
255 * txn -- a block_list_header pointer, which has the description and data
257 * blSize -- the size of the block_list for this journal. (The data
258 * are after the block_list, but part of the same buffer.)
259 * blkSize -- The block size used to convert block numbers to offsets. This
260 * is defined to be the size of the journal header.
261 * swap -- A pointer to a swapper_t used to swap journal data structure elements.
262 * writer -- A block-of-code that does writing.
264 * "writer" should return -1 to stop the replay (this propagates an error up).
// Walk the block entries of one transaction and hand each data block to
// the writer callback.
//
// txn     -- transaction buffer: block list followed by the block data
// blSize  -- size in bytes of the block list at the start of txn
// blkSize -- multiplier that turns a block number into a byte offset
// swap    -- byte-order callbacks applied to every field read from txn
// writer  -- callback invoked once per block; a -1 result is checked for
//
// NOTE(review): the return type, the declaration of i, and the
// return/continue statements inside the checks are not visible in this
// excerpt.
267 replayTransaction(block_list_header
*txn
, size_t blSize
, size_t blkSize
, swapper_t
*swap
, journal_write_block_t writer
)
// endPtr is one past the last used byte of the transaction buffer.
270 uint8_t *endPtr
= ((uint8_t*)txn
) + swap
->swap32(txn
->bytes_used
);
// dataPtr starts at the first data byte, right after the block list.
271 uint8_t *dataPtr
= ((uint8_t*)txn
) + blSize
;
// Entries are visited from index 1; binfo[0] is not replayed here
// (journal_open reads binfo[0].next as the transaction sequence number).
273 for (i
= 1; i
< swap
->swap32(txn
->num_blocks
); i
++) {
275 plog("\tBlock %d: blkNum %llu, size %u, data offset = %zd\n", i
, swap
->swap64(txn
->binfo
[i
].bnum
), swap
->swap32(txn
->binfo
[i
].bsize
), dataPtr
- (uint8_t*)txn
);
278 * Check with security types on these checks. Need to ensure
279 * that the fields don't take us off into the dark scary woods.
280 * It's mostly the second one that I am unsure about.
// Check 1: the data cursor must not already be past the end.
282 if (dataPtr
> endPtr
) {
284 plog("\tData out of range for block_list_header\n");
// Check 2: the bytes remaining must cover the size of this block.
287 if ((endPtr
- dataPtr
) < swap
->swap32(txn
->binfo
[i
].bsize
)) {
289 plog("\tData size for block %d out of range for block_list_header\n", i
);
// Check 3: the end of this block must not lie past the end of the buffer.
292 if ((dataPtr
+ swap
->swap32(txn
->binfo
[i
].bsize
)) > endPtr
) {
294 plog("\tData end out of range for block_list_header\n");
297 // Just for debugging
// Block number 2 is viewed as an HFSPlusVolumeHeader so that its
// signature and version can be logged.
299 if (swap
->swap64(txn
->binfo
[i
].bnum
) == 2) {
300 HFSPlusVolumeHeader
*vp
= (void*)dataPtr
;
301 plog("vp->signature = %#x, version = %#x\n", vp
->signature
, vp
->version
);
304 // It's in the spec, and I saw it come up once on a live volume.
// A block number with all 64 bits set marks an entry to be skipped.
305 if (swap
->swap64(txn
->binfo
[i
].bnum
) == ~(uint64_t)0) {
307 plog("\tSkipping this block due to magic skip number\n");
309 // Should we set retval to -2 here?
// Hand the block to the writer: byte offset is bnum * blkSize, followed
// by the data pointer and the block size.
311 if ((writer
)(swap
->swap64(txn
->binfo
[i
].bnum
) * blkSize
, dataPtr
, swap
->swap32(txn
->binfo
[i
].bsize
)) == -1)
// Advance the data cursor past the block just handled.
315 dataPtr
+= swap
->swap32(txn
->binfo
[i
].bsize
);
322 * Read a journal header in from the journal device.
// Read one block from the journal device and copy the journal header
// that sits at its start into *jhp.
//
// jfd       -- file descriptor of the journal device
// offset    -- byte offset at which the block is read
// blockSize -- number of bytes to read
// jhp       -- destination for the journal header
//
// journal_open treats a nonzero result as failure.
// NOTE(review): the return type, the declaration of nread and the return
// statements are not visible in this excerpt.
325 loadJournalHeader(int jfd
, off_t offset
, size_t blockSize
, journal_header
*jhp
)
// Variable-length stack buffer of exactly one block.
327 uint8_t buffer
[blockSize
];
// Positional read of the whole block at the given offset.
330 nread
= pread(jfd
, buffer
, sizeof(buffer
), offset
);
// A read that does not deliver the full block is reported with warn.
332 (size_t)nread
!= sizeof(buffer
)) {
333 warn("tried to read %zu for journal header buffer, got %zd", sizeof(buffer
), nread
);
// Copy the leading sizeof(journal_header) bytes of the block to the caller.
// NOTE(review): this reads past the buffer if blockSize is smaller than
// sizeof(journal_header); no such check is visible here -- confirm that
// callers guarantee it.
336 *jhp
= *(journal_header
*)buffer
;
341 * Replay a journal (called "journal_open" because you have
342 * to replay it as part of opening it). At this point, all it
343 * is useful for is replaying the journal.
346 * jfd -- file descriptor for the journal device
347 * offset -- offset (in bytes) of the journal on the journal device
348 * journal_size -- size of the journal (in bytes)
349 * min_fs_blksize -- Blocksize of the data filesystem
350 * flags -- unused for now
351 * jdev_name -- string name for the journal device. used for logging.
352 * do_write_b -- a block which does the actual writing.
354 * Currently, for fsck_hfs, the do_write_b block writes to the cache. It could also
355 * just print out the block numbers, or just check their integrity, as much as is
358 * The function works by loading the journal header. From there, it then starts
359 * loading transactions, via block_list_header groups. When it gets to the end
360 * of the journal, it tries continuing, in case there were transactions that
361 * didn't get updated in the header (this apparently happens).
363 * It returns 0 on success, and -1 on error. Note that there's not a lot
364 * fsck_hfs can probably do in the event of error.
368 journal_open(int jfd
,
369 off_t offset
, // Offset of journal
370 off_t journal_size
, // Size, in bytes, of the entire journal
371 size_t min_fs_blksize
, // Blocksize of the data filesystem, journal blocksize must be at least this size
372 uint32_t flags __unused
, // Not used in this implementation
373 const char *jdev_name
, // The name of the journal device, for logging
374 int (__lambda_ do_write_b
)(off_t
, void*, size_t))
376 journal_header jhdr
= { 0 };
377 swapper_t
*jnlSwap
; // Used to swap fields of the journal
378 uint32_t tempCksum
; // Temporary checksum value
379 uint32_t jBlkSize
= 0;
381 if (ioctl(jfd
, DKIOCGETBLOCKSIZE
, &jBlkSize
) == -1) {
382 jBlkSize
= min_fs_blksize
;
384 if (jBlkSize
< min_fs_blksize
) {
385 fplog(stderr
, "%s: journal block size %u < min block size %zu for %s\n", __FUNCTION__
, jBlkSize
, min_fs_blksize
, jdev_name
);
388 if ((jBlkSize
% min_fs_blksize
) != 0) {
389 fplog(stderr
, "%s: journal block size %u is not a multiple of fs block size %zu for %s\n", __FUNCTION__
, jBlkSize
, min_fs_blksize
, jdev_name
);
393 if (loadJournalHeader(jfd
, offset
, jBlkSize
, &jhdr
) != 0) {
394 fplog(stderr
, "%s: unable to load journal header from %s\n", __FUNCTION__
, jdev_name
);
399 * Unlike the rest of the filesystem, the journal can be in native or
400 * non-native byte order. Barring moving a filesystem from one host
401 * to another, it'll almost always be in native byte order.
403 if (jhdr
.endian
== ENDIAN_MAGIC
) {
404 jnlSwap
= &nativeEndian
;
405 } else if (OSSwapInt32(jhdr
.endian
) == ENDIAN_MAGIC
) {
406 jnlSwap
= &swappedEndian
;
408 fplog(stderr
, "%s: Unknown journal endian magic number %#x from %s\n", __FUNCTION__
, jhdr
.endian
, jdev_name
);
412 * Two different magic numbers are valid.
413 * Do they mean different things, though?
415 if (jnlSwap
->swap32(jhdr
.magic
) != JOURNAL_HEADER_MAGIC
&&
416 jnlSwap
->swap32(jhdr
.magic
) != OLD_JOURNAL_HEADER_MAGIC
) {
417 fplog(stderr
, "%s: Unknown journal header magic number %#x from %s\n", __FUNCTION__
, jhdr
.magic
, jdev_name
);
422 * Checksums have to be done with the checksum field set to 0.
423 * So we have to stash it aside for a bit, and set the field to
424 * 0, before we can compare. Afterwards, if it compares correctly,
425 * we put the original (swapped, if necessary) value back, just
428 tempCksum
= jnlSwap
->swap32(jhdr
.checksum
);
430 if (jnlSwap
->swap32(jhdr
.magic
) == JOURNAL_HEADER_MAGIC
&&
431 (calc_checksum((void*)&jhdr
, JOURNAL_HEADER_CKSUM_SIZE
) != tempCksum
)) {
432 fplog(stderr
, "%s: Invalid journal checksum from %s\n", __FUNCTION__
, jdev_name
);
435 jhdr
.checksum
= jnlSwap
->swap32(tempCksum
);
438 * Set up information about the journal which we use to do the I/O.
439 * The journal is a circular buffer. However, the start of the journal
440 * buffer is past the journal header. See the JournalIOInfo structure above.
442 off_t startOffset
= jnlSwap
->swap64(jhdr
.start
);
443 off_t endOffset
=jnlSwap
->swap64(jhdr
.end
);
444 off_t journalStart
= offset
+ jnlSwap
->swap32(jhdr
.jhdr_size
);
447 * The journal code was updated to be able to read past the "end" of the journal,
448 * to see if there were any valid transactions there. If we are peeking past the
449 * end, we don't care if we have checksum errors -- that just means they're not
450 * valid transactions.
453 int into_the_weeds
= 0;
454 uint32_t last_sequence_number
= 0;
456 JournalIOInfo_t jinfo
= { 0 };
459 plog("Journal start sequence number = %u\n", jnlSwap
->swap32(jhdr
.sequence_num
));
462 * Now set up the JournalIOInfo object with the file descriptor,
463 * the block size, start and end of the journal buffer, and where
464 * the journal pointer currently is.
467 jinfo
.bSize
= jnlSwap
->swap32(jhdr
.jhdr_size
);
468 jinfo
.base
= journalStart
;
469 jinfo
.size
= journal_size
- jinfo
.bSize
;
470 jinfo
.end
= offset
+ endOffset
;
471 jinfo
.current
= offset
+ startOffset
;
473 const char *state
= "";
475 block_list_header
*txn
= NULL
;
478 * Loop while getting transactions. We exit when we hit a checksum
479 * error, or when the sequence number for a transaction doesn't match
480 * what we expect it to. (That's the trickiest part -- the into_the_weeds
481 * portion of the code. It doesn't match the TN1150 documentation, so
482 * I've had to go by both my experience with real-world journals and by
483 * looking at the kernel code.)
488 if (jinfo
.current
== jinfo
.end
&& into_the_weeds
== 0) {
490 * This is a bit weird, but it works: if current == end, but into_the_weeds is 1,
491 * then this code will not execute. If it does execute, it'll go to get a transaction.
492 * That will put the pointer past end.
494 if (jhdr
.sequence_num
== 0) {
497 * I am not sure about this; this behaviour is not in TN1150 at all,
498 * but I _think_ this is what the kernel is doing.
500 plog("Journal sequence number is 0, is going into the end okay?\n");
504 plog("Attempting to read past stated end of journal\n");
505 state
= "tentative ";
506 jinfo
.end
= (jinfo
.base
+ startOffset
- jinfo
.bSize
);
510 plog("Before getting %stransaction: jinfo.current = %llu\n", state
, jinfo
.current
);
512 * Note that getJournalTransaction verifies the checksum on the block_list_header, so
513 * if it's bad, it'll return NULL.
515 txn
= getJournalTransaction(&jinfo
, jnlSwap
);
518 plog("txn is NULL, jinfo.current = %llu\n", jinfo
.current
);
519 if (into_the_weeds
) {
521 plog("\tBut we do not care, since it is past the end of the journal\n");
528 plog("After getting %stransaction: jinfo.current = %llu\n", state
, jinfo
.current
);
529 plog("%stxn = { %u max_blocks, %u num_blocks, %u bytes_used, binfo[0].next = %u }\n", state
, jnlSwap
->swap32(txn
->max_blocks
), jnlSwap
->swap32(txn
->num_blocks
), jnlSwap
->swap32(txn
->bytes_used
), jnlSwap
->swap32(txn
->binfo
[0].next
));
531 if (into_the_weeds
) {
533 * This seems to be what the kernel was checking: if the
534 * last_sequence_number was set, and the txn sequence number
535 * is set, and the txn sequence number doesn't match either
536 * last_sequence_number _or_ an incremented version of it, then
537 * the transaction isn't worth looking at, and we've reached
538 * the end of the journal.
540 if (last_sequence_number
!= 0 &&
541 txn
->binfo
[0].next
!= 0 &&
542 jnlSwap
->swap32(txn
->binfo
[0].next
) != last_sequence_number
&&
543 jnlSwap
->swap32(txn
->binfo
[0].next
) != (last_sequence_number
+ 1)) {
544 // Probably not a valid transaction
546 plog("\tTentative txn sequence %u is not expected %u, stopping journal replay\n", jnlSwap
->swap32(txn
->binfo
[0].next
), last_sequence_number
+ 1);
551 * If we've got a valid transaction, then we replay it.
552 * If there was an error, we're done with the journal replay.
553 * (If the error occurred after the "end," then we don't care,
554 * and it's not a bad journal.)
556 rv
= replayTransaction(txn
,
557 jnlSwap
->swap32(jhdr
.blhdr_size
),
558 jnlSwap
->swap32(jhdr
.jhdr_size
),
564 plog("\tTransaction replay failed, returned %d\n", rv
);
565 if (into_the_weeds
) {
567 plog("\t\tAnd we don't care\n");
573 last_sequence_number
= jnlSwap
->swap32(txn
->binfo
[0].next
);
581 plog("Journal was bad, stopped replaying\n");