2  * Copyright (c) 2010-2012 Apple Inc. All rights reserved. 
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ 
   6  * This file contains Original Code and/or Modifications of Original Code 
   7  * as defined in and that are subject to the Apple Public Source License 
   8  * Version 2.0 (the 'License'). You may not use this file except in 
   9  * compliance with the License. The rights granted to you under the License 
  10  * may not be used to create, or enable the creation or redistribution of, 
  11  * unlawful or unlicensed copies of an Apple operating system, or to 
  12  * circumvent, violate, or enable the circumvention or violation of, any 
  13  * terms of an Apple operating system software license agreement. 
  15  * Please obtain a copy of the License at 
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file. 
  18  * The Original Code and all software distributed under the License are 
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER 
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, 
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, 
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
  23  * Please see the License for the specific language governing rights and 
  24  * limitations under the License. 
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ 
  39 #include <sys/types.h> 
  40 #include <sys/param.h> 
  42 #include <sys/ioctl.h> 
  44 #include <sys/param.h> 
  46 #include "../fsck_hfs.h" 
  47 #include "fsck_journal.h" 
  51 #include <hfs/hfs_format.h> 
  52 #include <libkern/OSByteOrder.h> 
  54 typedef struct SwapType 
{ 
  56         uint16_t (*swap16
)(uint16_t); 
  57         uint32_t (*swap32
)(uint32_t); 
  58         uint64_t (*swap64
)(uint64_t); 
  /* Identity conversion for 16-bit values: returns x unchanged. */
  uint16_t ident16(uint16_t x) { return x; }
  /* Identity conversion for 32-bit values: returns x unchanged. */
  uint32_t ident32(uint32_t x) { return x; }
  /* Identity conversion for 64-bit values: returns x unchanged. */
  uint64_t ident64(uint64_t x) { return x; }
  65 static swapper_t nativeEndian 
= { 
  72 static swapper_t swappedEndian 
= { 
  79 typedef int (__lambda_ journal_write_block_t
)(off_t
, void *, size_t); 
  82 // this isn't a great checksum routine but it will do for now. 
  83 // we use it to checksum the journal header and the block list 
  84 // headers that are at the start of each transaction. 
  87 calc_checksum(char *ptr
, int len
) 
  92         // this is a lame checksum but for now it'll do 
  93         for(i 
= 0; i 
< len
; i
++, ptr
++) { 
  94                 cksum 
= (cksum 
<< 8) ^ (cksum 
+ *(unsigned char *)ptr
); 
 /*
  * Bookkeeping for reads from the journal buffer.  journalRead() treats
  * base, size and end as read-only and advances current as it reads.
  */
 typedef struct JournalIOInfo {
         int             jfd;            // File descriptor for journal buffer
         int             wrapCount;      // Incremented when it wraps around.
         size_t          bSize;          // Block size.  I/O needs to be done in that amount.
         uint64_t        base;           // Base offset of journal buffer, past the header
         uint64_t        size;           // Size of the journal, minus the header size
         uint64_t        end;            // End of the journal (initially the "end" field from the journal header)
         uint64_t        current;        // Current offset; starts at "start"
 } JournalIOInfo_t;
 111  * Attempt to read <length> bytes from the journal buffer. 
 112  * Since this is a wrapped buffer, it may have to start at the 
 113  * beginning.  info->{base, size, end} are read-only; info->current 
 114  * is updated with the current offset.  It returns the number of bytes 
 115  * it read, or -1 on error. 
 118 journalRead(JournalIOInfo_t 
*info
, uint8_t *buffer
, size_t length
) 
 121         uint8_t *ptr 
= buffer
; 
 123 //      fprintf(stderr, "%s(%p, %p, %zu)\n", __FUNCTION__, info, buffer, length); 
 124         if (info
->wrapCount 
> 1) { 
 125                 fplog(stderr
, "%s(%p, %p, %zu):  journal buffer wrap count = %d\n", __FUNCTION__
, info
, buffer
, length
, info
->wrapCount
); 
 128         while (nread 
< length
) { 
 133                 if (info
->end 
< info
->current
) { 
 134                         // It wraps, so we max out at base+size 
 135                         end 
= info
->base 
+ info
->size
; 
 139                 amt 
= MIN((length 
- nread
), (end 
- info
->current
)); 
 142                                 fplog(stderr
, "Journal read amount is 0, is that right?\n"); 
 147                 n 
= pread(info
->jfd
, ptr
, amt
, info
->current
); 
 149                         warn("pread(%d, %p, %zu, %"PRIu64
")", info
->jfd
, ptr
, amt
, info
->current
); 
 154                                 fplog(stderr
, "%s(%d):  Wanted to read %zu, but only read %zd\n", __FUNCTION__
, __LINE__
, amt
, n
); 
 160                 if (info
->current 
== (info
->base 
+ info
->size
)) { 
 161                         info
->current 
= info
->base
; 
 170  * Read a transaction from the journal buffer. 
 171  * A transaction is a list of block_list_headers, and their 
 172  * associated data.  It needs to read all of the block_lists in 
 173  * a transaction, or it fails.  It returns NULL if there are 
 174  * no transactions, and on error.  (Maybe that should change?) 
 176 static block_list_header 
* 
 177 getJournalTransaction(JournalIOInfo_t 
*jinfo
, swapper_t 
*swap
) 
 179         block_list_header 
*retval 
= NULL
; 
 180         uint8_t block
[jinfo
->bSize
]; 
 181         block_list_header 
*hdr 
= (void*)&block
; 
 185         memset(block
, 0, sizeof(block
)); 
 186         nread 
= journalRead(jinfo
, block
, sizeof(block
)); 
 188             (size_t)nread 
!= sizeof(block
)) { 
 190                         plog("%s:  wanted %zd, got %zd\n", __FUNCTION__
, sizeof(block
), nread
); 
 193         if (swap
->swap32(hdr
->num_blocks
) == 0) { 
 195                  * Either there really are no blocks, or this is not a valid 
 196                  * transaction.  Either way, there's nothing for us to do here. 
 199                         fplog(stderr
, "%s(%d):  hdr->num_blocks == 0\n", __FUNCTION__
, __LINE__
); 
 203          * Now we check the checksum to see if this is a valid header. 
 204          * Note that we verify the checksum before reading any more -- if 
 205          * it's not a valid header, we don't want to read more than a block 
 208         uint32_t tmpChecksum 
= swap
->swap32(hdr
->checksum
); 
 209         uint32_t compChecksum
; 
 211         compChecksum 
= calc_checksum((void*)hdr
, sizeof(*hdr
)); 
 212         hdr
->checksum 
= swap
->swap32(tmpChecksum
); 
 214         if (compChecksum 
!= tmpChecksum
) { 
 216                         fplog(stderr
, "%s(%d):  hdr has bad checksum, returning NULL\n", __FUNCTION__
, __LINE__
); 
 220         if (swap
->swap32(hdr
->bytes_used
) < sizeof(block
)) { 
 222                         fplog(stderr
, "%s(%d):  hdr has bytes_used (%u) less than sizeof block (%zd)\n", 
 223                               __FUNCTION__
, __LINE__
, swap
->swap32(hdr
->bytes_used
), sizeof(block
)); 
 228         retval 
= malloc(swap
->swap32(hdr
->bytes_used
)); 
 232         memset(retval
, 0, swap
->swap32(hdr
->bytes_used
)); 
 233         memcpy(retval
, block
, sizeof(block
)); 
 234         amt 
= swap
->swap32(hdr
->bytes_used
) - sizeof(block
); 
 235         nread 
= journalRead(jinfo
, ((uint8_t*)retval
) + sizeof(block
), amt
); 
 245  * Replay a transaction. 
 246  * Transactions have a blockListSize amount of block_list_header, and 
 247  * are then followed by data.  We read it in, verify the checksum, and 
 248  * if it's good, we call the block that was passed in to do something 
 249  * with it.  Maybe write it out.  Maybe laugh about it. 
 251  * It returns -1 if there was an error before it wrote anything out, 
 252  * and -2 if there was an error after it wrote something out. 
 255  * txn  -- a block_list_header pointer, which has the description and data 
 257  * blSize       -- the size of the block_list for this journal.  (The data 
 258  *              are after the block_list, but part of the same buffer.) 
 259  * blkSize      -- The block size used to convert block numbers to offsets.  This 
 260  *              is defined to be the size of the journal header. 
 261  * swap -- A pointer to a swapper_t used to swap journal data structure elements. 
 262  * writer       -- A block-of-code that does writing. 
 264  * "writer" should return -1 to stop the replay (this propagates an error up). 
 267 replayTransaction(block_list_header 
*txn
, size_t blSize
, size_t blkSize
, swapper_t 
*swap
, journal_write_block_t writer
) 
 270         uint8_t *endPtr 
= ((uint8_t*)txn
) + swap
->swap32(txn
->bytes_used
); 
 271         uint8_t *dataPtr 
= ((uint8_t*)txn
) + blSize
; 
 273         for (i 
= 1; i 
< swap
->swap32(txn
->num_blocks
); i
++) { 
 275                         plog("\tBlock %d:  blkNum %llu, size %u, data offset = %zd\n", i
, swap
->swap64(txn
->binfo
[i
].bnum
), swap
->swap32(txn
->binfo
[i
].bsize
), dataPtr 
- (uint8_t*)txn
); 
 278                  * Check with security types on these checks.  Need to ensure 
 279                  * that the fields don't take us off into the dark scary woods. 
 280                  * It's mostly the second one that I am unsure about. 
 282                 if (dataPtr 
> endPtr
) { 
 284                                 plog("\tData out of range for block_list_header\n"); 
 287                 if ((endPtr 
- dataPtr
) < swap
->swap32(txn
->binfo
[i
].bsize
)) { 
 289                                 plog("\tData size for block %d out of range for block_list_header\n", i
); 
 292                 if ((dataPtr 
+ swap
->swap32(txn
->binfo
[i
].bsize
)) > endPtr
) { 
 294                                 plog("\tData end out of range for block_list_header\n"); 
 297                 // Just for debugging 
 299                         if (swap
->swap64(txn
->binfo
[i
].bnum
) == 2) { 
 300                                 HFSPlusVolumeHeader 
*vp 
= (void*)dataPtr
; 
 301                                 plog("vp->signature = %#x, version = %#x\n", vp
->signature
, vp
->version
); 
 304                 // It's in the spec, and I saw it come up once on a live volume. 
 305                 if (swap
->swap64(txn
->binfo
[i
].bnum
) == ~(uint64_t)0) { 
 307                                 plog("\tSkipping this block due to magic skip number\n"); 
 309                         // Should we set retval to -2 here? 
 311                                 if ((writer
)(swap
->swap64(txn
->binfo
[i
].bnum
) * blkSize
, dataPtr
, swap
->swap32(txn
->binfo
[i
].bsize
)) == -1) 
 315                 dataPtr 
+= swap
->swap32(txn
->binfo
[i
].bsize
); 
 322  * Read a journal header in from the journal device. 
 325 loadJournalHeader(int jfd
, off_t offset
, size_t blockSize
, journal_header 
*jhp
) 
 327         uint8_t buffer
[blockSize
]; 
 330         nread 
= pread(jfd
, buffer
, sizeof(buffer
), offset
); 
 332             (size_t)nread 
!= sizeof(buffer
)) { 
 333                 warn("tried to read %zu for journal header buffer, got %zd", sizeof(buffer
), nread
); 
 336         *jhp 
= *(journal_header
*)buffer
; 
 341  * Replay a journal (called "journal_open" because you have 
 342  * to replay it as part of opening it).  At this point, all it 
 343  * is useful for is replaying the journal. 
 346  *     jfd      -- file descriptor for the journal device 
 347  *     offset   -- offset (in bytes) of the journal on the journal device 
 348  *     journal_size     -- size of the journal (in bytes) 
 349  *     min_fs_blksize   -- Blocksize of the data filesystem 
 350  *     flags    -- unused for now 
 351  *     jdev_name        -- string name for the journal device.  used for logging. 
 352  *     do_write_b       -- a block which does the actual writing. 
 354  * Currently, for fsck_hfs, the do_write_b block writes to the cache.  It could also 
 355  * just print out the block numbers, or just check their integrity, as much as is 
 358  * The function works by loading the journal header.  From there, it then starts 
 359  * loading transactions, via block_list_header groups.  When it gets to the end 
 360  * of the journal, it tries continuing, in case there were transactions that 
 361  * didn't get updated in the header (this apparently happens). 
 363  * It returns 0 on success, and -1 on error.  Note that there's not a lot 
 364  * fsck_hfs can probably do in the event of error. 
 368 journal_open(int jfd
, 
 369              off_t      offset
,         // Offset of journal 
 370              off_t      journal_size
,   // Size, in bytes, of the entire journal 
 371              size_t     min_fs_blksize
, // Blocksize of the data filesystem, journal blocksize must be at least this size 
 372              uint32_t   flags __unused
, // Not used in this implementation 
 373              const char *jdev_name
,     // The name of the journal device, for logging 
 374              int (__lambda_ do_write_b
)(off_t
, void*, size_t)) 
 376         journal_header jhdr 
= { 0 }; 
 377         swapper_t       
*jnlSwap
;       // Used to swap fields of the journal 
 378         uint32_t        tempCksum
;      // Temporary checksum value 
 379         uint32_t        jBlkSize 
= 0; 
 381         if (ioctl(jfd
, DKIOCGETBLOCKSIZE
, &jBlkSize
) == -1) { 
 382                 jBlkSize 
= min_fs_blksize
; 
 384                 if (jBlkSize 
< min_fs_blksize
) { 
 385                         fplog(stderr
, "%s:  journal block size %u < min block size %zu for %s\n", __FUNCTION__
, jBlkSize
, min_fs_blksize
, jdev_name
); 
 388                 if ((jBlkSize 
% min_fs_blksize
) != 0) { 
 389                         fplog(stderr
, "%s:  journal block size %u is not a multiple of fs block size %zu for %s\n", __FUNCTION__
, jBlkSize
, min_fs_blksize
, jdev_name
); 
 393         if (loadJournalHeader(jfd
, offset
, jBlkSize
, &jhdr
) != 0) { 
 394                 fplog(stderr
, "%s:  unable to load journal header from %s\n", __FUNCTION__
, jdev_name
); 
 399          * Unlike the rest of the filesystem, the journal can be in native or 
 400          * non-native byte order.  Barring moving a filesystem from one host 
 401          * to another, it'll almost always be in native byte order. 
 403         if (jhdr
.endian 
== ENDIAN_MAGIC
) { 
 404                 jnlSwap 
= &nativeEndian
; 
 405         } else if (OSSwapInt32(jhdr
.endian
) == ENDIAN_MAGIC
) { 
 406                 jnlSwap 
= &swappedEndian
; 
 408                 fplog(stderr
, "%s:  Unknown journal endian magic number %#x from %s\n", __FUNCTION__
, jhdr
.endian
, jdev_name
); 
 412          * Two different magic numbers are valid. 
 413          * Do they mean different things, though? 
 415         if (jnlSwap
->swap32(jhdr
.magic
) != JOURNAL_HEADER_MAGIC 
&& 
 416             jnlSwap
->swap32(jhdr
.magic
) != OLD_JOURNAL_HEADER_MAGIC
) { 
 417                 fplog(stderr
, "%s:  Unknown journal header magic number %#x from %s\n", __FUNCTION__
, jhdr
.magic
, jdev_name
); 
 422          * Checksums have to be done with the checksum field set to 0. 
 423          * So we have to stash it aside for a bit, and set the field to 
 424          * 0, before we can compare.  Afterwards, if it compares correctly, 
 425          * we put the original (swapped, if necessary) value back, just 
 428         tempCksum 
= jnlSwap
->swap32(jhdr
.checksum
); 
 430         if (jnlSwap
->swap32(jhdr
.magic
) == JOURNAL_HEADER_MAGIC 
&& 
 431             (calc_checksum((void*)&jhdr
, JOURNAL_HEADER_CKSUM_SIZE
) != tempCksum
)) { 
 432                 fplog(stderr
, "%s:  Invalid journal checksum from %s\n", __FUNCTION__
, jdev_name
); 
 435         jhdr
.checksum 
= jnlSwap
->swap32(tempCksum
); 
 438          * Set up information about the journal which we use to do the I/O. 
 439          * The journal is a circular buffer.  However, the start of the journal 
 440          * buffer is past the journal header.  See the JournalIOInfo structure above. 
 442         off_t startOffset 
= jnlSwap
->swap64(jhdr
.start
); 
 443         off_t endOffset 
=jnlSwap
->swap64(jhdr
.end
); 
 444         off_t journalStart 
= offset 
+ jnlSwap
->swap32(jhdr
.jhdr_size
); 
 447          * The journal code was updated to be able to read past the "end" of the journal, 
 448          * to see if there were any valid transactions there.  If we are peeking past the 
 449          * end, we don't care if we have checksum errors -- that just means they're not 
 450          * valid transactions. 
 453         int into_the_weeds 
= 0; 
 454         uint32_t last_sequence_number 
= 0; 
 456         JournalIOInfo_t jinfo 
= { 0 }; 
 459                 plog("Journal start sequence number = %u\n", jnlSwap
->swap32(jhdr
.sequence_num
)); 
 462          * Now set up the JournalIOInfo object with the file descriptor, 
 463          * the block size, start and end of the journal buffer, and where 
 464          * the journal pointer currently is. 
 467         jinfo
.bSize 
= jnlSwap
->swap32(jhdr
.jhdr_size
); 
 468         jinfo
.base 
= journalStart
; 
 469         jinfo
.size 
= journal_size 
- jinfo
.bSize
; 
 470         jinfo
.end 
= offset 
+ endOffset
; 
 471         jinfo
.current 
= offset 
+ startOffset
; 
 473         const char *state 
= ""; 
 475         block_list_header 
*txn 
= NULL
; 
 478          * Loop while getting transactions.  We exit when we hit a checksum 
 479          * error, or when the sequence number for a transaction doesn't match 
 480          * what we expect it to.  (That's the trickiest part -- the into_the_weeds 
 481          * portion of the code.  It doesn't match the TN11150 documentation, so 
 482          * I've had to go by both my experience with real-world journals and by 
 483          * looking at the kernel code.) 
 488                 if (jinfo
.current 
== jinfo
.end 
&& into_the_weeds 
== 0) { 
 490                          * This is a bit weird, but it works:  if current == end, but into_the_weeds is 1, 
 491                          * then this code will not execute.  If it does execute, it'll go to get a transaction. 
 492                          * That will put the pointer past end. 
 494                         if (jhdr
.sequence_num 
== 0) { 
 497                                  * I am not sure about this; this behaviour is not in TN1150 at all, 
 498                                  * but I _think_ this is what the kernel is doing. 
 500                                 plog("Journal sequence number is 0, is going into the end okay?\n"); 
 504                                 plog("Attempting to read past stated end of journal\n"); 
 505                         state 
= "tentative "; 
 506                         jinfo
.end 
= (jinfo
.base 
+ startOffset 
- jinfo
.bSize
); 
 510                         plog("Before getting %stransaction:  jinfo.current = %llu\n", state
, jinfo
.current
); 
 512                  * Note that getJournalTransaction verifies the checksum on the block_list_header, so 
 513                  * if it's bad, it'll return NULL. 
 515                 txn 
= getJournalTransaction(&jinfo
, jnlSwap
); 
 518                                 plog("txn is NULL, jinfo.current = %llu\n", jinfo
.current
); 
 519                         if (into_the_weeds
) { 
 521                                         plog("\tBut we do not care, since it is past the end of the journal\n"); 
 528                         plog("After getting %stransaction:  jinfo.current = %llu\n", state
, jinfo
.current
); 
 529                         plog("%stxn = { %u max_blocks, %u num_blocks, %u bytes_used, binfo[0].next = %u }\n", state
, jnlSwap
->swap32(txn
->max_blocks
), jnlSwap
->swap32(txn
->num_blocks
), jnlSwap
->swap32(txn
->bytes_used
), jnlSwap
->swap32(txn
->binfo
[0].next
)); 
 531                 if (into_the_weeds
) { 
 533                          * This seems to be what the kernel was checking:  if the 
 534                          * last_sequence_number was set, and the txn sequence number 
 535                          * is set, and the txn sequence number doesn't match either 
 536                          * last_sequence_number _or_ an incremented version of it, then 
 537                          * the transaction isn't worth looking at, and we've reached 
 538                          * the end of the journal. 
 540                         if (last_sequence_number 
!= 0 && 
 541                             txn
->binfo
[0].next 
!= 0 && 
 542                             jnlSwap
->swap32(txn
->binfo
[0].next
) != last_sequence_number 
&& 
 543                             jnlSwap
->swap32(txn
->binfo
[0].next
) != (last_sequence_number 
+ 1)) { 
 544                                 // Probably not a valid transaction 
 546                                         plog("\tTentative txn sequence %u is not expected %u, stopping journal replay\n", jnlSwap
->swap32(txn
->binfo
[0].next
), last_sequence_number 
+ 1); 
 551                  * If we've got a valid transaction, then we replay it. 
 552                  * If there was an error, we're done with the journal replay. 
 553                  * (If the error occurred after the "end," then we don't care, 
 554                  * and it's not a bad journal.) 
 556                 rv 
= replayTransaction(txn
, 
 557                                        jnlSwap
->swap32(jhdr
.blhdr_size
), 
 558                                        jnlSwap
->swap32(jhdr
.jhdr_size
), 
 564                                 plog("\tTransaction replay failed, returned %d\n", rv
); 
 565                         if (into_the_weeds
) { 
 567                                         plog("\t\tAnd we don't care\n"); 
 573                 last_sequence_number 
= jnlSwap
->swap32(txn
->binfo
[0].next
); 
 581                         plog("Journal was bad, stopped replaying\n");