]> git.saurik.com Git - apple/xnu.git/blobdiff - bsd/hfs/hfs_btreeio.c
xnu-1228.5.20.tar.gz
[apple/xnu.git] / bsd / hfs / hfs_btreeio.c
index 2bb3c7695efdc4d1a68abcb7c481ad66bf506453..0c81c879f594ccb162b71f1b1383646300762895 100644 (file)
 /*
- * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
  *
- * @APPLE_LICENSE_HEADER_START@
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
- * The contents of this file constitute Original Code as defined in and
- * are subject to the Apple Public Source License Version 1.1 (the
- * "License").  You may not use this file except in compliance with the
- * License.  Please obtain a copy of the License at
- * http://www.apple.com/publicsource and read it before using this file.
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
  * 
- * This Original Code and all software distributed under the License are
- * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ * 
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
- * License for the specific language governing rights and limitations
- * under the License.
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
  * 
- * @APPLE_LICENSE_HEADER_END@
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  */
-/*     @(#)hfs_btreeio.c
-*
-*      (c) 1998, 2000 Apple Computer, Inc.  All Rights Reserved
-*
-*      hfs_btreeio.c -- I/O Routines for the HFS B-tree files.
-*
-*      HISTORY
-*      15-Feb-2000     Don Brady       Added ClearBTNodes.
-*      16-Jul-1998     Don Brady               In ExtendBtreeFile force all b-tree nodes to be contiguous on disk.
-*       4-Jun-1998     Pat Dirks               Changed to do all B*-Tree writes synchronously (FORCESYNCBTREEWRITES = 1)
-*      18-apr-1998     Don Brady               Call brelse on bread failure.
-*      17-Apr-1998     Pat Dirks               Fixed ReleaseBTreeBlock to not call brelse when bwrite or bdwrite is called.
-*      13-apr-1998     Don Brady               Add ExtendBTreeFile routine (from BTreeWrapper.c).
-*      26-mar-1998     Don Brady               SetBTreeBlockSize was incorrectly excluding 512 byte blockSize.
-*      18-feb-1998     Don Brady               Initially created file.
-*
-*/
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/buf.h>
+#include <sys/buf_internal.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/vnode.h>
 
 
 #include "hfs.h"
+#include "hfs_cnode.h"
 #include "hfs_dbg.h"
 #include "hfs_endian.h"
+#include "hfs_btreeio.h"
 
 #include "hfscommon/headers/FileMgrInternal.h"
 #include "hfscommon/headers/BTreesPrivate.h"
 
 #define FORCESYNCBTREEWRITES 0
 
-static OSStatus FlushAlternate( ExtendedVCB *vcb );
+/* From bsd/vfs/vfs_bio.c */
+extern int bdwrite_internal(struct buf *, int);
 
 static int ClearBTNodes(struct vnode *vp, long blksize, off_t offset, off_t amount);
+static int btree_journal_modify_block_end(struct hfsmount *hfsmp, struct buf *bp);
 
 
-OSStatus SetBTreeBlockSize(FileReference vp, ByteCount blockSize, ItemCount minBlockCount)
+__private_extern__
+OSStatus SetBTreeBlockSize(FileReference vp, ByteCount blockSize, __unused ItemCount minBlockCount)
 {
        BTreeControlBlockPtr    bTreePtr;
        
        DBG_ASSERT(vp != NULL);
-       DBG_ASSERT(VTOFCB(vp) != NULL);
-       DBG_ASSERT(VTOFCB(vp)->fcbBTCBPtr != NULL);
        DBG_ASSERT(blockSize >= kMinNodeSize);
     if (blockSize > MAXBSIZE )
         return (fsBTBadNodeSize);
 
-    DBG_TREE(("SetBlockSizeProc: blockSize=%ld for file %ld\n", blockSize, H_FILEID(VTOH(vp))));
-
-       bTreePtr = (BTreeControlBlockPtr)(VTOH(vp)->fcbBTCBPtr);
+       bTreePtr = (BTreeControlBlockPtr)VTOF(vp)->fcbBTCBPtr;
        bTreePtr->nodeSize = blockSize;
        
     return (E_NONE);
 }
 
 
-OSStatus GetBTreeBlock(FileReference vp, UInt32 blockNum, GetBlockOptions options, BlockDescriptor *block)
+__private_extern__
+OSStatus GetBTreeBlock(FileReference vp, u_int32_t blockNum, GetBlockOptions options, BlockDescriptor *block)
 {
     OSStatus    retval = E_NONE;
     struct buf   *bp = NULL;
 
-    if (options & kGetEmptyBlock)
-        bp = getblk(vp, blockNum, block->blockSize, 0, 0, BLK_META);
-    else
-        retval = meta_bread(vp, blockNum, block->blockSize, NOCRED, &bp);
-
-    DBG_ASSERT(bp != NULL);
-    DBG_ASSERT(bp->b_data != NULL);
-    DBG_ASSERT(bp->b_bcount == block->blockSize);
-    DBG_ASSERT(bp->b_lblkno == blockNum);
+    if (options & kGetEmptyBlock) {
+        daddr64_t blkno;
+        off_t offset;
 
+        offset = (daddr64_t)blockNum * (daddr64_t)block->blockSize;
+        bp = buf_getblk(vp, (daddr64_t)blockNum, block->blockSize, 0, 0, BLK_META);
+        if (bp &&
+            VNOP_BLOCKMAP(vp, offset, block->blockSize, &blkno, NULL, NULL, 0, NULL) == 0) {
+            buf_setblkno(bp, blkno);
+        }
+    } else {
+        retval = buf_meta_bread(vp, (daddr64_t)blockNum, block->blockSize, NOCRED, &bp);
+    }
     if (bp == NULL)
         retval = -1;   //XXX need better error
 
     if (retval == E_NONE) {
         block->blockHeader = bp;
-        block->buffer = bp->b_data;
-        block->blockReadFromDisk = (bp->b_flags & B_CACHE) == 0;       /* not found in cache ==> came from disk */
+        block->buffer = (char *)buf_dataptr(bp);
+       block->blockNum = buf_lblkno(bp);
+        block->blockReadFromDisk = (buf_fromcache(bp) == 0);   /* not found in cache ==> came from disk */
+
+               // XXXdbg 
+               block->isModified = 0;
 
-#if BYTE_ORDER == LITTLE_ENDIAN
-        /* Endian swap B-Tree node (only if it's a valid block) */
+        /* Check and endian swap B-Tree node (only if it's a valid block) */
         if (!(options & kGetEmptyBlock)) {
             /* This happens when we first open the b-tree, we might not have all the node data on hand */
             if ((((BTNodeDescriptor *)block->buffer)->kind == kBTHeaderNode) &&
-                (((BTHeaderRec *)((char *)block->buffer + 14))->nodeSize != bp->b_bcount) &&
-                (SWAP_BE16 (((BTHeaderRec *)((char *)block->buffer + 14))->nodeSize) != bp->b_bcount)) {
-
-                /* Don't swap the descriptors at all, we don't care (this block will be invalidated) */
-                SWAP_BT_NODE (block, ISHFSPLUS(VTOVCB(vp)), H_FILEID(VTOH(vp)), 3);
-
-            /* The node needs swapping */
-            } else if (*((UInt16 *)((char *)block->buffer + (block->blockSize - sizeof (UInt16)))) == 0x0e00) {
-                SWAP_BT_NODE (block, ISHFSPLUS(VTOVCB(vp)), H_FILEID(VTOH(vp)), 0);
-#if 0
-            /* The node is not already in native byte order, hence corrupt */
-            } else if (*((UInt16 *)((char *)block->buffer + (block->blockSize - sizeof (UInt16)))) != 0x000e) {
-                panic ("%s Corrupt B-Tree node detected!\n", "GetBTreeBlock:");
-#endif
+                (((BTHeaderRec *)((char *)block->buffer + 14))->nodeSize != buf_count(bp)) &&
+                (SWAP_BE16 (((BTHeaderRec *)((char *)block->buffer + 14))->nodeSize) != buf_count(bp))) {
+
+                /*
+                 * Don't swap the node descriptor, record offsets, or other records.
+                 * This record will be invalidated and re-read with the correct node
+                 * size once the B-tree control block is set up with the node size
+                 * from the header record.
+                 */
+                retval = hfs_swap_BTNode (block, vp, kSwapBTNodeHeaderRecordOnly);
+
+                       } else if (block->blockReadFromDisk) {
+               /*
+                * The node was just read from disk, so always swap/check it.
+                * This is necessary on big endian since the test below won't trigger.
+                */
+                retval = hfs_swap_BTNode (block, vp, kSwapBTNodeBigToHost);
+            } else if (*((u_int16_t *)((char *)block->buffer + (block->blockSize - sizeof (u_int16_t)))) == 0x0e00) {
+                               /*
+                                * The node was left in the cache in non-native order, so swap it.
+                                * This only happens on little endian, after the node is written
+                                * back to disk.
+                                */
+                retval = hfs_swap_BTNode (block, vp, kSwapBTNodeBigToHost);
             }
+            
+               /*
+                * If we got an error, then the node is only partially swapped.
+                * We mark the buffer invalid so that the next attempt to get the
+                * node will read it and attempt to swap again, and will notice
+                * the error again.  If we didn't do this, the next attempt to get
+                * the node might use the partially swapped node as-is.
+                */
+            if (retval)
+                               buf_markinvalid(bp);
         }
-#endif
-    } else {
+    }
+    
+    if (retval) {
        if (bp)
-                       brelse(bp);
+                       buf_brelse(bp);
         block->blockHeader = NULL;
         block->buffer = NULL;
     }
@@ -134,51 +155,146 @@ OSStatus GetBTreeBlock(FileReference vp, UInt32 blockNum, GetBlockOptions option
 }
 
 
+__private_extern__
+void ModifyBlockStart(FileReference vp, BlockDescPtr blockPtr)
+{
+       struct hfsmount *hfsmp = VTOHFS(vp);
+    struct buf *bp = NULL;
+
+       if (hfsmp->jnl == NULL) {
+               return;
+       }
+       
+    bp = (struct buf *) blockPtr->blockHeader;
+    if (bp == NULL) {
+               panic("ModifyBlockStart: null bp  for blockdescptr %p?!?\n", blockPtr);
+               return;
+    }
+
+       journal_modify_block_start(hfsmp->jnl, bp);
+       blockPtr->isModified = 1;
+}
+
+static void
+btree_swap_node(struct buf *bp, __unused void *arg)
+{
+    // struct hfsmount *hfsmp = (struct hfsmount *)arg;
+       int retval;
+    struct vnode *vp = buf_vnode(bp);
+    BlockDescriptor block;
+                                   
+    /* Prepare the block pointer */
+    block.blockHeader = bp;
+    block.buffer = (char *)buf_dataptr(bp);
+    block.blockNum = buf_lblkno(bp);
+    /* not found in cache ==> came from disk */
+    block.blockReadFromDisk = (buf_fromcache(bp) == 0);
+    block.blockSize = buf_count(bp);
+
+    // swap the data now that this node is ready to go to disk
+    retval = hfs_swap_BTNode (&block, vp, kSwapBTNodeHostToBig);
+    if (retval)
+       panic("btree_swap_node: about to write corrupt node!\n");
+}
+
+
+static int
+btree_journal_modify_block_end(struct hfsmount *hfsmp, struct buf *bp)
+{
+    return journal_modify_block_end(hfsmp->jnl, bp, btree_swap_node, hfsmp);
+}
+
+
+__private_extern__
 OSStatus ReleaseBTreeBlock(FileReference vp, BlockDescPtr blockPtr, ReleaseBlockOptions options)
 {
+    struct hfsmount    *hfsmp = VTOHFS(vp);
     OSStatus   retval = E_NONE;
     struct buf *bp = NULL;
 
     bp = (struct buf *) blockPtr->blockHeader;
 
     if (bp == NULL) {
-        DBG_TREE(("ReleaseBlockProc: blockHeader is zero!\n"));
         retval = -1;
         goto exit;
     }
 
     if (options & kTrashBlock) {
-        bp->b_flags |= B_INVAL;
-       brelse(bp);     /* note: B-tree code will clear blockPtr->blockHeader and blockPtr->buffer */
+                buf_markinvalid(bp);
+
+               if (hfsmp->jnl && (buf_flags(bp) & B_LOCKED)) {
+                       journal_kill_block(hfsmp->jnl, bp);
+               } else {
+                       buf_brelse(bp); /* note: B-tree code will clear blockPtr->blockHeader and blockPtr->buffer */
+               }
     } else {
         if (options & kForceWriteBlock) {
-            bp->b_flags |= B_DIRTY;
-            retval = VOP_BWRITE(bp);
+                       if (hfsmp->jnl) {
+                               if (blockPtr->isModified == 0) {
+                                       panic("hfs: releaseblock: modified is 0 but forcewrite set! bp %p\n", bp);
+                               }
+
+                               retval = btree_journal_modify_block_end(hfsmp, bp);
+                               blockPtr->isModified = 0;
+                       } else {
+                               retval = VNOP_BWRITE(bp);
+                       }
         } else if (options & kMarkBlockDirty) {
-            bp->b_flags |= B_DIRTY;
-#if FORCESYNCBTREEWRITES
-            VOP_BWRITE(bp);
-#else
-                       if (options & kLockTransaction) {
-                   /*
-                    *
-                    * Set the B_LOCKED flag and unlock the buffer, causing brelse to move
-                    * the buffer onto the LOCKED free list.  This is necessary, otherwise
-                    * getnewbuf() would try to reclaim the buffers using bawrite, which
-                    * isn't going to work.
-                    *
-                    */
-                               extern int count_lock_queue __P((void));
-                               /* Don't hog all the buffers... */
-                               if (count_lock_queue() > kMaxLockedMetaBuffers)
-                                       hfs_fsync_transaction(vp);
-                   bp->b_flags |= B_LOCKED;
-               };
-            bdwrite(bp);
+                       struct timeval tv;
+                       microuptime(&tv);
+            if ((options & kLockTransaction) && hfsmp->jnl == NULL) {
+                /*
+                 *
+                 * Set the B_LOCKED flag and unlock the buffer, causing buf_brelse to move
+                 * the buffer onto the LOCKED free list.  This is necessary, otherwise
+                 * getnewbuf() would try to reclaim the buffers using buf_bawrite, which
+                 * isn't going to work.
+                 *
+                 */
+                /* Don't hog all the buffers... */
+                if (count_lock_queue() > kMaxLockedMetaBuffers) {
+                     hfs_btsync(vp, HFS_SYNCTRANS);
+                     /* Rollback sync time to cause a sync on lock release... */
+                     (void) BTSetLastSync(VTOF(vp), tv.tv_sec - (kMaxSecsForFsync + 1));
+                }
+               buf_setflags(bp, B_LOCKED);
+            }
 
-#endif
+            /* 
+             * Delay-write this block.
+             * If the maximum delayed buffers has been exceeded then
+             * free up some buffers and fall back to an asynchronous write.
+             */
+                       if (hfsmp->jnl) {
+                               if (blockPtr->isModified == 0) {
+                                       panic("hfs: releaseblock: modified is 0 but markdirty set! bp %p\n", bp);
+                               }
+                               retval = btree_journal_modify_block_end(hfsmp, bp);
+                               blockPtr->isModified = 0;
+                       } else if (bdwrite_internal(bp, 1) != 0) {
+                hfs_btsync(vp, 0);
+                /* Rollback sync time to cause a sync on lock release... */
+                (void) BTSetLastSync(VTOF(vp), tv.tv_sec - (kMaxSecsForFsync + 1));
+
+                buf_clearflags(bp, B_LOCKED);
+                buf_bawrite(bp);
+            }
         } else {
-               brelse(bp);     /* note: B-tree code will clear blockPtr->blockHeader and blockPtr->buffer */
+                       // check if we had previously called journal_modify_block_start() 
+                       // on this block and if so, abort it (which will call buf_brelse()).
+                       if (hfsmp->jnl && blockPtr->isModified) {
+                               // XXXdbg - I don't want to call modify_block_abort()
+                               //          because I think it may be screwing up the
+                               //          journal and blowing away a block that has
+                               //          valid data in it.
+                               //   
+                               //    journal_modify_block_abort(hfsmp->jnl, bp);
+                               //panic("hfs: releaseblock called for 0x%x but mod_block_start previously called.\n", bp);
+                               btree_journal_modify_block_end(hfsmp, bp);
+                               blockPtr->isModified = 0;
+                       } else {
+                               buf_brelse(bp); /* note: B-tree code will clear blockPtr->blockHeader and blockPtr->buffer */
+                       }
         };
     };
 
@@ -187,146 +303,199 @@ exit:
 }
 
 
+__private_extern__
 OSStatus ExtendBTreeFile(FileReference vp, FSSize minEOF, FSSize maxEOF)
 {
 #pragma unused (maxEOF)
 
-       OSStatus        retval;
-       UInt64          actualBytesAdded;
-       UInt64          bytesToAdd;
-    UInt32             extendFlags;
-       BTreeInfoRec btInfo;
+       OSStatus        retval = 0, ret = 0;
+       int64_t         actualBytesAdded, origSize;
+       u_int64_t       bytesToAdd;
+       u_int32_t       startAllocation;
+       u_int32_t       fileblocks;
+       BTreeInfoRec    btInfo;
        ExtendedVCB     *vcb;
-       FCB                     *filePtr;
-    struct proc *p = NULL;
-
+       FCB             *filePtr;
+       struct proc     *p = NULL;
+       int64_t         trim = 0;
+       int             lockflags = 0;
 
        filePtr = GetFileControlBlock(vp);
 
-       if ( minEOF > filePtr->fcbEOF )
+       if ( (off_t)minEOF > filePtr->fcbEOF )
        {
                bytesToAdd = minEOF - filePtr->fcbEOF;
 
-               if (bytesToAdd < filePtr->fcbClmpSize)
-                       bytesToAdd = filePtr->fcbClmpSize;              //XXX why not always be a mutiple of clump size?
+               if (bytesToAdd < filePtr->ff_clumpsize)
+                       bytesToAdd = filePtr->ff_clumpsize;             //XXX why not always be a mutiple of clump size?
        }
        else
        {
-               DBG_TREE((" ExtendBTreeFile: minEOF is smaller than current size!"));
                return -1;
        }
 
-       vcb = FCBTOVCB(filePtr);
+       vcb = VTOVCB(vp);
        
        /*
         * The Extents B-tree can't have overflow extents. ExtendFileC will
         * return an error if an attempt is made to extend the Extents B-tree
         * when the resident extents are exhausted.
         */
-    /* XXX warning - this can leave the volume bitmap unprotected during ExtendFileC call */
-       if(H_FILEID(filePtr) != kHFSExtentsFileID)
-       {
-               p = current_proc();
-               /* lock extents b-tree (also protects volume bitmap) */
-               retval = hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, LK_EXCLUSIVE, p);
-               if (retval)
-                       return (retval);
-       }
 
-    (void) BTGetInformation(filePtr, 0, &btInfo);
+       /* Protect allocation bitmap and extents overflow file. */
+       lockflags = SFL_BITMAP;
+       if (VTOC(vp)->c_fileid != kHFSExtentsFileID)
+               lockflags |= SFL_EXTENTS;
+       lockflags = hfs_systemfile_lock(vcb, lockflags, HFS_EXCLUSIVE_LOCK);
 
+       (void) BTGetInformation(filePtr, 0, &btInfo);
+
+#if 0  // XXXdbg
        /*
-     * The b-tree code expects nodes to be contiguous. So when
+        * The b-tree code expects nodes to be contiguous. So when
         * the allocation block size is less than the b-tree node
-     * size, we need to force disk allocations to be contiguous.
-     */
+        * size, we need to force disk allocations to be contiguous.
+        */
        if (vcb->blockSize >= btInfo.nodeSize) {
                extendFlags = 0;
        } else {
                /* Ensure that all b-tree nodes are contiguous on disk */
-               extendFlags = kEFAllMask | kEFContigMask;
+               extendFlags = kEFContigMask;
        }
+#endif
 
-    retval = ExtendFileC(vcb, filePtr, bytesToAdd, 0, extendFlags, &actualBytesAdded);
-
-       if(H_FILEID(filePtr) != kHFSExtentsFileID)
-               (void) hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, LK_RELEASE, p);
+       origSize = filePtr->fcbEOF;
+       fileblocks = filePtr->ff_blocks;
+       startAllocation = vcb->nextAllocation;
+
+       // loop trying to get a contiguous chunk that's an integer multiple
+       // of the btree node size.  if we can't get a contiguous chunk that
+       // is at least the node size then we break out of the loop and let
+       // the error propagate back up.
+       while((off_t)bytesToAdd >= btInfo.nodeSize) {
+           do {
+               retval = ExtendFileC(vcb, filePtr, bytesToAdd, 0,
+                                    kEFContigMask | kEFMetadataMask | kEFNoClumpMask,
+                                    (int64_t *)&actualBytesAdded);
+               if (retval == dskFulErr && actualBytesAdded == 0) {
+                   bytesToAdd >>= 1;
+                   if (bytesToAdd < btInfo.nodeSize) {
+                       break;
+                   } else if ((bytesToAdd % btInfo.nodeSize) != 0) {
+                       // make sure it's an integer multiple of the nodeSize
+                       bytesToAdd -= (bytesToAdd % btInfo.nodeSize);
+                   }
+               }
+           } while (retval == dskFulErr && actualBytesAdded == 0);
 
-       if (retval)
-               return (retval);
+           if (retval == dskFulErr && actualBytesAdded == 0 && bytesToAdd <= btInfo.nodeSize) {
+               break;
+           }
 
-       if (actualBytesAdded < bytesToAdd)
-               DBG_TREE((" ExtendBTreeFile: actualBytesAdded < bytesToAdd!"));
-       
-       filePtr->fcbEOF = filePtr->fcbPLen;
+           filePtr->fcbEOF = (u_int64_t)filePtr->ff_blocks * (u_int64_t)vcb->blockSize;
+           bytesToAdd = minEOF - filePtr->fcbEOF;
+       }
 
-       retval = ClearBTNodes(vp, btInfo.nodeSize, filePtr->fcbEOF - actualBytesAdded, actualBytesAdded);       
-       if (retval)
-               return (retval);
-       
        /*
-        * Update the Alternate MDB or Alternate VolumeHeader
+        * If a new extent was added then move the roving allocator
+        * reference forward by the current b-tree file size so 
+        * there's plenty of room to grow.
         */
-       if ((H_FILEID(filePtr) == kHFSExtentsFileID)    ||
-           (H_FILEID(filePtr) == kHFSCatalogFileID)    ||
-           (H_FILEID(filePtr) == kHFSAttributesFileID)
-          ) {
-               MarkVCBDirty( vcb );
-               if (vcb->vcbSigWord == kHFSPlusSigWord) {
-                       retval = hfs_flushvolumeheader(VCBTOHFS(vcb), 0);
-               } else {
-                       retval = hfs_flushMDB(VCBTOHFS(vcb), 0);
-               }
-               if (retval == 0) {
-                       retval = FlushAlternate(vcb);
-               }
+       if ((retval == 0) &&
+           ((VCBTOHFS(vcb)->hfs_flags & HFS_METADATA_ZONE) == 0) &&
+           (vcb->nextAllocation > startAllocation) &&
+           ((vcb->nextAllocation + fileblocks) < vcb->allocLimit)) {
+               HFS_UPDATE_NEXT_ALLOCATION(vcb, vcb->nextAllocation + fileblocks); 
+       }
+               
+       filePtr->fcbEOF = (u_int64_t)filePtr->ff_blocks * (u_int64_t)vcb->blockSize;
+
+       // XXXdbg ExtendFileC() could have returned an error even though
+       // it grew the file to be big enough for our needs.  If this is
+       // the case, we don't care about retval so we blow it away.
+       //
+       if (filePtr->fcbEOF >= (off_t)minEOF && retval != 0) {
+               retval = 0;
        }
-       
-       return retval;
-}
 
+       // XXXdbg if the file grew but isn't large enough or isn't an
+       // even multiple of the nodeSize then trim things back.  if
+       // the file isn't large enough we trim back to the original
+       // size.  otherwise we trim back to be an even multiple of the
+       // btree node size.
+       //
+       if ((filePtr->fcbEOF < (off_t)minEOF) || ((filePtr->fcbEOF - origSize) % btInfo.nodeSize) != 0) {
+
+               if (filePtr->fcbEOF < (off_t)minEOF) {
+                       retval = dskFulErr;
+                       
+                       if (filePtr->fcbEOF < origSize) {
+                               panic("hfs: btree file eof %lld less than orig size %lld!\n",
+                                         filePtr->fcbEOF, origSize);
+                       }
+                       
+                       trim = filePtr->fcbEOF - origSize;
+               } else {
+                       trim = ((filePtr->fcbEOF - origSize) % btInfo.nodeSize);
+               }
 
-static OSStatus
-FlushAlternate( ExtendedVCB *vcb )
-{
-       struct hfsmount *hfsmp = VCBTOHFS(vcb);
-       struct vnode *dev_vp = hfsmp->hfs_devvp;
-       struct buf *pri_bp = NULL;
-       struct buf *alt_bp = NULL;
-       int sectorsize;
-       u_long priIDSector;
-       u_long altIDSector;
-       int result;
+               ret = TruncateFileC(vcb, filePtr, filePtr->fcbEOF - trim, 0);
+               filePtr->fcbEOF = (u_int64_t)filePtr->ff_blocks * (u_int64_t)vcb->blockSize;
 
-       sectorsize = hfsmp->hfs_phys_block_size;
-       priIDSector = (vcb->hfsPlusIOPosOffset / sectorsize) +
-                       HFS_PRI_SECTOR(sectorsize);
+               // XXXdbg - panic if the file didn't get trimmed back properly
+               if ((filePtr->fcbEOF % btInfo.nodeSize) != 0) {
+                       panic("hfs: truncate file didn't! fcbEOF %lld nsize %d fcb %p\n",
+                                 filePtr->fcbEOF, btInfo.nodeSize, filePtr);
+               }
 
-       altIDSector = (vcb->hfsPlusIOPosOffset / sectorsize) +
-                       HFS_ALT_SECTOR(sectorsize, hfsmp->hfs_phys_block_count);
+               if (ret) {
+                       // XXXdbg - this probably doesn't need to be a panic()
+                       panic("hfs: error truncating btree files (sz 0x%llx, trim %lld, ret %ld)\n",
+                             filePtr->fcbEOF, trim, ret);
+                       goto out;
+               }
+       }
 
-       /* Get the main MDB/VolumeHeader block */
-       result = meta_bread(dev_vp, priIDSector, sectorsize, NOCRED, &pri_bp);
-       if (result)
-               goto exit;
+       if(VTOC(vp)->c_fileid != kHFSExtentsFileID) {
+               /*
+                * Get any extents overflow b-tree changes to disk ASAP!
+                */
+               (void) BTFlushPath(VTOF(vcb->extentsRefNum));
+               (void) hfs_fsync(vcb->extentsRefNum, MNT_WAIT, 0, p);
+       }
+       hfs_systemfile_unlock(vcb, lockflags);
+       lockflags = 0;
 
-       /* Get the alternate MDB/VolumeHeader block */
-       result = meta_bread(dev_vp, altIDSector, sectorsize, NOCRED, &alt_bp);
-       if (result)
-               goto exit;
+       if ((filePtr->fcbEOF % btInfo.nodeSize) != 0) {
+               panic("hfs: extendbtree: fcb %p has eof 0x%llx not a multiple of 0x%x (trim %llx)\n",
+                         filePtr, filePtr->fcbEOF, btInfo.nodeSize, trim);
+       }
 
-       bcopy(pri_bp->b_data + HFS_PRI_OFFSET(sectorsize),
-             alt_bp->b_data + HFS_ALT_OFFSET(sectorsize), kMDBSize);
+       /*
+        * Update the Alternate MDB or Alternate VolumeHeader
+        */
+       if ((VTOC(vp)->c_fileid == kHFSExtentsFileID)   ||
+           (VTOC(vp)->c_fileid == kHFSCatalogFileID)   ||
+           (VTOC(vp)->c_fileid == kHFSAttributesFileID)
+          ) {
+               VTOC(vp)->c_flag |= C_MODIFIED;
+               MarkVCBDirty( vcb );
+               ret = hfs_flushvolumeheader(VCBTOHFS(vcb), MNT_WAIT, HFS_ALTFLUSH);
+       } else {
+               VTOC(vp)->c_touch_chgtime = TRUE;
+               VTOC(vp)->c_touch_modtime = TRUE;
+               (void) hfs_update(vp, TRUE);
+       }
 
-       result = VOP_BWRITE(alt_bp);
-       alt_bp = NULL;
-exit:
-       if (alt_bp)
-               brelse(alt_bp);
-       if (pri_bp)
-               brelse(pri_bp);
+       ret = ClearBTNodes(vp, btInfo.nodeSize, origSize, (filePtr->fcbEOF - origSize));
+out:
+       if (retval == 0)
+               retval = ret;
        
-       return (result);
+       if (lockflags)
+               hfs_systemfile_unlock(vcb, lockflags);
+       
+       return retval;
 }
 
 
@@ -336,28 +505,257 @@ exit:
 static int
 ClearBTNodes(struct vnode *vp, long blksize, off_t offset, off_t amount)
 {
+       struct hfsmount *hfsmp = VTOHFS(vp);
        struct buf *bp = NULL;
-       daddr_t blk;
-       daddr_t blkcnt;
+       daddr64_t blk;
+       daddr64_t blkcnt;
     
        blk = offset / blksize;
        blkcnt = amount / blksize;
        
        while (blkcnt > 0) {
-               bp = getblk(vp, blk, blksize, 0, 0, BLK_META);
+               bp = buf_getblk(vp, blk, blksize, 0, 0, BLK_META);
                if (bp == NULL)
                        continue;
-               bzero((char *)bp->b_data, blksize);
-               bp->b_flags |= (B_DIRTY | B_AGE);
-
-                /* wait/yield every 32 blocks so we don't hog all the buffers */
-               if ((blk % 32) == 0)
-                       VOP_BWRITE(bp);
-               else
-                       bawrite(bp);
+
+        // XXXdbg
+               if (hfsmp->jnl) {
+                       // XXXdbg -- skipping this for now since it makes a transaction
+                       //           become *way* too large
+                   //journal_modify_block_start(hfsmp->jnl, bp);
+               }
+               bzero((char *)buf_dataptr(bp), blksize);
+
+               buf_markaged(bp);
+
+        // XXXdbg
+               if (hfsmp->jnl) {
+                       // XXXdbg -- skipping this for now since it makes a transaction
+                       //           become *way* too large
+                       //journal_modify_block_end(hfsmp->jnl, bp);
+
+                       // XXXdbg - remove this once we decide what to do with the
+                       //          writes to the journal
+                       if ((blk % 32) == 0)
+                           VNOP_BWRITE(bp);
+                       else
+                           buf_bawrite(bp);
+               } else {
+                       /* wait/yield every 32 blocks so we don't hog all the buffers */
+                       if ((blk % 32) == 0)
+                               VNOP_BWRITE(bp);
+                       else
+                               buf_bawrite(bp);
+               }
                --blkcnt;
                ++blk;
        }
 
        return (0);
 }
+
+
+extern char  hfs_attrname[];
+
+/*
+ * Create an HFS+ Attribute B-tree File.
+ *
+ * No global resources should be held.
+ */
+int
+hfs_create_attr_btree(struct hfsmount *hfsmp, u_int32_t nodesize, u_int32_t nodecnt)
+{
+       struct vnode* vp = NULLVP;
+       struct cat_desc cndesc;
+       struct cat_attr cnattr;
+       struct cat_fork cfork;
+       BlockDescriptor blkdesc;
+       BTNodeDescriptor  *ndp;
+       BTHeaderRec  *bthp;
+       BTreeControlBlockPtr btcb = NULL;
+       struct buf *bp = NULL;
+       void * buffer;
+       u_int16_t *index;
+       u_int16_t  offset;
+       int intrans = 0;
+       int result;
+again:
+       /*
+        * Serialize creation using HFS_CREATING_BTREE flag.
+        */
+       lck_mtx_lock(&hfsmp->hfs_mutex);
+       if (hfsmp->hfs_flags & HFS_CREATING_BTREE) {
+                       /* Someone else beat us, wait for them to finish. */
+                       (void) msleep(hfsmp->hfs_attribute_cp, &hfsmp->hfs_mutex,
+                                     PDROP | PINOD, "hfs_create_attr_btree", 0);
+                       if (hfsmp->hfs_attribute_vp) {
+                               return (0);
+                       }
+                       goto again;
+       }
+       hfsmp->hfs_flags |= HFS_CREATING_BTREE;
+       lck_mtx_unlock(&hfsmp->hfs_mutex);
+
+       /* Check if were out of usable disk space. */
+       if ((hfs_freeblks(hfsmp, 1) == 0)) {
+               result = ENOSPC;
+               goto exit;
+       }
+
+       /*
+        * Set up Attribute B-tree vnode
+        * (this must be done before we start a transaction
+        *  or take any system file locks)
+        */
+       bzero(&cndesc, sizeof(cndesc));
+       cndesc.cd_parentcnid = kHFSRootParentID;
+       cndesc.cd_flags |= CD_ISMETA;
+       cndesc.cd_nameptr = (const u_int8_t *)hfs_attrname;
+       cndesc.cd_namelen = strlen(hfs_attrname);
+       cndesc.cd_cnid = kHFSAttributesFileID;
+
+       bzero(&cnattr, sizeof(cnattr));
+       cnattr.ca_linkcount = 1;
+       cnattr.ca_mode = S_IFREG;
+       cnattr.ca_fileid = cndesc.cd_cnid;
+
+       bzero(&cfork, sizeof(cfork));
+       cfork.cf_clump = nodesize * nodecnt;
+
+       result = hfs_getnewvnode(hfsmp, NULL, NULL, &cndesc, 0, &cnattr, &cfork, &vp);
+       if (result) {
+               goto exit;
+       }
+       /*
+        * Set up Attribute B-tree control block
+        */
+       MALLOC(btcb, BTreeControlBlock *, sizeof(BTreeControlBlock), M_TEMP, M_WAITOK);
+        bzero(btcb, sizeof(BTreeControlBlock));
+
+       btcb->nodeSize          = nodesize;
+       btcb->maxKeyLength      = kHFSPlusAttrKeyMaximumLength;
+       btcb->btreeType         = 0xFF;
+       btcb->attributes        = kBTVariableIndexKeysMask | kBTBigKeysMask;
+       btcb->version           = kBTreeVersion;
+       btcb->writeCount        = 1;
+       btcb->flags             = 0;  /* kBTHeaderDirty */
+       btcb->fileRefNum        = vp;
+       btcb->getBlockProc      = GetBTreeBlock;
+       btcb->releaseBlockProc  = ReleaseBTreeBlock;
+       btcb->setEndOfForkProc  = ExtendBTreeFile;
+       btcb->keyCompareProc    = (KeyCompareProcPtr)hfs_attrkeycompare;
+       VTOF(vp)->fcbBTCBPtr    = btcb;
+
+       /*
+        * Allocate some space
+        */
+       if (hfs_start_transaction(hfsmp) != 0) {
+               result = EINVAL;
+               goto exit;
+       }
+       intrans = 1;
+
+       /* Note ExtendBTreeFile will acquire the necessary system file locks. */
+       result = ExtendBTreeFile(vp, nodesize, cfork.cf_clump);
+       if (result)
+               goto exit;
+
+       btcb->totalNodes = VTOF(vp)->ff_size / nodesize;
+       btcb->freeNodes = btcb->totalNodes - 1;
+
+       /*
+        * Initialize the b-tree header on disk
+        */
+       bp = buf_getblk(vp, 0, nodesize, 0, 0, BLK_META);
+       if (bp == NULL) {
+               result = EIO;
+               goto exit;
+       }
+
+       buffer = (void *)buf_dataptr(bp);
+       blkdesc.buffer = buffer;
+       blkdesc.blockHeader = (void *)bp;
+       blkdesc.blockReadFromDisk = 0;
+       blkdesc.isModified = 0;
+
+       ModifyBlockStart(vp, &blkdesc);
+
+       if (buf_size(bp) != nodesize)
+               panic("hfs_create_attr_btree: bad buffer size (%d)\n", buf_size(bp));
+
+       bzero(buffer, nodesize);
+       index = (u_int16_t *)buffer;
+
+       /* FILL IN THE NODE DESCRIPTOR:  */
+       ndp = (BTNodeDescriptor *)buffer;
+       ndp->kind = kBTHeaderNode;
+       ndp->numRecords = 3;
+       offset = sizeof(BTNodeDescriptor);
+       index[(nodesize / 2) - 1] = offset;
+
+       /* FILL IN THE HEADER RECORD:  */
+       bthp = (BTHeaderRec *)((u_int8_t *)buffer + offset);
+       bthp->nodeSize     = nodesize;
+       bthp->totalNodes   = btcb->totalNodes;
+       bthp->freeNodes    = btcb->freeNodes;
+       bthp->clumpSize    = cfork.cf_clump;
+       bthp->btreeType    = 0xFF;
+       bthp->attributes   = kBTVariableIndexKeysMask | kBTBigKeysMask;
+       bthp->maxKeyLength = kHFSPlusAttrKeyMaximumLength;
+       bthp->keyCompareType = kHFSBinaryCompare;
+       offset += sizeof(BTHeaderRec);
+       index[(nodesize / 2) - 2] = offset;
+
+       /* FILL IN THE USER RECORD:  */
+       offset += kBTreeHeaderUserBytes;
+       index[(nodesize / 2) - 3] = offset;
+
+       /* FILL IN THE MAP RECORD (only one node in use). */
+       *((u_int8_t *)buffer + offset) = 0x80;
+       offset += nodesize - sizeof(BTNodeDescriptor) - sizeof(BTHeaderRec)
+                          - kBTreeHeaderUserBytes - (4 * sizeof(int16_t));
+       index[(nodesize / 2) - 4] = offset;
+
+       if (hfsmp->jnl) {
+               result = btree_journal_modify_block_end(hfsmp, bp);
+       } else {
+               result = VNOP_BWRITE(bp);
+       }
+       if (result)
+               goto exit;
+
+       /* Update vp/cp for attribute btree */
+       lck_mtx_lock(&hfsmp->hfs_mutex);
+       hfsmp->hfs_attribute_cp = VTOC(vp);
+       hfsmp->hfs_attribute_vp = vp;
+       lck_mtx_unlock(&hfsmp->hfs_mutex);
+
+       (void) hfs_flushvolumeheader(hfsmp, MNT_WAIT, HFS_ALTFLUSH);
+exit:
+       if (vp) {
+               hfs_unlock(VTOC(vp));
+       }
+       if (result) {
+               if (btcb) {
+                       FREE (btcb, M_TEMP);
+               }
+               if (vp) {
+                       vnode_put(vp);
+               }
+               /* XXX need to give back blocks ? */
+       }
+       if (intrans) {
+               hfs_end_transaction(hfsmp);
+       }
+
+       /*
+        * All done, clear HFS_CREATING_BTREE, and wake up any sleepers.
+        */
+       lck_mtx_lock(&hfsmp->hfs_mutex);
+       hfsmp->hfs_flags &= ~HFS_CREATING_BTREE;
+       wakeup((caddr_t)hfsmp->hfs_attribute_cp);
+       lck_mtx_unlock(&hfsmp->hfs_mutex);
+
+       return (result);
+}
+