bsd/vfs/vfs_utfconv.c                  standard
 bsd/vfs/vfs_vnops.c                    standard
 bsd/vfs/vnode_if.c                     standard
+bsd/vfs/vfs_journal.c                  standard
 
 bsd/miscfs/deadfs/dead_vnops.c         standard
 bsd/miscfs/fdesc/fdesc_vfsops.c                optional fdesc
 bsd/kern/mach_loader.c                 standard
 bsd/kern/posix_sem.c                   standard
 bsd/kern/posix_shm.c                   standard
+# XXXdbg - I need this in the journaling and block cache code
+bsd/kern/qsort.c                       standard
 
 bsd/vm/vnode_pager.c                   standard
 bsd/vm/vm_unix.c                       standard
 
 #include <sys/quota.h>
 #include <sys/dirent.h>
 
+#include <vfs/vfs_journal.h>
+
 #include <hfs/hfs_format.h>
 #include <hfs/hfs_catalog.h>
 #include <hfs/hfs_cnode.h>
     int16_t                    vcbAtrb;
     int16_t                    vcbFlags;
     int16_t                    vcbspare;
+    u_int32_t                  vcbJinfoBlock;
 
     u_int32_t                  vcbCrDate;
     u_int32_t                  vcbLsMod;
        u_int8_t                        hfs_fs_ronly;                   /* Whether this was mounted as read-initially  */
        u_int8_t                        hfs_unknownpermissions; /* Whether this was mounted with MNT_UNKNOWNPERMISSIONS */
        u_int8_t                        hfs_media_writeable;
+       u_int8_t                        hfs_orphans_cleaned;
        
        /* Physical Description */
        u_long                          hfs_phys_block_count;   /* Num of PHYSICAL blocks of volume */
        unicode_to_hfs_func_t   hfs_get_hfsname;
  
        struct quotafile        hfs_qfiles[MAXQUOTAS];    /* quota files */
+
+       // XXXdbg
+       void                *jnl;           // the journal for this volume (if one exists)
+       struct vnode        *jvp;           // device where the journal lives (may be equal to devvp)
+       u_int32_t            jnl_start;     // start block of the journal file (so we don't delete it)
+       u_int32_t            hfs_jnlfileid;
+       u_int32_t            hfs_jnlinfoblkid;
+    volatile int         readers;
+       volatile int         blocker;
 } hfsmount_t;
 
 #define hfs_private_metadata_dir       hfs_privdir_desc.cd_cnid
 
+/*
+ * Take the per-mount global lock in shared (reader) mode.
+ *
+ * Sleeps on &blocker for as long as the blocker flag is set, then
+ * increments the reader count.  The tsleep() return value is ignored.
+ *
+ * The argument is parenthesized so any pointer-valued expression may be
+ * passed; it is evaluated more than once, so it must be side-effect free.
+ *
+ * NOTE(review): the test of blocker and the tsleep() are not atomic with
+ * respect to each other; presumably callers are already serialized by an
+ * outer kernel lock -- confirm.
+ */
+#define hfs_global_shared_lock_acquire(hfsmp)    \
+    do { \
+        while ((hfsmp)->blocker) { \
+            tsleep((caddr_t)&(hfsmp)->blocker, PRIBIO, "journal_blocker", 0); \
+        } \
+        (hfsmp)->readers++; \
+    } while (0)
+
+/*
+ * Drop a shared (reader) hold on the per-mount global lock.
+ *
+ * Decrements the reader count and, when it reaches zero, wakes anything
+ * sleeping on &readers (the exclusive-acquire macro sleeps on that
+ * address while readers != 0).
+ *
+ * The argument is parenthesized so any pointer-valued expression may be
+ * passed; it is evaluated more than once, so it must be side-effect free.
+ */
+#define hfs_global_shared_lock_release(hfsmp)    \
+    do { \
+        (hfsmp)->readers--; \
+        if ((hfsmp)->readers == 0) { \
+            wakeup((caddr_t)&(hfsmp)->readers); \
+        } \
+    } while (0)
+
+/*
+ * Take the per-mount global lock in exclusive mode.
+ *
+ * Loops until neither the blocker flag is set nor any readers remain:
+ * while blocker is set it sleeps on &blocker, otherwise while readers is
+ * non-zero it sleeps on &readers.  Both conditions are re-checked from
+ * the top after every wakeup.  Once both are clear, blocker is set to 1.
+ *
+ * The argument is parenthesized so any pointer-valued expression may be
+ * passed; it is evaluated more than once, so it must be side-effect free.
+ *
+ * NOTE(review): the checks and the tsleep() calls are not atomic with
+ * respect to each other; presumably callers are already serialized by an
+ * outer kernel lock -- confirm.
+ */
+#define hfs_global_exclusive_lock_acquire(hfsmp) \
+    do { \
+        for (;;) { \
+            if ((hfsmp)->blocker) { \
+                tsleep((caddr_t)&(hfsmp)->blocker, PRIBIO, "journal_blocker", 0); \
+            } else if ((hfsmp)->readers != 0) { \
+                tsleep((caddr_t)&(hfsmp)->readers, PRIBIO, "journal_enable/disble", 0); \
+            } else { \
+                break; \
+            } \
+        } \
+        (hfsmp)->blocker = 1; \
+    } while (0)
+     
+/*
+ * Release the exclusive hold on the per-mount global lock: clear the
+ * blocker flag and wake everything sleeping on &blocker.
+ *
+ * Wrapped in do { } while (0) so the two statements act as one; without
+ * the wrapper, using the macro as the body of an unbraced if/else would
+ * make only the store conditional and always run the wakeup().
+ *
+ * The argument is parenthesized so any pointer-valued expression may be
+ * passed; it is evaluated twice, so it must be side-effect free.
+ */
+#define hfs_global_exclusive_lock_release(hfsmp) \
+    do { \
+        (hfsmp)->blocker = 0; \
+        wakeup((caddr_t)&(hfsmp)->blocker); \
+    } while (0)
+
 #define MAXHFSVNODELEN         31
 
 
 #define VTOHFS(VP) ((struct hfsmount *)((VP)->v_mount->mnt_data))
 #define        VFSTOHFS(MP) ((struct hfsmount *)(MP)->mnt_data)        
 #define VCBTOHFS(VCB) (((struct vfsVCB *)(VCB))->vcb_hfsmp)
+#define FCBTOHFS(FCB) ((struct hfsmount *)(FCB)->ff_cp->c_vp->v_mount->mnt_data)	/* FCB -> ff_cp -> c_vp -> v_mount -> mnt_data, cast to hfsmount; no NULL checks along the chain */
 
 /*
  * Various ways to acquire a VCB pointer:
 #define VTOVCB(VP) (&(((struct hfsmount *)((VP)->v_mount->mnt_data))->hfs_vcb.vcb_vcb))
 #define VFSTOVCB(MP) (&(((struct hfsmount *)(MP)->mnt_data)->hfs_vcb.vcb_vcb))
 #define HFSTOVCB(HFSMP) (&(HFSMP)->hfs_vcb.vcb_vcb)
+#define FCBTOVCB(FCB) (&(((struct hfsmount *)((FCB)->ff_cp->c_vp->v_mount->mnt_data))->hfs_vcb.vcb_vcb))	/* FCB -> ff_cp -> c_vp -> v_mount -> mnt_data, then the address of hfs_vcb.vcb_vcb; no NULL checks along the chain */
 
 
 #define E_NONE 0
 
 extern u_int32_t hfs_freeblks(struct hfsmount * hfsmp, int wantreserve);
 
+extern void hfs_remove_orphans(struct hfsmount *);
+
 
 short MacToVFSError(OSErr err);
 
 #define  HFS_SYNCTRANS         1
 
 extern int hfs_btsync(struct vnode *vp, int sync_transaction);
+// used as a callback by the journaling code
+extern void hfs_sync_metadata(void *arg);
 
 short make_dir_entry(FCB **fileptr, char *name, u_int32_t fileID);
 
 OSErr  hfs_MountHFSVolume(struct hfsmount *hfsmp, HFSMasterDirectoryBlock *mdb,
                struct proc *p);
 OSErr  hfs_MountHFSPlusVolume(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp,
-               off_t embeddedOffset, u_int64_t disksize, struct proc *p);
+               off_t embeddedOffset, u_int64_t disksize, struct proc *p, void *args);
+
+extern int     hfs_early_journal_init(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp,
+                                                          void *_args, int embeddedOffset, int mdb_offset,
+                                                          HFSMasterDirectoryBlock *mdbp, struct ucred *cred);
+extern u_long  GetFileInfo(ExtendedVCB *vcb, u_int32_t dirid, char *name,
+                                       struct cat_attr *fattr, struct cat_fork *forkinfo);
 
 int hfs_getconverter(u_int32_t encoding, hfs_to_unicode_func_t *get_unicode,
                     unicode_to_hfs_func_t *get_hfsname);
 
                if ((error = hfs_write_access(vp, ap->a_cred, ap->a_p, false)) != 0)
                        return (error);
 
+               // XXXdbg
+               hfs_global_shared_lock_acquire(hfsmp);
+               if (hfsmp->jnl) {
+                   if ((error = journal_start_transaction(hfsmp->jnl)) != 0) {
+                               hfs_global_shared_lock_release(hfsmp);
+                               return error;
+                   }
+               }
+
                /* Lock catalog b-tree */
                error = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_SHARED, ap->a_p);
-               if (error)
-                       return (error);
+               if (error) {
+                   if (hfsmp->jnl) {
+                               journal_end_transaction(hfsmp->jnl);
+                       }
+                       hfs_global_shared_lock_release(hfsmp);
+                   return (error);
+               }
 
                error = cat_insertfilethread(hfsmp, &cp->c_desc);
 
                /* Unlock catalog b-tree */
                (void) hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_RELEASE, ap->a_p);
+
+               if (hfsmp->jnl) {
+                   journal_end_transaction(hfsmp->jnl);
+               }
+               hfs_global_shared_lock_release(hfsmp);
+
                if (error)
                        return (error);
        }
        }
        if (cp->c_flag & (C_NOEXISTS | C_DELETED))
                return (ENOENT);
+
+       // XXXdbg - don't allow modifying the journal or journal_info_block
+       if (hfsmp->jnl && cp->c_datafork) {
+               struct HFSPlusExtentDescriptor *extd;
+               
+               extd = &cp->c_datafork->ff_data.cf_extents[0];
+               if (extd->startBlock == HFSTOVCB(hfsmp)->vcbJinfoBlock || extd->startBlock == hfsmp->jnl_start) {
+                       return EPERM;
+               }
+       }
+
        /*
         * Ownership of a file is required in one of two classes of calls:
         *
         * If any cnode attributes changed then do an update.
         */
        if (alist->volattr == 0) {
-               struct timeval atime, mtime;
+               struct timeval tv;
 
-               atime.tv_sec = cp->c_atime;
-               atime.tv_usec = 0;
-               mtime.tv_sec = cp->c_mtime;
-               mtime.tv_usec = cp->c_mtime_nsec / 1000;
                cp->c_flag |= C_MODIFIED;
-               if ((error = VOP_UPDATE(vp, &atime, &mtime, 1)))
+               tv = time;
+               CTIMES(cp, &tv, &tv);
+               if ((error = VOP_UPDATE(vp, &tv, &tv, 1)))
                        goto ErrorExit;
        }
        /* Volume Rename */
                        to_desc.cd_cnid = cp->c_cnid;
                        to_desc.cd_flags = CD_ISDIR;
 
+                       // XXXdbg
+                       hfs_global_shared_lock_acquire(hfsmp);
+                       if (hfsmp->jnl) {
+                           if (journal_start_transaction(hfsmp->jnl) != 0) {
+                                       hfs_global_shared_lock_release(hfsmp);
+                                       error = EINVAL;
+                                       /* Restore the old name in the VCB */
+                                       copystr(cp->c_desc.cd_nameptr, vcb->vcbVN, sizeof(vcb->vcbVN), NULL);
+                                       vcb->vcbFlags |= 0xFF00;
+                                       goto ErrorExit;
+                           }
+                       }
+
+
                        /* Lock catalog b-tree */
                        error = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_EXCLUSIVE, p);
                        if (error) {
+                               if (hfsmp->jnl) {
+                                   journal_end_transaction(hfsmp->jnl);
+                               }
+                               hfs_global_shared_lock_release(hfsmp);
+
                                /* Restore the old name in the VCB */
                                copystr(cp->c_desc.cd_nameptr, vcb->vcbVN, sizeof(vcb->vcbVN), NULL);
                                vcb->vcbFlags |= 0xFF00;
 
                        /* Unlock the Catalog */
                        (void) hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_RELEASE, p);
-
+                       
+                       if (hfsmp->jnl) {
+                           journal_end_transaction(hfsmp->jnl);
+                       }
+                       hfs_global_shared_lock_release(hfsmp);
+                       
                        if (error) {
                                /* Restore the old name in the VCB */
                                copystr(cp->c_desc.cd_nameptr, vcb->vcbVN, sizeof(vcb->vcbVN), NULL);
        int error = 0;
        int depleted = 0;
        int index, startindex;
-       int i;
+       int i, dir_entries;
        struct cat_desc *lastdescp = NULL;
        struct cat_desc prevdesc;
        char * prevnamebuf = NULL;
        struct cat_entrylist *ce_list = NULL;
 
+       dir_entries = dcp->c_entries;
+       if (dcp->c_attr.ca_fileid == kHFSRootFolderID && hfsmp->jnl) {
+               dir_entries -= 3;
+       }
+
        *(ap->a_actualcount) = 0;
        *(ap->a_eofflag) = 0;
        
 
        /* Convert uio_offset into a directory index. */
        startindex = index = uio->uio_offset / sizeof(struct dirent);
-       if ((index + 1) > dcp->c_entries) {
+       if ((index + 1) > dir_entries) {
                *(ap->a_eofflag) = 1;
                error = 0;
                goto exit;
                                /* Termination checks */
                                if ((--maxcount <= 0) ||
                                    (uio->uio_resid < (fixedblocksize + HFS_AVERAGE_NAME_SIZE)) ||
-                                   (index >= dcp->c_entries)) {
+                                   (index >= dir_entries)) {
                                        depleted = 1;
                                        break;
                                }
                } /* for each catalog entry */
 
                /* If there are more entries then save the last name. */
-               if (index < dcp->c_entries
+               if (index < dir_entries
                &&  !(*(ap->a_eofflag))
                &&  lastdescp != NULL) {
                        if (prevnamebuf == NULL)
        if (ATTR_DIR_ENTRYCOUNT & attr) {
                u_long entries = cattrp->ca_entries;
 
-               if ((descp->cd_parentcnid == kRootParID) &&
-                   (hfsmp->hfs_private_metadata_dir != 0))
-                       --entries;      /* hide private dir */
+               if (descp->cd_parentcnid == kRootParID) {
+                       if (hfsmp->hfs_private_metadata_dir != 0)
+                               --entries;          /* hide private dir */
+                       if (hfsmp->jnl)
+                               entries -= 2;   /* hide the journal files */
+               }
 
                *((u_long *)attrbufptr)++ = entries;
        }
 
        if (options & kGetEmptyBlock)
                bp = getblk(vp, blockNum, block->blockSize, 0, 0, BLK_META);
        else
-       retval = meta_bread(vp, blockNum, block->blockSize, NOCRED, &bp);
+               retval = meta_bread(vp, blockNum, block->blockSize, NOCRED, &bp);
 
     DBG_ASSERT(bp != NULL);
     DBG_ASSERT(bp->b_data != NULL);
         block->buffer = bp->b_data;
         block->blockReadFromDisk = (bp->b_flags & B_CACHE) == 0;       /* not found in cache ==> came from disk */
 
+               // XXXdbg 
+               block->isModified = 0;
+
 #if BYTE_ORDER == LITTLE_ENDIAN
         /* Endian swap B-Tree node (only if it's a valid block) */
         if (!(options & kGetEmptyBlock)) {
 }
 
 
+/*
+ * ModifyBlockStart
+ *
+ * On a journaled volume, hand the buffer behind a B-tree block
+ * descriptor to journal_modify_block_start() and flag the descriptor as
+ * modified (isModified = 1).  Does nothing when the mount has no journal.
+ * Panics if the descriptor has no buffer attached (blockHeader is NULL).
+ */
+__private_extern__
+void ModifyBlockStart(FileReference vp, BlockDescPtr blockPtr)
+{
+       struct hfsmount *mount = VTOHFS(vp);
+       struct buf *buffer;
+
+       /* Only journaled volumes need the journal to be told about the change. */
+       if (mount->jnl != NULL) {
+               buffer = (struct buf *) blockPtr->blockHeader;
+               if (buffer != NULL) {
+                       journal_modify_block_start(mount->jnl, buffer);
+                       blockPtr->isModified = 1;
+               } else {
+                       panic("ModifyBlockStart: null bp  for blockdescptr 0x%x?!?\n", blockPtr);
+               }
+       }
+}
+
+
 __private_extern__
 OSStatus ReleaseBTreeBlock(FileReference vp, BlockDescPtr blockPtr, ReleaseBlockOptions options)
 {
+    struct hfsmount    *hfsmp = VTOHFS(vp);
     extern int bdwrite_internal(struct buf *, int);
     OSStatus   retval = E_NONE;
     struct buf *bp = NULL;
     }
 
     if (options & kTrashBlock) {
-        bp->b_flags |= B_INVAL;
-       brelse(bp);     /* note: B-tree code will clear blockPtr->blockHeader and blockPtr->buffer */
+               bp->b_flags |= B_INVAL;
+               if (hfsmp->jnl && (bp->b_flags & B_LOCKED)) {
+                       journal_kill_block(hfsmp->jnl, bp);
+               } else {
+                       brelse(bp);     /* note: B-tree code will clear blockPtr->blockHeader and blockPtr->buffer */
+               }
     } else {
         if (options & kForceWriteBlock) {
-            retval = VOP_BWRITE(bp);
+                       if (hfsmp->jnl) {
+                               if (blockPtr->isModified == 0) {
+                                       panic("hfs: releaseblock: modified is 0 but forcewrite set! bp 0x%x\n", bp);
+                               }
+                               retval = journal_modify_block_end(hfsmp->jnl, bp);
+                               blockPtr->isModified = 0;
+                       } else {
+                               retval = VOP_BWRITE(bp);
+                       }
         } else if (options & kMarkBlockDirty) {
-#if FORCESYNCBTREEWRITES
-            VOP_BWRITE(bp);
-#else
-            if (options & kLockTransaction) {
+            if ((options & kLockTransaction) && hfsmp->jnl == NULL) {
                 /*
                  *
                  * Set the B_LOCKED flag and unlock the buffer, causing brelse to move
                      /* Rollback sync time to cause a sync on lock release... */
                      (void) BTSetLastSync(VTOF(vp), time.tv_sec - (kMaxSecsForFsync + 1));
                 }
-                bp->b_flags |= B_LOCKED;
-           }
+
+                               bp->b_flags |= B_LOCKED;
+            }
+
             /* 
              * Delay-write this block.
              * If the maximum delayed buffers has been exceeded then
              * free up some buffers and fall back to an asynchronous write.
              */
-            if (bdwrite_internal(bp, 1) != 0) {
+                       if (hfsmp->jnl) {
+                               if (blockPtr->isModified == 0) {
+                                       panic("hfs: releaseblock: modified is 0 but markdirty set! bp 0x%x\n", bp);
+                               }
+                               retval = journal_modify_block_end(hfsmp->jnl, bp);
+                               blockPtr->isModified = 0;
+                       } else if (bdwrite_internal(bp, 1) != 0) {
                 hfs_btsync(vp, 0);
                 /* Rollback sync time to cause a sync on lock release... */
                 (void) BTSetLastSync(VTOF(vp), time.tv_sec - (kMaxSecsForFsync + 1));
                 bp->b_flags &= ~B_LOCKED;
                 bawrite(bp);
             }
-
-#endif
         } else {
-               brelse(bp);     /* note: B-tree code will clear blockPtr->blockHeader and blockPtr->buffer */
+                       // check if we had previously called journal_modify_block_start() 
+                       // on this block and if so, abort it (which will call brelse()).
+                       if (hfsmp->jnl && blockPtr->isModified) {
+                               // XXXdbg - I don't want to call modify_block_abort()
+                               //          because I think it may be screwing up the
+                               //          journal and blowing away a block that has
+                               //          valid data in it.
+                               //   
+                               //    journal_modify_block_abort(hfsmp->jnl, bp);
+                               //panic("hfs: releaseblock called for 0x%x but mod_block_start previously called.\n", bp);
+                               journal_modify_block_end(hfsmp->jnl, bp);
+                               blockPtr->isModified = 0;
+                       } else {
+                               brelse(bp);     /* note: B-tree code will clear blockPtr->blockHeader and blockPtr->buffer */
+                       }
         };
     };
 
 {
 #pragma unused (maxEOF)
 
-       OSStatus        retval;
-       UInt64          actualBytesAdded;
+       OSStatus        retval, ret;
+       UInt64          actualBytesAdded, origSize;
        UInt64          bytesToAdd;
-    UInt32             extendFlags;
        u_int32_t       startAllocation;
        u_int32_t       fileblocks;
        BTreeInfoRec btInfo;
        ExtendedVCB     *vcb;
        FCB                     *filePtr;
     struct proc *p = NULL;
-
+       UInt64          trim = 0;       
 
        filePtr = GetFileControlBlock(vp);
 
        {
                p = current_proc();
                /* lock extents b-tree (also protects volume bitmap) */
-               retval = hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, LK_EXCLUSIVE, p);
+               retval = hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, LK_EXCLUSIVE, p);
                if (retval)
                        return (retval);
        }
 
     (void) BTGetInformation(filePtr, 0, &btInfo);
 
+#if 0  // XXXdbg
        /*
         * The b-tree code expects nodes to be contiguous. So when
         * the allocation block size is less than the b-tree node
                extendFlags = 0;
        } else {
                /* Ensure that all b-tree nodes are contiguous on disk */
-               extendFlags = kEFAllMask | kEFContigMask;
+               extendFlags = kEFContigMask;
        }
+#endif
 
+       origSize = filePtr->fcbEOF;
        fileblocks = filePtr->ff_blocks;
        startAllocation = vcb->nextAllocation;
 
-       retval = ExtendFileC(vcb, filePtr, bytesToAdd, 0, extendFlags, &actualBytesAdded);
-
+       // loop trying to get a contiguous chunk that's an integer multiple
+       // of the btree node size.  if we can't get a contiguous chunk that
+       // is at least the node size then we break out of the loop and let
+       // the error propagate back up.
+       do {
+               retval = ExtendFileC(vcb, filePtr, bytesToAdd, 0, kEFContigMask, &actualBytesAdded);
+               if (retval == dskFulErr && actualBytesAdded == 0) {
+
+                       if (bytesToAdd == btInfo.nodeSize || bytesToAdd < (minEOF - origSize)) {
+                               // if we're here there's nothing else to try, we're out
+                               // of space so we break and bail out.
+                               break;
+                       } else {
+                               bytesToAdd >>= 1;
+                               if (bytesToAdd < btInfo.nodeSize) {
+                                       bytesToAdd = btInfo.nodeSize;
+                               } else if ((bytesToAdd % btInfo.nodeSize) != 0) {
+                                       // make sure it's an integer multiple of the nodeSize
+                                       bytesToAdd -= (bytesToAdd % btInfo.nodeSize);
+                               }
+                       }
+               }
+       } while (retval == dskFulErr && actualBytesAdded == 0);
+       
        /*
         * If a new extent was added then move the roving allocator
         * reference forward by the current b-tree file size so 
                vcb->nextAllocation += fileblocks;
        }
                
+       filePtr->fcbEOF = (u_int64_t)filePtr->ff_blocks * (u_int64_t)vcb->blockSize;
+
+       // XXXdbg ExtendFileC() could have returned an error even though
+       // it grew the file to be big enough for our needs.  If this is
+       // the case, we don't care about retval so we blow it away.
+       //
+       if (filePtr->fcbEOF >= minEOF && retval != 0) {
+               retval = 0;
+       }
+
+       // XXXdbg if the file grew but isn't large enough or isn't an
+       // even multiple of the nodeSize then trim things back.  if
+       // the file isn't large enough we trim back to the original
+       // size.  otherwise we trim back to be an even multiple of the
+       // btree node size.
+       //
+       if ((filePtr->fcbEOF < minEOF) || (actualBytesAdded % btInfo.nodeSize) != 0) {
+
+               if (filePtr->fcbEOF < minEOF) {
+                       retval = dskFulErr;
+                       
+                       if (filePtr->fcbEOF < origSize) {
+                               panic("hfs: btree file eof %lld less than orig size %lld!\n",
+                                         filePtr->fcbEOF, origSize);
+                       }
+                       
+                       trim = filePtr->fcbEOF - origSize;
+                       if (trim != actualBytesAdded) {
+                               panic("hfs: trim == %lld but actualBytesAdded == %lld\n",
+                                         trim, actualBytesAdded);
+                       }
+               } else {
+                       trim = (actualBytesAdded % btInfo.nodeSize);
+               }
+
+               ret = TruncateFileC(vcb, filePtr, filePtr->fcbEOF - trim, 0);
+               filePtr->fcbEOF = (u_int64_t)filePtr->ff_blocks * (u_int64_t)vcb->blockSize;
+
+               // XXXdbg - panic if the file didn't get trimmed back properly
+               if ((filePtr->fcbEOF % btInfo.nodeSize) != 0) {
+                       panic("hfs: truncate file didn't! fcbEOF %lld nsize %d fcb 0x%x\n",
+                                 filePtr->fcbEOF, btInfo.nodeSize, filePtr);
+               }
+
+               if (ret) {
+                       // XXXdbg - this probably doesn't need to be a panic()
+                       panic("hfs: error truncating btree files (sz 0x%llx, trim %lld, ret %d)\n",
+                                 filePtr->fcbEOF, trim, ret);
+                       return ret;
+               }
+               actualBytesAdded -= trim;
+       }
+
        if(VTOC(vp)->c_fileid != kHFSExtentsFileID) {
                /*
                 * Get any extents overflow b-tree changes to disk ASAP!
                 */
-               if (retval == 0) {
-                       (void) BTFlushPath(VTOF(vcb->extentsRefNum));
-                       (void) VOP_FSYNC(vcb->extentsRefNum, NOCRED, MNT_WAIT, p);
-               }
+               (void) BTFlushPath(VTOF(vcb->extentsRefNum));
+               (void) VOP_FSYNC(vcb->extentsRefNum, NOCRED, MNT_WAIT, p);
+
                (void) hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, LK_RELEASE, p);
        }
-       if (retval)
-               return (retval);
-       
-       filePtr->fcbEOF = (u_int64_t)filePtr->ff_blocks * (u_int64_t)vcb->blockSize;
 
-       retval = ClearBTNodes(vp, btInfo.nodeSize, filePtr->fcbEOF - actualBytesAdded, actualBytesAdded);       
-       if (retval)
-               return (retval);
-       
+       if ((filePtr->fcbEOF % btInfo.nodeSize) != 0) {
+               panic("hfs: extendbtree: fcb 0x%x has eof 0x%llx not a multiple of 0x%x (trim %llx)\n",
+                         filePtr, filePtr->fcbEOF, btInfo.nodeSize, trim);
+       }
+
        /*
         * Update the Alternate MDB or Alternate VolumeHeader
         */
            (VTOC(vp)->c_fileid == kHFSAttributesFileID)
           ) {
                MarkVCBDirty( vcb );
-               retval = hfs_flushvolumeheader(VCBTOHFS(vcb), MNT_WAIT, HFS_ALTFLUSH);
+               ret = hfs_flushvolumeheader(VCBTOHFS(vcb), MNT_WAIT, HFS_ALTFLUSH);
        }
+
+       ret = ClearBTNodes(vp, btInfo.nodeSize, filePtr->fcbEOF - actualBytesAdded, actualBytesAdded);
+       if (ret)
+               return (ret);
        
        return retval;
 }
 static int
 ClearBTNodes(struct vnode *vp, long blksize, off_t offset, off_t amount)
 {
+       struct hfsmount *hfsmp = VTOHFS(vp);
        struct buf *bp = NULL;
        daddr_t blk;
        daddr_t blkcnt;
                bp = getblk(vp, blk, blksize, 0, 0, BLK_META);
                if (bp == NULL)
                        continue;
+
+        // XXXdbg
+               if (hfsmp->jnl) {
+                       // XXXdbg -- skipping this for now since it makes a transaction
+                       //           become *way* too large
+                   //journal_modify_block_start(hfsmp->jnl, bp);
+               }
+
                bzero((char *)bp->b_data, blksize);
                bp->b_flags |= B_AGE;
 
-                /* wait/yield every 32 blocks so we don't hog all the buffers */
-               if ((blk % 32) == 0)
-                       VOP_BWRITE(bp);
-               else
-                       bawrite(bp);
+        // XXXdbg
+               if (hfsmp->jnl) {
+                       // XXXdbg -- skipping this for now since it makes a transaction
+                       //           become *way* too large
+                       //journal_modify_block_end(hfsmp->jnl, bp);
+
+                       // XXXdbg - remove this once we decide what to do with the
+                       //          writes to the journal
+                       if ((blk % 32) == 0)
+                           VOP_BWRITE(bp);
+                       else
+                           bawrite(bp);
+               } else {
+                       /* wait/yield every 32 blocks so we don't hog all the buffers */
+                       if ((blk % 32) == 0)
+                               VOP_BWRITE(bp);
+                       else
+                               bawrite(bp);
+               }
                --blkcnt;
                ++blk;
        }
 
        if (result)
                goto exit;
 
+       // XXXdbg - preflight all btree operations to make sure there's enough space
+       result = BTCheckFreeSpace(fcb);
+       if (result)
+               goto exit;
+
        BDINIT(file_data, &file_rec);
        result = BTSearchRecord(fcb, &iterator[0], &file_data, &datasize, &iterator[0]);
        if (result) 
                (void) BTFlushPath(fcb);
        }       
 exit:
+       (void) BTFlushPath(fcb);
        FREE(iterator, M_TEMP);
 
        return MacToVFSError(result);
        encoding = getencoding(recp);
        hint = iterator->hint.nodeNum;
 
+       /* Hide the journal files (if any) */
+       if (hfsmp->jnl &&
+               ((cnid == hfsmp->hfs_jnlfileid) ||
+                (cnid == hfsmp->hfs_jnlinfoblkid))) {
+
+               result = ENOENT;
+               goto exit;
+       }
+
        /*
         * When a hardlink link is encountered, auto resolve it
         */
                hfs_setencodingbits(hfsmp, encoding);
        }
 
+       // XXXdbg - preflight all btree operations to make sure there's enough space
+       result = BTCheckFreeSpace(fcb);
+       if (result)
+               goto exit;
+
        /*
         * Insert the thread record first
         */
        vcb->vcbNxtCNID = nextCNID;
        vcb->vcbFlags |= 0xFF00;
 
-       (void) BTFlushPath(fcb);
-
 exit:
+       (void) BTFlushPath(fcb);
        FREE(bto, M_TEMP);
 
        return MacToVFSError(result);
        if ((result = buildkey(hfsmp, to_cdp, (HFSPlusCatalogKey *)&to_iterator->key, 0)))
                goto exit;      
 
+       // XXXdbg - preflight all btree operations to make sure there's enough space
+       result = BTCheckFreeSpace(fcb);
+       if (result)
+               goto exit;
+
        to_key = (HFSPlusCatalogKey *)&to_iterator->key;
        MALLOC(recp, CatalogRecord *, sizeof(CatalogRecord), M_TEMP, M_WAITOK);
        BDINIT(btdata, recp);
                result = BTInsertRecord(fcb, to_iterator, &btdata, datasize);
                if (result) {
                        /* Try and restore original before leaving */
+                   // XXXdbg
+                   #if 1
+                      {
+                       int err;
+                       err = BTInsertRecord(fcb, from_iterator, &btdata, datasize);
+                       if (err)
+                               panic("cat_create: could not undo (BTInsert = %d)", err);
+                      }
+                   #else
                        (void) BTInsertRecord(fcb, from_iterator, &btdata, datasize);
+                   #endif
                        goto exit;
                }
                sourcegone = 1;
                result = BTDeleteRecord(fcb, from_iterator);
                if (result) {
                        /* Try and delete new record before leaving */
+                 // XXXdbg
+                 #if 1
+                    {
+                       int err;
+                       err = BTDeleteRecord(fcb, to_iterator);
+                       if (err)
+                               panic("cat_create: could not undo (BTDelete = %d)", err);
+                    }                  
+                 #else
                        (void) BTDeleteRecord(fcb, to_iterator);
+                 #endif
                        goto exit;
                }
        }
                        FREE(pluskey, M_TEMP);
                }
        }
-       (void) BTFlushPath(fcb);
 exit:
+       (void) BTFlushPath(fcb);
        if (from_iterator)
                FREE(from_iterator, M_TEMP);
        if (to_iterator)
         * A directory must be empty
         * A file must be zero length (no blocks)
         */
-
        if (descp->cd_cnid < kHFSFirstUserCatalogNodeID ||
            descp->cd_parentcnid == kRootParID)
                return (EINVAL);
        if (result)
                goto exit;
 
+       // XXXdbg - preflight all btree operations to make sure there's enough space
+       result = BTCheckFreeSpace(fcb);
+       if (result)
+               goto exit;
+
        /* Delete record */
        result = BTDeleteRecord(fcb, iterator);
        if (result)
 
        TrashCatalogIterator(vcb, descp->cd_parentcnid);
 
-       (void) BTFlushPath(fcb);
 exit:
+       (void) BTFlushPath(fcb);
        FREE(iterator, M_TEMP);
 
        return MacToVFSError(result);
        /* Update the node hint. */
        descp->cd_hint = iterator->hint.nodeNum;
 
-       (void) BTFlushPath(fcb);
-
 exit:
+       (void) BTFlushPath(fcb);
        FREE(iterator, M_TEMP);
 
        return MacToVFSError(result);
                return (0);     /* stop */
        }
 
-       /* Hide the private meta data directory. */
-       if (parentcnid == kRootDirID  &&
-           rec->recordType == kHFSPlusFolderRecord &&
-           rec->hfsPlusFolder.folderID == hfsmp->hfs_private_metadata_dir) {
-               return (1);     /* continue */
+       /* Hide the private meta data directory and journal files */
+       if (parentcnid == kRootDirID) {
+               if ((rec->recordType == kHFSPlusFolderRecord) &&
+                   (rec->hfsPlusFolder.folderID == hfsmp->hfs_private_metadata_dir)) {
+                       return (1);     /* continue */
+               }
+               if (hfsmp->jnl &&
+                   (rec->recordType == kHFSPlusFileRecord) &&
+                   ((rec->hfsPlusFile.fileID == hfsmp->hfs_jnlfileid) ||
+                    (rec->hfsPlusFile.fileID == hfsmp->hfs_jnlinfoblkid))) {
+
+                       return (1);     /* continue */
+               }
        }
 
+
        cep = &list->entry[list->realentries++];
 
        if (state->stdhfs) {
 struct read_state {
        u_int32_t       cbs_parentID;
        u_int32_t       cbs_hiddenDirID;
+       u_int32_t       cbs_hiddenJournalID;
+       u_int32_t       cbs_hiddenInfoBlkID;
        off_t           cbs_lastoffset;
        struct uio *    cbs_uio;
        ExtendedVCB *   cbs_vcb;
            catent.d_type == DT_DIR)
                goto lastitem;
 
+       /* Hide the journal files */
+       if ((curID == kRootDirID) &&
+           (catent.d_type == DT_REG) &&
+           ((catent.d_fileno == state->cbs_hiddenJournalID) ||
+            (catent.d_fileno == state->cbs_hiddenInfoBlkID))) {
+
+               return (1);     /* skip and continue */
+       }
+
        state->cbs_lastoffset = state->cbs_uio->uio_offset;
 
        /* if this entry won't fit then we're done */
                goto cleanup;
 
        state.cbs_hiddenDirID = hfsmp->hfs_private_metadata_dir;
+       if (hfsmp->jnl) {
+               state.cbs_hiddenJournalID = hfsmp->hfs_jnlfileid;
+               state.cbs_hiddenInfoBlkID = hfsmp->hfs_jnlinfoblkid;
+       }
+
        state.cbs_lastoffset = cip->currentOffset;
        state.cbs_vcb = vcb;
        state.cbs_uio = uio;
        case kHFSPlusFileRecord:
                cnid = crp->hfsPlusFile.fileID;
                break;
+       default:
+               panic("hfs: getcnid: unknown recordType (crp @ 0x%x)\n", crp);
+               break;
        }
+
        return (cnid);
 }
 
        case kHFSPlusFolderThreadRecord:
                cnid = recp->hfsPlusThread.parentID;
                break;
+       default:
+               panic("hfs: getparentcnid: unknown recordType (crp @ 0x%x)\n", recp);
+               break;
        }
+
        return (cnid);
 }
 
 
        int recycle = 0;
        int forkcount = 0;
        int truncated = 0;
+       int started_tr = 0, grabbed_lock = 0;
 
        if (prtactive && vp->v_usecount != 0)
                vprint("hfs_inactive: pushing active", vp);
            vp->v_type == VREG &&
            (VTOF(vp)->ff_blocks != 0)) {                       
                error = VOP_TRUNCATE(vp, (off_t)0, IO_NDELAY, NOCRED, p);
-               if (error) goto out;
                truncated = 1;
+               // have to do this to prevent the lost ubc_info panic
+               SET(cp->c_flag, C_TRANSIT);
                recycle = 1;
+               if (error) goto out;
        }
 
        /*
                cp->c_flag &= ~C_DELETED;
                cp->c_rdev = 0;
                
+               // XXXdbg
+               hfs_global_shared_lock_acquire(hfsmp);
+               grabbed_lock = 1;
+               if (hfsmp->jnl) {
+                   if (journal_start_transaction(hfsmp->jnl) != 0) {
+                               error = EINVAL;
+                               goto out;
+                   }
+                   started_tr = 1;
+               }
+
                /* Lock catalog b-tree */
                error = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_EXCLUSIVE, p);
                if (error) goto out;
                if (HFSTOVCB(hfsmp)->vcbSigWord == kHFSPlusSigWord)
                        cp->c_flag |= C_MODIFIED;
        }
-        if (cp->c_flag & (C_ACCESS | C_CHANGE | C_MODIFIED | C_UPDATE)) {
-                tv = time;
-                VOP_UPDATE(vp, &tv, &tv, 0);
-        }
+
+       if (cp->c_flag & (C_ACCESS | C_CHANGE | C_MODIFIED | C_UPDATE)) {
+               tv = time;
+               VOP_UPDATE(vp, &tv, &tv, 0);
+       }
 out:
+       // XXXdbg - have to do this because a goto could have come here
+       if (started_tr) {
+           journal_end_transaction(hfsmp->jnl);
+           started_tr = 0;
+       }
+       if (grabbed_lock) {
+               hfs_global_shared_lock_release(hfsmp);
+       }
+
        VOP_UNLOCK(vp, 0, p);
        /*
         * If we are done with the vnode, reclaim it
                        retval = ENOENT;
                        goto exit;
                }
+
+               /* Hide private journal files */
+               if (hfsmp->jnl &&
+                       (cp->c_parentcnid == kRootDirID) &&
+                       ((cp->c_cnid == hfsmp->hfs_jnlfileid) ||
+                       (cp->c_cnid == hfsmp->hfs_jnlinfoblkid))) {
+                   retval = ENOENT;
+                       goto exit;
+               }
+        
                if (wantrsrc && rvp != NULL) {
                        vp = rvp;
                        rvp = NULL;
 
 enum {
        kHFSSigWord             = 0x4244,       /* 'BD' in ASCII */
        kHFSPlusSigWord         = 0x482B,       /* 'H+' in ASCII */
+       kHFSJSigWord            = 0x484a,       /* 'HJ' in ASCII */
        kHFSPlusVersion         = 0x0004,       /* will change as format changes */
                                                /* version 4 shipped with Mac OS 8.1 */
-       kHFSPlusMountVersion    = 0x31302E30    /* '10.0' for Mac OS X */
+       kHFSPlusMountVersion    = 0x31302E30,   /* '10.0' for Mac OS X */
+       kHFSJMountVersion       = 0x4846534a    /* 'HFSJ' for journaled HFS+ on OS X */
 };
 
 
        kHFSVolumeNoCacheRequiredBit = 10,              /* don't cache volume blocks (i.e. RAM or ROM disk) */
        kHFSBootVolumeInconsistentBit = 11,             /* boot volume is inconsistent (System 7.6 and later) */
        kHFSCatalogNodeIDsReusedBit = 12,
-                                                       /* Bits 13-14 are reserved for future use */
+       kHFSVolumeJournaledBit = 13,                    /* this volume has a journal on it */
+                                                       /* Bit 14 is reserved for future use */
        kHFSVolumeSoftwareLockBit       = 15,           /* volume is locked by software */
 
        kHFSVolumeHardwareLockMask      = 1 << kHFSVolumeHardwareLockBit,
        kHFSVolumeNoCacheRequiredMask = 1 << kHFSVolumeNoCacheRequiredBit,
        kHFSBootVolumeInconsistentMask = 1 << kHFSBootVolumeInconsistentBit,
        kHFSCatalogNodeIDsReusedMask = 1 << kHFSCatalogNodeIDsReusedBit,
+       kHFSVolumeJournaledMask = 1 << kHFSVolumeJournaledBit,
        kHFSVolumeSoftwareLockMask      = 1 << kHFSVolumeSoftwareLockBit,
        kHFSMDBAttributesMask           = 0x8380
 };
        u_int16_t       version;                /* == kHFSPlusVersion */
        u_int32_t       attributes;             /* volume attributes */
        u_int32_t       lastMountedVersion;     /* implementation version which last mounted volume */
-       u_int32_t       reserved;               /* reserved - initialized as zero */
+//XXXdbg       u_int32_t       reserved;               /* reserved - initialized as zero */
+       u_int32_t       journalInfoBlock;       /* block addr of journal info (if volume is journaled, zero otherwise) */
 
        u_int32_t       createDate;             /* date and time of volume creation */
        u_int32_t       modifyDate;             /* date and time of last modification */
        kBTVariableIndexKeysMask = 0x00000004   /* keys in index nodes are variable length */
 };
 
+/* JournalInfoBlock - Structure that describes where our journal lives.
+ * The volume header's journalInfoBlock field holds the block address of
+ * this record (zero when the volume is not journaled).
+ */
+struct JournalInfoBlock {
+       u_int32_t       flags;                // NOTE(review): presumably a combination of the kJI*Mask bits declared after this struct -- confirm
+       u_int32_t       device_signature[8];  // signature used to locate our device.
+       u_int64_t       offset;               // byte offset to the journal on the device
+       u_int64_t       size;                 // size in bytes of the journal
+       u_int32_t       reserved[32];         // spare words; no use is visible in this header
+};
+typedef struct JournalInfoBlock JournalInfoBlock;
+
+enum {   /* single-bit masks; NOTE(review): presumably stored in JournalInfoBlock.flags -- confirm */
+    kJIJournalInFSMask          = 0x00000001,   /* bit 0; name suggests the journal lives inside this file system */
+    kJIJournalOnOtherDeviceMask = 0x00000002,   /* bit 1; name suggests the journal lives on a separate device */
+    kJIJournalNeedInitMask      = 0x00000004    /* bit 2; name suggests the journal still has to be initialized */
+};
+
+
 #pragma options align=reset
 
 #ifdef __cplusplus
 
        fip->fdCreator = SWAP_BE32 (kHFSPlusCreator);   /* 'hfs+' */
        fip->fdFlags   = SWAP_BE16 (kHasBeenInited);
 
+       hfs_global_shared_lock_acquire(hfsmp);
+       if (hfsmp->jnl) {
+           if (journal_start_transaction(hfsmp->jnl) != 0) {
+                       hfs_global_shared_lock_release(hfsmp);
+                       return EINVAL;
+           }
+       }
+
        /* Create the indirect link directly in the catalog */
        result = cat_create(hfsmp, &desc, &attr, NULL);
 
-       if (linkcnid != NULL)
+       if (result == 0 && linkcnid != NULL)
                *linkcnid = attr.ca_fileid;
 
+       if (hfsmp->jnl) {
+           journal_end_transaction(hfsmp->jnl);
+       }
+       hfs_global_shared_lock_release(hfsmp);
+
        return (result);
 }
 
 
        /* Lock catalog b-tree */
        retval = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_EXCLUSIVE, p);
-       if (retval)
-               return retval;
+       if (retval) {
+           return retval;
+       }
 
        /*
         * If this is a new hardlink then we need to create the data
                bzero(&to_desc, sizeof(to_desc));
                to_desc.cd_parentcnid = hfsmp->hfs_privdir_desc.cd_cnid;
                to_desc.cd_cnid = cp->c_fileid;
+
                do {
                        /* get a unique indirect node number */
                        indnodeno = ((random() & 0x3fffffff) + 100);
                                cp->c_desc.cd_nameptr, &cp->c_desc.cd_cnid);
                if (retval) {
                        /* put it source file back */
+               // XXXdbg
+               #if 1
+                   {
+                       int err;
+                               err = cat_rename(hfsmp, &to_desc, &dcp->c_desc, &cp->c_desc, NULL);
+                               if (err)
+                                       panic("hfs_makelink: error %d from cat_rename backout 1", err);
+                   }
+               #else
                        (void) cat_rename(hfsmp, &to_desc, &dcp->c_desc, &cp->c_desc, NULL);
+               #endif
                        goto out;
                }
                cp->c_rdev = indnodeno;
                (void) cat_delete(hfsmp, &cp->c_desc, &cp->c_attr);
 
                /* Put the source file back */
+       // XXXdbg
+       #if 1
+               {
+                       int err;
+                       err = cat_rename(hfsmp, &to_desc, &dcp->c_desc, &cp->c_desc, NULL);
+                       if (err)
+                               panic("hfs_makelink: error %d from cat_rename backout 2", err);
+               }
+       #else
                (void) cat_rename(hfsmp, &to_desc, &dcp->c_desc, &cp->c_desc, NULL);
+       #endif
                goto out;
        }
 
                struct componentname *a_cnp;
        } */ *ap;
 {
+       struct hfsmount *hfsmp;
        struct vnode *vp = ap->a_vp;
        struct vnode *tdvp = ap->a_tdvp;
        struct componentname *cnp = ap->a_cnp;
        struct timeval tv;
        int error;
 
+       hfsmp = VTOHFS(vp);
+       
 #if HFS_DIAGNOSTIC
        if ((cnp->cn_flags & HASBUF) == 0)
                panic("hfs_link: no name");
        if (VTOVCB(tdvp)->vcbSigWord != kHFSPlusSigWord)
                return err_link(ap);    /* hfs disks don't support hard links */
        
-       if (VTOHFS(vp)->hfs_private_metadata_dir == 0)
+       if (hfsmp->hfs_private_metadata_dir == 0)
                return err_link(ap);    /* no private metadata dir, no links possible */
 
        if (tdvp != vp && (error = vn_lock(vp, LK_EXCLUSIVE, p))) {
                goto out1;
        }
 
+       /*
+        * Take the global shared lock and, on a journaled volume, open a
+        * transaction that covers the link-count and catalog updates below.
+        */
+       hfs_global_shared_lock_acquire(hfsmp);
+       if (hfsmp->jnl) {
+           if (journal_start_transaction(hfsmp->jnl) != 0) {
+                       hfs_global_shared_lock_release(hfsmp);
+                       /*
+                        * Leave through out1 rather than returning directly
+                        * so the lock taken on vp (when tdvp != vp) is
+                        * dropped by the common unwind code.
+                        * NOTE(review): this path skips the FREE_ZONE of
+                        * cnp->cn_pnbuf further down; confirm whether the
+                        * pathname buffer must also be released here.
+                        */
+                       error = EINVAL;
+                       goto out1;
+           }
+       }
+
        cp->c_nlink++;
        cp->c_flag |= C_CHANGE;
        tv = time;
+
        error = VOP_UPDATE(vp, &tv, &tv, 1);
-       if (!error)
-               error = hfs_makelink(VTOHFS(vp), cp, tdcp, cnp);
+       if (!error) {
+               error = hfs_makelink(hfsmp, cp, tdcp, cnp);
+       }
        if (error) {
                cp->c_nlink--;
                cp->c_flag |= C_CHANGE;
                tdcp->c_flag |= C_CHANGE | C_UPDATE;
                tv = time;
                (void) VOP_UPDATE(tdvp, &tv, &tv, 0);
-               hfs_volupdate(VTOHFS(vp), VOL_MKFILE,
+
+               hfs_volupdate(hfsmp, VOL_MKFILE,
                        (tdcp->c_cnid == kHFSRootFolderID));
        }
+
+       // XXXdbg - need to do this here as well because cp could have changed
+       //
+       // Keep the first failure: assigning the result straight to `error`
+       // would discard an earlier error from hfs_makelink() or the first
+       // VOP_UPDATE() and let a failed link report success.
+       {
+               int update_err;
+
+               update_err = VOP_UPDATE(vp, &tv, &tv, 1);
+               if (error == 0)
+                       error = update_err;
+       }
+
        FREE_ZONE(cnp->cn_pnbuf, cnp->cn_pnlen, M_NAMEI);
+
+       if (hfsmp->jnl) {
+           journal_end_transaction(hfsmp->jnl);
+       }
+       hfs_global_shared_lock_release(hfsmp);
+
 out1:
        if (tdvp != vp)
                VOP_UNLOCK(vp, 0, p);
 
                         * creation of files in the directory.
                         */
                        retval = VOP_ACCESS(dvp, VWRITE, cred, cnp->cn_proc);
-                       if (retval)
+                       if (retval) {
                                goto exit;
+                       }
                
                        cnp->cn_flags |= SAVENAME;
                        if (!(flags & LOCKPARENT))
 
        u_long  hfs_encoding;           /* encoding for this volume (standard HFS only) */
        struct  timezone hfs_timezone;  /* user time zone info (standard HFS only) */
        int     flags;                  /* mounting flags, see below */
+       int     journal_tbuffer_size;   /* size in bytes of the journal transaction buffer */
+       int     journal_flags;          /* flags to pass to journal_open/create */
+       int     journal_disable;        /* don't use journaling (potentially dangerous) */
 };
 
 #define HFSFSMNT_NOXONFILES    0x1     /* disable execute permissions for files */
 #define HFSFSMNT_WRAPPER       0x2     /* mount HFS wrapper (if it exists) */
+#define HFSFSMNT_EXTENDED_ARGS  0x4     /* indicates new fields after "flags" are valid */
+
 #endif /* __APPLE_API_UNSTABLE */
 
 #endif /* ! _HFS_MOUNT_H_ */
 
     int                                retval;
        off_t filebytes;
        u_long fileblocks;
+       struct hfsmount *hfsmp;
+       int started_tr = 0, grabbed_lock = 0;
 
        ioflag = ap->a_ioflag;
 
        if ((cp->c_flags & APPEND) && uio->uio_offset != fp->ff_size)
                return (EPERM);
 
+       // XXXdbg - don't allow modification of the journal or journal_info_block
+       if (VTOHFS(vp)->jnl && cp->c_datafork) {
+               struct HFSPlusExtentDescriptor *extd;
+
+               extd = &cp->c_datafork->ff_data.cf_extents[0];
+               if (extd->startBlock == VTOVCB(vp)->vcbJinfoBlock || extd->startBlock == VTOHFS(vp)->jnl_start) {
+                       return EPERM;
+               }
+       }
+
        writelimit = uio->uio_offset + uio->uio_resid;
 
        /*
        if(writelimit > filebytes) {
                bytesToAdd = writelimit - filebytes;
 
-               retval = hfs_chkdq(cp, (int64_t)(roundup(bytesToAdd, fp->ff_clumpsize)), 
+               retval = hfs_chkdq(cp, (int64_t)(roundup(bytesToAdd, vcb->blockSize)), 
                                   ap->a_cred, 0);
                if (retval)
                        return (retval);
        }
 #endif /* QUOTA */
 
+       hfsmp = VTOHFS(vp);
+       if (writelimit > filebytes) {
+               hfs_global_shared_lock_acquire(hfsmp);
+               grabbed_lock = 1;
+       }
+       if (hfsmp->jnl && (writelimit > filebytes)) {
+               if (journal_start_transaction(hfsmp->jnl) != 0) {
+                       hfs_global_shared_lock_release(hfsmp);
+                       return EINVAL;
+               }
+               started_tr = 1;
+       }
+
        while (writelimit > filebytes) {
        
                bytesToAdd = writelimit - filebytes;
                        (int)uio->uio_offset, uio->uio_resid, (int)fp->ff_size,  (int)filebytes, 0);
        }
 
+       // XXXdbg
+       if (started_tr) {
+               hfs_flushvolumeheader(hfsmp, MNT_NOWAIT, 0);
+               journal_end_transaction(hfsmp->jnl);
+               started_tr = 0;
+       }
+       if (grabbed_lock) {
+               hfs_global_shared_lock_release(hfsmp);
+               grabbed_lock = 0;
+       }
+
        if (UBCISVALID(vp) && retval == E_NONE) {
                off_t filesize;
                off_t zero_off;
     struct proc                *p = NULL;
     struct rl_entry *invalid_range;
     enum rl_overlaptype overlaptype;
+    int started_tr = 0, grabbed_lock = 0;
 
        /*
         * Check for underlying vnode requests and ensure that logical
        if (ap->a_bpn == NULL)
                return (0);
 
-       if (overflow_extents(fp) || fp->ff_unallocblocks) {
+       p = current_proc();
+       if (fp->ff_unallocblocks) {
                lockExtBtree = 1;
-               p = current_proc();
+
+               // XXXdbg
+               hfs_global_shared_lock_acquire(hfsmp);
+               grabbed_lock = 1;
+
+               if (hfsmp->jnl) {
+                       if (journal_start_transaction(hfsmp->jnl) != 0) {
+                               hfs_global_shared_lock_release(hfsmp);
+                               return EINVAL;
+                       } else {
+                               started_tr = 1;
+                       }
+               } 
+
                if (retval = hfs_metafilelocking(hfsmp, kHFSExtentsFileID, LK_EXCLUSIVE | LK_CANRECURSE, p)) {
+                       if (started_tr) {
+                               journal_end_transaction(hfsmp->jnl);
+                       }
+                       if (grabbed_lock) {
+                               hfs_global_shared_lock_release(hfsmp);
+                       }
                        return (retval);
-               }
+               }
+       } else if (overflow_extents(fp)) {
+               lockExtBtree = 1;
+               if (retval = hfs_metafilelocking(hfsmp, kHFSExtentsFileID, LK_EXCLUSIVE | LK_CANRECURSE, p)) {
+                       return retval;
+               }
        }
 
        /*
                }
 
                if (retval) {
-                       (void) hfs_metafilelocking(hfsmp, kHFSExtentsFileID, LK_RELEASE, p);
-                       return (retval);
-               }
+                       (void) hfs_metafilelocking(hfsmp, kHFSExtentsFileID, LK_RELEASE, p);
+                       if (started_tr) {
+                               hfs_flushvolumeheader(hfsmp, MNT_NOWAIT, 0);
+                               journal_end_transaction(hfsmp->jnl);
+                       }
+                       if (grabbed_lock) {
+                               hfs_global_shared_lock_release(hfsmp);
+                       }
+                       return (retval);
+               }
                VTOC(ap->a_vp)->c_flag |= C_MODIFIED;
        }
 
        if (lockExtBtree)
                (void) hfs_metafilelocking(hfsmp, kHFSExtentsFileID, LK_RELEASE, p);
 
+       // XXXdbg
+       if (started_tr) {
+               hfs_flushvolumeheader(hfsmp, MNT_NOWAIT, 0);
+               journal_end_transaction(hfsmp->jnl);
+               started_tr = 0;
+       }
+       if (grabbed_lock) {
+               hfs_global_shared_lock_release(hfsmp);
+               grabbed_lock = 0;
+       }
+                       
     if (retval == E_NONE) {
         /* Adjust the mapping information for invalid file ranges: */
         overlaptype = rl_scan(&fp->ff_invalidranges,
        }
        
        frag->b_vp = NULL;
+       //
+       // XXXdbg - in the case that this is a meta-data block, it won't affect
+       //          the journal because this bp is for a physical disk block,
+       //          not a logical block that is part of the catalog or extents
+       //          files.
        SET(frag->b_flags, B_INVAL);
        brelse(frag);
        
        off_t filebytes;
        u_long fileblocks;
        int blksize;
+       struct hfsmount *hfsmp;
 
        if (vp->v_type != VREG && vp->v_type != VLNK)
                return (EISDIR);        /* cannot truncate an HFS directory! */
        if ((!ISHFSPLUS(VTOVCB(vp))) && (length > (off_t)MAXHFSFILESIZE))
                return (EFBIG);
 
+       hfsmp = VTOHFS(vp);
 
        tv = time;
        retval = E_NONE;
         */
        if (length > fp->ff_size) {
 #if QUOTA
-               retval = hfs_chkdq(cp, (int64_t)(roundup(length - filebytes, fp->ff_clumpsize)),
+               retval = hfs_chkdq(cp, (int64_t)(roundup(length - filebytes, blksize)),
                                ap->a_cred, 0);
                if (retval)
                        goto Err_Exit;
                        if (suser(ap->a_cred, NULL) != 0)
                                eflags |= kEFReserveMask;  /* keep a reserve */
 
+                       // XXXdbg - take the global shared lock and open a journal
+                       // transaction around the extent allocation below.
+                       hfs_global_shared_lock_acquire(hfsmp);
+                       if (hfsmp->jnl) {
+                               if (journal_start_transaction(hfsmp->jnl) != 0) {
+                                       /*
+                                        * Drop the global lock before bailing out;
+                                        * the other failure branch in this path
+                                        * releases it before jumping to Err_Exit
+                                        * too.
+                                        */
+                                       hfs_global_shared_lock_release(hfsmp);
+                                       retval = EINVAL;
+                                       goto Err_Exit;
+                               }
+                       }
+
                        /* lock extents b-tree (also protects volume bitmap) */
                        retval = hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, LK_EXCLUSIVE, ap->a_p);
-                       if (retval)
+                       if (retval) {
+                               if (hfsmp->jnl) {
+                                       journal_end_transaction(hfsmp->jnl);
+                               } 
+                               hfs_global_shared_lock_release(hfsmp);
+
                                goto Err_Exit;
+                       }
 
                        while ((length > filebytes) && (retval == E_NONE)) {
                                bytesToAdd = length - filebytes;
                                        break;
                                }
                        } /* endwhile */
+
                        (void) hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, LK_RELEASE, ap->a_p);
+
+                       // XXXdbg
+                       if (hfsmp->jnl) {
+                               hfs_flushvolumeheader(hfsmp, MNT_NOWAIT, 0);
+                               journal_end_transaction(hfsmp->jnl);
+                       } 
+                       hfs_global_shared_lock_release(hfsmp);
+
                        if (retval)
                                goto Err_Exit;
 
 #if QUOTA
                  off_t savedbytes = ((off_t)fp->ff_blocks * (off_t)blksize);
 #endif /* QUOTA */
+                 // XXXdbg - take the global shared lock and open a journal
+                 // transaction around the extent truncation below.
+                 hfs_global_shared_lock_acquire(hfsmp);
+                       if (hfsmp->jnl) {
+                               if (journal_start_transaction(hfsmp->jnl) != 0) {
+                                       /*
+                                        * Drop the global lock before bailing out;
+                                        * the other failure branch in this path
+                                        * releases it before jumping to Err_Exit
+                                        * too.
+                                        */
+                                       hfs_global_shared_lock_release(hfsmp);
+                                       retval = EINVAL;
+                                       goto Err_Exit;
+                               }
+                       }
+
                        /* lock extents b-tree (also protects volume bitmap) */
                        retval = hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, LK_EXCLUSIVE, ap->a_p);
-                       if (retval)
+                       if (retval) {
+                               if (hfsmp->jnl) {
+                                       journal_end_transaction(hfsmp->jnl);
+                               }
+                               hfs_global_shared_lock_release(hfsmp);
                                goto Err_Exit;
+                       }
                        
                        if (fp->ff_unallocblocks == 0)
                                retval = MacToVFSError(TruncateFileC(VTOVCB(vp),
                                                (FCB*)fp, length, false));
 
                        (void) hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, LK_RELEASE, ap->a_p);
+
+                       // XXXdbg
+                       if (hfsmp->jnl) {
+                               hfs_flushvolumeheader(hfsmp, MNT_NOWAIT, 0);
+                               journal_end_transaction(hfsmp->jnl);
+                       }
+                       hfs_global_shared_lock_release(hfsmp);
+
                        filebytes = (off_t)fp->ff_blocks * (off_t)blksize;
                        if (retval)
                                goto Err_Exit;
        int retval, retval2;
        UInt32 blockHint;
        UInt32 extendFlags =0;   /* For call to ExtendFileC */
+       struct hfsmount *hfsmp;
+
+       hfsmp = VTOHFS(vp);
 
        *(ap->a_bytesallocated) = 0;
        fileblocks = fp->ff_blocks;
                moreBytesRequested = length - filebytes;
                
 #if QUOTA
-               retval = hfs_chkdq(cp, (int64_t)(roundup(moreBytesRequested, fp->ff_clumpsize)), 
+               retval = hfs_chkdq(cp,
+                               (int64_t)(roundup(moreBytesRequested, VTOVCB(vp)->blockSize)), 
                                ap->a_cred, 0);
                if (retval)
                        return (retval);
 
 #endif /* QUOTA */
+               // XXXdbg - take the global shared lock and open a journal
+               // transaction around the extent allocation below.
+               hfs_global_shared_lock_acquire(hfsmp);
+               if (hfsmp->jnl) {
+                       if (journal_start_transaction(hfsmp->jnl) != 0) {
+                               /*
+                                * Drop the global lock before bailing out; the
+                                * other failure branch in this path releases it
+                                * before jumping to Err_Exit too.
+                                */
+                               hfs_global_shared_lock_release(hfsmp);
+                               retval = EINVAL;
+                               goto Err_Exit;
+                       }
+               }
+
                /* lock extents b-tree (also protects volume bitmap) */
                retval = hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, LK_EXCLUSIVE, ap->a_p);
-               if (retval) goto Err_Exit;
+               if (retval) {
+                       if (hfsmp->jnl) {
+                               journal_end_transaction(hfsmp->jnl);
+                       }
+                       hfs_global_shared_lock_release(hfsmp);
+                       goto Err_Exit;
+               }
 
                retval = MacToVFSError(ExtendFileC(VTOVCB(vp),
                                                (FCB*)fp,
 
                *(ap->a_bytesallocated) = actualBytesAdded;
                filebytes = (off_t)fp->ff_blocks * (off_t)VTOVCB(vp)->blockSize;
+
                (void) hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, LK_RELEASE, ap->a_p);
 
+               // XXXdbg
+               if (hfsmp->jnl) {
+                       hfs_flushvolumeheader(hfsmp, MNT_NOWAIT, 0);
+                       journal_end_transaction(hfsmp->jnl);
+               }
+               hfs_global_shared_lock_release(hfsmp);
+
                /*
                 * if we get an error and no changes were made then exit
                 * otherwise we must do the VOP_UPDATE to reflect the changes
                        (void) vinvalbuf(vp, vflags, ap->a_cred, ap->a_p, 0, 0);
                }
 
+               // XXXdbg - take the global shared lock and open a journal
+               // transaction around the extent truncation below.
+               hfs_global_shared_lock_acquire(hfsmp);
+               if (hfsmp->jnl) {
+                       if (journal_start_transaction(hfsmp->jnl) != 0) {
+                               /*
+                                * Drop the global lock before bailing out; the
+                                * other failure branch in this path releases it
+                                * before jumping to Err_Exit too.
+                                */
+                               hfs_global_shared_lock_release(hfsmp);
+                               retval = EINVAL;
+                               goto Err_Exit;
+                       }
+               }
+
                /* lock extents b-tree (also protects volume bitmap) */
                retval = hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, LK_EXCLUSIVE, ap->a_p);
-               if (retval) goto Err_Exit;
+               if (retval) {
+                       if (hfsmp->jnl) {
+                               journal_end_transaction(hfsmp->jnl);
+                       }
+                       hfs_global_shared_lock_release(hfsmp);
+
+                       goto Err_Exit;
+               }                       
 
                retval = MacToVFSError(
                             TruncateFileC(
                                             false));
                (void) hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, LK_RELEASE, ap->a_p);
                filebytes = (off_t)fp->ff_blocks * (off_t)VTOVCB(vp)->blockSize;
+
+               if (hfsmp->jnl) {
+                       hfs_flushvolumeheader(hfsmp, MNT_NOWAIT, 0);
+                       journal_end_transaction(hfsmp->jnl);
+               }
+               hfs_global_shared_lock_release(hfsmp);
+               
+
                /*
                 * if we get an error and no changes were made then exit
                 * otherwise we must do the VOP_UPDATE to reflect the changes
        } */ *ap;
 {
        int retval = 0;
-#if BYTE_ORDER == LITTLE_ENDIAN
        register struct buf *bp = ap->a_bp;
        register struct vnode *vp = bp->b_vp;
+#if BYTE_ORDER == LITTLE_ENDIAN
        BlockDescriptor block;
 
        /* Trap B-Tree writes */
        }
 #endif
        /* This buffer shouldn't be locked anymore but if it is clear it */
-       if (ISSET(ap->a_bp->b_flags, B_LOCKED)) {
-               CLR(ap->a_bp->b_flags, B_LOCKED);
+       if (ISSET(bp->b_flags, B_LOCKED)) {
+           // XXXdbg
+           if (VTOHFS(vp)->jnl) {
+                       panic("hfs: CLEARING the lock bit on bp 0x%x\n", bp);
+           }
+               CLR(bp->b_flags, B_LOCKED);
                printf("hfs_bwrite: called with lock bit set\n");
        }
        retval = vn_bwrite (ap);
 
        CatalogRecord * myCurrentDataPtr;
        CatPosition * myCatPositionPtr;
        BTScanState myBTScanState;
+       void *user_start = NULL;
+       int   user_len;
 
        /* XXX Parameter check a_searchattrs? */
 
        MALLOC( attributesBuffer, void *, eachReturnBufferSize, M_TEMP, M_WAITOK );
        variableBuffer = (void*)((char*) attributesBuffer + fixedBlockSize);
 
+       // XXXdbg - have to lock the user's buffer so we don't fault
+       // while holding the shared catalog file lock.  see the comment
+       // in hfs_readdir() for more details.
+       //
+       if (VTOHFS(ap->a_vp)->jnl && ap->a_uio->uio_segflg == UIO_USERSPACE) {
+               user_start = ap->a_uio->uio_iov->iov_base;
+               user_len   = ap->a_uio->uio_iov->iov_len;
+
+               if ((err = vslock(user_start, user_len)) != 0) {
+                       user_start = NULL;
+                       goto ExitThisRoutine;
+               }
+       }
+
        /* Lock catalog b-tree */
        err = hfs_metafilelocking(VTOHFS(ap->a_vp), kHFSCatalogFileID, LK_SHARED, p);
        if (err)
 ExitThisRoutine:
         FREE( attributesBuffer, M_TEMP );
 
+       if (VTOHFS(ap->a_vp)->jnl && user_start) {
+               vsunlock(user_start, user_len, TRUE);
+       }
+
        return (MacToVFSError(err));
 }
 
                goto exit;
        }
 
+       /* Hide the private journal files */
+       if (VTOHFS(root_vp)->jnl &&
+           ((c_attr.ca_fileid == VTOHFS(root_vp)->hfs_jnlfileid) ||
+            (c_attr.ca_fileid == VTOHFS(root_vp)->hfs_jnlinfoblkid))) {
+               err = 0;
+               goto exit;
+       }
+
        if (returnAttrList->commonattr & ATTR_CMN_NAME) {
                cat_convertkey(VTOHFS(root_vp), key, rec, &c_desc);
        } else {
 
 #include <sys/quota.h>
 #include <sys/disk.h>
 
+// XXXdbg
+#include <vfs/vfs_journal.h>
+
 #include <miscfs/specfs/specdev.h>
 #include <hfs/hfs_mount.h>
 
                    (HFSTOVCB(hfsmp)->vcbSigWord == kHFSPlusSigWord)) {
                        /* setup private/hidden directory for unlinked files */
                        hfsmp->hfs_private_metadata_dir = FindMetaDataDirectory(HFSTOVCB(hfsmp));
+                       if (hfsmp->jnl)
+                               hfs_remove_orphans(hfsmp);
                }
 
                if (args.fspec == 0) {
                goto error_exit;
        }
 
-       
        /* Set the mount flag to indicate that we support volfs  */
        mp->mnt_flag |= MNT_DOVOLFS;
     if (VFSTOVCB(mp)->vcbSigWord == kHFSSigWord) {
        mp->mnt_flag |= MNT_FIXEDSCRIPTENCODING;
     }
        (void) copyinstr(path, mp->mnt_stat.f_mntonname, MNAMELEN-1, &size);
+
        bzero(mp->mnt_stat.f_mntonname + size, MNAMELEN - size);
        (void) copyinstr(args.fspec, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, &size);
        bzero(mp->mnt_stat.f_mntfromname + size, MNAMELEN - size);
 
        vcb->vcbLsMod           = to_bsd_time(SWAP_BE32(vhp->modifyDate));
        vcb->vcbAtrb            = (UInt16) SWAP_BE32 (vhp->attributes); /* VCB only uses lower 16 bits */
+       vcb->vcbJinfoBlock  = SWAP_BE32(vhp->journalInfoBlock);
        vcb->vcbClpSiz          = SWAP_BE32 (vhp->rsrcClumpSize);
        vcb->vcbNxtCNID         = SWAP_BE32 (vhp->nextCatalogID);
        vcb->vcbVolBkUp         = to_bsd_time(SWAP_BE32(vhp->backupDate));
 }
 
 
+/*
+ * get_raw_device - look up and open the raw ("r"-prefixed) device node that
+ * corresponds to the device path in fspec.
+ *
+ *   fspec    path of the device; copied in with copyinstr() when is_user is
+ *            non-zero, otherwise treated as a kernel string
+ *   is_user  non-zero if fspec is a user-space address
+ *   ronly    non-zero to open with FREAD only, otherwise FREAD|FWRITE
+ *   rvp      out: the opened vnode on success, NULL on any failure
+ *   cred     currently unused (the open is done with FSCRED); kept so the
+ *            signature stays the same for existing callers
+ *   p        process on whose behalf the lookup and open are done
+ *
+ * The last path component gets an 'r' inserted in front of it (for example
+ * ".../disk0s2" becomes ".../rdisk0s2") and the result is passed to namei().
+ *
+ * Returns 0 on success or an errno value on failure.
+ */
+static int
+get_raw_device(char *fspec, int is_user, int ronly, struct vnode **rvp, struct ucred *cred, struct proc *p)
+{
+       char            *rawbuf;
+       char            *dp;
+       size_t           namelen;
+       struct nameidata nd;
+       int              retval;
+
+       *rvp = NULL;
+
+       MALLOC(rawbuf, char *, MAXPATHLEN, M_HFSMNT, M_WAITOK);
+       if (rawbuf == NULL) {
+               return ENOMEM;
+       }
+
+       /*
+        * Either way we leave at least one spare byte in the buffer so the
+        * 'r' inserted below can not run past the end of rawbuf.
+        */
+       if (is_user) {
+               retval = copyinstr(fspec, rawbuf, MAXPATHLEN - 1, &namelen);
+               if (retval != E_NONE) {
+                       goto error_exit;
+               }
+       } else {
+               if (strlen(fspec) >= MAXPATHLEN - 1) {
+                       retval = ENAMETOOLONG;
+                       goto error_exit;
+               }
+               strcpy(rawbuf, fspec);
+       }
+
+       /* make sure it's null terminated */
+       rawbuf[MAXPATHLEN-1] = '\0';
+
+       /*
+        * Find the start of the last path component: walk back from the
+        * terminating NUL until the previous character is a '/' or we hit
+        * the start of the buffer.  dp never moves outside of rawbuf.
+        */
+       dp = rawbuf + strlen(rawbuf);
+       while (dp > rawbuf && dp[-1] != '/') {
+               dp--;
+       }
+
+       /* make room for and insert the 'r' for the raw device */
+       memmove(dp+1, dp, strlen(dp)+1);
+       *dp = 'r';
+
+       NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, rawbuf, p);
+       retval = namei(&nd);
+       if (retval != E_NONE) {
+               /* nd.ni_vp is not valid after a failed lookup; don't touch it */
+               DBG_ERR(("hfs_mountfs: can't open raw device for journal: %s, error %d\n", rawbuf, retval));
+               goto error_exit;
+       }
+
+       retval = VOP_OPEN(nd.ni_vp, ronly ? FREAD : FREAD|FWRITE, FSCRED, p);
+       if (retval != 0) {
+               /* drop the reference namei() gave us so the vnode isn't leaked */
+               vrele(nd.ni_vp);
+               goto error_exit;
+       }
+
+       *rvp = nd.ni_vp;
+       retval = 0;
+
+  error_exit:
+       /* single release point for the path buffer (success and failure) */
+       FREE(rawbuf, M_HFSMNT);
+       return retval;
+}
+
+
+
 /*
  * Common code for mount and mountroot
  */
        u_int32_t blksize;
        u_int32_t minblksize;
        u_int32_t iswritable;
+       daddr_t   mdb_offset;
 
        dev = devvp->v_rdev;
        cred = p ? p->p_ucred : NOCRED;
                return (retval);
        }
 
+       mdb_offset = HFS_PRI_SECTOR(blksize);
        if ((retval = meta_bread(devvp, HFS_PRI_SECTOR(blksize), blksize, cred, &bp))) {
                goto error_exit;
        }
        bzero(hfsmp, sizeof(struct hfsmount));
 
        simple_lock_init(&hfsmp->hfs_renamelock);
-
+       
        /*
        *  Init the volume information structure
        */
        } else /* Mount an HFS Plus disk */ {
                HFSPlusVolumeHeader *vhp;
                off_t embeddedOffset;
+               int   jnl_disable = 0;
        
                /* Get the embedded Volume Header */
                if (SWAP_BE16(mdbp->drEmbedSigWord) == kHFSPlusSigWord) {
 
                        hfsmp->hfs_phys_block_count = disksize / blksize;
        
-                       retval = meta_bread(devvp, (embeddedOffset / blksize) +
-                                       HFS_PRI_SECTOR(blksize), blksize, cred, &bp);
+                       mdb_offset = (embeddedOffset / blksize) + HFS_PRI_SECTOR(blksize);
+                       retval = meta_bread(devvp, mdb_offset, blksize, cred, &bp);
                        if (retval)
                                goto error_exit;
                        bcopy(bp->b_data + HFS_PRI_OFFSET(blksize), mdbp, 512);
                        vhp = (HFSPlusVolumeHeader*) mdbp;
                }
 
+               // XXXdbg
+               //
+               hfsmp->jnl = NULL;
+               hfsmp->jvp = NULL;
+               if (args != NULL && (args->flags & HFSFSMNT_EXTENDED_ARGS) && args->journal_disable) {
+                   jnl_disable = 1;
+               }
+                               
+               //
+               // We only initialize the journal here if the last person
+               // to mount this volume was journaling aware.  Otherwise
+               // we delay journal initialization until later at the end
+               // of hfs_MountHFSPlusVolume() because the last person who
+               // mounted it could have messed things up behind our back
+               // (so we need to go find the .journal file, make sure it's
+               // the right size, re-sync up if it was moved, etc).
+               //
+               if (   (SWAP_BE32(vhp->lastMountedVersion) == kHFSJMountVersion)
+                       && (SWAP_BE32(vhp->attributes) & kHFSVolumeJournaledMask)
+                       && !jnl_disable) {
+                       
+                       // if we're able to init the journal, mark the mount
+                       // point as journaled.
+                       //
+                       if (hfs_early_journal_init(hfsmp, vhp, args, embeddedOffset, mdb_offset, mdbp, cred) == 0) {
+                               mp->mnt_flag |= MNT_JOURNALED;
+                       } else {
+                               retval = EINVAL;
+                               goto error_exit;
+                       }
+               }
+               // XXXdbg
+       
                (void) hfs_getconverter(0, &hfsmp->hfs_get_unicode, &hfsmp->hfs_get_hfsname);
 
-               retval = hfs_MountHFSPlusVolume(hfsmp, vhp, embeddedOffset, disksize, p);
+               retval = hfs_MountHFSPlusVolume(hfsmp, vhp, embeddedOffset, disksize, p, args);
                /*
                 * If the backend didn't like our physical blocksize
                 * then retry with physical blocksize of 512.
                        hfsmp->hfs_phys_block_size = blksize;
  
                        /* Try again with a smaller block size... */
-                       retval = hfs_MountHFSPlusVolume(hfsmp, vhp, embeddedOffset, disksize, p);
+                       retval = hfs_MountHFSPlusVolume(hfsmp, vhp, embeddedOffset, disksize, p, args);
                }
                if (retval)
                        (void) hfs_relconverter(0);
        if (mdbp)
                FREE(mdbp, M_TEMP);
        (void)VOP_CLOSE(devvp, ronly ? FREAD : FREAD|FWRITE, cred, p);
+       if (hfsmp && hfsmp->jvp && hfsmp->jvp != hfsmp->hfs_devvp) {
+           (void)VOP_CLOSE(hfsmp->jvp, ronly ? FREAD : FREAD|FWRITE, cred, p);
+               hfsmp->jvp = NULL;
+       }
        if (hfsmp) {
                FREE(hfsmp, M_HFSMNT);
                mp->mnt_data = (qaddr_t)0;
        int retval = E_NONE;
        int flags;
        int force;
+       int started_tr = 0, grabbed_lock = 0;
 
        flags = 0;
        force = 0;
         * Flush out the b-trees, volume bitmap and Volume Header
         */
        if (hfsmp->hfs_fs_ronly == 0) {
+               hfs_global_shared_lock_acquire(hfsmp);
+               grabbed_lock = 1;
+           if (hfsmp->jnl) {
+                       journal_start_transaction(hfsmp->jnl);
+                       started_tr = 1;
+               }
+               
                retval = VOP_FSYNC(HFSTOVCB(hfsmp)->catalogRefNum, NOCRED, MNT_WAIT, p);
                if (retval && !force)
-                       return (retval);
-
+                       goto err_exit;
+               
                retval = VOP_FSYNC(HFSTOVCB(hfsmp)->extentsRefNum, NOCRED, MNT_WAIT, p);
                if (retval && !force)
-                       return (retval);
+                       goto err_exit;
+                       
+               // if we have an allocation file, sync it too so we don't leave dirty
+               // blocks around
+               if (HFSTOVCB(hfsmp)->allocationsRefNum) {
+                   if (retval = VOP_FSYNC(HFSTOVCB(hfsmp)->allocationsRefNum, NOCRED, MNT_WAIT, p)) {
+                       if (!force)
+                           goto err_exit;
+                   }
+               }
 
                if (retval = VOP_FSYNC(hfsmp->hfs_devvp, NOCRED, MNT_WAIT, p)) {
                        if (!force)
-                               return (retval);
+                               goto err_exit;
                }
                
                /* See if this volume is damaged, is so do not unmount cleanly */
                        HFSTOVCB(hfsmp)->vcbAtrb |= kHFSVolumeUnmountedMask;
                }
 
-               retval = hfs_flushvolumeheader(hfsmp, MNT_WAIT, 0);
+               retval = hfs_flushvolumeheader(hfsmp, MNT_WAIT, 1);
                if (retval) {
                        HFSTOVCB(hfsmp)->vcbAtrb &= ~kHFSVolumeUnmountedMask;
                        if (!force)
-                               return (retval);        /* could not flush everything */
+                               goto err_exit;  /* could not flush everything */
+               }
+
+               if (hfsmp->jnl) {
+                       journal_end_transaction(hfsmp->jnl);
+                       started_tr = 0;
+               }
+               if (grabbed_lock) {
+                       hfs_global_shared_lock_release(hfsmp);
+                       grabbed_lock = 0;
                }
        }
 
+       if (hfsmp->jnl) {
+               journal_flush(hfsmp->jnl);
+       }
+       
        /*
         *      Invalidate our caches and release metadata vnodes
         */
        if (HFSTOVCB(hfsmp)->vcbSigWord == kHFSSigWord)
                (void) hfs_relconverter(hfsmp->hfs_encoding);
 
+       // XXXdbg
+       if (hfsmp->jnl) {
+           journal_close(hfsmp->jnl);
+       }
+
+       if (hfsmp->jvp && hfsmp->jvp != hfsmp->hfs_devvp) {
+           retval = VOP_CLOSE(hfsmp->jvp, hfsmp->hfs_fs_ronly ? FREAD : FREAD|FWRITE,
+                              NOCRED, p);
+           vrele(hfsmp->jvp);
+               hfsmp->jvp = NULL;
+       }
+       // XXXdbg
+
        hfsmp->hfs_devvp->v_specflags &= ~SI_MOUNTEDON;
        retval = VOP_CLOSE(hfsmp->hfs_devvp,
                    hfsmp->hfs_fs_ronly ? FREAD : FREAD|FWRITE,
        FREE(hfsmp, M_HFSMNT);
        mp->mnt_data = (qaddr_t)0;
        return (0);
+
+  err_exit:
+       if (hfsmp->jnl && started_tr) {
+               journal_end_transaction(hfsmp->jnl);
+       }
+       if (grabbed_lock) {
+               hfs_global_shared_lock_release(hfsmp);
+       }
+       return retval;
 }
 
 
 }
 
 
+
+
 /*
  * Get file system statistics.
  */
 }
 
 
+//
+// XXXdbg -- this is a callback to be used by the journal to
+//           get meta data blocks flushed out to disk.
+//
+// XXXdbg -- be smarter and don't flush *every* block on each
+//           call.  try to only flush some so we don't wind up
+//           being too synchronous.
+//
+__private_extern__
+void
+hfs_sync_metadata(void *arg)
+{
+       struct mount *mp = (struct mount *)arg;
+       struct cnode *cp;
+       struct hfsmount *hfsmp;
+       ExtendedVCB *vcb;
+       struct vnode *meta_vp[3];
+       struct buf *bp;
+       int i, sectorsize, priIDSector, altIDSector, retval;
+       int error, allerror = 0;
+
+       hfsmp = VFSTOHFS(mp);
+       vcb = HFSTOVCB(hfsmp);
+
+       bflushq(BQ_META, mp);
+
+
+#if 1     // XXXdbg - I do not believe this is necessary...
+          //          but if I pull it out, then the journal
+             //          does not seem to get flushed properly
+             //          when it is closed....
+       
+       // now make sure the super block is flushed
+       sectorsize = hfsmp->hfs_phys_block_size;
+       priIDSector = (vcb->hfsPlusIOPosOffset / sectorsize) +
+                  HFS_PRI_SECTOR(sectorsize);
+       retval = meta_bread(hfsmp->hfs_devvp, priIDSector, sectorsize, NOCRED, &bp);
+       if (retval != 0) {
+               panic("hfs: sync_metadata: can't read super-block?! (retval 0x%x, priIDSector)\n",
+                         retval, priIDSector);
+       }
+
+       if (retval == 0 && (bp->b_flags & B_DELWRI) && (bp->b_flags & B_LOCKED) == 0) {
+           bwrite(bp);
+       } else if (bp) {
+           brelse(bp);
+       }
+
+       // the alternate super block...
+       // XXXdbg - we probably don't need to do this each and every time.
+       //          hfs_btreeio.c:FlushAlternate() should flag when it was
+       //          written...
+       altIDSector = (vcb->hfsPlusIOPosOffset / sectorsize) +
+                       HFS_ALT_SECTOR(sectorsize, hfsmp->hfs_phys_block_count);
+       retval = meta_bread(hfsmp->hfs_devvp, altIDSector, sectorsize, NOCRED, &bp);
+       if (retval == 0 && (bp->b_flags & B_DELWRI) && (bp->b_flags & B_LOCKED) == 0) {
+           bwrite(bp);
+       } else if (bp) {
+           brelse(bp);
+       }
+#endif
+       
+}
+
 /*
  * Go through the disk queues to initiate sandbagged IO;
  * go through the inodes to write those that have been modified;
                panic("update: rofs mod");
        };
 
+#if 0
+       // XXXdbg first go through and flush out any modified
+       //        meta data blocks so they go out in order...
+       bflushq(BQ_META, mp);
+       bflushq(BQ_LRU,  mp);
+       // only flush locked blocks if we're not doing journaling
+       if (hfsmp->jnl == NULL) {
+           bflushq(BQ_LOCKED, mp);
+       }
+#endif
+
        /*
         * Write back each 'modified' vnode
         */
                        simple_unlock(&mntvnode_slock);
                        goto loop;
                }
+
                simple_lock(&vp->v_interlock);
                nvp = vp->v_mntvnodes.le_next;
+
                cp = VTOC(vp);
 
+               // restart our whole search if this guy is locked
+               // or being reclaimed.
+               if (cp == NULL || vp->v_flag & (VXLOCK|VORECLAIM)) {
+                       simple_unlock(&vp->v_interlock);
+                       continue;
+               }
+
                if ((vp->v_flag & VSYSTEM) || (vp->v_type == VNON) ||
                    (((cp->c_flag & (C_ACCESS | C_CHANGE | C_MODIFIED | C_UPDATE)) == 0) &&
                    (vp->v_dirtyblkhd.lh_first == NULL) && !(vp->v_flag & VHASDIRTY))) {
                btvp = btvp = meta_vp[i];;
                if ((btvp==0) || (btvp->v_type == VNON) || (btvp->v_mount != mp))
                        continue;
+
                simple_lock(&btvp->v_interlock);
                cp = VTOC(btvp);
                if (((cp->c_flag & (C_ACCESS | C_CHANGE | C_MODIFIED | C_UPDATE)) == 0) &&
         */
 
        if (IsVCBDirty(vcb)) {
+               // XXXdbg - debugging, remove
+               if (hfsmp->jnl) {
+                       //printf("hfs: sync: strange, a journaled volume w/dirty VCB? jnl 0x%x hfsmp 0x%x\n",
+                       //        hfsmp->jnl, hfsmp);
+               }
+
                error = hfs_flushvolumeheader(hfsmp, waitfor, 0);
-               if (error)
-                       allerror = error;
+               if (error)
+                       allerror = error;
        }
 
+       if (hfsmp->jnl) {
+           journal_flush(hfsmp->jnl);
+       }
+       
+  err_exit:
        return (allerror);
 }
 
 }
 
 
+// XXXdbg
+#include <sys/filedesc.h>
+
+
 /*
  * HFS filesystem related variables.
  */
        extern u_int32_t hfs_encodingbias;
 
        /* all sysctl names at this level are terminal */
-       if (namelen != 1)
-               return (ENOTDIR);       /* overloaded */
 
        if (name[0] == HFS_ENCODINGBIAS)
                return (sysctl_int(oldp, oldlenp, newp, newlen,
                                &hfs_encodingbias));
+       else if (name[0] == 0x082969) {
+               // make the file system journaled...
+               struct vnode *vp = p->p_fd->fd_cdir, *jvp;
+               struct hfsmount *hfsmp;
+               ExtendedVCB *vcb;
+               int retval;
+               struct cat_attr jnl_attr, jinfo_attr;
+               struct cat_fork jnl_fork, jinfo_fork;
+               void *jnl = NULL;
+               
+               hfsmp = VTOHFS(vp);
+               if (hfsmp->hfs_fs_ronly) {
+                       return EROFS;
+               }
+               if (HFSTOVCB(hfsmp)->vcbSigWord == kHFSSigWord) {
+                       printf("hfs: can't make a plain hfs volume journaled.\n");
+                       return EINVAL;
+               }
+
+               if (hfsmp->jnl) {
+                   printf("hfs: volume @ mp 0x%x is already journaled!\n", vp->v_mount);
+                   return EAGAIN;
+               }
+
+               vcb = HFSTOVCB(hfsmp);
+               if (BTHasContiguousNodes(VTOF(vcb->catalogRefNum)) == 0 ||
+                       BTHasContiguousNodes(VTOF(vcb->extentsRefNum)) == 0) {
+
+                       printf("hfs: volume has a btree w/non-contiguous nodes.  can not enable journaling.\n");
+                       return EINVAL;
+               }
+
+               // make sure these both exist!
+               if (   GetFileInfo(vcb, kRootDirID, ".journal_info_block", &jinfo_attr, &jinfo_fork) == 0
+                       || GetFileInfo(vcb, kRootDirID, ".journal", &jnl_attr, &jnl_fork) == 0) {
+
+                       return EINVAL;
+               }
+
+               hfs_sync(hfsmp->hfs_mp, MNT_WAIT, FSCRED, p);
+               bflushq(BQ_META);
+
+               printf("hfs: Initializing the journal (joffset 0x%llx sz 0x%llx)...\n",
+                          (off_t)name[2], (off_t)name[3]);
+
+               jvp = hfsmp->hfs_devvp;
+               jnl = journal_create(jvp,
+                                                        (off_t)name[2] * (off_t)HFSTOVCB(hfsmp)->blockSize
+                                                        + HFSTOVCB(hfsmp)->hfsPlusIOPosOffset,
+                                                        (off_t)name[3],
+                                                        hfsmp->hfs_devvp,
+                                                        hfsmp->hfs_phys_block_size,
+                                                        0,
+                                                        0,
+                                                        hfs_sync_metadata, hfsmp->hfs_mp);
+
+               if (jnl == NULL) {
+                       printf("hfs: FAILED to create the journal!\n");
+                       if (jvp && jvp != hfsmp->hfs_devvp) {
+                               VOP_CLOSE(jvp, hfsmp->hfs_fs_ronly ? FREAD : FREAD|FWRITE, FSCRED, p);
+                       }
+                       jvp = NULL;
+
+                       return EINVAL;
+               } 
+
+               hfs_global_exclusive_lock_acquire(hfsmp);
+               
+               HFSTOVCB(hfsmp)->vcbJinfoBlock = name[1];
+               HFSTOVCB(hfsmp)->vcbAtrb |= kHFSVolumeJournaledMask;
+               hfsmp->jvp = jvp;
+               hfsmp->jnl = jnl;
+
+               // save this off for the hack-y check in hfs_remove()
+               hfsmp->jnl_start        = (u_int32_t)name[2];
+               hfsmp->hfs_jnlinfoblkid = jinfo_attr.ca_fileid;
+               hfsmp->hfs_jnlfileid    = jnl_attr.ca_fileid;
+
+               hfsmp->hfs_mp->mnt_flag |= MNT_JOURNALED;
+
+               hfs_global_exclusive_lock_release(hfsmp);
+               hfs_flushvolumeheader(hfsmp, MNT_WAIT, 1);
+
+               return 0;
+       } else if (name[0] == 0x031272) {
+               // clear the journaling bit 
+               struct vnode *vp = p->p_fd->fd_cdir;
+               struct hfsmount *hfsmp;
+               void *jnl;
+               int retval;
+               
+               hfsmp = VTOHFS(vp);
+               if (hfsmp->jnl == NULL) {
+                       return EINVAL;
+               }
+
+               printf("hfs: disabling journaling for mount @ 0x%x\n", vp->v_mount);
+
+               jnl = hfsmp->jnl;
+               
+               hfs_global_exclusive_lock_acquire(hfsmp);
+
+               // Lights out for you buddy!
+               hfsmp->jnl = NULL;
+               journal_close(jnl);
+
+               if (hfsmp->jvp && hfsmp->jvp != hfsmp->hfs_devvp) {
+                       VOP_CLOSE(hfsmp->jvp, hfsmp->hfs_fs_ronly ? FREAD : FREAD|FWRITE, FSCRED, p);
+               }
+               hfsmp->jnl = NULL;
+               hfsmp->jvp = NULL;
+               hfsmp->hfs_mp->mnt_flag &= ~MNT_JOURNALED;
+               hfsmp->jnl_start        = 0;
+               hfsmp->hfs_jnlinfoblkid = 0;
+               hfsmp->hfs_jnlfileid    = 0;
+               
+               HFSTOVCB(hfsmp)->vcbAtrb &= ~kHFSVolumeJournaledMask;
+               
+               hfs_global_exclusive_lock_release(hfsmp);
+               hfs_flushvolumeheader(hfsmp, MNT_WAIT, 1);
+
+               return 0;
+       }
 
        return (EOPNOTSUPP);
 }
                        --vcb->vcbNmFls;
                break;
        }
+
+       if (hfsmp->jnl) {
+               hfs_flushvolumeheader(hfsmp, 0, 0);
+       }
+
        return (0);
 }
 
        ByteCount namelen;
 
        sectorsize = hfsmp->hfs_phys_block_size;
-
        retval = bread(hfsmp->hfs_devvp, HFS_PRI_SECTOR(sectorsize), sectorsize, NOCRED, &bp);
        if (retval) {
                if (bp)
        DBG_ASSERT(bp->b_data != NULL);
        DBG_ASSERT(bp->b_bcount == size);
 
+       if (hfsmp->jnl) {
+               panic("hfs: standard hfs volumes should not be journaled!\n");
+       }
+
        mdb = (HFSMasterDirectoryBlock *)(bp->b_data + HFS_PRI_OFFSET(sectorsize));
     
        mdb->drCrDate   = SWAP_BE32 (UTCToLocal(to_hfs_time(vcb->vcbCrDate)));
 
                if (meta_bread(hfsmp->hfs_devvp, altIDSector, sectorsize, NOCRED, &alt_bp) == 0) {
                        bcopy(mdb, alt_bp->b_data + HFS_ALT_OFFSET(sectorsize), kMDBSize);
+
                        (void) VOP_BWRITE(alt_bp);
                } else if (alt_bp)
                        brelse(alt_bp);
 
        if (waitfor != MNT_WAIT)
                bawrite(bp);
-       else
+       else 
                retval = VOP_BWRITE(bp);
  
        MarkVCBClean( vcb );
        priIDSector = (vcb->hfsPlusIOPosOffset / sectorsize) +
                        HFS_PRI_SECTOR(sectorsize);
 
+       // XXXdbg
+       hfs_global_shared_lock_acquire(hfsmp);
+       if (hfsmp->jnl) {
+               if (journal_start_transaction(hfsmp->jnl) != 0) {
+                       hfs_global_shared_lock_release(hfsmp);
+                   return EINVAL;
+           }
+       }
+
        retval = meta_bread(hfsmp->hfs_devvp, priIDSector, sectorsize, NOCRED, &bp);
        if (retval) {
                if (bp)
                        brelse(bp);
+
+               if (hfsmp->jnl) {
+                       journal_end_transaction(hfsmp->jnl);
+               }
+               hfs_global_shared_lock_release(hfsmp);
+
                return (retval);
        }
 
+       if (hfsmp->jnl) {
+               journal_modify_block_start(hfsmp->jnl, bp);
+       }
+
        volumeHeader = (HFSPlusVolumeHeader *)((char *)bp->b_data + HFS_PRI_OFFSET(sectorsize));
 
        /*
 
                        if ( SWAP_BE32 (mdb->drCrDate) != vcb->localCreateDate )
                          {
+                               // XXXdbg
+                               if (hfsmp->jnl) {
+                                   journal_modify_block_start(hfsmp->jnl, bp2);
+                               }
+
                                mdb->drCrDate = SWAP_BE32 (vcb->localCreateDate);       /* pick up the new create date */
 
-                               (void) VOP_BWRITE(bp2);         /* write out the changes */
+                               // XXXdbg
+                               if (hfsmp->jnl) {
+                                       journal_modify_block_end(hfsmp->jnl, bp2);
+                               } else {
+                                       (void) VOP_BWRITE(bp2);         /* write out the changes */
+                               }
                          }
                        else
                          {
                  }     
        }
 
+// XXXdbg - only monkey around with the volume signature on non-root volumes
+//
+#if 0
+       if (hfsmp->jnl &&
+               hfsmp->hfs_fs_ronly == 0 &&
+               (HFSTOVFS(hfsmp)->mnt_flag & MNT_ROOTFS) == 0) {
+               
+               int old_sig = volumeHeader->signature;
+
+               if (vcb->vcbAtrb & kHFSVolumeUnmountedMask) {
+                       volumeHeader->signature = kHFSPlusSigWord;
+               } else {
+                       volumeHeader->signature = kHFSJSigWord;
+               }
+
+               if (old_sig != volumeHeader->signature) {
+                       altflush = 1;
+               }
+       }
+#endif
+// XXXdbg
+
        /* Note: only update the lower 16 bits worth of attributes */
        volumeHeader->attributes        = SWAP_BE32 ((SWAP_BE32 (volumeHeader->attributes) & 0xFFFF0000) + (UInt16) vcb->vcbAtrb);
-       volumeHeader->lastMountedVersion = SWAP_BE32 (kHFSPlusMountVersion);
+       volumeHeader->journalInfoBlock = SWAP_BE32(vcb->vcbJinfoBlock);
+       if (hfsmp->jnl) {
+               volumeHeader->lastMountedVersion = SWAP_BE32 (kHFSJMountVersion);
+       } else {
+               volumeHeader->lastMountedVersion = SWAP_BE32 (kHFSPlusMountVersion);
+       }
        volumeHeader->createDate        = SWAP_BE32 (vcb->localCreateDate);  /* volume create date is in local time */
        volumeHeader->modifyDate        = SWAP_BE32 (to_hfs_time(vcb->vcbLsMod));
        volumeHeader->backupDate        = SWAP_BE32 (to_hfs_time(vcb->vcbVolBkUp));
                        HFS_ALT_SECTOR(sectorsize, hfsmp->hfs_phys_block_count);
 
                if (meta_bread(hfsmp->hfs_devvp, altIDSector, sectorsize, NOCRED, &alt_bp) == 0) {
+                       if (hfsmp->jnl) {
+                               journal_modify_block_start(hfsmp->jnl, alt_bp);
+                       }
+
                        bcopy(volumeHeader, alt_bp->b_data + HFS_ALT_OFFSET(sectorsize), kMDBSize);
-                       (void) VOP_BWRITE(alt_bp);
+
+                       if (hfsmp->jnl) {
+                               journal_modify_block_end(hfsmp->jnl, alt_bp);
+                       } else {
+                               (void) VOP_BWRITE(alt_bp);
+                       }
                } else if (alt_bp)
                        brelse(alt_bp);
        }
 
-       if (waitfor != MNT_WAIT)
-               bawrite(bp);
-       else {
-               retval = VOP_BWRITE(bp);
-               /* When critical data changes, flush the device cache */
-               if (critical && (retval == 0)) {
+       // XXXdbg
+       if (hfsmp->jnl) {
+               journal_modify_block_end(hfsmp->jnl, bp);
+               journal_end_transaction(hfsmp->jnl);
+       } else {
+               if (waitfor != MNT_WAIT)
+                       bawrite(bp);
+               else {
+                   retval = VOP_BWRITE(bp);
+                   /* When critical data changes, flush the device cache */
+                   if (critical && (retval == 0)) {
                        (void) VOP_IOCTL(hfsmp->hfs_devvp, DKIOCSYNCHRONIZECACHE,
-                                       NULL, FWRITE, NOCRED, current_proc());
+                                        NULL, FWRITE, NOCRED, current_proc());
+                   }
                }
        }
+       hfs_global_shared_lock_release(hfsmp);
  
        vcb->vcbFlags &= 0x00FF;
        return (retval);
 
 
 
 static void ReleaseMetaFileVNode(struct vnode *vp);
+static int  hfs_late_journal_init(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, void *_args);
 
 u_int32_t GetLogicalBlockSize(struct vnode *vp);
 
 //*******************************************************************************
 
 OSErr hfs_MountHFSPlusVolume(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp,
-       off_t embeddedOffset, u_int64_t disksize, struct proc *p)
+       off_t embeddedOffset, u_int64_t disksize, struct proc *p, void *args)
 {
        register ExtendedVCB *vcb;
        struct cat_desc cndesc;
        UInt32 blockSize;
        OSErr retval;
 
-       if (SWAP_BE16(vhp->signature) != kHFSPlusSigWord ||
-           SWAP_BE16(vhp->version) != kHFSPlusVersion)
-               return (EINVAL);
+       // XXXdbg - added the kHFSJSigWord case
+       if ((SWAP_BE16(vhp->signature) != kHFSPlusSigWord &&
+                SWAP_BE16(vhp->signature) != kHFSJSigWord) ||
+           SWAP_BE16(vhp->version) != kHFSPlusVersion) {
+               // XXXdbg
+               printf("hfs: mount: sig 0x%x and version 0x%x are not HFS or HFS+.\n",
+                          vhp->signature, vhp->version);
+               return (EINVAL);
+       }
 
        /* Block size must be at least 512 and a power of 2 */
        blockSize = SWAP_BE32(vhp->blockSize);
                return (EINVAL);
    
        /* don't mount a writable volume if its dirty, it must be cleaned by fsck_hfs */
-       if (hfsmp->hfs_fs_ronly == 0 && (SWAP_BE32(vhp->attributes) & kHFSVolumeUnmountedMask) == 0)
+       if (hfsmp->hfs_fs_ronly == 0 && hfsmp->jnl == NULL && (SWAP_BE32(vhp->attributes) & kHFSVolumeUnmountedMask) == 0)
                return (EINVAL);
 
        /* Make sure we can live with the physical block size. */
        vcb = HFSTOVCB(hfsmp);
 
        vcb->vcbSigWord = SWAP_BE16(vhp->signature);
+
+       // XXXdbg - remap this in case we've mounted a dirty journaled volume
+       if (vcb->vcbSigWord == kHFSJSigWord) {
+               vcb->vcbSigWord = kHFSPlusSigWord;
+       }
+
+       vcb->vcbJinfoBlock = SWAP_BE32(vhp->journalInfoBlock);
        vcb->vcbLsMod   = to_bsd_time(SWAP_BE32(vhp->modifyDate));
        vcb->vcbAtrb    = (UInt16)SWAP_BE32(vhp->attributes);
        vcb->vcbClpSiz  = SWAP_BE32(vhp->rsrcClumpSize);
 
        /* mark the volume dirty (clear clean unmount bit) */
        vcb->vcbAtrb &= ~kHFSVolumeUnmountedMask;
+       if (hfsmp->jnl && hfsmp->hfs_fs_ronly == 0) {
+               hfs_flushvolumeheader(hfsmp, TRUE, TRUE);
+       }
 
        /*
         * all done with metadata files so we can unlock now...
 
        /* setup private/hidden directory for unlinked files */
        hfsmp->hfs_private_metadata_dir = FindMetaDataDirectory(vcb);
+       if (hfsmp->jnl && (hfsmp->hfs_fs_ronly == 0))
+               hfs_remove_orphans(hfsmp);
 
        if ( !(vcb->vcbAtrb & kHFSVolumeHardwareLockMask) )     // if the disk is not write protected
        {
                MarkVCBDirty( vcb );    // mark VCB dirty so it will be written
        }
 
+
+       //
+       // Check if we need to do late journal initialization.  This only
+       // happens if a previous version of MacOS X (or 9) touched the disk.
+       // In that case hfs_late_journal_init() will go re-locate the journal 
+       // and journal_info_block files and validate that they're still kosher.
+       //
+       if (   (vcb->vcbAtrb & kHFSVolumeJournaledMask)
+               && (SWAP_BE32(vhp->lastMountedVersion) != kHFSJMountVersion)
+               && (hfsmp->jnl == NULL)) {
+
+               retval = hfs_late_journal_init(hfsmp, vhp, args);
+               if (retval != 0) {
+                       hfsmp->jnl = NULL;
+                       goto ErrorExit;
+               } else if (hfsmp->jnl) {
+                       hfsmp->hfs_mp->mnt_flag |= MNT_JOURNALED;
+               }
+       } else if (hfsmp->jnl) {
+               struct cat_attr jinfo_attr, jnl_attr;
+               
+               // if we're here we need to fill in the fileid's for the
+               // journal and journal_info_block.
+               hfsmp->hfs_jnlinfoblkid = GetFileInfo(vcb, kRootDirID, ".journal_info_block", &jinfo_attr, NULL);
+               hfsmp->hfs_jnlfileid    = GetFileInfo(vcb, kRootDirID, ".journal", &jnl_attr, NULL);
+               if (hfsmp->hfs_jnlinfoblkid == 0 || hfsmp->hfs_jnlfileid == 0) {
+                       printf("hfs: danger! couldn't find the file-id's for the journal or journal_info_block\n");
+                       printf("hfs: jnlfileid %d, jnlinfoblkid %d\n", hfsmp->hfs_jnlfileid, hfsmp->hfs_jnlinfoblkid);
+               }
+       }
+
+
        return (0);
 
 ErrorExit:
        fndrinfo->frLocation.h = SWAP_BE16 (22460);
        fndrinfo->frFlags |= SWAP_BE16 (kIsInvisible + kNameLocked);            
 
+       // XXXdbg
+       hfs_global_shared_lock_acquire(hfsmp);
+       if (hfsmp->jnl) {
+           if ((error = journal_start_transaction(hfsmp->jnl)) != 0) {
+                       hfs_global_shared_lock_release(hfsmp);
+                       return (0);
+           }
+       }
+
        error = cat_create(hfsmp, &hfsmp->hfs_privdir_desc,
                        &hfsmp->hfs_privdir_attr, &out_desc);
 
        /* Unlock catalog b-tree */
        (void) hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_RELEASE, current_proc());
-       if (error)
-               return (0);
+       if (error) {
+           if (hfsmp->jnl) {
+                       journal_end_transaction(hfsmp->jnl);
+           }
+               hfs_global_shared_lock_release(hfsmp);
+
+           return (0);
+       }
 
        hfsmp->hfs_privdir_desc.cd_hint = out_desc.cd_hint;
        hfsmp->hfs_privdir_desc.cd_cnid = out_desc.cd_cnid;
                vput(dvp);
        }
        hfs_volupdate(hfsmp, VOL_MKDIR, 1);
+       if (hfsmp->jnl) {
+           journal_end_transaction(hfsmp->jnl);
+       } 
+       hfs_global_shared_lock_release(hfsmp);
+
        cat_releasedesc(&out_desc);
 
        return (out_desc.cd_cnid);
 }
 
+/*
+ * GetFileInfo - look up a named catalog entry and return its file ID.
+ *
+ * Builds a catalog descriptor for `name`, takes the catalog b-tree lock
+ * exclusively, and calls cat_lookup() so that `fattr` and `forkinfo` are
+ * filled in by the lookup.
+ *
+ * Returns fattr->ca_fileid when the lookup succeeds.  Returns 0 when the
+ * volume signature is not kHFSPlusSigWord, when the catalog lock cannot
+ * be taken, or when the lookup fails.
+ *
+ * NOTE(review): `dirid` is accepted but the descriptor is always parented
+ * at kRootDirID; every caller visible here passes kRootDirID -- confirm
+ * before using this for any other directory.
+ */
+__private_extern__
+u_long
+GetFileInfo(ExtendedVCB *vcb, u_int32_t dirid, char *name,
+                       struct cat_attr *fattr, struct cat_fork *forkinfo)
+{
+       struct hfsmount * hfsmp;
+       struct cat_desc jdesc;
+       int error;
+
+       if (vcb->vcbSigWord != kHFSPlusSigWord)
+               return (0);
+
+       hfsmp = VCBTOHFS(vcb);
+
+       memset(&jdesc, 0, sizeof(struct cat_desc));
+       jdesc.cd_parentcnid = kRootDirID;
+       jdesc.cd_nameptr = name;
+       jdesc.cd_namelen = strlen(name);
+
+       /* Lock catalog b-tree */
+       error = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_EXCLUSIVE, current_proc());
+       if (error)
+               return (0);
+
+       error = cat_lookup(hfsmp, &jdesc, 0, NULL, fattr, forkinfo);
+
+       (void) hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_RELEASE, current_proc());
+
+       /*
+        * Every path must return a value: a failed lookup reports 0
+        * whether or not the volume is read-only.
+        */
+       if (error != 0)
+               return (0);
+
+       return (fattr->ca_fileid);
+}
+
+
+/*
+ * On Journaled HFS, there can be orphaned files.  These
+ * are files that were unlinked while busy. If the volume
+ * was not cleanly unmounted then some of these files may
+ * have persisted and need to be removed.
+ *
+ * This walks the catalog records whose parent is
+ * hfsmp->hfs_private_metadata_dir.  For each file record whose
+ * name matches HFS_DELETE_PREFIX followed by its own file ID it
+ * truncates both forks, deletes the catalog record and updates
+ * the private directory and volume counts.
+ *
+ * The work is done under the global shared lock, inside a journal
+ * transaction when hfsmp->jnl is set, and with the catalog b-tree
+ * locked exclusively.  hfs_orphans_cleaned is set once the scan
+ * has been attempted so that it is not repeated.
+ */
+__private_extern__
+void
+hfs_remove_orphans(struct hfsmount * hfsmp)
+{
+       struct BTreeIterator * iterator = NULL;
+       struct FSBufferDescriptor btdata;
+       struct HFSPlusCatalogFile filerec;
+       struct HFSPlusCatalogKey * keyp;
+       FCB *fcb;
+       ExtendedVCB *vcb;
+       char filename[32];
+       char tempname[32];
+       size_t namelen;
+       int catlock = 0;
+       int result, started_tr = 0;
+
+       if (hfsmp->hfs_orphans_cleaned)
+               return;
+
+       vcb = HFSTOVCB(hfsmp);
+       fcb = VTOF(vcb->catalogRefNum);
+
+       btdata.bufferAddress = &filerec;
+       btdata.itemSize = sizeof(filerec);
+       btdata.itemCount = 1;
+
+       MALLOC(iterator, struct BTreeIterator *, sizeof(*iterator), M_TEMP, M_WAITOK);
+       bzero(iterator, sizeof(*iterator));
+       keyp = (HFSPlusCatalogKey*)&iterator->key;
+       keyp->parentID = hfsmp->hfs_private_metadata_dir;
+
+       // XXXdbg
+       hfs_global_shared_lock_acquire(hfsmp);
+       if (hfsmp->jnl) {
+           if (journal_start_transaction(hfsmp->jnl) != 0) {
+                       hfs_global_shared_lock_release(hfsmp);
+                       /* don't leak the iterator on this early exit */
+                       FREE(iterator, M_TEMP);
+                       return;
+           }
+               started_tr = 1;
+       }
+
+       /* Lock catalog b-tree */
+       result = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_EXCLUSIVE, current_proc());
+       if (result)
+               goto exit;
+       catlock = 1;
+
+       /*
+        * Position the iterator at the folder thread record.
+        * (i.e. one record before first child)
+        */
+       result = BTSearchRecord(fcb, iterator, NULL, NULL, iterator);
+       if (result)
+               goto exit;
+
+       /* Visit all the children in the HFS+ private directory. */
+       for (;;) {
+               result = BTIterateRecord(fcb, kBTreeNextRecord, iterator, &btdata, NULL);
+               if (result)
+                       break;
+               if (keyp->parentID != hfsmp->hfs_private_metadata_dir)
+                       break;
+               if (filerec.recordType != kHFSPlusFileRecord)
+                       continue;
+
+               (void) utf8_encodestr(keyp->nodeName.unicode, keyp->nodeName.length * 2,
+                                     filename, &namelen, sizeof(filename), 0, 0);
+
+               (void) sprintf(tempname, "%s%d", HFS_DELETE_PREFIX, filerec.fileID);
+
+               /*
+                * Delete all files named "tempxxx", where
+                * xxx is the file's cnid in decimal.
+                *
+                * NOTE(review): "iNodexxx" files with a link count
+                * of zero are not handled by this loop.
+                */
+               if (bcmp(tempname, filename, namelen) == 0) {
+                       struct filefork fork = {0};
+                       struct cnode cnode = {0};
+
+                       // XXXdebug
+                       //printf("hfs_remove_orphans: removing %s\n", filename);
+
+                       /* Build a fake cnode */
+                       cnode.c_desc.cd_parentcnid = hfsmp->hfs_private_metadata_dir;
+                       cnode.c_desc.cd_nameptr = filename;
+                       cnode.c_desc.cd_namelen = namelen;
+                       cnode.c_desc.cd_cnid = filerec.fileID;
+                       cnode.c_attr.ca_fileid = filerec.fileID;
+                       cnode.c_blocks = filerec.dataFork.totalBlocks +
+                                        filerec.resourceFork.totalBlocks;
+
+                       /*
+                        * Position iterator at previous entry so the scan
+                        * can continue after this record is deleted.
+                        */
+                       if (BTIterateRecord(fcb, kBTreePrevRecord, iterator,
+                           NULL, NULL) != 0)
+                               break;
+
+                       /* Truncate the file to zero (both forks) */
+                       if (filerec.dataFork.totalBlocks > 0) {
+                               fork.ff_cp = &cnode;
+                               cnode.c_datafork = &fork;
+                               bcopy(&filerec.dataFork, &fork.ff_data, sizeof(struct cat_fork));
+                               if (TruncateFileC(vcb, (FCB*)&fork, 0, false) != 0) {
+                                       printf("error truncting data fork!\n");
+                                       break;
+                               }
+                       }
+                       if (filerec.resourceFork.totalBlocks > 0) {
+                               /* reuse the same fake fork, now as the resource fork */
+                               fork.ff_cp = &cnode;
+                               cnode.c_datafork = NULL;
+                               cnode.c_rsrcfork = &fork;
+                               bcopy(&filerec.resourceFork, &fork.ff_data, sizeof(struct cat_fork));
+                               if (TruncateFileC(vcb, (FCB*)&fork, 0, false) != 0) {
+                                       printf("error truncting rsrc fork!\n");
+                                       break;
+                               }
+                       }
+
+                       /* Remove the file record from the Catalog */
+                       if (cat_delete(hfsmp, &cnode.c_desc, &cnode.c_attr) != 0) {
+                               printf("error deleting cat rec!\n");
+                               break;
+                       }
+
+                       /* Update parent and volume counts */
+                       hfsmp->hfs_privdir_attr.ca_entries--;
+                       (void)cat_update(hfsmp, &hfsmp->hfs_privdir_desc,
+                                        &hfsmp->hfs_privdir_attr, NULL, NULL);
+                       hfs_volupdate(hfsmp, VOL_RMFILE, 0);
+               }
+       }
+
+exit:
+       /* Unlock catalog b-tree */
+       if (catlock)
+               (void) hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_RELEASE, current_proc());
+
+       if (started_tr) {
+               journal_end_transaction(hfsmp->jnl);
+       }
+       hfs_global_shared_lock_release(hfsmp);
+
+       FREE(iterator, M_TEMP);
+       hfsmp->hfs_orphans_cleaned = 1;
+}
+
 
 /*
  * This will return the correct logical block size for a given vnode.
 
        switch (err) {
        case dskFulErr:                 /*    -34 */
-       case btNoSpaceAvail:            /* -32733 */
+               return ENOSPC;
+       case btNoSpaceAvail:    /* -32733 */
+               return EFBIG;
        case fxOvFlErr:                 /* -32750 */
-               return ENOSPC;          /*    +28 */
+               return EOVERFLOW;
        
        case btBadNode:                 /* -32731 */
-               return EIO;             /*    +5 */
+               return EBADF;
        
        case memFullErr:                /*  -108 */
                return ENOMEM;          /*   +12 */
                return EISDIR;          /*     21 */
        
        case fxRangeErr:                /* -32751 */
-               return EIO;             /*      5 */
+               return ERANGE;
        
        case bdNamErr:                  /*   -37 */
                return ENAMETOOLONG;    /*    63 */
 }
 
 
+/*
+ * hfs_early_journal_init - open or create the journal from the volume
+ * header's journal info block.
+ *
+ * Reads the journal info block at vhp->journalInfoBlock (offset by
+ * embeddedOffset), converts its fields from big-endian in place, and
+ * then either creates the journal (when kJIJournalNeedInitMask is set)
+ * or opens it.  After a successful open, if `mdbp` is non-NULL the
+ * master directory block at `mdb_offset` is re-read into it because
+ * opening the journal may have replayed it.
+ *
+ * On success hfsmp->jnl, hfsmp->jvp and hfsmp->jnl_start are set and 0
+ * is returned.  Returns the meta_bread() error if a read fails, and
+ * EINVAL if the journal is not stored in the filesystem or could not
+ * be opened/created.
+ */
+__private_extern__
+int
+hfs_early_journal_init(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp,
+                                          void *_args, int embeddedOffset, int mdb_offset,
+                                          HFSMasterDirectoryBlock *mdbp, struct ucred *cred)
+{
+       JournalInfoBlock *jibp;
+       struct buf       *jinfo_bp, *bp;
+       int               sectors_per_fsblock, arg_flags=0, arg_tbufsz=0;
+       int               retval, blksize = hfsmp->hfs_phys_block_size;
+       struct vnode     *devvp;
+       struct hfs_mount_args *args = _args;
+
+       devvp = hfsmp->hfs_devvp;
+
+       if (args != NULL && (args->flags & HFSFSMNT_EXTENDED_ARGS)) {
+               arg_flags  = args->journal_flags;
+               arg_tbufsz = args->journal_tbuffer_size;
+       }
+
+       sectors_per_fsblock = SWAP_BE32(vhp->blockSize) / blksize;
+
+       retval = meta_bread(devvp,
+                                               embeddedOffset/blksize + 
+                                               (SWAP_BE32(vhp->journalInfoBlock)*sectors_per_fsblock),
+                                               SWAP_BE32(vhp->blockSize), cred, &jinfo_bp);
+       if (retval)
+               return retval;
+
+       /* convert the on-disk (big-endian) fields in place, in the buffer */
+       jibp = (JournalInfoBlock *)jinfo_bp->b_data;
+       jibp->flags  = SWAP_BE32(jibp->flags);
+       jibp->offset = SWAP_BE64(jibp->offset);
+       jibp->size   = SWAP_BE64(jibp->size);
+
+       if (jibp->flags & kJIJournalInFSMask) {
+               hfsmp->jvp = hfsmp->hfs_devvp;
+       } else {
+               printf("hfs: journal not stored in fs! don't know what to do.\n");
+               brelse(jinfo_bp);
+               return EINVAL;
+       }
+
+       // save this off for the hack-y check in hfs_remove()
+       hfsmp->jnl_start = jibp->offset / SWAP_BE32(vhp->blockSize);
+
+       if (jibp->flags & kJIJournalNeedInitMask) {
+               printf("hfs: Initializing the journal (joffset 0x%llx sz 0x%llx)...\n",
+                          jibp->offset + (off_t)embeddedOffset, jibp->size);
+               hfsmp->jnl = journal_create(hfsmp->jvp,
+                                                                       jibp->offset + (off_t)embeddedOffset,
+                                                                       jibp->size,
+                                                                       devvp,
+                                                                       blksize,
+                                                                       arg_flags,
+                                                                       arg_tbufsz,
+                                                                       hfs_sync_metadata, hfsmp->hfs_mp);
+
+               // no need to start a transaction here... if this were to fail
+               // we'd just re-init it on the next mount.
+               jibp->flags &= ~kJIJournalNeedInitMask;
+
+               // all three fields were swapped in place above, so all
+               // three must go back to on-disk order before the write.
+               jibp->flags  = SWAP_BE32(jibp->flags);
+               jibp->offset = SWAP_BE64(jibp->offset);
+               jibp->size   = SWAP_BE64(jibp->size);
+               bwrite(jinfo_bp);
+               jinfo_bp = NULL;
+               jibp     = NULL;
+       } else { 
+               //printf("hfs: Opening the journal (joffset 0x%llx sz 0x%llx vhp_blksize %d)...\n",
+               //         jibp->offset + (off_t)embeddedOffset,
+               //         jibp->size, SWAP_BE32(vhp->blockSize));
+
+               hfsmp->jnl = journal_open(hfsmp->jvp,
+                                                                 jibp->offset + (off_t)embeddedOffset,
+                                                                 jibp->size,
+                                                                 devvp,
+                                                                 blksize,
+                                                                 arg_flags,
+                                                                 arg_tbufsz,
+                                                                 hfs_sync_metadata, hfsmp->hfs_mp);
+
+               // the buffer is released unmodified-on-disk; nothing is written here
+               brelse(jinfo_bp);
+               jinfo_bp = NULL;
+               jibp     = NULL;
+
+               if (hfsmp->jnl && mdbp) {
+                       // reload the mdb because it could have changed
+                       // if the journal had to be replayed.
+                       retval = meta_bread(devvp, mdb_offset, blksize, cred, &bp);
+                       if (retval) {
+                               brelse(bp);
+                               printf("hfs: failed to reload the mdb after opening the journal (retval %d)!\n",
+                                          retval);
+                               return retval;
+                       }
+                       bcopy(bp->b_data + HFS_PRI_OFFSET(blksize), mdbp, 512);
+                       brelse(bp);
+                       bp = NULL;
+               }
+       }
+
+
+       //printf("journal @ 0x%x\n", hfsmp->jnl);
+
+       // if we expected the journal to be there and we couldn't
+       // create it or open it then we have to bail out.
+       if (hfsmp->jnl == NULL) {
+               hfsmp->jnl_start = 0;
+
+               printf("hfs: failed to open/create the journal (retval %d).\n", retval);
+               return EINVAL;
+       }
 
+       return 0;
+}
+
+
+//
+// This function will go and re-locate the .journal_info_block and
+// the .journal files in case they moved (which can happen if you
+// run Norton SpeedDisk).  If we fail to find either file we just
+// disable journaling for this volume and return.  We turn off the
+// journaling bit in the vcb and assume it will get written to disk
+// later (if it doesn't on the next mount we'd do the same thing
+// again which is harmless).  If we disable journaling we don't
+// return an error so that the volume is still mountable.
+//
+// If the info we find for the .journal_info_block and .journal files
+// isn't what we had stored, we re-set our cached info and proceed
+// with opening the journal normally.
+//
+// Returns 0 on success and also when journaling was disabled (either
+// by the mount args or because a file / the info block could not be
+// found or read).  Returns EINVAL when the journal is not stored in
+// the filesystem or when it could not be opened/created.
+//
+static int
+hfs_late_journal_init(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, void *_args)
+{
+       JournalInfoBlock *jibp;
+       struct buf       *jinfo_bp;
+       int               sectors_per_fsblock, arg_flags=0, arg_tbufsz=0;
+       int               retval, write_jibp = 0;
+       struct vnode     *devvp;
+       struct cat_attr   jib_attr, jattr;
+       struct cat_fork   jib_fork, jfork;
+       ExtendedVCB      *vcb;
+       u_long            fid;
+       struct hfs_mount_args *args = _args;
+
+       devvp = hfsmp->hfs_devvp;
+       vcb = HFSTOVCB(hfsmp);
+
+       // the "can't find" diagnostics below print startBlock even when
+       // the lookup failed, so make sure the forks never hold garbage.
+       bzero(&jib_fork, sizeof(jib_fork));
+       bzero(&jfork, sizeof(jfork));
+
+       if (args != NULL && (args->flags & HFSFSMNT_EXTENDED_ARGS)) {
+               if (args->journal_disable) {
+                       return 0;
+               }
+
+               arg_flags  = args->journal_flags;
+               arg_tbufsz = args->journal_tbuffer_size;
+       }
+
+       fid = GetFileInfo(vcb, kRootDirID, ".journal_info_block", &jib_attr, &jib_fork);
+       if (fid == 0 || jib_fork.cf_extents[0].startBlock == 0 || jib_fork.cf_size == 0) {
+               printf("hfs: can't find the .journal_info_block! disabling journaling (start: %d).\n",
+                          jib_fork.cf_extents[0].startBlock);
+               vcb->vcbAtrb &= ~kHFSVolumeJournaledMask;
+               return 0;
+       }
+       hfsmp->hfs_jnlinfoblkid = fid;
+
+       // make sure the journal_info_block begins where we think it should.
+       if (SWAP_BE32(vhp->journalInfoBlock) != jib_fork.cf_extents[0].startBlock) {
+               printf("hfs: The journal_info_block moved (was: %d; is: %d).  Fixing up\n",
+                          SWAP_BE32(vhp->journalInfoBlock), jib_fork.cf_extents[0].startBlock);
+
+               vcb->vcbJinfoBlock    = jib_fork.cf_extents[0].startBlock;
+               vhp->journalInfoBlock = SWAP_BE32(jib_fork.cf_extents[0].startBlock);
+       }
+
+
+       sectors_per_fsblock = SWAP_BE32(vhp->blockSize) / hfsmp->hfs_phys_block_size;
+       retval = meta_bread(devvp,
+                                               vcb->hfsPlusIOPosOffset / hfsmp->hfs_phys_block_size + 
+                                               (SWAP_BE32(vhp->journalInfoBlock)*sectors_per_fsblock),
+                                               SWAP_BE32(vhp->blockSize), NOCRED, &jinfo_bp);
+       if (retval) {
+               printf("hfs: can't read journal info block. disabling journaling.\n");
+               vcb->vcbAtrb &= ~kHFSVolumeJournaledMask;
+               return 0;
+       }
+
+       // convert the on-disk (big-endian) fields in place; they are
+       // converted back below if the block has to be rewritten.
+       jibp = (JournalInfoBlock *)jinfo_bp->b_data;
+       jibp->flags  = SWAP_BE32(jibp->flags);
+       jibp->offset = SWAP_BE64(jibp->offset);
+       jibp->size   = SWAP_BE64(jibp->size);
+
+       fid = GetFileInfo(vcb, kRootDirID, ".journal", &jattr, &jfork);
+       if (fid == 0 || jfork.cf_extents[0].startBlock == 0 || jfork.cf_size == 0) {
+               printf("hfs: can't find the journal file! disabling journaling (start: %d)\n",
+                          jfork.cf_extents[0].startBlock);
+               brelse(jinfo_bp);
+               vcb->vcbAtrb &= ~kHFSVolumeJournaledMask;
+               return 0;
+       }
+       hfsmp->hfs_jnlfileid = fid;
+
+       // make sure the journal file begins where we think it should.
+       if ((jibp->offset / (u_int64_t)vcb->blockSize) != jfork.cf_extents[0].startBlock) {
+               printf("hfs: The journal file moved (was: %lld; is: %d).  Fixing up\n",
+                          (jibp->offset / (u_int64_t)vcb->blockSize), jfork.cf_extents[0].startBlock);
+
+               jibp->offset = (u_int64_t)jfork.cf_extents[0].startBlock * (u_int64_t)vcb->blockSize;
+               write_jibp   = 1;
+       }
+
+       // check the size of the journal file.
+       if (jibp->size != (u_int64_t)jfork.cf_extents[0].blockCount*vcb->blockSize) {
+               printf("hfs: The journal file changed size! (was %lld; is %lld).  Fixing up.\n",
+                          jibp->size, (u_int64_t)jfork.cf_extents[0].blockCount*vcb->blockSize);
+
+               jibp->size = (u_int64_t)jfork.cf_extents[0].blockCount * vcb->blockSize;
+               write_jibp = 1;
+       }
+
+       if (jibp->flags & kJIJournalInFSMask) {
+               hfsmp->jvp = hfsmp->hfs_devvp;
+       } else {
+               printf("hfs: journal not stored in fs! don't know what to do.\n");
+               brelse(jinfo_bp);
+               return EINVAL;
+       }
+
+       // save this off for the hack-y check in hfs_remove()
+       hfsmp->jnl_start = jibp->offset / SWAP_BE32(vhp->blockSize);
+
+       if (jibp->flags & kJIJournalNeedInitMask) {
+               printf("hfs: Initializing the journal (joffset 0x%llx sz 0x%llx)...\n",
+                          jibp->offset + (off_t)vcb->hfsPlusIOPosOffset, jibp->size);
+               hfsmp->jnl = journal_create(hfsmp->jvp,
+                                                                       jibp->offset + (off_t)vcb->hfsPlusIOPosOffset,
+                                                                       jibp->size,
+                                                                       devvp,
+                                                                       hfsmp->hfs_phys_block_size,
+                                                                       arg_flags,
+                                                                       arg_tbufsz,
+                                                                       hfs_sync_metadata, hfsmp->hfs_mp);
+
+               // no need to start a transaction here... if this were to fail
+               // we'd just re-init it on the next mount.
+               jibp->flags &= ~kJIJournalNeedInitMask;
+               write_jibp   = 1;
+
+       } else { 
+               //
+               // if we weren't the last person to mount this volume
+               // then we need to throw away the journal because it
+               // is likely that someone else mucked with the disk.
+               // if the journal is empty this is no big deal.  if the
+               // disk is dirty this prevents us from replaying the
+               // journal over top of changes that someone else made.
+               //
+               arg_flags |= JOURNAL_RESET;
+
+               //printf("hfs: Opening the journal (joffset 0x%llx sz 0x%llx vhp_blksize %d)...\n",
+               //         jibp->offset + (off_t)vcb->hfsPlusIOPosOffset,
+               //         jibp->size, SWAP_BE32(vhp->blockSize));
+
+               hfsmp->jnl = journal_open(hfsmp->jvp,
+                                                                 jibp->offset + (off_t)vcb->hfsPlusIOPosOffset,
+                                                                 jibp->size,
+                                                                 devvp,
+                                                                 hfsmp->hfs_phys_block_size,
+                                                                 arg_flags,
+                                                                 arg_tbufsz,
+                                                                 hfs_sync_metadata, hfsmp->hfs_mp);
+       }
+
+
+       if (write_jibp) {
+               // back to on-disk byte order before the block is written
+               jibp->flags  = SWAP_BE32(jibp->flags);
+               jibp->offset = SWAP_BE64(jibp->offset);
+               jibp->size   = SWAP_BE64(jibp->size);
+
+               bwrite(jinfo_bp);
+       } else {
+               brelse(jinfo_bp);
+       } 
+       jinfo_bp = NULL;
+       jibp     = NULL;
+
+       //printf("journal @ 0x%x\n", hfsmp->jnl);
+
+       // if we expected the journal to be there and we couldn't
+       // create it or open it then we have to bail out.
+       if (hfsmp->jnl == NULL) {
+               hfsmp->jnl_start = 0;
+
+               printf("hfs: failed to open/create the journal (retval %d).\n", retval);
+               return EINVAL;
+       }
+
+       return 0;
+}
 
 
        if (cp->c_flags & (IMMUTABLE | APPEND))
                return (EPERM);
+
+       // XXXdbg - don't allow modification of the journal or journal_info_block
+       if (VTOHFS(vp)->jnl && cp->c_datafork) {
+               struct HFSPlusExtentDescriptor *extd;
+
+               extd = &cp->c_datafork->ff_data.cf_extents[0];
+               if (extd->startBlock == VTOVCB(vp)->vcbJinfoBlock || extd->startBlock == VTOHFS(vp)->jnl_start) {
+                       return EPERM;
+               }
+       }
+
        /*
         * Go through the fields and update iff not VNOVAL.
         */
        if (VTOVCB(vp)->vcbSigWord != kHFSPlusSigWord)
                return (0);
 
+       // XXXdbg - don't allow modification of the journal or journal_info_block
+       if (VTOHFS(vp)->jnl && cp && cp->c_datafork) {
+               struct HFSPlusExtentDescriptor *extd;
+
+               extd = &cp->c_datafork->ff_data.cf_extents[0];
+               if (extd->startBlock == VTOVCB(vp)->vcbJinfoBlock || extd->startBlock == VTOHFS(vp)->jnl_start) {
+                       return EPERM;
+               }
+       }
+
 #if OVERRIDE_UNKNOWN_PERMISSIONS
        if (VTOVFS(vp)->mnt_flag & MNT_UNKNOWNPERMISSIONS) {
                return (0);
        struct hfsmount *hfsmp = VTOHFS(from_vp);
        struct cat_desc tempdesc;
        struct cat_attr tempattr;
-       int error = 0;
+       int error = 0, started_tr = 0, grabbed_lock = 0;
 
        /* The files must be on the same volume. */
        if (from_vp->v_mount != to_vp->v_mount)
            VNODE_IS_RSRC(from_vp) || VNODE_IS_RSRC(to_vp))
                return (EINVAL);
 
+       // XXXdbg - don't allow modification of the journal or journal_info_block
+       if (hfsmp->jnl) {
+               struct HFSPlusExtentDescriptor *extd;
+
+               if (from_cp->c_datafork) {
+                       extd = &from_cp->c_datafork->ff_data.cf_extents[0];
+                       if (extd->startBlock == VTOVCB(from_vp)->vcbJinfoBlock || extd->startBlock == hfsmp->jnl_start) {
+                               return EPERM;
+                       }
+               }
+
+               if (to_cp->c_datafork) {
+                       extd = &to_cp->c_datafork->ff_data.cf_extents[0];
+                       if (extd->startBlock == VTOVCB(to_vp)->vcbJinfoBlock || extd->startBlock == hfsmp->jnl_start) {
+                               return EPERM;
+                       }
+               }
+       }
+
        from_rvp = from_cp->c_rsrc_vp;
        to_rvp = to_cp->c_rsrc_vp;
 
        if (to_rvp)
                (void) vinvalbuf(to_rvp, V_SAVE, ap->a_cred, ap->a_p, 0, 0);
 
+       // XXXdbg
+       hfs_global_shared_lock_acquire(hfsmp);
+       grabbed_lock = 1;
+       if (hfsmp->jnl) {
+           if ((error = journal_start_transaction(hfsmp->jnl)) != 0) {
+                       goto Err_Exit;
+           }
+               started_tr = 1;
+       }
+       
        /* Lock catalog b-tree */
        error = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_EXCLUSIVE, ap->a_p);
        if (error) goto Err_Exit;
         * (except the modify date)
         */
        bcopy(&to_cp->c_desc, &from_cp->c_desc, sizeof(struct cat_desc));
+
        from_cp->c_hint = 0;
        from_cp->c_fileid = from_cp->c_cnid;
        from_cp->c_itime = to_cp->c_itime;
        if (from_rvp)
                vrele(from_rvp);
 
+       // XXXdbg
+       if (started_tr) {
+           journal_end_transaction(hfsmp->jnl);
+       }
+       if (grabbed_lock) {
+               hfs_global_shared_lock_release(hfsmp);
+       }
+
        return (error);
 }
 
      IN struct proc *p;
 
      */
-
 static int
 hfs_fsync(ap)
        struct vop_fsync_args /* {
        register struct buf *bp;
        struct timeval tv;
        struct buf *nbp;
+       struct hfsmount *hfsmp = VTOHFS(ap->a_vp);
        int s;
        int wait;
        int retry = 0;
         * for regular files write out any clusters
         */
        if (vp->v_flag & VSYSTEM) {
-               if (VTOF(vp)->fcbBTCBPtr != NULL)
-                       BTFlushPath(VTOF(vp));
+           if (VTOF(vp)->fcbBTCBPtr != NULL) {
+                       // XXXdbg
+                       if (hfsmp->jnl) {
+                               if (BTIsDirty(VTOF(vp))) {
+                                       panic("hfs: system file vp 0x%x has dirty blocks (jnl 0x%x)\n",
+                                                 vp, hfsmp->jnl);
+                               }
+                       } else {
+                               BTFlushPath(VTOF(vp));
+                       }
+           }
        } else if (UBCINFOEXISTS(vp))
                (void) cluster_push(vp);
 
                if ((bp->b_flags & B_BUSY))
                        continue;
                if ((bp->b_flags & B_DELWRI) == 0)
-                       panic("hfs_fsync: not dirty");
+                       panic("hfs_fsync: bp 0x%x not dirty (hfsmp 0x%x)", bp, hfsmp);
+               // XXXdbg
+               if (hfsmp->jnl && (bp->b_flags & B_LOCKED)) {
+                       if ((bp->b_flags & B_META) == 0) {
+                               panic("hfs: bp @ 0x%x is locked but not meta! jnl 0x%x\n",
+                                         bp, hfsmp->jnl);
+                       }
+                       // if journal_active() returns >= 0 then the journal is ok and we 
+                       // shouldn't do anything to this locked block (because it is part 
+                       // of a transaction).  otherwise we'll just go through the normal 
+                       // code path and flush the buffer.
+                       if (journal_active(hfsmp->jnl) >= 0) {
+                               continue;
+                       }
+               }
+
                bremfree(bp);
                bp->b_flags |= B_BUSY;
                /* Clear B_LOCKED, should only be set on meta files */
                bp->b_flags &= ~B_LOCKED;
+
                splx(s);
                /*
                 * Wait for I/O associated with indirect blocks to complete,
                        tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "hfs_fsync", 0);
                }
 
-               if (vp->v_dirtyblkhd.lh_first) {
+               // XXXdbg -- is checking for hfsmp->jnl == NULL the right
+               //           thing to do?
+               if (hfsmp->jnl == NULL && vp->v_dirtyblkhd.lh_first) {
                        /* still have some dirty buffers */
                        if (retry++ > 10) {
                                vprint("hfs_fsync: dirty", vp);
 
        vp = HFSTOVCB(hfsmp)->catalogRefNum;
 
+       // XXXdbg - don't need to do this on a journaled volume
+       if (hfsmp->jnl) {
+               return 0;
+       }
+
        if (hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_EXCLUSIVE, p) != 0)
                return (0);
 
        register struct buf *bp;
        struct timeval tv;
        struct buf *nbp;
+       struct hfsmount *hfsmp = VTOHFS(vp);
        int s;
 
        /*
                if ((bp->b_flags & B_BUSY))
                        continue;
                if ((bp->b_flags & B_DELWRI) == 0)
-                       panic("hfs_fsync: not dirty");
+                       panic("hfs_btsync: not dirty (bp 0x%x hfsmp 0x%x)", bp, hfsmp);
+
+               // XXXdbg
+               if (hfsmp->jnl && (bp->b_flags & B_LOCKED)) {
+                       if ((bp->b_flags & B_META) == 0) {
+                               panic("hfs: bp @ 0x%x is locked but not meta! jnl 0x%x\n",
+                                         bp, hfsmp->jnl);
+                       }
+                       // if journal_active() returns >= 0 then the journal is ok and we 
+                       // shouldn't do anything to this locked block (because it is part 
+                       // of a transaction).  otherwise we'll just go through the normal 
+                       // code path and flush the buffer.
+                       if (journal_active(hfsmp->jnl) >= 0) {
+                           continue;
+                       }
+               }
+
                if (sync_transaction && !(bp->b_flags & B_LOCKED))
                        continue;
 
                bremfree(bp);
                bp->b_flags |= B_BUSY;
                bp->b_flags &= ~B_LOCKED;
+
                splx(s);
 
                (void) bawrite(bp);
        struct cnode *dcp;
        struct hfsmount * hfsmp;
        struct timeval tv;
-       int error = 0;
+       int error = 0, started_tr = 0, grabbed_lock = 0;
 
        cp = VTOC(vp);
        dcp = VTOC(dvp);
                vput(vp);
                return (EINVAL);        /* cannot remove "." */
        }
+
+       // XXXdbg
+       hfs_global_shared_lock_acquire(hfsmp);
+       grabbed_lock = 1;
+       if (hfsmp->jnl) {
+           if ((error = journal_start_transaction(hfsmp->jnl)) != 0) {
+                       goto out;
+           }
+               started_tr = 1;
+       }
+
        /*
         * Verify the directory is empty (and valid).
         * (Rmdir ".." won't be valid since
        dcp->c_flag |= C_CHANGE | C_UPDATE;
        tv = time;
        (void) VOP_UPDATE(dvp, &tv, &tv, 0);
+
        hfs_volupdate(hfsmp, VOL_RMDIR, (dcp->c_cnid == kHFSRootFolderID));
 
        cp->c_mode = 0;  /* Makes the vnode go away...see inactive */
        if (dvp) 
                vput(dvp);
        vput(vp);
+
+       // XXXdbg
+       if (started_tr) { 
+           journal_end_transaction(hfsmp->jnl);
+       }
+       if (grabbed_lock) {
+               hfs_global_shared_lock_release(hfsmp);
+       }
+
        return (error);
 }
 
        int truncated = 0;
        struct timeval tv;
        int error = 0;
+       int started_tr = 0, grabbed_lock = 0;
 
        /* Redirect directories to rmdir */
        if (vp->v_type == VDIR)
            VNODE_IS_RSRC(vp)) {
                error = EPERM;
                goto out;
-        }
+       }
 
        /*
         * Aquire a vnode for a non-empty resource fork.
                        goto out;
        }
 
+       // XXXdbg - don't allow deleting the journal or journal_info_block
+       if (hfsmp->jnl && cp->c_datafork) {
+               struct HFSPlusExtentDescriptor *extd;
+
+               extd = &cp->c_datafork->ff_data.cf_extents[0];
+               if (extd->startBlock == HFSTOVCB(hfsmp)->vcbJinfoBlock || extd->startBlock == hfsmp->jnl_start) {
+                       error = EPERM;
+                       goto out;
+               }
+       }
+
        /*
         * Check if this file is being used.
         *
                goto out;
        }
 
+       // XXXdbg
+       hfs_global_shared_lock_acquire(hfsmp);
+       grabbed_lock = 1;
+       if (hfsmp->jnl) {
+           if ((error = journal_start_transaction(hfsmp->jnl)) != 0) {
+                       goto out;
+           }
+           started_tr = 1;
+       }
+
        /* Remove our entry from the namei cache. */
        cache_purge(vp);
 
+       // XXXdbg - if we're journaled, kill any dirty symlink buffers 
+       if (hfsmp->jnl && vp->v_type == VLNK && vp->v_dirtyblkhd.lh_first) {
+           struct buf *bp, *nbp;
+
+         recheck:
+           for (bp=vp->v_dirtyblkhd.lh_first; bp; bp=nbp) {
+                       nbp = bp->b_vnbufs.le_next;
+                       
+                       if ((bp->b_flags & B_BUSY)) {
+                               // if it was busy, someone else must be dealing
+                               // with it so just move on.
+                               continue;
+                       }
+
+                       if (!(bp->b_flags & B_META)) {
+                               panic("hfs: symlink bp @ 0x%x is not marked meta-data!\n", bp);
+                       }
+
+                       // if it's part of the current transaction, kill it.
+                       if (bp->b_flags & B_LOCKED) {
+                               bremfree(bp);
+                               bp->b_flags |= B_BUSY;
+                               journal_kill_block(hfsmp->jnl, bp);
+                               goto recheck;
+                       }
+           }
+       }
+       // XXXdbg
+
        /*
         * Truncate any non-busy forks.  Busy forks will
         * get trucated when their vnode goes inactive.
                if (error)
                        goto out;
 
+               /* Delete the link record */
                error = cat_delete(hfsmp, &desc, &cp->c_attr);
 
+               if ((error == 0) && (--cp->c_nlink < 1)) {
+                       char inodename[32];
+                       char delname[32];
+                       struct cat_desc to_desc;
+                       struct cat_desc from_desc;
+
+                       /*
+                        * This is now essentially an open deleted file.
+                        * Rename it to reflect this state which makes
+                        * orphan file cleanup easier (see hfs_remove_orphans).
+                        * Note: a rename failure here is not fatal.
+                        */     
+                       MAKE_INODE_NAME(inodename, cp->c_rdev);
+                       bzero(&from_desc, sizeof(from_desc));
+                       from_desc.cd_nameptr = inodename;
+                       from_desc.cd_namelen = strlen(inodename);
+                       from_desc.cd_parentcnid = hfsmp->hfs_private_metadata_dir;
+                       from_desc.cd_flags = 0;
+                       from_desc.cd_cnid = cp->c_fileid;
+
+                       MAKE_DELETED_NAME(delname, cp->c_fileid);               
+                       bzero(&to_desc, sizeof(to_desc));
+                       to_desc.cd_nameptr = delname;
+                       to_desc.cd_namelen = strlen(delname);
+                       to_desc.cd_parentcnid = hfsmp->hfs_private_metadata_dir;
+                       to_desc.cd_flags = 0;
+                       to_desc.cd_cnid = cp->c_fileid;
+       
+                       (void) cat_rename(hfsmp, &from_desc, &hfsmp->hfs_privdir_desc,
+                                         &to_desc, (struct cat_desc *)NULL);
+                       cp->c_flag |= C_DELETED;
+               }
+
                /* Unlock the Catalog */
                (void) hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_RELEASE, p);
 
                        goto out;
 
                cp->c_flag |= C_CHANGE;
-                if (--cp->c_nlink < 1)
-                       cp->c_flag |= C_DELETED;
+               tv = time;
+               (void) VOP_UPDATE(vp, &tv, &tv, 0);
+
                hfs_volupdate(hfsmp, VOL_RMFILE, (dcp->c_cnid == kHFSRootFolderID));
 
        } else if (dataforkbusy || rsrcforkbusy) {
 
                /* Lock catalog b-tree */
                error = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_EXCLUSIVE, p);
-               if (error) goto out;
+               if (error)
+                       goto out;
 
                error = cat_rename(hfsmp, &cp->c_desc, &todir_desc,
                                &to_desc, (struct cat_desc *)NULL);
 
-               hfsmp->hfs_privdir_attr.ca_entries++;
+               // XXXdbg - only bump this count if we were successful
+               if (error == 0) {
+                       hfsmp->hfs_privdir_attr.ca_entries++;
+               }
                (void)cat_update(hfsmp, &hfsmp->hfs_privdir_desc,
                                &hfsmp->hfs_privdir_attr, NULL, NULL);
 
 
                cp->c_flag |= C_CHANGE | C_DELETED | C_NOEXISTS;
                --cp->c_nlink;
+               tv = time;
+               (void) VOP_UPDATE(vp, &tv, &tv, 0);
 
        } else /* Not busy */ {
 
-               /* Lock catalog b-tree */
-               error = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_EXCLUSIVE, p);
-               if (error) goto out;
-
                if (vp->v_type == VDIR && cp->c_entries > 0)
                        panic("hfs_remove: attempting to delete a non-empty directory!");
                if (vp->v_type != VDIR && cp->c_blocks > 0)
                        panic("hfs_remove: attempting to delete a non-empty file!");
 
+               /* Lock catalog b-tree */
+               error = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_EXCLUSIVE, p);
+               if (error)
+                       goto out;
+
                error = cat_delete(hfsmp, &cp->c_desc, &cp->c_attr);
 
-               if (error && truncated)
-                       panic("hfs_remove: couldn't delete a truncated file!");
+               if (error && error != ENXIO && truncated) {
+                       if ((cp->c_datafork && cp->c_datafork->ff_data.cf_size != 0) ||
+                               (cp->c_rsrcfork && cp->c_rsrcfork->ff_data.cf_size != 0)) {
+                               panic("hfs: remove: couldn't delete a truncated file! (%d, data sz %lld; rsrc sz %lld)",
+                                         error, cp->c_datafork->ff_data.cf_size, cp->c_rsrcfork->ff_data.cf_size);
+                       } else {
+                               printf("hfs: remove: strangely enough, deleting truncated file %s (%d) got err %d\n",
+                                          cp->c_desc.cd_nameptr, cp->c_attr.ca_fileid, error);
+                       }
+               }
 
                /* Unlock the Catalog */
                (void) hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_RELEASE, p);
        if (rvp)
                vrele(rvp);
        VOP_UNLOCK(vp, 0, p);
-       (void) ubc_uncache(vp);
+       // XXXdbg - try to prevent the lost ubc_info panic
+       if ((cp->c_flag & C_HARDLINK) == 0 || cp->c_nlink == 0) {
+               (void) ubc_uncache(vp);
+       }
        vrele(vp);
        vput(dvp);
+
+       // XXXdbg
+       if (started_tr) {
+           journal_end_transaction(hfsmp->jnl);
+       }
+       if (grabbed_lock) {
+               hfs_global_shared_lock_release(hfsmp);
+       }
+
        return (0);
+
 out:
        if (rvp)
                vrele(rvp);
        }
        vput(vp);
        vput(dvp);
+
+       // XXXdbg
+       if (started_tr) {
+           journal_end_transaction(hfsmp->jnl);
+       }
+       if (grabbed_lock) {
+               hfs_global_shared_lock_release(hfsmp);
+       }
+
        return (error);
 }
 
        struct hfsmount *hfsmp;
        struct proc *p = fcnp->cn_proc;
        struct timeval tv;
-       int retval = 0;
+       int retval = 0, started_tr = 0, grabbed_lock = 0;
+       int fdvp_locked = 0;
+       int fvp_locked = 0;
        cnid_t oldparent = 0;
        cnid_t newparent = 0;
 
+       // XXXdbg
+       if (fvp) 
+           hfsmp = VTOHFS(fvp);
+       else if (tvp)
+           hfsmp = VTOHFS(tvp);
+       else
+           hfsmp = NULL;
+       
 #if HFS_DIAGNOSTIC
     if ((tcnp->cn_flags & HASBUF) == 0 ||
         (fcnp->cn_flags & HASBUF) == 0)
                goto abortop;
        }
 
-       if ((retval = vn_lock(fvp, LK_EXCLUSIVE | LK_RETRY, p)))
-               goto abortop;
-
        /*
         * Make sure "from" vnode and its parent are changeable.
         */
        fcp = VTOC(fvp);
        oldparent = fdcp->c_cnid;
        if ((fcp->c_flags & (IMMUTABLE | APPEND)) || (fdcp->c_flags & APPEND)) {
-               VOP_UNLOCK(fvp, 0, p);
                retval = EPERM;
                goto abortop;
        }
 
        if (fcp->c_parentcnid != fdcp->c_cnid) {
-               VOP_UNLOCK(fvp, 0, p);
                retval = EINVAL;
                goto abortop;
        }
        if (fvp == ap->a_tvp &&
            (bcmp(fcp->c_desc.cd_nameptr, tcnp->cn_nameptr,
             fcp->c_desc.cd_namelen) == 0)) {
-               VOP_UNLOCK(fvp, 0, p);
                retval = 0;
                goto abortop;
        }
                        || fdcp == fcp
                        || (fcnp->cn_flags&ISDOTDOT)
                        || (fcp->c_flag & C_RENAME)) {
-                       VOP_UNLOCK(fvp, 0, p);
                        retval = EINVAL;
                        goto abortop;
                }
 
        newparent = tdcp->c_cnid;
        
+       // XXXdbg - don't allow renaming the journal or journal_info_block
+       if (hfsmp->jnl && fcp->c_datafork) {
+               struct HFSPlusExtentDescriptor *extd;
+                       
+               extd = &fcp->c_datafork->ff_data.cf_extents[0];
+               if (extd->startBlock == HFSTOVCB(hfsmp)->vcbJinfoBlock || extd->startBlock == hfsmp->jnl_start) {
+                       retval = EPERM;
+                       goto bad;
+               }
+       }
+
+       if (hfsmp->jnl && tcp && tcp->c_datafork) {
+               struct HFSPlusExtentDescriptor *extd;
+                       
+               extd = &tcp->c_datafork->ff_data.cf_extents[0];
+               if (extd->startBlock == HFSTOVCB(hfsmp)->vcbJinfoBlock || extd->startBlock == hfsmp->jnl_start) {
+                       retval = EPERM;
+                       goto bad;
+               }
+       }
+
        retval = VOP_ACCESS(fvp, VWRITE, tcnp->cn_cred, tcnp->cn_proc);
        if ((fvp->v_type == VDIR) && (newparent != oldparent)) {
                if (retval)             /* write access check above */
        }
        retval = 0;  /* Reset value from above, we dont care about it anymore */
        
+       /* XXX
+        * Prevent lock hierarchy violation (deadlock):
+        *
+        * If fdvp is the parent of tdvp then we must drop
+        * tdvp lock before acquiring the lock for fdvp.
+        *
+        * XXXdbg - moved this to happen up here *before* we
+        *          start a transaction.  otherwise we can
+        *          deadlock because the vnode layer may get
+        *          this lock for someone else and then they'll
+        *          never be able to start a transaction.
+        */
+       if (newparent != oldparent) {
+           if (fdcp->c_cnid == tdcp->c_parentcnid) {
+                       vput(tdvp);
+                       vn_lock(fdvp, LK_EXCLUSIVE | LK_RETRY, p);
+                       vget(tdvp, LK_EXCLUSIVE | LK_RETRY, p);
+           } else {
+                       vn_lock(fdvp, LK_EXCLUSIVE | LK_RETRY, p);
+               }
+       }
+       fdvp_locked = 1;
+       if ((retval = vn_lock(fvp, LK_EXCLUSIVE | LK_RETRY, p)))
+               goto bad;
+       fvp_locked = 1;
+       
+       // XXXdbg
+       hfs_global_shared_lock_acquire(hfsmp);
+       grabbed_lock = 1;
+       if (hfsmp->jnl) {
+           if ((retval = journal_start_transaction(hfsmp->jnl)) != 0) {
+                       goto bad;
+           }
+               started_tr = 1;
+       }
+
        /*
         * If the destination exists, then be sure its type (file or dir)
         * matches that of the source.  And, if it is a directory make sure
 
        }
 
-       /* XXX
-        * Prevent lock heirarchy violation (deadlock):
-        *
-        * If fdvp is the parent of tdvp then we must drop
-        * tdvp lock before aquiring the lock for fdvp.
-        */
-       if (newparent != oldparent)
-               vn_lock(fdvp, LK_EXCLUSIVE | LK_RETRY, p);
-
        /* remove the existing entry from the namei cache: */
        cache_purge(fvp);
 
-       hfsmp = VTOHFS(fvp);
        bzero(&from_desc, sizeof(from_desc));
        from_desc.cd_nameptr = fcnp->cn_nameptr;
        from_desc.cd_namelen = fcnp->cn_namelen;
        /* Lock catalog b-tree */
        retval = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_EXCLUSIVE, p);
        if (retval) {
-               if (newparent != oldparent)  /* unlock the lock we just got */
-                       VOP_UNLOCK(fdvp, 0, p);
                 goto bad;
        }
-       retval = cat_rename(hfsmp, &from_desc, &tdcp->c_desc,
-                       &to_desc, &out_desc);
+       retval = cat_rename(hfsmp, &from_desc, &tdcp->c_desc,
+                                               &to_desc, &out_desc);
 
        /* Unlock catalog b-tree */
        (void) hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_RELEASE, p);
 
-       if (newparent != oldparent)
+       if (newparent != oldparent) {
                VOP_UNLOCK(fdvp, 0, p);
+               fdvp_locked = 0;
+       }
 
        if (retval)  goto bad;
 
                fdcp->c_entries--;
        tdcp->c_nlink++;
        tdcp->c_entries++;
-       fdcp->c_flag |= C_UPDATE;
-       tdcp->c_flag |= C_UPDATE;
+       fdcp->c_flag |= C_CHANGE | C_UPDATE;
+       tdcp->c_flag |= C_CHANGE | C_UPDATE;
        tv = time;
        CTIMES(fdcp, &tv, &tv);
        CTIMES(tdcp, &tv, &tv);
        tdcp->c_childhint = out_desc.cd_hint;   /* Cache directory's location */
 
+       // make sure both directories get updated on disk.
+       if (fdvp != tdvp) {
+               (void) VOP_UPDATE(fdvp, &tv, &tv, 0);
+       }
+       (void) VOP_UPDATE(tdvp, &tv, &tv, 0);
+
        hfs_volupdate(hfsmp, fvp->v_type == VDIR ? VOL_RMDIR : VOL_RMFILE,
                (fdcp->c_cnid == kHFSRootFolderID));
        hfs_volupdate(hfsmp, fvp->v_type == VDIR ? VOL_MKDIR : VOL_MKFILE,
        vput(tdvp);
        vrele(fdvp);
        vput(fvp);
+
+       // XXXdbg
+       if (started_tr) {
+           journal_end_transaction(hfsmp->jnl);
+       }
+       if (grabbed_lock) {
+               hfs_global_shared_lock_release(hfsmp);
+       }
+
        return (0);
 
 bad:
        if (fcp)
                fcp->c_flag &= ~C_RENAME;
+
+       // XXXdbg make sure both directories get updated on disk.
+       if (fdvp != tdvp) {
+               (void) VOP_UPDATE(fdvp, &tv, &tv, 0);
+       }
+       (void) VOP_UPDATE(tdvp, &tv, &tv, 0);
+
        if (tdvp == tvp)
                vrele(tdvp);
        else
                vput(tdvp);
        if (tvp)
                vput(tvp);
-       vrele(fdvp);
 
-       if (VOP_ISLOCKED(fvp))
+       if (fdvp_locked)
+               vput(fdvp);
+       else
+               vrele(fdvp);
+
+       if (fvp_locked)
                vput(fvp);
        else
                vrele(fvp);
+
+       // XXXdbg
+       if (started_tr) {
+           journal_end_transaction(hfsmp->jnl);
+       }
+       if (grabbed_lock) {
+               hfs_global_shared_lock_release(hfsmp);
+       }
+
        return (retval);
 
 abortop:
        VOP_ABORTOP(fdvp, fcnp);
        vrele(fdvp);
        vrele(fvp);
+
        return (retval);
 }
 
        } */ *ap;
 {
        register struct vnode *vp, **vpp = ap->a_vpp;
+       struct hfsmount *hfsmp;
        struct filefork *fp;
        int len, error;
        struct buf *bp = NULL;
                return (EINVAL);
        }
 
+
+       hfsmp = VTOHFS(ap->a_dvp);
+
        /* Create the vnode */
        if ((error = hfs_makenode(S_IFLNK | ap->a_vap->va_mode,
-           ap->a_dvp, vpp, ap->a_cnp)))
+                                                         ap->a_dvp, vpp, ap->a_cnp))) {
                return (error);
+       }
 
        vp = *vpp;
        len = strlen(ap->a_target);
        fp = VTOF(vp);
        fp->ff_clumpsize = VTOVCB(vp)->blockSize;
 
+       // XXXdbg
+       hfs_global_shared_lock_acquire(hfsmp);
+       if (hfsmp->jnl) {
+           if ((error = journal_start_transaction(hfsmp->jnl)) != 0) {
+                       hfs_global_shared_lock_release(hfsmp);
+                       VOP_ABORTOP(ap->a_dvp, ap->a_cnp);
+                       vput(ap->a_dvp);
+                       return (error);
+           }
+       }
+
        /* Allocate space for the link */
        error = VOP_TRUNCATE(vp, len, IO_NOZEROFILL,
                              ap->a_cnp->cn_cred, ap->a_cnp->cn_proc);
        /* Write the link to disk */
        bp = getblk(vp, 0, roundup((int)fp->ff_size, VTOHFS(vp)->hfs_phys_block_size),
                        0, 0, BLK_META);
+       if (hfsmp->jnl) {
+               journal_modify_block_start(hfsmp->jnl, bp);
+       }
        bzero(bp->b_data, bp->b_bufsize);
        bcopy(ap->a_target, bp->b_data, len);
-       bawrite(bp);
+       if (hfsmp->jnl) {
+               journal_modify_block_end(hfsmp->jnl, bp);
+       } else {
+               bawrite(bp);
+       }
 out:
+       if (hfsmp->jnl) {
+               journal_end_transaction(hfsmp->jnl);
+       }
+       hfs_global_shared_lock_release(hfsmp);
        vput(vp);
        return (error);
 }
        off_t off = uio->uio_offset;
        int retval = 0;
        int eofflag = 0;
-
+       void *user_start = NULL;
+       int   user_len;
+ 
        /* We assume it's all one big buffer... */
        if (uio->uio_iovcnt > 1 || uio->uio_resid < AVERAGE_HFSDIRENTRY_SIZE)
                return EINVAL;
 
+       // XXXdbg
+       // We have to lock the user's buffer here so that we won't
+       // fault on it after we've acquired a shared lock on the
+       // catalog file.  The issue is that you can get a 3-way
+       // deadlock if someone else starts a transaction and then
+       // tries to lock the catalog file but can't because we're
+       // here and we can't service our page fault because VM is
+       // blocked trying to start a transaction as a result of
+       // trying to free up pages for our page fault.  It's messy
+       // but it does happen on dual-processors that are paging
+       // heavily (see radar 3082639 for more info).  By locking
+       // the buffer up-front we prevent ourselves from faulting
+       // while holding the shared catalog file lock.
+       //
+       // Fortunately this and hfs_search() are the only two places
+       // currently (10/30/02) that can fault on user data with a
+       // shared lock on the catalog file.
+       //
+       if (hfsmp->jnl && uio->uio_segflg == UIO_USERSPACE) {
+               user_start = uio->uio_iov->iov_base;
+               user_len   = uio->uio_iov->iov_len;
+
+               if ((retval = vslock(user_start, user_len)) != 0) {
+                       return retval;
+               }
+       }
+
+
        /* Create the entries for . and .. */
        if (uio->uio_offset < sizeof(rootdots)) {
                caddr_t dep;
        }
 
 Exit:;
+       if (hfsmp->jnl && user_start) {
+               vsunlock(user_start, user_len, TRUE);
+       }
+
        if (ap->a_eofflag)
                *ap->a_eofflag = eofflag;
 
                }
                bcopy(bp->b_data, fp->ff_symlinkptr, (size_t)fp->ff_size);
                if (bp) {
-                       bp->b_flags |= B_INVAL;         /* data no longer needed */
+                       if (VTOHFS(vp)->jnl && (bp->b_flags & B_LOCKED) == 0) {
+                               bp->b_flags |= B_INVAL;         /* data no longer needed */
+                       }
                        brelse(bp);
                }
        }
        struct cat_fork *rsrcforkp = NULL;
        struct cat_fork datafork;
        int updateflag;
+       struct hfsmount *hfsmp;
        int error;
 
+       hfsmp = VTOHFS(vp);
+
        /* XXX do we really want to clear the sytem cnode flags here???? */
        if ((vp->v_flag & VSYSTEM) ||
            (VTOVFS(vp)->mnt_flag & MNT_RDONLY) ||
        updateflag = cp->c_flag & (C_ACCESS | C_CHANGE | C_MODIFIED | C_UPDATE);
 
        /* Nothing to update. */
-       if (updateflag == 0)
+       if (updateflag == 0) {
                return (0);
+       }
        /* HFS standard doesn't have access times. */
-       if ((updateflag == C_ACCESS) && (VTOVCB(vp)->vcbSigWord == kHFSSigWord))
+       if ((updateflag == C_ACCESS) && (VTOVCB(vp)->vcbSigWord == kHFSSigWord)) {
                return (0);
+       }
        if (updateflag & C_ACCESS) {
                /*
                 * If only the access time is changing then defer
            (dataforkp && cp->c_datafork->ff_unallocblocks) ||
            (rsrcforkp && cp->c_rsrcfork->ff_unallocblocks)) {
                if (updateflag & (C_CHANGE | C_UPDATE))
-                       hfs_volupdate(VTOHFS(vp), VOL_UPDATE, 0);       
+                       hfs_volupdate(hfsmp, VOL_UPDATE, 0);    
                cp->c_flag &= ~(C_ACCESS | C_CHANGE | C_UPDATE);
                cp->c_flag |= C_MODIFIED;
+
                return (0);
        }
 
+
+       // XXXdbg
+       hfs_global_shared_lock_acquire(hfsmp);
+       if (hfsmp->jnl) {
+               if ((error = journal_start_transaction(hfsmp->jnl)) != 0) {
+                       hfs_global_shared_lock_release(hfsmp);
+                       return error;
+           }
+       }
+                       
+
        /*
         * For files with invalid ranges (holes) the on-disk
         * field representing the size of the file (cf_size)
         * A shared lock is sufficient since an update doesn't change
         * the tree and the lock on vp protects the cnode.
         */
-       error = hfs_metafilelocking(VTOHFS(vp), kHFSCatalogFileID, LK_SHARED, p);
-       if (error)
+       error = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_SHARED, p);
+       if (error) {
+               if (hfsmp->jnl) {
+                       journal_end_transaction(hfsmp->jnl);
+               }
+               hfs_global_shared_lock_release(hfsmp);
                return (error);
+       }
 
        /* XXX - waitfor is not enforced */
-       error = cat_update(VTOHFS(vp), &cp->c_desc, &cp->c_attr, dataforkp, rsrcforkp);
+       error = cat_update(hfsmp, &cp->c_desc, &cp->c_attr, dataforkp, rsrcforkp);
 
         /* Unlock the Catalog b-tree file. */
-       (void) hfs_metafilelocking(VTOHFS(vp), kHFSCatalogFileID, LK_RELEASE, p);
+       (void) hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_RELEASE, p);
 
        if (updateflag & (C_CHANGE | C_UPDATE))
-               hfs_volupdate(VTOHFS(vp), VOL_UPDATE, 0);       
+               hfs_volupdate(hfsmp, VOL_UPDATE, 0);    
+
+       // XXXdbg
+       if (hfsmp->jnl) {
+           journal_end_transaction(hfsmp->jnl);
+       }
+       hfs_global_shared_lock_release(hfsmp);
 
        /* After the updates are finished, clear the flags */
        cp->c_flag &= ~(C_ACCESS | C_CHANGE | C_MODIFIED | C_UPDATE | C_ATIMEMOD);
        struct proc *p;
        struct cat_desc in_desc, out_desc;
        struct cat_attr attr;
-       int error;
+       int error, started_tr = 0, grabbed_lock = 0;
        enum vtype vnodetype;
 
        p = cnp->cn_proc;
        in_desc.cd_parentcnid = dcp->c_cnid;
        in_desc.cd_flags = S_ISDIR(mode) ? CD_ISDIR : 0;
 
+       // XXXdbg
+       hfs_global_shared_lock_acquire(hfsmp);
+       grabbed_lock = 1;
+       if (hfsmp->jnl) {
+           if ((error = journal_start_transaction(hfsmp->jnl)) != 0) {
+                       goto exit;
+           }
+               started_tr = 1;
+       }
+
        /* Lock catalog b-tree */
        error = hfs_metafilelocking(VTOHFS(dvp), kHFSCatalogFileID, LK_EXCLUSIVE, p);
        if (error)
        dcp->c_flag |= C_CHANGE | C_UPDATE;
        tv = time;
        (void) VOP_UPDATE(dvp, &tv, &tv, 0);
+
        hfs_volupdate(hfsmp, vnodetype == VDIR ? VOL_MKDIR : VOL_MKFILE,
                (dcp->c_cnid == kHFSRootFolderID));
 
+       // XXXdbg
+       // have to end the transaction here before we call hfs_getnewvnode()
+       // because that can cause us to try and reclaim a vnode on a different
+       // file system which could cause us to start a transaction which can
+       // deadlock with someone on that other file system (since we could be
+       // holding two transaction locks as well as various vnodes and we did
+       // not obtain the locks on them in the proper order).
+    //
+       // NOTE: this means that if the quota check fails or we have to update
+       //       the change time on a block-special device that those changes
+       //       will happen as part of independent transactions.
+       //
+       if (started_tr) {
+               journal_end_transaction(hfsmp->jnl);
+               started_tr = 0;
+       }
+       if (grabbed_lock) {
+               hfs_global_shared_lock_release(hfsmp);
+               grabbed_lock = 0;
+       }
+
        /* Create a vnode for the object just created: */
        error = hfs_getnewvnode(hfsmp, NULL, &out_desc, 0, &attr, NULL, &tvp);
        if (error)
                goto exit;
 
+
 #if QUOTA
        cp = VTOC(tvp);
        /* 
                        VOP_RMDIR(dvp,tvp, cnp);
                else
                        VOP_REMOVE(dvp,tvp, cnp);
+
                return (error);
        }
 #endif /* QUOTA */
                tvp->v_type = IFTOVT(mode);
                cp->c_flag |= C_CHANGE;
                tv = time;
-               if ((error = VOP_UPDATE(tvp, &tv, &tv, 1))) {
-                       vput(tvp);
+               if ((error = VOP_UPDATE(tvp, &tv, &tv, 1))) {
+                       vput(tvp);
                        goto exit;
                }
        }
                FREE_ZONE(cnp->cn_pnbuf, cnp->cn_pnlen, M_NAMEI);
        vput(dvp);
 
+       // XXXdbg
+       if (started_tr) {
+           journal_end_transaction(hfsmp->jnl);
+               started_tr = 0;
+       }
+       if (grabbed_lock) {
+               hfs_global_shared_lock_release(hfsmp);
+               grabbed_lock = 0;
+       }
+
        return (error);
 }
 
 
        err = ReleaseNode (btreePtr, &nodeRec);
        M_ExitOnError (err);
 
+       /*
+        * Under Mac OS, b-tree nodes can be non-contiguous on disk when the
+        * allocation block size is smaller than the b-tree node size.
+        *
+        * If journaling is turned on for this volume we can't deal with this
+        * situation and so we bail out.  If journaling isn't on it's ok as
+        * hfs_strategy_fragmented() deals with it.  Journaling can't support
+        * this because it assumes that if you give it a block that it's
+        * contiguous on disk.
+        */
+       if ( FCBTOHFS(filePtr)->jnl && !NodesAreContiguous(FCBTOVCB(filePtr), filePtr, btreePtr->nodeSize) ) {
+               return fsBTInvalidNodeErr;
+       }
+
        //////////////////////////////// Success ////////////////////////////////////
 
        //\80\80 align LEOF to multiple of node size?       - just on close
        if (filePtr == nil)                                                                     return  paramErr;
        if (searchIterator == nil)                                                      return  paramErr;
 
+       node.buffer = nil;
+       node.blockHeader = nil;
+
        btreePtr = (BTreeControlBlockPtr) filePtr->fcbBTCBPtr;
        if (btreePtr == nil)                                                            return  fsBTInvalidFileErr;
 
 
        ////////////////////////// Priliminary Checks ///////////////////////////////
 
-       left.buffer             = nil;
-       right.buffer    = nil;
-       node.buffer             = nil;
+       left.buffer               = nil;
+       left.blockHeader  = nil;
+       right.buffer      = nil;
+       right.blockHeader = nil;
+       node.buffer               = nil;
+       node.blockHeader  = nil;
 
 
        if (filePtr == nil)
 
        ////////////////////////// Priliminary Checks ///////////////////////////////
 
-       left.buffer  = nil;
-       right.buffer = nil;
-       node.buffer  = nil;
+       left.buffer       = nil;
+       left.blockHeader  = nil;
+       right.buffer      = nil;
+       right.blockHeader = nil;
+       node.buffer       = nil;
+       node.blockHeader  = nil;
 
        btreePtr = (BTreeControlBlockPtr) filePtr->fcbBTCBPtr;
 
        UInt16                                  index;
        Boolean                                 recordFit;
 
-
        ////////////////////////// Priliminary Checks ///////////////////////////////
 
        nodeRec.buffer = nil;                                   // so we can call ReleaseNode
+       nodeRec.blockHeader = nil;
 
        err = CheckInsertParams (filePtr, iterator, record, recordLen);
        if (err != noErr)
                                                                err = GetNewNode (btreePtr, insertNodeNum, &nodeRec);
                                                                M_ExitOnError (err);
 
+                                                               // XXXdbg
+                                                               ModifyBlockStart(btreePtr->fileRefNum, &nodeRec);
+                                                               
                                                                ((NodeDescPtr)nodeRec.buffer)->kind = kBTLeafNode;
                                                                ((NodeDescPtr)nodeRec.buffer)->height   = 1;
 
                                                                btreePtr->rootNode                      = insertNodeNum;
                                                                btreePtr->firstLeafNode         = insertNodeNum;
                                                                btreePtr->lastLeafNode          = insertNodeNum;
+
                                                                M_BTreeHeaderDirty (btreePtr);
 
                                                                goto Success;
 
        if (index > 0)
        {
+               // XXXdbg
+               ModifyBlockStart(btreePtr->fileRefNum, &nodeRec);
+                                                               
                recordFit = InsertKeyRecord (btreePtr, nodeRec.buffer, index,
                                                                                &iterator->key, KeyLength(btreePtr, &iterator->key),
                                                                                record->bufferAddress, recordLen);
        ++btreePtr->writeCount;
        ++btreePtr->leafRecords;
        M_BTreeHeaderDirty (btreePtr);
-
+               
        // create hint
        iterator->hint.writeCount       = btreePtr->writeCount;
        iterator->hint.nodeNum          = insertNodeNum;
        ////////////////////////// Priliminary Checks ///////////////////////////////
 
        nodeRec.buffer = nil;                                   // so we can call ReleaseNode
+       nodeRec.blockHeader = nil;
 
        err = CheckInsertParams (filePtr, iterator, record, recordLen);
        if (err != noErr)
                err = GetNode (btreePtr, insertNodeNum, &nodeRec);
                if( err == noErr )
                {
+                       // XXXdbg
+                       ModifyBlockStart(btreePtr->fileRefNum, &nodeRec);
+                                                               
                        err = TrySimpleReplace (btreePtr, nodeRec.buffer, iterator, record, recordLen, &recordFit);
                        M_ExitOnError (err);
 
        // optimization - if simple replace will work then don't extend btree
        // \80\80 if we tried this before, and failed because it wouldn't fit then we shouldn't try this again...
 
+       // XXXdbg
+       ModifyBlockStart(btreePtr->fileRefNum, &nodeRec);
+
        err = TrySimpleReplace (btreePtr, nodeRec.buffer, iterator, record, recordLen, &recordFit);
        M_ExitOnError (err);
 
        }
 
 
+       // XXXdbg
+       ModifyBlockStart(btreePtr->fileRefNum, &nodeRec);
+                                                               
        DeleteRecord (btreePtr, nodeRec.buffer, index); // delete existing key/record
 
        err = InsertTree (btreePtr, treePathTable, &iterator->key, record->bufferAddress,
        ////////////////////////// Priliminary Checks ///////////////////////////////
 
        nodeRec.buffer = nil;                                   // so we can call ReleaseNode
+       nodeRec.blockHeader = nil;
 
        btreePtr = (BTreeControlBlockPtr) filePtr->fcbBTCBPtr;
 
                                err = GetRecordByIndex(btreePtr, nodeRec.buffer, index, &keyPtr, &recordPtr, &recordLen);
                                M_ExitOnError (err);
 
+                               // XXXdbg
+                               ModifyBlockStart(btreePtr->fileRefNum, &nodeRec);
+                                                               
                                err = callBackProc(keyPtr, recordPtr, recordLen, callBackState);
                                M_ExitOnError (err);
 
        err = GetRecordByIndex(btreePtr, nodeRec.buffer, index, &keyPtr, &recordPtr, &recordLen);
        M_ExitOnError (err);
 
+       // XXXdbg
+       ModifyBlockStart(btreePtr->fileRefNum, &nodeRec);
+                                                               
        err = callBackProc(keyPtr, recordPtr, recordLen, callBackState);
        M_ExitOnError (err);
 
        ////////////////////////// Priliminary Checks ///////////////////////////////
 
        nodeRec.buffer = nil;                                   // so we can call ReleaseNode
+       nodeRec.blockHeader = nil;
 
        M_ReturnErrorIf (filePtr == nil,        paramErr);
        M_ReturnErrorIf (iterator == nil,       paramErr);
        ++btreePtr->writeCount;
        --btreePtr->leafRecords;
        M_BTreeHeaderDirty (btreePtr);
-
+               
        iterator->hint.nodeNum  = 0;
 
        return noErr;
        return noErr;
 }
 
+// XXXdbg
+// BTIsDirty - hand the b-tree control block attached to filePtr to
+// TreeIsDirty() and return its result unchanged.  filePtr is used without
+// a nil check.
+__private_extern__
+OSStatus
+BTIsDirty(FCB *filePtr)
+{
 
+       return TreeIsDirty((BTreeControlBlockPtr) filePtr->fcbBTCBPtr);
+}
 
 /*-------------------------------------------------------------------------------
 Routine:       BTFlushPath     -       Flush BTreeControlBlock to Header Node.
        BTHeaderRec *header;    
 
 
+       node.buffer = nil;
+       node.blockHeader = nil;
+
        btreePtr = (BTreeControlBlockPtr) filePtr->fcbBTCBPtr;
        if (btreePtr == nil)
                return (fsBTInvalidFileErr);
 }
 
 
+/*-------------------------------------------------------------------------------
+Routine:       BTCheckFreeSpace
+
+Function:      Makes sure there is enough free space so that a tree operation
+            will succeed.  When the tree has fewer than (treeDepth + 1) * 10
+            free nodes, ExtendBTree is asked to grow it.
+
+Input:         filePtr - pointer to the b-tree's file control block
+
+Output:                none
+
+Result:                noErr                   - success
+                       paramErr                - filePtr is nil
+                       fsBTInvalidFileErr      - no b-tree control block attached
+                       other                   - whatever ExtendBTree returned
+            
+-------------------------------------------------------------------------------*/
+
+__private_extern__
+OSStatus       BTCheckFreeSpace                (FCB                                    *filePtr)
+{
+       BTreeControlBlockPtr    btreePtr;
+       int                                     nodesNeeded, err = noErr;
+
+
+       M_ReturnErrorIf (filePtr == nil,        paramErr);
+
+       btreePtr = (BTreeControlBlockPtr) filePtr->fcbBTCBPtr;
+
+       // Check for a missing control block before anything reads through it;
+       // REQUIRE_FILE_LOCK below is handed btreePtr->fileRefNum.
+       M_ReturnErrorIf (btreePtr == nil,       fsBTInvalidFileErr);
+
+       REQUIRE_FILE_LOCK(btreePtr->fileRefNum, true);
+
+       // XXXdbg this is highly conservative but so much better than
+       //        winding up with turds on your disk.
+       //
+       nodesNeeded = (btreePtr->treeDepth + 1) * 10;
+       
+       if (btreePtr->freeNodes < nodesNeeded) {
+               // Ask for the nodes currently in use plus the reserve we want free.
+               err = ExtendBTree(btreePtr, nodesNeeded + btreePtr->totalNodes - btreePtr->freeNodes);
+       }
+
+       return err;
+}
+
+
+/*-------------------------------------------------------------------------------
+Routine:       BTHasContiguousNodes
+
+Function:      Asks NodesAreContiguous whether the b-tree file's nodes, at the
+            tree's node size, are contiguous on disk.
+
+Input:         filePtr - pointer to the b-tree's file control block
+
+Result:                paramErr                - filePtr is nil
+                       fsBTInvalidFileErr      - no b-tree control block attached
+                       otherwise the Boolean result of NodesAreContiguous
+
+NOTE(review):  the error codes share the return value with the Boolean answer;
+            confirm how callers tell the two apart.
+-------------------------------------------------------------------------------*/
+
+__private_extern__
+OSStatus       BTHasContiguousNodes    (FCB                                    *filePtr)
+{
+       BTreeControlBlockPtr    btreePtr;
+
+
+       M_ReturnErrorIf (filePtr == nil,        paramErr);
+
+       btreePtr = (BTreeControlBlockPtr) filePtr->fcbBTCBPtr;
+
+       // Check for a missing control block before anything reads through it;
+       // REQUIRE_FILE_LOCK below is handed btreePtr->fileRefNum.
+       M_ReturnErrorIf (btreePtr == nil,       fsBTInvalidFileErr);
+
+       REQUIRE_FILE_LOCK(btreePtr->fileRefNum, true);
+
+       return NodesAreContiguous(FCBTOVCB(filePtr), filePtr, btreePtr->nodeSize);
+}
 
        nodeNumber              = 0;                            // first node number of header map record
        node.buffer             = nil;                          // clear node.buffer to get header node
                                                                                //      - and for ErrorExit
+       node.blockHeader = nil;
        
        while (true)
        {
                err = GetMapNode (btreePtr, &node, &mapPtr, &mapSize);
                M_ExitOnError (err);
                
+               // XXXdbg
+               ModifyBlockStart(btreePtr->fileRefNum, &node);
+                                                               
        //////////////////////// Find Word with Free Bit ////////////////////////////
 
                pos             = mapPtr;
        //////////////////////////// Find Map Record ////////////////////////////////
        nodeIndex                       = 0;                            // first node number of header map record
        node.buffer                     = nil;                          // invalidate node.buffer to get header node
+       node.blockHeader    = nil;
        
        while (nodeNum >= nodeIndex)
        {
        
        //////////////////////////// Mark Node Free /////////////////////////////////
 
+       // XXXdbg
+       ModifyBlockStart(btreePtr->fileRefNum, &node);
+                                                               
        nodeNum -= (nodeIndex - (mapSize << 3));                        // relative to this map record
        bitOffset = 15 - (nodeNum & 0x0000000F);                        // last 4 bits are bit offset
        mapPos += nodeNum >> 4;                                                         // point to word containing map bit
        filePtr                         = GetFileControlBlock(btreePtr->fileRefNum);
        
        mapNode.buffer          = nil;
+       mapNode.blockHeader = nil;
        newNode.buffer          = nil;
+       newNode.blockHeader = nil;
 
        mapNodeRecSize  = nodeSize - sizeof(BTNodeDescriptor) - 6;      // 2 bytes of free space (see note)
 
        
 
        /////////////////////// Initialize New Map Nodes ////////////////////////////
+       // XXXdbg - this is the correct place for this:
+       ModifyBlockStart(btreePtr->fileRefNum, &mapNode);
 
        ((BTNodeDescriptor*)mapNode.buffer)->fLink = firstNewMapNodeNum;
 
                err = GetNewNode (btreePtr, nodeNum, &newNode);
                M_ExitOnError (err);
                
+               // XXXdbg
+               ModifyBlockStart(btreePtr->fileRefNum, &newNode);
+
                ((NodeDescPtr)newNode.buffer)->numRecords       = 1;
                ((NodeDescPtr)newNode.buffer)->kind = kBTMapNode;
                
                        err = GetNode (btreePtr, nextNodeNum, &mapNode);
                        M_ExitOnError (err);
                        
+                       // XXXdbg
+                       ModifyBlockStart(btreePtr->fileRefNum, &mapNode);
+
                        mapIndex = 0;
                        
                        mapStart         = (UInt16 *) GetRecordAddress (btreePtr, mapNode.buffer, mapIndex);
        ////////////////////////////// Error Exit ///////////////////////////////////
 
 ErrorExit:
-
+       
        (void) ReleaseNode (btreePtr, &mapNode);
        (void) ReleaseNode (btreePtr, &newNode);
        
 
 
 
 
+// TreeIsDirty - return the kBTHeaderDirty bit of the control block's flags:
+// nonzero when the bit is set, zero when it is clear.
+__private_extern__
+OSStatus TreeIsDirty(BTreeControlBlockPtr btreePtr)
+{
+    OSStatus headerDirtyBit;
+
+    headerDirtyBit = btreePtr->flags & kBTHeaderDirty;
+    return headerDirtyBit;
+}
+
+
+
 /*-------------------------------------------------------------------------------
 Routine:       UpdateHeader    -       Write BTreeInfoRec fields to Header node.
 
        BTHeaderRec     *header;        
        UInt32 options;
 
-
        if ((btreePtr->flags & kBTHeaderDirty) == 0)                    // btree info already flushed
        return  noErr;
        
        
        err = GetNode (btreePtr, kHeaderNodeNum, &node );
-       if (err != noErr)
+       if (err != noErr) {
                return  err;
+       }
        
+       // XXXdbg
+       ModifyBlockStart(btreePtr->fileRefNum, &node);
+
        header = (BTHeaderRec*) ((char *)node.buffer + sizeof(BTNodeDescriptor));
        
        header->treeDepth               = btreePtr->treeDepth;
        // assume foundRecord points to Boolean
        
        left->buffer            = nil;
+       left->blockHeader   = nil;
        middle->buffer          = nil;
+       middle->blockHeader     = nil;
        right->buffer           = nil;
+       right->blockHeader      = nil;
        
        foundIt                         = false;
        
 
        // release old buffer if we have one
        if ( theScanStatePtr->bufferPtr != NULL )
        {
-               theScanStatePtr->bufferPtr->b_flags |= (B_INVAL | B_AGE);
+           theScanStatePtr->bufferPtr->b_flags |= (B_INVAL | B_AGE);
                brelse( theScanStatePtr->bufferPtr );
                theScanStatePtr->bufferPtr = NULL;
                theScanStatePtr->currentNodePtr = NULL;
        
        // now read blocks from the device 
        myErr = bread(  myDevPtr, 
-                                       myPhyBlockNum, 
-                                       myBufferSize,  
-                                       NOCRED, 
-                                       &theScanStatePtr->bufferPtr );
+                                                       myPhyBlockNum, 
+                                                       myBufferSize,  
+                                                       NOCRED, 
+                                                       &theScanStatePtr->bufferPtr );
        if ( myErr != E_NONE )
        {
                goto ExitThisRoutine;
        if ( scanState->bufferPtr != NULL )
        {
                scanState->bufferPtr->b_flags |= (B_INVAL | B_AGE);
-               brelse( scanState->bufferPtr ); 
+               brelse( scanState->bufferPtr );
                scanState->bufferPtr = NULL;
                scanState->currentNodePtr = NULL;
        }
 
        PanicIf ((level == 1) && (((NodeDescPtr)targetNode->buffer)->kind != kBTLeafNode), "\P InsertLevel: non-leaf at level 1! ");
 #endif
        leftNode.buffer = nil;
+       leftNode.blockHeader = nil;
        targetNodeNum = treePathTable [level].node;
 
        insertParent = false;
        updateParent = false;
 
+       // XXXdbg
+       ModifyBlockStart(btreePtr->fileRefNum, targetNode);
+
        ////// process first insert //////
-       
+
        err = InsertNode (btreePtr, primaryKey, targetNode, targetNodeNum, index,
                                          &newNodeNum, &newIndex, &leftNode, &updateParent, &insertParent, &newRoot );
        M_ExitOnError (err);
                UInt8 *                         recPtr;
                UInt16                          recSize;
                
+               parentNode.buffer = nil;
+               parentNode.blockHeader = nil;
+
                secondaryKey = nil;
                
                PanicIf ( (level == btreePtr->treeDepth), "\p InsertLevel: unfinished insert!?");
        
                if ( updateParent )
                {
+                       // XXXdbg
+                       ModifyBlockStart(btreePtr->fileRefNum, &parentNode);
+
                        //\80\80 debug: check if ptr == targetNodeNum
                        GetRecordByIndex (btreePtr, parentNode.buffer, index, &keyPtr, &recPtr, &recSize);
                        PanicIf( (*(UInt32 *) recPtr) != targetNodeNum, "\p InsertLevel: parent ptr doesn't match target node!");
                {
                        err = GetNode (btreePtr, leftNodeNum, leftNode);        // will be released by caller or a split below
                        M_ExitOnError (err);
+                       // XXXdbg
+                       ModifyBlockStart(btreePtr->fileRefNum, leftNode);
                }
 
                PanicIf ( ((NodeDescPtr) leftNode->buffer)->fLink != node, "\p InsertNode, RotateLeft: invalid sibling link!" );
        return noErr;
 
 ErrorExit:
-
        (void) ReleaseNode (btreePtr, leftNode);
        return err;
        
        Boolean                         deleteRequired;
        Boolean                         updateRequired;
 
-
+       // XXXdbg - initialize these to null in case we get an
+       //          error and try to exit before it's initialized
+       parentNode.buffer      = nil;   
+       parentNode.blockHeader = nil;
+       
        deleteRequired = false;
        updateRequired = false;
 
        targetNodePtr = targetNode->buffer;
        PanicIf (targetNodePtr == nil, "\pDeleteTree: targetNode has nil buffer!");
 
+       // XXXdbg
+       ModifyBlockStart(btreePtr->fileRefNum, targetNode);
+
        DeleteRecord (btreePtr, targetNodePtr, index);
                
        //\80\80 coalesce remaining records?
 
                deleteRequired = true;
                
+               siblingNode.buffer = nil;
+               siblingNode.blockHeader = nil;
+
                ////////////////// Get Siblings & Update Links //////////////////////////
                
                siblingNodeNum = targetNodePtr->bLink;                          // Left Sibling Node
                {
                        err = GetNode (btreePtr, siblingNodeNum, &siblingNode);
                        M_ExitOnError (err);
+
+                       // XXXdbg
+                       ModifyBlockStart(btreePtr->fileRefNum, &siblingNode);
+
                        ((NodeDescPtr)siblingNode.buffer)->fLink = targetNodePtr->fLink;
                        err = UpdateNode (btreePtr, &siblingNode, 0, kLockTransaction);
                        M_ExitOnError (err);
                {
                        err = GetNode (btreePtr, siblingNodeNum, &siblingNode);
                        M_ExitOnError (err);
+
+                       // XXXdbg
+                       ModifyBlockStart(btreePtr->fileRefNum, &siblingNode);
+
                        ((NodeDescPtr)siblingNode.buffer)->bLink = targetNodePtr->bLink;
                        err = UpdateNode (btreePtr, &siblingNode, 0, kLockTransaction);
                        M_ExitOnError (err);
                
                err = UpdateNode (btreePtr, targetNode, 0, kLockTransaction);
                M_ExitOnError (err);
+
                err = FreeNode (btreePtr, targetNodeNum);
                M_ExitOnError (err);
        }
                         UInt16         recSize;
                         UInt32         insertNode;
                         
+                        // XXXdbg
+                        ModifyBlockStart(btreePtr->fileRefNum, &parentNode);
+
                        //\80\80 debug: check if ptr == targetNodeNum
                        GetRecordByIndex (btreePtr, parentNode.buffer, index, &keyPtr, &recPtr, &recSize);
                        PanicIf( (*(UInt32 *) recPtr) != targetNodeNum, "\p DeleteTree: parent ptr doesn't match targetNodeNum!!");
        return  noErr;
 
 ErrorExit:
-       
+
        (void) ReleaseNode (btreePtr, targetNode);
        (void) ReleaseNode (btreePtr, &parentNode);
 
        
        originalRoot    = btreePtr->rootNode;
        
+       // XXXdbg
+       ModifyBlockStart(btreePtr->fileRefNum, blockPtr);
+
        while (true)
        {
                if ( ((NodeDescPtr)blockPtr->buffer)->numRecords > 1)
                //// Get New Root Node
                err = GetNode (btreePtr, btreePtr->rootNode, blockPtr);
                M_ExitOnError (err);
+
+               // XXXdbg
+               ModifyBlockStart(btreePtr->fileRefNum, blockPtr);
        }
        
        if (btreePtr->rootNode != originalRoot)
 
        if ( left != nil )
        {
+               // XXXdbg
+               ModifyBlockStart(btreePtr->fileRefNum, leftNode);
+
                left->fLink     = newNodeNum;
                err = UpdateNode (btreePtr, leftNode, 0, kLockTransaction);
                M_ExitOnError (err);
        err = GetNewNode (btreePtr, newNodeNum, leftNode);
        M_ExitOnError (err);
        
+       // XXXdbg
+       ModifyBlockStart(btreePtr->fileRefNum, leftNode);
+
        left            = leftNode->buffer;
        left->fLink     = rightNodeNum;
        
 
        err = RotateLeft (btreePtr, left, right, index, keyPtr, recPtr, recSize,
                                          insertIndex, insertNodeNum, &recordFit, recsRotated);
-       M_ExitOnError (err);
        
+       M_ExitOnError (err);
+
        return noErr;
        
 ErrorExit:
        Boolean                         didItFit;
        UInt16                          keyLength;      
        
+       rootNode.buffer = nil;
+       rootNode.blockHeader = nil;
+
        PanicIf (leftNode == nil, "\pAddNewRootNode: leftNode == nil");
        PanicIf (rightNode == nil, "\pAddNewRootNode: rightNode == nil");
        
        err = GetNewNode (btreePtr, rootNum, &rootNode);
        M_ExitOnError (err);
                
+       // XXXdbg
+       ModifyBlockStart(btreePtr->fileRefNum, &rootNode);
+
        ((NodeDescPtr)rootNode.buffer)->kind = kBTIndexNode;
        ((NodeDescPtr)rootNode.buffer)->height  = ++btreePtr->treeDepth;
        
 
        err = BuildCatalogKeyUTF8(vcb, destID, destName, kUndefinedStrLen, &destKey, NULL);
        ReturnIfError(err);
 
+       err = BTCheckFreeSpace(GetFileControlBlock(vcb->extentsRefNum));
+       ReturnIfError(err);
+       
        if ( isHFSPlus )
        {
                //--    Step 1: Check the catalog nodes for extents
 
        
        err = noErr;
        *hint = 0;
+
+       // XXXdbg - preflight that there's enough space
+       err = BTCheckFreeSpace(GetFileControlBlock(vcb->extentsRefNum));
+       if (err)
+               return err;
+
        MALLOC(btIterator, BTreeIterator *, sizeof(*btIterator), M_TEMP, M_WAITOK);
        bzero(btIterator, sizeof(*btIterator));
        
        if (err == noErr)
                *hint = btIterator->hint.nodeNum;
 
+       (void) BTFlushPath(GetFileControlBlock(vcb->extentsRefNum));
+       
        FREE(btIterator, M_TEMP);       
        return err;
 }
        OSErr                           err;
        
        err = noErr;
+
+       // XXXdbg - preflight that there's enough space
+       err = BTCheckFreeSpace(GetFileControlBlock(vcb->extentsRefNum));
+       if (err)
+               return err;
+
        MALLOC(btIterator, BTreeIterator *, sizeof(*btIterator), M_TEMP, M_WAITOK);
        bzero(btIterator, sizeof(*btIterator));
        
        }
 
        err = BTDeleteRecord(GetFileControlBlock(vcb->extentsRefNum), btIterator);
-
+       (void) BTFlushPath(GetFileControlBlock(vcb->extentsRefNum));
+       
        FREE(btIterator, M_TEMP);       
        return err;
 }
                //      Need to find and change a record in Extents BTree
                //
                btFCB = GetFileControlBlock(vcb->extentsRefNum);
+
+               // XXXdbg - preflight that there's enough space
+               err = BTCheckFreeSpace(btFCB);
+               if (err)
+                       return err;
+
                MALLOC(btIterator, BTreeIterator *, sizeof(*btIterator), M_TEMP, M_WAITOK);
                bzero(btIterator, sizeof(*btIterator));
 
 
                        if (err == noErr)
                                err = BTReplaceRecord(btFCB, btIterator, &btRecord, btRecordSize);
+                       (void) BTFlushPath(btFCB);
                }
                else {          //      HFS Plus volume
                        HFSPlusExtentRecord     foundData;              // The extent data actually found
                                BlockMoveData(extentData, &foundData, sizeof(HFSPlusExtentRecord));
                                err = BTReplaceRecord(btFCB, btIterator, &btRecord, btRecordSize);
                        }
+                       (void) BTFlushPath(btFCB);
                }
                FREE(btIterator, M_TEMP);       
        }
        
        return true;
 }
+
+
+//_________________________________________________________________________________
+//
+// Routine:            NodesAreContiguous
+//
+// Purpose:            Ensure that all b-tree nodes are contiguous on disk.
+//                             Returns TRUE at once when an allocation block is at least
+//                             as large as a node.  Otherwise the extents kept in the FCB,
+//                             and then any overflow extent records, are each handed to
+//                             ExtentsAreIntegral; the first record it rejects yields FALSE.
+//                             A failed overflow lookup ends the scan and yields TRUE.
+//_________________________________________________________________________________
+
+Boolean NodesAreContiguous(
+       ExtendedVCB     *vcb,
+       FCB                     *fcb,
+       UInt32          nodeSize)
+{
+       HFSPlusExtentRecord     extentRec;
+       HFSPlusExtentKey        extentKey;
+       UInt32                          nodeMask;
+       UInt32                          nextBlock;
+       UInt32                          checkedCount;
+       UInt32                          extentHint;
+       OSErr                           lookupErr;
+       Boolean                         sawLastExtent;
+       SInt64                          bytesChecked;
+
+
+       if (nodeSize <= vcb->blockSize)
+               return TRUE;
+
+       nodeMask = (nodeSize / vcb->blockSize) - 1;
+
+       // start with the extents stored in the FCB
+       (void) GetFCBExtentRecord(fcb, extentRec);
+       if ( !ExtentsAreIntegral(extentRec, nodeMask, &checkedCount, &sawLastExtent) )
+               return FALSE;
+
+       if (sawLastExtent)
+               return TRUE;
+
+       // done if the blocks checked so far already cover the whole file
+       bytesChecked = (SInt64)checkedCount * (SInt64)vcb->blockSize;
+       if (bytesChecked >= fcb->ff_size)
+               return TRUE;
+
+       // walk the overflow extent records (if any), resuming where the
+       // previous record left off
+       for (nextBlock = checkedCount; !sawLastExtent; nextBlock += checkedCount)
+       {
+               lookupErr = FindExtentRecord(vcb, kDataForkType, fcb->ff_cp->c_fileid, nextBlock, FALSE, &extentKey, extentRec, &extentHint);
+               if (lookupErr)
+                       break;
+
+               if ( !ExtentsAreIntegral(extentRec, nodeMask, &checkedCount, &sawLastExtent) )
+                       return FALSE;
+       }
+
+       return TRUE;
+}
+
 
 
        if (bp) {
                if (dirty) {
-                       bdwrite(bp);
+                       // XXXdbg
+                       struct hfsmount *hfsmp = VCBTOHFS(vcb);
+                       
+                       if (hfsmp->jnl) {
+                               journal_modify_block_end(hfsmp->jnl, bp);
+                       } else {
+                               bdwrite(bp);
+                       }
                } else {
                        brelse(bp);
                }
        UInt32  bitsPerBlock;
        UInt32  wordsPerBlock;
        Boolean dirty = false;
+       struct hfsmount *hfsmp = VCBTOHFS(vcb);
 
        //      Since this routine doesn't wrap around
        if (maxBlocks > (endingBlock - startingBlock)) {
                endingBlock = block + maxBlocks;        //      if we get this far, we've found enough
        }
        
+       // XXXdbg
+       if (hfsmp->jnl) {
+               journal_modify_block_start(hfsmp->jnl, (struct buf *)blockRef);
+       }
+
        //
        //      Allocate all of the consecutive blocks
        //
                                if (err != noErr) goto Exit;
                 buffer = currCache;
 
+                               // XXXdbg
+                               if (hfsmp->jnl) {
+                                       journal_modify_block_start(hfsmp->jnl, (struct buf *)blockRef);
+                               }
+                               
                                wordsLeft = wordsPerBlock;
                        }
                        
        UInt32  blockRef;
        UInt32  bitsPerBlock;
        UInt32  wordsPerBlock;
+       // XXXdbg
+       struct hfsmount *hfsmp = VCBTOHFS(vcb);
 
        //
        //      Pre-read the bitmap block containing the first word of allocation
                wordsLeft = wordsPerBlock - wordIndexInBlock;
        }
        
+       // XXXdbg
+       if (hfsmp->jnl) {
+               journal_modify_block_start(hfsmp->jnl, (struct buf *)blockRef);
+       }
+
        //
        //      If the first block to allocate doesn't start on a word
        //      boundary in the bitmap, then treat that first word
                        err = ReadBitmapBlock(vcb, startingBlock, &buffer, &blockRef);
                        if (err != noErr) goto Exit;
 
+                       // XXXdbg
+                       if (hfsmp->jnl) {
+                               journal_modify_block_start(hfsmp->jnl, (struct buf *)blockRef);
+                       }
+
                        //      Readjust currentWord and wordsLeft
                        currentWord = buffer;
                        wordsLeft = wordsPerBlock;
                        err = ReadBitmapBlock(vcb, startingBlock, &buffer, &blockRef);
                        if (err != noErr) goto Exit;
 
+                       // XXXdbg
+                       if (hfsmp->jnl) {
+                               journal_modify_block_start(hfsmp->jnl, (struct buf *)blockRef);
+                       }
+                       
                        //      Readjust currentWord and wordsLeft
                        currentWord = buffer;
                        wordsLeft = wordsPerBlock;
        UInt32  blockRef;
        UInt32  bitsPerBlock;
        UInt32  wordsPerBlock;
+    // XXXdbg
+       struct hfsmount *hfsmp = VCBTOHFS(vcb);
 
        //
        //      Pre-read the bitmap block containing the first word of allocation
 
        err = ReadBitmapBlock(vcb, startingBlock, &buffer, &blockRef);
        if (err != noErr) goto Exit;
+       // XXXdbg
+       if (hfsmp->jnl) {
+               journal_modify_block_start(hfsmp->jnl, (struct buf *)blockRef);
+       }
+
        //
        //      Initialize currentWord, and wordsLeft.
        //
                        err = ReadBitmapBlock(vcb, startingBlock, &buffer, &blockRef);
                        if (err != noErr) goto Exit;
 
+                       // XXXdbg
+                       if (hfsmp->jnl) {
+                               journal_modify_block_start(hfsmp->jnl, (struct buf *)blockRef);
+                       }
+
                        //      Readjust currentWord and wordsLeft
                        currentWord = buffer;
                        wordsLeft = wordsPerBlock;
                        err = ReadBitmapBlock(vcb, startingBlock, &buffer, &blockRef);
                        if (err != noErr) goto Exit;
 
+                       // XXXdbg
+                       if (hfsmp->jnl) {
+                               journal_modify_block_start(hfsmp->jnl, (struct buf *)blockRef);
+                       }
+                       
                        //      Readjust currentWord and wordsLeft
                        currentWord = buffer;
                        wordsLeft = wordsPerBlock;
 
        void            *blockHeader;
        ByteCount        blockSize;
        Boolean          blockReadFromDisk;
-       Byte             reserved[3];
+       Byte         isModified;             // XXXdbg - for journaling
+       Byte             reserved[2];
 };
 typedef struct BlockDescriptor BlockDescriptor;
 typedef BlockDescriptor *BlockDescPtr;
 extern OSStatus        BTSetLastSync           (FCB                                            *filePtr,
                                                                         UInt32                                         lastfsync );
 
+extern OSStatus        BTCheckFreeSpace        (FCB                                            *filePtr);
+
+extern OSStatus        BTHasContiguousNodes(FCB                                                *filePtr);
+
 #endif /* __APPLE_API_PRIVATE */
 #endif /* KERNEL */
 #endif // __BTREESINTERNAL__
 
 OSStatus       TrashNode                               (BTreeControlBlockPtr    btreePtr,
                                                                         NodePtr                                 nodePtr );
 
+// XXXdbg
+void ModifyBlockStart(FileReference vp, BlockDescPtr blockPtr);
+// XXXdbg
+
 OSStatus       UpdateNode                              (BTreeControlBlockPtr    btreePtr,
                                                                         NodePtr                                 nodePtr,
                                                                         UInt32                                  transactionID,
 
        
        if (fp->f_type != DTYPE_VNODE)
                return(KERN_INVALID_ARGUMENT);
+
+       if (!(fp->f_flag & FREAD))
+               return (KERN_PROTECTION_FAILURE);
+
        vp = (struct vnode *)fp->f_data;
 
        if (vp->v_type != VREG)
 
 
 
 #include <sys/types.h>
-#include <stdlib.h>
+//#include <stdlib.h>
 
 static inline char     *med3 __P((char *, char *, char *, int (*)()));
 static inline void      swapfunc __P((char *, char *, int, int));
               :(cmp(b, c) > 0 ? b : (cmp(a, c) < 0 ? a : c ));
 }
 
+__private_extern__
 void
 qsort(a, n, es, cmp)
        void *a;
 
 /*
- * Copyright (c) 1999-2001 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 1999-2002 Apple Computer, Inc. All rights reserved.
  *
  * @APPLE_LICENSE_HEADER_START@
  * 
        simple_unlock(&vp->v_interlock);
 }
 
+/*
+ * Serialize the requests to the VM
+ *
+ * Marks the vnode's ubc_info UI_BUSY on behalf of the current thread.
+ * While another thread holds UI_BUSY the caller sets UI_WANTED and
+ * sleeps on &vp->v_ubcinfo, re-checking UBCINFOEXISTS() after each
+ * wakeup.
+ *
+ * Returns:
+ *             0       -       Failure (vnode has no ubc_info, either on
+ *                             entry or after being woken up)
+ *             1       -       Successful in acquiring the lock
+ *             2       -       Successful in acquiring the lock recursively
+ *                             do not call ubc_unbusy()
+ *                             [This is strange, but saves 4 bytes in struct ubc_info]
+ */
+static int
+ubc_busy(struct vnode *vp)
+{
+       register struct ubc_info        *uip;
+
+       if (!UBCINFOEXISTS(vp))
+               return (0);
+
+       uip = vp->v_ubcinfo;
+
+       while (ISSET(uip->ui_flags, UI_BUSY)) {
+
+               if (uip->ui_owner == (void *)current_thread())  /* already ours: recursive acquire */
+                       return (2);
+
+               SET(uip->ui_flags, UI_WANTED);  /* ask the holder to wake us */
+               (void) tsleep((caddr_t)&vp->v_ubcinfo, PINOD, "ubcbusy", 0);
+
+               if (!UBCINFOEXISTS(vp)) /* ubc_info went away while we slept */
+                       return (0);
+       }
+       uip->ui_owner = (void *)current_thread();       /* recorded so a nested call returns 2 */
+
+       SET(uip->ui_flags, UI_BUSY);
+
+       return (1);
+}
+
+/*
+ * Drop the UI_BUSY state taken by ubc_busy() and wake any waiters.
+ *
+ * If the vnode no longer has a ubc_info, there is nothing to clear, but
+ * threads sleeping on &vp->v_ubcinfo are still woken.
+ */
+static void
+ubc_unbusy(struct vnode *vp)
+{
+       register struct ubc_info        *info;
+
+       if (UBCINFOEXISTS(vp)) {
+               info = vp->v_ubcinfo;
+
+               CLR(info->ui_flags, UI_BUSY);
+               info->ui_owner = (void *)NULL;
+
+               /* nobody asked to be woken: done */
+               if (!ISSET(info->ui_flags, UI_WANTED))
+                       return;
+
+               CLR(info->ui_flags, UI_WANTED);
+       }
+
+       /* reached when the info is gone, or when a waiter set UI_WANTED */
+       wakeup((caddr_t)&vp->v_ubcinfo);
+}
+
 /*
  *     Initialization of the zone for Unified Buffer Cache.
  */
                uip->ui_refcount = 1;
                uip->ui_size = 0;
                uip->ui_mapped = 0;
+               uip->ui_owner = (void *)NULL;
                ubc_lock(vp);
        }
 #if DIAGNOSTIC
 void
 ubc_info_deallocate(struct ubc_info *uip)
 {
+
        assert(uip->ui_refcount > 0);
 
-    if (uip->ui_refcount-- == 1)
+    if (uip->ui_refcount-- == 1) {    /* dropping the last reference */
+               struct vnode *vp;
+
+               vp = uip->ui_vnode;
+               if (ISSET(uip->ui_flags, UI_WANTED)) {  /* a thread is asleep in ubc_busy() */
+                       CLR(uip->ui_flags, UI_WANTED);
+                       wakeup((caddr_t)&vp->v_ubcinfo);        /* same channel ubc_busy() sleeps on */
+               }
+
                ubc_info_free(uip);
+       }
 }
 
 /*
 {
        kern_return_t kret;
        struct ubc_info *uip;
+       int    recursed;
        memory_object_control_t control;
        memory_object_perf_info_data_t   perf;
 
        if (!UBCINFOEXISTS(vp))
                return (0);
 
+       if ((recursed = ubc_busy(vp)) == 0)
+               return (0);
+
        uip = vp->v_ubcinfo;
 
        assert(uip != UBC_INFO_NULL);
        if (kret != KERN_SUCCESS) {
                printf("ubc_uncache: memory_object_change_attributes_named "
                        "kret = %d", kret);
+               if (recursed == 1)
+                       ubc_unbusy(vp);
                return (0);
        }
 
        ubc_release_named(vp);
 
+       if (recursed == 1)
+               ubc_unbusy(vp);
        return (1);
 }
 
 ubc_getobject(struct vnode *vp, int flags)
 {
        struct ubc_info *uip;
+       int    recursed;
        memory_object_control_t control;
 
-       uip = vp->v_ubcinfo;
-
        if (UBCINVALID(vp))
                return (0);
 
-       ubc_lock(vp);
+       if ((recursed = ubc_busy(vp)) == 0)
+               return (0);
 
+       uip = vp->v_ubcinfo;
        control = uip->ui_control;
 
        if ((flags & UBC_HOLDOBJECT) && (!ISSET(uip->ui_flags, UI_HASOBJREF))) {
                 * Take a temporary reference on the ubc info so that it won't go
                 * away during our recovery attempt.
                 */
+               ubc_lock(vp);
                uip->ui_refcount++;
                ubc_unlock(vp);
                if (memory_object_recover_named(control, TRUE) == KERN_SUCCESS) {
-                       ubc_lock(vp);
                        SET(uip->ui_flags, UI_HASOBJREF);
-                       ubc_unlock(vp);
                } else {
                        control = MEMORY_OBJECT_CONTROL_NULL;
                }
+               if (recursed == 1)
+                       ubc_unbusy(vp);
                ubc_info_deallocate(uip);
 
        } else {
-               ubc_unlock(vp);
+               if (recursed == 1)
+                       ubc_unbusy(vp);
        }
 
        return (control);
 ubc_hold(struct vnode *vp)
 {
        struct ubc_info *uip;
+       int    recursed;
        memory_object_control_t object;
 
        if (UBCINVALID(vp))
                return (0);
 
-       if (!UBCINFOEXISTS(vp)) {
+       if ((recursed = ubc_busy(vp)) == 0) {
                /* must be invalid or dying vnode */
                assert(UBCINVALID(vp) ||
-                          ((vp->v_flag & VXLOCK) || (vp->v_flag & VTERMINATE)));
+                       ((vp->v_flag & VXLOCK) || (vp->v_flag & VTERMINATE)));
                return (0);
        }
 
 
        ubc_lock(vp);
        uip->ui_refcount++;
+       ubc_unlock(vp);
 
        if (!ISSET(uip->ui_flags, UI_HASOBJREF)) {
-               ubc_unlock(vp);
-               if (memory_object_recover_named(uip->ui_control, TRUE) != KERN_SUCCESS) {
+               if (memory_object_recover_named(uip->ui_control, TRUE)
+                       != KERN_SUCCESS) {
+                       if (recursed == 1)
+                               ubc_unbusy(vp);
                        ubc_info_deallocate(uip);
                        return (0);
                }
-               ubc_lock(vp);
                SET(uip->ui_flags, UI_HASOBJREF);
-               ubc_unlock(vp);
-       } else {
-               ubc_unlock(vp);
        }
+       if (recursed == 1)
+               ubc_unbusy(vp);
 
        assert(uip->ui_refcount > 0);
+
        return (1);
 }
 
 ubc_release_named(struct vnode *vp)
 {
        struct ubc_info *uip;
+       int    recursed;
        memory_object_control_t control;
-       kern_return_t kret;
+       kern_return_t kret = KERN_FAILURE;
 
        if (UBCINVALID(vp))
                return (0);
 
-       if (!UBCINFOEXISTS(vp))
+       if ((recursed = ubc_busy(vp)) == 0)
                return (0);
-
        uip = vp->v_ubcinfo;
 
        /* can not release held or mapped vnodes */
        if (ISSET(uip->ui_flags, UI_HASOBJREF) && 
-           (uip->ui_refcount == 1) && !uip->ui_mapped) {
+               (uip->ui_refcount == 1) && !uip->ui_mapped) {
                control = uip->ui_control;
                assert(control);
                CLR(uip->ui_flags, UI_HASOBJREF);
                kret = memory_object_release_name(control,
                                MEMORY_OBJECT_RESPECT_CACHE);
-               return ((kret != KERN_SUCCESS) ? 0 : 1);
-       } else 
-               return (0);
+       }
+
+       if (recursed == 1)
+               ubc_unbusy(vp);
+       return ((kret != KERN_SUCCESS) ? 0 : 1);
 }
 
 /*
 
        s = splbio();
        for (bp = vp->v_dirtyblkhd.lh_first; bp; bp = nbp) {
                nbp = bp->b_vnbufs.le_next;
-               if ((bp->b_flags & B_BUSY))
+               // XXXdbg - don't flush locked blocks.  they may be journaled.
+               if ((bp->b_flags & B_BUSY) || (bp->b_flags & B_LOCKED))
                        continue;
                if ((bp->b_flags & B_DELWRI) == 0)
                        panic("spec_fsync: not dirty");
 
        int getpages;
 {
        register struct nfsnode *np = VTONFS(vp);
-       register int biosize, diff, i;
+       register int biosize, i;
+       off_t diff;
        struct buf *bp = 0, *rabp;
        struct vattr vattr;
        struct proc *p;
                bufsize = biosize;
                if ((off_t)(lbn + 1) * biosize > np->n_size && 
                    (off_t)(lbn + 1) * biosize - np->n_size < biosize) {
-                       bufsize = np->n_size - lbn * biosize;
+                       bufsize = np->n_size - (off_t)lbn * biosize;
                        bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
                }
                bp = nfs_getcacheblk(vp, lbn, bufsize, p, operation);
                bp = getblk(vp, bn, size, 0, 0, operation);
 
        if( vp->v_type == VREG)
-               bp->b_blkno = (bn * biosize) / DEV_BSIZE;
+               bp->b_blkno = ((off_t)bn * biosize) / DEV_BSIZE;
 
        return (bp);
 }
 
        register struct mbuf *m, **mpp;
        register char *cp1, *cp2;
        register int len;
-       struct mbuf *om, *m2, *recm = 0;
+       struct mbuf *om, *m2, *recm;
        u_long recmark;
 
        if (slp->ns_flag & SLP_GETSTREAM)
 
            /*
             * Now get the record part.
+            *
+            * Note that slp->ns_reclen may be 0.  Linux sometimes
+            * generates 0-length RPCs
             */
+           recm = NULL;
            if (slp->ns_cc == slp->ns_reclen) {
                recm = slp->ns_raw;
                slp->ns_raw = slp->ns_rawend = (struct mbuf *)0;
 
 #if 0
                /* (removed for UBC) */
                bufsize = biosize;
-               if ((lbn + 1) * biosize > np->n_size) {
-                       bufsize = np->n_size - lbn * biosize;
+               if ((off_t)(lbn + 1) * biosize > np->n_size) {
+                       bufsize = np->n_size - (off_t)lbn * biosize;
                        bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
                }
 #endif
 
        biosize = min(vp->v_mount->mnt_stat.f_iosize, PAGE_SIZE); /* nfs_bio.c */
 
-       *ap->a_offset = (off_t)(ap->a_lblkno *  biosize);
+       *ap->a_offset = (off_t)ap->a_lblkno *  biosize;
 
        return (0);
 }
 
 #define b_trans_head b_freelist.tqe_prev
 #define b_trans_next b_freelist.tqe_next
 #define b_real_bp    b_saveaddr
+#define b_iostate    b_rcred
+
+/* journaling uses this cluster i/o field for its own
+ * purposes because meta data buf's should never go
+ * through the clustering code.
+ */
+#define b_transaction b_vectorlist
+
+   
 
 /*
  * These flags are kept in b_flags.
 #define        B_WRITE         0x00000000      /* Write buffer (pseudo flag). */
 #define        B_WRITEINPROG   0x01000000      /* Write in progress. */
 #define        B_HDRALLOC      0x02000000      /* zone allocated buffer header */
-#define        B_UNUSED1       0x04000000      /* Unused bit */
+#define        B_NORELSE       0x04000000      /* don't brelse() in bwrite() */
 #define B_NEED_IODONE   0x08000000
                                                                /* need to do a biodone on the */
                                                                /* real_bp associated with a cluster_io */
 
 
 #define DKIOCGETMAXBLOCKCOUNTREAD    _IOR('d', 64, u_int64_t)
 #define DKIOCGETMAXBLOCKCOUNTWRITE   _IOR('d', 65, u_int64_t)
+#define DKIOCGETMAXBYTECOUNTREAD         _IOR('d', 70, u_int64_t)
+#define DKIOCGETMAXBYTECOUNTWRITE        _IOR('d', 71, u_int64_t)
 #define DKIOCGETMAXSEGMENTCOUNTREAD  _IOR('d', 66, u_int64_t)
 #define DKIOCGETMAXSEGMENTCOUNTWRITE _IOR('d', 67, u_int64_t)
+#define DKIOCGETMAXSEGMENTBYTECOUNTREAD  _IOR('d', 68, u_int64_t)
+#define DKIOCGETMAXSEGMENTBYTECOUNTWRITE _IOR('d', 69, u_int64_t)
 
 #ifdef KERNEL
 #define DKIOCSETBLOCKSIZE            _IOW('d', 24, u_int32_t)
 
 #define M_IP6MISC      88      /* IPv6 misc. memory */
 #define M_TSEGQ                89      /* TCP segment queue entry */
 #define M_IGMP         90
+#define M_JOURNAL       91      /* VFS Journaling code */
 
-#define        M_LAST          91      /* Must be last type + 1 */
+#define        M_LAST          92      /* Must be last type + 1 */
 
 /* Strings corresponding to types of memory */
 /* Must be in synch with the #defines above */
        "UDF mount"     /* 85 M_UDFMNT */ \
        "IPv6 NDP",     /* 86 M_IP6NDP */ \
        "IPv6 options", /* 87 M_IP6OPT */ \
-       "IPv6 Misc"     /* 88 M_IP6MISC */\
-       "TCP Segment Q" /* 89 M_TSEGQ */\
-       "IGMP state"    /* 90 M_IGMP */\
+       "IPv6 Misc",    /* 88 M_IP6MISC */\
+       "TCP Segment Q",/* 89 M_TSEGQ */\
+       "IGMP state",   /* 90 M_IGMP */\
+       "Journaling"    /* 91 M_JOURNAL */\
 }
 
 struct kmemstats {
 
 #define MNT_DONTBROWSE 0x00100000      /* file system is not appropriate path to user data */
 #define MNT_UNKNOWNPERMISSIONS 0x00200000 /* no known mapping for uid/gid in permissions information on disk */
 #define MNT_AUTOMOUNTED 0x00400000     /* filesystem was mounted by automounter */
+#define MNT_JOURNALED   0x00800000  /* filesystem is journaled */
 
 /*
  * NFS export related mount flags.
                        MNT_DEFEXPORTED | MNT_EXPORTANON| MNT_EXKERB    | \
                        MNT_LOCAL       |               MNT_QUOTA       | \
                        MNT_ROOTFS      | MNT_DOVOLFS   | MNT_DONTBROWSE | \
-                       MNT_UNKNOWNPERMISSIONS | MNT_AUTOMOUNTED | MNT_FIXEDSCRIPTENCODING )
+                       MNT_UNKNOWNPERMISSIONS | MNT_AUTOMOUNTED | MNT_JOURNALED | MNT_FIXEDSCRIPTENCODING )
 /*
  * External filesystem command modifier flags.
  * Unmount can use the MNT_FORCE flag.
 
        int                                             ui_refcount;/* ref count on the ubc_info */
        off_t                                   ui_size;        /* file size for the vnode */
        long                                    ui_mapped;      /* is it currently mapped */
+       void                                    *ui_owner;      /* for recursive ubc_busy */
 };
 
 /* Defines for ui_flags */
 #define UI_HASOBJREF   0x00000004              /* hold a reference on object */
 #define UI_WASMAPPED   0x00000008              /* vnode was mapped */
 #define        UI_DONTCACHE    0x00000010              /* do not cache object */
+#define        UI_BUSY                 0x00000020              /* for VM synchronization */
+#define        UI_WANTED               0x00000040              /* for VM synchronization */
 
 #endif /* __APPLE_API_PRIVATE */
 
 
 EXPINC_SUBDIRS_I386 = \
 
 DATAFILES = \
-        vfs_support.h  
+       vfs_support.h vfs_journal.h
 
 INSTALL_MI_LIST        = ${DATAFILES}
 
 
 /* number of per vnode, "in flight" buffer writes */
 #define        BUFWRITE_THROTTLE       9
 
+
 /*
  * Time in seconds before a buffer on a list is 
  * considered as a stale buffer 
 
        simple_lock(&bufhashlist_slock);
 
-#if 0  
-       if(incore(bp->b_vp, bp->b_lblkno))
-               panic("binshash: already incore");
+#if 0
+       if((bad = incore(bp->b_vp, bp->b_lblkno)))
+               panic("binshash: already incore bp 0x%x, bad 0x%x\n", bp, bad);
 #endif /* 0 */
 
        BHASHENTCHECK(bp);
                         */
                        bp->b_rcred = crdup(cred);
                }
+
                VOP_STRATEGY(bp);
 
                trace(TR_BREADMISS, pack(vp, size), blkno);
                        p->p_stats->p_ru.ru_oublock++;          /* XXX */
 
                /* Release the buffer. */
-               brelse(bp);
+               // XXXdbg - skip the brelse() when B_NORELSE is set (the flag is cleared instead)
+               if (!ISSET(bp->b_flags, B_NORELSE)) {
+                   brelse(bp);
+               } else {
+                   CLR(bp->b_flags, B_NORELSE);
+               }
 
                return (rv);
        } else {
        if (nbdwrite < 0)
                panic("bdwrite: Negative nbdwrite");
 
-       if (nbdwrite > ((nbuf/4)*3)) {
+       // can't do a bawrite() if the LOCKED bit is set because the
+       // buffer is part of a transaction and can't go to disk until
+       // the LOCKED bit is cleared.
+       if (!ISSET(bp->b_flags, B_LOCKED) && nbdwrite > ((nbuf/4)*3)) {
                if (return_error)
                        return (EAGAIN);
                else
 
        trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
 
+       // if we're invalidating a buffer that has the B_CALL bit
+       // set then call the b_iodone function so it gets cleaned
+       // up properly.
+       //
+       if (ISSET(bp->b_flags, B_META) && ISSET(bp->b_flags, B_INVAL)) {
+               if (ISSET(bp->b_flags, B_CALL) && !ISSET(bp->b_flags, B_DELWRI)) {
+                       panic("brelse: CALL flag set but not DELWRI! bp 0x%x\n", bp);
+               }
+               if (ISSET(bp->b_flags, B_CALL)) {       /* if necessary, call out */
+                       void    (*iodone_func)(struct buf *) = bp->b_iodone;
+
+                       CLR(bp->b_flags, B_CALL);       /* but note callout done */
+                       bp->b_iodone = NULL;
+
+                       if (iodone_func == NULL) {
+                               panic("brelse: bp @ 0x%x has NULL b_iodone!\n", bp);
+                       }
+                       (*iodone_func)(bp);
+               }
+       }
+       
        /* IO is done. Cleanup the UPL state */
        if (!ISSET(bp->b_flags, B_META)
                && UBCINFOEXISTS(bp->b_vp) && bp->b_bufsize) {
                        brelse(bp);
                        goto start;
                }
+               /*
+                * NOTE: YOU CAN NOT BLOCK UNTIL binshash() HAS BEEN
+                *       CALLED!  BE CAREFUL.
+                */
 
                /*
                 * if it is meta, the queue may be set to other 
        }
 
        if (ISSET(bp->b_flags, B_META) && (bp->b_data == 0))
-               panic("allocbuf: bp->b_data is NULL");
+               panic("allocbuf: bp->b_data is NULL, buf @ 0x%x", bp);
 
        bp->b_bufsize = desired_size;
        bp->b_bcount = size;
                panic("getnewbuf: null bp");
 
 found:
+       if (ISSET(bp->b_flags, B_LOCKED)) {
+           panic("getnewbuf: bp @ 0x%x is LOCKED! (flags 0x%x)\n", bp, bp->b_flags);
+       }
+       
        if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef) 
-               panic("getnewbuf: le_prev is deadbeef");
+               panic("getnewbuf: le_prev is deadbeef, buf @ 0x%x", bp);
 
        if(ISSET(bp->b_flags, B_BUSY))
-               panic("getnewbuf reusing BUSY buf");
+               panic("getnewbuf reusing BUSY buf @ 0x%x", bp);
 
        /* Clean it */
        if (bcleanbuf(bp)) {
        }
 
        if (ISSET(bp->b_flags, B_CALL)) {       /* if necessary, call out */
+               void    (*iodone_func)(struct buf *) = bp->b_iodone;
+
                CLR(bp->b_flags, B_CALL);       /* but note callout done */
-               (*bp->b_iodone)(bp);
+               bp->b_iodone = NULL;
+
+               if (iodone_func == NULL) {
+                       panic("biodone: bp @ 0x%x has NULL b_iodone!\n", bp);                   
+               } else { 
+                       (*iodone_func)(bp);
+               }
        } else if (ISSET(bp->b_flags, B_ASYNC)) /* if async, release it */
                brelse(bp);
        else {                                  /* or just wakeup the buffer */ 
        /* clear out various fields */
        bp->b_flags = B_BUSY;
        bp->b_blkno = bp->b_lblkno = 0;
+
        bp->b_iodone = 0;
        bp->b_error = 0;
        bp->b_resid = 0;
 
        (void) thread_funnel_set(kernel_flock, funnel_state);
 }
+
+
+/*
+ * qsort() comparison callback: orders two (struct buf *) table entries
+ * by their b_blkno.  The result is the block-number difference, so it is
+ * negative, zero or positive as the first entry sorts before, with or
+ * after the second.
+ */
+static int
+bp_cmp(void *a, void *b)
+{
+    struct buf *first  = *(struct buf **)a;
+    struct buf *second = *(struct buf **)b;
+    daddr_t     delta;
+
+    // per the original author: block numbers are not negative here,
+    // so a plain subtraction is good enough as a comparison.
+    delta = (first->b_blkno - second->b_blkno);
+
+    return (int)delta;
+}
+
+#define NFLUSH 32
+
+/*
+ * bflushq - write out the delayed-write buffers on one buffer queue
+ * that belong to a given mount point.
+ *
+ *     whichq  index into bufqueues[]; must be in [0, BQUEUES)
+ *     mp      only buffers whose vnode is on this mount are written
+ *
+ * Buffers that are B_DELWRI and not B_BUSY are removed from the free
+ * list, marked B_BUSY and collected up to NFLUSH at a time.  Each batch
+ * is sorted by b_blkno and issued with bawrite().  After a full batch
+ * has been issued the scan starts over from the head of the queue.
+ *
+ * Returns the number of buffers handed to bawrite(); 0 if whichq is not
+ * a valid queue index.
+ */
+int
+bflushq(int whichq, struct mount *mp)
+{
+       struct buf *bp, *next;
+       int         i, buf_count;
+       int         total_writes=0;
+       /*
+        * NOTE(review): this table is static and therefore shared by all
+        * callers; presumably callers are serialized -- confirm.
+        */
+       static struct buf *flush_table[NFLUSH];
+
+       if (whichq < 0 || whichq >= BQUEUES) {
+           /* int function: return an explicit "nothing written" count */
+           return 0;
+       }
+
+
+  restart:
+       bp = TAILQ_FIRST(&bufqueues[whichq]);
+       for(buf_count=0; bp; bp=next) {
+           /* grab the successor first: bremfree() below unlinks bp */
+           next = bp->b_freelist.tqe_next;
+                       
+           if (bp->b_vp == NULL || bp->b_vp->v_mount != mp) {
+               continue;
+           }
+
+           if ((bp->b_flags & B_DELWRI) && (bp->b_flags & B_BUSY) == 0) {
+               if (whichq != BQ_LOCKED && (bp->b_flags & B_LOCKED)) {
+                   panic("bflushq: bp @ 0x%x is locked!\n", bp);
+               }
+               
+               bremfree(bp);
+               bp->b_flags |= B_BUSY;
+               flush_table[buf_count] = bp;
+               buf_count++;
+               total_writes++;
+
+               if (buf_count >= NFLUSH) {
+                   /* table is full: issue this batch in block order */
+                   qsort(flush_table, buf_count, sizeof(struct buf *), bp_cmp);
+
+                   for(i=0; i < buf_count; i++) {
+                       bawrite(flush_table[i]);
+                   }
+
+                   goto restart;
+               }
+           }
+       }
+
+       /* issue whatever is left in a final, partial batch */
+       if (buf_count > 0) {
+           qsort(flush_table, buf_count, sizeof(struct buf *), bp_cmp);
+           for(i=0; i < buf_count; i++) {
+               bawrite(flush_table[i]);
+           }
+       }
+
+       return total_writes;
+}
 
-
 /*
  * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved.
  *
 #define CL_NOZERO    0x80
 #define CL_PAGEIN    0x100
 #define CL_DEV_MEMORY 0x200
+#define CL_PRESERVE   0x400
+
+struct clios {
+        u_int  io_completed;   /* bytes finished; cluster_iodone adds total_size */
+        u_int  io_issued;      /* bytes started; cluster_io adds io_size, subtracts on error */
+        off_t  io_offset;      /* file offset recorded with io_error (lowest seen by cluster_iodone) */
+        int    io_error;       /* first error recorded; 0 while none */
+        int    io_wanted;      /* nonzero: clear it and wakeup(&io_wanted) on completion or error */
+};
+
 
 static void cluster_zero(upl_t upl, vm_offset_t   upl_offset,
                int size, struct buf *bp);
 static int cluster_nocopy_write(struct vnode *vp, struct uio *uio,
                off_t newEOF, int devblocksize, int flags);
 static int cluster_phys_read(struct vnode *vp, struct uio *uio,
-               off_t filesize);
-static int cluster_phys_write(struct vnode *vp, struct uio *uio, off_t newEOF);
+               off_t filesize, int devblocksize, int flags);
+static int cluster_phys_write(struct vnode *vp, struct uio *uio,
+               off_t newEOF, int devblocksize, int flags);
+static int cluster_align_phys_io(struct vnode *vp, struct uio *uio,
+                vm_offset_t usr_paddr, int xsize, int devblocksize, int flags);
 static int cluster_push_x(struct vnode *vp, off_t EOF, daddr_t first, daddr_t last, int can_delay);
 static int cluster_try_push(struct vnode *vp, off_t newEOF, int can_delay, int push_all);
 
        int         total_resid;
        int         upl_offset;
        int         zero_offset;
+       int         l_blkno;
        upl_t       upl;
        struct buf *cbp;
        struct buf *cbp_head;
        struct buf *cbp_next;
        struct buf *real_bp;
        struct vnode *vp;
+       struct clios *iostate;
        int         commit_size;
        int         pg_offset;
 
        real_bp    = cbp->b_real_bp;
        vp         = cbp->b_vp;
        zero_offset= cbp->b_validend;
+       l_blkno    = cbp->b_lblkno;
+       iostate    = (struct clios *)cbp->b_iostate;
 
        while (cbp) {
                if (cbp->b_vectorcount > 1)
 
                cbp = cbp_next;
        }
+       if (zero_offset)
+               cluster_zero(upl, zero_offset, PAGE_SIZE - (zero_offset & PAGE_MASK), real_bp);
+
        if ((vp->v_flag & VTHROTTLED) && (vp->v_numoutput <= (ASYNC_THROTTLE / 3))) {
                vp->v_flag &= ~VTHROTTLED;
                wakeup((caddr_t)&vp->v_numoutput);
        }
-       if (zero_offset)
-               cluster_zero(upl, zero_offset, PAGE_SIZE - (zero_offset & PAGE_MASK), real_bp);
+       if (iostate) {
+               if (error) {
+                       off_t   error_offset;
+
+                       error_offset = (off_t)l_blkno * PAGE_SIZE_64;
 
+                       if (iostate->io_error == 0) {
+                               iostate->io_error = error;
+                               iostate->io_offset = error_offset;
+                       } else {
+                               if (error_offset < iostate->io_offset)
+                                       iostate->io_offset = error_offset;
+                       }
+               }
+               iostate->io_completed += total_size;
+
+               if (iostate->io_wanted) {
+                       iostate->io_wanted = 0;
+                       wakeup((caddr_t)&iostate->io_wanted);
+               }
+       }
        if ((b_flags & B_NEED_IODONE) && real_bp) {
                if (error) {
                        real_bp->b_flags |= B_ERROR;
                error = EIO;
 
        if (b_flags & B_COMMIT_UPL) {
-               pg_offset   = upl_offset & PAGE_MASK;
+               pg_offset   = upl_offset & PAGE_MASK;
                commit_size = (((pg_offset + total_size) + (PAGE_SIZE - 1)) / PAGE_SIZE) * PAGE_SIZE;
 
-               if (error || (b_flags & B_NOCACHE)) {
+               if (error || (b_flags & B_NOCACHE) || ((b_flags & B_PHYS) && !(b_flags & B_READ))) {
                        int upl_abort_code;
 
-                       if ((b_flags & B_PAGEOUT) && (error != ENXIO)) /* transient error */
+                       if (b_flags & B_PHYS)
+                               upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
+                       else if ((b_flags & B_PAGEOUT) && (error != ENXIO)) /* transient error */
                                upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
                        else if (b_flags & B_PGIN)
                                upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
                } else {
                        int upl_commit_flags = UPL_COMMIT_FREE_ON_EMPTY;
 
-                       if ( !(b_flags & B_PAGEOUT))
+                       if (b_flags & B_PHYS)
+                               upl_commit_flags |= UPL_COMMIT_SET_DIRTY;
+                       else if ( !(b_flags & B_PAGEOUT))
                                upl_commit_flags |= UPL_COMMIT_CLEAR_DIRTY;
                        if (b_flags & B_AGE)
                                upl_commit_flags |= UPL_COMMIT_INACTIVATE;
 }
 
 static int
-cluster_io(vp, upl, upl_offset, f_offset, non_rounded_size, devblocksize, flags, real_bp)
+cluster_io(vp, upl, upl_offset, f_offset, non_rounded_size, devblocksize, flags, real_bp, iostate)
        struct vnode *vp;
        upl_t         upl;
        vm_offset_t   upl_offset;
        int           devblocksize;
        int           flags;
        struct buf   *real_bp;
+       struct clios *iostate;
 {
        struct buf   *cbp;
        struct iovec *iovp;
-       u_int           size;
+       u_int         size;
+       u_int         io_size;
        int           io_flags;
        int           error = 0;
        int           retval = 0;
        u_int max_vectors;
        int priv;
        int zero_offset = 0;
+       u_int  first_lblkno;
 
        if (flags & CL_READ) {
                io_flags = (B_VECTORLIST | B_READ);
        }
        pl = ubc_upl_pageinfo(upl);
 
-       if (flags & CL_ASYNC)
-               io_flags |= (B_CALL | B_ASYNC);
        if (flags & CL_AGE)
                io_flags |= B_AGE;
        if (flags & CL_DUMP)
                io_flags |= B_NOCACHE;
        if (flags & CL_PAGEIN)
                io_flags |= B_PGIN;
+       if (flags & CL_PAGEOUT)
+               io_flags |= B_PAGEOUT;
+       if (flags & CL_COMMIT)
+               io_flags |= B_COMMIT_UPL;
+       if (flags & CL_PRESERVE)
+               io_flags |= B_PHYS;
 
        if (devblocksize)
                size = (non_rounded_size + (devblocksize - 1)) & ~(devblocksize - 1);
                zero_offset = upl_offset + non_rounded_size;
        }
        while (size) {
-               size_t io_size;
                int vsize;
                int i;
                int pl_index;
                else
                        io_size = size;
 
-               if (error = VOP_CMAP(vp, f_offset, io_size, &blkno, &io_size, NULL)) {
+               if (error = VOP_CMAP(vp, f_offset, io_size, &blkno, (size_t *)&io_size, NULL)) {
                        if (error == EOPNOTSUPP)
                                panic("VOP_CMAP Unimplemented");
                        break;
                if (error)
                        break;
 
-               if (flags & CL_ASYNC)
-                       cbp->b_iodone = (void *)cluster_iodone;
+               if (flags & CL_ASYNC) {
+                       cbp->b_flags |= (B_CALL | B_ASYNC);
+                       cbp->b_iodone = (void *)cluster_iodone;
+               }
                cbp->b_flags |= io_flags;
 
                cbp->b_lblkno = lblkno;
                cbp->b_uploffset = upl_offset;
                cbp->b_trans_next = (struct buf *)0;
 
+               if (cbp->b_iostate = (void *)iostate)
+                       iostate->io_issued += io_size;
+
                if (flags & CL_READ)
                        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 26)) | DBG_FUNC_NONE,
                                     cbp->b_lblkno, cbp->b_blkno, upl_offset, io_size, 0);
                         * then go ahead and issue the I/O
                         */
 start_io:              
-                       if (flags & CL_COMMIT)
-                               cbp_head->b_flags |= B_COMMIT_UPL;
-                       if (flags & CL_PAGEOUT)
-                               cbp_head->b_flags |= B_PAGEOUT;
-                       if (flags & CL_PAGEIN)
-                               cbp_head->b_flags |= B_PGIN;
-
                        if (real_bp) {
                                cbp_head->b_flags |= B_NEED_IODONE;
                                cbp_head->b_real_bp = real_bp;
        if (error) {
                int abort_size;
 
+               io_size = 0;
+               
                for (cbp = cbp_head; cbp;) {
                        struct buf * cbp_next;
  
                                _FREE(cbp->b_vectorlist, M_SEGMENT);
                        upl_offset -= cbp->b_bcount;
                        size       += cbp->b_bcount;
+                       io_size    += cbp->b_bcount;
 
                        cbp_next = cbp->b_trans_next;
                        free_io_buf(cbp);
                        cbp = cbp_next;
                }
+               if (iostate) {
+                       if (iostate->io_error == 0) {
+                               iostate->io_error = error;
+                               iostate->io_offset = f_offset - (off_t)io_size;
+                       }
+                       iostate->io_issued -= io_size;
+
+                       if (iostate->io_wanted) {
+                               iostate->io_wanted = 0;
+                               wakeup((caddr_t)&iostate->io_wanted);
+                       }
+               }
                pg_offset  = upl_offset & PAGE_MASK;
                abort_size = ((size + pg_offset + (PAGE_SIZE - 1)) / PAGE_SIZE) * PAGE_SIZE;
 
                if (flags & CL_COMMIT) {
                        int upl_abort_code;
 
-                       if ((flags & CL_PAGEOUT) && (error != ENXIO)) /* transient error */
+                       if (flags & CL_PRESERVE)
+                               upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
+                       else if ((flags & CL_PAGEOUT) && (error != ENXIO)) /* transient error */
                                upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
                        else if (flags & CL_PAGEIN)
-                           upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
+                               upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
                        else
                                upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
 
        }
 
        return (cluster_io(vp, upl, upl_offset, f_offset, io_size, devblocksize,
-                          local_flags, (struct buf *)0));
+                          local_flags, (struct buf *)0, (struct clios *)0));
 }
 
 int
                                    size - (upl_offset + rounded_size), UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
        
        retval = cluster_io(vp, upl, upl_offset, f_offset, io_size, devblocksize,
-                          local_flags | CL_READ | CL_PAGEIN, (struct buf *)0);
+                          local_flags | CL_READ | CL_PAGEIN, (struct buf *)0, (struct clios *)0);
 
        if (retval == 0) {
                int b_lblkno;
 
        f_offset = ubc_blktooff(bp->b_vp, bp->b_lblkno);
 
-        return (cluster_io(bp->b_vp, bp->b_pagelist, 0, f_offset, bp->b_bcount, 0, flags, bp));
+        return (cluster_io(bp->b_vp, bp->b_pagelist, 0, f_offset, bp->b_bcount, 0, flags, bp, (struct clios *)0));
 }
 
 int
        int           retval = 0;
 
 
-       if ((!uio) || (uio->uio_segflg != UIO_USERSPACE) || (!(vp->v_flag & VNOCACHE_DATA)))
+       if ( (!(vp->v_flag & VNOCACHE_DATA)) || (!uio) || (uio->uio_segflg != UIO_USERSPACE))
          {
            retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
            return(retval);
 
             if (upl_flags & UPL_PHYS_CONTIG)
              {
-               /*
-                * since the interface to the IOKit below us uses physical block #'s and
-                * block counts to specify the I/O, we can't handle anything that isn't
-                * devblocksize aligned 
-                */
-               if ((uio->uio_offset & (devblocksize - 1)) || (uio->uio_resid & (devblocksize - 1)))
-                   return(EINVAL);
-
                if (flags & IO_HEADZEROFILL)
                  {
                    flags &= ~IO_HEADZEROFILL;
                        return(retval);
                  }
 
-               retval = cluster_phys_write(vp, uio, newEOF);
+               retval = cluster_phys_write(vp, uio, newEOF, devblocksize, flags);
 
                if (uio->uio_resid == 0 && (flags & IO_TAILZEROFILL))
                  {
        return(retval);
 }
 
+
 static int
 cluster_nocopy_write(vp, uio, newEOF, devblocksize, flags)
        struct vnode *vp;
                       (int)upl_offset, (int)uio->uio_offset, io_size, 0, 0);
 
          error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
-                            io_size, devblocksize, 0, (struct buf *)0);
+                            io_size, devblocksize, 0, (struct buf *)0, (struct clios *)0);
 
          if (error == 0) {
            /*
        return (error);
 }
 
+
 static int
-cluster_phys_write(vp, uio, newEOF)
+cluster_phys_write(vp, uio, newEOF, devblocksize, flags)
        struct vnode *vp;
        struct uio   *uio;
        off_t        newEOF;
+       int          devblocksize;
+       int          flags;
 {
+       upl_page_info_t *pl;
+       vm_offset_t      src_paddr;
        upl_t            upl;
        vm_offset_t      upl_offset;
+       int              tail_size;
        int              io_size;
        int              upl_size;
        int              upl_needed_size;
                              (vm_offset_t)iov->iov_base & ~PAGE_MASK,
                              &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0);
 
-       if (kret != KERN_SUCCESS)
-         {
-           /* cluster_phys_write: failed to get pagelist */
-             /* note: return kret here */
+       if (kret != KERN_SUCCESS) {
+               /*
+                * cluster_phys_write: failed to get pagelist
+                * note: return kret here
+                */
              return(EINVAL);
-         }
-
+       }
        /*
         * Consider the possibility that upl_size wasn't satisfied.
         * This is a failure in the physical memory case.
         */
-       if (upl_size < upl_needed_size)
-         {
-           kernel_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
-           return(EINVAL);
-         }
+       if (upl_size < upl_needed_size) {
+               kernel_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
+               return(EINVAL);
+       }
+       pl = ubc_upl_pageinfo(upl);
 
-       /*
-        * issue a synchronous write to cluster_io
-        */
+       src_paddr = (vm_offset_t)upl_phys_page(pl, 0) + ((vm_offset_t)iov->iov_base & PAGE_MASK);
 
-       error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
-                          io_size, 0, CL_DEV_MEMORY, (struct buf *)0);
+       while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
+               int   head_size;
 
-       if (error == 0) {
-         /*
-          * The cluster_io write completed successfully,
-          * update the uio structure and commit.
-          */
+               head_size = devblocksize - (int)(uio->uio_offset & (devblocksize - 1));
 
-         ubc_upl_commit_range(upl, 0, upl_size, UPL_COMMIT_FREE_ON_EMPTY);
-           
-         iov->iov_base += io_size;
-         iov->iov_len -= io_size;
-         uio->uio_resid -= io_size;
-         uio->uio_offset += io_size;
+               if (head_size > io_size)
+                       head_size = io_size;
+
+               error = cluster_align_phys_io(vp, uio, src_paddr, head_size, devblocksize, 0);
+
+               if (error) {
+                       ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
+
+                       return(EINVAL);
+               }
+               upl_offset += head_size;
+               src_paddr  += head_size;
+               io_size    -= head_size;
        }
-       else
-         ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
+       tail_size = io_size & (devblocksize - 1);
+       io_size  -= tail_size;
+
+       if (io_size) {
+               /*
+                * issue a synchronous write to cluster_io
+                */
+               error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
+                                  io_size, 0, CL_DEV_MEMORY, (struct buf *)0, (struct clios *)0);
+       }
+       if (error == 0) {
+               /*
+                * The cluster_io write completed successfully,
+                * update the uio structure
+                */
+               uio->uio_resid  -= io_size;
+               iov->iov_len    -= io_size;
+               iov->iov_base   += io_size;
+               uio->uio_offset += io_size;
+               src_paddr       += io_size;
+
+               if (tail_size)
+                       error = cluster_align_phys_io(vp, uio, src_paddr, tail_size, devblocksize, 0);
+       }
+       /*
+        * just release our hold on the physically contiguous
+        * region without changing any state
+        */
+       ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
 
        return (error);
 }
 
+
 static int
 cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags)
        struct vnode *vp;
                                read_size = newEOF - upl_f_offset;
 
                        retval = cluster_io(vp, upl, 0, upl_f_offset, read_size, devblocksize,
-                                           CL_READ, (struct buf *)0);
+                                           CL_READ, (struct buf *)0, (struct clios *)0);
                        if (retval) {
                                /*
                                 * we had an error during the read which causes us to abort
                                        read_size = newEOF - (upl_f_offset + upl_offset);
 
                                retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, read_size, devblocksize,
-                                                   CL_READ, (struct buf *)0);
+                                                   CL_READ, (struct buf *)0, (struct clios *)0);
                                if (retval) {
                                        /*
                                         * we had an error during the read which causes us to abort
                        if (last_blkno > vp->v_lastw)
                                vp->v_lastw = last_blkno;
 
-                       ubc_upl_commit_range(upl, 0, upl_size, UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
+                       ubc_upl_commit_range(upl, 0, upl_size, UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
                        continue;
 issue_io:
                        /*
                                tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "cluster_write", 0);
                        }       
                        retval = cluster_io(vp, upl, 0, upl_f_offset, io_size, devblocksize,
-                                           io_flags, (struct buf *)0);
+                                           io_flags, (struct buf *)0, (struct clios *)0);
                }
        }
        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
 
            if (upl_flags & UPL_PHYS_CONTIG)
              {
-               retval = cluster_phys_read(vp, uio, filesize);
+               retval = cluster_phys_read(vp, uio, filesize, devblocksize, flags);
              }
            else if (uio->uio_resid < 4 * PAGE_SIZE)
              {
        return(retval);
 }
 
+
 static int
 cluster_read_x(vp, uio, filesize, devblocksize, flags)
        struct vnode *vp;
                         */
 
                        error = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset,
-                                          io_size, devblocksize, CL_READ, (struct buf *)0);
+                                          io_size, devblocksize, CL_READ, (struct buf *)0, (struct clios *)0);
                }
                if (error == 0) {
                        /*
        return (retval);
 }
 
+
 static int
 cluster_nocopy_read(vp, uio, filesize, devblocksize, flags)
        struct vnode *vp;
                       (int)upl, (int)upl_offset, (int)start_upl_f_offset, io_size, 0);
 
          error = cluster_io(vp, upl, upl_offset, start_upl_f_offset,
-                            io_size, devblocksize, CL_READ| CL_NOZERO, (struct buf *)0);
+                            io_size, devblocksize, CL_READ| CL_NOZERO, (struct buf *)0,  (struct clios *)0);
 
          if (error == 0) {
            /*
 }
 
 
+
 static int
-cluster_phys_read(vp, uio, filesize)
+cluster_phys_read(vp, uio, filesize, devblocksize, flags)
        struct vnode *vp;
        struct uio   *uio;
        off_t        filesize;
+       int          devblocksize;
+       int          flags;
 {
+       upl_page_info_t *pl;
        upl_t            upl;
        vm_offset_t      upl_offset;
+       vm_offset_t      dst_paddr;
        off_t            max_size;
        int              io_size;
+       int              tail_size;
        int              upl_size;
        int              upl_needed_size;
        int              pages_in_pl;
        int              upl_flags;
        kern_return_t    kret;
        struct iovec     *iov;
+       struct clios     iostate;
        int              error;
 
        /*
 
        max_size = filesize - uio->uio_offset;
 
-       if (max_size < (off_t)((unsigned int)iov->iov_len))
-           io_size = max_size;
+       if (max_size > (off_t)((unsigned int)iov->iov_len))
+               io_size = iov->iov_len;
        else
-           io_size = iov->iov_len;
+               io_size = max_size;
 
        upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK_64;
        upl_needed_size = upl_offset + io_size;
 
+       error       = 0;
        pages_in_pl = 0;
        upl_size = upl_needed_size;
        upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL;
                              (vm_offset_t)iov->iov_base & ~PAGE_MASK,
                              &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0);
 
-       if (kret != KERN_SUCCESS)
-         {
-           /* cluster_phys_read: failed to get pagelist */
-           return(EINVAL);
-         }
+       if (kret != KERN_SUCCESS) {
+               /*
+                * cluster_phys_read: failed to get pagelist
+                */
+               return(EINVAL);
+       }
+       if (upl_size < upl_needed_size) {
+               /*
+                * The upl_size wasn't satisfied.
+                */
+               ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
+
+               return(EINVAL);
+       }
+       pl = ubc_upl_pageinfo(upl);
+
+       dst_paddr = (vm_offset_t)upl_phys_page(pl, 0) + ((vm_offset_t)iov->iov_base & PAGE_MASK);
 
+       while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
+               int   head_size;
+
+               head_size = devblocksize - (int)(uio->uio_offset & (devblocksize - 1));
+
+               if (head_size > io_size)
+                       head_size = io_size;
+
+               error = cluster_align_phys_io(vp, uio, dst_paddr, head_size, devblocksize, CL_READ);
+
+               if (error) {
+                       ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
+
+                       return(EINVAL);
+               }
+               upl_offset += head_size;
+               dst_paddr  += head_size;
+               io_size    -= head_size;
+       }
+       tail_size = io_size & (devblocksize - 1);
+       io_size  -= tail_size;
+
+       iostate.io_completed = 0;
+       iostate.io_issued = 0;
+       iostate.io_error = 0;
+       iostate.io_wanted = 0;
+
+       while (io_size && error == 0) {
+               int  xsize;
+
+               if (io_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
+                       xsize = MAX_UPL_TRANSFER * PAGE_SIZE;
+               else
+                       xsize = io_size;
+               /*
+                * request asynchronously so that we can overlap
+                * the preparation of the next I/O... we'll do
+                * the commit after all the I/O has completed
+                * since it's all issued against the same UPL.
+                * if there are already too many outstanding reads,
+                * throttle back until we reach a more reasonable level
+                */
+               while ((iostate.io_issued - iostate.io_completed) > (2 * MAX_UPL_TRANSFER * PAGE_SIZE)) {
+                       iostate.io_wanted = 1;
+                       tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_phys_read", 0);
+               }       
+
+               error = cluster_io(vp, upl, upl_offset, uio->uio_offset, xsize, 0, 
+                                  CL_READ | CL_NOZERO | CL_DEV_MEMORY | CL_ASYNC,
+                                  (struct buf *)0, &iostate);
+               /*
+                * The cluster_io read was issued successfully,
+                * update the uio structure
+                */
+               if (error == 0) {
+                       uio->uio_resid  -= xsize;
+                       iov->iov_len    -= xsize;
+                       iov->iov_base   += xsize;
+                       uio->uio_offset += xsize;
+                       dst_paddr       += xsize;
+                       upl_offset      += xsize;
+                       io_size         -= xsize;
+               }
+       }
        /*
-        * Consider the possibility that upl_size wasn't satisfied.
+        * make sure any async reads have completed before
+        * we proceed
         */
-       if (upl_size < upl_needed_size)
-         {
-           ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
-           return(EINVAL);
-         }
+       while (iostate.io_issued != iostate.io_completed) {
+               iostate.io_wanted = 1;
+               tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_phys_read", 0);
+       }       
+       if (iostate.io_error) {
+               error = iostate.io_error;
+       }
+       if (error == 0 && tail_size)
+               error = cluster_align_phys_io(vp, uio, dst_paddr, tail_size, devblocksize, CL_READ);
 
        /*
-        * issue a synchronous read to cluster_io
+        * just release our hold on the physically contiguous
+        * region without changing any state
         */
-
-       error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
-                          io_size, 0, CL_READ| CL_NOZERO | CL_DEV_MEMORY, (struct buf *)0);
-
-       if (error == 0)
-         {
-           /*
-            * The cluster_io read completed successfully,
-            * update the uio structure and commit.
-            */
-
-           ubc_upl_commit_range(upl, 0, upl_size, UPL_COMMIT_FREE_ON_EMPTY);
-           
-           iov->iov_base += io_size;
-           iov->iov_len -= io_size;
-           uio->uio_resid -= io_size;
-           uio->uio_offset += io_size;
-         }
-       else
-           ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
+       ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
        
        return (error);
 }
 
+
 /*
  * generate advisory I/O's in the largest chunks possible
  * the completed pages will be released into the VM cache
                                 * issue an asynchronous read to cluster_io
                                 */
                                retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size, devblocksize,
-                                                   CL_ASYNC | CL_READ | CL_COMMIT | CL_AGE, (struct buf *)0);
+                                                   CL_ASYNC | CL_READ | CL_COMMIT | CL_AGE, (struct buf *)0, (struct clios *)0);
 
                                issued_io = 1;
                        }
                        vp->v_flag |= VTHROTTLED;
                        tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "cluster_push", 0);
                }
-               cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size, vp->v_ciosiz, io_flags, (struct buf *)0);
+               cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size, vp->v_ciosiz, io_flags, (struct buf *)0, (struct clios *)0);
 
                size -= io_size;
        }
 
        return(1);
 }
+
+
+/*
+ * cluster_align_phys_io
+ *
+ * Moves xsize bytes between the physical address usr_paddr and the
+ * page of vp's buffer cache that contains uio->uio_offset, going
+ * through a one-page UPL obtained from ubc_create_upl.
+ *
+ *   vp           - vnode whose cached page is used
+ *   uio          - supplies the file offset; advanced on success
+ *   usr_paddr    - physical address on the caller's side of the copy
+ *   xsize        - number of bytes to copy
+ *   devblocksize - passed through unchanged to cluster_io
+ *   flags        - only CL_READ is examined:
+ *                    set:   copy cache page -> usr_paddr
+ *                    clear: copy usr_paddr -> cache page, then write
+ *                           the page out
+ *
+ * Returns 0 on success, EINVAL if the UPL could not be created, or the
+ * error returned by cluster_io.  uio_offset/uio_resid and the current
+ * iovec's base/len are advanced by xsize only when 0 is returned.
+ *
+ * NOTE(review): nothing here checks that the copy stays inside the
+ * single page covered by the UPL -- presumably the callers guarantee
+ * that by passing less than one device block; confirm.
+ */
+static int
+cluster_align_phys_io(struct vnode *vp, struct uio *uio, vm_offset_t usr_paddr, int xsize, int devblocksize, int flags)
+{
+        struct iovec     *iov;
+        upl_page_info_t  *pl;
+        upl_t            upl;
+        vm_offset_t      ubc_paddr;
+        kern_return_t    kret;
+        int              error = 0;
+
+        iov = uio->uio_iov;
+        /*
+         * build a UPL for the one page that contains uio_offset
+         * (offset rounded down to a page boundary, length PAGE_SIZE)
+         */
+        kret = ubc_create_upl(vp,
+                              uio->uio_offset & ~PAGE_MASK_64,
+                              PAGE_SIZE,
+                              &upl,
+                              &pl,
+                              UPL_FLAGS_NONE);
+
+        if (kret != KERN_SUCCESS)
+                return(EINVAL);
+
+        if (!upl_valid_page(pl, 0)) {
+                /*
+                 * issue a synchronous read to cluster_io
+                 *
+                 * the page is not valid, so fill the whole page
+                 * before any part of it is copied from or partially
+                 * overwritten
+                 */
+                error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE, devblocksize,
+                                  CL_READ, (struct buf *)0, (struct clios *)0);
+                if (error) {
+                          ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
+
+                          return(error);
+                }
+        }
+        ubc_paddr = (vm_offset_t)upl_phys_page(pl, 0) + (int)(uio->uio_offset & PAGE_MASK_64);
+       /*
+        * ubc_paddr is the physical address of uio_offset's byte inside
+        * the page; copy in the direction selected by CL_READ.
+        * NOTE(review): the last copyp2p argument differs by direction
+        * (2 for reads, 1 for writes); its meaning is defined by
+        * copyp2p, which is not visible here.
+        */
+       if (flags & CL_READ)
+               copyp2p(ubc_paddr, usr_paddr, xsize, 2);
+       else
+               copyp2p(usr_paddr, ubc_paddr, xsize, 1);
+       /*
+        * push the page to disk when we just modified it (write), or
+        * when it was already dirty -- the UPL is released below with
+        * UPL_ABORT_DUMP_PAGES, so presumably unwritten dirty data
+        * would otherwise be discarded (confirm)
+        */
+       if ( !(flags & CL_READ) || upl_dirty_page(pl, 0)) {
+                /*
+                 * issue a synchronous write to cluster_io
+                 */
+                error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE, devblocksize,
+                                  0, (struct buf *)0, (struct clios *)0);
+       }
+       if (error == 0) {
+               uio->uio_offset += xsize;
+               iov->iov_base   += xsize;
+               iov->iov_len    -= xsize;
+               uio->uio_resid  -= xsize;
+       }
+       ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
+       /* the UPL is released on both the success and the write-error path */
+        return (error);
+}
 
--- /dev/null
+/*
+ * Copyright (c) 1995-2002 Apple Computer, Inc. All rights reserved.
+ *
+ * @APPLE_LICENSE_HEADER_START@
+ * 
+ * The contents of this file constitute Original Code as defined in and
+ * are subject to the Apple Public Source License Version 1.1 (the
+ * "License").  You may not use this file except in compliance with the
+ * License.  Please obtain a copy of the License at
+ * http://www.apple.com/publicsource and read it before using this file.
+ * 
+ * This Original Code and all software distributed under the License are
+ * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
+ * License for the specific language governing rights and limitations
+ * under the License.
+ * 
+ * @APPLE_LICENSE_HEADER_END@
+ */
+//
+// This file implements a simple write-ahead journaling layer.  
+// In theory any file system can make use of it by calling these 
+// functions when the fs wants to modify meta-data blocks.  See
+// vfs_journal.h for a more detailed description of the api and
+// data structures.
+//
+// Dominic Giampaolo (dbg@apple.com)
+//
+
+#ifdef KERNEL
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/file.h>
+#include <sys/stat.h>
+#include <sys/buf.h>
+#include <sys/proc.h>
+#include <sys/mount.h>
+#include <sys/namei.h>
+#include <sys/vnode.h>
+#include <sys/ioctl.h>
+#include <sys/tty.h>
+#include <sys/ubc.h>
+#include <sys/malloc.h>
+#include <sys/vnode.h>
+#include <kern/thread_act.h>
+#include <sys/disk.h>
+#include <miscfs/specfs/specdev.h>
+
+extern task_t kernel_task;
+
+#else
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdarg.h>
+#include <sys/types.h>
+#include "compat.h"
+
+#endif   /* KERNEL */
+
+#include "vfs_journal.h"
+
+
+// number of bytes to checksum in a block_list_header
+// NOTE: this should be enough to clear out the header
+//       fields as well as the first entry of binfo[]
+#define BLHDR_CHECKSUM_SIZE 32
+
+
+
+static int  end_transaction(transaction *tr, int force_it);
+static void abort_transaction(journal *jnl, transaction *tr);
+static void dump_journal(journal *jnl);
+
+
+//
+// CHECK_JOURNAL(jnl): sanity-check a journal pointer and the journal
+// header it points at.  Panics (reporting __FILE__/__LINE__ of the
+// use site) on the first problem found: a null journal, jdev, fsdev
+// or jhdr pointer, a bad header magic, or start/end/size values that
+// are non-positive, beyond the header's size field, or above 128MB.
+//
+// The argument is parenthesized on every use so any pointer expression
+// can be passed, but it is still evaluated more than once -- do not
+// pass an expression with side effects.
+//
+#define CHECK_JOURNAL(jnl) \
+    do { \
+    if ((jnl) == NULL) {\
+       panic("%s:%d: null journal ptr?\n", __FILE__, __LINE__);\
+    }\
+    if ((jnl)->jdev == NULL) { \
+       panic("%s:%d: jdev is null!\n", __FILE__, __LINE__);\
+    } \
+    if ((jnl)->fsdev == NULL) { \
+       panic("%s:%d: fsdev is null!\n", __FILE__, __LINE__);\
+    } \
+    if ((jnl)->jhdr == NULL) { \
+       panic("%s:%d: jhdr is null!\n", __FILE__, __LINE__);\
+    } \
+    if ((jnl)->jhdr->magic != JOURNAL_HEADER_MAGIC) {\
+       panic("%s:%d: jhdr magic corrupted (0x%x != 0x%x)\n",\
+       __FILE__, __LINE__, (jnl)->jhdr->magic, JOURNAL_HEADER_MAGIC);\
+    }\
+    if (   (jnl)->jhdr->start <= 0 \
+       || (jnl)->jhdr->start > (jnl)->jhdr->size\
+       || (jnl)->jhdr->start > 128*1024*1024) {\
+       panic("%s:%d: jhdr start looks bad (0x%llx max size 0x%llx)\n", \
+       __FILE__, __LINE__, (jnl)->jhdr->start, (jnl)->jhdr->size);\
+    }\
+    if (   (jnl)->jhdr->end <= 0 \
+       || (jnl)->jhdr->end > (jnl)->jhdr->size\
+       || (jnl)->jhdr->end > 128*1024*1024) {\
+       panic("%s:%d: jhdr end looks bad (0x%llx max size 0x%llx)\n", \
+       __FILE__, __LINE__, (jnl)->jhdr->end, (jnl)->jhdr->size);\
+    }\
+    if ((jnl)->jhdr->size > 128*1024*1024) {\
+       panic("%s:%d: jhdr size looks bad (0x%llx)\n",\
+       __FILE__, __LINE__, (jnl)->jhdr->size);\
+    } \
+    } while(0)
+
+//
+// CHECK_TRANSACTION(tr): sanity-check a transaction pointer.  Panics
+// (reporting __FILE__/__LINE__ of the use site) if the transaction or
+// its journal pointer is null, if blhdr does not point at tbuffer, if
+// total_bytes is negative, if journal_start/journal_end are negative
+// or above 128MB, or if blhdr->max_blocks is outside 1..2048.
+//
+// The argument is parenthesized on every use so any pointer expression
+// can be passed, but it is still evaluated more than once -- do not
+// pass an expression with side effects.
+//
+#define CHECK_TRANSACTION(tr) \
+    do {\
+    if ((tr) == NULL) {\
+       panic("%s:%d: null transaction ptr?\n", __FILE__, __LINE__);\
+    }\
+    if ((tr)->jnl == NULL) {\
+       panic("%s:%d: null tr->jnl ptr?\n", __FILE__, __LINE__);\
+    }\
+    if ((tr)->blhdr != (block_list_header *)(tr)->tbuffer) {\
+       panic("%s:%d: blhdr (0x%x) != tbuffer (0x%x)\n", __FILE__, __LINE__, (tr)->blhdr, (tr)->tbuffer);\
+    }\
+    if ((tr)->total_bytes < 0) {\
+       panic("%s:%d: tr total_bytes looks bad: %d\n", __FILE__, __LINE__, (tr)->total_bytes);\
+    }\
+    if ((tr)->journal_start < 0 || (tr)->journal_start > 128*1024*1024) {\
+       panic("%s:%d: tr journal start looks bad: 0x%llx\n", __FILE__, __LINE__, (tr)->journal_start);\
+    }\
+    if ((tr)->journal_end < 0 || (tr)->journal_end > 128*1024*1024) {\
+       panic("%s:%d: tr journal end looks bad: 0x%llx\n", __FILE__, __LINE__, (tr)->journal_end);\
+    }\
+    if ((tr)->blhdr && ((tr)->blhdr->max_blocks <= 0 || (tr)->blhdr->max_blocks > 2048)) {\
+       panic("%s:%d: tr blhdr max_blocks looks bad: %d\n", __FILE__, __LINE__, (tr)->blhdr->max_blocks);\
+    }\
+    } while(0)
+
+
+
+//
+// this isn't a great checksum routine but it will do for now.
+// we use it to checksum the journal header and the block list
+// headers that are at the start of each transaction.
+//
+//
+//   ptr - first byte to checksum
+//   len - number of bytes to fold in (nothing is read when len <= 0)
+//
+// returns the one's complement of the accumulated value, so an empty
+// range yields ~0.
+//
+static int
+calc_checksum(char *ptr, int len)
+{
+    const unsigned char *bytes = (const unsigned char *)ptr;
+    unsigned int         cksum = 0;
+    int                  i;
+
+    // this is a lame checksum but for now it'll do.
+    //
+    // the accumulator is unsigned on purpose: after three or four
+    // bytes "cksum << 8" no longer fits in a signed int, and shifting
+    // a signed value out of range is undefined behavior.  unsigned
+    // arithmetic wraps, which produces the same bit pattern the
+    // signed version gave on two's complement machines, so checksums
+    // already on disk still match.
+    for(i=0; i < len; i++) {
+               cksum = (cksum << 8) ^ (cksum + bytes[i]);
+    }
+
+    return (int)(~cksum);
+}
+
+
+#define JNL_WRITE 1
+#define JNL_READ  2
+
+//
+// This function sets up a fake buf and passes it directly to the
+// journal device strategy routine (so that it won't get cached in
+// the block cache).
+//
+// It also handles range checking the i/o so that we don't write
+// outside the journal boundaries and it will wrap the i/o back
+// to the beginning if necessary (skipping over the journal header).
+// 
+static size_t
+do_journal_io(journal *jnl, off_t *offset, void *data, size_t len, int direction)
+{
+    int         err, io_sz=0, curlen=len;
+    struct buf *bp;
+       int max_iosize=0, max_vectors;
+    /*
+     * jnl       - journal to do the i/o against (uses jnl->jdev,
+     *             jnl->jdev_offset and jnl->jhdr)
+     * offset    - in/out: byte offset within the journal; advanced by
+     *             every chunk transferred and reset to jhdr_size
+     *             whenever it reaches jhdr->size
+     * data      - memory to read into / write from
+     * len       - total number of bytes to transfer
+     * direction - JNL_READ or JNL_WRITE
+     *
+     * returns len when everything was transferred, or 0 as soon as
+     * any chunk fails (even if earlier chunks succeeded).
+     */
+    if (*offset < 0 || *offset > jnl->jhdr->size) {
+               panic("jnl: do_jnl_io: bad offset 0x%llx (max 0x%llx)\n", *offset, jnl->jhdr->size);
+    }
+    /*
+     * each trip through "again" takes a fresh io buf and moves one
+     * chunk of at most curlen bytes
+     */
+  again:
+    bp = alloc_io_buf(jnl->jdev, 1);
+
+    if (direction == JNL_WRITE) {
+               bp->b_flags  |= 0;   // don't have to set any flags (was: B_WRITEINPROG)
+               jnl->jdev->v_numoutput++;   // NOTE(review): presumably balanced when the write completes -- not visible here
+               vfs_io_attributes(jnl->jdev, B_WRITE, &max_iosize, &max_vectors);
+    } else if (direction == JNL_READ) {
+               bp->b_flags  |= B_READ;
+               vfs_io_attributes(jnl->jdev, B_READ, &max_iosize, &max_vectors);
+    }
+    /* if max_iosize is still 0, fall back to 128K chunks */
+       if (max_iosize == 0) {
+               max_iosize = 128 * 1024;
+       }
+    /*
+     * the chunk would run past the end of the journal: when we are
+     * sitting exactly at the end, wrap to the first byte after the
+     * header; otherwise trim the chunk so it stops at the end and let
+     * a later pass do the rest.  the adjustment is skipped when
+     * *offset is 0 or jhdr->size is 0.
+     */
+    if (*offset + (off_t)curlen > jnl->jhdr->size && *offset != 0 && jnl->jhdr->size != 0) {
+               if (*offset == jnl->jhdr->size) {
+                       *offset = jnl->jhdr->jhdr_size;
+               } else {
+                       curlen = (off_t)jnl->jhdr->size - *offset;
+               }
+    }
+    /* never hand the device more than max_iosize in one request */
+       if (curlen > max_iosize) {
+               curlen = max_iosize;
+       }
+    /* a non-positive chunk means the bookkeeping above went wrong */
+    if (curlen <= 0) {
+               panic("jnl: do_jnl_io: curlen == %d, offset 0x%llx len %d\n", curlen, *offset, len);
+    }
+    /*
+     * describe the chunk.  both block numbers are the byte position on
+     * the journal device (jdev_offset + *offset) divided by jhdr_size.
+     * NOTE(review): this treats jhdr_size as the device block size --
+     * confirm against where jhdr_size is set.
+     */
+    bp->b_bufsize = curlen;
+    bp->b_bcount  = curlen;
+    bp->b_data    = data;
+    bp->b_blkno   = (daddr_t) ((jnl->jdev_offset + *offset) / (off_t)jnl->jhdr->jhdr_size);
+    bp->b_lblkno  = (daddr_t) ((jnl->jdev_offset + *offset) / (off_t)jnl->jhdr->jhdr_size);
+    /* start the i/o and, if it was accepted, wait for it to finish */
+    err = VOP_STRATEGY(bp);
+    if (!err) {
+               err = biowait(bp);
+    }
+    /* detach the caller's memory and clear the buf before freeing it */
+    bp->b_data    = NULL;
+    bp->b_bufsize = bp->b_bcount = 0;
+    bp->b_blkno   = bp->b_lblkno = -1;
+
+    free_io_buf(bp);
+
+    if (err) {
+               printf("jnl: do_jnl_io: strategy err 0x%x\n", err);
+               return 0;
+    }
+    /* account for this chunk; go around again if more remains */
+    *offset += curlen;
+    io_sz   += curlen;
+    if (io_sz != len) {
+               // handle wrap-around
+               data    = (char *)data + curlen;
+               curlen  = len - io_sz;
+               if (*offset >= jnl->jhdr->size) {
+                       *offset = jnl->jhdr->jhdr_size;
+               }
+               goto again;
+    }
+
+    return io_sz;
+}
+
+static size_t
+read_journal_data(journal *jnl, off_t *offset, void *data, size_t len)
+{
+    // reading is just do_journal_io() with the JNL_READ direction;
+    // *offset is advanced by do_journal_io itself.
+    size_t nbytes;
+
+    nbytes = do_journal_io(jnl, offset, data, len, JNL_READ);
+
+    return nbytes;
+}
+
+static size_t
+write_journal_data(journal *jnl, off_t *offset, void *data, size_t len)
+{
+    // writing is just do_journal_io() with the JNL_WRITE direction;
+    // *offset is advanced by do_journal_io itself.
+    size_t nbytes;
+
+    nbytes = do_journal_io(jnl, offset, data, len, JNL_WRITE);
+
+    return nbytes;
+}
+
+
+static int
+write_journal_header(journal *jnl)
+{
+    off_t  hdr_off = 0;      // the header lives at the very start of the journal
+    size_t written;
+    int    flush_err;
+
+    //
+    // ask the journal device to flush its cache before the header
+    // goes out; a failure is only logged.
+    // XXXdbg note: this ioctl doesn't seem to do anything on firewire disks.
+    //
+    flush_err = VOP_IOCTL(jnl->jdev, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, NOCRED, current_proc());
+    if (flush_err != 0) {
+               printf("jnl: flushing fs disk buffer returned 0x%x\n", flush_err);
+    }
+
+    // the checksum is computed with the checksum field itself zeroed
+    jnl->jhdr->checksum = 0;
+    jnl->jhdr->checksum = calc_checksum((char *)jnl->jhdr, sizeof(struct journal_header));
+
+    written = write_journal_data(jnl, &hdr_off, jnl->header_buf, jnl->jhdr->jhdr_size);
+    if (written == jnl->jhdr->jhdr_size) {
+               return 0;
+    }
+
+    // short or failed write: mark the journal unusable and report failure
+    printf("jnl: write_journal_header: error writing the journal header!\n");
+    jnl->flags |= JOURNAL_INVALID;
+
+    return -1;
+}
+
+
+
+//
+// Releases every transaction queued on jnl->tr_freeme and then
+// empties the list.  Completed transactions are parked there by
+// buffer_flushed_callback(), which runs deep within the disk
+// driver stack and so can't do anything that would potentially
+// cause paging.  The journal api entry points call this so the
+// parked transactions don't hang around for too long.
+//
+static void
+free_old_stuff(journal *jnl)
+{
+    transaction *victim = jnl->tr_freeme;
+
+    while (victim != NULL) {
+               transaction *after = victim->next;   // save the link before the node is freed
+               kmem_free(kernel_map, (vm_offset_t)victim, sizeof(transaction));
+               victim = after;
+    }
+    jnl->tr_freeme = NULL;
+}
+
+
+
+//
+// This is our callback that lets us know when a buffer has been
+// flushed to disk.  It's called from deep within the driver stack
+// and thus is quite limited in what it can do.  Notably, it can
+// not initiate any new i/o's or allocate/free memory, so finished
+// transactions are only queued on jnl->tr_freeme here.
+//
+static void
+buffer_flushed_callback(struct buf *bp)
+{
+    transaction  *tr;
+    journal      *jnl;
+    transaction  *ctr, *prev=NULL, *next;
+    int           i, bufsize;
+
+    //printf("jnl: buf flush: bp @ 0x%x l/blkno %d/%d vp 0x%x tr @ 0x%x\n",
+    //    bp, bp->b_lblkno, bp->b_blkno, bp->b_vp, bp->b_transaction);
+
+    // snarf out the bits we want (bp is off limits once it's released below)
+    bufsize = bp->b_bufsize;
+    tr      = bp->b_transaction;
+
+    bp->b_iodone      = NULL;   // don't call us for this guy again
+    bp->b_transaction = NULL;
+
+    //
+    // This is what biodone() would do if it didn't call us.
+    // NOTE: THIS CODE *HAS* TO BE HERE!
+    //
+    if (ISSET(bp->b_flags, B_ASYNC)) { /* if async, release it */
+               brelse(bp);
+    } else {                                   /* or just wakeup the buffer */ 
+               CLR(bp->b_flags, B_WANTED);
+               wakeup(bp);
+    }
+
+    // NOTE: from here on out we do *NOT* touch bp anymore.
+
+    // no transaction was attached to this buf:
+    // then we've already seen it
+    if (tr == NULL) {
+               return;
+    }
+
+    CHECK_TRANSACTION(tr);
+
+    jnl = tr->jnl;
+    if (jnl->flags & JOURNAL_INVALID) {
+               return;
+    }
+
+    CHECK_JOURNAL(jnl);
+
+    // update the number of bytes that have been flushed.
+    // this buf may represent more than one block so its
+    // whole b_bufsize is added.
+    tr->num_flushed += bufsize;
+
+
+    // if this transaction isn't done yet, just return as
+    // there is nothing to do.
+    if ((tr->num_flushed + tr->num_killed) < tr->total_bytes) {
+               return;
+    }
+
+    //printf("jnl: tr 0x%x (0x%llx 0x%llx) in jnl 0x%x completed.\n",
+    //   tr, tr->journal_start, tr->journal_end, jnl);
+
+       // find this entry in the old_start[] index and mark it completed (clear its high bit)
+       simple_lock(&jnl->old_start_lock);
+       for(i=0; i < sizeof(jnl->old_start)/sizeof(jnl->old_start[0]); i++) {
+
+               if ((jnl->old_start[i] & ~(0x8000000000000000LL)) == tr->journal_start) {
+                       jnl->old_start[i] &= ~(0x8000000000000000LL);
+                       break;
+               }
+       }
+       if (i >= sizeof(jnl->old_start)/sizeof(jnl->old_start[0])) {
+               panic("jnl: buffer_flushed: did not find tr w/start @ %lld (tr 0x%x, jnl 0x%x)\n",
+                         tr->journal_start, tr, jnl);
+       }
+       simple_unlock(&jnl->old_start_lock);
+
+
+    // if this transaction sits at the front of the active region, advance
+    // jnl->active_start past it.  a zeroed start/end marks tr as consumed.
+    if (tr->journal_start == jnl->active_start) {
+               jnl->active_start = tr->journal_end;
+               tr->journal_start = tr->journal_end = (off_t)0;
+    }
+
+    // go through the completed_trs list and try to coalesce
+    // entries, restarting back at the beginning if we have to.
+    for(ctr=jnl->completed_trs; ctr; prev=ctr, ctr=next) {
+               if (ctr->journal_start == jnl->active_start) {
+                       jnl->active_start = ctr->journal_end;
+                       if (prev) {
+                               prev->next = ctr->next;
+                       }
+                       if (ctr == jnl->completed_trs) {
+                               jnl->completed_trs = ctr->next;
+                       }
+           
+                       next           = jnl->completed_trs;   // this starts us over again
+                       ctr->next      = jnl->tr_freeme;
+                       jnl->tr_freeme = ctr;
+                       ctr            = NULL;
+               } else if (tr->journal_end == ctr->journal_start) {
+                       ctr->journal_start = tr->journal_start;
+                       next               = jnl->completed_trs;  // this starts us over again
+                       ctr                = NULL;
+                       tr->journal_start  = tr->journal_end = (off_t)0;
+               } else if (tr->journal_start == ctr->journal_end) {
+                       ctr->journal_end  = tr->journal_end;
+                       next              = ctr->next;
+                       tr->journal_start = tr->journal_end = (off_t)0;
+               } else {
+                       next = ctr->next;
+               }
+    }
+    
+    // at this point no one should be using this guy anymore
+    tr->total_bytes = 0xfbadc0de;
+
+    // if this is true then we didn't merge with anyone
+    // so link ourselves into the completed transaction
+    // list (kept sorted by journal_start).
+    if (tr->journal_start != 0) {
+               // put this entry into the correct sorted place
+               // in the list instead of just at the head.
+               //
+       
+               prev = NULL;
+               for(ctr=jnl->completed_trs; ctr && tr->journal_start > ctr->journal_start; prev=ctr, ctr=ctr->next) {
+                       // just keep looping
+               }
+
+               if (ctr == NULL && prev == NULL) {
+                       jnl->completed_trs = tr;
+                       tr->next = NULL;
+               } else if (ctr == jnl->completed_trs) {
+                       tr->next = jnl->completed_trs;
+                       jnl->completed_trs = tr;
+               } else {
+                       tr->next = prev->next;
+                       prev->next = tr;
+               }
+    } else {
+               // if we're here this tr got merged with someone else so
+               // put it on the list to be free'd
+               tr->next       = jnl->tr_freeme;
+               jnl->tr_freeme = tr;
+    }
+}
+
+// Replay helper: copy bsize bytes from block_ptr over block fs_block of
+// jnl->fsdev and write it out.  Returns 0 on success, non-zero on failure.
+static int
+update_fs_block(journal *jnl, void *block_ptr, off_t fs_block, size_t bsize)
+{
+    int         ret;
+    struct buf *oblock_bp=NULL;
+    
+    // first read the block we want.
+    ret = meta_bread(jnl->fsdev, (daddr_t)fs_block, bsize, NOCRED, &oblock_bp);
+    if (ret != 0) {
+               printf("jnl: update_fs_block: error reading fs block # %lld! (ret %d)\n", fs_block, ret);
+               if (oblock_bp) {
+                       brelse(oblock_bp);
+                       oblock_bp = NULL;
+               }
+               // let's try to be aggressive here and just re-write the block
+               oblock_bp = getblk(jnl->fsdev, (daddr_t)fs_block, bsize, 0, 0, BLK_META);
+               if (oblock_bp == NULL) {
+                       printf("jnl: update_fs_block: getblk() for %lld failed! failing update.\n", fs_block);
+                       return -1;
+               }
+    }
+           
+    // make sure it's the correct size.
+    if (oblock_bp->b_bufsize != bsize) {
+               brelse(oblock_bp);
+               return -1;
+    }
+
+    // copy the journal data over top of it
+    memcpy(oblock_bp->b_data, block_ptr, bsize);
+
+    if ((ret = VOP_BWRITE(oblock_bp)) != 0) {
+               printf("jnl: update_fs_block: failed to update block %lld (ret %d)\n", fs_block,ret);
+               // the synchronous write path already released the buffer; don't brelse() it twice
+               return ret;
+    }
+
+    // and now invalidate it so that if someone else wants to read
+    // it in a different size they'll be able to do it.
+    ret = meta_bread(jnl->fsdev, (daddr_t)fs_block, bsize, NOCRED, &oblock_bp);
+    if (oblock_bp) {
+               oblock_bp->b_flags |= B_INVAL;
+               brelse(oblock_bp);
+    }
+           
+    return 0;
+}
+
+
+// Replay every block list between jhdr->start and jhdr->end: each journaled
+// block is read back from the journal and written over its file system block
+// via update_fs_block().  Returns 0 on success (or nothing to do), -1 on error.
+static int
+replay_journal(journal *jnl)
+{
+    int i, ret, checksum, max_bsize;
+    block_list_header *blhdr;
+    off_t offset;
+    char *buf, *block_ptr=NULL;
+    
+    // wrap the start ptr if it points to the very end of the journal
+    if (jnl->jhdr->start == jnl->jhdr->size) {
+               jnl->jhdr->start = jnl->jhdr->jhdr_size;
+    }
+    if (jnl->jhdr->end == jnl->jhdr->size) {
+               jnl->jhdr->end = jnl->jhdr->jhdr_size;
+    }
+
+    if (jnl->jhdr->start == jnl->jhdr->end) {
+               return 0;
+    }
+
+    // allocate memory for the header_block.  we'll read each blhdr into this
+    if (kmem_alloc(kernel_map, (vm_offset_t *)&buf, jnl->jhdr->blhdr_size)) {
+               printf("jnl: replay_journal: no memory for block buffer! (%d bytes)\n",
+                          jnl->jhdr->blhdr_size);
+               return -1;
+    }
+
+    printf("jnl: replay_journal: from: %lld to: %lld (joffset 0x%llx)\n",
+                  jnl->jhdr->start, jnl->jhdr->end, jnl->jdev_offset);
+
+    while(jnl->jhdr->start != jnl->jhdr->end) {
+               offset = jnl->jhdr->start;
+               ret = read_journal_data(jnl, &offset, buf, jnl->jhdr->blhdr_size);
+               if (ret != jnl->jhdr->blhdr_size) {
+                       printf("jnl: replay_journal: Could not read block list header block @ 0x%llx!\n", offset);
+                       goto bad_replay;
+               }
+
+               blhdr = (block_list_header *)buf;
+               checksum = blhdr->checksum;
+               blhdr->checksum = 0;
+               if (checksum != calc_checksum((char *)blhdr, BLHDR_CHECKSUM_SIZE)) {
+                       printf("jnl: replay_journal: bad block list header @ 0x%llx (checksum 0x%x != 0x%x)\n",
+                                  offset, checksum, calc_checksum((char *)blhdr, BLHDR_CHECKSUM_SIZE));
+                       goto bad_replay;
+               }
+               if (   blhdr->max_blocks <= 0 || blhdr->max_blocks > 2048
+                          || blhdr->num_blocks <= 0 || blhdr->num_blocks > blhdr->max_blocks) {
+                       printf("jnl: replay_journal: bad looking journal entry: max: %d num: %d\n",
+                                  blhdr->max_blocks, blhdr->num_blocks);
+                       goto bad_replay;
+               }
+       
+               for(i=1,max_bsize=0; i < blhdr->num_blocks; i++) {
+                       if (blhdr->binfo[i].bnum < 0 && blhdr->binfo[i].bnum != (off_t)-1) {
+                               printf("jnl: replay_journal: bogus block number 0x%llx\n", blhdr->binfo[i].bnum);
+                               goto bad_replay;
+                       }
+                       if (blhdr->binfo[i].bsize > max_bsize) {
+                               max_bsize = blhdr->binfo[i].bsize;
+                       }
+               }
+
+               // make sure it's at least one page in size (a block list with
+               // no data blocks leaves max_bsize at 0) and page-rounded.
+               if (max_bsize == 0 || (max_bsize & (PAGE_SIZE - 1))) {
+                       max_bsize = (max_bsize + PAGE_SIZE) & ~(PAGE_SIZE - 1);
+               }
+
+               if (kmem_alloc(kernel_map, (vm_offset_t *)&block_ptr, max_bsize)) {
+                       goto bad_replay;
+               }
+               //printf("jnl: replay_journal: %d blocks in journal entry @ 0x%llx\n", blhdr->num_blocks-1,
+               //         jnl->jhdr->start);
+               for(i=1; i < blhdr->num_blocks; i++) {
+                       int size;
+
+                       size = blhdr->binfo[i].bsize;
+
+                       ret = read_journal_data(jnl, &offset, block_ptr, size);
+                       if (ret != size) {
+                               printf("jnl: replay_journal: Could not read journal entry data @ offset 0x%llx!\n", offset);
+                               goto bad_replay;
+                       }
+
+                       // don't replay "killed" blocks
+                       if (blhdr->binfo[i].bnum == (off_t)-1) {
+                               // printf("jnl: replay_journal: skipping killed fs block (slot %d)\n", i);
+                       } else {
+                               //printf("jnl: replay_journal: fixing fs block # %lld (%d)\n",
+                               //         blhdr->binfo[i].bnum, blhdr->binfo[i].bsize);
+                               if (update_fs_block(jnl, block_ptr, blhdr->binfo[i].bnum, blhdr->binfo[i].bsize) != 0) {
+                                       goto bad_replay;
+                               }
+                       }
+
+                       // check if we need to wrap offset back to the beginning
+                       // (which is just past the journal header)
+                       //
+                       if (offset >= jnl->jhdr->size) {
+                               offset = jnl->jhdr->jhdr_size;
+                       }
+               }
+
+               kmem_free(kernel_map, (vm_offset_t)block_ptr, max_bsize);
+               block_ptr = NULL;
+
+               jnl->jhdr->start += blhdr->bytes_used;
+               if (jnl->jhdr->start >= jnl->jhdr->size) {
+                       // wrap around and skip the journal header block
+                       jnl->jhdr->start = (jnl->jhdr->start % jnl->jhdr->size) + jnl->jhdr->jhdr_size;
+               }
+
+               // only update the on-disk journal header if we've reached the
+               // last chunk of updates from this transaction.  if binfo[0].bnum
+               // is zero then we know we're at the end.
+               if (blhdr->binfo[0].bnum == 0) {
+                       if (write_journal_header(jnl) != 0) {
+                               goto bad_replay;
+                       }
+               }
+    }
+
+    kmem_free(kernel_map, (vm_offset_t)buf, jnl->jhdr->blhdr_size);
+    return 0;
+
+  bad_replay:
+    if (block_ptr) {
+               kmem_free(kernel_map, (vm_offset_t)block_ptr, max_bsize);
+    }
+    kmem_free(kernel_map, (vm_offset_t)buf, jnl->jhdr->blhdr_size);
+    return -1;
+}
+
+
+#define DEFAULT_TRANSACTION_BUFFER_SIZE  (128*1024)
+//#define DEFAULT_TRANSACTION_BUFFER_SIZE  (256*1024)  // better performance but uses more mem
+#define MAX_TRANSACTION_BUFFER_SIZE      (512*1024)
+
+// XXXdbg - so I can change it in the debugger.  0 means "not chosen yet"; size_up_tbuffer() fills it in.
+int def_tbuffer_size = 0;
+
+
+//
+// This function sets the size of the tbuffer (jnl->tbuffer_size) and
+// the size of the blhdr (jnl->jhdr->blhdr_size).  It assumes that
+// jnl->jhdr->size and jnl->jhdr->jhdr_size are already valid.
+//
+static void
+size_up_tbuffer(journal *jnl, int tbuffer_size, int phys_blksz)
+{
+       // one-time initialization of the default tbuffer size based
+       // on how much memory there is in the machine.
+       if (def_tbuffer_size == 0) {
+               if (mem_size < (256*1024*1024)) {
+                       def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE;
+               } else if (mem_size < (512*1024*1024)) {
+                       def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE * 2;
+               } else if (mem_size < (1024*1024*1024)) {
+                       def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE * 3;
+               } else if (mem_size >= (1024*1024*1024)) {
+                       def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE * 4;
+               }
+       }
+
+    // size up the transaction buffer... can't be larger than the number
+    // of blocks that can fit in a block_list_header block.
+    if (tbuffer_size == 0) {
+               jnl->tbuffer_size = def_tbuffer_size;
+    } else {
+               // make sure that the specified tbuffer_size isn't too small
+               if (tbuffer_size < jnl->jhdr->blhdr_size * 2) {
+                       tbuffer_size = jnl->jhdr->blhdr_size * 2;
+               }
+               // and make sure it's an even multiple of the block size
+               if ((tbuffer_size % jnl->jhdr->jhdr_size) != 0) {
+                       tbuffer_size -= (tbuffer_size % jnl->jhdr->jhdr_size);
+               }
+               jnl->tbuffer_size = tbuffer_size;
+    }
+
+    if (jnl->tbuffer_size > (jnl->jhdr->size / 2)) {
+               jnl->tbuffer_size = (jnl->jhdr->size / 2);
+    }
+    
+    if (jnl->tbuffer_size > MAX_TRANSACTION_BUFFER_SIZE) {
+               jnl->tbuffer_size = MAX_TRANSACTION_BUFFER_SIZE;
+    }
+
+    jnl->jhdr->blhdr_size = (jnl->tbuffer_size / jnl->jhdr->jhdr_size) * sizeof(block_info);
+       if (jnl->jhdr->blhdr_size < phys_blksz) {
+               jnl->jhdr->blhdr_size = phys_blksz;
+       } else if ((jnl->jhdr->blhdr_size % phys_blksz) != 0) {
+               // round up to a whole number of physical blocks so the data after a blhdr stays block aligned
+               jnl->jhdr->blhdr_size += phys_blksz - (jnl->jhdr->blhdr_size % phys_blksz);
+       }
+}
+
+
+
+// Create a brand new, empty journal of journal_size bytes at byte `offset` on jvp
+// for the file system on fsvp and write its initial header.  flush/arg are saved
+// in the journal for later use.  Returns the journal, or NULL on any failure.
+journal *
+journal_create(struct vnode *jvp,
+                          off_t         offset,
+                          off_t         journal_size,
+                          struct vnode *fsvp,
+                          size_t        min_fs_blksz,
+                          int32_t       flags,
+                          int32_t       tbuffer_size,
+                          void        (*flush)(void *arg),
+                          void         *arg)
+{
+    journal *jnl;
+    int      ret, phys_blksz;
+
+    /* Get the real physical block size. */
+    if (VOP_IOCTL(jvp, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz, 0, FSCRED, NULL)) {
+               return NULL;
+    }
+
+    if (phys_blksz > min_fs_blksz) {
+               printf("jnl: create: error: phys blksize %d bigger than min fs blksize %d\n",
+                          phys_blksz, min_fs_blksz);
+               return NULL;
+    }
+
+    if ((journal_size % phys_blksz) != 0) {
+               printf("jnl: create: journal size 0x%llx is not an even multiple of block size 0x%x\n",
+                          journal_size, phys_blksz);
+               return NULL;
+    }
+
+    if (kmem_alloc(kernel_map, (vm_offset_t *)&jnl, sizeof(struct journal))) {
+               return NULL;
+    }
+    memset(jnl, 0, sizeof(*jnl));
+
+    jnl->jdev         = jvp;
+    jnl->jdev_offset  = offset;
+    jnl->fsdev        = fsvp;
+    jnl->flush        = flush;
+    jnl->flush_arg    = arg;
+    jnl->flags        = (flags & JOURNAL_OPTION_FLAGS_MASK);
+       simple_lock_init(&jnl->old_start_lock);
+       
+    if (kmem_alloc(kernel_map, (vm_offset_t *)&jnl->header_buf, phys_blksz)) {
+               printf("jnl: create: could not allocate space for header buffer (%d bytes)\n", phys_blksz);
+               goto bad_kmem_alloc;
+    }
+
+    memset(jnl->header_buf, 0, phys_blksz);
+    jnl->jhdr             = (journal_header *)jnl->header_buf;
+    jnl->jhdr->magic      = JOURNAL_HEADER_MAGIC;
+    jnl->jhdr->endian     = ENDIAN_MAGIC;
+    jnl->jhdr->start      = phys_blksz;    // start at block #1, block #0 is for the jhdr itself
+    jnl->jhdr->end        = phys_blksz;
+    jnl->jhdr->size       = journal_size;
+    jnl->jhdr->jhdr_size  = phys_blksz;
+    size_up_tbuffer(jnl, tbuffer_size, phys_blksz);
+
+       jnl->active_start     = jnl->jhdr->start;
+    // XXXdbg  - for testing you can force the journal to wrap around
+    // jnl->jhdr->start = jnl->jhdr->size - (phys_blksz*3);
+    // jnl->jhdr->end   = jnl->jhdr->size - (phys_blksz*3);
+    
+    if (semaphore_create(kernel_task, &jnl->jsem, SYNC_POLICY_FIFO, 1) != 0) {
+               printf("jnl: journal_create: failed to create journal semaphore..\n");
+               goto bad_sem;
+    }
+
+    if (write_journal_header(jnl) != 0) {
+               printf("jnl: journal_create: failed to write journal header.\n");
+               goto bad_write;
+    }
+
+    return jnl;
+
+  bad_write:
+    semaphore_destroy(kernel_task, jnl->jsem);
+  bad_sem:
+    kmem_free(kernel_map, (vm_offset_t)jnl->header_buf, phys_blksz);
+  bad_kmem_alloc:
+    jnl->jhdr = NULL;
+       kmem_free(kernel_map, (vm_offset_t)jnl, sizeof(struct journal));
+    return NULL;
+}
+
+
+// Open the existing journal at byte `offset` on jvp: read and sanity check its header,
+// then replay it (or, with JOURNAL_RESET, just drop the pending range).  Returns the journal or NULL.
+// NOTE(review): header_buf is sized by the device block size here but journal_close() frees jhdr_size bytes.
+journal *
+journal_open(struct vnode *jvp,
+                        off_t         offset,
+                        off_t         journal_size,
+                        struct vnode *fsvp,
+                        size_t        min_fs_blksz,
+                        int32_t       flags,
+                        int32_t       tbuffer_size,
+                        void        (*flush)(void *arg),
+                        void         *arg)
+{
+    journal *jnl;
+    int      orig_blksz=0, phys_blksz;
+    off_t    hdr_offset=0;
+
+    /* Get the real physical block size. */
+    if (VOP_IOCTL(jvp, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz, 0, FSCRED, NULL)) {
+               return NULL;
+    }
+
+    if (phys_blksz > min_fs_blksz) {
+               printf("jnl: open: error: phys blksize %d bigger than min fs blksize %d\n",
+                          phys_blksz, min_fs_blksz);
+               return NULL;
+    }
+
+    if ((journal_size % phys_blksz) != 0) {
+               printf("jnl: open: journal size 0x%llx is not an even multiple of block size 0x%x\n",
+                          journal_size, phys_blksz);
+               return NULL;
+    }
+
+    if (kmem_alloc(kernel_map, (vm_offset_t *)&jnl, sizeof(struct journal))) {
+               return NULL;
+    }
+    memset(jnl, 0, sizeof(*jnl));
+
+    jnl->jdev         = jvp;
+    jnl->jdev_offset  = offset;
+    jnl->fsdev        = fsvp;
+    jnl->flush        = flush;
+    jnl->flush_arg    = arg;
+    jnl->flags        = (flags & JOURNAL_OPTION_FLAGS_MASK);
+       simple_lock_init(&jnl->old_start_lock);
+
+    if (kmem_alloc(kernel_map, (vm_offset_t *)&jnl->header_buf, phys_blksz)) {
+               printf("jnl: open: could not allocate space for header buffer (%d bytes)\n", phys_blksz);
+               goto bad_kmem_alloc;
+    }
+
+    jnl->jhdr = (journal_header *)jnl->header_buf;
+    memset(jnl->jhdr, 0, sizeof(journal_header)+4);
+    // we have to set this up here so that do_journal_io() will work
+    jnl->jhdr->jhdr_size = phys_blksz;
+
+    if (read_journal_data(jnl, &hdr_offset, jnl->jhdr, phys_blksz) != phys_blksz) {
+               printf("jnl: open: could not read %d bytes for the journal header.\n",
+                          phys_blksz);
+               goto bad_journal;
+    }
+
+    if (jnl->jhdr->magic != JOURNAL_HEADER_MAGIC && jnl->jhdr->magic != OLD_JOURNAL_HEADER_MAGIC) {
+               printf("jnl: open: journal magic is bad (0x%x != 0x%x)\n",
+                          jnl->jhdr->magic, JOURNAL_HEADER_MAGIC);
+               goto bad_journal;
+    }
+
+       // only check if we're the current journal header magic value
+       if (jnl->jhdr->magic == JOURNAL_HEADER_MAGIC) {
+               int orig_checksum = jnl->jhdr->checksum;
+
+               jnl->jhdr->checksum = 0;
+               if (orig_checksum != calc_checksum((char *)jnl->jhdr, sizeof(struct journal_header))) {
+                       printf("jnl: open: journal checksum is bad (0x%x != 0x%x)\n", orig_checksum,
+                                  calc_checksum((char *)jnl->jhdr, sizeof(struct journal_header)));
+                       //goto bad_journal;
+               }
+       }
+
+       // XXXdbg - convert old style magic numbers to the new one
+       if (jnl->jhdr->magic == OLD_JOURNAL_HEADER_MAGIC) {
+               jnl->jhdr->magic = JOURNAL_HEADER_MAGIC;
+       }
+
+    if (phys_blksz != jnl->jhdr->jhdr_size && jnl->jhdr->jhdr_size != 0) {
+               printf("jnl: open: phys_blksz %d does not match journal header size %d\n",
+                          phys_blksz, jnl->jhdr->jhdr_size);
+
+               orig_blksz = phys_blksz;
+               phys_blksz = jnl->jhdr->jhdr_size;
+               if (VOP_IOCTL(jvp, DKIOCSETBLOCKSIZE, (caddr_t)&phys_blksz, FWRITE, FSCRED, NULL)) {
+                       printf("jnl: could not set block size to %d bytes.\n", phys_blksz);
+                       goto bad_journal;
+               }
+    }
+
+    if (   jnl->jhdr->start <= 0
+                  || jnl->jhdr->start > jnl->jhdr->size
+                  || jnl->jhdr->start > 128*1024*1024) {
+               printf("jnl: open: jhdr start looks bad (0x%llx max size 0x%llx)\n",
+                          jnl->jhdr->start, jnl->jhdr->size);
+               goto bad_journal;
+    }
+
+    if (   jnl->jhdr->end <= 0
+                  || jnl->jhdr->end > jnl->jhdr->size
+                  || jnl->jhdr->end > 128*1024*1024) {
+               printf("jnl: open: jhdr end looks bad (0x%llx max size 0x%llx)\n",
+                          jnl->jhdr->end, jnl->jhdr->size);
+               goto bad_journal;
+    }
+
+    if (jnl->jhdr->size > 128*1024*1024) {
+               printf("jnl: open: jhdr size looks bad (0x%llx)\n", jnl->jhdr->size);
+               goto bad_journal;
+    }
+
+// XXXdbg - can't do these checks because hfs writes all kinds of
+//          non-uniform sized blocks even on devices that have a block size
+//          that is larger than 512 bytes (i.e. optical media w/2k blocks).
+//          therefore these checks will fail and so we just have to punt and
+//          do more relaxed checking...
+// XXXdbg    if ((jnl->jhdr->start % jnl->jhdr->jhdr_size) != 0) {
+    if ((jnl->jhdr->start % 512) != 0) {
+               printf("jnl: open: journal start (0x%llx) not a multiple of 512?\n",
+                          jnl->jhdr->start);
+               goto bad_journal;
+    }
+
+//XXXdbg    if ((jnl->jhdr->end % jnl->jhdr->jhdr_size) != 0) {
+    if ((jnl->jhdr->end % 512) != 0) {
+               printf("jnl: open: journal end (0x%llx) not a multiple of block size (0x%x)?\n",
+                          jnl->jhdr->end, jnl->jhdr->jhdr_size);
+               goto bad_journal;
+    }
+
+    // take care of replaying the journal if necessary
+       if (flags & JOURNAL_RESET) {
+               printf("jnl: journal start/end pointers reset! (jnl 0x%x; s 0x%llx e 0x%llx)\n",
+                          jnl, jnl->jhdr->start, jnl->jhdr->end);
+               jnl->jhdr->start = jnl->jhdr->end;
+       } else if (replay_journal(jnl) != 0) {
+               printf("jnl: journal_open: Error replaying the journal!\n");
+               goto bad_journal;
+    }
+       if (orig_blksz != 0) {
+               VOP_IOCTL(jvp, DKIOCSETBLOCKSIZE, (caddr_t)&orig_blksz, FWRITE, FSCRED, NULL);
+               phys_blksz = orig_blksz;
+       }
+
+       // make sure this is in sync!
+       jnl->active_start = jnl->jhdr->start;
+
+    // set this now, after we've replayed the journal
+    size_up_tbuffer(jnl, tbuffer_size, phys_blksz);
+
+    if (semaphore_create(kernel_task, &jnl->jsem, SYNC_POLICY_FIFO, 1) != 0) {
+               printf("jnl: journal_open: failed to create journal semaphore..\n");
+               goto bad_journal;
+    }
+
+    return jnl;
+
+  bad_journal:
+       if (orig_blksz != 0) {
+               phys_blksz = orig_blksz;
+               VOP_IOCTL(jvp, DKIOCSETBLOCKSIZE, (caddr_t)&orig_blksz, FWRITE, FSCRED, NULL);
+       }
+    kmem_free(kernel_map, (vm_offset_t)jnl->header_buf, phys_blksz);
+  bad_kmem_alloc:
+       kmem_free(kernel_map, (vm_offset_t)jnl, sizeof(struct journal));
+    return NULL;    
+}
+
+// Shut the journal down and free it.  A still-valid journal has its outstanding
+// transactions ended/flushed and its header rewritten; an invalid journal just
+// has its outstanding transaction aborted.  jnl must not be used afterwards.
+void
+journal_close(journal *jnl)
+{
+    volatile off_t *start, *end;
+    int             counter=0;
+
+    CHECK_JOURNAL(jnl);
+
+       // set this before doing anything that would block so that
+       // we start tearing things down properly.
+       //
+       jnl->flags |= JOURNAL_CLOSE_PENDING;
+
+    if (jnl->owner != current_act()) {
+               int ret;
+
+               while ((ret = semaphore_wait(jnl->jsem)) == KERN_ABORTED) {
+                       // just keep trying if we've been ^C'ed
+               }
+               if (ret != 0) {
+                       printf("jnl: close: sem wait failed.\n");
+                       return;   // NOTE(review): returns without freeing the journal
+               }
+    }
+
+    //
+    // only write stuff to disk if the journal is still valid
+    //
+    if ((jnl->flags & JOURNAL_INVALID) == 0) {
+               if (jnl->active_tr) {
+                       journal_end_transaction(jnl);
+               }
+               
+               // flush any buffered transactions
+               if (jnl->cur_tr) {
+                       transaction *tr = jnl->cur_tr;
+
+                       jnl->cur_tr = NULL;
+                       end_transaction(tr, 1);   // force it to get flushed
+               }
+               //start = &jnl->jhdr->start;
+               start = &jnl->active_start;
+               end   = &jnl->jhdr->end;
+    
+               // keep calling the flush callback (at most 500 times) until active_start catches up with end
+               while (*start != *end && counter++ < 500) {
+                       printf("jnl: close: flushing the buffer cache (start 0x%llx end 0x%llx)\n", *start, *end);
+                       if (jnl->flush) {
+                               jnl->flush(jnl->flush_arg);
+                       }
+               }
+
+               if (*start != *end) {
+                       printf("jnl: close: buffer flushing didn't seem to flush out all the transactions! (0x%llx - 0x%llx)\n",
+                                  *start, *end);
+               }
+
+               // make sure this is in sync when we close the journal
+               jnl->jhdr->start = jnl->active_start;
+
+               // if this fails there's not much we can do at this point...
+               write_journal_header(jnl);
+    } else {
+               // if we're here the journal isn't valid any more.
+               // so make sure we don't leave any locked blocks lying around
+               printf("jnl: close: journal 0x%x, is invalid.  aborting outstanding transactions\n", jnl);
+               if (jnl->active_tr || jnl->cur_tr) {
+                       transaction *tr;
+                       if (jnl->active_tr) {
+                               tr = jnl->active_tr;
+                               jnl->active_tr = NULL;
+                       } else {
+                               tr = jnl->cur_tr;
+                               jnl->cur_tr = NULL;
+                       }
+
+                       abort_transaction(jnl, tr);
+                       if (jnl->active_tr || jnl->cur_tr) {
+                               panic("jnl: close: jnl @ 0x%x had both an active and cur tr\n", jnl);
+                       }
+               }
+    }
+
+    free_old_stuff(jnl);
+
+    kmem_free(kernel_map, (vm_offset_t)jnl->header_buf, jnl->jhdr->jhdr_size);
+    jnl->jhdr = (void *)0xbeefbabe;
+
+    semaphore_destroy(kernel_task, jnl->jsem);
+       kmem_free(kernel_map, (vm_offset_t)jnl, sizeof(struct journal));
+}
+
+// Debug aid: print the journal header fields and the completed transaction list.
+static void
+dump_journal(journal *jnl)
+{
+    transaction *done;
+
+    printf("journal:");
+    printf("  jdev_offset %.8llx\n", jnl->jdev_offset);
+    printf("  magic: 0x%.8x\n", jnl->jhdr->magic);
+    printf("  start: 0x%.8llx\n", jnl->jhdr->start);
+    printf("  end:   0x%.8llx\n", jnl->jhdr->end);
+    printf("  size:  0x%.8llx\n", jnl->jhdr->size);
+    printf("  blhdr size: %d\n", jnl->jhdr->blhdr_size);
+    printf("  jhdr size: %d\n", jnl->jhdr->jhdr_size);
+    printf("  chksum: 0x%.8x\n", jnl->jhdr->checksum);
+    printf("  completed transactions:\n");
+    for(done=jnl->completed_trs; done != NULL; done=done->next) {
+               printf("    0x%.8llx - 0x%.8llx\n", done->journal_start, done->journal_end);
+    }
+}
+
+
+
+// Free bytes in the journal as computed from jhdr->start, jhdr->end, jhdr->size and jhdr->jhdr_size.
+static off_t
+free_space(journal *jnl)
+{
+    journal_header *hdr = jnl->jhdr;
+
+    if (hdr->start == hdr->end) {
+               // journal is completely empty
+               return hdr->size - hdr->jhdr_size;
+    }
+    if (hdr->start > hdr->end) {
+               return hdr->start - hdr->end;
+    }
+    // start < end: take the span between them and the header block off the total
+    return hdr->size - (hdr->end - hdr->start) - hdr->jhdr_size;
+}
+
+
+//
+// Wait until the journal has more than "desired_size" bytes free.
+//
+// The journal must be locked on entry to this function.
+// The "desired_size" is in bytes.
+//
+// Space is reclaimed lazily: entries in jnl->old_start[] whose high
+// bit is clear are consumed in order to advance jnl->jhdr->start.
+// Entries with the high bit still set are waited on (calling the
+// file system flush callback, if one was supplied, and sleeping
+// between attempts).
+//
+// Returns 0 once enough space is available.  The function panics if
+// it loops 5000 times without success; the ENOSPC return below is
+// only reachable if panic() were to return.
+//
+static int
+check_free_space(journal *jnl, int desired_size)
+{
+    int    i, counter=0;
+
+    //printf("jnl: check free space (desired 0x%x, avail 0x%Lx)\n",
+//        desired_size, free_space(jnl));
+    
+    while (1) {
+               if (counter++ == 5000) {
+                       dump_journal(jnl);
+                       panic("jnl: check_free_space: buffer flushing isn't working "
+                                 "(jnl @ 0x%x s %lld e %lld f %lld [active start %lld]).\n", jnl,
+                                 jnl->jhdr->start, jnl->jhdr->end, free_space(jnl), jnl->active_start);
+               }
+               if (counter > 7500) {
+                       printf("jnl: check_free_space: giving up waiting for free space.\n");
+                       return ENOSPC;
+               }
+
+               // make sure there's space in the journal to hold this transaction
+               if (free_space(jnl) > desired_size) {
+                       break;
+               }
+
+               //
+               // here's where we lazily bump up jnl->jhdr->start.  we'll consume
+               // entries until there is enough space for the next transaction.
+               //
+               simple_lock(&jnl->old_start_lock);
+               for(i=0; i < sizeof(jnl->old_start)/sizeof(jnl->old_start[0]); i++) {
+                       // per-entry retry count (kept separate from the outer
+                       // loop counter so the two are never confused)
+                       int   spins;
+
+                       spins = 0;
+                       while (jnl->old_start[i] & 0x8000000000000000LL) {
+                               if (spins++ > 100) {
+                                       panic("jnl: check_free_space: tr starting @ 0x%llx not flushing (jnl 0x%x).\n",
+                                                 jnl->old_start[i], jnl);
+                               }
+                               
+                               // never call out or sleep with the simple lock held
+                               simple_unlock(&jnl->old_start_lock);
+                               if (jnl->flush) {
+                                       jnl->flush(jnl->flush_arg);
+                               }
+                               tsleep((caddr_t)jnl, PRIBIO, "check_free_space1", 1);
+                               simple_lock(&jnl->old_start_lock);
+                       }
+
+                       if (jnl->old_start[i] == 0) {
+                               continue;
+                       }
+
+                       jnl->jhdr->start  = jnl->old_start[i];
+                       jnl->old_start[i] = 0;
+                       if (free_space(jnl) > desired_size) {
+                               // writing the header does I/O, so drop the simple
+                               // lock around it just like the flush/tsleep path
+                               // above does.  it is re-taken so that the unlock
+                               // after the loop stays balanced.
+                               simple_unlock(&jnl->old_start_lock);
+                               write_journal_header(jnl);
+                               simple_lock(&jnl->old_start_lock);
+                               break;
+                       }
+               }
+               simple_unlock(&jnl->old_start_lock);
+               
+               // if we bumped the start, loop and try again
+               if (i < sizeof(jnl->old_start)/sizeof(jnl->old_start[0])) {
+                       continue;
+               }
+
+
+               // if the file system gave us a flush function, call it to so that
+               // it can flush some blocks which hopefully will cause some transactions
+               // to complete and thus free up space in the journal.
+               if (jnl->flush) {
+                       jnl->flush(jnl->flush_arg);
+               }
+       
+               // wait for a while to avoid being cpu-bound (this will
+               // put us to sleep for 10 milliseconds)
+               tsleep((caddr_t)jnl, PRIBIO, "check_free_space2", 1);
+    }
+
+    return 0;
+}
+
+//
+// Begin a transaction on "jnl" for the calling activation.
+//
+// If the caller already owns the journal this only bumps the nesting
+// count.  Otherwise it waits on jnl->jsem, takes ownership, makes sure
+// more than jnl->tbuffer_size bytes are free in the journal and then
+// either re-uses the buffered transaction (jnl->cur_tr) or allocates a
+// new one containing a single block_list_header.
+//
+// Returns 0 on success, EINVAL if the journal is flagged invalid or
+// the semaphore wait fails, ENOSPC if check_free_space() fails and
+// ENOMEM if either kmem_alloc() fails.  On the ENOSPC/ENOMEM paths
+// ownership is cleared and the semaphore is signalled again.
+//
+int
+journal_start_transaction(journal *jnl)
+{
+    int ret;
+    transaction *tr;
+
+    CHECK_JOURNAL(jnl);
+    
+    if (jnl->flags & JOURNAL_INVALID) {
+               return EINVAL;
+    }
+
+    // nested start by the current owner: just count it
+    if (jnl->owner == current_act()) {
+               if (jnl->active_tr == NULL) {
+                       panic("jnl: start_tr: active_tr is NULL (jnl @ 0x%x, owner 0x%x, current_act 0x%x\n",
+                                 jnl, jnl->owner, current_act());
+               }
+               jnl->nested_count++;
+               return 0;
+    }
+
+    while ((ret = semaphore_wait(jnl->jsem)) == KERN_ABORTED) {
+               // just keep looping if we've been ^C'ed
+    }
+    if (ret != 0) {
+               printf("jnl: start_tr: sem wait failed.\n");
+               return EINVAL;
+    }
+
+    // we hold the semaphore now, so nobody else may be mid-transaction
+    if (jnl->owner != NULL || jnl->nested_count != 0 || jnl->active_tr != NULL) {
+               panic("jnl: start_tr: owner 0x%x, nested count 0x%x, active_tr 0x%x jnl @ 0x%x\n",
+                         jnl->owner, jnl->nested_count, jnl->active_tr, jnl);
+    }
+
+    jnl->owner        = current_act();
+    jnl->nested_count = 1;
+
+    free_old_stuff(jnl);
+
+    // make sure there's room in the journal
+    if (check_free_space(jnl, jnl->tbuffer_size) != 0) {
+               printf("jnl: start transaction failed: no space\n");
+               ret = ENOSPC;
+               goto bad_start;
+    }
+
+    // if there's a buffered transaction, use it.
+    if (jnl->cur_tr) {
+               jnl->active_tr = jnl->cur_tr;
+               jnl->cur_tr    = NULL;
+
+               return 0;
+    }
+
+    // no buffered transaction: build a fresh one
+    if (kmem_alloc(kernel_map, (vm_offset_t *)&tr, sizeof(transaction))) {
+               printf("jnl: start transaction failed: no mem\n");
+               ret = ENOMEM;
+               goto bad_start;
+    }
+    memset(tr, 0, sizeof(transaction));
+
+    tr->tbuffer_size = jnl->tbuffer_size;
+    if (kmem_alloc(kernel_map, (vm_offset_t *)&tr->tbuffer, tr->tbuffer_size)) {
+               // don't leak the transaction struct allocated above
+               kmem_free(kernel_map, (vm_offset_t)tr, sizeof(transaction));
+               printf("jnl: start transaction failed: no tbuffer mem\n");
+               ret = ENOMEM;
+               goto bad_start;
+    }
+
+    // journal replay code checksum check depends on this.
+    memset(tr->tbuffer, 0, BLHDR_CHECKSUM_SIZE);
+
+    // the first block_list_header lives at the front of the tbuffer
+    tr->blhdr = (block_list_header *)tr->tbuffer;
+    tr->blhdr->max_blocks = (jnl->jhdr->blhdr_size / sizeof(block_info)) - 1;
+    tr->blhdr->num_blocks = 1;      // accounts for this header block
+    tr->blhdr->bytes_used = jnl->jhdr->blhdr_size;
+
+    tr->num_blhdrs  = 1;
+    tr->total_bytes = jnl->jhdr->blhdr_size;
+    tr->jnl         = jnl;
+
+    jnl->active_tr    = tr;
+
+    // printf("jnl: start_tr: owner 0x%x new tr @ 0x%x\n", jnl->owner, tr);
+
+    return 0;
+
+    // failure after the semaphore was taken: give up ownership and
+    // release the semaphore before returning the error in "ret"
+  bad_start:
+       jnl->owner        = NULL;
+       jnl->nested_count = 0;
+       semaphore_signal(jnl->jsem);
+       return ret;
+}
+
+
+//
+// Announce that the caller is about to modify the meta-data buffer
+// "bp" as part of the journal's active transaction.
+//
+// Panics if bp is not a B_META buffer, if the caller does not own the
+// journal, if the buffer size is not a multiple of jhdr_size or if
+// adding the buffer would make the transaction as large as the
+// journal.  A delayed-write buffer that is not yet locked is written
+// out first (with B_NORELSE set); the buffer is then marked B_LOCKED.
+//
+// Returns 0 on success or EINVAL if the journal is flagged invalid.
+//
+int
+journal_modify_block_start(journal *jnl, struct buf *bp)
+{
+    transaction *tr;
+
+    CHECK_JOURNAL(jnl);
+
+    if (jnl->flags & JOURNAL_INVALID) {
+        return EINVAL;
+    }
+
+    // XXXdbg - for debugging I want this to be true.  later it may
+    //          not be necessary.
+    if (!(bp->b_flags & B_META)) {
+        panic("jnl: modify_block_start: bp @ 0x%x is not a meta-data block! (jnl 0x%x)\n", bp, jnl);
+    }
+
+    tr = jnl->active_tr;
+    CHECK_TRANSACTION(tr);
+
+    if (current_act() != jnl->owner) {
+        panic("jnl: modify_block_start: called w/out a transaction! jnl 0x%x, owner 0x%x, curact 0x%x\n",
+              jnl, jnl->owner, current_act());
+    }
+
+    free_old_stuff(jnl);
+
+    // buffers must be an even multiple of the underlying block size
+    if ((bp->b_bufsize % jnl->jhdr->jhdr_size) != 0) {
+        panic("jnl: mod block start: bufsize %d not a multiple of block size %d\n",
+              bp->b_bufsize, jnl->jhdr->jhdr_size);
+        return -1;
+    }
+
+    // a single transaction may never be as big as the whole journal
+    if (tr->total_bytes+bp->b_bufsize >= (jnl->jhdr->size - jnl->jhdr->jhdr_size)) {
+        panic("jnl: transaction too big (%d >= %lld bytes, bufsize %d, tr 0x%x bp 0x%x)\n",
+              tr->total_bytes, (tr->jnl->jhdr->size - jnl->jhdr->jhdr_size), bp->b_bufsize, tr, bp);
+        return -1;
+    }
+
+    // a dirty block that isn't locked yet holds data that
+    // (presumably) belongs to another transaction, so push it
+    // out before we start changing it.
+    if (bp->b_flags & B_DELWRI) {
+        if (!(bp->b_flags & B_LOCKED)) {
+            // this will cause it to not be brelse()'d
+            bp->b_flags |= B_NORELSE;
+            VOP_BWRITE(bp);
+        }
+    }
+
+    bp->b_flags |= B_LOCKED;
+
+    return 0;
+}
+
+//
+// Back out of a journal_modify_block_start() on "bp" and release the
+// buffer.
+//
+// With no active transaction the buffer is simply brelse()'d.  If the
+// buffer is not recorded in any block_list_header of the active
+// transaction its B_LOCKED bit is cleared before it is released;
+// otherwise it stays locked.
+//
+// Returns 0 on success or EINVAL (without releasing bp) when there is
+// an active transaction and the journal is flagged invalid.
+//
+int
+journal_modify_block_abort(journal *jnl, struct buf *bp)
+{
+    transaction       *tr;
+    block_list_header *blhdr;
+    int                i;
+    int                found = 0;
+
+    CHECK_JOURNAL(jnl);
+
+    tr = jnl->active_tr;
+
+    // no active transaction: this is just a block that happened
+    // to be modified as part of another tr, so release it and go.
+    if (tr == NULL) {
+        brelse(bp);
+        return 0;
+    }
+
+    if (jnl->flags & JOURNAL_INVALID) {
+        return EINVAL;
+    }
+
+    CHECK_TRANSACTION(tr);
+
+    if (current_act() != jnl->owner) {
+        panic("jnl: modify_block_abort: called w/out a transaction! jnl 0x%x, owner 0x%x, curact 0x%x\n",
+              jnl, jnl->owner, current_act());
+    }
+
+    free_old_stuff(jnl);
+
+    // look for bp in every block list of this transaction
+    blhdr = tr->blhdr;
+    while (blhdr != NULL && !found) {
+        for (i = 1; i < blhdr->num_blocks; i++) {
+            if (blhdr->binfo[i].bp != bp) {
+                continue;
+            }
+
+            if (bp->b_bufsize != blhdr->binfo[i].bsize) {
+                panic("jnl: bp @ 0x%x changed size on me! (%d vs. %d, jnl 0x%x)\n",
+                      bp, bp->b_bufsize, blhdr->binfo[i].bsize, jnl);
+            }
+            found = 1;
+            break;
+        }
+
+        if (!found) {
+            // the next header is chained through binfo[0].bnum
+            blhdr = (block_list_header *)((long)blhdr->binfo[0].bnum);
+        }
+    }
+
+    // not found means only modify_block_start was called on this
+    // block in the current transaction, so it hasn't really been
+    // modified and the LOCKED bit can go.  if it was found then
+    // modify_block_end was called and it must stay locked in memory.
+    if (!found) {
+        bp->b_flags &= ~(B_LOCKED);
+    }
+
+    brelse(bp);
+    return 0;
+}
+
+
+//
+// Record the (now modified) contents of "bp" in the active
+// transaction's in-memory transaction buffer and mark the buffer
+// as a delayed write via bdwrite().
+//
+// If bp is already listed in one of the transaction's
+// block_list_headers its saved copy is overwritten in place.
+// Otherwise it is appended to the last block_list_header, or to a
+// newly allocated one when the last one has no room.
+//
+// Returns 0 on success or EINVAL if the journal is flagged invalid.
+// Panics if the caller does not own the journal, if bp is not
+// B_LOCKED, or if a new block list cannot be allocated.
+//
+int
+journal_modify_block_end(journal *jnl, struct buf *bp)
+{
+    int                i, j, tbuffer_offset;
+    char              *blkptr;
+    block_list_header *blhdr, *prev=NULL;
+    transaction       *tr;
+
+    CHECK_JOURNAL(jnl);
+
+    if (jnl->flags & JOURNAL_INVALID) {
+               return EINVAL;
+    }
+
+    tr = jnl->active_tr;
+    CHECK_TRANSACTION(tr);
+
+    if (jnl->owner != current_act()) {
+               panic("jnl: modify_block_end: called w/out a transaction! jnl 0x%x, owner 0x%x, curact 0x%x\n",
+                         jnl, jnl->owner, current_act());
+    }
+
+    free_old_stuff(jnl);
+
+    //printf("jnl: mod block end:  (bp 0x%x vp 0x%x l/blkno %d/%d bsz %d, total bytes %d)\n", 
+    //   bp, bp->b_vp, bp->b_lblkno, bp->b_blkno, bp->b_bufsize, tr->total_bytes);
+
+    if ((bp->b_flags & B_LOCKED) == 0) {
+               panic("jnl: modify_block_end: bp 0x%x not locked! jnl @ 0x%x\n", bp, jnl);
+               // NOTE(review): only reached if panic() returns
+               bp->b_flags |= B_LOCKED;
+    }
+        
+    // first check if it's already part of this transaction
+    //
+    // "prev" trails "blhdr" so that it ends up pointing at the last
+    // block list when the search falls off the end.  "i" and
+    // "tbuffer_offset" are deliberately left holding the values from
+    // the last list scanned: the matching slot and its data offset
+    // when bp was found, or the first unused slot and the offset just
+    // past the last block's data when it was not.
+    for(blhdr=tr->blhdr; blhdr; prev=blhdr,blhdr=(block_list_header *)((long)blhdr->binfo[0].bnum)) {
+               // block data starts right after the header area
+               tbuffer_offset = jnl->jhdr->blhdr_size;
+
+               for(i=1; i < blhdr->num_blocks; i++) {
+                       if (bp == blhdr->binfo[i].bp) {
+                               if (bp->b_bufsize != blhdr->binfo[i].bsize) {
+                                       panic("jnl: bp @ 0x%x changed size on me! (%d vs. %d, jnl 0x%x)\n",
+                                                 bp, bp->b_bufsize, blhdr->binfo[i].bsize, jnl);
+                               }
+                               break;
+                       }
+                       tbuffer_offset += blhdr->binfo[i].bsize;
+               }
+
+               if (i < blhdr->num_blocks) {
+                       break;
+               }
+    }
+
+    // not found, but the last block list still has a free slot and
+    // enough bytes left: append to it (i/tbuffer_offset are already
+    // positioned at its end by the loop above)
+    if (blhdr == NULL
+               && prev
+               && (prev->num_blocks+1) <= prev->max_blocks
+               && (prev->bytes_used+bp->b_bufsize) <= tr->tbuffer_size) {
+               blhdr = prev;
+    } else if (blhdr == NULL) {
+               block_list_header *nblhdr;
+
+               if (prev == NULL) {
+                       panic("jnl: modify block end: no way man, prev == NULL?!?, jnl 0x%x, bp 0x%x\n", jnl, bp);
+               }
+
+               // we got to the end of the list, didn't find the block and there's
+               // no room in the block_list_header pointed to by prev
+       
+               // we allocate another tbuffer and link it in at the end of the list
+               // through prev->binfo[0].bnum.  that's a skanky way to do things but
+               // avoids having yet another linked list of small data structures to manage.
+
+               if (kmem_alloc(kernel_map, (vm_offset_t *)&nblhdr, tr->tbuffer_size)) {
+                       panic("jnl: end_tr: no space for new block tr @ 0x%x (total bytes: %d)!\n",
+                                 tr, tr->total_bytes);
+               }
+
+               // journal replay code checksum check depends on this.
+               memset(nblhdr, 0, BLHDR_CHECKSUM_SIZE);
+
+               // initialize the new guy
+               nblhdr->max_blocks = (jnl->jhdr->blhdr_size / sizeof(block_info)) - 1;
+               nblhdr->num_blocks = 1;      // accounts for this header block
+               nblhdr->bytes_used = jnl->jhdr->blhdr_size;
+           
+               tr->num_blhdrs++;
+               tr->total_bytes += jnl->jhdr->blhdr_size;
+
+               // then link him in at the end
+               prev->binfo[0].bnum = (off_t)((long)nblhdr);
+
+               // and finally switch to using the new guy
+               blhdr          = nblhdr;
+               tbuffer_offset = jnl->jhdr->blhdr_size;
+               i              = 1;
+    }
+
+
+    // sanity check the slot index against the capacity of the list
+    if ((i+1) > blhdr->max_blocks) {
+               panic("jnl: modify_block_end: i = %d, max_blocks %d\n", i, blhdr->max_blocks);
+    }
+
+    // copy the data into the in-memory transaction buffer
+    blkptr = (char *)&((char *)blhdr)[tbuffer_offset];
+    memcpy(blkptr, bp->b_data, bp->b_bufsize);
+
+    // if this is true then this is a new block we haven't seen
+    if (i >= blhdr->num_blocks) {
+               // take a reference on the vnode (result is not checked);
+               // the matching vrele() is done when the block is killed
+               // or the transaction is ended
+               vget(bp->b_vp, 0, current_proc());
+
+               blhdr->binfo[i].bnum  = bp->b_blkno;
+               blhdr->binfo[i].bsize = bp->b_bufsize;
+               blhdr->binfo[i].bp    = bp;
+
+               blhdr->bytes_used += bp->b_bufsize;
+               tr->total_bytes   += bp->b_bufsize;
+
+               blhdr->num_blocks++;
+    }
+
+    bdwrite(bp);
+
+    return 0;
+}
+
+//
+// Remove "bp" from the active transaction so that it is neither
+// written to the journal's block list as a live block nor flushed as
+// part of this transaction.
+//
+// If bp is found in one of the transaction's block lists its B_LOCKED
+// bit is cleared, the vnode reference taken in
+// journal_modify_block_end() is dropped, its size is added to
+// tr->num_killed, it is brelse()'d when B_BUSY is set and its slot is
+// marked dead (bp NULL, bnum -1).  If bp is not part of the
+// transaction nothing is changed.
+//
+// Returns 0, or EINVAL if the journal is flagged invalid.  Panics if
+// the caller does not own the journal or bp is not B_LOCKED.
+//
+int
+journal_kill_block(journal *jnl, struct buf *bp)
+{
+    int                i;
+    block_list_header *blhdr;
+    transaction       *tr;
+
+    CHECK_JOURNAL(jnl);
+
+    if (jnl->flags & JOURNAL_INVALID) {
+               return EINVAL;
+    }
+
+    tr = jnl->active_tr;
+    CHECK_TRANSACTION(tr);
+
+    if (jnl->owner != current_act()) {
+               // (message used to name modify_block_end, which sent
+               // whoever read the panic log to the wrong function)
+               panic("jnl: kill block: called w/out a transaction! jnl 0x%x, owner 0x%x, curact 0x%x\n",
+                         jnl, jnl->owner, current_act());
+    }
+
+    free_old_stuff(jnl);
+
+    if ((bp->b_flags & B_LOCKED) == 0) {
+               panic("jnl: kill block: bp 0x%x not locked! jnl @ 0x%x\n", bp, jnl);
+    }
+
+    // first check if it's already part of this transaction
+    // (block lists are chained through binfo[0].bnum)
+    for(blhdr=tr->blhdr; blhdr; blhdr=(block_list_header *)((long)blhdr->binfo[0].bnum)) {
+
+               for(i=1; i < blhdr->num_blocks; i++) {
+                       if (bp == blhdr->binfo[i].bp) {
+                               bp->b_flags &= ~B_LOCKED;
+
+                               // this undoes the vget() in journal_modify_block_end()
+                               vrele(bp->b_vp);
+
+                               // if the block has the DELWRI and CALL bits sets, then
+                               // things are seriously weird.  if it was part of another
+                               // transaction then journal_modify_block_start() should
+                               // have force it to be written.
+                               //
+                               if ((bp->b_flags & B_DELWRI) && (bp->b_flags & B_CALL)) {
+                                       panic("jnl: kill block: this defies all logic! bp 0x%x\n", bp);
+                               } else {
+                                       tr->num_killed += bp->b_bufsize;
+                               }
+
+                               if (bp->b_flags & B_BUSY) {
+                                       brelse(bp);
+                               }
+
+                               // mark the slot dead; end_transaction() expects a
+                               // NULL bp to be paired with a bnum of -1
+                               blhdr->binfo[i].bp   = NULL;
+                               blhdr->binfo[i].bnum = (off_t)-1;
+                               break;
+                       }
+               }
+
+               // stop walking the chain once the block has been handled
+               if (i < blhdr->num_blocks) {
+                       break;
+               }
+    }
+
+    return 0;
+}
+
+
+//
+// qsort() comparison function for block_info entries.
+//
+// Entries are ordered by ascending bp->b_blkno.  Entries whose bp is
+// NULL (killed blocks) sort after all live entries and compare equal
+// to each other, so the ordering is consistent no matter which way
+// round the sort hands us a pair.
+//
+// Returns <0, 0 or >0 when a sorts before, equal to or after b.
+//
+static int
+journal_binfo_cmp(void *a, void *b)
+{
+    block_info *bi_a = (struct block_info *)a,
+ *bi_b = (struct block_info *)b;
+
+    if (bi_a->bp == NULL) {
+               // two dead entries are equal; a dead one follows a live one
+               return (bi_b->bp == NULL) ? 0 : 1;
+    }
+    if (bi_b->bp == NULL) {
+               return -1;
+    }
+
+    // compare explicitly rather than narrowing a subtraction to int,
+    // so the sign of the result never depends on the width of the
+    // block number type.
+    if (bi_a->bp->b_blkno < bi_b->bp->b_blkno) {
+               return -1;
+    }
+    if (bi_a->bp->b_blkno > bi_b->bp->b_blkno) {
+               return 1;
+    }
+
+    return 0;
+}
+
+
+//
+// Finish the transaction "tr".
+//
+// An empty transaction, or (when "force_it" is zero and group commit
+// is allowed) one that is still small, is simply parked in
+// jnl->cur_tr so that it can be re-used by the next
+// journal_start_transaction().  Otherwise every block_list_header of
+// the transaction is checksummed and written to the journal, the
+// journal header is updated, and each live buffer is unlocked and
+// written asynchronously with buffer_flushed_callback as its iodone
+// routine.  The block lists are freed as they are processed.
+//
+// Returns 0 on success (including the parked cases).  On failure the
+// journal is flagged JOURNAL_INVALID, the transaction is aborted and
+// -1 is returned.
+//
+static int
+end_transaction(transaction *tr, int force_it)
+{
+    int                 i, j, ret, amt;
+    off_t               end;
+    journal            *jnl = tr->jnl;
+    struct buf         *bp;
+    block_list_header  *blhdr=NULL, *next=NULL;
+
+       if (jnl->cur_tr) {
+               panic("jnl: jnl @ 0x%x already has cur_tr 0x%x, new tr: 0x%x\n",
+                         jnl, jnl->cur_tr, tr);
+       }
+
+    // if there weren't any modified blocks in the transaction
+    // just save off the transaction pointer and return.
+    if (tr->total_bytes == jnl->jhdr->blhdr_size) {
+               jnl->cur_tr = tr;
+               // callers hand our result back to the file system, so
+               // always return a defined value
+               return 0;
+    }
+
+    // if our transaction buffer isn't very full, just hang
+    // on to it and don't actually flush anything.  this is
+    // what is known as "group commit".  we will flush the
+    // transaction buffer if it's full or if we have more than
+    // one of them so we don't start hogging too much memory.
+    //
+    if (   force_it == 0
+                  && (jnl->flags & JOURNAL_NO_GROUP_COMMIT) == 0 
+                  && tr->num_blhdrs < 3
+                  && (tr->total_bytes <= ((tr->tbuffer_size*tr->num_blhdrs) - tr->tbuffer_size/8))) {
+
+               jnl->cur_tr = tr;
+               return 0;
+    }
+
+
+    // if we're here we're going to flush the transaction buffer to disk.
+    // make sure there is room in the journal first.
+    // NOTE(review): the result of check_free_space() is not examined here.
+    check_free_space(jnl, tr->total_bytes);
+
+    // range check the end index
+    if (jnl->jhdr->end <= 0 || jnl->jhdr->end > jnl->jhdr->size) {
+               panic("jnl: end_transaction: end is bogus 0x%llx (sz 0x%llx)\n",
+                         jnl->jhdr->end, jnl->jhdr->size);
+    }
+
+    // this transaction starts where the current journal ends
+    tr->journal_start = jnl->jhdr->end;
+    end               = jnl->jhdr->end;
+
+       //
+       // if the first entry in old_start[] isn't free yet, loop calling the
+       // file system flush routine until it is (or we panic).
+       //
+       i = 0;
+       simple_lock(&jnl->old_start_lock);
+       while ((jnl->old_start[0] & 0x8000000000000000LL) != 0) {
+               if (jnl->flush) {
+                       simple_unlock(&jnl->old_start_lock);
+
+                       if (jnl->flush) {
+                               jnl->flush(jnl->flush_arg);
+                       }
+
+                       // yield the cpu so others can get in to clear the lock bit
+                       (void)tsleep((void *)jnl, PRIBIO, "jnl-old-start-sleep", 1);
+
+                       simple_lock(&jnl->old_start_lock);
+               }
+               if (i++ >= 100) {
+                       panic("jnl: transaction that started at 0x%llx is not completing! jnl 0x%x\n",
+                                 jnl->old_start[0] & (~0x8000000000000000LL), jnl);
+               }
+       }
+
+       //
+       // slide everyone else down and put our latest guy in the last
+       // entry in the old_start array.  source and destination overlap,
+       // so copy element by element from the front instead of memcpy().
+       //
+       for(j=0; j < sizeof(jnl->old_start)/sizeof(jnl->old_start[0]) - 1; j++) {
+               jnl->old_start[j] = jnl->old_start[j+1];
+       }
+       // the high bit marks the entry as "not flushed yet"
+       jnl->old_start[sizeof(jnl->old_start)/sizeof(jnl->old_start[0]) - 1] = tr->journal_start | 0x8000000000000000LL;
+
+       simple_unlock(&jnl->old_start_lock);
+
+
+    // for each block, make sure that the physical block # is set
+    for(blhdr=tr->blhdr; blhdr; blhdr=next) {
+
+               for(i=1; i < blhdr->num_blocks; i++) {
+           
+                       bp = blhdr->binfo[i].bp;
+                       if (bp == NULL) {   // only true if a block was "killed" 
+                               if (blhdr->binfo[i].bnum != (off_t)-1) {
+                                       panic("jnl: inconsistent binfo (NULL bp w/bnum %lld; jnl @ 0x%x, tr 0x%x)\n",
+                                                 blhdr->binfo[i].bnum, jnl, tr);
+                               }
+                               continue;
+                       }
+
+                       if (bp->b_vp == NULL && bp->b_lblkno == bp->b_blkno) {
+                               panic("jnl: end_tr: DANGER! bp @ 0x%x w/null vp and l/blkno = %d/%d\n",
+                                         bp, bp->b_lblkno, bp->b_blkno);
+                       }
+           
+                       // if the lblkno is the same as blkno and this bp isn't
+                       // associated with the underlying file system device then
+                       // we need to call bmap() to get the actual physical block.
+                       //
+                       if ((bp->b_lblkno == bp->b_blkno) && (bp->b_vp != jnl->fsdev)) {
+                               if (VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL) != 0) {
+                                       printf("jnl: end_tr: can't bmap the bp @ 0x%x, jnl 0x%x\n", bp, jnl);
+                                       goto bad_journal;
+                               }
+                       }
+           
+                       // update this so we write out the correct physical block number!
+                       blhdr->binfo[i].bnum = bp->b_blkno;
+               }
+
+               next = (block_list_header *)((long)blhdr->binfo[0].bnum);
+    }
+    
+    // checksum each block list and write it (header + data) to the journal
+    for(blhdr=tr->blhdr; blhdr; blhdr=(block_list_header *)((long)blhdr->binfo[0].bnum)) {
+
+               amt = blhdr->bytes_used;
+
+               // the checksum is computed with the checksum field zeroed
+               blhdr->checksum = 0;
+               blhdr->checksum = calc_checksum((char *)blhdr, BLHDR_CHECKSUM_SIZE);
+       
+               ret = write_journal_data(jnl, &end, blhdr, amt);
+               if (ret != amt) {
+                       printf("jnl: end_transaction: only wrote %d of %d bytes to the journal!\n",
+                                  ret, amt);
+
+                       goto bad_journal;
+               }
+    }
+
+    jnl->jhdr->end  = end;    // update where the journal now ends
+    tr->journal_end = end;    // the transaction ends here too
+    if (tr->journal_start == 0 || tr->journal_end == 0) {
+               panic("jnl: end_transaction: bad tr journal start/end: 0x%llx 0x%llx\n",
+                         tr->journal_start, tr->journal_end);
+    }
+
+    if (write_journal_header(jnl) != 0) {
+               goto bad_journal;
+    }
+
+    //
+    // setup for looping through all the blhdr's.  we null out the
+    // tbuffer and blhdr fields so that they're not used any more.
+    //
+    blhdr       = tr->blhdr;
+    tr->tbuffer = NULL;
+    tr->blhdr   = NULL;
+
+    // the buffer_flushed_callback will only be called for the 
+    // real blocks that get flushed so we have to account for 
+    // the block_list_headers here.
+    //
+    tr->num_flushed = tr->num_blhdrs * jnl->jhdr->blhdr_size;
+
+    // for each block, set the iodone callback and unlock it
+    for(; blhdr; blhdr=next) {
+
+               // we can re-order the buf ptrs because everything is written out already
+               qsort(&blhdr->binfo[1], blhdr->num_blocks-1, sizeof(block_info), journal_binfo_cmp);
+
+               for(i=1; i < blhdr->num_blocks; i++) {
+                       if (blhdr->binfo[i].bp == NULL) {
+                               continue;
+                       }
+
+                       // look the buffer up again; we expect to get the very
+                       // same struct buf back
+                       ret = meta_bread(blhdr->binfo[i].bp->b_vp,
+                                                        (daddr_t)blhdr->binfo[i].bp->b_lblkno,
+                                                        blhdr->binfo[i].bp->b_bufsize,
+                                                        NOCRED,
+                                                        &bp);
+                       if (ret == 0 && bp != NULL) {
+                               struct vnode *save_vp;
+               
+                               if (bp != blhdr->binfo[i].bp) {
+                                       panic("jnl: end_tr: got back a different bp! (bp 0x%x should be 0x%x, jnl 0x%x\n",
+                                                 bp, blhdr->binfo[i].bp, jnl);
+                               }
+
+                               if ((bp->b_flags & (B_LOCKED|B_DELWRI)) != (B_LOCKED|B_DELWRI)) {
+                                       if (jnl->flags & JOURNAL_CLOSE_PENDING) {
+                                               brelse(bp);
+                                               continue;
+                                       } else {
+                                               panic("jnl: end_tr: !!!DANGER!!! bp 0x%x flags (0x%x) not LOCKED & DELWRI\n", bp, bp->b_flags);
+                                       }
+                               }
+
+                               if (bp->b_iodone != NULL) {
+                                       panic("jnl: bp @ 0x%x (blkno %d, vp 0x%x) has non-null iodone (0x%x) buffflushcb 0x%x\n",
+                                                 bp, bp->b_blkno, bp->b_vp, bp->b_iodone, buffer_flushed_callback);
+                               }
+
+                               // grab the vnode now; bp is handed to bawrite() below
+                               save_vp = bp->b_vp;
+
+                               bp->b_iodone       = buffer_flushed_callback;
+                               bp->b_transaction  = tr;
+                               bp->b_flags       |= B_CALL;
+                               bp->b_flags       &= ~(B_LOCKED);
+
+                               // kicking off the write here helps performance
+                               bawrite(bp);
+                               // XXXdbg this is good for testing: bdwrite(bp);
+                               //bdwrite(bp);
+                               
+                               // this undoes the vget() in journal_modify_block_end()
+                               vrele(save_vp);
+
+                       } else {
+                               printf("jnl: end_transaction: could not find block %Ld vp 0x%x!\n",
+                                          blhdr->binfo[i].bnum, blhdr->binfo[i].bp);
+                       }
+               }
+
+               next = (block_list_header *)((long)blhdr->binfo[0].bnum);
+
+               // we can free blhdr here since we won't need it any more
+               blhdr->binfo[0].bnum = 0xdeadc0de;
+               kmem_free(kernel_map, (vm_offset_t)blhdr, tr->tbuffer_size);
+    }
+
+    //printf("jnl: end_tr: tr @ 0x%x, jnl-blocks: 0x%llx - 0x%llx. exit!\n",
+    //   tr, tr->journal_start, tr->journal_end);
+    return 0;
+
+
+    // something went wrong while writing: poison the journal and
+    // throw the transaction away
+  bad_journal:
+    jnl->flags |= JOURNAL_INVALID;
+    abort_transaction(jnl, tr);
+    return -1;
+}
+
+//
+// Throw away the transaction "tr" without writing it anywhere.
+//
+// Every live buffer recorded in the transaction's block lists is
+// looked up again, has its B_LOCKED and B_DELWRI bits cleared, is
+// marked B_INVAL and released.  Each block list is freed once its
+// buffers have been handled, and finally the transaction structure
+// itself is freed.
+//
+static void
+abort_transaction(journal *jnl, transaction *tr)
+{
+    int                idx, err;
+    block_list_header *cur, *following;
+    struct buf        *bp;
+
+    cur = tr->blhdr;
+    while (cur) {
+
+        for (idx = 1; idx < cur->num_blocks; idx++) {
+            // killed blocks have no buffer to release
+            if (cur->binfo[idx].bp == NULL) {
+                continue;
+            }
+
+            err = meta_bread(cur->binfo[idx].bp->b_vp,
+                             (daddr_t)cur->binfo[idx].bp->b_lblkno,
+                             cur->binfo[idx].bp->b_bufsize,
+                             NOCRED,
+                             &bp);
+            if (err != 0 || bp == NULL) {
+                printf("jnl: abort_tr: could not find block %Ld vp 0x%x!\n",
+                       cur->binfo[idx].bnum, cur->binfo[idx].bp);
+                continue;
+            }
+
+            if (bp != cur->binfo[idx].bp) {
+                panic("jnl: abort_tr: got back a different bp! (bp 0x%x should be 0x%x, jnl 0x%x\n",
+                      bp, cur->binfo[idx].bp, jnl);
+            }
+
+            // these blocks must never reach the disk: drop the
+            // locked and delayed-write bits and invalidate them.
+            bp->b_flags &= ~(B_LOCKED|B_DELWRI);
+            bp->b_flags |= B_INVAL;
+
+            brelse(bp);
+        }
+
+        // pick up the chain pointer before the list is freed
+        following = (block_list_header *)((long)cur->binfo[0].bnum);
+
+        // this block list is finished; poison the link and free it
+        cur->binfo[0].bnum = 0xdeadc0de;
+        kmem_free(kernel_map, (vm_offset_t)cur, tr->tbuffer_size);
+
+        cur = following;
+    }
+
+    tr->tbuffer     = NULL;
+    tr->blhdr       = NULL;
+    tr->total_bytes = 0xdbadc0de;
+    kmem_free(kernel_map, (vm_offset_t)tr, sizeof(transaction));
+}
+
+
+//
+// End the calling activation's transaction on "jnl".
+//
+// Nested calls only decrement the nesting count.  When the outermost
+// level is reached the active transaction is handed to
+// end_transaction() (or aborted if the journal has been flagged
+// invalid), ownership is cleared and the journal semaphore is
+// signalled.
+//
+// Returns 0 for a nested end or when the journal is invalid and has
+// no owner, EINVAL when the journal is invalid at the outermost
+// level, and otherwise the result of end_transaction().  Panics if
+// the caller is not the owner.
+//
+int
+journal_end_transaction(journal *jnl)
+{
+    int ret;
+    transaction *tr;
+
+    CHECK_JOURNAL(jnl);
+
+    if ((jnl->flags & JOURNAL_INVALID) && jnl->owner == NULL) {
+        return 0;
+    }
+
+    if (current_act() != jnl->owner) {
+        panic("jnl: end_tr: I'm not the owner! jnl 0x%x, owner 0x%x, curact 0x%x\n",
+              jnl, jnl->owner, current_act());
+    }
+
+    free_old_stuff(jnl);
+
+    // leave one nesting level; only the outermost end does real work
+    if (--jnl->nested_count > 0) {
+        return 0;
+    }
+    if (jnl->nested_count < 0) {
+        panic("jnl: jnl @ 0x%x has negative nested count (%d). bad boy.\n", jnl, jnl->nested_count);
+    }
+
+    if (jnl->flags & JOURNAL_INVALID) {
+        // the journal went bad: discard whatever was in flight
+        if (jnl->active_tr) {
+            if (jnl->cur_tr != NULL) {
+                panic("jnl: journal @ 0x%x has active tr (0x%x) and cur tr (0x%x)\n",
+                      jnl, jnl->active_tr, jnl->cur_tr);
+            }
+
+            tr             = jnl->active_tr;
+            jnl->active_tr = NULL;
+            abort_transaction(jnl, tr);
+        }
+
+        jnl->owner = NULL;
+        semaphore_signal(jnl->jsem);
+
+        return EINVAL;
+    }
+
+    tr = jnl->active_tr;
+    CHECK_TRANSACTION(tr);
+
+    // active_tr is cleared before end_transaction() runs because
+    // end_transaction() calls check_free_space(), which may call
+    // the FS flush function; if the FS were to call journal_flush()
+    // from there we must not panic.
+    jnl->active_tr = NULL;
+    ret = end_transaction(tr, 0);
+
+    jnl->owner = NULL;
+    semaphore_signal(jnl->jsem);
+
+    return ret;
+}
+
+
+/*
+ * Push out the buffered (cur_tr) transaction, if there is one and
+ * no transaction is currently active.
+ *
+ * If the caller does not already own the journal, the journal
+ * semaphore is taken for the duration of the call.
+ *
+ * Returns -1 if the journal is invalid or the semaphore wait fails,
+ * and 0 otherwise.
+ */
+int
+journal_flush(journal *jnl)
+{
+    int took_sem = 0;
+
+    CHECK_JOURNAL(jnl);
+
+    if (jnl->flags & JOURNAL_INVALID) {
+        return -1;
+    }
+
+    if (current_act() != jnl->owner) {
+        int wait_rc;
+
+        // keep retrying for as long as the wait is merely
+        // interrupted (i.e. we've been ^C'ed)
+        do {
+            wait_rc = semaphore_wait(jnl->jsem);
+        } while (wait_rc == KERN_ABORTED);
+
+        if (wait_rc != 0) {
+            printf("jnl: flush: sem wait failed.\n");
+            return -1;
+        }
+        took_sem = 1;
+    }
+
+    free_old_stuff(jnl);
+
+    // with no transaction active, flush whatever has been buffered
+    if (jnl->cur_tr != NULL && jnl->active_tr == NULL) {
+        transaction *pending = jnl->cur_tr;
+
+        jnl->cur_tr = NULL;
+        end_transaction(pending, 1);   // force it to get flushed
+    }
+
+    // only release the semaphore if this call was the one to take it
+    if (took_sem) {
+        semaphore_signal(jnl->jsem);
+    }
+
+    return 0;
+}
+
+/*
+ * Report whether a transaction is active on jnl:
+ * -1 if the journal is marked invalid, 1 if there is an
+ * active transaction, 0 if there is none.
+ */
+int
+journal_active(journal *jnl)
+{
+    int is_invalid = (jnl->flags & JOURNAL_INVALID) != 0;
+
+    if (is_invalid) {
+        return -1;
+    }
+
+    return (jnl->active_tr != NULL);
+}
 
--- /dev/null
+
+/*
+ * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved.
+ *
+ * @APPLE_LICENSE_HEADER_START@
+ * 
+ * The contents of this file constitute Original Code as defined in and
+ * are subject to the Apple Public Source License Version 1.1 (the
+ * "License").  You may not use this file except in compliance with the
+ * License.  Please obtain a copy of the License at
+ * http://www.apple.com/publicsource and read it before using this file.
+ * 
+ * This Original Code and all software distributed under the License are
+ * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
+ * License for the specific language governing rights and limitations
+ * under the License.
+ * 
+ * @APPLE_LICENSE_HEADER_END@
+ */
+/*
+ * This header contains the structures and function prototypes
+ * for the vfs journaling code.  The data types are not meant
+ * to be modified by user code.  Just use the functions and do
+ * not mess around with the structs.
+ */ 
+#ifndef _SYS_VFS_JOURNAL_H_
+#define _SYS_VFS_JOURNAL_H_
+
+#include <sys/appleapiopts.h>
+
+#ifdef __APPLE_API_UNSTABLE
+
+#include <sys/types.h>
+
+// Describes one block tracked by a block_list_header: where it
+// lives on the file system device, how big it is, and its buffer.
+typedef struct block_info {
+    off_t       bnum;                // block # on the file system device
+    size_t      bsize;               // in bytes
+    struct buf *bp;                  // buffer associated with this block
+} block_info;
+
+// Header for one chunk of a transaction buffer.
+//
+// NOTE: in memory, binfo[0].bnum is not a block number; the
+// transaction-abort code casts it to a block_list_header pointer and
+// follows it as the link to the next chunk of the transaction.
+typedef struct block_list_header {
+    u_int16_t   max_blocks;          // max number of blocks in this chunk
+    u_int16_t   num_blocks;          // number of valid entries in binfo[]
+    int32_t     bytes_used;          // how many bytes of this tbuffer are used
+    int32_t     checksum;            // on-disk: checksum of this header and binfo[0]
+    int32_t     pad;                 // pad out to 16 bytes
+    block_info  binfo[1];            // so we can reference them by name
+} block_list_header;
+
+
+struct journal;
+
+// In-memory state for one transaction.
+typedef struct transaction {
+    int                 tbuffer_size;  // in bytes (also the size used when a blhdr chunk is freed)
+    char               *tbuffer;       // memory copy of the transaction
+    block_list_header  *blhdr;         // points to the first byte of tbuffer
+    int                 num_blhdrs;    // how many buffers we've allocated
+    int                 total_bytes;   // total # of bytes in transaction
+    int                 num_flushed;   // how many bytes have been flushed
+    int                 num_killed;    // how many bytes were "killed"
+    off_t               journal_start; // where in the journal this transaction starts
+    off_t               journal_end;   // where in the journal this transaction ends
+    struct journal     *jnl;           // ptr back to the journal structure
+    struct transaction *next;          // list of tr's (either completed or to be free'd)
+} transaction;
+
+
+/*
+ * This is written to block zero of the journal and it
+ * maintains overall state about the journal.
+ */
+typedef struct journal_header {
+    int32_t        magic;         // presumably JOURNAL_HEADER_MAGIC -- TODO confirm against the .c file
+    int32_t        endian;        // presumably ENDIAN_MAGIC -- TODO confirm against the .c file
+    volatile off_t start;         // zero-based byte offset of the start of the first transaction
+    volatile off_t end;           // zero-based byte offset of where free space begins
+    off_t          size;          // size in bytes of the entire journal
+    int32_t        blhdr_size;    // size in bytes of each block_list_header in the journal
+    int32_t        checksum;      // checksum value stored with the header
+    int32_t        jhdr_size;     // block size (in bytes) of the journal header
+} journal_header;
+
+#define JOURNAL_HEADER_MAGIC  0x4a4e4c78   // 'JNLx'
+#define ENDIAN_MAGIC          0x12345678
+
+#define OLD_JOURNAL_HEADER_MAGIC  0x4a484452   // 'JHDR'
+
+
+/*
+ * In memory structure about the journal.
+ */
+typedef struct journal {
+    struct vnode       *jdev;              // vnode of the device where the journal lives
+    off_t               jdev_offset;       // byte offset to the start of the journal
+
+    struct vnode       *fsdev;             // vnode of the file system device
+    
+    void              (*flush)(void *arg); // fs callback to flush meta data blocks
+    void               *flush_arg;         // arg that's passed to flush()
+
+    int32_t             flags;             // JOURNAL_* bits: create/open options in the low 16, internal state in the high 16
+    int32_t             tbuffer_size;      // default transaction buffer size
+
+    char               *header_buf;        // in-memory copy of the journal header
+    journal_header     *jhdr;              // points to the first byte of header_buf
+
+    transaction        *cur_tr;            // for group-commit (journal_flush() forces it out when nothing is active)
+    transaction        *completed_trs;     // out-of-order transactions that completed
+    transaction        *active_tr;         // for nested transactions (cleared before the transaction is ended or aborted)
+    int32_t             nested_count;      // for nested transactions (journal_end_transaction() only commits when this drops to 0)
+    void               *owner;             // a ptr that's unique to the calling process (compared against current_act())
+
+    transaction        *tr_freeme;         // transaction structs that need to be free'd
+
+       volatile off_t      active_start;      // the active start that we only keep in memory
+       simple_lock_data_t  old_start_lock;    // guard access
+       volatile off_t      old_start[16];     // this is how we do lazy start update
+
+    semaphore_t         jsem;              // signalled when the outermost transaction ends; journal_flush() waits on it when the caller is not the owner
+} journal;
+
+/* internal-only journal flags (top 16 bits) */
+#define JOURNAL_CLOSE_PENDING     0x00010000
+#define JOURNAL_INVALID           0x00020000
+
+/* journal_open/create options are always in the low-16 bits */
+#define JOURNAL_OPTION_FLAGS_MASK 0x0000ffff
+
+/*
+ * Prototypes.
+ */
+
+/*
+ * Call journal_create() to create a new journal.  You only
+ * call this once, typically at file system creation time.
+ *
+ * The "jvp" argument is the vnode where the journal is written.
+ * The journal starts at "offset" and is "journal_size" bytes long.
+ *
+ * The "fsvp" argument is the vnode of your file system.  It may be
+ * the same as "jvp".
+ *
+ * The "min_fs_block_size" argument is the minimum block size
+ * (in bytes) that the file system will ever write.  Typically
+ * this is the block size of the file system (1k, 4k, etc) but
+ * on HFS+ it is the minimum block size of the underlying device.
+ *
+ * The flags argument lets you disable group commit if you
+ * want tighter guarantees on transactions (in exchange for
+ * lower performance).
+ *
+ * The tbuffer_size is the size of the transaction buffer
+ * used by the journal. If you specify zero, the journal code
+ * will use a reasonable default.  The tbuffer_size should 
+ * be an integer multiple of the min_fs_block_size.
+ *
+ * Returns a valid journal pointer or NULL if one could not
+ * be created.
+ */
+journal *journal_create(struct vnode *jvp,
+                                               off_t         offset,
+                                               off_t         journal_size,
+                                               struct vnode *fsvp,
+                                               size_t        min_fs_block_size,
+                                               int32_t       flags,
+                                               int32_t       tbuffer_size,
+                                               void        (*flush)(void *arg),
+                                               void         *arg);
+
+/*
+ * Call journal_open() when mounting an existing file system
+ * that has a previously created journal.  It will take care
+ * of validating the journal and replaying it if necessary.
+ *
+ * See journal_create() for a description of the arguments.
+ *
+ * Returns a valid journal pointer or NULL if it runs into
+ * trouble reading/playing back the journal.
+ */
+journal  *journal_open(struct vnode *jvp,
+                                          off_t         offset,
+                                          off_t         journal_size,
+                                          struct vnode *fsvp,
+                                          size_t        min_fs_block_size,
+                                          int32_t       flags,
+                                          int32_t       tbuffer_size,
+                                          void        (*flush)(void *arg),
+                                          void         *arg);
+
+/*
+ * Call journal_close() just before your file system is unmounted.
+ * It flushes any outstanding transactions and makes sure the
+ * journal is in a consistent state.
+ */
+void      journal_close(journal *journal);
+
+/*
+ * flags for journal_create/open.  only can use 
+ * the low 16 bits for flags because internal 
+ * bits go in the high 16.
+ */
+#define JOURNAL_NO_GROUP_COMMIT   0x00000001
+#define JOURNAL_RESET             0x00000002
+
+/*
+ * Transaction related functions.
+ *
+ * Before you start modifying file system meta data, you
+ * should call journal_start_transaction().  Then before
+ * you modify each block, call journal_modify_block_start()
+ * and when you're done, journal_modify_block_end().  When
+ * you've modified the last block as part of a transaction,
+ * call journal_end_transaction() to commit the changes.
+ *
+ * If you decide to abort the modifications to a block you
+ * should call journal_modify_block_abort().
+ *
+ * If as part of a transaction you need to throw out
+ * any previous copies of a block (because it got deleted)
+ * then call journal_kill_block().  This will mark it so
+ * that the journal does not play it back (effectively
+ * dropping it).
+ */
+int   journal_start_transaction(journal *jnl);
+int   journal_modify_block_start(journal *jnl, struct buf *bp);
+int   journal_modify_block_abort(journal *jnl, struct buf *bp);
+int   journal_modify_block_end(journal *jnl, struct buf *bp);
+int   journal_kill_block(journal *jnl, struct buf *bp);
+int   journal_end_transaction(journal *jnl);
+
+/*
+ * journal_active() returns -1 if the journal has been marked
+ * invalid, 1 if a transaction is currently active on it and
+ * 0 otherwise.
+ *
+ * journal_flush() forces out any buffered transaction when no
+ * transaction is active.  It returns -1 if the journal is invalid
+ * or the journal semaphore could not be acquired, and 0 otherwise.
+ */
+int   journal_active(journal *jnl);
+int   journal_flush(journal *jnl);
+
+#endif /* __APPLE_API_UNSTABLE */
+#endif /* !_SYS_VFS_JOURNAL_H_ */
 
                if (error = VOP_FSYNC(vp, cred, MNT_WAIT, p)) {
                        return (error);
                }
-               if (vp->v_dirtyblkhd.lh_first)
-                       panic("vinvalbuf: dirty bufs");
+
+               // XXXdbg - if there are dirty bufs, wait for 'em if they're busy
+               for (bp=vp->v_dirtyblkhd.lh_first; bp; bp=nbp) {
+                   nbp = bp->b_vnbufs.le_next;
+                   if (ISSET(bp->b_flags, B_BUSY)) {
+                       SET(bp->b_flags, B_WANTED);
+                       tsleep((caddr_t)bp, slpflag | (PRIBIO + 1), "vinvalbuf", 0);
+                       nbp = vp->v_dirtyblkhd.lh_first;
+                   } else {
+                       panic("vinvalbuf: dirty buf (vp 0x%x, bp 0x%x)", vp, bp);
+                   }
+               }
        }
 
        for (;;) {
-               if ((blist = vp->v_cleanblkhd.lh_first) && flags & V_SAVEMETA)
+               if ((blist = vp->v_cleanblkhd.lh_first) && (flags & V_SAVEMETA))
                        while (blist && blist->b_lblkno < 0)
                                blist = blist->b_vnbufs.le_next;
                if (!blist && (blist = vp->v_dirtyblkhd.lh_first) &&
 
                for (bp = blist; bp; bp = nbp) {
                        nbp = bp->b_vnbufs.le_next;
-                       if (flags & V_SAVEMETA && bp->b_lblkno < 0)
+                       if ((flags & V_SAVEMETA) && bp->b_lblkno < 0)
                                continue;
                        s = splbio();
                        if (ISSET(bp->b_flags, B_BUSY)) {
                                (void) VOP_BWRITE(bp);
                                break;
                        }
-                       SET(bp->b_flags, B_INVAL);
+
+                       if (bp->b_flags & B_LOCKED) {
+                               panic("vinvalbuf: bp @ 0x%x is locked!\n", bp);
+                               break;
+                       } else {
+                               SET(bp->b_flags, B_INVAL);
+                       }
                        brelse(bp);
                }
        }
 
 #define kIOCommandPoolSizeKey         "IOCommandPoolSize"          // (OSNumber)
 
 // properties found in services that have transfer constraints
-#define kIOMaximumBlockCountReadKey    "IOMaximumBlockCountRead"    // (OSNumber)
-#define kIOMaximumBlockCountWriteKey   "IOMaximumBlockCountWrite"   // (OSNumber)
-#define kIOMaximumSegmentCountReadKey  "IOMaximumSegmentCountRead"  // (OSNumber)
-#define kIOMaximumSegmentCountWriteKey "IOMaximumSegmentCountWrite" // (OSNumber)
+#define kIOMaximumBlockCountReadKey        "IOMaximumBlockCountRead"        // (OSNumber)
+#define kIOMaximumBlockCountWriteKey       "IOMaximumBlockCountWrite"       // (OSNumber)
+#define kIOMaximumByteCountReadKey         "IOMaximumByteCountRead"         // (OSNumber)
+#define kIOMaximumByteCountWriteKey        "IOMaximumByteCountWrite"        // (OSNumber)
+#define kIOMaximumSegmentCountReadKey      "IOMaximumSegmentCountRead"      // (OSNumber)
+#define kIOMaximumSegmentCountWriteKey     "IOMaximumSegmentCountWrite"     // (OSNumber)
+#define kIOMaximumSegmentByteCountReadKey  "IOMaximumSegmentByteCountRead"  // (OSNumber)
+#define kIOMaximumSegmentByteCountWriteKey "IOMaximumSegmentByteCountWrite" // (OSNumber)
 
 // properties found in services that wish to describe an icon
 //
 
  */
 const char * gIOKernelKmods =
 "{
-    'com.apple.kernel'                         = '6.1';
-    'com.apple.kernel.bsd'                     = '6.1';
-    'com.apple.kernel.iokit'                   = '6.1';
-    'com.apple.kernel.libkern'                 = '6.1';
-    'com.apple.kernel.mach'                    = '6.1';
-    'com.apple.iokit.IOADBFamily'              = '1.1';
-    'com.apple.iokit.IONVRAMFamily'            = '1.1';
-    'com.apple.iokit.IOSystemManagementFamily' = '1.1';
-    'com.apple.iokit.ApplePlatformFamily'      = '1.0';
-    'com.apple.driver.AppleNMI'                = '1.0';
+    'com.apple.kernel'                         = '6.2';
+    'com.apple.kernel.bsd'                     = '6.2';
+    'com.apple.kernel.iokit'                   = '6.2';
+    'com.apple.kernel.libkern'                 = '6.2';
+    'com.apple.kernel.mach'                    = '6.2';
+    'com.apple.iokit.IOADBFamily'              = '6.2';
+    'com.apple.iokit.IONVRAMFamily'            = '6.2';
+    'com.apple.iokit.IOSystemManagementFamily' = '6.2';
+    'com.apple.iokit.ApplePlatformFamily'      = '6.2';
+    'com.apple.driver.AppleNMI'                = '6.2';
 }";
 
 
 
         */
 
 
+/*
+ * copy 'size' bytes from physical to physical address
+ * the caller must validate the physical ranges 
+ *
+ * if flush_action == 0, no cache flush necessary
+ * if flush_action == 1, flush the source
+ * if flush_action == 2, flush the dest
+ * if flush_action == 3, flush both source and dest
+ *
+ * any other flush_action value performs no flush.  the selected
+ * range(s) are flushed both before and after the copy.
+ *
+ * always returns KERN_SUCCESS.
+ */
+
+kern_return_t copyp2p(vm_offset_t source, vm_offset_t dest, unsigned int size, unsigned int flush_action) {
+
+        /* flush the requested range(s) before the copy */
+        switch(flush_action) {
+       case 1:
+               flush_dcache(source, size, 1);
+               break;
+       case 2:
+               flush_dcache(dest, size, 1);
+               break;
+       case 3:
+               flush_dcache(source, size, 1);
+               flush_dcache(dest, size, 1);
+               break;
+       default:
+               break;
+
+       }
+        bcopy_phys((char *)source, (char *)dest, size);        /* Do a physical copy */
+
+        /* and flush the same range(s) again once the copy is done */
+        switch(flush_action) {
+       case 1:
+               flush_dcache(source, size, 1);
+               break;
+       case 2:
+               flush_dcache(dest, size, 1);
+               break;
+       case 3:
+               flush_dcache(source, size, 1);
+               flush_dcache(dest, size, 1);
+               break;
+       default:
+               break;
+
+       }
+
+        /* the function is declared kern_return_t, so it must return a value */
+        return KERN_SUCCESS;
+}
+
+
 
 /*
  *              Copies data from a physical page to a virtual page.  This is used to
 
                        rlwinm. r0,r8,0,MSR_PR_BIT,MSR_PR_BIT   ; See if we are doing this for user state
                        stw             r8,savesrr1(r25)                                ; Set the msr of the interrupted guy
                        xor             r3,r25,r5                                               ; Get the real address of the savearea
-                       bne-    fsnuser                                                 ; We are not user state...
+                       beq-    fsnuser                                                 ; We are not user state...
                        stw             r10,ACT_MACT_SPF(r17)                   ; Set the activation copy
                        stw             r10,spcFlags(r26)                               ; Set per_proc copy
 
                        rlwinm. r0,r8,0,MSR_PR_BIT,MSR_PR_BIT   ; See if we are doing this for user state
                        stw             r8,savesrr1(r25)                                ; Set the msr of the interrupted guy
                        xor             r3,r25,r5                                               ; Get the real address of the savearea
-                       bne-    vrnuser                                                 ; We are not user state...
+                       beq-    vrnuser                                                 ; We are not user state...
                        stw             r10,ACT_MACT_SPF(r17)                   ; Set the activation copy
                        stw             r10,spcFlags(r26)                               ; Set per_proc copy
 
 
 #endif
 
 vm_map_t        mapping_map = VM_MAP_NULL;
+#define                MAPPING_MAP_SIZE        33554432        /* 32MB address space */
 
 unsigned int   incrVSID = 0;                                                                   /* VSID increment value */
 unsigned int   mappingdeb0 = 0;                                                
        mappingblok     *mbn;
        vm_offset_t     mapping_min;
        
-       retr = kmem_suballoc(kernel_map, &mapping_min, mem_size / 16,
+       retr = kmem_suballoc(kernel_map, &mapping_min, MAPPING_MAP_SIZE,
                             FALSE, TRUE, &mapping_map);
 
        if (retr != KERN_SUCCESS)
 }
 
 
+/*
+ * copy 'size' bytes from physical to physical address
+ * the caller must validate the physical ranges 
+ *
+ * if flush_action == 0, no cache flush necessary
+ * if flush_action == 1, flush the source
+ * if flush_action == 2, flush the dest
+ * if flush_action == 3, flush both source and dest
+ *
+ * any other flush_action value performs no flush.  the selected
+ * range(s) are flushed both before and after the copy.
+ *
+ * always returns KERN_SUCCESS.
+ */
+
+kern_return_t copyp2p(vm_offset_t source, vm_offset_t dest, unsigned int size, unsigned int flush_action) {
+
+        /* flush the requested range(s) before the copy */
+        switch(flush_action) {
+       case 1:
+               flush_dcache(source, size, 1);
+               break;
+       case 2:
+               flush_dcache(dest, size, 1);
+               break;
+       case 3:
+               flush_dcache(source, size, 1);
+               flush_dcache(dest, size, 1);
+               break;
+       default:
+               break;
+
+       }
+        bcopy_phys((char *)source, (char *)dest, size);        /* Do a physical copy */
+
+        /* and flush the same range(s) again once the copy is done */
+        switch(flush_action) {
+       case 1:
+               flush_dcache(source, size, 1);
+               break;
+       case 2:
+               flush_dcache(dest, size, 1);
+               break;
+       case 3:
+               flush_dcache(source, size, 1);
+               flush_dcache(dest, size, 1);
+               break;
+       default:
+               break;
+
+       }
+
+        /* the function is declared kern_return_t, so it must return a value */
+        return KERN_SUCCESS;
+}
+
+
+
 #if DEBUG
 /*
  *             Dumps out the mapping stuff associated with a virtual address
 
             hash_table_size *= 2)
                continue;
 
+       if (num > (sizeof(pte_t) * 524288))
+               hash_table_size = hash_table_size/2; /* reduce by half above 512MB */
+
        /* Scale to within any physical memory layout constraints */
        do {
                num = atop(mem_size);   /* num now holds mem_size in pages */
 
        kmem_init(start, end);
        pmap_init();
        
-       zsize = mem_size >> 2;                  /* Get target zone size as 1/4 of physical memory */
+       if (PE_parse_boot_arg("zsize", &zsize))
+               zsize = zsize * 1024 * 1024;
+       else {
+               zsize = mem_size >> 2;                  /* Get target zone size as 1/4 of physical memory */
+       }
        if(zsize < ZONE_MAP_MIN) zsize = ZONE_MAP_MIN;  /* Clamp to min */
        if(zsize > ZONE_MAP_MAX) zsize = ZONE_MAP_MAX;  /* Clamp to max */
        zone_init(zsize);                                               /* Allocate address space for zones */
 
 extern kern_return_t kmem_alloc_pages(
        register vm_object_t            object,
        register vm_object_offset_t     offset,
-       register vm_offset_t            start,
-       register vm_offset_t            end,
-       vm_prot_t                       protection);
+       register vm_size_t              size);
 
 extern void kmem_remap_pages(
        register vm_object_t            object,
 
        /*
         *      Since we have not given out this address yet,
-        *      it is safe to unlock the map.
+        *      it is safe to unlock the map. Except of course
+        *      we must make certain no one coalesces our address
+         *      or does a blind vm_deallocate and removes the object
+        *      an extra object reference will suffice to protect
+        *      against both contingencies.
         */
+       vm_object_reference(object);
        vm_map_unlock(map);
 
        vm_object_lock(object);
                                                offset + (vm_object_offset_t)i);
                                vm_object_unlock(object);
                                vm_map_remove(map, addr, addr + size, 0);
+                               vm_object_deallocate(object);
                                return KERN_RESOURCE_SHORTAGE;
                        }
                        vm_object_unlock(object);
                        vm_object_unlock(object);
                }
                vm_map_remove(map, addr, addr + size, 0);
+               vm_object_deallocate(object);
                return (kr);
        }
+       /* now that the page is wired, we no longer have to fear coalesce */
+       vm_object_deallocate(object);
        if (object == kernel_object)
                vm_map_simplify(map, addr);
 
        vm_offset_t     *newaddrp,
        vm_size_t       newsize)
 {
-       vm_offset_t oldmin, oldmax;
-       vm_offset_t newaddr;
-       vm_object_t object;
-       vm_map_entry_t oldentry, newentry;
-       kern_return_t kr;
+       vm_offset_t     oldmin, oldmax;
+       vm_offset_t     newaddr;
+       vm_offset_t     offset;
+       vm_object_t     object;
+       vm_map_entry_t  oldentry, newentry;
+       vm_page_t       mem;
+       kern_return_t   kr;
 
        oldmin = trunc_page(oldaddr);
        oldmax = round_page(oldaddr + oldsize);
        oldsize = oldmax - oldmin;
        newsize = round_page(newsize);
 
-       /*
-        *      Find space for the new region.
-        */
-
-       kr = vm_map_find_space(map, &newaddr, newsize, (vm_offset_t) 0,
-                              &newentry);
-       if (kr != KERN_SUCCESS) {
-               return kr;
-       }
 
        /*
         *      Find the VM object backing the old region.
         */
 
+       vm_map_lock(map);
+
        if (!vm_map_lookup_entry(map, oldmin, &oldentry))
                panic("kmem_realloc");
        object = oldentry->object.vm_object;
         */
 
        vm_object_reference(object);
+       /* by grabbing the object lock before unlocking the map */
+       /* we guarantee that we will panic if more than one     */
+       /* attempt is made to realloc a kmem_alloc'd area       */
        vm_object_lock(object);
+       vm_map_unlock(map);
        if (object->size != oldsize)
                panic("kmem_realloc");
        object->size = newsize;
        vm_object_unlock(object);
 
-       newentry->object.vm_object = object;
-       newentry->offset = 0;
-       assert (newentry->wired_count == 0);
-       newentry->wired_count = 1;
+       /* allocate the new pages while expanded portion of the */
+       /* object is still not mapped */
+       kmem_alloc_pages(object, oldsize, newsize-oldsize);
+
 
        /*
-        *      Since we have not given out this address yet,
-        *      it is safe to unlock the map.  We are trusting
-        *      that nobody will play with either region.
+        *      Find space for the new region.
         */
 
+       kr = vm_map_find_space(map, &newaddr, newsize, (vm_offset_t) 0,
+                              &newentry);
+       if (kr != KERN_SUCCESS) {
+               vm_object_lock(object);
+               for(offset = oldsize; 
+                               offset<newsize; offset+=PAGE_SIZE) {
+                       if ((mem = vm_page_lookup(object, offset)) != VM_PAGE_NULL) {
+                               vm_page_lock_queues();
+                               vm_page_free(mem);
+                               vm_page_unlock_queues();
+                       }
+               }
+               object->size = oldsize;
+               vm_object_unlock(object);
+               vm_object_deallocate(object);
+               return kr;
+       }
+       newentry->object.vm_object = object;
+       newentry->offset = 0;
+       assert (newentry->wired_count == 0);
+
+       
+       /* add an extra reference in case we have someone doing an */
+       /* unexpected deallocate */
+       vm_object_reference(object);
        vm_map_unlock(map);
 
-       /*
-        *      Remap the pages in the old region and
-        *      allocate more pages for the new region.
-        */
+       if ((kr = vm_map_wire(map, newaddr, newaddr + newsize, 
+                               VM_PROT_DEFAULT, FALSE)) != KERN_SUCCESS) {
+               vm_map_remove(map, newaddr, newaddr + newsize, 0);
+               vm_object_lock(object);
+               for(offset = oldsize; 
+                               offset<newsize; offset+=PAGE_SIZE) {
+                       if ((mem = vm_page_lookup(object, offset)) != VM_PAGE_NULL) {
+                               vm_page_lock_queues();
+                               vm_page_free(mem);
+                               vm_page_unlock_queues();
+                       }
+               }
+               object->size = oldsize;
+               vm_object_unlock(object);
+               vm_object_deallocate(object);
+               return (kr);
+       }
+       vm_object_deallocate(object);
 
-       kmem_remap_pages(object, 0,
-                        newaddr, newaddr + oldsize,
-                        VM_PROT_DEFAULT);
-       kmem_alloc_pages(object, oldsize,
-                        newaddr + oldsize, newaddr + newsize,
-                        VM_PROT_DEFAULT);
 
        *newaddrp = newaddr;
        return KERN_SUCCESS;
 }
 
 /*
- *     Allocate new wired pages in an object.
- *     The object is assumed to be mapped into the kernel map or
- *     a submap.
+ *     Allocate new pages in an object.
  */
 
 kern_return_t
 kmem_alloc_pages(
        register vm_object_t            object,
        register vm_object_offset_t     offset,
-       register vm_offset_t            start,
-       register vm_offset_t            end,
-       vm_prot_t                       protection)
+       register vm_size_t              size)
 {
-       /*
-        *      Mark the pmap region as not pageable.
-        */
-       pmap_pageable(kernel_pmap, start, end, FALSE);
 
-       while (start < end) {
+       size = round_page(size);
+        vm_object_lock(object);
+       while (size) {
            register vm_page_t  mem;
 
-           vm_object_lock(object);
 
            /*
             *  Allocate a page
                vm_object_lock(object);
            }
 
-           /*
-            *  Wire it down
-            */
-           vm_page_lock_queues();
-           vm_page_wire(mem);
-           vm_page_unlock_queues();
-           vm_object_unlock(object);
-
-           /*
-            *  Enter it in the kernel pmap
-            */
-           PMAP_ENTER(kernel_pmap, start, mem, protection, 
-                               VM_WIMG_USE_DEFAULT, TRUE);
-
-           vm_object_lock(object);
-           PAGE_WAKEUP_DONE(mem);
-           vm_object_unlock(object);
 
-           start += PAGE_SIZE;
-           offset += PAGE_SIZE_64;
+           offset += PAGE_SIZE;
+           size -= PAGE_SIZE;
+           mem->busy = FALSE;
        }
+       vm_object_unlock(object);
        return KERN_SUCCESS;
 }