From: Apple
Date: Tue, 12 Aug 2003 21:04:55 +0000 (+0000)
Subject: xnu-344.12.2.tar.gz
X-Git-Tag: mac-os-x-1022^0
X-Git-Url: https://git.saurik.com/apple/xnu.git/commitdiff_plain/b4c24cb9d3df001f2892dc4ed451bc769ff28a9f?ds=sidebyside

xnu-344.12.2.tar.gz
---
diff --git a/bsd/conf/files b/bsd/conf/files
index 7012205fa..817d99f42 100644
--- a/bsd/conf/files
+++ b/bsd/conf/files
@@ -137,6 +137,7 @@
 bsd/vfs/vfs_support.c standard
 bsd/vfs/vfs_utfconv.c standard
 bsd/vfs/vfs_vnops.c standard
 bsd/vfs/vnode_if.c standard
+bsd/vfs/vfs_journal.c standard
 bsd/miscfs/deadfs/dead_vnops.c standard
 bsd/miscfs/fdesc/fdesc_vfsops.c optional fdesc
@@ -501,6 +502,8 @@
 bsd/kern/mach_header.c standard
 bsd/kern/mach_loader.c standard
 bsd/kern/posix_sem.c standard
 bsd/kern/posix_shm.c standard
+# XXXdbg - I need this in the journaling and block cache code
+bsd/kern/qsort.c standard
 bsd/vm/vnode_pager.c standard
 bsd/vm/vm_unix.c standard
diff --git a/bsd/conf/version.minor b/bsd/conf/version.minor
index d00491fd7..0cfbf0888 100644
--- a/bsd/conf/version.minor
+++ b/bsd/conf/version.minor
@@ -1 +1 @@
-1
+2
diff --git a/bsd/hfs/hfs.h b/bsd/hfs/hfs.h
index 9086981b0..b82adcf2e 100644
--- a/bsd/hfs/hfs.h
+++ b/bsd/hfs/hfs.h
@@ -36,6 +36,8 @@
 #include
 #include
 
+#include
+
 #include
 #include
 #include
@@ -108,6 +110,7 @@ struct vcb_t {
     int16_t     vcbAtrb;
     int16_t     vcbFlags;
    int16_t     vcbspare;
+    u_int32_t   vcbJinfoBlock;
 
     u_int32_t   vcbCrDate;
     u_int32_t   vcbLsMod;
@@ -180,6 +183,7 @@ typedef struct hfsmount {
     u_int8_t    hfs_fs_ronly;            /* Whether this was mounted as read-only initially */
     u_int8_t    hfs_unknownpermissions;  /* Whether this was mounted with MNT_UNKNOWNPERMISSIONS */
     u_int8_t    hfs_media_writeable;
+    u_int8_t    hfs_orphans_cleaned;
 
     /* Physical Description */
     u_long      hfs_phys_block_count;    /* Num of PHYSICAL blocks of volume */
@@ -211,10 +215,55 @@ typedef struct hfsmount {
     unicode_to_hfs_func_t    hfs_get_hfsname;
 
     struct quotafile hfs_qfiles[MAXQUOTAS];    /* quota files */
+
+    // XXXdbg
+    void           *jnl;          // the journal for this volume (if one exists)
+    struct vnode   *jvp;          // device where the journal lives (may be equal to devvp)
+    u_int32_t       jnl_start;    // start block of the journal file (so we don't delete it)
+    u_int32_t       hfs_jnlfileid;
+    u_int32_t       hfs_jnlinfoblkid;
+    volatile int    readers;
+    volatile int    blocker;
 } hfsmount_t;
 
 #define hfs_private_metadata_dir  hfs_privdir_desc.cd_cnid
 
+#define hfs_global_shared_lock_acquire(hfsmp) \
+    do { \
+        if (hfsmp->blocker) { \
+            tsleep((caddr_t)&hfsmp->blocker, PRIBIO, "journal_blocker", 0); \
+            continue; \
+        } \
+        hfsmp->readers++; \
+        break; \
+    } while (1)
+
+#define hfs_global_shared_lock_release(hfsmp) \
+    do { \
+        hfsmp->readers--; \
+        if (hfsmp->readers == 0) { \
+            wakeup((caddr_t)&hfsmp->readers); \
+        } \
+    } while (0)
+
+#define hfs_global_exclusive_lock_acquire(hfsmp) \
+    do { \
+        if (hfsmp->blocker) { \
+            tsleep((caddr_t)&hfsmp->blocker, PRIBIO, "journal_blocker", 0); \
+            continue; \
+        } \
+        if (hfsmp->readers != 0) { \
+            tsleep((caddr_t)&hfsmp->readers, PRIBIO, "journal_enable/disable", 0); \
+            continue; \
+        } \
+        hfsmp->blocker = 1; \
+        break; \
+    } while (1)
+
+#define hfs_global_exclusive_lock_release(hfsmp) \
+    hfsmp->blocker = 0; \
+    wakeup((caddr_t)&hfsmp->blocker)
 
 #define MAXHFSVNODELEN		31
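The macros above gate every metadata operation: ordinary writers take the shared lock (counted in `readers`) and open a journal transaction, while enabling or disabling the journal takes the exclusive lock (`blocker`). A minimal sketch of the call-site pattern as it recurs throughout this change (hfs_getattrlist, hfs_link, hfs_truncate, hfs_allocate and others); the function name and the work in the middle are illustrative only:

    static int
    hfs_do_journaled_work(struct hfsmount *hfsmp)
    {
        int error = 0;

        // block if a journal enable/disable is in progress,
        // otherwise register ourselves as a reader
        hfs_global_shared_lock_acquire(hfsmp);

        if (hfsmp->jnl) {
            // open a transaction so every metadata block modified
            // below lands in one atomic journal entry
            if (journal_start_transaction(hfsmp->jnl) != 0) {
                hfs_global_shared_lock_release(hfsmp);
                return EINVAL;
            }
        }

        // ... modify catalog / extents / bitmap blocks here ...

        if (hfsmp->jnl) {
            journal_end_transaction(hfsmp->jnl);
        }
        hfs_global_shared_lock_release(hfsmp);
        return error;
    }

Note that the transaction is started only when `hfsmp->jnl` is non-NULL, so the same code path serves both journaled and non-journaled volumes.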
@@ -325,6 +374,7 @@ enum { kdirentMaxNameBytes = NAME_MAX };
 #define VTOHFS(VP)    ((struct hfsmount *)((VP)->v_mount->mnt_data))
 #define VFSTOHFS(MP)  ((struct hfsmount *)(MP)->mnt_data)
 #define VCBTOHFS(VCB) (((struct vfsVCB *)(VCB))->vcb_hfsmp)
+#define FCBTOHFS(FCB) ((struct hfsmount *)(FCB)->ff_cp->c_vp->v_mount->mnt_data)
 
 /*
  * Various ways to acquire a VCB pointer:
@@ -332,6 +382,7 @@ enum { kdirentMaxNameBytes = NAME_MAX };
 #define VTOVCB(VP)       (&(((struct hfsmount *)((VP)->v_mount->mnt_data))->hfs_vcb.vcb_vcb))
 #define VFSTOVCB(MP)     (&(((struct hfsmount *)(MP)->mnt_data)->hfs_vcb.vcb_vcb))
 #define HFSTOVCB(HFSMP)  (&(HFSMP)->hfs_vcb.vcb_vcb)
+#define FCBTOVCB(FCB)    (&(((struct hfsmount *)((FCB)->ff_cp->c_vp->v_mount->mnt_data))->hfs_vcb.vcb_vcb))
 
 #define E_NONE	0
@@ -376,6 +427,8 @@ extern int hfs_metafilelocking(struct hfsmount *hfsmp, u_long fileID, u_int flag
 
 extern u_int32_t hfs_freeblks(struct hfsmount * hfsmp, int wantreserve);
 
+extern void hfs_remove_orphans(struct hfsmount *);
+
 
 short MacToVFSError(OSErr err);
@@ -388,6 +441,8 @@ u_long FindMetaDataDirectory(ExtendedVCB *vcb);
 #define HFS_SYNCTRANS 1
 extern int hfs_btsync(struct vnode *vp, int sync_transaction);
+// used as a callback by the journaling code
+extern void hfs_sync_metadata(void *arg);
 
 short make_dir_entry(FCB **fileptr, char *name, u_int32_t fileID);
@@ -399,7 +454,13 @@ unsigned long BestBlockSizeFit(unsigned long allocationBlockSize,
 OSErr	hfs_MountHFSVolume(struct hfsmount *hfsmp, HFSMasterDirectoryBlock *mdb,
 		struct proc *p);
 OSErr	hfs_MountHFSPlusVolume(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp,
-		off_t embeddedOffset, u_int64_t disksize, struct proc *p);
+		off_t embeddedOffset, u_int64_t disksize, struct proc *p, void *args);
+
+extern int hfs_early_journal_init(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp,
+		void *_args, int embeddedOffset, int mdb_offset,
+		HFSMasterDirectoryBlock *mdbp, struct ucred *cred);
+extern u_long GetFileInfo(ExtendedVCB *vcb, u_int32_t dirid, char *name,
+		struct cat_attr *fattr, struct cat_fork *forkinfo);
 
 int hfs_getconverter(u_int32_t encoding, hfs_to_unicode_func_t *get_unicode,
 		unicode_to_hfs_func_t *get_hfsname);
diff --git a/bsd/hfs/hfs_attrlist.c b/bsd/hfs/hfs_attrlist.c
index 650d6fa4a..f53d05e3f 100644
--- a/bsd/hfs/hfs_attrlist.c
+++ b/bsd/hfs/hfs_attrlist.c
@@ -194,15 +194,35 @@ hfs_getattrlist(ap)
 		if ((error = hfs_write_access(vp, ap->a_cred, ap->a_p, false)) != 0)
 			return (error);
 
+		// XXXdbg
+		hfs_global_shared_lock_acquire(hfsmp);
+		if (hfsmp->jnl) {
+			if ((error = journal_start_transaction(hfsmp->jnl)) != 0) {
+				hfs_global_shared_lock_release(hfsmp);
+				return error;
+			}
+		}
+
 		/* Lock catalog b-tree */
 		error = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_SHARED, ap->a_p);
-		if (error)
-			return (error);
+		if (error) {
+			if (hfsmp->jnl) {
+				journal_end_transaction(hfsmp->jnl);
+			}
+			hfs_global_shared_lock_release(hfsmp);
+			return (error);
+		}
 
 		error = cat_insertfilethread(hfsmp, &cp->c_desc);
 
 		/* Unlock catalog b-tree */
 		(void) hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_RELEASE, ap->a_p);
+
+		if (hfsmp->jnl) {
+			journal_end_transaction(hfsmp->jnl);
+		}
+		hfs_global_shared_lock_release(hfsmp);
+
 		if (error)
 			return (error);
 	}
@@ -350,6 +370,17 @@ hfs_setattrlist(ap)
 	}
 	if (cp->c_flag & (C_NOEXISTS | C_DELETED))
 		return (ENOENT);
+
+	// XXXdbg - don't allow modifying the journal or journal_info_block
+	if (hfsmp->jnl && cp->c_datafork) {
+		struct HFSPlusExtentDescriptor *extd;
+
+		extd = &cp->c_datafork->ff_data.cf_extents[0];
+		if (extd->startBlock == HFSTOVCB(hfsmp)->vcbJinfoBlock || extd->startBlock == hfsmp->jnl_start) {
+			return EPERM;
+		}
+	}
+
 	/*
 	 * Ownership of a file is required in one of two classes of calls:
 	 *
@@ -447,14 +478,12 @@ hfs_setattrlist(ap)
 	 * If any cnode attributes changed then do an update.
 	 */
 	if (alist->volattr == 0) {
-		struct timeval atime, mtime;
+		struct timeval tv;
 
-		atime.tv_sec = cp->c_atime;
-		atime.tv_usec = 0;
-		mtime.tv_sec = cp->c_mtime;
-		mtime.tv_usec = cp->c_mtime_nsec / 1000;
 		cp->c_flag |= C_MODIFIED;
-		if ((error = VOP_UPDATE(vp, &atime, &mtime, 1)))
+		tv = time;
+		CTIMES(cp, &tv, &tv);
+		if ((error = VOP_UPDATE(vp, &tv, &tv, 1)))
 			goto ErrorExit;
 	}
 	/* Volume Rename */
@@ -482,9 +511,28 @@ hfs_setattrlist(ap)
 		to_desc.cd_cnid = cp->c_cnid;
 		to_desc.cd_flags = CD_ISDIR;
 
+		// XXXdbg
+		hfs_global_shared_lock_acquire(hfsmp);
+		if (hfsmp->jnl) {
+			if (journal_start_transaction(hfsmp->jnl) != 0) {
+				hfs_global_shared_lock_release(hfsmp);
+				error = EINVAL;
+				/* Restore the old name in the VCB */
+				copystr(cp->c_desc.cd_nameptr, vcb->vcbVN, sizeof(vcb->vcbVN), NULL);
+				vcb->vcbFlags |= 0xFF00;
+				goto ErrorExit;
+			}
+		}
+
+
 		/* Lock catalog b-tree */
 		error = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_EXCLUSIVE, p);
 		if (error) {
+			if (hfsmp->jnl) {
+				journal_end_transaction(hfsmp->jnl);
+			}
+			hfs_global_shared_lock_release(hfsmp);
+
 			/* Restore the old name in the VCB */
 			copystr(cp->c_desc.cd_nameptr, vcb->vcbVN, sizeof(vcb->vcbVN), NULL);
 			vcb->vcbFlags |= 0xFF00;
@@ -495,7 +543,12 @@ hfs_setattrlist(ap)
 
 		/* Unlock the Catalog */
 		(void) hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_RELEASE, p);
-
+
+		if (hfsmp->jnl) {
+			journal_end_transaction(hfsmp->jnl);
+		}
+		hfs_global_shared_lock_release(hfsmp);
+
 		if (error) {
 			/* Restore the old name in the VCB */
 			copystr(cp->c_desc.cd_nameptr, vcb->vcbVN, sizeof(vcb->vcbVN), NULL);
@@ -601,12 +654,17 @@ hfs_readdirattr(ap)
 	int error = 0;
 	int depleted = 0;
 	int index, startindex;
-	int i;
+	int i, dir_entries;
 	struct cat_desc *lastdescp = NULL;
 	struct cat_desc prevdesc;
 	char * prevnamebuf = NULL;
 	struct cat_entrylist *ce_list = NULL;
 
+	dir_entries = dcp->c_entries;
+	if (dcp->c_attr.ca_fileid == kHFSRootFolderID && hfsmp->jnl) {
+		dir_entries -= 3;
+	}
+
 	*(ap->a_actualcount) = 0;
 	*(ap->a_eofflag) = 0;
 
@@ -639,7 +697,7 @@ hfs_readdirattr(ap)
 	/* Convert uio_offset into a directory index. */
 	startindex = index = uio->uio_offset / sizeof(struct dirent);
-	if ((index + 1) > dcp->c_entries) {
+	if ((index + 1) > dir_entries) {
 		*(ap->a_eofflag) = 1;
 		error = 0;
 		goto exit;
@@ -781,7 +839,7 @@ hfs_readdirattr(ap)
 		/* Termination checks */
 		if ((--maxcount <= 0) ||
 		    (uio->uio_resid < (fixedblocksize + HFS_AVERAGE_NAME_SIZE)) ||
-		    (index >= dcp->c_entries)) {
+		    (index >= dir_entries)) {
 			depleted = 1;
 			break;
 		}
	} /* for each catalog entry */
 
 	/* If there are more entries then save the last name.
 	 */
-	if (index < dcp->c_entries
+	if (index < dir_entries
 	    && !(*(ap->a_eofflag)) && lastdescp != NULL) {
 		if (prevnamebuf == NULL)
@@ -1408,9 +1466,12 @@ packdirattr(
 	if (ATTR_DIR_ENTRYCOUNT & attr) {
 		u_long entries = cattrp->ca_entries;
 
-		if ((descp->cd_parentcnid == kRootParID) &&
-		    (hfsmp->hfs_private_metadata_dir != 0))
-			--entries;	/* hide private dir */
+		if (descp->cd_parentcnid == kRootParID) {
+			if (hfsmp->hfs_private_metadata_dir != 0)
+				--entries;	/* hide private dir */
+			if (hfsmp->jnl)
+				entries -= 2;	/* hide the journal files */
+		}
 
 		*((u_long *)attrbufptr)++ = entries;
 	}
diff --git a/bsd/hfs/hfs_btreeio.c b/bsd/hfs/hfs_btreeio.c
index 6947a695a..a70290d05 100644
--- a/bsd/hfs/hfs_btreeio.c
+++ b/bsd/hfs/hfs_btreeio.c
@@ -68,7 +68,7 @@ OSStatus GetBTreeBlock(FileReference vp, UInt32 blockNum, GetBlockOptions option
     if (options & kGetEmptyBlock)
         bp = getblk(vp, blockNum, block->blockSize, 0, 0, BLK_META);
     else
-        retval = meta_bread(vp, blockNum, block->blockSize, NOCRED, &bp);
+        retval = meta_bread(vp, blockNum, block->blockSize, NOCRED, &bp);
 
     DBG_ASSERT(bp != NULL);
     DBG_ASSERT(bp->b_data != NULL);
@@ -83,6 +83,9 @@ OSStatus GetBTreeBlock(FileReference vp, UInt32 blockNum, GetBlockOptions option
         block->buffer = bp->b_data;
         block->blockReadFromDisk = (bp->b_flags & B_CACHE) == 0;	/* not found in cache ==> came from disk */
 
+        // XXXdbg
+        block->isModified = 0;
+
 #if BYTE_ORDER == LITTLE_ENDIAN
         /* Endian swap B-Tree node (only if it's a valid block) */
         if (!(options & kGetEmptyBlock)) {
@@ -116,9 +119,31 @@ OSStatus GetBTreeBlock(FileReference vp, UInt32 blockNum, GetBlockOptions option
 }
 
 
+__private_extern__
+void ModifyBlockStart(FileReference vp, BlockDescPtr blockPtr)
+{
+	struct hfsmount	*hfsmp = VTOHFS(vp);
+	struct buf *bp = NULL;
+
+	if (hfsmp->jnl == NULL) {
+		return;
+	}
+
+	bp = (struct buf *) blockPtr->blockHeader;
+	if (bp == NULL) {
+		panic("ModifyBlockStart: null bp for blockdescptr 0x%x?!?\n", blockPtr);
+		return;
+	}
+
+	journal_modify_block_start(hfsmp->jnl, bp);
+	blockPtr->isModified = 1;
+}
+
+
 __private_extern__
 OSStatus ReleaseBTreeBlock(FileReference vp, BlockDescPtr blockPtr, ReleaseBlockOptions options)
 {
+    struct hfsmount *hfsmp = VTOHFS(vp);
     extern int bdwrite_internal(struct buf *, int);
     OSStatus	retval = E_NONE;
     struct buf *bp = NULL;
@@ -131,16 +156,25 @@ OSStatus ReleaseBTreeBlock(FileReference vp, BlockDescPtr blockPtr, ReleaseBlock
     }
 
     if (options & kTrashBlock) {
-        bp->b_flags |= B_INVAL;
-        brelse(bp);	/* note: B-tree code will clear blockPtr->blockHeader and blockPtr->buffer */
+        bp->b_flags |= B_INVAL;
+        if (hfsmp->jnl && (bp->b_flags & B_LOCKED)) {
+            journal_kill_block(hfsmp->jnl, bp);
+        } else {
+            brelse(bp);	/* note: B-tree code will clear blockPtr->blockHeader and blockPtr->buffer */
+        }
     } else {
         if (options & kForceWriteBlock) {
-            retval = VOP_BWRITE(bp);
+            if (hfsmp->jnl) {
+                if (blockPtr->isModified == 0) {
+                    panic("hfs: releaseblock: modified is 0 but forcewrite set! bp 0x%x\n", bp);
+                }
+                retval = journal_modify_block_end(hfsmp->jnl, bp);
+                blockPtr->isModified = 0;
+            } else {
+                retval = VOP_BWRITE(bp);
+            }
         } else if (options & kMarkBlockDirty) {
-#if FORCESYNCBTREEWRITES
-            VOP_BWRITE(bp);
-#else
-            if (options & kLockTransaction) {
+            if ((options & kLockTransaction) && hfsmp->jnl == NULL) {
                 /*
                  *
                  * Set the B_LOCKED flag and unlock the buffer, causing brelse to move
@@ -156,24 +190,44 @@ OSStatus ReleaseBTreeBlock(FileReference vp, BlockDescPtr blockPtr, ReleaseBlock
                     /* Rollback sync time to cause a sync on lock release... */
                     (void) BTSetLastSync(VTOF(vp), time.tv_sec - (kMaxSecsForFsync + 1));
                 }
-                bp->b_flags |= B_LOCKED;
-            }
+
+                bp->b_flags |= B_LOCKED;
+            }
+
             /*
              * Delay-write this block.
              * If the maximum delayed buffers has been exceeded then
              * free up some buffers and fall back to an asynchronous write.
              */
-            if (bdwrite_internal(bp, 1) != 0) {
+            if (hfsmp->jnl) {
+                if (blockPtr->isModified == 0) {
+                    panic("hfs: releaseblock: modified is 0 but markdirty set! bp 0x%x\n", bp);
+                }
+                retval = journal_modify_block_end(hfsmp->jnl, bp);
+                blockPtr->isModified = 0;
+            } else if (bdwrite_internal(bp, 1) != 0) {
                 hfs_btsync(vp, 0);
                 /* Rollback sync time to cause a sync on lock release... */
                 (void) BTSetLastSync(VTOF(vp), time.tv_sec - (kMaxSecsForFsync + 1));
                 bp->b_flags &= ~B_LOCKED;
                 bawrite(bp);
             }
-
-#endif
         } else {
-            brelse(bp);	/* note: B-tree code will clear blockPtr->blockHeader and blockPtr->buffer */
+            // check if we had previously called journal_modify_block_start()
+            // on this block and if so, abort it (which will call brelse()).
+            if (hfsmp->jnl && blockPtr->isModified) {
+                // XXXdbg - I don't want to call modify_block_abort()
+                //          because I think it may be screwing up the
+                //          journal and blowing away a block that has
+                //          valid data in it.
+                //
+                //    journal_modify_block_abort(hfsmp->jnl, bp);
+                //panic("hfs: releaseblock called for 0x%x but mod_block_start previously called.\n", bp);
+                journal_modify_block_end(hfsmp->jnl, bp);
+                blockPtr->isModified = 0;
+            } else {
+                brelse(bp);	/* note: B-tree code will clear blockPtr->blockHeader and blockPtr->buffer */
+            }
         };
     };
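ReleaseBTreeBlock above pairs with the new ModifyBlockStart: a b-tree node must be declared to the journal before it is edited, and is then handed back through journal_modify_block_end instead of being written directly. A condensed sketch of that protocol, using the journal_modify_block_start/_end calls shown above; btree_update_node is a hypothetical caller standing in for the real b-tree code:

    static void
    btree_update_node(FileReference vp, BlockDescPtr blockPtr)
    {
        struct hfsmount *hfsmp = VTOHFS(vp);
        struct buf *bp = (struct buf *) blockPtr->blockHeader;

        // 1) declare intent to modify, so the journal can track the buffer
        if (hfsmp->jnl) {
            journal_modify_block_start(hfsmp->jnl, bp);
            blockPtr->isModified = 1;
        }

        // 2) ... edit the node contents in bp->b_data ...

        // 3) hand the buffer to the journal instead of writing it directly;
        //    the journal decides when the block actually reaches disk
        if (hfsmp->jnl && blockPtr->isModified) {
            journal_modify_block_end(hfsmp->jnl, bp);
            blockPtr->isModified = 0;
        }
    }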
@@ -187,17 +241,16 @@ OSStatus ExtendBTreeFile(FileReference vp, FSSize minEOF, FSSize maxEOF)
 {
 #pragma unused (maxEOF)
 
-	OSStatus	retval;
-	UInt64		actualBytesAdded;
+	OSStatus	retval, ret;
+	UInt64		actualBytesAdded, origSize;
 	UInt64		bytesToAdd;
-	UInt32		extendFlags;
 	u_int32_t	startAllocation;
 	u_int32_t	fileblocks;
 	BTreeInfoRec	btInfo;
 	ExtendedVCB	*vcb;
 	FCB		*filePtr;
 	struct proc	*p = NULL;
-
+	UInt64		trim = 0;
 
 	filePtr = GetFileControlBlock(vp);
@@ -225,13 +278,14 @@ OSStatus ExtendBTreeFile(FileReference vp, FSSize minEOF, FSSize maxEOF)
 	{
 		p = current_proc();
 		/* lock extents b-tree (also protects volume bitmap) */
-		retval = hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, LK_EXCLUSIVE, p);
+		retval = hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, LK_EXCLUSIVE, p);
 		if (retval)
 			return (retval);
 	}
 
 	(void) BTGetInformation(filePtr, 0, &btInfo);
 
+#if 0  // XXXdbg
 	/*
 	 * The b-tree code expects nodes to be contiguous. So when
 	 * the allocation block size is less than the b-tree node
@@ -241,14 +295,38 @@
 		extendFlags = 0;
 	} else {
 		/* Ensure that all b-tree nodes are contiguous on disk */
-		extendFlags = kEFAllMask | kEFContigMask;
+		extendFlags = kEFContigMask;
 	}
+#endif
 
+	origSize = filePtr->fcbEOF;
 	fileblocks = filePtr->ff_blocks;
 	startAllocation = vcb->nextAllocation;
 
-	retval = ExtendFileC(vcb, filePtr, bytesToAdd, 0, extendFlags, &actualBytesAdded);
-
+	// loop trying to get a contiguous chunk that's an integer multiple
+	// of the btree node size.  if we can't get a contiguous chunk that
+	// is at least the node size then we break out of the loop and let
+	// the error propagate back up.
+	do {
+		retval = ExtendFileC(vcb, filePtr, bytesToAdd, 0, kEFContigMask, &actualBytesAdded);
+		if (retval == dskFulErr && actualBytesAdded == 0) {
+
+			if (bytesToAdd == btInfo.nodeSize || bytesToAdd < (minEOF - origSize)) {
+				// if we're here there's nothing else to try, we're out
+				// of space so we break and bail out.
+				break;
+			} else {
+				bytesToAdd >>= 1;
+				if (bytesToAdd < btInfo.nodeSize) {
+					bytesToAdd = btInfo.nodeSize;
+				} else if ((bytesToAdd % btInfo.nodeSize) != 0) {
+					// make sure it's an integer multiple of the nodeSize
+					bytesToAdd -= (bytesToAdd % btInfo.nodeSize);
+				}
+			}
+		}
+	} while (retval == dskFulErr && actualBytesAdded == 0);
+
 	/*
 	 * If a new extent was added then move the roving allocator
 	 * reference forward by the current b-tree file size so
@@ -260,25 +338,74 @@ OSStatus ExtendBTreeFile(FileReference vp, FSSize minEOF, FSSize maxEOF)
 		vcb->nextAllocation += fileblocks;
 	}
 
+	filePtr->fcbEOF = (u_int64_t)filePtr->ff_blocks * (u_int64_t)vcb->blockSize;
+
+	// XXXdbg ExtendFileC() could have returned an error even though
+	// it grew the file to be big enough for our needs.  If this is
+	// the case, we don't care about retval so we blow it away.
+	//
+	if (filePtr->fcbEOF >= minEOF && retval != 0) {
+		retval = 0;
+	}
+
+	// XXXdbg if the file grew but isn't large enough or isn't an
+	// even multiple of the nodeSize then trim things back.  if
+	// the file isn't large enough we trim back to the original
+	// size.  otherwise we trim back to be an even multiple of the
+	// btree node size.
+	//
+	if ((filePtr->fcbEOF < minEOF) || (actualBytesAdded % btInfo.nodeSize) != 0) {
+
+		if (filePtr->fcbEOF < minEOF) {
+			retval = dskFulErr;
+
+			if (filePtr->fcbEOF < origSize) {
+				panic("hfs: btree file eof %lld less than orig size %lld!\n",
+					  filePtr->fcbEOF, origSize);
+			}
+
+			trim = filePtr->fcbEOF - origSize;
+			if (trim != actualBytesAdded) {
+				panic("hfs: trim == %lld but actualBytesAdded == %lld\n",
+					  trim, actualBytesAdded);
+			}
+		} else {
+			trim = (actualBytesAdded % btInfo.nodeSize);
+		}
+
+		ret = TruncateFileC(vcb, filePtr, filePtr->fcbEOF - trim, 0);
+		filePtr->fcbEOF = (u_int64_t)filePtr->ff_blocks * (u_int64_t)vcb->blockSize;
+
+		// XXXdbg - panic if the file didn't get trimmed back properly
+		if ((filePtr->fcbEOF % btInfo.nodeSize) != 0) {
+			panic("hfs: truncate file didn't! fcbEOF %lld nsize %d fcb 0x%x\n",
+				  filePtr->fcbEOF, btInfo.nodeSize, filePtr);
+		}
+
+		if (ret) {
+			// XXXdbg - this probably doesn't need to be a panic()
+			panic("hfs: error truncating btree files (sz 0x%llx, trim %lld, ret %d)\n",
				  filePtr->fcbEOF, trim, ret);
+			return ret;
+		}
+		actualBytesAdded -= trim;
+	}
+
 	if(VTOC(vp)->c_fileid != kHFSExtentsFileID) {
 		/*
 		 * Get any extents overflow b-tree changes to disk ASAP!
 		 */
-		if (retval == 0) {
-			(void) BTFlushPath(VTOF(vcb->extentsRefNum));
-			(void) VOP_FSYNC(vcb->extentsRefNum, NOCRED, MNT_WAIT, p);
-		}
+		(void) BTFlushPath(VTOF(vcb->extentsRefNum));
+		(void) VOP_FSYNC(vcb->extentsRefNum, NOCRED, MNT_WAIT, p);
+
 		(void) hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, LK_RELEASE, p);
 	}
 
-	if (retval)
-		return (retval);
-
-	filePtr->fcbEOF = (u_int64_t)filePtr->ff_blocks * (u_int64_t)vcb->blockSize;
-
-	retval = ClearBTNodes(vp, btInfo.nodeSize, filePtr->fcbEOF - actualBytesAdded, actualBytesAdded);
-	if (retval)
-		return (retval);
-
+	if ((filePtr->fcbEOF % btInfo.nodeSize) != 0) {
+		panic("hfs: extendbtree: fcb 0x%x has eof 0x%llx not a multiple of 0x%x (trim %llx)\n",
+			  filePtr, filePtr->fcbEOF, btInfo.nodeSize, trim);
+	}
+
 	/*
 	 * Update the Alternate MDB or Alternate VolumeHeader
 	 */
@@ -287,8 +414,12 @@ OSStatus ExtendBTreeFile(FileReference vp, FSSize minEOF, FSSize maxEOF)
 		(VTOC(vp)->c_fileid == kHFSAttributesFileID)
 	   ) {
 		MarkVCBDirty( vcb );
-		retval = hfs_flushvolumeheader(VCBTOHFS(vcb), MNT_WAIT, HFS_ALTFLUSH);
+		ret = hfs_flushvolumeheader(VCBTOHFS(vcb), MNT_WAIT, HFS_ALTFLUSH);
 	}
+
+	ret = ClearBTNodes(vp, btInfo.nodeSize, filePtr->fcbEOF - actualBytesAdded, actualBytesAdded);
+	if (ret)
+		return (ret);
 
 	return retval;
 }
@@ -300,6 +431,7 @@ OSStatus ExtendBTreeFile(FileReference vp, FSSize minEOF, FSSize maxEOF)
 static int
 ClearBTNodes(struct vnode *vp, long blksize, off_t offset, off_t amount)
 {
+	struct hfsmount *hfsmp = VTOHFS(vp);
 	struct buf *bp = NULL;
 	daddr_t blk;
 	daddr_t blkcnt;
@@ -311,14 +443,36 @@ ClearBTNodes(struct vnode *vp, long blksize, off_t offset, off_t amount)
 		bp = getblk(vp, blk, blksize, 0, 0, BLK_META);
 		if (bp == NULL)
 			continue;
+
+		// XXXdbg
+		if (hfsmp->jnl) {
+			// XXXdbg -- skipping this for now since it makes a transaction
+			//           become *way* too large
+			//journal_modify_block_start(hfsmp->jnl, bp);
+		}
+
 		bzero((char *)bp->b_data, blksize);
 		bp->b_flags |= B_AGE;
 
-		/* wait/yield every 32 blocks so we don't hog all the buffers */
-		if ((blk % 32) == 0)
-			VOP_BWRITE(bp);
-		else
-			bawrite(bp);
+		// XXXdbg
+		if (hfsmp->jnl) {
+			// XXXdbg -- skipping this for now since it makes a transaction
+			//           become *way* too large
+			//journal_modify_block_end(hfsmp->jnl, bp);
+
+			// XXXdbg - remove this once we decide what to do with the
+			//          writes to the journal
+			if ((blk % 32) == 0)
+				VOP_BWRITE(bp);
+			else
+				bawrite(bp);
+		} else {
+			/* wait/yield every 32 blocks so we don't hog all the buffers */
+			if ((blk % 32) == 0)
+				VOP_BWRITE(bp);
+			else
+				bawrite(bp);
+		}
 
 		--blkcnt;
 		++blk;
 	}
diff --git a/bsd/hfs/hfs_catalog.c b/bsd/hfs/hfs_catalog.c
index 7d6999e65..769576d7e 100644
--- a/bsd/hfs/hfs_catalog.c
+++ b/bsd/hfs/hfs_catalog.c
@@ -261,6 +261,11 @@ cat_insertfilethread(struct hfsmount *hfsmp, struct cat_desc *descp)
 	if (result)
 		goto exit;
 
+	// XXXdbg - preflight all btree operations to make sure there's enough space
+	result = BTCheckFreeSpace(fcb);
+	if (result)
+		goto exit;
+
 	BDINIT(file_data, &file_rec);
 	result = BTSearchRecord(fcb, &iterator[0], &file_data, &datasize, &iterator[0]);
 	if (result)
@@ -288,6 +293,7 @@ cat_insertfilethread(struct hfsmount *hfsmp, struct cat_desc *descp)
 		(void) BTFlushPath(fcb);
 	}
 exit:
+	(void) BTFlushPath(fcb);
 	FREE(iterator, M_TEMP);
 
 	return MacToVFSError(result);
@@ -426,6 +432,15 @@ cat_lookupbykey(struct hfsmount *hfsmp, CatalogKey *keyp, u_long hint, int wantr
 	encoding = getencoding(recp);
 	hint = iterator->hint.nodeNum;
 
+	/* Hide the journal files (if any) */
+	if (hfsmp->jnl &&
+	    ((cnid == hfsmp->hfs_jnlfileid) ||
+	     (cnid == hfsmp->hfs_jnlinfoblkid))) {
+
+		result = ENOENT;
+		goto exit;
+	}
+
 	/*
 	 * When a hardlink link is encountered, auto resolve it
 	 */
@@ -529,6 +544,11 @@ cat_create(struct hfsmount *hfsmp, struct cat_desc *descp, struct cat_attr *attr
 		hfs_setencodingbits(hfsmp, encoding);
 	}
 
+	// XXXdbg - preflight all btree operations to make sure there's enough space
+	result = BTCheckFreeSpace(fcb);
+	if (result)
+		goto exit;
+
 	/*
 	 * Insert the thread record first
 	 */
@@ -617,9 +637,8 @@ cat_create(struct hfsmount *hfsmp, struct cat_desc *descp, struct cat_attr *attr
 			vcb->vcbNxtCNID = nextCNID;
 		vcb->vcbFlags |= 0xFF00;
 
-	(void) BTFlushPath(fcb);
-
 exit:
+	(void) BTFlushPath(fcb);
 	FREE(bto, M_TEMP);
 
 	return MacToVFSError(result);
@@ -678,6 +697,11 @@ cat_rename (
 	if ((result = buildkey(hfsmp, to_cdp, (HFSPlusCatalogKey *)&to_iterator->key, 0)))
 		goto exit;
 
+	// XXXdbg - preflight all btree operations to make sure there's enough space
+	result = BTCheckFreeSpace(fcb);
+	if (result)
+		goto exit;
+
 	to_key = (HFSPlusCatalogKey *)&to_iterator->key;
 	MALLOC(recp, CatalogRecord *, sizeof(CatalogRecord), M_TEMP, M_WAITOK);
 	BDINIT(btdata, recp);
@@ -781,7 +805,17 @@ cat_rename (
 		result = BTInsertRecord(fcb, to_iterator, &btdata, datasize);
 		if (result) {
 			/* Try and restore original before leaving */
			// XXXdbg
			#if 1
			{
				int err;
				err = BTInsertRecord(fcb, from_iterator, &btdata, datasize);
				if (err)
					panic("cat_rename: could not undo (BTInsert = %d)", err);
			}
			#else
 			(void) BTInsertRecord(fcb, from_iterator, &btdata, datasize);
			#endif
 			goto exit;
 		}
 		sourcegone = 1;
@@ -794,7 +828,17 @@ cat_rename (
 		result = BTDeleteRecord(fcb, from_iterator);
 		if (result) {
 			/* Try and delete new record before leaving */
			// XXXdbg
			#if 1
			{
				int err;
				err = BTDeleteRecord(fcb, to_iterator);
				if (err)
					panic("cat_rename: could not undo (BTDelete = %d)", err);
			}
			#else
 			(void) BTDeleteRecord(fcb, to_iterator);
			#endif
 			goto exit;
 		}
 	}
@@ -834,8 +878,8 @@ cat_rename (
 			FREE(pluskey, M_TEMP);
 		}
 	}
-	(void) BTFlushPath(fcb);
 exit:
+	(void) BTFlushPath(fcb);
 	if (from_iterator)
 		FREE(from_iterator, M_TEMP);
 	if (to_iterator)
@@ -874,7 +918,6 @@ cat_delete(struct hfsmount *hfsmp, struct cat_desc *descp, struct cat_attr *attr
 	 *	A directory must be empty
 	 *	A file must be zero length (no blocks)
 	 */
-
 	if (descp->cd_cnid < kHFSFirstUserCatalogNodeID ||
 	    descp->cd_parentcnid == kRootParID)
 		return (EINVAL);
@@ -899,6 +942,11 @@ cat_delete(struct hfsmount *hfsmp, struct cat_desc *descp, struct cat_attr *attr
 	if (result)
 		goto exit;
 
+	// XXXdbg - preflight all btree operations to make sure there's enough space
+	result = BTCheckFreeSpace(fcb);
+	if (result)
+		goto exit;
+
 	/* Delete record */
 	result = BTDeleteRecord(fcb, iterator);
 	if (result)
@@ -910,8 +958,8 @@ cat_delete(struct hfsmount *hfsmp, struct cat_desc *descp, struct cat_attr *attr
 
 	TrashCatalogIterator(vcb, descp->cd_parentcnid);
 
-	(void) BTFlushPath(fcb);
 exit:
+	(void) BTFlushPath(fcb);
 	FREE(iterator, M_TEMP);
 
 	return MacToVFSError(result);
@@ -973,9 +1021,8 @@ cat_update(struct hfsmount *hfsmp, struct cat_desc *descp, struct cat_attr *attr
 	/* Update the node hint. */
 	descp->cd_hint = iterator->hint.nodeNum;
 
-	(void) BTFlushPath(fcb);
-
 exit:
+	(void) BTFlushPath(fcb);
 	FREE(iterator, M_TEMP);
 
 	return MacToVFSError(result);
@@ -1242,13 +1289,22 @@ catrec_readattr(const CatalogKey *key, const CatalogRecord *rec,
 		return (0);	/* stop */
 	}
 
-	/* Hide the private meta data directory. */
-	if (parentcnid == kRootDirID &&
-	    rec->recordType == kHFSPlusFolderRecord &&
-	    rec->hfsPlusFolder.folderID == hfsmp->hfs_private_metadata_dir) {
-		return (1);	/* continue */
+	/* Hide the private meta data directory and journal files */
+	if (parentcnid == kRootDirID) {
+		if ((rec->recordType == kHFSPlusFolderRecord) &&
+		    (rec->hfsPlusFolder.folderID == hfsmp->hfs_private_metadata_dir)) {
+			return (1);	/* continue */
+		}
+		if (hfsmp->jnl &&
+		    (rec->recordType == kHFSPlusFileRecord) &&
+		    ((rec->hfsPlusFile.fileID == hfsmp->hfs_jnlfileid) ||
+		     (rec->hfsPlusFile.fileID == hfsmp->hfs_jnlinfoblkid))) {
+
+			return (1);	/* continue */
+		}
 	}
 
+
 	cep = &list->entry[list->realentries++];
 
 	if (state->stdhfs) {
@@ -1408,6 +1464,8 @@ exit:
 struct read_state {
 	u_int32_t	cbs_parentID;
 	u_int32_t	cbs_hiddenDirID;
+	u_int32_t	cbs_hiddenJournalID;
+	u_int32_t	cbs_hiddenInfoBlkID;
 	off_t		cbs_lastoffset;
 	struct uio *	cbs_uio;
 	ExtendedVCB *	cbs_vcb;
@@ -1517,6 +1575,15 @@ lastitem:
 	    catent.d_type == DT_DIR)
 		goto lastitem;
 
+	/* Hide the journal files */
+	if ((curID == kRootDirID) &&
+	    (catent.d_type == DT_REG) &&
+	    ((catent.d_fileno == state->cbs_hiddenJournalID) ||
+	     (catent.d_fileno == state->cbs_hiddenInfoBlkID))) {
+
+		return (1);	/* skip and continue */
+	}
+
 	state->cbs_lastoffset = state->cbs_uio->uio_offset;
 
 	/* if this entry won't fit then we're done */
@@ -1565,6 +1632,11 @@ cat_getdirentries(struct hfsmount *hfsmp, struct cat_desc *descp,
 		goto cleanup;
 
 	state.cbs_hiddenDirID = hfsmp->hfs_private_metadata_dir;
+	if (hfsmp->jnl) {
+		state.cbs_hiddenJournalID = hfsmp->hfs_jnlfileid;
+		state.cbs_hiddenInfoBlkID = hfsmp->hfs_jnlinfoblkid;
+	}
+
 	state.cbs_lastoffset = cip->currentOffset;
 	state.cbs_vcb = vcb;
 	state.cbs_uio = uio;
@@ -2203,7 +2275,11 @@ getcnid(const CatalogRecord *crp)
 	case kHFSPlusFileRecord:
 		cnid = crp->hfsPlusFile.fileID;
 		break;
+	default:
+		panic("hfs: getcnid: unknown recordType (crp @ 0x%x)\n", crp);
+		break;
 	}
+
 	return (cnid);
 }
@@ -2225,7 +2301,11 @@ getparentcnid(const CatalogRecord *recp)
 	case kHFSPlusFolderThreadRecord:
 		cnid = recp->hfsPlusThread.parentID;
 		break;
+	default:
+		panic("hfs: getparentcnid: unknown recordType (crp @ 0x%x)\n", recp);
+		break;
 	}
+
 	return (cnid);
 }
diff --git a/bsd/hfs/hfs_cnode.c b/bsd/hfs/hfs_cnode.c
index d59163ab5..65617595f 100644
--- a/bsd/hfs/hfs_cnode.c
+++ b/bsd/hfs/hfs_cnode.c
@@ -62,6 +62,7 @@ hfs_inactive(ap)
 	int recycle = 0;
 	int forkcount = 0;
 	int truncated = 0;
+	int started_tr = 0, grabbed_lock = 0;
 
 	if (prtactive && vp->v_usecount != 0)
 		vprint("hfs_inactive: pushing active", vp);
@@ -85,9 +86,11 @@ hfs_inactive(ap)
 	    vp->v_type == VREG &&
 	    (VTOF(vp)->ff_blocks != 0)) {
 		error = VOP_TRUNCATE(vp, (off_t)0, IO_NDELAY, NOCRED, p);
-		if (error) goto out;
 		truncated = 1;
+		// have to do this to prevent the lost ubc_info panic
+		SET(cp->c_flag, C_TRANSIT);
 		recycle = 1;
+		if (error) goto out;
 	}
 
 	/*
@@ -103,6 +106,17 @@ hfs_inactive(ap)
 		cp->c_flag &= ~C_DELETED;
 		cp->c_rdev = 0;
 
+		// XXXdbg
+		hfs_global_shared_lock_acquire(hfsmp);
+		grabbed_lock = 1;
+		if (hfsmp->jnl) {
+			if (journal_start_transaction(hfsmp->jnl) != 0) {
+				error = EINVAL;
+				goto out;
+			}
+			started_tr = 1;
+		}
+
 		/* Lock catalog b-tree */
 		error = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_EXCLUSIVE, p);
 		if (error) goto out;
@@ -148,11 +162,21 @@ hfs_inactive(ap)
 		if (HFSTOVCB(hfsmp)->vcbSigWord == kHFSPlusSigWord)
 			cp->c_flag |= C_MODIFIED;
 	}
-	if (cp->c_flag & (C_ACCESS | C_CHANGE | C_MODIFIED | C_UPDATE)) {
-		tv = time;
-		VOP_UPDATE(vp, &tv, &tv, 0);
-	}
+
+	if (cp->c_flag & (C_ACCESS | C_CHANGE | C_MODIFIED | C_UPDATE)) {
+		tv = time;
+		VOP_UPDATE(vp, &tv, &tv, 0);
+	}
 out:
+	// XXXdbg - have to do this because a goto could have come here
+	if (started_tr) {
+		journal_end_transaction(hfsmp->jnl);
+		started_tr = 0;
+	}
+	if (grabbed_lock) {
+		hfs_global_shared_lock_release(hfsmp);
+	}
+
 	VOP_UNLOCK(vp, 0, p);
 	/*
 	 * If we are done with the vnode, reclaim it
@@ -313,6 +337,16 @@ hfs_getcnode(struct hfsmount *hfsmp, cnid_t cnid, struct cat_desc *descp, int wa
 			retval = ENOENT;
 			goto exit;
 		}
+
+		/* Hide private journal files */
+		if (hfsmp->jnl &&
+		    (cp->c_parentcnid == kRootDirID) &&
+		    ((cp->c_cnid == hfsmp->hfs_jnlfileid) ||
+		     (cp->c_cnid == hfsmp->hfs_jnlinfoblkid))) {
+			retval = ENOENT;
+			goto exit;
+		}
+
 		if (wantrsrc && rvp != NULL) {
 			vp = rvp;
 			rvp = NULL;
diff --git a/bsd/hfs/hfs_format.h b/bsd/hfs/hfs_format.h
index ffbef0fb9..a8833dedd 100644
--- a/bsd/hfs/hfs_format.h
+++ b/bsd/hfs/hfs_format.h
@@ -45,9 +45,11 @@ extern "C" {
 enum {
 	kHFSSigWord		= 0x4244,	/* 'BD' in ASCII */
 	kHFSPlusSigWord		= 0x482B,	/* 'H+' in ASCII */
+	kHFSJSigWord		= 0x484a,	/* 'HJ' in ASCII */
 	kHFSPlusVersion		= 0x0004,	/* will change as format changes */
 					/* version 4 shipped with Mac OS 8.1 */
-	kHFSPlusMountVersion	= 0x31302E30	/* '10.0' for Mac OS X */
+	kHFSPlusMountVersion	= 0x31302E30,	/* '10.0' for Mac OS X */
+	kHFSJMountVersion	= 0x4846534a	/* 'HFSJ' for journaled HFS+ on OS X */
 };
 
@@ -452,7 +454,8 @@ enum {
 	kHFSVolumeNoCacheRequiredBit = 10,	/* don't cache volume blocks (i.e. RAM or ROM disk) */
 	kHFSBootVolumeInconsistentBit = 11,	/* boot volume is inconsistent (System 7.6 and later) */
 	kHFSCatalogNodeIDsReusedBit = 12,
-	/* Bits 13-14 are reserved for future use */
+	kHFSVolumeJournaledBit = 13,		/* this volume has a journal on it */
+	/* Bit 14 is reserved for future use */
 	kHFSVolumeSoftwareLockBit	= 15,	/* volume is locked by software */
 
 	kHFSVolumeHardwareLockMask	= 1 << kHFSVolumeHardwareLockBit,
@@ -461,6 +464,7 @@ enum {
 	kHFSVolumeNoCacheRequiredMask = 1 << kHFSVolumeNoCacheRequiredBit,
 	kHFSBootVolumeInconsistentMask = 1 << kHFSBootVolumeInconsistentBit,
 	kHFSCatalogNodeIDsReusedMask = 1 << kHFSCatalogNodeIDsReusedBit,
+	kHFSVolumeJournaledMask = 1 << kHFSVolumeJournaledBit,
 	kHFSVolumeSoftwareLockMask	= 1 << kHFSVolumeSoftwareLockBit,
 	kHFSMDBAttributesMask		= 0x8380
 };
@@ -509,7 +513,8 @@ struct HFSPlusVolumeHeader {
 	u_int16_t 	version;		/* == kHFSPlusVersion */
 	u_int32_t 	attributes;		/* volume attributes */
 	u_int32_t 	lastMountedVersion;	/* implementation version which last mounted volume */
-	u_int32_t 	reserved;		/* reserved - initialized as zero */
+//XXXdbg	u_int32_t 	reserved;	/* reserved - initialized as zero */
+	u_int32_t 	journalInfoBlock;	/* block addr of journal info (if volume is journaled, zero otherwise) */
 
 	u_int32_t 	createDate;		/* date and time of volume creation */
 	u_int32_t 	modifyDate;		/* date and time of last modification */
@@ -601,6 +606,23 @@ enum {
 	kBTVariableIndexKeysMask = 0x00000004	/* keys in index nodes are variable length */
 };
 
+
+/* JournalInfoBlock - Structure that describes where our journal lives */
+struct JournalInfoBlock {
+	u_int32_t	flags;
+	u_int32_t	device_signature[8];	// signature used to locate our device.
+	u_int64_t	offset;			// byte offset to the journal on the device
+	u_int64_t	size;			// size in bytes of the journal
+	u_int32_t	reserved[32];
+};
+typedef struct JournalInfoBlock JournalInfoBlock;
+
+enum {
+	kJIJournalInFSMask          = 0x00000001,
+	kJIJournalOnOtherDeviceMask = 0x00000002,
+	kJIJournalNeedInitMask      = 0x00000004
+};
+
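The JournalInfoBlock above is an on-disk, big-endian structure found at the allocation block named by the volume header's journalInfoBlock field. A sketch of how a mount-time consumer might validate one; journal_info_usable is hypothetical, and the SWAP_BE32/SWAP_BE64 byte-swap macros used elsewhere in this code are assumed:

    static int
    journal_info_usable(JournalInfoBlock *jib, off_t *joffset, off_t *jsize)
    {
        u_int32_t flags = SWAP_BE32(jib->flags);

        if (!(flags & kJIJournalInFSMask)) {
            // journal lives on another device; device_signature[]
            // would have to be matched against candidate devices first
            return 0;
        }
        if (flags & kJIJournalNeedInitMask) {
            // journal space is allocated but has never been written
            return 0;
        }

        *joffset = SWAP_BE64(jib->offset);   // byte offset, not blocks
        *jsize   = SWAP_BE64(jib->size);     // size in bytes
        return 1;
    }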
 
 #pragma options align=reset
 
 #ifdef __cplusplus
diff --git a/bsd/hfs/hfs_link.c b/bsd/hfs/hfs_link.c
index 6a78cd752..97a36516c 100644
--- a/bsd/hfs/hfs_link.c
+++ b/bsd/hfs/hfs_link.c
@@ -72,12 +72,25 @@ createindirectlink(struct hfsmount *hfsmp, u_int32_t linknum,
 	fip->fdCreator = SWAP_BE32 (kHFSPlusCreator);	/* 'hfs+' */
 	fip->fdFlags = SWAP_BE16 (kHasBeenInited);
 
+	hfs_global_shared_lock_acquire(hfsmp);
+	if (hfsmp->jnl) {
+		if (journal_start_transaction(hfsmp->jnl) != 0) {
+			hfs_global_shared_lock_release(hfsmp);
+			return EINVAL;
+		}
+	}
+
 	/* Create the indirect link directly in the catalog */
 	result = cat_create(hfsmp, &desc, &attr, NULL);
 
-	if (linkcnid != NULL)
+	if (result == 0 && linkcnid != NULL)
 		*linkcnid = attr.ca_fileid;
 
+	if (hfsmp->jnl) {
+		journal_end_transaction(hfsmp->jnl);
+	}
+	hfs_global_shared_lock_release(hfsmp);
+
 	return (result);
 }
 
@@ -111,8 +124,9 @@ hfs_makelink(struct hfsmount *hfsmp, struct cnode *cp, struct cnode *dcp,
 
 	/* Lock catalog b-tree */
 	retval = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_EXCLUSIVE, p);
-	if (retval)
-		return retval;
+	if (retval) {
+		return retval;
+	}
 
 	/*
 	 * If this is a new hardlink then we need to create the data
@@ -123,6 +137,7 @@ hfs_makelink(struct hfsmount *hfsmp, struct cnode *cp, struct cnode *dcp,
 		bzero(&to_desc, sizeof(to_desc));
 		to_desc.cd_parentcnid = hfsmp->hfs_privdir_desc.cd_cnid;
 		to_desc.cd_cnid = cp->c_fileid;
+
 		do {
 			/* get a unique indirect node number */
 			indnodeno = ((random() & 0x3fffffff) + 100);
@@ -144,7 +159,17 @@ hfs_makelink(struct hfsmount *hfsmp, struct cnode *cp, struct cnode *dcp,
 				  cp->c_desc.cd_nameptr, &cp->c_desc.cd_cnid);
 		if (retval) {
 			/* put the source file back */
			// XXXdbg
			#if 1
			{
				int err;
				err = cat_rename(hfsmp, &to_desc, &dcp->c_desc, &cp->c_desc, NULL);
				if (err)
					panic("hfs_makelink: error %d from cat_rename backout 1", err);
			}
			#else
 			(void) cat_rename(hfsmp, &to_desc, &dcp->c_desc, &cp->c_desc, NULL);
			#endif
 			goto out;
 		}
 		cp->c_rdev = indnodeno;
@@ -161,7 +186,17 @@ hfs_makelink(struct hfsmount *hfsmp, struct cnode *cp, struct cnode *dcp,
 		(void) cat_delete(hfsmp, &cp->c_desc, &cp->c_attr);
 
 		/* Put the source file back */
		// XXXdbg
		#if 1
		{
			int err;
			err = cat_rename(hfsmp, &to_desc, &dcp->c_desc, &cp->c_desc, NULL);
			if (err)
				panic("hfs_makelink: error %d from cat_rename backout 2", err);
		}
		#else
 		(void) cat_rename(hfsmp, &to_desc, &dcp->c_desc, &cp->c_desc, NULL);
		#endif
 		goto out;
 	}
 
@@ -205,6 +240,7 @@ hfs_link(ap)
 		struct componentname *a_cnp;
 	} */ *ap;
 {
+	struct hfsmount *hfsmp;
 	struct vnode *vp = ap->a_vp;
 	struct vnode *tdvp = ap->a_tdvp;
 	struct componentname *cnp = ap->a_cnp;
@@ -214,6 +250,8 @@ hfs_link(ap)
 	struct timeval tv;
 	int error;
 
+	hfsmp = VTOHFS(vp);
+
 #if HFS_DIAGNOSTIC
 	if ((cnp->cn_flags & HASBUF) == 0)
 		panic("hfs_link: no name");
@@ -226,7 +264,7 @@ hfs_link(ap)
 	if (VTOVCB(tdvp)->vcbSigWord != kHFSPlusSigWord)
 		return err_link(ap);	/* hfs disks don't support hard links */
 
-	if (VTOHFS(vp)->hfs_private_metadata_dir == 0)
+	if (hfsmp->hfs_private_metadata_dir == 0)
 		return err_link(ap);	/* no private metadata dir, no links possible */
 
 	if (tdvp != vp && (error = vn_lock(vp, LK_EXCLUSIVE, p))) {
@@ -252,12 +290,22 @@ hfs_link(ap)
 		goto out1;
 	}
 
+	hfs_global_shared_lock_acquire(hfsmp);
+	if (hfsmp->jnl) {
+		if (journal_start_transaction(hfsmp->jnl) != 0) {
+			hfs_global_shared_lock_release(hfsmp);
+			return EINVAL;
+		}
+	}
+
 	cp->c_nlink++;
 	cp->c_flag |= C_CHANGE;
 	tv = time;
+
 	error = VOP_UPDATE(vp, &tv, &tv, 1);
-	if (!error)
-		error = hfs_makelink(VTOHFS(vp), cp, tdcp, cnp);
+	if (!error) {
+		error = hfs_makelink(hfsmp, cp, tdcp, cnp);
+	}
 	if (error) {
 		cp->c_nlink--;
 		cp->c_flag |= C_CHANGE;
@@ -268,10 +316,21 @@ hfs_link(ap)
 		tdcp->c_flag |= C_CHANGE | C_UPDATE;
 		tv = time;
 		(void) VOP_UPDATE(tdvp, &tv, &tv, 0);
-		hfs_volupdate(VTOHFS(vp), VOL_MKFILE,
+
+		hfs_volupdate(hfsmp, VOL_MKFILE,
 			(tdcp->c_cnid == kHFSRootFolderID));
 	}
+
+	// XXXdbg - need to do this here as well because cp could have changed
+	error = VOP_UPDATE(vp, &tv, &tv, 1);
+
 	FREE_ZONE(cnp->cn_pnbuf, cnp->cn_pnlen, M_NAMEI);
+
+	if (hfsmp->jnl) {
+		journal_end_transaction(hfsmp->jnl);
+	}
+	hfs_global_shared_lock_release(hfsmp);
+
 out1:
 	if (tdvp != vp)
 		VOP_UNLOCK(vp, 0, p);
diff --git a/bsd/hfs/hfs_lookup.c b/bsd/hfs/hfs_lookup.c
index 824f615dc..db88b99c0 100644
--- a/bsd/hfs/hfs_lookup.c
+++ b/bsd/hfs/hfs_lookup.c
@@ -261,8 +261,9 @@ notfound:
 	 * creation of files in the directory.
 	 */
 	retval = VOP_ACCESS(dvp, VWRITE, cred, cnp->cn_proc);
-	if (retval)
+	if (retval) {
 		goto exit;
+	}
 
 	cnp->cn_flags |= SAVENAME;
 	if (!(flags & LOCKPARENT))
diff --git a/bsd/hfs/hfs_mount.h b/bsd/hfs/hfs_mount.h
index 06afe6df8..502926a42 100644
--- a/bsd/hfs/hfs_mount.h
+++ b/bsd/hfs/hfs_mount.h
@@ -52,10 +52,15 @@ struct hfs_mount_args {
 	u_long     hfs_encoding;	/* encoding for this volume (standard HFS only) */
 	struct timezone hfs_timezone;	/* user time zone info (standard HFS only) */
 	int        flags;		/* mounting flags, see below */
+	int        journal_tbuffer_size;  /* size in bytes of the journal transaction buffer */
+	int        journal_flags;         /* flags to pass to journal_open/create */
+	int        journal_disable;       /* don't use journaling (potentially dangerous) */
 };
 
 #define HFSFSMNT_NOXONFILES	0x1	/* disable execute permissions for files */
 #define HFSFSMNT_WRAPPER	0x2	/* mount HFS wrapper (if it exists) */
+#define HFSFSMNT_EXTENDED_ARGS  0x4	/* indicates new fields after "flags" are valid */
+
 #endif /* __APPLE_API_UNSTABLE */
 
 #endif /* ! _HFS_MOUNT_H_ */
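Because the new journal_* fields extend hfs_mount_args past its old end, HFSFSMNT_EXTENDED_ARGS tells the kernel the extra fields are present and valid. A hypothetical user-space sketch of passing them to mount(2); the 8 MB transaction buffer size is an arbitrary example value, not a documented default, and the header install path is assumed:

    #include <sys/mount.h>
    #include <hfs/hfs_mount.h>   /* assumed install location of the header above */

    static int
    mount_hfs_journaled(char *dev, const char *dir)
    {
        struct hfs_mount_args args = { 0 };

        args.fspec = dev;                      /* block device to mount */
        args.flags = HFSFSMNT_EXTENDED_ARGS;   /* journal_* fields below are valid */
        args.journal_tbuffer_size = 8 * 1024 * 1024;
        args.journal_flags = 0;
        args.journal_disable = 0;              /* nonzero skips journaling entirely */

        return mount("hfs", dir, 0, &args);
    }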
diff --git a/bsd/hfs/hfs_readwrite.c b/bsd/hfs/hfs_readwrite.c
index 4544a7685..6f0311411 100644
--- a/bsd/hfs/hfs_readwrite.c
+++ b/bsd/hfs/hfs_readwrite.c
@@ -267,6 +267,8 @@ hfs_write(ap)
 	int retval;
 	off_t filebytes;
 	u_long fileblocks;
+	struct hfsmount *hfsmp;
+	int started_tr = 0, grabbed_lock = 0;
 
 	ioflag = ap->a_ioflag;
 
@@ -288,6 +290,16 @@ hfs_write(ap)
 	if ((cp->c_flags & APPEND) && uio->uio_offset != fp->ff_size)
 		return (EPERM);
 
+	// XXXdbg - don't allow modification of the journal or journal_info_block
+	if (VTOHFS(vp)->jnl && cp->c_datafork) {
+		struct HFSPlusExtentDescriptor *extd;
+
+		extd = &cp->c_datafork->ff_data.cf_extents[0];
+		if (extd->startBlock == VTOVCB(vp)->vcbJinfoBlock || extd->startBlock == VTOHFS(vp)->jnl_start) {
+			return EPERM;
+		}
+	}
+
 	writelimit = uio->uio_offset + uio->uio_resid;
 
 	/*
@@ -333,13 +345,26 @@ hfs_write(ap)
 
 	if(writelimit > filebytes) {
 		bytesToAdd = writelimit - filebytes;
-		retval = hfs_chkdq(cp, (int64_t)(roundup(bytesToAdd, fp->ff_clumpsize)),
+		retval = hfs_chkdq(cp, (int64_t)(roundup(bytesToAdd, vcb->blockSize)),
				   ap->a_cred, 0);
 		if (retval)
 			return (retval);
 	}
 #endif /* QUOTA */
 
+	hfsmp = VTOHFS(vp);
+	if (writelimit > filebytes) {
+		hfs_global_shared_lock_acquire(hfsmp);
+		grabbed_lock = 1;
+	}
+	if (hfsmp->jnl && (writelimit > filebytes)) {
+		if (journal_start_transaction(hfsmp->jnl) != 0) {
+			hfs_global_shared_lock_release(hfsmp);
+			return EINVAL;
+		}
+		started_tr = 1;
+	}
+
 	while (writelimit > filebytes) {
 		bytesToAdd = writelimit - filebytes;
 
@@ -364,6 +389,17 @@ hfs_write(ap)
 			(int)uio->uio_offset, uio->uio_resid, (int)fp->ff_size, (int)filebytes, 0);
 	}
 
+	// XXXdbg
+	if (started_tr) {
+		hfs_flushvolumeheader(hfsmp, MNT_NOWAIT, 0);
+		journal_end_transaction(hfsmp->jnl);
+		started_tr = 0;
+	}
+	if (grabbed_lock) {
+		hfs_global_shared_lock_release(hfsmp);
+		grabbed_lock = 0;
+	}
+
 	if (UBCISVALID(vp) && retval == E_NONE) {
 		off_t filesize;
 		off_t zero_off;
@@ -952,6 +988,7 @@ hfs_cmap(ap)
 	struct proc *p = NULL;
 	struct rl_entry *invalid_range;
 	enum rl_overlaptype overlaptype;
+	int started_tr = 0, grabbed_lock = 0;
 
 	/*
 	 * Check for underlying vnode requests and ensure that logical
@@ -960,12 +997,37 @@ hfs_cmap(ap)
 	if (ap->a_bpn == NULL)
 		return (0);
 
-	if (overflow_extents(fp) || fp->ff_unallocblocks) {
+	p = current_proc();
+	if (fp->ff_unallocblocks) {
 		lockExtBtree = 1;
-		p = current_proc();
+
+		// XXXdbg
+		hfs_global_shared_lock_acquire(hfsmp);
+		grabbed_lock = 1;
+
+		if (hfsmp->jnl) {
+			if (journal_start_transaction(hfsmp->jnl) != 0) {
+				hfs_global_shared_lock_release(hfsmp);
+				return EINVAL;
+			} else {
+				started_tr = 1;
+			}
+		}
+
 		if (retval = hfs_metafilelocking(hfsmp, kHFSExtentsFileID, LK_EXCLUSIVE | LK_CANRECURSE, p)) {
+			if (started_tr) {
+				journal_end_transaction(hfsmp->jnl);
+			}
+			if (grabbed_lock) {
+				hfs_global_shared_lock_release(hfsmp);
+			}
 			return (retval);
-	}
+		}
+	} else if (overflow_extents(fp)) {
+		lockExtBtree = 1;
+		if (retval = hfs_metafilelocking(hfsmp, kHFSExtentsFileID, LK_EXCLUSIVE | LK_CANRECURSE, p)) {
+			return retval;
+		}
 	}
 
 	/*
@@ -1007,9 +1069,16 @@ hfs_cmap(ap)
 		}
 
 		if (retval) {
-			(void) hfs_metafilelocking(hfsmp, kHFSExtentsFileID, LK_RELEASE, p);
-			return (retval);
-		}
+			(void) hfs_metafilelocking(hfsmp, kHFSExtentsFileID, LK_RELEASE, p);
+			if (started_tr) {
+				hfs_flushvolumeheader(hfsmp, MNT_NOWAIT, 0);
+				journal_end_transaction(hfsmp->jnl);
+			}
+			if (grabbed_lock) {
+				hfs_global_shared_lock_release(hfsmp);
+			}
+			return (retval);
+		}
 
 		VTOC(ap->a_vp)->c_flag |= C_MODIFIED;
 	}
@@ -1024,6 +1093,17 @@ hfs_cmap(ap)
 	if (lockExtBtree)
 		(void) hfs_metafilelocking(hfsmp, kHFSExtentsFileID, LK_RELEASE, p);
 
+	// XXXdbg
+	if (started_tr) {
+		hfs_flushvolumeheader(hfsmp, MNT_NOWAIT, 0);
+		journal_end_transaction(hfsmp->jnl);
+		started_tr = 0;
+	}
+	if (grabbed_lock) {
+		hfs_global_shared_lock_release(hfsmp);
+		grabbed_lock = 0;
+	}
+
 	if (retval == E_NONE) {
 		/* Adjust the mapping information for invalid file ranges: */
 		overlaptype = rl_scan(&fp->ff_invalidranges,
@@ -1153,6 +1233,11 @@ hfs_strategy_fragmented(struct buf *bp)
 		}
 		frag->b_vp = NULL;
 
+		//
+		// XXXdbg - in the case that this is a meta-data block, it won't affect
+		//          the journal because this bp is for a physical disk block,
+		//          not a logical block that is part of the catalog or extents
+		//          files.
 		SET(frag->b_flags, B_INVAL);
 		brelse(frag);
 
@@ -1291,6 +1376,7 @@ int hfs_truncate(ap)
 	off_t filebytes;
 	u_long fileblocks;
 	int blksize;
+	struct hfsmount *hfsmp;
 
 	if (vp->v_type != VREG && vp->v_type != VLNK)
 		return (EISDIR);	/* cannot truncate an HFS directory! */
@@ -1309,6 +1395,7 @@ int hfs_truncate(ap)
 	if ((!ISHFSPLUS(VTOVCB(vp))) && (length > (off_t)MAXHFSFILESIZE))
 		return (EFBIG);
 
+	hfsmp = VTOHFS(vp);
 
 	tv = time;
 	retval = E_NONE;
@@ -1329,7 +1416,7 @@ int hfs_truncate(ap)
 	 */
 	if (length > fp->ff_size) {
 #if QUOTA
-		retval = hfs_chkdq(cp, (int64_t)(roundup(length - filebytes, fp->ff_clumpsize)),
+		retval = hfs_chkdq(cp, (int64_t)(roundup(length - filebytes, blksize)),
				ap->a_cred, 0);
 		if (retval)
 			goto Err_Exit;
@@ -1347,10 +1434,25 @@ int hfs_truncate(ap)
 			if (suser(ap->a_cred, NULL) != 0)
 				eflags |= kEFReserveMask;  /* keep a reserve */
 
+			// XXXdbg
+			hfs_global_shared_lock_acquire(hfsmp);
+			if (hfsmp->jnl) {
+				if (journal_start_transaction(hfsmp->jnl) != 0) {
+					retval = EINVAL;
+					goto Err_Exit;
+				}
+			}
+
 			/* lock extents b-tree (also protects volume bitmap) */
 			retval = hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, LK_EXCLUSIVE, ap->a_p);
-			if (retval)
+			if (retval) {
+				if (hfsmp->jnl) {
+					journal_end_transaction(hfsmp->jnl);
+				}
+				hfs_global_shared_lock_release(hfsmp);
+
 				goto Err_Exit;
+			}
 
 			while ((length > filebytes) && (retval == E_NONE)) {
 				bytesToAdd = length - filebytes;
@@ -1368,7 +1470,16 @@ int hfs_truncate(ap)
 					break;
 				}
 			} /* endwhile */
+
 			(void) hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, LK_RELEASE, ap->a_p);
+
+			// XXXdbg
+			if (hfsmp->jnl) {
+				hfs_flushvolumeheader(hfsmp, MNT_NOWAIT, 0);
+				journal_end_transaction(hfsmp->jnl);
+			}
+			hfs_global_shared_lock_release(hfsmp);
+
 			if (retval)
 				goto Err_Exit;
 
@@ -1484,16 +1595,38 @@ int hfs_truncate(ap)
 #if QUOTA
 			off_t savedbytes = ((off_t)fp->ff_blocks * (off_t)blksize);
 #endif /* QUOTA */
+			// XXXdbg
+			hfs_global_shared_lock_acquire(hfsmp);
+			if (hfsmp->jnl) {
+				if (journal_start_transaction(hfsmp->jnl) != 0) {
+					retval = EINVAL;
+					goto Err_Exit;
+				}
+			}
+
 			/* lock extents b-tree (also protects volume bitmap) */
 			retval = hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, LK_EXCLUSIVE, ap->a_p);
-			if (retval)
+			if (retval) {
+				if (hfsmp->jnl) {
+					journal_end_transaction(hfsmp->jnl);
+				}
+				hfs_global_shared_lock_release(hfsmp);
 				goto Err_Exit;
+			}
 
 			if (fp->ff_unallocblocks == 0)
 				retval = MacToVFSError(TruncateFileC(VTOVCB(vp),
						(FCB*)fp, length, false));
 
 			(void) hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, LK_RELEASE, ap->a_p);
+
+			// XXXdbg
+			if (hfsmp->jnl) {
+				hfs_flushvolumeheader(hfsmp, MNT_NOWAIT, 0);
+				journal_end_transaction(hfsmp->jnl);
+			}
+			hfs_global_shared_lock_release(hfsmp);
+
 			filebytes = (off_t)fp->ff_blocks * (off_t)blksize;
 			if (retval)
 				goto Err_Exit;
@@ -1564,6 +1697,9 @@ int hfs_allocate(ap)
 	int retval, retval2;
 	UInt32 blockHint;
 	UInt32 extendFlags =0;   /* For call to ExtendFileC */
+	struct hfsmount *hfsmp;
+
+	hfsmp = VTOHFS(vp);
 
 	*(ap->a_bytesallocated) = 0;
 	fileblocks = fp->ff_blocks;
@@ -1610,15 +1746,31 @@ int hfs_allocate(ap)
 		moreBytesRequested = length - filebytes;
 
 #if QUOTA
-		retval = hfs_chkdq(cp, (int64_t)(roundup(moreBytesRequested, fp->ff_clumpsize)),
+		retval = hfs_chkdq(cp,
+				(int64_t)(roundup(moreBytesRequested, VTOVCB(vp)->blockSize)),
				ap->a_cred, 0);
 		if (retval)
 			return (retval);
 
 #endif /* QUOTA */
+		// XXXdbg
+		hfs_global_shared_lock_acquire(hfsmp);
+		if (hfsmp->jnl) {
+			if (journal_start_transaction(hfsmp->jnl) != 0) {
+				retval = EINVAL;
+				goto Err_Exit;
+			}
+		}
+
 		/* lock extents b-tree (also protects volume bitmap) */
 		retval = hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, LK_EXCLUSIVE, ap->a_p);
-		if (retval) goto Err_Exit;
+		if (retval) {
+			if (hfsmp->jnl) {
+				journal_end_transaction(hfsmp->jnl);
+			}
+			hfs_global_shared_lock_release(hfsmp);
+			goto Err_Exit;
+		}
 
 		retval = MacToVFSError(ExtendFileC(VTOVCB(vp),
						(FCB*)fp,
@@ -1629,8 +1781,16 @@ int hfs_allocate(ap)
 		*(ap->a_bytesallocated) = actualBytesAdded;
 		filebytes = (off_t)fp->ff_blocks * (off_t)VTOVCB(vp)->blockSize;
+
 		(void) hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, LK_RELEASE, ap->a_p);
 
+		// XXXdbg
+		if (hfsmp->jnl) {
+			hfs_flushvolumeheader(hfsmp, MNT_NOWAIT, 0);
+			journal_end_transaction(hfsmp->jnl);
+		}
+		hfs_global_shared_lock_release(hfsmp);
+
 		/*
 		 * if we get an error and no changes were made then exit
 		 * otherwise we must do the VOP_UPDATE to reflect the changes
@@ -1661,9 +1821,25 @@ int hfs_allocate(ap)
 			(void) vinvalbuf(vp, vflags, ap->a_cred, ap->a_p, 0, 0);
 		}
 
+		// XXXdbg
+		hfs_global_shared_lock_acquire(hfsmp);
+		if (hfsmp->jnl) {
+			if (journal_start_transaction(hfsmp->jnl) != 0) {
+				retval = EINVAL;
+				goto Err_Exit;
+			}
+		}
+
 		/* lock extents b-tree (also protects volume bitmap) */
 		retval = hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, LK_EXCLUSIVE, ap->a_p);
-		if (retval) goto Err_Exit;
+		if (retval) {
+			if (hfsmp->jnl) {
+				journal_end_transaction(hfsmp->jnl);
+			}
+			hfs_global_shared_lock_release(hfsmp);
+
+			goto Err_Exit;
+		}
 
 		retval = MacToVFSError(
				TruncateFileC(
@@ -1673,6 +1849,14 @@ int hfs_allocate(ap)
					false));
 		(void) hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, LK_RELEASE, ap->a_p);
 		filebytes = (off_t)fp->ff_blocks * (off_t)VTOVCB(vp)->blockSize;
+
+		if (hfsmp->jnl) {
+			hfs_flushvolumeheader(hfsmp, MNT_NOWAIT, 0);
+			journal_end_transaction(hfsmp->jnl);
+		}
+		hfs_global_shared_lock_release(hfsmp);
+
+
 		/*
 		 * if we get an error and no changes were made then exit
 		 * otherwise we must do the VOP_UPDATE to reflect the changes
@@ -1794,9 +1978,9 @@ hfs_bwrite(ap)
 	} */ *ap;
 {
 	int retval = 0;
-#if BYTE_ORDER == LITTLE_ENDIAN
 	register struct buf *bp = ap->a_bp;
 	register struct vnode *vp = bp->b_vp;
+#if BYTE_ORDER == LITTLE_ENDIAN
 	BlockDescriptor block;
 
 	/* Trap B-Tree writes */
@@ -1820,8 +2004,12 @@ hfs_bwrite(ap)
 	}
 #endif
 	/* This buffer shouldn't be locked anymore but if it is clear it */
-	if (ISSET(ap->a_bp->b_flags, B_LOCKED)) {
-		CLR(ap->a_bp->b_flags, B_LOCKED);
+	if (ISSET(bp->b_flags, B_LOCKED)) {
+		// XXXdbg
+		if (VTOHFS(vp)->jnl) {
+			panic("hfs: CLEARING the lock bit on bp 0x%x\n", bp);
+		}
+		CLR(bp->b_flags, B_LOCKED);
 		printf("hfs_bwrite: called with lock bit set\n");
 	}
 	retval = vn_bwrite (ap);
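The journal-file write guard that hfs_setattrlist() and hfs_write() each open-code above could be captured in one predicate; hfs_is_journal_file is a hypothetical helper shown only to make the check's logic explicit, where cp is the cnode being written and the first data-fork extent identifies the two protected files:

    static int
    hfs_is_journal_file(struct hfsmount *hfsmp, struct cnode *cp)
    {
        struct HFSPlusExtentDescriptor *extd;

        if (hfsmp->jnl == NULL || cp->c_datafork == NULL)
            return 0;

        extd = &cp->c_datafork->ff_data.cf_extents[0];

        // the journal info block and the journal itself must never be
        // modified through the regular file paths
        return (extd->startBlock == HFSTOVCB(hfsmp)->vcbJinfoBlock ||
                extd->startBlock == hfsmp->jnl_start);
    }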
diff --git a/bsd/hfs/hfs_search.c b/bsd/hfs/hfs_search.c
index 0c7638fbe..84aecbb01 100644
--- a/bsd/hfs/hfs_search.c
+++ b/bsd/hfs/hfs_search.c
@@ -193,6 +193,8 @@ hfs_search( ap )
 	CatalogRecord * myCurrentDataPtr;
 	CatPosition * myCatPositionPtr;
 	BTScanState myBTScanState;
+	void *user_start = NULL;
+	int   user_len;
 
 	/* XXX Parameter check a_searchattrs? */
 
@@ -223,6 +225,20 @@ hfs_search( ap )
 	MALLOC( attributesBuffer, void *, eachReturnBufferSize, M_TEMP, M_WAITOK );
 	variableBuffer = (void*)((char*) attributesBuffer + fixedBlockSize);
 
+	// XXXdbg - have to lock the user's buffer so we don't fault
+	// while holding the shared catalog file lock.  see the comment
+	// in hfs_readdir() for more details.
+	//
+	if (VTOHFS(ap->a_vp)->jnl && ap->a_uio->uio_segflg == UIO_USERSPACE) {
+		user_start = ap->a_uio->uio_iov->iov_base;
+		user_len   = ap->a_uio->uio_iov->iov_len;
+
+		if ((err = vslock(user_start, user_len)) != 0) {
+			user_start = NULL;
+			goto ExitThisRoutine;
+		}
+	}
+
 	/* Lock catalog b-tree */
 	err = hfs_metafilelocking(VTOHFS(ap->a_vp), kHFSCatalogFileID, LK_SHARED, p);
 	if (err)
@@ -383,6 +399,10 @@ QuickExit:
 ExitThisRoutine:
         FREE( attributesBuffer, M_TEMP );
 
+	if (VTOHFS(ap->a_vp)->jnl && user_start) {
+		vsunlock(user_start, user_len, TRUE);
+	}
+
 	return (MacToVFSError(err));
 }
 
@@ -858,6 +878,14 @@ InsertMatch( struct vnode *root_vp, struct uio *a_uio, CatalogRecord *rec,
 			goto exit;
 	}
 
+	/* Hide the private journal files */
+	if (VTOHFS(root_vp)->jnl &&
+	    ((c_attr.ca_fileid == VTOHFS(root_vp)->hfs_jnlfileid) ||
+	     (c_attr.ca_fileid == VTOHFS(root_vp)->hfs_jnlinfoblkid))) {
+		err = 0;
+		goto exit;
+	}
+
 	if (returnAttrList->commonattr & ATTR_CMN_NAME) {
 		cat_convertkey(VTOHFS(root_vp), key, rec, &c_desc);
 	} else {
diff --git a/bsd/hfs/hfs_vfsops.c b/bsd/hfs/hfs_vfsops.c
index c92af136d..cff8a45ec 100644
--- a/bsd/hfs/hfs_vfsops.c
+++ b/bsd/hfs/hfs_vfsops.c
@@ -77,6 +77,9 @@
 #include
 #include
 
+// XXXdbg
+#include
+
 #include
 #include
 
@@ -259,6 +262,8 @@ hfs_mount(mp, path, data, ndp, p)
 		    (HFSTOVCB(hfsmp)->vcbSigWord == kHFSPlusSigWord)) {
 			/* setup private/hidden directory for unlinked files */
 			hfsmp->hfs_private_metadata_dir = FindMetaDataDirectory(HFSTOVCB(hfsmp));
+			if (hfsmp->jnl)
+				hfs_remove_orphans(hfsmp);
 		}
 
 		if (args.fspec == 0) {
@@ -325,7 +330,6 @@ hfs_mount(mp, path, data, ndp, p)
 		goto error_exit;
 	}
 
-
 	/* Set the mount flag to indicate that we support volfs  */
 	mp->mnt_flag |= MNT_DOVOLFS;
 	if (VFSTOVCB(mp)->vcbSigWord == kHFSSigWord) {
@@ -333,6 +337,7 @@ hfs_mount(mp, path, data, ndp, p)
 		mp->mnt_flag |= MNT_FIXEDSCRIPTENCODING;
 	}
 	(void) copyinstr(path, mp->mnt_stat.f_mntonname, MNAMELEN-1, &size);
+	bzero(mp->mnt_stat.f_mntonname + size, MNAMELEN - size);
 	(void) copyinstr(args.fspec, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, &size);
 	bzero(mp->mnt_stat.f_mntfromname + size, MNAMELEN - size);
 
@@ -641,6 +646,7 @@ loop:
 	vcb->vcbLsMod	= to_bsd_time(SWAP_BE32(vhp->modifyDate));
 	vcb->vcbAtrb	= (UInt16) SWAP_BE32 (vhp->attributes);	/* VCB only uses lower 16 bits */
+	vcb->vcbJinfoBlock = SWAP_BE32(vhp->journalInfoBlock);
 	vcb->vcbClpSiz	= SWAP_BE32 (vhp->rsrcClumpSize);
 	vcb->vcbNxtCNID	= SWAP_BE32 (vhp->nextCatalogID);
 	vcb->vcbVolBkUp	= to_bsd_time(SWAP_BE32(vhp->backupDate));
@@ -720,6 +726,84 @@ loop:
 }
 
 
+static int
+get_raw_device(char *fspec, int is_user, int ronly, struct vnode **rvp, struct ucred *cred, struct proc *p)
+{
+	char            *rawbuf;
+	char            *dp;
+	size_t           namelen;
+	struct nameidata nd;
+	int              retval;
+
+	*rvp = NULL;
+
+	MALLOC(rawbuf, char *, MAXPATHLEN, M_HFSMNT, M_WAITOK);
+	if (rawbuf == NULL) {
+		retval = ENOMEM;
+		goto error_exit;
+	}
+
+	if (is_user) {
+		retval = copyinstr(fspec, rawbuf, MAXPATHLEN - 1, &namelen);
+		if (retval != E_NONE) {
+			FREE(rawbuf, M_HFSMNT);
+			goto error_exit;
+		}
+	} else {
+		strcpy(rawbuf, fspec);
+		namelen = strlen(rawbuf);
+	}
+
+	/* make sure it's null terminated */
+	rawbuf[MAXPATHLEN-1] = '\0';
+
+	dp = &rawbuf[namelen-1];
+	while(dp >= rawbuf && *dp != '/') {
+		dp--;
+	}
+
+	if (dp != NULL) {
+		dp++;
+	} else {
+		dp = rawbuf;
+	}
+
+	/* make room for and insert the 'r' for the raw device */
+	memmove(dp+1, dp, strlen(dp)+1);
+	*dp = 'r';
+
+	NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, rawbuf, p);
+	retval = namei(&nd);
+	if (retval != E_NONE) {
+		DBG_ERR(("hfs_mountfs: can't open raw device for journal: %s, %x\n", rawbuf, nd.ni_vp->v_rdev));
+		FREE(rawbuf, M_HFSMNT);
+		goto error_exit;
+	}
+
+	*rvp = nd.ni_vp;
+	if ((retval = VOP_OPEN(*rvp, ronly ? FREAD : FREAD|FWRITE, FSCRED, p))) {
+		*rvp = NULL;
+		goto error_exit;
+	}
+
+	// don't need this any more
+	FREE(rawbuf, M_HFSMNT);
+
+	return 0;
+
+  error_exit:
+	if (*rvp) {
+		(void)VOP_CLOSE(*rvp, ronly ? FREAD : FREAD|FWRITE, cred, p);
+	}
+
+	if (rawbuf) {
+		FREE(rawbuf, M_HFSMNT);
+	}
+	return retval;
+}
+
+
+
 /*
  * Common code for mount and mountroot
 */
@@ -741,6 +825,7 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct proc *p,
 	u_int32_t blksize;
 	u_int32_t minblksize;
 	u_int32_t iswritable;
+	daddr_t   mdb_offset;
 
 	dev = devvp->v_rdev;
 	cred = p ? p->p_ucred : NOCRED;
@@ -825,6 +910,7 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct proc *p,
 		return (retval);
 	}
 
+	mdb_offset = HFS_PRI_SECTOR(blksize);
 	if ((retval = meta_bread(devvp, HFS_PRI_SECTOR(blksize), blksize, cred, &bp))) {
 		goto error_exit;
 	}
@@ -837,7 +923,7 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct proc *p,
 	bzero(hfsmp, sizeof(struct hfsmount));
 
 	simple_lock_init(&hfsmp->hfs_renamelock);
-
+
 	/*
 	 * Init the volume information structure
 	 */
@@ -932,6 +1018,7 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct proc *p,
 	} else /* Mount an HFS Plus disk */ {
 		HFSPlusVolumeHeader *vhp;
 		off_t embeddedOffset;
+		int   jnl_disable = 0;
 
 		/* Get the embedded Volume Header */
 		if (SWAP_BE16(mdbp->drEmbedSigWord) == kHFSPlusSigWord) {
@@ -973,8 +1060,8 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct proc *p,
 
 			hfsmp->hfs_phys_block_count = disksize / blksize;
 
-			retval = meta_bread(devvp, (embeddedOffset / blksize) +
-					HFS_PRI_SECTOR(blksize), blksize, cred, &bp);
+			mdb_offset = (embeddedOffset / blksize) + HFS_PRI_SECTOR(blksize);
+			retval = meta_bread(devvp, mdb_offset, blksize, cred, &bp);
 			if (retval)
 				goto error_exit;
 			bcopy(bp->b_data + HFS_PRI_OFFSET(blksize), mdbp, 512);
@@ -987,9 +1074,42 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct proc *p,
 			vhp = (HFSPlusVolumeHeader*) mdbp;
 		}
 
+		// XXXdbg
+		//
+		hfsmp->jnl = NULL;
+		hfsmp->jvp = NULL;
+		if (args != NULL && (args->flags & HFSFSMNT_EXTENDED_ARGS) && args->journal_disable) {
+			jnl_disable = 1;
+		}
+
+		//
+		// We only initialize the journal here if the last person
+		// to mount this volume was journaling aware.  Otherwise
+		// we delay journal initialization until later at the end
+		// of hfs_MountHFSPlusVolume() because the last person who
+		// mounted it could have messed things up behind our back
+		// (so we need to go find the .journal file, make sure it's
+		// the right size, re-sync up if it was moved, etc).
+		//
+		if (   (SWAP_BE32(vhp->lastMountedVersion) == kHFSJMountVersion)
+			&& (SWAP_BE32(vhp->attributes) & kHFSVolumeJournaledMask)
+			&& !jnl_disable) {
+
+			// if we're able to init the journal, mark the mount
+			// point as journaled.
+			//
+			if (hfs_early_journal_init(hfsmp, vhp, args, embeddedOffset, mdb_offset, mdbp, cred) == 0) {
+				mp->mnt_flag |= MNT_JOURNALED;
+			} else {
+				retval = EINVAL;
+				goto error_exit;
+			}
+		}
+		// XXXdbg
+
 		(void) hfs_getconverter(0, &hfsmp->hfs_get_unicode, &hfsmp->hfs_get_hfsname);
 
-		retval = hfs_MountHFSPlusVolume(hfsmp, vhp, embeddedOffset, disksize, p);
+		retval = hfs_MountHFSPlusVolume(hfsmp, vhp, embeddedOffset, disksize, p, args);
 
 		/*
 		 * If the backend didn't like our physical blocksize
 		 * then retry with physical blocksize of 512.
 		 */
@@ -1012,7 +1132,7 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct proc *p,
 			hfsmp->hfs_phys_block_size = blksize;
 
 			/* Try again with a smaller block size... */
-			retval = hfs_MountHFSPlusVolume(hfsmp, vhp, embeddedOffset, disksize, p);
+			retval = hfs_MountHFSPlusVolume(hfsmp, vhp, embeddedOffset, disksize, p, args);
 		}
 		if (retval)
 			(void) hfs_relconverter(0);
@@ -1039,6 +1159,10 @@ error_exit:
 	if (mdbp)
 		FREE(mdbp, M_TEMP);
 	(void)VOP_CLOSE(devvp, ronly ? FREAD : FREAD|FWRITE, cred, p);
+	if (hfsmp && hfsmp->jvp && hfsmp->jvp != hfsmp->hfs_devvp) {
+		(void)VOP_CLOSE(hfsmp->jvp, ronly ? FREAD : FREAD|FWRITE, cred, p);
+		hfsmp->jvp = NULL;
+	}
 	if (hfsmp) {
 		FREE(hfsmp, M_HFSMNT);
 		mp->mnt_data = (qaddr_t)0;
@@ -1075,6 +1199,7 @@ hfs_unmount(mp, mntflags, p)
 	int retval = E_NONE;
 	int flags;
 	int force;
+	int started_tr = 0, grabbed_lock = 0;
 
 	flags = 0;
 	force = 0;
@@ -1090,17 +1215,33 @@ hfs_unmount(mp, mntflags, p)
 	 *	Flush out the b-trees, volume bitmap and Volume Header
 	 */
 	if (hfsmp->hfs_fs_ronly == 0) {
+		hfs_global_shared_lock_acquire(hfsmp);
+		grabbed_lock = 1;
+		if (hfsmp->jnl) {
+			journal_start_transaction(hfsmp->jnl);
+			started_tr = 1;
+		}
+
 		retval = VOP_FSYNC(HFSTOVCB(hfsmp)->catalogRefNum, NOCRED, MNT_WAIT, p);
 		if (retval && !force)
-			return (retval);
-
+			goto err_exit;
+
 		retval = VOP_FSYNC(HFSTOVCB(hfsmp)->extentsRefNum, NOCRED, MNT_WAIT, p);
 		if (retval && !force)
-			return (retval);
+			goto err_exit;
+
+		// if we have an allocation file, sync it too so we don't leave dirty
+		// blocks around
+		if (HFSTOVCB(hfsmp)->allocationsRefNum) {
+			if (retval = VOP_FSYNC(HFSTOVCB(hfsmp)->allocationsRefNum, NOCRED, MNT_WAIT, p)) {
+				if (!force)
+					goto err_exit;
+			}
+		}
 
 		if (retval = VOP_FSYNC(hfsmp->hfs_devvp, NOCRED, MNT_WAIT, p)) {
 			if (!force)
-				return (retval);
+				goto err_exit;
 		}
 
 		/* See if this volume is damaged, if so do not unmount cleanly */
@@ -1110,14 +1251,27 @@ hfs_unmount(mp, mntflags, p)
 			HFSTOVCB(hfsmp)->vcbAtrb |= kHFSVolumeUnmountedMask;
 		}
 
-		retval = hfs_flushvolumeheader(hfsmp, MNT_WAIT, 0);
+		retval = hfs_flushvolumeheader(hfsmp, MNT_WAIT, 1);
 		if (retval) {
 			HFSTOVCB(hfsmp)->vcbAtrb &= ~kHFSVolumeUnmountedMask;
 			if (!force)
-				return (retval);	/* could not flush everything */
+				goto err_exit;	/* could not flush everything */
+		}
+
+		if (hfsmp->jnl) {
+			journal_end_transaction(hfsmp->jnl);
+			started_tr = 0;
+		}
+		if (grabbed_lock) {
+			hfs_global_shared_lock_release(hfsmp);
+			grabbed_lock = 0;
 		}
 	}
 
+	if (hfsmp->jnl) {
+		journal_flush(hfsmp->jnl);
+	}
+
 	/*
 	 *	Invalidate our caches and release metadata vnodes
 	 */
@@ -1126,6 +1280,19 @@ hfs_unmount(mp, mntflags, p)
 	if (HFSTOVCB(hfsmp)->vcbSigWord == kHFSSigWord)
 		(void) hfs_relconverter(hfsmp->hfs_encoding);
 
+	// XXXdbg
+	if (hfsmp->jnl) {
+		journal_close(hfsmp->jnl);
+	}
+
+	if (hfsmp->jvp && hfsmp->jvp != hfsmp->hfs_devvp) {
+		retval = VOP_CLOSE(hfsmp->jvp, hfsmp->hfs_fs_ronly ? FREAD : FREAD|FWRITE,
FREAD : FREAD|FWRITE, + NOCRED, p); + vrele(hfsmp->jvp); + hfsmp->jvp = NULL; + } + // XXXdbg + hfsmp->hfs_devvp->v_specflags &= ~SI_MOUNTEDON; retval = VOP_CLOSE(hfsmp->hfs_devvp, hfsmp->hfs_fs_ronly ? FREAD : FREAD|FWRITE, @@ -1137,6 +1304,15 @@ FREE(hfsmp, M_HFSMNT); mp->mnt_data = (qaddr_t)0; return (0); + + err_exit: + if (hfsmp->jnl && started_tr) { + journal_end_transaction(hfsmp->jnl); + } + if (grabbed_lock) { + hfs_global_shared_lock_release(hfsmp); + } + return retval; } @@ -1241,6 +1417,8 @@ hfs_quotactl(mp, cmds, uid, arg, p) } + + /* * Get file system statistics. */ @@ -1276,6 +1454,70 @@ hfs_statfs(mp, sbp, p) } +// +// XXXdbg -- this is a callback to be used by the journal to +// get meta data blocks flushed out to disk. +// +// XXXdbg -- be smarter and don't flush *every* block on each +// call. try to only flush some so we don't wind up +// being too synchronous. +// +__private_extern__ +void +hfs_sync_metadata(void *arg) +{ + struct mount *mp = (struct mount *)arg; + struct cnode *cp; + struct hfsmount *hfsmp; + ExtendedVCB *vcb; + struct vnode *meta_vp[3]; + struct buf *bp; + int i, sectorsize, priIDSector, altIDSector, retval; + int error, allerror = 0; + + hfsmp = VFSTOHFS(mp); + vcb = HFSTOVCB(hfsmp); + + bflushq(BQ_META, mp); + + +#if 1 // XXXdbg - I do not believe this is necessary... + // but if I pull it out, then the journal + // does not seem to get flushed properly + // when it is closed.... + + // now make sure the super block is flushed + sectorsize = hfsmp->hfs_phys_block_size; + priIDSector = (vcb->hfsPlusIOPosOffset / sectorsize) + + HFS_PRI_SECTOR(sectorsize); + retval = meta_bread(hfsmp->hfs_devvp, priIDSector, sectorsize, NOCRED, &bp); + if (retval != 0) { + panic("hfs: sync_metadata: can't read super-block?! (retval 0x%x, priIDSector %d)\n", + retval, priIDSector); + } + + if (retval == 0 && (bp->b_flags & B_DELWRI) && (bp->b_flags & B_LOCKED) == 0) { + bwrite(bp); + } else if (bp) { + brelse(bp); + } + + // the alternate super block... + // XXXdbg - we probably don't need to do this each and every time. + // hfs_btreeio.c:FlushAlternate() should flag when it was + // written... + altIDSector = (vcb->hfsPlusIOPosOffset / sectorsize) + + HFS_ALT_SECTOR(sectorsize, hfsmp->hfs_phys_block_count); + retval = meta_bread(hfsmp->hfs_devvp, altIDSector, sectorsize, NOCRED, &bp); + if (retval == 0 && (bp->b_flags & B_DELWRI) && (bp->b_flags & B_LOCKED) == 0) { + bwrite(bp); + } else if (bp) { + brelse(bp); + } +#endif + +} + /* * Go through the disk queues to initiate sandbagged IO; * go through the inodes to write those that have been modified; @@ -1310,6 +1552,17 @@ hfs_sync(mp, waitfor, cred, p) panic("update: rofs mod"); }; +#if 0 + // XXXdbg first go through and flush out any modified + // meta data blocks so they go out in order... + bflushq(BQ_META, mp); + bflushq(BQ_LRU, mp); + // only flush locked blocks if we're not doing journaling + if (hfsmp->jnl == NULL) { + bflushq(BQ_LOCKED, mp); + } +#endif + /* * Write back each 'modified' vnode */ @@ -1326,10 +1579,19 @@ loop: simple_unlock(&mntvnode_slock); goto loop; } + simple_lock(&vp->v_interlock); nvp = vp->v_mntvnodes.le_next; + cp = VTOC(vp); + // restart our whole search if this guy is locked + // or being reclaimed.
+ if (cp == NULL || vp->v_flag & (VXLOCK|VORECLAIM)) { + simple_unlock(&vp->v_interlock); + continue; + } + if ((vp->v_flag & VSYSTEM) || (vp->v_type == VNON) || (((cp->c_flag & (C_ACCESS | C_CHANGE | C_MODIFIED | C_UPDATE)) == 0) && (vp->v_dirtyblkhd.lh_first == NULL) && !(vp->v_flag & VHASDIRTY))) { @@ -1372,6 +1634,7 @@ loop: btvp = btvp = meta_vp[i];; if ((btvp==0) || (btvp->v_type == VNON) || (btvp->v_mount != mp)) continue; + simple_lock(&btvp->v_interlock); cp = VTOC(btvp); if (((cp->c_flag & (C_ACCESS | C_CHANGE | C_MODIFIED | C_UPDATE)) == 0) && @@ -1409,11 +1672,22 @@ loop: */ if (IsVCBDirty(vcb)) { + // XXXdbg - debugging, remove + if (hfsmp->jnl) { + //printf("hfs: sync: strange, a journaled volume w/dirty VCB? jnl 0x%x hfsmp 0x%x\n", + // hfsmp->jnl, hfsmp); + } + error = hfs_flushvolumeheader(hfsmp, waitfor, 0); - if (error) - allerror = error; + if (error) + allerror = error; } + if (hfsmp->jnl) { + journal_flush(hfsmp->jnl); + } + + err_exit: return (allerror); } @@ -1534,6 +1808,10 @@ hfs_init(vfsp) } +// XXXdbg +#include + + /* * HFS filesystem related variables. */ @@ -1550,12 +1828,133 @@ hfs_sysctl(name, namelen, oldp, oldlenp, newp, newlen, p) extern u_int32_t hfs_encodingbias; /* all sysctl names at this level are terminal */ - if (namelen != 1) - return (ENOTDIR); /* overloaded */ if (name[0] == HFS_ENCODINGBIAS) return (sysctl_int(oldp, oldlenp, newp, newlen, &hfs_encodingbias)); + else if (name[0] == 0x082969) { + // make the file system journaled... + struct vnode *vp = p->p_fd->fd_cdir, *jvp; + struct hfsmount *hfsmp; + ExtendedVCB *vcb; + int retval; + struct cat_attr jnl_attr, jinfo_attr; + struct cat_fork jnl_fork, jinfo_fork; + void *jnl = NULL; + + hfsmp = VTOHFS(vp); + if (hfsmp->hfs_fs_ronly) { + return EROFS; + } + if (HFSTOVCB(hfsmp)->vcbSigWord == kHFSSigWord) { + printf("hfs: can't make a plain hfs volume journaled.\n"); + return EINVAL; + } + + if (hfsmp->jnl) { + printf("hfs: volume @ mp 0x%x is already journaled!\n", vp->v_mount); + return EAGAIN; + } + + vcb = HFSTOVCB(hfsmp); + if (BTHasContiguousNodes(VTOF(vcb->catalogRefNum)) == 0 || + BTHasContiguousNodes(VTOF(vcb->extentsRefNum)) == 0) { + + printf("hfs: volume has a btree w/non-contiguous nodes. can not enable journaling.\n"); + return EINVAL; + } + + // make sure these both exist! + if ( GetFileInfo(vcb, kRootDirID, ".journal_info_block", &jinfo_attr, &jinfo_fork) == 0 + || GetFileInfo(vcb, kRootDirID, ".journal", &jnl_attr, &jnl_fork) == 0) { + + return EINVAL; + } + + hfs_sync(hfsmp->hfs_mp, MNT_WAIT, FSCRED, p); + bflushq(BQ_META); + + printf("hfs: Initializing the journal (joffset 0x%llx sz 0x%llx)...\n", + (off_t)name[2], (off_t)name[3]); + + jvp = hfsmp->hfs_devvp; + jnl = journal_create(jvp, + (off_t)name[2] * (off_t)HFSTOVCB(hfsmp)->blockSize + + HFSTOVCB(hfsmp)->hfsPlusIOPosOffset, + (off_t)name[3], + hfsmp->hfs_devvp, + hfsmp->hfs_phys_block_size, + 0, + 0, + hfs_sync_metadata, hfsmp->hfs_mp); + + if (jnl == NULL) { + printf("hfs: FAILED to create the journal!\n"); + if (jvp && jvp != hfsmp->hfs_devvp) { + VOP_CLOSE(jvp, hfsmp->hfs_fs_ronly ? 
FREAD : FREAD|FWRITE, FSCRED, p); + } + jvp = NULL; + + return EINVAL; + } + + hfs_global_exclusive_lock_acquire(hfsmp); + + HFSTOVCB(hfsmp)->vcbJinfoBlock = name[1]; + HFSTOVCB(hfsmp)->vcbAtrb |= kHFSVolumeJournaledMask; + hfsmp->jvp = jvp; + hfsmp->jnl = jnl; + + // save this off for the hack-y check in hfs_remove() + hfsmp->jnl_start = (u_int32_t)name[2]; + hfsmp->hfs_jnlinfoblkid = jinfo_attr.ca_fileid; + hfsmp->hfs_jnlfileid = jnl_attr.ca_fileid; + + hfsmp->hfs_mp->mnt_flag |= MNT_JOURNALED; + + hfs_global_exclusive_lock_release(hfsmp); + hfs_flushvolumeheader(hfsmp, MNT_WAIT, 1); + + return 0; + } else if (name[0] == 0x031272) { + // clear the journaling bit + struct vnode *vp = p->p_fd->fd_cdir; + struct hfsmount *hfsmp; + void *jnl; + int retval; + + hfsmp = VTOHFS(vp); + if (hfsmp->jnl == NULL) { + return EINVAL; + } + + printf("hfs: disabling journaling for mount @ 0x%x\n", vp->v_mount); + + jnl = hfsmp->jnl; + + hfs_global_exclusive_lock_acquire(hfsmp); + + // Lights out for you buddy! + hfsmp->jnl = NULL; + journal_close(jnl); + + if (hfsmp->jvp && hfsmp->jvp != hfsmp->hfs_devvp) { + VOP_CLOSE(hfsmp->jvp, hfsmp->hfs_fs_ronly ? FREAD : FREAD|FWRITE, FSCRED, p); + } + hfsmp->jnl = NULL; + hfsmp->jvp = NULL; + hfsmp->hfs_mp->mnt_flag &= ~MNT_JOURNALED; + hfsmp->jnl_start = 0; + hfsmp->hfs_jnlinfoblkid = 0; + hfsmp->hfs_jnlfileid = 0; + + HFSTOVCB(hfsmp)->vcbAtrb &= ~kHFSVolumeJournaledMask; + + hfs_global_exclusive_lock_release(hfsmp); + hfs_flushvolumeheader(hfsmp, MNT_WAIT, 1); + + return 0; + } return (EOPNOTSUPP); } @@ -1688,6 +2087,11 @@ hfs_volupdate(struct hfsmount *hfsmp, enum volop op, int inroot) --vcb->vcbNmFls; break; } + + if (hfsmp->jnl) { + hfs_flushvolumeheader(hfsmp, 0, 0); + } + return (0); } @@ -1704,7 +2108,6 @@ hfs_flushMDB(struct hfsmount *hfsmp, int waitfor, int altflush) ByteCount namelen; sectorsize = hfsmp->hfs_phys_block_size; - retval = bread(hfsmp->hfs_devvp, HFS_PRI_SECTOR(sectorsize), sectorsize, NOCRED, &bp); if (retval) { if (bp) @@ -1716,6 +2119,10 @@ hfs_flushMDB(struct hfsmount *hfsmp, int waitfor, int altflush) DBG_ASSERT(bp->b_data != NULL); DBG_ASSERT(bp->b_bcount == size); + if (hfsmp->jnl) { + panic("hfs: standard hfs volumes should not be journaled!\n"); + } + mdb = (HFSMasterDirectoryBlock *)(bp->b_data + HFS_PRI_OFFSET(sectorsize)); mdb->drCrDate = SWAP_BE32 (UTCToLocal(to_hfs_time(vcb->vcbCrDate))); @@ -1770,6 +2177,7 @@ hfs_flushMDB(struct hfsmount *hfsmp, int waitfor, int altflush) if (meta_bread(hfsmp->hfs_devvp, altIDSector, sectorsize, NOCRED, &alt_bp) == 0) { bcopy(mdb, alt_bp->b_data + HFS_ALT_OFFSET(sectorsize), kMDBSize); + (void) VOP_BWRITE(alt_bp); } else if (alt_bp) brelse(alt_bp); @@ -1777,7 +2185,7 @@ hfs_flushMDB(struct hfsmount *hfsmp, int waitfor, int altflush) if (waitfor != MNT_WAIT) bawrite(bp); - else + else retval = VOP_BWRITE(bp); MarkVCBClean( vcb ); @@ -1809,13 +2217,32 @@ hfs_flushvolumeheader(struct hfsmount *hfsmp, int waitfor, int altflush) priIDSector = (vcb->hfsPlusIOPosOffset / sectorsize) + HFS_PRI_SECTOR(sectorsize); + // XXXdbg + hfs_global_shared_lock_acquire(hfsmp); + if (hfsmp->jnl) { + if (journal_start_transaction(hfsmp->jnl) != 0) { + hfs_global_shared_lock_release(hfsmp); + return EINVAL; + } + } + retval = meta_bread(hfsmp->hfs_devvp, priIDSector, sectorsize, NOCRED, &bp); if (retval) { if (bp) brelse(bp); + + if (hfsmp->jnl) { + journal_end_transaction(hfsmp->jnl); + } + hfs_global_shared_lock_release(hfsmp); + return (retval); } + if (hfsmp->jnl) { + journal_modify_block_start(hfsmp->jnl, 
bp); + } + volumeHeader = (HFSPlusVolumeHeader *)((char *)bp->b_data + HFS_PRI_OFFSET(sectorsize)); /* @@ -1839,9 +2266,19 @@ hfs_flushvolumeheader(struct hfsmount *hfsmp, int waitfor, int altflush) if ( SWAP_BE32 (mdb->drCrDate) != vcb->localCreateDate ) { + // XXXdbg + if (hfsmp->jnl) { + journal_modify_block_start(hfsmp->jnl, bp2); + } + mdb->drCrDate = SWAP_BE32 (vcb->localCreateDate); /* pick up the new create date */ - (void) VOP_BWRITE(bp2); /* write out the changes */ + // XXXdbg + if (hfsmp->jnl) { + journal_modify_block_end(hfsmp->jnl, bp2); + } else { + (void) VOP_BWRITE(bp2); /* write out the changes */ + } } else { @@ -1850,9 +2287,36 @@ hfs_flushvolumeheader(struct hfsmount *hfsmp, int waitfor, int altflush) } } +// XXXdbg - only monkey around with the volume signature on non-root volumes +// +#if 0 + if (hfsmp->jnl && + hfsmp->hfs_fs_ronly == 0 && + (HFSTOVFS(hfsmp)->mnt_flag & MNT_ROOTFS) == 0) { + + int old_sig = volumeHeader->signature; + + if (vcb->vcbAtrb & kHFSVolumeUnmountedMask) { + volumeHeader->signature = kHFSPlusSigWord; + } else { + volumeHeader->signature = kHFSJSigWord; + } + + if (old_sig != volumeHeader->signature) { + altflush = 1; + } + } +#endif +// XXXdbg + /* Note: only update the lower 16 bits worth of attributes */ volumeHeader->attributes = SWAP_BE32 ((SWAP_BE32 (volumeHeader->attributes) & 0xFFFF0000) + (UInt16) vcb->vcbAtrb); - volumeHeader->lastMountedVersion = SWAP_BE32 (kHFSPlusMountVersion); + volumeHeader->journalInfoBlock = SWAP_BE32(vcb->vcbJinfoBlock); + if (hfsmp->jnl) { + volumeHeader->lastMountedVersion = SWAP_BE32 (kHFSJMountVersion); + } else { + volumeHeader->lastMountedVersion = SWAP_BE32 (kHFSPlusMountVersion); + } volumeHeader->createDate = SWAP_BE32 (vcb->localCreateDate); /* volume create date is in local time */ volumeHeader->modifyDate = SWAP_BE32 (to_hfs_time(vcb->vcbLsMod)); volumeHeader->backupDate = SWAP_BE32 (to_hfs_time(vcb->vcbVolBkUp)); @@ -1918,22 +2382,38 @@ hfs_flushvolumeheader(struct hfsmount *hfsmp, int waitfor, int altflush) HFS_ALT_SECTOR(sectorsize, hfsmp->hfs_phys_block_count); if (meta_bread(hfsmp->hfs_devvp, altIDSector, sectorsize, NOCRED, &alt_bp) == 0) { + if (hfsmp->jnl) { + journal_modify_block_start(hfsmp->jnl, alt_bp); + } + bcopy(volumeHeader, alt_bp->b_data + HFS_ALT_OFFSET(sectorsize), kMDBSize); - (void) VOP_BWRITE(alt_bp); + + if (hfsmp->jnl) { + journal_modify_block_end(hfsmp->jnl, alt_bp); + } else { + (void) VOP_BWRITE(alt_bp); + } } else if (alt_bp) brelse(alt_bp); } - if (waitfor != MNT_WAIT) - bawrite(bp); - else { - retval = VOP_BWRITE(bp); - /* When critical data changes, flush the device cache */ - if (critical && (retval == 0)) { + // XXXdbg + if (hfsmp->jnl) { + journal_modify_block_end(hfsmp->jnl, bp); + journal_end_transaction(hfsmp->jnl); + } else { + if (waitfor != MNT_WAIT) + bawrite(bp); + else { + retval = VOP_BWRITE(bp); + /* When critical data changes, flush the device cache */ + if (critical && (retval == 0)) { (void) VOP_IOCTL(hfsmp->hfs_devvp, DKIOCSYNCHRONIZECACHE, - NULL, FWRITE, NOCRED, current_proc()); + NULL, FWRITE, NOCRED, current_proc()); + } } } + hfs_global_shared_lock_release(hfsmp); vcb->vcbFlags &= 0x00FF; return (retval); diff --git a/bsd/hfs/hfs_vfsutils.c b/bsd/hfs/hfs_vfsutils.c index c45f8a898..386acae02 100644 --- a/bsd/hfs/hfs_vfsutils.c +++ b/bsd/hfs/hfs_vfsutils.c @@ -55,6 +55,7 @@ extern uid_t console_user; static void ReleaseMetaFileVNode(struct vnode *vp); +static int hfs_late_journal_init(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, void 
*_args); u_int32_t GetLogicalBlockSize(struct vnode *vp); @@ -246,7 +247,7 @@ CmdDone: //******************************************************************************* OSErr hfs_MountHFSPlusVolume(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, - off_t embeddedOffset, u_int64_t disksize, struct proc *p) + off_t embeddedOffset, u_int64_t disksize, struct proc *p, void *args) { register ExtendedVCB *vcb; struct cat_desc cndesc; @@ -254,9 +255,15 @@ OSErr hfs_MountHFSPlusVolume(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, UInt32 blockSize; OSErr retval; - if (SWAP_BE16(vhp->signature) != kHFSPlusSigWord || - SWAP_BE16(vhp->version) != kHFSPlusVersion) - return (EINVAL); + // XXXdbg - added the kHFSJSigWord case + if ((SWAP_BE16(vhp->signature) != kHFSPlusSigWord && + SWAP_BE16(vhp->signature) != kHFSJSigWord) || + SWAP_BE16(vhp->version) != kHFSPlusVersion) { + // XXXdbg + printf("hfs: mount: sig 0x%x and version 0x%x are not HFS or HFS+.\n", + vhp->signature, vhp->version); + return (EINVAL); + } /* Block size must be at least 512 and a power of 2 */ blockSize = SWAP_BE32(vhp->blockSize); @@ -264,7 +271,7 @@ OSErr hfs_MountHFSPlusVolume(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, return (EINVAL); /* don't mount a writable volume if its dirty, it must be cleaned by fsck_hfs */ - if (hfsmp->hfs_fs_ronly == 0 && (SWAP_BE32(vhp->attributes) & kHFSVolumeUnmountedMask) == 0) + if (hfsmp->hfs_fs_ronly == 0 && hfsmp->jnl == NULL && (SWAP_BE32(vhp->attributes) & kHFSVolumeUnmountedMask) == 0) return (EINVAL); /* Make sure we can live with the physical block size. */ @@ -280,6 +287,13 @@ OSErr hfs_MountHFSPlusVolume(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, vcb = HFSTOVCB(hfsmp); vcb->vcbSigWord = SWAP_BE16(vhp->signature); + + // XXXdbg - remap this in case we've mounted a dirty journaled volume + if (vcb->vcbSigWord == kHFSJSigWord) { + vcb->vcbSigWord = kHFSPlusSigWord; + } + + vcb->vcbJinfoBlock = SWAP_BE32(vhp->journalInfoBlock); vcb->vcbLsMod = to_bsd_time(SWAP_BE32(vhp->modifyDate)); vcb->vcbAtrb = (UInt16)SWAP_BE32(vhp->attributes); vcb->vcbClpSiz = SWAP_BE32(vhp->rsrcClumpSize); @@ -413,6 +427,9 @@ OSErr hfs_MountHFSPlusVolume(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, /* mark the volume dirty (clear clean unmount bit) */ vcb->vcbAtrb &= ~kHFSVolumeUnmountedMask; + if (hfsmp->jnl && hfsmp->hfs_fs_ronly == 0) { + hfs_flushvolumeheader(hfsmp, TRUE, TRUE); + } /* * all done with metadata files so we can unlock now... @@ -423,12 +440,46 @@ OSErr hfs_MountHFSPlusVolume(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, /* setup private/hidden directory for unlinked files */ hfsmp->hfs_private_metadata_dir = FindMetaDataDirectory(vcb); + if (hfsmp->jnl && (hfsmp->hfs_fs_ronly == 0)) + hfs_remove_orphans(hfsmp); if ( !(vcb->vcbAtrb & kHFSVolumeHardwareLockMask) ) // if the disk is not write protected { MarkVCBDirty( vcb ); // mark VCB dirty so it will be written } + + // + // Check if we need to do late journal initialization. This only + // happens if a previous version of MacOS X (or 9) touched the disk. + // In that case hfs_late_journal_init() will go re-locate the journal + // and journal_info_block files and validate that they're still kosher. 
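The comment above is the heart of the mount-time policy: the on-disk journal info is trusted as-is only when the previous mounter spoke the same journaling dialect. Restated as a decision table (journal_init_phase is a hypothetical helper; the two constant values are quoted from the HFS+ headers from memory, so treat them as assumptions):

    #include <stdint.h>

    #define kHFSVolumeJournaledMask 0x00002000u  /* assumed: bit 13 of attributes */
    #define kHFSJMountVersion       0x4846534Au  /* assumed: 'HFSJ' */

    enum jnl_when { JNL_NONE, JNL_EARLY, JNL_LATE };

    /* Which init path would hfs_mountfs()/hfs_MountHFSPlusVolume() take? */
    static enum jnl_when
    journal_init_phase(uint32_t attributes, uint32_t lastMountedVersion,
                       int jnl_disable)
    {
        if (jnl_disable || !(attributes & kHFSVolumeJournaledMask))
            return JNL_NONE;     /* not journaled, or disabled via mount args */
        if (lastMountedVersion == kHFSJMountVersion)
            return JNL_EARLY;    /* last mounter was journaling-aware */
        return JNL_LATE;         /* re-locate and re-validate the .journal
                                    files before opening the journal */
    }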
+ // + if ( (vcb->vcbAtrb & kHFSVolumeJournaledMask) + && (SWAP_BE32(vhp->lastMountedVersion) != kHFSJMountVersion) + && (hfsmp->jnl == NULL)) { + + retval = hfs_late_journal_init(hfsmp, vhp, args); + if (retval != 0) { + hfsmp->jnl = NULL; + goto ErrorExit; + } else if (hfsmp->jnl) { + hfsmp->hfs_mp->mnt_flag |= MNT_JOURNALED; + } + } else if (hfsmp->jnl) { + struct cat_attr jinfo_attr, jnl_attr; + + // if we're here we need to fill in the fileid's for the + // journal and journal_info_block. + hfsmp->hfs_jnlinfoblkid = GetFileInfo(vcb, kRootDirID, ".journal_info_block", &jinfo_attr, NULL); + hfsmp->hfs_jnlfileid = GetFileInfo(vcb, kRootDirID, ".journal", &jnl_attr, NULL); + if (hfsmp->hfs_jnlinfoblkid == 0 || hfsmp->hfs_jnlfileid == 0) { + printf("hfs: danger! couldn't find the file-id's for the journal or journal_info_block\n"); + printf("hfs: jnlfileid %d, jnlinfoblkid %d\n", hfsmp->hfs_jnlfileid, hfsmp->hfs_jnlinfoblkid); + } + } + + return (0); ErrorExit: @@ -759,13 +810,28 @@ FindMetaDataDirectory(ExtendedVCB *vcb) fndrinfo->frLocation.h = SWAP_BE16 (22460); fndrinfo->frFlags |= SWAP_BE16 (kIsInvisible + kNameLocked); + // XXXdbg + hfs_global_shared_lock_acquire(hfsmp); + if (hfsmp->jnl) { + if ((error = journal_start_transaction(hfsmp->jnl)) != 0) { + hfs_global_shared_lock_release(hfsmp); + return (0); + } + } + error = cat_create(hfsmp, &hfsmp->hfs_privdir_desc, &hfsmp->hfs_privdir_attr, &out_desc); /* Unlock catalog b-tree */ (void) hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_RELEASE, current_proc()); - if (error) - return (0); + if (error) { + if (hfsmp->jnl) { + journal_end_transaction(hfsmp->jnl); + } + hfs_global_shared_lock_release(hfsmp); + + return (0); + } hfsmp->hfs_privdir_desc.cd_hint = out_desc.cd_hint; hfsmp->hfs_privdir_desc.cd_cnid = out_desc.cd_cnid; @@ -783,11 +849,209 @@ FindMetaDataDirectory(ExtendedVCB *vcb) vput(dvp); } hfs_volupdate(hfsmp, VOL_MKDIR, 1); + if (hfsmp->jnl) { + journal_end_transaction(hfsmp->jnl); + } + hfs_global_shared_lock_release(hfsmp); + cat_releasedesc(&out_desc); return (out_desc.cd_cnid); } +__private_extern__ +u_long +GetFileInfo(ExtendedVCB *vcb, u_int32_t dirid, char *name, + struct cat_attr *fattr, struct cat_fork *forkinfo) +{ + struct hfsmount * hfsmp; + struct vnode * dvp = NULL; + struct cnode * dcp = NULL; + struct FndrDirInfo * fndrinfo; + struct cat_desc jdesc; + struct timeval tv; + int error; + + if (vcb->vcbSigWord != kHFSPlusSigWord) + return (0); + + hfsmp = VCBTOHFS(vcb); + + memset(&jdesc, 0, sizeof(struct cat_desc)); + jdesc.cd_parentcnid = kRootDirID; + jdesc.cd_nameptr = name; + jdesc.cd_namelen = strlen(name); + + /* Lock catalog b-tree */ + error = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_EXCLUSIVE, current_proc()); + if (error) + return (0); + + error = cat_lookup(hfsmp, &jdesc, 0, NULL, fattr, forkinfo); + + (void) hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_RELEASE, current_proc()); + + if (error == 0) { + return (fattr->ca_fileid); + } else if (hfsmp->hfs_fs_ronly) { + return (0); + } +} + + +/* + * On Journaled HFS, there can be orphaned files. These + * are files that were unlinked while busy. If the volume + * was not cleanly unmounted then some of these files may + * have persisted and need to be removed. 
+ */ +__private_extern__ +void +hfs_remove_orphans(struct hfsmount * hfsmp) +{ + struct BTreeIterator * iterator = NULL; + struct FSBufferDescriptor btdata; + struct HFSPlusCatalogFile filerec; + struct HFSPlusCatalogKey * keyp; + FCB *fcb; + ExtendedVCB *vcb; + char filename[32]; + char tempname[32]; + size_t namelen; + int catlock = 0; + int result, started_tr = 0; + + if (hfsmp->hfs_orphans_cleaned) + return; + + vcb = HFSTOVCB(hfsmp); + fcb = VTOF(vcb->catalogRefNum); + + btdata.bufferAddress = &filerec; + btdata.itemSize = sizeof(filerec); + btdata.itemCount = 1; + + MALLOC(iterator, struct BTreeIterator *, sizeof(*iterator), M_TEMP, M_WAITOK); + bzero(iterator, sizeof(*iterator)); + keyp = (HFSPlusCatalogKey*)&iterator->key; + keyp->parentID = hfsmp->hfs_private_metadata_dir; + + // XXXdbg + hfs_global_shared_lock_acquire(hfsmp); + if (hfsmp->jnl) { + if (journal_start_transaction(hfsmp->jnl) != 0) { + hfs_global_shared_lock_release(hfsmp); + return; + } + started_tr = 1; + } + + /* Lock catalog b-tree */ + result = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_EXCLUSIVE, current_proc()); + if (result) + goto exit; + catlock = 1; + + /* + * Position the iterator at the folder thread record. + * (i.e. one record before first child) + */ + result = BTSearchRecord(fcb, iterator, NULL, NULL, iterator); + if (result) + goto exit; + + /* Visit all the children in the HFS+ private directory. */ + for (;;) { + result = BTIterateRecord(fcb, kBTreeNextRecord, iterator, &btdata, NULL); + if (result) + break; + if (keyp->parentID != hfsmp->hfs_private_metadata_dir) + break; + if (filerec.recordType != kHFSPlusFileRecord) + continue; + + (void) utf8_encodestr(keyp->nodeName.unicode, keyp->nodeName.length * 2, + filename, &namelen, sizeof(filename), 0, 0); + + (void) sprintf(tempname, "%s%d", HFS_DELETE_PREFIX, filerec.fileID); + + /* + * Delete all files named "tempxxx", where + * xxx is the file's cnid in decimal. + * + * Delete all files named "iNodexxx", that + * have a link count of zero. 
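The sprintf() above, and hfs_remove() later in this diff, imply a pair of name-building macros for files parked in the private metadata directory. Their real definitions live in the HFS headers and are not part of this patch; a plausible reconstruction from the usage (treat the prefixes as assumptions):

    #include <stdio.h>

    #define HFS_DELETE_PREFIX "temp"    /* assumed, per the sprintf() above */
    #define HFS_INODE_PREFIX  "iNode"   /* assumed, per the comment above   */

    #define MAKE_DELETED_NAME(name, fid) \
            (void) sprintf((name), "%s%d", HFS_DELETE_PREFIX, (fid))
    #define MAKE_INODE_NAME(name, linkref) \
            (void) sprintf((name), "%s%d", HFS_INODE_PREFIX, (linkref))

    int main(void)
    {
        char delname[32], inodename[32];
        MAKE_DELETED_NAME(delname, 1234);    /* "temp1234"  */
        MAKE_INODE_NAME(inodename, 1234);    /* "iNode1234" */
        printf("%s %s\n", delname, inodename);
        return 0;
    }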
+ */ + if (bcmp(tempname, filename, namelen) == 0) { + struct filefork fork = {0}; + struct cnode cnode = {0}; + + // XXXdebug + //printf("hfs_remove_orphans: removing %s\n", filename); + + /* Build a fake cnode */ + cnode.c_desc.cd_parentcnid = hfsmp->hfs_private_metadata_dir; + cnode.c_desc.cd_nameptr = filename; + cnode.c_desc.cd_namelen = namelen; + cnode.c_desc.cd_cnid = filerec.fileID; + cnode.c_attr.ca_fileid = filerec.fileID; + cnode.c_blocks = filerec.dataFork.totalBlocks + + filerec.resourceFork.totalBlocks; + + /* Position iterator at previous entry */ + if (BTIterateRecord(fcb, kBTreePrevRecord, iterator, + NULL, NULL) != 0) + break; + + /* Truncate the file to zero (both forks) */ + if (filerec.dataFork.totalBlocks > 0) { + fork.ff_cp = &cnode; + cnode.c_datafork = &fork; + bcopy(&filerec.dataFork, &fork.ff_data, sizeof(struct cat_fork)); + if (TruncateFileC(vcb, (FCB*)&fork, 0, false) != 0) { + printf("error truncating data fork!\n"); + break; + } + } + if (filerec.resourceFork.totalBlocks > 0) { + fork.ff_cp = &cnode; + cnode.c_datafork = NULL; + cnode.c_rsrcfork = &fork; + bcopy(&filerec.resourceFork, &fork.ff_data, sizeof(struct cat_fork)); + if (TruncateFileC(vcb, (FCB*)&fork, 0, false) != 0) { + printf("error truncating rsrc fork!\n"); + break; + } + } + + /* Remove the file record from the Catalog */ + if (cat_delete(hfsmp, &cnode.c_desc, &cnode.c_attr) != 0) { + printf("error deleting cat rec!\n"); + break; + } + + /* Update parent and volume counts */ + hfsmp->hfs_privdir_attr.ca_entries--; + (void)cat_update(hfsmp, &hfsmp->hfs_privdir_desc, + &hfsmp->hfs_privdir_attr, NULL, NULL); + hfs_volupdate(hfsmp, VOL_RMFILE, 0); + } + } + +exit: + /* Unlock catalog b-tree */ + if (catlock) + (void) hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_RELEASE, current_proc()); + + if (started_tr) { + journal_end_transaction(hfsmp->jnl); + } + hfs_global_shared_lock_release(hfsmp); + + FREE(iterator, M_TEMP); + hfsmp->hfs_orphans_cleaned = 1; +} + /* * This will return the correct logical block size for a given vnode.
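Worth calling out in the loop above: the orphans are destroyed without ever instantiating vnodes; a cnode and filefork are faked up on the stack with just enough fields filled in for TruncateFileC() and cat_delete() to do their jobs. The bare pattern, with reduced stand-in types (nothing below is real kernel code):

    #include <stdint.h>
    #include <string.h>

    struct fake_fork  { struct fake_cnode *ff_cp; uint32_t total_blocks; };
    struct fake_cnode { struct fake_fork *c_datafork, *c_rsrcfork; uint32_t c_fileid; };

    /* Stand-in for TruncateFileC(): releases the fork's blocks. */
    static int truncate_fork(struct fake_fork *f) { f->total_blocks = 0; return 0; }

    /* Truncate both forks of an orphan using only stack state. */
    static int zap_orphan(uint32_t fileid, uint32_t datablks, uint32_t rsrcblks)
    {
        struct fake_cnode cnode; struct fake_fork fork;

        memset(&cnode, 0, sizeof(cnode)); memset(&fork, 0, sizeof(fork));
        cnode.c_fileid = fileid;
        if (datablks) {
            fork.ff_cp = &cnode; fork.total_blocks = datablks;
            cnode.c_datafork = &fork;          /* fork and cnode point at each other */
            if (truncate_fork(&fork)) return -1;
        }
        if (rsrcblks) {
            fork.ff_cp = &cnode; fork.total_blocks = rsrcblks;
            cnode.c_datafork = NULL; cnode.c_rsrcfork = &fork;  /* reuse the fork */
            if (truncate_fork(&fork)) return -1;
        }
        return 0;   /* the catalog record is then deleted, as above */
    }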
@@ -860,12 +1124,14 @@ short MacToVFSError(OSErr err) switch (err) { case dskFulErr: /* -34 */ - case btNoSpaceAvail: /* -32733 */ + return ENOSPC; + case btNoSpaceAvail: /* -32733 */ + return EFBIG; case fxOvFlErr: /* -32750 */ - return ENOSPC; /* +28 */ + return EOVERFLOW; case btBadNode: /* -32731 */ - return EIO; /* +5 */ + return EBADF; case memFullErr: /* -108 */ return ENOMEM; /* +12 */ @@ -885,7 +1151,7 @@ short MacToVFSError(OSErr err) return EISDIR; /* 21 */ case fxRangeErr: /* -32751 */ - return EIO; /* 5 */ + return ERANGE; case bdNamErr: /* -37 */ return ENAMETOOLONG; /* 63 */ @@ -995,4 +1261,299 @@ hfs_relnamehints(struct cnode *dcp) } +__private_extern__ +int +hfs_early_journal_init(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, + void *_args, int embeddedOffset, int mdb_offset, + HFSMasterDirectoryBlock *mdbp, struct ucred *cred) +{ + JournalInfoBlock *jibp; + struct buf *jinfo_bp, *bp; + int sectors_per_fsblock, arg_flags=0, arg_tbufsz=0; + int retval, blksize = hfsmp->hfs_phys_block_size; + struct vnode *devvp; + struct hfs_mount_args *args = _args; + + devvp = hfsmp->hfs_devvp; + + if (args != NULL && (args->flags & HFSFSMNT_EXTENDED_ARGS)) { + arg_flags = args->journal_flags; + arg_tbufsz = args->journal_tbuffer_size; + } + + sectors_per_fsblock = SWAP_BE32(vhp->blockSize) / blksize; + + retval = meta_bread(devvp, + embeddedOffset/blksize + + (SWAP_BE32(vhp->journalInfoBlock)*sectors_per_fsblock), + SWAP_BE32(vhp->blockSize), cred, &jinfo_bp); + if (retval) + return retval; + + jibp = (JournalInfoBlock *)jinfo_bp->b_data; + jibp->flags = SWAP_BE32(jibp->flags); + jibp->offset = SWAP_BE64(jibp->offset); + jibp->size = SWAP_BE64(jibp->size); + + if (jibp->flags & kJIJournalInFSMask) { + hfsmp->jvp = hfsmp->hfs_devvp; + } else { + printf("hfs: journal not stored in fs! don't know what to do.\n"); + brelse(jinfo_bp); + return EINVAL; + } + + // save this off for the hack-y check in hfs_remove() + hfsmp->jnl_start = jibp->offset / SWAP_BE32(vhp->blockSize); + + if (jibp->flags & kJIJournalNeedInitMask) { + printf("hfs: Initializing the journal (joffset 0x%llx sz 0x%llx)...\n", + jibp->offset + (off_t)embeddedOffset, jibp->size); + hfsmp->jnl = journal_create(hfsmp->jvp, + jibp->offset + (off_t)embeddedOffset, + jibp->size, + devvp, + blksize, + arg_flags, + arg_tbufsz, + hfs_sync_metadata, hfsmp->hfs_mp); + + // no need to start a transaction here... if this were to fail + // we'd just re-init it on the next mount. + jibp->flags &= ~kJIJournalNeedInitMask; + jibp->flags = SWAP_BE32(jibp->flags); + bwrite(jinfo_bp); + jinfo_bp = NULL; + jibp = NULL; + } else { + //printf("hfs: Opening the journal (joffset 0x%llx sz 0x%llx vhp_blksize %d)...\n", + // jibp->offset + (off_t)embeddedOffset, + // jibp->size, SWAP_BE32(vhp->blockSize)); + + hfsmp->jnl = journal_open(hfsmp->jvp, + jibp->offset + (off_t)embeddedOffset, + jibp->size, + devvp, + blksize, + arg_flags, + arg_tbufsz, + hfs_sync_metadata, hfsmp->hfs_mp); + + brelse(jinfo_bp); + jinfo_bp = NULL; + jibp = NULL; + + if (hfsmp->jnl && mdbp) { + // reload the mdb because it could have changed + // if the journal had to be replayed. 
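hfs_early_journal_init() above swaps the journal info block in place and then branches on two flag bits: kJIJournalInFSMask (the journal lives inside the volume) and kJIJournalNeedInitMask (journal_create() versus journal_open()). A self-contained sketch of pulling those fields out of the raw big-endian block; the 36/44 byte offsets assume the usual JournalInfoBlock layout (flags, an 8-word device_signature, then offset and size) and should be checked against the real header:

    #include <stdint.h>

    #define kJIJournalInFSMask     0x00000001u
    #define kJIJournalNeedInitMask 0x00000004u

    struct jib_fields { uint32_t flags; uint64_t offset, size; };  /* host-endian */

    static uint32_t rd_be32(const unsigned char *p)
    {
        return ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) |
               ((uint32_t)p[2] <<  8) |  (uint32_t)p[3];
    }

    static uint64_t rd_be64(const unsigned char *p)
    {
        return ((uint64_t)rd_be32(p) << 32) | rd_be32(p + 4);
    }

    /* HFS+ metadata is big-endian on disk regardless of host byte order. */
    static void parse_jib(const unsigned char *blk, struct jib_fields *out)
    {
        out->flags  = rd_be32(blk + 0);
        out->offset = rd_be64(blk + 36);   /* assumed: 4 + 32 bytes in */
        out->size   = rd_be64(blk + 44);
    }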
+ retval = meta_bread(devvp, mdb_offset, blksize, cred, &bp); + if (retval) { + brelse(bp); + printf("hfs: failed to reload the mdb after opening the journal (retval %d)!\n", + retval); + return retval; + } + bcopy(bp->b_data + HFS_PRI_OFFSET(blksize), mdbp, 512); + brelse(bp); + bp = NULL; + } + } + + + //printf("journal @ 0x%x\n", hfsmp->jnl); + + // if we expected the journal to be there and we couldn't + // create it or open it then we have to bail out. + if (hfsmp->jnl == NULL) { + hfsmp->jnl_start = 0; + + printf("hfs: failed to open/create the journal (retval %d).\n", retval); + return EINVAL; + } + return 0; +} + + +// +// This function will go and re-locate the .journal_info_block and +// the .journal files in case they moved (which can happen if you +// run Norton SpeedDisk). If we fail to find either file we just +// disable journaling for this volume and return. We turn off the +// journaling bit in the vcb and assume it will get written to disk +// later (if it doesn't on the next mount we'd do the same thing +// again which is harmless). If we disable journaling we don't +// return an error so that the volume is still mountable. +// +// If the info we find for the .journal_info_block and .journal files +// isn't what we had stored, we re-set our cached info and proceed +// with opening the journal normally. +// +static int +hfs_late_journal_init(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, void *_args) +{ + JournalInfoBlock *jibp; + struct buf *jinfo_bp, *bp; + int sectors_per_fsblock, arg_flags=0, arg_tbufsz=0; + int retval, need_flush = 0, write_jibp = 0; + struct vnode *devvp; + struct cat_attr jib_attr, jattr; + struct cat_fork jib_fork, jfork; + ExtendedVCB *vcb; + u_long fid; + struct hfs_mount_args *args = _args; + + devvp = hfsmp->hfs_devvp; + vcb = HFSTOVCB(hfsmp); + + if (args != NULL && (args->flags & HFSFSMNT_EXTENDED_ARGS)) { + if (args->journal_disable) { + return 0; + } + + arg_flags = args->journal_flags; + arg_tbufsz = args->journal_tbuffer_size; + } + + fid = GetFileInfo(vcb, kRootDirID, ".journal_info_block", &jib_attr, &jib_fork); + if (fid == 0 || jib_fork.cf_extents[0].startBlock == 0 || jib_fork.cf_size == 0) { + printf("hfs: can't find the .journal_info_block! disabling journaling (start: %d).\n", + jib_fork.cf_extents[0].startBlock); + vcb->vcbAtrb &= ~kHFSVolumeJournaledMask; + return 0; + } + hfsmp->hfs_jnlinfoblkid = fid; + + // make sure the journal_info_block begins where we think it should. + if (SWAP_BE32(vhp->journalInfoBlock) != jib_fork.cf_extents[0].startBlock) { + printf("hfs: The journal_info_block moved (was: %d; is: %d). Fixing up\n", + SWAP_BE32(vhp->journalInfoBlock), jib_fork.cf_extents[0].startBlock); + + vcb->vcbJinfoBlock = jib_fork.cf_extents[0].startBlock; + vhp->journalInfoBlock = SWAP_BE32(jib_fork.cf_extents[0].startBlock); + } + + + sectors_per_fsblock = SWAP_BE32(vhp->blockSize) / hfsmp->hfs_phys_block_size; + retval = meta_bread(devvp, + vcb->hfsPlusIOPosOffset / hfsmp->hfs_phys_block_size + + (SWAP_BE32(vhp->journalInfoBlock)*sectors_per_fsblock), + SWAP_BE32(vhp->blockSize), NOCRED, &jinfo_bp); + if (retval) { + printf("hfs: can't read journal info block. 
disabling journaling.\n"); + vcb->vcbAtrb &= ~kHFSVolumeJournaledMask; + return 0; + } + + jibp = (JournalInfoBlock *)jinfo_bp->b_data; + jibp->flags = SWAP_BE32(jibp->flags); + jibp->offset = SWAP_BE64(jibp->offset); + jibp->size = SWAP_BE64(jibp->size); + + fid = GetFileInfo(vcb, kRootDirID, ".journal", &jattr, &jfork); + if (fid == 0 || jfork.cf_extents[0].startBlock == 0 || jfork.cf_size == 0) { + printf("hfs: can't find the journal file! disabling journaling (start: %d)\n", + jfork.cf_extents[0].startBlock); + brelse(jinfo_bp); + vcb->vcbAtrb &= ~kHFSVolumeJournaledMask; + return 0; + } + hfsmp->hfs_jnlfileid = fid; + + // make sure the journal file begins where we think it should. + if ((jibp->offset / (u_int64_t)vcb->blockSize) != jfork.cf_extents[0].startBlock) { + printf("hfs: The journal file moved (was: %lld; is: %d). Fixing up\n", + (jibp->offset / (u_int64_t)vcb->blockSize), jfork.cf_extents[0].startBlock); + + jibp->offset = (u_int64_t)jfork.cf_extents[0].startBlock * (u_int64_t)vcb->blockSize; + write_jibp = 1; + } + + // check the size of the journal file. + if (jibp->size != (u_int64_t)jfork.cf_extents[0].blockCount*vcb->blockSize) { + printf("hfs: The journal file changed size! (was %lld; is %lld). Fixing up.\n", + jibp->size, (u_int64_t)jfork.cf_extents[0].blockCount*vcb->blockSize); + + jibp->size = (u_int64_t)jfork.cf_extents[0].blockCount * vcb->blockSize; + write_jibp = 1; + } + + if (jibp->flags & kJIJournalInFSMask) { + hfsmp->jvp = hfsmp->hfs_devvp; + } else { + printf("hfs: journal not stored in fs! don't know what to do.\n"); + brelse(jinfo_bp); + return EINVAL; + } + + // save this off for the hack-y check in hfs_remove() + hfsmp->jnl_start = jibp->offset / SWAP_BE32(vhp->blockSize); + + if (jibp->flags & kJIJournalNeedInitMask) { + printf("hfs: Initializing the journal (joffset 0x%llx sz 0x%llx)...\n", + jibp->offset + (off_t)vcb->hfsPlusIOPosOffset, jibp->size); + hfsmp->jnl = journal_create(hfsmp->jvp, + jibp->offset + (off_t)vcb->hfsPlusIOPosOffset, + jibp->size, + devvp, + hfsmp->hfs_phys_block_size, + arg_flags, + arg_tbufsz, + hfs_sync_metadata, hfsmp->hfs_mp); + + // no need to start a transaction here... if this were to fail + // we'd just re-init it on the next mount. + jibp->flags &= ~kJIJournalNeedInitMask; + write_jibp = 1; + + } else { + // + // if we weren't the last person to mount this volume + // then we need to throw away the journal because it + // is likely that someone else mucked with the disk. + // if the journal is empty this is no big deal. if the + // disk is dirty this prevents us from replaying the + // journal over top of changes that someone else made. + // + arg_flags |= JOURNAL_RESET; + + //printf("hfs: Opening the journal (joffset 0x%llx sz 0x%llx vhp_blksize %d)...\n", + // jibp->offset + (off_t)vcb->hfsPlusIOPosOffset, + // jibp->size, SWAP_BE32(vhp->blockSize)); + + hfsmp->jnl = journal_open(hfsmp->jvp, + jibp->offset + (off_t)vcb->hfsPlusIOPosOffset, + jibp->size, + devvp, + hfsmp->hfs_phys_block_size, + arg_flags, + arg_tbufsz, + hfs_sync_metadata, hfsmp->hfs_mp); + } + + + if (write_jibp) { + jibp->flags = SWAP_BE32(jibp->flags); + jibp->offset = SWAP_BE64(jibp->offset); + jibp->size = SWAP_BE64(jibp->size); + + bwrite(jinfo_bp); + } else { + brelse(jinfo_bp); + } + jinfo_bp = NULL; + jibp = NULL; + + //printf("journal @ 0x%x\n", hfsmp->jnl); + + // if we expected the journal to be there and we couldn't + // create it or open it then we have to bail out. 
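hfs_late_journal_init() boils down to three comparisons: did .journal_info_block move, did .journal move, and did .journal change size. Any mismatch is repaired from the catalog's extent records and flagged for write-back (write_jibp above). Just that fix-up logic, as a hedged sketch with hypothetical parameter names:

    #include <stdint.h>
    #include <stdio.h>

    struct jib_state { uint64_t offset, size; };   /* host-endian info block */

    /* Returns nonzero if the info block must be rewritten (write_jibp). */
    static int fixup_journal_location(struct jib_state *jib,
                                      uint32_t vh_jinfo_block,  /* volume header copy */
                                      uint32_t cat_jinfo_start, /* catalog extent     */
                                      uint32_t cat_jnl_start,
                                      uint32_t cat_jnl_blocks,
                                      uint32_t block_size)
    {
        int dirty = 0;

        if (vh_jinfo_block != cat_jinfo_start) {
            printf("journal_info_block moved: %u -> %u\n",
                   vh_jinfo_block, cat_jinfo_start);
            /* caller also updates vcbJinfoBlock and the volume header */
        }
        if (jib->offset / block_size != cat_jnl_start) {
            jib->offset = (uint64_t)cat_jnl_start * block_size;
            dirty = 1;                  /* journal file moved */
        }
        if (jib->size != (uint64_t)cat_jnl_blocks * block_size) {
            jib->size = (uint64_t)cat_jnl_blocks * block_size;
            dirty = 1;                  /* journal file resized */
        }
        return dirty;
    }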
+ if (hfsmp->jnl == NULL) { + hfsmp->jnl_start = 0; + + printf("hfs: failed to open/create the journal (retval %d).\n", retval); + return EINVAL; + } + + return 0; +} diff --git a/bsd/hfs/hfs_vnops.c b/bsd/hfs/hfs_vnops.c index 19006da0e..0080c1400 100644 --- a/bsd/hfs/hfs_vnops.c +++ b/bsd/hfs/hfs_vnops.c @@ -561,6 +561,17 @@ hfs_setattr(ap) if (cp->c_flags & (IMMUTABLE | APPEND)) return (EPERM); + + // XXXdbg - don't allow modification of the journal or journal_info_block + if (VTOHFS(vp)->jnl && cp->c_datafork) { + struct HFSPlusExtentDescriptor *extd; + + extd = &cp->c_datafork->ff_data.cf_extents[0]; + if (extd->startBlock == VTOVCB(vp)->vcbJinfoBlock || extd->startBlock == VTOHFS(vp)->jnl_start) { + return EPERM; + } + } + /* * Go through the fields and update iff not VNOVAL. */ @@ -649,6 +660,16 @@ hfs_chmod(vp, mode, cred, p) if (VTOVCB(vp)->vcbSigWord != kHFSPlusSigWord) return (0); + // XXXdbg - don't allow modification of the journal or journal_info_block + if (VTOHFS(vp)->jnl && cp && cp->c_datafork) { + struct HFSPlusExtentDescriptor *extd; + + extd = &cp->c_datafork->ff_data.cf_extents[0]; + if (extd->startBlock == VTOVCB(vp)->vcbJinfoBlock || extd->startBlock == VTOHFS(vp)->jnl_start) { + return EPERM; + } + } + #if OVERRIDE_UNKNOWN_PERMISSIONS if (VTOVFS(vp)->mnt_flag & MNT_UNKNOWNPERMISSIONS) { return (0); @@ -915,7 +936,7 @@ hfs_exchange(ap) struct hfsmount *hfsmp = VTOHFS(from_vp); struct cat_desc tempdesc; struct cat_attr tempattr; - int error = 0; + int error = 0, started_tr = 0, grabbed_lock = 0; /* The files must be on the same volume. */ if (from_vp->v_mount != to_vp->v_mount) @@ -927,6 +948,25 @@ hfs_exchange(ap) VNODE_IS_RSRC(from_vp) || VNODE_IS_RSRC(to_vp)) return (EINVAL); + // XXXdbg - don't allow modification of the journal or journal_info_block + if (hfsmp->jnl) { + struct HFSPlusExtentDescriptor *extd; + + if (from_cp->c_datafork) { + extd = &from_cp->c_datafork->ff_data.cf_extents[0]; + if (extd->startBlock == VTOVCB(from_vp)->vcbJinfoBlock || extd->startBlock == hfsmp->jnl_start) { + return EPERM; + } + } + + if (to_cp->c_datafork) { + extd = &to_cp->c_datafork->ff_data.cf_extents[0]; + if (extd->startBlock == VTOVCB(to_vp)->vcbJinfoBlock || extd->startBlock == hfsmp->jnl_start) { + return EPERM; + } + } + } + from_rvp = from_cp->c_rsrc_vp; to_rvp = to_cp->c_rsrc_vp; @@ -952,6 +992,16 @@ hfs_exchange(ap) if (to_rvp) (void) vinvalbuf(to_rvp, V_SAVE, ap->a_cred, ap->a_p, 0, 0); + // XXXdbg + hfs_global_shared_lock_acquire(hfsmp); + grabbed_lock = 1; + if (hfsmp->jnl) { + if ((error = journal_start_transaction(hfsmp->jnl)) != 0) { + goto Err_Exit; + } + started_tr = 1; + } + /* Lock catalog b-tree */ error = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_EXCLUSIVE, ap->a_p); if (error) goto Err_Exit; @@ -994,6 +1044,7 @@ hfs_exchange(ap) * (except the modify date) */ bcopy(&to_cp->c_desc, &from_cp->c_desc, sizeof(struct cat_desc)); + from_cp->c_hint = 0; from_cp->c_fileid = from_cp->c_cnid; from_cp->c_itime = to_cp->c_itime; @@ -1031,6 +1082,14 @@ Err_Exit: if (from_rvp) vrele(from_rvp); + // XXXdbg + if (started_tr) { + journal_end_transaction(hfsmp->jnl); + } + if (grabbed_lock) { + hfs_global_shared_lock_release(hfsmp); + } + return (error); } @@ -1046,7 +1105,6 @@ Err_Exit: IN struct proc *p; */ - static int hfs_fsync(ap) struct vop_fsync_args /* { @@ -1063,6 +1121,7 @@ hfs_fsync(ap) register struct buf *bp; struct timeval tv; struct buf *nbp; + struct hfsmount *hfsmp = VTOHFS(ap->a_vp); int s; int wait; int retry = 0; @@ -1078,8 +1137,17 @@ 
hfs_fsync(ap) * for regular files write out any clusters */ if (vp->v_flag & VSYSTEM) { - if (VTOF(vp)->fcbBTCBPtr != NULL) - BTFlushPath(VTOF(vp)); + if (VTOF(vp)->fcbBTCBPtr != NULL) { + // XXXdbg + if (hfsmp->jnl) { + if (BTIsDirty(VTOF(vp))) { + panic("hfs: system file vp 0x%x has dirty blocks (jnl 0x%x)\n", + vp, hfsmp->jnl); + } + } else { + BTFlushPath(VTOF(vp)); + } + } } else if (UBCINFOEXISTS(vp)) (void) cluster_push(vp); @@ -1139,11 +1207,27 @@ loop: if ((bp->b_flags & B_BUSY)) continue; if ((bp->b_flags & B_DELWRI) == 0) - panic("hfs_fsync: not dirty"); + panic("hfs_fsync: bp 0x%x not dirty (hfsmp 0x%x)", bp, hfsmp); + // XXXdbg + if (hfsmp->jnl && (bp->b_flags & B_LOCKED)) { + if ((bp->b_flags & B_META) == 0) { + panic("hfs: bp @ 0x%x is locked but not meta! jnl 0x%x\n", + bp, hfsmp->jnl); + } + // if journal_active() returns >= 0 then the journal is ok and we + // shouldn't do anything to this locked block (because it is part + // of a transaction). otherwise we'll just go through the normal + // code path and flush the buffer. + if (journal_active(hfsmp->jnl) >= 0) { + continue; + } + } + bremfree(bp); bp->b_flags |= B_BUSY; /* Clear B_LOCKED, should only be set on meta files */ bp->b_flags &= ~B_LOCKED; + splx(s); /* * Wait for I/O associated with indirect blocks to complete, @@ -1162,7 +1246,9 @@ loop: tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "hfs_fsync", 0); } - if (vp->v_dirtyblkhd.lh_first) { + // XXXdbg -- is checking for hfsmp->jnl == NULL the right + // thing to do? + if (hfsmp->jnl == NULL && vp->v_dirtyblkhd.lh_first) { /* still have some dirty buffers */ if (retry++ > 10) { vprint("hfs_fsync: dirty", vp); @@ -1216,6 +1302,11 @@ hfs_metasync(struct hfsmount *hfsmp, daddr_t node, struct proc *p) vp = HFSTOVCB(hfsmp)->catalogRefNum; + // XXXdbg - don't need to do this on a journaled volume + if (hfsmp->jnl) { + return 0; + } + if (hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_EXCLUSIVE, p) != 0) return (0); @@ -1254,6 +1345,7 @@ hfs_btsync(struct vnode *vp, int sync_transaction) register struct buf *bp; struct timeval tv; struct buf *nbp; + struct hfsmount *hfsmp = VTOHFS(vp); int s; /* @@ -1267,13 +1359,30 @@ loop: if ((bp->b_flags & B_BUSY)) continue; if ((bp->b_flags & B_DELWRI) == 0) - panic("hfs_fsync: not dirty"); + panic("hfs_btsync: not dirty (bp 0x%x hfsmp 0x%x)", bp, hfsmp); + + // XXXdbg + if (hfsmp->jnl && (bp->b_flags & B_LOCKED)) { + if ((bp->b_flags & B_META) == 0) { + panic("hfs: bp @ 0x%x is locked but not meta! jnl 0x%x\n", + bp, hfsmp->jnl); + } + // if journal_active() returns >= 0 then the journal is ok and we + // shouldn't do anything to this locked block (because it is part + // of a transaction). otherwise we'll just go through the normal + // code path and flush the buffer. + if (journal_active(hfsmp->jnl) >= 0) { + continue; + } + } + if (sync_transaction && !(bp->b_flags & B_LOCKED)) continue; bremfree(bp); bp->b_flags |= B_BUSY; bp->b_flags &= ~B_LOCKED; + splx(s); (void) bawrite(bp); @@ -1316,7 +1425,7 @@ hfs_rmdir(ap) struct cnode *cp; struct cnode *dcp; struct hfsmount * hfsmp; struct timeval tv; - int error = 0; + int error = 0, started_tr = 0, grabbed_lock = 0; cp = VTOC(vp); dcp = VTOC(dvp); @@ -1327,6 +1436,17 @@ hfs_rmdir(ap) vput(vp); return (EINVAL); /* cannot remove "." */ } + + // XXXdbg + hfs_global_shared_lock_acquire(hfsmp); + grabbed_lock = 1; + if (hfsmp->jnl) { + if ((error = journal_start_transaction(hfsmp->jnl)) != 0) { + goto out; + } + started_tr = 1; + } + /* * Verify the directory is empty (and valid). * (Rmdir "..
won't be valid since @@ -1372,6 +1492,7 @@ hfs_rmdir(ap) dcp->c_flag |= C_CHANGE | C_UPDATE; tv = time; (void) VOP_UPDATE(dvp, &tv, &tv, 0); + hfs_volupdate(hfsmp, VOL_RMDIR, (dcp->c_cnid == kHFSRootFolderID)); cp->c_mode = 0; /* Makes the vnode go away...see inactive */ @@ -1380,6 +1501,15 @@ out: if (dvp) vput(dvp); vput(vp); + + // XXXdbg + if (started_tr) { + journal_end_transaction(hfsmp->jnl); + } + if (grabbed_lock) { + hfs_global_shared_lock_release(hfsmp); + } + return (error); } @@ -1415,6 +1545,7 @@ hfs_remove(ap) int truncated = 0; struct timeval tv; int error = 0; + int started_tr = 0, grabbed_lock = 0; /* Redirect directories to rmdir */ if (vp->v_type == VDIR) @@ -1435,7 +1566,7 @@ hfs_remove(ap) VNODE_IS_RSRC(vp)) { error = EPERM; goto out; - } + } /* * Aquire a vnode for a non-empty resource fork. @@ -1447,6 +1578,17 @@ hfs_remove(ap) goto out; } + + // XXXdbg - don't allow deleting the journal or journal_info_block + if (hfsmp->jnl && cp->c_datafork) { + struct HFSPlusExtentDescriptor *extd; + + extd = &cp->c_datafork->ff_data.cf_extents[0]; + if (extd->startBlock == HFSTOVCB(hfsmp)->vcbJinfoBlock || extd->startBlock == hfsmp->jnl_start) { + error = EPERM; + goto out; + } + } + /* * Check if this file is being used. * @@ -1470,9 +1612,48 @@ hfs_remove(ap) goto out; } + + // XXXdbg + hfs_global_shared_lock_acquire(hfsmp); + grabbed_lock = 1; + if (hfsmp->jnl) { + if ((error = journal_start_transaction(hfsmp->jnl)) != 0) { + goto out; + } + started_tr = 1; + } + /* Remove our entry from the namei cache. */ cache_purge(vp); + + // XXXdbg - if we're journaled, kill any dirty symlink buffers + if (hfsmp->jnl && vp->v_type == VLNK && vp->v_dirtyblkhd.lh_first) { + struct buf *bp, *nbp; + + recheck: + for (bp=vp->v_dirtyblkhd.lh_first; bp; bp=nbp) { + nbp = bp->b_vnbufs.le_next; + + if ((bp->b_flags & B_BUSY)) { + // if it was busy, someone else must be dealing + // with it so just move on. + continue; + } + + if (!(bp->b_flags & B_META)) { + panic("hfs: symlink bp @ 0x%x is not marked meta-data!\n", bp); + } + + // if it's part of the current transaction, kill it. + if (bp->b_flags & B_LOCKED) { + bremfree(bp); + bp->b_flags |= B_BUSY; + journal_kill_block(hfsmp->jnl, bp); + goto recheck; + } + } + } + // XXXdbg + /* * Truncate any non-busy forks. Busy forks will * get trucated when their vnode goes inactive. @@ -1535,8 +1716,42 @@ hfs_remove(ap) if (error) goto out; + /* Delete the link record */ error = cat_delete(hfsmp, &desc, &cp->c_attr); + if ((error == 0) && (--cp->c_nlink < 1)) { + char inodename[32]; + char delname[32]; + struct cat_desc to_desc; + struct cat_desc from_desc; + + /* + * This is now essentially an open deleted file. + * Rename it to reflect this state which makes + * orphan file cleanup easier (see hfs_remove_orphans). + * Note: a rename failure here is not fatal.
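hfs_remove() above has the same shape as nearly every VNOP touched by this patch: take the global shared lock, open a transaction when the volume has a journal, and track both in started_tr/grabbed_lock so that every error path can unwind from wherever it bails (buffer writes inside the bracket are likewise replaced by journal_modify_block_start()/journal_modify_block_end() pairs elsewhere in the diff). The skeleton, with stand-in types and a placeholder body:

    /* journal_start/end_transaction and the lock calls are the real names
     * used above; the types and do_catalog_work() are stand-ins. */
    struct jnl;
    struct mnt { struct jnl *jnl; };

    extern int  journal_start_transaction(struct jnl *);
    extern void journal_end_transaction(struct jnl *);
    extern void hfs_global_shared_lock_acquire(struct mnt *);
    extern void hfs_global_shared_lock_release(struct mnt *);
    extern int  do_catalog_work(struct mnt *);   /* placeholder VNOP body */

    static int vnop_skeleton(struct mnt *hfsmp)
    {
        int error, started_tr = 0, grabbed_lock = 0;

        hfs_global_shared_lock_acquire(hfsmp);
        grabbed_lock = 1;
        if (hfsmp->jnl) {
            if ((error = journal_start_transaction(hfsmp->jnl)) != 0)
                goto out;
            started_tr = 1;
        }

        error = do_catalog_work(hfsmp);   /* lock b-trees, update records */

    out:
        if (started_tr)
            journal_end_transaction(hfsmp->jnl);
        if (grabbed_lock)
            hfs_global_shared_lock_release(hfsmp);
        return error;
    }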
+ */ + MAKE_INODE_NAME(inodename, cp->c_rdev); + bzero(&from_desc, sizeof(from_desc)); + from_desc.cd_nameptr = inodename; + from_desc.cd_namelen = strlen(inodename); + from_desc.cd_parentcnid = hfsmp->hfs_private_metadata_dir; + from_desc.cd_flags = 0; + from_desc.cd_cnid = cp->c_fileid; + + MAKE_DELETED_NAME(delname, cp->c_fileid); + bzero(&to_desc, sizeof(to_desc)); + to_desc.cd_nameptr = delname; + to_desc.cd_namelen = strlen(delname); + to_desc.cd_parentcnid = hfsmp->hfs_private_metadata_dir; + to_desc.cd_flags = 0; + to_desc.cd_cnid = cp->c_fileid; + + (void) cat_rename(hfsmp, &from_desc, &hfsmp->hfs_privdir_desc, + &to_desc, (struct cat_desc *)NULL); + cp->c_flag |= C_DELETED; + } + /* Unlock the Catalog */ (void) hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_RELEASE, p); @@ -1548,8 +1763,9 @@ hfs_remove(ap) goto out; cp->c_flag |= C_CHANGE; - if (--cp->c_nlink < 1) - cp->c_flag |= C_DELETED; + tv = time; + (void) VOP_UPDATE(vp, &tv, &tv, 0); + hfs_volupdate(hfsmp, VOL_RMFILE, (dcp->c_cnid == kHFSRootFolderID)); } else if (dataforkbusy || rsrcforkbusy) { @@ -1573,12 +1789,16 @@ hfs_remove(ap) /* Lock catalog b-tree */ error = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_EXCLUSIVE, p); - if (error) goto out; + if (error) + goto out; error = cat_rename(hfsmp, &cp->c_desc, &todir_desc, &to_desc, (struct cat_desc *)NULL); - hfsmp->hfs_privdir_attr.ca_entries++; + // XXXdbg - only bump this count if we were successful + if (error == 0) { + hfsmp->hfs_privdir_attr.ca_entries++; + } (void)cat_update(hfsmp, &hfsmp->hfs_privdir_desc, &hfsmp->hfs_privdir_attr, NULL, NULL); @@ -1588,22 +1808,33 @@ hfs_remove(ap) cp->c_flag |= C_CHANGE | C_DELETED | C_NOEXISTS; --cp->c_nlink; + tv = time; + (void) VOP_UPDATE(vp, &tv, &tv, 0); } else /* Not busy */ { - /* Lock catalog b-tree */ - error = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_EXCLUSIVE, p); - if (error) goto out; - if (vp->v_type == VDIR && cp->c_entries > 0) panic("hfs_remove: attempting to delete a non-empty directory!"); if (vp->v_type != VDIR && cp->c_blocks > 0) panic("hfs_remove: attempting to delete a non-empty file!"); + /* Lock catalog b-tree */ + error = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_EXCLUSIVE, p); + if (error) + goto out; + error = cat_delete(hfsmp, &cp->c_desc, &cp->c_attr); - if (error && truncated) - panic("hfs_remove: couldn't delete a truncated file!"); + if (error && error != ENXIO && truncated) { + if ((cp->c_datafork && cp->c_datafork->ff_data.cf_size != 0) || + (cp->c_rsrcfork && cp->c_rsrcfork->ff_data.cf_size != 0)) { + panic("hfs: remove: couldn't delete a truncated file! 
(%d, data sz %lld; rsrc sz %lld)", + error, cp->c_datafork->ff_data.cf_size, cp->c_rsrcfork->ff_data.cf_size); + } else { + printf("hfs: remove: strangely enough, deleting truncated file %s (%d) got err %d\n", + cp->c_desc.cd_nameptr, cp->c_attr.ca_fileid, error); + } + } /* Unlock the Catalog */ (void) hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_RELEASE, p); @@ -1642,10 +1873,23 @@ hfs_remove(ap) if (rvp) vrele(rvp); VOP_UNLOCK(vp, 0, p); - (void) ubc_uncache(vp); + // XXXdbg - try to prevent the lost ubc_info panic + if ((cp->c_flag & C_HARDLINK) == 0 || cp->c_nlink == 0) { + (void) ubc_uncache(vp); + } vrele(vp); vput(dvp); + + // XXXdbg + if (started_tr) { + journal_end_transaction(hfsmp->jnl); + } + if (grabbed_lock) { + hfs_global_shared_lock_release(hfsmp); + } + return (0); + out: if (rvp) vrele(rvp); @@ -1658,6 +1902,15 @@ out: } vput(vp); vput(dvp); + + // XXXdbg + if (started_tr) { + journal_end_transaction(hfsmp->jnl); + } + if (grabbed_lock) { + hfs_global_shared_lock_release(hfsmp); + } + return (error); } @@ -1736,10 +1989,20 @@ hfs_rename(ap) struct hfsmount *hfsmp; struct proc *p = fcnp->cn_proc; struct timeval tv; - int retval = 0; + int retval = 0, started_tr = 0, grabbed_lock = 0; + int fdvp_locked = 0; + int fvp_locked = 0; cnid_t oldparent = 0; cnid_t newparent = 0; + // XXXdbg + if (fvp) + hfsmp = VTOHFS(fvp); + else if (tvp) + hfsmp = VTOHFS(tvp); + else + hfsmp = NULL; + #if HFS_DIAGNOSTIC if ((tcnp->cn_flags & HASBUF) == 0 || (fcnp->cn_flags & HASBUF) == 0) @@ -1780,9 +2043,6 @@ hfs_rename(ap) goto abortop; } - if ((retval = vn_lock(fvp, LK_EXCLUSIVE | LK_RETRY, p))) - goto abortop; - /* * Make sure "from" vnode and its parent are changeable. */ @@ -1790,13 +2050,11 @@ hfs_rename(ap) fcp = VTOC(fvp); oldparent = fdcp->c_cnid; if ((fcp->c_flags & (IMMUTABLE | APPEND)) || (fdcp->c_flags & APPEND)) { - VOP_UNLOCK(fvp, 0, p); retval = EPERM; goto abortop; } if (fcp->c_parentcnid != fdcp->c_cnid) { - VOP_UNLOCK(fvp, 0, p); retval = EINVAL; goto abortop; } @@ -1812,7 +2070,6 @@ hfs_rename(ap) if (fvp == ap->a_tvp && (bcmp(fcp->c_desc.cd_nameptr, tcnp->cn_nameptr, fcp->c_desc.cd_namelen) == 0)) { - VOP_UNLOCK(fvp, 0, p); retval = 0; goto abortop; } @@ -1829,7 +2086,6 @@ hfs_rename(ap) || fdcp == fcp || (fcnp->cn_flags&ISDOTDOT) || (fcp->c_flag & C_RENAME)) { - VOP_UNLOCK(fvp, 0, p); retval = EINVAL; goto abortop; } @@ -1846,6 +2102,27 @@ hfs_rename(ap) newparent = tdcp->c_cnid; + // XXXdbg - don't allow renaming the journal or journal_info_block + if (hfsmp->jnl && fcp->c_datafork) { + struct HFSPlusExtentDescriptor *extd; + + extd = &fcp->c_datafork->ff_data.cf_extents[0]; + if (extd->startBlock == HFSTOVCB(hfsmp)->vcbJinfoBlock || extd->startBlock == hfsmp->jnl_start) { + retval = EPERM; + goto bad; + } + } + + if (hfsmp->jnl && tcp && tcp->c_datafork) { + struct HFSPlusExtentDescriptor *extd; + + extd = &tcp->c_datafork->ff_data.cf_extents[0]; + if (extd->startBlock == HFSTOVCB(hfsmp)->vcbJinfoBlock || extd->startBlock == hfsmp->jnl_start) { + retval = EPERM; + goto bad; + } + } + retval = VOP_ACCESS(fvp, VWRITE, tcnp->cn_cred, tcnp->cn_proc); if ((fvp->v_type == VDIR) && (newparent != oldparent)) { if (retval) /* write access check above */ @@ -1853,6 +2130,42 @@ hfs_rename(ap) } retval = 0; /* Reset value from above, we dont care about it anymore */ + /* XXX + * Prevent lock hierarchy violation (deadlock): + * + * If fdvp is the parent of tdvp then we must drop + * tdvp lock before acquiring the lock for fdvp.
+ * + * XXXdbg - moved this to happen up here *before* we + * start a transaction. otherwise we can + * deadlock because the vnode layer may get + * this lock for someone else and then they'll + * never be able to start a transaction. + */ + if (newparent != oldparent) { + if (fdcp->c_cnid == tdcp->c_parentcnid) { + vput(tdvp); + vn_lock(fdvp, LK_EXCLUSIVE | LK_RETRY, p); + vget(tdvp, LK_EXCLUSIVE | LK_RETRY, p); + } else { + vn_lock(fdvp, LK_EXCLUSIVE | LK_RETRY, p); + } + } + fdvp_locked = 1; + if ((retval = vn_lock(fvp, LK_EXCLUSIVE | LK_RETRY, p))) + goto bad; + fvp_locked = 1; + + // XXXdbg + hfs_global_shared_lock_acquire(hfsmp); + grabbed_lock = 1; + if (hfsmp->jnl) { + if ((retval = journal_start_transaction(hfsmp->jnl)) != 0) { + goto bad; + } + started_tr = 1; + } + /* * If the destination exists, then be sure its type (file or dir) * matches that of the source. And, if it is a directory make sure @@ -1904,19 +2217,9 @@ hfs_rename(ap) } - /* XXX - * Prevent lock heirarchy violation (deadlock): - * - * If fdvp is the parent of tdvp then we must drop - * tdvp lock before aquiring the lock for fdvp. - */ - if (newparent != oldparent) - vn_lock(fdvp, LK_EXCLUSIVE | LK_RETRY, p); - /* remove the existing entry from the namei cache: */ cache_purge(fvp); - hfsmp = VTOHFS(fvp); bzero(&from_desc, sizeof(from_desc)); from_desc.cd_nameptr = fcnp->cn_nameptr; from_desc.cd_namelen = fcnp->cn_namelen; @@ -1933,18 +2236,18 @@ hfs_rename(ap) /* Lock catalog b-tree */ retval = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_EXCLUSIVE, p); if (retval) { - if (newparent != oldparent) /* unlock the lock we just got */ - VOP_UNLOCK(fdvp, 0, p); goto bad; } - retval = cat_rename(hfsmp, &from_desc, &tdcp->c_desc, - &to_desc, &out_desc); + retval = cat_rename(hfsmp, &from_desc, &tdcp->c_desc, + &to_desc, &out_desc); /* Unlock catalog b-tree */ (void) hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_RELEASE, p); - if (newparent != oldparent) + if (newparent != oldparent) { VOP_UNLOCK(fdvp, 0, p); + fdvp_locked = 0; + } if (retval) goto bad; @@ -1965,13 +2268,19 @@ hfs_rename(ap) fdcp->c_entries--; tdcp->c_nlink++; tdcp->c_entries++; - fdcp->c_flag |= C_UPDATE; - tdcp->c_flag |= C_UPDATE; + fdcp->c_flag |= C_CHANGE | C_UPDATE; + tdcp->c_flag |= C_CHANGE | C_UPDATE; tv = time; CTIMES(fdcp, &tv, &tv); CTIMES(tdcp, &tv, &tv); tdcp->c_childhint = out_desc.cd_hint; /* Cache directory's location */ + // make sure both directories get updated on disk. + if (fdvp != tdvp) { + (void) VOP_UPDATE(fdvp, &tv, &tv, 0); + } + (void) VOP_UPDATE(tdvp, &tv, &tv, 0); + hfs_volupdate(hfsmp, fvp->v_type == VDIR ? VOL_RMDIR : VOL_RMFILE, (fdcp->c_cnid == kHFSRootFolderID)); hfs_volupdate(hfsmp, fvp->v_type == VDIR ? VOL_MKDIR : VOL_MKFILE, @@ -1980,23 +2289,52 @@ hfs_rename(ap) vput(tdvp); vrele(fdvp); vput(fvp); + + // XXXdbg + if (started_tr) { + journal_end_transaction(hfsmp->jnl); + } + if (grabbed_lock) { + hfs_global_shared_lock_release(hfsmp); + } + return (0); bad: if (fcp) fcp->c_flag &= ~C_RENAME; + + // XXXdbg make sure both directories get updated on disk. 
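The rename path above encodes a classic two-lock protocol: when fdvp is tdvp's parent, tdvp is dropped (vput) before fdvp is locked, then re-acquired (vget), so the two locks are always taken in the same order and the deadlock the comment describes cannot form. The same idea in plain pthreads, as a hypothetical helper:

    #include <pthread.h>

    /* Always take 'first' before 'second'; if the caller already holds
     * 'second' (as hfs_rename holds tdvp), drop it and re-take it. */
    static void lock_pair_in_order(pthread_mutex_t *first,
                                   pthread_mutex_t *second,
                                   int second_already_held)
    {
        if (second_already_held)
            pthread_mutex_unlock(second);   /* mirrors vput(tdvp)    */
        pthread_mutex_lock(first);          /* mirrors vn_lock(fdvp) */
        pthread_mutex_lock(second);         /* mirrors vget(tdvp)    */
    }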
+ if (fdvp != tdvp) { + (void) VOP_UPDATE(fdvp, &tv, &tv, 0); + } + (void) VOP_UPDATE(tdvp, &tv, &tv, 0); + if (tdvp == tvp) vrele(tdvp); else vput(tdvp); if (tvp) vput(tvp); - vrele(fdvp); - if (VOP_ISLOCKED(fvp)) + if (fdvp_locked) + vput(fdvp); + else + vrele(fdvp); + + if (fvp_locked) vput(fvp); else vrele(fvp); + + // XXXdbg + if (started_tr) { + journal_end_transaction(hfsmp->jnl); + } + if (grabbed_lock) { + hfs_global_shared_lock_release(hfsmp); + } + return (retval); abortop: @@ -2011,6 +2349,7 @@ abortop: VOP_ABORTOP(fdvp, fcnp); vrele(fdvp); vrele(fvp); + return (retval); } @@ -2079,6 +2418,7 @@ hfs_symlink(ap) } */ *ap; { register struct vnode *vp, **vpp = ap->a_vpp; + struct hfsmount *hfsmp; struct filefork *fp; int len, error; struct buf *bp = NULL; @@ -2097,16 +2437,31 @@ hfs_symlink(ap) return (EINVAL); } + + hfsmp = VTOHFS(ap->a_dvp); + /* Create the vnode */ if ((error = hfs_makenode(S_IFLNK | ap->a_vap->va_mode, - ap->a_dvp, vpp, ap->a_cnp))) + ap->a_dvp, vpp, ap->a_cnp))) { return (error); + } vp = *vpp; len = strlen(ap->a_target); fp = VTOF(vp); fp->ff_clumpsize = VTOVCB(vp)->blockSize; + // XXXdbg + hfs_global_shared_lock_acquire(hfsmp); + if (hfsmp->jnl) { + if ((error = journal_start_transaction(hfsmp->jnl)) != 0) { + hfs_global_shared_lock_release(hfsmp); + VOP_ABORTOP(ap->a_dvp, ap->a_cnp); + vput(ap->a_dvp); + return (error); + } + } + /* Allocate space for the link */ error = VOP_TRUNCATE(vp, len, IO_NOZEROFILL, ap->a_cnp->cn_cred, ap->a_cnp->cn_proc); @@ -2116,10 +2471,21 @@ hfs_symlink(ap) /* Write the link to disk */ bp = getblk(vp, 0, roundup((int)fp->ff_size, VTOHFS(vp)->hfs_phys_block_size), 0, 0, BLK_META); + if (hfsmp->jnl) { + journal_modify_block_start(hfsmp->jnl, bp); + } bzero(bp->b_data, bp->b_bufsize); bcopy(ap->a_target, bp->b_data, len); - bawrite(bp); + if (hfsmp->jnl) { + journal_modify_block_end(hfsmp->jnl, bp); + } else { + bawrite(bp); + } out: + if (hfsmp->jnl) { + journal_end_transaction(hfsmp->jnl); + } + hfs_global_shared_lock_release(hfsmp); vput(vp); return (error); } @@ -2207,11 +2573,41 @@ hfs_readdir(ap) off_t off = uio->uio_offset; int retval = 0; int eofflag = 0; - + void *user_start = NULL; + int user_len; + /* We assume it's all one big buffer... */ if (uio->uio_iovcnt > 1 || uio->uio_resid < AVERAGE_HFSDIRENTRY_SIZE) return EINVAL; + + // XXXdbg + // We have to lock the user's buffer here so that we won't + // fault on it after we've acquired a shared lock on the + // catalog file. The issue is that you can get a 3-way + // deadlock if someone else starts a transaction and then + // tries to lock the catalog file but can't because we're + // here and we can't service our page fault because VM is + // blocked trying to start a transaction as a result of + // trying to free up pages for our page fault. It's messy + // but it does happen on dual-processors that are paging + // heavily (see radar 3082639 for more info). By locking + // the buffer up-front we prevent ourselves from faulting + // while holding the shared catalog file lock. + // + // Fortunately this and hfs_search() are the only two places + // currently (10/30/02) that can fault on user data with a + // shared lock on the catalog file. + // + if (hfsmp->jnl && uio->uio_segflg == UIO_USERSPACE) { + user_start = uio->uio_iov->iov_base; + user_len = uio->uio_iov->iov_len; + + if ((retval = vslock(user_start, user_len)) != 0) { + return retval; + } + } + + /* Create the entries for . and ..
*/ if (uio->uio_offset < sizeof(rootdots)) { caddr_t dep; @@ -2297,6 +2693,10 @@ hfs_readdir(ap) } Exit:; + if (hfsmp->jnl && user_start) { + vsunlock(user_start, user_len, TRUE); + } + if (ap->a_eofflag) *ap->a_eofflag = eofflag; @@ -2359,7 +2759,9 @@ hfs_readlink(ap) } bcopy(bp->b_data, fp->ff_symlinkptr, (size_t)fp->ff_size); if (bp) { - bp->b_flags |= B_INVAL; /* data no longer needed */ + if (VTOHFS(vp)->jnl && (bp->b_flags & B_LOCKED) == 0) { + bp->b_flags |= B_INVAL; /* data no longer needed */ + } brelse(bp); } } @@ -2693,8 +3095,11 @@ hfs_update(ap) struct cat_fork *rsrcforkp = NULL; struct cat_fork datafork; int updateflag; + struct hfsmount *hfsmp; int error; + hfsmp = VTOHFS(vp); + /* XXX do we really want to clear the sytem cnode flags here???? */ if ((vp->v_flag & VSYSTEM) || (VTOVFS(vp)->mnt_flag & MNT_RDONLY) || @@ -2706,11 +3111,13 @@ hfs_update(ap) updateflag = cp->c_flag & (C_ACCESS | C_CHANGE | C_MODIFIED | C_UPDATE); /* Nothing to update. */ - if (updateflag == 0) + if (updateflag == 0) { return (0); + } /* HFS standard doesn't have access times. */ - if ((updateflag == C_ACCESS) && (VTOVCB(vp)->vcbSigWord == kHFSSigWord)) + if ((updateflag == C_ACCESS) && (VTOVCB(vp)->vcbSigWord == kHFSSigWord)) { return (0); + } if (updateflag & C_ACCESS) { /* * If only the access time is changing then defer @@ -2764,12 +3171,24 @@ hfs_update(ap) (dataforkp && cp->c_datafork->ff_unallocblocks) || (rsrcforkp && cp->c_rsrcfork->ff_unallocblocks)) { if (updateflag & (C_CHANGE | C_UPDATE)) - hfs_volupdate(VTOHFS(vp), VOL_UPDATE, 0); + hfs_volupdate(hfsmp, VOL_UPDATE, 0); cp->c_flag &= ~(C_ACCESS | C_CHANGE | C_UPDATE); cp->c_flag |= C_MODIFIED; + return (0); } + + // XXXdbg + hfs_global_shared_lock_acquire(hfsmp); + if (hfsmp->jnl) { + if ((error = journal_start_transaction(hfsmp->jnl)) != 0) { + hfs_global_shared_lock_release(hfsmp); + return error; + } + } + + /* * For files with invalid ranges (holes) the on-disk * field representing the size of the file (cf_size) @@ -2786,18 +3205,29 @@ hfs_update(ap) * A shared lock is sufficient since an update doesn't change * the tree and the lock on vp protects the cnode. */ - error = hfs_metafilelocking(VTOHFS(vp), kHFSCatalogFileID, LK_SHARED, p); - if (error) + error = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_SHARED, p); + if (error) { + if (hfsmp->jnl) { + journal_end_transaction(hfsmp->jnl); + } + hfs_global_shared_lock_release(hfsmp); return (error); + } /* XXX - waitfor is not enforced */ - error = cat_update(VTOHFS(vp), &cp->c_desc, &cp->c_attr, dataforkp, rsrcforkp); + error = cat_update(hfsmp, &cp->c_desc, &cp->c_attr, dataforkp, rsrcforkp); /* Unlock the Catalog b-tree file. 
*/ - (void) hfs_metafilelocking(VTOHFS(vp), kHFSCatalogFileID, LK_RELEASE, p); + (void) hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_RELEASE, p); if (updateflag & (C_CHANGE | C_UPDATE)) - hfs_volupdate(VTOHFS(vp), VOL_UPDATE, 0); + hfs_volupdate(hfsmp, VOL_UPDATE, 0); + + // XXXdbg + if (hfsmp->jnl) { + journal_end_transaction(hfsmp->jnl); + } + hfs_global_shared_lock_release(hfsmp); /* After the updates are finished, clear the flags */ cp->c_flag &= ~(C_ACCESS | C_CHANGE | C_MODIFIED | C_UPDATE | C_ATIMEMOD); @@ -2826,7 +3256,7 @@ hfs_makenode(mode, dvp, vpp, cnp) struct proc *p; struct cat_desc in_desc, out_desc; struct cat_attr attr; - int error; + int error, started_tr = 0, grabbed_lock = 0; enum vtype vnodetype; p = cnp->cn_proc; @@ -2902,6 +3332,16 @@ hfs_makenode(mode, dvp, vpp, cnp) in_desc.cd_parentcnid = dcp->c_cnid; in_desc.cd_flags = S_ISDIR(mode) ? CD_ISDIR : 0; + // XXXdbg + hfs_global_shared_lock_acquire(hfsmp); + grabbed_lock = 1; + if (hfsmp->jnl) { + if ((error = journal_start_transaction(hfsmp->jnl)) != 0) { + goto exit; + } + started_tr = 1; + } + /* Lock catalog b-tree */ error = hfs_metafilelocking(VTOHFS(dvp), kHFSCatalogFileID, LK_EXCLUSIVE, p); if (error) @@ -2921,14 +3361,37 @@ hfs_makenode(mode, dvp, vpp, cnp) dcp->c_flag |= C_CHANGE | C_UPDATE; tv = time; (void) VOP_UPDATE(dvp, &tv, &tv, 0); + hfs_volupdate(hfsmp, vnodetype == VDIR ? VOL_MKDIR : VOL_MKFILE, (dcp->c_cnid == kHFSRootFolderID)); + // XXXdbg + // have to end the transaction here before we call hfs_getnewvnode() + // because that can cause us to try and reclaim a vnode on a different + // file system which could cause us to start a transaction which can + // deadlock with someone on that other file system (since we could be + // holding two transaction locks as well as various vnodes and we did + // not obtain the locks on them in the proper order). + // + // NOTE: this means that if the quota check fails or we have to update + // the change time on a block-special device that those changes + // will happen as part of independent transactions. 
+ // + if (started_tr) { + journal_end_transaction(hfsmp->jnl); + started_tr = 0; + } + if (grabbed_lock) { + hfs_global_shared_lock_release(hfsmp); + grabbed_lock = 0; + } + /* Create a vnode for the object just created: */ error = hfs_getnewvnode(hfsmp, NULL, &out_desc, 0, &attr, NULL, &tvp); if (error) goto exit; + #if QUOTA cp = VTOC(tvp); /* @@ -2945,6 +3408,7 @@ hfs_makenode(mode, dvp, vpp, cnp) VOP_RMDIR(dvp,tvp, cnp); else VOP_REMOVE(dvp,tvp, cnp); + return (error); } #endif /* QUOTA */ @@ -2960,8 +3424,8 @@ hfs_makenode(mode, dvp, vpp, cnp) tvp->v_type = IFTOVT(mode); cp->c_flag |= C_CHANGE; tv = time; - if ((error = VOP_UPDATE(tvp, &tv, &tv, 1))) { - vput(tvp); + if ((error = VOP_UPDATE(tvp, &tv, &tv, 1))) { + vput(tvp); goto exit; } } @@ -2974,6 +3438,16 @@ exit: FREE_ZONE(cnp->cn_pnbuf, cnp->cn_pnlen, M_NAMEI); vput(dvp); + // XXXdbg + if (started_tr) { + journal_end_transaction(hfsmp->jnl); + started_tr = 0; + } + if (grabbed_lock) { + hfs_global_shared_lock_release(hfsmp); + grabbed_lock = 0; + } + return (error); } diff --git a/bsd/hfs/hfscommon/BTree/BTree.c b/bsd/hfs/hfscommon/BTree/BTree.c index 12c2680af..65c12839f 100644 --- a/bsd/hfs/hfscommon/BTree/BTree.c +++ b/bsd/hfs/hfscommon/BTree/BTree.c @@ -339,6 +339,20 @@ OSStatus BTOpenPath (FCB *filePtr, err = ReleaseNode (btreePtr, &nodeRec); M_ExitOnError (err); + /* + * Under Mac OS, b-tree nodes can be non-contiguous on disk when the + * allocation block size is smaller than the b-tree node size. + * + * If journaling is turned on for this volume we can't deal with this + * situation and so we bail out. If journaling isn't on it's ok as + * hfs_strategy_fragmented() deals with it. Journaling can't support + * this because it assumes that if you give it a block that it's + * contiguous on disk. + */ + if ( FCBTOHFS(filePtr)->jnl && !NodesAreContiguous(FCBTOVCB(filePtr), filePtr, btreePtr->nodeSize) ) { + return fsBTInvalidNodeErr; + } + //////////////////////////////// Success //////////////////////////////////// //€€ align LEOF to multiple of node size? 
- just on close @@ -456,6 +470,9 @@ OSStatus BTSearchRecord (FCB *filePtr, if (filePtr == nil) return paramErr; if (searchIterator == nil) return paramErr; + node.buffer = nil; + node.blockHeader = nil; + btreePtr = (BTreeControlBlockPtr) filePtr->fcbBTCBPtr; if (btreePtr == nil) return fsBTInvalidFileErr; @@ -629,9 +646,12 @@ OSStatus BTIterateRecord (FCB *filePtr, ////////////////////////// Priliminary Checks /////////////////////////////// - left.buffer = nil; - right.buffer = nil; - node.buffer = nil; + left.buffer = nil; + left.blockHeader = nil; + right.buffer = nil; + right.blockHeader = nil; + node.buffer = nil; + node.blockHeader = nil; if (filePtr == nil) @@ -928,9 +948,12 @@ BTIterateRecords(FCB *filePtr, BTreeIterationOperation operation, BTreeIterator ////////////////////////// Priliminary Checks /////////////////////////////// - left.buffer = nil; - right.buffer = nil; - node.buffer = nil; + left.buffer = nil; + left.blockHeader = nil; + right.buffer = nil; + right.blockHeader = nil; + node.buffer = nil; + node.blockHeader = nil; btreePtr = (BTreeControlBlockPtr) filePtr->fcbBTCBPtr; @@ -1201,10 +1224,10 @@ OSStatus BTInsertRecord (FCB *filePtr, UInt16 index; Boolean recordFit; - ////////////////////////// Priliminary Checks /////////////////////////////// nodeRec.buffer = nil; // so we can call ReleaseNode + nodeRec.blockHeader = nil; err = CheckInsertParams (filePtr, iterator, record, recordLen); if (err != noErr) @@ -1241,6 +1264,9 @@ OSStatus BTInsertRecord (FCB *filePtr, err = GetNewNode (btreePtr, insertNodeNum, &nodeRec); M_ExitOnError (err); + // XXXdbg + ModifyBlockStart(btreePtr->fileRefNum, &nodeRec); + ((NodeDescPtr)nodeRec.buffer)->kind = kBTLeafNode; ((NodeDescPtr)nodeRec.buffer)->height = 1; @@ -1261,6 +1287,7 @@ OSStatus BTInsertRecord (FCB *filePtr, btreePtr->rootNode = insertNodeNum; btreePtr->firstLeafNode = insertNodeNum; btreePtr->lastLeafNode = insertNodeNum; + M_BTreeHeaderDirty (btreePtr); goto Success; @@ -1270,6 +1297,9 @@ OSStatus BTInsertRecord (FCB *filePtr, if (index > 0) { + // XXXdbg + ModifyBlockStart(btreePtr->fileRefNum, &nodeRec); + recordFit = InsertKeyRecord (btreePtr, nodeRec.buffer, index, &iterator->key, KeyLength(btreePtr, &iterator->key), record->bufferAddress, recordLen); @@ -1308,7 +1338,7 @@ Success: ++btreePtr->writeCount; ++btreePtr->leafRecords; M_BTreeHeaderDirty (btreePtr); - + // create hint iterator->hint.writeCount = btreePtr->writeCount; iterator->hint.nodeNum = insertNodeNum; @@ -1359,6 +1389,7 @@ OSStatus BTReplaceRecord (FCB *filePtr, ////////////////////////// Priliminary Checks /////////////////////////////// nodeRec.buffer = nil; // so we can call ReleaseNode + nodeRec.blockHeader = nil; err = CheckInsertParams (filePtr, iterator, record, recordLen); if (err != noErr) @@ -1380,6 +1411,9 @@ OSStatus BTReplaceRecord (FCB *filePtr, err = GetNode (btreePtr, insertNodeNum, &nodeRec); if( err == noErr ) { + // XXXdbg + ModifyBlockStart(btreePtr->fileRefNum, &nodeRec); + err = TrySimpleReplace (btreePtr, nodeRec.buffer, iterator, record, recordLen, &recordFit); M_ExitOnError (err); @@ -1415,6 +1449,9 @@ OSStatus BTReplaceRecord (FCB *filePtr, // optimization - if simple replace will work then don't extend btree // €€ if we tried this before, and failed because it wouldn't fit then we shouldn't try this again... 
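The ModifyBlockStart() calls threaded through this file all enforce one rule: hand a node buffer to the open transaction before changing any of its bytes, so that ordinary buffer write-back can never push a partially journaled change to disk. A minimal userland model of that ordering follows; every name in it is invented for illustration (in the kernel the wrapper bottoms out in journal_modify_block_start(), and the pinning corresponds to B_LOCKED):

#include <assert.h>
#include <stdio.h>

struct block {
    unsigned char data[512];
    int dirty;
    int pinned;     /* models B_LOCKED: owned by an open transaction */
};

/* model of ModifyBlockStart(): pin the buffer to the transaction
 * BEFORE it is dirtied, so normal write-back cannot touch it */
static void modify_block_start(struct block *bp)
{
    bp->pinned = 1;
}

static void set_node_kind(struct block *bp, unsigned char kind)
{
    assert(bp->pinned && "ModifyBlockStart() must precede modification");
    bp->data[0] = kind;
    bp->dirty = 1;
}

/* a flush path that refuses pinned blocks */
static int flush(struct block *bp)
{
    if (bp->pinned)
        return 0;           /* journal owns it; not flushed here */
    if (bp->dirty)
        bp->dirty = 0;      /* pretend we wrote it to disk */
    return 1;
}

int main(void)
{
    struct block node = { .dirty = 0, .pinned = 0 };

    modify_block_start(&node);          /* pin first... */
    set_node_kind(&node, 2);            /* ...then modify */
    printf("flushed by fsync path: %d (expect 0)\n", flush(&node));
    return 0;
}

The flush() refusal mirrors the spec_fsync() change later in this patch, which skips B_LOCKED buffers for the same reason.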
+ // XXXdbg + ModifyBlockStart(btreePtr->fileRefNum, &nodeRec); + err = TrySimpleReplace (btreePtr, nodeRec.buffer, iterator, record, recordLen, &recordFit); M_ExitOnError (err); @@ -1441,6 +1478,9 @@ OSStatus BTReplaceRecord (FCB *filePtr, } + // XXXdbg + ModifyBlockStart(btreePtr->fileRefNum, &nodeRec); + DeleteRecord (btreePtr, nodeRec.buffer, index); // delete existing key/record err = InsertTree (btreePtr, treePathTable, &iterator->key, record->bufferAddress, @@ -1498,6 +1538,7 @@ BTUpdateRecord(FCB *filePtr, BTreeIterator *iterator, ////////////////////////// Priliminary Checks /////////////////////////////// nodeRec.buffer = nil; // so we can call ReleaseNode + nodeRec.blockHeader = nil; btreePtr = (BTreeControlBlockPtr) filePtr->fcbBTCBPtr; @@ -1521,6 +1562,9 @@ BTUpdateRecord(FCB *filePtr, BTreeIterator *iterator, err = GetRecordByIndex(btreePtr, nodeRec.buffer, index, &keyPtr, &recordPtr, &recordLen); M_ExitOnError (err); + // XXXdbg + ModifyBlockStart(btreePtr->fileRefNum, &nodeRec); + err = callBackProc(keyPtr, recordPtr, recordLen, callBackState); M_ExitOnError (err); @@ -1553,6 +1597,9 @@ BTUpdateRecord(FCB *filePtr, BTreeIterator *iterator, err = GetRecordByIndex(btreePtr, nodeRec.buffer, index, &keyPtr, &recordPtr, &recordLen); M_ExitOnError (err); + // XXXdbg + ModifyBlockStart(btreePtr->fileRefNum, &nodeRec); + err = callBackProc(keyPtr, recordPtr, recordLen, callBackState); M_ExitOnError (err); @@ -1600,6 +1647,7 @@ OSStatus BTDeleteRecord (FCB *filePtr, ////////////////////////// Priliminary Checks /////////////////////////////// nodeRec.buffer = nil; // so we can call ReleaseNode + nodeRec.blockHeader = nil; M_ReturnErrorIf (filePtr == nil, paramErr); M_ReturnErrorIf (iterator == nil, paramErr); @@ -1630,7 +1678,7 @@ OSStatus BTDeleteRecord (FCB *filePtr, ++btreePtr->writeCount; --btreePtr->leafRecords; M_BTreeHeaderDirty (btreePtr); - + iterator->hint.nodeNum = 0; return noErr; @@ -1682,7 +1730,16 @@ OSStatus BTGetInformation (FCB *filePtr, return noErr; } +// XXXdbg +__private_extern__ +OSStatus +BTIsDirty(FCB *filePtr) +{ + BTreeControlBlockPtr btreePtr; + btreePtr = (BTreeControlBlockPtr) filePtr->fcbBTCBPtr; + return TreeIsDirty(btreePtr); +} /*------------------------------------------------------------------------------- Routine: BTFlushPath - Flush BTreeControlBlock to Header Node. @@ -1743,6 +1800,9 @@ BTReloadData(FCB *filePtr) BTHeaderRec *header; + node.buffer = nil; + node.blockHeader = nil; + btreePtr = (BTreeControlBlockPtr) filePtr->fcbBTCBPtr; if (btreePtr == nil) return (fsBTInvalidFileErr); @@ -1877,3 +1937,62 @@ OSStatus BTSetLastSync (FCB *filePtr, } +/*------------------------------------------------------------------------------- +Routine: BTCheckFreeSpace + +Function: Makes sure there is enough free space so that a tree operation + will succeed. + +Input: fcb - pointer file control block + +Output: none + +Result: noErr - success + +-------------------------------------------------------------------------------*/ + +__private_extern__ +OSStatus BTCheckFreeSpace (FCB *filePtr) +{ + BTreeControlBlockPtr btreePtr; + int nodesNeeded, err = noErr; + + + M_ReturnErrorIf (filePtr == nil, paramErr); + + btreePtr = (BTreeControlBlockPtr) filePtr->fcbBTCBPtr; + + REQUIRE_FILE_LOCK(btreePtr->fileRefNum, true); + + M_ReturnErrorIf (btreePtr == nil, fsBTInvalidFileErr); + + // XXXdbg this is highly conservative but so much better than + // winding up with turds on your disk. 
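+	// A single insert can split one node per level and add a new
+	// root, i.e. (treeDepth + 1) new nodes, so the factor of ten
+	// leaves room for several worst-case operations per transaction.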
+ // + nodesNeeded = (btreePtr->treeDepth + 1) * 10; + + if (btreePtr->freeNodes < nodesNeeded) { + err = ExtendBTree(btreePtr, nodesNeeded + btreePtr->totalNodes - btreePtr->freeNodes); + } + + return err; +} + + +__private_extern__ +OSStatus BTHasContiguousNodes (FCB *filePtr) +{ + BTreeControlBlockPtr btreePtr; + int nodesNeeded, err = noErr; + + + M_ReturnErrorIf (filePtr == nil, paramErr); + + btreePtr = (BTreeControlBlockPtr) filePtr->fcbBTCBPtr; + + REQUIRE_FILE_LOCK(btreePtr->fileRefNum, true); + + M_ReturnErrorIf (btreePtr == nil, fsBTInvalidFileErr); + + return NodesAreContiguous(FCBTOVCB(filePtr), filePtr, btreePtr->nodeSize); +} diff --git a/bsd/hfs/hfscommon/BTree/BTreeAllocate.c b/bsd/hfs/hfscommon/BTree/BTreeAllocate.c index 60cfa0635..a902d5087 100644 --- a/bsd/hfs/hfscommon/BTree/BTreeAllocate.c +++ b/bsd/hfs/hfscommon/BTree/BTreeAllocate.c @@ -125,12 +125,16 @@ OSStatus AllocateNode (BTreeControlBlockPtr btreePtr, UInt32 *nodeNum) nodeNumber = 0; // first node number of header map record node.buffer = nil; // clear node.buffer to get header node // - and for ErrorExit + node.blockHeader = nil; while (true) { err = GetMapNode (btreePtr, &node, &mapPtr, &mapSize); M_ExitOnError (err); + // XXXdbg + ModifyBlockStart(btreePtr->fileRefNum, &node); + //////////////////////// Find Word with Free Bit //////////////////////////// pos = mapPtr; @@ -233,6 +237,7 @@ OSStatus FreeNode (BTreeControlBlockPtr btreePtr, UInt32 nodeNum) //////////////////////////// Find Map Record //////////////////////////////// nodeIndex = 0; // first node number of header map record node.buffer = nil; // invalidate node.buffer to get header node + node.blockHeader = nil; while (nodeNum >= nodeIndex) { @@ -244,6 +249,9 @@ OSStatus FreeNode (BTreeControlBlockPtr btreePtr, UInt32 nodeNum) //////////////////////////// Mark Node Free ///////////////////////////////// + // XXXdbg + ModifyBlockStart(btreePtr->fileRefNum, &node); + nodeNum -= (nodeIndex - (mapSize << 3)); // relative to this map record bitOffset = 15 - (nodeNum & 0x0000000F); // last 4 bits are bit offset mapPos += nodeNum >> 4; // point to word containing map bit @@ -319,7 +327,9 @@ OSStatus ExtendBTree (BTreeControlBlockPtr btreePtr, filePtr = GetFileControlBlock(btreePtr->fileRefNum); mapNode.buffer = nil; + mapNode.blockHeader = nil; newNode.buffer = nil; + newNode.blockHeader = nil; mapNodeRecSize = nodeSize - sizeof(BTNodeDescriptor) - 6; // 2 bytes of free space (see note) @@ -379,6 +389,8 @@ OSStatus ExtendBTree (BTreeControlBlockPtr btreePtr, /////////////////////// Initialize New Map Nodes //////////////////////////// + // XXXdbg - this is the correct place for this: + ModifyBlockStart(btreePtr->fileRefNum, &mapNode); ((BTNodeDescriptor*)mapNode.buffer)->fLink = firstNewMapNodeNum; @@ -388,6 +400,9 @@ OSStatus ExtendBTree (BTreeControlBlockPtr btreePtr, err = GetNewNode (btreePtr, nodeNum, &newNode); M_ExitOnError (err); + // XXXdbg + ModifyBlockStart(btreePtr->fileRefNum, &newNode); + ((NodeDescPtr)newNode.buffer)->numRecords = 1; ((NodeDescPtr)newNode.buffer)->kind = kBTMapNode; @@ -428,6 +443,9 @@ OSStatus ExtendBTree (BTreeControlBlockPtr btreePtr, err = GetNode (btreePtr, nextNodeNum, &mapNode); M_ExitOnError (err); + // XXXdbg + ModifyBlockStart(btreePtr->fileRefNum, &mapNode); + mapIndex = 0; mapStart = (UInt16 *) GetRecordAddress (btreePtr, mapNode.buffer, mapIndex); @@ -476,7 +494,7 @@ Success: ////////////////////////////// Error Exit /////////////////////////////////// ErrorExit: - + (void) ReleaseNode (btreePtr, &mapNode); 
(void) ReleaseNode (btreePtr, &newNode); diff --git a/bsd/hfs/hfscommon/BTree/BTreeMiscOps.c b/bsd/hfs/hfscommon/BTree/BTreeMiscOps.c index c71fab021..7d56bf4f8 100644 --- a/bsd/hfs/hfscommon/BTree/BTreeMiscOps.c +++ b/bsd/hfs/hfscommon/BTree/BTreeMiscOps.c @@ -209,6 +209,14 @@ OSStatus VerifyHeader (FCB *filePtr, +__private_extern__ +OSStatus TreeIsDirty(BTreeControlBlockPtr btreePtr) +{ + return (btreePtr->flags & kBTHeaderDirty); +} + + + /*------------------------------------------------------------------------------- Routine: UpdateHeader - Write BTreeInfoRec fields to Header node. @@ -229,15 +237,18 @@ OSStatus UpdateHeader(BTreeControlBlockPtr btreePtr, Boolean forceWrite) BTHeaderRec *header; UInt32 options; - if ((btreePtr->flags & kBTHeaderDirty) == 0) // btree info already flushed return noErr; err = GetNode (btreePtr, kHeaderNodeNum, &node ); - if (err != noErr) + if (err != noErr) { return err; + } + // XXXdbg + ModifyBlockStart(btreePtr->fileRefNum, &node); + header = (BTHeaderRec*) ((char *)node.buffer + sizeof(BTNodeDescriptor)); header->treeDepth = btreePtr->treeDepth; @@ -315,8 +326,11 @@ OSStatus FindIteratorPosition (BTreeControlBlockPtr btreePtr, // assume foundRecord points to Boolean left->buffer = nil; + left->blockHeader = nil; middle->buffer = nil; + middle->blockHeader = nil; right->buffer = nil; + right->blockHeader = nil; foundIt = false; diff --git a/bsd/hfs/hfscommon/BTree/BTreeScanner.c b/bsd/hfs/hfscommon/BTree/BTreeScanner.c index 014069807..8cc50aaa1 100644 --- a/bsd/hfs/hfscommon/BTree/BTreeScanner.c +++ b/bsd/hfs/hfscommon/BTree/BTreeScanner.c @@ -221,7 +221,7 @@ static int ReadMultipleNodes( BTScanState *theScanStatePtr ) // release old buffer if we have one if ( theScanStatePtr->bufferPtr != NULL ) { - theScanStatePtr->bufferPtr->b_flags |= (B_INVAL | B_AGE); + theScanStatePtr->bufferPtr->b_flags |= (B_INVAL | B_AGE); brelse( theScanStatePtr->bufferPtr ); theScanStatePtr->bufferPtr = NULL; theScanStatePtr->currentNodePtr = NULL; @@ -249,10 +249,10 @@ static int ReadMultipleNodes( BTScanState *theScanStatePtr ) // now read blocks from the device myErr = bread( myDevPtr, - myPhyBlockNum, - myBufferSize, - NOCRED, - &theScanStatePtr->bufferPtr ); + myPhyBlockNum, + myBufferSize, + NOCRED, + &theScanStatePtr->bufferPtr ); if ( myErr != E_NONE ) { goto ExitThisRoutine; @@ -374,7 +374,7 @@ int BTScanTerminate( BTScanState * scanState, if ( scanState->bufferPtr != NULL ) { scanState->bufferPtr->b_flags |= (B_INVAL | B_AGE); - brelse( scanState->bufferPtr ); + brelse( scanState->bufferPtr ); scanState->bufferPtr = NULL; scanState->currentNodePtr = NULL; } diff --git a/bsd/hfs/hfscommon/BTree/BTreeTreeOps.c b/bsd/hfs/hfscommon/BTree/BTreeTreeOps.c index 2de280321..3a8463911 100644 --- a/bsd/hfs/hfscommon/BTree/BTreeTreeOps.c +++ b/bsd/hfs/hfscommon/BTree/BTreeTreeOps.c @@ -395,13 +395,17 @@ OSStatus InsertLevel (BTreeControlBlockPtr btreePtr, PanicIf ((level == 1) && (((NodeDescPtr)targetNode->buffer)->kind != kBTLeafNode), "\P InsertLevel: non-leaf at level 1! 
"); #endif leftNode.buffer = nil; + leftNode.blockHeader = nil; targetNodeNum = treePathTable [level].node; insertParent = false; updateParent = false; + // XXXdbg + ModifyBlockStart(btreePtr->fileRefNum, targetNode); + ////// process first insert ////// - + err = InsertNode (btreePtr, primaryKey, targetNode, targetNodeNum, index, &newNodeNum, &newIndex, &leftNode, &updateParent, &insertParent, &newRoot ); M_ExitOnError (err); @@ -446,6 +450,9 @@ OSStatus InsertLevel (BTreeControlBlockPtr btreePtr, UInt8 * recPtr; UInt16 recSize; + parentNode.buffer = nil; + parentNode.blockHeader = nil; + secondaryKey = nil; PanicIf ( (level == btreePtr->treeDepth), "\p InsertLevel: unfinished insert!?"); @@ -468,6 +475,9 @@ OSStatus InsertLevel (BTreeControlBlockPtr btreePtr, if ( updateParent ) { + // XXXdbg + ModifyBlockStart(btreePtr->fileRefNum, &parentNode); + //€€ debug: check if ptr == targetNodeNum GetRecordByIndex (btreePtr, parentNode.buffer, index, &keyPtr, &recPtr, &recSize); PanicIf( (*(UInt32 *) recPtr) != targetNodeNum, "\p InsertLevel: parent ptr doesn't match target node!"); @@ -594,6 +604,8 @@ static OSErr InsertNode (BTreeControlBlockPtr btreePtr, { err = GetNode (btreePtr, leftNodeNum, leftNode); // will be released by caller or a split below M_ExitOnError (err); + // XXXdbg + ModifyBlockStart(btreePtr->fileRefNum, leftNode); } PanicIf ( ((NodeDescPtr) leftNode->buffer)->fLink != node, "\p InsertNode, RotateLeft: invalid sibling link!" ); @@ -642,7 +654,6 @@ static OSErr InsertNode (BTreeControlBlockPtr btreePtr, return noErr; ErrorExit: - (void) ReleaseNode (btreePtr, leftNode); return err; @@ -678,7 +689,11 @@ OSStatus DeleteTree (BTreeControlBlockPtr btreePtr, Boolean deleteRequired; Boolean updateRequired; - + // XXXdbg - initialize these to null in case we get an + // error and try to exit before it's initialized + parentNode.buffer = nil; + parentNode.blockHeader = nil; + deleteRequired = false; updateRequired = false; @@ -686,6 +701,9 @@ OSStatus DeleteTree (BTreeControlBlockPtr btreePtr, targetNodePtr = targetNode->buffer; PanicIf (targetNodePtr == nil, "\pDeleteTree: targetNode has nil buffer!"); + // XXXdbg + ModifyBlockStart(btreePtr->fileRefNum, targetNode); + DeleteRecord (btreePtr, targetNodePtr, index); //€€ coalesce remaining records? 
@@ -697,6 +715,9 @@ OSStatus DeleteTree (BTreeControlBlockPtr btreePtr, deleteRequired = true; + siblingNode.buffer = nil; + siblingNode.blockHeader = nil; + ////////////////// Get Siblings & Update Links ////////////////////////// siblingNodeNum = targetNodePtr->bLink; // Left Sibling Node @@ -704,6 +725,10 @@ OSStatus DeleteTree (BTreeControlBlockPtr btreePtr, { err = GetNode (btreePtr, siblingNodeNum, &siblingNode); M_ExitOnError (err); + + // XXXdbg + ModifyBlockStart(btreePtr->fileRefNum, &siblingNode); + ((NodeDescPtr)siblingNode.buffer)->fLink = targetNodePtr->fLink; err = UpdateNode (btreePtr, &siblingNode, 0, kLockTransaction); M_ExitOnError (err); @@ -718,6 +743,10 @@ OSStatus DeleteTree (BTreeControlBlockPtr btreePtr, { err = GetNode (btreePtr, siblingNodeNum, &siblingNode); M_ExitOnError (err); + + // XXXdbg + ModifyBlockStart(btreePtr->fileRefNum, &siblingNode); + ((NodeDescPtr)siblingNode.buffer)->bLink = targetNodePtr->bLink; err = UpdateNode (btreePtr, &siblingNode, 0, kLockTransaction); M_ExitOnError (err); @@ -733,6 +762,7 @@ OSStatus DeleteTree (BTreeControlBlockPtr btreePtr, err = UpdateNode (btreePtr, targetNode, 0, kLockTransaction); M_ExitOnError (err); + err = FreeNode (btreePtr, targetNodeNum); M_ExitOnError (err); } @@ -776,6 +806,9 @@ OSStatus DeleteTree (BTreeControlBlockPtr btreePtr, UInt16 recSize; UInt32 insertNode; + // XXXdbg + ModifyBlockStart(btreePtr->fileRefNum, &parentNode); + //€€ debug: check if ptr == targetNodeNum GetRecordByIndex (btreePtr, parentNode.buffer, index, &keyPtr, &recPtr, &recSize); PanicIf( (*(UInt32 *) recPtr) != targetNodeNum, "\p DeleteTree: parent ptr doesn't match targetNodeNum!!"); @@ -805,7 +838,7 @@ OSStatus DeleteTree (BTreeControlBlockPtr btreePtr, return noErr; ErrorExit: - + (void) ReleaseNode (btreePtr, targetNode); (void) ReleaseNode (btreePtr, &parentNode); @@ -826,6 +859,9 @@ static OSStatus CollapseTree (BTreeControlBlockPtr btreePtr, originalRoot = btreePtr->rootNode; + // XXXdbg + ModifyBlockStart(btreePtr->fileRefNum, blockPtr); + while (true) { if ( ((NodeDescPtr)blockPtr->buffer)->numRecords > 1) @@ -848,6 +884,9 @@ static OSStatus CollapseTree (BTreeControlBlockPtr btreePtr, //// Get New Root Node err = GetNode (btreePtr, btreePtr->rootNode, blockPtr); M_ExitOnError (err); + + // XXXdbg + ModifyBlockStart(btreePtr->fileRefNum, blockPtr); } if (btreePtr->rootNode != originalRoot) @@ -1110,6 +1149,9 @@ static OSStatus SplitLeft (BTreeControlBlockPtr btreePtr, if ( left != nil ) { + // XXXdbg + ModifyBlockStart(btreePtr->fileRefNum, leftNode); + left->fLink = newNodeNum; err = UpdateNode (btreePtr, leftNode, 0, kLockTransaction); M_ExitOnError (err); @@ -1121,6 +1163,9 @@ static OSStatus SplitLeft (BTreeControlBlockPtr btreePtr, err = GetNewNode (btreePtr, newNodeNum, leftNode); M_ExitOnError (err); + // XXXdbg + ModifyBlockStart(btreePtr->fileRefNum, leftNode); + left = leftNode->buffer; left->fLink = rightNodeNum; @@ -1145,8 +1190,9 @@ static OSStatus SplitLeft (BTreeControlBlockPtr btreePtr, err = RotateLeft (btreePtr, left, right, index, keyPtr, recPtr, recSize, insertIndex, insertNodeNum, &recordFit, recsRotated); - M_ExitOnError (err); + M_ExitOnError (err); + return noErr; ErrorExit: @@ -1202,6 +1248,9 @@ static OSStatus AddNewRootNode (BTreeControlBlockPtr btreePtr, Boolean didItFit; UInt16 keyLength; + rootNode.buffer = nil; + rootNode.blockHeader = nil; + PanicIf (leftNode == nil, "\pAddNewRootNode: leftNode == nil"); PanicIf (rightNode == nil, "\pAddNewRootNode: rightNode == nil"); @@ -1214,6 +1263,9 @@ 
static OSStatus AddNewRootNode (BTreeControlBlockPtr btreePtr, err = GetNewNode (btreePtr, rootNum, &rootNode); M_ExitOnError (err); + // XXXdbg + ModifyBlockStart(btreePtr->fileRefNum, &rootNode); + ((NodeDescPtr)rootNode.buffer)->kind = kBTIndexNode; ((NodeDescPtr)rootNode.buffer)->height = ++btreePtr->treeDepth; diff --git a/bsd/hfs/hfscommon/Catalog/FileIDsServices.c b/bsd/hfs/hfscommon/Catalog/FileIDsServices.c index 923e90334..44e5996a0 100644 --- a/bsd/hfs/hfscommon/Catalog/FileIDsServices.c +++ b/bsd/hfs/hfscommon/Catalog/FileIDsServices.c @@ -65,6 +65,9 @@ OSErr ExchangeFileIDs( ExtendedVCB *vcb, ConstUTF8Param srcName, ConstUTF8Param err = BuildCatalogKeyUTF8(vcb, destID, destName, kUndefinedStrLen, &destKey, NULL); ReturnIfError(err); + err = BTCheckFreeSpace(GetFileControlBlock(vcb->extentsRefNum)); + ReturnIfError(err); + if ( isHFSPlus ) { //-- Step 1: Check the catalog nodes for extents diff --git a/bsd/hfs/hfscommon/Misc/FileExtentMapping.c b/bsd/hfs/hfscommon/Misc/FileExtentMapping.c index b294edd9a..6831d79c0 100644 --- a/bsd/hfs/hfscommon/Misc/FileExtentMapping.c +++ b/bsd/hfs/hfscommon/Misc/FileExtentMapping.c @@ -495,6 +495,12 @@ static OSErr CreateExtentRecord( err = noErr; *hint = 0; + + // XXXdbg - preflight that there's enough space + err = BTCheckFreeSpace(GetFileControlBlock(vcb->extentsRefNum)); + if (err) + return err; + MALLOC(btIterator, BTreeIterator *, sizeof(*btIterator), M_TEMP, M_WAITOK); bzero(btIterator, sizeof(*btIterator)); @@ -530,6 +536,8 @@ static OSErr CreateExtentRecord( if (err == noErr) *hint = btIterator->hint.nodeNum; + (void) BTFlushPath(GetFileControlBlock(vcb->extentsRefNum)); + FREE(btIterator, M_TEMP); return err; } @@ -545,6 +553,12 @@ OSErr DeleteExtentRecord( OSErr err; err = noErr; + + // XXXdbg - preflight that there's enough space + err = BTCheckFreeSpace(GetFileControlBlock(vcb->extentsRefNum)); + if (err) + return err; + MALLOC(btIterator, BTreeIterator *, sizeof(*btIterator), M_TEMP, M_WAITOK); bzero(btIterator, sizeof(*btIterator)); @@ -569,7 +583,8 @@ OSErr DeleteExtentRecord( } err = BTDeleteRecord(GetFileControlBlock(vcb->extentsRefNum), btIterator); - + (void) BTFlushPath(GetFileControlBlock(vcb->extentsRefNum)); + FREE(btIterator, M_TEMP); return err; } @@ -1730,6 +1745,12 @@ static OSErr UpdateExtentRecord ( // Need to find and change a record in Extents BTree // btFCB = GetFileControlBlock(vcb->extentsRefNum); + + // XXXdbg - preflight that there's enough space + err = BTCheckFreeSpace(btFCB); + if (err) + return err; + MALLOC(btIterator, BTreeIterator *, sizeof(*btIterator), M_TEMP, M_WAITOK); bzero(btIterator, sizeof(*btIterator)); @@ -1757,6 +1778,7 @@ static OSErr UpdateExtentRecord ( if (err == noErr) err = BTReplaceRecord(btFCB, btIterator, &btRecord, btRecordSize); + (void) BTFlushPath(btFCB); } else { // HFS Plus volume HFSPlusExtentRecord foundData; // The extent data actually found @@ -1776,6 +1798,7 @@ static OSErr UpdateExtentRecord ( BlockMoveData(extentData, &foundData, sizeof(HFSPlusExtentRecord)); err = BTReplaceRecord(btFCB, btIterator, &btRecord, btRecordSize); } + (void) BTFlushPath(btFCB); } FREE(btIterator, M_TEMP); } @@ -1887,3 +1910,58 @@ static Boolean ExtentsAreIntegral( return true; } + + +//_________________________________________________________________________________ +// +// Routine: NodesAreContiguous +// +// Purpose: Ensure that all b-tree nodes are contiguous on disk +// Called by BTOpenPath during volume mount 
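+//
+//			An extent passes when its start block and block count are
+//			both multiples of the blocks per node.  With 8K nodes on
+//			a 4K-block volume, mask = (8192 / 4096) - 1 = 1, so each
+//			extent must start and end on an even block boundary.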
+//_________________________________________________________________________________ + +Boolean NodesAreContiguous( + ExtendedVCB *vcb, + FCB *fcb, + UInt32 nodeSize) +{ + UInt32 mask; + UInt32 startBlock; + UInt32 blocksChecked; + UInt32 hint; + HFSPlusExtentKey key; + HFSPlusExtentRecord extents; + OSErr result; + Boolean lastExtentReached; + + + if (vcb->blockSize >= nodeSize) + return TRUE; + + mask = (nodeSize / vcb->blockSize) - 1; + + // check the local extents + (void) GetFCBExtentRecord(fcb, extents); + if ( !ExtentsAreIntegral(extents, mask, &blocksChecked, &lastExtentReached) ) + return FALSE; + + if (lastExtentReached || (SInt64)((SInt64)blocksChecked * (SInt64)vcb->blockSize) >= fcb->ff_size) + return TRUE; + + startBlock = blocksChecked; + + // check the overflow extents (if any) + while ( !lastExtentReached ) + { + result = FindExtentRecord(vcb, kDataForkType, fcb->ff_cp->c_fileid, startBlock, FALSE, &key, extents, &hint); + if (result) break; + + if ( !ExtentsAreIntegral(extents, mask, &blocksChecked, &lastExtentReached) ) + return FALSE; + + startBlock += blocksChecked; + } + + return TRUE; +} + diff --git a/bsd/hfs/hfscommon/Misc/VolumeAllocation.c b/bsd/hfs/hfscommon/Misc/VolumeAllocation.c index ae4fccf6f..4fe649921 100644 --- a/bsd/hfs/hfscommon/Misc/VolumeAllocation.c +++ b/bsd/hfs/hfscommon/Misc/VolumeAllocation.c @@ -476,7 +476,14 @@ static OSErr ReleaseBitmapBlock( if (bp) { if (dirty) { - bdwrite(bp); + // XXXdbg + struct hfsmount *hfsmp = VCBTOHFS(vcb); + + if (hfsmp->jnl) { + journal_modify_block_end(hfsmp->jnl, bp); + } else { + bdwrite(bp); + } } else { brelse(bp); } @@ -597,6 +604,7 @@ static OSErr BlockAllocateAny( UInt32 bitsPerBlock; UInt32 wordsPerBlock; Boolean dirty = false; + struct hfsmount *hfsmp = VCBTOHFS(vcb); // Since this routine doesn't wrap around if (maxBlocks > (endingBlock - startingBlock)) { @@ -678,6 +686,11 @@ static OSErr BlockAllocateAny( endingBlock = block + maxBlocks; // if we get this far, we've found enough } + // XXXdbg + if (hfsmp->jnl) { + journal_modify_block_start(hfsmp->jnl, (struct buf *)blockRef); + } + // // Allocate all of the consecutive blocks // @@ -709,6 +722,11 @@ static OSErr BlockAllocateAny( if (err != noErr) goto Exit; buffer = currCache; + // XXXdbg + if (hfsmp->jnl) { + journal_modify_block_start(hfsmp->jnl, (struct buf *)blockRef); + } + wordsLeft = wordsPerBlock; } @@ -845,6 +863,8 @@ static OSErr BlockMarkAllocated( UInt32 blockRef; UInt32 bitsPerBlock; UInt32 wordsPerBlock; + // XXXdbg + struct hfsmount *hfsmp = VCBTOHFS(vcb); // // Pre-read the bitmap block containing the first word of allocation @@ -866,6 +886,11 @@ static OSErr BlockMarkAllocated( wordsLeft = wordsPerBlock - wordIndexInBlock; } + // XXXdbg + if (hfsmp->jnl) { + journal_modify_block_start(hfsmp->jnl, (struct buf *)blockRef); + } + // // If the first block to allocate doesn't start on a word // boundary in the bitmap, then treat that first word @@ -909,6 +934,11 @@ static OSErr BlockMarkAllocated( err = ReadBitmapBlock(vcb, startingBlock, &buffer, &blockRef); if (err != noErr) goto Exit; + // XXXdbg + if (hfsmp->jnl) { + journal_modify_block_start(hfsmp->jnl, (struct buf *)blockRef); + } + // Readjust currentWord and wordsLeft currentWord = buffer; wordsLeft = wordsPerBlock; @@ -942,6 +972,11 @@ static OSErr BlockMarkAllocated( err = ReadBitmapBlock(vcb, startingBlock, &buffer, &blockRef); if (err != noErr) goto Exit; + // XXXdbg + if (hfsmp->jnl) { + journal_modify_block_start(hfsmp->jnl, (struct buf *)blockRef); + } + // Readjust 
currentWord and wordsLeft currentWord = buffer; wordsLeft = wordsPerBlock; @@ -995,6 +1030,8 @@ static OSErr BlockMarkFree( UInt32 blockRef; UInt32 bitsPerBlock; UInt32 wordsPerBlock; + // XXXdbg + struct hfsmount *hfsmp = VCBTOHFS(vcb); // // Pre-read the bitmap block containing the first word of allocation @@ -1002,6 +1039,11 @@ static OSErr BlockMarkFree( err = ReadBitmapBlock(vcb, startingBlock, &buffer, &blockRef); if (err != noErr) goto Exit; + // XXXdbg + if (hfsmp->jnl) { + journal_modify_block_start(hfsmp->jnl, (struct buf *)blockRef); + } + // // Initialize currentWord, and wordsLeft. // @@ -1058,6 +1100,11 @@ static OSErr BlockMarkFree( err = ReadBitmapBlock(vcb, startingBlock, &buffer, &blockRef); if (err != noErr) goto Exit; + // XXXdbg + if (hfsmp->jnl) { + journal_modify_block_start(hfsmp->jnl, (struct buf *)blockRef); + } + // Readjust currentWord and wordsLeft currentWord = buffer; wordsLeft = wordsPerBlock; @@ -1092,6 +1139,11 @@ static OSErr BlockMarkFree( err = ReadBitmapBlock(vcb, startingBlock, &buffer, &blockRef); if (err != noErr) goto Exit; + // XXXdbg + if (hfsmp->jnl) { + journal_modify_block_start(hfsmp->jnl, (struct buf *)blockRef); + } + // Readjust currentWord and wordsLeft currentWord = buffer; wordsLeft = wordsPerBlock; diff --git a/bsd/hfs/hfscommon/headers/BTreesInternal.h b/bsd/hfs/hfscommon/headers/BTreesInternal.h index a473cfceb..4ae9e7ad3 100644 --- a/bsd/hfs/hfscommon/headers/BTreesInternal.h +++ b/bsd/hfs/hfscommon/headers/BTreesInternal.h @@ -115,7 +115,8 @@ struct BlockDescriptor{ void *blockHeader; ByteCount blockSize; Boolean blockReadFromDisk; - Byte reserved[3]; + Byte isModified; // XXXdbg - for journaling + Byte reserved[2]; }; typedef struct BlockDescriptor BlockDescriptor; typedef BlockDescriptor *BlockDescPtr; @@ -338,6 +339,10 @@ extern OSStatus BTGetLastSync (FCB *filePtr, extern OSStatus BTSetLastSync (FCB *filePtr, UInt32 lastfsync ); +extern OSStatus BTCheckFreeSpace (FCB *filePtr); + +extern OSStatus BTHasContiguousNodes(FCB *filePtr); + #endif /* __APPLE_API_PRIVATE */ #endif /* KERNEL */ #endif // __BTREESINTERNAL__ diff --git a/bsd/hfs/hfscommon/headers/BTreesPrivate.h b/bsd/hfs/hfscommon/headers/BTreesPrivate.h index 4721f13a5..805c86346 100644 --- a/bsd/hfs/hfscommon/headers/BTreesPrivate.h +++ b/bsd/hfs/hfscommon/headers/BTreesPrivate.h @@ -382,6 +382,10 @@ OSStatus ReleaseNode (BTreeControlBlockPtr btreePtr, OSStatus TrashNode (BTreeControlBlockPtr btreePtr, NodePtr nodePtr ); +// XXXdbg +void ModifyBlockStart(FileReference vp, BlockDescPtr blockPtr); +// XXXdbg + OSStatus UpdateNode (BTreeControlBlockPtr btreePtr, NodePtr nodePtr, UInt32 transactionID, diff --git a/bsd/kern/kern_mman.c b/bsd/kern/kern_mman.c index ed614c238..3febc75bf 100644 --- a/bsd/kern/kern_mman.c +++ b/bsd/kern/kern_mman.c @@ -1086,6 +1086,10 @@ kern_return_t map_fd_funneled( if (fp->f_type != DTYPE_VNODE) return(KERN_INVALID_ARGUMENT); + + if (!(fp->f_flag & FREAD)) + return (KERN_PROTECTION_FAILURE); + vp = (struct vnode *)fp->f_data; if (vp->v_type != VREG) diff --git a/bsd/kern/qsort.c b/bsd/kern/qsort.c index 6ccb04112..d1505f175 100644 --- a/bsd/kern/qsort.c +++ b/bsd/kern/qsort.c @@ -58,7 +58,7 @@ #include -#include +//#include static inline char *med3 __P((char *, char *, char *, int (*)())); static inline void swapfunc __P((char *, char *, int, int)); @@ -113,6 +113,7 @@ med3(a, b, c, cmp) :(cmp(b, c) > 0 ? b : (cmp(a, c) < 0 ? 
a : c )); } +__private_extern__ void qsort(a, n, es, cmp) void *a; diff --git a/bsd/kern/ubc_subr.c b/bsd/kern/ubc_subr.c index 955b6b638..47cb041ab 100644 --- a/bsd/kern/ubc_subr.c +++ b/bsd/kern/ubc_subr.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 1999-2001 Apple Computer, Inc. All rights reserved. + * Copyright (c) 1999-2002 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -78,6 +78,62 @@ ubc_unlock(struct vnode *vp) simple_unlock(&vp->v_interlock); } +/* + * Serialize the requests to the VM + * Returns: + * 0 - Failure + * 1 - Sucessful in acquiring the lock + * 2 - Sucessful in acquiring the lock recursively + * do not call ubc_unbusy() + * [This is strange, but saves 4 bytes in struct ubc_info] + */ +static int +ubc_busy(struct vnode *vp) +{ + register struct ubc_info *uip; + + if (!UBCINFOEXISTS(vp)) + return (0); + + uip = vp->v_ubcinfo; + + while (ISSET(uip->ui_flags, UI_BUSY)) { + + if (uip->ui_owner == (void *)current_thread()) + return (2); + + SET(uip->ui_flags, UI_WANTED); + (void) tsleep((caddr_t)&vp->v_ubcinfo, PINOD, "ubcbusy", 0); + + if (!UBCINFOEXISTS(vp)) + return (0); + } + uip->ui_owner = (void *)current_thread(); + + SET(uip->ui_flags, UI_BUSY); + + return (1); +} + +static void +ubc_unbusy(struct vnode *vp) +{ + register struct ubc_info *uip; + + if (!UBCINFOEXISTS(vp)) { + wakeup((caddr_t)&vp->v_ubcinfo); + return; + } + uip = vp->v_ubcinfo; + CLR(uip->ui_flags, UI_BUSY); + uip->ui_owner = (void *)NULL; + + if (ISSET(uip->ui_flags, UI_WANTED)) { + CLR(uip->ui_flags, UI_WANTED); + wakeup((caddr_t)&vp->v_ubcinfo); + } +} + /* * Initialization of the zone for Unified Buffer Cache. */ @@ -139,6 +195,7 @@ ubc_info_init(struct vnode *vp) uip->ui_refcount = 1; uip->ui_size = 0; uip->ui_mapped = 0; + uip->ui_owner = (void *)NULL; ubc_lock(vp); } #if DIAGNOSTIC @@ -232,10 +289,20 @@ ubc_info_free(struct ubc_info *uip) void ubc_info_deallocate(struct ubc_info *uip) { + assert(uip->ui_refcount > 0); - if (uip->ui_refcount-- == 1) + if (uip->ui_refcount-- == 1) { + struct vnode *vp; + + vp = uip->ui_vnode; + if (ISSET(uip->ui_flags, UI_WANTED)) { + CLR(uip->ui_flags, UI_WANTED); + wakeup((caddr_t)&vp->v_ubcinfo); + } + ubc_info_free(uip); + } } /* @@ -339,12 +406,16 @@ ubc_uncache(struct vnode *vp) { kern_return_t kret; struct ubc_info *uip; + int recursed; memory_object_control_t control; memory_object_perf_info_data_t perf; if (!UBCINFOEXISTS(vp)) return (0); + if ((recursed = ubc_busy(vp)) == 0) + return (0); + uip = vp->v_ubcinfo; assert(uip != UBC_INFO_NULL); @@ -372,11 +443,15 @@ ubc_uncache(struct vnode *vp) if (kret != KERN_SUCCESS) { printf("ubc_uncache: memory_object_change_attributes_named " "kret = %d", kret); + if (recursed == 1) + ubc_unbusy(vp); return (0); } ubc_release_named(vp); + if (recursed == 1) + ubc_unbusy(vp); return (1); } @@ -506,15 +581,16 @@ memory_object_control_t ubc_getobject(struct vnode *vp, int flags) { struct ubc_info *uip; + int recursed; memory_object_control_t control; - uip = vp->v_ubcinfo; - if (UBCINVALID(vp)) return (0); - ubc_lock(vp); + if ((recursed = ubc_busy(vp)) == 0) + return (0); + uip = vp->v_ubcinfo; control = uip->ui_control; if ((flags & UBC_HOLDOBJECT) && (!ISSET(uip->ui_flags, UI_HASOBJREF))) { @@ -523,19 +599,21 @@ ubc_getobject(struct vnode *vp, int flags) * Take a temporary reference on the ubc info so that it won't go * away during our recovery attempt. 
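	 * The busy flag taken at entry serializes other ubc_busy()
	 * callers, but ubc_info_deallocate() does not itself honor it,
	 * so the reference is what actually keeps the ubc_info alive
	 * across memory_object_recover_named().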
*/ + ubc_lock(vp); uip->ui_refcount++; ubc_unlock(vp); if (memory_object_recover_named(control, TRUE) == KERN_SUCCESS) { - ubc_lock(vp); SET(uip->ui_flags, UI_HASOBJREF); - ubc_unlock(vp); } else { control = MEMORY_OBJECT_CONTROL_NULL; } + if (recursed == 1) + ubc_unbusy(vp); ubc_info_deallocate(uip); } else { - ubc_unlock(vp); + if (recursed == 1) + ubc_unbusy(vp); } return (control); @@ -770,15 +848,16 @@ int ubc_hold(struct vnode *vp) { struct ubc_info *uip; + int recursed; memory_object_control_t object; if (UBCINVALID(vp)) return (0); - if (!UBCINFOEXISTS(vp)) { + if ((recursed = ubc_busy(vp)) == 0) { /* must be invalid or dying vnode */ assert(UBCINVALID(vp) || - ((vp->v_flag & VXLOCK) || (vp->v_flag & VTERMINATE))); + ((vp->v_flag & VXLOCK) || (vp->v_flag & VTERMINATE))); return (0); } @@ -787,21 +866,23 @@ ubc_hold(struct vnode *vp) ubc_lock(vp); uip->ui_refcount++; + ubc_unlock(vp); if (!ISSET(uip->ui_flags, UI_HASOBJREF)) { - ubc_unlock(vp); - if (memory_object_recover_named(uip->ui_control, TRUE) != KERN_SUCCESS) { + if (memory_object_recover_named(uip->ui_control, TRUE) + != KERN_SUCCESS) { + if (recursed == 1) + ubc_unbusy(vp); ubc_info_deallocate(uip); return (0); } - ubc_lock(vp); SET(uip->ui_flags, UI_HASOBJREF); - ubc_unlock(vp); - } else { - ubc_unlock(vp); } + if (recursed == 1) + ubc_unbusy(vp); assert(uip->ui_refcount > 0); + return (1); } @@ -872,28 +953,30 @@ int ubc_release_named(struct vnode *vp) { struct ubc_info *uip; + int recursed; memory_object_control_t control; - kern_return_t kret; + kern_return_t kret = KERN_FAILURE; if (UBCINVALID(vp)) return (0); - if (!UBCINFOEXISTS(vp)) + if ((recursed = ubc_busy(vp)) == 0) return (0); - uip = vp->v_ubcinfo; /* can not release held or mapped vnodes */ if (ISSET(uip->ui_flags, UI_HASOBJREF) && - (uip->ui_refcount == 1) && !uip->ui_mapped) { + (uip->ui_refcount == 1) && !uip->ui_mapped) { control = uip->ui_control; assert(control); CLR(uip->ui_flags, UI_HASOBJREF); kret = memory_object_release_name(control, MEMORY_OBJECT_RESPECT_CACHE); - return ((kret != KERN_SUCCESS) ? 0 : 1); - } else - return (0); + } + + if (recursed == 1) + ubc_unbusy(vp); + return ((kret != KERN_SUCCESS) ? 0 : 1); } /* diff --git a/bsd/miscfs/specfs/spec_vnops.c b/bsd/miscfs/specfs/spec_vnops.c index 52eea7f81..f44cd9323 100644 --- a/bsd/miscfs/specfs/spec_vnops.c +++ b/bsd/miscfs/specfs/spec_vnops.c @@ -555,7 +555,8 @@ loop: s = splbio(); for (bp = vp->v_dirtyblkhd.lh_first; bp; bp = nbp) { nbp = bp->b_vnbufs.le_next; - if ((bp->b_flags & B_BUSY)) + // XXXdbg - don't flush locked blocks. they may be journaled. 
+ if ((bp->b_flags & B_BUSY) || (bp->b_flags & B_LOCKED)) continue; if ((bp->b_flags & B_DELWRI) == 0) panic("spec_fsync: not dirty"); diff --git a/bsd/nfs/nfs_bio.c b/bsd/nfs/nfs_bio.c index 7a7394c78..4b77d6637 100644 --- a/bsd/nfs/nfs_bio.c +++ b/bsd/nfs/nfs_bio.c @@ -115,7 +115,8 @@ nfs_bioread(vp, uio, ioflag, cred, getpages) int getpages; { register struct nfsnode *np = VTONFS(vp); - register int biosize, diff, i; + register int biosize, i; + off_t diff; struct buf *bp = 0, *rabp; struct vattr vattr; struct proc *p; @@ -268,7 +269,7 @@ again: bufsize = biosize; if ((off_t)(lbn + 1) * biosize > np->n_size && (off_t)(lbn + 1) * biosize - np->n_size < biosize) { - bufsize = np->n_size - lbn * biosize; + bufsize = np->n_size - (off_t)lbn * biosize; bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); } bp = nfs_getcacheblk(vp, lbn, bufsize, p, operation); @@ -876,7 +877,7 @@ nfs_getcacheblk(vp, bn, size, p, operation) bp = getblk(vp, bn, size, 0, 0, operation); if( vp->v_type == VREG) - bp->b_blkno = (bn * biosize) / DEV_BSIZE; + bp->b_blkno = ((off_t)bn * biosize) / DEV_BSIZE; return (bp); } diff --git a/bsd/nfs/nfs_socket.c b/bsd/nfs/nfs_socket.c index ef42d4683..8038b43a6 100644 --- a/bsd/nfs/nfs_socket.c +++ b/bsd/nfs/nfs_socket.c @@ -2204,7 +2204,7 @@ nfsrv_getstream(slp, waitflag) register struct mbuf *m, **mpp; register char *cp1, *cp2; register int len; - struct mbuf *om, *m2, *recm = 0; + struct mbuf *om, *m2, *recm; u_long recmark; if (slp->ns_flag & SLP_GETSTREAM) @@ -2249,7 +2249,11 @@ nfsrv_getstream(slp, waitflag) /* * Now get the record part. + * + * Note that slp->ns_reclen may be 0. Linux sometimes + * generates 0-length RPCs */ + recm = NULL; if (slp->ns_cc == slp->ns_reclen) { recm = slp->ns_raw; slp->ns_raw = slp->ns_rawend = (struct mbuf *)0; diff --git a/bsd/nfs/nfs_vnops.c b/bsd/nfs/nfs_vnops.c index 2d516acf2..e8c78eee8 100644 --- a/bsd/nfs/nfs_vnops.c +++ b/bsd/nfs/nfs_vnops.c @@ -4512,8 +4512,8 @@ again: #if 0 /* (removed for UBC) */ bufsize = biosize; - if ((lbn + 1) * biosize > np->n_size) { - bufsize = np->n_size - lbn * biosize; + if ((off_t)(lbn + 1) * biosize > np->n_size) { + bufsize = np->n_size - (off_t)lbn * biosize; bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); } #endif @@ -4618,7 +4618,7 @@ nfs_blktooff(ap) biosize = min(vp->v_mount->mnt_stat.f_iosize, PAGE_SIZE); /* nfs_bio.c */ - *ap->a_offset = (off_t)(ap->a_lblkno * biosize); + *ap->a_offset = (off_t)ap->a_lblkno * biosize; return (0); } diff --git a/bsd/sys/buf.h b/bsd/sys/buf.h index fb456c562..d051d11f0 100644 --- a/bsd/sys/buf.h +++ b/bsd/sys/buf.h @@ -132,6 +132,15 @@ struct buf { #define b_trans_head b_freelist.tqe_prev #define b_trans_next b_freelist.tqe_next #define b_real_bp b_saveaddr +#define b_iostate b_rcred + +/* journaling uses this cluster i/o field for its own + * purposes because meta data buf's should never go + * through the clustering code. + */ +#define b_transaction b_vectorlist + + /* * These flags are kept in b_flags. @@ -163,7 +172,7 @@ struct buf { #define B_WRITE 0x00000000 /* Write buffer (pseudo flag). */ #define B_WRITEINPROG 0x01000000 /* Write in progress. 
*/ #define B_HDRALLOC 0x02000000 /* zone allocated buffer header */ -#define B_UNUSED1 0x04000000 /* Unused bit */ +#define B_NORELSE 0x04000000 /* don't brelse() in bwrite() */ #define B_NEED_IODONE 0x08000000 /* need to do a biodone on the */ /* real_bp associated with a cluster_io */ diff --git a/bsd/sys/disk.h b/bsd/sys/disk.h index 74b269c58..65a4bffdd 100644 --- a/bsd/sys/disk.h +++ b/bsd/sys/disk.h @@ -44,8 +44,12 @@ typedef struct #define DKIOCGETMAXBLOCKCOUNTREAD _IOR('d', 64, u_int64_t) #define DKIOCGETMAXBLOCKCOUNTWRITE _IOR('d', 65, u_int64_t) +#define DKIOCGETMAXBYTECOUNTREAD _IOR('d', 70, u_int64_t) +#define DKIOCGETMAXBYTECOUNTWRITE _IOR('d', 71, u_int64_t) #define DKIOCGETMAXSEGMENTCOUNTREAD _IOR('d', 66, u_int64_t) #define DKIOCGETMAXSEGMENTCOUNTWRITE _IOR('d', 67, u_int64_t) +#define DKIOCGETMAXSEGMENTBYTECOUNTREAD _IOR('d', 68, u_int64_t) +#define DKIOCGETMAXSEGMENTBYTECOUNTWRITE _IOR('d', 69, u_int64_t) #ifdef KERNEL #define DKIOCSETBLOCKSIZE _IOW('d', 24, u_int32_t) diff --git a/bsd/sys/malloc.h b/bsd/sys/malloc.h index fb05b8734..751de10e5 100644 --- a/bsd/sys/malloc.h +++ b/bsd/sys/malloc.h @@ -164,8 +164,9 @@ #define M_IP6MISC 88 /* IPv6 misc. memory */ #define M_TSEGQ 89 /* TCP segment queue entry */ #define M_IGMP 90 +#define M_JOURNAL 91 /* VFS Journaling code */ -#define M_LAST 91 /* Must be last type + 1 */ +#define M_LAST 92 /* Must be last type + 1 */ /* Strings corresponding to types of memory */ /* Must be in synch with the #defines above */ @@ -258,9 +259,10 @@ "UDF mount" /* 85 M_UDFMNT */ \ "IPv6 NDP", /* 86 M_IP6NDP */ \ "IPv6 options", /* 87 M_IP6OPT */ \ - "IPv6 Misc" /* 88 M_IP6MISC */\ - "TCP Segment Q" /* 89 M_TSEGQ */\ - "IGMP state" /* 90 M_IGMP */\ + "IPv6 Misc", /* 88 M_IP6MISC */\ + "TCP Segment Q",/* 89 M_TSEGQ */\ + "IGMP state", /* 90 M_IGMP */\ + "Journaling" /* 91 M_JOURNAL */\ } struct kmemstats { diff --git a/bsd/sys/mount.h b/bsd/sys/mount.h index a2840d9bc..2b8e1e05c 100644 --- a/bsd/sys/mount.h +++ b/bsd/sys/mount.h @@ -159,6 +159,7 @@ struct mount { #define MNT_DONTBROWSE 0x00100000 /* file system is not appropriate path to user data */ #define MNT_UNKNOWNPERMISSIONS 0x00200000 /* no known mapping for uid/gid in permissions information on disk */ #define MNT_AUTOMOUNTED 0x00400000 /* filesystem was mounted by automounter */ +#define MNT_JOURNALED 0x00800000 /* filesystem is journaled */ /* * NFS export related mount flags. @@ -188,7 +189,7 @@ struct mount { MNT_DEFEXPORTED | MNT_EXPORTANON| MNT_EXKERB | \ MNT_LOCAL | MNT_QUOTA | \ MNT_ROOTFS | MNT_DOVOLFS | MNT_DONTBROWSE | \ - MNT_UNKNOWNPERMISSIONS | MNT_AUTOMOUNTED | MNT_FIXEDSCRIPTENCODING ) + MNT_UNKNOWNPERMISSIONS | MNT_AUTOMOUNTED | MNT_JOURNALED | MNT_FIXEDSCRIPTENCODING ) /* * External filesystem command modifier flags. * Unmount can use the MNT_FORCE flag. 
diff --git a/bsd/sys/ubc.h b/bsd/sys/ubc.h index d243bb97f..e6a2a189d 100644 --- a/bsd/sys/ubc.h +++ b/bsd/sys/ubc.h @@ -60,6 +60,7 @@ struct ubc_info { int ui_refcount;/* ref count on the ubc_info */ off_t ui_size; /* file size for the vnode */ long ui_mapped; /* is it currently mapped */ + void *ui_owner; /* for recursive ubc_busy */ }; /* Defines for ui_flags */ @@ -69,6 +70,8 @@ struct ubc_info { #define UI_HASOBJREF 0x00000004 /* hold a reference on object */ #define UI_WASMAPPED 0x00000008 /* vnode was mapped */ #define UI_DONTCACHE 0x00000010 /* do not cache object */ +#define UI_BUSY 0x00000020 /* for VM synchronization */ +#define UI_WANTED 0x00000040 /* for VM synchronization */ #endif /* __APPLE_API_PRIVATE */ diff --git a/bsd/vfs/Makefile b/bsd/vfs/Makefile index ce2bd8753..1ed043ac2 100644 --- a/bsd/vfs/Makefile +++ b/bsd/vfs/Makefile @@ -20,7 +20,7 @@ EXPINC_SUBDIRS_PPC = \ EXPINC_SUBDIRS_I386 = \ DATAFILES = \ - vfs_support.h + vfs_support.h vfs_journal.h INSTALL_MI_LIST = ${DATAFILES} diff --git a/bsd/vfs/vfs_bio.c b/bsd/vfs/vfs_bio.c index c11c03bea..57c206760 100644 --- a/bsd/vfs/vfs_bio.c +++ b/bsd/vfs/vfs_bio.c @@ -180,6 +180,7 @@ simple_lock_data_t bufhashlist_slock; /* lock on buffer hash list */ /* number of per vnode, "in flight" buffer writes */ #define BUFWRITE_THROTTLE 9 + /* * Time in seconds before a buffer on a list is * considered as a stale buffer @@ -211,9 +212,9 @@ binshash(struct buf *bp, struct bufhashhdr *dp) simple_lock(&bufhashlist_slock); -#if 0 - if(incore(bp->b_vp, bp->b_lblkno)) - panic("binshash: already incore"); +#if 0 + if((bad = incore(bp->b_vp, bp->b_lblkno))) + panic("binshash: already incore bp 0x%x, bad 0x%x\n", bp, bad); #endif /* 0 */ BHASHENTCHECK(bp); @@ -459,6 +460,7 @@ bio_doread(vp, blkno, size, cred, async, queuetype) */ bp->b_rcred = crdup(cred); } + VOP_STRATEGY(bp); trace(TR_BREADMISS, pack(vp, size), blkno); @@ -627,7 +629,12 @@ bwrite(bp) p->p_stats->p_ru.ru_oublock++; /* XXX */ /* Release the buffer. */ - brelse(bp); + // XXXdbg - only if the unused bit is set + if (!ISSET(bp->b_flags, B_NORELSE)) { + brelse(bp); + } else { + CLR(bp->b_flags, B_NORELSE); + } return (rv); } else { @@ -707,7 +714,10 @@ bdwrite_internal(bp, return_error) if (nbdwrite < 0) panic("bdwrite: Negative nbdwrite"); - if (nbdwrite > ((nbuf/4)*3)) { + // can't do a bawrite() if the LOCKED bit is set because the + // buffer is part of a transaction and can't go to disk until + // the LOCKED bit is cleared. + if (!ISSET(bp->b_flags, B_LOCKED) && nbdwrite > ((nbuf/4)*3)) { if (return_error) return (EAGAIN); else @@ -807,6 +817,27 @@ brelse(bp) trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno); + // if we're invalidating a buffer that has the B_CALL bit + // set then call the b_iodone function so it gets cleaned + // up properly. + // + if (ISSET(bp->b_flags, B_META) && ISSET(bp->b_flags, B_INVAL)) { + if (ISSET(bp->b_flags, B_CALL) && !ISSET(bp->b_flags, B_DELWRI)) { + panic("brelse: CALL flag set but not DELWRI! bp 0x%x\n", bp); + } + if (ISSET(bp->b_flags, B_CALL)) { /* if necessary, call out */ + void (*iodone_func)(struct buf *) = bp->b_iodone; + + CLR(bp->b_flags, B_CALL); /* but note callout done */ + bp->b_iodone = NULL; + + if (iodone_func == NULL) { + panic("brelse: bp @ 0x%x has NULL b_iodone!\n", bp); + } + (*iodone_func)(bp); + } + } + /* IO is done. 
Cleanup the UPL state */ if (!ISSET(bp->b_flags, B_META) && UBCINFOEXISTS(bp->b_vp) && bp->b_bufsize) { @@ -1121,6 +1152,10 @@ start: brelse(bp); goto start; } + /* + * NOTE: YOU CAN NOT BLOCK UNTIL binshash() HAS BEEN + * CALLED! BE CAREFUL. + */ /* * if it is meta, the queue may be set to other @@ -1451,7 +1486,7 @@ allocbuf(bp, size) } if (ISSET(bp->b_flags, B_META) && (bp->b_data == 0)) - panic("allocbuf: bp->b_data is NULL"); + panic("allocbuf: bp->b_data is NULL, buf @ 0x%x", bp); bp->b_bufsize = desired_size; bp->b_bcount = size; @@ -1603,11 +1638,15 @@ start: panic("getnewbuf: null bp"); found: + if (ISSET(bp->b_flags, B_LOCKED)) { + panic("getnewbuf: bp @ 0x%x is LOCKED! (flags 0x%x)\n", bp, bp->b_flags); + } + if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef) - panic("getnewbuf: le_prev is deadbeef"); + panic("getnewbuf: le_prev is deadbeef, buf @ 0x%x", bp); if(ISSET(bp->b_flags, B_BUSY)) - panic("getnewbuf reusing BUSY buf"); + panic("getnewbuf reusing BUSY buf @ 0x%x", bp); /* Clean it */ if (bcleanbuf(bp)) { @@ -1822,8 +1861,16 @@ biodone(bp) } if (ISSET(bp->b_flags, B_CALL)) { /* if necessary, call out */ + void (*iodone_func)(struct buf *) = bp->b_iodone; + CLR(bp->b_flags, B_CALL); /* but note callout done */ - (*bp->b_iodone)(bp); + bp->b_iodone = NULL; + + if (iodone_func == NULL) { + panic("biodone: bp @ 0x%x has NULL b_iodone!\n", bp); + } else { + (*iodone_func)(bp); + } } else if (ISSET(bp->b_flags, B_ASYNC)) /* if async, release it */ brelse(bp); else { /* or just wakeup the buffer */ @@ -1932,6 +1979,7 @@ alloc_io_buf(vp, priv) /* clear out various fields */ bp->b_flags = B_BUSY; bp->b_blkno = bp->b_lblkno = 0; + bp->b_iodone = 0; bp->b_error = 0; bp->b_resid = 0; @@ -2344,3 +2392,76 @@ doit: (void) thread_funnel_set(kernel_flock, funnel_state); } + + +static int +bp_cmp(void *a, void *b) +{ + struct buf *bp_a = *(struct buf **)a, + *bp_b = *(struct buf **)b; + daddr_t res; + + // don't have to worry about negative block + // numbers so this is ok to do. + // + res = (bp_a->b_blkno - bp_b->b_blkno); + + return (int)res; +} + +#define NFLUSH 32 + +int +bflushq(int whichq, struct mount *mp) +{ + struct buf *bp, *next; + int i, buf_count, s; + int counter=0, total_writes=0; + static struct buf *flush_table[NFLUSH]; + + if (whichq < 0 || whichq >= BQUEUES) { + return; + } + + + restart: + bp = TAILQ_FIRST(&bufqueues[whichq]); + for(buf_count=0; bp; bp=next) { + next = bp->b_freelist.tqe_next; + + if (bp->b_vp == NULL || bp->b_vp->v_mount != mp) { + continue; + } + + if ((bp->b_flags & B_DELWRI) && (bp->b_flags & B_BUSY) == 0) { + if (whichq != BQ_LOCKED && (bp->b_flags & B_LOCKED)) { + panic("bflushq: bp @ 0x%x is locked!\n", bp); + } + + bremfree(bp); + bp->b_flags |= B_BUSY; + flush_table[buf_count] = bp; + buf_count++; + total_writes++; + + if (buf_count >= NFLUSH) { + qsort(flush_table, buf_count, sizeof(struct buf *), bp_cmp); + + for(i=0; i < buf_count; i++) { + bawrite(flush_table[i]); + } + + goto restart; + } + } + } + + if (buf_count > 0) { + qsort(flush_table, buf_count, sizeof(struct buf *), bp_cmp); + for(i=0; i < buf_count; i++) { + bawrite(flush_table[i]); + } + } + + return total_writes; +} diff --git a/bsd/vfs/vfs_cluster.c b/bsd/vfs/vfs_cluster.c index df2e73751..49b0938bb 100644 --- a/bsd/vfs/vfs_cluster.c +++ b/bsd/vfs/vfs_cluster.c @@ -1,4 +1,3 @@ - /* * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. 
* @@ -80,6 +79,16 @@ #define CL_NOZERO 0x80 #define CL_PAGEIN 0x100 #define CL_DEV_MEMORY 0x200 +#define CL_PRESERVE 0x400 + +struct clios { + u_int io_completed; + u_int io_issued; + off_t io_offset; + int io_error; + int io_wanted; +}; + static void cluster_zero(upl_t upl, vm_offset_t upl_offset, int size, struct buf *bp); @@ -93,8 +102,11 @@ static int cluster_nocopy_read(struct vnode *vp, struct uio *uio, static int cluster_nocopy_write(struct vnode *vp, struct uio *uio, off_t newEOF, int devblocksize, int flags); static int cluster_phys_read(struct vnode *vp, struct uio *uio, - off_t filesize); -static int cluster_phys_write(struct vnode *vp, struct uio *uio, off_t newEOF); + off_t filesize, int devblocksize, int flags); +static int cluster_phys_write(struct vnode *vp, struct uio *uio, + off_t newEOF, int devblocksize, int flags); +static int cluster_align_phys_io(struct vnode *vp, struct uio *uio, + vm_offset_t usr_paddr, int xsize, int devblocksize, int flags); static int cluster_push_x(struct vnode *vp, off_t EOF, daddr_t first, daddr_t last, int can_delay); static int cluster_try_push(struct vnode *vp, off_t newEOF, int can_delay, int push_all); @@ -116,12 +128,14 @@ cluster_iodone(bp) int total_resid; int upl_offset; int zero_offset; + int l_blkno; upl_t upl; struct buf *cbp; struct buf *cbp_head; struct buf *cbp_next; struct buf *real_bp; struct vnode *vp; + struct clios *iostate; int commit_size; int pg_offset; @@ -155,6 +169,8 @@ cluster_iodone(bp) real_bp = cbp->b_real_bp; vp = cbp->b_vp; zero_offset= cbp->b_validend; + l_blkno = cbp->b_lblkno; + iostate = (struct clios *)cbp->b_iostate; while (cbp) { if (cbp->b_vectorcount > 1) @@ -172,13 +188,34 @@ cluster_iodone(bp) cbp = cbp_next; } + if (zero_offset) + cluster_zero(upl, zero_offset, PAGE_SIZE - (zero_offset & PAGE_MASK), real_bp); + if ((vp->v_flag & VTHROTTLED) && (vp->v_numoutput <= (ASYNC_THROTTLE / 3))) { vp->v_flag &= ~VTHROTTLED; wakeup((caddr_t)&vp->v_numoutput); } - if (zero_offset) - cluster_zero(upl, zero_offset, PAGE_SIZE - (zero_offset & PAGE_MASK), real_bp); + if (iostate) { + if (error) { + off_t error_offset; + + error_offset = (off_t)l_blkno * PAGE_SIZE_64; + if (iostate->io_error == 0) { + iostate->io_error = error; + iostate->io_offset = error_offset; + } else { + if (error_offset < iostate->io_offset) + iostate->io_offset = error_offset; + } + } + iostate->io_completed += total_size; + + if (iostate->io_wanted) { + iostate->io_wanted = 0; + wakeup((caddr_t)&iostate->io_wanted); + } + } if ((b_flags & B_NEED_IODONE) && real_bp) { if (error) { real_bp->b_flags |= B_ERROR; @@ -192,13 +229,15 @@ cluster_iodone(bp) error = EIO; if (b_flags & B_COMMIT_UPL) { - pg_offset = upl_offset & PAGE_MASK; + pg_offset = upl_offset & PAGE_MASK; commit_size = (((pg_offset + total_size) + (PAGE_SIZE - 1)) / PAGE_SIZE) * PAGE_SIZE; - if (error || (b_flags & B_NOCACHE)) { + if (error || (b_flags & B_NOCACHE) || ((b_flags & B_PHYS) && !(b_flags & B_READ))) { int upl_abort_code; - if ((b_flags & B_PAGEOUT) && (error != ENXIO)) /* transient error */ + if (b_flags & B_PHYS) + upl_abort_code = UPL_ABORT_FREE_ON_EMPTY; + else if ((b_flags & B_PAGEOUT) && (error != ENXIO)) /* transient error */ upl_abort_code = UPL_ABORT_FREE_ON_EMPTY; else if (b_flags & B_PGIN) upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR; @@ -215,7 +254,9 @@ cluster_iodone(bp) } else { int upl_commit_flags = UPL_COMMIT_FREE_ON_EMPTY; - if ( !(b_flags & B_PAGEOUT)) + if (b_flags & B_PHYS) + upl_commit_flags |= UPL_COMMIT_SET_DIRTY; + else if ( 
!(b_flags & B_PAGEOUT)) upl_commit_flags |= UPL_COMMIT_CLEAR_DIRTY; if (b_flags & B_AGE) upl_commit_flags |= UPL_COMMIT_INACTIVATE; @@ -271,7 +312,7 @@ cluster_zero(upl, upl_offset, size, bp) } static int -cluster_io(vp, upl, upl_offset, f_offset, non_rounded_size, devblocksize, flags, real_bp) +cluster_io(vp, upl, upl_offset, f_offset, non_rounded_size, devblocksize, flags, real_bp, iostate) struct vnode *vp; upl_t upl; vm_offset_t upl_offset; @@ -280,10 +321,12 @@ cluster_io(vp, upl, upl_offset, f_offset, non_rounded_size, devblocksize, flags, int devblocksize; int flags; struct buf *real_bp; + struct clios *iostate; { struct buf *cbp; struct iovec *iovp; - u_int size; + u_int size; + u_int io_size; int io_flags; int error = 0; int retval = 0; @@ -297,6 +340,7 @@ cluster_io(vp, upl, upl_offset, f_offset, non_rounded_size, devblocksize, flags, u_int max_vectors; int priv; int zero_offset = 0; + u_int first_lblkno; if (flags & CL_READ) { io_flags = (B_VECTORLIST | B_READ); @@ -309,14 +353,18 @@ cluster_io(vp, upl, upl_offset, f_offset, non_rounded_size, devblocksize, flags, } pl = ubc_upl_pageinfo(upl); - if (flags & CL_ASYNC) - io_flags |= (B_CALL | B_ASYNC); if (flags & CL_AGE) io_flags |= B_AGE; if (flags & CL_DUMP) io_flags |= B_NOCACHE; if (flags & CL_PAGEIN) io_flags |= B_PGIN; + if (flags & CL_PAGEOUT) + io_flags |= B_PAGEOUT; + if (flags & CL_COMMIT) + io_flags |= B_COMMIT_UPL; + if (flags & CL_PRESERVE) + io_flags |= B_PHYS; if (devblocksize) size = (non_rounded_size + (devblocksize - 1)) & ~(devblocksize - 1); @@ -338,7 +386,6 @@ cluster_io(vp, upl, upl_offset, f_offset, non_rounded_size, devblocksize, flags, zero_offset = upl_offset + non_rounded_size; } while (size) { - size_t io_size; int vsize; int i; int pl_index; @@ -352,7 +399,7 @@ cluster_io(vp, upl, upl_offset, f_offset, non_rounded_size, devblocksize, flags, else io_size = size; - if (error = VOP_CMAP(vp, f_offset, io_size, &blkno, &io_size, NULL)) { + if (error = VOP_CMAP(vp, f_offset, io_size, &blkno, (size_t *)&io_size, NULL)) { if (error == EOPNOTSUPP) panic("VOP_CMAP Unimplemented"); break; @@ -587,8 +634,10 @@ cluster_io(vp, upl, upl_offset, f_offset, non_rounded_size, devblocksize, flags, if (error) break; - if (flags & CL_ASYNC) - cbp->b_iodone = (void *)cluster_iodone; + if (flags & CL_ASYNC) { + cbp->b_flags |= (B_CALL | B_ASYNC); + cbp->b_iodone = (void *)cluster_iodone; + } cbp->b_flags |= io_flags; cbp->b_lblkno = lblkno; @@ -598,6 +647,9 @@ cluster_io(vp, upl, upl_offset, f_offset, non_rounded_size, devblocksize, flags, cbp->b_uploffset = upl_offset; cbp->b_trans_next = (struct buf *)0; + if (cbp->b_iostate = (void *)iostate) + iostate->io_issued += io_size; + if (flags & CL_READ) KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 26)) | DBG_FUNC_NONE, cbp->b_lblkno, cbp->b_blkno, upl_offset, io_size, 0); @@ -631,13 +683,6 @@ cluster_io(vp, upl, upl_offset, f_offset, non_rounded_size, devblocksize, flags, * then go ahead and issue the I/O */ start_io: - if (flags & CL_COMMIT) - cbp_head->b_flags |= B_COMMIT_UPL; - if (flags & CL_PAGEOUT) - cbp_head->b_flags |= B_PAGEOUT; - if (flags & CL_PAGEIN) - cbp_head->b_flags |= B_PGIN; - if (real_bp) { cbp_head->b_flags |= B_NEED_IODONE; cbp_head->b_real_bp = real_bp; @@ -687,6 +732,8 @@ start_io: if (error) { int abort_size; + io_size = 0; + for (cbp = cbp_head; cbp;) { struct buf * cbp_next; @@ -694,21 +741,36 @@ start_io: _FREE(cbp->b_vectorlist, M_SEGMENT); upl_offset -= cbp->b_bcount; size += cbp->b_bcount; + io_size += cbp->b_bcount; cbp_next = cbp->b_trans_next; 
free_io_buf(cbp); cbp = cbp_next; } + if (iostate) { + if (iostate->io_error == 0) { + iostate->io_error = error; + iostate->io_offset = f_offset - (off_t)io_size; + } + iostate->io_issued -= io_size; + + if (iostate->io_wanted) { + iostate->io_wanted = 0; + wakeup((caddr_t)&iostate->io_wanted); + } + } pg_offset = upl_offset & PAGE_MASK; abort_size = ((size + pg_offset + (PAGE_SIZE - 1)) / PAGE_SIZE) * PAGE_SIZE; if (flags & CL_COMMIT) { int upl_abort_code; - if ((flags & CL_PAGEOUT) && (error != ENXIO)) /* transient error */ + if (flags & CL_PRESERVE) + upl_abort_code = UPL_ABORT_FREE_ON_EMPTY; + else if ((flags & CL_PAGEOUT) && (error != ENXIO)) /* transient error */ upl_abort_code = UPL_ABORT_FREE_ON_EMPTY; else if (flags & CL_PAGEIN) - upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR; + upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR; else upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES; @@ -910,7 +972,7 @@ cluster_pageout(vp, upl, upl_offset, f_offset, size, filesize, devblocksize, fla } return (cluster_io(vp, upl, upl_offset, f_offset, io_size, devblocksize, - local_flags, (struct buf *)0)); + local_flags, (struct buf *)0, (struct clios *)0)); } int @@ -968,7 +1030,7 @@ cluster_pagein(vp, upl, upl_offset, f_offset, size, filesize, devblocksize, flag size - (upl_offset + rounded_size), UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR); retval = cluster_io(vp, upl, upl_offset, f_offset, io_size, devblocksize, - local_flags | CL_READ | CL_PAGEIN, (struct buf *)0); + local_flags | CL_READ | CL_PAGEIN, (struct buf *)0, (struct clios *)0); if (retval == 0) { int b_lblkno; @@ -1010,7 +1072,7 @@ cluster_bp(bp) f_offset = ubc_blktooff(bp->b_vp, bp->b_lblkno); - return (cluster_io(bp->b_vp, bp->b_pagelist, 0, f_offset, bp->b_bcount, 0, flags, bp)); + return (cluster_io(bp->b_vp, bp->b_pagelist, 0, f_offset, bp->b_bcount, 0, flags, bp, (struct clios *)0)); } int @@ -1037,7 +1099,7 @@ cluster_write(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags) int retval = 0; - if ((!uio) || (uio->uio_segflg != UIO_USERSPACE) || (!(vp->v_flag & VNOCACHE_DATA))) + if ( (!(vp->v_flag & VNOCACHE_DATA)) || (!uio) || (uio->uio_segflg != UIO_USERSPACE)) { retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags); return(retval); @@ -1074,14 +1136,6 @@ cluster_write(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags) if (upl_flags & UPL_PHYS_CONTIG) { - /* - * since the interface to the IOKit below us uses physical block #'s and - * block counts to specify the I/O, we can't handle anything that isn't - * devblocksize aligned - */ - if ((uio->uio_offset & (devblocksize - 1)) || (uio->uio_resid & (devblocksize - 1))) - return(EINVAL); - if (flags & IO_HEADZEROFILL) { flags &= ~IO_HEADZEROFILL; @@ -1090,7 +1144,7 @@ cluster_write(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags) return(retval); } - retval = cluster_phys_write(vp, uio, newEOF); + retval = cluster_phys_write(vp, uio, newEOF, devblocksize, flags); if (uio->uio_resid == 0 && (flags & IO_TAILZEROFILL)) { @@ -1172,6 +1226,7 @@ cluster_write(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags) return(retval); } + static int cluster_nocopy_write(vp, uio, newEOF, devblocksize, flags) struct vnode *vp; @@ -1326,7 +1381,7 @@ cluster_nocopy_write(vp, uio, newEOF, devblocksize, flags) (int)upl_offset, (int)uio->uio_offset, io_size, 0, 0); error = cluster_io(vp, upl, upl_offset, uio->uio_offset, - io_size, devblocksize, 0, (struct buf *)0); + io_size, 
devblocksize, 0, (struct buf *)0, (struct clios *)0); if (error == 0) { /* @@ -1361,14 +1416,20 @@ cluster_nocopy_write(vp, uio, newEOF, devblocksize, flags) return (error); } + static int -cluster_phys_write(vp, uio, newEOF) +cluster_phys_write(vp, uio, newEOF, devblocksize, flags) struct vnode *vp; struct uio *uio; off_t newEOF; + int devblocksize; + int flags; { + upl_page_info_t *pl; + vm_offset_t src_paddr; upl_t upl; vm_offset_t upl_offset; + int tail_size; int io_size; int upl_size; int upl_needed_size; @@ -1399,49 +1460,78 @@ cluster_phys_write(vp, uio, newEOF) (vm_offset_t)iov->iov_base & ~PAGE_MASK, &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0); - if (kret != KERN_SUCCESS) - { - /* cluster_phys_write: failed to get pagelist */ - /* note: return kret here */ + if (kret != KERN_SUCCESS) { + /* + * cluster_phys_write: failed to get pagelist + * note: return kret here + */ return(EINVAL); - } - + } /* * Consider the possibility that upl_size wasn't satisfied. * This is a failure in the physical memory case. */ - if (upl_size < upl_needed_size) - { - kernel_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY); - return(EINVAL); - } + if (upl_size < upl_needed_size) { + kernel_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY); + return(EINVAL); + } + pl = ubc_upl_pageinfo(upl); - /* - * issue a synchronous write to cluster_io - */ + src_paddr = (vm_offset_t)upl_phys_page(pl, 0) + ((vm_offset_t)iov->iov_base & PAGE_MASK); - error = cluster_io(vp, upl, upl_offset, uio->uio_offset, - io_size, 0, CL_DEV_MEMORY, (struct buf *)0); + while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) { + int head_size; - if (error == 0) { - /* - * The cluster_io write completed successfully, - * update the uio structure and commit. 
- */ + head_size = devblocksize - (int)(uio->uio_offset & (devblocksize - 1)); - ubc_upl_commit_range(upl, 0, upl_size, UPL_COMMIT_FREE_ON_EMPTY); - - iov->iov_base += io_size; - iov->iov_len -= io_size; - uio->uio_resid -= io_size; - uio->uio_offset += io_size; + if (head_size > io_size) + head_size = io_size; + + error = cluster_align_phys_io(vp, uio, src_paddr, head_size, devblocksize, 0); + + if (error) { + ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY); + + return(EINVAL); + } + upl_offset += head_size; + src_paddr += head_size; + io_size -= head_size; } - else - ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY); + tail_size = io_size & (devblocksize - 1); + io_size -= tail_size; + + if (io_size) { + /* + * issue a synchronous write to cluster_io + */ + error = cluster_io(vp, upl, upl_offset, uio->uio_offset, + io_size, 0, CL_DEV_MEMORY, (struct buf *)0, (struct clios *)0); + } + if (error == 0) { + /* + * The cluster_io write completed successfully, + * update the uio structure + */ + uio->uio_resid -= io_size; + iov->iov_len -= io_size; + iov->iov_base += io_size; + uio->uio_offset += io_size; + src_paddr += io_size; + + if (tail_size) + error = cluster_align_phys_io(vp, uio, src_paddr, tail_size, devblocksize, 0); + } + /* + * just release our hold on the physically contiguous + * region without changing any state + */ + ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY); return (error); } + static int cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags) struct vnode *vp; @@ -1593,7 +1683,7 @@ cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags) read_size = newEOF - upl_f_offset; retval = cluster_io(vp, upl, 0, upl_f_offset, read_size, devblocksize, - CL_READ, (struct buf *)0); + CL_READ, (struct buf *)0, (struct clios *)0); if (retval) { /* * we had an error during the read which causes us to abort @@ -1627,7 +1717,7 @@ cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags) read_size = newEOF - (upl_f_offset + upl_offset); retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, read_size, devblocksize, - CL_READ, (struct buf *)0); + CL_READ, (struct buf *)0, (struct clios *)0); if (retval) { /* * we had an error during the read which causes us to abort @@ -1934,7 +2024,7 @@ delay_io: if (last_blkno > vp->v_lastw) vp->v_lastw = last_blkno; - ubc_upl_commit_range(upl, 0, upl_size, UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY); + ubc_upl_commit_range(upl, 0, upl_size, UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY); continue; issue_io: /* @@ -1963,7 +2053,7 @@ issue_io: tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "cluster_write", 0); } retval = cluster_io(vp, upl, 0, upl_f_offset, io_size, devblocksize, - io_flags, (struct buf *)0); + io_flags, (struct buf *)0, (struct clios *)0); } } KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END, @@ -2039,7 +2129,7 @@ cluster_read(vp, uio, filesize, devblocksize, flags) if (upl_flags & UPL_PHYS_CONTIG) { - retval = cluster_phys_read(vp, uio, filesize); + retval = cluster_phys_read(vp, uio, filesize, devblocksize, flags); } else if (uio->uio_resid < 4 * PAGE_SIZE) { @@ -2119,6 +2209,7 @@ cluster_read(vp, uio, filesize, devblocksize, flags) return(retval); } + static int cluster_read_x(vp, uio, filesize, devblocksize, flags) struct vnode *vp; @@ -2288,7 +2379,7 @@ cluster_read_x(vp, uio, filesize, devblocksize, flags) */ error = cluster_io(vp, upl, upl_offset, 
upl_f_offset + upl_offset, - io_size, devblocksize, CL_READ, (struct buf *)0); + io_size, devblocksize, CL_READ, (struct buf *)0, (struct clios *)0); } if (error == 0) { /* @@ -2481,6 +2572,7 @@ cluster_read_x(vp, uio, filesize, devblocksize, flags) return (retval); } + static int cluster_nocopy_read(vp, uio, filesize, devblocksize, flags) struct vnode *vp; @@ -2687,7 +2779,7 @@ cluster_nocopy_read(vp, uio, filesize, devblocksize, flags) (int)upl, (int)upl_offset, (int)start_upl_f_offset, io_size, 0); error = cluster_io(vp, upl, upl_offset, start_upl_f_offset, - io_size, devblocksize, CL_READ| CL_NOZERO, (struct buf *)0); + io_size, devblocksize, CL_READ| CL_NOZERO, (struct buf *)0, (struct clios *)0); if (error == 0) { /* @@ -2724,22 +2816,29 @@ cluster_nocopy_read(vp, uio, filesize, devblocksize, flags) } + static int -cluster_phys_read(vp, uio, filesize) +cluster_phys_read(vp, uio, filesize, devblocksize, flags) struct vnode *vp; struct uio *uio; off_t filesize; + int devblocksize; + int flags; { + upl_page_info_t *pl; upl_t upl; vm_offset_t upl_offset; + vm_offset_t dst_paddr; off_t max_size; int io_size; + int tail_size; int upl_size; int upl_needed_size; int pages_in_pl; int upl_flags; kern_return_t kret; struct iovec *iov; + struct clios iostate; int error; /* @@ -2752,14 +2851,15 @@ cluster_phys_read(vp, uio, filesize) max_size = filesize - uio->uio_offset; - if (max_size < (off_t)((unsigned int)iov->iov_len)) - io_size = max_size; + if (max_size > (off_t)((unsigned int)iov->iov_len)) + io_size = iov->iov_len; else - io_size = iov->iov_len; + io_size = max_size; upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK_64; upl_needed_size = upl_offset + io_size; + error = 0; pages_in_pl = 0; upl_size = upl_needed_size; upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL; @@ -2768,48 +2868,112 @@ cluster_phys_read(vp, uio, filesize) (vm_offset_t)iov->iov_base & ~PAGE_MASK, &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0); - if (kret != KERN_SUCCESS) - { - /* cluster_phys_read: failed to get pagelist */ - return(EINVAL); - } + if (kret != KERN_SUCCESS) { + /* + * cluster_phys_read: failed to get pagelist + */ + return(EINVAL); + } + if (upl_size < upl_needed_size) { + /* + * The upl_size wasn't satisfied. + */ + ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY); + + return(EINVAL); + } + pl = ubc_upl_pageinfo(upl); + + dst_paddr = (vm_offset_t)upl_phys_page(pl, 0) + ((vm_offset_t)iov->iov_base & PAGE_MASK); + while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) { + int head_size; + + head_size = devblocksize - (int)(uio->uio_offset & (devblocksize - 1)); + + if (head_size > io_size) + head_size = io_size; + + error = cluster_align_phys_io(vp, uio, dst_paddr, head_size, devblocksize, CL_READ); + + if (error) { + ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY); + + return(EINVAL); + } + upl_offset += head_size; + dst_paddr += head_size; + io_size -= head_size; + } + tail_size = io_size & (devblocksize - 1); + io_size -= tail_size; + + iostate.io_completed = 0; + iostate.io_issued = 0; + iostate.io_error = 0; + iostate.io_wanted = 0; + + while (io_size && error == 0) { + int xsize; + + if (io_size > (MAX_UPL_TRANSFER * PAGE_SIZE)) + xsize = MAX_UPL_TRANSFER * PAGE_SIZE; + else + xsize = io_size; + /* + * request asynchronously so that we can overlap + * the preparation of the next I/O... 
we'll do + * the commit after all the I/O has completed + * since its all issued against the same UPL + * if there are already too many outstanding reads + * throttle back until we reach a more reasonable level + */ + while ((iostate.io_issued - iostate.io_completed) > (2 * MAX_UPL_TRANSFER * PAGE_SIZE)) { + iostate.io_wanted = 1; + tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_phys_read", 0); + } + + error = cluster_io(vp, upl, upl_offset, uio->uio_offset, xsize, 0, + CL_READ | CL_NOZERO | CL_DEV_MEMORY | CL_ASYNC, + (struct buf *)0, &iostate); + /* + * The cluster_io read was issued successfully, + * update the uio structure + */ + if (error == 0) { + uio->uio_resid -= xsize; + iov->iov_len -= xsize; + iov->iov_base += xsize; + uio->uio_offset += xsize; + dst_paddr += xsize; + upl_offset += xsize; + io_size -= xsize; + } + } /* - * Consider the possibility that upl_size wasn't satisfied. + * make sure any async reads have completed before + * we proceed */ - if (upl_size < upl_needed_size) - { - ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY); - return(EINVAL); - } + while (iostate.io_issued != iostate.io_completed) { + iostate.io_wanted = 1; + tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_phys_read", 0); + } + if (iostate.io_error) { + error = iostate.io_error; + } + if (error == 0 && tail_size) + error = cluster_align_phys_io(vp, uio, dst_paddr, tail_size, devblocksize, CL_READ); /* - * issue a synchronous read to cluster_io + * just release our hold on the physically contiguous + * region without changing any state */ - - error = cluster_io(vp, upl, upl_offset, uio->uio_offset, - io_size, 0, CL_READ| CL_NOZERO | CL_DEV_MEMORY, (struct buf *)0); - - if (error == 0) - { - /* - * The cluster_io read completed successfully, - * update the uio structure and commit. 
- */ - - ubc_upl_commit_range(upl, 0, upl_size, UPL_COMMIT_FREE_ON_EMPTY); - - iov->iov_base += io_size; - iov->iov_len -= io_size; - uio->uio_resid -= io_size; - uio->uio_offset += io_size; - } - else - ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY); + ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY); return (error); } + /* * generate advisory I/O's in the largest chunks possible * the completed pages will be released into the VM cache @@ -2932,7 +3096,7 @@ advisory_read(vp, filesize, f_offset, resid, devblocksize) * issue an asynchronous read to cluster_io */ retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size, devblocksize, - CL_ASYNC | CL_READ | CL_COMMIT | CL_AGE, (struct buf *)0); + CL_ASYNC | CL_READ | CL_COMMIT | CL_AGE, (struct buf *)0, (struct clios *)0); issued_io = 1; } @@ -3228,7 +3392,7 @@ cluster_push_x(vp, EOF, first, last, can_delay) vp->v_flag |= VTHROTTLED; tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "cluster_push", 0); } - cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size, vp->v_ciosiz, io_flags, (struct buf *)0); + cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size, vp->v_ciosiz, io_flags, (struct buf *)0, (struct clios *)0); size -= io_size; } @@ -3236,3 +3400,64 @@ cluster_push_x(vp, EOF, first, last, can_delay) return(1); } + + + +static int +cluster_align_phys_io(struct vnode *vp, struct uio *uio, vm_offset_t usr_paddr, int xsize, int devblocksize, int flags) +{ + struct iovec *iov; + upl_page_info_t *pl; + upl_t upl; + vm_offset_t ubc_paddr; + kern_return_t kret; + int error = 0; + + iov = uio->uio_iov; + + kret = ubc_create_upl(vp, + uio->uio_offset & ~PAGE_MASK_64, + PAGE_SIZE, + &upl, + &pl, + UPL_FLAGS_NONE); + + if (kret != KERN_SUCCESS) + return(EINVAL); + + if (!upl_valid_page(pl, 0)) { + /* + * issue a synchronous read to cluster_io + */ + error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE, devblocksize, + CL_READ, (struct buf *)0, (struct clios *)0); + if (error) { + ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY); + + return(error); + } + } + ubc_paddr = (vm_offset_t)upl_phys_page(pl, 0) + (int)(uio->uio_offset & PAGE_MASK_64); + + if (flags & CL_READ) + copyp2p(ubc_paddr, usr_paddr, xsize, 2); + else + copyp2p(usr_paddr, ubc_paddr, xsize, 1); + + if ( !(flags & CL_READ) || upl_dirty_page(pl, 0)) { + /* + * issue a synchronous write to cluster_io + */ + error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE, devblocksize, + 0, (struct buf *)0, (struct clios *)0); + } + if (error == 0) { + uio->uio_offset += xsize; + iov->iov_base += xsize; + iov->iov_len -= xsize; + uio->uio_resid -= xsize; + } + ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY); + + return (error); +} diff --git a/bsd/vfs/vfs_journal.c b/bsd/vfs/vfs_journal.c new file mode 100644 index 000000000..2acb4fab2 --- /dev/null +++ b/bsd/vfs/vfs_journal.c @@ -0,0 +1,2067 @@ +/* + * Copyright (c) 1995-2002 Apple Computer, Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * The contents of this file constitute Original Code as defined in and + * are subject to the Apple Public Source License Version 1.1 (the + * "License"). You may not use this file except in compliance with the + * License. Please obtain a copy of the License at + * http://www.apple.com/publicsource and read it before using this file. 
+ * + * This Original Code and all software distributed under the License are + * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the + * License for the specific language governing rights and limitations + * under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ +// +// This file implements a simple write-ahead journaling layer. +// In theory any file system can make use of it by calling these +// functions when the fs wants to modify meta-data blocks. See +// vfs_journal.h for a more detailed description of the api and +// data structures. +// +// Dominic Giampaolo (dbg@apple.com) +// + +#ifdef KERNEL + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +extern task_t kernel_task; + +#else + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "compat.h" + +#endif /* KERNEL */ + +#include "vfs_journal.h" + + +// number of bytes to checksum in a block_list_header +// NOTE: this should be enough to clear out the header +// fields as well as the first entry of binfo[] +#define BLHDR_CHECKSUM_SIZE 32 + + + +static int end_transaction(transaction *tr, int force_it); +static void abort_transaction(journal *jnl, transaction *tr); +static void dump_journal(journal *jnl); + + +#define CHECK_JOURNAL(jnl) \ + do { \ + if (jnl == NULL) {\ + panic("%s:%d: null journal ptr?\n", __FILE__, __LINE__);\ + }\ + if (jnl->jdev == NULL) { \ + panic("%s:%d: jdev is null!\n", __FILE__, __LINE__);\ + } \ + if (jnl->fsdev == NULL) { \ + panic("%s:%d: fsdev is null!\n", __FILE__, __LINE__);\ + } \ + if (jnl->jhdr->magic != JOURNAL_HEADER_MAGIC) {\ + panic("%s:%d: jhdr magic corrupted (0x%x != 0x%x)\n",\ + __FILE__, __LINE__, jnl->jhdr->magic, JOURNAL_HEADER_MAGIC);\ + }\ + if ( jnl->jhdr->start <= 0 \ + || jnl->jhdr->start > jnl->jhdr->size\ + || jnl->jhdr->start > 128*1024*1024) {\ + panic("%s:%d: jhdr start looks bad (0x%llx max size 0x%llx)\n", \ + __FILE__, __LINE__, jnl->jhdr->start, jnl->jhdr->size);\ + }\ + if ( jnl->jhdr->end <= 0 \ + || jnl->jhdr->end > jnl->jhdr->size\ + || jnl->jhdr->end > 128*1024*1024) {\ + panic("%s:%d: jhdr end looks bad (0x%llx max size 0x%llx)\n", \ + __FILE__, __LINE__, jnl->jhdr->end, jnl->jhdr->size);\ + }\ + if (jnl->jhdr->size > 128*1024*1024) {\ + panic("%s:%d: jhdr size looks bad (0x%llx)\n",\ + __FILE__, __LINE__, jnl->jhdr->size);\ + } \ + } while(0) + +#define CHECK_TRANSACTION(tr) \ + do {\ + if (tr == NULL) {\ + panic("%s:%d: null transaction ptr?\n", __FILE__, __LINE__);\ + }\ + if (tr->jnl == NULL) {\ + panic("%s:%d: null tr->jnl ptr?\n", __FILE__, __LINE__);\ + }\ + if (tr->blhdr != (block_list_header *)tr->tbuffer) {\ + panic("%s:%d: blhdr (0x%x) != tbuffer (0x%x)\n", __FILE__, __LINE__, tr->blhdr, tr->tbuffer);\ + }\ + if (tr->total_bytes < 0) {\ + panic("%s:%d: tr total_bytes looks bad: %d\n", __FILE__, __LINE__, tr->total_bytes);\ + }\ + if (tr->journal_start < 0 || tr->journal_start > 128*1024*1024) {\ + panic("%s:%d: tr journal start looks bad: 0x%llx\n", __FILE__, __LINE__, tr->journal_start);\ + }\ + if (tr->journal_end < 0 || tr->journal_end > 128*1024*1024) {\ + panic("%s:%d: tr journal end looks bad: 0x%llx\n", __FILE__, __LINE__, tr->journal_end);\ 
+ }\
+    if (tr->blhdr && (tr->blhdr->max_blocks <= 0 || tr->blhdr->max_blocks > 2048)) {\
+        panic("%s:%d: tr blhdr max_blocks looks bad: %d\n", __FILE__, __LINE__, tr->blhdr->max_blocks);\
+    }\
+ } while(0)
+
+
+
+//
+// this isn't a great checksum routine but it will do for now.
+// we use it to checksum the journal header and the block list
+// headers that are at the start of each transaction.
+//
+static int
+calc_checksum(char *ptr, int len)
+{
+    int i, cksum=0;
+
+    // this is a lame checksum but for now it'll do
+    for(i=0; i < len; i++, ptr++) {
+        cksum = (cksum << 8) ^ (cksum + *(unsigned char *)ptr);
+    }
+
+    return (~cksum);
+}
+
+
+#define JNL_WRITE 1
+#define JNL_READ 2
+
+//
+// This function sets up a fake buf and passes it directly to the
+// journal device strategy routine (so that it won't get cached in
+// the block cache).
+//
+// It also handles range checking the i/o so that we don't write
+// outside the journal boundaries, and it will wrap the i/o back
+// to the beginning if necessary (skipping over the journal header).
+//
+static size_t
+do_journal_io(journal *jnl, off_t *offset, void *data, size_t len, int direction)
+{
+    int err, io_sz=0, curlen=len;
+    struct buf *bp;
+    int max_iosize=0, max_vectors;
+
+    if (*offset < 0 || *offset > jnl->jhdr->size) {
+        panic("jnl: do_jnl_io: bad offset 0x%llx (max 0x%llx)\n", *offset, jnl->jhdr->size);
+    }
+
+  again:
+    bp = alloc_io_buf(jnl->jdev, 1);
+
+    if (direction == JNL_WRITE) {
+        bp->b_flags |= 0; // don't have to set any flags (was: B_WRITEINPROG)
+        jnl->jdev->v_numoutput++;
+        vfs_io_attributes(jnl->jdev, B_WRITE, &max_iosize, &max_vectors);
+    } else if (direction == JNL_READ) {
+        bp->b_flags |= B_READ;
+        vfs_io_attributes(jnl->jdev, B_READ, &max_iosize, &max_vectors);
+    }
+
+    if (max_iosize == 0) {
+        max_iosize = 128 * 1024;
+    }
+
+    if (*offset + (off_t)curlen > jnl->jhdr->size && *offset != 0 && jnl->jhdr->size != 0) {
+        if (*offset == jnl->jhdr->size) {
+            *offset = jnl->jhdr->jhdr_size;
+        } else {
+            curlen = (off_t)jnl->jhdr->size - *offset;
+        }
+    }
+
+    if (curlen > max_iosize) {
+        curlen = max_iosize;
+    }
+
+    if (curlen <= 0) {
+        panic("jnl: do_jnl_io: curlen == %d, offset 0x%llx len %d\n", curlen, *offset, len);
+    }
+
+    bp->b_bufsize = curlen;
+    bp->b_bcount = curlen;
+    bp->b_data = data;
+    bp->b_blkno = (daddr_t) ((jnl->jdev_offset + *offset) / (off_t)jnl->jhdr->jhdr_size);
+    bp->b_lblkno = (daddr_t) ((jnl->jdev_offset + *offset) / (off_t)jnl->jhdr->jhdr_size);
+
+    err = VOP_STRATEGY(bp);
+    if (!err) {
+        err = biowait(bp);
+    }
+
+    bp->b_data = NULL;
+    bp->b_bufsize = bp->b_bcount = 0;
+    bp->b_blkno = bp->b_lblkno = -1;
+
+    free_io_buf(bp);
+
+    if (err) {
+        printf("jnl: do_jnl_io: strategy err 0x%x\n", err);
+        return 0;
+    }
+
+    *offset += curlen;
+    io_sz += curlen;
+    if (io_sz != len) {
+        // handle wrap-around
+        data = (char *)data + curlen;
+        curlen = len - io_sz;
+        if (*offset >= jnl->jhdr->size) {
+            *offset = jnl->jhdr->jhdr_size;
+        }
+        goto again;
+    }
+
+    return io_sz;
+}
+
+static size_t
+read_journal_data(journal *jnl, off_t *offset, void *data, size_t len)
+{
+    return do_journal_io(jnl, offset, data, len, JNL_READ);
+}
+
+static size_t
+write_journal_data(journal *jnl, off_t *offset, void *data, size_t len)
+{
+    return do_journal_io(jnl, offset, data, len, JNL_WRITE);
+}
+
+
+static int
+write_journal_header(journal *jnl)
+{
+    int ret;
+    off_t jhdr_offset = 0;
+
+    //
+    // XXXdbg note: this ioctl doesn't seem to do anything on firewire disks.
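+    // (the intent of the flush is ordering: the transaction blocks the
+    // header is about to claim should be on the platter before the
+    // header itself is rewritten, otherwise a crash could leave the
+    // header describing transactions that never fully made it to disk.)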
+    //
+    ret = VOP_IOCTL(jnl->jdev, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, NOCRED, current_proc());
+    if (ret != 0) {
+        printf("jnl: flushing fs disk buffer returned 0x%x\n", ret);
+    }
+
+
+    jnl->jhdr->checksum = 0;
+    jnl->jhdr->checksum = calc_checksum((char *)jnl->jhdr, sizeof(struct journal_header));
+    if (write_journal_data(jnl, &jhdr_offset, jnl->header_buf, jnl->jhdr->jhdr_size) != jnl->jhdr->jhdr_size) {
+        printf("jnl: write_journal_header: error writing the journal header!\n");
+        jnl->flags |= JOURNAL_INVALID;
+        return -1;
+    }
+
+    return 0;
+}
+
+
+
+//
+// this is a work function used to free up transactions that
+// completed. they can't be free'd from buffer_flushed_callback
+// because it is called from deep within the disk driver stack
+// and thus can't do something that would potentially cause
+// paging. it gets called by each of the journal api entry
+// points so stuff shouldn't hang around for too long.
+//
+static void
+free_old_stuff(journal *jnl)
+{
+    transaction *tr, *next;
+
+    for(tr=jnl->tr_freeme; tr; tr=next) {
+        next = tr->next;
+        kmem_free(kernel_map, (vm_offset_t)tr, sizeof(transaction));
+    }
+
+    jnl->tr_freeme = NULL;
+}
+
+
+
+//
+// This is our callback that lets us know when a buffer has been
+// flushed to disk. It's called from deep within the driver stack
+// and thus is quite limited in what it can do. Notably, it can
+// not initiate any new i/o's or allocate/free memory.
+//
+static void
+buffer_flushed_callback(struct buf *bp)
+{
+    transaction *tr;
+    journal *jnl;
+    transaction *ctr, *prev=NULL, *next;
+    int i, bufsize;
+
+
+    //printf("jnl: buf flush: bp @ 0x%x l/blkno %d/%d vp 0x%x tr @ 0x%x\n",
+    //       bp, bp->b_lblkno, bp->b_blkno, bp->b_vp, bp->b_transaction);
+
+    // snarf out the bits we want
+    bufsize = bp->b_bufsize;
+    tr = bp->b_transaction;
+
+    bp->b_iodone = NULL; // don't call us for this guy again
+    bp->b_transaction = NULL;
+
+    //
+    // This is what biodone() would do if it didn't call us.
+    // NOTE: THIS CODE *HAS* TO BE HERE!
+    //
+    if (ISSET(bp->b_flags, B_ASYNC)) { /* if async, release it */
+        brelse(bp);
+    } else { /* or just wakeup the buffer */
+        CLR(bp->b_flags, B_WANTED);
+        wakeup(bp);
+    }
+
+    // NOTE: from here on out we do *NOT* touch bp anymore.
+
+
+    // if tr is null then we've already seen this buffer
+    if (tr == NULL) {
+        return;
+    }
+
+    CHECK_TRANSACTION(tr);
+
+    jnl = tr->jnl;
+    if (jnl->flags & JOURNAL_INVALID) {
+        return;
+    }
+
+    CHECK_JOURNAL(jnl);
+
+    // update the number of blocks that have been flushed.
+    // this buf may represent more than one block so take
+    // that into account.
+    tr->num_flushed += bufsize;
+
+
+    // if this transaction isn't done yet, just return as
+    // there is nothing to do.
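+    // (a transaction is done once every byte of it is accounted for:
+    // num_flushed counts bytes whose buffers have made it to disk and
+    // num_killed counts bytes that journal_kill_block() dropped.)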
+ if ((tr->num_flushed + tr->num_killed) < tr->total_bytes) { + return; + } + + //printf("jnl: tr 0x%x (0x%llx 0x%llx) in jnl 0x%x completed.\n", + // tr, tr->journal_start, tr->journal_end, jnl); + + // find this entry in the old_start[] index and mark it completed + simple_lock(&jnl->old_start_lock); + for(i=0; i < sizeof(jnl->old_start)/sizeof(jnl->old_start[0]); i++) { + + if ((jnl->old_start[i] & ~(0x8000000000000000LL)) == tr->journal_start) { + jnl->old_start[i] &= ~(0x8000000000000000LL); + break; + } + } + if (i >= sizeof(jnl->old_start)/sizeof(jnl->old_start[0])) { + panic("jnl: buffer_flushed: did not find tr w/start @ %lld (tr 0x%x, jnl 0x%x)\n", + tr->journal_start, tr, jnl); + } + simple_unlock(&jnl->old_start_lock); + + + // if we are here then we need to update the journal header + // to reflect that this transaction is complete + if (tr->journal_start == jnl->active_start) { + jnl->active_start = tr->journal_end; + tr->journal_start = tr->journal_end = (off_t)0; + } + + // go through the completed_trs list and try to coalesce + // entries, restarting back at the beginning if we have to. + for(ctr=jnl->completed_trs; ctr; prev=ctr, ctr=next) { + if (ctr->journal_start == jnl->active_start) { + jnl->active_start = ctr->journal_end; + if (prev) { + prev->next = ctr->next; + } + if (ctr == jnl->completed_trs) { + jnl->completed_trs = ctr->next; + } + + next = jnl->completed_trs; // this starts us over again + ctr->next = jnl->tr_freeme; + jnl->tr_freeme = ctr; + ctr = NULL; + } else if (tr->journal_end == ctr->journal_start) { + ctr->journal_start = tr->journal_start; + next = jnl->completed_trs; // this starts us over again + ctr = NULL; + tr->journal_start = tr->journal_end = (off_t)0; + } else if (tr->journal_start == ctr->journal_end) { + ctr->journal_end = tr->journal_end; + next = ctr->next; + tr->journal_start = tr->journal_end = (off_t)0; + } else { + next = ctr->next; + } + } + + // at this point no one should be using this guy anymore + tr->total_bytes = 0xfbadc0de; + + // if this is true then we didn't merge with anyone + // so link ourselves in at the head of the completed + // transaction list. + if (tr->journal_start != 0) { + // put this entry into the correct sorted place + // in the list instead of just at the head. + // + + prev = NULL; + for(ctr=jnl->completed_trs; ctr && tr->journal_start > ctr->journal_start; prev=ctr, ctr=ctr->next) { + // just keep looping + } + + if (ctr == NULL && prev == NULL) { + jnl->completed_trs = tr; + tr->next = NULL; + } else if (ctr == jnl->completed_trs) { + tr->next = jnl->completed_trs; + jnl->completed_trs = tr; + } else { + tr->next = prev->next; + prev->next = tr; + } + } else { + // if we're here this tr got merged with someone else so + // put it on the list to be free'd + tr->next = jnl->tr_freeme; + jnl->tr_freeme = tr; + } +} + +static int +update_fs_block(journal *jnl, void *block_ptr, off_t fs_block, size_t bsize) +{ + int ret; + struct buf *oblock_bp=NULL; + + // first read the block we want. + ret = meta_bread(jnl->fsdev, (daddr_t)fs_block, bsize, NOCRED, &oblock_bp); + if (ret != 0) { + printf("jnl: update_fs_block: error reading fs block # %lld! (ret %d)\n", fs_block, ret); + + if (oblock_bp) { + brelse(oblock_bp); + oblock_bp = NULL; + } + + // let's try to be aggressive here and just re-write the block + oblock_bp = getblk(jnl->fsdev, (daddr_t)fs_block, bsize, 0, 0, BLK_META); + if (oblock_bp == NULL) { + printf("jnl: update_fs_block: getblk() for %lld failed! 
failing update.\n", fs_block); + return -1; + } + } + + // make sure it's the correct size. + if (oblock_bp->b_bufsize != bsize) { + brelse(oblock_bp); + return -1; + } + + // copy the journal data over top of it + memcpy(oblock_bp->b_data, block_ptr, bsize); + + if ((ret = VOP_BWRITE(oblock_bp)) != 0) { + printf("jnl: update_fs_block: failed to update block %lld (ret %d)\n", fs_block,ret); + brelse(oblock_bp); + return ret; + } + + // and now invalidate it so that if someone else wants to read + // it in a different size they'll be able to do it. + ret = meta_bread(jnl->fsdev, (daddr_t)fs_block, bsize, NOCRED, &oblock_bp); + if (oblock_bp) { + oblock_bp->b_flags |= B_INVAL; + brelse(oblock_bp); + } + + return 0; +} + + +static int +replay_journal(journal *jnl) +{ + int i, ret, checksum, max_bsize; + struct buf *oblock_bp; + block_list_header *blhdr; + off_t offset; + char *buf, *block_ptr=NULL; + + // wrap the start ptr if it points to the very end of the journal + if (jnl->jhdr->start == jnl->jhdr->size) { + jnl->jhdr->start = jnl->jhdr->jhdr_size; + } + if (jnl->jhdr->end == jnl->jhdr->size) { + jnl->jhdr->end = jnl->jhdr->jhdr_size; + } + + if (jnl->jhdr->start == jnl->jhdr->end) { + return 0; + } + + // allocate memory for the header_block. we'll read each blhdr into this + if (kmem_alloc(kernel_map, (vm_offset_t *)&buf, jnl->jhdr->blhdr_size)) { + printf("jnl: replay_journal: no memory for block buffer! (%d bytes)\n", + jnl->jhdr->blhdr_size); + return -1; + } + + + printf("jnl: replay_journal: from: %lld to: %lld (joffset 0x%llx)\n", + jnl->jhdr->start, jnl->jhdr->end, jnl->jdev_offset); + + while(jnl->jhdr->start != jnl->jhdr->end) { + offset = jnl->jhdr->start; + ret = read_journal_data(jnl, &offset, buf, jnl->jhdr->blhdr_size); + if (ret != jnl->jhdr->blhdr_size) { + printf("jnl: replay_journal: Could not read block list header block @ 0x%llx!\n", offset); + goto bad_replay; + } + + blhdr = (block_list_header *)buf; + checksum = blhdr->checksum; + blhdr->checksum = 0; + if (checksum != calc_checksum((char *)blhdr, BLHDR_CHECKSUM_SIZE)) { + printf("jnl: replay_journal: bad block list header @ 0x%llx (checksum 0x%x != 0x%x)\n", + offset, checksum, calc_checksum((char *)blhdr, BLHDR_CHECKSUM_SIZE)); + goto bad_replay; + } + if ( blhdr->max_blocks <= 0 || blhdr->max_blocks > 2048 + || blhdr->num_blocks <= 0 || blhdr->num_blocks > blhdr->max_blocks) { + printf("jnl: replay_journal: bad looking journal entry: max: %d num: %d\n", + blhdr->max_blocks, blhdr->num_blocks); + goto bad_replay; + } + + for(i=1,max_bsize=0; i < blhdr->num_blocks; i++) { + if (blhdr->binfo[i].bnum < 0 && blhdr->binfo[i].bnum != (off_t)-1) { + printf("jnl: replay_journal: bogus block number 0x%llx\n", blhdr->binfo[i].bnum); + goto bad_replay; + } + if (blhdr->binfo[i].bsize > max_bsize) { + max_bsize = blhdr->binfo[i].bsize; + } + } + + // make sure it's at least one page in size. 
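+    // i.e. round max_bsize up to a multiple of PAGE_SIZE. for
+    // illustration, with 4k pages a 6k block comes out as
+    // (6k + 4k) & ~(4k - 1) = 8k, and a 512 byte block becomes
+    // one full 4k page.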
+ if (max_bsize & (PAGE_SIZE - 1)) { + max_bsize = (max_bsize + PAGE_SIZE) & ~(PAGE_SIZE - 1); + } + + if (kmem_alloc(kernel_map, (vm_offset_t *)&block_ptr, max_bsize)) { + goto bad_replay; + } + + //printf("jnl: replay_journal: %d blocks in journal entry @ 0x%llx\n", blhdr->num_blocks-1, + // jnl->jhdr->start); + for(i=1; i < blhdr->num_blocks; i++) { + int size; + + size = blhdr->binfo[i].bsize; + + ret = read_journal_data(jnl, &offset, block_ptr, size); + if (ret != size) { + printf("jnl: replay_journal: Could not read journal entry data @ offset 0x%llx!\n", offset); + goto bad_replay; + } + + // don't replay "killed" blocks + if (blhdr->binfo[i].bnum == (off_t)-1) { + // printf("jnl: replay_journal: skipping killed fs block (slot %d)\n", i); + } else { + //printf("jnl: replay_journal: fixing fs block # %lld (%d)\n", + // blhdr->binfo[i].bnum, blhdr->binfo[i].bsize); + + if (update_fs_block(jnl, block_ptr, blhdr->binfo[i].bnum, blhdr->binfo[i].bsize) != 0) { + goto bad_replay; + } + } + + // check if we need to wrap offset back to the beginning + // (which is just past the journal header) + // + if (offset >= jnl->jhdr->size) { + offset = jnl->jhdr->jhdr_size; + } + } + + kmem_free(kernel_map, (vm_offset_t)block_ptr, max_bsize); + block_ptr = NULL; + + jnl->jhdr->start += blhdr->bytes_used; + if (jnl->jhdr->start >= jnl->jhdr->size) { + // wrap around and skip the journal header block + jnl->jhdr->start = (jnl->jhdr->start % jnl->jhdr->size) + jnl->jhdr->jhdr_size; + } + + // only update the on-disk journal header if we've reached the + // last chunk of updates from this transaction. if binfo[0].bnum + // is zero then we know we're at the end. + if (blhdr->binfo[0].bnum == 0) { + if (write_journal_header(jnl) != 0) { + goto bad_replay; + } + } + } + + kmem_free(kernel_map, (vm_offset_t)buf, jnl->jhdr->blhdr_size); + return 0; + + bad_replay: + if (block_ptr) { + kmem_free(kernel_map, (vm_offset_t)block_ptr, max_bsize); + } + kmem_free(kernel_map, (vm_offset_t)buf, jnl->jhdr->blhdr_size); + return -1; +} + + +#define DEFAULT_TRANSACTION_BUFFER_SIZE (128*1024) +//#define DEFAULT_TRANSACTION_BUFFER_SIZE (256*1024) // better performance but uses more mem +#define MAX_TRANSACTION_BUFFER_SIZE (512*1024) + +// XXXdbg - so I can change it in the debugger +int def_tbuffer_size = 0; + + +// +// This function sets the size of the tbuffer and the +// size of the blhdr. It assumes that jnl->jhdr->size +// and jnl->jhdr->jhdr_size are already valid. +// +static void +size_up_tbuffer(journal *jnl, int tbuffer_size, int phys_blksz) +{ + // + // one-time initialization based on how much memory + // there is in the machine. + // + if (def_tbuffer_size == 0) { + if (mem_size < (256*1024*1024)) { + def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE; + } else if (mem_size < (512*1024*1024)) { + def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE * 2; + } else if (mem_size < (1024*1024*1024)) { + def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE * 3; + } else if (mem_size >= (1024*1024*1024)) { + def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE * 4; + } + } + + // size up the transaction buffer... can't be larger than the number + // of blocks that can fit in a block_list_header block. 
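+    // for illustration (assuming a 16-byte block_info): with the
+    // default 128k tbuffer on a 512-byte-block device the tbuffer
+    // covers 128k/512 = 256 blocks, so blhdr_size works out to
+    // 256 * 16 = 4k, which is then bumped below to at least one
+    // physical block.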
+ if (tbuffer_size == 0) { + jnl->tbuffer_size = def_tbuffer_size; + } else { + // make sure that the specified tbuffer_size isn't too small + if (tbuffer_size < jnl->jhdr->blhdr_size * 2) { + tbuffer_size = jnl->jhdr->blhdr_size * 2; + } + // and make sure it's an even multiple of the block size + if ((tbuffer_size % jnl->jhdr->jhdr_size) != 0) { + tbuffer_size -= (tbuffer_size % jnl->jhdr->jhdr_size); + } + + jnl->tbuffer_size = tbuffer_size; + } + + if (jnl->tbuffer_size > (jnl->jhdr->size / 2)) { + jnl->tbuffer_size = (jnl->jhdr->size / 2); + } + + if (jnl->tbuffer_size > MAX_TRANSACTION_BUFFER_SIZE) { + jnl->tbuffer_size = MAX_TRANSACTION_BUFFER_SIZE; + } + + jnl->jhdr->blhdr_size = (jnl->tbuffer_size / jnl->jhdr->jhdr_size) * sizeof(block_info); + if (jnl->jhdr->blhdr_size < phys_blksz) { + jnl->jhdr->blhdr_size = phys_blksz; + } +} + + + +journal * +journal_create(struct vnode *jvp, + off_t offset, + off_t journal_size, + struct vnode *fsvp, + size_t min_fs_blksz, + int32_t flags, + int32_t tbuffer_size, + void (*flush)(void *arg), + void *arg) +{ + journal *jnl; + int ret, phys_blksz; + + /* Get the real physical block size. */ + if (VOP_IOCTL(jvp, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz, 0, FSCRED, NULL)) { + return NULL; + } + + if (phys_blksz > min_fs_blksz) { + printf("jnl: create: error: phys blksize %d bigger than min fs blksize %d\n", + phys_blksz, min_fs_blksz); + return NULL; + } + + if ((journal_size % phys_blksz) != 0) { + printf("jnl: create: journal size 0x%llx is not an even multiple of block size 0x%x\n", + journal_size, phys_blksz); + return NULL; + } + + if (kmem_alloc(kernel_map, (vm_offset_t *)&jnl, sizeof(struct journal))) { + return NULL; + } + memset(jnl, 0, sizeof(*jnl)); + + jnl->jdev = jvp; + jnl->jdev_offset = offset; + jnl->fsdev = fsvp; + jnl->flush = flush; + jnl->flush_arg = arg; + jnl->flags = (flags & JOURNAL_OPTION_FLAGS_MASK); + simple_lock_init(&jnl->old_start_lock); + + if (kmem_alloc(kernel_map, (vm_offset_t *)&jnl->header_buf, phys_blksz)) { + printf("jnl: create: could not allocate space for header buffer (%d bytes)\n", phys_blksz); + goto bad_kmem_alloc; + } + + memset(jnl->header_buf, 0, phys_blksz); + + jnl->jhdr = (journal_header *)jnl->header_buf; + jnl->jhdr->magic = JOURNAL_HEADER_MAGIC; + jnl->jhdr->endian = ENDIAN_MAGIC; + jnl->jhdr->start = phys_blksz; // start at block #1, block #0 is for the jhdr itself + jnl->jhdr->end = phys_blksz; + jnl->jhdr->size = journal_size; + jnl->jhdr->jhdr_size = phys_blksz; + size_up_tbuffer(jnl, tbuffer_size, phys_blksz); + + jnl->active_start = jnl->jhdr->start; + + // XXXdbg - for testing you can force the journal to wrap around + // jnl->jhdr->start = jnl->jhdr->size - (phys_blksz*3); + // jnl->jhdr->end = jnl->jhdr->size - (phys_blksz*3); + + if (semaphore_create(kernel_task, &jnl->jsem, SYNC_POLICY_FIFO, 1) != 0) { + printf("jnl: journal_create: failed to create journal semaphore..\n"); + goto bad_sem; + } + + if (write_journal_header(jnl) != 0) { + printf("jnl: journal_create: failed to write journal header.\n"); + goto bad_write; + } + + return jnl; + + + bad_write: + semaphore_destroy(kernel_task, jnl->jsem); + bad_sem: + kmem_free(kernel_map, (vm_offset_t)jnl->header_buf, phys_blksz); + bad_kmem_alloc: + jnl->jhdr = NULL; + kmem_free(kernel_map, (vm_offset_t)jnl, sizeof(struct journal)); + return NULL; +} + + +journal * +journal_open(struct vnode *jvp, + off_t offset, + off_t journal_size, + struct vnode *fsvp, + size_t min_fs_blksz, + int32_t flags, + int32_t tbuffer_size, + void 
(*flush)(void *arg), + void *arg) +{ + journal *jnl; + int orig_blksz=0, phys_blksz, blhdr_size; + off_t hdr_offset=0; + + /* Get the real physical block size. */ + if (VOP_IOCTL(jvp, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz, 0, FSCRED, NULL)) { + return NULL; + } + + if (phys_blksz > min_fs_blksz) { + printf("jnl: create: error: phys blksize %d bigger than min fs blksize %d\n", + phys_blksz, min_fs_blksz); + return NULL; + } + + if ((journal_size % phys_blksz) != 0) { + printf("jnl: open: journal size 0x%llx is not an even multiple of block size 0x%x\n", + journal_size, phys_blksz); + return NULL; + } + + if (kmem_alloc(kernel_map, (vm_offset_t *)&jnl, sizeof(struct journal))) { + return NULL; + } + memset(jnl, 0, sizeof(*jnl)); + + jnl->jdev = jvp; + jnl->jdev_offset = offset; + jnl->fsdev = fsvp; + jnl->flush = flush; + jnl->flush_arg = arg; + jnl->flags = (flags & JOURNAL_OPTION_FLAGS_MASK); + simple_lock_init(&jnl->old_start_lock); + + if (kmem_alloc(kernel_map, (vm_offset_t *)&jnl->header_buf, phys_blksz)) { + printf("jnl: create: could not allocate space for header buffer (%d bytes)\n", phys_blksz); + goto bad_kmem_alloc; + } + + jnl->jhdr = (journal_header *)jnl->header_buf; + memset(jnl->jhdr, 0, sizeof(journal_header)+4); + + // we have to set this up here so that do_journal_io() will work + jnl->jhdr->jhdr_size = phys_blksz; + + if (read_journal_data(jnl, &hdr_offset, jnl->jhdr, phys_blksz) != phys_blksz) { + printf("jnl: open: could not read %d bytes for the journal header.\n", + phys_blksz); + goto bad_journal; + } + + if (jnl->jhdr->magic != JOURNAL_HEADER_MAGIC && jnl->jhdr->magic != OLD_JOURNAL_HEADER_MAGIC) { + printf("jnl: open: journal magic is bad (0x%x != 0x%x)\n", + jnl->jhdr->magic, JOURNAL_HEADER_MAGIC); + goto bad_journal; + } + + // only check if we're the current journal header magic value + if (jnl->jhdr->magic == JOURNAL_HEADER_MAGIC) { + int orig_checksum = jnl->jhdr->checksum; + + jnl->jhdr->checksum = 0; + if (orig_checksum != calc_checksum((char *)jnl->jhdr, sizeof(struct journal_header))) { + printf("jnl: open: journal checksum is bad (0x%x != 0x%x)\n", orig_checksum, + calc_checksum((char *)jnl->jhdr, sizeof(struct journal_header))); + //goto bad_journal; + } + } + + // XXXdbg - convert old style magic numbers to the new one + if (jnl->jhdr->magic == OLD_JOURNAL_HEADER_MAGIC) { + jnl->jhdr->magic = JOURNAL_HEADER_MAGIC; + } + + if (phys_blksz != jnl->jhdr->jhdr_size && jnl->jhdr->jhdr_size != 0) { + printf("jnl: open: phys_blksz %d does not match journal header size %d\n", + phys_blksz, jnl->jhdr->jhdr_size); + + orig_blksz = phys_blksz; + phys_blksz = jnl->jhdr->jhdr_size; + if (VOP_IOCTL(jvp, DKIOCSETBLOCKSIZE, (caddr_t)&phys_blksz, FWRITE, FSCRED, NULL)) { + printf("jnl: could not set block size to %d bytes.\n", phys_blksz); + goto bad_journal; + } +// goto bad_journal; + } + + if ( jnl->jhdr->start <= 0 + || jnl->jhdr->start > jnl->jhdr->size + || jnl->jhdr->start > 128*1024*1024) { + printf("jnl: open: jhdr start looks bad (0x%llx max size 0x%llx)\n", + jnl->jhdr->start, jnl->jhdr->size); + goto bad_journal; + } + + if ( jnl->jhdr->end <= 0 + || jnl->jhdr->end > jnl->jhdr->size + || jnl->jhdr->end > 128*1024*1024) { + printf("jnl: open: jhdr end looks bad (0x%llx max size 0x%llx)\n", + jnl->jhdr->end, jnl->jhdr->size); + goto bad_journal; + } + + if (jnl->jhdr->size > 128*1024*1024) { + printf("jnl: open: jhdr size looks bad (0x%llx)\n", jnl->jhdr->size); + goto bad_journal; + } + +// XXXdbg - can't do these checks because hfs writes all kinds of +// 
non-uniform sized blocks even on devices that have a block size +// that is larger than 512 bytes (i.e. optical media w/2k blocks). +// therefore these checks will fail and so we just have to punt and +// do more relaxed checking... +// XXXdbg if ((jnl->jhdr->start % jnl->jhdr->jhdr_size) != 0) { + if ((jnl->jhdr->start % 512) != 0) { + printf("jnl: open: journal start (0x%llx) not a multiple of 512?\n", + jnl->jhdr->start); + goto bad_journal; + } + +//XXXdbg if ((jnl->jhdr->end % jnl->jhdr->jhdr_size) != 0) { + if ((jnl->jhdr->end % 512) != 0) { + printf("jnl: open: journal end (0x%llx) not a multiple of block size (0x%x)?\n", + jnl->jhdr->end, jnl->jhdr->jhdr_size); + goto bad_journal; + } + + // take care of replaying the journal if necessary + if (flags & JOURNAL_RESET) { + printf("jnl: journal start/end pointers reset! (jnl 0x%x; s 0x%llx e 0x%llx)\n", + jnl, jnl->jhdr->start, jnl->jhdr->end); + jnl->jhdr->start = jnl->jhdr->end; + } else if (replay_journal(jnl) != 0) { + printf("jnl: journal_open: Error replaying the journal!\n"); + goto bad_journal; + } + + if (orig_blksz != 0) { + VOP_IOCTL(jvp, DKIOCSETBLOCKSIZE, (caddr_t)&orig_blksz, FWRITE, FSCRED, NULL); + phys_blksz = orig_blksz; + } + + // make sure this is in sync! + jnl->active_start = jnl->jhdr->start; + + // set this now, after we've replayed the journal + size_up_tbuffer(jnl, tbuffer_size, phys_blksz); + + if (semaphore_create(kernel_task, &jnl->jsem, SYNC_POLICY_FIFO, 1) != 0) { + printf("jnl: journal_create: failed to create journal semaphore..\n"); + goto bad_journal; + } + + return jnl; + + bad_journal: + if (orig_blksz != 0) { + phys_blksz = orig_blksz; + VOP_IOCTL(jvp, DKIOCSETBLOCKSIZE, (caddr_t)&orig_blksz, FWRITE, FSCRED, NULL); + } + kmem_free(kernel_map, (vm_offset_t)jnl->header_buf, phys_blksz); + bad_kmem_alloc: + kmem_free(kernel_map, (vm_offset_t)jnl, sizeof(struct journal)); + return NULL; +} + +void +journal_close(journal *jnl) +{ + volatile off_t *start, *end; + int counter=0; + + CHECK_JOURNAL(jnl); + + // set this before doing anything that would block so that + // we start tearing things down properly. + // + jnl->flags |= JOURNAL_CLOSE_PENDING; + + if (jnl->owner != current_act()) { + int ret; + + while ((ret = semaphore_wait(jnl->jsem)) == KERN_ABORTED) { + // just keep trying if we've been ^C'ed + } + if (ret != 0) { + printf("jnl: close: sem wait failed.\n"); + return; + } + } + + // + // only write stuff to disk if the journal is still valid + // + if ((jnl->flags & JOURNAL_INVALID) == 0) { + + if (jnl->active_tr) { + journal_end_transaction(jnl); + } + + // flush any buffered transactions + if (jnl->cur_tr) { + transaction *tr = jnl->cur_tr; + + jnl->cur_tr = NULL; + end_transaction(tr, 1); // force it to get flushed + } + + //start = &jnl->jhdr->start; + start = &jnl->active_start; + end = &jnl->jhdr->end; + + while (*start != *end && counter++ < 500) { + printf("jnl: close: flushing the buffer cache (start 0x%llx end 0x%llx)\n", *start, *end); + if (jnl->flush) { + jnl->flush(jnl->flush_arg); + } + + } + + if (*start != *end) { + printf("jnl: close: buffer flushing didn't seem to flush out all the transactions! (0x%llx - 0x%llx)\n", + *start, *end); + } + + // make sure this is in sync when we close the journal + jnl->jhdr->start = jnl->active_start; + + // if this fails there's not much we can do at this point... + write_journal_header(jnl); + } else { + // if we're here the journal isn't valid any more. 
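+        // (a journal goes invalid when a write to it fails -- see
+        // write_journal_header() -- and once JOURNAL_INVALID is set
+        // the other journal entry points just return EINVAL.)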
+ // so make sure we don't leave any locked blocks lying around + printf("jnl: close: journal 0x%x, is invalid. aborting outstanding transactions\n", jnl); + if (jnl->active_tr || jnl->cur_tr) { + transaction *tr; + if (jnl->active_tr) { + tr = jnl->active_tr; + jnl->active_tr = NULL; + } else { + tr = jnl->cur_tr; + jnl->cur_tr = NULL; + } + + abort_transaction(jnl, tr); + if (jnl->active_tr || jnl->cur_tr) { + panic("jnl: close: jnl @ 0x%x had both an active and cur tr\n", jnl); + } + } + } + + free_old_stuff(jnl); + + kmem_free(kernel_map, (vm_offset_t)jnl->header_buf, jnl->jhdr->jhdr_size); + jnl->jhdr = (void *)0xbeefbabe; + + semaphore_destroy(kernel_task, jnl->jsem); + kmem_free(kernel_map, (vm_offset_t)jnl, sizeof(struct journal)); +} + +static void +dump_journal(journal *jnl) +{ + transaction *ctr; + + printf("journal:"); + printf(" jdev_offset %.8llx\n", jnl->jdev_offset); + printf(" magic: 0x%.8x\n", jnl->jhdr->magic); + printf(" start: 0x%.8llx\n", jnl->jhdr->start); + printf(" end: 0x%.8llx\n", jnl->jhdr->end); + printf(" size: 0x%.8llx\n", jnl->jhdr->size); + printf(" blhdr size: %d\n", jnl->jhdr->blhdr_size); + printf(" jhdr size: %d\n", jnl->jhdr->jhdr_size); + printf(" chksum: 0x%.8x\n", jnl->jhdr->checksum); + + printf(" completed transactions:\n"); + for(ctr=jnl->completed_trs; ctr; ctr=ctr->next) { + printf(" 0x%.8llx - 0x%.8llx\n", ctr->journal_start, ctr->journal_end); + } +} + + + +static off_t +free_space(journal *jnl) +{ + off_t free_space; + + if (jnl->jhdr->start < jnl->jhdr->end) { + free_space = jnl->jhdr->size - (jnl->jhdr->end - jnl->jhdr->start) - jnl->jhdr->jhdr_size; + } else if (jnl->jhdr->start > jnl->jhdr->end) { + free_space = jnl->jhdr->start - jnl->jhdr->end; + } else { + // journal is completely empty + free_space = jnl->jhdr->size - jnl->jhdr->jhdr_size; + } + + return free_space; +} + + +// +// The journal must be locked on entry to this function. +// The "desired_size" is in bytes. +// +static int +check_free_space(journal *jnl, int desired_size) +{ + int i, counter=0; + + //printf("jnl: check free space (desired 0x%x, avail 0x%Lx)\n", +// desired_size, free_space(jnl)); + + while (1) { + if (counter++ == 5000) { + dump_journal(jnl); + panic("jnl: check_free_space: buffer flushing isn't working " + "(jnl @ 0x%x s %lld e %lld f %lld [active start %lld]).\n", jnl, + jnl->jhdr->start, jnl->jhdr->end, free_space(jnl), jnl->active_start); + } + if (counter > 7500) { + printf("jnl: check_free_space: giving up waiting for free space.\n"); + return ENOSPC; + } + + // make sure there's space in the journal to hold this transaction + if (free_space(jnl) > desired_size) { + break; + } + + // + // here's where we lazily bump up jnl->jhdr->start. we'll consume + // entries until there is enough space for the next transaction. 
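+        // each slot in old_start[] remembers where an old transaction
+        // began; the high bit of a slot stays set while that transaction
+        // is still in flight, so we can't reclaim its space until
+        // buffer_flushed_callback() clears the bit.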
+ // + simple_lock(&jnl->old_start_lock); + for(i=0; i < sizeof(jnl->old_start)/sizeof(jnl->old_start[0]); i++) { + int counter; + + counter = 0; + while (jnl->old_start[i] & 0x8000000000000000LL) { + if (counter++ > 100) { + panic("jnl: check_free_space: tr starting @ 0x%llx not flushing (jnl 0x%x).\n", + jnl->old_start[i], jnl); + } + + simple_unlock(&jnl->old_start_lock); + if (jnl->flush) { + jnl->flush(jnl->flush_arg); + } + tsleep((caddr_t)jnl, PRIBIO, "check_free_space1", 1); + simple_lock(&jnl->old_start_lock); + } + + if (jnl->old_start[i] == 0) { + continue; + } + + jnl->jhdr->start = jnl->old_start[i]; + jnl->old_start[i] = 0; + if (free_space(jnl) > desired_size) { + write_journal_header(jnl); + break; + } + } + simple_unlock(&jnl->old_start_lock); + + // if we bumped the start, loop and try again + if (i < sizeof(jnl->old_start)/sizeof(jnl->old_start[0])) { + continue; + } + + + // if the file system gave us a flush function, call it to so that + // it can flush some blocks which hopefully will cause some transactions + // to complete and thus free up space in the journal. + if (jnl->flush) { + jnl->flush(jnl->flush_arg); + } + + // wait for a while to avoid being cpu-bound (this will + // put us to sleep for 10 milliseconds) + tsleep((caddr_t)jnl, PRIBIO, "check_free_space2", 1); + } + + return 0; +} + +int +journal_start_transaction(journal *jnl) +{ + int ret; + transaction *tr; + + CHECK_JOURNAL(jnl); + + if (jnl->flags & JOURNAL_INVALID) { + return EINVAL; + } + + if (jnl->owner == current_act()) { + if (jnl->active_tr == NULL) { + panic("jnl: start_tr: active_tr is NULL (jnl @ 0x%x, owner 0x%x, current_act 0x%x\n", + jnl, jnl->owner, current_act()); + } + jnl->nested_count++; + return 0; + } + + while ((ret = semaphore_wait(jnl->jsem)) == KERN_ABORTED) { + // just keep looping if we've been ^C'ed + } + if (ret != 0) { + printf("jnl: start_tr: sem wait failed.\n"); + return EINVAL; + } + + if (jnl->owner != NULL || jnl->nested_count != 0 || jnl->active_tr != NULL) { + panic("jnl: start_tr: owner 0x%x, nested count 0x%x, active_tr 0x%x jnl @ 0x%x\n", + jnl->owner, jnl->nested_count, jnl->active_tr, jnl); + } + + jnl->owner = current_act(); + jnl->nested_count = 1; + + free_old_stuff(jnl); + + // make sure there's room in the journal + if (check_free_space(jnl, jnl->tbuffer_size) != 0) { + printf("jnl: start transaction failed: no space\n"); + ret = ENOSPC; + goto bad_start; + } + + // if there's a buffered transaction, use it. + if (jnl->cur_tr) { + jnl->active_tr = jnl->cur_tr; + jnl->cur_tr = NULL; + + return 0; + } + + if (kmem_alloc(kernel_map, (vm_offset_t *)&tr, sizeof(transaction))) { + printf("jnl: start transaction failed: no mem\n"); + ret = ENOMEM; + goto bad_start; + } + memset(tr, 0, sizeof(transaction)); + + tr->tbuffer_size = jnl->tbuffer_size; + if (kmem_alloc(kernel_map, (vm_offset_t *)&tr->tbuffer, tr->tbuffer_size)) { + kmem_free(kernel_map, (vm_offset_t)tr, sizeof(transaction)); + printf("jnl: start transaction failed: no tbuffer mem\n"); + ret = ENOMEM; + goto bad_start; + } + + // journal replay code checksum check depends on this. 
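+    // (replay checksums the first BLHDR_CHECKSUM_SIZE bytes of each
+    // block list header, so those bytes have to start from a known,
+    // zeroed state before the header fields get filled in.)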
+ memset(tr->tbuffer, 0, BLHDR_CHECKSUM_SIZE); + + tr->blhdr = (block_list_header *)tr->tbuffer; + tr->blhdr->max_blocks = (jnl->jhdr->blhdr_size / sizeof(block_info)) - 1; + tr->blhdr->num_blocks = 1; // accounts for this header block + tr->blhdr->bytes_used = jnl->jhdr->blhdr_size; + + tr->num_blhdrs = 1; + tr->total_bytes = jnl->jhdr->blhdr_size; + tr->jnl = jnl; + + jnl->active_tr = tr; + + // printf("jnl: start_tr: owner 0x%x new tr @ 0x%x\n", jnl->owner, tr); + + return 0; + + bad_start: + jnl->owner = NULL; + jnl->nested_count = 0; + semaphore_signal(jnl->jsem); + return ret; +} + + +int +journal_modify_block_start(journal *jnl, struct buf *bp) +{ + transaction *tr; + + CHECK_JOURNAL(jnl); + + if (jnl->flags & JOURNAL_INVALID) { + return EINVAL; + } + + // XXXdbg - for debugging I want this to be true. later it may + // not be necessary. + if ((bp->b_flags & B_META) == 0) { + panic("jnl: modify_block_start: bp @ 0x%x is not a meta-data block! (jnl 0x%x)\n", bp, jnl); + } + + tr = jnl->active_tr; + CHECK_TRANSACTION(tr); + + if (jnl->owner != current_act()) { + panic("jnl: modify_block_start: called w/out a transaction! jnl 0x%x, owner 0x%x, curact 0x%x\n", + jnl, jnl->owner, current_act()); + } + + free_old_stuff(jnl); + + //printf("jnl: mod block start (bp 0x%x vp 0x%x l/blkno %d/%d bsz %d; total bytes %d)\n", + // bp, bp->b_vp, bp->b_lblkno, bp->b_blkno, bp->b_bufsize, tr->total_bytes); + + // can't allow blocks that aren't an even multiple of the + // underlying block size. + if ((bp->b_bufsize % jnl->jhdr->jhdr_size) != 0) { + panic("jnl: mod block start: bufsize %d not a multiple of block size %d\n", + bp->b_bufsize, jnl->jhdr->jhdr_size); + return -1; + } + + // make sure that this transaction isn't bigger than the whole journal + if (tr->total_bytes+bp->b_bufsize >= (jnl->jhdr->size - jnl->jhdr->jhdr_size)) { + panic("jnl: transaction too big (%d >= %lld bytes, bufsize %d, tr 0x%x bp 0x%x)\n", + tr->total_bytes, (tr->jnl->jhdr->size - jnl->jhdr->jhdr_size), bp->b_bufsize, tr, bp); + return -1; + } + + // if the block is dirty and not already locked we have to write + // it out before we muck with it because it has data that belongs + // (presumably) to another transaction. + // + if ((bp->b_flags & B_DELWRI) && (bp->b_flags & B_LOCKED) == 0) { + + // this will cause it to not be brelse()'d + bp->b_flags |= B_NORELSE; + VOP_BWRITE(bp); + } + + bp->b_flags |= B_LOCKED; + + return 0; +} + +int +journal_modify_block_abort(journal *jnl, struct buf *bp) +{ + transaction *tr; + block_list_header *blhdr; + int i, j; + + CHECK_JOURNAL(jnl); + + tr = jnl->active_tr; + + // + // if there's no active transaction then we just want to + // call brelse() and return since this is just a block + // that happened to be modified as part of another tr. + // + if (tr == NULL) { + brelse(bp); + return 0; + } + + if (jnl->flags & JOURNAL_INVALID) { + return EINVAL; + } + + CHECK_TRANSACTION(tr); + + if (jnl->owner != current_act()) { + panic("jnl: modify_block_abort: called w/out a transaction! jnl 0x%x, owner 0x%x, curact 0x%x\n", + jnl, jnl->owner, current_act()); + } + + free_old_stuff(jnl); + + // printf("jnl: modify_block_abort: tr 0x%x bp 0x%x\n", jnl->active_tr, bp); + + // first check if it's already part of this transaction + for(blhdr=tr->blhdr; blhdr; blhdr=(block_list_header *)((long)blhdr->binfo[0].bnum)) { + for(i=1; i < blhdr->num_blocks; i++) { + if (bp == blhdr->binfo[i].bp) { + if (bp->b_bufsize != blhdr->binfo[i].bsize) { + panic("jnl: bp @ 0x%x changed size on me! (%d vs. 
%d, jnl 0x%x)\n", + bp, bp->b_bufsize, blhdr->binfo[i].bsize, jnl); + } + break; + } + } + + if (i < blhdr->num_blocks) { + break; + } + } + + // + // if blhdr is null, then this block has only had modify_block_start + // called on it as part of the current transaction. that means that + // it is ok to clear the LOCKED bit since it hasn't actually been + // modified. if blhdr is non-null then modify_block_end was called + // on it and so we need to keep it locked in memory. + // + if (blhdr == NULL) { + bp->b_flags &= ~(B_LOCKED); + } + + brelse(bp); + return 0; +} + + +int +journal_modify_block_end(journal *jnl, struct buf *bp) +{ + int i, j, tbuffer_offset; + char *blkptr; + block_list_header *blhdr, *prev=NULL; + transaction *tr; + + CHECK_JOURNAL(jnl); + + if (jnl->flags & JOURNAL_INVALID) { + return EINVAL; + } + + tr = jnl->active_tr; + CHECK_TRANSACTION(tr); + + if (jnl->owner != current_act()) { + panic("jnl: modify_block_end: called w/out a transaction! jnl 0x%x, owner 0x%x, curact 0x%x\n", + jnl, jnl->owner, current_act()); + } + + free_old_stuff(jnl); + + //printf("jnl: mod block end: (bp 0x%x vp 0x%x l/blkno %d/%d bsz %d, total bytes %d)\n", + // bp, bp->b_vp, bp->b_lblkno, bp->b_blkno, bp->b_bufsize, tr->total_bytes); + + if ((bp->b_flags & B_LOCKED) == 0) { + panic("jnl: modify_block_end: bp 0x%x not locked! jnl @ 0x%x\n", bp, jnl); + bp->b_flags |= B_LOCKED; + } + + // first check if it's already part of this transaction + for(blhdr=tr->blhdr; blhdr; prev=blhdr,blhdr=(block_list_header *)((long)blhdr->binfo[0].bnum)) { + tbuffer_offset = jnl->jhdr->blhdr_size; + + for(i=1; i < blhdr->num_blocks; i++) { + if (bp == blhdr->binfo[i].bp) { + if (bp->b_bufsize != blhdr->binfo[i].bsize) { + panic("jnl: bp @ 0x%x changed size on me! (%d vs. %d, jnl 0x%x)\n", + bp, bp->b_bufsize, blhdr->binfo[i].bsize, jnl); + } + break; + } + tbuffer_offset += blhdr->binfo[i].bsize; + } + + if (i < blhdr->num_blocks) { + break; + } + } + + if (blhdr == NULL + && prev + && (prev->num_blocks+1) <= prev->max_blocks + && (prev->bytes_used+bp->b_bufsize) <= tr->tbuffer_size) { + blhdr = prev; + } else if (blhdr == NULL) { + block_list_header *nblhdr; + + if (prev == NULL) { + panic("jnl: modify block end: no way man, prev == NULL?!?, jnl 0x%x, bp 0x%x\n", jnl, bp); + } + + // we got to the end of the list, didn't find the block and there's + // no room in the block_list_header pointed to by prev + + // we allocate another tbuffer and link it in at the end of the list + // through prev->binfo[0].bnum. that's a skanky way to do things but + // avoids having yet another linked list of small data structures to manage. + + if (kmem_alloc(kernel_map, (vm_offset_t *)&nblhdr, tr->tbuffer_size)) { + panic("jnl: end_tr: no space for new block tr @ 0x%x (total bytes: %d)!\n", + tr, tr->total_bytes); + } + + // journal replay code checksum check depends on this. 
+	memset(nblhdr, 0, BLHDR_CHECKSUM_SIZE);
+
+	// initialize the new guy
+	nblhdr->max_blocks = (jnl->jhdr->blhdr_size / sizeof(block_info)) - 1;
+	nblhdr->num_blocks = 1;      // accounts for this header block
+	nblhdr->bytes_used = jnl->jhdr->blhdr_size;
+
+	tr->num_blhdrs++;
+	tr->total_bytes += jnl->jhdr->blhdr_size;
+
+	// then link him in at the end
+	prev->binfo[0].bnum = (off_t)((long)nblhdr);
+
+	// and finally switch to using the new guy
+	blhdr = nblhdr;
+	tbuffer_offset = jnl->jhdr->blhdr_size;
+	i = 1;
+    }
+
+
+    if ((i+1) > blhdr->max_blocks) {
+	panic("jnl: modify_block_end: i = %d, max_blocks %d\n", i, blhdr->max_blocks);
+    }
+
+    // copy the data into the in-memory transaction buffer
+    blkptr = (char *)&((char *)blhdr)[tbuffer_offset];
+    memcpy(blkptr, bp->b_data, bp->b_bufsize);
+
+    // if this is true then this is a new block we haven't seen
+    if (i >= blhdr->num_blocks) {
+	vget(bp->b_vp, 0, current_proc());
+
+	blhdr->binfo[i].bnum  = bp->b_blkno;
+	blhdr->binfo[i].bsize = bp->b_bufsize;
+	blhdr->binfo[i].bp    = bp;
+
+	blhdr->bytes_used += bp->b_bufsize;
+	tr->total_bytes   += bp->b_bufsize;
+
+	blhdr->num_blocks++;
+    }
+
+    bdwrite(bp);
+
+    return 0;
+}
+
+int
+journal_kill_block(journal *jnl, struct buf *bp)
+{
+    int i;
+    block_list_header *blhdr;
+    transaction *tr;
+
+    CHECK_JOURNAL(jnl);
+
+    if (jnl->flags & JOURNAL_INVALID) {
+	return EINVAL;
+    }
+
+    tr = jnl->active_tr;
+    CHECK_TRANSACTION(tr);
+
+    if (jnl->owner != current_act()) {
+	panic("jnl: kill_block: called w/out a transaction! jnl 0x%x, owner 0x%x, curact 0x%x\n",
+	      jnl, jnl->owner, current_act());
+    }
+
+    free_old_stuff(jnl);
+
+    if ((bp->b_flags & B_LOCKED) == 0) {
+	panic("jnl: kill block: bp 0x%x not locked! jnl @ 0x%x\n", bp, jnl);
+    }
+
+    // first check if it's already part of this transaction
+    for(blhdr=tr->blhdr; blhdr; blhdr=(block_list_header *)((long)blhdr->binfo[0].bnum)) {
+
+	for(i=1; i < blhdr->num_blocks; i++) {
+	    if (bp == blhdr->binfo[i].bp) {
+		bp->b_flags &= ~B_LOCKED;
+
+		// this undoes the vget() in journal_modify_block_end()
+		vrele(bp->b_vp);
+
+		// if the block has the DELWRI and CALL bits set, then
+		// things are seriously weird.  if it was part of another
+		// transaction then journal_modify_block_start() should
+		// have forced it to be written.
+		//
+		if ((bp->b_flags & B_DELWRI) && (bp->b_flags & B_CALL)) {
+		    panic("jnl: kill block: this defies all logic! bp 0x%x\n", bp);
+		} else {
+		    tr->num_killed += bp->b_bufsize;
+		}
+
+		if (bp->b_flags & B_BUSY) {
+		    brelse(bp);
+		}
+
+		blhdr->binfo[i].bp   = NULL;
+		blhdr->binfo[i].bnum = (off_t)-1;
+		break;
+	    }
+	}
+
+	if (i < blhdr->num_blocks) {
+	    break;
+	}
+    }
+
+    return 0;
+}
+
+
+static int
+journal_binfo_cmp(void *a, void *b)
+{
+    block_info *bi_a = (struct block_info *)a,
+	       *bi_b = (struct block_info *)b;
+    daddr_t res;
+
+    if (bi_a->bp == NULL) {
+	return 1;
+    }
+    if (bi_b->bp == NULL) {
+	return -1;
+    }
+
+    // don't have to worry about negative block
+    // numbers so this is ok to do.
+    //
+    res = (bi_a->bp->b_blkno - bi_b->bp->b_blkno);
+
+    return (int)res;
+}
+
+
+static int
+end_transaction(transaction *tr, int force_it)
+{
+    int i, j, ret, amt;
+    off_t end;
+    journal *jnl = tr->jnl;
+    struct buf *bp;
+    block_list_header *blhdr=NULL, *next=NULL;
+
+    if (jnl->cur_tr) {
+	panic("jnl: jnl @ 0x%x already has cur_tr 0x%x, new tr: 0x%x\n",
+	      jnl, jnl->cur_tr, tr);
+    }
+
+    // if there weren't any modified blocks in the transaction
+    // just save off the transaction pointer and return.
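
A transaction's total_bytes is seeded with one jnl->jhdr->blhdr_size when the tbuffer is set up in journal_start_transaction(), and it only grows when journal_modify_block_end() accounts a block's bsize or chains on another header. So the equality test below really does mean "nothing was journaled"; a hypothetical helper would read:

    /*
     * Hypothetical predicate equivalent to the test below: a transaction
     * whose total_bytes never grew past the initial header is empty.
     */
    static int
    transaction_is_empty(journal *jnl, transaction *tr)
    {
        return (tr->total_bytes == jnl->jhdr->blhdr_size);
    }
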
+    if (tr->total_bytes == jnl->jhdr->blhdr_size) {
+	jnl->cur_tr = tr;
+	return 0;
+    }
+
+    // if our transaction buffer isn't very full, just hang
+    // on to it and don't actually flush anything.  this is
+    // what is known as "group commit".  we will flush the
+    // transaction buffer if it's full or if we have more than
+    // one of them so we don't start hogging too much memory.
+    //
+    if (   force_it == 0
+	&& (jnl->flags & JOURNAL_NO_GROUP_COMMIT) == 0
+	&& tr->num_blhdrs < 3
+	&& (tr->total_bytes <= ((tr->tbuffer_size*tr->num_blhdrs) - tr->tbuffer_size/8))) {
+
+	jnl->cur_tr = tr;
+	return 0;
+    }
+
+
+    // if we're here we're going to flush the transaction buffer to disk.
+    // make sure there is room in the journal first.
+    check_free_space(jnl, tr->total_bytes);
+
+    // range check the end index
+    if (jnl->jhdr->end <= 0 || jnl->jhdr->end > jnl->jhdr->size) {
+	panic("jnl: end_transaction: end is bogus 0x%llx (sz 0x%llx)\n",
+	      jnl->jhdr->end, jnl->jhdr->size);
+    }
+
+    // this transaction starts where the current journal ends
+    tr->journal_start = jnl->jhdr->end;
+    end = jnl->jhdr->end;
+
+    //
+    // if the first entry in old_start[] isn't free yet, loop calling the
+    // file system flush routine until it is (or we panic).
+    //
+    i = 0;
+    simple_lock(&jnl->old_start_lock);
+    while ((jnl->old_start[0] & 0x8000000000000000LL) != 0) {
+	if (jnl->flush) {
+	    simple_unlock(&jnl->old_start_lock);
+
+	    if (jnl->flush) {
+		jnl->flush(jnl->flush_arg);
+	    }
+
+	    // yield the cpu so others can get in to clear the lock bit
+	    (void)tsleep((void *)jnl, PRIBIO, "jnl-old-start-sleep", 1);
+
+	    simple_lock(&jnl->old_start_lock);
+	}
+	if (i++ >= 100) {
+	    panic("jnl: transaction that started at 0x%llx is not completing! jnl 0x%x\n",
+		  jnl->old_start[0] & (~0x8000000000000000LL), jnl);
+	}
+    }
+
+    //
+    // slide everyone else down and put our latest guy in the last
+    // entry in the old_start array
+    //
+    memcpy(&jnl->old_start[0], &jnl->old_start[1], sizeof(jnl->old_start)-sizeof(jnl->old_start[0]));
+    jnl->old_start[sizeof(jnl->old_start)/sizeof(jnl->old_start[0]) - 1] = tr->journal_start | 0x8000000000000000LL;
+
+    simple_unlock(&jnl->old_start_lock);
+
+
+    // for each block, make sure that the physical block # is set
+    for(blhdr=tr->blhdr; blhdr; blhdr=next) {
+
+	for(i=1; i < blhdr->num_blocks; i++) {
+
+	    bp = blhdr->binfo[i].bp;
+	    if (bp == NULL) {   // only true if a block was "killed"
+		if (blhdr->binfo[i].bnum != (off_t)-1) {
+		    panic("jnl: inconsistent binfo (NULL bp w/bnum %lld; jnl @ 0x%x, tr 0x%x)\n",
+			  blhdr->binfo[i].bnum, jnl, tr);
+		}
+		continue;
+	    }
+
+	    if (bp->b_vp == NULL && bp->b_lblkno == bp->b_blkno) {
+		panic("jnl: end_tr: DANGER! bp @ 0x%x w/null vp and l/blkno = %d/%d\n",
+		      bp, bp->b_lblkno, bp->b_blkno);
+	    }
+
+	    // if the lblkno is the same as blkno and this bp isn't
+	    // associated with the underlying file system device then
+	    // we need to call bmap() to get the actual physical block.
+	    //
+	    if ((bp->b_lblkno == bp->b_blkno) && (bp->b_vp != jnl->fsdev)) {
+		if (VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL) != 0) {
+		    printf("jnl: end_tr: can't bmap the bp @ 0x%x, jnl 0x%x\n", bp, jnl);
+		    goto bad_journal;
+		}
+	    }
+
+	    // update this so we write out the correct physical block number!
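
Recording the physical block matters because replay runs before the file system is usable: all it has is the raw device plus the bnum/bsize pairs, with no vnode around to redo the logical-to-physical translation. A sketch of the addressing replay would use (do_journal_io() is invented here purely for illustration):

    /*
     * Hypothetical replay step: each journaled block gets copied to
     * bnum * jhdr_size bytes into the file system device.  do_journal_io()
     * is a made-up stand-in for the real device I/O path.
     */
    extern int do_journal_io(struct vnode *dev, char *data, size_t len, off_t off);

    static int
    replay_one_block_sketch(journal *jnl, block_info *bi, char *data)
    {
        off_t byte_off = bi->bnum * (off_t)jnl->jhdr->jhdr_size;

        return do_journal_io(jnl->fsdev, data, bi->bsize, byte_off);
    }
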
+ blhdr->binfo[i].bnum = bp->b_blkno; + } + + next = (block_list_header *)((long)blhdr->binfo[0].bnum); + } + + for(blhdr=tr->blhdr; blhdr; blhdr=(block_list_header *)((long)blhdr->binfo[0].bnum)) { + + amt = blhdr->bytes_used; + + blhdr->checksum = 0; + blhdr->checksum = calc_checksum((char *)blhdr, BLHDR_CHECKSUM_SIZE); + + ret = write_journal_data(jnl, &end, blhdr, amt); + if (ret != amt) { + printf("jnl: end_transaction: only wrote %d of %d bytes to the journal!\n", + ret, amt); + + goto bad_journal; + } + } + + jnl->jhdr->end = end; // update where the journal now ends + tr->journal_end = end; // the transaction ends here too + if (tr->journal_start == 0 || tr->journal_end == 0) { + panic("jnl: end_transaction: bad tr journal start/end: 0x%llx 0x%llx\n", + tr->journal_start, tr->journal_end); + } + + if (write_journal_header(jnl) != 0) { + goto bad_journal; + } + + // + // setup for looping through all the blhdr's. we null out the + // tbuffer and blhdr fields so that they're not used any more. + // + blhdr = tr->blhdr; + tr->tbuffer = NULL; + tr->blhdr = NULL; + + // the buffer_flushed_callback will only be called for the + // real blocks that get flushed so we have to account for + // the block_list_headers here. + // + tr->num_flushed = tr->num_blhdrs * jnl->jhdr->blhdr_size; + + // for each block, set the iodone callback and unlock it + for(; blhdr; blhdr=next) { + + // we can re-order the buf ptrs because everything is written out already + qsort(&blhdr->binfo[1], blhdr->num_blocks-1, sizeof(block_info), journal_binfo_cmp); + + for(i=1; i < blhdr->num_blocks; i++) { + if (blhdr->binfo[i].bp == NULL) { + continue; + } + + ret = meta_bread(blhdr->binfo[i].bp->b_vp, + (daddr_t)blhdr->binfo[i].bp->b_lblkno, + blhdr->binfo[i].bp->b_bufsize, + NOCRED, + &bp); + if (ret == 0 && bp != NULL) { + struct vnode *save_vp; + + if (bp != blhdr->binfo[i].bp) { + panic("jnl: end_tr: got back a different bp! (bp 0x%x should be 0x%x, jnl 0x%x\n", + bp, blhdr->binfo[i].bp, jnl); + } + + if ((bp->b_flags & (B_LOCKED|B_DELWRI)) != (B_LOCKED|B_DELWRI)) { + if (jnl->flags & JOURNAL_CLOSE_PENDING) { + brelse(bp); + continue; + } else { + panic("jnl: end_tr: !!!DANGER!!! bp 0x%x flags (0x%x) not LOCKED & DELWRI\n", bp, bp->b_flags); + } + } + + if (bp->b_iodone != NULL) { + panic("jnl: bp @ 0x%x (blkno %d, vp 0x%x) has non-null iodone (0x%x) buffflushcb 0x%x\n", + bp, bp->b_blkno, bp->b_vp, bp->b_iodone, buffer_flushed_callback); + } + + save_vp = bp->b_vp; + + bp->b_iodone = buffer_flushed_callback; + bp->b_transaction = tr; + bp->b_flags |= B_CALL; + bp->b_flags &= ~(B_LOCKED); + + // kicking off the write here helps performance + bawrite(bp); + // XXXdbg this is good for testing: bdwrite(bp); + //bdwrite(bp); + + // this undoes the vget() in journal_modify_block_end() + vrele(save_vp); + + } else { + printf("jnl: end_transaction: could not find block %Ld vp 0x%x!\n", + blhdr->binfo[i].bnum, blhdr->binfo[i].bp); + } + } + + next = (block_list_header *)((long)blhdr->binfo[0].bnum); + + // we can free blhdr here since we won't need it any more + blhdr->binfo[0].bnum = 0xdeadc0de; + kmem_free(kernel_map, (vm_offset_t)blhdr, tr->tbuffer_size); + } + + //printf("jnl: end_tr: tr @ 0x%x, jnl-blocks: 0x%llx - 0x%llx. 
exit!\n", + // tr, tr->journal_start, tr->journal_end); + return 0; + + + bad_journal: + jnl->flags |= JOURNAL_INVALID; + abort_transaction(jnl, tr); + return -1; +} + +static void +abort_transaction(journal *jnl, transaction *tr) +{ + int i, ret; + block_list_header *blhdr, *next; + struct buf *bp; + + // for each block list header, iterate over the blocks then + // free up the memory associated with the block list. + // + // for each block, clear the lock bit and release it. + // + for(blhdr=tr->blhdr; blhdr; blhdr=next) { + + for(i=1; i < blhdr->num_blocks; i++) { + if (blhdr->binfo[i].bp == NULL) { + continue; + } + + ret = meta_bread(blhdr->binfo[i].bp->b_vp, + (daddr_t)blhdr->binfo[i].bp->b_lblkno, + blhdr->binfo[i].bp->b_bufsize, + NOCRED, + &bp); + if (ret == 0 && bp != NULL) { + if (bp != blhdr->binfo[i].bp) { + panic("jnl: abort_tr: got back a different bp! (bp 0x%x should be 0x%x, jnl 0x%x\n", + bp, blhdr->binfo[i].bp, jnl); + } + + // clear the locked bit and the delayed-write bit. we + // don't want these blocks going to disk. + bp->b_flags &= ~(B_LOCKED|B_DELWRI); + bp->b_flags |= B_INVAL; + + brelse(bp); + + } else { + printf("jnl: abort_tr: could not find block %Ld vp 0x%x!\n", + blhdr->binfo[i].bnum, blhdr->binfo[i].bp); + } + } + + next = (block_list_header *)((long)blhdr->binfo[0].bnum); + + // we can free blhdr here since we won't need it any more + blhdr->binfo[0].bnum = 0xdeadc0de; + kmem_free(kernel_map, (vm_offset_t)blhdr, tr->tbuffer_size); + } + + tr->tbuffer = NULL; + tr->blhdr = NULL; + tr->total_bytes = 0xdbadc0de; + kmem_free(kernel_map, (vm_offset_t)tr, sizeof(transaction)); +} + + +int +journal_end_transaction(journal *jnl) +{ + int ret; + transaction *tr; + + CHECK_JOURNAL(jnl); + + if ((jnl->flags & JOURNAL_INVALID) && jnl->owner == NULL) { + return 0; + } + + if (jnl->owner != current_act()) { + panic("jnl: end_tr: I'm not the owner! jnl 0x%x, owner 0x%x, curact 0x%x\n", + jnl, jnl->owner, current_act()); + } + + free_old_stuff(jnl); + + jnl->nested_count--; + if (jnl->nested_count > 0) { + return 0; + } else if (jnl->nested_count < 0) { + panic("jnl: jnl @ 0x%x has negative nested count (%d). bad boy.\n", jnl, jnl->nested_count); + } + + if (jnl->flags & JOURNAL_INVALID) { + if (jnl->active_tr) { + transaction *tr; + + if (jnl->cur_tr != NULL) { + panic("jnl: journal @ 0x%x has active tr (0x%x) and cur tr (0x%x)\n", + jnl, jnl->active_tr, jnl->cur_tr); + } + + tr = jnl->active_tr; + jnl->active_tr = NULL; + abort_transaction(jnl, tr); + } + + jnl->owner = NULL; + semaphore_signal(jnl->jsem); + + return EINVAL; + } + + tr = jnl->active_tr; + CHECK_TRANSACTION(tr); + + // clear this out here so that when check_free_space() calls + // the FS flush function, we don't panic in journal_flush() + // if the FS were to call that. note: check_free_space() is + // called from end_transaction(). 
+    //
+    jnl->active_tr = NULL;
+    ret = end_transaction(tr, 0);
+
+    jnl->owner = NULL;
+    semaphore_signal(jnl->jsem);
+
+    return ret;
+}
+
+
+int
+journal_flush(journal *jnl)
+{
+    int need_signal = 0;
+
+    CHECK_JOURNAL(jnl);
+
+    if (jnl->flags & JOURNAL_INVALID) {
+	return -1;
+    }
+
+    if (jnl->owner != current_act()) {
+	int ret;
+
+	while ((ret = semaphore_wait(jnl->jsem)) == KERN_ABORTED) {
+	    // just keep looping if we've been ^C'ed
+	}
+	if (ret != 0) {
+	    printf("jnl: flush: sem wait failed.\n");
+	    return -1;
+	}
+	need_signal = 1;
+    }
+
+    free_old_stuff(jnl);
+
+    // if we're not active, flush any buffered transactions
+    if (jnl->active_tr == NULL && jnl->cur_tr) {
+	transaction *tr = jnl->cur_tr;
+
+	jnl->cur_tr = NULL;
+	end_transaction(tr, 1);   // force it to get flushed
+    }
+
+    if (need_signal) {
+	semaphore_signal(jnl->jsem);
+    }
+
+    return 0;
+}
+
+int
+journal_active(journal *jnl)
+{
+    if (jnl->flags & JOURNAL_INVALID) {
+	return -1;
+    }
+
+    return (jnl->active_tr == NULL) ? 0 : 1;
+}
diff --git a/bsd/vfs/vfs_journal.h b/bsd/vfs/vfs_journal.h
new file mode 100644
index 000000000..523ba7d52
--- /dev/null
+++ b/bsd/vfs/vfs_journal.h
@@ -0,0 +1,238 @@
+
+/*
+ * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved.
+ *
+ * @APPLE_LICENSE_HEADER_START@
+ *
+ * The contents of this file constitute Original Code as defined in and
+ * are subject to the Apple Public Source License Version 1.1 (the
+ * "License").  You may not use this file except in compliance with the
+ * License.  Please obtain a copy of the License at
+ * http://www.apple.com/publicsource and read it before using this file.
+ *
+ * This Original Code and all software distributed under the License are
+ * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
+ * License for the specific language governing rights and limitations
+ * under the License.
+ *
+ * @APPLE_LICENSE_HEADER_END@
+ */
+/*
+ * This header contains the structures and function prototypes
+ * for the vfs journaling code.  The data types are not meant
+ * to be modified by user code.  Just use the functions and do
+ * not mess around with the structs.
+ */ +#ifndef _SYS_VFS_JOURNAL_H_ +#define _SYS_VFS_JOURNAL_H_ + +#include + +#ifdef __APPLE_API_UNSTABLE + +#include + +typedef struct block_info { + off_t bnum; // block # on the file system device + size_t bsize; // in bytes + struct buf *bp; +} block_info; + +typedef struct block_list_header { + u_int16_t max_blocks; // max number of blocks in this chunk + u_int16_t num_blocks; // number of valid block numbers in block_nums + int32_t bytes_used; // how many bytes of this tbuffer are used + int32_t checksum; // on-disk: checksum of this header and binfo[0] + int32_t pad; // pad out to 16 bytes + block_info binfo[1]; // so we can reference them by name +} block_list_header; + + +struct journal; + +typedef struct transaction { + int tbuffer_size; // in bytes + char *tbuffer; // memory copy of the transaction + block_list_header *blhdr; // points to the first byte of tbuffer + int num_blhdrs; // how many buffers we've allocated + int total_bytes; // total # of bytes in transaction + int num_flushed; // how many bytes have been flushed + int num_killed; // how many bytes were "killed" + off_t journal_start; // where in the journal this transaction starts + off_t journal_end; // where in the journal this transaction ends + struct journal *jnl; // ptr back to the journal structure + struct transaction *next; // list of tr's (either completed or to be free'd) +} transaction; + + +/* + * This is written to block zero of the journal and it + * maintains overall state about the journal. + */ +typedef struct journal_header { + int32_t magic; + int32_t endian; + volatile off_t start; // zero-based byte offset of the start of the first transaction + volatile off_t end; // zero-based byte offset of where free space begins + off_t size; // size in bytes of the entire journal + int32_t blhdr_size; // size in bytes of each block_list_header in the journal + int32_t checksum; + int32_t jhdr_size; // block size (in bytes) of the journal header +} journal_header; + +#define JOURNAL_HEADER_MAGIC 0x4a4e4c78 // 'JNLx' +#define ENDIAN_MAGIC 0x12345678 + +#define OLD_JOURNAL_HEADER_MAGIC 0x4a484452 // 'JHDR' + + +/* + * In memory structure about the journal. 
+ */
+typedef struct journal {
+    struct vnode       *jdev;              // vnode of the device where the journal lives
+    off_t               jdev_offset;       // byte offset to the start of the journal
+
+    struct vnode       *fsdev;             // vnode of the file system device
+
+    void              (*flush)(void *arg); // fs callback to flush meta data blocks
+    void               *flush_arg;         // arg that's passed to flush()
+
+    int32_t             flags;
+    int32_t             tbuffer_size;      // default transaction buffer size
+
+    char               *header_buf;        // in-memory copy of the journal header
+    journal_header     *jhdr;              // points to the first byte of header_buf
+
+    transaction        *cur_tr;            // for group-commit
+    transaction        *completed_trs;     // out-of-order transactions that completed
+    transaction        *active_tr;         // for nested transactions
+    int32_t             nested_count;      // for nested transactions
+    void               *owner;             // a ptr that's unique to the calling process
+
+    transaction        *tr_freeme;         // transaction structs that need to be free'd
+
+    volatile off_t      active_start;      // the active start that we only keep in memory
+    simple_lock_data_t  old_start_lock;    // guard access
+    volatile off_t      old_start[16];     // this is how we do lazy start update
+
+    semaphore_t         jsem;
+} journal;
+
+/* internal-only journal flags (top 16 bits) */
+#define JOURNAL_CLOSE_PENDING     0x00010000
+#define JOURNAL_INVALID           0x00020000
+
+/* journal_open/create options are always in the low-16 bits */
+#define JOURNAL_OPTION_FLAGS_MASK 0x0000ffff
+
+/*
+ * Prototypes.
+ */
+
+/*
+ * Call journal_create() to create a new journal.  You only
+ * call this once, typically at file system creation time.
+ *
+ * The "jvp" argument is the vnode where the journal is written.
+ * The journal starts at "offset" and is "journal_size" bytes long.
+ *
+ * The "fsvp" argument is the vnode of your file system.  It may be
+ * the same as "jvp".
+ *
+ * The "min_fs_block_size" argument is the minimum block size
+ * (in bytes) that the file system will ever write.  Typically
+ * this is the block size of the file system (1k, 4k, etc) but
+ * on HFS+ it is the minimum block size of the underlying device.
+ *
+ * The flags argument lets you disable group commit if you
+ * want tighter guarantees on transactions (in exchange for
+ * lower performance).
+ *
+ * The tbuffer_size is the size of the transaction buffer
+ * used by the journal.  If you specify zero, the journal code
+ * will use a reasonable default.  The tbuffer_size should
+ * be an integer multiple of the min_fs_block_size.
+ *
+ * Returns a valid journal pointer or NULL if one could not
+ * be created.
+ */
+journal *journal_create(struct vnode *jvp,
+			off_t         offset,
+			off_t         journal_size,
+			struct vnode *fsvp,
+			size_t        min_fs_block_size,
+			int32_t       flags,
+			int32_t       tbuffer_size,
+			void        (*flush)(void *arg),
+			void         *arg);
+
+/*
+ * Call journal_open() when mounting an existing file system
+ * that has a previously created journal.  It will take care
+ * of validating the journal and replaying it if necessary.
+ *
+ * See journal_create() for a description of the arguments.
+ *
+ * Returns a valid journal pointer or NULL if it runs into
+ * trouble reading/playing back the journal.
+ */
+journal *journal_open(struct vnode *jvp,
+		      off_t         offset,
+		      off_t         journal_size,
+		      struct vnode *fsvp,
+		      size_t        min_fs_block_size,
+		      int32_t       flags,
+		      int32_t       tbuffer_size,
+		      void        (*flush)(void *arg),
+		      void         *arg);
+
+/*
+ * Call journal_close() just before your file system is unmounted.
+ * It flushes any outstanding transactions and makes sure the
+ * journal is in a consistent state.
+ */
+void journal_close(journal *journal);
+
+/*
+ * flags for journal_create/open.  only can use
+ * the low 16 bits for flags because internal
+ * bits go in the high 16.
+ */
+#define JOURNAL_NO_GROUP_COMMIT   0x00000001
+#define JOURNAL_RESET             0x00000002
+
+/*
+ * Transaction related functions.
+ *
+ * Before you start modifying file system meta data, you
+ * should call journal_start_transaction().  Then before
+ * you modify each block, call journal_modify_block_start()
+ * and when you're done, journal_modify_block_end().  When
+ * you've modified the last block as part of a transaction,
+ * call journal_end_transaction() to commit the changes.
+ *
+ * If you decide to abort the modifications to a block you
+ * should call journal_modify_block_abort().
+ *
+ * If as part of a transaction you want to throw out
+ * any previous copies of a block (because it got deleted)
+ * then call journal_kill_block().  This will mark it so
+ * that the journal does not play it back (effectively
+ * dropping it).
+ */
+int   journal_start_transaction(journal *jnl);
+int   journal_modify_block_start(journal *jnl, struct buf *bp);
+int   journal_modify_block_abort(journal *jnl, struct buf *bp);
+int   journal_modify_block_end(journal *jnl, struct buf *bp);
+int   journal_kill_block(journal *jnl, struct buf *bp);
+int   journal_end_transaction(journal *jnl);
+
+int   journal_active(journal *jnl);
+int   journal_flush(journal *jnl);
+
+#endif /* __APPLE_API_UNSTABLE */
+#endif /* !_SYS_VFS_JOURNAL_H_ */
diff --git a/bsd/vfs/vfs_subr.c b/bsd/vfs/vfs_subr.c
index c49f321c2..ce79f9d4d 100644
--- a/bsd/vfs/vfs_subr.c
+++ b/bsd/vfs/vfs_subr.c
@@ -677,12 +677,22 @@ vinvalbuf(vp, flags, cred, p, slpflag, slptimeo)
 		if (error = VOP_FSYNC(vp, cred, MNT_WAIT, p)) {
 			return (error);
 		}
-		if (vp->v_dirtyblkhd.lh_first)
-			panic("vinvalbuf: dirty bufs");
+
+		// XXXdbg - if there are dirty bufs, wait for 'em if they're busy
+		for (bp=vp->v_dirtyblkhd.lh_first; bp; bp=nbp) {
+			nbp = bp->b_vnbufs.le_next;
+			if (ISSET(bp->b_flags, B_BUSY)) {
+				SET(bp->b_flags, B_WANTED);
+				tsleep((caddr_t)bp, slpflag | (PRIBIO + 1), "vinvalbuf", 0);
+				nbp = vp->v_dirtyblkhd.lh_first;
+			} else {
+				panic("vinvalbuf: dirty buf (vp 0x%x, bp 0x%x)", vp, bp);
+			}
+		}
 	}
 	for (;;) {
-		if ((blist = vp->v_cleanblkhd.lh_first) && flags & V_SAVEMETA)
+		if ((blist = vp->v_cleanblkhd.lh_first) && (flags & V_SAVEMETA))
 			while (blist && blist->b_lblkno < 0)
 				blist = blist->b_vnbufs.le_next;
 		if (!blist && (blist = vp->v_dirtyblkhd.lh_first) &&
@@ -694,7 +704,7 @@ vinvalbuf(vp, flags, cred, p, slpflag, slptimeo)
 
 		for (bp = blist; bp; bp = nbp) {
 			nbp = bp->b_vnbufs.le_next;
-			if (flags & V_SAVEMETA && bp->b_lblkno < 0)
+			if ((flags & V_SAVEMETA) && bp->b_lblkno < 0)
 				continue;
 			s = splbio();
 			if (ISSET(bp->b_flags, B_BUSY)) {
@@ -720,7 +730,13 @@ vinvalbuf(vp, flags, cred, p, slpflag, slptimeo)
 				(void) VOP_BWRITE(bp);
 				break;
 			}
-			SET(bp->b_flags, B_INVAL);
+
+			if (bp->b_flags & B_LOCKED) {
+				panic("vinvalbuf: bp @ 0x%x is locked!\n", bp);
+				break;
+			} else {
+				SET(bp->b_flags, B_INVAL);
+			}
 			brelse(bp);
 		}
 	}
diff --git a/iokit/IOKit/IOKitKeys.h b/iokit/IOKit/IOKitKeys.h
index 6862e89e9..65fe7197d 100644
--- a/iokit/IOKit/IOKitKeys.h
+++ b/iokit/IOKit/IOKitKeys.h
@@ -96,10 +96,14 @@
 #define kIOCommandPoolSizeKey		"IOCommandPoolSize"		// (OSNumber)
 
 // properties found in services that have transfer constraints
-#define kIOMaximumBlockCountReadKey	"IOMaximumBlockCountRead"	// (OSNumber)
-#define kIOMaximumBlockCountWriteKey	"IOMaximumBlockCountWrite"	// (OSNumber)
-#define
kIOMaximumSegmentCountReadKey "IOMaximumSegmentCountRead" // (OSNumber) -#define kIOMaximumSegmentCountWriteKey "IOMaximumSegmentCountWrite" // (OSNumber) +#define kIOMaximumBlockCountReadKey "IOMaximumBlockCountRead" // (OSNumber) +#define kIOMaximumBlockCountWriteKey "IOMaximumBlockCountWrite" // (OSNumber) +#define kIOMaximumByteCountReadKey "IOMaximumByteCountRead" // (OSNumber) +#define kIOMaximumByteCountWriteKey "IOMaximumByteCountWrite" // (OSNumber) +#define kIOMaximumSegmentCountReadKey "IOMaximumSegmentCountRead" // (OSNumber) +#define kIOMaximumSegmentCountWriteKey "IOMaximumSegmentCountWrite" // (OSNumber) +#define kIOMaximumSegmentByteCountReadKey "IOMaximumSegmentByteCountRead" // (OSNumber) +#define kIOMaximumSegmentByteCountWriteKey "IOMaximumSegmentByteCountWrite" // (OSNumber) // properties found in services that wish to describe an icon // diff --git a/iokit/KernelConfigTables.cpp b/iokit/KernelConfigTables.cpp index ff0b955c9..1eedcc6df 100644 --- a/iokit/KernelConfigTables.cpp +++ b/iokit/KernelConfigTables.cpp @@ -28,16 +28,16 @@ */ const char * gIOKernelKmods = "{ - 'com.apple.kernel' = '6.1'; - 'com.apple.kernel.bsd' = '6.1'; - 'com.apple.kernel.iokit' = '6.1'; - 'com.apple.kernel.libkern' = '6.1'; - 'com.apple.kernel.mach' = '6.1'; - 'com.apple.iokit.IOADBFamily' = '1.1'; - 'com.apple.iokit.IONVRAMFamily' = '1.1'; - 'com.apple.iokit.IOSystemManagementFamily' = '1.1'; - 'com.apple.iokit.ApplePlatformFamily' = '1.0'; - 'com.apple.driver.AppleNMI' = '1.0'; + 'com.apple.kernel' = '6.2'; + 'com.apple.kernel.bsd' = '6.2'; + 'com.apple.kernel.iokit' = '6.2'; + 'com.apple.kernel.libkern' = '6.2'; + 'com.apple.kernel.mach' = '6.2'; + 'com.apple.iokit.IOADBFamily' = '6.2'; + 'com.apple.iokit.IONVRAMFamily' = '6.2'; + 'com.apple.iokit.IOSystemManagementFamily' = '6.2'; + 'com.apple.iokit.ApplePlatformFamily' = '6.2'; + 'com.apple.driver.AppleNMI' = '6.2'; }"; diff --git a/iokit/conf/version.minor b/iokit/conf/version.minor index d00491fd7..0cfbf0888 100644 --- a/iokit/conf/version.minor +++ b/iokit/conf/version.minor @@ -1 +1 @@ -1 +2 diff --git a/libkern/conf/version.minor b/libkern/conf/version.minor index d00491fd7..0cfbf0888 100644 --- a/libkern/conf/version.minor +++ b/libkern/conf/version.minor @@ -1 +1 @@ -1 +2 diff --git a/libsa/conf/version.minor b/libsa/conf/version.minor index d00491fd7..0cfbf0888 100644 --- a/libsa/conf/version.minor +++ b/libsa/conf/version.minor @@ -1 +1 @@ -1 +2 diff --git a/osfmk/conf/kernelversion.minor b/osfmk/conf/kernelversion.minor index d00491fd7..0cfbf0888 100644 --- a/osfmk/conf/kernelversion.minor +++ b/osfmk/conf/kernelversion.minor @@ -1 +1 @@ -1 +2 diff --git a/osfmk/conf/version.minor b/osfmk/conf/version.minor index d00491fd7..0cfbf0888 100644 --- a/osfmk/conf/version.minor +++ b/osfmk/conf/version.minor @@ -1 +1 @@ -1 +2 diff --git a/osfmk/i386/loose_ends.c b/osfmk/i386/loose_ends.c index e64faedb2..b0cd27fff 100644 --- a/osfmk/i386/loose_ends.c +++ b/osfmk/i386/loose_ends.c @@ -64,6 +64,49 @@ */ +/* + * copy 'size' bytes from physical to physical address + * the caller must validate the physical ranges + * + * if flush_action == 0, no cache flush necessary + * if flush_action == 1, flush the source + * if flush_action == 2, flush the dest + * if flush_action == 3, flush both source and dest + */ + +kern_return_t copyp2p(vm_offset_t source, vm_offset_t dest, unsigned int size, unsigned int flush_action) { + + switch(flush_action) { + case 1: + flush_dcache(source, size, 1); + break; + case 2: + flush_dcache(dest, size, 1); + 
+			break;
+		case 3:
+			flush_dcache(source, size, 1);
+			flush_dcache(dest, size, 1);
+			break;
+
+	}
+	bcopy_phys((char *)source, (char *)dest, size);	/* Do a physical copy */
+
+	switch(flush_action) {
+		case 1:
+			flush_dcache(source, size, 1);
+			break;
+		case 2:
+			flush_dcache(dest, size, 1);
+			break;
+		case 3:
+			flush_dcache(source, size, 1);
+			flush_dcache(dest, size, 1);
+			break;
+
+	}
+	return KERN_SUCCESS;
+}
 
 /*
  * Copies data from a physical page to a virtual page.  This is used to
diff --git a/osfmk/ppc/cswtch.s b/osfmk/ppc/cswtch.s
index 3cca411b2..148730393 100644
--- a/osfmk/ppc/cswtch.s
+++ b/osfmk/ppc/cswtch.s
@@ -871,7 +871,7 @@ fsenable:	lwz	r8,savesrr1(r25)	; Get the msr of the interrupted guy
 		rlwinm.	r0,r8,0,MSR_PR_BIT,MSR_PR_BIT	; See if we are doing this for user state
 		stw	r8,savesrr1(r25)	; Set the msr of the interrupted guy
 		xor	r3,r25,r5		; Get the real address of the savearea
-		bne-	fsnuser			; We are not user state...
+		beq-	fsnuser			; We are not user state...
 
 		stw	r10,ACT_MACT_SPF(r17)	; Set the activation copy
 		stw	r10,spcFlags(r26)	; Set per_proc copy
@@ -2297,7 +2297,7 @@ vrenable:	lwz	r8,savesrr1(r25)	; Get the msr of the interrupted guy
 		rlwinm.	r0,r8,0,MSR_PR_BIT,MSR_PR_BIT	; See if we are doing this for user state
 		stw	r8,savesrr1(r25)	; Set the msr of the interrupted guy
 		xor	r3,r25,r5		; Get the real address of the savearea
-		bne-	vrnuser			; We are not user state...
+		beq-	vrnuser			; We are not user state...
 
 		stw	r10,ACT_MACT_SPF(r17)	; Set the activation copy
 		stw	r10,spcFlags(r26)	; Set per_proc copy
diff --git a/osfmk/ppc/mappings.c b/osfmk/ppc/mappings.c
index de3411de9..237e2bc12 100644
--- a/osfmk/ppc/mappings.c
+++ b/osfmk/ppc/mappings.c
@@ -70,6 +70,7 @@
 #endif
 
 vm_map_t	mapping_map = VM_MAP_NULL;
+#define		MAPPING_MAP_SIZE	33554432	/* 32MB address space */
 
 unsigned int	incrVSID = 0;		/* VSID increment value */
 unsigned int	mappingdeb0 = 0;
@@ -1548,7 +1549,7 @@ void mapping_free_prime(void) {	/* Primes the mapping block release list
 	mappingblok	*mbn;
 	vm_offset_t	mapping_min;
 
-	retr = kmem_suballoc(kernel_map, &mapping_min, mem_size / 16,
+	retr = kmem_suballoc(kernel_map, &mapping_min, MAPPING_MAP_SIZE,
 		FALSE, TRUE, &mapping_map);
 
 	if (retr != KERN_SUCCESS)
@@ -1877,6 +1878,50 @@ kern_return_t copyp2v(vm_offset_t source, vm_offset_t sink, unsigned int size)
 }
 
+/*
+ * copy 'size' bytes from physical to physical address
+ * the caller must validate the physical ranges
+ *
+ * if flush_action == 0, no cache flush necessary
+ * if flush_action == 1, flush the source
+ * if flush_action == 2, flush the dest
+ * if flush_action == 3, flush both source and dest
+ */
+
+kern_return_t copyp2p(vm_offset_t source, vm_offset_t dest, unsigned int size, unsigned int flush_action) {
+
+	switch(flush_action) {
+	case 1:
+		flush_dcache(source, size, 1);
+		break;
+	case 2:
+		flush_dcache(dest, size, 1);
+		break;
+	case 3:
+		flush_dcache(source, size, 1);
+		flush_dcache(dest, size, 1);
+		break;
+
+	}
+	bcopy_phys((char *)source, (char *)dest, size);	/* Do a physical copy */
+
+	switch(flush_action) {
+	case 1:
+		flush_dcache(source, size, 1);
+		break;
+	case 2:
+		flush_dcache(dest, size, 1);
+		break;
+	case 3:
+		flush_dcache(source, size, 1);
+		flush_dcache(dest, size, 1);
+		break;
+
+	}
+	return KERN_SUCCESS;
+}
+
+
+
 #if DEBUG
 /*
  * Dumps out the mapping stuff associated with a virtual address
diff --git a/osfmk/ppc/pmap.c b/osfmk/ppc/pmap.c
index f6f6a8e34..85fabb668 100644
--- a/osfmk/ppc/pmap.c
+++ b/osfmk/ppc/pmap.c
@@ -483,6 +483,9 @@ pmap_bootstrap(unsigned int mem_size, vm_offset_t *first_avail, vm_offset_t *fir
 		hash_table_size *= 2)
 		continue;
 
+	if (num > (sizeof(pte_t) * 524288))
+		hash_table_size = hash_table_size/2;	/* reduce by half above 512MB */
+
 	/* Scale to within any physical memory layout constraints */
 	do {
 		num = atop(mem_size);	/* num now holds mem_size in pages */
diff --git a/osfmk/vm/vm_init.c b/osfmk/vm/vm_init.c
index 0a729a2a8..d066f2f58 100644
--- a/osfmk/vm/vm_init.c
+++ b/osfmk/vm/vm_init.c
@@ -100,7 +100,11 @@ vm_mem_bootstrap(void)
 	kmem_init(start, end);
 	pmap_init();
 
-	zsize = mem_size >> 2;			/* Get target zone size as 1/4 of physical memory */
+	if (PE_parse_boot_arg("zsize", &zsize))
+		zsize = zsize * 1024 * 1024;
+	else {
+		zsize = mem_size >> 2;		/* Get target zone size as 1/4 of physical memory */
+	}
 	if(zsize < ZONE_MAP_MIN) zsize = ZONE_MAP_MIN;	/* Clamp to min */
 	if(zsize > ZONE_MAP_MAX) zsize = ZONE_MAP_MAX;	/* Clamp to max */
 	zone_init(zsize);			/* Allocate address space for zones */
diff --git a/osfmk/vm/vm_kern.c b/osfmk/vm/vm_kern.c
index ab065e048..4d4ee31cb 100644
--- a/osfmk/vm/vm_kern.c
+++ b/osfmk/vm/vm_kern.c
@@ -85,9 +85,7 @@ vm_map_t	kernel_pageable_map;
 extern kern_return_t kmem_alloc_pages(
 	register vm_object_t		object,
 	register vm_object_offset_t	offset,
-	register vm_offset_t		start,
-	register vm_offset_t		end,
-	vm_prot_t			protection);
+	register vm_size_t		size);
 
 extern void kmem_remap_pages(
 	register vm_object_t		object,
@@ -254,8 +252,13 @@ kernel_memory_allocate(
 
 	/*
 	 *	Since we have not given out this address yet,
-	 *	it is safe to unlock the map.
+	 *	it is safe to unlock the map.  Except of course
+	 *	we must make certain no one coalesces our address
+	 *	or does a blind vm_deallocate and removes the object;
+	 *	an extra object reference will suffice to protect
+	 *	against both contingencies.
 	 */
+	vm_object_reference(object);
 	vm_map_unlock(map);
 
 	vm_object_lock(object);
@@ -271,6 +274,7 @@ kernel_memory_allocate(
 					offset + (vm_object_offset_t)i);
 			vm_object_unlock(object);
 			vm_map_remove(map, addr, addr + size, 0);
+			vm_object_deallocate(object);
 			return KERN_RESOURCE_SHORTAGE;
 		}
 		vm_object_unlock(object);
@@ -289,8 +293,11 @@ kernel_memory_allocate(
 			vm_object_unlock(object);
 		}
 		vm_map_remove(map, addr, addr + size, 0);
+		vm_object_deallocate(object);
 		return (kr);
 	}
+	/* now that the page is wired, we no longer have to fear coalesce */
+	vm_object_deallocate(object);
 	if (object == kernel_object)
 		vm_map_simplify(map, addr);
 
@@ -338,31 +345,26 @@ kmem_realloc(
 	vm_offset_t *newaddrp,
 	vm_size_t newsize)
 {
-	vm_offset_t oldmin, oldmax;
-	vm_offset_t newaddr;
-	vm_object_t object;
-	vm_map_entry_t oldentry, newentry;
-	kern_return_t kr;
+	vm_offset_t	oldmin, oldmax;
+	vm_offset_t	newaddr;
+	vm_offset_t	offset;
+	vm_object_t	object;
+	vm_map_entry_t	oldentry, newentry;
+	vm_page_t	mem;
+	kern_return_t	kr;
 
 	oldmin = trunc_page(oldaddr);
 	oldmax = round_page(oldaddr + oldsize);
 	oldsize = oldmax - oldmin;
 	newsize = round_page(newsize);
 
-	/*
-	 *	Find space for the new region.
-	 */
-
-	kr = vm_map_find_space(map, &newaddr, newsize, (vm_offset_t) 0,
-			       &newentry);
-	if (kr != KERN_SUCCESS) {
-		return kr;
-	}
 
 	/*
 	 *	Find the VM object backing the old region.
 	 */
+	vm_map_lock(map);
+
 	if (!vm_map_lookup_entry(map, oldmin, &oldentry))
 		panic("kmem_realloc");
 	object = oldentry->object.vm_object;
@@ -373,36 +375,71 @@ kmem_realloc(
 	 */
 	vm_object_reference(object);
 
+	/* by grabbing the object lock before unlocking the map */
+	/* we guarantee that we will panic if more than one     */
+	/* attempt is made to realloc a kmem_alloc'd area       */
 	vm_object_lock(object);
+	vm_map_unlock(map);
 	if (object->size != oldsize)
 		panic("kmem_realloc");
 	object->size = newsize;
 	vm_object_unlock(object);
 
-	newentry->object.vm_object = object;
-	newentry->offset = 0;
-	assert (newentry->wired_count == 0);
-	newentry->wired_count = 1;
+	/* allocate the new pages while expanded portion of the */
+	/* object is still not mapped */
+	kmem_alloc_pages(object, oldsize, newsize-oldsize);
 
 	/*
-	 *	Since we have not given out this address yet,
-	 *	it is safe to unlock the map.  We are trusting
-	 *	that nobody will play with either region.
+	 *	Find space for the new region.
 	 */
+	kr = vm_map_find_space(map, &newaddr, newsize, (vm_offset_t) 0,
+			       &newentry);
+	if (kr != KERN_SUCCESS) {
+		vm_object_lock(object);
+		for(offset = oldsize; offset < newsize; offset += PAGE_SIZE) {
+			if(mem = vm_page_lookup(object, offset)) {
+				vm_page_lock_queues();
+				vm_page_free(mem);
+				vm_page_unlock_queues();
+			}
+		}
+		object->size = oldsize;
+		vm_object_unlock(object);
+		vm_object_deallocate(object);
+		return kr;
+	}
+	newentry->object.vm_object = object;
+	newentry->offset = 0;
+	assert (newentry->wired_count == 0);
+
+
+	/* add an extra reference in case we have someone doing an */
+	/* unexpected deallocate */
+	vm_object_reference(object);
 	vm_map_unlock(map);
 
-	/*
-	 *	Remap the pages in the old region and
-	 *	allocate more pages for the new region.
-	 */
+	if ((kr = vm_map_wire(map, newaddr, newaddr + newsize,
+			      VM_PROT_DEFAULT, FALSE)) != KERN_SUCCESS) {
+		vm_map_remove(map, newaddr, newaddr + newsize, 0);
+		vm_object_lock(object);
+		for(offset = oldsize; offset < newsize; offset += PAGE_SIZE) {
+			if(mem = vm_page_lookup(object, offset)) {
+				vm_page_lock_queues();
+				vm_page_free(mem);
+				vm_page_unlock_queues();
+			}
+		}
+		object->size = oldsize;
+		vm_object_unlock(object);
+		vm_object_deallocate(object);
+		return (kr);
+	}
+	vm_object_deallocate(object);
 
-	kmem_remap_pages(object, 0,
-			 newaddr, newaddr + oldsize,
-			 VM_PROT_DEFAULT);
-	kmem_alloc_pages(object, oldsize,
-			 newaddr + oldsize, newaddr + newsize,
-			 VM_PROT_DEFAULT);
 
 	*newaddrp = newaddr;
 	return KERN_SUCCESS;
@@ -500,28 +537,21 @@ kmem_free(
 }
 
 /*
- *	Allocate new wired pages in an object.
- *	The object is assumed to be mapped into the kernel map or
- *	a submap.
+ *	Allocate new pages in an object.
 */
 
 kern_return_t
 kmem_alloc_pages(
	register vm_object_t		object,
	register vm_object_offset_t	offset,
-	register vm_offset_t		start,
-	register vm_offset_t		end,
-	vm_prot_t			protection)
+	register vm_size_t		size)
 {
-	/*
-	 *	Mark the pmap region as not pageable.
- */ - pmap_pageable(kernel_pmap, start, end, FALSE); - while (start < end) { + size = round_page(size); + vm_object_lock(object); + while (size) { register vm_page_t mem; - vm_object_lock(object); /* * Allocate a page @@ -533,27 +563,12 @@ kmem_alloc_pages( vm_object_lock(object); } - /* - * Wire it down - */ - vm_page_lock_queues(); - vm_page_wire(mem); - vm_page_unlock_queues(); - vm_object_unlock(object); - - /* - * Enter it in the kernel pmap - */ - PMAP_ENTER(kernel_pmap, start, mem, protection, - VM_WIMG_USE_DEFAULT, TRUE); - - vm_object_lock(object); - PAGE_WAKEUP_DONE(mem); - vm_object_unlock(object); - start += PAGE_SIZE; - offset += PAGE_SIZE_64; + offset += PAGE_SIZE; + size -= PAGE_SIZE; + mem->busy = FALSE; } + vm_object_unlock(object); return KERN_SUCCESS; } diff --git a/pexpert/conf/version.minor b/pexpert/conf/version.minor index d00491fd7..0cfbf0888 100644 --- a/pexpert/conf/version.minor +++ b/pexpert/conf/version.minor @@ -1 +1 @@ -1 +2