X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/6d2010ae8f7a6078e10b361c6962983bab233e0f..fe8ab488e9161c46dd9885d58fc52996dc0249ff:/bsd/vfs/vfs_journal.c

diff --git a/bsd/vfs/vfs_journal.c b/bsd/vfs/vfs_journal.c
index 4999f814b..628e5e7dc 100644
--- a/bsd/vfs/vfs_journal.c
+++ b/bsd/vfs/vfs_journal.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2002-2011 Apple Inc. All rights reserved.
+ * Copyright (c) 2002-2014 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -115,20 +115,22 @@ SYSCTL_INT(_vfs_generic_jnl_kdebug, OID_AUTO, trim, CTLFLAG_RW|CTLFLAG_LOCKED, &
 #define KERNEL_DEBUG KERNEL_DEBUG_CONSTANT
 #endif
 
+
 #ifndef CONFIG_HFS_TRIM
 #define CONFIG_HFS_TRIM 0
 #endif
 
+
 #if JOURNALING
 
 //
-// By default, we grow the list of extents to trim by one page at a time.
+// By default, we grow the list of extents to trim by 4K at a time.
 // We'll opt to flush a transaction if it contains at least
 // JOURNAL_FLUSH_TRIM_EXTENTS extents to be trimmed (even if the number
 // of modified blocks is small).
 //
 enum {
-	JOURNAL_DEFAULT_TRIM_BYTES = PAGE_SIZE,
+	JOURNAL_DEFAULT_TRIM_BYTES = 4096,
 	JOURNAL_DEFAULT_TRIM_EXTENTS = JOURNAL_DEFAULT_TRIM_BYTES / sizeof(dk_extent_t),
 	JOURNAL_FLUSH_TRIM_EXTENTS = JOURNAL_DEFAULT_TRIM_EXTENTS * 15 / 16
 };
@@ -136,8 +138,7 @@ enum {
 unsigned int jnl_trim_flush_limit = JOURNAL_FLUSH_TRIM_EXTENTS;
 SYSCTL_UINT (_kern, OID_AUTO, jnl_trim_flush, CTLFLAG_RW, &jnl_trim_flush_limit, 0, "number of trimmed extents to cause a journal flush");
 
-
-/* XXX next prototytype should be from libsa/stdlib.h> but conflicts libkern */
+/* XXX next prototype should be from libsa/stdlib.h> but conflicts libkern */
 __private_extern__ void qsort(
 	void * array,
 	size_t nmembers,
@@ -161,8 +162,6 @@ static int end_transaction(transaction *tr, int force_it, errno_t (*callback)(vo
 static void abort_transaction(journal *jnl, transaction *tr);
 static void dump_journal(journal *jnl);
 
-static __inline__ void lock_journal(journal *jnl);
-static __inline__ void unlock_journal(journal *jnl);
 static __inline__ void lock_oldstart(journal *jnl);
 static __inline__ void unlock_oldstart(journal *jnl);
 static __inline__ void lock_flush(journal *jnl);
@@ -247,10 +246,11 @@ static int insert_block(journal *jnl, struct bucket **buf_ptr, int blk_index, of
 // we use it to checksum the journal header and the block list
 // headers that are at the start of each transaction.
 //
-static int
+static unsigned int
 calc_checksum(char *ptr, int len)
 {
-	int i, cksum=0;
+	int i;
+	unsigned int cksum=0;
 
 	// this is a lame checksum but for now it'll do
 	for(i = 0; i < len; i++, ptr++) {
@@ -275,15 +275,20 @@ journal_init(void)
 	jnl_mutex_group  = lck_grp_alloc_init("jnl-mutex", jnl_group_attr);
 }
 
-static __inline__ void
-lock_journal(journal *jnl)
+__inline__ void
+journal_lock(journal *jnl)
 {
 	lck_mtx_lock(&jnl->jlock);
+	if (jnl->owner) {
+		panic ("jnl: owner is %p, expected NULL\n", jnl->owner);
+	}
+	jnl->owner = current_thread();
 }
 
-static __inline__ void
-unlock_journal(journal *jnl)
+__inline__ void
+journal_unlock(journal *jnl)
 {
+	jnl->owner = NULL;
 	lck_mtx_unlock(&jnl->jlock);
 }
 
@@ -333,6 +338,7 @@ do_journal_io(journal *jnl, off_t *offset, void *data, size_t len, int direction
 	size_t	io_sz = 0;
 	buf_t	bp;
 	off_t	max_iosize;
+	struct bufattr *bap;
 
 	if (*offset < 0 || *offset > jnl->jhdr->size) {
 		panic("jnl: do_jnl_io: bad offset 0x%llx (max 0x%llx)\n", *offset, jnl->jhdr->size);
@@ -368,6 +374,20 @@ again:
 		panic("jnl: request for i/o to jnl-header without JNL_HEADER flag set! (len %d, data %p)\n", curlen, data);
 	}
 
+	/*
+	 * As alluded to in the block comment at the top of the function, we use a "fake" iobuf
+	 * here and issue directly to the disk device that the journal protects since we don't
+	 * want this to enter the block cache.  As a result, we lose the ability to mark it
+	 * as a metadata buf_t for the layers below us that may care.  If we were to
+	 * simply attach the B_META flag into the b_flags this may confuse things further
+	 * since this is an iobuf, not a metadata buffer.
+	 *
+	 * To address this, we use the extended bufattr struct embedded in the bp.
+	 * Explicitly mark the buf here as a metadata buffer in its bufattr flags.
+	 */
+	bap = &bp->b_attr;
+	bap->ba_flags |= BA_META;
+
 	if (direction & JNL_READ)
 		buf_setflags(bp, B_READ);
 	else {
@@ -570,9 +590,6 @@ buffer_flushed_callback(struct buf *bp, void *arg)
 	CHECK_TRANSACTION(tr);
 
 	jnl = tr->jnl;
-	if (jnl->flags & JOURNAL_INVALID) {
-		return;
-	}
 
 	CHECK_JOURNAL(jnl);
 
@@ -613,6 +630,9 @@ buffer_flushed_callback(struct buf *bp, void *arg)
 	// cleanup for this transaction
 	tr->total_bytes = 0xfbadc0de;
 
+	if (jnl->flags & JOURNAL_INVALID)
+		goto transaction_done;
+
 	//printf("jnl: tr 0x%x (0x%llx 0x%llx) in jnl 0x%x completed.\n",
 	//   tr, tr->journal_start, tr->journal_end, jnl);
 
@@ -708,6 +728,7 @@ buffer_flushed_callback(struct buf *bp, void *arg)
 		tr->next       = jnl->tr_freeme;
 		jnl->tr_freeme = tr;
 	}
+transaction_done:
 	unlock_oldstart(jnl);
 
 	unlock_condition(jnl, &jnl->asyncIO);
@@ -1090,7 +1111,8 @@ add_block(journal *jnl, struct bucket **buf_ptr, off_t block_num, size_t size, _
 static int
 replay_journal(journal *jnl)
 {
-	int		i, orig_checksum, checksum, check_block_checksums=0, bad_blocks=0;
+	int		i, bad_blocks=0;
+	unsigned int	orig_checksum, checksum, check_block_checksums = 0;
 	size_t		ret;
 	size_t		max_bsize = 0;		/* protected by block_ptr */
 	block_list_header *blhdr;
@@ -1099,6 +1121,7 @@ replay_journal(journal *jnl)
 	struct bucket *co_buf;
 	int		num_buckets = STARTING_BUCKETS, num_full, check_past_jnl_end = 1, in_uncharted_territory=0;
 	uint32_t	last_sequence_num = 0;
+	int		replay_retry_count = 0;
 
 	// wrap the start ptr if it points to the very end of the journal
 	if (jnl->jhdr->start == jnl->jhdr->size) {
@@ -1155,7 +1178,7 @@ restart_replay:
 		if (jnl->flags & JOURNAL_NEED_SWAP) {
 			// calculate the checksum based on the unswapped data
 			// because it is done byte-at-a-time.
-			orig_checksum = SWAP32(orig_checksum);
+			orig_checksum = (unsigned int)SWAP32(orig_checksum);
 			checksum = calc_checksum((char *)blhdr, BLHDR_CHECKSUM_SIZE);
 			swap_block_list_header(jnl, blhdr);
 		} else {
@@ -1336,11 +1359,25 @@ restart_replay:
 
 bad_txn_handling:
 		if (bad_blocks) {
+			/* Journal replay got an error before it found any valid
+			 * transactions; abort replay */
 			if (txn_start_offset == 0) {
 				printf("jnl: %s: no known good txn start offset! aborting journal replay.\n", jnl->jdev_name);
 				goto bad_replay;
 			}
 
+			/* Repeated errors during journal replay; abort replay */
+			if (replay_retry_count == 3) {
+				printf("jnl: %s: repeated errors replaying journal! aborting journal replay.\n", jnl->jdev_name);
+				goto bad_replay;
+			}
+			replay_retry_count++;
+
+			/* There was an error replaying the journal (possibly
+			 * EIO/ENXIO from the device).  So retry replaying all
+			 * the good transactions that we found before getting
+			 * the error.
+			 */
 			jnl->jhdr->start = orig_jnl_start;
 			jnl->jhdr->end = txn_start_offset;
 			check_past_jnl_end = 0;
@@ -1452,7 +1489,7 @@ bad_replay:
 
 #define DEFAULT_TRANSACTION_BUFFER_SIZE  (128*1024)
-#define MAX_TRANSACTION_BUFFER_SIZE      (2048*1024)
+#define MAX_TRANSACTION_BUFFER_SIZE      (3072*1024)
 
 // XXXdbg - so I can change it in the debugger
 int def_tbuffer_size = 0;
@@ -1471,14 +1508,14 @@ size_up_tbuffer(journal *jnl, int tbuffer_size, int phys_blksz)
 	// there is in the machine.
 	//
 	if (def_tbuffer_size == 0) {
-		if (mem_size < (256*1024*1024)) {
+		if (max_mem < (256*1024*1024)) {
 			def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE;
-		} else if (mem_size < (512*1024*1024)) {
+		} else if (max_mem < (512*1024*1024)) {
 			def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE * 2;
-		} else if (mem_size < (1024*1024*1024)) {
+		} else if (max_mem < (1024*1024*1024)) {
 			def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE * 3;
 		} else {
-			def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE * (mem_size / (256*1024*1024));
+			def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE * (max_mem / (256*1024*1024));
 		}
 	}
 
@@ -1516,8 +1553,6 @@ size_up_tbuffer(journal *jnl, int tbuffer_size, int phys_blksz)
 	}
 }
 
-
-
 static void
 get_io_info(struct vnode *devvp, size_t phys_blksz, journal *jnl, struct vfs_context *context)
 {
@@ -1530,9 +1565,10 @@ get_io_info(struct vnode *devvp, size_t phys_blksz, journal *jnl, struct vfs_con
 
 	if (VNOP_IOCTL(devvp, DKIOCGETFEATURES, (caddr_t)&features, 0, context) == 0) {
 		if (features & DK_FEATURE_FORCE_UNIT_ACCESS) {
-			const char *name = vnode_name(devvp);
+			const char *name = vnode_getname_printable(devvp);
 			jnl->flags |= JOURNAL_DO_FUA_WRITES;
-			printf("jnl: %s: enabling FUA writes (features 0x%x)\n", name ? name : "no-name-dev", features);
+			printf("jnl: %s: enabling FUA writes (features 0x%x)\n", name, features);
+			vnode_putname_printable(name);
 		}
 		if (features & DK_FEATURE_UNMAP) {
 			jnl->flags |= JOURNAL_USE_UNMAP;
@@ -1600,23 +1636,6 @@ get_io_info(struct vnode *devvp, size_t phys_blksz, journal *jnl, struct vfs_con
 }
 
 
-static const char *
-get_jdev_name(struct vnode *jvp)
-{
-	const char *jdev_name;
-
-	jdev_name = vnode_name(jvp);
-	if (jdev_name == NULL) {
-		jdev_name = vfs_addname("unknown-dev", strlen("unknown-dev"), 0, 0);
-	} else {
-		// this just bumps the refcount on the name so we have our own copy
-		jdev_name = vfs_addname(jdev_name, strlen(jdev_name), 0, 0);
-	}
-
-	return jdev_name;
-}
-
-
 journal *
 journal_create(struct vnode *jvp,
 	       off_t         offset,
@@ -1626,7 +1645,8 @@ journal_create(struct vnode *jvp,
 	       int32_t       flags,
 	       int32_t       tbuffer_size,
 	       void        (*flush)(void *arg),
-	       void         *arg)
+	       void         *arg,
+	       struct mount *fsmount)
 {
 	journal		*jnl;
 	uint32_t	phys_blksz, new_txn_base;
@@ -1642,36 +1662,36 @@ journal_create(struct vnode *jvp,
 	context.vc_thread = current_thread();
 	context.vc_ucred = FSCRED;
 
-	jdev_name = get_jdev_name(jvp);
+	jdev_name = vnode_getname_printable(jvp);
 
 	/* Get the real physical block size. */
 	if (VNOP_IOCTL(jvp, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz, 0, &context)) {
-		return NULL;
+		goto cleanup_jdev_name;
 	}
 
 	if (journal_size < (256*1024) || journal_size > (MAX_JOURNAL_SIZE)) {
-		printf("jnl: create: journal size %lld looks bogus.\n", journal_size);
-		return NULL;
+		printf("jnl: %s: create: journal size %lld looks bogus.\n", jdev_name, journal_size);
+		goto cleanup_jdev_name;
 	}
 
 	min_size = phys_blksz * (phys_blksz / sizeof(block_info));
 	/* Reject journals that are too small given the sector size of the device */
 	if (journal_size < min_size) {
-		printf("jnl: create: journal size (%lld) too small given sector size of (%u)\n",
-		       journal_size, phys_blksz);
-		return NULL;
+		printf("jnl: %s: create: journal size (%lld) too small given sector size of (%u)\n",
+		       jdev_name, journal_size, phys_blksz);
+		goto cleanup_jdev_name;
 	}
 
 	if (phys_blksz > min_fs_blksz) {
 		printf("jnl: %s: create: error: phys blksize %u bigger than min fs blksize %zd\n",
 		       jdev_name, phys_blksz, min_fs_blksz);
-		return NULL;
+		goto cleanup_jdev_name;
 	}
 
 	if ((journal_size % phys_blksz) != 0) {
 		printf("jnl: %s: create: journal size 0x%llx is not an even multiple of block size 0x%ux\n",
 		       jdev_name, journal_size, phys_blksz);
-		return NULL;
+		goto cleanup_jdev_name;
 	}
 
 
@@ -1687,6 +1707,12 @@ journal_create(struct vnode *jvp,
 	jnl->jdev_name   = jdev_name;
 	lck_mtx_init(&jnl->old_start_lock, jnl_mutex_group, jnl_lock_attr);
 
+	// Keep a pointer to the mount around for use in IO throttling.
+	jnl->fsmount = fsmount;
+	// XXX: This lock discipline looks correct based on dounmount(), but it
+	// doesn't seem to be documented anywhere.
+	mount_ref(fsmount, 0);
+
 	get_io_info(jvp, phys_blksz, jnl, &context);
 
 	if (kmem_alloc_kobject(kernel_map, (vm_offset_t *)&jnl->header_buf, phys_blksz)) {
@@ -1722,7 +1748,7 @@ journal_create(struct vnode *jvp,
 	    && jnl->jhdr->sequence_num != 0) {
 		new_txn_base = (jnl->jhdr->sequence_num + (journal_size / phys_blksz) + (random() % 16384)) & 0x00ffffff;
-		printf("jnl: create: avoiding old sequence number 0x%x (0x%x)\n", jnl->jhdr->sequence_num, new_txn_base);
+		printf("jnl: %s: create: avoiding old sequence number 0x%x (0x%x)\n", jdev_name, jnl->jhdr->sequence_num, new_txn_base);
 
 #if 0
 		int i;
@@ -1763,7 +1789,8 @@ journal_create(struct vnode *jvp,
 	lck_mtx_init(&jnl->jlock, jnl_mutex_group, jnl_lock_attr);
 	lck_mtx_init(&jnl->flock, jnl_mutex_group, jnl_lock_attr);
 	lck_rw_init(&jnl->trim_lock, jnl_mutex_group, jnl_lock_attr);
-	
+
+
 	jnl->flushing = FALSE;
 	jnl->asyncIO = FALSE;
 	jnl->flush_aborted = FALSE;
@@ -1776,19 +1803,20 @@ journal_create(struct vnode *jvp,
 		goto bad_write;
 	}
 
-	return jnl;
+	goto journal_create_complete;
 
 
 bad_write:
 	kmem_free(kernel_map, (vm_offset_t)jnl->header_buf, phys_blksz);
 bad_kmem_alloc:
-	if (jdev_name) {
-		vfs_removename(jdev_name);
-	}
 	jnl->jhdr = NULL;
 	FREE_ZONE(jnl, sizeof(struct journal), M_JNL_JNL);
-
-	return NULL;
+	mount_drop(fsmount, 0);
+cleanup_jdev_name:
+	vnode_putname_printable(jdev_name);
+	jnl = NULL;
+journal_create_complete:
+	return jnl;
 }
 
 
@@ -1801,7 +1829,8 @@ journal_open(struct vnode *jvp,
 	     int32_t       flags,
 	     int32_t       tbuffer_size,
 	     void        (*flush)(void *arg),
-	     void         *arg)
+	     void         *arg,
+	     struct mount *fsmount)
 {
 	journal		*jnl;
 	uint32_t	orig_blksz=0;
@@ -1809,39 +1838,39 @@ journal_open(struct vnode *jvp,
 	u_int32_t	min_size = 0;
 	int		orig_checksum, checksum;
 	struct vfs_context context;
-	const char	*jdev_name = get_jdev_name(jvp);
+	const char	*jdev_name = vnode_getname_printable(jvp);
 
 	context.vc_thread = current_thread();
 	context.vc_ucred = FSCRED;
 
 	/* Get the real physical block size. */
 	if (VNOP_IOCTL(jvp, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz, 0, &context)) {
-		return NULL;
+		goto cleanup_jdev_name;
 	}
 
 	if (phys_blksz > min_fs_blksz) {
 		printf("jnl: %s: open: error: phys blksize %u bigger than min fs blksize %zd\n",
 		       jdev_name, phys_blksz, min_fs_blksz);
-		return NULL;
+		goto cleanup_jdev_name;
 	}
 
 	if (journal_size < (256*1024) || journal_size > (1024*1024*1024)) {
-		printf("jnl: open: journal size %lld looks bogus.\n", journal_size);
-		return NULL;
+		printf("jnl: %s: open: journal size %lld looks bogus.\n", jdev_name, journal_size);
+		goto cleanup_jdev_name;
 	}
 
 	min_size = phys_blksz * (phys_blksz / sizeof(block_info));
 	/* Reject journals that are too small given the sector size of the device */
 	if (journal_size < min_size) {
-		printf("jnl: open: journal size (%lld) too small given sector size of (%u)\n",
-		       journal_size, phys_blksz);
-		return NULL;
+		printf("jnl: %s: open: journal size (%lld) too small given sector size of (%u)\n",
+		       jdev_name, journal_size, phys_blksz);
+		goto cleanup_jdev_name;
	}
 
 	if ((journal_size % phys_blksz) != 0) {
 		printf("jnl: %s: open: journal size 0x%llx is not an even multiple of block size 0x%x\n",
 		       jdev_name, journal_size, phys_blksz);
-		return NULL;
+		goto cleanup_jdev_name;
 	}
 
 	MALLOC_ZONE(jnl, struct journal *, sizeof(struct journal), M_JNL_JNL, M_WAITOK);
@@ -1856,6 +1885,12 @@ journal_open(struct vnode *jvp,
 	jnl->jdev_name   = jdev_name;
 	lck_mtx_init(&jnl->old_start_lock, jnl_mutex_group, jnl_lock_attr);
 
+	/* We need a reference to the mount to later pass to the throttling code for
+	 * IO accounting.
+	 */
+	jnl->fsmount = fsmount;
+	mount_ref(fsmount, 0);
+
 	get_io_info(jvp, phys_blksz, jnl, &context);
 
 	if (kmem_alloc_kobject(kernel_map, (vm_offset_t *)&jnl->header_buf, phys_blksz)) {
@@ -1911,26 +1946,24 @@ journal_open(struct vnode *jvp,
 		jnl->jhdr->magic = JOURNAL_HEADER_MAGIC;
 	}
 
-	if (phys_blksz != (size_t)jnl->jhdr->jhdr_size && jnl->jhdr->jhdr_size != 0) {
-		/*
-		 * The volume has probably been resized (such that we had to adjust the
-		 * logical sector size), or copied to media with a different logical
-		 * sector size.
-		 *
-		 * Temporarily change the device's logical block size to match the
-		 * journal's header size.  This will allow us to replay the journal
-		 * safely.  If the replay succeeds, we will update the journal's header
-		 * size (later in this function).
-		 */
-
-		orig_blksz = phys_blksz;
-		phys_blksz = jnl->jhdr->jhdr_size;
-		VNOP_IOCTL(jvp, DKIOCSETBLOCKSIZE, (caddr_t)&phys_blksz, FWRITE, &context);
+	if (phys_blksz != (size_t)jnl->jhdr->jhdr_size && jnl->jhdr->jhdr_size != 0) {
+		/*
+		 * The volume has probably been resized (such that we had to adjust the
+		 * logical sector size), or copied to media with a different logical
+		 * sector size.
+		 *
+		 * Temporarily change the device's logical block size to match the
+		 * journal's header size.  This will allow us to replay the journal
+		 * safely.  If the replay succeeds, we will update the journal's header
+		 * size (later in this function).
+		 */
+		orig_blksz = phys_blksz;
+		phys_blksz = jnl->jhdr->jhdr_size;
+		VNOP_IOCTL(jvp, DKIOCSETBLOCKSIZE, (caddr_t)&phys_blksz, FWRITE, &context);
+		printf("jnl: %s: open: temporarily switched block size from %u to %u\n",
+		       jdev_name, orig_blksz, phys_blksz);
+	}
 
-		printf("jnl: %s: open: temporarily switched block size from %u to %u\n",
-		       jdev_name, orig_blksz, phys_blksz);
-	}
-
 	if (   jnl->jhdr->start <= 0
 	    || jnl->jhdr->start > jnl->jhdr->size
 	    || jnl->jhdr->start > 1024*1024*1024) {
@@ -1980,68 +2013,73 @@ journal_open(struct vnode *jvp,
 		printf("jnl: %s: journal_open: Error replaying the journal!\n", jdev_name);
 		goto bad_journal;
 	}
-
-	/*
-	 * When we get here, we know that the journal is empty (jnl->jhdr->start ==
-	 * jnl->jhdr->end).  If the device's logical block size was different from
-	 * the journal's header size, then we can now restore the device's logical
-	 * block size and update the journal's header size to match.
-	 *
-	 * Note that we also adjust the journal's start and end so that they will
-	 * be aligned on the new block size.  We pick a new sequence number to
-	 * avoid any problems if a replay found previous transactions using the old
-	 * journal header size.  (See the comments in journal_create(), above.)
-	 */
-	if (orig_blksz != 0) {
-		VNOP_IOCTL(jvp, DKIOCSETBLOCKSIZE, (caddr_t)&orig_blksz, FWRITE, &context);
-		phys_blksz = orig_blksz;
-		orig_blksz = 0;
-		printf("jnl: %s: open: restored block size to %u\n", jdev_name, phys_blksz);
-		jnl->jhdr->jhdr_size = phys_blksz;
-		jnl->jhdr->start = phys_blksz;
-		jnl->jhdr->end = phys_blksz;
-		jnl->jhdr->sequence_num = (jnl->jhdr->sequence_num +
-					   (journal_size / phys_blksz) +
-					   (random() % 16384)) & 0x00ffffff;
+	/*
+	 * When we get here, we know that the journal is empty (jnl->jhdr->start ==
+	 * jnl->jhdr->end).  If the device's logical block size was different from
+	 * the journal's header size, then we can now restore the device's logical
+	 * block size and update the journal's header size to match.
+	 *
+	 * Note that we also adjust the journal's start and end so that they will
+	 * be aligned on the new block size.  We pick a new sequence number to
+	 * avoid any problems if a replay found previous transactions using the old
+	 * journal header size.  (See the comments in journal_create(), above.)
+	 */
 
-		if (write_journal_header(jnl, 1, jnl->jhdr->sequence_num)) {
-			printf("jnl: %s: open: failed to update journal header size\n", jdev_name);
+	if (orig_blksz != 0) {
+		VNOP_IOCTL(jvp, DKIOCSETBLOCKSIZE, (caddr_t)&orig_blksz, FWRITE, &context);
+		phys_blksz = orig_blksz;
+
+		orig_blksz = 0;
+
+		jnl->jhdr->jhdr_size = phys_blksz;
+		jnl->jhdr->start = phys_blksz;
+		jnl->jhdr->end = phys_blksz;
+		jnl->jhdr->sequence_num = (jnl->jhdr->sequence_num +
+					   (journal_size / phys_blksz) +
+					   (random() % 16384)) & 0x00ffffff;
+
+		if (write_journal_header(jnl, 1, jnl->jhdr->sequence_num)) {
+			printf("jnl: %s: open: failed to update journal header size\n", jdev_name);
+			goto bad_journal;
+		}
+	}
+
+	// make sure this is in sync!
+	jnl->active_start = jnl->jhdr->start;
+	jnl->sequence_num = jnl->jhdr->sequence_num;
+
+	// set this now, after we've replayed the journal
+	size_up_tbuffer(jnl, tbuffer_size, phys_blksz);
+
+	// TODO: Does this need to change if the device's logical block size changed?
+	if ((off_t)(jnl->jhdr->blhdr_size/sizeof(block_info)-1) > (jnl->jhdr->size/jnl->jhdr->jhdr_size)) {
+		printf("jnl: %s: open: jhdr size and blhdr size are not compatible (0x%llx, %d, %d)\n", jdev_name, jnl->jhdr->size,
+		       jnl->jhdr->blhdr_size, jnl->jhdr->jhdr_size);
 		goto bad_journal;
 	}
-	}
-
-	// make sure this is in sync!
-	jnl->active_start = jnl->jhdr->start;
-	jnl->sequence_num = jnl->jhdr->sequence_num;
-
-	// set this now, after we've replayed the journal
-	size_up_tbuffer(jnl, tbuffer_size, phys_blksz);
-
-	// TODO: Does this need to change if the device's logical block size changed?
-	if ((off_t)(jnl->jhdr->blhdr_size/sizeof(block_info)-1) > (jnl->jhdr->size/jnl->jhdr->jhdr_size)) {
-		printf("jnl: %s: open: jhdr size and blhdr size are not compatible (0x%llx, %d, %d)\n", jdev_name, jnl->jhdr->size,
-		       jnl->jhdr->blhdr_size, jnl->jhdr->jhdr_size);
-		goto bad_journal;
-	}
-
-	lck_mtx_init(&jnl->jlock, jnl_mutex_group, jnl_lock_attr);
-
-	return jnl;
-
-bad_journal:
-	if (orig_blksz != 0) {
-		phys_blksz = orig_blksz;
-		VNOP_IOCTL(jvp, DKIOCSETBLOCKSIZE, (caddr_t)&orig_blksz, FWRITE, &context);
-		printf("jnl: %s: open: restored block size to %u after error\n", jdev_name, orig_blksz);
-	}
-	kmem_free(kernel_map, (vm_offset_t)jnl->header_buf, phys_blksz);
-bad_kmem_alloc:
-	if (jdev_name) {
-		vfs_removename(jdev_name);
-	}
-	FREE_ZONE(jnl, sizeof(struct journal), M_JNL_JNL);
-	return NULL;
+
+	lck_mtx_init(&jnl->jlock, jnl_mutex_group, jnl_lock_attr);
+	lck_mtx_init(&jnl->flock, jnl_mutex_group, jnl_lock_attr);
+	lck_rw_init(&jnl->trim_lock, jnl_mutex_group, jnl_lock_attr);
+
+	goto journal_open_complete;
+
+bad_journal:
+	if (orig_blksz != 0) {
+		phys_blksz = orig_blksz;
+		VNOP_IOCTL(jvp, DKIOCSETBLOCKSIZE, (caddr_t)&orig_blksz, FWRITE, &context);
+		printf("jnl: %s: open: restored block size after error\n", jdev_name);
	}
+	kmem_free(kernel_map, (vm_offset_t)jnl->header_buf, phys_blksz);
+bad_kmem_alloc:
+	FREE_ZONE(jnl, sizeof(struct journal), M_JNL_JNL);
+	mount_drop(fsmount, 0);
+cleanup_jdev_name:
+	vnode_putname_printable(jdev_name);
+	jnl = NULL;
+journal_open_complete:
+	return jnl;
 }
 
 
@@ -2057,7 +2095,7 @@ journal_is_clean(struct vnode *jvp,
 	int	ret;
 	int	orig_checksum, checksum;
 	struct vfs_context context;
-	const char *jdev_name = get_jdev_name(jvp);
+	const char *jdev_name = vnode_getname_printable(jvp);
 
 	context.vc_thread = current_thread();
 	context.vc_ucred = FSCRED;
@@ -2065,31 +2103,36 @@ journal_is_clean(struct vnode *jvp,
 	/* Get the real physical block size. */
 	if (VNOP_IOCTL(jvp, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz, 0, &context)) {
 		printf("jnl: %s: is_clean: failed to get device block size.\n", jdev_name);
-		return EINVAL;
+		ret = EINVAL;
+		goto cleanup_jdev_name;
 	}
 
 	if (phys_blksz > (uint32_t)min_fs_block_size) {
 		printf("jnl: %s: is_clean: error: phys blksize %d bigger than min fs blksize %zd\n",
 		       jdev_name, phys_blksz, min_fs_block_size);
-		return EINVAL;
+		ret = EINVAL;
+		goto cleanup_jdev_name;
 	}
 
 	if (journal_size < (256*1024) || journal_size > (MAX_JOURNAL_SIZE)) {
-		printf("jnl: is_clean: journal size %lld looks bogus.\n", journal_size);
-		return EINVAL;
+		printf("jnl: %s: is_clean: journal size %lld looks bogus.\n", jdev_name, journal_size);
+		ret = EINVAL;
+		goto cleanup_jdev_name;
 	}
 
 	if ((journal_size % phys_blksz) != 0) {
 		printf("jnl: %s: is_clean: journal size 0x%llx is not an even multiple of block size 0x%x\n",
 		       jdev_name, journal_size, phys_blksz);
-		return EINVAL;
+		ret = EINVAL;
+		goto cleanup_jdev_name;
 	}
 
 	memset(&jnl, 0, sizeof(jnl));
 
 	if (kmem_alloc_kobject(kernel_map, (vm_offset_t *)&jnl.header_buf, phys_blksz)) {
 		printf("jnl: %s: is_clean: could not allocate space for header buffer (%d bytes)\n", jdev_name, phys_blksz);
-		return ENOMEM;
+		ret = ENOMEM;
+		goto cleanup_jdev_name;
 	}
 	jnl.header_buf_size = phys_blksz;
 
@@ -2150,12 +2193,9 @@ journal_is_clean(struct vnode *jvp,
 
 get_out:
 	kmem_free(kernel_map, (vm_offset_t)jnl.header_buf, phys_blksz);
-	if (jdev_name) {
-		vfs_removename(jdev_name);
-	}
-
-	return ret;
-
+cleanup_jdev_name:
+	vnode_putname_printable(jdev_name);
+	return ret;
 }
 
 
@@ -2173,7 +2213,7 @@ journal_close(journal *jnl)
 	jnl->flags |= JOURNAL_CLOSE_PENDING;
 
 	if (jnl->owner != current_thread()) {
-		lock_journal(jnl);
+		journal_lock(jnl);
 	}
 
 	wait_condition(jnl, &jnl->flushing, "journal_close");
@@ -2251,16 +2291,23 @@ journal_close(journal *jnl)
 			}
 		}
 	}
+	wait_condition(jnl, &jnl->asyncIO, "journal_close");
 
 	free_old_stuff(jnl);
 
 	kmem_free(kernel_map, (vm_offset_t)jnl->header_buf, jnl->header_buf_size);
 	jnl->jhdr = (void *)0xbeefbabe;
 
-	if (jnl->jdev_name) {
-		vfs_removename(jnl->jdev_name);
-	}
+	// Release reference on the mount
+	if (jnl->fsmount)
+		mount_drop(jnl->fsmount, 0);
+
+	vnode_putname_printable(jnl->jdev_name);
 
+	journal_unlock(jnl);
+	lck_mtx_destroy(&jnl->old_start_lock, jnl_mutex_group);
+	lck_mtx_destroy(&jnl->jlock, jnl_mutex_group);
+	lck_mtx_destroy(&jnl->flock, jnl_mutex_group);
 	FREE_ZONE(jnl, sizeof(struct journal), M_JNL_JNL);
 }
 
@@ -2351,7 +2398,7 @@ check_free_space(journal *jnl, int desired_size, boolean_t *delayed_header_write
 
 			lcl_counter = 0;
 			while (jnl->old_start[i] & 0x8000000000000000LL) {
-				if (lcl_counter++ > 1000) {
+				if (lcl_counter++ > 10000) {
 					panic("jnl: check_free_space: tr starting @ 0x%llx not flushing (jnl %p).\n", jnl->old_start[i], jnl);
 				}
@@ -2431,7 +2478,19 @@ static errno_t
 journal_allocate_transaction(journal *jnl)
 {
 	transaction *tr;
+	boolean_t was_vm_privileged;
 
+	if (jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT) {
+		/*
+		 * the disk driver can allocate memory on this path...
+		 * if we block waiting for memory, and there is enough pressure to
+		 * cause us to try and create a new swap file, we may end up deadlocking
+		 * due to waiting for the journal on the swap file creation path...
+		 * by making ourselves vm_privileged, we give ourselves the best chance
+		 * of not blocking
+		 */
+		was_vm_privileged = set_vm_privilege(TRUE);
+	}
 	MALLOC_ZONE(tr, transaction *, sizeof(transaction), M_JNL_TR, M_WAITOK);
 	memset(tr, 0, sizeof(transaction));
 
@@ -2442,6 +2501,8 @@ journal_allocate_transaction(journal *jnl)
 		jnl->active_tr = NULL;
 		return ENOMEM;
 	}
+	if ((jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT) && (was_vm_privileged == FALSE))
+		set_vm_privilege(FALSE);
 
 	// journal replay code checksum check depends on this.
 	memset(tr->tbuffer, 0, BLHDR_CHECKSUM_SIZE);
@@ -2484,14 +2545,14 @@ journal_start_transaction(journal *jnl)
 		jnl->nested_count++;
 		return 0;
 	}
-	lock_journal(jnl);
 
-	if (jnl->owner != NULL || jnl->nested_count != 0 || jnl->active_tr != NULL) {
+	journal_lock(jnl);
+
+	if (jnl->nested_count != 0 || jnl->active_tr != NULL) {
 		panic("jnl: start_tr: owner %p, nested count %d, active_tr %p jnl @ %p\n",
 		      jnl->owner, jnl->nested_count, jnl->active_tr, jnl);
 	}
 
-	jnl->owner = current_thread();
 	jnl->nested_count = 1;
 
 #if JOE
@@ -2529,9 +2590,8 @@ journal_start_transaction(journal *jnl)
 	return 0;
 
 bad_start:
-	jnl->owner = NULL;
 	jnl->nested_count = 0;
-	unlock_journal(jnl);
+	journal_unlock(jnl);
 
 	return ret;
 }
@@ -2829,6 +2889,8 @@ journal_modify_block_end(journal *jnl, struct buf *bp, void (*func)(buf_t bp, vo
 		blhdr->binfo[i].bnum = (off_t)(buf_blkno(bp));
 		blhdr->binfo[i].u.bp = bp;
 
+		KERNEL_DEBUG_CONSTANT(0x3018004, VM_KERNEL_ADDRPERM(vp), blhdr->binfo[i].bnum, bsize, 0, 0);
+
 		if (func) {
 			void (*old_func)(buf_t, void *)=NULL, *old_arg=NULL;
 
@@ -2922,7 +2984,6 @@ journal_kill_block(journal *jnl, struct buf *bp)
 
 	return 0;
 }
-
 
 /*
 ;________________________________________________________________________________
 ;
@@ -2976,16 +3037,31 @@ journal_trim_set_callback(journal *jnl, jnl_trim_callback_t callback, void *arg)
 ;________________________________________________________________________________
 */
 static int
-trim_realloc(struct jnl_trim_list *trim)
+trim_realloc(journal *jnl, struct jnl_trim_list *trim)
 {
 	void *new_extents;
 	uint32_t new_allocated_count;
+	boolean_t was_vm_privileged;
 
 	if (jnl_kdebug)
-		KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_REALLOC | DBG_FUNC_START, trim, 0, trim->allocated_count, trim->extent_count, 0);
+		KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_REALLOC | DBG_FUNC_START, VM_KERNEL_ADDRPERM(trim), 0, trim->allocated_count, trim->extent_count, 0);
 
 	new_allocated_count = trim->allocated_count + JOURNAL_DEFAULT_TRIM_EXTENTS;
+
+	if (jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT) {
+		/*
+		 * if we block waiting for memory, and there is enough pressure to
+		 * cause us to try and create a new swap file, we may end up deadlocking
+		 * due to waiting for the journal on the swap file creation path...
+		 * by making ourselves vm_privileged, we give ourselves the best chance
+		 * of not blocking
+		 */
+		was_vm_privileged = set_vm_privilege(TRUE);
+	}
 	new_extents = kalloc(new_allocated_count * sizeof(dk_extent_t));
+	if ((jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT) && (was_vm_privileged == FALSE))
+		set_vm_privilege(FALSE);
+
 	if (new_extents == NULL) {
 		printf("jnl: trim_realloc: unable to grow extent list!\n");
 		/*
@@ -3016,44 +3092,53 @@ trim_realloc(struct jnl_trim_list *trim)
 	return 0;
 }
 
-
 /*
-;________________________________________________________________________________
-;
-; Routine:	trim_search_extent
-;
-; Function:	Search the given extent list to see if any of its extents
-;		overlap the given extent.
-;
-; Input Arguments:
-;	trim		- The trim list to be searched.
-;	offset		- The first byte of the range to be searched for.
-;	length		- The number of bytes of the extent being searched for.
-;
-; Output:
-;	(result)	- TRUE if one or more extents overlap, FALSE otherwise.
-;________________________________________________________________________________
-*/
+ ;________________________________________________________________________________
+ ;
+ ; Routine:	trim_search_extent
+ ;
+ ; Function:	Search the given extent list to see if any of its extents
+ ;		overlap the given extent.
+ ;
+ ; Input Arguments:
+ ;	trim		- The trim list to be searched.
+ ;	offset		- The first byte of the range to be searched for.
+ ;	length		- The number of bytes of the extent being searched for.
+ ;	overlap_start	- start of the overlapping extent
+ ;	overlap_len	- length of the overlapping extent
+ ;
+ ; Output:
+ ;	(result)	- TRUE if one or more extents overlap, FALSE otherwise.
+ ;________________________________________________________________________________
+ */
 static int
-trim_search_extent(struct jnl_trim_list *trim, uint64_t offset, uint64_t length)
+trim_search_extent(struct jnl_trim_list *trim, uint64_t offset,
+		   uint64_t length, uint64_t *overlap_start, uint64_t *overlap_len)
 {
 	uint64_t end = offset + length;
 	uint32_t lower = 0;			/* Lowest index to search */
 	uint32_t upper = trim->extent_count;	/* Highest index to search + 1 */
 	uint32_t middle;
-	
+
 	/* A binary search over the extent list. */
 	while (lower < upper) {
 		middle = (lower + upper) / 2;
-		
+
 		if (trim->extents[middle].offset >= end)
 			upper = middle;
 		else if (trim->extents[middle].offset + trim->extents[middle].length <= offset)
 			lower = middle + 1;
-		else
+		else {
+			if (overlap_start) {
+				*overlap_start = trim->extents[middle].offset;
+			}
+			if (overlap_len) {
+				*overlap_len = trim->extents[middle].length;
+			}
 			return TRUE;
+		}
 	}
-	
+
 	return FALSE;
 }
@@ -3092,7 +3177,7 @@ journal_trim_add_extent(journal *jnl, uint64_t offset, uint64_t length)
 	dk_extent_t *extent;
 	uint32_t insert_index;
 	uint32_t replace_count;
-	
+
 	CHECK_JOURNAL(jnl);
 
 	/* TODO: Is it OK to manipulate the trim list even if JOURNAL_INVALID is set?  I think so... */
@@ -3104,7 +3189,7 @@ journal_trim_add_extent(journal *jnl, uint64_t offset, uint64_t length)
 	CHECK_TRANSACTION(tr);
 
 	if (jnl_kdebug)
-		KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_ADD | DBG_FUNC_START, jnl, offset, length, tr->trim.extent_count, 0);
+		KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_ADD | DBG_FUNC_START, VM_KERNEL_ADDRPERM(jnl), offset, length, tr->trim.extent_count, 0);
 
 	if (jnl->owner != current_thread()) {
 		panic("jnl: trim_add_extent: called w/out a transaction! jnl %p, owner %p, curact %p\n",
@@ -3112,9 +3197,9 @@ journal_trim_add_extent(journal *jnl, uint64_t offset, uint64_t length)
 	}
 
 	free_old_stuff(jnl);
-	
+
 	end = offset + length;
-	
+
 	/*
 	 * Find the range of existing extents that can be combined with the
 	 * input extent.  We start by counting the number of extents that end
@@ -3132,7 +3217,7 @@ journal_trim_add_extent(journal *jnl, uint64_t offset, uint64_t length)
 		++replace_count;
 		++extent;
 	}
-	
+
 	/*
	 * If none of the existing extents can be combined with the input extent,
	 * then just insert it in the list (before item number insert_index).
@@ -3140,7 +3225,7 @@ journal_trim_add_extent(journal *jnl, uint64_t offset, uint64_t length)
 	if (replace_count == 0) {
 		/* If the list was already full, we need to grow it. */
 		if (tr->trim.extent_count == tr->trim.allocated_count) {
-			if (trim_realloc(&tr->trim) != 0) {
+			if (trim_realloc(jnl, &tr->trim) != 0) {
 				printf("jnl: trim_add_extent: out of memory!");
 				if (jnl_kdebug)
 					KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_ADD | DBG_FUNC_END, ENOMEM, 0, 0, tr->trim.extent_count, 0);
@@ -3198,6 +3283,92 @@ journal_trim_add_extent(journal *jnl, uint64_t offset, uint64_t length)
 	return 0;
 }
 
+/*
+ * journal_trim_extent_overlap
+ *
+ * Return 1 if there are any pending TRIMs that overlap with the given offset and length
+ * Return 0 otherwise.
+ */
+
+int journal_trim_extent_overlap (journal *jnl, uint64_t offset, uint64_t length, uint64_t *end) {
+	transaction *tr = NULL;
+	int overlap = 0;
+
+	uint64_t overlap_start;
+	uint64_t overlap_len;
+	tr = jnl->active_tr;
+	CHECK_TRANSACTION(tr);
+
+	/*
+	 * There are two lists that need to be examined for potential overlaps:
+	 *
+	 * The first is the current transaction. Since this function requires that
+	 * a transaction be active when this is called, this is the "active_tr"
+	 * pointer in the journal struct.  This has a trimlist pointer which needs
+	 * to be searched.
+	 */
+	overlap = trim_search_extent (&tr->trim, offset, length, &overlap_start, &overlap_len);
+	if (overlap == 0) {
+		/*
+		 * The second is the async trim list, which is only done if the current
+		 * transaction group (active transaction) did not overlap with our target
+		 * extent. This async trim list is the set of all previously
+		 * committed transaction groups whose I/Os are now in-flight. We need to hold the
+		 * trim lock in order to search this list. If we grab the list before the
+		 * TRIM has completed, then we will compare it. If it is grabbed AFTER the
+		 * TRIM has completed, then the pointer will be zeroed out and we won't have
+		 * to check anything.
+		 */
+		lck_rw_lock_shared (&jnl->trim_lock);
+		if (jnl->async_trim != NULL) {
+			overlap = trim_search_extent(jnl->async_trim, offset, length, &overlap_start, &overlap_len);
+		}
+		lck_rw_unlock_shared (&jnl->trim_lock);
+	}
+
+	if (overlap) {
+		/* compute the end (min) of the overlapping range */
+		if ( (overlap_start + overlap_len) < (offset + length)) {
+			*end = (overlap_start + overlap_len);
+		}
+		else {
+			*end = (offset + length);
+		}
+	}
+
+
+	return overlap;
+}
+
+/*
+ * journal_request_immediate_flush
+ *
+ * FS requests that the journal flush immediately upon the
+ * active transaction's completion.
+ *
+ * Returns 0 if operation succeeds
+ * Returns EPERM if we failed to leave hint
+ */
+int
+journal_request_immediate_flush (journal *jnl) {
+
+	transaction *tr = NULL;
+	/*
+	 * Is a transaction still in process? You must do
+	 * this while there are txns open
+	 */
+	tr = jnl->active_tr;
+	if (tr != NULL) {
+		CHECK_TRANSACTION(tr);
+		tr->flush_on_completion = TRUE;
+	}
+	else {
+		return EPERM;
+	}
+	return 0;
+}
+
+
 
 /*
 ;________________________________________________________________________________
@@ -3222,7 +3393,7 @@ journal_trim_add_extent(journal *jnl, uint64_t offset, uint64_t length)
 ;________________________________________________________________________________
 */
 static int
-trim_remove_extent(struct jnl_trim_list *trim, uint64_t offset, uint64_t length)
+trim_remove_extent(journal *jnl, struct jnl_trim_list *trim, uint64_t offset, uint64_t length)
 {
 	u_int64_t	end;
 	dk_extent_t	*extent;
@@ -3269,7 +3440,7 @@ trim_remove_extent(struct jnl_trim_list *trim, uint64_t offset, uint64_t length)
 	if (keep_before > keep_after) {
 		/* If the list was already full, we need to grow it. */
 		if (trim->extent_count == trim->allocated_count) {
-			if (trim_realloc(trim) != 0) {
+			if (trim_realloc(jnl, trim) != 0) {
 				printf("jnl: trim_remove_extent: out of memory!");
 				return ENOMEM;
 			}
@@ -3331,24 +3502,23 @@ trim_remove_extent(struct jnl_trim_list *trim, uint64_t offset, uint64_t length)
 	return 0;
 }
 
-
 /*
-;________________________________________________________________________________
-;
-; Routine:	journal_trim_remove_extent
-;
-; Function:	Make note of a range of bytes, some of which may have previously
-;		been passed to journal_trim_add_extent, is now in use on the
-;		volume.  The given bytes will be not be trimmed as part of
-;		this transaction, or a pending trim of a transaction being
-;		asynchronously flushed.
-;
-; Input Arguments:
-;	jnl	- The journal for the volume containing the byte range.
-;	offset	- The first byte of the range to be trimmed.
-;	length	- The number of bytes of the extent being trimmed.
-;________________________________________________________________________________
-*/
+ ;________________________________________________________________________________
+ ;
+ ; Routine:	journal_trim_remove_extent
+ ;
+ ; Function:	Make note that a range of bytes, some of which may have previously
+ ;		been passed to journal_trim_add_extent, is now in use on the
+ ;		volume.  The given bytes will not be trimmed as part of
+ ;		this transaction, or a pending trim of a transaction being
+ ;		asynchronously flushed.
+ ;
+ ; Input Arguments:
+ ;	jnl	- The journal for the volume containing the byte range.
+ ;	offset	- The first byte of the range to be trimmed.
+ ;	length	- The number of bytes of the extent being trimmed.
+ ;________________________________________________________________________________
+ */
 __private_extern__ int
 journal_trim_remove_extent(journal *jnl, uint64_t offset, uint64_t length)
 {
@@ -3366,7 +3536,7 @@ journal_trim_remove_extent(journal *jnl, uint64_t offset, uint64_t length)
 	CHECK_TRANSACTION(tr);
 
 	if (jnl_kdebug)
-		KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_REMOVE | DBG_FUNC_START, jnl, offset, length, tr->trim.extent_count, 0);
+		KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_REMOVE | DBG_FUNC_START, VM_KERNEL_ADDRPERM(jnl), offset, length, tr->trim.extent_count, 0);
 
 	if (jnl->owner != current_thread()) {
 		panic("jnl: trim_remove_extent: called w/out a transaction! jnl %p, owner %p, curact %p\n",
@@ -3374,8 +3544,8 @@ journal_trim_remove_extent(journal *jnl, uint64_t offset, uint64_t length)
 	}
 
 	free_old_stuff(jnl);
-	
-	error = trim_remove_extent(&tr->trim, offset, length);
+
+	error = trim_remove_extent(jnl, &tr->trim, offset, length);
 	if (error == 0) {
 		int found = FALSE;
 
@@ -3385,7 +3555,7 @@ journal_trim_remove_extent(journal *jnl, uint64_t offset, uint64_t length)
 		 */
 		lck_rw_lock_shared(&jnl->trim_lock);
 		if (jnl->async_trim != NULL)
-			found = trim_search_extent(jnl->async_trim, offset, length);
+			found = trim_search_extent(jnl->async_trim, offset, length, NULL, NULL);
 		lck_rw_unlock_shared(&jnl->trim_lock);
 
 		if (found) {
@@ -3398,10 +3568,10 @@ journal_trim_remove_extent(journal *jnl, uint64_t offset, uint64_t length)
 			uint32_t async_extent_count = 0;
 
 			if (jnl_kdebug)
-				KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_REMOVE_PENDING | DBG_FUNC_START, jnl, offset, length, 0, 0);
+				KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_REMOVE_PENDING | DBG_FUNC_START, VM_KERNEL_ADDRPERM(jnl), offset, length, 0, 0);
 			lck_rw_lock_exclusive(&jnl->trim_lock);
 			if (jnl->async_trim != NULL) {
-				error = trim_remove_extent(jnl->async_trim, offset, length);
+				error = trim_remove_extent(jnl, jnl->async_trim, offset, length);
 				async_extent_count = jnl->async_trim->extent_count;
 			}
 			lck_rw_unlock_exclusive(&jnl->trim_lock);
@@ -3420,31 +3590,37 @@ static int
 journal_trim_flush(journal *jnl, transaction *tr)
 {
 	int errno = 0;
+	boolean_t was_vm_privileged;
 
 	if (jnl_kdebug)
-		KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_FLUSH | DBG_FUNC_START, jnl, tr, 0, tr->trim.extent_count, 0);
+		KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_FLUSH | DBG_FUNC_START, VM_KERNEL_ADDRPERM(jnl), tr, 0, tr->trim.extent_count, 0);
 
+	if (jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT) {
+		/*
+		 * the disk driver can allocate memory on this path...
+		 * if we block waiting for memory, and there is enough pressure to
+		 * cause us to try and create a new swap file, we may end up deadlocking
+		 * due to waiting for the journal on the swap file creation path...
+		 * by making ourselves vm_privileged, we give ourselves the best chance
+		 * of not blocking
+		 */
+		was_vm_privileged = set_vm_privilege(TRUE);
+	}
+
+	lck_rw_lock_shared(&jnl->trim_lock);
 	if (tr->trim.extent_count > 0) {
 		dk_unmap_t unmap;
 
 		bzero(&unmap, sizeof(unmap));
-		lck_rw_lock_shared(&jnl->trim_lock);
 		if (CONFIG_HFS_TRIM && (jnl->flags & JOURNAL_USE_UNMAP)) {
 			unmap.extents = tr->trim.extents;
 			unmap.extentsCount = tr->trim.extent_count;
 			if (jnl_kdebug)
-				KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_UNMAP | DBG_FUNC_START, jnl, tr, 0, tr->trim.extent_count, 0);
+				KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_UNMAP | DBG_FUNC_START, VM_KERNEL_ADDRPERM(jnl), tr, 0, tr->trim.extent_count, 0);
 			errno = VNOP_IOCTL(jnl->fsdev, DKIOCUNMAP, (caddr_t)&unmap, FWRITE, vfs_context_kernel());
 			if (jnl_kdebug)
 				KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_UNMAP | DBG_FUNC_END, errno, 0, 0, 0, 0);
-			if (errno) {
-				printf("jnl: error %d from DKIOCUNMAP (extents=%lx, count=%u); disabling trim for %s\n",
-				       errno, (unsigned long) (unmap.extents), unmap.extentsCount,
-				       jnl->jdev_name);
-				jnl->flags &= ~JOURNAL_USE_UNMAP;
-			}
 		}
-		
+
 		/*
 		 * Call back into the file system to tell them that we have
 		 * trimmed some extents and that they can now be reused.
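
A minimal sketch of how a filesystem might use the two entry points added above, journal_trim_extent_overlap() and journal_request_immediate_flush(); both require an open transaction. fs_reuse_blocks() is a hypothetical caller name, not part of this patch, and only the two journal_* calls are the real API:

/*
 * Hypothetical caller: decide whether blocks about to be reused still
 * have a TRIM pending against them, and if so ask the journal to flush
 * as soon as the active transaction completes.
 */
static int
fs_reuse_blocks(journal *jnl, uint64_t offset, uint64_t length)
{
	uint64_t overlap_end = 0;

	/* Searches the active transaction's trim list, then (under the
	 * trim lock) the async trim list of in-flight transactions. */
	if (journal_trim_extent_overlap(jnl, offset, length, &overlap_end)) {
		/* Returns EPERM if no transaction is open to carry the hint. */
		return journal_request_immediate_flush(jnl);
	}
	return 0;
}
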
@@ -3456,10 +3632,11 @@ journal_trim_flush(journal *jnl, transaction *tr)
 		 */
 		if (jnl->trim_callback)
 			jnl->trim_callback(jnl->trim_callback_arg, tr->trim.extent_count, tr->trim.extents);
-		
-		lck_rw_unlock_shared(&jnl->trim_lock);
 	}
+	lck_rw_unlock_shared(&jnl->trim_lock);
 
+	if ((jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT) && (was_vm_privileged == FALSE))
+		set_vm_privilege(FALSE);
 	/*
 	 * If the transaction we're flushing was the async transaction, then
 	 * tell the current transaction that there is no pending trim
@@ -3475,6 +3652,11 @@ journal_trim_flush(journal *jnl, transaction *tr)
 		jnl->async_trim = NULL;
 	lck_rw_unlock_exclusive(&jnl->trim_lock);
 
+	/*
+	 * By the time we get here, no other thread can discover the address
+	 * of "tr", so it is safe for us to manipulate tr->trim without
+	 * holding any locks.
+	 */
 	if (tr->trim.extents) {
 		kfree(tr->trim.extents, tr->trim.allocated_count * sizeof(dk_extent_t));
 		tr->trim.allocated_count = 0;
@@ -3488,7 +3670,6 @@ journal_trim_flush(journal *jnl, transaction *tr)
 	return errno;
 }
 
-
 static int
 journal_binfo_cmp(const void *a, const void *b)
 {
@@ -3564,7 +3745,7 @@ end_transaction(transaction *tr, int force_it, errno_t (*callback)(void*), void
 		jnl->cur_tr = tr;
 		goto done;
 	}
-	
+
 	// if our transaction buffer isn't very full, just hang
 	// on to it and don't actually flush anything.  this is
 	// what is known as "group commit".  we will flush the
@@ -3607,7 +3788,7 @@ end_transaction(transaction *tr, int force_it, errno_t (*callback)(void*), void
 		KERNEL_DEBUG(0xbbbbc018|DBG_FUNC_END, jnl, tr, ret_val, 0, 0);
 		goto done;
 	}
-	
+
 	/*
 	 * Store a pointer to this transaction's trim list so that
 	 * future transactions can find it.
@@ -3634,7 +3815,7 @@ end_transaction(transaction *tr, int force_it, errno_t (*callback)(void*), void
 	 * of the journal flush, 'saved_sequence_num' remains stable
 	 */
 	jnl->saved_sequence_num = jnl->sequence_num;
-	
+
 	/*
 	 * if we're here we're going to flush the transaction buffer to disk.
 	 * 'check_free_space' will not return until there is enough free
@@ -3793,8 +3974,7 @@ end_transaction(transaction *tr, int force_it, errno_t (*callback)(void*), void
 		must_wait = TRUE;
 
 	if (drop_lock_early == TRUE) {
-		jnl->owner = NULL;
-		unlock_journal(jnl);
+		journal_unlock(jnl);
 		drop_lock = FALSE;
 	}
 	if (must_wait == TRUE)
@@ -3812,8 +3992,7 @@ end_transaction(transaction *tr, int force_it, errno_t (*callback)(void*), void
 	KERNEL_DEBUG(0xbbbbc018|DBG_FUNC_END, jnl, tr, ret_val, 0, 0);
 done:
 	if (drop_lock == TRUE) {
-		jnl->owner = NULL;
-		unlock_journal(jnl);
+		journal_unlock(jnl);
 	}
 	return (ret_val);
 }
@@ -3822,14 +4001,8 @@ done:
 static void
 finish_end_thread(transaction *tr)
 {
-#if !CONFIG_EMBEDDED
-	proc_apply_thread_selfdiskacc(IOPOL_PASSIVE);
-#else /* !CONFIG_EMBEDDED */
-	struct uthread	*ut;
-
-	ut = get_bsdthread_info(current_thread());
-	ut->uu_iopol_disk = IOPOL_PASSIVE;
-#endif /* !CONFIG_EMBEDDED */
+	proc_set_task_policy(current_task(), current_thread(),
+			     TASK_POLICY_INTERNAL, TASK_POLICY_IOPOL, IOPOL_PASSIVE);
 
 	finish_end_transaction(tr, NULL, NULL);
 
@@ -3840,14 +4013,8 @@ finish_end_thread(transaction *tr)
 static void
 write_header_thread(journal *jnl)
 {
-#if !CONFIG_EMBEDDED
-	proc_apply_thread_selfdiskacc(IOPOL_PASSIVE);
-#else /* !CONFIG_EMBEDDED */
-	struct uthread	*ut;
-
-	ut = get_bsdthread_info(current_thread());
-	ut->uu_iopol_disk = IOPOL_PASSIVE;
-#endif /* !CONFIG_EMBEDDED */
+	proc_set_task_policy(current_task(), current_thread(),
+			     TASK_POLICY_INTERNAL, TASK_POLICY_IOPOL, IOPOL_PASSIVE);
 
 	if (write_journal_header(jnl, 1, jnl->saved_sequence_num))
 		jnl->write_header_failed = TRUE;
@@ -3878,6 +4045,7 @@ finish_end_transaction(transaction *tr, errno_t (*callback)(void*), void *callba
 	end  = jnl->jhdr->end;
 
 	for (blhdr = tr->blhdr; blhdr; blhdr = (block_list_header *)((long)blhdr->binfo[0].bnum)) {
+		boolean_t was_vm_privileged;
 
 		amt = blhdr->bytes_used;
 
@@ -3886,9 +4054,22 @@ finish_end_transaction(transaction *tr, errno_t (*callback)(void*), void *callba
 		blhdr->checksum = 0;
 		blhdr->checksum = calc_checksum((char *)blhdr, BLHDR_CHECKSUM_SIZE);
 
+		if (jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT) {
+			/*
+			 * if we block waiting for memory, and there is enough pressure to
+			 * cause us to try and create a new swap file, we may end up deadlocking
+			 * due to waiting for the journal on the swap file creation path...
+			 * by making ourselves vm_privileged, we give ourselves the best chance
+			 * of not blocking
+			 */
+			was_vm_privileged = set_vm_privilege(TRUE);
+		}
 		if (kmem_alloc(kernel_map, (vm_offset_t *)&bparray, blhdr->num_blocks * sizeof(struct buf *))) {
 			panic("can't allocate %zd bytes for bparray\n", blhdr->num_blocks * sizeof(struct buf *));
 		}
+		if ((jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT) && (was_vm_privileged == FALSE))
+			set_vm_privilege(FALSE);
+
 		tbuffer_offset = jnl->jhdr->blhdr_size;
 
 		for (i = 1; i < blhdr->num_blocks; i++) {
@@ -4119,13 +4300,13 @@ bad_journal:
 		jnl->flush_aborted = TRUE;
 
 		unlock_condition(jnl, &jnl->flushing);
-		lock_journal(jnl);
+		journal_lock(jnl);
 		jnl->flags |= JOURNAL_INVALID;
 		jnl->old_start[sizeof(jnl->old_start)/sizeof(jnl->old_start[0]) - 1] &= ~0x8000000000000000LL;
 		abort_transaction(jnl, tr);		// cleans up list of extents to be trimmed
-		unlock_journal(jnl);
+		journal_unlock(jnl);
 	} else
 		unlock_condition(jnl, &jnl->flushing);
@@ -4249,7 +4430,7 @@ abort_transaction(journal *jnl, transaction *tr)
 				 */
 				vnode_rele_ext(bp_vp, 0, 1);
 			} else {
-				printf("jnl: %s: abort_tr: could not find block %Ld vp %p!\n",
+				printf("jnl: %s: abort_tr: could not find block %lld vp %p!\n",
 				       jnl->jdev_name, blhdr->binfo[i].bnum, tbp);
 				if (bp) {
 					buf_brelse(bp);
@@ -4276,6 +4457,7 @@ abort_transaction(journal *jnl, transaction *tr)
 		jnl->async_trim = NULL;
 	lck_rw_unlock_exclusive(&jnl->trim_lock);
 
+
 	if (tr->trim.extents) {
 		kfree(tr->trim.extents, tr->trim.allocated_count * sizeof(dk_extent_t));
 	}
@@ -4328,8 +4510,7 @@ journal_end_transaction(journal *jnl)
 			abort_transaction(jnl, tr);
 		}
 
-		jnl->owner = NULL;
-		unlock_journal(jnl);
+		journal_unlock(jnl);
 
 		return EINVAL;
 	}
@@ -4343,7 +4524,19 @@ journal_end_transaction(journal *jnl)
 	// called from end_transaction().
 	//
 	jnl->active_tr = NULL;
-	ret = end_transaction(tr, 0, NULL, NULL, TRUE, FALSE);
+
+	/* Examine the force-journal-flush state in the active txn */
+	if (tr->flush_on_completion == TRUE) {
+		/*
+		 * If the FS requested it, disallow group commit and force the
+		 * transaction out to disk immediately.
+		 */
+		ret = end_transaction(tr, 1, NULL, NULL, TRUE, TRUE);
+	}
+	else {
+		/* in the common path we can simply use the double-buffered journal */
+		ret = end_transaction(tr, 0, NULL, NULL, TRUE, FALSE);
+	}
 
 	return ret;
 }
@@ -4391,7 +4584,7 @@ journal_flush(journal *jnl, boolean_t wait_for_IO)
 	KERNEL_DEBUG(DBG_JOURNAL_FLUSH | DBG_FUNC_START, jnl, 0, 0, 0, 0);
 
 	if (jnl->owner != current_thread()) {
-		lock_journal(jnl);
+		journal_lock(jnl);
 		drop_lock = TRUE;
 	}
 
@@ -4415,7 +4608,7 @@ journal_flush(journal *jnl, boolean_t wait_for_IO)
 	} else  {
 		if (drop_lock == TRUE) {
-			unlock_journal(jnl);
+			journal_unlock(jnl);
 		}
 
 		/* Because of pipelined journal, the journal transactions
@@ -4520,7 +4713,8 @@ int journal_relocate(journal *jnl, off_t offset, off_t journal_size, int32_t tbu
 {
 	int		ret;
 	transaction	*tr;
-	
+	size_t		i = 0;
+
 	/*
	 * Sanity check inputs, and adjust the size of the transaction buffer.
	 */
@@ -4565,7 +4759,23 @@ int journal_relocate(journal *jnl, off_t offset, off_t journal_size, int32_t tbu
 		return ret;
 	}
 	wait_condition(jnl, &jnl->flushing, "end_transaction");
-	
+
+	/*
+	 * At this point, we have completely flushed the contents of the current
+	 * journal to disk (and have asynchronously written all of the txns to
+	 * their actual desired locations).  As a result, we can (and must) clear
+	 * out the old_start array.  If we do not, then if the last written transaction
+	 * started at the beginning of the journal (starting 1 block into the
+	 * journal file) it could confuse the buffer_flushed callback. This is
+	 * because we're about to reset the start/end pointers of the journal header
+	 * below.
+	 */
+	lock_oldstart(jnl);
+	for (i = 0; i < sizeof (jnl->old_start) / sizeof(jnl->old_start[0]); i++) {
+		jnl->old_start[i] = 0;
+	}
+	unlock_oldstart(jnl);
+
 	/* Update the journal's offset and size in memory. */
 	jnl->jdev_offset = offset;
 	jnl->jhdr->start = jnl->jhdr->end = jnl->jhdr->jhdr_size;
@@ -4619,7 +4829,8 @@ journal_create(__unused struct vnode *jvp,
 	       __unused int32_t flags,
 	       __unused int32_t tbuffer_size,
 	       __unused void (*flush)(void *arg),
-	       __unused void *arg)
+	       __unused void *arg,
+	       __unused struct mount *fsmount)
 {
 	return NULL;
 }
@@ -4633,7 +4844,8 @@ journal_open(__unused struct vnode *jvp,
 	     __unused int32_t flags,
 	     __unused int32_t tbuffer_size,
 	     __unused void (*flush)(void *arg),
-	     __unused void *arg)
+	     __unused void *arg,
+	     __unused struct mount *fsmount)
 {
 	return NULL;
 }
@@ -4709,4 +4921,47 @@ journal_owner(__unused journal *jnl)
 {
 	return NULL;
 }
+
+void
+journal_lock(__unused journal *jnl)
+{
+	return;
+}
+
+void
+journal_unlock(__unused journal *jnl)
+{
+	return;
+}
+
+__private_extern__ int
+journal_trim_add_extent(__unused journal *jnl,
+			__unused uint64_t offset,
+			__unused uint64_t length)
+{
+	return 0;
+}
+
+int
+journal_request_immediate_flush(__unused journal *jnl)
+{
+	return 0;
+}
+
+__private_extern__ int
+journal_trim_remove_extent(__unused journal *jnl,
+			   __unused uint64_t offset,
+			   __unused uint64_t length)
+{
+	return 0;
+}
+
+int journal_trim_extent_overlap(__unused journal *jnl,
+				__unused uint64_t offset,
+				__unused uint64_t length,
+				__unused uint64_t *end)
+{
+	return 0;
+}
+
 #endif	// !JOURNALING
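
A few self-contained sketches of the mechanisms this patch touches. First, the bufattr marking added in do_journal_io(): journal I/O uses an iobuf that bypasses the buffer cache, so the patch tags the buffer as metadata through its embedded bufattr rather than through b_flags. Reduced to a standalone helper (the function name is hypothetical; in the patch the two statements are inlined):

/* Hypothetical helper; in the patch the two statements sit inline in
 * do_journal_io().  BA_META in the bufattr tells the layers below that
 * this is metadata I/O without overloading b_flags on an iobuf, which
 * would be misleading since this is not a cached metadata buffer. */
static void
jnl_mark_iobuf_metadata(buf_t bp)
{
	struct bufattr *bap = &bp->b_attr;

	bap->ba_flags |= BA_META;
}
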
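The set_vm_privilege() bracket added in journal_allocate_transaction(), trim_realloc(), journal_trim_flush(), and finish_end_transaction() is the same three-step pattern each time. A sketch of it factored into one place, assuming only the calls the patch itself uses (set_vm_privilege() returns the previous privilege state); the helper name is hypothetical:

/*
 * Hypothetical helper.  When the journal protects the volume that backs
 * swap (MNTK_SWAP_MOUNT), a blocking allocation could wait on swap-file
 * creation, which in turn waits on this journal; running vm-privileged
 * for the duration of the allocation gives the best chance of avoiding
 * that deadlock.
 */
static void *
jnl_kalloc_swapsafe(journal *jnl, size_t size)
{
	boolean_t was_vm_privileged = FALSE;
	void *p;

	if (jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT)
		was_vm_privileged = set_vm_privilege(TRUE);

	p = kalloc(size);

	/* Only drop the privilege if this thread did not already hold it. */
	if ((jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT) && (was_vm_privileged == FALSE))
		set_vm_privilege(FALSE);

	return p;
}
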
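trim_search_extent() is a plain binary search over a sorted, non-overlapping extent list. The same predicate restated as a self-contained function, with types simplified from dk_extent_t, for reasoning about the algorithm outside the kernel:

#include <stdint.h>

typedef struct { uint64_t offset; uint64_t length; } extent_t;

/* Simplified restatement of trim_search_extent(): extents are sorted by
 * offset and non-overlapping; returns 1 if [offset, offset+length)
 * intersects any of them, 0 otherwise. */
static int
extent_list_overlaps(const extent_t *ext, uint32_t count,
		     uint64_t offset, uint64_t length)
{
	uint64_t end = offset + length;
	uint32_t lower = 0, upper = count;

	while (lower < upper) {
		uint32_t middle = (lower + upper) / 2;

		if (ext[middle].offset >= end)
			upper = middle;		/* candidate is entirely after the range */
		else if (ext[middle].offset + ext[middle].length <= offset)
			lower = middle + 1;	/* candidate is entirely before the range */
		else
			return 1;		/* partial overlap found */
	}
	return 0;
}
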
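The compatibility check retained in journal_open() (the "jhdr size and blhdr size are not compatible" failure) enforces that one block-list header never describes more blocks than the journal can hold. A worked example with assumed numbers; in particular, sizeof(block_info) is taken as 16 bytes for the arithmetic, which is an assumption and not stated in the diff:

/*
 * Worked example of the invariant, with assumed sizes:
 *
 *   jhdr_size = blhdr_size = 4096 bytes, journal size = 8 MiB
 *
 *   slots per block list : blhdr_size / sizeof(block_info) - 1
 *                        = 4096 / 16 - 1          = 255
 *   blocks in journal    : size / jhdr_size
 *                        = 8388608 / 4096         = 2048
 *
 * open succeeds because 255 <= 2048; a journal holding fewer blocks
 * than one full block list can reference would be rejected.
 */
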
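Finally, the interface change that threads through the whole patch: journal_create() and journal_open() now take a trailing struct mount so the journal can hold a mount_ref() for IO-throttling accounting, released in journal_close() and on every error path. A hedged mount-time sketch; the parameter order follows this file's journal_open(), and the fs_* names are hypothetical:

/*
 * Sketch only: everything except journal_open() and its parameters is a
 * hypothetical caller.  Note the new trailing fsmount argument.
 */
static journal *
fs_mount_journal(struct mount *mp, struct vnode *jvp, struct vnode *fsvp,
		 off_t jnl_offset, off_t jnl_size, size_t fs_blksz,
		 void (*flush_cb)(void *), void *cb_arg)
{
	journal *jnl;

	jnl = journal_open(jvp, jnl_offset, jnl_size, fsvp, fs_blksz,
			   0,		/* flags */
			   0,		/* tbuffer_size: 0 selects the default */
			   flush_cb, cb_arg,
			   mp);		/* new in this revision: mount_ref'd by the journal */
	if (jnl == NULL) {
		/* open failed; journal_open() already dropped its mount and name refs */
	}
	return jnl;
}
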