apple/xnu.git: bsd/vfs/vfs_journal.c (as of tag xnu-2782.30.5)

diff --git a/bsd/vfs/vfs_journal.c b/bsd/vfs/vfs_journal.c
index 3864ff34c2b805092352f92a433461b707827ffe..628e5e7dcc7401ae3f86496271f065e86c2ef3c6 100644
--- a/bsd/vfs/vfs_journal.c
+++ b/bsd/vfs/vfs_journal.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2002-2012 Apple Inc. All rights reserved.
+ * Copyright (c) 2002-2014 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -124,13 +124,13 @@ SYSCTL_INT(_vfs_generic_jnl_kdebug, OID_AUTO, trim, CTLFLAG_RW|CTLFLAG_LOCKED, &
 #if JOURNALING
 
 //
-// By default, we grow the list of extents to trim by one page at a time.
+// By default, we grow the list of extents to trim by 4K at a time.
 // We'll opt to flush a transaction if it contains at least
 // JOURNAL_FLUSH_TRIM_EXTENTS extents to be trimmed (even if the number
 // of modified blocks is small).
 //
 enum {
-    JOURNAL_DEFAULT_TRIM_BYTES = PAGE_SIZE,
+    JOURNAL_DEFAULT_TRIM_BYTES = 4096,
     JOURNAL_DEFAULT_TRIM_EXTENTS = JOURNAL_DEFAULT_TRIM_BYTES / sizeof(dk_extent_t),
     JOURNAL_FLUSH_TRIM_EXTENTS = JOURNAL_DEFAULT_TRIM_EXTENTS * 15 / 16
 };
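
For reference, dk_extent_t (from <sys/disk.h>) is a pair of 64-bit fields, offset and length, so 16 bytes per extent: the list now grows 256 extents (4096 / 16) at a time, and a transaction asks to be flushed once it has queued 240 of them. A quick user-space check of that arithmetic, assuming the two-field dk_extent_t layout:

    #include <assert.h>
    #include <stdint.h>

    /* assumed layout of dk_extent_t: two 64-bit fields, 16 bytes total */
    typedef struct { uint64_t offset; uint64_t length; } dk_extent_t;

    int main(void) {
        assert(sizeof(dk_extent_t) == 16);
        assert(4096 / sizeof(dk_extent_t) == 256);  /* JOURNAL_DEFAULT_TRIM_EXTENTS */
        assert(256 * 15 / 16 == 240);               /* JOURNAL_FLUSH_TRIM_EXTENTS */
        return 0;
    }
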
@@ -162,8 +162,6 @@ static int end_transaction(transaction *tr, int force_it, errno_t (*callback)(vo
 static void abort_transaction(journal *jnl, transaction *tr);
 static void dump_journal(journal *jnl);
 
-static __inline__ void  lock_journal(journal *jnl);
-static __inline__ void  unlock_journal(journal *jnl);
 static __inline__ void  lock_oldstart(journal *jnl);
 static __inline__ void  unlock_oldstart(journal *jnl);
 static __inline__ void  lock_flush(journal *jnl);
@@ -248,10 +246,11 @@ static int insert_block(journal *jnl, struct bucket **buf_ptr, int blk_index, of
 // we use it to checksum the journal header and the block list
 // headers that are at the start of each transaction.
 //
-static int
+static unsigned int
 calc_checksum(char *ptr, int len)
 {
-       int i, cksum=0;
+       int i;
+       unsigned int cksum=0;
 
        // this is a lame checksum but for now it'll do
        for(i = 0; i < len; i++, ptr++) {
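
The hunk cuts the loop body off; in the shipped source the "lame checksum" folds each byte as cksum = (cksum << 8) ^ (cksum + *(unsigned char *)ptr) and returns ~cksum. The switch from int to unsigned int matters because left-shifting a signed value whose high bit is set is undefined behavior in C; with unsigned arithmetic the fold is well defined. A standalone rendering of the routine as it reads after this change:

    #include <stdio.h>

    /* calc_checksum() after the signedness fix: an unsigned byte-fold */
    static unsigned int
    calc_checksum(char *ptr, int len)
    {
        int i;
        unsigned int cksum = 0;

        for (i = 0; i < len; i++, ptr++) {
            cksum = (cksum << 8) ^ (cksum + *(unsigned char *)ptr);
        }
        return (~cksum);
    }

    int main(void) {
        char hdr[] = "journal header bytes";
        printf("0x%08x\n", calc_checksum(hdr, (int)sizeof(hdr)));
        return 0;
    }
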
@@ -276,15 +275,20 @@ journal_init(void)
        jnl_mutex_group  = lck_grp_alloc_init("jnl-mutex", jnl_group_attr);
 }
 
-static __inline__ void
-lock_journal(journal *jnl)
+__inline__ void
+journal_lock(journal *jnl)
 {
        lck_mtx_lock(&jnl->jlock);
+       if (jnl->owner) {
+               panic ("jnl: owner is %p, expected NULL\n", jnl->owner);
+       }
+       jnl->owner = current_thread();
 }
 
-static __inline__ void
-unlock_journal(journal *jnl)
+__inline__ void
+journal_unlock(journal *jnl)
 {
+       jnl->owner = NULL;
        lck_mtx_unlock(&jnl->jlock);
 }
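
The renamed journal_lock() now records the owning thread inside the lock, which is what lets call sites such as journal_close() and journal_flush() (below) test jnl->owner != current_thread() before re-acquiring. A user-space sketch of the same owner-tracked mutex pattern; the names here are hypothetical, not xnu API:

    #include <assert.h>
    #include <pthread.h>

    /* hypothetical owner-tracked mutex mirroring journal_lock()/journal_unlock() */
    typedef struct {
        pthread_mutex_t mtx;
        pthread_t       owner;
        int             owned;          /* stands in for "owner != NULL" */
    } owned_lock_t;

    static void owned_lock(owned_lock_t *l) {
        pthread_mutex_lock(&l->mtx);
        assert(!l->owned);              /* mirrors the panic on a non-NULL owner */
        l->owner = pthread_self();
        l->owned = 1;
    }

    static void owned_unlock(owned_lock_t *l) {
        l->owned = 0;
        pthread_mutex_unlock(&l->mtx);
    }

    /* callers can ask "do I already hold it?" instead of deadlocking; as in
     * the kernel, the unlocked read is safe only because a thread always
     * sees its own prior writes to owner */
    static void maybe_lock(owned_lock_t *l, int *drop) {
        if (!l->owned || !pthread_equal(l->owner, pthread_self())) {
            owned_lock(l);
            *drop = 1;
        }
    }

    int main(void) {
        static owned_lock_t jlock = { .mtx = PTHREAD_MUTEX_INITIALIZER };
        int drop = 0;
        maybe_lock(&jlock, &drop);
        if (drop)
            owned_unlock(&jlock);
        return 0;
    }
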
 
@@ -334,6 +338,7 @@ do_journal_io(journal *jnl, off_t *offset, void *data, size_t len, int direction
        size_t  io_sz = 0;
        buf_t   bp;
        off_t   max_iosize;
+       struct bufattr *bap;
 
        if (*offset < 0 || *offset > jnl->jhdr->size) {
                panic("jnl: do_jnl_io: bad offset 0x%llx (max 0x%llx)\n", *offset, jnl->jhdr->size);
@@ -369,6 +374,20 @@ again:
                panic("jnl: request for i/o to jnl-header without JNL_HEADER flag set! (len %d, data %p)\n", curlen, data);
        }
 
+       /*
+        * As alluded to in the block comment at the top of the function, we use a "fake" iobuf
+        * here and issue directly to the disk device that the journal protects since we don't
+        * want this to enter the block cache.  As a result, we lose the ability to mark it
+        * as a metadata buf_t for the layers below us that may care. If we were to
+        * simply set the B_META flag in b_flags, that might confuse things further,
+        * since this is an iobuf, not a metadata buffer.
+        *
+        * To address this, we use the extended bufattr struct embedded in the bp. 
+        * Explicitly mark the buf here as a metadata buffer in its bufattr flags.
+        */
+       bap = &bp->b_attr;
+       bap->ba_flags |= BA_META;
+       
        if (direction & JNL_READ)
                buf_setflags(bp, B_READ);
        else {
@@ -571,9 +590,6 @@ buffer_flushed_callback(struct buf *bp, void *arg)
        CHECK_TRANSACTION(tr);
 
        jnl = tr->jnl;
-       if (jnl->flags & JOURNAL_INVALID) {
-               return;
-       }
 
        CHECK_JOURNAL(jnl);
 
@@ -614,6 +630,9 @@ buffer_flushed_callback(struct buf *bp, void *arg)
        // cleanup for this transaction
        tr->total_bytes = 0xfbadc0de;
 
+       if (jnl->flags & JOURNAL_INVALID)
+               goto transaction_done;
+
        //printf("jnl: tr 0x%x (0x%llx 0x%llx) in jnl 0x%x completed.\n",
        //   tr, tr->journal_start, tr->journal_end, jnl);
 
@@ -709,6 +728,7 @@ buffer_flushed_callback(struct buf *bp, void *arg)
                tr->next       = jnl->tr_freeme;
                jnl->tr_freeme = tr;
        }
+transaction_done:
        unlock_oldstart(jnl);
 
        unlock_condition(jnl, &jnl->asyncIO);
@@ -1091,7 +1111,8 @@ add_block(journal *jnl, struct bucket **buf_ptr, off_t block_num, size_t size, _
 static int
 replay_journal(journal *jnl)
 {
-       int             i, orig_checksum, checksum, check_block_checksums=0, bad_blocks=0;
+       int             i, bad_blocks=0;
+       unsigned int    orig_checksum, checksum, check_block_checksums = 0;
        size_t          ret;
        size_t          max_bsize = 0;          /* protected by block_ptr */
        block_list_header *blhdr;
@@ -1157,7 +1178,7 @@ restart_replay:
                if (jnl->flags & JOURNAL_NEED_SWAP) {
                        // calculate the checksum based on the unswapped data
                        // because it is done byte-at-a-time.
-                       orig_checksum = SWAP32(orig_checksum);
+                       orig_checksum = (unsigned int)SWAP32(orig_checksum);
                        checksum = calc_checksum((char *)blhdr, BLHDR_CHECKSUM_SIZE);
                        swap_block_list_header(jnl, blhdr);
                } else {
@@ -1468,7 +1489,7 @@ bad_replay:
 
 
 #define DEFAULT_TRANSACTION_BUFFER_SIZE  (128*1024)
-#define MAX_TRANSACTION_BUFFER_SIZE      (2048*1024)
+#define MAX_TRANSACTION_BUFFER_SIZE      (3072*1024)
 
 // XXXdbg - so I can change it in the debugger
 int def_tbuffer_size = 0;
@@ -1487,14 +1508,14 @@ size_up_tbuffer(journal *jnl, int tbuffer_size, int phys_blksz)
        // there is in the machine.
        //
        if (def_tbuffer_size == 0) {
-               if (mem_size < (256*1024*1024)) {
+               if (max_mem < (256*1024*1024)) {
                        def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE;
-               } else if (mem_size < (512*1024*1024)) {
+               } else if (max_mem < (512*1024*1024)) {
                        def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE * 2;
-               } else if (mem_size < (1024*1024*1024)) {
+               } else if (max_mem < (1024*1024*1024)) {
                        def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE * 3;
                } else {
-                       def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE * (mem_size / (256*1024*1024));
+                       def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE * (max_mem / (256*1024*1024));
                }
        }
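
Worked through, the heuristic yields 128 KiB below 256 MB of RAM, 256 KiB below 512 MB, 384 KiB below 1 GB, and 128 KiB per 256 MB beyond that, so the cap raised earlier in this patch (MAX_TRANSACTION_BUFFER_SIZE, now 3 MB) starts to bind above 6 GB of RAM. A user-space sketch, assuming size_up_tbuffer() clamps the result to that cap as the surrounding code suggests:

    #include <stdint.h>
    #include <stdio.h>

    #define DEFAULT_TRANSACTION_BUFFER_SIZE  (128*1024)
    #define MAX_TRANSACTION_BUFFER_SIZE      (3072*1024)   /* raised from 2 MB */

    /* sketch of the def_tbuffer_size heuristic above, clamp assumed */
    static uint64_t def_tbuffer_for(uint64_t max_mem) {
        uint64_t sz;
        if (max_mem < (256ULL << 20))
            sz = DEFAULT_TRANSACTION_BUFFER_SIZE;
        else if (max_mem < (512ULL << 20))
            sz = DEFAULT_TRANSACTION_BUFFER_SIZE * 2;
        else if (max_mem < (1024ULL << 20))
            sz = DEFAULT_TRANSACTION_BUFFER_SIZE * 3;
        else
            sz = DEFAULT_TRANSACTION_BUFFER_SIZE * (max_mem / (256ULL << 20));
        return sz > MAX_TRANSACTION_BUFFER_SIZE ? MAX_TRANSACTION_BUFFER_SIZE : sz;
    }

    int main(void) {
        /* an 8 GB machine computes 4 MB and is clamped to 3 MB */
        printf("%llu\n", (unsigned long long)def_tbuffer_for(8ULL << 30));
        return 0;
    }
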
 
@@ -1532,8 +1553,6 @@ size_up_tbuffer(journal *jnl, int tbuffer_size, int phys_blksz)
        }
 }
 
-
-
 static void
 get_io_info(struct vnode *devvp, size_t phys_blksz, journal *jnl, struct vfs_context *context)
 {
@@ -1546,9 +1565,10 @@ get_io_info(struct vnode *devvp, size_t phys_blksz, journal *jnl, struct vfs_con
 
        if (VNOP_IOCTL(devvp, DKIOCGETFEATURES, (caddr_t)&features, 0, context) == 0) {
                if (features & DK_FEATURE_FORCE_UNIT_ACCESS) {
-                       const char *name = vnode_name(devvp);
+                       const char *name = vnode_getname_printable(devvp);
                        jnl->flags |= JOURNAL_DO_FUA_WRITES;
-                       printf("jnl: %s: enabling FUA writes (features 0x%x)\n", name ? name : "no-name-dev", features);
+                       printf("jnl: %s: enabling FUA writes (features 0x%x)\n", name, features);
+                       vnode_putname_printable(name);
                }
                if (features & DK_FEATURE_UNMAP) {
                        jnl->flags |= JOURNAL_USE_UNMAP;
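
vnode_getname_printable() is the new pattern throughout this patch: unlike vnode_name(), it never returns NULL (it falls back to a printable placeholder name), so the name ? name : "no-name-dev" guards disappear, but every call must now be balanced by vnode_putname_printable(). The shape each converted call site follows:

    const char *name = vnode_getname_printable(devvp);
    printf("jnl: %s: ...\n", name);     /* no NULL guard needed */
    vnode_putname_printable(name);      /* must balance the get */
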
@@ -1616,23 +1636,6 @@ get_io_info(struct vnode *devvp, size_t phys_blksz, journal *jnl, struct vfs_con
 }
 
 
-static const char *
-get_jdev_name(struct vnode *jvp)
-{
-       const char *jdev_name;
-    
-       jdev_name = vnode_name(jvp);
-       if (jdev_name == NULL) {
-               jdev_name = vfs_addname("unknown-dev", strlen("unknown-dev"), 0, 0);
-       } else {
-               // this just bumps the refcount on the name so we have our own copy
-               jdev_name = vfs_addname(jdev_name, strlen(jdev_name), 0, 0);
-       }
-
-       return jdev_name;
-}
-
-
 journal *
 journal_create(struct vnode *jvp,
                           off_t         offset,
@@ -1642,7 +1645,8 @@ journal_create(struct vnode *jvp,
                           int32_t       flags,
                           int32_t       tbuffer_size,
                           void        (*flush)(void *arg),
-                          void         *arg)
+                          void         *arg,
+                          struct mount *fsmount)
 {
        journal         *jnl;
        uint32_t        phys_blksz, new_txn_base;
@@ -1658,36 +1662,36 @@ journal_create(struct vnode *jvp,
        context.vc_thread = current_thread();
        context.vc_ucred = FSCRED;
 
-       jdev_name = get_jdev_name(jvp);
+       jdev_name = vnode_getname_printable(jvp);
 
        /* Get the real physical block size. */
        if (VNOP_IOCTL(jvp, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz, 0, &context)) {
-               return NULL;
+               goto cleanup_jdev_name;
        }
 
        if (journal_size < (256*1024) || journal_size > (MAX_JOURNAL_SIZE)) {
-               printf("jnl: create: journal size %lld looks bogus.\n", journal_size);
-               return NULL;
+               printf("jnl: %s: create: journal size %lld looks bogus.\n", jdev_name, journal_size);
+               goto cleanup_jdev_name;
        }
 
        min_size = phys_blksz * (phys_blksz / sizeof(block_info));
        /* Reject journals that are too small given the sector size of the device */
        if (journal_size < min_size) {
-               printf("jnl: create: journal size (%lld) too small given sector size of (%u)\n", 
-                               journal_size, phys_blksz);
-               return NULL;
+               printf("jnl: %s: create: journal size (%lld) too small given sector size of (%u)\n", 
+                               jdev_name, journal_size, phys_blksz);
+               goto cleanup_jdev_name;
        }
 
        if (phys_blksz > min_fs_blksz) {
                printf("jnl: %s: create: error: phys blksize %u bigger than min fs blksize %zd\n",
                       jdev_name, phys_blksz, min_fs_blksz);
-               return NULL;
+               goto cleanup_jdev_name;
        }
 
        if ((journal_size % phys_blksz) != 0) {
                printf("jnl: %s: create: journal size 0x%llx is not an even multiple of block size 0x%ux\n",
                       jdev_name, journal_size, phys_blksz);
-               return NULL;
+               goto cleanup_jdev_name;
        }
 
 
@@ -1703,6 +1707,12 @@ journal_create(struct vnode *jvp,
        jnl->jdev_name    = jdev_name;
        lck_mtx_init(&jnl->old_start_lock, jnl_mutex_group, jnl_lock_attr);
 
+       // Keep a pointer to the mount around for use in IO throttling.
+       jnl->fsmount      = fsmount;
+       // XXX: This lock discipline looks correct based on dounmount(), but it
+       // doesn't seem to be documented anywhere.
+       mount_ref(fsmount, 0);
+
        get_io_info(jvp, phys_blksz, jnl, &context);
        
        if (kmem_alloc_kobject(kernel_map, (vm_offset_t *)&jnl->header_buf, phys_blksz)) {
@@ -1738,7 +1748,7 @@ journal_create(struct vnode *jvp,
               && jnl->jhdr->sequence_num != 0) {
 
                new_txn_base = (jnl->jhdr->sequence_num + (journal_size / phys_blksz) + (random() % 16384)) & 0x00ffffff;
-               printf("jnl: create: avoiding old sequence number 0x%x (0x%x)\n", jnl->jhdr->sequence_num, new_txn_base);
+               printf("jnl: %s: create: avoiding old sequence number 0x%x (0x%x)\n", jdev_name, jnl->jhdr->sequence_num, new_txn_base);
 
 #if 0
                int i;
@@ -1793,19 +1803,20 @@ journal_create(struct vnode *jvp,
                goto bad_write;
        }
 
-       return jnl;
+       goto journal_create_complete;
 
 
 bad_write:
        kmem_free(kernel_map, (vm_offset_t)jnl->header_buf, phys_blksz);
 bad_kmem_alloc:
-       if (jdev_name) {
-               vfs_removename(jdev_name);
-       }
        jnl->jhdr = NULL;
        FREE_ZONE(jnl, sizeof(struct journal), M_JNL_JNL);
-
-       return NULL;
+       mount_drop(fsmount, 0);
+cleanup_jdev_name:
+       vnode_putname_printable(jdev_name);
+       jnl = NULL;
+journal_create_complete:
+       return jnl;
 }
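
The rewritten error handling funnels every failure through a single cleanup label, so the device-name reference can never leak or be dropped twice. A user-space reduction of the same shape, with hypothetical names:

    #include <stdlib.h>
    #include <string.h>

    struct thing { char *name; };

    /* mirrors journal_create(): failures fall through exactly the cleanup
     * they need; success keeps the name reference (released later, the way
     * journal_close() releases jnl->jdev_name) */
    static struct thing *open_thing(const char *dev) {
        struct thing *t;
        char *name = strdup(dev);       /* like vnode_getname_printable() */
        if (name == NULL)
            return NULL;
        t = malloc(sizeof(*t));         /* like MALLOC_ZONE(jnl, ...) */
        if (t == NULL)
            goto cleanup_name;
        t->name = name;                 /* success: keep the reference */
        goto complete;
    cleanup_name:
        free(name);                     /* mirrors vnode_putname_printable() */
        t = NULL;                       /* mirrors "jnl = NULL;" */
    complete:
        return t;
    }
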
 
 
@@ -1818,7 +1829,8 @@ journal_open(struct vnode *jvp,
                         int32_t       flags,
                         int32_t       tbuffer_size,
                         void        (*flush)(void *arg),
-                        void         *arg)
+                        void         *arg,
+                        struct mount *fsmount)
 {
        journal         *jnl;
        uint32_t        orig_blksz=0;
@@ -1826,39 +1838,39 @@ journal_open(struct vnode *jvp,
        u_int32_t       min_size = 0;
        int             orig_checksum, checksum;
        struct vfs_context context;
-       const char      *jdev_name = get_jdev_name(jvp);
+       const char      *jdev_name = vnode_getname_printable(jvp);
 
        context.vc_thread = current_thread();
        context.vc_ucred = FSCRED;
 
        /* Get the real physical block size. */
        if (VNOP_IOCTL(jvp, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz, 0, &context)) {
-               return NULL;
+               goto cleanup_jdev_name;
        }
 
        if (phys_blksz > min_fs_blksz) {
                printf("jnl: %s: open: error: phys blksize %u bigger than min fs blksize %zd\n",
                       jdev_name, phys_blksz, min_fs_blksz);
-               return NULL;
+               goto cleanup_jdev_name;
        }
 
        if (journal_size < (256*1024) || journal_size > (1024*1024*1024)) {
-               printf("jnl: open: journal size %lld looks bogus.\n", journal_size);
-               return NULL;
+               printf("jnl: %s: open: journal size %lld looks bogus.\n", jdev_name, journal_size);
+               goto cleanup_jdev_name;
        }
 
        min_size = phys_blksz * (phys_blksz / sizeof(block_info));
        /* Reject journals that are too small given the sector size of the device */
        if (journal_size < min_size) {
-               printf("jnl: open: journal size (%lld) too small given sector size of (%u)\n", 
-                               journal_size, phys_blksz);
-               return NULL;
+               printf("jnl: %s: open: journal size (%lld) too small given sector size of (%u)\n", 
+                               jdev_name, journal_size, phys_blksz);
+               goto cleanup_jdev_name;
        }
     
        if ((journal_size % phys_blksz) != 0) {
                printf("jnl: %s: open: journal size 0x%llx is not an even multiple of block size 0x%x\n",
                       jdev_name, journal_size, phys_blksz);
-               return NULL;
+               goto cleanup_jdev_name;
        }
 
        MALLOC_ZONE(jnl, struct journal *, sizeof(struct journal), M_JNL_JNL, M_WAITOK);
@@ -1873,6 +1885,12 @@ journal_open(struct vnode *jvp,
        jnl->jdev_name    = jdev_name;
        lck_mtx_init(&jnl->old_start_lock, jnl_mutex_group, jnl_lock_attr);
 
+       /* We need a reference to the mount to later pass to the throttling code for
+        * IO accounting.
+        */
+       jnl->fsmount      = fsmount;
+       mount_ref(fsmount, 0);
+
        get_io_info(jvp, phys_blksz, jnl, &context);
 
        if (kmem_alloc_kobject(kernel_map, (vm_offset_t *)&jnl->header_buf, phys_blksz)) {
@@ -2045,7 +2063,7 @@ journal_open(struct vnode *jvp,
        lck_mtx_init(&jnl->flock, jnl_mutex_group, jnl_lock_attr);
        lck_rw_init(&jnl->trim_lock, jnl_mutex_group, jnl_lock_attr);
 
-       return jnl;
+       goto journal_open_complete;
 
 bad_journal:
        if (orig_blksz != 0) {
@@ -2055,11 +2073,13 @@ bad_journal:
        }
        kmem_free(kernel_map, (vm_offset_t)jnl->header_buf, phys_blksz);
 bad_kmem_alloc:
-       if (jdev_name) {
-               vfs_removename(jdev_name);
-       }
        FREE_ZONE(jnl, sizeof(struct journal), M_JNL_JNL);
-       return NULL;    
+       mount_drop(fsmount, 0);
+cleanup_jdev_name:
+       vnode_putname_printable(jdev_name);
+       jnl = NULL;
+journal_open_complete:
+       return jnl;    
 }
 
 
@@ -2075,7 +2095,7 @@ journal_is_clean(struct vnode *jvp,
        int             ret;
        int             orig_checksum, checksum;
        struct vfs_context context;
-       const           char *jdev_name = get_jdev_name(jvp);
+       const           char *jdev_name = vnode_getname_printable(jvp);
 
        context.vc_thread = current_thread();
        context.vc_ucred = FSCRED;
@@ -2083,31 +2103,36 @@ journal_is_clean(struct vnode *jvp,
        /* Get the real physical block size. */
        if (VNOP_IOCTL(jvp, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz, 0, &context)) {
                printf("jnl: %s: is_clean: failed to get device block size.\n", jdev_name);
-               return EINVAL;
+               ret = EINVAL;
+               goto cleanup_jdev_name;
        }
 
        if (phys_blksz > (uint32_t)min_fs_block_size) {
                printf("jnl: %s: is_clean: error: phys blksize %d bigger than min fs blksize %zd\n",
                       jdev_name, phys_blksz, min_fs_block_size);
-               return EINVAL;
+               ret = EINVAL;
+               goto cleanup_jdev_name;
        }
 
        if (journal_size < (256*1024) || journal_size > (MAX_JOURNAL_SIZE)) {
-               printf("jnl: is_clean: journal size %lld looks bogus.\n", journal_size);
-               return EINVAL;
+               printf("jnl: %s: is_clean: journal size %lld looks bogus.\n", jdev_name, journal_size);
+               ret = EINVAL;
+               goto cleanup_jdev_name;
        }
     
        if ((journal_size % phys_blksz) != 0) {
                printf("jnl: %s: is_clean: journal size 0x%llx is not an even multiple of block size 0x%x\n",
                       jdev_name, journal_size, phys_blksz);
-               return EINVAL;
+               ret = EINVAL;
+               goto cleanup_jdev_name;
        }
 
        memset(&jnl, 0, sizeof(jnl));
 
        if (kmem_alloc_kobject(kernel_map, (vm_offset_t *)&jnl.header_buf, phys_blksz)) {
                printf("jnl: %s: is_clean: could not allocate space for header buffer (%d bytes)\n", jdev_name, phys_blksz);
-               return ENOMEM;
+               ret = ENOMEM;
+               goto cleanup_jdev_name;
        }
        jnl.header_buf_size = phys_blksz;
 
@@ -2168,12 +2193,9 @@ journal_is_clean(struct vnode *jvp,
 
 get_out:
        kmem_free(kernel_map, (vm_offset_t)jnl.header_buf, phys_blksz);
-       if (jdev_name) {
-               vfs_removename(jdev_name);
-       }
-    
-       return ret;    
-
+cleanup_jdev_name:
+       vnode_putname_printable(jdev_name);
+       return ret;
 }
 
 
@@ -2191,7 +2213,7 @@ journal_close(journal *jnl)
        jnl->flags |= JOURNAL_CLOSE_PENDING;
 
        if (jnl->owner != current_thread()) {
-               lock_journal(jnl);
+               journal_lock(jnl);
        }
 
        wait_condition(jnl, &jnl->flushing, "journal_close");
@@ -2269,16 +2291,23 @@ journal_close(journal *jnl)
                        }
                }
        }
+       wait_condition(jnl, &jnl->asyncIO, "journal_close");
 
        free_old_stuff(jnl);
 
        kmem_free(kernel_map, (vm_offset_t)jnl->header_buf, jnl->header_buf_size);
        jnl->jhdr = (void *)0xbeefbabe;
 
-       if (jnl->jdev_name) {
-               vfs_removename(jnl->jdev_name);
-       }
+       // Release reference on the mount
+       if (jnl->fsmount)
+                mount_drop(jnl->fsmount, 0);
 
+       vnode_putname_printable(jnl->jdev_name);
+
+       journal_unlock(jnl);
+       lck_mtx_destroy(&jnl->old_start_lock, jnl_mutex_group);
+       lck_mtx_destroy(&jnl->jlock, jnl_mutex_group);
+       lck_mtx_destroy(&jnl->flock, jnl_mutex_group);
        FREE_ZONE(jnl, sizeof(struct journal), M_JNL_JNL);
 }
 
@@ -2449,7 +2478,19 @@ static errno_t
 journal_allocate_transaction(journal *jnl)
 {
        transaction *tr;
+       boolean_t was_vm_privileged;
        
+       if (jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT) {
+               /*
+                * the disk driver can allocate memory on this path...
+                * if we block waiting for memory, and there is enough pressure to
+                * cause us to try and create a new swap file, we may end up deadlocking
+                * due to waiting for the journal on the swap file creation path...
+                * by making ourselves vm_privileged, we give ourselves the best chance
+                * of not blocking
+                */
+               was_vm_privileged = set_vm_privilege(TRUE);
+       }
        MALLOC_ZONE(tr, transaction *, sizeof(transaction), M_JNL_TR, M_WAITOK);
        memset(tr, 0, sizeof(transaction));
 
@@ -2460,6 +2501,8 @@ journal_allocate_transaction(journal *jnl)
                jnl->active_tr = NULL;
                return ENOMEM;
        }
+       if ((jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT) && (was_vm_privileged == FALSE))
+               set_vm_privilege(FALSE);
 
        // journal replay code checksum check depends on this.
        memset(tr->tbuffer, 0, BLHDR_CHECKSUM_SIZE);
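
This MNTK_SWAP_MOUNT / set_vm_privilege() bracket is repeated verbatim in trim_realloc(), journal_trim_flush(), and finish_end_transaction() below; judging by its use, set_vm_privilege() returns the thread's prior privilege state, which is restored only if this call changed it. Read as a unit, the bracket amounts to this hypothetical helper pair (not something the patch adds):

    /* hypothetical helpers; the patch open-codes this bracket at each site */
    static boolean_t
    jnl_vm_privilege_begin(journal *jnl)
    {
            if (jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT)
                    return set_vm_privilege(TRUE);  /* returns prior state */
            return TRUE;    /* not a swap mount: make _end() a no-op */
    }

    static void
    jnl_vm_privilege_end(journal *jnl, boolean_t was_privileged)
    {
            if ((jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT) &&
                (was_privileged == FALSE))
                    set_vm_privilege(FALSE);
    }

Note that the ENOMEM return just above exits before the restore, leaving the thread vm-privileged on that error path.
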
@@ -2502,14 +2545,14 @@ journal_start_transaction(journal *jnl)
                jnl->nested_count++;
                return 0;
        }
-       lock_journal(jnl);
 
-       if (jnl->owner != NULL || jnl->nested_count != 0 || jnl->active_tr != NULL) {
+       journal_lock(jnl);
+
+       if (jnl->nested_count != 0 || jnl->active_tr != NULL) {
                panic("jnl: start_tr: owner %p, nested count %d, active_tr %p jnl @ %p\n",
                      jnl->owner, jnl->nested_count, jnl->active_tr, jnl);
        }
 
-       jnl->owner = current_thread();
        jnl->nested_count = 1;
 
 #if JOE
@@ -2547,9 +2590,8 @@ journal_start_transaction(journal *jnl)
        return 0;
 
 bad_start:
-       jnl->owner        = NULL;
        jnl->nested_count = 0;
-       unlock_journal(jnl);
+       journal_unlock(jnl);
 
        return ret;
 }
@@ -2847,6 +2889,8 @@ journal_modify_block_end(journal *jnl, struct buf *bp, void (*func)(buf_t bp, vo
                blhdr->binfo[i].bnum = (off_t)(buf_blkno(bp));
                blhdr->binfo[i].u.bp = bp;
 
+               KERNEL_DEBUG_CONSTANT(0x3018004, VM_KERNEL_ADDRPERM(vp), blhdr->binfo[i].bnum, bsize, 0, 0);
+
                if (func) {
                        void (*old_func)(buf_t, void *)=NULL, *old_arg=NULL;
                        
@@ -2993,16 +3037,31 @@ journal_trim_set_callback(journal *jnl, jnl_trim_callback_t callback, void *arg)
 ;________________________________________________________________________________
 */
 static int
-trim_realloc(struct jnl_trim_list *trim)
+trim_realloc(journal *jnl, struct jnl_trim_list *trim)
 {
        void *new_extents;
        uint32_t new_allocated_count;
+       boolean_t was_vm_privileged;
        
        if (jnl_kdebug)
-               KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_REALLOC | DBG_FUNC_START, trim, 0, trim->allocated_count, trim->extent_count, 0);
+               KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_REALLOC | DBG_FUNC_START, VM_KERNEL_ADDRPERM(trim), 0, trim->allocated_count, trim->extent_count, 0);
        
        new_allocated_count = trim->allocated_count + JOURNAL_DEFAULT_TRIM_EXTENTS;
+
+       if (jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT) {
+               /*
+                * if we block waiting for memory, and there is enough pressure to
+                * cause us to try and create a new swap file, we may end up deadlocking
+                * due to waiting for the journal on the swap file creation path...
+                * by making ourselves vm_privileged, we give ourselves the best chance
+                * of not blocking
+                */
+               was_vm_privileged = set_vm_privilege(TRUE);
+       }
        new_extents = kalloc(new_allocated_count * sizeof(dk_extent_t));
+       if ((jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT) && (was_vm_privileged == FALSE))
+               set_vm_privilege(FALSE);
+
        if (new_extents == NULL) {
                printf("jnl: trim_realloc: unable to grow extent list!\n");
                /*
@@ -3045,31 +3104,41 @@ trim_realloc(struct jnl_trim_list *trim)
  ;     trim            - The trim list to be searched.
  ;     offset          - The first byte of the range to be searched for.
  ;     length          - The number of bytes of the extent being searched for.
+ ;  overlap_start - start of the overlapping extent
+ ;  overlap_len   - length of the overlapping extent
  ;
  ; Output:
  ;     (result)        - TRUE if one or more extents overlap, FALSE otherwise.
  ;________________________________________________________________________________
  */
 static int
-trim_search_extent(struct jnl_trim_list *trim, uint64_t offset, uint64_t length)
+trim_search_extent(struct jnl_trim_list *trim, uint64_t offset,
+               uint64_t length, uint64_t *overlap_start, uint64_t *overlap_len)
 {
        uint64_t end = offset + length;
        uint32_t lower = 0;                                             /* Lowest index to search */
        uint32_t upper = trim->extent_count;    /* Highest index to search + 1 */
        uint32_t middle;
-       
+
        /* A binary search over the extent list. */
        while (lower < upper) {
                middle = (lower + upper) / 2;
-               
+
                if (trim->extents[middle].offset >= end)
                        upper = middle;
                else if (trim->extents[middle].offset + trim->extents[middle].length <= offset)
                        lower = middle + 1;
-               else
+               else {
+                       if (overlap_start) {
+                               *overlap_start = trim->extents[middle].offset;
+                       }
+                       if (overlap_len) {
+                               *overlap_len = trim->extents[middle].length;
+                       }
                        return TRUE;
+               }
        }
-       
+
        return FALSE;
 }
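
trim_search_extent() relies on the extent list being kept sorted by offset with no overlapping entries, the invariant journal_trim_add_extent() maintains. A self-contained user-space rendering of the same binary search, including the new out-parameters:

    #include <stdint.h>
    #include <stdio.h>

    typedef struct { uint64_t offset; uint64_t length; } dk_extent_t;

    static int
    search(const dk_extent_t *ext, uint32_t count, uint64_t offset,
           uint64_t length, uint64_t *ostart, uint64_t *olen)
    {
        uint64_t end = offset + length;
        uint32_t lower = 0, upper = count;

        while (lower < upper) {
            uint32_t middle = (lower + upper) / 2;
            if (ext[middle].offset >= end)
                upper = middle;                 /* extent entirely after us */
            else if (ext[middle].offset + ext[middle].length <= offset)
                lower = middle + 1;             /* extent entirely before us */
            else {
                if (ostart) *ostart = ext[middle].offset;
                if (olen)   *olen   = ext[middle].length;
                return 1;
            }
        }
        return 0;
    }

    int main(void) {
        dk_extent_t list[] = { {0, 100}, {200, 50}, {400, 10} };
        uint64_t s = 0, l = 0;
        /* [220, 240) overlaps the {200, 50} extent */
        printf("%d (start %llu, len %llu)\n",
               search(list, 3, 220, 20, &s, &l),
               (unsigned long long)s, (unsigned long long)l);
        return 0;
    }
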
 
@@ -3120,7 +3189,7 @@ journal_trim_add_extent(journal *jnl, uint64_t offset, uint64_t length)
        CHECK_TRANSACTION(tr);
 
        if (jnl_kdebug)
-               KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_ADD | DBG_FUNC_START, jnl, offset, length, tr->trim.extent_count, 0);
+               KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_ADD | DBG_FUNC_START, VM_KERNEL_ADDRPERM(jnl), offset, length, tr->trim.extent_count, 0);
 
        if (jnl->owner != current_thread()) {
                panic("jnl: trim_add_extent: called w/out a transaction! jnl %p, owner %p, curact %p\n",
@@ -3156,7 +3225,7 @@ journal_trim_add_extent(journal *jnl, uint64_t offset, uint64_t length)
        if (replace_count == 0) {
                /* If the list was already full, we need to grow it. */
                if (tr->trim.extent_count == tr->trim.allocated_count) {
-                       if (trim_realloc(&tr->trim) != 0) {
+                       if (trim_realloc(jnl, &tr->trim) != 0) {
                                printf("jnl: trim_add_extent: out of memory!");
                                if (jnl_kdebug)
                                        KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_ADD | DBG_FUNC_END, ENOMEM, 0, 0, tr->trim.extent_count, 0);
@@ -3214,6 +3283,92 @@ journal_trim_add_extent(journal *jnl, uint64_t offset, uint64_t length)
     return 0;
 }
 
+/*
+ * journal_trim_extent_overlap
+ *
+ * Return 1 if there are any pending TRIMs that overlap with the given offset and length.
+ * Return 0 otherwise.
+ */
+
+int journal_trim_extent_overlap (journal *jnl, uint64_t offset, uint64_t length, uint64_t *end) {
+       transaction *tr = NULL;
+       int overlap = 0;
+
+       uint64_t overlap_start;
+       uint64_t overlap_len;
+       tr = jnl->active_tr;
+       CHECK_TRANSACTION(tr);
+
+       /*
+        * There are two lists that need to be examined for potential overlaps:
+        *
+        * The first is the current transaction. Since this function requires that
+        * a transaction be active when this is called, this is the "active_tr"
+        * pointer in the journal struct.  This has a trimlist pointer which needs
+        * to be searched.
+        */
+       overlap = trim_search_extent (&tr->trim, offset, length, &overlap_start, &overlap_len);
+       if (overlap == 0) {
+               /*
+                * The second is the async trim list, which is only done if the current
+                * transaction group (active transaction) did not overlap with our target
+                * extent. This async trim list is the set of all previously
+                * committed transaction groups whose I/Os are now in-flight. We need to hold the
+                * trim lock in order to search this list.  If we grab the list before the
+                * TRIM has completed, then we will compare it. If it is grabbed AFTER the
+                * TRIM has completed, then the pointer will be zeroed out and we won't have
+                * to check anything.
+                */
+               lck_rw_lock_shared (&jnl->trim_lock);
+               if (jnl->async_trim != NULL) {
+                       overlap = trim_search_extent(jnl->async_trim, offset, length, &overlap_start, &overlap_len);
+               }
+               lck_rw_unlock_shared (&jnl->trim_lock);
+       }
+
+       if (overlap) {
+               /* compute the end (min) of the overlapping range */
+               if ( (overlap_start + overlap_len) < (offset + length)) {
+                       *end = (overlap_start + overlap_len);
+               }
+               else {
+                       *end = (offset + length);
+               }
+       }
+
+
+       return overlap;
+}
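
On the out-parameter: *end is stored only when the function returns 1, and it holds the end of the overlapping extent clipped to the queried range, i.e. min(overlap_start + overlap_len, offset + length). A hypothetical caller shape:

    uint64_t end;
    if (journal_trim_extent_overlap(jnl, offset, length, &end)) {
            /* part of [offset, end) is covered by a pending TRIM; treat that
             * data as about to be discarded rather than reading it as valid */
    }
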
+
+/*
+ * journal_request_immediate_flush
+ *
+ * FS requests that the journal flush immediately upon the
+ * active transaction's completion.
+ *
+ * Returns 0 if the operation succeeds.
+ * Returns EPERM if we failed to leave the hint (no transaction was active).
+ */
+int
+journal_request_immediate_flush (journal *jnl) {
+
+       transaction *tr = NULL;
+       /*
+        * Is a transaction still in progress? The hint can only be
+        * left while a transaction is open.
+        */
+       tr = jnl->active_tr;
+       if (tr != NULL) {
+               CHECK_TRANSACTION(tr);
+               tr->flush_on_completion = TRUE;
+       }
+       else {
+               return EPERM;
+       }
+       return 0;
+}
+
+
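
journal_request_immediate_flush() only leaves a hint on the active transaction; journal_end_transaction() (changed later in this patch) checks tr->flush_on_completion and passes force_it = 1 to end_transaction(), defeating group commit for that one transaction. A hypothetical caller, inside an open transaction:

    journal_start_transaction(jnl);
    /* ... journal_modify_block_start()/_end() calls ... */
    if (journal_request_immediate_flush(jnl) != 0) {
            /* EPERM: no transaction was active */
    }
    journal_end_transaction(jnl);   /* flushes now instead of group-committing */
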
 
 /*
 ;________________________________________________________________________________
@@ -3238,7 +3393,7 @@ journal_trim_add_extent(journal *jnl, uint64_t offset, uint64_t length)
 ;________________________________________________________________________________
 */
 static int
-trim_remove_extent(struct jnl_trim_list *trim, uint64_t offset, uint64_t length)
+trim_remove_extent(journal *jnl, struct jnl_trim_list *trim, uint64_t offset, uint64_t length)
 {
        u_int64_t end;
        dk_extent_t *extent;
@@ -3285,7 +3440,7 @@ trim_remove_extent(struct jnl_trim_list *trim, uint64_t offset, uint64_t length)
        if (keep_before >  keep_after) {
                /* If the list was already full, we need to grow it. */
                if (trim->extent_count == trim->allocated_count) {
-                       if (trim_realloc(trim) != 0) {
+                       if (trim_realloc(jnl, trim) != 0) {
                                printf("jnl: trim_remove_extent: out of memory!");
                                return ENOMEM;
                        }
@@ -3381,7 +3536,7 @@ journal_trim_remove_extent(journal *jnl, uint64_t offset, uint64_t length)
        CHECK_TRANSACTION(tr);
 
        if (jnl_kdebug)
-               KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_REMOVE | DBG_FUNC_START, jnl, offset, length, tr->trim.extent_count, 0);
+               KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_REMOVE | DBG_FUNC_START, VM_KERNEL_ADDRPERM(jnl), offset, length, tr->trim.extent_count, 0);
 
        if (jnl->owner != current_thread()) {
                panic("jnl: trim_remove_extent: called w/out a transaction! jnl %p, owner %p, curact %p\n",
@@ -3390,7 +3545,7 @@ journal_trim_remove_extent(journal *jnl, uint64_t offset, uint64_t length)
 
        free_old_stuff(jnl);
                
-       error = trim_remove_extent(&tr->trim, offset, length);
+       error = trim_remove_extent(jnl, &tr->trim, offset, length);
        if (error == 0) {
                int found = FALSE;
                
@@ -3400,7 +3555,7 @@ journal_trim_remove_extent(journal *jnl, uint64_t offset, uint64_t length)
                 */
                lck_rw_lock_shared(&jnl->trim_lock);
                if (jnl->async_trim != NULL)
-                       found = trim_search_extent(jnl->async_trim, offset, length);
+                       found = trim_search_extent(jnl->async_trim, offset, length, NULL, NULL);
                lck_rw_unlock_shared(&jnl->trim_lock);
                
                if (found) {
@@ -3413,10 +3568,10 @@ journal_trim_remove_extent(journal *jnl, uint64_t offset, uint64_t length)
                        uint32_t async_extent_count = 0;
                        
                        if (jnl_kdebug)
-                               KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_REMOVE_PENDING | DBG_FUNC_START, jnl, offset, length, 0, 0);
+                               KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_REMOVE_PENDING | DBG_FUNC_START, VM_KERNEL_ADDRPERM(jnl), offset, length, 0, 0);
                        lck_rw_lock_exclusive(&jnl->trim_lock);
                        if (jnl->async_trim != NULL) {
-                               error = trim_remove_extent(jnl->async_trim, offset, length);
+                               error = trim_remove_extent(jnl, jnl->async_trim, offset, length);
                                async_extent_count = jnl->async_trim->extent_count;
                        }
                        lck_rw_unlock_exclusive(&jnl->trim_lock);
@@ -3435,10 +3590,22 @@ static int
 journal_trim_flush(journal *jnl, transaction *tr)
 {
        int errno = 0;
+       boolean_t was_vm_privileged;
        
        if (jnl_kdebug)
-               KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_FLUSH | DBG_FUNC_START, jnl, tr, 0, tr->trim.extent_count, 0);
+               KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_FLUSH | DBG_FUNC_START, VM_KERNEL_ADDRPERM(jnl), tr, 0, tr->trim.extent_count, 0);
 
+       if (jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT) {
+               /*
+                * the disk driver can allocate memory on this path...
+                * if we block waiting for memory, and there is enough pressure to
+                * cause us to try and create a new swap file, we may end up deadlocking
+                * due to waiting for the journal on the swap file creation path...
+                * by making ourselves vm_privileged, we give ourselves the best chance
+                * of not blocking
+                */
+               was_vm_privileged = set_vm_privilege(TRUE);
+       }
        lck_rw_lock_shared(&jnl->trim_lock);
        if (tr->trim.extent_count > 0) {
                dk_unmap_t unmap;
@@ -3448,16 +3615,10 @@ journal_trim_flush(journal *jnl, transaction *tr)
                        unmap.extents = tr->trim.extents;
                        unmap.extentsCount = tr->trim.extent_count;
                        if (jnl_kdebug)
-                               KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_UNMAP | DBG_FUNC_START, jnl, tr, 0, tr->trim.extent_count, 0);
+                               KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_UNMAP | DBG_FUNC_START, VM_KERNEL_ADDRPERM(jnl), tr, 0, tr->trim.extent_count, 0);
                        errno = VNOP_IOCTL(jnl->fsdev, DKIOCUNMAP, (caddr_t)&unmap, FWRITE, vfs_context_kernel());
                        if (jnl_kdebug)
                                KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_UNMAP | DBG_FUNC_END, errno, 0, 0, 0, 0);
-                       if (errno) {
-                               printf("jnl: error %d from DKIOCUNMAP (extents=%lx, count=%u); disabling trim for %s\n",
-                                          errno, (unsigned long) (unmap.extents), unmap.extentsCount,
-                                          jnl->jdev_name);
-                               jnl->flags &= ~JOURNAL_USE_UNMAP;
-                       }
                }
                
                /*
@@ -3474,6 +3635,8 @@ journal_trim_flush(journal *jnl, transaction *tr)
        }
        lck_rw_unlock_shared(&jnl->trim_lock);
 
+       if ((jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT) && (was_vm_privileged == FALSE))
+               set_vm_privilege(FALSE);
        /*
         * If the transaction we're flushing was the async transaction, then
         * tell the current transaction that there is no pending trim
@@ -3582,7 +3745,7 @@ end_transaction(transaction *tr, int force_it, errno_t (*callback)(void*), void
                jnl->cur_tr = tr;
                goto done;
        }
-
+       
     // if our transaction buffer isn't very full, just hang
     // on to it and don't actually flush anything.  this is
     // what is known as "group commit".  we will flush the
@@ -3811,8 +3974,7 @@ end_transaction(transaction *tr, int force_it, errno_t (*callback)(void*), void
                must_wait = TRUE;
 
        if (drop_lock_early == TRUE) {
-               jnl->owner = NULL;
-               unlock_journal(jnl);
+               journal_unlock(jnl);
                drop_lock = FALSE;
        }
        if (must_wait == TRUE)
@@ -3830,8 +3992,7 @@ end_transaction(transaction *tr, int force_it, errno_t (*callback)(void*), void
        KERNEL_DEBUG(0xbbbbc018|DBG_FUNC_END, jnl, tr, ret_val, 0, 0);
 done:
        if (drop_lock == TRUE) {
-               jnl->owner = NULL;
-               unlock_journal(jnl);
+               journal_unlock(jnl);
        }
        return (ret_val);
 }
@@ -3840,7 +4001,9 @@ done:
 static void
 finish_end_thread(transaction *tr)
 {
-       proc_apply_thread_selfdiskacc(IOPOL_PASSIVE);
+       proc_set_task_policy(current_task(), current_thread(),
+                            TASK_POLICY_INTERNAL, TASK_POLICY_IOPOL, IOPOL_PASSIVE);
+
        finish_end_transaction(tr, NULL, NULL);
 
        thread_deallocate(current_thread());
@@ -3850,7 +4013,8 @@ finish_end_thread(transaction *tr)
 static void
 write_header_thread(journal *jnl)
 {
-       proc_apply_thread_selfdiskacc(IOPOL_PASSIVE);
+       proc_set_task_policy(current_task(), current_thread(),
+                            TASK_POLICY_INTERNAL, TASK_POLICY_IOPOL, IOPOL_PASSIVE);
 
        if (write_journal_header(jnl, 1, jnl->saved_sequence_num))
                jnl->write_header_failed = TRUE;
@@ -3881,6 +4045,7 @@ finish_end_transaction(transaction *tr, errno_t (*callback)(void*), void *callba
        end  = jnl->jhdr->end;
 
        for (blhdr = tr->blhdr; blhdr; blhdr = (block_list_header *)((long)blhdr->binfo[0].bnum)) {
+               boolean_t was_vm_privileged;
 
                amt = blhdr->bytes_used;
 
@@ -3889,9 +4054,22 @@ finish_end_transaction(transaction *tr, errno_t (*callback)(void*), void *callba
                blhdr->checksum = 0;
                blhdr->checksum = calc_checksum((char *)blhdr, BLHDR_CHECKSUM_SIZE);
 
+               if (jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT) {
+                       /*
+                        * if we block waiting for memory, and there is enough pressure to
+                        * cause us to try and create a new swap file, we may end up deadlocking
+                        * due to waiting for the journal on the swap file creation path...
+                        * by making ourselves vm_privileged, we give ourselves the best chance
+                        * of not blocking
+                        */
+                       was_vm_privileged = set_vm_privilege(TRUE);
+               }
                if (kmem_alloc(kernel_map, (vm_offset_t *)&bparray, blhdr->num_blocks * sizeof(struct buf *))) {
                        panic("can't allocate %zd bytes for bparray\n", blhdr->num_blocks * sizeof(struct buf *));
                }
+               if ((jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT) && (was_vm_privileged == FALSE))
+                       set_vm_privilege(FALSE);
+
                tbuffer_offset = jnl->jhdr->blhdr_size;
 
                for (i = 1; i < blhdr->num_blocks; i++) {
@@ -4122,13 +4300,13 @@ bad_journal:
                jnl->flush_aborted = TRUE;
 
                unlock_condition(jnl, &jnl->flushing);
-               lock_journal(jnl);
+               journal_lock(jnl);
 
                jnl->flags |= JOURNAL_INVALID;
                jnl->old_start[sizeof(jnl->old_start)/sizeof(jnl->old_start[0]) - 1] &= ~0x8000000000000000LL;
                abort_transaction(jnl, tr);             // cleans up list of extents to be trimmed
 
-               unlock_journal(jnl);
+               journal_unlock(jnl);
        } else
                unlock_condition(jnl, &jnl->flushing);
 
@@ -4332,8 +4510,7 @@ journal_end_transaction(journal *jnl)
 
                        abort_transaction(jnl, tr);
                }
-               jnl->owner = NULL;
-               unlock_journal(jnl);
+               journal_unlock(jnl);
 
                return EINVAL;
        }
@@ -4347,7 +4524,19 @@ journal_end_transaction(journal *jnl)
        // called from end_transaction().
        // 
        jnl->active_tr = NULL;
-       ret = end_transaction(tr, 0, NULL, NULL, TRUE, FALSE);
+       
+       /* Examine the force-journal-flush state in the active txn */
+       if (tr->flush_on_completion == TRUE) {
+               /*
+                * If the FS requested it, disallow group commit and force the
+                * transaction out to disk immediately.
+                */
+               ret = end_transaction(tr, 1, NULL, NULL, TRUE, TRUE);
+       }
+       else {
+               /* in the common path we can simply use the double-buffered journal */
+               ret = end_transaction(tr, 0, NULL, NULL, TRUE, FALSE);
+       }
 
        return ret;
 }
@@ -4395,7 +4584,7 @@ journal_flush(journal *jnl, boolean_t wait_for_IO)
        KERNEL_DEBUG(DBG_JOURNAL_FLUSH | DBG_FUNC_START, jnl, 0, 0, 0, 0);
 
        if (jnl->owner != current_thread()) {
-               lock_journal(jnl);
+               journal_lock(jnl);
                drop_lock = TRUE;
        }
 
@@ -4419,7 +4608,7 @@ journal_flush(journal *jnl, boolean_t wait_for_IO)
 
        } else  { 
                if (drop_lock == TRUE) {
-                       unlock_journal(jnl);
+                       journal_unlock(jnl);
                }
 
                /* Because of pipelined journal, the journal transactions 
@@ -4640,7 +4829,8 @@ journal_create(__unused struct vnode *jvp,
               __unused int32_t       flags,
               __unused int32_t       tbuffer_size,
               __unused void        (*flush)(void *arg),
-              __unused void         *arg)
+              __unused void         *arg,
+              __unused struct mount *fsmount)
 {
     return NULL;
 }
@@ -4654,7 +4844,8 @@ journal_open(__unused struct vnode *jvp,
             __unused int32_t       flags,
             __unused int32_t       tbuffer_size,
             __unused void        (*flush)(void *arg),
-            __unused void         *arg)
+            __unused void         *arg,
+            __unused struct mount *fsmount)
 {
        return NULL;
 }
@@ -4730,4 +4921,47 @@ journal_owner(__unused journal *jnl)
 {
        return NULL;
 }
+
+void 
+journal_lock(__unused journal *jnl) 
+{
+       return;
+}
+
+void 
+journal_unlock(__unused journal *jnl)
+{
+       return;
+}
+
+__private_extern__ int
+journal_trim_add_extent(__unused journal *jnl, 
+                       __unused uint64_t offset, 
+                       __unused uint64_t length)
+{
+       return 0;
+}
+
+int
+journal_request_immediate_flush(__unused journal *jnl) 
+{
+       return 0;
+}
+
+__private_extern__ int
+journal_trim_remove_extent(__unused journal *jnl, 
+                          __unused uint64_t offset, 
+                          __unused uint64_t length)
+{
+       return 0;
+}
+
+int journal_trim_extent_overlap(__unused journal *jnl, 
+                               __unused uint64_t offset, 
+                               __unused uint64_t length, 
+                               __unused uint64_t *end) 
+{
+       return 0;
+}
+
 #endif  // !JOURNALING