]> git.saurik.com Git - apple/xnu.git/blobdiff - bsd/vfs/vfs_journal.c
xnu-2422.115.4.tar.gz
[apple/xnu.git] / bsd / vfs / vfs_journal.c
index 4999f814bbba938baf0984b8835e3db93f09598e..d385566923ef5a5ba5971bf9e804eaf7b5722577 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2002-2011 Apple Inc. All rights reserved.
+ * Copyright (c) 2002-2012 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -115,10 +115,12 @@ SYSCTL_INT(_vfs_generic_jnl_kdebug, OID_AUTO, trim, CTLFLAG_RW|CTLFLAG_LOCKED, &
 #define KERNEL_DEBUG KERNEL_DEBUG_CONSTANT
 #endif
 
+
 #ifndef CONFIG_HFS_TRIM
 #define CONFIG_HFS_TRIM 0
 #endif
 
+
 #if JOURNALING
 
 //
@@ -136,8 +138,7 @@ enum {
 unsigned int jnl_trim_flush_limit = JOURNAL_FLUSH_TRIM_EXTENTS;
 SYSCTL_UINT (_kern, OID_AUTO, jnl_trim_flush, CTLFLAG_RW, &jnl_trim_flush_limit, 0, "number of trimmed extents to cause a journal flush");
 
-
-/* XXX next prototytype should be from libsa/stdlib.h> but conflicts libkern */
+/* XXX next prototype should be from <libsa/stdlib.h> but conflicts with libkern */
 __private_extern__ void qsort(
        void * array,
        size_t nmembers,
@@ -247,10 +248,11 @@ static int insert_block(journal *jnl, struct bucket **buf_ptr, int blk_index, of
 // we use it to checksum the journal header and the block list
 // headers that are at the start of each transaction.
 //
-static int
+static unsigned int
 calc_checksum(char *ptr, int len)
 {
-       int i, cksum=0;
+       int i;
+       unsigned int cksum=0;
 
        // this is a lame checksum but for now it'll do
        for(i = 0; i < len; i++, ptr++) {
@@ -1090,7 +1092,8 @@ add_block(journal *jnl, struct bucket **buf_ptr, off_t block_num, size_t size, _
 static int
 replay_journal(journal *jnl)
 {
-       int             i, orig_checksum, checksum, check_block_checksums=0, bad_blocks=0;
+       int             i, bad_blocks=0;
+       unsigned int    orig_checksum, checksum, check_block_checksums = 0;
        size_t          ret;
        size_t          max_bsize = 0;          /* protected by block_ptr */
        block_list_header *blhdr;
@@ -1099,6 +1102,7 @@ replay_journal(journal *jnl)
        struct bucket   *co_buf;
        int             num_buckets = STARTING_BUCKETS, num_full, check_past_jnl_end = 1, in_uncharted_territory=0;
        uint32_t        last_sequence_num = 0;
+       int             replay_retry_count = 0;
     
        // wrap the start ptr if it points to the very end of the journal
        if (jnl->jhdr->start == jnl->jhdr->size) {
@@ -1155,7 +1159,7 @@ restart_replay:
                if (jnl->flags & JOURNAL_NEED_SWAP) {
                        // calculate the checksum based on the unswapped data
                        // because it is done byte-at-a-time.
-                       orig_checksum = SWAP32(orig_checksum);
+                       orig_checksum = (unsigned int)SWAP32(orig_checksum);
                        checksum = calc_checksum((char *)blhdr, BLHDR_CHECKSUM_SIZE);
                        swap_block_list_header(jnl, blhdr);
                } else {
@@ -1336,11 +1340,25 @@ restart_replay:
                
 bad_txn_handling:
                if (bad_blocks) {
+                       /* Journal replay got error before it found any valid 
+                        *  transactions, abort replay */
                        if (txn_start_offset == 0) {
                                printf("jnl: %s: no known good txn start offset! aborting journal replay.\n", jnl->jdev_name);
                                goto bad_replay;
                        }
 
+                       /* Repeated error during journal replay, abort replay */
+                       if (replay_retry_count == 3) {
+                               printf("jnl: %s: repeated errors replaying journal! aborting journal replay.\n", jnl->jdev_name);
+                               goto bad_replay;
+                       }
+                       replay_retry_count++;
+
+                       /* There was an error replaying the journal (possibly 
+                        * EIO/ENXIO from the device).  So retry replaying all 
+                        * the good transactions that we found before getting 
+                        * the error.  
+                        */
                        jnl->jhdr->start = orig_jnl_start;
                        jnl->jhdr->end = txn_start_offset;
                        check_past_jnl_end = 0;
@@ -1530,9 +1548,10 @@ get_io_info(struct vnode *devvp, size_t phys_blksz, journal *jnl, struct vfs_con
 
        if (VNOP_IOCTL(devvp, DKIOCGETFEATURES, (caddr_t)&features, 0, context) == 0) {
                if (features & DK_FEATURE_FORCE_UNIT_ACCESS) {
-                       const char *name = vnode_name(devvp);
+                       const char *name = vnode_getname_printable(devvp);
                        jnl->flags |= JOURNAL_DO_FUA_WRITES;
-                       printf("jnl: %s: enabling FUA writes (features 0x%x)\n", name ? name : "no-name-dev", features);
+                       printf("jnl: %s: enabling FUA writes (features 0x%x)\n", name, features);
+                       vnode_putname_printable(name);
                }
                if (features & DK_FEATURE_UNMAP) {
                        jnl->flags |= JOURNAL_USE_UNMAP;
@@ -1600,23 +1619,6 @@ get_io_info(struct vnode *devvp, size_t phys_blksz, journal *jnl, struct vfs_con
 }
 
 
-static const char *
-get_jdev_name(struct vnode *jvp)
-{
-       const char *jdev_name;
-    
-       jdev_name = vnode_name(jvp);
-       if (jdev_name == NULL) {
-               jdev_name = vfs_addname("unknown-dev", strlen("unknown-dev"), 0, 0);
-       } else {
-               // this just bumps the refcount on the name so we have our own copy
-               jdev_name = vfs_addname(jdev_name, strlen(jdev_name), 0, 0);
-       }
-
-       return jdev_name;
-}
-
-
 journal *
 journal_create(struct vnode *jvp,
                           off_t         offset,
@@ -1626,7 +1628,8 @@ journal_create(struct vnode *jvp,
                           int32_t       flags,
                           int32_t       tbuffer_size,
                           void        (*flush)(void *arg),
-                          void         *arg)
+                          void         *arg,
+                          struct mount *fsmount)
 {
        journal         *jnl;
        uint32_t        phys_blksz, new_txn_base;
@@ -1642,36 +1645,36 @@ journal_create(struct vnode *jvp,
        context.vc_thread = current_thread();
        context.vc_ucred = FSCRED;
 
-       jdev_name = get_jdev_name(jvp);
+       jdev_name = vnode_getname_printable(jvp);
 
        /* Get the real physical block size. */
        if (VNOP_IOCTL(jvp, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz, 0, &context)) {
-               return NULL;
+               goto cleanup_jdev_name;
        }
 
        if (journal_size < (256*1024) || journal_size > (MAX_JOURNAL_SIZE)) {
-               printf("jnl: create: journal size %lld looks bogus.\n", journal_size);
-               return NULL;
+               printf("jnl: %s: create: journal size %lld looks bogus.\n", jdev_name, journal_size);
+               goto cleanup_jdev_name;
        }
 
        min_size = phys_blksz * (phys_blksz / sizeof(block_info));
        /* Reject journals that are too small given the sector size of the device */
        if (journal_size < min_size) {
-               printf("jnl: create: journal size (%lld) too small given sector size of (%u)\n", 
-                               journal_size, phys_blksz);
-               return NULL;
+               printf("jnl: %s: create: journal size (%lld) too small given sector size of (%u)\n", 
+                               jdev_name, journal_size, phys_blksz);
+               goto cleanup_jdev_name;
        }
 
        if (phys_blksz > min_fs_blksz) {
                printf("jnl: %s: create: error: phys blksize %u bigger than min fs blksize %zd\n",
                       jdev_name, phys_blksz, min_fs_blksz);
-               return NULL;
+               goto cleanup_jdev_name;
        }
 
        if ((journal_size % phys_blksz) != 0) {
                printf("jnl: %s: create: journal size 0x%llx is not an even multiple of block size 0x%ux\n",
                       jdev_name, journal_size, phys_blksz);
-               return NULL;
+               goto cleanup_jdev_name;
        }
 
 
@@ -1687,6 +1690,12 @@ journal_create(struct vnode *jvp,
        jnl->jdev_name    = jdev_name;
        lck_mtx_init(&jnl->old_start_lock, jnl_mutex_group, jnl_lock_attr);
 
+       // Keep a pointer to the mount around for use in IO throttling.
+       jnl->fsmount      = fsmount;
+       // XXX: This lock discipline looks correct based on dounmount(), but it
+       // doesn't seem to be documented anywhere.
+       mount_ref(fsmount, 0);
+
        get_io_info(jvp, phys_blksz, jnl, &context);
        
        if (kmem_alloc_kobject(kernel_map, (vm_offset_t *)&jnl->header_buf, phys_blksz)) {
@@ -1722,7 +1731,7 @@ journal_create(struct vnode *jvp,
               && jnl->jhdr->sequence_num != 0) {
 
                new_txn_base = (jnl->jhdr->sequence_num + (journal_size / phys_blksz) + (random() % 16384)) & 0x00ffffff;
-               printf("jnl: create: avoiding old sequence number 0x%x (0x%x)\n", jnl->jhdr->sequence_num, new_txn_base);
+               printf("jnl: %s: create: avoiding old sequence number 0x%x (0x%x)\n", jdev_name, jnl->jhdr->sequence_num, new_txn_base);
 
 #if 0
                int i;
@@ -1763,7 +1772,8 @@ journal_create(struct vnode *jvp,
        lck_mtx_init(&jnl->jlock, jnl_mutex_group, jnl_lock_attr);
        lck_mtx_init(&jnl->flock, jnl_mutex_group, jnl_lock_attr);
        lck_rw_init(&jnl->trim_lock, jnl_mutex_group, jnl_lock_attr);
-       
+
+
        jnl->flushing = FALSE;
        jnl->asyncIO = FALSE;
        jnl->flush_aborted = FALSE;
@@ -1776,19 +1786,20 @@ journal_create(struct vnode *jvp,
                goto bad_write;
        }
 
-       return jnl;
+       goto journal_create_complete;
 
 
 bad_write:
        kmem_free(kernel_map, (vm_offset_t)jnl->header_buf, phys_blksz);
 bad_kmem_alloc:
-       if (jdev_name) {
-               vfs_removename(jdev_name);
-       }
        jnl->jhdr = NULL;
        FREE_ZONE(jnl, sizeof(struct journal), M_JNL_JNL);
-
-       return NULL;
+       mount_drop(fsmount, 0);
+cleanup_jdev_name:
+       vnode_putname_printable(jdev_name);
+       jnl = NULL;
+journal_create_complete:
+       return jnl;
 }
 
 
@@ -1801,7 +1812,8 @@ journal_open(struct vnode *jvp,
                         int32_t       flags,
                         int32_t       tbuffer_size,
                         void        (*flush)(void *arg),
-                        void         *arg)
+                        void         *arg,
+                        struct mount *fsmount)
 {
        journal         *jnl;
        uint32_t        orig_blksz=0;
@@ -1809,39 +1821,39 @@ journal_open(struct vnode *jvp,
        u_int32_t       min_size = 0;
        int             orig_checksum, checksum;
        struct vfs_context context;
-       const char      *jdev_name = get_jdev_name(jvp);
+       const char      *jdev_name = vnode_getname_printable(jvp);
 
        context.vc_thread = current_thread();
        context.vc_ucred = FSCRED;
 
        /* Get the real physical block size. */
        if (VNOP_IOCTL(jvp, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz, 0, &context)) {
-               return NULL;
+               goto cleanup_jdev_name;
        }
 
        if (phys_blksz > min_fs_blksz) {
                printf("jnl: %s: open: error: phys blksize %u bigger than min fs blksize %zd\n",
                       jdev_name, phys_blksz, min_fs_blksz);
-               return NULL;
+               goto cleanup_jdev_name;
        }
 
        if (journal_size < (256*1024) || journal_size > (1024*1024*1024)) {
-               printf("jnl: open: journal size %lld looks bogus.\n", journal_size);
-               return NULL;
+               printf("jnl: %s: open: journal size %lld looks bogus.\n", jdev_name, journal_size);
+               goto cleanup_jdev_name;
        }
 
        min_size = phys_blksz * (phys_blksz / sizeof(block_info));
        /* Reject journals that are too small given the sector size of the device */
        if (journal_size < min_size) {
-               printf("jnl: open: journal size (%lld) too small given sector size of (%u)\n", 
-                               journal_size, phys_blksz);
-               return NULL;
+               printf("jnl: %s: open: journal size (%lld) too small given sector size of (%u)\n", 
+                               jdev_name, journal_size, phys_blksz);
+               goto cleanup_jdev_name;
        }
     
        if ((journal_size % phys_blksz) != 0) {
                printf("jnl: %s: open: journal size 0x%llx is not an even multiple of block size 0x%x\n",
                       jdev_name, journal_size, phys_blksz);
-               return NULL;
+               goto cleanup_jdev_name;
        }
 
        MALLOC_ZONE(jnl, struct journal *, sizeof(struct journal), M_JNL_JNL, M_WAITOK);
@@ -1856,6 +1868,12 @@ journal_open(struct vnode *jvp,
        jnl->jdev_name    = jdev_name;
        lck_mtx_init(&jnl->old_start_lock, jnl_mutex_group, jnl_lock_attr);
 
+       /* We need a reference to the mount to later pass to the throttling code for
+        * IO accounting.
+        */
+       jnl->fsmount      = fsmount;
+       mount_ref(fsmount, 0);
+
        get_io_info(jvp, phys_blksz, jnl, &context);
 
        if (kmem_alloc_kobject(kernel_map, (vm_offset_t *)&jnl->header_buf, phys_blksz)) {
@@ -1911,26 +1929,24 @@ journal_open(struct vnode *jvp,
                jnl->jhdr->magic = JOURNAL_HEADER_MAGIC;
        }
 
-    if (phys_blksz != (size_t)jnl->jhdr->jhdr_size && jnl->jhdr->jhdr_size != 0) {
-       /*
-        * The volume has probably been resized (such that we had to adjust the
-        * logical sector size), or copied to media with a different logical
-        * sector size.
-        *
-        * Temporarily change the device's logical block size to match the
-        * journal's header size.  This will allow us to replay the journal
-        * safely.  If the replay succeeds, we will update the journal's header
-        * size (later in this function).
-        */
-
-       orig_blksz = phys_blksz;
-       phys_blksz = jnl->jhdr->jhdr_size;
-       VNOP_IOCTL(jvp, DKIOCSETBLOCKSIZE, (caddr_t)&phys_blksz, FWRITE, &context);
+       if (phys_blksz != (size_t)jnl->jhdr->jhdr_size && jnl->jhdr->jhdr_size != 0) {
+               /*
+                * The volume has probably been resized (such that we had to adjust the
+                * logical sector size), or copied to media with a different logical
+                * sector size.
+                * 
+                * Temporarily change the device's logical block size to match the
+                * journal's header size.  This will allow us to replay the journal
+                * safely.  If the replay succeeds, we will update the journal's header
+                * size (later in this function).
+                */
+               orig_blksz = phys_blksz;
+               phys_blksz = jnl->jhdr->jhdr_size;
+               VNOP_IOCTL(jvp, DKIOCSETBLOCKSIZE, (caddr_t)&phys_blksz, FWRITE, &context);
+               printf("jnl: %s: open: temporarily switched block size from %u to %u\n",
+                          jdev_name, orig_blksz, phys_blksz);
+       }
 
-       printf("jnl: %s: open: temporarily switched block size from %u to %u\n",
-              jdev_name, orig_blksz, phys_blksz);
-    }
-    
        if (   jnl->jhdr->start <= 0
               || jnl->jhdr->start > jnl->jhdr->size
               || jnl->jhdr->start > 1024*1024*1024) {
@@ -1980,68 +1996,73 @@ journal_open(struct vnode *jvp,
                printf("jnl: %s: journal_open: Error replaying the journal!\n", jdev_name);
                goto bad_journal;
        }
-
-    /*
-     * When we get here, we know that the journal is empty (jnl->jhdr->start ==
-     * jnl->jhdr->end).  If the device's logical block size was different from
-     * the journal's header size, then we can now restore the device's logical
-     * block size and update the journal's header size to match.
-     *
-     * Note that we also adjust the journal's start and end so that they will
-     * be aligned on the new block size.  We pick a new sequence number to
-     * avoid any problems if a replay found previous transactions using the old
-     * journal header size.  (See the comments in journal_create(), above.)
-     */
-    if (orig_blksz != 0) {
-       VNOP_IOCTL(jvp, DKIOCSETBLOCKSIZE, (caddr_t)&orig_blksz, FWRITE, &context);
-       phys_blksz = orig_blksz;
-       orig_blksz = 0;
-       printf("jnl: %s: open: restored block size to %u\n", jdev_name, phys_blksz);
        
-       jnl->jhdr->jhdr_size = phys_blksz;
-       jnl->jhdr->start = phys_blksz;
-       jnl->jhdr->end = phys_blksz;
-       jnl->jhdr->sequence_num = (jnl->jhdr->sequence_num +
-                                  (journal_size / phys_blksz) +
-                                  (random() % 16384)) & 0x00ffffff;
+       /*
+        * When we get here, we know that the journal is empty (jnl->jhdr->start ==
+        * jnl->jhdr->end).  If the device's logical block size was different from
+        * the journal's header size, then we can now restore the device's logical
+        * block size and update the journal's header size to match.
+        *
+        * Note that we also adjust the journal's start and end so that they will
+        * be aligned on the new block size.  We pick a new sequence number to
+        * avoid any problems if a replay found previous transactions using the old
+        * journal header size.  (See the comments in journal_create(), above.)
+        */
        
-       if (write_journal_header(jnl, 1, jnl->jhdr->sequence_num)) {
-               printf("jnl: %s: open: failed to update journal header size\n", jdev_name);
+       if (orig_blksz != 0) {
+               VNOP_IOCTL(jvp, DKIOCSETBLOCKSIZE, (caddr_t)&orig_blksz, FWRITE, &context);
+               phys_blksz = orig_blksz;
+               
+               orig_blksz = 0;
+               
+               jnl->jhdr->jhdr_size = phys_blksz;
+               jnl->jhdr->start = phys_blksz;
+               jnl->jhdr->end = phys_blksz;
+               jnl->jhdr->sequence_num = (jnl->jhdr->sequence_num +
+                                                                  (journal_size / phys_blksz) +
+                                                                  (random() % 16384)) & 0x00ffffff;
+               
+               if (write_journal_header(jnl, 1, jnl->jhdr->sequence_num)) {
+                       printf("jnl: %s: open: failed to update journal header size\n", jdev_name);
+                       goto bad_journal;
+               }
+       }
+
+       // make sure this is in sync!
+       jnl->active_start = jnl->jhdr->start;
+       jnl->sequence_num = jnl->jhdr->sequence_num;
+
+       // set this now, after we've replayed the journal
+       size_up_tbuffer(jnl, tbuffer_size, phys_blksz);
+
+       // TODO: Does this need to change if the device's logical block size changed?
+       if ((off_t)(jnl->jhdr->blhdr_size/sizeof(block_info)-1) > (jnl->jhdr->size/jnl->jhdr->jhdr_size)) {
+               printf("jnl: %s: open: jhdr size and blhdr size are not compatible (0x%llx, %d, %d)\n", jdev_name, jnl->jhdr->size,
+                      jnl->jhdr->blhdr_size, jnl->jhdr->jhdr_size);
                goto bad_journal;
        }
-    }
-    
-    // make sure this is in sync!
-    jnl->active_start = jnl->jhdr->start;
-    jnl->sequence_num = jnl->jhdr->sequence_num;
-
-    // set this now, after we've replayed the journal
-    size_up_tbuffer(jnl, tbuffer_size, phys_blksz);
-
-    // TODO: Does this need to change if the device's logical block size changed?
-    if ((off_t)(jnl->jhdr->blhdr_size/sizeof(block_info)-1) > (jnl->jhdr->size/jnl->jhdr->jhdr_size)) {
-       printf("jnl: %s: open: jhdr size and blhdr size are not compatible (0x%llx, %d, %d)\n", jdev_name, jnl->jhdr->size,
-          jnl->jhdr->blhdr_size, jnl->jhdr->jhdr_size);
-       goto bad_journal;
-    }
-
-    lck_mtx_init(&jnl->jlock, jnl_mutex_group, jnl_lock_attr);
-
-    return jnl;
-
-  bad_journal:
-    if (orig_blksz != 0) {
-       phys_blksz = orig_blksz;
-       VNOP_IOCTL(jvp, DKIOCSETBLOCKSIZE, (caddr_t)&orig_blksz, FWRITE, &context);
-       printf("jnl: %s: open: restored block size to %u after error\n", jdev_name, orig_blksz);
-    }
-    kmem_free(kernel_map, (vm_offset_t)jnl->header_buf, phys_blksz);
-  bad_kmem_alloc:
-    if (jdev_name) {
-       vfs_removename(jdev_name);
-    }
-    FREE_ZONE(jnl, sizeof(struct journal), M_JNL_JNL);
-    return NULL;    
+
+       lck_mtx_init(&jnl->jlock, jnl_mutex_group, jnl_lock_attr);
+       lck_mtx_init(&jnl->flock, jnl_mutex_group, jnl_lock_attr);
+       lck_rw_init(&jnl->trim_lock, jnl_mutex_group, jnl_lock_attr);
+
+       goto journal_open_complete;
+
+bad_journal:
+       if (orig_blksz != 0) {
+               phys_blksz = orig_blksz;
+               VNOP_IOCTL(jvp, DKIOCSETBLOCKSIZE, (caddr_t)&orig_blksz, FWRITE, &context);
+               printf("jnl: %s: open: restored block size after error\n", jdev_name);
+       }
+       kmem_free(kernel_map, (vm_offset_t)jnl->header_buf, phys_blksz);
+bad_kmem_alloc:
+       FREE_ZONE(jnl, sizeof(struct journal), M_JNL_JNL);
+       mount_drop(fsmount, 0);
+cleanup_jdev_name:
+       vnode_putname_printable(jdev_name);
+       jnl = NULL;
+journal_open_complete:
+       return jnl;    
 }
 
 
@@ -2057,7 +2078,7 @@ journal_is_clean(struct vnode *jvp,
        int             ret;
        int             orig_checksum, checksum;
        struct vfs_context context;
-       const           char *jdev_name = get_jdev_name(jvp);
+       const           char *jdev_name = vnode_getname_printable(jvp);
 
        context.vc_thread = current_thread();
        context.vc_ucred = FSCRED;
@@ -2065,31 +2086,36 @@ journal_is_clean(struct vnode *jvp,
        /* Get the real physical block size. */
        if (VNOP_IOCTL(jvp, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz, 0, &context)) {
                printf("jnl: %s: is_clean: failed to get device block size.\n", jdev_name);
-               return EINVAL;
+               ret = EINVAL;
+               goto cleanup_jdev_name;
        }
 
        if (phys_blksz > (uint32_t)min_fs_block_size) {
                printf("jnl: %s: is_clean: error: phys blksize %d bigger than min fs blksize %zd\n",
                       jdev_name, phys_blksz, min_fs_block_size);
-               return EINVAL;
+               ret = EINVAL;
+               goto cleanup_jdev_name;
        }
 
        if (journal_size < (256*1024) || journal_size > (MAX_JOURNAL_SIZE)) {
-               printf("jnl: is_clean: journal size %lld looks bogus.\n", journal_size);
-               return EINVAL;
+               printf("jnl: %s: is_clean: journal size %lld looks bogus.\n", jdev_name, journal_size);
+               ret = EINVAL;
+               goto cleanup_jdev_name;
        }
     
        if ((journal_size % phys_blksz) != 0) {
                printf("jnl: %s: is_clean: journal size 0x%llx is not an even multiple of block size 0x%x\n",
                       jdev_name, journal_size, phys_blksz);
-               return EINVAL;
+               ret = EINVAL;
+               goto cleanup_jdev_name;
        }
 
        memset(&jnl, 0, sizeof(jnl));
 
        if (kmem_alloc_kobject(kernel_map, (vm_offset_t *)&jnl.header_buf, phys_blksz)) {
                printf("jnl: %s: is_clean: could not allocate space for header buffer (%d bytes)\n", jdev_name, phys_blksz);
-               return ENOMEM;
+               ret = ENOMEM;
+               goto cleanup_jdev_name;
        }
        jnl.header_buf_size = phys_blksz;
 
@@ -2150,12 +2176,9 @@ journal_is_clean(struct vnode *jvp,
 
 get_out:
        kmem_free(kernel_map, (vm_offset_t)jnl.header_buf, phys_blksz);
-       if (jdev_name) {
-               vfs_removename(jdev_name);
-       }
-    
-       return ret;    
-
+cleanup_jdev_name:
+       vnode_putname_printable(jdev_name);
+       return ret;
 }
 
 
@@ -2257,10 +2280,16 @@ journal_close(journal *jnl)
        kmem_free(kernel_map, (vm_offset_t)jnl->header_buf, jnl->header_buf_size);
        jnl->jhdr = (void *)0xbeefbabe;
 
-       if (jnl->jdev_name) {
-               vfs_removename(jnl->jdev_name);
-       }
+       // Release reference on the mount
+       if (jnl->fsmount)
+                mount_drop(jnl->fsmount, 0);
+
+       vnode_putname_printable(jnl->jdev_name);
 
+       unlock_journal(jnl);
+       lck_mtx_destroy(&jnl->old_start_lock, jnl_mutex_group);
+       lck_mtx_destroy(&jnl->jlock, jnl_mutex_group);
+       lck_mtx_destroy(&jnl->flock, jnl_mutex_group);
        FREE_ZONE(jnl, sizeof(struct journal), M_JNL_JNL);
 }
 
@@ -2351,7 +2380,7 @@ check_free_space(journal *jnl, int desired_size, boolean_t *delayed_header_write
 
                        lcl_counter = 0;
                        while (jnl->old_start[i] & 0x8000000000000000LL) {
-                               if (lcl_counter++ > 1000) {
+                               if (lcl_counter++ > 10000) {
                                        panic("jnl: check_free_space: tr starting @ 0x%llx not flushing (jnl %p).\n",
                                              jnl->old_start[i], jnl);
                                }
@@ -2829,6 +2858,8 @@ journal_modify_block_end(journal *jnl, struct buf *bp, void (*func)(buf_t bp, vo
                blhdr->binfo[i].bnum = (off_t)(buf_blkno(bp));
                blhdr->binfo[i].u.bp = bp;
 
+               KERNEL_DEBUG_CONSTANT(0x3018004, vp, blhdr->binfo[i].bnum, bsize, 0, 0);
+
                if (func) {
                        void (*old_func)(buf_t, void *)=NULL, *old_arg=NULL;
                        
@@ -2922,7 +2953,6 @@ journal_kill_block(journal *jnl, struct buf *bp)
        return 0;
 }
 
-
 /*
 ;________________________________________________________________________________
 ;
@@ -3016,44 +3046,53 @@ trim_realloc(struct jnl_trim_list *trim)
        return 0;
 }
 
-
 /*
-;________________________________________________________________________________
-;
-; Routine:             trim_search_extent
-;
-; Function:            Search the given extent list to see if any of its extents
-;                              overlap the given extent.
-;
-; Input Arguments:
-;      trim            - The trim list to be searched.
-;      offset          - The first byte of the range to be searched for.
-;      length          - The number of bytes of the extent being searched for.
-;
-; Output:
-;      (result)        - TRUE if one or more extents overlap, FALSE otherwise.
-;________________________________________________________________________________
-*/
+ ;________________________________________________________________________________
+ ;
+ ; Routine:            trim_search_extent
+ ;
+ ; Function:           Search the given extent list to see if any of its extents
+ ;                             overlap the given extent.
+ ;
+ ; Input Arguments:
+ ;     trim            - The trim list to be searched.
+ ;     offset          - The first byte of the range to be searched for.
+ ;     length          - The number of bytes of the extent being searched for.
+ ;  overlap_start - (out, may be NULL) start of the overlapping extent found
+ ;  overlap_len   - (out, may be NULL) length of the overlapping extent found
+ ;
+ ; Output:
+ ;     (result)        - TRUE if one or more extents overlap, FALSE otherwise.
+ ;________________________________________________________________________________
+ */
 static int
-trim_search_extent(struct jnl_trim_list *trim, uint64_t offset, uint64_t length)
+trim_search_extent(struct jnl_trim_list *trim, uint64_t offset,
+               uint64_t length, uint64_t *overlap_start, uint64_t *overlap_len)
 {
        uint64_t end = offset + length;
        uint32_t lower = 0;                                             /* Lowest index to search */
        uint32_t upper = trim->extent_count;    /* Highest index to search + 1 */
        uint32_t middle;
-       
+
        /* A binary search over the extent list. */
        while (lower < upper) {
                middle = (lower + upper) / 2;
-               
+
                if (trim->extents[middle].offset >= end)
                        upper = middle;
                else if (trim->extents[middle].offset + trim->extents[middle].length <= offset)
                        lower = middle + 1;
-               else
+               else {
+                       if (overlap_start) {
+                               *overlap_start = trim->extents[middle].offset;
+                       }
+                       if (overlap_len) {
+                               *overlap_len = trim->extents[middle].length;
+                       }
                        return TRUE;
+               }
        }
-       
+
        return FALSE;
 }
 
@@ -3092,7 +3131,7 @@ journal_trim_add_extent(journal *jnl, uint64_t offset, uint64_t length)
        dk_extent_t *extent;
        uint32_t insert_index;
        uint32_t replace_count;
-       
+               
        CHECK_JOURNAL(jnl);
 
        /* TODO: Is it OK to manipulate the trim list even if JOURNAL_INVALID is set?  I think so... */
@@ -3112,9 +3151,9 @@ journal_trim_add_extent(journal *jnl, uint64_t offset, uint64_t length)
        }
 
        free_old_stuff(jnl);
-       
+               
        end = offset + length;
-       
+               
        /*
         * Find the range of existing extents that can be combined with the
         * input extent.  We start by counting the number of extents that end
@@ -3132,7 +3171,7 @@ journal_trim_add_extent(journal *jnl, uint64_t offset, uint64_t length)
                ++replace_count;
                ++extent;
        }
-       
+               
        /*
         * If none of the existing extents can be combined with the input extent,
         * then just insert it in the list (before item number insert_index).
@@ -3198,6 +3237,92 @@ journal_trim_add_extent(journal *jnl, uint64_t offset, uint64_t length)
     return 0;
 }
 
+/*
+ * journal_trim_extent_overlap
+ *
+ * Return non-zero if any pending TRIM overlaps the given offset and length, and store in *end
+ * the smaller of the overlapping extent's end and (offset + length).  Otherwise return 0 and leave *end untouched.
+ */
+
+int journal_trim_extent_overlap (journal *jnl, uint64_t offset, uint64_t length, uint64_t *end) {
+       transaction *tr = NULL;
+       int overlap = 0;
+       /* Filled in by trim_search_extent when it finds an overlap; only read below in that case. */
+       uint64_t overlap_start;
+       uint64_t overlap_len;
+       tr = jnl->active_tr;
+       CHECK_TRANSACTION(tr);
+
+       /*
+        * Two trim lists need to be examined for potential overlaps.
+        *
+        * The first belongs to the current transaction.  This function requires
+        * that a transaction be active when it is called, so that transaction is
+        * the "active_tr" pointer in the journal struct; its trim list (tr->trim)
+        * is searched first.
+        */
+       overlap = trim_search_extent (&tr->trim, offset, length, &overlap_start, &overlap_len);
+       if (overlap == 0) {
+               /*
+                * The second is the async trim list, which is searched only if the
+                * active transaction's list did not overlap with our target extent.
+                * This async trim list is the set of all previously committed
+                * transaction groups whose I/Os are now in-flight.  We need to hold the
+                * trim lock in order to search this list.  If we take the lock before the
+                * TRIM has completed, the list is still there and we search it.  If we
+                * take it AFTER the TRIM has completed, the pointer has been set to NULL
+                * and there is nothing to check.
+                */
+               lck_rw_lock_shared (&jnl->trim_lock);
+               if (jnl->async_trim != NULL) {
+                       overlap = trim_search_extent(jnl->async_trim, offset, length, &overlap_start, &overlap_len);
+               }
+               lck_rw_unlock_shared (&jnl->trim_lock);
+       }
+
+       if (overlap) {
+               /* *end = min(end of the overlapping extent, end of the caller's range); end must not be NULL */
+               if ( (overlap_start + overlap_len) < (offset + length)) {
+                       *end = (overlap_start + overlap_len);
+               }
+               else {
+                       *end = (offset + length);
+               }
+       }
+
+
+       return overlap;
+}
+
+/*
+ * journal_request_immediate_flush
+ *
+ * The FS requests that the journal be flushed immediately when the
+ * active transaction completes.  Only a hint flag is set on that transaction here.
+ *
+ * Returns 0 if the hint was recorded on the active transaction
+ * Returns EPERM if there is no active transaction to carry the hint
+ */
+int
+journal_request_immediate_flush (journal *jnl) {
+
+       transaction *tr = NULL;
+       /*
+        * The hint is stored in the active transaction, so this
+        * only works while a transaction is open.
+        */
+       tr = jnl->active_tr;
+       if (tr != NULL) {
+               CHECK_TRANSACTION(tr);
+               tr->flush_on_completion = TRUE;
+       }
+       else {
+               return EPERM;
+       }
+       return 0;
+}
+
+
 
 /*
 ;________________________________________________________________________________
@@ -3331,24 +3456,23 @@ trim_remove_extent(struct jnl_trim_list *trim, uint64_t offset, uint64_t length)
        return 0;
 }
 
-
 /*
-;________________________________________________________________________________
-;
-; Routine:             journal_trim_remove_extent
-;
-; Function:            Make note of a range of bytes, some of which may have previously
-                             been passed to journal_trim_add_extent, is now in use on the
-                             volume.  The given bytes will be not be trimmed as part of
-                             this transaction, or a pending trim of a transaction being
-                             asynchronously flushed.
-;
-; Input Arguments:
-     jnl                     - The journal for the volume containing the byte range.
-     offset          - The first byte of the range to be trimmed.
-     length          - The number of bytes of the extent being trimmed.
-;________________________________________________________________________________
-*/
+ ;________________________________________________________________________________
+ ;
+ ; Routine:            journal_trim_remove_extent
+ ;
+ ; Function:           Make note that a range of bytes, some of which may have previously
+ ;                             been passed to journal_trim_add_extent, is now in use on the
+ ;                             volume.  The given bytes will not be trimmed as part of
+ ;                             this transaction, or as part of a pending trim of a transaction
+ ;                             being asynchronously flushed.
+ ;
+ ; Input Arguments:
+ ;     jnl                     - The journal for the volume containing the byte range.
+ ;     offset          - The first byte of the range that is now in use.
+ ;     length          - The number of bytes in the range that is now in use.
+ ;________________________________________________________________________________
+ */
 __private_extern__ int
 journal_trim_remove_extent(journal *jnl, uint64_t offset, uint64_t length)
 {
@@ -3374,7 +3498,7 @@ journal_trim_remove_extent(journal *jnl, uint64_t offset, uint64_t length)
        }
 
        free_old_stuff(jnl);
-       
+               
        error = trim_remove_extent(&tr->trim, offset, length);
        if (error == 0) {
                int found = FALSE;
@@ -3385,7 +3509,7 @@ journal_trim_remove_extent(journal *jnl, uint64_t offset, uint64_t length)
                 */
                lck_rw_lock_shared(&jnl->trim_lock);
                if (jnl->async_trim != NULL)
-                       found = trim_search_extent(jnl->async_trim, offset, length);
+                       found = trim_search_extent(jnl->async_trim, offset, length, NULL, NULL);
                lck_rw_unlock_shared(&jnl->trim_lock);
                
                if (found) {
@@ -3424,11 +3548,11 @@ journal_trim_flush(journal *jnl, transaction *tr)
        if (jnl_kdebug)
                KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_FLUSH | DBG_FUNC_START, jnl, tr, 0, tr->trim.extent_count, 0);
 
+       lck_rw_lock_shared(&jnl->trim_lock);
        if (tr->trim.extent_count > 0) {
                dk_unmap_t unmap;
                                
                bzero(&unmap, sizeof(unmap));
-               lck_rw_lock_shared(&jnl->trim_lock);
                if (CONFIG_HFS_TRIM && (jnl->flags & JOURNAL_USE_UNMAP)) {
                        unmap.extents = tr->trim.extents;
                        unmap.extentsCount = tr->trim.extent_count;
@@ -3437,14 +3561,8 @@ journal_trim_flush(journal *jnl, transaction *tr)
                        errno = VNOP_IOCTL(jnl->fsdev, DKIOCUNMAP, (caddr_t)&unmap, FWRITE, vfs_context_kernel());
                        if (jnl_kdebug)
                                KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_UNMAP | DBG_FUNC_END, errno, 0, 0, 0, 0);
-                       if (errno) {
-                               printf("jnl: error %d from DKIOCUNMAP (extents=%lx, count=%u); disabling trim for %s\n",
-                                               errno, (unsigned long) (unmap.extents), unmap.extentsCount,
-                                               jnl->jdev_name);
-                               jnl->flags &= ~JOURNAL_USE_UNMAP;
-                       }
                }
-
+               
                /*
                 * Call back into the file system to tell them that we have
                 * trimmed some extents and that they can now be reused.
@@ -3456,9 +3574,8 @@ journal_trim_flush(journal *jnl, transaction *tr)
                 */
                if (jnl->trim_callback)
                        jnl->trim_callback(jnl->trim_callback_arg, tr->trim.extent_count, tr->trim.extents);
-
-               lck_rw_unlock_shared(&jnl->trim_lock);
        }
+       lck_rw_unlock_shared(&jnl->trim_lock);
 
        /*
         * If the transaction we're flushing was the async transaction, then
@@ -3475,6 +3592,11 @@ journal_trim_flush(journal *jnl, transaction *tr)
                jnl->async_trim = NULL;
        lck_rw_unlock_exclusive(&jnl->trim_lock);
 
+       /*
+        * By the time we get here, no other thread can discover the address
+        * of "tr", so it is safe for us to manipulate tr->trim without
+        * holding any locks.
+        */
        if (tr->trim.extents) {                 
                kfree(tr->trim.extents, tr->trim.allocated_count * sizeof(dk_extent_t));
                tr->trim.allocated_count = 0;
@@ -3488,7 +3610,6 @@ journal_trim_flush(journal *jnl, transaction *tr)
        return errno;
 }
 
-
 static int
 journal_binfo_cmp(const void *a, const void *b)
 {
@@ -3564,7 +3685,7 @@ end_transaction(transaction *tr, int force_it, errno_t (*callback)(void*), void
                jnl->cur_tr = tr;
                goto done;
        }
-
+       
     // if our transaction buffer isn't very full, just hang
     // on to it and don't actually flush anything.  this is
     // what is known as "group commit".  we will flush the
@@ -3607,7 +3728,7 @@ end_transaction(transaction *tr, int force_it, errno_t (*callback)(void*), void
                KERNEL_DEBUG(0xbbbbc018|DBG_FUNC_END, jnl, tr, ret_val, 0, 0);
                goto done;
        }
-
+       
        /*
         * Store a pointer to this transaction's trim list so that
         * future transactions can find it.
@@ -3634,7 +3755,7 @@ end_transaction(transaction *tr, int force_it, errno_t (*callback)(void*), void
         * of the journal flush, 'saved_sequence_num' remains stable
         */
        jnl->saved_sequence_num = jnl->sequence_num;
-
+       
        /*
         * if we're here we're going to flush the transaction buffer to disk.
         * 'check_free_space' will not return untl there is enough free
@@ -3822,14 +3943,8 @@ done:
 static void
 finish_end_thread(transaction *tr)
 {
-#if !CONFIG_EMBEDDED
-       proc_apply_thread_selfdiskacc(IOPOL_PASSIVE);
-#else /* !CONFIG_EMBEDDED */
-       struct uthread  *ut;
-
-       ut = get_bsdthread_info(current_thread());
-       ut->uu_iopol_disk = IOPOL_PASSIVE;
-#endif /* !CONFIG_EMBEDDED */
+       proc_set_task_policy(current_task(), current_thread(),
+                            TASK_POLICY_INTERNAL, TASK_POLICY_IOPOL, IOPOL_PASSIVE);
 
        finish_end_transaction(tr, NULL, NULL);
 
@@ -3840,14 +3955,8 @@ finish_end_thread(transaction *tr)
 static void
 write_header_thread(journal *jnl)
 {
-#if !CONFIG_EMBEDDED
-       proc_apply_thread_selfdiskacc(IOPOL_PASSIVE);
-#else /* !CONFIG_EMBEDDED */
-       struct uthread  *ut;
-
-       ut = get_bsdthread_info(current_thread());
-       ut->uu_iopol_disk = IOPOL_PASSIVE;
-#endif /* !CONFIG_EMBEDDED */
+       proc_set_task_policy(current_task(), current_thread(),
+                            TASK_POLICY_INTERNAL, TASK_POLICY_IOPOL, IOPOL_PASSIVE);
 
        if (write_journal_header(jnl, 1, jnl->saved_sequence_num))
                jnl->write_header_failed = TRUE;
@@ -4249,7 +4358,7 @@ abort_transaction(journal *jnl, transaction *tr)
                                         */
                                        vnode_rele_ext(bp_vp, 0, 1);
                                } else {
-                                       printf("jnl: %s: abort_tr: could not find block %Ld vp %p!\n",
+                                       printf("jnl: %s: abort_tr: could not find block %lld vp %p!\n",
                                               jnl->jdev_name, blhdr->binfo[i].bnum, tbp);
                                        if (bp) {
                                                buf_brelse(bp);
@@ -4276,6 +4385,7 @@ abort_transaction(journal *jnl, transaction *tr)
                jnl->async_trim = NULL;
        lck_rw_unlock_exclusive(&jnl->trim_lock);
        
+       
        if (tr->trim.extents) {
                kfree(tr->trim.extents, tr->trim.allocated_count * sizeof(dk_extent_t));
        }
@@ -4343,7 +4453,19 @@ journal_end_transaction(journal *jnl)
        // called from end_transaction().
        // 
        jnl->active_tr = NULL;
-       ret = end_transaction(tr, 0, NULL, NULL, TRUE, FALSE);
+       
+       /* Examine the force-journal-flush state in the active txn */
+       if (tr->flush_on_completion == TRUE) {
+               /*
+                * If the FS requested it, disallow group commit and force the
+                * transaction out to disk immediately.
+                */
+               ret = end_transaction(tr, 1, NULL, NULL, TRUE, TRUE);
+       }
+       else {
+               /* in the common path we can simply use the double-buffered journal */
+               ret = end_transaction(tr, 0, NULL, NULL, TRUE, FALSE);
+       }
 
        return ret;
 }
@@ -4520,7 +4642,8 @@ int journal_relocate(journal *jnl, off_t offset, off_t journal_size, int32_t tbu
 {
        int             ret;
        transaction     *tr;
-       
+       size_t i = 0;
+
        /*
         * Sanity check inputs, and adjust the size of the transaction buffer.
         */
@@ -4565,7 +4688,23 @@ int journal_relocate(journal *jnl, off_t offset, off_t journal_size, int32_t tbu
                return ret;
        }
        wait_condition(jnl, &jnl->flushing, "end_transaction");
-       
+
+       /*
+        * At this point, we have completely flushed the contents of the current
+        * journal to disk (and have asynchronously written all of the txns to 
+        * their actual desired locations).  As a result, we can (and must) clear 
+        * out the old_start array.  If we do not, then if the last written transaction
+        * started at the beginning of the journal (starting 1 block into the 
+        * journal file) it could confuse the buffer_flushed callback. This is
+        * because we're about to reset the start/end pointers of the journal header
+        * below. 
+        */
+       lock_oldstart(jnl); 
+       for (i = 0; i < sizeof (jnl->old_start) / sizeof(jnl->old_start[0]); i++) { 
+               jnl->old_start[i] = 0; 
+       }
+       unlock_oldstart(jnl);
+
        /* Update the journal's offset and size in memory. */
        jnl->jdev_offset = offset;
        jnl->jhdr->start = jnl->jhdr->end = jnl->jhdr->jhdr_size;
@@ -4619,7 +4758,8 @@ journal_create(__unused struct vnode *jvp,
               __unused int32_t       flags,
               __unused int32_t       tbuffer_size,
               __unused void        (*flush)(void *arg),
-              __unused void         *arg)
+              __unused void         *arg,
+              __unused struct mount *fsmount)
 {
     return NULL;
 }
@@ -4633,7 +4773,8 @@ journal_open(__unused struct vnode *jvp,
             __unused int32_t       flags,
             __unused int32_t       tbuffer_size,
             __unused void        (*flush)(void *arg),
-            __unused void         *arg)
+            __unused void         *arg,
+            __unused struct mount *fsmount)
 {
        return NULL;
 }