[apple/xnu.git] bsd/hfs/hfs_readwrite.c (blobdiff; snapshot xnu-3248.60.10)
index 7bf65093cabbb49ff18559d50f427af06b392df6..78719c0694fcf3f8cc3769b6a5ead6b56548c79e 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2010 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2015 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -41,6 +41,7 @@
 #include <sys/filedesc.h>
 #include <sys/stat.h>
 #include <sys/buf.h>
+#include <sys/buf_internal.h>
 #include <sys/proc.h>
 #include <sys/kauth.h>
 #include <sys/vnode.h>
 #include <sys/disk.h>
 #include <sys/sysctl.h>
 #include <sys/fsctl.h>
+#include <sys/mount_internal.h>
+#include <sys/file_internal.h>
+
+#include <libkern/OSDebug.h>
 
 #include <miscfs/specfs/specdev.h>
 
@@ -61,6 +66,8 @@
 #include <vm/vm_pageout.h>
 #include <vm/vm_kern.h>
 
+#include <IOKit/IOBSD.h>
+
 #include <sys/kdebug.h>
 
 #include       "hfs.h"
@@ -73,6 +80,7 @@
 #include       "hfs_cnode.h"
 #include       "hfs_dbg.h"
 
+
 #define can_cluster(size) ((((size & (4096-1))) == 0) && (size <= (MAXPHYSIO/2)))
 
 enum {
@@ -82,12 +90,18 @@ enum {
 /* from bsd/hfs/hfs_vfsops.c */
 extern int hfs_vfs_vget (struct mount *mp, ino64_t ino, struct vnode **vpp, vfs_context_t context);
 
-static int  hfs_clonelink(struct vnode *, int, kauth_cred_t, struct proc *);
+/* from hfs_hotfiles.c */
+extern int hfs_pin_overflow_extents (struct hfsmount *hfsmp, uint32_t fileid,
+                                             uint8_t forktype, uint32_t *pinned);
+
 static int  hfs_clonefile(struct vnode *, int, int, int);
 static int  hfs_clonesysfile(struct vnode *, int, int, int, kauth_cred_t, struct proc *);
-static int  hfs_minorupdate(struct vnode *vp);
 static int  do_hfs_truncate(struct vnode *vp, off_t length, int flags, int skip, vfs_context_t context);
 
+/* from bsd/hfs/hfs_vnops.c */
+extern decmpfs_cnode* hfs_lazy_init_decmpfs_cnode (struct cnode *cp);
+
+
 
 int flush_cache_on_write = 0;
 SYSCTL_INT (_kern, OID_AUTO, flush_cache_on_write, CTLFLAG_RW | CTLFLAG_LOCKED, &flush_cache_on_write, 0, "always flush the drive cache on writes to uncached files");
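
The flush_cache_on_write knob above is published via sysctl as kern.flush_cache_on_write. A minimal user-space sketch of flipping it with sysctlbyname(3) follows; this is illustrative test code, not part of the diff, and it needs root privileges to take effect.

#include <sys/sysctl.h>
#include <stdio.h>

int main(void)
{
	int enable = 1;

	/* Ask HFS to flush the drive cache after writes to uncached files. */
	if (sysctlbyname("kern.flush_cache_on_write", NULL, NULL,
	                 &enable, sizeof(enable)) != 0) {
		perror("sysctlbyname");
		return 1;
	}
	return 0;
}
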
@@ -98,6 +112,16 @@ SYSCTL_INT (_kern, OID_AUTO, flush_cache_on_write, CTLFLAG_RW | CTLFLAG_LOCKED,
 int
 hfs_vnop_read(struct vnop_read_args *ap)
 {
+       /*
+          struct vnop_read_args {
+          struct vnodeop_desc *a_desc;
+          vnode_t a_vp;
+          struct uio *a_uio;
+          int a_ioflag;
+          vfs_context_t a_context;
+          };
+        */
+
        uio_t uio = ap->a_uio;
        struct vnode *vp = ap->a_vp;
        struct cnode *cp;
@@ -109,6 +133,8 @@ hfs_vnop_read(struct vnop_read_args *ap)
        off_t offset = uio_offset(uio);
        int retval = 0;
        int took_truncate_lock = 0;
+       int io_throttle = 0;
+       int throttled_count = 0;
 
        /* Preflight checks */
        if (!vnode_isreg(vp)) {
@@ -122,7 +148,15 @@ hfs_vnop_read(struct vnop_read_args *ap)
                return (0);             /* Nothing left to do */
        if (offset < 0)
                return (EINVAL);        /* can't read from a negative offset */
-       
+
+#if SECURE_KERNEL
+       if ((ap->a_ioflag & (IO_SKIP_ENCRYPTION|IO_SYSCALL_DISPATCH)) ==
+                                               (IO_SKIP_ENCRYPTION|IO_SYSCALL_DISPATCH)) {
+               /* Don't allow unencrypted io request from user space */
+               return EPERM;
+       }
+#endif
+
 #if HFS_COMPRESSION
        if (VNODE_IS_RSRC(vp)) {
                if (hfs_hides_rsrc(ap->a_context, VTOC(vp), 1)) { /* 1 == don't take the cnode lock */
@@ -133,12 +167,19 @@ hfs_vnop_read(struct vnop_read_args *ap)
                int compressed = hfs_file_is_compressed(VTOC(vp), 1); /* 1 == don't take the cnode lock */
                if (compressed) {
                        retval = decmpfs_read_compressed(ap, &compressed, VTOCMP(vp));
+                       if (retval == 0 && !(ap->a_ioflag & IO_EVTONLY) && vnode_isfastdevicecandidate(vp)) {
+                               (void) hfs_addhotfile(vp);
+                       }
                        if (compressed) {
                                if (retval == 0) {
                                        /* successful read, update the access time */
                                        VTOC(vp)->c_touch_acctime = TRUE;
                                        
-                                       /* compressed files are not hot file candidates */
+                                       //
+                                       // compressed files are not traditional hot file candidates
+                                       // but they may be for CF (which ignores the ff_bytesread
+                                       // field)
+                                       //
                                        if (VTOHFS(vp)->hfc_stage == HFC_RECORDING) {
                                                VTOF(vp)->ff_bytesread = 0;
                                        }
@@ -147,7 +188,7 @@ hfs_vnop_read(struct vnop_read_args *ap)
                        }
                        /* otherwise the file was converted back to a regular file while we were reading it */
                        retval = 0;
-               } else if ((VTOC(vp)->c_flags & UF_COMPRESSED)) {
+               } else if ((VTOC(vp)->c_bsdflags & UF_COMPRESSED)) {
                        int error;
                        
                        error = check_for_dataless_file(vp, NAMESPACE_HANDLER_READ_OP);
@@ -164,17 +205,36 @@ hfs_vnop_read(struct vnop_read_args *ap)
        hfsmp = VTOHFS(vp);
 
 #if CONFIG_PROTECT
-       if ((retval = cp_handle_vnop (cp, CP_READ_ACCESS)) != 0) {
+       if ((retval = cp_handle_vnop (vp, CP_READ_ACCESS, ap->a_ioflag)) != 0) {
                goto exit;
        }
-#endif
+
+#endif // CONFIG_PROTECT
+
+       /* 
+        * If this read request originated from a syscall (as opposed to 
+        * an in-kernel page fault or something), then set it up for 
+        * throttle checks
+        */
+       if (ap->a_ioflag & IO_SYSCALL_DISPATCH) {
+               io_throttle = IO_RETURN_ON_THROTTLE;
+       }
+
+read_again:
 
        /* Protect against a size change. */
-       hfs_lock_truncate(cp, HFS_SHARED_LOCK);
+       hfs_lock_truncate(cp, HFS_SHARED_LOCK, HFS_LOCK_DEFAULT);
        took_truncate_lock = 1;
 
        filesize = fp->ff_size;
        filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize;
+
+       /*
+        * Check the file size. Note that per POSIX spec, we return 0 at 
+        * file EOF, so attempting a read at an offset that is too big
+        * should just return 0 on HFS+. Since the return value was initialized
+        * to 0 above, we just jump to exit.  HFS Standard has its own behavior.
+        */
        if (offset > filesize) {
                if ((hfsmp->hfs_flags & HFS_STANDARD) &&
                    (offset > (off_t)MAXHFSFILESIZE)) {
@@ -183,14 +243,14 @@ hfs_vnop_read(struct vnop_read_args *ap)
                goto exit;
        }
 
-       KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 12)) | DBG_FUNC_START,
+       KERNEL_DEBUG(HFSDBG_READ | DBG_FUNC_START,
                (int)uio_offset(uio), uio_resid(uio), (int)filesize, (int)filebytes, 0);
 
-       retval = cluster_read(vp, uio, filesize, ap->a_ioflag);
+       retval = cluster_read(vp, uio, filesize, ap->a_ioflag | io_throttle);
 
        cp->c_touch_acctime = TRUE;
 
-       KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 12)) | DBG_FUNC_END,
+       KERNEL_DEBUG(HFSDBG_READ | DBG_FUNC_END,
                (int)uio_offset(uio), uio_resid(uio), (int)filesize,  (int)filebytes, 0);
 
        /*
@@ -204,7 +264,7 @@ hfs_vnop_read(struct vnop_read_args *ap)
 
                /* When ff_bytesread exceeds 32-bits, update it behind the cnode lock. */
                if ((fp->ff_bytesread + bytesread) > 0x00000000ffffffff) {
-                       hfs_lock(cp, HFS_FORCE_LOCK);
+                       hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
                        took_cnode_lock = 1;
                }
                /*
@@ -220,17 +280,60 @@ hfs_vnop_read(struct vnop_read_args *ap)
                } else {
                        fp->ff_bytesread += bytesread;
                }
+
+               if (!(ap->a_ioflag & IO_EVTONLY) && vnode_isfastdevicecandidate(vp)) {
+                       //
+                       // We don't add hotfiles for processes doing IO_EVTONLY I/O
+                       // on the assumption that they're system processes such as
+                       // mdworker which scan everything in the system (and thus
+                       // do not represent user-initiated access to files)
+                       //
+                       (void) hfs_addhotfile(vp);
+               }
                if (took_cnode_lock)
                        hfs_unlock(cp);
        }
 exit:
        if (took_truncate_lock) {
-               hfs_unlock_truncate(cp, 0);
+               hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
        }
+       if (retval == EAGAIN) {
+               throttle_lowpri_io(1);
+               throttled_count++;
 
+               retval = 0;
+               goto read_again;
+       }
+       if (throttled_count) {
+               throttle_info_reset_window((uthread_t)get_bsdthread_info(current_thread()));
+       }
        return (retval);
 }
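
Both hfs_vnop_read and hfs_vnop_write now wrap their cluster I/O in the same throttle-and-retry shape: pass IO_RETURN_ON_THROTTLE, retry on EAGAIN after sleeping out the throttle window, and reset the thread's throttle window once the I/O finally completes. A minimal sketch of that pattern, where the hypothetical do_cluster_io() stands in for cluster_read()/cluster_write():

static int
throttled_cluster_io(vnode_t vp, uio_t uio, int ioflag)
{
	int retval;
	int throttled_count = 0;

retry:
	/* Ask the cluster layer to return EAGAIN rather than block when throttled. */
	retval = do_cluster_io(vp, uio, ioflag | IO_RETURN_ON_THROTTLE);

	if (retval == EAGAIN) {
		/* Sleep out the throttle window, then resume where the uio left off. */
		throttle_lowpri_io(1);
		throttled_count++;
		goto retry;
	}

	if (throttled_count) {
		/* Discard the throttling history accumulated during the retries. */
		throttle_info_reset_window((uthread_t)get_bsdthread_info(current_thread()));
	}
	return retval;
}
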
 
+/*
+ * Ideally, this wouldn't be necessary; the cluster code should be
+ * able to handle this on the read-side.  See <rdar://20420068>.
+ */
+static errno_t hfs_zero_eof_page(vnode_t vp, off_t zero_up_to)
+{
+       assert(VTOC(vp)->c_lockowner != current_thread());
+       assert(VTOC(vp)->c_truncatelockowner == current_thread());
+
+       struct filefork *fp = VTOF(vp);
+
+       if (!(fp->ff_size & PAGE_MASK_64) || zero_up_to <= fp->ff_size) {
+               // Nothing to do
+               return 0;
+       }
+
+       zero_up_to = MIN(zero_up_to, (off_t)round_page_64(fp->ff_size));
+
+       /* N.B. At present, @zero_up_to is not important because the cluster
+          code will always zero up to the end of the page anyway. */
+       return cluster_write(vp, NULL, fp->ff_size, zero_up_to,
+                                                fp->ff_size, 0, IO_HEADZEROFILL);
+}
+
 /*
  * Write data to a file.
  */
@@ -259,6 +362,8 @@ hfs_vnop_write(struct vnop_write_args *ap)
        int do_snapshot = 1;
        time_t orig_ctime=VTOC(vp)->c_ctime;
        int took_truncate_lock = 0;
+       int io_return_on_throttle = 0;
+       int throttled_count = 0;
 
 #if HFS_COMPRESSION
        if ( hfs_file_is_compressed(VTOC(vp), 1) ) { /* 1 == don't take the cnode lock */
@@ -276,7 +381,7 @@ hfs_vnop_write(struct vnop_write_args *ap)
                                printf("invalid state %d for compressed file\n", state);
                                /* fall through */
                }
-       } else if ((VTOC(vp)->c_flags & UF_COMPRESSED)) {
+       } else if ((VTOC(vp)->c_bsdflags & UF_COMPRESSED)) {
                int error;
                
                error = check_for_dataless_file(vp, NAMESPACE_HANDLER_WRITE_OP);
@@ -291,7 +396,14 @@ hfs_vnop_write(struct vnop_write_args *ap)
 
 #endif
 
-       // LP64todo - fix this! uio_resid may be 64-bit value
+#if SECURE_KERNEL
+       if ((ioflag & (IO_SKIP_ENCRYPTION|IO_SYSCALL_DISPATCH)) ==
+                                               (IO_SKIP_ENCRYPTION|IO_SYSCALL_DISPATCH)) {
+               /* Don't allow unencrypted io request from user space */
+               return EPERM;
+       }
+#endif
+
        resid = uio_resid(uio);
        offset = uio_offset(uio);
 
@@ -307,7 +419,7 @@ hfs_vnop_write(struct vnop_write_args *ap)
        hfsmp = VTOHFS(vp);
 
 #if CONFIG_PROTECT
-       if ((retval = cp_handle_vnop (cp, CP_WRITE_ACCESS)) != 0) {
+       if ((retval = cp_handle_vnop (vp, CP_WRITE_ACCESS, 0)) != 0) {
                goto exit;
        }
 #endif
@@ -326,13 +438,24 @@ hfs_vnop_write(struct vnop_write_args *ap)
        }
 #endif /* HFS_SPARSE_DEV */
 
+       if ((ioflag & (IO_SINGLE_WRITER | IO_SYSCALL_DISPATCH)) == 
+                       (IO_SINGLE_WRITER | IO_SYSCALL_DISPATCH)) {
+               io_return_on_throttle = IO_RETURN_ON_THROTTLE;
+       }
+
 again:
-       /* Protect against a size change. */
-       if (ioflag & IO_APPEND) {
-               hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK);
+       /*
+        * Protect against a size change.
+        *
+        * Note: If took_truncate_lock is true, then we previously got the lock shared
+        * but needed to upgrade to exclusive.  So try getting it exclusive from the
+        * start.
+        */
+       if (ioflag & IO_APPEND || took_truncate_lock) {
+               hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
        }       
        else {
-               hfs_lock_truncate(cp, HFS_SHARED_LOCK);
+               hfs_lock_truncate(cp, HFS_SHARED_LOCK, HFS_LOCK_DEFAULT);
        }
        took_truncate_lock = 1;
 
@@ -341,26 +464,49 @@ again:
                uio_setoffset(uio, fp->ff_size);
                offset = fp->ff_size;
        }
-       if ((cp->c_flags & APPEND) && offset != fp->ff_size) {
+       if ((cp->c_bsdflags & APPEND) && offset != fp->ff_size) {
                retval = EPERM;
                goto exit;
        }
 
+       cred = vfs_context_ucred(ap->a_context);
+       if (cred && suser(cred, NULL) != 0)
+               eflags |= kEFReserveMask;
+
        origFileSize = fp->ff_size;
        writelimit = offset + resid;
-       filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize;
 
-       /* If the truncate lock is shared, and if we either have virtual 
-        * blocks or will need to extend the file, upgrade the truncate 
-        * to exclusive lock.  If upgrade fails, we lose the lock and 
-        * have to get exclusive lock again.  Note that we want to
-        * grab the truncate lock exclusive even if we're not allocating new blocks
-        * because we could still be growing past the LEOF.
+       /*
+        * We may need an exclusive truncate lock for several reasons, all
+        * of which are because we may be writing to a (portion of a) block
+        * for the first time, and we need to make sure no readers see the
+        * prior, uninitialized contents of the block.  The cases are:
+        *
+        * 1. We have unallocated (delayed allocation) blocks.  We may be
+        *    allocating new blocks to the file and writing to them.
+        *    (A more precise check would be whether the range we're writing
+        *    to contains delayed allocation blocks.)
+        * 2. We need to extend the file.  The bytes between the old EOF
+        *    and the new EOF are not yet initialized.  This is important
+        *    even if we're not allocating new blocks to the file.  If the
+        *    old EOF and new EOF are in the same block, we still need to
+        *    protect that range of bytes until they are written for the
+        *    first time.
+        *
+        * If we had a shared lock with the above cases, we need to try to upgrade
+        * to an exclusive lock.  If the upgrade fails, we will lose the shared
+        * lock, and will need to take the truncate lock again; the took_truncate_lock
+        * flag will still be set, causing us to try for an exclusive lock next time.
         */
        if ((cp->c_truncatelockowner == HFS_SHARED_OWNER) &&
-           ((fp->ff_unallocblocks != 0) || (writelimit > origFileSize))) {
-               /* Lock upgrade failed and we lost our shared lock, try again */
+           ((fp->ff_unallocblocks != 0) ||
+            (writelimit > origFileSize))) {
                if (lck_rw_lock_shared_to_exclusive(&cp->c_truncatelock) == FALSE) {
+                       /*
+                        * Lock upgrade failed and we lost our shared lock, try again.
+                        * Note: we do not set took_truncate_lock=0 here.  Leaving it
+                        * set to 1 will cause us to try to get the lock exclusive.
+                        */
                        goto again;
                } 
                else {
@@ -369,23 +515,29 @@ again:
                }
        }
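
The comment above spells out when the truncate lock must be exclusive and what a failed upgrade means. Distilled into a stand-alone sketch, where may_write_uninitialized_blocks() is a placeholder for the two conditions listed in that comment:

static void
take_truncate_lock_for_write(struct cnode *cp)
{
	lck_rw_lock_shared(&cp->c_truncatelock);

	if (!may_write_uninitialized_blocks(cp)) {
		return;		/* a shared lock is sufficient */
	}
	if (lck_rw_lock_shared_to_exclusive(&cp->c_truncatelock)) {
		return;		/* upgraded without ever dropping the lock */
	}
	/*
	 * The upgrade failed and the shared lock has already been dropped,
	 * so take the lock exclusive from scratch; anything observed under
	 * the shared lock must be re-validated by the caller.
	 */
	lck_rw_lock_exclusive(&cp->c_truncatelock);
}
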
 
-       if ( (retval = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK))) {
+       if ( (retval = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT))) {
                goto exit;
        }
        cnode_locked = 1;
-       
-       if (cp->c_truncatelockowner == HFS_SHARED_OWNER) {
-               KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 0)) | DBG_FUNC_START,
-                            (int)offset, uio_resid(uio), (int)fp->ff_size,
-                            (int)filebytes, 0);
+
+       filebytes = hfs_blk_to_bytes(fp->ff_blocks, hfsmp->blockSize);
+
+       if (offset > filebytes
+               && (hfs_blk_to_bytes(hfs_freeblks(hfsmp, ISSET(eflags, kEFReserveMask)),
+                                                        hfsmp->blockSize) < offset - filebytes)) {
+               retval = ENOSPC;
+               goto exit;
        }
 
+       KERNEL_DEBUG(HFSDBG_WRITE | DBG_FUNC_START,
+                    (int)offset, uio_resid(uio), (int)fp->ff_size,
+                    (int)filebytes, 0);
+
        /* Check if we do not need to extend the file */
        if (writelimit <= filebytes) {
                goto sizeok;
        }
 
-       cred = vfs_context_ucred(ap->a_context);
        bytesToAdd = writelimit - filebytes;
 
 #if QUOTA
@@ -402,8 +554,6 @@ again:
 
        while (writelimit > filebytes) {
                bytesToAdd = writelimit - filebytes;
-               if (cred && suser(cred, NULL) != 0)
-                       eflags |= kEFReserveMask;
 
                /* Protect extents b-tree and allocation bitmap */
                lockflags = SFL_BITMAP;
@@ -425,10 +575,10 @@ again:
                if (retval != E_NONE)
                        break;
                filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize;
-               KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 0)) | DBG_FUNC_NONE,
+               KERNEL_DEBUG(HFSDBG_WRITE | DBG_FUNC_NONE,
                        (int)offset, uio_resid(uio), (int)fp->ff_size,  (int)filebytes, 0);
        }
-       (void) hfs_update(vp, TRUE);
+       (void) hfs_update(vp, 0);
        (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
        (void) hfs_end_transaction(hfsmp);
 
@@ -446,134 +596,45 @@ again:
 sizeok:
        if (retval == E_NONE) {
                off_t filesize;
-               off_t zero_off;
-               off_t tail_off;
-               off_t inval_start;
-               off_t inval_end;
-               off_t io_start;
+               off_t head_off;
                int lflag;
-               struct rl_entry *invalid_range;
 
-               if (writelimit > fp->ff_size)
+               if (writelimit > fp->ff_size) {
                        filesize = writelimit;
-               else
+                       struct timeval tv;
+                       rl_add(fp->ff_size, writelimit - 1 , &fp->ff_invalidranges);
+                       microuptime(&tv);
+                       cp->c_zftimeout = tv.tv_sec + ZFTIMELIMIT;
+               } else
                        filesize = fp->ff_size;
 
                lflag = ioflag & ~(IO_TAILZEROFILL | IO_HEADZEROFILL | IO_NOZEROVALID | IO_NOZERODIRTY);
 
-               if (offset <= fp->ff_size) {
-                       zero_off = offset & ~PAGE_MASK_64;
-                       
-                       /* Check to see whether the area between the zero_offset and the start
-                          of the transfer to see whether is invalid and should be zero-filled
-                          as part of the transfer:
-                        */
-                       if (offset > zero_off) {
-                               if (rl_scan(&fp->ff_invalidranges, zero_off, offset - 1, &invalid_range) != RL_NOOVERLAP)
-                                       lflag |= IO_HEADZEROFILL;
-                       }
-               } else {
-                       off_t eof_page_base = fp->ff_size & ~PAGE_MASK_64;
-                       
-                       /* The bytes between fp->ff_size and uio->uio_offset must never be
-                          read without being zeroed.  The current last block is filled with zeroes
-                          if it holds valid data but in all cases merely do a little bookkeeping
-                          to track the area from the end of the current last page to the start of
-                          the area actually written.  For the same reason only the bytes up to the
-                          start of the page where this write will start is invalidated; any remainder
-                          before uio->uio_offset is explicitly zeroed as part of the cluster_write.
-                          
-                          Note that inval_start, the start of the page after the current EOF,
-                          may be past the start of the write, in which case the zeroing
-                          will be handled by the cluser_write of the actual data.
-                        */
-                       inval_start = (fp->ff_size + (PAGE_SIZE_64 - 1)) & ~PAGE_MASK_64;
-                       inval_end = offset & ~PAGE_MASK_64;
-                       zero_off = fp->ff_size;
-                       
-                       if ((fp->ff_size & PAGE_MASK_64) &&
-                               (rl_scan(&fp->ff_invalidranges,
-                                                       eof_page_base,
-                                                       fp->ff_size - 1,
-                                                       &invalid_range) != RL_NOOVERLAP)) {
-                               /* The page containing the EOF is not valid, so the
-                                  entire page must be made inaccessible now.  If the write
-                                  starts on a page beyond the page containing the eof
-                                  (inval_end > eof_page_base), add the
-                                  whole page to the range to be invalidated.  Otherwise
-                                  (i.e. if the write starts on the same page), zero-fill
-                                  the entire page explicitly now:
-                                */
-                               if (inval_end > eof_page_base) {
-                                       inval_start = eof_page_base;
-                               } else {
-                                       zero_off = eof_page_base;
-                               };
-                       };
-                       
-                       if (inval_start < inval_end) {
-                               struct timeval tv;
-                               /* There's some range of data that's going to be marked invalid */
-                               
-                               if (zero_off < inval_start) {
-                                       /* The pages between inval_start and inval_end are going to be invalidated,
-                                          and the actual write will start on a page past inval_end.  Now's the last
-                                          chance to zero-fill the page containing the EOF:
-                                        */
-                                       hfs_unlock(cp);
-                                       cnode_locked = 0;
-                                       retval = cluster_write(vp, (uio_t) 0,
-                                                       fp->ff_size, inval_start,
-                                                       zero_off, (off_t)0,
-                                                       lflag | IO_HEADZEROFILL | IO_NOZERODIRTY);
-                                       hfs_lock(cp, HFS_FORCE_LOCK);
-                                       cnode_locked = 1;
-                                       if (retval) goto ioerr_exit;
-                                       offset = uio_offset(uio);
-                               };
-                               
-                               /* Mark the remaining area of the newly allocated space as invalid: */
-                               rl_add(inval_start, inval_end - 1 , &fp->ff_invalidranges);
-                               microuptime(&tv);
-                               cp->c_zftimeout = tv.tv_sec + ZFTIMELIMIT;
-                               zero_off = fp->ff_size = inval_end;
-                       };
-                       
-                       if (offset > zero_off) lflag |= IO_HEADZEROFILL;
-               };
-
-               /* Check to see whether the area between the end of the write and the end of
-                  the page it falls in is invalid and should be zero-filled as part of the transfer:
-                */
-               tail_off = (writelimit + (PAGE_SIZE_64 - 1)) & ~PAGE_MASK_64;
-               if (tail_off > filesize) tail_off = filesize;
-               if (tail_off > writelimit) {
-                       if (rl_scan(&fp->ff_invalidranges, writelimit, tail_off - 1, &invalid_range) != RL_NOOVERLAP) {
-                               lflag |= IO_TAILZEROFILL;
-                       };
-               };
-               
                /*
-                * if the write starts beyond the current EOF (possibly advanced in the
-                * zeroing of the last block, above), then we'll zero fill from the current EOF
-                * to where the write begins:
-                *
-                * NOTE: If (and ONLY if) the portion of the file about to be written is
-                *       before the current EOF it might be marked as invalid now and must be
-                *       made readable (removed from the invalid ranges) before cluster_write
-                *       tries to write it:
+                * We no longer use IO_HEADZEROFILL or IO_TAILZEROFILL (except
+                * for one case below).  For the regions in the same page that
+                * lie before the beginning and after the end of this write, we
+                * let the cluster code handle any zeroing that is necessary.
+                * If those areas are not cached, the cluster code will try to
+                * read them in; for regions that have never been written to,
+                * hfs_vnop_blockmap consults the invalid ranges and reports
+                * them as unwritten, and the cluster code then zeroes them out.
+                */
-               io_start = (lflag & IO_HEADZEROFILL) ? zero_off : offset;
-               if (io_start < fp->ff_size) {
-                       off_t io_end;
 
-                       io_end = (lflag & IO_TAILZEROFILL) ? tail_off : writelimit;
-                       rl_remove(io_start, io_end - 1, &fp->ff_invalidranges);
-               };
+               head_off = trunc_page_64(offset);
+
+               if (head_off < offset && head_off >= fp->ff_size) {
+                       /*
+                        * The first page is beyond current EOF, so as an
+                        * optimisation, we can pass IO_HEADZEROFILL.
+                        */
+                       lflag |= IO_HEADZEROFILL;
+               }
 
                hfs_unlock(cp);
                cnode_locked = 0;
-               
+
                /*
                 * We need to tell UBC the fork's new size BEFORE calling
                 * cluster_write, in case any of the new pages need to be
@@ -594,13 +655,47 @@ sizeok:
                 * zero, unless we are extending the file via write.
                 */
                if (filesize > fp->ff_size) {
+                       retval = hfs_zero_eof_page(vp, offset);
+                       if (retval)
+                               goto exit;
                        fp->ff_new_size = filesize;
                        ubc_setsize(vp, filesize);
                }
-               retval = cluster_write(vp, uio, fp->ff_size, filesize, zero_off,
-                               tail_off, lflag | IO_NOZERODIRTY);
+               retval = cluster_write(vp, uio, fp->ff_size, filesize, head_off,
+                                                          0, lflag | IO_NOZERODIRTY | io_return_on_throttle);
                if (retval) {
                        fp->ff_new_size = 0;    /* no longer extending; use ff_size */
+                       
+                       if (retval == EAGAIN) {
+                               /*
+                                * EAGAIN indicates that we still have I/O to do, but
+                                * that we now need to be throttled
+                                */
+                               if (resid != uio_resid(uio)) {
+                                       /*
+                                        * did manage to do some I/O before returning EAGAIN
+                                        */
+                                       resid = uio_resid(uio);
+                                       offset = uio_offset(uio);
+
+                                       cp->c_touch_chgtime = TRUE;
+                                       cp->c_touch_modtime = TRUE;
+                                       hfs_incr_gencount(cp);
+                               }
+                               if (filesize > fp->ff_size) {
+                                       /*
+                                        * we called ubc_setsize before the call to
+                                        * cluster_write... since we only partially
+                                        * completed the I/O, we need to 
+                                        * re-adjust our idea of the filesize based
+                                        * on our interim EOF
+                                        */
+                                       ubc_setsize(vp, offset);
+
+                                       fp->ff_size = offset;
+                               }
+                               goto exit;
+                       }
                        if (filesize > origFileSize) {
                                ubc_setsize(vp, origFileSize);
                        }
@@ -615,13 +710,7 @@ sizeok:
                                fp->ff_bytesread = 0;
                        }
                }
-               fp->ff_new_size = 0;    /* ff_size now has the correct size */
-               
-               /* If we wrote some bytes, then touch the change and mod times */
-               if (resid > uio_resid(uio)) {
-                       cp->c_touch_chgtime = TRUE;
-                       cp->c_touch_modtime = TRUE;
-               }
+               fp->ff_new_size = 0;    /* ff_size now has the correct size */          
        }
        if (partialwrite) {
                uio_setresid(uio, (uio_resid(uio) + bytesToAdd));
@@ -631,57 +720,71 @@ sizeok:
        // XXXdbg - see radar 4871353 for more info
        {
            if (flush_cache_on_write && ((ioflag & IO_NOCACHE) || vnode_isnocache(vp))) {
-               VNOP_IOCTL(hfsmp->hfs_devvp, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, NULL);
+                       hfs_flush(hfsmp, HFS_FLUSH_CACHE);
            }
        }
 
 ioerr_exit:
-       /*
-        * If we successfully wrote any data, and we are not the superuser
-        * we clear the setuid and setgid bits as a precaution against
-        * tampering.
-        */
-       if (cp->c_mode & (S_ISUID | S_ISGID)) {
-               cred = vfs_context_ucred(ap->a_context);
-               if (resid > uio_resid(uio) && cred && suser(cred, NULL)) {
-                       if (!cnode_locked) {
-                               hfs_lock(cp, HFS_FORCE_LOCK);
-                               cnode_locked = 1;
+       if (!cnode_locked) {
+               hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
+               cnode_locked = 1;
+       }
+
+       if (resid > uio_resid(uio)) {
+               cp->c_touch_chgtime = TRUE;
+               cp->c_touch_modtime = TRUE;
+               hfs_incr_gencount(cp);
+
+               /*
+                * If we successfully wrote any data, and we are not the superuser
+                * we clear the setuid and setgid bits as a precaution against
+                * tampering.
+                */
+               if (cp->c_mode & (S_ISUID | S_ISGID)) {
+                       cred = vfs_context_ucred(ap->a_context);
+                       if (cred && suser(cred, NULL)) {
+                               cp->c_mode &= ~(S_ISUID | S_ISGID);
                        }
-                       cp->c_mode &= ~(S_ISUID | S_ISGID);
                }
        }
        if (retval) {
                if (ioflag & IO_UNIT) {
-                       if (!cnode_locked) {
-                               hfs_lock(cp, HFS_FORCE_LOCK);
-                               cnode_locked = 1;
-                       }
                        (void)hfs_truncate(vp, origFileSize, ioflag & IO_SYNC,
-                                          0, 0, ap->a_context);
-                       // LP64todo - fix this!  resid needs to by user_ssize_t
+                                          0, ap->a_context);
                        uio_setoffset(uio, (uio_offset(uio) - (resid - uio_resid(uio))));
                        uio_setresid(uio, resid);
                        filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize;
                }
-       } else if ((ioflag & IO_SYNC) && (resid > uio_resid(uio))) {
-               if (!cnode_locked) {
-                       hfs_lock(cp, HFS_FORCE_LOCK);
-                       cnode_locked = 1;
-               }
-               retval = hfs_update(vp, TRUE);
-       }
+       } else if ((ioflag & IO_SYNC) && (resid > uio_resid(uio)))
+               retval = hfs_update(vp, 0);
+
        /* Updating vcbWrCnt doesn't need to be atomic. */
        hfsmp->vcbWrCnt++;
 
-       KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 0)) | DBG_FUNC_END,
+       KERNEL_DEBUG(HFSDBG_WRITE | DBG_FUNC_END,
                (int)uio_offset(uio), uio_resid(uio), (int)fp->ff_size, (int)filebytes, 0);
 exit:
+       if (retval && took_truncate_lock
+               && cp->c_truncatelockowner == current_thread()) {
+               fp->ff_new_size = 0;
+               rl_remove(fp->ff_size, RL_INFINITY, &fp->ff_invalidranges);
+       }
+
        if (cnode_locked)
                hfs_unlock(cp);
-       
+
        if (took_truncate_lock) {
-               hfs_unlock_truncate(cp, 0);
+               hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
+       }
+       if (retval == EAGAIN) {
+               throttle_lowpri_io(1);
+               throttled_count++;
+
+               retval = 0;
+               goto again;
+       }
+       if (throttled_count) {
+               throttle_info_reset_window((uthread_t)get_bsdthread_info(current_thread()));
        }
        return (retval);
 }
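
The rewritten write path above replaces the old head/tail zero-fill logic with invalid-range bookkeeping on the file fork: when the write will extend the EOF, the not-yet-written span is recorded with rl_add() so concurrent readers see zeroes via hfs_vnop_blockmap, and the error path trims anything past the surviving EOF with rl_remove(). A compact sketch of that lifecycle, using hypothetical helper names for what hfs_vnop_write does inline:

/* Record the not-yet-written span so readers get zeroes, not stale disk blocks. */
static void
mark_extension_invalid(struct filefork *fp, off_t writelimit)
{
	if (writelimit > fp->ff_size) {
		rl_add(fp->ff_size, writelimit - 1, &fp->ff_invalidranges);
	}
}

/* After a failed write, nothing beyond the (unchanged) EOF may stay marked invalid. */
static void
trim_invalid_past_eof(struct filefork *fp)
{
	rl_remove(fp->ff_size, RL_INFINITY, &fp->ff_invalidranges);
}
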
@@ -825,8 +928,6 @@ lookup_bucket(struct access_cache *cache, int *indexp, cnid_t parent_id)
     }
        
     if (cache->numcached > NUM_CACHE_ENTRIES) {
-       /*printf("hfs: EGAD! numcached is %d... cut our losses and trim to %d\n",
-         cache->numcached, NUM_CACHE_ENTRIES);*/
        cache->numcached = NUM_CACHE_ENTRIES;
     }
        
@@ -874,11 +975,9 @@ add_node(struct access_cache *cache, int index, cnid_t nodeID, int access)
 
     /* if the cache is full, do a replace rather than an insert */
     if (cache->numcached >= NUM_CACHE_ENTRIES) {
-       //printf("hfs: cache is full (%d). replace at index %d\n", cache->numcached, index);
        cache->numcached = NUM_CACHE_ENTRIES-1;
 
        if (index > cache->numcached) {
-           //    printf("hfs: index %d pinned to %d\n", index, cache->numcached);
            index = cache->numcached;
        }
     }
@@ -908,15 +1007,15 @@ struct cinfo {
 };
 
 static int
-snoop_callback(const struct cat_desc *descp, const struct cat_attr *attrp, void * arg)
+snoop_callback(const cnode_t *cp, void *arg)
 {
-    struct cinfo *cip = (struct cinfo *)arg;
+    struct cinfo *cip = arg;
 
-    cip->uid = attrp->ca_uid;
-    cip->gid = attrp->ca_gid;
-    cip->mode = attrp->ca_mode;
-    cip->parentcnid = descp->cd_parentcnid;
-    cip->recflags = attrp->ca_recflags;
+    cip->uid = cp->c_uid;
+    cip->gid = cp->c_gid;
+    cip->mode = cp->c_mode;
+    cip->parentcnid = cp->c_parentcnid;
+    cip->recflags = cp->c_attr.ca_recflags;
        
     return (0);
 }
@@ -933,33 +1032,41 @@ do_attr_lookup(struct hfsmount *hfsmp, struct access_cache *cache, cnid_t cnid,
 
     /* if this id matches the one the fsctl was called with, skip the lookup */
     if (cnid == skip_cp->c_cnid) {
-       cnattrp->ca_uid = skip_cp->c_uid;
-       cnattrp->ca_gid = skip_cp->c_gid;
-       cnattrp->ca_mode = skip_cp->c_mode;
-       cnattrp->ca_recflags = skip_cp->c_attr.ca_recflags;
-       keyp->hfsPlus.parentID = skip_cp->c_parentcnid;
+               cnattrp->ca_uid = skip_cp->c_uid;
+               cnattrp->ca_gid = skip_cp->c_gid;
+               cnattrp->ca_mode = skip_cp->c_mode;
+               cnattrp->ca_recflags = skip_cp->c_attr.ca_recflags;
+               keyp->hfsPlus.parentID = skip_cp->c_parentcnid;
     } else {
-       struct cinfo c_info;
-
-       /* otherwise, check the cnode hash incase the file/dir is incore */
-       if (hfs_chash_snoop(hfsmp, cnid, 0, snoop_callback, &c_info) == 0) {
-           cnattrp->ca_uid = c_info.uid;
-           cnattrp->ca_gid = c_info.gid;
-           cnattrp->ca_mode = c_info.mode;
-           cnattrp->ca_recflags = c_info.recflags;
-           keyp->hfsPlus.parentID = c_info.parentcnid;
-       } else {
-           int lockflags;
-                       
-           lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK);
-                       
-           /* lookup this cnid in the catalog */
-           error = cat_getkeyplusattr(hfsmp, cnid, keyp, cnattrp);
+               struct cinfo c_info;
+
+               /* otherwise, check the cnode hash in case the file/dir is incore */
+               error = hfs_chash_snoop(hfsmp, cnid, 0, snoop_callback, &c_info);
+
+               if (error == EACCES) {
+                       // File is deleted
+                       return ENOENT;
+               } else if (!error) {
+                       cnattrp->ca_uid = c_info.uid;
+                       cnattrp->ca_gid = c_info.gid;
+                       cnattrp->ca_mode = c_info.mode;
+                       cnattrp->ca_recflags = c_info.recflags;
+                       keyp->hfsPlus.parentID = c_info.parentcnid;
+               } else {
+                       int lockflags;
+
+                       if (throttle_io_will_be_throttled(-1, HFSTOVFS(hfsmp)))
+                               throttle_lowpri_io(1);
+
+                       lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK);
+
+                       /* lookup this cnid in the catalog */
+                       error = cat_getkeyplusattr(hfsmp, cnid, keyp, cnattrp);
                        
-           hfs_systemfile_unlock(hfsmp, lockflags);
+                       hfs_systemfile_unlock(hfsmp, lockflags);
                        
-           cache->lookups++;
-       }
+                       cache->lookups++;
+               }
     }
        
     return (error);
@@ -1128,7 +1235,7 @@ do_bulk_access_check(struct hfsmount *hfsmp, struct vnode *vp,
     boolean_t is64bit;
 
     /*
-     * NOTE: on entry, the vnode is locked. Incase this vnode
+     * NOTE: on entry, the vnode has an io_ref. In case this vnode
      * happens to be in our list of file_ids, we'll note it to
      * avoid calling hfs_chashget_nowait() on that id as that
      * will cause a "locking against myself" panic.
@@ -1373,7 +1480,7 @@ do_bulk_access_check(struct hfsmount *hfsmp, struct vnode *vp,
            access[i] = 0;
            continue;
        }
-                       
+       
        myaccess = do_access_check(hfsmp, &error, &cache, catkey.hfsPlus.parentID, 
            skip_cp, p, cred, context,bitmap, map_size, parents, num_parents);
                        
@@ -1401,8 +1508,6 @@ do_bulk_access_check(struct hfsmount *hfsmp, struct vnode *vp,
                
   err_exit_bulk_access:
                
-    //printf("hfs: on exit (err %d), numfiles/numcached/cachehits/lookups is %d/%d/%d/%d\n", error, num_files, cache.numcached, cache.cachehits, cache.lookups);
-               
     if (file_ids) 
        kfree(file_ids, sizeof(int) * num_files);
     if (parents) 
@@ -1423,24 +1528,13 @@ do_bulk_access_check(struct hfsmount *hfsmp, struct vnode *vp,
 /* end "bulk-access" support */
 
 
-/*
- * Callback for use with freeze ioctl.
- */
-static int
-hfs_freezewrite_callback(struct vnode *vp, __unused void *cargs)
-{
-       vnode_waitforwrites(vp, 0, 0, 0, "hfs freeze");
-
-       return 0;
-}
-
 /*
  * Control filesystem operating characteristics.
  */
 int
 hfs_vnop_ioctl( struct vnop_ioctl_args /* {
                vnode_t a_vp;
-               int  a_command;
+               long  a_command;
                caddr_t  a_data;
                int  a_fflag;
                vfs_context_t a_context;
@@ -1483,7 +1577,7 @@ hfs_vnop_ioctl( struct vnop_ioctl_args /* {
 #if CONFIG_PROTECT
        {
                int error = 0;
-               if ((error = cp_handle_vnop(VTOC(vp), CP_WRITE_ACCESS)) != 0) {
+               if ((error = cp_handle_vnop(vp, CP_WRITE_ACCESS, 0)) != 0) {
                        return error;
                }
        }
@@ -1498,6 +1592,7 @@ hfs_vnop_ioctl( struct vnop_ioctl_args /* {
                int  outlen;
                char *bufptr;
                int error;
+               int flags = 0;
 
                /* Caller must be owner of file system. */
                vfsp = vfs_statfs(HFSTOVFS(hfsmp));
@@ -1511,6 +1606,9 @@ hfs_vnop_ioctl( struct vnop_ioctl_args /* {
                }
                bufptr = (char *)ap->a_data;
                cnid = strtoul(bufptr, NULL, 10);
+               if (ap->a_fflag & HFS_GETPATH_VOLUME_RELATIVE) {
+                       flags |= BUILDPATH_VOLUME_RELATIVE; 
+               }
 
                /* We need to call hfs_vfs_vget to leverage the code that will
                 * fix the origin list for us if needed, as opposed to calling
@@ -1520,12 +1618,139 @@ hfs_vnop_ioctl( struct vnop_ioctl_args /* {
                if ((error = hfs_vfs_vget(HFSTOVFS(hfsmp), cnid, &file_vp, context))) {
                        return (error);
                }
-               error = build_path(file_vp, bufptr, sizeof(pathname_t), &outlen, 0, context);
+               error = build_path(file_vp, bufptr, sizeof(pathname_t), &outlen, flags, context);
                vnode_put(file_vp);
 
                return (error);
        }
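
For context, HFS_GETPATH is driven from user space through fsctl(2): the caller places the CNID as a decimal string in a MAXPATHLEN buffer and gets the resolved path back in the same buffer, with the fsctl options word surfacing here as ap->a_fflag (e.g. HFS_GETPATH_VOLUME_RELATIVE). A hypothetical illustration, assuming fsctl() and HFS_GETPATH are visible via <sys/fsctl.h> and <hfs/hfs_fsctl.h>:

#include <sys/param.h>
#include <sys/fsctl.h>
#include <hfs/hfs_fsctl.h>
#include <stdio.h>

static int
path_for_cnid(const char *volpath, unsigned long cnid, char path[MAXPATHLEN])
{
	/* Input: the CNID as a decimal string; output: the full path. */
	snprintf(path, MAXPATHLEN, "%lu", cnid);
	return fsctl(volpath, HFS_GETPATH, path, 0);
}
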
 
+       case HFS_TRANSFER_DOCUMENT_ID:
+       {
+               struct cnode *cp = NULL;
+               int error;
+               u_int32_t to_fd = *(u_int32_t *)ap->a_data;
+               struct fileproc *to_fp;
+               struct vnode *to_vp;
+               struct cnode *to_cp;
+
+               cp = VTOC(vp);
+
+               if ((error = fp_getfvp(p, to_fd, &to_fp, &to_vp)) != 0) {
+                       //printf("could not get the vnode for fd %d (err %d)\n", to_fd, error);
+                       return error;
+               }
+               if ( (error = vnode_getwithref(to_vp)) ) {
+                       file_drop(to_fd);
+                       return error;
+               }
+
+               if (VTOHFS(to_vp) != hfsmp) {
+                       error = EXDEV;
+                       goto transfer_cleanup;
+               }
+
+               int need_unlock = 1;
+               to_cp = VTOC(to_vp);
+               error = hfs_lockpair(cp, to_cp, HFS_EXCLUSIVE_LOCK);
+               if (error != 0) {
+                       //printf("could not lock the pair of cnodes (error %d)\n", error);
+                       goto transfer_cleanup;
+               }
+                       
+               if (!(cp->c_bsdflags & UF_TRACKED)) {
+                       error = EINVAL;
+               } else if (to_cp->c_bsdflags & UF_TRACKED) {
+                       //
+                       // if the destination is already tracked, return an error
+                       // as otherwise it's a silent deletion of the target's
+                       // document-id
+                       //
+                       error = EEXIST;
+               } else if (S_ISDIR(cp->c_attr.ca_mode) || S_ISREG(cp->c_attr.ca_mode) || S_ISLNK(cp->c_attr.ca_mode)) {
+                       //
+                       // we can use the FndrExtendedFileInfo because the doc-id is the first
+                       // thing in both it and the ExtendedDirInfo struct, which is fixed in
+                       // format and cannot change layout
+                       //
+                       struct FndrExtendedFileInfo *f_extinfo = (struct FndrExtendedFileInfo *)((u_int8_t*)cp->c_finderinfo + 16);
+                       struct FndrExtendedFileInfo *to_extinfo = (struct FndrExtendedFileInfo *)((u_int8_t*)to_cp->c_finderinfo + 16);
+
+                       if (f_extinfo->document_id == 0) {
+                               uint32_t new_id;
+
+                               hfs_unlockpair(cp, to_cp);  // have to unlock to be able to get a new-id
+                               
+                               if ((error = hfs_generate_document_id(hfsmp, &new_id)) == 0) {
+                                       //
+                                       // re-lock the pair now that we have the document-id
+                                       //
+                                       hfs_lockpair(cp, to_cp, HFS_EXCLUSIVE_LOCK);
+                                       f_extinfo->document_id = new_id;
+                               } else {
+                                       goto transfer_cleanup;
+                               }
+                       }
+                                       
+                       to_extinfo->document_id = f_extinfo->document_id;
+                       f_extinfo->document_id = 0;
+                       //printf("TRANSFERRING: doc-id %d from ino %d to ino %d\n", to_extinfo->document_id, cp->c_fileid, to_cp->c_fileid);
+
+                       // make sure the destination is also UF_TRACKED
+                       to_cp->c_bsdflags |= UF_TRACKED;
+                       cp->c_bsdflags &= ~UF_TRACKED;
+
+                       // mark the cnodes dirty
+                       cp->c_flag |= C_MODIFIED;
+                       to_cp->c_flag |= C_MODIFIED;
+
+                       int lockflags;
+                       if ((error = hfs_start_transaction(hfsmp)) == 0) {
+
+                               lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_EXCLUSIVE_LOCK);
+
+                               (void) cat_update(hfsmp, &cp->c_desc, &cp->c_attr, NULL, NULL);
+                               (void) cat_update(hfsmp, &to_cp->c_desc, &to_cp->c_attr, NULL, NULL);
+
+                               hfs_systemfile_unlock (hfsmp, lockflags);
+                               (void) hfs_end_transaction(hfsmp);
+                       }
+
+#if CONFIG_FSE
+                       add_fsevent(FSE_DOCID_CHANGED, context,
+                                   FSE_ARG_DEV,   hfsmp->hfs_raw_dev,
+                                   FSE_ARG_INO,   (ino64_t)cp->c_fileid,       // src inode #
+                                   FSE_ARG_INO,   (ino64_t)to_cp->c_fileid,    // dst inode #
+                                   FSE_ARG_INT32, to_extinfo->document_id,
+                                   FSE_ARG_DONE);
+
+                       hfs_unlockpair(cp, to_cp);    // unlock this so we can send the fsevents
+                       need_unlock = 0;
+
+                       if (need_fsevent(FSE_STAT_CHANGED, vp)) {
+                               add_fsevent(FSE_STAT_CHANGED, context, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
+                       }
+                       if (need_fsevent(FSE_STAT_CHANGED, to_vp)) {
+                               add_fsevent(FSE_STAT_CHANGED, context, FSE_ARG_VNODE, to_vp, FSE_ARG_DONE);
+                       }
+#else
+                       hfs_unlockpair(cp, to_cp);    // unlock this so we can send the fsevents
+                       need_unlock = 0;
+#endif
+               }
+               
+               if (need_unlock) {
+                       hfs_unlockpair(cp, to_cp);
+               }
+
+       transfer_cleanup:
+               vnode_put(to_vp);
+               file_drop(to_fd);
+
+               return error;
+       }
+
+
+
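
The new HFS_TRANSFER_DOCUMENT_ID fsctl moves the document-id (and the UF_TRACKED flag) from the file the call is issued on to the file named by the descriptor passed in a_data. A hypothetical user-space invocation, assuming the command is reachable through ffsctl(2) as declared in <sys/fsctl.h>:

#include <sys/types.h>
#include <sys/fsctl.h>
#include <hfs/hfs_fsctl.h>

static int
transfer_document_id(int src_fd, int dst_fd)
{
	u_int32_t to_fd = (u_int32_t)dst_fd;

	/* The source is the file ffsctl() is issued on; the payload names the
	 * destination file descriptor on the same volume. */
	return ffsctl(src_fd, HFS_TRANSFER_DOCUMENT_ID, &to_fd, 0);
}
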
        case HFS_PREV_LINK:
        case HFS_NEXT_LINK:
        {
@@ -1580,6 +1805,7 @@ hfs_vnop_ioctl( struct vnop_ioctl_args /* {
        case HFS_RESIZE_VOLUME: {
                u_int64_t newsize;
                u_int64_t cursize;
+               int ret;
 
                vfsp = vfs_statfs(HFSTOVFS(hfsmp));
                if (suser(cred, NULL) &&
@@ -1596,14 +1822,18 @@ hfs_vnop_ioctl( struct vnop_ioctl_args /* {
                }
                newsize = *(u_int64_t *)ap->a_data;
                cursize = (u_int64_t)hfsmp->totalBlocks * (u_int64_t)hfsmp->blockSize;
-               
+
+               if (newsize == cursize) {
+                       return (0);
+               }
+               IOBSDMountChange(hfsmp->hfs_mp, kIOMountChangeWillResize);
                if (newsize > cursize) {
-                       return hfs_extendfs(hfsmp, *(u_int64_t *)ap->a_data, context);
-               } else if (newsize < cursize) {
-                       return hfs_truncatefs(hfsmp, *(u_int64_t *)ap->a_data, context);
+                       ret = hfs_extendfs(hfsmp, *(u_int64_t *)ap->a_data, context);
                } else {
-                       return (0);
+                       ret = hfs_truncatefs(hfsmp, *(u_int64_t *)ap->a_data, context);
                }
+               IOBSDMountChange(hfsmp->hfs_mp, kIOMountChangeDidResize);
+               return (ret);
        }
        case HFS_CHANGE_NEXT_ALLOCATION: {
                int error = 0;          /* Assume success */
@@ -1620,7 +1850,7 @@ hfs_vnop_ioctl( struct vnop_ioctl_args /* {
                if (!vnode_isvroot(vp)) {
                        return (EINVAL);
                }
-               HFS_MOUNT_LOCK(hfsmp, TRUE);
+               hfs_lock_mount(hfsmp);
                location = *(u_int32_t *)ap->a_data;
                if ((location >= hfsmp->allocLimit) &&
                        (location != HFS_NO_UPDATE_NEXT_ALLOCATION)) {
@@ -1644,7 +1874,7 @@ hfs_vnop_ioctl( struct vnop_ioctl_args /* {
                }
                MarkVCBDirty(hfsmp);
 fail_change_next_allocation:
-               HFS_MOUNT_UNLOCK(hfsmp, TRUE);
+               hfs_unlock_mount(hfsmp);
                return (error);
        }
 
@@ -1697,22 +1927,13 @@ fail_change_next_allocation:
                vnode_ref(bsfs_rootvp);
                vnode_put(bsfs_rootvp);
 
+               hfs_lock_mount(hfsmp);
                hfsmp->hfs_backingfs_rootvp = bsfs_rootvp;
-
                hfsmp->hfs_flags |= HFS_HAS_SPARSE_DEVICE;
-               /* The free extent cache is managed differently for sparse devices.  
-                * There is a window between which the volume is mounted and the 
-                * device is marked as sparse, so the free extent cache for this 
-                * volume is currently initialized as normal volume (sorted by block 
-                * count).  Reset the cache so that it will be rebuilt again 
-                * for sparse device (sorted by start block).
-                */
-               ResetVCBFreeExtCache(hfsmp);
+               hfsmp->hfs_sparsebandblks = bsdata->bandsize / hfsmp->blockSize * 4;
+               hfs_unlock_mount(hfsmp);
 
-               hfsmp->hfs_sparsebandblks = bsdata->bandsize / HFSTOVCB(hfsmp)->blockSize;
-               hfsmp->hfs_sparsebandblks *= 4;
-
-               vfs_markdependency(hfsmp->hfs_mp);
+               /* We check the MNTK_VIRTUALDEV bit instead of marking the dependent process */
 
                /*
                 * If the sparse image is on a sparse image file (as opposed to a sparse
@@ -1733,6 +1954,15 @@ fail_change_next_allocation:
                        }
                }
                                
+               /* The free extent cache is managed differently for sparse devices.  
+                * There is a window between which the volume is mounted and the 
+                * device is marked as sparse, so the free extent cache for this 
+                * volume is currently initialized as normal volume (sorted by block 
+                * count).  Reset the cache so that it will be rebuilt again 
+                * for sparse device (sorted by start block).
+                */
+               ResetVCBFreeExtCache(hfsmp);
+
                (void)vnode_put(di_vp);
                file_drop(bsdata->backingfd);
                return (0);
@@ -1752,102 +1982,89 @@ fail_change_next_allocation:
                if ((hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) &&
                    hfsmp->hfs_backingfs_rootvp) {
 
+                       hfs_lock_mount(hfsmp);
                        hfsmp->hfs_flags &= ~HFS_HAS_SPARSE_DEVICE;
                        tmpvp = hfsmp->hfs_backingfs_rootvp;
                        hfsmp->hfs_backingfs_rootvp = NULLVP;
                        hfsmp->hfs_sparsebandblks = 0;
+                       hfs_unlock_mount(hfsmp);
+
                        vnode_rele(tmpvp);
                }
                return (0);
        }
 #endif /* HFS_SPARSE_DEV */
 
-       case F_FREEZE_FS: {
-               struct mount *mp;
-               mp = vnode_mount(vp);
-               hfsmp = VFSTOHFS(mp);
+       /* Change the next CNID stored in the VH */
+       case HFS_CHANGE_NEXTCNID: {
+               int error = 0;          /* Assume success */
+               u_int32_t fileid;
+               int wraparound = 0;
+               int lockflags = 0;
 
-               if (!(hfsmp->jnl))
-                       return (ENOTSUP);
+               if (vnode_vfsisrdonly(vp)) {
+                       return (EROFS);
+               }
+               vfsp = vfs_statfs(HFSTOVFS(hfsmp));
+               if (suser(cred, NULL) &&
+                       kauth_cred_getuid(cred) != vfsp->f_owner) {
+                       return (EACCES); /* must be owner of file system */
+               }
+               
+               fileid = *(u_int32_t *)ap->a_data;
 
-               vfsp = vfs_statfs(mp);
-       
-               if (kauth_cred_getuid(cred) != vfsp->f_owner &&
-                       !kauth_cred_issuser(cred))
-                       return (EACCES);
+               /* Must have catalog lock excl. to advance the CNID pointer */
+               lockflags = hfs_systemfile_lock (hfsmp, SFL_CATALOG , HFS_EXCLUSIVE_LOCK);
 
-               lck_rw_lock_exclusive(&hfsmp->hfs_insync);
-               // flush things before we get started to try and prevent
-               // dirty data from being paged out while we're frozen.
-               // note: can't do this after taking the lock as it will
-               // deadlock against ourselves.
-               vnode_iterate(mp, 0, hfs_freezewrite_callback, NULL);
-               hfs_lock_global (hfsmp, HFS_EXCLUSIVE_LOCK);
-
-               // DO NOT call hfs_journal_flush() because that takes a
-               // shared lock on the global exclusive lock!
-               journal_flush(hfsmp->jnl, TRUE);
-
-               // don't need to iterate on all vnodes, we just need to
-               // wait for writes to the system files and the device vnode
-               //
-               // Now that journal flush waits for all metadata blocks to 
-               // be written out, waiting for btree writes is probably no
-               // longer required.
-               if (HFSTOVCB(hfsmp)->extentsRefNum)
-                   vnode_waitforwrites(HFSTOVCB(hfsmp)->extentsRefNum, 0, 0, 0, "hfs freeze");
-               if (HFSTOVCB(hfsmp)->catalogRefNum)
-                   vnode_waitforwrites(HFSTOVCB(hfsmp)->catalogRefNum, 0, 0, 0, "hfs freeze");
-               if (HFSTOVCB(hfsmp)->allocationsRefNum)
-                   vnode_waitforwrites(HFSTOVCB(hfsmp)->allocationsRefNum, 0, 0, 0, "hfs freeze");
-               if (hfsmp->hfs_attribute_vp)
-                   vnode_waitforwrites(hfsmp->hfs_attribute_vp, 0, 0, 0, "hfs freeze");
-               vnode_waitforwrites(hfsmp->hfs_devvp, 0, 0, 0, "hfs freeze");
-
-               hfsmp->hfs_freezing_proc = current_proc();
+               hfs_lock_mount(hfsmp);
 
-               return (0);
+               /* If it is less than the current next CNID, force the wraparound bit to be set */
+               if (fileid < hfsmp->vcbNxtCNID) {
+                       wraparound=1;
+               }
+
+               /* Return previous value. */
+               *(u_int32_t *)ap->a_data = hfsmp->vcbNxtCNID;
+
+               hfsmp->vcbNxtCNID = fileid;
+
+               if (wraparound) {
+                       hfsmp->vcbAtrb |= kHFSCatalogNodeIDsReusedMask;
+               }
+               
+               MarkVCBDirty(hfsmp);
+               hfs_unlock_mount(hfsmp);
+               hfs_systemfile_unlock (hfsmp, lockflags);
+
+               return (error);
        }
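
A minimal user-space sketch of driving this selector through fsctl(2), assuming the HFS_CHANGE_NEXTCNID definition from hfs_fsctl.h (installed as <hfs/hfs_fsctl.h> on the assumption that path is still correct) is visible to the caller. As the case above shows, the kernel writes the previous next-CNID back into the same word, and the caller must be the volume owner or superuser.

#include <sys/fsctl.h>          /* fsctl(2) prototype */
#include <hfs/hfs_fsctl.h>      /* assumed location of HFS_CHANGE_NEXTCNID */
#include <stdint.h>
#include <stdio.h>

/* Hypothetical helper: set the volume's next CNID and report the old one. */
static int set_next_cnid(const char *mount_point, uint32_t new_cnid)
{
    uint32_t cnid = new_cnid;

    if (fsctl(mount_point, HFS_CHANGE_NEXTCNID, &cnid, 0) == -1) {
        perror("fsctl(HFS_CHANGE_NEXTCNID)");
        return -1;
    }
    /* On success the kernel stored the previous next-CNID in 'cnid'. */
    printf("previous next CNID was %u\n", cnid);
    return 0;
}
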
+       
+       case F_FREEZE_FS: {
+               struct mount *mp;
+               mp = vnode_mount(vp);
+               hfsmp = VFSTOHFS(mp);
 
-       case F_THAW_FS: {
-               vfsp = vfs_statfs(vnode_mount(vp));
+               if (!(hfsmp->jnl))
+                       return (ENOTSUP);
+
+               vfsp = vfs_statfs(mp);
+       
                if (kauth_cred_getuid(cred) != vfsp->f_owner &&
                        !kauth_cred_issuser(cred))
                        return (EACCES);
 
-               // if we're not the one who froze the fs then we
-               // can't thaw it.
-               if (hfsmp->hfs_freezing_proc != current_proc()) {
-                   return EPERM;
-               }
-
-               // NOTE: if you add code here, also go check the
-               //       code that "thaws" the fs in hfs_vnop_close()
-               //
-               hfsmp->hfs_freezing_proc = NULL;
-               hfs_unlock_global (hfsmp);
-               lck_rw_unlock_exclusive(&hfsmp->hfs_insync);
-
-               return (0);
+               return hfs_freeze(hfsmp);
        }
 
-       case HFS_BULKACCESS_FSCTL: {
-           int size;
-           
-           if (hfsmp->hfs_flags & HFS_STANDARD) {
-               return EINVAL;
-           }
+       case F_THAW_FS: {
+               vfsp = vfs_statfs(vnode_mount(vp));
+               if (kauth_cred_getuid(cred) != vfsp->f_owner &&
+                       !kauth_cred_issuser(cred))
+                       return (EACCES);
 
-           if (is64bit) {
-               size = sizeof(struct user64_access_t);
-           } else {
-               size = sizeof(struct user32_access_t);
-           }
-           
-           return do_bulk_access_check(hfsmp, vp, ap, size, context);
-       } 
+               return hfs_thaw(hfsmp, current_proc());
+       }
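
Roughly how a privileged user-space tool might pair these two operations; this is a sketch only, assuming the private F_FREEZE_FS/F_THAW_FS selectors are present in the system's <sys/fcntl.h>. Per the case above, the thaw is routed to hfs_thaw() with current_proc(), so it must come from the same process that issued the freeze.

#include <fcntl.h>
#include <stdio.h>

/* Hypothetical sketch: freeze a journaled HFS+ volume, do some work, thaw it.
 * 'volfd' is an open descriptor on the volume (e.g. its mount point). */
static int with_frozen_volume(int volfd, int (*work)(void))
{
    if (fcntl(volfd, F_FREEZE_FS, 0) == -1) {   /* owner or superuser only */
        perror("F_FREEZE_FS");
        return -1;
    }
    int rc = work();
    /* Must be thawed by the same process that issued the freeze. */
    if (fcntl(volfd, F_THAW_FS, 0) == -1) {
        perror("F_THAW_FS");
        rc = -1;
    }
    return rc;
}
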
 
        case HFS_EXT_BULKACCESS_FSCTL: {
            int size;
@@ -1884,7 +2101,7 @@ fail_change_next_allocation:
                 * are enabled by default, so any change will be transient only 
                 * till the volume is remounted.
                 */
-               if (!is_suser()) {
+               if (!kauth_cred_issuser(kauth_cred_get())) {
                        return (EPERM);
                }
                if (state == 0 || state == 1)
@@ -1893,15 +2110,284 @@ fail_change_next_allocation:
                        return (EINVAL);        
        }
 
+       case F_SETSTATICCONTENT: {
+               int error;
+               int enable_static = 0;
+               struct cnode *cp = NULL;
+               /* 
+                * lock the cnode, decorate the cnode flag, and bail out.
+                * VFS should have already authenticated the caller for us.
+                */
+
+               if (ap->a_data) {
+                       /* 
+                        * Note that even though ap->a_data is of type caddr_t,
+                        * the fcntl layer at the syscall handler passes in NULL
+                        * or 1, depending on the argument supplied to the fcntl.
+                        * It is therefore correct to treat ap->a_data as a
+                        * zero/non-zero flag when deciding whether to enable
+                        * the static bit in the cnode.
+                        */
+                       enable_static = 1;
+               }
+               if (hfsmp->hfs_flags & HFS_READ_ONLY) {
+                       return EROFS;
+               }
+               cp = VTOC(vp);
+
+               error = hfs_lock (cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
+               if (error == 0) {
+                       if (enable_static) {
+                               cp->c_flag |= C_SSD_STATIC;
+                       }
+                       else {
+                               cp->c_flag &= ~C_SSD_STATIC;
+                       }
+                       hfs_unlock (cp);
+               }
+               return error;
+       }
+
+       case F_SET_GREEDY_MODE: {
+               int error;
+               int enable_greedy_mode = 0;
+               struct cnode *cp = NULL;
+               /* 
+                * lock the cnode, decorate the cnode flag, and bail out.
+                * VFS should have already authenticated the caller for us.
+                */
+
+               if (ap->a_data) {
+                       /* 
+                        * Note that even though ap->a_data is of type caddr_t,
+                        * the fcntl layer at the syscall handler passes in NULL
+                        * or 1, depending on the argument supplied to the fcntl.
+                        * It is therefore correct to treat ap->a_data as a
+                        * zero/non-zero flag when deciding whether to enable
+                        * the greedy mode bit in the cnode.
+                        */
+                       enable_greedy_mode = 1;
+               }
+               if (hfsmp->hfs_flags & HFS_READ_ONLY) {
+                       return EROFS;
+               }
+               cp = VTOC(vp);
+
+               error = hfs_lock (cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
+               if (error == 0) {
+                       if (enable_greedy_mode) {
+                               cp->c_flag |= C_SSD_GREEDY_MODE;
+                       }
+                       else {
+                               cp->c_flag &= ~C_SSD_GREEDY_MODE;
+                       }
+                       hfs_unlock (cp);
+               }
+               return error;
+       }
+
+       case F_SETIOTYPE: {
+               int error;
+               uint32_t iotypeflag = 0;
+               
+               struct cnode *cp = NULL;
+               /* 
+                * lock the cnode, decorate the cnode flag, and bail out.
+                * VFS should have already authenticated the caller for us.
+                */
+
+               if (ap->a_data == NULL) {
+                       return EINVAL;
+               }
+
+               /* 
+                * Note that even though ap->a_data is of type caddr_t, we
+                * can only use 32 bits of flag values.
+                */
+               iotypeflag = (uint32_t) ap->a_data;
+               switch (iotypeflag) {
+                       case F_IOTYPE_ISOCHRONOUS:
+                               break;
+                       default:
+                               return EINVAL;
+               }
+
+
+               if (hfsmp->hfs_flags & HFS_READ_ONLY) {
+                       return EROFS;
+               }
+               cp = VTOC(vp);
+
+               error = hfs_lock (cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
+               if (error == 0) {
+                       switch (iotypeflag) {
+                               case F_IOTYPE_ISOCHRONOUS:
+                                       cp->c_flag |= C_IO_ISOCHRONOUS;
+                                       break;
+                               default:
+                                       break;
+                       }
+                       hfs_unlock (cp);
+               }
+               return error;
+       }
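
The three fcntls above (F_SETSTATICCONTENT, F_SET_GREEDY_MODE, F_SETIOTYPE) all follow the same pattern: lock the cnode, set or clear one c_flag bit, unlock. A hedged user-space sketch of tagging an open file, assuming these private selectors and F_IOTYPE_ISOCHRONOUS are defined in the system's <sys/fcntl.h>:

#include <fcntl.h>
#include <stdio.h>

/* Hypothetical sketch: decorate an open file with the SSD/IO hints that
 * hfs_vnop_ioctl translates into cnode flags, as shown above. */
static void tag_file_for_io(int fd)
{
    /* Non-zero argument sets C_SSD_STATIC; zero clears it. */
    if (fcntl(fd, F_SETSTATICCONTENT, 1) == -1)
        perror("F_SETSTATICCONTENT");

    /* Non-zero argument sets C_SSD_GREEDY_MODE; zero clears it. */
    if (fcntl(fd, F_SET_GREEDY_MODE, 1) == -1)
        perror("F_SET_GREEDY_MODE");

    /* Only F_IOTYPE_ISOCHRONOUS is accepted; it sets C_IO_ISOCHRONOUS. */
    if (fcntl(fd, F_SETIOTYPE, F_IOTYPE_ISOCHRONOUS) == -1)
        perror("F_SETIOTYPE");
}
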
+
+       case F_MAKECOMPRESSED: {
+               int error = 0;
+               uint32_t gen_counter;
+               struct cnode *cp = NULL;
+               int reset_decmp = 0;
+
+               if (hfsmp->hfs_flags & HFS_READ_ONLY) {
+                       return EROFS;
+               }
+
+               /* 
+                * acquire & lock the cnode.
+                * VFS should have already authenticated the caller for us.
+                */
+
+               if (ap->a_data) {
+                       /* 
+                        * Cast the pointer into a uint32_t so we can extract the 
+                        * supplied generation counter.
+                        */
+                       gen_counter = *((uint32_t*)ap->a_data);
+               }
+               else {
+                       return EINVAL;
+               }
+
+#if HFS_COMPRESSION
+               cp = VTOC(vp);
+               /* Grab truncate lock first; we may truncate the file */
+               hfs_lock_truncate (cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
+
+               error = hfs_lock (cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
+               if (error) {
+                       hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
+                       return error;
+               }
+
+               /* Are there any other usecounts/FDs? */
+               if (vnode_isinuse(vp, 1)) {
+                       hfs_unlock(cp);
+                       hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
+                       return EBUSY;
+               }
+
+               /* now we have the cnode locked down; Validate arguments */
+               if (cp->c_attr.ca_flags & (UF_IMMUTABLE | UF_COMPRESSED)) {
+                       /* EINVAL if you are trying to manipulate an IMMUTABLE file */
+                       hfs_unlock(cp);
+                       hfs_unlock_truncate (cp, HFS_LOCK_DEFAULT);
+                       return EINVAL;
+               }
+
+               if ((hfs_get_gencount (cp)) == gen_counter) {
+                       /* 
+                        * OK, the gen_counter matched.  Go for it:
+                        * Toggle state bits, truncate file, and suppress mtime update 
+                        */
+                       reset_decmp = 1;
+                       cp->c_bsdflags |= UF_COMPRESSED;                                
+
+                       error = hfs_truncate(vp, 0, IO_NDELAY, HFS_TRUNCATE_SKIPTIMES,
+                                                                ap->a_context);
+               }
+               else {
+                       error = ESTALE;
+               }
+
+               /* Unlock the cnode before calling into decmpfs; it may need to fetch an EA */
+               hfs_unlock(cp);
+
+               /*
+                * Reset the decmp state while still holding the truncate lock. We need to 
+                * serialize here against a listxattr on this node which may occur at any 
+                * time. 
+                * 
+                * Even if '0' (skiplock) is passed as the 2nd argument to hfs_file_is_compressed,
+                * it may still need to fetch the com.apple.decmpfs EA.  When the EA is required,
+                * we cannot hold the cnode lock, because the getxattr call is generic (it goes
+                * through VFS) and has no way of knowing that we already hold the lock.  If we
+                * don't serialize, a concurrent listxattr could try to fill in the
+                * hfs_file_is_compressed info during its callback and deadlock against the
+                * b-tree node.
+                * 
+                * So, to serialize against listxattr (which will grab buf_t meta references on
+                * the b-tree blocks), we hold the truncate lock as we're manipulating the 
+                * decmpfs payload. 
+                */
+               if ((reset_decmp) && (error == 0)) {
+                       decmpfs_cnode *dp = VTOCMP (vp);
+                       if (dp != NULL) {
+                               decmpfs_cnode_set_vnode_state(dp, FILE_TYPE_UNKNOWN, 0);
+                       }
+
+                       /* Initialize the decmpfs node as needed */
+                       (void) hfs_file_is_compressed (cp, 0); /* ok to take lock */
+               }
+
+               hfs_unlock_truncate (cp, HFS_LOCK_DEFAULT);
+
+#endif
+               return error;
+       }
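
A hedged sketch of the user-space side of this handshake. The selector name and the pointer-argument convention are assumptions (the *(uint32_t *)ap->a_data dereference above suggests a pointer to the generation count is passed through); the generation count itself would have to be sampled elsewhere before the com.apple.decmpfs EA is written, so that the kernel can return ESTALE if the file changed in the meantime.

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <errno.h>

/* Hypothetical sketch: ask the kernel to flip an already-prepared file to
 * compressed state.  'gen' is the generation count sampled before the
 * com.apple.decmpfs EA was written.  The file must have no other users,
 * or the call fails with EBUSY (per the vnode_isinuse check above). */
static int make_compressed(int fd, uint32_t gen)
{
    uint32_t gen_counter = gen;

    if (fcntl(fd, F_MAKECOMPRESSED, &gen_counter) == -1) {
        if (errno == ESTALE)
            fprintf(stderr, "file changed since the EA was written; retry\n");
        return -1;
    }
    return 0;   /* UF_COMPRESSED is set and the data fork was truncated */
}
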
+
+       case F_SETBACKINGSTORE: {
+
+               int error = 0;
+
+               /* 
+                * See comment in F_SETSTATICCONTENT re: using
+                * a null check for a_data
+                */
+               if (ap->a_data) {
+                       error = hfs_set_backingstore (vp, 1);
+               }
+               else {
+                       error = hfs_set_backingstore (vp, 0);
+               }               
+
+               return error;
+       }
+
+       case F_GETPATH_MTMINFO: {
+               int error = 0;
+
+               int *data = (int*) ap->a_data;  
+
+               /* Ask if this is a backingstore vnode */
+               error = hfs_is_backingstore (vp, data);
+
+               return error;
+       }
+
        case F_FULLFSYNC: {
                int error;
                
                if (hfsmp->hfs_flags & HFS_READ_ONLY) {
                        return (EROFS);
                }
-               error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK);
+               error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
                if (error == 0) {
-                       error = hfs_fsync(vp, MNT_WAIT, TRUE, p);
+                       error = hfs_fsync(vp, MNT_WAIT, HFS_FSYNC_FULL, p);
+                       hfs_unlock(VTOC(vp));
+               }
+
+               return error;
+       }
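
Both this case and the F_BARRIERFSYNC case added just below reduce to hfs_fsync() with a different flush level. From user space they are ordinary fcntls on an open file: F_FULLFSYNC is the documented way to force data to stable storage, and F_BARRIERFSYNC (where the header defines it) is the cheaper barrier variant. A small sketch:

#include <fcntl.h>
#include <unistd.h>
#include <stdio.h>

/* Flush 'fd' to stable storage.  Falls back from the barrier variant to a
 * full flush if the barrier fcntl is unavailable or unsupported. */
static int flush_durably(int fd, int need_full_flush)
{
    if (!need_full_flush) {
#ifdef F_BARRIERFSYNC
        if (fcntl(fd, F_BARRIERFSYNC, 0) == 0)
            return 0;
#endif
        /* fall through to the stronger flush */
    }
    if (fcntl(fd, F_FULLFSYNC, 0) == -1) {
        perror("F_FULLFSYNC");
        return fsync(fd);   /* last resort: ordinary fsync */
    }
    return 0;
}
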
+
+       case F_BARRIERFSYNC: {
+               int error;
+
+               if (hfsmp->hfs_flags & HFS_READ_ONLY) {
+                       return (EROFS);
+               }
+               error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
+               if (error == 0) {
+                       error = hfs_fsync(vp, MNT_WAIT, HFS_FSYNC_BARRIER, p);
                        hfs_unlock(VTOC(vp));
                }
 
@@ -1915,7 +2401,7 @@ fail_change_next_allocation:
                if (!vnode_isreg(vp))
                        return EINVAL;
  
-               error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK);
+               error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
                if (error == 0) {
                        cp = VTOC(vp);
                        /*
@@ -1941,7 +2427,7 @@ fail_change_next_allocation:
                fp = VTOF(vp);
 
                /* Protect against a size change. */
-               hfs_lock_truncate(VTOC(vp), HFS_EXCLUSIVE_LOCK);
+               hfs_lock_truncate(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
 
 #if HFS_COMPRESSION
                if (compressed && (uncompressed_size == -1)) {
@@ -1960,92 +2446,10 @@ fail_change_next_allocation:
                        error = advisory_read(vp, fp->ff_size, ra->ra_offset, ra->ra_count);
                }
 
-               hfs_unlock_truncate(VTOC(vp), 0);
+               hfs_unlock_truncate(VTOC(vp), HFS_LOCK_DEFAULT);
                return (error);
        }
 
-       case F_READBOOTSTRAP:
-       case F_WRITEBOOTSTRAP:
-       {
-           struct vnode *devvp = NULL;
-           user_fbootstraptransfer_t *user_bootstrapp;
-           int devBlockSize;
-           int error;
-           uio_t auio;
-           daddr64_t blockNumber;
-           u_int32_t blockOffset;
-           u_int32_t xfersize;
-           struct buf *bp;
-           user_fbootstraptransfer_t user_bootstrap;
-
-               if (!vnode_isvroot(vp))
-                       return (EINVAL);
-               /* LP64 - when caller is a 64 bit process then we are passed a pointer 
-                * to a user_fbootstraptransfer_t else we get a pointer to a 
-                * fbootstraptransfer_t which we munge into a user_fbootstraptransfer_t
-                */
-               if ((hfsmp->hfs_flags & HFS_READ_ONLY)
-                       && (ap->a_command == F_WRITEBOOTSTRAP)) {
-                       return (EROFS);
-               }
-               if (is64bit) {
-                       user_bootstrapp = (user_fbootstraptransfer_t *)ap->a_data;
-               }
-               else {
-                       user32_fbootstraptransfer_t *bootstrapp = (user32_fbootstraptransfer_t *)ap->a_data;
-                       user_bootstrapp = &user_bootstrap;
-                       user_bootstrap.fbt_offset = bootstrapp->fbt_offset;
-                       user_bootstrap.fbt_length = bootstrapp->fbt_length;
-                       user_bootstrap.fbt_buffer = CAST_USER_ADDR_T(bootstrapp->fbt_buffer);
-               }
-
-               if ((user_bootstrapp->fbt_offset < 0) || (user_bootstrapp->fbt_offset > 1024) || 
-                               (user_bootstrapp->fbt_length > 1024)) {
-                       return EINVAL;
-               }
-
-               if (user_bootstrapp->fbt_offset + user_bootstrapp->fbt_length > 1024) 
-                       return EINVAL;
-           
-               devvp = VTOHFS(vp)->hfs_devvp;
-               auio = uio_create(1, user_bootstrapp->fbt_offset, 
-                                                 is64bit ? UIO_USERSPACE64 : UIO_USERSPACE32,
-                                                 (ap->a_command == F_WRITEBOOTSTRAP) ? UIO_WRITE : UIO_READ);
-               uio_addiov(auio, user_bootstrapp->fbt_buffer, user_bootstrapp->fbt_length);
-
-           devBlockSize = vfs_devblocksize(vnode_mount(vp));
-
-           while (uio_resid(auio) > 0) {
-                       blockNumber = uio_offset(auio) / devBlockSize;
-                       error = (int)buf_bread(devvp, blockNumber, devBlockSize, cred, &bp);
-                       if (error) {
-                               if (bp) buf_brelse(bp);
-                               uio_free(auio);
-                               return error;
-                       };
-
-                       blockOffset = uio_offset(auio) % devBlockSize;
-                       xfersize = devBlockSize - blockOffset;
-                       error = uiomove((caddr_t)buf_dataptr(bp) + blockOffset, (int)xfersize, auio);
-                       if (error) {
-                               buf_brelse(bp);
-                               uio_free(auio);
-                               return error;
-                       };
-                       if (uio_rw(auio) == UIO_WRITE) {
-                               error = VNOP_BWRITE(bp);
-                               if (error) {
-                                       uio_free(auio);
-                       return error;
-                               }
-                       } else {
-                               buf_brelse(bp);
-                       };
-               };
-               uio_free(auio);
-       };
-       return 0;
-
        case _IOC(IOC_OUT,'h', 4, 0):     /* Create date in local time */
        {
                if (is64bit) {
@@ -2065,6 +2469,10 @@ fail_change_next_allocation:
            *(uint32_t *)ap->a_data = hfsmp->hfs_last_mounted_mtime;
            break;
 
+       case HFS_FSCTL_GET_VERY_LOW_DISK:
+           *(uint32_t*)ap->a_data = hfsmp->hfs_freespace_notify_dangerlimit;
+           break;
+
        case HFS_FSCTL_SET_VERY_LOW_DISK:
            if (*(uint32_t *)ap->a_data >= hfsmp->hfs_freespace_notify_warninglimit) {
                return EINVAL;
@@ -2073,6 +2481,10 @@ fail_change_next_allocation:
            hfsmp->hfs_freespace_notify_dangerlimit = *(uint32_t *)ap->a_data;
            break;
 
+       case HFS_FSCTL_GET_LOW_DISK:
+           *(uint32_t*)ap->a_data = hfsmp->hfs_freespace_notify_warninglimit;
+           break;
+
        case HFS_FSCTL_SET_LOW_DISK:
            if (   *(uint32_t *)ap->a_data >= hfsmp->hfs_freespace_notify_desiredlevel
                || *(uint32_t *)ap->a_data <= hfsmp->hfs_freespace_notify_dangerlimit) {
@@ -2083,6 +2495,10 @@ fail_change_next_allocation:
            hfsmp->hfs_freespace_notify_warninglimit = *(uint32_t *)ap->a_data;
            break;
 
+       case HFS_FSCTL_GET_DESIRED_DISK:
+           *(uint32_t*)ap->a_data = hfsmp->hfs_freespace_notify_desiredlevel;
+           break;
+
        case HFS_FSCTL_SET_DESIRED_DISK:
            if (*(uint32_t *)ap->a_data <= hfsmp->hfs_freespace_notify_warninglimit) {
                return EINVAL;
@@ -2103,18 +2519,18 @@ fail_change_next_allocation:
                if (hfsmp->hfs_flags & HFS_READ_ONLY) {
                        return (EROFS);
                }
-               HFS_MOUNT_LOCK(hfsmp, TRUE);
+               hfs_lock_mount (hfsmp);
                bcopy(ap->a_data, &hfsmp->vcbFndrInfo, sizeof(hfsmp->vcbFndrInfo));
-               HFS_MOUNT_UNLOCK(hfsmp, TRUE);
-               (void) hfs_flushvolumeheader(hfsmp, MNT_WAIT, 0);
+               hfs_unlock_mount (hfsmp);
+               (void) hfs_flushvolumeheader(hfsmp, HFS_FVH_WAIT);
                break;
                
        case HFS_GET_BOOT_INFO:
                if (!vnode_isvroot(vp))
                        return(EINVAL);
-               HFS_MOUNT_LOCK(hfsmp, TRUE);
+               hfs_lock_mount (hfsmp);
                bcopy(&hfsmp->vcbFndrInfo, ap->a_data, sizeof(hfsmp->vcbFndrInfo));
-               HFS_MOUNT_UNLOCK(hfsmp, TRUE);
+               hfs_unlock_mount(hfsmp);
                break;
 
        case HFS_MARK_BOOT_CORRUPT:
@@ -2122,7 +2538,7 @@ fail_change_next_allocation:
                 * kHFSVolumeInconsistentBit in the volume header.  This will 
                 * force fsck_hfs on next mount.
                 */
-               if (!is_suser()) {
+               if (!kauth_cred_issuser(kauth_cred_get())) {
                        return EACCES;
                }
                        
@@ -2134,54 +2550,276 @@ fail_change_next_allocation:
                if (hfsmp->hfs_flags & HFS_READ_ONLY) {
                        return (EROFS);
                }
-               printf ("hfs_vnop_ioctl: Marking the boot volume corrupt.\n");
-               hfs_mark_volume_inconsistent(hfsmp);
+               printf ("hfs_vnop_ioctl: Marking the boot volume corrupt.\n");
+               hfs_mark_inconsistent(hfsmp, HFS_FSCK_FORCED);
+               break;
+
+       case HFS_FSCTL_GET_JOURNAL_INFO:
+               jip = (struct hfs_journal_info*)ap->a_data;
+
+               if (vp == NULLVP)
+                       return EINVAL;
+
+           if (hfsmp->jnl == NULL) {
+                       jnl_start = 0;
+                       jnl_size  = 0;
+           } else {
+                       jnl_start = hfs_blk_to_bytes(hfsmp->jnl_start, hfsmp->blockSize) + hfsmp->hfsPlusIOPosOffset;
+                       jnl_size  = hfsmp->jnl_size;
+           }
+
+               jip->jstart = jnl_start;
+               jip->jsize = jnl_size;
+               break;
+
+       case HFS_SET_ALWAYS_ZEROFILL: {
+           struct cnode *cp = VTOC(vp);
+
+           if (*(int *)ap->a_data) {
+               cp->c_flag |= C_ALWAYS_ZEROFILL;
+           } else {
+               cp->c_flag &= ~C_ALWAYS_ZEROFILL;
+           }
+           break;
+       }    
+
+       case HFS_DISABLE_METAZONE: {
+               /* Only root can disable metadata zone */
+               if (!kauth_cred_issuser(kauth_cred_get())) {
+                       return EACCES;
+               }
+               if (hfsmp->hfs_flags & HFS_READ_ONLY) {
+                       return (EROFS);
+               }
+
+               /* Disable metadata zone now */
+               (void) hfs_metadatazone_init(hfsmp, true);
+               printf ("hfs: Disabling metadata zone on %s\n", hfsmp->vcbVN);
+               break;
+       }
+
+
+       case HFS_FSINFO_METADATA_BLOCKS: {
+               int error;
+               struct hfsinfo_metadata *hinfo;
+
+               hinfo = (struct hfsinfo_metadata *)ap->a_data;
+
+               /* Get information about number of metadata blocks */
+               error = hfs_getinfo_metadata_blocks(hfsmp, hinfo);
+               if (error) {
+                       return error;
+               }
+
+               break;
+       }
+
+       case HFS_GET_FSINFO: {
+               hfs_fsinfo *fsinfo = (hfs_fsinfo *)ap->a_data;
+
+               /* Only root is allowed to get fsinfo */
+               if (!kauth_cred_issuser(kauth_cred_get())) {
+                       return EACCES;
+               }
+
+               /*
+                * Make sure that the caller's version number matches the
+                * kernel's version number.  This ensures that if the kernel
+                * changes the structures being read/written, the caller will
+                * not read incorrect data.
+                *
+                * The first three fields --- request_type, version and
+                * flags --- are the same for all the hfs_fsinfo structures,
+                * so we can read the version number through any of them.
+                */
+               if (fsinfo->header.version != HFS_FSINFO_VERSION) {
+                       return ENOTSUP;
+               }
+
+               /* Make sure that the current file system is not marked inconsistent */
+               if (hfsmp->vcbAtrb & kHFSVolumeInconsistentMask) {
+                       return EIO;
+               }
+
+               return hfs_get_fsinfo(hfsmp, ap->a_data);
+       }
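
A hedged sketch of the caller's side of the version handshake described above. The hfs_fsinfo type and its header.version field appear in the case itself; the header.request_type field name, the HFS_FSINFO_VERSION constant, and the header install path are assumptions taken from the comment and from hfs_fsctl.h. The caller must be root.

#include <sys/fsctl.h>
#include <hfs/hfs_fsctl.h>      /* assumed location of hfs_fsinfo et al. */
#include <stdint.h>
#include <string.h>
#include <stdio.h>

/* Hypothetical sketch: query volume-wide fsinfo (root only). */
static int get_fsinfo(const char *mount_point, uint32_t request_type)
{
    hfs_fsinfo info;

    memset(&info, 0, sizeof(info));
    info.header.version = HFS_FSINFO_VERSION;   /* must match the kernel */
    info.header.request_type = request_type;

    if (fsctl(mount_point, HFS_GET_FSINFO, &info, 0) == -1) {
        perror("HFS_GET_FSINFO");   /* ENOTSUP on a version mismatch */
        return -1;
    }
    return 0;
}
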
+
+       case HFS_CS_FREESPACE_TRIM: {
+               int error = 0;
+               int lockflags = 0;
+
+               /* Only root allowed */
+               if (!kauth_cred_issuser(kauth_cred_get())) {
+                       return EACCES;
+               }
+
+               /* 
+                * The core functionality here is similar to hfs_scan_blocks().
+                * The main difference is that hfs_scan_blocks() is called
+                * during mount, where we are assured that the journal is
+                * empty to start with.  This fcntl() can be called on a
+                * mounted volume, so it also has to flush the journal and
+                * ensure that the summary table is in a consistent state.
+                *
+                * This fcntl scans over the entire allocation bitmap,
+                * builds a list of all the free blocks, and issues TRIMs
+                * down to the underlying device.  This can take a long time,
+                * as it can generate up to 512MB of read I/O.
+                */
+
+               if ((hfsmp->hfs_flags & HFS_SUMMARY_TABLE) == 0) {
+                       error = hfs_init_summary(hfsmp);
+                       if (error) {
+                               printf("hfs: fsctl() could not initialize summary table for %s\n", hfsmp->vcbVN);
+                               return error;
+                       }
+               }
+
+               /* 
+                * The journal maintains a list of recently deallocated blocks so
+                * that DKIOCUNMAPs can be issued when the corresponding journal
+                * transaction is flushed to disk.  To avoid race conditions, we
+                * only want one active trim list and only one thread issuing
+                * DKIOCUNMAPs.  Therefore we make sure that the journal trim list
+                * is synced, empty, and not modifiable for the duration of our scan.
+                *
+                * Take the journal lock before flushing the journal to disk.  We
+                * keep holding the journal lock until we have acquired the bitmap
+                * lock, so that no new journal transactions can start.  This
+                * guarantees that the journal trim list is not modified between
+                * the journal flush and the acquisition of the bitmap lock.  Once
+                * we hold the bitmap lock, the journal lock can be released, since
+                * the bitmap lock prevents any further block deallocations.
+                */
+               hfs_journal_lock(hfsmp);
+
+               /* Flush the journal and wait for all I/Os to finish up */
+               error = hfs_flush(hfsmp, HFS_FLUSH_JOURNAL_META);
+               if (error) {
+                       hfs_journal_unlock(hfsmp);
+                       return error;
+               }
+
+               /* Take bitmap lock to ensure it is not being modified */
+               lockflags = hfs_systemfile_lock(hfsmp, SFL_BITMAP, HFS_EXCLUSIVE_LOCK);
+
+               /* Release the journal lock */
+               hfs_journal_unlock(hfsmp);
+
+               /* 
+                * ScanUnmapBlocks reads the bitmap in large blocks
+                * (up to 1MB), unlike the runtime code, which reads the
+                * bitmap in 4K blocks.  This can cause buf_t collisions
+                * and potential data corruption.  To avoid this, we
+                * invalidate all the existing buffers associated with
+                * the bitmap vnode before scanning it.
+                *
+                * Note: ScanUnmapBlocks() cleans up all of its buffers
+                * after itself, so there won't be any large buffers left
+                * for us to clean up after it returns.
+                */
+               error = buf_invalidateblks(hfsmp->hfs_allocation_vp, 0, 0, 0);
+               if (error) {
+                       hfs_systemfile_unlock(hfsmp, lockflags);
+                       return error;
+               }
+
+               /* Traverse bitmap and issue DKIOCUNMAPs */
+               error = ScanUnmapBlocks(hfsmp);
+               hfs_systemfile_unlock(hfsmp, lockflags);
+               if (error) {
+                       return error;
+               }
+
                break;
+       }
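
From user space this is a single, root-only fsctl against the mount point; all of the journal and bitmap choreography above happens in the kernel. A minimal sketch, assuming the HFS_CS_FREESPACE_TRIM selector from hfs_fsctl.h and that no input data is consumed beyond the selector-encoded word:

#include <sys/fsctl.h>
#include <hfs/hfs_fsctl.h>      /* assumed location of HFS_CS_FREESPACE_TRIM */
#include <stdint.h>
#include <stdio.h>

/* Hypothetical sketch: ask HFS+ to TRIM all free space on a mounted volume.
 * This can generate a lot of bitmap read I/O, so treat it as maintenance. */
static int trim_free_space(const char *mount_point)
{
    uint32_t unused = 0;    /* dummy argument word, ignored by the kernel */

    if (fsctl(mount_point, HFS_CS_FREESPACE_TRIM, &unused, 0) == -1) {
        perror("HFS_CS_FREESPACE_TRIM");
        return -1;
    }
    return 0;
}
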
 
-       case HFS_FSCTL_GET_JOURNAL_INFO:
-               jip = (struct hfs_journal_info*)ap->a_data;
+       case HFS_SET_HOTFILE_STATE: {
+               int error;
+               struct cnode *cp = VTOC(vp);
+               uint32_t hf_state = *((uint32_t*)ap->a_data);
+               uint32_t num_unpinned = 0;
+               
+               error = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
+               if (error) {
+                       return error;
+               }
 
-               if (vp == NULLVP)
-                       return EINVAL;
+               // printf("hfs: setting hotfile state %d on %s\n", hf_state, vp->v_name);
+               if (hf_state == HFS_MARK_FASTDEVCANDIDATE) {
+                       vnode_setfastdevicecandidate(vp);
 
-           if (hfsmp->jnl == NULL) {
-                       jnl_start = 0;
-                       jnl_size  = 0;
-           } else {
-                       jnl_start = (off_t)(hfsmp->jnl_start * HFSTOVCB(hfsmp)->blockSize) + (off_t)HFSTOVCB(hfsmp)->hfsPlusIOPosOffset;
-                       jnl_size  = (off_t)hfsmp->jnl_size;
-           }
+                       cp->c_attr.ca_recflags |= kHFSFastDevCandidateMask;
+                       cp->c_attr.ca_recflags &= ~kHFSDoNotFastDevPinMask;
+                       cp->c_flag |= C_MODIFIED;
+               } else if (hf_state == HFS_UNMARK_FASTDEVCANDIDATE || hf_state == HFS_NEVER_FASTDEVCANDIDATE) {
+                       vnode_clearfastdevicecandidate(vp);
+                       hfs_removehotfile(vp);
 
-               jip->jstart = jnl_start;
-               jip->jsize = jnl_size;
-               break;
+                       if (cp->c_attr.ca_recflags & kHFSFastDevPinnedMask) {
+                               hfs_pin_vnode(hfsmp, vp, HFS_UNPIN_IT, &num_unpinned, ap->a_context);
+                       }
+                               
+                       if (hf_state == HFS_NEVER_FASTDEVCANDIDATE) {
+                               cp->c_attr.ca_recflags |= kHFSDoNotFastDevPinMask;
+                       }
+                       cp->c_attr.ca_recflags &= ~(kHFSFastDevCandidateMask|kHFSFastDevPinnedMask);
+                       cp->c_flag |= C_MODIFIED;
 
-       case HFS_SET_ALWAYS_ZEROFILL: {
-           struct cnode *cp = VTOC(vp);
+               } else {
+                       error = EINVAL;
+               }
 
-           if (*(int *)ap->a_data) {
-               cp->c_flag |= C_ALWAYS_ZEROFILL;
-           } else {
-               cp->c_flag &= ~C_ALWAYS_ZEROFILL;
-           }
-           break;
-       }    
+               if (num_unpinned != 0) {
+                       lck_mtx_lock(&hfsmp->hfc_mutex);
+                       hfsmp->hfs_hotfile_freeblks += num_unpinned;
+                       lck_mtx_unlock(&hfsmp->hfc_mutex);
+               }
 
-       case HFS_DISABLE_METAZONE: {
-               /* Only root can disable metadata zone */
-               if (!is_suser()) {
+               hfs_unlock(cp);
+               return error;
+               break;
+       }
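
A sketch of driving the hotfile marking from user space, assuming the HFS_SET_HOTFILE_STATE selector and the HFS_MARK_FASTDEVCANDIDATE / HFS_UNMARK_FASTDEVCANDIDATE / HFS_NEVER_FASTDEVCANDIDATE states come from hfs_fsctl.h; ffsctl(2) issues the fsctl against an already-open file descriptor.

#include <sys/fsctl.h>
#include <hfs/hfs_fsctl.h>      /* assumed location of the hotfile selectors */
#include <stdint.h>
#include <stdio.h>

/* Hypothetical sketch: mark or unmark an open file as a fast-device
 * (hotfile) candidate.  'state' is one of the HFS_*_FASTDEVCANDIDATE values. */
static int set_hotfile_state(int fd, uint32_t state)
{
    if (ffsctl(fd, HFS_SET_HOTFILE_STATE, &state, 0) == -1) {
        perror("HFS_SET_HOTFILE_STATE");
        return -1;
    }
    return 0;
}
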
+
+       case HFS_REPIN_HOTFILE_STATE: {
+               int error=0;
+               uint32_t repin_what = *((uint32_t*)ap->a_data);
+
+               /* Only root allowed */
+               if (!kauth_cred_issuser(kauth_cred_get())) {
                        return EACCES;
                }
-               if (hfsmp->hfs_flags & HFS_READ_ONLY) {
-                       return (EROFS);
+
+               if (!(hfsmp->hfs_flags & (HFS_CS_METADATA_PIN | HFS_CS_HOTFILE_PIN))) {
+                       // This system is neither regular Fusion nor Cooperative Fusion,
+                       // so this fsctl makes no sense.
+                       return EINVAL;
                }
 
-               /* Disable metadata zone now */
-               (void) hfs_metadatazone_init(hfsmp, true);
-               printf ("hfs: Disabling metadata zone on %s\n", hfsmp->vcbVN);
+               //
+               // After converting a CoreStorage volume to be encrypted, the
+               // extents could have moved around underneath us.  This call
+               // allows corestoraged to re-pin everything that should be
+               // pinned (it would happen on the next reboot too but that could
+               // be a long time away).
+               //
+               if ((repin_what & HFS_REPIN_METADATA) && (hfsmp->hfs_flags & HFS_CS_METADATA_PIN)) {
+                       hfs_pin_fs_metadata(hfsmp);
+               }
+               if ((repin_what & HFS_REPIN_USERDATA) && (hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN)) {
+                       hfs_repin_hotfiles(hfsmp);
+               }
+               if ((repin_what & HFS_REPIN_USERDATA) && (hfsmp->hfs_flags & HFS_CS_SWAPFILE_PIN)) {
+                       //XXX Swapfiles (marked SWAP_PINNED) may have moved too.
+                       //XXX Do we care? They have a more transient/dynamic nature/lifetime.
+               }
+
+               return error;
                break;
-       }
-       
+       }               
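
The repin request is likewise a root-only fsctl; the caller simply says which classes of pinned data to re-pin. A minimal sketch, assuming HFS_REPIN_HOTFILE_STATE and the HFS_REPIN_METADATA / HFS_REPIN_USERDATA bits from hfs_fsctl.h:

#include <sys/fsctl.h>
#include <hfs/hfs_fsctl.h>      /* assumed location of the repin selector/bits */
#include <stdint.h>
#include <stdio.h>

/* Hypothetical sketch: ask HFS+ to re-pin metadata and/or hotfile user data
 * after the underlying CoreStorage extents may have moved. */
static int repin_pinned_data(const char *mount_point)
{
    uint32_t what = HFS_REPIN_METADATA | HFS_REPIN_USERDATA;

    if (fsctl(mount_point, HFS_REPIN_HOTFILE_STATE, &what, 0) == -1) {
        perror("HFS_REPIN_HOTFILE_STATE");
        return -1;
    }
    return 0;
}
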
+
+
        default:
                return (ENOTTY);
        }
@@ -2323,6 +2961,62 @@ hfs_vnop_offtoblk(struct vnop_offtoblk_args *ap)
  * searched for mapping.
  *
  * System file cnodes are expected to be locked (shared or exclusive).
+ *
+ * -- INVALID RANGES --
+ *
+ * Invalid ranges are used to keep track of where we have extended a
+ * file, but have not yet written that data to disk.  In the past we
+ * would clear up the invalid ranges as we wrote to those areas, but
+ * before data was actually flushed to disk.  The problem with that
+ * approach is that the data can be left in the cache and is therefore
+ * still not valid on disk.  So now we clear up the ranges here, when
+ * the flags field has VNODE_WRITE set, indicating a write is about to
+ * occur.  This isn't ideal (ideally we want to clear them up when we
+ * know the data has been successfully written), but it's the best we
+ * can do.
+ *
+ * For reads, we use the invalid ranges here in block map to indicate
+ * to the caller that the data should be zeroed (a_bpn == -1).  We
+ * have to be careful about what ranges we return to the cluster code.
+ * Currently the cluster code can only handle non-rounded values for
+ * the EOF; it cannot handle funny sized ranges in the middle of the
+ * file (the main problem is that it sends down odd sized I/Os to the
+ * disk).  Our code currently works because whilst the very first
+ * offset and the last offset in the invalid ranges are not aligned,
+ * gaps in the invalid ranges between the first and last have to be
+ * aligned (because we always write page sized blocks).  For example,
+ * consider this arrangement:
+ *
+ *         +-------------+-----+-------+------+
+ *         |             |XXXXX|       |XXXXXX|
+ *         +-------------+-----+-------+------+
+ *                       a     b       c      d
+ *
+ * This shows two invalid ranges <a, b> and <c, d>.  Whilst a and d
+ * are not necessarily aligned, b and c *must* be.
+ *
+ * Zero-filling occurs in a number of ways:
+ *
+ *   1. When a read occurs and we return with a_bpn == -1.
+ *
+ *   2. When hfs_fsync or hfs_filedone calls hfs_flush_invalid_ranges
+ *      which will cause us to iterate over the ranges bringing in
+ *      pages that are not present in the cache and zeroing them.  Any
+ *      pages that are already in the cache are left untouched.  Note
+ *      that hfs_fsync does not always flush invalid ranges.
+ *
+ *   3. When we extend a file we zero out from the old EOF to the end
+ *      of the page.  It would be nice if we didn't have to do this if
+ *      the page wasn't present (and could defer it), but because of
+ *      the problem described above, we have to.
+ *
+ * The invalid ranges are also used to restrict the size that we write
+ * out on disk: see hfs_prepare_fork_for_update.
+ *
+ * Note that invalid ranges are ignored when neither the VNODE_READ nor
+ * the VNODE_WRITE flag is specified.  This is useful for the
+ * F_LOG2PHYS* fcntls which are not interested in invalid ranges: they
+ * just want to know whether blocks are physically allocated or not.
  */
 int
 hfs_vnop_blockmap(struct vnop_blockmap_args *ap)
@@ -2343,7 +3037,7 @@ hfs_vnop_blockmap(struct vnop_blockmap_args *ap)
        struct cnode *cp;
        struct filefork *fp;
        struct hfsmount *hfsmp;
-       size_t bytesContAvail = 0;
+       size_t bytesContAvail = ap->a_size;
        int retval = E_NONE;
        int syslocks = 0;
        int lockflags = 0;
@@ -2384,17 +3078,110 @@ hfs_vnop_blockmap(struct vnop_blockmap_args *ap)
        if (ap->a_bpn == NULL)
                return (0);
 
+       hfsmp = VTOHFS(vp);
+       cp = VTOC(vp);
+       fp = VTOF(vp);
+
        if ( !vnode_issystem(vp) && !vnode_islnk(vp) && !vnode_isswap(vp)) {
-               if (VTOC(vp)->c_lockowner != current_thread()) {
-                       hfs_lock(VTOC(vp), HFS_FORCE_LOCK);
+               if (cp->c_lockowner != current_thread()) {
+                       hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
                        tooklock = 1;
                }
+
+               // For reads, check the invalid ranges
+               if (ISSET(ap->a_flags, VNODE_READ)) {
+                       if (ap->a_foffset >= fp->ff_size) {
+                               retval = ERANGE;
+                               goto exit;
+                       }
+
+                       overlaptype = rl_scan(&fp->ff_invalidranges, ap->a_foffset,
+                                                                 ap->a_foffset + (off_t)bytesContAvail - 1,
+                                                                 &invalid_range);
+                       switch(overlaptype) {
+                               case RL_MATCHINGOVERLAP:
+                               case RL_OVERLAPCONTAINSRANGE:
+                               case RL_OVERLAPSTARTSBEFORE:
+                                       /* There's no valid block for this byte offset */
+                                       *ap->a_bpn = (daddr64_t)-1;
+                                       /* There's no point limiting the amount to be returned
+                                        * if the invalid range that was hit extends all the way
+                                        * to the EOF (i.e. there's no valid bytes between the
+                                        * end of this range and the file's EOF):
+                                        */
+                                       if (((off_t)fp->ff_size > (invalid_range->rl_end + 1)) &&
+                                               ((size_t)(invalid_range->rl_end + 1 - ap->a_foffset) < bytesContAvail)) {
+                                               bytesContAvail = invalid_range->rl_end + 1 - ap->a_foffset;
+                                       }
+
+                                       retval = 0;
+                                       goto exit;
+
+                               case RL_OVERLAPISCONTAINED:
+                               case RL_OVERLAPENDSAFTER:
+                                       /* The range of interest hits an invalid block before the end: */
+                                       if (invalid_range->rl_start == ap->a_foffset) {
+                                               /* There's actually no valid information to be had starting here: */
+                                               *ap->a_bpn = (daddr64_t)-1;
+                                               if (((off_t)fp->ff_size > (invalid_range->rl_end + 1)) &&
+                                                       ((size_t)(invalid_range->rl_end + 1 - ap->a_foffset) < bytesContAvail)) {
+                                                       bytesContAvail = invalid_range->rl_end + 1 - ap->a_foffset;
+                                               }
+
+                                               retval = 0;
+                                               goto exit;
+                                       } else {
+                                               /*
+                                                * Sadly, the lower layers don't like us to
+                                                * return unaligned ranges, so we skip over
+                                                * any invalid ranges here that are less than
+                                                * a page: zeroing of those bits is not our
+                                                * responsibility (it's dealt with elsewhere).
+                                                */
+                                               do {
+                                                       off_t rounded_start = round_page_64(invalid_range->rl_start);
+                                                       if ((off_t)bytesContAvail < rounded_start - ap->a_foffset)
+                                                               break;
+                                                       if (rounded_start < invalid_range->rl_end + 1) {
+                                                               bytesContAvail = rounded_start - ap->a_foffset;
+                                                               break;
+                                                       }
+                                               } while ((invalid_range = TAILQ_NEXT(invalid_range,
+                                                                                                                        rl_link)));
+                                       }
+                                       break;
+
+                               case RL_NOOVERLAP:
+                                       break;
+                       } // switch
+               }
        }
-       hfsmp = VTOHFS(vp);
-       cp = VTOC(vp);
-       fp = VTOF(vp);
+
+#if CONFIG_PROTECT
+       if (cp->c_cpentry) {
+               const int direction = (ISSET(ap->a_flags, VNODE_WRITE)
+                                                          ? VNODE_WRITE : VNODE_READ);
+
+               cp_io_params_t io_params;
+               cp_io_params(hfsmp, cp->c_cpentry,
+                                        off_rsrc_make(ap->a_foffset, VNODE_IS_RSRC(vp)),
+                                        direction, &io_params);
+
+               if (io_params.max_len < (off_t)bytesContAvail)
+                       bytesContAvail = io_params.max_len;
+
+               if (io_params.phys_offset != -1) {
+                       *ap->a_bpn = ((io_params.phys_offset + hfsmp->hfsPlusIOPosOffset)
+                                                 / hfsmp->hfs_logical_block_size);
+
+                       retval = 0;
+                       goto exit;
+               }
+       }
+#endif
 
 retry:
+
        /* Check virtual blocks only when performing write operation */
        if ((ap->a_flags & VNODE_WRITE) && (fp->ff_unallocblocks != 0)) {
                if (hfs_start_transaction(hfsmp) != 0) {
@@ -2449,14 +3236,14 @@ retry:
                        cp->c_blocks += loanedBlocks;
                        fp->ff_blocks += loanedBlocks;
 
-                       HFS_MOUNT_LOCK(hfsmp, TRUE);
+                       hfs_lock_mount (hfsmp);
                        hfsmp->loanedBlocks += loanedBlocks;
-                       HFS_MOUNT_UNLOCK(hfsmp, TRUE);
+                       hfs_unlock_mount (hfsmp);
 
                        hfs_systemfile_unlock(hfsmp, lockflags);
                        cp->c_flag |= C_MODIFIED;
                        if (started_tr) {
-                               (void) hfs_update(vp, TRUE);
+                               (void) hfs_update(vp, 0);
                                (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
 
                                hfs_end_transaction(hfsmp);
@@ -2466,19 +3253,13 @@ retry:
                }
        }
 
-       retval = MapFileBlockC(hfsmp, (FCB *)fp, ap->a_size, ap->a_foffset,
+       retval = MapFileBlockC(hfsmp, (FCB *)fp, bytesContAvail, ap->a_foffset,
                               ap->a_bpn, &bytesContAvail);
        if (syslocks) {
                hfs_systemfile_unlock(hfsmp, lockflags);
                syslocks = 0;
        }
 
-       if (started_tr) {
-               (void) hfs_update(vp, TRUE);
-               (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
-               hfs_end_transaction(hfsmp);
-               started_tr = 0;
-       }       
        if (retval) {
                /* On write, always return error because virtual blocks, if any, 
                 * should have been allocated in ExtendFileC().  We do not 
@@ -2490,98 +3271,71 @@ retry:
                    (ap->a_flags & VNODE_WRITE) ||
                    ((ap->a_flags & VNODE_READ) && (fp->ff_unallocblocks == 0))) {
                        goto exit;
-               } 
-               
+               }
+
                /* Validate if the start offset is within logical file size */
-               if (ap->a_foffset > fp->ff_size) {
-                       goto exit;
+               if (ap->a_foffset >= fp->ff_size) {
+                       goto exit;
                }
 
-               /* Searching file extents has failed for read operation, therefore 
-                * search rangelist for any uncommitted holes in the file. 
+               /*
+                * At this point, we have encountered a failure during
+                * MapFileBlockC that resulted in ERANGE, and we are not
+                * servicing a write, and there are borrowed blocks.
+                *
+                * However, the cluster layer will not call blockmap for
+                * blocks that are borrowed and in-cache.  We have to assume
+                * that because we observed ERANGE being emitted from
+                * MapFileBlockC, this extent range is not valid on-disk.  So
+                * we treat this as a mapping that needs to be zero-filled
+                * prior to reading.
                 */
-               overlaptype = rl_scan(&fp->ff_invalidranges, ap->a_foffset,
-                                     ap->a_foffset + (off_t)(ap->a_size - 1),
-                                     &invalid_range);
-               switch(overlaptype) {
-               case RL_OVERLAPISCONTAINED:
-                       /* start_offset <= rl_start, end_offset >= rl_end */
-                       if (ap->a_foffset != invalid_range->rl_start) {
-                               break;
-                       }
-               case RL_MATCHINGOVERLAP:
-                       /* start_offset = rl_start, end_offset = rl_end */
-               case RL_OVERLAPCONTAINSRANGE:
-                       /* start_offset >= rl_start, end_offset <= rl_end */
-               case RL_OVERLAPSTARTSBEFORE:
-                       /* start_offset > rl_start, end_offset >= rl_start */
-                       if ((off_t)fp->ff_size > (invalid_range->rl_end + 1)) {
-                               bytesContAvail = (invalid_range->rl_end + 1) - ap->a_foffset;
-                       } else {
-                               bytesContAvail = fp->ff_size - ap->a_foffset;
-                       }
-                       if (bytesContAvail > ap->a_size) {
-                               bytesContAvail = ap->a_size;
-                       }
-                       *ap->a_bpn = (daddr64_t)-1;
-                       retval = 0;
-                       break;
-               case RL_OVERLAPENDSAFTER:
-                       /* start_offset < rl_start, end_offset < rl_end */
-               case RL_NOOVERLAP:
-                       break;
-               }
+
+               if (fp->ff_size - ap->a_foffset < (off_t)bytesContAvail)
+                       bytesContAvail = fp->ff_size - ap->a_foffset;
+
+               *ap->a_bpn = (daddr64_t) -1;
+               retval = 0;
+
                goto exit;
        }
 
-       /* MapFileC() found a valid extent in the filefork.  Search the 
-        * mapping information further for invalid file ranges 
-        */
-       overlaptype = rl_scan(&fp->ff_invalidranges, ap->a_foffset,
-                             ap->a_foffset + (off_t)bytesContAvail - 1,
-                             &invalid_range);
-       if (overlaptype != RL_NOOVERLAP) {
-               switch(overlaptype) {
-               case RL_MATCHINGOVERLAP:
-               case RL_OVERLAPCONTAINSRANGE:
-               case RL_OVERLAPSTARTSBEFORE:
-                       /* There's no valid block for this byte offset */
-                       *ap->a_bpn = (daddr64_t)-1;
-                       /* There's no point limiting the amount to be returned
-                        * if the invalid range that was hit extends all the way 
-                        * to the EOF (i.e. there's no valid bytes between the
-                        * end of this range and the file's EOF):
-                        */
-                       if (((off_t)fp->ff_size > (invalid_range->rl_end + 1)) &&
-                           ((size_t)(invalid_range->rl_end + 1 - ap->a_foffset) < bytesContAvail)) {
-                               bytesContAvail = invalid_range->rl_end + 1 - ap->a_foffset;
-                       }
-                       break;
-       
-               case RL_OVERLAPISCONTAINED:
-               case RL_OVERLAPENDSAFTER:
-                       /* The range of interest hits an invalid block before the end: */
-                       if (invalid_range->rl_start == ap->a_foffset) {
-                               /* There's actually no valid information to be had starting here: */
-                               *ap->a_bpn = (daddr64_t)-1;
-                               if (((off_t)fp->ff_size > (invalid_range->rl_end + 1)) &&
-                                   ((size_t)(invalid_range->rl_end + 1 - ap->a_foffset) < bytesContAvail)) {
-                                       bytesContAvail = invalid_range->rl_end + 1 - ap->a_foffset;
+exit:
+       if (retval == 0) {
+               if (ISSET(ap->a_flags, VNODE_WRITE)) {
+                       struct rl_entry *r = TAILQ_FIRST(&fp->ff_invalidranges);
+
+                       // See if we might be overlapping invalid ranges...
+                       if (r && (ap->a_foffset + (off_t)bytesContAvail) > r->rl_start) {
+                               /*
+                                * Mark the file as needing an update if we think the
+                                * on-disk EOF has changed.
+                                */
+                               if (ap->a_foffset <= r->rl_start)
+                                       SET(cp->c_flag, C_MODIFIED);
+
+                               /*
+                                * This isn't the ideal place to put this.  Ideally, we
+                                * should do something *after* we have successfully
+                                * written to the range, but that's difficult to do
+                                * because we cannot take locks in the callback.  At
+                                * present, the cluster code will call us with VNODE_WRITE
+                                * set just before it's about to write the data so we know
+                                * that data is about to be written.  If we get an I/O
+                                * error at this point then chances are the metadata
+                                * update to follow will also have an I/O error so the
+                                * risk here is small.
+                                */
+                               rl_remove(ap->a_foffset, ap->a_foffset + bytesContAvail - 1,
+                                                 &fp->ff_invalidranges);
+
+                               if (!TAILQ_FIRST(&fp->ff_invalidranges)) {
+                                       cp->c_flag &= ~C_ZFWANTSYNC;
+                                       cp->c_zftimeout = 0;
                                }
-                       } else {
-                               bytesContAvail = invalid_range->rl_start - ap->a_foffset;
                        }
-                       break;
+               }
 
-               case RL_NOOVERLAP:
-                       break;
-               } /* end switch */
-               if (bytesContAvail > ap->a_size)
-                       bytesContAvail = ap->a_size;
-       } 
-               
-exit:
-       if (retval == 0) {
                if (ap->a_run)
                        *ap->a_run = bytesContAvail;
 
@@ -2589,13 +3343,19 @@ exit:
                        *(int *)ap->a_poff = 0;
        }
 
+       if (started_tr) {
+               hfs_update(vp, TRUE);
+               hfs_volupdate(hfsmp, VOL_UPDATE, 0);
+               hfs_end_transaction(hfsmp);
+               started_tr = 0;
+       }
+
        if (tooklock)
                hfs_unlock(cp);
 
        return (MacToVFSError(retval));
 }
 
-
 /*
  * prepare and issue the I/O
  * buf_strategy knows how to deal
@@ -2609,49 +3369,38 @@ hfs_vnop_strategy(struct vnop_strategy_args *ap)
        vnode_t vp = buf_vnode(bp);
        int error = 0;
        
-#if CONFIG_PROTECT
-       cnode_t *cp = NULL; 
-       
-       if ((cp = cp_get_protected_cnode(vp)) != NULL) {
-               /*
-                * Some paths to hfs_vnop_strategy will take the cnode lock, 
-                * and some won't. But since content protection is only enabled
-                * for files that (a) aren't system files and (b) are regular 
-                * files, any valid cnode here will be unlocked.
-                */
-               hfs_lock(cp, HFS_SHARED_LOCK);
-               buf_setcpaddr(bp, cp->c_cpentry);
+       /* Mark buffer as containing static data if cnode flag set */
+       if (VTOC(vp)->c_flag & C_SSD_STATIC) {
+               buf_markstatic(bp);
        }
-#endif /* CONFIG_PROTECT */
        
-       error = buf_strategy(VTOHFS(vp)->hfs_devvp, ap);
+       /* Mark buffer for greedy mode I/O if cnode flag set */
+       if (VTOC(vp)->c_flag & C_SSD_GREEDY_MODE) {
+               bufattr_markgreedymode(&bp->b_attr);
+       }
 
-#if CONFIG_PROTECT
-       if (cp) {
-               hfs_unlock(cp);
+       /* Mark buffer as containing isochronous (burst mode) data if cnode flag set */
+       if (VTOC(vp)->c_flag & C_IO_ISOCHRONOUS) {
+               bufattr_markisochronous(&bp->b_attr);
        }
-#endif
        
-       return error;
-}
+#if CONFIG_PROTECT
+       error = cp_handle_strategy(bp);
 
-static int 
-hfs_minorupdate(struct vnode *vp) {
-       struct cnode *cp = VTOC(vp);
-       cp->c_flag &= ~C_MODIFIED;
-       cp->c_touch_acctime = 0;
-       cp->c_touch_chgtime = 0;
-       cp->c_touch_modtime = 0;
+       if (error)
+               return error;
+#endif
        
-       return 0;
+       error = buf_strategy(VTOHFS(vp)->hfs_devvp, ap);
+       
+       return error;
 }
 
 int
-do_hfs_truncate(struct vnode *vp, off_t length, int flags, int skipupdate, vfs_context_t context)
+do_hfs_truncate(struct vnode *vp, off_t length, int flags, int truncateflags, vfs_context_t context)
 {
        register struct cnode *cp = VTOC(vp);
        struct filefork *fp = VTOF(vp);
-       struct proc *p = vfs_context_proc(context);;
        kauth_cred_t cred = vfs_context_ucred(context);
        int retval;
        off_t bytesToAdd;
@@ -2661,12 +3410,13 @@ do_hfs_truncate(struct vnode *vp, off_t length, int flags, int skipupdate, vfs_c
        int blksize;
        struct hfsmount *hfsmp;
        int lockflags;
+       int suppress_times = (truncateflags & HFS_TRUNCATE_SKIPTIMES);
 
        blksize = VTOVCB(vp)->blockSize;
        fileblocks = fp->ff_blocks;
        filebytes = (off_t)fileblocks * (off_t)blksize;
 
-       KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 7)) | DBG_FUNC_START,
+       KERNEL_DEBUG(HFSDBG_TRUNCATE | DBG_FUNC_START,
                 (int)length, (int)fp->ff_size, (int)filebytes, 0, 0);
 
        if (length < 0)
@@ -2720,8 +3470,9 @@ do_hfs_truncate(struct vnode *vp, off_t length, int flags, int skipupdate, vfs_c
                        /* All or nothing and don't round up to clumpsize. */
                        eflags = kEFAllMask | kEFNoClumpMask;
 
-                       if (cred && suser(cred, NULL) != 0)
+                       if (cred && (suser(cred, NULL) != 0)) {
                                eflags |= kEFReserveMask;  /* keep a reserve */
+                       }
 
                        /*
                         * Allocate Journal and Quota files in metadata zone.
@@ -2743,6 +3494,10 @@ do_hfs_truncate(struct vnode *vp, off_t length, int flags, int skipupdate, vfs_c
                                lockflags |= SFL_EXTENTS;
                        lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
 
+                       /* 
+                        * Keep growing the file as long as the current EOF is
+                        * less than the desired value.
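+                        * Note: ExtendFileC may add fewer bytes than requested in a
+                        * single call, so iterate until the fork reaches the target
+                        * length or an error is returned.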
+                        */
                        while ((length > filebytes) && (retval == E_NONE)) {
                                bytesToAdd = length - filebytes;
                                retval = MacToVFSError(ExtendFileC(VTOVCB(vp),
@@ -2763,13 +3518,8 @@ do_hfs_truncate(struct vnode *vp, off_t length, int flags, int skipupdate, vfs_c
                        hfs_systemfile_unlock(hfsmp, lockflags);
 
                        if (hfsmp->jnl) {
-                               if (skipupdate) {
-                                       (void) hfs_minorupdate(vp);
-                               }
-                               else {
-                                       (void) hfs_update(vp, TRUE);
-                                       (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
-                               }
+                               hfs_update(vp, 0);
+                               hfs_volupdate(hfsmp, VOL_UPDATE, 0);
                        }
 
                        hfs_end_transaction(hfsmp);
@@ -2777,64 +3527,48 @@ do_hfs_truncate(struct vnode *vp, off_t length, int flags, int skipupdate, vfs_c
                        if (retval)
                                goto Err_Exit;
 
-                       KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 7)) | DBG_FUNC_NONE,
+                       KERNEL_DEBUG(HFSDBG_TRUNCATE | DBG_FUNC_NONE,
                                (int)length, (int)fp->ff_size, (int)filebytes, 0, 0);
                }
  
-               if (!(flags & IO_NOZEROFILL)) {
+               if (ISSET(flags, IO_NOZEROFILL)) {
+                       // An optimisation for the hibernation file
+                       if (vnode_isswap(vp))
+                               rl_remove_all(&fp->ff_invalidranges);
+               } else {
                        if (UBCINFOEXISTS(vp)  && (vnode_issystem(vp) == 0) && retval == E_NONE) {
-                               struct rl_entry *invalid_range;
-                               off_t zero_limit;
-                       
-                               zero_limit = (fp->ff_size + (PAGE_SIZE_64 - 1)) & ~PAGE_MASK_64;
-                               if (length < zero_limit) zero_limit = length;
-
                                if (length > (off_t)fp->ff_size) {
                                        struct timeval tv;
 
                                        /* Extending the file: time to fill out the current last page w. zeroes? */
-                                       if ((fp->ff_size & PAGE_MASK_64) &&
-                                           (rl_scan(&fp->ff_invalidranges, fp->ff_size & ~PAGE_MASK_64,
-                                           fp->ff_size - 1, &invalid_range) == RL_NOOVERLAP)) {
-                                               
-                                               /* There's some valid data at the start of the (current) last page
+                                       if (fp->ff_size & PAGE_MASK_64) {
+                                               /* There might be some valid data at the start of the (current) last page
                                                   of the file, so zero out the remainder of that page to ensure the
-                                                  entire page contains valid data.  Since there is no invalid range
-                                                  possible past the (current) eof, there's no need to remove anything
-                                                  from the invalid range list before calling cluster_write():  */
+                                                  entire page contains valid data. */
                                                hfs_unlock(cp);
-                                               retval = cluster_write(vp, (struct uio *) 0, fp->ff_size, zero_limit,
-                                                               fp->ff_size, (off_t)0,
-                                                               (flags & IO_SYNC) | IO_HEADZEROFILL | IO_NOZERODIRTY);
-                                               hfs_lock(cp, HFS_FORCE_LOCK);
+                                               retval = hfs_zero_eof_page(vp, length);
+                                               hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
                                                if (retval) goto Err_Exit;
-                                               
-                                               /* Merely invalidate the remaining area, if necessary: */
-                                               if (length > zero_limit) {
-                                                       microuptime(&tv);
-                                                       rl_add(zero_limit, length - 1, &fp->ff_invalidranges);
-                                                       cp->c_zftimeout = tv.tv_sec + ZFTIMELIMIT;
-                                               }
-                                       } else {
-                                       /* The page containing the (current) eof is invalid: just add the
-                                          remainder of the page to the invalid list, along with the area
-                                          being newly allocated:
-                                        */
+                                       }
                                        microuptime(&tv);
                                        rl_add(fp->ff_size, length - 1, &fp->ff_invalidranges);
                                        cp->c_zftimeout = tv.tv_sec + ZFTIMELIMIT;
-                                       };
                                }
                        } else {
                                        panic("hfs_truncate: invoked on non-UBC object?!");
                        };
                }
-               cp->c_touch_modtime = TRUE;
+               if (suppress_times == 0) {
+                       cp->c_touch_modtime = TRUE;
+               }
                fp->ff_size = length;
 
        } else { /* Shorten the size of the file */
 
-               if ((off_t)fp->ff_size > length) {
+               // An optimisation for the hibernation file
+               if (ISSET(flags, IO_NOZEROFILL) && vnode_isswap(vp)) {
+                       rl_remove_all(&fp->ff_invalidranges);
+               } else if ((off_t)fp->ff_size > length) {
                        /* Any space previously marked as invalid is now irrelevant: */
                        rl_remove(length, fp->ff_size - 1, &fp->ff_invalidranges);
                }
@@ -2847,8 +3581,7 @@ do_hfs_truncate(struct vnode *vp, off_t length, int flags, int skipupdate, vfs_c
                        u_int32_t finalblks;
                        u_int32_t loanedBlocks;
 
-                       HFS_MOUNT_LOCK(hfsmp, TRUE);
-
+                       hfs_lock_mount(hfsmp);
                        loanedBlocks = fp->ff_unallocblocks;
                        cp->c_blocks -= loanedBlocks;
                        fp->ff_blocks -= loanedBlocks;
@@ -2866,85 +3599,89 @@ do_hfs_truncate(struct vnode *vp, off_t length, int flags, int skipupdate, vfs_c
                                cp->c_blocks += loanedBlocks;
                                fp->ff_blocks += loanedBlocks;
                        }
-                       HFS_MOUNT_UNLOCK(hfsmp, TRUE);
+                       hfs_unlock_mount (hfsmp);
                }
 
-               /*
-                * For a TBE process the deallocation of the file blocks is
-                * delayed until the file is closed.  And hfs_close calls
-                * truncate with the IO_NDELAY flag set.  So when IO_NDELAY
-                * isn't set, we make sure this isn't a TBE process.
-                */
-               if ((flags & IO_NDELAY) || (proc_tbe(p) == 0)) {
-#if QUOTA
-                 off_t savedbytes = ((off_t)fp->ff_blocks * (off_t)blksize);
-#endif /* QUOTA */
-                 if (hfs_start_transaction(hfsmp) != 0) {
-                     retval = EINVAL;
-                     goto Err_Exit;
-                 }
+               off_t savedbytes = ((off_t)fp->ff_blocks * (off_t)blksize);
+               if (hfs_start_transaction(hfsmp) != 0) {
+                       retval = EINVAL;
+                       goto Err_Exit;
+               }
 
-                       if (fp->ff_unallocblocks == 0) {
-                               /* Protect extents b-tree and allocation bitmap */
-                               lockflags = SFL_BITMAP;
-                               if (overflow_extents(fp))
-                                       lockflags |= SFL_EXTENTS;
-                               lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
+               if (fp->ff_unallocblocks == 0) {
+                       /* Protect extents b-tree and allocation bitmap */
+                       lockflags = SFL_BITMAP;
+                       if (overflow_extents(fp))
+                               lockflags |= SFL_EXTENTS;
+                       lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
 
-                               retval = MacToVFSError(TruncateFileC(VTOVCB(vp), (FCB*)fp, length, 0, 
-                                                                                                        FORK_IS_RSRC (fp), FTOC(fp)->c_fileid, false));
+                       retval = MacToVFSError(TruncateFileC(VTOVCB(vp), (FCB*)fp, length, 0, 
+                                                                                                FORK_IS_RSRC (fp), FTOC(fp)->c_fileid, false));
 
-                               hfs_systemfile_unlock(hfsmp, lockflags);
-                       }
-                       if (hfsmp->jnl) {
-                               if (retval == 0) {
-                                       fp->ff_size = length;
-                               }
-                               if (skipupdate) {
-                                       (void) hfs_minorupdate(vp);
-                               }
-                               else {
-                                       (void) hfs_update(vp, TRUE);
-                                       (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
-                               }
+                       hfs_systemfile_unlock(hfsmp, lockflags);
+               }
+               if (hfsmp->jnl) {
+                       if (retval == 0) {
+                               fp->ff_size = length;
                        }
-                       hfs_end_transaction(hfsmp);
+                       hfs_update(vp, 0);
+                       hfs_volupdate(hfsmp, VOL_UPDATE, 0);
+               }
+               hfs_end_transaction(hfsmp);
 
-                       filebytes = (off_t)fp->ff_blocks * (off_t)blksize;
-                       if (retval)
-                               goto Err_Exit;
+               filebytes = (off_t)fp->ff_blocks * (off_t)blksize;
+               if (retval)
+                       goto Err_Exit;
 #if QUOTA
-                       /* These are bytesreleased */
-                       (void) hfs_chkdq(cp, (int64_t)-(savedbytes - filebytes), NOCRED, 0);
+               /* These are bytesreleased */
+               (void) hfs_chkdq(cp, (int64_t)-(savedbytes - filebytes), NOCRED, 0);
 #endif /* QUOTA */
-               }
-               /* Only set update flag if the logical length changes */
-               if ((off_t)fp->ff_size != length)
+
+               //
+               // Unlike when growing a file, we adjust the hotfile block count here
+               // instead of deeper down in the block allocation code because we do
+               // not necessarily have a vnode or "fcb" at the time we're deleting
+               // the file and so we wouldn't know if it was hotfile cached or not
+               //
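+               // For example (illustrative numbers only): shrinking a fork from
+               // savedbytes 40960 to filebytes 8192 with a 4096-byte block size
+               // passes (40960 - 8192) / 4096 = 8 freed blocks to the hotfile
+               // accounting below.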
+               hfs_hotfile_adjust_blocks(vp, (int64_t)((savedbytes - filebytes) / blksize));
+
+
+               /* 
+                * Only set update flag if the logical length changes & we aren't
+                * suppressing modtime updates.
+                */
+               if (((off_t)fp->ff_size != length) && (suppress_times == 0)) {
                        cp->c_touch_modtime = TRUE;
+               }
                fp->ff_size = length;
        }
        if (cp->c_mode & (S_ISUID | S_ISGID)) {
-               if (!vfs_context_issuser(context)) {
+               if (!vfs_context_issuser(context))
                        cp->c_mode &= ~(S_ISUID | S_ISGID);
-                       skipupdate = 0;
-               }
-       }
-       if (skipupdate) {
-               retval = hfs_minorupdate(vp);
        }
-       else {
-               cp->c_touch_chgtime = TRUE;     /* status changed */
+       cp->c_flag |= C_MODIFIED;
+       cp->c_touch_chgtime = TRUE;     /* status changed */
+       if (suppress_times == 0) {
                cp->c_touch_modtime = TRUE;     /* file data was modified */
-               retval = hfs_update(vp, MNT_WAIT);
+
+               /*
+                * If we are not suppressing the modtime update, then
+                * update the gen count as well.
+                */
+               if (S_ISREG(cp->c_attr.ca_mode) || S_ISLNK (cp->c_attr.ca_mode)) {
+                       hfs_incr_gencount(cp);
+               }
        }
+
+       retval = hfs_update(vp, 0);
        if (retval) {
-               KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 7)) | DBG_FUNC_NONE,
+               KERNEL_DEBUG(HFSDBG_TRUNCATE | DBG_FUNC_NONE,
                     -1, -1, -1, retval, 0);
        }
 
 Err_Exit:
 
-       KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 7)) | DBG_FUNC_END,
+       KERNEL_DEBUG(HFSDBG_TRUNCATE | DBG_FUNC_END,
                 (int)length, (int)fp->ff_size, (int)filebytes, retval, 0);
 
        return (retval);
@@ -2967,7 +3704,9 @@ hfs_prepare_release_storage (struct hfsmount *hfsmp, struct vnode *vp) {
        
        struct filefork *fp = VTOF(vp);
        struct cnode *cp = VTOC(vp);
+#if QUOTA
        int retval = 0;
+#endif /* QUOTA */
        
        /* Cannot truncate an HFS directory! */
        if (vnode_isdir(vp)) {
@@ -3008,8 +3747,7 @@ hfs_prepare_release_storage (struct hfsmount *hfsmp, struct vnode *vp) {
        if (fp->ff_unallocblocks > 0) {
                u_int32_t loanedBlocks;
                
-               HFS_MOUNT_LOCK(hfsmp, TRUE);
-               
+               hfs_lock_mount (hfsmp);
                loanedBlocks = fp->ff_unallocblocks;
                cp->c_blocks -= loanedBlocks;
                fp->ff_blocks -= loanedBlocks;
@@ -3017,7 +3755,7 @@ hfs_prepare_release_storage (struct hfsmount *hfsmp, struct vnode *vp) {
                
                hfsmp->loanedBlocks -= loanedBlocks;
                
-               HFS_MOUNT_UNLOCK(hfsmp, TRUE);
+               hfs_unlock_mount (hfsmp);
        }
        
        return 0;
@@ -3030,6 +3768,12 @@ hfs_prepare_release_storage (struct hfsmount *hfsmp, struct vnode *vp) {
  * for use when deleting a file.  The simplification here is that we know 
  * that we are releasing all blocks.
  *
+ * Note that this function may be called when there is no vnode backing
+ * the file fork in question.  We may call this from hfs_vnop_inactive
+ * to clear out resource fork data (and may not want to clear out the data 
+ * fork yet).  As a result, we pointer-check both sets of inputs before 
+ * doing anything with them.
+ *
  * The caller is responsible for saving off a copy of the filefork(s)
  * embedded within the cnode prior to calling this function.  The pointers
  * supplied as arguments must be valid even if the cnode is no longer valid.
@@ -3048,14 +3792,18 @@ hfs_release_storage (struct hfsmount *hfsmp, struct filefork *datafork,
        blksize = hfsmp->blockSize;
        
        /* Data Fork */
-       if (datafork->ff_blocks > 0) {
+       if (datafork) {
+               off_t prev_filebytes;
+               datafork->ff_size = 0;
+
                fileblocks = datafork->ff_blocks;
-               filebytes = (off_t)fileblocks * (off_t)blksize;         
+               filebytes = (off_t)fileblocks * (off_t)blksize;
+               prev_filebytes = filebytes;
                
                /* We killed invalid ranges and loaned blocks before we removed the catalog entry */
                
                while (filebytes > 0) {
-                       if (filebytes > HFS_BIGFILE_SIZE && overflow_extents(datafork)) {
+                       if (filebytes > HFS_BIGFILE_SIZE) {
                                filebytes -= HFS_BIGFILE_SIZE;
                        } else {
                                filebytes = 0;
@@ -3078,11 +3826,14 @@ hfs_release_storage (struct hfsmount *hfsmp, struct filefork *datafork,
                                
                                hfs_systemfile_unlock(hfsmp, lockflags);
                        }
-                       if (error == 0) {
-                               datafork->ff_size = filebytes;
-                       }
                        (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
                        
+                       struct cnode *cp = datafork ? FTOC(datafork) : NULL;
+                       struct vnode *vp;
+                       vp = cp ? CTOV(cp, 0) : NULL;
+                       hfs_hotfile_adjust_blocks(vp, (int64_t)((prev_filebytes - filebytes) / blksize));
+                       prev_filebytes = filebytes;
+                       
                        /* Finish the transaction and start over if necessary */
                        hfs_end_transaction(hfsmp);
                        
@@ -3093,14 +3844,16 @@ hfs_release_storage (struct hfsmount *hfsmp, struct filefork *datafork,
        }
        
        /* Resource fork */
-       if (error == 0 && (rsrcfork != NULL) && rsrcfork->ff_blocks > 0) {
+       if (error == 0 && rsrcfork) {
+               rsrcfork->ff_size = 0;
+
                fileblocks = rsrcfork->ff_blocks;
                filebytes = (off_t)fileblocks * (off_t)blksize;
                
                /* We killed invalid ranges and loaned blocks before we removed the catalog entry */
                
                while (filebytes > 0) {
-                       if (filebytes > HFS_BIGFILE_SIZE && overflow_extents(rsrcfork)) {
+                       if (filebytes > HFS_BIGFILE_SIZE) {
                                filebytes -= HFS_BIGFILE_SIZE;
                        } else {
                                filebytes = 0;
@@ -3123,9 +3876,6 @@ hfs_release_storage (struct hfsmount *hfsmp, struct filefork *datafork,
                                
                                hfs_systemfile_unlock(hfsmp, lockflags);
                        }
-                       if (error == 0) {
-                               rsrcfork->ff_size = filebytes;
-                       }
                        (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
                        
                        /* Finish the transaction and start over if necessary */
@@ -3140,52 +3890,79 @@ hfs_release_storage (struct hfsmount *hfsmp, struct filefork *datafork,
        return error;
 }
 
+errno_t hfs_ubc_setsize(vnode_t vp, off_t len, bool have_cnode_lock)
+{
+       errno_t error;
+
+       /*
+        * Call ubc_setsize to give the VM subsystem a chance to do
+        * whatever it needs to with existing pages before we delete
+        * blocks.  Note that symlinks don't use the UBC so we'll
+        * get back ENOENT in that case.
+        */
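+       /*
+        * Illustrative caller pattern (a sketch; hfs_truncate below does this):
+        *
+        *     bool have_lock = (cp->c_lockowner == current_thread());
+        *     error = hfs_ubc_setsize(vp, new_size, have_lock);
+        *
+        * When the cnode lock is held and UBC_SETSIZE_NO_FS_REENTRY returns
+        * EAGAIN, the code below drops and re-takes the cnode lock around a
+        * blocking ubc_setsize_ex(), which is why callers are also expected to
+        * hold the truncate lock exclusive.
+        */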
+       if (have_cnode_lock) {
+               error = ubc_setsize_ex(vp, len, UBC_SETSIZE_NO_FS_REENTRY);
+               if (error == EAGAIN) {
+                       cnode_t *cp = VTOC(vp);
+
+                       if (cp->c_truncatelockowner != current_thread()) {
+#if DEVELOPMENT || DEBUG
+                               panic("hfs: hfs_ubc_setsize called without exclusive truncate lock!");
+#else
+                               printf("hfs: hfs_ubc_setsize called without exclusive truncate lock!\n");
+#endif
+                       }
+
+                       hfs_unlock(cp);
+                       error = ubc_setsize_ex(vp, len, 0);
+                       hfs_lock_always(cp, HFS_EXCLUSIVE_LOCK);
+               }
+       } else
+               error = ubc_setsize_ex(vp, len, 0);
+
+       return error == ENOENT ? 0 : error;
+}
 
 /*
  * Truncate a cnode to at most length size, freeing (or adding) the
  * disk blocks.
  */
 int
-hfs_truncate(struct vnode *vp, off_t length, int flags, int skipsetsize,
-             int skipupdate, vfs_context_t context)
+hfs_truncate(struct vnode *vp, off_t length, int flags,
+                        int truncateflags, vfs_context_t context)
 {
-       struct filefork *fp = VTOF(vp);
+       struct filefork *fp = VTOF(vp);
        off_t filebytes;
        u_int32_t fileblocks;
-       int blksize, error = 0;
+       int blksize;
+       errno_t error = 0;
        struct cnode *cp = VTOC(vp);
+       hfsmount_t *hfsmp = VTOHFS(vp);
 
        /* Cannot truncate an HFS directory! */
        if (vnode_isdir(vp)) {
                return (EISDIR);
        }
        /* A swap file cannot change size. */
-       if (vnode_isswap(vp) && (length != 0)) {
+       if (vnode_isswap(vp) && length && !ISSET(flags, IO_NOAUTH)) {
                return (EPERM);
        }
 
-       blksize = VTOVCB(vp)->blockSize;
+       blksize = hfsmp->blockSize;
        fileblocks = fp->ff_blocks;
        filebytes = (off_t)fileblocks * (off_t)blksize;
 
-       //
-       // Have to do this here so that we don't wind up with
-       // i/o pending for blocks that are about to be released
-       // if we truncate the file.
-       //
-       // If skipsetsize is set, then the caller is responsible
-       // for the ubc_setsize.
-       //
-       // Even if skipsetsize is set, if the length is zero we
-       // want to call ubc_setsize() because as of SnowLeopard
-       // it will no longer cause any page-ins and it will drop
-       // any dirty pages so that we don't do any i/o that we
-       // don't have to.  This also prevents a race where i/o
-       // for truncated blocks may overwrite later data if the
-       // blocks get reallocated to a different file.
-       //
-       if (!skipsetsize || length == 0)
-               ubc_setsize(vp, length);
+       bool caller_has_cnode_lock = (cp->c_lockowner == current_thread());
+
+       error = hfs_ubc_setsize(vp, length, caller_has_cnode_lock);
+       if (error)
+               return error;
+
+       if (!caller_has_cnode_lock) {
+               error = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
+               if (error)
+                       return error;
+       }
 
        // have to loop truncating or growing files that are
        // really big because otherwise transactions can get
@@ -3193,40 +3970,54 @@ hfs_truncate(struct vnode *vp, off_t length, int flags, int skipsetsize,
 
        if (length < filebytes) {
                while (filebytes > length) {
-                       if ((filebytes - length) > HFS_BIGFILE_SIZE && overflow_extents(fp)) {
+                       if ((filebytes - length) > HFS_BIGFILE_SIZE) {
                                filebytes -= HFS_BIGFILE_SIZE;
                        } else {
                                filebytes = length;
                        }
-                       cp->c_flag |= C_FORCEUPDATE;
-                       error = do_hfs_truncate(vp, filebytes, flags, skipupdate, context);
+                       error = do_hfs_truncate(vp, filebytes, flags, truncateflags, context);
                        if (error)
                                break;
                }
        } else if (length > filebytes) {
-               while (filebytes < length) {
-                       if ((length - filebytes) > HFS_BIGFILE_SIZE && overflow_extents(fp)) {
-                               filebytes += HFS_BIGFILE_SIZE;
-                       } else {
-                               filebytes = length;
+               kauth_cred_t cred = vfs_context_ucred(context);
+               const bool keep_reserve = cred && suser(cred, NULL) != 0;
+
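+               /*
+                * Illustrative arithmetic: growing from filebytes 4096 to a length
+                * of 20480 with a 4096-byte block size needs howmany(16384, 4096)
+                * = 4 more blocks; if hfs_freeblks() reports fewer, fail early with
+                * ENOSPC rather than partially extending the file.
+                */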
+               if (hfs_freeblks(hfsmp, keep_reserve)
+                       < howmany(length - filebytes, blksize)) {
+                       error = ENOSPC;
+               } else {
+                       while (filebytes < length) {
+                               if ((length - filebytes) > HFS_BIGFILE_SIZE) {
+                                       filebytes += HFS_BIGFILE_SIZE;
+                               } else {
+                                       filebytes = length;
+                               }
+                               error = do_hfs_truncate(vp, filebytes, flags, truncateflags, context);
+                               if (error)
+                                       break;
                        }
-                       cp->c_flag |= C_FORCEUPDATE;
-                       error = do_hfs_truncate(vp, filebytes, flags, skipupdate, context);
-                       if (error)
-                               break;
                }
        } else /* Same logical size */ {
 
-               error = do_hfs_truncate(vp, length, flags, skipupdate, context);
+               error = do_hfs_truncate(vp, length, flags, truncateflags, context);
        }
        /* Files that are changing size are not hot file candidates. */
        if (VTOHFS(vp)->hfc_stage == HFC_RECORDING) {
                fp->ff_bytesread = 0;
        }
 
-       return (error);
-}
 
+       if (!caller_has_cnode_lock)
+               hfs_unlock(cp);
+
+       // Make sure UBC's size matches up (in case we didn't completely succeed)
+       errno_t err2 = hfs_ubc_setsize(vp, fp->ff_size, caller_has_cnode_lock);
+       if (!error)
+               error = err2;
+
+       return error;
+}
 
 
 /*
@@ -3273,9 +4064,9 @@ hfs_vnop_allocate(struct vnop_allocate_args /* {
 
        check_for_tracked_file(vp, orig_ctime, ap->a_length == 0 ? NAMESPACE_HANDLER_TRUNCATE_OP|NAMESPACE_HANDLER_DELETE_OP : NAMESPACE_HANDLER_TRUNCATE_OP, NULL);
 
-       hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK);
+       hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
 
-       if ((retval = hfs_lock(cp, HFS_EXCLUSIVE_LOCK))) {
+       if ((retval = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT))) {
                goto Err_Exit;
        }
        
@@ -3322,6 +4113,13 @@ hfs_vnop_allocate(struct vnop_allocate_args /* {
         * value of filebytes is 0, length will be at least 1.
         */
        if (length > filebytes) {
+               if (ISSET(extendFlags, kEFAllMask)
+                       && (hfs_freeblks(hfsmp, ISSET(extendFlags, kEFReserveMask))
+                               < howmany(length - filebytes, hfsmp->blockSize))) {
+                       retval = ENOSPC;
+                       goto Err_Exit;
+               }
+
                off_t total_bytes_added = 0, orig_request_size;
 
                orig_request_size = moreBytesRequested = length - filebytes;
@@ -3364,13 +4162,13 @@ hfs_vnop_allocate(struct vnop_allocate_args /* {
                    /* Protect extents b-tree and allocation bitmap */
                    lockflags = SFL_BITMAP;
                    if (overflow_extents(fp))
-                       lockflags |= SFL_EXTENTS;
+                               lockflags |= SFL_EXTENTS;
                    lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
 
                    if (moreBytesRequested >= HFS_BIGFILE_SIZE) {
-                       bytesRequested = HFS_BIGFILE_SIZE;
+                               bytesRequested = HFS_BIGFILE_SIZE;
                    } else {
-                       bytesRequested = moreBytesRequested;
+                               bytesRequested = moreBytesRequested;
                    }
 
                    if (extendFlags & kEFContigMask) {
@@ -3399,7 +4197,7 @@ hfs_vnop_allocate(struct vnop_allocate_args /* {
                    hfs_systemfile_unlock(hfsmp, lockflags);
 
                    if (hfsmp->jnl) {
-                       (void) hfs_update(vp, TRUE);
+                       (void) hfs_update(vp, 0);
                        (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
                    }
 
@@ -3427,14 +4225,18 @@ hfs_vnop_allocate(struct vnop_allocate_args /* {
 
        } else { /* Shorten the size of the file */
 
-               if (fp->ff_size > length) {
-                       /*
-                        * Any buffers that are past the truncation point need to be
-                        * invalidated (to maintain buffer cache consistency).
-                        */
-               }
+               /*
+                * N.B. At present, this code is never called.  If and when we
+                * do start using it, it looks like there might be slightly
+                * strange semantics with the file size: it's possible for the
+                * file size to *increase* e.g. if current file size is 5,
+                * length is 1024 and filebytes is 4096, the file size will
+                * end up being 1024 bytes.  This isn't necessarily a problem
+                * but it's not consistent with the code above which doesn't
+                * change the file size.
+                */
 
-               retval = hfs_truncate(vp, length, 0, 0, 0, ap->a_context);
+               retval = hfs_truncate(vp, length, 0, 0, ap->a_context);
                filebytes = (off_t)fp->ff_blocks * (off_t)vcb->blockSize;
 
                /*
@@ -3450,21 +4252,20 @@ hfs_vnop_allocate(struct vnop_allocate_args /* {
                if (fp->ff_size > filebytes) {
                        fp->ff_size = filebytes;
 
-                       hfs_unlock(cp);
-                       ubc_setsize(vp, fp->ff_size);
-                       hfs_lock(cp, HFS_FORCE_LOCK);
+                       hfs_ubc_setsize(vp, fp->ff_size, true);
                }
        }
 
 Std_Exit:
+       cp->c_flag |= C_MODIFIED;
        cp->c_touch_chgtime = TRUE;
        cp->c_touch_modtime = TRUE;
-       retval2 = hfs_update(vp, MNT_WAIT);
+       retval2 = hfs_update(vp, 0);
 
        if (retval == 0)
                retval = retval2;
 Err_Exit:
-       hfs_unlock_truncate(cp, 0);
+       hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
        hfs_unlock(cp);
        return (retval);
 }
@@ -3494,8 +4295,10 @@ hfs_vnop_pagein(struct vnop_pagein_args *ap)
        upl_t           upl;
        upl_page_info_t *pl;
        off_t           f_offset;
+       off_t           page_needed_f_offset;
        int             offset;
        int             isize; 
+       int             upl_size; 
        int             pg_index;
        boolean_t       truncate_lock_held = FALSE;
        boolean_t       file_converted = FALSE;
@@ -3506,7 +4309,28 @@ hfs_vnop_pagein(struct vnop_pagein_args *ap)
        fp = VTOF(vp);
 
 #if CONFIG_PROTECT
-       if ((error = cp_handle_vnop(cp, CP_READ_ACCESS | CP_WRITE_ACCESS)) != 0) {
+       if ((error = cp_handle_vnop(vp, CP_READ_ACCESS | CP_WRITE_ACCESS, 0)) != 0) {
+               /* 
+                * If we errored here, then this means that one of two things occurred:
+                * 1. there was a problem with the decryption of the key.
+                * 2. the device is locked and we are not allowed to access this particular file.
+                * 
+                * Either way, this means that we need to shut down this upl now.  If the
+                * pl pointer is NULL (meaning that we're supposed to create the UPL
+                * ourselves), we create a upl and immediately abort it.
+                */
+               if (ap->a_pl == NULL) {
+                       /* create the upl */
+                       ubc_create_upl (vp, ap->a_f_offset, ap->a_size, &upl, &pl, 
+                                       UPL_UBC_PAGEIN | UPL_RET_ONLY_ABSENT);
+                       /* mark the range as needed so it doesn't immediately get discarded upon abort */
+                       ubc_upl_range_needed (upl, ap->a_pl_offset / PAGE_SIZE, 1);
+       
+                       /* Abort the range */
+                       ubc_upl_abort_range (upl, 0, ap->a_size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
+               }
+
+       
                return error;
        }
 #endif /* CONFIG_PROTECT */
@@ -3523,6 +4347,8 @@ hfs_vnop_pagein(struct vnop_pagein_args *ap)
                goto pagein_done;
        }
 
+       page_needed_f_offset = ap->a_f_offset + ap->a_pl_offset;
+
 retry_pagein:
        /*
         * take truncate lock (shared/recursive) to guard against 
@@ -3569,11 +4395,11 @@ retry_pagein:
        if (vfs_isforce(vp->v_mount)) {
                if (cp->c_flag & C_DELETED) {
                        /* If we don't get it, then just go ahead and operate without the lock */
-                       truncate_lock_held = hfs_try_trunclock(cp, HFS_RECURSE_TRUNCLOCK);
+                       truncate_lock_held = hfs_try_trunclock(cp, HFS_SHARED_LOCK, HFS_LOCK_SKIP_IF_EXCLUSIVE);
                }
        }
        else {
-               hfs_lock_truncate(cp, HFS_RECURSE_TRUNCLOCK);
+               hfs_lock_truncate(cp, HFS_SHARED_LOCK, HFS_LOCK_SKIP_IF_EXCLUSIVE);
                truncate_lock_held = TRUE;
        }
 
@@ -3583,9 +4409,11 @@ retry_pagein:
                error = EINVAL;
                goto pagein_done;
        }
-       isize = ap->a_size;
+       ubc_upl_range_needed(upl, ap->a_pl_offset / PAGE_SIZE, 1);
 
-       /* 
+       upl_size = isize = ap->a_size;
+
+       /*
         * Scan from the back to find the last page in the UPL, so that we 
         * aren't looking at a UPL that may have already been freed by the
         * preceding aborts/completions.
@@ -3653,6 +4481,7 @@ retry_pagein:
                        int compressed = hfs_file_is_compressed(VTOC(vp), 1); /* 1 == don't take the cnode lock */
 
                        if (compressed) {
+
                                if (truncate_lock_held) {
                                        /*
                                         * can't hold the truncate lock when calling into the decmpfs layer
@@ -3661,7 +4490,7 @@ retry_pagein:
                                         * takes the lock shared, we can deadlock if some other thread
                                         * tries to grab the lock exclusively in between.
                                         */
-                                       hfs_unlock_truncate(cp, 1);
+                                       hfs_unlock_truncate(cp, HFS_LOCK_SKIP_IF_EXCLUSIVE);
                                        truncate_lock_held = FALSE;
                                }
                                ap->a_pl = upl;
@@ -3676,12 +4505,19 @@ retry_pagein:
                                 * compressed once the compression lock is successfully taken
                                 * i.e. we would block on that lock while the file is being inflated
                                 */
+                               if (error == 0 && vnode_isfastdevicecandidate(vp)) {
+                                       (void) hfs_addhotfile(vp);
+                               }
                                if (compressed) {
                                        if (error == 0) {
                                                /* successful page-in, update the access time */
                                                VTOC(vp)->c_touch_acctime = TRUE;
                                        
-                                               /* compressed files are not hot file candidates */
+                                               //
+                                               // compressed files are not traditional hot file candidates
+                                               // but they may be for CF (which ignores the ff_bytesread
+                                               // field)
+                                               //
                                                if (VTOHFS(vp)->hfc_stage == HFC_RECORDING) {
                                                        fp->ff_bytesread = 0;
                                                }
@@ -3692,6 +4528,19 @@ retry_pagein:
                                                 * indication that the pagein needs to be redriven
                                                 */
                                                ubc_upl_abort_range(upl, (upl_offset_t) offset, xsize, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_RESTART);
+                                       } else if (error == ENOSPC) {
+
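+                                               /*
+                                                * Likely out of address space mapping the UPL in decmpfs:
+                                                * abort this range and redrive the pagein narrowed to the
+                                                * single page originally requested (page_needed_f_offset).
+                                                */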
+                                               if (upl_size == PAGE_SIZE)
+                                                       panic("decmpfs_pagein_compressed: couldn't ubc_upl_map a single page\n");
+
+                                               ubc_upl_abort_range(upl, (upl_offset_t) offset, isize, UPL_ABORT_FREE_ON_EMPTY);
+
+                                               ap->a_size = PAGE_SIZE;
+                                               ap->a_pl = NULL;
+                                               ap->a_pl_offset = 0;
+                                               ap->a_f_offset = page_needed_f_offset;
+
+                                               goto retry_pagein;
                                        }
                                        goto pagein_next_range;
                                }
@@ -3744,7 +4593,7 @@ retry_pagein:
 
                        /* When ff_bytesread exceeds 32-bits, update it behind the cnode lock. */
                        if ((fp->ff_bytesread + bytesread) > 0x00000000ffffffff && cp->c_lockowner != current_thread()) {
-                               hfs_lock(cp, HFS_FORCE_LOCK);
+                               hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
                                took_cnode_lock = 1;
                        }
                        /*
@@ -3761,6 +4610,10 @@ retry_pagein:
                                fp->ff_bytesread += bytesread;
                        }
                        cp->c_touch_acctime = TRUE;
+
+                       if (vnode_isfastdevicecandidate(vp)) {
+                               (void) hfs_addhotfile(vp);
+                       }
                        if (took_cnode_lock)
                                hfs_unlock(cp);
                }
@@ -3776,7 +4629,7 @@ pagein_next_range:
 pagein_done:
        if (truncate_lock_held == TRUE) {
        /* Release with HFS_LOCK_SKIP_IF_EXCLUSIVE to match how the truncate lock was taken above */
-               hfs_unlock_truncate(cp, 1);
+               hfs_unlock_truncate(cp, HFS_LOCK_SKIP_IF_EXCLUSIVE);
        }
 
        return (error);
@@ -3814,16 +4667,6 @@ hfs_vnop_pageout(struct vnop_pageout_args *ap)
        cp = VTOC(vp);
        fp = VTOF(vp);
        
-       /*
-        * Figure out where the file ends, for pageout purposes.  If
-        * ff_new_size > ff_size, then we're in the middle of extending the
-        * file via a write, so it is safe (and necessary) that we be able
-        * to pageout up to that point.
-        */
-       filesize = fp->ff_size;
-       if (fp->ff_new_size > filesize)
-               filesize = fp->ff_new_size;
-
        a_flags = ap->a_flags;
        a_pl_offset = ap->a_pl_offset;
 
@@ -3842,10 +4685,19 @@ hfs_vnop_pageout(struct vnop_pageout_args *ap)
                a_pl_offset = 0;
 
                /*
-                * take truncate lock (shared) to guard against 
-                * zero-fill thru fsync interfering, but only for v2 
+                * For V2 semantics, we want to take the cnode truncate lock
+                * shared to guard against the file size changing via zero-filling.
+                * 
+                * However, we have to be careful because we may be invoked 
+                * via the ubc_msync path to write out dirty mmap'd pages
+                * in response to a lock event on a content-protected
+                * filesystem (e.g. to write out class A files).
+                * As a result, we want to take the truncate lock 'SHARED' with 
+                * the mini-recursion locktype so that we don't deadlock/panic 
+                * because we may already be holding the truncate lock exclusive in order to
+                * force any other IOs to block behind us. 
                 */
-               hfs_lock_truncate(cp, HFS_SHARED_LOCK);
+               hfs_lock_truncate(cp, HFS_SHARED_LOCK, HFS_LOCK_SKIP_IF_EXCLUSIVE);
 
                if (a_flags & UPL_MSYNC) {
                        request_flags = UPL_UBC_MSYNC | UPL_RET_ONLY_DIRTY;
@@ -3866,6 +4718,16 @@ hfs_vnop_pageout(struct vnop_pageout_args *ap)
         * it was either passed in or we successfully created it
         */
 
+       /*
+        * Figure out where the file ends, for pageout purposes.  If
+        * ff_new_size > ff_size, then we're in the middle of extending the
+        * file via a write, so it is safe (and necessary) that we be able
+        * to pageout up to that point.
+        */
+       filesize = fp->ff_size;
+       if (fp->ff_new_size > filesize)
+               filesize = fp->ff_new_size;
+
        /* 
         * Now that HFS is opting into VFC_VFSVNOP_PAGEOUTV2, we may need to operate on our own  
         * UPL instead of relying on the UPL passed into us.  We go ahead and do that here,
@@ -3956,42 +4818,6 @@ hfs_vnop_pageout(struct vnop_pageout_args *ap)
                        }
                        xsize = num_of_pages * PAGE_SIZE;
 
-                       if (!vnode_isswap(vp)) {
-                               off_t end_of_range;
-                               int tooklock;
-
-                               tooklock = 0;
-
-                               if (cp->c_lockowner != current_thread()) {
-                                       if ((retval = hfs_lock(cp, HFS_EXCLUSIVE_LOCK))) {
-                                               /*
-                                                * we're in the v2 path, so we are the
-                                                * owner of the UPL... we may have already
-                                                * processed some of the UPL, so abort it
-                                                * from the current working offset to the
-                                                * end of the UPL
-                                                */
-                                               ubc_upl_abort_range(upl,
-                                                                   offset,
-                                                                   ap->a_size - offset,
-                                                                   UPL_ABORT_FREE_ON_EMPTY);
-                                               goto pageout_done;
-                                       }
-                                       tooklock = 1;
-                               }
-                               end_of_range = f_offset + xsize - 1;
-       
-                               if (end_of_range >= filesize) {
-                                       end_of_range = (off_t)(filesize - 1);
-                               }
-                               if (f_offset < filesize) {
-                                       rl_remove(f_offset, end_of_range, &fp->ff_invalidranges);
-                                       cp->c_flag |= C_MODIFIED;  /* leof is dirty */
-                               }
-                               if (tooklock) {
-                                       hfs_unlock(cp);
-                               }
-                       }
                        if ((error = cluster_pageout(vp, upl, offset, f_offset,
                                                        xsize, filesize, a_flags))) {
                                if (error_ret == 0)
@@ -4008,36 +4834,6 @@ hfs_vnop_pageout(struct vnop_pageout_args *ap)
                }
        } /* end block for v2 pageout behavior */
        else {
-               if (!vnode_isswap(vp)) {
-                       off_t end_of_range;
-                       int tooklock = 0;
-
-                       if (cp->c_lockowner != current_thread()) {
-                               if ((retval = hfs_lock(cp, HFS_EXCLUSIVE_LOCK))) {
-                                       if (!(a_flags & UPL_NOCOMMIT)) {
-                                               ubc_upl_abort_range(upl,
-                                                                   a_pl_offset,
-                                                                   ap->a_size,
-                                                                   UPL_ABORT_FREE_ON_EMPTY);
-                                       }
-                                       goto pageout_done;
-                               }
-                               tooklock = 1;
-                       }
-                       end_of_range = ap->a_f_offset + ap->a_size - 1;
-       
-                       if (end_of_range >= filesize) {
-                               end_of_range = (off_t)(filesize - 1);
-                       }
-                       if (ap->a_f_offset < filesize) {
-                               rl_remove(ap->a_f_offset, end_of_range, &fp->ff_invalidranges);
-                               cp->c_flag |= C_MODIFIED;  /* leof is dirty */
-                       }
-
-                       if (tooklock) {
-                               hfs_unlock(cp);
-                       }
-               }
                /* 
                 * just call cluster_pageout for old pre-v2 behavior
                 */
@@ -4046,26 +4842,52 @@ hfs_vnop_pageout(struct vnop_pageout_args *ap)
        }
 
        /*
-        * If data was written, update the modification time of the file.
-        * If setuid or setgid bits are set and this process is not the 
-        * superuser then clear the setuid and setgid bits as a precaution 
-        * against tampering.
+        * If data was written, update the modification time of the file
+        * but only if it's mapped writable; we will have touched the
+        * modification time for direct writes.
         */
-       if (retval == 0) {
-               cp->c_touch_modtime = TRUE;
-               cp->c_touch_chgtime = TRUE;
-               if ((cp->c_mode & (S_ISUID | S_ISGID)) &&
-                   (vfs_context_suser(ap->a_context) != 0)) {
-                       hfs_lock(cp, HFS_FORCE_LOCK);
-                       cp->c_mode &= ~(S_ISUID | S_ISGID);
-                       hfs_unlock(cp);
+       if (retval == 0 && (ubc_is_mapped_writable(vp)
+                                               || ISSET(cp->c_flag, C_MIGHT_BE_DIRTY_FROM_MAPPING))) {
+               hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
+
+               // Check again with lock
+               bool mapped_writable = ubc_is_mapped_writable(vp);
+               if (mapped_writable
+                       || ISSET(cp->c_flag, C_MIGHT_BE_DIRTY_FROM_MAPPING)) {
+                       cp->c_touch_modtime = TRUE;
+                       cp->c_touch_chgtime = TRUE;
+
+                       /*
+                        * We only need to increment the generation counter if
+                        * it's currently mapped writable because we incremented
+                        * the counter in hfs_vnop_mnomap.
+                        */
+                       if (mapped_writable)
+                               hfs_incr_gencount(VTOC(vp));
+
+                       /*
+                        * If setuid or setgid bits are set and this process is
+                        * not the superuser then clear the setuid and setgid bits
+                        * as a precaution against tampering.
+                        */
+                       if ((cp->c_mode & (S_ISUID | S_ISGID)) &&
+                               (vfs_context_suser(ap->a_context) != 0)) {
+                               cp->c_mode &= ~(S_ISUID | S_ISGID);
+                       }
                }
+
+               hfs_unlock(cp);
        }
 
 pageout_done:
        if (is_pageoutv2) {
-               /* release truncate lock (shared) */
-               hfs_unlock_truncate(cp, 0);
+               /* 
+                * Release the truncate lock.  Note that because 
+                * we may have taken the lock recursively by 
+                * being invoked via ubc_msync due to lockdown,
+                * we should release it recursively, too.
+                */
+               hfs_unlock_truncate(cp, HFS_LOCK_SKIP_IF_EXCLUSIVE);
        }
        return (retval);
 }
@@ -4123,6 +4945,168 @@ hfs_vnop_bwrite(struct vnop_bwrite_args *ap)
        return (retval);
 }
 
+
+int
+hfs_pin_block_range(struct hfsmount *hfsmp, int pin_state, uint32_t start_block, uint32_t nblocks, vfs_context_t ctx)
+{
+       _dk_cs_pin_t pin;
+       unsigned ioc;
+       int err;
+
+       memset(&pin, 0, sizeof(pin));
+       pin.cp_extent.offset = ((uint64_t)start_block) * HFSTOVCB(hfsmp)->blockSize;
+       pin.cp_extent.length = ((uint64_t)nblocks) * HFSTOVCB(hfsmp)->blockSize;
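+       /* e.g. (illustrative numbers) start_block 1000 and nblocks 16 with a
+          4096-byte allocation block give offset 4096000 and length 65536 bytes */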
+       switch (pin_state) {
+       case HFS_PIN_IT:
+               ioc = _DKIOCCSPINEXTENT;
+               pin.cp_flags = _DKIOCCSPINTOFASTMEDIA;
+               break;
+       case HFS_PIN_IT | HFS_TEMP_PIN:
+               ioc = _DKIOCCSPINEXTENT;
+               pin.cp_flags = _DKIOCCSPINTOFASTMEDIA | _DKIOCCSTEMPORARYPIN;
+               break;
+       case HFS_PIN_IT | HFS_DATALESS_PIN:
+               ioc = _DKIOCCSPINEXTENT;
+               pin.cp_flags = _DKIOCCSPINTOFASTMEDIA | _DKIOCCSPINFORSWAPFILE;
+               break;
+       case HFS_UNPIN_IT:
+               ioc = _DKIOCCSUNPINEXTENT;
+               pin.cp_flags = 0;
+               break;
+       case HFS_UNPIN_IT | HFS_EVICT_PIN:
+               ioc = _DKIOCCSPINEXTENT;
+               pin.cp_flags = _DKIOCCSPINTOSLOWMEDIA;
+               break;
+       default:
+               return EINVAL;
+       }
+       err = VNOP_IOCTL(hfsmp->hfs_devvp, ioc, (caddr_t)&pin, 0, ctx);
+       return err;
+}
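+
+/*
+ * Illustrative use (a sketch, not taken from this file): pin one in-cnode
+ * extent of a fork to the fast media tier, assuming hfsmp, fp and a
+ * vfs_context_t ctx are valid and the appropriate locks are held:
+ *
+ *     err = hfs_pin_block_range(hfsmp, HFS_PIN_IT | HFS_TEMP_PIN,
+ *                               fp->ff_extents[0].startBlock,
+ *                               fp->ff_extents[0].blockCount, ctx);
+ *
+ * hfs_pin_vnode() below issues this call once per populated extent record.
+ */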
+
+//
+// The cnode lock should already be held on entry to this function
+//
+int
+hfs_pin_vnode(struct hfsmount *hfsmp, struct vnode *vp, int pin_state, uint32_t *num_blocks_pinned, vfs_context_t ctx)
+{
+       struct filefork *fp = VTOF(vp);
+       int i, err=0, need_put=0;
+       struct vnode *rsrc_vp=NULL;
+       uint32_t npinned = 0;
+       off_t               offset;
+
+       if (num_blocks_pinned) {
+               *num_blocks_pinned = 0;
+       }
+       
+       if (vnode_vtype(vp) != VREG) {
+               /* Not allowed to pin directories or symlinks */
+               printf("hfs: can't pin vnode of type %d\n", vnode_vtype(vp));
+               return (EPERM);
+       }
+       
+       if (fp->ff_unallocblocks) {
+               printf("hfs: can't pin a vnode w/unalloced blocks (%d)\n", fp->ff_unallocblocks);
+               return (EINVAL);
+       }
+
+       /*
+        * It is possible that the caller unlocked/re-locked the cnode after checking
+        * for C_NOEXISTS|C_DELETED, in which case the file could have been deleted
+        * while the cnode was unlocked.  So check the condition again and return ENOENT so that
+        * the caller knows why we failed to pin the vnode. 
+        */
+       if (VTOC(vp)->c_flag & (C_NOEXISTS|C_DELETED)) {
+               // makes no sense to pin something that's pending deletion
+               return ENOENT;
+       }
+
+       if (fp->ff_blocks == 0 && (VTOC(vp)->c_bsdflags & UF_COMPRESSED)) {
+               if (!VNODE_IS_RSRC(vp) && hfs_vgetrsrc(hfsmp, vp, &rsrc_vp) == 0) {
+                       //printf("hfs: fileid %d resource fork nblocks: %d / size: %lld\n", VTOC(vp)->c_fileid,
+                       //       VTOC(rsrc_vp)->c_rsrcfork->ff_blocks,VTOC(rsrc_vp)->c_rsrcfork->ff_size);
+
+                       fp = VTOC(rsrc_vp)->c_rsrcfork;
+                       need_put = 1;
+               }
+       }
+       if (fp->ff_blocks == 0) {
+               if (need_put) {
+                       //
+                       // use a distinct error code for a compressed file that has no resource fork;
+                       // we return EALREADY to indicate that the data is already probably hot file
+                       // cached because it's in an EA and the attributes btree is on the ssd
+                       // 
+                       err = EALREADY;
+               } else {
+                       err = EINVAL;
+               }
+               goto out;
+       }
+
+       offset = 0;
+       for (i = 0; i < kHFSPlusExtentDensity; i++) {
+               if (fp->ff_extents[i].startBlock == 0) {
+                       break;
+               }
+
+               err = hfs_pin_block_range(hfsmp, pin_state, fp->ff_extents[i].startBlock, fp->ff_extents[i].blockCount, ctx);
+               if (err) {
+                       break;
+               } else {
+                       npinned += fp->ff_extents[i].blockCount;                        
+               }
+       }
+       
+       if (err || npinned == 0) {
+               goto out;
+       }
+
+       if (fp->ff_extents[kHFSPlusExtentDensity-1].startBlock) {
+               uint32_t pblocks;
+               uint8_t forktype = 0;
+
+               if (fp == VTOC(vp)->c_rsrcfork) {
+                       forktype = 0xff;
+               }
+               /*
+                * The file could have overflow extents, better pin them.
+                *
+                * We assume that since we are holding the cnode lock for this cnode,
+                * the file's extents cannot be manipulated, but the tree could, so we
+                * need to ensure that it doesn't change behind our back as we iterate it.
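+                *
+                * (kHFSPlusExtentDensity is 8, so a non-zero startBlock in the last
+                * in-cnode extent record is the cue that more extents may live in
+                * the overflow extents B-tree.)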
+                */
+               int lockflags = hfs_systemfile_lock (hfsmp, SFL_EXTENTS, HFS_SHARED_LOCK);
+               err = hfs_pin_overflow_extents(hfsmp, VTOC(vp)->c_fileid, forktype, &pblocks);
+               hfs_systemfile_unlock (hfsmp, lockflags);
+
+               if (err) {
+                       goto out;
+               }
+               npinned += pblocks;
+       }
+
+out:
+       if (num_blocks_pinned) {
+               *num_blocks_pinned = npinned;
+       }
+       
+       if (need_put && rsrc_vp) {
+               //
+               // have to unlock the cnode since it's shared between the
+               // resource fork vnode and the data fork vnode (and the
+               // vnode_put() may need to re-acquire the cnode lock to
+               // reclaim the resource fork vnode)
+               //
+               hfs_unlock(VTOC(vp));
+               vnode_put(rsrc_vp);
+               hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
+       }
+       return err;
+}
+
+
 /*
  * Relocate a file to a new location on disk
  *  cnode must be locked on entry
@@ -4179,7 +5163,8 @@ hfs_relocate(struct  vnode *vp, u_int32_t  blockHint, kauth_cred_t cred,
        enum vtype vnodetype;
 
        vnodetype = vnode_vtype(vp);
-       if (vnodetype != VREG && vnodetype != VLNK) {
+       if (vnodetype != VREG) {
+               /* Not allowed to move symlinks. */
                return (EPERM);
        }
        
@@ -4202,18 +5187,17 @@ hfs_relocate(struct  vnode *vp, u_int32_t  blockHint, kauth_cred_t cred,
                return EINVAL;
        }
 #endif
-
        /* If it's an SSD, also disable HFS relocation */
        if (hfsmp->hfs_flags & HFS_SSD) {
                return EINVAL;
        }
 
+
        blksize = hfsmp->blockSize;
        if (blockHint == 0)
                blockHint = hfsmp->nextAllocation;
 
-       if ((fp->ff_size > 0x7fffffff) ||
-           ((fp->ff_size > blksize) && vnodetype == VLNK)) {
+       if (fp->ff_size > 0x7fffffff) {
                return (EFBIG);
        }
 
@@ -4230,15 +5214,15 @@ hfs_relocate(struct  vnode *vp, u_int32_t  blockHint, kauth_cred_t cred,
 
        if (!vnode_issystem(vp) && (vnodetype != VLNK)) {
                hfs_unlock(cp);
-               hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK);
+               hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
                /* Force lock since callers expects lock to be held. */
-               if ((retval = hfs_lock(cp, HFS_FORCE_LOCK))) {
-                       hfs_unlock_truncate(cp, 0);
+               if ((retval = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS))) {
+                       hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
                        return (retval);
                }
                /* No need to continue if file was removed. */
                if (cp->c_flag & C_NOEXISTS) {
-                       hfs_unlock_truncate(cp, 0);
+                       hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
                        return (ENOENT);
                }
                took_trunc_lock = 1;
@@ -4253,7 +5237,7 @@ hfs_relocate(struct  vnode *vp, u_int32_t  blockHint, kauth_cred_t cred,
 
        if (hfs_start_transaction(hfsmp) != 0) {
                if (took_trunc_lock)
-                       hfs_unlock_truncate(cp, 0);
+                       hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
            return (EINVAL);
        }
        started_tr = 1;
@@ -4278,10 +5262,10 @@ hfs_relocate(struct  vnode *vp, u_int32_t  blockHint, kauth_cred_t cred,
        nextallocsave = hfsmp->nextAllocation;
        retval = ExtendFileC(hfsmp, (FCB*)fp, growsize, blockHint, eflags, &newbytes);
        if (eflags & kEFMetadataMask) {
-               HFS_MOUNT_LOCK(hfsmp, TRUE);
+               hfs_lock_mount(hfsmp);
                HFS_UPDATE_NEXT_ALLOCATION(hfsmp, nextallocsave);
                MarkVCBDirty(hfsmp);
-               HFS_MOUNT_UNLOCK(hfsmp, TRUE);
+               hfs_unlock_mount(hfsmp);
        }
 
        retval = MacToVFSError(retval);
@@ -4291,7 +5275,7 @@ hfs_relocate(struct  vnode *vp, u_int32_t  blockHint, kauth_cred_t cred,
                        retval = ENOSPC;
                        goto restore;
                } else if (fp->ff_blocks < (headblks + datablks)) {
-                       printf("hfs_relocate: allocation failed");
+                       printf("hfs_relocate: allocation failed id=%u, vol=%s\n", cp->c_cnid, hfsmp->vcbVN);
                        retval = ENOSPC;
                        goto restore;
                }
@@ -4342,7 +5326,7 @@ hfs_relocate(struct  vnode *vp, u_int32_t  blockHint, kauth_cred_t cred,
         */
 
        if (vnodetype == VLNK)
-               retval = hfs_clonelink(vp, blksize, cred, p);
+               retval = EPERM;
        else if (vnode_issystem(vp))
                retval = hfs_clonesysfile(vp, headblks, datablks, blksize, cred, p);
        else
@@ -4373,7 +5357,7 @@ hfs_relocate(struct  vnode *vp, u_int32_t  blockHint, kauth_cred_t cred,
                goto restore;
 out:
        if (took_trunc_lock)
-               hfs_unlock_truncate(cp, 0);
+               hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
 
        if (lockflags) {
                hfs_systemfile_unlock(hfsmp, lockflags);
@@ -4382,13 +5366,13 @@ out:
 
        /* Push cnode's new extent data to disk. */
        if (retval == 0) {
-               (void) hfs_update(vp, MNT_WAIT);
+               hfs_update(vp, 0);
        }
        if (hfsmp->jnl) {
                if (cp->c_cnid < kHFSFirstUserCatalogNodeID)
-                       (void) hfs_flushvolumeheader(hfsmp, MNT_WAIT, HFS_ALTFLUSH);
+                       (void) hfs_flushvolumeheader(hfsmp, HFS_FVH_WAIT | HFS_FVH_WRITE_ALT);
                else
-                       (void) hfs_flushvolumeheader(hfsmp, MNT_NOWAIT, 0);
+                       (void) hfs_flushvolumeheader(hfsmp, 0);
        }
 exit:
        if (started_tr)
@@ -4399,7 +5383,7 @@ exit:
 restore:
        if (fp->ff_blocks == headblks) {
                if (took_trunc_lock)
-                       hfs_unlock_truncate(cp, 0);
+                       hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
                goto exit;
        }
        /*
@@ -4419,44 +5403,11 @@ restore:
        lockflags = 0;
 
        if (took_trunc_lock)
-               hfs_unlock_truncate(cp, 0);
+               hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
        goto exit;
 }
 
 
-/*
- * Clone a symlink.
- *
- */
-static int
-hfs_clonelink(struct vnode *vp, int blksize, kauth_cred_t cred, __unused struct proc *p)
-{
-       struct buf *head_bp = NULL;
-       struct buf *tail_bp = NULL;
-       int error;
-
-
-       error = (int)buf_meta_bread(vp, (daddr64_t)0, blksize, cred, &head_bp);
-       if (error)
-               goto out;
-
-       tail_bp = buf_getblk(vp, (daddr64_t)1, blksize, 0, 0, BLK_META);
-       if (tail_bp == NULL) {
-               error = EIO;
-               goto out;
-       }
-       bcopy((char *)buf_dataptr(head_bp), (char *)buf_dataptr(tail_bp), blksize);
-       error = (int)buf_bwrite(tail_bp);
-out:
-       if (head_bp) {
-               buf_markinvalid(head_bp);
-               buf_brelse(head_bp);
-       }       
-       (void) buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0);
-
-       return (error);
-}
-
 /*
  * Clone a file's data within the file.
  *
@@ -4481,14 +5432,14 @@ hfs_clonefile(struct vnode *vp, int blkstart, int blkcnt, int blksize)
        hfs_unlock(VTOC(vp));
 
 #if CONFIG_PROTECT
-       if ((error = cp_handle_vnop(VTOC(vp), CP_WRITE_ACCESS)) != 0) {
-               hfs_lock(VTOC(vp), HFS_FORCE_LOCK);     
+       if ((error = cp_handle_vnop(vp, CP_WRITE_ACCESS, 0)) != 0) {
+               hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);        
                return (error);
        }
 #endif /* CONFIG_PROTECT */
 
-       if (kmem_alloc(kernel_map, (vm_offset_t *)&bufp, bufsize)) {
-               hfs_lock(VTOC(vp), HFS_FORCE_LOCK);
+       if (kmem_alloc(kernel_map, (vm_offset_t *)&bufp, bufsize, VM_KERN_MEMORY_FILE)) {
+               hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
                return (ENOMEM);
        }
 
@@ -4506,7 +5457,7 @@ hfs_clonefile(struct vnode *vp, int blkstart, int blkcnt, int blksize)
                        break;
                }
                if (uio_resid(auio) != 0) {
-                       printf("hfs_clonefile: cluster_read: uio_resid = %lld\n", uio_resid(auio));
+                       printf("hfs_clonefile: cluster_read: uio_resid = %lld\n", (int64_t)uio_resid(auio));
                        error = EIO;            
                        break;
                }
@@ -4543,7 +5494,7 @@ hfs_clonefile(struct vnode *vp, int blkstart, int blkcnt, int blksize)
                ubc_msync(vp, writebase, writebase + offset, NULL, UBC_INVALIDATE | UBC_PUSHDIRTY);
        } else {
                /*
-                * No need to call ubc_sync_range or hfs_invalbuf
+                * No need to call ubc_msync or hfs_invalbuf
                 * since the file was copied using IO_NOCACHE and
                 * the copy was done starting and ending on a page
                 * boundary in the file.
@@ -4551,7 +5502,7 @@ hfs_clonefile(struct vnode *vp, int blkstart, int blkcnt, int blksize)
        }
        kmem_free(kernel_map, (vm_offset_t)bufp, bufsize);
 
-       hfs_lock(VTOC(vp), HFS_FORCE_LOCK);     
+       hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);        
        return (error);
 }
 
@@ -4581,7 +5532,7 @@ hfs_clonesysfile(struct vnode *vp, int blkstart, int blkcnt, int blksize,
        bufsize = MIN(blkcnt * blksize, 1024 * 1024) & ~(iosize - 1);
        breadcnt = bufsize / iosize;
 
-       if (kmem_alloc(kernel_map, (vm_offset_t *)&bufp, bufsize)) {
+       if (kmem_alloc(kernel_map, (vm_offset_t *)&bufp, bufsize, VM_KERN_MEMORY_FILE)) {
                return (ENOMEM);
        }       
        start_blk = ((daddr64_t)blkstart * blksize) / iosize;
@@ -4642,3 +5593,90 @@ out:
 
        return (error);
 }
+
+errno_t hfs_flush_invalid_ranges(vnode_t vp)
+{
+       cnode_t *cp = VTOC(vp);
+
+       assert(cp->c_lockowner == current_thread());
+       assert(cp->c_truncatelockowner == current_thread());
+
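+       /* Nothing to flush unless a zero-fill sync was requested or a zero-fill timeout is armed. */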
+       if (!ISSET(cp->c_flag, C_ZFWANTSYNC) && !cp->c_zftimeout)
+               return 0;
+
+       filefork_t *fp = VTOF(vp);
+
+       /*
+        * We can't hold the cnode lock while we call cluster_write, so we
+        * need to copy the invalid ranges into a local buffer first.
+        */
+       int max_exts = 16;
+       struct ext {
+               off_t start, end;
+       } exts_buf[max_exts];           // 256 bytes
+       struct ext *exts = exts_buf;
+       int ext_count = 0;
+       errno_t ret;
+
+       struct rl_entry *r = TAILQ_FIRST(&fp->ff_invalidranges);
+
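+       /*
+        * Walk the invalid-range list, collecting ranges into exts[]; each batch is
+        * written out below once we reach the end of the list (or fill the heap buffer).
+        */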
+       while (r) {
+               /* If we have more than can fit in our stack buffer, switch
+                  to a heap buffer. */
+               if (exts == exts_buf && ext_count == max_exts) {
+                       max_exts = 256;
+                       MALLOC(exts, struct ext *, sizeof(struct ext) * max_exts,
+                                  M_TEMP, M_WAITOK);
+                       memcpy(exts, exts_buf, ext_count * sizeof(struct ext));
+               }
+
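+               /* Capture the next entry before we potentially drop the cnode lock below. */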
+               struct rl_entry *next = TAILQ_NEXT(r, rl_link);
+
+               exts[ext_count++] = (struct ext){ r->rl_start, r->rl_end };
+
+               if (!next || (ext_count == max_exts && exts != exts_buf)) {
+                       hfs_unlock(cp);
+                       for (int i = 0; i < ext_count; ++i) {
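+                               /*
+                                * With a NULL uio and IO_HEADZEROFILL, cluster_write zero-fills
+                                * the invalid range; rl_end is inclusive, hence end + 1.
+                                */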
+                               ret = cluster_write(vp, NULL, fp->ff_size, exts[i].end + 1,
+                                                                       exts[i].start, 0,
+                                                                       IO_HEADZEROFILL | IO_NOZERODIRTY | IO_NOCACHE);
+                               if (ret) {
+                                       hfs_lock_always(cp, HFS_EXCLUSIVE_LOCK);
+                                       goto exit;
+                               }
+                       }
+
+                       if (!next) {
+                               hfs_lock_always(cp, HFS_EXCLUSIVE_LOCK);
+                               break;
+                       }
+
+                       /* Push any existing clusters, which should clean up our invalid
+                          ranges as they go through hfs_vnop_blockmap. */
+                       cluster_push(vp, 0);
+
+                       hfs_lock_always(cp, HFS_EXCLUSIVE_LOCK);
+
+                       /*
+                        * Get back to where we were (we dropped the lock above).
+                        * There shouldn't be many ranges left since we just pushed.
+                        */
+                       TAILQ_FOREACH(r, &fp->ff_invalidranges, rl_link) {
+                               if (r->rl_end > exts[ext_count - 1].end)
+                                       break;
+                       }
+
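+                       /* Start a fresh batch for the ranges that remain. */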
+                       ext_count = 0;
+               } else
+                       r = next;
+       }
+
+       ret = 0;
+
+exit:
+
+       if (exts != exts_buf)
+               FREE(exts, M_TEMP);
+
+       return ret;
+}