X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/b7266188b87f3620ec3f9f717e57194a7dd989fe..c18c124eaa464aaaa5549e99e5a70fc9cbb50944:/bsd/hfs/hfs_readwrite.c

diff --git a/bsd/hfs/hfs_readwrite.c b/bsd/hfs/hfs_readwrite.c
index 97578830d..f09bdc7d2 100644
--- a/bsd/hfs/hfs_readwrite.c
+++ b/bsd/hfs/hfs_readwrite.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2009 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2015 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -41,6 +41,7 @@
 #include <sys/filedesc.h>
 #include <sys/stat.h>
 #include <sys/buf.h>
+#include <sys/buf_internal.h>
 #include <sys/proc.h>
 #include <sys/kauth.h>
 #include <sys/vnode.h>
@@ -52,6 +53,8 @@
 #include <sys/disk.h>
 #include <sys/sysctl.h>
 #include <sys/fsctl.h>
+#include <sys/mount_internal.h>
+#include <sys/file_internal.h>
 
 #include <miscfs/specfs/specdev.h>
 
@@ -82,16 +85,18 @@ enum {
 /* from bsd/hfs/hfs_vfsops.c */
 extern int hfs_vfs_vget (struct mount *mp, ino64_t ino, struct vnode **vpp, vfs_context_t context);
 
-static int  hfs_clonelink(struct vnode *, int, kauth_cred_t, struct proc *);
 static int  hfs_clonefile(struct vnode *, int, int, int);
 static int  hfs_clonesysfile(struct vnode *, int, int, int, kauth_cred_t, struct proc *);
 static int  hfs_minorupdate(struct vnode *vp);
 static int  do_hfs_truncate(struct vnode *vp, off_t length, int flags, int skip, vfs_context_t context);
 
+/* from bsd/hfs/hfs_vnops.c */
+extern decmpfs_cnode* hfs_lazy_init_decmpfs_cnode (struct cnode *cp);
+
 
-int flush_cache_on_write = 0;
-SYSCTL_INT (_kern, OID_AUTO, flush_cache_on_write, CTLFLAG_RW, &flush_cache_on_write, 0, "always flush the drive cache on writes to uncached files");
 
+int flush_cache_on_write = 0;
+SYSCTL_INT (_kern, OID_AUTO, flush_cache_on_write, CTLFLAG_RW | CTLFLAG_LOCKED, &flush_cache_on_write, 0, "always flush the drive cache on writes to uncached files");
 
 /*
  * Read data from a file.
@@ -99,6 +104,16 @@ SYSCTL_INT (_kern, OID_AUTO, flush_cache_on_write, CTLFLAG_RW, &flush_cache_on_w
 int
 hfs_vnop_read(struct vnop_read_args *ap)
 {
+	/*
+	   struct vnop_read_args {
+	   struct vnodeop_desc *a_desc;
+	   vnode_t a_vp;
+	   struct uio *a_uio;
+	   int a_ioflag;
+	   vfs_context_t a_context;
+	   };
+	 */
+
 	uio_t uio = ap->a_uio;
 	struct vnode *vp = ap->a_vp;
 	struct cnode *cp;
@@ -109,6 +124,9 @@ hfs_vnop_read(struct vnop_read_args *ap)
 	off_t start_resid = uio_resid(uio);
 	off_t offset = uio_offset(uio);
 	int retval = 0;
+	int took_truncate_lock = 0;
+	int io_throttle = 0;
+	int throttled_count = 0;
 
 	/* Preflight checks */
 	if (!vnode_isreg(vp)) {
@@ -122,7 +140,15 @@ hfs_vnop_read(struct vnop_read_args *ap)
 		return (0);		/* Nothing left to do */
 	if (offset < 0)
 		return (EINVAL);	/* cant read from a negative offset */
-	
+
+	if ((ap->a_ioflag & (IO_SKIP_ENCRYPTION|IO_SYSCALL_DISPATCH)) ==
+						(IO_SKIP_ENCRYPTION|IO_SYSCALL_DISPATCH)) {
+		/* Don't allow unencrypted io request from user space */
+		return EPERM;
+	}
+
+
+
 #if HFS_COMPRESSION
 	if (VNODE_IS_RSRC(vp)) {
 		if (hfs_hides_rsrc(ap->a_context, VTOC(vp), 1)) { /* 1 == don't take the cnode lock */
@@ -147,6 +173,14 @@ hfs_vnop_read(struct vnop_read_args *ap)
 			}
 			/* otherwise the file was converted back to a regular file while we were reading it */
 			retval = 0;
+		} else if ((VTOC(vp)->c_bsdflags & UF_COMPRESSED)) {
+			int error;
+			
+			error = check_for_dataless_file(vp, NAMESPACE_HANDLER_READ_OP);
+			if (error) {
+				return error;
+			}
+
 		}
 	}
 #endif /* HFS_COMPRESSION */
@@ -155,11 +189,36 @@ hfs_vnop_read(struct vnop_read_args *ap)
 	fp = VTOF(vp);
 	hfsmp = VTOHFS(vp);
 
+#if CONFIG_PROTECT
+	if ((retval = cp_handle_vnop (vp, CP_READ_ACCESS, ap->a_ioflag)) != 0) {
+		goto exit;
+	}
+#endif
+
+	/*
+	 * If this read request originated from a syscall (as opposed to
+	 * an in-kernel request such as a page fault), then set it up for
+	 * throttle checks.
+	 */
+	if (ap->a_ioflag & IO_SYSCALL_DISPATCH) {
+		io_throttle = IO_RETURN_ON_THROTTLE;
+	}
+
+read_again:
+
 	/* Protect against a size change. */
-	hfs_lock_truncate(cp, 0);
+	hfs_lock_truncate(cp, HFS_SHARED_LOCK, HFS_LOCK_DEFAULT);
+	took_truncate_lock = 1;
 
 	filesize = fp->ff_size;
 	filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize;
+
+	/*
+	 * Check the file size. Note that per POSIX spec, we return 0 at 
+	 * file EOF, so attempting a read at an offset that is too big
+	 * should just return 0 on HFS+. Since the return value was initialized
+	 * to 0 above, we just jump to exit.  HFS Standard has its own behavior.
+	 */
 	if (offset > filesize) {
 		if ((hfsmp->hfs_flags & HFS_STANDARD) &&
 		    (offset > (off_t)MAXHFSFILESIZE)) {
@@ -168,14 +227,14 @@ hfs_vnop_read(struct vnop_read_args *ap)
 		goto exit;
 	}
 
-	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 12)) | DBG_FUNC_START,
+	KERNEL_DEBUG(HFSDBG_READ | DBG_FUNC_START,
 		(int)uio_offset(uio), uio_resid(uio), (int)filesize, (int)filebytes, 0);
 
-	retval = cluster_read(vp, uio, filesize, ap->a_ioflag);
+	retval = cluster_read(vp, uio, filesize, ap->a_ioflag |io_throttle);
 
 	cp->c_touch_acctime = TRUE;
 
-	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 12)) | DBG_FUNC_END,
+	KERNEL_DEBUG(HFSDBG_READ | DBG_FUNC_END,
 		(int)uio_offset(uio), uio_resid(uio), (int)filesize,  (int)filebytes, 0);
 
 	/*
@@ -189,7 +248,7 @@ hfs_vnop_read(struct vnop_read_args *ap)
 
 		/* When ff_bytesread exceeds 32-bits, update it behind the cnode lock. */
 		if ((fp->ff_bytesread + bytesread) > 0x00000000ffffffff) {
-			hfs_lock(cp, HFS_FORCE_LOCK);
+			hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
 			took_cnode_lock = 1;
 		}
 		/*
@@ -209,7 +268,19 @@ hfs_vnop_read(struct vnop_read_args *ap)
 			hfs_unlock(cp);
 	}
 exit:
-	hfs_unlock_truncate(cp, 0);
+	if (took_truncate_lock) {
+		hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
+	}
+	if (retval == EAGAIN) {
+		throttle_lowpri_io(1);
+		throttled_count++;
+
+		retval = 0;
+		goto read_again;
+	}
+	if (throttled_count) {
+		throttle_info_reset_window((uthread_t)get_bsdthread_info(current_thread()));
+	}
 	return (retval);
 }
 
@@ -238,7 +309,12 @@ hfs_vnop_write(struct vnop_write_args *ap)
 	int lockflags;
 	int cnode_locked = 0;
 	int partialwrite = 0;
-	int exclusive_lock = 0;
+	int do_snapshot = 1;
+	time_t orig_ctime=VTOC(vp)->c_ctime;
+	int took_truncate_lock = 0;
+	int io_return_on_throttle = 0;
+	int throttled_count = 0;
+	struct rl_entry *invalid_range;
 
 #if HFS_COMPRESSION
 	if ( hfs_file_is_compressed(VTOC(vp), 1) ) { /* 1 == don't take the cnode lock */
@@ -247,23 +323,40 @@ hfs_vnop_write(struct vnop_write_args *ap)
 			case FILE_IS_COMPRESSED:
 				return EACCES;
 			case FILE_IS_CONVERTING:
-				/* if FILE_IS_CONVERTING, we allow writes */
+				/* if FILE_IS_CONVERTING, we allow writes but do not
+				   bother with snapshots or else we will deadlock.
+				*/
+				do_snapshot = 0;
 				break;
 			default:
 				printf("invalid state %d for compressed file\n", state);
 				/* fall through */
 		}
+	} else if ((VTOC(vp)->c_bsdflags & UF_COMPRESSED)) {
+		int error;
+		
+		error = check_for_dataless_file(vp, NAMESPACE_HANDLER_WRITE_OP);
+		if (error != 0) {
+			return error;
+		}
+	}
+
+	if (do_snapshot) {
+		check_for_tracked_file(vp, orig_ctime, NAMESPACE_HANDLER_WRITE_OP, uio);
 	}
+
 #endif
 
-	// LP64todo - fix this! uio_resid may be 64-bit value
+	if ((ioflag & (IO_SKIP_ENCRYPTION|IO_SYSCALL_DISPATCH)) ==
+						(IO_SKIP_ENCRYPTION|IO_SYSCALL_DISPATCH)) {
+		/* Don't allow unencrypted io request from user space */
+		return EPERM;
+	}
+
+
 	resid = uio_resid(uio);
 	offset = uio_offset(uio);
 
-	if (ioflag & IO_APPEND) {
-	    exclusive_lock = 1;
-	}
-	
 	if (offset < 0)
 		return (EINVAL);
 	if (resid == 0)
@@ -275,8 +368,14 @@ hfs_vnop_write(struct vnop_write_args *ap)
 	fp = VTOF(vp);
 	hfsmp = VTOHFS(vp);
 
+#if CONFIG_PROTECT
+	if ((retval = cp_handle_vnop (vp, CP_WRITE_ACCESS, 0)) != 0) {
+		goto exit;
+	}
+#endif
+
 	eflags = kEFDeferMask;	/* defer file block allocations */
-#ifdef HFS_SPARSE_DEV
+#if HFS_SPARSE_DEV
 	/* 
 	 * When the underlying device is sparse and space
 	 * is low (< 8MB), stop doing delayed allocations
@@ -289,15 +388,33 @@ hfs_vnop_write(struct vnop_write_args *ap)
 	}
 #endif /* HFS_SPARSE_DEV */
 
+	if ((ioflag & (IO_SINGLE_WRITER | IO_SYSCALL_DISPATCH)) == 
+			(IO_SINGLE_WRITER | IO_SYSCALL_DISPATCH)) {
+		io_return_on_throttle = IO_RETURN_ON_THROTTLE;
+	}
+
 again:
-	/* Protect against a size change. */
-	hfs_lock_truncate(cp, exclusive_lock);
+	/*
+	 * Protect against a size change.
+	 *
+	 * Note: If took_truncate_lock is true, then we previously got the lock shared
+	 * but needed to upgrade to exclusive.  So try getting it exclusive from the
+	 * start.
+	 */
+	if (ioflag & IO_APPEND || took_truncate_lock) {
+		hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
+	}	
+	else {
+		hfs_lock_truncate(cp, HFS_SHARED_LOCK, HFS_LOCK_DEFAULT);
+	}
+	took_truncate_lock = 1;
 
+	/* Update UIO */
 	if (ioflag & IO_APPEND) {
 		uio_setoffset(uio, fp->ff_size);
 		offset = fp->ff_size;
 	}
-	if ((cp->c_flags & APPEND) && offset != fp->ff_size) {
+	if ((cp->c_bsdflags & APPEND) && offset != fp->ff_size) {
 		retval = EPERM;
 		goto exit;
 	}
@@ -306,32 +423,77 @@ again:
 	writelimit = offset + resid;
 	filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize;
 
-	/* If the truncate lock is shared, and if we either have virtual 
-	 * blocks or will need to extend the file, upgrade the truncate 
-	 * to exclusive lock.  If upgrade fails, we lose the lock and 
-	 * have to get exclusive lock again.  Note that we want to
-	 * grab the truncate lock exclusive even if we're not allocating new blocks
-	 * because we could still be growing past the LEOF.
+	/*
+	 * We may need an exclusive truncate lock for several reasons, all
+	 * of which are because we may be writing to a (portion of a) block
+	 * for the first time, and we need to make sure no readers see the
+	 * prior, uninitialized contents of the block.  The cases are:
+	 *
+	 * 1. We have unallocated (delayed allocation) blocks.  We may be
+	 *    allocating new blocks to the file and writing to them.
+	 *    (A more precise check would be whether the range we're writing
+	 *    to contains delayed allocation blocks.)
+	 * 2. We need to extend the file.  The bytes between the old EOF
+	 *    and the new EOF are not yet initialized.  This is important
+	 *    even if we're not allocating new blocks to the file.  If the
+	 *    old EOF and new EOF are in the same block, we still need to
+	 *    protect that range of bytes until they are written for the
+	 *    first time.
+	 * 3. The write overlaps some invalid ranges (delayed zero fill; that
+	 *    part of the file has been allocated, but not yet written).
+	 *
+	 * If we had a shared lock with the above cases, we need to try to upgrade
+	 * to an exclusive lock.  If the upgrade fails, we will lose the shared
+	 * lock, and will need to take the truncate lock again; the took_truncate_lock
+	 * flag will still be set, causing us to try for an exclusive lock next time.
+	 *
+	 * NOTE: Testing for #3 (delayed zero fill) needs to be done while the cnode
+	 * lock is held, since it protects the range lists.
 	 */
-	if ((exclusive_lock == 0) && 
-	    ((fp->ff_unallocblocks != 0) || (writelimit > origFileSize))) {
-	    	exclusive_lock = 1;
-		/* Lock upgrade failed and we lost our shared lock, try again */
+	if ((cp->c_truncatelockowner == HFS_SHARED_OWNER) &&
+	    ((fp->ff_unallocblocks != 0) ||
+	     (writelimit > origFileSize))) {
 		if (lck_rw_lock_shared_to_exclusive(&cp->c_truncatelock) == FALSE) {
+			/*
+			 * Lock upgrade failed and we lost our shared lock, try again.
+			 * Note: we do not set took_truncate_lock=0 here.  Leaving it
+			 * set to 1 will cause us to try to get the lock exclusive.
+			 */
 			goto again;
 		} 
+		else {
+			/* Store the owner in the c_truncatelockowner field if we successfully upgrade */
+			cp->c_truncatelockowner = current_thread();  
+		}
 	}
 
-	if ( (retval = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK))) {
+	if ( (retval = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT))) {
 		goto exit;
 	}
 	cnode_locked = 1;
 	
-	if (!exclusive_lock) {
-		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 0)) | DBG_FUNC_START,
-		             (int)offset, uio_resid(uio), (int)fp->ff_size,
-		             (int)filebytes, 0);
+	/*
+	 * Now that we have the cnode lock, see if there are delayed zero fill ranges
+	 * overlapping our write.  If so, we need the truncate lock exclusive (see above).
+	 */
+	if ((cp->c_truncatelockowner == HFS_SHARED_OWNER) &&
+	    (rl_scan(&fp->ff_invalidranges, offset, writelimit-1, &invalid_range) != RL_NOOVERLAP)) {
+	    	/*
+		 * When testing, it appeared that calling lck_rw_lock_shared_to_exclusive() causes
+		 * a deadlock, rather than simply returning failure.  (That is, it apparently does
+		 * not behave like a "try_lock").  Since this condition is rare, just drop the
+		 * cnode lock and try again.  Since took_truncate_lock is set, we will
+		 * automatically take the truncate lock exclusive.
+		 */
+		hfs_unlock(cp);
+		cnode_locked = 0;
+		hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
+		goto again;
 	}
+	
+	KERNEL_DEBUG(HFSDBG_WRITE | DBG_FUNC_START,
+		     (int)offset, uio_resid(uio), (int)fp->ff_size,
+		     (int)filebytes, 0);
 
 	/* Check if we do not need to extend the file */
 	if (writelimit <= filebytes) {
@@ -378,7 +540,7 @@ again:
 		if (retval != E_NONE)
 			break;
 		filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize;
-		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 0)) | DBG_FUNC_NONE,
+		KERNEL_DEBUG(HFSDBG_WRITE | DBG_FUNC_NONE,
 			(int)offset, uio_resid(uio), (int)fp->ff_size,  (int)filebytes, 0);
 	}
 	(void) hfs_update(vp, TRUE);
@@ -405,7 +567,6 @@ sizeok:
 		off_t inval_end;
 		off_t io_start;
 		int lflag;
-		struct rl_entry *invalid_range;
 
 		if (writelimit > fp->ff_size)
 			filesize = writelimit;
@@ -479,7 +640,7 @@ sizeok:
 							fp->ff_size, inval_start,
 							zero_off, (off_t)0,
 							lflag | IO_HEADZEROFILL | IO_NOZERODIRTY);
-					hfs_lock(cp, HFS_FORCE_LOCK);
+					hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
 					cnode_locked = 1;
 					if (retval) goto ioerr_exit;
 					offset = uio_offset(uio);
@@ -551,9 +712,40 @@ sizeok:
 			ubc_setsize(vp, filesize);
 		}
 		retval = cluster_write(vp, uio, fp->ff_size, filesize, zero_off,
-				tail_off, lflag | IO_NOZERODIRTY);
+				tail_off, lflag | IO_NOZERODIRTY | io_return_on_throttle);
 		if (retval) {
 			fp->ff_new_size = 0;	/* no longer extending; use ff_size */
+			
+			if (retval == EAGAIN) {
+				/*
+				 * EAGAIN indicates that we still have I/O to do, but
+				 * that we now need to be throttled
+				 */
+				if (resid != uio_resid(uio)) {
+					/*
+					 * did manage to do some I/O before returning EAGAIN
+					 */
+					resid = uio_resid(uio);
+					offset = uio_offset(uio);
+
+					cp->c_touch_chgtime = TRUE;
+					cp->c_touch_modtime = TRUE;
+					hfs_incr_gencount(cp);
+				}
+				if (filesize > fp->ff_size) {
+					/*
+					 * we called ubc_setsize before the call to
+					 * cluster_write... since we only partially
+					 * completed the I/O, we need to 
+					 * re-adjust our idea of the filesize based
+					 * on our interim EOF
+					 */
+					ubc_setsize(vp, offset);
+
+					fp->ff_size = offset;
+				}
+				goto exit;
+			}
 			if (filesize > origFileSize) {
 				ubc_setsize(vp, origFileSize);
 			}
@@ -568,13 +760,7 @@ sizeok:
 				fp->ff_bytesread = 0;
 			}
 		}
-		fp->ff_new_size = 0;	/* ff_size now has the correct size */
-		
-		/* If we wrote some bytes, then touch the change and mod times */
-		if (resid > uio_resid(uio)) {
-			cp->c_touch_chgtime = TRUE;
-			cp->c_touch_modtime = TRUE;
-		}
+		fp->ff_new_size = 0;	/* ff_size now has the correct size */		
 	}
 	if (partialwrite) {
 		uio_setresid(uio, (uio_resid(uio) + bytesToAdd));
@@ -589,50 +775,61 @@ sizeok:
 	}
 
 ioerr_exit:
-	/*
-	 * If we successfully wrote any data, and we are not the superuser
-	 * we clear the setuid and setgid bits as a precaution against
-	 * tampering.
-	 */
-	if (cp->c_mode & (S_ISUID | S_ISGID)) {
-		cred = vfs_context_ucred(ap->a_context);
-		if (resid > uio_resid(uio) && cred && suser(cred, NULL)) {
-			if (!cnode_locked) {
-				hfs_lock(cp, HFS_FORCE_LOCK);
-				cnode_locked = 1;
+	if (resid > uio_resid(uio)) {
+		if (!cnode_locked) {
+			hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
+			cnode_locked = 1;
+		}
+
+		cp->c_touch_chgtime = TRUE;
+		cp->c_touch_modtime = TRUE;
+		hfs_incr_gencount(cp);
+
+		/*
+		 * If we successfully wrote any data, and we are not the superuser
+		 * we clear the setuid and setgid bits as a precaution against
+		 * tampering.
+		 */
+		if (cp->c_mode & (S_ISUID | S_ISGID)) {
+			cred = vfs_context_ucred(ap->a_context);
+			if (cred && suser(cred, NULL)) {
+				cp->c_mode &= ~(S_ISUID | S_ISGID);
 			}
-			cp->c_mode &= ~(S_ISUID | S_ISGID);
 		}
 	}
 	if (retval) {
 		if (ioflag & IO_UNIT) {
-			if (!cnode_locked) {
-				hfs_lock(cp, HFS_FORCE_LOCK);
-				cnode_locked = 1;
-			}
 			(void)hfs_truncate(vp, origFileSize, ioflag & IO_SYNC,
-			                   0, 0, ap->a_context);
-			// LP64todo - fix this!  resid needs to by user_ssize_t
+			                   0, ap->a_context);
 			uio_setoffset(uio, (uio_offset(uio) - (resid - uio_resid(uio))));
 			uio_setresid(uio, resid);
 			filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize;
 		}
-	} else if ((ioflag & IO_SYNC) && (resid > uio_resid(uio))) {
-		if (!cnode_locked) {
-			hfs_lock(cp, HFS_FORCE_LOCK);
-			cnode_locked = 1;
-		}
+	} else if ((ioflag & IO_SYNC) && (resid > uio_resid(uio)))
 		retval = hfs_update(vp, TRUE);
-	}
+
 	/* Updating vcbWrCnt doesn't need to be atomic. */
 	hfsmp->vcbWrCnt++;
 
-	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 0)) | DBG_FUNC_END,
+	KERNEL_DEBUG(HFSDBG_WRITE | DBG_FUNC_END,
 		(int)uio_offset(uio), uio_resid(uio), (int)fp->ff_size, (int)filebytes, 0);
 exit:
 	if (cnode_locked)
 		hfs_unlock(cp);
-	hfs_unlock_truncate(cp, exclusive_lock);
+	
+	if (took_truncate_lock) {
+		hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
+	}
+	if (retval == EAGAIN) {
+		throttle_lowpri_io(1);
+		throttled_count++;
+
+		retval = 0;
+		goto again;
+	}
+	if (throttled_count) {
+		throttle_info_reset_window((uthread_t)get_bsdthread_info(current_thread()));
+	}
 	return (retval);
 }
 
@@ -775,8 +972,6 @@ lookup_bucket(struct access_cache *cache, int *indexp, cnid_t parent_id)
     }
 	
     if (cache->numcached > NUM_CACHE_ENTRIES) {
-	/*printf("hfs: EGAD! numcached is %d... cut our losses and trim to %d\n",
-	  cache->numcached, NUM_CACHE_ENTRIES);*/
 	cache->numcached = NUM_CACHE_ENTRIES;
     }
 	
@@ -824,11 +1019,9 @@ add_node(struct access_cache *cache, int index, cnid_t nodeID, int access)
 
     /* if the cache is full, do a replace rather than an insert */
     if (cache->numcached >= NUM_CACHE_ENTRIES) {
-	//printf("hfs: cache is full (%d). replace at index %d\n", cache->numcached, index);
 	cache->numcached = NUM_CACHE_ENTRIES-1;
 
 	if (index > cache->numcached) {
-	    //    printf("hfs: index %d pinned to %d\n", index, cache->numcached);
 	    index = cache->numcached;
 	}
     }
@@ -858,15 +1051,15 @@ struct cinfo {
 };
 
 static int
-snoop_callback(const struct cat_desc *descp, const struct cat_attr *attrp, void * arg)
+snoop_callback(const cnode_t *cp, void *arg)
 {
-    struct cinfo *cip = (struct cinfo *)arg;
+    struct cinfo *cip = arg;
 
-    cip->uid = attrp->ca_uid;
-    cip->gid = attrp->ca_gid;
-    cip->mode = attrp->ca_mode;
-    cip->parentcnid = descp->cd_parentcnid;
-    cip->recflags = attrp->ca_recflags;
+    cip->uid = cp->c_uid;
+    cip->gid = cp->c_gid;
+    cip->mode = cp->c_mode;
+    cip->parentcnid = cp->c_parentcnid;
+    cip->recflags = cp->c_attr.ca_recflags;
 	
     return (0);
 }
@@ -883,33 +1076,41 @@ do_attr_lookup(struct hfsmount *hfsmp, struct access_cache *cache, cnid_t cnid,
 
     /* if this id matches the one the fsctl was called with, skip the lookup */
     if (cnid == skip_cp->c_cnid) {
-	cnattrp->ca_uid = skip_cp->c_uid;
-	cnattrp->ca_gid = skip_cp->c_gid;
-	cnattrp->ca_mode = skip_cp->c_mode;
-	cnattrp->ca_recflags = skip_cp->c_attr.ca_recflags;
-	keyp->hfsPlus.parentID = skip_cp->c_parentcnid;
+		cnattrp->ca_uid = skip_cp->c_uid;
+		cnattrp->ca_gid = skip_cp->c_gid;
+		cnattrp->ca_mode = skip_cp->c_mode;
+		cnattrp->ca_recflags = skip_cp->c_attr.ca_recflags;
+		keyp->hfsPlus.parentID = skip_cp->c_parentcnid;
     } else {
-	struct cinfo c_info;
-
-	/* otherwise, check the cnode hash incase the file/dir is incore */
-	if (hfs_chash_snoop(hfsmp, cnid, snoop_callback, &c_info) == 0) {
-	    cnattrp->ca_uid = c_info.uid;
-	    cnattrp->ca_gid = c_info.gid;
-	    cnattrp->ca_mode = c_info.mode;
-	    cnattrp->ca_recflags = c_info.recflags;
-	    keyp->hfsPlus.parentID = c_info.parentcnid;
-	} else {
-	    int lockflags;
-			
-	    lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK);
-			
-	    /* lookup this cnid in the catalog */
-	    error = cat_getkeyplusattr(hfsmp, cnid, keyp, cnattrp);
+		struct cinfo c_info;
+
+		/* otherwise, check the cnode hash in case the file/dir is in core */
+		error = hfs_chash_snoop(hfsmp, cnid, 0, snoop_callback, &c_info);
+
+		if (error == EACCES) {
+			// File is deleted
+			return ENOENT;
+		} else if (!error) {
+			cnattrp->ca_uid = c_info.uid;
+			cnattrp->ca_gid = c_info.gid;
+			cnattrp->ca_mode = c_info.mode;
+			cnattrp->ca_recflags = c_info.recflags;
+			keyp->hfsPlus.parentID = c_info.parentcnid;
+		} else {
+			int lockflags;
+
+			if (throttle_io_will_be_throttled(-1, HFSTOVFS(hfsmp)))
+				throttle_lowpri_io(1);
+
+			lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK);
+
+			/* lookup this cnid in the catalog */
+			error = cat_getkeyplusattr(hfsmp, cnid, keyp, cnattrp);
 			
-	    hfs_systemfile_unlock(hfsmp, lockflags);
+			hfs_systemfile_unlock(hfsmp, lockflags);
 			
-	    cache->lookups++;
-	}
+			cache->lookups++;
+		}
     }
 	
     return (error);
@@ -1004,7 +1205,7 @@ do_access_check(struct hfsmount *hfsmp, int *err, struct access_cache *cache, HF
 	    struct vnode *vp;
 
 	    /* get the vnode for this cnid */
-	    myErr = hfs_vget(hfsmp, thisNodeID, &vp, 0);
+	    myErr = hfs_vget(hfsmp, thisNodeID, &vp, 0, 0);
 	    if ( myErr ) {
 		myResult = 0;
 		goto ExitThisRoutine;
@@ -1027,21 +1228,19 @@ do_access_check(struct hfsmount *hfsmp, int *err, struct access_cache *cache, HF
 	    }
 	} else {
 	    unsigned int flags;
-		   
-	    myPerms = DerivePermissionSummary(cnattr.ca_uid, cnattr.ca_gid,
-		cnattr.ca_mode, hfsmp->hfs_mp,
-		myp_ucred, theProcPtr);
+		int mode = cnattr.ca_mode & S_IFMT;   
+		myPerms = DerivePermissionSummary(cnattr.ca_uid, cnattr.ca_gid, cnattr.ca_mode, hfsmp->hfs_mp,myp_ucred, theProcPtr);
 
-	    if (cnattr.ca_mode & S_IFDIR) {
-		flags = R_OK | X_OK;
-	    } else {
-		flags = R_OK;
-	    }
-	    if ( (myPerms & flags) != flags) {
-		myResult = 0;
-		myErr = EACCES;
-		goto ExitThisRoutine;   /* no access */
-	    }
+		if (mode == S_IFDIR) {
+			flags = R_OK | X_OK;
+		} else {
+			flags = R_OK;
+		}
+		if ( (myPerms & flags) != flags) {
+			myResult = 0;
+			myErr = EACCES;
+			goto ExitThisRoutine;   /* no access */
+		}
 
 	    /* up the hierarchy we go */
 	    thisNodeID = catkey.hfsPlus.parentID;
@@ -1080,7 +1279,7 @@ do_bulk_access_check(struct hfsmount *hfsmp, struct vnode *vp,
     boolean_t is64bit;
 
     /*
-     * NOTE: on entry, the vnode is locked. Incase this vnode
+     * NOTE: on entry, the vnode has an io_ref. In case this vnode
      * happens to be in our list of file_ids, we'll note it
      * avoid calling hfs_chashget_nowait() on that id as that
      * will cause a "locking against myself" panic.
@@ -1284,7 +1483,7 @@ do_bulk_access_check(struct hfsmount *hfsmp, struct vnode *vp,
 		struct vnode *cvp;
 		int myErr = 0;
 		/* get the vnode for this cnid */
-		myErr = hfs_vget(hfsmp, cnid, &cvp, 0);
+		myErr = hfs_vget(hfsmp, cnid, &cvp, 0, 0);
 		if ( myErr ) {
 		    access[i] = myErr;
 		    continue;
@@ -1325,7 +1524,7 @@ do_bulk_access_check(struct hfsmount *hfsmp, struct vnode *vp,
 	    access[i] = 0;
 	    continue;
 	}
-			
+	
 	myaccess = do_access_check(hfsmp, &error, &cache, catkey.hfsPlus.parentID, 
 	    skip_cp, p, cred, context,bitmap, map_size, parents, num_parents);
 			
@@ -1353,8 +1552,6 @@ do_bulk_access_check(struct hfsmount *hfsmp, struct vnode *vp,
 		
   err_exit_bulk_access:
 		
-    //printf("hfs: on exit (err %d), numfiles/numcached/cachehits/lookups is %d/%d/%d/%d\n", error, num_files, cache.numcached, cache.cachehits, cache.lookups);
-		
     if (file_ids) 
 	kfree(file_ids, sizeof(int) * num_files);
     if (parents) 
@@ -1375,24 +1572,13 @@ do_bulk_access_check(struct hfsmount *hfsmp, struct vnode *vp,
 /* end "bulk-access" support */
 
 
-/*
- * Callback for use with freeze ioctl.
- */
-static int
-hfs_freezewrite_callback(struct vnode *vp, __unused void *cargs)
-{
-	vnode_waitforwrites(vp, 0, 0, 0, "hfs freeze");
-
-	return 0;
-}
-
 /*
  * Control filesystem operating characteristics.
  */
 int
 hfs_vnop_ioctl( struct vnop_ioctl_args /* {
 		vnode_t a_vp;
-		int  a_command;
+		long  a_command;
 		caddr_t  a_data;
 		int  a_fflag;
 		vfs_context_t a_context;
@@ -1432,6 +1618,15 @@ hfs_vnop_ioctl( struct vnop_ioctl_args /* {
 
 	is64bit = proc_is64bit(p);
 
+#if CONFIG_PROTECT
+	{
+		int error = 0;
+		if ((error = cp_handle_vnop(vp, CP_WRITE_ACCESS, 0)) != 0) {
+			return error;
+		}
+	}
+#endif /* CONFIG_PROTECT */
+
 	switch (ap->a_command) {
 
 	case HFS_GETPATH:
@@ -1441,6 +1636,7 @@ hfs_vnop_ioctl( struct vnop_ioctl_args /* {
 		int  outlen;
 		char *bufptr;
 		int error;
+		int flags = 0;
 
 		/* Caller must be owner of file system. */
 		vfsp = vfs_statfs(HFSTOVFS(hfsmp));
@@ -1454,6 +1650,9 @@ hfs_vnop_ioctl( struct vnop_ioctl_args /* {
 		}
 		bufptr = (char *)ap->a_data;
 		cnid = strtoul(bufptr, NULL, 10);
+		if (ap->a_fflag & HFS_GETPATH_VOLUME_RELATIVE) {
+			flags |= BUILDPATH_VOLUME_RELATIVE; 
+		}
 
 		/* We need to call hfs_vfs_vget to leverage the code that will
 		 * fix the origin list for us if needed, as opposed to calling
@@ -1463,12 +1662,139 @@ hfs_vnop_ioctl( struct vnop_ioctl_args /* {
 		if ((error = hfs_vfs_vget(HFSTOVFS(hfsmp), cnid, &file_vp, context))) {
 			return (error);
 		}
-		error = build_path(file_vp, bufptr, sizeof(pathname_t), &outlen, 0, context);
+		error = build_path(file_vp, bufptr, sizeof(pathname_t), &outlen, flags, context);
 		vnode_put(file_vp);
 
 		return (error);
 	}
 
+	case HFS_TRANSFER_DOCUMENT_ID:
+	{
+		struct cnode *cp = NULL;
+		int error;
+		u_int32_t to_fd = *(u_int32_t *)ap->a_data;
+		struct fileproc *to_fp;
+		struct vnode *to_vp;
+		struct cnode *to_cp;
+
+		cp = VTOC(vp);
+
+		if ((error = fp_getfvp(p, to_fd, &to_fp, &to_vp)) != 0) {
+			//printf("could not get the vnode for fd %d (err %d)\n", to_fd, error);
+			return error;
+		}
+		if ( (error = vnode_getwithref(to_vp)) ) {
+			file_drop(to_fd);
+			return error;
+		}
+
+		if (VTOHFS(to_vp) != hfsmp) {
+			error = EXDEV;
+			goto transfer_cleanup;
+		}
+
+		int need_unlock = 1;
+		to_cp = VTOC(to_vp);
+		error = hfs_lockpair(cp, to_cp, HFS_EXCLUSIVE_LOCK);
+		if (error != 0) {
+			//printf("could not lock the pair of cnodes (error %d)\n", error);
+			goto transfer_cleanup;
+		}
+			
+		if (!(cp->c_bsdflags & UF_TRACKED)) {
+			error = EINVAL;
+		} else if (to_cp->c_bsdflags & UF_TRACKED) {
+			//
+			// if the destination is already tracked, return an error
+			// as otherwise it's a silent deletion of the target's
+			// document-id
+			//
+			error = EEXIST;
+		} else if (S_ISDIR(cp->c_attr.ca_mode) || S_ISREG(cp->c_attr.ca_mode) || S_ISLNK(cp->c_attr.ca_mode)) {
+			//
+			// we can use FndrExtendedFileInfo for directories too: the doc-id is
+			// the first field of both it and the ExtendedDirInfo struct, whose
+			// format is fixed and whose layout cannot change
+			//
+			struct FndrExtendedFileInfo *f_extinfo = (struct FndrExtendedFileInfo *)((u_int8_t*)cp->c_finderinfo + 16);
+			struct FndrExtendedFileInfo *to_extinfo = (struct FndrExtendedFileInfo *)((u_int8_t*)to_cp->c_finderinfo + 16);
+
+			if (f_extinfo->document_id == 0) {
+				uint32_t new_id;
+
+				hfs_unlockpair(cp, to_cp);  // have to unlock to be able to get a new-id
+				
+				if ((error = hfs_generate_document_id(hfsmp, &new_id)) == 0) {
+					//
+					// re-lock the pair now that we have the document-id
+					// NOTE(review): unlike the first hfs_lockpair() call above, the result is not checked here
+					hfs_lockpair(cp, to_cp, HFS_EXCLUSIVE_LOCK);
+					f_extinfo->document_id = new_id;
+				} else {
+					goto transfer_cleanup;
+				}
+			}
+					
+			to_extinfo->document_id = f_extinfo->document_id;
+			f_extinfo->document_id = 0;
+			//printf("TRANSFERRING: doc-id %d from ino %d to ino %d\n", to_extinfo->document_id, cp->c_fileid, to_cp->c_fileid);
+
+			// make sure the destination is also UF_TRACKED
+			to_cp->c_bsdflags |= UF_TRACKED;
+			cp->c_bsdflags &= ~UF_TRACKED;
+
+			// mark the cnodes dirty
+			cp->c_flag |= C_MODIFIED | C_FORCEUPDATE;
+			to_cp->c_flag |= C_MODIFIED | C_FORCEUPDATE;
+
+			int lockflags;
+			if ((error = hfs_start_transaction(hfsmp)) == 0) {
+
+				lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_EXCLUSIVE_LOCK);
+
+				(void) cat_update(hfsmp, &cp->c_desc, &cp->c_attr, NULL, NULL);
+				(void) cat_update(hfsmp, &to_cp->c_desc, &to_cp->c_attr, NULL, NULL);
+
+				hfs_systemfile_unlock (hfsmp, lockflags);
+				(void) hfs_end_transaction(hfsmp);
+			}
+
+#if CONFIG_FSE
+			add_fsevent(FSE_DOCID_CHANGED, context,
+				    FSE_ARG_DEV,   hfsmp->hfs_raw_dev,
+				    FSE_ARG_INO,   (ino64_t)cp->c_fileid,       // src inode #
+				    FSE_ARG_INO,   (ino64_t)to_cp->c_fileid,    // dst inode #
+				    FSE_ARG_INT32, to_extinfo->document_id,
+				    FSE_ARG_DONE);
+
+			hfs_unlockpair(cp, to_cp);    // unlock this so we can send the fsevents
+			need_unlock = 0;
+
+			if (need_fsevent(FSE_STAT_CHANGED, vp)) {
+				add_fsevent(FSE_STAT_CHANGED, context, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
+			}
+			if (need_fsevent(FSE_STAT_CHANGED, to_vp)) {
+				add_fsevent(FSE_STAT_CHANGED, context, FSE_ARG_VNODE, to_vp, FSE_ARG_DONE);
+			}
+#else
+			hfs_unlockpair(cp, to_cp);    // unlock this so we can send the fsevents
+			need_unlock = 0;
+#endif
+		}
+		
+		if (need_unlock) {
+			hfs_unlockpair(cp, to_cp);
+		}
+
+	transfer_cleanup:
+		vnode_put(to_vp);
+		file_drop(to_fd);
+
+		return error;
+	}
+
+
+
 	case HFS_PREV_LINK:
 	case HFS_NEXT_LINK:
 	{
@@ -1491,7 +1817,7 @@ hfs_vnop_ioctl( struct vnop_ioctl_args /* {
 		if (linkfileid < kHFSFirstUserCatalogNodeID) {
 			return (EINVAL);
 		}
-		if ((error = hfs_lookuplink(hfsmp, linkfileid, &prevlinkid, &nextlinkid))) {
+		if ((error = hfs_lookup_siblinglinks(hfsmp, linkfileid, &prevlinkid, &nextlinkid))) {
 			return (error);
 		}
 		if (ap->a_command == HFS_NEXT_LINK) {
@@ -1563,7 +1889,7 @@ hfs_vnop_ioctl( struct vnop_ioctl_args /* {
 		if (!vnode_isvroot(vp)) {
 			return (EINVAL);
 		}
-		HFS_MOUNT_LOCK(hfsmp, TRUE);
+		hfs_lock_mount(hfsmp);
 		location = *(u_int32_t *)ap->a_data;
 		if ((location >= hfsmp->allocLimit) &&
 			(location != HFS_NO_UPDATE_NEXT_ALLOCATION)) {
@@ -1587,11 +1913,11 @@ hfs_vnop_ioctl( struct vnop_ioctl_args /* {
 		}
 		MarkVCBDirty(hfsmp);
 fail_change_next_allocation:
-		HFS_MOUNT_UNLOCK(hfsmp, TRUE);
+		hfs_unlock_mount(hfsmp);
 		return (error);
 	}
 
-#ifdef HFS_SPARSE_DEV
+#if HFS_SPARSE_DEV
 	case HFS_SETBACKINGSTOREINFO: {
 		struct vnode * bsfs_rootvp;
 		struct vnode * di_vp;
@@ -1640,12 +1966,13 @@ fail_change_next_allocation:
 		vnode_ref(bsfs_rootvp);
 		vnode_put(bsfs_rootvp);
 
+		hfs_lock_mount(hfsmp);
 		hfsmp->hfs_backingfs_rootvp = bsfs_rootvp;
 		hfsmp->hfs_flags |= HFS_HAS_SPARSE_DEVICE;
-		hfsmp->hfs_sparsebandblks = bsdata->bandsize / HFSTOVCB(hfsmp)->blockSize;
-		hfsmp->hfs_sparsebandblks *= 4;
+		hfsmp->hfs_sparsebandblks = bsdata->bandsize / hfsmp->blockSize * 4;
+		hfs_unlock_mount(hfsmp);
 
-		vfs_markdependency(hfsmp->hfs_mp);
+		/* We check the MNTK_VIRTUALDEV bit instead of marking the dependent process */
 
 		/*
 		 * If the sparse image is on a sparse image file (as opposed to a sparse
@@ -1666,6 +1993,15 @@ fail_change_next_allocation:
 			}
 		}
 				
+		/* The free extent cache is managed differently for sparse devices.  
+		 * There is a window between when the volume is mounted and when the
+		 * device is marked as sparse, so the free extent cache for this
+		 * volume is currently initialized as for a normal volume (sorted by block
+		 * count).  Reset the cache so that it will be rebuilt again 
+		 * for sparse device (sorted by start block).
+		 */
+		ResetVCBFreeExtCache(hfsmp);
+
 		(void)vnode_put(di_vp);
 		file_drop(bsdata->backingfd);
 		return (0);
@@ -1685,16 +2021,63 @@ fail_change_next_allocation:
 		if ((hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) &&
 		    hfsmp->hfs_backingfs_rootvp) {
 
+			hfs_lock_mount(hfsmp);
 			hfsmp->hfs_flags &= ~HFS_HAS_SPARSE_DEVICE;
 			tmpvp = hfsmp->hfs_backingfs_rootvp;
 			hfsmp->hfs_backingfs_rootvp = NULLVP;
 			hfsmp->hfs_sparsebandblks = 0;
+			hfs_unlock_mount(hfsmp);
+
 			vnode_rele(tmpvp);
 		}
 		return (0);
 	}
 #endif /* HFS_SPARSE_DEV */
 
+	/* Change the next CNID stored in the VH */
+	case HFS_CHANGE_NEXTCNID: {
+		int error = 0;		/* Assume success */
+		u_int32_t fileid;
+		int wraparound = 0;
+		int lockflags = 0;
+
+		if (vnode_vfsisrdonly(vp)) {
+			return (EROFS);
+		}
+		vfsp = vfs_statfs(HFSTOVFS(hfsmp));
+		if (suser(cred, NULL) &&
+			kauth_cred_getuid(cred) != vfsp->f_owner) {
+			return (EACCES); /* must be owner of file system */
+		}
+		
+		fileid = *(u_int32_t *)ap->a_data;
+
+		/* Must have catalog lock excl. to advance the CNID pointer */
+		lockflags = hfs_systemfile_lock (hfsmp, SFL_CATALOG , HFS_EXCLUSIVE_LOCK);
+
+		hfs_lock_mount(hfsmp);
+
+		/* If it is less than the current next CNID, force the wraparound bit to be set */
+		if (fileid < hfsmp->vcbNxtCNID) {
+			wraparound=1;
+		}
+
+		/* Return previous value. */
+		*(u_int32_t *)ap->a_data = hfsmp->vcbNxtCNID;
+
+		hfsmp->vcbNxtCNID = fileid;
+
+		if (wraparound) {
+			hfsmp->vcbAtrb |= kHFSCatalogNodeIDsReusedMask;
+		}
+		
+		MarkVCBDirty(hfsmp);
+		hfs_unlock_mount(hfsmp);
+		hfs_systemfile_unlock (hfsmp, lockflags);
+
+		return (error);
+	}
+	
 	case F_FREEZE_FS: {
 		struct mount *mp;
  
@@ -1710,34 +2093,7 @@ fail_change_next_allocation:
 			!kauth_cred_issuser(cred))
 			return (EACCES);
 
-		lck_rw_lock_exclusive(&hfsmp->hfs_insync);
- 
-		// flush things before we get started to try and prevent
-		// dirty data from being paged out while we're frozen.
-		// note: can't do this after taking the lock as it will
-		// deadlock against ourselves.
-		vnode_iterate(mp, 0, hfs_freezewrite_callback, NULL);
-		hfs_global_exclusive_lock_acquire(hfsmp);
-
-		// DO NOT call hfs_journal_flush() because that takes a
-		// shared lock on the global exclusive lock!
-		journal_flush(hfsmp->jnl);
-
-		// don't need to iterate on all vnodes, we just need to
-		// wait for writes to the system files and the device vnode
-		if (HFSTOVCB(hfsmp)->extentsRefNum)
-		    vnode_waitforwrites(HFSTOVCB(hfsmp)->extentsRefNum, 0, 0, 0, "hfs freeze");
-		if (HFSTOVCB(hfsmp)->catalogRefNum)
-		    vnode_waitforwrites(HFSTOVCB(hfsmp)->catalogRefNum, 0, 0, 0, "hfs freeze");
-		if (HFSTOVCB(hfsmp)->allocationsRefNum)
-		    vnode_waitforwrites(HFSTOVCB(hfsmp)->allocationsRefNum, 0, 0, 0, "hfs freeze");
-		if (hfsmp->hfs_attribute_vp)
-		    vnode_waitforwrites(hfsmp->hfs_attribute_vp, 0, 0, 0, "hfs freeze");
-		vnode_waitforwrites(hfsmp->hfs_devvp, 0, 0, 0, "hfs freeze");
-
-		hfsmp->hfs_freezing_proc = current_proc();
-
-		return (0);
+		return hfs_freeze(hfsmp);
 	}
 
 	case F_THAW_FS: {
@@ -1746,20 +2102,7 @@ fail_change_next_allocation:
 			!kauth_cred_issuser(cred))
 			return (EACCES);
 
-		// if we're not the one who froze the fs then we
-		// can't thaw it.
-		if (hfsmp->hfs_freezing_proc != current_proc()) {
-		    return EPERM;
-		}
-
-		// NOTE: if you add code here, also go check the
-		//       code that "thaws" the fs in hfs_vnop_close()
-		//
-		hfsmp->hfs_freezing_proc = NULL;
-		hfs_global_exclusive_lock_release(hfsmp);
-		lck_rw_unlock_exclusive(&hfsmp->hfs_insync);
-
-		return (0);
+		return hfs_thaw(hfsmp, current_proc());
 	}
 
 	case HFS_BULKACCESS_FSCTL: {
@@ -1794,30 +2137,6 @@ fail_change_next_allocation:
 	    return do_bulk_access_check(hfsmp, vp, ap, size, context);
 	} 
 
-	case HFS_SETACLSTATE: {
-		int state;
-
-		if (ap->a_data == NULL) {
-			return (EINVAL);
-		}
-
-		vfsp = vfs_statfs(HFSTOVFS(hfsmp));
-		state = *(int *)ap->a_data;
-
-		if (hfsmp->hfs_flags & HFS_READ_ONLY) {
-			return (EROFS);
-		}
-		// super-user can enable or disable acl's on a volume.
-		// the volume owner can only enable acl's
-		if (!is_suser() && (state == 0 || kauth_cred_getuid(cred) != vfsp->f_owner)) {
-			return (EPERM);
-		}
-		if (state == 0 || state == 1)
-			return hfs_set_volxattr(hfsmp, HFS_SETACLSTATE, state);
-		else
-			return (EINVAL);	
-	}
-
 	case HFS_SET_XATTREXTENTS_STATE: {
 		int state;
 
@@ -1833,8 +2152,11 @@ fail_change_next_allocation:
 
 		/* Super-user can enable or disable extent-based extended 
 		 * attribute support on a volume 
+		 * Note: Starting Mac OS X 10.7, extent-based extended attributes
+		 * are enabled by default, so any change will be transient only 
+		 * till the volume is remounted.
 		 */
-		if (!is_suser()) {
+		if (!kauth_cred_issuser(kauth_cred_get())) {
 			return (EPERM);
 		}
 		if (state == 0 || state == 1)
@@ -1843,16 +2165,270 @@ fail_change_next_allocation:
 			return (EINVAL);	
 	}
 
-	case F_FULLFSYNC: {
+	case F_SETSTATICCONTENT: {
 		int error;
-		
-		if (hfsmp->hfs_flags & HFS_READ_ONLY) {
-			return (EROFS);
+		int enable_static = 0;
+		struct cnode *cp = NULL;
+		/* 
+		 * lock the cnode, decorate the cnode flag, and bail out.
+		 * VFS should have already authenticated the caller for us.
+		 */
+
+		if (ap->a_data) {
+			/* 
+			 * Note that even though ap->a_data is of type caddr_t,
+			 * the fcntl layer at the syscall handler will pass in NULL
+			 * or 1 depending on what the argument supplied to the fcntl
+			 * was.  So it is in fact correct to check the ap->a_data 
+			 * argument for zero or non-zero value when deciding whether or not
+			 * to enable the static bit in the cnode.
+			 */
+			enable_static = 1;
 		}
-		error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK);
-		if (error == 0) {
-			error = hfs_fsync(vp, MNT_WAIT, TRUE, p);
-			hfs_unlock(VTOC(vp));
+		if (hfsmp->hfs_flags & HFS_READ_ONLY) {
+			return EROFS;
+		}
+		cp = VTOC(vp);
+
+		error = hfs_lock (cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
+		if (error == 0) {
+			if (enable_static) {
+				cp->c_flag |= C_SSD_STATIC;
+			}
+			else {
+				cp->c_flag &= ~C_SSD_STATIC;
+			}
+			hfs_unlock (cp);
+		}
+		return error;
+	}
+
+	case F_SET_GREEDY_MODE: {
+		int error;
+		int enable_greedy_mode = 0;
+		struct cnode *cp = NULL;
+		/* 
+		 * lock the cnode, decorate the cnode flag, and bail out.
+		 * VFS should have already authenticated the caller for us.
+		 */
+
+		if (ap->a_data) {
+			/* 
+			 * Note that even though ap->a_data is of type caddr_t,
+			 * the fcntl layer at the syscall handler will pass in NULL
+			 * or 1 depending on what the argument supplied to the fcntl
+			 * was.  So it is in fact correct to check the ap->a_data 
+			 * argument for zero or non-zero value when deciding whether or not
+			 * to enable the greedy mode bit in the cnode.
+			 */
+			enable_greedy_mode = 1;
+		}
+		if (hfsmp->hfs_flags & HFS_READ_ONLY) {
+			return EROFS;
+		}
+		cp = VTOC(vp);
+
+		error = hfs_lock (cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
+		if (error == 0) {
+			if (enable_greedy_mode) {
+				cp->c_flag |= C_SSD_GREEDY_MODE;
+			}
+			else {
+				cp->c_flag &= ~C_SSD_GREEDY_MODE;
+			}
+			hfs_unlock (cp);
+		}
+		return error;
+	}
+
+	case F_SETIOTYPE: {
+		int error;
+		uint32_t iotypeflag = 0;
+		
+		struct cnode *cp = NULL;
+		/* 
+		 * lock the cnode, decorate the cnode flag, and bail out.
+		 * VFS should have already authenticated the caller for us.
+		 */
+
+		if (ap->a_data == NULL) {
+			return EINVAL;
+		}
+
+		/* 
+		 * Note that even though ap->a_data is of type caddr_t, we
+		 * can only use 32 bits of flag values.
+		 */
+		iotypeflag = (uint32_t) ap->a_data;
+		switch (iotypeflag) {
+			case F_IOTYPE_ISOCHRONOUS:
+				break;
+			default:
+				return EINVAL;
+		}
+
+
+		if (hfsmp->hfs_flags & HFS_READ_ONLY) {
+			return EROFS;
+		}
+		cp = VTOC(vp);
+
+		error = hfs_lock (cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
+		if (error == 0) {
+			switch (iotypeflag) {
+				case F_IOTYPE_ISOCHRONOUS:
+					cp->c_flag |= C_IO_ISOCHRONOUS;
+					break;
+				default:
+					break;
+			}
+			hfs_unlock (cp);
+		}
+		return error;
+	}
+
+	case F_MAKECOMPRESSED: {
+		int error = 0;
+		uint32_t gen_counter;
+		struct cnode *cp = NULL;
+		int reset_decmp = 0;
+
+		if (hfsmp->hfs_flags & HFS_READ_ONLY) {
+			return EROFS;
+		}
+
+		/* 
+		 * acquire & lock the cnode.
+		 * VFS should have already authenticated the caller for us.
+		 */
+
+		if (ap->a_data) {
+			/* 
+			 * Cast the pointer into a uint32_t so we can extract the 
+			 * supplied generation counter.
+			 */
+			gen_counter = *((uint32_t*)ap->a_data);
+		}
+		else {
+			return EINVAL;
+		}
+
+#if HFS_COMPRESSION
+		cp = VTOC(vp);
+		/* Grab truncate lock first; we may truncate the file */
+		hfs_lock_truncate (cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
+
+		error = hfs_lock (cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
+		if (error) {
+			hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
+			return error;
+		}
+
+		/* Are there any other usecounts/FDs? */
+		if (vnode_isinuse(vp, 1)) {
+			hfs_unlock(cp);
+			hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
+			return EBUSY;
+		}
+
+		/* now we have the cnode locked down; Validate arguments */
+		if (cp->c_attr.ca_flags & (UF_IMMUTABLE | UF_COMPRESSED)) {
+			/* EINVAL if the file is IMMUTABLE or is already marked COMPRESSED */
+			hfs_unlock(cp);
+			hfs_unlock_truncate (cp, HFS_LOCK_DEFAULT);
+			return EINVAL;
+		}
+
+		if ((hfs_get_gencount (cp)) == gen_counter) {
+			/* 
+			 * OK, the gen_counter matched.  Go for it:
+			 * Toggle state bits, truncate file, and suppress mtime update 
+			 */
+			reset_decmp = 1;
+			cp->c_bsdflags |= UF_COMPRESSED;				
+
+			error = hfs_truncate(vp, 0, IO_NDELAY, HFS_TRUNCATE_SKIPTIMES,
+								 ap->a_context);
+		}
+		else {
+			error = ESTALE;
+		}
+
+		/* Unlock cnode before executing decmpfs ; they may need to get an EA */
+		hfs_unlock(cp);
+
+		/*
+		 * Reset the decmp state while still holding the truncate lock. We need to 
+		 * serialize here against a listxattr on this node which may occur at any 
+		 * time. 
+		 * 
+		 * Even if '0/skiplock' is passed in 2nd argument to hfs_file_is_compressed,
+		 * that will still potentially require getting the com.apple.decmpfs EA. If the 
+	 	 * EA is required, then we can't hold the cnode lock, because the getxattr call is
+		 * generic(through VFS), and can't pass along any info telling it that we're already
+		 * holding it (the lock). If we don't serialize, then we risk listxattr stopping
+		 * and trying to fill in the hfs_file_is_compressed info during the callback
+		 * operation, which will result in deadlock against the b-tree node.
+		 * 
+		 * So, to serialize against listxattr (which will grab buf_t meta references on
+		 * the b-tree blocks), we hold the truncate lock as we're manipulating the 
+		 * decmpfs payload. 
+		 */
+		if ((reset_decmp) && (error == 0)) {
+			decmpfs_cnode *dp = VTOCMP (vp);
+			if (dp != NULL) {
+				decmpfs_cnode_set_vnode_state(dp, FILE_TYPE_UNKNOWN, 0);
+			}
+
+			/* Initialize the decmpfs node as needed */
+			(void) hfs_file_is_compressed (cp, 0); /* ok to take lock */
+		}
+
+		hfs_unlock_truncate (cp, HFS_LOCK_DEFAULT);
+
+#endif
+		return error;
+	}
+
+	case F_SETBACKINGSTORE: {
+
+		int error = 0;
+
+		/* 
+		 * See comment in F_SETSTATICCONTENT re: using
+	     * a null check for a_data
+  		 */
+		if (ap->a_data) {
+			error = hfs_set_backingstore (vp, 1);
+		}
+		else {
+			error = hfs_set_backingstore (vp, 0);
+		}		
+
+		return error;
+	}
+
+	case F_GETPATH_MTMINFO: {
+		int error = 0;
+
+		int *data = (int*) ap->a_data;	
+
+		/* Ask if this is a backingstore vnode */
+		error = hfs_is_backingstore (vp, data);
+
+		return error;
+	}
+
+	case F_FULLFSYNC: {
+		int error;
+		
+		if (hfsmp->hfs_flags & HFS_READ_ONLY) {
+			return (EROFS);
+		}
+		error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
+		if (error == 0) {
+			error = hfs_fsync(vp, MNT_WAIT, TRUE, p);
+			hfs_unlock(VTOC(vp));
 		}
 
 		return error;
@@ -1865,7 +2441,7 @@ fail_change_next_allocation:
 		if (!vnode_isreg(vp))
 			return EINVAL;
  
-		error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK);
+		error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
 		if (error == 0) {
 			cp = VTOC(vp);
 			/*
@@ -1891,7 +2467,7 @@ fail_change_next_allocation:
 		fp = VTOF(vp);
 
 		/* Protect against a size change. */
-		hfs_lock_truncate(VTOC(vp), TRUE);
+		hfs_lock_truncate(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
 
 #if HFS_COMPRESSION
 		if (compressed && (uncompressed_size == -1)) {
@@ -1910,85 +2486,10 @@ fail_change_next_allocation:
 			error = advisory_read(vp, fp->ff_size, ra->ra_offset, ra->ra_count);
 		}
 
-		hfs_unlock_truncate(VTOC(vp), TRUE);
+		hfs_unlock_truncate(VTOC(vp), HFS_LOCK_DEFAULT);
 		return (error);
 	}
 
-	case F_READBOOTSTRAP:
-	case F_WRITEBOOTSTRAP:
-	{
-	    struct vnode *devvp = NULL;
-	    user_fbootstraptransfer_t *user_bootstrapp;
-	    int devBlockSize;
-	    int error;
-	    uio_t auio;
-	    daddr64_t blockNumber;
-	    u_int32_t blockOffset;
-	    u_int32_t xfersize;
-	    struct buf *bp;
-	    user_fbootstraptransfer_t user_bootstrap;
-
-		if (!vnode_isvroot(vp))
-			return (EINVAL);
-		/* LP64 - when caller is a 64 bit process then we are passed a pointer 
-		 * to a user_fbootstraptransfer_t else we get a pointer to a 
-		 * fbootstraptransfer_t which we munge into a user_fbootstraptransfer_t
-		 */
-		if (hfsmp->hfs_flags & HFS_READ_ONLY) {
-			return (EROFS);
-		}
-		if (is64bit) {
-			user_bootstrapp = (user_fbootstraptransfer_t *)ap->a_data;
-		}
-		else {
-	    	user32_fbootstraptransfer_t *bootstrapp = (user32_fbootstraptransfer_t *)ap->a_data;
-			user_bootstrapp = &user_bootstrap;
-			user_bootstrap.fbt_offset = bootstrapp->fbt_offset;
-			user_bootstrap.fbt_length = bootstrapp->fbt_length;
-			user_bootstrap.fbt_buffer = CAST_USER_ADDR_T(bootstrapp->fbt_buffer);
-		}
-		if (user_bootstrapp->fbt_offset + user_bootstrapp->fbt_length > 1024) 
-			return EINVAL;
-	    
-	    devvp = VTOHFS(vp)->hfs_devvp;
-		auio = uio_create(1, user_bootstrapp->fbt_offset, 
-						  is64bit ? UIO_USERSPACE64 : UIO_USERSPACE32,
-						  (ap->a_command == F_WRITEBOOTSTRAP) ? UIO_WRITE : UIO_READ);
-		uio_addiov(auio, user_bootstrapp->fbt_buffer, user_bootstrapp->fbt_length);
-
-	    devBlockSize = vfs_devblocksize(vnode_mount(vp));
-
-	    while (uio_resid(auio) > 0) {
-			blockNumber = uio_offset(auio) / devBlockSize;
-			error = (int)buf_bread(devvp, blockNumber, devBlockSize, cred, &bp);
-			if (error) {
-				if (bp) buf_brelse(bp);
-				uio_free(auio);
-				return error;
-			};
-
-			blockOffset = uio_offset(auio) % devBlockSize;
-			xfersize = devBlockSize - blockOffset;
-			error = uiomove((caddr_t)buf_dataptr(bp) + blockOffset, (int)xfersize, auio);
-			if (error) {
-				buf_brelse(bp);
-				uio_free(auio);
-				return error;
-			};
-			if (uio_rw(auio) == UIO_WRITE) {
-				error = VNOP_BWRITE(bp);
-				if (error) {
-					uio_free(auio);
-                  	return error;
-				}
-			} else {
-				buf_brelse(bp);
-			};
-		};
-		uio_free(auio);
-	};
-	return 0;
-
 	case _IOC(IOC_OUT,'h', 4, 0):     /* Create date in local time */
 	{
 		if (is64bit) {
@@ -2008,6 +2509,10 @@ fail_change_next_allocation:
 	    *(uint32_t *)ap->a_data = hfsmp->hfs_last_mounted_mtime;
 	    break;
 
+	case HFS_FSCTL_GET_VERY_LOW_DISK:
+	    *(uint32_t*)ap->a_data = hfsmp->hfs_freespace_notify_dangerlimit;
+	    break;
+
 	case HFS_FSCTL_SET_VERY_LOW_DISK:
 	    if (*(uint32_t *)ap->a_data >= hfsmp->hfs_freespace_notify_warninglimit) {
 		return EINVAL;
@@ -2016,6 +2521,10 @@ fail_change_next_allocation:
 	    hfsmp->hfs_freespace_notify_dangerlimit = *(uint32_t *)ap->a_data;
 	    break;
 
+	case HFS_FSCTL_GET_LOW_DISK:
+	    *(uint32_t*)ap->a_data = hfsmp->hfs_freespace_notify_warninglimit;
+	    break;
+
 	case HFS_FSCTL_SET_LOW_DISK:
 	    if (   *(uint32_t *)ap->a_data >= hfsmp->hfs_freespace_notify_desiredlevel
 		|| *(uint32_t *)ap->a_data <= hfsmp->hfs_freespace_notify_dangerlimit) {
@@ -2026,6 +2535,10 @@ fail_change_next_allocation:
 	    hfsmp->hfs_freespace_notify_warninglimit = *(uint32_t *)ap->a_data;
 	    break;
 
+	case HFS_FSCTL_GET_DESIRED_DISK:
+	    *(uint32_t*)ap->a_data = hfsmp->hfs_freespace_notify_desiredlevel;
+	    break;
+
 	case HFS_FSCTL_SET_DESIRED_DISK:
 	    if (*(uint32_t *)ap->a_data <= hfsmp->hfs_freespace_notify_warninglimit) {
 		return EINVAL;
@@ -2046,18 +2559,18 @@ fail_change_next_allocation:
 		if (hfsmp->hfs_flags & HFS_READ_ONLY) {
 			return (EROFS);
 		}
-		HFS_MOUNT_LOCK(hfsmp, TRUE);
+		hfs_lock_mount (hfsmp);
 		bcopy(ap->a_data, &hfsmp->vcbFndrInfo, sizeof(hfsmp->vcbFndrInfo));
-		HFS_MOUNT_UNLOCK(hfsmp, TRUE);
+		hfs_unlock_mount (hfsmp);
 		(void) hfs_flushvolumeheader(hfsmp, MNT_WAIT, 0);
 		break;
 		
 	case HFS_GET_BOOT_INFO:
 		if (!vnode_isvroot(vp))
 			return(EINVAL);
-		HFS_MOUNT_LOCK(hfsmp, TRUE);
+		hfs_lock_mount (hfsmp);
 		bcopy(&hfsmp->vcbFndrInfo, ap->a_data, sizeof(hfsmp->vcbFndrInfo));
-		HFS_MOUNT_UNLOCK(hfsmp, TRUE);
+		hfs_unlock_mount(hfsmp);
 		break;
 
 	case HFS_MARK_BOOT_CORRUPT:
@@ -2065,7 +2578,7 @@ fail_change_next_allocation:
 		 * kHFSVolumeInconsistentBit in the volume header.  This will 
 		 * force fsck_hfs on next mount.
 		 */
-		if (!is_suser()) {
+		if (!kauth_cred_issuser(kauth_cred_get())) {
 			return EACCES;
 		}
 			
@@ -2078,7 +2591,7 @@ fail_change_next_allocation:
 			return (EROFS);
 		}
 		printf ("hfs_vnop_ioctl: Marking the boot volume corrupt.\n");
-		hfs_mark_volume_inconsistent(hfsmp);
+		hfs_mark_inconsistent(hfsmp, HFS_FSCK_FORCED);
 		break;
 
 	case HFS_FSCTL_GET_JOURNAL_INFO:
@@ -2110,6 +2623,158 @@ fail_change_next_allocation:
 	    break;
 	}    
 
+	case HFS_DISABLE_METAZONE: {
+		/* Only root can disable metadata zone */
+		if (!kauth_cred_issuser(kauth_cred_get())) {
+			return EACCES;
+		}
+		if (hfsmp->hfs_flags & HFS_READ_ONLY) {
+			return (EROFS);
+		}
+
+		/* Disable metadata zone now */
+		(void) hfs_metadatazone_init(hfsmp, true);
+		printf ("hfs: Disabling metadata zone on %s\n", hfsmp->vcbVN);
+		break;
+	}
+
+
+	case HFS_FSINFO_METADATA_BLOCKS: {
+		int error;
+		struct hfsinfo_metadata *hinfo;
+
+		hinfo = (struct hfsinfo_metadata *)ap->a_data;
+
+		/* Get information about number of metadata blocks */
+		error = hfs_getinfo_metadata_blocks(hfsmp, hinfo);
+		if (error) {
+			return error;
+		}
+
+		break;
+	}
+
+	case HFS_GET_FSINFO: {
+		hfs_fsinfo *fsinfo = (hfs_fsinfo *)ap->a_data;
+
+		/* Only root is allowed to get fsinfo */
+		if (!kauth_cred_issuser(kauth_cred_get())) {
+			return EACCES;
+		}
+
+		/*
+		 * Make sure that the caller's version number matches with
+		 * the kernel's version number.  This will make sure that
+		 * if the structures being read/written into are changed
+		 * by the kernel, the caller will not read incorrect data.
+		 *
+		 * The first three fields --- request_type, version and
+		 * flags are same for all the hfs_fsinfo structures, so
+		 * we can access the version number by assuming any
+		 * structure for now.
+		 */
+		if (fsinfo->header.version != HFS_FSINFO_VERSION) {
+			return ENOTSUP;
+		}
+
+		/* Make sure that the current file system is not marked inconsistent */
+		if (hfsmp->vcbAtrb & kHFSVolumeInconsistentMask) {
+			return EIO;
+		}
+
+		return hfs_get_fsinfo(hfsmp, ap->a_data);
+	}
+
+	case HFS_CS_FREESPACE_TRIM: {
+		int error = 0;
+		int lockflags = 0;
+
+		/* Only root allowed */
+		if (!kauth_cred_issuser(kauth_cred_get())) {
+			return EACCES;
+		}
+
+		/* 
+		 * This core functionality is similar to hfs_scan_blocks().  
+		 * The main difference is that hfs_scan_blocks() is called 
+		 * as part of mount where we are assured that the journal is 
+		 * empty to start with.  This fcntl() can be called on a 
+		 * mounted volume, therefore it has to flush the content of 
+		 * the journal as well as ensure the state of summary table. 
+		 * 
+		 * This fcntl scans over the entire allocation bitmap,
+		 * creates a list of all the free blocks, and issues TRIM
+		 * down to the underlying device.  This can take a long time
+		 * as it can generate up to 512MB of read I/O.
+		 */
+
+		if ((hfsmp->hfs_flags & HFS_SUMMARY_TABLE) == 0) {
+			error = hfs_init_summary(hfsmp);
+			if (error) {
+				printf("hfs: fsctl() could not initialize summary table for %s\n", hfsmp->vcbVN);
+				return error;
+			}
+		}
+
+		/* 
+		 * The journal maintains a list of recently deallocated blocks to
+		 * issue DKIOCUNMAPs when the corresponding journal transaction is 
+		 * flushed to the disk.  To avoid any race conditions, we only 
+		 * want one active trim list and only one thread issuing DKIOCUNMAPs.
+		 * Therefore we make sure that the journal trim list is sync'ed, 
+		 * empty, and not modifiable for the duration of our scan.
+		 * 
+		 * Take the journal lock before flushing the journal to the disk. 
+		 * We keep holding the journal lock until we have acquired the
+		 * bitmap lock to make sure that no new journal transactions can 
+		 * start.  This will make sure that the journal trim list is not 
+		 * modified after the journal flush and before getting bitmap lock.
+		 * We can release the journal lock after we acquire the bitmap 
+		 * lock as it will prevent any further block deallocations.
+		 */
+		hfs_journal_lock(hfsmp);
+
+		/* Flush the journal and wait for all I/Os to finish up */
+		error = hfs_journal_flush(hfsmp, TRUE);
+		if (error) {
+			hfs_journal_unlock(hfsmp);
+			return error;
+		}
+
+		/* Take bitmap lock to ensure it is not being modified */
+		lockflags = hfs_systemfile_lock(hfsmp, SFL_BITMAP, HFS_EXCLUSIVE_LOCK);
+
+		/* Release the journal lock */
+		hfs_journal_unlock(hfsmp);
+
+		/* 
+		 * ScanUnmapBlocks reads the bitmap in large block size 
+		 * (up to 1MB) unlike the runtime which reads the bitmap 
+		 * in the 4K block size.  This can cause buf_t collisions 
+		 * and potential data corruption.  To avoid this, we 
+		 * invalidate all the existing buffers associated with 
+		 * the bitmap vnode before scanning it.
+		 *
+		 * Note: ScanUnmapBlock() cleans up all the buffers 
+		 * after itself, so there won't be any large buffers left 
+		 * for us to clean up after it returns.
+		 */
+		error = buf_invalidateblks(hfsmp->hfs_allocation_vp, 0, 0, 0);
+		if (error) {
+			hfs_systemfile_unlock(hfsmp, lockflags);
+			return error;
+		}
+
+		/* Traverse bitmap and issue DKIOCUNMAPs */
+		error = ScanUnmapBlocks(hfsmp);
+		hfs_systemfile_unlock(hfsmp, lockflags);
+		if (error) {
+			return error;
+		}
+
+		break;
+	}
+
 	default:
 		return (ENOTTY);
 	}
@@ -2314,7 +2979,7 @@ hfs_vnop_blockmap(struct vnop_blockmap_args *ap)
 
 	if ( !vnode_issystem(vp) && !vnode_islnk(vp) && !vnode_isswap(vp)) {
 		if (VTOC(vp)->c_lockowner != current_thread()) {
-			hfs_lock(VTOC(vp), HFS_FORCE_LOCK);
+			hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
 			tooklock = 1;
 		}
 	}
@@ -2377,9 +3042,9 @@ retry:
 			cp->c_blocks += loanedBlocks;
 			fp->ff_blocks += loanedBlocks;
 
-			HFS_MOUNT_LOCK(hfsmp, TRUE);
+			hfs_lock_mount (hfsmp);
 			hfsmp->loanedBlocks += loanedBlocks;
-			HFS_MOUNT_UNLOCK(hfsmp, TRUE);
+			hfs_unlock_mount (hfsmp);
 
 			hfs_systemfile_unlock(hfsmp, lockflags);
 			cp->c_flag |= C_MODIFIED;
@@ -2421,44 +3086,43 @@ retry:
 		} 
 		
 		/* Validate if the start offset is within logical file size */
-		if (ap->a_foffset > fp->ff_size) {
-		    	goto exit;
+		if (ap->a_foffset >= fp->ff_size) {
+			goto exit;
 		}
 
-		/* Searching file extents has failed for read operation, therefore 
-		 * search rangelist for any uncommitted holes in the file. 
+		/*
+		 * At this point, we have encountered a failure during 
+		 * MapFileBlockC that resulted in ERANGE, and we are not servicing
+		 * a write, and there are borrowed blocks.
+		 * 
+		 * However, the cluster layer will not call blockmap for 
+		 * blocks that are borrowed and in-cache.  We have to assume that 
+		 * because we observed ERANGE being emitted from MapFileBlockC, this 
+		 * extent range is not valid on-disk.  So we treat this as a 
+		 * mapping that needs to be zero-filled prior to reading.  
+		 *
+		 * Note that under certain circumstances (such as non-contiguous 
+		 * userland VM mappings in the calling process), cluster_io 
+		 * may be forced to split a large I/O driven by hfs_vnop_write 
+		 * into multiple sub-I/Os that necessitate a RMW cycle.  If this is
+		 * the case here, then we have already removed the invalid range list
+		 * mapping prior to getting to this blockmap call, so we should not
+		 * search the invalid rangelist for this byte range.
 		 */
-		overlaptype = rl_scan(&fp->ff_invalidranges, ap->a_foffset,
-	        	              ap->a_foffset + (off_t)(ap->a_size - 1),
-	                	      &invalid_range);
-		switch(overlaptype) {
-		case RL_OVERLAPISCONTAINED:
-			/* start_offset <= rl_start, end_offset >= rl_end */
-			if (ap->a_foffset != invalid_range->rl_start) {
-				break;
-			}
-		case RL_MATCHINGOVERLAP:
-			/* start_offset = rl_start, end_offset = rl_end */
-		case RL_OVERLAPCONTAINSRANGE:
-			/* start_offset >= rl_start, end_offset <= rl_end */
-		case RL_OVERLAPSTARTSBEFORE:
-			/* start_offset > rl_start, end_offset >= rl_start */
-			if ((off_t)fp->ff_size > (invalid_range->rl_end + 1)) {
-				bytesContAvail = (invalid_range->rl_end + 1) - ap->a_foffset;
-			} else {
-				bytesContAvail = fp->ff_size - ap->a_foffset;
-			}
-			if (bytesContAvail > ap->a_size) {
-				bytesContAvail = ap->a_size;
-			}
-			*ap->a_bpn = (daddr64_t)-1;
-			retval = 0;
-			break;
-		case RL_OVERLAPENDSAFTER:
-			/* start_offset < rl_start, end_offset < rl_end */
-		case RL_NOOVERLAP:
-			break;
+
+		bytesContAvail = fp->ff_size - ap->a_foffset;
+		/*
+		 * Clip the contiguous available bytes to, at most, the allowable
+		 * maximum or the amount requested.
+		 */
+
+		if (bytesContAvail > ap->a_size) {
+			bytesContAvail = ap->a_size;
 		}
+
+		*ap->a_bpn = (daddr64_t) -1;
+		retval = 0;
+
 		goto exit;
 	}
 
@@ -2523,7 +3187,6 @@ exit:
 	return (MacToVFSError(retval));
 }
 
-
 /*
  * prepare and issue the I/O
  * buf_strategy knows how to deal
@@ -2535,8 +3198,91 @@ hfs_vnop_strategy(struct vnop_strategy_args *ap)
 {
 	buf_t	bp = ap->a_bp;
 	vnode_t	vp = buf_vnode(bp);
+	int error = 0;
+	
+	/* Mark buffer as containing static data if cnode flag set */
+	if (VTOC(vp)->c_flag & C_SSD_STATIC) {
+		buf_markstatic(bp);
+	}
+	
+	/* Mark buffer for greedy-mode I/O if cnode flag set */
+	if (VTOC(vp)->c_flag & C_SSD_GREEDY_MODE) {
+		bufattr_markgreedymode(&bp->b_attr);
+	}
 
-	return (buf_strategy(VTOHFS(vp)->hfs_devvp, ap));
+	/* Mark buffer as containing isochronous data if cnode flag set */
+	if (VTOC(vp)->c_flag & C_IO_ISOCHRONOUS) {
+		bufattr_markisochronous(&bp->b_attr);
+	}
+	
+#if CONFIG_PROTECT
+	cnode_t *cp = NULL; 
+	
+	if ((!bufattr_rawencrypted(&bp->b_attr)) && 
+			((cp = cp_get_protected_cnode(vp)) != NULL)) {
+		/* 
+		 * We rely upon the truncate lock to protect the
+		 * CP cache key from getting tossed prior to our IO finishing here.
+		 * Nearly all cluster io calls to manipulate file payload from HFS
+		 * take the truncate lock before calling into the cluster
+		 * layer to ensure the file size does not change, or that they
+		 * have exclusive right to change the EOF of the file.  
+		 * That same guarantee protects us here since the code that
+		 * deals with CP lock events must now take the truncate lock 
+		 * before doing anything. 
+		 *
+		 * There is one exception here:
+		 * 1) The exception is the VM swapfile IO, because HFS will
+		 * funnel the VNOP_PAGEOUT directly into a cluster_pageout call for the
+		 * swapfile code only without holding the truncate lock.  This is because
+		 * individual swapfiles are maintained at fixed-length sizes by the VM code.
+		 * In non-swapfile IO we use PAGEOUT_V2 semantics which allow us to 
+		 * create our own UPL and thus take the truncate lock before calling 
+		 * into the cluster layer.  In that case, however, we are not concerned 
+		 * with the CP blob being wiped out in the middle of the IO 
+		 * because there isn't anything to toss; the VM swapfile key stays
+		 * in-core as long as the file is open. 
+		 */
+		
+		
+		/*
+		 * Last chance: If this data protected I/O does not have unwrapped keys
+		 * present, then try to get them.  We already know that it should, by this point.
+		 */
+		if (cp->c_cpentry->cp_flags & (CP_KEY_FLUSHED | CP_NEEDS_KEYS)) {
+			int io_op = ( (buf_flags(bp) & B_READ) ? CP_READ_ACCESS : CP_WRITE_ACCESS);
+			if ((error = cp_handle_vnop(vp, io_op, 0)) != 0) {
+				/*
+				 * We have to be careful here.  By this point in the I/O path, VM or the cluster
+				 * engine has prepared a buf_t with the proper file offsets and all the rest,
+				 * so simply erroring out will result in us leaking this particular buf_t.
+				 * We need to properly decorate the buf_t just as buf_strategy would so as 
+				 * to make it appear that the I/O errored out with the particular error code.
+				 */
+				buf_seterror (bp, error);
+				buf_biodone(bp);
+				return error;
+			}
+		}
+		
+		/*
+		 *NB:
+		 * For filesystem resize, we may not have access to the underlying
+		 * file's cache key for whatever reason (device may be locked).  However,
+		 * we do not need it since we are going to use the temporary HFS-wide resize key
+		 * which is generated once we start relocating file content.  If this file's I/O 
+		 * should be done using the resize key, it will have been supplied already, so
+		 * do not attach the file's cp blob to the buffer. 
+		 */
+		if ((cp->c_cpentry->cp_flags & CP_RELOCATION_INFLIGHT) == 0) {
+			buf_setcpaddr(bp, cp->c_cpentry);
+		}
+	}
+#endif /* CONFIG_PROTECT */
+	
+	error = buf_strategy(VTOHFS(vp)->hfs_devvp, ap);
+	
+	return error;
 }
 
 static int 
@@ -2550,12 +3296,11 @@ hfs_minorupdate(struct vnode *vp) {
 	return 0;
 }
 
-static int
-do_hfs_truncate(struct vnode *vp, off_t length, int flags, int skipupdate, vfs_context_t context)
+int
+do_hfs_truncate(struct vnode *vp, off_t length, int flags, int truncateflags, vfs_context_t context)
 {
 	register struct cnode *cp = VTOC(vp);
     	struct filefork *fp = VTOF(vp);
-	struct proc *p = vfs_context_proc(context);;
 	kauth_cred_t cred = vfs_context_ucred(context);
 	int retval;
 	off_t bytesToAdd;
@@ -2565,12 +3310,14 @@ do_hfs_truncate(struct vnode *vp, off_t length, int flags, int skipupdate, vfs_c
 	int blksize;
 	struct hfsmount *hfsmp;
 	int lockflags;
+	int skipupdate = (truncateflags & HFS_TRUNCATE_SKIPUPDATE);
+	int suppress_times = (truncateflags & HFS_TRUNCATE_SKIPTIMES);
 
 	blksize = VTOVCB(vp)->blockSize;
 	fileblocks = fp->ff_blocks;
 	filebytes = (off_t)fileblocks * (off_t)blksize;
 
-	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 7)) | DBG_FUNC_START,
+	KERNEL_DEBUG(HFSDBG_TRUNCATE | DBG_FUNC_START,
 		 (int)length, (int)fp->ff_size, (int)filebytes, 0, 0);
 
 	if (length < 0)
@@ -2624,8 +3371,9 @@ do_hfs_truncate(struct vnode *vp, off_t length, int flags, int skipupdate, vfs_c
 			/* All or nothing and don't round up to clumpsize. */
 			eflags = kEFAllMask | kEFNoClumpMask;
 
-			if (cred && suser(cred, NULL) != 0)
+			if (cred && (suser(cred, NULL) != 0)) {
 				eflags |= kEFReserveMask;  /* keep a reserve */
+			}
 
 			/*
 			 * Allocate Journal and Quota files in metadata zone.
@@ -2647,6 +3395,10 @@ do_hfs_truncate(struct vnode *vp, off_t length, int flags, int skipupdate, vfs_c
 				lockflags |= SFL_EXTENTS;
 			lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
 
+			/* 
+			 * Keep growing the file as long as the current EOF is
+			 * less than the desired value.
+			 */
 			while ((length > filebytes) && (retval == E_NONE)) {
 				bytesToAdd = length - filebytes;
 				retval = MacToVFSError(ExtendFileC(VTOVCB(vp),
@@ -2670,7 +3422,7 @@ do_hfs_truncate(struct vnode *vp, off_t length, int flags, int skipupdate, vfs_c
 				if (skipupdate) {
 					(void) hfs_minorupdate(vp);
 				}
-				else {
+				else {	
 					(void) hfs_update(vp, TRUE);
 					(void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
 				}
@@ -2681,11 +3433,15 @@ do_hfs_truncate(struct vnode *vp, off_t length, int flags, int skipupdate, vfs_c
 			if (retval)
 				goto Err_Exit;
 
-			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 7)) | DBG_FUNC_NONE,
+			KERNEL_DEBUG(HFSDBG_TRUNCATE | DBG_FUNC_NONE,
 				(int)length, (int)fp->ff_size, (int)filebytes, 0, 0);
 		}
  
-		if (!(flags & IO_NOZEROFILL)) {
+		if (ISSET(flags, IO_NOZEROFILL)) {
+			// An optimisation for the hibernation file
+			if (vnode_isswap(vp))
+				rl_remove_all(&fp->ff_invalidranges);
+		} else {
 			if (UBCINFOEXISTS(vp)  && (vnode_issystem(vp) == 0) && retval == E_NONE) {
 				struct rl_entry *invalid_range;
 				off_t zero_limit;
@@ -2710,7 +3466,7 @@ do_hfs_truncate(struct vnode *vp, off_t length, int flags, int skipupdate, vfs_c
 						retval = cluster_write(vp, (struct uio *) 0, fp->ff_size, zero_limit,
 								fp->ff_size, (off_t)0,
 								(flags & IO_SYNC) | IO_HEADZEROFILL | IO_NOZERODIRTY);
-						hfs_lock(cp, HFS_FORCE_LOCK);
+						hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
 						if (retval) goto Err_Exit;
 						
 						/* Merely invalidate the remaining area, if necessary: */
@@ -2733,12 +3489,17 @@ do_hfs_truncate(struct vnode *vp, off_t length, int flags, int skipupdate, vfs_c
 					panic("hfs_truncate: invoked on non-UBC object?!");
 			};
 		}
-		cp->c_touch_modtime = TRUE;
+		if (suppress_times == 0) {
+			cp->c_touch_modtime = TRUE;
+		}
 		fp->ff_size = length;
 
 	} else { /* Shorten the size of the file */
 
-		if ((off_t)fp->ff_size > length) {
+		// An optimisation for the hibernation file
+		if (ISSET(flags, IO_NOZEROFILL) && vnode_isswap(vp)) {
+			rl_remove_all(&fp->ff_invalidranges);
+		} else if ((off_t)fp->ff_size > length) {
 			/* Any space previously marked as invalid is now irrelevant: */
 			rl_remove(length, fp->ff_size - 1, &fp->ff_invalidranges);
 		}
@@ -2751,8 +3512,7 @@ do_hfs_truncate(struct vnode *vp, off_t length, int flags, int skipupdate, vfs_c
 			u_int32_t finalblks;
 			u_int32_t loanedBlocks;
 
-			HFS_MOUNT_LOCK(hfsmp, TRUE);
-
+			hfs_lock_mount(hfsmp);
 			loanedBlocks = fp->ff_unallocblocks;
 			cp->c_blocks -= loanedBlocks;
 			fp->ff_blocks -= loanedBlocks;
@@ -2770,61 +3530,58 @@ do_hfs_truncate(struct vnode *vp, off_t length, int flags, int skipupdate, vfs_c
 				cp->c_blocks += loanedBlocks;
 				fp->ff_blocks += loanedBlocks;
 			}
-			HFS_MOUNT_UNLOCK(hfsmp, TRUE);
+			hfs_unlock_mount (hfsmp);
 		}
 
-		/*
-		 * For a TBE process the deallocation of the file blocks is
-		 * delayed until the file is closed.  And hfs_close calls
-		 * truncate with the IO_NDELAY flag set.  So when IO_NDELAY
-		 * isn't set, we make sure this isn't a TBE process.
-		 */
-		if ((flags & IO_NDELAY) || (proc_tbe(p) == 0)) {
 #if QUOTA
-		  off_t savedbytes = ((off_t)fp->ff_blocks * (off_t)blksize);
+		off_t savedbytes = ((off_t)fp->ff_blocks * (off_t)blksize);
 #endif /* QUOTA */
-		  if (hfs_start_transaction(hfsmp) != 0) {
-		      retval = EINVAL;
-		      goto Err_Exit;
-		  }
+		if (hfs_start_transaction(hfsmp) != 0) {
+			retval = EINVAL;
+			goto Err_Exit;
+		}
 
-			if (fp->ff_unallocblocks == 0) {
-				/* Protect extents b-tree and allocation bitmap */
-				lockflags = SFL_BITMAP;
-				if (overflow_extents(fp))
-					lockflags |= SFL_EXTENTS;
-				lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
+		if (fp->ff_unallocblocks == 0) {
+			/* Protect extents b-tree and allocation bitmap */
+			lockflags = SFL_BITMAP;
+			if (overflow_extents(fp))
+				lockflags |= SFL_EXTENTS;
+			lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
 
-				retval = MacToVFSError(TruncateFileC(VTOVCB(vp),
-						(FCB*)fp, length, false));
+			retval = MacToVFSError(TruncateFileC(VTOVCB(vp), (FCB*)fp, length, 0, 
+												 FORK_IS_RSRC (fp), FTOC(fp)->c_fileid, false));
 
-				hfs_systemfile_unlock(hfsmp, lockflags);
+			hfs_systemfile_unlock(hfsmp, lockflags);
+		}
+		if (hfsmp->jnl) {
+			if (retval == 0) {
+				fp->ff_size = length;
 			}
-			if (hfsmp->jnl) {
-				if (retval == 0) {
-					fp->ff_size = length;
-				}
-				if (skipupdate) {
-					(void) hfs_minorupdate(vp);
-				}
-				else {
-					(void) hfs_update(vp, TRUE);
-					(void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
-				}
+			if (skipupdate) {
+				(void) hfs_minorupdate(vp);
 			}
-			hfs_end_transaction(hfsmp);
+			else {
+				(void) hfs_update(vp, TRUE);
+				(void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
+			}
+		}
+		hfs_end_transaction(hfsmp);
 
-			filebytes = (off_t)fp->ff_blocks * (off_t)blksize;
-			if (retval)
-				goto Err_Exit;
+		filebytes = (off_t)fp->ff_blocks * (off_t)blksize;
+		if (retval)
+			goto Err_Exit;
 #if QUOTA
-			/* These are bytesreleased */
-			(void) hfs_chkdq(cp, (int64_t)-(savedbytes - filebytes), NOCRED, 0);
+		/* These are bytesreleased */
+		(void) hfs_chkdq(cp, (int64_t)-(savedbytes - filebytes), NOCRED, 0);
 #endif /* QUOTA */
-		}
-		/* Only set update flag if the logical length changes */
-		if ((off_t)fp->ff_size != length)
+
+		/* 
+		 * Only set update flag if the logical length changes & we aren't
+		 * suppressing modtime updates.
+		 */
+		if (((off_t)fp->ff_size != length) && (suppress_times == 0)) {
 			cp->c_touch_modtime = TRUE;
+		}
 		fp->ff_size = length;
 	}
 	if (cp->c_mode & (S_ISUID | S_ISGID)) {
@@ -2833,42 +3590,279 @@ do_hfs_truncate(struct vnode *vp, off_t length, int flags, int skipupdate, vfs_c
 			skipupdate = 0;
 		}
 	}
-	if (skipupdate) {
-		retval = hfs_minorupdate(vp);
-	}
-	else {
-		cp->c_touch_chgtime = TRUE;	/* status changed */
-		cp->c_touch_modtime = TRUE;	/* file data was modified */
-		retval = hfs_update(vp, MNT_WAIT);
-	}
-	if (retval) {
-		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 7)) | DBG_FUNC_NONE,
-		     -1, -1, -1, retval, 0);
-	}
+	if (skipupdate) {
+		retval = hfs_minorupdate(vp);
+	}
+	else {
+		cp->c_touch_chgtime = TRUE;	/* status changed */
+		if (suppress_times == 0) {
+			cp->c_touch_modtime = TRUE;	/* file data was modified */
+		
+			/* 
+			 * If we are not suppressing the modtime update, then
+			 * update the gen count as well.
+			 */
+			if (S_ISREG(cp->c_attr.ca_mode) || S_ISLNK (cp->c_attr.ca_mode)) {
+				hfs_incr_gencount(cp);
+			}
+		}
+
+		retval = hfs_update(vp, MNT_WAIT);
+	}
+	if (retval) {
+		KERNEL_DEBUG(HFSDBG_TRUNCATE | DBG_FUNC_NONE,
+		     -1, -1, -1, retval, 0);
+	}
+
+Err_Exit:
+
+	KERNEL_DEBUG(HFSDBG_TRUNCATE | DBG_FUNC_END,
+		 (int)length, (int)fp->ff_size, (int)filebytes, retval, 0);
+
+	return (retval);
+}
+
+/*
+ * Preparation which must be done prior to deleting the catalog record
+ * of a file (directories are rejected with EISDIR).  To keep the on-disk
+ * state as safe as possible, we remove the catalog entry before releasing
+ * the bitmap blocks and the overflow extent records.  However, some work
+ * must be done prior to deleting the catalog record.
+ *
+ * When calling this function, the cnode must exist both in memory and on-disk.
+ * If there are both resource fork and data fork vnodes, this function should
+ * be called on both.  Returns 0 on success, otherwise an errno value.
+ */
+
+int
+hfs_prepare_release_storage (struct hfsmount *hfsmp, struct vnode *vp) {
+	
+	struct filefork *fp = VTOF(vp);
+	struct cnode *cp = VTOC(vp);
+#if QUOTA
+	int retval = 0;
+#endif /* QUOTA */
+	
+	/* Cannot truncate an HFS directory! */
+	if (vnode_isdir(vp)) {
+		return (EISDIR);
+	}
+	
+	/* 
+	 * Shrink the UBC before anything else: we want to avoid pending IO
+	 * for blocks that we already know are going to be released.
+	 * This function is only called when totally removing all storage for
+	 * a file, so we can take a shortcut and immediately setsize (0);
+	 * the return value of ubc_setsize is not checked.
+	 */
+	ubc_setsize(vp, 0);
+	
+	/* This should only happen with a corrupt filesystem */
+	if ((off_t)fp->ff_size < 0)
+		return (EINVAL);
+	
+	/* 
+	 * No early-out when fp->ff_size is already 0 (as an optimization),
+	 * since there may be extra physical blocks that also need truncation.
+	 */
+#if QUOTA
+	if ((retval = hfs_getinoquota(cp))) {
+		return(retval);
+	}
+#endif /* QUOTA */
+	
+	/* Wipe out any invalid ranges which have yet to be backed by disk */
+	rl_remove(0, fp->ff_size - 1, &fp->ff_invalidranges);
+	
+	/* 
+	 * Account for any unmapped blocks. Since we're deleting the 
+	 * entire file, we don't have to worry about just shrinking
+	 * to a smaller number of borrowed blocks.
+	 */
+	if (fp->ff_unallocblocks > 0) {
+		u_int32_t loanedBlocks;
+		
+		hfs_lock_mount (hfsmp);
+		loanedBlocks = fp->ff_unallocblocks;
+		cp->c_blocks -= loanedBlocks;
+		fp->ff_blocks -= loanedBlocks;
+		fp->ff_unallocblocks = 0;
+		
+		hfsmp->loanedBlocks -= loanedBlocks;
+		
+		hfs_unlock_mount (hfsmp);
+	}
+	
+	return 0;
+}
+
+
+/*
+ * Special wrapper around calling TruncateFileC.  This function is usable
+ * even when the catalog record does not exist any longer, making it ideal
+ * for use when deleting a file.  The simplification here is that we know
+ * that we are releasing all blocks (at most HFS_BIGFILE_SIZE per transaction).
+ *
+ * Note that this function may be called when there is no vnode backing
+ * the file fork in question.  We may call this from hfs_vnop_inactive
+ * to clear out resource fork data (and may not want to clear out the data
+ * fork yet).  As a result, either fork pointer may be NULL and is skipped.
+ *
+ * The caller is responsible for saving off a copy of the filefork(s)
+ * embedded within the cnode prior to calling this function.  The pointers
+ * supplied as arguments must be valid even if the cnode is no longer valid.
+ * Returns 0 or the first error; a data fork error skips the resource fork.
+ */
+
+int 
+hfs_release_storage (struct hfsmount *hfsmp, struct filefork *datafork, 
+					 struct filefork *rsrcfork, u_int32_t fileid) {
+	
+	off_t filebytes;
+	u_int32_t fileblocks;
+	int blksize = 0;
+	int error = 0;
+	int lockflags;
+	
+	blksize = hfsmp->blockSize;
+	
+	/* Data Fork */
+	if (datafork) {
+		datafork->ff_size = 0;
+
+		fileblocks = datafork->ff_blocks;
+		filebytes = (off_t)fileblocks * (off_t)blksize;		
+		
+		/* Invalid ranges and loaned blocks were dropped before the catalog entry was removed (presumably by hfs_prepare_release_storage) */
+		
+		while (filebytes > 0) {
+			if (filebytes > HFS_BIGFILE_SIZE) {
+				filebytes -= HFS_BIGFILE_SIZE;
+			} else {
+				filebytes = 0;
+			}
+			
+			/* Start a transaction, and truncate the fork down to the new filebytes target in this iteration */
+			if (hfs_start_transaction(hfsmp) != 0) {
+				error = EINVAL;
+				break;
+			}
+			
+			if (datafork->ff_unallocblocks == 0) {
+				/* Protect extents b-tree and allocation bitmap */
+				lockflags = SFL_BITMAP;
+				if (overflow_extents(datafork))
+					lockflags |= SFL_EXTENTS;
+				lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
+				
+				error = MacToVFSError(TruncateFileC(HFSTOVCB(hfsmp), datafork, filebytes, 1, 0, fileid, false));
+				
+				hfs_systemfile_unlock(hfsmp, lockflags);
+			}
+			(void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
+			
+			/* Finish the transaction and start over if necessary */
+			hfs_end_transaction(hfsmp);
+			
+			if (error) {
+				break;
+			}
+		}
+	}
+	
+	/* Resource fork: same procedure, skipped if the data fork pass failed */
+	if (error == 0 && rsrcfork) {
+		rsrcfork->ff_size = 0;
 
+		fileblocks = rsrcfork->ff_blocks;
+		filebytes = (off_t)fileblocks * (off_t)blksize;
+		
+		/* Invalid ranges and loaned blocks were dropped before the catalog entry was removed (presumably by hfs_prepare_release_storage) */
+		
+		while (filebytes > 0) {
+			if (filebytes > HFS_BIGFILE_SIZE) {
+				filebytes -= HFS_BIGFILE_SIZE;
+			} else {
+				filebytes = 0;
+			}
+			
+			/* Start a transaction, and truncate the fork down to the new filebytes target in this iteration */
+			if (hfs_start_transaction(hfsmp) != 0) {
+				error = EINVAL;
+				break;
+			}
+			
+			if (rsrcfork->ff_unallocblocks == 0) {
+				/* Protect extents b-tree and allocation bitmap */
+				lockflags = SFL_BITMAP;
+				if (overflow_extents(rsrcfork))
+					lockflags |= SFL_EXTENTS;
+				lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
+				
+				error = MacToVFSError(TruncateFileC(HFSTOVCB(hfsmp), rsrcfork, filebytes, 1, 1, fileid, false));
+				
+				hfs_systemfile_unlock(hfsmp, lockflags);
+			}
+			(void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
+			
+			/* Finish the transaction and start over if necessary */
+			hfs_end_transaction(hfsmp);			
+			
+			if (error) {
+				break;
+			}
+		}
+	}
+	
+	return error;
+}
 
-	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 7)) | DBG_FUNC_END,
-		 (int)length, (int)fp->ff_size, (int)filebytes, retval, 0);
+errno_t hfs_ubc_setsize(vnode_t vp, off_t len, bool have_cnode_lock)
+{
+	errno_t error;
 
-	return (retval);
-}
+	/*
+	 * Set the UBC size of vp to len, giving the VM subsystem a chance
+	 * to deal with existing pages before we delete blocks.  Symlinks
+	 * don't use the UBC, so ENOENT from ubc_setsize_ex is mapped to 0
+	 * on return.  have_cnode_lock: the caller holds the cnode lock.
+	 */
+	if (have_cnode_lock) {
+		error = ubc_setsize_ex(vp, len, UBC_SETSIZE_NO_FS_REENTRY);
+		if (error == EAGAIN) {	/* retry below with the cnode lock dropped */
+			cnode_t *cp = VTOC(vp);
+
+			if (cp->c_truncatelockowner != current_thread()) {
+#if DEVELOPMENT || DEBUG
+				panic("hfs: hfs_ubc_setsize called without exclusive truncate lock!");
+#else
+				printf("hfs: hfs_ubc_setsize called without exclusive truncate lock!\n");
+#endif
+			}
 
+			hfs_unlock(cp);
+			error = ubc_setsize_ex(vp, len, 0);
+			hfs_lock_always(cp, HFS_EXCLUSIVE_LOCK);	/* caller expects the cnode lock held on return */
+		}
+	} else
+		error = ubc_setsize_ex(vp, len, 0);
 
+	return error == ENOENT ? 0 : error;
+}
 
 /*
  * Truncate a cnode to at most length size, freeing (or adding) the
  * disk blocks.
  */
-__private_extern__
 int
-hfs_truncate(struct vnode *vp, off_t length, int flags, int skipsetsize,
-             int skipupdate, vfs_context_t context)
+hfs_truncate(struct vnode *vp, off_t length, int flags,
+			 int truncateflags, vfs_context_t context)
 {
-    	struct filefork *fp = VTOF(vp);
+	struct filefork *fp = VTOF(vp);
 	off_t filebytes;
 	u_int32_t fileblocks;
-	int blksize, error = 0;
+	int blksize;
+	errno_t error = 0;
 	struct cnode *cp = VTOC(vp);
 
 	/* Cannot truncate an HFS directory! */
@@ -2876,7 +3870,7 @@ hfs_truncate(struct vnode *vp, off_t length, int flags, int skipsetsize,
 		return (EISDIR);
 	}
 	/* A swap file cannot change size. */
-	if (vnode_isswap(vp) && (length != 0)) {
+	if (vnode_isswap(vp) && length && !ISSET(flags, IO_NOAUTH)) {
 		return (EPERM);
 	}
 
@@ -2884,24 +3878,17 @@ hfs_truncate(struct vnode *vp, off_t length, int flags, int skipsetsize,
 	fileblocks = fp->ff_blocks;
 	filebytes = (off_t)fileblocks * (off_t)blksize;
 
-	//
-	// Have to do this here so that we don't wind up with
-	// i/o pending for blocks that are about to be released
-	// if we truncate the file.
-	//
-	// If skipsetsize is set, then the caller is responsible
-	// for the ubc_setsize.
-	//
-	// Even if skipsetsize is set, if the length is zero we
-	// want to call ubc_setsize() because as of SnowLeopard
-	// it will no longer cause any page-ins and it will drop
-	// any dirty pages so that we don't do any i/o that we
-	// don't have to.  This also prevents a race where i/o
-	// for truncated blocks may overwrite later data if the
-	// blocks get reallocated to a different file.
-	//
-	if (!skipsetsize || length == 0)
-		ubc_setsize(vp, length);
+	bool caller_has_cnode_lock = (cp->c_lockowner == current_thread());
+
+	error = hfs_ubc_setsize(vp, length, caller_has_cnode_lock);
+	if (error)
+		return error;
+
+	if (!caller_has_cnode_lock) {
+		error = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
+		if (error)
+			return error;
+	}
 
 	// have to loop truncating or growing files that are
 	// really big because otherwise transactions can get
@@ -2909,40 +3896,47 @@ hfs_truncate(struct vnode *vp, off_t length, int flags, int skipsetsize,
 
 	if (length < filebytes) {
 		while (filebytes > length) {
-			if ((filebytes - length) > HFS_BIGFILE_SIZE && overflow_extents(fp)) {
+			if ((filebytes - length) > HFS_BIGFILE_SIZE) {
 		    		filebytes -= HFS_BIGFILE_SIZE;
 			} else {
 		    		filebytes = length;
 			}
 			cp->c_flag |= C_FORCEUPDATE;
-			error = do_hfs_truncate(vp, filebytes, flags, skipupdate, context);
+			error = do_hfs_truncate(vp, filebytes, flags, truncateflags, context);
 			if (error)
 				break;
 		}
 	} else if (length > filebytes) {
 		while (filebytes < length) {
-			if ((length - filebytes) > HFS_BIGFILE_SIZE && overflow_extents(fp)) {
+			if ((length - filebytes) > HFS_BIGFILE_SIZE) {
 				filebytes += HFS_BIGFILE_SIZE;
 			} else {
 				filebytes = length;
 			}
 			cp->c_flag |= C_FORCEUPDATE;
-			error = do_hfs_truncate(vp, filebytes, flags, skipupdate, context);
+			error = do_hfs_truncate(vp, filebytes, flags, truncateflags, context);
 			if (error)
 				break;
 		}
 	} else /* Same logical size */ {
 
-		error = do_hfs_truncate(vp, length, flags, skipupdate, context);
+		error = do_hfs_truncate(vp, length, flags, truncateflags, context);
 	}
 	/* Files that are changing size are not hot file candidates. */
 	if (VTOHFS(vp)->hfc_stage == HFC_RECORDING) {
 		fp->ff_bytesread = 0;
 	}
 
-	return (error);
-}
+	if (!caller_has_cnode_lock)
+		hfs_unlock(cp);
 
+	// Make sure UBC's size matches up (in case we didn't completely succeed)
+	errno_t err2 = hfs_ubc_setsize(vp, fp->ff_size, caller_has_cnode_lock);
+	if (!error)
+		error = err2;
+
+	return error;
+}
 
 
 /*
@@ -2974,6 +3968,7 @@ hfs_vnop_allocate(struct vnop_allocate_args /* {
 	struct hfsmount *hfsmp;
 	kauth_cred_t cred = vfs_context_ucred(ap->a_context);
 	int lockflags;
+	time_t orig_ctime;
 
 	*(ap->a_bytesallocated) = 0;
 
@@ -2984,9 +3979,13 @@ hfs_vnop_allocate(struct vnop_allocate_args /* {
 	
 	cp = VTOC(vp);
 
-	hfs_lock_truncate(cp, TRUE);
+	orig_ctime = VTOC(vp)->c_ctime;
 
-	if ((retval = hfs_lock(cp, HFS_EXCLUSIVE_LOCK))) {
+	check_for_tracked_file(vp, orig_ctime, ap->a_length == 0 ? NAMESPACE_HANDLER_TRUNCATE_OP|NAMESPACE_HANDLER_DELETE_OP : NAMESPACE_HANDLER_TRUNCATE_OP, NULL);
+
+	hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
+
+	if ((retval = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT))) {
 		goto Err_Exit;
 	}
 	
@@ -3075,13 +4074,13 @@ hfs_vnop_allocate(struct vnop_allocate_args /* {
 		    /* Protect extents b-tree and allocation bitmap */
 		    lockflags = SFL_BITMAP;
 		    if (overflow_extents(fp))
-			lockflags |= SFL_EXTENTS;
+				lockflags |= SFL_EXTENTS;
 		    lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
 
 		    if (moreBytesRequested >= HFS_BIGFILE_SIZE) {
-			bytesRequested = HFS_BIGFILE_SIZE;
+				bytesRequested = HFS_BIGFILE_SIZE;
 		    } else {
-			bytesRequested = moreBytesRequested;
+				bytesRequested = moreBytesRequested;
 		    }
 
 		    if (extendFlags & kEFContigMask) {
@@ -3138,14 +4137,18 @@ hfs_vnop_allocate(struct vnop_allocate_args /* {
 
 	} else { /* Shorten the size of the file */
 
-		if (fp->ff_size > length) {
-			/*
-			 * Any buffers that are past the truncation point need to be
-			 * invalidated (to maintain buffer cache consistency).
-			 */
-		}
+		/*
+		 * N.B. At present, this code is never called.  If and when we
+		 * do start using it, it looks like there might be slightly
+		 * strange semantics with the file size: it's possible for the
+		 * file size to *increase* e.g. if current file size is 5,
+		 * length is 1024 and filebytes is 4096, the file size will
+		 * end up being 1024 bytes.  This isn't necessarily a problem
+		 * but it's not consistent with the code above which doesn't
+		 * change the file size.
+		 */
 
-		retval = hfs_truncate(vp, length, 0, 0, 0, ap->a_context);
+		retval = hfs_truncate(vp, length, 0, 0, ap->a_context);
 		filebytes = (off_t)fp->ff_blocks * (off_t)vcb->blockSize;
 
 		/*
@@ -3161,9 +4164,7 @@ hfs_vnop_allocate(struct vnop_allocate_args /* {
 		if (fp->ff_size > filebytes) {
 			fp->ff_size = filebytes;
 
-			hfs_unlock(cp);
-			ubc_setsize(vp, fp->ff_size);
-			hfs_lock(cp, HFS_FORCE_LOCK);
+			hfs_ubc_setsize(vp, fp->ff_size, true);
 		}
 	}
 
@@ -3175,7 +4176,7 @@ Std_Exit:
 	if (retval == 0)
 		retval = retval2;
 Err_Exit:
-	hfs_unlock_truncate(cp, TRUE);
+	hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
 	hfs_unlock(cp);
 	return (retval);
 }
@@ -3198,74 +4199,339 @@ hfs_vnop_pagein(struct vnop_pagein_args *ap)
 	};
 */
 {
-	vnode_t vp = ap->a_vp;
-	int error;
+	vnode_t 	vp;
+	struct cnode	*cp;
+	struct filefork *fp;
+	int		error = 0;
+	upl_t 		upl;
+	upl_page_info_t	*pl;
+	off_t		f_offset;
+	off_t		page_needed_f_offset;
+	int		offset;
+	int		isize; 
+	int		upl_size; 
+	int		pg_index;
+	boolean_t	truncate_lock_held = FALSE;
+	boolean_t 	file_converted = FALSE;
+	kern_return_t	kret;
+	
+	vp = ap->a_vp;
+	cp = VTOC(vp);
+	fp = VTOF(vp);
+
+#if CONFIG_PROTECT
+	if ((error = cp_handle_vnop(vp, CP_READ_ACCESS | CP_WRITE_ACCESS, 0)) != 0) {
+		/* 
+		 * If we errored here, then this means that one of two things occurred:
+		 * 1. there was a problem with the decryption of the key.
+		 * 2. the device is locked and we are not allowed to access this particular file.
+		 * 
+		 * Either way, this means that we need to shut down this upl now.  As long as 
+		 * the pl pointer is NULL (meaning that we're supposed to create the UPL ourselves)
+		 * then we create a upl and immediately abort it.
+		 */
+		if (ap->a_pl == NULL) {
+			/* create the upl */
+			ubc_create_upl (vp, ap->a_f_offset, ap->a_size, &upl, &pl, 
+					UPL_UBC_PAGEIN | UPL_RET_ONLY_ABSENT);
+			/* mark the range as needed so it doesn't immediately get discarded upon abort */
+			ubc_upl_range_needed (upl, ap->a_pl_offset / PAGE_SIZE, 1);
+	
+			/* Abort the range */
+			ubc_upl_abort_range (upl, 0, ap->a_size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
+		}
+
+	
+		return error;
+	}
+#endif /* CONFIG_PROTECT */
+
+	if (ap->a_pl != NULL) {
+		/*
+		 * this can only happen for swap files now that
+		 * we're asking for V2 paging behavior...
+		 * so don't need to worry about decompression, or
+		 * keeping track of blocks read or taking the truncate lock
+		 */
+		error = cluster_pagein(vp, ap->a_pl, ap->a_pl_offset, ap->a_f_offset,
+				       ap->a_size, (off_t)fp->ff_size, ap->a_flags);
+		goto pagein_done;
+	}
+
+	page_needed_f_offset = ap->a_f_offset + ap->a_pl_offset;
+
+retry_pagein:
+	/*
+	 * take truncate lock (shared/recursive) to guard against 
+	 * zero-fill thru fsync interfering, but only for v2
+	 *
+	 * the HFS_LOCK_SKIP_IF_EXCLUSIVE arg indicates that we want the 
+	 * lock shared and we are allowed to recurse 1 level if this thread already
+	 * owns the lock exclusively... this can legally occur
+	 * if we are doing a shrinking ftruncate against a file
+	 * that is mapped private, and the pages being truncated
+	 * do not currently exist in the cache... in that case
+	 * we will have to page-in the missing pages in order
+	 * to provide them to the private mapping... we must
+	 * also call hfs_unlock_truncate with a positive been_recursed 
+	 * arg to indicate that if we have recursed, there is no need to drop
+	 * the lock.  Allowing this simple recursion is necessary
+	 * in order to avoid a certain deadlock... since the ftruncate
+	 * already holds the truncate lock exclusively, if we try
+	 * to acquire it shared to protect the pagein path, we will
+	 * hang this thread
+	 *
+	 * NOTE: The if () block below is a workaround in order to prevent a 
+	 * VM deadlock. See rdar://7853471.
+	 * 
+	 * If we are in a forced unmount, then launchd will still have the 
+	 * dyld_shared_cache file mapped as it is trying to reboot.  If we 
+	 * take the truncate lock here to service a page fault, then our 
+	 * thread could deadlock with the forced-unmount.  The forced unmount 
+	 * thread will try to reclaim the dyld_shared_cache vnode, but since it's 
+	 * marked C_DELETED, it will call ubc_setsize(0).  As a result, the unmount 
+	 * thread will think it needs to copy all of the data out of the file 
+	 * and into a VM copy object.  If we hold the cnode lock here, then that 
+	 * VM operation will not be able to proceed, because we'll set a busy page 
+	 * before attempting to grab the lock.  Note that this isn't as simple as "don't
+	 * call ubc_setsize" because doing that would just shift the problem to the
+	 * ubc_msync done before the vnode is reclaimed.
+	 *
+	 * So, if a forced unmount on this volume is in flight AND the cnode is 
+	 * marked C_DELETED, then just go ahead and do the page in without taking 
+	 * the lock (thus suspending pagein_v2 semantics temporarily).  Since it's on a file
+	 * that is not going to be available on the next mount, this seems like a 
+	 * OK solution from a correctness point of view, even though it is hacky.
+	 */
+	if (vfs_isforce(vp->v_mount)) {
+		if (cp->c_flag & C_DELETED) {
+			/* If we don't get it, then just go ahead and operate without the lock */
+			truncate_lock_held = hfs_try_trunclock(cp, HFS_SHARED_LOCK, HFS_LOCK_SKIP_IF_EXCLUSIVE);
+		}
+	}
+	else {
+		hfs_lock_truncate(cp, HFS_SHARED_LOCK, HFS_LOCK_SKIP_IF_EXCLUSIVE);
+		truncate_lock_held = TRUE;
+	}
+
+	kret = ubc_create_upl(vp, ap->a_f_offset, ap->a_size, &upl, &pl, UPL_UBC_PAGEIN | UPL_RET_ONLY_ABSENT); 
+
+	if ((kret != KERN_SUCCESS) || (upl == (upl_t) NULL)) {
+		error = EINVAL;
+		goto pagein_done;
+	}
+	ubc_upl_range_needed(upl, ap->a_pl_offset / PAGE_SIZE, 1);
+
+	upl_size = isize = ap->a_size;
+
+	/*
+	 * Scan from the back to find the last page in the UPL, so that we 
+	 * aren't looking at a UPL that may have already been freed by the
+	 * preceding aborts/completions.
+	 */ 
+	for (pg_index = ((isize) / PAGE_SIZE); pg_index > 0;) {
+		if (upl_page_present(pl, --pg_index))
+			break;
+		if (pg_index == 0) {
+			/*
+			 * no absent pages were found in the range specified
+			 * just abort the UPL to get rid of it and then we're done
+			 */
+			ubc_upl_abort_range(upl, 0, isize, UPL_ABORT_FREE_ON_EMPTY);
+			goto pagein_done;
+		}
+	}
+	/* 
+	 * initialize the offset variables before we touch the UPL.
+	 * f_offset is the position into the file, in bytes
+	 * offset is the position into the UPL, in bytes
+	 * pg_index is the pg# of the UPL we're operating on
+	 * isize is the offset into the UPL of the last page that is present. 
+	 */
+	isize = ((pg_index + 1) * PAGE_SIZE);	
+	pg_index = 0;
+	offset = 0;
+	f_offset = ap->a_f_offset;
+
+	while (isize) {
+		int  xsize;
+		int  num_of_pages;
+
+		if ( !upl_page_present(pl, pg_index)) {
+			/*
+			 * we asked for RET_ONLY_ABSENT, so it's possible
+			 * to get back empty slots in the UPL.
+			 * just skip over them
+			 */
+			f_offset += PAGE_SIZE;
+			offset   += PAGE_SIZE;
+			isize    -= PAGE_SIZE;
+			pg_index++;
+
+			continue;
+		}
+		/* 
+		 * We know that we have at least one absent page.
+		 * Now checking to see how many in a row we have
+		 */
+		num_of_pages = 1;
+		xsize = isize - PAGE_SIZE;
+
+		while (xsize) {
+			if ( !upl_page_present(pl, pg_index + num_of_pages))
+				break;
+			num_of_pages++;
+			xsize -= PAGE_SIZE;
+		}
+		xsize = num_of_pages * PAGE_SIZE;
 
 #if HFS_COMPRESSION
-	if (VNODE_IS_RSRC(vp)) {
-		/* allow pageins of the resource fork */
-	} else {
-		int compressed = hfs_file_is_compressed(VTOC(vp), 1); /* 1 == don't take the cnode lock */
-		if (compressed) {
-			error = decmpfs_pagein_compressed(ap, &compressed, VTOCMP(vp));
+		if (VNODE_IS_RSRC(vp)) {
+			/* allow pageins of the resource fork */
+		} else {
+			int compressed = hfs_file_is_compressed(VTOC(vp), 1); /* 1 == don't take the cnode lock */
+
 			if (compressed) {
-				if (error == 0) {
-					/* successful page-in, update the access time */
-					VTOC(vp)->c_touch_acctime = TRUE;
+
+				if (truncate_lock_held) {
+					/*
+					 * can't hold the truncate lock when calling into the decmpfs layer
+					 * since it calls back into this layer... even though we're only
+					 * holding the lock in shared mode, and the re-entrant path only
+					 * takes the lock shared, we can deadlock if some other thread
+					 * tries to grab the lock exclusively in between.
+					 */
+					hfs_unlock_truncate(cp, HFS_LOCK_SKIP_IF_EXCLUSIVE);
+					truncate_lock_held = FALSE;
+				}
+				ap->a_pl = upl;
+				ap->a_pl_offset = offset;
+				ap->a_f_offset = f_offset;
+				ap->a_size = xsize;
+
+				error = decmpfs_pagein_compressed(ap, &compressed, VTOCMP(vp));
+				/*
+				 * note that decmpfs_pagein_compressed can change the state of
+				 * 'compressed'... it will set it to 0 if the file is no longer
+				 * compressed once the compression lock is successfully taken
+				 * i.e. we would block on that lock while the file is being inflated
+				 */
+				if (compressed) {
+					if (error == 0) {
+						/* successful page-in, update the access time */
+						VTOC(vp)->c_touch_acctime = TRUE;
 					
-					/* compressed files are not hot file candidates */
-					if (VTOHFS(vp)->hfc_stage == HFC_RECORDING) {
-						VTOF(vp)->ff_bytesread = 0;
+						/* compressed files are not hot file candidates */
+						if (VTOHFS(vp)->hfc_stage == HFC_RECORDING) {
+							fp->ff_bytesread = 0;
+						}
+					} else if (error == EAGAIN) {
+						/*
+						 * EAGAIN indicates someone else already holds the compression lock...
+						 * to avoid deadlocking, we'll abort this range of pages with an
+						 * indication that the pagein needs to be redriven
+						 */
+			        		ubc_upl_abort_range(upl, (upl_offset_t) offset, xsize, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_RESTART);
+					} else if (error == ENOSPC) {
+
+						if (upl_size == PAGE_SIZE)
+							panic("decmpfs_pagein_compressed: couldn't ubc_upl_map a single page\n");
+
+						ubc_upl_abort_range(upl, (upl_offset_t) offset, isize, UPL_ABORT_FREE_ON_EMPTY);
+
+						ap->a_size = PAGE_SIZE;
+						ap->a_pl = NULL;
+						ap->a_pl_offset = 0;
+						ap->a_f_offset = page_needed_f_offset;
+
+						goto retry_pagein;
 					}
+					goto pagein_next_range;
+				}
+				else {
+					/* 
+					 * Set file_converted only if the file became decompressed while we were
+					 * paging in.  If it were still compressed, we would re-start the loop using the goto
+					 * in the above block.  This avoids overloading truncate_lock_held as our retry_pagein
+					 * condition below, since we could have avoided taking the truncate lock to prevent
+					 * a deadlock in the force unmount case.
+					 */
+					file_converted = TRUE;
 				}
-				return error;
 			}
-			/* otherwise the file was converted back to a regular file while we were reading it */
-		}
-	}
-#endif
-
-	error = cluster_pagein(vp, ap->a_pl, ap->a_pl_offset, ap->a_f_offset,
-	                       ap->a_size, (off_t)VTOF(vp)->ff_size, ap->a_flags);
-	/*
-	 * Keep track of blocks read.
-	 */
-	if (!vnode_isswap(vp) && VTOHFS(vp)->hfc_stage == HFC_RECORDING && error == 0) {
-		struct cnode *cp;
-		struct filefork *fp;
-		int bytesread;
-		int took_cnode_lock = 0;
-		
-		cp = VTOC(vp);
-		fp = VTOF(vp);
+			if (file_converted == TRUE) {
+				/*
+				 * the file was converted back to a regular file after we first saw it as compressed
+				 * we need to abort the upl, retake the truncate lock, recreate the UPL and start over
+				 * reset a_size so that we consider what remains of the original request
+				 * and null out a_pl and a_pl_offset.
+				 *
+				 * We should only be able to get into this block if the decmpfs_pagein_compressed 
+				 * successfully decompressed the range in question for this file.
+				 */
+				ubc_upl_abort_range(upl, (upl_offset_t) offset, isize, UPL_ABORT_FREE_ON_EMPTY);
 
-		if (ap->a_f_offset == 0 && fp->ff_size < PAGE_SIZE)
-			bytesread = fp->ff_size;
-		else
-			bytesread = ap->a_size;
+				ap->a_size = isize;
+				ap->a_pl = NULL;
+				ap->a_pl_offset = 0;
 
-		/* When ff_bytesread exceeds 32-bits, update it behind the cnode lock. */
-		if ((fp->ff_bytesread + bytesread) > 0x00000000ffffffff && cp->c_lockowner != current_thread()) {
-			hfs_lock(cp, HFS_FORCE_LOCK);
-			took_cnode_lock = 1;
+				/* Reset file_converted back to false so that we don't infinite-loop. */
+				file_converted = FALSE;
+				goto retry_pagein;
+			}
 		}
+#endif
+		error = cluster_pagein(vp, upl, offset, f_offset, xsize, (off_t)fp->ff_size, ap->a_flags);
+
 		/*
-		 * If this file hasn't been seen since the start of
-		 * the current sampling period then start over.
+		 * Keep track of blocks read.
 		 */
-		if (cp->c_atime < VTOHFS(vp)->hfc_timebase) {
-			struct timeval tv;
+		if ( !vnode_isswap(vp) && VTOHFS(vp)->hfc_stage == HFC_RECORDING && error == 0) {
+			int bytesread;
+			int took_cnode_lock = 0;
+		
+			if (ap->a_f_offset == 0 && fp->ff_size < PAGE_SIZE)
+				bytesread = fp->ff_size;
+			else
+				bytesread = xsize;
+
+			/* When ff_bytesread exceeds 32-bits, update it behind the cnode lock. */
+			if ((fp->ff_bytesread + bytesread) > 0x00000000ffffffff && cp->c_lockowner != current_thread()) {
+				hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
+				took_cnode_lock = 1;
+			}
+			/*
+			 * If this file hasn't been seen since the start of
+			 * the current sampling period then start over.
+			 */
+			if (cp->c_atime < VTOHFS(vp)->hfc_timebase) {
+				struct timeval tv;
 
-			fp->ff_bytesread = bytesread;
-			microtime(&tv);
-			cp->c_atime = tv.tv_sec;
-		} else {
-			fp->ff_bytesread += bytesread;
+				fp->ff_bytesread = bytesread;
+				microtime(&tv);
+				cp->c_atime = tv.tv_sec;
+			} else {
+				fp->ff_bytesread += bytesread;
+			}
+			cp->c_touch_acctime = TRUE;
+			if (took_cnode_lock)
+				hfs_unlock(cp);
 		}
-		cp->c_touch_acctime = TRUE;
-		if (took_cnode_lock)
-			hfs_unlock(cp);
+pagein_next_range:
+		f_offset += xsize;
+		offset   += xsize;
+		isize    -= xsize;
+		pg_index += num_of_pages;
+
+		error = 0;
+	}
+
+pagein_done:
+	if (truncate_lock_held == TRUE) {
+		/* Pass HFS_LOCK_SKIP_IF_EXCLUSIVE, matching the flag used when the lock was taken */
+		hfs_unlock_truncate(cp, HFS_LOCK_SKIP_IF_EXCLUSIVE);
 	}
+
 	return (error);
 }
 
@@ -3329,10 +4595,19 @@ hfs_vnop_pageout(struct vnop_pageout_args *ap)
 		a_pl_offset = 0;
 
 		/*
-		 * take truncate lock (shared) to guard against 
-		 * zero-fill thru fsync interfering, but only for v2 
+		 * For V2 semantics, we want to take the cnode truncate lock
+		 * shared to guard against the file size changing via zero-filling.
+		 * 
+		 * However, we have to be careful because we may be invoked 
+		 * via the ubc_msync path to write out dirty mmap'd pages
+		 * in response to a lock event on a content-protected
+		 * filesystem (e.g. to write out class A files).
+		 * As a result, we want to take the truncate lock 'SHARED' with 
+		 * the mini-recursion locktype so that we don't deadlock/panic 
+		 * because we may be already holding the truncate lock exclusive to force any other
+		 * IOs to have blocked behind us. 
 		 */
-		hfs_lock_truncate(cp, 0);
+		hfs_lock_truncate(cp, HFS_SHARED_LOCK, HFS_LOCK_SKIP_IF_EXCLUSIVE);
 
 		if (a_flags & UPL_MSYNC) {
 			request_flags = UPL_UBC_MSYNC | UPL_RET_ONLY_DIRTY;
@@ -3340,6 +4615,7 @@ hfs_vnop_pageout(struct vnop_pageout_args *ap)
 		else {
 			request_flags = UPL_UBC_PAGEOUT | UPL_RET_ONLY_DIRTY;
 		}
+		
 		kret = ubc_create_upl(vp, ap->a_f_offset, ap->a_size, &upl, &pl, request_flags); 
 
 		if ((kret != KERN_SUCCESS) || (upl == (upl_t) NULL)) {
@@ -3449,7 +4725,7 @@ hfs_vnop_pageout(struct vnop_pageout_args *ap)
 				tooklock = 0;
 
 				if (cp->c_lockowner != current_thread()) {
-					if ((retval = hfs_lock(cp, HFS_EXCLUSIVE_LOCK))) {
+					if ((retval = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT))) {
 						/*
 						 * we're in the v2 path, so we are the
 						 * owner of the UPL... we may have already
@@ -3499,7 +4775,7 @@ hfs_vnop_pageout(struct vnop_pageout_args *ap)
 			int tooklock = 0;
 
 			if (cp->c_lockowner != current_thread()) {
-				if ((retval = hfs_lock(cp, HFS_EXCLUSIVE_LOCK))) {
+				if ((retval = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT))) {
 					if (!(a_flags & UPL_NOCOMMIT)) {
 						ubc_upl_abort_range(upl,
 								    a_pl_offset,
@@ -3532,26 +4808,52 @@ hfs_vnop_pageout(struct vnop_pageout_args *ap)
 	}
 
 	/*
-	 * If data was written, update the modification time of the file.
-	 * If setuid or setgid bits are set and this process is not the 
-	 * superuser then clear the setuid and setgid bits as a precaution 
-	 * against tampering.
+	 * If data was written, update the modification time of the file
+	 * but only if it's mapped writable; we will have touched the
+	 * modification time for direct writes.
 	 */
-	if (retval == 0) {
-		cp->c_touch_modtime = TRUE;
-		cp->c_touch_chgtime = TRUE;
-		if ((cp->c_mode & (S_ISUID | S_ISGID)) &&
-		    (vfs_context_suser(ap->a_context) != 0)) {
-			hfs_lock(cp, HFS_FORCE_LOCK);
-			cp->c_mode &= ~(S_ISUID | S_ISGID);
-			hfs_unlock(cp);
+	if (retval == 0 && (ubc_is_mapped_writable(vp)
+						|| ISSET(cp->c_flag, C_MIGHT_BE_DIRTY_FROM_MAPPING))) {
+		hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
+
+		// Check again with lock
+		bool mapped_writable = ubc_is_mapped_writable(vp);
+		if (mapped_writable
+			|| ISSET(cp->c_flag, C_MIGHT_BE_DIRTY_FROM_MAPPING)) {
+			cp->c_touch_modtime = TRUE;
+			cp->c_touch_chgtime = TRUE;
+
+			/*
+			 * We only need to increment the generation counter if
+			 * it's currently mapped writable because we incremented
+			 * the counter in hfs_vnop_mnomap.
+			 */
+			if (mapped_writable)
+				hfs_incr_gencount(VTOC(vp));
+
+			/*
+			 * If setuid or setgid bits are set and this process is
+			 * not the superuser then clear the setuid and setgid bits
+			 * as a precaution against tampering.
+			 */
+			if ((cp->c_mode & (S_ISUID | S_ISGID)) &&
+				(vfs_context_suser(ap->a_context) != 0)) {
+				cp->c_mode &= ~(S_ISUID | S_ISGID);
+			}
 		}
+
+		hfs_unlock(cp);
 	}
 
 pageout_done:
 	if (is_pageoutv2) {
-		/* release truncate lock (shared) */
-		hfs_unlock_truncate(cp, 0);
+		/* 
+		 * Release the truncate lock.  Note that because 
+		 * we may have taken the lock recursively by 
+		 * being invoked via ubc_msync due to lockdown,
+		 * we should release it recursively, too.
+		 */
+		hfs_unlock_truncate(cp, HFS_LOCK_SKIP_IF_EXCLUSIVE);
 	}
 	return (retval);
 }
@@ -3643,7 +4945,6 @@ hfs_vnop_bwrite(struct vnop_bwrite_args *ap)
  *
  * During step 3 page-ins to the file get suspended.
  */
-__private_extern__
 int
 hfs_relocate(struct  vnode *vp, u_int32_t  blockHint, kauth_cred_t cred,
 	struct  proc *p)
@@ -3666,7 +4967,8 @@ hfs_relocate(struct  vnode *vp, u_int32_t  blockHint, kauth_cred_t cred,
 	enum vtype vnodetype;
 
 	vnodetype = vnode_vtype(vp);
-	if (vnodetype != VREG && vnodetype != VLNK) {
+	if (vnodetype != VREG) {
+		/* Not allowed to move symlinks. */
 		return (EPERM);
 	}
 	
@@ -3679,12 +4981,27 @@ hfs_relocate(struct  vnode *vp, u_int32_t  blockHint, kauth_cred_t cred,
 	fp = VTOF(vp);
 	if (fp->ff_unallocblocks)
 		return (EINVAL);
+
+#if CONFIG_PROTECT
+	/* 
+	 * <rdar://problem/9118426>
+	 * Disable HFS file relocation on content-protected filesystems
+	 */
+	if (cp_fs_protected (hfsmp->hfs_mp)) {
+		return EINVAL;
+	}
+#endif
+	/* If it's an SSD, also disable HFS relocation */
+	if (hfsmp->hfs_flags & HFS_SSD) {
+		return EINVAL;
+	}
+
+
 	blksize = hfsmp->blockSize;
 	if (blockHint == 0)
 		blockHint = hfsmp->nextAllocation;
 
-	if ((fp->ff_size > 0x7fffffff) ||
-	    ((fp->ff_size > blksize) && vnodetype == VLNK)) {
+	if (fp->ff_size > 0x7fffffff) {
 		return (EFBIG);
 	}
 
@@ -3701,15 +5018,15 @@ hfs_relocate(struct  vnode *vp, u_int32_t  blockHint, kauth_cred_t cred,
 
 	if (!vnode_issystem(vp) && (vnodetype != VLNK)) {
 		hfs_unlock(cp);
-		hfs_lock_truncate(cp, TRUE);
+		hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
 		/* Force lock since callers expects lock to be held. */
-		if ((retval = hfs_lock(cp, HFS_FORCE_LOCK))) {
-			hfs_unlock_truncate(cp, TRUE);
+		if ((retval = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS))) {
+			hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
 			return (retval);
 		}
 		/* No need to continue if file was removed. */
 		if (cp->c_flag & C_NOEXISTS) {
-			hfs_unlock_truncate(cp, TRUE);
+			hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
 			return (ENOENT);
 		}
 		took_trunc_lock = 1;
@@ -3724,7 +5041,7 @@ hfs_relocate(struct  vnode *vp, u_int32_t  blockHint, kauth_cred_t cred,
 
 	if (hfs_start_transaction(hfsmp) != 0) {
 		if (took_trunc_lock)
-			hfs_unlock_truncate(cp, TRUE);
+			hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
 	    return (EINVAL);
 	}
 	started_tr = 1;
@@ -3749,10 +5066,10 @@ hfs_relocate(struct  vnode *vp, u_int32_t  blockHint, kauth_cred_t cred,
 	nextallocsave = hfsmp->nextAllocation;
 	retval = ExtendFileC(hfsmp, (FCB*)fp, growsize, blockHint, eflags, &newbytes);
 	if (eflags & kEFMetadataMask) {
-		HFS_MOUNT_LOCK(hfsmp, TRUE);
+		hfs_lock_mount(hfsmp);
 		HFS_UPDATE_NEXT_ALLOCATION(hfsmp, nextallocsave);
 		MarkVCBDirty(hfsmp);
-		HFS_MOUNT_UNLOCK(hfsmp, TRUE);
+		hfs_unlock_mount(hfsmp);
 	}
 
 	retval = MacToVFSError(retval);
@@ -3762,7 +5079,7 @@ hfs_relocate(struct  vnode *vp, u_int32_t  blockHint, kauth_cred_t cred,
 			retval = ENOSPC;
 			goto restore;
 		} else if (fp->ff_blocks < (headblks + datablks)) {
-			printf("hfs_relocate: allocation failed");
+			printf("hfs_relocate: allocation failed id=%u, vol=%s\n", cp->c_cnid, hfsmp->vcbVN);
 			retval = ENOSPC;
 			goto restore;
 		}
@@ -3813,7 +5130,7 @@ hfs_relocate(struct  vnode *vp, u_int32_t  blockHint, kauth_cred_t cred,
 	 */
 
 	if (vnodetype == VLNK)
-		retval = hfs_clonelink(vp, blksize, cred, p);
+		retval = EPERM;
 	else if (vnode_issystem(vp))
 		retval = hfs_clonesysfile(vp, headblks, datablks, blksize, cred, p);
 	else
@@ -3844,7 +5161,7 @@ hfs_relocate(struct  vnode *vp, u_int32_t  blockHint, kauth_cred_t cred,
 		goto restore;
 out:
 	if (took_trunc_lock)
-		hfs_unlock_truncate(cp, TRUE);
+		hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
 
 	if (lockflags) {
 		hfs_systemfile_unlock(hfsmp, lockflags);
@@ -3870,7 +5187,7 @@ exit:
 restore:
 	if (fp->ff_blocks == headblks) {
 		if (took_trunc_lock)
-			hfs_unlock_truncate(cp, TRUE);
+			hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
 		goto exit;
 	}
 	/*
@@ -3883,50 +5200,18 @@ restore:
 		lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
 	}
 
-	(void) TruncateFileC(hfsmp, (FCB*)fp, fp->ff_size, false);
+	(void) TruncateFileC(hfsmp, (FCB*)fp, fp->ff_size, 0, FORK_IS_RSRC(fp), 
+						 FTOC(fp)->c_fileid, false);
 
 	hfs_systemfile_unlock(hfsmp, lockflags);
 	lockflags = 0;
 
 	if (took_trunc_lock)
-		hfs_unlock_truncate(cp, TRUE);
+		hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
 	goto exit;
 }
 
 
-/*
- * Clone a symlink.
- *
- */
-static int
-hfs_clonelink(struct vnode *vp, int blksize, kauth_cred_t cred, __unused struct proc *p)
-{
-	struct buf *head_bp = NULL;
-	struct buf *tail_bp = NULL;
-	int error;
-
-
-	error = (int)buf_meta_bread(vp, (daddr64_t)0, blksize, cred, &head_bp);
-	if (error)
-		goto out;
-
-	tail_bp = buf_getblk(vp, (daddr64_t)1, blksize, 0, 0, BLK_META);
-	if (tail_bp == NULL) {
-		error = EIO;
-		goto out;
-	}
-	bcopy((char *)buf_dataptr(head_bp), (char *)buf_dataptr(tail_bp), blksize);
-	error = (int)buf_bwrite(tail_bp);
-out:
-	if (head_bp) {
-	        buf_markinvalid(head_bp);
-		buf_brelse(head_bp);
-	}	
-	(void) buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0);
-
-	return (error);
-}
-
 /*
  * Clone a file's data within the file.
  *
@@ -3948,10 +5233,19 @@ hfs_clonefile(struct vnode *vp, int blkstart, int blkcnt, int blksize)
 	iosize = bufsize = MIN(copysize, 128 * 1024);
 	offset = 0;
 
+	hfs_unlock(VTOC(vp));
+
+#if CONFIG_PROTECT
+	if ((error = cp_handle_vnop(vp, CP_WRITE_ACCESS, 0)) != 0) {
+		hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);	
+		return (error);
+	}
+#endif /* CONFIG_PROTECT */
+
 	if (kmem_alloc(kernel_map, (vm_offset_t *)&bufp, bufsize)) {
+		hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
 		return (ENOMEM);
-	}	
-	hfs_unlock(VTOC(vp));
+	}
 
 	auio = uio_create(1, 0, UIO_SYSSPACE, UIO_READ);
 
@@ -3967,7 +5261,7 @@ hfs_clonefile(struct vnode *vp, int blkstart, int blkcnt, int blksize)
 			break;
 		}
 		if (uio_resid(auio) != 0) {
-			printf("hfs_clonefile: cluster_read: uio_resid = %lld\n", uio_resid(auio));
+			printf("hfs_clonefile: cluster_read: uio_resid = %lld\n", (int64_t)uio_resid(auio));
 			error = EIO;		
 			break;
 		}
@@ -4004,7 +5298,7 @@ hfs_clonefile(struct vnode *vp, int blkstart, int blkcnt, int blksize)
 		ubc_msync(vp, writebase, writebase + offset, NULL, UBC_INVALIDATE | UBC_PUSHDIRTY);
 	} else {
 		/*
-		 * No need to call ubc_sync_range or hfs_invalbuf
+		 * No need to call ubc_msync or hfs_invalbuf
 		 * since the file was copied using IO_NOCACHE and
 		 * the copy was done starting and ending on a page
 		 * boundary in the file.
@@ -4012,7 +5306,7 @@ hfs_clonefile(struct vnode *vp, int blkstart, int blkcnt, int blksize)
 	}
 	kmem_free(kernel_map, (vm_offset_t)bufp, bufsize);
 
-	hfs_lock(VTOC(vp), HFS_FORCE_LOCK);	
+	hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);	
 	return (error);
 }