git.saurik.com Git - apple/xnu.git/blobdiff - bsd/hfs/hfs_readwrite.c (xnu-1699.32.7)

diff --git a/bsd/hfs/hfs_readwrite.c b/bsd/hfs/hfs_readwrite.c
index b9c8bf9127e2ccb05f9d6916d76e7da59547c147..63acbac05944fc5edbf0d3f27a2d6b9974393477 100644
--- a/bsd/hfs/hfs_readwrite.c
+++ b/bsd/hfs/hfs_readwrite.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2000-2010 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
 #include <sys/proc.h>
 #include <sys/kauth.h>
 #include <sys/vnode.h>
+#include <sys/vnode_internal.h>
 #include <sys/uio.h>
 #include <sys/vfs_context.h>
+#include <sys/fsevents.h>
+#include <kern/kalloc.h>
 #include <sys/disk.h>
 #include <sys/sysctl.h>
+#include <sys/fsctl.h>
 
 #include <miscfs/specfs/specdev.h>
 
 #include <sys/ubc.h>
+#include <sys/ubc_internal.h>
+
 #include <vm/vm_pageout.h>
 #include <vm/vm_kern.h>
 
 #include <sys/kdebug.h>
 
 #include       "hfs.h"
+#include       "hfs_attrlist.h"
 #include       "hfs_endian.h"
-#include  "hfs_fsctl.h"
+#include       "hfs_fsctl.h"
 #include       "hfs_quota.h"
 #include       "hfscommon/headers/FileMgrInternal.h"
 #include       "hfscommon/headers/BTreesInternal.h"
 #include       "hfs_cnode.h"
 #include       "hfs_dbg.h"
 
-extern int overflow_extents(struct filefork *fp);
-
 #define can_cluster(size) ((((size & (4096-1))) == 0) && (size <= (MAXPHYSIO/2)))
 
 enum {
        MAXHFSFILESIZE = 0x7FFFFFFF             /* this needs to go in the mount structure */
 };
 
-extern u_int32_t GetLogicalBlockSize(struct vnode *vp);
-
-extern int  hfs_setextendedsecurity(struct hfsmount *, int);
-
+/* from bsd/hfs/hfs_vfsops.c */
+extern int hfs_vfs_vget (struct mount *mp, ino64_t ino, struct vnode **vpp, vfs_context_t context);
 
 static int  hfs_clonelink(struct vnode *, int, kauth_cred_t, struct proc *);
 static int  hfs_clonefile(struct vnode *, int, int, int);
 static int  hfs_clonesysfile(struct vnode *, int, int, int, kauth_cred_t, struct proc *);
+static int  hfs_minorupdate(struct vnode *vp);
+static int  do_hfs_truncate(struct vnode *vp, off_t length, int flags, int skip, vfs_context_t context);
 
 
 int flush_cache_on_write = 0;
-SYSCTL_INT (_kern, OID_AUTO, flush_cache_on_write, CTLFLAG_RW, &flush_cache_on_write, 0, "always flush the drive cache on writes to uncached files");
-
-
-/*****************************************************************************
-*
-*      I/O Operations on vnodes
-*
-*****************************************************************************/
-int  hfs_vnop_read(struct vnop_read_args *);
-int  hfs_vnop_write(struct vnop_write_args *);
-int  hfs_vnop_ioctl(struct vnop_ioctl_args *);
-int  hfs_vnop_select(struct vnop_select_args *);
-int  hfs_vnop_blktooff(struct vnop_blktooff_args *);
-int  hfs_vnop_offtoblk(struct vnop_offtoblk_args *);
-int  hfs_vnop_blockmap(struct vnop_blockmap_args *);
-int  hfs_vnop_strategy(struct vnop_strategy_args *);
-int  hfs_vnop_allocate(struct vnop_allocate_args *);
-int  hfs_vnop_pagein(struct vnop_pagein_args *);
-int  hfs_vnop_pageout(struct vnop_pageout_args *);
-int  hfs_vnop_bwrite(struct vnop_bwrite_args *);
-
+SYSCTL_INT (_kern, OID_AUTO, flush_cache_on_write, CTLFLAG_RW | CTLFLAG_LOCKED, &flush_cache_on_write, 0, "always flush the drive cache on writes to uncached files");
 
 /*
  * Read data from a file.
@@ -123,7 +108,7 @@ hfs_vnop_read(struct vnop_read_args *ap)
        off_t start_resid = uio_resid(uio);
        off_t offset = uio_offset(uio);
        int retval = 0;
-
+       int took_truncate_lock = 0;
 
        /* Preflight checks */
        if (!vnode_isreg(vp)) {
@@ -137,13 +122,56 @@ hfs_vnop_read(struct vnop_read_args *ap)
                return (0);             /* Nothing left to do */
        if (offset < 0)
                return (EINVAL);        /* cant read from a negative offset */
+       
+#if HFS_COMPRESSION
+       if (VNODE_IS_RSRC(vp)) {
+               if (hfs_hides_rsrc(ap->a_context, VTOC(vp), 1)) { /* 1 == don't take the cnode lock */
+                       return 0;
+               }
+               /* otherwise read the resource fork normally */
+       } else {
+               int compressed = hfs_file_is_compressed(VTOC(vp), 1); /* 1 == don't take the cnode lock */
+               if (compressed) {
+                       retval = decmpfs_read_compressed(ap, &compressed, VTOCMP(vp));
+                       if (compressed) {
+                               if (retval == 0) {
+                                       /* successful read, update the access time */
+                                       VTOC(vp)->c_touch_acctime = TRUE;
+                                       
+                                       /* compressed files are not hot file candidates */
+                                       if (VTOHFS(vp)->hfc_stage == HFC_RECORDING) {
+                                               VTOF(vp)->ff_bytesread = 0;
+                                       }
+                               }
+                               return retval;
+                       }
+                       /* otherwise the file was converted back to a regular file while we were reading it */
+                       retval = 0;
+               } else if ((VTOC(vp)->c_flags & UF_COMPRESSED)) {
+                       int error;
+                       
+                       error = check_for_dataless_file(vp, NAMESPACE_HANDLER_READ_OP);
+                       if (error) {
+                               return error;
+                       }
+
+               }
+       }
+#endif /* HFS_COMPRESSION */
 
        cp = VTOC(vp);
        fp = VTOF(vp);
        hfsmp = VTOHFS(vp);
 
+#if CONFIG_PROTECT
+       if ((retval = cp_handle_vnop (cp, CP_READ_ACCESS)) != 0) {
+               goto exit;
+       }
+#endif
+
        /* Protect against a size change. */
-       hfs_lock_truncate(cp, 0);
+       hfs_lock_truncate(cp, HFS_SHARED_LOCK);
+       took_truncate_lock = 1;
 
        filesize = fp->ff_size;
        filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize;
@@ -158,7 +186,7 @@ hfs_vnop_read(struct vnop_read_args *ap)
        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 12)) | DBG_FUNC_START,
                (int)uio_offset(uio), uio_resid(uio), (int)filesize, (int)filebytes, 0);
 
-       retval = cluster_read(vp, uio, filesize, 0);
+       retval = cluster_read(vp, uio, filesize, ap->a_ioflag);
 
        cp->c_touch_acctime = TRUE;
 
@@ -168,7 +196,7 @@ hfs_vnop_read(struct vnop_read_args *ap)
        /*
         * Keep track blocks read
         */
-       if (VTOHFS(vp)->hfc_stage == HFC_RECORDING && retval == 0) {
+       if (hfsmp->hfc_stage == HFC_RECORDING && retval == 0) {
                int took_cnode_lock = 0;
                off_t bytesread;
 
@@ -183,7 +211,7 @@ hfs_vnop_read(struct vnop_read_args *ap)
                 * If this file hasn't been seen since the start of
                 * the current sampling period then start over.
                 */
-               if (cp->c_atime < VTOHFS(vp)->hfc_timebase) {
+               if (cp->c_atime < hfsmp->hfc_timebase) {
                        struct timeval tv;
 
                        fp->ff_bytesread = bytesread;
@@ -196,7 +224,10 @@ hfs_vnop_read(struct vnop_read_args *ap)
                        hfs_unlock(cp);
        }
 exit:
-       hfs_unlock_truncate(cp);
+       if (took_truncate_lock) {
+               hfs_unlock_truncate(cp, 0);
+       }
+
        return (retval);
 }
 
@@ -214,16 +245,52 @@ hfs_vnop_write(struct vnop_write_args *ap)
        kauth_cred_t cred = NULL;
        off_t origFileSize;
        off_t writelimit;
-       off_t bytesToAdd;
+       off_t bytesToAdd = 0;
        off_t actualBytesAdded;
        off_t filebytes;
        off_t offset;
-       size_t resid;
+       ssize_t resid;
        int eflags;
        int ioflag = ap->a_ioflag;
        int retval = 0;
        int lockflags;
        int cnode_locked = 0;
+       int partialwrite = 0;
+       int do_snapshot = 1;
+       time_t orig_ctime=VTOC(vp)->c_ctime;
+       int took_truncate_lock = 0;
+       struct rl_entry *invalid_range;
+
+#if HFS_COMPRESSION
+       if ( hfs_file_is_compressed(VTOC(vp), 1) ) { /* 1 == don't take the cnode lock */
+               int state = decmpfs_cnode_get_vnode_state(VTOCMP(vp));
+               switch(state) {
+                       case FILE_IS_COMPRESSED:
+                               return EACCES;
+                       case FILE_IS_CONVERTING:
+                               /* if FILE_IS_CONVERTING, we allow writes but do not
+                                  bother with snapshots or else we will deadlock.
+                               */
+                               do_snapshot = 0;
+                               break;
+                       default:
+                               printf("invalid state %d for compressed file\n", state);
+                               /* fall through */
+               }
+       } else if ((VTOC(vp)->c_flags & UF_COMPRESSED)) {
+               int error;
+               
+               error = check_for_dataless_file(vp, NAMESPACE_HANDLER_WRITE_OP);
+               if (error != 0) {
+                       return error;
+               }
+       }
+
+       if (do_snapshot) {
+               check_for_tracked_file(vp, orig_ctime, NAMESPACE_HANDLER_WRITE_OP, uio);
+       }
+
+#endif
 
        // LP64todo - fix this! uio_resid may be 64-bit value
        resid = uio_resid(uio);
@@ -236,32 +303,18 @@ hfs_vnop_write(struct vnop_write_args *ap)
        if (!vnode_isreg(vp))
                return (EPERM);  /* Can only write regular files */
 
-       /* Protect against a size change. */
-       hfs_lock_truncate(VTOC(vp), TRUE);
-
-       if ( (retval = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK))) {
-               hfs_unlock_truncate(VTOC(vp));
-               return (retval);
-       }
-       cnode_locked = 1;
        cp = VTOC(vp);
        fp = VTOF(vp);
        hfsmp = VTOHFS(vp);
-       filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize;
 
-       if (ioflag & IO_APPEND) {
-               uio_setoffset(uio, fp->ff_size);
-               offset = fp->ff_size;
-       }
-       if ((cp->c_flags & APPEND) && offset != fp->ff_size) {
-               retval = EPERM;
+#if CONFIG_PROTECT
+       if ((retval = cp_handle_vnop (cp, CP_WRITE_ACCESS)) != 0) {
                goto exit;
        }
+#endif
 
-       origFileSize = fp->ff_size;
        eflags = kEFDeferMask;  /* defer file block allocations */
-
-#ifdef HFS_SPARSE_DEV
+#if HFS_SPARSE_DEV
        /* 
         * When the underlying device is sparse and space
         * is low (< 8MB), stop doing delayed allocations
@@ -274,19 +327,118 @@ hfs_vnop_write(struct vnop_write_args *ap)
        }
 #endif /* HFS_SPARSE_DEV */
 
-       KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 0)) | DBG_FUNC_START,
-               (int)offset, uio_resid(uio), (int)fp->ff_size, (int)filebytes, 0);
+again:
+       /* Protect against a size change. */
+       /*
+        * Protect against a size change.
+        *
+        * Note: If took_truncate_lock is true, then we previously got the lock shared
+        * but needed to upgrade to exclusive.  So try getting it exclusive from the
+        * start.
+        */
+       if (ioflag & IO_APPEND || took_truncate_lock) {
+               hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK);
+       }       
+       else {
+               hfs_lock_truncate(cp, HFS_SHARED_LOCK);
+       }
+       took_truncate_lock = 1;
 
-       /* Now test if we need to extend the file */
-       /* Doing so will adjust the filebytes for us */
+       /* Update UIO */
+       if (ioflag & IO_APPEND) {
+               uio_setoffset(uio, fp->ff_size);
+               offset = fp->ff_size;
+       }
+       if ((cp->c_flags & APPEND) && offset != fp->ff_size) {
+               retval = EPERM;
+               goto exit;
+       }
 
+       origFileSize = fp->ff_size;
        writelimit = offset + resid;
-       if (writelimit <= filebytes)
+       filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize;
+
+       /*
+        * We may need an exclusive truncate lock for several reasons, all
+        * of which are because we may be writing to a (portion of a) block
+        * for the first time, and we need to make sure no readers see the
+        * prior, uninitialized contents of the block.  The cases are:
+        *
+        * 1. We have unallocated (delayed allocation) blocks.  We may be
+        *    allocating new blocks to the file and writing to them.
+        *    (A more precise check would be whether the range we're writing
+        *    to contains delayed allocation blocks.)
+        * 2. We need to extend the file.  The bytes between the old EOF
+        *    and the new EOF are not yet initialized.  This is important
+        *    even if we're not allocating new blocks to the file.  If the
+        *    old EOF and new EOF are in the same block, we still need to
+        *    protect that range of bytes until they are written for the
+        *    first time.
+        * 3. The write overlaps some invalid ranges (delayed zero fill; that
+        *    part of the file has been allocated, but not yet written).
+        *
+        * If we had a shared lock with the above cases, we need to try to upgrade
+        * to an exclusive lock.  If the upgrade fails, we will lose the shared
+        * lock, and will need to take the truncate lock again; the took_truncate_lock
+        * flag will still be set, causing us to try for an exclusive lock next time.
+        *
+        * NOTE: Testing for #3 (delayed zero fill) needs to be done while the cnode
+        * lock is held, since it protects the range lists.
+        */
+       if ((cp->c_truncatelockowner == HFS_SHARED_OWNER) &&
+           ((fp->ff_unallocblocks != 0) ||
+            (writelimit > origFileSize))) {
+               if (lck_rw_lock_shared_to_exclusive(&cp->c_truncatelock) == FALSE) {
+                       /*
+                        * Lock upgrade failed and we lost our shared lock, try again.
+                        * Note: we do not set took_truncate_lock=0 here.  Leaving it
+                        * set to 1 will cause us to try to get the lock exclusive.
+                        */
+                       goto again;
+               } 
+               else {
+                       /* Store the owner in the c_truncatelockowner field if we successfully upgrade */
+                       cp->c_truncatelockowner = current_thread();  
+               }
+       }
+
+       if ( (retval = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK))) {
+               goto exit;
+       }
+       cnode_locked = 1;
+       
+       /*
+        * Now that we have the cnode lock, see if there are delayed zero fill ranges
+        * overlapping our write.  If so, we need the truncate lock exclusive (see above).
+        */
+       if ((cp->c_truncatelockowner == HFS_SHARED_OWNER) &&
+           (rl_scan(&fp->ff_invalidranges, offset, writelimit-1, &invalid_range) != RL_NOOVERLAP)) {
+               /*
+                * When testing, it appeared that calling lck_rw_lock_shared_to_exclusive() causes
+                * a deadlock, rather than simply returning failure.  (That is, it apparently does
+                * not behave like a "try_lock").  Since this condition is rare, just drop the
+                * cnode lock and try again.  Since took_truncate_lock is set, we will
+                * automatically take the truncate lock exclusive.
+                */
+               hfs_unlock(cp);
+               cnode_locked = 0;
+               hfs_unlock_truncate(cp, 0);
+               goto again;
+       }
+       
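A minimal userspace sketch of the take-shared, upgrade-or-retry locking pattern described in the comment above. pthread_rwlock_t has no shared-to-exclusive upgrade primitive, so the sketch always falls back to the path the kernel code takes when lck_rw_lock_shared_to_exclusive() fails: drop the lock and re-acquire it exclusive, remembering (like took_truncate_lock) that the next attempt must be exclusive. All names below are hypothetical, not xnu APIs.

    #include <pthread.h>
    #include <stdbool.h>

    static pthread_rwlock_t truncate_lock = PTHREAD_RWLOCK_INITIALIZER;

    /* Illustrative sketch only; truncate_lock, need_exclusive, do_write are hypothetical. */
    static void locked_write(bool (*need_exclusive)(void), void (*do_write)(void))
    {
            bool want_exclusive = false;        /* analogue of took_truncate_lock forcing exclusive on retry */

    again:
            if (want_exclusive)
                    pthread_rwlock_wrlock(&truncate_lock);
            else
                    pthread_rwlock_rdlock(&truncate_lock);

            if (!want_exclusive && need_exclusive()) {
                    /* cannot upgrade in place: drop the shared lock and retry exclusive */
                    pthread_rwlock_unlock(&truncate_lock);
                    want_exclusive = true;
                    goto again;
            }

            do_write();
            pthread_rwlock_unlock(&truncate_lock);
    }
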
+       KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 0)) | DBG_FUNC_START,
+                    (int)offset, uio_resid(uio), (int)fp->ff_size,
+                    (int)filebytes, 0);
+
+       /* Check if we do not need to extend the file */
+       if (writelimit <= filebytes) {
                goto sizeok;
+       }
 
        cred = vfs_context_ucred(ap->a_context);
-#if QUOTA
        bytesToAdd = writelimit - filebytes;
+
+#if QUOTA
        retval = hfs_chkdq(cp, (int64_t)(roundup(bytesToAdd, hfsmp->blockSize)), 
                           cred, 0);
        if (retval)
@@ -330,6 +482,17 @@ hfs_vnop_write(struct vnop_write_args *ap)
        (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
        (void) hfs_end_transaction(hfsmp);
 
+       /*
+        * If we didn't grow the file enough try a partial write.
+        * POSIX expects this behavior.
+        */
+       if ((retval == ENOSPC) && (filebytes > offset)) {
+               retval = 0;
+               partialwrite = 1;
+               uio_setresid(uio, (uio_resid(uio) - bytesToAdd));
+               resid -= bytesToAdd;
+               writelimit = filebytes;
+       }
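The ENOSPC fallback above is what makes a write that runs the volume out of space complete partially rather than fail outright; from userspace it surfaces as a short return count from write(2). A small sketch, assuming nothing beyond the standard write(2) interface, of how a caller copes with that:

    #include <errno.h>
    #include <sys/types.h>
    #include <unistd.h>

    /* Illustrative sketch: loop on short writes; the helper name is hypothetical. */
    static ssize_t write_all(int fd, const char *buf, size_t len)
    {
            size_t done = 0;

            while (done < len) {
                    ssize_t n = write(fd, buf + done, len - done);
                    if (n > 0)
                            done += (size_t)n;                 /* partial write: keep going */
                    else if (n == -1 && errno == EINTR)
                            continue;                          /* interrupted: retry */
                    else
                            return done ? (ssize_t)done : -1;  /* e.g. ENOSPC with nothing written */
            }
            return (ssize_t)done;
    }
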
 sizeok:
        if (retval == E_NONE) {
                off_t filesize;
@@ -339,14 +502,13 @@ sizeok:
                off_t inval_end;
                off_t io_start;
                int lflag;
-               struct rl_entry *invalid_range;
 
                if (writelimit > fp->ff_size)
                        filesize = writelimit;
                else
                        filesize = fp->ff_size;
 
-               lflag = (ioflag & IO_SYNC);
+               lflag = ioflag & ~(IO_TAILZEROFILL | IO_HEADZEROFILL | IO_NOZEROVALID | IO_NOZERODIRTY);
 
                if (offset <= fp->ff_size) {
                        zero_off = offset & ~PAGE_MASK_64;
@@ -460,30 +622,67 @@ sizeok:
 
                hfs_unlock(cp);
                cnode_locked = 0;
+               
+               /*
+                * We need to tell UBC the fork's new size BEFORE calling
+                * cluster_write, in case any of the new pages need to be
+                * paged out before cluster_write completes (which does happen
+                * in embedded systems due to extreme memory pressure).
+                * Similarly, we need to tell hfs_vnop_pageout what the new EOF
+                * will be, so that it can pass that on to cluster_pageout, and
+                * allow those pageouts.
+                *
+                * We don't update ff_size yet since we don't want pageins to
+                * be able to see uninitialized data between the old and new
+                * EOF, until cluster_write has completed and initialized that
+                * part of the file.
+                *
+                * The vnode pager relies on the file size last given to UBC via
+                * ubc_setsize.  hfs_vnop_pageout relies on fp->ff_new_size or
+                * ff_size (whichever is larger).  NOTE: ff_new_size is always
+                * zero, unless we are extending the file via write.
+                */
+               if (filesize > fp->ff_size) {
+                       fp->ff_new_size = filesize;
+                       ubc_setsize(vp, filesize);
+               }
                retval = cluster_write(vp, uio, fp->ff_size, filesize, zero_off,
                                tail_off, lflag | IO_NOZERODIRTY);
-               offset = uio_offset(uio);
-               if (offset > fp->ff_size) {
-                       fp->ff_size = offset;
-
-                       ubc_setsize(vp, fp->ff_size);       /* XXX check errors */
+               if (retval) {
+                       fp->ff_new_size = 0;    /* no longer extending; use ff_size */
+                       if (filesize > origFileSize) {
+                               ubc_setsize(vp, origFileSize);
+                       }
+                       goto ioerr_exit;
+               }
+               
+               if (filesize > origFileSize) {
+                       fp->ff_size = filesize;
+                       
                        /* Files that are changing size are not hot file candidates. */
-                       if (hfsmp->hfc_stage == HFC_RECORDING)
+                       if (hfsmp->hfc_stage == HFC_RECORDING) {
                                fp->ff_bytesread = 0;
+                       }
                }
+               fp->ff_new_size = 0;    /* ff_size now has the correct size */
+               
+               /* If we wrote some bytes, then touch the change and mod times */
                if (resid > uio_resid(uio)) {
                        cp->c_touch_chgtime = TRUE;
                        cp->c_touch_modtime = TRUE;
                }
        }
+       if (partialwrite) {
+               uio_setresid(uio, (uio_resid(uio) + bytesToAdd));
+               resid += bytesToAdd;
+       }
 
-       // XXXdbg - testing for vivek and paul lambert
+       // XXXdbg - see radar 4871353 for more info
        {
            if (flush_cache_on_write && ((ioflag & IO_NOCACHE) || vnode_isnocache(vp))) {
                VNOP_IOCTL(hfsmp->hfs_devvp, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, NULL);
            }
        }
-       HFS_KNOTE(vp, NOTE_WRITE);
 
 ioerr_exit:
        /*
@@ -508,7 +707,7 @@ ioerr_exit:
                                cnode_locked = 1;
                        }
                        (void)hfs_truncate(vp, origFileSize, ioflag & IO_SYNC,
-                                          0, ap->a_context);
+                                          0, 0, ap->a_context);
                        // LP64todo - fix this!  resid needs to by user_ssize_t
                        uio_setoffset(uio, (uio_offset(uio) - (resid - uio_resid(uio))));
                        uio_setresid(uio, resid);
@@ -529,31 +728,25 @@ ioerr_exit:
 exit:
        if (cnode_locked)
                hfs_unlock(cp);
-       hfs_unlock_truncate(cp);
+       
+       if (took_truncate_lock) {
+               hfs_unlock_truncate(cp, 0);
+       }
        return (retval);
 }
 
 /* support for the "bulk-access" fcntl */
 
-#define CACHE_ELEMS 64
 #define CACHE_LEVELS 16
+#define NUM_CACHE_ENTRIES (64*16)
 #define PARENT_IDS_FLAG 0x100
 
-/* from hfs_attrlist.c */
-extern unsigned long DerivePermissionSummary(uid_t obj_uid, gid_t obj_gid,
-                       mode_t obj_mode, struct mount *mp,
-                       kauth_cred_t cred, struct proc *p);
-
-/* from vfs/vfs_fsevents.c */
-extern char *get_pathbuff(void);
-extern void release_pathbuff(char *buff);
-
 struct access_cache {
        int numcached;
        int cachehits; /* these two for statistics gathering */
        int lookups;
        unsigned int *acache;
-       Boolean *haveaccess;
+       unsigned char *haveaccess;
 };
 
 struct access_t {
@@ -564,80 +757,142 @@ struct access_t {
        int       *file_ids;        /* IN: array of file ids */
        gid_t     *groups;          /* IN: array of groups */
        short     *access;          /* OUT: access info for each file (0 for 'has access') */
+} __attribute__((unavailable)); // this structure is for reference purposes only
+
+struct user32_access_t {
+       uid_t     uid;              /* IN: effective user id */
+       short     flags;            /* IN: access requested (i.e. R_OK) */
+       short     num_groups;       /* IN: number of groups user belongs to */
+       int       num_files;        /* IN: number of files to process */
+       user32_addr_t      file_ids;        /* IN: array of file ids */
+       user32_addr_t      groups;          /* IN: array of groups */
+       user32_addr_t      access;          /* OUT: access info for each file (0 for 'has access') */
 };
 
-struct user_access_t {
+struct user64_access_t {
        uid_t           uid;                    /* IN: effective user id */
        short           flags;                  /* IN: access requested (i.e. R_OK) */
        short           num_groups;             /* IN: number of groups user belongs to */
-       int                     num_files;              /* IN: number of files to process */
-       user_addr_t     file_ids;               /* IN: array of file ids */
-       user_addr_t     groups;                 /* IN: array of groups */
-       user_addr_t     access;                 /* OUT: access info for each file (0 for 'has access') */
+       int             num_files;              /* IN: number of files to process */
+       user64_addr_t   file_ids;               /* IN: array of file ids */
+       user64_addr_t   groups;                 /* IN: array of groups */
+       user64_addr_t   access;                 /* OUT: access info for each file (0 for 'has access') */
+};
+
+
+// these are the "extended" versions of the above structures
+// note that it is crucial that they be different sized than
+// the regular version
+struct ext_access_t {
+       uint32_t   flags;           /* IN: access requested (i.e. R_OK) */
+       uint32_t   num_files;       /* IN: number of files to process */
+       uint32_t   map_size;        /* IN: size of the bit map */
+       uint32_t  *file_ids;        /* IN: Array of file ids */
+       char      *bitmap;          /* OUT: hash-bitmap of interesting directory ids */
+       short     *access;          /* OUT: access info for each file (0 for 'has access') */
+       uint32_t   num_parents;   /* future use */
+       cnid_t      *parents;   /* future use */
+} __attribute__((unavailable)); // this structure is for reference purposes only
+
+struct user32_ext_access_t {
+       uint32_t   flags;           /* IN: access requested (i.e. R_OK) */
+       uint32_t   num_files;       /* IN: number of files to process */
+       uint32_t   map_size;        /* IN: size of the bit map */
+       user32_addr_t  file_ids;        /* IN: Array of file ids */
+       user32_addr_t     bitmap;          /* OUT: hash-bitmap of interesting directory ids */
+       user32_addr_t access;          /* OUT: access info for each file (0 for 'has access') */
+       uint32_t   num_parents;   /* future use */
+       user32_addr_t parents;   /* future use */
 };
 
+struct user64_ext_access_t {
+       uint32_t      flags;        /* IN: access requested (i.e. R_OK) */
+       uint32_t      num_files;    /* IN: number of files to process */
+       uint32_t      map_size;     /* IN: size of the bit map */
+       user64_addr_t   file_ids;     /* IN: array of file ids */
+       user64_addr_t   bitmap;       /* IN: array of groups */
+       user64_addr_t   access;       /* OUT: access info for each file (0 for 'has access') */
+       uint32_t      num_parents;/* future use */
+       user64_addr_t   parents;/* future use */
+};
+
+
 /*
  * Perform a binary search for the given parent_id. Return value is 
- * found/not found boolean, and indexp will be the index of the item 
- * or the index at which to insert the item if it's not found.
+ * the index if there is a match.  If no_match_indexp is non-NULL it
+ * will be assigned with the index to insert the item (even if it was
+ * not found).
  */
-static int
-lookup_bucket(struct access_cache *cache, int *indexp, cnid_t parent_id)
+static int cache_binSearch(cnid_t *array, unsigned int hi, cnid_t parent_id, int *no_match_indexp)
 {
-       unsigned int lo, hi;
-       int index, matches = 0;
+    int index=-1;
+    unsigned int lo=0;
        
-       if (cache->numcached == 0) {
-               *indexp = 0;
-               return 0; // table is empty, so insert at index=0 and report no match
+    do {
+       unsigned int mid = ((hi - lo)/2) + lo;
+       unsigned int this_id = array[mid];
+               
+       if (parent_id == this_id) {
+           hi = mid;
+           break;
        }
-       
-       if (cache->numcached > CACHE_ELEMS) {
-               /*printf("EGAD! numcached is %d... cut our losses and trim to %d\n",
-                 cache->numcached, CACHE_ELEMS);*/
-               cache->numcached = CACHE_ELEMS;
+               
+       if (parent_id < this_id) {
+           hi = mid;
+           continue;
+       }
+               
+       if (parent_id > this_id) {
+           lo = mid + 1;
+           continue;
        }
+    } while(lo < hi);
+
+    /* check if lo and hi converged on the match */
+    if (parent_id == array[hi]) {
+       index = hi;
+    }
        
-       lo = 0;
-       hi = cache->numcached - 1;
-       index = -1;
+    if (no_match_indexp) {
+       *no_match_indexp = hi;
+    }
+
+    return index;
+}
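A standalone, runnable analogue of cache_binSearch()'s contract: return the index of the key in a sorted array or -1, and report the insertion point through no_match_indexp. uint32_t stands in for cnid_t, and the array literal is made up for the example.

    #include <stdint.h>
    #include <stdio.h>

    /* Illustrative sketch mirroring the loop above; not the kernel function itself. */
    static int bin_search(uint32_t *array, unsigned int hi, uint32_t key, int *no_match_indexp)
    {
            unsigned int lo = 0;
            int index = -1;

            do {
                    unsigned int mid = ((hi - lo) / 2) + lo;
                    if (key == array[mid]) { hi = mid; break; }
                    if (key < array[mid])  { hi = mid; continue; }
                    lo = mid + 1;
            } while (lo < hi);

            if (key == array[hi])
                    index = (int)hi;
            if (no_match_indexp)
                    *no_match_indexp = (int)hi;   /* where a new entry would be inserted */
            return index;
    }

    int main(void)
    {
            uint32_t parent_ids[] = { 16, 25, 103, 7040 };   /* sorted, as the cache keeps them */
            int insert_at;

            printf("103 -> index %d\n", bin_search(parent_ids, 3, 103, &insert_at));      /* 2 */
            printf("50  -> index %d, insert at %d\n",
                   bin_search(parent_ids, 3, 50, &insert_at), insert_at);                 /* -1, 2 */
            return 0;
    }
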
+static int
+lookup_bucket(struct access_cache *cache, int *indexp, cnid_t parent_id)
+{
+    unsigned int hi;
+    int matches = 0;
+    int index, no_match_index;
        
-       /* perform binary search for parent_id */
-       do {
-               unsigned int mid = (hi - lo)/2 + lo;
-               unsigned int this_id = cache->acache[mid];
-               
-               if (parent_id == this_id) {
-                       index = mid;
-                       break;
-               }
-               
-               if (parent_id < this_id) {
-                       hi = mid;
-                       continue;
-               }
-               
-               if (parent_id > this_id) {
-                       lo = mid + 1;
-                       continue;
-               }
-       } while(lo < hi);
+    if (cache->numcached == 0) {
+       *indexp = 0;
+       return 0; // table is empty, so insert at index=0 and report no match
+    }
        
-       /* check if lo and hi converged on the match */
-       if (parent_id == cache->acache[hi]) {
-               index = hi;
-       }
+    if (cache->numcached > NUM_CACHE_ENTRIES) {
+       /*printf("hfs: EGAD! numcached is %d... cut our losses and trim to %d\n",
+         cache->numcached, NUM_CACHE_ENTRIES);*/
+       cache->numcached = NUM_CACHE_ENTRIES;
+    }
        
-       /* if no existing entry found, find index for new one */
-       if (index == -1) {
-               index = (parent_id < cache->acache[hi]) ? hi : hi + 1;
-               matches = 0;
-       } else {
-               matches = 1;
-       }
+    hi = cache->numcached - 1;
        
-       *indexp = index;
-       return matches;
+    index = cache_binSearch(cache->acache, hi, parent_id, &no_match_index);
+
+    /* if no existing entry found, find index for new one */
+    if (index == -1) {
+       index = no_match_index;
+       matches = 0;
+    } else {
+       matches = 1;
+    }
+       
+    *indexp = index;
+    return matches;
 }
 
 /*
@@ -648,63 +903,71 @@ lookup_bucket(struct access_cache *cache, int *indexp, cnid_t parent_id)
 static void
 add_node(struct access_cache *cache, int index, cnid_t nodeID, int access)
 {
-       int lookup_index = -1;
-
-       /* need to do a lookup first if -1 passed for index */
-       if (index == -1) {
-               if (lookup_bucket(cache, &lookup_index, nodeID)) {
-                       if (cache->haveaccess[lookup_index] != access) {
-                               /* change access info for existing entry... should never happen */
-                              cache->haveaccess[lookup_index] = access;
-                       }
-
-                      /* mission accomplished */
-                       return;
-               } else {
-                       index = lookup_index;
-               }
-
-       }
-
-       /* if the cache is full, do a replace rather than an insert */
-       if (cache->numcached >= CACHE_ELEMS) {
-               //printf("cache is full (%d). replace at index %d\n", cache->numcached, index);
-               cache->numcached = CACHE_ELEMS-1;
-
-               if (index > cache->numcached) {
-                 //    printf("index %d pinned to %d\n", index, cache->numcached);
-                       index = cache->numcached;
-               }
-       } else if (index >= 0 && index < cache->numcached) {
-               /* only do bcopy if we're inserting */
-               bcopy( cache->acache+index, cache->acache+(index+1), (cache->numcached - index)*sizeof(int) );
-               bcopy( cache->haveaccess+index, cache->haveaccess+(index+1), (cache->numcached - index)*sizeof(Boolean) );
-       }
-
-       cache->acache[index] = nodeID;
-       cache->haveaccess[index] = access;
-       cache->numcached++;
+    int lookup_index = -1;
+
+    /* need to do a lookup first if -1 passed for index */
+    if (index == -1) {
+       if (lookup_bucket(cache, &lookup_index, nodeID)) {
+           if (cache->haveaccess[lookup_index] != access && cache->haveaccess[lookup_index] == ESRCH) {
+               // only update an entry if the previous access was ESRCH (i.e. a scope checking error)
+               cache->haveaccess[lookup_index] = access;
+           }
+
+           /* mission accomplished */
+           return;
+       } else {
+           index = lookup_index;
+       }
+
+    }
+
+    /* if the cache is full, do a replace rather than an insert */
+    if (cache->numcached >= NUM_CACHE_ENTRIES) {
+       //printf("hfs: cache is full (%d). replace at index %d\n", cache->numcached, index);
+       cache->numcached = NUM_CACHE_ENTRIES-1;
+
+       if (index > cache->numcached) {
+           //    printf("hfs: index %d pinned to %d\n", index, cache->numcached);
+           index = cache->numcached;
+       }
+    }
+
+    if (index < cache->numcached && index < NUM_CACHE_ENTRIES && nodeID > cache->acache[index]) {
+       index++;
+    }
+
+    if (index >= 0 && index < cache->numcached) {
+       /* only do bcopy if we're inserting */
+       bcopy( cache->acache+index, cache->acache+(index+1), (cache->numcached - index)*sizeof(int) );
+       bcopy( cache->haveaccess+index, cache->haveaccess+(index+1), (cache->numcached - index)*sizeof(unsigned char) );
+    }
+
+    cache->acache[index] = nodeID;
+    cache->haveaccess[index] = access;
+    cache->numcached++;
 }
 
 
 struct cinfo {
-       uid_t   uid;
-       gid_t   gid;
-       mode_t  mode;
-       cnid_t  parentcnid;
+    uid_t   uid;
+    gid_t   gid;
+    mode_t  mode;
+    cnid_t  parentcnid;
+    u_int16_t recflags;
 };
 
 static int
 snoop_callback(const struct cat_desc *descp, const struct cat_attr *attrp, void * arg)
 {
-       struct cinfo *cip = (struct cinfo *)arg;
+    struct cinfo *cip = (struct cinfo *)arg;
 
-       cip->uid = attrp->ca_uid;
-       cip->gid = attrp->ca_gid;
-       cip->mode = attrp->ca_mode;
-       cip->parentcnid = descp->cd_parentcnid;
+    cip->uid = attrp->ca_uid;
+    cip->gid = attrp->ca_gid;
+    cip->mode = attrp->ca_mode;
+    cip->parentcnid = descp->cd_parentcnid;
+    cip->recflags = attrp->ca_recflags;
        
-       return (0);
+    return (0);
 }
 
 /*
@@ -712,135 +975,508 @@ snoop_callback(const struct cat_desc *descp, const struct cat_attr *attrp, void
  * isn't incore, then go to the catalog.
  */ 
 static int
-do_attr_lookup(struct hfsmount *hfsmp, struct access_cache *cache, dev_t dev, cnid_t cnid, 
-              struct cnode *skip_cp, CatalogKey *keyp, struct cat_attr *cnattrp, struct proc *p)
+do_attr_lookup(struct hfsmount *hfsmp, struct access_cache *cache, cnid_t cnid, 
+    struct cnode *skip_cp, CatalogKey *keyp, struct cat_attr *cnattrp)
 {
-       int error = 0;
-
-       /* if this id matches the one the fsctl was called with, skip the lookup */
-       if (cnid == skip_cp->c_cnid) {
-               cnattrp->ca_uid = skip_cp->c_uid;
-               cnattrp->ca_gid = skip_cp->c_gid;
-               cnattrp->ca_mode = skip_cp->c_mode;
-               keyp->hfsPlus.parentID = skip_cp->c_parentcnid;
+    int error = 0;
+
+    /* if this id matches the one the fsctl was called with, skip the lookup */
+    if (cnid == skip_cp->c_cnid) {
+       cnattrp->ca_uid = skip_cp->c_uid;
+       cnattrp->ca_gid = skip_cp->c_gid;
+       cnattrp->ca_mode = skip_cp->c_mode;
+       cnattrp->ca_recflags = skip_cp->c_attr.ca_recflags;
+       keyp->hfsPlus.parentID = skip_cp->c_parentcnid;
+    } else {
+       struct cinfo c_info;
+
+       /* otherwise, check the cnode hash incase the file/dir is incore */
+       if (hfs_chash_snoop(hfsmp, cnid, 0, snoop_callback, &c_info) == 0) {
+           cnattrp->ca_uid = c_info.uid;
+           cnattrp->ca_gid = c_info.gid;
+           cnattrp->ca_mode = c_info.mode;
+           cnattrp->ca_recflags = c_info.recflags;
+           keyp->hfsPlus.parentID = c_info.parentcnid;
        } else {
-               struct cinfo c_info;
-
-               /* otherwise, check the cnode hash incase the file/dir is incore */
-               if (hfs_chash_snoop(dev, cnid, snoop_callback, &c_info) == 0) {
-                       cnattrp->ca_uid = c_info.uid;
-                       cnattrp->ca_gid = c_info.gid;
-                       cnattrp->ca_mode = c_info.mode;
-                       keyp->hfsPlus.parentID = c_info.parentcnid;
-               } else {
-                       int lockflags;
+           int lockflags;
                        
-                       lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK);
+           lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK);
                        
-                       /* lookup this cnid in the catalog */
-                       error = cat_getkeyplusattr(hfsmp, cnid, keyp, cnattrp);
+           /* lookup this cnid in the catalog */
+           error = cat_getkeyplusattr(hfsmp, cnid, keyp, cnattrp);
                        
-                       hfs_systemfile_unlock(hfsmp, lockflags);
+           hfs_systemfile_unlock(hfsmp, lockflags);
                        
-                       cache->lookups++;
-               }
+           cache->lookups++;
        }
+    }
        
-       return (error);
+    return (error);
 }
 
+
 /*
  * Compute whether we have access to the given directory (nodeID) and all its parents. Cache
  * up to CACHE_LEVELS as we progress towards the root.
  */
 static int 
 do_access_check(struct hfsmount *hfsmp, int *err, struct access_cache *cache, HFSCatalogNodeID nodeID, 
-               struct cnode *skip_cp, struct proc *theProcPtr, kauth_cred_t myp_ucred, dev_t dev )
+    struct cnode *skip_cp, struct proc *theProcPtr, kauth_cred_t myp_ucred,
+    struct vfs_context *my_context,
+    char *bitmap,
+    uint32_t map_size,
+    cnid_t* parents,
+    uint32_t num_parents)
 {
-       int                     myErr = 0;
-       int                     myResult;
-       HFSCatalogNodeID        thisNodeID;
-       unsigned long           myPerms;
-       struct cat_attr         cnattr;
-       int                     cache_index = -1;
-       CatalogKey              catkey;
-
-       int i = 0, ids_to_cache = 0;
-       int parent_ids[CACHE_LEVELS];
-
-       /* root always has access */
-       if (!suser(myp_ucred, NULL)) {
-               return (1);
-       }
-
-       thisNodeID = nodeID;
-       while (thisNodeID >=  kRootDirID) {
-               myResult = 0;   /* default to "no access" */
-       
-               /* check the cache before resorting to hitting the catalog */
-
-               /* ASSUMPTION: access info of cached entries is "final"... i.e. no need
-                * to look any further after hitting cached dir */
-
-               if (lookup_bucket(cache, &cache_index, thisNodeID)) {
-                       cache->cachehits++;
-                       myResult = cache->haveaccess[cache_index];
-                       goto ExitThisRoutine;
-               }
-
-               /* remember which parents we want to cache */
-               if (ids_to_cache < CACHE_LEVELS) {
-                       parent_ids[ids_to_cache] = thisNodeID;
-                       ids_to_cache++;
-               }
+    int                     myErr = 0;
+    int                     myResult;
+    HFSCatalogNodeID        thisNodeID;
+    unsigned int            myPerms;
+    struct cat_attr         cnattr;
+    int                     cache_index = -1, scope_index = -1, scope_idx_start = -1;
+    CatalogKey              catkey;
+
+    int i = 0, ids_to_cache = 0;
+    int parent_ids[CACHE_LEVELS];
+
+    thisNodeID = nodeID;
+    while (thisNodeID >=  kRootDirID) {
+       myResult = 0;   /* default to "no access" */
+              
+       /* check the cache before resorting to hitting the catalog */
+
+       /* ASSUMPTION: access info of cached entries is "final"... i.e. no need
+        * to look any further after hitting cached dir */
+
+       if (lookup_bucket(cache, &cache_index, thisNodeID)) {
+           cache->cachehits++;
+           myErr = cache->haveaccess[cache_index];
+           if (scope_index != -1) {
+               if (myErr == ESRCH) {
+                   myErr = 0;
+               }
+           } else {
+               scope_index = 0;   // so we'll just use the cache result 
+               scope_idx_start = ids_to_cache;
+           }
+           myResult = (myErr == 0) ? 1 : 0;
+           goto ExitThisRoutine;
+       }
+
+
+       if (parents) {
+           int tmp;
+           tmp = cache_binSearch(parents, num_parents-1, thisNodeID, NULL);
+           if (scope_index == -1)
+               scope_index = tmp;
+           if (tmp != -1 && scope_idx_start == -1 && ids_to_cache < CACHE_LEVELS) {
+               scope_idx_start = ids_to_cache;
+           }
+       }          
+
+       /* remember which parents we want to cache */
+       if (ids_to_cache < CACHE_LEVELS) {
+           parent_ids[ids_to_cache] = thisNodeID;
+           ids_to_cache++;
+       }
+       // Inefficient (using modulo) and we might want to use a hash function, not rely on the node id to be "nice"...
+       if (bitmap && map_size) {
+           bitmap[(thisNodeID/8)%(map_size)]|=(1<<(thisNodeID&7));            
+       }
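The bitmap update above is a plain wrap-around bit array keyed on the directory id, so a consumer of the map can see false positives (two ids may land on the same bit) but never false negatives. A self-contained sketch of the same set/test arithmetic; the helper names and MAP_SIZE are made up for the example.

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    #define MAP_SIZE 16     /* bytes, i.e. 128 bits; made-up size for the example */

    /* Illustrative sketch of the same arithmetic; helper names are hypothetical. */
    static void mark_id(char *bitmap, uint32_t id)
    {
            bitmap[(id / 8) % MAP_SIZE] |= (1 << (id & 7));
    }

    static int id_maybe_present(const char *bitmap, uint32_t id)
    {
            return (bitmap[(id / 8) % MAP_SIZE] & (1 << (id & 7))) != 0;
    }

    int main(void)
    {
            char bitmap[MAP_SIZE];

            memset(bitmap, 0, sizeof(bitmap));
            mark_id(bitmap, 2742);
            printf("2742 maybe present? %d\n", id_maybe_present(bitmap, 2742));   /* 1 */
            printf("2743 maybe present? %d\n", id_maybe_present(bitmap, 2743));   /* 0 */
            return 0;
    }
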
               
-              /* do the lookup (checks the cnode hash, then the catalog) */
-              myErr = do_attr_lookup(hfsmp, cache, dev, thisNodeID, skip_cp, &catkey, &cnattr, theProcPtr);
-              if (myErr) {
-                      goto ExitThisRoutine; /* no access */
-              }
-
-               myPerms = DerivePermissionSummary(cnattr.ca_uid, cnattr.ca_gid,
-                                                 cnattr.ca_mode, hfsmp->hfs_mp,
-                                                 myp_ucred, theProcPtr);
-
-               if ( (myPerms & X_OK) == 0 ) {
-                      myResult = 0;
-                       goto ExitThisRoutine;   /* no access */
-              } 
-
-               /* up the hierarchy we go */
-               thisNodeID = catkey.hfsPlus.parentID;
-       }
-
-       /* if here, we have access to this node */
-       myResult = 1;
-
- ExitThisRoutine:
-       if (myErr) {
-              //printf("*** error %d from catalog looking up parent %d/%d!\n", myErr, dev, thisNodeID);
-               myResult = 0;
-       }
-       *err = myErr;
-
-       /* cache the parent directory(ies) */
-       for (i = 0; i < ids_to_cache; i++) {
-               /* small optimization: get rid of double-lookup for all these */
-              // printf("adding %d to cache with result: %d\n", parent_ids[i], myResult);
-               add_node(cache, -1, parent_ids[i], myResult);
-       }
-
-       return (myResult);
+
+       /* do the lookup (checks the cnode hash, then the catalog) */
+       myErr = do_attr_lookup(hfsmp, cache, thisNodeID, skip_cp, &catkey, &cnattr);
+       if (myErr) {
+           goto ExitThisRoutine; /* no access */
+       }
+
+       /* Root always gets access. */
+       if (suser(myp_ucred, NULL) == 0) {
+               thisNodeID = catkey.hfsPlus.parentID;
+               myResult = 1;
+               continue;
+       }
+
+       // if the thing has acl's, do the full permission check
+       if ((cnattr.ca_recflags & kHFSHasSecurityMask) != 0) {
+           struct vnode *vp;
+
+           /* get the vnode for this cnid */
+           myErr = hfs_vget(hfsmp, thisNodeID, &vp, 0, 0);
+           if ( myErr ) {
+               myResult = 0;
+               goto ExitThisRoutine;
+           }
+
+           thisNodeID = VTOC(vp)->c_parentcnid;
+
+           hfs_unlock(VTOC(vp));
+
+           if (vnode_vtype(vp) == VDIR) {
+               myErr = vnode_authorize(vp, NULL, (KAUTH_VNODE_SEARCH | KAUTH_VNODE_LIST_DIRECTORY), my_context);
+           } else {
+               myErr = vnode_authorize(vp, NULL, KAUTH_VNODE_READ_DATA, my_context);
+           }
+
+           vnode_put(vp);
+           if (myErr) {
+               myResult = 0;
+               goto ExitThisRoutine;
+           }
+       } else {
+           unsigned int flags;
+               int mode = cnattr.ca_mode & S_IFMT;   
+               myPerms = DerivePermissionSummary(cnattr.ca_uid, cnattr.ca_gid, cnattr.ca_mode, hfsmp->hfs_mp,myp_ucred, theProcPtr);
+
+               if (mode == S_IFDIR) {
+                       flags = R_OK | X_OK;
+               } else {
+                       flags = R_OK;
+               }
+               if ( (myPerms & flags) != flags) {
+                       myResult = 0;
+                       myErr = EACCES;
+                       goto ExitThisRoutine;   /* no access */
+               }
+
+           /* up the hierarchy we go */
+           thisNodeID = catkey.hfsPlus.parentID;
+       }
+    }
+
+    /* if here, we have access to this node */
+    myResult = 1;
+
+  ExitThisRoutine:
+    if (parents && myErr == 0 && scope_index == -1) {
+       myErr = ESRCH;
+    }
+                               
+    if (myErr) {
+       myResult = 0;
+    }
+    *err = myErr;
+
+    /* cache the parent directory(ies) */
+    for (i = 0; i < ids_to_cache; i++) {
+       if (myErr == 0 && parents && (scope_idx_start == -1 || i > scope_idx_start)) {
+           add_node(cache, -1, parent_ids[i], ESRCH);
+       } else {
+           add_node(cache, -1, parent_ids[i], myErr);
+       }
+    }
+
+    return (myResult);
 }
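do_access_check() above answers "can this identity reach nodeID?" by walking parent ids toward the root and requiring search permission at every level, caching what it learns along the way. A rough userspace analogue, assuming only dirname(3) and access(2), walks the ancestry textually instead of by cnid (no caching, purely illustrative):

    #include <libgen.h>
    #include <limits.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    /* Illustrative userspace analogue; the function name is hypothetical. */
    static int ancestors_searchable(const char *path)
    {
            char buf[PATH_MAX];

            snprintf(buf, sizeof(buf), "%s", path);
            for (;;) {
                    char *parent = dirname(buf);            /* climb one level */
                    if (access(parent, X_OK) != 0)
                            return 0;                       /* no search permission */
                    if (strcmp(parent, "/") == 0 || strcmp(parent, ".") == 0)
                            return 1;                       /* reached the top */
                    memmove(buf, parent, strlen(parent) + 1);
            }
    }

    int main(int argc, char **argv)
    {
            const char *p = (argc > 1) ? argv[1] : "/private/var/log/system.log";

            printf("%s: ancestors %s\n", p,
                   ancestors_searchable(p) ? "searchable" : "blocked");
            return 0;
    }
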
-/* end "bulk-access" support */
 
+static int
+do_bulk_access_check(struct hfsmount *hfsmp, struct vnode *vp,
+    struct vnop_ioctl_args *ap, int arg_size, vfs_context_t context)
+{
+    boolean_t is64bit;
+
+    /*
+     * NOTE: on entry, the vnode is locked. Incase this vnode
+     * happens to be in our list of file_ids, we'll note it
+     * avoid calling hfs_chashget_nowait() on that id as that
+     * will cause a "locking against myself" panic.
+     */
+    Boolean check_leaf = true;
+               
+    struct user64_ext_access_t *user_access_structp;
+    struct user64_ext_access_t tmp_user_access;
+    struct access_cache cache;
+               
+    int error = 0, prev_parent_check_ok=1;
+    unsigned int i;
+               
+    short flags;
+    unsigned int num_files = 0;
+    int map_size = 0;
+    int num_parents = 0;
+    int *file_ids=NULL;
+    short *access=NULL;
+    char *bitmap=NULL;
+    cnid_t *parents=NULL;
+    int leaf_index;
+       
+    cnid_t cnid;
+    cnid_t prevParent_cnid = 0;
+    unsigned int myPerms;
+    short myaccess = 0;
+    struct cat_attr cnattr;
+    CatalogKey catkey;
+    struct cnode *skip_cp = VTOC(vp);
+    kauth_cred_t cred = vfs_context_ucred(context);
+    proc_t p = vfs_context_proc(context);
+
+    is64bit = proc_is64bit(p);
+
+    /* initialize the local cache and buffers */
+    cache.numcached = 0;
+    cache.cachehits = 0;
+    cache.lookups = 0;
+    cache.acache = NULL;
+    cache.haveaccess = NULL;
+               
+    /* struct copyin done during dispatch... need to copy file_id array separately */
+    if (ap->a_data == NULL) {
+       error = EINVAL;
+       goto err_exit_bulk_access;
+    }
+
+    if (is64bit) {
+       if (arg_size != sizeof(struct user64_ext_access_t)) {
+           error = EINVAL;
+           goto err_exit_bulk_access;
+       }
+
+       user_access_structp = (struct user64_ext_access_t *)ap->a_data;
+
+    } else if (arg_size == sizeof(struct user32_access_t)) {
+       struct user32_access_t *accessp = (struct user32_access_t *)ap->a_data;
+
+       // convert an old style bulk-access struct to the new style
+       tmp_user_access.flags     = accessp->flags;
+       tmp_user_access.num_files = accessp->num_files;
+       tmp_user_access.map_size  = 0;
+       tmp_user_access.file_ids  = CAST_USER_ADDR_T(accessp->file_ids);
+       tmp_user_access.bitmap    = USER_ADDR_NULL;
+       tmp_user_access.access    = CAST_USER_ADDR_T(accessp->access);
+       tmp_user_access.num_parents = 0;
+       user_access_structp = &tmp_user_access;
+
+    } else if (arg_size == sizeof(struct user32_ext_access_t)) {
+       struct user32_ext_access_t *accessp = (struct user32_ext_access_t *)ap->a_data;
+
+       // up-cast from a 32-bit version of the struct
+       tmp_user_access.flags     = accessp->flags;
+       tmp_user_access.num_files = accessp->num_files;
+       tmp_user_access.map_size  = accessp->map_size;
+       tmp_user_access.num_parents  = accessp->num_parents;
+
+       tmp_user_access.file_ids  = CAST_USER_ADDR_T(accessp->file_ids);
+       tmp_user_access.bitmap    = CAST_USER_ADDR_T(accessp->bitmap);
+       tmp_user_access.access    = CAST_USER_ADDR_T(accessp->access);
+       tmp_user_access.parents    = CAST_USER_ADDR_T(accessp->parents);
+
+       user_access_structp = &tmp_user_access;
+    } else {
+       error = EINVAL;
+       goto err_exit_bulk_access;
+    }
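The two branches above up-convert the legacy 32-bit layouts into the widest (user64_ext_access_t) form so the rest of the handler only has to deal with one shape. The same widen-on-entry idiom in plain, runnable C, with hypothetical struct and field names:

    #include <stdint.h>
    #include <stdio.h>

    struct req32 {                  /* legacy layout: 32-bit user addresses */
            uint32_t flags;
            uint32_t num_files;
            uint32_t file_ids;
    };

    struct req64 {                  /* widest layout the handler works with */
            uint32_t flags;
            uint32_t num_files;
            uint64_t file_ids;
    };

    /* Illustrative sketch only; names are hypothetical. */
    static struct req64 widen(const struct req32 *in)
    {
            struct req64 out;

            out.flags     = in->flags;
            out.num_files = in->num_files;
            out.file_ids  = (uint64_t)in->file_ids;   /* zero-extend the 32-bit address */
            return out;
    }

    int main(void)
    {
            struct req32 legacy = { 1, 4, 0x80001000u };
            struct req64 wide = widen(&legacy);

            printf("files=%u, ids at 0x%llx\n", wide.num_files,
                   (unsigned long long)wide.file_ids);
            return 0;
    }
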
+               
+    map_size = user_access_structp->map_size;
+
+    num_files = user_access_structp->num_files;
+
+    num_parents= user_access_structp->num_parents;
+
+    if (num_files < 1) {
+       goto err_exit_bulk_access;
+    }
+    if (num_files > 1024) {
+       error = EINVAL;
+       goto err_exit_bulk_access;
+    }
+
+    if (num_parents > 1024) {
+       error = EINVAL;
+       goto err_exit_bulk_access;
+    }
+               
+    file_ids = (int *) kalloc(sizeof(int) * num_files);
+    access = (short *) kalloc(sizeof(short) * num_files);
+    if (map_size) {
+       bitmap = (char *) kalloc(sizeof(char) * map_size);
+    }
+
+    if (num_parents) {
+       parents = (cnid_t *) kalloc(sizeof(cnid_t) * num_parents);
+    }
+
+    cache.acache = (unsigned int *) kalloc(sizeof(int) * NUM_CACHE_ENTRIES);
+    cache.haveaccess = (unsigned char *) kalloc(sizeof(unsigned char) * NUM_CACHE_ENTRIES);
+               
+    if (file_ids == NULL || access == NULL || (map_size != 0 && bitmap == NULL) || cache.acache == NULL || cache.haveaccess == NULL) {
+       if (file_ids) {
+           kfree(file_ids, sizeof(int) * num_files);
+       }
+       if (bitmap) {
+           kfree(bitmap, sizeof(char) * map_size);
+       }
+       if (access) {
+           kfree(access, sizeof(short) * num_files);
+       }
+       if (cache.acache) {
+           kfree(cache.acache, sizeof(int) * NUM_CACHE_ENTRIES);
+       }
+       if (cache.haveaccess) {
+           kfree(cache.haveaccess, sizeof(unsigned char) * NUM_CACHE_ENTRIES);
+       }
+       if (parents) {
+           kfree(parents, sizeof(cnid_t) * num_parents);
+       }                       
+       return ENOMEM;
+    }
+               
+    // make sure the bitmap is zero'ed out...
+    if (bitmap) {
+       bzero(bitmap, (sizeof(char) * map_size));
+    }
+
+    if ((error = copyin(user_access_structp->file_ids, (caddr_t)file_ids,
+               num_files * sizeof(int)))) {
+       goto err_exit_bulk_access;
+    }
+       
+    if (num_parents) {
+       if ((error = copyin(user_access_structp->parents, (caddr_t)parents,
+                   num_parents * sizeof(cnid_t)))) {
+           goto err_exit_bulk_access;
+       }
+    }
+       
+    flags = user_access_structp->flags;
+    if ((flags & (F_OK | R_OK | W_OK | X_OK)) == 0) {
+       flags = R_OK;
+    }
+               
+    /* check if we've been passed leaf node ids or parent ids */
+    if (flags & PARENT_IDS_FLAG) {
+       check_leaf = false;
+    }
+               
+    /* Check access to each file_id passed in */
+    for (i = 0; i < num_files; i++) {
+       leaf_index=-1;
+       cnid = (cnid_t) file_ids[i];
+                       
+       /* root always has access */
+       if ((!parents) && (!suser(cred, NULL))) {
+           access[i] = 0;
+           continue;
+       }
+                       
+       if (check_leaf) {
+           /* do the lookup (checks the cnode hash, then the catalog) */
+           error = do_attr_lookup(hfsmp, &cache, cnid, skip_cp, &catkey, &cnattr);
+           if (error) {
+               access[i] = (short) error;
+               continue;
+           }
+           
+           if (parents) {
+               // Check if the leaf matches one of the parent scopes
+               leaf_index = cache_binSearch(parents, num_parents-1, cnid, NULL);
+               if (leaf_index >= 0 && parents[leaf_index] == cnid)
+                   prev_parent_check_ok = 0;
+               else if (leaf_index >= 0)
+                   prev_parent_check_ok = 1;
+           }
+
+           // if the thing has acl's, do the full permission check
+           if ((cnattr.ca_recflags & kHFSHasSecurityMask) != 0) {
+               struct vnode *cvp;
+               int myErr = 0;
+               /* get the vnode for this cnid */
+               myErr = hfs_vget(hfsmp, cnid, &cvp, 0, 0);
+               if ( myErr ) {
+                   access[i] = myErr;
+                   continue;
+               }
+               
+               hfs_unlock(VTOC(cvp));
+               
+               if (vnode_vtype(cvp) == VDIR) {
+                   myErr = vnode_authorize(cvp, NULL, (KAUTH_VNODE_SEARCH | KAUTH_VNODE_LIST_DIRECTORY), context);
+               } else {
+                   myErr = vnode_authorize(cvp, NULL, KAUTH_VNODE_READ_DATA, context);
+               }
+               
+               vnode_put(cvp);
+               if (myErr) {
+                   access[i] = myErr;
+                   continue;
+               }
+           } else {
+               /* before calling CheckAccess(), check the target file for read access */
+               myPerms = DerivePermissionSummary(cnattr.ca_uid, cnattr.ca_gid,
+                   cnattr.ca_mode, hfsmp->hfs_mp, cred, p);
+               
+               /* fail fast if no access */ 
+               if ((myPerms & flags) == 0) {
+                   access[i] = EACCES;
+                   continue;
+               }                                                       
+           }
+       } else {
+           /* we were passed an array of parent ids */
+           catkey.hfsPlus.parentID = cnid;
+       }
+                       
+       /* if the last guy had the same parent and had access, we're done */
+       if (i > 0 && catkey.hfsPlus.parentID == prevParent_cnid && access[i-1] == 0 && prev_parent_check_ok) {
+           cache.cachehits++;
+           access[i] = 0;
+           continue;
+       }
+                       
+       myaccess = do_access_check(hfsmp, &error, &cache, catkey.hfsPlus.parentID, 
+           skip_cp, p, cred, context,bitmap, map_size, parents, num_parents);
+                       
+       if (myaccess || (error == ESRCH && leaf_index != -1)) {
+           access[i] = 0; // have access.. no errors to report
+       } else {
+           access[i] = (error != 0 ? (short) error : EACCES);
+       }
+                       
+       prevParent_cnid = catkey.hfsPlus.parentID;
+    }
+               
+    /* copyout the access array */
+    if ((error = copyout((caddr_t)access, user_access_structp->access, 
+               num_files * sizeof (short)))) {
+       goto err_exit_bulk_access;
+    }
+    if (map_size && bitmap) {
+       if ((error = copyout((caddr_t)bitmap, user_access_structp->bitmap, 
+                   map_size * sizeof (char)))) {
+           goto err_exit_bulk_access;
+       }
+    }
+       
+               
+  err_exit_bulk_access:
+               
+    //printf("hfs: on exit (err %d), numfiles/numcached/cachehits/lookups is %d/%d/%d/%d\n", error, num_files, cache.numcached, cache.cachehits, cache.lookups);
+               
+    if (file_ids) 
+       kfree(file_ids, sizeof(int) * num_files);
+    if (parents) 
+       kfree(parents, sizeof(cnid_t) * num_parents);
+    if (bitmap) 
+       kfree(bitmap, sizeof(char) * map_size);
+    if (access)
+       kfree(access, sizeof(short) * num_files);
+    if (cache.acache)
+       kfree(cache.acache, sizeof(int) * NUM_CACHE_ENTRIES);
+    if (cache.haveaccess)
+       kfree(cache.haveaccess, sizeof(unsigned char) * NUM_CACHE_ENTRIES);
+               
+    return (error);
+}
+
+
+/* end "bulk-access" support */
 
 
 /*
  * Callback for use with freeze ioctl.
  */
 static int
-hfs_freezewrite_callback(struct vnode *vp, void *cargs)
+hfs_freezewrite_callback(struct vnode *vp, __unused void *cargs)
 {
        vnode_waitforwrites(vp, 0, 0, 0, "hfs freeze");
 
@@ -866,11 +1502,112 @@ hfs_vnop_ioctl( struct vnop_ioctl_args /* {
        proc_t p = vfs_context_proc(context);
        struct vfsstatfs *vfsp;
        boolean_t is64bit;
+       off_t jnl_start, jnl_size;
+       struct hfs_journal_info *jip;
+#if HFS_COMPRESSION
+       int compressed = 0;
+       off_t uncompressed_size = -1;
+       int decmpfs_error = 0;
+       
+       if (ap->a_command == F_RDADVISE) {
+               /* we need to inspect the decmpfs state of the file as early as possible */
+               compressed = hfs_file_is_compressed(VTOC(vp), 0);
+               if (compressed) {
+                       if (VNODE_IS_RSRC(vp)) {
+                               /* if this is the resource fork, treat it as if it were empty */
+                               uncompressed_size = 0;
+                       } else {
+                               decmpfs_error = hfs_uncompressed_size_of_compressed_file(NULL, vp, 0, &uncompressed_size, 0);
+                               if (decmpfs_error != 0) {
+                                       /* failed to get the uncompressed size, we'll check for this later */
+                                       uncompressed_size = -1;
+                               }
+                       }
+               }
+       }
+#endif /* HFS_COMPRESSION */
 
        is64bit = proc_is64bit(p);
 
+#if CONFIG_PROTECT
+       {
+               int error = 0;
+               if ((error = cp_handle_vnop(VTOC(vp), CP_WRITE_ACCESS)) != 0) {
+                       return error;
+               }
+       }
+#endif /* CONFIG_PROTECT */
+
        switch (ap->a_command) {
 
+       case HFS_GETPATH:
+       {
+               struct vnode *file_vp;
+               cnid_t  cnid;
+               int  outlen;
+               char *bufptr;
+               int error;
+
+               /* Caller must be owner of file system. */
+               vfsp = vfs_statfs(HFSTOVFS(hfsmp));
+               if (suser(cred, NULL) &&
+                       kauth_cred_getuid(cred) != vfsp->f_owner) {
+                       return (EACCES);
+               }
+               /* Target vnode must be file system's root. */
+               if (!vnode_isvroot(vp)) {
+                       return (EINVAL);
+               }
+               bufptr = (char *)ap->a_data;
+               cnid = strtoul(bufptr, NULL, 10);
+
+               /* We need to call hfs_vfs_vget to leverage the code that will
+                * fix the origin list for us if needed, as opposed to calling
+                * hfs_vget, since we will need the parent for build_path call.
+                */
+
+               if ((error = hfs_vfs_vget(HFSTOVFS(hfsmp), cnid, &file_vp, context))) {
+                       return (error);
+               }
+               error = build_path(file_vp, bufptr, sizeof(pathname_t), &outlen, 0, context);
+               vnode_put(file_vp);
+
+               return (error);
+       }
+
+       case HFS_PREV_LINK:
+       case HFS_NEXT_LINK:
+       {
+               cnid_t linkfileid;
+               cnid_t nextlinkid;
+               cnid_t prevlinkid;
+               int error;
+
+               /* Caller must be owner of file system. */
+               vfsp = vfs_statfs(HFSTOVFS(hfsmp));
+               if (suser(cred, NULL) &&
+                       kauth_cred_getuid(cred) != vfsp->f_owner) {
+                       return (EACCES);
+               }
+               /* Target vnode must be file system's root. */
+               if (!vnode_isvroot(vp)) {
+                       return (EINVAL);
+               }
+               linkfileid = *(cnid_t *)ap->a_data;
+               if (linkfileid < kHFSFirstUserCatalogNodeID) {
+                       return (EINVAL);
+               }
+               if ((error = hfs_lookup_siblinglinks(hfsmp, linkfileid, &prevlinkid, &nextlinkid))) {
+                       return (error);
+               }
+               if (ap->a_command == HFS_NEXT_LINK) {
+                       *(cnid_t *)ap->a_data = nextlinkid;
+               } else {
+                       *(cnid_t *)ap->a_data = prevlinkid;
+               }
+               return (0);
+       }
+
        case HFS_RESIZE_PROGRESS: {
 
                vfsp = vfs_statfs(HFSTOVFS(hfsmp));
@@ -881,8 +1618,14 @@ hfs_vnop_ioctl( struct vnop_ioctl_args /* {
                if (!vnode_isvroot(vp)) {
                        return (EINVAL);
                }
+               /* file system must not be mounted read-only */
+               if (hfsmp->hfs_flags & HFS_READ_ONLY) {
+                       return (EROFS);
+               }
+
                return hfs_resize_progress(hfsmp, (u_int32_t *)ap->a_data);
        }
+
        case HFS_RESIZE_VOLUME: {
                u_int64_t newsize;
                u_int64_t cursize;
@@ -895,6 +1638,11 @@ hfs_vnop_ioctl( struct vnop_ioctl_args /* {
                if (!vnode_isvroot(vp)) {
                        return (EINVAL);
                }
+               
+               /* filesystem must not be mounted read only */
+               if (hfsmp->hfs_flags & HFS_READ_ONLY) {
+                       return (EROFS);
+               }
                newsize = *(u_int64_t *)ap->a_data;
                cursize = (u_int64_t)hfsmp->totalBlocks * (u_int64_t)hfsmp->blockSize;
                
@@ -907,6 +1655,7 @@ hfs_vnop_ioctl( struct vnop_ioctl_args /* {
                }
        }
        case HFS_CHANGE_NEXT_ALLOCATION: {
+               int error = 0;          /* Assume success */
                u_int32_t location;
 
                if (vnode_vfsisrdonly(vp)) {
@@ -920,26 +1669,44 @@ hfs_vnop_ioctl( struct vnop_ioctl_args /* {
                if (!vnode_isvroot(vp)) {
                        return (EINVAL);
                }
+               HFS_MOUNT_LOCK(hfsmp, TRUE);
                location = *(u_int32_t *)ap->a_data;
-               if (location > hfsmp->totalBlocks - 1) {
-                       return (EINVAL);
+               if ((location >= hfsmp->allocLimit) &&
+                       (location != HFS_NO_UPDATE_NEXT_ALLOCATION)) {
+                       error = EINVAL;
+                       goto fail_change_next_allocation;
                }
                /* Return previous value. */
                *(u_int32_t *)ap->a_data = hfsmp->nextAllocation;
-               HFS_MOUNT_LOCK(hfsmp, TRUE);
-               hfsmp->nextAllocation = location;
-               hfsmp->vcbFlags |= 0xFF00;
+               if (location == HFS_NO_UPDATE_NEXT_ALLOCATION) {
+                       /* If location is the magic value, set nextAllocation to the next
+                        * block after the metadata zone and set a flag in the mount
+                        * structure so that nextAllocation is not updated again.
+                        */
+                       if (hfsmp->hfs_metazone_end != 0) {
+                               HFS_UPDATE_NEXT_ALLOCATION(hfsmp, hfsmp->hfs_metazone_end + 1);
+                       }
+                       hfsmp->hfs_flags |= HFS_SKIP_UPDATE_NEXT_ALLOCATION; 
+               } else {
+                       hfsmp->hfs_flags &= ~HFS_SKIP_UPDATE_NEXT_ALLOCATION; 
+                       HFS_UPDATE_NEXT_ALLOCATION(hfsmp, location);
+               }
+               MarkVCBDirty(hfsmp);
+fail_change_next_allocation:
                HFS_MOUNT_UNLOCK(hfsmp, TRUE);
-               return (0);
+               return (error);
        }
 
-#ifdef HFS_SPARSE_DEV
+#if HFS_SPARSE_DEV
        case HFS_SETBACKINGSTOREINFO: {
                struct vnode * bsfs_rootvp;
                struct vnode * di_vp;
                struct hfs_backingstoreinfo *bsdata;
                int error = 0;
                
+               if (hfsmp->hfs_flags & HFS_READ_ONLY) {
+                       return (EROFS);
+               }
                if (hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) {
                        return (EALREADY);
                }
@@ -980,10 +1747,41 @@ hfs_vnop_ioctl( struct vnop_ioctl_args /* {
                vnode_put(bsfs_rootvp);
 
                hfsmp->hfs_backingfs_rootvp = bsfs_rootvp;
+
                hfsmp->hfs_flags |= HFS_HAS_SPARSE_DEVICE;
+               /* The free extent cache is managed differently for sparse devices.  
+                * There is a window between when the volume is mounted and when the
+                * device is marked as sparse, so the free extent cache for this
+                * volume is currently initialized as for a normal volume (sorted by
+                * block count).  Reset the cache so that it will be rebuilt for a
+                * sparse device (sorted by start block).
+                */
+               ResetVCBFreeExtCache(hfsmp);
+
                hfsmp->hfs_sparsebandblks = bsdata->bandsize / HFSTOVCB(hfsmp)->blockSize;
                hfsmp->hfs_sparsebandblks *= 4;
 
+               vfs_markdependency(hfsmp->hfs_mp);
+
+               /*
+                * If the sparse image is on a sparse image file (as opposed to a sparse
+                * bundle), then we may need to limit the free space to the maximum size
+                * of a file on that volume.  So we query (using pathconf), and if we get
+                * a meaningful result, we cache the number of blocks for later use in
+                * hfs_freeblks().
+                */
+               hfsmp->hfs_backingfs_maxblocks = 0;
+               if (vnode_vtype(di_vp) == VREG) {
+                       int terr;
+                       int hostbits;
+                       terr = vn_pathconf(di_vp, _PC_FILESIZEBITS, &hostbits, context);
+                       if (terr == 0 && hostbits != 0 && hostbits < 64) {
+                               u_int64_t hostfilesizemax = ((u_int64_t)1) << hostbits;
+                               
+                               hfsmp->hfs_backingfs_maxblocks = hostfilesizemax / hfsmp->blockSize;
+                       }
+               }
+                               
                (void)vnode_put(di_vp);
                file_drop(bsdata->backingfd);
                return (0);
@@ -996,6 +1794,10 @@ hfs_vnop_ioctl( struct vnop_ioctl_args /* {
                        kauth_cred_getuid(cred) != vfsp->f_owner) {
                        return (EACCES); /* must be owner of file system */
                }
+               if (hfsmp->hfs_flags & HFS_READ_ONLY) {
+                       return (EROFS);
+               }
+
                if ((hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) &&
                    hfsmp->hfs_backingfs_rootvp) {
 
@@ -1011,32 +1813,38 @@ hfs_vnop_ioctl( struct vnop_ioctl_args /* {
 
        case F_FREEZE_FS: {
                struct mount *mp;
-               task_t task;
  
-               if (!is_suser())
-                       return (EACCES);
-
                mp = vnode_mount(vp);
                hfsmp = VFSTOHFS(mp);
 
                if (!(hfsmp->jnl))
                        return (ENOTSUP);
 
+               vfsp = vfs_statfs(mp);
+       
+               if (kauth_cred_getuid(cred) != vfsp->f_owner &&
+                       !kauth_cred_issuser(cred))
+                       return (EACCES);
+
                lck_rw_lock_exclusive(&hfsmp->hfs_insync);
  
-               task = current_task();
-               task_working_set_disable(task);
-
                // flush things before we get started to try and prevent
                // dirty data from being paged out while we're frozen.
                // note: can't do this after taking the lock as it will
                // deadlock against ourselves.
                vnode_iterate(mp, 0, hfs_freezewrite_callback, NULL);
-               hfs_global_exclusive_lock_acquire(hfsmp);
-               journal_flush(hfsmp->jnl);
+               hfs_lock_global (hfsmp, HFS_EXCLUSIVE_LOCK);
+
+               // DO NOT call hfs_journal_flush() because that takes a
+               // shared lock on the global exclusive lock!
+               journal_flush(hfsmp->jnl, TRUE);
 
                // don't need to iterate on all vnodes, we just need to
                // wait for writes to the system files and the device vnode
+               //
+               // Now that journal flush waits for all metadata blocks to 
+               // be written out, waiting for btree writes is probably no
+               // longer required.
                if (HFSTOVCB(hfsmp)->extentsRefNum)
                    vnode_waitforwrites(HFSTOVCB(hfsmp)->extentsRefNum, 0, 0, 0, "hfs freeze");
                if (HFSTOVCB(hfsmp)->catalogRefNum)
@@ -1053,7 +1861,9 @@ hfs_vnop_ioctl( struct vnop_ioctl_args /* {
        }
 
        case F_THAW_FS: {
-               if (!is_suser())
+               vfsp = vfs_statfs(vnode_mount(vp));
+               if (kauth_cred_getuid(cred) != vfsp->f_owner &&
+                       !kauth_cred_issuser(cred))
                        return (EACCES);
 
                // if we're not the one who froze the fs then we
@@ -1066,279 +1876,81 @@ hfs_vnop_ioctl( struct vnop_ioctl_args /* {
                //       code that "thaws" the fs in hfs_vnop_close()
                //
                hfsmp->hfs_freezing_proc = NULL;
-               hfs_global_exclusive_lock_release(hfsmp);
+               hfs_unlock_global (hfsmp);
                lck_rw_unlock_exclusive(&hfsmp->hfs_insync);
 
                return (0);
        }
 
-#define HFSIOC_BULKACCESS _IOW('h', 9, struct access_t)
-#define HFS_BULKACCESS_FSCTL IOCBASECMD(HFSIOC_BULKACCESS)
-
-       case HFS_BULKACCESS_FSCTL:
-       case HFS_BULKACCESS: {
-               /*
-                * NOTE: on entry, the vnode is locked. Incase this vnode
-                * happens to be in our list of file_ids, we'll note it
-                * avoid calling hfs_chashget_nowait() on that id as that
-                * will cause a "locking against myself" panic.
-                */
-               Boolean check_leaf = true;
-               
-               struct user_access_t *user_access_structp;
-               struct user_access_t tmp_user_access_t;
-               struct access_cache cache;
-               
-               int error = 0, i;
-               
-               dev_t dev = VTOC(vp)->c_dev;
-               
-               short flags;
-               struct ucred myucred;
-               int num_files;
-               int *file_ids = NULL;
-               short *access = NULL;
-               
-               cnid_t cnid;
-               cnid_t prevParent_cnid = 0;
-               unsigned long myPerms;
-               short myaccess = 0;
-               struct cat_attr cnattr;
-               CatalogKey catkey;
-               struct cnode *skip_cp = VTOC(vp);
-               struct vfs_context      my_context;
-
-               /* set up front for common exit code */
-               my_context.vc_ucred = NOCRED;
-
-               /* first, return error if not run as root */
-               if (cred->cr_ruid != 0) {
-                       return EPERM;
-               }
-               
-               /* initialize the local cache and buffers */
-               cache.numcached = 0;
-               cache.cachehits = 0;
-               cache.lookups = 0;
-               
-               file_ids = (int *) get_pathbuff();
-               access = (short *) get_pathbuff();
-               cache.acache = (int *) get_pathbuff();
-               cache.haveaccess = (Boolean *) get_pathbuff();
-               
-               if (file_ids == NULL || access == NULL || cache.acache == NULL || cache.haveaccess == NULL) {
-                       release_pathbuff((char *) file_ids);
-                       release_pathbuff((char *) access);
-                       release_pathbuff((char *) cache.acache);
-                       release_pathbuff((char *) cache.haveaccess);
-                       
-                       return ENOMEM;
-               }
-               
-               /* struct copyin done during dispatch... need to copy file_id array separately */
-               if (ap->a_data == NULL) {
-                       error = EINVAL;
-                       goto err_exit_bulk_access;
-               }
-
-               if (is64bit) {
-                       user_access_structp = (struct user_access_t *)ap->a_data;
-               }
-               else {
-                       struct access_t *       accessp = (struct access_t *)ap->a_data;
-                       tmp_user_access_t.uid = accessp->uid;
-                       tmp_user_access_t.flags = accessp->flags;
-                       tmp_user_access_t.num_groups = accessp->num_groups;
-                       tmp_user_access_t.num_files = accessp->num_files;
-                       tmp_user_access_t.file_ids = CAST_USER_ADDR_T(accessp->file_ids);
-                       tmp_user_access_t.groups = CAST_USER_ADDR_T(accessp->groups);
-                       tmp_user_access_t.access = CAST_USER_ADDR_T(accessp->access);
-                       user_access_structp = &tmp_user_access_t;
-               }
-               
-               num_files = user_access_structp->num_files;
-               if (num_files < 1) {
-                       goto err_exit_bulk_access;
-               }
-               if (num_files > 256) {
-                       error = EINVAL;
-                       goto err_exit_bulk_access;
-               }
-               
-               if ((error = copyin(user_access_structp->file_ids, (caddr_t)file_ids,
-                                                       num_files * sizeof(int)))) {
-                       goto err_exit_bulk_access;
-               }
-               
-               /* fill in the ucred structure */
-               flags = user_access_structp->flags;
-               if ((flags & (F_OK | R_OK | W_OK | X_OK)) == 0) {
-                       flags = R_OK;
-               }
-               
-               /* check if we've been passed leaf node ids or parent ids */
-               if (flags & PARENT_IDS_FLAG) {
-                       check_leaf = false;
-               }
-               
-               /*
-                * Create a templated credential; this credential may *NOT*
-                * be used unless instantiated with a kauth_cred_create();
-                * there must be a correcponding kauth_cred_unref() when it
-                * is no longer in use (i.e. before it goes out of scope).
-                */
-               memset(&myucred, 0, sizeof(myucred));
-               myucred.cr_ref = 1;
-               myucred.cr_uid = myucred.cr_ruid = myucred.cr_svuid = user_access_structp->uid;
-               myucred.cr_ngroups = user_access_structp->num_groups;
-               if (myucred.cr_ngroups < 1 || myucred.cr_ngroups > 16) {
-                       myucred.cr_ngroups = 0;
-               } else if ((error = copyin(user_access_structp->groups, (caddr_t)myucred.cr_groups,
-                                         myucred.cr_ngroups * sizeof(gid_t)))) {
-                       goto err_exit_bulk_access;
-               }
-               myucred.cr_rgid = myucred.cr_svgid = myucred.cr_groups[0];
-               myucred.cr_gmuid = myucred.cr_uid;
-               
-               my_context.vc_proc = p;
-               my_context.vc_ucred = kauth_cred_create(&myucred);
+       case HFS_BULKACCESS_FSCTL: {
+           int size;
+           
+           if (hfsmp->hfs_flags & HFS_STANDARD) {
+               return EINVAL;
+           }
 
-               /* Check access to each file_id passed in */
-               for (i = 0; i < num_files; i++) {
-#if 0
-                       cnid = (cnid_t) file_ids[i];
-                       
-                       /* root always has access */
-                       if (!suser(my_context.vc_ucred, NULL)) {
-                               access[i] = 0;
-                               continue;
-                       }
-                       
-                       if (check_leaf) {
-                               
-                               /* do the lookup (checks the cnode hash, then the catalog) */
-                               error = do_attr_lookup(hfsmp, &cache, dev, cnid, skip_cp, &catkey, &cnattr, p);
-                               if (error) {
-                                       access[i] = (short) error;
-                                       continue;
-                               }
-                                                       
-                               /* before calling CheckAccess(), check the target file for read access */
-                               myPerms = DerivePermissionSummary(cnattr.ca_uid, cnattr.ca_gid,
-                                                                 cnattr.ca_mode, hfsmp->hfs_mp, my_context.vc_ucred, p  );
-                               
-                               
-                               /* fail fast if no access */ 
-                               if ((myPerms & flags) == 0) {
-                                       access[i] = EACCES;
-                                       continue;
-                               }
-                       } else {
-                               /* we were passed an array of parent ids */
-                               catkey.hfsPlus.parentID = cnid;
-                       }
-                       
-                       /* if the last guy had the same parent and had access, we're done */
-                       if (i > 0 && catkey.hfsPlus.parentID == prevParent_cnid && access[i-1] == 0) {
-                               cache.cachehits++;
-                               access[i] = 0;
-                               continue;
-                       }
-                       
-                       myaccess = do_access_check(hfsmp, &error, &cache, catkey.hfsPlus.parentID, 
-                                                  skip_cp, p, my_context.vc_ucred, dev);
-                       
-                       if ( myaccess ) {
-                               access[i] = 0; // have access.. no errors to report
-                       } else {
-                               access[i] = (error != 0 ? (short) error : EACCES);
-                       }
-                       
-                       prevParent_cnid = catkey.hfsPlus.parentID;
-#else
-                       int myErr;
-                       
-                       cnid = (cnid_t)file_ids[i];
-                       
-                       while (cnid >= kRootDirID) {
-                           /* get the vnode for this cnid */
-                           myErr = hfs_vget(hfsmp, cnid, &vp, 0);
-                           if ( myErr ) {
-                               access[i] = EACCES;
-                               break;
-                           }
+           if (is64bit) {
+               size = sizeof(struct user64_access_t);
+           } else {
+               size = sizeof(struct user32_access_t);
+           }
+           
+           return do_bulk_access_check(hfsmp, vp, ap, size, context);
+       } 
 
-                           cnid = VTOC(vp)->c_parentcnid;
+       case HFS_EXT_BULKACCESS_FSCTL: {
+           int size;
+           
+           if (hfsmp->hfs_flags & HFS_STANDARD) {
+               return EINVAL;
+           }
 
-                           hfs_unlock(VTOC(vp));
-                           if (vnode_vtype(vp) == VDIR) {
-                               /*
-                                * XXX This code assumes that none of the
-                                * XXX callbacks from vnode_authorize() will
-                                * XXX take a persistent ref on the context
-                                * XXX credential, which is a bad assumption.
-                                */
-                               myErr = vnode_authorize(vp, NULL, (KAUTH_VNODE_SEARCH | KAUTH_VNODE_LIST_DIRECTORY), &my_context);
-                           } else {
-                               myErr = vnode_authorize(vp, NULL, KAUTH_VNODE_READ_DATA, &my_context);
-                           }
-                           vnode_put(vp);
-                           access[i] = myErr;
-                           if (myErr) {
-                               break;
-                           }
-                       }
-#endif                 
-               }
-               
-               /* copyout the access array */
-               if ((error = copyout((caddr_t)access, user_access_structp->access, 
-                                    num_files * sizeof (short)))) {
-                       goto err_exit_bulk_access;
-               }
-               
-       err_exit_bulk_access:
-               
-               //printf("on exit (err %d), numfiles/numcached/cachehits/lookups is %d/%d/%d/%d\n", error, num_files, cache.numcached, cache.cachehits, cache.lookups);
-               
-               release_pathbuff((char *) cache.acache);
-               release_pathbuff((char *) cache.haveaccess);
-               release_pathbuff((char *) file_ids);
-               release_pathbuff((char *) access);
-               /* clean up local context, if needed */
-               if (IS_VALID_CRED(my_context.vc_ucred))
-                       kauth_cred_unref(&my_context.vc_ucred);
-               
-               return (error);
-       } /* HFS_BULKACCESS */
+           if (is64bit) {
+               size = sizeof(struct user64_ext_access_t);
+           } else {
+               size = sizeof(struct user32_ext_access_t);
+           }
+           
+           return do_bulk_access_check(hfsmp, vp, ap, size, context);
+       } 
 
-       case HFS_SETACLSTATE: {
+       case HFS_SET_XATTREXTENTS_STATE: {
                int state;
 
                if (ap->a_data == NULL) {
                        return (EINVAL);
                }
 
-               vfsp = vfs_statfs(HFSTOVFS(hfsmp));
                state = *(int *)ap->a_data;
+               
+               if (hfsmp->hfs_flags & HFS_READ_ONLY) {
+                       return (EROFS);
+               }
 
-               // super-user can enable or disable acl's on a volume.
-               // the volume owner can only enable acl's
-               if (!is_suser() && (state == 0 || kauth_cred_getuid(cred) != vfsp->f_owner)) {
+               /* The super-user can enable or disable extent-based extended
+                * attribute support on a volume.
+                * Note: Starting with Mac OS X 10.7, extent-based extended attributes
+                * are enabled by default, so any change only remains in effect
+                * until the volume is remounted.
+                */
+               if (!is_suser()) {
                        return (EPERM);
                }
                if (state == 0 || state == 1)
-                       return hfs_setextendedsecurity(hfsmp, state);
+                       return hfs_set_volxattr(hfsmp, HFS_SET_XATTREXTENTS_STATE, state);
                else
                        return (EINVAL);        
        }
 
        case F_FULLFSYNC: {
                int error;
-
+               
+               if (hfsmp->hfs_flags & HFS_READ_ONLY) {
+                       return (EROFS);
+               }
                error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK);
                if (error == 0) {
-                       error = hfs_fsync(vp, MNT_NOWAIT, TRUE, p);
+                       error = hfs_fsync(vp, MNT_WAIT, TRUE, p);
                        hfs_unlock(VTOC(vp));
                }
 
@@ -1378,107 +1990,80 @@ hfs_vnop_ioctl( struct vnop_ioctl_args /* {
                fp = VTOF(vp);
 
                /* Protect against a size change. */
-               hfs_lock_truncate(VTOC(vp), TRUE);
-
+               hfs_lock_truncate(VTOC(vp), HFS_EXCLUSIVE_LOCK);
+
+#if HFS_COMPRESSION
+               if (compressed && (uncompressed_size == -1)) {
+                       /* fetching the uncompressed size failed above, so return the error */
+                       error = decmpfs_error;
+               } else if ((compressed && (ra->ra_offset >= uncompressed_size)) ||
+                                  (!compressed && (ra->ra_offset >= fp->ff_size))) {
+                       error = EFBIG;
+               }
+#else /* HFS_COMPRESSION */
                if (ra->ra_offset >= fp->ff_size) {
                        error = EFBIG;
-               } else {
+               }
+#endif /* HFS_COMPRESSION */
+               else {
                        error = advisory_read(vp, fp->ff_size, ra->ra_offset, ra->ra_count);
                }
 
-               hfs_unlock_truncate(VTOC(vp));
+               hfs_unlock_truncate(VTOC(vp), 0);
                return (error);
        }
 
        case F_READBOOTSTRAP:
        case F_WRITEBOOTSTRAP:
-       {
-           struct vnode *devvp = NULL;
-           user_fbootstraptransfer_t *user_bootstrapp;
-           int devBlockSize;
-           int error;
-           uio_t auio;
-           daddr64_t blockNumber;
-           u_long blockOffset;
-           u_long xfersize;
-           struct buf *bp;
-           user_fbootstraptransfer_t user_bootstrap;
+               return 0;
 
-               if (!vnode_isvroot(vp))
-                       return (EINVAL);
-               /* LP64 - when caller is a 64 bit process then we are passed a pointer 
-                * to a user_fbootstraptransfer_t else we get a pointer to a 
-                * fbootstraptransfer_t which we munge into a user_fbootstraptransfer_t
-                */
+       case _IOC(IOC_OUT,'h', 4, 0):     /* Create date in local time */
+       {
                if (is64bit) {
-                       user_bootstrapp = (user_fbootstraptransfer_t *)ap->a_data;
+                       *(user_time_t *)(ap->a_data) = (user_time_t) (to_bsd_time(VTOVCB(vp)->localCreateDate));
                }
                else {
-               fbootstraptransfer_t *bootstrapp = (fbootstraptransfer_t *)ap->a_data;
-                       user_bootstrapp = &user_bootstrap;
-                       user_bootstrap.fbt_offset = bootstrapp->fbt_offset;
-                       user_bootstrap.fbt_length = bootstrapp->fbt_length;
-                       user_bootstrap.fbt_buffer = CAST_USER_ADDR_T(bootstrapp->fbt_buffer);
+                       *(user32_time_t *)(ap->a_data) = (user32_time_t) (to_bsd_time(VTOVCB(vp)->localCreateDate));
                }
-               if (user_bootstrapp->fbt_offset + user_bootstrapp->fbt_length > 1024) 
-                       return EINVAL;
-           
-           devvp = VTOHFS(vp)->hfs_devvp;
-               auio = uio_create(1, user_bootstrapp->fbt_offset, 
-                                                 is64bit ? UIO_USERSPACE64 : UIO_USERSPACE32,
-                                                 (ap->a_command == F_WRITEBOOTSTRAP) ? UIO_WRITE : UIO_READ);
-               uio_addiov(auio, user_bootstrapp->fbt_buffer, user_bootstrapp->fbt_length);
+               return 0;
+       }
 
-           devBlockSize = vfs_devblocksize(vnode_mount(vp));
+       case SPOTLIGHT_FSCTL_GET_MOUNT_TIME:
+           *(uint32_t *)ap->a_data = hfsmp->hfs_mount_time;
+           break;
 
-           while (uio_resid(auio) > 0) {
-                       blockNumber = uio_offset(auio) / devBlockSize;
-                       error = (int)buf_bread(devvp, blockNumber, devBlockSize, cred, &bp);
-                       if (error) {
-                               if (bp) buf_brelse(bp);
-                               uio_free(auio);
-                               return error;
-                       };
+       case SPOTLIGHT_FSCTL_GET_LAST_MTIME:
+           *(uint32_t *)ap->a_data = hfsmp->hfs_last_mounted_mtime;
+           break;
 
-                       blockOffset = uio_offset(auio) % devBlockSize;
-                       xfersize = devBlockSize - blockOffset;
-                       error = uiomove((caddr_t)buf_dataptr(bp) + blockOffset, (int)xfersize, auio);
-                       if (error) {
-                               buf_brelse(bp);
-                               uio_free(auio);
-                               return error;
-                       };
-                       if (uio_rw(auio) == UIO_WRITE) {
-                               error = VNOP_BWRITE(bp);
-                               if (error) {
-                                       uio_free(auio);
-                       return error;
-                               }
-                       } else {
-                               buf_brelse(bp);
-                       };
-               };
-               uio_free(auio);
-       };
-       return 0;
+       case HFS_FSCTL_SET_VERY_LOW_DISK:
+           if (*(uint32_t *)ap->a_data >= hfsmp->hfs_freespace_notify_warninglimit) {
+               return EINVAL;
+           }
+
+           hfsmp->hfs_freespace_notify_dangerlimit = *(uint32_t *)ap->a_data;
+           break;
+
+       case HFS_FSCTL_SET_LOW_DISK:
+           if (   *(uint32_t *)ap->a_data >= hfsmp->hfs_freespace_notify_desiredlevel
+               || *(uint32_t *)ap->a_data <= hfsmp->hfs_freespace_notify_dangerlimit) {
+
+               return EINVAL;
+           }
 
-       case _IOC(IOC_OUT,'h', 4, 0):     /* Create date in local time */
-       {
-               if (is64bit) {
-                       *(user_time_t *)(ap->a_data) = (user_time_t) (to_bsd_time(VTOVCB(vp)->localCreateDate));
-               }
-               else {
-                       *(time_t *)(ap->a_data) = to_bsd_time(VTOVCB(vp)->localCreateDate);
-               }
-               return 0;
-       }
+           hfsmp->hfs_freespace_notify_warninglimit = *(uint32_t *)ap->a_data;
+           break;
+
+       case HFS_FSCTL_SET_DESIRED_DISK:
+           if (*(uint32_t *)ap->a_data <= hfsmp->hfs_freespace_notify_warninglimit) {
+               return EINVAL;
+           }
 
-       case HFS_GET_MOUNT_TIME:
-           return copyout(&hfsmp->hfs_mount_time, CAST_USER_ADDR_T(ap->a_data), sizeof(hfsmp->hfs_mount_time));
+           hfsmp->hfs_freespace_notify_desiredlevel = *(uint32_t *)ap->a_data;
            break;
 
-       case HFS_GET_LAST_MTIME:
-           return copyout(&hfsmp->hfs_last_mounted_mtime, CAST_USER_ADDR_T(ap->a_data), sizeof(hfsmp->hfs_last_mounted_mtime));
+       case HFS_VOLUME_STATUS:
+           *(uint32_t *)ap->a_data = hfsmp->hfs_notification_conditions;
            break;
 
        case HFS_SET_BOOT_INFO:
@@ -1486,6 +2071,9 @@ hfs_vnop_ioctl( struct vnop_ioctl_args /* {
                        return(EINVAL);
                if (!kauth_cred_issuser(cred) && (kauth_cred_getuid(cred) != vfs_statfs(HFSTOVFS(hfsmp))->f_owner))
                        return(EACCES); /* must be superuser or owner of filesystem */
+               if (hfsmp->hfs_flags & HFS_READ_ONLY) {
+                       return (EROFS);
+               }
                HFS_MOUNT_LOCK(hfsmp, TRUE);
                bcopy(ap->a_data, &hfsmp->vcbFndrInfo, sizeof(hfsmp->vcbFndrInfo));
                HFS_MOUNT_UNLOCK(hfsmp, TRUE);
@@ -1500,11 +2088,75 @@ hfs_vnop_ioctl( struct vnop_ioctl_args /* {
                HFS_MOUNT_UNLOCK(hfsmp, TRUE);
                break;
 
+       case HFS_MARK_BOOT_CORRUPT:
+               /* Mark the boot volume corrupt by setting 
+                * kHFSVolumeInconsistentBit in the volume header.  This will 
+                * force fsck_hfs on next mount.
+                */
+               if (!is_suser()) {
+                       return EACCES;
+               }
+                       
+               /* Allowed only on the root vnode of the boot volume */
+               if (!(vfs_flags(HFSTOVFS(hfsmp)) & MNT_ROOTFS) || 
+                   !vnode_isvroot(vp)) {
+                       return EINVAL;
+               }
+               if (hfsmp->hfs_flags & HFS_READ_ONLY) {
+                       return (EROFS);
+               }
+               printf ("hfs_vnop_ioctl: Marking the boot volume corrupt.\n");
+               hfs_mark_volume_inconsistent(hfsmp);
+               break;
+
+       case HFS_FSCTL_GET_JOURNAL_INFO:
+               jip = (struct hfs_journal_info*)ap->a_data;
+
+               if (vp == NULLVP)
+                       return EINVAL;
+
+           if (hfsmp->jnl == NULL) {
+                       jnl_start = 0;
+                       jnl_size  = 0;
+           } else {
+                       jnl_start = (off_t)(hfsmp->jnl_start * HFSTOVCB(hfsmp)->blockSize) + (off_t)HFSTOVCB(hfsmp)->hfsPlusIOPosOffset;
+                       jnl_size  = (off_t)hfsmp->jnl_size;
+           }
+
+               jip->jstart = jnl_start;
+               jip->jsize = jnl_size;
+               break;
+
+       case HFS_SET_ALWAYS_ZEROFILL: {
+           struct cnode *cp = VTOC(vp);
+
+           if (*(int *)ap->a_data) {
+               cp->c_flag |= C_ALWAYS_ZEROFILL;
+           } else {
+               cp->c_flag &= ~C_ALWAYS_ZEROFILL;
+           }
+           break;
+       }    
+
+       case HFS_DISABLE_METAZONE: {
+               /* Only root can disable metadata zone */
+               if (!is_suser()) {
+                       return EACCES;
+               }
+               if (hfsmp->hfs_flags & HFS_READ_ONLY) {
+                       return (EROFS);
+               }
+
+               /* Disable metadata zone now */
+               (void) hfs_metadatazone_init(hfsmp, true);
+               printf ("hfs: Disabling metadata zone on %s\n", hfsmp->vcbVN);
+               break;
+       }
+       
        default:
                return (ENOTTY);
        }
 
-    /* Should never get here */
        return 0;
 }
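
As an aside on the three HFS_FSCTL_SET_*_DISK cases above: taken together, their EINVAL checks enforce a strict ordering of the free-space notification levels. A minimal standalone restatement of that invariant (the function name is illustrative, not from any header):

#include <stdbool.h>
#include <stdint.h>

/* Mirrors the EINVAL checks above: dangerlimit < warninglimit < desiredlevel. */
static bool
hfs_freespace_limits_ordered(uint32_t dangerlimit, uint32_t warninglimit,
                             uint32_t desiredlevel)
{
	return (dangerlimit < warninglimit) && (warninglimit < desiredlevel);
}
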
 
@@ -1536,13 +2188,12 @@ hfs_vnop_select(__unused struct vnop_select_args *ap)
  * The block run is returned in logical blocks, and is the REMAINING amount of blocks
  */
 int
-hfs_bmap(struct vnode *vp, daddr_t bn, struct vnode **vpp, daddr64_t *bnp, int *runp)
+hfs_bmap(struct vnode *vp, daddr_t bn, struct vnode **vpp, daddr64_t *bnp, unsigned int *runp)
 {
-       struct cnode *cp = VTOC(vp);
        struct filefork *fp = VTOF(vp);
        struct hfsmount *hfsmp = VTOHFS(vp);
        int  retval = E_NONE;
-       daddr_t  logBlockSize;
+       u_int32_t  logBlockSize;
        size_t  bytesContAvail = 0;
        off_t  blockposition;
        int lockExtBtree;
@@ -1553,17 +2204,17 @@ hfs_bmap(struct vnode *vp, daddr_t bn, struct vnode **vpp, daddr64_t *bnp, int *
         * to physical mapping is requested.
         */
        if (vpp != NULL)
-               *vpp = cp->c_devvp;
+               *vpp = hfsmp->hfs_devvp;
        if (bnp == NULL)
                return (0);
 
        logBlockSize = GetLogicalBlockSize(vp);
-       blockposition = (off_t)bn * (off_t)logBlockSize;
+       blockposition = (off_t)bn * logBlockSize;
 
        lockExtBtree = overflow_extents(fp);
 
        if (lockExtBtree)
-               lockflags = hfs_systemfile_lock(hfsmp, SFL_EXTENTS, HFS_SHARED_LOCK);
+               lockflags = hfs_systemfile_lock(hfsmp, SFL_EXTENTS, HFS_EXCLUSIVE_LOCK);
 
        retval = MacToVFSError(
                             MapFileBlockC (HFSTOVCB(hfsmp),
@@ -1633,6 +2284,15 @@ hfs_vnop_offtoblk(struct vnop_offtoblk_args *ap)
 /*
  * Map file offset to physical block number.
  *
+ * If this function is called for a write operation and the file
+ * has virtual blocks allocated (delayed allocation), real blocks
+ * are allocated by calling ExtendFileC().
+ *
+ * If this function is called for a read operation and the file
+ * has virtual blocks allocated (delayed allocation), the file size
+ * is not changed and, if required, the invalid-range list is
+ * searched for the mapping.
+ *
  * System file cnodes are expected to be locked (shared or exclusive).
  */
 int
@@ -1663,6 +2323,26 @@ hfs_vnop_blockmap(struct vnop_blockmap_args *ap)
        int started_tr = 0;
        int tooklock = 0;
 
+#if HFS_COMPRESSION
+       if (VNODE_IS_RSRC(vp)) {
+               /* allow blockmaps to the resource fork */
+       } else {
+               if ( hfs_file_is_compressed(VTOC(vp), 1) ) { /* 1 == don't take the cnode lock */
+                       int state = decmpfs_cnode_get_vnode_state(VTOCMP(vp));
+                       switch(state) {
+                               case FILE_IS_COMPRESSED:
+                                       return ENOTSUP;
+                               case FILE_IS_CONVERTING:
+                                       /* if FILE_IS_CONVERTING, we allow blockmap */
+                                       break;
+                               default:
+                                       printf("invalid state %d for compressed file\n", state);
+                                       /* fall through */
+                       }
+               }
+       }
+#endif /* HFS_COMPRESSION */
+
        /* Do not allow blockmap operation on a directory */
        if (vnode_isdir(vp)) {
                return (ENOTSUP);
@@ -1675,14 +2355,10 @@ hfs_vnop_blockmap(struct vnop_blockmap_args *ap)
        if (ap->a_bpn == NULL)
                return (0);
 
-       if ( !vnode_issystem(vp) && !vnode_islnk(vp)) {
+       if ( !vnode_issystem(vp) && !vnode_islnk(vp) && !vnode_isswap(vp)) {
                if (VTOC(vp)->c_lockowner != current_thread()) {
                        hfs_lock(VTOC(vp), HFS_FORCE_LOCK);
                        tooklock = 1;
-               } else {
-                       cp = VTOC(vp);
-                       panic("blockmap: %s cnode lock already held!\n",
-                               cp->c_desc.cd_nameptr ? cp->c_desc.cd_nameptr : "");
                }
        }
        hfsmp = VTOHFS(vp);
@@ -1690,7 +2366,8 @@ hfs_vnop_blockmap(struct vnop_blockmap_args *ap)
        fp = VTOF(vp);
 
 retry:
-       if (fp->ff_unallocblocks) {
+       /* Check virtual blocks only when performing write operation */
+       if ((ap->a_flags & VNODE_WRITE) && (fp->ff_unallocblocks != 0)) {
                if (hfs_start_transaction(hfsmp) != 0) {
                        retval = EINVAL;
                        goto exit;
@@ -1709,8 +2386,8 @@ retry:
        /*
         * Check for any delayed allocations.
         */
-       if (fp->ff_unallocblocks) {
-               SInt64 actbytes;
+       if ((ap->a_flags & VNODE_WRITE) && (fp->ff_unallocblocks != 0)) {
+               int64_t actbytes;
                u_int32_t loanedBlocks;
 
                // 
@@ -1746,9 +2423,7 @@ retry:
                        HFS_MOUNT_LOCK(hfsmp, TRUE);
                        hfsmp->loanedBlocks += loanedBlocks;
                        HFS_MOUNT_UNLOCK(hfsmp, TRUE);
-               }
 
-               if (retval) {
                        hfs_systemfile_unlock(hfsmp, lockflags);
                        cp->c_flag |= C_MODIFIED;
                        if (started_tr) {
@@ -1756,6 +2431,7 @@ retry:
                                (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
 
                                hfs_end_transaction(hfsmp);
+                               started_tr = 0;
                        }
                        goto exit;
                }
@@ -1775,10 +2451,63 @@ retry:
                started_tr = 0;
        }       
        if (retval) {
+               /* On write, always return the error because virtual blocks, if any,
+                * should have been allocated in ExtendFileC().  We do not
+                * allocate virtual blocks on read, so return the error only if no
+                * virtual blocks are allocated.  Otherwise, search the invalid-range
+                * list for zero-fill ranges.
+                */
+               if ((MacToVFSError(retval) != ERANGE) ||
+                   (ap->a_flags & VNODE_WRITE) ||
+                   ((ap->a_flags & VNODE_READ) && (fp->ff_unallocblocks == 0))) {
+                       goto exit;
+               } 
+               
+               /* Validate if the start offset is within logical file size */
+               if (ap->a_foffset > fp->ff_size) {
+                       goto exit;
+               }
+
+               /* Searching the file extents failed for a read operation, so
+                * search the invalid-range list for any uncommitted holes in the file.
+                */
+               overlaptype = rl_scan(&fp->ff_invalidranges, ap->a_foffset,
+                                     ap->a_foffset + (off_t)(ap->a_size - 1),
+                                     &invalid_range);
+               switch(overlaptype) {
+               case RL_OVERLAPISCONTAINED:
+                       /* start_offset <= rl_start, end_offset >= rl_end */
+                       if (ap->a_foffset != invalid_range->rl_start) {
+                               break;
+                       }
+               case RL_MATCHINGOVERLAP:
+                       /* start_offset = rl_start, end_offset = rl_end */
+               case RL_OVERLAPCONTAINSRANGE:
+                       /* start_offset >= rl_start, end_offset <= rl_end */
+               case RL_OVERLAPSTARTSBEFORE:
+                       /* start_offset > rl_start, end_offset >= rl_start */
+                       if ((off_t)fp->ff_size > (invalid_range->rl_end + 1)) {
+                               bytesContAvail = (invalid_range->rl_end + 1) - ap->a_foffset;
+                       } else {
+                               bytesContAvail = fp->ff_size - ap->a_foffset;
+                       }
+                       if (bytesContAvail > ap->a_size) {
+                               bytesContAvail = ap->a_size;
+                       }
+                       *ap->a_bpn = (daddr64_t)-1;
+                       retval = 0;
+                       break;
+               case RL_OVERLAPENDSAFTER:
+                       /* start_offset < rl_start, end_offset < rl_end */
+               case RL_NOOVERLAP:
+                       break;
+               }
                goto exit;
        }
 
-       /* Adjust the mapping information for invalid file ranges: */
+       /* MapFileC() found a valid extent in the filefork.  Search the 
+        * mapping information further for invalid file ranges 
+        */
        overlaptype = rl_scan(&fp->ff_invalidranges, ap->a_foffset,
                              ap->a_foffset + (off_t)bytesContAvail - 1,
                              &invalid_range);
@@ -1787,7 +2516,7 @@ retry:
                case RL_MATCHINGOVERLAP:
                case RL_OVERLAPCONTAINSRANGE:
                case RL_OVERLAPSTARTSBEFORE:
-                       /* There's no valid block for this byte offset: */
+                       /* There's no valid block for this byte offset */
                        *ap->a_bpn = (daddr64_t)-1;
                        /* There's no point limiting the amount to be returned
                         * if the invalid range that was hit extends all the way 
@@ -1795,7 +2524,7 @@ retry:
                         * end of this range and the file's EOF):
                         */
                        if (((off_t)fp->ff_size > (invalid_range->rl_end + 1)) &&
-                           (invalid_range->rl_end + 1 - ap->a_foffset < bytesContAvail)) {
+                           ((size_t)(invalid_range->rl_end + 1 - ap->a_foffset) < bytesContAvail)) {
                                bytesContAvail = invalid_range->rl_end + 1 - ap->a_foffset;
                        }
                        break;
@@ -1807,7 +2536,7 @@ retry:
                                /* There's actually no valid information to be had starting here: */
                                *ap->a_bpn = (daddr64_t)-1;
                                if (((off_t)fp->ff_size > (invalid_range->rl_end + 1)) &&
-                                   (invalid_range->rl_end + 1 - ap->a_foffset < bytesContAvail)) {
+                                   ((size_t)(invalid_range->rl_end + 1 - ap->a_foffset) < bytesContAvail)) {
                                        bytesContAvail = invalid_range->rl_end + 1 - ap->a_foffset;
                                }
                        } else {
@@ -1820,13 +2549,17 @@ retry:
                } /* end switch */
                if (bytesContAvail > ap->a_size)
                        bytesContAvail = ap->a_size;
+       } 
+               
+exit:
+       if (retval == 0) {
+               if (ap->a_run)
+                       *ap->a_run = bytesContAvail;
+
+               if (ap->a_poff)
+                       *(int *)ap->a_poff = 0;
        }
-       if (ap->a_run)
-               *ap->a_run = bytesContAvail;
 
-       if (ap->a_poff)
-               *(int *)ap->a_poff = 0;
-exit:
        if (tooklock)
                hfs_unlock(cp);
 
@@ -1845,14 +2578,47 @@ hfs_vnop_strategy(struct vnop_strategy_args *ap)
 {
        buf_t   bp = ap->a_bp;
        vnode_t vp = buf_vnode(bp);
-       struct cnode *cp = VTOC(vp);
+       int error = 0;
+       
+#if CONFIG_PROTECT
+       cnode_t *cp = NULL; 
+       
+       if ((cp = cp_get_protected_cnode(vp)) != NULL) {
+               /*
+                * Some paths to hfs_vnop_strategy will take the cnode lock, 
+                * and some won't. But since content protection is only enabled
+                * for files that (a) aren't system files and (b) are regular 
+                * files, any valid cnode here will be unlocked.
+                */
+               hfs_lock(cp, HFS_SHARED_LOCK);
+               buf_setcpaddr(bp, cp->c_cpentry);
+       }
+#endif /* CONFIG_PROTECT */
+       
+       error = buf_strategy(VTOHFS(vp)->hfs_devvp, ap);
 
-       return (buf_strategy(cp->c_devvp, ap));
+#if CONFIG_PROTECT
+       if (cp) {
+               hfs_unlock(cp);
+       }
+#endif
+       
+       return error;
 }
 
+static int 
+hfs_minorupdate(struct vnode *vp) {
+       struct cnode *cp = VTOC(vp);
+       cp->c_flag &= ~C_MODIFIED;
+       cp->c_touch_acctime = 0;
+       cp->c_touch_chgtime = 0;
+       cp->c_touch_modtime = 0;
+       
+       return 0;
+}
 
-static int
-do_hfs_truncate(struct vnode *vp, off_t length, int flags, int skipsetsize, vfs_context_t context)
+int
+do_hfs_truncate(struct vnode *vp, off_t length, int flags, int skipupdate, vfs_context_t context)
 {
        register struct cnode *cp = VTOC(vp);
        struct filefork *fp = VTOF(vp);
@@ -1862,8 +2628,7 @@ do_hfs_truncate(struct vnode *vp, off_t length, int flags, int skipsetsize, vfs_
        off_t bytesToAdd;
        off_t actualBytesAdded;
        off_t filebytes;
-       u_int64_t old_filesize;
-       u_long fileblocks;
+       u_int32_t fileblocks;
        int blksize;
        struct hfsmount *hfsmp;
        int lockflags;
@@ -1871,7 +2636,6 @@ do_hfs_truncate(struct vnode *vp, off_t length, int flags, int skipsetsize, vfs_
        blksize = VTOVCB(vp)->blockSize;
        fileblocks = fp->ff_blocks;
        filebytes = (off_t)fileblocks * (off_t)blksize;
-       old_filesize = fp->ff_size;
 
        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 7)) | DBG_FUNC_START,
                 (int)length, (int)fp->ff_size, (int)filebytes, 0, 0);
@@ -1922,7 +2686,7 @@ do_hfs_truncate(struct vnode *vp, off_t length, int flags, int skipsetsize, vfs_
                 */
                if (length > filebytes) {
                        int eflags;
-                       u_long blockHint = 0;
+                       u_int32_t blockHint = 0;
 
                        /* All or nothing and don't round up to clumpsize. */
                        eflags = kEFAllMask | kEFNoClumpMask;
@@ -1970,8 +2734,13 @@ do_hfs_truncate(struct vnode *vp, off_t length, int flags, int skipsetsize, vfs_
                        hfs_systemfile_unlock(hfsmp, lockflags);
 
                        if (hfsmp->jnl) {
-                           (void) hfs_update(vp, TRUE);
-                           (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
+                               if (skipupdate) {
+                                       (void) hfs_minorupdate(vp);
+                               }
+                               else {
+                                       (void) hfs_update(vp, TRUE);
+                                       (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
+                               }
                        }
 
                        hfs_end_transaction(hfsmp);
@@ -1984,7 +2753,7 @@ do_hfs_truncate(struct vnode *vp, off_t length, int flags, int skipsetsize, vfs_
                }
  
                if (!(flags & IO_NOZEROFILL)) {
-                       if (UBCINFOEXISTS(vp) && retval == E_NONE) {
+                       if (UBCINFOEXISTS(vp)  && (vnode_issystem(vp) == 0) && retval == E_NONE) {
                                struct rl_entry *invalid_range;
                                off_t zero_limit;
                        
@@ -2034,36 +2803,9 @@ do_hfs_truncate(struct vnode *vp, off_t length, int flags, int skipsetsize, vfs_
                cp->c_touch_modtime = TRUE;
                fp->ff_size = length;
 
-               /* Nested transactions will do their own ubc_setsize. */
-               if (!skipsetsize) {
-                       /*
-                        * ubc_setsize can cause a pagein here 
-                        * so we need to drop cnode lock. 
-                        */
-                       hfs_unlock(cp);
-                       ubc_setsize(vp, length);
-                       hfs_lock(cp, HFS_FORCE_LOCK);
-               }
-
        } else { /* Shorten the size of the file */
 
                if ((off_t)fp->ff_size > length) {
-                       /*
-                        * Any buffers that are past the truncation point need to be
-                        * invalidated (to maintain buffer cache consistency).
-                        */
-
-                        /* Nested transactions will do their own ubc_setsize. */
-                        if (!skipsetsize) {
-                               /*
-                                * ubc_setsize can cause a pageout here 
-                                * so we need to drop cnode lock. 
-                                */
-                               hfs_unlock(cp);
-                               ubc_setsize(vp, length);
-                               hfs_lock(cp, HFS_FORCE_LOCK);
-                       }
-           
                        /* Any space previously marked as invalid is now irrelevant: */
                        rl_remove(length, fp->ff_size - 1, &fp->ff_invalidranges);
                }
@@ -2120,8 +2862,8 @@ do_hfs_truncate(struct vnode *vp, off_t length, int flags, int skipsetsize, vfs_
                                        lockflags |= SFL_EXTENTS;
                                lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
 
-                               retval = MacToVFSError(TruncateFileC(VTOVCB(vp),
-                                               (FCB*)fp, length, false));
+                               retval = MacToVFSError(TruncateFileC(VTOVCB(vp), (FCB*)fp, length, 0, 
+                                                                                                        FORK_IS_RSRC (fp), FTOC(fp)->c_fileid, false));
 
                                hfs_systemfile_unlock(hfsmp, lockflags);
                        }
@@ -2129,10 +2871,14 @@ do_hfs_truncate(struct vnode *vp, off_t length, int flags, int skipsetsize, vfs_
                                if (retval == 0) {
                                        fp->ff_size = length;
                                }
-                               (void) hfs_update(vp, TRUE);
-                               (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
+                               if (skipupdate) {
+                                       (void) hfs_minorupdate(vp);
+                               }
+                               else {
+                                       (void) hfs_update(vp, TRUE);
+                                       (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
+                               }
                        }
-
                        hfs_end_transaction(hfsmp);
 
                        filebytes = (off_t)fp->ff_blocks * (off_t)blksize;
@@ -2144,12 +2890,24 @@ do_hfs_truncate(struct vnode *vp, off_t length, int flags, int skipsetsize, vfs_
 #endif /* QUOTA */
                }
                /* Only set update flag if the logical length changes */
-               if (old_filesize != length)
+               if ((off_t)fp->ff_size != length)
                        cp->c_touch_modtime = TRUE;
                fp->ff_size = length;
        }
-       cp->c_touch_chgtime = TRUE;
-       retval = hfs_update(vp, MNT_WAIT);
+       if (cp->c_mode & (S_ISUID | S_ISGID)) {
+               if (!vfs_context_issuser(context)) {
+                       cp->c_mode &= ~(S_ISUID | S_ISGID);
+                       skipupdate = 0;
+               }
+       }
+       if (skipupdate) {
+               retval = hfs_minorupdate(vp);
+       }
+       else {
+               cp->c_touch_chgtime = TRUE;     /* status changed */
+               cp->c_touch_modtime = TRUE;     /* file data was modified */
+               retval = hfs_update(vp, MNT_WAIT);
+       }
        if (retval) {
                KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 7)) | DBG_FUNC_NONE,
                     -1, -1, -1, retval, 0);
@@ -2163,30 +2921,243 @@ Err_Exit:
        return (retval);
 }
 
+/*
+ * Preparation which must be done prior to deleting the catalog record
+ * of a file or directory.  In order to make the on-disk state as safe as possible,
+ * we remove the catalog entry before releasing the bitmap blocks and the 
+ * overflow extent records.  However, some work must be done prior to deleting
+ * the catalog record.
+ * 
+ * When calling this function, the cnode must exist both in memory and on-disk.
+ * If there are both resource fork and data fork vnodes, this function should
+ * be called on both.  
+ */
+
+int
+hfs_prepare_release_storage (struct hfsmount *hfsmp, struct vnode *vp) {
+       
+       struct filefork *fp = VTOF(vp);
+       struct cnode *cp = VTOC(vp);
+       int retval = 0;
+       
+       /* Cannot truncate an HFS directory! */
+       if (vnode_isdir(vp)) {
+               return (EISDIR);
+       }
+       
+       /* 
+        * See the comment below in hfs_truncate for why we need to call 
+        * setsize here.  Essentially we want to avoid pending IO if we 
+        * already know that the blocks are going to be released here.
+        * This function is only called when totally removing all storage for a file, so
+        * we can take a shortcut and immediately call ubc_setsize(0).
+        */
+       ubc_setsize(vp, 0);
+       
+       /* This should only happen with a corrupt filesystem */
+       if ((off_t)fp->ff_size < 0)
+               return (EINVAL);
+       
+       /* 
+        * We cannot just check whether the fork is already empty (as an optimization)
+        * since there may be extra physical blocks that also need to be released.
+        */
+#if QUOTA
+       if ((retval = hfs_getinoquota(cp))) {
+               return(retval);
+       }
+#endif /* QUOTA */
+       
+       /* Wipe out any invalid ranges which have yet to be backed by disk */
+       rl_remove(0, fp->ff_size - 1, &fp->ff_invalidranges);
+       
+       /* 
+        * Account for any unmapped blocks. Since we're deleting the 
+        * entire file, we don't have to worry about just shrinking
+        * to a smaller number of borrowed blocks.
+        */
+       if (fp->ff_unallocblocks > 0) {
+               u_int32_t loanedBlocks;
+               
+               HFS_MOUNT_LOCK(hfsmp, TRUE);
+               
+               loanedBlocks = fp->ff_unallocblocks;
+               cp->c_blocks -= loanedBlocks;
+               fp->ff_blocks -= loanedBlocks;
+               fp->ff_unallocblocks = 0;
+               
+               hfsmp->loanedBlocks -= loanedBlocks;
+               
+               HFS_MOUNT_UNLOCK(hfsmp, TRUE);
+       }
+       
+       return 0;
+}
+
+
+/*
+ * Special wrapper around calling TruncateFileC.  This function is usable
+ * even when the catalog record does not exist any longer, making it ideal
+ * for use when deleting a file.  The simplification here is that we know 
+ * that we are releasing all blocks.
+ *
+ * The caller is responsible for saving off a copy of the filefork(s)
+ * embedded within the cnode prior to calling this function.  The pointers
+ * supplied as arguments must be valid even if the cnode is no longer valid.
+ */
+
+int 
+hfs_release_storage (struct hfsmount *hfsmp, struct filefork *datafork, 
+                                        struct filefork *rsrcfork, u_int32_t fileid) {
+       
+       off_t filebytes;
+       u_int32_t fileblocks;
+       int blksize = 0;
+       int error = 0;
+       int lockflags;
+       
+       blksize = hfsmp->blockSize;
+       
+       /* Data Fork */
+       if (datafork->ff_blocks > 0) {
+               fileblocks = datafork->ff_blocks;
+               filebytes = (off_t)fileblocks * (off_t)blksize;         
+               
+               /* We killed invalid ranges and loaned blocks before we removed the catalog entry */
+               
+               while (filebytes > 0) {
+                       if (filebytes > HFS_BIGFILE_SIZE && overflow_extents(datafork)) {
+                               filebytes -= HFS_BIGFILE_SIZE;
+                       } else {
+                               filebytes = 0;
+                       }
+                       
+                       /* Start a transaction, and wipe out as many blocks as we can in this iteration */
+                       if (hfs_start_transaction(hfsmp) != 0) {
+                               error = EINVAL;
+                               break;
+                       }
+                       
+                       if (datafork->ff_unallocblocks == 0) {
+                               /* Protect extents b-tree and allocation bitmap */
+                               lockflags = SFL_BITMAP;
+                               if (overflow_extents(datafork))
+                                       lockflags |= SFL_EXTENTS;
+                               lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
+                               
+                               error = MacToVFSError(TruncateFileC(HFSTOVCB(hfsmp), datafork, filebytes, 1, 0, fileid, false));
+                               
+                               hfs_systemfile_unlock(hfsmp, lockflags);
+                       }
+                       if (error == 0) {
+                               datafork->ff_size = filebytes;
+                       }
+                       (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
+                       
+                       /* Finish the transaction and start over if necessary */
+                       hfs_end_transaction(hfsmp);
+                       
+                       if (error) {
+                               break;
+                       }
+               }
+       }
+       
+       /* Resource fork */
+       if (error == 0 && (rsrcfork != NULL) && rsrcfork->ff_blocks > 0) {
+               fileblocks = rsrcfork->ff_blocks;
+               filebytes = (off_t)fileblocks * (off_t)blksize;
+               
+               /* We killed invalid ranges and loaned blocks before we removed the catalog entry */
+               
+               while (filebytes > 0) {
+                       if (filebytes > HFS_BIGFILE_SIZE && overflow_extents(rsrcfork)) {
+                               filebytes -= HFS_BIGFILE_SIZE;
+                       } else {
+                               filebytes = 0;
+                       }
+                       
+                       /* Start a transaction, and wipe out as many blocks as we can in this iteration */
+                       if (hfs_start_transaction(hfsmp) != 0) {
+                               error = EINVAL;
+                               break;
+                       }
+                       
+                       if (rsrcfork->ff_unallocblocks == 0) {
+                               /* Protect extents b-tree and allocation bitmap */
+                               lockflags = SFL_BITMAP;
+                               if (overflow_extents(rsrcfork))
+                                       lockflags |= SFL_EXTENTS;
+                               lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
+                               
+                               error = MacToVFSError(TruncateFileC(HFSTOVCB(hfsmp), rsrcfork, filebytes, 1, 1, fileid, false));
+                               
+                               hfs_systemfile_unlock(hfsmp, lockflags);
+                       }
+                       if (error == 0) {
+                               rsrcfork->ff_size = filebytes;
+                       }
+                       (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
+                       
+                       /* Finish the transaction and start over if necessary */
+                       hfs_end_transaction(hfsmp);                     
+                       
+                       if (error) {
+                               break;
+                       }
+               }
+       }
+       
+       return error;
+}
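
/*
 * An illustrative sketch, not part of the patch itself, of the calling
 * sequence described in the comments above.  The locals data_vp, rsrc_vp,
 * fileid and the fork copies are assumptions for illustration only; the
 * real delete path also handles locking and the catalog-record removal.
 *
 *     struct filefork datafork_copy = *VTOF(data_vp);   // copies must outlive the cnode
 *     struct filefork rsrcfork_copy;
 *     int error;
 *
 *     error = hfs_prepare_release_storage(hfsmp, data_vp);
 *     if (error == 0 && rsrc_vp != NULL) {
 *             rsrcfork_copy = *VTOF(rsrc_vp);
 *             error = hfs_prepare_release_storage(hfsmp, rsrc_vp);
 *     }
 *     // ... remove the catalog record here, while the blocks are still allocated ...
 *     if (error == 0)
 *             error = hfs_release_storage(hfsmp, &datafork_copy,
 *                             (rsrc_vp != NULL) ? &rsrcfork_copy : NULL, fileid);
 */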
 
 
 /*
  * Truncate a cnode to at most length size, freeing (or adding) the
  * disk blocks.
  */
-__private_extern__
 int
 hfs_truncate(struct vnode *vp, off_t length, int flags, int skipsetsize,
-             vfs_context_t context)
+             int skipupdate, vfs_context_t context)
 {
        struct filefork *fp = VTOF(vp);
        off_t filebytes;
-       u_long fileblocks;
+       u_int32_t fileblocks;
        int blksize, error = 0;
        struct cnode *cp = VTOC(vp);
 
-       if (vnode_isdir(vp))
-               return (EISDIR);        /* cannot truncate an HFS directory! */
+       /* Cannot truncate an HFS directory! */
+       if (vnode_isdir(vp)) {
+               return (EISDIR);
+       }
+       /* A swap file cannot change size. */
+       if (vnode_isswap(vp) && (length != 0)) {
+               return (EPERM);
+       }
 
        blksize = VTOVCB(vp)->blockSize;
        fileblocks = fp->ff_blocks;
        filebytes = (off_t)fileblocks * (off_t)blksize;
 
+       //
+       // Have to do this here so that we don't wind up with
+       // i/o pending for blocks that are about to be released
+       // if we truncate the file.
+       //
+       // If skipsetsize is set, then the caller is responsible
+       // for the ubc_setsize.
+       //
+       // Even if skipsetsize is set, if the length is zero we
+       // want to call ubc_setsize() because as of SnowLeopard
+       // it will no longer cause any page-ins and it will drop
+       // any dirty pages so that we don't do any i/o that we
+       // don't have to.  This also prevents a race where i/o
+       // for truncated blocks may overwrite later data if the
+       // blocks get reallocated to a different file.
+       //
+       if (!skipsetsize || length == 0)
+               ubc_setsize(vp, length);
+
        // have to loop truncating or growing files that are
        // really big because otherwise transactions can get
        // enormous and consume too many kernel resources.
@@ -2199,7 +3170,7 @@ hfs_truncate(struct vnode *vp, off_t length, int flags, int skipsetsize,
                                filebytes = length;
                        }
                        cp->c_flag |= C_FORCEUPDATE;
-                       error = do_hfs_truncate(vp, filebytes, flags, skipsetsize, context);
+                       error = do_hfs_truncate(vp, filebytes, flags, skipupdate, context);
                        if (error)
                                break;
                }
@@ -2211,13 +3182,13 @@ hfs_truncate(struct vnode *vp, off_t length, int flags, int skipsetsize,
                                filebytes = length;
                        }
                        cp->c_flag |= C_FORCEUPDATE;
-                       error = do_hfs_truncate(vp, filebytes, flags, skipsetsize, context);
+                       error = do_hfs_truncate(vp, filebytes, flags, skipupdate, context);
                        if (error)
                                break;
                }
        } else /* Same logical size */ {
 
-               error = do_hfs_truncate(vp, length, flags, skipsetsize, context);
+               error = do_hfs_truncate(vp, length, flags, skipupdate, context);
        }
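
/*
 * An illustrative sketch, not part of the patch, of the chunking the
 * comment above describes, shown for the shrinking case; the loop bounds
 * are assumed from the fragments visible in this hunk, and the growing
 * case mirrors it with filebytes stepping up toward length.
 *
 *     while (filebytes > length) {
 *             if ((filebytes - length) > HFS_BIGFILE_SIZE)
 *                     filebytes -= HFS_BIGFILE_SIZE;   // at most one chunk per transaction
 *             else
 *                     filebytes = length;              // final step
 *             cp->c_flag |= C_FORCEUPDATE;
 *             error = do_hfs_truncate(vp, filebytes, flags, skipupdate, context);
 *             if (error)
 *                     break;
 *     }
 */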
        /* Files that are changing size are not hot file candidates. */
        if (VTOHFS(vp)->hfc_stage == HFC_RECORDING) {
@@ -2251,13 +3222,14 @@ hfs_vnop_allocate(struct vnop_allocate_args /* {
        off_t moreBytesRequested;
        off_t actualBytesAdded;
        off_t filebytes;
-       u_long fileblocks;
+       u_int32_t fileblocks;
        int retval, retval2;
-       UInt32 blockHint;
-       UInt32 extendFlags;   /* For call to ExtendFileC */
+       u_int32_t blockHint;
+       u_int32_t extendFlags;   /* For call to ExtendFileC */
        struct hfsmount *hfsmp;
        kauth_cred_t cred = vfs_context_ucred(ap->a_context);
        int lockflags;
+       time_t orig_ctime;
 
        *(ap->a_bytesallocated) = 0;
 
@@ -2265,10 +3237,19 @@ hfs_vnop_allocate(struct vnop_allocate_args /* {
                return (EISDIR);
        if (length < (off_t)0)
                return (EINVAL);
+       
+       cp = VTOC(vp);
+
+       orig_ctime = VTOC(vp)->c_ctime;
 
-       if ((retval = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK)))
-               return (retval);
-       cp = VTOC(vp);
+       check_for_tracked_file(vp, orig_ctime, ap->a_length == 0 ? NAMESPACE_HANDLER_TRUNCATE_OP|NAMESPACE_HANDLER_DELETE_OP : NAMESPACE_HANDLER_TRUNCATE_OP, NULL);
+
+       hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK);
+
+       if ((retval = hfs_lock(cp, HFS_EXCLUSIVE_LOCK))) {
+               goto Err_Exit;
+       }
+       
        fp = VTOF(vp);
        hfsmp = VTOHFS(vp);
        vcb = VTOVCB(vp);
@@ -2290,6 +3271,8 @@ hfs_vnop_allocate(struct vnop_allocate_args /* {
                extendFlags |= kEFAllMask;
        if (cred && suser(cred, NULL) != 0)
                extendFlags |= kEFReserveMask;
+       if (hfs_virtualmetafile(cp))
+               extendFlags |= kEFMetadataMask;
 
        retval = E_NONE;
        blockHint = 0;
@@ -2310,7 +3293,9 @@ hfs_vnop_allocate(struct vnop_allocate_args /* {
         * value of filebytes is 0, length will be at least 1.
         */
        if (length > filebytes) {
-               moreBytesRequested = length - filebytes;
+               off_t total_bytes_added = 0, orig_request_size;
+
+               orig_request_size = moreBytesRequested = length - filebytes;
                
 #if QUOTA
                retval = hfs_chkdq(cp,
@@ -2328,7 +3313,6 @@ hfs_vnop_allocate(struct vnop_allocate_args /* {
                         * Allocate Journal and Quota files in metadata zone.
                         */
                        if (hfs_virtualmetafile(cp)) {
-                               extendFlags |= kEFMetadataMask;
                                blockHint = hfsmp->hfs_metazone_start;
                        } else if ((blockHint >= hfsmp->hfs_metazone_start) &&
                                   (blockHint <= hfsmp->hfs_metazone_end)) {
@@ -2339,35 +3323,60 @@ hfs_vnop_allocate(struct vnop_allocate_args /* {
                        }
                }
 
-               if (hfs_start_transaction(hfsmp) != 0) {
-                   retval = EINVAL;
-                   goto Err_Exit;
-               }
 
-               /* Protect extents b-tree and allocation bitmap */
-               lockflags = SFL_BITMAP;
-               if (overflow_extents(fp))
+               while ((length > filebytes) && (retval == E_NONE)) {
+                   off_t bytesRequested;
+                   
+                   if (hfs_start_transaction(hfsmp) != 0) {
+                       retval = EINVAL;
+                       goto Err_Exit;
+                   }
+
+                   /* Protect extents b-tree and allocation bitmap */
+                   lockflags = SFL_BITMAP;
+                   if (overflow_extents(fp))
                        lockflags |= SFL_EXTENTS;
-               lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
+                   lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
 
-               retval = MacToVFSError(ExtendFileC(vcb,
+                   if (moreBytesRequested >= HFS_BIGFILE_SIZE) {
+                       bytesRequested = HFS_BIGFILE_SIZE;
+                   } else {
+                       bytesRequested = moreBytesRequested;
+                   }
+
+                   if (extendFlags & kEFContigMask) {
+                           // if we're on a sparse device, this will force it to do a
+                           // full scan to find the space needed.
+                           hfsmp->hfs_flags &= ~HFS_DID_CONTIG_SCAN;
+                   }
+
+                   retval = MacToVFSError(ExtendFileC(vcb,
                                                (FCB*)fp,
-                                               moreBytesRequested,
+                                               bytesRequested,
                                                blockHint,
                                                extendFlags,
                                                &actualBytesAdded));
 
-               *(ap->a_bytesallocated) = actualBytesAdded;
-               filebytes = (off_t)fp->ff_blocks * (off_t)vcb->blockSize;
-
-               hfs_systemfile_unlock(hfsmp, lockflags);
+                   if (retval == E_NONE) {
+                       *(ap->a_bytesallocated) += actualBytesAdded;
+                       total_bytes_added += actualBytesAdded;
+                       moreBytesRequested -= actualBytesAdded;
+                       if (blockHint != 0) {
+                           blockHint += actualBytesAdded / vcb->blockSize;
+                       }
+                   }
+                   filebytes = (off_t)fp->ff_blocks * (off_t)vcb->blockSize;
+                   
+                   hfs_systemfile_unlock(hfsmp, lockflags);
 
-               if (hfsmp->jnl) {
+                   if (hfsmp->jnl) {
                        (void) hfs_update(vp, TRUE);
                        (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
+                   }
+
+                   hfs_end_transaction(hfsmp);
                }
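
/*
 * Illustration, not part of the patch: blockHint counts allocation blocks
 * while actualBytesAdded is in bytes.  Assuming a 4 KiB allocation block
 * size, a pass that adds 8 MiB advances the hint by 8 MiB / 4 KiB = 2048
 * blocks, so the next ExtendFileC() pass tries to continue right after the
 * space just allocated.
 */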
 
-               hfs_end_transaction(hfsmp);
 
                /*
                 * if we get an error and no changes were made then exit
@@ -2383,9 +3392,9 @@ hfs_vnop_allocate(struct vnop_allocate_args /* {
                 * until the file is closed, when we truncate the file to allocation
                 * block size.
                 */
-               if ((actualBytesAdded != 0) && (moreBytesRequested < actualBytesAdded))
+               if (total_bytes_added != 0 && orig_request_size < total_bytes_added)
                        *(ap->a_bytesallocated) =
-                               roundup(moreBytesRequested, (off_t)vcb->blockSize);
+                               roundup(orig_request_size, (off_t)vcb->blockSize);
 
        } else { /* Shorten the size of the file */
 
@@ -2396,31 +3405,9 @@ hfs_vnop_allocate(struct vnop_allocate_args /* {
                         */
                }
 
-               if (hfs_start_transaction(hfsmp) != 0) {
-                   retval = EINVAL;
-                   goto Err_Exit;
-               }
-
-               /* Protect extents b-tree and allocation bitmap */
-               lockflags = SFL_BITMAP;
-               if (overflow_extents(fp))
-                       lockflags |= SFL_EXTENTS;
-               lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
-
-               retval = MacToVFSError(TruncateFileC(vcb, (FCB*)fp, length, false));
-
-               hfs_systemfile_unlock(hfsmp, lockflags);
-
+               retval = hfs_truncate(vp, length, 0, 0, 0, ap->a_context);
                filebytes = (off_t)fp->ff_blocks * (off_t)vcb->blockSize;
 
-               if (hfsmp->jnl) {
-                       (void) hfs_update(vp, TRUE);
-                       (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
-               }
-
-               hfs_end_transaction(hfsmp);
-               
-
                /*
                 * if we get an error and no changes were made then exit
                 * otherwise we must do the hfs_update to reflect the changes
@@ -2448,6 +3435,7 @@ Std_Exit:
        if (retval == 0)
                retval = retval2;
 Err_Exit:
+       hfs_unlock_truncate(cp, 0);
        hfs_unlock(cp);
        return (retval);
 }
@@ -2470,50 +3458,298 @@ hfs_vnop_pagein(struct vnop_pagein_args *ap)
        };
 */
 {
-       vnode_t vp = ap->a_vp;
-       int error;
+       vnode_t         vp;
+       struct cnode    *cp;
+       struct filefork *fp;
+       int             error = 0;
+       upl_t           upl;
+       upl_page_info_t *pl;
+       off_t           f_offset;
+       int             offset;
+       int             isize; 
+       int             pg_index;
+       boolean_t       truncate_lock_held = FALSE;
+       boolean_t       file_converted = FALSE;
+       kern_return_t   kret;
+       
+       vp = ap->a_vp;
+       cp = VTOC(vp);
+       fp = VTOF(vp);
+
+#if CONFIG_PROTECT
+       if ((error = cp_handle_vnop(cp, CP_READ_ACCESS | CP_WRITE_ACCESS)) != 0) {
+               return error;
+       }
+#endif /* CONFIG_PROTECT */
+
+       if (ap->a_pl != NULL) {
+               /*
+                * this can only happen for swap files now that
+                * we're asking for V2 paging behavior...
+                * so we don't need to worry about decompression, or
+                * keeping track of blocks read or taking the truncate lock
+                */
+               error = cluster_pagein(vp, ap->a_pl, ap->a_pl_offset, ap->a_f_offset,
+                                      ap->a_size, (off_t)fp->ff_size, ap->a_flags);
+               goto pagein_done;
+       }
 
-       error = cluster_pagein(vp, ap->a_pl, ap->a_pl_offset, ap->a_f_offset,
-                              ap->a_size, (off_t)VTOF(vp)->ff_size, ap->a_flags);
+retry_pagein:
        /*
-        * Keep track of blocks read.
+        * take truncate lock (shared/recursive) to guard against 
+        * zero-fill thru fsync interfering, but only for v2
+        *
+        * the HFS_RECURSE_TRUNCLOCK arg indicates that we want the 
+        * lock shared and we are allowed to recurse 1 level if this thread already
+        * owns the lock exclusively... this can legally occur
+        * if we are doing a shrinking ftruncate against a file
+        * that is mapped private, and the pages being truncated
+        * do not currently exist in the cache... in that case
+        * we will have to page-in the missing pages in order
+        * to provide them to the private mapping... we must
+        * also call hfs_unlock_truncate with a positive been_recursed 
+        * arg to indicate that if we have recursed, there is no need to drop
+        * the lock.  Allowing this simple recursion is necessary
+        * in order to avoid a certain deadlock... since the ftruncate
+        * already holds the truncate lock exclusively, if we try
+        * to acquire it shared to protect the pagein path, we will
+        * hang this thread
+        *
+        * NOTE: The if () block below is a workaround in order to prevent a 
+        * VM deadlock. See rdar://7853471.
+        * 
+        * If we are in a forced unmount, then launchd will still have the 
+        * dyld_shared_cache file mapped as it is trying to reboot.  If we 
+        * take the truncate lock here to service a page fault, then our 
+        * thread could deadlock with the forced-unmount.  The forced unmount 
+        * thread will try to reclaim the dyld_shared_cache vnode, but since it's 
+        * marked C_DELETED, it will call ubc_setsize(0).  As a result, the unmount 
+        * thread will think it needs to copy all of the data out of the file 
+        * and into a VM copy object.  If we hold the cnode lock here, then that 
+        * VM operation will not be able to proceed, because we'll set a busy page 
+        * before attempting to grab the lock.  Note that this isn't as simple as "don't
+        * call ubc_setsize" because doing that would just shift the problem to the
+        * ubc_msync done before the vnode is reclaimed.
+        *
+        * So, if a forced unmount on this volume is in flight AND the cnode is 
+        * marked C_DELETED, then just go ahead and do the page in without taking 
+        * the lock (thus suspending pagein_v2 semantics temporarily).  Since it's on a file
+        * that is not going to be available on the next mount, this seems like a 
+        * that is not going to be available on the next mount, this seems like an
+        * OK solution from a correctness point of view, even though it is hacky.
-       if (VTOHFS(vp)->hfc_stage == HFC_RECORDING && error == 0) {
-               struct cnode *cp;
-               struct filefork *fp;
-               int bytesread;
-               int took_cnode_lock = 0;
-               
-               cp = VTOC(vp);
-               fp = VTOF(vp);
+       if (vfs_isforce(vp->v_mount)) {
+               if (cp->c_flag & C_DELETED) {
+                       /* If we don't get it, then just go ahead and operate without the lock */
+                       truncate_lock_held = hfs_try_trunclock(cp, HFS_RECURSE_TRUNCLOCK);
+               }
+       }
+       else {
+               hfs_lock_truncate(cp, HFS_RECURSE_TRUNCLOCK);
+               truncate_lock_held = TRUE;
+       }
 
-               if (ap->a_f_offset == 0 && fp->ff_size < PAGE_SIZE)
-                       bytesread = fp->ff_size;
-               else
-                       bytesread = ap->a_size;
+       kret = ubc_create_upl(vp, ap->a_f_offset, ap->a_size, &upl, &pl, UPL_UBC_PAGEIN | UPL_RET_ONLY_ABSENT); 
 
-               /* When ff_bytesread exceeds 32-bits, update it behind the cnode lock. */
-               if ((fp->ff_bytesread + bytesread) > 0x00000000ffffffff) {
-                       hfs_lock(cp, HFS_FORCE_LOCK);
-                       took_cnode_lock = 1;
+       if ((kret != KERN_SUCCESS) || (upl == (upl_t) NULL)) {
+               error = EINVAL;
+               goto pagein_done;
+       }
+       isize = ap->a_size;
+
+       /* 
+        * Scan from the back to find the last page in the UPL, so that we 
+        * aren't looking at a UPL that may have already been freed by the
+        * preceding aborts/completions.
+        */ 
+       for (pg_index = ((isize) / PAGE_SIZE); pg_index > 0;) {
+               if (upl_page_present(pl, --pg_index))
+                       break;
+               if (pg_index == 0) {
+                       /*
+                        * no absent pages were found in the range specified
+                        * just abort the UPL to get rid of it and then we're done
+                        */
+                       ubc_upl_abort_range(upl, 0, isize, UPL_ABORT_FREE_ON_EMPTY);
+                       goto pagein_done;
                }
-               /*
-                * If this file hasn't been seen since the start of
-                * the current sampling period then start over.
+       }
+       /* 
+        * initialize the offset variables before we touch the UPL.
+        * f_offset is the position into the file, in bytes
+        * offset is the position into the UPL, in bytes
+        * pg_index is the pg# of the UPL we're operating on
+        * isize is the offset into the UPL of the last page that is present. 
+        */
+       isize = ((pg_index + 1) * PAGE_SIZE);   
+       pg_index = 0;
+       offset = 0;
+       f_offset = ap->a_f_offset;
+
+       while (isize) {
+               int  xsize;
+               int  num_of_pages;
+
+               if ( !upl_page_present(pl, pg_index)) {
+                       /*
+                        * we asked for RET_ONLY_ABSENT, so it's possible
+                        * to get back empty slots in the UPL.
+                        * just skip over them
+                        */
+                       f_offset += PAGE_SIZE;
+                       offset   += PAGE_SIZE;
+                       isize    -= PAGE_SIZE;
+                       pg_index++;
+
+                       continue;
+               }
+               /* 
+                * We know that we have at least one absent page.
+                * Now checking to see how many in a row we have
                 */
-               if (cp->c_atime < VTOHFS(vp)->hfc_timebase) {
-                       struct timeval tv;
+               num_of_pages = 1;
+               xsize = isize - PAGE_SIZE;
 
-                       fp->ff_bytesread = bytesread;
-                       microtime(&tv);
-                       cp->c_atime = tv.tv_sec;
+               while (xsize) {
+                       if ( !upl_page_present(pl, pg_index + num_of_pages))
+                               break;
+                       num_of_pages++;
+                       xsize -= PAGE_SIZE;
+               }
+               xsize = num_of_pages * PAGE_SIZE;
+
+#if HFS_COMPRESSION
+               if (VNODE_IS_RSRC(vp)) {
+                       /* allow pageins of the resource fork */
                } else {
-                       fp->ff_bytesread += bytesread;
+                       int compressed = hfs_file_is_compressed(VTOC(vp), 1); /* 1 == don't take the cnode lock */
+
+                       if (compressed) {
+                               if (truncate_lock_held) {
+                                       /*
+                                        * can't hold the truncate lock when calling into the decmpfs layer
+                                        * since it calls back into this layer... even though we're only
+                                        * holding the lock in shared mode, and the re-entrant path only
+                                        * takes the lock shared, we can deadlock if some other thread
+                                        * tries to grab the lock exclusively in between.
+                                        */
+                                       hfs_unlock_truncate(cp, 1);
+                                       truncate_lock_held = FALSE;
+                               }
+                               ap->a_pl = upl;
+                               ap->a_pl_offset = offset;
+                               ap->a_f_offset = f_offset;
+                               ap->a_size = xsize;
+
+                               error = decmpfs_pagein_compressed(ap, &compressed, VTOCMP(vp));
+                               /*
+                                * note that decmpfs_pagein_compressed can change the state of
+                                * 'compressed'... it will set it to 0 if the file is no longer
+                                * compressed once the compression lock is successfully taken
+                                * i.e. we would block on that lock while the file is being inflated
+                                */
+                               if (compressed) {
+                                       if (error == 0) {
+                                               /* successful page-in, update the access time */
+                                               VTOC(vp)->c_touch_acctime = TRUE;
+                                       
+                                               /* compressed files are not hot file candidates */
+                                               if (VTOHFS(vp)->hfc_stage == HFC_RECORDING) {
+                                                       fp->ff_bytesread = 0;
+                                               }
+                                       } else if (error == EAGAIN) {
+                                               /*
+                                                * EAGAIN indicates someone else already holds the compression lock...
+                                                * to avoid deadlocking, we'll abort this range of pages with an
+                                                * indication that the pagein needs to be redriven
+                                                */
+                                               ubc_upl_abort_range(upl, (upl_offset_t) offset, xsize, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_RESTART);
+                                       }
+                                       goto pagein_next_range;
+                               }
+                               else {
+                                       /* 
+                                        * Set file_converted only if the file became decompressed while we were
+                                        * paging in.  If it were still compressed, we would re-start the loop using the goto
+                                        * in the above block.  This avoids overloading truncate_lock_held as our retry_pagein
+                                        * condition below, since we could have avoided taking the truncate lock to prevent
+                                        * a deadlock in the force unmount case.
+                                        */
+                                       file_converted = TRUE;
+                               }
+                       }
+                       if (file_converted == TRUE) {
+                               /*
+                                * the file was converted back to a regular file after we first saw it as compressed;
+                                * we need to abort the upl, retake the truncate lock, recreate the UPL and start over.
+                                * Reset a_size so that we consider what remains of the original request
+                                * and null out a_upl and a_pl_offset.
+                                *
+                                * We should only be able to get into this block if the decmpfs_pagein_compressed 
+                                * successfully decompressed the range in question for this file.
+                                */
+                               ubc_upl_abort_range(upl, (upl_offset_t) offset, isize, UPL_ABORT_FREE_ON_EMPTY);
+
+                               ap->a_size = isize;
+                               ap->a_pl = NULL;
+                               ap->a_pl_offset = 0;
+
+                               /* Reset file_converted back to false so that we don't infinite-loop. */
+                               file_converted = FALSE;
+                               goto retry_pagein;
+                       }
                }
-               cp->c_touch_acctime = TRUE;
-               if (took_cnode_lock)
-                       hfs_unlock(cp);
+#endif
+               error = cluster_pagein(vp, upl, offset, f_offset, xsize, (off_t)fp->ff_size, ap->a_flags);
+
+               /*
+                * Keep track of blocks read.
+                */
+               if ( !vnode_isswap(vp) && VTOHFS(vp)->hfc_stage == HFC_RECORDING && error == 0) {
+                       int bytesread;
+                       int took_cnode_lock = 0;
+               
+                       if (ap->a_f_offset == 0 && fp->ff_size < PAGE_SIZE)
+                               bytesread = fp->ff_size;
+                       else
+                               bytesread = xsize;
+
+                       /* When ff_bytesread exceeds 32-bits, update it behind the cnode lock. */
+                       if ((fp->ff_bytesread + bytesread) > 0x00000000ffffffff && cp->c_lockowner != current_thread()) {
+                               hfs_lock(cp, HFS_FORCE_LOCK);
+                               took_cnode_lock = 1;
+                       }
+                       /*
+                        * If this file hasn't been seen since the start of
+                        * the current sampling period then start over.
+                        */
+                       if (cp->c_atime < VTOHFS(vp)->hfc_timebase) {
+                               struct timeval tv;
+
+                               fp->ff_bytesread = bytesread;
+                               microtime(&tv);
+                               cp->c_atime = tv.tv_sec;
+                       } else {
+                               fp->ff_bytesread += bytesread;
+                       }
+                       cp->c_touch_acctime = TRUE;
+                       if (took_cnode_lock)
+                               hfs_unlock(cp);
+               }
+pagein_next_range:
+               f_offset += xsize;
+               offset   += xsize;
+               isize    -= xsize;
+               pg_index += num_of_pages;
+
+               error = 0;
+       }
+
+pagein_done:
+       if (truncate_lock_held == TRUE) {
+               /* Note 1 is passed to hfs_unlock_truncate in been_recursed argument */
+               hfs_unlock_truncate(cp, 1);
        }
+
        return (error);
 }
 
@@ -2537,53 +3773,270 @@ hfs_vnop_pageout(struct vnop_pageout_args *ap)
        vnode_t vp = ap->a_vp;
        struct cnode *cp;
        struct filefork *fp;
-       int retval;
-       off_t end_of_range;
+       int retval = 0;
        off_t filesize;
+       upl_t           upl;
+       upl_page_info_t* pl;
+       vm_offset_t     a_pl_offset;
+       int             a_flags;
+       int is_pageoutv2 = 0;
+       kern_return_t kret;
 
        cp = VTOC(vp);
-       if (cp->c_lockowner == current_thread()) {
-               panic("pageout: %s cnode lock already held!\n",
-                     cp->c_desc.cd_nameptr ? cp->c_desc.cd_nameptr : "");
-       }
-       if ( (retval = hfs_lock(cp, HFS_EXCLUSIVE_LOCK))) {
-               if (!(ap->a_flags & UPL_NOCOMMIT)) {
-                       ubc_upl_abort_range(ap->a_pl,
-                                           ap->a_pl_offset,
-                                           ap->a_size,
-                                           UPL_ABORT_FREE_ON_EMPTY);
-               }
-               return (retval);
-       }
        fp = VTOF(vp);
-
+       
+       /*
+        * Figure out where the file ends, for pageout purposes.  If
+        * ff_new_size > ff_size, then we're in the middle of extending the
+        * file via a write, so it is safe (and necessary) that we be able
+        * to pageout up to that point.
+        */
        filesize = fp->ff_size;
-       end_of_range = ap->a_f_offset + ap->a_size - 1;
+       if (fp->ff_new_size > filesize)
+               filesize = fp->ff_new_size;
 
-       if (end_of_range >= filesize) {
-               end_of_range = (off_t)(filesize - 1);
-       }
-       if (ap->a_f_offset < filesize) {
-               rl_remove(ap->a_f_offset, end_of_range, &fp->ff_invalidranges);
-               cp->c_flag |= C_MODIFIED;  /* leof is dirty */
+       a_flags = ap->a_flags;
+       a_pl_offset = ap->a_pl_offset;
+
+       /*
+        * we can tell if we're getting the new or old behavior from the UPL
+        */
+       if ((upl = ap->a_pl) == NULL) {
+               int request_flags; 
+
+               is_pageoutv2 = 1;
+               /*
+                * we're in control of any UPL we commit;
+                * make sure someone hasn't accidentally passed in UPL_NOCOMMIT 
+                */
+               a_flags &= ~UPL_NOCOMMIT;
+               a_pl_offset = 0;
+
+               /*
+                * take truncate lock (shared) to guard against 
+                * zero-fill thru fsync interfering, but only for v2 
+                */
+               hfs_lock_truncate(cp, HFS_SHARED_LOCK);
+
+               if (a_flags & UPL_MSYNC) {
+                       request_flags = UPL_UBC_MSYNC | UPL_RET_ONLY_DIRTY;
+               }
+               else {
+                       request_flags = UPL_UBC_PAGEOUT | UPL_RET_ONLY_DIRTY;
+               }
+               
+               kret = ubc_create_upl(vp, ap->a_f_offset, ap->a_size, &upl, &pl, request_flags); 
+
+               if ((kret != KERN_SUCCESS) || (upl == (upl_t) NULL)) {
+                       retval = EINVAL;
+                       goto pageout_done;
+               }
        }
-       hfs_unlock(cp);
+       /*
+        * from this point forward, upl points at the UPL we're working with;
+        * it was either passed in or we successfully created it
+        */
+
+       /* 
+        * Now that HFS is opting into VFC_VFSVNOP_PAGEOUTV2, we may need to operate on our own  
+        * UPL instead of relying on the UPL passed into us.  We go ahead and do that here,
+        * scanning for dirty ranges.  We'll issue our own N cluster_pageout calls, for
+        * N dirty ranges in the UPL.  Note that this is almost a direct copy of the 
+        * logic in vnode_pageout except that we need to do it after grabbing the truncate 
+        * lock in HFS so that we don't lock invert ourselves.  
+        * 
+        * Note that we can still get into this function on behalf of the default pager with
+        * non-V2 behavior (swapfiles).  However in that case, we did not grab locks above 
+        * since fsync and other writing threads will grab the locks, then mark the 
+        * relevant pages as busy.  But the pageout codepath marks the pages as busy, 
+        * and THEN would attempt to grab the truncate lock, which would result in deadlock.  So
+        * we do not try to grab anything for the pre-V2 case, which should only be accessed
+        * by the paging/VM system.
+        */
+
+       if (is_pageoutv2) {
+               off_t f_offset;
+               int offset;
+               int isize; 
+               int pg_index;
+               int error;
+               int error_ret = 0;
+
+               isize = ap->a_size;
+               f_offset = ap->a_f_offset;
+
+               /* 
+                * Scan from the back to find the last page in the UPL, so that we 
+                * aren't looking at a UPL that may have already been freed by the
+                * preceding aborts/completions.
+                */ 
+               for (pg_index = ((isize) / PAGE_SIZE); pg_index > 0;) {
+                       if (upl_page_present(pl, --pg_index))
+                               break;
+                       if (pg_index == 0) {
+                               ubc_upl_abort_range(upl, 0, isize, UPL_ABORT_FREE_ON_EMPTY);
+                               goto pageout_done;
+                       }
+               }
+
+               /* 
+                * initialize the offset variables before we touch the UPL.
+                * a_f_offset is the position into the file, in bytes
+                * offset is the position into the UPL, in bytes
+                * pg_index is the pg# of the UPL we're operating on.
+                * isize is the offset into the UPL of the last non-clean page. 
+                */
+               isize = ((pg_index + 1) * PAGE_SIZE);   
+
+               offset = 0;
+               pg_index = 0;
+
+               while (isize) {
+                       int  xsize;
+                       int  num_of_pages;
+
+                       if ( !upl_page_present(pl, pg_index)) {
+                               /*
+                                * we asked for RET_ONLY_DIRTY, so it's possible
+                                * to get back empty slots in the UPL.
+                                * just skip over them
+                                */
+                               f_offset += PAGE_SIZE;
+                               offset   += PAGE_SIZE;
+                               isize    -= PAGE_SIZE;
+                               pg_index++;
+
+                               continue;
+                       }
+                       if ( !upl_dirty_page(pl, pg_index)) {
+                               panic ("hfs_vnop_pageout: unforeseen clean page @ index %d for UPL %p\n", pg_index, upl);
+                       }
+
+                       /* 
+                        * We know that we have at least one dirty page.
+                        * Now checking to see how many in a row we have
+                        */
+                       num_of_pages = 1;
+                       xsize = isize - PAGE_SIZE;
+
+                       while (xsize) {
+                               if ( !upl_dirty_page(pl, pg_index + num_of_pages))
+                                       break;
+                               num_of_pages++;
+                               xsize -= PAGE_SIZE;
+                       }
+                       xsize = num_of_pages * PAGE_SIZE;
+
+                       if (!vnode_isswap(vp)) {
+                               off_t end_of_range;
+                               int tooklock;
+
+                               tooklock = 0;
+
+                               if (cp->c_lockowner != current_thread()) {
+                                       if ((retval = hfs_lock(cp, HFS_EXCLUSIVE_LOCK))) {
+                                               /*
+                                                * we're in the v2 path, so we are the
+                                                * owner of the UPL... we may have already
+                                                * processed some of the UPL, so abort it
+                                                * from the current working offset to the
+                                                * end of the UPL
+                                                */
+                                               ubc_upl_abort_range(upl,
+                                                                   offset,
+                                                                   ap->a_size - offset,
+                                                                   UPL_ABORT_FREE_ON_EMPTY);
+                                               goto pageout_done;
+                                       }
+                                       tooklock = 1;
+                               }
+                               end_of_range = f_offset + xsize - 1;
+       
+                               if (end_of_range >= filesize) {
+                                       end_of_range = (off_t)(filesize - 1);
+                               }
+                               if (f_offset < filesize) {
+                                       rl_remove(f_offset, end_of_range, &fp->ff_invalidranges);
+                                       cp->c_flag |= C_MODIFIED;  /* leof is dirty */
+                               }
+                               if (tooklock) {
+                                       hfs_unlock(cp);
+                               }
+                       }
+                       if ((error = cluster_pageout(vp, upl, offset, f_offset,
+                                                       xsize, filesize, a_flags))) {
+                               if (error_ret == 0)
+                                       error_ret = error;
+                       }
+                       f_offset += xsize;
+                       offset   += xsize;
+                       isize    -= xsize;
+                       pg_index += num_of_pages;
+               }
+               /* capture errnos bubbled out of cluster_pageout if they occurred */
+               if (error_ret != 0) {
+                       retval = error_ret;
+               }
+       } /* end block for v2 pageout behavior */
+       else {
+               if (!vnode_isswap(vp)) {
+                       off_t end_of_range;
+                       int tooklock = 0;
+
+                       if (cp->c_lockowner != current_thread()) {
+                               if ((retval = hfs_lock(cp, HFS_EXCLUSIVE_LOCK))) {
+                                       if (!(a_flags & UPL_NOCOMMIT)) {
+                                               ubc_upl_abort_range(upl,
+                                                                   a_pl_offset,
+                                                                   ap->a_size,
+                                                                   UPL_ABORT_FREE_ON_EMPTY);
+                                       }
+                                       goto pageout_done;
+                               }
+                               tooklock = 1;
+                       }
+                       end_of_range = ap->a_f_offset + ap->a_size - 1;
+       
+                       if (end_of_range >= filesize) {
+                               end_of_range = (off_t)(filesize - 1);
+                       }
+                       if (ap->a_f_offset < filesize) {
+                               rl_remove(ap->a_f_offset, end_of_range, &fp->ff_invalidranges);
+                               cp->c_flag |= C_MODIFIED;  /* leof is dirty */
+                       }
 
-       retval = cluster_pageout(vp, ap->a_pl, ap->a_pl_offset, ap->a_f_offset,
-                                ap->a_size, filesize, ap->a_flags);
+                       if (tooklock) {
+                               hfs_unlock(cp);
+                       }
+               }
+               /* 
+                * just call cluster_pageout for old pre-v2 behavior
+                */
+               retval = cluster_pageout(vp, upl, a_pl_offset, ap->a_f_offset,
+                               ap->a_size, filesize, a_flags);         
+       }
 
        /*
-        * If data was written, and setuid or setgid bits are set and
-        * this process is not the superuser then clear the setuid and
-        * setgid bits as a precaution against tampering.
+        * If data was written, update the modification time of the file.
+        * If setuid or setgid bits are set and this process is not the 
+        * superuser then clear the setuid and setgid bits as a precaution 
+        * against tampering.
         */
-       if ((retval == 0) &&
-           (cp->c_mode & (S_ISUID | S_ISGID)) &&
-           (vfs_context_suser(ap->a_context) != 0)) {
-               hfs_lock(cp, HFS_FORCE_LOCK);
-               cp->c_mode &= ~(S_ISUID | S_ISGID);
+       if (retval == 0) {
+               cp->c_touch_modtime = TRUE;
                cp->c_touch_chgtime = TRUE;
-               hfs_unlock(cp);
+               if ((cp->c_mode & (S_ISUID | S_ISGID)) &&
+                   (vfs_context_suser(ap->a_context) != 0)) {
+                       hfs_lock(cp, HFS_FORCE_LOCK);
+                       cp->c_mode &= ~(S_ISUID | S_ISGID);
+                       hfs_unlock(cp);
+               }
+       }
+
+pageout_done:
+       if (is_pageoutv2) {
+               /* release truncate lock (shared) */
+               hfs_unlock_truncate(cp, 0);
        }
        return (retval);
 }
@@ -2609,10 +4062,10 @@ hfs_vnop_bwrite(struct vnop_bwrite_args *ap)
                 * Swap and validate the node if it is in native byte order.
                 * This is always true on big endian, so we always validate
                 * before writing here.  On little endian, the node typically has
-                * been swapped and validatated when it was written to the journal,
+                * been swapped and validated when it was written to the journal,
                 * so we won't do anything here.
                 */
-               if (((UInt16 *)((char *)buf_dataptr(bp) + buf_count(bp) - 2))[0] == 0x000e) {
+               if (((u_int16_t *)((char *)buf_dataptr(bp) + buf_count(bp) - 2))[0] == 0x000e) {
                        /* Prepare the block pointer */
                        block.blockHeader = bp;
                        block.buffer = (char *)buf_dataptr(bp);
@@ -2622,7 +4075,7 @@ hfs_vnop_bwrite(struct vnop_bwrite_args *ap)
                        block.blockSize = buf_count(bp);
     
                        /* Endian un-swap B-Tree node */
-                       retval = hfs_swap_BTNode (&block, vp, kSwapBTNodeHostToBig);
+                       retval = hfs_swap_BTNode (&block, vp, kSwapBTNodeHostToBig, false);
                        if (retval)
                                panic("hfs_vnop_bwrite: about to write corrupt node!\n");
                }
@@ -2632,7 +4085,7 @@ hfs_vnop_bwrite(struct vnop_bwrite_args *ap)
        if ((buf_flags(bp) & B_LOCKED)) {
                // XXXdbg
                if (VTOHFS(vp)->jnl) {
-                       panic("hfs: CLEARING the lock bit on bp 0x%x\n", bp);
+                       panic("hfs: CLEARING the lock bit on bp %p\n", bp);
                }
                buf_clearflags(bp, B_LOCKED);
        }
@@ -2656,7 +4109,7 @@ hfs_vnop_bwrite(struct vnop_bwrite_args *ap)
  * 0               N (file offset)
  *
  * -----------------     -----------------
- * |///////////////|     |               |     STEP 1 (aquire new blocks)
+ * |///////////////|     |               |     STEP 1 (acquire new blocks)
  * -----------------     -----------------
  * 0               N     N+1             2N
  *
@@ -2673,9 +4126,8 @@ hfs_vnop_bwrite(struct vnop_bwrite_args *ap)
  * During steps 2 and 3 page-outs to file offsets less
  * than or equal to N are suspended.
  *
- * During step 3 page-ins to the file get supended.
+ * During step 3 page-ins to the file get suspended.
  */
-__private_extern__
 int
 hfs_relocate(struct  vnode *vp, u_int32_t  blockHint, kauth_cred_t cred,
        struct  proc *p)
@@ -2689,7 +4141,6 @@ hfs_relocate(struct  vnode *vp, u_int32_t  blockHint, kauth_cred_t cred,
        u_int32_t  growsize;
        u_int32_t  nextallocsave;
        daddr64_t  sector_a,  sector_b;
-       int disabled_caching = 0;
        int eflags;
        off_t  newbytes;
        int  retval;
@@ -2712,11 +4163,27 @@ hfs_relocate(struct  vnode *vp, u_int32_t  blockHint, kauth_cred_t cred,
        fp = VTOF(vp);
        if (fp->ff_unallocblocks)
                return (EINVAL);
+
+#if CONFIG_PROTECT
+       /* 
+        * <rdar://problem/9118426>
+        * Disable HFS file relocation on content-protected filesystems
+        */
+       if (cp_fs_protected (hfsmp->hfs_mp)) {
+               return EINVAL;
+       }
+#endif
+
+       /* If it's an SSD, also disable HFS relocation */
+       if (hfsmp->hfs_flags & HFS_SSD) {
+               return EINVAL;
+       }
+
        blksize = hfsmp->blockSize;
        if (blockHint == 0)
                blockHint = hfsmp->nextAllocation;
 
-       if ((fp->ff_size > (u_int64_t)0x7fffffff) ||
+       if ((fp->ff_size > 0x7fffffff) ||
            ((fp->ff_size > blksize) && vnodetype == VLNK)) {
                return (EFBIG);
        }
@@ -2734,11 +4201,17 @@ hfs_relocate(struct  vnode *vp, u_int32_t  blockHint, kauth_cred_t cred,
 
        if (!vnode_issystem(vp) && (vnodetype != VLNK)) {
                hfs_unlock(cp);
-               hfs_lock_truncate(cp, TRUE);
-               if ((retval = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK))) {
-                       hfs_unlock_truncate(cp);
+               hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK);
+               /* Force lock since callers expects lock to be held. */
+               if ((retval = hfs_lock(cp, HFS_FORCE_LOCK))) {
+                       hfs_unlock_truncate(cp, 0);
                        return (retval);
                }
+               /* No need to continue if file was removed. */
+               if (cp->c_flag & C_NOEXISTS) {
+                       hfs_unlock_truncate(cp, 0);
+                       return (ENOENT);
+               }
                took_trunc_lock = 1;
        }
        headblks = fp->ff_blocks;
@@ -2751,7 +4224,7 @@ hfs_relocate(struct  vnode *vp, u_int32_t  blockHint, kauth_cred_t cred,
 
        if (hfs_start_transaction(hfsmp) != 0) {
                if (took_trunc_lock)
-                       hfs_unlock_truncate(cp);
+                       hfs_unlock_truncate(cp, 0);
            return (EINVAL);
        }
        started_tr = 1;
@@ -2771,19 +4244,14 @@ hfs_relocate(struct  vnode *vp, u_int32_t  blockHint, kauth_cred_t cred,
        }
 
        /*
-        * STEP 1 - aquire new allocation blocks.
+        * STEP 1 - acquire new allocation blocks.
         */
-       if (!vnode_isnocache(vp)) {
-               vnode_setnocache(vp);
-               disabled_caching = 1;
-
-       }
        nextallocsave = hfsmp->nextAllocation;
        retval = ExtendFileC(hfsmp, (FCB*)fp, growsize, blockHint, eflags, &newbytes);
        if (eflags & kEFMetadataMask) {
                HFS_MOUNT_LOCK(hfsmp, TRUE);
-               hfsmp->nextAllocation = nextallocsave;
-               hfsmp->vcbFlags |= 0xFF00;
+               HFS_UPDATE_NEXT_ALLOCATION(hfsmp, nextallocsave);
+               MarkVCBDirty(hfsmp);
                HFS_MOUNT_UNLOCK(hfsmp, TRUE);
        }
 
@@ -2806,9 +4274,20 @@ hfs_relocate(struct  vnode *vp, u_int32_t  blockHint, kauth_cred_t cred,
                        retval = ENOSPC;
                        goto restore;
                } else if ((eflags & kEFMetadataMask) &&
-                          ((((u_int64_t)sector_b * hfsmp->hfs_phys_block_size) / blksize) >
+                          ((((u_int64_t)sector_b * hfsmp->hfs_logical_block_size) / blksize) >
                              hfsmp->hfs_metazone_end)) {
-                       printf("hfs_relocate: didn't move into metadata zone\n");
+#if 0
+                       const char * filestr;
+                       char emptystr = '\0';
+
+                       if (cp->c_desc.cd_nameptr != NULL) {
+                               filestr = (const char *)&cp->c_desc.cd_nameptr[0];
+                       } else if (vnode_name(vp) != NULL) {
+                               filestr = vnode_name(vp);
+                       } else {
+                               filestr = &emptystr;
+                       }
+#endif
                        retval = ENOSPC;
                        goto restore;
                }
@@ -2865,7 +4344,7 @@ hfs_relocate(struct  vnode *vp, u_int32_t  blockHint, kauth_cred_t cred,
                goto restore;
 out:
        if (took_trunc_lock)
-               hfs_unlock_truncate(cp);
+               hfs_unlock_truncate(cp, 0);
 
        if (lockflags) {
                hfs_systemfile_unlock(hfsmp, lockflags);
@@ -2876,7 +4355,6 @@ out:
        if (retval == 0) {
                (void) hfs_update(vp, MNT_WAIT);
        }
-
        if (hfsmp->jnl) {
                if (cp->c_cnid < kHFSFirstUserCatalogNodeID)
                        (void) hfs_flushvolumeheader(hfsmp, MNT_WAIT, HFS_ALTFLUSH);
@@ -2884,17 +4362,17 @@ out:
                        (void) hfs_flushvolumeheader(hfsmp, MNT_NOWAIT, 0);
        }
 exit:
-       if (disabled_caching) {
-               vnode_clearnocache(vp);
-       }
        if (started_tr)
                hfs_end_transaction(hfsmp);
 
        return (retval);
 
 restore:
-       if (fp->ff_blocks == headblks)
+       if (fp->ff_blocks == headblks) {
+               if (took_trunc_lock)
+                       hfs_unlock_truncate(cp, 0);
                goto exit;
+       }
        /*
         * Give back any newly allocated space.
         */
@@ -2905,13 +4383,14 @@ restore:
                lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
        }
 
-       (void) TruncateFileC(hfsmp, (FCB*)fp, fp->ff_size, false);
+       (void) TruncateFileC(hfsmp, (FCB*)fp, fp->ff_size, 0, FORK_IS_RSRC(fp), 
+                                                FTOC(fp)->c_fileid, false);
 
        hfs_systemfile_unlock(hfsmp, lockflags);
        lockflags = 0;
 
        if (took_trunc_lock)
-               hfs_unlock_truncate(cp);
+               hfs_unlock_truncate(cp, 0);
        goto exit;
 }
 
@@ -2921,7 +4400,7 @@ restore:
  *
  */
 static int
-hfs_clonelink(struct vnode *vp, int blksize, kauth_cred_t cred, struct proc *p)
+hfs_clonelink(struct vnode *vp, int blksize, kauth_cred_t cred, __unused struct proc *p)
 {
        struct buf *head_bp = NULL;
        struct buf *tail_bp = NULL;
@@ -2957,50 +4436,57 @@ static int
 hfs_clonefile(struct vnode *vp, int blkstart, int blkcnt, int blksize)
 {
        caddr_t  bufp;
-       size_t  writebase;
        size_t  bufsize;
        size_t  copysize;
         size_t  iosize;
-       off_t   filesize;
        size_t  offset;
+       off_t   writebase;
        uio_t auio;
        int  error = 0;
 
-       filesize = VTOF(vp)->ff_blocks * blksize;  /* virtual file size */
        writebase = blkstart * blksize;
        copysize = blkcnt * blksize;
        iosize = bufsize = MIN(copysize, 128 * 1024);
        offset = 0;
 
+       hfs_unlock(VTOC(vp));
+
+#if CONFIG_PROTECT
+       if ((error = cp_handle_vnop(VTOC(vp), CP_WRITE_ACCESS)) != 0) {
+               hfs_lock(VTOC(vp), HFS_FORCE_LOCK);     
+               return (error);
+       }
+#endif /* CONFIG_PROTECT */
+
        if (kmem_alloc(kernel_map, (vm_offset_t *)&bufp, bufsize)) {
+               hfs_lock(VTOC(vp), HFS_FORCE_LOCK);
                return (ENOMEM);
-       }       
-       hfs_unlock(VTOC(vp));
+       }
 
-       auio = uio_create(1, 0, UIO_SYSSPACE32, UIO_READ);
+       auio = uio_create(1, 0, UIO_SYSSPACE, UIO_READ);
 
        while (offset < copysize) {
                iosize = MIN(copysize - offset, iosize);
 
-               uio_reset(auio, offset, UIO_SYSSPACE32, UIO_READ);
+               uio_reset(auio, offset, UIO_SYSSPACE, UIO_READ);
                uio_addiov(auio, (uintptr_t)bufp, iosize);
 
-               error = cluster_read(vp, auio, copysize, 0);
+               error = cluster_read(vp, auio, copysize, IO_NOCACHE);
                if (error) {
                        printf("hfs_clonefile: cluster_read failed - %d\n", error);
                        break;
                }
                if (uio_resid(auio) != 0) {
-                       printf("clonedata: cluster_read: uio_resid = %lld\n", uio_resid(auio));
+                       printf("hfs_clonefile: cluster_read: uio_resid = %lld\n", uio_resid(auio));
                        error = EIO;            
                        break;
                }
 
-               uio_reset(auio, writebase + offset, UIO_SYSSPACE32, UIO_WRITE);
+               uio_reset(auio, writebase + offset, UIO_SYSSPACE, UIO_WRITE);
                uio_addiov(auio, (uintptr_t)bufp, iosize);
 
-               error = cluster_write(vp, auio, filesize + offset,
-                                     filesize + offset + iosize,
+               error = cluster_write(vp, auio, writebase + offset,
+                                     writebase + offset + iosize,
                                      uio_offset(auio), 0, IO_NOCACHE | IO_SYNC);
                if (error) {
                        printf("hfs_clonefile: cluster_write failed - %d\n", error);
@@ -3015,11 +4501,25 @@ hfs_clonefile(struct vnode *vp, int blkstart, int blkcnt, int blksize)
        }
        uio_free(auio);
 
-       /*
-        * No need to call ubc_sync_range or hfs_invalbuf
-        * since the file was copied using IO_NOCACHE.
-        */
-
+       if ((blksize & PAGE_MASK)) {
+               /*
+                * Since the copy may not have started or ended on a
+                * page boundary, IO_NOCACHE can leave partially
+                * written pages lingering in the cache.  Flush the
+                * entire range so no pages are left that are beyond
+                * (or intersect) the real LEOF of this file.
+                */
+               ubc_msync(vp, writebase, writebase + offset, NULL, UBC_INVALIDATE | UBC_PUSHDIRTY);
+       } else {
+               /*
+                * No need to call ubc_sync_range or hfs_invalbuf
+                * since the file was copied using IO_NOCACHE and
+                * the copy was done starting and ending on a page
+                * boundary in the file.
+                */
+       }
        kmem_free(kernel_map, (vm_offset_t)bufp, bufsize);
 
        hfs_lock(VTOC(vp), HFS_FORCE_LOCK);
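
The flush above matters only when the allocation block size is not a multiple of the page size; a short worked example (block size assumed for illustration) of how partial pages survive an IO_NOCACHE copy:

	/*
	 * Illustration only (sizes assumed): with 2048-byte allocation
	 * blocks and 4096-byte pages, a clone whose writebase is an odd
	 * number of blocks starts halfway into a page, so IO_NOCACHE
	 * still leaves that partially written page in the cache.
	 * Pushing and invalidating the cloned range cleans up any such
	 * stragglers.
	 */
	if (blksize & PAGE_MASK) {
		ubc_msync(vp, writebase, writebase + offset, NULL,
		          UBC_INVALIDATE | UBC_PUSHDIRTY);
	}
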