]> git.saurik.com Git - apple/xnu.git/blobdiff - bsd/hfs/hfs_readwrite.c
xnu-1486.2.11.tar.gz
[apple/xnu.git] / bsd / hfs / hfs_readwrite.c
index 8bde675daca8d5c6e14372e47fb3f8efb51db44b..6dc30afad3270c1f85516de58e350d3a72a36f44 100644 (file)
@@ -1,16 +1,19 @@
 /*
 /*
- * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2000-2009 Apple Inc. All rights reserved.
  *
  *
- * @APPLE_LICENSE_HEADER_START@
- * 
- * Copyright (c) 1999-2003 Apple Computer, Inc.  All Rights Reserved.
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
  * This file contains Original Code and/or Modifications of Original Code
  * as defined in and that are subject to the Apple Public Source License
  * Version 2.0 (the 'License'). You may not use this file except in
  * 
  * This file contains Original Code and/or Modifications of Original Code
  * as defined in and that are subject to the Apple Public Source License
  * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License. Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this
- * file.
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ * 
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
  * 
  * The Original Code and all software distributed under the License are
  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  * 
  * The Original Code and all software distributed under the License are
  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
@@ -20,7 +23,7 @@
  * Please see the License for the specific language governing rights and
  * limitations under the License.
  * 
  * Please see the License for the specific language governing rights and
  * limitations under the License.
  * 
- * @APPLE_LICENSE_HEADER_END@
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  */
 /*     @(#)hfs_readwrite.c     1.0
  *
  */
 /*     @(#)hfs_readwrite.c     1.0
  *
 #include <sys/resourcevar.h>
 #include <sys/kernel.h>
 #include <sys/fcntl.h>
 #include <sys/resourcevar.h>
 #include <sys/kernel.h>
 #include <sys/fcntl.h>
+#include <sys/filedesc.h>
 #include <sys/stat.h>
 #include <sys/buf.h>
 #include <sys/proc.h>
 #include <sys/stat.h>
 #include <sys/buf.h>
 #include <sys/proc.h>
+#include <sys/kauth.h>
 #include <sys/vnode.h>
 #include <sys/vnode.h>
+#include <sys/vnode_internal.h>
 #include <sys/uio.h>
 #include <sys/uio.h>
+#include <sys/vfs_context.h>
+#include <sys/fsevents.h>
+#include <kern/kalloc.h>
+#include <sys/disk.h>
+#include <sys/sysctl.h>
+#include <sys/fsctl.h>
 
 #include <miscfs/specfs/specdev.h>
 
 #include <sys/ubc.h>
 
 #include <miscfs/specfs/specdev.h>
 
 #include <sys/ubc.h>
+#include <sys/ubc_internal.h>
+
 #include <vm/vm_pageout.h>
 #include <vm/vm_pageout.h>
+#include <vm/vm_kern.h>
 
 #include <sys/kdebug.h>
 
 #include       "hfs.h"
 
 #include <sys/kdebug.h>
 
 #include       "hfs.h"
+#include       "hfs_attrlist.h"
 #include       "hfs_endian.h"
 #include       "hfs_endian.h"
+#include       "hfs_fsctl.h"
 #include       "hfs_quota.h"
 #include       "hfscommon/headers/FileMgrInternal.h"
 #include       "hfscommon/headers/BTreesInternal.h"
 #include       "hfs_cnode.h"
 #include       "hfs_dbg.h"
 
 #include       "hfs_quota.h"
 #include       "hfscommon/headers/FileMgrInternal.h"
 #include       "hfscommon/headers/BTreesInternal.h"
 #include       "hfs_cnode.h"
 #include       "hfs_dbg.h"
 
-extern int overflow_extents(struct filefork *fp);
-
 #define can_cluster(size) ((((size & (4096-1))) == 0) && (size <= (MAXPHYSIO/2)))
 
 enum {
        MAXHFSFILESIZE = 0x7FFFFFFF             /* this needs to go in the mount structure */
 };
 
 #define can_cluster(size) ((((size & (4096-1))) == 0) && (size <= (MAXPHYSIO/2)))
 
 enum {
        MAXHFSFILESIZE = 0x7FFFFFFF             /* this needs to go in the mount structure */
 };
 
-extern u_int32_t GetLogicalBlockSize(struct vnode *vp);
+/* from bsd/hfs/hfs_vfsops.c */
+extern int hfs_vfs_vget (struct mount *mp, ino64_t ino, struct vnode **vpp, vfs_context_t context);
 
 
+static int  hfs_clonelink(struct vnode *, int, kauth_cred_t, struct proc *);
+static int  hfs_clonefile(struct vnode *, int, int, int);
+static int  hfs_clonesysfile(struct vnode *, int, int, int, kauth_cred_t, struct proc *);
+static int  hfs_minorupdate(struct vnode *vp);
+static int  do_hfs_truncate(struct vnode *vp, off_t length, int flags, int skip, vfs_context_t context);
 
 
-/*****************************************************************************
-*
-*      Operations on vnodes
-*
-*****************************************************************************/
 
 
-/*
-#% read                vp      L L L
-#
- vop_read {
-     IN struct vnode *vp;
-     INOUT struct uio *uio;
-     IN int ioflag;
-     IN struct ucred *cred;
+int flush_cache_on_write = 0;
+SYSCTL_INT (_kern, OID_AUTO, flush_cache_on_write, CTLFLAG_RW, &flush_cache_on_write, 0, "always flush the drive cache on writes to uncached files");
 
 
-     */
 
 
+/*
+ * Read data from a file.
+ */
 int
 int
-hfs_read(ap)
-       struct vop_read_args /* {
-               struct vnode *a_vp;
-               struct uio *a_uio;
-               int a_ioflag;
-               struct ucred *a_cred;
-       } */ *ap;
+hfs_vnop_read(struct vnop_read_args *ap)
 {
 {
-       register struct uio *uio = ap->a_uio;
-       register struct vnode *vp = ap->a_vp;
+       uio_t uio = ap->a_uio;
+       struct vnode *vp = ap->a_vp;
        struct cnode *cp;
        struct filefork *fp;
        struct cnode *cp;
        struct filefork *fp;
-       struct buf *bp;
-       daddr_t logBlockNo;
-       u_long fragSize, moveSize, startOffset, ioxfersize;
-       int devBlockSize = 0;
-       off_t bytesRemaining;
+       struct hfsmount *hfsmp;
+       off_t filesize;
+       off_t filebytes;
+       off_t start_resid = uio_resid(uio);
+       off_t offset = uio_offset(uio);
        int retval = 0;
        int retval = 0;
-       off_t filesize;
-       off_t filebytes;
 
        /* Preflight checks */
 
        /* Preflight checks */
-       if (vp->v_type != VREG && vp->v_type != VLNK)
-               return (EISDIR);        /* HFS can only read files */
-       if (uio->uio_resid == 0)
+       if (!vnode_isreg(vp)) {
+               /* can only read regular files */
+               if (vnode_isdir(vp))
+                       return (EISDIR);
+               else
+                       return (EPERM);
+       }
+       if (start_resid == 0)
                return (0);             /* Nothing left to do */
                return (0);             /* Nothing left to do */
-       if (uio->uio_offset < 0)
+       if (offset < 0)
                return (EINVAL);        /* cant read from a negative offset */
                return (EINVAL);        /* cant read from a negative offset */
+       
+#if HFS_COMPRESSION
+       if (VNODE_IS_RSRC(vp)) {
+               if (hfs_hides_rsrc(ap->a_context, VTOC(vp), 1)) { /* 1 == don't take the cnode lock */
+                       return 0;
+               }
+               /* otherwise read the resource fork normally */
+       } else {
+               int compressed = hfs_file_is_compressed(VTOC(vp), 1); /* 1 == don't take the cnode lock */
+               if (compressed) {
+                       retval = decmpfs_read_compressed(ap, &compressed, VTOCMP(vp));
+                       if (compressed) {
+                               if (retval == 0) {
+                                       /* successful read, update the access time */
+                                       VTOC(vp)->c_touch_acctime = TRUE;
+                                       
+                                       /* compressed files are not hot file candidates */
+                                       if (VTOHFS(vp)->hfc_stage == HFC_RECORDING) {
+                                               VTOF(vp)->ff_bytesread = 0;
+                                       }
+                               }
+                               return retval;
+                       }
+                       /* otherwise the file was converted back to a regular file while we were reading it */
+                       retval = 0;
+               }
+       }
+#endif /* HFS_COMPRESSION */
 
        cp = VTOC(vp);
        fp = VTOF(vp);
 
        cp = VTOC(vp);
        fp = VTOF(vp);
+       hfsmp = VTOHFS(vp);
+
+       /* Protect against a size change. */
+       hfs_lock_truncate(cp, 0);
+
        filesize = fp->ff_size;
        filesize = fp->ff_size;
-       filebytes = (off_t)fp->ff_blocks * (off_t)VTOVCB(vp)->blockSize;
-       if (uio->uio_offset > filesize) {
-               if ((!ISHFSPLUS(VTOVCB(vp))) && (uio->uio_offset > (off_t)MAXHFSFILESIZE))
-                       return (EFBIG);
-               else
-                       return (0);
+       filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize;
+       if (offset > filesize) {
+               if ((hfsmp->hfs_flags & HFS_STANDARD) &&
+                   (offset > (off_t)MAXHFSFILESIZE)) {
+                       retval = EFBIG;
+               }
+               goto exit;
        }
 
        }
 
-       VOP_DEVBLOCKSIZE(cp->c_devvp, &devBlockSize);
-
        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 12)) | DBG_FUNC_START,
        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 12)) | DBG_FUNC_START,
-               (int)uio->uio_offset, uio->uio_resid, (int)filesize, (int)filebytes, 0);
+               (int)uio_offset(uio), uio_resid(uio), (int)filesize, (int)filebytes, 0);
 
 
-       if (UBCISVALID(vp)) {
-               retval = cluster_read(vp, uio, filesize, devBlockSize, 0);
-       } else {
+       retval = cluster_read(vp, uio, filesize, ap->a_ioflag);
 
 
-               for (retval = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
-
-            if ((bytesRemaining = (filesize - uio->uio_offset)) <= 0)
-                break;
-
-            logBlockNo  = (daddr_t)(uio->uio_offset / PAGE_SIZE_64);
-            startOffset = (u_long) (uio->uio_offset & PAGE_MASK_64);
-            fragSize    = PAGE_SIZE;
-
-            if (((logBlockNo * PAGE_SIZE) + fragSize) < filesize)
-                ioxfersize = fragSize;
-            else {
-                ioxfersize = filesize - (logBlockNo * PAGE_SIZE);
-                ioxfersize = (ioxfersize + (devBlockSize - 1)) & ~(devBlockSize - 1);
-            }
-               moveSize = ioxfersize;
-               moveSize -= startOffset;
-
-            if (bytesRemaining < moveSize)
-                moveSize = bytesRemaining;
-
-            if (uio->uio_resid < moveSize) {
-                moveSize = uio->uio_resid;
-            };
-            if (moveSize == 0) {
-                break;
-            };
-
-            if (( uio->uio_offset + fragSize) >= filesize) {
-                retval = bread(vp, logBlockNo, ioxfersize, NOCRED, &bp);
-
-            } else if (logBlockNo - 1 == vp->v_lastr && !(vp->v_flag & VRAOFF)) {
-                daddr_t nextLogBlockNo = logBlockNo + 1;
-                int nextsize;
-
-                if (((nextLogBlockNo * PAGE_SIZE) +
-                     (daddr_t)fragSize) < filesize)
-                    nextsize = fragSize;
-                else {
-                    nextsize = filesize - (nextLogBlockNo * PAGE_SIZE);
-                    nextsize = (nextsize + (devBlockSize - 1)) & ~(devBlockSize - 1);
-                }
-                retval = breadn(vp, logBlockNo, ioxfersize, &nextLogBlockNo, &nextsize, 1, NOCRED, &bp);
-            } else {
-                retval = bread(vp, logBlockNo, ioxfersize, NOCRED, &bp);
-            };
-
-            if (retval != E_NONE) {
-                if (bp) {
-                    brelse(bp);
-                    bp = NULL;
-                }
-                break;
-            };
-            vp->v_lastr = logBlockNo;
-
-            /*
-             * We should only get non-zero b_resid when an I/O retval
-             * has occurred, which should cause us to break above.
-             * However, if the short read did not cause an retval,
-             * then we want to ensure that we do not uiomove bad
-             * or uninitialized data.
-             */
-            ioxfersize -= bp->b_resid;
-
-            if (ioxfersize < moveSize) {                       /* XXX PPD This should take the offset into account, too! */
-                if (ioxfersize == 0)
-                    break;
-                moveSize = ioxfersize;
-            }
-            if ((startOffset + moveSize) > bp->b_bcount)
-                panic("hfs_read: bad startOffset or moveSize\n");
-
-            if ((retval = uiomove((caddr_t)bp->b_data + startOffset, (int)moveSize, uio)))
-                break;
-
-            if (S_ISREG(cp->c_mode) &&
-                (((startOffset + moveSize) == fragSize) || (uio->uio_offset == filesize))) {
-                bp->b_flags |= B_AGE;
-            };
-
-            brelse(bp);
-            /* Start of loop resets bp to NULL before reaching outside this block... */
-        }
-
-               if (bp != NULL) {
-                       brelse(bp);
-               }
-       }
-
-       cp->c_flag |= C_ACCESS;
+       cp->c_touch_acctime = TRUE;
 
        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 12)) | DBG_FUNC_END,
 
        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 12)) | DBG_FUNC_END,
-               (int)uio->uio_offset, uio->uio_resid, (int)filesize,  (int)filebytes, 0);
+               (int)uio_offset(uio), uio_resid(uio), (int)filesize,  (int)filebytes, 0);
+
+       /*
+        * Keep track of blocks read
+        */
+       if (hfsmp->hfc_stage == HFC_RECORDING && retval == 0) {
+               int took_cnode_lock = 0;
+               off_t bytesread;
+
+               bytesread = start_resid - uio_resid(uio);
+
+               /* When ff_bytesread exceeds 32-bits, update it behind the cnode lock. */
+               if ((fp->ff_bytesread + bytesread) > 0x00000000ffffffff) {
+                       hfs_lock(cp, HFS_FORCE_LOCK);
+                       took_cnode_lock = 1;
+               }
+               /*
+                * If this file hasn't been seen since the start of
+                * the current sampling period then start over.
+                */
+               if (cp->c_atime < hfsmp->hfc_timebase) {
+                       struct timeval tv;
 
 
+                       fp->ff_bytesread = bytesread;
+                       microtime(&tv);
+                       cp->c_atime = tv.tv_sec;
+               } else {
+                       fp->ff_bytesread += bytesread;
+               }
+               if (took_cnode_lock)
+                       hfs_unlock(cp);
+       }
+exit:
+       hfs_unlock_truncate(cp, 0);
        return (retval);
 }
 
 /*
        return (retval);
 }
 
 /*
- * Write data to a file or directory.
-#% write       vp      L L L
-#
- vop_write {
-     IN struct vnode *vp;
-     INOUT struct uio *uio;
-     IN int ioflag;
-     IN struct ucred *cred;
-
-     */
+ * Write data to a file.
+ */
 int
 int
-hfs_write(ap)
-       struct vop_write_args /* {
-               struct vnode *a_vp;
-               struct uio *a_uio;
-               int a_ioflag;
-               struct ucred *a_cred;
-       } */ *ap;
+hfs_vnop_write(struct vnop_write_args *ap)
 {
 {
+       uio_t uio = ap->a_uio;
        struct vnode *vp = ap->a_vp;
        struct vnode *vp = ap->a_vp;
-       struct uio *uio = ap->a_uio;
        struct cnode *cp;
        struct filefork *fp;
        struct cnode *cp;
        struct filefork *fp;
-       struct buf *bp;
-       struct proc *p;
-       struct timeval tv;
-       ExtendedVCB *vcb;
-    int                                        devBlockSize = 0;
-    daddr_t                    logBlockNo;
-    long                               fragSize;
-    off_t                              origFileSize, currOffset, writelimit, bytesToAdd;
-    off_t                              actualBytesAdded;
-    u_long                             blkoffset, resid, xfersize, clearSize;
-    int                                        eflags, ioflag;
-    int                                retval;
-       off_t filebytes;
-       u_long fileblocks;
        struct hfsmount *hfsmp;
        struct hfsmount *hfsmp;
-       int started_tr = 0, grabbed_lock = 0;
+       kauth_cred_t cred = NULL;
+       off_t origFileSize;
+       off_t writelimit;
+       off_t bytesToAdd = 0;
+       off_t actualBytesAdded;
+       off_t filebytes;
+       off_t offset;
+       ssize_t resid;
+       int eflags;
+       int ioflag = ap->a_ioflag;
+       int retval = 0;
+       int lockflags;
+       int cnode_locked = 0;
+       int partialwrite = 0;
+       int exclusive_lock = 0;
+
+#if HFS_COMPRESSION
+       if ( hfs_file_is_compressed(VTOC(vp), 1) ) { /* 1 == don't take the cnode lock */
+               int state = decmpfs_cnode_get_vnode_state(VTOCMP(vp));
+               switch(state) {
+                       case FILE_IS_COMPRESSED:
+                               return EACCES;
+                       case FILE_IS_CONVERTING:
+                               /* if FILE_IS_CONVERTING, we allow writes */
+                               break;
+                       default:
+                               printf("invalid state %d for compressed file\n", state);
+                               /* fall through */
+               }
+       }
+#endif
 
 
-       ioflag = ap->a_ioflag;
+       // LP64todo - fix this! uio_resid may be 64-bit value
+       resid = uio_resid(uio);
+       offset = uio_offset(uio);
 
 
-       if (uio->uio_offset < 0)
+       if (ioflag & IO_APPEND) {
+           exclusive_lock = 1;
+       }
+       
+       if (offset < 0)
                return (EINVAL);
                return (EINVAL);
-       if (uio->uio_resid == 0)
+       if (resid == 0)
                return (E_NONE);
                return (E_NONE);
-       if (vp->v_type != VREG && vp->v_type != VLNK)
-               return (EISDIR);        /* Can only write files */
+       if (!vnode_isreg(vp))
+               return (EPERM);  /* Can only write regular files */
 
        cp = VTOC(vp);
        fp = VTOF(vp);
 
        cp = VTOC(vp);
        fp = VTOF(vp);
-       vcb = VTOVCB(vp);
-       fileblocks = fp->ff_blocks;
-       filebytes = (off_t)fileblocks * (off_t)vcb->blockSize;
-
-       if (ioflag & IO_APPEND)
-               uio->uio_offset = fp->ff_size;
-       if ((cp->c_flags & APPEND) && uio->uio_offset != fp->ff_size)
-               return (EPERM);
-
-       // XXXdbg - don't allow modification of the journal or journal_info_block
-       if (VTOHFS(vp)->jnl && cp->c_datafork) {
-               struct HFSPlusExtentDescriptor *extd;
+       hfsmp = VTOHFS(vp);
 
 
-               extd = &cp->c_datafork->ff_data.cf_extents[0];
-               if (extd->startBlock == VTOVCB(vp)->vcbJinfoBlock || extd->startBlock == VTOHFS(vp)->jnl_start) {
-                       return EPERM;
-               }
+       eflags = kEFDeferMask;  /* defer file block allocations */
+#ifdef HFS_SPARSE_DEV
+       /* 
+        * When the underlying device is sparse and space
+        * is low (< 8MB), stop doing delayed allocations
+        * and begin doing synchronous I/O.
+        */
+       if ((hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) &&
+           (hfs_freeblks(hfsmp, 0) < 2048)) {
+               eflags &= ~kEFDeferMask;
+               ioflag |= IO_SYNC;
        }
        }
+#endif /* HFS_SPARSE_DEV */
 
 
-       writelimit = uio->uio_offset + uio->uio_resid;
+again:
+       /* Protect against a size change. */
+       hfs_lock_truncate(cp, exclusive_lock);
 
 
-       /*
-        * Maybe this should be above the vnode op call, but so long as
-        * file servers have no limits, I don't think it matters.
-        */
-       p = uio->uio_procp;
-       if (vp->v_type == VREG && p &&
-           writelimit > p->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
-               psignal(p, SIGXFSZ);
-               return (EFBIG);
+       if (ioflag & IO_APPEND) {
+               uio_setoffset(uio, fp->ff_size);
+               offset = fp->ff_size;
+       }
+       if ((cp->c_flags & APPEND) && offset != fp->ff_size) {
+               retval = EPERM;
+               goto exit;
        }
        }
-       p = current_proc();
-
-       VOP_DEVBLOCKSIZE(cp->c_devvp, &devBlockSize);
 
 
-       resid = uio->uio_resid;
        origFileSize = fp->ff_size;
        origFileSize = fp->ff_size;
-       eflags = kEFDeferMask;  /* defer file block allocations */
-       filebytes = (off_t)fp->ff_blocks * (off_t)vcb->blockSize;
-
-       /*
-        * NOTE: In the following loop there are two positions tracked:
-        * currOffset is the current I/O starting offset.  currOffset
-        * is never >LEOF; the LEOF is nudged along with currOffset as
-        * data is zeroed or written. uio->uio_offset is the start of
-        * the current I/O operation.  It may be arbitrarily beyond
-        * currOffset.
-        *
-        * The following is true at all times:
-        *   currOffset <= LEOF <= uio->uio_offset <= writelimit
+       writelimit = offset + resid;
+       filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize;
+
+       /* If the truncate lock is shared, and if we either have virtual 
+        * blocks or will need to extend the file, upgrade the truncate lock
+        * to exclusive.  If the upgrade fails, we lose the lock and 
+        * have to get exclusive lock again.  Note that we want to
+        * grab the truncate lock exclusive even if we're not allocating new blocks
+        * because we could still be growing past the LEOF.
         */
         */
-       currOffset = MIN(uio->uio_offset, fp->ff_size);
+       if ((exclusive_lock == 0) && 
+           ((fp->ff_unallocblocks != 0) || (writelimit > origFileSize))) {
+               exclusive_lock = 1;
+               /* Lock upgrade failed and we lost our shared lock, try again */
+               if (lck_rw_lock_shared_to_exclusive(&cp->c_truncatelock) == FALSE) {
+                       goto again;
+               } 
+       }
+
+       if ( (retval = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK))) {
+               goto exit;
+       }
+       cnode_locked = 1;
+       
+       if (!exclusive_lock) {
+               KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 0)) | DBG_FUNC_START,
+                            (int)offset, uio_resid(uio), (int)fp->ff_size,
+                            (int)filebytes, 0);
+       }
 
 
-       KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 0)) | DBG_FUNC_START,
-               (int)uio->uio_offset, uio->uio_resid, (int)fp->ff_size, (int)filebytes, 0);
-       retval = 0;
+       /* Check if we do not need to extend the file */
+       if (writelimit <= filebytes) {
+               goto sizeok;
+       }
 
 
-       /* Now test if we need to extend the file */
-       /* Doing so will adjust the filebytes for us */
+       cred = vfs_context_ucred(ap->a_context);
+       bytesToAdd = writelimit - filebytes;
 
 #if QUOTA
 
 #if QUOTA
-       if(writelimit > filebytes) {
-               bytesToAdd = writelimit - filebytes;
-
-               retval = hfs_chkdq(cp, (int64_t)(roundup(bytesToAdd, vcb->blockSize)), 
-                                  ap->a_cred, 0);
-               if (retval)
-                       return (retval);
-       }
+       retval = hfs_chkdq(cp, (int64_t)(roundup(bytesToAdd, hfsmp->blockSize)), 
+                          cred, 0);
+       if (retval)
+               goto exit;
 #endif /* QUOTA */
 
 #endif /* QUOTA */
 
-       hfsmp = VTOHFS(vp);
-       if (writelimit > filebytes) {
-               hfs_global_shared_lock_acquire(hfsmp);
-               grabbed_lock = 1;
-       }
-       if (hfsmp->jnl && (writelimit > filebytes)) {
-               if (journal_start_transaction(hfsmp->jnl) != 0) {
-                       hfs_global_shared_lock_release(hfsmp);
-                       return EINVAL;
-               }
-               started_tr = 1;
+       if (hfs_start_transaction(hfsmp) != 0) {
+               retval = EINVAL;
+               goto exit;
        }
 
        while (writelimit > filebytes) {
        }
 
        while (writelimit > filebytes) {
-       
                bytesToAdd = writelimit - filebytes;
                bytesToAdd = writelimit - filebytes;
-               if (suser(ap->a_cred, NULL) != 0)
+               if (cred && suser(cred, NULL) != 0)
                        eflags |= kEFReserveMask;
 
                        eflags |= kEFReserveMask;
 
-               /* lock extents b-tree (also protects volume bitmap) */
-               retval = hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, LK_EXCLUSIVE, current_proc());
-               if (retval != E_NONE)
-                       break;
-
-               retval = MacToVFSError(ExtendFileC (vcb, (FCB*)fp, bytesToAdd,
+               /* Protect extents b-tree and allocation bitmap */
+               lockflags = SFL_BITMAP;
+               if (overflow_extents(fp))
+                       lockflags |= SFL_EXTENTS;
+               lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
+       
+               /* Files that are changing size are not hot file candidates. */
+               if (hfsmp->hfc_stage == HFC_RECORDING) {
+                       fp->ff_bytesread = 0;
+               }
+               retval = MacToVFSError(ExtendFileC (hfsmp, (FCB*)fp, bytesToAdd,
                                0, eflags, &actualBytesAdded));
 
                                0, eflags, &actualBytesAdded));
 
-               (void) hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, LK_RELEASE, p);
+               hfs_systemfile_unlock(hfsmp, lockflags);
+
                if ((actualBytesAdded == 0) && (retval == E_NONE))
                        retval = ENOSPC;
                if (retval != E_NONE)
                        break;
                if ((actualBytesAdded == 0) && (retval == E_NONE))
                        retval = ENOSPC;
                if (retval != E_NONE)
                        break;
-               filebytes = (off_t)fp->ff_blocks * (off_t)vcb->blockSize;
+               filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize;
                KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 0)) | DBG_FUNC_NONE,
                KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 0)) | DBG_FUNC_NONE,
-                       (int)uio->uio_offset, uio->uio_resid, (int)fp->ff_size,  (int)filebytes, 0);
+                       (int)offset, uio_resid(uio), (int)fp->ff_size,  (int)filebytes, 0);
        }
        }
+       (void) hfs_update(vp, TRUE);
+       (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
+       (void) hfs_end_transaction(hfsmp);
 
 
-       // XXXdbg
-       if (started_tr) {
-               hfs_flushvolumeheader(hfsmp, MNT_NOWAIT, 0);
-               journal_end_transaction(hfsmp->jnl);
-               started_tr = 0;
-       }
-       if (grabbed_lock) {
-               hfs_global_shared_lock_release(hfsmp);
-               grabbed_lock = 0;
+       /*
+        * If we didn't grow the file enough try a partial write.
+        * POSIX expects this behavior.
+        */
+       if ((retval == ENOSPC) && (filebytes > offset)) {
+               retval = 0;
+               partialwrite = 1;
+               uio_setresid(uio, (uio_resid(uio) - bytesToAdd));
+               resid -= bytesToAdd;
+               writelimit = filebytes;
        }
        }
-
-       if (UBCISVALID(vp) && retval == E_NONE) {
+sizeok:
+       if (retval == E_NONE) {
                off_t filesize;
                off_t zero_off;
                off_t tail_off;
                off_t inval_start;
                off_t inval_end;
                off_t filesize;
                off_t zero_off;
                off_t tail_off;
                off_t inval_start;
                off_t inval_end;
-               off_t io_start, io_end;
+               off_t io_start;
                int lflag;
                struct rl_entry *invalid_range;
 
                int lflag;
                struct rl_entry *invalid_range;
 
@@ -418,17 +412,19 @@ hfs_write(ap)
                else
                        filesize = fp->ff_size;
 
                else
                        filesize = fp->ff_size;
 
-               lflag = (ioflag & IO_SYNC);
+               lflag = ioflag & ~(IO_TAILZEROFILL | IO_HEADZEROFILL | IO_NOZEROVALID | IO_NOZERODIRTY);
 
 
-               if (uio->uio_offset <= fp->ff_size) {
-                       zero_off = uio->uio_offset & ~PAGE_MASK_64;
+               if (offset <= fp->ff_size) {
+                       zero_off = offset & ~PAGE_MASK_64;
                        
                        /* Check to see whether the area between the zero_offset and the start
                           of the transfer to see whether is invalid and should be zero-filled
                           as part of the transfer:
                         */
                        
                        /* Check to see whether the area between the zero_offset and the start
                           of the transfer to see whether is invalid and should be zero-filled
                           as part of the transfer:
                         */
-                       if (rl_scan(&fp->ff_invalidranges, zero_off, uio->uio_offset - 1, &invalid_range) != RL_NOOVERLAP)
-                               lflag |= IO_HEADZEROFILL;
+                       if (offset > zero_off) {
+                               if (rl_scan(&fp->ff_invalidranges, zero_off, offset - 1, &invalid_range) != RL_NOOVERLAP)
+                                       lflag |= IO_HEADZEROFILL;
+                       }
                } else {
                        off_t eof_page_base = fp->ff_size & ~PAGE_MASK_64;
                        
                } else {
                        off_t eof_page_base = fp->ff_size & ~PAGE_MASK_64;
                        
@@ -445,7 +441,7 @@ hfs_write(ap)
                           will be handled by the cluster_write of the actual data.
                         */
                        inval_start = (fp->ff_size + (PAGE_SIZE_64 - 1)) & ~PAGE_MASK_64;
                           will be handled by the cluster_write of the actual data.
                         */
                        inval_start = (fp->ff_size + (PAGE_SIZE_64 - 1)) & ~PAGE_MASK_64;
-                       inval_end = uio->uio_offset & ~PAGE_MASK_64;
+                       inval_end = offset & ~PAGE_MASK_64;
                        zero_off = fp->ff_size;
                        
                        if ((fp->ff_size & PAGE_MASK_64) &&
                        zero_off = fp->ff_size;
                        
                        if ((fp->ff_size & PAGE_MASK_64) &&
@@ -469,6 +465,7 @@ hfs_write(ap)
                        };
                        
                        if (inval_start < inval_end) {
                        };
                        
                        if (inval_start < inval_end) {
+                               struct timeval tv;
                                /* There's some range of data that's going to be marked invalid */
                                
                                if (zero_off < inval_start) {
                                /* There's some range of data that's going to be marked invalid */
                                
                                if (zero_off < inval_start) {
@@ -476,20 +473,26 @@ hfs_write(ap)
                                           and the actual write will start on a page past inval_end.  Now's the last
                                           chance to zero-fill the page containing the EOF:
                                         */
                                           and the actual write will start on a page past inval_end.  Now's the last
                                           chance to zero-fill the page containing the EOF:
                                         */
-                                       retval = cluster_write(vp, (struct uio *) 0,
+                                       hfs_unlock(cp);
+                                       cnode_locked = 0;
+                                       retval = cluster_write(vp, (uio_t) 0,
                                                        fp->ff_size, inval_start,
                                                        fp->ff_size, inval_start,
-                                                       zero_off, (off_t)0, devBlockSize,
+                                                       zero_off, (off_t)0,
                                                        lflag | IO_HEADZEROFILL | IO_NOZERODIRTY);
                                                        lflag | IO_HEADZEROFILL | IO_NOZERODIRTY);
+                                       hfs_lock(cp, HFS_FORCE_LOCK);
+                                       cnode_locked = 1;
                                        if (retval) goto ioerr_exit;
                                        if (retval) goto ioerr_exit;
+                                       offset = uio_offset(uio);
                                };
                                
                                /* Mark the remaining area of the newly allocated space as invalid: */
                                rl_add(inval_start, inval_end - 1 , &fp->ff_invalidranges);
                                };
                                
                                /* Mark the remaining area of the newly allocated space as invalid: */
                                rl_add(inval_start, inval_end - 1 , &fp->ff_invalidranges);
-                               cp->c_zftimeout = time.tv_sec + ZFTIMELIMIT;
+                               microuptime(&tv);
+                               cp->c_zftimeout = tv.tv_sec + ZFTIMELIMIT;
                                zero_off = fp->ff_size = inval_end;
                        };
                        
                                zero_off = fp->ff_size = inval_end;
                        };
                        
-                       if (uio->uio_offset > zero_off) lflag |= IO_HEADZEROFILL;
+                       if (offset > zero_off) lflag |= IO_HEADZEROFILL;
                };
 
                /* Check to see whether the area between the end of the write and the end of
                };
 
                /* Check to see whether the area between the end of the write and the end of
@@ -513,118 +516,76 @@ hfs_write(ap)
                 *       made readable (removed from the invalid ranges) before cluster_write
                 *       tries to write it:
                 */
                 *       made readable (removed from the invalid ranges) before cluster_write
                 *       tries to write it:
                 */
-               io_start = (lflag & IO_HEADZEROFILL) ? zero_off : uio->uio_offset;
-               io_end = (lflag & IO_TAILZEROFILL) ? tail_off : writelimit;
+               io_start = (lflag & IO_HEADZEROFILL) ? zero_off : offset;
                if (io_start < fp->ff_size) {
                if (io_start < fp->ff_size) {
+                       off_t io_end;
+
+                       io_end = (lflag & IO_TAILZEROFILL) ? tail_off : writelimit;
                        rl_remove(io_start, io_end - 1, &fp->ff_invalidranges);
                };
                        rl_remove(io_start, io_end - 1, &fp->ff_invalidranges);
                };
-               retval = cluster_write(vp, uio, fp->ff_size, filesize, zero_off,
-                               tail_off, devBlockSize, lflag | IO_NOZERODIRTY);
-                               
-               if (uio->uio_offset > fp->ff_size) {
-                       fp->ff_size = uio->uio_offset;
 
 
-                       ubc_setsize(vp, fp->ff_size);       /* XXX check errors */
+               hfs_unlock(cp);
+               cnode_locked = 0;
+               
+               /*
+                * We need to tell UBC the fork's new size BEFORE calling
+                * cluster_write, in case any of the new pages need to be
+                * paged out before cluster_write completes (which does happen
+                * in embedded systems due to extreme memory pressure).
+                * Similarly, we need to tell hfs_vnop_pageout what the new EOF
+                * will be, so that it can pass that on to cluster_pageout, and
+                * allow those pageouts.
+                *
+                * We don't update ff_size yet since we don't want pageins to
+                * be able to see uninitialized data between the old and new
+                * EOF, until cluster_write has completed and initialized that
+                * part of the file.
+                *
+                * The vnode pager relies on the file size last given to UBC via
+                * ubc_setsize.  hfs_vnop_pageout relies on fp->ff_new_size or
+                * ff_size (whichever is larger).  NOTE: ff_new_size is always
+                * zero, unless we are extending the file via write.
+                */
+               if (filesize > fp->ff_size) {
+                       fp->ff_new_size = filesize;
+                       ubc_setsize(vp, filesize);
                }
                }
-               if (resid > uio->uio_resid)
-                       cp->c_flag |= C_CHANGE | C_UPDATE;
-       } else {
-               while (retval == E_NONE && uio->uio_resid > 0) {
-                       logBlockNo = currOffset / PAGE_SIZE;
-                       blkoffset  = currOffset & PAGE_MASK;
-
-                       if ((filebytes - currOffset) < PAGE_SIZE_64)
-                               fragSize = filebytes - ((off_t)logBlockNo * PAGE_SIZE_64);
-                       else
-                               fragSize = PAGE_SIZE;
-                       xfersize = fragSize - blkoffset;
-       
-                       /* Make any adjustments for boundary conditions */
-                       if (currOffset + (off_t)xfersize > writelimit)
-                               xfersize = writelimit - currOffset;
-        
-                       /*
-                        * There is no need to read into bp if:
-                        * We start on a block boundary and will overwrite the whole block
-                        *
-                        *                                              OR
-                        */
-                       if ((blkoffset == 0) && (xfersize >= fragSize)) {
-                               bp = getblk(vp, logBlockNo, fragSize, 0, 0, BLK_READ);
-                               retval = 0;
-       
-                               if (bp->b_blkno == -1) {
-                                       brelse(bp);
-                                       retval = EIO;           /* XXX */
-                                       break;
-                               }
-                       } else {
-       
-                               if (currOffset == fp->ff_size && blkoffset == 0) {
-                                       bp = getblk(vp, logBlockNo, fragSize, 0, 0, BLK_READ);
-                                       retval = 0;
-                                       if (bp->b_blkno == -1) {
-                                               brelse(bp);
-                                               retval = EIO;           /* XXX */
-                                               break;
-                                       }
-                               } else {
-                                       /*
-                                        * This I/O transfer is not sufficiently aligned,
-                                        * so read the affected block into a buffer:
-                                        */
-                                       retval = bread(vp, logBlockNo, fragSize, ap->a_cred, &bp);
-                                       if (retval != E_NONE) {
-                                               if (bp)
-                                               brelse(bp);
-                                               break;
-                                       }
-                               }
-                       }
-       
-                       /* See if we are starting to write within file boundaries:
-                        * If not, then we need to present a "hole" for the area
-                        * between the current EOF and the start of the current
-                        * I/O operation:
-                        *
-                        * Note that currOffset is only less than uio_offset if
-                        * uio_offset > LEOF...
-                        */
-                       if (uio->uio_offset > currOffset) {
-                               clearSize = MIN(uio->uio_offset - currOffset, xfersize);
-                               bzero(bp->b_data + blkoffset, clearSize);
-                               currOffset += clearSize;
-                               blkoffset += clearSize;
-                               xfersize -= clearSize;
-                       }
-       
-                       if (xfersize > 0) {
-                               retval = uiomove((caddr_t)bp->b_data + blkoffset, (int)xfersize, uio);
-                               currOffset += xfersize;
-                       }
-       
-                       if (ioflag & IO_SYNC) {
-                               (void)VOP_BWRITE(bp);
-                       } else if ((xfersize + blkoffset) == fragSize) {
-                               bp->b_flags |= B_AGE;
-                               bawrite(bp);
-                       } else {
-                               bdwrite(bp);
+               retval = cluster_write(vp, uio, fp->ff_size, filesize, zero_off,
+                               tail_off, lflag | IO_NOZERODIRTY);
+               if (retval) {
+                       fp->ff_new_size = 0;    /* no longer extending; use ff_size */
+                       if (filesize > origFileSize) {
+                               ubc_setsize(vp, origFileSize);
                        }
                        }
-       
-                       /* Update the EOF if we just extended the file
-                        * (the PEOF has already been moved out and the
-                        * block mapping table has been updated):
-                        */
-                       if (currOffset > fp->ff_size) {
-                               fp->ff_size = currOffset;
-                               if (UBCISVALID(vp))
-                                       ubc_setsize(vp, fp->ff_size); /* XXX check errors */
+                       goto ioerr_exit;
+               }
+               
+               if (filesize > origFileSize) {
+                       fp->ff_size = filesize;
+                       
+                       /* Files that are changing size are not hot file candidates. */
+                       if (hfsmp->hfc_stage == HFC_RECORDING) {
+                               fp->ff_bytesread = 0;
                        }
                        }
-                       if (retval || (resid == 0))
-                               break;
-                       cp->c_flag |= C_CHANGE | C_UPDATE;
-               } /* endwhile */
+               }
+               fp->ff_new_size = 0;    /* ff_size now has the correct size */
+               
+               /* If we wrote some bytes, then touch the change and mod times */
+               if (resid > uio_resid(uio)) {
+                       cp->c_touch_chgtime = TRUE;
+                       cp->c_touch_modtime = TRUE;
+               }
+       }
+       if (partialwrite) {
+               uio_setresid(uio, (uio_resid(uio) + bytesToAdd));
+               resid += bytesToAdd;
+       }
+
+       // XXXdbg - see radar 4871353 for more info
+       {
+           if (flush_cache_on_write && ((ioflag & IO_NOCACHE) || vnode_isnocache(vp))) {
+               VNOP_IOCTL(hfsmp->hfs_devvp, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, NULL);
+           }
        }
 
 ioerr_exit:
        }
 
 ioerr_exit:
@@ -633,1406 +594,3511 @@ ioerr_exit:
         * we clear the setuid and setgid bits as a precaution against
         * tampering.
         */
         * we clear the setuid and setgid bits as a precaution against
         * tampering.
         */
-       if (resid > uio->uio_resid && ap->a_cred && ap->a_cred->cr_uid != 0)
-               cp->c_mode &= ~(S_ISUID | S_ISGID);
-
+       if (cp->c_mode & (S_ISUID | S_ISGID)) {
+               cred = vfs_context_ucred(ap->a_context);
+               if (resid > uio_resid(uio) && cred && suser(cred, NULL)) {
+                       if (!cnode_locked) {
+                               hfs_lock(cp, HFS_FORCE_LOCK);
+                               cnode_locked = 1;
+                       }
+                       cp->c_mode &= ~(S_ISUID | S_ISGID);
+               }
+       }
        if (retval) {
                if (ioflag & IO_UNIT) {
        if (retval) {
                if (ioflag & IO_UNIT) {
-                       (void)VOP_TRUNCATE(vp, origFileSize,
-                               ioflag & IO_SYNC, ap->a_cred, uio->uio_procp);
-                       uio->uio_offset -= resid - uio->uio_resid;
-                       uio->uio_resid = resid;
-                       filebytes = (off_t)fp->ff_blocks * (off_t)vcb->blockSize;
+                       if (!cnode_locked) {
+                               hfs_lock(cp, HFS_FORCE_LOCK);
+                               cnode_locked = 1;
+                       }
+                       (void)hfs_truncate(vp, origFileSize, ioflag & IO_SYNC,
+                                          0, 0, ap->a_context);
+                       // LP64todo - fix this!  resid needs to be user_ssize_t
+                       uio_setoffset(uio, (uio_offset(uio) - (resid - uio_resid(uio))));
+                       uio_setresid(uio, resid);
+                       filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize;
                }
                }
-       } else if (resid > uio->uio_resid && (ioflag & IO_SYNC)) {
-               tv = time;
-               retval = VOP_UPDATE(vp, &tv, &tv, 1);
+       } else if ((ioflag & IO_SYNC) && (resid > uio_resid(uio))) {
+               if (!cnode_locked) {
+                       hfs_lock(cp, HFS_FORCE_LOCK);
+                       cnode_locked = 1;
+               }
+               retval = hfs_update(vp, TRUE);
        }
        }
+       /* Updating vcbWrCnt doesn't need to be atomic. */
+       hfsmp->vcbWrCnt++;
 
        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 0)) | DBG_FUNC_END,
 
        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 0)) | DBG_FUNC_END,
-               (int)uio->uio_offset, uio->uio_resid, (int)fp->ff_size, (int)filebytes, 0);
-
+               (int)uio_offset(uio), uio_resid(uio), (int)fp->ff_size, (int)filebytes, 0);
+exit:
+       if (cnode_locked)
+               hfs_unlock(cp);
+       hfs_unlock_truncate(cp, exclusive_lock);
        return (retval);
 }
 
        return (retval);
 }
 
+/* support for the "bulk-access" fcntl */
 
 
-/*
-
-#% ioctl       vp      U U U
-#
- vop_ioctl {
-     IN struct vnode *vp;
-     IN u_long command;
-     IN caddr_t data;
-     IN int fflag;
-     IN struct ucred *cred;
-     IN struct proc *p;
-
-     */
+#define CACHE_LEVELS 16
+#define NUM_CACHE_ENTRIES (64*16)
+#define PARENT_IDS_FLAG 0x100
 
 
+struct access_cache {
+       int numcached;
+       int cachehits; /* these two for statistics gathering */
+       int lookups;
+       unsigned int *acache;
+       unsigned char *haveaccess;
+};
 
 
-/* ARGSUSED */
-int
-hfs_ioctl(ap)
-       struct vop_ioctl_args /* {
-               struct vnode *a_vp;
-               int  a_command;
-               caddr_t  a_data;
-               int  a_fflag;
-               struct ucred *a_cred;
-               struct proc *a_p;
-       } */ *ap;
-{
-       switch (ap->a_command) {
-       case 1: {
-               register struct cnode *cp;
-               register struct vnode *vp;
-               register struct radvisory *ra;
-               struct filefork *fp;
-               int devBlockSize = 0;
-               int error;
+struct access_t {
+       uid_t     uid;              /* IN: effective user id */
+       short     flags;            /* IN: access requested (i.e. R_OK) */
+       short     num_groups;       /* IN: number of groups user belongs to */
+       int       num_files;        /* IN: number of files to process */
+       int       *file_ids;        /* IN: array of file ids */
+       gid_t     *groups;          /* IN: array of groups */
+       short     *access;          /* OUT: access info for each file (0 for 'has access') */
+} __attribute__((unavailable)); // this structure is for reference purposes only
+
+struct user32_access_t {
+       uid_t     uid;              /* IN: effective user id */
+       short     flags;            /* IN: access requested (i.e. R_OK) */
+       short     num_groups;       /* IN: number of groups user belongs to */
+       int       num_files;        /* IN: number of files to process */
+       user32_addr_t      file_ids;        /* IN: array of file ids */
+       user32_addr_t      groups;          /* IN: array of groups */
+       user32_addr_t      access;          /* OUT: access info for each file (0 for 'has access') */
+};
 
 
-               vp = ap->a_vp;
+struct user64_access_t {
+       uid_t           uid;                    /* IN: effective user id */
+       short           flags;                  /* IN: access requested (i.e. R_OK) */
+       short           num_groups;             /* IN: number of groups user belongs to */
+       int             num_files;              /* IN: number of files to process */
+       user64_addr_t   file_ids;               /* IN: array of file ids */
+       user64_addr_t   groups;                 /* IN: array of groups */
+       user64_addr_t   access;                 /* OUT: access info for each file (0 for 'has access') */
+};
 
 
-               if (vp->v_type != VREG)
-                       return EINVAL;
-               VOP_LEASE(vp, ap->a_p, ap->a_cred, LEASE_READ);
-               error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, ap->a_p);
-               if (error)
-                       return (error);
 
 
-               ra = (struct radvisory *)(ap->a_data);
-               cp = VTOC(vp);
-               fp = VTOF(vp);
+// These are the "extended" versions of the above structures.
+// Note that it is crucial that they be sized differently from
+// the regular versions.
+struct ext_access_t {
+       uint32_t   flags;           /* IN: access requested (i.e. R_OK) */
+       uint32_t   num_files;       /* IN: number of files to process */
+       uint32_t   map_size;        /* IN: size of the bit map */
+       uint32_t  *file_ids;        /* IN: Array of file ids */
+       char      *bitmap;          /* OUT: hash-bitmap of interesting directory ids */
+       short     *access;          /* OUT: access info for each file (0 for 'has access') */
+       uint32_t   num_parents;   /* future use */
+       cnid_t      *parents;   /* future use */
+} __attribute__((unavailable)); // this structure is for reference purposes only
+
+struct user32_ext_access_t {
+       uint32_t   flags;           /* IN: access requested (i.e. R_OK) */
+       uint32_t   num_files;       /* IN: number of files to process */
+       uint32_t   map_size;        /* IN: size of the bit map */
+       user32_addr_t  file_ids;        /* IN: Array of file ids */
+       user32_addr_t     bitmap;          /* OUT: hash-bitmap of interesting directory ids */
+       user32_addr_t access;          /* OUT: access info for each file (0 for 'has access') */
+       uint32_t   num_parents;   /* future use */
+       user32_addr_t parents;   /* future use */
+};
 
 
-               if (ra->ra_offset >= fp->ff_size) {
-                       VOP_UNLOCK(vp, 0, ap->a_p);
-                       return (EFBIG);
-               }
-               VOP_DEVBLOCKSIZE(cp->c_devvp, &devBlockSize);
+struct user64_ext_access_t {
+       uint32_t      flags;        /* IN: access requested (i.e. R_OK) */
+       uint32_t      num_files;    /* IN: number of files to process */
+       uint32_t      map_size;     /* IN: size of the bit map */
+       user64_addr_t   file_ids;     /* IN: array of file ids */
+       user64_addr_t   bitmap;       /* OUT: hash-bitmap of interesting directory ids */
+       user64_addr_t   access;       /* OUT: access info for each file (0 for 'has access') */
+       uint32_t      num_parents;/* future use */
+       user64_addr_t   parents;/* future use */
+};
 
 
-               error = advisory_read(vp, fp->ff_size, ra->ra_offset, ra->ra_count, devBlockSize);
-               VOP_UNLOCK(vp, 0, ap->a_p);
 
 
-               return (error);
+/*
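+ * Perform a binary search for the given parent_id in array[0..hi]
+ * (hi is the index of the last entry to search, not a count).  The
+ * return value is the index of the match, or -1 if there is none.
+ * If no_match_indexp is non-NULL it will be assigned the index at
+ * which the search ended, which callers use as the position to insert
+ * the item (it is assigned whether or not a match was found).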
+ */
+static int cache_binSearch(cnid_t *array, unsigned int hi, cnid_t parent_id, int *no_match_indexp)
+{
+    int index=-1;
+    unsigned int lo=0;
+       
+    do {
+       unsigned int mid = ((hi - lo)/2) + lo;
+       unsigned int this_id = array[mid];
+               
+       if (parent_id == this_id) {
+           hi = mid;
+           break;
        }
        }
+               
+       if (parent_id < this_id) {
+           hi = mid;
+           continue;
+       }
+               
+       if (parent_id > this_id) {
+           lo = mid + 1;
+           continue;
+       }
+    } while(lo < hi);
 
 
-        case 2: /* F_READBOOTBLOCKS */
-        case 3: /* F_WRITEBOOTBLOCKS */
-            {
-           struct vnode *vp = ap->a_vp;
-           struct vnode *devvp = NULL;
-           struct fbootstraptransfer *btd = (struct fbootstraptransfer *)ap->a_data;
-           int devBlockSize;
-           int error;
-           struct iovec aiov;
-           struct uio auio;
-           u_long blockNumber;
-           u_long blockOffset;
-           u_long xfersize;
-           struct buf *bp;
-
-            if ((vp->v_flag & VROOT) == 0) return EINVAL;
-            if (btd->fbt_offset + btd->fbt_length > 1024) return EINVAL;
-           
-           devvp = VTOHFS(vp)->hfs_devvp;
-           aiov.iov_base = btd->fbt_buffer;
-           aiov.iov_len = btd->fbt_length;
-           
-           auio.uio_iov = &aiov;
-           auio.uio_iovcnt = 1;
-           auio.uio_offset = btd->fbt_offset;
-           auio.uio_resid = btd->fbt_length;
-           auio.uio_segflg = UIO_USERSPACE;
-           auio.uio_rw = (ap->a_command == 3) ? UIO_WRITE : UIO_READ; /* F_WRITEBOOTSTRAP / F_READBOOTSTRAP */
-           auio.uio_procp = ap->a_p;
-
-           VOP_DEVBLOCKSIZE(devvp, &devBlockSize);
-
-           while (auio.uio_resid > 0) {
-             blockNumber = auio.uio_offset / devBlockSize;
-             error = bread(devvp, blockNumber, devBlockSize, ap->a_cred, &bp);
-             if (error) {
-                  if (bp) brelse(bp);
-                  return error;
-                };
-
-                blockOffset = auio.uio_offset % devBlockSize;
-             xfersize = devBlockSize - blockOffset;
-             error = uiomove((caddr_t)bp->b_data + blockOffset, (int)xfersize, &auio);
-                if (error) {
-                  brelse(bp);
-                  return error;
-                };
-                if (auio.uio_rw == UIO_WRITE) {
-                  error = VOP_BWRITE(bp);
-                  if (error) return error;
-                } else {
-                  brelse(bp);
-                };
-            };
-        };
-        return 0;
-
-        case _IOC(IOC_OUT,'h', 4, 0):     /* Create date in local time */
-            {
-            *(time_t *)(ap->a_data) = to_bsd_time(VTOVCB(ap->a_vp)->localCreateDate);
-            return 0;
-            }
-
-        default:
-            return (ENOTTY);
+    /* check if lo and hi converged on the match */
+    if (parent_id == array[hi]) {
+       index = hi;
+    }
+       
+    if (no_match_indexp) {
+       *no_match_indexp = hi;
     }
 
     }
 
-    /* Should never get here */
-       return 0;
+    return index;
 }
 }
-
-/* ARGSUSED */
-int
-hfs_select(ap)
-       struct vop_select_args /* {
-               struct vnode *a_vp;
-               int  a_which;
-               int  a_fflags;
-               struct ucred *a_cred;
-               void *a_wql;
-               struct proc *a_p;
-       } */ *ap;
+static int
+lookup_bucket(struct access_cache *cache, int *indexp, cnid_t parent_id)
 {
 {
-       /*
-        * We should really check to see if I/O is possible.
-        */
-       return (1);
+    unsigned int hi;
+    int matches = 0;
+    int index, no_match_index;
+       
+    if (cache->numcached == 0) {
+       *indexp = 0;
+       return 0; // table is empty, so insert at index=0 and report no match
+    }
+       
+    if (cache->numcached > NUM_CACHE_ENTRIES) {
+       /*printf("hfs: EGAD! numcached is %d... cut our losses and trim to %d\n",
+         cache->numcached, NUM_CACHE_ENTRIES);*/
+       cache->numcached = NUM_CACHE_ENTRIES;
+    }
+       
+    hi = cache->numcached - 1;
+       
+    index = cache_binSearch(cache->acache, hi, parent_id, &no_match_index);
+
+    /* if no existing entry found, find index for new one */
+    if (index == -1) {
+       index = no_match_index;
+       matches = 0;
+    } else {
+       matches = 1;
+    }
+       
+    *indexp = index;
+    return matches;
 }
 
 /*
 }
 
 /*
- * Bmap converts a the logical block number of a file to its physical block
- * number on the disk.
- */
-
-/*
- * vp  - address of vnode file the file
- * bn  - which logical block to convert to a physical block number.
- * vpp - returns the vnode for the block special file holding the filesystem
- *      containing the file of interest
- * bnp - address of where to return the filesystem physical block number
-#% bmap                vp      L L L
-#% bmap                vpp     - U -
-#
- vop_bmap {
-     IN struct vnode *vp;
-     IN daddr_t bn;
-     OUT struct vnode **vpp;
-     IN daddr_t *bnp;
-     OUT int *runp;
-     */
-/*
- * Converts a logical block number to a physical block, and optionally returns
- * the amount of remaining blocks in a run. The logical block is based on hfsNode.logBlockSize.
- * The physical block number is based on the device block size, currently its 512.
- * The block run is returned in logical blocks, and is the REMAINING amount of blocks
+ * Add a node to the access_cache at the given index (or do a lookup first
+ * to find the index if -1 is passed in). We currently do a replace rather
+ * than an insert if the cache is full.
  */
  */
-
-int
-hfs_bmap(ap)
-       struct vop_bmap_args /* {
-               struct vnode *a_vp;
-               daddr_t a_bn;
-               struct vnode **a_vpp;
-               daddr_t *a_bnp;
-               int *a_runp;
-       } */ *ap;
+static void
+add_node(struct access_cache *cache, int index, cnid_t nodeID, int access)
 {
 {
-       struct vnode *vp = ap->a_vp;
-       struct cnode *cp = VTOC(vp);
-       struct filefork *fp = VTOF(vp);
-       struct hfsmount *hfsmp = VTOHFS(vp);
-   int                                 retval = E_NONE;
-    daddr_t                            logBlockSize;
-    size_t                             bytesContAvail = 0;
-    off_t blockposition;
-    struct proc                        *p = NULL;
-    int                                        lockExtBtree;
-    struct rl_entry *invalid_range;
-    enum rl_overlaptype overlaptype;
+    int lookup_index = -1;
+
+    /* need to do a lookup first if -1 passed for index */
+    if (index == -1) {
+       if (lookup_bucket(cache, &lookup_index, nodeID)) {
+           if (cache->haveaccess[lookup_index] != access && cache->haveaccess[lookup_index] == ESRCH) {
+               // only update an entry if the previous access was ESRCH (i.e. a scope checking error)
+               cache->haveaccess[lookup_index] = access;
+           }
 
 
-       /*
-        * Check for underlying vnode requests and ensure that logical
-        * to physical mapping is requested.
-        */
-       if (ap->a_vpp != NULL)
-               *ap->a_vpp = cp->c_devvp;
-       if (ap->a_bnp == NULL)
-               return (0);
+           /* mission accomplished */
+           return;
+       } else {
+           index = lookup_index;
+       }
 
 
-       /* Only clustered I/O should have delayed allocations. */
-       DBG_ASSERT(fp->ff_unallocblocks == 0);
+    }
 
 
-       logBlockSize = GetLogicalBlockSize(vp);
-       blockposition = (off_t)ap->a_bn * (off_t)logBlockSize;
+    /* if the cache is full, do a replace rather than an insert */
+    if (cache->numcached >= NUM_CACHE_ENTRIES) {
+       //printf("hfs: cache is full (%d). replace at index %d\n", cache->numcached, index);
+       cache->numcached = NUM_CACHE_ENTRIES-1;
 
 
-       lockExtBtree = overflow_extents(fp);
-       if (lockExtBtree) {
-               p = current_proc();
-               retval = hfs_metafilelocking(hfsmp, kHFSExtentsFileID,
-                               LK_EXCLUSIVE | LK_CANRECURSE, p);
-               if (retval)
-                       return (retval);
+       if (index > cache->numcached) {
+           //    printf("hfs: index %d pinned to %d\n", index, cache->numcached);
+           index = cache->numcached;
        }
        }
+    }
 
 
-       retval = MacToVFSError(
-                            MapFileBlockC (HFSTOVCB(hfsmp),
-                                            (FCB*)fp,
-                                            MAXPHYSIO,
-                                            blockposition,
-                                            ap->a_bnp,
-                                            &bytesContAvail));
+    if (index < cache->numcached && index < NUM_CACHE_ENTRIES && nodeID > cache->acache[index]) {
+       index++;
+    }
 
 
-    if (lockExtBtree) (void) hfs_metafilelocking(hfsmp, kHFSExtentsFileID, LK_RELEASE, p);
-
-    if (retval == E_NONE) {
-        /* Adjust the mapping information for invalid file ranges: */
-        overlaptype = rl_scan(&fp->ff_invalidranges,
-                            blockposition,
-                            blockposition + MAXPHYSIO - 1,
-                            &invalid_range);
-        if (overlaptype != RL_NOOVERLAP) {
-            switch(overlaptype) {
-                case RL_MATCHINGOVERLAP:
-                case RL_OVERLAPCONTAINSRANGE:
-                case RL_OVERLAPSTARTSBEFORE:
-                    /* There's no valid block for this byte offset: */
-                    *ap->a_bnp = (daddr_t)-1;
-                    bytesContAvail = invalid_range->rl_end + 1 - blockposition;
-                    break;
-                
-                case RL_OVERLAPISCONTAINED:
-                case RL_OVERLAPENDSAFTER:
-                    /* The range of interest hits an invalid block before the end: */
-                    if (invalid_range->rl_start == blockposition) {
-                       /* There's actually no valid information to be had starting here: */
-                       *ap->a_bnp = (daddr_t)-1;
-                                               if ((fp->ff_size > (invalid_range->rl_end + 1)) &&
-                                                       (invalid_range->rl_end + 1 - blockposition < bytesContAvail)) {
-                               bytesContAvail = invalid_range->rl_end + 1 - blockposition;
-                       };
-                    } else {
-                       bytesContAvail = invalid_range->rl_start - blockposition;
-                    };
-                    break;
-            };
-                       if (bytesContAvail > MAXPHYSIO) bytesContAvail = MAXPHYSIO;
-        };
-        
-        /* Figure out how many read ahead blocks there are */
-        if (ap->a_runp != NULL) {
-            if (can_cluster(logBlockSize)) {
-                /* Make sure this result never goes negative: */
-                *ap->a_runp = (bytesContAvail < logBlockSize) ? 0 : (bytesContAvail / logBlockSize) - 1;
-            } else {
-                *ap->a_runp = 0;
-            };
-        };
-    };
-
-    return (retval);
+    if (index >= 0 && index < cache->numcached) {
+       /* only do bcopy if we're inserting */
+       bcopy( cache->acache+index, cache->acache+(index+1), (cache->numcached - index)*sizeof(int) );
+       bcopy( cache->haveaccess+index, cache->haveaccess+(index+1), (cache->numcached - index)*sizeof(unsigned char) );
+    }
+
+    cache->acache[index] = nodeID;
+    cache->haveaccess[index] = access;
+    cache->numcached++;
 }
 
 }
 
-/* blktooff converts logical block number to file offset */
 
 
-int
-hfs_blktooff(ap)
-       struct vop_blktooff_args /* {
-               struct vnode *a_vp;
-               daddr_t a_lblkno;  
-               off_t *a_offset;
-       } */ *ap;
-{      
-       if (ap->a_vp == NULL)
-               return (EINVAL);
-       *ap->a_offset = (off_t)ap->a_lblkno * PAGE_SIZE_64;
+struct cinfo {          /* attribute snapshot that snoop_callback() fills in from a cat_desc/cat_attr pair */
+    uid_t   uid;        /* copied from cat_attr.ca_uid */
+    gid_t   gid;        /* copied from cat_attr.ca_gid */
+    mode_t  mode;       /* copied from cat_attr.ca_mode */
+    cnid_t  parentcnid; /* copied from cat_desc.cd_parentcnid */
+    u_int16_t recflags; /* copied from cat_attr.ca_recflags */
+};
 
 
-       return(0);
-}
+/*
+ * Callback handed to hfs_chash_snoop(): copy the uid, gid, mode, record
+ * flags and parent id of the entry being snooped into the struct cinfo
+ * that 'arg' points to.  Always returns 0.
+ */
+static int
+snoop_callback(const struct cat_desc *descp, const struct cat_attr *attrp, void * arg)
+{
+    struct cinfo *snapshot = arg;
 
 
-int
-hfs_offtoblk(ap)
-       struct vop_offtoblk_args /* {
-               struct vnode *a_vp;
-               off_t a_offset;    
-               daddr_t *a_lblkno;
-       } */ *ap;
-{      
-       if (ap->a_vp == NULL)
-               return (EINVAL);
-       *ap->a_lblkno = ap->a_offset / PAGE_SIZE_64;
+    /* the parent id lives in the descriptor; everything else in the attributes */
+    snapshot->parentcnid = descp->cd_parentcnid;
+    snapshot->recflags   = attrp->ca_recflags;
+    snapshot->mode       = attrp->ca_mode;
+    snapshot->gid        = attrp->ca_gid;
+    snapshot->uid        = attrp->ca_uid;
+
+    return (0);
+}
 
 
-       return(0);
+/*
+ * Fetch the uid, gid, mode and record flags of 'cnid' into *cnattrp and its
+ * parent id into keyp->hfsPlus.parentID.  Three sources are tried in order:
+ * skip_cp itself (when cnid is the id the fsctl was called with), the cnode
+ * hash, and finally the catalog.  Only the catalog path bumps cache->lookups.
+ *
+ * Returns 0, or whatever cat_getkeyplusattr() returned for the catalog lookup.
+ */
+static int
+do_attr_lookup(struct hfsmount *hfsmp, struct access_cache *cache, cnid_t cnid, 
+    struct cnode *skip_cp, CatalogKey *keyp, struct cat_attr *cnattrp)
+{
+    struct cinfo snooped;
+    int catalog_lock;
+    int status;
+
+    /* same id the fsctl was called with: read skip_cp directly, no lookup */
+    if (cnid == skip_cp->c_cnid) {
+       keyp->hfsPlus.parentID = skip_cp->c_parentcnid;
+       cnattrp->ca_recflags = skip_cp->c_attr.ca_recflags;
+       cnattrp->ca_mode = skip_cp->c_mode;
+       cnattrp->ca_gid = skip_cp->c_gid;
+       cnattrp->ca_uid = skip_cp->c_uid;
+       return (0);
+    }
+
+    /* the file/dir may be in core, in which case the cnode hash can answer */
+    if (hfs_chash_snoop(hfsmp, cnid, snoop_callback, &snooped) == 0) {
+       keyp->hfsPlus.parentID = snooped.parentcnid;
+       cnattrp->ca_recflags = snooped.recflags;
+       cnattrp->ca_mode = snooped.mode;
+       cnattrp->ca_gid = snooped.gid;
+       cnattrp->ca_uid = snooped.uid;
+       return (0);
+    }
+
+    /* last resort: look the cnid up in the catalog under a shared catalog lock */
+    catalog_lock = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK);
+    status = cat_getkeyplusattr(hfsmp, cnid, keyp, cnattrp);
+    hfs_systemfile_unlock(hfsmp, catalog_lock);
+
+    cache->lookups++;
+
+    return (status);
 }
 
 }
 
-int
-hfs_cmap(ap)
-       struct vop_cmap_args /* {
-               struct vnode *a_vp;
-               off_t a_foffset;    
-               size_t a_size;
-               daddr_t *a_bpn;
-               size_t *a_run;
-               void *a_poff;
-       } */ *ap;
+
+/*
+ * Compute whether we have access to the given directory (nodeID) and all its parents. Cache
+ * up to CACHE_LEVELS as we progress towards the root.
+ *
+ * The walk starts at nodeID and follows parent ids for as long as the id is
+ * >= kRootDirID.  For each id the cache is consulted first; on a miss the
+ * attributes come from do_attr_lookup() and are checked either with
+ * vnode_authorize() (records flagged kHFSHasSecurityMask) or with
+ * DerivePermissionSummary().  Every id looked up has its bit set in 'bitmap'
+ * when both bitmap and map_size are non-zero.
+ *
+ * When 'parents' is non-NULL, no error occurred, and scope_index is still -1
+ * at exit (no cache_binSearch() on 'parents' returned anything but -1 and no
+ * cache hit ended the walk), *err is set to ESRCH.
+ *
+ * Returns 1 for access and 0 for no access; *err receives 0 or the error.
+ */
+static int 
+do_access_check(struct hfsmount *hfsmp, int *err, struct access_cache *cache, HFSCatalogNodeID nodeID, 
+    struct cnode *skip_cp, struct proc *theProcPtr, kauth_cred_t myp_ucred,
+    struct vfs_context *my_context,
+    char *bitmap,
+    uint32_t map_size,
+    cnid_t* parents,
+    uint32_t num_parents)
 {
 {
-    struct hfsmount *hfsmp = VTOHFS(ap->a_vp);
-    struct filefork *fp = VTOF(ap->a_vp);
-    size_t                             bytesContAvail = 0;
-    int                        retval = E_NONE;
-    int lockExtBtree = 0;
-    struct proc                *p = NULL;
-    struct rl_entry *invalid_range;
-    enum rl_overlaptype overlaptype;
-    int started_tr = 0, grabbed_lock = 0;
+    int                     myErr = 0;
+    int                     myResult;
+    HFSCatalogNodeID        thisNodeID;
+    unsigned int            myPerms;
+    struct cat_attr         cnattr;
+    int                     cache_index = -1, scope_index = -1, scope_idx_start = -1;
+    CatalogKey              catkey;
+
+    int i = 0, ids_to_cache = 0;
+    int parent_ids[CACHE_LEVELS];   /* ids looked up on this walk; added to the cache at ExitThisRoutine */
+
+    thisNodeID = nodeID;
+    while (thisNodeID >=  kRootDirID) {
+       myResult = 0;   /* default to "no access" */
+              
+       /* check the cache before resorting to hitting the catalog */
+
+       /* ASSUMPTION: access info of cached entries is "final"... i.e. no need
+        * to look any further after hitting cached dir */
+
+       if (lookup_bucket(cache, &cache_index, thisNodeID)) {
+           cache->cachehits++;
+           myErr = cache->haveaccess[cache_index];
+           if (scope_index != -1) {   /* an earlier id on this walk already set scope_index, so a cached ESRCH is not an error */
+               if (myErr == ESRCH) {
+                   myErr = 0;
+               }
+           } else {
+               scope_index = 0;   // so we'll just use the cache result 
+               scope_idx_start = ids_to_cache;
+           }
+           myResult = (myErr == 0) ? 1 : 0;
+           goto ExitThisRoutine;
+       }
 
 
-       /*
-        * Check for underlying vnode requests and ensure that logical
-        * to physical mapping is requested.
-        */
-       if (ap->a_bpn == NULL)
-               return (0);
 
 
-       p = current_proc();
-  retry:
-       if (fp->ff_unallocblocks) {
-               lockExtBtree = 1;
+       if (parents) {
+           int tmp;
+           tmp = cache_binSearch(parents, num_parents-1, thisNodeID, NULL);
+           if (scope_index == -1)
+               scope_index = tmp;
+           if (tmp != -1 && scope_idx_start == -1 && ids_to_cache < CACHE_LEVELS) {
+               scope_idx_start = ids_to_cache;
+           }
+       }          
 
 
-               // XXXdbg
-               hfs_global_shared_lock_acquire(hfsmp);
-               grabbed_lock = 1;
+       /* remember which parents we want to cache */
+       if (ids_to_cache < CACHE_LEVELS) {
+           parent_ids[ids_to_cache] = thisNodeID;
+           ids_to_cache++;
+       }
+       // Inefficient (using modulo) and we might want to use a hash function, not rely on the node id to be "nice"...
+       if (bitmap && map_size) {
+           bitmap[(thisNodeID/8)%(map_size)]|=(1<<(thisNodeID&7));            
+       }
+              
 
 
-               if (hfsmp->jnl) {
-                       if (journal_start_transaction(hfsmp->jnl) != 0) {
-                               hfs_global_shared_lock_release(hfsmp);
-                               return EINVAL;
-                       } else {
-                               started_tr = 1;
-                       }
-               } 
+       /* do the lookup (checks the cnode hash, then the catalog) */
+       myErr = do_attr_lookup(hfsmp, cache, thisNodeID, skip_cp, &catkey, &cnattr);
+       if (myErr) {
+           goto ExitThisRoutine; /* no access */
+       }
 
 
-               if (retval = hfs_metafilelocking(hfsmp, kHFSExtentsFileID, LK_EXCLUSIVE | LK_CANRECURSE, p)) {
-                       if (started_tr) {
-                               journal_end_transaction(hfsmp->jnl);
-                       }
-                       if (grabbed_lock) {
-                               hfs_global_shared_lock_release(hfsmp);
-                       }
-                       return (retval);
-               }
-       } else if (overflow_extents(fp)) {
-               lockExtBtree = 1;
-               if (retval = hfs_metafilelocking(hfsmp, kHFSExtentsFileID, LK_EXCLUSIVE | LK_CANRECURSE, p)) {
-                       return retval;
-               }
+       /* Root always gets access. */
+       if (suser(myp_ucred, NULL) == 0) {
+               thisNodeID = catkey.hfsPlus.parentID;
+               myResult = 1;
+               continue;
        }
 
        }
 
-       /*
-        * Check for any delayed allocations.
-        */
-       if (fp->ff_unallocblocks) {
-               SInt64 reqbytes, actbytes;
+       // if the thing has acl's, do the full permission check
+       if ((cnattr.ca_recflags & kHFSHasSecurityMask) != 0) {
+           struct vnode *vp;
 
 
-               //
-               // Make sure we have a transaction.  It's possible
-               // that we came in and fp->ff_unallocblocks was zero
-               // but during the time we blocked acquiring the extents
-               // btree, ff_unallocblocks became non-zero and so we
-               // will need to start a transaction.
-               //
-               if (hfsmp->jnl && started_tr == 0) {
-                   if (lockExtBtree) {
-                       (void) hfs_metafilelocking(hfsmp, kHFSExtentsFileID, LK_RELEASE, p);
-                       lockExtBtree = 0;
-                   }
-    
-                   goto retry;
-               }
+           /* get the vnode for this cnid */
+           myErr = hfs_vget(hfsmp, thisNodeID, &vp, 0);
+           if ( myErr ) {
+               myResult = 0;
+               goto ExitThisRoutine;
+           }
 
 
-               reqbytes = (SInt64)fp->ff_unallocblocks *
-                            (SInt64)HFSTOVCB(hfsmp)->blockSize;
-               /*
-                * Release the blocks on loan and aquire some real ones.
-                * Note that we can race someone else for these blocks
-                * (and lose) so cmap needs to handle a failure here.
-                * Currently this race can't occur because all allocations
-                * are protected by an exclusive lock on the  Extents
-                * Overflow file.
-                */
-               HFSTOVCB(hfsmp)->loanedBlocks -= fp->ff_unallocblocks;
-               FTOC(fp)->c_blocks            -= fp->ff_unallocblocks;
-               fp->ff_blocks                 -= fp->ff_unallocblocks;
-               fp->ff_unallocblocks           = 0;
-
-               while (retval == 0 && reqbytes > 0) {
-                       retval = MacToVFSError(ExtendFileC(HFSTOVCB(hfsmp),
-                                       (FCB*)fp, reqbytes, 0,
-                                       kEFAllMask | kEFNoClumpMask, &actbytes));
-                       if (retval == 0 && actbytes == 0)
-                               retval = ENOSPC;
-
-                       if (retval) {
-                               fp->ff_unallocblocks =
-                                       reqbytes / HFSTOVCB(hfsmp)->blockSize;
-                               HFSTOVCB(hfsmp)->loanedBlocks += fp->ff_unallocblocks;
-                               FTOC(fp)->c_blocks            += fp->ff_unallocblocks;
-                               fp->ff_blocks                 += fp->ff_unallocblocks;
-                       }
-                       reqbytes -= actbytes;
-               }
+           thisNodeID = VTOC(vp)->c_parentcnid;
 
 
-               if (retval) {
-                       (void) hfs_metafilelocking(hfsmp, kHFSExtentsFileID, LK_RELEASE, p);
-                       if (started_tr) {
-                               hfs_flushvolumeheader(hfsmp, MNT_NOWAIT, 0);
-                               journal_end_transaction(hfsmp->jnl);
-                       }
-                       if (grabbed_lock) {
-                               hfs_global_shared_lock_release(hfsmp);
-                       }
-                       return (retval);
-               }
-               VTOC(ap->a_vp)->c_flag |= C_MODIFIED;
-       }
+           hfs_unlock(VTOC(vp));   /* NOTE(review): presumably hfs_vget() hands back the cnode locked -- confirm */
 
 
-       retval = MacToVFSError(
-                          MapFileBlockC (HFSTOVCB(hfsmp),
-                                         (FCB *)fp,
-                                         ap->a_size,
-                                         ap->a_foffset,
-                                         ap->a_bpn,
-                                         &bytesContAvail));
+           if (vnode_vtype(vp) == VDIR) {
+               myErr = vnode_authorize(vp, NULL, (KAUTH_VNODE_SEARCH | KAUTH_VNODE_LIST_DIRECTORY), my_context);
+           } else {
+               myErr = vnode_authorize(vp, NULL, KAUTH_VNODE_READ_DATA, my_context);
+           }
 
 
-       if (lockExtBtree)
-               (void) hfs_metafilelocking(hfsmp, kHFSExtentsFileID, LK_RELEASE, p);
+           vnode_put(vp);
+           if (myErr) {
+               myResult = 0;
+               goto ExitThisRoutine;
+           }
+       } else {
+           unsigned int flags;
+                  
+           myPerms = DerivePermissionSummary(cnattr.ca_uid, cnattr.ca_gid,
+               cnattr.ca_mode, hfsmp->hfs_mp,
+               myp_ucred, theProcPtr);
+
+           if (cnattr.ca_mode & S_IFDIR) {
+               flags = R_OK | X_OK;
+           } else {
+               flags = R_OK;
+           }
+           if ( (myPerms & flags) != flags) {
+               myResult = 0;
+               myErr = EACCES;
+               goto ExitThisRoutine;   /* no access */
+           }
 
 
-       // XXXdbg
-       if (started_tr) {
-               hfs_flushvolumeheader(hfsmp, MNT_NOWAIT, 0);
-               journal_end_transaction(hfsmp->jnl);
-               started_tr = 0;
-       }
-       if (grabbed_lock) {
-               hfs_global_shared_lock_release(hfsmp);
-               grabbed_lock = 0;
+           /* up the hierarchy we go */
+           thisNodeID = catkey.hfsPlus.parentID;
        }
        }
-                       
-    if (retval == E_NONE) {
-        /* Adjust the mapping information for invalid file ranges: */
-        overlaptype = rl_scan(&fp->ff_invalidranges,
-                            ap->a_foffset,
-                            ap->a_foffset + (off_t)bytesContAvail - 1,
-                            &invalid_range);
-        if (overlaptype != RL_NOOVERLAP) {
-            switch(overlaptype) {
-                case RL_MATCHINGOVERLAP:
-                case RL_OVERLAPCONTAINSRANGE:
-                case RL_OVERLAPSTARTSBEFORE:
-                    /* There's no valid block for this byte offset: */
-                    *ap->a_bpn = (daddr_t)-1;
-                    
-                    /* There's no point limiting the amount to be returned if the
-                       invalid range that was hit extends all the way to the EOF
-                       (i.e. there's no valid bytes between the end of this range
-                       and the file's EOF):
-                     */
-                    if ((fp->ff_size > (invalid_range->rl_end + 1)) &&
-                                       (invalid_range->rl_end + 1 - ap->a_foffset < bytesContAvail)) {
-                       bytesContAvail = invalid_range->rl_end + 1 - ap->a_foffset;
-                    };
-                    break;
-                
-                case RL_OVERLAPISCONTAINED:
-                case RL_OVERLAPENDSAFTER:
-                    /* The range of interest hits an invalid block before the end: */
-                    if (invalid_range->rl_start == ap->a_foffset) {
-                       /* There's actually no valid information to be had starting here: */
-                       *ap->a_bpn = (daddr_t)-1;
-                                               if ((fp->ff_size > (invalid_range->rl_end + 1)) &&
-                                                       (invalid_range->rl_end + 1 - ap->a_foffset < bytesContAvail)) {
-                               bytesContAvail = invalid_range->rl_end + 1 - ap->a_foffset;
-                       };
-                    } else {
-                       bytesContAvail = invalid_range->rl_start - ap->a_foffset;
-                    };
-                    break;
-            };
-            if (bytesContAvail > ap->a_size) bytesContAvail = ap->a_size;
-        };
-        
-        if (ap->a_run) *ap->a_run = bytesContAvail;
-    };
+    }
 
 
-       if (ap->a_poff)
-               *(int *)ap->a_poff = 0;
+    /* if here, we have access to this node */
+    myResult = 1;
 
 
-       return (retval);
-}
+  ExitThisRoutine:
+    if (parents && myErr == 0 && scope_index == -1) {   /* a parents list was given but scope_index was never set */
+       myErr = ESRCH;
+    }
+                               
+    if (myErr) {
+       myResult = 0;
+    }
+    *err = myErr;
 
 
+    /* cache the parent directory(ies) */
+    for (i = 0; i < ids_to_cache; i++) {
+       if (myErr == 0 && parents && (scope_idx_start == -1 || i > scope_idx_start)) {   /* entries past scope_idx_start (or all, if it was never set) are cached as ESRCH */
+           add_node(cache, -1, parent_ids[i], ESRCH);
+       } else {
+           add_node(cache, -1, parent_ids[i], myErr);
+       }
+    }
+
+    return (myResult);
+}
 
 
-/*
- * Read or write a buffer that is not contiguous on disk.  We loop over
- * each device block, copying to or from caller's buffer.
- *
- * We could be a bit more efficient by transferring as much data as is
- * contiguous.  But since this routine should rarely be called, and that
- * would be more complicated; best to keep it simple.
- */
+/*
+ * Bulk access check: for each id in the caller's file_ids array, decide
+ * whether the calling credential has access to it and to its chain of
+ * parents, and copy one short per id (0 or an errno) out to the caller's
+ * access array.  When map_size is non-zero, the bitmap of ids visited by
+ * do_access_check() is copied out as well.
+ *
+ * ap->a_data holds the argument struct (already copied in during dispatch);
+ * arg_size selects among the 64-bit extended, 32-bit extended and old-style
+ * 32-bit layouts.
+ *
+ * Returns 0 on success (including num_files < 1), EINVAL for a bad argument
+ * struct or out-of-range counts, ENOMEM when a buffer cannot be allocated,
+ * or the error from copyin()/copyout().
+ */
 static int
 static int
-hfs_strategy_fragmented(struct buf *bp)
+do_bulk_access_check(struct hfsmount *hfsmp, struct vnode *vp,
+    struct vnop_ioctl_args *ap, int arg_size, vfs_context_t context)
 {
 {
-       register struct vnode *vp = bp->b_vp;
-       register struct cnode *cp = VTOC(vp);
-       register struct vnode *devvp = cp->c_devvp;
-       caddr_t ioaddr;         /* Address of fragment within bp  */
-       struct buf *frag = NULL; /* For reading or writing a single block */
-       int retval = 0;
-       long remaining;         /* Bytes (in bp) left to transfer */
-       off_t offset;           /* Logical offset of current fragment in vp */
-       u_long block_size;      /* Size of one device block (and one I/O) */
+    boolean_t is64bit;
+
+    /*
+     * NOTE: on entry, the vnode is locked. In case this vnode
+     * happens to be in our list of file_ids, we'll note it and
+     * avoid calling hfs_chashget_nowait() on that id as that
+     * will cause a "locking against myself" panic.
+     */
+    Boolean check_leaf = true;
+               
+    struct user64_ext_access_t *user_access_structp;
+    struct user64_ext_access_t tmp_user_access;
+    struct access_cache cache;
+               
+    int error = 0, prev_parent_check_ok=1;
+    unsigned int i;
+               
+    short flags;
+    unsigned int num_files = 0;
+    int map_size = 0;
+    int num_parents = 0;
+    int *file_ids=NULL;
+    short *access=NULL;
+    char *bitmap=NULL;
+    cnid_t *parents=NULL;
+    int leaf_index;
        
        
-       /* Make sure we redo this mapping for the next I/O */
-       bp->b_blkno = bp->b_lblkno;
+    cnid_t cnid;
+    cnid_t prevParent_cnid = 0;
+    unsigned int myPerms;
+    short myaccess = 0;
+    struct cat_attr cnattr;
+    CatalogKey catkey;
+    struct cnode *skip_cp = VTOC(vp);
+    kauth_cred_t cred = vfs_context_ucred(context);
+    proc_t p = vfs_context_proc(context);
+
+    is64bit = proc_is64bit(p);
+
+    /* initialize the local cache and buffers */
+    cache.numcached = 0;
+    cache.cachehits = 0;
+    cache.lookups = 0;
+    cache.acache = NULL;
+    cache.haveaccess = NULL;
+               
+    /* struct copyin done during dispatch... need to copy file_id array separately */
+    if (ap->a_data == NULL) {
+       error = EINVAL;
+       goto err_exit_bulk_access;
+    }
+
+    if (is64bit) {
+       if (arg_size != sizeof(struct user64_ext_access_t)) {
+           error = EINVAL;
+           goto err_exit_bulk_access;
+       }
+
+       user_access_structp = (struct user64_ext_access_t *)ap->a_data;
+
+    } else if (arg_size == sizeof(struct user32_access_t)) {
+       struct user32_access_t *accessp = (struct user32_access_t *)ap->a_data;
+
+       // convert an old style bulk-access struct to the new style
+       tmp_user_access.flags     = accessp->flags;
+       tmp_user_access.num_files = accessp->num_files;
+       tmp_user_access.map_size  = 0;
+       tmp_user_access.file_ids  = CAST_USER_ADDR_T(accessp->file_ids);
+       tmp_user_access.bitmap    = USER_ADDR_NULL;
+       tmp_user_access.access    = CAST_USER_ADDR_T(accessp->access);
+       tmp_user_access.num_parents = 0;
+       /* the old layout has no parents array; don't leave the field as stack garbage */
+       tmp_user_access.parents   = USER_ADDR_NULL;
+       user_access_structp = &tmp_user_access;
+
+    } else if (arg_size == sizeof(struct user32_ext_access_t)) {
+       struct user32_ext_access_t *accessp = (struct user32_ext_access_t *)ap->a_data;
+
+       // up-cast from a 32-bit version of the struct
+       tmp_user_access.flags     = accessp->flags;
+       tmp_user_access.num_files = accessp->num_files;
+       tmp_user_access.map_size  = accessp->map_size;
+       tmp_user_access.num_parents  = accessp->num_parents;
+
+       tmp_user_access.file_ids  = CAST_USER_ADDR_T(accessp->file_ids);
+       tmp_user_access.bitmap    = CAST_USER_ADDR_T(accessp->bitmap);
+       tmp_user_access.access    = CAST_USER_ADDR_T(accessp->access);
+       tmp_user_access.parents    = CAST_USER_ADDR_T(accessp->parents);
+
+       user_access_structp = &tmp_user_access;
+    } else {
+       error = EINVAL;
+       goto err_exit_bulk_access;
+    }
+               
+    map_size = user_access_structp->map_size;
+
+    num_files = user_access_structp->num_files;
+
+    num_parents= user_access_structp->num_parents;
+
+    if (num_files < 1) {
+       goto err_exit_bulk_access;
+    }
+    if (num_files > 1024) {
+       error = EINVAL;
+       goto err_exit_bulk_access;
+    }
+
+    if (num_parents > 1024) {
+       error = EINVAL;
+       goto err_exit_bulk_access;
+    }
+
+    /*
+     * Both values come from user space and are held in ints here; one that
+     * turned negative would pass the upper-bound check above and then be
+     * used as an allocation, bzero and copyin size.
+     */
+    if (map_size < 0 || num_parents < 0) {
+       error = EINVAL;
+       goto err_exit_bulk_access;
+    }
+               
+    file_ids = (int *) kalloc(sizeof(int) * num_files);
+    access = (short *) kalloc(sizeof(short) * num_files);
+    if (map_size) {
+       bitmap = (char *) kalloc(sizeof(char) * map_size);
+    }
+
+    if (num_parents) {
+       parents = (cnid_t *) kalloc(sizeof(cnid_t) * num_parents);
+    }
+
+    cache.acache = (unsigned int *) kalloc(sizeof(int) * NUM_CACHE_ENTRIES);
+    cache.haveaccess = (unsigned char *) kalloc(sizeof(unsigned char) * NUM_CACHE_ENTRIES);
+               
+    /*
+     * Any failed allocation aborts the call.  'parents' is checked too, so
+     * the copyin below never targets a NULL buffer.  The common exit path
+     * frees whichever buffers were obtained.
+     */
+    if (file_ids == NULL || access == NULL || (map_size != 0 && bitmap == NULL) ||
+       (num_parents != 0 && parents == NULL) ||
+       cache.acache == NULL || cache.haveaccess == NULL) {
+       error = ENOMEM;
+       goto err_exit_bulk_access;
+    }
+               
+    // make sure the bitmap is zero'ed out...
+    if (bitmap) {
+       bzero(bitmap, (sizeof(char) * map_size));
+    }
+
+    if ((error = copyin(user_access_structp->file_ids, (caddr_t)file_ids,
+               num_files * sizeof(int)))) {
+       goto err_exit_bulk_access;
+    }
        
        
-       /* Set up the logical position and number of bytes to read/write */
-       offset = (off_t) bp->b_lblkno * (off_t) GetLogicalBlockSize(vp);
-       block_size = VTOHFS(vp)->hfs_phys_block_size;
+    if (num_parents) {
+       if ((error = copyin(user_access_structp->parents, (caddr_t)parents,
+                   num_parents * sizeof(cnid_t)))) {
+           goto err_exit_bulk_access;
+       }
+    }
        
        
-       /* Get an empty buffer to do the deblocking */
-       frag = geteblk(block_size);
-       if (ISSET(bp->b_flags, B_READ))
-               SET(frag->b_flags, B_READ);
-
-       for (ioaddr = bp->b_data, remaining = bp->b_bcount; remaining != 0;
-           ioaddr += block_size, offset += block_size,
-           remaining -= block_size) {
-               frag->b_resid = frag->b_bcount;
-               CLR(frag->b_flags, B_DONE);
-
-               /* Map the current position to a physical block number */
-               retval = VOP_CMAP(vp, offset, block_size, &frag->b_lblkno,
-                   NULL, NULL);
-               if (retval != 0)
-                       break;
+    flags = user_access_structp->flags;
+    if ((flags & (F_OK | R_OK | W_OK | X_OK)) == 0) {
+       flags = R_OK;
+    }
+               
+    /* check if we've been passed leaf node ids or parent ids */
+    if (flags & PARENT_IDS_FLAG) {
+       check_leaf = false;
+    }
+               
+    /* Check access to each file_id passed in */
+    for (i = 0; i < num_files; i++) {
+       leaf_index=-1;
+       cnid = (cnid_t) file_ids[i];
+                       
+       /* root always has access */
+       if ((!parents) && (!suser(cred, NULL))) {
+           access[i] = 0;
+           continue;
+       }
+                       
+       if (check_leaf) {
+           /* do the lookup (checks the cnode hash, then the catalog) */
+           error = do_attr_lookup(hfsmp, &cache, cnid, skip_cp, &catkey, &cnattr);
+           if (error) {
+               access[i] = (short) error;
+               continue;
+           }
+           
+           if (parents) {
+               // Check if the leaf matches one of the parent scopes
+               leaf_index = cache_binSearch(parents, num_parents-1, cnid, NULL);
+               if (leaf_index >= 0 && parents[leaf_index] == cnid)
+                   prev_parent_check_ok = 0;
+               else if (leaf_index >= 0)
+                   prev_parent_check_ok = 1;
+           }
 
 
-               /*
-                * Did we try to read a hole?
-                * (Should never happen for metadata!)
-                */
-               if ((long)frag->b_lblkno == -1) {
-                       bzero(ioaddr, block_size);
-                       continue;
+           // if the thing has acl's, do the full permission check
+           if ((cnattr.ca_recflags & kHFSHasSecurityMask) != 0) {
+               struct vnode *cvp;
+               int myErr = 0;
+               /* get the vnode for this cnid */
+               myErr = hfs_vget(hfsmp, cnid, &cvp, 0);
+               if ( myErr ) {
+                   access[i] = myErr;
+                   continue;
                }
                
                }
                
-               /* If writing, copy before I/O */
-               if (!ISSET(bp->b_flags, B_READ))
-                       bcopy(ioaddr, frag->b_data, block_size);
-
-               /* Call the device to do the I/O and wait for it */
-               frag->b_blkno = frag->b_lblkno;
-               frag->b_vp = devvp;  /* Used to dispatch via VOP_STRATEGY */
-               frag->b_dev = devvp->v_rdev;
-               retval = VOP_STRATEGY(frag);
-               frag->b_vp = NULL;
-               if (retval != 0)
-                       break;
-               retval = biowait(frag);
-               if (retval != 0)
-                       break;
+               hfs_unlock(VTOC(cvp));
+               
+               if (vnode_vtype(cvp) == VDIR) {
+                   myErr = vnode_authorize(cvp, NULL, (KAUTH_VNODE_SEARCH | KAUTH_VNODE_LIST_DIRECTORY), context);
+               } else {
+                   myErr = vnode_authorize(cvp, NULL, KAUTH_VNODE_READ_DATA, context);
+               }
+               
+               vnode_put(cvp);
+               if (myErr) {
+                   access[i] = myErr;
+                   continue;
+               }
+           } else {
+               /* before calling CheckAccess(), check the target file for read access */
+               myPerms = DerivePermissionSummary(cnattr.ca_uid, cnattr.ca_gid,
+                   cnattr.ca_mode, hfsmp->hfs_mp, cred, p);
                
                
-               /* If reading, copy after the I/O */
-               if (ISSET(bp->b_flags, B_READ))
-                       bcopy(frag->b_data, ioaddr, block_size);
+               /* fail fast if no access */ 
+               if ((myPerms & flags) == 0) {
+                   access[i] = EACCES;
+                   continue;
+               }                                                       
+           }
+       } else {
+           /* we were passed an array of parent ids */
+           catkey.hfsPlus.parentID = cnid;
        }
        }
+                       
+       /* if the last guy had the same parent and had access, we're done */
+       if (i > 0 && catkey.hfsPlus.parentID == prevParent_cnid && access[i-1] == 0 && prev_parent_check_ok) {
+           cache.cachehits++;
+           access[i] = 0;
+           continue;
+       }
+                       
+       myaccess = do_access_check(hfsmp, &error, &cache, catkey.hfsPlus.parentID, 
+           skip_cp, p, cred, context,bitmap, map_size, parents, num_parents);
+                       
+       if (myaccess || (error == ESRCH && leaf_index != -1)) {
+           access[i] = 0; // have access.. no errors to report
+       } else {
+           access[i] = (error != 0 ? (short) error : EACCES);
+       }
+                       
+       prevParent_cnid = catkey.hfsPlus.parentID;
+    }
+               
+    /* copyout the access array */
+    if ((error = copyout((caddr_t)access, user_access_structp->access, 
+               num_files * sizeof (short)))) {
+       goto err_exit_bulk_access;
+    }
+    if (map_size && bitmap) {
+       if ((error = copyout((caddr_t)bitmap, user_access_structp->bitmap, 
+                   map_size * sizeof (char)))) {
+           goto err_exit_bulk_access;
+       }
+    }
        
        
-       frag->b_vp = NULL;
-       //
-       // XXXdbg - in the case that this is a meta-data block, it won't affect
-       //          the journal because this bp is for a physical disk block,
-       //          not a logical block that is part of the catalog or extents
-       //          files.
-       SET(frag->b_flags, B_INVAL);
-       brelse(frag);
-       
-       if ((bp->b_error = retval) != 0)
-               SET(bp->b_flags, B_ERROR);
-       
-       biodone(bp);    /* This I/O is now complete */
-       return retval;
+               
+  err_exit_bulk_access:
+               
+    //printf("hfs: on exit (err %d), numfiles/numcached/cachehits/lookups is %d/%d/%d/%d\n", error, num_files, cache.numcached, cache.cachehits, cache.lookups);
+               
+    /* single cleanup point: every pointer starts out NULL, so only what was allocated is freed */
+    if (file_ids) 
+       kfree(file_ids, sizeof(int) * num_files);
+    if (parents) 
+       kfree(parents, sizeof(cnid_t) * num_parents);
+    if (bitmap) 
+       kfree(bitmap, sizeof(char) * map_size);
+    if (access)
+       kfree(access, sizeof(short) * num_files);
+    if (cache.acache)
+       kfree(cache.acache, sizeof(int) * NUM_CACHE_ENTRIES);
+    if (cache.haveaccess)
+       kfree(cache.haveaccess, sizeof(unsigned char) * NUM_CACHE_ENTRIES);
+               
+    return (error);
 }
 
 
 }
 
 
+/* end "bulk-access" support */
+
+
+/*
+ * Callback for use with freeze ioctl.
+ *
+ * Passed to vnode_iterate() by the F_FREEZE_FS case of hfs_vnop_ioctl(),
+ * with NULL as the callback argument.  For the vnode it is handed, it
+ * calls vnode_waitforwrites() using the wait message "hfs freeze".
+ * NOTE(review): presumably this blocks until the vnode's outstanding
+ * writes have drained -- confirm against the vnode_waitforwrites() KPI
+ * documentation, which also defines the meaning of the three 0 arguments.
+ *
+ * vp    - the vnode supplied to the callback.
+ * cargs - caller-supplied argument; unused here.
+ *
+ * Always returns 0; the result of vnode_waitforwrites() is not examined.
+ */
+static int
+hfs_freezewrite_callback(struct vnode *vp, __unused void *cargs)
+{
+       vnode_waitforwrites(vp, 0, 0, 0, "hfs freeze");
+
+       return 0;
+}
+
 /*
 /*
- * Calculate the logical to physical mapping if not done already,
- * then call the device strategy routine.
-#
-#vop_strategy {
-#      IN struct buf *bp;
-    */
+ * Control filesystem operating characteristics.
+ */
 int
 int
-hfs_strategy(ap)
-       struct vop_strategy_args /* {
-               struct buf *a_bp;
-       } */ *ap;
+hfs_vnop_ioctl( struct vnop_ioctl_args /* {
+               vnode_t a_vp;
+               int  a_command;
+               caddr_t  a_data;
+               int  a_fflag;
+               vfs_context_t a_context;
+       } */ *ap)
 {
 {
-       register struct buf *bp = ap->a_bp;
-       register struct vnode *vp = bp->b_vp;
-       register struct cnode *cp = VTOC(vp);
-       int retval = 0;
-       off_t offset;
-       size_t bytes_contig;
+       struct vnode * vp = ap->a_vp;
+       struct hfsmount *hfsmp = VTOHFS(vp);
+       vfs_context_t context = ap->a_context;
+       kauth_cred_t cred = vfs_context_ucred(context);
+       proc_t p = vfs_context_proc(context);
+       struct vfsstatfs *vfsp;
+       boolean_t is64bit;
+       off_t jnl_start, jnl_size;
+       struct hfs_journal_info *jip;
+#if HFS_COMPRESSION
+       int compressed = 0;
+       off_t uncompressed_size = -1;
+       int decmpfs_error = 0;
        
        
-       if ( !(bp->b_flags & B_VECTORLIST)) {
-               if (vp->v_type == VBLK || vp->v_type == VCHR)
-                       panic("hfs_strategy: device vnode passed!");
+       if (ap->a_command == F_RDADVISE) {
+               /* we need to inspect the decmpfs state of the file as early as possible */
+               compressed = hfs_file_is_compressed(VTOC(vp), 0);
+               if (compressed) {
+                       if (VNODE_IS_RSRC(vp)) {
+                               /* if this is the resource fork, treat it as if it were empty */
+                               uncompressed_size = 0;
+                       } else {
+                               decmpfs_error = hfs_uncompressed_size_of_compressed_file(NULL, vp, 0, &uncompressed_size, 0);
+                               if (decmpfs_error != 0) {
+                                       /* failed to get the uncompressed size, we'll check for this later */
+                                       uncompressed_size = -1;
+                               }
+                       }
+               }
+       }
+#endif /* HFS_COMPRESSION */
 
 
-               if (bp->b_flags & B_PAGELIST) {
-                       /*
-                        * If we have a page list associated with this bp,
-                        * then go through cluster_bp since it knows how to 
-                        * deal with a page request that might span non-
-                        * contiguous physical blocks on the disk...
-                        */
-                       retval = cluster_bp(bp);
-                       vp = cp->c_devvp;
-                       bp->b_dev = vp->v_rdev;
+       is64bit = proc_is64bit(p);
 
 
-                       return (retval);
+       switch (ap->a_command) {
+
+       case HFS_GETPATH:
+       {
+               struct vnode *file_vp;
+               cnid_t  cnid;
+               int  outlen;
+               char *bufptr;
+               int error;
+
+               /* Caller must be owner of file system. */
+               vfsp = vfs_statfs(HFSTOVFS(hfsmp));
+               if (suser(cred, NULL) &&
+                       kauth_cred_getuid(cred) != vfsp->f_owner) {
+                       return (EACCES);
                }
                }
-               
-               /*
-                * If we don't already know the filesystem relative block
-                * number then get it using VOP_BMAP().  If VOP_BMAP()
-                * returns the block number as -1 then we've got a hole in
-                * the file.  Although HFS filesystems don't create files with
-                * holes, invalidating of subranges of the file (lazy zero
-                * filling) may create such a situation.
+               /* Target vnode must be file system's root. */
+               if (!vnode_isvroot(vp)) {
+                       return (EINVAL);
+               }
+               bufptr = (char *)ap->a_data;
+               cnid = strtoul(bufptr, NULL, 10);
+
+               /* We need to call hfs_vfs_vget to leverage the code that will
+                * fix the origin list for us if needed, as opposed to calling
+                * hfs_vget, since we will need the parent for build_path call.
                 */
                 */
-               if (bp->b_blkno == bp->b_lblkno) {
-                       offset = (off_t) bp->b_lblkno *
-                           (off_t) GetLogicalBlockSize(vp);
-
-                       if ((retval = VOP_CMAP(vp, offset, bp->b_bcount,
-                           &bp->b_blkno, &bytes_contig, NULL))) {
-                               bp->b_error = retval;
-                               bp->b_flags |= B_ERROR;
-                               biodone(bp);
-                               return (retval);
-                       }
-                       if (bytes_contig < bp->b_bcount)
-                       {
-                               /*
-                                * We were asked to read a block that wasn't
-                                * contiguous, so we have to read each of the
-                                * pieces and copy them into the buffer.
-                                * Since ordinary file I/O goes through
-                                * cluster_io (which won't ask us for
-                                * discontiguous data), this is probably an
-                                * attempt to read or write metadata.
-                                */
-                               return hfs_strategy_fragmented(bp);
-                       }
-                       if ((long)bp->b_blkno == -1)
-                               clrbuf(bp);
+
+               if ((error = hfs_vfs_vget(HFSTOVFS(hfsmp), cnid, &file_vp, context))) {
+                       return (error);
                }
                }
-               if ((long)bp->b_blkno == -1) {
-                       biodone(bp);
-                       return (0);
+               error = build_path(file_vp, bufptr, sizeof(pathname_t), &outlen, 0, context);
+               vnode_put(file_vp);
+
+               return (error);
+       }
+
+       case HFS_PREV_LINK:
+       case HFS_NEXT_LINK:
+       {
+               cnid_t linkfileid;
+               cnid_t nextlinkid;
+               cnid_t prevlinkid;
+               int error;
+
+               /* Caller must be owner of file system. */
+               vfsp = vfs_statfs(HFSTOVFS(hfsmp));
+               if (suser(cred, NULL) &&
+                       kauth_cred_getuid(cred) != vfsp->f_owner) {
+                       return (EACCES);
                }
                }
-               if (bp->b_validend == 0) {
-                       /*
-                        * Record the exact size of the I/O transfer about to
-                        * be made:
-                        */
-                       bp->b_validend = bp->b_bcount;
+               /* Target vnode must be file system's root. */
+               if (!vnode_isvroot(vp)) {
+                       return (EINVAL);
+               }
+               linkfileid = *(cnid_t *)ap->a_data;
+               if (linkfileid < kHFSFirstUserCatalogNodeID) {
+                       return (EINVAL);
+               }
+               if ((error = hfs_lookuplink(hfsmp, linkfileid, &prevlinkid, &nextlinkid))) {
+                       return (error);
+               }
+               if (ap->a_command == HFS_NEXT_LINK) {
+                       *(cnid_t *)ap->a_data = nextlinkid;
+               } else {
+                       *(cnid_t *)ap->a_data = prevlinkid;
                }
                }
+               return (0);
        }
        }
-       vp = cp->c_devvp;
-       bp->b_dev = vp->v_rdev;
 
 
-       return VOCALL (vp->v_op, VOFFSET(vop_strategy), ap);
-}
+       case HFS_RESIZE_PROGRESS: {
 
 
+               vfsp = vfs_statfs(HFSTOVFS(hfsmp));
+               if (suser(cred, NULL) &&
+                       kauth_cred_getuid(cred) != vfsp->f_owner) {
+                       return (EACCES); /* must be owner of file system */
+               }
+               if (!vnode_isvroot(vp)) {
+                       return (EINVAL);
+               }
+               /* file system must not be mounted read-only */
+               if (hfsmp->hfs_flags & HFS_READ_ONLY) {
+                       return (EROFS);
+               }
 
 
-/*
-#
-#% truncate    vp      L L L
-#
-vop_truncate {
-    IN struct vnode *vp;
-    IN off_t length;
-    IN int flags;      (IO_SYNC)
-    IN struct ucred *cred;
-    IN struct proc *p;
-};
- * Truncate a cnode to at most length size, freeing (or adding) the
- * disk blocks.
- */
-int hfs_truncate(ap)
-       struct vop_truncate_args /* {
-               struct vnode *a_vp;
-               off_t a_length;
-               int a_flags;
-               struct ucred *a_cred;
-               struct proc *a_p;
-       } */ *ap;
-{
-       register struct vnode *vp = ap->a_vp;
-       register struct cnode *cp = VTOC(vp);
-       struct filefork *fp = VTOF(vp);
-       off_t length;
-       long vflags;
-       struct timeval tv;
-       int retval;
-       off_t bytesToAdd;
-       off_t actualBytesAdded;
-       off_t filebytes;
-       u_long fileblocks;
-       int blksize;
-       struct hfsmount *hfsmp;
+               return hfs_resize_progress(hfsmp, (u_int32_t *)ap->a_data);
+       }
 
 
-       if (vp->v_type != VREG && vp->v_type != VLNK)
-               return (EISDIR);        /* cannot truncate an HFS directory! */
+       case HFS_RESIZE_VOLUME: {
+               u_int64_t newsize;
+               u_int64_t cursize;
 
 
-       length = ap->a_length;
-       blksize = VTOVCB(vp)->blockSize;
-       fileblocks = fp->ff_blocks;
-       filebytes = (off_t)fileblocks * (off_t)blksize;
+               vfsp = vfs_statfs(HFSTOVFS(hfsmp));
+               if (suser(cred, NULL) &&
+                       kauth_cred_getuid(cred) != vfsp->f_owner) {
+                       return (EACCES); /* must be owner of file system */
+               }
+               if (!vnode_isvroot(vp)) {
+                       return (EINVAL);
+               }
+               
+               /* filesystem must not be mounted read only */
+               if (hfsmp->hfs_flags & HFS_READ_ONLY) {
+                       return (EROFS);
+               }
+               newsize = *(u_int64_t *)ap->a_data;
+               cursize = (u_int64_t)hfsmp->totalBlocks * (u_int64_t)hfsmp->blockSize;
+               
+               if (newsize > cursize) {
+                       return hfs_extendfs(hfsmp, *(u_int64_t *)ap->a_data, context);
+               } else if (newsize < cursize) {
+                       return hfs_truncatefs(hfsmp, *(u_int64_t *)ap->a_data, context);
+               } else {
+                       return (0);
+               }
+       }
+       case HFS_CHANGE_NEXT_ALLOCATION: {
+               int error = 0;          /* Assume success */
+               u_int32_t location;
 
 
-       KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 7)) | DBG_FUNC_START,
-                (int)length, (int)fp->ff_size, (int)filebytes, 0, 0);
+               if (vnode_vfsisrdonly(vp)) {
+                       return (EROFS);
+               }
+               vfsp = vfs_statfs(HFSTOVFS(hfsmp));
+               if (suser(cred, NULL) &&
+                       kauth_cred_getuid(cred) != vfsp->f_owner) {
+                       return (EACCES); /* must be owner of file system */
+               }
+               if (!vnode_isvroot(vp)) {
+                       return (EINVAL);
+               }
+               HFS_MOUNT_LOCK(hfsmp, TRUE);
+               location = *(u_int32_t *)ap->a_data;
+               if ((location >= hfsmp->allocLimit) &&
+                       (location != HFS_NO_UPDATE_NEXT_ALLOCATION)) {
+                       error = EINVAL;
+                       goto fail_change_next_allocation;
+               }
+               /* Return previous value. */
+               *(u_int32_t *)ap->a_data = hfsmp->nextAllocation;
+               if (location == HFS_NO_UPDATE_NEXT_ALLOCATION) {
+                       /* On magic value for location, set nextAllocation to next block
+                        * after metadata zone and set flag in mount structure to indicate 
+                        * that nextAllocation should not be updated again.
+                        */
+                       if (hfsmp->hfs_metazone_end != 0) {
+                               HFS_UPDATE_NEXT_ALLOCATION(hfsmp, hfsmp->hfs_metazone_end + 1);
+                       }
+                       hfsmp->hfs_flags |= HFS_SKIP_UPDATE_NEXT_ALLOCATION; 
+               } else {
+                       hfsmp->hfs_flags &= ~HFS_SKIP_UPDATE_NEXT_ALLOCATION; 
+                       HFS_UPDATE_NEXT_ALLOCATION(hfsmp, location);
+               }
+               MarkVCBDirty(hfsmp);
+fail_change_next_allocation:
+               HFS_MOUNT_UNLOCK(hfsmp, TRUE);
+               return (error);
+       }
 
 
-       if (length < 0)
-               return (EINVAL);
+#ifdef HFS_SPARSE_DEV
+       case HFS_SETBACKINGSTOREINFO: {
+               struct vnode * bsfs_rootvp;
+               struct vnode * di_vp;
+               struct hfs_backingstoreinfo *bsdata;
+               int error = 0;
+               
+               if (hfsmp->hfs_flags & HFS_READ_ONLY) {
+                       return (EROFS);
+               }
+               if (hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) {
+                       return (EALREADY);
+               }
+               vfsp = vfs_statfs(HFSTOVFS(hfsmp));
+               if (suser(cred, NULL) &&
+                       kauth_cred_getuid(cred) != vfsp->f_owner) {
+                       return (EACCES); /* must be owner of file system */
+               }
+               bsdata = (struct hfs_backingstoreinfo *)ap->a_data;
+               if (bsdata == NULL) {
+                       return (EINVAL);
+               }
+               if ((error = file_vnode(bsdata->backingfd, &di_vp))) {
+                       return (error);
+               }
+               if ((error = vnode_getwithref(di_vp))) {
+                       file_drop(bsdata->backingfd);
+                       return(error);
+               }
 
 
-       if ((!ISHFSPLUS(VTOVCB(vp))) && (length > (off_t)MAXHFSFILESIZE))
-               return (EFBIG);
+               if (vnode_mount(vp) == vnode_mount(di_vp)) {
+                       (void)vnode_put(di_vp);
+                       file_drop(bsdata->backingfd);
+                       return (EINVAL);
+               }
 
 
-       hfsmp = VTOHFS(vp);
+               /*
+                * Obtain the backing fs root vnode and keep a reference
+                * on it.  This reference will be dropped in hfs_unmount.
+                */
+               error = VFS_ROOT(vnode_mount(di_vp), &bsfs_rootvp, NULL); /* XXX use context! */
+               if (error) {
+                       (void)vnode_put(di_vp);
+                       file_drop(bsdata->backingfd);
+                       return (error);
+               }
+               vnode_ref(bsfs_rootvp);
+               vnode_put(bsfs_rootvp);
 
 
-       tv = time;
-       retval = E_NONE;
+               hfsmp->hfs_backingfs_rootvp = bsfs_rootvp;
+               hfsmp->hfs_flags |= HFS_HAS_SPARSE_DEVICE;
+               hfsmp->hfs_sparsebandblks = bsdata->bandsize / HFSTOVCB(hfsmp)->blockSize;
+               hfsmp->hfs_sparsebandblks *= 4;
 
 
-       /* 
-        * We cannot just check if fp->ff_size == length (as an optimization)
-        * since there may be extra physical blocks that also need truncation.
-        */
-#if QUOTA
-       if (retval = hfs_getinoquota(cp))
-               return(retval);
-#endif /* QUOTA */
+               vfs_markdependency(hfsmp->hfs_mp);
 
 
-       /*
-        * Lengthen the size of the file. We must ensure that the
-        * last byte of the file is allocated. Since the smallest
-        * value of ff_size is 0, length will be at least 1.
-        */
-       if (length > fp->ff_size) {
-#if QUOTA
-               retval = hfs_chkdq(cp, (int64_t)(roundup(length - filebytes, blksize)),
-                               ap->a_cred, 0);
-               if (retval)
-                       goto Err_Exit;
-#endif /* QUOTA */
                /*
                /*
-                * If we don't have enough physical space then
-                * we need to extend the physical size.
+                * If the sparse image is on a sparse image file (as opposed to a sparse
+                * bundle), then we may need to limit the free space to the maximum size
+                * of a file on that volume.  So we query (using pathconf), and if we get
+                * a meaningful result, we cache the number of blocks for later use in
+                * hfs_freeblks().
                 */
                 */
-               if (length > filebytes) {
-                       int eflags;
-
-                       /* All or nothing and don't round up to clumpsize. */
-                       eflags = kEFAllMask | kEFNoClumpMask;
+               hfsmp->hfs_backingfs_maxblocks = 0;
+               if (vnode_vtype(di_vp) == VREG) {
+                       int terr;
+                       int hostbits;
+                       terr = vn_pathconf(di_vp, _PC_FILESIZEBITS, &hostbits, context);
+                       if (terr == 0 && hostbits != 0 && hostbits < 64) {
+                               u_int64_t hostfilesizemax = ((u_int64_t)1) << hostbits;
+                               
+                               hfsmp->hfs_backingfs_maxblocks = hostfilesizemax / hfsmp->blockSize;
+                       }
+               }
+                               
+               (void)vnode_put(di_vp);
+               file_drop(bsdata->backingfd);
+               return (0);
+       }
+       case HFS_CLRBACKINGSTOREINFO: {
+               struct vnode * tmpvp;
 
 
-                       if (suser(ap->a_cred, NULL) != 0)
-                               eflags |= kEFReserveMask;  /* keep a reserve */
+               vfsp = vfs_statfs(HFSTOVFS(hfsmp));
+               if (suser(cred, NULL) &&
+                       kauth_cred_getuid(cred) != vfsp->f_owner) {
+                       return (EACCES); /* must be owner of file system */
+               }
+               if (hfsmp->hfs_flags & HFS_READ_ONLY) {
+                       return (EROFS);
+               }
 
 
-                       // XXXdbg
-                       hfs_global_shared_lock_acquire(hfsmp);
-                       if (hfsmp->jnl) {
-                               if (journal_start_transaction(hfsmp->jnl) != 0) {
-                                       retval = EINVAL;
-                                       goto Err_Exit;
-                               }
-                       }
+               if ((hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) &&
+                   hfsmp->hfs_backingfs_rootvp) {
 
 
-                       /* lock extents b-tree (also protects volume bitmap) */
-                       retval = hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, LK_EXCLUSIVE, ap->a_p);
-                       if (retval) {
-                               if (hfsmp->jnl) {
-                                       journal_end_transaction(hfsmp->jnl);
-                               } 
-                               hfs_global_shared_lock_release(hfsmp);
+                       hfsmp->hfs_flags &= ~HFS_HAS_SPARSE_DEVICE;
+                       tmpvp = hfsmp->hfs_backingfs_rootvp;
+                       hfsmp->hfs_backingfs_rootvp = NULLVP;
+                       hfsmp->hfs_sparsebandblks = 0;
+                       vnode_rele(tmpvp);
+               }
+               return (0);
+       }
+#endif /* HFS_SPARSE_DEV */
 
 
-                               goto Err_Exit;
-                       }
+       case F_FREEZE_FS: {
+               struct mount *mp;
+               mp = vnode_mount(vp);
+               hfsmp = VFSTOHFS(mp);
 
 
-                       while ((length > filebytes) && (retval == E_NONE)) {
-                               bytesToAdd = length - filebytes;
-                               retval = MacToVFSError(ExtendFileC(VTOVCB(vp),
-                                                    (FCB*)fp,
-                                                    bytesToAdd,
-                                                    0,
-                                                    eflags,
-                                                    &actualBytesAdded));
+               if (!(hfsmp->jnl))
+                       return (ENOTSUP);
 
 
-                               filebytes = (off_t)fp->ff_blocks * (off_t)blksize;
-                               if (actualBytesAdded == 0 && retval == E_NONE) {
-                                       if (length > filebytes)
-                                               length = filebytes;
-                                       break;
-                               }
-                       } /* endwhile */
+               vfsp = vfs_statfs(mp);
+       
+               if (kauth_cred_getuid(cred) != vfsp->f_owner &&
+                       !kauth_cred_issuser(cred))
+                       return (EACCES);
 
 
-                       (void) hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, LK_RELEASE, ap->a_p);
+               lck_rw_lock_exclusive(&hfsmp->hfs_insync);
+               // flush things before we get started to try and prevent
+               // dirty data from being paged out while we're frozen.
+               // note: can't do this after taking the lock as it will
+               // deadlock against ourselves.
+               vnode_iterate(mp, 0, hfs_freezewrite_callback, NULL);
+               hfs_global_exclusive_lock_acquire(hfsmp);
+
+               // DO NOT call hfs_journal_flush() because that takes a
+               // shared lock on the global exclusive lock!
+               journal_flush(hfsmp->jnl);
+
+               // don't need to iterate on all vnodes, we just need to
+               // wait for writes to the system files and the device vnode
+               if (HFSTOVCB(hfsmp)->extentsRefNum)
+                   vnode_waitforwrites(HFSTOVCB(hfsmp)->extentsRefNum, 0, 0, 0, "hfs freeze");
+               if (HFSTOVCB(hfsmp)->catalogRefNum)
+                   vnode_waitforwrites(HFSTOVCB(hfsmp)->catalogRefNum, 0, 0, 0, "hfs freeze");
+               if (HFSTOVCB(hfsmp)->allocationsRefNum)
+                   vnode_waitforwrites(HFSTOVCB(hfsmp)->allocationsRefNum, 0, 0, 0, "hfs freeze");
+               if (hfsmp->hfs_attribute_vp)
+                   vnode_waitforwrites(hfsmp->hfs_attribute_vp, 0, 0, 0, "hfs freeze");
+               vnode_waitforwrites(hfsmp->hfs_devvp, 0, 0, 0, "hfs freeze");
+
+               hfsmp->hfs_freezing_proc = current_proc();
 
 
-                       // XXXdbg
-                       if (hfsmp->jnl) {
-                               hfs_flushvolumeheader(hfsmp, MNT_NOWAIT, 0);
-                               journal_end_transaction(hfsmp->jnl);
-                       } 
-                       hfs_global_shared_lock_release(hfsmp);
+               return (0);
+       }
 
 
-                       if (retval)
-                               goto Err_Exit;
+       case F_THAW_FS: {
+               vfsp = vfs_statfs(vnode_mount(vp));
+               if (kauth_cred_getuid(cred) != vfsp->f_owner &&
+                       !kauth_cred_issuser(cred))
+                       return (EACCES);
 
 
-                       KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 7)) | DBG_FUNC_NONE,
-                               (int)length, (int)fp->ff_size, (int)filebytes, 0, 0);
+               // if we're not the one who froze the fs then we
+               // can't thaw it.
+               if (hfsmp->hfs_freezing_proc != current_proc()) {
+                   return EPERM;
                }
                }
-               if (!(ap->a_flags & IO_NOZEROFILL)) {
-                       if (UBCINFOEXISTS(vp) && retval == E_NONE) {
-                               struct rl_entry *invalid_range;
-                               int devBlockSize;
-                               off_t zero_limit;
-                       
-                               zero_limit = (fp->ff_size + (PAGE_SIZE_64 - 1)) & ~PAGE_MASK_64;
-                               if (length < zero_limit) zero_limit = length;
 
 
-                               if (length > fp->ff_size) {
-                                       /* Extending the file: time to fill out the current last page w. zeroes? */
-                                       if ((fp->ff_size & PAGE_MASK_64) &&
-                                           (rl_scan(&fp->ff_invalidranges, fp->ff_size & ~PAGE_MASK_64,
-                                           fp->ff_size - 1, &invalid_range) == RL_NOOVERLAP)) {
-                                               
-                                               /* There's some valid data at the start of the (current) last page
-                                                  of the file, so zero out the remainder of that page to ensure the
-                                                  entire page contains valid data.  Since there is no invalid range
-                                                  possible past the (current) eof, there's no need to remove anything
-                                                  from the invalid range list before calling cluster_write():                                           */
-                                               VOP_DEVBLOCKSIZE(cp->c_devvp, &devBlockSize);
-                                               retval = cluster_write(vp, (struct uio *) 0, fp->ff_size, zero_limit,
-                                                               fp->ff_size, (off_t)0, devBlockSize,
-                                                               (ap->a_flags & IO_SYNC) | IO_HEADZEROFILL | IO_NOZERODIRTY);
-                                               if (retval) goto Err_Exit;
-                                               
-                                               /* Merely invalidate the remaining area, if necessary: */
-                                               if (length > zero_limit) {
-                                                       rl_add(zero_limit, length - 1, &fp->ff_invalidranges);
-                                                       cp->c_zftimeout = time.tv_sec + ZFTIMELIMIT;
-                                               }
-                                       } else {
-                                       /* The page containing the (current) eof is invalid: just add the
-                                          remainder of the page to the invalid list, along with the area
-                                          being newly allocated:
-                                        */
-                                       rl_add(fp->ff_size, length - 1, &fp->ff_invalidranges);
-                                       cp->c_zftimeout = time.tv_sec + ZFTIMELIMIT;
-                                       };
-                               }
-                       } else {
-                                       panic("hfs_truncate: invoked on non-UBC object?!");
-                       };
-               }
-               cp->c_flag |= C_UPDATE;
-               fp->ff_size = length;
+               // NOTE: if you add code here, also go check the
+               //       code that "thaws" the fs in hfs_vnop_close()
+               //
+               hfsmp->hfs_freezing_proc = NULL;
+               hfs_global_exclusive_lock_release(hfsmp);
+               lck_rw_unlock_exclusive(&hfsmp->hfs_insync);
 
 
-               if (UBCISVALID(vp))
-                       ubc_setsize(vp, fp->ff_size);   /* XXX check errors */
+               return (0);
+       }
 
 
-       } else { /* Shorten the size of the file */
+       case HFS_BULKACCESS_FSCTL: {
+           int size;
+           
+           if (hfsmp->hfs_flags & HFS_STANDARD) {
+               return EINVAL;
+           }
 
 
-               if (fp->ff_size > length) {
-                       /*
-                        * Any buffers that are past the truncation point need to be
-                        * invalidated (to maintain buffer cache consistency).  For
-                        * simplicity, we invalidate all the buffers by calling vinvalbuf.
-                        */
-                       if (UBCISVALID(vp))
-                               ubc_setsize(vp, length); /* XXX check errors */
+           if (is64bit) {
+               size = sizeof(struct user64_access_t);
+           } else {
+               size = sizeof(struct user32_access_t);
+           }
+           
+           return do_bulk_access_check(hfsmp, vp, ap, size, context);
+       } 
 
 
-                       vflags = ((length > 0) ? V_SAVE : 0)  | V_SAVEMETA;     
-                       retval = vinvalbuf(vp, vflags, ap->a_cred, ap->a_p, 0, 0);
+       case HFS_EXT_BULKACCESS_FSCTL: {
+           int size;
            
            
-                       /* Any space previously marked as invalid is now irrelevant: */
-                       rl_remove(length, fp->ff_size - 1, &fp->ff_invalidranges);
+           if (hfsmp->hfs_flags & HFS_STANDARD) {
+               return EINVAL;
+           }
+
+           if (is64bit) {
+               size = sizeof(struct user64_ext_access_t);
+           } else {
+               size = sizeof(struct user32_ext_access_t);
+           }
+           
+           return do_bulk_access_check(hfsmp, vp, ap, size, context);
+       } 
+
+       case HFS_SETACLSTATE: {
+               int state;
+
+               if (ap->a_data == NULL) {
+                       return (EINVAL);
                }
 
                }
 
-               /* 
-                * Account for any unmapped blocks. Note that the new
-                * file length can still end up with unmapped blocks.
-                */
-               if (fp->ff_unallocblocks > 0) {
-                       u_int32_t finalblks;
+               vfsp = vfs_statfs(HFSTOVFS(hfsmp));
+               state = *(int *)ap->a_data;
 
 
-                       /* lock extents b-tree */
-                       retval = hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID,
-                                       LK_EXCLUSIVE, ap->a_p);
-                       if (retval)
-                               goto Err_Exit;
+               if (hfsmp->hfs_flags & HFS_READ_ONLY) {
+                       return (EROFS);
+               }
+               // super-user can enable or disable acl's on a volume.
+               // the volume owner can only enable acl's
+               if (!is_suser() && (state == 0 || kauth_cred_getuid(cred) != vfsp->f_owner)) {
+                       return (EPERM);
+               }
+               if (state == 0 || state == 1)
+                       return hfs_set_volxattr(hfsmp, HFS_SETACLSTATE, state);
+               else
+                       return (EINVAL);        
+       }
 
 
-                       VTOVCB(vp)->loanedBlocks -= fp->ff_unallocblocks;
-                       cp->c_blocks             -= fp->ff_unallocblocks;
-                       fp->ff_blocks            -= fp->ff_unallocblocks;
-                       fp->ff_unallocblocks      = 0;
+       case HFS_SET_XATTREXTENTS_STATE: {
+               int state;
 
 
-                       finalblks = (length + blksize - 1) / blksize;
-                       if (finalblks > fp->ff_blocks) {
-                               /* calculate required unmapped blocks */
-                               fp->ff_unallocblocks      = finalblks - fp->ff_blocks;
-                               VTOVCB(vp)->loanedBlocks += fp->ff_unallocblocks;
-                               cp->c_blocks             += fp->ff_unallocblocks;
-                               fp->ff_blocks            += fp->ff_unallocblocks;
-                       }
-                       (void) hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID,
-                                       LK_RELEASE, ap->a_p);
+               if (ap->a_data == NULL) {
+                       return (EINVAL);
                }
 
                }
 
-               /*
-                * For a TBE process the deallocation of the file blocks is
-                * delayed until the file is closed.  And hfs_close calls
-                * truncate with the IO_NDELAY flag set.  So when IO_NDELAY
-                * isn't set, we make sure this isn't a TBE process.
+               state = *(int *)ap->a_data;
+               
+               if (hfsmp->hfs_flags & HFS_READ_ONLY) {
+                       return (EROFS);
+               }
+
+               /* Super-user can enable or disable extent-based extended 
+                * attribute support on a volume 
                 */
                 */
-               if ((ap->a_flags & IO_NDELAY) || (!ISSET(ap->a_p->p_flag, P_TBE))) {
-#if QUOTA
-                 off_t savedbytes = ((off_t)fp->ff_blocks * (off_t)blksize);
-#endif /* QUOTA */
-                 // XXXdbg
-                 hfs_global_shared_lock_acquire(hfsmp);
-                       if (hfsmp->jnl) {
-                               if (journal_start_transaction(hfsmp->jnl) != 0) {
-                                       retval = EINVAL;
-                                       goto Err_Exit;
-                               }
-                       }
+               if (!is_suser()) {
+                       return (EPERM);
+               }
+               if (state == 0 || state == 1)
+                       return hfs_set_volxattr(hfsmp, HFS_SET_XATTREXTENTS_STATE, state);
+               else
+                       return (EINVAL);        
+       }
 
 
-                       /* lock extents b-tree (also protects volume bitmap) */
-                       retval = hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, LK_EXCLUSIVE, ap->a_p);
-                       if (retval) {
-                               if (hfsmp->jnl) {
-                                       journal_end_transaction(hfsmp->jnl);
-                               }
-                               hfs_global_shared_lock_release(hfsmp);
-                               goto Err_Exit;
-                       }
-                       
-                       if (fp->ff_unallocblocks == 0)
-                               retval = MacToVFSError(TruncateFileC(VTOVCB(vp),
-                                               (FCB*)fp, length, false));
+       case F_FULLFSYNC: {
+               int error;
+               
+               if (hfsmp->hfs_flags & HFS_READ_ONLY) {
+                       return (EROFS);
+               }
+               error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK);
+               if (error == 0) {
+                       error = hfs_fsync(vp, MNT_WAIT, TRUE, p);
+                       hfs_unlock(VTOC(vp));
+               }
 
 
-                       (void) hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, LK_RELEASE, ap->a_p);
+               return error;
+       }
 
 
-                       // XXXdbg
-                       if (hfsmp->jnl) {
-                               hfs_flushvolumeheader(hfsmp, MNT_NOWAIT, 0);
-                               journal_end_transaction(hfsmp->jnl);
-                       }
-                       hfs_global_shared_lock_release(hfsmp);
+       case F_CHKCLEAN: {
+               register struct cnode *cp;
+               int error;
 
 
-                       filebytes = (off_t)fp->ff_blocks * (off_t)blksize;
-                       if (retval)
-                               goto Err_Exit;
-#if QUOTA
-                       /* These are bytesreleased */
-                       (void) hfs_chkdq(cp, (int64_t)-(savedbytes - filebytes), NOCRED, 0);
-#endif /* QUOTA */
+               if (!vnode_isreg(vp))
+                       return EINVAL;
+               error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK);
+               if (error == 0) {
+                       cp = VTOC(vp);
+                       /*
+                        * used by regression test to determine if 
+                        * all the dirty pages (via write) have been cleaned
+                        * after a call to 'fsync'.
+                        */
+                       error = is_file_clean(vp, VTOF(vp)->ff_size);
+                       hfs_unlock(cp);
                }
                }
-               /* Only set update flag if the logical length changes */
-               if (fp->ff_size != length)
-                       cp->c_flag |= C_UPDATE;
-               fp->ff_size = length;
+               return (error);
        }
        }
-       cp->c_flag |= C_CHANGE;
-       retval = VOP_UPDATE(vp, &tv, &tv, MNT_WAIT);
-       if (retval) {
-               KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 7)) | DBG_FUNC_NONE,
-                    -1, -1, -1, retval, 0);
+
+       case F_RDADVISE: {
+               register struct radvisory *ra;
+               struct filefork *fp;
+               int error;
+
+               if (!vnode_isreg(vp))
+                       return EINVAL;
+               ra = (struct radvisory *)(ap->a_data);
+               fp = VTOF(vp);
+
+               /* Protect against a size change. */
+               hfs_lock_truncate(VTOC(vp), TRUE);
+
+#if HFS_COMPRESSION
+               if (compressed && (uncompressed_size == -1)) {
+                       /* fetching the uncompressed size failed above, so return the error */
+                       error = decmpfs_error;
+               } else if ((compressed && (ra->ra_offset >= uncompressed_size)) ||
+                                  (!compressed && (ra->ra_offset >= fp->ff_size))) {
+                       error = EFBIG;
+               }
+#else /* HFS_COMPRESSION */
+               if (ra->ra_offset >= fp->ff_size) {
+                       error = EFBIG;
+               }
+#endif /* HFS_COMPRESSION */
+               else {
+                       error = advisory_read(vp, fp->ff_size, ra->ra_offset, ra->ra_count);
+               }
+
+               hfs_unlock_truncate(VTOC(vp), TRUE);
+               return (error);
+       }
+
+       case F_READBOOTSTRAP:
+       case F_WRITEBOOTSTRAP:
+       {
+           struct vnode *devvp = NULL;
+           user_fbootstraptransfer_t *user_bootstrapp;
+           int devBlockSize;
+           int error;
+           uio_t auio;
+           daddr64_t blockNumber;
+           u_int32_t blockOffset;
+           u_int32_t xfersize;
+           struct buf *bp;
+           user_fbootstraptransfer_t user_bootstrap;
+
+               if (!vnode_isvroot(vp))
+                       return (EINVAL);
+               /* LP64 - when caller is a 64 bit process then we are passed a pointer 
+                * to a user_fbootstraptransfer_t else we get a pointer to a 
+                * fbootstraptransfer_t which we munge into a user_fbootstraptransfer_t
+                */
+               if (hfsmp->hfs_flags & HFS_READ_ONLY) {
+                       return (EROFS);
+               }
+               if (is64bit) {
+                       user_bootstrapp = (user_fbootstraptransfer_t *)ap->a_data;
+               }
+               else {
+               user32_fbootstraptransfer_t *bootstrapp = (user32_fbootstraptransfer_t *)ap->a_data;
+                       user_bootstrapp = &user_bootstrap;
+                       user_bootstrap.fbt_offset = bootstrapp->fbt_offset;
+                       user_bootstrap.fbt_length = bootstrapp->fbt_length;
+                       user_bootstrap.fbt_buffer = CAST_USER_ADDR_T(bootstrapp->fbt_buffer);
+               }
+               if (user_bootstrapp->fbt_offset + user_bootstrapp->fbt_length > 1024) 
+                       return EINVAL;
+           
+           devvp = VTOHFS(vp)->hfs_devvp;
+               auio = uio_create(1, user_bootstrapp->fbt_offset, 
+                                                 is64bit ? UIO_USERSPACE64 : UIO_USERSPACE32,
+                                                 (ap->a_command == F_WRITEBOOTSTRAP) ? UIO_WRITE : UIO_READ);
+               uio_addiov(auio, user_bootstrapp->fbt_buffer, user_bootstrapp->fbt_length);
+
+           devBlockSize = vfs_devblocksize(vnode_mount(vp));
+
+           while (uio_resid(auio) > 0) {
+                       blockNumber = uio_offset(auio) / devBlockSize;
+                       error = (int)buf_bread(devvp, blockNumber, devBlockSize, cred, &bp);
+                       if (error) {
+                               if (bp) buf_brelse(bp);
+                               uio_free(auio);
+                               return error;
+                       };
+
+                       blockOffset = uio_offset(auio) % devBlockSize;
+                       xfersize = devBlockSize - blockOffset;
+                       error = uiomove((caddr_t)buf_dataptr(bp) + blockOffset, (int)xfersize, auio);
+                       if (error) {
+                               buf_brelse(bp);
+                               uio_free(auio);
+                               return error;
+                       };
+                       if (uio_rw(auio) == UIO_WRITE) {
+                               error = VNOP_BWRITE(bp);
+                               if (error) {
+                                       uio_free(auio);
+                       return error;
+                               }
+                       } else {
+                               buf_brelse(bp);
+                       };
+               };
+               uio_free(auio);
+       };
+       return 0;
+
+       case _IOC(IOC_OUT,'h', 4, 0):     /* Create date in local time */
+       {
+               if (is64bit) {
+                       *(user_time_t *)(ap->a_data) = (user_time_t) (to_bsd_time(VTOVCB(vp)->localCreateDate));
+               }
+               else {
+                       *(user32_time_t *)(ap->a_data) = (user32_time_t) (to_bsd_time(VTOVCB(vp)->localCreateDate));
+               }
+               return 0;
+       }
+
+       case SPOTLIGHT_FSCTL_GET_MOUNT_TIME:
+           *(uint32_t *)ap->a_data = hfsmp->hfs_mount_time;
+           break;
+
+       case SPOTLIGHT_FSCTL_GET_LAST_MTIME:
+           *(uint32_t *)ap->a_data = hfsmp->hfs_last_mounted_mtime;
+           break;
+
+       case HFS_FSCTL_SET_VERY_LOW_DISK:
+           if (*(uint32_t *)ap->a_data >= hfsmp->hfs_freespace_notify_warninglimit) {
+               return EINVAL;
+           }
+
+           hfsmp->hfs_freespace_notify_dangerlimit = *(uint32_t *)ap->a_data;
+           break;
+
+       case HFS_FSCTL_SET_LOW_DISK:
+           if (   *(uint32_t *)ap->a_data >= hfsmp->hfs_freespace_notify_desiredlevel
+               || *(uint32_t *)ap->a_data <= hfsmp->hfs_freespace_notify_dangerlimit) {
+
+               return EINVAL;
+           }
+
+           hfsmp->hfs_freespace_notify_warninglimit = *(uint32_t *)ap->a_data;
+           break;
+
+       case HFS_FSCTL_SET_DESIRED_DISK:
+           if (*(uint32_t *)ap->a_data <= hfsmp->hfs_freespace_notify_warninglimit) {
+               return EINVAL;
+           }
+
+           hfsmp->hfs_freespace_notify_desiredlevel = *(uint32_t *)ap->a_data;
+           break;
+
+       case HFS_VOLUME_STATUS:
+           *(uint32_t *)ap->a_data = hfsmp->hfs_notification_conditions;
+           break;
+
+       case HFS_SET_BOOT_INFO:
+               if (!vnode_isvroot(vp))
+                       return(EINVAL);
+               if (!kauth_cred_issuser(cred) && (kauth_cred_getuid(cred) != vfs_statfs(HFSTOVFS(hfsmp))->f_owner))
+                       return(EACCES); /* must be superuser or owner of filesystem */
+               if (hfsmp->hfs_flags & HFS_READ_ONLY) {
+                       return (EROFS);
+               }
+               HFS_MOUNT_LOCK(hfsmp, TRUE);
+               bcopy(ap->a_data, &hfsmp->vcbFndrInfo, sizeof(hfsmp->vcbFndrInfo));
+               HFS_MOUNT_UNLOCK(hfsmp, TRUE);
+               (void) hfs_flushvolumeheader(hfsmp, MNT_WAIT, 0);
+               break;
+               
+       case HFS_GET_BOOT_INFO:
+               if (!vnode_isvroot(vp))
+                       return(EINVAL);
+               HFS_MOUNT_LOCK(hfsmp, TRUE);
+               bcopy(&hfsmp->vcbFndrInfo, ap->a_data, sizeof(hfsmp->vcbFndrInfo));
+               HFS_MOUNT_UNLOCK(hfsmp, TRUE);
+               break;
+
+       case HFS_MARK_BOOT_CORRUPT:
+               /* Mark the boot volume corrupt by setting 
+                * kHFSVolumeInconsistentBit in the volume header.  This will 
+                * force fsck_hfs on next mount.
+                */
+               if (!is_suser()) {
+                       return EACCES;
+               }
+                       
+               /* Allowed only on the root vnode of the boot volume */
+               if (!(vfs_flags(HFSTOVFS(hfsmp)) & MNT_ROOTFS) || 
+                   !vnode_isvroot(vp)) {
+                       return EINVAL;
+               }
+               if (hfsmp->hfs_flags & HFS_READ_ONLY) {
+                       return (EROFS);
+               }
+               printf ("hfs_vnop_ioctl: Marking the boot volume corrupt.\n");
+               hfs_mark_volume_inconsistent(hfsmp);
+               break;
+
+       case HFS_FSCTL_GET_JOURNAL_INFO:
+               jip = (struct hfs_journal_info*)ap->a_data;
+
+               if (vp == NULLVP)
+                       return EINVAL;
+
+           if (hfsmp->jnl == NULL) {
+                       jnl_start = 0;
+                       jnl_size  = 0;
+           } else {
+                       jnl_start = (off_t)(hfsmp->jnl_start * HFSTOVCB(hfsmp)->blockSize) + (off_t)HFSTOVCB(hfsmp)->hfsPlusIOPosOffset;
+                       jnl_size  = (off_t)hfsmp->jnl_size;
+           }
+
+               jip->jstart = jnl_start;
+               jip->jsize = jnl_size;
+               break;
+
+       case HFS_SET_ALWAYS_ZEROFILL: {
+           struct cnode *cp = VTOC(vp);
+
+           if (*(int *)ap->a_data) {
+               cp->c_flag |= C_ALWAYS_ZEROFILL;
+           } else {
+               cp->c_flag &= ~C_ALWAYS_ZEROFILL;
+           }
+           break;
+       }    
+
+       default:
+               return (ENOTTY);
+       }
+
+       return 0;
+}
+
+/*
+ * hfs_vnop_select - select entry point for HFS vnodes.
+ *
+ * The argument block is declared __unused and is never examined:
+ *
+ *     struct vnop_select_args {
+ *             vnode_t a_vp;
+ *             int  a_which;
+ *             int  a_fflags;
+ *             void *a_wql;
+ *             vfs_context_t a_context;
+ *     };
+ *
+ * Always returns 1.
+ */
+int
+hfs_vnop_select(__unused struct vnop_select_args *ap)
+{
+       /* No check is made here for whether I/O is actually possible. */
+       return (1);
+}
+
+/*
+ * hfs_bmap - translate a logical block number into a physical block number.
+ *
+ * vp    file whose fork is being mapped
+ * bn    logical block number, in units of GetLogicalBlockSize(vp)
+ * vpp   if non-NULL, receives the mount's device vnode (hfs_devvp)
+ * bnp   receives the physical block number; if NULL, nothing is mapped
+ *       and 0 is returned
+ * runp  if non-NULL, receives the number of REMAINING contiguous logical
+ *       blocks after the mapped one (0 when clustering is not possible)
+ *
+ * The physical block number is based on the device block size (noted by
+ * the original author as currently 512).
+ *
+ * Returns 0 on success, otherwise the MapFileBlockC() error converted
+ * through MacToVFSError().
+ */
+int
+hfs_bmap(struct vnode *vp, daddr_t bn, struct vnode **vpp, daddr64_t *bnp, unsigned int *runp)
+{
+       struct filefork *fp = VTOF(vp);
+       struct hfsmount *hfsmp = VTOHFS(vp);
+       size_t contig_bytes = 0;
+       u_int32_t blksize;
+       off_t byte_offset;
+       int need_extents_lock;
+       int lockflags = 0;
+       int error;
+
+       /* Hand back the underlying device vnode when the caller wants it. */
+       if (vpp != NULL) {
+               *vpp = hfsmp->hfs_devvp;
+       }
+       /* No logical-to-physical mapping was requested. */
+       if (bnp == NULL) {
+               return (0);
+       }
+
+       blksize = GetLogicalBlockSize(vp);
+       byte_offset = (off_t)bn * blksize;
+
+       /* The extents b-tree lock is only taken for forks with overflow extents. */
+       need_extents_lock = overflow_extents(fp);
+       if (need_extents_lock) {
+               lockflags = hfs_systemfile_lock(hfsmp, SFL_EXTENTS, HFS_EXCLUSIVE_LOCK);
+       }
+
+       error = MacToVFSError(MapFileBlockC(HFSTOVCB(hfsmp), (FCB*)fp, MAXPHYSIO,
+                                           byte_offset, bnp, &contig_bytes));
+
+       if (need_extents_lock) {
+               hfs_systemfile_unlock(hfsmp, lockflags);
+       }
+
+       if (error != E_NONE || runp == NULL) {
+               return (error);
+       }
+
+       /*
+        * Report the read-ahead run.  The comparison against blksize keeps
+        * the unsigned subtraction from wrapping below zero.
+        */
+       if (can_cluster(blksize) && contig_bytes >= blksize) {
+               *runp = (contig_bytes / blksize) - 1;
+       } else {
+               *runp = 0;
+       }
+       return (error);
+}
+
+/*
+ * hfs_vnop_blktooff - convert a logical block number to a file byte offset.
+ *
+ *     struct vnop_blktooff_args {
+ *             vnode_t a_vp;
+ *             daddr64_t a_lblkno;
+ *             off_t *a_offset;
+ *     };
+ *
+ * Stores a_lblkno * GetLogicalBlockSize(a_vp) in *a_offset and returns 0.
+ * Returns EINVAL when a_vp is NULL.
+ */
+int
+hfs_vnop_blktooff(struct vnop_blktooff_args *ap)
+{
+       off_t blksize;
+
+       if (ap->a_vp == NULL) {
+               return (EINVAL);
+       }
+
+       blksize = (off_t)GetLogicalBlockSize(ap->a_vp);
+       *ap->a_offset = (off_t)ap->a_lblkno * blksize;
+
+       return (0);
+}
+
+/*
+ * hfs_vnop_offtoblk - convert a file byte offset to a logical block number.
+ *
+ *     struct vnop_offtoblk_args {
+ *             vnode_t a_vp;
+ *             off_t a_offset;
+ *             daddr64_t *a_lblkno;
+ *     };
+ *
+ * Stores a_offset / GetLogicalBlockSize(a_vp) in *a_lblkno and returns 0.
+ * Returns EINVAL when a_vp is NULL.
+ */
+int
+hfs_vnop_offtoblk(struct vnop_offtoblk_args *ap)
+{
+       off_t blksize;
+
+       if (ap->a_vp == NULL) {
+               return (EINVAL);
+       }
+
+       blksize = (off_t)GetLogicalBlockSize(ap->a_vp);
+       *ap->a_lblkno = (daddr64_t)(ap->a_offset / blksize);
+
+       return (0);
+}
+
+/*
+ * Map file offset to physical block number.
+ *
+ * If this function is called for write operation, and if the file
+ * had virtual blocks allocated (delayed allocation), real blocks
+ * are allocated by calling ExtendFileC().
+ * 
+ * If this function is called for read operation, and if the file
+ * had virtual blocks allocated (delayed allocation), no change 
+ * to the size of file is done, and if required, rangelist is 
+ * searched for mapping.
+ *
+ * System file cnodes are expected to be locked (shared or exclusive).
+ *
+ * For vnodes that are not system files, symlinks or swap files, the
+ * cnode lock is taken here (HFS_FORCE_LOCK) unless the current thread
+ * already owns it, and is dropped again before returning.
+ *
+ * Results:
+ *   *a_bpn   physical block number, or -1 when the offset falls in an
+ *            invalid (not yet written) range of the fork
+ *   *a_run   if a_run is non-NULL and the call succeeds, the number of
+ *            contiguous bytes described, never more than a_size when an
+ *            invalid range was involved
+ *   *a_poff  if a_poff is non-NULL and the call succeeds, set to 0
+ *
+ * Returns 0 immediately when a_bpn is NULL.  Returns ENOTSUP for
+ * directories and, when HFS_COMPRESSION is built in, for data forks of
+ * files in the FILE_IS_COMPRESSED state.  Any other error is passed
+ * through MacToVFSError() before being returned.
+ */
+int
+hfs_vnop_blockmap(struct vnop_blockmap_args *ap)
+/*
+       struct vnop_blockmap_args {
+               vnode_t a_vp;               -- file being mapped
+               off_t a_foffset;            -- byte offset to map
+               size_t a_size;              -- number of bytes of interest
+               daddr64_t *a_bpn;           -- out: physical block, or -1
+               size_t *a_run;              -- out (optional): contiguous bytes
+               void *a_poff;               -- out (optional): written as int 0
+               int a_flags;                -- tested for VNODE_READ / VNODE_WRITE
+               vfs_context_t a_context;    -- not referenced by this function
+       };
+*/
+{
+       struct vnode *vp = ap->a_vp;
+       struct cnode *cp;
+       struct filefork *fp;
+       struct hfsmount *hfsmp;
+       size_t bytesContAvail = 0;
+       int retval = E_NONE;
+       int syslocks = 0;       /* SFL_* mask currently held, 0 if none */
+       int lockflags = 0;
+       struct rl_entry *invalid_range;
+       enum rl_overlaptype overlaptype;
+       int started_tr = 0;     /* non-zero while a transaction is open */
+       int tooklock = 0;       /* non-zero if the cnode lock was taken here */
+
+#if HFS_COMPRESSION
+       if (VNODE_IS_RSRC(vp)) {
+               /* allow blockmaps to the resource fork */
+       } else {
+               if ( hfs_file_is_compressed(VTOC(vp), 1) ) { /* 1 == don't take the cnode lock */
+                       int state = decmpfs_cnode_get_vnode_state(VTOCMP(vp));
+                       switch(state) {
+                               case FILE_IS_COMPRESSED:
+                                       return ENOTSUP;
+                               case FILE_IS_CONVERTING:
+                                       /* if FILE_IS_CONVERTING, we allow blockmap */
+                                       break;
+                               default:
+                                       printf("invalid state %d for compressed file\n", state);
+                                       /* fall through */
+                       }
+               }
+       }
+#endif /* HFS_COMPRESSION */
+
+       /* Do not allow blockmap operation on a directory */
+       if (vnode_isdir(vp)) {
+               return (ENOTSUP);
+       }
+
+       /*
+        * Check for underlying vnode requests and ensure that logical
+        * to physical mapping is requested.
+        */
+       if (ap->a_bpn == NULL)
+               return (0);
+
+       if ( !vnode_issystem(vp) && !vnode_islnk(vp) && !vnode_isswap(vp)) {
+               if (VTOC(vp)->c_lockowner != current_thread()) {
+                       hfs_lock(VTOC(vp), HFS_FORCE_LOCK);     /* return value is not checked */
+                       tooklock = 1;
+               }
+       }
+       hfsmp = VTOHFS(vp);
+       cp = VTOC(vp);
+       fp = VTOF(vp);
+
+retry:
+       /* Check virtual blocks only when performing write operation */
+       if ((ap->a_flags & VNODE_WRITE) && (fp->ff_unallocblocks != 0)) {
+               if (hfs_start_transaction(hfsmp) != 0) {
+                       retval = EINVAL;
+                       goto exit;
+               } else {
+                       started_tr = 1;
+               }
+               syslocks = SFL_EXTENTS | SFL_BITMAP;
+               
+       } else if (overflow_extents(fp)) {
+               syslocks = SFL_EXTENTS;
+       }
+       
+       if (syslocks)
+               lockflags = hfs_systemfile_lock(hfsmp, syslocks, HFS_EXCLUSIVE_LOCK);
+
+       /*
+        * Check for any delayed allocations.
+        */
+       if ((ap->a_flags & VNODE_WRITE) && (fp->ff_unallocblocks != 0)) {
+               int64_t actbytes;
+               u_int32_t loanedBlocks;
+
+               // 
+               // Make sure we have a transaction.  It's possible
+               // that we came in and fp->ff_unallocblocks was zero
+               // but during the time we blocked acquiring the extents
+               // btree, ff_unallocblocks became non-zero and so we
+               // will need to start a transaction.
+               //
+               if (started_tr == 0) {
+                       if (syslocks) {
+                               hfs_systemfile_unlock(hfsmp, lockflags);
+                               syslocks = 0;
+                       }
+                       goto retry;
+               }
+
+               /*
+                * Note: ExtendFileC will Release any blocks on loan and
+                * acquire real blocks.  So we ask to extend by zero bytes
+                * since ExtendFileC will account for the virtual blocks.
+                */
+
+               loanedBlocks = fp->ff_unallocblocks;    /* saved so the counts can be restored on failure */
+               retval = ExtendFileC(hfsmp, (FCB*)fp, 0, 0,
+                                    kEFAllMask | kEFNoClumpMask, &actbytes);
+
+               if (retval) {
+                       fp->ff_unallocblocks = loanedBlocks;
+                       cp->c_blocks += loanedBlocks;
+                       fp->ff_blocks += loanedBlocks;
+
+                       HFS_MOUNT_LOCK(hfsmp, TRUE);
+                       hfsmp->loanedBlocks += loanedBlocks;
+                       HFS_MOUNT_UNLOCK(hfsmp, TRUE);
+
+                       hfs_systemfile_unlock(hfsmp, lockflags);
+                       cp->c_flag |= C_MODIFIED;
+                       if (started_tr) {
+                               (void) hfs_update(vp, TRUE);
+                               (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
+
+                               hfs_end_transaction(hfsmp);
+                               started_tr = 0;
+                       }
+                       goto exit;
+               }
+       }
+
+       retval = MapFileBlockC(hfsmp, (FCB *)fp, ap->a_size, ap->a_foffset,
+                              ap->a_bpn, &bytesContAvail);
+       if (syslocks) {
+               hfs_systemfile_unlock(hfsmp, lockflags);
+               syslocks = 0;
+       }
+
+       if (started_tr) {
+               (void) hfs_update(vp, TRUE);
+               (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
+               hfs_end_transaction(hfsmp);
+               started_tr = 0;
+       }       
+       if (retval) {
+               /* On write, always return error because virtual blocks, if any, 
+                * should have been allocated in ExtendFileC().  We do not 
+                * allocate virtual blocks on read, therefore return error 
+                * only if no virtual blocks are allocated.  Otherwise we search
+                * rangelist for zero-fills
+                */
+               if ((MacToVFSError(retval) != ERANGE) ||
+                   (ap->a_flags & VNODE_WRITE) ||
+                   ((ap->a_flags & VNODE_READ) && (fp->ff_unallocblocks == 0))) {
+                       goto exit;
+               } 
+               
+               /* Validate if the start offset is within logical file size */
+               if (ap->a_foffset > fp->ff_size) {
+                       goto exit;
+               }
+
+               /* Searching file extents has failed for read operation, therefore 
+                * search rangelist for any uncommitted holes in the file. 
+                */
+               overlaptype = rl_scan(&fp->ff_invalidranges, ap->a_foffset,
+                                     ap->a_foffset + (off_t)(ap->a_size - 1),
+                                     &invalid_range);
+               switch(overlaptype) {
+               case RL_OVERLAPISCONTAINED:
+                       /* start_offset <= rl_start, end_offset >= rl_end */
+                       if (ap->a_foffset != invalid_range->rl_start) {
+                               break;
+                       }       /* otherwise fall through */
+               case RL_MATCHINGOVERLAP:
+                       /* start_offset = rl_start, end_offset = rl_end */
+               case RL_OVERLAPCONTAINSRANGE:
+                       /* start_offset >= rl_start, end_offset <= rl_end */
+               case RL_OVERLAPSTARTSBEFORE:
+                       /* start_offset > rl_start, end_offset >= rl_start */
+                       if ((off_t)fp->ff_size > (invalid_range->rl_end + 1)) {
+                               bytesContAvail = (invalid_range->rl_end + 1) - ap->a_foffset;
+                       } else {
+                               bytesContAvail = fp->ff_size - ap->a_foffset;
+                       }
+                       if (bytesContAvail > ap->a_size) {
+                               bytesContAvail = ap->a_size;
+                       }
+                       *ap->a_bpn = (daddr64_t)-1;     /* no valid block for this offset */
+                       retval = 0;     /* the ERANGE from the extent lookup is cleared here */
+                       break;
+               case RL_OVERLAPENDSAFTER:
+                       /* start_offset < rl_start, end_offset < rl_end */
+               case RL_NOOVERLAP:
+                       break;
+               }
+               goto exit;
+       }
+
+       /* MapFileBlockC() found a valid extent in the filefork.  Search the 
+        * mapping information further for invalid file ranges 
+        */
+       overlaptype = rl_scan(&fp->ff_invalidranges, ap->a_foffset,
+                             ap->a_foffset + (off_t)bytesContAvail - 1,
+                             &invalid_range);
+       if (overlaptype != RL_NOOVERLAP) {
+               switch(overlaptype) {
+               case RL_MATCHINGOVERLAP:
+               case RL_OVERLAPCONTAINSRANGE:
+               case RL_OVERLAPSTARTSBEFORE:
+                       /* There's no valid block for this byte offset */
+                       *ap->a_bpn = (daddr64_t)-1;
+                       /* There's no point limiting the amount to be returned
+                        * if the invalid range that was hit extends all the way 
+                        * to the EOF (i.e. there's no valid bytes between the
+                        * end of this range and the file's EOF):
+                        */
+                       if (((off_t)fp->ff_size > (invalid_range->rl_end + 1)) &&
+                           ((size_t)(invalid_range->rl_end + 1 - ap->a_foffset) < bytesContAvail)) {
+                               bytesContAvail = invalid_range->rl_end + 1 - ap->a_foffset;
+                       }
+                       break;
+       
+               case RL_OVERLAPISCONTAINED:
+               case RL_OVERLAPENDSAFTER:
+                       /* The range of interest hits an invalid block before the end: */
+                       if (invalid_range->rl_start == ap->a_foffset) {
+                               /* There's actually no valid information to be had starting here: */
+                               *ap->a_bpn = (daddr64_t)-1;
+                               if (((off_t)fp->ff_size > (invalid_range->rl_end + 1)) &&
+                                   ((size_t)(invalid_range->rl_end + 1 - ap->a_foffset) < bytesContAvail)) {
+                                       bytesContAvail = invalid_range->rl_end + 1 - ap->a_foffset;
+                               }
+                       } else {
+                               bytesContAvail = invalid_range->rl_start - ap->a_foffset;       /* valid bytes stop where the invalid range starts */
+                       }
+                       break;
+
+               case RL_NOOVERLAP:
+                       break;
+               } /* end switch */
+               if (bytesContAvail > ap->a_size)
+                       bytesContAvail = ap->a_size;
+       } 
+               
+exit:
+       if (retval == 0) {
+               if (ap->a_run)
+                       *ap->a_run = bytesContAvail;
+
+               if (ap->a_poff)
+                       *(int *)ap->a_poff = 0; /* a_poff is a void *; it is written as an int */
+       }
+
+       if (tooklock)
+               hfs_unlock(cp);
+
+       return (MacToVFSError(retval));
+}
+
+
+/*
+ * hfs_vnop_strategy - prepare and issue the I/O for a buffer.
+ *
+ * The buffer's vnode is used to find the mount's device vnode, and the
+ * request is handed to buf_strategy(), which knows how to deal with
+ * requests that require fragmented I/Os.
+ *
+ * Returns whatever buf_strategy() returns.
+ */
+int
+hfs_vnop_strategy(struct vnop_strategy_args *ap)
+{
+       vnode_t vp = buf_vnode(ap->a_bp);
+       struct hfsmount *hfsmp = VTOHFS(vp);
+
+       return (buf_strategy(hfsmp->hfs_devvp, ap));
+}
+
+/*
+ * hfs_minorupdate - clear the pending-update state on a vnode's cnode.
+ *
+ * Clears C_MODIFIED in c_flag and zeroes the c_touch_acctime,
+ * c_touch_chgtime and c_touch_modtime fields.  Always returns 0.
+ */
+static int
+hfs_minorupdate(struct vnode *vp)
+{
+       struct cnode *node = VTOC(vp);
+
+       node->c_flag &= ~C_MODIFIED;
+
+       /* Drop any pending timestamp touches. */
+       node->c_touch_acctime = 0;
+       node->c_touch_chgtime = 0;
+       node->c_touch_modtime = 0;
+
+       return (0);
+}
+
+static int
+do_hfs_truncate(struct vnode *vp, off_t length, int flags, int skipupdate, vfs_context_t context)
+{
+       register struct cnode *cp = VTOC(vp);
+       struct filefork *fp = VTOF(vp);
+       struct proc *p = vfs_context_proc(context);;
+       kauth_cred_t cred = vfs_context_ucred(context);
+       int retval;
+       off_t bytesToAdd;
+       off_t actualBytesAdded;
+       off_t filebytes;
+       u_int32_t fileblocks;
+       int blksize;
+       struct hfsmount *hfsmp;
+       int lockflags;
+
+       blksize = VTOVCB(vp)->blockSize;
+       fileblocks = fp->ff_blocks;
+       filebytes = (off_t)fileblocks * (off_t)blksize;
+
+       KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 7)) | DBG_FUNC_START,
+                (int)length, (int)fp->ff_size, (int)filebytes, 0, 0);
+
+       if (length < 0)
+               return (EINVAL);
+
+       /* This should only happen with a corrupt filesystem */
+       if ((off_t)fp->ff_size < 0)
+               return (EINVAL);
+
+       if ((!ISHFSPLUS(VTOVCB(vp))) && (length > (off_t)MAXHFSFILESIZE))
+               return (EFBIG);
+
+       hfsmp = VTOHFS(vp);
+
+       retval = E_NONE;
+
+       /* Files that are changing size are not hot file candidates. */
+       if (hfsmp->hfc_stage == HFC_RECORDING) {
+               fp->ff_bytesread = 0;
+       }
+
+       /* 
+        * We cannot just check if fp->ff_size == length (as an optimization)
+        * since there may be extra physical blocks that also need truncation.
+        */
+#if QUOTA
+       if ((retval = hfs_getinoquota(cp)))
+               return(retval);
+#endif /* QUOTA */
+
+       /*
+        * Lengthen the size of the file. We must ensure that the
+        * last byte of the file is allocated. Since the smallest
+        * value of ff_size is 0, length will be at least 1.
+        */
+       if (length > (off_t)fp->ff_size) {
+#if QUOTA
+               retval = hfs_chkdq(cp, (int64_t)(roundup(length - filebytes, blksize)),
+                                  cred, 0);
+               if (retval)
+                       goto Err_Exit;
+#endif /* QUOTA */
+               /*
+                * If we don't have enough physical space then
+                * we need to extend the physical size.
+                */
+               if (length > filebytes) {
+                       int eflags;
+                       u_int32_t blockHint = 0;
+
+                       /* All or nothing and don't round up to clumpsize. */
+                       eflags = kEFAllMask | kEFNoClumpMask;
+
+                       if (cred && suser(cred, NULL) != 0)
+                               eflags |= kEFReserveMask;  /* keep a reserve */
+
+                       /*
+                        * Allocate Journal and Quota files in metadata zone.
+                        */
+                       if (filebytes == 0 &&
+                           hfsmp->hfs_flags & HFS_METADATA_ZONE &&
+                           hfs_virtualmetafile(cp)) {
+                               eflags |= kEFMetadataMask;
+                               blockHint = hfsmp->hfs_metazone_start;
+                       }
+                       if (hfs_start_transaction(hfsmp) != 0) {
+                           retval = EINVAL;
+                           goto Err_Exit;
+                       }
+
+                       /* Protect extents b-tree and allocation bitmap */
+                       lockflags = SFL_BITMAP;
+                       if (overflow_extents(fp))
+                               lockflags |= SFL_EXTENTS;
+                       lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
+
+                       while ((length > filebytes) && (retval == E_NONE)) {
+                               bytesToAdd = length - filebytes;
+                               retval = MacToVFSError(ExtendFileC(VTOVCB(vp),
+                                                    (FCB*)fp,
+                                                    bytesToAdd,
+                                                    blockHint,
+                                                    eflags,
+                                                    &actualBytesAdded));
+
+                               filebytes = (off_t)fp->ff_blocks * (off_t)blksize;
+                               if (actualBytesAdded == 0 && retval == E_NONE) {
+                                       if (length > filebytes)
+                                               length = filebytes;
+                                       break;
+                               }
+                       } /* endwhile */
+
+                       hfs_systemfile_unlock(hfsmp, lockflags);
+
+                       if (hfsmp->jnl) {
+                               if (skipupdate) {
+                                       (void) hfs_minorupdate(vp);
+                               }
+                               else {
+                                       (void) hfs_update(vp, TRUE);
+                                       (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
+                               }
+                       }
+
+                       hfs_end_transaction(hfsmp);
+
+                       if (retval)
+                               goto Err_Exit;
+
+                       KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 7)) | DBG_FUNC_NONE,
+                               (int)length, (int)fp->ff_size, (int)filebytes, 0, 0);
+               }
+               if (!(flags & IO_NOZEROFILL)) {
+                       if (UBCINFOEXISTS(vp)  && (vnode_issystem(vp) == 0) && retval == E_NONE) {
+                               struct rl_entry *invalid_range;
+                               off_t zero_limit;
+                       
+                               zero_limit = (fp->ff_size + (PAGE_SIZE_64 - 1)) & ~PAGE_MASK_64;
+                               if (length < zero_limit) zero_limit = length;
+
+                               if (length > (off_t)fp->ff_size) {
+                                       struct timeval tv;
+
+                                       /* Extending the file: time to fill out the current last page w. zeroes? */
+                                       if ((fp->ff_size & PAGE_MASK_64) &&
+                                           (rl_scan(&fp->ff_invalidranges, fp->ff_size & ~PAGE_MASK_64,
+                                           fp->ff_size - 1, &invalid_range) == RL_NOOVERLAP)) {
+                                               
+                                               /* There's some valid data at the start of the (current) last page
+                                                  of the file, so zero out the remainder of that page to ensure the
+                                                  entire page contains valid data.  Since there is no invalid range
+                                                  possible past the (current) eof, there's no need to remove anything
+                                                  from the invalid range list before calling cluster_write():  */
+                                               hfs_unlock(cp);
+                                               retval = cluster_write(vp, (struct uio *) 0, fp->ff_size, zero_limit,
+                                                               fp->ff_size, (off_t)0,
+                                                               (flags & IO_SYNC) | IO_HEADZEROFILL | IO_NOZERODIRTY);
+                                               hfs_lock(cp, HFS_FORCE_LOCK);
+                                               if (retval) goto Err_Exit;
+                                               
+                                               /* Merely invalidate the remaining area, if necessary: */
+                                               if (length > zero_limit) {
+                                                       microuptime(&tv);
+                                                       rl_add(zero_limit, length - 1, &fp->ff_invalidranges);
+                                                       cp->c_zftimeout = tv.tv_sec + ZFTIMELIMIT;
+                                               }
+                                       } else {
+                                       /* The page containing the (current) eof is invalid: just add the
+                                          remainder of the page to the invalid list, along with the area
+                                          being newly allocated:
+                                        */
+                                       microuptime(&tv);
+                                       rl_add(fp->ff_size, length - 1, &fp->ff_invalidranges);
+                                       cp->c_zftimeout = tv.tv_sec + ZFTIMELIMIT;
+                                       };
+                               }
+                       } else {
+                                       panic("hfs_truncate: invoked on non-UBC object?!");
+                       };
+               }
+               cp->c_touch_modtime = TRUE;
+               fp->ff_size = length;
+
+       } else { /* Shorten the size of the file */
+
+               if ((off_t)fp->ff_size > length) {
+                       /* Any space previously marked as invalid is now irrelevant: */
+                       rl_remove(length, fp->ff_size - 1, &fp->ff_invalidranges);
+               }
+
+               /* 
+                * Account for any unmapped blocks. Note that the new
+                * file length can still end up with unmapped blocks.
+                */
+               if (fp->ff_unallocblocks > 0) {
+                       u_int32_t finalblks;
+                       u_int32_t loanedBlocks;
+
+                       HFS_MOUNT_LOCK(hfsmp, TRUE);
+
+                       loanedBlocks = fp->ff_unallocblocks;
+                       cp->c_blocks -= loanedBlocks;
+                       fp->ff_blocks -= loanedBlocks;
+                       fp->ff_unallocblocks = 0;
+
+                       hfsmp->loanedBlocks -= loanedBlocks;
+
+                       finalblks = (length + blksize - 1) / blksize;
+                       if (finalblks > fp->ff_blocks) {
+                               /* calculate required unmapped blocks */
+                               loanedBlocks = finalblks - fp->ff_blocks;
+                               hfsmp->loanedBlocks += loanedBlocks;
+
+                               fp->ff_unallocblocks = loanedBlocks;
+                               cp->c_blocks += loanedBlocks;
+                               fp->ff_blocks += loanedBlocks;
+                       }
+                       HFS_MOUNT_UNLOCK(hfsmp, TRUE);
+               }
+
+               /*
+                * For a TBE process the deallocation of the file blocks is
+                * delayed until the file is closed.  And hfs_close calls
+                * truncate with the IO_NDELAY flag set.  So when IO_NDELAY
+                * isn't set, we make sure this isn't a TBE process.
+                */
+               if ((flags & IO_NDELAY) || (proc_tbe(p) == 0)) {
+#if QUOTA
+                 off_t savedbytes = ((off_t)fp->ff_blocks * (off_t)blksize);
+#endif /* QUOTA */
+                 if (hfs_start_transaction(hfsmp) != 0) {
+                     retval = EINVAL;
+                     goto Err_Exit;
+                 }
+
+                       if (fp->ff_unallocblocks == 0) {
+                               /* Protect extents b-tree and allocation bitmap */
+                               lockflags = SFL_BITMAP;
+                               if (overflow_extents(fp))
+                                       lockflags |= SFL_EXTENTS;
+                               lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
+
+                               retval = MacToVFSError(TruncateFileC(VTOVCB(vp),
+                                               (FCB*)fp, length, false));
+
+                               hfs_systemfile_unlock(hfsmp, lockflags);
+                       }
+                       if (hfsmp->jnl) {
+                               if (retval == 0) {
+                                       fp->ff_size = length;
+                               }
+                               if (skipupdate) {
+                                       (void) hfs_minorupdate(vp);
+                               }
+                               else {
+                                       (void) hfs_update(vp, TRUE);
+                                       (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
+                               }
+                       }
+                       hfs_end_transaction(hfsmp);
+
+                       filebytes = (off_t)fp->ff_blocks * (off_t)blksize;
+                       if (retval)
+                               goto Err_Exit;
+#if QUOTA
+                       /* These are bytesreleased */
+                       (void) hfs_chkdq(cp, (int64_t)-(savedbytes - filebytes), NOCRED, 0);
+#endif /* QUOTA */
+               }
+               /* Only set update flag if the logical length changes */
+               if ((off_t)fp->ff_size != length)
+                       cp->c_touch_modtime = TRUE;
+               fp->ff_size = length;
+       }
+       if (cp->c_mode & (S_ISUID | S_ISGID)) {
+               if (!vfs_context_issuser(context)) {
+                       cp->c_mode &= ~(S_ISUID | S_ISGID);
+                       skipupdate = 0;
+               }
+       }
+       if (skipupdate) {
+               retval = hfs_minorupdate(vp);
+       }
+       else {
+               cp->c_touch_chgtime = TRUE;     /* status changed */
+               cp->c_touch_modtime = TRUE;     /* file data was modified */
+               retval = hfs_update(vp, MNT_WAIT);
+       }
+       if (retval) {
+               KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 7)) | DBG_FUNC_NONE,
+                    -1, -1, -1, retval, 0);
+       }
+
+Err_Exit:
+
+       KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 7)) | DBG_FUNC_END,
+                (int)length, (int)fp->ff_size, (int)filebytes, retval, 0);
+
+       return (retval);
+}
+
+
+
+/*
+ * Truncate a cnode to at most length size, freeing (or adding) the
+ * disk blocks.
+ *
+ * Directories are rejected with EISDIR, and a swap file may only be
+ * truncated to length 0 (EPERM otherwise).  The resize itself is done
+ * by do_hfs_truncate().  When the distance between the fork's current
+ * physical size and length exceeds HFS_BIGFILE_SIZE and the fork has
+ * overflow extents, the work is split into HFS_BIGFILE_SIZE steps,
+ * each a separate do_hfs_truncate() call.
+ *
+ *   vp           vnode of the fork to resize
+ *   length       target size in bytes
+ *   flags        passed through to do_hfs_truncate()
+ *   skipsetsize  non-zero when the caller takes care of ubc_setsize()
+ *                itself; ignored when length is 0
+ *   skipupdate   passed through to do_hfs_truncate()
+ *   context      passed through to do_hfs_truncate()
+ *
+ * Returns 0, or the first error reported by do_hfs_truncate(); the
+ * loop stops at that point, so earlier steps remain applied.
+ */
+__private_extern__
+int
+hfs_truncate(struct vnode *vp, off_t length, int flags, int skipsetsize,
+             int skipupdate, vfs_context_t context)
+{
+       struct filefork *fp = VTOF(vp);
+       off_t filebytes;
+       u_int32_t fileblocks;
+       int blksize, error = 0;
+       struct cnode *cp = VTOC(vp);
+
+       /* Cannot truncate an HFS directory! */
+       if (vnode_isdir(vp)) {
+               return (EISDIR);
+       }
+       /* A swap file cannot change size. */
+       if (vnode_isswap(vp) && (length != 0)) {
+               return (EPERM);
+       }
+
+       blksize = VTOVCB(vp)->blockSize;
+       fileblocks = fp->ff_blocks;
+       filebytes = (off_t)fileblocks * (off_t)blksize;  /* physical size of the fork, in bytes */
+
+       //
+       // Have to do this here so that we don't wind up with
+       // i/o pending for blocks that are about to be released
+       // if we truncate the file.
+       //
+       // If skipsetsize is set, then the caller is responsible
+       // for the ubc_setsize.
+       //
+       // Even if skipsetsize is set, if the length is zero we
+       // want to call ubc_setsize() because as of SnowLeopard
+       // it will no longer cause any page-ins and it will drop
+       // any dirty pages so that we don't do any i/o that we
+       // don't have to.  This also prevents a race where i/o
+       // for truncated blocks may overwrite later data if the
+       // blocks get reallocated to a different file.
+       //
+       if (!skipsetsize || length == 0)
+               ubc_setsize(vp, length);
+
+       // have to loop truncating or growing files that are
+       // really big because otherwise transactions can get
+       // enormous and consume too many kernel resources.
+
+       if (length < filebytes) {       /* shrinking below the physical size */
+               while (filebytes > length) {
+                       if ((filebytes - length) > HFS_BIGFILE_SIZE && overflow_extents(fp)) {
+                               filebytes -= HFS_BIGFILE_SIZE;  /* one intermediate step */
+                       } else {
+                               filebytes = length;             /* final (or only) step */
+                       }
+                       cp->c_flag |= C_FORCEUPDATE;
+                       error = do_hfs_truncate(vp, filebytes, flags, skipupdate, context);
+                       if (error)
+                               break;
+               }
+       } else if (length > filebytes) {        /* growing past the physical size */
+               while (filebytes < length) {
+                       if ((length - filebytes) > HFS_BIGFILE_SIZE && overflow_extents(fp)) {
+                               filebytes += HFS_BIGFILE_SIZE;  /* one intermediate step */
+                       } else {
+                               filebytes = length;             /* final (or only) step */
+                       }
+                       cp->c_flag |= C_FORCEUPDATE;
+                       error = do_hfs_truncate(vp, filebytes, flags, skipupdate, context);
+                       if (error)
+                               break;
+               }
+       } else /* length equals the current physical size; one pass is enough */ {
+
+               error = do_hfs_truncate(vp, length, flags, skipupdate, context);
+       }
+       /* Files that are changing size are not hot file candidates. */
+       if (VTOHFS(vp)->hfc_stage == HFC_RECORDING) {
+               fp->ff_bytesread = 0;
+       }
+
+       return (error);
+}
+
+
+
+/*
+ * Preallocate file storage space.
+ *
+ * Changes the amount of disk space allocated to a regular file.  A
+ * request larger than the current physical size extends the fork
+ * through ExtendFileC(); a smaller one is handed to hfs_truncate().
+ *
+ *   a_vp              must be a regular file, otherwise EISDIR
+ *   a_length          requested size in bytes (negative gives EINVAL);
+ *                     added to the current physical size when
+ *                     ALLOCATEFROMPEOF is set
+ *   a_flags           ALLOCATECONTIG, ALLOCATEALL, ALLOCATEFROMPEOF and
+ *                     ALLOCATEFROMVOL are examined
+ *   a_bytesallocated  out: zeroed on entry, then accumulates the bytes
+ *                     added while extending
+ *   a_offset          with ALLOCATEFROMVOL, converted to an allocation
+ *                     block number and used as the placement hint
+ *   a_context         supplies the credential used for the quota check
+ *                     and the reserve decision
+ *
+ * Extension is done in requests of at most HFS_BIGFILE_SIZE bytes, one
+ * transaction per request.
+ *
+ * Returns 0 or an errno value.  The truncate lock and the cnode lock
+ * are taken here and released before returning.
+ */
+int
+hfs_vnop_allocate(struct vnop_allocate_args /* {
+               vnode_t a_vp;
+               off_t a_length;
+               u_int32_t  a_flags;
+               off_t *a_bytesallocated;
+               off_t a_offset;
+               vfs_context_t a_context;
+       } */ *ap)
+{
+       struct vnode *vp = ap->a_vp;
+       struct cnode *cp;
+       struct filefork *fp;
+       ExtendedVCB *vcb;
+       off_t length = ap->a_length;
+       off_t startingPEOF;
+       off_t moreBytesRequested;
+       off_t actualBytesAdded;
+       off_t filebytes;
+       u_int32_t fileblocks;
+       int retval, retval2;
+       u_int32_t blockHint;
+       u_int32_t extendFlags;   /* For call to ExtendFileC */
+       struct hfsmount *hfsmp;
+       kauth_cred_t cred = vfs_context_ucred(ap->a_context);
+       int lockflags;
+
+       *(ap->a_bytesallocated) = 0;
+
+       if (!vnode_isreg(vp))
+               return (EISDIR);
+       if (length < (off_t)0)
+               return (EINVAL);
+       
+       cp = VTOC(vp);
+
+       hfs_lock_truncate(cp, TRUE);    /* TRUE presumably requests the exclusive truncate lock -- TODO confirm */
+
+       if ((retval = hfs_lock(cp, HFS_EXCLUSIVE_LOCK))) {
+               goto Err_Exit;
+       }
+       
+       fp = VTOF(vp);
+       hfsmp = VTOHFS(vp);
+       vcb = VTOVCB(vp);
+
+       fileblocks = fp->ff_blocks;
+       filebytes = (off_t)fileblocks * (off_t)vcb->blockSize;  /* current physical size in bytes */
+
+       if ((ap->a_flags & ALLOCATEFROMVOL) && (length < filebytes)) {
+               retval = EINVAL;
+               goto Err_Exit;
+       }
+
+       /* Fill in the flags word for the call to Extend the file */
+
+       extendFlags = kEFNoClumpMask;
+       if (ap->a_flags & ALLOCATECONTIG) 
+               extendFlags |= kEFContigMask;
+       if (ap->a_flags & ALLOCATEALL)
+               extendFlags |= kEFAllMask;
+       if (cred && suser(cred, NULL) != 0)
+               extendFlags |= kEFReserveMask;
+       if (hfs_virtualmetafile(cp))
+               extendFlags |= kEFMetadataMask;
+
+       retval = E_NONE;
+       blockHint = 0;
+       startingPEOF = filebytes;       /* physical size before any change, compared against below */
+
+       if (ap->a_flags & ALLOCATEFROMPEOF)
+               length += filebytes;    /* a_length was relative to the physical EOF */
+       else if (ap->a_flags & ALLOCATEFROMVOL)
+               blockHint = ap->a_offset / VTOVCB(vp)->blockSize;       /* byte offset -> allocation block */
+
+       /* If no changes are necessary, then we're done */
+       if (filebytes == length)
+               goto Std_Exit;
+
+       /*
+        * Lengthen the size of the file. We must ensure that the
+        * last byte of the file is allocated. Since the smallest
+        * value of filebytes is 0, length will be at least 1.
+        */
+       if (length > filebytes) {
+               off_t total_bytes_added = 0, orig_request_size;
+
+               orig_request_size = moreBytesRequested = length - filebytes;
+               
+#if QUOTA
+               retval = hfs_chkdq(cp,
+                               (int64_t)(roundup(moreBytesRequested, vcb->blockSize)), 
+                               cred, 0);
+               if (retval)
+                       goto Err_Exit;
+
+#endif /* QUOTA */
+               /*
+                * Metadata zone checks.
+                */
+               if (hfsmp->hfs_flags & HFS_METADATA_ZONE) {
+                       /*
+                        * Allocate Journal and Quota files in metadata zone.
+                        */
+                       if (hfs_virtualmetafile(cp)) {
+                               blockHint = hfsmp->hfs_metazone_start;
+                       } else if ((blockHint >= hfsmp->hfs_metazone_start) &&
+                                  (blockHint <= hfsmp->hfs_metazone_end)) {
+                               /*
+                                * Move blockHint outside metadata zone.
+                                */
+                               blockHint = hfsmp->hfs_metazone_end + 1;
+                       }
+               }
+
+
+               while ((length > filebytes) && (retval == E_NONE)) {
+                   off_t bytesRequested;
+                   
+                   if (hfs_start_transaction(hfsmp) != 0) {
+                       retval = EINVAL;
+                       goto Err_Exit;
+                   }
+
+                   /* Protect extents b-tree and allocation bitmap */
+                   lockflags = SFL_BITMAP;
+                   if (overflow_extents(fp))
+                       lockflags |= SFL_EXTENTS;
+                   lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
+
+                   if (moreBytesRequested >= HFS_BIGFILE_SIZE) {       /* cap each request at HFS_BIGFILE_SIZE */
+                       bytesRequested = HFS_BIGFILE_SIZE;
+                   } else {
+                       bytesRequested = moreBytesRequested;
+                   }
+
+                   if (extendFlags & kEFContigMask) {
+                           // if we're on a sparse device, this will force it to do a
+                           // full scan to find the space needed.
+                           hfsmp->hfs_flags &= ~HFS_DID_CONTIG_SCAN;
+                   }
+
+                   retval = MacToVFSError(ExtendFileC(vcb,
+                                               (FCB*)fp,
+                                               bytesRequested,
+                                               blockHint,
+                                               extendFlags,
+                                               &actualBytesAdded));
+
+                   if (retval == E_NONE) {
+                       *(ap->a_bytesallocated) += actualBytesAdded;
+                       total_bytes_added += actualBytesAdded;
+                       moreBytesRequested -= actualBytesAdded;
+                       if (blockHint != 0) {
+                           blockHint += actualBytesAdded / vcb->blockSize;     /* move the hint past what was just added */
+                       }
+                   }
+                   filebytes = (off_t)fp->ff_blocks * (off_t)vcb->blockSize;
+                   
+                   hfs_systemfile_unlock(hfsmp, lockflags);
+
+                   if (hfsmp->jnl) {
+                       (void) hfs_update(vp, TRUE);
+                       (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
+                   }
+
+                   hfs_end_transaction(hfsmp);
+               }
+
+
+               /*
+                * if we get an error and no changes were made then exit
+                * otherwise we must do the hfs_update to reflect the changes
+                */
+               if (retval && (startingPEOF == filebytes))
+                       goto Err_Exit;
+        
+               /*
+                * Adjust actualBytesAdded to be allocation block aligned, not
+                * clump size aligned.
+                * NOTE: So what we are reporting does not affect reality
+                * until the file is closed, when we truncate the file to allocation
+                * block size.
+                *
+                * In code terms: when more was added than originally requested,
+                * the reported count becomes the original request rounded up
+                * to a multiple of the allocation block size.
+                */
+               if (total_bytes_added != 0 && orig_request_size < total_bytes_added)
+                       *(ap->a_bytesallocated) =
+                               roundup(orig_request_size, (off_t)vcb->blockSize);
+
+       } else { /* Shorten the size of the file */
+
+               if (fp->ff_size > length) {
+                       /*
+                        * Any buffers that are past the truncation point need to be
+                        * invalidated (to maintain buffer cache consistency).
+                        *
+                        * NOTE(review): no statement follows; this block is empty.
+                        */
+               }
+
+               retval = hfs_truncate(vp, length, 0, 0, 0, ap->a_context);
+               filebytes = (off_t)fp->ff_blocks * (off_t)vcb->blockSize;
+
+               /*
+                * if we get an error and no changes were made then exit
+                * otherwise we must do the hfs_update to reflect the changes
+                */
+               if (retval && (startingPEOF == filebytes)) goto Err_Exit;
+#if QUOTA
+               /* These are  bytesreleased */
+               (void) hfs_chkdq(cp, (int64_t)-((startingPEOF - filebytes)), NOCRED,0);
+#endif /* QUOTA */
+
+               if (fp->ff_size > filebytes) {
+                       fp->ff_size = filebytes;
+
+                       hfs_unlock(cp);         /* the cnode lock is not held across ubc_setsize() */
+                       ubc_setsize(vp, fp->ff_size);
+                       hfs_lock(cp, HFS_FORCE_LOCK);
+               }
+       }
+
+Std_Exit:
+       cp->c_touch_chgtime = TRUE;
+       cp->c_touch_modtime = TRUE;
+       retval2 = hfs_update(vp, MNT_WAIT);
+
+       if (retval == 0)        /* an earlier error takes precedence over the update result */
+               retval = retval2;
+Err_Exit:
+       hfs_unlock_truncate(cp, TRUE);
+       hfs_unlock(cp);         /* NOTE(review): also reached when hfs_lock() failed above -- confirm unlocking is safe then */
+       return (retval);
+}
+
+
+/*
+ * Pagein for HFS filesystem
+ *
+ * Pages data in through cluster_pagein(), passing the fork's logical
+ * size (ff_size) as the file size.  When built with HFS_COMPRESSION,
+ * a pagein on a compressed file (other than on its resource fork) is
+ * first offered to decmpfs_pagein_compressed(); if the file is still
+ * compressed afterwards, that result is returned directly.
+ *
+ * After a successful cluster_pagein() on a non-swap vnode, and only
+ * while hot-file recording is active (hfc_stage == HFC_RECORDING), the
+ * fork's ff_bytesread counter is advanced and an access time update
+ * is requested.
+ *
+ * Returns 0 or the error from the pagein routine that was used.
+ */
+int
+hfs_vnop_pagein(struct vnop_pagein_args *ap)
+/*
+       struct vnop_pagein_args {
+               vnode_t a_vp,
+               upl_t         a_pl,
+               vm_offset_t   a_pl_offset,
+               off_t         a_f_offset,
+               size_t        a_size,
+               int           a_flags
+               vfs_context_t a_context;
+       };
+*/
+{
+       vnode_t vp = ap->a_vp;
+       int error;
+
+#if HFS_COMPRESSION
+       if (VNODE_IS_RSRC(vp)) {
+               /* allow pageins of the resource fork */
+       } else {
+               int compressed = hfs_file_is_compressed(VTOC(vp), 1); /* 1 == don't take the cnode lock */
+               if (compressed) {
+                       error = decmpfs_pagein_compressed(ap, &compressed, VTOCMP(vp));
+                       if (compressed) {       /* re-tested: the call above may have cleared it */
+                               if (error == 0) {
+                                       /* successful page-in, update the access time */
+                                       VTOC(vp)->c_touch_acctime = TRUE;
+                                       
+                                       /* compressed files are not hot file candidates */
+                                       if (VTOHFS(vp)->hfc_stage == HFC_RECORDING) {
+                                               VTOF(vp)->ff_bytesread = 0;
+                                       }
+                               }
+                               return error;
+                       }
+                       /* otherwise the file was converted back to a regular file while we were reading it */
+               }
+       }
+#endif
+
+       error = cluster_pagein(vp, ap->a_pl, ap->a_pl_offset, ap->a_f_offset,
+                              ap->a_size, (off_t)VTOF(vp)->ff_size, ap->a_flags);
+       /*
+        * Keep track of blocks read.
+        */
+       if (!vnode_isswap(vp) && VTOHFS(vp)->hfc_stage == HFC_RECORDING && error == 0) {
+               struct cnode *cp;
+               struct filefork *fp;
+               int bytesread;
+               int took_cnode_lock = 0;
+               
+               cp = VTOC(vp);
+               fp = VTOF(vp);
+
+               if (ap->a_f_offset == 0 && fp->ff_size < PAGE_SIZE)
+                       bytesread = fp->ff_size;        /* file shorter than a page: count its size, not a_size */
+               else
+                       bytesread = ap->a_size;
+
+               /* When ff_bytesread exceeds 32-bits, update it behind the cnode lock. */
+               if ((fp->ff_bytesread + bytesread) > 0x00000000ffffffff && cp->c_lockowner != current_thread()) {
+                       hfs_lock(cp, HFS_FORCE_LOCK);
+                       took_cnode_lock = 1;    /* remember to unlock; skipped if this thread already owns the lock */
+               }
+               /*
+                * If this file hasn't been seen since the start of
+                * the current sampling period then start over.
+                */
+               if (cp->c_atime < VTOHFS(vp)->hfc_timebase) {
+                       struct timeval tv;
+
+                       fp->ff_bytesread = bytesread;
+                       microtime(&tv);
+                       cp->c_atime = tv.tv_sec;
+               } else {
+                       fp->ff_bytesread += bytesread;
+               }
+               cp->c_touch_acctime = TRUE;
+               if (took_cnode_lock)
+                       hfs_unlock(cp);
+       }
+       return (error);
+}
+
+/* 
+ * Pageout for HFS filesystem.
+ */
+int
+hfs_vnop_pageout(struct vnop_pageout_args *ap)
+/*
+       struct vnop_pageout_args {
+          vnode_t a_vp,
+          upl_t         a_pl,
+          vm_offset_t   a_pl_offset,
+          off_t         a_f_offset,
+          size_t        a_size,
+          int           a_flags
+          vfs_context_t a_context;
+       };
+*/
+{
+       vnode_t vp = ap->a_vp;
+       struct cnode *cp;
+       struct filefork *fp;
+       int retval = 0;
+       off_t filesize;
+       upl_t           upl;
+       upl_page_info_t* pl;
+       vm_offset_t     a_pl_offset;
+       int             a_flags;
+       int is_pageoutv2 = 0;
+
+       cp = VTOC(vp);
+       fp = VTOF(vp);
+       
+       /*
+        * Figure out where the file ends, for pageout purposes.  If
+        * ff_new_size > ff_size, then we're in the middle of extending the
+        * file via a write, so it is safe (and necessary) that we be able
+        * to pageout up to that point.
+        */
+       filesize = fp->ff_size;
+       if (fp->ff_new_size > filesize)
+               filesize = fp->ff_new_size;
+
+       a_flags = ap->a_flags;
+       a_pl_offset = ap->a_pl_offset;
+
+       /*
+        * we can tell if we're getting the new or old behavior from the UPL
+        */
+       if ((upl = ap->a_pl) == NULL) {
+               int request_flags; 
+
+               is_pageoutv2 = 1;
+               /*
+                * we're in control of any UPL we commit
+                * make sure someone hasn't accidentally passed in UPL_NOCOMMIT 
+                */
+               a_flags &= ~UPL_NOCOMMIT;
+               a_pl_offset = 0;
+
+               /*
+                * take truncate lock (shared) to guard against 
+                * zero-fill thru fsync interfering, but only for v2 
+                */
+               hfs_lock_truncate(cp, 0);
+
+               if (a_flags & UPL_MSYNC) {
+                       request_flags = UPL_UBC_MSYNC | UPL_RET_ONLY_DIRTY;
+               }
+               else {
+                       request_flags = UPL_UBC_PAGEOUT | UPL_RET_ONLY_DIRTY;
+               }
+               ubc_create_upl(vp, ap->a_f_offset, ap->a_size, &upl, &pl, request_flags); 
+
+               if (upl == (upl_t) NULL) {
+                       retval = EINVAL;
+                       goto pageout_done;
+               }
+       }
+       /*
+        * from this point forward upl points at the UPL we're working with
+        * it was either passed in or we successfully created it
+        */
+
+       /* 
+        * Now that HFS is opting into VFC_VFSVNOP_PAGEOUTV2, we may need to operate on our own  
+        * UPL instead of relying on the UPL passed into us.  We go ahead and do that here,
+        * scanning for dirty ranges.  We'll issue our own N cluster_pageout calls, for
+        * N dirty ranges in the UPL.  Note that this is almost a direct copy of the 
+        * logic in vnode_pageout except that we need to do it after grabbing the truncate 
+        * lock in HFS so that we don't lock invert ourselves.  
+        * 
+        * Note that we can still get into this function on behalf of the default pager with
+        * non-V2 behavior (swapfiles).  However in that case, we did not grab locks above 
+        * since fsync and other writing threads will grab the locks, then mark the 
+        * relevant pages as busy.  But the pageout codepath marks the pages as busy, 
+        * and THEN would attempt to grab the truncate lock, which would result in deadlock.  So
+        * we do not try to grab anything for the pre-V2 case, which should only be accessed
+        * by the paging/VM system.
+        */
+
+       if (is_pageoutv2) {
+               off_t f_offset;
+               int offset;
+               int isize; 
+               int pg_index;
+               int error;
+               int error_ret = 0;
+
+               isize = ap->a_size;
+               f_offset = ap->a_f_offset;
+
+               /* 
+                * Scan from the back to find the last page in the UPL, so that we 
+                * aren't looking at a UPL that may have already been freed by the
+                * preceding aborts/completions.
+                */ 
+               for (pg_index = ((isize) / PAGE_SIZE); pg_index > 0;) {
+                       if (upl_page_present(pl, --pg_index))
+                               break;
+                       if (pg_index == 0) {
+                               ubc_upl_abort_range(upl, 0, isize, UPL_ABORT_FREE_ON_EMPTY);
+                               goto pageout_done;
+                       }
+               }
+
+               /* 
+                * initialize the offset variables before we touch the UPL.
+                * f_offset is the position into the file, in bytes
+                * offset is the position into the UPL, in bytes
+                * pg_index is the pg# of the UPL we're operating on.
+                * isize is the offset into the UPL of the last non-clean page. 
+                */
+               isize = ((pg_index + 1) * PAGE_SIZE);   
+
+               offset = 0;
+               pg_index = 0;
+
+               while (isize) {
+                       int  xsize;
+                       int  num_of_pages;
+
+                       if ( !upl_page_present(pl, pg_index)) {
+                               /*
+                                * we asked for RET_ONLY_DIRTY, so it's possible
+                                * to get back empty slots in the UPL.
+                                * just skip over them
+                                */
+                               f_offset += PAGE_SIZE;
+                               offset   += PAGE_SIZE;
+                               isize    -= PAGE_SIZE;
+                               pg_index++;
+
+                               continue;
+                       }
+                       if ( !upl_dirty_page(pl, pg_index)) {
+                               panic ("hfs_vnop_pageout: unforeseen clean page @ index %d for UPL %p\n", pg_index, upl);
+                       }
+
+                       /* 
+                        * We know that we have at least one dirty page.
+                        * Now checking to see how many in a row we have
+                        */
+                       num_of_pages = 1;
+                       xsize = isize - PAGE_SIZE;
+
+                       while (xsize) {
+                               if ( !upl_dirty_page(pl, pg_index + num_of_pages))
+                                       break;
+                               num_of_pages++;
+                               xsize -= PAGE_SIZE;
+                       }
+                       xsize = num_of_pages * PAGE_SIZE;
+
+                       if (!vnode_isswap(vp)) {
+                               off_t end_of_range;
+                               int tooklock;
+
+                               tooklock = 0;
+
+                               if (cp->c_lockowner != current_thread()) {
+                                       if ((retval = hfs_lock(cp, HFS_EXCLUSIVE_LOCK))) {
+                                               /*
+                                                * we're in the v2 path, so we are the
+                                                * owner of the UPL... we may have already
+                                                * processed some of the UPL, so abort it
+                                                * from the current working offset to the
+                                                * end of the UPL
+                                                */
+                                               ubc_upl_abort_range(upl,
+                                                                   offset,
+                                                                   ap->a_size - offset,
+                                                                   UPL_ABORT_FREE_ON_EMPTY);
+                                               goto pageout_done;
+                                       }
+                                       tooklock = 1;
+                               }
+                               end_of_range = f_offset + xsize - 1;
+       
+                               if (end_of_range >= filesize) {
+                                       end_of_range = (off_t)(filesize - 1);
+                               }
+                               if (f_offset < filesize) {
+                                       rl_remove(f_offset, end_of_range, &fp->ff_invalidranges);
+                                       cp->c_flag |= C_MODIFIED;  /* leof is dirty */
+                               }
+                               if (tooklock) {
+                                       hfs_unlock(cp);
+                               }
+                       }
+                       if ((error = cluster_pageout(vp, upl, offset, f_offset,
+                                                       xsize, filesize, a_flags))) {
+                               if (error_ret == 0)
+                                       error_ret = error;
+                       }
+                       f_offset += xsize;
+                       offset   += xsize;
+                       isize    -= xsize;
+                       pg_index += num_of_pages;
+               }
+               /* capture errnos bubbled out of cluster_pageout if they occurred */
+               if (error_ret != 0) {
+                       retval = error_ret;
+               }
+       } /* end block for v2 pageout behavior */
+       else {
+               if (!vnode_isswap(vp)) {
+                       off_t end_of_range;
+                       int tooklock = 0;
+
+                       if (cp->c_lockowner != current_thread()) {
+                               if ((retval = hfs_lock(cp, HFS_EXCLUSIVE_LOCK))) {
+                                       if (!(a_flags & UPL_NOCOMMIT)) {
+                                               ubc_upl_abort_range(upl,
+                                                                   a_pl_offset,
+                                                                   ap->a_size,
+                                                                   UPL_ABORT_FREE_ON_EMPTY);
+                                       }
+                                       goto pageout_done;
+                               }
+                               tooklock = 1;
+                       }
+                       end_of_range = ap->a_f_offset + ap->a_size - 1;
+       
+                       if (end_of_range >= filesize) {
+                               end_of_range = (off_t)(filesize - 1);
+                       }
+                       if (ap->a_f_offset < filesize) {
+                               rl_remove(ap->a_f_offset, end_of_range, &fp->ff_invalidranges);
+                               cp->c_flag |= C_MODIFIED;  /* leof is dirty */
+                       }
+
+                       if (tooklock) {
+                               hfs_unlock(cp);
+                       }
+               }
+               /* 
+                * just call cluster_pageout for old pre-v2 behavior
+                */
+               retval = cluster_pageout(vp, upl, a_pl_offset, ap->a_f_offset,
+                               ap->a_size, filesize, a_flags);         
+       }
+
+       /*
+        * If data was written, update the modification time of the file.
+        * If setuid or setgid bits are set and this process is not the 
+        * superuser then clear the setuid and setgid bits as a precaution 
+        * against tampering.
+        */
+       if (retval == 0) {
+               cp->c_touch_modtime = TRUE;
+               cp->c_touch_chgtime = TRUE;
+               if ((cp->c_mode & (S_ISUID | S_ISGID)) &&
+                   (vfs_context_suser(ap->a_context) != 0)) {
+                       hfs_lock(cp, HFS_FORCE_LOCK);
+                       cp->c_mode &= ~(S_ISUID | S_ISGID);
+                       hfs_unlock(cp);
+               }
+       }
+
+pageout_done:
+       if (is_pageoutv2) {
+               /* release truncate lock (shared) */
+               hfs_unlock_truncate(cp, 0);
+       }
+       return (retval);
+}
+
+/*
+ * Intercept B-Tree node writes to unswap them if necessary.
+ */
+int
+hfs_vnop_bwrite(struct vnop_bwrite_args *ap)
+{
+       int retval = 0;
+       register struct buf *bp = ap->a_bp;
+       register struct vnode *vp = buf_vnode(bp);
+       BlockDescriptor block;
+
+       /* Trap B-Tree writes */
+       if ((VTOC(vp)->c_fileid == kHFSExtentsFileID) ||
+           (VTOC(vp)->c_fileid == kHFSCatalogFileID) ||
+           (VTOC(vp)->c_fileid == kHFSAttributesFileID) ||
+           (vp == VTOHFS(vp)->hfc_filevp)) {
+
+               /* 
+                * Swap and validate the node if it is in native byte order.
+                * This is always true on big endian, so we always validate
+                * before writing here.  On little endian, the node typically has
+                * been swapped and validated when it was written to the journal,
+                * so we won't do anything here.
+                */
+               if (((u_int16_t *)((char *)buf_dataptr(bp) + buf_count(bp) - 2))[0] == 0x000e) {
+                       /* Prepare the block pointer */
+                       block.blockHeader = bp;
+                       block.buffer = (char *)buf_dataptr(bp);
+                       block.blockNum = buf_lblkno(bp);
+                       /* not found in cache ==> came from disk */
+                       block.blockReadFromDisk = (buf_fromcache(bp) == 0);
+                       block.blockSize = buf_count(bp);
+    
+                       /* Endian un-swap B-Tree node */
+                       retval = hfs_swap_BTNode (&block, vp, kSwapBTNodeHostToBig, false);
+                       if (retval)
+                               panic("hfs_vnop_bwrite: about to write corrupt node!\n");
+               }
        }
 
        }
 
-Err_Exit:
-
-       KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 7)) | DBG_FUNC_END,
-                (int)length, (int)fp->ff_size, (int)filebytes, retval, 0);
+       /* This buffer shouldn't be locked anymore, but if it is, clear it */
+       if ((buf_flags(bp) & B_LOCKED)) {
+               // XXXdbg
+               if (VTOHFS(vp)->jnl) {
+                       panic("hfs: CLEARING the lock bit on bp %p\n", bp);
+               }
+               buf_clearflags(bp, B_LOCKED);
+       }
+       retval = vn_bwrite (ap);
 
        return (retval);
 }
 
 
        return (retval);
 }
 
-
-
 /*
 /*
-#
-#% allocate    vp      L L L
-#
-vop_allocate {
-       IN struct vnode *vp;
-       IN off_t length;
-       IN int flags;
-       OUT off_t *bytesallocated;
-       IN off_t offset;
-       IN struct ucred *cred;
-       IN struct proc *p;
-};
- * allocate a cnode to at most length size
+ * Relocate a file to a new location on disk
+ *  cnode must be locked on entry
+ *
+ * Relocation occurs by cloning the file's data from its
+ * current set of blocks to a new set of blocks. During
+ * the relocation all of the blocks (old and new) are
+ * owned by the file.
+ *
+ * -----------------
+ * |///////////////|
+ * -----------------
+ * 0               N (file offset)
+ *
+ * -----------------     -----------------
+ * |///////////////|     |               |     STEP 1 (acquire new blocks)
+ * -----------------     -----------------
+ * 0               N     N+1             2N
+ *
+ * -----------------     -----------------
+ * |///////////////|     |///////////////|     STEP 2 (clone data)
+ * -----------------     -----------------
+ * 0               N     N+1             2N
+ *
+ *                       -----------------
+ *                       |///////////////|     STEP 3 (head truncate blocks)
+ *                       -----------------
+ *                       0               N
+ *
+ * During steps 2 and 3 page-outs to file offsets less
+ * than or equal to N are suspended.
+ *
+ * During step 3 page-ins to the file get suspended.
  */
  */
-int hfs_allocate(ap)
-       struct vop_allocate_args /* {
-               struct vnode *a_vp;
-               off_t a_length;
-               u_int32_t  a_flags;
-               off_t *a_bytesallocated;
-               off_t a_offset;
-               struct ucred *a_cred;
-               struct proc *a_p;
-       } */ *ap;
+__private_extern__
+int
+hfs_relocate(struct  vnode *vp, u_int32_t  blockHint, kauth_cred_t cred,
+       struct  proc *p)
 {
 {
-       struct vnode *vp = ap->a_vp;
-       struct cnode *cp = VTOC(vp);
-       struct filefork *fp = VTOF(vp);
-       off_t length = ap->a_length;
-       off_t startingPEOF;
-       off_t moreBytesRequested;
-       off_t actualBytesAdded;
-       off_t filebytes;
-       u_long fileblocks;
-       long vflags;
-       struct timeval tv;
-       int retval, retval2;
-       UInt32 blockHint;
-       UInt32 extendFlags =0;   /* For call to ExtendFileC */
-       struct hfsmount *hfsmp;
-
+       struct  cnode *cp;
+       struct  filefork *fp;
+       struct  hfsmount *hfsmp;
+       u_int32_t  headblks;
+       u_int32_t  datablks;
+       u_int32_t  blksize;
+       u_int32_t  growsize;
+       u_int32_t  nextallocsave;
+       daddr64_t  sector_a,  sector_b;
+       int eflags;
+       off_t  newbytes;
+       int  retval;
+       int lockflags = 0;
+       int took_trunc_lock = 0;
+       int started_tr = 0;
+       enum vtype vnodetype;
+
+       vnodetype = vnode_vtype(vp);
+       if (vnodetype != VREG && vnodetype != VLNK) {
+               return (EPERM);
+       }
+       
        hfsmp = VTOHFS(vp);
        hfsmp = VTOHFS(vp);
+       if (hfsmp->hfs_flags & HFS_FRAGMENTED_FREESPACE) {
+               return (ENOSPC);
+       }
 
 
-       *(ap->a_bytesallocated) = 0;
-       fileblocks = fp->ff_blocks;
-       filebytes = (off_t)fileblocks * (off_t)VTOVCB(vp)->blockSize;
-
-       if (length < (off_t)0)
-               return (EINVAL);
-       if (vp->v_type != VREG && vp->v_type != VLNK)
-               return (EISDIR);
-       if ((ap->a_flags & ALLOCATEFROMVOL) && (length <= filebytes))
+       cp = VTOC(vp);
+       fp = VTOF(vp);
+       if (fp->ff_unallocblocks)
                return (EINVAL);
                return (EINVAL);
+       blksize = hfsmp->blockSize;
+       if (blockHint == 0)
+               blockHint = hfsmp->nextAllocation;
 
 
-       /* Fill in the flags word for the call to Extend the file */
-
-       if (ap->a_flags & ALLOCATECONTIG) 
-               extendFlags |= kEFContigMask;
-
-       if (ap->a_flags & ALLOCATEALL)
-               extendFlags |= kEFAllMask;
-
-       if (suser(ap->a_cred, NULL) != 0)
-               extendFlags |= kEFReserveMask;
-
-       tv = time;
-       retval = E_NONE;
-       blockHint = 0;
-       startingPEOF = filebytes;
+       if ((fp->ff_size > 0x7fffffff) ||
+           ((fp->ff_size > blksize) && vnodetype == VLNK)) {
+               return (EFBIG);
+       }
 
 
-       if (ap->a_flags & ALLOCATEFROMPEOF)
-               length += filebytes;
-       else if (ap->a_flags & ALLOCATEFROMVOL)
-               blockHint = ap->a_offset / VTOVCB(vp)->blockSize;
+       //
+       // We do not believe that this call to hfs_fsync() is
+       // necessary and it causes a journal transaction
+       // deadlock so we are removing it.
+       //
+       //if (vnodetype == VREG && !vnode_issystem(vp)) {
+       //      retval = hfs_fsync(vp, MNT_WAIT, 0, p);
+       //      if (retval)
+       //              return (retval);
+       //}
+
+       if (!vnode_issystem(vp) && (vnodetype != VLNK)) {
+               hfs_unlock(cp);
+               hfs_lock_truncate(cp, TRUE);
+               /* Force lock since callers expect the lock to be held. */
+               if ((retval = hfs_lock(cp, HFS_FORCE_LOCK))) {
+                       hfs_unlock_truncate(cp, TRUE);
+                       return (retval);
+               }
+               /* No need to continue if file was removed. */
+               if (cp->c_flag & C_NOEXISTS) {
+                       hfs_unlock_truncate(cp, TRUE);
+                       return (ENOENT);
+               }
+               took_trunc_lock = 1;
+       }
+       headblks = fp->ff_blocks;
+       datablks = howmany(fp->ff_size, blksize);
+       growsize = datablks * blksize;
+       eflags = kEFContigMask | kEFAllMask | kEFNoClumpMask;
+       if (blockHint >= hfsmp->hfs_metazone_start &&
+           blockHint <= hfsmp->hfs_metazone_end)
+               eflags |= kEFMetadataMask;
+
+       if (hfs_start_transaction(hfsmp) != 0) {
+               if (took_trunc_lock)
+                       hfs_unlock_truncate(cp, TRUE);
+           return (EINVAL);
+       }
+       started_tr = 1;
+       /*
+        * Protect the extents b-tree and the allocation bitmap
+        * during MapFileBlockC and ExtendFileC operations.
+        */
+       lockflags = SFL_BITMAP;
+       if (overflow_extents(fp))
+               lockflags |= SFL_EXTENTS;
+       lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
 
 
-       /* If no changes are necesary, then we're done */
-       if (filebytes == length)
-               goto Std_Exit;
+       retval = MapFileBlockC(hfsmp, (FCB *)fp, 1, growsize - 1, &sector_a, NULL);
+       if (retval) {
+               retval = MacToVFSError(retval);
+               goto out;
+       }
 
        /*
 
        /*
-        * Lengthen the size of the file. We must ensure that the
-        * last byte of the file is allocated. Since the smallest
-        * value of filebytes is 0, length will be at least 1.
+        * STEP 1 - acquire new allocation blocks.
         */
         */
-       if (length > filebytes) {
-               moreBytesRequested = length - filebytes;
-               
-#if QUOTA
-               retval = hfs_chkdq(cp,
-                               (int64_t)(roundup(moreBytesRequested, VTOVCB(vp)->blockSize)), 
-                               ap->a_cred, 0);
-               if (retval)
-                       return (retval);
+       nextallocsave = hfsmp->nextAllocation;
+       retval = ExtendFileC(hfsmp, (FCB*)fp, growsize, blockHint, eflags, &newbytes);
+       if (eflags & kEFMetadataMask) {
+               HFS_MOUNT_LOCK(hfsmp, TRUE);
+               HFS_UPDATE_NEXT_ALLOCATION(hfsmp, nextallocsave);
+               MarkVCBDirty(hfsmp);
+               HFS_MOUNT_UNLOCK(hfsmp, TRUE);
+       }
 
 
-#endif /* QUOTA */
-               // XXXdbg
-               hfs_global_shared_lock_acquire(hfsmp);
-               if (hfsmp->jnl) {
-                       if (journal_start_transaction(hfsmp->jnl) != 0) {
-                               retval = EINVAL;
-                               goto Err_Exit;
-                       }
+       retval = MacToVFSError(retval);
+       if (retval == 0) {
+               cp->c_flag |= C_MODIFIED;
+               if (newbytes < growsize) {
+                       retval = ENOSPC;
+                       goto restore;
+               } else if (fp->ff_blocks < (headblks + datablks)) {
+                       printf("hfs_relocate: allocation failed");
+                       retval = ENOSPC;
+                       goto restore;
                }
 
                }
 
-               /* lock extents b-tree (also protects volume bitmap) */
-               retval = hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, LK_EXCLUSIVE, ap->a_p);
+               retval = MapFileBlockC(hfsmp, (FCB *)fp, 1, growsize, &sector_b, NULL);
                if (retval) {
                if (retval) {
-                       if (hfsmp->jnl) {
-                               journal_end_transaction(hfsmp->jnl);
+                       retval = MacToVFSError(retval);
+               } else if ((sector_a + 1) == sector_b) {
+                       retval = ENOSPC;
+                       goto restore;
+               } else if ((eflags & kEFMetadataMask) &&
+                          ((((u_int64_t)sector_b * hfsmp->hfs_logical_block_size) / blksize) >
+                             hfsmp->hfs_metazone_end)) {
+#if 0
+                       const char * filestr;
+                       char emptystr = '\0';
+
+                       if (cp->c_desc.cd_nameptr != NULL) {
+                               filestr = (const char *)&cp->c_desc.cd_nameptr[0];
+                       } else if (vnode_name(vp) != NULL) {
+                               filestr = vnode_name(vp);
+                       } else {
+                               filestr = &emptystr;
                        }
                        }
-                       hfs_global_shared_lock_release(hfsmp);
-                       goto Err_Exit;
-               }
-
-               retval = MacToVFSError(ExtendFileC(VTOVCB(vp),
-                                               (FCB*)fp,
-                                               moreBytesRequested,
-                                               blockHint,
-                                               extendFlags,
-                                               &actualBytesAdded));
-
-               *(ap->a_bytesallocated) = actualBytesAdded;
-               filebytes = (off_t)fp->ff_blocks * (off_t)VTOVCB(vp)->blockSize;
-
-               (void) hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, LK_RELEASE, ap->a_p);
-
-               // XXXdbg
-               if (hfsmp->jnl) {
-                       hfs_flushvolumeheader(hfsmp, MNT_NOWAIT, 0);
-                       journal_end_transaction(hfsmp->jnl);
+#endif
+                       retval = ENOSPC;
+                       goto restore;
                }
                }
-               hfs_global_shared_lock_release(hfsmp);
+       }
+       /* Done with system locks and journal for now. */
+       hfs_systemfile_unlock(hfsmp, lockflags);
+       lockflags = 0;
+       hfs_end_transaction(hfsmp);
+       started_tr = 0;
 
 
+       if (retval) {
                /*
                /*
-                * if we get an error and no changes were made then exit
-                * otherwise we must do the VOP_UPDATE to reflect the changes
-                */
-               if (retval && (startingPEOF == filebytes))
-                       goto Err_Exit;
-        
-               /*
-                * Adjust actualBytesAdded to be allocation block aligned, not
-                * clump size aligned.
-                * NOTE: So what we are reporting does not affect reality
-                * until the file is closed, when we truncate the file to allocation
-                * block size.
+                * Check to see if failure is due to excessive fragmentation.
                 */
                 */
-               if ((actualBytesAdded != 0) && (moreBytesRequested < actualBytesAdded))
-                       *(ap->a_bytesallocated) =
-                               roundup(moreBytesRequested, (off_t)VTOVCB(vp)->blockSize);
-
-       } else { /* Shorten the size of the file */
-
-               if (fp->ff_size > length) {
-                       /*
-                        * Any buffers that are past the truncation point need to be
-                        * invalidated (to maintain buffer cache consistency).  For
-                        * simplicity, we invalidate all the buffers by calling vinvalbuf.
-                        */
-                       vflags = ((length > 0) ? V_SAVE : 0) | V_SAVEMETA;
-                       (void) vinvalbuf(vp, vflags, ap->a_cred, ap->a_p, 0, 0);
-               }
-
-               // XXXdbg
-               hfs_global_shared_lock_acquire(hfsmp);
-               if (hfsmp->jnl) {
-                       if (journal_start_transaction(hfsmp->jnl) != 0) {
-                               retval = EINVAL;
-                               goto Err_Exit;
-                       }
+               if ((retval == ENOSPC) &&
+                   (hfs_freeblks(hfsmp, 0) > (datablks * 2))) {
+                       hfsmp->hfs_flags |= HFS_FRAGMENTED_FREESPACE;
                }
                }
+               goto out;
+       }
+       /*
+        * STEP 2 - clone file data into the new allocation blocks.
+        */
 
 
-               /* lock extents b-tree (also protects volume bitmap) */
-               retval = hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, LK_EXCLUSIVE, ap->a_p);
-               if (retval) {
-                       if (hfsmp->jnl) {
-                               journal_end_transaction(hfsmp->jnl);
-                       }
-                       hfs_global_shared_lock_release(hfsmp);
-
-                       goto Err_Exit;
-               }                       
-
-               retval = MacToVFSError(
-                            TruncateFileC(
-                                            VTOVCB(vp),
-                                            (FCB*)fp,
-                                            length,
-                                            false));
-               (void) hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, LK_RELEASE, ap->a_p);
-               filebytes = (off_t)fp->ff_blocks * (off_t)VTOVCB(vp)->blockSize;
+       if (vnodetype == VLNK)
+               retval = hfs_clonelink(vp, blksize, cred, p);
+       else if (vnode_issystem(vp))
+               retval = hfs_clonesysfile(vp, headblks, datablks, blksize, cred, p);
+       else
+               retval = hfs_clonefile(vp, headblks, datablks, blksize);
+
+       /* Start transaction for step 3 or for a restore. */
+       if (hfs_start_transaction(hfsmp) != 0) {
+               retval = EINVAL;
+               goto out;
+       }
+       started_tr = 1;
+       if (retval)
+               goto restore;
 
 
-               if (hfsmp->jnl) {
-                       hfs_flushvolumeheader(hfsmp, MNT_NOWAIT, 0);
-                       journal_end_transaction(hfsmp->jnl);
-               }
-               hfs_global_shared_lock_release(hfsmp);
-               
+       /*
+        * STEP 3 - switch to cloned data and remove old blocks.
+        */
+       lockflags = SFL_BITMAP;
+       if (overflow_extents(fp))
+               lockflags |= SFL_EXTENTS;
+       lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
+
+       retval = HeadTruncateFile(hfsmp, (FCB*)fp, headblks);
+
+       hfs_systemfile_unlock(hfsmp, lockflags);
+       lockflags = 0;
+       if (retval)
+               goto restore;
+out:
+       if (took_trunc_lock)
+               hfs_unlock_truncate(cp, TRUE);
+
+       if (lockflags) {
+               hfs_systemfile_unlock(hfsmp, lockflags);
+               lockflags = 0;
+       }
 
 
-               /*
-                * if we get an error and no changes were made then exit
-                * otherwise we must do the VOP_UPDATE to reflect the changes
-                */
-               if (retval && (startingPEOF == filebytes)) goto Err_Exit;
-#if QUOTA
-               /* These are  bytesreleased */
-               (void) hfs_chkdq(cp, (int64_t)-((startingPEOF - filebytes)), NOCRED,0);
-#endif /* QUOTA */
+       /* Push cnode's new extent data to disk. */
+       if (retval == 0) {
+               (void) hfs_update(vp, MNT_WAIT);
+       }
+       if (hfsmp->jnl) {
+               if (cp->c_cnid < kHFSFirstUserCatalogNodeID)
+                       (void) hfs_flushvolumeheader(hfsmp, MNT_WAIT, HFS_ALTFLUSH);
+               else
+                       (void) hfs_flushvolumeheader(hfsmp, MNT_NOWAIT, 0);
+       }
+exit:
+       if (started_tr)
+               hfs_end_transaction(hfsmp);
 
 
-               if (fp->ff_size > filebytes) {
-                       fp->ff_size = filebytes;
+       return (retval);
 
 
-                       if (UBCISVALID(vp))
-                               ubc_setsize(vp, fp->ff_size); /* XXX check errors */
-               }
+restore:
+       if (fp->ff_blocks == headblks) {
+               if (took_trunc_lock)
+                       hfs_unlock_truncate(cp, TRUE);
+               goto exit;
+       }
+       /*
+        * Give back any newly allocated space.
+        */
+       if (lockflags == 0) {
+               lockflags = SFL_BITMAP;
+               if (overflow_extents(fp))
+                       lockflags |= SFL_EXTENTS;
+               lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
        }
 
        }
 
-Std_Exit:
-       cp->c_flag |= C_CHANGE | C_UPDATE;
-       retval2 = VOP_UPDATE(vp, &tv, &tv, MNT_WAIT);
+       (void) TruncateFileC(hfsmp, (FCB*)fp, fp->ff_size, false);
 
 
-       if (retval == 0)
-               retval = retval2;
-Err_Exit:
-       return (retval);
+       hfs_systemfile_unlock(hfsmp, lockflags);
+       lockflags = 0;
+
+       if (took_trunc_lock)
+               hfs_unlock_truncate(cp, TRUE);
+       goto exit;
 }
 
 
 /*
 }
 
 
 /*
- * pagein for HFS filesystem
+ * Clone a symlink.
+ *
  */
  */
-int
-hfs_pagein(ap)
-       struct vop_pagein_args /* {
-               struct vnode *a_vp,
-               upl_t         a_pl,
-               vm_offset_t   a_pl_offset,
-               off_t         a_f_offset,
-               size_t        a_size,
-               struct ucred *a_cred,
-               int           a_flags
-       } */ *ap;
+static int
+hfs_clonelink(struct vnode *vp, int blksize, kauth_cred_t cred, __unused struct proc *p)
 {
 {
-       register struct vnode *vp = ap->a_vp;
-       int devBlockSize = 0;
+       struct buf *head_bp = NULL;
+       struct buf *tail_bp = NULL;
        int error;
 
        int error;
 
-       if (vp->v_type != VREG && vp->v_type != VLNK)
-               panic("hfs_pagein: vp not UBC type\n");
 
 
-       VOP_DEVBLOCKSIZE(VTOC(vp)->c_devvp, &devBlockSize);
+       error = (int)buf_meta_bread(vp, (daddr64_t)0, blksize, cred, &head_bp);
+       if (error)
+               goto out;
+
+       tail_bp = buf_getblk(vp, (daddr64_t)1, blksize, 0, 0, BLK_META);
+       if (tail_bp == NULL) {
+               error = EIO;
+               goto out;
+       }
+       bcopy((char *)buf_dataptr(head_bp), (char *)buf_dataptr(tail_bp), blksize);
+       error = (int)buf_bwrite(tail_bp);
+out:
+       if (head_bp) {
+               buf_markinvalid(head_bp);
+               buf_brelse(head_bp);
+       }       
+       (void) buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0);
 
 
-       error = cluster_pagein(vp, ap->a_pl, ap->a_pl_offset, ap->a_f_offset,
-                               ap->a_size, (off_t)VTOF(vp)->ff_size, devBlockSize,
-                               ap->a_flags);
        return (error);
 }
 
        return (error);
 }
 
-/* 
- * pageout for HFS filesystem.
+/*
+ * Clone a file's data within the file.
+ *
+ * Copies the first (blkcnt * blksize) bytes of vp, read starting at file
+ * offset 0, to byte offset (blkstart * blksize) of the same file.  The
+ * data is staged through a temporary kernel buffer of at most 128 KB and
+ * moved with cluster_read/cluster_write using IO_NOCACHE.
+ *
+ * The cnode lock is dropped for the duration of the copy and retaken
+ * with HFS_FORCE_LOCK before returning.  If the staging buffer cannot be
+ * allocated the function returns before the lock is dropped.
+ * NOTE(review): the caller is presumably expected to hold the cnode lock
+ * on entry -- confirm against the call site.
+ *
+ * Returns 0 on success, ENOMEM if the staging buffer cannot be allocated,
+ * EIO on a short read or write, or the error reported by
+ * cluster_read/cluster_write.
  */
  */
-int
-hfs_pageout(ap)
-       struct vop_pageout_args /* {
-          struct vnode *a_vp,
-          upl_t         a_pl,
-          vm_offset_t   a_pl_offset,
-          off_t         a_f_offset,
-          size_t        a_size,
-          struct ucred *a_cred,
-          int           a_flags
-       } */ *ap;
+static int
+hfs_clonefile(struct vnode *vp, int blkstart, int blkcnt, int blksize)
 {
 {
-       struct vnode *vp = ap->a_vp;
-       struct cnode *cp = VTOC(vp);
-       struct filefork *fp = VTOF(vp);
-       int retval;
-       int devBlockSize = 0;
-       off_t end_of_range;
-       off_t filesize;
-
-       if (UBCINVALID(vp))
-               panic("hfs_pageout: Not a  VREG: vp=%x", vp);
-
-       VOP_DEVBLOCKSIZE(cp->c_devvp, &devBlockSize);
-       filesize = fp->ff_size;
-       end_of_range = ap->a_f_offset + ap->a_size - 1;
+       caddr_t  bufp;
+       size_t  bufsize;
+       size_t  copysize;
+        size_t  iosize;
+       size_t  offset;
+       off_t   writebase;
+       uio_t auio;
+       int  error = 0;
+
+       /*
+        * Widen before multiplying: both operands are int, so the product
+        * would otherwise be computed in int and overflow once the byte
+        * offset/length reaches 2 GB.
+        */
+       writebase = (off_t)blkstart * (off_t)blksize;
+       copysize = (size_t)blkcnt * (size_t)blksize;
+       iosize = bufsize = MIN(copysize, 128 * 1024);
+       offset = 0;
+
+       if (kmem_alloc(kernel_map, (vm_offset_t *)&bufp, bufsize)) {
+               return (ENOMEM);
+       }
+       /* the cnode lock is not held across the cluster I/O below */
+       hfs_unlock(VTOC(vp));
+
+       auio = uio_create(1, 0, UIO_SYSSPACE, UIO_READ);
+
+       while (offset < copysize) {
+               /* the last pass may be shorter than the staging buffer */
+               iosize = MIN(copysize - offset, iosize);
+
+               /* read the next chunk from the head of the file */
+               uio_reset(auio, offset, UIO_SYSSPACE, UIO_READ);
+               uio_addiov(auio, (uintptr_t)bufp, iosize);
+
+               error = cluster_read(vp, auio, copysize, IO_NOCACHE);
+               if (error) {
+                       printf("hfs_clonefile: cluster_read failed - %d\n", error);
+                       break;
+               }
+               if (uio_resid(auio) != 0) {
+                       printf("hfs_clonefile: cluster_read: uio_resid = %lld\n", uio_resid(auio));
+                       error = EIO;
+                       break;
+               }
 
 
-       if (end_of_range >= filesize)
-               end_of_range = (off_t)(filesize - 1);
-       if (ap->a_f_offset < filesize)
-               rl_remove(ap->a_f_offset, end_of_range, &fp->ff_invalidranges);
+               /* write the same chunk at its destination offset */
+               uio_reset(auio, writebase + offset, UIO_SYSSPACE, UIO_WRITE);
+               uio_addiov(auio, (uintptr_t)bufp, iosize);
 
 
-       retval = cluster_pageout(vp, ap->a_pl, ap->a_pl_offset, ap->a_f_offset, ap->a_size,
-                                filesize, devBlockSize, ap->a_flags);
+               error = cluster_write(vp, auio, writebase + offset,
+                                     writebase + offset + iosize,
+                                     uio_offset(auio), 0, IO_NOCACHE | IO_SYNC);
+               if (error) {
+                       printf("hfs_clonefile: cluster_write failed - %d\n", error);
+                       break;
+               }
+               if (uio_resid(auio) != 0) {
+                       printf("hfs_clonefile: cluster_write failed - uio_resid not zero\n");
+                       error = EIO;
+                       break;
+               }
+               offset += iosize;
+       }
+       uio_free(auio);
 
 
-       /*
-        * If we successfully wrote any data, and we are not the superuser
-        * we clear the setuid and setgid bits as a precaution against
-        * tampering.
-        */
-       if (retval == 0 && ap->a_cred && ap->a_cred->cr_uid != 0)
-               cp->c_mode &= ~(S_ISUID | S_ISGID);
+       if ((blksize & PAGE_MASK)) {
+               /*
+                * since the copy may not have started on a PAGE
+                * boundary (or may not have ended on one), we 
+                * may have pages left in the cache since NOCACHE
+                * will let partially written pages linger...
+                * let's just flush the entire range to make sure
+                * we don't have any pages left that are beyond
+                * (or intersect) the real LEOF of this file
+                */
+               ubc_msync(vp, writebase, writebase + offset, NULL, UBC_INVALIDATE | UBC_PUSHDIRTY);
+       } else {
+               /*
+                * No need to call ubc_sync_range or hfs_invalbuf
+                * since the file was copied using IO_NOCACHE and
+                * the copy was done starting and ending on a page
+                * boundary in the file.
+                */
+       }
+       kmem_free(kernel_map, (vm_offset_t)bufp, bufsize);
 
 
-       return (retval);
+       /* retake the cnode lock dropped above before returning */
+       hfs_lock(VTOC(vp), HFS_FORCE_LOCK);
+       return (error);
 }
 
 /*
 }
 
 /*
- * Intercept B-Tree node writes to unswap them if necessary.
-#
-#vop_bwrite {
-#      IN struct buf *bp;
+ * Clone a system (metadata) file.
+ *
+ * Copies (blkcnt * blksize) bytes of vp, as logical blocks of
+ * GetLogicalBlockSize(vp) bytes read from logical block 0 onward with
+ * buf_meta_bread, to the logical blocks starting at
+ * (blkstart * blksize) / iosize, written with buf_getblk/buf_bwrite.
+ * Up to 1 MB is staged in a temporary kernel buffer per pass.  Each
+ * source buffer is marked invalid and released once it has been copied.
+ *
+ * hfs_fsync is called before returning whether or not the copy
+ * succeeded (it is skipped only when the staging buffer cannot be
+ * allocated).
+ *
+ * Returns 0 on success, ENOMEM if the staging buffer cannot be allocated,
+ * EIO on a short source buffer or a failed buf_getblk, the error from
+ * buf_meta_bread/buf_bwrite, or -- when the copy itself succeeded -- the
+ * error from hfs_fsync.
  */
  */
-int
-hfs_bwrite(ap)
-       struct vop_bwrite_args /* {
-               struct buf *a_bp;
-       } */ *ap;
+static int
+hfs_clonesysfile(struct vnode *vp, int blkstart, int blkcnt, int blksize,
+                 kauth_cred_t cred, struct proc *p)
 {
 {
-       int retval = 0;
-       register struct buf *bp = ap->a_bp;
-       register struct vnode *vp = bp->b_vp;
-#if BYTE_ORDER == LITTLE_ENDIAN
-       BlockDescriptor block;
+       caddr_t  bufp;
+       char * offset;
+       size_t  bufsize;
+       size_t  iosize;
+       struct buf *bp = NULL;
+       daddr64_t  blkno;
+       daddr64_t  blk;
+       daddr64_t  start_blk;
+       daddr64_t  last_blk;
+       int  breadcnt;
+        int  i;
+       int  error = 0;
+       int  fsync_error;
+
+
+       iosize = GetLogicalBlockSize(vp);
+       /*
+        * Widen before multiplying so the byte count is not computed
+        * (and possibly overflowed) in int; the result is capped at 1 MB
+        * and rounded down to a multiple of iosize.
+        */
+       bufsize = MIN((off_t)blkcnt * (off_t)blksize, 1024 * 1024) & ~(iosize - 1);
+       breadcnt = bufsize / iosize;
+
+       if (kmem_alloc(kernel_map, (vm_offset_t *)&bufp, bufsize)) {
+               return (ENOMEM);
+       }
+       /* convert the allocation-block arguments to logical-block units */
+       start_blk = ((daddr64_t)blkstart * blksize) / iosize;
+       last_blk  = ((daddr64_t)blkcnt * blksize) / iosize;
+       blkno = 0;
+
+       while (blkno < last_blk) {
+               /*
+                * Read up to a megabyte
+                */
+               offset = bufp;
+               for (i = 0, blk = blkno; (i < breadcnt) && (blk < last_blk); ++i, ++blk) {
+                       error = (int)buf_meta_bread(vp, blk, iosize, cred, &bp);
+                       if (error) {
+                               printf("hfs_clonesysfile: meta_bread error %d\n", error);
+                               goto out;
+                       }
+                       if (buf_count(bp) != iosize) {
+                               printf("hfs_clonesysfile: b_bcount is only %d\n", buf_count(bp));
+                               /* a short buffer is a failed copy, not a success */
+                               error = EIO;
+                               goto out;
+                       }
+                       bcopy((char *)buf_dataptr(bp), offset, iosize);
 
 
-       /* Trap B-Tree writes */
-       if ((VTOC(vp)->c_fileid == kHFSExtentsFileID) ||
-           (VTOC(vp)->c_fileid == kHFSCatalogFileID)) {
+                       buf_markinvalid(bp);
+                       buf_brelse(bp);
+                       bp = NULL;
 
 
-               /* Swap if the B-Tree node is in native byte order */
-               if (((UInt16 *)((char *)bp->b_data + bp->b_bcount - 2))[0] == 0x000e) {
-                       /* Prepare the block pointer */
-                       block.blockHeader = bp;
-                       block.buffer = bp->b_data;
-                       /* not found in cache ==> came from disk */
-                       block.blockReadFromDisk = (bp->b_flags & B_CACHE) == 0;
-                       block.blockSize = bp->b_bcount;
-    
-                       /* Endian un-swap B-Tree node */
-                       SWAP_BT_NODE (&block, ISHFSPLUS (VTOVCB(vp)), VTOC(vp)->c_fileid, 1);
+                       offset += iosize;
+               }
+
+               /*
+                * Write up to a megabyte
+                */
+               offset = bufp;
+               for (i = 0; (i < breadcnt) && (blkno < last_blk); ++i, ++blkno) {
+                       bp = buf_getblk(vp, start_blk + blkno, iosize, 0, 0, BLK_META);
+                       if (bp == NULL) {
+                               printf("hfs_clonesysfile: getblk failed on blk %qd\n", start_blk + blkno);
+                               error = EIO;
+                               goto out;
+                       }
+                       bcopy(offset, (char *)buf_dataptr(bp), iosize);
+                       error = (int)buf_bwrite(bp);
+                       /* bp is not released again at out: after buf_bwrite */
+                       bp = NULL;
+                       if (error)
+                               goto out;
+                       offset += iosize;
                 }
                 }
-
-               /* We don't check to make sure that it's 0x0e00 because it could be all zeros */
         }
         }
-#endif
-       /* This buffer shouldn't be locked anymore but if it is clear it */
-       if (ISSET(bp->b_flags, B_LOCKED)) {
-           // XXXdbg
-           if (VTOHFS(vp)->jnl) {
-                       panic("hfs: CLEARING the lock bit on bp 0x%x\n", bp);
-           }
-               CLR(bp->b_flags, B_LOCKED);
-               printf("hfs_bwrite: called with lock bit set\n");
+out:
+       if (bp) {
+               buf_brelse(bp);
         }
         }
-       retval = vn_bwrite (ap);
 
 
-       return (retval);
+       kmem_free(kernel_map, (vm_offset_t)bufp, bufsize);
+
+       /*
+        * Always flush, but do not let a successful fsync hide an
+        * earlier read/write failure from the caller.
+        */
+       fsync_error = hfs_fsync(vp, MNT_WAIT, 0, p);
+       if (error == 0)
+               error = fsync_error;
+
+       return (error);
 }
 }