xnu-7195.81.3.tar.gz

[apple/xnu.git] / bsd / vfs / vfs_syscalls.c
diff --git a/bsd/vfs/vfs_syscalls.c b/bsd/vfs/vfs_syscalls.c

index c9dc444b94886c4aae49a1387f437f7e55a001a4..cb7cf97fddaf0d9a4fccb4207a305c7c9d775192 100644 (file)
--- a/bsd/vfs/vfs_syscalls.c
+++ b/bsd/vfs/vfs_syscalls.c
@@ -1,5 +1,5 @@
  /*
- * Copyright (c) 1995-2017 Apple Inc. All rights reserved.
+ * Copyright (c) 1995-2020 Apple Inc. All rights reserved.
   *
   * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   *
@@ -83,7 +83,7 @@
  #include <sys/proc_internal.h>
  #include <sys/kauth.h>
  #include <sys/uio_internal.h>
-#include <sys/malloc.h>
+#include <kern/kalloc.h>
  #include <sys/mman.h>
  #include <sys/dirent.h>
  #include <sys/attr.h>
@@ -94,6 +94,7 @@
  #include <sys/fsevents.h>
  #include <sys/imgsrc.h>
  #include <sys/sysproto.h>
+#include <sys/sysctl.h>
  #include <sys/xattr.h>
  #include <sys/fcntl.h>
  #include <sys/fsctl.h>
@@ -103,6 +104,7 @@
  #include <sys/clonefile.h>
  #include <sys/snapshot.h>
  #include <sys/priv.h>
+#include <sys/fsgetpath.h>
  #include <machine/cons.h>
  #include <machine/limits.h>
  #include <miscfs/specfs/specdev.h>
@@ -121,9 +123,19 @@
  #include <vm/vm_protos.h>
  
  #include <libkern/OSAtomic.h>
+#include <os/atomic_private.h>
  #include <pexpert/pexpert.h>
  #include <IOKit/IOBSD.h>
  
+// deps for MIG call
+#include <kern/host.h>
+#include <kern/ipc_misc.h>
+#include <mach/host_priv.h>
+#include <mach/vfs_nspace.h>
+#include <os/log.h>
+
+#include <nfs/nfs_conf.h>
+
  #if ROUTEFS
  #include <miscfs/routefs/routefs.h>
  #endif /* ROUTEFS */
@@ -135,14 +147,14 @@
  
  #if CONFIG_FSE
  #define GET_PATH(x) \
-       (x) = get_pathbuff();
+       ((x) = get_pathbuff())
  #define RELEASE_PATH(x) \
-       release_pathbuff(x);
+       release_pathbuff(x)
  #else
  #define GET_PATH(x)     \
-       MALLOC_ZONE((x), char *, MAXPATHLEN, M_NAMEI, M_WAITOK);
+       ((x) = zalloc(ZV_NAMEI))
  #define RELEASE_PATH(x) \
-       FREE_ZONE((x), MAXPATHLEN, M_NAMEI);
+       zfree(ZV_NAMEI, x)
  #endif /* CONFIG_FSE */
  
  #ifndef HFS_GET_BOOT_INFO
@@ -157,6 +169,12 @@
  #define APFSIOC_REVERT_TO_SNAPSHOT  _IOW('J', 1, u_int64_t)
  #endif
  
+/*
+ * If you need accounting for KM_FD_VN_DATA consider using
+ * ZONE_VIEW_DEFINE to define a zone view.
+ */
+#define KM_FD_VN_DATA KHEAP_DEFAULT
+
  extern void disk_conditioner_unmount(mount_t mp);
  
  /* struct for checkdirs iteration */
@@ -177,8 +195,6 @@ static int sync_callback(mount_t, void *);
  static int munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
      user_addr_t bufp, int *sizep, boolean_t is_64_bit,
      boolean_t partial_copy);
-static int statfs64_common(struct mount *mp, struct vfsstatfs *sfsp,
-    user_addr_t bufp);
  static int fsync_common(proc_t p, struct fsync_args *uap, int flags);
  static int mount_common(char *fstypename, vnode_t pvp, vnode_t vp,
      struct componentname *cnp, user_addr_t fsmountargs,
@@ -202,9 +218,13 @@ struct fd_vn_data * fg_vn_data_alloc(void);
   */
  #define MAX_AUTHORIZE_ENOENT_RETRIES 1024
  
-static int rmdirat_internal(vfs_context_t, int, user_addr_t, enum uio_seg);
+/* Max retry limit for rename due to vnode recycling. */
+#define MAX_RENAME_ERECYCLE_RETRIES 1024
+
+static int rmdirat_internal(vfs_context_t, int, user_addr_t, enum uio_seg,
+    int unlink_flags);
  
-static int fsgetpath_internal(vfs_context_t, int, uint64_t, vm_size_t, caddr_t, int *);
+static int fsgetpath_internal(vfs_context_t, int, uint64_t, vm_size_t, caddr_t, uint32_t options, int *);
  
  #ifdef CONFIG_IMGSRC_ACCESS
  static int authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx);
@@ -215,15 +235,18 @@ static void mount_end_update(mount_t mp);
  static int relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp, const char *fsname, vfs_context_t ctx, boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index);
  #endif /* CONFIG_IMGSRC_ACCESS */
  
+#if CONFIG_LOCKERBOOT
+int mount_locker_protoboot(const char *fsname, const char *mntpoint,
+    const char *pbdevpath);
+#endif
+
  //snapshot functions
  #if CONFIG_MNT_ROOTSNAP
-static int snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx);
+static int __attribute__ ((noinline)) snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx);
  #else
-static int snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx) __attribute__((unused));
+static int __attribute__ ((noinline)) snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx) __attribute__((unused));
  #endif
  
-int (*union_dircheckp)(struct vnode **, struct fileproc *, vfs_context_t);
-
  __private_extern__
  int sync_internal(void);
  
@@ -234,6 +257,8 @@ extern lck_grp_t *fd_vn_lck_grp;
  extern lck_grp_attr_t *fd_vn_lck_grp_attr;
  extern lck_attr_t *fd_vn_lck_attr;
  
+extern lck_rw_t * rootvnode_rw_lock;
+
  /*
   * incremented each time a mount or unmount operation occurs
   * used to invalidate the cached value of the rootvp in the
@@ -244,6 +269,9 @@ uint32_t mount_generation = 0;
  /* counts number of mount and unmount operations */
  unsigned int vfs_nummntops = 0;
  
+/* system-wide, per-boot unique mount ID */
+static _Atomic uint64_t mount_unique_id = 1;
+
  extern const struct fileops vnops;
  #if CONFIG_APPLEDOUBLE
  extern errno_t rmdir_remove_orphaned_appleDouble(vnode_t, vfs_context_t, int *);
@@ -253,7 +281,7 @@ extern errno_t rmdir_remove_orphaned_appleDouble(vnode_t, vfs_context_t, int *);
   * Virtual File System System Calls
   */
  
-#if NFSCLIENT || DEVFS || ROUTEFS
+#if CONFIG_NFS_CLIENT || DEVFS || ROUTEFS
  /*
   * Private in-kernel mounting spi (NFS only, not exported)
   */
@@ -267,7 +295,7 @@ vfs_iskernelmount(mount_t mp)
  __private_extern__
  int
  kernel_mount(char *fstype, vnode_t pvp, vnode_t vp, const char *path,
-    void *data, __unused size_t datalen, int syscall_flags, __unused uint32_t kern_flags, vfs_context_t ctx)
+    void *data, __unused size_t datalen, int syscall_flags, uint32_t kern_flags, vfs_context_t ctx)
  {
         struct nameidata nd;
         boolean_t did_namei;
@@ -282,6 +310,9 @@ kernel_mount(char *fstype, vnode_t pvp, vnode_t vp, const char *path,
         if (vp == NULLVP) {
                 error = namei(&nd);
                 if (error) {
+                       if (kern_flags & (KERNEL_MOUNT_SNAPSHOT | KERNEL_MOUNT_VOLBYROLE_MASK)) {
+                               printf("failed to locate mount-on path: %s ", path);
+                       }
                         return error;
                 }
                 vp = nd.ni_vp;
@@ -291,7 +322,7 @@ kernel_mount(char *fstype, vnode_t pvp, vnode_t vp, const char *path,
                 char *pnbuf = CAST_DOWN(char *, path);
  
                 nd.ni_cnd.cn_pnbuf = pnbuf;
-               nd.ni_cnd.cn_pnlen = strlen(pnbuf) + 1;
+               nd.ni_cnd.cn_pnlen = (int)(strlen(pnbuf) + 1);
                 did_namei = FALSE;
         }
  
@@ -306,7 +337,7 @@ kernel_mount(char *fstype, vnode_t pvp, vnode_t vp, const char *path,
  
         return error;
  }
-#endif /* NFSCLIENT || DEVFS */
+#endif /* CONFIG_NFS_CLIENT || DEVFS || ROUTEFS */
  
  /*
   * Mount a file system.
@@ -373,11 +404,11 @@ fmount(__unused proc_t p, struct fmount_args *uap, __unused int32_t *retval)
         }
  
         memset(&cn, 0, sizeof(struct componentname));
-       MALLOC(cn.cn_pnbuf, char *, MAXPATHLEN, M_TEMP, M_WAITOK);
+       cn.cn_pnbuf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
         cn.cn_pnlen = MAXPATHLEN;
  
         if ((error = vn_getpath(vp, cn.cn_pnbuf, &cn.cn_pnlen)) != 0) {
-               FREE(cn.cn_pnbuf, M_TEMP);
+               zfree(ZV_NAMEI, cn.cn_pnbuf);
                 vnode_put(pvp);
                 vnode_put(vp);
                 file_drop(uap->fd);
@@ -386,7 +417,7 @@ fmount(__unused proc_t p, struct fmount_args *uap, __unused int32_t *retval)
  
         error = mount_common(fstypename, pvp, vp, &cn, uap->data, flags, 0, labelstr, FALSE, ctx);
  
-       FREE(cn.cn_pnbuf, M_TEMP);
+       zfree(ZV_NAMEI, cn.cn_pnbuf);
         vnode_put(pvp);
         vnode_put(vp);
         file_drop(uap->fd);
@@ -433,6 +464,7 @@ __mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused int3
         struct nameidata nd;
         size_t dummy = 0;
         char *labelstr = NULL;
+       size_t labelsz = 0;
         int flags = uap->flags;
         int error;
  #if CONFIG_IMGSRC_ACCESS || CONFIG_MACF
@@ -481,8 +513,8 @@ __mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused int3
                 if (is_64bit) {
                         struct user64_mac mac64;
                         error = copyin(uap->mac_p, &mac64, sizeof(mac64));
-                       mac.m_buflen = mac64.m_buflen;
-                       mac.m_string = mac64.m_string;
+                       mac.m_buflen = (user_size_t)mac64.m_buflen;
+                       mac.m_string = (user_addr_t)mac64.m_string;
                 } else {
                         struct user32_mac mac32;
                         error = copyin(uap->mac_p, &mac32, sizeof(mac32));
@@ -497,7 +529,8 @@ __mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused int3
                         error = EINVAL;
                         goto out;
                 }
-               MALLOC(labelstr, char *, mac.m_buflen, M_MACTEMP, M_WAITOK);
+               labelsz = mac.m_buflen;
+               labelstr = kheap_alloc(KHEAP_TEMP, labelsz, Z_WAITOK);
                 error = copyinstr(mac.m_string, labelstr, mac.m_buflen, &ulen);
                 if (error) {
                         goto out;
@@ -558,9 +591,7 @@ __mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused int3
  out:
  
  #if CONFIG_MACF
-       if (labelstr) {
-               FREE(labelstr, M_MACTEMP);
-       }
+       kheap_free(KHEAP_DEFAULT, labelstr, labelsz);
  #endif /* CONFIG_MACF */
  
         if (vp) {
@@ -607,6 +638,7 @@ mount_common(char *fstypename, vnode_t pvp, vnode_t vp,
         struct vfstable *vfsp = (struct vfstable *)0;
         struct proc *p = vfs_context_proc(ctx);
         int error, flag = 0;
+       bool flag_set = false;
         user_addr_t devpath = USER_ADDR_NULL;
         int ronly = 0;
         int mntalloc = 0;
@@ -614,6 +646,23 @@ mount_common(char *fstypename, vnode_t pvp, vnode_t vp,
         boolean_t is_rwlock_locked = FALSE;
         boolean_t did_rele = FALSE;
         boolean_t have_usecount = FALSE;
+       boolean_t did_set_lmount = FALSE;
+
+#if CONFIG_ROSV_STARTUP || CONFIG_MOUNT_VM || CONFIG_BASESYSTEMROOT
+       /* Check for mutually-exclusive flag bits */
+       uint32_t checkflags = (internal_flags & (KERNEL_MOUNT_VOLBYROLE_MASK | KERNEL_MOUNT_BASESYSTEMROOT));
+       int bitcount = 0;
+       while (checkflags != 0) {
+               checkflags &= (checkflags - 1);
+               bitcount++;
+       }
+
+       if (bitcount > 1) {
+               //not allowed to request multiple mount-by-role flags
+               error = EINVAL;
+               goto out1;
+       }
+#endif
  
         /*
          * Process an update for an existing mount
@@ -625,13 +674,15 @@ mount_common(char *fstypename, vnode_t pvp, vnode_t vp,
                 }
                 mp = vp->v_mount;
  
-               /* unmount in progress return error */
+               /* if unmount or mount in progress, return error */
                 mount_lock_spin(mp);
-               if (mp->mnt_lflag & MNT_LUNMOUNT) {
+               if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
                         mount_unlock(mp);
                         error = EBUSY;
                         goto out1;
                 }
+               mp->mnt_lflag |= MNT_LMOUNT;
+               did_set_lmount = TRUE;
                 mount_unlock(mp);
                 lck_rw_lock_exclusive(&mp->mnt_rwlock);
                 is_rwlock_locked = TRUE;
@@ -655,14 +706,22 @@ mount_common(char *fstypename, vnode_t pvp, vnode_t vp,
                         goto out1;
                 }
  
-#ifdef CONFIG_IMGSRC_ACCESS
+               /*
+                * MNT_REMOVABLE can't be turned off either, but callers may not
+                * expect an error for omitting it, so we silently add it back
+                * if it was not passed in.
+                */
+               if ((mp->mnt_flag & MNT_REMOVABLE) &&
+                   ((flags & MNT_REMOVABLE) == 0)) {
+                       flags |= MNT_REMOVABLE;
+               }
+
                 /* Can't downgrade the backer of the root FS */
                 if ((mp->mnt_kern_flag & MNTK_BACKS_ROOT) &&
                     (!vfs_isrdonly(mp)) && (flags & MNT_RDONLY)) {
                         error = ENOTSUP;
                         goto out1;
                 }
-#endif /* CONFIG_IMGSRC_ACCESS */
  
                 /*
                  * Only root, or the user that did the original mount is
@@ -689,6 +748,7 @@ mount_common(char *fstypename, vnode_t pvp, vnode_t vp,
                         }
                 }
                 flag = mp->mnt_flag;
+               flag_set = true;
  
  
  
@@ -696,7 +756,7 @@ mount_common(char *fstypename, vnode_t pvp, vnode_t vp,
  
                 vfsp = mp->mnt_vtable;
                 goto update;
-       }
+       } // MNT_UPDATE
  
         /*
          * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV, and
@@ -726,9 +786,12 @@ mount_common(char *fstypename, vnode_t pvp, vnode_t vp,
         }
  
         /*
-        * VFC_VFSLOCALARGS is not currently supported for kernel mounts
+        * VFC_VFSLOCALARGS is not currently supported for kernel mounts,
+        * except in ROSV configs and for the initial BaseSystem root.
          */
-       if (kernelmount && (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS)) {
+       if (kernelmount && (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) &&
+           ((internal_flags & KERNEL_MOUNT_VOLBYROLE_MASK) == 0) &&
+           ((internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) == 0)) {
                 error = EINVAL;  /* unsupported request */
                 goto out1;
         }
@@ -741,9 +804,7 @@ mount_common(char *fstypename, vnode_t pvp, vnode_t vp,
         /*
          * Allocate and initialize the filesystem (mount_t)
          */
-       MALLOC_ZONE(mp, struct mount *, (u_int32_t)sizeof(struct mount),
-           M_MOUNT, M_WAITOK);
-       bzero((char *)mp, (u_int32_t)sizeof(struct mount));
+       mp = zalloc_flags(mount_zone, Z_WAITOK | Z_ZERO);
         mntalloc = 1;
  
         /* Initialize the default IO constraints */
@@ -759,6 +820,9 @@ mount_common(char *fstypename, vnode_t pvp, vnode_t vp,
         mp->mnt_realrootvp = NULLVP;
         mp->mnt_authcache_ttl = CACHED_LOOKUP_RIGHT_TTL;
  
+       mp->mnt_lflag |= MNT_LMOUNT;
+       did_set_lmount = TRUE;
+
         TAILQ_INIT(&mp->mnt_vnodelist);
         TAILQ_INIT(&mp->mnt_workerqueue);
         TAILQ_INIT(&mp->mnt_newvnodes);
@@ -770,23 +834,30 @@ mount_common(char *fstypename, vnode_t pvp, vnode_t vp,
         //mp->mnt_stat.f_type = vfsp->vfc_typenum;
         mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
         strlcpy(mp->mnt_vfsstat.f_fstypename, vfsp->vfc_name, MFSTYPENAMELEN);
-       strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);
+       do {
+               int pathlen = MAXPATHLEN;
+
+               if (vn_getpath_ext(vp, pvp, mp->mnt_vfsstat.f_mntonname, &pathlen, VN_GETPATH_FSENTER)) {
+                       strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);
+               }
+       } while (0);
         mp->mnt_vnodecovered = vp;
         mp->mnt_vfsstat.f_owner = kauth_cred_getuid(vfs_context_ucred(ctx));
         mp->mnt_throttle_mask = LOWPRI_MAX_NUM_DEV - 1;
         mp->mnt_devbsdunit = 0;
+       mp->mnt_mount_id = os_atomic_inc_orig(&mount_unique_id, relaxed);
  
         /* XXX 3762912 hack to support HFS filesystem 'owner' - filesystem may update later */
         vfs_setowner(mp, KAUTH_UID_NONE, KAUTH_GID_NONE);
  
-#if NFSCLIENT || DEVFS || ROUTEFS
+#if CONFIG_NFS_CLIENT || DEVFS || ROUTEFS
         if (kernelmount) {
                 mp->mnt_kern_flag |= MNTK_KERNEL_MOUNT;
         }
         if ((internal_flags & KERNEL_MOUNT_PERMIT_UNMOUNT) != 0) {
                 mp->mnt_kern_flag |= MNTK_PERMIT_UNMOUNT;
         }
-#endif /* NFSCLIENT || DEVFS */
+#endif /* CONFIG_NFS_CLIENT || DEVFS || ROUTEFS */
  
  update:
  
@@ -807,7 +878,7 @@ update:
         mp->mnt_flag &= ~(MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
             MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
             MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
-           MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME |
+           MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | MNT_STRICTATIME |
             MNT_QUARANTINE | MNT_CPROTECT);
  
  #if SECURE_KERNEL
@@ -824,7 +895,7 @@ update:
         mp->mnt_flag |= flags & (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
             MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
             MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
-           MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME |
+           MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | MNT_STRICTATIME |
             MNT_QUARANTINE | MNT_CPROTECT);
  
  #if CONFIG_MACF
@@ -837,30 +908,71 @@ update:
         }
  #endif
         /*
-        * Process device path for local file systems if requested
+        * Process device path for local file systems if requested.
+        *
+        * Snapshot and mount-by-role mounts do not use this path; they are
+        * passing other opaque data in the device path field.
+        *
+        * Basesystemroot mounts pass a device path to be resolved here,
+        * but it's just a char * already inside the kernel, which
+        * kernel_mount() shoved into a user_addr_t to call us. So for such
+        * mounts we must skip copyin (both of the address and of the string
+        * in NDINIT).
          */
         if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS &&
-           !(internal_flags & KERNEL_MOUNT_SNAPSHOT)) {
-               if (vfs_context_is64bit(ctx)) {
-                       if ((error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath)))) {
-                               goto out1;
-                       }
-                       fsmountargs += sizeof(devpath);
-               } else {
-                       user32_addr_t tmp;
-                       if ((error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp)))) {
-                               goto out1;
+           !(internal_flags & (KERNEL_MOUNT_SNAPSHOT | KERNEL_MOUNT_VOLBYROLE_MASK))) {
+               boolean_t do_copyin_devpath = true;
+#if CONFIG_BASESYSTEMROOT
+               if (internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) {
+                       // KERNEL_MOUNT_BASESYSTEMROOT implies subtle behavior worth noting:
+                       // We have been passed fsmountargs, which is typed as a user_addr_t,
+                       // but is actually a char ** pointing to a (kernelspace) string.
+                       // We manually unpack it with a series of casts and dereferences
+                       // that reverses what was done just above us on the stack in
+                       // imageboot_pivot_image().
+                       // After retrieving the path to the dev node (which we will NDINIT
+                       // in a moment), we pass NULL fsmountargs on to the filesystem.
+                       _Static_assert(sizeof(char **) == sizeof(fsmountargs), "fsmountargs should fit a (kernel) address");
+                       char **devnamepp = (char **)fsmountargs;
+                       char *devnamep = *devnamepp;
+                       devpath = CAST_USER_ADDR_T(devnamep);
+                       do_copyin_devpath = false;
+                       fsmountargs = USER_ADDR_NULL;
+
+                       //Now that we have a mp, denote that this mount is for the basesystem.
+                       mp->mnt_supl_kern_flag |= MNTK_SUPL_BASESYSTEM;
+               }
+#endif // CONFIG_BASESYSTEMROOT
+
+               if (do_copyin_devpath) {
+                       if (vfs_context_is64bit(ctx)) {
+                               if ((error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath)))) {
+                                       goto out1;
+                               }
+                               fsmountargs += sizeof(devpath);
+                       } else {
+                               user32_addr_t tmp;
+                               if ((error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp)))) {
+                                       goto out1;
+                               }
+                               /* munge into LP64 addr */
+                               devpath = CAST_USER_ADDR_T(tmp);
+                               fsmountargs += sizeof(tmp);
                         }
-                       /* munge into LP64 addr */
-                       devpath = CAST_USER_ADDR_T(tmp);
-                       fsmountargs += sizeof(tmp);
                 }
  
                 /* Lookup device and authorize access to it */
                 if ((devpath)) {
                         struct nameidata nd;
  
-                       NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW, UIO_USERSPACE, devpath, ctx);
+                       enum uio_seg seg = UIO_USERSPACE;
+#if CONFIG_BASESYSTEMROOT
+                       if (internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) {
+                               seg = UIO_SYSSPACE;
+                       }
+#endif // CONFIG_BASESYSTEMROOT
+
+                       NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW, seg, devpath, ctx);
                         if ((error = namei(&nd))) {
                                 goto out1;
                         }
@@ -969,7 +1081,8 @@ update:
                                 goto out2;
                         }
                 }
-       }
+       } // localargs && !(snapshot | data | vm)
+
  #if CONFIG_MACF
         if ((flags & MNT_UPDATE) == 0) {
                 mac_mount_label_init(mp);
@@ -985,11 +1098,116 @@ update:
         }
  #endif
         /*
-        * Mount the filesystem.
+        * Mount the filesystem.  We already asserted that internal_flags
+        * cannot have more than one mount-by-role bit set.
          */
         if (internal_flags & KERNEL_MOUNT_SNAPSHOT) {
                 error = VFS_IOCTL(mp, VFSIOC_MOUNT_SNAPSHOT,
                     (caddr_t)fsmountargs, 0, ctx);
+       } else if (internal_flags & KERNEL_MOUNT_DATAVOL) {
+#if CONFIG_ROSV_STARTUP
+               struct mount *origin_mp = (struct mount*)fsmountargs;
+               fs_role_mount_args_t frma = {origin_mp, VFS_DATA_ROLE};
+               error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
+               if (error) {
+                       printf("MOUNT-BY-ROLE (%d) failed! (%d)", VFS_DATA_ROLE, error);
+               } else {
+                       /* Mark volume associated with system volume */
+                       mp->mnt_kern_flag |= MNTK_SYSTEM;
+
+                       /* Attempt to acquire the mnt_devvp and set it up */
+                       struct vnode *mp_devvp = NULL;
+                       if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
+                               errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
+                                   0, &mp_devvp, vfs_context_kernel());
+                               if (!lerr) {
+                                       mp->mnt_devvp = mp_devvp;
+                                       //vnode_lookup took an iocount, need to drop it.
+                                       vnode_put(mp_devvp);
+                                       // now set `device_vnode` to the devvp that was acquired.
+                                       // this is needed in order to ensure vfs_init_io_attributes is invoked.
+                                       // note that though the iocount above was dropped, the mount acquires
+                                       // an implicit reference against the device.
+                                       device_vnode = mp_devvp;
+                               }
+                       }
+               }
+#else
+               error = EINVAL;
+#endif
+       } else if (internal_flags & KERNEL_MOUNT_VMVOL) {
+#if CONFIG_MOUNT_VM
+               struct mount *origin_mp = (struct mount*)fsmountargs;
+               fs_role_mount_args_t frma = {origin_mp, VFS_VM_ROLE};
+               error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
+               if (error) {
+                       printf("MOUNT-BY-ROLE (%d) failed! (%d)", VFS_VM_ROLE, error);
+               } else {
+                       /* Mark volume associated with system volume and a swap mount */
+                       mp->mnt_kern_flag |= (MNTK_SYSTEM | MNTK_SWAP_MOUNT);
+                       /* Attempt to acquire the mnt_devvp and set it up */
+                       struct vnode *mp_devvp = NULL;
+                       if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
+                               errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
+                                   0, &mp_devvp, vfs_context_kernel());
+                               if (!lerr) {
+                                       mp->mnt_devvp = mp_devvp;
+                                       //vnode_lookup took an iocount, need to drop it.
+                                       vnode_put(mp_devvp);
+
+                                       // now set `device_vnode` to the devvp that was acquired.
+                                       // note that though the iocount above was dropped, the mount acquires
+                                       // an implicit reference against the device.
+                                       device_vnode = mp_devvp;
+                               }
+                       }
+               }
+#else
+               error = EINVAL;
+#endif
+       } else if ((internal_flags & KERNEL_MOUNT_PREBOOTVOL) || (internal_flags & KERNEL_MOUNT_RECOVERYVOL)) {
+#if CONFIG_MOUNT_PREBOOTRECOVERY
+               struct mount *origin_mp = (struct mount*)fsmountargs;
+               uint32_t mount_role = 0;
+               if (internal_flags & KERNEL_MOUNT_PREBOOTVOL) {
+                       mount_role = VFS_PREBOOT_ROLE;
+               } else if (internal_flags & KERNEL_MOUNT_RECOVERYVOL) {
+                       mount_role = VFS_RECOVERY_ROLE;
+               }
+
+               if (mount_role != 0) {
+                       fs_role_mount_args_t frma = {origin_mp, mount_role};
+                       error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
+                       if (error) {
+                               printf("MOUNT-BY-ROLE (%d) failed! (%d)", mount_role, error);
+                       } else {
+                               // NOT YET - need to qualify how this interacts with shutdown, ERP/ERB, etc
+                               /* Mark volume associated with system volume */
+                               //mp->mnt_kern_flag |= MNTK_SYSTEM;
+                               /* Attempt to acquire the mnt_devvp and set it up */
+                               struct vnode *mp_devvp = NULL;
+                               if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
+                                       errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
+                                           0, &mp_devvp, vfs_context_kernel());
+                                       if (!lerr) {
+                                               mp->mnt_devvp = mp_devvp;
+                                               //vnode_lookup took an iocount, need to drop it.
+                                               vnode_put(mp_devvp);
+
+                                               // now set `device_vnode` to the devvp that was acquired.
+                                               // note that though the iocount above was dropped, the mount acquires
+                                               // an implicit reference against the device.
+                                               device_vnode = mp_devvp;
+                                       }
+                               }
+                       }
+               } else {
+                       printf("MOUNT-BY-ROLE (%d) failed - ROLE UNRECOGNIZED! (%d)", mount_role, error);
+                       error = EINVAL;
+               }
+#else
+               error = EINVAL;
+#endif
         } else {
                 error = VFS_MOUNT(mp, device_vnode, fsmountargs, ctx);
         }
@@ -1019,11 +1237,16 @@ update:
         if (error == 0) {
                 struct vfs_attr vfsattr;
  #if CONFIG_MACF
+               error = mac_mount_check_mount_late(ctx, mp);
+               if (error != 0) {
+                       goto out4;
+               }
+
                 if (vfs_flags(mp) & MNT_MULTILABEL) {
                         error = VFS_ROOT(mp, &rvp, ctx);
                         if (error) {
                                 printf("%s() VFS_ROOT returned %d\n", __func__, error);
-                               goto out3;
+                               goto out4;
                         }
                         error = vnode_label(mp, NULL, rvp, NULL, 0, ctx);
                         /*
@@ -1032,7 +1255,7 @@ update:
                         vnode_put(rvp);
  
                         if (error) {
-                               goto out3;
+                               goto out4;
                         }
                 }
  #endif  /* MAC */
@@ -1170,7 +1393,8 @@ update:
  #if CONFIG_MACF
                 mac_mount_label_destroy(mp);
  #endif
-               FREE_ZONE(mp, sizeof(struct mount), M_MOUNT);
+               zfree(mount_zone, mp);
+               did_set_lmount = false;
         }
  exit:
         /*
@@ -1180,6 +1404,12 @@ exit:
                 vnode_put(devvp);
         }
  
+       if (did_set_lmount) {
+               mount_lock_spin(mp);
+               mp->mnt_lflag &= ~MNT_LMOUNT;
+               mount_unlock(mp);
+       }
+
         return error;
  
  /* Error condition exits */
@@ -1223,9 +1453,18 @@ out2:
  out1:
         /* Release mnt_rwlock only when it was taken */
         if (is_rwlock_locked == TRUE) {
+               if (flag_set) {
+                       mp->mnt_flag = flag;  /* restore mnt_flag value */
+               }
                 lck_rw_done(&mp->mnt_rwlock);
         }
  
+       if (did_set_lmount) {
+               mount_lock_spin(mp);
+               mp->mnt_lflag &= ~MNT_LMOUNT;
+               mount_unlock(mp);
+       }
+
         if (mntalloc) {
                 if (mp->mnt_crossref) {
                         mount_dropcrossref(mp, vp, 0);
@@ -1234,7 +1473,7 @@ out1:
  #if CONFIG_MACF
                         mac_mount_label_destroy(mp);
  #endif
-                       FREE_ZONE(mp, sizeof(struct mount), M_MOUNT);
+                       zfree(mount_zone, mp);
                 }
         }
         if (vfsp_ref) {
@@ -1310,8 +1549,10 @@ out:
  
  #if CONFIG_IMGSRC_ACCESS
  
-#if DEBUG
-#define IMGSRC_DEBUG(args...) printf(args)
+#define DEBUG_IMGSRC 0
+
+#if DEBUG_IMGSRC
+#define IMGSRC_DEBUG(args...) printf("imgsrc: " args)
  #else
  #define IMGSRC_DEBUG(args...) do { } while(0)
  #endif
@@ -1323,8 +1564,13 @@ authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_
         vnode_t vp, realdevvp;
         mode_t accessmode;
         int error;
+       enum uio_seg uio = UIO_USERSPACE;
+
+       if (ctx == vfs_context_kernel()) {
+               uio = UIO_SYSSPACE;
+       }
  
-       NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW, UIO_USERSPACE, devpath, ctx);
+       NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW, uio, devpath, ctx);
         if ((error = namei(&nd))) {
                 IMGSRC_DEBUG("namei() failed with %d\n", error);
                 return error;
@@ -1378,8 +1624,10 @@ authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_
  
  out1:
         vnode_put(realdevvp);
+
  out:
         nameidone(&nd);
+
         if (error) {
                 vnode_put(vp);
         }
@@ -1398,6 +1646,9 @@ place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx)
  
         mp->mnt_vnodecovered = vp; /* XXX This is normally only set at init-time ... */
  
+       IMGSRC_DEBUG("placing: fsname = %s, vp = %s\n",
+           mp->mnt_vtable->vfc_name, vnode_getname(vp));
+
         vnode_lock_spin(vp);
         CLR(vp->v_flag, VMOUNT);
         vp->v_mountedhere = mp;
@@ -1452,7 +1703,7 @@ mount_begin_update(mount_t mp, vfs_context_t ctx, int flags)
  
         /* unmount in progress return error */
         mount_lock_spin(mp);
-       if (mp->mnt_lflag & MNT_LUNMOUNT) {
+       if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
                 mount_unlock(mp);
                 return EBUSY;
         }
@@ -1518,18 +1769,18 @@ get_imgsrc_rootvnode(uint32_t height, vnode_t *rvpp)
  }
  
  static int
-relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp,
-    const char *fsname, vfs_context_t ctx,
+relocate_imageboot_source(vnode_t pvp, vnode_t vp,
+    struct componentname *cnp, const char *fsname, vfs_context_t ctx,
      boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index)
  {
         int error;
         mount_t mp;
         boolean_t placed = FALSE;
-       vnode_t devvp = NULLVP;
         struct vfstable *vfsp;
         user_addr_t devpath;
         char *old_mntonname;
         vnode_t rvp;
+       vnode_t devvp;
         uint32_t height;
         uint32_t flags;
  
@@ -1559,7 +1810,7 @@ relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp,
  
                         height = mia64.mi_height;
                         flags = mia64.mi_flags;
-                       devpath = mia64.mi_devpath;
+                       devpath = (user_addr_t)mia64.mi_devpath;
                 } else {
                         struct user32_mnt_imgsrc_args mia32;
                         error = copyin(fsmountargs, &mia32, sizeof(mia32));
@@ -1601,13 +1852,13 @@ relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp,
  
         error = get_imgsrc_rootvnode(height, &rvp);
         if (error != 0) {
-               IMGSRC_DEBUG("getting root vnode failed with %d\n", error);
+               IMGSRC_DEBUG("getting old root vnode failed with %d\n", error);
                 return error;
         }
  
-       IMGSRC_DEBUG("got root vnode.\n");
+       IMGSRC_DEBUG("got old root vnode\n");
  
-       MALLOC(old_mntonname, char*, MAXPATHLEN, M_TEMP, M_WAITOK);
+       old_mntonname = zalloc_flags(ZV_NAMEI, Z_WAITOK);
  
         /* Can only move once */
         mp = vnode_mount(rvp);
@@ -1617,6 +1868,7 @@ relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp,
                 goto out0;
         }
  
+       IMGSRC_DEBUG("moving rvp: fsname = %s\n", mp->mnt_vtable->vfc_name);
         IMGSRC_DEBUG("Starting updated.\n");
  
         /* Get exclusive rwlock on mount, authorize update on mp */
@@ -1635,7 +1887,6 @@ relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp,
                 goto out1;
         }
  
-
         IMGSRC_DEBUG("Preparing coveredvp.\n");
  
         /* Mark covered vnode as mount in progress, authorize placing mount on top */
@@ -1650,7 +1901,8 @@ relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp,
         /* Sanity check the name caller has provided */
         vfsp = mp->mnt_vtable;
         if (strncmp(vfsp->vfc_name, fsname, MFSNAMELEN) != 0) {
-               IMGSRC_DEBUG("Wrong fs name.\n");
+               IMGSRC_DEBUG("Wrong fs name: actual = %s, expected = %s\n",
+                   vfsp->vfc_name, fsname);
                 error = EINVAL;
                 goto out2;
         }
@@ -1703,7 +1955,7 @@ relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp,
  
         mount_end_update(mp);
         vnode_put(rvp);
-       FREE(old_mntonname, M_TEMP);
+       zfree(ZV_NAMEI, old_mntonname);
  
         vfs_notify_mount(pvp);
  
@@ -1733,10 +1985,63 @@ out1:
  
  out0:
         vnode_put(rvp);
-       FREE(old_mntonname, M_TEMP);
+       zfree(ZV_NAMEI, old_mntonname);
         return error;
  }
  
+#if CONFIG_LOCKERBOOT
+__private_extern__
+int
+mount_locker_protoboot(const char *fsname, const char *mntpoint,
+    const char *pbdevpath)
+{
+       int error = -1;
+       struct nameidata nd;
+       boolean_t cleanup_nd = FALSE;
+       vfs_context_t ctx = vfs_context_kernel();
+       boolean_t is64 = TRUE;
+       boolean_t by_index = TRUE;
+       struct user64_mnt_imgsrc_args mia64 = {
+               .mi_height = 0,
+               .mi_flags = 0,
+               .mi_devpath = CAST_USER_ADDR_T(pbdevpath),
+       };
+       user_addr_t mia64addr = CAST_USER_ADDR_T(&mia64);
+
+       NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
+           UIO_SYSSPACE, CAST_USER_ADDR_T(mntpoint), ctx);
+       error = namei(&nd);
+       if (error) {
+               IMGSRC_DEBUG("namei: %d\n", error);
+               goto out;
+       }
+
+       cleanup_nd = TRUE;
+       error = relocate_imageboot_source(nd.ni_dvp, nd.ni_vp,
+           &nd.ni_cnd, fsname, ctx, is64, mia64addr, by_index);
+
+out:
+       if (cleanup_nd) {
+               int stashed = error;
+
+               error = vnode_put(nd.ni_vp);
+               if (error) {
+                       panic("vnode_put() returned non-zero: %d", error);
+               }
+
+               if (nd.ni_dvp) {
+                       error = vnode_put(nd.ni_dvp);
+                       if (error) {
+                               panic("vnode_put() returned non-zero: %d", error);
+                       }
+               }
+               nameidone(&nd);
+
+               error = stashed;
+       }
+       return error;
+}
+#endif /* CONFIG_LOCKERBOOT */
  #endif /* CONFIG_IMGSRC_ACCESS */
  
  void
@@ -1781,11 +2086,10 @@ checkdirs_callback(proc_t p, void * arg)
         vnode_t olddp = cdrp->olddp;
         vnode_t newdp = cdrp->newdp;
         struct filedesc *fdp;
-       vnode_t tvp;
-       vnode_t fdp_cvp;
-       vnode_t fdp_rvp;
-       int cdir_changed = 0;
-       int rdir_changed = 0;
+       vnode_t new_cvp = newdp;
+       vnode_t new_rvp = newdp;
+       vnode_t old_cvp = NULL;
+       vnode_t old_rvp = NULL;
  
         /*
          * XXX Also needs to iterate each thread in the process to see if it
@@ -1793,36 +2097,70 @@ checkdirs_callback(proc_t p, void * arg)
          * XXX update that as well.
          */
  
+       /*
+        * First, with the proc_fdlock held, check to see if we will need
+        * to do any work.  If not, we will get out fast.
+        */
         proc_fdlock(p);
         fdp = p->p_fd;
-       if (fdp == (struct filedesc *)0) {
+       if (fdp == NULL ||
+           (fdp->fd_cdir != olddp && fdp->fd_rdir != olddp)) {
                 proc_fdunlock(p);
                 return PROC_RETURNED;
         }
-       fdp_cvp = fdp->fd_cdir;
-       fdp_rvp = fdp->fd_rdir;
         proc_fdunlock(p);
  
-       if (fdp_cvp == olddp) {
-               vnode_ref(newdp);
-               tvp = fdp->fd_cdir;
-               fdp_cvp = newdp;
-               cdir_changed = 1;
-               vnode_rele(tvp);
+       /*
+        * Ok, we will have to do some work.  Always take two refs
+        * because we might need that many.  We'll dispose of whatever
+        * we ended up not using.
+        */
+       if (vnode_ref(newdp) != 0) {
+               return PROC_RETURNED;
         }
-       if (fdp_rvp == olddp) {
-               vnode_ref(newdp);
-               tvp = fdp->fd_rdir;
-               fdp_rvp = newdp;
-               rdir_changed = 1;
-               vnode_rele(tvp);
+       if (vnode_ref(newdp) != 0) {
+               vnode_rele(newdp);
+               return PROC_RETURNED;
         }
-       if (cdir_changed || rdir_changed) {
-               proc_fdlock(p);
-               fdp->fd_cdir = fdp_cvp;
-               fdp->fd_rdir = fdp_rvp;
-               proc_fdunlock(p);
+
+       proc_dirs_lock_exclusive(p);
+       /*
+        * Now do the work.  Note: we dropped the proc_fdlock, so we
+        * have to do all of the checks again.
+        */
+       proc_fdlock(p);
+       fdp = p->p_fd;
+       if (fdp != NULL) {
+               if (fdp->fd_cdir == olddp) {
+                       old_cvp = olddp;
+                       fdp->fd_cdir = newdp;
+                       new_cvp = NULL;
+               }
+               if (fdp->fd_rdir == olddp) {
+                       old_rvp = olddp;
+                       fdp->fd_rdir = newdp;
+                       new_rvp = NULL;
+               }
+       }
+       proc_fdunlock(p);
+       proc_dirs_unlock_exclusive(p);
+
+       /*
+        * Dispose of any references that are no longer needed.
+        */
+       if (old_cvp != NULL) {
+               vnode_rele(old_cvp);
         }
+       if (old_rvp != NULL) {
+               vnode_rele(old_rvp);
+       }
+       if (new_cvp != NULL) {
+               vnode_rele(new_cvp);
+       }
+       if (new_rvp != NULL) {
+               vnode_rele(new_rvp);
+       }
+
         return PROC_RETURNED;
  }
  
@@ -1860,8 +2198,10 @@ checkdirs(vnode_t olddp, vfs_context_t ctx)
  
         if (rootvnode == olddp) {
                 vnode_ref(newdp);
+               lck_rw_lock_exclusive(rootvnode_rw_lock);
                 tvp = rootvnode;
                 rootvnode = newdp;
+               lck_rw_unlock_exclusive(rootvnode_rw_lock);
                 vnode_rele(tvp);
         }
  
@@ -1930,6 +2270,8 @@ vfs_unmountbyfsid(fsid_t *fsid, int flags, vfs_context_t ctx)
         return safedounmount(mp, flags, ctx);
  }
  
+#define ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT        \
+       "com.apple.private.vfs.role-account-unmount"
  
  /*
   * The mount struct comes with a mount ref which will be consumed.
@@ -1952,10 +2294,15 @@ safedounmount(struct mount *mp, int flags, vfs_context_t ctx)
         }
  
         /*
-        * Skip authorization if the mount is tagged as permissive and
-        * this is not a forced-unmount attempt.
+        * Skip authorization in two cases:
+        * - If the process running the unmount has ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT.
+        *   This entitlement allows non-root processes to unmount volumes
+        *   mounted by other processes.
+        * - If the mount is tagged as permissive and this is not a forced-unmount
+        *   attempt.
          */
-       if (!(((mp->mnt_kern_flag & MNTK_PERMIT_UNMOUNT) != 0) && ((flags & MNT_FORCE) == 0))) {
+       if (!IOTaskHasEntitlement(current_task(), ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT) &&
+           (!(((mp->mnt_kern_flag & MNTK_PERMIT_UNMOUNT) != 0) && ((flags & MNT_FORCE) == 0)))) {
                 /*
                  * Only root, or the user that did the original mount is
                  * permitted to unmount this filesystem.
@@ -1966,19 +2313,22 @@ safedounmount(struct mount *mp, int flags, vfs_context_t ctx)
                 }
         }
         /*
-        * Don't allow unmounting the root file system.
+        * Don't allow unmounting the root file system, or other volumes
+        * associated with it (for example, the associated VM or DATA mounts).
          */
-       if (mp->mnt_flag & MNT_ROOTFS) {
-               error = EBUSY; /* the root is always busy */
+       if ((mp->mnt_flag & MNT_ROOTFS) || (mp->mnt_kern_flag & MNTK_SYSTEM)) {
+               error = EBUSY; /* the root (or associated volumes) is always busy */
                 goto out;
         }
  
-#ifdef CONFIG_IMGSRC_ACCESS
+       /*
+        * If the mount is providing the root filesystem's disk image
+        * (i.e. imageboot), don't allow unmounting
+        */
         if (mp->mnt_kern_flag & MNTK_BACKS_ROOT) {
                 error = EBUSY;
                 goto out;
         }
-#endif /* CONFIG_IMGSRC_ACCESS */
  
         return dounmount(mp, flags, 1, ctx);
  
@@ -2017,7 +2367,7 @@ dounmount(struct mount *mp, int flags, int withref, vfs_context_t ctx)
          * If already an unmount in progress just return EBUSY.
          * Even a forced unmount cannot override.
          */
-       if (mp->mnt_lflag & MNT_LUNMOUNT) {
+       if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
                 if (withref != 0) {
                         mount_drop(mp, 1);
                 }
@@ -2089,9 +2439,6 @@ dounmount(struct mount *mp, int flags, int withref, vfs_context_t ctx)
                 }
         }
  
-       /* free disk_conditioner_info structure for this mount */
-       disk_conditioner_unmount(mp);
-
         IOBSDMountChange(mp, kIOMountChangeUnmount);
  
  #if CONFIG_TRIGGERS
@@ -2183,6 +2530,10 @@ dounmount(struct mount *mp, int flags, int withref, vfs_context_t ctx)
                 wakeup((caddr_t)mp);
         }
         mount_refdrain(mp);
+
+       /* free disk_conditioner_info structure for this mount */
+       disk_conditioner_unmount(mp);
+
  out:
         if (mp->mnt_lflag & MNT_LWAIT) {
                 mp->mnt_lflag &= ~MNT_LWAIT;
@@ -2266,7 +2617,7 @@ out:
  #if CONFIG_MACF
                         mac_mount_label_destroy(mp);
  #endif
-                       FREE_ZONE(mp, sizeof(struct mount), M_MOUNT);
+                       zfree(mount_zone, mp);
                 } else {
                         panic("dounmount: no coveredvp");
                 }
@@ -2292,7 +2643,7 @@ dounmount_submounts(struct mount *mp, int flags, vfs_context_t ctx)
         TAILQ_FOREACH(smp, &mountlist, mnt_list)
         count++;
         fsids_sz = count * sizeof(fsid_t);
-       MALLOC(fsids, fsid_t *, fsids_sz, M_TEMP, M_NOWAIT);
+       fsids = kheap_alloc(KHEAP_TEMP, fsids_sz, Z_NOWAIT);
         if (fsids == NULL) {
                 mount_list_unlock();
                 goto out;
@@ -2332,9 +2683,7 @@ dounmount_submounts(struct mount *mp, int flags, vfs_context_t ctx)
                 }
         }
  out:
-       if (fsids) {
-               FREE(fsids, M_TEMP);
-       }
+       kheap_free(KHEAP_TEMP, fsids, fsids_sz);
  }
  
  void
@@ -2357,7 +2706,7 @@ mount_dropcrossref(mount_t mp, vnode_t dp, int need_put)
  #if CONFIG_MACF
                 mac_mount_label_destroy(mp);
  #endif
-               FREE_ZONE(mp, sizeof(struct mount), M_MOUNT);
+               zfree(mount_zone, mp);
                 return;
         }
         if (need_put) {
@@ -2376,14 +2725,44 @@ int syncprt = 0;
  
  int print_vmpage_stat = 0;
  
+/*
+ * sync_callback:      simple wrapper that calls VFS_SYNC() on volumes
+ *                     mounted read-write with the passed waitfor value.
+ *
+ * Parameters: mp      mount-point descriptor per mounted file-system instance.
+ *             arg     user argument (please see below)
+ *
+ * User argument is a pointer to a 32-bit unsigned integer which describes the
+ * type of waitfor value to set for calling VFS_SYNC().  If user argument is
+ * passed as NULL, VFS_SYNC() is called with MNT_NOWAIT set as the default
+ * waitfor value.
+ *
+ * Returns:            VFS_RETURNED
+ */
  static int
-sync_callback(mount_t mp, __unused void *arg)
+sync_callback(mount_t mp, void *arg)
  {
         if ((mp->mnt_flag & MNT_RDONLY) == 0) {
                 int asyncflag = mp->mnt_flag & MNT_ASYNC;
+               unsigned waitfor = MNT_NOWAIT;
+
+               if (arg) {
+                       waitfor = *(uint32_t*)arg;
+               }
+
+               /* Sanity check for flags - these are the only valid combinations for the flag bits */
+               if (waitfor != MNT_WAIT &&
+                   waitfor != (MNT_WAIT | MNT_VOLUME) &&
+                   waitfor != MNT_NOWAIT &&
+                   waitfor != (MNT_NOWAIT | MNT_VOLUME) &&
+                   waitfor != MNT_DWAIT &&
+                   waitfor != (MNT_DWAIT | MNT_VOLUME)) {
+                       panic("Passed inappropriate waitfor %u to "
+                           "sync_callback()", waitfor);
+               }
  
                 mp->mnt_flag &= ~MNT_ASYNC;
-               VFS_SYNC(mp, arg ? MNT_WAIT : MNT_NOWAIT, vfs_context_kernel());
+               (void)VFS_SYNC(mp, waitfor, vfs_context_kernel());
                 if (asyncflag) {
                         mp->mnt_flag |= MNT_ASYNC;
                 }
@@ -2426,7 +2805,7 @@ sync_internal_callback(mount_t mp, void *arg)
  
                 if ((sync_type == SYNC_ONLY_RELIABLE_MEDIA) && !is_reliable) {
                         return VFS_RETURNED;
-               } else if ((sync_type = SYNC_ONLY_UNRELIABLE_MEDIA) && is_reliable) {
+               } else if ((sync_type == SYNC_ONLY_UNRELIABLE_MEDIA) && is_reliable) {
                         return VFS_RETURNED;
                 }
         }
@@ -2442,10 +2821,17 @@ int sync_timeout_seconds = 5;
  #define SYNC_THREAD_RUN       0x0001
  #define SYNC_THREAD_RUNNING   0x0002
  
+#if CONFIG_PHYS_WRITE_ACCT
+thread_t pm_sync_thread;
+#endif /* CONFIG_PHYS_WRITE_ACCT */
+
  static void
  sync_thread(__unused void *arg, __unused wait_result_t wr)
  {
         sync_type_t sync_type;
+#if CONFIG_PHYS_WRITE_ACCT
+       pm_sync_thread = current_thread();
+#endif /* CONFIG_PHYS_WRITE_ACCT */
  
         lck_mtx_lock(sync_mtx_lck);
         while (sync_thread_state & SYNC_THREAD_RUN) {
@@ -2467,6 +2853,9 @@ sync_thread(__unused void *arg, __unused wait_result_t wr)
          */
         wakeup(&sync_thread_state);
         sync_thread_state &= ~SYNC_THREAD_RUNNING;
+#if CONFIG_PHYS_WRITE_ACCT
+       pm_sync_thread = NULL;
+#endif /* CONFIG_PHYS_WRITE_ACCT */
         lck_mtx_unlock(sync_mtx_lck);
  
         if (print_vmpage_stat) {
@@ -2480,7 +2869,7 @@ sync_thread(__unused void *arg, __unused wait_result_t wr)
  #endif /* DIAGNOSTIC */
  }
  
-struct timeval sync_timeout_last_print = {0, 0};
+struct timeval sync_timeout_last_print = {.tv_sec = 0, .tv_usec = 0};
  
  /*
   * An in-kernel sync for power management to call.
@@ -2492,7 +2881,7 @@ sync_internal(void)
         thread_t thd;
         int error;
         int thread_created = FALSE;
-       struct timespec ts = {sync_timeout_seconds, 0};
+       struct timespec ts = {.tv_sec = sync_timeout_seconds, .tv_nsec = 0};
  
         lck_mtx_lock(sync_mtx_lck);
         sync_thread_state |= SYNC_THREAD_RUN;
@@ -2553,6 +2942,7 @@ quotactl(proc_t p, struct quotactl_args *uap, __unused int32_t *retval)
                 return error;
         }
         mp = nd.ni_vp->v_mount;
+       mount_ref(mp, 0);
         vnode_put(nd.ni_vp);
         nameidone(&nd);
  
@@ -2563,7 +2953,7 @@ quotactl(proc_t p, struct quotactl_args *uap, __unused int32_t *retval)
         case Q_QUOTAON:
                 /* uap->arg specifies a file from which to take the quotas */
                 fnamelen = MAXPATHLEN;
-               datap = kalloc(MAXPATHLEN);
+               datap = zalloc(ZV_NAMEI);
                 error = copyinstr(uap->arg, datap, MAXPATHLEN, &fnamelen);
                 break;
         case Q_GETQUOTA:
@@ -2600,7 +2990,7 @@ quotactl(proc_t p, struct quotactl_args *uap, __unused int32_t *retval)
         switch (quota_cmd) {
         case Q_QUOTAON:
                 if (datap != NULL) {
-                       kfree(datap, MAXPATHLEN);
+                       zfree(ZV_NAMEI, datap);
                 }
                 break;
         case Q_GETQUOTA:
@@ -2627,6 +3017,7 @@ quotactl(proc_t p, struct quotactl_args *uap, __unused int32_t *retval)
                 break;
         } /* switch */
  
+       mount_drop(mp, 0);
         return error;
  }
  #else
@@ -2670,6 +3061,7 @@ statfs(__unused proc_t p, struct statfs_args *uap, __unused int32_t *retval)
  #if CONFIG_MACF
         error = mac_mount_check_stat(ctx, mp);
         if (error != 0) {
+               vnode_put(vp);
                 return error;
         }
  #endif
@@ -2738,40 +3130,33 @@ out:
         return error;
  }
  
-/*
- * Common routine to handle copying of statfs64 data to user space
- */
-static int
-statfs64_common(struct mount *mp, struct vfsstatfs *sfsp, user_addr_t bufp)
-{
-       int error;
-       struct statfs64 sfs;
-
-       bzero(&sfs, sizeof(sfs));
-
-       sfs.f_bsize = sfsp->f_bsize;
-       sfs.f_iosize = (int32_t)sfsp->f_iosize;
-       sfs.f_blocks = sfsp->f_blocks;
-       sfs.f_bfree = sfsp->f_bfree;
-       sfs.f_bavail = sfsp->f_bavail;
-       sfs.f_files = sfsp->f_files;
-       sfs.f_ffree = sfsp->f_ffree;
-       sfs.f_fsid = sfsp->f_fsid;
-       sfs.f_owner = sfsp->f_owner;
-       sfs.f_type = mp->mnt_vtable->vfc_typenum;
-       sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
-       sfs.f_fssubtype = sfsp->f_fssubtype;
+void
+vfs_get_statfs64(struct mount *mp, struct statfs64 *sfs)
+{
+       struct vfsstatfs *vsfs = &mp->mnt_vfsstat;
+
+       bzero(sfs, sizeof(*sfs));
+
+       sfs->f_bsize = vsfs->f_bsize;
+       sfs->f_iosize = (int32_t)vsfs->f_iosize;
+       sfs->f_blocks = vsfs->f_blocks;
+       sfs->f_bfree = vsfs->f_bfree;
+       sfs->f_bavail = vsfs->f_bavail;
+       sfs->f_files = vsfs->f_files;
+       sfs->f_ffree = vsfs->f_ffree;
+       sfs->f_fsid = vsfs->f_fsid;
+       sfs->f_owner = vsfs->f_owner;
+       sfs->f_type = mp->mnt_vtable->vfc_typenum;
+       sfs->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
+       sfs->f_fssubtype = vsfs->f_fssubtype;
+       sfs->f_flags_ext = (mp->mnt_kern_flag & MNTK_SYSTEMDATA) ? MNT_EXT_ROOT_DATA_VOL : 0;
         if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
-               strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSTYPENAMELEN);
+               strlcpy(&sfs->f_fstypename[0], &mp->fstypename_override[0], MFSTYPENAMELEN);
         } else {
-               strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSTYPENAMELEN);
+               strlcpy(&sfs->f_fstypename[0], &vsfs->f_fstypename[0], MFSTYPENAMELEN);
         }
-       strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MAXPATHLEN);
-       strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MAXPATHLEN);
-
-       error = copyout((caddr_t)&sfs, bufp, sizeof(sfs));
-
-       return error;
+       strlcpy(&sfs->f_mntonname[0], &vsfs->f_mntonname[0], MAXPATHLEN);
+       strlcpy(&sfs->f_mntfromname[0], &vsfs->f_mntfromname[0], MAXPATHLEN);
  }
  
  /*
@@ -2781,39 +3166,57 @@ int
  statfs64(__unused struct proc *p, struct statfs64_args *uap, __unused int32_t *retval)
  {
         struct mount *mp;
-       struct vfsstatfs *sp;
         int error;
-       struct nameidata nd;
+       struct nameidata *ndp;
+       struct statfs64 *sfsp;
         vfs_context_t ctxp = vfs_context_current();
         vnode_t vp;
+       union {
+               struct nameidata nd;
+               struct statfs64 sfs;
+       } *__nameidata_statfs64;
  
-       NDINIT(&nd, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
+       __nameidata_statfs64 = kheap_alloc(KHEAP_TEMP, sizeof(*__nameidata_statfs64),
+           Z_WAITOK);
+       ndp = &__nameidata_statfs64->nd;
+
+       NDINIT(ndp, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
             UIO_USERSPACE, uap->path, ctxp);
-       error = namei(&nd);
+       error = namei(ndp);
         if (error != 0) {
-               return error;
+               goto out;
         }
-       vp = nd.ni_vp;
+       vp = ndp->ni_vp;
         mp = vp->v_mount;
-       sp = &mp->mnt_vfsstat;
-       nameidone(&nd);
+       nameidone(ndp);
  
  #if CONFIG_MACF
         error = mac_mount_check_stat(ctxp, mp);
         if (error != 0) {
-               return error;
+               vnode_put(vp);
+               goto out;
         }
  #endif
  
         error = vfs_update_vfsstat(mp, ctxp, VFS_USER_EVENT);
         if (error != 0) {
                 vnode_put(vp);
-               return error;
+               goto out;
         }
  
-       error = statfs64_common(mp, sp, uap->buf);
+       sfsp = &__nameidata_statfs64->sfs;
+       vfs_get_statfs64(mp, sfsp);
+       if ((mp->mnt_kern_flag & MNTK_SYSTEMDATA) &&
+           (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)) {
+               /* This process does not want to see a separate data volume mountpoint */
+               strlcpy(&sfsp->f_mntonname[0], "/", sizeof("/"));
+       }
+       error = copyout(sfsp, uap->buf, sizeof(*sfsp));
         vnode_put(vp);
  
+out:
+       kheap_free(KHEAP_TEMP, __nameidata_statfs64, sizeof(*__nameidata_statfs64));
+
         return error;
  }
  
@@ -2825,7 +3228,7 @@ fstatfs64(__unused struct proc *p, struct fstatfs64_args *uap, __unused int32_t
  {
         struct vnode *vp;
         struct mount *mp;
-       struct vfsstatfs *sp;
+       struct statfs64 sfs;
         int error;
  
         AUDIT_ARG(fd, uap->fd);
@@ -2855,12 +3258,17 @@ fstatfs64(__unused struct proc *p, struct fstatfs64_args *uap, __unused int32_t
         }
  #endif
  
-       sp = &mp->mnt_vfsstat;
         if ((error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)) != 0) {
                 goto out;
         }
  
-       error = statfs64_common(mp, sp, uap->buf);
+       vfs_get_statfs64(mp, &sfs);
+       if ((mp->mnt_kern_flag & MNTK_SYSTEMDATA) &&
+           (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)) {
+               /* This process does not want to see a separate data volume mountpoint */
+               strlcpy(&sfs.f_mntonname[0], "/", sizeof("/"));
+       }
+       error = copyout(&sfs, uap->buf, sizeof(sfs));
  
  out:
         file_drop(uap->fd);
@@ -2900,9 +3308,10 @@ getfsstat_callback(mount_t mp, void * arg)
                  * If MNT_NOWAIT is specified, do not refresh the
                  * fsstat cache. MNT_WAIT/MNT_DWAIT overrides MNT_NOWAIT.
                  */
-               if (((fstp->flags & MNT_NOWAIT) == 0 || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
-                   (error = vfs_update_vfsstat(mp, ctx,
-                   VFS_USER_EVENT))) {
+               if ((mp->mnt_lflag & MNT_LDEAD) ||
+                   (((fstp->flags & MNT_NOWAIT) == 0 || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
+                   (!(mp->mnt_lflag & MNT_LUNMOUNT)) &&
+                   (error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT)))) {
                         KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
                         return VFS_RETURNED;
                 }
@@ -2975,6 +3384,10 @@ __mac_getfsstat(__unused proc_t p, struct __mac_getfsstat_args *uap, int *retval
         size_t count, maxcount, bufsize, macsize;
         struct getfsstat_struct fst;
  
+       if ((unsigned)uap->bufsize > INT_MAX || (unsigned)uap->macsize > INT_MAX) {
+               return EINVAL;
+       }
+
         bufsize = (size_t) uap->bufsize;
         macsize = (size_t) uap->macsize;
  
@@ -3000,21 +3413,21 @@ __mac_getfsstat(__unused proc_t p, struct __mac_getfsstat_args *uap, int *retval
                 }
  
                 /* Copy in the array */
-               MALLOC(mp0, u_int32_t *, macsize, M_MACTEMP, M_WAITOK);
+               mp0 = kheap_alloc(KHEAP_TEMP, macsize, Z_WAITOK);
                 if (mp0 == NULL) {
                         return ENOMEM;
                 }
  
                 error = copyin(uap->mac, mp0, macsize);
                 if (error) {
-                       FREE(mp0, M_MACTEMP);
+                       kheap_free(KHEAP_TEMP, mp0, macsize);
                         return error;
                 }
  
                 /* Normalize to an array of user_addr_t */
-               MALLOC(mp, user_addr_t *, count * sizeof(user_addr_t), M_MACTEMP, M_WAITOK);
+               mp = kheap_alloc(KHEAP_TEMP, count * sizeof(user_addr_t), Z_WAITOK);
                 if (mp == NULL) {
-                       FREE(mp0, M_MACTEMP);
+                       kheap_free(KHEAP_TEMP, mp0, macsize);
                         return ENOMEM;
                 }
  
@@ -3025,7 +3438,7 @@ __mac_getfsstat(__unused proc_t p, struct __mac_getfsstat_args *uap, int *retval
                                 mp[i] = (user_addr_t)mp0[i];
                         }
                 }
-               FREE(mp0, M_MACTEMP);
+               kheap_free(KHEAP_TEMP, mp0, macsize);
         }
  #endif
  
@@ -3035,13 +3448,13 @@ __mac_getfsstat(__unused proc_t p, struct __mac_getfsstat_args *uap, int *retval
         fst.flags = uap->flags;
         fst.count = 0;
         fst.error = 0;
-       fst.maxcount = maxcount;
+       fst.maxcount = (int)maxcount;
  
  
-       vfs_iterate(0, getfsstat_callback, &fst);
+       vfs_iterate(VFS_ITERATE_NOSKIP_UNMOUNT, getfsstat_callback, &fst);
  
         if (mp) {
-               FREE(mp, M_MACTEMP);
+               kheap_free(KHEAP_TEMP, mp, count * sizeof(user_addr_t));
         }
  
         if (fst.error) {
@@ -3062,6 +3475,7 @@ getfsstat64_callback(mount_t mp, void * arg)
  {
         struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
         struct vfsstatfs *sp;
+       struct statfs64 sfs;
         int error;
  
         if (fstp->sfsp && fstp->count < fstp->maxcount) {
@@ -3081,19 +3495,21 @@ getfsstat64_callback(mount_t mp, void * arg)
                  * getfsstat, since the constants are out of the same
                  * namespace.
                  */
-               if (((fstp->flags & MNT_NOWAIT) == 0 ||
-                   (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
-                   (error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT))) {
+               if ((mp->mnt_lflag & MNT_LDEAD) ||
+                   ((((fstp->flags & MNT_NOWAIT) == 0) || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
+                   (!(mp->mnt_lflag & MNT_LUNMOUNT)) &&
+                   (error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)))) {
                         KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
                         return VFS_RETURNED;
                 }
  
-               error = statfs64_common(mp, sp, fstp->sfsp);
+               vfs_get_statfs64(mp, &sfs);
+               error = copyout(&sfs, fstp->sfsp, sizeof(sfs));
                 if (error) {
                         fstp->error = error;
                         return VFS_RETURNED_DONE;
                 }
-               fstp->sfsp += sizeof(struct statfs64);
+               fstp->sfsp += sizeof(sfs);
         }
         fstp->count++;
         return VFS_RETURNED;
@@ -3120,7 +3536,7 @@ getfsstat64(__unused proc_t p, struct getfsstat64_args *uap, int *retval)
         fst.error = 0;
         fst.maxcount = maxcount;
  
-       vfs_iterate(0, getfsstat64_callback, &fst);
+       vfs_iterate(VFS_ITERATE_NOSKIP_UNMOUNT, getfsstat64_callback, &fst);
  
         if (fst.error) {
                 KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
@@ -3239,7 +3655,7 @@ common_fchdir(proc_t p, struct fchdir_args *uap, int per_thread)
         vnode_t tdp;
         vnode_t tvp;
         struct mount *mp;
-       int error;
+       int error, should_put = 1;
         vfs_context_t ctx = vfs_context_current();
  
         AUDIT_ARG(fd, uap->fd);
@@ -3309,6 +3725,7 @@ common_fchdir(proc_t p, struct fchdir_args *uap, int per_thread)
                 goto out;
         }
         vnode_put(vp);
+       should_put = 0;
  
         if (per_thread) {
                 thread_t th = vfs_context_thread(ctx);
@@ -3319,23 +3736,26 @@ common_fchdir(proc_t p, struct fchdir_args *uap, int per_thread)
                         OSBitOrAtomic(P_THCWD, &p->p_flag);
                 } else {
                         vnode_rele(vp);
-                       return ENOENT;
+                       error = ENOENT;
+                       goto out;
                 }
         } else {
+               proc_dirs_lock_exclusive(p);
                 proc_fdlock(p);
                 tvp = fdp->fd_cdir;
                 fdp->fd_cdir = vp;
                 proc_fdunlock(p);
+               proc_dirs_unlock_exclusive(p);
         }
  
         if (tvp) {
                 vnode_rele(tvp);
         }
-       file_drop(uap->fd);
  
-       return 0;
  out:
-       vnode_put(vp);
+       if (should_put) {
+               vnode_put(vp);
+       }
         file_drop(uap->fd);
  
         return error;
@@ -3353,6 +3773,7 @@ __pthread_fchdir(proc_t p, struct __pthread_fchdir_args *uap, __unused int32_t *
         return common_fchdir(p, (void *)uap, 1);
  }
  
+
  /*
   * Change current working directory (".").
   *
@@ -3362,46 +3783,44 @@ __pthread_fchdir(proc_t p, struct __pthread_fchdir_args *uap, __unused int32_t *
   *     vnode_ref:ENOENT                No such file or directory
   */
  /* ARGSUSED */
-static int
-common_chdir(proc_t p, struct chdir_args *uap, int per_thread)
+int
+chdir_internal(proc_t p, vfs_context_t ctx, struct nameidata *ndp, int per_thread)
  {
         struct filedesc *fdp = p->p_fd;
         int error;
-       struct nameidata nd;
         vnode_t tvp;
-       vfs_context_t ctx = vfs_context_current();
  
-       NDINIT(&nd, LOOKUP, OP_CHDIR, FOLLOW | AUDITVNPATH1,
-           UIO_USERSPACE, uap->path, ctx);
-       error = change_dir(&nd, ctx);
+       error = change_dir(ndp, ctx);
         if (error) {
                 return error;
         }
-       if ((error = vnode_ref(nd.ni_vp))) {
-               vnode_put(nd.ni_vp);
+       if ((error = vnode_ref(ndp->ni_vp))) {
+               vnode_put(ndp->ni_vp);
                 return error;
         }
         /*
          * drop the iocount we picked up in change_dir
          */
-       vnode_put(nd.ni_vp);
+       vnode_put(ndp->ni_vp);
  
         if (per_thread) {
                 thread_t th = vfs_context_thread(ctx);
                 if (th) {
                         uthread_t uth = get_bsdthread_info(th);
                         tvp = uth->uu_cdir;
-                       uth->uu_cdir = nd.ni_vp;
+                       uth->uu_cdir = ndp->ni_vp;
                         OSBitOrAtomic(P_THCWD, &p->p_flag);
                 } else {
-                       vnode_rele(nd.ni_vp);
+                       vnode_rele(ndp->ni_vp);
                         return ENOENT;
                 }
         } else {
+               proc_dirs_lock_exclusive(p);
                 proc_fdlock(p);
                 tvp = fdp->fd_cdir;
-               fdp->fd_cdir = nd.ni_vp;
+               fdp->fd_cdir = ndp->ni_vp;
                 proc_fdunlock(p);
+               proc_dirs_unlock_exclusive(p);
         }
  
         if (tvp) {
@@ -3413,13 +3832,35 @@ common_chdir(proc_t p, struct chdir_args *uap, int per_thread)
  
  
  /*
- * chdir
- *
- * Change current working directory (".") for the entire process
- *
- * Parameters:  p       Process requesting the call
- *              uap     User argument descriptor (see below)
- *              retval  (ignored)
+ * Change current working directory (".").
+ *
+ * Returns:    0                       Success
+ *     chdir_internal:ENOTDIR
+ *     chdir_internal:ENOENT           No such file or directory
+ *     chdir_internal:???
+ */
+/* ARGSUSED */
+static int
+common_chdir(proc_t p, struct chdir_args *uap, int per_thread)
+{
+       struct nameidata nd; /* lookup state, initialized below for the user-supplied path */
+       vfs_context_t ctx = vfs_context_current();
+
+       NDINIT(&nd, LOOKUP, OP_CHDIR, FOLLOW | AUDITVNPATH1,
+           UIO_USERSPACE, uap->path, ctx); /* uap->path is treated as a user-space address */
+
+       return chdir_internal(p, ctx, &nd, per_thread); /* the lookup and cwd swap happen in chdir_internal */
+}
+
+
+/*
+ * chdir
+ *
+ * Change current working directory (".") for the entire process
+ *
+ * Parameters:  p       Process requesting the call
+ *              uap     User argument descriptor (see below)
+ *              retval  (ignored)
   *
   * Indirect parameters:        uap->path       Directory path
   *
@@ -3498,11 +3939,21 @@ chroot(proc_t p, struct chroot_args *uap, __unused int32_t *retval)
         }
         vnode_put(nd.ni_vp);
  
+       /*
+        * This lock provides the guarantee that as long as you hold the lock
+        * fdp->fd_rdir has a usecount on it. This is used to take an iocount
+        * on a referenced vnode in namei when determining the rootvnode for
+        * a process.
+        */
+       /* needed for synchronization with lookup */
+       proc_dirs_lock_exclusive(p);
+       /* needed for setting the flag and other activities on the fd itself */
         proc_fdlock(p);
         tvp = fdp->fd_rdir;
         fdp->fd_rdir = nd.ni_vp;
         fdp->fd_flags |= FD_CHROOT;
         proc_fdunlock(p);
+       proc_dirs_unlock_exclusive(p);
  
         if (tvp != NULL) {
                 vnode_rele(tvp);
@@ -3511,6 +3962,110 @@ chroot(proc_t p, struct chroot_args *uap, __unused int32_t *retval)
         return 0;
  }
  
+#define PATHSTATICBUFLEN 256 /* size of the on-stack path buffers pivot_root tries first */
+#define PIVOT_ROOT_ENTITLEMENT              \
+       "com.apple.private.vfs.pivot-root" /* entitlement pivot_root checks on the calling task */
+
+#if defined(XNU_TARGET_OS_OSX)
+int
+pivot_root(proc_t p, struct pivot_root_args *uap, __unused int *retval)
+{
+       int error;
+       char new_rootfs_path_before[PATHSTATICBUFLEN] = {0}; /* on-stack copy of uap->new_rootfs_path_before */
+       char old_rootfs_path_after[PATHSTATICBUFLEN] = {0}; /* on-stack copy of uap->old_rootfs_path_after */
+       char *new_rootfs_path_before_buf = NULL; /* ZV_NAMEI fallback, set only if the stack buffer was too small */
+       char *old_rootfs_path_after_buf = NULL; /* ZV_NAMEI fallback, set only if the stack buffer was too small */
+       char *incoming = NULL; /* whichever buffer ended up holding new_rootfs_path_before */
+       char *outgoing = NULL; /* whichever buffer ended up holding old_rootfs_path_after */
+       vnode_t incoming_rootvp = NULLVP; /* filled in by vnode_lookup(incoming); released at out: */
+       size_t bytes_copied; /* written by copyinstr; never read in this function */
+
+       /*
+        * Hands both user paths to vfs_switch_root(); superuser only.
+        * XXX : Additional restrictions needed - perhaps callable only once.
+        */
+       if ((error = suser(kauth_cred_get(), &p->p_acflag))) {
+               return error;
+       }
+
+       /*
+        * pivot_root can be executed by launchd (pid 1) only, and the calling
+        * task must also hold PIVOT_ROOT_ENTITLEMENT; anything else gets EPERM.
+        */
+       if ((p->p_pid != 1) || !IOTaskHasEntitlement(current_task(), PIVOT_ROOT_ENTITLEMENT)) {
+               return EPERM;
+       }
+
+       error = copyinstr(uap->new_rootfs_path_before, &new_rootfs_path_before[0], PATHSTATICBUFLEN, &bytes_copied);
+       if (error == ENAMETOOLONG) { /* did not fit on the stack: retry into a ZV_NAMEI buffer, up to MAXPATHLEN */
+               new_rootfs_path_before_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
+               error = copyinstr(uap->new_rootfs_path_before, new_rootfs_path_before_buf, MAXPATHLEN, &bytes_copied);
+       }
+
+       if (error) {
+               goto out; /* out: frees whichever zone buffers were allocated */
+       }
+
+       error = copyinstr(uap->old_rootfs_path_after, &old_rootfs_path_after[0], PATHSTATICBUFLEN, &bytes_copied);
+       if (error == ENAMETOOLONG) { /* same two-step copy as for the first path */
+               old_rootfs_path_after_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
+               error = copyinstr(uap->old_rootfs_path_after, old_rootfs_path_after_buf, MAXPATHLEN, &bytes_copied);
+       }
+       if (error) {
+               goto out;
+       }
+
+       if (new_rootfs_path_before_buf) { /* prefer the zone buffer when the fallback was taken */
+               incoming = new_rootfs_path_before_buf;
+       } else {
+               incoming = &new_rootfs_path_before[0];
+       }
+
+       if (old_rootfs_path_after_buf) { /* prefer the zone buffer when the fallback was taken */
+               outgoing = old_rootfs_path_after_buf;
+       } else {
+               outgoing = &old_rootfs_path_after[0];
+       }
+
+       /*
+        * The proposed incoming FS MUST be authenticated (i.e. not a chunklist DMG).
+        * Userland is not allowed to pivot to an image.
+        */
+       error = vnode_lookup(incoming, 0, &incoming_rootvp, vfs_context_kernel());
+       if (error) {
+               goto out;
+       }
+       error = VNOP_IOCTL(incoming_rootvp, FSIOC_KERNEL_ROOTAUTH, NULL, 0, vfs_context_kernel());
+       if (error) {
+               goto out; /* a non-zero result from the FSIOC_KERNEL_ROOTAUTH ioctl aborts the pivot */
+       }
+
+       error = vfs_switch_root(incoming, outgoing, VFSSR_VIRTUALDEV_PROHIBITED); /* result is returned to the caller as-is */
+
+out:
+       if (incoming_rootvp != NULLVP) {
+               vnode_put(incoming_rootvp); /* release the vnode returned by vnode_lookup */
+               incoming_rootvp = NULLVP;
+       }
+
+       if (old_rootfs_path_after_buf) { /* only non-NULL when the ZV_NAMEI fallback was used */
+               zfree(ZV_NAMEI, old_rootfs_path_after_buf);
+       }
+
+       if (new_rootfs_path_before_buf) { /* only non-NULL when the ZV_NAMEI fallback was used */
+               zfree(ZV_NAMEI, new_rootfs_path_before_buf);
+       }
+
+       return error;
+}
+#else
+int
+pivot_root(proc_t p, __unused struct pivot_root_args *uap, int *retval)
+{
+       return nosys(p, NULL, retval); /* built when XNU_TARGET_OS_OSX is not defined: defer to nosys() */
+}
+#endif /* XNU_TARGET_OS_OSX */
+
  /*
   * Common routine for chroot and chdir.
   *
@@ -3562,8 +4117,8 @@ fg_vn_data_alloc(void)
         struct fd_vn_data *fvdata;
  
         /* Allocate per fd vnode data */
-       MALLOC(fvdata, struct fd_vn_data *, (sizeof(struct fd_vn_data)),
-           M_FD_VN_DATA, M_WAITOK | M_ZERO);
+       fvdata = kheap_alloc(KM_FD_VN_DATA, sizeof(struct fd_vn_data),
+           Z_WAITOK | Z_ZERO);
         lck_mtx_init(&fvdata->fv_lock, fd_vn_lck_grp, fd_vn_lck_attr);
         return fvdata;
  }
@@ -3576,11 +4131,9 @@ fg_vn_data_free(void *fgvndata)
  {
         struct fd_vn_data *fvdata = (struct fd_vn_data *)fgvndata;
  
-       if (fvdata->fv_buf) {
-               FREE(fvdata->fv_buf, M_FD_DIRBUF);
-       }
+       kheap_free(KHEAP_DATA_BUFFERS, fvdata->fv_buf, fvdata->fv_bufallocsiz);
         lck_mtx_destroy(&fvdata->fv_lock, fd_vn_lck_grp);
-       FREE(fvdata, M_FD_VN_DATA);
+       kheap_free(KM_FD_VN_DATA, fvdata, sizeof(struct fd_vn_data));
  }
  
  /*
@@ -3611,7 +4164,6 @@ open1(vfs_context_t ctx, struct nameidata *ndp, int uflags,
         vnode_t vp;
         int flags, oflags;
         int type, indx, error;
-       struct flock lf;
         struct vfs_context context;
  
         oflags = uflags;
@@ -3650,14 +4202,15 @@ open1(vfs_context_t ctx, struct nameidata *ndp, int uflags,
         uu->uu_dupfd = 0;
         vp = ndp->ni_vp;
  
-       fp->f_fglob->fg_flag = flags & (FMASK | O_EVTONLY | FENCRYPTED | FUNENCRYPTED);
-       fp->f_fglob->fg_ops = &vnops;
-       fp->f_fglob->fg_data = (caddr_t)vp;
+       fp->fp_glob->fg_flag = flags & (FMASK | O_EVTONLY | FENCRYPTED | FUNENCRYPTED);
+       fp->fp_glob->fg_ops = &vnops;
+       fp->fp_glob->fg_data = (caddr_t)vp;
  
         if (flags & (O_EXLOCK | O_SHLOCK)) {
-               lf.l_whence = SEEK_SET;
-               lf.l_start = 0;
-               lf.l_len = 0;
+               struct flock lf = {
+                       .l_whence = SEEK_SET,
+               };
+
                 if (flags & O_EXLOCK) {
                         lf.l_type = F_WRLCK;
                 } else {
@@ -3668,31 +4221,17 @@ open1(vfs_context_t ctx, struct nameidata *ndp, int uflags,
                         type |= F_WAIT;
                 }
  #if CONFIG_MACF
-               error = mac_file_check_lock(vfs_context_ucred(ctx), fp->f_fglob,
+               error = mac_file_check_lock(vfs_context_ucred(ctx), fp->fp_glob,
                     F_SETLK, &lf);
                 if (error) {
                         goto bad;
                 }
  #endif
-               if ((error = VNOP_ADVLOCK(vp, (caddr_t)fp->f_fglob, F_SETLK, &lf, type, ctx, NULL))) {
+               if ((error = VNOP_ADVLOCK(vp, (caddr_t)fp->fp_glob, F_SETLK, &lf, type, ctx, NULL))) {
                         goto bad;
                 }
-               fp->f_fglob->fg_flag |= FHASLOCK;
-       }
-
-#if DEVELOPMENT || DEBUG
-       /*
-        * XXX VSWAP: Check for entitlements or special flag here
-        * so we can restrict access appropriately.
-        */
-#else /* DEVELOPMENT || DEBUG */
-
-       if (vnode_isswap(vp) && (flags & (FWRITE | O_TRUNC)) && (ctx != vfs_context_kernel())) {
-               /* block attempt to write/truncate swapfile */
-               error = EPERM;
-               goto bad;
+               fp->fp_glob->fg_flag |= FWASLOCKED;
         }
-#endif /* DEVELOPMENT || DEBUG */
  
         /* try to truncate by setting the size attribute */
         if ((flags & O_TRUNC) && ((error = vnode_setsize(vp, (off_t)0, 0, ctx)) != 0)) {
@@ -3703,9 +4242,9 @@ open1(vfs_context_t ctx, struct nameidata *ndp, int uflags,
          * For directories we hold some additional information in the fd.
          */
         if (vnode_vtype(vp) == VDIR) {
-               fp->f_fglob->fg_vn_data = fg_vn_data_alloc();
+               fp->fp_glob->fg_vn_data = fg_vn_data_alloc();
         } else {
-               fp->f_fglob->fg_vn_data = NULL;
+               fp->fp_glob->fg_vn_data = NULL;
         }
  
         vnode_put(vp);
@@ -3718,7 +4257,7 @@ open1(vfs_context_t ctx, struct nameidata *ndp, int uflags,
             !(flags & O_NOCTTY)) {
                 int tmp = 0;
  
-               (void)(*fp->f_fglob->fg_ops->fo_ioctl)(fp, (int)TIOCSCTTY,
+               (void)(*fp->fp_glob->fg_ops->fo_ioctl)(fp, (int)TIOCSCTTY,
                     (caddr_t)&tmp, ctx);
         }
  
@@ -3733,7 +4272,7 @@ open1(vfs_context_t ctx, struct nameidata *ndp, int uflags,
  
  #if CONFIG_SECLUDED_MEMORY
         if (secluded_for_filecache &&
-           FILEGLOB_DTYPE(fp->f_fglob) == DTYPE_VNODE &&
+           FILEGLOB_DTYPE(fp->fp_glob) == DTYPE_VNODE &&
             vnode_vtype(vp) == VREG) {
                 memory_object_control_t moc;
  
@@ -3741,7 +4280,7 @@ open1(vfs_context_t ctx, struct nameidata *ndp, int uflags,
  
                 if (moc == MEMORY_OBJECT_CONTROL_NULL) {
                         /* nothing to do... */
-               } else if (fp->f_fglob->fg_flag & FWRITE) {
+               } else if (fp->fp_glob->fg_flag & FWRITE) {
                         /* writable -> no longer  eligible for secluded pages */
                         memory_object_mark_eligible_for_secluded(moc,
                             FALSE);
@@ -3750,7 +4289,7 @@ open1(vfs_context_t ctx, struct nameidata *ndp, int uflags,
                         size_t copied;
                         /* XXX FBDP: better way to detect /Applications/ ? */
                         if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
-                               copyinstr(ndp->ni_dirp,
+                               (void)copyinstr(ndp->ni_dirp,
                                     pathname,
                                     sizeof(pathname),
                                     &copied);
@@ -3784,27 +4323,14 @@ open1(vfs_context_t ctx, struct nameidata *ndp, int uflags,
  #else
  /* not implemented... */
  #endif
-                       if (!strncmp(vp->v_name,
-                           DYLD_SHARED_CACHE_NAME,
-                           strlen(DYLD_SHARED_CACHE_NAME)) ||
-                           !strncmp(vp->v_name,
-                           "dyld",
-                           strlen(vp->v_name)) ||
-                           !strncmp(vp->v_name,
-                           "launchd",
-                           strlen(vp->v_name)) ||
-                           !strncmp(vp->v_name,
-                           "Camera",
-                           strlen(vp->v_name)) ||
-                           !strncmp(vp->v_name,
-                           "mediaserverd",
-                           strlen(vp->v_name)) ||
-                           !strncmp(vp->v_name,
-                           "SpringBoard",
-                           strlen(vp->v_name)) ||
-                           !strncmp(vp->v_name,
-                           "backboardd",
-                           strlen(vp->v_name))) {
+                       size_t len = strlen(vp->v_name);
+                       if (!strncmp(vp->v_name, DYLD_SHARED_CACHE_NAME, len) ||
+                           !strncmp(vp->v_name, "dyld", len) ||
+                           !strncmp(vp->v_name, "launchd", len) ||
+                           !strncmp(vp->v_name, "Camera", len) ||
+                           !strncmp(vp->v_name, "mediaserverd", len) ||
+                           !strncmp(vp->v_name, "SpringBoard", len) ||
+                           !strncmp(vp->v_name, "backboardd", len)) {
                                 /*
                                  * This file matters when launching Camera:
                                  * do not store its contents in the secluded
@@ -3825,20 +4351,20 @@ open1(vfs_context_t ctx, struct nameidata *ndp, int uflags,
         return 0;
  bad:
         context = *vfs_context_current();
-       context.vc_ucred = fp->f_fglob->fg_cred;
+       context.vc_ucred = fp->fp_glob->fg_cred;
  
-       if ((fp->f_fglob->fg_flag & FHASLOCK) &&
-           (FILEGLOB_DTYPE(fp->f_fglob) == DTYPE_VNODE)) {
-               lf.l_whence = SEEK_SET;
-               lf.l_start = 0;
-               lf.l_len = 0;
-               lf.l_type = F_UNLCK;
+       if ((fp->fp_glob->fg_flag & FWASLOCKED) &&
+           (FILEGLOB_DTYPE(fp->fp_glob) == DTYPE_VNODE)) {
+               struct flock lf = {
+                       .l_whence = SEEK_SET,
+                       .l_type = F_UNLCK,
+               };
  
                 (void)VNOP_ADVLOCK(
-                       vp, (caddr_t)fp->f_fglob, F_UNLCK, &lf, F_FLOCK, ctx, NULL);
+                       vp, (caddr_t)fp->fp_glob, F_UNLCK, &lf, F_FLOCK, ctx, NULL);
         }
  
-       vn_close(vp, fp->f_fglob->fg_flag, &context);
+       vn_close(vp, fp->fp_glob->fg_flag, &context);
         vnode_put(vp);
         fp_free(p, indx, fp);
  
@@ -3939,7 +4465,7 @@ open_extended(proc_t p, struct open_extended_args *uap, int32_t *retval)
  
         VATTR_INIT(&va);
         cmode = ((uap->mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
-       VATTR_SET(&va, va_mode, cmode);
+       VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
         if (uap->uid != KAUTH_UID_NONE) {
                 VATTR_SET(&va, va_uid, uap->uid);
         }
@@ -4033,20 +4559,33 @@ openat_internal(vfs_context_t ctx, user_addr_t path, int flags, int mode,
      int fd, enum uio_seg segflg, int *retval)
  {
         struct filedesc *fdp = (vfs_context_proc(ctx))->p_fd;
-       struct vnode_attr va;
-       struct nameidata nd;
+       struct {
+               struct vnode_attr va;
+               struct nameidata nd;
+       } *__open_data;
+       struct vnode_attr *vap;
+       struct nameidata *ndp;
         int cmode;
+       int error;
  
-       VATTR_INIT(&va);
+       __open_data = kheap_alloc(KHEAP_TEMP, sizeof(*__open_data), Z_WAITOK);
+       vap = &__open_data->va;
+       ndp = &__open_data->nd;
+
+       VATTR_INIT(vap);
         /* Mask off all but regular access permissions */
         cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
-       VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
+       VATTR_SET(vap, va_mode, cmode & ACCESSPERMS);
  
-       NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1,
+       NDINIT(ndp, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1,
             segflg, path, ctx);
  
-       return open1at(ctx, &nd, flags, &va, fileproc_alloc_init, NULL,
-                  retval, fd);
+       error = open1at(ctx, ndp, flags, vap, fileproc_alloc_init, NULL,
+           retval, fd);
+
+       kheap_free(KHEAP_TEMP, __open_data, sizeof(*__open_data));
+
+       return error;
  }
  
  int
@@ -4131,17 +4670,16 @@ openbyid_np(__unused proc_t p, struct openbyid_np_args *uap, int *retval)
  
         /*resolve path from fsis, objid*/
         do {
-               MALLOC(buf, char *, buflen + 1, M_TEMP, M_WAITOK);
+               buf = kheap_alloc(KHEAP_TEMP, buflen + 1, Z_WAITOK);
                 if (buf == NULL) {
                         return ENOMEM;
                 }
  
-               error = fsgetpath_internal(
-                       ctx, fsid.val[0], objid,
-                       buflen, buf, &pathlen);
+               error = fsgetpath_internal( ctx, fsid.val[0], objid, buflen,
+                   buf, FSOPT_ISREALFSID, &pathlen);
  
                 if (error) {
-                       FREE(buf, M_TEMP);
+                       kheap_free(KHEAP_TEMP, buf, buflen + 1);
                         buf = NULL;
                 }
         } while (error == ENOSPC && (buflen += MAXPATHLEN));
@@ -4155,7 +4693,7 @@ openbyid_np(__unused proc_t p, struct openbyid_np_args *uap, int *retval)
         error = openat_internal(
                 ctx, (user_addr_t)buf, uap->oflags, 0, AT_FDCWD, UIO_SYSSPACE, retval);
  
-       FREE(buf, M_TEMP);
+       kheap_free(KHEAP_TEMP, buf, buflen + 1);
  
         return error;
  }
@@ -4184,7 +4722,7 @@ mknod(proc_t p, struct mknod_args *uap, __unused int32_t *retval)
                 return mkfifo1(ctx, uap->path, &va);
         }
  
-       AUDIT_ARG(mode, uap->mode);
+       AUDIT_ARG(mode, (mode_t)uap->mode);
         AUDIT_ARG(value32, uap->dev);
  
         if ((error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
@@ -4391,32 +4929,22 @@ mkfifo(proc_t p, struct mkfifo_args *uap, __unused int32_t *retval)
         return mkfifo1(vfs_context_current(), uap->path, &va);
  }
  
-
-static char *
-my_strrchr(char *p, int ch)
-{
-       char *save;
-
-       for (save = NULL;; ++p) {
-               if (*p == ch) {
-                       save = p;
-               }
-               if (!*p) {
-                       return save;
-               }
-       }
-       /* NOTREACHED */
-}
-
+extern int safe_getpath_new(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path, int firmlink);
  extern int safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
+extern int safe_getpath_no_firmlink(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
  
  int
-safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
+safe_getpath_new(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path, int firmlink)
  {
         int ret, len = _len;
  
         *truncated_path = 0;
-       ret = vn_getpath(dvp, path, &len);
+
+       if (firmlink) {
+               ret = vn_getpath(dvp, path, &len);
+       } else {
+               ret = vn_getpath_no_firmlink(dvp, path, &len);
+       }
         if (ret == 0 && len < (MAXPATHLEN - 1)) {
                 if (leafname) {
                         path[len - 1] = '/';
@@ -4426,11 +4954,11 @@ safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *trunc
  
                                 // the string got truncated!
                                 *truncated_path = 1;
-                               ptr = my_strrchr(path, '/');
+                               ptr = strrchr(path, '/');
                                 if (ptr) {
                                         *ptr = '\0';   // chop off the string at the last directory component
                                 }
-                               len = strlen(path) + 1;
+                               len = (int)strlen(path) + 1;
                         }
                 }
         } else if (ret == 0) {
@@ -4462,13 +4990,28 @@ safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *trunc
                         }
  
                         len = _len;
-                       ret = vn_getpath(mydvp, path, &len);
+                       if (firmlink) {
+                               ret = vn_getpath(mydvp, path, &len);
+                       } else {
+                               ret = vn_getpath_no_firmlink(mydvp, path, &len);
+                       }
                 } while (ret == ENOSPC);
         }
  
         return len;
  }
  
+int
+safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
+{
+       return safe_getpath_new(dvp, leafname, path, _len, truncated_path, 1); /* firmlink = 1: safe_getpath_new uses vn_getpath() */
+}
+
+int
+safe_getpath_no_firmlink(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
+{
+       return safe_getpath_new(dvp, leafname, path, _len, truncated_path, 0); /* firmlink = 0: safe_getpath_new uses vn_getpath_no_firmlink() */
+}
  
  /*
   * Make a hard file link.
@@ -4486,7 +5029,7 @@ static int
  linkat_internal(vfs_context_t ctx, int fd1, user_addr_t path, int fd2,
      user_addr_t link, int flag, enum uio_seg segflg)
  {
-       vnode_t vp, dvp, lvp;
+       vnode_t vp, pvp, dvp, lvp;
         struct nameidata nd;
         int follow;
         int error;
@@ -4608,10 +5151,6 @@ linkat_internal(vfs_context_t ctx, int fd1, user_addr_t path, int fd2,
  
                 /* build the path to the new link file */
                 GET_PATH(target_path);
-               if (target_path == NULL) {
-                       error = ENOMEM;
-                       goto out2;
-               }
  
                 len = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, target_path, MAXPATHLEN, &truncated);
  
@@ -4620,10 +5159,6 @@ linkat_internal(vfs_context_t ctx, int fd1, user_addr_t path, int fd2,
                 if (has_listeners) {
                         /* build the path to file we are linking to */
                         GET_PATH(link_to_path);
-                       if (link_to_path == NULL) {
-                               error = ENOMEM;
-                               goto out2;
-                       }
  
                         link_name_len = MAXPATHLEN;
                         if (vn_getpath(vp, link_to_path, &link_name_len) == 0) {
@@ -4653,10 +5188,22 @@ linkat_internal(vfs_context_t ctx, int fd1, user_addr_t path, int fd2,
                                     FSE_ARG_FINFO, &finfo,
                                     FSE_ARG_DONE);
                         }
-                       if (vp->v_parent) {
+
+                       pvp = vp->v_parent;
+                       // need an iocount on pvp in this case
+                       if (pvp && pvp != dvp) {
+                               error = vnode_get(pvp);
+                               if (error) {
+                                       pvp = NULLVP;
+                                       error = 0;
+                               }
+                       }
+                       if (pvp) {
                                 add_fsevent(FSE_STAT_CHANGED, ctx,
-                                   FSE_ARG_VNODE, vp->v_parent,
-                                   FSE_ARG_DONE);
+                                   FSE_ARG_VNODE, pvp, FSE_ARG_DONE);
+                       }
+                       if (pvp && pvp != dvp) {
+                               vnode_put(pvp);
                         }
                 }
  #endif
@@ -4719,7 +5266,7 @@ symlinkat_internal(vfs_context_t ctx, user_addr_t path_data, int fd,
  
         error = 0;
         if (UIO_SEG_IS_USER_SPACE(segflg)) {
-               MALLOC_ZONE(path, char *, MAXPATHLEN, M_NAMEI, M_WAITOK);
+               path = zalloc(ZV_NAMEI);
                 error = copyinstr(path_data, path, MAXPATHLEN, &dummy);
         } else {
                 path = (char *)path_data;
@@ -4769,16 +5316,16 @@ symlinkat_internal(vfs_context_t ctx, user_addr_t path_data, int fd,
                 error = VNOP_SYMLINK(dvp, &vp, &nd.ni_cnd, &va, path, ctx);
         }
  
-#if CONFIG_MACF
+       /* do fallback attribute handling */
         if (error == 0 && vp) {
-               error = vnode_label(vnode_mount(vp), dvp, vp, &nd.ni_cnd, VNODE_LABEL_CREATE, ctx);
+               error = vnode_setattr_fallback(vp, &va, ctx);
         }
-#endif
  
-       /* do fallback attribute handling */
+#if CONFIG_MACF
         if (error == 0 && vp) {
-               error = vnode_setattr_fallback(vp, &va, ctx);
+               error = vnode_label(vnode_mount(vp), dvp, vp, &nd.ni_cnd, VNODE_LABEL_CREATE, ctx);
         }
+#endif
  
         if (error == 0) {
                 int     update_flags = 0;
@@ -4855,7 +5402,7 @@ skipit:
         vnode_put(dvp);
  out:
         if (path && (path != (char *)path_data)) {
-               FREE_ZONE(path, MAXPATHLEN, M_NAMEI);
+               zfree(ZV_NAMEI, path);
         }
  
         return error;
@@ -4899,7 +5446,9 @@ unlinkat_internal(vfs_context_t ctx, int fd, vnode_t start_dvp,
         int error;
         struct componentname *cnp;
         char  *path = NULL;
-       int  len = 0;
+       char  *no_firmlink_path = NULL;
+       int  len_path = 0;
+       int  len_no_firmlink_path = 0;
  #if CONFIG_FSE
         fse_info  finfo;
         struct vnode_attr va;
@@ -4908,6 +5457,7 @@ unlinkat_internal(vfs_context_t ctx, int fd, vnode_t start_dvp,
         int need_event;
         int has_listeners;
         int truncated_path;
+       int truncated_no_firmlink_path;
         int batched;
         struct vnode_attr *vap;
         int do_retry;
@@ -4934,6 +5484,7 @@ retry:
         need_event = 0;
         has_listeners = 0;
         truncated_path = 0;
+       truncated_no_firmlink_path = 0;
         vap = NULL;
  
         NDINIT(&nd, DELETE, OP_UNLINK, cn_flags, segflg, path_arg, ctx);
@@ -4967,8 +5518,9 @@ continue_lookup:
                 /*
                  * The root of a mounted filesystem cannot be deleted.
                  */
-               if (vp->v_flag & VROOT) {
+               if ((vp->v_flag & VROOT) || (dvp->v_mount != vp->v_mount)) {
                         error = EBUSY;
+                       goto out;
                 }
  
  #if DEVELOPMENT || DEBUG
@@ -4988,7 +5540,6 @@ continue_lookup:
                         error = vn_authorize_unlink(dvp, vp, cnp, ctx, NULL);
                         if (error) {
                                 if (error == ENOENT) {
-                                       assert(retry_count < MAX_AUTHORIZE_ENOENT_RETRIES);
                                         if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
                                                 do_retry = 1;
                                                 retry_count++;
@@ -5027,12 +5578,12 @@ continue_lookup:
         if (need_event || has_listeners) {
                 if (path == NULL) {
                         GET_PATH(path);
-                       if (path == NULL) {
-                               error = ENOMEM;
-                               goto out;
-                       }
                 }
-               len = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);
+               len_path = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);
+               if (no_firmlink_path == NULL) {
+                       GET_PATH(no_firmlink_path);
+               }
+               len_no_firmlink_path = safe_getpath_no_firmlink(dvp, nd.ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);
         }
  
  #if NAMEDRSRCFORK
@@ -5058,7 +5609,6 @@ continue_lookup:
                         }
                         goto continue_lookup;
                 } else if (error == ENOENT && batched) {
-                       assert(retry_count < MAX_AUTHORIZE_ENOENT_RETRIES);
                         if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
                                 /*
                                  * For compound VNOPs, the authorization callback may
@@ -5106,7 +5656,7 @@ continue_lookup:
                                 finfo.mode |= FSE_TRUNCATED_PATH;
                         }
                         add_fsevent(FSE_DELETE, ctx,
-                           FSE_ARG_STRING, len, path,
+                           FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
                             FSE_ARG_FINFO, &finfo,
                             FSE_ARG_DONE);
                 }
@@ -5116,8 +5666,13 @@ continue_lookup:
  out:
         if (path != NULL) {
                 RELEASE_PATH(path);
+               path = NULL;
         }
  
+       if (no_firmlink_path != NULL) {
+               RELEASE_PATH(no_firmlink_path);
+               no_firmlink_path = NULL;
+       }
  #if NAMEDRSRCFORK
         /* recycle the deleted rsrc fork vnode to force a reclaim, which
          * will cause its shadow file to go away if necessary.
@@ -5176,13 +5731,18 @@ unlink(__unused proc_t p, struct unlink_args *uap, __unused int32_t *retval)
  int
  unlinkat(__unused proc_t p, struct unlinkat_args *uap, __unused int32_t *retval)
  {
-       if (uap->flag & ~AT_REMOVEDIR) {
+       if (uap->flag & ~(AT_REMOVEDIR | AT_REMOVEDIR_DATALESS)) {
                 return EINVAL;
         }
  
-       if (uap->flag & AT_REMOVEDIR) {
+       if (uap->flag & (AT_REMOVEDIR | AT_REMOVEDIR_DATALESS)) {
+               int unlink_flags = 0;
+
+               if (uap->flag & AT_REMOVEDIR_DATALESS) {
+                       unlink_flags |= VNODE_REMOVE_DATALESS_DIR;
+               }
                 return rmdirat_internal(vfs_context_current(), uap->fd,
-                          uap->path, UIO_USERSPACE);
+                          uap->path, UIO_USERSPACE, unlink_flags);
         } else {
                 return unlinkat_internal(vfs_context_current(), uap->fd,
                            NULLVP, uap->path, UIO_USERSPACE, 0);
@@ -5217,10 +5777,10 @@ lseek(proc_t p, struct lseek_args *uap, off_t *retval)
  #if CONFIG_MACF
         if (uap->whence == L_INCR && uap->offset == 0) {
                 error = mac_file_check_get_offset(vfs_context_ucred(ctx),
-                   fp->f_fglob);
+                   fp->fp_glob);
         } else {
                 error = mac_file_check_change_offset(vfs_context_ucred(ctx),
-                   fp->f_fglob);
+                   fp->fp_glob);
         }
         if (error) {
                 file_drop(uap->fd);
@@ -5234,7 +5794,7 @@ lseek(proc_t p, struct lseek_args *uap, off_t *retval)
  
         switch (uap->whence) {
         case L_INCR:
-               offset += fp->f_fglob->fg_offset;
+               offset += fp->fp_glob->fg_offset;
                 break;
         case L_XTND:
                 if ((error = vnode_size(vp, &file_size, ctx)) != 0) {
@@ -5268,8 +5828,8 @@ lseek(proc_t p, struct lseek_args *uap, off_t *retval)
                                 error = EINVAL;
                         } else {
                                 /* Success */
-                               fp->f_fglob->fg_offset = offset;
-                               *retval = fp->f_fglob->fg_offset;
+                               fp->fp_glob->fg_offset = offset;
+                               *retval = fp->fp_glob->fg_offset;
                         }
                 }
         }
@@ -5402,7 +5962,8 @@ access_extended(__unused proc_t p, struct access_extended_args *uap, __unused in
         errno_t *result = NULL;
         errno_t error = 0;
         int wantdelete = 0;
-       unsigned int desc_max, desc_actual, i, j;
+       size_t desc_max, desc_actual;
+       unsigned int i, j;
         struct vfs_context context;
         struct nameidata nd;
         int niopts;
@@ -5432,7 +5993,7 @@ access_extended(__unused proc_t p, struct access_extended_args *uap, __unused in
         if (uap->size <= sizeof(stack_input)) {
                 input = stack_input;
         } else {
-               MALLOC(input, struct accessx_descriptor *, uap->size, M_TEMP, M_WAITOK);
+               input = kheap_alloc(KHEAP_DATA_BUFFERS, uap->size, Z_WAITOK);
                 if (input == NULL) {
                         error = ENOMEM;
                         goto out;
@@ -5528,7 +6089,8 @@ access_extended(__unused proc_t p, struct access_extended_args *uap, __unused in
                 error = ENOMEM;
                 goto out;
         }
-       MALLOC(result, errno_t *, desc_actual * sizeof(errno_t), M_TEMP, M_WAITOK | M_ZERO);
+       result = kheap_alloc(KHEAP_DATA_BUFFERS, desc_actual * sizeof(errno_t),
+           Z_WAITOK | Z_ZERO);
         if (result == NULL) {
                 error = ENOMEM;
                 goto out;
@@ -5619,10 +6181,10 @@ access_extended(__unused proc_t p, struct access_extended_args *uap, __unused in
  
  out:
         if (input && input != stack_input) {
-               FREE(input, M_TEMP);
+               kheap_free(KHEAP_DATA_BUFFERS, input, uap->size);
         }
         if (result) {
-               FREE(result, M_TEMP);
+               kheap_free(KHEAP_DATA_BUFFERS, result, desc_actual * sizeof(errno_t));
         }
         if (vp) {
                 vnode_put(vp);
@@ -5674,7 +6236,7 @@ faccessat_internal(vfs_context_t ctx, int fd, user_addr_t path, int amode,
         context.vc_thread = ctx->vc_thread;
  
  
-       niopts = FOLLOW | AUDITVNPATH1;
+       niopts = (flag & AT_SYMLINK_NOFOLLOW ? NOFOLLOW : FOLLOW) | AUDITVNPATH1;
         /* need parent for vnode_authorize for deletion test */
         if (amode & _DELETE_OK) {
                 niopts |= WANTPARENT;
@@ -5738,7 +6300,7 @@ int
  faccessat(__unused proc_t p, struct faccessat_args *uap,
      __unused int32_t *retval)
  {
-       if (uap->flag & ~AT_EACCESS) {
+       if (uap->flag & ~(AT_EACCESS | AT_SYMLINK_NOFOLLOW)) {
                 return EINVAL;
         }
  
@@ -5775,6 +6337,8 @@ fstatat_internal(vfs_context_t ctx, user_addr_t path, user_addr_t ub,
         kauth_filesec_t fsec;
         size_t xsecurity_bufsize;
         void * statptr;
+       struct fileproc *fp = NULL;
+       int needsrealdev = 0;
  
         follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
         NDINIT(&nd, LOOKUP, OP_GETATTR, follow | AUDITVNPATH1,
@@ -5785,9 +6349,24 @@ fstatat_internal(vfs_context_t ctx, user_addr_t path, user_addr_t ub,
         /* stat calls are allowed for resource forks. */
         nd.ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
  #endif
-       error = nameiat(&nd, fd);
-       if (error) {
-               return error;
+
+       if (flag & AT_FDONLY) {
+               vnode_t fvp;
+
+               error = fp_getfvp(vfs_context_proc(ctx), fd, &fp, &fvp);
+               if (error) {
+                       return error;
+               }
+               if ((error = vnode_getwithref(fvp))) {
+                       file_drop(fd);
+                       return error;
+               }
+               nd.ni_vp = fvp;
+       } else {
+               error = nameiat(&nd, fd);
+               if (error) {
+                       return error;
+               }
         }
         fsec = KAUTH_FILESEC_NONE;
  
@@ -5806,7 +6385,19 @@ fstatat_internal(vfs_context_t ctx, user_addr_t path, user_addr_t ub,
         }
  #endif
  
-       error = vn_stat(nd.ni_vp, statptr, (xsecurity != USER_ADDR_NULL ? &fsec : NULL), isstat64, ctx);
+       needsrealdev = flag & AT_REALDEV ? 1 : 0;
+       if (fp && (xsecurity == USER_ADDR_NULL)) {
+               /*
+                * If the caller has the file open, and is not
+                * requesting extended security information, we are
+                * going to let them get the basic stat information.
+                */
+               error = vn_stat_noauth(nd.ni_vp, statptr, NULL, isstat64, needsrealdev, ctx,
+                   fp->fp_glob->fg_cred);
+       } else {
+               error = vn_stat(nd.ni_vp, statptr, (xsecurity != USER_ADDR_NULL ? &fsec : NULL),
+                   isstat64, needsrealdev, ctx);
+       }
  
  #if NAMEDRSRCFORK
         if (is_namedstream) {
@@ -5815,6 +6406,10 @@ fstatat_internal(vfs_context_t ctx, user_addr_t path, user_addr_t ub,
  #endif
         vnode_put(nd.ni_vp);
         nameidone(&nd);
+       if (fp) {
+               file_drop(fd);
+               fp = NULL;
+       }
  
         if (error) {
                 return error;
@@ -6031,7 +6626,7 @@ lstat64_extended(__unused proc_t p, struct lstat64_extended_args *uap, __unused
  int
  fstatat(__unused proc_t p, struct fstatat_args *uap, __unused int32_t *retval)
  {
-       if (uap->flag & ~AT_SYMLINK_NOFOLLOW) {
+       if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY)) {
                 return EINVAL;
         }
  
@@ -6043,7 +6638,7 @@ int
  fstatat64(__unused proc_t p, struct fstatat64_args *uap,
      __unused int32_t *retval)
  {
-       if (uap->flag & ~AT_SYMLINK_NOFOLLOW) {
+       if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY)) {
                 return EINVAL;
         }
  
@@ -6103,6 +6698,10 @@ readlinkat_internal(vfs_context_t ctx, int fd, user_addr_t path,
         struct nameidata nd;
         char uio_buf[UIO_SIZEOF(1)];
  
+       if (bufsize > INT32_MAX) {
+               return EINVAL;
+       }
+
         NDINIT(&nd, LOOKUP, OP_READLINK, NOFOLLOW | AUDITVNPATH1,
             seg, path, ctx);
  
@@ -6133,7 +6732,7 @@ readlinkat_internal(vfs_context_t ctx, int fd, user_addr_t path,
         }
         vnode_put(vp);
  
-       *retval = bufsize - (int)uio_resid(auio);
+       *retval = (int)(bufsize - uio_resid(auio));
         return error;
  }
  
@@ -6159,29 +6758,25 @@ readlinkat(proc_t p, struct readlinkat_args *uap, int32_t *retval)
  }
  
  /*
- * Change file flags.
- *
- * NOTE: this will vnode_put() `vp'
+ * Change file flags, the deep inner layer.
   */
  static int
-chflags1(vnode_t vp, int flags, vfs_context_t ctx)
+chflags0(vnode_t vp, struct vnode_attr *va,
+    int (*setattr)(vnode_t, void *, vfs_context_t),
+    void *arg, vfs_context_t ctx)
  {
-       struct vnode_attr va;
-       kauth_action_t action;
+       kauth_action_t action = 0;
         int error;
  
-       VATTR_INIT(&va);
-       VATTR_SET(&va, va_flags, flags);
-
  #if CONFIG_MACF
-       error = mac_vnode_check_setflags(ctx, vp, flags);
+       error = mac_vnode_check_setflags(ctx, vp, va->va_flags);
         if (error) {
                 goto out;
         }
  #endif
  
         /* request authorisation, disregard immutability */
-       if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
+       if ((error = vnode_authattr(vp, va, &action, ctx)) != 0) {
                 goto out;
         }
         /*
@@ -6192,19 +6787,39 @@ chflags1(vnode_t vp, int flags, vfs_context_t ctx)
         if (action && ((error = vnode_authorize(vp, NULL, action | KAUTH_VNODE_NOIMMUTABLE, ctx)) != 0)) {
                 goto out;
         }
-       error = vnode_setattr(vp, &va, ctx);
+       error = (*setattr)(vp, arg, ctx);
  
  #if CONFIG_MACF
         if (error == 0) {
-               mac_vnode_notify_setflags(ctx, vp, flags);
+               mac_vnode_notify_setflags(ctx, vp, va->va_flags);
         }
  #endif
  
+out:
+       return error;
+}
+
+/*
+ * Change file flags.
+ *
+ * NOTE: this will vnode_put() `vp'
+ */
+static int
+chflags1(vnode_t vp, int flags, vfs_context_t ctx)
+{
+       struct vnode_attr va;
+       int error;
+
+       VATTR_INIT(&va);
+       VATTR_SET(&va, va_flags, flags);
+
+       error = chflags0(vp, &va, (void *)vnode_setattr, &va, ctx);
+       vnode_put(vp);
+
         if ((error == 0) && !VATTR_IS_SUPPORTED(&va, va_flags)) {
                 error = ENOTSUP;
         }
-out:
-       vnode_put(vp);
+
         return error;
  }
  
@@ -6772,8 +7387,8 @@ getutimes(user_addr_t usrtvp, struct timespec *tsp)
                         if (error) {
                                 return error;
                         }
-                       TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
-                       TIMEVAL_TO_TIMESPEC(&tv[1], &tsp[1]);
+                       TIMEVAL64_TO_TIMESPEC(&tv[0], &tsp[0]);
+                       TIMEVAL64_TO_TIMESPEC(&tv[1], &tsp[1]);
                 } else {
                         struct user32_timeval tv[2];
                         error = copyin(usrtvp, (void *)tv, sizeof(tv));
@@ -6921,7 +7536,7 @@ futimes(__unused proc_t p, struct futimes_args *uap, __unused int32_t *retval)
   */
  /* ARGSUSED */
  int
-truncate(__unused proc_t p, struct truncate_args *uap, __unused int32_t *retval)
+truncate(proc_t p, struct truncate_args *uap, __unused int32_t *retval)
  {
         vnode_t vp;
         struct vnode_attr va;
@@ -6929,10 +7544,18 @@ truncate(__unused proc_t p, struct truncate_args *uap, __unused int32_t *retval)
         int error;
         struct nameidata nd;
         kauth_action_t action;
+       rlim_t fsize_limit;
  
         if (uap->length < 0) {
                 return EINVAL;
         }
+
+       fsize_limit = proc_limitgetcur(p, RLIMIT_FSIZE, TRUE);
+       if ((rlim_t)uap->length > fsize_limit) {
+               psignal(p, SIGXFSZ);
+               return EFBIG;
+       }
+
         NDINIT(&nd, LOOKUP, OP_TRUNCATE, FOLLOW | AUDITVNPATH1,
             UIO_USERSPACE, uap->path, ctx);
         if ((error = namei(&nd))) {
@@ -6984,17 +7607,24 @@ ftruncate(proc_t p, struct ftruncate_args *uap, int32_t *retval)
         struct fileproc *fp;
         int error;
         int fd = uap->fd;
+       rlim_t fsize_limit;
  
         AUDIT_ARG(fd, uap->fd);
         if (uap->length < 0) {
                 return EINVAL;
         }
  
+       fsize_limit = proc_limitgetcur(p, RLIMIT_FSIZE, TRUE);
+       if ((rlim_t)uap->length > fsize_limit) {
+               psignal(p, SIGXFSZ);
+               return EFBIG;
+       }
+
         if ((error = fp_lookup(p, fd, &fp, 0))) {
                 return error;
         }
  
-       switch (FILEGLOB_DTYPE(fp->f_fglob)) {
+       switch (FILEGLOB_DTYPE(fp->fp_glob)) {
         case DTYPE_PSXSHM:
                 error = pshm_truncate(p, fp, uap->fd, uap->length, retval);
                 goto out;
@@ -7005,9 +7635,9 @@ ftruncate(proc_t p, struct ftruncate_args *uap, int32_t *retval)
                 goto out;
         }
  
-       vp = (vnode_t)fp->f_fglob->fg_data;
+       vp = (vnode_t)fp->fp_glob->fg_data;
  
-       if ((fp->f_fglob->fg_flag & FWRITE) == 0) {
+       if ((fp->fp_glob->fg_flag & FWRITE) == 0) {
                 AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
                 error = EINVAL;
                 goto out;
@@ -7021,7 +7651,7 @@ ftruncate(proc_t p, struct ftruncate_args *uap, int32_t *retval)
  
  #if CONFIG_MACF
         error = mac_vnode_check_truncate(ctx,
-           fp->f_fglob->fg_cred, vp);
+           fp->fp_glob->fg_cred, vp);
         if (error) {
                 (void)vnode_put(vp);
                 goto out;
@@ -7033,7 +7663,7 @@ ftruncate(proc_t p, struct ftruncate_args *uap, int32_t *retval)
  
  #if CONFIG_MACF
         if (error == 0) {
-               mac_vnode_notify_truncate(ctx, fp->f_fglob->fg_cred, vp);
+               mac_vnode_notify_truncate(ctx, fp->fp_glob->fg_cred, vp);
         }
  #endif
  
@@ -7135,7 +7765,7 @@ fsync_common(proc_t p, struct fsync_args *uap, int flags)
             (vp->v_flag & VISNAMEDSTREAM) &&
             (vp->v_parent != NULLVP) &&
             vnode_isshadow(vp) &&
-           (fp->f_flags & FP_WRITTEN)) {
+           (fp->fp_glob->fg_flag & FWASWRITTEN)) {
                 (void) vnode_flushnamedstream(vp->v_parent, vp, ctx);
         }
  #endif
@@ -7427,10 +8057,6 @@ clonefile_internal(vnode_t fvp, boolean_t data_read_authorised, int dst_dirfd,
                 int fsevent;
  #endif /* CONFIG_FSE */
  
-#if CONFIG_MACF
-               (void)vnode_label(vnode_mount(tvp), tdvp, tvp, cnp,
-                   VNODE_LABEL_CREATE, ctx);
-#endif
                 /*
                  * If some of the requested attributes weren't handled by the
                  * VNOP, use our fallback code.
@@ -7439,6 +8065,11 @@ clonefile_internal(vnode_t fvp, boolean_t data_read_authorised, int dst_dirfd,
                         (void)vnode_setattr_fallback(tvp, &nva, ctx);
                 }
  
+#if CONFIG_MACF
+               (void)vnode_label(vnode_mount(tvp), tdvp, tvp, cnp,
+                   VNODE_LABEL_CREATE, ctx);
+#endif
+
                 // Make sure the name & parent pointers are hooked up
                 if (tvp->v_name == NULL) {
                         update_flags |= VNODE_UPDATE_NAME;
@@ -7557,7 +8188,7 @@ fclonefileat(__unused proc_t p, struct fclonefileat_args *uap,
                 return error;
         }
  
-       if ((fp->f_fglob->fg_flag & FREAD) == 0) {
+       if ((fp->fp_glob->fg_flag & FREAD) == 0) {
                 AUDIT_ARG(vnpath_withref, fvp, ARG_VNODE1);
                 error = EBADF;
                 goto out;
@@ -7578,25 +8209,56 @@ out:
         return error;
  }
  
-/*
- * Rename files.  Source and destination must either both be directories,
- * or both not be directories.  If target is a directory, it must be empty.
- */
-/* ARGSUSED */
  static int
-renameat_internal(vfs_context_t ctx, int fromfd, user_addr_t from,
-    int tofd, user_addr_t to, int segflg, vfs_rename_flags_t flags)
+rename_submounts_callback(mount_t mp, void *arg)
  {
-       if (flags & ~VFS_RENAME_FLAGS_MASK) {
-               return EINVAL;
+       int error = 0;
+       mount_t pmp = (mount_t)arg;
+       int prefix_len = (int)strlen(pmp->mnt_vfsstat.f_mntonname);
+
+       if (strncmp(mp->mnt_vfsstat.f_mntonname, pmp->mnt_vfsstat.f_mntonname, prefix_len) != 0) {
+               return 0;
         }
  
-       if (ISSET(flags, VFS_RENAME_SWAP) && ISSET(flags, VFS_RENAME_EXCL)) {
-               return EINVAL;
+       if (mp->mnt_vfsstat.f_mntonname[prefix_len] != '/') {
+               return 0;
         }
  
-       vnode_t tvp, tdvp;
+       if ((error = vfs_busy(mp, LK_NOWAIT))) {
+               printf("vfs_busy failed with %d for %s\n", error, mp->mnt_vfsstat.f_mntonname);
+               return -1;
+       }
+
+       int pathlen = MAXPATHLEN;
+       if ((error = vn_getpath_ext(mp->mnt_vnodecovered, NULL, mp->mnt_vfsstat.f_mntonname, &pathlen, VN_GETPATH_FSENTER))) {
+               printf("vn_getpath_ext failed with %d for mnt_vnodecovered of %s\n", error, mp->mnt_vfsstat.f_mntonname);
+       }
+
+       vfs_unbusy(mp);
+
+       return error;
+}
+
+/*
+ * Rename files.  Source and destination must either both be directories,
+ * or both not be directories.  If target is a directory, it must be empty.
+ */
+/* ARGSUSED */
+static int
+renameat_internal(vfs_context_t ctx, int fromfd, user_addr_t from,
+    int tofd, user_addr_t to, int segflg, vfs_rename_flags_t flags)
+{
+       if (flags & ~VFS_RENAME_FLAGS_MASK) {
+               return EINVAL;
+       }
+
+       if (ISSET(flags, VFS_RENAME_SWAP) && ISSET(flags, VFS_RENAME_EXCL)) {
+               return EINVAL;
+       }
+
+       vnode_t tvp, tdvp;
         vnode_t fvp, fdvp;
+       vnode_t mnt_fvp;
         struct nameidata *fromnd, *tond;
         int error;
         int do_retry;
@@ -7607,14 +8269,18 @@ renameat_internal(vfs_context_t ctx, int fromfd, user_addr_t from,
         int has_listeners;
         const char *oname = NULL;
         char *from_name = NULL, *to_name = NULL;
+       char *from_name_no_firmlink = NULL, *to_name_no_firmlink = NULL;
         int from_len = 0, to_len = 0;
+       int from_len_no_firmlink = 0, to_len_no_firmlink = 0;
         int holding_mntlock;
+       int vn_authorize_skipped;
         mount_t locked_mp = NULL;
         vnode_t oparent = NULLVP;
  #if CONFIG_FSE
         fse_info from_finfo, to_finfo;
  #endif
-       int from_truncated = 0, to_truncated;
+       int from_truncated = 0, to_truncated = 0;
+       int from_truncated_no_firmlink = 0, to_truncated_no_firmlink = 0;
         int batched = 0;
         struct vnode_attr *fvap, *tvap;
         int continuing = 0;
@@ -7623,7 +8289,7 @@ renameat_internal(vfs_context_t ctx, int fromfd, user_addr_t from,
                 struct nameidata from_node, to_node;
                 struct vnode_attr fv_attr, tv_attr;
         } * __rename_data;
-       MALLOC(__rename_data, void *, sizeof(*__rename_data), M_TEMP, M_WAITOK);
+       __rename_data = kheap_alloc(KHEAP_TEMP, sizeof(*__rename_data), Z_WAITOK);
         fromnd = &__rename_data->from_node;
         tond = &__rename_data->to_node;
  
@@ -7634,7 +8300,9 @@ retry:
         fvp = tvp = NULL;
         fdvp = tdvp = NULL;
         fvap = tvap = NULL;
+       mnt_fvp = NULLVP;
         mntrename = FALSE;
+       vn_authorize_skipped = FALSE;
  
         NDINIT(fromnd, DELETE, OP_UNLINK, WANTPARENT | AUDITVNPATH1,
             segflg, from, ctx);
@@ -7695,8 +8363,31 @@ continue_lookup:
         }
  
         if (tvp && ISSET(flags, VFS_RENAME_EXCL)) {
-               error = EEXIST;
-               goto out1;
+               int32_t pval = 0;
+               int err = 0;
+
+               /*
+                * We allow rename with VFS_RENAME_EXCL flag for an existing file which
+                * has the same name as target iff the following conditions are met:
+                * 1. the target file system is case insensitive
+                * 2. source and target directories are the same
+                * 3. source and target files are the same
+                * 4. name only differs in case (determined by underlying filesystem)
+                */
+               if (fvp != tvp || fdvp != tdvp) {
+                       error = EEXIST;
+                       goto out1;
+               }
+
+               /*
+                * Assume that the target file system is case sensitive if
+                * _PC_CASE_SENSITIVE selector isn't supported.
+                */
+               err = VNOP_PATHCONF(tvp, _PC_CASE_SENSITIVE, &pval, ctx);
+               if (err != 0 || pval != 0) {
+                       error = EEXIST;
+                       goto out1;
+               }
         }
  
         batched = vnode_compound_rename_available(fdvp);
@@ -7742,25 +8433,29 @@ continue_lookup:
         if (need_event || has_listeners) {
                 if (from_name == NULL) {
                         GET_PATH(from_name);
-                       if (from_name == NULL) {
-                               error = ENOMEM;
-                               goto out1;
-                       }
                 }
  
                 from_len = safe_getpath(fdvp, fromnd->ni_cnd.cn_nameptr, from_name, MAXPATHLEN, &from_truncated);
+
+               if (from_name_no_firmlink == NULL) {
+                       GET_PATH(from_name_no_firmlink);
+               }
+
+               from_len_no_firmlink = safe_getpath_no_firmlink(fdvp, fromnd->ni_cnd.cn_nameptr, from_name_no_firmlink, MAXPATHLEN, &from_truncated_no_firmlink);
         }
  
         if (need_event || need_kpath2 || has_listeners) {
                 if (to_name == NULL) {
                         GET_PATH(to_name);
-                       if (to_name == NULL) {
-                               error = ENOMEM;
-                               goto out1;
-                       }
                 }
  
                 to_len = safe_getpath(tdvp, tond->ni_cnd.cn_nameptr, to_name, MAXPATHLEN, &to_truncated);
+
+               if (to_name_no_firmlink == NULL) {
+                       GET_PATH(to_name_no_firmlink);
+               }
+
+               to_len_no_firmlink = safe_getpath_no_firmlink(tdvp, tond->ni_cnd.cn_nameptr, to_name_no_firmlink, MAXPATHLEN, &to_truncated_no_firmlink);
                 if (to_name && need_kpath2) {
                         AUDIT_ARG(kpath, to_name, ARG_KPATH2);
                 }
@@ -7783,25 +8478,6 @@ continue_lookup:
                 goto skipped_lookup;
         }
  
-       if (!batched) {
-               error = vn_authorize_renamex_with_paths(fdvp, fvp, &fromnd->ni_cnd, from_name, tdvp, tvp, &tond->ni_cnd, to_name, ctx, flags, NULL);
-               if (error) {
-                       if (error == ENOENT) {
-                               assert(retry_count < MAX_AUTHORIZE_ENOENT_RETRIES);
-                               if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
-                                       /*
-                                        * We encountered a race where after doing the namei, tvp stops
-                                        * being valid. If so, simply re-drive the rename call from the
-                                        * top.
-                                        */
-                                       do_retry = 1;
-                                       retry_count += 1;
-                               }
-                       }
-                       goto out1;
-               }
-       }
-
         /*
          * If the source and destination are the same (i.e. they're
          * links to the same vnode) and the target file system is
@@ -7818,6 +8494,7 @@ continue_lookup:
                  */
                 if (VNOP_PATHCONF(fvp, _PC_CASE_SENSITIVE, &pathconf_val, ctx) != 0 ||
                     pathconf_val != 0) {
+                       vn_authorize_skipped = TRUE;
                         goto out1;
                 }
         }
@@ -7827,7 +8504,7 @@ continue_lookup:
          * - target must not exist
          * - target must reside in the same directory as source
          * - union mounts cannot be renamed
-        * - "/" cannot be renamed
+        * - the root fs, and tightly-linked system volumes, cannot be renamed
          *
          * XXX Handle this in VFS after a continued lookup (if we missed
          * in the cache to start off)
@@ -7844,6 +8521,7 @@ continue_lookup:
             (fvp->v_mountedhere == NULL) &&
             (fdvp == tdvp) &&
             ((fvp->v_mount->mnt_flag & (MNT_UNION | MNT_ROOTFS)) == 0) &&
+           ((fvp->v_mount->mnt_kern_flag & MNTK_SYSTEM) == 0) &&
             (fvp->v_mount->mnt_vnodecovered != NULLVP)) {
                 vnode_t coveredvp;
  
@@ -7853,7 +8531,11 @@ continue_lookup:
                         error = ENOENT;
                         goto out1;
                 }
-               vnode_put(fvp);
+               /*
+                * Save the 'fvp' as it is needed for vn_authorize_renamex_with_paths()
+                * later.
+                */
+               mnt_fvp = fvp;
  
                 fvp = coveredvp;
                 mntrename = TRUE;
@@ -7891,6 +8573,7 @@ continue_lookup:
                 if (fromnd->ni_cnd.cn_namelen == tond->ni_cnd.cn_namelen &&
                     !bcmp(fromnd->ni_cnd.cn_nameptr, tond->ni_cnd.cn_nameptr,
                     fromnd->ni_cnd.cn_namelen)) {
+                       vn_authorize_skipped = TRUE;
                         goto out1;
                 }
         }
@@ -7948,6 +8631,10 @@ continue_lookup:
                         vnode_put(fvp);
                         vnode_put(fdvp);
  
+                       if (mnt_fvp != NULLVP) {
+                               vnode_put(mnt_fvp);
+                       }
+
                         mount_lock_renames(locked_mp);
                         holding_mntlock = 1;
  
@@ -7971,6 +8658,32 @@ continue_lookup:
                 }
         }
  
+       if (!batched) {
+               error = vn_authorize_renamex_with_paths(fdvp, mntrename ? mnt_fvp : fvp,
+                   &fromnd->ni_cnd, from_name, tdvp, tvp, &tond->ni_cnd, to_name, ctx,
+                   flags, NULL);
+               if (error) {
+                       if (error == ENOENT) {
+                               if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
+                                       /*
+                                        * We encountered a race where after doing the namei,
+                                        * tvp stops being valid. If so, simply re-drive the rename
+                                        * call from the top.
+                                        */
+                                       do_retry = 1;
+                                       retry_count += 1;
+                               }
+                       }
+                       goto out1;
+               }
+       }
+
+       /* Release the 'mnt_fvp' now that it is no longer needed. */
+       if (mnt_fvp != NULLVP) {
+               vnode_put(mnt_fvp);
+               mnt_fvp = NULLVP;
+       }
+
         // save these off so we can later verify that fvp is the same
         oname   = fvp->v_name;
         oparent = fvp->v_parent;
@@ -7990,6 +8703,41 @@ skipped_lookup:
                 holding_mntlock = 0;
         }
         if (error) {
+               if (error == EDATALESS) {
+                       /*
+                        * If we've been here before, something has gone
+                        * horribly wrong and we should just get out lest
+                        * we spiral around the drain forever.
+                        */
+                       if (flags & VFS_RENAME_DATALESS) {
+                               error = EIO;
+                               goto out1;
+                       }
+
+                       /*
+                        * The object we're renaming is dataless (or has a
+                        * dataless descendent) and requires materialization
+                        * before the rename occurs.  But we're holding the
+                        * mount point's rename lock, so it's not safe to
+                        * make the upcall.
+                        *
+                        * In this case, we release the lock, perform the
+                        * materialization, and start the whole thing over.
+                        */
+                       error = vnode_materialize_dataless_file(fvp,
+                           NAMESPACE_HANDLER_RENAME_OP);
+
+                       if (error == 0) {
+                               /*
+                                * The next time around we need to tell the
+                                * file system that the materialization has
+                                * been performed.
+                                */
+                               flags |= VFS_RENAME_DATALESS;
+                               do_retry = 1;
+                       }
+                       goto out1;
+               }
                 if (error == EKEEPLOOKING) {
                         if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) == 0) {
                                 if ((tond->ni_flag & NAMEI_CONTLOOKUP) == 0) {
@@ -8011,7 +8759,13 @@ skipped_lookup:
                  * but other filesystems susceptible to this race could return it, too.
                  */
                 if (error == ERECYCLE) {
-                       do_retry = 1;
+                       if (retry_count < MAX_RENAME_ERECYCLE_RETRIES) {
+                               do_retry = 1;
+                               retry_count += 1;
+                       } else {
+                               printf("rename retry limit due to ERECYCLE reached\n");
+                               error = ENOENT;
+                       }
                 }
  
                 /*
@@ -8020,7 +8774,6 @@ skipped_lookup:
                  * cache, redrive the lookup.
                  */
                 if (batched && error == ENOENT) {
-                       assert(retry_count < MAX_AUTHORIZE_ENOENT_RETRIES);
                         if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
                                 do_retry = 1;
                                 retry_count += 1;
@@ -8058,9 +8811,9 @@ skipped_lookup:
  
                 if (tvp) {
                         add_fsevent(FSE_RENAME, ctx,
-                           FSE_ARG_STRING, from_len, from_name,
+                           FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
                             FSE_ARG_FINFO, &from_finfo,
-                           FSE_ARG_STRING, to_len, to_name,
+                           FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
                             FSE_ARG_FINFO, &to_finfo,
                             FSE_ARG_DONE);
                         if (flags & VFS_RENAME_SWAP) {
@@ -8071,17 +8824,17 @@ skipped_lookup:
                                  * two.
                                  */
                                 add_fsevent(FSE_RENAME, ctx,
-                                   FSE_ARG_STRING, to_len, to_name,
+                                   FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
                                     FSE_ARG_FINFO, &to_finfo,
-                                   FSE_ARG_STRING, from_len, from_name,
+                                   FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
                                     FSE_ARG_FINFO, &from_finfo,
                                     FSE_ARG_DONE);
                         }
                 } else {
                         add_fsevent(FSE_RENAME, ctx,
-                           FSE_ARG_STRING, from_len, from_name,
+                           FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
                             FSE_ARG_FINFO, &from_finfo,
-                           FSE_ARG_STRING, to_len, to_name,
+                           FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
                             FSE_ARG_DONE);
                 }
         }
@@ -8103,7 +8856,7 @@ skipped_lookup:
                         error = EBUSY;
                         goto out1;
                 }
-               MALLOC_ZONE(tobuf, char *, MAXPATHLEN, M_NAMEI, M_WAITOK);
+               tobuf = zalloc(ZV_NAMEI);
  
                 if (UIO_SEG_IS_USER_SPACE(segflg)) {
                         error = copyinstr(to, tobuf, MAXPATHLEN, &len);
@@ -8124,14 +8877,21 @@ skipped_lookup:
                                         mpname = cp + 1;
                                 }
                         }
+
+                       /* Update f_mntonname of sub mounts */
+                       vfs_iterate(0, rename_submounts_callback, (void *)mp);
+
                         /* append name to prefix */
-                       maxlen = MAXPATHLEN - (pathend - mp->mnt_vfsstat.f_mntonname);
+                       maxlen = MAXPATHLEN - (int)(pathend - mp->mnt_vfsstat.f_mntonname);
                         bzero(pathend, maxlen);
+
                         strlcpy(pathend, mpname, maxlen);
                 }
-               FREE_ZONE(tobuf, MAXPATHLEN, M_NAMEI);
+               zfree(ZV_NAMEI, tobuf);
  
                 vfs_unbusy(mp);
+
+               vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
         }
         /*
          * fix up name & parent pointers.  note that we first
@@ -8153,14 +8913,37 @@ skipped_lookup:
                 vnode_update_identity(fvp, tdvp, tond->ni_cnd.cn_nameptr, tond->ni_cnd.cn_namelen, tond->ni_cnd.cn_hash, update_flags);
         }
  out1:
+       /*
+        * There are some cases (e.g. 'fvp == tvp') when vn_authorize was
+        * skipped earlier as no actual rename was performed.
+        */
+       if (vn_authorize_skipped && error == 0) {
+               error = vn_authorize_renamex_with_paths(fdvp, fvp,
+                   &fromnd->ni_cnd, from_name, tdvp, tvp, &tond->ni_cnd, to_name, ctx,
+                   flags, NULL);
+               if (error && error == ENOENT) {
+                       if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
+                               do_retry = 1;
+                               retry_count += 1;
+                       }
+               }
+       }
         if (to_name != NULL) {
                 RELEASE_PATH(to_name);
                 to_name = NULL;
         }
+       if (to_name_no_firmlink != NULL) {
+               RELEASE_PATH(to_name_no_firmlink);
+               to_name_no_firmlink = NULL;
+       }
         if (from_name != NULL) {
                 RELEASE_PATH(from_name);
                 from_name = NULL;
         }
+       if (from_name_no_firmlink != NULL) {
+               RELEASE_PATH(from_name_no_firmlink);
+               from_name_no_firmlink = NULL;
+       }
         if (holding_mntlock) {
                 mount_unlock_renames(locked_mp);
                 mount_drop(locked_mp, 0);
@@ -8190,7 +8973,9 @@ out1:
                 }
                 vnode_put(fdvp);
         }
-
+       if (mnt_fvp != NULLVP) {
+               vnode_put(mnt_fvp);
+       }
         /*
          * If things changed after we did the namei, then we will re-drive
          * this rename call from the top.
@@ -8200,7 +8985,7 @@ out1:
                 goto retry;
         }
  
-       FREE(__rename_data, M_TEMP);
+       kheap_free(KHEAP_TEMP, __rename_data, sizeof(*__rename_data));
         return error;
  }
  
@@ -8420,16 +9205,19 @@ mkdirat(proc_t p, struct mkdirat_args *uap, __unused int32_t *retval)
  
  static int
  rmdirat_internal(vfs_context_t ctx, int fd, user_addr_t dirpath,
-    enum uio_seg segflg)
+    enum uio_seg segflg, int unlink_flags)
  {
         vnode_t vp, dvp;
         int error;
         struct nameidata nd;
         char     *path = NULL;
-       int       len = 0;
+       char     *no_firmlink_path = NULL;
+       int       len_path = 0;
+       int       len_no_firmlink_path = 0;
         int has_listeners = 0;
         int need_event = 0;
-       int truncated = 0;
+       int truncated_path = 0;
+       int truncated_no_firmlink_path = 0;
  #if CONFIG_FSE
         struct vnode_attr va;
  #endif /* CONFIG_FSE */
@@ -8499,7 +9287,6 @@ continue_lookup:
                                 error = vn_authorize_rmdir(dvp, vp, &nd.ni_cnd, ctx, NULL);
                                 if (error) {
                                         if (error == ENOENT) {
-                                               assert(restart_count < MAX_AUTHORIZE_ENOENT_RETRIES);
                                                 if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
                                                         restart_flag = 1;
                                                         restart_count += 1;
@@ -8517,7 +9304,7 @@ continue_lookup:
                 }
  
  #if CONFIG_FSE
-               fse_info  finfo;
+               fse_info  finfo = {0};
  
                 need_event = need_fsevent(FSE_DELETE, dvp);
                 if (need_event) {
@@ -8537,15 +9324,17 @@ continue_lookup:
                 if (need_event || has_listeners) {
                         if (path == NULL) {
                                 GET_PATH(path);
-                               if (path == NULL) {
-                                       error = ENOMEM;
-                                       goto out;
-                               }
                         }
  
-                       len = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated);
+                       len_path = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);
+
+                       if (no_firmlink_path == NULL) {
+                               GET_PATH(no_firmlink_path);
+                       }
+
+                       len_no_firmlink_path = safe_getpath_no_firmlink(dvp, nd.ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);
  #if CONFIG_FSE
-                       if (truncated) {
+                       if (truncated_no_firmlink_path) {
                                 finfo.mode |= FSE_TRUNCATED_PATH;
                         }
  #endif
@@ -8561,7 +9350,6 @@ continue_lookup:
                 if (error == EKEEPLOOKING) {
                         goto continue_lookup;
                 } else if (batched && error == ENOENT) {
-                       assert(restart_count < MAX_AUTHORIZE_ENOENT_RETRIES);
                         if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
                                 /*
                                  * For compound VNOPs, the authorization callback
@@ -8573,6 +9361,27 @@ continue_lookup:
                                 goto out;
                         }
                 }
+
+               /*
+                * XXX There's no provision for passing flags
+                * to VNOP_RMDIR().  So, if vn_rmdir() fails
+                * because it's not empty, then we try again
+                * with VNOP_REMOVE(), passing in a special
+                * flag that clever file systems will know
+                * how to handle.
+                */
+               if (error == ENOTEMPTY &&
+                   (unlink_flags & VNODE_REMOVE_DATALESS_DIR) != 0) {
+                       /*
+                        * If this fails, we want to keep the original
+                        * error.
+                        */
+                       if (vn_remove(dvp, &vp, &nd,
+                           VNODE_REMOVE_DATALESS_DIR, vap, ctx) == 0) {
+                               error = 0;
+                       }
+               }
+
  #if CONFIG_APPLEDOUBLE
                 /*
                  * Special case to remove orphaned AppleDouble
@@ -8581,8 +9390,9 @@ continue_lookup:
                  * so here we are.
                  */
                 if (error == ENOTEMPTY) {
-                       error = rmdir_remove_orphaned_appleDouble(vp, ctx, &restart_flag);
-                       if (error == EBUSY) {
+                       int ad_error = rmdir_remove_orphaned_appleDouble(vp, ctx, &restart_flag);
+                       if (ad_error == EBUSY) {
+                               error = ad_error;
                                 goto out;
                         }
  
@@ -8590,7 +9400,7 @@ continue_lookup:
                         /*
                          * Assuming everything went well, we will try the RMDIR again
                          */
-                       if (!error) {
+                       if (!ad_error) {
                                 error = vn_rmdir(dvp, &vp, &nd, vap, ctx);
                         }
                 }
@@ -8619,7 +9429,7 @@ continue_lookup:
                                         vnode_get_fse_info_from_vap(vp, &finfo, vap);
                                 }
                                 add_fsevent(FSE_DELETE, ctx,
-                                   FSE_ARG_STRING, len, path,
+                                   FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
                                     FSE_ARG_FINFO, &finfo,
                                     FSE_ARG_DONE);
                         }
@@ -8631,6 +9441,12 @@ out:
                         RELEASE_PATH(path);
                         path = NULL;
                 }
+
+               if (no_firmlink_path != NULL) {
+                       RELEASE_PATH(no_firmlink_path);
+                       no_firmlink_path = NULL;
+               }
+
                 /*
                  * nameidone has to happen before we vnode_put(dvp)
                  * since it may need to release the fs_nodelock on the dvp
@@ -8660,7 +9476,7 @@ int
  rmdir(__unused proc_t p, struct rmdir_args *uap, __unused int32_t *retval)
  {
         return rmdirat_internal(vfs_context_current(), AT_FDCWD,
-                  CAST_USER_ADDR_T(uap->path), UIO_USERSPACE);
+                  CAST_USER_ADDR_T(uap->path), UIO_USERSPACE, 0);
  }
  
  /* Get direntry length padded to 8 byte alignment */
@@ -8689,7 +9505,7 @@ vnode_readdir64(struct vnode *vp, struct uio *uio, int flags, int *eofflag,
                 uio_t auio;
                 struct direntry *entry64;
                 struct dirent *dep;
-               int bytesread;
+               size_t bytesread;
                 int error;
  
                 /*
@@ -8710,7 +9526,7 @@ vnode_readdir64(struct vnode *vp, struct uio *uio, int flags, int *eofflag,
                  * prevent uio_resid() * 3 / 8 from overflowing.
                  */
                 bufsize = 3 * MIN((user_size_t)uio_resid(uio), 87371u) / 8;
-               MALLOC(bufptr, void *, bufsize, M_TEMP, M_WAITOK);
+               bufptr = kheap_alloc(KHEAP_DATA_BUFFERS, bufsize, Z_WAITOK);
                 if (bufptr == NULL) {
                         return ENOMEM;
                 }
@@ -8724,15 +9540,16 @@ vnode_readdir64(struct vnode *vp, struct uio *uio, int flags, int *eofflag,
                 dep = (struct dirent *)bufptr;
                 bytesread = bufsize - uio_resid(auio);
  
-               MALLOC(entry64, struct direntry *, sizeof(struct direntry),
-                   M_TEMP, M_WAITOK);
+               entry64 = kheap_alloc(KHEAP_TEMP, sizeof(struct direntry), Z_WAITOK);
                 /*
                  * Convert all the entries and copy them out to user's buffer.
                  */
                 while (error == 0 && (char *)dep < ((char *)bufptr + bytesread)) {
-                       size_t  enbufsize = DIRENT64_LEN(dep->d_namlen);
-
-                       if (DIRENT_END(dep) > ((char *)bufptr + bytesread) ||
+                       /* First check that the dirent struct up to d_name is within the buffer */
+                       if ((char*)dep + offsetof(struct dirent, d_name) > ((char *)bufptr + bytesread) ||
+                           /* Check that the length of the entire dirent is within the buffer */
+                           DIRENT_END(dep) > ((char *)bufptr + bytesread) ||
+                           /* Check that the actual length including the name doesn't exceed d_reclen */
                             DIRENT_LEN(dep->d_namlen) > dep->d_reclen) {
                                 printf("%s: %s: Bad dirent recived from directory %s\n", __func__,
                                     vp->v_mount->mnt_vfsstat.f_mntonname,
@@ -8741,11 +9558,13 @@ vnode_readdir64(struct vnode *vp, struct uio *uio, int flags, int *eofflag,
                                 break;
                         }
  
+                       size_t  enbufsize = DIRENT64_LEN(dep->d_namlen);
+
                         bzero(entry64, enbufsize);
                         /* Convert a dirent to a dirent64. */
                         entry64->d_ino = dep->d_ino;
                         entry64->d_seekoff = 0;
-                       entry64->d_reclen = enbufsize;
+                       entry64->d_reclen = (uint16_t)enbufsize;
                         entry64->d_namlen = dep->d_namlen;
                         entry64->d_type = dep->d_type;
                         bcopy(dep->d_name, entry64->d_name, dep->d_namlen + 1);
@@ -8762,8 +9581,8 @@ vnode_readdir64(struct vnode *vp, struct uio *uio, int flags, int *eofflag,
                         uio->uio_offset = auio->uio_offset;
                 }
                 uio_free(auio);
-               FREE(bufptr, M_TEMP);
-               FREE(entry64, M_TEMP);
+               kheap_free(KHEAP_DATA_BUFFERS, bufptr, bufsize);
+               kheap_free(KHEAP_TEMP, entry64, sizeof(struct direntry));
                 return error;
         }
  }
@@ -8775,7 +9594,7 @@ vnode_readdir64(struct vnode *vp, struct uio *uio, int flags, int *eofflag,
   */
  static int
  getdirentries_common(int fd, user_addr_t bufp, user_size_t bufsize, ssize_t *bytesread,
-    off_t *offset, int flags)
+    off_t *offset, int *eofflag, int flags)
  {
         vnode_t vp;
         struct vfs_context context = *vfs_context_current();    /* local copy */
@@ -8783,14 +9602,14 @@ getdirentries_common(int fd, user_addr_t bufp, user_size_t bufsize, ssize_t *byt
         uio_t auio;
         int spacetype = proc_is64bit(vfs_context_proc(&context)) ? UIO_USERSPACE64 : UIO_USERSPACE32;
         off_t loff;
-       int error, eofflag, numdirent;
+       int error, numdirent;
         char uio_buf[UIO_SIZEOF(1)];
  
         error = fp_getfvp(vfs_context_proc(&context), fd, &fp, &vp);
         if (error) {
                 return error;
         }
-       if ((fp->f_fglob->fg_flag & FREAD) == 0) {
+       if ((fp->fp_glob->fg_flag & FREAD) == 0) {
                 AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
                 error = EBADF;
                 goto out;
@@ -8801,7 +9620,7 @@ getdirentries_common(int fd, user_addr_t bufp, user_size_t bufsize, ssize_t *byt
         }
  
  #if CONFIG_MACF
-       error = mac_file_check_change_offset(vfs_context_ucred(&context), fp->f_fglob);
+       error = mac_file_check_change_offset(vfs_context_ucred(&context), fp->fp_glob);
         if (error) {
                 goto out;
         }
@@ -8826,16 +9645,16 @@ unionread:
         }
  #endif /* MAC */
  
-       loff = fp->f_fglob->fg_offset;
+       loff = fp->fp_glob->fg_offset;
         auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
         uio_addiov(auio, bufp, bufsize);
  
         if (flags & VNODE_READDIR_EXTENDED) {
-               error = vnode_readdir64(vp, auio, flags, &eofflag, &numdirent, &context);
-               fp->f_fglob->fg_offset = uio_offset(auio);
+               error = vnode_readdir64(vp, auio, flags, eofflag, &numdirent, &context);
+               fp->fp_glob->fg_offset = uio_offset(auio);
         } else {
-               error = VNOP_READDIR(vp, auio, 0, &eofflag, &numdirent, &context);
-               fp->f_fglob->fg_offset = uio_offset(auio);
+               error = VNOP_READDIR(vp, auio, 0, eofflag, &numdirent, &context);
+               fp->fp_glob->fg_offset = uio_offset(auio);
         }
         if (error) {
                 (void)vnode_put(vp);
@@ -8843,23 +9662,12 @@ unionread:
         }
  
         if ((user_ssize_t)bufsize == uio_resid(auio)) {
-               if (union_dircheckp) {
-                       error = union_dircheckp(&vp, fp, &context);
-                       if (error == -1) {
-                               goto unionread;
-                       }
-                       if (error) {
-                               (void)vnode_put(vp);
-                               goto out;
-                       }
-               }
-
                 if ((vp->v_mount->mnt_flag & MNT_UNION)) {
                         struct vnode *tvp = vp;
                         if (lookup_traverse_union(tvp, &vp, &context) == 0) {
                                 vnode_ref(vp);
-                               fp->f_fglob->fg_data = (caddr_t) vp;
-                               fp->f_fglob->fg_offset = 0;
+                               fp->fp_glob->fg_data = (caddr_t) vp;
+                               fp->fp_glob->fg_offset = 0;
                                 vnode_rele(tvp);
                                 vnode_put(tvp);
                                 goto unionread;
@@ -8885,10 +9693,11 @@ getdirentries(__unused struct proc *p, struct getdirentries_args *uap, int32_t *
  {
         off_t offset;
         ssize_t bytesread;
-       int error;
+       int error, eofflag;
  
         AUDIT_ARG(fd, uap->fd);
-       error = getdirentries_common(uap->fd, uap->buf, uap->count, &bytesread, &offset, 0);
+       error = getdirentries_common(uap->fd, uap->buf, uap->count,
+           &bytesread, &offset, &eofflag, 0);
  
         if (error == 0) {
                 if (proc_is64bit(p)) {
@@ -8898,7 +9707,7 @@ getdirentries(__unused struct proc *p, struct getdirentries_args *uap, int32_t *
                         user32_long_t base = (user32_long_t)offset;
                         error = copyout((caddr_t)&base, uap->basep, sizeof(user32_long_t));
                 }
-               *retval = bytesread;
+               *retval = (int)bytesread;
         }
         return error;
  }
@@ -8908,14 +9717,37 @@ getdirentries64(__unused struct proc *p, struct getdirentries64_args *uap, user_
  {
         off_t offset;
         ssize_t bytesread;
-       int error;
+       int error, eofflag;
+       user_size_t bufsize;
  
         AUDIT_ARG(fd, uap->fd);
-       error = getdirentries_common(uap->fd, uap->buf, uap->bufsize, &bytesread, &offset, VNODE_READDIR_EXTENDED);
+
+       /*
+        * If the buffer is at least GETDIRENTRIES64_EXTENDED_BUFSIZE large,
+        * then the kernel carves out the last 4 bytes to return extended
+        * information to userspace (namely whether we reached EOF with this call).
+        */
+       if (uap->bufsize >= GETDIRENTRIES64_EXTENDED_BUFSIZE) {
+               bufsize = uap->bufsize - sizeof(getdirentries64_flags_t);
+       } else {
+               bufsize = uap->bufsize;
+       }
+
+       error = getdirentries_common(uap->fd, uap->buf, bufsize,
+           &bytesread, &offset, &eofflag, VNODE_READDIR_EXTENDED);
  
         if (error == 0) {
                 *retval = bytesread;
                 error = copyout((caddr_t)&offset, uap->position, sizeof(off_t));
+
+               if (error == 0 && uap->bufsize >= GETDIRENTRIES64_EXTENDED_BUFSIZE) {
+                       getdirentries64_flags_t flags = 0;
+                       if (eofflag) {
+                               flags |= GETDIRENTRIES64_EOF;
+                       }
+                       error = copyout(&flags, (user_addr_t)uap->buf + bufsize,
+                           sizeof(flags));
+               }
         }
         return error;
  }
@@ -9064,7 +9896,7 @@ getdirentriesattr(proc_t p, struct getdirentriesattr_args *uap, int32_t *retval)
         uint32_t count = 0, savecount = 0;
         uint32_t newstate = 0;
         int error, eofflag;
-       uint32_t loff = 0;
+       off_t loff = 0;
         struct attrlist attributelist;
         vfs_context_t ctx = vfs_context_current();
         int fd = uap->fd;
@@ -9084,7 +9916,7 @@ getdirentriesattr(proc_t p, struct getdirentriesattr_args *uap, int32_t *retval)
         if ((error = fp_getfvp(p, fd, &fp, &vp))) {
                 return error;
         }
-       if ((fp->f_fglob->fg_flag & FREAD) == 0) {
+       if ((fp->fp_glob->fg_flag & FREAD) == 0) {
                 AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
                 error = EBADF;
                 goto out;
@@ -9093,7 +9925,7 @@ getdirentriesattr(proc_t p, struct getdirentriesattr_args *uap, int32_t *retval)
  
  #if CONFIG_MACF
         error = mac_file_check_change_offset(vfs_context_ucred(ctx),
-           fp->f_fglob);
+           fp->fp_glob);
         if (error) {
                 goto out;
         }
@@ -9122,7 +9954,7 @@ unionread:
  #endif /* MAC */
  
         /* set up the uio structure which will contain the users return buffer */
-       loff = fp->f_fglob->fg_offset;
+       loff = fp->fp_glob->fg_offset;
         auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
         uio_addiov(auio, uap->buffer, uap->buffersize);
  
@@ -9142,7 +9974,7 @@ unionread:
                  * info, so truncate before extending again */
  
                 error = VNOP_READDIRATTR(vp, &attributelist, auio, count,
-                   (u_long)(uint32_t)uap->options, &newstate, &eofflag, &count, ctx);
+                   (uint32_t)uap->options, &newstate, &eofflag, &count, ctx);
         }
  
         if (error) {
@@ -9163,11 +9995,11 @@ unionread:
                 } else {                                                // Empty buffer
                         struct vnode *tvp = vp;
                         if (lookup_traverse_union(tvp, &vp, ctx) == 0) {
-                               vnode_ref_ext(vp, fp->f_fglob->fg_flag & O_EVTONLY, 0);
-                               fp->f_fglob->fg_data = (caddr_t) vp;
-                               fp->f_fglob->fg_offset = 0; // reset index for new dir
+                               vnode_ref_ext(vp, fp->fp_glob->fg_flag & O_EVTONLY, 0);
+                               fp->fp_glob->fg_data = (caddr_t) vp;
+                               fp->fp_glob->fg_offset = 0; // reset index for new dir
                                 count = savecount;
-                               vnode_rele_internal(tvp, fp->f_fglob->fg_flag & O_EVTONLY, 0, 0);
+                               vnode_rele_internal(tvp, fp->fp_glob->fg_flag & O_EVTONLY, 0, 0);
                                 vnode_put(tvp);
                                 goto unionread;
                         }
@@ -9180,7 +10012,7 @@ unionread:
         if (error) {
                 goto out;
         }
-       fp->f_fglob->fg_offset = uio_offset(auio); /* should be multiple of dirent, not variable */
+       fp->fp_glob->fg_offset = uio_offset(auio); /* should be multiple of dirent, not variable */
  
         if ((error = copyout((caddr_t) &count, uap->count, sizeof(count)))) {
                 goto out;
@@ -9289,10 +10121,6 @@ exchangedata(__unused proc_t p, struct exchangedata_args *uap, __unused int32_t
                 kauth_authorize_fileop_has_listeners()) {
                 GET_PATH(fpath);
                 GET_PATH(spath);
-               if (fpath == NULL || spath == NULL) {
-                       error = ENOMEM;
-                       goto out;
-               }
  
                 flen = safe_getpath(fvp, NULL, fpath, MAXPATHLEN, &from_truncated);
                 slen = safe_getpath(svp, NULL, spath, MAXPATHLEN, &to_truncated);
@@ -9368,8 +10196,8 @@ uint32_t
  freespace_mb(vnode_t vp)
  {
         vfs_update_vfsstat(vp->v_mount, vfs_context_current(), VFS_USER_EVENT);
-       return ((uint64_t)vp->v_mount->mnt_vfsstat.f_bavail *
-              vp->v_mount->mnt_vfsstat.f_bsize) >> 20;
+       return (uint32_t)(((uint64_t)vp->v_mount->mnt_vfsstat.f_bavail *
+              vp->v_mount->mnt_vfsstat.f_bsize) >> 20);
  }
  
  #if CONFIG_SEARCHFS
@@ -9391,7 +10219,7 @@ searchfs(proc_t p, struct searchfs_args *uap, __unused int32_t *retval)
         uio_t auio = NULL;
         int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
         uint32_t nummatches;
-       int mallocsize;
+       size_t mallocsize;
         uint32_t nameiflags;
         vfs_context_t ctx = vfs_context_current();
         char uio_buf[UIO_SIZEOF(1)];
@@ -9445,7 +10273,7 @@ searchfs(proc_t p, struct searchfs_args *uap, __unused int32_t *retval)
         mallocsize = searchblock.sizeofsearchparams1 + searchblock.sizeofsearchparams2 +
             sizeof(struct attrlist) + sizeof(struct searchstate) + (2 * sizeof(uint32_t));
  
-       MALLOC(searchparams1, void *, mallocsize, M_TEMP, M_WAITOK);
+       searchparams1 = kheap_alloc(KHEAP_DATA_BUFFERS, mallocsize, Z_WAITOK);
  
         /* Now set up the various pointers to the correct place in our newly allocated memory */
  
@@ -9611,12 +10439,12 @@ searchfs(proc_t p, struct searchfs_args *uap, __unused int32_t *retval)
             searchparams1,
             searchparams2,
             &searchblock.searchattrs,
-           (u_long)searchblock.maxmatches,
+           (uint32_t)searchblock.maxmatches,
             &timelimit,
             returnattrs,
             &nummatches,
-           (u_long)uap->scriptcode,
-           (u_long)uap->options,
+           (uint32_t)uap->scriptcode,
+           (uint32_t)uap->options,
             auio,
             (struct searchstate *) &state->ss_fsstate,
             ctx);
@@ -9650,7 +10478,7 @@ saveandexit:
  
  freeandexit:
  
-       FREE(searchparams1, M_TEMP);
+       kheap_free(KHEAP_DATA_BUFFERS, searchparams1, mallocsize);
  
         return error;
  } /* end of searchfs system call */
@@ -9666,822 +10494,810 @@ searchfs(__unused proc_t p, __unused struct searchfs_args *uap, __unused int32_t
  #endif /* CONFIG_SEARCHFS */
  
  
-lck_grp_attr_t *  nspace_group_attr;
-lck_attr_t *      nspace_lock_attr;
-lck_grp_t *       nspace_mutex_group;
+#if CONFIG_DATALESS_FILES
  
-lck_mtx_t         nspace_handler_lock;
-lck_mtx_t         nspace_handler_exclusion_lock;
+/*
+ * === Namespace Resolver Up-call Mechanism ===
+ *
+ * When I/O is performed to a dataless file or directory (read, write,
+ * lookup-in, etc.), the file system performs an upcall to the namespace
+ * resolver (filecoordinationd) to materialize the object.
+ *
+ * We need multiple up-calls to be in flight at once, and we need these
+ * up-calls to be interruptible, thus the following implementation:
+ *
+ * => The nspace_resolver_request represents the in-kernel request state.
+ *    It contains a request ID, storage space for the errno code returned
+ *    by filecoordinationd, and flags.
+ *
+ * => The request ID is simply a global monotonically incrementing 32-bit
+ *    number.  Outstanding requests are stored in a hash table, and the
+ *    hash function is extremely simple.
+ *
+ * => When an upcall is to be made to filecoordinationd, a request structure
+ *    is allocated on the stack (it is small, and needs to live only during
+ *    the duration of the call to resolve_nspace_item_ext()).  It is
+ *    initialized and inserted into the table.  Some backpressure from
+ *    filecoordinationd is applied by limiting the number of entries that
+ *    can be inserted into the table (and thus limiting the number of
+ *    outstanding requests issued to filecoordinationd); waiting for an
+ *    available slot is interruptible.
+ *
+ * => Once the request has been inserted into the table, the up-call is made
+ *    to filecoordinationd via a MiG-generated stub.  The up-call returns
+ *    immediately and filecoordinationd processes the request asynchronously.
+ *
+ * => The caller now waits for the request to complete.  This is achieved by
+ *    sleeping on the address of the request structure and waiting for
+ *    filecoordinationd to mark the request structure as complete.  This
+ *    is an interruptible sleep call; if interrupted, the request structure
+ *    is removed from the table and EINTR is returned to the caller.  If
+ *    this occurs, an advisory up-call is made to filecoordinationd with
+ *    the request ID to indicate that the request can be aborted or
+ *    de-prioritized at the discretion of filecoordinationd.
+ *
+ * => When filecoordinationd has completed the request, it signals completion
+ *    by writing to the vfs.nspace.complete sysctl node.  Only a process
+ *    decorated as a namespace resolver can write to this sysctl node.  The
+ *    value is a request ID / errno tuple passed as an array of 2 uint32_t's.
+ *    The request ID is looked up in the table, and if the request is found,
+ *    the error code is stored in the request structure and a wakeup()
+ *    issued on the address of the request structure.  If the request is not
+ *    found, we simply drop the completion notification, assuming that the
+ *    caller was interrupted.
+ *
+ * => When the waiting thread wakes up, it extracts the error code from the
+ *    request structure, removes the request from the table, and returns the
+ *    error code to the calling function.  Fini!
+ */
  
-time_t snapshot_timestamp = 0;
-int nspace_allow_virtual_devs = 0;
+struct nspace_resolver_request {
+       LIST_ENTRY(nspace_resolver_request) r_hashlink;
+       vnode_t         r_vp;
+       uint32_t        r_req_id;
+       int             r_resolver_error;
+       int             r_flags;
+};
  
-void nspace_handler_init(void);
+#define RRF_COMPLETE    0x0001
  
-typedef struct nspace_item_info {
-       struct vnode *vp;
-       void         *arg;
-       uint64_t      op;
-       uint32_t      vid;
-       uint32_t      flags;
-       uint32_t      token;
-       uint32_t      refcount;
-} nspace_item_info;
-
-#define MAX_NSPACE_ITEMS   128
-nspace_item_info nspace_items[MAX_NSPACE_ITEMS];
-uint32_t      nspace_item_idx = 0;              // also used as the sleep/wakeup rendezvous address
-uint32_t      nspace_token_id = 0;
-uint32_t      nspace_handler_timeout = 15;    // seconds
-
-#define NSPACE_ITEM_NEW         0x0001
-#define NSPACE_ITEM_PROCESSING  0x0002
-#define NSPACE_ITEM_DEAD        0x0004
-#define NSPACE_ITEM_CANCELLED   0x0008
-#define NSPACE_ITEM_DONE        0x0010
-#define NSPACE_ITEM_RESET_TIMER 0x0020
-
-#define NSPACE_ITEM_NSPACE_EVENT   0x0040
-#define NSPACE_ITEM_SNAPSHOT_EVENT 0x0080
-
-#define NSPACE_ITEM_ALL_EVENT_TYPES (NSPACE_ITEM_NSPACE_EVENT | NSPACE_ITEM_SNAPSHOT_EVENT)
-
-//#pragma optimization_level 0
+static uint32_t
+next_nspace_req_id(void)
+{
+       static uint32_t next_req_id;
  
-typedef enum {
-       NSPACE_HANDLER_NSPACE = 0,
-       NSPACE_HANDLER_SNAPSHOT = 1,
-
-       NSPACE_HANDLER_COUNT,
-} nspace_type_t;
-
-typedef struct {
-       uint64_t handler_tid;
-       struct proc *handler_proc;
-       int handler_busy;
-} nspace_handler_t;
-
-nspace_handler_t nspace_handlers[NSPACE_HANDLER_COUNT];
-
-/* namespace fsctl functions */
-static int nspace_flags_matches_handler(uint32_t event_flags, nspace_type_t nspace_type);
-static int nspace_item_flags_for_type(nspace_type_t nspace_type);
-static int nspace_open_flags_for_type(nspace_type_t nspace_type);
-static nspace_type_t nspace_type_for_op(uint64_t op);
-static int nspace_is_special_process(struct proc *proc);
-static int vn_open_with_vp(vnode_t vp, int fmode, vfs_context_t ctx);
-static int wait_for_namespace_event(namespace_handler_data *nhd, nspace_type_t nspace_type);
-static int validate_namespace_args(int is64bit, int size);
-static int process_namespace_fsctl(nspace_type_t nspace_type, int is64bit, u_int size, caddr_t data);
-
-
-static inline int
-nspace_flags_matches_handler(uint32_t event_flags, nspace_type_t nspace_type)
-{
-       switch (nspace_type) {
-       case NSPACE_HANDLER_NSPACE:
-               return (event_flags & NSPACE_ITEM_ALL_EVENT_TYPES) == NSPACE_ITEM_NSPACE_EVENT;
-       case NSPACE_HANDLER_SNAPSHOT:
-               return (event_flags & NSPACE_ITEM_ALL_EVENT_TYPES) == NSPACE_ITEM_SNAPSHOT_EVENT;
-       default:
-               printf("nspace_flags_matches_handler: invalid type %u\n", (int)nspace_type);
-               return 0;
-       }
+       return OSAddAtomic(1, &next_req_id);
  }
  
-static inline int
-nspace_item_flags_for_type(nspace_type_t nspace_type)
-{
-       switch (nspace_type) {
-       case NSPACE_HANDLER_NSPACE:
-               return NSPACE_ITEM_NSPACE_EVENT;
-       case NSPACE_HANDLER_SNAPSHOT:
-               return NSPACE_ITEM_SNAPSHOT_EVENT;
-       default:
-               printf("nspace_item_flags_for_type: invalid type %u\n", (int)nspace_type);
-               return 0;
-       }
-}
+#define NSPACE_RESOLVER_REQ_HASHSIZE    32      /* XXX tune */
+#define NSPACE_RESOLVER_MAX_OUTSTANDING 256     /* XXX tune */
  
-static inline int
-nspace_open_flags_for_type(nspace_type_t nspace_type)
-{
-       switch (nspace_type) {
-       case NSPACE_HANDLER_NSPACE:
-               return FREAD | FWRITE | O_EVTONLY;
-       case NSPACE_HANDLER_SNAPSHOT:
-               return FREAD | O_EVTONLY;
-       default:
-               printf("nspace_open_flags_for_type: invalid type %u\n", (int)nspace_type);
-               return 0;
-       }
-}
+static LIST_HEAD(nspace_resolver_requesthead,
+    nspace_resolver_request) * nspace_resolver_request_hashtbl;
+static u_long nspace_resolver_request_hashmask;
+static u_int nspace_resolver_request_count;
+static bool nspace_resolver_request_wait_slot;
+static lck_grp_t *nspace_resolver_request_lck_grp;
+static lck_mtx_t nspace_resolver_request_hash_mutex;
+
+#define NSPACE_REQ_LOCK() \
+       lck_mtx_lock(&nspace_resolver_request_hash_mutex)
+#define NSPACE_REQ_UNLOCK() \
+       lck_mtx_unlock(&nspace_resolver_request_hash_mutex)
  
-static inline nspace_type_t
-nspace_type_for_op(uint64_t op)
+#define NSPACE_RESOLVER_HASH(req_id)    \
+       (&nspace_resolver_request_hashtbl[(req_id) & \
+        nspace_resolver_request_hashmask])
+
+static struct nspace_resolver_request *
+nspace_resolver_req_lookup(uint32_t req_id)
  {
-       switch (op & NAMESPACE_HANDLER_EVENT_TYPE_MASK) {
-       case NAMESPACE_HANDLER_NSPACE_EVENT:
-               return NSPACE_HANDLER_NSPACE;
-       case NAMESPACE_HANDLER_SNAPSHOT_EVENT:
-               return NSPACE_HANDLER_SNAPSHOT;
-       default:
-               printf("nspace_type_for_op: invalid op mask %llx\n", op & NAMESPACE_HANDLER_EVENT_TYPE_MASK);
-               return NSPACE_HANDLER_NSPACE;
+       struct nspace_resolver_requesthead *bucket;
+       struct nspace_resolver_request *req;
+
+       bucket = NSPACE_RESOLVER_HASH(req_id);
+       LIST_FOREACH(req, bucket, r_hashlink) {
+               if (req->r_req_id == req_id) {
+                       return req;
+               }
         }
+
+       return NULL;
  }
  
-static inline int
-nspace_is_special_process(struct proc *proc)
+static int
+nspace_resolver_req_add(struct nspace_resolver_request *req)
  {
-       int i;
-       for (i = 0; i < NSPACE_HANDLER_COUNT; i++) {
-               if (proc == nspace_handlers[i].handler_proc) {
-                       return 1;
+       struct nspace_resolver_requesthead *bucket;
+       int error;
+
+       while (nspace_resolver_request_count >=
+           NSPACE_RESOLVER_MAX_OUTSTANDING) {
+               nspace_resolver_request_wait_slot = true;
+               error = msleep(&nspace_resolver_request_count,
+                   &nspace_resolver_request_hash_mutex,
+                   PVFS | PCATCH, "nspacerq", NULL);
+               if (error) {
+                       return error;
                 }
         }
+
+       bucket = NSPACE_RESOLVER_HASH(req->r_req_id);
+#if DIAGNOSTIC
+       assert(nspace_resolver_req_lookup(req->r_req_id) == NULL);
+#endif /* DIAGNOSTIC */
+       LIST_INSERT_HEAD(bucket, req, r_hashlink);
+       nspace_resolver_request_count++;
+
         return 0;
  }
  
-void
-nspace_handler_init(void)
+static void
+nspace_resolver_req_remove(struct nspace_resolver_request *req)
  {
-       nspace_lock_attr    = lck_attr_alloc_init();
-       nspace_group_attr   = lck_grp_attr_alloc_init();
-       nspace_mutex_group  = lck_grp_alloc_init("nspace-mutex", nspace_group_attr);
-       lck_mtx_init(&nspace_handler_lock, nspace_mutex_group, nspace_lock_attr);
-       lck_mtx_init(&nspace_handler_exclusion_lock, nspace_mutex_group, nspace_lock_attr);
-       memset(&nspace_items[0], 0, sizeof(nspace_items));
+       struct nspace_resolver_requesthead *bucket;
+
+       bucket = NSPACE_RESOLVER_HASH(req->r_req_id);
+#if DIAGNOSTIC
+       assert(nspace_resolver_req_lookup(req->r_req_id) != NULL);
+#endif /* DIAGNOSTIC */
+       LIST_REMOVE(req, r_hashlink);
+       nspace_resolver_request_count--;
+
+       if (nspace_resolver_request_wait_slot) {
+               nspace_resolver_request_wait_slot = false;
+               wakeup(&nspace_resolver_request_count);
+       }
  }
  
-void
-nspace_proc_exit(struct proc *p)
+static void
+nspace_resolver_req_cancel(uint32_t req_id)
  {
-       int i, event_mask = 0;
+       kern_return_t kr;
+       mach_port_t mp;
  
-       for (i = 0; i < NSPACE_HANDLER_COUNT; i++) {
-               if (p == nspace_handlers[i].handler_proc) {
-                       event_mask |= nspace_item_flags_for_type(i);
-                       nspace_handlers[i].handler_tid = 0;
-                       nspace_handlers[i].handler_proc = NULL;
-               }
-       }
+       // Failures here aren't fatal -- the cancellation message
+       // sent to the resolver is merely advisory.
  
-       if (event_mask == 0) {
+       kr = host_get_filecoordinationd_port(host_priv_self(), &mp);
+       if (kr != KERN_SUCCESS || !IPC_PORT_VALID(mp)) {
                 return;
         }
  
-       lck_mtx_lock(&nspace_handler_lock);
-       if (event_mask & NSPACE_ITEM_SNAPSHOT_EVENT) {
-               // if this process was the snapshot handler, zero snapshot_timeout
-               snapshot_timestamp = 0;
-       }
-
-       //
-       // unblock anyone that's waiting for the handler that died
-       //
-       for (i = 0; i < MAX_NSPACE_ITEMS; i++) {
-               if (nspace_items[i].flags & (NSPACE_ITEM_NEW | NSPACE_ITEM_PROCESSING)) {
-                       if (nspace_items[i].flags & event_mask) {
-                               if (nspace_items[i].vp && (nspace_items[i].vp->v_flag & VNEEDSSNAPSHOT)) {
-                                       vnode_lock_spin(nspace_items[i].vp);
-                                       nspace_items[i].vp->v_flag &= ~VNEEDSSNAPSHOT;
-                                       vnode_unlock(nspace_items[i].vp);
-                               }
-                               nspace_items[i].vp = NULL;
-                               nspace_items[i].vid = 0;
-                               nspace_items[i].flags = NSPACE_ITEM_DONE;
-                               nspace_items[i].token = 0;
-
-                               wakeup((caddr_t)&(nspace_items[i].vp));
-                       }
-               }
+       kr = send_nspace_resolve_cancel(mp, req_id);
+       if (kr != KERN_SUCCESS) {
+               os_log_error(OS_LOG_DEFAULT,
+                   "NSPACE send_nspace_resolve_cancel failure: %d", kr);
         }
  
-       wakeup((caddr_t)&nspace_item_idx);
-       lck_mtx_unlock(&nspace_handler_lock);
+       ipc_port_release_send(mp);
  }
  
-
-int
-resolve_nspace_item(struct vnode *vp, uint64_t op)
+static int
+nspace_resolver_req_wait(struct nspace_resolver_request *req)
  {
-       return resolve_nspace_item_ext(vp, op, NULL);
-}
+       bool send_cancel_message = false;
+       int error;
  
-int
-resolve_nspace_item_ext(struct vnode *vp, uint64_t op, void *arg)
-{
-       int i, error, keep_waiting;
-       struct timespec ts;
-       nspace_type_t nspace_type = nspace_type_for_op(op);
+       NSPACE_REQ_LOCK();
  
-       // only allow namespace events on regular files, directories and symlinks.
-       if (vp->v_type != VREG && vp->v_type != VDIR && vp->v_type != VLNK) {
-               return 0;
+       while ((req->r_flags & RRF_COMPLETE) == 0) {
+               error = msleep(req, &nspace_resolver_request_hash_mutex,
+                   PVFS | PCATCH, "nspace", NULL);
+               if (error && error != ERESTART) {
+                       req->r_resolver_error = (error == EINTR) ? EINTR :
+                           ETIMEDOUT;
+                       send_cancel_message = true;
+                       break;
+               }
         }
  
-       //
-       // if this is a snapshot event and the vnode is on a
-       // disk image just pretend nothing happened since any
-       // change to the disk image will cause the disk image
-       // itself to get backed up and this avoids multi-way
-       // deadlocks between the snapshot handler and the ever
-       // popular diskimages-helper process.  the variable
-       // nspace_allow_virtual_devs allows this behavior to
-       // be overridden (for use by the Mobile TimeMachine
-       // testing infrastructure which uses disk images)
-       //
-       if ((op & NAMESPACE_HANDLER_SNAPSHOT_EVENT)
-           && (vp->v_mount != NULL)
-           && (vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV)
-           && !nspace_allow_virtual_devs) {
-               return 0;
-       }
+       nspace_resolver_req_remove(req);
  
-       // if (thread_tid(current_thread()) == namespace_handler_tid) {
-       if (nspace_handlers[nspace_type].handler_proc == NULL) {
-               return 0;
-       }
+       NSPACE_REQ_UNLOCK();
  
-       if (nspace_is_special_process(current_proc())) {
-               return EDEADLK;
+       if (send_cancel_message) {
+               nspace_resolver_req_cancel(req->r_req_id);
         }
  
-       lck_mtx_lock(&nspace_handler_lock);
+       return req->r_resolver_error;
+}
  
-retry:
-       for (i = 0; i < MAX_NSPACE_ITEMS; i++) {
-               if (vp == nspace_items[i].vp && op == nspace_items[i].op) {
-                       break;
-               }
-       }
+static void
+nspace_resolver_req_mark_complete(
+       struct nspace_resolver_request *req,
+       int resolver_error)
+{
+       req->r_resolver_error = resolver_error;
+       req->r_flags |= RRF_COMPLETE;
+       wakeup(req);
+}
  
-       if (i >= MAX_NSPACE_ITEMS) {
-               for (i = 0; i < MAX_NSPACE_ITEMS; i++) {
-                       if (nspace_items[i].flags == 0) {
-                               break;
+static void
+nspace_resolver_req_completed(uint32_t req_id, int resolver_error, uint64_t orig_gencount)
+{
+       struct nspace_resolver_request *req;
+
+       NSPACE_REQ_LOCK();
+
+       // If we don't find the request corresponding to our req_id,
+       // just drop the completion signal on the floor; it's likely
+       // that the requester interrupted with a signal.
+
+       req = nspace_resolver_req_lookup(req_id);
+       if (req) {
+               mount_t locked_mp = NULL;
+
+               locked_mp = req->r_vp->v_mount;
+               mount_ref(locked_mp, 0);
+               mount_lock_renames(locked_mp);
+
+               //
+               // if the resolver isn't already returning an error and we have an
+               // orig_gencount, then get an iocount on the request vnode and check
+               // that the gencount on req->r_vp has not changed.
+               //
+               // note: a ref was taken on req->r_vp when the request was created
+               // and that ref will be dropped by that thread when it wakes up.
+               //
+               if (resolver_error == 0 &&
+                   orig_gencount != 0 &&
+                   vnode_getwithref(req->r_vp) == 0) {
+                       struct vnode_attr va;
+                       uint64_t cur_gencount;
+
+                       VATTR_INIT(&va);
+                       VATTR_WANTED(&va, va_recursive_gencount);
+
+                       if (vnode_getattr(req->r_vp, &va, vfs_context_kernel()) == 0) {
+                               cur_gencount = va.va_recursive_gencount;
+                       } else {
+                               cur_gencount = 0;
                         }
-               }
-       } else {
-               nspace_items[i].refcount++;
-       }
  
-       if (i >= MAX_NSPACE_ITEMS) {
-               ts.tv_sec = nspace_handler_timeout;
-               ts.tv_nsec = 0;
+                       if (resolver_error == 0 && cur_gencount && orig_gencount && cur_gencount != orig_gencount) {
+                               printf("nspace.complete: gencount changed! (orig %llu cur %llu)\n", orig_gencount, cur_gencount);
  
-               error = msleep((caddr_t)&nspace_token_id, &nspace_handler_lock, PVFS | PCATCH, "nspace-no-space", &ts);
-               if (error == 0) {
-                       // an entry got free'd up, go see if we can get a slot
-                       goto retry;
-               } else {
-                       lck_mtx_unlock(&nspace_handler_lock);
-                       return error;
-               }
-       }
+                               // this error will be returned to the thread that initiated the
+                               // materialization of req->r_vp.
+                               resolver_error = EBUSY;
  
-       //
-       // if it didn't already exist, add it.  if it did exist
-       // we'll get woken up when someone does a wakeup() on
-       // the slot in the nspace_items table.
-       //
-       if (vp != nspace_items[i].vp) {
-               nspace_items[i].vp = vp;
-               nspace_items[i].arg = (arg == NSPACE_REARM_NO_ARG) ? NULL : arg;  // arg is {NULL, true, uio *} - only pass uio thru to the user
-               nspace_items[i].op = op;
-               nspace_items[i].vid = vnode_vid(vp);
-               nspace_items[i].flags = NSPACE_ITEM_NEW;
-               nspace_items[i].flags |= nspace_item_flags_for_type(nspace_type);
-               if (nspace_items[i].flags & NSPACE_ITEM_SNAPSHOT_EVENT) {
-                       if (arg) {
-                               vnode_lock_spin(vp);
-                               vp->v_flag |= VNEEDSSNAPSHOT;
-                               vnode_unlock(vp);
+                               // note: we explicitly do not return an error to the caller (i.e.
+                               // the thread that did the materialization) because they said they
+                               // don't want one.
                         }
+
+                       vnode_put(req->r_vp);
                 }
  
-               nspace_items[i].token = 0;
-               nspace_items[i].refcount = 1;
+               mount_unlock_renames(locked_mp);
+               mount_drop(locked_mp, 0);
  
-               wakeup((caddr_t)&nspace_item_idx);
+               nspace_resolver_req_mark_complete(req, resolver_error);
         }
  
-       //
-       // Now go to sleep until the handler does a wakeup on this
-       // slot in the nspace_items table (or we timeout).
-       //
-       keep_waiting = 1;
-       while (keep_waiting) {
-               ts.tv_sec = nspace_handler_timeout;
-               ts.tv_nsec = 0;
-               error = msleep((caddr_t)&(nspace_items[i].vp), &nspace_handler_lock, PVFS | PCATCH, "namespace-done", &ts);
-
-               if (nspace_items[i].flags & NSPACE_ITEM_DONE) {
-                       error = 0;
-               } else if (nspace_items[i].flags & NSPACE_ITEM_CANCELLED) {
-                       error = nspace_items[i].token;
-               } else if (error == EWOULDBLOCK || error == ETIMEDOUT) {
-                       if (nspace_items[i].flags & NSPACE_ITEM_RESET_TIMER) {
-                               nspace_items[i].flags &= ~NSPACE_ITEM_RESET_TIMER;
-                               continue;
-                       } else {
-                               error = ETIMEDOUT;
-                       }
-               } else if (error == 0) {
-                       // hmmm, why did we get woken up?
-                       printf("woken up for token %d but it's not done, cancelled or timedout and error == 0.\n",
-                           nspace_items[i].token);
-               }
+       NSPACE_REQ_UNLOCK();
  
-               if (--nspace_items[i].refcount == 0) {
-                       nspace_items[i].vp = NULL;     // clear this so that no one will match on it again
-                       nspace_items[i].arg = NULL;
-                       nspace_items[i].token = 0;     // clear this so that the handler will not find it anymore
-                       nspace_items[i].flags = 0;     // this clears it for re-use
-               }
-               wakeup(&nspace_token_id);
-               keep_waiting = 0;
-       }
+       return;
+}
  
-       lck_mtx_unlock(&nspace_handler_lock);
+static struct proc *nspace_resolver_proc;
  
-       return error;
+static int
+nspace_resolver_get_proc_state(struct proc *p, int *is_resolver)
+{
+       *is_resolver = ((p->p_lflag & P_LNSPACE_RESOLVER) &&
+           p == nspace_resolver_proc) ? 1 : 0;
+       return 0;
  }
  
-int
-nspace_snapshot_event(vnode_t vp, time_t ctime, uint64_t op_type, void *arg)
+static int
+nspace_resolver_set_proc_state(struct proc *p, int is_resolver)
  {
-       int snapshot_error = 0;
+       vfs_context_t ctx = vfs_context_current();
+       int error = 0;
  
-       if (vp == NULL) {
-               return 0;
+       //
+       // The system filecoordinationd runs as uid == 0.  This also
+       // has the nice side-effect of filtering out filecoordinationd
+       // running in the simulator.
+       //
+       if (!vfs_context_issuser(ctx)) {
+               return EPERM;
         }
  
-       /* Swap files are special; skip them */
-       if (vnode_isswap(vp)) {
-               return 0;
+       error = priv_check_cred(vfs_context_ucred(ctx),
+           PRIV_VFS_DATALESS_RESOLVER, 0);
+       if (error) {
+               return error;
         }
  
-       if (ctime != 0 && snapshot_timestamp != 0 && (ctime <= snapshot_timestamp || vnode_needssnapshots(vp))) {
-               // the change time is within this epoch
-               int error;
+       if (is_resolver) {
+               NSPACE_REQ_LOCK();
  
-               error = resolve_nspace_item_ext(vp, op_type | NAMESPACE_HANDLER_SNAPSHOT_EVENT, arg);
-               if (error == EDEADLK) {
-                       snapshot_error = 0;
-               } else if (error) {
-                       if (error == EAGAIN) {
-                               printf("nspace_snapshot_event: timed out waiting for namespace handler...\n");
-                       } else if (error == EINTR) {
-                               // printf("nspace_snapshot_event: got a signal while waiting for namespace handler...\n");
-                               snapshot_error = EINTR;
-                       }
+               if (nspace_resolver_proc == NULL) {
+                       proc_lock(p);
+                       p->p_lflag |= P_LNSPACE_RESOLVER;
+                       proc_unlock(p);
+                       nspace_resolver_proc = p;
+               } else {
+                       error = EBUSY;
                 }
+
+               NSPACE_REQ_UNLOCK();
+       } else {
+               // This is basically just like the exit case.
+               // nspace_resolver_exited() will verify that the
+               // process is the resolver, and will clear the
+               // global.
+               nspace_resolver_exited(p);
         }
  
-       return snapshot_error;
+       return error;
  }
  
-int
-get_nspace_item_status(struct vnode *vp, int32_t *status)
+static int
+nspace_materialization_get_proc_state(struct proc *p, int *is_prevented)
  {
-       int i;
-
-       lck_mtx_lock(&nspace_handler_lock);
-       for (i = 0; i < MAX_NSPACE_ITEMS; i++) {
-               if (nspace_items[i].vp == vp) {
-                       break;
-               }
+       if ((p->p_lflag & P_LNSPACE_RESOLVER) != 0 ||
+           (p->p_vfs_iopolicy &
+           P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) == 0) {
+               *is_prevented = 1;
+       } else {
+               *is_prevented = 0;
         }
+       return 0;
+}
  
-       if (i >= MAX_NSPACE_ITEMS) {
-               lck_mtx_unlock(&nspace_handler_lock);
-               return ENOENT;
+static int
+nspace_materialization_set_proc_state(struct proc *p, int is_prevented)
+{
+       if (p->p_lflag & P_LNSPACE_RESOLVER) {
+               return is_prevented ? 0 : EBUSY;
         }
  
-       *status = nspace_items[i].flags;
-       lck_mtx_unlock(&nspace_handler_lock);
+       if (is_prevented) {
+               OSBitAndAtomic16(~((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES), &p->p_vfs_iopolicy);
+       } else {
+               OSBitOrAtomic16((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES, &p->p_vfs_iopolicy);
+       }
         return 0;
  }
  
-
-#if 0
  static int
-build_volfs_path(struct vnode *vp, char *path, int *len)
+nspace_materialization_get_thread_state(int *is_prevented)
  {
-       struct vnode_attr va;
-       int ret;
+       uthread_t ut = get_bsdthread_info(current_thread());
  
-       VATTR_INIT(&va);
-       VATTR_WANTED(&va, va_fsid);
-       VATTR_WANTED(&va, va_fileid);
+       *is_prevented = (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) ? 1 : 0;
+       return 0;
+}
  
-       if (vnode_getattr(vp, &va, vfs_context_kernel()) != 0) {
-               *len = snprintf(path, *len, "/non/existent/path/because/vnode_getattr/failed") + 1;
-               ret = -1;
+static int
+nspace_materialization_set_thread_state(int is_prevented)
+{
+       uthread_t ut = get_bsdthread_info(current_thread());
+
+       if (is_prevented) {
+               ut->uu_flag |= UT_NSPACE_NODATALESSFAULTS;
         } else {
-               *len = snprintf(path, *len, "/.vol/%d/%lld", (dev_t)va.va_fsid, va.va_fileid) + 1;
-               ret = 0;
+               ut->uu_flag &= ~UT_NSPACE_NODATALESSFAULTS;
         }
-
-       return ret;
+       return 0;
  }
-#endif
  
-//
-// Note: this function does NOT check permissions on all of the
-// parent directories leading to this vnode.  It should only be
-// called on behalf of a root process.  Otherwise a process may
-// get access to a file because the file itself is readable even
-// though its parent directories would prevent access.
-//
  static int
-vn_open_with_vp(vnode_t vp, int fmode, vfs_context_t ctx)
+nspace_materialization_is_prevented(void)
  {
-       int error, action;
+       proc_t p = current_proc();
+       uthread_t ut = (uthread_t)get_bsdthread_info(current_thread());
+       vfs_context_t ctx = vfs_context_current();
  
-       if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
-               return error;
+       /*
+        * Kernel context ==> return EDEADLK, as we would with any random
+        * process decorated as no-materialize.
+        */
+       if (ctx == vfs_context_kernel()) {
+               return EDEADLK;
         }
  
-#if CONFIG_MACF
-       error = mac_vnode_check_open(ctx, vp, fmode);
-       if (error) {
-               return error;
+       /*
+        * If the process has the dataless-manipulation entitlement,
+        * materialization is prevented, and depending on the kind
+        * of file system operation, things get to proceed as if the
+        * object is not dataless.
+        */
+       if (vfs_context_is_dataless_manipulator(ctx)) {
+               return EJUSTRETURN;
         }
-#endif
  
-       /* compute action to be authorized */
-       action = 0;
-       if (fmode & FREAD) {
-               action |= KAUTH_VNODE_READ_DATA;
-       }
-       if (fmode & (FWRITE | O_TRUNC)) {
-               /*
-                * If we are writing, appending, and not truncating,
-                * indicate that we are appending so that if the
-                * UF_APPEND or SF_APPEND bits are set, we do not deny
-                * the open.
-                */
-               if ((fmode & O_APPEND) && !(fmode & O_TRUNC)) {
-                       action |= KAUTH_VNODE_APPEND_DATA;
-               } else {
-                       action |= KAUTH_VNODE_WRITE_DATA;
+       /*
+        * Per-thread decorations override any process-wide decorations.
+        * (Foundation uses this, and this overrides even the dataless-
+        * manipulation entitlement so as to make API contracts consistent.)
+        */
+       if (ut != NULL) {
+               if (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) {
+                       return EDEADLK;
+               }
+               if (ut->uu_flag & UT_NSPACE_FORCEDATALESSFAULTS) {
+                       return 0;
                 }
         }
  
-       if ((error = vnode_authorize(vp, NULL, action, ctx)) != 0) {
-               return error;
+       /*
+        * If the process's iopolicy specifies that dataless files
+        * can be materialized, then we let it go ahead.
+        */
+       if (p->p_vfs_iopolicy & P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) {
+               return 0;
         }
  
+       /*
+        * The default behavior is to not materialize dataless files;
+        * return to the caller that deadlock was detected.
+        */
+       return EDEADLK;
+}
  
-       //
-       // if the vnode is tagged VOPENEVT and the current process
-       // has the P_CHECKOPENEVT flag set, then we or in the O_EVTONLY
-       // flag to the open mode so that this open won't count against
-       // the vnode when carbon delete() does a vnode_isinuse() to see
-       // if a file is currently in use.  this allows spotlight
-       // importers to not interfere with carbon apps that depend on
-       // the no-delete-if-busy semantics of carbon delete().
-       //
-       if ((vp->v_flag & VOPENEVT) && (current_proc()->p_flag & P_CHECKOPENEVT)) {
-               fmode |= O_EVTONLY;
-       }
+/*
+ * The vfs.nspace branch: parent node for the SYSCTL_PROC entries that are
+ * declared with _vfs_nspace in this file (resolver, prevent_materialization,
+ * thread_prevent_materialization, complete).
+ */
+SYSCTL_NODE(_vfs, OID_AUTO, nspace, CTLFLAG_RW | CTLFLAG_LOCKED, NULL, "vfs nspace hinge");
  
-       if ((error = VNOP_OPEN(vp, fmode, ctx))) {
+static int
+sysctl_nspace_resolver(__unused struct sysctl_oid *oidp,
+    __unused void *arg1, __unused int arg2, struct sysctl_req *req)
+{
+       struct proc *p = req->p;
+       int new_value, old_value, changed = 0;
+       int error;
+
+       error = nspace_resolver_get_proc_state(p, &old_value);
+       if (error) {
                 return error;
         }
-       if ((error = vnode_ref_ext(vp, fmode, 0))) {
-               VNOP_CLOSE(vp, fmode, ctx);
-               return error;
+
+       error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
+           &changed);
+       if (error == 0 && changed) {
+               error = nspace_resolver_set_proc_state(p, new_value);
         }
+       return error;
+}
  
-       /* Call out to allow 3rd party notification of open.
-        * Ignore result of kauth_authorize_fileop call.
-        */
-#if CONFIG_MACF
-       mac_vnode_notify_open(ctx, vp, fmode);
-#endif
-       kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_OPEN,
-           (uintptr_t)vp, 0);
+/*
+ * vfs.nspace.resolver: decorate this process as the dataless file resolver.
+ * Integer, read/write.  sysctl_nspace_resolver() reports the resolver state
+ * of the requesting process (req->p) and, when a new value is supplied,
+ * stores it through nspace_resolver_set_proc_state().
+ */
+SYSCTL_PROC(_vfs_nspace, OID_AUTO, resolver,
+    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
+    0, 0, sysctl_nspace_resolver, "I", "");
  
+static int
+sysctl_nspace_prevent_materialization(__unused struct sysctl_oid *oidp,
+    __unused void *arg1, __unused int arg2, struct sysctl_req *req)
+{
+       struct proc *p = req->p;
+       int new_value, old_value, changed = 0;
+       int error;
  
-       return 0;
+       error = nspace_materialization_get_proc_state(p, &old_value);
+       if (error) {
+               return error;
+       }
+
+       error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
+           &changed);
+       if (error == 0 && changed) {
+               error = nspace_materialization_set_proc_state(p, new_value);
+       }
+       return error;
  }
  
+/*
+ * vfs.nspace.prevent_materialization: decorate this process as not wanting
+ * to materialize dataless files.  Integer, read/write.
+ * sysctl_nspace_prevent_materialization() reads the current value for the
+ * requesting process (req->p) and, when a new value is supplied, stores it
+ * through nspace_materialization_set_proc_state().
+ */
+SYSCTL_PROC(_vfs_nspace, OID_AUTO, prevent_materialization,
+    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
+    0, 0, sysctl_nspace_prevent_materialization, "I", "");
+
  static int
-wait_for_namespace_event(namespace_handler_data *nhd, nspace_type_t nspace_type)
+sysctl_nspace_thread_prevent_materialization(__unused struct sysctl_oid *oidp,
+    __unused void *arg1, __unused int arg2, struct sysctl_req *req)
  {
-       int i;
-       int error = 0;
-       int unblock = 0;
-       task_t curtask;
+       int new_value, old_value, changed = 0;
+       int error;
  
-       lck_mtx_lock(&nspace_handler_exclusion_lock);
-       if (nspace_handlers[nspace_type].handler_busy) {
-               lck_mtx_unlock(&nspace_handler_exclusion_lock);
-               return EBUSY;
+       error = nspace_materialization_get_thread_state(&old_value);
+       if (error) {
+               return error;
         }
  
-       nspace_handlers[nspace_type].handler_busy = 1;
-       lck_mtx_unlock(&nspace_handler_exclusion_lock);
+       error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
+           &changed);
+       if (error == 0 && changed) {
+               error = nspace_materialization_set_thread_state(new_value);
+       }
+       return error;
+}
  
-       /*
-        * Any process that gets here will be one of the namespace handlers.
-        * As such, they should be prevented from acquiring DMG vnodes during vnode reclamation
-        * as we can cause deadlocks to occur, because the namespace handler may prevent
-        * VNOP_INACTIVE from proceeding.  Mark the current task as a P_DEPENDENCY_CAPABLE
-        * process.
-        */
-       curtask = current_task();
-       bsd_set_dependency_capable(curtask);
+/*
+ * vfs.nspace.thread_prevent_materialization: decorate this thread as not
+ * wanting to materialize dataless files.  Integer, read/write.
+ * sysctl_nspace_thread_prevent_materialization() takes no process argument;
+ * it goes through nspace_materialization_{get,set}_thread_state().
+ */
+SYSCTL_PROC(_vfs_nspace, OID_AUTO, thread_prevent_materialization,
+    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
+    0, 0, sysctl_nspace_thread_prevent_materialization, "I", "");
+
+static int
+sysctl_nspace_complete(__unused struct sysctl_oid *oidp, __unused void *arg1,
+    __unused int arg2, struct sysctl_req *req)
+{
+       struct proc *p = req->p;
+       uint32_t req_status[2] = { 0, 0 };
+       uint64_t gencount = 0;
+       int error, is_resolver, changed = 0, gencount_changed;
  
-       lck_mtx_lock(&nspace_handler_lock);
-       if (nspace_handlers[nspace_type].handler_proc == NULL) {
-               nspace_handlers[nspace_type].handler_tid = thread_tid(current_thread());
-               nspace_handlers[nspace_type].handler_proc = current_proc();
+       error = nspace_resolver_get_proc_state(p, &is_resolver);
+       if (error) {
+               return error;
         }
  
-       if (nspace_type == NSPACE_HANDLER_SNAPSHOT &&
-           (snapshot_timestamp == 0 || snapshot_timestamp == ~0)) {
-               error = EINVAL;
+       if (!is_resolver) {
+               return EPERM;
         }
  
-       while (error == 0) {
-               /* Try to find matching namespace item */
-               for (i = 0; i < MAX_NSPACE_ITEMS; i++) {
-                       if (nspace_items[i].flags & NSPACE_ITEM_NEW) {
-                               if (nspace_flags_matches_handler(nspace_items[i].flags, nspace_type)) {
-                                       break;
-                               }
-                       }
-               }
+       error = sysctl_io_opaque(req, req_status, sizeof(req_status),
+           &changed);
+       if (error) {
+               return error;
+       }
  
-               if (i >= MAX_NSPACE_ITEMS) {
-                       /* Nothing is there yet. Wait for wake up and retry */
-                       error = msleep((caddr_t)&nspace_item_idx, &nspace_handler_lock, PVFS | PCATCH, "namespace-items", 0);
-                       if ((nspace_type == NSPACE_HANDLER_SNAPSHOT) && (snapshot_timestamp == 0 || snapshot_timestamp == ~0)) {
-                               /* Prevent infinite loop if snapshot handler exited */
-                               error = EINVAL;
-                               break;
-                       }
-                       continue;
-               }
+       // get the gencount if it was passed
+       error = sysctl_io_opaque(req, &gencount, sizeof(gencount),
+           &gencount_changed);
+       if (error) {
+               gencount = 0;
+               // we ignore the error because the gencount was optional
+               error = 0;
+       }
  
-               nspace_items[i].flags  &= ~NSPACE_ITEM_NEW;
-               nspace_items[i].flags  |= NSPACE_ITEM_PROCESSING;
-               nspace_items[i].token  = ++nspace_token_id;
+       /*
+        * req_status[0] is the req_id
+        *
+        * req_status[1] is the errno
+        */
+       if (error == 0 && changed) {
+               nspace_resolver_req_completed(req_status[0],
+                   (int)req_status[1], gencount);
+       }
+       return error;
+}
  
-               assert(nspace_items[i].vp);
-               struct fileproc *fp;
-               int32_t indx;
-               int32_t fmode;
-               struct proc *p = current_proc();
-               vfs_context_t ctx = vfs_context_current();
-               struct vnode_attr va;
-               bool vn_get_succsessful = false;
-               bool vn_open_successful = false;
-               bool fp_alloc_successful = false;
+/*
+ * vfs.nspace.complete: the resolver reports completed requests here.
+ * sysctl_nspace_complete() reads an opaque payload of two uint32_t values
+ * (request id, then errno) and then tries to read an optional uint64_t
+ * generation count.  It returns EPERM to any process that is not marked
+ * as the resolver.
+ */
+SYSCTL_PROC(_vfs_nspace, OID_AUTO, complete,
+    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
+    0, 0, sysctl_nspace_complete, "-", "");
  
-               /*
-                * Use vnode pointer to acquire a file descriptor for
-                * hand-off to userland
-                */
-               fmode = nspace_open_flags_for_type(nspace_type);
-               error = vnode_getwithvid(nspace_items[i].vp, nspace_items[i].vid);
-               if (error) {
-                       goto cleanup;
-               }
-               vn_get_succsessful = true;
+#endif /* CONFIG_DATALESS_FILES */
  
-               error = vn_open_with_vp(nspace_items[i].vp, fmode, ctx);
-               if (error) {
-                       goto cleanup;
-               }
-               vn_open_successful = true;
+/*
+ * __no_dataless_unused marks a parameter that is only referenced when
+ * CONFIG_DATALESS_FILES is enabled: it expands to nothing in that
+ * configuration and to __unused otherwise.
+ */
+#if CONFIG_DATALESS_FILES
+#define __no_dataless_unused    /* nothing */
+#else
+#define __no_dataless_unused    __unused
+#endif
  
-               error = falloc(p, &fp, &indx, ctx);
-               if (error) {
-                       goto cleanup;
-               }
-               fp_alloc_successful = true;
+void
+nspace_resolver_init(void)
+{
+#if CONFIG_DATALESS_FILES
+       nspace_resolver_request_lck_grp =
+           lck_grp_alloc_init("file namespace resolver", NULL);
  
-               fp->f_fglob->fg_flag = fmode;
-               fp->f_fglob->fg_ops = &vnops;
-               fp->f_fglob->fg_data = (caddr_t)nspace_items[i].vp;
+       lck_mtx_init(&nspace_resolver_request_hash_mutex,
+           nspace_resolver_request_lck_grp, NULL);
  
-               proc_fdlock(p);
-               procfdtbl_releasefd(p, indx, NULL);
-               fp_drop(p, indx, fp, 1);
-               proc_fdunlock(p);
+       nspace_resolver_request_hashtbl =
+           hashinit(NSPACE_RESOLVER_REQ_HASHSIZE,
+           M_VNODE /* XXX */, &nspace_resolver_request_hashmask);
+#endif /* CONFIG_DATALESS_FILES */
+}
  
-               /*
-                * All variants of the namespace handler struct support these three fields:
-                * token, flags, and the FD pointer
-                */
-               error = copyout(&nspace_items[i].token, nhd->token, sizeof(uint32_t));
-               if (error) {
-                       goto cleanup;
-               }
-               error = copyout(&nspace_items[i].op, nhd->flags, sizeof(uint64_t));
-               if (error) {
-                       goto cleanup;
-               }
-               error = copyout(&indx, nhd->fdptr, sizeof(uint32_t));
-               if (error) {
-                       goto cleanup;
-               }
+void
+nspace_resolver_exited(struct proc *p __no_dataless_unused)
+{
+#if CONFIG_DATALESS_FILES
+       struct nspace_resolver_requesthead *bucket;
+       struct nspace_resolver_request *req;
+       u_long idx;
  
-               /*
-                * Handle optional fields:
-                * extended version support an info ptr (offset, length), and the
-                *
-                * namedata version supports a unique per-link object ID
-                *
-                */
-               if (nhd->infoptr) {
-                       uio_t uio = (uio_t)nspace_items[i].arg;
-                       uint64_t u_offset, u_length;
+       NSPACE_REQ_LOCK();
  
-                       if (uio) {
-                               u_offset = uio_offset(uio);
-                               u_length = uio_resid(uio);
-                       } else {
-                               u_offset = 0;
-                               u_length = 0;
-                       }
-                       error = copyout(&u_offset, nhd->infoptr, sizeof(uint64_t));
-                       if (error) {
-                               goto cleanup;
-                       }
-                       error = copyout(&u_length, nhd->infoptr + sizeof(uint64_t), sizeof(uint64_t));
-                       if (error) {
-                               goto cleanup;
+       if ((p->p_lflag & P_LNSPACE_RESOLVER) &&
+           p == nspace_resolver_proc) {
+               for (idx = 0; idx <= nspace_resolver_request_hashmask; idx++) {
+                       bucket = &nspace_resolver_request_hashtbl[idx];
+                       LIST_FOREACH(req, bucket, r_hashlink) {
+                               nspace_resolver_req_mark_complete(req,
+                                   ETIMEDOUT);
                         }
                 }
+               nspace_resolver_proc = NULL;
+       }
  
-               if (nhd->objid) {
-                       VATTR_INIT(&va);
-                       VATTR_WANTED(&va, va_linkid);
-                       error = vnode_getattr(nspace_items[i].vp, &va, ctx);
-                       if (error) {
-                               goto cleanup;
-                       }
+       NSPACE_REQ_UNLOCK();
+#endif /* CONFIG_DATALESS_FILES */
+}
  
-                       uint64_t linkid = 0;
-                       if (VATTR_IS_SUPPORTED(&va, va_linkid)) {
-                               linkid = (uint64_t)va.va_linkid;
-                       }
-                       error = copyout(&linkid, nhd->objid, sizeof(uint64_t));
-               }
-cleanup:
-               if (error) {
-                       if (fp_alloc_successful) {
-                               fp_free(p, indx, fp);
-                       }
-                       if (vn_open_successful) {
-                               vn_close(nspace_items[i].vp, fmode, ctx);
-                       }
-                       unblock = 1;
-               }
+/*
+ * Convenience wrapper around resolve_nspace_item_ext() for callers that
+ * have no extra argument to pass: forwards vp and op unchanged, supplies
+ * NULL as the third parameter, and returns the callee's result.
+ */
+int
+resolve_nspace_item(struct vnode *vp, uint64_t op)
+{
+       void *const no_arg = NULL;
+
+       return resolve_nspace_item_ext(vp, op, no_arg);
+}
  
-               if (vn_get_succsessful) {
-                       vnode_put(nspace_items[i].vp);
-               }
+/*
+ * Entitlement names checked by vfs_context_is_dataless_manipulator().
+ */
+#define DATALESS_RESOLVER_ENTITLEMENT     \
+       "com.apple.private.vfs.dataless-resolver"
+#define DATALESS_MANIPULATION_ENTITLEMENT \
+       "com.apple.private.vfs.dataless-manipulation"
  
-               break;
-       }
+/*
+ * Report whether the current task is entitled for dataless manipulation.
+ *
+ * The answer is true when the task holds either the dataless-manipulation
+ * or the dataless-resolver entitlement; without CONFIG_DATALESS_FILES it
+ * is always false.  The entitlement lookup is made against current_task(),
+ * so ctx is expected to describe the calling thread (asserted below).
+ *
+ * XXX Arguably belongs in vfs_subr.c, but is here because of the
+ * complication around CONFIG_DATALESS_FILES.
+ */
+boolean_t
+vfs_context_is_dataless_manipulator(vfs_context_t ctx __unused)
+{
+#if !CONFIG_DATALESS_FILES
+       return false;
+#else /* CONFIG_DATALESS_FILES */
+       task_t const self = current_task();
+
+       assert(ctx->vc_thread == current_thread());
+
+       /* Either entitlement is sufficient; stop at the first one found. */
+       if (IOTaskHasEntitlement(self, DATALESS_MANIPULATION_ENTITLEMENT)) {
+               return true;
+       }
+       if (IOTaskHasEntitlement(self, DATALESS_RESOLVER_ENTITLEMENT)) {
+               return true;
+       }
+       return false;
+#endif /* CONFIG_DATALESS_FILES */
+}
  
-       if (unblock) {
-               if (nspace_items[i].vp && (nspace_items[i].vp->v_flag & VNEEDSSNAPSHOT)) {
-                       vnode_lock_spin(nspace_items[i].vp);
-                       nspace_items[i].vp->v_flag &= ~VNEEDSSNAPSHOT;
-                       vnode_unlock(nspace_items[i].vp);
-               }
-               nspace_items[i].vp = NULL;
-               nspace_items[i].vid = 0;
-               nspace_items[i].flags = NSPACE_ITEM_DONE;
-               nspace_items[i].token = 0;
+int
+resolve_nspace_item_ext(
+       struct vnode *vp __no_dataless_unused,
+       uint64_t op __no_dataless_unused,
+       void *arg __unused)
+{
+#if CONFIG_DATALESS_FILES
+       int error;
+       mach_port_t mp;
+       char *path = NULL;
+       int path_len;
+       kern_return_t kr;
+       struct nspace_resolver_request req;
  
-               wakeup((caddr_t)&(nspace_items[i].vp));
+       // only allow namespace events on regular files, directories and symlinks.
+       if (vp->v_type != VREG && vp->v_type != VDIR && vp->v_type != VLNK) {
+               return EFTYPE;
         }
  
-       if (nspace_type == NSPACE_HANDLER_SNAPSHOT) {
-               // just go through every snapshot event and unblock it immediately.
-               if (error && (snapshot_timestamp == 0 || snapshot_timestamp == ~0)) {
-                       for (i = 0; i < MAX_NSPACE_ITEMS; i++) {
-                               if (nspace_items[i].flags & NSPACE_ITEM_NEW) {
-                                       if (nspace_flags_matches_handler(nspace_items[i].flags, nspace_type)) {
-                                               nspace_items[i].vp = NULL;
-                                               nspace_items[i].vid = 0;
-                                               nspace_items[i].flags = NSPACE_ITEM_DONE;
-                                               nspace_items[i].token = 0;
+       //
+       // Snapshot events are not handled here: any request carrying
+       // NAMESPACE_HANDLER_SNAPSHOT_EVENT is logged and rejected with
+       // ENOTSUP.  (Original rationale, kept for reference: snapshot
+       // events for vnodes on a disk image were ignored, because
+       // any change to the disk image causes the image itself to get
+       // backed up, and ignoring them avoided multi-way deadlocks
+       // between the snapshot handler and the diskimages-helper
+       // process; nspace_allow_virtual_devs could override that for
+       // the Mobile TimeMachine testing infrastructure.)
+       //
+       if (op & NAMESPACE_HANDLER_SNAPSHOT_EVENT) {
+               os_log_debug(OS_LOG_DEFAULT, "NSPACE SNAPSHOT not handled");
+               return ENOTSUP;
+       }
  
-                                               wakeup((caddr_t)&(nspace_items[i].vp));
-                                       }
-                               }
-                       }
-               }
+       error = nspace_materialization_is_prevented();
+       if (error) {
+               os_log_debug(OS_LOG_DEFAULT,
+                   "NSPACE process/thread is decorated as no-materialization");
+               return error;
         }
  
-       lck_mtx_unlock(&nspace_handler_lock);
+       kr = host_get_filecoordinationd_port(host_priv_self(), &mp);
+       if (kr != KERN_SUCCESS || !IPC_PORT_VALID(mp)) {
+               os_log_error(OS_LOG_DEFAULT, "NSPACE no port");
+               // Treat this like being unable to access the backing
+               // store server.
+               return ETIMEDOUT;
+       }
  
-       lck_mtx_lock(&nspace_handler_exclusion_lock);
-       nspace_handlers[nspace_type].handler_busy = 0;
-       lck_mtx_unlock(&nspace_handler_exclusion_lock);
+       path = zalloc(ZV_NAMEI);
+       path_len = MAXPATHLEN;
  
-       return error;
-}
+       error = vn_getpath(vp, path, &path_len);
+       if (error == 0) {
+               int xxx_rdar44371223;   /* XXX Mig bug */
+               req.r_req_id = next_nspace_req_id();
+               req.r_resolver_error = 0;
+               req.r_flags = 0;
  
-static inline int
-validate_namespace_args(int is64bit, int size)
-{
-       if (is64bit) {
-               /* Must be one of these */
-               if (size == sizeof(user64_namespace_handler_info)) {
-                       goto sizeok;
-               }
-               if (size == sizeof(user64_namespace_handler_info_ext)) {
-                       goto sizeok;
-               }
-               if (size == sizeof(user64_namespace_handler_data)) {
-                       goto sizeok;
-               }
-               return EINVAL;
-       } else {
-               /* 32 bit -- must be one of these */
-               if (size == sizeof(user32_namespace_handler_info)) {
-                       goto sizeok;
-               }
-               if (size == sizeof(user32_namespace_handler_info_ext)) {
-                       goto sizeok;
+               if ((error = vnode_ref(vp)) == 0) {     // take a ref so that the vnode doesn't go away
+                       req.r_vp = vp;
+               } else {
+                       goto out_release_port;
                 }
-               if (size == sizeof(user32_namespace_handler_data)) {
-                       goto sizeok;
+
+               NSPACE_REQ_LOCK();
+               error = nspace_resolver_req_add(&req);
+               NSPACE_REQ_UNLOCK();
+               if (error) {
+                       vnode_rele(req.r_vp);
+                       goto out_release_port;
                 }
-               return EINVAL;
-       }
  
-sizeok:
+               os_log_debug(OS_LOG_DEFAULT, "NSPACE resolve_path call");
+               kr = send_nspace_resolve_path(mp, req.r_req_id,
+                   current_proc()->p_pid, (uint32_t)(op & 0xffffffff),
+                   path, &xxx_rdar44371223);
+               if (kr != KERN_SUCCESS) {
+                       // Also treat this like being unable to access
+                       // the backing store server.
+                       os_log_error(OS_LOG_DEFAULT,
+                           "NSPACE resolve_path failure: %d", kr);
+                       error = ETIMEDOUT;
  
-       return 0;
-}
+                       NSPACE_REQ_LOCK();
+                       nspace_resolver_req_remove(&req);
+                       NSPACE_REQ_UNLOCK();
+                       vnode_rele(req.r_vp);
+                       goto out_release_port;
+               }
  
-static int
-process_namespace_fsctl(nspace_type_t nspace_type, int is64bit, u_int size, caddr_t data)
-{
-       int error = 0;
-       namespace_handler_data nhd;
+               // Give back the memory we allocated earlier while
+               // we wait; we no longer need it.
+               zfree(ZV_NAMEI, path);
+               path = NULL;
  
-       bzero(&nhd, sizeof(namespace_handler_data));
+               // Request has been submitted to the resolver.
+               // Now (interruptibly) wait for completion.
+               // Upon return, the request will have been removed
+               // from the lookup table.
+               error = nspace_resolver_req_wait(&req);
  
-       if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
-               return error;
+               vnode_rele(req.r_vp);
         }
  
-       error = validate_namespace_args(is64bit, size);
-       if (error) {
-               return error;
+out_release_port:
+       if (path != NULL) {
+               zfree(ZV_NAMEI, path);
         }
+       ipc_port_release_send(mp);
  
-       /* Copy in the userland pointers into our kernel-only struct */
+       return error;
+#else
+       return ENOTSUP;
+#endif /* CONFIG_DATALESS_FILES */
+}
  
-       if (is64bit) {
-               /* 64 bit userland structures */
-               nhd.token = (user_addr_t)((user64_namespace_handler_info *)data)->token;
-               nhd.flags = (user_addr_t)((user64_namespace_handler_info *)data)->flags;
-               nhd.fdptr = (user_addr_t)((user64_namespace_handler_info *)data)->fdptr;
+/*
+ * Stub: every parameter is marked __unused and the function always
+ * returns 0.
+ */
+int
+nspace_snapshot_event(__unused vnode_t vp, __unused  time_t ctime,
+    __unused uint64_t op_type, __unused void *arg)
+{
+       return 0;
+}
  
-               /* If the size is greater than the standard info struct, add in extra fields */
-               if (size > (sizeof(user64_namespace_handler_info))) {
-                       if (size >= (sizeof(user64_namespace_handler_info_ext))) {
-                               nhd.infoptr = (user_addr_t)((user64_namespace_handler_info_ext *)data)->infoptr;
-                       }
-                       if (size == (sizeof(user64_namespace_handler_data))) {
-                               nhd.objid = (user_addr_t)((user64_namespace_handler_data*)data)->objid;
-                       }
-                       /* Otherwise the fields were pre-zeroed when we did the bzero above. */
-               }
-       } else {
-               /* 32 bit userland structures */
-               nhd.token = CAST_USER_ADDR_T(((user32_namespace_handler_info *)data)->token);
-               nhd.flags = CAST_USER_ADDR_T(((user32_namespace_handler_info *)data)->flags);
-               nhd.fdptr = CAST_USER_ADDR_T(((user32_namespace_handler_info *)data)->fdptr);
+#if 0
+static int
+build_volfs_path(struct vnode *vp, char *path, int *len)
+{
+       struct vnode_attr va;
+       int ret;
  
-               if (size > (sizeof(user32_namespace_handler_info))) {
-                       if (size >= (sizeof(user32_namespace_handler_info_ext))) {
-                               nhd.infoptr = CAST_USER_ADDR_T(((user32_namespace_handler_info_ext *)data)->infoptr);
-                       }
-                       if (size == (sizeof(user32_namespace_handler_data))) {
-                               nhd.objid = (user_addr_t)((user32_namespace_handler_data*)data)->objid;
-                       }
-                       /* Otherwise the fields were pre-zeroed when we did the bzero above. */
-               }
+       VATTR_INIT(&va);
+       VATTR_WANTED(&va, va_fsid);
+       VATTR_WANTED(&va, va_fileid);
+
+       if (vnode_getattr(vp, &va, vfs_context_kernel()) != 0) {
+               *len = snprintf(path, *len, "/non/existent/path/because/vnode_getattr/failed") + 1;
+               ret = -1;
+       } else {
+               *len = snprintf(path, *len, "/.vol/%d/%lld", (dev_t)va.va_fsid, va.va_fileid) + 1;
+               ret = 0;
         }
  
-       return wait_for_namespace_event(&nhd, nspace_type);
+       return ret;
  }
+#endif
  
  static unsigned long
  fsctl_bogus_command_compat(unsigned long cmd)
@@ -10493,22 +11309,6 @@ fsctl_bogus_command_compat(unsigned long cmd)
                 return FSIOC_ROUTEFS_SETROUTEID;
         case IOCBASECMD(FSIOC_SET_PACKAGE_EXTS):
                 return FSIOC_SET_PACKAGE_EXTS;
-       case IOCBASECMD(FSIOC_NAMESPACE_HANDLER_GET):
-               return FSIOC_NAMESPACE_HANDLER_GET;
-       case IOCBASECMD(FSIOC_OLD_SNAPSHOT_HANDLER_GET):
-               return FSIOC_OLD_SNAPSHOT_HANDLER_GET;
-       case IOCBASECMD(FSIOC_SNAPSHOT_HANDLER_GET_EXT):
-               return FSIOC_SNAPSHOT_HANDLER_GET_EXT;
-       case IOCBASECMD(FSIOC_NAMESPACE_HANDLER_UPDATE):
-               return FSIOC_NAMESPACE_HANDLER_UPDATE;
-       case IOCBASECMD(FSIOC_NAMESPACE_HANDLER_UNBLOCK):
-               return FSIOC_NAMESPACE_HANDLER_UNBLOCK;
-       case IOCBASECMD(FSIOC_NAMESPACE_HANDLER_CANCEL):
-               return FSIOC_NAMESPACE_HANDLER_CANCEL;
-       case IOCBASECMD(FSIOC_NAMESPACE_HANDLER_SET_SNAPSHOT_TIME):
-               return FSIOC_NAMESPACE_HANDLER_SET_SNAPSHOT_TIME;
-       case IOCBASECMD(FSIOC_NAMESPACE_ALLOW_DMG_SNAPSHOT_EVENTS):
-               return FSIOC_NAMESPACE_ALLOW_DMG_SNAPSHOT_EVENTS;
         case IOCBASECMD(FSIOC_SET_FSTYPENAME_OVERRIDE):
                 return FSIOC_SET_FSTYPENAME_OVERRIDE;
         case IOCBASECMD(DISK_CONDITIONER_IOC_GET):
@@ -10528,6 +11328,128 @@ fsctl_bogus_command_compat(unsigned long cmd)
         return cmd;
  }
  
+/*
+ * Setattr callback that handle_flags() hands to chflags0(): forwards arg
+ * to the filesystem as an FSIOC_CAS_BSDFLAGS ioctl issued with FWRITE and
+ * returns the ioctl's result.
+ */
+static int
+cas_bsdflags_setattr(vnode_t vp, void *arg, vfs_context_t ctx)
+{
+       int ioctl_error;
+
+       ioctl_error = VNOP_IOCTL(vp, FSIOC_CAS_BSDFLAGS, arg, FWRITE, ctx);
+       return ioctl_error;
+}
+
+/*
+ * Sync the volume that vp belongs to.
+ *
+ * data points at the caller's uint32_t of FSCTL_SYNC_* option bits:
+ * FSCTL_SYNC_WAIT selects MNT_WAIT instead of MNT_NOWAIT for the sync, and
+ * FSCTL_SYNC_FULLSYNC additionally issues F_FULLFSYNC on vp afterwards.
+ *
+ * Once mount_iterref() succeeds, the iocount the caller held on vp is
+ * dropped here and *arg_vp is set to NULL to tell the caller so.  If
+ * mount_iterref() fails, vp is left untouched and that error is returned.
+ * Otherwise the result is 0, or the error from re-acquiring vp or from
+ * F_FULLFSYNC when a full sync was requested.
+ */
+static int __attribute__((noinline))
+handle_sync_volume(vnode_t vp, vnode_t *arg_vp, caddr_t data, vfs_context_t ctx)
+{
+       struct vfs_attr vfa;
+       mount_t mp = vp->v_mount;
+       /*
+        * Latch the caller's FSCTL_SYNC_* bits.  They must not be confused
+        * with 'arg' below, which carries MNT_* values for the sync itself.
+        */
+       uint32_t sync_flags = *(uint32_t *)data;
+       unsigned arg;
+       int error;
+
+       /* record vid of vp so we can drop it below. */
+       uint32_t vvid = vp->v_id;
+
+       /*
+        * Then grab mount_iterref so that we can release the vnode.
+        * Without this, a thread may call vnode_iterate_prepare then
+        * get into a deadlock because we've never released the root vp
+        */
+       error = mount_iterref(mp, 0);
+       if (error) {
+               return error;
+       }
+       vnode_put(vp);
+
+       arg = MNT_NOWAIT;
+       if (sync_flags & FSCTL_SYNC_WAIT) {
+               arg = MNT_WAIT;
+       }
+
+       /*
+        * If the filesystem supports multiple filesystems in a
+        * partition (e.g. APFS volumes in a container), it knows
+        * that the waitfor argument to VFS_SYNC is a set of flags.
+        */
+       VFSATTR_INIT(&vfa);
+       VFSATTR_WANTED(&vfa, f_capabilities);
+       if ((vfs_getattr(mp, &vfa, vfs_context_current()) == 0) &&
+           VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) &&
+           ((vfa.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_SHARED_SPACE)) &&
+           ((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_SHARED_SPACE))) {
+               arg |= MNT_VOLUME;
+       }
+
+       /* issue the sync for this volume */
+       (void)sync_callback(mp, &arg);
+
+       /*
+        * Then release the mount_iterref once we're done syncing; it's not
+        * needed for the VNOP_IOCTL below
+        */
+       mount_iterdrop(mp);
+
+       /*
+        * Test the caller's flags, not 'arg': by now 'arg' holds MNT_* bits,
+        * so checking it against FSCTL_SYNC_FULLSYNC would ignore what the
+        * caller actually asked for.
+        */
+       if (sync_flags & FSCTL_SYNC_FULLSYNC) {
+               /* re-obtain vnode iocount on the root vp, if possible */
+               error = vnode_getwithvid(vp, vvid);
+               if (error == 0) {
+                       error = VNOP_IOCTL(vp, F_FULLFSYNC, (caddr_t)NULL, 0, ctx);
+                       vnode_put(vp);
+               }
+       }
+       /* mark the argument VP as having been released */
+       *arg_vp = NULL;
+       return error;
+}
+
+#if ROUTEFS
+/*
+ * Copy a path string in from user space and hand it to
+ * routefs_kernel_mount().
+ *
+ * The suser() check runs first and its error is returned unchanged on
+ * failure.  udata is the user address of a NUL-terminated path of at
+ * most MAXPATHLEN bytes; a copyinstr() failure is returned as-is.
+ * Otherwise the result is whatever routefs_kernel_mount() returns.
+ */
+static int __attribute__((noinline))
+handle_routes(user_addr_t udata)
+{
+       char mount_path[MAXPATHLEN];
+       size_t copied = 0;
+       int rc;
+
+       rc = suser(kauth_cred_get(), &(current_proc()->p_acflag));
+       if (rc != 0) {
+               return rc;
+       }
+
+       /* Start from an all-zero buffer before copying the user string in. */
+       bzero(mount_path, sizeof(mount_path));
+       rc = copyinstr(udata, mount_path, sizeof(mount_path), &copied);
+       if (rc != 0) {
+               return rc;
+       }
+
+       return routefs_kernel_mount(mount_path);
+}
+#endif
+
+/*
+ * Apply the flags change described by a struct fsioc_cas_bsdflags.
+ *
+ * data is interpreted as that struct.  Its new_flags value is staged in a
+ * vnode_attr, and chflags0() is called with cas_bsdflags_setattr() as the
+ * setattr callback and the struct itself as the callback argument.
+ * Returns whatever chflags0() returns.
+ */
+static int __attribute__((noinline))
+handle_flags(vnode_t vp, caddr_t data, vfs_context_t ctx)
+{
+       struct fsioc_cas_bsdflags *request;
+       struct vnode_attr attrs;
+
+       request = (struct fsioc_cas_bsdflags *)data;
+
+       VATTR_INIT(&attrs);
+       VATTR_SET(&attrs, va_flags, request->new_flags);
+
+       return chflags0(vp, &attrs, cas_bsdflags_setattr, request, ctx);
+}
+
+/*
+ * Forward an authentication query to the filesystem that owns vp.
+ *
+ * cmd and data are passed through unchanged as an ioctl on vp, with
+ * options narrowed to int for the ioctl's flag argument.  Returns the
+ * ioctl's result.
+ */
+static int __attribute__((noinline))
+handle_auth(vnode_t vp, u_long cmd, caddr_t data, u_long options, vfs_context_t ctx)
+{
+       /*
+        * query the underlying FS and see if it reports something
+        * sane for this vnode. If volume is authenticated via
+        * chunklist, leave that for the caller to determine.
+        */
+       return VNOP_IOCTL(vp, cmd, data, (int)options, ctx);
+}
+
  /*
   * Make a filesystem-specific control call:
   */
@@ -10543,6 +11465,10 @@ fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long
         caddr_t data, memp;
         vnode_t vp = *arg_vp;
  
+       if (vp->v_type == VCHR || vp->v_type == VBLK) {
+               return ENOTTY;
+       }
+
         cmd = fsctl_bogus_command_compat(cmd);
  
         size = IOCPARM_LEN(cmd);
@@ -10555,7 +11481,7 @@ fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long
         memp = NULL;
  
         if (size > sizeof(stkbuf)) {
-               if ((memp = (caddr_t)kalloc(size)) == 0) {
+               if ((memp = (caddr_t)kheap_alloc(KHEAP_TEMP, size, Z_WAITOK)) == 0) {
                         return ENOMEM;
                 }
                 data = memp;
@@ -10568,7 +11494,7 @@ fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long
                         error = copyin(udata, data, size);
                         if (error) {
                                 if (memp) {
-                                       kfree(memp, size);
+                                       kheap_free(KHEAP_TEMP, memp, size);
                                 }
                                 return error;
                         }
@@ -10578,83 +11504,32 @@ fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long
                         } else {
                                 *(uint32_t *)data = (uint32_t)udata;
                         }
-               };
-       } else if ((cmd & IOC_OUT) && size) {
-               /*
-                * Zero the buffer so the user always
-                * gets back something deterministic.
-                */
-               bzero(data, size);
-       } else if (cmd & IOC_VOID) {
-               if (is64bit) {
-                       *(user_addr_t *)data = udata;
-               } else {
-                       *(uint32_t *)data = (uint32_t)udata;
-               }
-       }
-
-       /* Check to see if it's a generic command */
-       switch (cmd) {
-       case FSIOC_SYNC_VOLUME: {
-               mount_t mp = vp->v_mount;
-               int arg = *(uint32_t*)data;
-
-               /* record vid of vp so we can drop it below. */
-               uint32_t vvid = vp->v_id;
-
-               /*
-                * Then grab mount_iterref so that we can release the vnode.
-                * Without this, a thread may call vnode_iterate_prepare then
-                * get into a deadlock because we've never released the root vp
-                */
-               error = mount_iterref(mp, 0);
-               if (error) {
-                       break;
-               }
-               vnode_put(vp);
-
-               /* issue the sync for this volume */
-               (void)sync_callback(mp, (arg & FSCTL_SYNC_WAIT) ? &arg : NULL);
-
+               };
+       } else if ((cmd & IOC_OUT) && size) {
                 /*
-                * Then release the mount_iterref once we're done syncing; it's not
-                * needed for the VNOP_IOCTL below
+                * Zero the buffer so the user always
+                * gets back something deterministic.
                  */
-               mount_iterdrop(mp);
-
-               if (arg & FSCTL_SYNC_FULLSYNC) {
-                       /* re-obtain vnode iocount on the root vp, if possible */
-                       error = vnode_getwithvid(vp, vvid);
-                       if (error == 0) {
-                               error = VNOP_IOCTL(vp, F_FULLFSYNC, (caddr_t)NULL, 0, ctx);
-                               vnode_put(vp);
-                       }
+               bzero(data, size);
+       } else if (cmd & IOC_VOID) {
+               if (is64bit) {
+                       *(user_addr_t *)data = udata;
+               } else {
+                       *(uint32_t *)data = (uint32_t)udata;
                 }
-               /* mark the argument VP as having been released */
-               *arg_vp = NULL;
         }
-       break;
  
-       case FSIOC_ROUTEFS_SETROUTEID: {
-#if ROUTEFS
-               char routepath[MAXPATHLEN];
-               size_t len = 0;
+       /* Check to see if it's a generic command */
+       switch (cmd) {
+       case FSIOC_SYNC_VOLUME:
+               error = handle_sync_volume(vp, arg_vp, data, ctx);
+               break;
  
-               if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
-                       break;
-               }
-               bzero(routepath, MAXPATHLEN);
-               error = copyinstr(udata, &routepath[0], MAXPATHLEN, &len);
-               if (error) {
-                       break;
-               }
-               error = routefs_kernel_mount(routepath);
-               if (error) {
-                       break;
-               }
+       case FSIOC_ROUTEFS_SETROUTEID:
+#if ROUTEFS
+               error = handle_routes(udata);
  #endif
-       }
-       break;
+               break;
  
         case FSIOC_SET_PACKAGE_EXTS: {
                 user_addr_t ext_strings;
@@ -10675,7 +11550,10 @@ fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long
                 }
  
                 if (is64bit) {
-                       ext_strings = ((user64_package_ext_info *)data)->strings;
+                       if (sizeof(user64_addr_t) > sizeof(user_addr_t)) {
+                               assert(((user64_package_ext_info *)data)->strings <= UINT32_MAX);
+                       }
+                       ext_strings = (user_addr_t)((user64_package_ext_info *)data)->strings;
                         num_entries = ((user64_package_ext_info *)data)->num_entries;
                         max_width   = ((user64_package_ext_info *)data)->max_width;
                 } else {
@@ -10687,191 +11565,6 @@ fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long
         }
         break;
  
-       /* namespace handlers */
-       case FSIOC_NAMESPACE_HANDLER_GET: {
-               error = process_namespace_fsctl(NSPACE_HANDLER_NSPACE, is64bit, size, data);
-       }
-       break;
-
-       /* Snapshot handlers */
-       case FSIOC_OLD_SNAPSHOT_HANDLER_GET: {
-               error = process_namespace_fsctl(NSPACE_HANDLER_SNAPSHOT, is64bit, size, data);
-       }
-       break;
-
-       case FSIOC_SNAPSHOT_HANDLER_GET_EXT: {
-               error = process_namespace_fsctl(NSPACE_HANDLER_SNAPSHOT, is64bit, size, data);
-       }
-       break;
-
-       case FSIOC_NAMESPACE_HANDLER_UPDATE: {
-               uint32_t token, val;
-               int i;
-
-               if ((error = suser(kauth_cred_get(), &(p->p_acflag)))) {
-                       break;
-               }
-
-               if (!nspace_is_special_process(p)) {
-                       error = EINVAL;
-                       break;
-               }
-
-               token = ((uint32_t *)data)[0];
-               val   = ((uint32_t *)data)[1];
-
-               lck_mtx_lock(&nspace_handler_lock);
-
-               for (i = 0; i < MAX_NSPACE_ITEMS; i++) {
-                       if (nspace_items[i].token == token) {
-                               break;          /* exit for loop, not case stmt */
-                       }
-               }
-
-               if (i >= MAX_NSPACE_ITEMS) {
-                       error = ENOENT;
-               } else {
-                       //
-                       // if this bit is set, when resolve_nspace_item() times out
-                       // it will loop and go back to sleep.
-                       //
-                       nspace_items[i].flags |= NSPACE_ITEM_RESET_TIMER;
-               }
-
-               lck_mtx_unlock(&nspace_handler_lock);
-
-               if (error) {
-                       printf("nspace-handler-update: did not find token %u\n", token);
-               }
-       }
-       break;
-
-       case FSIOC_NAMESPACE_HANDLER_UNBLOCK: {
-               uint32_t token, val;
-               int i;
-
-               if ((error = suser(kauth_cred_get(), &(p->p_acflag)))) {
-                       break;
-               }
-
-               if (!nspace_is_special_process(p)) {
-                       error = EINVAL;
-                       break;
-               }
-
-               token = ((uint32_t *)data)[0];
-               val   = ((uint32_t *)data)[1];
-
-               lck_mtx_lock(&nspace_handler_lock);
-
-               for (i = 0; i < MAX_NSPACE_ITEMS; i++) {
-                       if (nspace_items[i].token == token) {
-                               break;         /* exit for loop, not case statement */
-                       }
-               }
-
-               if (i >= MAX_NSPACE_ITEMS) {
-                       printf("nspace-handler-unblock: did not find token %u\n", token);
-                       error = ENOENT;
-               } else {
-                       if (val == 0 && nspace_items[i].vp) {
-                               vnode_lock_spin(nspace_items[i].vp);
-                               nspace_items[i].vp->v_flag &= ~VNEEDSSNAPSHOT;
-                               vnode_unlock(nspace_items[i].vp);
-                       }
-
-                       nspace_items[i].vp = NULL;
-                       nspace_items[i].arg = NULL;
-                       nspace_items[i].op = 0;
-                       nspace_items[i].vid = 0;
-                       nspace_items[i].flags = NSPACE_ITEM_DONE;
-                       nspace_items[i].token = 0;
-
-                       wakeup((caddr_t)&(nspace_items[i].vp));
-               }
-
-               lck_mtx_unlock(&nspace_handler_lock);
-       }
-       break;
-
-       case FSIOC_NAMESPACE_HANDLER_CANCEL: {
-               uint32_t token, val;
-               int i;
-
-               if ((error = suser(kauth_cred_get(), &(p->p_acflag)))) {
-                       break;
-               }
-
-               if (!nspace_is_special_process(p)) {
-                       error = EINVAL;
-                       break;
-               }
-
-               token = ((uint32_t *)data)[0];
-               val   = ((uint32_t *)data)[1];
-
-               lck_mtx_lock(&nspace_handler_lock);
-
-               for (i = 0; i < MAX_NSPACE_ITEMS; i++) {
-                       if (nspace_items[i].token == token) {
-                               break;          /* exit for loop, not case stmt */
-                       }
-               }
-
-               if (i >= MAX_NSPACE_ITEMS) {
-                       printf("nspace-handler-cancel: did not find token %u\n", token);
-                       error = ENOENT;
-               } else {
-                       if (nspace_items[i].vp) {
-                               vnode_lock_spin(nspace_items[i].vp);
-                               nspace_items[i].vp->v_flag &= ~VNEEDSSNAPSHOT;
-                               vnode_unlock(nspace_items[i].vp);
-                       }
-
-                       nspace_items[i].vp = NULL;
-                       nspace_items[i].arg = NULL;
-                       nspace_items[i].vid = 0;
-                       nspace_items[i].token = val;
-                       nspace_items[i].flags &= ~NSPACE_ITEM_PROCESSING;
-                       nspace_items[i].flags |= NSPACE_ITEM_CANCELLED;
-
-                       wakeup((caddr_t)&(nspace_items[i].vp));
-               }
-
-               lck_mtx_unlock(&nspace_handler_lock);
-       }
-       break;
-
-       case FSIOC_NAMESPACE_HANDLER_SET_SNAPSHOT_TIME: {
-               if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
-                       break;
-               }
-
-               // we explicitly do not do the namespace_handler_proc check here
-
-               lck_mtx_lock(&nspace_handler_lock);
-               snapshot_timestamp = ((uint32_t *)data)[0];
-               wakeup(&nspace_item_idx);
-               lck_mtx_unlock(&nspace_handler_lock);
-               printf("nspace-handler-set-snapshot-time: %d\n", (int)snapshot_timestamp);
-       }
-       break;
-
-       case FSIOC_NAMESPACE_ALLOW_DMG_SNAPSHOT_EVENTS:
-       {
-               if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
-                       break;
-               }
-
-               lck_mtx_lock(&nspace_handler_lock);
-               nspace_allow_virtual_devs = ((uint32_t *)data)[0];
-               lck_mtx_unlock(&nspace_handler_lock);
-               printf("nspace-snapshot-handler will%s allow events on disk-images\n",
-                   nspace_allow_virtual_devs ? "" : " NOT");
-               error = 0;
-       }
-       break;
-
         case FSIOC_SET_FSTYPENAME_OVERRIDE:
         {
                 if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
@@ -10880,6 +11573,21 @@ fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long
                 if (vp->v_mount) {
                         mount_lock(vp->v_mount);
                         if (data[0] != 0) {
+                               int i;
+                               for (i = 0; i < MFSTYPENAMELEN; i++) {
+                                       if (!data[i]) {
+                                               goto continue_copy;
+                                       }
+                               }
+                               /*
+                                * Getting here means we have a user data string which has no
+                                * NUL termination in its first MFSTYPENAMELEN bytes.
+                                * This is bogus, let's avoid strlcpy-ing the read data and
+                                * return an error.
+                                */
+                               error = EINVAL;
+                               goto unlock;
+continue_copy:
                                 strlcpy(&vp->v_mount->fstypename_override[0], data, MFSTYPENAMELEN);
                                 vp->v_mount->mnt_kern_flag |= MNTK_TYPENAME_OVERRIDE;
                                 if (vfs_isrdonly(vp->v_mount) && strcmp(vp->v_mount->fstypename_override, "mtmfs") == 0) {
@@ -10893,6 +11601,7 @@ fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long
                                 vp->v_mount->mnt_kern_flag &= ~MNTK_TYPENAME_OVERRIDE;
                                 vp->v_mount->fstypename_override[0] = '\0';
                         }
+unlock:
                         mount_unlock(vp->v_mount);
                 }
         }
@@ -10908,15 +11617,30 @@ fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long
         }
         break;
  
+       case FSIOC_CAS_BSDFLAGS:
+               error = handle_flags(vp, data, ctx);
+               break;
+
         case FSIOC_FD_ONLY_OPEN_ONCE: {
+               error = 0;
                 if (vnode_usecount(vp) > 1) {
-                       error = EBUSY;
-               } else {
-                       error = 0;
+                       vnode_lock_spin(vp);
+                       if (vp->v_lflag & VL_HASSTREAMS) {
+                               if (vnode_isinuse_locked(vp, 1, 1)) {
+                                       error = EBUSY;
+                               }
+                       } else if (vnode_usecount(vp) > 1) {
+                               error = EBUSY;
+                       }
+                       vnode_unlock(vp);
                 }
         }
         break;
  
+       case FSIOC_EVAL_ROOTAUTH:
+               error = handle_auth(vp, cmd, data, options, ctx);
+               break;
+
         default: {
                 /* other, known commands shouldn't be passed down here */
                 switch (cmd) {
@@ -10943,11 +11667,12 @@ fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long
                 case F_BARRIERFSYNC:
                 case F_FREEZE_FS:
                 case F_THAW_FS:
+               case FSIOC_KERNEL_ROOTAUTH:
                         error = EINVAL;
                         goto outdrop;
                 }
                 /* Invoke the filesystem-specific code */
-               error = VNOP_IOCTL(vp, cmd, data, options, ctx);
+               error = VNOP_IOCTL(vp, cmd, data, (int)options, ctx);
         }
         } /* end switch stmt */
  
@@ -10961,7 +11686,7 @@ fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long
  
  outdrop:
         if (memp) {
-               kfree(memp, size);
+               kheap_free(KHEAP_TEMP, memp, size);
         }
  
         return error;
@@ -10973,11 +11698,11 @@ fsctl(proc_t p, struct fsctl_args *uap, __unused int32_t *retval)
  {
         int error;
         struct nameidata nd;
-       u_long nameiflags;
+       uint32_t nameiflags;
         vnode_t vp = NULL;
         vfs_context_t ctx = vfs_context_current();
  
-       AUDIT_ARG(cmd, uap->cmd);
+       AUDIT_ARG(cmd, (int)uap->cmd);
         AUDIT_ARG(value32, uap->options);
         /* Get the vnode for the file we are getting info on:  */
         nameiflags = 0;
@@ -10993,6 +11718,9 @@ fsctl(proc_t p, struct fsctl_args *uap, __unused int32_t *retval)
         if ((uap->options & FSOPT_NOFOLLOW) == 0) {
                 nameiflags |= FOLLOW;
         }
+       if (uap->cmd == FSIOC_FIRMLINK_CTL) {
+               nameiflags |= (CN_FIRMLINK_NOFOLLOW | NOCACHE);
+       }
         NDINIT(&nd, LOOKUP, OP_FSCTL, nameiflags | AUDITVNPATH1,
             UIO_USERSPACE, uap->path, ctx);
         if ((error = namei(&nd))) {
@@ -11026,7 +11754,7 @@ ffsctl(proc_t p, struct ffsctl_args *uap, __unused int32_t *retval)
         int fd = -1;
  
         AUDIT_ARG(fd, uap->fd);
-       AUDIT_ARG(cmd, uap->cmd);
+       AUDIT_ARG(cmd, (int)uap->cmd);
         AUDIT_ARG(value32, uap->options);
  
         /* Get the vnode for the file we are getting info on:  */
@@ -11060,6 +11788,27 @@ ffsctl(proc_t p, struct ffsctl_args *uap, __unused int32_t *retval)
  }
  /* end of fsctl system call */
  
+#define FILESEC_ACCESS_ENTITLEMENT              \
+       "com.apple.private.vfs.filesec-access"
+
+static int
+xattr_entitlement_check(const char *attrname, vfs_context_t ctx, bool setting)
+{
+       if (strcmp(attrname, KAUTH_FILESEC_XATTR) == 0) {
+               /*
+                * get: root and tasks with FILESEC_ACCESS_ENTITLEMENT.
+                * set: only tasks with FILESEC_ACCESS_ENTITLEMENT.
+                */
+               if ((!setting && vfs_context_issuser(ctx)) ||
+                   IOTaskHasEntitlement(current_task(),
+                   FILESEC_ACCESS_ENTITLEMENT)) {
+                       return 0;
+               }
+       }
+
+       return EPERM;
+}
+
  /*
   *  Retrieve the data of an extended attribute.
   */
@@ -11094,11 +11843,9 @@ getxattr(proc_t p, struct getxattr_args *uap, user_ssize_t *retval)
         if (error != 0) {
                 goto out;
         }
-       if (xattr_protected(attrname)) {
-               if (!vfs_context_issuser(ctx) || strcmp(attrname, "com.apple.system.Security") != 0) {
-                       error = EPERM;
-                       goto out;
-               }
+       if (xattr_protected(attrname) &&
+           (error = xattr_entitlement_check(attrname, ctx, false)) != 0) {
+               goto out;
         }
         /*
          * the specific check for 0xffffffff is a hack to preserve
@@ -11156,6 +11903,7 @@ fgetxattr(proc_t p, struct fgetxattr_args *uap, user_ssize_t *retval)
  {
         vnode_t vp;
         char attrname[XATTR_MAXNAMELEN + 1];
+       vfs_context_t ctx = vfs_context_current();
         uio_t auio = NULL;
         int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
         size_t attrsize = 0;
@@ -11178,8 +11926,8 @@ fgetxattr(proc_t p, struct fgetxattr_args *uap, user_ssize_t *retval)
         if (error != 0) {
                 goto out;
         }
-       if (xattr_protected(attrname)) {
-               error = EPERM;
+       if (xattr_protected(attrname) &&
+           (error = xattr_entitlement_check(attrname, ctx, false)) != 0) {
                 goto out;
         }
         if (uap->value && uap->size > 0) {
@@ -11231,12 +11979,16 @@ setxattr(proc_t p, struct setxattr_args *uap, int *retval)
                 /* Otherwise return the default error from copyinstr to detect ERANGE, etc */
                 return error;
         }
-       if (xattr_protected(attrname)) {
-               return EPERM;
+       if (xattr_protected(attrname) &&
+           (error = xattr_entitlement_check(attrname, ctx, true)) != 0) {
+               return error;
         }
         if (uap->size != 0 && uap->value == 0) {
                 return EINVAL;
         }
+       if (uap->size > INT_MAX) {
+               return E2BIG;
+       }
  
         nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
         NDINIT(&nd, LOOKUP, OP_SETXATTR, nameiflags, spacetype, uap->path, ctx);
@@ -11271,14 +12023,12 @@ fsetxattr(proc_t p, struct fsetxattr_args *uap, int *retval)
  {
         vnode_t vp;
         char attrname[XATTR_MAXNAMELEN + 1];
+       vfs_context_t ctx = vfs_context_current();
         uio_t auio = NULL;
         int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
         size_t namelen;
         int error;
         char uio_buf[UIO_SIZEOF(1)];
-#if CONFIG_FSE
-       vfs_context_t ctx = vfs_context_current();
-#endif
  
         if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
                 return EINVAL;
@@ -11293,12 +12043,16 @@ fsetxattr(proc_t p, struct fsetxattr_args *uap, int *retval)
                 /* Otherwise return the default error from copyinstr to detect ERANGE, etc */
                 return error;
         }
-       if (xattr_protected(attrname)) {
-               return EPERM;
+       if (xattr_protected(attrname) &&
+           (error = xattr_entitlement_check(attrname, ctx, true)) != 0) {
+               return error;
         }
         if (uap->size != 0 && uap->value == 0) {
                 return EINVAL;
         }
+       if (uap->size > INT_MAX) {
+               return E2BIG;
+       }
         if ((error = file_vnode(uap->fd, &vp))) {
                 return error;
         }
@@ -11509,9 +12263,8 @@ flistxattr(proc_t p, struct flistxattr_args *uap, user_ssize_t *retval)
  }
  
  static int
-fsgetpath_internal(
-       vfs_context_t ctx, int volfs_id, uint64_t objid,
-       vm_size_t bufsize, caddr_t buf, int *pathlen)
+fsgetpath_internal(vfs_context_t ctx, int volfs_id, uint64_t objid,
+    vm_size_t bufsize, caddr_t buf, uint32_t options, int *pathlen)
  {
         int error;
         struct mount *mp = NULL;
@@ -11537,7 +12290,25 @@ retry:
  
  unionget:
         if (objid == 2) {
-               error = VFS_ROOT(mp, &vp, ctx);
+               struct vfs_attr vfsattr;
+               int use_vfs_root = TRUE;
+
+               VFSATTR_INIT(&vfsattr);
+               VFSATTR_WANTED(&vfsattr, f_capabilities);
+               if (!(options & FSOPT_ISREALFSID) &&
+                   vfs_getattr(mp, &vfsattr, vfs_context_kernel()) == 0 &&
+                   VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
+                       if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS) &&
+                           (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS)) {
+                               use_vfs_root = FALSE;
+                       }
+               }
+
+               if (use_vfs_root) {
+                       error = VFS_ROOT(mp, &vp, ctx);
+               } else {
+                       error = VFS_VGET(mp, objid, &vp, ctx);
+               }
         } else {
                 error = VFS_VGET(mp, (ino64_t)objid, &vp, ctx);
         }
@@ -11572,8 +12343,11 @@ unionget:
  
         /* Obtain the absolute path to this vnode. */
         bpflags = vfs_context_suser(ctx) ? BUILDPATH_CHECKACCESS : 0;
+       if (options & FSOPT_NOFIRMLINKPATH) {
+               bpflags |= BUILDPATH_NO_FIRMLINK;
+       }
         bpflags |= BUILDPATH_CHECK_MOVED;
-       error = build_path(vp, buf, bufsize, &length, bpflags, ctx);
+       error = build_path(vp, buf, (int)bufsize, &length, bpflags, ctx);
         vnode_put(vp);
  
         if (error) {
@@ -11591,26 +12365,24 @@ unionget:
  
         AUDIT_ARG(text, buf);
  
-       if (kdebug_enable) {
-               long dbg_parms[NUMPARMS];
-               int  dbg_namelen;
+       if (kdebug_debugid_enabled(VFS_LOOKUP) && length > 0) {
+               unsigned long path_words[NUMPARMS];
+               size_t path_len = sizeof(path_words);
  
-               dbg_namelen = (int)sizeof(dbg_parms);
+               if ((size_t)length < path_len) {
+                       memcpy((char *)path_words, buf, length);
+                       memset((char *)path_words + length, 0, path_len - length);
  
-               if (length < dbg_namelen) {
-                       memcpy((char *)dbg_parms, buf, length);
-                       memset((char *)dbg_parms + length, 0, dbg_namelen - length);
-
-                       dbg_namelen = length;
+                       path_len = length;
                 } else {
-                       memcpy((char *)dbg_parms, buf + (length - dbg_namelen), dbg_namelen);
+                       memcpy((char *)path_words, buf + (length - path_len), path_len);
                 }
  
-               kdebug_vfs_lookup(dbg_parms, dbg_namelen, (void *)vp,
+               kdebug_vfs_lookup(path_words, (int)path_len, vp,
                     KDBG_VFS_LOOKUP_FLAG_LOOKUP);
         }
  
-       *pathlen = (user_ssize_t)length; /* may be superseded by error */
+       *pathlen = length; /* may be superseded by error */
  
  out:
         return error;
@@ -11619,8 +12391,9 @@ out:
  /*
   * Obtain the full pathname of a file system object by id.
   */
-int
-fsgetpath(__unused proc_t p, struct fsgetpath_args *uap, user_ssize_t *retval)
+static int
+fsgetpath_extended(user_addr_t buf, user_size_t bufsize, user_addr_t user_fsid, uint64_t objid,
+    uint32_t options, user_ssize_t *retval)
  {
         vfs_context_t ctx = vfs_context_current();
         fsid_t fsid;
@@ -11628,39 +12401,54 @@ fsgetpath(__unused proc_t p, struct fsgetpath_args *uap, user_ssize_t *retval)
         int length;
         int error;
  
-       if ((error = copyin(uap->fsid, (caddr_t)&fsid, sizeof(fsid)))) {
+       if (options & ~(FSOPT_NOFIRMLINKPATH | FSOPT_ISREALFSID)) {
+               return EINVAL;
+       }
+
+       if ((error = copyin(user_fsid, (caddr_t)&fsid, sizeof(fsid)))) {
                 return error;
         }
         AUDIT_ARG(value32, fsid.val[0]);
-       AUDIT_ARG(value64, uap->objid);
+       AUDIT_ARG(value64, objid);
         /* Restrict output buffer size for now. */
  
-       if (uap->bufsize > PAGE_SIZE) {
+       if (bufsize > PAGE_SIZE || bufsize <= 0) {
                 return EINVAL;
         }
-       MALLOC(realpath, char *, uap->bufsize, M_TEMP, M_WAITOK | M_ZERO);
+       realpath = kheap_alloc(KHEAP_TEMP, bufsize, Z_WAITOK | Z_ZERO);
         if (realpath == NULL) {
                 return ENOMEM;
         }
  
-       error = fsgetpath_internal(
-               ctx, fsid.val[0], uap->objid,
-               uap->bufsize, realpath, &length);
+       error = fsgetpath_internal(ctx, fsid.val[0], objid, bufsize, realpath,
+           options, &length);
  
         if (error) {
                 goto out;
         }
  
-       error = copyout((caddr_t)realpath, uap->buf, length);
+       error = copyout((caddr_t)realpath, buf, length);
  
         *retval = (user_ssize_t)length; /* may be superseded by error */
  out:
-       if (realpath) {
-               FREE(realpath, M_TEMP);
-       }
+       kheap_free(KHEAP_TEMP, realpath, bufsize);
         return error;
  }
  
+int
+fsgetpath(__unused proc_t p, struct fsgetpath_args *uap, user_ssize_t *retval)
+{
+       return fsgetpath_extended(uap->buf, uap->bufsize, uap->fsid, uap->objid,
+                  0, retval);
+}
+
+int
+fsgetpath_ext(__unused proc_t p, struct fsgetpath_ext_args *uap, user_ssize_t *retval)
+{
+       return fsgetpath_extended(uap->buf, uap->bufsize, uap->fsid, uap->objid,
+                  uap->options, retval);
+}
+
  /*
   * Common routine to handle various flavors of statfs data heading out
   *     to user space.
@@ -11681,7 +12469,7 @@ munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
                 my_size = copy_size = sizeof(sfs);
                 bzero(&sfs, my_size);
                 sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
-               sfs.f_type = mp->mnt_vtable->vfc_typenum;
+               sfs.f_type = (short)mp->mnt_vtable->vfc_typenum;
                 sfs.f_reserved1 = (short)sfsp->f_fssubtype;
                 sfs.f_bsize = (user64_long_t)sfsp->f_bsize;
                 sfs.f_iosize = (user64_long_t)sfsp->f_iosize;
@@ -11711,7 +12499,7 @@ munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
                 bzero(&sfs, my_size);
  
                 sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
-               sfs.f_type = mp->mnt_vtable->vfc_typenum;
+               sfs.f_type = (short)mp->mnt_vtable->vfc_typenum;
                 sfs.f_reserved1 = (short)sfsp->f_fssubtype;
  
                 /*
@@ -11757,7 +12545,7 @@ munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
                         sfs.f_bavail = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bavail, shift);
  #undef __SHIFT_OR_CLIP
                         sfs.f_bsize = (user32_long_t)(sfsp->f_bsize << shift);
-                       sfs.f_iosize = lmax(sfsp->f_iosize, sfsp->f_bsize);
+                       sfs.f_iosize = (int)lmax(sfsp->f_iosize, sfsp->f_bsize);
                 } else {
                         /* filesystem is small enough to be reported honestly */
                         sfs.f_bsize = (user32_long_t)sfsp->f_bsize;
@@ -11843,12 +12631,12 @@ munge_user32_stat(struct stat *sbp, struct user32_stat *usbp)
         usbp->st_gid = sbp->st_gid;
         usbp->st_rdev = sbp->st_rdev;
  #ifndef _POSIX_C_SOURCE
-       usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
-       usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
-       usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
-       usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
-       usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
-       usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
+       usbp->st_atimespec.tv_sec = (user32_time_t)sbp->st_atimespec.tv_sec;
+       usbp->st_atimespec.tv_nsec = (user32_long_t)sbp->st_atimespec.tv_nsec;
+       usbp->st_mtimespec.tv_sec = (user32_time_t)sbp->st_mtimespec.tv_sec;
+       usbp->st_mtimespec.tv_nsec = (user32_long_t)sbp->st_mtimespec.tv_nsec;
+       usbp->st_ctimespec.tv_sec = (user32_time_t)sbp->st_ctimespec.tv_sec;
+       usbp->st_ctimespec.tv_nsec = (user32_long_t)sbp->st_ctimespec.tv_nsec;
  #else
         usbp->st_atime = sbp->st_atime;
         usbp->st_atimensec = sbp->st_atimensec;
@@ -11924,14 +12712,14 @@ munge_user32_stat64(struct stat64 *sbp, struct user32_stat64 *usbp)
         usbp->st_gid = sbp->st_gid;
         usbp->st_rdev = sbp->st_rdev;
  #ifndef _POSIX_C_SOURCE
-       usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
-       usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
-       usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
-       usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
-       usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
-       usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
-       usbp->st_birthtimespec.tv_sec = sbp->st_birthtimespec.tv_sec;
-       usbp->st_birthtimespec.tv_nsec = sbp->st_birthtimespec.tv_nsec;
+       usbp->st_atimespec.tv_sec = (user32_time_t)sbp->st_atimespec.tv_sec;
+       usbp->st_atimespec.tv_nsec = (user32_long_t)sbp->st_atimespec.tv_nsec;
+       usbp->st_mtimespec.tv_sec = (user32_time_t)sbp->st_mtimespec.tv_sec;
+       usbp->st_mtimespec.tv_nsec = (user32_long_t)sbp->st_mtimespec.tv_nsec;
+       usbp->st_ctimespec.tv_sec = (user32_time_t)sbp->st_ctimespec.tv_sec;
+       usbp->st_ctimespec.tv_nsec = (user32_long_t)sbp->st_ctimespec.tv_nsec;
+       usbp->st_birthtimespec.tv_sec = (user32_time_t)sbp->st_birthtimespec.tv_sec;
+       usbp->st_birthtimespec.tv_nsec = (user32_long_t)sbp->st_birthtimespec.tv_nsec;
  #else
         usbp->st_atime = sbp->st_atime;
         usbp->st_atimensec = sbp->st_atimensec;
@@ -12050,7 +12838,7 @@ vnode_get_snapshot(int dirfd, vnode_t *rvpp, vnode_t *sdvpp,
                 goto out;
         }
  
-       MALLOC(name_buf, caddr_t, MAXPATHLEN, M_TEMP, M_WAITOK);
+       name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
         error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
         if (error) {
                 goto out1;
@@ -12093,7 +12881,7 @@ vnode_get_snapshot(int dirfd, vnode_t *rvpp, vnode_t *sdvpp,
  
         error = namei(ndp);
  out1:
-       FREE(name_buf, M_TEMP);
+       zfree(ZV_NAMEI, name_buf);
  out:
         if (error) {
                 if (*sdvpp) {
@@ -12123,41 +12911,50 @@ out:
   * Since this requires superuser privileges, vnode_authorize calls are not
   * made.
   */
-static int
+static int __attribute__((noinline))
  snapshot_create(int dirfd, user_addr_t name, __unused uint32_t flags,
      vfs_context_t ctx)
  {
         vnode_t rvp, snapdvp;
         int error;
-       struct nameidata namend;
+       struct nameidata *ndp;
  
-       error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, &namend, CREATE,
+       ndp = kheap_alloc(KHEAP_TEMP, sizeof(*ndp), Z_WAITOK);
+
+       error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, ndp, CREATE,
             OP_LINK, ctx);
         if (error) {
-               return error;
+               goto out;
         }
  
-       if (namend.ni_vp) {
-               vnode_put(namend.ni_vp);
+       if (ndp->ni_vp) {
+               vnode_put(ndp->ni_vp);
                 error = EEXIST;
         } else {
-               struct vnode_attr va;
+               struct vnode_attr *vap;
                 vnode_t vp = NULLVP;
  
-               VATTR_INIT(&va);
-               VATTR_SET(&va, va_type, VREG);
-               VATTR_SET(&va, va_mode, 0);
+               vap = kheap_alloc(KHEAP_TEMP, sizeof(*vap), Z_WAITOK);
  
-               error = vn_create(snapdvp, &vp, &namend, &va,
+               VATTR_INIT(vap);
+               VATTR_SET(vap, va_type, VREG);
+               VATTR_SET(vap, va_mode, 0);
+
+               error = vn_create(snapdvp, &vp, ndp, vap,
                     VN_CREATE_NOAUTH  | VN_CREATE_NOINHERIT, 0, NULL, ctx);
                 if (!error && vp) {
                         vnode_put(vp);
                 }
+
+               kheap_free(KHEAP_TEMP, vap, sizeof(*vap));
         }
  
-       nameidone(&namend);
+       nameidone(ndp);
         vnode_put(snapdvp);
         vnode_put(rvp);
+out:
+       kheap_free(KHEAP_TEMP, ndp, sizeof(*ndp));
+
         return error;
  }
  
@@ -12167,28 +12964,32 @@ snapshot_create(int dirfd, user_addr_t name, __unused uint32_t flags,
   * get the vnode for the unnamed snapshot directory and the snapshot and
   * delete the snapshot.
   */
-static int
+static int __attribute__((noinline))
  snapshot_delete(int dirfd, user_addr_t name, __unused uint32_t flags,
      vfs_context_t ctx)
  {
         vnode_t rvp, snapdvp;
         int error;
-       struct nameidata namend;
+       struct nameidata *ndp;
  
-       error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, &namend, DELETE,
+       ndp = kheap_alloc(KHEAP_TEMP, sizeof(*ndp), Z_WAITOK);
+
+       error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, ndp, DELETE,
             OP_UNLINK, ctx);
         if (error) {
                 goto out;
         }
  
-       error = VNOP_REMOVE(snapdvp, namend.ni_vp, &namend.ni_cnd,
+       error = VNOP_REMOVE(snapdvp, ndp->ni_vp, &ndp->ni_cnd,
             VNODE_REMOVE_SKIP_NAMESPACE_EVENT, ctx);
  
-       vnode_put(namend.ni_vp);
-       nameidone(&namend);
+       vnode_put(ndp->ni_vp);
+       nameidone(ndp);
         vnode_put(snapdvp);
         vnode_put(rvp);
  out:
+       kheap_free(KHEAP_TEMP, ndp, sizeof(*ndp));
+
         return error;
  }
  
@@ -12197,7 +12998,7 @@ out:
   *
   * Marks the filesystem to revert to the given snapshot on next mount.
   */
-static int
+static int __attribute__((noinline))
  snapshot_revert(int dirfd, user_addr_t name, __unused uint32_t flags,
      vfs_context_t ctx)
  {
@@ -12215,10 +13016,10 @@ snapshot_revert(int dirfd, user_addr_t name, __unused uint32_t flags,
         }
         mp = vnode_mount(rvp);
  
-       MALLOC(name_buf, caddr_t, MAXPATHLEN, M_TEMP, M_WAITOK);
+       name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
         error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
         if (error) {
-               FREE(name_buf, M_TEMP);
+               zfree(ZV_NAMEI, name_buf);
                 vnode_put(rvp);
                 return error;
         }
@@ -12226,7 +13027,7 @@ snapshot_revert(int dirfd, user_addr_t name, __unused uint32_t flags,
  #if CONFIG_MACF
         error = mac_mount_check_snapshot_revert(ctx, mp, name_buf);
         if (error) {
-               FREE(name_buf, M_TEMP);
+               zfree(ZV_NAMEI, name_buf);
                 vnode_put(rvp);
                 return error;
         }
@@ -12239,7 +13040,7 @@ snapshot_revert(int dirfd, user_addr_t name, __unused uint32_t flags,
         error = mount_iterref(mp, 0);
         vnode_put(rvp);
         if (error) {
-               FREE(name_buf, M_TEMP);
+               zfree(ZV_NAMEI, name_buf);
                 return error;
         }
  
@@ -12254,7 +13055,7 @@ snapshot_revert(int dirfd, user_addr_t name, __unused uint32_t flags,
  
         error = VFS_IOCTL(mp, VFSIOC_REVERT_SNAPSHOT, (caddr_t)&revert_data, 0, ctx);
         mount_iterdrop(mp);
-       FREE(name_buf, M_TEMP);
+       zfree(ZV_NAMEI, name_buf);
  
         if (error) {
                 /* If there was any error, try again using VNOP_IOCTL */
@@ -12289,7 +13090,7 @@ snapshot_revert(int dirfd, user_addr_t name, __unused uint32_t flags,
   * rename(2) (which has to deal with a lot more complications). It differs
   * slightly from rename(2) in that EEXIST is returned if the new name exists.
   */
-static int
+static int __attribute__((noinline))
  snapshot_rename(int dirfd, user_addr_t old, user_addr_t new,
      __unused uint32_t flags, vfs_context_t ctx)
  {
@@ -12305,7 +13106,7 @@ snapshot_rename(int dirfd, user_addr_t old, user_addr_t new,
                 struct nameidata to_node;
         } * __rename_data;
  
-       MALLOC(__rename_data, void *, sizeof(*__rename_data), M_TEMP, M_WAITOK);
+       __rename_data = kheap_alloc(KHEAP_TEMP, sizeof(*__rename_data), Z_WAITOK);
         fromnd = &__rename_data->from_node;
         tond = &__rename_data->to_node;
  
@@ -12316,7 +13117,7 @@ snapshot_rename(int dirfd, user_addr_t old, user_addr_t new,
         }
         fvp  = fromnd->ni_vp;
  
-       MALLOC(newname_buf, caddr_t, MAXPATHLEN, M_TEMP, M_WAITOK);
+       newname_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
         error = copyinstr(new, newname_buf, MAXPATHLEN, &name_len);
         if (error) {
                 goto out1;
@@ -12374,13 +13175,13 @@ snapshot_rename(int dirfd, user_addr_t old, user_addr_t new,
  out2:
         nameidone(tond);
  out1:
-       FREE(newname_buf, M_TEMP);
+       zfree(ZV_NAMEI, newname_buf);
         vnode_put(fvp);
         vnode_put(snapdvp);
         vnode_put(rvp);
         nameidone(fromnd);
  out:
-       FREE(__rename_data, M_TEMP);
+       kheap_free(KHEAP_TEMP, __rename_data, sizeof(*__rename_data));
         return error;
  }
  
@@ -12390,11 +13191,13 @@ out:
   * get the vnode for the unnamed snapshot directory and the snapshot and
   * mount the snapshot.
   */
-static int
+static int __attribute__((noinline))
  snapshot_mount(int dirfd, user_addr_t name, user_addr_t directory,
      __unused user_addr_t mnt_data, __unused uint32_t flags, vfs_context_t ctx)
  {
+       mount_t mp;
         vnode_t rvp, snapdvp, snapvp, vp, pvp;
+       struct fs_snapshot_mount_args smnt_data;
         int error;
         struct nameidata *snapndp, *dirndp;
         /* carving out a chunk for structs that are too big to be on stack. */
@@ -12403,8 +13206,8 @@ snapshot_mount(int dirfd, user_addr_t name, user_addr_t directory,
                 struct nameidata dirnd;
         } * __snapshot_mount_data;
  
-       MALLOC(__snapshot_mount_data, void *, sizeof(*__snapshot_mount_data),
-           M_TEMP, M_WAITOK);
+       __snapshot_mount_data = kheap_alloc(KHEAP_TEMP,
+           sizeof(*__snapshot_mount_data), Z_WAITOK);
         snapndp = &__snapshot_mount_data->snapnd;
         dirndp = &__snapshot_mount_data->dirnd;
  
@@ -12430,20 +13233,28 @@ snapshot_mount(int dirfd, user_addr_t name, user_addr_t directory,
  
         vp = dirndp->ni_vp;
         pvp = dirndp->ni_dvp;
+       mp = vnode_mount(rvp);
  
         if ((vp->v_flag & VROOT) && (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
                 error = EINVAL;
-       } else {
-               mount_t mp = vnode_mount(rvp);
-               struct fs_snapshot_mount_args smnt_data;
+               goto out2;
+       }
  
-               smnt_data.sm_mp  = mp;
-               smnt_data.sm_cnp = &snapndp->ni_cnd;
-               error = mount_common(mp->mnt_vfsstat.f_fstypename, pvp, vp,
-                   &dirndp->ni_cnd, CAST_USER_ADDR_T(&smnt_data), flags & MNT_DONTBROWSE,
-                   KERNEL_MOUNT_SNAPSHOT, NULL, FALSE, ctx);
+#if CONFIG_MACF
+       error = mac_mount_check_snapshot_mount(ctx, rvp, vp, &dirndp->ni_cnd, snapndp->ni_cnd.cn_nameptr,
+           mp->mnt_vfsstat.f_fstypename);
+       if (error) {
+               goto out2;
         }
+#endif
+
+       smnt_data.sm_mp  = mp;
+       smnt_data.sm_cnp = &snapndp->ni_cnd;
+       error = mount_common(mp->mnt_vfsstat.f_fstypename, pvp, vp,
+           &dirndp->ni_cnd, CAST_USER_ADDR_T(&smnt_data), flags & MNT_DONTBROWSE,
+           KERNEL_MOUNT_SNAPSHOT, NULL, FALSE, ctx);
  
+out2:
         vnode_put(vp);
         vnode_put(pvp);
         nameidone(dirndp);
@@ -12453,7 +13264,8 @@ out1:
         vnode_put(rvp);
         nameidone(snapndp);
  out:
-       FREE(__snapshot_mount_data, M_TEMP);
+       kheap_free(KHEAP_TEMP, __snapshot_mount_data,
+           sizeof(*__snapshot_mount_data));
         return error;
  }
  
@@ -12462,7 +13274,7 @@ out:
   *
   * Marks the filesystem to root from the given snapshot on next boot.
   */
-static int
+static int __attribute__((noinline))
  snapshot_root(int dirfd, user_addr_t name, __unused uint32_t flags,
      vfs_context_t ctx)
  {
@@ -12480,10 +13292,10 @@ snapshot_root(int dirfd, user_addr_t name, __unused uint32_t flags,
         }
         mp = vnode_mount(rvp);
  
-       MALLOC(name_buf, caddr_t, MAXPATHLEN, M_TEMP, M_WAITOK);
+       name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
         error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
         if (error) {
-               FREE(name_buf, M_TEMP);
+               zfree(ZV_NAMEI, name_buf);
                 vnode_put(rvp);
                 return error;
         }
@@ -12497,7 +13309,7 @@ snapshot_root(int dirfd, user_addr_t name, __unused uint32_t flags,
         error = mount_iterref(mp, 0);
         vnode_put(rvp);
         if (error) {
-               FREE(name_buf, M_TEMP);
+               zfree(ZV_NAMEI, name_buf);
                 return error;
         }
  
@@ -12513,7 +13325,7 @@ snapshot_root(int dirfd, user_addr_t name, __unused uint32_t flags,
         error = VFS_IOCTL(mp, VFSIOC_ROOT_SNAPSHOT, (caddr_t)&root_data, 0, ctx);
  
         mount_iterdrop(mp);
-       FREE(name_buf, M_TEMP);
+       zfree(ZV_NAMEI, name_buf);
  
         return error;
  }
@@ -12537,10 +13349,10 @@ fs_snapshot(__unused proc_t p, struct fs_snapshot_args *uap,
         }
  
         /*
-        * Enforce user authorization for snapshot modification operations
+        * Enforce user authorization for snapshot modification operations,
+        * or if trying to root from snapshot.
          */
-       if ((uap->op != SNAPSHOT_OP_MOUNT) &&
-           (uap->op != SNAPSHOT_OP_ROOT)) {
+       if (uap->op != SNAPSHOT_OP_MOUNT) {
                 vnode_t dvp = NULLVP;
                 vnode_t devvp = NULLVP;
                 mount_t mp;