/*
- * Copyright (c) 2000-2015 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2018 Apple Inc. All rights reserved.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
*
* External virtual filesystem routines
*/
-
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc_internal.h>
#include <miscfs/fifofs/fifo.h>
#include <string.h>
-#include <machine/spl.h>
-
+#include <machine/machine_routines.h>
#include <kern/assert.h>
#include <mach/kern_return.h>
#include <kern/kalloc.h> /* kalloc()/kfree() */
#include <kern/clock.h> /* delay_for_interval() */
#include <libkern/OSAtomic.h> /* OSAddAtomic() */
+#if !CONFIG_EMBEDDED
#include <console/video_console.h>
+#endif
#ifdef JOE_DEBUG
#include <libkern/OSDebug.h>
#include <security/mac_framework.h>
#endif
+#include <vfs/vfs_disk_conditioner.h>
+#include <libkern/section_keywords.h>
+
extern lck_grp_t *vnode_lck_grp;
extern lck_attr_t *vnode_lck_attr;
/* XXX next protptype should be from <nfs/nfs.h> */
extern int nfs_vinvalbuf(vnode_t, int, vfs_context_t, int);
+extern int paniclog_append_noflush(const char *format, ...);
+
/* XXX next prototytype should be from libsa/stdlib.h> but conflicts libkern */
__private_extern__ void qsort(
void * array,
size_t member_size,
int (*)(const void *, const void *));
-extern kern_return_t adjust_vm_object_cache(vm_size_t oval, vm_size_t nval);
__private_extern__ void vntblinit(void);
-__private_extern__ kern_return_t reset_vmobjectcache(unsigned int val1,
- unsigned int val2);
__private_extern__ int unlink1(vfs_context_t, vnode_t, user_addr_t,
enum uio_seg, int);
ragevnodes--; \
} while(0)
-
-/*
- * vnodetarget hasn't been used in a long time, but
- * it was exported for some reason... I'm leaving in
- * place for now... it should be deprecated out of the
- * exports and removed eventually.
- */
-u_int32_t vnodetarget; /* target for vnreclaim() */
-#define VNODE_FREE_TARGET 20 /* Default value for vnodetarget */
-
-/*
- * We need quite a few vnodes on the free list to sustain the
- * rapid stat() the compilation process does, and still benefit from the name
- * cache. Having too few vnodes on the free list causes serious disk
- * thrashing as we cycle through them.
- */
-#define VNODE_FREE_MIN CONFIG_VNODE_FREE_MIN /* freelist should have at least this many */
-
-
static void async_work_continue(void);
/*
TAILQ_INIT(&vnode_async_work_list);
TAILQ_INIT(&mountlist);
- if (!vnodetarget)
- vnodetarget = VNODE_FREE_TARGET;
-
microuptime(&rage_tv);
rage_limit = desiredvnodes / 100;
if (rage_limit < RAGE_LIMIT_MIN)
rage_limit = RAGE_LIMIT_MIN;
- /*
- * Scale the vm_object_cache to accomodate the vnodes
- * we want to cache
- */
- (void) adjust_vm_object_cache(0, desiredvnodes - VNODE_FREE_MIN);
-
/*
* create worker threads
*/
thread_deallocate(thread);
}
-/* Reset the VM Object Cache with the values passed in */
-__private_extern__ kern_return_t
-reset_vmobjectcache(unsigned int val1, unsigned int val2)
-{
- vm_size_t oval = val1 - VNODE_FREE_MIN;
- vm_size_t nval;
-
- if (val1 == val2) {
- return KERN_SUCCESS;
- }
-
- if(val2 < VNODE_FREE_MIN)
- nval = 0;
- else
- nval = val2 - VNODE_FREE_MIN;
-
- return(adjust_vm_object_cache(oval, nval));
-}
-
-
/* the timeout is in 10 msecs */
int
vnode_waitforwrites(vnode_t vp, int output_target, int slpflag, int slptimeout, const char *msg) {
void
vnode_iterate_setup(mount_t mp)
{
- while (mp->mnt_lflag & MNT_LITER) {
- mp->mnt_lflag |= MNT_LITERWAIT;
- msleep((caddr_t)mp, &mp->mnt_mlock, PVFS, "vnode_iterate_setup", NULL);
- }
-
mp->mnt_lflag |= MNT_LITER;
-
}
int
vnode_iterate_clear(mount_t mp)
{
mp->mnt_lflag &= ~MNT_LITER;
- if (mp->mnt_lflag & MNT_LITERWAIT) {
- mp->mnt_lflag &= ~MNT_LITERWAIT;
- wakeup(mp);
- }
}
+#if !CONFIG_EMBEDDED
#include <i386/panic_hooks.h>
static void vnode_iterate_panic_hook(panic_hook_t *hook_)
{
- extern int kdb_log(const char *fmt, ...);
struct vnode_iterate_panic_hook *hook = (struct vnode_iterate_panic_hook *)hook_;
panic_phys_range_t range;
uint64_t phys;
if (panic_phys_range_before(hook->mp, &phys, &range)) {
- kdb_log("mp = %p, phys = %p, prev (%p: %p-%p)\n",
+ paniclog_append_noflush("mp = %p, phys = %p, prev (%p: %p-%p)\n",
hook->mp, phys, range.type, range.phys_start,
range.phys_start + range.len);
} else {
- kdb_log("mp = %p, phys = %p, prev (!)\n", hook->mp, phys);
+ paniclog_append_noflush("mp = %p, phys = %p, prev (!)\n", hook->mp, phys);
}
if (panic_phys_range_before(hook->vp, &phys, &range)) {
- kdb_log("vp = %p, phys = %p, prev (%p: %p-%p)\n",
+ paniclog_append_noflush("vp = %p, phys = %p, prev (%p: %p-%p)\n",
hook->vp, phys, range.type, range.phys_start,
range.phys_start + range.len);
} else {
- kdb_log("vp = %p, phys = %p, prev (!)\n", hook->vp, phys);
+ paniclog_append_noflush("vp = %p, phys = %p, prev (!)\n", hook->vp, phys);
}
panic_dump_mem((void *)(((vm_offset_t)hook->mp -4096) & ~4095), 12288);
}
+#endif //CONFIG_EMBEDDED
int
vnode_iterate(mount_t mp, int flags, int (*callout)(struct vnode *, void *),
int vid, retval;
int ret = 0;
+ /*
+ * The mount iterate mutex is held for the duration of the iteration.
+ * This can be done by a state flag on the mount structure but we can
+ * run into priority inversion issues sometimes.
+ * Using a mutex allows us to benefit from the priority donation
+ * mechanisms in the kernel for locks. This mutex should never be
+ * acquired in spin mode and it should be acquired before attempting to
+ * acquire the mount lock.
+ */
+ mount_iterate_lock(mp);
+
mount_lock(mp);
vnode_iterate_setup(mp);
- /* it is returns 0 then there is nothing to do */
+ /* If it returns 0 then there is nothing to do */
retval = vnode_iterate_prepare(mp);
if (retval == 0) {
vnode_iterate_clear(mp);
mount_unlock(mp);
+ mount_iterate_unlock(mp);
return(ret);
}
+#if !CONFIG_EMBEDDED
struct vnode_iterate_panic_hook hook;
hook.mp = mp;
hook.vp = NULL;
panic_hook(&hook.hook, vnode_iterate_panic_hook);
+#endif
/* iterate over all the vnodes */
while (!TAILQ_EMPTY(&mp->mnt_workerqueue)) {
vp = TAILQ_FIRST(&mp->mnt_workerqueue);
+#if !CONFIG_EMBEDDED
hook.vp = vp;
+#endif
TAILQ_REMOVE(&mp->mnt_workerqueue, vp, v_mntvnodes);
TAILQ_INSERT_TAIL(&mp->mnt_vnodelist, vp, v_mntvnodes);
vid = vp->v_id;
}
out:
+#if !CONFIG_EMBEDDED
panic_unhook(&hook.hook);
+#endif
(void)vnode_iterate_reloadq(mp);
vnode_iterate_clear(mp);
mount_unlock(mp);
+ mount_iterate_unlock(mp);
return (ret);
}
lck_mtx_unlock(&mp->mnt_renamelock);
}
+/*
+ * Acquire the per-mount iteration mutex (mp->mnt_iter_lock).
+ *
+ * Callers in this file take this mutex before mount_lock() and hold it for
+ * the whole vnode iteration; release it with mount_iterate_unlock().
+ */
+void
+mount_iterate_lock(mount_t mp)
+{
+ lck_mtx_lock(&mp->mnt_iter_lock);
+}
+
+/*
+ * Release the per-mount iteration mutex (mp->mnt_iter_lock) previously
+ * taken with mount_iterate_lock().
+ *
+ * Callers in this file drop it after mount_unlock().
+ */
+void
+mount_iterate_unlock(mount_t mp)
+{
+ lck_mtx_unlock(&mp->mnt_iter_lock);
+}
+
void
mount_lock(mount_t mp)
{
return (ENOMEM);
}
+#define DBG_MOUNTROOT (FSDBG_CODE(DBG_MOUNT, 0))
/*
* Find an appropriate filesystem to use for the root. If a filesystem
mount_t mp;
vnode_t bdevvp_rootvp;
+ KDBG_RELEASE(DBG_MOUNTROOT | DBG_FUNC_START);
if (mountroot != NULL) {
/*
* used for netboot which follows a different set of rules
*/
error = (*mountroot)();
+
+ KDBG_RELEASE(DBG_MOUNTROOT | DBG_FUNC_END, error, 0);
return (error);
}
if ((error = bdevvp(rootdev, &rootvp))) {
printf("vfs_mountroot: can't setup bdevvp\n");
+
+ KDBG_RELEASE(DBG_MOUNTROOT | DBG_FUNC_END, error, 1);
return (error);
}
/*
bdevvp_rootvp = rootvp;
for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
- if (vfsp->vfc_mountroot == NULL)
+ if (vfsp->vfc_mountroot == NULL
+ && !ISSET(vfsp->vfc_vfsflags, VFC_VFSCANMOUNTROOT)) {
continue;
+ }
mp = vfs_rootmountalloc_internal(vfsp, "root_device");
mp->mnt_devvp = rootvp;
- if ((error = (*vfsp->vfc_mountroot)(mp, rootvp, ctx)) == 0) {
+ if (vfsp->vfc_mountroot)
+ error = (*vfsp->vfc_mountroot)(mp, rootvp, ctx);
+ else
+ error = VFS_MOUNT(mp, rootvp, 0, ctx);
+
+ if (!error) {
if ( bdevvp_rootvp != rootvp ) {
/*
* rootvp changed...
*/
vfs_init_io_attributes(rootvp, mp);
- if ((mp->mnt_ioflags & MNT_IOFLAGS_FUSION_DRIVE) &&
- (mp->mnt_ioflags & MNT_IOFLAGS_IOSCHED_SUPPORTED)) {
- /*
- * only for CF
- */
+ if (mp->mnt_ioflags & MNT_IOFLAGS_FUSION_DRIVE) {
root_is_CF_drive = TRUE;
}
+
/*
* Shadow the VFC_VFSNATIVEXATTR flag to MNTK_EXTENDED_ATTRS.
*/
mp->mnt_kern_flag |= MNTK_UNMOUNT_PREFLIGHT;
}
+#if !CONFIG_EMBEDDED
uint32_t speed;
- if (MNTK_VIRTUALDEV & mp->mnt_kern_flag) speed = 128;
- else if (MNTK_SSD & mp->mnt_kern_flag) speed = 7*256;
- else speed = 256;
+ if (MNTK_VIRTUALDEV & mp->mnt_kern_flag) speed = 128;
+ else if (disk_conditioner_mount_is_ssd(mp)) speed = 7*256;
+ else speed = 256;
vc_progress_setdiskspeed(speed);
+#endif
/*
* Probe root file system for additional features.
*/
(vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID)) {
mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
}
+
+ if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS) &&
+ (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS)) {
+ mp->mnt_kern_flag |= MNTK_DIR_HARDLINKS;
+ }
}
/*
vnode_put(rootvp);
#if CONFIG_MACF
- if ((vfs_flags(mp) & MNT_MULTILABEL) == 0)
+ if ((vfs_flags(mp) & MNT_MULTILABEL) == 0) {
+ KDBG_RELEASE(DBG_MOUNTROOT | DBG_FUNC_END, 0, 2);
return (0);
+ }
error = VFS_ROOT(mp, &vp, ctx);
if (error) {
goto fail;
}
#endif
+ KDBG_RELEASE(DBG_MOUNTROOT | DBG_FUNC_END, 0, 3);
return (0);
}
#if CONFIG_MACF
fail:
#endif
vfs_rootmountfailed(mp);
-
+
if (error != EINVAL)
printf("%s_mountroot failed: %d\n", vfsp->vfc_name, error);
}
+ KDBG_RELEASE(DBG_MOUNTROOT | DBG_FUNC_END, error ? error : ENODEV, 4);
return (ENODEV);
}
fsid_t tfsid;
int mtype;
- mount_t nmp;
mount_list_lock();
tfsid.val[0] = makedev(nblkdev + mtype, mntid_gen);
tfsid.val[1] = mtype;
- TAILQ_FOREACH(nmp, &mountlist, mnt_list) {
- while (vfs_getvfs_locked(&tfsid)) {
- if (++mntid_gen == 0)
- mntid_gen++;
- tfsid.val[0] = makedev(nblkdev + mtype, mntid_gen);
- }
+ while (vfs_getvfs_locked(&tfsid)) {
+ if (++mntid_gen == 0)
+ mntid_gen++;
+ tfsid.val[0] = makedev(nblkdev + mtype, mntid_gen);
}
+
mp->mnt_vfsstat.f_fsid.val[0] = tfsid.val[0];
mp->mnt_vfsstat.f_fsid.val[1] = tfsid.val[1];
mount_list_unlock();
return (0);
}
-
/*
* Check to see if the new vnode represents a special device
* for which we already have a vnode (either because of
int retval;
unsigned int vid;
+ /*
+ * See comments in vnode_iterate() for the rationale for this lock
+ */
+ mount_iterate_lock(mp);
+
mount_lock(mp);
vnode_iterate_setup(mp);
/*
if (vnode_umount_preflight(mp, skipvp, flags)) {
vnode_iterate_clear(mp);
mount_unlock(mp);
+ mount_iterate_unlock(mp);
return(EBUSY);
}
}
loop:
- /* it is returns 0 then there is nothing to do */
+ /* If it returns 0 then there is nothing to do */
retval = vnode_iterate_prepare(mp);
if (retval == 0) {
vnode_iterate_clear(mp);
mount_unlock(mp);
+ mount_iterate_unlock(mp);
return(retval);
}
}
vnode_iterate_clear(mp);
mount_unlock(mp);
+ mount_iterate_unlock(mp);
if (busy && ((flags & FORCECLOSE)==0))
return (EBUSY);
int count;
int vid;
+ if (!vnode_isspec(vp)) {
+ return (vp->v_usecount - vp->v_kusecount);
+ }
+
loop:
if (!vnode_isaliased(vp))
return (vp->v_specinfo->si_opencount);
return build_path(vp, pathbuf, *len, len, 0, vfs_context_current());
}
+/*
+ * vn_getpath_fsenter_with_parent will reenter the file system to find the path of the
+ * vnode. It requires that there are IO counts on both the vnode and the directory vnode.
+ *
+ * vn_getpath_fsenter is called by MAC hooks to authorize operations for everything but
+ * unlink, rmdir and rename. For these operations the MAC hook calls vn_getpath. This presents
+ * problems where if the path cannot be found from the name cache, those operations can
+ * erroneously fail with EPERM even though the call should succeed. When removing or moving
+ * file system objects with operations such as unlink or rename, those operations need to
+ * take IO counts on the target and containing directory. Calling vn_getpath_fsenter from a
+ * MAC hook from these operations during forced unmount operations can lead to
+ * deadlock. This happens when the operation starts, IO counts are taken on the containing
+ * directories and targets. Before the MAC hook is called a forced unmount from another
+ * thread takes place and blocks on the ongoing operation's directory vnode in vdrain.
+ * After which, the MAC hook gets called and calls vn_getpath_fsenter. vn_getpath_fsenter
+ * is called with the understanding that there is an IO count on the target. If in
+ * build_path the directory vnode is no longer in the cache, then the parent object id is
+ * obtained via vnode_getattr from the target and used to call VFS_VGET to get the parent
+ * vnode. The file system's VFS_VGET then looks up by inode in its hash and tries to get
+ * an IO count. But VFS_VGET "sees" the directory vnode is in vdrain and can block
+ * depending on which version and how it calls the vnode_get family of interfaces.
+ *
+ * N.B. A reasonable interface to use is vnode_getwithvid. This interface was modified to
+ * call vnode_getiocount with VNODE_DRAINO, so it will happily get an IO count and not
+ * cause issues, but there is no guarantee that all or any file systems are doing that.
+ *
+ * vn_getpath_fsenter_with_parent can enter the file system safely since there is a known
+ * IO count on the directory vnode by calling build_path_with_parent.
+ */
+
+int
+vn_getpath_fsenter_with_parent(struct vnode *dvp, struct vnode *vp, char *pathbuf, int *len)
+{
+ /*
+ * Note the argument order: this function takes (dvp, vp) but
+ * build_path_with_parent() is called with the target vnode first and
+ * the directory vnode second. *len is passed by value as the buffer
+ * length and len itself is passed as a pointer, presumably so the
+ * resulting path length can be written back -- confirm against
+ * build_path_with_parent().
+ */
+ return build_path_with_parent(vp, dvp, pathbuf, *len, len, 0, vfs_context_current());
+}
+
int
vn_getcdhash(struct vnode *vp, off_t offset, unsigned char *cdhash)
{
}
-__private_extern__ int
-is_package_name(const char *name, int len)
+int is_package_name(const char *name, int len)
{
int i, extlen;
const char *ptr, *name_ext;
case VFS_CTL_DISC:
case VFS_CTL_SERVERINFO:
return 1;
- break;
default:
break;
case AFPFS_VFS_CTL_NETCHANGE:
case AFPFS_VFS_CTL_VOLCHANGE:
return 1;
- break;
}
}
off_t readsegsize = 0;
off_t writesegsize = 0;
off_t alignment = 0;
+ u_int32_t minsaturationbytecount = 0;
u_int32_t ioqueue_depth = 0;
u_int32_t blksize;
u_int64_t temp;
temp = MNT_DEFAULT_IOQUEUE_DEPTH;
mp->mnt_ioqueue_depth = temp;
- mp->mnt_ioscale = (mp->mnt_ioqueue_depth + (MNT_DEFAULT_IOQUEUE_DEPTH - 1)) / MNT_DEFAULT_IOQUEUE_DEPTH;
+ mp->mnt_ioscale = MNT_IOSCALE(mp->mnt_ioqueue_depth);
if (mp->mnt_ioscale > 1)
printf("ioqueue_depth = %d, ioscale = %d\n", (int)mp->mnt_ioqueue_depth, (int)mp->mnt_ioscale);
if (features & DK_FEATURE_FORCE_UNIT_ACCESS)
mp->mnt_ioflags |= MNT_IOFLAGS_FUA_SUPPORTED;
+
+ if (VNOP_IOCTL(devvp, DKIOCGETIOMINSATURATIONBYTECOUNT, (caddr_t)&minsaturationbytecount, 0, ctx) == 0) {
+ mp->mnt_minsaturationbytecount = minsaturationbytecount;
+ } else {
+ mp->mnt_minsaturationbytecount = 0;
+ }
if (VNOP_IOCTL(devvp, DKIOCCORESTORAGE, (caddr_t)&cs_info, 0, ctx) == 0)
cs_present = TRUE;
*/
if ((cs_info.flags & DK_CORESTORAGE_PIN_YOUR_METADATA))
mp->mnt_ioflags |= MNT_IOFLAGS_FUSION_DRIVE;
+ } else {
+ /* Check for APFS Fusion */
+ dk_apfs_flavour_t flavour;
+ if ((VNOP_IOCTL(devvp, DKIOCGETAPFSFLAVOUR, (caddr_t)&flavour, 0, ctx) == 0) &&
+ (flavour == DK_APFS_FUSION)) {
+ mp->mnt_ioflags |= MNT_IOFLAGS_FUSION_DRIVE;
+ }
}
#if CONFIG_IOSCHED
if (space < req->oldlen)
return (ENOMEM);
- MALLOC(fsidlst, fsid_t *, req->oldlen, M_TEMP, M_WAITOK);
+ MALLOC(fsidlst, fsid_t *, req->oldlen, M_TEMP, M_WAITOK | M_ZERO);
if (fsidlst == NULL) {
return (ENOMEM);
}
sfs.f_ffree = (user64_long_t)sp->f_ffree;
sfs.f_fsid = sp->f_fsid;
sfs.f_owner = sp->f_owner;
-
+#ifdef NFSCLIENT
if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
- strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSTYPENAMELEN);
- } else {
+ strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSNAMELEN);
+ } else
+#endif
+ {
strlcpy(sfs.f_fstypename, sp->f_fstypename, MFSNAMELEN);
}
strlcpy(sfs.f_mntonname, sp->f_mntonname, MNAMELEN);
sfs.f_ffree = (user32_long_t)sp->f_ffree;
sfs.f_fsid = sp->f_fsid;
sfs.f_owner = sp->f_owner;
-
+
+#ifdef NFSCLIENT
if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
- strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSTYPENAMELEN);
- } else {
+ strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSNAMELEN);
+ } else
+#endif
+ {
strlcpy(sfs.f_fstypename, sp->f_fstypename, MFSNAMELEN);
}
strlcpy(sfs.f_mntonname, sp->f_mntonname, MNAMELEN);
return (error);
}
-static int filt_fsattach(struct knote *kn);
+static int filt_fsattach(struct knote *kn, struct kevent_internal_s *kev);
static void filt_fsdetach(struct knote *kn);
static int filt_fsevent(struct knote *kn, long hint);
-struct filterops fs_filtops = {
- .f_attach = filt_fsattach,
- .f_detach = filt_fsdetach,
- .f_event = filt_fsevent,
+static int filt_fstouch(struct knote *kn, struct kevent_internal_s *kev);
+static int filt_fsprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev);
+SECURITY_READ_ONLY_EARLY(struct filterops) fs_filtops = {
+ .f_attach = filt_fsattach,
+ .f_detach = filt_fsdetach,
+ .f_event = filt_fsevent,
+ .f_touch = filt_fstouch,
+ .f_process = filt_fsprocess,
};
static int
-filt_fsattach(struct knote *kn)
+filt_fsattach(struct knote *kn, __unused struct kevent_internal_s *kev)
{
-
lck_mtx_lock(fs_klist_lock);
- kn->kn_flags |= EV_CLEAR;
KNOTE_ATTACH(&fs_klist, kn);
lck_mtx_unlock(fs_klist_lock);
+
+ /*
+ * filter only sees future events,
+ * so it can't be fired already.
+ */
return (0);
}
return (kn->kn_fflags != 0);
}
+/*
+ * Touch handler for the fs filter.
+ *
+ * Under fs_klist_lock, stores the caller-supplied kev->fflags into
+ * kn->kn_sfflags and reports whether the knote already has event bits
+ * recorded in kn_fflags.
+ *
+ * Returns non-zero if kn_fflags is non-zero, 0 otherwise.
+ */
+static int
+filt_fstouch(struct knote *kn, struct kevent_internal_s *kev)
+{
+ int res;
+
+ lck_mtx_lock(fs_klist_lock);
+
+ kn->kn_sfflags = kev->fflags;
+
+ /*
+ * the above filter function sets bits even if nobody is looking for them.
+ * Just preserve those bits even if the new mask is more selective
+ * than before.
+ *
+ * For compatibility with previous implementations, we leave kn_fflags
+ * as they were before.
+ */
+ //if (kn->kn_sfflags)
+ // kn->kn_fflags &= kn->kn_sfflags;
+ res = (kn->kn_fflags != 0);
+
+ lck_mtx_unlock(fs_klist_lock);
+
+ return res;
+}
+
+/*
+ * Process handler for the fs filter.
+ *
+ * Under fs_klist_lock, checks whether any event bits are recorded in
+ * kn_fflags. If so, the knote's kevent is copied out to *kev and the
+ * knote's kn_fflags and kn_data are reset to 0. The data argument is
+ * unused.
+ *
+ * Returns non-zero if an event was copied out, 0 otherwise.
+ */
+static int
+filt_fsprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev)
+{
+#pragma unused(data)
+ int res;
+
+ lck_mtx_lock(fs_klist_lock);
+ res = (kn->kn_fflags != 0);
+ if (res) {
+ /* copy out first, so *kev carries the pending state rather than the reset state */
+ *kev = kn->kn_kevent;
+ kn->kn_flags |= EV_CLEAR; /* automatic */
+ kn->kn_fflags = 0;
+ kn->kn_data = 0;
+ }
+ lck_mtx_unlock(fs_klist_lock);
+ return res;
+}
+
static int
sysctl_vfs_noremotehang(__unused struct sysctl_oid *oidp,
__unused void *arg1, __unused int arg2, struct sysctl_req *req)
{
int *name, namelen;
struct vfstable *vfsp;
- struct vfsconf vfsc;
+ struct vfsconf vfsc = {};
(void)oidp;
name = arg1;
SYSCTL_INT(_vfs_generic, VFS_MAXTYPENUM, maxtypenum,
CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED,
&maxvfstypenum, 0, "");
-SYSCTL_INT(_vfs_generic, OID_AUTO, sync_timeout, CTLFLAG_RW | CTLFLAG_LOCKED, &sync_timeout, 0, "");
+SYSCTL_INT(_vfs_generic, OID_AUTO, sync_timeout, CTLFLAG_RW | CTLFLAG_LOCKED, &sync_timeout_seconds, 0, "");
SYSCTL_NODE(_vfs_generic, VFS_CONF, conf,
CTLFLAG_RD | CTLFLAG_LOCKED,
sysctl_vfs_generic_conf, "");
+/* Indicate that the root file system unmounted cleanly */
+static int vfs_root_unmounted_cleanly = 0;
+SYSCTL_INT(_vfs_generic, OID_AUTO, root_unmounted_cleanly, CTLFLAG_RD, &vfs_root_unmounted_cleanly, 0, "Root filesystem was unmounted cleanly");
+
+/*
+ * Record that the root file system was unmounted cleanly by setting
+ * vfs_root_unmounted_cleanly to 1. That variable is exported read-only
+ * through the root_unmounted_cleanly sysctl under _vfs_generic.
+ * Nothing in this function resets the flag.
+ */
+void
+vfs_set_root_unmounted_cleanly(void)
+{
+ vfs_root_unmounted_cleanly = 1;
+}
+
/*
* Print vnode state.
*/
panic("new_vnode(%p): free vnode still referenced", vp);
if ((vp->v_mntvnodes.tqe_prev != 0) && (vp->v_mntvnodes.tqe_next != 0))
panic("new_vnode(%p): vnode seems to be on mount list", vp);
- if ( !LIST_EMPTY(&vp->v_nclinks) || !LIST_EMPTY(&vp->v_ncchildren))
+ if ( !LIST_EMPTY(&vp->v_nclinks) || !TAILQ_EMPTY(&vp->v_ncchildren))
panic("new_vnode(%p): vnode still hooked into the name cache", vp);
} else {
vnode_unlock(vp);
return (vp);
}
-
-
+__attribute__((noreturn))
static void
async_work_continue(void)
{
VLISTNONE(vp); /* avoid double queue removal */
lck_mtx_init(&vp->v_lock, vnode_lck_grp, vnode_lck_attr);
+ TAILQ_INIT(&vp->v_ncchildren);
+
klist_init(&vp->v_knotes);
nanouptime(&ts);
vp->v_id = ts.tv_nsec;
return(vnode_isinuse_locked(vp, refcnt, 0));
}
+/*
+ * Return the vnode's current v_usecount.
+ *
+ * No lock is taken here, so the value is only a snapshot.
+ * NOTE(review): callers needing a stable value presumably must hold the
+ * vnode lock -- confirm.
+ */
+int vnode_usecount(vnode_t vp)
+{
+ return vp->v_usecount;
+}
+
+/*
+ * Return the vnode's current v_iocount.
+ *
+ * No lock is taken here, so the value is only a snapshot.
+ * NOTE(review): callers needing a stable value presumably must hold the
+ * vnode lock -- confirm.
+ */
+int vnode_iocount(vnode_t vp)
+{
+ return vp->v_iocount;
+}
static int
vnode_isinuse_locked(vnode_t vp, int refcnt, int locked)
ut = get_bsdthread_info(current_thread());
if ((current_proc()->p_lflag & P_LRAGE_VNODES) ||
- (ut->uu_flag & UT_RAGE_VNODES)) {
+ (ut->uu_flag & (UT_RAGE_VNODES | UT_KERN_RAGE_VNODES))) {
/*
* process has indicated that it wants any
* vnodes created on its behalf to be rapidly
* aged to reduce the impact on the cached set
* of vnodes
+ *
+ * if UT_KERN_RAGE_VNODES is set, then the
+ * kernel internally wants vnodes to be rapidly
+ * aged, even if the process hasn't requested
+ * this
*/
vp->v_flag |= VRAGE;
}
+
+#if CONFIG_SECLUDED_MEMORY
+ switch (secluded_for_filecache) {
+ case 0:
+ /*
+ * secluded_for_filecache == 0:
+ * + no file contents in secluded pool
+ */
+ break;
+ case 1:
+ /*
+ * secluded_for_filecache == 1:
+ * + no files from /
+ * + files from /Applications/ are OK
+ * + files from /Applications/Camera are not OK
+ * + no files that are open for write
+ */
+ if (vnode_vtype(vp) == VREG &&
+ vnode_mount(vp) != NULL &&
+ (! (vfs_flags(vnode_mount(vp)) & MNT_ROOTFS))) {
+ /* not from root filesystem: eligible for secluded pages */
+ memory_object_mark_eligible_for_secluded(
+ ubc_getobject(vp, UBC_FLAGS_NONE),
+ TRUE);
+ }
+ break;
+ case 2:
+ /*
+ * secluded_for_filecache == 2:
+ * + all read-only files OK, except:
+ * + dyld_shared_cache_arm64*
+ * + Camera
+ * + mediaserverd
+ */
+ if (vnode_vtype(vp) == VREG) {
+ memory_object_mark_eligible_for_secluded(
+ ubc_getobject(vp, UBC_FLAGS_NONE),
+ TRUE);
+ }
+ break;
+ default:
+ break;
+ }
+#endif /* CONFIG_SECLUDED_MEMORY */
+
return (0);
error_out:
VFSATTR_WANTED(&va, f_ffree);
VFSATTR_WANTED(&va, f_bsize);
VFSATTR_WANTED(&va, f_fssubtype);
+
+ if ((error = vfs_getattr(mp, &va, ctx)) != 0) {
+ KAUTH_DEBUG("STAT - filesystem returned error %d", error);
+ return(error);
+ }
#if CONFIG_MACF
if (eventtype == VFS_USER_EVENT) {
error = mac_mount_check_getattr(ctx, mp, &va);
return (error);
}
#endif
-
- if ((error = vfs_getattr(mp, &va, ctx)) != 0) {
- KAUTH_DEBUG("STAT - filesystem returned error %d", error);
- return(error);
- }
-
/*
* Unpack into the per-mount structure.
*
boolean_t batched;
struct componentname *cnp;
uint32_t defaulted;
- uint32_t dfflags; // Directory file flags
cnp = &ndp->ni_cnd;
error = 0;
panic("Mode for open, but not trying to open...");
}
- /*
- * Handle inheritance of restricted flag
- */
- error = vnode_flags(dvp, &dfflags, ctx);
- if (error)
- return error;
- if (dfflags & SF_RESTRICTED)
- VATTR_SET(vap, va_flags, SF_RESTRICTED);
/*
* Create the requested node.
if (!batched) {
*vpp = (vnode_t) 0;
vnode_put(vp);
+ vp = NULLVP;
}
}
+ /*
+ * For creation VNOPs, this is the equivalent of
+ * lookup_handle_found_vnode.
+ */
+ if (kdebug_enable && *vpp)
+ kdebug_lookup(*vpp, cnp);
+
out:
vn_attribute_cleanup(vap, defaulted);
static kauth_scope_t vnode_scope;
static int vnode_authorize_callback(kauth_cred_t credential, void *idata, kauth_action_t action,
uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3);
-static int vnode_authorize_callback_int(__unused kauth_cred_t credential, __unused void *idata, kauth_action_t action,
- uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3);
+static int vnode_authorize_callback_int(kauth_action_t action, vfs_context_t ctx,
+ vnode_t vp, vnode_t dvp, int *errorp);
typedef struct _vnode_authorize_context {
vnode_t vp;
#define _VAC_IN_GROUP (1<<1)
#define _VAC_IS_DIR_OWNER (1<<2)
#define _VAC_IN_DIR_GROUP (1<<3)
+#define _VAC_NO_VNODE_POINTERS (1<<4)
} *vauth_ctx;
void
* However, some file systems may have limited support.
*/
if ((vp->v_type == VDIR) &&
- !(vp->v_mount->mnt_vtable->vfc_vfsflags & VFC_VFSDIRLINKS)) {
+ !(vp->v_mount->mnt_kern_flag & MNTK_DIR_HARDLINKS)) {
return (EPERM); /* POSIX */
}
return (vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx));
}
-int
+int
vn_authorize_rename(struct vnode *fdvp, struct vnode *fvp, struct componentname *fcnp,
- struct vnode *tdvp, struct vnode *tvp, struct componentname *tcnp,
- vfs_context_t ctx, void *reserved)
+ struct vnode *tdvp, struct vnode *tvp, struct componentname *tcnp,
+ vfs_context_t ctx, void *reserved)
+{
+ return vn_authorize_renamex(fdvp, fvp, fcnp, tdvp, tvp, tcnp, ctx, 0, reserved);
+}
+
+/*
+ * Authorize a rename with rename flags (e.g. VFS_RENAME_SWAP).
+ *
+ * Forwards to vn_authorize_renamex_with_paths() with NULL for both the
+ * "from" and "to" paths. That function only posts
+ * KAUTH_FILEOP_WILL_RENAME notifications when the relevant path is
+ * non-NULL, so none are posted via this entry point.
+ *
+ * Returns the result of vn_authorize_renamex_with_paths().
+ */
+int
+vn_authorize_renamex(struct vnode *fdvp, struct vnode *fvp, struct componentname *fcnp,
+ struct vnode *tdvp, struct vnode *tvp, struct componentname *tcnp,
+ vfs_context_t ctx, vfs_rename_flags_t flags, void *reserved)
+{
+
+ return vn_authorize_renamex_with_paths(fdvp, fvp, fcnp, NULL, tdvp, tvp, tcnp, NULL, ctx, flags, reserved);
+}
+
+int
+vn_authorize_renamex_with_paths(struct vnode *fdvp, struct vnode *fvp, struct componentname *fcnp, const char *from_path,
+ struct vnode *tdvp, struct vnode *tvp, struct componentname *tcnp, const char *to_path,
+ vfs_context_t ctx, vfs_rename_flags_t flags, void *reserved)
{
int error = 0;
int moving = 0;
+ bool swap = flags & VFS_RENAME_SWAP;
if (reserved != NULL) {
panic("Passed something other than NULL as reserved field!");
error = mac_vnode_check_rename(ctx, fdvp, fvp, fcnp, tdvp, tvp, tcnp);
if (error)
goto out;
+ if (swap) {
+ error = mac_vnode_check_rename(ctx, tdvp, tvp, tcnp, fdvp, fvp, fcnp);
+ if (error)
+ goto out;
+ }
#endif
/***** </MACF> *****/
/***** <MiscChecks> *****/
if (tvp != NULL) {
- if (fvp->v_type == VDIR && tvp->v_type != VDIR) {
- error = ENOTDIR;
- goto out;
- } else if (fvp->v_type != VDIR && tvp->v_type == VDIR) {
- error = EISDIR;
- goto out;
+ if (!swap) {
+ if (fvp->v_type == VDIR && tvp->v_type != VDIR) {
+ error = ENOTDIR;
+ goto out;
+ } else if (fvp->v_type != VDIR && tvp->v_type == VDIR) {
+ error = EISDIR;
+ goto out;
+ }
}
+ } else if (swap) {
+ /*
+ * Caller should have already checked this and returned
+ * ENOENT. If we send back ENOENT here, caller will retry
+ * which isn't what we want so we send back EINVAL here
+ * instead.
+ */
+ error = EINVAL;
+ goto out;
}
if (fvp == tdvp) {
error = EINVAL;
goto out;
}
- /***** </MiscChecks> *****/
-
- /***** <Kauth> *****/
- error = 0;
- if ((tvp != NULL) && vnode_isdir(tvp)) {
- if (tvp != fdvp)
- moving = 1;
- } else if (tdvp != fdvp) {
- moving = 1;
+ if (swap && fdvp->v_parent == tvp) {
+ error = EINVAL;
+ goto out;
}
+ /***** </MiscChecks> *****/
+ /***** <Kauth> *****/
/*
- * must have delete rights to remove the old name even in
- * the simple case of fdvp == tdvp.
+ * As part of the Kauth step, we call out to allow 3rd-party
+ * fileop notification of "about to rename". This is needed
+ * in the event that 3rd-parties need to know that the DELETE
+ * authorization is actually part of a rename. It's important
+ * that we guarantee that the DELETE call-out will always be
+ * made if the WILL_RENAME call-out is made. Another fileop
+ * call-out will be performed once the operation is completed.
+ * We can ignore the result of kauth_authorize_fileop().
*
- * If fvp is a directory, and we are changing it's parent,
- * then we also need rights to rewrite its ".." entry as well.
+ * N.B. We are passing the vnode and *both* paths to each
+ * call; kauth_authorize_fileop() extracts the "from" path
+ * when posting a KAUTH_FILEOP_WILL_RENAME notification.
+ * As such, we only post these notifications if all of the
+ * information we need is provided.
*/
- if (vnode_isdir(fvp)) {
- if ((error = vnode_authorize(fvp, fdvp, KAUTH_VNODE_DELETE | KAUTH_VNODE_ADD_SUBDIRECTORY, ctx)) != 0)
- goto out;
- } else {
- if ((error = vnode_authorize(fvp, fdvp, KAUTH_VNODE_DELETE, ctx)) != 0)
+
+ if (swap) {
+ kauth_action_t f = 0, t = 0;
+
+ /*
+ * Directories changing parents need ...ADD_SUBDIR... to
+ * permit changing ".."
+ */
+ if (fdvp != tdvp) {
+ if (vnode_isdir(fvp))
+ f = KAUTH_VNODE_ADD_SUBDIRECTORY;
+ if (vnode_isdir(tvp))
+ t = KAUTH_VNODE_ADD_SUBDIRECTORY;
+ }
+ if (to_path != NULL)
+ kauth_authorize_fileop(vfs_context_ucred(ctx),
+ KAUTH_FILEOP_WILL_RENAME,
+ (uintptr_t)fvp,
+ (uintptr_t)to_path);
+ error = vnode_authorize(fvp, fdvp, KAUTH_VNODE_DELETE | f, ctx);
+ if (error)
goto out;
- }
- if (moving) {
- /* moving into tdvp or tvp, must have rights to add */
- if ((error = vnode_authorize(((tvp != NULL) && vnode_isdir(tvp)) ? tvp : tdvp,
- NULL,
- vnode_isdir(fvp) ? KAUTH_VNODE_ADD_SUBDIRECTORY : KAUTH_VNODE_ADD_FILE,
- ctx)) != 0) {
+ if (from_path != NULL)
+ kauth_authorize_fileop(vfs_context_ucred(ctx),
+ KAUTH_FILEOP_WILL_RENAME,
+ (uintptr_t)tvp,
+ (uintptr_t)from_path);
+ error = vnode_authorize(tvp, tdvp, KAUTH_VNODE_DELETE | t, ctx);
+ if (error)
goto out;
+ f = vnode_isdir(fvp) ? KAUTH_VNODE_ADD_SUBDIRECTORY : KAUTH_VNODE_ADD_FILE;
+ t = vnode_isdir(tvp) ? KAUTH_VNODE_ADD_SUBDIRECTORY : KAUTH_VNODE_ADD_FILE;
+ if (fdvp == tdvp)
+ error = vnode_authorize(fdvp, NULL, f | t, ctx);
+ else {
+ error = vnode_authorize(fdvp, NULL, t, ctx);
+ if (error)
+ goto out;
+ error = vnode_authorize(tdvp, NULL, f, ctx);
}
+ if (error)
+ goto out;
} else {
- /* node staying in same directory, must be allowed to add new name */
- if ((error = vnode_authorize(fdvp, NULL,
- vnode_isdir(fvp) ? KAUTH_VNODE_ADD_SUBDIRECTORY : KAUTH_VNODE_ADD_FILE, ctx)) != 0)
+ error = 0;
+ if ((tvp != NULL) && vnode_isdir(tvp)) {
+ if (tvp != fdvp)
+ moving = 1;
+ } else if (tdvp != fdvp) {
+ moving = 1;
+ }
+
+ /*
+ * must have delete rights to remove the old name even in
+ * the simple case of fdvp == tdvp.
+ *
+ * If fvp is a directory, and we are changing it's parent,
+ * then we also need rights to rewrite its ".." entry as well.
+ */
+ if (to_path != NULL)
+ kauth_authorize_fileop(vfs_context_ucred(ctx),
+ KAUTH_FILEOP_WILL_RENAME,
+ (uintptr_t)fvp,
+ (uintptr_t)to_path);
+ if (vnode_isdir(fvp)) {
+ if ((error = vnode_authorize(fvp, fdvp, KAUTH_VNODE_DELETE | KAUTH_VNODE_ADD_SUBDIRECTORY, ctx)) != 0)
+ goto out;
+ } else {
+ if ((error = vnode_authorize(fvp, fdvp, KAUTH_VNODE_DELETE, ctx)) != 0)
+ goto out;
+ }
+ if (moving) {
+ /* moving into tdvp or tvp, must have rights to add */
+ if ((error = vnode_authorize(((tvp != NULL) && vnode_isdir(tvp)) ? tvp : tdvp,
+ NULL,
+ vnode_isdir(fvp) ? KAUTH_VNODE_ADD_SUBDIRECTORY : KAUTH_VNODE_ADD_FILE,
+ ctx)) != 0) {
+ goto out;
+ }
+ } else {
+ /* node staying in same directory, must be allowed to add new name */
+ if ((error = vnode_authorize(fdvp, NULL,
+ vnode_isdir(fvp) ? KAUTH_VNODE_ADD_SUBDIRECTORY : KAUTH_VNODE_ADD_FILE, ctx)) != 0)
+ goto out;
+ }
+ /* overwriting tvp */
+ if ((tvp != NULL) && !vnode_isdir(tvp) &&
+ ((error = vnode_authorize(tvp, tdvp, KAUTH_VNODE_DELETE, ctx)) != 0)) {
goto out;
- }
- /* overwriting tvp */
- if ((tvp != NULL) && !vnode_isdir(tvp) &&
- ((error = vnode_authorize(tvp, tdvp, KAUTH_VNODE_DELETE, ctx)) != 0)) {
- goto out;
+ }
}
/***** </Kauth> *****/
}
/*
- * Authorize an operation on a vnode.
+ * Authorizer for directory cloning. This does not use vnodes but instead
+ * uses prefilled vnode attributes from the filesystem.
*
- * This is KPI, but here because it needs vnode_scope.
- *
- * Returns: 0 Success
- * kauth_authorize_action:EPERM ...
- * xlate => EACCES Permission denied
- * kauth_authorize_action:0 Success
- * kauth_authorize_action: Depends on callback return; this is
- * usually only vnode_authorize_callback(),
- * but may include other listerners, if any
- * exist.
- * EROFS
- * EACCES
- * EPERM
- * ???
+ * The same function is called to set up the attributes required, perform the
+ * authorization and cleanup (if required)
*/
int
-vnode_authorize(vnode_t vp, vnode_t dvp, kauth_action_t action, vfs_context_t ctx)
+vnode_attr_authorize_dir_clone(struct vnode_attr *vap, kauth_action_t action,
+    struct vnode_attr *dvap, __unused vnode_t sdvp, mount_t mp,
+    dir_clone_authorizer_op_t vattr_op, uint32_t flags, vfs_context_t ctx,
+    __unused void *reserved)
{
- int error, result;
+	int error;
+	int is_suser = vfs_context_issuser(ctx);
+
+	/*
+	 * vattr_op selects the phase:
+	 *   OP_VATTR_SETUP   - mark the attributes the caller has to fetch
+	 *   OP_VATTR_CLEANUP - currently a no-op
+	 *   anything else    - authorize 'action' against vap, then fill in
+	 *                      the owner, group and flags of the new object.
+	 *
+	 * dvap and mp may be NULL; every use of them below is guarded.
+	 * Returns 0 on success or the error from vnode_attr_authorize().
+	 */
+	if (vattr_op == OP_VATTR_SETUP) {
+		VATTR_INIT(vap);
+
+		/*
+		 * When ACL inheritance is implemented, both vap->va_acl and
+		 * dvap->va_acl will be required (even as superuser).
+		 */
+		VATTR_WANTED(vap, va_type);
+		VATTR_WANTED(vap, va_mode);
+		VATTR_WANTED(vap, va_flags);
+		VATTR_WANTED(vap, va_uid);
+		VATTR_WANTED(vap, va_gid);
+		if (dvap) {
+			VATTR_INIT(dvap);
+			VATTR_WANTED(dvap, va_flags);
+		}
+
+		if (!is_suser) {
+			/*
+			 * If not superuser, we have to evaluate ACLs and
+			 * need the target directory gid to set the initial
+			 * gid of the new object.
+			 */
+			VATTR_WANTED(vap, va_acl);
+			if (dvap)
+				VATTR_WANTED(dvap, va_gid);
+		} else if (dvap && (flags & VNODE_CLONEFILE_NOOWNERCOPY)) {
+			VATTR_WANTED(dvap, va_gid);
+		}
+		return (0);
+	} else if (vattr_op == OP_VATTR_CLEANUP) {
+		return (0); /* Nothing to do for now */
+	}
+
+	/* dvap isn't used for authorization */
+	error = vnode_attr_authorize(vap, NULL, mp, action, ctx);
+
+	if (error)
+		return (error);
+
+	/*
+	 * vn_attribute_prepare should be able to accept attributes as well as
+	 * vnodes but for now we do this inline.
+	 */
+	if (!is_suser || (flags & VNODE_CLONEFILE_NOOWNERCOPY)) {
+		/*
+		 * If the filesystem is mounted IGNORE_OWNERSHIP and an explicit
+		 * owner is set, that owner takes ownership of all new files.
+		 * Without a mount point there is nothing to override, so the
+		 * defaults apply.
+		 */
+		if ((mp != NULL) && (mp->mnt_flag & MNT_IGNORE_OWNERSHIP) &&
+		    (mp->mnt_fsowner != KAUTH_UID_NONE)) {
+			VATTR_SET(vap, va_uid, mp->mnt_fsowner);
+		} else {
+			/* default owner is current user */
+			VATTR_SET(vap, va_uid,
+			    kauth_cred_getuid(vfs_context_ucred(ctx)));
+		}
+
+		if ((mp != NULL) && (mp->mnt_flag & MNT_IGNORE_OWNERSHIP) &&
+		    (mp->mnt_fsgroup != KAUTH_GID_NONE)) {
+			VATTR_SET(vap, va_gid, mp->mnt_fsgroup);
+		} else {
+			/*
+			 * default group comes from parent object,
+			 * fallback to current user (also used when no
+			 * parent attributes were supplied)
+			 */
+			if ((dvap != NULL) && VATTR_IS_SUPPORTED(dvap, va_gid)) {
+				VATTR_SET(vap, va_gid, dvap->va_gid);
+			} else {
+				VATTR_SET(vap, va_gid,
+				    kauth_cred_getgid(vfs_context_ucred(ctx)));
+			}
+		}
+	}
+
+	/*
+	 * Inherit the SF_RESTRICTED and UF_DATAVAULT bits from the destination
+	 * directory only; the copies carried over from the source are dropped.
+	 */
+	if (VATTR_IS_ACTIVE(vap, va_flags)) {
+		VATTR_SET(vap, va_flags,
+		    ((vap->va_flags & ~(UF_DATAVAULT | SF_RESTRICTED)))); /* Turn off from source */
+		if ((dvap != NULL) && VATTR_IS_ACTIVE(dvap, va_flags))
+			VATTR_SET(vap, va_flags,
+			    vap->va_flags | (dvap->va_flags & (UF_DATAVAULT | SF_RESTRICTED)));
+	} else if ((dvap != NULL) && VATTR_IS_ACTIVE(dvap, va_flags)) {
+		VATTR_SET(vap, va_flags, (dvap->va_flags & (UF_DATAVAULT | SF_RESTRICTED)));
+	}
+
+	return (0);
+}
+
+
+/*
+ * Authorize an operation on a vnode.
+ *
+ * This is KPI, but here because it needs vnode_scope.
+ *
+ * Returns: 0 Success
+ * kauth_authorize_action:EPERM ...
+ * xlate => EACCES Permission denied
+ * kauth_authorize_action:0 Success
+ * kauth_authorize_action: Depends on callback return; this is
+ * usually only vnode_authorize_callback(),
+ * but may include other listeners, if any
+ * exist.
+ * EROFS
+ * EACCES
+ * EPERM
+ * ???
+ */
+int
+vnode_authorize(vnode_t vp, vnode_t dvp, kauth_action_t action, vfs_context_t ctx)
+{
+ int error, result;
/*
* We can't authorize against a dead vnode; allow all operations through so that
* - Neither the node nor the directory are immutable.
* - The user is not the superuser.
*
- * Deletion is not permitted if the directory is sticky and the caller is
- * not owner of the node or directory.
+ * The precedence of factors for authorizing or denying delete for a credential
+ *
+ * 1) Explicit ACE on the node. (allow or deny DELETE)
+ * 2) Explicit ACE on the directory (allow or deny DELETE_CHILD).
*
- * If either the node grants DELETE, or the directory grants DELETE_CHILD,
- * the node may be deleted. If neither denies the permission, and the
- * caller has Posix write access to the directory, then the node may be
- * deleted.
+ * If there are conflicting ACEs on the node and the directory, the node
+ * ACE wins.
+ *
+ * 3) Sticky bit on the directory.
+ * Deletion is not permitted if the directory is sticky and the caller is
+ * not owner of the node or directory. The sticky bit rules are like a deny
+ * delete ACE except lower in priority than ACL's either allowing or denying
+ * delete.
+ *
+ * 4) POSIX permissions on the directory.
*
* As an optimization, we cache whether or not delete child is permitted
- * on directories without the sticky bit set.
+ * on directories. This enables us to skip directory ACL and POSIX checks
+ * as we already have the result from those checks. However, we always check the
+ * node ACL and, if the directory has the sticky bit set, we always check its
+ * ACL (even for a directory with an authorized delete child). Furthermore,
+ * caching the delete child authorization is independent of the sticky bit
+ * being set as it is only applicable in determining whether the node can be
+ * deleted or not.
*/
-int
-vnode_authorize_delete(vauth_ctx vcp, boolean_t cached_delete_child);
-/*static*/ int
+static int
vnode_authorize_delete(vauth_ctx vcp, boolean_t cached_delete_child)
{
struct vnode_attr *vap = vcp->vap;
struct vnode_attr *dvap = vcp->dvap;
kauth_cred_t cred = vcp->ctx->vc_ucred;
struct kauth_acl_eval eval;
- int error, delete_denied, delete_child_denied, ismember;
+ int error, ismember;
- /* check the ACL on the directory */
- delete_child_denied = 0;
- if (!cached_delete_child && VATTR_IS_NOT(dvap, va_acl, NULL)) {
- eval.ae_requested = KAUTH_VNODE_DELETE_CHILD;
- eval.ae_acl = &dvap->va_acl->acl_ace[0];
- eval.ae_count = dvap->va_acl->acl_entrycount;
+ /* Check the ACL on the node first */
+ if (VATTR_IS_NOT(vap, va_acl, NULL)) {
+ eval.ae_requested = KAUTH_VNODE_DELETE;
+ eval.ae_acl = &vap->va_acl->acl_ace[0];
+ eval.ae_count = vap->va_acl->acl_entrycount;
eval.ae_options = 0;
- if (vauth_dir_owner(vcp))
+ if (vauth_file_owner(vcp))
eval.ae_options |= KAUTH_AEVAL_IS_OWNER;
/*
* We use ENOENT as a marker to indicate we could not get
* have the ACL evaluation answer. Previously, we would
* always deny the operation at this point.
*/
- if ((error = vauth_dir_ingroup(vcp, &ismember, ENOENT)) != 0 && error != ENOENT)
- return(error);
+ if ((error = vauth_file_ingroup(vcp, &ismember, ENOENT)) != 0 && error != ENOENT)
+ return (error);
if (error == ENOENT)
eval.ae_options |= KAUTH_AEVAL_IN_GROUP_UNKNOWN;
else if (ismember)
eval.ae_exp_gwrite = KAUTH_VNODE_GENERIC_WRITE_BITS;
eval.ae_exp_gexec = KAUTH_VNODE_GENERIC_EXECUTE_BITS;
- /*
- * If there is no entry, we are going to defer to other
- * authorization mechanisms.
- */
- error = kauth_acl_evaluate(cred, &eval);
-
- if (error != 0) {
+ if ((error = kauth_acl_evaluate(cred, &eval)) != 0) {
KAUTH_DEBUG("%p ERROR during ACL processing - %d", vcp->vp, error);
- return(error);
+ return (error);
}
+
switch(eval.ae_result) {
case KAUTH_RESULT_DENY:
- delete_child_denied = 1;
- break;
- /* FALLSTHROUGH */
- case KAUTH_RESULT_ALLOW:
- KAUTH_DEBUG("%p ALLOWED - granted by directory ACL", vcp->vp);
- return(0);
+ KAUTH_DEBUG("%p DENIED - denied by ACL", vcp->vp);
+ return (EACCES);
+ case KAUTH_RESULT_ALLOW:
+ KAUTH_DEBUG("%p ALLOWED - granted by ACL", vcp->vp);
+ return (0);
case KAUTH_RESULT_DEFER:
default:
- /* Effectively the same as !delete_child_denied */
- KAUTH_DEBUG("%p DEFERRED - directory ACL", vcp->vp);
+ /* Defer to directory */
+ KAUTH_DEBUG("%p DEFERRED - by file ACL", vcp->vp);
break;
}
}
- /* check the ACL on the node */
- delete_denied = 0;
- if (VATTR_IS_NOT(vap, va_acl, NULL)) {
- eval.ae_requested = KAUTH_VNODE_DELETE;
- eval.ae_acl = &vap->va_acl->acl_ace[0];
- eval.ae_count = vap->va_acl->acl_entrycount;
+ /*
+ * Without a sticky bit, a previously authorized delete child is
+ * sufficient to authorize this delete.
+ *
+ * If the sticky bit is set, a directory ACL which allows delete child
+ * overrides a (potential) sticky bit deny. The authorized delete child
+ * cannot tell us if it was authorized because of an explicit delete
+ * child allow ACE or because of POSIX permissions so we have to check
+ * the directory ACL every time if the directory has a sticky bit.
+ */
+ if (!(dvap->va_mode & S_ISTXT) && cached_delete_child) {
+ KAUTH_DEBUG("%p ALLOWED - granted by directory ACL or POSIX permissions and no sticky bit on directory", vcp->vp);
+ return (0);
+ }
+
+ /* check the ACL on the directory */
+ if (VATTR_IS_NOT(dvap, va_acl, NULL)) {
+ eval.ae_requested = KAUTH_VNODE_DELETE_CHILD;
+ eval.ae_acl = &dvap->va_acl->acl_ace[0];
+ eval.ae_count = dvap->va_acl->acl_entrycount;
eval.ae_options = 0;
- if (vauth_file_owner(vcp))
+ if (vauth_dir_owner(vcp))
eval.ae_options |= KAUTH_AEVAL_IS_OWNER;
/*
* We use ENOENT as a marker to indicate we could not get
* have the ACL evaluation answer. Previously, we would
* always deny the operation at this point.
*/
- if ((error = vauth_file_ingroup(vcp, &ismember, ENOENT)) != 0 && error != ENOENT)
+ if ((error = vauth_dir_ingroup(vcp, &ismember, ENOENT)) != 0 && error != ENOENT)
return(error);
if (error == ENOENT)
eval.ae_options |= KAUTH_AEVAL_IN_GROUP_UNKNOWN;
eval.ae_exp_gwrite = KAUTH_VNODE_GENERIC_WRITE_BITS;
eval.ae_exp_gexec = KAUTH_VNODE_GENERIC_EXECUTE_BITS;
- if ((error = kauth_acl_evaluate(cred, &eval)) != 0) {
+ /*
+ * If there is no entry, we are going to defer to other
+ * authorization mechanisms.
+ */
+ error = kauth_acl_evaluate(cred, &eval);
+
+ if (error != 0) {
KAUTH_DEBUG("%p ERROR during ACL processing - %d", vcp->vp, error);
- return(error);
+ return (error);
}
-
switch(eval.ae_result) {
case KAUTH_RESULT_DENY:
- delete_denied = 1;
- break;
+ KAUTH_DEBUG("%p DENIED - denied by directory ACL", vcp->vp);
+ return (EACCES);
case KAUTH_RESULT_ALLOW:
- KAUTH_DEBUG("%p ALLOWED - granted by file ACL", vcp->vp);
- return(0);
+ KAUTH_DEBUG("%p ALLOWED - granted by directory ACL", vcp->vp);
+ if (!cached_delete_child && vcp->dvp) {
+ vnode_cache_authorized_action(vcp->dvp,
+ vcp->ctx, KAUTH_VNODE_DELETE_CHILD);
+ }
+ return (0);
case KAUTH_RESULT_DEFER:
default:
- /* Effectively the same as !delete_child_denied */
- KAUTH_DEBUG("%p DEFERRED%s - by file ACL", vcp->vp, delete_denied ? "(DENY)" : "");
+ /* Deferred by directory ACL */
+ KAUTH_DEBUG("%p DEFERRED - directory ACL", vcp->vp);
break;
}
}
- /* if denied by ACL on directory or node, return denial */
- if (delete_denied || delete_child_denied) {
- KAUTH_DEBUG("%p DENIED - denied by ACL", vcp->vp);
- return(EACCES);
+ /*
+ * From this point, we can't explicitly allow and if we reach the end
+ * of the function without a denial, then the delete is authorized.
+ */
+ if (!cached_delete_child) {
+ if (vnode_authorize_posix(vcp, VWRITE, 1 /* on_dir */) != 0) {
+ KAUTH_DEBUG("%p DENIED - denied by posix permisssions", vcp->vp);
+ return (EACCES);
+ }
+ /*
+ * Cache the authorized action on the vnode if allowed by the
+ * directory ACL or POSIX permissions. It is correct to cache
+ * this action even if sticky bit would deny deleting the node.
+ */
+ if (vcp->dvp) {
+ vnode_cache_authorized_action(vcp->dvp, vcp->ctx,
+ KAUTH_VNODE_DELETE_CHILD);
+ }
}
/* enforce sticky bit behaviour */
if ((dvap->va_mode & S_ISTXT) && !vauth_file_owner(vcp) && !vauth_dir_owner(vcp)) {
KAUTH_DEBUG("%p DENIED - sticky bit rules (user %d file %d dir %d)",
vcp->vp, cred->cr_posix.cr_uid, vap->va_uid, dvap->va_uid);
- return(EACCES);
- }
-
- /* check the directory */
- if (!cached_delete_child && (error = vnode_authorize_posix(vcp, VWRITE, 1 /* on_dir */)) != 0) {
- KAUTH_DEBUG("%p DENIED - denied by posix permisssions", vcp->vp);
- return(error);
+ return (EACCES);
}
/* not denied, must be OK */
- return(0);
+ return (0);
}
* Check for file immutability.
*/
static int
-vnode_authorize_checkimmutable(vnode_t vp, struct vnode_attr *vap, int rights, int ignore)
+vnode_authorize_checkimmutable(mount_t mp, struct vnode_attr *vap, int rights, int ignore)
{
- mount_t mp;
int error;
int append;
*
* Sockets, fifos and devices require special handling.
*/
- switch(vp->v_type) {
+ switch(vap->va_type) {
case VSOCK:
case VFIFO:
case VBLK:
if (rights & KAUTH_VNODE_WRITE_RIGHTS) {
/* check per-filesystem options if possible */
- mp = vp->v_mount;
if (mp != NULL) {
/* check for no-EA filesystems */
* allowable for a UF_APPEND file.
*/
append = 0;
- if (vp->v_type == VDIR) {
+ if (vap->va_type == VDIR) {
if ((rights & (KAUTH_VNODE_ADD_FILE | KAUTH_VNODE_ADD_SUBDIRECTORY | KAUTH_VNODE_WRITE_EXTATTRIBUTES)) == rights)
append = 1;
} else {
static int
-vnode_authorize_callback(kauth_cred_t cred, void *idata, kauth_action_t action,
- uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3)
+vnode_authorize_callback(__unused kauth_cred_t cred, __unused void *idata,
+ kauth_action_t action, uintptr_t arg0, uintptr_t arg1, uintptr_t arg2,
+ uintptr_t arg3)
{
vfs_context_t ctx;
vnode_t cvp = NULLVP;
goto out;
}
defer:
- result = vnode_authorize_callback_int(cred, idata, action, arg0, arg1, arg2, arg3);
+ result = vnode_authorize_callback_int(action, ctx, vp, dvp, (int *)arg3);
if (result == KAUTH_RESULT_ALLOW && cvp != NULLVP) {
KAUTH_DEBUG("%p - caching action = %x", cvp, action);
return result;
}
+/*
+ * Attribute-based authorization core.
+ *
+ * Runs the immutability checks against vcp->vap (and vcp->dvap for a
+ * delete whose parent has not already been authorized for delete-child),
+ * then either evaluates the remaining rights through
+ * vnode_authorize_delete()/vnode_authorize_simple() for ordinary users
+ * or applies the root execute rule for the superuser.
+ *
+ * Parameters:
+ *	vcp		authorization context (attributes, vnodes, ctx)
+ *	mp		mount used for the immutability checks
+ *	rights		rights being requested
+ *	is_suser	non-zero if the caller is the superuser
+ *	found_deny	set to TRUE on the superuser path; otherwise handed
+ *			to vnode_authorize_simple()
+ *	noimmutable	passed through as the 'ignore' argument of the
+ *			immutability check on the node
+ *	parent_authorized_for_delete_child
+ *			non-zero if delete-child was already authorized on
+ *			the parent directory
+ *
+ * Returns 0 if nothing denied the request, EPERM when root asks to
+ * execute a regular file with no x bit set, or the error returned by
+ * one of the helpers.
+ */
+static int
+vnode_attr_authorize_internal(vauth_ctx vcp, mount_t mp,
+    kauth_ace_rights_t rights, int is_suser, boolean_t *found_deny,
+    int noimmutable, int parent_authorized_for_delete_child)
+{
+	int result;
+
+	/*
+	 * Check for immutability.
+	 *
+	 * In the deletion case, parent directory immutability vetoes specific
+	 * file rights.
+	 */
+	if ((result = vnode_authorize_checkimmutable(mp, vcp->vap, rights,
+	    noimmutable)) != 0)
+		goto out;
+
+	/*
+	 * NOTE(review): vcp->dvap is handed to the immutability check without
+	 * a NULL test; callers asking for KAUTH_VNODE_DELETE presumably always
+	 * supply directory attributes -- confirm.
+	 */
+	if ((rights & KAUTH_VNODE_DELETE) &&
+	    !parent_authorized_for_delete_child) {
+		result = vnode_authorize_checkimmutable(mp, vcp->dvap,
+		    KAUTH_VNODE_DELETE_CHILD, 0);
+		if (result)
+			goto out;
+	}
+
+	/*
+	 * Clear rights that have been authorized by reaching this point, bail if nothing left to
+	 * check.
+	 */
+	rights &= ~(KAUTH_VNODE_LINKTARGET | KAUTH_VNODE_CHECKIMMUTABLE);
+	if (rights == 0)
+		goto out;
+
+	/*
+	 * If we're not the superuser, authorize based on file properties;
+	 * note that even if parent_authorized_for_delete_child is TRUE, we
+	 * need to check on the node itself.
+	 */
+	if (!is_suser) {
+		/* process delete rights */
+		if ((rights & KAUTH_VNODE_DELETE) &&
+		    ((result = vnode_authorize_delete(vcp, parent_authorized_for_delete_child)) != 0))
+			goto out;
+
+		/* process remaining rights */
+		if ((rights & ~KAUTH_VNODE_DELETE) &&
+		    (result = vnode_authorize_simple(vcp, rights, rights & KAUTH_VNODE_DELETE, found_deny)) != 0)
+			goto out;
+	} else {
+		/*
+		 * Execute is only granted to root if one of the x bits is set.  This check only
+		 * makes sense if the posix mode bits are actually supported.
+		 */
+		if ((rights & KAUTH_VNODE_EXECUTE) &&
+		    (vcp->vap->va_type == VREG) &&
+		    VATTR_IS_SUPPORTED(vcp->vap, va_mode) &&
+		    !(vcp->vap->va_mode & (S_IXUSR | S_IXGRP | S_IXOTH))) {
+			result = EPERM;
+			/* debug arguments come from vcp: no vp/va locals exist in this function */
+			KAUTH_DEBUG("%p DENIED - root execute requires at least one x bit in 0x%x", vcp->vp, vcp->vap->va_mode);
+			goto out;
+		}
+
+		/* Assume that there were DENYs so we don't wrongly cache KAUTH_VNODE_SEARCHBYANYONE */
+		*found_deny = TRUE;
+
+		KAUTH_DEBUG("%p ALLOWED - caller is superuser", vcp->vp);
+	}
+out:
+	return (result);
+}
static int
-vnode_authorize_callback_int(__unused kauth_cred_t unused_cred, __unused void *idata, kauth_action_t action,
- uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3)
+vnode_authorize_callback_int(kauth_action_t action, vfs_context_t ctx,
+ vnode_t vp, vnode_t dvp, int *errorp)
{
struct _vnode_authorize_context auth_context;
vauth_ctx vcp;
- vfs_context_t ctx;
- vnode_t vp, dvp;
kauth_cred_t cred;
kauth_ace_rights_t rights;
struct vnode_attr va, dva;
int result;
- int *errorp;
int noimmutable;
boolean_t parent_authorized_for_delete_child = FALSE;
boolean_t found_deny = FALSE;
boolean_t parent_ref= FALSE;
+ boolean_t is_suser = FALSE;
vcp = &auth_context;
- ctx = vcp->ctx = (vfs_context_t)arg0;
- vp = vcp->vp = (vnode_t)arg1;
- dvp = vcp->dvp = (vnode_t)arg2;
- errorp = (int *)arg3;
+ vcp->ctx = ctx;
+ vcp->vp = vp;
+ vcp->dvp = dvp;
/*
* Note that we authorize against the context, not the passed cred
* (the same thing anyway)
if (vnode_cache_is_authorized(dvp, ctx, KAUTH_VNODE_DELETE_CHILD) == TRUE)
parent_authorized_for_delete_child = TRUE;
} else {
- dvp = NULL;
+ vcp->dvp = NULLVP;
+ vcp->dvap = NULL;
}
/*
goto out;
/*
- * Get vnode attributes and extended security information for the vnode
- * and directory if required.
- */
- VATTR_WANTED(&va, va_mode);
- VATTR_WANTED(&va, va_uid);
- VATTR_WANTED(&va, va_gid);
- VATTR_WANTED(&va, va_flags);
- VATTR_WANTED(&va, va_acl);
- if ((result = vnode_getattr(vp, &va, ctx)) != 0) {
- KAUTH_DEBUG("%p ERROR - failed to get vnode attributes - %d", vp, result);
- goto out;
- }
- if (dvp) {
- VATTR_WANTED(&dva, va_mode);
- VATTR_WANTED(&dva, va_uid);
- VATTR_WANTED(&dva, va_gid);
- VATTR_WANTED(&dva, va_flags);
- VATTR_WANTED(&dva, va_acl);
- if ((result = vnode_getattr(dvp, &dva, ctx)) != 0) {
- KAUTH_DEBUG("%p ERROR - failed to get directory vnode attributes - %d", vp, result);
- goto out;
- }
- }
-
- /*
- * If the vnode is an extended attribute data vnode (eg. a resource fork), *_DATA becomes
- * *_EXTATTRIBUTES.
+ * If the vnode is a namedstream (extended attribute) data vnode (eg.
+ * a resource fork), *_DATA becomes *_EXTATTRIBUTES.
*/
if (vnode_isnamedstream(vp)) {
if (rights & KAUTH_VNODE_READ_DATA) {
rights &= ~KAUTH_VNODE_WRITE_DATA;
rights |= KAUTH_VNODE_WRITE_EXTATTRIBUTES;
}
+
+ /*
+ * Point 'vp' to the namedstream's parent for ACL checking
+ */
+ if ((vp->v_parent != NULL) &&
+ (vget_internal(vp->v_parent, 0, VNODE_NODEAD | VNODE_DRAINO) == 0)) {
+ parent_ref = TRUE;
+ vcp->vp = vp = vp->v_parent;
+ }
+ }
+
+ if (vfs_context_issuser(ctx)) {
+ /*
+ * if we're not asking for execute permissions or modifications,
+ * then we're done, this action is authorized.
+ */
+ if (!(rights & (KAUTH_VNODE_EXECUTE | KAUTH_VNODE_WRITE_RIGHTS)))
+ goto success;
+
+ is_suser = TRUE;
}
/*
- * Point 'vp' to the resource fork's parent for ACL checking
+ * Get vnode attributes and extended security information for the vnode
+ * and directory if required.
+ *
+ * If we're root we only want mode bits and flags for checking
+ * execute and immutability.
*/
- if (vnode_isnamedstream(vp) &&
- (vp->v_parent != NULL) &&
- (vget_internal(vp->v_parent, 0, VNODE_NODEAD | VNODE_DRAINO) == 0)) {
- parent_ref = TRUE;
- vcp->vp = vp = vp->v_parent;
- if (VATTR_IS_SUPPORTED(&va, va_acl) && (va.va_acl != NULL))
- kauth_acl_free(va.va_acl);
- VATTR_INIT(&va);
- VATTR_WANTED(&va, va_mode);
+ VATTR_WANTED(&va, va_mode);
+ VATTR_WANTED(&va, va_flags);
+ if (!is_suser) {
VATTR_WANTED(&va, va_uid);
VATTR_WANTED(&va, va_gid);
- VATTR_WANTED(&va, va_flags);
VATTR_WANTED(&va, va_acl);
- if ((result = vnode_getattr(vp, &va, ctx)) != 0)
- goto out;
}
-
- /*
- * Check for immutability.
- *
- * In the deletion case, parent directory immutability vetoes specific
- * file rights.
- */
- if ((result = vnode_authorize_checkimmutable(vp, &va, rights, noimmutable)) != 0)
- goto out;
- if ((rights & KAUTH_VNODE_DELETE) &&
- parent_authorized_for_delete_child == FALSE &&
- ((result = vnode_authorize_checkimmutable(dvp, &dva, KAUTH_VNODE_DELETE_CHILD, 0)) != 0))
- goto out;
-
- /*
- * Clear rights that have been authorized by reaching this point, bail if nothing left to
- * check.
- */
- rights &= ~(KAUTH_VNODE_LINKTARGET | KAUTH_VNODE_CHECKIMMUTABLE);
- if (rights == 0)
+ if ((result = vnode_getattr(vp, &va, ctx)) != 0) {
+ KAUTH_DEBUG("%p ERROR - failed to get vnode attributes - %d", vp, result);
goto out;
+ }
+ VATTR_WANTED(&va, va_type);
+ VATTR_RETURN(&va, va_type, vnode_vtype(vp));
- /*
- * If we're not the superuser, authorize based on file properties;
- * note that even if parent_authorized_for_delete_child is TRUE, we
- * need to check on the node itself.
- */
- if (!vfs_context_issuser(ctx)) {
- /* process delete rights */
- if ((rights & KAUTH_VNODE_DELETE) &&
- ((result = vnode_authorize_delete(vcp, parent_authorized_for_delete_child)) != 0))
- goto out;
-
- /* process remaining rights */
- if ((rights & ~KAUTH_VNODE_DELETE) &&
- (result = vnode_authorize_simple(vcp, rights, rights & KAUTH_VNODE_DELETE, &found_deny)) != 0)
- goto out;
- } else {
-
- /*
- * Execute is only granted to root if one of the x bits is set. This check only
- * makes sense if the posix mode bits are actually supported.
- */
- if ((rights & KAUTH_VNODE_EXECUTE) &&
- (vp->v_type == VREG) &&
- VATTR_IS_SUPPORTED(&va, va_mode) &&
- !(va.va_mode & (S_IXUSR | S_IXGRP | S_IXOTH))) {
- result = EPERM;
- KAUTH_DEBUG("%p DENIED - root execute requires at least one x bit in 0x%x", vp, va.va_mode);
+ if (vcp->dvp) {
+ VATTR_WANTED(&dva, va_mode);
+ VATTR_WANTED(&dva, va_flags);
+ if (!is_suser) {
+ VATTR_WANTED(&dva, va_uid);
+ VATTR_WANTED(&dva, va_gid);
+ VATTR_WANTED(&dva, va_acl);
+ }
+ if ((result = vnode_getattr(vcp->dvp, &dva, ctx)) != 0) {
+ KAUTH_DEBUG("%p ERROR - failed to get directory vnode attributes - %d", vp, result);
goto out;
}
-
- /* Assume that there were DENYs so we don't wrongly cache KAUTH_VNODE_SEARCHBYANYONE */
- found_deny = TRUE;
-
- KAUTH_DEBUG("%p ALLOWED - caller is superuser", vp);
+ VATTR_WANTED(&dva, va_type);
+ VATTR_RETURN(&dva, va_type, vnode_vtype(vcp->dvp));
}
+
+ result = vnode_attr_authorize_internal(vcp, vp->v_mount, rights, is_suser,
+ &found_deny, noimmutable, parent_authorized_for_delete_child);
out:
if (VATTR_IS_SUPPORTED(&va, va_acl) && (va.va_acl != NULL))
kauth_acl_free(va.va_acl);
vnode_cache_authorized_action(vp, ctx, KAUTH_VNODE_SEARCHBYANYONE);
}
}
- if ((rights & KAUTH_VNODE_DELETE) && parent_authorized_for_delete_child == FALSE) {
- /*
- * parent was successfully and newly authorized for content deletions
- * add it to the cache, but only if it doesn't have the sticky
- * bit set on it. This same check is done earlier guarding
- * fetching of dva, and if we jumped to out without having done
- * this, we will have returned already because of a non-zero
- * 'result' value.
- */
- if (VATTR_IS_SUPPORTED(&dva, va_mode) &&
- !(dva.va_mode & (S_ISVTX))) {
- /* OK to cache delete rights */
- KAUTH_DEBUG("%p - caching DELETE_CHILD rights", dvp);
- vnode_cache_authorized_action(dvp, ctx, KAUTH_VNODE_DELETE_CHILD);
- }
- }
+success:
if (parent_ref)
vnode_put(vp);
+
/*
* Note that this implies that we will allow requests for no rights, as well as
* for rights that we do not recognise. There should be none of these.
return(KAUTH_RESULT_ALLOW);
}
+/*
+ * Prepare the attribute requests needed by vnode_attr_authorize().
+ *
+ * vap is always reset and asked for type, mode and flags.  dvap, when
+ * supplied, is reset and only populated with requests if the action
+ * includes KAUTH_VNODE_DELETE.  Callers that are not the superuser also
+ * need uid, gid and ACL on each requested attribute set.
+ *
+ * Returns 0, or EINVAL if a delete is requested without dvap.
+ */
+int
+vnode_attr_authorize_init(struct vnode_attr *vap, struct vnode_attr *dvap,
+    kauth_action_t action, vfs_context_t ctx)
+{
+	const int deleting = (action & KAUTH_VNODE_DELETE) != 0;
+
+	VATTR_INIT(vap);
+	VATTR_WANTED(vap, va_type);
+	VATTR_WANTED(vap, va_mode);
+	VATTR_WANTED(vap, va_flags);
+
+	if (dvap == NULL) {
+		/* a delete cannot be evaluated without the parent's attributes */
+		if (deleting)
+			return (EINVAL);
+	} else {
+		VATTR_INIT(dvap);
+		if (deleting) {
+			VATTR_WANTED(dvap, va_type);
+			VATTR_WANTED(dvap, va_mode);
+			VATTR_WANTED(dvap, va_flags);
+		}
+	}
+
+	/* the superuser needs nothing beyond type, mode and flags */
+	if (vfs_context_issuser(ctx))
+		return (0);
+
+	VATTR_WANTED(vap, va_uid);
+	VATTR_WANTED(vap, va_gid);
+	VATTR_WANTED(vap, va_acl);
+	if (dvap != NULL && deleting) {
+		VATTR_WANTED(dvap, va_uid);
+		VATTR_WANTED(dvap, va_gid);
+		VATTR_WANTED(dvap, va_acl);
+	}
+
+	return (0);
+}
+
+/*
+ * Authorize 'action' using prefilled attributes instead of vnodes.
+ *
+ * Parameters:
+ *	vap	attributes of the object being acted on
+ *	dvap	attributes of its parent directory; required when the
+ *		action includes KAUTH_VNODE_DELETE, otherwise may be NULL
+ *	mp	mount the object lives on; may be NULL, in which case the
+ *		read-only / noexec / extended-security checks are skipped
+ *	action	kauth action bits (KAUTH_VNODE_ACCESS and
+ *		KAUTH_VNODE_NOIMMUTABLE are stripped before evaluation)
+ *	ctx	context of the caller
+ *
+ * Returns 0 if authorized, EROFS for a write on a read-only mount,
+ * EACCES for execute on a noexec mount or a permission denial (EPERM from
+ * the internal check is translated to EACCES), EINVAL if a delete is
+ * requested without dvap, or another error from the internal check.
+ * Panics if a non-superuser caller did not supply uid/gid (and the ACL on
+ * mounts with extended security).
+ */
+int
+vnode_attr_authorize(struct vnode_attr *vap, struct vnode_attr *dvap, mount_t mp,
+    kauth_action_t action, vfs_context_t ctx)
+{
+	struct _vnode_authorize_context auth_context;
+	vauth_ctx vcp;
+	kauth_ace_rights_t rights;
+	int noimmutable;
+	boolean_t found_deny = FALSE;
+	boolean_t is_suser = FALSE;
+	int result = 0;
+
+	vcp = &auth_context;
+	vcp->ctx = ctx;
+	vcp->vp = NULLVP;
+	vcp->vap = vap;
+	vcp->dvp = NULLVP;
+	vcp->dvap = dvap;
+	vcp->flags = vcp->flags_valid = 0;
+
+	noimmutable = (action & KAUTH_VNODE_NOIMMUTABLE) ? 1 : 0;
+	rights = action & ~(KAUTH_VNODE_ACCESS | KAUTH_VNODE_NOIMMUTABLE);
+
+	/*
+	 * Check for read-only filesystems.
+	 */
+	if ((rights & KAUTH_VNODE_WRITE_RIGHTS) &&
+	    mp && (mp->mnt_flag & MNT_RDONLY) &&
+	    ((vap->va_type == VREG) || (vap->va_type == VDIR) ||
+	    (vap->va_type == VLNK) || (rights & KAUTH_VNODE_DELETE) ||
+	    (rights & KAUTH_VNODE_DELETE_CHILD))) {
+		result = EROFS;
+		goto out;
+	}
+
+	/*
+	 * Check for noexec filesystems.
+	 */
+	if ((rights & KAUTH_VNODE_EXECUTE) &&
+	    (vap->va_type == VREG) && mp && (mp->mnt_flag & MNT_NOEXEC)) {
+		result = EACCES;
+		goto out;
+	}
+
+	if (vfs_context_issuser(ctx)) {
+		/*
+		 * if we're not asking for execute permissions or modifications,
+		 * then we're done, this action is authorized.
+		 */
+		if (!(rights & (KAUTH_VNODE_EXECUTE | KAUTH_VNODE_WRITE_RIGHTS)))
+			goto out;
+		is_suser = TRUE;
+	} else {
+		if (!VATTR_IS_SUPPORTED(vap, va_uid) ||
+		    !VATTR_IS_SUPPORTED(vap, va_gid) ||
+		    (mp && vfs_extendedsecurity(mp) && !VATTR_IS_SUPPORTED(vap, va_acl))) {
+			panic("vnode attrs not complete for vnode_attr_authorize\n");
+		}
+	}
+
+	/*
+	 * The delete checks read the parent directory's attributes, so a
+	 * delete without them cannot be evaluated.  vnode_attr_authorize_init()
+	 * rejects the same combination with EINVAL.
+	 */
+	if ((rights & KAUTH_VNODE_DELETE) && (dvap == NULL)) {
+		result = EINVAL;
+		goto out;
+	}
+
+	result = vnode_attr_authorize_internal(vcp, mp, rights, is_suser,
+	    &found_deny, noimmutable, FALSE);
+
+	if (result == EPERM)
+		result = EACCES;
+out:
+	return (result);
+}
+
+
int
vnode_authattr_new(vnode_t dvp, struct vnode_attr *vap, int noauth, vfs_context_t ctx)
{
{
int error;
int has_priv_suser, ismember, defaulted_owner, defaulted_group, defaulted_mode;
+ uint32_t inherit_flags;
kauth_cred_t cred;
guid_t changer;
mount_t dmp;
+ struct vnode_attr dva;
error = 0;
defaulted_owner = defaulted_group = defaulted_mode = 0;
+ inherit_flags = 0;
+
/*
* Require that the filesystem support extended security to apply any.
*/
}
}
+ /*
+ * We need the dvp's va_flags and *may* need the gid of the directory,
+ * we ask for both here.
+ */
+ VATTR_INIT(&dva);
+ VATTR_WANTED(&dva, va_gid);
+ VATTR_WANTED(&dva, va_flags);
+ if ((error = vnode_getattr(dvp, &dva, ctx)) != 0)
+ goto out;
+
/*
* If the filesystem is mounted IGNORE_OWNERSHIP and an explicit grouo is set, that
* group takes ownership of all new files.
} else {
if (!VATTR_IS_ACTIVE(vap, va_gid)) {
/* default group comes from parent object, fallback to current user */
- struct vnode_attr dva;
- VATTR_INIT(&dva);
- VATTR_WANTED(&dva, va_gid);
- if ((error = vnode_getattr(dvp, &dva, ctx)) != 0)
- goto out;
if (VATTR_IS_SUPPORTED(&dva, va_gid)) {
VATTR_SET(vap, va_gid, dva.va_gid);
} else {
if (!VATTR_IS_ACTIVE(vap, va_flags))
VATTR_SET(vap, va_flags, 0);
-
+
+ /* Determine if SF_RESTRICTED and/or UF_DATAVAULT should be inherited
+ * from the parent directory. */
+ if (VATTR_IS_SUPPORTED(&dva, va_flags)) {
+ inherit_flags = dva.va_flags & (UF_DATAVAULT | SF_RESTRICTED);
+ }
+
/* default mode is everything, masked with current umask */
if (!VATTR_IS_ACTIVE(vap, va_mode)) {
VATTR_SET(vap, va_mode, ACCESSPERMS & ~vfs_context_proc(ctx)->p_fd->fd_cmask);
}
}
out:
+ if (inherit_flags) {
+ /* Apply SF_RESTRICTED to the file if its parent directory was
+ * restricted. This is done at the end so that root is not
+ * required if this flag is only set due to inheritance. */
+ VATTR_SET(vap, va_flags, (vap->va_flags | inherit_flags));
+ }
if (defaulted_fieldsp) {
if (defaulted_mode) {
*defaulted_fieldsp |= VATTR_PREPARE_DEFAULTED_MODE;
VATTR_IS_ACTIVE(vap, va_change_time) ||
VATTR_IS_ACTIVE(vap, va_modify_time) ||
VATTR_IS_ACTIVE(vap, va_access_time) ||
- VATTR_IS_ACTIVE(vap, va_backup_time)) {
+ VATTR_IS_ACTIVE(vap, va_backup_time) ||
+ VATTR_IS_ACTIVE(vap, va_addedtime)) {
VATTR_WANTED(&ova, va_uid);
#if 0 /* enable this when we support UUIDs as official owners */
VATTR_IS_ACTIVE(vap, va_change_time) ||
VATTR_IS_ACTIVE(vap, va_modify_time) ||
VATTR_IS_ACTIVE(vap, va_access_time) ||
- VATTR_IS_ACTIVE(vap, va_backup_time)) {
+ VATTR_IS_ACTIVE(vap, va_backup_time) ||
+ VATTR_IS_ACTIVE(vap, va_addedtime)) {
/*
* The owner and root may set any timestamps they like,
* provided that the file is not immutable. The owner still needs
required_action |= KAUTH_VNODE_WRITE_SECURITY;
}
- /* clear set-uid and set-gid bits as required by Posix */
- if (VATTR_IS_ACTIVE(vap, va_mode)) {
- newmode = vap->va_mode;
- } else if (VATTR_IS_SUPPORTED(&ova, va_mode)) {
- newmode = ova.va_mode;
- } else {
- KAUTH_DEBUG("CHOWN - trying to change owner but cannot get mode from filesystem to mask setugid bits");
- newmode = 0;
- }
- if (newmode & (S_ISUID | S_ISGID)) {
- VATTR_SET(vap, va_mode, newmode & ~(S_ISUID | S_ISGID));
- KAUTH_DEBUG("CHOWN - masking setugid bits from mode %o to %o", newmode, vap->va_mode);
+ }
+
+ /*
+ * clear set-uid and set-gid bits. POSIX only requires this for
+ * non-privileged processes but we do it even for root.
+ */
+ if (VATTR_IS_ACTIVE(vap, va_mode)) {
+ newmode = vap->va_mode;
+ } else if (VATTR_IS_SUPPORTED(&ova, va_mode)) {
+ newmode = ova.va_mode;
+ } else {
+ KAUTH_DEBUG("CHOWN - trying to change owner but cannot get mode from filesystem to mask setugid bits");
+ newmode = 0;
+ }
+
+ /* chown always clears setuid/gid bits. An exception is made for
+ * setattrlist executed by a root process to set <uid, gid, mode> on a file:
+ * setattrlist is allowed to set the new mode on the file and change (chown)
+ * uid/gid.
+ */
+ if (newmode & (S_ISUID | S_ISGID)) {
+ if (!VATTR_IS_ACTIVE(vap, va_mode) || !has_priv_suser) {
+ KAUTH_DEBUG("CHOWN - masking setugid bits from mode %o to %o",
+ newmode, newmode & ~(S_ISUID | S_ISGID));
+ newmode &= ~(S_ISUID | S_ISGID);
}
+ VATTR_SET(vap, va_mode, newmode);
}
}
mount_unlock(mp);
}
-
void
vnode_setswapmount(vnode_t vp)
{
}
void panic_print_vnodes(void);
+
/* define PANIC_PRINTS_VNODES only if investigation is required. */
#ifdef PANIC_PRINTS_VNODES
return dst;
}
-extern int kdb_printf(const char *format, ...) __printflike(1,2);
-
#define SANE_VNODE_PRINT_LIMIT 5000
void panic_print_vnodes(void)
{
char *nm;
char vname[257];
- kdb_printf("\n***** VNODES *****\n"
+ paniclog_append_noflush("\n***** VNODES *****\n"
"TYPE UREF ICNT PATH\n");
/* NULL-terminate the path name */
* iterate all vnodelist items in all mounts (mntlist) -> mnt_vnodelist
*/
TAILQ_FOREACH(mnt, &mountlist, mnt_list) {
+
+ if (!ml_validate_nofault((vm_offset_t)mnt, sizeof(mount_t))) {
+ paniclog_append_noflush("Unable to iterate the mount list %p - encountered an invalid mount pointer %p \n",
+ &mountlist, mnt);
+ break;
+ }
+
TAILQ_FOREACH(vp, &mnt->mnt_vnodelist, v_mntvnodes) {
+
+ if (!ml_validate_nofault((vm_offset_t)vp, sizeof(vnode_t))) {
+ paniclog_append_noflush("Unable to iterate the vnode list %p - encountered an invalid vnode pointer %p \n",
+ &mnt->mnt_vnodelist, vp);
+ break;
+ }
+
if (++nvnodes > SANE_VNODE_PRINT_LIMIT)
return;
type = __vtype(vp->v_type);
nm = __vpath(vp, vname, sizeof(vname)-1, 0);
- kdb_printf("%s %0d %0d %s\n",
+ paniclog_append_noflush("%s %0d %0d %s\n",
type, vp->v_usecount, vp->v_iocount, nm);
}
}
lck_mtx_unlock(&rp->vr_lock);
+#if CONFIG_MACF
+ int rv = mac_vnode_check_trigger_resolve(ctx, vp, &ndp->ni_cnd);
+ if (rv != 0)
+ return rv;
+#endif
+
/*
* XXX
* assumes that resolver will not access this trigger vnode (otherwise the kernel will deadlock)
}
#endif /* CONFIG_TRIGGERS */
+
+/*
+ * Return the result of applying VM_KERNEL_ADDRPERM() to a vnode pointer.
+ *
+ * NOTE(review): presumably this is the value used to identify a vnode in
+ * kdebug trace records -- confirm against callers.
+ */
+vm_offset_t
+kdebug_vnode(vnode_t vp)
+{
+	vm_offset_t traced_addr = VM_KERNEL_ADDRPERM(vp);
+
+	return traced_addr;
+}
+
+/*
+ * Tunable consulted by vnode_should_flush_after_write().  It starts out as
+ * 0 and is published read/write under _kern as "flush_cache_on_write".
+ */
+static int flush_cache_on_write = 0;
+SYSCTL_INT(_kern, OID_AUTO, flush_cache_on_write,
+	CTLFLAG_RW | CTLFLAG_LOCKED,
+	&flush_cache_on_write, 0,
+	"always flush the drive cache on writes to uncached files");
+
+/*
+ * Decide whether a write should be followed by a drive cache flush.
+ *
+ * vp:     vnode being written; only queried through vnode_isnocache().
+ * ioflag: I/O flags of the write; only IO_NOCACHE is examined.
+ *
+ * Returns 1 when the flush_cache_on_write tunable is non-zero and the
+ * write is uncached (IO_NOCACHE is set in ioflag, or vnode_isnocache(vp)
+ * is true); returns 0 otherwise.  vnode_isnocache() is not called when
+ * the tunable is off or when IO_NOCACHE already answers the question.
+ */
+int
+vnode_should_flush_after_write(vnode_t vp, int ioflag)
+{
+	if (!flush_cache_on_write) {
+		return 0;
+	}
+
+	if (ISSET(ioflag, IO_NOCACHE)) {
+		return 1;
+	}
+
+	return vnode_isnocache(vp) ? 1 : 0;
+}
+
+/*
+ * sysctl for use by disk I/O tracing tools to get the list of existing
+ * vnodes' paths
+ */
+
+struct vnode_trace_paths_context { /* scratch state handed as `arg` from sysctl_vfs_trace_paths() down to the mount and vnode callbacks */
+ uint64_t count; /* vnodes traced since the last yield; vnode_trace_path_callback()
+                  * resets it to 0 each time it reaches 1000 */
+ long path[MAXPATHLEN / sizeof (long) + 1]; /* + 1 in case sizeof (long) does not divide MAXPATHLEN;
+                                            * filled by vn_getpath() and passed to kdebug_vfs_lookup().
+                                            * NOTE(review): presumably declared as long[] because
+                                            * kdebug_vfs_lookup() takes the path in long-sized
+                                            * units -- confirm against its prototype */
+};
+
+/*
+ * Per-vnode callback used with vnode_iterate(): look up the vnode's path
+ * and emit it through kdebug_vfs_lookup().
+ *
+ * vp:  vnode currently being visited.
+ * arg: pointer to the struct vnode_trace_paths_context owned by the caller.
+ *
+ * Vnodes whose path cannot be obtained are skipped silently.  After every
+ * 1000 successfully traced vnodes the thread yields to preemption and the
+ * counter restarts.  Always returns VNODE_RETURNED.
+ */
+static int
+vnode_trace_path_callback(struct vnode *vp, void *arg)
+{
+	struct vnode_trace_paths_context *trace_ctx = arg;
+	int pathlen = sizeof (trace_ctx->path);
+
+	/* vn_getpath() NUL-terminates, and pathlen then includes the NUL */
+	if (vn_getpath(vp, (char *)trace_ctx->path, &pathlen) != 0) {
+		return VNODE_RETURNED;
+	}
+
+	kdebug_vfs_lookup(trace_ctx->path, pathlen, vp,
+	    KDBG_VFS_LOOKUP_FLAG_LOOKUP | KDBG_VFS_LOOKUP_FLAG_NOPROCFILT);
+
+	trace_ctx->count += 1;
+	if (trace_ctx->count == 1000) {
+		/* give other threads a chance to run during a long walk */
+		thread_yield_to_preemption();
+		trace_ctx->count = 0;
+	}
+
+	return VNODE_RETURNED;
+}
+
+/*
+ * Per-mount callback used with vfs_iterate(): for mounts flagged MNT_LOCAL,
+ * run vnode_trace_path_callback() over every vnode of the mount.
+ *
+ * mp:  mount currently being visited.
+ * arg: caller's struct vnode_trace_paths_context, forwarded untouched.
+ *
+ * Mounts without MNT_LOCAL are skipped.  The result of vnode_iterate() is
+ * not examined.  Always returns VFS_RETURNED.
+ */
+static int
+vfs_trace_paths_callback(mount_t mp, void *arg)
+{
+	const int is_local = (mp->mnt_flag & MNT_LOCAL) != 0;
+
+	if (is_local) {
+		(void)vnode_iterate(mp, VNODE_ITERATE_ALL,
+		    vnode_trace_path_callback, arg);
+	}
+
+	return VFS_RETURNED;
+}
+
+/*
+ * Handler for the "trace_paths" sysctl registered under _vfs_generic.
+ *
+ * Walks the mounts via vfs_iterate() so that vfs_trace_paths_callback()
+ * can emit the paths of existing vnodes as kdebug events.  None of the
+ * handler arguments are used, so nothing is copied out through req.
+ *
+ * Returns EPERM when the calling credential is not the superuser,
+ * EINVAL when kdebug is not enabled or the VFS_LOOKUP debugid is not
+ * enabled, and 0 once the iteration has been issued.
+ */
+static int
+sysctl_vfs_trace_paths SYSCTL_HANDLER_ARGS
+{
+	struct vnode_trace_paths_context trace_ctx;
+
+	(void)oidp;
+	(void)arg1;
+	(void)arg2;
+	(void)req;
+
+	if (!kauth_cred_issuser(kauth_cred_get())) {
+		return EPERM;
+	}
+
+	/* nothing would record the events unless VFS_LOOKUP tracing is on */
+	if (!(kdebug_enable && kdebug_debugid_enabled(VFS_LOOKUP))) {
+		return EINVAL;
+	}
+
+	bzero(&trace_ctx, sizeof (trace_ctx));
+	vfs_iterate(0, vfs_trace_paths_callback, &trace_ctx);
+
+	return 0;
+}
+
+SYSCTL_PROC(_vfs_generic, OID_AUTO, trace_paths,
+	CTLFLAG_RD | CTLFLAG_LOCKED | CTLFLAG_MASKED,
+	NULL, 0, &sysctl_vfs_trace_paths, "-", "trace_paths");