/*
- * Copyright (c) 1995-2014 Apple Inc. All rights reserved.
+ * Copyright (c) 1995-2015 Apple Inc. All rights reserved.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
*
#include <sys/fsctl.h>
#include <sys/ubc_internal.h>
#include <sys/disk.h>
+#include <sys/content_protection.h>
#include <machine/cons.h>
#include <machine/limits.h>
#include <miscfs/specfs/specdev.h>
#include <libkern/OSAtomic.h>
#include <pexpert/pexpert.h>
+#include <IOKit/IOBSD.h>
#if CONFIG_MACF
#include <security/mac.h>
struct fd_vn_data * fg_vn_data_alloc(void);
+/*
+ * Max retries for ENOENT returns from vn_authorize_{rmdir, unlink, rename}
+ * Concurrent lookups (or lookups by ids) on hard links can cause the
+ * vn_getpath (which does not re-enter the filesystem as vn_getpath_fsenter
+ * does) to return ENOENT as the path cannot be returned from the name cache
+ * alone. We have no option but to retry and hope to get one namei->reverse
+ * path generation done without an intervening lookup or lookup-by-id on the
+ * hard link item. This is only an issue for MAC hooks which cannot re-enter
+ * the filesystem, which currently are the MAC hooks for rename, unlink and
+ * rmdir.
+ */
+#define MAX_AUTHORIZE_ENOENT_RETRIES 1024
+
static int rmdirat_internal(vfs_context_t, int, user_addr_t, enum uio_seg);
static int fsgetpath_internal(vfs_context_t, int, uint64_t, vm_size_t, caddr_t, int *);
int sync_internal(void);
__private_extern__
-int unlink1(vfs_context_t, struct nameidata *, int);
+int unlink1(vfs_context_t, vnode_t, user_addr_t, enum uio_seg, int);
extern lck_grp_t *fd_vn_lck_grp;
extern lck_grp_attr_t *fd_vn_lck_grp_attr;
AUDIT_ARG(fflags, flags);
+#if SECURE_KERNEL
+ if (flags & MNT_UNION) {
+ /* No union mounts on release kernels */
+ error = EPERM;
+ goto out;
+ }
+#endif
+
if ((vp->v_flag & VROOT) &&
(vp->v_mount->mnt_flag & MNT_ROOTFS)) {
if (!(flags & MNT_UNION)) {
flags = (flags & ~(MNT_UPDATE));
}
-#ifdef SECURE_KERNEL
+#if SECURE_KERNEL
if ((flags & MNT_RDONLY) == 0) {
/* Release kernels are not allowed to mount "/" as rw */
error = EPERM;
if ( (error = namei(&nd)) )
goto out1;
- strncpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
+ strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
devvp = nd.ni_vp;
nameidone(&nd);
/* Now that mount is setup, notify the listeners */
vfs_notify_mount(pvp);
+ IOBSDMountChange(mp, kIOMountChangeMount);
+
} else {
/* If we fail a fresh mount, there should be no vnodes left hooked into the mountpoint. */
if (mp->mnt_vnodelist.tqh_first != NULL) {
placed = TRUE;
- strncpy(old_mntonname, mp->mnt_vfsstat.f_mntonname, MAXPATHLEN);
- strncpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);
+ strlcpy(old_mntonname, mp->mnt_vfsstat.f_mntonname, MAXPATHLEN);
+ strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);
/* Forbid future moves */
mount_lock(mp);
return 0;
out3:
- strncpy(mp->mnt_vfsstat.f_mntonname, old_mntonname, MAXPATHLEN);
+ strlcpy(mp->mnt_vfsstat.f_mntonname, old_mntonname, MAXPATHLEN);
mount_lock(mp);
mp->mnt_kern_flag &= ~(MNTK_HAS_MOVED);
}
}
+ IOBSDMountChange(mp, kIOMountChangeUnmount);
+
#if CONFIG_TRIGGERS
vfs_nested_trigger_unmounts(mp, flags, ctx);
did_vflush = 1;
int flags, oflags;
int type, indx, error;
struct flock lf;
- int no_controlling_tty = 0;
- int deny_controlling_tty = 0;
- struct session *sessp = SESSION_NULL;
+ struct vfs_context context;
oflags = uflags;
if ((oflags & O_ACCMODE) == O_ACCMODE)
return(EINVAL);
+
flags = FFLAGS(uflags);
+ CLR(flags, FENCRYPTED);
+ CLR(flags, FUNENCRYPTED);
AUDIT_ARG(fflags, oflags);
AUDIT_ARG(mode, vap->va_mode);
}
uu->uu_dupfd = -indx - 1;
- if (!(p->p_flag & P_CONTROLT)) {
- sessp = proc_session(p);
- no_controlling_tty = 1;
- /*
- * If conditions would warrant getting a controlling tty if
- * the device being opened is a tty (see ttyopen in tty.c),
- * but the open flags deny it, set a flag in the session to
- * prevent it.
- */
- if (SESS_LEADER(p, sessp) &&
- sessp->s_ttyvp == NULL &&
- (flags & O_NOCTTY)) {
- session_lock(sessp);
- sessp->s_flags |= S_NOCTTY;
- session_unlock(sessp);
- deny_controlling_tty = 1;
- }
- }
-
if ((error = vn_open_auth(ndp, &flags, vap))) {
if ((error == ENODEV || error == ENXIO) && (uu->uu_dupfd >= 0)){ /* XXX from fdopen */
if ((error = dupfdopen(p->p_fd, indx, uu->uu_dupfd, flags, error)) == 0) {
fp_drop(p, indx, NULL, 0);
*retval = indx;
- if (deny_controlling_tty) {
- session_lock(sessp);
- sessp->s_flags &= ~S_NOCTTY;
- session_unlock(sessp);
- }
- if (sessp != SESSION_NULL)
- session_rele(sessp);
return (0);
}
}
if (error == ERESTART)
error = EINTR;
fp_free(p, indx, fp);
-
- if (deny_controlling_tty) {
- session_lock(sessp);
- sessp->s_flags &= ~S_NOCTTY;
- session_unlock(sessp);
- }
- if (sessp != SESSION_NULL)
- session_rele(sessp);
return (error);
}
uu->uu_dupfd = 0;
vp = ndp->ni_vp;
- fp->f_fglob->fg_flag = flags & (FMASK | O_EVTONLY);
+ fp->f_fglob->fg_flag = flags & (FMASK | O_EVTONLY | FENCRYPTED | FUNENCRYPTED);
fp->f_fglob->fg_ops = &vnops;
fp->f_fglob->fg_data = (caddr_t)vp;
-#if CONFIG_PROTECT
- if (VATTR_IS_ACTIVE (vap, va_dataprotect_flags)) {
- if (vap->va_dataprotect_flags & VA_DP_RAWENCRYPTED) {
- fp->f_fglob->fg_flag |= FENCRYPTED;
- }
- }
-#endif
-
if (flags & (O_EXLOCK | O_SHLOCK)) {
lf.l_whence = SEEK_SET;
lf.l_start = 0;
if ((flags & O_TRUNC) && ((error = vnode_setsize(vp, (off_t)0, 0, ctx)) != 0))
goto bad;
- /*
- * If the open flags denied the acquisition of a controlling tty,
- * clear the flag in the session structure that prevented the lower
- * level code from assigning one.
- */
- if (deny_controlling_tty) {
- session_lock(sessp);
- sessp->s_flags &= ~S_NOCTTY;
- session_unlock(sessp);
- }
-
- /*
- * If a controlling tty was set by the tty line discipline, then we
- * want to set the vp of the tty into the session structure. We have
- * a race here because we can't get to the vp for the tp in ttyopen,
- * because it's not passed as a parameter in the open path.
- */
- if (no_controlling_tty && (p->p_flag & P_CONTROLT)) {
- vnode_t ttyvp;
-
- session_lock(sessp);
- ttyvp = sessp->s_ttyvp;
- sessp->s_ttyvp = vp;
- sessp->s_ttyvid = vnode_vid(vp);
- session_unlock(sessp);
- }
-
/*
* For directories we hold some additional information in the fd.
*/
vnode_put(vp);
+ /*
+ * The first terminal open (without a O_NOCTTY) by a session leader
+ * results in it being set as the controlling terminal.
+ */
+ if (vnode_istty(vp) && !(p->p_flag & P_CONTROLT) &&
+ !(flags & O_NOCTTY)) {
+ int tmp = 0;
+
+ (void)(*fp->f_fglob->fg_ops->fo_ioctl)(fp, (int)TIOCSCTTY,
+ (caddr_t)&tmp, ctx);
+ }
+
proc_fdlock(p);
if (flags & O_CLOEXEC)
*fdflags(p, indx) |= UF_EXCLOSE;
*retval = indx;
- if (sessp != SESSION_NULL)
- session_rele(sessp);
return (0);
bad:
- if (deny_controlling_tty) {
- session_lock(sessp);
- sessp->s_flags &= ~S_NOCTTY;
- session_unlock(sessp);
- }
- if (sessp != SESSION_NULL)
- session_rele(sessp);
-
- struct vfs_context context = *vfs_context_current();
+ context = *vfs_context_current();
context.vc_ucred = fp->f_fglob->fg_cred;
if ((fp->f_fglob->fg_flag & FHASLOCK) &&
* 2. set a flag to mark it as requiring open-raw-encrypted semantics.
*/
if (flags & O_CREAT) {
- VATTR_SET(&va, va_dataprotect_class, class);
+ /* lower level kernel code validates that the class is valid before applying it. */
+ if (class != PROTECTION_CLASS_DEFAULT) {
+ /*
+ * PROTECTION_CLASS_DEFAULT implies that we make the class for this
+ * file behave the same as open (2)
+ */
+ VATTR_SET(&va, va_dataprotect_class, class);
+ }
}
- if (dpflags & O_DP_GETRAWENCRYPTED) {
+ if (dpflags & (O_DP_GETRAWENCRYPTED|O_DP_GETRAWUNENCRYPTED)) {
if ( flags & (O_RDWR | O_WRONLY)) {
/* Not allowed to write raw encrypted bytes */
return EINVAL;
}
- VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWENCRYPTED);
+ if (uap->dpflags & O_DP_GETRAWENCRYPTED) {
+ VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWENCRYPTED);
+ }
+ if (uap->dpflags & O_DP_GETRAWUNENCRYPTED) {
+ VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWUNENCRYPTED);
+ }
}
error = open1(vfs_context_current(), &nd, uap->flags, &va,
}
switch (uap->mode & S_IFMT) {
- case S_IFMT: /* used by badsect to flag bad sectors */
- VATTR_SET(&va, va_type, VBAD);
- break;
case S_IFCHR:
VATTR_SET(&va, va_type, VCHR);
break;
error = VNOP_SYMLINK(dvp, &vp, &nd.ni_cnd, &va, path, ctx);
#if CONFIG_MACF
- if (error == 0)
+ if (error == 0 && vp)
error = vnode_label(vnode_mount(vp), dvp, vp, &nd.ni_cnd, VNODE_LABEL_CREATE, ctx);
#endif
/* do fallback attribute handling */
- if (error == 0)
+ if (error == 0 && vp)
error = vnode_setattr_fallback(vp, &va, ctx);
if (error == 0) {
int update_flags = 0;
+	/* Check if a new vnode was created, else try to get one. */
if (vp == NULL) {
nd.ni_cnd.cn_nameiop = LOOKUP;
#if CONFIG_TRIGGERS
*/
/* ARGSUSED */
static int
-unlink1at(vfs_context_t ctx, struct nameidata *ndp, int unlink_flags, int fd)
+unlinkat_internal(vfs_context_t ctx, int fd, vnode_t start_dvp,
+ user_addr_t path_arg, enum uio_seg segflg, int unlink_flags)
{
+ struct nameidata nd;
vnode_t vp, dvp;
int error;
struct componentname *cnp;
fse_info finfo;
struct vnode_attr va;
#endif
- int flags = 0;
- int need_event = 0;
- int has_listeners = 0;
- int truncated_path=0;
+ int flags;
+ int need_event;
+ int has_listeners;
+ int truncated_path;
int batched;
- struct vnode_attr *vap = NULL;
+ struct vnode_attr *vap;
+ int do_retry;
+ int retry_count = 0;
+ int cn_flags;
+
+ cn_flags = LOCKPARENT;
+ if (!(unlink_flags & VNODE_REMOVE_NO_AUDIT_PATH))
+ cn_flags |= AUDITVNPATH1;
+ /* If a starting dvp is passed, it trumps any fd passed. */
+ if (start_dvp)
+ cn_flags |= USEDVP;
#if NAMEDRSRCFORK
/* unlink or delete is allowed on rsrc forks and named streams */
- ndp->ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
+ cn_flags |= CN_ALLOWRSRCFORK;
#endif
- ndp->ni_cnd.cn_flags |= LOCKPARENT;
- ndp->ni_flag |= NAMEI_COMPOUNDREMOVE;
- cnp = &ndp->ni_cnd;
+retry:
+ do_retry = 0;
+ flags = 0;
+ need_event = 0;
+ has_listeners = 0;
+ truncated_path = 0;
+ vap = NULL;
+
+ NDINIT(&nd, DELETE, OP_UNLINK, cn_flags, segflg, path_arg, ctx);
+
+ nd.ni_dvp = start_dvp;
+ nd.ni_flag |= NAMEI_COMPOUNDREMOVE;
+ cnp = &nd.ni_cnd;
lookup_continue:
- error = nameiat(ndp, fd);
+ error = nameiat(&nd, fd);
if (error)
return (error);
- dvp = ndp->ni_dvp;
- vp = ndp->ni_vp;
+ dvp = nd.ni_dvp;
+ vp = nd.ni_vp;
/* With Carbon delete semantics, busy files cannot be deleted */
if (!batched) {
error = vn_authorize_unlink(dvp, vp, cnp, ctx, NULL);
if (error) {
+ if (error == ENOENT) {
+ assert(retry_count < MAX_AUTHORIZE_ENOENT_RETRIES);
+ if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
+ do_retry = 1;
+ retry_count++;
+ }
+ }
goto out;
}
}
goto out;
}
}
- len = safe_getpath(dvp, ndp->ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);
+ len = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);
}
#if NAMEDRSRCFORK
- if (ndp->ni_cnd.cn_flags & CN_WANTSRSRCFORK)
+ if (nd.ni_cnd.cn_flags & CN_WANTSRSRCFORK)
error = vnode_removenamedstream(dvp, vp, XATTR_RESOURCEFORK_NAME, 0, ctx);
else
#endif
{
- error = vn_remove(dvp, &ndp->ni_vp, ndp, flags, vap, ctx);
- vp = ndp->ni_vp;
+ error = vn_remove(dvp, &nd.ni_vp, &nd, flags, vap, ctx);
+ vp = nd.ni_vp;
if (error == EKEEPLOOKING) {
if (!batched) {
panic("EKEEPLOOKING, but not a filesystem that supports compound VNOPs?");
}
- if ((ndp->ni_flag & NAMEI_CONTLOOKUP) == 0) {
+ if ((nd.ni_flag & NAMEI_CONTLOOKUP) == 0) {
panic("EKEEPLOOKING, but continue flag not set?");
}
goto out;
}
goto lookup_continue;
+ } else if (error == ENOENT && batched) {
+ assert(retry_count < MAX_AUTHORIZE_ENOENT_RETRIES);
+ if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
+ /*
+ * For compound VNOPs, the authorization callback may
+ * return ENOENT in case of racing hardlink lookups
+ * hitting the name cache, redrive the lookup.
+ */
+ do_retry = 1;
+ retry_count += 1;
+ goto out;
+ }
}
}
* nameidone has to happen before we vnode_put(dvp)
* since it may need to release the fs_nodelock on the dvp
*/
- nameidone(ndp);
+ nameidone(&nd);
vnode_put(dvp);
if (vp) {
vnode_put(vp);
}
+
+ if (do_retry) {
+ goto retry;
+ }
+
return (error);
}
int
-unlink1(vfs_context_t ctx, struct nameidata *ndp, int unlink_flags)
+unlink1(vfs_context_t ctx, vnode_t start_dvp, user_addr_t path_arg,
+    enum uio_seg segflg, int unlink_flags)
{
-	return (unlink1at(ctx, ndp, unlink_flags, AT_FDCWD));
+	/*
+	 * Wrapper over unlinkat_internal(): removes path_arg relative to the
+	 * current working directory (AT_FDCWD), or relative to start_dvp when
+	 * one is supplied — NOTE(review): a non-NULL start_dvp trumps the fd
+	 * inside unlinkat_internal() (it sets USEDVP on the lookup).
+	 */
+	return (unlinkat_internal(ctx, AT_FDCWD, start_dvp, path_arg, segflg,
+	    unlink_flags));
}
/*
- * Delete a name from the filesystem using POSIX semantics.
+ * Delete a name from the filesystem using Carbon semantics.
 */
-static int
-unlinkat_internal(vfs_context_t ctx, int fd, user_addr_t path,
-    enum uio_seg segflg)
+int
+delete(__unused proc_t p, struct delete_args *uap, __unused int32_t *retval)
{
-	struct nameidata nd;
-
-	NDINIT(&nd, DELETE, OP_UNLINK, AUDITVNPATH1, segflg,
-	    path, ctx);
-	return (unlink1at(ctx, &nd, 0, fd));
+	/*
+	 * Like unlink(2), except busy files are refused via
+	 * VNODE_REMOVE_NODELETEBUSY (Carbon delete semantics).
+	 */
+	return (unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
+	    uap->path, UIO_USERSPACE, VNODE_REMOVE_NODELETEBUSY));
}
+/*
+ * Delete a name from the filesystem using POSIX semantics.
+ * No starting vnode (NULLVP) and no special unlink flags are passed.
+ */
int
unlink(__unused proc_t p, struct unlink_args *uap, __unused int32_t *retval)
{
-	return (unlinkat_internal(vfs_context_current(), AT_FDCWD, uap->path,
-	    UIO_USERSPACE));
+	return (unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
+	    uap->path, UIO_USERSPACE, 0));
}
int
uap->path, UIO_USERSPACE));
else
return (unlinkat_internal(vfs_context_current(), uap->fd,
- uap->path, UIO_USERSPACE));
-}
-
-/*
- * Delete a name from the filesystem using Carbon semantics.
- */
-int
-delete(__unused proc_t p, struct delete_args *uap, __unused int32_t *retval)
-{
- struct nameidata nd;
- vfs_context_t ctx = vfs_context_current();
-
- NDINIT(&nd, DELETE, OP_UNLINK, AUDITVNPATH1, UIO_USERSPACE,
- uap->path, ctx);
- return unlink1(ctx, &nd, VNODE_REMOVE_NODELETEBUSY);
+ NULLVP, uap->path, UIO_USERSPACE, 0));
}
/*
return(EINVAL);
}
- NDINIT(&fromnd, LOOKUP, OP_COPYFILE, SAVESTART | AUDITVNPATH1,
+ NDINIT(&fromnd, LOOKUP, OP_COPYFILE, AUDITVNPATH1,
UIO_USERSPACE, uap->from, ctx);
if ((error = namei(&fromnd)))
return (error);
out1:
vnode_put(fvp);
- if (fromnd.ni_startdir)
- vnode_put(fromnd.ni_startdir);
nameidone(&fromnd);
if (error == -1)
struct nameidata *fromnd, *tond;
int error;
int do_retry;
+ int retry_count;
int mntrename;
int need_event;
const char *oname = NULL;
holding_mntlock = 0;
do_retry = 0;
+ retry_count = 0;
retry:
fvp = tvp = NULL;
fdvp = tdvp = NULL;
error = vn_authorize_rename(fdvp, fvp, &fromnd->ni_cnd, tdvp, tvp, &tond->ni_cnd, ctx, NULL);
if (error) {
if (error == ENOENT) {
- /*
- * We encountered a race where after doing the namei, tvp stops
- * being valid. If so, simply re-drive the rename call from the
- * top.
- */
- do_retry = 1;
+ assert(retry_count < MAX_AUTHORIZE_ENOENT_RETRIES);
+ if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
+ /*
+ * We encountered a race where after doing the namei, tvp stops
+ * being valid. If so, simply re-drive the rename call from the
+ * top.
+ */
+ do_retry = 1;
+ retry_count += 1;
+ }
}
goto out1;
}
do_retry = 1;
}
+ /*
+ * For compound VNOPs, the authorization callback may return
+ * ENOENT in case of racing hardlink lookups hitting the name
+ * cache, redrive the lookup.
+ */
+ if (batched && error == ENOENT) {
+ assert(retry_count < MAX_AUTHORIZE_ENOENT_RETRIES);
+ if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
+ do_retry = 1;
+ retry_count += 1;
+ }
+ }
+
goto out1;
}
struct vnode_attr va;
#endif /* CONFIG_FSE */
struct vnode_attr *vap = NULL;
+ int restart_count = 0;
int batched;
int restart_flag;
if (!batched) {
error = vn_authorize_rmdir(dvp, vp, &nd.ni_cnd, ctx, NULL);
if (error) {
+ if (error == ENOENT) {
+ assert(restart_count < MAX_AUTHORIZE_ENOENT_RETRIES);
+ if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
+ restart_flag = 1;
+ restart_count += 1;
+ }
+ }
goto out;
}
}
if (error == EKEEPLOOKING) {
goto continue_lookup;
+ } else if (batched && error == ENOENT) {
+ assert(restart_count < MAX_AUTHORIZE_ENOENT_RETRIES);
+ if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
+ /*
+ * For compound VNOPs, the authorization callback
+ * may return ENOENT in case of racing hard link lookups
+ * redrive the lookup.
+ */
+ restart_flag = 1;
+ restart_count += 1;
+ goto out;
+ }
}
#if CONFIG_APPLEDOUBLE
/*
memp = NULL;
+
/*
* ensure the buffer is large enough for underlying calls
*/
#ifndef HFSIOC_GETPATH
-typedef char pn_t[MAXPATHLEN];
+ typedef char pn_t[MAXPATHLEN];
#define HFSIOC_GETPATH _IOWR('h', 13, pn_t)
#endif
size = MAXPATHLEN;
}
-
if (size > sizeof (stkbuf)) {
if ((memp = (caddr_t)kalloc(size)) == 0) return ENOMEM;
data = memp;
/* Get the vnode for the file we are getting info on: */
if ((error = file_vnode(uap->fd, &vp)))
- goto done;
+ return error;
fd = uap->fd;
if ((error = vnode_getwithref(vp))) {
- goto done;
+ file_drop(fd);
+ return error;
}
#if CONFIG_MACF
- error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd);
- if (error) {
- goto done;
+ if ((error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd))) {
+ file_drop(fd);
+ vnode_put(vp);
+ return error;
}
#endif
error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);
-done:
- if (fd != -1)
- file_drop(fd);
+ file_drop(fd);
- if (vp)
+	/* Validate vp; fsctl_internal() can drop the iocount and reset vp to NULL. */
+ if (vp) {
vnode_put(vp);
+ }
+
return error;
}
/* end of fsctl system call */
return (EINVAL);
if ((error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen) != 0)) {
- return (error);
+ if (error == EPERM) {
+ /* if the string won't fit in attrname, copyinstr emits EPERM */
+ return (ENAMETOOLONG);
+ }
+ /* Otherwise return the default error from copyinstr to detect ERANGE, etc */
+ return error;
}
if (xattr_protected(attrname))
return(EPERM);