/*
 * bsd/vfs/vfs_syscalls.c — Apple XNU (xnu-4903.241.1)
 */
1/*
2 * Copyright (c) 1995-2017 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/*
29 * Copyright (c) 1989, 1993
30 * The Regents of the University of California. All rights reserved.
31 * (c) UNIX System Laboratories, Inc.
32 * All or some portions of this file are derived from material licensed
33 * to the University of California by American Telephone and Telegraph
34 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
35 * the permission of UNIX System Laboratories, Inc.
36 *
37 * Redistribution and use in source and binary forms, with or without
38 * modification, are permitted provided that the following conditions
39 * are met:
40 * 1. Redistributions of source code must retain the above copyright
41 * notice, this list of conditions and the following disclaimer.
42 * 2. Redistributions in binary form must reproduce the above copyright
43 * notice, this list of conditions and the following disclaimer in the
44 * documentation and/or other materials provided with the distribution.
45 * 3. All advertising materials mentioning features or use of this software
46 * must display the following acknowledgement:
47 * This product includes software developed by the University of
48 * California, Berkeley and its contributors.
49 * 4. Neither the name of the University nor the names of its contributors
50 * may be used to endorse or promote products derived from this software
51 * without specific prior written permission.
52 *
53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63 * SUCH DAMAGE.
64 *
65 * @(#)vfs_syscalls.c 8.41 (Berkeley) 6/15/95
66 */
67/*
68 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
69 * support for mandatory and extensible security protections. This notice
70 * is included in support of clause 2.2 (b) of the Apple Public License,
71 * Version 2.0.
72 */
73
74#include <sys/param.h>
75#include <sys/systm.h>
76#include <sys/namei.h>
77#include <sys/filedesc.h>
78#include <sys/kernel.h>
79#include <sys/file_internal.h>
80#include <sys/stat.h>
81#include <sys/vnode_internal.h>
82#include <sys/mount_internal.h>
83#include <sys/proc_internal.h>
84#include <sys/kauth.h>
85#include <sys/uio_internal.h>
86#include <sys/malloc.h>
87#include <sys/mman.h>
88#include <sys/dirent.h>
89#include <sys/attr.h>
90#include <sys/sysctl.h>
91#include <sys/ubc.h>
92#include <sys/quota.h>
93#include <sys/kdebug.h>
94#include <sys/fsevents.h>
95#include <sys/imgsrc.h>
96#include <sys/sysproto.h>
97#include <sys/xattr.h>
98#include <sys/fcntl.h>
99#include <sys/fsctl.h>
100#include <sys/ubc_internal.h>
101#include <sys/disk.h>
102#include <sys/content_protection.h>
103#include <sys/clonefile.h>
104#include <sys/snapshot.h>
105#include <sys/priv.h>
106#include <machine/cons.h>
107#include <machine/limits.h>
108#include <miscfs/specfs/specdev.h>
109
110#include <vfs/vfs_disk_conditioner.h>
111
112#include <security/audit/audit.h>
113#include <bsm/audit_kevents.h>
114
115#include <mach/mach_types.h>
116#include <kern/kern_types.h>
117#include <kern/kalloc.h>
118#include <kern/task.h>
119
120#include <vm/vm_pageout.h>
121#include <vm/vm_protos.h>
122
123#include <libkern/OSAtomic.h>
124#include <pexpert/pexpert.h>
125#include <IOKit/IOBSD.h>
126
127#if ROUTEFS
128#include <miscfs/routefs/routefs.h>
129#endif /* ROUTEFS */
130
131#if CONFIG_MACF
132#include <security/mac.h>
133#include <security/mac_framework.h>
134#endif
135
136#if CONFIG_FSE
137#define GET_PATH(x) \
138 (x) = get_pathbuff();
139#define RELEASE_PATH(x) \
140 release_pathbuff(x);
141#else
142#define GET_PATH(x) \
143 MALLOC_ZONE((x), char *, MAXPATHLEN, M_NAMEI, M_WAITOK);
144#define RELEASE_PATH(x) \
145 FREE_ZONE((x), MAXPATHLEN, M_NAMEI);
146#endif /* CONFIG_FSE */
147
148#ifndef HFS_GET_BOOT_INFO
149#define HFS_GET_BOOT_INFO (FCNTL_FS_SPECIFIC_BASE + 0x00004)
150#endif
151
152#ifndef HFS_SET_BOOT_INFO
153#define HFS_SET_BOOT_INFO (FCNTL_FS_SPECIFIC_BASE + 0x00005)
154#endif
155
156#ifndef APFSIOC_REVERT_TO_SNAPSHOT
157#define APFSIOC_REVERT_TO_SNAPSHOT _IOW('J', 1, u_int64_t)
158#endif
159
160extern void disk_conditioner_unmount(mount_t mp);
161
162/* struct for checkdirs iteration */
163struct cdirargs {
164 vnode_t olddp;
165 vnode_t newdp;
166};
167/* callback for checkdirs iteration */
168static int checkdirs_callback(proc_t p, void * arg);
169
170static int change_dir(struct nameidata *ndp, vfs_context_t ctx);
171static int checkdirs(vnode_t olddp, vfs_context_t ctx);
172void enablequotas(struct mount *mp, vfs_context_t ctx);
173static int getfsstat_callback(mount_t mp, void * arg);
174static int getutimes(user_addr_t usrtvp, struct timespec *tsp);
175static int setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts, int nullflag);
176static int sync_callback(mount_t, void *);
177static int munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
178 user_addr_t bufp, int *sizep, boolean_t is_64_bit,
179 boolean_t partial_copy);
180static int statfs64_common(struct mount *mp, struct vfsstatfs *sfsp,
181 user_addr_t bufp);
182static int fsync_common(proc_t p, struct fsync_args *uap, int flags);
183static int mount_common(char *fstypename, vnode_t pvp, vnode_t vp,
184 struct componentname *cnp, user_addr_t fsmountargs,
185 int flags, uint32_t internal_flags, char *labelstr, boolean_t kernelmount,
186 vfs_context_t ctx);
187void vfs_notify_mount(vnode_t pdvp);
188
189int prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, boolean_t skip_auth);
190
191struct fd_vn_data * fg_vn_data_alloc(void);
192
193/*
194 * Max retries for ENOENT returns from vn_authorize_{rmdir, unlink, rename}
195 * Concurrent lookups (or lookups by ids) on hard links can cause the
196 * vn_getpath (which does not re-enter the filesystem as vn_getpath_fsenter
197 * does) to return ENOENT as the path cannot be returned from the name cache
198 * alone. We have no option but to retry and hope to get one namei->reverse path
199 * generation done without an intervening lookup, lookup by id on the hard link
200 * item. This is only an issue for MAC hooks which cannot reenter the filesystem
201 * which currently are the MAC hooks for rename, unlink and rmdir.
202 */
203#define MAX_AUTHORIZE_ENOENT_RETRIES 1024
204
205static int rmdirat_internal(vfs_context_t, int, user_addr_t, enum uio_seg);
206
207static int fsgetpath_internal(vfs_context_t, int, uint64_t, vm_size_t, caddr_t, int *);
208
209#ifdef CONFIG_IMGSRC_ACCESS
210static int authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx);
211static int place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx);
212static void undo_place_on_covered_vp(mount_t mp, vnode_t vp);
213static int mount_begin_update(mount_t mp, vfs_context_t ctx, int flags);
214static void mount_end_update(mount_t mp);
215static int relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp, const char *fsname, vfs_context_t ctx, boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index);
216#endif /* CONFIG_IMGSRC_ACCESS */
217
218//snapshot functions
219#if CONFIG_MNT_ROOTSNAP
220static int snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx);
221#else
222static int snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx) __attribute__((unused));
223#endif
224
225int (*union_dircheckp)(struct vnode **, struct fileproc *, vfs_context_t);
226
227__private_extern__
228int sync_internal(void);
229
230__private_extern__
231int unlink1(vfs_context_t, vnode_t, user_addr_t, enum uio_seg, int);
232
233extern lck_grp_t *fd_vn_lck_grp;
234extern lck_grp_attr_t *fd_vn_lck_grp_attr;
235extern lck_attr_t *fd_vn_lck_attr;
236
237/*
238 * incremented each time a mount or unmount operation occurs
239 * used to invalidate the cached value of the rootvp in the
240 * mount structure utilized by cache_lookup_path
241 */
242uint32_t mount_generation = 0;
243
244/* counts number of mount and unmount operations */
245unsigned int vfs_nummntops=0;
246
247extern const struct fileops vnops;
248#if CONFIG_APPLEDOUBLE
249extern errno_t rmdir_remove_orphaned_appleDouble(vnode_t, vfs_context_t, int *);
250#endif /* CONFIG_APPLEDOUBLE */
251
252/*
253 * Virtual File System System Calls
254 */
255
256#if NFSCLIENT || DEVFS || ROUTEFS
257/*
258 * Private in-kernel mounting spi (NFS only, not exported)
259 */
260 __private_extern__
261boolean_t
262vfs_iskernelmount(mount_t mp)
263{
264 return ((mp->mnt_kern_flag & MNTK_KERNEL_MOUNT) ? TRUE : FALSE);
265}
266
/*
 * kernel_mount:
 *	Private in-kernel mount entry point (NFS/DEVFS/ROUTEFS only; not
 *	exported).  Mounts 'fstype' over the vnode named by 'path', or over
 *	the caller-supplied vp/pvp pair when vp is provided.
 *
 * Parameters:
 *	fstype		file system type name (vfs name)
 *	pvp		parent of the vnode to be covered, or NULLVP
 *	vp		vnode to be covered, or NULLVP to look it up from 'path'
 *	path		kernel-space path of the mount point
 *	data		file-system-specific mount arguments
 *	syscall_flags	generic MNT_* mount flags
 *	kern_flags	internal KERNEL_MOUNT_* flags
 *	ctx		caller's vfs context
 *
 * Returns:	0 on success, errno on failure.
 *
 * Note: when vp is supplied by the caller, the caller retains ownership of
 * the iocounts on vp/pvp; when this function performs the namei() itself,
 * it releases both iocounts and tears down the nameidata before returning.
 */
__private_extern__
int
kernel_mount(char *fstype, vnode_t pvp, vnode_t vp, const char *path,
             void *data, __unused size_t datalen, int syscall_flags, __unused uint32_t kern_flags, vfs_context_t ctx)
{
	struct nameidata nd;
	boolean_t did_namei;
	int error;

	/* Path is kernel memory, hence UIO_SYSSPACE. */
	NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	       UIO_SYSSPACE, CAST_USER_ADDR_T(path), ctx);

	/*
	 * Get the vnode to be covered if it's not supplied
	 */
	if (vp == NULLVP) {
		error = namei(&nd);
		if (error)
			return (error);
		/* namei() returned iocounts on both; released below. */
		vp = nd.ni_vp;
		pvp = nd.ni_dvp;
		did_namei = TRUE;
	} else {
		/*
		 * Caller supplied the vnodes; mount_common() still needs the
		 * component name, so hand it the kernel path buffer directly.
		 */
		char *pnbuf = CAST_DOWN(char *, path);

		nd.ni_cnd.cn_pnbuf = pnbuf;
		nd.ni_cnd.cn_pnlen = strlen(pnbuf) + 1;
		did_namei = FALSE;
	}

	/* kernelmount == TRUE marks this as an in-kernel mount. */
	error = mount_common(fstype, pvp, vp, &nd.ni_cnd, CAST_USER_ADDR_T(data),
	                     syscall_flags, kern_flags, NULL, TRUE, ctx);

	if (did_namei) {
		vnode_put(vp);
		vnode_put(pvp);
		nameidone(&nd);
	}

	return (error);
}
308#endif /* NFSCLIENT || DEVFS */
309
310/*
311 * Mount a file system.
312 */
313/* ARGSUSED */
314int
315mount(proc_t p, struct mount_args *uap, __unused int32_t *retval)
316{
317 struct __mac_mount_args muap;
318
319 muap.type = uap->type;
320 muap.path = uap->path;
321 muap.flags = uap->flags;
322 muap.data = uap->data;
323 muap.mac_p = USER_ADDR_NULL;
324 return (__mac_mount(p, &muap, retval));
325}
326
/*
 * fmount:
 *	Mount a file system on the directory referenced by the open file
 *	descriptor uap->fd, rather than by path (cf. mount(2)).
 *
 * Returns:	0 on success, errno on failure.
 *	ENOTSUP		imgsrc-by-index or rootfs mounts not allowed here
 *	EPERM		union mounts not allowed via fmount
 *	EINVAL		the fd's vnode has no resolvable parent
 *
 * Resources acquired in order (released in reverse on every error path):
 * fd reference (file_vnode), vnode iocount (vnode_getwithref), parent
 * vnode iocount (vnode_getparent), and the MAXPATHLEN path buffer.
 */
int
fmount(__unused proc_t p, struct fmount_args *uap, __unused int32_t *retval)
{
	struct componentname cn;
	vfs_context_t ctx = vfs_context_current();
	size_t dummy = 0;
	int error;
	int flags = uap->flags;
	char fstypename[MFSNAMELEN];
	char *labelstr = NULL; /* regular mount call always sets it to NULL for __mac_mount() */
	vnode_t pvp;
	vnode_t vp;

	AUDIT_ARG(fd, uap->fd);
	AUDIT_ARG(fflags, flags);
	/* fstypename will get audited by mount_common */

	/* Sanity check the flags */
	if (flags & (MNT_IMGSRC_BY_INDEX|MNT_ROOTFS)) {
		return (ENOTSUP);
	}

	if (flags & MNT_UNION) {
		return (EPERM);
	}

	/* Copy the fs type name in from user space. */
	error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
	if (error) {
		return (error);
	}

	/* Takes a reference on the fd's fileproc; dropped via file_drop(). */
	if ((error = file_vnode(uap->fd, &vp)) != 0) {
		return (error);
	}

	/* Take an iocount on the vnode to be covered. */
	if ((error = vnode_getwithref(vp)) != 0) {
		file_drop(uap->fd);
		return (error);
	}

	/* mount_common() needs the parent (covered-vp's directory) as well. */
	pvp = vnode_getparent(vp);
	if (pvp == NULL) {
		vnode_put(vp);
		file_drop(uap->fd);
		return (EINVAL);
	}

	/*
	 * Synthesize the componentname that a path-based mount would have
	 * gotten from namei(): reconstruct the vnode's path into cn_pnbuf.
	 */
	memset(&cn, 0, sizeof(struct componentname));
	MALLOC(cn.cn_pnbuf, char *, MAXPATHLEN, M_TEMP, M_WAITOK);
	cn.cn_pnlen = MAXPATHLEN;

	if((error = vn_getpath(vp, cn.cn_pnbuf, &cn.cn_pnlen)) != 0) {
		FREE(cn.cn_pnbuf, M_TEMP);
		vnode_put(pvp);
		vnode_put(vp);
		file_drop(uap->fd);
		return (error);
	}

	error = mount_common(fstypename, pvp, vp, &cn, uap->data, flags, 0, labelstr, FALSE, ctx);

	/* Release everything acquired above, in reverse order. */
	FREE(cn.cn_pnbuf, M_TEMP);
	vnode_put(pvp);
	vnode_put(vp);
	file_drop(uap->fd);

	return (error);
}
395
/*
 * vfs_notify_mount:
 *	Notify interested parties that a mount has occurred:
 *	signal a VQ_MOUNT vfs event, then post a NOTE_WRITE knote on the
 *	parent directory of the new mount point.
 */
void
vfs_notify_mount(vnode_t pdvp)
{
	/* NULL mount, NULL data: a system-wide VQ_MOUNT broadcast. */
	vfs_event_signal(NULL, VQ_MOUNT, (intptr_t)NULL);
	lock_vnode_and_post(pdvp, NOTE_WRITE);
}
402
403/*
404 * __mac_mount:
405 * Mount a file system taking into account MAC label behavior.
406 * See mount(2) man page for more information
407 *
408 * Parameters: p Process requesting the mount
409 * uap User argument descriptor (see below)
410 * retval (ignored)
411 *
412 * Indirect: uap->type Filesystem type
413 * uap->path Path to mount
414 * uap->data Mount arguments
415 * uap->mac_p MAC info
416 * uap->flags Mount flags
417 *
418 *
419 * Returns: 0 Success
420 * !0 Not success
421 */
422boolean_t root_fs_upgrade_try = FALSE;
423
int
__mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused int32_t *retval)
{
	vnode_t pvp = NULL;	/* parent of covered vnode; iocount held once namei succeeds */
	vnode_t vp = NULL;	/* vnode to be covered; iocount held once namei succeeds */
	int need_nameidone = 0;	/* tracks whether nameidone(&nd) is owed at 'out' */
	vfs_context_t ctx = vfs_context_current();
	char fstypename[MFSNAMELEN];
	struct nameidata nd;
	size_t dummy=0;
	char *labelstr = NULL;	/* MAC label copied in from user space, if any */
	int flags = uap->flags;
	int error;
#if CONFIG_IMGSRC_ACCESS || CONFIG_MACF
	boolean_t is_64bit = IS_64BIT_PROCESS(p);
#else
#pragma unused(p)
#endif
	/*
	 * Get the fs type name from user space
	 */
	error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
	if (error)
		return (error);

	/*
	 * Get the vnode to be covered
	 */
	NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	       UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error) {
		goto out;
	}
	/* namei succeeded: vp, pvp and nd all need cleanup from here on. */
	need_nameidone = 1;
	vp = nd.ni_vp;
	pvp = nd.ni_dvp;

#ifdef CONFIG_IMGSRC_ACCESS
	/* Mounting image source cannot be batched with other operations */
	/*
	 * NOTE(review): this is an exact-equality test, so it only fires
	 * when MNT_IMGSRC_BY_INDEX is the sole flag set; the by_index
	 * argument below is therefore always true here.
	 */
	if (flags == MNT_IMGSRC_BY_INDEX) {
		error = relocate_imageboot_source(pvp, vp, &nd.ni_cnd, fstypename,
		    ctx, is_64bit, uap->data, (flags == MNT_IMGSRC_BY_INDEX));
		goto out;
	}
#endif /* CONFIG_IMGSRC_ACCESS */

#if CONFIG_MACF
	/*
	 * Get the label string (if any) from user space
	 */
	if (uap->mac_p != USER_ADDR_NULL) {
		struct user_mac mac;
		size_t ulen = 0;

		/* Normalize the 32/64-bit user_mac layouts into 'mac'. */
		if (is_64bit) {
			struct user64_mac mac64;
			error = copyin(uap->mac_p, &mac64, sizeof(mac64));
			mac.m_buflen = mac64.m_buflen;
			mac.m_string = mac64.m_string;
		} else {
			struct user32_mac mac32;
			error = copyin(uap->mac_p, &mac32, sizeof(mac32));
			mac.m_buflen = mac32.m_buflen;
			mac.m_string = mac32.m_string;
		}
		if (error)
			goto out;
		/* Bound the user-supplied length before allocating. */
		if ((mac.m_buflen > MAC_MAX_LABEL_BUF_LEN) ||
		    (mac.m_buflen < 2)) {
			error = EINVAL;
			goto out;
		}
		/* labelstr is freed at 'out' on every path. */
		MALLOC(labelstr, char *, mac.m_buflen, M_MACTEMP, M_WAITOK);
		error = copyinstr(mac.m_string, labelstr, mac.m_buflen, &ulen);
		if (error) {
			goto out;
		}
		AUDIT_ARG(mac_string, labelstr);
	}
#endif /* CONFIG_MACF */

	AUDIT_ARG(fflags, flags);

#if SECURE_KERNEL
	if (flags & MNT_UNION) {
		/* No union mounts on release kernels */
		error = EPERM;
		goto out;
	}
#endif

	/* Special handling when the target is the root of the root file system. */
	if ((vp->v_flag & VROOT) &&
	    (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
		if (!(flags & MNT_UNION)) {
			/* Mounting over '/' is implicitly an update mount. */
			flags |= MNT_UPDATE;
		}
		else {
			/*
			 * For a union mount on '/', treat it as fresh
			 * mount instead of update.
			 * Otherwise, union mouting on '/' used to panic the
			 * system before, since mnt_vnodecovered was found to
			 * be NULL for '/' which is required for unionlookup
			 * after it gets ENOENT on union mount.
			 */
			flags = (flags & ~(MNT_UPDATE));
		}

#if SECURE_KERNEL
		if ((flags & MNT_RDONLY) == 0) {
			/* Release kernels are not allowed to mount "/" as rw */
			error = EPERM;
			goto out;
		}
#endif
		/*
		 * See 7392553 for more details on why this check exists.
		 * Suffice to say: If this check is ON and something tries
		 * to mount the rootFS RW, we'll turn off the codesign
		 * bitmap optimization.
		 */
#if CHECK_CS_VALIDATION_BITMAP
		if ((flags & MNT_RDONLY) == 0 ) {
			root_fs_upgrade_try = TRUE;
		}
#endif
	}

	error = mount_common(fstypename, pvp, vp, &nd.ni_cnd, uap->data, flags, 0,
	                     labelstr, FALSE, ctx);

out:

#if CONFIG_MACF
	if (labelstr)
		FREE(labelstr, M_MACTEMP);
#endif /* CONFIG_MACF */

	/* Drop the iocounts taken by namei() and tear down the nameidata. */
	if (vp) {
		vnode_put(vp);
	}
	if (pvp) {
		vnode_put(pvp);
	}
	if (need_nameidone) {
		nameidone(&nd);
	}

	return (error);
}
575
576/*
577 * common mount implementation (final stage of mounting)
578
579 * Arguments:
580 * fstypename file system type (ie it's vfs name)
581 * pvp parent of covered vnode
582 * vp covered vnode
583 * cnp component name (ie path) of covered vnode
584 * flags generic mount flags
585 * fsmountargs file system specific data
586 * labelstr optional MAC label
587 * kernelmount TRUE for mounts initiated from inside the kernel
588 * ctx caller's context
589 */
590static int
591mount_common(char *fstypename, vnode_t pvp, vnode_t vp,
592 struct componentname *cnp, user_addr_t fsmountargs, int flags, uint32_t internal_flags,
593 char *labelstr, boolean_t kernelmount, vfs_context_t ctx)
594{
595#if !CONFIG_MACF
596#pragma unused(labelstr)
597#endif
598 struct vnode *devvp = NULLVP;
599 struct vnode *device_vnode = NULLVP;
600#if CONFIG_MACF
601 struct vnode *rvp;
602#endif
603 struct mount *mp;
604 struct vfstable *vfsp = (struct vfstable *)0;
605 struct proc *p = vfs_context_proc(ctx);
606 int error, flag = 0;
607 user_addr_t devpath = USER_ADDR_NULL;
608 int ronly = 0;
609 int mntalloc = 0;
610 boolean_t vfsp_ref = FALSE;
611 boolean_t is_rwlock_locked = FALSE;
612 boolean_t did_rele = FALSE;
613 boolean_t have_usecount = FALSE;
614
615 /*
616 * Process an update for an existing mount
617 */
618 if (flags & MNT_UPDATE) {
619 if ((vp->v_flag & VROOT) == 0) {
620 error = EINVAL;
621 goto out1;
622 }
623 mp = vp->v_mount;
624
625 /* unmount in progress return error */
626 mount_lock_spin(mp);
627 if (mp->mnt_lflag & MNT_LUNMOUNT) {
628 mount_unlock(mp);
629 error = EBUSY;
630 goto out1;
631 }
632 mount_unlock(mp);
633 lck_rw_lock_exclusive(&mp->mnt_rwlock);
634 is_rwlock_locked = TRUE;
635 /*
636 * We only allow the filesystem to be reloaded if it
637 * is currently mounted read-only.
638 */
639 if ((flags & MNT_RELOAD) &&
640 ((mp->mnt_flag & MNT_RDONLY) == 0)) {
641 error = ENOTSUP;
642 goto out1;
643 }
644
645 /*
646 * If content protection is enabled, update mounts are not
647 * allowed to turn it off.
648 */
649 if ((mp->mnt_flag & MNT_CPROTECT) &&
650 ((flags & MNT_CPROTECT) == 0)) {
651 error = EINVAL;
652 goto out1;
653 }
654
655#ifdef CONFIG_IMGSRC_ACCESS
656 /* Can't downgrade the backer of the root FS */
657 if ((mp->mnt_kern_flag & MNTK_BACKS_ROOT) &&
658 (!vfs_isrdonly(mp)) && (flags & MNT_RDONLY)) {
659 error = ENOTSUP;
660 goto out1;
661 }
662#endif /* CONFIG_IMGSRC_ACCESS */
663
664 /*
665 * Only root, or the user that did the original mount is
666 * permitted to update it.
667 */
668 if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
669 (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
670 goto out1;
671 }
672#if CONFIG_MACF
673 error = mac_mount_check_remount(ctx, mp);
674 if (error != 0) {
675 goto out1;
676 }
677#endif
678 /*
679 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV,
680 * and MNT_NOEXEC if mount point is already MNT_NOEXEC.
681 */
682 if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
683 flags |= MNT_NOSUID | MNT_NODEV;
684 if (mp->mnt_flag & MNT_NOEXEC)
685 flags |= MNT_NOEXEC;
686 }
687 flag = mp->mnt_flag;
688
689
690
691 mp->mnt_flag |= flags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE);
692
693 vfsp = mp->mnt_vtable;
694 goto update;
695 }
696
697 /*
698 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV, and
699 * MNT_NOEXEC if mount point is already MNT_NOEXEC.
700 */
701 if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
702 flags |= MNT_NOSUID | MNT_NODEV;
703 if (vp->v_mount->mnt_flag & MNT_NOEXEC)
704 flags |= MNT_NOEXEC;
705 }
706
707 /* XXXAUDIT: Should we capture the type on the error path as well? */
708 AUDIT_ARG(text, fstypename);
709 mount_list_lock();
710 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
711 if (!strncmp(vfsp->vfc_name, fstypename, MFSNAMELEN)) {
712 vfsp->vfc_refcount++;
713 vfsp_ref = TRUE;
714 break;
715 }
716 mount_list_unlock();
717 if (vfsp == NULL) {
718 error = ENODEV;
719 goto out1;
720 }
721
722 /*
723 * VFC_VFSLOCALARGS is not currently supported for kernel mounts
724 */
725 if (kernelmount && (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS)) {
726 error = EINVAL; /* unsupported request */
727 goto out1;
728 }
729
730 error = prepare_coveredvp(vp, ctx, cnp, fstypename, ((internal_flags & KERNEL_MOUNT_NOAUTH) != 0));
731 if (error != 0) {
732 goto out1;
733 }
734
735 /*
736 * Allocate and initialize the filesystem (mount_t)
737 */
738 MALLOC_ZONE(mp, struct mount *, (u_int32_t)sizeof(struct mount),
739 M_MOUNT, M_WAITOK);
740 bzero((char *)mp, (u_int32_t)sizeof(struct mount));
741 mntalloc = 1;
742
743 /* Initialize the default IO constraints */
744 mp->mnt_maxreadcnt = mp->mnt_maxwritecnt = MAXPHYS;
745 mp->mnt_segreadcnt = mp->mnt_segwritecnt = 32;
746 mp->mnt_maxsegreadsize = mp->mnt_maxreadcnt;
747 mp->mnt_maxsegwritesize = mp->mnt_maxwritecnt;
748 mp->mnt_devblocksize = DEV_BSIZE;
749 mp->mnt_alignmentmask = PAGE_MASK;
750 mp->mnt_ioqueue_depth = MNT_DEFAULT_IOQUEUE_DEPTH;
751 mp->mnt_ioscale = 1;
752 mp->mnt_ioflags = 0;
753 mp->mnt_realrootvp = NULLVP;
754 mp->mnt_authcache_ttl = CACHED_LOOKUP_RIGHT_TTL;
755
756 TAILQ_INIT(&mp->mnt_vnodelist);
757 TAILQ_INIT(&mp->mnt_workerqueue);
758 TAILQ_INIT(&mp->mnt_newvnodes);
759 mount_lock_init(mp);
760 lck_rw_lock_exclusive(&mp->mnt_rwlock);
761 is_rwlock_locked = TRUE;
762 mp->mnt_op = vfsp->vfc_vfsops;
763 mp->mnt_vtable = vfsp;
764 //mp->mnt_stat.f_type = vfsp->vfc_typenum;
765 mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
766 strlcpy(mp->mnt_vfsstat.f_fstypename, vfsp->vfc_name, MFSTYPENAMELEN);
767 strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);
768 mp->mnt_vnodecovered = vp;
769 mp->mnt_vfsstat.f_owner = kauth_cred_getuid(vfs_context_ucred(ctx));
770 mp->mnt_throttle_mask = LOWPRI_MAX_NUM_DEV - 1;
771 mp->mnt_devbsdunit = 0;
772
773 /* XXX 3762912 hack to support HFS filesystem 'owner' - filesystem may update later */
774 vfs_setowner(mp, KAUTH_UID_NONE, KAUTH_GID_NONE);
775
776#if NFSCLIENT || DEVFS || ROUTEFS
777 if (kernelmount)
778 mp->mnt_kern_flag |= MNTK_KERNEL_MOUNT;
779 if ((internal_flags & KERNEL_MOUNT_PERMIT_UNMOUNT) != 0)
780 mp->mnt_kern_flag |= MNTK_PERMIT_UNMOUNT;
781#endif /* NFSCLIENT || DEVFS */
782
783update:
784
785 /*
786 * Set the mount level flags.
787 */
788 if (flags & MNT_RDONLY)
789 mp->mnt_flag |= MNT_RDONLY;
790 else if (mp->mnt_flag & MNT_RDONLY) {
791 // disallow read/write upgrades of file systems that
792 // had the TYPENAME_OVERRIDE feature set.
793 if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
794 error = EPERM;
795 goto out1;
796 }
797 mp->mnt_kern_flag |= MNTK_WANTRDWR;
798 }
799 mp->mnt_flag &= ~(MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
800 MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
801 MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
802 MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME |
803 MNT_QUARANTINE | MNT_CPROTECT);
804
805#if SECURE_KERNEL
806#if !CONFIG_MNT_SUID
807 /*
808 * On release builds of iOS based platforms, always enforce NOSUID on
809 * all mounts. We do this here because we can catch update mounts as well as
810 * non-update mounts in this case.
811 */
812 mp->mnt_flag |= (MNT_NOSUID);
813#endif
814#endif
815
816 mp->mnt_flag |= flags & (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
817 MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
818 MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
819 MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME |
820 MNT_QUARANTINE | MNT_CPROTECT);
821
822#if CONFIG_MACF
823 if (flags & MNT_MULTILABEL) {
824 if (vfsp->vfc_vfsflags & VFC_VFSNOMACLABEL) {
825 error = EINVAL;
826 goto out1;
827 }
828 mp->mnt_flag |= MNT_MULTILABEL;
829 }
830#endif
831 /*
832 * Process device path for local file systems if requested
833 */
834 if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS &&
835 !(internal_flags & KERNEL_MOUNT_SNAPSHOT)) {
836 if (vfs_context_is64bit(ctx)) {
837 if ( (error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath))) )
838 goto out1;
839 fsmountargs += sizeof(devpath);
840 } else {
841 user32_addr_t tmp;
842 if ( (error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp))) )
843 goto out1;
844 /* munge into LP64 addr */
845 devpath = CAST_USER_ADDR_T(tmp);
846 fsmountargs += sizeof(tmp);
847 }
848
849 /* Lookup device and authorize access to it */
850 if ((devpath)) {
851 struct nameidata nd;
852
853 NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW, UIO_USERSPACE, devpath, ctx);
854 if ( (error = namei(&nd)) )
855 goto out1;
856
857 strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
858 devvp = nd.ni_vp;
859
860 nameidone(&nd);
861
862 if (devvp->v_type != VBLK) {
863 error = ENOTBLK;
864 goto out2;
865 }
866 if (major(devvp->v_rdev) >= nblkdev) {
867 error = ENXIO;
868 goto out2;
869 }
870 /*
871 * If mount by non-root, then verify that user has necessary
872 * permissions on the device.
873 */
874 if (suser(vfs_context_ucred(ctx), NULL) != 0) {
875 mode_t accessmode = KAUTH_VNODE_READ_DATA;
876
877 if ((mp->mnt_flag & MNT_RDONLY) == 0)
878 accessmode |= KAUTH_VNODE_WRITE_DATA;
879 if ((error = vnode_authorize(devvp, NULL, accessmode, ctx)) != 0)
880 goto out2;
881 }
882 }
883 /* On first mount, preflight and open device */
884 if (devpath && ((flags & MNT_UPDATE) == 0)) {
885 if ( (error = vnode_ref(devvp)) )
886 goto out2;
887 /*
888 * Disallow multiple mounts of the same device.
889 * Disallow mounting of a device that is currently in use
890 * (except for root, which might share swap device for miniroot).
891 * Flush out any old buffers remaining from a previous use.
892 */
893 if ( (error = vfs_mountedon(devvp)) )
894 goto out3;
895
896 if (vcount(devvp) > 1 && !(vfs_flags(mp) & MNT_ROOTFS)) {
897 error = EBUSY;
898 goto out3;
899 }
900 if ( (error = VNOP_FSYNC(devvp, MNT_WAIT, ctx)) ) {
901 error = ENOTBLK;
902 goto out3;
903 }
904 if ( (error = buf_invalidateblks(devvp, BUF_WRITE_DATA, 0, 0)) )
905 goto out3;
906
907 ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
908#if CONFIG_MACF
909 error = mac_vnode_check_open(ctx,
910 devvp,
911 ronly ? FREAD : FREAD|FWRITE);
912 if (error)
913 goto out3;
914#endif /* MAC */
915 if ( (error = VNOP_OPEN(devvp, ronly ? FREAD : FREAD|FWRITE, ctx)) )
916 goto out3;
917
918 mp->mnt_devvp = devvp;
919 device_vnode = devvp;
920
921 } else if ((mp->mnt_flag & MNT_RDONLY) &&
922 (mp->mnt_kern_flag & MNTK_WANTRDWR) &&
923 (device_vnode = mp->mnt_devvp)) {
924 dev_t dev;
925 int maj;
926 /*
927 * If upgrade to read-write by non-root, then verify
928 * that user has necessary permissions on the device.
929 */
930 vnode_getalways(device_vnode);
931
932 if (suser(vfs_context_ucred(ctx), NULL) &&
933 (error = vnode_authorize(device_vnode, NULL,
934 KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA,
935 ctx)) != 0) {
936 vnode_put(device_vnode);
937 goto out2;
938 }
939
940 /* Tell the device that we're upgrading */
941 dev = (dev_t)device_vnode->v_rdev;
942 maj = major(dev);
943
944 if ((u_int)maj >= (u_int)nblkdev)
945 panic("Volume mounted on a device with invalid major number.");
946
947 error = bdevsw[maj].d_open(dev, FREAD | FWRITE, S_IFBLK, p);
948 vnode_put(device_vnode);
949 device_vnode = NULLVP;
950 if (error != 0) {
951 goto out2;
952 }
953 }
954 }
955#if CONFIG_MACF
956 if ((flags & MNT_UPDATE) == 0) {
957 mac_mount_label_init(mp);
958 mac_mount_label_associate(ctx, mp);
959 }
960 if (labelstr) {
961 if ((flags & MNT_UPDATE) != 0) {
962 error = mac_mount_check_label_update(ctx, mp);
963 if (error != 0)
964 goto out3;
965 }
966 }
967#endif
968 /*
969 * Mount the filesystem.
970 */
971 if (internal_flags & KERNEL_MOUNT_SNAPSHOT) {
972 error = VFS_IOCTL(mp, VFSIOC_MOUNT_SNAPSHOT,
973 (caddr_t)fsmountargs, 0, ctx);
974 } else {
975 error = VFS_MOUNT(mp, device_vnode, fsmountargs, ctx);
976 }
977
978 if (flags & MNT_UPDATE) {
979 if (mp->mnt_kern_flag & MNTK_WANTRDWR)
980 mp->mnt_flag &= ~MNT_RDONLY;
981 mp->mnt_flag &=~
982 (MNT_UPDATE | MNT_RELOAD | MNT_FORCE);
983 mp->mnt_kern_flag &=~ MNTK_WANTRDWR;
984 if (error)
985 mp->mnt_flag = flag; /* restore flag value */
986 vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
987 lck_rw_done(&mp->mnt_rwlock);
988 is_rwlock_locked = FALSE;
989 if (!error)
990 enablequotas(mp, ctx);
991 goto exit;
992 }
993
994 /*
995 * Put the new filesystem on the mount list after root.
996 */
997 if (error == 0) {
998 struct vfs_attr vfsattr;
999#if CONFIG_MACF
1000 if (vfs_flags(mp) & MNT_MULTILABEL) {
1001 error = VFS_ROOT(mp, &rvp, ctx);
1002 if (error) {
1003 printf("%s() VFS_ROOT returned %d\n", __func__, error);
1004 goto out3;
1005 }
1006 error = vnode_label(mp, NULL, rvp, NULL, 0, ctx);
1007 /*
1008 * drop reference provided by VFS_ROOT
1009 */
1010 vnode_put(rvp);
1011
1012 if (error)
1013 goto out3;
1014 }
1015#endif /* MAC */
1016
1017 vnode_lock_spin(vp);
1018 CLR(vp->v_flag, VMOUNT);
1019 vp->v_mountedhere = mp;
1020 vnode_unlock(vp);
1021
1022 /*
1023 * taking the name_cache_lock exclusively will
1024 * insure that everyone is out of the fast path who
1025 * might be trying to use a now stale copy of
1026 * vp->v_mountedhere->mnt_realrootvp
1027 * bumping mount_generation causes the cached values
1028 * to be invalidated
1029 */
1030 name_cache_lock();
1031 mount_generation++;
1032 name_cache_unlock();
1033
1034 error = vnode_ref(vp);
1035 if (error != 0) {
1036 goto out4;
1037 }
1038
1039 have_usecount = TRUE;
1040
1041 error = checkdirs(vp, ctx);
1042 if (error != 0) {
1043 /* Unmount the filesystem as cdir/rdirs cannot be updated */
1044 goto out4;
1045 }
1046 /*
1047 * there is no cleanup code here so I have made it void
1048 * we need to revisit this
1049 */
1050 (void)VFS_START(mp, 0, ctx);
1051
1052 if (mount_list_add(mp) != 0) {
1053 /*
1054 * The system is shutting down trying to umount
1055 * everything, so fail with a plausible errno.
1056 */
1057 error = EBUSY;
1058 goto out4;
1059 }
1060 lck_rw_done(&mp->mnt_rwlock);
1061 is_rwlock_locked = FALSE;
1062
1063 /* Check if this mounted file system supports EAs or named streams. */
1064 /* Skip WebDAV file systems for now since they hang in VFS_GETATTR here. */
1065 VFSATTR_INIT(&vfsattr);
1066 VFSATTR_WANTED(&vfsattr, f_capabilities);
1067 if (strncmp(mp->mnt_vfsstat.f_fstypename, "webdav", sizeof("webdav")) != 0 &&
1068 vfs_getattr(mp, &vfsattr, ctx) == 0 &&
1069 VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
1070 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR) &&
1071 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR)) {
1072 mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
1073 }
1074#if NAMEDSTREAMS
1075 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS) &&
1076 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS)) {
1077 mp->mnt_kern_flag |= MNTK_NAMED_STREAMS;
1078 }
1079#endif
1080 /* Check if this file system supports path from id lookups. */
1081 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID) &&
1082 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID)) {
1083 mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
1084 } else if (mp->mnt_flag & MNT_DOVOLFS) {
1085 /* Legacy MNT_DOVOLFS flag also implies path from id lookups. */
1086 mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
1087 }
1088
1089 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS) &&
1090 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS)) {
1091 mp->mnt_kern_flag |= MNTK_DIR_HARDLINKS;
1092 }
1093 }
1094 if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSNATIVEXATTR) {
1095 mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
1096 }
1097 if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSPREFLIGHT) {
1098 mp->mnt_kern_flag |= MNTK_UNMOUNT_PREFLIGHT;
1099 }
1100 /* increment the operations count */
1101 OSAddAtomic(1, &vfs_nummntops);
1102 enablequotas(mp, ctx);
1103
1104 if (device_vnode) {
1105 device_vnode->v_specflags |= SI_MOUNTEDON;
1106
1107 /*
1108 * cache the IO attributes for the underlying physical media...
1109 * an error return indicates the underlying driver doesn't
1110 * support all the queries necessary... however, reasonable
1111 * defaults will have been set, so no reason to bail or care
1112 */
1113 vfs_init_io_attributes(device_vnode, mp);
1114 }
1115
1116 /* Now that mount is setup, notify the listeners */
1117 vfs_notify_mount(pvp);
1118 IOBSDMountChange(mp, kIOMountChangeMount);
1119
1120 } else {
1121 /* If we fail a fresh mount, there should be no vnodes left hooked into the mountpoint. */
1122 if (mp->mnt_vnodelist.tqh_first != NULL) {
1123 panic("mount_common(): mount of %s filesystem failed with %d, but vnode list is not empty.",
1124 mp->mnt_vtable->vfc_name, error);
1125 }
1126
1127 vnode_lock_spin(vp);
1128 CLR(vp->v_flag, VMOUNT);
1129 vnode_unlock(vp);
1130 mount_list_lock();
1131 mp->mnt_vtable->vfc_refcount--;
1132 mount_list_unlock();
1133
1134 if (device_vnode ) {
1135 vnode_rele(device_vnode);
1136 VNOP_CLOSE(device_vnode, ronly ? FREAD : FREAD|FWRITE, ctx);
1137 }
1138 lck_rw_done(&mp->mnt_rwlock);
1139 is_rwlock_locked = FALSE;
1140
1141 /*
1142 * if we get here, we have a mount structure that needs to be freed,
1143 * but since the coveredvp hasn't yet been updated to point at it,
1144 * no need to worry about other threads holding a crossref on this mp
1145 * so it's ok to just free it
1146 */
1147 mount_lock_destroy(mp);
1148#if CONFIG_MACF
1149 mac_mount_label_destroy(mp);
1150#endif
1151 FREE_ZONE((caddr_t)mp, sizeof (struct mount), M_MOUNT);
1152 }
1153exit:
1154 /*
1155 * drop I/O count on the device vp if there was one
1156 */
1157 if (devpath && devvp)
1158 vnode_put(devvp);
1159
1160 return(error);
1161
1162/* Error condition exits */
1163out4:
1164 (void)VFS_UNMOUNT(mp, MNT_FORCE, ctx);
1165
1166 /*
1167 * If the mount has been placed on the covered vp,
1168 * it may have been discovered by now, so we have
1169 * to treat this just like an unmount
1170 */
1171 mount_lock_spin(mp);
1172 mp->mnt_lflag |= MNT_LDEAD;
1173 mount_unlock(mp);
1174
1175 if (device_vnode != NULLVP) {
1176 vnode_rele(device_vnode);
1177 VNOP_CLOSE(device_vnode, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD|FWRITE,
1178 ctx);
1179 did_rele = TRUE;
1180 }
1181
1182 vnode_lock_spin(vp);
1183
1184 mp->mnt_crossref++;
1185 vp->v_mountedhere = (mount_t) 0;
1186
1187 vnode_unlock(vp);
1188
1189 if (have_usecount) {
1190 vnode_rele(vp);
1191 }
1192out3:
1193 if (devpath && ((flags & MNT_UPDATE) == 0) && (!did_rele))
1194 vnode_rele(devvp);
1195out2:
1196 if (devpath && devvp)
1197 vnode_put(devvp);
1198out1:
1199 /* Release mnt_rwlock only when it was taken */
1200 if (is_rwlock_locked == TRUE) {
1201 lck_rw_done(&mp->mnt_rwlock);
1202 }
1203
1204 if (mntalloc) {
1205 if (mp->mnt_crossref)
1206 mount_dropcrossref(mp, vp, 0);
1207 else {
1208 mount_lock_destroy(mp);
1209#if CONFIG_MACF
1210 mac_mount_label_destroy(mp);
1211#endif
1212 FREE_ZONE((caddr_t)mp, sizeof (struct mount), M_MOUNT);
1213 }
1214 }
1215 if (vfsp_ref) {
1216 mount_list_lock();
1217 vfsp->vfc_refcount--;
1218 mount_list_unlock();
1219 }
1220
1221 return(error);
1222}
1223
1224/*
1225 * Flush in-core data, check for competing mount attempts,
1226 * and set VMOUNT
1227 */
1228int
1229prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, boolean_t skip_auth)
1230{
1231#if !CONFIG_MACF
1232#pragma unused(cnp,fsname)
1233#endif
1234 struct vnode_attr va;
1235 int error;
1236
1237 if (!skip_auth) {
1238 /*
1239 * If the user is not root, ensure that they own the directory
1240 * onto which we are attempting to mount.
1241 */
1242 VATTR_INIT(&va);
1243 VATTR_WANTED(&va, va_uid);
1244 if ((error = vnode_getattr(vp, &va, ctx)) ||
1245 (va.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
1246 (!vfs_context_issuser(ctx)))) {
1247 error = EPERM;
1248 goto out;
1249 }
1250 }
1251
1252 if ( (error = VNOP_FSYNC(vp, MNT_WAIT, ctx)) )
1253 goto out;
1254
1255 if ( (error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0)) )
1256 goto out;
1257
1258 if (vp->v_type != VDIR) {
1259 error = ENOTDIR;
1260 goto out;
1261 }
1262
1263 if (ISSET(vp->v_flag, VMOUNT) && (vp->v_mountedhere != NULL)) {
1264 error = EBUSY;
1265 goto out;
1266 }
1267
1268#if CONFIG_MACF
1269 error = mac_mount_check_mount(ctx, vp,
1270 cnp, fsname);
1271 if (error != 0)
1272 goto out;
1273#endif
1274
1275 vnode_lock_spin(vp);
1276 SET(vp->v_flag, VMOUNT);
1277 vnode_unlock(vp);
1278
1279out:
1280 return error;
1281}
1282
1283#if CONFIG_IMGSRC_ACCESS
1284
1285#if DEBUG
1286#define IMGSRC_DEBUG(args...) printf(args)
1287#else
1288#define IMGSRC_DEBUG(args...) do { } while(0)
1289#endif
1290
1291static int
1292authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx)
1293{
1294 struct nameidata nd;
1295 vnode_t vp, realdevvp;
1296 mode_t accessmode;
1297 int error;
1298
1299 NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW, UIO_USERSPACE, devpath, ctx);
1300 if ( (error = namei(&nd)) ) {
1301 IMGSRC_DEBUG("namei() failed with %d\n", error);
1302 return error;
1303 }
1304
1305 vp = nd.ni_vp;
1306
1307 if (!vnode_isblk(vp)) {
1308 IMGSRC_DEBUG("Not block device.\n");
1309 error = ENOTBLK;
1310 goto out;
1311 }
1312
1313 realdevvp = mp->mnt_devvp;
1314 if (realdevvp == NULLVP) {
1315 IMGSRC_DEBUG("No device backs the mount.\n");
1316 error = ENXIO;
1317 goto out;
1318 }
1319
1320 error = vnode_getwithref(realdevvp);
1321 if (error != 0) {
1322 IMGSRC_DEBUG("Coudn't get iocount on device.\n");
1323 goto out;
1324 }
1325
1326 if (vnode_specrdev(vp) != vnode_specrdev(realdevvp)) {
1327 IMGSRC_DEBUG("Wrong dev_t.\n");
1328 error = ENXIO;
1329 goto out1;
1330 }
1331
1332 strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
1333
1334 /*
1335 * If mount by non-root, then verify that user has necessary
1336 * permissions on the device.
1337 */
1338 if (!vfs_context_issuser(ctx)) {
1339 accessmode = KAUTH_VNODE_READ_DATA;
1340 if ((mp->mnt_flag & MNT_RDONLY) == 0)
1341 accessmode |= KAUTH_VNODE_WRITE_DATA;
1342 if ((error = vnode_authorize(vp, NULL, accessmode, ctx)) != 0) {
1343 IMGSRC_DEBUG("Access denied.\n");
1344 goto out1;
1345 }
1346 }
1347
1348 *devvpp = vp;
1349
1350out1:
1351 vnode_put(realdevvp);
1352out:
1353 nameidone(&nd);
1354 if (error) {
1355 vnode_put(vp);
1356 }
1357
1358 return error;
1359}
1360
/*
 * Clear VMOUNT, set v_mountedhere, and mnt_vnodecovered, ref the vnode,
 * and call checkdirs()
 *
 * On success the covered vnode vp carries a usecount taken here.
 * On failure the usecount (if taken) has been dropped and
 * mnt_vnodecovered has been cleared.
 */
static int
place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx)
{
	int error;

	mp->mnt_vnodecovered = vp; /* XXX This is normally only set at init-time ... */

	/* Publish the mount on the covered vnode under the vnode lock */
	vnode_lock_spin(vp);
	CLR(vp->v_flag, VMOUNT);
	vp->v_mountedhere = mp;
	vnode_unlock(vp);

	/*
	 * taking the name_cache_lock exclusively will
	 * insure that everyone is out of the fast path who
	 * might be trying to use a now stale copy of
	 * vp->v_mountedhere->mnt_realrootvp
	 * bumping mount_generation causes the cached values
	 * to be invalidated
	 */
	name_cache_lock();
	mount_generation++;
	name_cache_unlock();

	/* Hold a usecount on vp for as long as the mount covers it */
	error = vnode_ref(vp);
	if (error != 0) {
		goto out;
	}

	error = checkdirs(vp, ctx);
	if (error != 0) {
		/* Unmount the filesystem as cdir/rdirs cannot be updated */
		vnode_rele(vp);
		goto out;
	}

out:
	/*
	 * NOTE(review): on failure only mnt_vnodecovered is cleared;
	 * vp->v_mountedhere is left pointing at mp -- confirm callers
	 * undo that before mp is torn down.
	 */
	if (error != 0) {
		mp->mnt_vnodecovered = NULLVP;
	}
	return error;
}
1407
/*
 * Reverse the effects of place_mount_and_checkdirs(): drop the
 * usecount it took on vp, detach the mount from the covered vnode,
 * and clear the mount's record of the covered vnode.
 */
static void
undo_place_on_covered_vp(mount_t mp, vnode_t vp)
{
	/* Drop the usecount taken when the mount was placed on vp */
	vnode_rele(vp);

	/* Detach the mount from vp under the vnode lock */
	vnode_lock_spin(vp);
	vp->v_mountedhere = (mount_t)NULL;
	vnode_unlock(vp);

	mp->mnt_vnodecovered = NULLVP;
}
1418
1419static int
1420mount_begin_update(mount_t mp, vfs_context_t ctx, int flags)
1421{
1422 int error;
1423
1424 /* unmount in progress return error */
1425 mount_lock_spin(mp);
1426 if (mp->mnt_lflag & MNT_LUNMOUNT) {
1427 mount_unlock(mp);
1428 return EBUSY;
1429 }
1430 mount_unlock(mp);
1431 lck_rw_lock_exclusive(&mp->mnt_rwlock);
1432
1433 /*
1434 * We only allow the filesystem to be reloaded if it
1435 * is currently mounted read-only.
1436 */
1437 if ((flags & MNT_RELOAD) &&
1438 ((mp->mnt_flag & MNT_RDONLY) == 0)) {
1439 error = ENOTSUP;
1440 goto out;
1441 }
1442
1443 /*
1444 * Only root, or the user that did the original mount is
1445 * permitted to update it.
1446 */
1447 if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
1448 (!vfs_context_issuser(ctx))) {
1449 error = EPERM;
1450 goto out;
1451 }
1452#if CONFIG_MACF
1453 error = mac_mount_check_remount(ctx, mp);
1454 if (error != 0) {
1455 goto out;
1456 }
1457#endif
1458
1459out:
1460 if (error) {
1461 lck_rw_done(&mp->mnt_rwlock);
1462 }
1463
1464 return error;
1465}
1466
/*
 * Release the exclusive hold on the mount rwlock taken by a
 * successful mount_begin_update().  Must be called exactly once per
 * successful begin.
 */
static void
mount_end_update(mount_t mp)
{
	lck_rw_done(&mp->mnt_rwlock);
}
1472
1473static int
1474get_imgsrc_rootvnode(uint32_t height, vnode_t *rvpp)
1475{
1476 vnode_t vp;
1477
1478 if (height >= MAX_IMAGEBOOT_NESTING) {
1479 return EINVAL;
1480 }
1481
1482 vp = imgsrc_rootvnodes[height];
1483 if ((vp != NULLVP) && (vnode_get(vp) == 0)) {
1484 *rvpp = vp;
1485 return 0;
1486 } else {
1487 return ENOENT;
1488 }
1489}
1490
1491static int
1492relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp,
1493 const char *fsname, vfs_context_t ctx,
1494 boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index)
1495{
1496 int error;
1497 mount_t mp;
1498 boolean_t placed = FALSE;
1499 vnode_t devvp = NULLVP;
1500 struct vfstable *vfsp;
1501 user_addr_t devpath;
1502 char *old_mntonname;
1503 vnode_t rvp;
1504 uint32_t height;
1505 uint32_t flags;
1506
1507 /* If we didn't imageboot, nothing to move */
1508 if (imgsrc_rootvnodes[0] == NULLVP) {
1509 return EINVAL;
1510 }
1511
1512 /* Only root can do this */
1513 if (!vfs_context_issuser(ctx)) {
1514 return EPERM;
1515 }
1516
1517 IMGSRC_DEBUG("looking for root vnode.\n");
1518
1519 /*
1520 * Get root vnode of filesystem we're moving.
1521 */
1522 if (by_index) {
1523 if (is64bit) {
1524 struct user64_mnt_imgsrc_args mia64;
1525 error = copyin(fsmountargs, &mia64, sizeof(mia64));
1526 if (error != 0) {
1527 IMGSRC_DEBUG("Failed to copy in arguments.\n");
1528 return error;
1529 }
1530
1531 height = mia64.mi_height;
1532 flags = mia64.mi_flags;
1533 devpath = mia64.mi_devpath;
1534 } else {
1535 struct user32_mnt_imgsrc_args mia32;
1536 error = copyin(fsmountargs, &mia32, sizeof(mia32));
1537 if (error != 0) {
1538 IMGSRC_DEBUG("Failed to copy in arguments.\n");
1539 return error;
1540 }
1541
1542 height = mia32.mi_height;
1543 flags = mia32.mi_flags;
1544 devpath = mia32.mi_devpath;
1545 }
1546 } else {
1547 /*
1548 * For binary compatibility--assumes one level of nesting.
1549 */
1550 if (is64bit) {
1551 if ( (error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath))) )
1552 return error;
1553 } else {
1554 user32_addr_t tmp;
1555 if ( (error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp))) )
1556 return error;
1557
1558 /* munge into LP64 addr */
1559 devpath = CAST_USER_ADDR_T(tmp);
1560 }
1561
1562 height = 0;
1563 flags = 0;
1564 }
1565
1566 if (flags != 0) {
1567 IMGSRC_DEBUG("%s: Got nonzero flags.\n", __FUNCTION__);
1568 return EINVAL;
1569 }
1570
1571 error = get_imgsrc_rootvnode(height, &rvp);
1572 if (error != 0) {
1573 IMGSRC_DEBUG("getting root vnode failed with %d\n", error);
1574 return error;
1575 }
1576
1577 IMGSRC_DEBUG("got root vnode.\n");
1578
1579 MALLOC(old_mntonname, char*, MAXPATHLEN, M_TEMP, M_WAITOK);
1580
1581 /* Can only move once */
1582 mp = vnode_mount(rvp);
1583 if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
1584 IMGSRC_DEBUG("Already moved.\n");
1585 error = EBUSY;
1586 goto out0;
1587 }
1588
1589 IMGSRC_DEBUG("Starting updated.\n");
1590
1591 /* Get exclusive rwlock on mount, authorize update on mp */
1592 error = mount_begin_update(mp , ctx, 0);
1593 if (error != 0) {
1594 IMGSRC_DEBUG("Starting updated failed with %d\n", error);
1595 goto out0;
1596 }
1597
1598 /*
1599 * It can only be moved once. Flag is set under the rwlock,
1600 * so we're now safe to proceed.
1601 */
1602 if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
1603 IMGSRC_DEBUG("Already moved [2]\n");
1604 goto out1;
1605 }
1606
1607
1608 IMGSRC_DEBUG("Preparing coveredvp.\n");
1609
1610 /* Mark covered vnode as mount in progress, authorize placing mount on top */
1611 error = prepare_coveredvp(vp, ctx, cnp, fsname, FALSE);
1612 if (error != 0) {
1613 IMGSRC_DEBUG("Preparing coveredvp failed with %d.\n", error);
1614 goto out1;
1615 }
1616
1617 IMGSRC_DEBUG("Covered vp OK.\n");
1618
1619 /* Sanity check the name caller has provided */
1620 vfsp = mp->mnt_vtable;
1621 if (strncmp(vfsp->vfc_name, fsname, MFSNAMELEN) != 0) {
1622 IMGSRC_DEBUG("Wrong fs name.\n");
1623 error = EINVAL;
1624 goto out2;
1625 }
1626
1627 /* Check the device vnode and update mount-from name, for local filesystems */
1628 if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) {
1629 IMGSRC_DEBUG("Local, doing device validation.\n");
1630
1631 if (devpath != USER_ADDR_NULL) {
1632 error = authorize_devpath_and_update_mntfromname(mp, devpath, &devvp, ctx);
1633 if (error) {
1634 IMGSRC_DEBUG("authorize_devpath_and_update_mntfromname() failed.\n");
1635 goto out2;
1636 }
1637
1638 vnode_put(devvp);
1639 }
1640 }
1641
1642 /*
1643 * Place mp on top of vnode, ref the vnode, call checkdirs(),
1644 * and increment the name cache's mount generation
1645 */
1646
1647 IMGSRC_DEBUG("About to call place_mount_and_checkdirs().\n");
1648 error = place_mount_and_checkdirs(mp, vp, ctx);
1649 if (error != 0) {
1650 goto out2;
1651 }
1652
1653 placed = TRUE;
1654
1655 strlcpy(old_mntonname, mp->mnt_vfsstat.f_mntonname, MAXPATHLEN);
1656 strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);
1657
1658 /* Forbid future moves */
1659 mount_lock(mp);
1660 mp->mnt_kern_flag |= MNTK_HAS_MOVED;
1661 mount_unlock(mp);
1662
1663 /* Finally, add to mount list, completely ready to go */
1664 if (mount_list_add(mp) != 0) {
1665 /*
1666 * The system is shutting down trying to umount
1667 * everything, so fail with a plausible errno.
1668 */
1669 error = EBUSY;
1670 goto out3;
1671 }
1672
1673 mount_end_update(mp);
1674 vnode_put(rvp);
1675 FREE(old_mntonname, M_TEMP);
1676
1677 vfs_notify_mount(pvp);
1678
1679 return 0;
1680out3:
1681 strlcpy(mp->mnt_vfsstat.f_mntonname, old_mntonname, MAXPATHLEN);
1682
1683 mount_lock(mp);
1684 mp->mnt_kern_flag &= ~(MNTK_HAS_MOVED);
1685 mount_unlock(mp);
1686
1687out2:
1688 /*
1689 * Placing the mp on the vnode clears VMOUNT,
1690 * so cleanup is different after that point
1691 */
1692 if (placed) {
1693 /* Rele the vp, clear VMOUNT and v_mountedhere */
1694 undo_place_on_covered_vp(mp, vp);
1695 } else {
1696 vnode_lock_spin(vp);
1697 CLR(vp->v_flag, VMOUNT);
1698 vnode_unlock(vp);
1699 }
1700out1:
1701 mount_end_update(mp);
1702
1703out0:
1704 vnode_put(rvp);
1705 FREE(old_mntonname, M_TEMP);
1706 return error;
1707}
1708
1709#endif /* CONFIG_IMGSRC_ACCESS */
1710
/*
 * Enable disk quotas on a freshly mounted (or updated) HFS filesystem
 * when the per-type quota trigger files are present.  Errors are
 * deliberately ignored: quota setup must not interfere with the mount.
 */
void
enablequotas(struct mount *mp, vfs_context_t ctx)
{
	struct nameidata qnd;
	int type;
	char qfpath[MAXPATHLEN];
	const char *qfname = QUOTAFILENAME;
	const char *qfopsname = QUOTAOPSNAME;
	const char *qfextension[] = INITQFNAMES;

	/* XXX Should be an MNTK_ flag, instead of strncmp()'s */
	if (strncmp(mp->mnt_vfsstat.f_fstypename, "hfs", sizeof("hfs")) != 0 ) {
		return;
	}
	/*
	 * Enable filesystem disk quotas if necessary.
	 * We ignore errors as this should not interfere with final mount
	 */
	for (type=0; type < MAXQUOTAS; type++) {
		/* Look for the per-type trigger ("ops") file in the mount root */
		snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfopsname, qfextension[type]);
		NDINIT(&qnd, LOOKUP, OP_MOUNT, FOLLOW, UIO_SYSSPACE,
		       CAST_USER_ADDR_T(qfpath), ctx);
		if (namei(&qnd) != 0)
			continue;	    /* option file to trigger quotas is not present */
		/* drop the iocount taken by namei() */
		vnode_put(qnd.ni_vp);
		nameidone(&qnd);
		/* Trigger present: turn quotas on using the actual quota file */
		snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfname, qfextension[type]);

		(void) VFS_QUOTACTL(mp, QCMD(Q_QUOTAON, type), 0, qfpath, ctx);
	}
	return;
}
1743
1744
/*
 * Per-process callback for checkdirs(): if the process's current or
 * root directory is the covered vnode (olddp), retarget it to the
 * root of the newly mounted filesystem (newdp), moving the usecount
 * from the old vnode to the new one.  Always returns PROC_RETURNED so
 * the proc iteration continues.
 */
static int
checkdirs_callback(proc_t p, void * arg)
{
	struct cdirargs * cdrp = (struct cdirargs * )arg;
	vnode_t olddp = cdrp->olddp;
	vnode_t newdp = cdrp->newdp;
	struct filedesc *fdp;
	vnode_t tvp;
	vnode_t fdp_cvp;
	vnode_t fdp_rvp;
	int cdir_changed = 0;
	int rdir_changed = 0;

	/*
	 * XXX Also needs to iterate each thread in the process to see if it
	 * XXX is using a per-thread current working directory, and, if so,
	 * XXX update that as well.
	 */

	/* Snapshot the cwd/root pointers under the fd lock */
	proc_fdlock(p);
	fdp = p->p_fd;
	if (fdp == (struct filedesc *)0) {
		proc_fdunlock(p);
		return(PROC_RETURNED);
	}
	fdp_cvp = fdp->fd_cdir;
	fdp_rvp = fdp->fd_rdir;
	proc_fdunlock(p);

	/*
	 * NOTE(review): fdp->fd_cdir / fd_rdir are re-read below after
	 * the fd lock has been dropped, while the comparisons use the
	 * snapshots taken above -- confirm this window is benign.
	 */
	if (fdp_cvp == olddp) {
		/* take a usecount on newdp before publishing it as the cwd */
		vnode_ref(newdp);
		tvp = fdp->fd_cdir;
		fdp_cvp = newdp;
		cdir_changed = 1;
		vnode_rele(tvp);
	}
	if (fdp_rvp == olddp) {
		vnode_ref(newdp);
		tvp = fdp->fd_rdir;
		fdp_rvp = newdp;
		rdir_changed = 1;
		vnode_rele(tvp);
	}
	/* Re-acquire the fd lock only if something actually changed */
	if (cdir_changed || rdir_changed) {
		proc_fdlock(p);
		fdp->fd_cdir = fdp_cvp;
		fdp->fd_rdir = fdp_rvp;
		proc_fdunlock(p);
	}
	return(PROC_RETURNED);
}
1796
1797
1798
/*
 * Scan all active processes to see if any of them have a current
 * or root directory onto which the new filesystem has just been
 * mounted. If so, replace them with the new mount point.
 *
 * olddp is the covered vnode; the new root is obtained via VFS_ROOT()
 * of olddp->v_mountedhere.  The system rootvnode is updated too when
 * it was the covered vnode.
 */
static int
checkdirs(vnode_t olddp, vfs_context_t ctx)
{
	vnode_t newdp;
	vnode_t tvp;
	int err;
	struct cdirargs cdr;

	/* Only our caller's usecount exists: no cwd/root references olddp */
	if (olddp->v_usecount == 1)
		return(0);
	/* Get the root of the covering filesystem (returned with an iocount) */
	err = VFS_ROOT(olddp->v_mountedhere, &newdp, ctx);

	if (err != 0) {
#if DIAGNOSTIC
		panic("mount: lost mount: error %d", err);
#endif
		return(err);
	}

	cdr.olddp = olddp;
	cdr.newdp = newdp;
	/* do not block for exec/fork trans as the vp in cwd & rootdir are not changing */
	proc_iterate(PROC_ALLPROCLIST | PROC_NOWAITTRANS, checkdirs_callback, (void *)&cdr, NULL, NULL);

	/* Swap the global root vnode as well, moving the usecount */
	if (rootvnode == olddp) {
		vnode_ref(newdp);
		tvp = rootvnode;
		rootvnode = newdp;
		vnode_rele(tvp);
	}

	/* drop the iocount taken by VFS_ROOT */
	vnode_put(newdp);
	return(0);
}
1838
1839/*
1840 * Unmount a file system.
1841 *
1842 * Note: unmount takes a path to the vnode mounted on as argument,
1843 * not special file (as before).
1844 */
1845/* ARGSUSED */
1846int
1847unmount(__unused proc_t p, struct unmount_args *uap, __unused int32_t *retval)
1848{
1849 vnode_t vp;
1850 struct mount *mp;
1851 int error;
1852 struct nameidata nd;
1853 vfs_context_t ctx = vfs_context_current();
1854
1855 NDINIT(&nd, LOOKUP, OP_UNMOUNT, FOLLOW | AUDITVNPATH1,
1856 UIO_USERSPACE, uap->path, ctx);
1857 error = namei(&nd);
1858 if (error)
1859 return (error);
1860 vp = nd.ni_vp;
1861 mp = vp->v_mount;
1862 nameidone(&nd);
1863
1864#if CONFIG_MACF
1865 error = mac_mount_check_umount(ctx, mp);
1866 if (error != 0) {
1867 vnode_put(vp);
1868 return (error);
1869 }
1870#endif
1871 /*
1872 * Must be the root of the filesystem
1873 */
1874 if ((vp->v_flag & VROOT) == 0) {
1875 vnode_put(vp);
1876 return (EINVAL);
1877 }
1878 mount_ref(mp, 0);
1879 vnode_put(vp);
1880 /* safedounmount consumes the mount ref */
1881 return (safedounmount(mp, uap->flags, ctx));
1882}
1883
1884int
1885vfs_unmountbyfsid(fsid_t *fsid, int flags, vfs_context_t ctx)
1886{
1887 mount_t mp;
1888
1889 mp = mount_list_lookupby_fsid(fsid, 0, 1);
1890 if (mp == (mount_t)0) {
1891 return(ENOENT);
1892 }
1893 mount_ref(mp, 0);
1894 mount_iterdrop(mp);
1895 /* safedounmount consumes the mount ref */
1896 return(safedounmount(mp, flags, ctx));
1897}
1898
1899
/*
 * The mount struct comes with a mount ref which will be consumed.
 * Do the actual file system unmount, prevent some common foot shooting.
 *
 * Rejects: non-responsive filesystems under MNT_NOBLOCK (unless
 * forced), callers that are neither root nor the original mounter,
 * the root filesystem, and mounts backing an imageboot root.
 */
int
safedounmount(struct mount *mp, int flags, vfs_context_t ctx)
{
	int error;
	proc_t p = vfs_context_proc(ctx);

	/*
	 * If the file system is not responding and MNT_NOBLOCK
	 * is set and not a forced unmount then return EBUSY.
	 *
	 * NOTE(review): MNT_LNOTRESP is tested against mnt_kern_flag
	 * here, while MNT_L* flags elsewhere in this file live in
	 * mnt_lflag -- confirm the intended flag/field pairing.
	 */
	if ((mp->mnt_kern_flag & MNT_LNOTRESP) &&
	    (flags & MNT_NOBLOCK) && ((flags & MNT_FORCE) == 0)) {
		error = EBUSY;
		goto out;
	}

	/*
	 * Skip authorization if the mount is tagged as permissive and
	 * this is not a forced-unmount attempt.
	 */
	if (!(((mp->mnt_kern_flag & MNTK_PERMIT_UNMOUNT) != 0) && ((flags & MNT_FORCE) == 0))) {
		/*
		 * Only root, or the user that did the original mount is
		 * permitted to unmount this filesystem.
		 */
		if ((mp->mnt_vfsstat.f_owner != kauth_cred_getuid(kauth_cred_get())) &&
		    (error = suser(kauth_cred_get(), &p->p_acflag)))
			goto out;
	}
	/*
	 * Don't allow unmounting the root file system.
	 */
	if (mp->mnt_flag & MNT_ROOTFS) {
		error = EBUSY; /* the root is always busy */
		goto out;
	}

/*
 * NOTE(review): the rest of this file gates imgsrc code with
 * "#if CONFIG_IMGSRC_ACCESS"; "#ifdef" here is also true when the
 * option is defined to 0 -- confirm this difference is intended.
 */
#ifdef CONFIG_IMGSRC_ACCESS
	/* A mount backing the imageboot root can never be unmounted */
	if (mp->mnt_kern_flag & MNTK_BACKS_ROOT) {
		error = EBUSY;
		goto out;
	}
#endif /* CONFIG_IMGSRC_ACCESS */

	/* dounmount() consumes the mount ref on success and failure */
	return (dounmount(mp, flags, 1, ctx));

out:
	mount_drop(mp, 0);
	return(error);
}
1954
/*
 * Do the actual file system unmount.
 *
 * mp carries a mount ref iff withref != 0; that ref is dropped here.
 * flags are the MNT_* unmount flags (MNT_FORCE, MNT_NOBLOCK,
 * MNT_LNOSUB).  Returns 0 on success or an errno; on failure the
 * mount is returned to service (MNTK_UNMOUNT / MNT_LUNMOUNT /
 * MNT_LFORCE are cleared again).
 */
int
dounmount(struct mount *mp, int flags, int withref, vfs_context_t ctx)
{
	vnode_t coveredvp = (vnode_t)0;
	int error;
	int needwakeup = 0;
	int forcedunmount = 0;
	int lflags = 0;
	struct vnode *devvp = NULLVP;
#if CONFIG_TRIGGERS
	proc_t p = vfs_context_proc(ctx);
	int did_vflush = 0;
	int pflags_save = 0;
#endif /* CONFIG_TRIGGERS */

#if CONFIG_FSE
	if (!(flags & MNT_FORCE)) {
		fsevent_unmount(mp, ctx); /* has to come first! */
	}
#endif

	mount_lock(mp);

	/*
	 * If already an unmount in progress just return EBUSY.
	 * Even a forced unmount cannot override.
	 */
	if (mp->mnt_lflag & MNT_LUNMOUNT) {
		if (withref != 0)
			mount_drop(mp, 1);
		mount_unlock(mp);
		return (EBUSY);
	}

	if (flags & MNT_FORCE) {
		forcedunmount = 1;
		mp->mnt_lflag |= MNT_LFORCE;
	}

#if CONFIG_TRIGGERS
	/* Keep this process from hanging on unresponsive remote filesystems */
	if (flags & MNT_NOBLOCK && p != kernproc)
		pflags_save = OSBitOrAtomic(P_NOREMOTEHANG, &p->p_flag);
#endif

	/* Publish unmount-in-progress state under the mount lock */
	mp->mnt_kern_flag |= MNTK_UNMOUNT;
	mp->mnt_lflag |= MNT_LUNMOUNT;
	mp->mnt_flag &=~ MNT_ASYNC;
	/*
	 * anyone currently in the fast path that
	 * trips over the cached rootvp will be
	 * dumped out and forced into the slow path
	 * to regenerate a new cached value
	 */
	mp->mnt_realrootvp = NULLVP;
	mount_unlock(mp);

	if (forcedunmount && (flags & MNT_LNOSUB) == 0) {
		/*
		 * Force unmount any mounts in this filesystem.
		 * If any unmounts fail - just leave them dangling.
		 * Avoids recursion.
		 */
		(void) dounmount_submounts(mp, flags | MNT_LNOSUB, ctx);
	}

	/*
	 * taking the name_cache_lock exclusively will
	 * insure that everyone is out of the fast path who
	 * might be trying to use a now stale copy of
	 * vp->v_mountedhere->mnt_realrootvp
	 * bumping mount_generation causes the cached values
	 * to be invalidated
	 */
	name_cache_lock();
	mount_generation++;
	name_cache_unlock();


	lck_rw_lock_exclusive(&mp->mnt_rwlock);
	if (withref != 0)
		mount_drop(mp, 0);
	error = 0;
	if (forcedunmount == 0) {
		/* Non-forced: flush everything first; a sync failure aborts */
		ubc_umount(mp);	/* release cached vnodes */
		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
			error = VFS_SYNC(mp, MNT_WAIT, ctx);
			if (error) {
				/* Back out the in-progress state and fail */
				mount_lock(mp);
				mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
				mp->mnt_lflag &= ~MNT_LUNMOUNT;
				mp->mnt_lflag &= ~MNT_LFORCE;
				goto out;
			}
		}
	}

	/* free disk_conditioner_info structure for this mount */
	disk_conditioner_unmount(mp);

	IOBSDMountChange(mp, kIOMountChangeUnmount);

#if CONFIG_TRIGGERS
	vfs_nested_trigger_unmounts(mp, flags, ctx);
	did_vflush = 1;
#endif
	if (forcedunmount)
		lflags |= FORCECLOSE;
	error = vflush(mp, NULLVP, SKIPSWAP | SKIPSYSTEM | SKIPROOT | lflags);
	if ((forcedunmount == 0) && error) {
		/* Busy vnodes remain on a non-forced unmount: back out */
		mount_lock(mp);
		mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
		mp->mnt_lflag &= ~MNT_LUNMOUNT;
		mp->mnt_lflag &= ~MNT_LFORCE;
		goto out;
	}

	/* make sure there are no one in the mount iterations or lookup */
	mount_iterdrain(mp);

	error = VFS_UNMOUNT(mp, flags, ctx);
	if (error) {
		/* Filesystem refused the unmount: back out */
		mount_iterreset(mp);
		mount_lock(mp);
		mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
		mp->mnt_lflag &= ~MNT_LUNMOUNT;
		mp->mnt_lflag &= ~MNT_LFORCE;
		goto out;
	}

	/* increment the operations count */
	if (!error)
		OSAddAtomic(1, &vfs_nummntops);

	if ( mp->mnt_devvp && mp->mnt_vtable->vfc_vfsflags & VFC_VFSLOCALARGS) {
		/* hold an io reference and drop the usecount before close */
		devvp = mp->mnt_devvp;
		vnode_getalways(devvp);
		vnode_rele(devvp);
		VNOP_CLOSE(devvp, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD|FWRITE,
		    ctx);
		vnode_clearmountedon(devvp);
		vnode_put(devvp);
	}
	/* Drop the rwlock across mount_list_remove, then retake it */
	lck_rw_done(&mp->mnt_rwlock);
	mount_list_remove(mp);
	lck_rw_lock_exclusive(&mp->mnt_rwlock);

	/* mark the mount point hook in the vp but not drop the ref yet */
	if ((coveredvp = mp->mnt_vnodecovered) != NULLVP) {
		/*
		 * The covered vnode needs special handling. Trying to get an
		 * iocount must not block here as this may lead to deadlocks
		 * if the Filesystem to which the covered vnode belongs is
		 * undergoing forced unmounts. Since we hold a usecount, the
		 * vnode cannot be reused (it can, however, still be terminated)
		 */
		vnode_getalways(coveredvp);
		vnode_lock_spin(coveredvp);

		mp->mnt_crossref++;
		coveredvp->v_mountedhere = (struct mount *)0;
		CLR(coveredvp->v_flag, VMOUNT);

		vnode_unlock(coveredvp);
		vnode_put(coveredvp);
	}

	mount_list_lock();
	mp->mnt_vtable->vfc_refcount--;
	mount_list_unlock();

	cache_purgevfs(mp);	/* remove cache entries for this file sys */
	vfs_event_signal(NULL, VQ_UNMOUNT, (intptr_t)NULL);
	mount_lock(mp);
	mp->mnt_lflag |= MNT_LDEAD;

	if (mp->mnt_lflag & MNT_LWAIT) {
		/*
		 * do the wakeup here
		 * in case we block in mount_refdrain
		 * which will drop the mount lock
		 * and allow anyone blocked in vfs_busy
		 * to wakeup and see the LDEAD state
		 */
		mp->mnt_lflag &= ~MNT_LWAIT;
		wakeup((caddr_t)mp);
	}
	mount_refdrain(mp);
out:
	/* Common exit: mount lock is held here on every path */
	if (mp->mnt_lflag & MNT_LWAIT) {
		mp->mnt_lflag &= ~MNT_LWAIT;
		needwakeup = 1;
	}

#if CONFIG_TRIGGERS
	if (flags & MNT_NOBLOCK && p != kernproc) {
		// Restore P_NOREMOTEHANG bit to its previous value
		if ((pflags_save & P_NOREMOTEHANG) == 0)
			OSBitAndAtomic(~((uint32_t) P_NOREMOTEHANG), &p->p_flag);
	}

	/*
	 * Callback and context are set together under the mount lock, and
	 * never cleared, so we're safe to examine them here, drop the lock,
	 * and call out.
	 */
	if (mp->mnt_triggercallback != NULL) {
		mount_unlock(mp);
		if (error == 0) {
			mp->mnt_triggercallback(mp, VTC_RELEASE, mp->mnt_triggerdata, ctx);
		} else if (did_vflush) {
			mp->mnt_triggercallback(mp, VTC_REPLACE, mp->mnt_triggerdata, ctx);
		}
	} else {
		mount_unlock(mp);
	}
#else
	mount_unlock(mp);
#endif /* CONFIG_TRIGGERS */

	lck_rw_done(&mp->mnt_rwlock);

	if (needwakeup)
		wakeup((caddr_t)mp);

	if (!error) {
		if ((coveredvp != NULLVP)) {
			vnode_t pvp = NULLVP;

			/*
			 * The covered vnode needs special handling. Trying to
			 * get an iocount must not block here as this may lead
			 * to deadlocks if the Filesystem to which the covered
			 * vnode belongs is undergoing forced unmounts. Since we
			 * hold a usecount, the vnode cannot be reused
			 * (it can, however, still be terminated).
			 */
			vnode_getalways(coveredvp);

			mount_dropcrossref(mp, coveredvp, 0);
			/*
			 * We'll _try_ to detect if this really needs to be
			 * done. The coveredvp can only be in termination (or
			 * terminated) if the coveredvp's mount point is in a
			 * forced unmount (or has been) since we still hold the
			 * ref.
			 */
			if (!vnode_isrecycled(coveredvp)) {
				pvp = vnode_getparent(coveredvp);
#if CONFIG_TRIGGERS
				if (coveredvp->v_resolve) {
					vnode_trigger_rearm(coveredvp, ctx);
				}
#endif
			}

			/* Drop the usecount the mount held on its cover */
			vnode_rele(coveredvp);
			vnode_put(coveredvp);
			coveredvp = NULLVP;

			/* Notify watchers of the parent directory */
			if (pvp) {
				lock_vnode_and_post(pvp, NOTE_WRITE);
				vnode_put(pvp);
			}
		} else if (mp->mnt_flag & MNT_ROOTFS) {
			/* Root mount has no cover: destroy the mount outright */
			mount_lock_destroy(mp);
#if CONFIG_MACF
			mac_mount_label_destroy(mp);
#endif
			FREE_ZONE((caddr_t)mp, sizeof (struct mount), M_MOUNT);
		} else
			panic("dounmount: no coveredvp");
	}
	return (error);
}
2233
/*
 * Unmount any mounts in this filesystem.
 *
 * Collects the fsids of all mounts transitively covered by mp, then
 * unmounts them deepest-first.  Individual unmount failures are
 * deliberately ignored (the submounts are left dangling).
 */
void
dounmount_submounts(struct mount *mp, int flags, vfs_context_t ctx)
{
	mount_t smp;
	fsid_t *fsids, fsid;
	int fsids_sz;
	int count = 0, i, m = 0;
	vnode_t vp;

	mount_list_lock();

	// Get an array to hold the submounts fsids.
	TAILQ_FOREACH(smp, &mountlist, mnt_list)
		count++;
	fsids_sz = count * sizeof(fsid_t);
	/* M_NOWAIT: we must not sleep while holding the mount list lock */
	MALLOC(fsids, fsid_t *, fsids_sz, M_TEMP, M_NOWAIT);
	if (fsids == NULL) {
		mount_list_unlock();
		goto out;
	}
	fsids[0] = mp->mnt_vfsstat.f_fsid;	// Prime the pump

	/*
	 * Fill the array with submount fsids.
	 * Since mounts are always added to the tail of the mount list, the
	 * list is always in mount order.
	 * For each mount check if the mounted-on vnode belongs to a
	 * mount that's already added to our array of mounts to be unmounted.
	 */
	for (smp = TAILQ_NEXT(mp, mnt_list); smp; smp = TAILQ_NEXT(smp, mnt_list)) {
		vp = smp->mnt_vnodecovered;
		if (vp == NULL)
			continue;
		fsid = vnode_mount(vp)->mnt_vfsstat.f_fsid;	// Underlying fsid
		for (i = 0; i <= m; i++) {
			if (fsids[i].val[0] == fsid.val[0] &&
			    fsids[i].val[1] == fsid.val[1]) {
				/* Covered by a known mount: record this submount */
				fsids[++m] = smp->mnt_vfsstat.f_fsid;
				break;
			}
		}
	}
	mount_list_unlock();

	// Unmount the submounts in reverse order. Ignore errors.
	for (i = m; i > 0; i--) {
		smp = mount_list_lookupby_fsid(&fsids[i], 0, 1);
		if (smp) {
			/* dounmount() consumes this mount ref */
			mount_ref(smp, 0);
			mount_iterdrop(smp);
			(void) dounmount(smp, flags, 1, ctx);
		}
	}
out:
	if (fsids)
		FREE(fsids, M_TEMP);
}
2294
/*
 * Drop one cross reference on 'mp' held against its covered vnode 'dp'.
 * If that was the last cross ref and 'mp' is no longer the mount covering
 * 'dp' (the unmount already disassociated them), destroy the mount
 * structure here.  'need_put' additionally releases the caller's iocount
 * on 'dp' (while the vnode lock is still held).
 */
void
mount_dropcrossref(mount_t mp, vnode_t dp, int need_put)
{
	vnode_lock(dp);
	mp->mnt_crossref--;

	if (mp->mnt_crossref < 0)
		panic("mount cross refs -ve");

	if ((mp != dp->v_mountedhere) && (mp->mnt_crossref == 0)) {

		if (need_put)
			vnode_put_locked(dp);
		vnode_unlock(dp);

		/* Last cross ref and no longer mounted here: free the mount */
		mount_lock_destroy(mp);
#if CONFIG_MACF
		mac_mount_label_destroy(mp);
#endif
		FREE_ZONE((caddr_t)mp, sizeof (struct mount), M_MOUNT);
		return;
	}
	if (need_put)
		vnode_put_locked(dp);
	vnode_unlock(dp);
}
2321
2322
2323/*
2324 * Sync each mounted filesystem.
2325 */
#if DIAGNOSTIC
int syncprt = 0;	/* when set, sync paths call vfs_bufstats() */
#endif

int print_vmpage_stat=0;	/* when set, sync paths call vm_countdirtypages() */
2331
2332static int
2333sync_callback(mount_t mp, __unused void *arg)
2334{
2335 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
2336 int asyncflag = mp->mnt_flag & MNT_ASYNC;
2337
2338 mp->mnt_flag &= ~MNT_ASYNC;
2339 VFS_SYNC(mp, arg ? MNT_WAIT : MNT_NOWAIT, vfs_context_kernel());
2340 if (asyncflag)
2341 mp->mnt_flag |= MNT_ASYNC;
2342 }
2343
2344 return (VFS_RETURNED);
2345}
2346
/* ARGSUSED */
/*
 * sync(2): trigger a non-blocking flush of every mounted filesystem.
 * Always returns success.
 */
int
sync(__unused proc_t p, __unused struct sync_args *uap, __unused int32_t *retval)
{
	/* NULL arg selects MNT_NOWAIT in sync_callback() */
	vfs_iterate(LK_NOWAIT, sync_callback, NULL);

	if (print_vmpage_stat) {
		vm_countdirtypages();
	}

#if DIAGNOSTIC
	if (syncprt)
		vfs_bufstats();
#endif /* DIAGNOSTIC */
	return 0;
}
2363
/*
 * Media selector for sync_internal_callback(): sync everything, only
 * "reliable" media (local mounts not on a virtual device), or only
 * the remaining (unreliable) media.
 */
typedef enum {
	SYNC_ALL = 0,
	SYNC_ONLY_RELIABLE_MEDIA = 1,
	SYNC_ONLY_UNRELIABLE_MEDIA = 2
} sync_type_t;
2369
2370static int
2371sync_internal_callback(mount_t mp, void *arg)
2372{
2373 if (arg) {
2374 int is_reliable = !(mp->mnt_kern_flag & MNTK_VIRTUALDEV) &&
2375 (mp->mnt_flag & MNT_LOCAL);
2376 sync_type_t sync_type = *((sync_type_t *)arg);
2377
2378 if ((sync_type == SYNC_ONLY_RELIABLE_MEDIA) && !is_reliable)
2379 return (VFS_RETURNED);
2380 else if ((sync_type = SYNC_ONLY_UNRELIABLE_MEDIA) && is_reliable)
2381 return (VFS_RETURNED);
2382 }
2383
2384 (void)sync_callback(mp, NULL);
2385
2386 return (VFS_RETURNED);
2387}
2388
/* Both variables are protected by sync_mtx_lck */
int sync_thread_state = 0;
int sync_timeout_seconds = 5;

#define SYNC_THREAD_RUN		0x0001	/* a(nother) sync pass has been requested */
#define SYNC_THREAD_RUNNING	0x0002	/* sync_thread is currently alive */
2394
/*
 * Kernel thread doing the actual flushing for sync_internal().
 * Loops as long as new passes are requested (SYNC_THREAD_RUN),
 * syncing reliable media first, then unreliable media.  All state
 * transitions happen under sync_mtx_lck.
 */
static void
sync_thread(__unused void *arg, __unused wait_result_t wr)
{
	sync_type_t sync_type;

	lck_mtx_lock(sync_mtx_lck);
	while (sync_thread_state & SYNC_THREAD_RUN) {
		sync_thread_state &= ~SYNC_THREAD_RUN;
		lck_mtx_unlock(sync_mtx_lck);

		/* Reliable (local, non-virtual) media first, then the rest */
		sync_type = SYNC_ONLY_RELIABLE_MEDIA;
		vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);
		sync_type = SYNC_ONLY_UNRELIABLE_MEDIA;
		vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);

		lck_mtx_lock(sync_mtx_lck);
	}
	/*
	 * This wakeup _has_ to be issued before the lock is released otherwise
	 * we may end up waking up a thread in sync_internal which is
	 * expecting a wakeup from a thread it just created and not from this
	 * thread which is about to exit.
	 */
	wakeup(&sync_thread_state);
	sync_thread_state &= ~SYNC_THREAD_RUNNING;
	lck_mtx_unlock(sync_mtx_lck);

	if (print_vmpage_stat) {
		vm_countdirtypages();
	}

#if DIAGNOSTIC
	if (syncprt)
		vfs_bufstats();
#endif /* DIAGNOSTIC */
}
2431
/* Last time the sync-timeout message was printed (rate limits the printf) */
struct timeval sync_timeout_last_print = {0, 0};

/*
 * An in-kernel sync for power management to call.
 * This function always returns within sync_timeout seconds.
 *
 * Requests a pass from sync_thread (spawning it if necessary) and waits
 * at most sync_timeout_seconds for its completion wakeup.  Always
 * returns 0; a timed-out sync simply keeps running in the thread.
 */
__private_extern__ int
sync_internal(void)
{
	thread_t thd;
	int error;
	int thread_created = FALSE;
	struct timespec ts = {sync_timeout_seconds, 0};

	lck_mtx_lock(sync_mtx_lck);
	sync_thread_state |= SYNC_THREAD_RUN;
	if (!(sync_thread_state & SYNC_THREAD_RUNNING)) {
		int kr;

		sync_thread_state |= SYNC_THREAD_RUNNING;
		kr = kernel_thread_start(sync_thread, NULL, &thd);
		if (kr != KERN_SUCCESS) {
			sync_thread_state &= ~SYNC_THREAD_RUNNING;
			lck_mtx_unlock(sync_mtx_lck);
			printf("sync_thread failed\n");
			return (0);
		}
		thread_created = TRUE;
	}

	/* PDROP releases sync_mtx_lck when msleep returns */
	error = msleep((caddr_t)&sync_thread_state, sync_mtx_lck,
	    (PVFS | PDROP | PCATCH), "sync_thread", &ts);
	if (error) {
		struct timeval now;

		microtime(&now);
		if (now.tv_sec - sync_timeout_last_print.tv_sec > 120) {
			printf("sync timed out: %d sec\n", sync_timeout_seconds);
			sync_timeout_last_print.tv_sec = now.tv_sec;
		}
	}

	if (thread_created)
		thread_deallocate(thd);

	return (0);
} /* end of sync_internal call */
2479
2480/*
2481 * Change filesystem quotas.
2482 */
2483#if QUOTA
/*
 * quotactl(2): manipulate filesystem quotas.  The path identifies the
 * mount to operate on; the subcommand determines how uap->arg is
 * interpreted (quota file path, dqblk pointer, or status integer).
 */
int
quotactl(proc_t p, struct quotactl_args *uap, __unused int32_t *retval)
{
	struct mount *mp;
	int error, quota_cmd, quota_status = 0;
	caddr_t datap;
	size_t fnamelen;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();
	struct dqblk my_dqblk = {};

	AUDIT_ARG(uid, uap->uid);
	AUDIT_ARG(cmd, uap->cmd);
	/* Resolve the path only to locate the mount it lives on */
	NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
	    uap->path, ctx);
	error = namei(&nd);
	if (error)
		return (error);
	mp = nd.ni_vp->v_mount;
	vnode_put(nd.ni_vp);
	nameidone(&nd);

	/* copyin any data we will need for downstream code */
	quota_cmd = uap->cmd >> SUBCMDSHIFT;

	switch (quota_cmd) {
	case Q_QUOTAON:
		/* uap->arg specifies a file from which to take the quotas */
		fnamelen = MAXPATHLEN;
		datap = kalloc(MAXPATHLEN);
		error = copyinstr(uap->arg, datap, MAXPATHLEN, &fnamelen);
		break;
	case Q_GETQUOTA:
		/* uap->arg is a pointer to a dqblk structure. */
		datap = (caddr_t) &my_dqblk;
		break;
	case Q_SETQUOTA:
	case Q_SETUSE:
		/* uap->arg is a pointer to a dqblk structure. */
		datap = (caddr_t) &my_dqblk;
		if (proc_is64bit(p)) {
			/* 64-bit user layout differs; munge into kernel dqblk */
			struct user_dqblk my_dqblk64;
			error = copyin(uap->arg, (caddr_t)&my_dqblk64, sizeof (my_dqblk64));
			if (error == 0) {
				munge_dqblk(&my_dqblk, &my_dqblk64, FALSE);
			}
		}
		else {
			error = copyin(uap->arg, (caddr_t)&my_dqblk, sizeof (my_dqblk));
		}
		break;
	case Q_QUOTASTAT:
		/* uap->arg is a pointer to an integer */
		datap = (caddr_t) &quota_status;
		break;
	default:
		datap = NULL;
		break;
	} /* switch */

	if (error == 0) {
		error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, datap, ctx);
	}

	/* Copy results back out and/or release temporary storage */
	switch (quota_cmd) {
	case Q_QUOTAON:
		if (datap != NULL)
			kfree(datap, MAXPATHLEN);
		break;
	case Q_GETQUOTA:
		/* uap->arg is a pointer to a dqblk structure we need to copy out to */
		if (error == 0) {
			if (proc_is64bit(p)) {
				struct user_dqblk my_dqblk64;

				memset(&my_dqblk64, 0, sizeof(my_dqblk64));
				munge_dqblk(&my_dqblk, &my_dqblk64, TRUE);
				error = copyout((caddr_t)&my_dqblk64, uap->arg, sizeof (my_dqblk64));
			}
			else {
				error = copyout(datap, uap->arg, sizeof (struct dqblk));
			}
		}
		break;
	case Q_QUOTASTAT:
		/* uap->arg is a pointer to an integer */
		if (error == 0) {
			error = copyout(datap, uap->arg, sizeof(quota_status));
		}
		break;
	default:
		break;
	} /* switch */

	return (error);
}
2580#else
/* Quota support not compiled in: quotactl(2) always fails */
int
quotactl(__unused proc_t p, __unused struct quotactl_args *uap, __unused int32_t *retval)
{
	return (EOPNOTSUPP);
}
2586#endif /* QUOTA */
2587
2588/*
2589 * Get filesystem statistics.
2590 *
2591 * Returns: 0 Success
2592 * namei:???
2593 * vfs_update_vfsstat:???
2594 * munge_statfs:EFAULT
2595 */
2596/* ARGSUSED */
2597int
2598statfs(__unused proc_t p, struct statfs_args *uap, __unused int32_t *retval)
2599{
2600 struct mount *mp;
2601 struct vfsstatfs *sp;
2602 int error;
2603 struct nameidata nd;
2604 vfs_context_t ctx = vfs_context_current();
2605 vnode_t vp;
2606
2607 NDINIT(&nd, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
2608 UIO_USERSPACE, uap->path, ctx);
2609 error = namei(&nd);
2610 if (error != 0)
2611 return (error);
2612 vp = nd.ni_vp;
2613 mp = vp->v_mount;
2614 sp = &mp->mnt_vfsstat;
2615 nameidone(&nd);
2616
2617#if CONFIG_MACF
2618 error = mac_mount_check_stat(ctx, mp);
2619 if (error != 0)
2620 return (error);
2621#endif
2622
2623 error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT);
2624 if (error != 0) {
2625 vnode_put(vp);
2626 return (error);
2627 }
2628
2629 error = munge_statfs(mp, sp, uap->buf, NULL, IS_64BIT_PROCESS(p), TRUE);
2630 vnode_put(vp);
2631 return (error);
2632}
2633
2634/*
2635 * Get filesystem statistics.
2636 */
/* ARGSUSED */
/*
 * fstatfs(2): statistics for the filesystem containing the vnode
 * referenced by uap->fd.
 */
int
fstatfs(__unused proc_t p, struct fstatfs_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	struct mount *mp;
	struct vfsstatfs *sp;
	int error;

	AUDIT_ARG(fd, uap->fd);

	/* file_drop() must balance this reference on every exit path */
	if ( (error = file_vnode(uap->fd, &vp)) )
		return (error);

	error = vnode_getwithref(vp);
	if (error) {
		file_drop(uap->fd);
		return (error);
	}

	AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);

	mp = vp->v_mount;
	if (!mp) {
		error = EBADF;
		goto out;
	}

#if CONFIG_MACF
	error = mac_mount_check_stat(vfs_context_current(), mp);
	if (error != 0)
		goto out;
#endif

	sp = &mp->mnt_vfsstat;
	if ((error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)) != 0) {
		goto out;
	}

	/* Copy out, handling 32/64-bit user processes */
	error = munge_statfs(mp, sp, uap->buf, NULL, IS_64BIT_PROCESS(p), TRUE);

out:
	file_drop(uap->fd);
	vnode_put(vp);

	return (error);
}
2684
2685/*
2686 * Common routine to handle copying of statfs64 data to user space
2687 */
static int
statfs64_common(struct mount *mp, struct vfsstatfs *sfsp, user_addr_t bufp)
{
	int error;
	struct statfs64 sfs;

	/* Zero-fill so padding/unused fields never leak kernel stack */
	bzero(&sfs, sizeof(sfs));

	sfs.f_bsize = sfsp->f_bsize;
	sfs.f_iosize = (int32_t)sfsp->f_iosize;
	sfs.f_blocks = sfsp->f_blocks;
	sfs.f_bfree = sfsp->f_bfree;
	sfs.f_bavail = sfsp->f_bavail;
	sfs.f_files = sfsp->f_files;
	sfs.f_ffree = sfsp->f_ffree;
	sfs.f_fsid = sfsp->f_fsid;
	sfs.f_owner = sfsp->f_owner;
	sfs.f_type = mp->mnt_vtable->vfc_typenum;
	/* Only expose user-visible mount flags */
	sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
	sfs.f_fssubtype = sfsp->f_fssubtype;
	/* An override (e.g. for compatibility) wins over the real fs type name */
	if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
		strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSTYPENAMELEN);
	} else {
		strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSTYPENAMELEN);
	}
	strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MAXPATHLEN);
	strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MAXPATHLEN);

	error = copyout((caddr_t)&sfs, bufp, sizeof(sfs));

	return(error);
}
2720
2721/*
2722 * Get file system statistics in 64-bit mode
2723 */
2724int
2725statfs64(__unused struct proc *p, struct statfs64_args *uap, __unused int32_t *retval)
2726{
2727 struct mount *mp;
2728 struct vfsstatfs *sp;
2729 int error;
2730 struct nameidata nd;
2731 vfs_context_t ctxp = vfs_context_current();
2732 vnode_t vp;
2733
2734 NDINIT(&nd, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
2735 UIO_USERSPACE, uap->path, ctxp);
2736 error = namei(&nd);
2737 if (error != 0)
2738 return (error);
2739 vp = nd.ni_vp;
2740 mp = vp->v_mount;
2741 sp = &mp->mnt_vfsstat;
2742 nameidone(&nd);
2743
2744#if CONFIG_MACF
2745 error = mac_mount_check_stat(ctxp, mp);
2746 if (error != 0)
2747 return (error);
2748#endif
2749
2750 error = vfs_update_vfsstat(mp, ctxp, VFS_USER_EVENT);
2751 if (error != 0) {
2752 vnode_put(vp);
2753 return (error);
2754 }
2755
2756 error = statfs64_common(mp, sp, uap->buf);
2757 vnode_put(vp);
2758
2759 return (error);
2760}
2761
2762/*
2763 * Get file system statistics in 64-bit mode
2764 */
/*
 * fstatfs64(2): statistics (statfs64 layout) for the filesystem
 * containing the vnode referenced by uap->fd.
 */
int
fstatfs64(__unused struct proc *p, struct fstatfs64_args *uap, __unused int32_t *retval)
{
	struct vnode *vp;
	struct mount *mp;
	struct vfsstatfs *sp;
	int error;

	AUDIT_ARG(fd, uap->fd);

	/* file_drop() must balance this reference on every exit path */
	if ( (error = file_vnode(uap->fd, &vp)) )
		return (error);

	error = vnode_getwithref(vp);
	if (error) {
		file_drop(uap->fd);
		return (error);
	}

	AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);

	mp = vp->v_mount;
	if (!mp) {
		error = EBADF;
		goto out;
	}

#if CONFIG_MACF
	error = mac_mount_check_stat(vfs_context_current(), mp);
	if (error != 0)
		goto out;
#endif

	sp = &mp->mnt_vfsstat;
	if ((error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)) != 0) {
		goto out;
	}

	error = statfs64_common(mp, sp, uap->buf);

out:
	file_drop(uap->fd);
	vnode_put(vp);

	return (error);
}
2811
/* Shared iterator state for the getfsstat / getfsstat64 callbacks */
struct getfsstat_struct {
	user_addr_t sfsp;	/* user buffer cursor for statfs records */
	user_addr_t *mp;	/* optional array of user MAC label buffers */
	int count;		/* number of mounts visited so far */
	int maxcount;		/* capacity of the user buffer, in records */
	int flags;		/* MNT_NOWAIT / MNT_WAIT / MNT_DWAIT */
	int error;		/* first error encountered, if any */
};
2820
2821
/*
 * vfs_iterate() callback for getfsstat(): copy one statfs record (and
 * optionally its MAC label) to the user buffer, refreshing cached
 * statistics unless MNT_NOWAIT was requested.
 */
static int
getfsstat_callback(mount_t mp, void * arg)
{

	struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
	struct vfsstatfs *sp;
	int error, my_size;
	vfs_context_t ctx = vfs_context_current();

	if (fstp->sfsp && fstp->count < fstp->maxcount) {
#if CONFIG_MACF
		error = mac_mount_check_stat(ctx, mp);
		if (error != 0) {
			fstp->error = error;
			return(VFS_RETURNED_DONE);
		}
#endif
		sp = &mp->mnt_vfsstat;
		/*
		 * If MNT_NOWAIT is specified, do not refresh the
		 * fsstat cache. MNT_WAIT/MNT_DWAIT overrides MNT_NOWAIT.
		 */
		if (((fstp->flags & MNT_NOWAIT) == 0 || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
		    (error = vfs_update_vfsstat(mp, ctx,
		    VFS_USER_EVENT))) {
			KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
			return(VFS_RETURNED);
		}

		/*
		 * Need to handle LP64 version of struct statfs
		 */
		error = munge_statfs(mp, sp, fstp->sfsp, &my_size, IS_64BIT_PROCESS(vfs_context_proc(ctx)), FALSE);
		if (error) {
			fstp->error = error;
			return(VFS_RETURNED_DONE);
		}
		/* Advance past the record just written (size varies by ABI) */
		fstp->sfsp += my_size;

		if (fstp->mp) {
#if CONFIG_MACF
			error = mac_mount_label_get(mp, *fstp->mp);
			if (error) {
				fstp->error = error;
				return(VFS_RETURNED_DONE);
			}
#endif
			fstp->mp++;
		}
	}
	/* Count every mount, even those beyond the buffer's capacity */
	fstp->count++;
	return(VFS_RETURNED);
}
2875
2876/*
2877 * Get statistics on all filesystems.
2878 */
2879int
2880getfsstat(__unused proc_t p, struct getfsstat_args *uap, int *retval)
2881{
2882 struct __mac_getfsstat_args muap;
2883
2884 muap.buf = uap->buf;
2885 muap.bufsize = uap->bufsize;
2886 muap.mac = USER_ADDR_NULL;
2887 muap.macsize = 0;
2888 muap.flags = uap->flags;
2889
2890 return (__mac_getfsstat(p, &muap, retval));
2891}
2892
2893/*
2894 * __mac_getfsstat: Get MAC-related file system statistics
2895 *
2896 * Parameters: p (ignored)
2897 * uap User argument descriptor (see below)
2898 * retval Count of file system statistics (N stats)
2899 *
2900 * Indirect: uap->bufsize Buffer size
2901 * uap->macsize MAC info size
2902 * uap->buf Buffer where information will be returned
2903 * uap->mac MAC info
2904 * uap->flags File system flags
2905 *
2906 *
2907 * Returns: 0 Success
2908 * !0 Not success
2909 *
2910 */
int
__mac_getfsstat(__unused proc_t p, struct __mac_getfsstat_args *uap, int *retval)
{
	user_addr_t sfsp;
	user_addr_t *mp;
	size_t count, maxcount, bufsize, macsize;
	struct getfsstat_struct fst;

	bufsize = (size_t) uap->bufsize;
	macsize = (size_t) uap->macsize;

	/* Record size depends on the caller's ABI */
	if (IS_64BIT_PROCESS(p)) {
		maxcount = bufsize / sizeof(struct user64_statfs);
	}
	else {
		maxcount = bufsize / sizeof(struct user32_statfs);
	}
	sfsp = uap->buf;
	count = 0;

	mp = NULL;

#if CONFIG_MACF
	if (uap->mac != USER_ADDR_NULL) {
		u_int32_t *mp0;
		int error;
		unsigned int i;

		/* The MAC label array must match the statfs buffer 1:1 */
		count = (macsize / (IS_64BIT_PROCESS(p) ? 8 : 4));
		if (count != maxcount)
			return (EINVAL);

		/* Copy in the array */
		MALLOC(mp0, u_int32_t *, macsize, M_MACTEMP, M_WAITOK);
		if (mp0 == NULL) {
			return (ENOMEM);
		}

		error = copyin(uap->mac, mp0, macsize);
		if (error) {
			FREE(mp0, M_MACTEMP);
			return (error);
		}

		/* Normalize to an array of user_addr_t */
		MALLOC(mp, user_addr_t *, count * sizeof(user_addr_t), M_MACTEMP, M_WAITOK);
		if (mp == NULL) {
			FREE(mp0, M_MACTEMP);
			return (ENOMEM);
		}

		for (i = 0; i < count; i++) {
			if (IS_64BIT_PROCESS(p))
				mp[i] = ((user_addr_t *)mp0)[i];
			else
				mp[i] = (user_addr_t)mp0[i];
		}
		FREE(mp0, M_MACTEMP);
	}
#endif


	/*
	 * NOTE(review): maxcount (size_t) is narrowed into fst.maxcount
	 * (int) — presumably bufsize is bounded by the syscall layer;
	 * verify no truncation is possible for huge user buffers.
	 */
	fst.sfsp = sfsp;
	fst.mp = mp;
	fst.flags = uap->flags;
	fst.count = 0;
	fst.error = 0;
	fst.maxcount = maxcount;


	vfs_iterate(0, getfsstat_callback, &fst);

	if (mp)
		FREE(mp, M_MACTEMP);

	if (fst.error ) {
		KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
		return(fst.error);
	}

	/* Buffer exhausted: report capacity; otherwise the true mount count */
	if (fst.sfsp && fst.count > fst.maxcount)
		*retval = fst.maxcount;
	else
		*retval = fst.count;
	return (0);
}
2997
/*
 * vfs_iterate() callback for getfsstat64(): copy one fixed-size
 * statfs64 record to the user buffer, refreshing cached statistics
 * unless MNT_NOWAIT was requested.
 */
static int
getfsstat64_callback(mount_t mp, void * arg)
{
	struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
	struct vfsstatfs *sp;
	int error;

	if (fstp->sfsp && fstp->count < fstp->maxcount) {
#if CONFIG_MACF
		error = mac_mount_check_stat(vfs_context_current(), mp);
		if (error != 0) {
			fstp->error = error;
			return(VFS_RETURNED_DONE);
		}
#endif
		sp = &mp->mnt_vfsstat;
		/*
		 * If MNT_NOWAIT is specified, do not refresh the fsstat
		 * cache. MNT_WAIT overrides MNT_NOWAIT.
		 *
		 * We treat MNT_DWAIT as MNT_WAIT for all instances of
		 * getfsstat, since the constants are out of the same
		 * namespace.
		 */
		if (((fstp->flags & MNT_NOWAIT) == 0 ||
		    (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
		    (error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT))) {
			KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
			return(VFS_RETURNED);
		}

		error = statfs64_common(mp, sp, fstp->sfsp);
		if (error) {
			fstp->error = error;
			return(VFS_RETURNED_DONE);
		}
		/* Fixed record size: always one struct statfs64 */
		fstp->sfsp += sizeof(struct statfs64);
	}
	/* Count every mount, even those beyond the buffer's capacity */
	fstp->count++;
	return(VFS_RETURNED);
}
3039
3040/*
3041 * Get statistics on all file systems in 64 bit mode.
3042 */
3043int
3044getfsstat64(__unused proc_t p, struct getfsstat64_args *uap, int *retval)
3045{
3046 user_addr_t sfsp;
3047 int count, maxcount;
3048 struct getfsstat_struct fst;
3049
3050 maxcount = uap->bufsize / sizeof(struct statfs64);
3051
3052 sfsp = uap->buf;
3053 count = 0;
3054
3055 fst.sfsp = sfsp;
3056 fst.flags = uap->flags;
3057 fst.count = 0;
3058 fst.error = 0;
3059 fst.maxcount = maxcount;
3060
3061 vfs_iterate(0, getfsstat64_callback, &fst);
3062
3063 if (fst.error ) {
3064 KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
3065 return(fst.error);
3066 }
3067
3068 if (fst.sfsp && fst.count > fst.maxcount)
3069 *retval = fst.maxcount;
3070 else
3071 *retval = fst.count;
3072
3073 return (0);
3074}
3075
3076/*
3077 * gets the associated vnode with the file descriptor passed.
3078 * as input
3079 *
3080 * INPUT
3081 * ctx - vfs context of caller
3082 * fd - file descriptor for which vnode is required.
3083 * vpp - Pointer to pointer to vnode to be returned.
3084 *
3085 * The vnode is returned with an iocount so any vnode obtained
3086 * by this call needs a vnode_put
3087 *
3088 */
3089int
3090vnode_getfromfd(vfs_context_t ctx, int fd, vnode_t *vpp)
3091{
3092 int error;
3093 vnode_t vp;
3094 struct fileproc *fp;
3095 proc_t p = vfs_context_proc(ctx);
3096
3097 *vpp = NULLVP;
3098
3099 error = fp_getfvp(p, fd, &fp, &vp);
3100 if (error)
3101 return (error);
3102
3103 error = vnode_getwithref(vp);
3104 if (error) {
3105 (void)fp_drop(p, fd, fp, 0);
3106 return (error);
3107 }
3108
3109 (void)fp_drop(p, fd, fp, 0);
3110 *vpp = vp;
3111 return (error);
3112}
3113
3114/*
3115 * Wrapper function around namei to start lookup from a directory
3116 * specified by a file descriptor ni_dirfd.
3117 *
3118 * In addition to all the errors returned by namei, this call can
3119 * return ENOTDIR if the file descriptor does not refer to a directory.
3120 * and EBADF if the file descriptor is not valid.
3121 */
/*
 * namei() wrapper: for a relative path, start the lookup at the
 * directory referenced by 'dirfd' instead of the CWD.  Absolute paths,
 * AT_FDCWD, continued lookups, and callers that already supplied a
 * starting dvp fall straight through to namei().
 */
int
nameiat(struct nameidata *ndp, int dirfd)
{
	if ((dirfd != AT_FDCWD) &&
	    !(ndp->ni_flag & NAMEI_CONTLOOKUP) &&
	    !(ndp->ni_cnd.cn_flags & USEDVP)) {
		int error = 0;
		char c;

		/* Peek at the first path byte to detect an absolute path */
		if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
			error = copyin(ndp->ni_dirp, &c, sizeof(char));
			if (error)
				return (error);
		} else {
			c = *((char *)(ndp->ni_dirp));
		}

		if (c != '/') {
			vnode_t dvp_at;

			/* Relative path: anchor the lookup at dirfd's vnode */
			error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
			    &dvp_at);
			if (error)
				return (error);

			if (vnode_vtype(dvp_at) != VDIR) {
				vnode_put(dvp_at);
				return (ENOTDIR);
			}

			ndp->ni_dvp = dvp_at;
			ndp->ni_cnd.cn_flags |= USEDVP;
			error = namei(ndp);
			ndp->ni_cnd.cn_flags &= ~USEDVP;
			vnode_put(dvp_at);
			return (error);
		}
	}

	return (namei(ndp));
}
3163
3164/*
3165 * Change current working directory to a given file descriptor.
3166 */
3167/* ARGSUSED */
3168static int
3169common_fchdir(proc_t p, struct fchdir_args *uap, int per_thread)
3170{
3171 struct filedesc *fdp = p->p_fd;
3172 vnode_t vp;
3173 vnode_t tdp;
3174 vnode_t tvp;
3175 struct mount *mp;
3176 int error;
3177 vfs_context_t ctx = vfs_context_current();
3178
3179 AUDIT_ARG(fd, uap->fd);
3180 if (per_thread && uap->fd == -1) {
3181 /*
3182 * Switching back from per-thread to per process CWD; verify we
3183 * in fact have one before proceeding. The only success case
3184 * for this code path is to return 0 preemptively after zapping
3185 * the thread structure contents.
3186 */
3187 thread_t th = vfs_context_thread(ctx);
3188 if (th) {
3189 uthread_t uth = get_bsdthread_info(th);
3190 tvp = uth->uu_cdir;
3191 uth->uu_cdir = NULLVP;
3192 if (tvp != NULLVP) {
3193 vnode_rele(tvp);
3194 return (0);
3195 }
3196 }
3197 return (EBADF);
3198 }
3199
3200 if ( (error = file_vnode(uap->fd, &vp)) )
3201 return(error);
3202 if ( (error = vnode_getwithref(vp)) ) {
3203 file_drop(uap->fd);
3204 return(error);
3205 }
3206
3207 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
3208
3209 if (vp->v_type != VDIR) {
3210 error = ENOTDIR;
3211 goto out;
3212 }
3213
3214#if CONFIG_MACF
3215 error = mac_vnode_check_chdir(ctx, vp);
3216 if (error)
3217 goto out;
3218#endif
3219 error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
3220 if (error)
3221 goto out;
3222
3223 while (!error && (mp = vp->v_mountedhere) != NULL) {
3224 if (vfs_busy(mp, LK_NOWAIT)) {
3225 error = EACCES;
3226 goto out;
3227 }
3228 error = VFS_ROOT(mp, &tdp, ctx);
3229 vfs_unbusy(mp);
3230 if (error)
3231 break;
3232 vnode_put(vp);
3233 vp = tdp;
3234 }
3235 if (error)
3236 goto out;
3237 if ( (error = vnode_ref(vp)) )
3238 goto out;
3239 vnode_put(vp);
3240
3241 if (per_thread) {
3242 thread_t th = vfs_context_thread(ctx);
3243 if (th) {
3244 uthread_t uth = get_bsdthread_info(th);
3245 tvp = uth->uu_cdir;
3246 uth->uu_cdir = vp;
3247 OSBitOrAtomic(P_THCWD, &p->p_flag);
3248 } else {
3249 vnode_rele(vp);
3250 return (ENOENT);
3251 }
3252 } else {
3253 proc_fdlock(p);
3254 tvp = fdp->fd_cdir;
3255 fdp->fd_cdir = vp;
3256 proc_fdunlock(p);
3257 }
3258
3259 if (tvp)
3260 vnode_rele(tvp);
3261 file_drop(uap->fd);
3262
3263 return (0);
3264out:
3265 vnode_put(vp);
3266 file_drop(uap->fd);
3267
3268 return(error);
3269}
3270
/* fchdir(2): change the per-process working directory to fd */
int
fchdir(proc_t p, struct fchdir_args *uap, __unused int32_t *retval)
{
	return common_fchdir(p, uap, 0);
}
3276
/* Per-thread fchdir variant; fd == -1 reverts to the process CWD */
int
__pthread_fchdir(proc_t p, struct __pthread_fchdir_args *uap, __unused int32_t *retval)
{
	/* cast presumably relies on layout compatibility with fchdir_args — verify */
	return common_fchdir(p, (void *)uap, 1);
}
3282
3283/*
3284 * Change current working directory (".").
3285 *
3286 * Returns: 0 Success
3287 * change_dir:ENOTDIR
3288 * change_dir:???
3289 * vnode_ref:ENOENT No such file or directory
3290 */
3291/* ARGSUSED */
static int
common_chdir(proc_t p, struct chdir_args *uap, int per_thread)
{
	struct filedesc *fdp = p->p_fd;
	int error;
	struct nameidata nd;
	vnode_t tvp;
	vfs_context_t ctx = vfs_context_current();

	NDINIT(&nd, LOOKUP, OP_CHDIR, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	/* change_dir() returns the directory with an iocount on success */
	error = change_dir(&nd, ctx);
	if (error)
		return (error);
	/* Trade the iocount for a long-lived usecount for CWD storage */
	if ( (error = vnode_ref(nd.ni_vp)) ) {
		vnode_put(nd.ni_vp);
		return (error);
	}
	/*
	 * drop the iocount we picked up in change_dir
	 */
	vnode_put(nd.ni_vp);

	if (per_thread) {
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = nd.ni_vp;
			OSBitOrAtomic(P_THCWD, &p->p_flag);
		} else {
			vnode_rele(nd.ni_vp);
			return (ENOENT);
		}
	} else {
		proc_fdlock(p);
		tvp = fdp->fd_cdir;
		fdp->fd_cdir = nd.ni_vp;
		proc_fdunlock(p);
	}

	/* Release the usecount on the previous CWD, if any */
	if (tvp)
		vnode_rele(tvp);

	return (0);
}
3338
3339
3340/*
3341 * chdir
3342 *
3343 * Change current working directory (".") for the entire process
3344 *
3345 * Parameters: p Process requesting the call
3346 * uap User argument descriptor (see below)
3347 * retval (ignored)
3348 *
3349 * Indirect parameters: uap->path Directory path
3350 *
3351 * Returns: 0 Success
3352 * common_chdir: ENOTDIR
3353 * common_chdir: ENOENT No such file or directory
3354 * common_chdir: ???
3355 *
3356 */
3357int
3358chdir(proc_t p, struct chdir_args *uap, __unused int32_t *retval)
3359{
3360 return common_chdir(p, (void *)uap, 0);
3361}
3362
3363/*
3364 * __pthread_chdir
3365 *
3366 * Change current working directory (".") for a single thread
3367 *
3368 * Parameters: p Process requesting the call
3369 * uap User argument descriptor (see below)
3370 * retval (ignored)
3371 *
3372 * Indirect parameters: uap->path Directory path
3373 *
3374 * Returns: 0 Success
3375 * common_chdir: ENOTDIR
3376 * common_chdir: ENOENT No such file or directory
3377 * common_chdir: ???
3378 *
3379 */
int
__pthread_chdir(proc_t p, struct __pthread_chdir_args *uap, __unused int32_t *retval)
{
	/* cast presumably relies on layout compatibility with chdir_args — verify */
	return common_chdir(p, (void *)uap, 1);
}
3385
3386
3387/*
3388 * Change notion of root (``/'') directory.
3389 */
/* ARGSUSED */
/*
 * chroot(2): change the process's notion of the filesystem root.
 * Requires superuser; the new root is held with a usecount and the
 * previous one (if any) is released.
 */
int
chroot(proc_t p, struct chroot_args *uap, __unused int32_t *retval)
{
	struct filedesc *fdp = p->p_fd;
	int error;
	struct nameidata nd;
	vnode_t tvp;
	vfs_context_t ctx = vfs_context_current();

	/* Superuser only */
	if ((error = suser(kauth_cred_get(), &p->p_acflag)))
		return (error);

	NDINIT(&nd, LOOKUP, OP_CHROOT, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	/* change_dir() validates VDIR + search access, returns an iocount */
	error = change_dir(&nd, ctx);
	if (error)
		return (error);

#if CONFIG_MACF
	error = mac_vnode_check_chroot(ctx, nd.ni_vp,
	    &nd.ni_cnd);
	if (error) {
		vnode_put(nd.ni_vp);
		return (error);
	}
#endif

	/* Hold a usecount for as long as this vnode is the root dir */
	if ( (error = vnode_ref(nd.ni_vp)) ) {
		vnode_put(nd.ni_vp);
		return (error);
	}
	vnode_put(nd.ni_vp);

	proc_fdlock(p);
	tvp = fdp->fd_rdir;
	fdp->fd_rdir = nd.ni_vp;
	fdp->fd_flags |= FD_CHROOT;
	proc_fdunlock(p);

	if (tvp != NULL)
		vnode_rele(tvp);

	return (0);
}
3435
3436/*
3437 * Common routine for chroot and chdir.
3438 *
3439 * Returns: 0 Success
3440 * ENOTDIR Not a directory
3441 * namei:??? [anything namei can return]
3442 * vnode_authorize:??? [anything vnode_authorize can return]
3443 */
3444static int
3445change_dir(struct nameidata *ndp, vfs_context_t ctx)
3446{
3447 vnode_t vp;
3448 int error;
3449
3450 if ((error = namei(ndp)))
3451 return (error);
3452 nameidone(ndp);
3453 vp = ndp->ni_vp;
3454
3455 if (vp->v_type != VDIR) {
3456 vnode_put(vp);
3457 return (ENOTDIR);
3458 }
3459
3460#if CONFIG_MACF
3461 error = mac_vnode_check_chdir(ctx, vp);
3462 if (error) {
3463 vnode_put(vp);
3464 return (error);
3465 }
3466#endif
3467
3468 error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
3469 if (error) {
3470 vnode_put(vp);
3471 return (error);
3472 }
3473
3474 return (error);
3475}
3476
/*
 * Allocate (zero-filled) per-file-descriptor vnode data used for
 * directories, with its lock initialized.  Released by
 * fg_vn_data_free().  (The previous header comment incorrectly said
 * "Free"; this function allocates.)
 */
struct fd_vn_data *
fg_vn_data_alloc(void)
{
	struct fd_vn_data *fvdata;

	/* Allocate per fd vnode data */
	MALLOC(fvdata, struct fd_vn_data *, (sizeof(struct fd_vn_data)),
	    M_FD_VN_DATA, M_WAITOK | M_ZERO);
	lck_mtx_init(&fvdata->fv_lock, fd_vn_lck_grp, fd_vn_lck_attr);
	return fvdata;
}
3491
/*
 * Free the vnode data (for directories) associated with the file glob.
 * Releases the directory-read buffer (if one was allocated), destroys
 * the embedded lock, then frees the structure itself.
 */
void
fg_vn_data_free(void *fgvndata)
{
	struct fd_vn_data *fvdata = (struct fd_vn_data *)fgvndata;

	if (fvdata->fv_buf)
		FREE(fvdata->fv_buf, M_FD_DIRBUF);
	lck_mtx_destroy(&fvdata->fv_lock, fd_vn_lck_grp);
	FREE(fvdata, M_FD_VN_DATA);
}
3505
3506/*
3507 * Check permissions, allocate an open file structure,
3508 * and call the device open routine if any.
3509 *
3510 * Returns: 0 Success
3511 * EINVAL
3512 * EINTR
3513 * falloc:ENFILE
3514 * falloc:EMFILE
3515 * falloc:ENOMEM
3516 * vn_open_auth:???
3517 * dupfdopen:???
3518 * VNOP_ADVLOCK:???
3519 * vnode_setsize:???
3520 *
3521 * XXX Need to implement uid, gid
3522 */
int
open1(vfs_context_t ctx, struct nameidata *ndp, int uflags,
    struct vnode_attr *vap, fp_allocfn_t fp_zalloc, void *cra,
    int32_t *retval)
{
	proc_t p = vfs_context_proc(ctx);
	uthread_t uu = get_bsdthread_info(vfs_context_thread(ctx));
	struct fileproc *fp;
	vnode_t vp;
	int flags, oflags;
	int type, indx, error;
	struct flock lf;
	struct vfs_context context;

	oflags = uflags;

	/* All of O_ACCMODE set at once (O_RDONLY|O_WRONLY|O_RDWR) is invalid. */
	if ((oflags & O_ACCMODE) == O_ACCMODE)
		return(EINVAL);

	/*
	 * Convert open(2) flags to in-kernel F* flags.  The raw-crypto bits
	 * may only be requested through the vnode_attr path
	 * (open_dprotected_np), so strip them from caller-supplied flags.
	 */
	flags = FFLAGS(uflags);
	CLR(flags, FENCRYPTED);
	CLR(flags, FUNENCRYPTED);

	AUDIT_ARG(fflags, oflags);
	AUDIT_ARG(mode, vap->va_mode);

	/* Reserve a descriptor slot and fileproc before touching the vnode. */
	if ((error = falloc_withalloc(p,
	    &fp, &indx, ctx, fp_zalloc, cra)) != 0) {
		return (error);
	}
	/*
	 * Encode the reserved fd so an fdopen-style device open can redirect
	 * to an existing descriptor via dupfdopen() below.
	 */
	uu->uu_dupfd = -indx - 1;

	if ((error = vn_open_auth(ndp, &flags, vap))) {
		if ((error == ENODEV || error == ENXIO) && (uu->uu_dupfd >= 0)){	/* XXX from fdopen */
			if ((error = dupfdopen(p->p_fd, indx, uu->uu_dupfd, flags, error)) == 0) {
				fp_drop(p, indx, NULL, 0);
				*retval = indx;
				return (0);
			}
		}
		/* A restartable open surfaces to userspace as EINTR. */
		if (error == ERESTART)
			error = EINTR;
		fp_free(p, indx, fp);
		return (error);
	}
	uu->uu_dupfd = 0;
	vp = ndp->ni_vp;

	/* Wire the open vnode into the fileglob. */
	fp->f_fglob->fg_flag = flags & (FMASK | O_EVTONLY | FENCRYPTED | FUNENCRYPTED);
	fp->f_fglob->fg_ops = &vnops;
	fp->f_fglob->fg_data = (caddr_t)vp;

	/* Apply an advisory flock-style lock for O_EXLOCK/O_SHLOCK. */
	if (flags & (O_EXLOCK | O_SHLOCK)) {
		lf.l_whence = SEEK_SET;
		lf.l_start = 0;
		lf.l_len = 0;
		if (flags & O_EXLOCK)
			lf.l_type = F_WRLCK;
		else
			lf.l_type = F_RDLCK;
		type = F_FLOCK;
		/* Block waiting for the lock unless the open is non-blocking. */
		if ((flags & FNONBLOCK) == 0)
			type |= F_WAIT;
#if CONFIG_MACF
		error = mac_file_check_lock(vfs_context_ucred(ctx), fp->f_fglob,
		    F_SETLK, &lf);
		if (error)
			goto bad;
#endif
		if ((error = VNOP_ADVLOCK(vp, (caddr_t)fp->f_fglob, F_SETLK, &lf, type, ctx, NULL)))
			goto bad;
		fp->f_fglob->fg_flag |= FHASLOCK;
	}

#if DEVELOPMENT || DEBUG
	/*
	 * XXX VSWAP: Check for entitlements or special flag here
	 * so we can restrict access appropriately.
	 */
#else /* DEVELOPMENT || DEBUG */

	if (vnode_isswap(vp) && (flags & (FWRITE | O_TRUNC)) && (ctx != vfs_context_kernel())) {
		/* block attempt to write/truncate swapfile */
		error = EPERM;
		goto bad;
	}
#endif /* DEVELOPMENT || DEBUG */

	/* try to truncate by setting the size attribute */
	if ((flags & O_TRUNC) && ((error = vnode_setsize(vp, (off_t)0, 0, ctx)) != 0))
		goto bad;

	/*
	 * For directories we hold some additional information in the fd.
	 */
	if (vnode_vtype(vp) == VDIR) {
		fp->f_fglob->fg_vn_data = fg_vn_data_alloc();
	} else {
		fp->f_fglob->fg_vn_data = NULL;
	}

	/*
	 * NOTE(review): the iocount is dropped here, yet vp is still consulted
	 * below (vnode_istty) -- presumably safe because the fileglob holds its
	 * own reference from vn_open_auth; confirm before reordering.
	 */
	vnode_put(vp);

	/*
	 * The first terminal open (without a O_NOCTTY) by a session leader
	 * results in it being set as the controlling terminal.
	 */
	if (vnode_istty(vp) && !(p->p_flag & P_CONTROLT) &&
	    !(flags & O_NOCTTY)) {
		int tmp = 0;

		(void)(*fp->f_fglob->fg_ops->fo_ioctl)(fp, (int)TIOCSCTTY,
		    (caddr_t)&tmp, ctx);
	}

	/* Publish the descriptor: set close-on-exec/fork flags, then release. */
	proc_fdlock(p);
	if (flags & O_CLOEXEC)
		*fdflags(p, indx) |= UF_EXCLOSE;
	if (flags & O_CLOFORK)
		*fdflags(p, indx) |= UF_FORKCLOSE;
	procfdtbl_releasefd(p, indx, NULL);

#if CONFIG_SECLUDED_MEMORY
	if (secluded_for_filecache &&
	    FILEGLOB_DTYPE(fp->f_fglob) == DTYPE_VNODE &&
	    vnode_vtype(vp) == VREG) {
		memory_object_control_t moc;

		moc = ubc_getobject(vp, UBC_FLAGS_NONE);

		if (moc == MEMORY_OBJECT_CONTROL_NULL) {
			/* nothing to do... */
		} else if (fp->f_fglob->fg_flag & FWRITE) {
			/* writable -> no longer eligible for secluded pages */
			memory_object_mark_eligible_for_secluded(moc,
								 FALSE);
		} else if (secluded_for_filecache == 1) {
			char pathname[32] = { 0, };
			size_t copied;
			/* XXX FBDP: better way to detect /Applications/ ? */
			if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
				copyinstr(ndp->ni_dirp,
					  pathname,
					  sizeof (pathname),
					  &copied);
			} else {
				copystr(CAST_DOWN(void *, ndp->ni_dirp),
					pathname,
					sizeof (pathname),
					&copied);
			}
			pathname[sizeof (pathname) - 1] = '\0';
			if (strncmp(pathname,
				    "/Applications/",
				    strlen("/Applications/")) == 0 &&
			    strncmp(pathname,
				    "/Applications/Camera.app/",
				    strlen("/Applications/Camera.app/")) != 0) {
				/*
				 * not writable
				 * AND from "/Applications/"
				 * AND not from "/Applications/Camera.app/"
				 * ==> eligible for secluded
				 */
				memory_object_mark_eligible_for_secluded(moc,
									 TRUE);
			}
		} else if (secluded_for_filecache == 2) {
#if __arm64__
#define DYLD_SHARED_CACHE_NAME "dyld_shared_cache_arm64"
#elif __arm__
#define DYLD_SHARED_CACHE_NAME "dyld_shared_cache_armv7"
#else
/* not implemented... */
#endif
			if (!strncmp(vp->v_name,
				     DYLD_SHARED_CACHE_NAME,
				     strlen(DYLD_SHARED_CACHE_NAME)) ||
			    !strncmp(vp->v_name,
				     "dyld",
				     strlen(vp->v_name)) ||
			    !strncmp(vp->v_name,
				     "launchd",
				     strlen(vp->v_name)) ||
			    !strncmp(vp->v_name,
				     "Camera",
				     strlen(vp->v_name)) ||
			    !strncmp(vp->v_name,
				     "mediaserverd",
				     strlen(vp->v_name)) ||
			    !strncmp(vp->v_name,
				     "SpringBoard",
				     strlen(vp->v_name)) ||
			    !strncmp(vp->v_name,
				     "backboardd",
				     strlen(vp->v_name))) {
				/*
				 * This file matters when launching Camera:
				 * do not store its contents in the secluded
				 * pool that will be drained on Camera launch.
				 */
				memory_object_mark_eligible_for_secluded(moc,
									 FALSE);
			}
		}
	}
#endif /* CONFIG_SECLUDED_MEMORY */

	fp_drop(p, indx, fp, 1);
	proc_fdunlock(p);

	*retval = indx;

	return (0);
bad:
	/* Undo everything: drop the lock (if taken), close, free the fd slot. */
	context = *vfs_context_current();
	context.vc_ucred = fp->f_fglob->fg_cred;

	if ((fp->f_fglob->fg_flag & FHASLOCK) &&
	    (FILEGLOB_DTYPE(fp->f_fglob) == DTYPE_VNODE)) {
		lf.l_whence = SEEK_SET;
		lf.l_start = 0;
		lf.l_len = 0;
		lf.l_type = F_UNLCK;

		(void)VNOP_ADVLOCK(
			vp, (caddr_t)fp->f_fglob, F_UNLCK, &lf, F_FLOCK, ctx, NULL);
	}

	vn_close(vp, fp->f_fglob->fg_flag, &context);
	vnode_put(vp);
	fp_free(p, indx, fp);

	return (error);
}
3758
3759/*
3760 * While most of the *at syscall handlers can call nameiat() which
3761 * is a wrapper around namei, the use of namei and initialisation
3762 * of nameidata are far removed and in different functions - namei
3763 * gets called in vn_open_auth for open1. So we'll just do here what
3764 * nameiat() does.
3765 */
static int
open1at(vfs_context_t ctx, struct nameidata *ndp, int uflags,
    struct vnode_attr *vap, fp_allocfn_t fp_zalloc, void *cra, int32_t *retval,
    int dirfd)
{
	/*
	 * Only a relative path needs the dirfd; an absolute path, or a
	 * caller that already supplied a starting dvp (USEDVP), goes
	 * straight to open1().
	 */
	if ((dirfd != AT_FDCWD) && !(ndp->ni_cnd.cn_flags & USEDVP)) {
		int error;
		char c;

		/* Peek at the first byte of the path to classify it. */
		if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
			error = copyin(ndp->ni_dirp, &c, sizeof(char));
			if (error)
				return (error);
		} else {
			c = *((char *)(ndp->ni_dirp));
		}

		if (c != '/') {
			vnode_t dvp_at;

			/* Resolve dirfd to a vnode; it must be a directory. */
			error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
			    &dvp_at);
			if (error)
				return (error);

			if (vnode_vtype(dvp_at) != VDIR) {
				vnode_put(dvp_at);
				return (ENOTDIR);
			}

			/* Hand the starting directory to open1() via USEDVP. */
			ndp->ni_dvp = dvp_at;
			ndp->ni_cnd.cn_flags |= USEDVP;
			error = open1(ctx, ndp, uflags, vap, fp_zalloc, cra,
			    retval);
			vnode_put(dvp_at);
			return (error);
		}
	}

	return (open1(ctx, ndp, uflags, vap, fp_zalloc, cra, retval));
}
3807
/*
 * open_extended: open a file given a path name; with extended argument list (including extended security (ACL)).
 *
 * Parameters:	p	Process requesting the open
 *		uap	User argument descriptor (see below)
 *		retval	Pointer to an area to receive the
 *			return value from the system call
 *
 * Indirect:	uap->path	Path to open (same as 'open')
 *		uap->flags	Flags to open (same as 'open')
 *		uap->uid	UID to set, if creating
 *		uap->gid	GID to set, if creating
 *		uap->mode	File mode, if creating (same as 'open')
 *		uap->xsecurity	ACL to set, if creating
 *
 * Returns:	0	Success
 *		!0	errno value
 *
 * Notes:	The kauth_filesec_t in 'va', if any, is in host byte order.
 *
 * XXX:	We should enumerate the possible errno values here, and where
 *	in the code they originated.
 */
int
open_extended(proc_t p, struct open_extended_args *uap, int32_t *retval)
{
	struct filedesc *fdp = p->p_fd;
	int ciferror;
	kauth_filesec_t xsecdst;
	struct vnode_attr va;
	struct nameidata nd;
	int cmode;

	AUDIT_ARG(owner, uap->uid, uap->gid);

	/* Copy in the caller-supplied ACL, if any. */
	xsecdst = NULL;
	if ((uap->xsecurity != USER_ADDR_NULL) &&
	    ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0))
		return ciferror;

	VATTR_INIT(&va);
	/* Apply the process umask and drop the sticky bit from the mode. */
	cmode = ((uap->mode &~ fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
	VATTR_SET(&va, va_mode, cmode);
	/* Owner/group/ACL are only applied when explicitly requested. */
	if (uap->uid != KAUTH_UID_NONE)
		VATTR_SET(&va, va_uid, uap->uid);
	if (uap->gid != KAUTH_GID_NONE)
		VATTR_SET(&va, va_gid, uap->gid);
	if (xsecdst != NULL)
		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);

	NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
	    uap->path, vfs_context_current());

	ciferror = open1(vfs_context_current(), &nd, uap->flags, &va,
	    fileproc_alloc_init, NULL, retval);
	if (xsecdst != NULL)
		kauth_filesec_free(xsecdst);

	return ciferror;
}
3868
3869/*
3870 * Go through the data-protected atomically controlled open (2)
3871 *
3872 * int open_dprotected_np(user_addr_t path, int flags, int class, int dpflags, int mode)
3873 */
3874int open_dprotected_np (__unused proc_t p, struct open_dprotected_np_args *uap, int32_t *retval) {
3875 int flags = uap->flags;
3876 int class = uap->class;
3877 int dpflags = uap->dpflags;
3878
3879 /*
3880 * Follow the same path as normal open(2)
3881 * Look up the item if it exists, and acquire the vnode.
3882 */
3883 struct filedesc *fdp = p->p_fd;
3884 struct vnode_attr va;
3885 struct nameidata nd;
3886 int cmode;
3887 int error;
3888
3889 VATTR_INIT(&va);
3890 /* Mask off all but regular access permissions */
3891 cmode = ((uap->mode &~ fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
3892 VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
3893
3894 NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
3895 uap->path, vfs_context_current());
3896
3897 /*
3898 * Initialize the extra fields in vnode_attr to pass down our
3899 * extra fields.
3900 * 1. target cprotect class.
3901 * 2. set a flag to mark it as requiring open-raw-encrypted semantics.
3902 */
3903 if (flags & O_CREAT) {
3904 /* lower level kernel code validates that the class is valid before applying it. */
3905 if (class != PROTECTION_CLASS_DEFAULT) {
3906 /*
3907 * PROTECTION_CLASS_DEFAULT implies that we make the class for this
3908 * file behave the same as open (2)
3909 */
3910 VATTR_SET(&va, va_dataprotect_class, class);
3911 }
3912 }
3913
3914 if (dpflags & (O_DP_GETRAWENCRYPTED|O_DP_GETRAWUNENCRYPTED)) {
3915 if ( flags & (O_RDWR | O_WRONLY)) {
3916 /* Not allowed to write raw encrypted bytes */
3917 return EINVAL;
3918 }
3919 if (uap->dpflags & O_DP_GETRAWENCRYPTED) {
3920 VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWENCRYPTED);
3921 }
3922 if (uap->dpflags & O_DP_GETRAWUNENCRYPTED) {
3923 VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWUNENCRYPTED);
3924 }
3925 }
3926
3927 error = open1(vfs_context_current(), &nd, uap->flags, &va,
3928 fileproc_alloc_init, NULL, retval);
3929
3930 return error;
3931}
3932
3933static int
3934openat_internal(vfs_context_t ctx, user_addr_t path, int flags, int mode,
3935 int fd, enum uio_seg segflg, int *retval)
3936{
3937 struct filedesc *fdp = (vfs_context_proc(ctx))->p_fd;
3938 struct vnode_attr va;
3939 struct nameidata nd;
3940 int cmode;
3941
3942 VATTR_INIT(&va);
3943 /* Mask off all but regular access permissions */
3944 cmode = ((mode &~ fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
3945 VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
3946
3947 NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1,
3948 segflg, path, ctx);
3949
3950 return (open1at(ctx, &nd, flags, &va, fileproc_alloc_init, NULL,
3951 retval, fd));
3952}
3953
3954int
3955open(proc_t p, struct open_args *uap, int32_t *retval)
3956{
3957 __pthread_testcancel(1);
3958 return(open_nocancel(p, (struct open_nocancel_args *)uap, retval));
3959}
3960
3961int
3962open_nocancel(__unused proc_t p, struct open_nocancel_args *uap,
3963 int32_t *retval)
3964{
3965 return (openat_internal(vfs_context_current(), uap->path, uap->flags,
3966 uap->mode, AT_FDCWD, UIO_USERSPACE, retval));
3967}
3968
3969int
3970openat_nocancel(__unused proc_t p, struct openat_nocancel_args *uap,
3971 int32_t *retval)
3972{
3973 return (openat_internal(vfs_context_current(), uap->path, uap->flags,
3974 uap->mode, uap->fd, UIO_USERSPACE, retval));
3975}
3976
3977int
3978openat(proc_t p, struct openat_args *uap, int32_t *retval)
3979{
3980 __pthread_testcancel(1);
3981 return(openat_nocancel(p, (struct openat_nocancel_args *)uap, retval));
3982}
3983
/*
 * openbyid_np: open a file given a file system id and a file system object id.
 *	The hfs file system object id is an fsobj_id_t {uint32, uint32};
 *	for file systems that don't support object ids it is a node id (uint64_t).
 *
 * Parameters:	p	Process requesting the open
 *		uap	User argument descriptor (see below)
 *		retval	Pointer to an area to receive the
 *			return value from the system call
 *
 * Indirect:	uap->path	Path to open (same as 'open')
 *
 *		uap->fsid	id of target file system
 *		uap->objid	id of target file system object
 *		uap->flags	Flags to open (same as 'open')
 *
 * Returns:	0	Success
 *		!0	errno value
 *
 *
 * XXX:	We should enumerate the possible errno values here, and where
 *	in the code they originated.
 */
int
openbyid_np(__unused proc_t p, struct openbyid_np_args *uap, int *retval)
{
	fsid_t fsid;
	uint64_t objid;
	int error;
	char *buf = NULL;
	int buflen = MAXPATHLEN;
	int pathlen = 0;
	vfs_context_t ctx = vfs_context_current();

	/* Opening by raw fs/object id is a privileged operation. */
	if ((error = priv_check_cred(vfs_context_ucred(ctx), PRIV_VFS_OPEN_BY_ID, 0))) {
		return (error);
	}

	if ((error = copyin(uap->fsid, (caddr_t)&fsid, sizeof(fsid)))) {
		return (error);
	}

	/*uap->obj is an fsobj_id_t defined as struct {uint32_t, uint32_t} */
	if ((error = copyin(uap->objid, (caddr_t)&objid, sizeof(uint64_t)))) {
		return (error);
	}

	AUDIT_ARG(value32, fsid.val[0]);
	AUDIT_ARG(value64, objid);

	/* resolve path from fsid, objid */
	/*
	 * NOTE(review): buflen grows by MAXPATHLEN on every ENOSPC with no
	 * explicit upper bound; this relies on fsgetpath_internal eventually
	 * succeeding or failing with a different error -- confirm.
	 */
	do {
		MALLOC(buf, char *, buflen + 1, M_TEMP, M_WAITOK);
		if (buf == NULL) {
			return (ENOMEM);
		}

		error = fsgetpath_internal(
			ctx, fsid.val[0], objid,
			buflen, buf, &pathlen);

		if (error) {
			FREE(buf, M_TEMP);
			buf = NULL;
		}
	} while (error == ENOSPC && (buflen += MAXPATHLEN));

	if (error) {
		return error;
	}

	buf[pathlen] = 0;

	/* Open the resolved path from kernel space (no copyin of the path). */
	error = openat_internal(
		ctx, (user_addr_t)buf, uap->oflags, 0, AT_FDCWD, UIO_SYSSPACE, retval);

	FREE(buf, M_TEMP);

	return error;
}
4064
4065
4066/*
4067 * Create a special file.
4068 */
4069static int mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap);
4070
int
mknod(proc_t p, struct mknod_args *uap, __unused int32_t *retval)
{
	struct vnode_attr va;
	vfs_context_t ctx = vfs_context_current();
	int error;
	struct nameidata nd;
	vnode_t vp, dvp;

	VATTR_INIT(&va);
	/* Requested mode, filtered through the process umask. */
	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd->fd_cmask);
	VATTR_SET(&va, va_rdev, uap->dev);

	/* If it's a mknod() of a FIFO, call mkfifo1() instead */
	if ((uap->mode & S_IFMT) == S_IFIFO)
		return(mkfifo1(ctx, uap->path, &va));

	AUDIT_ARG(mode, uap->mode);
	AUDIT_ARG(value32, uap->dev);

	/* Creating device special files requires superuser privilege. */
	if ((error = suser(vfs_context_ucred(ctx), &p->p_acflag)))
		return (error);
	NDINIT(&nd, CREATE, OP_MKNOD, LOCKPARENT | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error)
		return (error);
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* The target must not already exist. */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}

	/* Only character and block special files are valid here. */
	switch (uap->mode & S_IFMT) {
	case S_IFCHR:
		VATTR_SET(&va, va_type, VCHR);
		break;
	case S_IFBLK:
		VATTR_SET(&va, va_type, VBLK);
		break;
	default:
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_create(ctx,
	    nd.ni_dvp, &nd.ni_cnd, &va);
	if (error)
		goto out;
#endif

	if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0)
		goto out;

	if ((error = vn_create(dvp, &vp, &nd, &va, 0, 0, NULL, ctx)) != 0)
		goto out;

	if (vp) {
		int update_flags = 0;

		// Make sure the name & parent pointers are hooked up
		if (vp->v_name == NULL)
			update_flags |= VNODE_UPDATE_NAME;
		if (vp->v_parent == NULLVP)
			update_flags |= VNODE_UPDATE_PARENT;

		if (update_flags)
			vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);

#if CONFIG_FSE
		add_fsevent(FSE_CREATE_FILE, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
#endif
	}

out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp)
		vnode_put(vp);
	vnode_put(dvp);

	return (error);
}
4163
4164/*
4165 * Create a named pipe.
4166 *
4167 * Returns: 0 Success
4168 * EEXIST
4169 * namei:???
4170 * vnode_authorize:???
4171 * vn_create:???
4172 */
static int
mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap)
{
	vnode_t vp, dvp;
	int error;
	struct nameidata nd;

	NDINIT(&nd, CREATE, OP_MKFIFO, LOCKPARENT | AUDITVNPATH1,
	    UIO_USERSPACE, upath, ctx);
	error = namei(&nd);
	if (error)
		return (error);
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* check that this is a new file and authorize addition */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}
	/* Force the created node's type to FIFO regardless of caller's vap. */
	VATTR_SET(vap, va_type, VFIFO);

	if ((error = vn_authorize_create(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0)
		goto out;

	error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx);
out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp)
		vnode_put(vp);
	vnode_put(dvp);

	return error;
}
4212
4213
/*
 * mkfifo_extended: Create a named pipe; with extended argument list (including extended security (ACL)).
 *
 * Parameters:	p	Process requesting the create
 *		uap	User argument descriptor (see below)
 *		retval	(Ignored)
 *
 * Indirect:	uap->path	Path to fifo (same as 'mkfifo')
 *		uap->uid	UID to set
 *		uap->gid	GID to set
 *		uap->mode	File mode to set (same as 'mkfifo')
 *		uap->xsecurity	ACL to set, if creating
 *
 * Returns:	0	Success
 *		!0	errno value
 *
 * Notes:	The kauth_filesec_t in 'va', if any, is in host byte order.
 *
 * XXX:	We should enumerate the possible errno values here, and where
 *	in the code they originated.
 */
int
mkfifo_extended(proc_t p, struct mkfifo_extended_args *uap, __unused int32_t *retval)
{
	int ciferror;
	kauth_filesec_t xsecdst;
	struct vnode_attr va;

	AUDIT_ARG(owner, uap->uid, uap->gid);

	/* Copy in the caller-supplied ACL, if any. */
	xsecdst = KAUTH_FILESEC_NONE;
	if (uap->xsecurity != USER_ADDR_NULL) {
		if ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)
			return ciferror;
	}

	VATTR_INIT(&va);
	/* Requested mode, filtered through the process umask. */
	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd->fd_cmask);
	/* Owner/group/ACL are only applied when explicitly requested. */
	if (uap->uid != KAUTH_UID_NONE)
		VATTR_SET(&va, va_uid, uap->uid);
	if (uap->gid != KAUTH_GID_NONE)
		VATTR_SET(&va, va_gid, uap->gid);
	if (xsecdst != KAUTH_FILESEC_NONE)
		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);

	ciferror = mkfifo1(vfs_context_current(), uap->path, &va);

	if (xsecdst != KAUTH_FILESEC_NONE)
		kauth_filesec_free(xsecdst);
	return ciferror;
}
4265
4266/* ARGSUSED */
4267int
4268mkfifo(proc_t p, struct mkfifo_args *uap, __unused int32_t *retval)
4269{
4270 struct vnode_attr va;
4271
4272 VATTR_INIT(&va);
4273 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd->fd_cmask);
4274
4275 return(mkfifo1(vfs_context_current(), uap->path, &va));
4276}
4277
4278
/*
 * Return a pointer to the last occurrence of 'ch' in the string 'p',
 * or NULL if 'ch' does not appear.  As with strrchr(3), the terminating
 * NUL is considered part of the string, so searching for '\0' yields a
 * pointer to the terminator.
 */
static char *
my_strrchr(char *p, int ch)
{
	char *last = NULL;

	do {
		if (*p == ch)
			last = p;
	} while (*p++ != '\0');

	return last;
}
4292
4293extern int safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
4294
int
safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
{
	int ret, len = _len;

	*truncated_path = 0;
	/* len comes back as the path length including the terminating NUL. */
	ret = vn_getpath(dvp, path, &len);
	if (ret == 0 && len < (MAXPATHLEN - 1)) {
		if (leafname) {
			/* Overwrite the NUL with a separator and append the leaf. */
			path[len-1] = '/';
			/* strlcpy returns strlen(leafname); +1 re-counts the NUL. */
			len += strlcpy(&path[len], leafname, MAXPATHLEN-len) + 1;
			if (len > MAXPATHLEN) {
				char *ptr;

				// the string got truncated!
				*truncated_path = 1;
				ptr = my_strrchr(path, '/');
				if (ptr) {
					*ptr = '\0';   // chop off the string at the last directory component
				}
				len = strlen(path) + 1;
			}
		}
	} else if (ret == 0) {
		/* Path fit but is at the limit; treat as truncated. */
		*truncated_path = 1;
	} else if (ret != 0) {
		struct vnode *mydvp=dvp;

		if (ret != ENOSPC) {
			printf("safe_getpath: failed to get the path for vp %p (%s) : err %d\n",
			       dvp, dvp->v_name ? dvp->v_name : "no-name", ret);
		}
		*truncated_path = 1;

		/*
		 * Fall back to walking up the parent chain until some
		 * ancestor (or the mount point) yields a path that fits.
		 */
		do {
			if (mydvp->v_parent != NULL) {
				mydvp = mydvp->v_parent;
			} else if (mydvp->v_mount) {
				strlcpy(path, mydvp->v_mount->mnt_vfsstat.f_mntonname, _len);
				break;
			} else {
				// no parent and no mount point?  only thing is to punt and say "/" changed
				strlcpy(path, "/", _len);
				len = 2;
				mydvp = NULL;
			}

			if (mydvp == NULL) {
				break;
			}

			len = _len;
			ret = vn_getpath(mydvp, path, &len);
		} while (ret == ENOSPC);
	}

	/* Returns the length of the string placed in 'path', including NUL. */
	return len;
}
4353
4354
4355/*
4356 * Make a hard file link.
4357 *
4358 * Returns: 0 Success
4359 * EPERM
4360 * EEXIST
4361 * EXDEV
4362 * namei:???
4363 * vnode_authorize:???
4364 * VNOP_LINK:???
4365 */
4366/* ARGSUSED */
static int
linkat_internal(vfs_context_t ctx, int fd1, user_addr_t path, int fd2,
    user_addr_t link, int flag, enum uio_seg segflg)
{
	vnode_t vp, dvp, lvp;
	struct nameidata nd;
	int follow;
	int error;
#if CONFIG_FSE
	fse_info finfo;
#endif
	int need_event, has_listeners, need_kpath2;
	char *target_path = NULL;
	int truncated=0;

	vp = dvp = lvp = NULLVP;

	/* look up the object we are linking to */
	follow = (flag & AT_SYMLINK_FOLLOW) ? FOLLOW : NOFOLLOW;
	NDINIT(&nd, LOOKUP, OP_LOOKUP, AUDITVNPATH1 | follow,
	    segflg, path, ctx);

	error = nameiat(&nd, fd1);
	if (error)
		return (error);
	vp = nd.ni_vp;

	nameidone(&nd);

	/*
	 * Normally, linking to directories is not supported.
	 * However, some file systems may have limited support.
	 */
	if (vp->v_type == VDIR) {
		if (!ISSET(vp->v_mount->mnt_kern_flag, MNTK_DIR_HARDLINKS)) {
			error = EPERM;   /* POSIX */
			goto out;
		}

		/* Linking to a directory requires ownership. */
		if (!kauth_cred_issuser(vfs_context_ucred(ctx))) {
			struct vnode_attr dva;

			VATTR_INIT(&dva);
			VATTR_WANTED(&dva, va_uid);
			if (vnode_getattr(vp, &dva, ctx) != 0 ||
			    !VATTR_IS_SUPPORTED(&dva, va_uid) ||
			    (dva.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)))) {
				error = EACCES;
				goto out;
			}
		}
	}

	/* lookup the target node */
	/* The nameidata is reused here for the second (CREATE) lookup. */
#if CONFIG_TRIGGERS
	nd.ni_op = OP_LINK;
#endif
	nd.ni_cnd.cn_nameiop = CREATE;
	nd.ni_cnd.cn_flags = LOCKPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK;
	nd.ni_dirp = link;
	error = nameiat(&nd, fd2);
	if (error != 0)
		goto out;
	dvp = nd.ni_dvp;
	lvp = nd.ni_vp;

#if CONFIG_MACF
	if ((error = mac_vnode_check_link(ctx, dvp, vp, &nd.ni_cnd)) != 0)
		goto out2;
#endif

	/* or to anything that kauth doesn't want us to (eg. immutable items) */
	if ((error = vnode_authorize(vp, NULL, KAUTH_VNODE_LINKTARGET, ctx)) != 0)
		goto out2;

	/* target node must not exist */
	if (lvp != NULLVP) {
		error = EEXIST;
		goto out2;
	}
	/* cannot link across mountpoints */
	if (vnode_mount(vp) != vnode_mount(dvp)) {
		error = EXDEV;
		goto out2;
	}

	/* authorize creation of the target node */
	if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0)
		goto out2;

	/* and finally make the link */
	error = VNOP_LINK(vp, dvp, &nd.ni_cnd, ctx);
	if (error)
		goto out2;

#if CONFIG_MACF
	(void)mac_vnode_notify_link(ctx, vp, dvp, &nd.ni_cnd);
#endif

	/* Post-link notifications: fsevents, kauth listeners, audit path. */
#if CONFIG_FSE
	need_event = need_fsevent(FSE_CREATE_FILE, dvp);
#else
	need_event = 0;
#endif
	has_listeners = kauth_authorize_fileop_has_listeners();

	need_kpath2 = 0;
#if CONFIG_AUDIT
	if (AUDIT_RECORD_EXISTS()) {
		need_kpath2 = 1;
	}
#endif

	if (need_event || has_listeners || need_kpath2) {
		char *link_to_path = NULL;
		int len, link_name_len;

		/* build the path to the new link file */
		GET_PATH(target_path);
		if (target_path == NULL) {
			error = ENOMEM;
			goto out2;
		}

		len = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, target_path, MAXPATHLEN, &truncated);

		AUDIT_ARG(kpath, target_path, ARG_KPATH2);

		if (has_listeners) {
			/* build the path to file we are linking to */
			GET_PATH(link_to_path);
			if (link_to_path == NULL) {
				error = ENOMEM;
				goto out2;
			}

			link_name_len = MAXPATHLEN;
			if (vn_getpath(vp, link_to_path, &link_name_len) == 0) {
				/*
				 * Call out to allow 3rd party notification of rename.
				 * Ignore result of kauth_authorize_fileop call.
				 */
				kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_LINK,
						       (uintptr_t)link_to_path,
						       (uintptr_t)target_path);
			}
			if (link_to_path != NULL) {
				RELEASE_PATH(link_to_path);
			}
		}
#if CONFIG_FSE
		if (need_event) {
			/* construct fsevent */
			if (get_fse_info(vp, &finfo, ctx) == 0) {
				if (truncated) {
					finfo.mode |= FSE_TRUNCATED_PATH;
				}

				// build the path to the destination of the link
				add_fsevent(FSE_CREATE_FILE, ctx,
					    FSE_ARG_STRING, len, target_path,
					    FSE_ARG_FINFO, &finfo,
					    FSE_ARG_DONE);
			}
			if (vp->v_parent) {
				add_fsevent(FSE_STAT_CHANGED, ctx,
					    FSE_ARG_VNODE, vp->v_parent,
					    FSE_ARG_DONE);
			}
		}
#endif
	}
out2:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);
	if (target_path != NULL) {
		RELEASE_PATH(target_path);
	}
out:
	if (lvp)
		vnode_put(lvp);
	if (dvp)
		vnode_put(dvp);
	vnode_put(vp);
	return (error);
}
4557
4558int
4559link(__unused proc_t p, struct link_args *uap, __unused int32_t *retval)
4560{
4561 return (linkat_internal(vfs_context_current(), AT_FDCWD, uap->path,
4562 AT_FDCWD, uap->link, AT_SYMLINK_FOLLOW, UIO_USERSPACE));
4563}
4564
4565int
4566linkat(__unused proc_t p, struct linkat_args *uap, __unused int32_t *retval)
4567{
4568 if (uap->flag & ~AT_SYMLINK_FOLLOW)
4569 return (EINVAL);
4570
4571 return (linkat_internal(vfs_context_current(), uap->fd1, uap->path,
4572 uap->fd2, uap->link, uap->flag, UIO_USERSPACE));
4573}
4574
4575/*
4576 * Make a symbolic link.
4577 *
4578 * We could add support for ACLs here too...
4579 */
4580/* ARGSUSED */
static int
symlinkat_internal(vfs_context_t ctx, user_addr_t path_data, int fd,
    user_addr_t link, enum uio_seg segflg)
{
	struct vnode_attr va;
	char *path;
	int error;
	struct nameidata nd;
	vnode_t vp, dvp;
	size_t dummy=0;
	proc_t p;

	error = 0;
	/* Bring the link-target string into the kernel if it is a user pointer. */
	if (UIO_SEG_IS_USER_SPACE(segflg)) {
		MALLOC_ZONE(path, char *, MAXPATHLEN, M_NAMEI, M_WAITOK);
		error = copyinstr(path_data, path, MAXPATHLEN, &dummy);
	} else {
		path = (char *)path_data;
	}
	if (error)
		goto out;
	AUDIT_ARG(text, path);	/* This is the link string */

	NDINIT(&nd, CREATE, OP_SYMLINK, LOCKPARENT | AUDITVNPATH1,
	    segflg, link, ctx);

	error = nameiat(&nd, fd);
	if (error)
		goto out;
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	p = vfs_context_proc(ctx);
	VATTR_INIT(&va);
	VATTR_SET(&va, va_type, VLNK);
	/* Symlink permissions start from ACCESSPERMS filtered by the umask. */
	VATTR_SET(&va, va_mode, ACCESSPERMS & ~p->p_fd->fd_cmask);

#if CONFIG_MACF
	error = mac_vnode_check_create(ctx,
	    dvp, &nd.ni_cnd, &va);
#endif
	if (error != 0) {
		goto skipit;
	}

	/* The link name must not already exist. */
	if (vp != NULL) {
		error = EEXIST;
		goto skipit;
	}

	/* authorize */
	if (error == 0)
		error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx);
	/* get default ownership, etc. */
	if (error == 0)
		error = vnode_authattr_new(dvp, &va, 0, ctx);
	if (error == 0)
		error = VNOP_SYMLINK(dvp, &vp, &nd.ni_cnd, &va, path, ctx);

#if CONFIG_MACF
	if (error == 0 && vp)
		error = vnode_label(vnode_mount(vp), dvp, vp, &nd.ni_cnd, VNODE_LABEL_CREATE, ctx);
#endif

	/* do fallback attribute handling */
	if (error == 0 && vp)
		error = vnode_setattr_fallback(vp, &va, ctx);

	if (error == 0) {
		int update_flags = 0;

		/*check if a new vnode was created, else try to get one*/
		/* (some filesystems do not return the new vnode from VNOP_SYMLINK) */
		if (vp == NULL) {
			nd.ni_cnd.cn_nameiop = LOOKUP;
#if CONFIG_TRIGGERS
			nd.ni_op = OP_LOOKUP;
#endif
			nd.ni_cnd.cn_flags = 0;
			error = nameiat(&nd, fd);
			vp = nd.ni_vp;

			if (vp == NULL)
				goto skipit;
		}

#if 0  /* XXX - kauth_todo - is KAUTH_FILEOP_SYMLINK needed? */
		/* call out to allow 3rd party notification of rename.
		 * Ignore result of kauth_authorize_fileop call.
		 */
		if (kauth_authorize_fileop_has_listeners() &&
		    namei(&nd) == 0) {
			char *new_link_path = NULL;
			int len;

			/* build the path to the new link file */
			new_link_path = get_pathbuff();
			len = MAXPATHLEN;
			vn_getpath(dvp, new_link_path, &len);
			if ((len + 1 + nd.ni_cnd.cn_namelen + 1) < MAXPATHLEN) {
				new_link_path[len - 1] = '/';
				strlcpy(&new_link_path[len], nd.ni_cnd.cn_nameptr, MAXPATHLEN-len);
			}

			kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_SYMLINK,
					       (uintptr_t)path, (uintptr_t)new_link_path);
			if (new_link_path != NULL)
				release_pathbuff(new_link_path);
		}
#endif
		// Make sure the name & parent pointers are hooked up
		if (vp->v_name == NULL)
			update_flags |= VNODE_UPDATE_NAME;
		if (vp->v_parent == NULLVP)
			update_flags |= VNODE_UPDATE_PARENT;

		if (update_flags)
			vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);

#if CONFIG_FSE
		add_fsevent(FSE_CREATE_FILE, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
#endif
	}

skipit:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp)
		vnode_put(vp);
	vnode_put(dvp);
out:
	/* Only free the buffer if we allocated it (user-space source). */
	if (path && (path != (char *)path_data))
		FREE_ZONE(path, MAXPATHLEN, M_NAMEI);

	return (error);
}
4722
4723int
4724symlink(__unused proc_t p, struct symlink_args *uap, __unused int32_t *retval)
4725{
4726 return (symlinkat_internal(vfs_context_current(), uap->path, AT_FDCWD,
4727 uap->link, UIO_USERSPACE));
4728}
4729
4730int
4731symlinkat(__unused proc_t p, struct symlinkat_args *uap,
4732 __unused int32_t *retval)
4733{
4734 return (symlinkat_internal(vfs_context_current(), uap->path1, uap->fd,
4735 uap->path2, UIO_USERSPACE));
4736}
4737
4738/*
4739 * Delete a whiteout from the filesystem.
4740 * No longer supported.
4741 */
4742int
4743undelete(__unused proc_t p, __unused struct undelete_args *uap, __unused int32_t *retval)
4744{
4745 return (ENOTSUP);
4746}
4747
4748/*
4749 * Delete a name from the filesystem.
4750 */
4751/* ARGSUSED */
4752static int
4753unlinkat_internal(vfs_context_t ctx, int fd, vnode_t start_dvp,
4754 user_addr_t path_arg, enum uio_seg segflg, int unlink_flags)
4755{
4756 struct nameidata nd;
4757 vnode_t vp, dvp;
4758 int error;
4759 struct componentname *cnp;
4760 char *path = NULL;
4761 int len=0;
4762#if CONFIG_FSE
4763 fse_info finfo;
4764 struct vnode_attr va;
4765#endif
4766 int flags;
4767 int need_event;
4768 int has_listeners;
4769 int truncated_path;
4770 int batched;
4771 struct vnode_attr *vap;
4772 int do_retry;
4773 int retry_count = 0;
4774 int cn_flags;
4775
4776 cn_flags = LOCKPARENT;
4777 if (!(unlink_flags & VNODE_REMOVE_NO_AUDIT_PATH))
4778 cn_flags |= AUDITVNPATH1;
4779 /* If a starting dvp is passed, it trumps any fd passed. */
4780 if (start_dvp)
4781 cn_flags |= USEDVP;
4782
4783#if NAMEDRSRCFORK
4784 /* unlink or delete is allowed on rsrc forks and named streams */
4785 cn_flags |= CN_ALLOWRSRCFORK;
4786#endif
4787
4788retry:
4789 do_retry = 0;
4790 flags = 0;
4791 need_event = 0;
4792 has_listeners = 0;
4793 truncated_path = 0;
4794 vap = NULL;
4795
4796 NDINIT(&nd, DELETE, OP_UNLINK, cn_flags, segflg, path_arg, ctx);
4797
4798 nd.ni_dvp = start_dvp;
4799 nd.ni_flag |= NAMEI_COMPOUNDREMOVE;
4800 cnp = &nd.ni_cnd;
4801
4802continue_lookup:
4803 error = nameiat(&nd, fd);
4804 if (error)
4805 return (error);
4806
4807 dvp = nd.ni_dvp;
4808 vp = nd.ni_vp;
4809
4810
4811 /* With Carbon delete semantics, busy files cannot be deleted */
4812 if (unlink_flags & VNODE_REMOVE_NODELETEBUSY) {
4813 flags |= VNODE_REMOVE_NODELETEBUSY;
4814 }
4815
4816 /* Skip any potential upcalls if told to. */
4817 if (unlink_flags & VNODE_REMOVE_SKIP_NAMESPACE_EVENT) {
4818 flags |= VNODE_REMOVE_SKIP_NAMESPACE_EVENT;
4819 }
4820
4821 if (vp) {
4822 batched = vnode_compound_remove_available(vp);
4823 /*
4824 * The root of a mounted filesystem cannot be deleted.
4825 */
4826 if (vp->v_flag & VROOT) {
4827 error = EBUSY;
4828 }
4829
4830#if DEVELOPMENT || DEBUG
4831 /*
4832 * XXX VSWAP: Check for entitlements or special flag here
4833 * so we can restrict access appropriately.
4834 */
4835#else /* DEVELOPMENT || DEBUG */
4836
4837 if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
4838 error = EPERM;
4839 goto out;
4840 }
4841#endif /* DEVELOPMENT || DEBUG */
4842
4843 if (!batched) {
4844 error = vn_authorize_unlink(dvp, vp, cnp, ctx, NULL);
4845 if (error) {
4846 if (error == ENOENT) {
4847 assert(retry_count < MAX_AUTHORIZE_ENOENT_RETRIES);
4848 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
4849 do_retry = 1;
4850 retry_count++;
4851 }
4852 }
4853 goto out;
4854 }
4855 }
4856 } else {
4857 batched = 1;
4858
4859 if (!vnode_compound_remove_available(dvp)) {
4860 panic("No vp, but no compound remove?");
4861 }
4862 }
4863
4864#if CONFIG_FSE
4865 need_event = need_fsevent(FSE_DELETE, dvp);
4866 if (need_event) {
4867 if (!batched) {
4868 if ((vp->v_flag & VISHARDLINK) == 0) {
4869 /* XXX need to get these data in batched VNOP */
4870 get_fse_info(vp, &finfo, ctx);
4871 }
4872 } else {
4873 error = vfs_get_notify_attributes(&va);
4874 if (error) {
4875 goto out;
4876 }
4877
4878 vap = &va;
4879 }
4880 }
4881#endif
4882 has_listeners = kauth_authorize_fileop_has_listeners();
4883 if (need_event || has_listeners) {
4884 if (path == NULL) {
4885 GET_PATH(path);
4886 if (path == NULL) {
4887 error = ENOMEM;
4888 goto out;
4889 }
4890 }
4891 len = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);
4892 }
4893
4894#if NAMEDRSRCFORK
4895 if (nd.ni_cnd.cn_flags & CN_WANTSRSRCFORK)
4896 error = vnode_removenamedstream(dvp, vp, XATTR_RESOURCEFORK_NAME, 0, ctx);
4897 else
4898#endif
4899 {
4900 error = vn_remove(dvp, &nd.ni_vp, &nd, flags, vap, ctx);
4901 vp = nd.ni_vp;
4902 if (error == EKEEPLOOKING) {
4903 if (!batched) {
4904 panic("EKEEPLOOKING, but not a filesystem that supports compound VNOPs?");
4905 }
4906
4907 if ((nd.ni_flag & NAMEI_CONTLOOKUP) == 0) {
4908 panic("EKEEPLOOKING, but continue flag not set?");
4909 }
4910
4911 if (vnode_isdir(vp)) {
4912 error = EISDIR;
4913 goto out;
4914 }
4915 goto continue_lookup;
4916 } else if (error == ENOENT && batched) {
4917 assert(retry_count < MAX_AUTHORIZE_ENOENT_RETRIES);
4918 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
4919 /*
4920 * For compound VNOPs, the authorization callback may
4921 * return ENOENT in case of racing hardlink lookups
4922 * hitting the name cache, redrive the lookup.
4923 */
4924 do_retry = 1;
4925 retry_count += 1;
4926 goto out;
4927 }
4928 }
4929 }
4930
4931 /*
4932 * Call out to allow 3rd party notification of delete.
4933 * Ignore result of kauth_authorize_fileop call.
4934 */
4935 if (!error) {
4936 if (has_listeners) {
4937 kauth_authorize_fileop(vfs_context_ucred(ctx),
4938 KAUTH_FILEOP_DELETE,
4939 (uintptr_t)vp,
4940 (uintptr_t)path);
4941 }
4942
4943 if (vp->v_flag & VISHARDLINK) {
4944 //
4945 // if a hardlink gets deleted we want to blow away the
4946 // v_parent link because the path that got us to this
4947 // instance of the link is no longer valid. this will
4948 // force the next call to get the path to ask the file
4949 // system instead of just following the v_parent link.
4950 //
4951 vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
4952 }
4953
4954#if CONFIG_FSE
4955 if (need_event) {
4956 if (vp->v_flag & VISHARDLINK) {
4957 get_fse_info(vp, &finfo, ctx);
4958 } else if (vap) {
4959 vnode_get_fse_info_from_vap(vp, &finfo, vap);
4960 }
4961 if (truncated_path) {
4962 finfo.mode |= FSE_TRUNCATED_PATH;
4963 }
4964 add_fsevent(FSE_DELETE, ctx,
4965 FSE_ARG_STRING, len, path,
4966 FSE_ARG_FINFO, &finfo,
4967 FSE_ARG_DONE);
4968 }
4969#endif
4970 }
4971
4972out:
4973 if (path != NULL)
4974 RELEASE_PATH(path);
4975
4976#if NAMEDRSRCFORK
4977 /* recycle the deleted rsrc fork vnode to force a reclaim, which
4978 * will cause its shadow file to go away if necessary.
4979 */
4980 if (vp && (vnode_isnamedstream(vp)) &&
4981 (vp->v_parent != NULLVP) &&
4982 vnode_isshadow(vp)) {
4983 vnode_recycle(vp);
4984 }
4985#endif
4986 /*
4987 * nameidone has to happen before we vnode_put(dvp)
4988 * since it may need to release the fs_nodelock on the dvp
4989 */
4990 nameidone(&nd);
4991 vnode_put(dvp);
4992 if (vp) {
4993 vnode_put(vp);
4994 }
4995
4996 if (do_retry) {
4997 goto retry;
4998 }
4999
5000 return (error);
5001}
5002
5003int
5004unlink1(vfs_context_t ctx, vnode_t start_dvp, user_addr_t path_arg,
5005 enum uio_seg segflg, int unlink_flags)
5006{
5007 return (unlinkat_internal(ctx, AT_FDCWD, start_dvp, path_arg, segflg,
5008 unlink_flags));
5009}
5010
5011/*
5012 * Delete a name from the filesystem using Carbon semantics.
5013 */
5014int
5015delete(__unused proc_t p, struct delete_args *uap, __unused int32_t *retval)
5016{
5017 return (unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
5018 uap->path, UIO_USERSPACE, VNODE_REMOVE_NODELETEBUSY));
5019}
5020
5021/*
5022 * Delete a name from the filesystem using POSIX semantics.
5023 */
5024int
5025unlink(__unused proc_t p, struct unlink_args *uap, __unused int32_t *retval)
5026{
5027 return (unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
5028 uap->path, UIO_USERSPACE, 0));
5029}
5030
5031int
5032unlinkat(__unused proc_t p, struct unlinkat_args *uap, __unused int32_t *retval)
5033{
5034 if (uap->flag & ~AT_REMOVEDIR)
5035 return (EINVAL);
5036
5037 if (uap->flag & AT_REMOVEDIR)
5038 return (rmdirat_internal(vfs_context_current(), uap->fd,
5039 uap->path, UIO_USERSPACE));
5040 else
5041 return (unlinkat_internal(vfs_context_current(), uap->fd,
5042 NULLVP, uap->path, UIO_USERSPACE, 0));
5043}
5044
5045/*
5046 * Reposition read/write file offset.
5047 */
5048int
5049lseek(proc_t p, struct lseek_args *uap, off_t *retval)
5050{
5051 struct fileproc *fp;
5052 vnode_t vp;
5053 struct vfs_context *ctx;
5054 off_t offset = uap->offset, file_size;
5055 int error;
5056
5057 if ( (error = fp_getfvp(p,uap->fd, &fp, &vp)) ) {
5058 if (error == ENOTSUP)
5059 return (ESPIPE);
5060 return (error);
5061 }
5062 if (vnode_isfifo(vp)) {
5063 file_drop(uap->fd);
5064 return(ESPIPE);
5065 }
5066
5067
5068 ctx = vfs_context_current();
5069#if CONFIG_MACF
5070 if (uap->whence == L_INCR && uap->offset == 0)
5071 error = mac_file_check_get_offset(vfs_context_ucred(ctx),
5072 fp->f_fglob);
5073 else
5074 error = mac_file_check_change_offset(vfs_context_ucred(ctx),
5075 fp->f_fglob);
5076 if (error) {
5077 file_drop(uap->fd);
5078 return (error);
5079 }
5080#endif
5081 if ( (error = vnode_getwithref(vp)) ) {
5082 file_drop(uap->fd);
5083 return(error);
5084 }
5085
5086 switch (uap->whence) {
5087 case L_INCR:
5088 offset += fp->f_fglob->fg_offset;
5089 break;
5090 case L_XTND:
5091 if ((error = vnode_size(vp, &file_size, ctx)) != 0)
5092 break;
5093 offset += file_size;
5094 break;
5095 case L_SET:
5096 break;
5097 case SEEK_HOLE:
5098 error = VNOP_IOCTL(vp, FSIOC_FIOSEEKHOLE, (caddr_t)&offset, 0, ctx);
5099 break;
5100 case SEEK_DATA:
5101 error = VNOP_IOCTL(vp, FSIOC_FIOSEEKDATA, (caddr_t)&offset, 0, ctx);
5102 break;
5103 default:
5104 error = EINVAL;
5105 }
5106 if (error == 0) {
5107 if (uap->offset > 0 && offset < 0) {
5108 /* Incremented/relative move past max size */
5109 error = EOVERFLOW;
5110 } else {
5111 /*
5112 * Allow negative offsets on character devices, per
5113 * POSIX 1003.1-2001. Most likely for writing disk
5114 * labels.
5115 */
5116 if (offset < 0 && vp->v_type != VCHR) {
5117 /* Decremented/relative move before start */
5118 error = EINVAL;
5119 } else {
5120 /* Success */
5121 fp->f_fglob->fg_offset = offset;
5122 *retval = fp->f_fglob->fg_offset;
5123 }
5124 }
5125 }
5126
5127 /*
5128 * An lseek can affect whether data is "available to read." Use
5129 * hint of NOTE_NONE so no EVFILT_VNODE events fire
5130 */
5131 post_event_if_success(vp, error, NOTE_NONE);
5132 (void)vnode_put(vp);
5133 file_drop(uap->fd);
5134 return (error);
5135}
5136
5137
5138/*
5139 * Check access permissions.
5140 *
5141 * Returns: 0 Success
5142 * vnode_authorize:???
5143 */
5144static int
5145access1(vnode_t vp, vnode_t dvp, int uflags, vfs_context_t ctx)
5146{
5147 kauth_action_t action;
5148 int error;
5149
5150 /*
5151 * If just the regular access bits, convert them to something
5152 * that vnode_authorize will understand.
5153 */
5154 if (!(uflags & _ACCESS_EXTENDED_MASK)) {
5155 action = 0;
5156 if (uflags & R_OK)
5157 action |= KAUTH_VNODE_READ_DATA; /* aka KAUTH_VNODE_LIST_DIRECTORY */
5158 if (uflags & W_OK) {
5159 if (vnode_isdir(vp)) {
5160 action |= KAUTH_VNODE_ADD_FILE |
5161 KAUTH_VNODE_ADD_SUBDIRECTORY;
5162 /* might want delete rights here too */
5163 } else {
5164 action |= KAUTH_VNODE_WRITE_DATA;
5165 }
5166 }
5167 if (uflags & X_OK) {
5168 if (vnode_isdir(vp)) {
5169 action |= KAUTH_VNODE_SEARCH;
5170 } else {
5171 action |= KAUTH_VNODE_EXECUTE;
5172 }
5173 }
5174 } else {
5175 /* take advantage of definition of uflags */
5176 action = uflags >> 8;
5177 }
5178
5179#if CONFIG_MACF
5180 error = mac_vnode_check_access(ctx, vp, uflags);
5181 if (error)
5182 return (error);
5183#endif /* MAC */
5184
5185 /* action == 0 means only check for existence */
5186 if (action != 0) {
5187 error = vnode_authorize(vp, dvp, action | KAUTH_VNODE_ACCESS, ctx);
5188 } else {
5189 error = 0;
5190 }
5191
5192 return(error);
5193}
5194
5195
5196
5197/*
5198 * access_extended: Check access permissions in bulk.
5199 *
5200 * Description: uap->entries Pointer to an array of accessx
5201 * descriptor structs, plus one or
5202 * more NULL terminated strings (see
5203 * "Notes" section below).
5204 * uap->size Size of the area pointed to by
5205 * uap->entries.
5206 * uap->results Pointer to the results array.
5207 *
5208 * Returns: 0 Success
5209 * ENOMEM Insufficient memory
5210 * EINVAL Invalid arguments
5211 * namei:EFAULT Bad address
5212 * namei:ENAMETOOLONG Filename too long
5213 * namei:ENOENT No such file or directory
5214 * namei:ELOOP Too many levels of symbolic links
5215 * namei:EBADF Bad file descriptor
5216 * namei:ENOTDIR Not a directory
5217 * namei:???
5218 * access1:
5219 *
5220 * Implicit returns:
5221 * uap->results Array contents modified
5222 *
5223 * Notes: The uap->entries are structured as an arbitrary length array
5224 * of accessx descriptors, followed by one or more NULL terminated
5225 * strings
5226 *
5227 * struct accessx_descriptor[0]
5228 * ...
5229 * struct accessx_descriptor[n]
5230 * char name_data[0];
5231 *
5232 * We determine the entry count by walking the buffer containing
5233 * the uap->entries argument descriptor. For each descriptor we
5234 * see, the valid values for the offset ad_name_offset will be
5235 * in the byte range:
5236 *
5237 * [ uap->entries + sizeof(struct accessx_descriptor) ]
5238 * to
5239 * [ uap->entries + uap->size - 2 ]
5240 *
5241 * since we must have at least one string, and the string must
5242 * be at least one character plus the NULL terminator in length.
5243 *
5244 * XXX: Need to support the check-as uid argument
5245 */
int
access_extended(__unused proc_t p, struct access_extended_args *uap, __unused int32_t *retval)
{
	struct accessx_descriptor *input = NULL;
	errno_t *result = NULL;
	errno_t error = 0;
	int wantdelete = 0;
	unsigned int desc_max, desc_actual, i, j;
	struct vfs_context context;
	struct nameidata nd;
	int niopts;
	vnode_t vp = NULL;
	vnode_t dvp = NULL;
#define ACCESSX_MAX_DESCR_ON_STACK 10
	struct accessx_descriptor stack_input[ACCESSX_MAX_DESCR_ON_STACK];

	/* must be NULL-safe for the cleanup path below */
	context.vc_ucred = NULL;

	/*
	 * Validate parameters; if valid, copy the descriptor array and string
	 * arguments into local memory. Before proceeding, the following
	 * conditions must have been met:
	 *
	 * o The total size is not permitted to exceed ACCESSX_MAX_TABLESIZE
	 * o There must be sufficient room in the request for at least one
	 *   descriptor and a one byte NUL terminated string.
	 * o The allocation of local storage must not fail.
	 */
	if (uap->size > ACCESSX_MAX_TABLESIZE)
		return(ENOMEM);
	if (uap->size < (sizeof(struct accessx_descriptor) + 2))
		return(EINVAL);
	if (uap->size <= sizeof (stack_input)) {
		/* small requests are served from the on-stack buffer */
		input = stack_input;
	} else {
		MALLOC(input, struct accessx_descriptor *, uap->size, M_TEMP, M_WAITOK);
		if (input == NULL) {
			error = ENOMEM;
			goto out;
		}
	}
	error = copyin(uap->entries, input, uap->size);
	if (error)
		goto out;

	AUDIT_ARG(opaque, input, uap->size);

	/*
	 * Force NUL termination of the copyin buffer to avoid namei() running
	 * off the end.  If the caller passes us bogus data, they may get a
	 * bogus result.
	 */
	((char *)input)[uap->size - 1] = 0;

	/*
	 * Access is defined as checking against the process' real identity,
	 * even if operations are checking the effective identity.  This
	 * requires that we use a local vfs context.
	 */
	context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
	context.vc_thread = current_thread();

	/*
	 * Find out how many entries we have, so we can allocate the result
	 * array by walking the list and adjusting the count downward by the
	 * earliest string offset we see.
	 */
	desc_max = (uap->size - 2) / sizeof(struct accessx_descriptor);
	desc_actual = desc_max;
	for (i = 0; i < desc_actual; i++) {
		/*
		 * Take the offset to the name string for this entry and
		 * convert to an input array index, which would be one off
		 * the end of the array if this entry was the lowest-addressed
		 * name string.
		 */
		j = input[i].ad_name_offset / sizeof(struct accessx_descriptor);

		/*
		 * An offset greater than the max allowable offset is an error.
		 * It is also an error for any valid entry to point
		 * to a location prior to the end of the current entry, if
		 * it's not a reference to the string of the previous entry.
		 */
		if (j > desc_max || (j != 0 && j <= i)) {
			error = EINVAL;
			goto out;
		}

		/* Also do not let ad_name_offset point to something beyond the size of the input */
		if (input[i].ad_name_offset >= uap->size) {
			error = EINVAL;
			goto out;
		}

		/*
		 * An offset of 0 means use the previous descriptor's offset;
		 * this is used to chain multiple requests for the same file
		 * to avoid multiple lookups.
		 */
		if (j == 0) {
			/* This is not valid for the first entry */
			if (i == 0) {
				error = EINVAL;
				goto out;
			}
			continue;
		}

		/*
		 * If the offset of the string for this descriptor is before
		 * what we believe is the current actual last descriptor,
		 * then we need to adjust our estimate downward; this permits
		 * the string table following the last descriptor to be out
		 * of order relative to the descriptor list.
		 */
		if (j < desc_actual)
			desc_actual = j;
	}

	/*
	 * We limit the actual number of descriptors we are willing to process
	 * to a hard maximum of ACCESSX_MAX_DESCRIPTORS.  If the number being
	 * requested does not exceed this limit,
	 */
	if (desc_actual > ACCESSX_MAX_DESCRIPTORS) {
		error = ENOMEM;
		goto out;
	}
	MALLOC(result, errno_t *, desc_actual * sizeof(errno_t), M_TEMP, M_WAITOK | M_ZERO);
	if (result == NULL) {
		error = ENOMEM;
		goto out;
	}

	/*
	 * Do the work by iterating over the descriptor entries we know to
	 * at least appear to contain valid data.
	 */
	error = 0;
	for (i = 0; i < desc_actual; i++) {
		/*
		 * If the ad_name_offset is 0, then we use the previous
		 * results to make the check; otherwise, we are looking up
		 * a new file name.
		 */
		if (input[i].ad_name_offset != 0) {
			/* discard old vnodes */
			if (vp) {
				vnode_put(vp);
				vp = NULL;
			}
			if (dvp) {
				vnode_put(dvp);
				dvp = NULL;
			}

			/*
			 * Scan forward in the descriptor list to see if we
			 * need the parent vnode.  We will need it if we are
			 * deleting, since we must have rights to remove
			 * entries in the parent directory, as well as the
			 * rights to delete the object itself.
			 */
			wantdelete = input[i].ad_flags & _DELETE_OK;
			for (j = i + 1; (j < desc_actual) && (input[j].ad_name_offset == 0); j++)
				if (input[j].ad_flags & _DELETE_OK)
					wantdelete = 1;

			niopts = FOLLOW | AUDITVNPATH1;

			/* need parent for vnode_authorize for deletion test */
			if (wantdelete)
				niopts |= WANTPARENT;

			/* do the lookup */
			NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, UIO_SYSSPACE,
			       CAST_USER_ADDR_T(((const char *)input) + input[i].ad_name_offset),
			       &context);
			error = namei(&nd);
			if (!error) {
				vp = nd.ni_vp;
				if (wantdelete)
					dvp = nd.ni_dvp;
			}
			nameidone(&nd);
		}

		/*
		 * Handle lookup errors.  The "expected" lookup failures are
		 * reported per-descriptor; anything else aborts the call.
		 */
		switch(error) {
		case ENOENT:
		case EACCES:
		case EPERM:
		case ENOTDIR:
			result[i] = error;
			break;
		case 0:
			/* run this access check */
			result[i] = access1(vp, dvp, input[i].ad_flags, &context);
			break;
		default:
			/* fatal lookup error */

			goto out;
		}
	}

	AUDIT_ARG(data, result, sizeof(errno_t), desc_actual);

	/* copy out results */
	error = copyout(result, uap->results, desc_actual * sizeof(errno_t));

out:
	if (input && input != stack_input)
		FREE(input, M_TEMP);
	if (result)
		FREE(result, M_TEMP);
	if (vp)
		vnode_put(vp);
	if (dvp)
		vnode_put(dvp);
	if (IS_VALID_CRED(context.vc_ucred))
		kauth_cred_unref(&context.vc_ucred);
	return(error);
}
5473
5474
5475/*
5476 * Returns: 0 Success
5477 * namei:EFAULT Bad address
5478 * namei:ENAMETOOLONG Filename too long
5479 * namei:ENOENT No such file or directory
5480 * namei:ELOOP Too many levels of symbolic links
5481 * namei:EBADF Bad file descriptor
5482 * namei:ENOTDIR Not a directory
5483 * namei:???
5484 * access1:
5485 */
static int
faccessat_internal(vfs_context_t ctx, int fd, user_addr_t path, int amode,
    int flag, enum uio_seg segflg)
{
	/*
	 * Common implementation for access() and faccessat(): check access
	 * permissions for 'path' (relative to fd) using the process' real
	 * identity unless AT_EACCESS was specified.
	 */
	int error;
	struct nameidata nd;
	int niopts;
	struct vfs_context context;
#if NAMEDRSRCFORK
	int is_namedstream = 0;
#endif

	/*
	 * Unless the AT_EACCESS option is used, Access is defined as checking
	 * against the process' real identity, even if operations are checking
	 * the effective identity.  So we need to tweak the credential
	 * in the context for that case.
	 */
	if (!(flag & AT_EACCESS))
		context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
	else
		context.vc_ucred = ctx->vc_ucred;
	context.vc_thread = ctx->vc_thread;


	niopts = FOLLOW | AUDITVNPATH1;
	/* need parent for vnode_authorize for deletion test */
	if (amode & _DELETE_OK)
		niopts |= WANTPARENT;
	NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, segflg,
	       path, &context);

#if NAMEDRSRCFORK
	/* access(F_OK) calls are allowed for resource forks. */
	if (amode == F_OK)
		nd.ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
#endif
	error = nameiat(&nd, fd);
	if (error)
		goto out;

#if NAMEDRSRCFORK
	/* Grab reference on the shadow stream file vnode to
	 * force an inactive on release which will mark it
	 * for recycle.
	 */
	if (vnode_isnamedstream(nd.ni_vp) &&
	    (nd.ni_vp->v_parent != NULLVP) &&
	    vnode_isshadow(nd.ni_vp)) {
		is_namedstream = 1;
		vnode_ref(nd.ni_vp);
	}
#endif

	/* the actual permission check */
	error = access1(nd.ni_vp, nd.ni_dvp, amode, &context);

#if NAMEDRSRCFORK
	if (is_namedstream) {
		vnode_rele(nd.ni_vp);
	}
#endif

	vnode_put(nd.ni_vp);
	/* dvp was only taken when the deletion test needed the parent */
	if (amode & _DELETE_OK)
		vnode_put(nd.ni_dvp);
	nameidone(&nd);

out:
	/* drop the real-identity credential we copied above */
	if (!(flag & AT_EACCESS))
		kauth_cred_unref(&context.vc_ucred);
	return (error);
}
5558
5559int
5560access(__unused proc_t p, struct access_args *uap, __unused int32_t *retval)
5561{
5562 return (faccessat_internal(vfs_context_current(), AT_FDCWD,
5563 uap->path, uap->flags, 0, UIO_USERSPACE));
5564}
5565
5566int
5567faccessat(__unused proc_t p, struct faccessat_args *uap,
5568 __unused int32_t *retval)
5569{
5570 if (uap->flag & ~AT_EACCESS)
5571 return (EINVAL);
5572
5573 return (faccessat_internal(vfs_context_current(), uap->fd,
5574 uap->path, uap->amode, uap->flag, UIO_USERSPACE));
5575}
5576
5577/*
5578 * Returns: 0 Success
5579 * EFAULT
5580 * copyout:EFAULT
5581 * namei:???
5582 * vn_stat:???
5583 */
5584static int
5585fstatat_internal(vfs_context_t ctx, user_addr_t path, user_addr_t ub,
5586 user_addr_t xsecurity, user_addr_t xsecurity_size, int isstat64,
5587 enum uio_seg segflg, int fd, int flag)
5588{
5589 struct nameidata nd;
5590 int follow;
5591 union {
5592 struct stat sb;
5593 struct stat64 sb64;
5594 } source = {};
5595 union {
5596 struct user64_stat user64_sb;
5597 struct user32_stat user32_sb;
5598 struct user64_stat64 user64_sb64;
5599 struct user32_stat64 user32_sb64;
5600 } dest = {};
5601 caddr_t sbp;
5602 int error, my_size;
5603 kauth_filesec_t fsec;
5604 size_t xsecurity_bufsize;
5605 void * statptr;
5606
5607 follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
5608 NDINIT(&nd, LOOKUP, OP_GETATTR, follow | AUDITVNPATH1,
5609 segflg, path, ctx);
5610
5611#if NAMEDRSRCFORK
5612 int is_namedstream = 0;
5613 /* stat calls are allowed for resource forks. */
5614 nd.ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
5615#endif
5616 error = nameiat(&nd, fd);
5617 if (error)
5618 return (error);
5619 fsec = KAUTH_FILESEC_NONE;
5620
5621 statptr = (void *)&source;
5622
5623#if NAMEDRSRCFORK
5624 /* Grab reference on the shadow stream file vnode to
5625 * force an inactive on release which will mark it
5626 * for recycle.
5627 */
5628 if (vnode_isnamedstream(nd.ni_vp) &&
5629 (nd.ni_vp->v_parent != NULLVP) &&
5630 vnode_isshadow(nd.ni_vp)) {
5631 is_namedstream = 1;
5632 vnode_ref(nd.ni_vp);
5633 }
5634#endif
5635
5636 error = vn_stat(nd.ni_vp, statptr, (xsecurity != USER_ADDR_NULL ? &fsec : NULL), isstat64, ctx);
5637
5638#if NAMEDRSRCFORK
5639 if (is_namedstream) {
5640 vnode_rele(nd.ni_vp);
5641 }
5642#endif
5643 vnode_put(nd.ni_vp);
5644 nameidone(&nd);
5645
5646 if (error)
5647 return (error);
5648 /* Zap spare fields */
5649 if (isstat64 != 0) {
5650 source.sb64.st_lspare = 0;
5651 source.sb64.st_qspare[0] = 0LL;
5652 source.sb64.st_qspare[1] = 0LL;
5653 if (IS_64BIT_PROCESS(vfs_context_proc(ctx))) {
5654 munge_user64_stat64(&source.sb64, &dest.user64_sb64);
5655 my_size = sizeof(dest.user64_sb64);
5656 sbp = (caddr_t)&dest.user64_sb64;
5657 } else {
5658 munge_user32_stat64(&source.sb64, &dest.user32_sb64);
5659 my_size = sizeof(dest.user32_sb64);
5660 sbp = (caddr_t)&dest.user32_sb64;
5661 }
5662 /*
5663 * Check if we raced (post lookup) against the last unlink of a file.
5664 */
5665 if ((source.sb64.st_nlink == 0) && S_ISREG(source.sb64.st_mode)) {
5666 source.sb64.st_nlink = 1;
5667 }
5668 } else {
5669 source.sb.st_lspare = 0;
5670 source.sb.st_qspare[0] = 0LL;
5671 source.sb.st_qspare[1] = 0LL;
5672 if (IS_64BIT_PROCESS(vfs_context_proc(ctx))) {
5673 munge_user64_stat(&source.sb, &dest.user64_sb);
5674 my_size = sizeof(dest.user64_sb);
5675 sbp = (caddr_t)&dest.user64_sb;
5676 } else {
5677 munge_user32_stat(&source.sb, &dest.user32_sb);
5678 my_size = sizeof(dest.user32_sb);
5679 sbp = (caddr_t)&dest.user32_sb;
5680 }
5681
5682 /*
5683 * Check if we raced (post lookup) against the last unlink of a file.
5684 */
5685 if ((source.sb.st_nlink == 0) && S_ISREG(source.sb.st_mode)) {
5686 source.sb.st_nlink = 1;
5687 }
5688 }
5689 if ((error = copyout(sbp, ub, my_size)) != 0)
5690 goto out;
5691
5692 /* caller wants extended security information? */
5693 if (xsecurity != USER_ADDR_NULL) {
5694
5695 /* did we get any? */
5696 if (fsec == KAUTH_FILESEC_NONE) {
5697 if (susize(xsecurity_size, 0) != 0) {
5698 error = EFAULT;
5699 goto out;
5700 }
5701 } else {
5702 /* find the user buffer size */
5703 xsecurity_bufsize = fusize(xsecurity_size);
5704
5705 /* copy out the actual data size */
5706 if (susize(xsecurity_size, KAUTH_FILESEC_COPYSIZE(fsec)) != 0) {
5707 error = EFAULT;
5708 goto out;
5709 }
5710
5711 /* if the caller supplied enough room, copy out to it */
5712 if (xsecurity_bufsize >= KAUTH_FILESEC_COPYSIZE(fsec))
5713 error = copyout(fsec, xsecurity, KAUTH_FILESEC_COPYSIZE(fsec));
5714 }
5715 }
5716out:
5717 if (fsec != KAUTH_FILESEC_NONE)
5718 kauth_filesec_free(fsec);
5719 return (error);
5720}
5721
5722/*
5723 * stat_extended: Get file status; with extended security (ACL).
5724 *
5725 * Parameters: p (ignored)
5726 * uap User argument descriptor (see below)
5727 * retval (ignored)
5728 *
5729 * Indirect: uap->path Path of file to get status from
5730 * uap->ub User buffer (holds file status info)
5731 * uap->xsecurity ACL to get (extended security)
5732 * uap->xsecurity_size Size of ACL
5733 *
5734 * Returns: 0 Success
5735 * !0 errno value
5736 *
5737 */
5738int
5739stat_extended(__unused proc_t p, struct stat_extended_args *uap,
5740 __unused int32_t *retval)
5741{
5742 return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5743 uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
5744 0));
5745}
5746
5747/*
5748 * Returns: 0 Success
5749 * fstatat_internal:??? [see fstatat_internal() in this file]
5750 */
5751int
5752stat(__unused proc_t p, struct stat_args *uap, __unused int32_t *retval)
5753{
5754 return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5755 0, 0, 0, UIO_USERSPACE, AT_FDCWD, 0));
5756}
5757
5758int
5759stat64(__unused proc_t p, struct stat64_args *uap, __unused int32_t *retval)
5760{
5761 return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5762 0, 0, 1, UIO_USERSPACE, AT_FDCWD, 0));
5763}
5764
5765/*
5766 * stat64_extended: Get file status; can handle large inode numbers; with extended security (ACL).
5767 *
5768 * Parameters: p (ignored)
5769 * uap User argument descriptor (see below)
5770 * retval (ignored)
5771 *
5772 * Indirect: uap->path Path of file to get status from
5773 * uap->ub User buffer (holds file status info)
5774 * uap->xsecurity ACL to get (extended security)
5775 * uap->xsecurity_size Size of ACL
5776 *
5777 * Returns: 0 Success
5778 * !0 errno value
5779 *
5780 */
5781int
5782stat64_extended(__unused proc_t p, struct stat64_extended_args *uap, __unused int32_t *retval)
5783{
5784 return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5785 uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
5786 0));
5787}
5788
5789/*
5790 * lstat_extended: Get file status; does not follow links; with extended security (ACL).
5791 *
5792 * Parameters: p (ignored)
5793 * uap User argument descriptor (see below)
5794 * retval (ignored)
5795 *
5796 * Indirect: uap->path Path of file to get status from
5797 * uap->ub User buffer (holds file status info)
5798 * uap->xsecurity ACL to get (extended security)
5799 * uap->xsecurity_size Size of ACL
5800 *
5801 * Returns: 0 Success
5802 * !0 errno value
5803 *
5804 */
5805int
5806lstat_extended(__unused proc_t p, struct lstat_extended_args *uap, __unused int32_t *retval)
5807{
5808 return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5809 uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
5810 AT_SYMLINK_NOFOLLOW));
5811}
5812
5813/*
5814 * Get file status; this version does not follow links.
5815 */
5816int
5817lstat(__unused proc_t p, struct lstat_args *uap, __unused int32_t *retval)
5818{
5819 return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5820 0, 0, 0, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW));
5821}
5822
5823int
5824lstat64(__unused proc_t p, struct lstat64_args *uap, __unused int32_t *retval)
5825{
5826 return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5827 0, 0, 1, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW));
5828}
5829
5830/*
5831 * lstat64_extended: Get file status; can handle large inode numbers; does not
5832 * follow links; with extended security (ACL).
5833 *
5834 * Parameters: p (ignored)
5835 * uap User argument descriptor (see below)
5836 * retval (ignored)
5837 *
5838 * Indirect: uap->path Path of file to get status from
5839 * uap->ub User buffer (holds file status info)
5840 * uap->xsecurity ACL to get (extended security)
5841 * uap->xsecurity_size Size of ACL
5842 *
5843 * Returns: 0 Success
5844 * !0 errno value
5845 *
5846 */
5847int
5848lstat64_extended(__unused proc_t p, struct lstat64_extended_args *uap, __unused int32_t *retval)
5849{
5850 return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5851 uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
5852 AT_SYMLINK_NOFOLLOW));
5853}
5854
5855int
5856fstatat(__unused proc_t p, struct fstatat_args *uap, __unused int32_t *retval)
5857{
5858 if (uap->flag & ~AT_SYMLINK_NOFOLLOW)
5859 return (EINVAL);
5860
5861 return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5862 0, 0, 0, UIO_USERSPACE, uap->fd, uap->flag));
5863}
5864
5865int
5866fstatat64(__unused proc_t p, struct fstatat64_args *uap,
5867 __unused int32_t *retval)
5868{
5869 if (uap->flag & ~AT_SYMLINK_NOFOLLOW)
5870 return (EINVAL);
5871
5872 return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5873 0, 0, 1, UIO_USERSPACE, uap->fd, uap->flag));
5874}
5875
5876/*
5877 * Get configurable pathname variables.
5878 *
5879 * Returns: 0 Success
5880 * namei:???
5881 * vn_pathconf:???
5882 *
5883 * Notes: Global implementation constants are intended to be
5884 * implemented in this function directly; all other constants
5885 * are per-FS implementation, and therefore must be handled in
5886 * each respective FS, instead.
5887 *
5888 * XXX We implement some things globally right now that should actually be
5889 * XXX per-FS; we will need to deal with this at some point.
5890 */
5891/* ARGSUSED */
5892int
5893pathconf(__unused proc_t p, struct pathconf_args *uap, int32_t *retval)
5894{
5895 int error;
5896 struct nameidata nd;
5897 vfs_context_t ctx = vfs_context_current();
5898
5899 NDINIT(&nd, LOOKUP, OP_PATHCONF, FOLLOW | AUDITVNPATH1,
5900 UIO_USERSPACE, uap->path, ctx);
5901 error = namei(&nd);
5902 if (error)
5903 return (error);
5904
5905 error = vn_pathconf(nd.ni_vp, uap->name, retval, ctx);
5906
5907 vnode_put(nd.ni_vp);
5908 nameidone(&nd);
5909 return (error);
5910}
5911
5912/*
5913 * Return target name of a symbolic link.
5914 */
5915/* ARGSUSED */
static int
readlinkat_internal(vfs_context_t ctx, int fd, user_addr_t path,
    enum uio_seg seg, user_addr_t buf, size_t bufsize, enum uio_seg bufseg,
    int *retval)
{
	vnode_t vp;
	uio_t auio;
	int error;
	struct nameidata nd;
	char uio_buf[ UIO_SIZEOF(1) ];

	/* NOFOLLOW: we want the symlink itself, not its target. */
	NDINIT(&nd, LOOKUP, OP_READLINK, NOFOLLOW | AUDITVNPATH1,
	    seg, path, ctx);

	error = nameiat(&nd, fd);
	if (error)
		return (error);
	vp = nd.ni_vp;

	nameidone(&nd);

	/* Stack-backed single-iovec uio describing the caller's buffer. */
	auio = uio_createwithbuffer(1, 0, bufseg, UIO_READ,
	    &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, buf, bufsize);
	if (vp->v_type != VLNK) {
		error = EINVAL;
	} else {
		/*
		 * 'error' is 0 here (nameiat succeeded), so when CONFIG_MACF
		 * is disabled the authorize call below still runs.
		 */
#if CONFIG_MACF
		error = mac_vnode_check_readlink(ctx, vp);
#endif
		if (error == 0)
			error = vnode_authorize(vp, NULL, KAUTH_VNODE_READ_DATA,
			    ctx);
		if (error == 0)
			error = VNOP_READLINK(vp, auio, ctx);
	}
	vnode_put(vp);

	/* Bytes actually copied = requested size minus what is left in the uio. */
	*retval = bufsize - (int)uio_resid(auio);
	return (error);
}
5957
5958int
5959readlink(proc_t p, struct readlink_args *uap, int32_t *retval)
5960{
5961 enum uio_seg procseg;
5962
5963 procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
5964 return (readlinkat_internal(vfs_context_current(), AT_FDCWD,
5965 CAST_USER_ADDR_T(uap->path), procseg, CAST_USER_ADDR_T(uap->buf),
5966 uap->count, procseg, retval));
5967}
5968
5969int
5970readlinkat(proc_t p, struct readlinkat_args *uap, int32_t *retval)
5971{
5972 enum uio_seg procseg;
5973
5974 procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
5975 return (readlinkat_internal(vfs_context_current(), uap->fd, uap->path,
5976 procseg, uap->buf, uap->bufsize, procseg, retval));
5977}
5978
5979/*
5980 * Change file flags.
5981 *
5982 * NOTE: this will vnode_put() `vp'
5983 */
static int
chflags1(vnode_t vp, int flags, vfs_context_t ctx)
{
	struct vnode_attr va;
	kauth_action_t action;
	int error;

	VATTR_INIT(&va);
	VATTR_SET(&va, va_flags, flags);

#if CONFIG_MACF
	error = mac_vnode_check_setflags(ctx, vp, flags);
	if (error)
		goto out;
#endif

	/* request authorization, disregard immutability */
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0)
		goto out;
	/*
	 * Request that the auth layer disregard those file flags it's allowed to when
	 * authorizing this operation; we need to do this in order to be able to
	 * clear immutable flags.
	 */
	if (action && ((error = vnode_authorize(vp, NULL, action | KAUTH_VNODE_NOIMMUTABLE, ctx)) != 0))
		goto out;
	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	if (error == 0)
		mac_vnode_notify_setflags(ctx, vp, flags);
#endif

	/* If the FS silently ignored va_flags, report lack of support. */
	if ((error == 0) && !VATTR_IS_SUPPORTED(&va, va_flags)) {
		error = ENOTSUP;
	}
out:
	/* NOTE: this function consumes the caller's iocount on vp. */
	vnode_put(vp);
	return(error);
}
6024
6025/*
6026 * Change flags of a file given a path name.
6027 */
6028/* ARGSUSED */
6029int
6030chflags(__unused proc_t p, struct chflags_args *uap, __unused int32_t *retval)
6031{
6032 vnode_t vp;
6033 vfs_context_t ctx = vfs_context_current();
6034 int error;
6035 struct nameidata nd;
6036
6037 AUDIT_ARG(fflags, uap->flags);
6038 NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1,
6039 UIO_USERSPACE, uap->path, ctx);
6040 error = namei(&nd);
6041 if (error)
6042 return (error);
6043 vp = nd.ni_vp;
6044 nameidone(&nd);
6045
6046 /* we don't vnode_put() here because chflags1 does internally */
6047 error = chflags1(vp, uap->flags, ctx);
6048
6049 return(error);
6050}
6051
6052/*
6053 * Change flags of a file given a file descriptor.
6054 */
6055/* ARGSUSED */
6056int
6057fchflags(__unused proc_t p, struct fchflags_args *uap, __unused int32_t *retval)
6058{
6059 vnode_t vp;
6060 int error;
6061
6062 AUDIT_ARG(fd, uap->fd);
6063 AUDIT_ARG(fflags, uap->flags);
6064 if ( (error = file_vnode(uap->fd, &vp)) )
6065 return (error);
6066
6067 if ((error = vnode_getwithref(vp))) {
6068 file_drop(uap->fd);
6069 return(error);
6070 }
6071
6072 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
6073
6074 /* we don't vnode_put() here because chflags1 does internally */
6075 error = chflags1(vp, uap->flags, vfs_context_current());
6076
6077 file_drop(uap->fd);
6078 return (error);
6079}
6080
6081/*
6082 * Change security information on a filesystem object.
6083 *
6084 * Returns: 0 Success
6085 * EPERM Operation not permitted
6086 * vnode_authattr:??? [anything vnode_authattr can return]
6087 * vnode_authorize:??? [anything vnode_authorize can return]
6088 * vnode_setattr:??? [anything vnode_setattr can return]
6089 *
6090 * Notes: If vnode_authattr or vnode_authorize return EACCES, it will be
6091 * translated to EPERM before being returned.
6092 */
static int
chmod_vnode(vfs_context_t ctx, vnode_t vp, struct vnode_attr *vap)
{
	kauth_action_t action;
	int error;

	AUDIT_ARG(mode, vap->va_mode);
	/* XXX audit new args */

#if NAMEDSTREAMS
	/* chmod calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		return (EPERM);
	}
#endif

#if CONFIG_MACF
	/* Run the MAC check matching whichever attributes are being changed. */
	if (VATTR_IS_ACTIVE(vap, va_mode) &&
	    (error = mac_vnode_check_setmode(ctx, vp, (mode_t)vap->va_mode)) != 0)
		return (error);

	if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
		if ((error = mac_vnode_check_setowner(ctx, vp,
		    VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
		    VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1)))
			return (error);
	}

	if (VATTR_IS_ACTIVE(vap, va_acl) &&
	    (error = mac_vnode_check_setacl(ctx, vp, vap->va_acl)))
		return (error);
#endif

	/* make sure that the caller is allowed to set this security information */
	if (((error = vnode_authattr(vp, vap, &action, ctx)) != 0) ||
	    ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		/* permission failures on chmod report EPERM, not EACCES */
		if (error == EACCES)
			error = EPERM;
		return(error);
	}

	if ((error = vnode_setattr(vp, vap, ctx)) != 0)
		return (error);

#if CONFIG_MACF
	/* Notify MAC policies of the successful changes. */
	if (VATTR_IS_ACTIVE(vap, va_mode))
		mac_vnode_notify_setmode(ctx, vp, (mode_t)vap->va_mode);

	if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid))
		mac_vnode_notify_setowner(ctx, vp,
		    VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
		    VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1);

	if (VATTR_IS_ACTIVE(vap, va_acl))
		mac_vnode_notify_setacl(ctx, vp, vap->va_acl);
#endif

	return (error);
}
6152
6153
6154/*
6155 * Change mode of a file given a path name.
6156 *
6157 * Returns: 0 Success
6158 * namei:??? [anything namei can return]
6159 * chmod_vnode:??? [anything chmod_vnode can return]
6160 */
6161static int
6162chmodat(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap,
6163 int fd, int flag, enum uio_seg segflg)
6164{
6165 struct nameidata nd;
6166 int follow, error;
6167
6168 follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
6169 NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1,
6170 segflg, path, ctx);
6171 if ((error = nameiat(&nd, fd)))
6172 return (error);
6173 error = chmod_vnode(ctx, nd.ni_vp, vap);
6174 vnode_put(nd.ni_vp);
6175 nameidone(&nd);
6176 return(error);
6177}
6178
6179/*
6180 * chmod_extended: Change the mode of a file given a path name; with extended
6181 * argument list (including extended security (ACL)).
6182 *
6183 * Parameters: p Process requesting the open
6184 * uap User argument descriptor (see below)
6185 * retval (ignored)
6186 *
6187 * Indirect: uap->path Path to object (same as 'chmod')
6188 * uap->uid UID to set
6189 * uap->gid GID to set
6190 * uap->mode File mode to set (same as 'chmod')
6191 * uap->xsecurity ACL to set (or delete)
6192 *
6193 * Returns: 0 Success
6194 * !0 errno value
6195 *
6196 * Notes: The kauth_filesec_t in 'va', if any, is in host byte order.
6197 *
 * XXX: We should enumerate the possible errno values here, and where
 * in the code they originated.
6200 */
int
chmod_extended(__unused proc_t p, struct chmod_extended_args *uap, __unused int32_t *retval)
{
	int error;
	struct vnode_attr va;
	kauth_filesec_t xsecdst;

	AUDIT_ARG(owner, uap->uid, uap->gid);

	/* -1 / KAUTH_*_NONE act as "leave this attribute unchanged". */
	VATTR_INIT(&va);
	if (uap->mode != -1)
		VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
	if (uap->uid != KAUTH_UID_NONE)
		VATTR_SET(&va, va_uid, uap->uid);
	if (uap->gid != KAUTH_GID_NONE)
		VATTR_SET(&va, va_gid, uap->gid);

	/* uap->xsecurity is a user address doubling as a sentinel. */
	xsecdst = NULL;
	switch(uap->xsecurity) {
	/* explicit remove request */
	case CAST_USER_ADDR_T((void *)1):	/* _FILESEC_REMOVE_ACL */
		VATTR_SET(&va, va_acl, NULL);
		break;
	/* not being set */
	case USER_ADDR_NULL:
		break;
	default:
		/* real user pointer: copy the filesec in and set its ACL */
		if ((error = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)
			return(error);
		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
		KAUTH_DEBUG("CHMOD - setting ACL with %d entries", va.va_acl->acl_entrycount);
	}

	error = chmodat(vfs_context_current(), uap->path, &va, AT_FDCWD, 0,
	    UIO_USERSPACE);

	if (xsecdst != NULL)
		kauth_filesec_free(xsecdst);
	return(error);
}
6241
6242/*
6243 * Returns: 0 Success
6244 * chmodat:??? [anything chmodat can return]
6245 */
6246static int
6247fchmodat_internal(vfs_context_t ctx, user_addr_t path, int mode, int fd,
6248 int flag, enum uio_seg segflg)
6249{
6250 struct vnode_attr va;
6251
6252 VATTR_INIT(&va);
6253 VATTR_SET(&va, va_mode, mode & ALLPERMS);
6254
6255 return (chmodat(ctx, path, &va, fd, flag, segflg));
6256}
6257
6258int
6259chmod(__unused proc_t p, struct chmod_args *uap, __unused int32_t *retval)
6260{
6261 return (fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
6262 AT_FDCWD, 0, UIO_USERSPACE));
6263}
6264
6265int
6266fchmodat(__unused proc_t p, struct fchmodat_args *uap, __unused int32_t *retval)
6267{
6268 if (uap->flag & ~AT_SYMLINK_NOFOLLOW)
6269 return (EINVAL);
6270
6271 return (fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
6272 uap->fd, uap->flag, UIO_USERSPACE));
6273}
6274
6275/*
6276 * Change mode of a file given a file descriptor.
6277 */
6278static int
6279fchmod1(__unused proc_t p, int fd, struct vnode_attr *vap)
6280{
6281 vnode_t vp;
6282 int error;
6283
6284 AUDIT_ARG(fd, fd);
6285
6286 if ((error = file_vnode(fd, &vp)) != 0)
6287 return (error);
6288 if ((error = vnode_getwithref(vp)) != 0) {
6289 file_drop(fd);
6290 return(error);
6291 }
6292 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
6293
6294 error = chmod_vnode(vfs_context_current(), vp, vap);
6295 (void)vnode_put(vp);
6296 file_drop(fd);
6297
6298 return (error);
6299}
6300
6301/*
6302 * fchmod_extended: Change mode of a file given a file descriptor; with
6303 * extended argument list (including extended security (ACL)).
6304 *
6305 * Parameters: p Process requesting to change file mode
6306 * uap User argument descriptor (see below)
6307 * retval (ignored)
6308 *
6309 * Indirect: uap->mode File mode to set (same as 'chmod')
6310 * uap->uid UID to set
6311 * uap->gid GID to set
6312 * uap->xsecurity ACL to set (or delete)
6313 * uap->fd File descriptor of file to change mode
6314 *
6315 * Returns: 0 Success
6316 * !0 errno value
6317 *
6318 */
int
fchmod_extended(proc_t p, struct fchmod_extended_args *uap, __unused int32_t *retval)
{
	int error;
	struct vnode_attr va;
	kauth_filesec_t xsecdst;

	AUDIT_ARG(owner, uap->uid, uap->gid);

	/* -1 / KAUTH_*_NONE act as "leave this attribute unchanged". */
	VATTR_INIT(&va);
	if (uap->mode != -1)
		VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
	if (uap->uid != KAUTH_UID_NONE)
		VATTR_SET(&va, va_uid, uap->uid);
	if (uap->gid != KAUTH_GID_NONE)
		VATTR_SET(&va, va_gid, uap->gid);

	/*
	 * uap->xsecurity is a user address doubling as a sentinel.
	 * NOTE(review): here USER_ADDR_NULL clears the ACL while -1 means
	 * "not being set" — the opposite mapping from chmod_extended above;
	 * presumably a historical difference in the two ABIs — confirm
	 * against the libc _FILESEC wrappers before unifying.
	 */
	xsecdst = NULL;
	switch(uap->xsecurity) {
	case USER_ADDR_NULL:
		VATTR_SET(&va, va_acl, NULL);
		break;
	case CAST_USER_ADDR_T((void *)1):	/* _FILESEC_REMOVE_ACL */
		VATTR_SET(&va, va_acl, NULL);
		break;
	/* not being set */
	case CAST_USER_ADDR_T(-1):
		break;
	default:
		if ((error = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)
			return(error);
		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
	}

	error = fchmod1(p, uap->fd, &va);


	/* xsecdst was only allocated in the 'default' case above. */
	switch(uap->xsecurity) {
	case USER_ADDR_NULL:
	case CAST_USER_ADDR_T(-1):
		break;
	default:
		if (xsecdst != NULL)
			kauth_filesec_free(xsecdst);
	}
	return(error);
}
6366
6367int
6368fchmod(proc_t p, struct fchmod_args *uap, __unused int32_t *retval)
6369{
6370 struct vnode_attr va;
6371
6372 VATTR_INIT(&va);
6373 VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
6374
6375 return(fchmod1(p, uap->fd, &va));
6376}
6377
6378
6379/*
6380 * Set ownership given a path name.
6381 */
6382/* ARGSUSED */
static int
fchownat_internal(vfs_context_t ctx, int fd, user_addr_t path, uid_t uid,
    gid_t gid, int flag, enum uio_seg segflg)
{
	vnode_t vp;
	struct vnode_attr va;
	int error;
	struct nameidata nd;
	int follow;
	kauth_action_t action;

	AUDIT_ARG(owner, uid, gid);

	/* AT_SYMLINK_NOFOLLOW decides whether a trailing symlink is chased. */
	follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1, segflg,
	    path, ctx);
	error = nameiat(&nd, fd);
	if (error)
		return (error);
	vp = nd.ni_vp;

	nameidone(&nd);

	/* VNOVAL means "leave this id unchanged". */
	VATTR_INIT(&va);
	if (uid != (uid_t)VNOVAL)
		VATTR_SET(&va, va_uid, uid);
	if (gid != (gid_t)VNOVAL)
		VATTR_SET(&va, va_gid, gid);

#if CONFIG_MACF
	error = mac_vnode_check_setowner(ctx, vp, uid, gid);
	if (error)
		goto out;
#endif

	/* preflight and authorize attribute changes */
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0)
		goto out;
	if (action && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0))
		goto out;
	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	if (error == 0)
		mac_vnode_notify_setowner(ctx, vp, uid, gid);
#endif

out:
	/*
	 * EACCES is only allowed from namei(); permissions failure should
	 * return EPERM, so we need to translate the error code.
	 */
	if (error == EACCES)
		error = EPERM;

	vnode_put(vp);
	return (error);
}
6441
6442int
6443chown(__unused proc_t p, struct chown_args *uap, __unused int32_t *retval)
6444{
6445 return (fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
6446 uap->uid, uap->gid, 0, UIO_USERSPACE));
6447}
6448
6449int
6450lchown(__unused proc_t p, struct lchown_args *uap, __unused int32_t *retval)
6451{
6452 return (fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
6453 uap->owner, uap->group, AT_SYMLINK_NOFOLLOW, UIO_USERSPACE));
6454}
6455
6456int
6457fchownat(__unused proc_t p, struct fchownat_args *uap, __unused int32_t *retval)
6458{
6459 if (uap->flag & ~AT_SYMLINK_NOFOLLOW)
6460 return (EINVAL);
6461
6462 return (fchownat_internal(vfs_context_current(), uap->fd, uap->path,
6463 uap->uid, uap->gid, uap->flag, UIO_USERSPACE));
6464}
6465
6466/*
6467 * Set ownership given a file descriptor.
6468 */
6469/* ARGSUSED */
int
fchown(__unused proc_t p, struct fchown_args *uap, __unused int32_t *retval)
{
	struct vnode_attr va;
	vfs_context_t ctx = vfs_context_current();
	vnode_t vp;
	int error;
	kauth_action_t action;

	AUDIT_ARG(owner, uap->uid, uap->gid);
	AUDIT_ARG(fd, uap->fd);

	if ( (error = file_vnode(uap->fd, &vp)) )
		return (error);

	if ( (error = vnode_getwithref(vp)) ) {
		file_drop(uap->fd);
		return(error);
	}
	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	/* VNOVAL means "leave this id unchanged". */
	VATTR_INIT(&va);
	if (uap->uid != VNOVAL)
		VATTR_SET(&va, va_uid, uap->uid);
	if (uap->gid != VNOVAL)
		VATTR_SET(&va, va_gid, uap->gid);

#if NAMEDSTREAMS
	/* chown calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		error = EPERM;
		goto out;
	}
#endif

#if CONFIG_MACF
	error = mac_vnode_check_setowner(ctx, vp, uap->uid, uap->gid);
	if (error)
		goto out;
#endif

	/* preflight and authorize attribute changes */
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0)
		goto out;
	if (action && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		/* permission failures on chown report EPERM, not EACCES */
		if (error == EACCES)
			error = EPERM;
		goto out;
	}
	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	if (error == 0)
		mac_vnode_notify_setowner(ctx, vp, uap->uid, uap->gid);
#endif

out:
	(void)vnode_put(vp);
	file_drop(uap->fd);
	return (error);
}
6531
6532static int
6533getutimes(user_addr_t usrtvp, struct timespec *tsp)
6534{
6535 int error;
6536
6537 if (usrtvp == USER_ADDR_NULL) {
6538 struct timeval old_tv;
6539 /* XXX Y2038 bug because of microtime argument */
6540 microtime(&old_tv);
6541 TIMEVAL_TO_TIMESPEC(&old_tv, &tsp[0]);
6542 tsp[1] = tsp[0];
6543 } else {
6544 if (IS_64BIT_PROCESS(current_proc())) {
6545 struct user64_timeval tv[2];
6546 error = copyin(usrtvp, (void *)tv, sizeof(tv));
6547 if (error)
6548 return (error);
6549 TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
6550 TIMEVAL_TO_TIMESPEC(&tv[1], &tsp[1]);
6551 } else {
6552 struct user32_timeval tv[2];
6553 error = copyin(usrtvp, (void *)tv, sizeof(tv));
6554 if (error)
6555 return (error);
6556 TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
6557 TIMEVAL_TO_TIMESPEC(&tv[1], &tsp[1]);
6558 }
6559 }
6560 return 0;
6561}
6562
static int
setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts,
    int nullflag)
{
	int error;
	struct vnode_attr va;
	kauth_action_t action;

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	/* ts[0] = access time, ts[1] = modification time. */
	VATTR_INIT(&va);
	VATTR_SET(&va, va_access_time, ts[0]);
	VATTR_SET(&va, va_modify_time, ts[1]);
	/* nullflag: caller passed a NULL timeptr, i.e. "set to now". */
	if (nullflag)
		va.va_vaflags |= VA_UTIMES_NULL;

#if NAMEDSTREAMS
	/* utimes calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		error = EPERM;
		goto out;
	}
#endif

#if CONFIG_MACF
	error = mac_vnode_check_setutimes(ctx, vp, ts[0], ts[1]);
	if (error)
		goto out;
#endif
	/* Explicit times require ownership; report EPERM, not EACCES. */
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		if (!nullflag && error == EACCES)
			error = EPERM;
		goto out;
	}

	/* since we may not need to auth anything, check here */
	if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		if (!nullflag && error == EACCES)
			error = EPERM;
		goto out;
	}
	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	if (error == 0)
		mac_vnode_notify_setutimes(ctx, vp, ts[0], ts[1]);
#endif

out:
	return error;
}
6614
6615/*
6616 * Set the access and modification times of a file.
6617 */
6618/* ARGSUSED */
int
utimes(__unused proc_t p, struct utimes_args *uap, __unused int32_t *retval)
{
	struct timespec ts[2];
	user_addr_t usrtvp;
	int error;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();

	/*
	 * AUDIT: Needed to change the order of operations to do the
	 * name lookup first because auditing wants the path.
	 */
	NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error)
		return (error);
	nameidone(&nd);

	/*
	 * Fetch the user-supplied time. If usrtvp is USER_ADDR_NULL, we fetch
	 * the current time instead.
	 */
	usrtvp = uap->tptr;
	if ((error = getutimes(usrtvp, ts)) != 0)
		goto out;

	error = setutimes(ctx, nd.ni_vp, ts, usrtvp == USER_ADDR_NULL);

out:
	/* Drop the iocount taken by namei() on all paths. */
	vnode_put(nd.ni_vp);
	return (error);
}
6653
6654/*
6655 * Set the access and modification times of a file.
6656 */
6657/* ARGSUSED */
6658int
6659futimes(__unused proc_t p, struct futimes_args *uap, __unused int32_t *retval)
6660{
6661 struct timespec ts[2];
6662 vnode_t vp;
6663 user_addr_t usrtvp;
6664 int error;
6665
6666 AUDIT_ARG(fd, uap->fd);
6667 usrtvp = uap->tptr;
6668 if ((error = getutimes(usrtvp, ts)) != 0)
6669 return (error);
6670 if ((error = file_vnode(uap->fd, &vp)) != 0)
6671 return (error);
6672 if((error = vnode_getwithref(vp))) {
6673 file_drop(uap->fd);
6674 return(error);
6675 }
6676
6677 error = setutimes(vfs_context_current(), vp, ts, usrtvp == 0);
6678 vnode_put(vp);
6679 file_drop(uap->fd);
6680 return(error);
6681}
6682
6683/*
6684 * Truncate a file given its path name.
6685 */
6686/* ARGSUSED */
int
truncate(__unused proc_t p, struct truncate_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	struct vnode_attr va;
	vfs_context_t ctx = vfs_context_current();
	int error;
	struct nameidata nd;
	kauth_action_t action;

	/* Negative lengths are always invalid. */
	if (uap->length < 0)
		return(EINVAL);
	NDINIT(&nd, LOOKUP, OP_TRUNCATE, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	if ((error = namei(&nd)))
		return (error);
	vp = nd.ni_vp;

	nameidone(&nd);

	/* Truncation is expressed as setting the data-size attribute. */
	VATTR_INIT(&va);
	VATTR_SET(&va, va_data_size, uap->length);

#if CONFIG_MACF
	/* NOCRED: path-based truncate has no file credential to check. */
	error = mac_vnode_check_truncate(ctx, NOCRED, vp);
	if (error)
		goto out;
#endif

	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0)
		goto out;
	if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0))
		goto out;
	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	if (error == 0)
		mac_vnode_notify_truncate(ctx, NOCRED, vp);
#endif

out:
	vnode_put(vp);
	return (error);
}
6731
6732/*
6733 * Truncate a file given a file descriptor.
6734 */
6735/* ARGSUSED */
int
ftruncate(proc_t p, struct ftruncate_args *uap, int32_t *retval)
{
	vfs_context_t ctx = vfs_context_current();
	struct vnode_attr va;
	vnode_t vp;
	struct fileproc *fp;
	int error ;
	int fd = uap->fd;

	AUDIT_ARG(fd, uap->fd);
	/* Negative lengths are always invalid. */
	if (uap->length < 0)
		return(EINVAL);

	if ( (error = fp_lookup(p,fd,&fp,0)) ) {
		return(error);
	}

	/* Only POSIX shared memory objects and vnodes may be truncated. */
	switch (FILEGLOB_DTYPE(fp->f_fglob)) {
	case DTYPE_PSXSHM:
		error = pshm_truncate(p, fp, uap->fd, uap->length, retval);
		goto out;
	case DTYPE_VNODE:
		break;
	default:
		error = EINVAL;
		goto out;
	}

	vp = (vnode_t)fp->f_fglob->fg_data;

	/* The descriptor must have been opened for writing. */
	if ((fp->f_fglob->fg_flag & FWRITE) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EINVAL;
		goto out;
	}

	if ((error = vnode_getwithref(vp)) != 0) {
		goto out;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_MACF
	error = mac_vnode_check_truncate(ctx,
	    fp->f_fglob->fg_cred, vp);
	if (error) {
		(void)vnode_put(vp);
		goto out;
	}
#endif
	/* Truncation is expressed as setting the data-size attribute. */
	VATTR_INIT(&va);
	VATTR_SET(&va, va_data_size, uap->length);
	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	if (error == 0)
		mac_vnode_notify_truncate(ctx, fp->f_fglob->fg_cred, vp);
#endif

	(void)vnode_put(vp);
out:
	file_drop(fd);
	return (error);
}
6801
6802
6803/*
6804 * Sync an open file with synchronized I/O _file_ integrity completion
6805 */
6806/* ARGSUSED */
6807int
6808fsync(proc_t p, struct fsync_args *uap, __unused int32_t *retval)
6809{
6810 __pthread_testcancel(1);
6811 return(fsync_common(p, uap, MNT_WAIT));
6812}
6813
6814
6815/*
6816 * Sync an open file with synchronized I/O _file_ integrity completion
6817 *
6818 * Notes: This is a legacy support function that does not test for
6819 * thread cancellation points.
6820 */
6821/* ARGSUSED */
6822int
6823fsync_nocancel(proc_t p, struct fsync_nocancel_args *uap, __unused int32_t *retval)
6824{
6825 return(fsync_common(p, (struct fsync_args *)uap, MNT_WAIT));
6826}
6827
6828
6829/*
6830 * Sync an open file with synchronized I/O _data_ integrity completion
6831 */
6832/* ARGSUSED */
6833int
6834fdatasync(proc_t p, struct fdatasync_args *uap, __unused int32_t *retval)
6835{
6836 __pthread_testcancel(1);
6837 return(fsync_common(p, (struct fsync_args *)uap, MNT_DWAIT));
6838}
6839
6840
6841/*
6842 * fsync_common
6843 *
6844 * Common fsync code to support both synchronized I/O file integrity completion
6845 * (normal fsync) and synchronized I/O data integrity completion (fdatasync).
6846 *
6847 * If 'flags' is MNT_DWAIT, the caller is requesting data integrity, which
6848 * will only guarantee that the file data contents are retrievable. If
 * 'flags' is MNT_WAIT, the caller is requesting file integrity, which also
6850 * includes additional metadata unnecessary for retrieving the file data
6851 * contents, such as atime, mtime, ctime, etc., also be committed to stable
6852 * storage.
6853 *
6854 * Parameters: p The process
6855 * uap->fd The descriptor to synchronize
6856 * flags The data integrity flags
6857 *
6858 * Returns: int Success
6859 * fp_getfvp:EBADF Bad file descriptor
6860 * fp_getfvp:ENOTSUP fd does not refer to a vnode
6861 * VNOP_FSYNC:??? unspecified
6862 *
6863 * Notes: We use struct fsync_args because it is a short name, and all
6864 * caller argument structures are otherwise identical.
6865 */
static int
fsync_common(proc_t p, struct fsync_args *uap, int flags)
{
	vnode_t vp;
	struct fileproc *fp;
	vfs_context_t ctx = vfs_context_current();
	int error;

	AUDIT_ARG(fd, uap->fd);

	if ( (error = fp_getfvp(p, uap->fd, &fp, &vp)) )
		return (error);
	if ( (error = vnode_getwithref(vp)) ) {
		file_drop(uap->fd);
		return(error);
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	/* flags is MNT_WAIT (file integrity) or MNT_DWAIT (data integrity). */
	error = VNOP_FSYNC(vp, flags, ctx);

#if NAMEDRSRCFORK
	/* Sync resource fork shadow file if necessary. */
	if ((error == 0) &&
	    (vp->v_flag & VISNAMEDSTREAM) &&
	    (vp->v_parent != NULLVP) &&
	    vnode_isshadow(vp) &&
	    (fp->f_flags & FP_WRITTEN)) {
		(void) vnode_flushnamedstream(vp->v_parent, vp, ctx);
	}
#endif

	(void)vnode_put(vp);
	file_drop(uap->fd);
	return (error);
}
6902
6903/*
6904 * Duplicate files. Source must be a file, target must be a file or
6905 * must not exist.
6906 *
6907 * XXX Copyfile authorisation checking is woefully inadequate, and will not
6908 * perform inheritance correctly.
6909 */
6910/* ARGSUSED */
int
copyfile(__unused proc_t p, struct copyfile_args *uap, __unused int32_t *retval)
{
	vnode_t tvp, fvp, tdvp, sdvp;
	struct nameidata fromnd, tond;
	int error;
	vfs_context_t ctx = vfs_context_current();
#if CONFIG_MACF
	struct filedesc *fdp = (vfs_context_proc(ctx))->p_fd;
	struct vnode_attr va;
#endif

	/* Check that the flags are valid. */

	if (uap->flags & ~CPF_MASK) {
		return(EINVAL);
	}

	/* Look up the source; on success we hold an iocount on fvp. */
	NDINIT(&fromnd, LOOKUP, OP_COPYFILE, AUDITVNPATH1,
	    UIO_USERSPACE, uap->from, ctx);
	if ((error = namei(&fromnd)))
		return (error);
	fvp = fromnd.ni_vp;

	/*
	 * Look up the target for creation; SAVESTART keeps the start
	 * directory referenced (released below as sdvp).
	 */
	NDINIT(&tond, CREATE, OP_LINK,
	    LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | AUDITVNPATH2 | CN_NBMOUNTLOOK,
	    UIO_USERSPACE, uap->to, ctx);
	if ((error = namei(&tond))) {
		goto out1;
	}
	tdvp = tond.ni_dvp;
	tvp = tond.ni_vp;

	/* An existing target may only be replaced with CPF_OVERWRITE. */
	if (tvp != NULL) {
		if (!(uap->flags & CPF_OVERWRITE)) {
			error = EEXIST;
			goto out;
		}
	}

	/* Directories may be neither the source nor the target. */
	if (fvp->v_type == VDIR || (tvp && tvp->v_type == VDIR)) {
		error = EISDIR;
		goto out;
	}

	/* This calls existing MAC hooks for open */
	if ((error = vn_authorize_open_existing(fvp, &fromnd.ni_cnd, FREAD, ctx,
	    NULL))) {
		goto out;
	}

	if (tvp) {
		/*
		 * See unlinkat_internal for an explanation of the potential
		 * ENOENT from the MAC hook but the gist is that the MAC hook
		 * can fail because vn_getpath isn't able to return the full
		 * path. We choose to ignore this failure.
		 */
		error = vn_authorize_unlink(tdvp, tvp, &tond.ni_cnd, ctx, NULL);
		if (error && error != ENOENT)
			goto out;
		error = 0;
	}

#if CONFIG_MACF
	VATTR_INIT(&va);
	VATTR_SET(&va, va_type, fvp->v_type);
	/* Mask off all but regular access permissions */
	VATTR_SET(&va, va_mode,
	    ((((uap->mode &~ fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT) & ACCESSPERMS));
	error = mac_vnode_check_create(ctx, tdvp, &tond.ni_cnd, &va);
	if (error)
		goto out;
#endif /* CONFIG_MACF */

	/* Caller must be allowed to add an entry to the target directory. */
	if ((error = vnode_authorize(tdvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0)
		goto out;

	/* Source identical to the target's parent directory is nonsense. */
	if (fvp == tdvp)
		error = EINVAL;
	/*
	 * If source is the same as the destination (that is the
	 * same inode number) then there is nothing to do.
	 * (fixed to have POSIX semantics - CSM 3/2/98)
	 */
	if (fvp == tvp)
		error = -1;
	if (!error)
		error = VNOP_COPYFILE(fvp, tdvp, tvp, &tond.ni_cnd, uap->mode, uap->flags, ctx);
out:
	sdvp = tond.ni_startdir;
	/*
	 * nameidone has to happen before we vnode_put(tdvp)
	 * since it may need to release the fs_nodelock on the tdvp
	 */
	nameidone(&tond);

	if (tvp)
		vnode_put(tvp);
	vnode_put(tdvp);
	vnode_put(sdvp);
out1:
	vnode_put(fvp);

	nameidone(&fromnd);

	/* -1 is the internal "source == target" marker; report success. */
	if (error == -1)
		return (0);
	return (error);
}
7021
7022#define CLONE_SNAPSHOT_FALLBACKS_ENABLED 1
7023
7024/*
7025 * Helper function for doing clones. The caller is expected to provide an
7026 * iocounted source vnode and release it.
7027 */
/*
 * Common implementation for clonefileat(2) and fclonefileat(2).
 *
 * fvp                  - iocounted source vnode, provided and released by
 *                        the caller
 * data_read_authorised - TRUE if read access to the source data has already
 *                        been established (fd-based path with FREAD), so the
 *                        KAUTH_VNODE_READ_DATA check on fvp is skipped
 * dst_dirfd / dst      - directory fd and user path naming the destination,
 *                        which must not already exist
 * flags                - CLONE_NOFOLLOW / CLONE_NOOWNERCOPY
 *
 * Returns 0 on success or an errno value.
 */
static int
clonefile_internal(vnode_t fvp, boolean_t data_read_authorised, int dst_dirfd,
    user_addr_t dst, uint32_t flags, vfs_context_t ctx)
{
	vnode_t tvp, tdvp;
	struct nameidata tond;
	int error;
	int follow;
	boolean_t free_src_acl;
	boolean_t attr_cleanup;
	enum vtype v_type;
	kauth_action_t action;
	struct componentname *cnp;
	uint32_t defaulted;
	struct vnode_attr va;
	struct vnode_attr nva;
	uint32_t vnop_flags;

	/*
	 * Only regular files, symlinks and (non-root, non-mount-point)
	 * directories may be cloned; pick the matching "add" right on the
	 * destination directory.
	 */
	v_type = vnode_vtype(fvp);
	switch (v_type) {
	case VLNK:
		/* FALLTHRU */
	case VREG:
		action = KAUTH_VNODE_ADD_FILE;
		break;
	case VDIR:
		if (vnode_isvroot(fvp) || vnode_ismount(fvp) ||
		    fvp->v_mountedhere) {
			return (EINVAL);
		}
		action = KAUTH_VNODE_ADD_SUBDIRECTORY;
		break;
	default:
		return (EINVAL);
	}

	AUDIT_ARG(fd2, dst_dirfd);
	AUDIT_ARG(value32, flags);

	/* Look up the destination; WANTPARENT keeps an iocount on tdvp. */
	follow = (flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
	NDINIT(&tond, CREATE, OP_LINK, follow | WANTPARENT | AUDITVNPATH2,
	    UIO_USERSPACE, dst, ctx);
	if ((error = nameiat(&tond, dst_dirfd)))
		return (error);
	cnp = &tond.ni_cnd;
	tdvp = tond.ni_dvp;
	tvp = tond.ni_vp;

	free_src_acl = FALSE;
	attr_cleanup = FALSE;

	/* Destination must not exist. */
	if (tvp != NULL) {
		error = EEXIST;
		goto out;
	}

	/* Clones cannot cross mount boundaries. */
	if (vnode_mount(tdvp) != vnode_mount(fvp)) {
		error = EXDEV;
		goto out;
	}

#if CONFIG_MACF
	if ((error = mac_vnode_check_clone(ctx, tdvp, fvp, cnp)))
		goto out;
#endif
	if ((error = vnode_authorize(tdvp, NULL, action, ctx)))
		goto out;

	/*
	 * Require generic read rights on the source; drop READ_DATA when the
	 * caller already proved it (fd opened FREAD).
	 */
	action = KAUTH_VNODE_GENERIC_READ_BITS;
	if (data_read_authorised)
		action &= ~KAUTH_VNODE_READ_DATA;
	if ((error = vnode_authorize(fvp, NULL, action, ctx)))
		goto out;

	/*
	 * certain attributes may need to be changed from the source, we ask for
	 * those here.
	 */
	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_uid);
	VATTR_WANTED(&va, va_gid);
	VATTR_WANTED(&va, va_mode);
	VATTR_WANTED(&va, va_flags);
	VATTR_WANTED(&va, va_acl);

	if ((error = vnode_getattr(fvp, &va, ctx)) != 0)
		goto out;

	VATTR_INIT(&nva);
	VATTR_SET(&nva, va_type, v_type);
	/* Carry the source ACL over; remember to free it on the way out. */
	if (VATTR_IS_SUPPORTED(&va, va_acl) && va.va_acl != NULL) {
		VATTR_SET(&nva, va_acl, va.va_acl);
		free_src_acl = TRUE;
	}

	/* Handle ACL inheritance, initialize vap. */
	if (v_type == VLNK) {
		error = vnode_authattr_new(tdvp, &nva, 0, ctx);
	} else {
		error = vn_attribute_prepare(tdvp, &nva, &defaulted, ctx);
		if (error)
			goto out;
		attr_cleanup = TRUE;
	}

	vnop_flags = VNODE_CLONEFILE_DEFAULT;
	/*
	 * We've got initial values for all security parameters,
	 * If we are superuser, then we can change owners to be the
	 * same as the source. Both superuser and the owner have default
	 * WRITE_SECURITY privileges so all other fields can be taken
	 * from source as well.
	 */
	if (!(flags & CLONE_NOOWNERCOPY) && vfs_context_issuser(ctx)) {
		if (VATTR_IS_SUPPORTED(&va, va_uid))
			VATTR_SET(&nva, va_uid, va.va_uid);
		if (VATTR_IS_SUPPORTED(&va, va_gid))
			VATTR_SET(&nva, va_gid, va.va_gid);
	} else {
		vnop_flags |= VNODE_CLONEFILE_NOOWNERCOPY;
	}

	if (VATTR_IS_SUPPORTED(&va, va_mode))
		VATTR_SET(&nva, va_mode, va.va_mode);
	if (VATTR_IS_SUPPORTED(&va, va_flags)) {
		/*
		 * Keep the destination's inherited DATAVAULT/RESTRICTED
		 * protection flags rather than the source's.
		 */
		VATTR_SET(&nva, va_flags,
		    ((va.va_flags & ~(UF_DATAVAULT | SF_RESTRICTED)) | /* Turn off from source */
		    (nva.va_flags & (UF_DATAVAULT | SF_RESTRICTED))));
	}

	error = VNOP_CLONEFILE(fvp, tdvp, &tvp, cnp, &nva, vnop_flags, ctx);

	/* Post-creation fixups: MAC label, attribute fallback, identity, events. */
	if (!error && tvp) {
		int update_flags = 0;
#if CONFIG_FSE
		int fsevent;
#endif /* CONFIG_FSE */

#if CONFIG_MACF
		(void)vnode_label(vnode_mount(tvp), tdvp, tvp, cnp,
		    VNODE_LABEL_CREATE, ctx);
#endif
		/*
		 * If some of the requested attributes weren't handled by the
		 * VNOP, use our fallback code.
		 */
		if (!VATTR_ALL_SUPPORTED(&va))
			(void)vnode_setattr_fallback(tvp, &nva, ctx);

		// Make sure the name & parent pointers are hooked up
		if (tvp->v_name == NULL)
			update_flags |= VNODE_UPDATE_NAME;
		if (tvp->v_parent == NULLVP)
			update_flags |= VNODE_UPDATE_PARENT;

		if (update_flags) {
			(void)vnode_update_identity(tvp, tdvp, cnp->cn_nameptr,
			    cnp->cn_namelen, cnp->cn_hash, update_flags);
		}

#if CONFIG_FSE
		switch (vnode_vtype(tvp)) {
		case VLNK:
			/* FALLTHRU */
		case VREG:
			fsevent = FSE_CREATE_FILE;
			break;
		case VDIR:
			fsevent = FSE_CREATE_DIR;
			break;
		default:
			goto out;
		}

		if (need_fsevent(fsevent, tvp)) {
			/*
			 * The following is a sequence of three explicit events.
			 * A pair of FSE_CLONE events representing the source and destination
			 * followed by an FSE_CREATE_[FILE | DIR] for the destination.
			 * fseventsd may coalesce the destination clone and create events
			 * into a single event resulting in the following sequence for a client
			 * FSE_CLONE (src)
			 * FSE_CLONE | FSE_CREATE (dst)
			 */
			add_fsevent(FSE_CLONE, ctx, FSE_ARG_VNODE, fvp, FSE_ARG_VNODE, tvp,
			    FSE_ARG_DONE);
			add_fsevent(fsevent, ctx, FSE_ARG_VNODE, tvp,
			    FSE_ARG_DONE);
		}
#endif /* CONFIG_FSE */
	}

out:
	/* Unified cleanup: attribute state, borrowed ACL, namei state, iocounts. */
	if (attr_cleanup)
		vn_attribute_cleanup(&nva, defaulted);
	if (free_src_acl && va.va_acl)
		kauth_acl_free(va.va_acl);
	nameidone(&tond);
	if (tvp)
		vnode_put(tvp);
	vnode_put(tdvp);
	return (error);
}
7231
7232/*
7233 * clone files or directories, target must not exist.
7234 */
7235/* ARGSUSED */
7236int
7237clonefileat(__unused proc_t p, struct clonefileat_args *uap,
7238 __unused int32_t *retval)
7239{
7240 vnode_t fvp;
7241 struct nameidata fromnd;
7242 int follow;
7243 int error;
7244 vfs_context_t ctx = vfs_context_current();
7245
7246 /* Check that the flags are valid. */
7247 if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY))
7248 return (EINVAL);
7249
7250 AUDIT_ARG(fd, uap->src_dirfd);
7251
7252 follow = (uap->flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
7253 NDINIT(&fromnd, LOOKUP, OP_COPYFILE, follow | AUDITVNPATH1,
7254 UIO_USERSPACE, uap->src, ctx);
7255 if ((error = nameiat(&fromnd, uap->src_dirfd)))
7256 return (error);
7257
7258 fvp = fromnd.ni_vp;
7259 nameidone(&fromnd);
7260
7261 error = clonefile_internal(fvp, FALSE, uap->dst_dirfd, uap->dst,
7262 uap->flags, ctx);
7263
7264 vnode_put(fvp);
7265 return (error);
7266}
7267
7268int
7269fclonefileat(__unused proc_t p, struct fclonefileat_args *uap,
7270 __unused int32_t *retval)
7271{
7272 vnode_t fvp;
7273 struct fileproc *fp;
7274 int error;
7275 vfs_context_t ctx = vfs_context_current();
7276
7277 /* Check that the flags are valid. */
7278 if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY))
7279 return (EINVAL);
7280
7281 AUDIT_ARG(fd, uap->src_fd);
7282 error = fp_getfvp(p, uap->src_fd, &fp, &fvp);
7283 if (error)
7284 return (error);
7285
7286 if ((fp->f_fglob->fg_flag & FREAD) == 0) {
7287 AUDIT_ARG(vnpath_withref, fvp, ARG_VNODE1);
7288 error = EBADF;
7289 goto out;
7290 }
7291
7292 if ((error = vnode_getwithref(fvp)))
7293 goto out;
7294
7295 AUDIT_ARG(vnpath, fvp, ARG_VNODE1);
7296
7297 error = clonefile_internal(fvp, TRUE, uap->dst_dirfd, uap->dst,
7298 uap->flags, ctx);
7299
7300 vnode_put(fvp);
7301out:
7302 file_drop(uap->src_fd);
7303 return (error);
7304}
7305
7306/*
7307 * Rename files. Source and destination must either both be directories,
7308 * or both not be directories. If target is a directory, it must be empty.
7309 */
7310/* ARGSUSED */
/*
 * Common implementation for rename(2), renameat(2) and renameatx_np(2).
 *
 * fromfd/from, tofd/to - directory fds and paths (segflg gives the
 *                        address space of the path strings)
 * flags                - VFS_RENAME_SWAP / VFS_RENAME_EXCL (mutually
 *                        exclusive)
 *
 * The function may restart itself from the top (goto retry) when it must
 * take the per-mount rename serialization lock, or when an authorization
 * or VNOP race (ENOENT/ERECYCLE) is detected; it may also re-drive a
 * partially completed compound VNOP lookup (goto continue_lookup).
 */
static int
renameat_internal(vfs_context_t ctx, int fromfd, user_addr_t from,
    int tofd, user_addr_t to, int segflg, vfs_rename_flags_t flags)
{
	if (flags & ~VFS_RENAME_FLAGS_MASK)
		return EINVAL;

	/* SWAP and EXCL are mutually exclusive. */
	if (ISSET(flags, VFS_RENAME_SWAP) && ISSET(flags, VFS_RENAME_EXCL))
		return EINVAL;

	vnode_t tvp, tdvp;
	vnode_t fvp, fdvp;
	struct nameidata *fromnd, *tond;
	int error;
	int do_retry;
	int retry_count;
	int mntrename;
	int need_event;
	int need_kpath2;
	int has_listeners;
	const char *oname = NULL;
	char *from_name = NULL, *to_name = NULL;
	int from_len=0, to_len=0;
	int holding_mntlock;
	mount_t locked_mp = NULL;
	vnode_t oparent = NULLVP;
#if CONFIG_FSE
	fse_info from_finfo, to_finfo;
#endif
	int from_truncated=0, to_truncated;
	int batched = 0;
	struct vnode_attr *fvap, *tvap;
	int continuing = 0;
	/* carving out a chunk for structs that are too big to be on stack. */
	struct {
		struct nameidata from_node, to_node;
		struct vnode_attr fv_attr, tv_attr;
	} * __rename_data;
	MALLOC(__rename_data, void *, sizeof(*__rename_data), M_TEMP, M_WAITOK);
	fromnd = &__rename_data->from_node;
	tond = &__rename_data->to_node;

	holding_mntlock = 0;
	do_retry = 0;
	retry_count = 0;
retry:
	/* Reset per-attempt state; each retry re-runs both lookups. */
	fvp = tvp = NULL;
	fdvp = tdvp = NULL;
	fvap = tvap = NULL;
	mntrename = FALSE;

	NDINIT(fromnd, DELETE, OP_UNLINK, WANTPARENT | AUDITVNPATH1,
	    segflg, from, ctx);
	fromnd->ni_flag = NAMEI_COMPOUNDRENAME;

	NDINIT(tond, RENAME, OP_RENAME, WANTPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK,
	    segflg, to, ctx);
	tond->ni_flag = NAMEI_COMPOUNDRENAME;

continue_lookup:
	/* (Re-)run the source lookup unless a compound VNOP already finished it. */
	if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
		if ( (error = nameiat(fromnd, fromfd)) )
			goto out1;
		fdvp = fromnd->ni_dvp;
		fvp = fromnd->ni_vp;

		if (fvp && fvp->v_type == VDIR)
			tond->ni_cnd.cn_flags |= WILLBEDIR;
	}

	if ((tond->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
		if ( (error = nameiat(tond, tofd)) ) {
			/*
			 * Translate error code for rename("dir1", "dir2/.").
			 */
			if (error == EISDIR && fvp->v_type == VDIR)
				error = EINVAL;
			goto out1;
		}
		tdvp = tond->ni_dvp;
		tvp = tond->ni_vp;
	}

#if DEVELOPMENT || DEBUG
	/*
	 * XXX VSWAP: Check for entitlements or special flag here
	 * so we can restrict access appropriately.
	 */
#else /* DEVELOPMENT || DEBUG */

	/* Only the kernel may rename swap files on release kernels. */
	if (fromnd->ni_vp && vnode_isswap(fromnd->ni_vp) && (ctx != vfs_context_kernel())) {
		error = EPERM;
		goto out1;
	}

	if (tond->ni_vp && vnode_isswap(tond->ni_vp) && (ctx != vfs_context_kernel())) {
		error = EPERM;
		goto out1;
	}
#endif /* DEVELOPMENT || DEBUG */

	/* RENAME_SWAP requires both endpoints to exist. */
	if (!tvp && ISSET(flags, VFS_RENAME_SWAP)) {
		error = ENOENT;
		goto out1;
	}

	/* RENAME_EXCL requires the destination not to exist. */
	if (tvp && ISSET(flags, VFS_RENAME_EXCL)) {
		error = EEXIST;
		goto out1;
	}

	batched = vnode_compound_rename_available(fdvp);

#if CONFIG_FSE
	need_event = need_fsevent(FSE_RENAME, fdvp);
	if (need_event) {
		if (fvp) {
			get_fse_info(fvp, &from_finfo, ctx);
		} else {
			error = vfs_get_notify_attributes(&__rename_data->fv_attr);
			if (error) {
				goto out1;
			}

			fvap = &__rename_data->fv_attr;
		}

		if (tvp) {
			get_fse_info(tvp, &to_finfo, ctx);
		} else if (batched) {
			error = vfs_get_notify_attributes(&__rename_data->tv_attr);
			if (error) {
				goto out1;
			}

			tvap = &__rename_data->tv_attr;
		}
	}
#else
	need_event = 0;
#endif /* CONFIG_FSE */

	has_listeners = kauth_authorize_fileop_has_listeners();

	need_kpath2 = 0;
#if CONFIG_AUDIT
	if (AUDIT_RECORD_EXISTS()) {
		need_kpath2 = 1;
	}
#endif

	/* Build the textual paths only if somebody (fsevents/kauth/audit) wants them. */
	if (need_event || has_listeners) {
		if (from_name == NULL) {
			GET_PATH(from_name);
			if (from_name == NULL) {
				error = ENOMEM;
				goto out1;
			}
		}

		from_len = safe_getpath(fdvp, fromnd->ni_cnd.cn_nameptr, from_name, MAXPATHLEN, &from_truncated);
	}

	if (need_event || need_kpath2 || has_listeners) {
		if (to_name == NULL) {
			GET_PATH(to_name);
			if (to_name == NULL) {
				error = ENOMEM;
				goto out1;
			}
		}

		to_len = safe_getpath(tdvp, tond->ni_cnd.cn_nameptr, to_name, MAXPATHLEN, &to_truncated);
		if (to_name && need_kpath2) {
			AUDIT_ARG(kpath, to_name, ARG_KPATH2);
		}
	}
	if (!fvp) {
		/*
		 * Claim: this check will never reject a valid rename.
		 * For success, either fvp must be on the same mount as tdvp, or fvp must sit atop a vnode on the same mount as tdvp.
		 * Suppose fdvp and tdvp are not on the same mount.
		 * If fvp is on the same mount as tdvp, then fvp is not on the same mount as fdvp, so fvp is the root of its filesystem.  If fvp is the root,
		 * 	then you can't move it to within another dir on the same mountpoint.
		 * If fvp sits atop a vnode on the same mount as fdvp, then that vnode must be part of the same mount as fdvp, which is a contradiction.
		 *
		 * If this check passes, then we are safe to pass these vnodes to the same FS.
		 */
		if (fdvp->v_mount != tdvp->v_mount) {
			error = EXDEV;
			goto out1;
		}
		goto skipped_lookup;
	}

	if (!batched) {
		error = vn_authorize_renamex_with_paths(fdvp, fvp, &fromnd->ni_cnd, from_name, tdvp, tvp, &tond->ni_cnd, to_name, ctx, flags, NULL);
		if (error) {
			if (error == ENOENT) {
				assert(retry_count < MAX_AUTHORIZE_ENOENT_RETRIES);
				if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
					/*
					 * We encountered a race where after doing the namei, tvp stops
					 * being valid. If so, simply re-drive the rename call from the
					 * top.
					 */
					do_retry = 1;
					retry_count += 1;
				}
			}
			goto out1;
		}
	}

	/*
	 * If the source and destination are the same (i.e. they're
	 * links to the same vnode) and the target file system is
	 * case sensitive, then there is nothing to do.
	 *
	 * XXX Come back to this.
	 */
	if (fvp == tvp) {
		int pathconf_val;

		/*
		 * Note: if _PC_CASE_SENSITIVE selector isn't supported,
		 * then assume that this file system is case sensitive.
		 */
		if (VNOP_PATHCONF(fvp, _PC_CASE_SENSITIVE, &pathconf_val, ctx) != 0 ||
		    pathconf_val != 0) {
			goto out1;
		}
	}

	/*
	 * Allow the renaming of mount points.
	 * - target must not exist
	 * - target must reside in the same directory as source
	 * - union mounts cannot be renamed
	 * - "/" cannot be renamed
	 *
	 * XXX Handle this in VFS after a continued lookup (if we missed
	 * in the cache to start off)
	 *
	 * N.B. If RENAME_SWAP is being used, then @tvp != NULL and so
	 * we'll skip past here.  The file system is responsible for
	 * checking that @tvp is not a descendent of @fvp and vice versa
	 * so it should always return EINVAL if either @tvp or @fvp is the
	 * root of a volume.
	 */
	if ((fvp->v_flag & VROOT) &&
	    (fvp->v_type == VDIR) &&
	    (tvp == NULL)  &&
	    (fvp->v_mountedhere == NULL)  &&
	    (fdvp == tdvp)  &&
	    ((fvp->v_mount->mnt_flag & (MNT_UNION | MNT_ROOTFS)) == 0)  &&
	    (fvp->v_mount->mnt_vnodecovered != NULLVP)) {
		vnode_t coveredvp;

		/* switch fvp to the covered vnode */
		coveredvp = fvp->v_mount->mnt_vnodecovered;
		if ( (vnode_getwithref(coveredvp)) ) {
			error = ENOENT;
			goto out1;
		}
		vnode_put(fvp);

		fvp = coveredvp;
		mntrename = TRUE;
	}
	/*
	 * Check for cross-device rename.
	 */
	if ((fvp->v_mount != tdvp->v_mount) ||
	    (tvp && (fvp->v_mount != tvp->v_mount))) {
		error = EXDEV;
		goto out1;
	}

	/*
	 * If source is the same as the destination (that is the
	 * same inode number) then there is nothing to do...
	 * EXCEPT if the underlying file system supports case
	 * insensitivity and is case preserving. In this case
	 * the file system needs to handle the special case of
	 * getting the same vnode as target (fvp) and source (tvp).
	 *
	 * Only file systems that support pathconf selectors _PC_CASE_SENSITIVE
	 * and _PC_CASE_PRESERVING can have this exception, and they need to
	 * handle the special case of getting the same vnode as target and
	 * source.  NOTE: Then the target is unlocked going into vnop_rename,
	 * so not to cause locking problems. There is a single reference on tvp.
	 *
	 * NOTE - that fvp == tvp also occurs if they are hard linked and
	 * that correct behaviour then is just to return success without doing
	 * anything.
	 *
	 * XXX filesystem should take care of this itself, perhaps...
	 */
	if (fvp == tvp && fdvp == tdvp) {
		if (fromnd->ni_cnd.cn_namelen == tond->ni_cnd.cn_namelen &&
		    !bcmp(fromnd->ni_cnd.cn_nameptr, tond->ni_cnd.cn_nameptr,
			  fromnd->ni_cnd.cn_namelen)) {
			goto out1;
		}
	}

	if (holding_mntlock && fvp->v_mount != locked_mp) {
		/*
		 * we're holding a reference and lock
		 * on locked_mp, but it no longer matches
		 * what we want to do... so drop our hold
		 */
		mount_unlock_renames(locked_mp);
		mount_drop(locked_mp, 0);
		holding_mntlock = 0;
	}
	if (tdvp != fdvp && fvp->v_type == VDIR) {
		/*
		 * serialize renames that re-shape
		 * the tree... if holding_mntlock is
		 * set, then we're ready to go...
		 * otherwise we
		 * first need to drop the iocounts
		 * we picked up, second take the
		 * lock to serialize the access,
		 * then finally start the lookup
		 * process over with the lock held
		 */
		if (!holding_mntlock) {
			/*
			 * need to grab a reference on
			 * the mount point before we
			 * drop all the iocounts... once
			 * the iocounts are gone, the mount
			 * could follow
			 */
			locked_mp = fvp->v_mount;
			mount_ref(locked_mp, 0);

			/*
			 * nameidone has to happen before we vnode_put(tvp)
			 * since it may need to release the fs_nodelock on the tvp
			 */
			nameidone(tond);

			if (tvp)
				vnode_put(tvp);
			vnode_put(tdvp);

			/*
			 * nameidone has to happen before we vnode_put(fdvp)
			 * since it may need to release the fs_nodelock on the fvp
			 */
			nameidone(fromnd);

			vnode_put(fvp);
			vnode_put(fdvp);

			mount_lock_renames(locked_mp);
			holding_mntlock = 1;

			goto retry;
		}
	} else {
		/*
		 * when we dropped the iocounts to take
		 * the lock, we allowed the identity of
		 * the various vnodes to change... if they did,
		 * we may no longer be dealing with a rename
		 * that reshapes the tree... once we're holding
		 * the iocounts, the vnodes can't change type
		 * so we're free to drop the lock at this point
		 * and continue on
		 */
		if (holding_mntlock) {
			mount_unlock_renames(locked_mp);
			mount_drop(locked_mp, 0);
			holding_mntlock = 0;
		}
	}

	// save these off so we can later verify that fvp is the same
	oname = fvp->v_name;
	oparent = fvp->v_parent;

skipped_lookup:
	error = vn_rename(fdvp, &fvp, &fromnd->ni_cnd, fvap,
	    tdvp, &tvp, &tond->ni_cnd, tvap,
	    flags, ctx);

	if (holding_mntlock) {
		/*
		 * we can drop our serialization
		 * lock now
		 */
		mount_unlock_renames(locked_mp);
		mount_drop(locked_mp, 0);
		holding_mntlock = 0;
	}
	if (error) {
		/* EKEEPLOOKING: the compound VNOP needs the lookup continued. */
		if (error == EKEEPLOOKING) {
			if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) == 0) {
				if ((tond->ni_flag & NAMEI_CONTLOOKUP) == 0) {
					panic("EKEEPLOOKING without NAMEI_CONTLOOKUP on either ndp?");
				}
			}

			fromnd->ni_vp = fvp;
			tond->ni_vp = tvp;

			goto continue_lookup;
		}

		/*
		 * We may encounter a race in the VNOP where the destination didn't
		 * exist when we did the namei, but it does by the time we go and
		 * try to create the entry. In this case, we should re-drive this rename
		 * call from the top again.  Currently, only HFS bubbles out ERECYCLE,
		 * but other filesystems susceptible to this race could return it, too.
		 */
		if (error == ERECYCLE) {
			do_retry = 1;
		}

		/*
		 * For compound VNOPs, the authorization callback may return
		 * ENOENT in case of racing hardlink lookups hitting the name
		 * cache, redrive the lookup.
		 */
		if (batched && error == ENOENT) {
			assert(retry_count < MAX_AUTHORIZE_ENOENT_RETRIES);
			if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
				do_retry = 1;
				retry_count += 1;
			}
		}

		goto out1;
	}

	/* call out to allow 3rd party notification of rename.
	 * Ignore result of kauth_authorize_fileop call.
	 */
	kauth_authorize_fileop(vfs_context_ucred(ctx),
	    KAUTH_FILEOP_RENAME,
	    (uintptr_t)from_name, (uintptr_t)to_name);
	if (flags & VFS_RENAME_SWAP) {
		kauth_authorize_fileop(vfs_context_ucred(ctx),
		    KAUTH_FILEOP_RENAME,
		    (uintptr_t)to_name, (uintptr_t)from_name);
	}

#if CONFIG_FSE
	if (from_name != NULL && to_name != NULL) {
		if (from_truncated || to_truncated) {
			// set it here since only the from_finfo gets reported up to user space
			from_finfo.mode |= FSE_TRUNCATED_PATH;
		}

		if (tvap && tvp) {
			vnode_get_fse_info_from_vap(tvp, &to_finfo, tvap);
		}
		if (fvap) {
			vnode_get_fse_info_from_vap(fvp, &from_finfo, fvap);
		}

		if (tvp) {
			add_fsevent(FSE_RENAME, ctx,
			    FSE_ARG_STRING, from_len, from_name,
			    FSE_ARG_FINFO, &from_finfo,
			    FSE_ARG_STRING, to_len, to_name,
			    FSE_ARG_FINFO, &to_finfo,
			    FSE_ARG_DONE);
			if (flags & VFS_RENAME_SWAP) {
				/*
				 * Strictly speaking, swap is the equivalent of
				 * *three* renames.  FSEvents clients should only take
				 * the events as a hint, so we only bother reporting
				 * two.
				 */
				add_fsevent(FSE_RENAME, ctx,
				    FSE_ARG_STRING, to_len, to_name,
				    FSE_ARG_FINFO, &to_finfo,
				    FSE_ARG_STRING, from_len, from_name,
				    FSE_ARG_FINFO, &from_finfo,
				    FSE_ARG_DONE);
			}
		} else {
			add_fsevent(FSE_RENAME, ctx,
			    FSE_ARG_STRING, from_len, from_name,
			    FSE_ARG_FINFO, &from_finfo,
			    FSE_ARG_STRING, to_len, to_name,
			    FSE_ARG_DONE);
		}
	}
#endif /* CONFIG_FSE */

	/*
	 * update filesystem's mount point data
	 */
	if (mntrename) {
		char *cp, *pathend, *mpname;
		char * tobuf;
		struct mount *mp;
		int maxlen;
		size_t len = 0;

		mp = fvp->v_mountedhere;

		if (vfs_busy(mp, LK_NOWAIT)) {
			error = EBUSY;
			goto out1;
		}
		MALLOC_ZONE(tobuf, char *, MAXPATHLEN, M_NAMEI, M_WAITOK);

		if (UIO_SEG_IS_USER_SPACE(segflg))
			error = copyinstr(to, tobuf, MAXPATHLEN, &len);
		else
			error = copystr((void *)to, tobuf, MAXPATHLEN, &len);
		if (!error) {
			/* find current mount point prefix */
			pathend = &mp->mnt_vfsstat.f_mntonname[0];
			for (cp = pathend; *cp != '\0'; ++cp) {
				if (*cp == '/')
					pathend = cp + 1;
			}
			/* find last component of target name */
			for (mpname = cp = tobuf; *cp != '\0'; ++cp) {
				if (*cp == '/')
					mpname = cp + 1;
			}
			/* append name to prefix */
			maxlen = MAXPATHLEN - (pathend - mp->mnt_vfsstat.f_mntonname);
			bzero(pathend, maxlen);
			strlcpy(pathend, mpname, maxlen);
		}
		FREE_ZONE(tobuf, MAXPATHLEN, M_NAMEI);

		vfs_unbusy(mp);
	}
	/*
	 * fix up name & parent pointers.  note that we first
	 * check that fvp has the same name/parent pointers it
	 * had before the rename call... this is a 'weak' check
	 * at best...
	 *
	 * XXX oparent and oname may not be set in the compound vnop case
	 */
	if (batched || (oname == fvp->v_name && oparent == fvp->v_parent)) {
		int update_flags;

		update_flags = VNODE_UPDATE_NAME;

		if (fdvp != tdvp)
			update_flags |= VNODE_UPDATE_PARENT;

		vnode_update_identity(fvp, tdvp, tond->ni_cnd.cn_nameptr, tond->ni_cnd.cn_namelen, tond->ni_cnd.cn_hash, update_flags);
	}
out1:
	/* Common teardown; also reached on every error path. */
	if (to_name != NULL) {
		RELEASE_PATH(to_name);
		to_name = NULL;
	}
	if (from_name != NULL) {
		RELEASE_PATH(from_name);
		from_name = NULL;
	}
	if (holding_mntlock) {
		mount_unlock_renames(locked_mp);
		mount_drop(locked_mp, 0);
		holding_mntlock = 0;
	}
	if (tdvp) {
		/*
		 * nameidone has to happen before we vnode_put(tdvp)
		 * since it may need to release the fs_nodelock on the tdvp
		 */
		nameidone(tond);

		if (tvp)
			vnode_put(tvp);
		vnode_put(tdvp);
	}
	if (fdvp) {
		/*
		 * nameidone has to happen before we vnode_put(fdvp)
		 * since it may need to release the fs_nodelock on the fdvp
		 */
		nameidone(fromnd);

		if (fvp)
			vnode_put(fvp);
		vnode_put(fdvp);
	}

	/*
	 * If things changed after we did the namei, then we will re-drive
	 * this rename call from the top.
	 */
	if (do_retry) {
		do_retry = 0;
		goto retry;
	}

	FREE(__rename_data, M_TEMP);
	return (error);
}
7919
7920int
7921rename(__unused proc_t p, struct rename_args *uap, __unused int32_t *retval)
7922{
7923 return (renameat_internal(vfs_context_current(), AT_FDCWD, uap->from,
7924 AT_FDCWD, uap->to, UIO_USERSPACE, 0));
7925}
7926
7927int renameatx_np(__unused proc_t p, struct renameatx_np_args *uap, __unused int32_t *retval)
7928{
7929 return renameat_internal(
7930 vfs_context_current(),
7931 uap->fromfd, uap->from,
7932 uap->tofd, uap->to,
7933 UIO_USERSPACE, uap->flags);
7934}
7935
7936int
7937renameat(__unused proc_t p, struct renameat_args *uap, __unused int32_t *retval)
7938{
7939 return (renameat_internal(vfs_context_current(), uap->fromfd, uap->from,
7940 uap->tofd, uap->to, UIO_USERSPACE, 0));
7941}
7942
7943/*
7944 * Make a directory file.
7945 *
7946 * Returns: 0 Success
7947 * EEXIST
7948 * namei:???
7949 * vnode_authorize:???
7950 * vn_create:???
7951 */
7952/* ARGSUSED */
/*
 * Common implementation for mkdir(2), mkdirat(2) and mkdir_extended(2).
 *
 * path/segflg - directory name and the address space it lives in
 * vap         - caller-initialized attributes (mode, optionally ACL);
 *               va_type is forced to VDIR here
 * fd          - directory fd the path is resolved against (AT_FDCWD for
 *               the plain variants)
 *
 * Uses the compound-mkdir namei protocol: vn_create() may return
 * EKEEPLOOKING, in which case the lookup is continued via the
 * continue_lookup label.
 */
static int
mkdir1at(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap, int fd,
    enum uio_seg segflg)
{
	vnode_t vp, dvp;
	int error;
	int update_flags = 0;
	int batched;
	struct nameidata nd;

	AUDIT_ARG(mode, vap->va_mode);
	NDINIT(&nd, CREATE, OP_MKDIR, LOCKPARENT | AUDITVNPATH1, segflg,
	    path, ctx);
	nd.ni_cnd.cn_flags |= WILLBEDIR;
	nd.ni_flag = NAMEI_COMPOUNDMKDIR;

continue_lookup:
	error = nameiat(&nd, fd);
	if (error)
		return (error);
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* Target already exists. */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}

	batched = vnode_compound_mkdir_available(dvp);

	VATTR_SET(vap, va_type, VDIR);

	/*
	 * XXX
	 * Don't authorize in VFS for compound VNOP.... mkdir -p today assumes that it will
	 * only get EXISTS or EISDIR for existing path components, and not that it could see
	 * EACCESS/EPERM--so if we authorize for mkdir on "/" for "mkdir -p /tmp/foo/bar/baz"
	 * it will fail in a spurious  manner.  Need to figure out if this is valid behavior.
	 */
	if ((error = vn_authorize_mkdir(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
		if (error == EACCES || error == EPERM) {
			int error2;

			/* Release the CREATE lookup before re-driving a plain one. */
			nameidone(&nd);
			vnode_put(dvp);
			dvp = NULLVP;

			/*
			 * Try a lookup without "NAMEI_COMPOUNDVNOP" to make sure we return EEXIST
			 * rather than EACCESS if the target exists.
			 */
			NDINIT(&nd, LOOKUP, OP_MKDIR, AUDITVNPATH1, segflg,
			    path, ctx);
			error2 = nameiat(&nd, fd);
			if (error2) {
				goto out;
			} else {
				vp = nd.ni_vp;
				error = EEXIST;
				goto out;
			}
		}

		goto out;
	}

	/*
	 * make the directory
	 */
	if ((error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx)) != 0) {
		/* EKEEPLOOKING: the compound VNOP wants the lookup continued. */
		if (error == EKEEPLOOKING) {
			nd.ni_vp = vp;
			goto continue_lookup;
		}

		goto out;
	}

	// Make sure the name & parent pointers are hooked up
	if (vp->v_name == NULL)
		update_flags |= VNODE_UPDATE_NAME;
	if (vp->v_parent == NULLVP)
		update_flags |= VNODE_UPDATE_PARENT;

	if (update_flags)
		vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);

#if CONFIG_FSE
	add_fsevent(FSE_CREATE_DIR, ctx, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
#endif

out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp)
		vnode_put(vp);
	if (dvp)
		vnode_put(dvp);

	return (error);
}
8058
8059/*
8060 * mkdir_extended: Create a directory; with extended security (ACL).
8061 *
8062 * Parameters: p Process requesting to create the directory
8063 * uap User argument descriptor (see below)
8064 * retval (ignored)
8065 *
8066 * Indirect: uap->path Path of directory to create
8067 * uap->mode Access permissions to set
8068 * uap->xsecurity ACL to set
8069 *
8070 * Returns: 0 Success
8071 * !0 Not success
8072 *
8073 */
8074int
8075mkdir_extended(proc_t p, struct mkdir_extended_args *uap, __unused int32_t *retval)
8076{
8077 int ciferror;
8078 kauth_filesec_t xsecdst;
8079 struct vnode_attr va;
8080
8081 AUDIT_ARG(owner, uap->uid, uap->gid);
8082
8083 xsecdst = NULL;
8084 if ((uap->xsecurity != USER_ADDR_NULL) &&
8085 ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0))
8086 return ciferror;
8087
8088 VATTR_INIT(&va);
8089 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd->fd_cmask);
8090 if (xsecdst != NULL)
8091 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
8092
8093 ciferror = mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
8094 UIO_USERSPACE);
8095 if (xsecdst != NULL)
8096 kauth_filesec_free(xsecdst);
8097 return ciferror;
8098}
8099
8100int
8101mkdir(proc_t p, struct mkdir_args *uap, __unused int32_t *retval)
8102{
8103 struct vnode_attr va;
8104
8105 VATTR_INIT(&va);
8106 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd->fd_cmask);
8107
8108 return (mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
8109 UIO_USERSPACE));
8110}
8111
8112int
8113mkdirat(proc_t p, struct mkdirat_args *uap, __unused int32_t *retval)
8114{
8115 struct vnode_attr va;
8116
8117 VATTR_INIT(&va);
8118 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd->fd_cmask);
8119
8120 return(mkdir1at(vfs_context_current(), uap->path, &va, uap->fd,
8121 UIO_USERSPACE));
8122}
8123
/*
 * rmdirat_internal: remove the directory named by dirpath, resolved
 * relative to the directory descriptor fd (AT_FDCWD for the CWD).
 *
 * Parameters:	ctx	vfs context for authorization and the VNOP calls
 *		fd	directory fd used as the base of relative lookups
 *		dirpath	address of the pathname to remove
 *		segflg	address space dirpath lives in (user vs. system)
 *
 * Returns:	0	Success
 *		!0	errno from lookup, authorization, or vn_rmdir
 */
static int
rmdirat_internal(vfs_context_t ctx, int fd, user_addr_t dirpath,
		enum uio_seg segflg)
{
	vnode_t vp, dvp;
	int error;
	struct nameidata nd;
	char     *path = NULL;
	int       len=0;
	int has_listeners = 0;
	int need_event = 0;
	int truncated = 0;
#if CONFIG_FSE
	struct vnode_attr va;
#endif /* CONFIG_FSE */
	struct vnode_attr *vap = NULL;
	int restart_count = 0;	/* bounds ENOENT-driven lookup retries */
	int batched;		/* nonzero when the FS supports compound rmdir */

	int restart_flag;

	/*
	 * This loop exists to restart rmdir in the unlikely case that two
	 * processes are simultaneously trying to remove the same directory
	 * containing orphaned appleDouble files.
	 */
	do {
		NDINIT(&nd, DELETE, OP_RMDIR, LOCKPARENT | AUDITVNPATH1,
		    segflg, dirpath, ctx);
		nd.ni_flag = NAMEI_COMPOUNDRMDIR;
continue_lookup:
		restart_flag = 0;
		vap = NULL;

		error = nameiat(&nd, fd);
		if (error)
			return (error);

		dvp = nd.ni_dvp;
		vp = nd.ni_vp;

		if (vp) {
			batched = vnode_compound_rmdir_available(vp);

			if (vp->v_flag & VROOT) {
				/*
				 * The root of a mounted filesystem cannot be deleted.
				 */
				error = EBUSY;
				goto out;
			}

#if DEVELOPMENT || DEBUG
			/*
			 * XXX VSWAP: Check for entitlements or special flag here
			 * so we can restrict access appropriately.
			 */
#else /* DEVELOPMENT || DEBUG */

			/* Swap-backing files may only be removed by the kernel itself. */
			if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
				error = EPERM;
				goto out;
			}
#endif /* DEVELOPMENT || DEBUG */

			/*
			 * Removed a check here; we used to abort if vp's vid
			 * was not the same as what we'd seen the last time around.
			 * I do not think that check was valid, because if we retry
			 * and all dirents are gone, the directory could legitimately
			 * be recycled but still be present in a situation where we would
			 * have had permission to delete. Therefore, we won't make
			 * an effort to preserve that check now that we may not have a
			 * vp here.
			 */

			if (!batched) {
				error = vn_authorize_rmdir(dvp, vp, &nd.ni_cnd, ctx, NULL);
				if (error) {
					if (error == ENOENT) {
						/* Racing lookup; redrive a bounded number of times. */
						assert(restart_count < MAX_AUTHORIZE_ENOENT_RETRIES);
						if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
							restart_flag = 1;
							restart_count += 1;
						}
					}
					goto out;
				}
			}
		} else {
			/* No vp: lookup deferred resolution to a compound rmdir VNOP. */
			batched = 1;

			if (!vnode_compound_rmdir_available(dvp)) {
				panic("No error, but no compound rmdir?");
			}
		}

#if CONFIG_FSE
		fse_info  finfo;

		need_event = need_fsevent(FSE_DELETE, dvp);
		if (need_event) {
			if (!batched) {
				get_fse_info(vp, &finfo, ctx);
			} else {
				/* Compound path: collect attrs via vap for fse after the VNOP. */
				error = vfs_get_notify_attributes(&va);
				if (error) {
					goto out;
				}

				vap = &va;
			}
		}
#endif
		has_listeners = kauth_authorize_fileop_has_listeners();
		if (need_event || has_listeners) {
			if (path == NULL) {
				GET_PATH(path);
				if (path == NULL) {
					error = ENOMEM;
					goto out;
				}
			}

			len = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated);
#if CONFIG_FSE
			if (truncated) {
				finfo.mode |= FSE_TRUNCATED_PATH;
			}
#endif
		}

		error = vn_rmdir(dvp, &vp, &nd, vap, ctx);
		nd.ni_vp = vp;
		if (vp == NULLVP) {
			/* Couldn't find a vnode */
			goto out;
		}

		if (error == EKEEPLOOKING) {
			/* Filesystem wants namei to continue the lookup. */
			goto continue_lookup;
		} else if (batched && error == ENOENT) {
			assert(restart_count < MAX_AUTHORIZE_ENOENT_RETRIES);
			if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
				/*
				 * For compound VNOPs, the authorization callback
				 * may return ENOENT in case of racing hard link lookups
				 * redrive the lookup.
				 */
				restart_flag = 1;
				restart_count += 1;
				goto out;
			}
		}
#if CONFIG_APPLEDOUBLE
		/*
		 * Special case to remove orphaned AppleDouble
		 * files. I don't like putting this in the kernel,
		 * but carbon does not like putting this in carbon either,
		 * so here we are.
		 */
		if (error == ENOTEMPTY) {
			error = rmdir_remove_orphaned_appleDouble(vp, ctx, &restart_flag);
			if (error == EBUSY) {
				goto out;
			}


			/*
			 * Assuming everything went well, we will try the RMDIR again
			 */
			if (!error)
				error = vn_rmdir(dvp, &vp, &nd, vap, ctx);
		}
#endif /* CONFIG_APPLEDOUBLE */
		/*
		 * Call out to allow 3rd party notification of delete.
		 * Ignore result of kauth_authorize_fileop call.
		 */
		if (!error) {
			if (has_listeners) {
				kauth_authorize_fileop(vfs_context_ucred(ctx),
				    KAUTH_FILEOP_DELETE,
				    (uintptr_t)vp,
				    (uintptr_t)path);
			}

			if (vp->v_flag & VISHARDLINK) {
				// see the comment in unlink1() about why we update
				// the parent of a hard link when it is removed
				vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
			}

#if CONFIG_FSE
			if (need_event) {
				if (vap) {
					vnode_get_fse_info_from_vap(vp, &finfo, vap);
				}
				add_fsevent(FSE_DELETE, ctx,
				    FSE_ARG_STRING, len, path,
				    FSE_ARG_FINFO, &finfo,
				    FSE_ARG_DONE);
			}
#endif
		}

out:
		if (path != NULL) {
			RELEASE_PATH(path);
			path = NULL;
		}
		/*
		 * nameidone has to happen before we vnode_put(dvp)
		 * since it may need to release the fs_nodelock on the dvp
		 */
		nameidone(&nd);
		vnode_put(dvp);

		if (vp)
			vnode_put(vp);

		if (restart_flag == 0) {
			/*
			 * NOTE(review): vp is used only as a wait-channel address
			 * here (not dereferenced) — it was already vnode_put above.
			 */
			wakeup_one((caddr_t)vp);
			return (error);
		}
		tsleep(vp, PVFS, "rm AD", 1);

	} while (restart_flag != 0);

	return (error);

}
8356
8357/*
8358 * Remove a directory file.
8359 */
8360/* ARGSUSED */
8361int
8362rmdir(__unused proc_t p, struct rmdir_args *uap, __unused int32_t *retval)
8363{
8364 return (rmdirat_internal(vfs_context_current(), AT_FDCWD,
8365 CAST_USER_ADDR_T(uap->path), UIO_USERSPACE));
8366}
8367
/*
 * Get direntry length padded to 8 byte alignment.
 * (Size of a struct direntry carrying a name of length namlen, with the
 * MAXPATHLEN-sized name field shrunk to fit, rounded up to 8 bytes.)
 */
#define DIRENT64_LEN(namlen) \
	((sizeof(struct direntry) + (namlen) - (MAXPATHLEN-1) + 7) & ~7)

/*
 * Get dirent length padded to 4 byte alignment.
 * (Same idea for the legacy struct dirent, whose name field is
 * __DARWIN_MAXNAMLEN + 1 bytes; +1 accounts for the NUL terminator.)
 */
#define DIRENT_LEN(namelen) \
	((sizeof(struct dirent) + (namelen + 1) - (__DARWIN_MAXNAMLEN + 1) + 3) & ~3)

/* Get the end of this dirent (address of its last byte, per d_reclen). */
#define DIRENT_END(dep) \
	(((char *)(dep)) + (dep)->d_reclen - 1)
8379
8380errno_t
8381vnode_readdir64(struct vnode *vp, struct uio *uio, int flags, int *eofflag,
8382 int *numdirent, vfs_context_t ctxp)
8383{
8384 /* Check if fs natively supports VNODE_READDIR_EXTENDED */
8385 if ((vp->v_mount->mnt_vtable->vfc_vfsflags & VFC_VFSREADDIR_EXTENDED) &&
8386 ((vp->v_mount->mnt_kern_flag & MNTK_DENY_READDIREXT) == 0)) {
8387 return VNOP_READDIR(vp, uio, flags, eofflag, numdirent, ctxp);
8388 } else {
8389 size_t bufsize;
8390 void * bufptr;
8391 uio_t auio;
8392 struct direntry *entry64;
8393 struct dirent *dep;
8394 int bytesread;
8395 int error;
8396
8397 /*
8398 * We're here because the underlying file system does not
8399 * support direnties or we mounted denying support so we must
8400 * fall back to dirents and convert them to direntries.
8401 *
8402 * Our kernel buffer needs to be smaller since re-packing will
8403 * expand each dirent. The worse case (when the name length
8404 * is 3 or less) corresponds to a struct direntry size of 32
8405 * bytes (8-byte aligned) and a struct dirent size of 12 bytes
8406 * (4-byte aligned). So having a buffer that is 3/8 the size
8407 * will prevent us from reading more than we can pack.
8408 *
8409 * Since this buffer is wired memory, we will limit the
8410 * buffer size to a maximum of 32K. We would really like to
8411 * use 32K in the MIN(), but we use magic number 87371 to
8412 * prevent uio_resid() * 3 / 8 from overflowing.
8413 */
8414 bufsize = 3 * MIN((user_size_t)uio_resid(uio), 87371u) / 8;
8415 MALLOC(bufptr, void *, bufsize, M_TEMP, M_WAITOK);
8416 if (bufptr == NULL) {
8417 return ENOMEM;
8418 }
8419
8420 auio = uio_create(1, 0, UIO_SYSSPACE, UIO_READ);
8421 uio_addiov(auio, (uintptr_t)bufptr, bufsize);
8422 auio->uio_offset = uio->uio_offset;
8423
8424 error = VNOP_READDIR(vp, auio, 0, eofflag, numdirent, ctxp);
8425
8426 dep = (struct dirent *)bufptr;
8427 bytesread = bufsize - uio_resid(auio);
8428
8429 MALLOC(entry64, struct direntry *, sizeof(struct direntry),
8430 M_TEMP, M_WAITOK);
8431 /*
8432 * Convert all the entries and copy them out to user's buffer.
8433 */
8434 while (error == 0 && (char *)dep < ((char *)bufptr + bytesread)) {
8435 size_t enbufsize = DIRENT64_LEN(dep->d_namlen);
8436
8437 if (DIRENT_END(dep) > ((char *)bufptr + bytesread) ||
8438 DIRENT_LEN(dep->d_namlen) > dep->d_reclen) {
8439 printf("%s: %s: Bad dirent recived from directory %s\n", __func__,
8440 vp->v_mount->mnt_vfsstat.f_mntonname,
8441 vp->v_name ? vp->v_name : "<unknown>");
8442 error = EIO;
8443 break;
8444 }
8445
8446 bzero(entry64, enbufsize);
8447 /* Convert a dirent to a dirent64. */
8448 entry64->d_ino = dep->d_ino;
8449 entry64->d_seekoff = 0;
8450 entry64->d_reclen = enbufsize;
8451 entry64->d_namlen = dep->d_namlen;
8452 entry64->d_type = dep->d_type;
8453 bcopy(dep->d_name, entry64->d_name, dep->d_namlen + 1);
8454
8455 /* Move to next entry. */
8456 dep = (struct dirent *)((char *)dep + dep->d_reclen);
8457
8458 /* Copy entry64 to user's buffer. */
8459 error = uiomove((caddr_t)entry64, entry64->d_reclen, uio);
8460 }
8461
8462 /* Update the real offset using the offset we got from VNOP_READDIR. */
8463 if (error == 0) {
8464 uio->uio_offset = auio->uio_offset;
8465 }
8466 uio_free(auio);
8467 FREE(bufptr, M_TEMP);
8468 FREE(entry64, M_TEMP);
8469 return (error);
8470 }
8471}
8472
8473#define GETDIRENTRIES_MAXBUFSIZE (128 * 1024 * 1024U)
8474
/*
 * Read a block of directory entries in a file system independent format.
 *
 * Shared backend for getdirentries() and getdirentries64().
 *
 * Parameters:	fd		open directory file descriptor
 *		bufp		user buffer for the entries
 *		bufsize		size of bufp (clamped to GETDIRENTRIES_MAXBUFSIZE)
 *		bytesread	out: number of bytes placed in bufp
 *		offset		out: directory offset before the read
 *		flags		VNODE_READDIR_EXTENDED selects 64-bit entries
 *
 * Returns:	0		Success
 *		EBADF		fd not open for reading
 *		EINVAL		fd does not reference a directory
 *		!0		errno from MACF, vnode, or readdir layers
 */
static int
getdirentries_common(int fd, user_addr_t bufp, user_size_t bufsize, ssize_t *bytesread,
                     off_t *offset, int flags)
{
	vnode_t vp;
	/*
	 * NOTE(review): a local copy of the current vfs context is taken;
	 * presumably to snapshot the credential for the whole call — confirm.
	 */
	struct vfs_context context = *vfs_context_current();	/* local copy */
	struct fileproc *fp;
	uio_t auio;
	int spacetype = proc_is64bit(vfs_context_proc(&context)) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	off_t loff;
	int error, eofflag, numdirent;
	char uio_buf[ UIO_SIZEOF(1) ];

	error = fp_getfvp(vfs_context_proc(&context), fd, &fp, &vp);
	if (error) {
		return (error);
	}
	if ((fp->f_fglob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}

	if (bufsize > GETDIRENTRIES_MAXBUFSIZE)
		bufsize = GETDIRENTRIES_MAXBUFSIZE;

#if CONFIG_MACF
	error = mac_file_check_change_offset(vfs_context_ucred(&context), fp->f_fglob);
	if (error)
		goto out;
#endif
	if ( (error = vnode_getwithref(vp)) ) {
		goto out;
	}
	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

unionread:
	if (vp->v_type != VDIR) {
		(void)vnode_put(vp);
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_readdir(&context, vp);
	if (error != 0) {
		(void)vnode_put(vp);
		goto out;
	}
#endif /* MAC */

	/* Read from the file's current offset; report it back via *offset. */
	loff = fp->f_fglob->fg_offset;
	auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, bufp, bufsize);

	if (flags & VNODE_READDIR_EXTENDED) {
		error = vnode_readdir64(vp, auio, flags, &eofflag, &numdirent, &context);
		fp->f_fglob->fg_offset = uio_offset(auio);
	} else {
		error = VNOP_READDIR(vp, auio, 0, &eofflag, &numdirent, &context);
		fp->f_fglob->fg_offset = uio_offset(auio);
	}
	if (error) {
		(void)vnode_put(vp);
		goto out;
	}

	/*
	 * Nothing was read: for union mounts, descend to the covered
	 * filesystem and retry the read there.
	 */
	if ((user_ssize_t)bufsize == uio_resid(auio)){
		if (union_dircheckp) {
			error = union_dircheckp(&vp, fp, &context);
			if (error == -1)
				goto unionread;
			if (error) {
				(void)vnode_put(vp);
				goto out;
			}
		}

		if ((vp->v_mount->mnt_flag & MNT_UNION)) {
			struct vnode *tvp = vp;
			if (lookup_traverse_union(tvp, &vp, &context) == 0) {
				vnode_ref(vp);
				fp->f_fglob->fg_data = (caddr_t) vp;
				fp->f_fglob->fg_offset = 0;	/* fresh offset in the lower dir */
				vnode_rele(tvp);
				vnode_put(tvp);
				goto unionread;
			}
			vp = tvp;
		}
	}

	vnode_put(vp);
	if (offset) {
		*offset = loff;
	}

	*bytesread = bufsize - uio_resid(auio);
out:
	file_drop(fd);
	return (error);
}
8580
8581
8582int
8583getdirentries(__unused struct proc *p, struct getdirentries_args *uap, int32_t *retval)
8584{
8585 off_t offset;
8586 ssize_t bytesread;
8587 int error;
8588
8589 AUDIT_ARG(fd, uap->fd);
8590 error = getdirentries_common(uap->fd, uap->buf, uap->count, &bytesread, &offset, 0);
8591
8592 if (error == 0) {
8593 if (proc_is64bit(p)) {
8594 user64_long_t base = (user64_long_t)offset;
8595 error = copyout((caddr_t)&base, uap->basep, sizeof(user64_long_t));
8596 } else {
8597 user32_long_t base = (user32_long_t)offset;
8598 error = copyout((caddr_t)&base, uap->basep, sizeof(user32_long_t));
8599 }
8600 *retval = bytesread;
8601 }
8602 return (error);
8603}
8604
8605int
8606getdirentries64(__unused struct proc *p, struct getdirentries64_args *uap, user_ssize_t *retval)
8607{
8608 off_t offset;
8609 ssize_t bytesread;
8610 int error;
8611
8612 AUDIT_ARG(fd, uap->fd);
8613 error = getdirentries_common(uap->fd, uap->buf, uap->bufsize, &bytesread, &offset, VNODE_READDIR_EXTENDED);
8614
8615 if (error == 0) {
8616 *retval = bytesread;
8617 error = copyout((caddr_t)&offset, uap->position, sizeof(off_t));
8618 }
8619 return (error);
8620}
8621
8622
8623/*
8624 * Set the mode mask for creation of filesystem nodes.
8625 * XXX implement xsecurity
8626 */
8627#define UMASK_NOXSECURITY (void *)1 /* leave existing xsecurity alone */
8628static int
8629umask1(proc_t p, int newmask, __unused kauth_filesec_t fsec, int32_t *retval)
8630{
8631 struct filedesc *fdp;
8632
8633 AUDIT_ARG(mask, newmask);
8634 proc_fdlock(p);
8635 fdp = p->p_fd;
8636 *retval = fdp->fd_cmask;
8637 fdp->fd_cmask = newmask & ALLPERMS;
8638 proc_fdunlock(p);
8639 return (0);
8640}
8641
8642/*
8643 * umask_extended: Set the mode mask for creation of filesystem nodes; with extended security (ACL).
8644 *
8645 * Parameters: p Process requesting to set the umask
8646 * uap User argument descriptor (see below)
8647 * retval umask of the process (parameter p)
8648 *
8649 * Indirect: uap->newmask umask to set
8650 * uap->xsecurity ACL to set
8651 *
8652 * Returns: 0 Success
8653 * !0 Not success
8654 *
8655 */
8656int
8657umask_extended(proc_t p, struct umask_extended_args *uap, int32_t *retval)
8658{
8659 int ciferror;
8660 kauth_filesec_t xsecdst;
8661
8662 xsecdst = KAUTH_FILESEC_NONE;
8663 if (uap->xsecurity != USER_ADDR_NULL) {
8664 if ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)
8665 return ciferror;
8666 } else {
8667 xsecdst = KAUTH_FILESEC_NONE;
8668 }
8669
8670 ciferror = umask1(p, uap->newmask, xsecdst, retval);
8671
8672 if (xsecdst != KAUTH_FILESEC_NONE)
8673 kauth_filesec_free(xsecdst);
8674 return ciferror;
8675}
8676
8677int
8678umask(proc_t p, struct umask_args *uap, int32_t *retval)
8679{
8680 return(umask1(p, uap->newmask, UMASK_NOXSECURITY, retval));
8681}
8682
/*
 * Void all references to file by ripping underlying filesystem
 * away from vnode.
 *
 * Only character and block special files may be revoked; the caller
 * must own the node or pass the superuser check.
 */
/* ARGSUSED */
int
revoke(proc_t p, struct revoke_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	struct vnode_attr va;
	vfs_context_t ctx = vfs_context_current();
	int error;
	struct nameidata nd;

	NDINIT(&nd, LOOKUP, OP_REVOKE, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
	       uap->path, ctx);
	error = namei(&nd);
	if (error)
		return (error);
	vp = nd.ni_vp;

	nameidone(&nd);

	/* revoke() is only supported on character and block devices. */
	if (!(vnode_ischr(vp) || vnode_isblk(vp))) {
		error = ENOTSUP;
		goto out;
	}

	/* A block device with a mounted filesystem cannot be revoked. */
	if (vnode_isblk(vp) && vnode_ismountedon(vp)) {
		error = EBUSY;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_revoke(ctx, vp);
	if (error)
		goto out;
#endif

	/* Caller must own the node, or be the superuser. */
	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_uid);
	if ((error = vnode_getattr(vp, &va, ctx)))
		goto out;
	if (kauth_cred_getuid(vfs_context_ucred(ctx)) != va.va_uid &&
	    (error = suser(vfs_context_ucred(ctx), &p->p_acflag)))
		goto out;
	/* Only bother revoking if someone actually holds a reference. */
	if (vp->v_usecount > 0 || (vnode_isaliased(vp)))
		VNOP_REVOKE(vp, REVOKEALL, ctx);
out:
	vnode_put(vp);
	return (error);
}
8735
8736
8737/*
 * HFS/HFS Plus SPECIFIC SYSTEM CALLS
8739 * The following system calls are designed to support features
8740 * which are specific to the HFS & HFS Plus volume formats
8741 */
8742
8743
/*
 * Obtain attribute information on objects in a directory while enumerating
 * the directory.
 *
 * Reads up to uap->count entries with the attributes named by uap->alist
 * into uap->buffer, updating uap->count with the number of entries
 * actually returned.  *retval is set to 1 at end-of-directory, 0
 * otherwise.  Transparently descends through union mounts when the top
 * layer is exhausted.
 */
/* ARGSUSED */
int
getdirentriesattr (proc_t p, struct getdirentriesattr_args *uap, int32_t *retval)
{
	vnode_t vp;
	struct fileproc *fp;
	uio_t auio = NULL;
	int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	uint32_t count = 0, savecount = 0;
	uint32_t newstate = 0;
	int error, eofflag;
	uint32_t loff = 0;
	struct attrlist attributelist;
	vfs_context_t ctx = vfs_context_current();
	int fd = uap->fd;
	char uio_buf[ UIO_SIZEOF(1) ];
	kauth_action_t action;

	AUDIT_ARG(fd, fd);

	/* Get the attributes into kernel space */
	if ((error = copyin(uap->alist, (caddr_t)&attributelist, sizeof(attributelist)))) {
		return(error);
	}
	if ((error = copyin(uap->count, (caddr_t)&count, sizeof(count)))) {
		return(error);
	}
	/* Remember the requested count so it can be reset per union layer. */
	savecount = count;
	if ( (error = fp_getfvp(p, fd, &fp, &vp)) ) {
		return (error);
	}
	if ((fp->f_fglob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}


#if CONFIG_MACF
	error = mac_file_check_change_offset(vfs_context_ucred(ctx),
	    fp->f_fglob);
	if (error)
		goto out;
#endif


	if ( (error = vnode_getwithref(vp)) )
		goto out;

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

unionread:
	if (vp->v_type != VDIR) {
		(void)vnode_put(vp);
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_readdir(ctx, vp);
	if (error != 0) {
		(void)vnode_put(vp);
		goto out;
	}
#endif /* MAC */

	/* set up the uio structure which will contain the users return buffer */
	loff = fp->f_fglob->fg_offset;
	auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, uap->buffer, uap->buffersize);

	/*
	 * If the only item requested is file names, we can let that past with
	 * just LIST_DIRECTORY.  If they want any other attributes, that means
	 * they need SEARCH as well.
	 */
	action = KAUTH_VNODE_LIST_DIRECTORY;
	if ((attributelist.commonattr & ~ATTR_CMN_NAME) ||
	    attributelist.fileattr || attributelist.dirattr)
		action |= KAUTH_VNODE_SEARCH;

	if ((error = vnode_authorize(vp, NULL, action, ctx)) == 0) {

		/* Believe it or not, uap->options only has 32-bits of valid
		 * info, so truncate before extending again */

		error = VNOP_READDIRATTR(vp, &attributelist, auio, count,
		        (u_long)(uint32_t)uap->options, &newstate, &eofflag, &count, ctx);
	}

	if (error) {
		(void) vnode_put(vp);
		goto out;
	}

	/*
	 * If we've got the last entry of a directory in a union mount
	 * then reset the eofflag and pretend there's still more to come.
	 * The next call will again set eofflag and the buffer will be empty,
	 * so traverse to the underlying directory and do the directory
	 * read there.
	 */
	if (eofflag && vp->v_mount->mnt_flag & MNT_UNION) {
		if (uio_resid(auio) < (user_ssize_t) uap->buffersize) { // Got some entries
			eofflag = 0;
		} else { // Empty buffer
			struct vnode *tvp = vp;
			if (lookup_traverse_union(tvp, &vp, ctx) == 0) {
				vnode_ref_ext(vp, fp->f_fglob->fg_flag & O_EVTONLY, 0);
				fp->f_fglob->fg_data = (caddr_t) vp;
				fp->f_fglob->fg_offset = 0; // reset index for new dir
				count = savecount;
				vnode_rele_internal(tvp, fp->f_fglob->fg_flag & O_EVTONLY, 0, 0);
				vnode_put(tvp);
				goto unionread;
			}
			vp = tvp;
		}
	}

	(void)vnode_put(vp);

	if (error)
		goto out;
	fp->f_fglob->fg_offset = uio_offset(auio); /* should be multiple of dirent, not variable */

	/* Copy results back out: entry count, dir state, and base offset. */
	if ((error = copyout((caddr_t) &count, uap->count, sizeof(count))))
		goto out;
	if ((error = copyout((caddr_t) &newstate, uap->newstate, sizeof(newstate))))
		goto out;
	if ((error = copyout((caddr_t) &loff, uap->basep, sizeof(loff))))
		goto out;

	*retval = eofflag;  /* similar to getdirentries */
	error = 0;
out:
	file_drop(fd);
	return (error); /* return error earlier, a retval of 0 or 1 now */

} /* end of getdirentriesattr system call */
8888
8889/*
8890* Exchange data between two files
8891*/
8892
8893/* ARGSUSED */
/*
 * exchangedata: atomically exchange the data of two regular files on the
 * same volume via VNOP_EXCHANGE, then swap their name-cache identities
 * (v_name and v_parent) so each vnode keeps tracking its original path.
 *
 * Returns:	0	Success
 *		EINVAL	same file twice, or a non-regular file
 *		EXDEV	files live on different volumes
 *		!0	errno from lookup, MACF, authorization, or the VNOP
 */
/* ARGSUSED */
int
exchangedata (__unused proc_t p, struct exchangedata_args *uap, __unused int32_t *retval)
{

	struct nameidata fnd, snd;
	vfs_context_t ctx = vfs_context_current();
	vnode_t fvp;
	vnode_t svp;
	int error;
	u_int32_t nameiflags;
	char *fpath = NULL;
	char *spath = NULL;
	int   flen=0, slen=0;
	int from_truncated=0, to_truncated=0;
#if CONFIG_FSE
	fse_info f_finfo, s_finfo;
#endif

	nameiflags = 0;
	if ((uap->options & FSOPT_NOFOLLOW) == 0) nameiflags |= FOLLOW;

	NDINIT(&fnd, LOOKUP, OP_EXCHANGEDATA, nameiflags | AUDITVNPATH1,
	       UIO_USERSPACE, uap->path1, ctx);

	error = namei(&fnd);
	if (error)
		goto out2;

	nameidone(&fnd);
	fvp = fnd.ni_vp;

	NDINIT(&snd, LOOKUP, OP_EXCHANGEDATA, CN_NBMOUNTLOOK | nameiflags | AUDITVNPATH2,
	       UIO_USERSPACE, uap->path2, ctx);

	error = namei(&snd);
	if (error) {
		vnode_put(fvp);
		goto out2;
	}
	nameidone(&snd);
	svp = snd.ni_vp;

	/*
	 * if the files are the same, return an inval error
	 */
	if (svp == fvp) {
		error = EINVAL;
		goto out;
	}

	/*
	 * if the files are on different volumes, return an error
	 */
	if (svp->v_mount != fvp->v_mount) {
		error = EXDEV;
		goto out;
	}

	/* If they're not files, return an error */
	if ( (vnode_isreg(fvp) == 0) || (vnode_isreg(svp) == 0)) {
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_exchangedata(ctx,
	    fvp, svp);
	if (error)
		goto out;
#endif
	/* Both files must be readable and writable by the caller. */
	if (((error = vnode_authorize(fvp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0) ||
	    ((error = vnode_authorize(svp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0))
		goto out;

	/* Collect paths and fse info up front if anyone is listening. */
	if (
#if CONFIG_FSE
	need_fsevent(FSE_EXCHANGE, fvp) ||
#endif
	kauth_authorize_fileop_has_listeners()) {
		GET_PATH(fpath);
		GET_PATH(spath);
		if (fpath == NULL || spath == NULL) {
			error = ENOMEM;
			goto out;
		}

		flen = safe_getpath(fvp, NULL, fpath, MAXPATHLEN, &from_truncated);
		slen = safe_getpath(svp, NULL, spath, MAXPATHLEN, &to_truncated);

#if CONFIG_FSE
		get_fse_info(fvp, &f_finfo, ctx);
		get_fse_info(svp, &s_finfo, ctx);
		if (from_truncated || to_truncated) {
			// set it here since only the f_finfo gets reported up to user space
			f_finfo.mode |= FSE_TRUNCATED_PATH;
		}
#endif
	}
	/* Ok, make the call */
	error = VNOP_EXCHANGE(fvp, svp, 0, ctx);

	if (error == 0) {
		const char *tmpname;

		if (fpath != NULL && spath != NULL) {
			/* call out to allow 3rd party notification of exchangedata.
			 * Ignore result of kauth_authorize_fileop call.
			 */
			kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_EXCHANGE,
					       (uintptr_t)fpath, (uintptr_t)spath);
		}
		/*
		 * Swap v_name/v_parent under the name cache lock so each
		 * vnode continues to correspond to its original pathname.
		 */
		name_cache_lock();

		tmpname = fvp->v_name;
		fvp->v_name = svp->v_name;
		svp->v_name = tmpname;

		if (fvp->v_parent != svp->v_parent) {
			vnode_t tmp;

			tmp = fvp->v_parent;
			fvp->v_parent = svp->v_parent;
			svp->v_parent = tmp;
		}
		name_cache_unlock();

#if CONFIG_FSE
		if (fpath != NULL && spath != NULL) {
			add_fsevent(FSE_EXCHANGE, ctx,
				    FSE_ARG_STRING, flen, fpath,
				    FSE_ARG_FINFO, &f_finfo,
				    FSE_ARG_STRING, slen, spath,
				    FSE_ARG_FINFO, &s_finfo,
				    FSE_ARG_DONE);
		}
#endif
	}

out:
	if (fpath != NULL)
		RELEASE_PATH(fpath);
	if (spath != NULL)
		RELEASE_PATH(spath);
	vnode_put(svp);
	vnode_put(fvp);
out2:
	return (error);
}
9042
9043/*
9044 * Return (in MB) the amount of freespace on the given vnode's volume.
9045 */
9046uint32_t freespace_mb(vnode_t vp);
9047
9048uint32_t
9049freespace_mb(vnode_t vp)
9050{
9051 vfs_update_vfsstat(vp->v_mount, vfs_context_current(), VFS_USER_EVENT);
9052 return (((uint64_t)vp->v_mount->mnt_vfsstat.f_bavail *
9053 vp->v_mount->mnt_vfsstat.f_bsize) >> 20);
9054}
9055
9056#if CONFIG_SEARCHFS
9057
9058/* ARGSUSED */
9059
/*
 * searchfs: search a volume for objects matching the supplied criteria,
 * delegating the actual matching to the filesystem via VNOP_SEARCHFS.
 *
 * Copies in the fssearchblock (32- or 64-bit layout), validates the
 * search parameter buffers, switches to the volume root, descends union
 * mount layers as tracked in the opaque searchstate, and copies the
 * match count and updated state back out.
 */
/* ARGSUSED */

int
searchfs(proc_t p, struct searchfs_args *uap, __unused int32_t *retval)
{
	vnode_t vp, tvp;
	int i, error=0;
	int fserror = 0;
	struct nameidata nd;
	struct user64_fssearchblock searchblock;
	struct searchstate *state;
	struct attrlist *returnattrs;
	struct timeval timelimit;
	void *searchparams1,*searchparams2;
	uio_t auio = NULL;
	int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	uint32_t nummatches;
	int mallocsize;
	uint32_t nameiflags;
	vfs_context_t ctx = vfs_context_current();
	char uio_buf[ UIO_SIZEOF(1) ];

	/* Start by copying in fsearchblock parameter list */
	if (IS_64BIT_PROCESS(p)) {
		error = copyin(uap->searchblock, (caddr_t) &searchblock, sizeof(searchblock));
		timelimit.tv_sec = searchblock.timelimit.tv_sec;
		timelimit.tv_usec = searchblock.timelimit.tv_usec;
	}
	else {
		struct user32_fssearchblock tmp_searchblock;

		error = copyin(uap->searchblock, (caddr_t) &tmp_searchblock, sizeof(tmp_searchblock));
		// munge into 64-bit version
		searchblock.returnattrs = CAST_USER_ADDR_T(tmp_searchblock.returnattrs);
		searchblock.returnbuffer = CAST_USER_ADDR_T(tmp_searchblock.returnbuffer);
		searchblock.returnbuffersize = tmp_searchblock.returnbuffersize;
		searchblock.maxmatches = tmp_searchblock.maxmatches;
		/*
		 * These casts are safe. We will promote the tv_sec into a 64 bit long if necessary
		 * from a 32 bit long, and tv_usec is already a signed 32 bit int.
		 */
		timelimit.tv_sec = (__darwin_time_t) tmp_searchblock.timelimit.tv_sec;
		timelimit.tv_usec = (__darwin_useconds_t) tmp_searchblock.timelimit.tv_usec;
		searchblock.searchparams1 = CAST_USER_ADDR_T(tmp_searchblock.searchparams1);
		searchblock.sizeofsearchparams1 = tmp_searchblock.sizeofsearchparams1;
		searchblock.searchparams2 = CAST_USER_ADDR_T(tmp_searchblock.searchparams2);
		searchblock.sizeofsearchparams2 = tmp_searchblock.sizeofsearchparams2;
		searchblock.searchattrs = tmp_searchblock.searchattrs;
	}
	if (error)
		return(error);

	/* Do a sanity check on sizeofsearchparams1 and sizeofsearchparams2.
	 */
	if (searchblock.sizeofsearchparams1 > SEARCHFS_MAX_SEARCHPARMS ||
	    searchblock.sizeofsearchparams2 > SEARCHFS_MAX_SEARCHPARMS)
		return(EINVAL);

	/* Now malloc a big bunch of space to hold the search parameters, the attrlists and the search state. */
	/* It all has to go into local memory and it's not that big so we might as well put it all together. */
	/* Searchparams1 shall be first so we might as well use that to hold the base address of the allocated*/
	/* block.                                                                                             */
	/*                                                                                                    */
	/* NOTE: we allocate an extra 8 bytes to account for the difference in size of the searchstate        */
	/*       due to the changes in rdar://problem/12438273.  That way if a 3rd party file system          */
	/*       assumes the size is still 556 bytes it will continue to work                                 */

	mallocsize = searchblock.sizeofsearchparams1 + searchblock.sizeofsearchparams2 +
		      sizeof(struct attrlist) + sizeof(struct searchstate) + (2*sizeof(uint32_t));

	MALLOC(searchparams1, void *, mallocsize, M_TEMP, M_WAITOK);

	/* Now set up the various pointers to the correct place in our newly allocated memory */

	searchparams2 = (void *) (((caddr_t) searchparams1) + searchblock.sizeofsearchparams1);
	returnattrs = (struct attrlist *) (((caddr_t) searchparams2) + searchblock.sizeofsearchparams2);
	state = (struct searchstate *) (((caddr_t) returnattrs) + sizeof (struct attrlist));

	/* Now copy in the stuff given our local variables. */

	if ((error = copyin(searchblock.searchparams1, searchparams1, searchblock.sizeofsearchparams1)))
		goto freeandexit;

	if ((error = copyin(searchblock.searchparams2, searchparams2, searchblock.sizeofsearchparams2)))
		goto freeandexit;

	if ((error = copyin(searchblock.returnattrs, (caddr_t) returnattrs, sizeof(struct attrlist))))
		goto freeandexit;

	if ((error = copyin(uap->state, (caddr_t) state, sizeof(struct searchstate))))
		goto freeandexit;

	/*
	 * When searching a union mount, need to set the
	 * start flag at the first call on each layer to
	 * reset state for the new volume.
	 */
	if (uap->options & SRCHFS_START)
		state->ss_union_layer = 0;
	else
		uap->options |= state->ss_union_flags;
	state->ss_union_flags = 0;

	/*
	 * Because searchparams1 and searchparams2 may contain an ATTR_CMN_NAME search parameter,
	 * which is passed in with an attrreference_t, we need to inspect the buffer manually here.
	 * The KPI does not provide us the ability to pass in the length of the buffers searchparams1
	 * and searchparams2. To obviate the need for all searchfs-supporting filesystems to
	 * validate the user-supplied data offset of the attrreference_t, we'll do it here.
	 */

	if (searchblock.searchattrs.commonattr & ATTR_CMN_NAME) {
		attrreference_t* string_ref;
		u_int32_t* start_length;
		user64_size_t param_length;

		/* validate searchparams1 */
		param_length = searchblock.sizeofsearchparams1;
		/* skip the word that specifies length of the buffer */
		start_length= (u_int32_t*) searchparams1;
		start_length= start_length+1;
		string_ref= (attrreference_t*) start_length;

		/* ensure no negative offsets or too big offsets */
		if (string_ref->attr_dataoffset < 0 ) {
			error = EINVAL;
			goto freeandexit;
		}
		if (string_ref->attr_length > MAXPATHLEN) {
			error = EINVAL;
			goto freeandexit;
		}

		/* Check for pointer overflow in the string ref */
		if (((char*) string_ref + string_ref->attr_dataoffset) < (char*) string_ref) {
			error = EINVAL;
			goto freeandexit;
		}

		if (((char*) string_ref + string_ref->attr_dataoffset) > ((char*)searchparams1 + param_length)) {
			error = EINVAL;
			goto freeandexit;
		}
		if (((char*)string_ref + string_ref->attr_dataoffset + string_ref->attr_length) > ((char*)searchparams1 + param_length)) {
			error = EINVAL;
			goto freeandexit;
		}
	}

	/* set up the uio structure which will contain the users return buffer */
	auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, searchblock.returnbuffer, searchblock.returnbuffersize);

	nameiflags = 0;
	if ((uap->options & FSOPT_NOFOLLOW) == 0) nameiflags |= FOLLOW;
	NDINIT(&nd, LOOKUP, OP_SEARCHFS, nameiflags | AUDITVNPATH1,
	       UIO_USERSPACE, uap->path, ctx);

	error = namei(&nd);
	if (error)
		goto freeandexit;
	vp = nd.ni_vp;
	nameidone(&nd);

	/*
	 * Switch to the root vnode for the volume
	 */
	error = VFS_ROOT(vnode_mount(vp), &tvp, ctx);
	vnode_put(vp);
	if (error)
		goto freeandexit;
	vp = tvp;

	/*
	 * If it's a union mount, the path lookup takes
	 * us to the top layer. But we may need to descend
	 * to a lower layer. For non-union mounts the layer
	 * is always zero.
	 */
	for (i = 0; i < (int) state->ss_union_layer; i++) {
		if ((vp->v_mount->mnt_flag & MNT_UNION) == 0)
			break;
		tvp = vp;
		vp = vp->v_mount->mnt_vnodecovered;
		if (vp == NULL) {
			vnode_put(tvp);
			error = ENOENT;
			goto freeandexit;
		}
		error = vnode_getwithref(vp);
		vnode_put(tvp);
		if (error)
			goto freeandexit;
	}

#if CONFIG_MACF
	error = mac_vnode_check_searchfs(ctx, vp, &searchblock.searchattrs);
	if (error) {
		vnode_put(vp);
		goto freeandexit;
	}
#endif


	/*
	 * If searchblock.maxmatches == 0, then skip the search. This has happened
	 * before and sometimes the underlying code doesn't deal with it well.
	 */
	 if (searchblock.maxmatches == 0) {
		nummatches = 0;
		goto saveandexit;
	 }

	/*
	 * All right, we have everything we need, so lets make that call.
	 *
	 * We keep special track of the return value from the file system:
	 * EAGAIN is an acceptable error condition that shouldn't keep us
	 * from copying out any results...
	 */

	fserror = VNOP_SEARCHFS(vp,
	    searchparams1,
	    searchparams2,
	    &searchblock.searchattrs,
	    (u_long)searchblock.maxmatches,
	    &timelimit,
	    returnattrs,
	    &nummatches,
	    (u_long)uap->scriptcode,
	    (u_long)uap->options,
	    auio,
	    (struct searchstate *) &state->ss_fsstate,
	    ctx);

	/*
	 * If it's a union mount we need to be called again
	 * to search the mounted-on filesystem.
	 */
	if ((vp->v_mount->mnt_flag & MNT_UNION) && fserror == 0) {
		state->ss_union_flags = SRCHFS_START;
		state->ss_union_layer++;	// search next layer down
		fserror = EAGAIN;
	}

saveandexit:

	vnode_put(vp);

	/* Now copy out the stuff that needs copying out. That means the number of matches, the
	   search state.  Everything was already put into the return buffer by the vop call. */

	if ((error = copyout((caddr_t) state, uap->state, sizeof(struct searchstate))) != 0)
		goto freeandexit;

	if ((error = suulong(uap->nummatches, (uint64_t)nummatches)) != 0)
		goto freeandexit;

	error = fserror;

freeandexit:

	FREE(searchparams1,M_TEMP);

	return(error);


} /* end of searchfs system call */
9326
9327#else /* CONFIG_SEARCHFS */
9328
9329int
9330searchfs(__unused proc_t p, __unused struct searchfs_args *uap, __unused int32_t *retval)
9331{
9332 return (ENOTSUP);
9333}
9334
9335#endif /* CONFIG_SEARCHFS */
9336
9337
/*
 * Namespace-handler ("nspace") state.  These globals back the fsctl-based
 * namespace/snapshot handler mechanism implemented below: lock bootstrap
 * objects, the table of in-flight events, and its bookkeeping counters.
 */

/* Lock group/attribute bootstrap objects, set up in nspace_handler_init() */
lck_grp_attr_t * nspace_group_attr;
lck_attr_t * nspace_lock_attr;
lck_grp_t * nspace_mutex_group;

/* Protects nspace_items[] and the counters below */
lck_mtx_t nspace_handler_lock;
/* Serializes handler_busy updates in wait_for_namespace_event() */
lck_mtx_t nspace_handler_exclusion_lock;

/* Snapshot epoch; 0 (or ~0) means no snapshot handler is active */
time_t snapshot_timestamp=0;
/* Non-zero allows snapshot events on virtual (disk-image) devices */
int nspace_allow_virtual_devs=0;

void nspace_handler_init(void);

/* One in-flight namespace event awaiting service by a handler process */
typedef struct nspace_item_info {
	struct vnode *vp;    /* vnode the event refers to; NULL when slot is free/done */
	void *arg;           /* optional per-event argument (e.g. a uio for snapshot events) */
	uint64_t op;         /* NAMESPACE_HANDLER_* operation bits */
	uint32_t vid;        /* vnode vid captured when the item was enqueued */
	uint32_t flags;      /* NSPACE_ITEM_* state bits (see below) */
	uint32_t token;      /* id handed to userspace to identify this item */
	uint32_t refcount;   /* number of threads waiting on this item */
} nspace_item_info;

#define MAX_NSPACE_ITEMS 128
nspace_item_info nspace_items[MAX_NSPACE_ITEMS];
uint32_t nspace_item_idx=0; // also used as the sleep/wakeup rendezvous address
uint32_t nspace_token_id=0;
uint32_t nspace_handler_timeout = 15; // seconds

/* nspace_item_info.flags: item life-cycle state bits */
#define NSPACE_ITEM_NEW 0x0001
#define NSPACE_ITEM_PROCESSING 0x0002
#define NSPACE_ITEM_DEAD 0x0004
#define NSPACE_ITEM_CANCELLED 0x0008
#define NSPACE_ITEM_DONE 0x0010
#define NSPACE_ITEM_RESET_TIMER 0x0020

/* nspace_item_info.flags: which handler type services the item */
#define NSPACE_ITEM_NSPACE_EVENT 0x0040
#define NSPACE_ITEM_SNAPSHOT_EVENT 0x0080

#define NSPACE_ITEM_ALL_EVENT_TYPES (NSPACE_ITEM_NSPACE_EVENT | NSPACE_ITEM_SNAPSHOT_EVENT)

//#pragma optimization_level 0

typedef enum {
	NSPACE_HANDLER_NSPACE = 0,
	NSPACE_HANDLER_SNAPSHOT = 1,

	NSPACE_HANDLER_COUNT,
} nspace_type_t;

/* Registration record for one handler type */
typedef struct {
	uint64_t handler_tid;       /* thread id of the registered handler thread */
	struct proc *handler_proc;  /* process acting as the handler; NULL when none */
	int handler_busy;           /* non-zero while a handler call is in flight */
} nspace_handler_t;

nspace_handler_t nspace_handlers[NSPACE_HANDLER_COUNT];

/* namespace fsctl functions */
static int nspace_flags_matches_handler(uint32_t event_flags, nspace_type_t nspace_type);
static int nspace_item_flags_for_type(nspace_type_t nspace_type);
static int nspace_open_flags_for_type(nspace_type_t nspace_type);
static nspace_type_t nspace_type_for_op(uint64_t op);
static int nspace_is_special_process(struct proc *proc);
static int vn_open_with_vp(vnode_t vp, int fmode, vfs_context_t ctx);
static int wait_for_namespace_event(namespace_handler_data *nhd, nspace_type_t nspace_type);
static int validate_namespace_args (int is64bit, int size);
static int process_namespace_fsctl(nspace_type_t nspace_type, int is64bit, u_int size, caddr_t data);
9405
9406
9407static inline int nspace_flags_matches_handler(uint32_t event_flags, nspace_type_t nspace_type)
9408{
9409 switch(nspace_type) {
9410 case NSPACE_HANDLER_NSPACE:
9411 return (event_flags & NSPACE_ITEM_ALL_EVENT_TYPES) == NSPACE_ITEM_NSPACE_EVENT;
9412 case NSPACE_HANDLER_SNAPSHOT:
9413 return (event_flags & NSPACE_ITEM_ALL_EVENT_TYPES) == NSPACE_ITEM_SNAPSHOT_EVENT;
9414 default:
9415 printf("nspace_flags_matches_handler: invalid type %u\n", (int)nspace_type);
9416 return 0;
9417 }
9418}
9419
9420static inline int nspace_item_flags_for_type(nspace_type_t nspace_type)
9421{
9422 switch(nspace_type) {
9423 case NSPACE_HANDLER_NSPACE:
9424 return NSPACE_ITEM_NSPACE_EVENT;
9425 case NSPACE_HANDLER_SNAPSHOT:
9426 return NSPACE_ITEM_SNAPSHOT_EVENT;
9427 default:
9428 printf("nspace_item_flags_for_type: invalid type %u\n", (int)nspace_type);
9429 return 0;
9430 }
9431}
9432
9433static inline int nspace_open_flags_for_type(nspace_type_t nspace_type)
9434{
9435 switch(nspace_type) {
9436 case NSPACE_HANDLER_NSPACE:
9437 return FREAD | FWRITE | O_EVTONLY;
9438 case NSPACE_HANDLER_SNAPSHOT:
9439 return FREAD | O_EVTONLY;
9440 default:
9441 printf("nspace_open_flags_for_type: invalid type %u\n", (int)nspace_type);
9442 return 0;
9443 }
9444}
9445
9446static inline nspace_type_t nspace_type_for_op(uint64_t op)
9447{
9448 switch(op & NAMESPACE_HANDLER_EVENT_TYPE_MASK) {
9449 case NAMESPACE_HANDLER_NSPACE_EVENT:
9450 return NSPACE_HANDLER_NSPACE;
9451 case NAMESPACE_HANDLER_SNAPSHOT_EVENT:
9452 return NSPACE_HANDLER_SNAPSHOT;
9453 default:
9454 printf("nspace_type_for_op: invalid op mask %llx\n", op & NAMESPACE_HANDLER_EVENT_TYPE_MASK);
9455 return NSPACE_HANDLER_NSPACE;
9456 }
9457}
9458
9459static inline int nspace_is_special_process(struct proc *proc)
9460{
9461 int i;
9462 for (i = 0; i < NSPACE_HANDLER_COUNT; i++) {
9463 if (proc == nspace_handlers[i].handler_proc)
9464 return 1;
9465 }
9466 return 0;
9467}
9468
9469void
9470nspace_handler_init(void)
9471{
9472 nspace_lock_attr = lck_attr_alloc_init();
9473 nspace_group_attr = lck_grp_attr_alloc_init();
9474 nspace_mutex_group = lck_grp_alloc_init("nspace-mutex", nspace_group_attr);
9475 lck_mtx_init(&nspace_handler_lock, nspace_mutex_group, nspace_lock_attr);
9476 lck_mtx_init(&nspace_handler_exclusion_lock, nspace_mutex_group, nspace_lock_attr);
9477 memset(&nspace_items[0], 0, sizeof(nspace_items));
9478}
9479
/*
 * Called when a process exits.  If the exiting process was registered as
 * a namespace and/or snapshot handler, deregister it and wake every
 * thread currently blocked on an event of that handler's type, marking
 * those items done so the waiters don't hang forever.
 */
void
nspace_proc_exit(struct proc *p)
{
	int i, event_mask = 0;

	/* collect the event-type bits for every handler slot this process held */
	for (i = 0; i < NSPACE_HANDLER_COUNT; i++) {
		if (p == nspace_handlers[i].handler_proc) {
			event_mask |= nspace_item_flags_for_type(i);
			nspace_handlers[i].handler_tid = 0;
			nspace_handlers[i].handler_proc = NULL;
		}
	}

	if (event_mask == 0) {
		/* not a handler process: nothing to clean up */
		return;
	}

	lck_mtx_lock(&nspace_handler_lock);
	if (event_mask & NSPACE_ITEM_SNAPSHOT_EVENT) {
		// if this process was the snapshot handler, zero snapshot_timeout
		snapshot_timestamp = 0;
	}

	//
	// unblock anyone that's waiting for the handler that died
	//
	for(i=0; i < MAX_NSPACE_ITEMS; i++) {
		if (nspace_items[i].flags & (NSPACE_ITEM_NEW | NSPACE_ITEM_PROCESSING)) {

			if ( nspace_items[i].flags & event_mask ) {

				/* drop the pending-snapshot mark before detaching the vnode */
				if (nspace_items[i].vp && (nspace_items[i].vp->v_flag & VNEEDSSNAPSHOT)) {
					vnode_lock_spin(nspace_items[i].vp);
					nspace_items[i].vp->v_flag &= ~VNEEDSSNAPSHOT;
					vnode_unlock(nspace_items[i].vp);
				}
				nspace_items[i].vp = NULL;
				nspace_items[i].vid = 0;
				nspace_items[i].flags = NSPACE_ITEM_DONE;
				nspace_items[i].token = 0;

				/* waiters in resolve_nspace_item_ext() sleep on &nspace_items[i].vp */
				wakeup((caddr_t)&(nspace_items[i].vp));
			}
		}
	}

	/* also wake any handler thread parked in wait_for_namespace_event() */
	wakeup((caddr_t)&nspace_item_idx);
	lck_mtx_unlock(&nspace_handler_lock);
}
9529
9530
/*
 * Convenience wrapper around resolve_nspace_item_ext() for callers with
 * no per-event argument to hand to the handler.
 */
int
resolve_nspace_item(struct vnode *vp, uint64_t op)
{
	return resolve_nspace_item_ext(vp, op, NULL);
}
9536
/*
 * Queue a namespace/snapshot event for 'vp' and block until the
 * userspace handler finishes it (or until the wait times out or is
 * interrupted).  If an identical (vp, op) event is already queued, the
 * caller piggy-backs on that slot instead of enqueueing a duplicate.
 *
 * Returns 0 on success (or when the event is simply not applicable),
 * EDEADLK when called from a handler process itself, ETIMEDOUT when
 * the handler never responded, or another errno from msleep()/the
 * handler's cancel token.
 */
int
resolve_nspace_item_ext(struct vnode *vp, uint64_t op, void *arg)
{
	int i, error, keep_waiting;
	struct timespec ts;
	nspace_type_t nspace_type = nspace_type_for_op(op);

	// only allow namespace events on regular files, directories and symlinks.
	if (vp->v_type != VREG && vp->v_type != VDIR && vp->v_type != VLNK) {
		return 0;
	}

	//
	// if this is a snapshot event and the vnode is on a
	// disk image just pretend nothing happened since any
	// change to the disk image will cause the disk image
	// itself to get backed up and this avoids multi-way
	// deadlocks between the snapshot handler and the ever
	// popular diskimages-helper process. the variable
	// nspace_allow_virtual_devs allows this behavior to
	// be overridden (for use by the Mobile TimeMachine
	// testing infrastructure which uses disk images)
	//
	if ( (op & NAMESPACE_HANDLER_SNAPSHOT_EVENT)
	    && (vp->v_mount != NULL)
	    && (vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV)
	    && !nspace_allow_virtual_devs) {

		return 0;
	}

	// if (thread_tid(current_thread()) == namespace_handler_tid) {
	/* no handler registered for this event type: nothing to wait for */
	if (nspace_handlers[nspace_type].handler_proc == NULL) {
		return 0;
	}

	/* a handler process blocking on itself would deadlock */
	if (nspace_is_special_process(current_proc())) {
		return EDEADLK;
	}

	lck_mtx_lock(&nspace_handler_lock);

retry:
	/* first look for an already-queued event for this same (vp, op) */
	for(i=0; i < MAX_NSPACE_ITEMS; i++) {
		if (vp == nspace_items[i].vp && op == nspace_items[i].op) {
			break;
		}
	}

	if (i >= MAX_NSPACE_ITEMS) {
		/* no match: find a free slot (flags == 0 means unused) */
		for(i=0; i < MAX_NSPACE_ITEMS; i++) {
			if (nspace_items[i].flags == 0) {
				break;
			}
		}
	} else {
		/* piggy-back on the existing item */
		nspace_items[i].refcount++;
	}

	if (i >= MAX_NSPACE_ITEMS) {
		/* table full: wait (bounded) for a slot to be released, then retry */
		ts.tv_sec = nspace_handler_timeout;
		ts.tv_nsec = 0;

		error = msleep((caddr_t)&nspace_token_id, &nspace_handler_lock, PVFS|PCATCH, "nspace-no-space", &ts);
		if (error == 0) {
			// an entry got free'd up, go see if we can get a slot
			goto retry;
		} else {
			lck_mtx_unlock(&nspace_handler_lock);
			return error;
		}
	}

	//
	// if it didn't already exist, add it. if it did exist
	// we'll get woken up when someone does a wakeup() on
	// the slot in the nspace_items table.
	//
	if (vp != nspace_items[i].vp) {
		nspace_items[i].vp = vp;
		nspace_items[i].arg = (arg == NSPACE_REARM_NO_ARG) ? NULL : arg;  // arg is {NULL, true, uio *} - only pass uio thru to the user
		nspace_items[i].op = op;
		nspace_items[i].vid = vnode_vid(vp);
		nspace_items[i].flags = NSPACE_ITEM_NEW;
		nspace_items[i].flags |= nspace_item_flags_for_type(nspace_type);
		if (nspace_items[i].flags & NSPACE_ITEM_SNAPSHOT_EVENT) {
			if (arg) {
				/* mark the vnode so pending-snapshot state is visible elsewhere */
				vnode_lock_spin(vp);
				vp->v_flag |= VNEEDSSNAPSHOT;
				vnode_unlock(vp);
			}
		}

		nspace_items[i].token = 0;
		nspace_items[i].refcount = 1;

		/* tell a parked handler thread there is a new item to service */
		wakeup((caddr_t)&nspace_item_idx);
	}

	//
	// Now go to sleep until the handler does a wakeup on this
	// slot in the nspace_items table (or we timeout).
	//
	keep_waiting = 1;
	while(keep_waiting) {
		ts.tv_sec = nspace_handler_timeout;
		ts.tv_nsec = 0;
		error = msleep((caddr_t)&(nspace_items[i].vp), &nspace_handler_lock, PVFS|PCATCH, "namespace-done", &ts);

		if (nspace_items[i].flags & NSPACE_ITEM_DONE) {
			error = 0;
		} else if (nspace_items[i].flags & NSPACE_ITEM_CANCELLED) {
			/* on cancel the handler stores the errno to return in .token */
			error = nspace_items[i].token;
		} else if (error == EWOULDBLOCK || error == ETIMEDOUT) {
			if (nspace_items[i].flags & NSPACE_ITEM_RESET_TIMER) {
				/* handler asked for more time: re-arm the timeout and keep waiting */
				nspace_items[i].flags &= ~NSPACE_ITEM_RESET_TIMER;
				continue;
			} else {
				error = ETIMEDOUT;
			}
		} else if (error == 0) {
			// hmmm, why did we get woken up?
			printf("woken up for token %d but it's not done, cancelled or timedout and error == 0.\n",
			       nspace_items[i].token);
		}

		/* last waiter out recycles the slot */
		if (--nspace_items[i].refcount == 0) {
			nspace_items[i].vp = NULL;      // clear this so that no one will match on it again
			nspace_items[i].arg = NULL;
			nspace_items[i].token = 0;      // clear this so that the handler will not find it anymore
			nspace_items[i].flags = 0;      // this clears it for re-use
		}
		/* wake threads stalled in the "nspace-no-space" wait above */
		wakeup(&nspace_token_id);
		keep_waiting = 0;
	}

	lck_mtx_unlock(&nspace_handler_lock);

	return error;
}
9677
9678int nspace_snapshot_event(vnode_t vp, time_t ctime, uint64_t op_type, void *arg)
9679{
9680 int snapshot_error = 0;
9681
9682 if (vp == NULL) {
9683 return 0;
9684 }
9685
9686 /* Swap files are special; skip them */
9687 if (vnode_isswap(vp)) {
9688 return 0;
9689 }
9690
9691 if (ctime != 0 && snapshot_timestamp != 0 && (ctime <= snapshot_timestamp || vnode_needssnapshots(vp))) {
9692 // the change time is within this epoch
9693 int error;
9694
9695 error = resolve_nspace_item_ext(vp, op_type | NAMESPACE_HANDLER_SNAPSHOT_EVENT, arg);
9696 if (error == EDEADLK) {
9697 snapshot_error = 0;
9698 } else if (error) {
9699 if (error == EAGAIN) {
9700 printf("nspace_snapshot_event: timed out waiting for namespace handler...\n");
9701 } else if (error == EINTR) {
9702 // printf("nspace_snapshot_event: got a signal while waiting for namespace handler...\n");
9703 snapshot_error = EINTR;
9704 }
9705 }
9706 }
9707
9708 return snapshot_error;
9709}
9710
9711int
9712get_nspace_item_status(struct vnode *vp, int32_t *status)
9713{
9714 int i;
9715
9716 lck_mtx_lock(&nspace_handler_lock);
9717 for(i=0; i < MAX_NSPACE_ITEMS; i++) {
9718 if (nspace_items[i].vp == vp) {
9719 break;
9720 }
9721 }
9722
9723 if (i >= MAX_NSPACE_ITEMS) {
9724 lck_mtx_unlock(&nspace_handler_lock);
9725 return ENOENT;
9726 }
9727
9728 *status = nspace_items[i].flags;
9729 lck_mtx_unlock(&nspace_handler_lock);
9730 return 0;
9731}
9732
9733
#if 0
/*
 * NOTE: compiled out (#if 0) — retained for reference.
 *
 * Build a volfs-style path ("/.vol/<fsid>/<fileid>") for 'vp' into
 * 'path'.  On entry *len is the buffer size; on exit it is the length
 * of the generated string including the NUL.  Returns 0 on success or
 * -1 when the vnode attributes could not be fetched (a placeholder
 * path is emitted in that case).
 */
static int
build_volfs_path(struct vnode *vp, char *path, int *len)
{
	struct vnode_attr va;
	int ret;

	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_fsid);
	VATTR_WANTED(&va, va_fileid);

	if (vnode_getattr(vp, &va, vfs_context_kernel()) != 0) {
		*len = snprintf(path, *len, "/non/existent/path/because/vnode_getattr/failed") + 1;
		ret = -1;
	} else {
		*len = snprintf(path, *len, "/.vol/%d/%lld", (dev_t)va.va_fsid, va.va_fileid) + 1;
		ret = 0;
	}

	return ret;
}
#endif
9756
9757//
9758// Note: this function does NOT check permissions on all of the
9759// parent directories leading to this vnode. It should only be
9760// called on behalf of a root process. Otherwise a process may
9761// get access to a file because the file itself is readable even
9762// though its parent directories would prevent access.
9763//
/*
 * Open an already-resolved vnode on behalf of a (root-only) caller,
 * taking a usecount reference on success.  The sequence is: superuser
 * check, MAC open check, kauth authorization of the computed action,
 * VNOP_OPEN, then vnode_ref_ext — with the open undone if the ref
 * fails.  Returns 0 on success or an errno.
 */
static int
vn_open_with_vp(vnode_t vp, int fmode, vfs_context_t ctx)
{
	int error, action;

	/* root only — see the warning above about unchecked parent directories */
	if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
		return error;
	}

#if CONFIG_MACF
	error = mac_vnode_check_open(ctx, vp, fmode);
	if (error)
		return error;
#endif

	/* compute action to be authorized */
	action = 0;
	if (fmode & FREAD) {
		action |= KAUTH_VNODE_READ_DATA;
	}
	if (fmode & (FWRITE | O_TRUNC)) {
		/*
		 * If we are writing, appending, and not truncating,
		 * indicate that we are appending so that if the
		 * UF_APPEND or SF_APPEND bits are set, we do not deny
		 * the open.
		 */
		if ((fmode & O_APPEND) && !(fmode & O_TRUNC)) {
			action |= KAUTH_VNODE_APPEND_DATA;
		} else {
			action |= KAUTH_VNODE_WRITE_DATA;
		}
	}

	if ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)
		return error;


	//
	// if the vnode is tagged VOPENEVT and the current process
	// has the P_CHECKOPENEVT flag set, then we or in the O_EVTONLY
	// flag to the open mode so that this open won't count against
	// the vnode when carbon delete() does a vnode_isinuse() to see
	// if a file is currently in use.  this allows spotlight
	// importers to not interfere with carbon apps that depend on
	// the no-delete-if-busy semantics of carbon delete().
	//
	if ((vp->v_flag & VOPENEVT) && (current_proc()->p_flag & P_CHECKOPENEVT)) {
		fmode |= O_EVTONLY;
	}

	if ( (error = VNOP_OPEN(vp, fmode, ctx)) ) {
		return error;
	}
	/* take a usecount ref; undo the open if that fails */
	if ( (error = vnode_ref_ext(vp, fmode, 0)) ) {
		VNOP_CLOSE(vp, fmode, ctx);
		return error;
	}

	/* Call out to allow 3rd party notification of open.
	 * Ignore result of kauth_authorize_fileop call.
	 */
#if CONFIG_MACF
	mac_vnode_notify_open(ctx, vp, fmode);
#endif
	kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_OPEN,
						   (uintptr_t)vp, 0);


	return 0;
}
9835
/*
 * Core of the FSIOC_*_HANDLER_GET fsctls: register the calling process
 * as the handler of 'nspace_type', block until a matching NSPACE_ITEM_NEW
 * item appears, open that item's vnode, wire it into the caller's fd
 * table, and copy the token/op/fd (plus optional info and objid fields)
 * out through 'nhd'.  On any failure past claiming the item, the item is
 * unblocked so its enqueuing waiter is released.  Returns 0 on success,
 * EBUSY if a handler call of this type is already in flight, EINVAL when
 * the snapshot epoch is invalid, or another errno.
 */
static int
wait_for_namespace_event(namespace_handler_data *nhd, nspace_type_t nspace_type)
{
	int i;
	int error = 0;
	int unblock = 0;
	task_t curtask;

	/* only one in-flight handler call per type */
	lck_mtx_lock(&nspace_handler_exclusion_lock);
	if (nspace_handlers[nspace_type].handler_busy) {
		lck_mtx_unlock(&nspace_handler_exclusion_lock);
		return EBUSY;
	}

	nspace_handlers[nspace_type].handler_busy = 1;
	lck_mtx_unlock(&nspace_handler_exclusion_lock);

	/*
	 * Any process that gets here will be one of the namespace handlers.
	 * As such, they should be prevented from acquiring DMG vnodes during vnode reclamation
	 * as we can cause deadlocks to occur, because the namespace handler may prevent
	 * VNOP_INACTIVE from proceeding.  Mark the current task as a P_DEPENDENCY_CAPABLE
	 * process.
	 */
	curtask = current_task();
	bsd_set_dependency_capable (curtask);

	lck_mtx_lock(&nspace_handler_lock);
	/* first caller of this type becomes the registered handler */
	if (nspace_handlers[nspace_type].handler_proc == NULL) {
		nspace_handlers[nspace_type].handler_tid = thread_tid(current_thread());
		nspace_handlers[nspace_type].handler_proc = current_proc();
	}

	/* snapshot handlers need a valid epoch (0 and ~0 mean "none") */
	if (nspace_type == NSPACE_HANDLER_SNAPSHOT &&
	    (snapshot_timestamp == 0 || snapshot_timestamp == ~0)) {
		error = EINVAL;
	}

	while (error == 0) {

		/* Try to find matching namespace item */
		for (i = 0; i < MAX_NSPACE_ITEMS; i++) {
			if (nspace_items[i].flags & NSPACE_ITEM_NEW) {
				if (nspace_flags_matches_handler(nspace_items[i].flags, nspace_type)) {
					break;
				}
			}
		}

		if (i >= MAX_NSPACE_ITEMS) {
			/* Nothing is there yet. Wait for wake up and retry */
			error = msleep((caddr_t)&nspace_item_idx, &nspace_handler_lock, PVFS|PCATCH, "namespace-items", 0);
			if ((nspace_type == NSPACE_HANDLER_SNAPSHOT) && (snapshot_timestamp == 0 || snapshot_timestamp == ~0)) {
				/* Prevent infinite loop if snapshot handler exited */
				error = EINVAL;
				break;
			}
			continue;
		}

		/* claim the item: NEW -> PROCESSING, and stamp a fresh token */
		nspace_items[i].flags &= ~NSPACE_ITEM_NEW;
		nspace_items[i].flags |= NSPACE_ITEM_PROCESSING;
		nspace_items[i].token = ++nspace_token_id;

		assert(nspace_items[i].vp);
		struct fileproc *fp;
		int32_t indx;
		int32_t fmode;
		struct proc *p = current_proc();
		vfs_context_t ctx = vfs_context_current();
		struct vnode_attr va;
		bool vn_get_succsessful = false;
		bool vn_open_successful = false;
		bool fp_alloc_successful = false;

		/*
		 * Use vnode pointer to acquire a file descriptor for
		 * hand-off to userland
		 */
		fmode = nspace_open_flags_for_type(nspace_type);
		/* validate against the vid captured at enqueue time */
		error = vnode_getwithvid(nspace_items[i].vp, nspace_items[i].vid);
		if (error) goto cleanup;
		vn_get_succsessful = true;

		error = vn_open_with_vp(nspace_items[i].vp, fmode, ctx);
		if (error) goto cleanup;
		vn_open_successful = true;

		error = falloc(p, &fp, &indx, ctx);
		if (error) goto cleanup;
		fp_alloc_successful = true;

		fp->f_fglob->fg_flag = fmode;
		fp->f_fglob->fg_ops = &vnops;
		fp->f_fglob->fg_data = (caddr_t)nspace_items[i].vp;

		/* publish the fd into the process's table */
		proc_fdlock(p);
		procfdtbl_releasefd(p, indx, NULL);
		fp_drop(p, indx, fp, 1);
		proc_fdunlock(p);

		/*
		 * All variants of the namespace handler struct support these three fields:
		 * token, flags, and the FD pointer
		 */
		error = copyout(&nspace_items[i].token, nhd->token, sizeof(uint32_t));
		if (error) goto cleanup;
		error = copyout(&nspace_items[i].op, nhd->flags, sizeof(uint64_t));
		if (error) goto cleanup;
		error = copyout(&indx, nhd->fdptr, sizeof(uint32_t));
		if (error) goto cleanup;

		/*
		 * Handle optional fields:
		 * extended version support an info ptr (offset, length), and the
		 *
		 * namedata version supports a unique per-link object ID
		 *
		 */
		if (nhd->infoptr) {
			uio_t uio = (uio_t)nspace_items[i].arg;
			uint64_t u_offset, u_length;

			if (uio) {
				u_offset = uio_offset(uio);
				u_length = uio_resid(uio);
			} else {
				u_offset = 0;
				u_length = 0;
			}
			error = copyout(&u_offset, nhd->infoptr, sizeof(uint64_t));
			if (error) goto cleanup;
			error = copyout(&u_length, nhd->infoptr + sizeof(uint64_t), sizeof(uint64_t));
			if (error) goto cleanup;
		}

		if (nhd->objid) {
			VATTR_INIT(&va);
			VATTR_WANTED(&va, va_linkid);
			error = vnode_getattr(nspace_items[i].vp, &va, ctx);
			if (error) goto cleanup;

			uint64_t linkid = 0;
			if (VATTR_IS_SUPPORTED (&va, va_linkid)) {
				linkid = (uint64_t)va.va_linkid;
			}
			error = copyout(&linkid, nhd->objid, sizeof(uint64_t));
		}
cleanup:
		/* unwind in reverse order of acquisition; flag the item for unblock */
		if (error) {
			if (fp_alloc_successful) fp_free(p, indx, fp);
			if (vn_open_successful) vn_close(nspace_items[i].vp, fmode, ctx);
			unblock = 1;
		}

		if (vn_get_succsessful) vnode_put(nspace_items[i].vp);

		break;
	}

	if (unblock) {
		/* hand-off failed: finish the item so its waiter is released */
		if (nspace_items[i].vp && (nspace_items[i].vp->v_flag & VNEEDSSNAPSHOT)) {
			vnode_lock_spin(nspace_items[i].vp);
			nspace_items[i].vp->v_flag &= ~VNEEDSSNAPSHOT;
			vnode_unlock(nspace_items[i].vp);
		}
		nspace_items[i].vp = NULL;
		nspace_items[i].vid = 0;
		nspace_items[i].flags = NSPACE_ITEM_DONE;
		nspace_items[i].token = 0;

		wakeup((caddr_t)&(nspace_items[i].vp));
	}

	if (nspace_type == NSPACE_HANDLER_SNAPSHOT) {
		// just go through every snapshot event and unblock it immediately.
		if (error && (snapshot_timestamp == 0 || snapshot_timestamp == ~0)) {
			for(i = 0; i < MAX_NSPACE_ITEMS; i++) {
				if (nspace_items[i].flags & NSPACE_ITEM_NEW) {
					if (nspace_flags_matches_handler(nspace_items[i].flags, nspace_type)) {
						nspace_items[i].vp = NULL;
						nspace_items[i].vid = 0;
						nspace_items[i].flags = NSPACE_ITEM_DONE;
						nspace_items[i].token = 0;

						wakeup((caddr_t)&(nspace_items[i].vp));
					}
				}
			}
		}
	}

	lck_mtx_unlock(&nspace_handler_lock);

	/* allow the next handler call of this type */
	lck_mtx_lock(&nspace_handler_exclusion_lock);
	nspace_handlers[nspace_type].handler_busy = 0;
	lck_mtx_unlock(&nspace_handler_exclusion_lock);

	return error;
}
10036
10037static inline int validate_namespace_args (int is64bit, int size) {
10038
10039 if (is64bit) {
10040 /* Must be one of these */
10041 if (size == sizeof(user64_namespace_handler_info)) {
10042 goto sizeok;
10043 }
10044 if (size == sizeof(user64_namespace_handler_info_ext)) {
10045 goto sizeok;
10046 }
10047 if (size == sizeof(user64_namespace_handler_data)) {
10048 goto sizeok;
10049 }
10050 return EINVAL;
10051 }
10052 else {
10053 /* 32 bit -- must be one of these */
10054 if (size == sizeof(user32_namespace_handler_info)) {
10055 goto sizeok;
10056 }
10057 if (size == sizeof(user32_namespace_handler_info_ext)) {
10058 goto sizeok;
10059 }
10060 if (size == sizeof(user32_namespace_handler_data)) {
10061 goto sizeok;
10062 }
10063 return EINVAL;
10064 }
10065
10066sizeok:
10067
10068 return 0;
10069
10070}
10071
/*
 * Common entry point for the namespace/snapshot handler fsctls.
 * Validates the (root-only) caller and the argument size, unpacks the
 * 32/64-bit userland struct in 'data' into a kernel-only
 * namespace_handler_data of user addresses, then blocks in
 * wait_for_namespace_event() until an event is handed to the caller.
 */
static int process_namespace_fsctl(nspace_type_t nspace_type, int is64bit, u_int size, caddr_t data)
{
	int error = 0;
	namespace_handler_data nhd;

	/* pre-zero so optional fields default to 0 when the caller's struct omits them */
	bzero (&nhd, sizeof(namespace_handler_data));

	if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
		return error;
	}

	error = validate_namespace_args (is64bit, size);
	if (error) {
		return error;
	}

	/* Copy in the userland pointers into our kernel-only struct */

	if (is64bit) {
		/* 64 bit userland structures */
		nhd.token = (user_addr_t)((user64_namespace_handler_info *)data)->token;
		nhd.flags = (user_addr_t)((user64_namespace_handler_info *)data)->flags;
		nhd.fdptr = (user_addr_t)((user64_namespace_handler_info *)data)->fdptr;

		/* If the size is greater than the standard info struct, add in extra fields */
		if (size > (sizeof(user64_namespace_handler_info))) {
			if (size >= (sizeof(user64_namespace_handler_info_ext))) {
				nhd.infoptr = (user_addr_t)((user64_namespace_handler_info_ext *)data)->infoptr;
			}
			if (size == (sizeof(user64_namespace_handler_data))) {
				nhd.objid = (user_addr_t)((user64_namespace_handler_data*)data)->objid;
			}
			/* Otherwise the fields were pre-zeroed when we did the bzero above. */
		}
	}
	else {
		/* 32 bit userland structures */
		nhd.token = CAST_USER_ADDR_T(((user32_namespace_handler_info *)data)->token);
		nhd.flags = CAST_USER_ADDR_T(((user32_namespace_handler_info *)data)->flags);
		nhd.fdptr = CAST_USER_ADDR_T(((user32_namespace_handler_info *)data)->fdptr);

		if (size > (sizeof(user32_namespace_handler_info))) {
			if (size >= (sizeof(user32_namespace_handler_info_ext))) {
				nhd.infoptr = CAST_USER_ADDR_T(((user32_namespace_handler_info_ext *)data)->infoptr);
			}
			if (size == (sizeof(user32_namespace_handler_data))) {
				nhd.objid = (user_addr_t)((user32_namespace_handler_data*)data)->objid;
			}
			/* Otherwise the fields were pre-zeroed when we did the bzero above. */
		}
	}

	return wait_for_namespace_event(&nhd, nspace_type);
}
10126
10127static unsigned long
10128fsctl_bogus_command_compat(unsigned long cmd)
10129{
10130
10131 switch (cmd) {
10132 case IOCBASECMD(FSIOC_SYNC_VOLUME):
10133 return (FSIOC_SYNC_VOLUME);
10134 case IOCBASECMD(FSIOC_ROUTEFS_SETROUTEID):
10135 return (FSIOC_ROUTEFS_SETROUTEID);
10136 case IOCBASECMD(FSIOC_SET_PACKAGE_EXTS):
10137 return (FSIOC_SET_PACKAGE_EXTS);
10138 case IOCBASECMD(FSIOC_NAMESPACE_HANDLER_GET):
10139 return (FSIOC_NAMESPACE_HANDLER_GET);
10140 case IOCBASECMD(FSIOC_OLD_SNAPSHOT_HANDLER_GET):
10141 return (FSIOC_OLD_SNAPSHOT_HANDLER_GET);
10142 case IOCBASECMD(FSIOC_SNAPSHOT_HANDLER_GET_EXT):
10143 return (FSIOC_SNAPSHOT_HANDLER_GET_EXT);
10144 case IOCBASECMD(FSIOC_NAMESPACE_HANDLER_UPDATE):
10145 return (FSIOC_NAMESPACE_HANDLER_UPDATE);
10146 case IOCBASECMD(FSIOC_NAMESPACE_HANDLER_UNBLOCK):
10147 return (FSIOC_NAMESPACE_HANDLER_UNBLOCK);
10148 case IOCBASECMD(FSIOC_NAMESPACE_HANDLER_CANCEL):
10149 return (FSIOC_NAMESPACE_HANDLER_CANCEL);
10150 case IOCBASECMD(FSIOC_NAMESPACE_HANDLER_SET_SNAPSHOT_TIME):
10151 return (FSIOC_NAMESPACE_HANDLER_SET_SNAPSHOT_TIME);
10152 case IOCBASECMD(FSIOC_NAMESPACE_ALLOW_DMG_SNAPSHOT_EVENTS):
10153 return (FSIOC_NAMESPACE_ALLOW_DMG_SNAPSHOT_EVENTS);
10154 case IOCBASECMD(FSIOC_SET_FSTYPENAME_OVERRIDE):
10155 return (FSIOC_SET_FSTYPENAME_OVERRIDE);
10156 case IOCBASECMD(DISK_CONDITIONER_IOC_GET):
10157 return (DISK_CONDITIONER_IOC_GET);
10158 case IOCBASECMD(DISK_CONDITIONER_IOC_SET):
10159 return (DISK_CONDITIONER_IOC_SET);
10160 case IOCBASECMD(FSIOC_FIOSEEKHOLE):
10161 return (FSIOC_FIOSEEKHOLE);
10162 case IOCBASECMD(FSIOC_FIOSEEKDATA):
10163 return (FSIOC_FIOSEEKDATA);
10164 case IOCBASECMD(SPOTLIGHT_IOC_GET_MOUNT_TIME):
10165 return (SPOTLIGHT_IOC_GET_MOUNT_TIME);
10166 case IOCBASECMD(SPOTLIGHT_IOC_GET_LAST_MTIME):
10167 return (SPOTLIGHT_IOC_GET_LAST_MTIME);
10168 }
10169
10170 return (cmd);
10171}
10172
10173/*
10174 * Make a filesystem-specific control call:
10175 */
10176/* ARGSUSED */
10177static int
10178fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long options, vfs_context_t ctx)
10179{
10180 int error=0;
10181 boolean_t is64bit;
10182 u_int size;
10183#define STK_PARAMS 128
10184 char stkbuf[STK_PARAMS] = {0};
10185 caddr_t data, memp;
10186 vnode_t vp = *arg_vp;
10187
10188 cmd = fsctl_bogus_command_compat(cmd);
10189
10190 size = IOCPARM_LEN(cmd);
10191 if (size > IOCPARM_MAX) return (EINVAL);
10192
10193 is64bit = proc_is64bit(p);
10194
10195 memp = NULL;
10196
10197 if (size > sizeof (stkbuf)) {
10198 if ((memp = (caddr_t)kalloc(size)) == 0) return ENOMEM;
10199 data = memp;
10200 } else {
10201 data = &stkbuf[0];
10202 };
10203
10204 if (cmd & IOC_IN) {
10205 if (size) {
10206 error = copyin(udata, data, size);
10207 if (error) {
10208 if (memp) {
10209 kfree (memp, size);
10210 }
10211 return error;
10212 }
10213 } else {
10214 if (is64bit) {
10215 *(user_addr_t *)data = udata;
10216 }
10217 else {
10218 *(uint32_t *)data = (uint32_t)udata;
10219 }
10220 };
10221 } else if ((cmd & IOC_OUT) && size) {
10222 /*
10223 * Zero the buffer so the user always
10224 * gets back something deterministic.
10225 */
10226 bzero(data, size);
10227 } else if (cmd & IOC_VOID) {
10228 if (is64bit) {
10229 *(user_addr_t *)data = udata;
10230 }
10231 else {
10232 *(uint32_t *)data = (uint32_t)udata;
10233 }
10234 }
10235
10236 /* Check to see if it's a generic command */
10237 switch (cmd) {
10238
10239 case FSIOC_SYNC_VOLUME: {
10240 mount_t mp = vp->v_mount;
10241 int arg = *(uint32_t*)data;
10242
10243 /* record vid of vp so we can drop it below. */
10244 uint32_t vvid = vp->v_id;
10245
10246 /*
10247 * Then grab mount_iterref so that we can release the vnode.
10248 * Without this, a thread may call vnode_iterate_prepare then
10249 * get into a deadlock because we've never released the root vp
10250 */
10251 error = mount_iterref (mp, 0);
10252 if (error) {
10253 break;
10254 }
10255 vnode_put(vp);
10256
10257 /* issue the sync for this volume */
10258 (void)sync_callback(mp, (arg & FSCTL_SYNC_WAIT) ? &arg : NULL);
10259
10260 /*
10261 * Then release the mount_iterref once we're done syncing; it's not
10262 * needed for the VNOP_IOCTL below
10263 */
10264 mount_iterdrop(mp);
10265
10266 if (arg & FSCTL_SYNC_FULLSYNC) {
10267 /* re-obtain vnode iocount on the root vp, if possible */
10268 error = vnode_getwithvid (vp, vvid);
10269 if (error == 0) {
10270 error = VNOP_IOCTL(vp, F_FULLFSYNC, (caddr_t)NULL, 0, ctx);
10271 vnode_put (vp);
10272 }
10273 }
10274 /* mark the argument VP as having been released */
10275 *arg_vp = NULL;
10276 }
10277 break;
10278
10279 case FSIOC_ROUTEFS_SETROUTEID: {
10280#if ROUTEFS
10281 char routepath[MAXPATHLEN];
10282 size_t len = 0;
10283
10284 if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
10285 break;
10286 }
10287 bzero(routepath, MAXPATHLEN);
10288 error = copyinstr(udata, &routepath[0], MAXPATHLEN, &len);
10289 if (error) {
10290 break;
10291 }
10292 error = routefs_kernel_mount(routepath);
10293 if (error) {
10294 break;
10295 }
10296#endif
10297 }
10298 break;
10299
10300 case FSIOC_SET_PACKAGE_EXTS: {
10301 user_addr_t ext_strings;
10302 uint32_t num_entries;
10303 uint32_t max_width;
10304
10305 if ((error = priv_check_cred(kauth_cred_get(), PRIV_PACKAGE_EXTENSIONS, 0)))
10306 break;
10307
10308 if ( (is64bit && size != sizeof(user64_package_ext_info))
10309 || (is64bit == 0 && size != sizeof(user32_package_ext_info))) {
10310
10311 // either you're 64-bit and passed a 64-bit struct or
10312 // you're 32-bit and passed a 32-bit struct. otherwise
10313 // it's not ok.
10314 error = EINVAL;
10315 break;
10316 }
10317
10318 if (is64bit) {
10319 ext_strings = ((user64_package_ext_info *)data)->strings;
10320 num_entries = ((user64_package_ext_info *)data)->num_entries;
10321 max_width = ((user64_package_ext_info *)data)->max_width;
10322 } else {
10323 ext_strings = CAST_USER_ADDR_T(((user32_package_ext_info *)data)->strings);
10324 num_entries = ((user32_package_ext_info *)data)->num_entries;
10325 max_width = ((user32_package_ext_info *)data)->max_width;
10326 }
10327 error = set_package_extensions_table(ext_strings, num_entries, max_width);
10328 }
10329 break;
10330
10331 /* namespace handlers */
10332 case FSIOC_NAMESPACE_HANDLER_GET: {
10333 error = process_namespace_fsctl(NSPACE_HANDLER_NSPACE, is64bit, size, data);
10334 }
10335 break;
10336
10337 /* Snapshot handlers */
10338 case FSIOC_OLD_SNAPSHOT_HANDLER_GET: {
10339 error = process_namespace_fsctl(NSPACE_HANDLER_SNAPSHOT, is64bit, size, data);
10340 }
10341 break;
10342
10343 case FSIOC_SNAPSHOT_HANDLER_GET_EXT: {
10344 error = process_namespace_fsctl(NSPACE_HANDLER_SNAPSHOT, is64bit, size, data);
10345 }
10346 break;
10347
10348 case FSIOC_NAMESPACE_HANDLER_UPDATE: {
10349 uint32_t token, val;
10350 int i;
10351
10352 if ((error = suser(kauth_cred_get(), &(p->p_acflag)))) {
10353 break;
10354 }
10355
10356 if (!nspace_is_special_process(p)) {
10357 error = EINVAL;
10358 break;
10359 }
10360
10361 token = ((uint32_t *)data)[0];
10362 val = ((uint32_t *)data)[1];
10363
10364 lck_mtx_lock(&nspace_handler_lock);
10365
10366 for(i=0; i < MAX_NSPACE_ITEMS; i++) {
10367 if (nspace_items[i].token == token) {
10368 break; /* exit for loop, not case stmt */
10369 }
10370 }
10371
10372 if (i >= MAX_NSPACE_ITEMS) {
10373 error = ENOENT;
10374 } else {
10375 //
10376 // if this bit is set, when resolve_nspace_item() times out
10377 // it will loop and go back to sleep.
10378 //
10379 nspace_items[i].flags |= NSPACE_ITEM_RESET_TIMER;
10380 }
10381
10382 lck_mtx_unlock(&nspace_handler_lock);
10383
10384 if (error) {
10385 printf("nspace-handler-update: did not find token %u\n", token);
10386 }
10387 }
10388 break;
10389
10390 case FSIOC_NAMESPACE_HANDLER_UNBLOCK: {
10391 uint32_t token, val;
10392 int i;
10393
10394 if ((error = suser(kauth_cred_get(), &(p->p_acflag)))) {
10395 break;
10396 }
10397
10398 if (!nspace_is_special_process(p)) {
10399 error = EINVAL;
10400 break;
10401 }
10402
10403 token = ((uint32_t *)data)[0];
10404 val = ((uint32_t *)data)[1];
10405
10406 lck_mtx_lock(&nspace_handler_lock);
10407
10408 for(i=0; i < MAX_NSPACE_ITEMS; i++) {
10409 if (nspace_items[i].token == token) {
10410 break; /* exit for loop, not case statement */
10411 }
10412 }
10413
10414 if (i >= MAX_NSPACE_ITEMS) {
10415 printf("nspace-handler-unblock: did not find token %u\n", token);
10416 error = ENOENT;
10417 } else {
10418 if (val == 0 && nspace_items[i].vp) {
10419 vnode_lock_spin(nspace_items[i].vp);
10420 nspace_items[i].vp->v_flag &= ~VNEEDSSNAPSHOT;
10421 vnode_unlock(nspace_items[i].vp);
10422 }
10423
10424 nspace_items[i].vp = NULL;
10425 nspace_items[i].arg = NULL;
10426 nspace_items[i].op = 0;
10427 nspace_items[i].vid = 0;
10428 nspace_items[i].flags = NSPACE_ITEM_DONE;
10429 nspace_items[i].token = 0;
10430
10431 wakeup((caddr_t)&(nspace_items[i].vp));
10432 }
10433
10434 lck_mtx_unlock(&nspace_handler_lock);
10435 }
10436 break;
10437
10438 case FSIOC_NAMESPACE_HANDLER_CANCEL: {
10439 uint32_t token, val;
10440 int i;
10441
10442 if ((error = suser(kauth_cred_get(), &(p->p_acflag)))) {
10443 break;
10444 }
10445
10446 if (!nspace_is_special_process(p)) {
10447 error = EINVAL;
10448 break;
10449 }
10450
10451 token = ((uint32_t *)data)[0];
10452 val = ((uint32_t *)data)[1];
10453
10454 lck_mtx_lock(&nspace_handler_lock);
10455
10456 for(i=0; i < MAX_NSPACE_ITEMS; i++) {
10457 if (nspace_items[i].token == token) {
10458 break; /* exit for loop, not case stmt */
10459 }
10460 }
10461
10462 if (i >= MAX_NSPACE_ITEMS) {
10463 printf("nspace-handler-cancel: did not find token %u\n", token);
10464 error = ENOENT;
10465 } else {
10466 if (nspace_items[i].vp) {
10467 vnode_lock_spin(nspace_items[i].vp);
10468 nspace_items[i].vp->v_flag &= ~VNEEDSSNAPSHOT;
10469 vnode_unlock(nspace_items[i].vp);
10470 }
10471
10472 nspace_items[i].vp = NULL;
10473 nspace_items[i].arg = NULL;
10474 nspace_items[i].vid = 0;
10475 nspace_items[i].token = val;
10476 nspace_items[i].flags &= ~NSPACE_ITEM_PROCESSING;
10477 nspace_items[i].flags |= NSPACE_ITEM_CANCELLED;
10478
10479 wakeup((caddr_t)&(nspace_items[i].vp));
10480 }
10481
10482 lck_mtx_unlock(&nspace_handler_lock);
10483 }
10484 break;
10485
10486 case FSIOC_NAMESPACE_HANDLER_SET_SNAPSHOT_TIME: {
10487 if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
10488 break;
10489 }
10490
10491 // we explicitly do not do the namespace_handler_proc check here
10492
10493 lck_mtx_lock(&nspace_handler_lock);
10494 snapshot_timestamp = ((uint32_t *)data)[0];
10495 wakeup(&nspace_item_idx);
10496 lck_mtx_unlock(&nspace_handler_lock);
10497 printf("nspace-handler-set-snapshot-time: %d\n", (int)snapshot_timestamp);
10498
10499 }
10500 break;
10501
10502 case FSIOC_NAMESPACE_ALLOW_DMG_SNAPSHOT_EVENTS:
10503 {
10504 if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
10505 break;
10506 }
10507
10508 lck_mtx_lock(&nspace_handler_lock);
10509 nspace_allow_virtual_devs = ((uint32_t *)data)[0];
10510 lck_mtx_unlock(&nspace_handler_lock);
10511 printf("nspace-snapshot-handler will%s allow events on disk-images\n",
10512 nspace_allow_virtual_devs ? "" : " NOT");
10513 error = 0;
10514
10515 }
10516 break;
10517
10518 case FSIOC_SET_FSTYPENAME_OVERRIDE:
10519 {
10520 if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
10521 break;
10522 }
10523 if (vp->v_mount) {
10524 mount_lock(vp->v_mount);
10525 if (data[0] != 0) {
10526 strlcpy(&vp->v_mount->fstypename_override[0], data, MFSTYPENAMELEN);
10527 vp->v_mount->mnt_kern_flag |= MNTK_TYPENAME_OVERRIDE;
10528 if (vfs_isrdonly(vp->v_mount) && strcmp(vp->v_mount->fstypename_override, "mtmfs") == 0) {
10529 vp->v_mount->mnt_kern_flag |= MNTK_EXTENDED_SECURITY;
10530 vp->v_mount->mnt_kern_flag &= ~MNTK_AUTH_OPAQUE;
10531 }
10532 } else {
10533 if (strcmp(vp->v_mount->fstypename_override, "mtmfs") == 0) {
10534 vp->v_mount->mnt_kern_flag &= ~MNTK_EXTENDED_SECURITY;
10535 }
10536 vp->v_mount->mnt_kern_flag &= ~MNTK_TYPENAME_OVERRIDE;
10537 vp->v_mount->fstypename_override[0] = '\0';
10538 }
10539 mount_unlock(vp->v_mount);
10540 }
10541 }
10542 break;
10543
10544 case DISK_CONDITIONER_IOC_GET: {
10545 error = disk_conditioner_get_info(vp->v_mount, (disk_conditioner_info *)data);
10546 }
10547 break;
10548
10549 case DISK_CONDITIONER_IOC_SET: {
10550 error = disk_conditioner_set_info(vp->v_mount, (disk_conditioner_info *)data);
10551 }
10552 break;
10553
10554 default: {
10555 /* other, known commands shouldn't be passed down here */
10556 switch (cmd) {
10557 case F_PUNCHHOLE:
10558 case F_TRIM_ACTIVE_FILE:
10559 case F_RDADVISE:
10560 case F_TRANSCODEKEY:
10561 case F_GETPROTECTIONLEVEL:
10562 case F_GETDEFAULTPROTLEVEL:
10563 case F_MAKECOMPRESSED:
10564 case F_SET_GREEDY_MODE:
10565 case F_SETSTATICCONTENT:
10566 case F_SETIOTYPE:
10567 case F_SETBACKINGSTORE:
10568 case F_GETPATH_MTMINFO:
10569 case APFSIOC_REVERT_TO_SNAPSHOT:
10570 case FSIOC_FIOSEEKHOLE:
10571 case FSIOC_FIOSEEKDATA:
10572 case HFS_GET_BOOT_INFO:
10573 case HFS_SET_BOOT_INFO:
10574 case FIOPINSWAP:
10575 case F_CHKCLEAN:
10576 case F_FULLFSYNC:
10577 case F_BARRIERFSYNC:
10578 case F_FREEZE_FS:
10579 case F_THAW_FS:
10580 error = EINVAL;
10581 goto outdrop;
10582 }
10583 /* Invoke the filesystem-specific code */
10584 error = VNOP_IOCTL(vp, cmd, data, options, ctx);
10585 }
10586
10587 } /* end switch stmt */
10588
10589 /*
10590 * if no errors, copy any data to user. Size was
10591 * already set and checked above.
10592 */
10593 if (error == 0 && (cmd & IOC_OUT) && size)
10594 error = copyout(data, udata, size);
10595
10596outdrop:
10597 if (memp) {
10598 kfree(memp, size);
10599 }
10600
10601 return error;
10602}
10603
10604/* ARGSUSED */
/*
 * fsctl(2): path-based filesystem control operation.
 *
 * Looks up the vnode named by uap->path (following symlinks unless
 * FSOPT_NOFOLLOW is set in uap->options), runs the MAC policy check,
 * and hands the command off to fsctl_internal().
 *
 * Returns: 0 on success, or an errno from namei()/MACF/fsctl_internal().
 */
int
fsctl (proc_t p, struct fsctl_args *uap, __unused int32_t *retval)
{
	int error;
	struct nameidata nd;
	u_long nameiflags;
	vnode_t vp = NULL;
	vfs_context_t ctx = vfs_context_current();

	AUDIT_ARG(cmd, uap->cmd);
	AUDIT_ARG(value32, uap->options);
	/* Get the vnode for the file we are getting info on: */
	nameiflags = 0;
	if ((uap->options & FSOPT_NOFOLLOW) == 0) nameiflags |= FOLLOW;
	NDINIT(&nd, LOOKUP, OP_FSCTL, nameiflags | AUDITVNPATH1,
	       UIO_USERSPACE, uap->path, ctx);
	if ((error = namei(&nd))) goto done;
	vp = nd.ni_vp;
	nameidone(&nd);

#if CONFIG_MACF
	/* Let MAC policies veto this command on the vnode's mount. */
	error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd);
	if (error) {
		goto done;
	}
#endif

	error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);

done:
	/* fsctl_internal() may drop the iocount and NULL out vp; only put it if still held */
	if (vp)
		vnode_put(vp);
	return error;
}
10639/* ARGSUSED */
/*
 * ffsctl(2): fd-based filesystem control operation.
 *
 * Like fsctl(2) but operates on the vnode underlying an already-open
 * file descriptor instead of resolving a path.
 *
 * Returns: 0 on success, or an errno from file_vnode()/vnode_getwithref()/
 * MACF/fsctl_internal().
 */
int
ffsctl (proc_t p, struct ffsctl_args *uap, __unused int32_t *retval)
{
	int error;
	vnode_t vp = NULL;
	vfs_context_t ctx = vfs_context_current();
	int fd = -1;

	AUDIT_ARG(fd, uap->fd);
	AUDIT_ARG(cmd, uap->cmd);
	AUDIT_ARG(value32, uap->options);

	/* Get the vnode for the file we are getting info on: */
	if ((error = file_vnode(uap->fd, &vp)))
		return error;
	fd = uap->fd;
	if ((error = vnode_getwithref(vp))) {
		/* could not take an iocount; release the fd reference taken above */
		file_drop(fd);
		return error;
	}

#if CONFIG_MACF
	/* Let MAC policies veto this command on the vnode's mount. */
	if ((error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd))) {
		file_drop(fd);
		vnode_put(vp);
		return error;
	}
#endif

	error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);

	file_drop(fd);

	/*validate vp; fsctl_internal() can drop iocount and reset vp to NULL*/
	if (vp) {
		vnode_put(vp);
	}

	return error;
}
10680/* end of fsctl system call */
10681
10682/*
10683 * Retrieve the data of an extended attribute.
10684 */
/*
 * getxattr(2): copy the value of the named extended attribute on the
 * file at uap->path out to the user buffer uap->value (or, when no
 * data uio is built, just report the attribute's size in *retval).
 */
int
getxattr(proc_t p, struct getxattr_args *uap, user_ssize_t *retval)
{
	vnode_t vp;
	struct nameidata nd;
	char attrname[XATTR_MAXNAMELEN+1];
	vfs_context_t ctx = vfs_context_current();
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t attrsize = 0;
	size_t namelen;
	u_int32_t nameiflags;
	int error;
	char uio_buf[ UIO_SIZEOF(1) ];

	/* XATTR_NOSECURITY and XATTR_NODEFAULT are kernel-internal options */
	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT))
		return (EINVAL);

	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_GETXATTR, nameiflags, spacetype, uap->path, ctx);
	if ((error = namei(&nd))) {
		return (error);
	}
	vp = nd.ni_vp;
	nameidone(&nd);

	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
	if (error != 0) {
		goto out;
	}
	if (xattr_protected(attrname)) {
		/* only the superuser may read com.apple.system.Security */
		if (!vfs_context_issuser(ctx) || strcmp(attrname, "com.apple.system.Security") != 0) {
			error = EPERM;
			goto out;
		}
	}
	/*
	 * the specific check for 0xffffffff is a hack to preserve
	 * binary compatibility in K64 with applications that discovered
	 * that passing in a buf pointer and a size of -1 resulted in
	 * just the size of the indicated extended attribute being returned.
	 * this isn't part of the documented behavior, but because of the
	 * original implementation's check for "uap->size > 0", this behavior
	 * was allowed. In K32 that check turned into a signed comparison
	 * even though uap->size is unsigned... in K64, we blow by that
	 * check because uap->size is unsigned and doesn't get sign smeared
	 * in the munger for a 32 bit user app. we also need to add a
	 * check to limit the maximum size of the buffer being passed in...
	 * unfortunately, the underlying filesystems seem to just malloc
	 * the requested size even if the actual extended attribute is tiny.
	 * because that malloc is for kernel wired memory, we have to put a
	 * sane limit on it.
	 *
	 * U32 running on K64 will yield 0x00000000ffffffff for uap->size
	 * U64 running on K64 will yield -1 (64 bits wide)
	 * U32/U64 running on K32 will yield -1 (32 bits wide)
	 */
	if (uap->size == 0xffffffff || uap->size == (size_t)-1)
		goto no_uio;

	if (uap->value) {
		/* clamp the request so filesystems don't wire huge buffers */
		if (uap->size > (size_t)XATTR_MAXSIZE)
			uap->size = XATTR_MAXSIZE;

		auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
		                            &uio_buf[0], sizeof(uio_buf));
		uio_addiov(auio, uap->value, uap->size);
	}
no_uio:
	/* with a NULL uio, vn_getxattr() only reports the attribute size */
	error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, ctx);
out:
	vnode_put(vp);

	if (auio) {
		*retval = uap->size - uio_resid(auio);
	} else {
		*retval = (user_ssize_t)attrsize;
	}

	return (error);
}
10766
10767/*
10768 * Retrieve the data of an extended attribute.
10769 */
10770int
10771fgetxattr(proc_t p, struct fgetxattr_args *uap, user_ssize_t *retval)
10772{
10773 vnode_t vp;
10774 char attrname[XATTR_MAXNAMELEN+1];
10775 uio_t auio = NULL;
10776 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
10777 size_t attrsize = 0;
10778 size_t namelen;
10779 int error;
10780 char uio_buf[ UIO_SIZEOF(1) ];
10781
10782 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT))
10783 return (EINVAL);
10784
10785 if ( (error = file_vnode(uap->fd, &vp)) ) {
10786 return (error);
10787 }
10788 if ( (error = vnode_getwithref(vp)) ) {
10789 file_drop(uap->fd);
10790 return(error);
10791 }
10792 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
10793 if (error != 0) {
10794 goto out;
10795 }
10796 if (xattr_protected(attrname)) {
10797 error = EPERM;
10798 goto out;
10799 }
10800 if (uap->value && uap->size > 0) {
10801 auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
10802 &uio_buf[0], sizeof(uio_buf));
10803 uio_addiov(auio, uap->value, uap->size);
10804 }
10805
10806 error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, vfs_context_current());
10807out:
10808 (void)vnode_put(vp);
10809 file_drop(uap->fd);
10810
10811 if (auio) {
10812 *retval = uap->size - uio_resid(auio);
10813 } else {
10814 *retval = (user_ssize_t)attrsize;
10815 }
10816 return (error);
10817}
10818
10819/*
10820 * Set the data of an extended attribute.
10821 */
10822int
10823setxattr(proc_t p, struct setxattr_args *uap, int *retval)
10824{
10825 vnode_t vp;
10826 struct nameidata nd;
10827 char attrname[XATTR_MAXNAMELEN+1];
10828 vfs_context_t ctx = vfs_context_current();
10829 uio_t auio = NULL;
10830 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
10831 size_t namelen;
10832 u_int32_t nameiflags;
10833 int error;
10834 char uio_buf[ UIO_SIZEOF(1) ];
10835
10836 if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT))
10837 return (EINVAL);
10838
10839 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
10840 if (error != 0) {
10841 if (error == EPERM) {
10842 /* if the string won't fit in attrname, copyinstr emits EPERM */
10843 return (ENAMETOOLONG);
10844 }
10845 /* Otherwise return the default error from copyinstr to detect ERANGE, etc */
10846 return error;
10847 }
10848 if (xattr_protected(attrname))
10849 return(EPERM);
10850 if (uap->size != 0 && uap->value == 0) {
10851 return (EINVAL);
10852 }
10853
10854 nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
10855 NDINIT(&nd, LOOKUP, OP_SETXATTR, nameiflags, spacetype, uap->path, ctx);
10856 if ((error = namei(&nd))) {
10857 return (error);
10858 }
10859 vp = nd.ni_vp;
10860 nameidone(&nd);
10861
10862 auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
10863 &uio_buf[0], sizeof(uio_buf));
10864 uio_addiov(auio, uap->value, uap->size);
10865
10866 error = vn_setxattr(vp, attrname, auio, uap->options, ctx);
10867#if CONFIG_FSE
10868 if (error == 0) {
10869 add_fsevent(FSE_XATTR_MODIFIED, ctx,
10870 FSE_ARG_VNODE, vp,
10871 FSE_ARG_DONE);
10872 }
10873#endif
10874 vnode_put(vp);
10875 *retval = 0;
10876 return (error);
10877}
10878
10879/*
10880 * Set the data of an extended attribute.
10881 */
/*
 * fsetxattr(2): write the value of the named extended attribute on the
 * vnode backing an open file descriptor.  Posts an FSE_XATTR_MODIFIED
 * fsevent on success.
 */
int
fsetxattr(proc_t p, struct fsetxattr_args *uap, int *retval)
{
	vnode_t vp;
	char attrname[XATTR_MAXNAMELEN+1];
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t namelen;
	int error;
	char uio_buf[ UIO_SIZEOF(1) ];
#if CONFIG_FSE
	/* ctx is only needed for the fsevent; the xattr call fetches its own */
	vfs_context_t ctx = vfs_context_current();
#endif

	/* XATTR_NOFOLLOW is meaningless on an already-open file */
	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT))
		return (EINVAL);

	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
	if (error != 0) {
		if (error == EPERM) {
			/* if the string won't fit in attrname, copyinstr emits EPERM */
			return (ENAMETOOLONG);
		}
		/* Otherwise return the default error from copyinstr to detect ERANGE, etc */
		return error;
	}
	if (xattr_protected(attrname))
		return(EPERM);
	/* a non-zero size requires a source buffer */
	if (uap->size != 0 && uap->value == 0) {
		return (EINVAL);
	}
	if ( (error = file_vnode(uap->fd, &vp)) ) {
		return (error);
	}
	if ( (error = vnode_getwithref(vp)) ) {
		file_drop(uap->fd);
		return(error);
	}
	auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
	                            &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, uap->value, uap->size);

	error = vn_setxattr(vp, attrname, auio, uap->options, vfs_context_current());
#if CONFIG_FSE
	if (error == 0) {
		add_fsevent(FSE_XATTR_MODIFIED, ctx,
		            FSE_ARG_VNODE, vp,
		            FSE_ARG_DONE);
	}
#endif
	vnode_put(vp);
	file_drop(uap->fd);
	*retval = 0;
	return (error);
}
10937
10938/*
10939 * Remove an extended attribute.
10940 * XXX Code duplication here.
10941 */
10942int
10943removexattr(proc_t p, struct removexattr_args *uap, int *retval)
10944{
10945 vnode_t vp;
10946 struct nameidata nd;
10947 char attrname[XATTR_MAXNAMELEN+1];
10948 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
10949 vfs_context_t ctx = vfs_context_current();
10950 size_t namelen;
10951 u_int32_t nameiflags;
10952 int error;
10953
10954 if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT))
10955 return (EINVAL);
10956
10957 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
10958 if (error != 0) {
10959 return (error);
10960 }
10961 if (xattr_protected(attrname))
10962 return(EPERM);
10963 nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
10964 NDINIT(&nd, LOOKUP, OP_REMOVEXATTR, nameiflags, spacetype, uap->path, ctx);
10965 if ((error = namei(&nd))) {
10966 return (error);
10967 }
10968 vp = nd.ni_vp;
10969 nameidone(&nd);
10970
10971 error = vn_removexattr(vp, attrname, uap->options, ctx);
10972#if CONFIG_FSE
10973 if (error == 0) {
10974 add_fsevent(FSE_XATTR_REMOVED, ctx,
10975 FSE_ARG_VNODE, vp,
10976 FSE_ARG_DONE);
10977 }
10978#endif
10979 vnode_put(vp);
10980 *retval = 0;
10981 return (error);
10982}
10983
10984/*
10985 * Remove an extended attribute.
10986 * XXX Code duplication here.
10987 */
/*
 * fremovexattr(2): delete the named extended attribute from the vnode
 * backing an open file descriptor.  Posts an FSE_XATTR_REMOVED fsevent
 * on success.
 */
int
fremovexattr(__unused proc_t p, struct fremovexattr_args *uap, int *retval)
{
	vnode_t vp;
	char attrname[XATTR_MAXNAMELEN+1];
	size_t namelen;
	int error;
#if CONFIG_FSE
	/* ctx is only needed for the fsevent; the xattr call fetches its own */
	vfs_context_t ctx = vfs_context_current();
#endif

	/* XATTR_NOFOLLOW is meaningless on an already-open file */
	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT))
		return (EINVAL);

	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
	if (error != 0) {
		return (error);
	}
	if (xattr_protected(attrname))
		return(EPERM);
	if ( (error = file_vnode(uap->fd, &vp)) ) {
		return (error);
	}
	if ( (error = vnode_getwithref(vp)) ) {
		/* could not take an iocount; release the fd reference */
		file_drop(uap->fd);
		return(error);
	}

	error = vn_removexattr(vp, attrname, uap->options, vfs_context_current());
#if CONFIG_FSE
	if (error == 0) {
		add_fsevent(FSE_XATTR_REMOVED, ctx,
		            FSE_ARG_VNODE, vp,
		            FSE_ARG_DONE);
	}
#endif
	vnode_put(vp);
	file_drop(uap->fd);
	*retval = 0;
	return (error);
}
11029
11030/*
11031 * Retrieve the list of extended attribute names.
11032 * XXX Code duplication here.
11033 */
11034int
11035listxattr(proc_t p, struct listxattr_args *uap, user_ssize_t *retval)
11036{
11037 vnode_t vp;
11038 struct nameidata nd;
11039 vfs_context_t ctx = vfs_context_current();
11040 uio_t auio = NULL;
11041 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
11042 size_t attrsize = 0;
11043 u_int32_t nameiflags;
11044 int error;
11045 char uio_buf[ UIO_SIZEOF(1) ];
11046
11047 if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT))
11048 return (EINVAL);
11049
11050 nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
11051 NDINIT(&nd, LOOKUP, OP_LISTXATTR, nameiflags, spacetype, uap->path, ctx);
11052 if ((error = namei(&nd))) {
11053 return (error);
11054 }
11055 vp = nd.ni_vp;
11056 nameidone(&nd);
11057 if (uap->namebuf != 0 && uap->bufsize > 0) {
11058 auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ,
11059 &uio_buf[0], sizeof(uio_buf));
11060 uio_addiov(auio, uap->namebuf, uap->bufsize);
11061 }
11062
11063 error = vn_listxattr(vp, auio, &attrsize, uap->options, ctx);
11064
11065 vnode_put(vp);
11066 if (auio) {
11067 *retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
11068 } else {
11069 *retval = (user_ssize_t)attrsize;
11070 }
11071 return (error);
11072}
11073
11074/*
11075 * Retrieve the list of extended attribute names.
11076 * XXX Code duplication here.
11077 */
/*
 * flistxattr(2): copy the list of extended attribute names on the vnode
 * backing an open file descriptor into the caller's buffer.  With no
 * buffer supplied, *retval reports the total size of the name list.
 */
int
flistxattr(proc_t p, struct flistxattr_args *uap, user_ssize_t *retval)
{
	vnode_t vp;
	uio_t auio = NULL;
	int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t attrsize = 0;
	int error;
	char uio_buf[ UIO_SIZEOF(1) ];

	/* XATTR_NOFOLLOW is meaningless on an already-open file */
	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT))
		return (EINVAL);

	if ( (error = file_vnode(uap->fd, &vp)) ) {
		return (error);
	}
	if ( (error = vnode_getwithref(vp)) ) {
		/* could not take an iocount; release the fd reference */
		file_drop(uap->fd);
		return(error);
	}
	/* only build a uio when the caller supplied a destination buffer */
	if (uap->namebuf != 0 && uap->bufsize > 0) {
		auio = uio_createwithbuffer(1, 0, spacetype,
		                            UIO_READ, &uio_buf[0], sizeof(uio_buf));
		uio_addiov(auio, uap->namebuf, uap->bufsize);
	}

	error = vn_listxattr(vp, auio, &attrsize, uap->options, vfs_context_current());

	vnode_put(vp);
	file_drop(uap->fd);
	if (auio) {
		*retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
	} else {
		*retval = (user_ssize_t)attrsize;
	}
	return (error);
}
11115
11116static int fsgetpath_internal(
11117 vfs_context_t ctx, int volfs_id, uint64_t objid,
11118 vm_size_t bufsize, caddr_t buf, int *pathlen)
11119{
11120 int error;
11121 struct mount *mp = NULL;
11122 vnode_t vp;
11123 int length;
11124 int bpflags;
11125 /* maximum number of times to retry build_path */
11126 unsigned int retries = 0x10;
11127
11128 if (bufsize > PAGE_SIZE) {
11129 return (EINVAL);
11130 }
11131
11132 if (buf == NULL) {
11133 return (ENOMEM);
11134 }
11135
11136retry:
11137 if ((mp = mount_lookupby_volfsid(volfs_id, 1)) == NULL) {
11138 error = ENOTSUP; /* unexpected failure */
11139 return ENOTSUP;
11140 }
11141
11142unionget:
11143 if (objid == 2) {
11144 error = VFS_ROOT(mp, &vp, ctx);
11145 } else {
11146 error = VFS_VGET(mp, (ino64_t)objid, &vp, ctx);
11147 }
11148
11149 if (error == ENOENT && (mp->mnt_flag & MNT_UNION)) {
11150 /*
11151 * If the fileid isn't found and we're in a union
11152 * mount volume, then see if the fileid is in the
11153 * mounted-on volume.
11154 */
11155 struct mount *tmp = mp;
11156 mp = vnode_mount(tmp->mnt_vnodecovered);
11157 vfs_unbusy(tmp);
11158 if (vfs_busy(mp, LK_NOWAIT) == 0)
11159 goto unionget;
11160 } else {
11161 vfs_unbusy(mp);
11162 }
11163
11164 if (error) {
11165 return error;
11166 }
11167
11168#if CONFIG_MACF
11169 error = mac_vnode_check_fsgetpath(ctx, vp);
11170 if (error) {
11171 vnode_put(vp);
11172 return error;
11173 }
11174#endif
11175
11176 /* Obtain the absolute path to this vnode. */
11177 bpflags = vfs_context_suser(ctx) ? BUILDPATH_CHECKACCESS : 0;
11178 bpflags |= BUILDPATH_CHECK_MOVED;
11179 error = build_path(vp, buf, bufsize, &length, bpflags, ctx);
11180 vnode_put(vp);
11181
11182 if (error) {
11183 /* there was a race building the path, try a few more times */
11184 if (error == EAGAIN) {
11185 --retries;
11186 if (retries > 0)
11187 goto retry;
11188
11189 error = ENOENT;
11190 }
11191 goto out;
11192 }
11193
11194 AUDIT_ARG(text, buf);
11195
11196 if (kdebug_enable) {
11197 long dbg_parms[NUMPARMS];
11198 int dbg_namelen;
11199
11200 dbg_namelen = (int)sizeof(dbg_parms);
11201
11202 if (length < dbg_namelen) {
11203 memcpy((char *)dbg_parms, buf, length);
11204 memset((char *)dbg_parms + length, 0, dbg_namelen - length);
11205
11206 dbg_namelen = length;
11207 } else {
11208 memcpy((char *)dbg_parms, buf + (length - dbg_namelen), dbg_namelen);
11209 }
11210
11211 kdebug_vfs_lookup(dbg_parms, dbg_namelen, (void *)vp,
11212 KDBG_VFS_LOOKUP_FLAG_LOOKUP);
11213 }
11214
11215 *pathlen = (user_ssize_t)length; /* may be superseded by error */
11216
11217out:
11218 return (error);
11219}
11220
11221/*
11222 * Obtain the full pathname of a file system object by id.
11223 */
11224int
11225fsgetpath(__unused proc_t p, struct fsgetpath_args *uap, user_ssize_t *retval)
11226{
11227 vfs_context_t ctx = vfs_context_current();
11228 fsid_t fsid;
11229 char *realpath;
11230 int length;
11231 int error;
11232
11233 if ((error = copyin(uap->fsid, (caddr_t)&fsid, sizeof(fsid)))) {
11234 return (error);
11235 }
11236 AUDIT_ARG(value32, fsid.val[0]);
11237 AUDIT_ARG(value64, uap->objid);
11238 /* Restrict output buffer size for now. */
11239
11240 if (uap->bufsize > PAGE_SIZE) {
11241 return (EINVAL);
11242 }
11243 MALLOC(realpath, char *, uap->bufsize, M_TEMP, M_WAITOK | M_ZERO);
11244 if (realpath == NULL) {
11245 return (ENOMEM);
11246 }
11247
11248 error = fsgetpath_internal(
11249 ctx, fsid.val[0], uap->objid,
11250 uap->bufsize, realpath, &length);
11251
11252 if (error) {
11253 goto out;
11254 }
11255
11256 error = copyout((caddr_t)realpath, uap->buf, length);
11257
11258 *retval = (user_ssize_t)length; /* may be superseded by error */
11259out:
11260 if (realpath) {
11261 FREE(realpath, M_TEMP);
11262 }
11263 return (error);
11264}
11265
11266/*
11267 * Common routine to handle various flavors of statfs data heading out
11268 * to user space.
11269 *
11270 * Returns: 0 Success
11271 * EFAULT
11272 */
static int
munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
    user_addr_t bufp, int *sizep, boolean_t is_64_bit,
    boolean_t partial_copy)
{
	int error;
	int my_size, copy_size;

	if (is_64_bit) {
		struct user64_statfs sfs;
		my_size = copy_size = sizeof(sfs);
		bzero(&sfs, my_size);
		sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
		sfs.f_type = mp->mnt_vtable->vfc_typenum;
		sfs.f_reserved1 = (short)sfsp->f_fssubtype;
		sfs.f_bsize = (user64_long_t)sfsp->f_bsize;
		sfs.f_iosize = (user64_long_t)sfsp->f_iosize;
		sfs.f_blocks = (user64_long_t)sfsp->f_blocks;
		sfs.f_bfree = (user64_long_t)sfsp->f_bfree;
		sfs.f_bavail = (user64_long_t)sfsp->f_bavail;
		sfs.f_files = (user64_long_t)sfsp->f_files;
		sfs.f_ffree = (user64_long_t)sfsp->f_ffree;
		sfs.f_fsid = sfsp->f_fsid;
		sfs.f_owner = sfsp->f_owner;
		/* honor an FSIOC_SET_FSTYPENAME_OVERRIDE if one is in effect */
		if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
			strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSNAMELEN);
		} else {
			strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSNAMELEN);
		}
		strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
		strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);

		/* a partial copy omits the trailing reserved fields */
		if (partial_copy) {
			copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
		}
		error = copyout((caddr_t)&sfs, bufp, copy_size);
	}
	else {
		struct user32_statfs sfs;

		my_size = copy_size = sizeof(sfs);
		bzero(&sfs, my_size);

		sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
		sfs.f_type = mp->mnt_vtable->vfc_typenum;
		sfs.f_reserved1 = (short)sfsp->f_fssubtype;

		/*
		 * It's possible for there to be more than 2^^31 blocks in the filesystem, so we
		 * have to fudge the numbers here in that case. We inflate the blocksize in order
		 * to reflect the filesystem size as best we can.
		 */
		if ((sfsp->f_blocks > INT_MAX)
			/* Hack for 4061702 . I think the real fix is for Carbon to
			 * look for some volume capability and not depend on hidden
			 * semantics agreed between a FS and carbon.
			 * f_blocks, f_bfree, and f_bavail set to -1 is the trigger
			 * for Carbon to set bNoVolumeSizes volume attribute.
			 * Without this the webdavfs files cannot be copied onto
			 * disk as they look huge. This change should not affect
			 * XSAN as they should not setting these to -1..
			 */
			&& (sfsp->f_blocks != 0xffffffffffffffffULL)
			&& (sfsp->f_bfree != 0xffffffffffffffffULL)
			&& (sfsp->f_bavail != 0xffffffffffffffffULL)) {
			int shift;

			/*
			 * Work out how far we have to shift the block count down to make it fit.
			 * Note that it's possible to have to shift so far that the resulting
			 * blocksize would be unreportably large.  At that point, we will clip
			 * any values that don't fit.
			 *
			 * For safety's sake, we also ensure that f_iosize is never reported as
			 * being smaller than f_bsize.
			 */
			for (shift = 0; shift < 32; shift++) {
				if ((sfsp->f_blocks >> shift) <= INT_MAX)
					break;
				if ((sfsp->f_bsize << (shift + 1)) > INT_MAX)
					break;
			}
#define __SHIFT_OR_CLIP(x, s)	((((x) >> (s)) > INT_MAX) ? INT_MAX : ((x) >> (s)))
			sfs.f_blocks = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_blocks, shift);
			sfs.f_bfree = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bfree, shift);
			sfs.f_bavail = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bavail, shift);
#undef __SHIFT_OR_CLIP
			/* block size was inflated by the same shift to preserve total size */
			sfs.f_bsize = (user32_long_t)(sfsp->f_bsize << shift);
			sfs.f_iosize = lmax(sfsp->f_iosize, sfsp->f_bsize);
		} else {
			/* filesystem is small enough to be reported honestly */
			sfs.f_bsize = (user32_long_t)sfsp->f_bsize;
			sfs.f_iosize = (user32_long_t)sfsp->f_iosize;
			sfs.f_blocks = (user32_long_t)sfsp->f_blocks;
			sfs.f_bfree = (user32_long_t)sfsp->f_bfree;
			sfs.f_bavail = (user32_long_t)sfsp->f_bavail;
		}
		sfs.f_files = (user32_long_t)sfsp->f_files;
		sfs.f_ffree = (user32_long_t)sfsp->f_ffree;
		sfs.f_fsid = sfsp->f_fsid;
		sfs.f_owner = sfsp->f_owner;
		/* honor an FSIOC_SET_FSTYPENAME_OVERRIDE if one is in effect */
		if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
			strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSNAMELEN);
		} else {
			strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSNAMELEN);
		}
		strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
		strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);

		/* a partial copy omits the trailing reserved fields */
		if (partial_copy) {
			copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
		}
		error = copyout((caddr_t)&sfs, bufp, copy_size);
	}

	/* report the full (untruncated) structure size to the caller */
	if (sizep != NULL) {
		*sizep = my_size;
	}
	return(error);
}
11393
11394/*
11395 * copy stat structure into user_stat structure.
11396 */
11397void munge_user64_stat(struct stat *sbp, struct user64_stat *usbp)
11398{
11399 bzero(usbp, sizeof(*usbp));
11400
11401 usbp->st_dev = sbp->st_dev;
11402 usbp->st_ino = sbp->st_ino;
11403 usbp->st_mode = sbp->st_mode;
11404 usbp->st_nlink = sbp->st_nlink;
11405 usbp->st_uid = sbp->st_uid;
11406 usbp->st_gid = sbp->st_gid;
11407 usbp->st_rdev = sbp->st_rdev;
11408#ifndef _POSIX_C_SOURCE
11409 usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
11410 usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
11411 usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
11412 usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
11413 usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
11414 usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
11415#else
11416 usbp->st_atime = sbp->st_atime;
11417 usbp->st_atimensec = sbp->st_atimensec;
11418 usbp->st_mtime = sbp->st_mtime;
11419 usbp->st_mtimensec = sbp->st_mtimensec;
11420 usbp->st_ctime = sbp->st_ctime;
11421 usbp->st_ctimensec = sbp->st_ctimensec;
11422#endif
11423 usbp->st_size = sbp->st_size;
11424 usbp->st_blocks = sbp->st_blocks;
11425 usbp->st_blksize = sbp->st_blksize;
11426 usbp->st_flags = sbp->st_flags;
11427 usbp->st_gen = sbp->st_gen;
11428 usbp->st_lspare = sbp->st_lspare;
11429 usbp->st_qspare[0] = sbp->st_qspare[0];
11430 usbp->st_qspare[1] = sbp->st_qspare[1];
11431}
11432
/*
 * Copy a stat structure into the user32_stat ABI structure for a
 * 32-bit user process.  Assignments to narrower user32 fields may
 * implicitly truncate wide kernel values — presumably acceptable for
 * the legacy 32-bit ABI; verify against the user32_stat definition.
 */
void munge_user32_stat(struct stat *sbp, struct user32_stat *usbp)
{
	/* Zero first so padding/unassigned fields don't leak to user space */
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
#ifndef _POSIX_C_SOURCE
	/* Timestamp member names differ with _POSIX_C_SOURCE visibility */
	usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
11468
11469/*
11470 * copy stat64 structure into user_stat64 structure.
11471 */
11472void munge_user64_stat64(struct stat64 *sbp, struct user64_stat64 *usbp)
11473{
11474 bzero(usbp, sizeof(*usbp));
11475
11476 usbp->st_dev = sbp->st_dev;
11477 usbp->st_ino = sbp->st_ino;
11478 usbp->st_mode = sbp->st_mode;
11479 usbp->st_nlink = sbp->st_nlink;
11480 usbp->st_uid = sbp->st_uid;
11481 usbp->st_gid = sbp->st_gid;
11482 usbp->st_rdev = sbp->st_rdev;
11483#ifndef _POSIX_C_SOURCE
11484 usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
11485 usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
11486 usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
11487 usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
11488 usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
11489 usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
11490 usbp->st_birthtimespec.tv_sec = sbp->st_birthtimespec.tv_sec;
11491 usbp->st_birthtimespec.tv_nsec = sbp->st_birthtimespec.tv_nsec;
11492#else
11493 usbp->st_atime = sbp->st_atime;
11494 usbp->st_atimensec = sbp->st_atimensec;
11495 usbp->st_mtime = sbp->st_mtime;
11496 usbp->st_mtimensec = sbp->st_mtimensec;
11497 usbp->st_ctime = sbp->st_ctime;
11498 usbp->st_ctimensec = sbp->st_ctimensec;
11499 usbp->st_birthtime = sbp->st_birthtime;
11500 usbp->st_birthtimensec = sbp->st_birthtimensec;
11501#endif
11502 usbp->st_size = sbp->st_size;
11503 usbp->st_blocks = sbp->st_blocks;
11504 usbp->st_blksize = sbp->st_blksize;
11505 usbp->st_flags = sbp->st_flags;
11506 usbp->st_gen = sbp->st_gen;
11507 usbp->st_lspare = sbp->st_lspare;
11508 usbp->st_qspare[0] = sbp->st_qspare[0];
11509 usbp->st_qspare[1] = sbp->st_qspare[1];
11510}
11511
/*
 * Copy a stat64 structure into the user32_stat64 ABI structure for a
 * 32-bit user process (includes birthtime fields).  Assignments to
 * narrower user32 fields may implicitly truncate — presumably
 * acceptable for the legacy 32-bit ABI; verify against the
 * user32_stat64 definition.
 */
void munge_user32_stat64(struct stat64 *sbp, struct user32_stat64 *usbp)
{
	/* Zero first so padding/unassigned fields don't leak to user space */
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
#ifndef _POSIX_C_SOURCE
	/* Timestamp member names differ with _POSIX_C_SOURCE visibility */
	usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
	usbp->st_birthtimespec.tv_sec = sbp->st_birthtimespec.tv_sec;
	usbp->st_birthtimespec.tv_nsec = sbp->st_birthtimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
	usbp->st_birthtime = sbp->st_birthtime;
	usbp->st_birthtimensec = sbp->st_birthtimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
11551
11552/*
11553 * Purge buffer cache for simulating cold starts
11554 */
11555static int vnode_purge_callback(struct vnode *vp, __unused void *cargs)
11556{
11557 ubc_msync(vp, (off_t)0, ubc_getsize(vp), NULL /* off_t *resid_off */, UBC_PUSHALL | UBC_INVALIDATE);
11558
11559 return VNODE_RETURNED;
11560}
11561
11562static int vfs_purge_callback(mount_t mp, __unused void * arg)
11563{
11564 vnode_iterate(mp, VNODE_WAIT | VNODE_ITERATE_ALL, vnode_purge_callback, NULL);
11565
11566 return VFS_RETURNED;
11567}
11568
11569int
11570vfs_purge(__unused struct proc *p, __unused struct vfs_purge_args *uap, __unused int32_t *retval)
11571{
11572 if (!kauth_cred_issuser(kauth_cred_get()))
11573 return EPERM;
11574
11575 vfs_iterate(0/* flags */, vfs_purge_callback, NULL);
11576
11577 return 0;
11578}
11579
11580/*
11581 * gets the vnode associated with the (unnamed) snapshot directory
11582 * for a Filesystem. The snapshot directory vnode is returned with
11583 * an iocount on it.
11584 */
11585int
11586vnode_get_snapdir(vnode_t rvp, vnode_t *sdvpp, vfs_context_t ctx)
11587{
11588 return (VFS_VGET_SNAPDIR(vnode_mount(rvp), sdvpp, ctx));
11589}
11590
11591/*
11592 * Get the snapshot vnode.
11593 *
11594 * If successful, the call returns with an iocount on *rvpp ,*sdvpp and
11595 * needs nameidone() on ndp.
11596 *
11597 * If the snapshot vnode exists it is returned in ndp->ni_vp.
11598 *
11599 * If it returns with an error, *rvpp, *sdvpp are NULL and nameidone() is
11600 * not needed.
11601 */
11602static int
11603vnode_get_snapshot(int dirfd, vnode_t *rvpp, vnode_t *sdvpp,
11604 user_addr_t name, struct nameidata *ndp, int32_t op,
11605#if !CONFIG_TRIGGERS
11606 __unused
11607#endif
11608 enum path_operation pathop,
11609 vfs_context_t ctx)
11610{
11611 int error, i;
11612 caddr_t name_buf;
11613 size_t name_len;
11614 struct vfs_attr vfa;
11615
11616 *sdvpp = NULLVP;
11617 *rvpp = NULLVP;
11618
11619 error = vnode_getfromfd(ctx, dirfd, rvpp);
11620 if (error)
11621 return (error);
11622
11623 if (!vnode_isvroot(*rvpp)) {
11624 error = EINVAL;
11625 goto out;
11626 }
11627
11628 /* Make sure the filesystem supports snapshots */
11629 VFSATTR_INIT(&vfa);
11630 VFSATTR_WANTED(&vfa, f_capabilities);
11631 if ((vfs_getattr(vnode_mount(*rvpp), &vfa, ctx) != 0) ||
11632 !VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) ||
11633 !((vfa.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] &
11634 VOL_CAP_INT_SNAPSHOT)) ||
11635 !((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] &
11636 VOL_CAP_INT_SNAPSHOT))) {
11637 error = ENOTSUP;
11638 goto out;
11639 }
11640
11641 error = vnode_get_snapdir(*rvpp, sdvpp, ctx);
11642 if (error)
11643 goto out;
11644
11645 MALLOC(name_buf, caddr_t, MAXPATHLEN, M_TEMP, M_WAITOK);
11646 error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
11647 if (error)
11648 goto out1;
11649
11650 /*
11651 * Some sanity checks- name can't be empty, "." or ".." or have slashes.
11652 * (the length returned by copyinstr includes the terminating NUL)
11653 */
11654 if ((name_len == 1) || (name_len == 2 && name_buf[0] == '.') ||
11655 (name_len == 3 && name_buf[0] == '.' && name_buf[1] == '.')) {
11656 error = EINVAL;
11657 goto out1;
11658 }
11659 for (i = 0; i < (int)name_len && name_buf[i] != '/'; i++);
11660 if (i < (int)name_len) {
11661 error = EINVAL;
11662 goto out1;
11663 }
11664
11665#if CONFIG_MACF
11666 if (op == CREATE) {
11667 error = mac_mount_check_snapshot_create(ctx, vnode_mount(*rvpp),
11668 name_buf);
11669 } else if (op == DELETE) {
11670 error = mac_mount_check_snapshot_delete(ctx, vnode_mount(*rvpp),
11671 name_buf);
11672 }
11673 if (error)
11674 goto out1;
11675#endif
11676
11677 /* Check if the snapshot already exists ... */
11678 NDINIT(ndp, op, pathop, USEDVP | NOCACHE | AUDITVNPATH1,
11679 UIO_SYSSPACE, CAST_USER_ADDR_T(name_buf), ctx);
11680 ndp->ni_dvp = *sdvpp;
11681
11682 error = namei(ndp);
11683out1:
11684 FREE(name_buf, M_TEMP);
11685out:
11686 if (error) {
11687 if (*sdvpp) {
11688 vnode_put(*sdvpp);
11689 *sdvpp = NULLVP;
11690 }
11691 if (*rvpp) {
11692 vnode_put(*rvpp);
11693 *rvpp = NULLVP;
11694 }
11695 }
11696 return (error);
11697}
11698
11699/*
11700 * create a filesystem snapshot (for supporting filesystems)
11701 *
11702 * A much simplified version of openat(dirfd, name, O_CREAT | O_EXCL)
11703 * We get to the (unnamed) snapshot directory vnode and create the vnode
11704 * for the snapshot in it.
11705 *
11706 * Restrictions:
11707 *
11708 * a) Passed in name for snapshot cannot have slashes.
11709 * b) name can't be "." or ".."
11710 *
11711 * Since this requires superuser privileges, vnode_authorize calls are not
11712 * made.
11713 */
11714static int
11715snapshot_create(int dirfd, user_addr_t name, __unused uint32_t flags,
11716 vfs_context_t ctx)
11717{
11718 vnode_t rvp, snapdvp;
11719 int error;
11720 struct nameidata namend;
11721
11722 error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, &namend, CREATE,
11723 OP_LINK, ctx);
11724 if (error)
11725 return (error);
11726
11727 if (namend.ni_vp) {
11728 vnode_put(namend.ni_vp);
11729 error = EEXIST;
11730 } else {
11731 struct vnode_attr va;
11732 vnode_t vp = NULLVP;
11733
11734 VATTR_INIT(&va);
11735 VATTR_SET(&va, va_type, VREG);
11736 VATTR_SET(&va, va_mode, 0);
11737
11738 error = vn_create(snapdvp, &vp, &namend, &va,
11739 VN_CREATE_NOAUTH | VN_CREATE_NOINHERIT, 0, NULL, ctx);
11740 if (!error && vp)
11741 vnode_put(vp);
11742 }
11743
11744 nameidone(&namend);
11745 vnode_put(snapdvp);
11746 vnode_put(rvp);
11747 return (error);
11748}
11749
11750/*
11751 * Delete a Filesystem snapshot
11752 *
11753 * get the vnode for the unnamed snapshot directory and the snapshot and
11754 * delete the snapshot.
11755 */
11756static int
11757snapshot_delete(int dirfd, user_addr_t name, __unused uint32_t flags,
11758 vfs_context_t ctx)
11759{
11760 vnode_t rvp, snapdvp;
11761 int error;
11762 struct nameidata namend;
11763
11764 error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, &namend, DELETE,
11765 OP_UNLINK, ctx);
11766 if (error)
11767 goto out;
11768
11769 error = VNOP_REMOVE(snapdvp, namend.ni_vp, &namend.ni_cnd,
11770 VNODE_REMOVE_SKIP_NAMESPACE_EVENT, ctx);
11771
11772 vnode_put(namend.ni_vp);
11773 nameidone(&namend);
11774 vnode_put(snapdvp);
11775 vnode_put(rvp);
11776out:
11777 return (error);
11778}
11779
11780/*
11781 * Revert a filesystem to a snapshot
11782 *
11783 * Marks the filesystem to revert to the given snapshot on next mount.
11784 */
11785static int
11786snapshot_revert(int dirfd, user_addr_t name, __unused uint32_t flags,
11787 vfs_context_t ctx)
11788{
11789 int error;
11790 vnode_t rvp;
11791 mount_t mp;
11792 struct fs_snapshot_revert_args revert_data;
11793 struct componentname cnp;
11794 caddr_t name_buf;
11795 size_t name_len;
11796
11797 error = vnode_getfromfd(ctx, dirfd, &rvp);
11798 if (error) {
11799 return (error);
11800 }
11801 mp = vnode_mount(rvp);
11802
11803 MALLOC(name_buf, caddr_t, MAXPATHLEN, M_TEMP, M_WAITOK);
11804 error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
11805 if (error) {
11806 FREE(name_buf, M_TEMP);
11807 vnode_put(rvp);
11808 return (error);
11809 }
11810
11811#if CONFIG_MACF
11812 error = mac_mount_check_snapshot_revert(ctx, mp, name_buf);
11813 if (error) {
11814 FREE(name_buf, M_TEMP);
11815 vnode_put(rvp);
11816 return (error);
11817 }
11818#endif
11819
11820 /*
11821 * Grab mount_iterref so that we can release the vnode,
11822 * since VFSIOC_REVERT_SNAPSHOT could conceivably cause a sync.
11823 */
11824 error = mount_iterref (mp, 0);
11825 vnode_put(rvp);
11826 if (error) {
11827 FREE(name_buf, M_TEMP);
11828 return (error);
11829 }
11830
11831 memset(&cnp, 0, sizeof(cnp));
11832 cnp.cn_pnbuf = (char *)name_buf;
11833 cnp.cn_nameiop = LOOKUP;
11834 cnp.cn_flags = ISLASTCN | HASBUF;
11835 cnp.cn_pnlen = MAXPATHLEN;
11836 cnp.cn_nameptr = cnp.cn_pnbuf;
11837 cnp.cn_namelen = (int)name_len;
11838 revert_data.sr_cnp = &cnp;
11839
11840 error = VFS_IOCTL(mp, VFSIOC_REVERT_SNAPSHOT, (caddr_t)&revert_data, 0, ctx);
11841 mount_iterdrop(mp);
11842 FREE(name_buf, M_TEMP);
11843
11844 if (error) {
11845 /* If there was any error, try again using VNOP_IOCTL */
11846
11847 vnode_t snapdvp;
11848 struct nameidata namend;
11849
11850 error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, &namend, LOOKUP,
11851 OP_LOOKUP, ctx);
11852 if (error) {
11853 return (error);
11854 }
11855
11856
11857 error = VNOP_IOCTL(namend.ni_vp, APFSIOC_REVERT_TO_SNAPSHOT, (caddr_t) NULL,
11858 0, ctx);
11859
11860 vnode_put(namend.ni_vp);
11861 nameidone(&namend);
11862 vnode_put(snapdvp);
11863 vnode_put(rvp);
11864 }
11865
11866 return (error);
11867}
11868
11869/*
11870 * rename a Filesystem snapshot
11871 *
11872 * get the vnode for the unnamed snapshot directory and the snapshot and
11873 * rename the snapshot. This is a very specialised (and simple) case of
11874 * rename(2) (which has to deal with a lot more complications). It differs
11875 * slightly from rename(2) in that EEXIST is returned if the new name exists.
11876 */
11877static int
11878snapshot_rename(int dirfd, user_addr_t old, user_addr_t new,
11879 __unused uint32_t flags, vfs_context_t ctx)
11880{
11881 vnode_t rvp, snapdvp;
11882 int error, i;
11883 caddr_t newname_buf;
11884 size_t name_len;
11885 vnode_t fvp;
11886 struct nameidata *fromnd, *tond;
11887 /* carving out a chunk for structs that are too big to be on stack. */
11888 struct {
11889 struct nameidata from_node;
11890 struct nameidata to_node;
11891 } * __rename_data;
11892
11893 MALLOC(__rename_data, void *, sizeof(*__rename_data), M_TEMP, M_WAITOK);
11894 fromnd = &__rename_data->from_node;
11895 tond = &__rename_data->to_node;
11896
11897 error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, old, fromnd, DELETE,
11898 OP_UNLINK, ctx);
11899 if (error)
11900 goto out;
11901 fvp = fromnd->ni_vp;
11902
11903 MALLOC(newname_buf, caddr_t, MAXPATHLEN, M_TEMP, M_WAITOK);
11904 error = copyinstr(new, newname_buf, MAXPATHLEN, &name_len);
11905 if (error)
11906 goto out1;
11907
11908 /*
11909 * Some sanity checks- new name can't be empty, "." or ".." or have
11910 * slashes.
11911 * (the length returned by copyinstr includes the terminating NUL)
11912 *
11913 * The FS rename VNOP is suppossed to handle this but we'll pick it
11914 * off here itself.
11915 */
11916 if ((name_len == 1) || (name_len == 2 && newname_buf[0] == '.') ||
11917 (name_len == 3 && newname_buf[0] == '.' && newname_buf[1] == '.')) {
11918 error = EINVAL;
11919 goto out1;
11920 }
11921 for (i = 0; i < (int)name_len && newname_buf[i] != '/'; i++);
11922 if (i < (int)name_len) {
11923 error = EINVAL;
11924 goto out1;
11925 }
11926
11927#if CONFIG_MACF
11928 error = mac_mount_check_snapshot_create(ctx, vnode_mount(rvp),
11929 newname_buf);
11930 if (error)
11931 goto out1;
11932#endif
11933
11934 NDINIT(tond, RENAME, OP_RENAME, USEDVP | NOCACHE | AUDITVNPATH2,
11935 UIO_SYSSPACE, CAST_USER_ADDR_T(newname_buf), ctx);
11936 tond->ni_dvp = snapdvp;
11937
11938 error = namei(tond);
11939 if (error) {
11940 goto out2;
11941 } else if (tond->ni_vp) {
11942 /*
11943 * snapshot rename behaves differently than rename(2) - if the
11944 * new name exists, EEXIST is returned.
11945 */
11946 vnode_put(tond->ni_vp);
11947 error = EEXIST;
11948 goto out2;
11949 }
11950
11951 error = VNOP_RENAME(snapdvp, fvp, &fromnd->ni_cnd, snapdvp, NULLVP,
11952 &tond->ni_cnd, ctx);
11953
11954out2:
11955 nameidone(tond);
11956out1:
11957 FREE(newname_buf, M_TEMP);
11958 vnode_put(fvp);
11959 vnode_put(snapdvp);
11960 vnode_put(rvp);
11961 nameidone(fromnd);
11962out:
11963 FREE(__rename_data, M_TEMP);
11964 return (error);
11965}
11966
11967/*
11968 * Mount a Filesystem snapshot
11969 *
11970 * get the vnode for the unnamed snapshot directory and the snapshot and
11971 * mount the snapshot.
11972 */
11973static int
11974snapshot_mount(int dirfd, user_addr_t name, user_addr_t directory,
11975 __unused user_addr_t mnt_data, __unused uint32_t flags, vfs_context_t ctx)
11976{
11977 vnode_t rvp, snapdvp, snapvp, vp, pvp;
11978 int error;
11979 struct nameidata *snapndp, *dirndp;
11980 /* carving out a chunk for structs that are too big to be on stack. */
11981 struct {
11982 struct nameidata snapnd;
11983 struct nameidata dirnd;
11984 } * __snapshot_mount_data;
11985
11986 MALLOC(__snapshot_mount_data, void *, sizeof(*__snapshot_mount_data),
11987 M_TEMP, M_WAITOK);
11988 snapndp = &__snapshot_mount_data->snapnd;
11989 dirndp = &__snapshot_mount_data->dirnd;
11990
11991 error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, snapndp, LOOKUP,
11992 OP_LOOKUP, ctx);
11993 if (error)
11994 goto out;
11995
11996 snapvp = snapndp->ni_vp;
11997 if (!vnode_mount(rvp) || (vnode_mount(rvp) == dead_mountp)) {
11998 error = EIO;
11999 goto out1;
12000 }
12001
12002 /* Get the vnode to be covered */
12003 NDINIT(dirndp, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
12004 UIO_USERSPACE, directory, ctx);
12005 error = namei(dirndp);
12006 if (error)
12007 goto out1;
12008
12009 vp = dirndp->ni_vp;
12010 pvp = dirndp->ni_dvp;
12011
12012 if ((vp->v_flag & VROOT) && (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
12013 error = EINVAL;
12014 } else {
12015 mount_t mp = vnode_mount(rvp);
12016 struct fs_snapshot_mount_args smnt_data;
12017
12018 smnt_data.sm_mp = mp;
12019 smnt_data.sm_cnp = &snapndp->ni_cnd;
12020 error = mount_common(mp->mnt_vfsstat.f_fstypename, pvp, vp,
12021 &dirndp->ni_cnd, CAST_USER_ADDR_T(&smnt_data), flags & MNT_DONTBROWSE,
12022 KERNEL_MOUNT_SNAPSHOT, NULL, FALSE, ctx);
12023 }
12024
12025 vnode_put(vp);
12026 vnode_put(pvp);
12027 nameidone(dirndp);
12028out1:
12029 vnode_put(snapvp);
12030 vnode_put(snapdvp);
12031 vnode_put(rvp);
12032 nameidone(snapndp);
12033out:
12034 FREE(__snapshot_mount_data, M_TEMP);
12035 return (error);
12036}
12037
12038/*
12039 * Root from a snapshot of the filesystem
12040 *
12041 * Marks the filesystem to root from the given snapshot on next boot.
12042 */
12043static int
12044snapshot_root(int dirfd, user_addr_t name, __unused uint32_t flags,
12045 vfs_context_t ctx)
12046{
12047 int error;
12048 vnode_t rvp;
12049 mount_t mp;
12050 struct fs_snapshot_root_args root_data;
12051 struct componentname cnp;
12052 caddr_t name_buf;
12053 size_t name_len;
12054
12055 error = vnode_getfromfd(ctx, dirfd, &rvp);
12056 if (error) {
12057 return (error);
12058 }
12059 mp = vnode_mount(rvp);
12060
12061 MALLOC(name_buf, caddr_t, MAXPATHLEN, M_TEMP, M_WAITOK);
12062 error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
12063 if (error) {
12064 FREE(name_buf, M_TEMP);
12065 vnode_put(rvp);
12066 return (error);
12067 }
12068
12069 // XXX MAC checks ?
12070
12071 /*
12072 * Grab mount_iterref so that we can release the vnode,
12073 * since VFSIOC_ROOT_SNAPSHOT could conceivably cause a sync.
12074 */
12075 error = mount_iterref (mp, 0);
12076 vnode_put(rvp);
12077 if (error) {
12078 FREE(name_buf, M_TEMP);
12079 return (error);
12080 }
12081
12082 memset(&cnp, 0, sizeof(cnp));
12083 cnp.cn_pnbuf = (char *)name_buf;
12084 cnp.cn_nameiop = LOOKUP;
12085 cnp.cn_flags = ISLASTCN | HASBUF;
12086 cnp.cn_pnlen = MAXPATHLEN;
12087 cnp.cn_nameptr = cnp.cn_pnbuf;
12088 cnp.cn_namelen = (int)name_len;
12089 root_data.sr_cnp = &cnp;
12090
12091 error = VFS_IOCTL(mp, VFSIOC_ROOT_SNAPSHOT, (caddr_t)&root_data, 0, ctx);
12092
12093 mount_iterdrop(mp);
12094 FREE(name_buf, M_TEMP);
12095
12096 return (error);
12097}
12098
12099/*
12100 * FS snapshot operations dispatcher
12101 */
12102int
12103fs_snapshot(__unused proc_t p, struct fs_snapshot_args *uap,
12104 __unused int32_t *retval)
12105{
12106 int error;
12107 vfs_context_t ctx = vfs_context_current();
12108
12109 AUDIT_ARG(fd, uap->dirfd);
12110 AUDIT_ARG(value32, uap->op);
12111
12112 error = priv_check_cred(vfs_context_ucred(ctx), PRIV_VFS_SNAPSHOT, 0);
12113 if (error)
12114 return (error);
12115
12116 switch (uap->op) {
12117 case SNAPSHOT_OP_CREATE:
12118 error = snapshot_create(uap->dirfd, uap->name1, uap->flags, ctx);
12119 break;
12120 case SNAPSHOT_OP_DELETE:
12121 error = snapshot_delete(uap->dirfd, uap->name1, uap->flags, ctx);
12122 break;
12123 case SNAPSHOT_OP_RENAME:
12124 error = snapshot_rename(uap->dirfd, uap->name1, uap->name2,
12125 uap->flags, ctx);
12126 break;
12127 case SNAPSHOT_OP_MOUNT:
12128 error = snapshot_mount(uap->dirfd, uap->name1, uap->name2,
12129 uap->data, uap->flags, ctx);
12130 break;
12131 case SNAPSHOT_OP_REVERT:
12132 error = snapshot_revert(uap->dirfd, uap->name1, uap->flags, ctx);
12133 break;
12134#if CONFIG_MNT_ROOTSNAP
12135 case SNAPSHOT_OP_ROOT:
12136 error = snapshot_root(uap->dirfd, uap->name1, uap->flags, ctx);
12137 break;
12138#endif /* CONFIG_MNT_ROOTSNAP */
12139 default:
12140 error = ENOSYS;
12141 }
12142
12143 return (error);
12144}