git.saurik.com Git - apple/xnu.git/blame_incremental

0 / 11421 ( 0%)

Commit	Line	Data
	1	/*
	2	* Copyright (c) 2000-2019 Apple Inc. All rights reserved.
	3	*
	4	* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
	5	*
	6	* This file contains Original Code and/or Modifications of Original Code
	7	* as defined in and that are subject to the Apple Public Source License
	8	* Version 2.0 (the 'License'). You may not use this file except in
	9	* compliance with the License. The rights granted to you under the License
	10	* may not be used to create, or enable the creation or redistribution of,
	11	* unlawful or unlicensed copies of an Apple operating system, or to
	12	* circumvent, violate, or enable the circumvention or violation of, any
	13	* terms of an Apple operating system software license agreement.
	14	*
	15	* Please obtain a copy of the License at
	16	* http://www.opensource.apple.com/apsl/ and read it before using this file.
	17	*
	18	* The Original Code and all software distributed under the License are
	19	* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
	20	* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
	21	* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
	22	* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
	23	* Please see the License for the specific language governing rights and
	24	* limitations under the License.
	25	*
	26	* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
	27	*/
	28	/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
	29	/*
	30	* Copyright (c) 1989, 1993
	31	* The Regents of the University of California. All rights reserved.
	32	* (c) UNIX System Laboratories, Inc.
	33	* All or some portions of this file are derived from material licensed
	34	* to the University of California by American Telephone and Telegraph
	35	* Co. or Unix System Laboratories, Inc. and are reproduced herein with
	36	* the permission of UNIX System Laboratories, Inc.
	37	*
	38	* Redistribution and use in source and binary forms, with or without
	39	* modification, are permitted provided that the following conditions
	40	* are met:
	41	* 1. Redistributions of source code must retain the above copyright
	42	* notice, this list of conditions and the following disclaimer.
	43	* 2. Redistributions in binary form must reproduce the above copyright
	44	* notice, this list of conditions and the following disclaimer in the
	45	* documentation and/or other materials provided with the distribution.
	46	* 3. All advertising materials mentioning features or use of this software
	47	* must display the following acknowledgement:
	48	* This product includes software developed by the University of
	49	* California, Berkeley and its contributors.
	50	* 4. Neither the name of the University nor the names of its contributors
	51	* may be used to endorse or promote products derived from this software
	52	* without specific prior written permission.
	53	*
	54	* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
	55	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	56	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	57	* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
	58	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
	59	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
	60	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
	61	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
	62	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
	63	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	64	* SUCH DAMAGE.
	65	*
	66	* @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95
	67	*/
	68	/*
	69	* NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
	70	* support for mandatory and extensible security protections. This notice
	71	* is included in support of clause 2.2 (b) of the Apple Public License,
	72	* Version 2.0.
	73	*/
	74
	75	/*
	76	* External virtual filesystem routines
	77	*/
	78
	79	#include <sys/param.h>
	80	#include <sys/systm.h>
	81	#include <sys/proc_internal.h>
	82	#include <sys/kauth.h>
	83	#include <sys/mount_internal.h>
	84	#include <sys/time.h>
	85	#include <sys/lock.h>
	86	#include <sys/vnode.h>
	87	#include <sys/vnode_internal.h>
	88	#include <sys/stat.h>
	89	#include <sys/namei.h>
	90	#include <sys/ucred.h>
	91	#include <sys/buf_internal.h>
	92	#include <sys/errno.h>
	93	#include <kern/kalloc.h>
	94	#include <sys/uio_internal.h>
	95	#include <sys/uio.h>
	96	#include <sys/domain.h>
	97	#include <sys/mbuf.h>
	98	#include <sys/syslog.h>
	99	#include <sys/ubc_internal.h>
	100	#include <sys/vm.h>
	101	#include <sys/sysctl.h>
	102	#include <sys/filedesc.h>
	103	#include <sys/event.h>
	104	#include <sys/kdebug.h>
	105	#include <sys/kauth.h>
	106	#include <sys/user.h>
	107	#include <sys/systm.h>
	108	#include <sys/kern_memorystatus.h>
	109	#include <sys/lockf.h>
	110	#include <sys/reboot.h>
	111	#include <miscfs/fifofs/fifo.h>
	112
	113	#include <nfs/nfs_conf.h>
	114
	115	#include <string.h>
	116	#include <machine/machine_routines.h>
	117
	118	#include <kern/assert.h>
	119	#include <mach/kern_return.h>
	120	#include <kern/thread.h>
	121	#include <kern/sched_prim.h>
	122
	123	#include <miscfs/specfs/specdev.h>
	124
	125	#include <mach/mach_types.h>
	126	#include <mach/memory_object_types.h>
	127	#include <mach/memory_object_control.h>
	128
	129	#include <kern/kalloc.h> /* kalloc()/kfree() */
	130	#include <kern/clock.h> /* delay_for_interval() */
	131	#include <libkern/OSAtomic.h> /* OSAddAtomic() */
	132	#include <os/atomic_private.h>
	133	#if defined(XNU_TARGET_OS_OSX)
	134	#include <console/video_console.h>
	135	#endif
	136
	137	#ifdef JOE_DEBUG
	138	#include <libkern/OSDebug.h>
	139	#endif
	140
	141	#include <vm/vm_protos.h> /* vnode_pager_vrele() */
	142
	143	#if CONFIG_MACF
	144	#include <security/mac_framework.h>
	145	#endif
	146
	147	#include <vfs/vfs_disk_conditioner.h>
	148	#include <libkern/section_keywords.h>
	149
	150	static LCK_GRP_DECLARE(vnode_lck_grp, "vnode");
	151	static LCK_ATTR_DECLARE(vnode_lck_attr, 0, 0);
	152
	153	#if CONFIG_TRIGGERS
	154	static LCK_GRP_DECLARE(trigger_vnode_lck_grp, "trigger_vnode");
	155	static LCK_ATTR_DECLARE(trigger_vnode_lck_attr, 0, 0);
	156	#endif
	157
	158	extern lck_mtx_t mnt_list_mtx_lock;
	159
	160	ZONE_DECLARE(specinfo_zone, "specinfo",
	161	sizeof(struct specinfo), ZC_NOENCRYPT \| ZC_ZFREE_CLEARMEM);
	162
	163	ZONE_DECLARE(vnode_zone, "vnodes",
	164	sizeof(struct vnode), ZC_NOENCRYPT \| ZC_NOGC \| ZC_ZFREE_CLEARMEM);
	165
	166	enum vtype iftovt_tab[16] = {
	167	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
	168	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
	169	};
	170	int vttoif_tab[9] = {
	171	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
	172	S_IFSOCK, S_IFIFO, S_IFMT,
	173	};
	174
	175	/* XXX These should be in a BSD accessible Mach header, but aren't. */
	176	extern void memory_object_mark_used(
	177	memory_object_control_t control);
	178
	179	extern void memory_object_mark_unused(
	180	memory_object_control_t control,
	181	boolean_t rage);
	182
	183	extern void memory_object_mark_io_tracking(
	184	memory_object_control_t control);
	185
	186	/* XXX next protptype should be from <nfs/nfs.h> */
	187	extern int nfs_vinvalbuf(vnode_t, int, vfs_context_t, int);
	188
	189	extern int paniclog_append_noflush(const char *format, ...);
	190
	191	/* XXX next prototytype should be from libsa/stdlib.h> but conflicts libkern */
	192	__private_extern__ void qsort(
	193	void * array,
	194	size_t nmembers,
	195	size_t member_size,
	196	int ()(const void , const void *));
	197
	198	__private_extern__ void vntblinit(void);
	199	__private_extern__ int unlink1(vfs_context_t, vnode_t, user_addr_t,
	200	enum uio_seg, int);
	201
	202	static void vnode_list_add(vnode_t);
	203	static void vnode_async_list_add(vnode_t);
	204	static void vnode_list_remove(vnode_t);
	205	static void vnode_list_remove_locked(vnode_t);
	206
	207	static void vnode_abort_advlocks(vnode_t);
	208	static errno_t vnode_drain(vnode_t);
	209	static void vgone(vnode_t, int flags);
	210	static void vclean(vnode_t vp, int flag);
	211	static void vnode_reclaim_internal(vnode_t, int, int, int);
	212
	213	static void vnode_dropiocount(vnode_t);
	214
	215	static vnode_t checkalias(vnode_t vp, dev_t nvp_rdev);
	216	static int vnode_reload(vnode_t);
	217
	218	static int unmount_callback(mount_t, __unused void *);
	219
	220	static void insmntque(vnode_t vp, mount_t mp);
	221	static int mount_getvfscnt(void);
	222	static int mount_fillfsids(fsid_t *, int );
	223	static void vnode_iterate_setup(mount_t);
	224	int vnode_umount_preflight(mount_t, vnode_t, int);
	225	static int vnode_iterate_prepare(mount_t);
	226	static int vnode_iterate_reloadq(mount_t);
	227	static void vnode_iterate_clear(mount_t);
	228	static mount_t vfs_getvfs_locked(fsid_t *);
	229	static int vn_create_reg(vnode_t dvp, vnode_t vpp, struct nameidata ndp,
	230	struct vnode_attr vap, uint32_t flags, int fmode, uint32_t statusp, vfs_context_t ctx);
	231	static int vnode_authattr_new_internal(vnode_t dvp, struct vnode_attr vap, int noauth, uint32_t defaulted_fieldsp, vfs_context_t ctx);
	232
	233	errno_t rmdir_remove_orphaned_appleDouble(vnode_t, vfs_context_t, int *);
	234
	235	#ifdef JOE_DEBUG
	236	static void record_vp(vnode_t vp, int count);
	237	#endif
	238
	239	#if CONFIG_JETSAM && (DEVELOPMENT \|\| DEBUG)
	240	extern int bootarg_no_vnode_jetsam; /* from bsd_init.c default value is 0 */
	241	#endif /* CONFIG_JETSAM && (DEVELOPMENT \|\| DEBUG) */
	242
	243	extern int bootarg_no_vnode_drain; /* from bsd_init.c default value is 0 */
	244
	245	boolean_t root_is_CF_drive = FALSE;
	246
	247	#if CONFIG_TRIGGERS
	248	static int vnode_resolver_create(mount_t, vnode_t, struct vnode_trigger_param *, boolean_t external);
	249	static void vnode_resolver_detach(vnode_t);
	250	#endif
	251
/*
 * Global vnode lists. All four are initialised empty by vntblinit() and
 * are linked through the vnode's v_freelist entry (see the VREM* macros).
 */
TAILQ_HEAD(freelst, vnode) vnode_free_list; /* vnode free list */
TAILQ_HEAD(deadlst, vnode) vnode_dead_list; /* vnode dead list */
TAILQ_HEAD(async_work_lst, vnode) vnode_async_work_list; /* vnode async work list */


TAILQ_HEAD(ragelst, vnode) vnode_rage_list; /* vnode rapid age list */
struct timeval rage_tv;         /* stamped with microuptime() in vntblinit() */
int rage_limit = 0;             /* set in vntblinit(): desiredvnodes / 100, at least RAGE_LIMIT_MIN */
int ragevnodes = 0;             /* decremented by VREMRAGE() on removal from the rage list */

int deadvnodes_low = 0;         /* set in vntblinit(): desiredvnodes / 100, capped at 300 */
int deadvnodes_high = 0;        /* set in vntblinit(): twice deadvnodes_low */

uint64_t newvnode = 0;
uint64_t newvnode_nodead = 0;

static int vfs_unmountall_started = 0;
	269
/* Lower bound applied to rage_limit by vntblinit(). */
#define RAGE_LIMIT_MIN  100
/* NOTE(review): presumably a time bound, in seconds, for the rage list -- not used in this section; confirm at its use site. */
#define RAGE_TIME_LIMIT 5

/*
 * ROSV definitions
 * NOTE: These are shadowed from PlatformSupport definitions, but XNU
 * builds standalone.
 */
#define PLATFORM_DATA_VOLUME_MOUNT_POINT "/System/Volumes/Data"

/*
 * These could be in PlatformSupport but aren't yet
 */
#define PLATFORM_PREBOOT_VOLUME_MOUNT_POINT "/System/Volumes/Preboot"
#define PLATFORM_RECOVERY_VOLUME_MOUNT_POINT "/System/Volumes/Recovery"

#if CONFIG_MOUNT_VM
#define PLATFORM_VM_VOLUME_MOUNT_POINT "/System/Volumes/VM"
#endif

struct mntlist mountlist;                       /* mounted filesystem list */
static int nummounts = 0;

static int print_busy_vnodes = 0;                               /* print out busy vnodes */
	294
	295	#if DIAGNOSTIC
	296	#define VLISTCHECK(fun, vp, list) \
	297	if ((vp)->v_freelist.tqe_prev == (struct vnode **)0xdeadb) \
	298	panic("%s: %s vnode not on %slist", (fun), (list), (list));
	299	#else
	300	#define VLISTCHECK(fun, vp, list)
	301	#endif /* DIAGNOSTIC */
	302
	303	#define VLISTNONE(vp) \
	304	do { \
	305	(vp)->v_freelist.tqe_next = (struct vnode *)0; \
	306	(vp)->v_freelist.tqe_prev = (struct vnode **)0xdeadb; \
	307	} while(0)
	308
	309	#define VONLIST(vp) \
	310	((vp)->v_freelist.tqe_prev != (struct vnode **)0xdeadb)
	311
	312	/* remove a vnode from free vnode list */
	313	#define VREMFREE(fun, vp) \
	314	do { \
	315	VLISTCHECK((fun), (vp), "free"); \
	316	TAILQ_REMOVE(&vnode_free_list, (vp), v_freelist); \
	317	VLISTNONE((vp)); \
	318	freevnodes--; \
	319	} while(0)
	320
	321
	322	/* remove a vnode from dead vnode list */
	323	#define VREMDEAD(fun, vp) \
	324	do { \
	325	VLISTCHECK((fun), (vp), "dead"); \
	326	TAILQ_REMOVE(&vnode_dead_list, (vp), v_freelist); \
	327	VLISTNONE((vp)); \
	328	vp->v_listflag &= ~VLIST_DEAD; \
	329	deadvnodes--; \
	330	} while(0)
	331
	332
	333	/* remove a vnode from async work vnode list */
	334	#define VREMASYNC_WORK(fun, vp) \
	335	do { \
	336	VLISTCHECK((fun), (vp), "async_work"); \
	337	TAILQ_REMOVE(&vnode_async_work_list, (vp), v_freelist); \
	338	VLISTNONE((vp)); \
	339	vp->v_listflag &= ~VLIST_ASYNC_WORK; \
	340	async_work_vnodes--; \
	341	} while(0)
	342
	343
	344	/* remove a vnode from rage vnode list */
	345	#define VREMRAGE(fun, vp) \
	346	do { \
	347	if ( !(vp->v_listflag & VLIST_RAGE)) \
	348	panic("VREMRAGE: vp not on rage list"); \
	349	VLISTCHECK((fun), (vp), "rage"); \
	350	TAILQ_REMOVE(&vnode_rage_list, (vp), v_freelist); \
	351	VLISTNONE((vp)); \
	352	vp->v_listflag &= ~VLIST_RAGE; \
	353	ragevnodes--; \
	354	} while(0)
	355
/* Entry points of the two worker threads started by vntblinit(). */
static void async_work_continue(void);
static void vn_laundry_continue(void);
	358
	359	/*
	360	* Initialize the vnode management data structures.
	361	*/
	362	__private_extern__ void
	363	vntblinit(void)
	364	{
	365	thread_t thread = THREAD_NULL;
	366
	367	TAILQ_INIT(&vnode_free_list);
	368	TAILQ_INIT(&vnode_rage_list);
	369	TAILQ_INIT(&vnode_dead_list);
	370	TAILQ_INIT(&vnode_async_work_list);
	371	TAILQ_INIT(&mountlist);
	372
	373	microuptime(&rage_tv);
	374	rage_limit = desiredvnodes / 100;
	375
	376	if (rage_limit < RAGE_LIMIT_MIN) {
	377	rage_limit = RAGE_LIMIT_MIN;
	378	}
	379
	380	deadvnodes_low = (desiredvnodes) / 100;
	381	if (deadvnodes_low > 300) {
	382	deadvnodes_low = 300;
	383	}
	384	deadvnodes_high = deadvnodes_low * 2;
	385
	386	/*
	387	* create worker threads
	388	*/
	389	kernel_thread_start((thread_continue_t)async_work_continue, NULL, &thread);
	390	thread_deallocate(thread);
	391	kernel_thread_start((thread_continue_t)vn_laundry_continue, NULL, &thread);
	392	thread_deallocate(thread);
	393	}
	394
	395	/* the timeout is in 10 msecs */
	396	int
	397	vnode_waitforwrites(vnode_t vp, int output_target, int slpflag, int slptimeout, const char *msg)
	398	{
	399	int error = 0;
	400	struct timespec ts;
	401
	402	if (output_target < 0) {
	403	return EINVAL;
	404	}
	405
	406	KERNEL_DEBUG(0x3010280 \| DBG_FUNC_START, (int)vp, output_target, vp->v_numoutput, 0, 0);
	407
	408	if (vp->v_numoutput > output_target) {
	409	slpflag \|= PDROP;
	410
	411	vnode_lock_spin(vp);
	412
	413	while ((vp->v_numoutput > output_target) && error == 0) {
	414	if (output_target) {
	415	vp->v_flag \|= VTHROTTLED;
	416	} else {
	417	vp->v_flag \|= VBWAIT;
	418	}
	419
	420	ts.tv_sec = (slptimeout / 100);
	421	ts.tv_nsec = (slptimeout % 1000) * 10 * NSEC_PER_USEC * 1000;
	422	error = msleep((caddr_t)&vp->v_numoutput, &vp->v_lock, (slpflag \| (PRIBIO + 1)), msg, &ts);
	423
	424	vnode_lock_spin(vp);
	425	}
	426	vnode_unlock(vp);
	427	}
	428	KERNEL_DEBUG(0x3010280 \| DBG_FUNC_END, (int)vp, output_target, vp->v_numoutput, error, 0);
	429
	430	return error;
	431	}
	432
	433
	434	void
	435	vnode_startwrite(vnode_t vp)
	436	{
	437	OSAddAtomic(1, &vp->v_numoutput);
	438	}
	439
	440
	441	void
	442	vnode_writedone(vnode_t vp)
	443	{
	444	if (vp) {
	445	int need_wakeup = 0;
	446
	447	OSAddAtomic(-1, &vp->v_numoutput);
	448
	449	vnode_lock_spin(vp);
	450
	451	if (vp->v_numoutput < 0) {
	452	panic("vnode_writedone: numoutput < 0");
	453	}
	454
	455	if ((vp->v_flag & VTHROTTLED)) {
	456	vp->v_flag &= ~VTHROTTLED;
	457	need_wakeup = 1;
	458	}
	459	if ((vp->v_flag & VBWAIT) && (vp->v_numoutput == 0)) {
	460	vp->v_flag &= ~VBWAIT;
	461	need_wakeup = 1;
	462	}
	463	vnode_unlock(vp);
	464
	465	if (need_wakeup) {
	466	wakeup((caddr_t)&vp->v_numoutput);
	467	}
	468	}
	469	}
	470
	471
	472
	473	int
	474	vnode_hasdirtyblks(vnode_t vp)
	475	{
	476	struct cl_writebehind *wbp;
	477
	478	/*
	479	* Not taking the buf_mtx as there is little
	480	* point doing it. Even if the lock is taken the
	481	* state can change right after that. If their
	482	* needs to be a synchronization, it must be driven
	483	* by the caller
	484	*/
	485	if (vp->v_dirtyblkhd.lh_first) {
	486	return 1;
	487	}
	488
	489	if (!UBCINFOEXISTS(vp)) {
	490	return 0;
	491	}
	492
	493	wbp = vp->v_ubcinfo->cl_wbehind;
	494
	495	if (wbp && (wbp->cl_number \|\| wbp->cl_scmap)) {
	496	return 1;
	497	}
	498
	499	return 0;
	500	}
	501
	502	int
	503	vnode_hascleanblks(vnode_t vp)
	504	{
	505	/*
	506	* Not taking the buf_mtx as there is little
	507	* point doing it. Even if the lock is taken the
	508	* state can change right after that. If their
	509	* needs to be a synchronization, it must be driven
	510	* by the caller
	511	*/
	512	if (vp->v_cleanblkhd.lh_first) {
	513	return 1;
	514	}
	515	return 0;
	516	}
	517
	518	void
	519	vnode_iterate_setup(mount_t mp)
	520	{
	521	mp->mnt_lflag \|= MNT_LITER;
	522	}
	523
	524	int
	525	vnode_umount_preflight(mount_t mp, vnode_t skipvp, int flags)
	526	{
	527	vnode_t vp;
	528	int ret = 0;
	529
	530	TAILQ_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) {
	531	if (vp->v_type == VDIR) {
	532	continue;
	533	}
	534	if (vp == skipvp) {
	535	continue;
	536	}
	537	if ((flags & SKIPSYSTEM) && ((vp->v_flag & VSYSTEM) \|\| (vp->v_flag & VNOFLUSH))) {
	538	continue;
	539	}
	540	if ((flags & SKIPSWAP) && (vp->v_flag & VSWAP)) {
	541	continue;
	542	}
	543	if ((flags & WRITECLOSE) && (vp->v_writecount == 0 \|\| vp->v_type != VREG)) {
	544	continue;
	545	}
	546
	547	/* Look for busy vnode */
	548	if ((vp->v_usecount != 0) && ((vp->v_usecount - vp->v_kusecount) != 0)) {
	549	ret = 1;
	550	if (print_busy_vnodes && ((flags & FORCECLOSE) == 0)) {
	551	vprint("vnode_umount_preflight - busy vnode", vp);
	552	} else {
	553	return ret;
	554	}
	555	} else if (vp->v_iocount > 0) {
	556	/* Busy if iocount is > 0 for more than 3 seconds */
	557	tsleep(&vp->v_iocount, PVFS, "vnode_drain_network", 3 * hz);
	558	if (vp->v_iocount > 0) {
	559	ret = 1;
	560	if (print_busy_vnodes && ((flags & FORCECLOSE) == 0)) {
	561	vprint("vnode_umount_preflight - busy vnode", vp);
	562	} else {
	563	return ret;
	564	}
	565	}
	566	continue;
	567	}
	568	}
	569
	570	return ret;
	571	}
	572
	573	/*
	574	* This routine prepares iteration by moving all the vnodes to worker queue
	575	* called with mount lock held
	576	*/
	577	int
	578	vnode_iterate_prepare(mount_t mp)
	579	{
	580	vnode_t vp;
	581
	582	if (TAILQ_EMPTY(&mp->mnt_vnodelist)) {
	583	/* nothing to do */
	584	return 0;
	585	}
	586
	587	vp = TAILQ_FIRST(&mp->mnt_vnodelist);
	588	vp->v_mntvnodes.tqe_prev = &(mp->mnt_workerqueue.tqh_first);
	589	mp->mnt_workerqueue.tqh_first = mp->mnt_vnodelist.tqh_first;
	590	mp->mnt_workerqueue.tqh_last = mp->mnt_vnodelist.tqh_last;
	591
	592	TAILQ_INIT(&mp->mnt_vnodelist);
	593	if (mp->mnt_newvnodes.tqh_first != NULL) {
	594	panic("vnode_iterate_prepare: newvnode when entering vnode");
	595	}
	596	TAILQ_INIT(&mp->mnt_newvnodes);
	597
	598	return 1;
	599	}
	600
	601
	602	/* called with mount lock held */
	603	int
	604	vnode_iterate_reloadq(mount_t mp)
	605	{
	606	int moved = 0;
	607
	608	/* add the remaining entries in workerq to the end of mount vnode list */
	609	if (!TAILQ_EMPTY(&mp->mnt_workerqueue)) {
	610	struct vnode * mvp;
	611	mvp = TAILQ_LAST(&mp->mnt_vnodelist, vnodelst);
	612
	613	/* Joining the workerque entities to mount vnode list */
	614	if (mvp) {
	615	mvp->v_mntvnodes.tqe_next = mp->mnt_workerqueue.tqh_first;
	616	} else {
	617	mp->mnt_vnodelist.tqh_first = mp->mnt_workerqueue.tqh_first;
	618	}
	619	mp->mnt_workerqueue.tqh_first->v_mntvnodes.tqe_prev = mp->mnt_vnodelist.tqh_last;
	620	mp->mnt_vnodelist.tqh_last = mp->mnt_workerqueue.tqh_last;
	621	TAILQ_INIT(&mp->mnt_workerqueue);
	622	}
	623
	624	/* add the newvnodes to the head of mount vnode list */
	625	if (!TAILQ_EMPTY(&mp->mnt_newvnodes)) {
	626	struct vnode * nlvp;
	627	nlvp = TAILQ_LAST(&mp->mnt_newvnodes, vnodelst);
	628
	629	mp->mnt_newvnodes.tqh_first->v_mntvnodes.tqe_prev = &mp->mnt_vnodelist.tqh_first;
	630	nlvp->v_mntvnodes.tqe_next = mp->mnt_vnodelist.tqh_first;
	631	if (mp->mnt_vnodelist.tqh_first) {
	632	mp->mnt_vnodelist.tqh_first->v_mntvnodes.tqe_prev = &nlvp->v_mntvnodes.tqe_next;
	633	} else {
	634	mp->mnt_vnodelist.tqh_last = mp->mnt_newvnodes.tqh_last;
	635	}
	636	mp->mnt_vnodelist.tqh_first = mp->mnt_newvnodes.tqh_first;
	637	TAILQ_INIT(&mp->mnt_newvnodes);
	638	moved = 1;
	639	}
	640
	641	return moved;
	642	}
	643
	644
	645	void
	646	vnode_iterate_clear(mount_t mp)
	647	{
	648	mp->mnt_lflag &= ~MNT_LITER;
	649	}
	650
	651	#if defined(__x86_64__)
	652
	653	#include <i386/panic_hooks.h>
	654
	655	struct vnode_iterate_panic_hook {
	656	panic_hook_t hook;
	657	mount_t mp;
	658	struct vnode *vp;
	659	};
	660
	661	static void
	662	vnode_iterate_panic_hook(panic_hook_t *hook_)
	663	{
	664	struct vnode_iterate_panic_hook hook = (struct vnode_iterate_panic_hook )hook_;
	665	panic_phys_range_t range;
	666	uint64_t phys;
	667
	668	if (panic_phys_range_before(hook->mp, &phys, &range)) {
	669	paniclog_append_noflush("mp = %p, phys = %p, prev (%p: %p-%p)\n",
	670	hook->mp, phys, range.type, range.phys_start,
	671	range.phys_start + range.len);
	672	} else {
	673	paniclog_append_noflush("mp = %p, phys = %p, prev (!)\n", hook->mp, phys);
	674	}
	675
	676	if (panic_phys_range_before(hook->vp, &phys, &range)) {
	677	paniclog_append_noflush("vp = %p, phys = %p, prev (%p: %p-%p)\n",
	678	hook->vp, phys, range.type, range.phys_start,
	679	range.phys_start + range.len);
	680	} else {
	681	paniclog_append_noflush("vp = %p, phys = %p, prev (!)\n", hook->vp, phys);
	682	}
	683	panic_dump_mem((void *)(((vm_offset_t)hook->mp - 4096) & ~4095), 12288);
	684	}
	685	#endif /* defined(__x86_64__) */
	686
	687	int
	688	vnode_iterate(mount_t mp, int flags, int (callout)(struct vnode , void *),
	689	void *arg)
	690	{
	691	struct vnode *vp;
	692	int vid, retval;
	693	int ret = 0;
	694
	695	/*
	696	* The mount iterate mutex is held for the duration of the iteration.
	697	* This can be done by a state flag on the mount structure but we can
	698	* run into priority inversion issues sometimes.
	699	* Using a mutex allows us to benefit from the priority donation
	700	* mechanisms in the kernel for locks. This mutex should never be
	701	* acquired in spin mode and it should be acquired before attempting to
	702	* acquire the mount lock.
	703	*/
	704	mount_iterate_lock(mp);
	705
	706	mount_lock(mp);
	707
	708	vnode_iterate_setup(mp);
	709
	710	/* If it returns 0 then there is nothing to do */
	711	retval = vnode_iterate_prepare(mp);
	712
	713	if (retval == 0) {
	714	vnode_iterate_clear(mp);
	715	mount_unlock(mp);
	716	mount_iterate_unlock(mp);
	717	return ret;
	718	}
	719
	720	#if defined(__x86_64__)
	721	struct vnode_iterate_panic_hook hook;
	722	hook.mp = mp;
	723	hook.vp = NULL;
	724	panic_hook(&hook.hook, vnode_iterate_panic_hook);
	725	#endif
	726	/* iterate over all the vnodes */
	727	while (!TAILQ_EMPTY(&mp->mnt_workerqueue)) {
	728	vp = TAILQ_FIRST(&mp->mnt_workerqueue);
	729	#if defined(__x86_64__)
	730	hook.vp = vp;
	731	#endif
	732	TAILQ_REMOVE(&mp->mnt_workerqueue, vp, v_mntvnodes);
	733	TAILQ_INSERT_TAIL(&mp->mnt_vnodelist, vp, v_mntvnodes);
	734	vid = vp->v_id;
	735	if ((vp->v_data == NULL) \|\| (vp->v_type == VNON) \|\| (vp->v_mount != mp)) {
	736	continue;
	737	}
	738	mount_unlock(mp);
	739
	740	if (vget_internal(vp, vid, (flags \| VNODE_NODEAD \| VNODE_WITHID \| VNODE_NOSUSPEND))) {
	741	mount_lock(mp);
	742	continue;
	743	}
	744	if (flags & VNODE_RELOAD) {
	745	/*
	746	* we're reloading the filesystem
	747	* cast out any inactive vnodes...
	748	*/
	749	if (vnode_reload(vp)) {
	750	/* vnode will be recycled on the refcount drop */
	751	vnode_put(vp);
	752	mount_lock(mp);
	753	continue;
	754	}
	755	}
	756
	757	retval = callout(vp, arg);
	758
	759	switch (retval) {
	760	case VNODE_RETURNED:
	761	case VNODE_RETURNED_DONE:
	762	vnode_put(vp);
	763	if (retval == VNODE_RETURNED_DONE) {
	764	mount_lock(mp);
	765	ret = 0;
	766	goto out;
	767	}
	768	break;
	769
	770	case VNODE_CLAIMED_DONE:
	771	mount_lock(mp);
	772	ret = 0;
	773	goto out;
	774	case VNODE_CLAIMED:
	775	default:
	776	break;
	777	}
	778	mount_lock(mp);
	779	}
	780
	781	out:
	782	#if defined(__x86_64__)
	783	panic_unhook(&hook.hook);
	784	#endif
	785	(void)vnode_iterate_reloadq(mp);
	786	vnode_iterate_clear(mp);
	787	mount_unlock(mp);
	788	mount_iterate_unlock(mp);
	789	return ret;
	790	}
	791
	792	void
	793	mount_lock_renames(mount_t mp)
	794	{
	795	lck_mtx_lock(&mp->mnt_renamelock);
	796	}
	797
	798	void
	799	mount_unlock_renames(mount_t mp)
	800	{
	801	lck_mtx_unlock(&mp->mnt_renamelock);
	802	}
	803
	804	void
	805	mount_iterate_lock(mount_t mp)
	806	{
	807	lck_mtx_lock(&mp->mnt_iter_lock);
	808	}
	809
	810	void
	811	mount_iterate_unlock(mount_t mp)
	812	{
	813	lck_mtx_unlock(&mp->mnt_iter_lock);
	814	}
	815
	816	void
	817	mount_lock(mount_t mp)
	818	{
	819	lck_mtx_lock(&mp->mnt_mlock);
	820	}
	821
	822	void
	823	mount_lock_spin(mount_t mp)
	824	{
	825	lck_mtx_lock_spin(&mp->mnt_mlock);
	826	}
	827
	828	void
	829	mount_unlock(mount_t mp)
	830	{
	831	lck_mtx_unlock(&mp->mnt_mlock);
	832	}
	833
	834
	835	void
	836	mount_ref(mount_t mp, int locked)
	837	{
	838	if (!locked) {
	839	mount_lock_spin(mp);
	840	}
	841
	842	mp->mnt_count++;
	843
	844	if (!locked) {
	845	mount_unlock(mp);
	846	}
	847	}
	848
	849
	850	void
	851	mount_drop(mount_t mp, int locked)
	852	{
	853	if (!locked) {
	854	mount_lock_spin(mp);
	855	}
	856
	857	mp->mnt_count--;
	858
	859	if (mp->mnt_count == 0 && (mp->mnt_lflag & MNT_LDRAIN)) {
	860	wakeup(&mp->mnt_lflag);
	861	}
	862
	863	if (!locked) {
	864	mount_unlock(mp);
	865	}
	866	}
	867
	868
	869	int
	870	mount_iterref(mount_t mp, int locked)
	871	{
	872	int retval = 0;
	873
	874	if (!locked) {
	875	mount_list_lock();
	876	}
	877	if (mp->mnt_iterref < 0) {
	878	retval = 1;
	879	} else {
	880	mp->mnt_iterref++;
	881	}
	882	if (!locked) {
	883	mount_list_unlock();
	884	}
	885	return retval;
	886	}
	887
	888	int
	889	mount_isdrained(mount_t mp, int locked)
	890	{
	891	int retval;
	892
	893	if (!locked) {
	894	mount_list_lock();
	895	}
	896	if (mp->mnt_iterref < 0) {
	897	retval = 1;
	898	} else {
	899	retval = 0;
	900	}
	901	if (!locked) {
	902	mount_list_unlock();
	903	}
	904	return retval;
	905	}
	906
	907	void
	908	mount_iterdrop(mount_t mp)
	909	{
	910	mount_list_lock();
	911	mp->mnt_iterref--;
	912	wakeup(&mp->mnt_iterref);
	913	mount_list_unlock();
	914	}
	915
	916	void
	917	mount_iterdrain(mount_t mp)
	918	{
	919	mount_list_lock();
	920	while (mp->mnt_iterref) {
	921	msleep((caddr_t)&mp->mnt_iterref, &mnt_list_mtx_lock, PVFS, "mount_iterdrain", NULL);
	922	}
	923	/* mount iterations drained */
	924	mp->mnt_iterref = -1;
	925	mount_list_unlock();
	926	}
	927	void
	928	mount_iterreset(mount_t mp)
	929	{
	930	mount_list_lock();
	931	if (mp->mnt_iterref == -1) {
	932	mp->mnt_iterref = 0;
	933	}
	934	mount_list_unlock();
	935	}
	936
	937	/* always called with mount lock held */
	938	int
	939	mount_refdrain(mount_t mp)
	940	{
	941	if (mp->mnt_lflag & MNT_LDRAIN) {
	942	panic("already in drain");
	943	}
	944	mp->mnt_lflag \|= MNT_LDRAIN;
	945
	946	while (mp->mnt_count) {
	947	msleep((caddr_t)&mp->mnt_lflag, &mp->mnt_mlock, PVFS, "mount_drain", NULL);
	948	}
	949
	950	if (mp->mnt_vnodelist.tqh_first != NULL) {
	951	panic("mount_refdrain: dangling vnode");
	952	}
	953
	954	mp->mnt_lflag &= ~MNT_LDRAIN;
	955
	956	return 0;
	957	}
	958
	959	/* Tags the mount point as not supportine extended readdir for NFS exports */
	960	void
	961	mount_set_noreaddirext(mount_t mp)
	962	{
	963	mount_lock(mp);
	964	mp->mnt_kern_flag \|= MNTK_DENY_READDIREXT;
	965	mount_unlock(mp);
	966	}
	967
	968	/*
	969	* Mark a mount point as busy. Used to synchronize access and to delay
	970	* unmounting.
	971	*/
	972	int
	973	vfs_busy(mount_t mp, int flags)
	974	{
	975	restart:
	976	if (mp->mnt_lflag & MNT_LDEAD) {
	977	return ENOENT;
	978	}
	979
	980	mount_lock(mp);
	981
	982	if (mp->mnt_lflag & MNT_LUNMOUNT) {
	983	if (flags & LK_NOWAIT \|\| mp->mnt_lflag & MNT_LDEAD) {
	984	mount_unlock(mp);
	985	return ENOENT;
	986	}
	987
	988	/*
	989	* Since all busy locks are shared except the exclusive
	990	* lock granted when unmounting, the only place that a
	991	* wakeup needs to be done is at the release of the
	992	* exclusive lock at the end of dounmount.
	993	*/
	994	mp->mnt_lflag \|= MNT_LWAIT;
	995	msleep((caddr_t)mp, &mp->mnt_mlock, (PVFS \| PDROP), "vfsbusy", NULL);
	996	return ENOENT;
	997	}
	998
	999	mount_unlock(mp);
	1000
	1001	lck_rw_lock_shared(&mp->mnt_rwlock);
	1002
	1003	/*
	1004	* Until we are granted the rwlock, it's possible for the mount point to
	1005	* change state, so re-evaluate before granting the vfs_busy.
	1006	*/
	1007	if (mp->mnt_lflag & (MNT_LDEAD \| MNT_LUNMOUNT)) {
	1008	lck_rw_done(&mp->mnt_rwlock);
	1009	goto restart;
	1010	}
	1011	return 0;
	1012	}
	1013
	1014	/*
	1015	* Free a busy filesystem.
	1016	*/
	1017	void
	1018	vfs_unbusy(mount_t mp)
	1019	{
	1020	lck_rw_done(&mp->mnt_rwlock);
	1021	}
	1022
	1023
	1024
	1025	static void
	1026	vfs_rootmountfailed(mount_t mp)
	1027	{
	1028	mount_list_lock();
	1029	mp->mnt_vtable->vfc_refcount--;
	1030	mount_list_unlock();
	1031
	1032	vfs_unbusy(mp);
	1033
	1034	mount_lock_destroy(mp);
	1035
	1036	#if CONFIG_MACF
	1037	mac_mount_label_destroy(mp);
	1038	#endif
	1039
	1040	zfree(mount_zone, mp);
	1041	}
	1042
	1043	/*
	1044	* Lookup a filesystem type, and if found allocate and initialize
	1045	* a mount structure for it.
	1046	*
	1047	* Devname is usually updated by mount(8) after booting.
	1048	*/
	1049	static mount_t
	1050	vfs_rootmountalloc_internal(struct vfstable vfsp, const char devname)
	1051	{
	1052	mount_t mp;
	1053
	1054	mp = zalloc_flags(mount_zone, Z_WAITOK \| Z_ZERO);
	1055	/* Initialize the default IO constraints */
	1056	mp->mnt_maxreadcnt = mp->mnt_maxwritecnt = MAXPHYS;
	1057	mp->mnt_segreadcnt = mp->mnt_segwritecnt = 32;
	1058	mp->mnt_maxsegreadsize = mp->mnt_maxreadcnt;
	1059	mp->mnt_maxsegwritesize = mp->mnt_maxwritecnt;
	1060	mp->mnt_devblocksize = DEV_BSIZE;
	1061	mp->mnt_alignmentmask = PAGE_MASK;
	1062	mp->mnt_ioqueue_depth = MNT_DEFAULT_IOQUEUE_DEPTH;
	1063	mp->mnt_ioscale = 1;
	1064	mp->mnt_ioflags = 0;
	1065	mp->mnt_realrootvp = NULLVP;
	1066	mp->mnt_authcache_ttl = CACHED_LOOKUP_RIGHT_TTL;
	1067	mp->mnt_throttle_mask = LOWPRI_MAX_NUM_DEV - 1;
	1068	mp->mnt_devbsdunit = 0;
	1069
	1070	mount_lock_init(mp);
	1071	(void)vfs_busy(mp, LK_NOWAIT);
	1072
	1073	TAILQ_INIT(&mp->mnt_vnodelist);
	1074	TAILQ_INIT(&mp->mnt_workerqueue);
	1075	TAILQ_INIT(&mp->mnt_newvnodes);
	1076
	1077	mp->mnt_vtable = vfsp;
	1078	mp->mnt_op = vfsp->vfc_vfsops;
	1079	mp->mnt_flag = MNT_RDONLY \| MNT_ROOTFS;
	1080	mp->mnt_vnodecovered = NULLVP;
	1081	//mp->mnt_stat.f_type = vfsp->vfc_typenum;
	1082	mp->mnt_flag \|= vfsp->vfc_flags & MNT_VISFLAGMASK;
	1083
	1084	mount_list_lock();
	1085	vfsp->vfc_refcount++;
	1086	mount_list_unlock();
	1087
	1088	strlcpy(mp->mnt_vfsstat.f_fstypename, vfsp->vfc_name, MFSTYPENAMELEN);
	1089	mp->mnt_vfsstat.f_mntonname[0] = '/';
	1090	/* XXX const poisoning layering violation */
	1091	(void) copystr((const void *)devname, mp->mnt_vfsstat.f_mntfromname, MAXPATHLEN - 1, NULL);
	1092
	1093	#if CONFIG_MACF
	1094	mac_mount_label_init(mp);
	1095	mac_mount_label_associate(vfs_context_kernel(), mp);
	1096	#endif
	1097	return mp;
	1098	}
	1099
	1100	errno_t
	1101	vfs_rootmountalloc(const char fstypename, const char devname, mount_t *mpp)
	1102	{
	1103	struct vfstable *vfsp;
	1104
	1105	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
	1106	if (!strncmp(vfsp->vfc_name, fstypename,
	1107	sizeof(vfsp->vfc_name))) {
	1108	break;
	1109	}
	1110	}
	1111	if (vfsp == NULL) {
	1112	return ENODEV;
	1113	}
	1114
	1115	*mpp = vfs_rootmountalloc_internal(vfsp, devname);
	1116
	1117	if (*mpp) {
	1118	return 0;
	1119	}
	1120
	1121	return ENOMEM;
	1122	}
	1123
	1124	#define DBG_MOUNTROOT (FSDBG_CODE(DBG_MOUNT, 0))
	1125
	1126	/*
	1127	* Find an appropriate filesystem to use for the root. If a filesystem
	1128	* has not been preselected, walk through the list of known filesystems
	1129	* trying those that have mountroot routines, and try them until one
	1130	* works or we have tried them all.
	1131	*/
	1132	extern int (*mountroot)(void);
	1133
	1134	int
	1135	vfs_mountroot(void)
	1136	{
	1137	#if CONFIG_MACF
	1138	struct vnode *vp;
	1139	#endif
	1140	struct vfstable *vfsp;
	1141	vfs_context_t ctx = vfs_context_kernel();
	1142	struct vfs_attr vfsattr;
	1143	int error;
	1144	mount_t mp;
	1145	vnode_t bdevvp_rootvp;
	1146
	1147	KDBG_RELEASE(DBG_MOUNTROOT \| DBG_FUNC_START);
	1148	if (mountroot != NULL) {
	1149	/*
	1150	* used for netboot which follows a different set of rules
	1151	*/
	1152	error = (*mountroot)();
	1153
	1154	KDBG_RELEASE(DBG_MOUNTROOT \| DBG_FUNC_END, error, 0);
	1155	return error;
	1156	}
	1157	if ((error = bdevvp(rootdev, &rootvp))) {
	1158	printf("vfs_mountroot: can't setup bdevvp\n");
	1159
	1160	KDBG_RELEASE(DBG_MOUNTROOT \| DBG_FUNC_END, error, 1);
	1161	return error;
	1162	}
	1163	/*
	1164	* 4951998 - code we call in vfc_mountroot may replace rootvp
	1165	* so keep a local copy for some house keeping.
	1166	*/
	1167	bdevvp_rootvp = rootvp;
	1168
	1169	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
	1170	if (vfsp->vfc_mountroot == NULL
	1171	&& !ISSET(vfsp->vfc_vfsflags, VFC_VFSCANMOUNTROOT)) {
	1172	continue;
	1173	}
	1174
	1175	mp = vfs_rootmountalloc_internal(vfsp, "root_device");
	1176	mp->mnt_devvp = rootvp;
	1177
	1178	if (vfsp->vfc_mountroot) {
	1179	error = (*vfsp->vfc_mountroot)(mp, rootvp, ctx);
	1180	} else {
	1181	error = VFS_MOUNT(mp, rootvp, 0, ctx);
	1182	}
	1183
	1184	if (!error) {
	1185	if (bdevvp_rootvp != rootvp) {
	1186	/*
	1187	* rootvp changed...
	1188	* bump the iocount and fix up mnt_devvp for the
	1189	* new rootvp (it will already have a usecount taken)...
	1190	* drop the iocount and the usecount on the orignal
	1191	* since we are no longer going to use it...
	1192	*/
	1193	vnode_getwithref(rootvp);
	1194	mp->mnt_devvp = rootvp;
	1195
	1196	vnode_rele(bdevvp_rootvp);
	1197	vnode_put(bdevvp_rootvp);
	1198	}
	1199	mp->mnt_devvp->v_specflags \|= SI_MOUNTEDON;
	1200
	1201	vfs_unbusy(mp);
	1202
	1203	mount_list_add(mp);
	1204
	1205	/*
	1206	* cache the IO attributes for the underlying physical media...
	1207	* an error return indicates the underlying driver doesn't
	1208	* support all the queries necessary... however, reasonable
	1209	* defaults will have been set, so no reason to bail or care
	1210	*/
	1211	vfs_init_io_attributes(rootvp, mp);
	1212
	1213	if (mp->mnt_ioflags & MNT_IOFLAGS_FUSION_DRIVE) {
	1214	root_is_CF_drive = TRUE;
	1215	}
	1216
	1217	/*
	1218	* Shadow the VFC_VFSNATIVEXATTR flag to MNTK_EXTENDED_ATTRS.
	1219	*/
	1220	if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSNATIVEXATTR) {
	1221	mp->mnt_kern_flag \|= MNTK_EXTENDED_ATTRS;
	1222	}
	1223	if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSPREFLIGHT) {
	1224	mp->mnt_kern_flag \|= MNTK_UNMOUNT_PREFLIGHT;
	1225	}
	1226
	1227	#if defined(XNU_TARGET_OS_OSX)
	1228	uint32_t speed;
	1229
	1230	if (MNTK_VIRTUALDEV & mp->mnt_kern_flag) {
	1231	speed = 128;
	1232	} else if (disk_conditioner_mount_is_ssd(mp)) {
	1233	speed = 7 * 256;
	1234	} else {
	1235	speed = 256;
	1236	}
	1237	vc_progress_setdiskspeed(speed);
	1238	#endif /* XNU_TARGET_OS_OSX */
	1239	/*
	1240	* Probe root file system for additional features.
	1241	*/
	1242	(void)VFS_START(mp, 0, ctx);
	1243
	1244	VFSATTR_INIT(&vfsattr);
	1245	VFSATTR_WANTED(&vfsattr, f_capabilities);
	1246	if (vfs_getattr(mp, &vfsattr, ctx) == 0 &&
	1247	VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
	1248	if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR) &&
	1249	(vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR)) {
	1250	mp->mnt_kern_flag \|= MNTK_EXTENDED_ATTRS;
	1251	}
	1252	#if NAMEDSTREAMS
	1253	if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS) &&
	1254	(vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS)) {
	1255	mp->mnt_kern_flag \|= MNTK_NAMED_STREAMS;
	1256	}
	1257	#endif
	1258	if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID) &&
	1259	(vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID)) {
	1260	mp->mnt_kern_flag \|= MNTK_PATH_FROM_ID;
	1261	}
	1262
	1263	if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS) &&
	1264	(vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS)) {
	1265	mp->mnt_kern_flag \|= MNTK_DIR_HARDLINKS;
	1266	}
	1267	}
	1268
	1269	/*
	1270	* get rid of iocount reference returned
	1271	* by bdevvp (or picked up by us on the substitued
	1272	* rootvp)... it (or we) will have also taken
	1273	* a usecount reference which we want to keep
	1274	*/
	1275	vnode_put(rootvp);
	1276
	1277	#if CONFIG_MACF
	1278	if ((vfs_flags(mp) & MNT_MULTILABEL) == 0) {
	1279	KDBG_RELEASE(DBG_MOUNTROOT \| DBG_FUNC_END, 0, 2);
	1280	return 0;
	1281	}
	1282
	1283	error = VFS_ROOT(mp, &vp, ctx);
	1284	if (error) {
	1285	printf("%s() VFS_ROOT() returned %d\n",
	1286	__func__, error);
	1287	dounmount(mp, MNT_FORCE, 0, ctx);
	1288	goto fail;
	1289	}
	1290	error = vnode_label(mp, NULL, vp, NULL, 0, ctx);
	1291	/*
	1292	* get rid of reference provided by VFS_ROOT
	1293	*/
	1294	vnode_put(vp);
	1295
	1296	if (error) {
	1297	printf("%s() vnode_label() returned %d\n",
	1298	__func__, error);
	1299	dounmount(mp, MNT_FORCE, 0, ctx);
	1300	goto fail;
	1301	}
	1302	#endif
	1303	KDBG_RELEASE(DBG_MOUNTROOT \| DBG_FUNC_END, 0, 3);
	1304	return 0;
	1305	}
	1306	#if CONFIG_MACF
	1307	fail:
	1308	#endif
	1309	vfs_rootmountfailed(mp);
	1310
	1311	if (error != EINVAL) {
	1312	printf("%s_mountroot failed: %d\n", vfsp->vfc_name, error);
	1313	}
	1314	}
	1315	KDBG_RELEASE(DBG_MOUNTROOT \| DBG_FUNC_END, error ? error : ENODEV, 4);
	1316	return ENODEV;
	1317	}
	1318
	1319	static int
	1320	cache_purge_callback(mount_t mp, __unused void * arg)
	1321	{
	1322	cache_purgevfs(mp);
	1323	return VFS_RETURNED;
	1324	}
	1325
	1326	extern lck_rw_t rootvnode_rw_lock;
	1327	extern void set_rootvnode(vnode_t);
	1328
	1329
	1330	static int
	1331	mntonname_fixup_callback(mount_t mp, __unused void *arg)
	1332	{
	1333	int error = 0;
	1334
	1335	if ((strncmp(&mp->mnt_vfsstat.f_mntonname[0], "/", sizeof("/")) == 0) \|\|
	1336	(strncmp(&mp->mnt_vfsstat.f_mntonname[0], "/dev", sizeof("/dev")) == 0)) {
	1337	return 0;
	1338	}
	1339
	1340	if ((error = vfs_busy(mp, LK_NOWAIT))) {
	1341	printf("vfs_busy failed with %d for %s\n", error, mp->mnt_vfsstat.f_mntonname);
	1342	return -1;
	1343	}
	1344
	1345	int pathlen = MAXPATHLEN;
	1346	if ((error = vn_getpath_ext(mp->mnt_vnodecovered, NULL, mp->mnt_vfsstat.f_mntonname, &pathlen, VN_GETPATH_FSENTER))) {
	1347	printf("vn_getpath_ext failed with %d for mnt_vnodecovered of %s\n", error, mp->mnt_vfsstat.f_mntonname);
	1348	}
	1349
	1350	vfs_unbusy(mp);
	1351
	1352	return error;
	1353	}
	1354
	1355	static int
	1356	clear_mntk_backs_root_callback(mount_t mp, __unused void *arg)
	1357	{
	1358	lck_rw_lock_exclusive(&mp->mnt_rwlock);
	1359	mp->mnt_kern_flag &= ~MNTK_BACKS_ROOT;
	1360	lck_rw_done(&mp->mnt_rwlock);
	1361	return VFS_RETURNED;
	1362	}
	1363
	1364	static int
	1365	verify_incoming_rootfs(vnode_t *incoming_rootvnodep, vfs_context_t ctx,
	1366	vfs_switch_root_flags_t flags)
	1367	{
	1368	mount_t mp;
	1369	vnode_t tdp;
	1370	vnode_t incoming_rootvnode_with_iocount = *incoming_rootvnodep;
	1371	vnode_t incoming_rootvnode_with_usecount = NULLVP;
	1372	int error = 0;
	1373
	1374	if (vnode_vtype(incoming_rootvnode_with_iocount) != VDIR) {
	1375	printf("Incoming rootfs path not a directory\n");
	1376	error = ENOTDIR;
	1377	goto done;
	1378	}
	1379
	1380	/*
	1381	* Before we call VFS_ROOT, we have to let go of the iocount already
	1382	* acquired, but before doing that get a usecount.
	1383	*/
	1384	vnode_ref_ext(incoming_rootvnode_with_iocount, 0, VNODE_REF_FORCE);
	1385	incoming_rootvnode_with_usecount = incoming_rootvnode_with_iocount;
	1386	vnode_lock_spin(incoming_rootvnode_with_usecount);
	1387	if ((mp = incoming_rootvnode_with_usecount->v_mount)) {
	1388	mp->mnt_crossref++;
	1389	vnode_unlock(incoming_rootvnode_with_usecount);
	1390	} else {
	1391	vnode_unlock(incoming_rootvnode_with_usecount);
	1392	printf("Incoming rootfs root vnode does not have associated mount\n");
	1393	error = ENOTDIR;
	1394	goto done;
	1395	}
	1396
	1397	if (vfs_busy(mp, LK_NOWAIT)) {
	1398	printf("Incoming rootfs root vnode mount is busy\n");
	1399	error = ENOENT;
	1400	goto out;
	1401	}
	1402
	1403	vnode_put(incoming_rootvnode_with_iocount);
	1404	incoming_rootvnode_with_iocount = NULLVP;
	1405
	1406	error = VFS_ROOT(mp, &tdp, ctx);
	1407
	1408	if (error) {
	1409	printf("Could not get rootvnode of incoming rootfs\n");
	1410	} else if (tdp != incoming_rootvnode_with_usecount) {
	1411	vnode_put(tdp);
	1412	tdp = NULLVP;
	1413	printf("Incoming rootfs root vnode mount is is not a mountpoint\n");
	1414	error = EINVAL;
	1415	goto out_busy;
	1416	} else {
	1417	incoming_rootvnode_with_iocount = tdp;
	1418	tdp = NULLVP;
	1419	}
	1420
	1421	if ((flags & VFSSR_VIRTUALDEV_PROHIBITED) != 0) {
	1422	lck_rw_lock_shared(&mp->mnt_rwlock);
	1423	if (mp->mnt_flag & MNTK_VIRTUALDEV) {
	1424	error = ENODEV;
	1425	}
	1426	lck_rw_done(&mp->mnt_rwlock);
	1427	if (error) {
	1428	printf("Incoming rootfs is backed by a virtual device; cannot switch to it");
	1429	goto out_busy;
	1430	}
	1431	}
	1432
	1433	out_busy:
	1434	vfs_unbusy(mp);
	1435
	1436	out:
	1437	vnode_lock(incoming_rootvnode_with_usecount);
	1438	mp->mnt_crossref--;
	1439	if (mp->mnt_crossref < 0) {
	1440	panic("mount cross refs -ve");
	1441	}
	1442	vnode_unlock(incoming_rootvnode_with_usecount);
	1443
	1444	done:
	1445	if (incoming_rootvnode_with_usecount) {
	1446	vnode_rele(incoming_rootvnode_with_usecount);
	1447	incoming_rootvnode_with_usecount = NULLVP;
	1448	}
	1449
	1450	if (error && incoming_rootvnode_with_iocount) {
	1451	vnode_put(incoming_rootvnode_with_iocount);
	1452	incoming_rootvnode_with_iocount = NULLVP;
	1453	}
	1454
	1455	*incoming_rootvnodep = incoming_rootvnode_with_iocount;
	1456	return error;
	1457	}
	1458
	1459	/*
	1460	* vfs_switch_root()
	1461	*
	1462	* Move the current root volume, and put a different volume at the root.
	1463	*
	1464	* incoming_vol_old_path: This is the path where the incoming root volume
	1465	* is mounted when this function begins.
	1466	* outgoing_vol_new_path: This is the path where the outgoing root volume
	1467	* will be mounted when this function (successfully) ends.
	1468	* Note: Do not use a leading slash.
	1469	*
	1470	* Volumes mounted at several fixed points (including /dev) will be preserved
	1471	* at the same absolute path. That means they will move within the folder
	1472	* hierarchy during the pivot operation. For example, /dev before the pivot
	1473	* will be at /dev after the pivot.
	1474	*
	1475	* If any filesystem has MNTK_BACKS_ROOT set, it will be cleared. If the
	1476	* incoming root volume is actually a disk image backed by some other
	1477	* filesystem, it is the caller's responsibility to re-set MNTK_BACKS_ROOT
	1478	* as appropriate.
	1479	*/
	1480	int
	1481	vfs_switch_root(const char *incoming_vol_old_path,
	1482	const char *outgoing_vol_new_path,
	1483	vfs_switch_root_flags_t flags)
	1484	{
	1485	// grumble grumble
	1486	#define countof(x) (sizeof(x) / sizeof(x[0]))
	1487
	1488	struct preserved_mount {
	1489	vnode_t pm_rootvnode;
	1490	mount_t pm_mount;
	1491	vnode_t pm_new_covered_vp;
	1492	vnode_t pm_old_covered_vp;
	1493	const char *pm_path;
	1494	};
	1495
	1496	vfs_context_t ctx = vfs_context_kernel();
	1497	vnode_t incoming_rootvnode = NULLVP;
	1498	vnode_t outgoing_vol_new_covered_vp = NULLVP;
	1499	vnode_t incoming_vol_old_covered_vp = NULLVP;
	1500	mount_t outgoing = NULL;
	1501	mount_t incoming = NULL;
	1502
	1503	struct preserved_mount devfs = { NULLVP, NULL, NULLVP, NULLVP, "dev" };
	1504	struct preserved_mount preboot = { NULLVP, NULL, NULLVP, NULLVP, "System/Volumes/Preboot" };
	1505	struct preserved_mount recovery = { NULLVP, NULL, NULLVP, NULLVP, "System/Volumes/Recovery" };
	1506	struct preserved_mount vm = { NULLVP, NULL, NULLVP, NULLVP, "System/Volumes/VM" };
	1507	struct preserved_mount update = { NULLVP, NULL, NULLVP, NULLVP, "System/Volumes/Update" };
	1508	struct preserved_mount iscPreboot = { NULLVP, NULL, NULLVP, NULLVP, "System/Volumes/iSCPreboot" };
	1509	struct preserved_mount hardware = { NULLVP, NULL, NULLVP, NULLVP, "System/Volumes/Hardware" };
	1510	struct preserved_mount xarts = { NULLVP, NULL, NULLVP, NULLVP, "System/Volumes/xarts" };
	1511	struct preserved_mount factorylogs = { NULLVP, NULL, NULLVP, NULLVP, "FactoryLogs" };
	1512	struct preserved_mount idiags = { NULLVP, NULL, NULLVP, NULLVP, "System/Volumes/Diags" };
	1513
	1514	struct preserved_mount *preserved[10];
	1515	preserved[0] = &devfs;
	1516	preserved[1] = &preboot;
	1517	preserved[2] = &recovery;
	1518	preserved[3] = &vm;
	1519	preserved[4] = &update;
	1520	preserved[5] = &iscPreboot;
	1521	preserved[6] = &hardware;
	1522	preserved[7] = &xarts;
	1523	preserved[8] = &factorylogs;
	1524	preserved[9] = &idiags;
	1525
	1526	int error;
	1527
	1528	printf("%s : shuffling mount points : %s <-> / <-> %s\n", __FUNCTION__, incoming_vol_old_path, outgoing_vol_new_path);
	1529
	1530	if (outgoing_vol_new_path[0] == '/') {
	1531	// I should have written this to be more helpful and just advance the pointer forward past the slash
	1532	printf("Do not use a leading slash in outgoing_vol_new_path\n");
	1533	return EINVAL;
	1534	}
	1535
	1536	// Set incoming_rootvnode.
	1537	// Find the vnode representing the mountpoint of the new root
	1538	// filesystem. That will be the new root directory.
	1539	error = vnode_lookup(incoming_vol_old_path, 0, &incoming_rootvnode, ctx);
	1540	if (error) {
	1541	printf("Incoming rootfs root vnode not found\n");
	1542	error = ENOENT;
	1543	goto done;
	1544	}
	1545
	1546	/*
	1547	* This function drops the icoount and sets the vnode to NULL on error.
	1548	*/
	1549	error = verify_incoming_rootfs(&incoming_rootvnode, ctx, flags);
	1550	if (error) {
	1551	goto done;
	1552	}
	1553
	1554	/*
	1555	* Set outgoing_vol_new_covered_vp.
	1556	* Find the vnode representing the future mountpoint of the old
	1557	* root filesystem, inside the directory incoming_rootvnode.
	1558	* Right now it's at "/incoming_vol_old_path/outgoing_vol_new_path".
	1559	* soon it will become "/oldrootfs_path_after", which will be covered.
	1560	*/
	1561	error = vnode_lookupat(outgoing_vol_new_path, 0, &outgoing_vol_new_covered_vp, ctx, incoming_rootvnode);
	1562	if (error) {
	1563	printf("Outgoing rootfs path not found, abandoning / switch, error = %d\n", error);
	1564	error = ENOENT;
	1565	goto done;
	1566	}
	1567	if (vnode_vtype(outgoing_vol_new_covered_vp) != VDIR) {
	1568	printf("Outgoing rootfs path is not a directory, abandoning / switch\n");
	1569	error = ENOTDIR;
	1570	goto done;
	1571	}
	1572
	1573	/*
	1574	* Find the preserved mounts - see if they are mounted. Get their root
	1575	* vnode if they are. If they aren't, leave rootvnode NULL which will
	1576	* be the signal to ignore this mount later on.
	1577	*
	1578	* Also get preserved mounts' new_covered_vp.
	1579	* Find the node representing the folder "dev" inside the directory newrootvnode.
	1580	* Right now it's at "/incoming_vol_old_path/dev".
	1581	* Soon it will become /dev, which will be covered by the devfs mountpoint.
	1582	*/
	1583	for (size_t i = 0; i < countof(preserved); i++) {
	1584	struct preserved_mount *pmi = preserved[i];
	1585
	1586	error = vnode_lookupat(pmi->pm_path, 0, &pmi->pm_rootvnode, ctx, rootvnode);
	1587	if (error) {
	1588	printf("skipping preserved mountpoint because not found or error: %d: %s\n", error, pmi->pm_path);
	1589	// not fatal. try the next one in the list.
	1590	continue;
	1591	}
	1592	bool is_mountpoint = false;
	1593	vnode_lock_spin(pmi->pm_rootvnode);
	1594	if ((pmi->pm_rootvnode->v_flag & VROOT) != 0) {
	1595	is_mountpoint = true;
	1596	}
	1597	vnode_unlock(pmi->pm_rootvnode);
	1598	if (!is_mountpoint) {
	1599	printf("skipping preserved mountpoint because not a mountpoint: %s\n", pmi->pm_path);
	1600	vnode_put(pmi->pm_rootvnode);
	1601	pmi->pm_rootvnode = NULLVP;
	1602	// not fatal. try the next one in the list.
	1603	continue;
	1604	}
	1605
	1606	error = vnode_lookupat(pmi->pm_path, 0, &pmi->pm_new_covered_vp, ctx, incoming_rootvnode);
	1607	if (error) {
	1608	printf("preserved new mount directory not found or error: %d: %s\n", error, pmi->pm_path);
	1609	error = ENOENT;
	1610	goto done;
	1611	}
	1612	if (vnode_vtype(pmi->pm_new_covered_vp) != VDIR) {
	1613	printf("preserved new mount directory not directory: %s\n", pmi->pm_path);
	1614	error = ENOTDIR;
	1615	goto done;
	1616	}
	1617
	1618	printf("will preserve mountpoint across pivot: /%s\n", pmi->pm_path);
	1619	}
	1620
	1621	/*
	1622	* --
	1623	* At this point, everything has been prepared and all error conditions
	1624	* have been checked. We check everything we can before this point;
	1625	* from now on we start making destructive changes, and we can't stop
	1626	* until we reach the end.
	1627	* ----
	1628	*/
	1629
	1630	/* this usecount is transferred to the mnt_vnodecovered */
	1631	vnode_ref_ext(outgoing_vol_new_covered_vp, 0, VNODE_REF_FORCE);
	1632	/* this usecount is transferred to set_rootvnode */
	1633	vnode_ref_ext(incoming_rootvnode, 0, VNODE_REF_FORCE);
	1634
	1635
	1636	for (size_t i = 0; i < countof(preserved); i++) {
	1637	struct preserved_mount *pmi = preserved[i];
	1638	if (pmi->pm_rootvnode == NULLVP) {
	1639	continue;
	1640	}
	1641
	1642	/* this usecount is transferred to the mnt_vnodecovered */
	1643	vnode_ref_ext(pmi->pm_new_covered_vp, 0, VNODE_REF_FORCE);
	1644
	1645	/* The new_covered_vp is a mountpoint from now on. */
	1646	vnode_lock_spin(pmi->pm_new_covered_vp);
	1647	pmi->pm_new_covered_vp->v_flag \|= VMOUNT;
	1648	vnode_unlock(pmi->pm_new_covered_vp);
	1649	}
	1650
	1651	/* The outgoing_vol_new_covered_vp is a mountpoint from now on. */
	1652	vnode_lock_spin(outgoing_vol_new_covered_vp);
	1653	outgoing_vol_new_covered_vp->v_flag \|= VMOUNT;
	1654	vnode_unlock(outgoing_vol_new_covered_vp);
	1655
	1656
	1657	/*
	1658	* Identify the mount_ts of the mounted filesystems that are being
	1659	* manipulated: outgoing rootfs, incoming rootfs, and the preserved
	1660	* mounts.
	1661	*/
	1662	outgoing = rootvnode->v_mount;
	1663	incoming = incoming_rootvnode->v_mount;
	1664	for (size_t i = 0; i < countof(preserved); i++) {
	1665	struct preserved_mount *pmi = preserved[i];
	1666	if (pmi->pm_rootvnode == NULLVP) {
	1667	continue;
	1668	}
	1669
	1670	pmi->pm_mount = pmi->pm_rootvnode->v_mount;
	1671	}
	1672
	1673	lck_rw_lock_exclusive(&rootvnode_rw_lock);
	1674
	1675	/* Setup incoming as the new rootfs */
	1676	lck_rw_lock_exclusive(&incoming->mnt_rwlock);
	1677	incoming_vol_old_covered_vp = incoming->mnt_vnodecovered;
	1678	incoming->mnt_vnodecovered = NULLVP;
	1679	strlcpy(incoming->mnt_vfsstat.f_mntonname, "/", MAXPATHLEN);
	1680	incoming->mnt_flag \|= MNT_ROOTFS;
	1681	lck_rw_done(&incoming->mnt_rwlock);
	1682
	1683	/*
	1684	* The preserved mountpoints will now be moved to
	1685	* incoming_rootnode/pm_path, and then by the end of the function,
	1686	* since incoming_rootnode is going to /, the preserved mounts
	1687	* will be end up back at /pm_path
	1688	*/
	1689	for (size_t i = 0; i < countof(preserved); i++) {
	1690	struct preserved_mount *pmi = preserved[i];
	1691	if (pmi->pm_rootvnode == NULLVP) {
	1692	continue;
	1693	}
	1694
	1695	lck_rw_lock_exclusive(&pmi->pm_mount->mnt_rwlock);
	1696	pmi->pm_old_covered_vp = pmi->pm_mount->mnt_vnodecovered;
	1697	pmi->pm_mount->mnt_vnodecovered = pmi->pm_new_covered_vp;
	1698	vnode_lock_spin(pmi->pm_new_covered_vp);
	1699	pmi->pm_new_covered_vp->v_mountedhere = pmi->pm_mount;
	1700	vnode_unlock(pmi->pm_new_covered_vp);
	1701	lck_rw_done(&pmi->pm_mount->mnt_rwlock);
	1702	}
	1703
	1704	/*
	1705	* The old root volume now covers outgoing_vol_new_covered_vp
	1706	* on the new root volume. Remove the ROOTFS marker.
	1707	* Now it is to be found at outgoing_vol_new_path
	1708	*/
	1709	lck_rw_lock_exclusive(&outgoing->mnt_rwlock);
	1710	outgoing->mnt_vnodecovered = outgoing_vol_new_covered_vp;
	1711	strlcpy(outgoing->mnt_vfsstat.f_mntonname, "/", MAXPATHLEN);
	1712	strlcat(outgoing->mnt_vfsstat.f_mntonname, outgoing_vol_new_path, MAXPATHLEN);
	1713	outgoing->mnt_flag &= ~MNT_ROOTFS;
	1714	vnode_lock_spin(outgoing_vol_new_covered_vp);
	1715	outgoing_vol_new_covered_vp->v_mountedhere = outgoing;
	1716	vnode_unlock(outgoing_vol_new_covered_vp);
	1717	lck_rw_done(&outgoing->mnt_rwlock);
	1718
	1719	if (!(outgoing->mnt_kern_flag & MNTK_VIRTUALDEV) &&
	1720	(TAILQ_FIRST(&mountlist) == outgoing)) {
	1721	vfs_setmntsystem(outgoing);
	1722	}
	1723
	1724	/*
	1725	* Finally, remove the mount_t linkage from the previously covered
	1726	* vnodes on the old root volume. These were incoming_vol_old_path,
	1727	* and each preserved mounts's "/pm_path". The filesystems previously
	1728	* mounted there have already been moved away.
	1729	*/
	1730	vnode_lock_spin(incoming_vol_old_covered_vp);
	1731	incoming_vol_old_covered_vp->v_flag &= ~VMOUNT;
	1732	incoming_vol_old_covered_vp->v_mountedhere = NULL;
	1733	vnode_unlock(incoming_vol_old_covered_vp);
	1734
	1735	for (size_t i = 0; i < countof(preserved); i++) {
	1736	struct preserved_mount *pmi = preserved[i];
	1737	if (pmi->pm_rootvnode == NULLVP) {
	1738	continue;
	1739	}
	1740
	1741	vnode_lock_spin(pmi->pm_old_covered_vp);
	1742	pmi->pm_old_covered_vp->v_flag &= ~VMOUNT;
	1743	pmi->pm_old_covered_vp->v_mountedhere = NULL;
	1744	vnode_unlock(pmi->pm_old_covered_vp);
	1745	}
	1746
	1747	/*
	1748	* Clear the name cache since many cached names are now invalid.
	1749	*/
	1750	vfs_iterate(0 /* flags */, cache_purge_callback, NULL);
	1751
	1752	/*
	1753	* Actually change the rootvnode! And finally drop the lock that
	1754	* prevents concurrent vnode_lookups.
	1755	*/
	1756	set_rootvnode(incoming_rootvnode);
	1757	lck_rw_unlock_exclusive(&rootvnode_rw_lock);
	1758
	1759	if (!(incoming->mnt_kern_flag & MNTK_VIRTUALDEV) &&
	1760	!(outgoing->mnt_kern_flag & MNTK_VIRTUALDEV)) {
	1761	/*
	1762	* Switch the order of mount structures in the mountlist, new root
	1763	* mount moves to the head of the list followed by /dev and the other
	1764	* preserved mounts then all the preexisting mounts (old rootfs + any
	1765	* others)
	1766	*/
	1767	mount_list_lock();
	1768	for (size_t i = 0; i < countof(preserved); i++) {
	1769	struct preserved_mount *pmi = preserved[i];
	1770	if (pmi->pm_rootvnode == NULLVP) {
	1771	continue;
	1772	}
	1773
	1774	TAILQ_REMOVE(&mountlist, pmi->pm_mount, mnt_list);
	1775	TAILQ_INSERT_HEAD(&mountlist, pmi->pm_mount, mnt_list);
	1776	}
	1777	TAILQ_REMOVE(&mountlist, incoming, mnt_list);
	1778	TAILQ_INSERT_HEAD(&mountlist, incoming, mnt_list);
	1779	mount_list_unlock();
	1780	}
	1781
	1782	/*
	1783	* Fixups across all volumes
	1784	*/
	1785	vfs_iterate(0 /* flags */, mntonname_fixup_callback, NULL);
	1786	vfs_iterate(0 /* flags */, clear_mntk_backs_root_callback, NULL);
	1787
	1788	error = 0;
	1789
	1790	done:
	1791	for (size_t i = 0; i < countof(preserved); i++) {
	1792	struct preserved_mount *pmi = preserved[i];
	1793
	1794	if (pmi->pm_rootvnode) {
	1795	vnode_put(pmi->pm_rootvnode);
	1796	}
	1797	if (pmi->pm_new_covered_vp) {
	1798	vnode_put(pmi->pm_new_covered_vp);
	1799	}
	1800	if (pmi->pm_old_covered_vp) {
	1801	vnode_rele(pmi->pm_old_covered_vp);
	1802	}
	1803	}
	1804
	1805	if (outgoing_vol_new_covered_vp) {
	1806	vnode_put(outgoing_vol_new_covered_vp);
	1807	}
	1808
	1809	if (incoming_vol_old_covered_vp) {
	1810	vnode_rele(incoming_vol_old_covered_vp);
	1811	}
	1812
	1813	if (incoming_rootvnode) {
	1814	vnode_put(incoming_rootvnode);
	1815	}
	1816
	1817	printf("%s : done shuffling mount points with error: %d\n", __FUNCTION__, error);
	1818	return error;
	1819	}
	1820
	/*
	 * Mount the Recovery volume of a container.
	 *
	 * Only does anything when CONFIG_MOUNT_PREBOOTRECOVERY is set; otherwise
	 * it is a no-op that returns 0. The mount is attempted at
	 * PLATFORM_RECOVERY_VOLUME_MOUNT_POINT using the root mount's filesystem
	 * type name. Returns the vnode_get() or kernel_mount() error, if any.
	 */
	int
	vfs_mount_recovery(void)
	{
	#if CONFIG_MOUNT_PREBOOTRECOVERY
		int error = 0;

		/* root must be mounted first */
		error = vnode_get(rootvnode);
		if (error != 0) {
			printf("vnode_get(rootvnode) failed with error %d\n", error);
			return error;
		}

		/* writable copy: !const because of internal casting */
		char recoverypath[] = PLATFORM_RECOVERY_VOLUME_MOUNT_POINT;

		/* Mount the recovery volume */
		printf("attempting kernel mount for recovery volume... \n");
		error = kernel_mount(rootvnode->v_mount->mnt_vfsstat.f_fstypename,
		    NULLVP, NULLVP, recoverypath, (rootvnode->v_mount), 0, 0,
		    (KERNEL_MOUNT_RECOVERYVOL), vfs_context_kernel());

		if (error == 0) {
			printf("mounted recovery volume\n");
		} else {
			printf("Failed to mount recovery volume (%d)\n", error);
		}

		/* drop the iocount taken by vnode_get() above */
		vnode_put(rootvnode);
		return error;
	#else
		return 0;
	#endif
	}
	1856
	1857	/*
	1858	* Lookup a mount point by filesystem identifier.
	1859	*/
	1860
	1861	struct mount *
	1862	vfs_getvfs(fsid_t *fsid)
	1863	{
	1864	return mount_list_lookupby_fsid(fsid, 0, 0);
	1865	}
	1866
	1867	static struct mount *
	1868	vfs_getvfs_locked(fsid_t *fsid)
	1869	{
	1870	return mount_list_lookupby_fsid(fsid, 1, 0);
	1871	}
	1872
	1873	struct mount *
	1874	vfs_getvfs_by_mntonname(char *path)
	1875	{
	1876	mount_t retmp = (mount_t)0;
	1877	mount_t mp;
	1878
	1879	mount_list_lock();
	1880	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
	1881	if (!strncmp(mp->mnt_vfsstat.f_mntonname, path,
	1882	sizeof(mp->mnt_vfsstat.f_mntonname))) {
	1883	retmp = mp;
	1884	if (mount_iterref(retmp, 1)) {
	1885	retmp = NULL;
	1886	}
	1887	goto out;
	1888	}
	1889	}
	1890	out:
	1891	mount_list_unlock();
	1892	return retmp;
	1893	}
	1894
	1895	/* generation number for creation of new fsids */
	1896	u_short mntid_gen = 0;
	1897	/*
	1898	* Get a new unique fsid
	1899	*/
	1900	void
	1901	vfs_getnewfsid(struct mount *mp)
	1902	{
	1903	fsid_t tfsid;
	1904	int mtype;
	1905
	1906	mount_list_lock();
	1907
	1908	/* generate a new fsid */
	1909	mtype = mp->mnt_vtable->vfc_typenum;
	1910	if (++mntid_gen == 0) {
	1911	mntid_gen++;
	1912	}
	1913	tfsid.val[0] = makedev(nblkdev + mtype, mntid_gen);
	1914	tfsid.val[1] = mtype;
	1915
	1916	while (vfs_getvfs_locked(&tfsid)) {
	1917	if (++mntid_gen == 0) {
	1918	mntid_gen++;
	1919	}
	1920	tfsid.val[0] = makedev(nblkdev + mtype, mntid_gen);
	1921	}
	1922
	1923	mp->mnt_vfsstat.f_fsid.val[0] = tfsid.val[0];
	1924	mp->mnt_vfsstat.f_fsid.val[1] = tfsid.val[1];
	1925	mount_list_unlock();
	1926	}
	1927
	/*
	 * Routines having to do with the management of the vnode table.
	 */
	/* vnode operations vector declared elsewhere (pointer to an array of op function pointers) */
	extern int(**dead_vnodeop_p)(void *);
	/* vnode table counters */
	long numvnodes, freevnodes, deadvnodes, async_work_vnodes;


	/* statistics counters, all starting at zero */
	int async_work_timed_out = 0;
	int async_work_handled = 0;
	int dead_vnode_wanted = 0;
	int dead_vnode_waited = 0;
	1939
	1940	/*
	1941	* Move a vnode from one mount queue to another.
	1942	*/
	1943	static void
	1944	insmntque(vnode_t vp, mount_t mp)
	1945	{
	1946	mount_t lmp;
	1947	/*
	1948	* Delete from old mount point vnode list, if on one.
	1949	*/
	1950	if ((lmp = vp->v_mount) != NULL && lmp != dead_mountp) {
	1951	if ((vp->v_lflag & VNAMED_MOUNT) == 0) {
	1952	panic("insmntque: vp not in mount vnode list");
	1953	}
	1954	vp->v_lflag &= ~VNAMED_MOUNT;
	1955
	1956	mount_lock_spin(lmp);
	1957
	1958	mount_drop(lmp, 1);
	1959
	1960	if (vp->v_mntvnodes.tqe_next == NULL) {
	1961	if (TAILQ_LAST(&lmp->mnt_vnodelist, vnodelst) == vp) {
	1962	TAILQ_REMOVE(&lmp->mnt_vnodelist, vp, v_mntvnodes);
	1963	} else if (TAILQ_LAST(&lmp->mnt_newvnodes, vnodelst) == vp) {
	1964	TAILQ_REMOVE(&lmp->mnt_newvnodes, vp, v_mntvnodes);
	1965	} else if (TAILQ_LAST(&lmp->mnt_workerqueue, vnodelst) == vp) {
	1966	TAILQ_REMOVE(&lmp->mnt_workerqueue, vp, v_mntvnodes);
	1967	}
	1968	} else {
	1969	vp->v_mntvnodes.tqe_next->v_mntvnodes.tqe_prev = vp->v_mntvnodes.tqe_prev;
	1970	*vp->v_mntvnodes.tqe_prev = vp->v_mntvnodes.tqe_next;
	1971	}
	1972	vp->v_mntvnodes.tqe_next = NULL;
	1973	vp->v_mntvnodes.tqe_prev = NULL;
	1974	mount_unlock(lmp);
	1975	return;
	1976	}
	1977
	1978	/*
	1979	* Insert into list of vnodes for the new mount point, if available.
	1980	*/
	1981	if ((vp->v_mount = mp) != NULL) {
	1982	mount_lock_spin(mp);
	1983	if ((vp->v_mntvnodes.tqe_next != 0) && (vp->v_mntvnodes.tqe_prev != 0)) {
	1984	panic("vp already in mount list");
	1985	}
	1986	if (mp->mnt_lflag & MNT_LITER) {
	1987	TAILQ_INSERT_HEAD(&mp->mnt_newvnodes, vp, v_mntvnodes);
	1988	} else {
	1989	TAILQ_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes);
	1990	}
	1991	if (vp->v_lflag & VNAMED_MOUNT) {
	1992	panic("insmntque: vp already in mount vnode list");
	1993	}
	1994	vp->v_lflag \|= VNAMED_MOUNT;
	1995	mount_ref(mp, 1);
	1996	mount_unlock(mp);
	1997	}
	1998	}
	1999
	2000
	2001	/*
	2002	* Create a vnode for a block device.
	2003	* Used for root filesystem, argdev, and swap areas.
	2004	* Also used for memory file system special devices.
	2005	*/
	2006	int
	2007	bdevvp(dev_t dev, vnode_t *vpp)
	2008	{
	2009	vnode_t nvp;
	2010	int error;
	2011	struct vnode_fsparam vfsp;
	2012	struct vfs_context context;
	2013
	2014	if (dev == NODEV) {
	2015	*vpp = NULLVP;
	2016	return ENODEV;
	2017	}
	2018
	2019	context.vc_thread = current_thread();
	2020	context.vc_ucred = FSCRED;
	2021
	2022	vfsp.vnfs_mp = (struct mount *)0;
	2023	vfsp.vnfs_vtype = VBLK;
	2024	vfsp.vnfs_str = "bdevvp";
	2025	vfsp.vnfs_dvp = NULL;
	2026	vfsp.vnfs_fsnode = NULL;
	2027	vfsp.vnfs_cnp = NULL;
	2028	vfsp.vnfs_vops = spec_vnodeop_p;
	2029	vfsp.vnfs_rdev = dev;
	2030	vfsp.vnfs_filesize = 0;
	2031
	2032	vfsp.vnfs_flags = VNFS_NOCACHE \| VNFS_CANTCACHE;
	2033
	2034	vfsp.vnfs_marksystem = 0;
	2035	vfsp.vnfs_markroot = 0;
	2036
	2037	if ((error = vnode_create(VNCREATE_FLAVOR, VCREATESIZE, &vfsp, &nvp))) {
	2038	*vpp = NULLVP;
	2039	return error;
	2040	}
	2041	vnode_lock_spin(nvp);
	2042	nvp->v_flag \|= VBDEVVP;
	2043	nvp->v_tag = VT_NON; /* set this to VT_NON so during aliasing it can be replaced */
	2044	vnode_unlock(nvp);
	2045	if ((error = vnode_ref(nvp))) {
	2046	panic("bdevvp failed: vnode_ref");
	2047	return error;
	2048	}
	2049	if ((error = VNOP_FSYNC(nvp, MNT_WAIT, &context))) {
	2050	panic("bdevvp failed: fsync");
	2051	return error;
	2052	}
	2053	if ((error = buf_invalidateblks(nvp, BUF_WRITE_DATA, 0, 0))) {
	2054	panic("bdevvp failed: invalidateblks");
	2055	return error;
	2056	}
	2057
	2058	#if CONFIG_MACF
	2059	/*
	2060	* XXXMAC: We can't put a MAC check here, the system will
	2061	* panic without this vnode.
	2062	*/
	2063	#endif /* MAC */
	2064
	2065	if ((error = VNOP_OPEN(nvp, FREAD, &context))) {
	2066	panic("bdevvp failed: open");
	2067	return error;
	2068	}
	2069	*vpp = nvp;
	2070
	2071	return 0;
	2072	}
	2073
	2074	/*
	2075	* Check to see if the new vnode represents a special device
	2076	* for which we already have a vnode (either because of
	2077	* bdevvp() or because of a different vnode representing
	2078	* the same block device). If such an alias exists, deallocate
	2079	* the existing contents and return the aliased vnode. The
	2080	* caller is responsible for filling it with its new contents.
		*
		* Parameters: nvp       new special-device vnode being set up
		*             nvp_rdev  device number nvp is to represent
		*
		* Returns: NULLVP  nvp received a zeroed specinfo and was linked at
		*                  the head of its spec hash chain; if a vnode for
		*                  the same device and type with a tag other than
		*                  VT_NON exists, both are marked SI_ALIASED
		*          vp      an existing VT_NON vnode (flagged VBDEVVP or
		*                  VDEVFLUSH) for the same device and type; it is
		*                  returned with its vnode lock still held and the
		*                  reference from vnode_getwithvid() not yet dropped
	2081	*/
	2082	static vnode_t
	2083	checkalias(struct vnode *nvp, dev_t nvp_rdev)
	2084	{
	2085	struct vnode *vp;
	2086	struct vnode **vpp;
	2087	struct specinfo *sin = NULL; /* allocated lazily, reused if we loop back */
	2088	int vid = 0;
	2089
	2090	vpp = &speclisth[SPECHASH(nvp_rdev)]; /* hash chain for this device number */
	2091	loop:
	2092	SPECHASH_LOCK();
	2093
		/* look for an existing vnode with the same device number and type */
	2094	for (vp = *vpp; vp; vp = vp->v_specnext) {
	2095	if (nvp_rdev == vp->v_rdev && nvp->v_type == vp->v_type) {
	2096	vid = vp->v_id;
	2097	break;
	2098	}
	2099	}
	2100	SPECHASH_UNLOCK();
	2101
	2102	if (vp) {
	2103	found_alias:
		/*
		* v_id was sampled under the hash lock; if the vnode cannot be
		* obtained with that id any more, rescan the chain
		*/
	2104	if (vnode_getwithvid(vp, vid)) {
	2105	goto loop;
	2106	}
	2107	/*
	2108	* Termination state is checked in vnode_getwithvid
	2109	*/
	2110	vnode_lock(vp);
	2111
	2112	/*
	2113	* Alias, but not in use, so flush it out.
	2114	*/
		/* v_iocount == 1 means the only I/O reference is the one taken above */
	2115	if ((vp->v_iocount == 1) && (vp->v_usecount == 0)) {
	2116	vnode_reclaim_internal(vp, 1, 1, 0);
	2117	vnode_put_locked(vp);
	2118	vnode_unlock(vp);
	2119	goto loop;
	2120	}
	2121	}
		/*
		* No alias, or the alias carries a tag other than VT_NON:
		* nvp gets its own specinfo and is linked onto the hash chain.
		* When vp is non-NULL here its vnode lock is still held.
		*/
	2122	if (vp == NULL \|\| vp->v_tag != VT_NON) {
	2123	if (sin == NULL) {
	2124	sin = zalloc_flags(specinfo_zone, Z_WAITOK \| Z_ZERO);
	2125	} else {
		/* sin left over from a previous pass through this block */
	2126	bzero(sin, sizeof(struct specinfo));
	2127	}
	2128
	2129	nvp->v_specinfo = sin;
	2130	nvp->v_rdev = nvp_rdev;
	2131	nvp->v_specflags = 0;
	2132	nvp->v_speclastr = -1;
	2133	nvp->v_specinfo->si_opencount = 0;
	2134	nvp->v_specinfo->si_initted = 0;
	2135	nvp->v_specinfo->si_throttleable = 0;
	2136
	2137	SPECHASH_LOCK();
	2138
	2139	/* We dropped the lock, someone could have added */
	2140	if (vp == NULLVP) {
	2141	for (vp = *vpp; vp; vp = vp->v_specnext) {
	2142	if (nvp_rdev == vp->v_rdev && nvp->v_type == vp->v_type) {
	2143	vid = vp->v_id;
	2144	SPECHASH_UNLOCK();
		/* re-run the alias handling; sin stays allocated for reuse */
	2145	goto found_alias;
	2146	}
	2147	}
	2148	}
	2149
		/* insert nvp at the head of the chain */
	2150	nvp->v_hashchain = vpp;
	2151	nvp->v_specnext = *vpp;
	2152	*vpp = nvp;
	2153
	2154	if (vp != NULLVP) {
		/* two vnodes now name the same device: flag both, then release vp */
	2155	nvp->v_specflags \|= SI_ALIASED;
	2156	vp->v_specflags \|= SI_ALIASED;
	2157	SPECHASH_UNLOCK();
	2158	vnode_put_locked(vp);
	2159	vnode_unlock(vp);
	2160	} else {
	2161	SPECHASH_UNLOCK();
	2162	}
	2163
	2164	return NULLVP;
	2165	}
	2166
		/*
		* Only reached with an alias whose v_tag is VT_NON.  Discard any
		* specinfo allocated on an earlier pass.
		* NOTE(review): if we got here via the re-scan path above,
		* nvp->v_specinfo was already pointed at sin and is not reset in
		* this function -- confirm the caller does not use it.
		*/
	2167	if (sin) {
	2168	zfree(specinfo_zone, sin);
	2169	}
	2170
		/* vp is handed back still locked and still referenced */
	2171	if ((vp->v_flag & (VBDEVVP \| VDEVFLUSH)) != 0) {
	2172	return vp;
	2173	}
	2174
	2175	panic("checkalias with VT_NON vp that shouldn't: %p", vp);
	2176
	2177	return vp;
	2178	}
	2179
	2180
	2181	/*
	2182	* Get a reference on a particular vnode and lock it if requested.
	2183	* If the vnode was on the inactive list, remove it from the list.
	2184	* If the vnode was on the free list, remove it from the list and
	2185	* move it to inactive list as needed.
	2186	* The vnode lock bit is set if the vnode is being eliminated in
	2187	* vgone. The process is awakened when the transition is completed,
	2188	* and an error returned to indicate that the vnode is no longer
	2189	* usable (possibly having been changed to a new file system type).
	2190	*/
	2191	int
	2192	vget_internal(vnode_t vp, int vid, int vflags)
	2193	{
	2194	int error = 0;
	2195
	2196	vnode_lock_spin(vp);
	2197
	2198	if ((vflags & VNODE_WRITEABLE) && (vp->v_writecount == 0)) {
	2199	/*
	2200	* vnode to be returned only if it has writers opened
	2201	*/
	2202	error = EINVAL;
	2203	} else {
	2204	error = vnode_getiocount(vp, vid, vflags);
	2205	}
	2206
	2207	vnode_unlock(vp);
	2208
	2209	return error;
	2210	}
	2211
	2212	/*
		* Take a usecount reference on vp by calling vnode_ref_ext()
		* with no open-mode bits and no flags.
		*
	2213	* Returns: 0 Success
	2214	* ENOENT No such file or directory [terminating]
	2215	*/
	2216	int
	2217	vnode_ref(vnode_t vp)
	2218	{
	2219	return vnode_ref_ext(vp, 0, 0);
	2220	}
	2221
	2222	/*
		* Take a usecount reference on vp.  In addition, bump v_writecount
		* when fmode contains FWRITE and v_kusecount when fmode contains
		* O_EVTONLY.
		*
		* Unless flags contains VNODE_REF_FORCE, the request is refused
		* when the vnode is draining, terminating or dead and the calling
		* thread is not vp->v_owner.
		*
		* Panics if vp has neither an iocount nor a usecount on entry.
		*
	2223	* Returns: 0 Success
	2224	* ENOENT No such file or directory [terminating]
	2225	*/
	2226	int
	2227	vnode_ref_ext(vnode_t vp, int fmode, int flags)
	2228	{
	2229	int error = 0;
	2230
	2231	vnode_lock_spin(vp);
	2232
	2233	/*
	2234	* once all the current call sites have been fixed to insure they have
	2235	* taken an iocount, we can toughen this assert up and insist that the
	2236	* iocount is non-zero... a non-zero usecount doesn't insure correctness
	2237	*/
	2238	if (vp->v_iocount <= 0 && vp->v_usecount <= 0) {
	2239	panic("vnode_ref_ext: vp %p has no valid reference %d, %d", vp, vp->v_iocount, vp->v_usecount);
	2240	}
	2241
	2242	/*
	2243	* if you are the owner of drain/termination, can acquire usecount
	2244	*/
	2245	if ((flags & VNODE_REF_FORCE) == 0) {
	2246	if ((vp->v_lflag & (VL_DRAIN \| VL_TERMINATE \| VL_DEAD))) {
	2247	if (vp->v_owner != current_thread()) {
	2248	error = ENOENT;
	2249	goto out;
	2250	}
	2251	}
	2252	}
	2253
	2254	/* Enable atomic ops on v_usecount without the vnode lock */
	2255	os_atomic_inc(&vp->v_usecount, relaxed);
	2256
		/* the "<= 0" tests below catch a counter that wrapped on increment */
	2257	if (fmode & FWRITE) {
	2258	if (++vp->v_writecount <= 0) {
	2259	panic("vnode_ref_ext: v_writecount");
	2260	}
	2261	}
	2262	if (fmode & O_EVTONLY) {
	2263	if (++vp->v_kusecount <= 0) {
	2264	panic("vnode_ref_ext: v_kusecount");
	2265	}
	2266	}
	2267	if (vp->v_flag & VRAGE) {
	2268	struct uthread *ut;
	2269
	2270	ut = get_bsdthread_info(current_thread());
	2271
		/* neither the process nor the thread is flagged for rapid aging */
	2272	if (!(current_proc()->p_lflag & P_LRAGE_VNODES) &&
	2273	!(ut->uu_flag & UT_RAGE_VNODES)) {
	2274	/*
	2275	* a 'normal' process accessed this vnode
	2276	* so make sure its no longer marked
	2277	* for rapid aging... also, make sure
	2278	* it gets removed from the rage list...
	2279	* when v_usecount drops back to 0, it
	2280	* will be put back on the real free list
	2281	*/
	2282	vp->v_flag &= ~VRAGE;
	2283	vp->v_references = 0;
	2284	vnode_list_remove(vp);
	2285	}
	2286	}
		/*
		* first usecount on a regular, non-system file that has ubc info:
		* tell the VM layer its memory object is in use
		*/
	2287	if (vp->v_usecount == 1 && vp->v_type == VREG && !(vp->v_flag & VSYSTEM)) {
	2288	if (vp->v_ubcinfo) {
	2289	vnode_lock_convert(vp);
	2290	memory_object_mark_used(vp->v_ubcinfo->ui_control);
	2291	}
	2292	}
	2293	out:
	2294	vnode_unlock(vp);
	2295
	2296	return error;
	2297	}
	2298
	2299
	2300	boolean_t
	2301	vnode_on_reliable_media(vnode_t vp)
	2302	{
	2303	mount_t mp = vp->v_mount;
	2304
	2305	/*
	2306	* A NULL mountpoint would imply it's not attached to a any filesystem.
	2307	* This can only happen with a vnode created by bdevvp(). We'll consider
	2308	* those as not unreliable as the primary use of this function is determine
	2309	* which vnodes are to be handed off to the async cleaner thread for
	2310	* reclaim.
	2311	*/
	2312	if (!mp \|\| (!(mp->mnt_kern_flag & MNTK_VIRTUALDEV) && (mp->mnt_flag & MNT_LOCAL))) {
	2313	return TRUE;
	2314	}
	2315
	2316	return FALSE;
	2317	}
	2318
		/*
		* Put vp at the head of vnode_async_work_list, flag it
		* VLIST_ASYNC_WORK and bump async_work_vnodes.
		*
		* Panics if vp is already on a list or is terminating or dead.
		* The only caller visible here, vnode_async_list_add(), holds the
		* vnode list lock across this call.
		*/
	2319	static void
	2320	vnode_async_list_add_locked(vnode_t vp)
	2321	{
	2322	if (VONLIST(vp) \|\| (vp->v_lflag & (VL_TERMINATE \| VL_DEAD))) {
	2323	panic("vnode_async_list_add: %p is in wrong state", vp);
	2324	}
	2325
	2326	TAILQ_INSERT_HEAD(&vnode_async_work_list, vp, v_freelist);
	2327	vp->v_listflag \|= VLIST_ASYNC_WORK;
	2328
	2329	async_work_vnodes++;
	2330	}
	2331
		/*
		* Locking wrapper: add vp to the async work list under the vnode
		* list lock, then wake any thread sleeping on
		* &vnode_async_work_list.
		*/
	2332	static void
	2333	vnode_async_list_add(vnode_t vp)
	2334	{
	2335	vnode_list_lock();
	2336
	2337	vnode_async_list_add_locked(vp);
	2338
	2339	vnode_list_unlock();
	2340
		/* the wakeup is issued after the list lock has been dropped */
	2341	wakeup(&vnode_async_work_list);
	2342	}
	2343
	2344
	2345	/*
	2346	* put the vnode on appropriate free list.
	2347	* called with vnode LOCKED
		*
		* Nothing is done if vp is already on a list, still has a usecount
		* or iocount, or has VL_TERMINATE set.  Otherwise:
		*   - VRAGE and not VL_DEAD: vnode_rage_list (head if VAGE, else tail)
		*   - VL_DEAD:               head of vnode_dead_list
		*   - VAGE:                  head of vnode_free_list (VAGE is cleared)
		*   - anything else:         tail of vnode_free_list
		*
		* The vnode lock may be dropped and retaken while leftover buffers
		* of a dead vnode are invalidated.
	2348	*/
	2349	static void
	2350	vnode_list_add(vnode_t vp)
	2351	{
	2352	boolean_t need_dead_wakeup = FALSE;
	2353
	2354	#if DIAGNOSTIC
	2355	lck_mtx_assert(&vp->v_lock, LCK_MTX_ASSERT_OWNED);
	2356	#endif
	2357
	2358	again:
	2359
	2360	/*
	2361	* if it is already on a list or non zero references return
	2362	*/
	2363	if (VONLIST(vp) \|\| (vp->v_usecount != 0) \|\| (vp->v_iocount != 0) \|\| (vp->v_lflag & VL_TERMINATE)) {
	2364	return;
	2365	}
	2366
	2367	/*
	2368	* In vclean, we might have deferred ditching locked buffers
	2369	* because something was still referencing them (indicated by
	2370	* usecount). We can ditch them now.
	2371	*/
	2372	if (ISSET(vp->v_lflag, VL_DEAD)
	2373	&& (!LIST_EMPTY(&vp->v_cleanblkhd) \|\| !LIST_EMPTY(&vp->v_dirtyblkhd))) {
	2374	++vp->v_iocount; // Probably not necessary, but harmless
	2375	#ifdef JOE_DEBUG
	2376	record_vp(vp, 1);
	2377	#endif
		/* buf_invalidateblks() is called with the vnode lock dropped */
	2378	vnode_unlock(vp);
	2379	buf_invalidateblks(vp, BUF_INVALIDATE_LOCKED, 0, 0);
	2380	vnode_lock(vp);
	2381	vnode_dropiocount(vp);
		/* state may have changed while unlocked: re-run every check */
	2382	goto again;
	2383	}
	2384
	2385	vnode_list_lock();
	2386
	2387	if ((vp->v_flag & VRAGE) && !(vp->v_lflag & VL_DEAD)) {
	2388	/*
	2389	* add the new guy to the appropriate end of the RAGE list
	2390	*/
	2391	if ((vp->v_flag & VAGE)) {
	2392	TAILQ_INSERT_HEAD(&vnode_rage_list, vp, v_freelist);
	2393	} else {
	2394	TAILQ_INSERT_TAIL(&vnode_rage_list, vp, v_freelist);
	2395	}
	2396
	2397	vp->v_listflag \|= VLIST_RAGE;
	2398	ragevnodes++;
	2399
	2400	/*
	2401	* reset the timestamp for the last inserted vp on the RAGE
	2402	* queue to let new_vnode know that its not ok to start stealing
	2403	* from this list... as long as we're actively adding to this list
	2404	* we'll push out the vnodes we want to donate to the real free list
	2405	* once we stop pushing, we'll let some time elapse before we start
	2406	* stealing them in the new_vnode routine
	2407	*/
	2408	microuptime(&rage_tv);
	2409	} else {
	2410	/*
	2411	* if VL_DEAD, insert it at head of the dead list
	2412	* else insert at tail of LRU list or at head if VAGE is set
	2413	*/
	2414	if ((vp->v_lflag & VL_DEAD)) {
	2415	TAILQ_INSERT_HEAD(&vnode_dead_list, vp, v_freelist);
	2416	vp->v_listflag \|= VLIST_DEAD;
	2417	deadvnodes++;
	2418
		/* a waiter is recorded: consume one request, wake it after unlocking */
	2419	if (dead_vnode_wanted) {
	2420	dead_vnode_wanted--;
	2421	need_dead_wakeup = TRUE;
	2422	}
	2423	} else if ((vp->v_flag & VAGE)) {
	2424	TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
	2425	vp->v_flag &= ~VAGE;
	2426	freevnodes++;
	2427	} else {
	2428	TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
	2429	freevnodes++;
	2430	}
	2431	}
	2432	vnode_list_unlock();
	2433
	2434	if (need_dead_wakeup == TRUE) {
	2435	wakeup_one((caddr_t)&dead_vnode_wanted);
	2436	}
	2437	}
	2438
	2439
	2440	/*
	2441	* remove the vnode from appropriate free list.
	2442	* called with vnode LOCKED and
	2443	* the list lock held
	2444	*/
	2445	static void
	2446	vnode_list_remove_locked(vnode_t vp)
	2447	{
	2448	if (VONLIST(vp)) {
	2449	/*
	2450	* the v_listflag field is
	2451	* protected by the vnode_list_lock
	2452	*/
	2453	if (vp->v_listflag & VLIST_RAGE) {
	2454	VREMRAGE("vnode_list_remove", vp);
	2455	} else if (vp->v_listflag & VLIST_DEAD) {
	2456	VREMDEAD("vnode_list_remove", vp);
	2457	} else if (vp->v_listflag & VLIST_ASYNC_WORK) {
	2458	VREMASYNC_WORK("vnode_list_remove", vp);
	2459	} else {
	2460	VREMFREE("vnode_list_remove", vp);
	2461	}
	2462	}
	2463	}
	2464
	2465
	2466	/*
	2467	* remove the vnode from appropriate free list.
	2468	* called with vnode LOCKED
		*
		* The vnode list lock is only taken when an unlocked VONLIST()
		* check says vp is on a list; the check is repeated under the
		* list lock by vnode_list_remove_locked().
	2469	*/
	2470	static void
	2471	vnode_list_remove(vnode_t vp)
	2472	{
	2473	#if DIAGNOSTIC
	2474	lck_mtx_assert(&vp->v_lock, LCK_MTX_ASSERT_OWNED);
	2475	#endif
	2476	/*
	2477	* we want to avoid taking the list lock
	2478	* in the case where we're not on the free
	2479	* list... this will be true for most
	2480	* directories and any currently in use files
	2481	*
	2482	* we're guaranteed that we can't go from
	2483	* the not-on-list state to the on-list
	2484	* state since we hold the vnode lock...
	2485	* all calls to vnode_list_add are done
	2486	* under the vnode lock... so we can
	2487	* check for that condition (the prevalent one)
	2488	* without taking the list lock
	2489	*/
	2490	if (VONLIST(vp)) {
	2491	vnode_list_lock();
	2492	/*
	2493	* however, we're not guaranteed that
	2494	* we won't go from the on-list state
	2495	* to the not-on-list state until we
	2496	* hold the vnode_list_lock... this
	2497	* is due to "new_vnode" removing vnodes
	2498	* from the free list under the list_lock
	2499	* w/o the vnode lock... so we need to
	2500	* check again whether we're currently
	2501	* on the free list
	2502	*/
	2503	vnode_list_remove_locked(vp);
	2504
	2505	vnode_list_unlock();
	2506	}
	2507	}
	2508
	2509
		/*
		* Drop a usecount reference on vp: calls vnode_rele_internal()
		* with fmode 0, dont_reenter 0 and locked 0 (the vnode lock is
		* taken and released inside).
		*/
	2510	void
	2511	vnode_rele(vnode_t vp)
	2512	{
	2513	vnode_rele_internal(vp, 0, 0, 0);
	2514	}
	2515
	2516
		/*
		* Drop a usecount reference on vp, passing the caller's fmode and
		* dont_reenter through to vnode_rele_internal() with locked 0
		* (the vnode lock is taken and released inside).
		*/
	2517	void
	2518	vnode_rele_ext(vnode_t vp, int fmode, int dont_reenter)
	2519	{
	2520	vnode_rele_internal(vp, fmode, dont_reenter, 0);
	2521	}
	2522
	2523
		/*
		* Drop one usecount on vp, plus a writecount if fmode has FWRITE
		* and a kusecount if fmode has O_EVTONLY.  Panics if any of those
		* counters would go negative or if kusecount ends up above
		* usecount.
		*
		* While other references remain, the vnode is at most marked
		* VL_NEEDINACTIVE.  Once usecount and iocount are both zero:
		*   - terminating/dead vnodes, and dont_reenter callers, are put
		*     on a list without calling into the filesystem;
		*   - otherwise VNOP_INACTIVE is called with the vnode lock
		*     dropped, after which the vnode is reclaimed if VL_MARKTERM
		*     is set (or queued on the thread's deferred-reclaim list)
		*     and finally handed to vnode_list_add().
		*
		* Parameters: fmode         FWRITE and/or O_EVTONLY, or 0
		*             dont_reenter  non-zero: do not call VNOP_INACTIVE
		*             locked        non-zero: caller holds the vnode lock;
		*                           it is held again on return, but may be
		*                           dropped and retaken in between
		*/
	2524	void
	2525	vnode_rele_internal(vnode_t vp, int fmode, int dont_reenter, int locked)
	2526	{
	2527	int32_t old_usecount;
	2528
	2529	if (!locked) {
	2530	vnode_lock_spin(vp);
	2531	}
	2532	#if DIAGNOSTIC
	2533	else {
	2534	lck_mtx_assert(&vp->v_lock, LCK_MTX_ASSERT_OWNED);
	2535	}
	2536	#endif
	2537	/* Enable atomic ops on v_usecount without the vnode lock */
	2538	old_usecount = os_atomic_dec_orig(&vp->v_usecount, relaxed);
	2539	if (old_usecount < 1) {
	2540	/*
	2541	* Because we allow atomic ops on usecount (in lookup only, under
	2542	* specific conditions of already having a usecount) it is
	2543	* possible that when the vnode is examined, its usecount is
	2544	* different than what will be printed in this panic message.
	2545	*/
	2546	panic("vnode_rele_ext: vp %p usecount -ve : %d. v_tag = %d, v_type = %d, v_flag = %x.",
	2547	vp, old_usecount - 1, vp->v_tag, vp->v_type, vp->v_flag);
	2548	}
	2549
	2550	if (fmode & FWRITE) {
	2551	if (--vp->v_writecount < 0) {
	2552	panic("vnode_rele_ext: vp %p writecount -ve : %d. v_tag = %d, v_type = %d, v_flag = %x.", vp, vp->v_writecount, vp->v_tag, vp->v_type, vp->v_flag);
	2553	}
	2554	}
	2555	if (fmode & O_EVTONLY) {
	2556	if (--vp->v_kusecount < 0) {
	2557	panic("vnode_rele_ext: vp %p kusecount -ve : %d. v_tag = %d, v_type = %d, v_flag = %x.", vp, vp->v_kusecount, vp->v_tag, vp->v_type, vp->v_flag);
	2558	}
	2559	}
	2560	if (vp->v_kusecount > vp->v_usecount) {
	2561	panic("vnode_rele_ext: vp %p kusecount(%d) out of balance with usecount(%d). v_tag = %d, v_type = %d, v_flag = %x.", vp, vp->v_kusecount, vp->v_usecount, vp->v_tag, vp->v_type, vp->v_flag);
	2562	}
	2563
	2564	if ((vp->v_iocount > 0) \|\| (vp->v_usecount > 0)) {
	2565	/*
	2566	* vnode is still busy... if we're the last
	2567	* usecount, mark for a future call to VNOP_INACTIVE
	2568	* when the iocount finally drops to 0
	2569	*/
	2570	if (vp->v_usecount == 0) {
	2571	vp->v_lflag \|= VL_NEEDINACTIVE;
	2572	vp->v_flag &= ~(VNOCACHE_DATA \| VRAOFF \| VOPENEVT);
	2573	}
	2574	goto done;
	2575	}
		/* no references of either kind remain from here on */
	2576	vp->v_flag &= ~(VNOCACHE_DATA \| VRAOFF \| VOPENEVT);
	2577
	2578	if (ISSET(vp->v_lflag, VL_TERMINATE \| VL_DEAD) \|\| dont_reenter) {
	2579	/*
	2580	* vnode is being cleaned, or
	2581	* we've requested that we don't reenter
	2582	* the filesystem on this release...in
	2583	* the latter case, we'll mark the vnode aged
	2584	*/
	2585	if (dont_reenter) {
	2586	if (!(vp->v_lflag & (VL_TERMINATE \| VL_DEAD \| VL_MARKTERM))) {
	2587	vp->v_lflag \|= VL_NEEDINACTIVE;
	2588
		/* unreliable media or dirty vnode: queue on the async work list instead */
	2589	if (vnode_on_reliable_media(vp) == FALSE \|\| vp->v_flag & VISDIRTY) {
	2590	vnode_async_list_add(vp);
	2591	goto done;
	2592	}
	2593	}
	2594	vp->v_flag \|= VAGE;
	2595	}
	2596	vnode_list_add(vp);
	2597
	2598	goto done;
	2599	}
	2600	/*
	2601	* at this point both the iocount and usecount
	2602	* are zero
	2603	* pick up an iocount so that we can call
	2604	* VNOP_INACTIVE with the vnode lock unheld
	2605	*/
	2606	vp->v_iocount++;
	2607	#ifdef JOE_DEBUG
	2608	record_vp(vp, 1);
	2609	#endif
	2610	vp->v_lflag &= ~VL_NEEDINACTIVE;
	2611	vnode_unlock(vp);
	2612
	2613	VNOP_INACTIVE(vp, vfs_context_current());
	2614
	2615	vnode_lock_spin(vp);
	2616	/*
	2617	* because we dropped the vnode lock to call VNOP_INACTIVE
	2618	* the state of the vnode may have changed... we may have
	2619	* picked up an iocount, usecount or the MARKTERM may have
	2620	* been set... we need to reevaluate the reference counts
	2621	* to determine if we can call vnode_reclaim_internal at
	2622	* this point... if the reference counts are up, we'll pick
	2623	* up the MARKTERM state when they get subsequently dropped
	2624	*/
		/* v_iocount == 1 is the iocount this function took above */
	2625	if ((vp->v_iocount == 1) && (vp->v_usecount == 0) &&
	2626	((vp->v_lflag & (VL_MARKTERM \| VL_TERMINATE \| VL_DEAD)) == VL_MARKTERM)) {
	2627	struct uthread *ut;
	2628
	2629	ut = get_bsdthread_info(current_thread());
	2630
		/*
		* thread asked for deferred reclaims: chain vp onto its list and
		* leave; the iocount taken above is not dropped on this path
		*/
	2631	if (ut->uu_defer_reclaims) {
	2632	vp->v_defer_reclaimlist = ut->uu_vreclaims;
	2633	ut->uu_vreclaims = vp;
	2634	goto done;
	2635	}
	2636	vnode_lock_convert(vp);
	2637	vnode_reclaim_internal(vp, 1, 1, 0);
	2638	}
	2639	vnode_dropiocount(vp);
	2640	vnode_list_add(vp);
	2641	done:
		/*
		* last usecount on a regular, non-system file that has ubc info:
		* tell the VM layer its memory object is unused
		*/
	2642	if (vp->v_usecount == 0 && vp->v_type == VREG && !(vp->v_flag & VSYSTEM)) {
	2643	if (vp->v_ubcinfo) {
	2644	vnode_lock_convert(vp);
	2645	memory_object_mark_unused(vp->v_ubcinfo->ui_control, (vp->v_flag & VRAGE) == VRAGE);
	2646	}
	2647	}
	2648	if (!locked) {
	2649	vnode_unlock(vp);
	2650	}
	2651	return;
	2652	}
	2653
	2654	/*
	2655	* Remove any vnodes in the vnode table belonging to mount point mp.
	2656	*
	2657	* If MNT_NOFORCE is specified, there should not be any active ones,
	2658	* return error if any are found (nb: this is a user error, not a
	2659	* system error). If MNT_FORCE is specified, detach any active vnodes
	2660	* that are found.
		*
		* Parameters: mp      mount point whose vnodes are to be flushed
		*             skipvp  vnode to leave untouched
		*             flags   FORCECLOSE, SKIPSYSTEM, SKIPSWAP, SKIPROOT
		*                     and WRITECLOSE, as tested below
		*
		* Returns: 0      no busy vnode was left behind, or FORCECLOSE
		*                 was given
		*          EBUSY  busy vnodes remain and FORCECLOSE was not set
		*                 (also returned when the unmount preflight check
		*                 reports the mount busy)
		*
		* The mount iterate lock is held for the whole call.  The mount
		* lock is dropped while an individual vnode is examined under its
		* own vnode lock.
	2661	*/
	2662
	2663	int
	2664	vflush(struct mount *mp, struct vnode *skipvp, int flags)
	2665	{
	2666	struct vnode *vp;
	2667	int busy = 0; /* vnodes found in use during the current pass */
	2668	int reclaimed = 0; /* vnodes reclaimed during the current pass */
	2669	int retval;
	2670	unsigned int vid;
	2671	bool first_try = true; /* busy vnodes are logged on the first pass only */
	2672
	2673	/*
	2674	* See comments in vnode_iterate() for the rationale for this lock
	2675	*/
	2676	mount_iterate_lock(mp);
	2677
	2678	mount_lock(mp);
	2679	vnode_iterate_setup(mp);
	2680	/*
	2681	* On regular unmounts(not forced) do a
	2682	* quick check for vnodes to be in use. This
	2683	* preserves the caching of vnodes. automounter
	2684	* tries unmounting every so often to see whether
	2685	* it is still busy or not.
	2686	*/
	2687	if (((flags & FORCECLOSE) == 0) && ((mp->mnt_kern_flag & MNTK_UNMOUNT_PREFLIGHT) != 0)) {
	2688	if (vnode_umount_preflight(mp, skipvp, flags)) {
	2689	vnode_iterate_clear(mp);
	2690	mount_unlock(mp);
	2691	mount_iterate_unlock(mp);
	2692	return EBUSY;
	2693	}
	2694	}
	2695	loop:
	2696	/* If it returns 0 then there is nothing to do */
	2697	retval = vnode_iterate_prepare(mp);
	2698
	2699	if (retval == 0) {
	2700	vnode_iterate_clear(mp);
	2701	mount_unlock(mp);
	2702	mount_iterate_unlock(mp);
	2703	return retval;
	2704	}
	2705
	2706	/* iterate over all the vnodes */
	2707	while (!TAILQ_EMPTY(&mp->mnt_workerqueue)) {
		/* move the vnode from the worker queue back to the mount's vnode list */
	2708	vp = TAILQ_FIRST(&mp->mnt_workerqueue);
	2709	TAILQ_REMOVE(&mp->mnt_workerqueue, vp, v_mntvnodes);
	2710	TAILQ_INSERT_TAIL(&mp->mnt_vnodelist, vp, v_mntvnodes);
	2711
	2712	if ((vp->v_mount != mp) \|\| (vp == skipvp)) {
	2713	continue;
	2714	}
		/* remember the vnode's id so a change can be detected once it is locked */
	2715	vid = vp->v_id;
	2716	mount_unlock(mp);
	2717
	2718	vnode_lock_spin(vp);
	2719
	2720	// If vnode is already terminating, wait for it...
	2721	while (vp->v_id == vid && ISSET(vp->v_lflag, VL_TERMINATE)) {
	2722	vp->v_lflag \|= VL_TERMWANT;
	2723	msleep(&vp->v_lflag, &vp->v_lock, PVFS, "vflush", NULL);
	2724	}
	2725
		/* id changed or vnode already dead: nothing to flush */
	2726	if ((vp->v_id != vid) \|\| ISSET(vp->v_lflag, VL_DEAD)) {
	2727	vnode_unlock(vp);
	2728	mount_lock(mp);
	2729	continue;
	2730	}
	2731
	2732	/*
	2733	* If requested, skip over vnodes marked VSYSTEM.
	2734	* Skip over all vnodes marked VNOFLUSH.
	2735	*/
	2736	if ((flags & SKIPSYSTEM) && ((vp->v_flag & VSYSTEM) \|\|
	2737	(vp->v_flag & VNOFLUSH))) {
	2738	vnode_unlock(vp);
	2739	mount_lock(mp);
	2740	continue;
	2741	}
	2742	/*
	2743	* If requested, skip over vnodes marked VSWAP.
	2744	*/
	2745	if ((flags & SKIPSWAP) && (vp->v_flag & VSWAP)) {
	2746	vnode_unlock(vp);
	2747	mount_lock(mp);
	2748	continue;
	2749	}
	2750	/*
	2751	* If requested, skip over vnodes marked VROOT.
	2752	*/
	2753	if ((flags & SKIPROOT) && (vp->v_flag & VROOT)) {
	2754	vnode_unlock(vp);
	2755	mount_lock(mp);
	2756	continue;
	2757	}
	2758	/*
	2759	* If WRITECLOSE is set, only flush out regular file
	2760	* vnodes open for writing.
	2761	*/
	2762	if ((flags & WRITECLOSE) &&
	2763	(vp->v_writecount == 0 \|\| vp->v_type != VREG)) {
	2764	vnode_unlock(vp);
	2765	mount_lock(mp);
	2766	continue;
	2767	}
	2768	/*
	2769	* If the real usecount is 0, all we need to do is clear
	2770	* out the vnode data structures and we are done.
	2771	*/
		/* "real" usecount excludes the O_EVTONLY references counted in v_kusecount */
	2772	if (((vp->v_usecount == 0) \|\|
	2773	((vp->v_usecount - vp->v_kusecount) == 0))) {
	2774	vnode_lock_convert(vp);
	2775	vp->v_iocount++; /* so that drain waits for * other iocounts */
	2776	#ifdef JOE_DEBUG
	2777	record_vp(vp, 1);
	2778	#endif
	2779	vnode_reclaim_internal(vp, 1, 1, 0);
	2780	vnode_dropiocount(vp);
	2781	vnode_list_add(vp);
	2782	vnode_unlock(vp);
	2783
	2784	reclaimed++;
	2785	mount_lock(mp);
	2786	continue;
	2787	}
	2788	/*
	2789	* If FORCECLOSE is set, forcibly close the vnode.
	2790	* For block or character devices, revert to an
	2791	* anonymous device. For all other files, just kill them.
	2792	*/
	2793	if (flags & FORCECLOSE) {
	2794	vnode_lock_convert(vp);
	2795
	2796	if (vp->v_type != VBLK && vp->v_type != VCHR) {
	2797	vp->v_iocount++; /* so that drain waits * for other iocounts */
	2798	#ifdef JOE_DEBUG
	2799	record_vp(vp, 1);
	2800	#endif
	2801	vnode_abort_advlocks(vp);
	2802	vnode_reclaim_internal(vp, 1, 1, 0);
	2803	vnode_dropiocount(vp);
	2804	vnode_list_add(vp);
	2805	vnode_unlock(vp);
	2806	} else {
		/*
		* device vnode: clean it, then clear VL_DEAD, point it at the
		* spec vnode ops and flag it VDEVFLUSH
		*/
	2807	vclean(vp, 0);
	2808	vp->v_lflag &= ~VL_DEAD;
	2809	vp->v_op = spec_vnodeop_p;
	2810	vp->v_flag \|= VDEVFLUSH;
	2811	vnode_unlock(vp);
	2812	}
	2813	mount_lock(mp);
	2814	continue;
	2815	}
	2816
	2817	/* log vnodes blocking unforced unmounts */
	2818	if (print_busy_vnodes && first_try && ((flags & FORCECLOSE) == 0)) {
	2819	vprint("vflush - busy vnode", vp);
	2820	}
	2821
	2822	vnode_unlock(vp);
	2823	mount_lock(mp);
	2824	busy++;
	2825	}
	2826
	2827	/* At this point the worker queue is completed */
		/*
		* unforced flush that reclaimed something but also hit busy vnodes:
		* reset both counters and run another pass
		*/
	2828	if (busy && ((flags & FORCECLOSE) == 0) && reclaimed) {
	2829	busy = 0;
	2830	reclaimed = 0;
	2831	(void)vnode_iterate_reloadq(mp);
	2832	first_try = false;
	2833	/* returned with mount lock held */
	2834	goto loop;
	2835	}
	2836
	2837	/* if new vnodes were created in between retry the reclaim */
	2838	if (vnode_iterate_reloadq(mp) != 0) {
	2839	if (!(busy && ((flags & FORCECLOSE) == 0))) {
	2840	first_try = false;
	2841	goto loop;
	2842	}
	2843	}
	2844	vnode_iterate_clear(mp);
	2845	mount_unlock(mp);
	2846	mount_iterate_unlock(mp);
	2847
	2848	if (busy && ((flags & FORCECLOSE) == 0)) {
	2849	return EBUSY;
	2850	}
	2851	return 0;
	2852	}
	2853
	2854	long num_recycledvnodes = 0; /* incremented once per vclean() call */
	2855	/*
	2856	* Disassociate the underlying file system from a vnode.
	2857	* The vnode lock is held on entry.
		*
		* The lock is dropped while the filesystem is called and is held
		* again on return.  On return the vnode has VL_DEAD set, v_tag
		* VT_NON, no private data, and the dead mount and dead vnode ops
		* installed.
		*
		* flags: DOCLOSE   close the vnode if it was in use, and flush and
		*                  invalidate its buffers and pages
		*        REVOKEALL pass IO_REVOKE to VNOP_CLOSE
	2858	*/
	2859	static void
	2860	vclean(vnode_t vp, int flags)
	2861	{
	2862	vfs_context_t ctx = vfs_context_current();
	2863	int active;
	2864	int need_inactive;
	2865	int already_terminating;
	2866	int clflags = 0;
	2867	#if NAMEDSTREAMS
	2868	int is_namedstream;
	2869	#endif
	2870
	2871	/*
	2872	* Check to see if the vnode is in use.
	2873	* If so we have to reference it before we clean it out
	2874	* so that its count cannot fall to zero and generate a
	2875	* race against ourselves to recycle it.
	2876	*/
	2877	active = vp->v_usecount;
	2878
	2879	/*
	2880	* just in case we missed sending a needed
	2881	* VNOP_INACTIVE, we'll do it now
	2882	*/
	2883	need_inactive = (vp->v_lflag & VL_NEEDINACTIVE);
	2884
	2885	vp->v_lflag &= ~VL_NEEDINACTIVE;
	2886
	2887	/*
	2888	* Prevent the vnode from being recycled or
	2889	* brought into use while we clean it out.
	2890	*/
		/* remember whether someone else set VL_TERMINATE; if so it is left set at the end */
	2891	already_terminating = (vp->v_lflag & VL_TERMINATE);
	2892
	2893	vp->v_lflag \|= VL_TERMINATE;
	2894
	2895	#if NAMEDSTREAMS
	2896	is_namedstream = vnode_isnamedstream(vp);
	2897	#endif
	2898
		/* everything up to the vnode_lock() near the end runs without the vnode lock */
	2899	vnode_unlock(vp);
	2900
	2901	OSAddAtomicLong(1, &num_recycledvnodes);
	2902
	2903	if (flags & DOCLOSE) {
	2904	clflags \|= IO_NDELAY;
	2905	}
	2906	if (flags & REVOKEALL) {
	2907	clflags \|= IO_REVOKE;
	2908	}
	2909
	2910	#if CONFIG_MACF
	2911	if (vp->v_mount) {
	2912	/*
	2913	* It is possible for bdevvp vnodes to not have a mount
	2914	* pointer. It's fine to let it get reclaimed without
	2915	* notifying.
	2916	*/
	2917	mac_vnode_notify_reclaim(vp);
	2918	}
	2919	#endif
	2920
	2921	if (active && (flags & DOCLOSE)) {
	2922	VNOP_CLOSE(vp, clflags, ctx);
	2923	}
	2924
	2925	/*
	2926	* Clean out any buffers associated with the vnode.
	2927	*/
	2928	if (flags & DOCLOSE) {
	2929	#if CONFIG_NFS_CLIENT
	2930	if (vp->v_tag == VT_NFS) {
	2931	nfs_vinvalbuf(vp, V_SAVE, ctx, 0);
	2932	} else
	2933	#endif /* CONFIG_NFS_CLIENT */
	2934	{
	2935	VNOP_FSYNC(vp, MNT_WAIT, ctx);
	2936
	2937	/*
	2938	* If the vnode is still in use (by the journal for
	2939	* example) we don't want to invalidate locked buffers
	2940	* here. In that case, either the journal will tidy them
	2941	* up, or we will deal with it when the usecount is
	2942	* finally released in vnode_rele_internal.
	2943	*/
	2944	buf_invalidateblks(vp, BUF_WRITE_DATA \| (active ? 0 : BUF_INVALIDATE_LOCKED), 0, 0);
	2945	}
	2946	if (UBCINFOEXISTS(vp)) {
	2947	/*
	2948	* Clean the pages in VM.
	2949	*/
	2950	(void)ubc_msync(vp, (off_t)0, ubc_getsize(vp), NULL, UBC_PUSHALL \| UBC_INVALIDATE \| UBC_SYNC);
	2951	}
	2952	}
	2953	if (active \|\| need_inactive) {
	2954	VNOP_INACTIVE(vp, ctx);
	2955	}
	2956
	2957	#if NAMEDSTREAMS
	2958	if ((is_namedstream != 0) && (vp->v_parent != NULLVP)) {
	2959	vnode_t pvp = vp->v_parent;
	2960
	2961	/* Delete the shadow stream file before we reclaim its vnode */
	2962	if (vnode_isshadow(vp)) {
	2963	vnode_relenamedstream(pvp, vp);
	2964	}
	2965
	2966	/*
	2967	* No more streams associated with the parent. We
	2968	* have a ref on it, so its identity is stable.
	2969	* If the parent is on an opaque volume, then we need to know
	2970	* whether it has associated named streams.
	2971	*/
	2972	if (vfs_authopaque(pvp->v_mount)) {
	2973	vnode_lock_spin(pvp);
	2974	pvp->v_lflag &= ~VL_HASSTREAMS;
	2975	vnode_unlock(pvp);
	2976	}
	2977	}
	2978	#endif
	2979
	2980	/*
	2981	* Destroy ubc named reference
	2982	* cluster_release is done on this path
	2983	* along with dropping the reference on the ucred
	2984	* (and in the case of forced unmount of an mmap-ed file,
	2985	* the ubc reference on the vnode is dropped here too).
	2986	*/
	2987	ubc_destroy_named(vp);
	2988
	2989	#if CONFIG_TRIGGERS
	2990	/*
	2991	* cleanup trigger info from vnode (if any)
	2992	*/
	2993	if (vp->v_resolve) {
	2994	vnode_resolver_detach(vp);
	2995	}
	2996	#endif
	2997
	2998	#if CONFIG_IO_COMPRESSION_STATS
	2999	if ((vp->io_compression_stats)) {
	3000	vnode_iocs_record_and_free(vp);
	3001	}
	3002	#endif /* CONFIG_IO_COMPRESSION_STATS */
	3003
	3004	/*
	3005	* Reclaim the vnode.
	3006	*/
	3007	if (VNOP_RECLAIM(vp, ctx)) {
	3008	panic("vclean: cannot reclaim");
	3009	}
	3010
	3011	// make sure the name & parent ptrs get cleaned out!
	3012	vnode_update_identity(vp, NULLVP, NULL, 0, 0, VNODE_UPDATE_PARENT \| VNODE_UPDATE_NAME \| VNODE_UPDATE_PURGE \| VNODE_UPDATE_PURGEFIRMLINK);
	3013
	3014	vnode_lock(vp);
	3015
	3016	/*
	3017	* Remove the vnode from any mount list it might be on. It is not
	3018	* safe to do this any earlier because unmount needs to wait for
	3019	* any vnodes to terminate and it cannot do that if it cannot find
	3020	* them.
	3021	*/
	3022	insmntque(vp, (struct mount *)0);
	3023
		/* from here on the vnode is "dead": dead mount, dead ops, no tag, no data */
	3024	vp->v_mount = dead_mountp;
	3025	vp->v_op = dead_vnodeop_p;
	3026	vp->v_tag = VT_NON;
	3027	vp->v_data = NULL;
	3028
	3029	vp->v_lflag \|= VL_DEAD;
	3030	vp->v_flag &= ~VISDIRTY;
	3031
		/* only the caller that set VL_TERMINATE here clears it and wakes waiters */
	3032	if (already_terminating == 0) {
	3033	vp->v_lflag &= ~VL_TERMINATE;
	3034	/*
	3035	* Done with purge, notify sleepers of the grim news.
	3036	*/
	3037	if (vp->v_lflag & VL_TERMWANT) {
	3038	vp->v_lflag &= ~VL_TERMWANT;
	3039	wakeup(&vp->v_lflag);
	3040	}
	3041	}
	3042	}
	3043
	3044	/*
	3045	* Eliminate all activity associated with the requested vnode
	3046	* and with all vnodes aliased to the requested vnode.
		*
		* flags is only examined on DIAGNOSTIC kernels, where a call
		* without REVOKEALL panics.  a_context is unused.
		*
		* Returns: 0       vp (and any aliases found) were reclaimed
		*          ENOENT  vp is already being terminated
	3047	*/
	3048	int
	3049	#if DIAGNOSTIC
	3050	vn_revoke(vnode_t vp, int flags, __unused vfs_context_t a_context)
	3051	#else
	3052	vn_revoke(vnode_t vp, __unused int flags, __unused vfs_context_t a_context)
	3053	#endif
	3054	{
	3055	struct vnode *vq;
	3056	int vid;
	3057
	3058	#if DIAGNOSTIC
	3059	if ((flags & REVOKEALL) == 0) {
	3060	panic("vnop_revoke");
	3061	}
	3062	#endif
	3063
	3064	if (vnode_isaliased(vp)) {
	3065	/*
	3066	* If a vgone (or vclean) is already in progress,
	3067	* return an immediate error
	3068	*/
	3069	if (vp->v_lflag & VL_TERMINATE) {
	3070	return ENOENT;
	3071	}
	3072
	3073	/*
	3074	* Ensure that vp will not be vgone'd while we
	3075	* are eliminating its aliases.
	3076	*/
	3077	SPECHASH_LOCK();
		/*
		* Each trip through the outer loop handles at most one alias:
		* the inner scan stops at the first other vnode with the same
		* device and type and breaks out after processing it, so the
		* chain is rescanned from its head while SI_ALIASED stays set.
		*/
	3078	while ((vp->v_specflags & SI_ALIASED)) {
	3079	for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
	3080	if (vq->v_rdev != vp->v_rdev \|\|
	3081	vq->v_type != vp->v_type \|\| vp == vq) {
	3082	continue;
	3083	}
	3084	vid = vq->v_id;
		/* the hash lock is not held across vnode_getwithvid() or the reclaim */
	3085	SPECHASH_UNLOCK();
	3086	if (vnode_getwithvid(vq, vid)) {
	3087	SPECHASH_LOCK();
	3088	break;
	3089	}
	3090	vnode_lock(vq);
	3091	if (!(vq->v_lflag & VL_TERMINATE)) {
	3092	vnode_reclaim_internal(vq, 1, 1, 0);
	3093	}
	3094	vnode_put_locked(vq);
	3095	vnode_unlock(vq);
	3096	SPECHASH_LOCK();
	3097	break;
	3098	}
	3099	}
	3100	SPECHASH_UNLOCK();
	3101	}
		/* finally reclaim vp itself, unless someone else is already terminating it */
	3102	vnode_lock(vp);
	3103	if (vp->v_lflag & VL_TERMINATE) {
	3104	vnode_unlock(vp);
	3105	return ENOENT;
	3106	}
	3107	vnode_reclaim_internal(vp, 1, 0, REVOKEALL);
	3108	vnode_unlock(vp);
	3109
	3110	return 0;
	3111	}
	3112
	3113	/*
	3114	* Recycle an unused vnode to the front of the free list.
	3115	* Release the passed interlock if the vnode will be recycled.
	3116	*/
	3117	int
	3118	vnode_recycle(struct vnode *vp)
	3119	{
	3120	vnode_lock_spin(vp);
	3121
	3122	if (vp->v_iocount \|\| vp->v_usecount) {
	3123	vp->v_lflag \|= VL_MARKTERM;
	3124	vnode_unlock(vp);
	3125	return 0;
	3126	}
	3127	vnode_lock_convert(vp);
	3128	vnode_reclaim_internal(vp, 1, 0, 0);
	3129
	3130	vnode_unlock(vp);
	3131
	3132	return 1;
	3133	}
	3134
	3135	static int
	3136	vnode_reload(vnode_t vp)
	3137	{
	3138	vnode_lock_spin(vp);
	3139
	3140	if ((vp->v_iocount > 1) \|\| vp->v_usecount) {
	3141	vnode_unlock(vp);
	3142	return 0;
	3143	}
	3144	if (vp->v_iocount <= 0) {
	3145	panic("vnode_reload with no iocount %d", vp->v_iocount);
	3146	}
	3147
	3148	/* mark for release when iocount is dopped */
	3149	vp->v_lflag \|= VL_MARKTERM;
	3150	vnode_unlock(vp);
	3151
	3152	return 1;
	3153	}
	3154
	3155
		/*
		* Clean vp with vclean() (always adding DOCLOSE to flags) and, for
		* a block or character device vnode that has a specinfo, unlink it
		* from its spec hash chain, fix up the SI_ALIASED flags of the
		* vnodes left on the chain and free the specinfo.
		*
		* NOTE(review): vclean() documents that it is entered with the
		* vnode lock held, so the same presumably applies here -- confirm
		* against callers.
		*/
	3156	static void
	3157	vgone(vnode_t vp, int flags)
	3158	{
	3159	struct vnode *vq;
	3160	struct vnode *vx;
	3161
	3162	/*
	3163	* Clean out the filesystem specific data.
	3164	* vclean also takes care of removing the
	3165	* vnode from any mount list it might be on
	3166	*/
	3167	vclean(vp, flags \| DOCLOSE);
	3168
	3169	/*
	3170	* If special device, remove it from special device alias list
	3171	* if it is on one.
	3172	*/
	3173	if ((vp->v_type == VBLK \|\| vp->v_type == VCHR) && vp->v_specinfo != 0) {
	3174	SPECHASH_LOCK();
		/* unlink vp: either it heads the chain or its predecessor must be found */
	3175	if (*vp->v_hashchain == vp) {
	3176	*vp->v_hashchain = vp->v_specnext;
	3177	} else {
	3178	for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
	3179	if (vq->v_specnext != vp) {
	3180	continue;
	3181	}
	3182	vq->v_specnext = vp->v_specnext;
	3183	break;
	3184	}
	3185	if (vq == NULL) {
	3186	panic("missing bdev");
	3187	}
	3188	}
	3189	if (vp->v_specflags & SI_ALIASED) {
		/*
		* Look at the remaining vnodes for the same device and type:
		* vx is the first match, and the scan stops early (vq != NULL)
		* as soon as a second match is seen.
		*/
	3190	vx = NULL;
	3191	for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
	3192	if (vq->v_rdev != vp->v_rdev \|\|
	3193	vq->v_type != vp->v_type) {
	3194	continue;
	3195	}
	3196	if (vx) {
	3197	break;
	3198	}
	3199	vx = vq;
	3200	}
	3201	if (vx == NULL) {
	3202	panic("missing alias");
	3203	}
		/* exactly one match left: it is no longer aliased */
	3204	if (vq == NULL) {
	3205	vx->v_specflags &= ~SI_ALIASED;
	3206	}
	3207	vp->v_specflags &= ~SI_ALIASED;
	3208	}
	3209	SPECHASH_UNLOCK();
	3210	{
		/* clear the pointer before the specinfo goes back to its zone */
	3211	struct specinfo *tmp = vp->v_specinfo;
	3212	vp->v_specinfo = NULL;
	3213	zfree(specinfo_zone, tmp);
	3214	}
	3215	}
	3216	}
	3217
	3218	/*
	3219	* Lookup a vnode by device number.
		*
		* Finds the first vnode on the spec hash chain whose device number
		* and type match.  If that vnode is in use (a usecount, or an
		* iocount beyond the one taken here), vfs_mountedon() is called
		* and its result stored in *errorp.  Only the first match is
		* examined.
		*
		* Returns: 1  a match was in use and vfs_mountedon() returned a
		*             non-zero error, which is left in *errorp
		*          0  otherwise; *errorp is written only when
		*             vfs_mountedon() was called
	3220	*/
	3221	int
	3222	check_mountedon(dev_t dev, enum vtype type, int *errorp)
	3223	{
	3224	vnode_t vp;
	3225	int rc = 0;
	3226	int vid;
	3227
	3228	loop:
	3229	SPECHASH_LOCK();
	3230	for (vp = speclisth[SPECHASH(dev)]; vp; vp = vp->v_specnext) {
	3231	if (dev != vp->v_rdev \|\| type != vp->v_type) {
	3232	continue;
	3233	}
	3234	vid = vp->v_id;
	3235	SPECHASH_UNLOCK();
		/* on failure to obtain the vnode, rescan the chain from the start */
	3236	if (vnode_getwithvid(vp, vid)) {
	3237	goto loop;
	3238	}
	3239	vnode_lock_spin(vp);
	3240	if ((vp->v_usecount > 0) \|\| (vp->v_iocount > 1)) {
	3241	vnode_unlock(vp);
	3242	if ((*errorp = vfs_mountedon(vp)) != 0) {
	3243	rc = 1;
	3244	}
	3245	} else {
	3246	vnode_unlock(vp);
	3247	}
	3248	vnode_put(vp);
	3249	return rc;
	3250	}
	3251	SPECHASH_UNLOCK();
	3252	return 0;
	3253	}
	3254
	3255	/*
	3256	* Calculate the total number of references to a special device.
		*
		* For a vnode that is not a special device the result is
		* v_usecount - v_kusecount.  For an unaliased special device it is the
		* si_opencount of its specinfo.  For an aliased device the hash chain
		* is walked and si_opencount is summed over every vnode with the same
		* v_rdev and v_type as 'vp'.  An alias other than 'vp' that has no use
		* count and only the iocount taken here is reclaimed, after which the
		* whole computation restarts.
	3257	*/
	3258	int
	3259	vcount(vnode_t vp)
	3260	{
	3261	vnode_t vq, vnext;
	3262	int count;
	3263	int vid;
	3264
	3265	if (!vnode_isspec(vp)) {
	3266	return vp->v_usecount - vp->v_kusecount;
	3267	}
	3268
		/* restart point: the alias state is re-evaluated on every retry */
	3269	loop:
	3270	if (!vnode_isaliased(vp)) {
	3271	return vp->v_specinfo->si_opencount;
	3272	}
	3273	count = 0;
	3274
	3275	SPECHASH_LOCK();
	3276	/*
	3277	* Grab first vnode and its vid.
	3278	*/
	3279	vq = *vp->v_hashchain;
	3280	vid = vq ? vq->v_id : 0;
	3281
	3282	SPECHASH_UNLOCK();
	3283
	3284	while (vq) {
	3285	/*
	3286	* Attempt to get the vnode outside the SPECHASH lock.
	3287	*/
	3288	if (vnode_getwithvid(vq, vid)) {
		/* could not get vq with the vid sampled under the lock; start over */
	3289	goto loop;
	3290	}
	3291	vnode_lock(vq);
	3292
		/* only vnodes for the same device and type contribute to the count */
	3293	if (vq->v_rdev == vp->v_rdev && vq->v_type == vp->v_type) {
	3294	if ((vq->v_usecount == 0) && (vq->v_iocount == 1) && vq != vp) {
	3295	/*
	3296	* Alias, but not in use, so flush it out.
	3297	*/
	3298	vnode_reclaim_internal(vq, 1, 1, 0);
	3299	vnode_put_locked(vq);
	3300	vnode_unlock(vq);
	3301	goto loop;
	3302	}
	3303	count += vq->v_specinfo->si_opencount;
	3304	}
	3305	vnode_unlock(vq);
	3306
	3307	SPECHASH_LOCK();
	3308	/*
	3309	* must do this with the reference still held on 'vq'
	3310	* so that it can't be destroyed while we're poking
	3311	* through v_specnext
	3312	*/
	3313	vnext = vq->v_specnext;
	3314	vid = vnext ? vnext->v_id : 0;
	3315
	3316	SPECHASH_UNLOCK();
	3317
		/* the iocount on vq is dropped only after its successor has been sampled */
	3318	vnode_put(vq);
	3319
	3320	vq = vnext;
	3321	}
	3322
	3323	return count;
	3324	}
	3325
	3326	int prtactive = 0; /* 1 => print out reclaim of active vnodes */
	3327
	3328	/*
	3329	* Print out a description of a vnode.
	3330	*/
	3331	static const char *typename[] =
	3332	{ "VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD" };
	3333
	3334	void
	3335	vprint(const char label, struct vnode vp)
	3336	{
	3337	char sbuf[64];
	3338
	3339	if (label != NULL) {
	3340	printf("%s: ", label);
	3341	}
	3342	printf("name %s type %s, usecount %d, writecount %d\n",
	3343	vp->v_name, typename[vp->v_type],
	3344	vp->v_usecount, vp->v_writecount);
	3345	sbuf[0] = '\0';
	3346	if (vp->v_flag & VROOT) {
	3347	strlcat(sbuf, "\|VROOT", sizeof(sbuf));
	3348	}
	3349	if (vp->v_flag & VTEXT) {
	3350	strlcat(sbuf, "\|VTEXT", sizeof(sbuf));
	3351	}
	3352	if (vp->v_flag & VSYSTEM) {
	3353	strlcat(sbuf, "\|VSYSTEM", sizeof(sbuf));
	3354	}
	3355	if (vp->v_flag & VNOFLUSH) {
	3356	strlcat(sbuf, "\|VNOFLUSH", sizeof(sbuf));
	3357	}
	3358	if (vp->v_flag & VBWAIT) {
	3359	strlcat(sbuf, "\|VBWAIT", sizeof(sbuf));
	3360	}
	3361	if (vnode_isaliased(vp)) {
	3362	strlcat(sbuf, "\|VALIASED", sizeof(sbuf));
	3363	}
	3364	if (sbuf[0] != '\0') {
	3365	printf("vnode flags (%s\n", &sbuf[1]);
	3366	}
	3367	}
	3368
	3369
	3370	int
	3371	vn_getpath(struct vnode vp, char pathbuf, int *len)
	3372	{
	3373	return build_path(vp, pathbuf, *len, len, BUILDPATH_NO_FS_ENTER, vfs_context_current());
	3374	}
	3375
	3376	int
	3377	vn_getpath_fsenter(struct vnode vp, char pathbuf, int *len)
	3378	{
	3379	return build_path(vp, pathbuf, *len, len, 0, vfs_context_current());
	3380	}
	3381
	3382	/*
	3383	* vn_getpath_fsenter_with_parent will reenter the file system to find the path of the
	3384	* vnode. It requires that there are IO counts on both the vnode and the directory vnode.
	3385	*
	3386	* vn_getpath_fsenter is called by MAC hooks to authorize operations for everything but
	3387	* unlink, rmdir and rename. For these operations the MAC hook calls vn_getpath. This presents
	3388	* problems where if the path can not be found from the name cache, those operations can
	3389	* erroneously fail with EPERM even though the call should succeed. When removing or moving
	3390	* file system objects with operations such as unlink or rename, those operations need to
	3391	* take IO counts on the target and containing directory. Calling vn_getpath_fsenter from a
	3392	* MAC hook from these operations during forced unmount operations can lead to dead
	3393	* lock. This happens when the operation starts, IO counts are taken on the containing
	3394	* directories and targets. Before the MAC hook is called a forced unmount from another
	3395	* thread takes place and blocks on the on going operation's directory vnode in vdrain.
	3396	* After which, the MAC hook gets called and calls vn_getpath_fsenter. vn_getpath_fsenter
	3397	* is called with the understanding that there is an IO count on the target. If in
	3398	* build_path the directory vnode is no longer in the cache, then the parent object id via
	3399	* vnode_getattr from the target is obtained and used to call VFS_VGET to get the parent
	3400	* vnode. The file system's VFS_VGET then looks up by inode in its hash and tries to get
	3401	* an IO count. But VFS_VGET "sees" the directory vnode is in vdrain and can block
	3402	* depending on which version and how it calls the vnode_get family of interfaces.
	3403	*
	3404	* N.B. A reasonable interface to use is vnode_getwithvid. This interface was modified to
	3405	* call vnode_getiocount with VNODE_DRAINO, so it will happily get an IO count and not
	3406	* cause issues, but there is no guarantee that all or any file systems are doing that.
	3407	*
	3408	* vn_getpath_fsenter_with_parent can enter the file system safely since there is a known
	3409	* IO count on the directory vnode by calling build_path_with_parent.
	3410	*/
	3411
	3412	int
	3413	vn_getpath_fsenter_with_parent(struct vnode dvp, struct vnode vp, char pathbuf, int len)
	3414	{
	3415	return build_path_with_parent(vp, dvp, pathbuf, *len, len, NULL, 0, vfs_context_current());
	3416	}
	3417
	3418	int
	3419	vn_getpath_ext(struct vnode vp, struct vnode dvp, char pathbuf, int len, int flags)
	3420	{
	3421	int bpflags = (flags & VN_GETPATH_FSENTER) ? 0 : BUILDPATH_NO_FS_ENTER;
	3422
	3423	if (flags && (flags != VN_GETPATH_FSENTER)) {
	3424	if (flags & VN_GETPATH_NO_FIRMLINK) {
	3425	bpflags \|= BUILDPATH_NO_FIRMLINK;;
	3426	}
	3427	if (flags & VN_GETPATH_VOLUME_RELATIVE) {
	3428	bpflags \|= (BUILDPATH_VOLUME_RELATIVE \| BUILDPATH_NO_FIRMLINK);
	3429	}
	3430	if (flags & VN_GETPATH_NO_PROCROOT) {
	3431	bpflags \|= BUILDPATH_NO_PROCROOT;
	3432	}
	3433	}
	3434
	3435	return build_path_with_parent(vp, dvp, pathbuf, *len, len, NULL, bpflags, vfs_context_current());
	3436	}
	3437
	3438	int
	3439	vn_getpath_no_firmlink(struct vnode vp, char pathbuf, int *len)
	3440	{
	3441	return vn_getpath_ext(vp, NULLVP, pathbuf, len, VN_GETPATH_NO_FIRMLINK);
	3442	}
	3443
	3444	int
	3445	vn_getpath_ext_with_mntlen(struct vnode vp, struct vnode dvp, char pathbuf, size_t len, size_t *mntlen, int flags)
	3446	{
	3447	int bpflags = (flags & VN_GETPATH_FSENTER) ? 0 : BUILDPATH_NO_FS_ENTER;
	3448	int local_len;
	3449	int error;
	3450
	3451	if (*len > INT_MAX) {
	3452	return EINVAL;
	3453	}
	3454
	3455	local_len = *len;
	3456
	3457	if (flags && (flags != VN_GETPATH_FSENTER)) {
	3458	if (flags & VN_GETPATH_NO_FIRMLINK) {
	3459	bpflags \|= BUILDPATH_NO_FIRMLINK;;
	3460	}
	3461	if (flags & VN_GETPATH_VOLUME_RELATIVE) {
	3462	bpflags \|= (BUILDPATH_VOLUME_RELATIVE \| BUILDPATH_NO_FIRMLINK);
	3463	}
	3464	if (flags & VN_GETPATH_NO_PROCROOT) {
	3465	bpflags \|= BUILDPATH_NO_PROCROOT;
	3466	}
	3467	}
	3468
	3469	error = build_path_with_parent(vp, dvp, pathbuf, local_len, &local_len, mntlen, bpflags, vfs_context_current());
	3470
	3471	if (local_len >= 0 && local_len <= (int)*len) {
	3472	*len = (size_t)local_len;
	3473	}
	3474
	3475	return error;
	3476	}
	3477
	3478	int
	3479	vn_getcdhash(struct vnode vp, off_t offset, unsigned char cdhash)
	3480	{
	3481	return ubc_cs_getcdhash(vp, offset, cdhash);
	3482	}
	3483
	3484
	3485	static char *extension_table = NULL;
	3486	static int nexts;
	3487	static int max_ext_width;
	3488
	3489	static int
	3490	extension_cmp(const void a, const void b)
	3491	{
	3492	return (int)(strlen((const char )a) - strlen((const char )b));
	3493	}
	3494
	3495
	3496	//
	3497	// This is the api LaunchServices uses to inform the kernel
	3498	// the list of package extensions to ignore.
	3499	//
	3500	// Internally we keep the list sorted by the length of the
	3501	// extension (from longest to shortest). We sort the
	3502	// list of extensions so that we can speed up our searches
	3503	// when comparing file names -- we only compare extensions
	3504	// that could possibly fit into the file name, not all of
	3505	// them (i.e. a short 8 character name can't have an 8
	3506	// character extension).
	3507	//
	3508	extern lck_mtx_t pkg_extensions_lck;
	3509
	3510	__private_extern__ int
	3511	set_package_extensions_table(user_addr_t data, int nentries, int maxwidth)
	3512	{
	3513	char new_exts, old_exts;
	3514	int old_nentries = 0, old_maxwidth = 0;
	3515	int error;
	3516
	3517	if (nentries <= 0 \|\| nentries > 1024 \|\| maxwidth <= 0 \|\| maxwidth > 255) {
	3518	return EINVAL;
	3519	}
	3520
	3521
	3522	// allocate one byte extra so we can guarantee null termination
	3523	new_exts = kheap_alloc(KHEAP_DATA_BUFFERS, (nentries * maxwidth) + 1,
	3524	Z_WAITOK);
	3525	if (new_exts == NULL) {
	3526	return ENOMEM;
	3527	}
	3528
	3529	error = copyin(data, new_exts, nentries * maxwidth);
	3530	if (error) {
	3531	kheap_free(KHEAP_DATA_BUFFERS, new_exts, (nentries * maxwidth) + 1);
	3532	return error;
	3533	}
	3534
	3535	new_exts[(nentries * maxwidth)] = '\0'; // guarantee null termination of the block
	3536
	3537	qsort(new_exts, nentries, maxwidth, extension_cmp);
	3538
	3539	lck_mtx_lock(&pkg_extensions_lck);
	3540
	3541	old_exts = extension_table;
	3542	old_nentries = nexts;
	3543	old_maxwidth = max_ext_width;
	3544	extension_table = new_exts;
	3545	nexts = nentries;
	3546	max_ext_width = maxwidth;
	3547
	3548	lck_mtx_unlock(&pkg_extensions_lck);
	3549
	3550	kheap_free(KHEAP_DATA_BUFFERS, old_exts,
	3551	(old_nentries * old_maxwidth) + 1);
	3552
	3553	return 0;
	3554	}
	3555
	3556
	3557	int
	3558	is_package_name(const char *name, int len)
	3559	{
	3560	int i;
	3561	size_t extlen;
	3562	const char ptr, name_ext;
	3563
	3564	// if the name is less than 3 bytes it can't be of the
	3565	// form A.B and if it begins with a "." then it is also
	3566	// not a package.
	3567	if (len <= 3 \|\| name[0] == '.') {
	3568	return 0;
	3569	}
	3570
	3571	name_ext = NULL;
	3572	for (ptr = name; *ptr != '\0'; ptr++) {
	3573	if (*ptr == '.') {
	3574	name_ext = ptr;
	3575	}
	3576	}
	3577
	3578	// if there is no "." extension, it can't match
	3579	if (name_ext == NULL) {
	3580	return 0;
	3581	}
	3582
	3583	// advance over the "."
	3584	name_ext++;
	3585
	3586	lck_mtx_lock(&pkg_extensions_lck);
	3587
	3588	// now iterate over all the extensions to see if any match
	3589	ptr = &extension_table[0];
	3590	for (i = 0; i < nexts; i++, ptr += max_ext_width) {
	3591	extlen = strlen(ptr);
	3592	if (strncasecmp(name_ext, ptr, extlen) == 0 && name_ext[extlen] == '\0') {
	3593	// aha, a match!
	3594	lck_mtx_unlock(&pkg_extensions_lck);
	3595	return 1;
	3596	}
	3597	}
	3598
	3599	lck_mtx_unlock(&pkg_extensions_lck);
	3600
	3601	// if we get here, no extension matched
	3602	return 0;
	3603	}
	3604
	3605	int
	3606	vn_path_package_check(__unused vnode_t vp, char path, int pathlen, int component)
	3607	{
	3608	char ptr, end;
	3609	int comp = 0;
	3610
	3611	if (pathlen < 0) {
	3612	return EINVAL;
	3613	}
	3614
	3615	*component = -1;
	3616	if (*path != '/') {
	3617	return EINVAL;
	3618	}
	3619
	3620	end = path + 1;
	3621	while (end < path + pathlen && *end != '\0') {
	3622	while (end < path + pathlen && end == '/' && end != '\0') {
	3623	end++;
	3624	}
	3625
	3626	ptr = end;
	3627
	3628	while (end < path + pathlen && end != '/' && end != '\0') {
	3629	end++;
	3630	}
	3631
	3632	if (end > path + pathlen) {
	3633	// hmm, string wasn't null terminated
	3634	return EINVAL;
	3635	}
	3636
	3637	*end = '\0';
	3638	if (is_package_name(ptr, (int)(end - ptr))) {
	3639	*component = comp;
	3640	break;
	3641	}
	3642
	3643	end++;
	3644	comp++;
	3645	}
	3646
	3647	return 0;
	3648	}
	3649
	3650	/*
	3651	* Determine if a name is inappropriate for a searchfs query.
	3652	* This list consists of /System currently.
	3653	*/
	3654
	3655	int
	3656	vn_searchfs_inappropriate_name(const char *name, int len)
	3657	{
	3658	const char *bad_names[] = { "System" };
	3659	int bad_len[] = { 6 };
	3660	int i;
	3661
	3662	if (len < 0) {
	3663	return EINVAL;
	3664	}
	3665
	3666	for (i = 0; i < (int) (sizeof(bad_names) / sizeof(bad_names[0])); i++) {
	3667	if (len == bad_len[i] && strncmp(name, bad_names[i], strlen(bad_names[i]) + 1) == 0) {
	3668	return 1;
	3669	}
	3670	}
	3671
	3672	// if we get here, no name matched
	3673	return 0;
	3674	}
	3675
	3676	/*
	3677	* Top level filesystem related information gathering.
	3678	*/
	3679	extern unsigned int vfs_nummntops;
	3680
	3681	/*
	3682	* The VFS_NUMMNTOPS shouldn't be at name[1] since
	3683	* is a VFS generic variable. Since we no longer support
	3684	* VT_UFS, we reserve its value to support this sysctl node.
	3685	*
	3686	* It should have been:
	3687	* name[0]: VFS_GENERIC
	3688	* name[1]: VFS_NUMMNTOPS
	3689	*/
	3690	SYSCTL_INT(_vfs, VFS_NUMMNTOPS, nummntops,
	3691	CTLFLAG_RD \| CTLFLAG_KERN \| CTLFLAG_LOCKED,
	3692	&vfs_nummntops, 0, "");
	3693
	3694	int
	3695	vfs_sysctl(int *name __unused, u_int namelen __unused,
	3696	user_addr_t oldp __unused, size_t *oldlenp __unused,
	3697	user_addr_t newp __unused, size_t newlen __unused, proc_t p __unused);
	3698
	3699	int
	3700	vfs_sysctl(int *name __unused, u_int namelen __unused,
	3701	user_addr_t oldp __unused, size_t *oldlenp __unused,
	3702	user_addr_t newp __unused, size_t newlen __unused, proc_t p __unused)
	3703	{
	3704	return EINVAL;
	3705	}
	3706
	3707
	3708	//
	3709	// The following code disallows specific sysctl's that came through
	3710	// the direct sysctl interface (vfs_sysctl_node) instead of the newer
	3711	// sysctl_vfs_ctlbyfsid() interface. We can not allow these selectors
	3712	// through vfs_sysctl_node() because it passes the user's oldp pointer
	3713	// directly to the file system which (for these selectors) casts it
	3714	// back to a struct sysctl_req and then proceed to use SYSCTL_IN()
	3715	// which jumps through an arbitrary function pointer. When called
	3716	// through the sysctl_vfs_ctlbyfsid() interface this does not happen
	3717	// and so it's safe.
	3718	//
	3719	// Unfortunately we have to pull in definitions from AFP and SMB and
	3720	// perform explicit name checks on the file system to determine if
	3721	// these selectors are being used.
	3722	//
	3723
	3724	#define AFPFS_VFS_CTL_GETID 0x00020001
	3725	#define AFPFS_VFS_CTL_NETCHANGE 0x00020002
	3726	#define AFPFS_VFS_CTL_VOLCHANGE 0x00020003
	3727
	3728	#define SMBFS_SYSCTL_REMOUNT 1
	3729	#define SMBFS_SYSCTL_REMOUNT_INFO 2
	3730	#define SMBFS_SYSCTL_GET_SERVER_SHARE 3
	3731
	3732
	3733	static int
	3734	is_bad_sysctl_name(struct vfstable *vfsp, int selector_name)
	3735	{
	3736	switch (selector_name) {
	3737	case VFS_CTL_QUERY:
	3738	case VFS_CTL_TIMEO:
	3739	case VFS_CTL_NOLOCKS:
	3740	case VFS_CTL_NSTATUS:
	3741	case VFS_CTL_SADDR:
	3742	case VFS_CTL_DISC:
	3743	case VFS_CTL_SERVERINFO:
	3744	return 1;
	3745
	3746	default:
	3747	break;
	3748	}
	3749
	3750	// the more complicated check for some of SMB's special values
	3751	if (strcmp(vfsp->vfc_name, "smbfs") == 0) {
	3752	switch (selector_name) {
	3753	case SMBFS_SYSCTL_REMOUNT:
	3754	case SMBFS_SYSCTL_REMOUNT_INFO:
	3755	case SMBFS_SYSCTL_GET_SERVER_SHARE:
	3756	return 1;
	3757	}
	3758	} else if (strcmp(vfsp->vfc_name, "afpfs") == 0) {
	3759	switch (selector_name) {
	3760	case AFPFS_VFS_CTL_GETID:
	3761	case AFPFS_VFS_CTL_NETCHANGE:
	3762	case AFPFS_VFS_CTL_VOLCHANGE:
	3763	return 1;
	3764	}
	3765	}
	3766
	3767	//
	3768	// If we get here we passed all the checks so the selector is ok
	3769	//
	3770	return 0;
	3771	}
	3772
	3773
	3774	int vfs_sysctl_node SYSCTL_HANDLER_ARGS
	3775	{
	3776	int *name, namelen;
	3777	struct vfstable *vfsp;
	3778	int error;
	3779	int fstypenum;
	3780
	3781	fstypenum = oidp->oid_number;
	3782	name = arg1;
	3783	namelen = arg2;
	3784
	3785	/* all sysctl names at this level should have at least one name slot for the FS */
	3786	if (namelen < 1) {
	3787	return EISDIR; /* overloaded */
	3788	}
	3789	mount_list_lock();
	3790	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
	3791	if (vfsp->vfc_typenum == fstypenum) {
	3792	vfsp->vfc_refcount++;
	3793	break;
	3794	}
	3795	}
	3796	mount_list_unlock();
	3797
	3798	if (vfsp == NULL) {
	3799	return ENOTSUP;
	3800	}
	3801
	3802	if (is_bad_sysctl_name(vfsp, name[0])) {
	3803	printf("vfs: bad selector 0x%.8x for old-style sysctl(). use the sysctl-by-fsid interface instead\n", name[0]);
	3804	return EPERM;
	3805	}
	3806
	3807	error = (vfsp->vfc_vfsops->vfs_sysctl)(name, namelen, req->oldptr, &req->oldlen, req->newptr, req->newlen, vfs_context_current());
	3808
	3809	mount_list_lock();
	3810	vfsp->vfc_refcount--;
	3811	mount_list_unlock();
	3812
	3813	return error;
	3814	}
	3815
	3816	/*
	3817	* Check to see if a filesystem is mounted on a block device.
	3818	*/
	3819	int
	3820	vfs_mountedon(struct vnode *vp)
	3821	{
	3822	struct vnode *vq;
	3823	int error = 0;
	3824
	3825	SPECHASH_LOCK();
	3826	if (vp->v_specflags & SI_MOUNTEDON) {
	3827	error = EBUSY;
	3828	goto out;
	3829	}
	3830	if (vp->v_specflags & SI_ALIASED) {
	3831	for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
	3832	if (vq->v_rdev != vp->v_rdev \|\|
	3833	vq->v_type != vp->v_type) {
	3834	continue;
	3835	}
	3836	if (vq->v_specflags & SI_MOUNTEDON) {
	3837	error = EBUSY;
	3838	break;
	3839	}
	3840	}
	3841	}
	3842	out:
	3843	SPECHASH_UNLOCK();
	3844	return error;
	3845	}
	3846
	3847	struct unmount_info {
	3848	int u_errs; // Total failed unmounts
	3849	int u_busy; // EBUSY failed unmounts
	3850	int u_count; // Total volumes iterated
	3851	int u_only_non_system;
	3852	};
	3853
	3854	static int
	3855	unmount_callback(mount_t mp, void *arg)
	3856	{
	3857	int error;
	3858	char *mntname;
	3859	struct unmount_info *uip = arg;
	3860
	3861	uip->u_count++;
	3862
	3863	mntname = zalloc(ZV_NAMEI);
	3864	strlcpy(mntname, mp->mnt_vfsstat.f_mntonname, MAXPATHLEN);
	3865
	3866	if (uip->u_only_non_system
	3867	&& ((mp->mnt_flag & MNT_ROOTFS) \|\| (mp->mnt_kern_flag & MNTK_SYSTEM))) { //MNTK_BACKS_ROOT
	3868	printf("unmount(%d) %s skipped\n", uip->u_only_non_system, mntname);
	3869	mount_iterdrop(mp); // VFS_ITERATE_CB_DROPREF
	3870	} else {
	3871	printf("unmount(%d) %s\n", uip->u_only_non_system, mntname);
	3872
	3873	mount_ref(mp, 0);
	3874	mount_iterdrop(mp); // VFS_ITERATE_CB_DROPREF
	3875	error = dounmount(mp, MNT_FORCE, 1, vfs_context_current());
	3876	if (error) {
	3877	uip->u_errs++;
	3878	printf("Unmount of %s failed (%d)\n", mntname ? mntname:"?", error);
	3879	if (error == EBUSY) {
	3880	uip->u_busy++;
	3881	}
	3882	}
	3883	}
	3884	if (mntname) {
	3885	zfree(ZV_NAMEI, mntname);
	3886	}
	3887
	3888	return VFS_RETURNED;
	3889	}
	3890
	3891	/*
	3892	* Unmount all filesystems. The list is traversed in reverse order
	3893	* of mounting to avoid dependencies.
	3894	* Busy mounts are retried.
	3895	*/
	3896	__private_extern__ void
	3897	vfs_unmountall(int only_non_system)
	3898	{
	3899	int mounts, sec = 1;
	3900	struct unmount_info ui;
	3901
	3902	vfs_unmountall_started = 1;
	3903	printf("vfs_unmountall(%ssystem) start\n", only_non_system ? "non" : "");
	3904
	3905	retry:
	3906	ui.u_errs = ui.u_busy = ui.u_count = 0;
	3907	ui.u_only_non_system = only_non_system;
	3908	// avoid vfs_iterate deadlock in dounmount(), use VFS_ITERATE_CB_DROPREF
	3909	vfs_iterate(VFS_ITERATE_CB_DROPREF \| VFS_ITERATE_TAIL_FIRST, unmount_callback, &ui);
	3910	mounts = mount_getvfscnt();
	3911	if (mounts == 0) {
	3912	return;
	3913	}
	3914	if (ui.u_busy > 0) { // Busy mounts - wait & retry
	3915	tsleep(&nummounts, PVFS, "busy mount", sec * hz);
	3916	sec *= 2;
	3917	if (sec <= 32) {
	3918	goto retry;
	3919	}
	3920	printf("Unmounting timed out\n");
	3921	} else if (ui.u_count < mounts) {
	3922	// If the vfs_iterate missed mounts in progress - wait a bit
	3923	tsleep(&nummounts, PVFS, "missed mount", 2 * hz);
	3924	}
	3925
	3926	printf("vfs_unmountall(%ssystem) end\n", only_non_system ? "non" : "");
	3927	}
	3928
	3929	/*
	3930	* This routine is called from vnode_pager_deallocate out of the VM
	3931	* The path to vnode_pager_deallocate can only be initiated by ubc_destroy_named
	3932	* on a vnode that has a UBCINFO
	3933	*/
	3934	__private_extern__ void
	3935	vnode_pager_vrele(vnode_t vp)
	3936	{
	3937	struct ubc_info *uip;
	3938
	3939	vnode_lock_spin(vp);
	3940
	3941	vp->v_lflag &= ~VNAMED_UBC;
	3942	if (vp->v_usecount != 0) {
	3943	/*
	3944	* At the eleventh hour, just before the ubcinfo is
	3945	* destroyed, ensure the ubc-specific v_usecount
	3946	* reference has gone. We use v_usecount != 0 as a hint;
	3947	* ubc_unmap() does nothing if there's no mapping.
	3948	*
	3949	* This case is caused by coming here via forced unmount,
	3950	* versus the usual vm_object_deallocate() path.
	3951	* In the forced unmount case, ubc_destroy_named()
	3952	* releases the pager before memory_object_last_unmap()
	3953	* can be called.
	3954	*/
	3955	vnode_unlock(vp);
	3956	ubc_unmap(vp);
	3957	vnode_lock_spin(vp);
	3958	}
	3959
	3960	uip = vp->v_ubcinfo;
	3961	vp->v_ubcinfo = UBC_INFO_NULL;
	3962
	3963	vnode_unlock(vp);
	3964
	3965	ubc_info_deallocate(uip);
	3966	}
	3967
	3968
	3969	#include <sys/disk.h>
	3970
		/* mnt_devbsdunit recorded for the root device; stays (u_int32_t)-1 until devvp == rootvp is seen */
	3971	u_int32_t rootunit = (u_int32_t)-1;
	3972
	3973	#if CONFIG_IOSCHED
	3974	extern int lowpri_throttle_enabled;
	3975	extern int iosched_enabled;
	3976	#endif
	3977
		/*
		 * Query the device behind 'devvp' with a series of DKIOC* ioctls and
		 * record the resulting I/O limits and capability flags in 'mp'.
		 *
		 * The block size, feature mask, max block/byte/segment counts, segment
		 * sizes, alignment and command pool size ioctls are mandatory: the first
		 * one that fails ends the function with that error, leaving 'mp' only
		 * partially updated.  The throttle mask, virtual, solid-state, minimum
		 * saturation, CoreStorage, APFS flavour and location ioctls are optional
		 * and their failures are ignored.  Returns 0 when every mandatory ioctl
		 * succeeded.
		 */
	3978	errno_t
	3979	vfs_init_io_attributes(vnode_t devvp, mount_t mp)
	3980	{
	3981	int error;
	3982	off_t readblockcnt = 0;
	3983	off_t writeblockcnt = 0;
	3984	off_t readmaxcnt = 0;
	3985	off_t writemaxcnt = 0;
	3986	off_t readsegcnt = 0;
	3987	off_t writesegcnt = 0;
	3988	off_t readsegsize = 0;
	3989	off_t writesegsize = 0;
	3990	off_t alignment = 0;
	3991	u_int32_t minsaturationbytecount = 0;
	3992	u_int32_t ioqueue_depth = 0;
	3993	u_int32_t blksize;
	3994	u_int64_t temp;
	3995	u_int32_t features;
	3996	u_int64_t location = 0;
	3997	vfs_context_t ctx = vfs_context_current();
	3998	dk_corestorage_info_t cs_info;
	3999	boolean_t cs_present = FALSE;;
	4000	int isssd = 0;
	4001	int isvirtual = 0;
	4002
	4003
		/* the result of this ioctl is not checked, and it is issued with a NULL context */
	4004	VNOP_IOCTL(devvp, DKIOCGETTHROTTLEMASK, (caddr_t)&mp->mnt_throttle_mask, 0, NULL);
	4005	/*
	4006	* as a reasonable approximation, only use the lowest bit of the mask
	4007	* to generate a disk unit number
	4008	*/
	4009	mp->mnt_devbsdunit = num_trailing_0(mp->mnt_throttle_mask);
	4010
	4011	if (devvp == rootvp) {
	4012	rootunit = mp->mnt_devbsdunit;
	4013	}
	4014
	4015	if (mp->mnt_devbsdunit == rootunit) {
	4016	/*
	4017	* this mount point exists on the same device as the root
	4018	* partition, so it comes under the hard throttle control...
	4019	* this is true even for the root mount point itself
	4020	*/
	4021	mp->mnt_kern_flag \|= MNTK_ROOTDEV;
	4022	}
	4023	/*
	4024	* force the spec device to re-cache
	4025	* the underlying block size in case
	4026	* the filesystem overrode the initial value
	4027	*/
	4028	set_fsblocksize(devvp);
	4029
	4030
	4031	if ((error = VNOP_IOCTL(devvp, DKIOCGETBLOCKSIZE,
	4032	(caddr_t)&blksize, 0, ctx))) {
	4033	return error;
	4034	}
	4035
	4036	mp->mnt_devblocksize = blksize;
	4037
	4038	/*
	4039	* set the maximum possible I/O size
	4040	* this may get clipped to a smaller value
	4041	* based on which constraints are being advertised
	4042	* and if those advertised constraints result in a smaller
	4043	* limit for a given I/O
	4044	*/
	4045	mp->mnt_maxreadcnt = MAX_UPL_SIZE_BYTES;
	4046	mp->mnt_maxwritecnt = MAX_UPL_SIZE_BYTES;
	4047
		/* a virtual device is also flagged as removable */
	4048	if (VNOP_IOCTL(devvp, DKIOCISVIRTUAL, (caddr_t)&isvirtual, 0, ctx) == 0) {
	4049	if (isvirtual) {
	4050	mp->mnt_kern_flag \|= MNTK_VIRTUALDEV;
	4051	mp->mnt_flag \|= MNT_REMOVABLE;
	4052	}
	4053	}
	4054	if (VNOP_IOCTL(devvp, DKIOCISSOLIDSTATE, (caddr_t)&isssd, 0, ctx) == 0) {
	4055	if (isssd) {
	4056	mp->mnt_kern_flag \|= MNTK_SSD;
	4057	}
	4058	}
		/* mandatory queries: each failure below returns immediately */
	4059	if ((error = VNOP_IOCTL(devvp, DKIOCGETFEATURES,
	4060	(caddr_t)&features, 0, ctx))) {
	4061	return error;
	4062	}
	4063
	4064	if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXBLOCKCOUNTREAD,
	4065	(caddr_t)&readblockcnt, 0, ctx))) {
	4066	return error;
	4067	}
	4068
	4069	if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXBLOCKCOUNTWRITE,
	4070	(caddr_t)&writeblockcnt, 0, ctx))) {
	4071	return error;
	4072	}
	4073
	4074	if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXBYTECOUNTREAD,
	4075	(caddr_t)&readmaxcnt, 0, ctx))) {
	4076	return error;
	4077	}
	4078
	4079	if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXBYTECOUNTWRITE,
	4080	(caddr_t)&writemaxcnt, 0, ctx))) {
	4081	return error;
	4082	}
	4083
	4084	if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXSEGMENTCOUNTREAD,
	4085	(caddr_t)&readsegcnt, 0, ctx))) {
	4086	return error;
	4087	}
	4088
	4089	if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXSEGMENTCOUNTWRITE,
	4090	(caddr_t)&writesegcnt, 0, ctx))) {
	4091	return error;
	4092	}
	4093
	4094	if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXSEGMENTBYTECOUNTREAD,
	4095	(caddr_t)&readsegsize, 0, ctx))) {
	4096	return error;
	4097	}
	4098
	4099	if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXSEGMENTBYTECOUNTWRITE,
	4100	(caddr_t)&writesegsize, 0, ctx))) {
	4101	return error;
	4102	}
	4103
	4104	if ((error = VNOP_IOCTL(devvp, DKIOCGETMINSEGMENTALIGNMENTBYTECOUNT,
	4105	(caddr_t)&alignment, 0, ctx))) {
	4106	return error;
	4107	}
	4108
	4109	if ((error = VNOP_IOCTL(devvp, DKIOCGETCOMMANDPOOLSIZE,
	4110	(caddr_t)&ioqueue_depth, 0, ctx))) {
	4111	return error;
	4112	}
	4113
		/* a reported value of zero leaves the corresponding default in place */
	4114	if (readmaxcnt) {
	4115	mp->mnt_maxreadcnt = (readmaxcnt > UINT32_MAX) ? UINT32_MAX :(uint32_t) readmaxcnt;
	4116	}
	4117
		/* the block-count limit, converted to bytes, can only lower the byte limit */
	4118	if (readblockcnt) {
	4119	temp = readblockcnt * blksize;
	4120	temp = (temp > UINT32_MAX) ? UINT32_MAX : temp;
	4121
	4122	if (temp < mp->mnt_maxreadcnt) {
	4123	mp->mnt_maxreadcnt = (u_int32_t)temp;
	4124	}
	4125	}
	4126
	4127	if (writemaxcnt) {
	4128	mp->mnt_maxwritecnt = (writemaxcnt > UINT32_MAX) ? UINT32_MAX : (uint32_t)writemaxcnt;
	4129	}
	4130
	4131	if (writeblockcnt) {
	4132	temp = writeblockcnt * blksize;
	4133	temp = (temp > UINT32_MAX) ? UINT32_MAX : temp;
	4134
	4135	if (temp < mp->mnt_maxwritecnt) {
	4136	mp->mnt_maxwritecnt = (u_int32_t)temp;
	4137	}
	4138	}
	4139
		/* segment counts are clamped to UINT16_MAX; when unreported they default to one segment per page of the max transfer */
	4140	if (readsegcnt) {
	4141	temp = (readsegcnt > UINT16_MAX) ? UINT16_MAX : readsegcnt;
	4142	} else {
	4143	temp = mp->mnt_maxreadcnt / PAGE_SIZE;
	4144
	4145	if (temp > UINT16_MAX) {
	4146	temp = UINT16_MAX;
	4147	}
	4148	}
	4149	mp->mnt_segreadcnt = (u_int16_t)temp;
	4150
	4151	if (writesegcnt) {
	4152	temp = (writesegcnt > UINT16_MAX) ? UINT16_MAX : writesegcnt;
	4153	} else {
	4154	temp = mp->mnt_maxwritecnt / PAGE_SIZE;
	4155
	4156	if (temp > UINT16_MAX) {
	4157	temp = UINT16_MAX;
	4158	}
	4159	}
	4160	mp->mnt_segwritecnt = (u_int16_t)temp;
	4161
		/* segment sizes default to the max transfer size when unreported */
	4162	if (readsegsize) {
	4163	temp = (readsegsize > UINT32_MAX) ? UINT32_MAX : readsegsize;
	4164	} else {
	4165	temp = mp->mnt_maxreadcnt;
	4166	}
	4167	mp->mnt_maxsegreadsize = (u_int32_t)temp;
	4168
	4169	if (writesegsize) {
	4170	temp = (writesegsize > UINT32_MAX) ? UINT32_MAX : writesegsize;
	4171	} else {
	4172	temp = mp->mnt_maxwritecnt;
	4173	}
	4174	mp->mnt_maxsegwritesize = (u_int32_t)temp;
	4175
		/* the alignment is stored as a mask (alignment - 1), capped at PAGE_MASK */
	4176	if (alignment) {
	4177	temp = (alignment > PAGE_SIZE) ? PAGE_MASK : alignment - 1;
	4178	} else {
	4179	temp = 0;
	4180	}
	4181	mp->mnt_alignmentmask = (uint32_t)temp;
	4182
	4183
		/* the queue depth never drops below MNT_DEFAULT_IOQUEUE_DEPTH */
	4184	if (ioqueue_depth > MNT_DEFAULT_IOQUEUE_DEPTH) {
	4185	temp = ioqueue_depth;
	4186	} else {
	4187	temp = MNT_DEFAULT_IOQUEUE_DEPTH;
	4188	}
	4189
	4190	mp->mnt_ioqueue_depth = (uint32_t)temp;
	4191	mp->mnt_ioscale = MNT_IOSCALE(mp->mnt_ioqueue_depth);
	4192
	4193	if (mp->mnt_ioscale > 1) {
	4194	printf("ioqueue_depth = %d, ioscale = %d\n", (int)mp->mnt_ioqueue_depth, (int)mp->mnt_ioscale);
	4195	}
	4196
	4197	if (features & DK_FEATURE_FORCE_UNIT_ACCESS) {
	4198	mp->mnt_ioflags \|= MNT_IOFLAGS_FUA_SUPPORTED;
	4199	}
	4200
	4201	if (VNOP_IOCTL(devvp, DKIOCGETIOMINSATURATIONBYTECOUNT, (caddr_t)&minsaturationbytecount, 0, ctx) == 0) {
	4202	mp->mnt_minsaturationbytecount = minsaturationbytecount;
	4203	} else {
	4204	mp->mnt_minsaturationbytecount = 0;
	4205	}
	4206
		/* cs_info is only read below when this ioctl succeeded */
	4207	if (VNOP_IOCTL(devvp, DKIOCCORESTORAGE, (caddr_t)&cs_info, 0, ctx) == 0) {
	4208	cs_present = TRUE;
	4209	}
	4210
	4211	if (features & DK_FEATURE_UNMAP) {
	4212	mp->mnt_ioflags \|= MNT_IOFLAGS_UNMAP_SUPPORTED;
	4213
	4214	if (cs_present == TRUE) {
	4215	mp->mnt_ioflags \|= MNT_IOFLAGS_CSUNMAP_SUPPORTED;
	4216	}
	4217	}
	4218	if (cs_present == TRUE) {
	4219	/*
	4220	* for now we'll use the following test as a proxy for
	4221	* the underlying drive being FUSION in nature
	4222	*/
	4223	if ((cs_info.flags & DK_CORESTORAGE_PIN_YOUR_METADATA)) {
	4224	mp->mnt_ioflags \|= MNT_IOFLAGS_FUSION_DRIVE;
	4225	}
	4226	} else {
	4227	/* Check for APFS Fusion */
	4228	dk_apfs_flavour_t flavour;
	4229	if ((VNOP_IOCTL(devvp, DKIOCGETAPFSFLAVOUR, (caddr_t)&flavour, 0, ctx) == 0) &&
	4230	(flavour == DK_APFS_FUSION)) {
	4231	mp->mnt_ioflags \|= MNT_IOFLAGS_FUSION_DRIVE;
	4232	}
	4233	}
	4234
		/* an external location marks the mount as a removable peripheral drive */
	4235	if (VNOP_IOCTL(devvp, DKIOCGETLOCATION, (caddr_t)&location, 0, ctx) == 0) {
	4236	if (location & DK_LOCATION_EXTERNAL) {
	4237	mp->mnt_ioflags \|= MNT_IOFLAGS_PERIPHERAL_DRIVE;
	4238	mp->mnt_flag \|= MNT_REMOVABLE;
	4239	}
	4240	}
	4241
	4242	#if CONFIG_IOSCHED
	4243	if (iosched_enabled && (features & DK_FEATURE_PRIORITY)) {
	4244	mp->mnt_ioflags \|= MNT_IOFLAGS_IOSCHED_SUPPORTED;
	4245	throttle_info_disable_throttle(mp->mnt_devbsdunit, (mp->mnt_ioflags & MNT_IOFLAGS_FUSION_DRIVE) != 0);
	4246	}
	4247	#endif /* CONFIG_IOSCHED */
		/* error was last assigned by the DKIOCGETCOMMANDPOOLSIZE check, so it is 0 here */
	4248	return error;
	4249	}
	4250
	4251	static struct klist fs_klist;
	4252	static LCK_GRP_DECLARE(fs_klist_lck_grp, "fs_klist");
	4253	static LCK_MTX_DECLARE(fs_klist_lock, &fs_klist_lck_grp);
	4254
	4255	void
	4256	vfs_event_init(void)
	4257	{
	4258	klist_init(&fs_klist);
	4259	}
	4260
	4261	void
	4262	vfs_event_signal(fsid_t *fsid, u_int32_t event, intptr_t data)
	4263	{
	4264	if (event == VQ_DEAD \|\| event == VQ_NOTRESP) {
	4265	struct mount *mp = vfs_getvfs(fsid);
	4266	if (mp) {
	4267	mount_lock_spin(mp);
	4268	if (data) {
	4269	mp->mnt_kern_flag &= ~MNT_LNOTRESP; // Now responding
	4270	} else {
	4271	mp->mnt_kern_flag \|= MNT_LNOTRESP; // Not responding
	4272	}
	4273	mount_unlock(mp);
	4274	}
	4275	}
	4276
	4277	lck_mtx_lock(&fs_klist_lock);
	4278	KNOTE(&fs_klist, event);
	4279	lck_mtx_unlock(&fs_klist_lock);
	4280	}
	4281
	4282	/*
	4283	* return the number of mounted filesystems.
	4284	*/
	4285	static int
	4286	sysctl_vfs_getvfscnt(void)
	4287	{
	4288	return mount_getvfscnt();
	4289	}
	4290
	4291
	4292	static int
	4293	mount_getvfscnt(void)
	4294	{
	4295	int ret;
	4296
	4297	mount_list_lock();
	4298	ret = nummounts;
	4299	mount_list_unlock();
	4300	return ret;
	4301	}
	4302
	4303
	4304
	4305	static int
	4306	mount_fillfsids(fsid_t *fsidlst, int count)
	4307	{
	4308	struct mount *mp;
	4309	int actual = 0;
	4310
	4311	actual = 0;
	4312	mount_list_lock();
	4313	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
	4314	if (actual < count) {
	4315	fsidlst[actual] = mp->mnt_vfsstat.f_fsid;
	4316	actual++;
	4317	}
	4318	}
	4319	mount_list_unlock();
	4320	return actual;
	4321	}
	4322
	4323	/*
	4324	* fill in the array of fsid_t's up to a max of 'count', the actual
	4325	* number filled in will be set in '*actual'. If there are more fsid_t's
	4326	* than room in fsidlst then ENOMEM will be returned and '*actual' will
	4327	* have the actual count.
	4328	* having *actual filled out even in the error case is depended upon.
	4329	*/
	4330	static int
	4331	sysctl_vfs_getvfslist(fsid_t fsidlst, unsigned long count, unsigned long actual)
	4332	{
	4333	struct mount *mp;
	4334
	4335	*actual = 0;
	4336	mount_list_lock();
	4337	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
	4338	(*actual)++;
	4339	if (*actual <= count) {
	4340	fsidlst[(*actual) - 1] = mp->mnt_vfsstat.f_fsid;
	4341	}
	4342	}
	4343	mount_list_unlock();
	4344	return *actual <= count ? 0 : ENOMEM;
	4345	}
	4346
	4347	static int
	4348	sysctl_vfs_vfslist(__unused struct sysctl_oid oidp, __unused void arg1,
	4349	__unused int arg2, struct sysctl_req *req)
	4350	{
	4351	unsigned long actual;
	4352	int error;
	4353	size_t space;
	4354	fsid_t *fsidlst;
	4355
	4356	/* This is a readonly node. */
	4357	if (req->newptr != USER_ADDR_NULL) {
	4358	return EPERM;
	4359	}
	4360
	4361	/* they are querying us so just return the space required. */
	4362	if (req->oldptr == USER_ADDR_NULL) {
	4363	req->oldidx = sysctl_vfs_getvfscnt() * sizeof(fsid_t);
	4364	return 0;
	4365	}
	4366	again:
	4367	/*
	4368	* Retrieve an accurate count of the amount of space required to copy
	4369	* out all the fsids in the system.
	4370	*/
	4371	space = req->oldlen;
	4372	req->oldlen = sysctl_vfs_getvfscnt() * sizeof(fsid_t);
	4373
	4374	/* they didn't give us enough space. */
	4375	if (space < req->oldlen) {
	4376	return ENOMEM;
	4377	}
	4378
	4379	fsidlst = kheap_alloc(KHEAP_TEMP, req->oldlen, Z_WAITOK \| Z_ZERO);
	4380	if (fsidlst == NULL) {
	4381	return ENOMEM;
	4382	}
	4383
	4384	error = sysctl_vfs_getvfslist(fsidlst, req->oldlen / sizeof(fsid_t),
	4385	&actual);
	4386	/*
	4387	* If we get back ENOMEM, then another mount has been added while we
	4388	* slept in malloc above. If this is the case then try again.
	4389	*/
	4390	if (error == ENOMEM) {
	4391	kheap_free(KHEAP_TEMP, fsidlst, req->oldlen);
	4392	req->oldlen = space;
	4393	goto again;
	4394	}
	4395	if (error == 0) {
	4396	error = SYSCTL_OUT(req, fsidlst, actual * sizeof(fsid_t));
	4397	}
	4398	kheap_free(KHEAP_TEMP, fsidlst, req->oldlen);
	4399	return error;
	4400	}
	4401
	4402	/*
	4403	* Do a sysctl by fsid.
	4404	*/
	4405	static int
	4406	sysctl_vfs_ctlbyfsid(__unused struct sysctl_oid oidp, void arg1, int arg2,
	4407	struct sysctl_req *req)
	4408	{
	4409	union union_vfsidctl vc;
	4410	struct mount *mp;
	4411	struct vfsstatfs *sp;
	4412	int *name, namelen;
	4413	int flags = 0;
	4414	int error = 0, gotref = 0;
	4415	vfs_context_t ctx = vfs_context_current();
	4416	proc_t p = req->p; /* XXX req->p != current_proc()? */
	4417	boolean_t is_64_bit;
	4418	union {
	4419	struct statfs64 sfs64;
	4420	struct user64_statfs osfs64;
	4421	struct user32_statfs osfs32;
	4422	} *sfsbuf;
	4423
	4424	if (req->newptr == USER_ADDR_NULL) {
	4425	error = EINVAL;
	4426	goto out;
	4427	}
	4428
	4429	name = arg1;
	4430	namelen = arg2;
	4431	is_64_bit = proc_is64bit(p);
	4432
	4433	error = SYSCTL_IN(req, &vc, is_64_bit? sizeof(vc.vc64):sizeof(vc.vc32));
	4434	if (error) {
	4435	goto out;
	4436	}
	4437	if (vc.vc32.vc_vers != VFS_CTL_VERS1) { /* works for 32 and 64 */
	4438	error = EINVAL;
	4439	goto out;
	4440	}
	4441	mp = mount_list_lookupby_fsid(&vc.vc32.vc_fsid, 0, 1); /* works for 32 and 64 */
	4442	if (mp == NULL) {
	4443	error = ENOENT;
	4444	goto out;
	4445	}
	4446	gotref = 1;
	4447	/* reset so that the fs specific code can fetch it. */
	4448	req->newidx = 0;
	4449	/*
	4450	* Note if this is a VFS_CTL then we pass the actual sysctl req
	4451	* in for "oldp" so that the lower layer can DTRT and use the
	4452	* SYSCTL_IN/OUT routines.
	4453	*/
	4454	if (mp->mnt_op->vfs_sysctl != NULL) {
	4455	if (is_64_bit) {
	4456	if (vfs_64bitready(mp)) {
	4457	error = mp->mnt_op->vfs_sysctl(name, namelen,
	4458	CAST_USER_ADDR_T(req),
	4459	NULL, USER_ADDR_NULL, 0,
	4460	ctx);
	4461	} else {
	4462	error = ENOTSUP;
	4463	}
	4464	} else {
	4465	error = mp->mnt_op->vfs_sysctl(name, namelen,
	4466	CAST_USER_ADDR_T(req),
	4467	NULL, USER_ADDR_NULL, 0,
	4468	ctx);
	4469	}
	4470	if (error != ENOTSUP) {
	4471	goto out;
	4472	}
	4473	}
	4474	switch (name[0]) {
	4475	case VFS_CTL_UMOUNT:
	4476	#if CONFIG_MACF
	4477	error = mac_mount_check_umount(ctx, mp);
	4478	if (error != 0) {
	4479	goto out;
	4480	}
	4481	#endif
	4482	req->newidx = 0;
	4483	if (is_64_bit) {
	4484	req->newptr = vc.vc64.vc_ptr;
	4485	req->newlen = (size_t)vc.vc64.vc_len;
	4486	} else {
	4487	req->newptr = CAST_USER_ADDR_T(vc.vc32.vc_ptr);
	4488	req->newlen = vc.vc32.vc_len;
	4489	}
	4490	error = SYSCTL_IN(req, &flags, sizeof(flags));
	4491	if (error) {
	4492	break;
	4493	}
	4494
	4495	mount_ref(mp, 0);
	4496	mount_iterdrop(mp);
	4497	gotref = 0;
	4498	/* safedounmount consumes a ref */
	4499	error = safedounmount(mp, flags, ctx);
	4500	break;
	4501	case VFS_CTL_OSTATFS:
	4502	case VFS_CTL_STATFS64:
	4503	#if CONFIG_MACF
	4504	error = mac_mount_check_stat(ctx, mp);
	4505	if (error != 0) {
	4506	break;
	4507	}
	4508	#endif
	4509	req->newidx = 0;
	4510	if (is_64_bit) {
	4511	req->newptr = vc.vc64.vc_ptr;
	4512	req->newlen = (size_t)vc.vc64.vc_len;
	4513	} else {
	4514	req->newptr = CAST_USER_ADDR_T(vc.vc32.vc_ptr);
	4515	req->newlen = vc.vc32.vc_len;
	4516	}
	4517	error = SYSCTL_IN(req, &flags, sizeof(flags));
	4518	if (error) {
	4519	break;
	4520	}
	4521	sp = &mp->mnt_vfsstat;
	4522	if (((flags & MNT_NOWAIT) == 0 \|\| (flags & (MNT_WAIT \| MNT_DWAIT))) &&
	4523	(error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT))) {
	4524	goto out;
	4525	}
	4526
	4527	sfsbuf = kheap_alloc(KHEAP_TEMP, sizeof(*sfsbuf), Z_WAITOK);
	4528
	4529	if (name[0] == VFS_CTL_STATFS64) {
	4530	struct statfs64 *sfs = &sfsbuf->sfs64;
	4531
	4532	vfs_get_statfs64(mp, sfs);
	4533	error = SYSCTL_OUT(req, sfs, sizeof(*sfs));
	4534	} else if (is_64_bit) {
	4535	struct user64_statfs *sfs = &sfsbuf->osfs64;
	4536
	4537	bzero(sfs, sizeof(*sfs));
	4538	sfs->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
	4539	sfs->f_type = (short)mp->mnt_vtable->vfc_typenum;
	4540	sfs->f_bsize = (user64_long_t)sp->f_bsize;
	4541	sfs->f_iosize = (user64_long_t)sp->f_iosize;
	4542	sfs->f_blocks = (user64_long_t)sp->f_blocks;
	4543	sfs->f_bfree = (user64_long_t)sp->f_bfree;
	4544	sfs->f_bavail = (user64_long_t)sp->f_bavail;
	4545	sfs->f_files = (user64_long_t)sp->f_files;
	4546	sfs->f_ffree = (user64_long_t)sp->f_ffree;
	4547	sfs->f_fsid = sp->f_fsid;
	4548	sfs->f_owner = sp->f_owner;
	4549	#ifdef CONFIG_NFS_CLIENT
	4550	if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
	4551	strlcpy(&sfs->f_fstypename[0], &mp->fstypename_override[0], MFSNAMELEN);
	4552	} else
	4553	#endif /* CONFIG_NFS_CLIENT */
	4554	{
	4555	strlcpy(sfs->f_fstypename, sp->f_fstypename, MFSNAMELEN);
	4556	}
	4557	strlcpy(sfs->f_mntonname, sp->f_mntonname, MNAMELEN);
	4558	strlcpy(sfs->f_mntfromname, sp->f_mntfromname, MNAMELEN);
	4559
	4560	error = SYSCTL_OUT(req, sfs, sizeof(*sfs));
	4561	} else {
	4562	struct user32_statfs *sfs = &sfsbuf->osfs32;
	4563	long temp;
	4564
	4565	bzero(sfs, sizeof(*sfs));
	4566	sfs->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
	4567	sfs->f_type = (short)mp->mnt_vtable->vfc_typenum;
	4568
	4569	/*
	4570	* It's possible for there to be more than 2^^31 blocks in the filesystem, so we
	4571	* have to fudge the numbers here in that case. We inflate the blocksize in order
	4572	* to reflect the filesystem size as best we can.
	4573	*/
	4574	if (sp->f_blocks > INT_MAX) {
	4575	int shift;
	4576
	4577	/*
	4578	* Work out how far we have to shift the block count down to make it fit.
	4579	* Note that it's possible to have to shift so far that the resulting
	4580	* blocksize would be unreportably large. At that point, we will clip
	4581	* any values that don't fit.
	4582	*
	4583	* For safety's sake, we also ensure that f_iosize is never reported as
	4584	* being smaller than f_bsize.
	4585	*/
	4586	for (shift = 0; shift < 32; shift++) {
	4587	if ((sp->f_blocks >> shift) <= INT_MAX) {
	4588	break;
	4589	}
	4590	if ((((long long)sp->f_bsize) << (shift + 1)) > INT_MAX) {
	4591	break;
	4592	}
	4593	}
	4594	#define __SHIFT_OR_CLIP(x, s) ((((x) >> (s)) > INT_MAX) ? INT_MAX : ((x) >> (s)))
	4595	sfs->f_blocks = (user32_long_t)__SHIFT_OR_CLIP(sp->f_blocks, shift);
	4596	sfs->f_bfree = (user32_long_t)__SHIFT_OR_CLIP(sp->f_bfree, shift);
	4597	sfs->f_bavail = (user32_long_t)__SHIFT_OR_CLIP(sp->f_bavail, shift);
	4598	#undef __SHIFT_OR_CLIP
	4599	sfs->f_bsize = (user32_long_t)(sp->f_bsize << shift);
	4600	temp = lmax(sp->f_iosize, sp->f_bsize);
	4601	if (temp > INT32_MAX) {
	4602	error = EINVAL;
	4603	kheap_free(KHEAP_TEMP, sfsbuf, sizeof(*sfsbuf));
	4604	goto out;
	4605	}
	4606	sfs->f_iosize = (user32_long_t)temp;
	4607	} else {
	4608	sfs->f_bsize = (user32_long_t)sp->f_bsize;
	4609	sfs->f_iosize = (user32_long_t)sp->f_iosize;
	4610	sfs->f_blocks = (user32_long_t)sp->f_blocks;
	4611	sfs->f_bfree = (user32_long_t)sp->f_bfree;
	4612	sfs->f_bavail = (user32_long_t)sp->f_bavail;
	4613	}
	4614	sfs->f_files = (user32_long_t)sp->f_files;
	4615	sfs->f_ffree = (user32_long_t)sp->f_ffree;
	4616	sfs->f_fsid = sp->f_fsid;
	4617	sfs->f_owner = sp->f_owner;
	4618
	4619	#ifdef CONFIG_NFS_CLIENT
	4620	if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
	4621	strlcpy(&sfs->f_fstypename[0], &mp->fstypename_override[0], MFSNAMELEN);
	4622	} else
	4623	#endif /* CONFIG_NFS_CLIENT */
	4624	{
	4625	strlcpy(sfs->f_fstypename, sp->f_fstypename, MFSNAMELEN);
	4626	}
	4627	strlcpy(sfs->f_mntonname, sp->f_mntonname, MNAMELEN);
	4628	strlcpy(sfs->f_mntfromname, sp->f_mntfromname, MNAMELEN);
	4629
	4630	error = SYSCTL_OUT(req, sfs, sizeof(*sfs));
	4631	}
	4632	kheap_free(KHEAP_TEMP, sfsbuf, sizeof(*sfsbuf));
	4633	break;
	4634	default:
	4635	error = ENOTSUP;
	4636	goto out;
	4637	}
	4638	out:
	4639	if (gotref != 0) {
	4640	mount_iterdrop(mp);
	4641	}
	4642	return error;
	4643	}
	4644
	4645	static int filt_fsattach(struct knote kn, struct kevent_qos_s kev);
	4646	static void filt_fsdetach(struct knote *kn);
	4647	static int filt_fsevent(struct knote *kn, long hint);
	4648	static int filt_fstouch(struct knote kn, struct kevent_qos_s kev);
	4649	static int filt_fsprocess(struct knote kn, struct kevent_qos_s kev);
	4650	SECURITY_READ_ONLY_EARLY(struct filterops) fs_filtops = {
	4651	.f_attach = filt_fsattach,
	4652	.f_detach = filt_fsdetach,
	4653	.f_event = filt_fsevent,
	4654	.f_touch = filt_fstouch,
	4655	.f_process = filt_fsprocess,
	4656	};
	4657
	4658	static int
	4659	filt_fsattach(struct knote kn, __unused struct kevent_qos_s kev)
	4660	{
	4661	kn->kn_flags \|= EV_CLEAR; /* automatic */
	4662	kn->kn_sdata = 0; /* incoming data is ignored */
	4663
	4664	lck_mtx_lock(&fs_klist_lock);
	4665	KNOTE_ATTACH(&fs_klist, kn);
	4666	lck_mtx_unlock(&fs_klist_lock);
	4667
	4668	/*
	4669	* filter only sees future events,
	4670	* so it can't be fired already.
	4671	*/
	4672	return 0;
	4673	}
	4674
	4675	static void
	4676	filt_fsdetach(struct knote *kn)
	4677	{
	4678	lck_mtx_lock(&fs_klist_lock);
	4679	KNOTE_DETACH(&fs_klist, kn);
	4680	lck_mtx_unlock(&fs_klist_lock);
	4681	}
	4682
	4683	static int
	4684	filt_fsevent(struct knote *kn, long hint)
	4685	{
	4686	/*
	4687	* Backwards compatibility:
	4688	* Other filters would do nothing if kn->kn_sfflags == 0
	4689	*/
	4690
	4691	if ((kn->kn_sfflags == 0) \|\| (kn->kn_sfflags & hint)) {
	4692	kn->kn_fflags \|= hint;
	4693	}
	4694
	4695	return kn->kn_fflags != 0;
	4696	}
	4697
	4698	static int
	4699	filt_fstouch(struct knote kn, struct kevent_qos_s kev)
	4700	{
	4701	int res;
	4702
	4703	lck_mtx_lock(&fs_klist_lock);
	4704
	4705	kn->kn_sfflags = kev->fflags;
	4706
	4707	/*
	4708	* the above filter function sets bits even if nobody is looking for them.
	4709	* Just preserve those bits even in the new mask is more selective
	4710	* than before.
	4711	*
	4712	* For compatibility with previous implementations, we leave kn_fflags
	4713	* as they were before.
	4714	*/
	4715	//if (kn->kn_sfflags)
	4716	// kn->kn_fflags &= kn->kn_sfflags;
	4717	res = (kn->kn_fflags != 0);
	4718
	4719	lck_mtx_unlock(&fs_klist_lock);
	4720
	4721	return res;
	4722	}
	4723
	4724	static int
	4725	filt_fsprocess(struct knote kn, struct kevent_qos_s kev)
	4726	{
	4727	int res = 0;
	4728
	4729	lck_mtx_lock(&fs_klist_lock);
	4730	if (kn->kn_fflags) {
	4731	knote_fill_kevent(kn, kev, 0);
	4732	res = 1;
	4733	}
	4734	lck_mtx_unlock(&fs_klist_lock);
	4735	return res;
	4736	}
	4737
	4738	static int
	4739	sysctl_vfs_noremotehang(__unused struct sysctl_oid *oidp,
	4740	__unused void arg1, __unused int arg2, struct sysctl_req req)
	4741	{
	4742	int out, error;
	4743	pid_t pid;
	4744	proc_t p;
	4745
	4746	/* We need a pid. */
	4747	if (req->newptr == USER_ADDR_NULL) {
	4748	return EINVAL;
	4749	}
	4750
	4751	error = SYSCTL_IN(req, &pid, sizeof(pid));
	4752	if (error) {
	4753	return error;
	4754	}
	4755
	4756	p = proc_find(pid < 0 ? -pid : pid);
	4757	if (p == NULL) {
	4758	return ESRCH;
	4759	}
	4760
	4761	/*
	4762	* Fetching the value is ok, but we only fetch if the old
	4763	* pointer is given.
	4764	*/
	4765	if (req->oldptr != USER_ADDR_NULL) {
	4766	out = !((p->p_flag & P_NOREMOTEHANG) == 0);
	4767	proc_rele(p);
	4768	error = SYSCTL_OUT(req, &out, sizeof(out));
	4769	return error;
	4770	}
	4771
	4772	/* cansignal offers us enough security. */
	4773	if (p != req->p && proc_suser(req->p) != 0) {
	4774	proc_rele(p);
	4775	return EPERM;
	4776	}
	4777
	4778	if (pid < 0) {
	4779	OSBitAndAtomic(~((uint32_t)P_NOREMOTEHANG), &p->p_flag);
	4780	} else {
	4781	OSBitOrAtomic(P_NOREMOTEHANG, &p->p_flag);
	4782	}
	4783	proc_rele(p);
	4784
	4785	return 0;
	4786	}
	4787
	4788	static int
	4789	sysctl_vfs_generic_conf SYSCTL_HANDLER_ARGS
	4790	{
	4791	int *name, namelen;
	4792	struct vfstable *vfsp;
	4793	struct vfsconf vfsc = {};
	4794
	4795	(void)oidp;
	4796	name = arg1;
	4797	namelen = arg2;
	4798
	4799	if (namelen < 1) {
	4800	return EISDIR;
	4801	} else if (namelen > 1) {
	4802	return ENOTDIR;
	4803	}
	4804
	4805	mount_list_lock();
	4806	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
	4807	if (vfsp->vfc_typenum == name[0]) {
	4808	break;
	4809	}
	4810	}
	4811
	4812	if (vfsp == NULL) {
	4813	mount_list_unlock();
	4814	return ENOTSUP;
	4815	}
	4816
	4817	vfsc.vfc_reserved1 = 0;
	4818	bcopy(vfsp->vfc_name, vfsc.vfc_name, sizeof(vfsc.vfc_name));
	4819	vfsc.vfc_typenum = vfsp->vfc_typenum;
	4820	vfsc.vfc_refcount = vfsp->vfc_refcount;
	4821	vfsc.vfc_flags = vfsp->vfc_flags;
	4822	vfsc.vfc_reserved2 = 0;
	4823	vfsc.vfc_reserved3 = 0;
	4824
	4825	mount_list_unlock();
	4826	return SYSCTL_OUT(req, &vfsc, sizeof(struct vfsconf));
	4827	}
	4828
	4829	/* the vfs.generic. branch. */
	4830	SYSCTL_EXTENSIBLE_NODE(_vfs, VFS_GENERIC, generic,
	4831	CTLFLAG_RW \| CTLFLAG_LOCKED, NULL, "vfs generic hinge");
	4832	/* retreive a list of mounted filesystem fsid_t */
	4833	SYSCTL_PROC(_vfs_generic, OID_AUTO, vfsidlist,
	4834	CTLTYPE_STRUCT \| CTLFLAG_RD \| CTLFLAG_LOCKED,
	4835	NULL, 0, sysctl_vfs_vfslist, "S,fsid", "List of mounted filesystem ids");
	4836	/* perform operations on filesystem via fsid_t */
	4837	SYSCTL_NODE(_vfs_generic, OID_AUTO, ctlbyfsid, CTLFLAG_RW \| CTLFLAG_LOCKED,
	4838	sysctl_vfs_ctlbyfsid, "ctlbyfsid");
	4839	SYSCTL_PROC(_vfs_generic, OID_AUTO, noremotehang, CTLFLAG_RW \| CTLFLAG_ANYBODY,
	4840	NULL, 0, sysctl_vfs_noremotehang, "I", "noremotehang");
	4841	SYSCTL_INT(_vfs_generic, VFS_MAXTYPENUM, maxtypenum,
	4842	CTLFLAG_RD \| CTLFLAG_KERN \| CTLFLAG_LOCKED,
	4843	&maxvfstypenum, 0, "");
	4844	SYSCTL_INT(_vfs_generic, OID_AUTO, sync_timeout, CTLFLAG_RW \| CTLFLAG_LOCKED, &sync_timeout_seconds, 0, "");
	4845	SYSCTL_NODE(_vfs_generic, VFS_CONF, conf,
	4846	CTLFLAG_RD \| CTLFLAG_LOCKED,
	4847	sysctl_vfs_generic_conf, "");
	4848	#if DEVELOPMENT \|\| DEBUG
	4849	SYSCTL_INT(_vfs_generic, OID_AUTO, print_busy_vnodes,
	4850	CTLTYPE_INT \| CTLFLAG_RW,
	4851	&print_busy_vnodes, 0,
	4852	"VFS log busy vnodes blocking unmount");
	4853	#endif
	4854
	4855	/* Indicate that the root file system unmounted cleanly */
	4856	static int vfs_root_unmounted_cleanly = 0;
	4857	SYSCTL_INT(_vfs_generic, OID_AUTO, root_unmounted_cleanly, CTLFLAG_RD, &vfs_root_unmounted_cleanly, 0, "Root filesystem was unmounted cleanly");
	4858
	4859	void
	4860	vfs_set_root_unmounted_cleanly(void)
	4861	{
	4862	vfs_root_unmounted_cleanly = 1;
	4863	}
	4864
	4865	/*
	4866	* Print vnode state.
	4867	*/
	4868	void
	4869	vn_print_state(struct vnode vp, const char fmt, ...)
	4870	{
	4871	va_list ap;
	4872	char perm_str[] = "(VM_KERNEL_ADDRPERM pointer)";
	4873	char fs_name[MFSNAMELEN];
	4874
	4875	va_start(ap, fmt);
	4876	vprintf(fmt, ap);
	4877	va_end(ap);
	4878	printf("vp 0x%0llx %s: ", (uint64_t)VM_KERNEL_ADDRPERM(vp), perm_str);
	4879	printf("tag %d, type %d\n", vp->v_tag, vp->v_type);
	4880	/* Counts .. */
	4881	printf(" iocount %d, usecount %d, kusecount %d references %d\n",
	4882	vp->v_iocount, vp->v_usecount, vp->v_kusecount, vp->v_references);
	4883	printf(" writecount %d, numoutput %d\n", vp->v_writecount,
	4884	vp->v_numoutput);
	4885	/* Flags */
	4886	printf(" flag 0x%x, lflag 0x%x, listflag 0x%x\n", vp->v_flag,
	4887	vp->v_lflag, vp->v_listflag);
	4888
	4889	if (vp->v_mount == NULL \|\| vp->v_mount == dead_mountp) {
	4890	strlcpy(fs_name, "deadfs", MFSNAMELEN);
	4891	} else {
	4892	vfs_name(vp->v_mount, fs_name);
	4893	}
	4894
	4895	printf(" v_data 0x%0llx %s\n",
	4896	(vp->v_data ? (uint64_t)VM_KERNEL_ADDRPERM(vp->v_data) : 0),
	4897	perm_str);
	4898	printf(" v_mount 0x%0llx %s vfs_name %s\n",
	4899	(vp->v_mount ? (uint64_t)VM_KERNEL_ADDRPERM(vp->v_mount) : 0),
	4900	perm_str, fs_name);
	4901	}
	4902
	/* Running count of vnodes that new_vnode() recycled instead of allocating. */
	long num_reusedvnodes = 0;
	4904
	4905
	4906	static vnode_t
	4907	process_vp(vnode_t vp, int want_vp, bool can_defer, int *deferred)
	4908	{
	4909	unsigned int vpid;
	4910
	4911	*deferred = 0;
	4912
	4913	vpid = vp->v_id;
	4914
	4915	vnode_list_remove_locked(vp);
	4916
	4917	vnode_list_unlock();
	4918
	4919	vnode_lock_spin(vp);
	4920
	4921	/*
	4922	* We could wait for the vnode_lock after removing the vp from the freelist
	4923	* and the vid is bumped only at the very end of reclaim. So it is possible
	4924	* that we are looking at a vnode that is being terminated. If so skip it.
	4925	*/
	4926	if ((vpid != vp->v_id) \|\| (vp->v_usecount != 0) \|\| (vp->v_iocount != 0) \|\|
	4927	VONLIST(vp) \|\| (vp->v_lflag & VL_TERMINATE)) {
	4928	/*
	4929	* we lost the race between dropping the list lock
	4930	* and picking up the vnode_lock... someone else
	4931	* used this vnode and it is now in a new state
	4932	*/
	4933	vnode_unlock(vp);
	4934
	4935	return NULLVP;
	4936	}
	4937	if ((vp->v_lflag & (VL_NEEDINACTIVE \| VL_MARKTERM)) == VL_NEEDINACTIVE) {
	4938	/*
	4939	* we did a vnode_rele_ext that asked for
	4940	* us not to reenter the filesystem during
	4941	* the release even though VL_NEEDINACTIVE was
	4942	* set... we'll do it here by doing a
	4943	* vnode_get/vnode_put
	4944	*
	4945	* pick up an iocount so that we can call
	4946	* vnode_put and drive the VNOP_INACTIVE...
	4947	* vnode_put will either leave us off
	4948	* the freelist if a new ref comes in,
	4949	* or put us back on the end of the freelist
	4950	* or recycle us if we were marked for termination...
	4951	* so we'll just go grab a new candidate
	4952	*/
	4953	vp->v_iocount++;
	4954	#ifdef JOE_DEBUG
	4955	record_vp(vp, 1);
	4956	#endif
	4957	vnode_put_locked(vp);
	4958	vnode_unlock(vp);
	4959
	4960	return NULLVP;
	4961	}
	4962	/*
	4963	* Checks for anyone racing us for recycle
	4964	*/
	4965	if (vp->v_type != VBAD) {
	4966	if ((want_vp \|\| can_defer) && (vnode_on_reliable_media(vp) == FALSE \|\| (vp->v_flag & VISDIRTY))) {
	4967	vnode_async_list_add(vp);
	4968	vnode_unlock(vp);
	4969
	4970	*deferred = 1;
	4971
	4972	return NULLVP;
	4973	}
	4974	if (vp->v_lflag & VL_DEAD) {
	4975	panic("new_vnode(%p): the vnode is VL_DEAD but not VBAD", vp);
	4976	}
	4977
	4978	vnode_lock_convert(vp);
	4979	(void)vnode_reclaim_internal(vp, 1, want_vp, 0);
	4980
	4981	if (want_vp) {
	4982	if ((VONLIST(vp))) {
	4983	panic("new_vnode(%p): vp on list", vp);
	4984	}
	4985	if (vp->v_usecount \|\| vp->v_iocount \|\| vp->v_kusecount \|\|
	4986	(vp->v_lflag & (VNAMED_UBC \| VNAMED_MOUNT \| VNAMED_FSHASH))) {
	4987	panic("new_vnode(%p): free vnode still referenced", vp);
	4988	}
	4989	if ((vp->v_mntvnodes.tqe_prev != 0) && (vp->v_mntvnodes.tqe_next != 0)) {
	4990	panic("new_vnode(%p): vnode seems to be on mount list", vp);
	4991	}
	4992	if (!LIST_EMPTY(&vp->v_nclinks) \|\| !TAILQ_EMPTY(&vp->v_ncchildren)) {
	4993	panic("new_vnode(%p): vnode still hooked into the name cache", vp);
	4994	}
	4995	} else {
	4996	vnode_unlock(vp);
	4997	vp = NULLVP;
	4998	}
	4999	}
	5000	return vp;
	5001	}
	5002
	5003	__attribute__((noreturn))
	5004	static void
	5005	async_work_continue(void)
	5006	{
	5007	struct async_work_lst *q;
	5008	int deferred;
	5009	vnode_t vp;
	5010
	5011	q = &vnode_async_work_list;
	5012
	5013	for (;;) {
	5014	vnode_list_lock();
	5015
	5016	if (TAILQ_EMPTY(q)) {
	5017	assert_wait(q, (THREAD_UNINT));
	5018
	5019	vnode_list_unlock();
	5020
	5021	thread_block((thread_continue_t)async_work_continue);
	5022
	5023	continue;
	5024	}
	5025	async_work_handled++;
	5026
	5027	vp = TAILQ_FIRST(q);
	5028
	5029	vp = process_vp(vp, 0, false, &deferred);
	5030
	5031	if (vp != NULLVP) {
	5032	panic("found VBAD vp (%p) on async queue", vp);
	5033	}
	5034	}
	5035	}
	5036
	5037	__attribute__((noreturn))
	5038	static void
	5039	vn_laundry_continue(void)
	5040	{
	5041	struct freelst *free_q;
	5042	struct ragelst *rage_q;
	5043	int deferred;
	5044	vnode_t vp;
	5045	bool rage_q_empty;
	5046	bool free_q_empty;
	5047
	5048
	5049	free_q = &vnode_free_list;
	5050	rage_q = &vnode_rage_list;
	5051
	5052	for (;;) {
	5053	vnode_list_lock();
	5054
	5055	free_q_empty = TAILQ_EMPTY(free_q);
	5056	rage_q_empty = TAILQ_EMPTY(rage_q);
	5057
	5058	if (!rage_q_empty && !free_q_empty) {
	5059	struct timeval current_tv;
	5060
	5061	microuptime(&current_tv);
	5062	if (ragevnodes < rage_limit &&
	5063	((current_tv.tv_sec - rage_tv.tv_sec) < RAGE_TIME_LIMIT)) {
	5064	rage_q_empty = true;
	5065	}
	5066	}
	5067
	5068	if (deadvnodes >= deadvnodes_high \|\|
	5069	(rage_q_empty && free_q_empty) \|\|
	5070	numvnodes < desiredvnodes) {
	5071	assert_wait(free_q, (THREAD_UNINT));
	5072
	5073	vnode_list_unlock();
	5074
	5075	thread_block((thread_continue_t)vn_laundry_continue);
	5076
	5077	continue;
	5078	}
	5079
	5080	if (!rage_q_empty) {
	5081	vp = TAILQ_FIRST(rage_q);
	5082	} else {
	5083	vp = TAILQ_FIRST(free_q);
	5084	}
	5085
	5086	vp = process_vp(vp, 0, true, &deferred);
	5087	}
	5088	}
	5089
	5090	static inline void
	5091	wakeup_laundry_thread()
	5092	{
	5093	if ((deadvnodes < deadvnodes_low) &&
	5094	/* Minimum number of free vnodes the thread should act on */
	5095	((freevnodes + ragevnodes) > 10)) {
	5096	wakeup(&vnode_free_list);
	5097	}
	5098	}
	5099
	5100	static int
	5101	new_vnode(vnode_t *vpp)
	5102	{
	5103	vnode_t vp;
	5104	uint32_t retries = 0, max_retries = 100; /* retry incase of tablefull */
	5105	uint32_t bdevvp_vnodes = 0;
	5106	int force_alloc = 0, walk_count = 0;
	5107	boolean_t need_reliable_vp = FALSE;
	5108	int deferred;
	5109	struct timeval initial_tv;
	5110	struct timeval current_tv;
	5111	proc_t curproc = current_proc();
	5112
	5113	initial_tv.tv_sec = 0;
	5114	retry:
	5115	vp = NULLVP;
	5116
	5117	vnode_list_lock();
	5118	newvnode++;
	5119
	5120	if (need_reliable_vp == TRUE) {
	5121	async_work_timed_out++;
	5122	}
	5123
	5124	if ((numvnodes - deadvnodes) < desiredvnodes \|\| force_alloc) {
	5125	struct timespec ts;
	5126
	5127	if (!TAILQ_EMPTY(&vnode_dead_list)) {
	5128	/*
	5129	* Can always reuse a dead one
	5130	*/
	5131	vp = TAILQ_FIRST(&vnode_dead_list);
	5132	if (numvnodes >= desiredvnodes) {
	5133	wakeup_laundry_thread();
	5134	}
	5135	goto steal_this_vp;
	5136	}
	5137	/*
	5138	* no dead vnodes available... if we're under
	5139	* the limit, we'll create a new vnode
	5140	*/
	5141	numvnodes++;
	5142	if (numvnodes >= desiredvnodes) {
	5143	wakeup_laundry_thread();
	5144	}
	5145	vnode_list_unlock();
	5146
	5147	vp = zalloc_flags(vnode_zone, Z_WAITOK \| Z_ZERO);
	5148	VLISTNONE(vp); /* avoid double queue removal */
	5149	lck_mtx_init(&vp->v_lock, &vnode_lck_grp, &vnode_lck_attr);
	5150
	5151	TAILQ_INIT(&vp->v_ncchildren);
	5152
	5153	klist_init(&vp->v_knotes);
	5154	nanouptime(&ts);
	5155	vp->v_id = (uint32_t)ts.tv_nsec;
	5156	vp->v_flag = VSTANDARD;
	5157
	5158	#if CONFIG_MACF
	5159	if (mac_vnode_label_init_needed(vp)) {
	5160	mac_vnode_label_init(vp);
	5161	}
	5162	#endif /* MAC */
	5163
	5164	vp->v_iocount = 1;
	5165	goto done;
	5166	}
	5167
	5168	wakeup_laundry_thread();
	5169
	5170	microuptime(&current_tv);
	5171
	5172	#define MAX_WALK_COUNT 1000
	5173
	5174	if (!TAILQ_EMPTY(&vnode_rage_list) &&
	5175	(ragevnodes >= rage_limit \|\|
	5176	(current_tv.tv_sec - rage_tv.tv_sec) >= RAGE_TIME_LIMIT)) {
	5177	TAILQ_FOREACH(vp, &vnode_rage_list, v_freelist) {
	5178	if (!(vp->v_listflag & VLIST_RAGE)) {
	5179	panic("new_vnode: vp (%p) on RAGE list not marked VLIST_RAGE", vp);
	5180	}
	5181
	5182	// if we're a dependency-capable process, skip vnodes that can
	5183	// cause recycling deadlocks. (i.e. this process is diskimages
	5184	// helper and the vnode is in a disk image). Querying the
	5185	// mnt_kern_flag for the mount's virtual device status
	5186	// is safer than checking the mnt_dependent_process, which
	5187	// may not be updated if there are multiple devnode layers
	5188	// in between the disk image and the final consumer.
	5189
	5190	if ((curproc->p_flag & P_DEPENDENCY_CAPABLE) == 0 \|\| vp->v_mount == NULL \|\|
	5191	(vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV) == 0) {
	5192	/*
	5193	* if need_reliable_vp == TRUE, then we've already sent one or more
	5194	* non-reliable vnodes to the async thread for processing and timed
	5195	* out waiting for a dead vnode to show up. Use the MAX_WALK_COUNT
	5196	* mechanism to first scan for a reliable vnode before forcing
	5197	* a new vnode to be created
	5198	*/
	5199	if (need_reliable_vp == FALSE \|\| vnode_on_reliable_media(vp) == TRUE) {
	5200	break;
	5201	}
	5202	}
	5203
	5204	// don't iterate more than MAX_WALK_COUNT vnodes to
	5205	// avoid keeping the vnode list lock held for too long.
	5206
	5207	if (walk_count++ > MAX_WALK_COUNT) {
	5208	vp = NULL;
	5209	break;
	5210	}
	5211	}
	5212	}
	5213
	5214	if (vp == NULL && !TAILQ_EMPTY(&vnode_free_list)) {
	5215	/*
	5216	* Pick the first vp for possible reuse
	5217	*/
	5218	walk_count = 0;
	5219	TAILQ_FOREACH(vp, &vnode_free_list, v_freelist) {
	5220	// if we're a dependency-capable process, skip vnodes that can
	5221	// cause recycling deadlocks. (i.e. this process is diskimages
	5222	// helper and the vnode is in a disk image). Querying the
	5223	// mnt_kern_flag for the mount's virtual device status
	5224	// is safer than checking the mnt_dependent_process, which
	5225	// may not be updated if there are multiple devnode layers
	5226	// in between the disk image and the final consumer.
	5227
	5228	if ((curproc->p_flag & P_DEPENDENCY_CAPABLE) == 0 \|\| vp->v_mount == NULL \|\|
	5229	(vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV) == 0) {
	5230	/*
	5231	* if need_reliable_vp == TRUE, then we've already sent one or more
	5232	* non-reliable vnodes to the async thread for processing and timed
	5233	* out waiting for a dead vnode to show up. Use the MAX_WALK_COUNT
	5234	* mechanism to first scan for a reliable vnode before forcing
	5235	* a new vnode to be created
	5236	*/
	5237	if (need_reliable_vp == FALSE \|\| vnode_on_reliable_media(vp) == TRUE) {
	5238	break;
	5239	}
	5240	}
	5241
	5242	// don't iterate more than MAX_WALK_COUNT vnodes to
	5243	// avoid keeping the vnode list lock held for too long.
	5244
	5245	if (walk_count++ > MAX_WALK_COUNT) {
	5246	vp = NULL;
	5247	break;
	5248	}
	5249	}
	5250	}
	5251
	5252	//
	5253	// if we don't have a vnode and the walk_count is >= MAX_WALK_COUNT
	5254	// then we're trying to create a vnode on behalf of a
	5255	// process like diskimages-helper that has file systems
	5256	// mounted on top of itself (and thus we can't reclaim
	5257	// vnodes in the file systems on top of us). if we can't
	5258	// find a vnode to reclaim then we'll just have to force
	5259	// the allocation.
	5260	//
	5261	if (vp == NULL && walk_count >= MAX_WALK_COUNT) {
	5262	force_alloc = 1;
	5263	vnode_list_unlock();
	5264	goto retry;
	5265	}
	5266
	5267	if (vp == NULL) {
	5268	/*
	5269	* we've reached the system imposed maximum number of vnodes
	5270	* but there isn't a single one available
	5271	* wait a bit and then retry... if we can't get a vnode
	5272	* after our target number of retries, than log a complaint
	5273	*/
	5274	if (++retries <= max_retries) {
	5275	vnode_list_unlock();
	5276	delay_for_interval(1, 1000 * 1000);
	5277	goto retry;
	5278	}
	5279
	5280	vnode_list_unlock();
	5281	tablefull("vnode");
	5282	log(LOG_EMERG, "%d desired, %ld numvnodes, "
	5283	"%ld free, %ld dead, %ld async, %d rage %d bdevvp\n",
	5284	desiredvnodes, numvnodes, freevnodes, deadvnodes, async_work_vnodes, ragevnodes, bdevvp_vnodes);
	5285	#if CONFIG_JETSAM
	5286
	5287	#if DEVELOPMENT \|\| DEBUG
	5288	if (bootarg_no_vnode_jetsam) {
	5289	panic("vnode table is full\n");
	5290	}
	5291	#endif /* DEVELOPMENT \|\| DEBUG */
	5292
	5293	/*
	5294	* Running out of vnodes tends to make a system unusable. Start killing
	5295	* processes that jetsam knows are killable.
	5296	*/
	5297	if (memorystatus_kill_on_vnode_limit() == FALSE) {
	5298	/*
	5299	* If jetsam can't find any more processes to kill and there
	5300	* still aren't any free vnodes, panic. Hopefully we'll get a
	5301	* panic log to tell us why we ran out.
	5302	*/
	5303	panic("vnode table is full\n");
	5304	}
	5305
	5306	/*
	5307	* Now that we've killed someone, wait a bit and continue looking
	5308	* (with fewer retries before trying another kill).
	5309	*/
	5310	delay_for_interval(3, 1000 * 1000);
	5311	retries = 0;
	5312	max_retries = 10;
	5313	goto retry;
	5314	#endif
	5315
	5316	*vpp = NULL;
	5317	return ENFILE;
	5318	}
	5319	newvnode_nodead++;
	5320	steal_this_vp:
	5321	if ((vp = process_vp(vp, 1, true, &deferred)) == NULLVP) {
	5322	if (deferred) {
	5323	int elapsed_msecs;
	5324	struct timeval elapsed_tv;
	5325
	5326	if (initial_tv.tv_sec == 0) {
	5327	microuptime(&initial_tv);
	5328	}
	5329
	5330	vnode_list_lock();
	5331
	5332	dead_vnode_waited++;
	5333	dead_vnode_wanted++;
	5334
	5335	/*
	5336	* note that we're only going to explicitly wait 10ms
	5337	* for a dead vnode to become available, since even if one
	5338	* isn't available, a reliable vnode might now be available
	5339	* at the head of the VRAGE or free lists... if so, we
	5340	* can satisfy the new_vnode request with less latency then waiting
	5341	* for the full 100ms duration we're ultimately willing to tolerate
	5342	*/
	5343	assert_wait_timeout((caddr_t)&dead_vnode_wanted, (THREAD_INTERRUPTIBLE), 10000, NSEC_PER_USEC);
	5344
	5345	vnode_list_unlock();
	5346
	5347	thread_block(THREAD_CONTINUE_NULL);
	5348
	5349	microuptime(&elapsed_tv);
	5350
	5351	timevalsub(&elapsed_tv, &initial_tv);
	5352	elapsed_msecs = (int)(elapsed_tv.tv_sec * 1000 + elapsed_tv.tv_usec / 1000);
	5353
	5354	if (elapsed_msecs >= 100) {
	5355	/*
	5356	* we've waited long enough... 100ms is
	5357	* somewhat arbitrary for this case, but the
	5358	* normal worst case latency used for UI
	5359	* interaction is 100ms, so I've chosen to
	5360	* go with that.
	5361	*
	5362	* setting need_reliable_vp to TRUE
	5363	* forces us to find a reliable vnode
	5364	* that we can process synchronously, or
	5365	* to create a new one if the scan for
	5366	* a reliable one hits the scan limit
	5367	*/
	5368	need_reliable_vp = TRUE;
	5369	}
	5370	}
	5371	goto retry;
	5372	}
	5373	OSAddAtomicLong(1, &num_reusedvnodes);
	5374
	5375
	5376	#if CONFIG_MACF
	5377	/*
	5378	* We should never see VL_LABELWAIT or VL_LABEL here.
	5379	* as those operations hold a reference.
	5380	*/
	5381	assert((vp->v_lflag & VL_LABELWAIT) != VL_LABELWAIT);
	5382	assert((vp->v_lflag & VL_LABEL) != VL_LABEL);
	5383	if (vp->v_lflag & VL_LABELED \|\| vp->v_label != NULL) {
	5384	vnode_lock_convert(vp);
	5385	mac_vnode_label_recycle(vp);
	5386	} else if (mac_vnode_label_init_needed(vp)) {
	5387	vnode_lock_convert(vp);
	5388	mac_vnode_label_init(vp);
	5389	}
	5390
	5391	#endif /* MAC */
	5392
	5393	vp->v_iocount = 1;
	5394	vp->v_lflag = 0;
	5395	vp->v_writecount = 0;
	5396	vp->v_references = 0;
	5397	vp->v_iterblkflags = 0;
	5398	vp->v_flag = VSTANDARD;
	5399	/* vbad vnodes can point to dead_mountp */
	5400	vp->v_mount = NULL;
	5401	vp->v_defer_reclaimlist = (vnode_t)0;
	5402
	5403	vnode_unlock(vp);
	5404
	5405	done:
	5406	*vpp = vp;
	5407
	5408	return 0;
	5409	}
	5410
	5411	void
	5412	vnode_lock(vnode_t vp)
	5413	{
	5414	lck_mtx_lock(&vp->v_lock);
	5415	}
	5416
	5417	void
	5418	vnode_lock_spin(vnode_t vp)
	5419	{
	5420	lck_mtx_lock_spin(&vp->v_lock);
	5421	}
	5422
	5423	void
	5424	vnode_unlock(vnode_t vp)
	5425	{
	5426	lck_mtx_unlock(&vp->v_lock);
	5427	}
	5428
	5429
	5430
	5431	int
	5432	vnode_get(struct vnode *vp)
	5433	{
	5434	int retval;
	5435
	5436	vnode_lock_spin(vp);
	5437	retval = vnode_get_locked(vp);
	5438	vnode_unlock(vp);
	5439
	5440	return retval;
	5441	}
	5442
	5443	int
	5444	vnode_get_locked(struct vnode *vp)
	5445	{
	5446	#if DIAGNOSTIC
	5447	lck_mtx_assert(&vp->v_lock, LCK_MTX_ASSERT_OWNED);
	5448	#endif
	5449	if ((vp->v_iocount == 0) && (vp->v_lflag & (VL_TERMINATE \| VL_DEAD))) {
	5450	return ENOENT;
	5451	}
	5452
	5453	if (os_add_overflow(vp->v_iocount, 1, &vp->v_iocount)) {
	5454	panic("v_iocount overflow");
	5455	}
	5456
	5457	#ifdef JOE_DEBUG
	5458	record_vp(vp, 1);
	5459	#endif
	5460	return 0;
	5461	}
	5462
	5463	/*
	5464	* vnode_getwithvid() cuts in line in front of a vnode drain (that is,
	5465	* while the vnode is draining, but at no point after that) to prevent
	5466	* deadlocks when getting vnodes from filesystem hashes while holding
	5467	* resources that may prevent other iocounts from being released.
	5468	*/
	5469	int
	5470	vnode_getwithvid(vnode_t vp, uint32_t vid)
	5471	{
	5472	return vget_internal(vp, vid, (VNODE_NODEAD \| VNODE_WITHID \| VNODE_DRAINO));
	5473	}
	5474
	5475	/*
	5476	* vnode_getwithvid_drainok() is like vnode_getwithvid(), but does block behind a vnode
	5477	* drain; it exists for use in the VFS name cache, where we really do want to block behind
	5478	* vnode drain to prevent holding off an unmount.
	5479	*/
	5480	int
	5481	vnode_getwithvid_drainok(vnode_t vp, uint32_t vid)
	5482	{
	5483	return vget_internal(vp, vid, (VNODE_NODEAD \| VNODE_WITHID));
	5484	}
	5485
	5486	int
	5487	vnode_getwithref(vnode_t vp)
	5488	{
	5489	return vget_internal(vp, 0, 0);
	5490	}
	5491
	5492
	5493	__private_extern__ int
	5494	vnode_getalways(vnode_t vp)
	5495	{
	5496	return vget_internal(vp, 0, VNODE_ALWAYS);
	5497	}
	5498
	5499	__private_extern__ int
	5500	vnode_getalways_from_pager(vnode_t vp)
	5501	{
	5502	return vget_internal(vp, 0, VNODE_ALWAYS \| VNODE_PAGER);
	5503	}
	5504
	5505	static inline void
	5506	vn_set_dead(vnode_t vp)
	5507	{
	5508	vp->v_mount = NULL;
	5509	vp->v_op = dead_vnodeop_p;
	5510	vp->v_tag = VT_NON;
	5511	vp->v_data = NULL;
	5512	vp->v_type = VBAD;
	5513	vp->v_lflag \|= VL_DEAD;
	5514	}
	5515
	5516	static int
	5517	vnode_put_internal_locked(vnode_t vp, bool from_pager)
	5518	{
	5519	vfs_context_t ctx = vfs_context_current(); /* hoist outside loop */
	5520
	5521	#if DIAGNOSTIC
	5522	lck_mtx_assert(&vp->v_lock, LCK_MTX_ASSERT_OWNED);
	5523	#endif
	5524	retry:
	5525	if (vp->v_iocount < 1) {
	5526	panic("vnode_put(%p): iocount < 1", vp);
	5527	}
	5528
	5529	if ((vp->v_usecount > 0) \|\| (vp->v_iocount > 1)) {
	5530	vnode_dropiocount(vp);
	5531	return 0;
	5532	}
	5533
	5534	if (((vp->v_lflag & (VL_DEAD \| VL_NEEDINACTIVE)) == VL_NEEDINACTIVE)) {
	5535	vp->v_lflag &= ~VL_NEEDINACTIVE;
	5536	vnode_unlock(vp);
	5537
	5538	VNOP_INACTIVE(vp, ctx);
	5539
	5540	vnode_lock_spin(vp);
	5541	/*
	5542	* because we had to drop the vnode lock before calling
	5543	* VNOP_INACTIVE, the state of this vnode may have changed...
	5544	* we may pick up both VL_MARTERM and either
	5545	* an iocount or a usecount while in the VNOP_INACTIVE call
	5546	* we don't want to call vnode_reclaim_internal on a vnode
	5547	* that has active references on it... so loop back around
	5548	* and reevaluate the state
	5549	*/
	5550	goto retry;
	5551	}
	5552	vp->v_lflag &= ~VL_NEEDINACTIVE;
	5553
	5554	if ((vp->v_lflag & (VL_MARKTERM \| VL_TERMINATE \| VL_DEAD)) == VL_MARKTERM) {
	5555	if (from_pager) {
	5556	/*
	5557	* We can't initiate reclaim when called from the pager
	5558	* because it will deadlock with itself so we hand it
	5559	* off to the async cleaner thread.
	5560	*/
	5561	if (VONLIST(vp)) {
	5562	if (!(vp->v_listflag & VLIST_ASYNC_WORK)) {
	5563	vnode_list_lock();
	5564	vnode_list_remove_locked(vp);
	5565	vnode_async_list_add_locked(vp);
	5566	vnode_list_unlock();
	5567	}
	5568	wakeup(&vnode_async_work_list);
	5569	} else {
	5570	vnode_async_list_add(vp);
	5571	}
	5572	} else {
	5573	vnode_lock_convert(vp);
	5574	vnode_reclaim_internal(vp, 1, 1, 0);
	5575	}
	5576	}
	5577	vnode_dropiocount(vp);
	5578	vnode_list_add(vp);
	5579
	5580	return 0;
	5581	}
	5582
	5583	int
	5584	vnode_put_locked(vnode_t vp)
	5585	{
	5586	return vnode_put_internal_locked(vp, false);
	5587	}
	5588
	5589	int
	5590	vnode_put(vnode_t vp)
	5591	{
	5592	int retval;
	5593
	5594	vnode_lock_spin(vp);
	5595	retval = vnode_put_internal_locked(vp, false);
	5596	vnode_unlock(vp);
	5597
	5598	return retval;
	5599	}
	5600
	5601	int
	5602	vnode_put_from_pager(vnode_t vp)
	5603	{
	5604	int retval;
	5605
	5606	vnode_lock_spin(vp);
	5607	/* Cannot initiate reclaim while paging */
	5608	retval = vnode_put_internal_locked(vp, true);
	5609	vnode_unlock(vp);
	5610
	5611	return retval;
	5612	}
	5613
	5614	/* is vnode_t in use by others? */
	5615	int
	5616	vnode_isinuse(vnode_t vp, int refcnt)
	5617	{
	5618	return vnode_isinuse_locked(vp, refcnt, 0);
	5619	}
	5620
	5621	int
	5622	vnode_usecount(vnode_t vp)
	5623	{
	5624	return vp->v_usecount;
	5625	}
	5626
	5627	int
	5628	vnode_iocount(vnode_t vp)
	5629	{
	5630	return vp->v_iocount;
	5631	}
	5632
	5633	int
	5634	vnode_isinuse_locked(vnode_t vp, int refcnt, int locked)
	5635	{
	5636	int retval = 0;
	5637
	5638	if (!locked) {
	5639	vnode_lock_spin(vp);
	5640	}
	5641	if ((vp->v_type != VREG) && ((vp->v_usecount - vp->v_kusecount) > refcnt)) {
	5642	retval = 1;
	5643	goto out;
	5644	}
	5645	if (vp->v_type == VREG) {
	5646	retval = ubc_isinuse_locked(vp, refcnt, 1);
	5647	}
	5648
	5649	out:
	5650	if (!locked) {
	5651	vnode_unlock(vp);
	5652	}
	5653	return retval;
	5654	}
	5655
	5656
	5657	/* resume vnode_t */
	5658	errno_t
	5659	vnode_resume(vnode_t vp)
	5660	{
	5661	if ((vp->v_lflag & VL_SUSPENDED) && vp->v_owner == current_thread()) {
	5662	vnode_lock_spin(vp);
	5663	vp->v_lflag &= ~VL_SUSPENDED;
	5664	vp->v_owner = NULL;
	5665	vnode_unlock(vp);
	5666
	5667	wakeup(&vp->v_iocount);
	5668	}
	5669	return 0;
	5670	}
	5671
	5672	/* suspend vnode_t
	5673	* Please do not use on more than one vnode at a time as it may
	5674	* cause deadlocks.
	5675	* xxx should we explicity prevent this from happening?
	5676	*/
	5677
	5678	errno_t
	5679	vnode_suspend(vnode_t vp)
	5680	{
	5681	if (vp->v_lflag & VL_SUSPENDED) {
	5682	return EBUSY;
	5683	}
	5684
	5685	vnode_lock_spin(vp);
	5686
	5687	/*
	5688	* xxx is this sufficient to check if a vnode_drain is
	5689	* progress?
	5690	*/
	5691
	5692	if (vp->v_owner == NULL) {
	5693	vp->v_lflag \|= VL_SUSPENDED;
	5694	vp->v_owner = current_thread();
	5695	}
	5696	vnode_unlock(vp);
	5697
	5698	return 0;
	5699	}
	5700
	5701	/*
	5702	* Release any blocked locking requests on the vnode.
	5703	* Used for forced-unmounts.
	5704	*
	5705	* XXX What about network filesystems?
	5706	*/
	5707	static void
	5708	vnode_abort_advlocks(vnode_t vp)
	5709	{
	5710	if (vp->v_flag & VLOCKLOCAL) {
	5711	lf_abort_advlocks(vp);
	5712	}
	5713	}
	5714
	5715
	5716	static errno_t
	5717	vnode_drain(vnode_t vp)
	5718	{
	5719	if (vp->v_lflag & VL_DRAIN) {
	5720	panic("vnode_drain: recursive drain");
	5721	return ENOENT;
	5722	}
	5723	vp->v_lflag \|= VL_DRAIN;
	5724	vp->v_owner = current_thread();
	5725
	5726	while (vp->v_iocount > 1) {
	5727	if (bootarg_no_vnode_drain) {
	5728	struct timespec ts = {.tv_sec = 10, .tv_nsec = 0};
	5729	int error;
	5730
	5731	if (vfs_unmountall_started) {
	5732	ts.tv_sec = 1;
	5733	}
	5734
	5735	error = msleep(&vp->v_iocount, &vp->v_lock, PVFS, "vnode_drain_with_timeout", &ts);
	5736
	5737	/* Try to deal with leaked iocounts under bootarg and shutting down */
	5738	if (vp->v_iocount > 1 && error == EWOULDBLOCK &&
	5739	ts.tv_sec == 1 && vp->v_numoutput == 0) {
	5740	vp->v_iocount = 1;
	5741	break;
	5742	}
	5743	} else {
	5744	msleep(&vp->v_iocount, &vp->v_lock, PVFS, "vnode_drain", NULL);
	5745	}
	5746	}
	5747
	5748	vp->v_lflag &= ~VL_DRAIN;
	5749
	5750	return 0;
	5751	}
	5752
	5753
	5754	/*
	5755	* if the number of recent references via vnode_getwithvid or vnode_getwithref
	5756	* exceeds this threshold, than 'UN-AGE' the vnode by removing it from
	5757	* the LRU list if it's currently on it... once the iocount and usecount both drop
	5758	* to 0, it will get put back on the end of the list, effectively making it younger
	5759	* this allows us to keep actively referenced vnodes in the list without having
	5760	* to constantly remove and add to the list each time a vnode w/o a usecount is
	5761	* referenced which costs us taking and dropping a global lock twice.
	5762	* However, if the vnode is marked DIRTY, we want to pull it out much earlier
	5763	*/
	5764	#define UNAGE_THRESHHOLD 25
	5765	#define UNAGE_DIRTYTHRESHHOLD 6
	5766
	5767	errno_t
	5768	vnode_getiocount(vnode_t vp, unsigned int vid, int vflags)
	5769	{
	5770	int nodead = vflags & VNODE_NODEAD;
	5771	int nosusp = vflags & VNODE_NOSUSPEND;
	5772	int always = vflags & VNODE_ALWAYS;
	5773	int beatdrain = vflags & VNODE_DRAINO;
	5774	int withvid = vflags & VNODE_WITHID;
	5775	int forpager = vflags & VNODE_PAGER;
	5776
	5777	for (;;) {
	5778	int sleepflg = 0;
	5779
	5780	/*
	5781	* if it is a dead vnode with deadfs
	5782	*/
	5783	if (nodead && (vp->v_lflag & VL_DEAD) && ((vp->v_type == VBAD) \|\| (vp->v_data == 0))) {
	5784	return ENOENT;
	5785	}
	5786	/*
	5787	* will return VL_DEAD ones
	5788	*/
	5789	if ((vp->v_lflag & (VL_SUSPENDED \| VL_DRAIN \| VL_TERMINATE)) == 0) {
	5790	break;
	5791	}
	5792	/*
	5793	* if suspended vnodes are to be failed
	5794	*/
	5795	if (nosusp && (vp->v_lflag & VL_SUSPENDED)) {
	5796	return ENOENT;
	5797	}
	5798	/*
	5799	* if you are the owner of drain/suspend/termination , can acquire iocount
	5800	* check for VL_TERMINATE; it does not set owner
	5801	*/
	5802	if ((vp->v_lflag & (VL_DRAIN \| VL_SUSPENDED \| VL_TERMINATE)) &&
	5803	(vp->v_owner == current_thread())) {
	5804	break;
	5805	}
	5806
	5807	if (always != 0) {
	5808	break;
	5809	}
	5810
	5811	/*
	5812	* If this vnode is getting drained, there are some cases where
	5813	* we can't block or, in case of tty vnodes, want to be
	5814	* interruptible.
	5815	*/
	5816	if (vp->v_lflag & VL_DRAIN) {
	5817	/*
	5818	* In some situations, we want to get an iocount
	5819	* even if the vnode is draining to prevent deadlock,
	5820	* e.g. if we're in the filesystem, potentially holding
	5821	* resources that could prevent other iocounts from
	5822	* being released.
	5823	*/
	5824	if (beatdrain) {
	5825	break;
	5826	}
	5827	/*
	5828	* Don't block if the vnode's mount point is unmounting as
	5829	* we may be the thread the unmount is itself waiting on
	5830	* Only callers who pass in vids (at this point, we've already
	5831	* handled nosusp and nodead) are expecting error returns
	5832	* from this function, so only we can only return errors for
	5833	* those. ENODEV is intended to inform callers that the call
	5834	* failed because an unmount is in progress.
	5835	*/
	5836	if (withvid && (vp->v_mount) && vfs_isunmount(vp->v_mount)) {
	5837	return ENODEV;
	5838	}
	5839
	5840	if (vnode_istty(vp)) {
	5841	sleepflg = PCATCH;
	5842	}
	5843	}
	5844
	5845	vnode_lock_convert(vp);
	5846
	5847	if (vp->v_lflag & VL_TERMINATE) {
	5848	int error;
	5849
	5850	vp->v_lflag \|= VL_TERMWANT;
	5851
	5852	error = msleep(&vp->v_lflag, &vp->v_lock,
	5853	(PVFS \| sleepflg), "vnode getiocount", NULL);
	5854	if (error) {
	5855	return error;
	5856	}
	5857	} else {
	5858	msleep(&vp->v_iocount, &vp->v_lock, PVFS, "vnode_getiocount", NULL);
	5859	}
	5860	}
	5861	if (withvid && vid != vp->v_id) {
	5862	return ENOENT;
	5863	}
	5864	if (!forpager && (++vp->v_references >= UNAGE_THRESHHOLD \|\|
	5865	(vp->v_flag & VISDIRTY && vp->v_references >= UNAGE_DIRTYTHRESHHOLD))) {
	5866	vp->v_references = 0;
	5867	vnode_list_remove(vp);
	5868	}
	5869	vp->v_iocount++;
	5870	#ifdef JOE_DEBUG
	5871	record_vp(vp, 1);
	5872	#endif
	5873	return 0;
	5874	}
	5875
	5876	static void
	5877	vnode_dropiocount(vnode_t vp)
	5878	{
	5879	if (vp->v_iocount < 1) {
	5880	panic("vnode_dropiocount(%p): v_iocount < 1", vp);
	5881	}
	5882
	5883	vp->v_iocount--;
	5884	#ifdef JOE_DEBUG
	5885	record_vp(vp, -1);
	5886	#endif
	5887	if ((vp->v_lflag & (VL_DRAIN \| VL_SUSPENDED)) && (vp->v_iocount <= 1)) {
	5888	wakeup(&vp->v_iocount);
	5889	}
	5890	}
	5891
	5892
	5893	void
	5894	vnode_reclaim(struct vnode * vp)
	5895	{
	5896	vnode_reclaim_internal(vp, 0, 0, 0);
	5897	}
	5898
	5899	__private_extern__
	5900	void
	5901	vnode_reclaim_internal(struct vnode * vp, int locked, int reuse, int flags)
	5902	{
	5903	int isfifo = 0;
	5904	bool clear_tty_revoke = false;
	5905
	5906	if (!locked) {
	5907	vnode_lock(vp);
	5908	}
	5909
	5910	if (vp->v_lflag & VL_TERMINATE) {
	5911	panic("vnode reclaim in progress");
	5912	}
	5913	vp->v_lflag \|= VL_TERMINATE;
	5914
	5915	vn_clearunionwait(vp, 1);
	5916
	5917	/*
	5918	* We have to force any terminals in reads to return and give up
	5919	* their iocounts. It's important to do this after VL_TERMINATE
	5920	* has been set to ensure new reads are blocked while the
	5921	* revoke is in progress.
	5922	*/
	5923	if (vnode_istty(vp) && (flags & REVOKEALL) && (vp->v_iocount > 1)) {
	5924	vnode_unlock(vp);
	5925	VNOP_IOCTL(vp, TIOCREVOKE, (caddr_t)NULL, 0, vfs_context_kernel());
	5926	clear_tty_revoke = true;
	5927	vnode_lock(vp);
	5928	}
	5929
	5930	vnode_drain(vp);
	5931
	5932	if (clear_tty_revoke) {
	5933	vnode_unlock(vp);
	5934	VNOP_IOCTL(vp, TIOCREVOKECLEAR, (caddr_t)NULL, 0, vfs_context_kernel());
	5935	vnode_lock(vp);
	5936	}
	5937
	5938	isfifo = (vp->v_type == VFIFO);
	5939
	5940	if (vp->v_type != VBAD) {
	5941	vgone(vp, flags); /* clean and reclaim the vnode */
	5942	}
	5943	/*
	5944	* give the vnode a new identity so that vnode_getwithvid will fail
	5945	* on any stale cache accesses...
	5946	* grab the list_lock so that if we're in "new_vnode"
	5947	* behind the list_lock trying to steal this vnode, the v_id is stable...
	5948	* once new_vnode drops the list_lock, it will block trying to take
	5949	* the vnode lock until we release it... at that point it will evaluate
	5950	* whether the v_vid has changed
	5951	* also need to make sure that the vnode isn't on a list where "new_vnode"
	5952	* can find it after the v_id has been bumped until we are completely done
	5953	* with the vnode (i.e. putting it back on a list has to be the very last
	5954	* thing we do to this vnode... many of the callers of vnode_reclaim_internal
	5955	* are holding an io_count on the vnode... they need to drop the io_count
	5956	* BEFORE doing a vnode_list_add or make sure to hold the vnode lock until
	5957	* they are completely done with the vnode
	5958	*/
	5959	vnode_list_lock();
	5960
	5961	vnode_list_remove_locked(vp);
	5962	vp->v_id++;
	5963
	5964	vnode_list_unlock();
	5965
	5966	if (isfifo) {
	5967	struct fifoinfo * fip;
	5968
	5969	fip = vp->v_fifoinfo;
	5970	vp->v_fifoinfo = NULL;
	5971	kheap_free(KHEAP_DEFAULT, fip, sizeof(struct fifoinfo));
	5972	}
	5973	vp->v_type = VBAD;
	5974
	5975	if (vp->v_data) {
	5976	panic("vnode_reclaim_internal: cleaned vnode isn't");
	5977	}
	5978	if (vp->v_numoutput) {
	5979	panic("vnode_reclaim_internal: clean vnode has pending I/O's");
	5980	}
	5981	if (UBCINFOEXISTS(vp)) {
	5982	panic("vnode_reclaim_internal: ubcinfo not cleaned");
	5983	}
	5984	if (vp->v_parent) {
	5985	panic("vnode_reclaim_internal: vparent not removed");
	5986	}
	5987	if (vp->v_name) {
	5988	panic("vnode_reclaim_internal: vname not removed");
	5989	}
	5990
	5991	vp->v_socket = NULL;
	5992
	5993	vp->v_lflag &= ~VL_TERMINATE;
	5994	vp->v_owner = NULL;
	5995
	5996	KNOTE(&vp->v_knotes, NOTE_REVOKE);
	5997
	5998	/* Make sure that when we reuse the vnode, no knotes left over */
	5999	klist_init(&vp->v_knotes);
	6000
	6001	if (vp->v_lflag & VL_TERMWANT) {
	6002	vp->v_lflag &= ~VL_TERMWANT;
	6003	wakeup(&vp->v_lflag);
	6004	}
	6005	if (!reuse) {
	6006	/*
	6007	* make sure we get on the
	6008	* dead list if appropriate
	6009	*/
	6010	vnode_list_add(vp);
	6011	}
	6012	if (!locked) {
	6013	vnode_unlock(vp);
	6014	}
	6015	}
	6016
	6017	static int
	6018	vnode_create_internal(uint32_t flavor, uint32_t size, void data, vnode_t vpp,
	6019	int init_vnode)
	6020	{
	6021	int error;
	6022	int insert = 1;
	6023	int existing_vnode;
	6024	vnode_t vp;
	6025	vnode_t nvp;
	6026	vnode_t dvp;
	6027	struct uthread *ut;
	6028	struct componentname *cnp;
	6029	struct vnode_fsparam param = (struct vnode_fsparam )data;
	6030	#if CONFIG_TRIGGERS
	6031	struct vnode_trigger_param *tinfo = NULL;
	6032	#endif
	6033	if (*vpp) {
	6034	vp = *vpp;
	6035	*vpp = NULLVP;
	6036	existing_vnode = 1;
	6037	} else {
	6038	existing_vnode = 0;
	6039	}
	6040
	6041	if (init_vnode) {
	6042	/* Do quick sanity check on the parameters. */
	6043	if ((param == NULL) \|\| (param->vnfs_vtype == VBAD)) {
	6044	error = EINVAL;
	6045	goto error_out;
	6046	}
	6047
	6048	#if CONFIG_TRIGGERS
	6049	if ((flavor == VNCREATE_TRIGGER) && (size == VNCREATE_TRIGGER_SIZE)) {
	6050	tinfo = (struct vnode_trigger_param *)data;
	6051
	6052	/* Validate trigger vnode input */
	6053	if ((param->vnfs_vtype != VDIR) \|\|
	6054	(tinfo->vnt_resolve_func == NULL) \|\|
	6055	(tinfo->vnt_flags & ~VNT_VALID_MASK)) {
	6056	error = EINVAL;
	6057	goto error_out;
	6058	}
	6059	/* Fall through a normal create (params will be the same) */
	6060	flavor = VNCREATE_FLAVOR;
	6061	size = VCREATESIZE;
	6062	}
	6063	#endif
	6064	if ((flavor != VNCREATE_FLAVOR) \|\| (size != VCREATESIZE)) {
	6065	error = EINVAL;
	6066	goto error_out;
	6067	}
	6068	}
	6069
	6070	if (!existing_vnode) {
	6071	if ((error = new_vnode(&vp))) {
	6072	return error;
	6073	}
	6074	if (!init_vnode) {
	6075	/* Make it so that it can be released by a vnode_put) */
	6076	vn_set_dead(vp);
	6077	*vpp = vp;
	6078	return 0;
	6079	}
	6080	} else {
	6081	/*
	6082	* A vnode obtained by vnode_create_empty has been passed to
	6083	* vnode_initialize - Unset VL_DEAD set by vn_set_dead. After
	6084	* this point, it is set back on any error.
	6085	*
	6086	* N.B. vnode locking - We make the same assumptions as the
	6087	* "unsplit" vnode_create did - i.e. it is safe to update the
	6088	* vnode's fields without the vnode lock. This vnode has been
	6089	* out and about with the filesystem and hopefully nothing
	6090	* was done to the vnode between the vnode_create_empty and
	6091	* now when it has come in through vnode_initialize.
	6092	*/
	6093	vp->v_lflag &= ~VL_DEAD;
	6094	}
	6095
	6096	dvp = param->vnfs_dvp;
	6097	cnp = param->vnfs_cnp;
	6098
	6099	vp->v_op = param->vnfs_vops;
	6100	vp->v_type = (uint16_t)param->vnfs_vtype;
	6101	vp->v_data = param->vnfs_fsnode;
	6102
	6103	if (param->vnfs_markroot) {
	6104	vp->v_flag \|= VROOT;
	6105	}
	6106	if (param->vnfs_marksystem) {
	6107	vp->v_flag \|= VSYSTEM;
	6108	}
	6109	if (vp->v_type == VREG) {
	6110	error = ubc_info_init_withsize(vp, param->vnfs_filesize);
	6111	if (error) {
	6112	#ifdef JOE_DEBUG
	6113	record_vp(vp, 1);
	6114	#endif
	6115	vn_set_dead(vp);
	6116
	6117	vnode_put(vp);
	6118	return error;
	6119	}
	6120	if (param->vnfs_mp->mnt_ioflags & MNT_IOFLAGS_IOSCHED_SUPPORTED) {
	6121	memory_object_mark_io_tracking(vp->v_ubcinfo->ui_control);
	6122	}
	6123	}
	6124	#ifdef JOE_DEBUG
	6125	record_vp(vp, 1);
	6126	#endif
	6127
	6128	#if CONFIG_FIRMLINKS
	6129	vp->v_fmlink = NULLVP;
	6130	#endif
	6131	vp->v_flag &= ~VFMLINKTARGET;
	6132
	6133	#if CONFIG_TRIGGERS
	6134	/*
	6135	* For trigger vnodes, attach trigger info to vnode
	6136	*/
	6137	if ((vp->v_type == VDIR) && (tinfo != NULL)) {
	6138	/*
	6139	* Note: has a side effect of incrementing trigger count on the
	6140	* mount if successful, which we would need to undo on a
	6141	* subsequent failure.
	6142	*/
	6143	#ifdef JOE_DEBUG
	6144	record_vp(vp, -1);
	6145	#endif
	6146	error = vnode_resolver_create(param->vnfs_mp, vp, tinfo, FALSE);
	6147	if (error) {
	6148	printf("vnode_create: vnode_resolver_create() err %d\n", error);
	6149	vn_set_dead(vp);
	6150	#ifdef JOE_DEBUG
	6151	record_vp(vp, 1);
	6152	#endif
	6153	vnode_put(vp);
	6154	return error;
	6155	}
	6156	}
	6157	#endif
	6158	if (vp->v_type == VCHR \|\| vp->v_type == VBLK) {
	6159	vp->v_tag = VT_DEVFS; /* callers will reset if needed (bdevvp) */
	6160
	6161	if ((nvp = checkalias(vp, param->vnfs_rdev))) {
	6162	/*
	6163	* if checkalias returns a vnode, it will be locked
	6164	*
	6165	* first get rid of the unneeded vnode we acquired
	6166	*/
	6167	vp->v_data = NULL;
	6168	vp->v_op = spec_vnodeop_p;
	6169	vp->v_type = VBAD;
	6170	vp->v_lflag = VL_DEAD;
	6171	vp->v_data = NULL;
	6172	vp->v_tag = VT_NON;
	6173	vnode_put(vp);
	6174
	6175	/*
	6176	* switch to aliased vnode and finish
	6177	* preparing it
	6178	*/
	6179	vp = nvp;
	6180
	6181	vclean(vp, 0);
	6182	vp->v_op = param->vnfs_vops;
	6183	vp->v_type = (uint16_t)param->vnfs_vtype;
	6184	vp->v_data = param->vnfs_fsnode;
	6185	vp->v_lflag = 0;
	6186	vp->v_mount = NULL;
	6187	insmntque(vp, param->vnfs_mp);
	6188	insert = 0;
	6189	vnode_unlock(vp);
	6190	}
	6191
	6192	if (VCHR == vp->v_type) {
	6193	u_int maj = major(vp->v_rdev);
	6194
	6195	if (maj < (u_int)nchrdev && cdevsw[maj].d_type == D_TTY) {
	6196	vp->v_flag \|= VISTTY;
	6197	}
	6198	}
	6199	}
	6200
	6201	if (vp->v_type == VFIFO) {
	6202	struct fifoinfo *fip;
	6203
	6204	fip = kheap_alloc(KHEAP_DEFAULT, sizeof(struct fifoinfo),
	6205	Z_WAITOK \| Z_ZERO);
	6206	vp->v_fifoinfo = fip;
	6207	}
	6208	/* The file systems must pass the address of the location where
	6209	* they store the vnode pointer. When we add the vnode into the mount
	6210	* list and name cache they become discoverable. So the file system node
	6211	* must have the connection to vnode setup by then
	6212	*/
	6213	*vpp = vp;
	6214
	6215	/* Add fs named reference. */
	6216	if (param->vnfs_flags & VNFS_ADDFSREF) {
	6217	vp->v_lflag \|= VNAMED_FSHASH;
	6218	}
	6219	if (param->vnfs_mp) {
	6220	if (param->vnfs_mp->mnt_kern_flag & MNTK_LOCK_LOCAL) {
	6221	vp->v_flag \|= VLOCKLOCAL;
	6222	}
	6223	if (insert) {
	6224	if ((vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb)) {
	6225	panic("insmntque: vp on the free list\n");
	6226	}
	6227
	6228	/*
	6229	* enter in mount vnode list
	6230	*/
	6231	insmntque(vp, param->vnfs_mp);
	6232	}
	6233	}
	6234	if (dvp && vnode_ref(dvp) == 0) {
	6235	vp->v_parent = dvp;
	6236	}
	6237	if (cnp) {
	6238	if (dvp && ((param->vnfs_flags & (VNFS_NOCACHE \| VNFS_CANTCACHE)) == 0)) {
	6239	/*
	6240	* enter into name cache
	6241	* we've got the info to enter it into the name cache now
	6242	* cache_enter_create will pick up an extra reference on
	6243	* the name entered into the string cache
	6244	*/
	6245	vp->v_name = cache_enter_create(dvp, vp, cnp);
	6246	} else {
	6247	vp->v_name = vfs_addname(cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_hash, 0);
	6248	}
	6249
	6250	if ((cnp->cn_flags & UNIONCREATED) == UNIONCREATED) {
	6251	vp->v_flag \|= VISUNION;
	6252	}
	6253	}
	6254	if ((param->vnfs_flags & VNFS_CANTCACHE) == 0) {
	6255	/*
	6256	* this vnode is being created as cacheable in the name cache
	6257	* this allows us to re-enter it in the cache
	6258	*/
	6259	vp->v_flag \|= VNCACHEABLE;
	6260	}
	6261	ut = get_bsdthread_info(current_thread());
	6262
	6263	if ((current_proc()->p_lflag & P_LRAGE_VNODES) \|\|
	6264	(ut->uu_flag & (UT_RAGE_VNODES \| UT_KERN_RAGE_VNODES))) {
	6265	/*
	6266	* process has indicated that it wants any
	6267	* vnodes created on its behalf to be rapidly
	6268	* aged to reduce the impact on the cached set
	6269	* of vnodes
	6270	*
	6271	* if UT_KERN_RAGE_VNODES is set, then the
	6272	* kernel internally wants vnodes to be rapidly
	6273	* aged, even if the process hasn't requested
	6274	* this
	6275	*/
	6276	vp->v_flag \|= VRAGE;
	6277	}
	6278
	6279	#if CONFIG_SECLUDED_MEMORY
	6280	switch (secluded_for_filecache) {
	6281	case 0:
	6282	/*
	6283	* secluded_for_filecache == 0:
	6284	* + no file contents in secluded pool
	6285	*/
	6286	break;
	6287	case 1:
	6288	/*
	6289	* secluded_for_filecache == 1:
	6290	* + no files from /
	6291	* + files from /Applications/ are OK
	6292	* + files from /Applications/Camera are not OK
	6293	* + no files that are open for write
	6294	*/
	6295	if (vnode_vtype(vp) == VREG &&
	6296	vnode_mount(vp) != NULL &&
	6297	(!(vfs_flags(vnode_mount(vp)) & MNT_ROOTFS))) {
	6298	/* not from root filesystem: eligible for secluded pages */
	6299	memory_object_mark_eligible_for_secluded(
	6300	ubc_getobject(vp, UBC_FLAGS_NONE),
	6301	TRUE);
	6302	}
	6303	break;
	6304	case 2:
	6305	/*
	6306	* secluded_for_filecache == 2:
	6307	* + all read-only files OK, except:
	6308	* + dyld_shared_cache_arm64*
	6309	* + Camera
	6310	* + mediaserverd
	6311	*/
	6312	if (vnode_vtype(vp) == VREG) {
	6313	memory_object_mark_eligible_for_secluded(
	6314	ubc_getobject(vp, UBC_FLAGS_NONE),
	6315	TRUE);
	6316	}
	6317	break;
	6318	default:
	6319	break;
	6320	}
	6321	#endif /* CONFIG_SECLUDED_MEMORY */
	6322
	6323	return 0;
	6324
	6325	error_out:
	6326	if (existing_vnode) {
	6327	vnode_put(vp);
	6328	}
	6329	return error;
	6330	}
	6331
	6332	/* USAGE:
	6333	* The following api creates a vnode and associates all the parameter specified in vnode_fsparam
	6334	* structure and returns a vnode handle with a reference. device aliasing is handled here so checkalias
	6335	* is obsoleted by this.
	6336	*/
	6337	int
	6338	vnode_create(uint32_t flavor, uint32_t size, void data, vnode_t vpp)
	6339	{
	6340	*vpp = NULLVP;
	6341	return vnode_create_internal(flavor, size, data, vpp, 1);
	6342	}
	6343
	6344	int
	6345	vnode_create_empty(vnode_t *vpp)
	6346	{
	6347	*vpp = NULLVP;
	6348	return vnode_create_internal(VNCREATE_FLAVOR, VCREATESIZE, NULL,
	6349	vpp, 0);
	6350	}
	6351
	6352	int
	6353	vnode_initialize(uint32_t flavor, uint32_t size, void data, vnode_t vpp)
	6354	{
	6355	if (*vpp == NULLVP) {
	6356	panic("NULL vnode passed to vnode_initialize");
	6357	}
	6358	#if DEVELOPMENT \|\| DEBUG
	6359	/*
	6360	* We lock to check that vnode is fit for unlocked use in
	6361	* vnode_create_internal.
	6362	*/
	6363	vnode_lock_spin(*vpp);
	6364	VNASSERT(((vpp)->v_iocount == 1), vpp,
	6365	("vnode_initialize : iocount not 1, is %d", (*vpp)->v_iocount));
	6366	VNASSERT(((vpp)->v_usecount == 0), vpp,
	6367	("vnode_initialize : usecount not 0, is %d", (*vpp)->v_usecount));
	6368	VNASSERT(((vpp)->v_lflag & VL_DEAD), vpp,
	6369	("vnode_initialize : v_lflag does not have VL_DEAD, is 0x%x",
	6370	(*vpp)->v_lflag));
	6371	VNASSERT(((vpp)->v_data == NULL), vpp,
	6372	("vnode_initialize : v_data not NULL"));
	6373	vnode_unlock(*vpp);
	6374	#endif
	6375	return vnode_create_internal(flavor, size, data, vpp, 1);
	6376	}
	6377
	6378	int
	6379	vnode_addfsref(vnode_t vp)
	6380	{
	6381	vnode_lock_spin(vp);
	6382	if (vp->v_lflag & VNAMED_FSHASH) {
	6383	panic("add_fsref: vp already has named reference");
	6384	}
	6385	if ((vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb)) {
	6386	panic("addfsref: vp on the free list\n");
	6387	}
	6388	vp->v_lflag \|= VNAMED_FSHASH;
	6389	vnode_unlock(vp);
	6390	return 0;
	6391	}
	6392	int
	6393	vnode_removefsref(vnode_t vp)
	6394	{
	6395	vnode_lock_spin(vp);
	6396	if ((vp->v_lflag & VNAMED_FSHASH) == 0) {
	6397	panic("remove_fsref: no named reference");
	6398	}
	6399	vp->v_lflag &= ~VNAMED_FSHASH;
	6400	vnode_unlock(vp);
	6401	return 0;
	6402	}
	6403
	6404
	6405	int
	6406	vfs_iterate(int flags, int (callout)(mount_t, void ), void *arg)
	6407	{
	6408	mount_t mp;
	6409	int ret = 0;
	6410	fsid_t * fsid_list;
	6411	int count, actualcount, i;
	6412	void * allocmem;
	6413	int indx_start, indx_stop, indx_incr;
	6414	int cb_dropref = (flags & VFS_ITERATE_CB_DROPREF);
	6415	int noskip_unmount = (flags & VFS_ITERATE_NOSKIP_UNMOUNT);
	6416
	6417	count = mount_getvfscnt();
	6418	count += 10;
	6419
	6420	fsid_list = kheap_alloc(KHEAP_TEMP, count * sizeof(fsid_t), Z_WAITOK);
	6421	allocmem = (void *)fsid_list;
	6422
	6423	actualcount = mount_fillfsids(fsid_list, count);
	6424
	6425	/*
	6426	* Establish the iteration direction
	6427	* VFS_ITERATE_TAIL_FIRST overrides default head first order (oldest first)
	6428	*/
	6429	if (flags & VFS_ITERATE_TAIL_FIRST) {
	6430	indx_start = actualcount - 1;
	6431	indx_stop = -1;
	6432	indx_incr = -1;
	6433	} else { /* Head first by default */
	6434	indx_start = 0;
	6435	indx_stop = actualcount;
	6436	indx_incr = 1;
	6437	}
	6438
	6439	for (i = indx_start; i != indx_stop; i += indx_incr) {
	6440	/* obtain the mount point with iteration reference */
	6441	mp = mount_list_lookupby_fsid(&fsid_list[i], 0, 1);
	6442
	6443	if (mp == (struct mount *)0) {
	6444	continue;
	6445	}
	6446	mount_lock(mp);
	6447	if ((mp->mnt_lflag & MNT_LDEAD) \|\|
	6448	(!noskip_unmount && (mp->mnt_lflag & MNT_LUNMOUNT))) {
	6449	mount_unlock(mp);
	6450	mount_iterdrop(mp);
	6451	continue;
	6452	}
	6453	mount_unlock(mp);
	6454
	6455	/* iterate over all the vnodes */
	6456	ret = callout(mp, arg);
	6457
	6458	/*
	6459	* Drop the iterref here if the callback didn't do it.
	6460	* Note: If cb_dropref is set the mp may no longer exist.
	6461	*/
	6462	if (!cb_dropref) {
	6463	mount_iterdrop(mp);
	6464	}
	6465
	6466	switch (ret) {
	6467	case VFS_RETURNED:
	6468	case VFS_RETURNED_DONE:
	6469	if (ret == VFS_RETURNED_DONE) {
	6470	ret = 0;
	6471	goto out;
	6472	}
	6473	break;
	6474
	6475	case VFS_CLAIMED_DONE:
	6476	ret = 0;
	6477	goto out;
	6478	case VFS_CLAIMED:
	6479	default:
	6480	break;
	6481	}
	6482	ret = 0;
	6483	}
	6484
	6485	out:
	6486	kheap_free(KHEAP_TEMP, allocmem, (count * sizeof(fsid_t)));
	6487	return ret;
	6488	}
	6489
	6490	/*
	6491	* Update the vfsstatfs structure in the mountpoint.
	6492	* MAC: Parameter eventtype added, indicating whether the event that
	6493	* triggered this update came from user space, via a system call
	6494	* (VFS_USER_EVENT) or an internal kernel call (VFS_KERNEL_EVENT).
	6495	*/
	6496	int
	6497	vfs_update_vfsstat(mount_t mp, vfs_context_t ctx, __unused int eventtype)
	6498	{
	6499	struct vfs_attr va;
	6500	int error;
	6501
	6502	/*
	6503	* Request the attributes we want to propagate into
	6504	* the per-mount vfsstat structure.
	6505	*/
	6506	VFSATTR_INIT(&va);
	6507	VFSATTR_WANTED(&va, f_iosize);
	6508	VFSATTR_WANTED(&va, f_blocks);
	6509	VFSATTR_WANTED(&va, f_bfree);
	6510	VFSATTR_WANTED(&va, f_bavail);
	6511	VFSATTR_WANTED(&va, f_bused);
	6512	VFSATTR_WANTED(&va, f_files);
	6513	VFSATTR_WANTED(&va, f_ffree);
	6514	VFSATTR_WANTED(&va, f_bsize);
	6515	VFSATTR_WANTED(&va, f_fssubtype);
	6516
	6517	if ((error = vfs_getattr(mp, &va, ctx)) != 0) {
	6518	KAUTH_DEBUG("STAT - filesystem returned error %d", error);
	6519	return error;
	6520	}
	6521	#if CONFIG_MACF
	6522	if (eventtype == VFS_USER_EVENT) {
	6523	error = mac_mount_check_getattr(ctx, mp, &va);
	6524	if (error != 0) {
	6525	return error;
	6526	}
	6527	}
	6528	#endif
	6529	/*
	6530	* Unpack into the per-mount structure.
	6531	*
	6532	* We only overwrite these fields, which are likely to change:
	6533	* f_blocks
	6534	* f_bfree
	6535	* f_bavail
	6536	* f_bused
	6537	* f_files
	6538	* f_ffree
	6539	*
	6540	* And these which are not, but which the FS has no other way
	6541	* of providing to us:
	6542	* f_bsize
	6543	* f_iosize
	6544	* f_fssubtype
	6545	*
	6546	*/
	6547	if (VFSATTR_IS_SUPPORTED(&va, f_bsize)) {
	6548	/* 4822056 - protect against malformed server mount */
	6549	mp->mnt_vfsstat.f_bsize = (va.f_bsize > 0 ? va.f_bsize : 512);
	6550	} else {
	6551	mp->mnt_vfsstat.f_bsize = mp->mnt_devblocksize; /* default from the device block size */
	6552	}
	6553	if (VFSATTR_IS_SUPPORTED(&va, f_iosize)) {
	6554	mp->mnt_vfsstat.f_iosize = va.f_iosize;
	6555	} else {
	6556	mp->mnt_vfsstat.f_iosize = 1024 * 1024; /* 1MB sensible I/O size */
	6557	}
	6558	if (VFSATTR_IS_SUPPORTED(&va, f_blocks)) {
	6559	mp->mnt_vfsstat.f_blocks = va.f_blocks;
	6560	}
	6561	if (VFSATTR_IS_SUPPORTED(&va, f_bfree)) {
	6562	mp->mnt_vfsstat.f_bfree = va.f_bfree;
	6563	}
	6564	if (VFSATTR_IS_SUPPORTED(&va, f_bavail)) {
	6565	mp->mnt_vfsstat.f_bavail = va.f_bavail;
	6566	}
	6567	if (VFSATTR_IS_SUPPORTED(&va, f_bused)) {
	6568	mp->mnt_vfsstat.f_bused = va.f_bused;
	6569	}
	6570	if (VFSATTR_IS_SUPPORTED(&va, f_files)) {
	6571	mp->mnt_vfsstat.f_files = va.f_files;
	6572	}
	6573	if (VFSATTR_IS_SUPPORTED(&va, f_ffree)) {
	6574	mp->mnt_vfsstat.f_ffree = va.f_ffree;
	6575	}
	6576
	6577	/* this is unlikely to change, but has to be queried for */
	6578	if (VFSATTR_IS_SUPPORTED(&va, f_fssubtype)) {
	6579	mp->mnt_vfsstat.f_fssubtype = va.f_fssubtype;
	6580	}
	6581
	6582	return 0;
	6583	}
	6584
	6585	int
	6586	mount_list_add(mount_t mp)
	6587	{
	6588	int res;
	6589
	6590	mount_list_lock();
	6591	if (get_system_inshutdown() != 0) {
	6592	res = -1;
	6593	} else {
	6594	TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list);
	6595	nummounts++;
	6596	res = 0;
	6597	}
	6598	mount_list_unlock();
	6599
	6600	return res;
	6601	}
	6602
	6603	void
	6604	mount_list_remove(mount_t mp)
	6605	{
	6606	mount_list_lock();
	6607	TAILQ_REMOVE(&mountlist, mp, mnt_list);
	6608	nummounts--;
	6609	mp->mnt_list.tqe_next = NULL;
	6610	mp->mnt_list.tqe_prev = NULL;
	6611	mount_list_unlock();
	6612	}
	6613
	6614	mount_t
	6615	mount_lookupby_volfsid(int volfs_id, int withref)
	6616	{
	6617	mount_t cur_mount = (mount_t)0;
	6618	mount_t mp;
	6619
	6620	mount_list_lock();
	6621	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
	6622	if (!(mp->mnt_kern_flag & MNTK_UNMOUNT) &&
	6623	(mp->mnt_kern_flag & MNTK_PATH_FROM_ID) &&
	6624	(mp->mnt_vfsstat.f_fsid.val[0] == volfs_id)) {
	6625	cur_mount = mp;
	6626	if (withref) {
	6627	if (mount_iterref(cur_mount, 1)) {
	6628	cur_mount = (mount_t)0;
	6629	mount_list_unlock();
	6630	goto out;
	6631	}
	6632	}
	6633	break;
	6634	}
	6635	}
	6636	mount_list_unlock();
	6637	if (withref && (cur_mount != (mount_t)0)) {
	6638	mp = cur_mount;
	6639	if (vfs_busy(mp, LK_NOWAIT) != 0) {
	6640	cur_mount = (mount_t)0;
	6641	}
	6642	mount_iterdrop(mp);
	6643	}
	6644	out:
	6645	return cur_mount;
	6646	}
	6647
	6648	mount_t
	6649	mount_list_lookupby_fsid(fsid_t *fsid, int locked, int withref)
	6650	{
	6651	mount_t retmp = (mount_t)0;
	6652	mount_t mp;
	6653
	6654	if (!locked) {
	6655	mount_list_lock();
	6656	}
	6657	TAILQ_FOREACH(mp, &mountlist, mnt_list)
	6658	if (mp->mnt_vfsstat.f_fsid.val[0] == fsid->val[0] &&
	6659	mp->mnt_vfsstat.f_fsid.val[1] == fsid->val[1]) {
	6660	retmp = mp;
	6661	if (withref) {
	6662	if (mount_iterref(retmp, 1)) {
	6663	retmp = (mount_t)0;
	6664	}
	6665	}
	6666	goto out;
	6667	}
	6668	out:
	6669	if (!locked) {
	6670	mount_list_unlock();
	6671	}
	6672	return retmp;
	6673	}
	6674
	6675	errno_t
	6676	vnode_lookupat(const char path, int flags, vnode_t vpp, vfs_context_t ctx,
	6677	vnode_t start_dvp)
	6678	{
	6679	struct nameidata *ndp;
	6680	int error = 0;
	6681	u_int32_t ndflags = 0;
	6682
	6683	if (ctx == NULL) {
	6684	return EINVAL;
	6685	}
	6686
	6687	ndp = kheap_alloc(KHEAP_TEMP, sizeof(struct nameidata), Z_WAITOK);
	6688	if (!ndp) {
	6689	return ENOMEM;
	6690	}
	6691
	6692	if (flags & VNODE_LOOKUP_NOFOLLOW) {
	6693	ndflags = NOFOLLOW;
	6694	} else {
	6695	ndflags = FOLLOW;
	6696	}
	6697
	6698	if (flags & VNODE_LOOKUP_NOCROSSMOUNT) {
	6699	ndflags \|= NOCROSSMOUNT;
	6700	}
	6701
	6702	if (flags & VNODE_LOOKUP_CROSSMOUNTNOWAIT) {
	6703	ndflags \|= CN_NBMOUNTLOOK;
	6704	}
	6705
	6706	/* XXX AUDITVNPATH1 needed ? */
	6707	NDINIT(ndp, LOOKUP, OP_LOOKUP, ndflags, UIO_SYSSPACE,
	6708	CAST_USER_ADDR_T(path), ctx);
	6709
	6710	if (start_dvp && (path[0] != '/')) {
	6711	ndp->ni_dvp = start_dvp;
	6712	ndp->ni_cnd.cn_flags \|= USEDVP;
	6713	}
	6714
	6715	if ((error = namei(ndp))) {
	6716	goto out_free;
	6717	}
	6718
	6719	ndp->ni_cnd.cn_flags &= ~USEDVP;
	6720
	6721	*vpp = ndp->ni_vp;
	6722	nameidone(ndp);
	6723
	6724	out_free:
	6725	kheap_free(KHEAP_TEMP, ndp, sizeof(struct nameidata));
	6726	return error;
	6727	}
	6728
	6729	errno_t
	6730	vnode_lookup(const char path, int flags, vnode_t vpp, vfs_context_t ctx)
	6731	{
	6732	return vnode_lookupat(path, flags, vpp, ctx, NULLVP);
	6733	}
	6734
	6735	errno_t
	6736	vnode_open(const char path, int fmode, int cmode, int flags, vnode_t vpp, vfs_context_t ctx)
	6737	{
	6738	struct nameidata *ndp = NULL;
	6739	int error;
	6740	u_int32_t ndflags = 0;
	6741	int lflags = flags;
	6742
	6743	if (ctx == NULL) { /* XXX technically an error */
	6744	ctx = vfs_context_current();
	6745	}
	6746
	6747	ndp = kheap_alloc(KHEAP_TEMP, sizeof(struct nameidata), Z_WAITOK);
	6748	if (!ndp) {
	6749	return ENOMEM;
	6750	}
	6751
	6752	if (fmode & O_NOFOLLOW) {
	6753	lflags \|= VNODE_LOOKUP_NOFOLLOW;
	6754	}
	6755
	6756	if (lflags & VNODE_LOOKUP_NOFOLLOW) {
	6757	ndflags = NOFOLLOW;
	6758	} else {
	6759	ndflags = FOLLOW;
	6760	}
	6761
	6762	if (lflags & VNODE_LOOKUP_NOCROSSMOUNT) {
	6763	ndflags \|= NOCROSSMOUNT;
	6764	}
	6765
	6766	if (lflags & VNODE_LOOKUP_CROSSMOUNTNOWAIT) {
	6767	ndflags \|= CN_NBMOUNTLOOK;
	6768	}
	6769
	6770	/* XXX AUDITVNPATH1 needed ? */
	6771	NDINIT(ndp, LOOKUP, OP_OPEN, ndflags, UIO_SYSSPACE,
	6772	CAST_USER_ADDR_T(path), ctx);
	6773
	6774	if ((error = vn_open(ndp, fmode, cmode))) {
	6775	*vpp = NULL;
	6776	} else {
	6777	*vpp = ndp->ni_vp;
	6778	}
	6779
	6780	kheap_free(KHEAP_TEMP, ndp, sizeof(struct nameidata));
	6781	return error;
	6782	}
	6783
	6784	errno_t
	6785	vnode_close(vnode_t vp, int flags, vfs_context_t ctx)
	6786	{
	6787	int error;
	6788
	6789	if (ctx == NULL) {
	6790	ctx = vfs_context_current();
	6791	}
	6792
	6793	error = vn_close(vp, flags, ctx);
	6794	vnode_put(vp);
	6795	return error;
	6796	}
	6797
	6798	errno_t
	6799	vnode_mtime(vnode_t vp, struct timespec *mtime, vfs_context_t ctx)
	6800	{
	6801	struct vnode_attr va;
	6802	int error;
	6803
	6804	VATTR_INIT(&va);
	6805	VATTR_WANTED(&va, va_modify_time);
	6806	error = vnode_getattr(vp, &va, ctx);
	6807	if (!error) {
	6808	*mtime = va.va_modify_time;
	6809	}
	6810	return error;
	6811	}
	6812
	6813	errno_t
	6814	vnode_flags(vnode_t vp, uint32_t *flags, vfs_context_t ctx)
	6815	{
	6816	struct vnode_attr va;
	6817	int error;
	6818
	6819	VATTR_INIT(&va);
	6820	VATTR_WANTED(&va, va_flags);
	6821	error = vnode_getattr(vp, &va, ctx);
	6822	if (!error) {
	6823	*flags = va.va_flags;
	6824	}
	6825	return error;
	6826	}
	6827
	6828	/*
	6829	* Returns: 0 Success
	6830	* vnode_getattr:???
	6831	*/
	6832	errno_t
	6833	vnode_size(vnode_t vp, off_t *sizep, vfs_context_t ctx)
	6834	{
	6835	struct vnode_attr va;
	6836	int error;
	6837
	6838	VATTR_INIT(&va);
	6839	VATTR_WANTED(&va, va_data_size);
	6840	error = vnode_getattr(vp, &va, ctx);
	6841	if (!error) {
	6842	*sizep = va.va_data_size;
	6843	}
	6844	return error;
	6845	}
	6846
	6847	errno_t
	6848	vnode_setsize(vnode_t vp, off_t size, int ioflag, vfs_context_t ctx)
	6849	{
	6850	struct vnode_attr va;
	6851
	6852	VATTR_INIT(&va);
	6853	VATTR_SET(&va, va_data_size, size);
	6854	va.va_vaflags = ioflag & 0xffff;
	6855	return vnode_setattr(vp, &va, ctx);
	6856	}
	6857
	6858	int
	6859	vnode_setdirty(vnode_t vp)
	6860	{
	6861	vnode_lock_spin(vp);
	6862	vp->v_flag \|= VISDIRTY;
	6863	vnode_unlock(vp);
	6864	return 0;
	6865	}
	6866
	6867	int
	6868	vnode_cleardirty(vnode_t vp)
	6869	{
	6870	vnode_lock_spin(vp);
	6871	vp->v_flag &= ~VISDIRTY;
	6872	vnode_unlock(vp);
	6873	return 0;
	6874	}
	6875
	6876	int
	6877	vnode_isdirty(vnode_t vp)
	6878	{
	6879	int dirty;
	6880
	6881	vnode_lock_spin(vp);
	6882	dirty = (vp->v_flag & VISDIRTY) ? 1 : 0;
	6883	vnode_unlock(vp);
	6884
	6885	return dirty;
	6886	}
	6887
	6888	static int
	6889	vn_create_reg(vnode_t dvp, vnode_t vpp, struct nameidata ndp, struct vnode_attr vap, uint32_t flags, int fmode, uint32_t statusp, vfs_context_t ctx)
	6890	{
	6891	/* Only use compound VNOP for compound operation */
	6892	if (vnode_compound_open_available(dvp) && ((flags & VN_CREATE_DOOPEN) != 0)) {
	6893	*vpp = NULLVP;
	6894	return VNOP_COMPOUND_OPEN(dvp, vpp, ndp, O_CREAT, fmode, statusp, vap, ctx);
	6895	} else {
	6896	return VNOP_CREATE(dvp, vpp, &ndp->ni_cnd, vap, ctx);
	6897	}
	6898	}
	6899
	6900	/*
	6901	* Create a filesystem object of arbitrary type with arbitrary attributes in
	6902	* the spevied directory with the specified name.
	6903	*
	6904	* Parameters: dvp Pointer to the vnode of the directory
	6905	* in which to create the object.
	6906	* vpp Pointer to the area into which to
	6907	* return the vnode of the created object.
	6908	* cnp Component name pointer from the namei
	6909	* data structure, containing the name to
	6910	* use for the create object.
	6911	* vap Pointer to the vnode_attr structure
	6912	* describing the object to be created,
	6913	* including the type of object.
	6914	* flags VN_* flags controlling ACL inheritance
	6915	* and whether or not authorization is to
	6916	* be required for the operation.
	6917	*
	6918	* Returns: 0 Success
	6919	* !0 errno value
	6920	*
	6921	* Implicit: *vpp Contains the vnode of the object that
	6922	* was created, if successful.
	6923	* *cnp May be modified by the underlying VFS.
	6924	* *vap May be modified by the underlying VFS.
	6925	* modified by either ACL inheritance or
	6926	*
	6927	*
	6928	* be modified, even if the operation is
	6929	*
	6930	*
	6931	* Notes: The kauth_filesec_t in 'vap', if any, is in host byte order.
	6932	*
	6933	* Modification of 'cnp' and 'vap' by the underlying VFS is
	6934	* strongly discouraged.
	6935	*
	6936	* XXX: This function is a 'vn_*' function; it belongs in vfs_vnops.c
	6937	*
	6938	* XXX: We should enummerate the possible errno values here, and where
	6939	* in the code they originated.
	6940	*/
	6941	errno_t
	6942	vn_create(vnode_t dvp, vnode_t vpp, struct nameidata ndp, struct vnode_attr vap, uint32_t flags, int fmode, uint32_t statusp, vfs_context_t ctx)
	6943	{
	6944	errno_t error, old_error;
	6945	vnode_t vp = (vnode_t)0;
	6946	boolean_t batched;
	6947	struct componentname *cnp;
	6948	uint32_t defaulted;
	6949
	6950	cnp = &ndp->ni_cnd;
	6951	error = 0;
	6952	batched = namei_compound_available(dvp, ndp) ? TRUE : FALSE;
	6953
	6954	KAUTH_DEBUG("%p CREATE - '%s'", dvp, cnp->cn_nameptr);
	6955
	6956	if (flags & VN_CREATE_NOINHERIT) {
	6957	vap->va_vaflags \|= VA_NOINHERIT;
	6958	}
	6959	if (flags & VN_CREATE_NOAUTH) {
	6960	vap->va_vaflags \|= VA_NOAUTH;
	6961	}
	6962	/*
	6963	* Handle ACL inheritance, initialize vap.
	6964	*/
	6965	error = vn_attribute_prepare(dvp, vap, &defaulted, ctx);
	6966	if (error) {
	6967	return error;
	6968	}
	6969
	6970	if (vap->va_type != VREG && (fmode != 0 \|\| (flags & VN_CREATE_DOOPEN) \|\| statusp)) {
	6971	panic("Open parameters, but not a regular file.");
	6972	}
	6973	if ((fmode != 0) && ((flags & VN_CREATE_DOOPEN) == 0)) {
	6974	panic("Mode for open, but not trying to open...");
	6975	}
	6976
	6977
	6978	/*
	6979	* Create the requested node.
	6980	*/
	6981	switch (vap->va_type) {
	6982	case VREG:
	6983	error = vn_create_reg(dvp, vpp, ndp, vap, flags, fmode, statusp, ctx);
	6984	break;
	6985	case VDIR:
	6986	error = vn_mkdir(dvp, vpp, ndp, vap, ctx);
	6987	break;
	6988	case VSOCK:
	6989	case VFIFO:
	6990	case VBLK:
	6991	case VCHR:
	6992	error = VNOP_MKNOD(dvp, vpp, cnp, vap, ctx);
	6993	break;
	6994	default:
	6995	panic("vnode_create: unknown vtype %d", vap->va_type);
	6996	}
	6997	if (error != 0) {
	6998	KAUTH_DEBUG("%p CREATE - error %d returned by filesystem", dvp, error);
	6999	goto out;
	7000	}
	7001
	7002	vp = *vpp;
	7003	old_error = error;
	7004
	7005	/*
	7006	* If some of the requested attributes weren't handled by the VNOP,
	7007	* use our fallback code.
	7008	*/
	7009	if ((error == 0) && !VATTR_ALL_SUPPORTED(vap) && *vpp) {
	7010	KAUTH_DEBUG(" CREATE - doing fallback with ACL %p", vap->va_acl);
	7011	error = vnode_setattr_fallback(*vpp, vap, ctx);
	7012	}
	7013
	7014	#if CONFIG_MACF
	7015	if ((error == 0) && !(flags & VN_CREATE_NOLABEL)) {
	7016	error = vnode_label(vnode_mount(vp), dvp, vp, cnp, VNODE_LABEL_CREATE, ctx);
	7017	}
	7018	#endif
	7019
	7020	if ((error != 0) && (vp != (vnode_t)0)) {
	7021	/* If we've done a compound open, close */
	7022	if (batched && (old_error == 0) && (vap->va_type == VREG)) {
	7023	VNOP_CLOSE(vp, fmode, ctx);
	7024	}
	7025
	7026	/* Need to provide notifications if a create succeeded */
	7027	if (!batched) {
	7028	*vpp = (vnode_t) 0;
	7029	vnode_put(vp);
	7030	vp = NULLVP;
	7031	}
	7032	}
	7033
	7034	/*
	7035	* For creation VNOPs, this is the equivalent of
	7036	* lookup_handle_found_vnode.
	7037	*/
	7038	if (kdebug_enable && *vpp) {
	7039	kdebug_lookup(*vpp, cnp);
	7040	}
	7041
	7042	out:
	7043	vn_attribute_cleanup(vap, defaulted);
	7044
	7045	return error;
	7046	}
	7047
	7048	static kauth_scope_t vnode_scope;
	7049	static int vnode_authorize_callback(kauth_cred_t credential, void *idata, kauth_action_t action,
	7050	uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3);
	7051	static int vnode_authorize_callback_int(kauth_action_t action, vfs_context_t ctx,
	7052	vnode_t vp, vnode_t dvp, int *errorp);
	7053
	7054	typedef struct _vnode_authorize_context {
	7055	vnode_t vp;
	7056	struct vnode_attr *vap;
	7057	vnode_t dvp;
	7058	struct vnode_attr *dvap;
	7059	vfs_context_t ctx;
	7060	int flags;
	7061	int flags_valid;
	7062	#define _VAC_IS_OWNER (1<<0)
	7063	#define _VAC_IN_GROUP (1<<1)
	7064	#define _VAC_IS_DIR_OWNER (1<<2)
	7065	#define _VAC_IN_DIR_GROUP (1<<3)
	7066	#define _VAC_NO_VNODE_POINTERS (1<<4)
	7067	} *vauth_ctx;
	7068
	7069	void
	7070	vnode_authorize_init(void)
	7071	{
	7072	vnode_scope = kauth_register_scope(KAUTH_SCOPE_VNODE, vnode_authorize_callback, NULL);
	7073	}
	7074
	/*
	 * Bits of the 'defaulted_fields' word examined by vn_attribute_cleanup():
	 * each set bit makes it clear the active flag of the matching attribute
	 * (va_uid, va_gid, va_mode).
	 */
	#define VATTR_PREPARE_DEFAULTED_UID             0x1
	#define VATTR_PREPARE_DEFAULTED_GID             0x2
	#define VATTR_PREPARE_DEFAULTED_MODE            0x4
	7078
	7079	int
	7080	vn_attribute_prepare(vnode_t dvp, struct vnode_attr vap, uint32_t defaulted_fieldsp, vfs_context_t ctx)
	7081	{
	7082	kauth_acl_t nacl = NULL, oacl = NULL;
	7083	int error;
	7084
	7085	/*
	7086	* Handle ACL inheritance.
	7087	*/
	7088	if (!(vap->va_vaflags & VA_NOINHERIT) && vfs_extendedsecurity(dvp->v_mount)) {
	7089	/* save the original filesec */
	7090	if (VATTR_IS_ACTIVE(vap, va_acl)) {
	7091	oacl = vap->va_acl;
	7092	}
	7093
	7094	vap->va_acl = NULL;
	7095	if ((error = kauth_acl_inherit(dvp,
	7096	oacl,
	7097	&nacl,
	7098	vap->va_type == VDIR,
	7099	ctx)) != 0) {
	7100	KAUTH_DEBUG("%p CREATE - error %d processing inheritance", dvp, error);
	7101	return error;
	7102	}
	7103
	7104	/*
	7105	* If the generated ACL is NULL, then we can save ourselves some effort
	7106	* by clearing the active bit.
	7107	*/
	7108	if (nacl == NULL) {
	7109	VATTR_CLEAR_ACTIVE(vap, va_acl);
	7110	} else {
	7111	vap->va_base_acl = oacl;
	7112	VATTR_SET(vap, va_acl, nacl);
	7113	}
	7114	}
	7115
	7116	error = vnode_authattr_new_internal(dvp, vap, (vap->va_vaflags & VA_NOAUTH), defaulted_fieldsp, ctx);
	7117	if (error) {
	7118	vn_attribute_cleanup(vap, *defaulted_fieldsp);
	7119	}
	7120
	7121	return error;
	7122	}
	7123
	7124	void
	7125	vn_attribute_cleanup(struct vnode_attr *vap, uint32_t defaulted_fields)
	7126	{
	7127	/*
	7128	* If the caller supplied a filesec in vap, it has been replaced
	7129	* now by the post-inheritance copy. We need to put the original back
	7130	* and free the inherited product.
	7131	*/
	7132	kauth_acl_t nacl, oacl;
	7133
	7134	if (VATTR_IS_ACTIVE(vap, va_acl)) {
	7135	nacl = vap->va_acl;
	7136	oacl = vap->va_base_acl;
	7137
	7138	if (oacl) {
	7139	VATTR_SET(vap, va_acl, oacl);
	7140	vap->va_base_acl = NULL;
	7141	} else {
	7142	VATTR_CLEAR_ACTIVE(vap, va_acl);
	7143	}
	7144
	7145	if (nacl != NULL) {
	7146	kauth_acl_free(nacl);
	7147	}
	7148	}
	7149
	7150	if ((defaulted_fields & VATTR_PREPARE_DEFAULTED_MODE) != 0) {
	7151	VATTR_CLEAR_ACTIVE(vap, va_mode);
	7152	}
	7153	if ((defaulted_fields & VATTR_PREPARE_DEFAULTED_GID) != 0) {
	7154	VATTR_CLEAR_ACTIVE(vap, va_gid);
	7155	}
	7156	if ((defaulted_fields & VATTR_PREPARE_DEFAULTED_UID) != 0) {
	7157	VATTR_CLEAR_ACTIVE(vap, va_uid);
	7158	}
	7159
	7160	return;
	7161	}
	7162
	7163	int
	7164	vn_authorize_unlink(vnode_t dvp, vnode_t vp, struct componentname cnp, vfs_context_t ctx, __unused void reserved)
	7165	{
	7166	#if !CONFIG_MACF
	7167	#pragma unused(cnp)
	7168	#endif
	7169	int error = 0;
	7170
	7171	/*
	7172	* Normally, unlinking of directories is not supported.
	7173	* However, some file systems may have limited support.
	7174	*/
	7175	if ((vp->v_type == VDIR) &&
	7176	!(vp->v_mount->mnt_kern_flag & MNTK_DIR_HARDLINKS)) {
	7177	return EPERM; /* POSIX */
	7178	}
	7179
	7180	/* authorize the delete operation */
	7181	#if CONFIG_MACF
	7182	if (!error) {
	7183	error = mac_vnode_check_unlink(ctx, dvp, vp, cnp);
	7184	}
	7185	#endif /* MAC */
	7186	if (!error) {
	7187	error = vnode_authorize(vp, dvp, KAUTH_VNODE_DELETE, ctx);
	7188	}
	7189
	7190	return error;
	7191	}
	7192
	7193	int
	7194	vn_authorize_open_existing(vnode_t vp, struct componentname cnp, int fmode, vfs_context_t ctx, void reserved)
	7195	{
	7196	/* Open of existing case */
	7197	kauth_action_t action;
	7198	int error = 0;
	7199	if (cnp->cn_ndp == NULL) {
	7200	panic("NULL ndp");
	7201	}
	7202	if (reserved != NULL) {
	7203	panic("reserved not NULL.");
	7204	}
	7205
	7206	#if CONFIG_MACF
	7207	/* XXX may do duplicate work here, but ignore that for now (idempotent) */
	7208	if (vfs_flags(vnode_mount(vp)) & MNT_MULTILABEL) {
	7209	error = vnode_label(vnode_mount(vp), NULL, vp, NULL, 0, ctx);
	7210	if (error) {
	7211	return error;
	7212	}
	7213	}
	7214	#endif
	7215
	7216	if ((fmode & O_DIRECTORY) && vp->v_type != VDIR) {
	7217	return ENOTDIR;
	7218	}
	7219
	7220	if (vp->v_type == VSOCK && vp->v_tag != VT_FDESC) {
	7221	return EOPNOTSUPP; /* Operation not supported on socket */
	7222	}
	7223
	7224	if (vp->v_type == VLNK && (fmode & O_NOFOLLOW) != 0) {
	7225	return ELOOP; /* O_NOFOLLOW was specified and the target is a symbolic link */
	7226	}
	7227
	7228	/* disallow write operations on directories */
	7229	if (vnode_isdir(vp) && (fmode & (FWRITE \| O_TRUNC))) {
	7230	return EISDIR;
	7231	}
	7232
	7233	if ((cnp->cn_ndp->ni_flag & NAMEI_TRAILINGSLASH)) {
	7234	if (vp->v_type != VDIR) {
	7235	return ENOTDIR;
	7236	}
	7237	}
	7238
	7239	#if CONFIG_MACF
	7240	/* If a file being opened is a shadow file containing
	7241	* namedstream data, ignore the macf checks because it
	7242	* is a kernel internal file and access should always
	7243	* be allowed.
	7244	*/
	7245	if (!(vnode_isshadow(vp) && vnode_isnamedstream(vp))) {
	7246	error = mac_vnode_check_open(ctx, vp, fmode);
	7247	if (error) {
	7248	return error;
	7249	}
	7250	}
	7251	#endif
	7252
	7253	/* compute action to be authorized */
	7254	action = 0;
	7255	if (fmode & FREAD) {
	7256	action \|= KAUTH_VNODE_READ_DATA;
	7257	}
	7258	if (fmode & (FWRITE \| O_TRUNC)) {
	7259	/*
	7260	* If we are writing, appending, and not truncating,
	7261	* indicate that we are appending so that if the
	7262	* UF_APPEND or SF_APPEND bits are set, we do not deny
	7263	* the open.
	7264	*/
	7265	if ((fmode & O_APPEND) && !(fmode & O_TRUNC)) {
	7266	action \|= KAUTH_VNODE_APPEND_DATA;
	7267	} else {
	7268	action \|= KAUTH_VNODE_WRITE_DATA;
	7269	}
	7270	}
	7271	error = vnode_authorize(vp, NULL, action, ctx);
	7272	#if NAMEDSTREAMS
	7273	if (error == EACCES) {
	7274	/*
	7275	* Shadow files may exist on-disk with a different UID/GID
	7276	* than that of the current context. Verify that this file
	7277	* is really a shadow file. If it was created successfully
	7278	* then it should be authorized.
	7279	*/
	7280	if (vnode_isshadow(vp) && vnode_isnamedstream(vp)) {
	7281	error = vnode_verifynamedstream(vp);
	7282	}
	7283	}
	7284	#endif
	7285
	7286	return error;
	7287	}
	7288
	7289	int
	7290	vn_authorize_create(vnode_t dvp, struct componentname cnp, struct vnode_attr vap, vfs_context_t ctx, void *reserved)
	7291	{
	7292	#if !CONFIG_MACF
	7293	#pragma unused(vap)
	7294	#endif
	7295	/* Creation case */
	7296	int error;
	7297
	7298	if (cnp->cn_ndp == NULL) {
	7299	panic("NULL cn_ndp");
	7300	}
	7301	if (reserved != NULL) {
	7302	panic("reserved not NULL.");
	7303	}
	7304
	7305	/* Only validate path for creation if we didn't do a complete lookup */
	7306	if (cnp->cn_ndp->ni_flag & NAMEI_UNFINISHED) {
	7307	error = lookup_validate_creation_path(cnp->cn_ndp);
	7308	if (error) {
	7309	return error;
	7310	}
	7311	}
	7312
	7313	#if CONFIG_MACF
	7314	error = mac_vnode_check_create(ctx, dvp, cnp, vap);
	7315	if (error) {
	7316	return error;
	7317	}
	7318	#endif /* CONFIG_MACF */
	7319
	7320	return vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx);
	7321	}
	7322
	7323	int
	7324	vn_authorize_rename(struct vnode fdvp, struct vnode fvp, struct componentname *fcnp,
	7325	struct vnode tdvp, struct vnode tvp, struct componentname *tcnp,
	7326	vfs_context_t ctx, void *reserved)
	7327	{
	7328	return vn_authorize_renamex(fdvp, fvp, fcnp, tdvp, tvp, tcnp, ctx, 0, reserved);
	7329	}
	7330
	7331	int
	7332	vn_authorize_renamex(struct vnode fdvp, struct vnode fvp, struct componentname *fcnp,
	7333	struct vnode tdvp, struct vnode tvp, struct componentname *tcnp,
	7334	vfs_context_t ctx, vfs_rename_flags_t flags, void *reserved)
	7335	{
	7336	return vn_authorize_renamex_with_paths(fdvp, fvp, fcnp, NULL, tdvp, tvp, tcnp, NULL, ctx, flags, reserved);
	7337	}
	7338
	7339	int
	7340	vn_authorize_renamex_with_paths(struct vnode fdvp, struct vnode fvp, struct componentname fcnp, const char from_path,
	7341	struct vnode tdvp, struct vnode tvp, struct componentname tcnp, const char to_path,
	7342	vfs_context_t ctx, vfs_rename_flags_t flags, void *reserved)
	7343	{
	7344	int error = 0;
	7345	int moving = 0;
	7346	bool swap = flags & VFS_RENAME_SWAP;
	7347
	7348	if (reserved != NULL) {
	7349	panic("Passed something other than NULL as reserved field!");
	7350	}
	7351
	7352	/*
	7353	* Avoid renaming "." and "..".
	7354	*
	7355	* XXX No need to check for this in the FS. We should always have the leaves
	7356	* in VFS in this case.
	7357	*/
	7358	if (fvp->v_type == VDIR &&
	7359	((fdvp == fvp) \|\|
	7360	(fcnp->cn_namelen == 1 && fcnp->cn_nameptr[0] == '.') \|\|
	7361	((fcnp->cn_flags \| tcnp->cn_flags) & ISDOTDOT))) {
	7362	error = EINVAL;
	7363	goto out;
	7364	}
	7365
	7366	if (tvp == NULLVP && vnode_compound_rename_available(tdvp)) {
	7367	error = lookup_validate_creation_path(tcnp->cn_ndp);
	7368	if (error) {
	7369	goto out;
	7370	}
	7371	}
	7372
	7373	/*** <MACF> ***/
	7374	#if CONFIG_MACF
	7375	error = mac_vnode_check_rename(ctx, fdvp, fvp, fcnp, tdvp, tvp, tcnp);
	7376	if (error) {
	7377	goto out;
	7378	}
	7379	if (swap) {
	7380	error = mac_vnode_check_rename(ctx, tdvp, tvp, tcnp, fdvp, fvp, fcnp);
	7381	if (error) {
	7382	goto out;
	7383	}
	7384	}
	7385	#endif
	7386	/*** </MACF> ***/
	7387
	7388	/*** <MiscChecks> ***/
	7389	if (tvp != NULL) {
	7390	if (!swap) {
	7391	if (fvp->v_type == VDIR && tvp->v_type != VDIR) {
	7392	error = ENOTDIR;
	7393	goto out;
	7394	} else if (fvp->v_type != VDIR && tvp->v_type == VDIR) {
	7395	error = EISDIR;
	7396	goto out;
	7397	}
	7398	}
	7399	} else if (swap) {
	7400	/*
	7401	* Caller should have already checked this and returned
	7402	* ENOENT. If we send back ENOENT here, caller will retry
	7403	* which isn't what we want so we send back EINVAL here
	7404	* instead.
	7405	*/
	7406	error = EINVAL;
	7407	goto out;
	7408	}
	7409
	7410	if (fvp == tdvp) {
	7411	error = EINVAL;
	7412	goto out;
	7413	}
	7414
	7415	/*
	7416	* The following edge case is caught here:
	7417	* (to cannot be a descendent of from)
	7418	*
	7419	* o fdvp
	7420	* /
	7421	* /
	7422	* o fvp
	7423	* \
	7424	* \
	7425	* o tdvp
	7426	* /
	7427	* /
	7428	* o tvp
	7429	*/
	7430	if (tdvp->v_parent == fvp) {
	7431	error = EINVAL;
	7432	goto out;
	7433	}
	7434
	7435	if (swap && fdvp->v_parent == tvp) {
	7436	error = EINVAL;
	7437	goto out;
	7438	}
	7439	/*** </MiscChecks> ***/
	7440
	7441	/*** <Kauth> ***/
	7442
	7443	/*
	7444	* As part of the Kauth step, we call out to allow 3rd-party
	7445	* fileop notification of "about to rename". This is needed
	7446	* in the event that 3rd-parties need to know that the DELETE
	7447	* authorization is actually part of a rename. It's important
	7448	* that we guarantee that the DELETE call-out will always be
	7449	* made if the WILL_RENAME call-out is made. Another fileop
	7450	* call-out will be performed once the operation is completed.
	7451	* We can ignore the result of kauth_authorize_fileop().
	7452	*
	7453	* N.B. We are passing the vnode and both paths to each
	7454	* call; kauth_authorize_fileop() extracts the "from" path
	7455	* when posting a KAUTH_FILEOP_WILL_RENAME notification.
	7456	* As such, we only post these notifications if all of the
	7457	* information we need is provided.
	7458	*/
	7459
	7460	if (swap) {
	7461	kauth_action_t f = 0, t = 0;
	7462
	7463	/*
	7464	* Directories changing parents need ...ADD_SUBDIR... to
	7465	* permit changing ".."
	7466	*/
	7467	if (fdvp != tdvp) {
	7468	if (vnode_isdir(fvp)) {
	7469	f = KAUTH_VNODE_ADD_SUBDIRECTORY;
	7470	}
	7471	if (vnode_isdir(tvp)) {
	7472	t = KAUTH_VNODE_ADD_SUBDIRECTORY;
	7473	}
	7474	}
	7475	if (to_path != NULL) {
	7476	kauth_authorize_fileop(vfs_context_ucred(ctx),
	7477	KAUTH_FILEOP_WILL_RENAME,
	7478	(uintptr_t)fvp,
	7479	(uintptr_t)to_path);
	7480	}
	7481	error = vnode_authorize(fvp, fdvp, KAUTH_VNODE_DELETE \| f, ctx);
	7482	if (error) {
	7483	goto out;
	7484	}
	7485	if (from_path != NULL) {
	7486	kauth_authorize_fileop(vfs_context_ucred(ctx),
	7487	KAUTH_FILEOP_WILL_RENAME,
	7488	(uintptr_t)tvp,
	7489	(uintptr_t)from_path);
	7490	}
	7491	error = vnode_authorize(tvp, tdvp, KAUTH_VNODE_DELETE \| t, ctx);
	7492	if (error) {
	7493	goto out;
	7494	}
	7495	f = vnode_isdir(fvp) ? KAUTH_VNODE_ADD_SUBDIRECTORY : KAUTH_VNODE_ADD_FILE;
	7496	t = vnode_isdir(tvp) ? KAUTH_VNODE_ADD_SUBDIRECTORY : KAUTH_VNODE_ADD_FILE;
	7497	if (fdvp == tdvp) {
	7498	error = vnode_authorize(fdvp, NULL, f \| t, ctx);
	7499	} else {
	7500	error = vnode_authorize(fdvp, NULL, t, ctx);
	7501	if (error) {
	7502	goto out;
	7503	}
	7504	error = vnode_authorize(tdvp, NULL, f, ctx);
	7505	}
	7506	if (error) {
	7507	goto out;
	7508	}
	7509	} else {
	7510	error = 0;
	7511	if ((tvp != NULL) && vnode_isdir(tvp)) {
	7512	if (tvp != fdvp) {
	7513	moving = 1;
	7514	}
	7515	} else if (tdvp != fdvp) {
	7516	moving = 1;
	7517	}
	7518
	7519	/*
	7520	* must have delete rights to remove the old name even in
	7521	* the simple case of fdvp == tdvp.
	7522	*
	7523	* If fvp is a directory, and we are changing it's parent,
	7524	* then we also need rights to rewrite its ".." entry as well.
	7525	*/
	7526	if (to_path != NULL) {
	7527	kauth_authorize_fileop(vfs_context_ucred(ctx),
	7528	KAUTH_FILEOP_WILL_RENAME,
	7529	(uintptr_t)fvp,
	7530	(uintptr_t)to_path);
	7531	}
	7532	if (vnode_isdir(fvp)) {
	7533	if ((error = vnode_authorize(fvp, fdvp, KAUTH_VNODE_DELETE \| KAUTH_VNODE_ADD_SUBDIRECTORY, ctx)) != 0) {
	7534	goto out;
	7535	}
	7536	} else {
	7537	if ((error = vnode_authorize(fvp, fdvp, KAUTH_VNODE_DELETE, ctx)) != 0) {
	7538	goto out;
	7539	}
	7540	}
	7541	if (moving) {
	7542	/* moving into tdvp or tvp, must have rights to add */
	7543	if ((error = vnode_authorize(((tvp != NULL) && vnode_isdir(tvp)) ? tvp : tdvp,
	7544	NULL,
	7545	vnode_isdir(fvp) ? KAUTH_VNODE_ADD_SUBDIRECTORY : KAUTH_VNODE_ADD_FILE,
	7546	ctx)) != 0) {
	7547	goto out;
	7548	}
	7549	} else {
	7550	/* node staying in same directory, must be allowed to add new name */
	7551	if ((error = vnode_authorize(fdvp, NULL,
	7552	vnode_isdir(fvp) ? KAUTH_VNODE_ADD_SUBDIRECTORY : KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
	7553	goto out;
	7554	}
	7555	}
	7556	/* overwriting tvp */
	7557	if ((tvp != NULL) && !vnode_isdir(tvp) &&
	7558	((error = vnode_authorize(tvp, tdvp, KAUTH_VNODE_DELETE, ctx)) != 0)) {
	7559	goto out;
	7560	}
	7561	}
	7562
	7563	/*** </Kauth> ***/
	7564
	7565	/* XXX more checks? */
	7566	out:
	7567	return error;
	7568	}
	7569
	7570	int
	7571	vn_authorize_mkdir(vnode_t dvp, struct componentname cnp, struct vnode_attr vap, vfs_context_t ctx, void *reserved)
	7572	{
	7573	#if !CONFIG_MACF
	7574	#pragma unused(vap)
	7575	#endif
	7576	int error;
	7577
	7578	if (reserved != NULL) {
	7579	panic("reserved not NULL in vn_authorize_mkdir()");
	7580	}
	7581
	7582	/* XXX A hack for now, to make shadow files work */
	7583	if (cnp->cn_ndp == NULL) {
	7584	return 0;
	7585	}
	7586
	7587	if (vnode_compound_mkdir_available(dvp)) {
	7588	error = lookup_validate_creation_path(cnp->cn_ndp);
	7589	if (error) {
	7590	goto out;
	7591	}
	7592	}
	7593
	7594	#if CONFIG_MACF
	7595	error = mac_vnode_check_create(ctx,
	7596	dvp, cnp, vap);
	7597	if (error) {
	7598	goto out;
	7599	}
	7600	#endif
	7601
	7602	/* authorize addition of a directory to the parent */
	7603	if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_SUBDIRECTORY, ctx)) != 0) {
	7604	goto out;
	7605	}
	7606
	7607	out:
	7608	return error;
	7609	}
	7610
	7611	int
	7612	vn_authorize_rmdir(vnode_t dvp, vnode_t vp, struct componentname cnp, vfs_context_t ctx, void reserved)
	7613	{
	7614	#if CONFIG_MACF
	7615	int error;
	7616	#else
	7617	#pragma unused(cnp)
	7618	#endif
	7619	if (reserved != NULL) {
	7620	panic("Non-NULL reserved argument to vn_authorize_rmdir()");
	7621	}
	7622
	7623	if (vp->v_type != VDIR) {
	7624	/*
	7625	* rmdir only deals with directories
	7626	*/
	7627	return ENOTDIR;
	7628	}
	7629
	7630	if (dvp == vp) {
	7631	/*
	7632	* No rmdir "." please.
	7633	*/
	7634	return EINVAL;
	7635	}
	7636
	7637	#if CONFIG_MACF
	7638	error = mac_vnode_check_unlink(ctx, dvp,
	7639	vp, cnp);
	7640	if (error) {
	7641	return error;
	7642	}
	7643	#endif
	7644
	7645	return vnode_authorize(vp, dvp, KAUTH_VNODE_DELETE, ctx);
	7646	}
	7647
	7648	/*
	7649	* Authorizer for directory cloning. This does not use vnodes but instead
	7650	* uses prefilled vnode attributes from the filesystem.
	7651	*
	7652	* The same function is called to set up the attributes required, perform the
	7653	* authorization and cleanup (if required)
	7654	*/
	7655	int
	7656	vnode_attr_authorize_dir_clone(struct vnode_attr *vap, kauth_action_t action,
	7657	struct vnode_attr *dvap, __unused vnode_t sdvp, mount_t mp,
	7658	dir_clone_authorizer_op_t vattr_op, uint32_t flags, vfs_context_t ctx,
	7659	__unused void *reserved)
	7660	{
	7661	int error;
	7662	int is_suser = vfs_context_issuser(ctx);
	7663
	7664	if (vattr_op == OP_VATTR_SETUP) {
	7665	VATTR_INIT(vap);
	7666
	7667	/*
	7668	* When ACL inheritence is implemented, both vap->va_acl and
	7669	* dvap->va_acl will be required (even as superuser).
	7670	*/
	7671	VATTR_WANTED(vap, va_type);
	7672	VATTR_WANTED(vap, va_mode);
	7673	VATTR_WANTED(vap, va_flags);
	7674	VATTR_WANTED(vap, va_uid);
	7675	VATTR_WANTED(vap, va_gid);
	7676	if (dvap) {
	7677	VATTR_INIT(dvap);
	7678	VATTR_WANTED(dvap, va_flags);
	7679	}
	7680
	7681	if (!is_suser) {
	7682	/*
	7683	* If not superuser, we have to evaluate ACLs and
	7684	* need the target directory gid to set the initial
	7685	* gid of the new object.
	7686	*/
	7687	VATTR_WANTED(vap, va_acl);
	7688	if (dvap) {
	7689	VATTR_WANTED(dvap, va_gid);
	7690	}
	7691	} else if (dvap && (flags & VNODE_CLONEFILE_NOOWNERCOPY)) {
	7692	VATTR_WANTED(dvap, va_gid);
	7693	}
	7694	return 0;
	7695	} else if (vattr_op == OP_VATTR_CLEANUP) {
	7696	return 0; /* Nothing to do for now */
	7697	}
	7698
	7699	/* dvap isn't used for authorization */
	7700	error = vnode_attr_authorize(vap, NULL, mp, action, ctx);
	7701
	7702	if (error) {
	7703	return error;
	7704	}
	7705
	7706	/*
	7707	* vn_attribute_prepare should be able to accept attributes as well as
	7708	* vnodes but for now we do this inline.
	7709	*/
	7710	if (!is_suser \|\| (flags & VNODE_CLONEFILE_NOOWNERCOPY)) {
	7711	/*
	7712	* If the filesystem is mounted IGNORE_OWNERSHIP and an explicit
	7713	* owner is set, that owner takes ownership of all new files.
	7714	*/
	7715	if ((mp->mnt_flag & MNT_IGNORE_OWNERSHIP) &&
	7716	(mp->mnt_fsowner != KAUTH_UID_NONE)) {
	7717	VATTR_SET(vap, va_uid, mp->mnt_fsowner);
	7718	} else {
	7719	/* default owner is current user */
	7720	VATTR_SET(vap, va_uid,
	7721	kauth_cred_getuid(vfs_context_ucred(ctx)));
	7722	}
	7723
	7724	if ((mp->mnt_flag & MNT_IGNORE_OWNERSHIP) &&
	7725	(mp->mnt_fsgroup != KAUTH_GID_NONE)) {
	7726	VATTR_SET(vap, va_gid, mp->mnt_fsgroup);
	7727	} else {
	7728	/*
	7729	* default group comes from parent object,
	7730	* fallback to current user
	7731	*/
	7732	if (VATTR_IS_SUPPORTED(dvap, va_gid)) {
	7733	VATTR_SET(vap, va_gid, dvap->va_gid);
	7734	} else {
	7735	VATTR_SET(vap, va_gid,
	7736	kauth_cred_getgid(vfs_context_ucred(ctx)));
	7737	}
	7738	}
	7739	}
	7740
	7741	/* Inherit SF_RESTRICTED bit from destination directory only */
	7742	if (VATTR_IS_ACTIVE(vap, va_flags)) {
	7743	VATTR_SET(vap, va_flags,
	7744	((vap->va_flags & ~(UF_DATAVAULT \| SF_RESTRICTED)))); /* Turn off from source */
	7745	if (VATTR_IS_ACTIVE(dvap, va_flags)) {
	7746	VATTR_SET(vap, va_flags,
	7747	vap->va_flags \| (dvap->va_flags & (UF_DATAVAULT \| SF_RESTRICTED)));
	7748	}
	7749	} else if (VATTR_IS_ACTIVE(dvap, va_flags)) {
	7750	VATTR_SET(vap, va_flags, (dvap->va_flags & (UF_DATAVAULT \| SF_RESTRICTED)));
	7751	}
	7752
	7753	return 0;
	7754	}
	7755
	7756
	7757	/*
	7758	* Authorize an operation on a vnode.
	7759	*
	7760	* This is KPI, but here because it needs vnode_scope.
	7761	*
	7762	* Returns: 0 Success
	7763	* kauth_authorize_action:EPERM ...
	7764	* xlate => EACCES Permission denied
	7765	* kauth_authorize_action:0 Success
	7766	* kauth_authorize_action: Depends on callback return; this is
	7767	* usually only vnode_authorize_callback(),
	7768	* but may include other listeners, if any
	7769	* exist.
	7770	* EROFS
	7771	* EACCES
	7772	* EPERM
	7773	* ???
	7774	*/
	7775	int
	7776	vnode_authorize(vnode_t vp, vnode_t dvp, kauth_action_t action, vfs_context_t ctx)
	7777	{
	7778	int error, result;
	7779
	7780	/*
	7781	* We can't authorize against a dead vnode; allow all operations through so that
	7782	* the correct error can be returned.
	7783	*/
	7784	if (vp->v_type == VBAD) {
	7785	return 0;
	7786	}
	7787
	7788	error = 0;
	7789	result = kauth_authorize_action(vnode_scope, vfs_context_ucred(ctx), action,
	7790	(uintptr_t)ctx, (uintptr_t)vp, (uintptr_t)dvp, (uintptr_t)&error);
	7791	if (result == EPERM) { /* traditional behaviour */
	7792	result = EACCES;
	7793	}
	7794	/* did the lower layers give a better error return? */
	7795	if ((result != 0) && (error != 0)) {
	7796	return error;
	7797	}
	7798	return result;
	7799	}
	7800
	7801	/*
	7802	* Test for vnode immutability.
	7803	*
	7804	* The 'append' flag is set when the authorization request is constrained
	7805	* to operations which only request the right to append to a file.
	7806	*
	7807	* The 'ignore' flag is set when an operation modifying the immutability flags
	7808	* is being authorized. We check the system securelevel to determine which
	7809	* immutability flags we can ignore.
	7810	*/
	7811	static int
	7812	vnode_immutable(struct vnode_attr *vap, int append, int ignore)
	7813	{
	7814	int mask;
	7815
	7816	/* start with all bits precluding the operation */
	7817	mask = IMMUTABLE \| APPEND;
	7818
	7819	/* if appending only, remove the append-only bits */
	7820	if (append) {
	7821	mask &= ~APPEND;
	7822	}
	7823
	7824	/* ignore only set when authorizing flags changes */
	7825	if (ignore) {
	7826	if (securelevel <= 0) {
	7827	/* in insecure state, flags do not inhibit changes */
	7828	mask = 0;
	7829	} else {
	7830	/* in secure state, user flags don't inhibit */
	7831	mask &= ~(UF_IMMUTABLE \| UF_APPEND);
	7832	}
	7833	}
	7834	KAUTH_DEBUG("IMMUTABLE - file flags 0x%x mask 0x%x append = %d ignore = %d", vap->va_flags, mask, append, ignore);
	7835	if ((vap->va_flags & mask) != 0) {
	7836	return EPERM;
	7837	}
	7838	return 0;
	7839	}
	7840
	7841	static int
	7842	vauth_node_owner(struct vnode_attr *vap, kauth_cred_t cred)
	7843	{
	7844	int result;
	7845
	7846	/* default assumption is not-owner */
	7847	result = 0;
	7848
	7849	/*
	7850	* If the filesystem has given us a UID, we treat this as authoritative.
	7851	*/
	7852	if (vap && VATTR_IS_SUPPORTED(vap, va_uid)) {
	7853	result = (vap->va_uid == kauth_cred_getuid(cred)) ? 1 : 0;
	7854	}
	7855	/* we could test the owner UUID here if we had a policy for it */
	7856
	7857	return result;
	7858	}
	7859
	7860	/*
	7861	* vauth_node_group
	7862	*
	7863	* Description: Ask if a cred is a member of the group owning the vnode object
	7864	*
	7865	* Parameters: vap vnode attribute
	7866	* vap->va_gid group owner of vnode object
	7867	* cred credential to check
	7868	* ismember pointer to where to put the answer
	7869	* idontknow Return this if we can't get an answer
	7870	*
	7871	* Returns: 0 Success
	7872	* idontknow Can't get information
	7873	* kauth_cred_ismember_gid:? Error from kauth subsystem
	7874	* kauth_cred_ismember_gid:? Error from kauth subsystem
	7875	*/
	7876	static int
	7877	vauth_node_group(struct vnode_attr vap, kauth_cred_t cred, int ismember, int idontknow)
	7878	{
	7879	int error;
	7880	int result;
	7881
	7882	error = 0;
	7883	result = 0;
	7884
	7885	/*
	7886	* The caller is expected to have asked the filesystem for a group
	7887	* at some point prior to calling this function. The answer may
	7888	* have been that there is no group ownership supported for the
	7889	* vnode object, in which case we return
	7890	*/
	7891	if (vap && VATTR_IS_SUPPORTED(vap, va_gid)) {
	7892	error = kauth_cred_ismember_gid(cred, vap->va_gid, &result);
	7893	/*
	7894	* Credentials which are opted into external group membership
	7895	* resolution which are not known to the external resolver
	7896	* will result in an ENOENT error. We translate this into
	7897	* the appropriate 'idontknow' response for our caller.
	7898	*
	7899	* XXX We do not make a distinction here between an ENOENT
	7900	* XXX arising from a response from the external resolver,
	7901	* XXX and an ENOENT which is internally generated. This is
	7902	* XXX a deficiency of the published kauth_cred_ismember_gid()
	7903	* XXX KPI which can not be overcome without new KPI. For
	7904	* XXX all currently known cases, however, this wil result
	7905	* XXX in correct behaviour.
	7906	*/
	7907	if (error == ENOENT) {
	7908	error = idontknow;
	7909	}
	7910	}
	7911	/*
	7912	* XXX We could test the group UUID here if we had a policy for it,
	7913	* XXX but this is problematic from the perspective of synchronizing
	7914	* XXX group UUID and POSIX GID ownership of a file and keeping the
	7915	* XXX values coherent over time. The problem is that the local
	7916	* XXX system will vend transient group UUIDs for unknown POSIX GID
	7917	* XXX values, and these are not persistent, whereas storage of values
	7918	* XXX is persistent. One potential solution to this is a local
	7919	* XXX (persistent) replica of remote directory entries and vended
	7920	* XXX local ids in a local directory server (think in terms of a
	7921	* XXX caching DNS server).
	7922	*/
	7923
	7924	if (!error) {
	7925	*ismember = result;
	7926	}
	7927	return error;
	7928	}
	7929
	7930	static int
	7931	vauth_file_owner(vauth_ctx vcp)
	7932	{
	7933	int result;
	7934
	7935	if (vcp->flags_valid & _VAC_IS_OWNER) {
	7936	result = (vcp->flags & _VAC_IS_OWNER) ? 1 : 0;
	7937	} else {
	7938	result = vauth_node_owner(vcp->vap, vcp->ctx->vc_ucred);
	7939
	7940	/* cache our result */
	7941	vcp->flags_valid \|= _VAC_IS_OWNER;
	7942	if (result) {
	7943	vcp->flags \|= _VAC_IS_OWNER;
	7944	} else {
	7945	vcp->flags &= ~_VAC_IS_OWNER;
	7946	}
	7947	}
	7948	return result;
	7949	}
	7950
	7951
	7952	/*
	7953	* vauth_file_ingroup
	7954	*
	7955	* Description: Ask if a user is a member of the group owning the file
	7956	*
	7957	* Parameters: vcp The vnode authorization context that
	7958	* contains the user and directory info
	7959	* vcp->flags_valid Valid flags
	7960	* vcp->flags Flags values
	7961	* vcp->vap File vnode attributes
	7962	* vcp->ctx VFS Context (for user)
	7963	* ismember pointer to where to put the answer
	7964	* idontknow Return this if we can't get an answer
	7965	*
	7966	* Returns: 0 Success
	7967	* vauth_node_group:? Error from vauth_node_group()
	7968	*
	7969	* Implicit returns: *ismember 0 The user is not a group member
	7970	* 1 The user is a group member
	7971	*/
	7972	static int
	7973	vauth_file_ingroup(vauth_ctx vcp, int *ismember, int idontknow)
	7974	{
	7975	int error;
	7976
	7977	/* Check for a cached answer first, to avoid the check if possible */
	7978	if (vcp->flags_valid & _VAC_IN_GROUP) {
	7979	*ismember = (vcp->flags & _VAC_IN_GROUP) ? 1 : 0;
	7980	error = 0;
	7981	} else {
	7982	/* Otherwise, go look for it */
	7983	error = vauth_node_group(vcp->vap, vcp->ctx->vc_ucred, ismember, idontknow);
	7984
	7985	if (!error) {
	7986	/* cache our result */
	7987	vcp->flags_valid \|= _VAC_IN_GROUP;
	7988	if (*ismember) {
	7989	vcp->flags \|= _VAC_IN_GROUP;
	7990	} else {
	7991	vcp->flags &= ~_VAC_IN_GROUP;
	7992	}
	7993	}
	7994	}
	7995	return error;
	7996	}
	7997
	7998	static int
	7999	vauth_dir_owner(vauth_ctx vcp)
	8000	{
	8001	int result;
	8002
	8003	if (vcp->flags_valid & _VAC_IS_DIR_OWNER) {
	8004	result = (vcp->flags & _VAC_IS_DIR_OWNER) ? 1 : 0;
	8005	} else {
	8006	result = vauth_node_owner(vcp->dvap, vcp->ctx->vc_ucred);
	8007
	8008	/* cache our result */
	8009	vcp->flags_valid \|= _VAC_IS_DIR_OWNER;
	8010	if (result) {
	8011	vcp->flags \|= _VAC_IS_DIR_OWNER;
	8012	} else {
	8013	vcp->flags &= ~_VAC_IS_DIR_OWNER;
	8014	}
	8015	}
	8016	return result;
	8017	}
	8018
	8019	/*
	8020	* vauth_dir_ingroup
	8021	*
	8022	* Description: Ask if a user is a member of the group owning the directory
	8023	*
	8024	* Parameters: vcp The vnode authorization context that
	8025	* contains the user and directory info
	8026	* vcp->flags_valid Valid flags
	8027	* vcp->flags Flags values
	8028	* vcp->dvap Dir vnode attributes
	8029	* vcp->ctx VFS Context (for user)
	8030	* ismember pointer to where to put the answer
	8031	* idontknow Return this if we can't get an answer
	8032	*
	8033	* Returns: 0 Success
	8034	* vauth_node_group:? Error from vauth_node_group()
	8035	*
	8036	* Implicit returns: *ismember 0 The user is not a group member
	8037	* 1 The user is a group member
	8038	*/
	8039	static int
	8040	vauth_dir_ingroup(vauth_ctx vcp, int *ismember, int idontknow)
	8041	{
	8042	int error;
	8043
	8044	/* Check for a cached answer first, to avoid the check if possible */
	8045	if (vcp->flags_valid & _VAC_IN_DIR_GROUP) {
	8046	*ismember = (vcp->flags & _VAC_IN_DIR_GROUP) ? 1 : 0;
	8047	error = 0;
	8048	} else {
	8049	/* Otherwise, go look for it */
	8050	error = vauth_node_group(vcp->dvap, vcp->ctx->vc_ucred, ismember, idontknow);
	8051
	8052	if (!error) {
	8053	/* cache our result */
	8054	vcp->flags_valid \|= _VAC_IN_DIR_GROUP;
	8055	if (*ismember) {
	8056	vcp->flags \|= _VAC_IN_DIR_GROUP;
	8057	} else {
	8058	vcp->flags &= ~_VAC_IN_DIR_GROUP;
	8059	}
	8060	}
	8061	}
	8062	return error;
	8063	}
	8064
	8065	/*
	8066	* Test the posix permissions in (vap) to determine whether (credential)
	8067	* may perform (action)
	8068	*/
	8069	static int
	8070	vnode_authorize_posix(vauth_ctx vcp, int action, int on_dir)
	8071	{
	8072	struct vnode_attr *vap;
	8073	int needed, error, owner_ok, group_ok, world_ok, ismember;
	8074	#ifdef KAUTH_DEBUG_ENABLE
	8075	const char *where = "uninitialized";
	8076	# define _SETWHERE(c) where = c;
	8077	#else
	8078	# define _SETWHERE(c)
	8079	#endif
	8080
	8081	/* checking file or directory? */
	8082	if (on_dir) {
	8083	vap = vcp->dvap;
	8084	} else {
	8085	vap = vcp->vap;
	8086	}
	8087
	8088	error = 0;
	8089
	8090	/*
	8091	* We want to do as little work here as possible. So first we check
	8092	* which sets of permissions grant us the access we need, and avoid checking
	8093	* whether specific permissions grant access when more generic ones would.
	8094	*/
	8095
	8096	/* owner permissions */
	8097	needed = 0;
	8098	if (action & VREAD) {
	8099	needed \|= S_IRUSR;
	8100	}
	8101	if (action & VWRITE) {
	8102	needed \|= S_IWUSR;
	8103	}
	8104	if (action & VEXEC) {
	8105	needed \|= S_IXUSR;
	8106	}
	8107	owner_ok = (needed & vap->va_mode) == needed;
	8108
	8109	/*
	8110	* Processes with the appropriate entitlement can marked themselves as
	8111	* ignoring file/directory permissions if they own it.
	8112	*/
	8113	if (!owner_ok && proc_ignores_node_permissions(vfs_context_proc(vcp->ctx))) {
	8114	owner_ok = 1;
	8115	}
	8116
	8117	/* group permissions */
	8118	needed = 0;
	8119	if (action & VREAD) {
	8120	needed \|= S_IRGRP;
	8121	}
	8122	if (action & VWRITE) {
	8123	needed \|= S_IWGRP;
	8124	}
	8125	if (action & VEXEC) {
	8126	needed \|= S_IXGRP;
	8127	}
	8128	group_ok = (needed & vap->va_mode) == needed;
	8129
	8130	/* world permissions */
	8131	needed = 0;
	8132	if (action & VREAD) {
	8133	needed \|= S_IROTH;
	8134	}
	8135	if (action & VWRITE) {
	8136	needed \|= S_IWOTH;
	8137	}
	8138	if (action & VEXEC) {
	8139	needed \|= S_IXOTH;
	8140	}
	8141	world_ok = (needed & vap->va_mode) == needed;
	8142
	8143	/* If granted/denied by all three, we're done */
	8144	if (owner_ok && group_ok && world_ok) {
	8145	_SETWHERE("all");
	8146	goto out;
	8147	}
	8148
	8149	if (!owner_ok && !group_ok && !world_ok) {
	8150	_SETWHERE("all");
	8151	error = EACCES;
	8152	goto out;
	8153	}
	8154
	8155	/* Check ownership (relatively cheap) */
	8156	if ((on_dir && vauth_dir_owner(vcp)) \|\|
	8157	(!on_dir && vauth_file_owner(vcp))) {
	8158	_SETWHERE("user");
	8159	if (!owner_ok) {
	8160	error = EACCES;
	8161	}
	8162	goto out;
	8163	}
	8164
	8165	/* Not owner; if group and world both grant it we're done */
	8166	if (group_ok && world_ok) {
	8167	_SETWHERE("group/world");
	8168	goto out;
	8169	}
	8170	if (!group_ok && !world_ok) {
	8171	_SETWHERE("group/world");
	8172	error = EACCES;
	8173	goto out;
	8174	}
	8175
	8176	/* Check group membership (most expensive) */
	8177	ismember = 0; /* Default to allow, if the target has no group owner */
	8178
	8179	/*
	8180	* In the case we can't get an answer about the user from the call to
	8181	* vauth_dir_ingroup() or vauth_file_ingroup(), we want to fail on
	8182	* the side of caution, rather than simply granting access, or we will
	8183	* fail to correctly implement exclusion groups, so we set the third
	8184	* parameter on the basis of the state of 'group_ok'.
	8185	*/
	8186	if (on_dir) {
	8187	error = vauth_dir_ingroup(vcp, &ismember, (!group_ok ? EACCES : 0));
	8188	} else {
	8189	error = vauth_file_ingroup(vcp, &ismember, (!group_ok ? EACCES : 0));
	8190	}
	8191	if (error) {
	8192	if (!group_ok) {
	8193	ismember = 1;
	8194	}
	8195	error = 0;
	8196	}
	8197	if (ismember) {
	8198	_SETWHERE("group");
	8199	if (!group_ok) {
	8200	error = EACCES;
	8201	}
	8202	goto out;
	8203	}
	8204
	8205	/* Not owner, not in group, use world result */
	8206	_SETWHERE("world");
	8207	if (!world_ok) {
	8208	error = EACCES;
	8209	}
	8210
	8211	/* FALLTHROUGH */
	8212
	8213	out:
	8214	KAUTH_DEBUG("%p %s - posix %s permissions : need %s%s%s %x have %s%s%s%s%s%s%s%s%s UID = %d file = %d,%d",
	8215	vcp->vp, (error == 0) ? "ALLOWED" : "DENIED", where,
	8216	(action & VREAD) ? "r" : "-",
	8217	(action & VWRITE) ? "w" : "-",
	8218	(action & VEXEC) ? "x" : "-",
	8219	needed,
	8220	(vap->va_mode & S_IRUSR) ? "r" : "-",
	8221	(vap->va_mode & S_IWUSR) ? "w" : "-",
	8222	(vap->va_mode & S_IXUSR) ? "x" : "-",
	8223	(vap->va_mode & S_IRGRP) ? "r" : "-",
	8224	(vap->va_mode & S_IWGRP) ? "w" : "-",
	8225	(vap->va_mode & S_IXGRP) ? "x" : "-",
	8226	(vap->va_mode & S_IROTH) ? "r" : "-",
	8227	(vap->va_mode & S_IWOTH) ? "w" : "-",
	8228	(vap->va_mode & S_IXOTH) ? "x" : "-",
	8229	kauth_cred_getuid(vcp->ctx->vc_ucred),
	8230	on_dir ? vcp->dvap->va_uid : vcp->vap->va_uid,
	8231	on_dir ? vcp->dvap->va_gid : vcp->vap->va_gid);
	8232	return error;
	8233	}
	8234
	8235	/*
	8236	* Authorize the deletion of the node vp from the directory dvp.
	8237	*
	8238	* We assume that:
	8239	* - Neither the node nor the directory are immutable.
	8240	* - The user is not the superuser.
	8241	*
	8242	* The precedence of factors for authorizing or denying delete for a credential
	8243	*
	8244	* 1) Explicit ACE on the node. (allow or deny DELETE)
	8245	* 2) Explicit ACE on the directory (allow or deny DELETE_CHILD).
	8246	*
	8247	* If there are conflicting ACEs on the node and the directory, the node
	8248	* ACE wins.
	8249	*
	8250	* 3) Sticky bit on the directory.
	8251	* Deletion is not permitted if the directory is sticky and the caller is
	8252	* not owner of the node or directory. The sticky bit rules are like a deny
	8253	* delete ACE except lower in priority than ACL's either allowing or denying
	8254	* delete.
	8255	*
	8256	* 4) POSIX permissions on the directory.
	8257	*
	8258	* As an optimization, we cache whether or not delete child is permitted
	8259	* on directories. This enables us to skip directory ACL and POSIX checks
	8260	* as we already have the result from those checks. However, we always check the
	8261	* node ACL and, if the directory has the sticky bit set, we always check its
	8262	* ACL (even for a directory with an authorized delete child). Furthermore,
	8263	* caching the delete child authorization is independent of the sticky bit
	8264	* being set as it is only applicable in determining whether the node can be
	8265	* deleted or not.
	8266	*/
	8267	static int
	8268	vnode_authorize_delete(vauth_ctx vcp, boolean_t cached_delete_child)
	8269	{
	8270	struct vnode_attr *vap = vcp->vap;
	8271	struct vnode_attr *dvap = vcp->dvap;
	8272	kauth_cred_t cred = vcp->ctx->vc_ucred;
	8273	struct kauth_acl_eval eval;
	8274	int error, ismember;
	8275
	8276	/* Check the ACL on the node first */
	8277	if (VATTR_IS_NOT(vap, va_acl, NULL)) {
	8278	eval.ae_requested = KAUTH_VNODE_DELETE;
	8279	eval.ae_acl = &vap->va_acl->acl_ace[0];
	8280	eval.ae_count = vap->va_acl->acl_entrycount;
	8281	eval.ae_options = 0;
	8282	if (vauth_file_owner(vcp)) {
	8283	eval.ae_options \|= KAUTH_AEVAL_IS_OWNER;
	8284	}
	8285	/*
	8286	* We use ENOENT as a marker to indicate we could not get
	8287	* information in order to delay evaluation until after we
	8288	* have the ACL evaluation answer. Previously, we would
	8289	* always deny the operation at this point.
	8290	*/
	8291	if ((error = vauth_file_ingroup(vcp, &ismember, ENOENT)) != 0 && error != ENOENT) {
	8292	return error;
	8293	}
	8294	if (error == ENOENT) {
	8295	eval.ae_options \|= KAUTH_AEVAL_IN_GROUP_UNKNOWN;
	8296	} else if (ismember) {
	8297	eval.ae_options \|= KAUTH_AEVAL_IN_GROUP;
	8298	}
	8299	eval.ae_exp_gall = KAUTH_VNODE_GENERIC_ALL_BITS;
	8300	eval.ae_exp_gread = KAUTH_VNODE_GENERIC_READ_BITS;
	8301	eval.ae_exp_gwrite = KAUTH_VNODE_GENERIC_WRITE_BITS;
	8302	eval.ae_exp_gexec = KAUTH_VNODE_GENERIC_EXECUTE_BITS;
	8303
	8304	if ((error = kauth_acl_evaluate(cred, &eval)) != 0) {
	8305	KAUTH_DEBUG("%p ERROR during ACL processing - %d", vcp->vp, error);
	8306	return error;
	8307	}
	8308
	8309	switch (eval.ae_result) {
	8310	case KAUTH_RESULT_DENY:
	8311	if (vauth_file_owner(vcp) && proc_ignores_node_permissions(vfs_context_proc(vcp->ctx))) {
	8312	KAUTH_DEBUG("%p Override DENY due to entitlement", vcp->vp);
	8313	return 0;
	8314	}
	8315	KAUTH_DEBUG("%p DENIED - denied by ACL", vcp->vp);
	8316	return EACCES;
	8317	case KAUTH_RESULT_ALLOW:
	8318	KAUTH_DEBUG("%p ALLOWED - granted by ACL", vcp->vp);
	8319	return 0;
	8320	case KAUTH_RESULT_DEFER:
	8321	default:
	8322	/* Defer to directory */
	8323	KAUTH_DEBUG("%p DEFERRED - by file ACL", vcp->vp);
	8324	break;
	8325	}
	8326	}
	8327
	8328	/*
	8329	* Without a sticky bit, a previously authorized delete child is
	8330	* sufficient to authorize this delete.
	8331	*
	8332	* If the sticky bit is set, a directory ACL which allows delete child
	8333	* overrides a (potential) sticky bit deny. The authorized delete child
	8334	* cannot tell us if it was authorized because of an explicit delete
	8335	* child allow ACE or because of POSIX permisions so we have to check
	8336	* the directory ACL everytime if the directory has a sticky bit.
	8337	*/
	8338	if (!(dvap->va_mode & S_ISTXT) && cached_delete_child) {
	8339	KAUTH_DEBUG("%p ALLOWED - granted by directory ACL or POSIX permissions and no sticky bit on directory", vcp->vp);
	8340	return 0;
	8341	}
	8342
	8343	/* check the ACL on the directory */
	8344	if (VATTR_IS_NOT(dvap, va_acl, NULL)) {
	8345	eval.ae_requested = KAUTH_VNODE_DELETE_CHILD;
	8346	eval.ae_acl = &dvap->va_acl->acl_ace[0];
	8347	eval.ae_count = dvap->va_acl->acl_entrycount;
	8348	eval.ae_options = 0;
	8349	if (vauth_dir_owner(vcp)) {
	8350	eval.ae_options \|= KAUTH_AEVAL_IS_OWNER;
	8351	}
	8352	/*
	8353	* We use ENOENT as a marker to indicate we could not get
	8354	* information in order to delay evaluation until after we
	8355	* have the ACL evaluation answer. Previously, we would
	8356	* always deny the operation at this point.
	8357	*/
	8358	if ((error = vauth_dir_ingroup(vcp, &ismember, ENOENT)) != 0 && error != ENOENT) {
	8359	return error;
	8360	}
	8361	if (error == ENOENT) {
	8362	eval.ae_options \|= KAUTH_AEVAL_IN_GROUP_UNKNOWN;
	8363	} else if (ismember) {
	8364	eval.ae_options \|= KAUTH_AEVAL_IN_GROUP;
	8365	}
	8366	eval.ae_exp_gall = KAUTH_VNODE_GENERIC_ALL_BITS;
	8367	eval.ae_exp_gread = KAUTH_VNODE_GENERIC_READ_BITS;
	8368	eval.ae_exp_gwrite = KAUTH_VNODE_GENERIC_WRITE_BITS;
	8369	eval.ae_exp_gexec = KAUTH_VNODE_GENERIC_EXECUTE_BITS;
	8370
	8371	/*
	8372	* If there is no entry, we are going to defer to other
	8373	* authorization mechanisms.
	8374	*/
	8375	error = kauth_acl_evaluate(cred, &eval);
	8376
	8377	if (error != 0) {
	8378	KAUTH_DEBUG("%p ERROR during ACL processing - %d", vcp->vp, error);
	8379	return error;
	8380	}
	8381	switch (eval.ae_result) {
	8382	case KAUTH_RESULT_DENY:
	8383	if (vauth_dir_owner(vcp) && proc_ignores_node_permissions(vfs_context_proc(vcp->ctx))) {
	8384	KAUTH_DEBUG("%p Override DENY due to entitlement", vcp->vp);
	8385	return 0;
	8386	}
	8387	KAUTH_DEBUG("%p DENIED - denied by directory ACL", vcp->vp);
	8388	return EACCES;
	8389	case KAUTH_RESULT_ALLOW:
	8390	KAUTH_DEBUG("%p ALLOWED - granted by directory ACL", vcp->vp);
	8391	if (!cached_delete_child && vcp->dvp) {
	8392	vnode_cache_authorized_action(vcp->dvp,
	8393	vcp->ctx, KAUTH_VNODE_DELETE_CHILD);
	8394	}
	8395	return 0;
	8396	case KAUTH_RESULT_DEFER:
	8397	default:
	8398	/* Deferred by directory ACL */
	8399	KAUTH_DEBUG("%p DEFERRED - directory ACL", vcp->vp);
	8400	break;
	8401	}
	8402	}
	8403
	8404	/*
	8405	* From this point, we can't explicitly allow and if we reach the end
	8406	* of the function without a denial, then the delete is authorized.
	8407	*/
	8408	if (!cached_delete_child) {
	8409	if (vnode_authorize_posix(vcp, VWRITE, 1 /* on_dir */) != 0) {
	8410	KAUTH_DEBUG("%p DENIED - denied by posix permisssions", vcp->vp);
	8411	return EACCES;
	8412	}
	8413	/*
	8414	* Cache the authorized action on the vnode if allowed by the
	8415	* directory ACL or POSIX permissions. It is correct to cache
	8416	* this action even if sticky bit would deny deleting the node.
	8417	*/
	8418	if (vcp->dvp) {
	8419	vnode_cache_authorized_action(vcp->dvp, vcp->ctx,
	8420	KAUTH_VNODE_DELETE_CHILD);
	8421	}
	8422	}
	8423
	8424	/* enforce sticky bit behaviour */
	8425	if ((dvap->va_mode & S_ISTXT) && !vauth_file_owner(vcp) && !vauth_dir_owner(vcp)) {
	8426	KAUTH_DEBUG("%p DENIED - sticky bit rules (user %d file %d dir %d)",
	8427	vcp->vp, cred->cr_posix.cr_uid, vap->va_uid, dvap->va_uid);
	8428	return EACCES;
	8429	}
	8430
	8431	/* not denied, must be OK */
	8432	return 0;
	8433	}
	8434
	8435
	8436	/*
	8437	* Authorize an operation based on the node's attributes.
	8438	*/
	8439	static int
	8440	vnode_authorize_simple(vauth_ctx vcp, kauth_ace_rights_t acl_rights, kauth_ace_rights_t preauth_rights, boolean_t *found_deny)
	8441	{
	8442	struct vnode_attr *vap = vcp->vap;
	8443	kauth_cred_t cred = vcp->ctx->vc_ucred;
	8444	struct kauth_acl_eval eval;
	8445	int error, ismember;
	8446	mode_t posix_action;
	8447
	8448	/*
	8449	* If we are the file owner, we automatically have some rights.
	8450	*
	8451	* Do we need to expand this to support group ownership?
	8452	*/
	8453	if (vauth_file_owner(vcp)) {
	8454	acl_rights &= ~(KAUTH_VNODE_WRITE_SECURITY);
	8455	}
	8456
	8457	/*
	8458	* If we are checking both TAKE_OWNERSHIP and WRITE_SECURITY, we can
	8459	* mask the latter. If TAKE_OWNERSHIP is requested the caller is about to
	8460	* change ownership to themselves, and WRITE_SECURITY is implicitly
	8461	* granted to the owner. We need to do this because at this point
	8462	* WRITE_SECURITY may not be granted as the caller is not currently
	8463	* the owner.
	8464	*/
	8465	if ((acl_rights & KAUTH_VNODE_TAKE_OWNERSHIP) &&
	8466	(acl_rights & KAUTH_VNODE_WRITE_SECURITY)) {
	8467	acl_rights &= ~KAUTH_VNODE_WRITE_SECURITY;
	8468	}
	8469
	8470	if (acl_rights == 0) {
	8471	KAUTH_DEBUG("%p ALLOWED - implicit or no rights required", vcp->vp);
	8472	return 0;
	8473	}
	8474
	8475	/* if we have an ACL, evaluate it */
	8476	if (VATTR_IS_NOT(vap, va_acl, NULL)) {
	8477	eval.ae_requested = acl_rights;
	8478	eval.ae_acl = &vap->va_acl->acl_ace[0];
	8479	eval.ae_count = vap->va_acl->acl_entrycount;
	8480	eval.ae_options = 0;
	8481	if (vauth_file_owner(vcp)) {
	8482	eval.ae_options \|= KAUTH_AEVAL_IS_OWNER;
	8483	}
	8484	/*
	8485	* We use ENOENT as a marker to indicate we could not get
	8486	* information in order to delay evaluation until after we
	8487	* have the ACL evaluation answer. Previously, we would
	8488	* always deny the operation at this point.
	8489	*/
	8490	if ((error = vauth_file_ingroup(vcp, &ismember, ENOENT)) != 0 && error != ENOENT) {
	8491	return error;
	8492	}
	8493	if (error == ENOENT) {
	8494	eval.ae_options \|= KAUTH_AEVAL_IN_GROUP_UNKNOWN;
	8495	} else if (ismember) {
	8496	eval.ae_options \|= KAUTH_AEVAL_IN_GROUP;
	8497	}
	8498	eval.ae_exp_gall = KAUTH_VNODE_GENERIC_ALL_BITS;
	8499	eval.ae_exp_gread = KAUTH_VNODE_GENERIC_READ_BITS;
	8500	eval.ae_exp_gwrite = KAUTH_VNODE_GENERIC_WRITE_BITS;
	8501	eval.ae_exp_gexec = KAUTH_VNODE_GENERIC_EXECUTE_BITS;
	8502
	8503	if ((error = kauth_acl_evaluate(cred, &eval)) != 0) {
	8504	KAUTH_DEBUG("%p ERROR during ACL processing - %d", vcp->vp, error);
	8505	return error;
	8506	}
	8507
	8508	switch (eval.ae_result) {
	8509	case KAUTH_RESULT_DENY:
	8510	if (vauth_file_owner(vcp) && proc_ignores_node_permissions(vfs_context_proc(vcp->ctx))) {
	8511	KAUTH_DEBUG("%p Override DENY due to entitlement", vcp->vp);
	8512	return 0;
	8513	}
	8514	KAUTH_DEBUG("%p DENIED - by ACL", vcp->vp);
	8515	return EACCES; /* deny, deny, counter-allege */
	8516	case KAUTH_RESULT_ALLOW:
	8517	KAUTH_DEBUG("%p ALLOWED - all rights granted by ACL", vcp->vp);
	8518	return 0;
	8519	case KAUTH_RESULT_DEFER:
	8520	default:
	8521	/* Effectively the same as !delete_child_denied */
	8522	KAUTH_DEBUG("%p DEFERRED - directory ACL", vcp->vp);
	8523	break;
	8524	}
	8525
	8526	*found_deny = eval.ae_found_deny;
	8527
	8528	/* fall through and evaluate residual rights */
	8529	} else {
	8530	/* no ACL, everything is residual */
	8531	eval.ae_residual = acl_rights;
	8532	}
	8533
	8534	/*
	8535	* Grant residual rights that have been pre-authorized.
	8536	*/
	8537	eval.ae_residual &= ~preauth_rights;
	8538
	8539	/*
	8540	* We grant WRITE_ATTRIBUTES to the owner if it hasn't been denied.
	8541	*/
	8542	if (vauth_file_owner(vcp)) {
	8543	eval.ae_residual &= ~KAUTH_VNODE_WRITE_ATTRIBUTES;
	8544	}
	8545
	8546	if (eval.ae_residual == 0) {
	8547	KAUTH_DEBUG("%p ALLOWED - rights already authorized", vcp->vp);
	8548	return 0;
	8549	}
	8550
	8551	/*
	8552	* Bail if we have residual rights that can't be granted by posix permissions,
	8553	* or aren't presumed granted at this point.
	8554	*
	8555	* XXX these can be collapsed for performance
	8556	*/
	8557	if (eval.ae_residual & KAUTH_VNODE_CHANGE_OWNER) {
	8558	KAUTH_DEBUG("%p DENIED - CHANGE_OWNER not permitted", vcp->vp);
	8559	return EACCES;
	8560	}
	8561	if (eval.ae_residual & KAUTH_VNODE_WRITE_SECURITY) {
	8562	KAUTH_DEBUG("%p DENIED - WRITE_SECURITY not permitted", vcp->vp);
	8563	return EACCES;
	8564	}
	8565
	8566	#if DIAGNOSTIC
	8567	if (eval.ae_residual & KAUTH_VNODE_DELETE) {
	8568	panic("vnode_authorize: can't be checking delete permission here");
	8569	}
	8570	#endif
	8571
	8572	/*
	8573	* Compute the fallback posix permissions that will satisfy the remaining
	8574	* rights.
	8575	*/
	8576	posix_action = 0;
	8577	if (eval.ae_residual & (KAUTH_VNODE_READ_DATA \|
	8578	KAUTH_VNODE_LIST_DIRECTORY \|
	8579	KAUTH_VNODE_READ_EXTATTRIBUTES)) {
	8580	posix_action \|= VREAD;
	8581	}
	8582	if (eval.ae_residual & (KAUTH_VNODE_WRITE_DATA \|
	8583	KAUTH_VNODE_ADD_FILE \|
	8584	KAUTH_VNODE_ADD_SUBDIRECTORY \|
	8585	KAUTH_VNODE_DELETE_CHILD \|
	8586	KAUTH_VNODE_WRITE_ATTRIBUTES \|
	8587	KAUTH_VNODE_WRITE_EXTATTRIBUTES)) {
	8588	posix_action \|= VWRITE;
	8589	}
	8590	if (eval.ae_residual & (KAUTH_VNODE_EXECUTE \|
	8591	KAUTH_VNODE_SEARCH)) {
	8592	posix_action \|= VEXEC;
	8593	}
	8594
	8595	if (posix_action != 0) {
	8596	return vnode_authorize_posix(vcp, posix_action, 0 /* !on_dir */);
	8597	} else {
	8598	KAUTH_DEBUG("%p ALLOWED - residual rights %s%s%s%s%s%s%s%s%s%s%s%s%s%s granted due to no posix mapping",
	8599	vcp->vp,
	8600	(eval.ae_residual & KAUTH_VNODE_READ_DATA)
	8601	? vnode_isdir(vcp->vp) ? " LIST_DIRECTORY" : " READ_DATA" : "",
	8602	(eval.ae_residual & KAUTH_VNODE_WRITE_DATA)
	8603	? vnode_isdir(vcp->vp) ? " ADD_FILE" : " WRITE_DATA" : "",
	8604	(eval.ae_residual & KAUTH_VNODE_EXECUTE)
	8605	? vnode_isdir(vcp->vp) ? " SEARCH" : " EXECUTE" : "",
	8606	(eval.ae_residual & KAUTH_VNODE_DELETE)
	8607	? " DELETE" : "",
	8608	(eval.ae_residual & KAUTH_VNODE_APPEND_DATA)
	8609	? vnode_isdir(vcp->vp) ? " ADD_SUBDIRECTORY" : " APPEND_DATA" : "",
	8610	(eval.ae_residual & KAUTH_VNODE_DELETE_CHILD)
	8611	? " DELETE_CHILD" : "",
	8612	(eval.ae_residual & KAUTH_VNODE_READ_ATTRIBUTES)
	8613	? " READ_ATTRIBUTES" : "",
	8614	(eval.ae_residual & KAUTH_VNODE_WRITE_ATTRIBUTES)
	8615	? " WRITE_ATTRIBUTES" : "",
	8616	(eval.ae_residual & KAUTH_VNODE_READ_EXTATTRIBUTES)
	8617	? " READ_EXTATTRIBUTES" : "",
	8618	(eval.ae_residual & KAUTH_VNODE_WRITE_EXTATTRIBUTES)
	8619	? " WRITE_EXTATTRIBUTES" : "",
	8620	(eval.ae_residual & KAUTH_VNODE_READ_SECURITY)
	8621	? " READ_SECURITY" : "",
	8622	(eval.ae_residual & KAUTH_VNODE_WRITE_SECURITY)
	8623	? " WRITE_SECURITY" : "",
	8624	(eval.ae_residual & KAUTH_VNODE_CHECKIMMUTABLE)
	8625	? " CHECKIMMUTABLE" : "",
	8626	(eval.ae_residual & KAUTH_VNODE_CHANGE_OWNER)
	8627	? " CHANGE_OWNER" : "");
	8628	}
	8629
	8630	/*
	8631	* Lack of required Posix permissions implies no reason to deny access.
	8632	*/
	8633	return 0;
	8634	}
	8635
	8636	/*
	8637	* Check for file immutability.
	8638	*/
	8639	static int
	8640	vnode_authorize_checkimmutable(mount_t mp, vauth_ctx vcp,
	8641	struct vnode_attr *vap, int rights, int ignore)
	8642	{
	8643	int error;
	8644	int append;
	8645
	8646	/*
	8647	* Perform immutability checks for operations that change data.
	8648	*
	8649	* Sockets, fifos and devices require special handling.
	8650	*/
	8651	switch (vap->va_type) {
	8652	case VSOCK:
	8653	case VFIFO:
	8654	case VBLK:
	8655	case VCHR:
	8656	/*
	8657	* Writing to these nodes does not change the filesystem data,
	8658	* so forget that it's being tried.
	8659	*/
	8660	rights &= ~KAUTH_VNODE_WRITE_DATA;
	8661	break;
	8662	default:
	8663	break;
	8664	}
	8665
	8666	error = 0;
	8667	if (rights & KAUTH_VNODE_WRITE_RIGHTS) {
	8668	/* check per-filesystem options if possible */
	8669	if (mp != NULL) {
	8670	/* check for no-EA filesystems */
	8671	if ((rights & KAUTH_VNODE_WRITE_EXTATTRIBUTES) &&
	8672	(vfs_flags(mp) & MNT_NOUSERXATTR)) {
	8673	KAUTH_DEBUG("%p DENIED - filesystem disallowed extended attributes", vap);
	8674	error = EACCES; /* User attributes disabled */
	8675	goto out;
	8676	}
	8677	}
	8678
	8679	/*
	8680	* check for file immutability. first, check if the requested rights are
	8681	* allowable for a UF_APPEND file.
	8682	*/
	8683	append = 0;
	8684	if (vap->va_type == VDIR) {
	8685	if ((rights & (KAUTH_VNODE_ADD_FILE \| KAUTH_VNODE_ADD_SUBDIRECTORY \| KAUTH_VNODE_WRITE_EXTATTRIBUTES)) == rights) {
	8686	append = 1;
	8687	}
	8688	} else {
	8689	if ((rights & (KAUTH_VNODE_APPEND_DATA \| KAUTH_VNODE_WRITE_EXTATTRIBUTES)) == rights) {
	8690	append = 1;
	8691	}
	8692	}
	8693	if ((error = vnode_immutable(vap, append, ignore)) != 0) {
	8694	if (error && !ignore) {
	8695	/*
	8696	* In case of a rename, we want to check ownership for dvp as well.
	8697	*/
	8698	int owner = 0;
	8699	if (rights & KAUTH_VNODE_DELETE_CHILD && vcp->dvp != NULL) {
	8700	owner = vauth_file_owner(vcp) && vauth_dir_owner(vcp);
	8701	} else {
	8702	owner = vauth_file_owner(vcp);
	8703	}
	8704	if (owner && proc_ignores_node_permissions(vfs_context_proc(vcp->ctx))) {
	8705	error = vnode_immutable(vap, append, 1);
	8706	}
	8707	}
	8708	}
	8709	if (error) {
	8710	KAUTH_DEBUG("%p DENIED - file is immutable", vap);
	8711	goto out;
	8712	}
	8713	}
	8714	out:
	8715	return error;
	8716	}
	8717
	8718	/*
	8719	* Handle authorization actions for filesystems that advertise that the
	8720	* server will be enforcing.
	8721	*
	8722	* Returns: 0 Authorization should be handled locally
	8723	* 1 Authorization was handled by the FS
	8724	*
	8725	* Note: Imputed returns will only occur if the authorization request
	8726	* was handled by the FS.
	8727	*
	8728	* Imputed: *resultp, modified Return code from FS when the request is
	8729	* handled by the FS.
	8730	* VNOP_ACCESS:???
	8731	* VNOP_OPEN:???
	8732	*/
	8733	static int
	8734	vnode_authorize_opaque(vnode_t vp, int *resultp, kauth_action_t action, vfs_context_t ctx)
	8735	{
	8736	int error;
	8737
	8738	/*
	8739	* If the vp is a device node, socket or FIFO it actually represents a local
	8740	* endpoint, so we need to handle it locally.
	8741	*/
	8742	switch (vp->v_type) {
	8743	case VBLK:
	8744	case VCHR:
	8745	case VSOCK:
	8746	case VFIFO:
	8747	return 0;
	8748	default:
	8749	break;
	8750	}
	8751
	8752	/*
	8753	* In the advisory request case, if the filesystem doesn't think it's reliable
	8754	* we will attempt to formulate a result ourselves based on VNOP_GETATTR data.
	8755	*/
	8756	if ((action & KAUTH_VNODE_ACCESS) && !vfs_authopaqueaccess(vp->v_mount)) {
	8757	return 0;
	8758	}
	8759
	8760	/*
	8761	* Let the filesystem have a say in the matter. It's OK for it to not implemnent
	8762	* VNOP_ACCESS, as most will authorise inline with the actual request.
	8763	*/
	8764	if ((error = VNOP_ACCESS(vp, action, ctx)) != ENOTSUP) {
	8765	*resultp = error;
	8766	KAUTH_DEBUG("%p DENIED - opaque filesystem VNOP_ACCESS denied access", vp);
	8767	return 1;
	8768	}
	8769
	8770	/*
	8771	* Typically opaque filesystems do authorisation in-line, but exec is a special case. In
	8772	* order to be reasonably sure that exec will be permitted, we try a bit harder here.
	8773	*/
	8774	if ((action & KAUTH_VNODE_EXECUTE) && (vp->v_type == VREG)) {
	8775	/* try a VNOP_OPEN for readonly access */
	8776	if ((error = VNOP_OPEN(vp, FREAD, ctx)) != 0) {
	8777	*resultp = error;
	8778	KAUTH_DEBUG("%p DENIED - EXECUTE denied because file could not be opened readonly", vp);
	8779	return 1;
	8780	}
	8781	VNOP_CLOSE(vp, FREAD, ctx);
	8782	}
	8783
	8784	/*
	8785	* We don't have any reason to believe that the request has to be denied at this point,
	8786	* so go ahead and allow it.
	8787	*/
	8788	*resultp = 0;
	8789	KAUTH_DEBUG("%p ALLOWED - bypassing access check for non-local filesystem", vp);
	8790	return 1;
	8791	}
	8792
	8793
	8794
	8795
	8796	/*
	8797	* Returns: KAUTH_RESULT_ALLOW
	8798	* KAUTH_RESULT_DENY
	8799	*
	8800	* Imputed: *arg3, modified Error code in the deny case
	8801	* EROFS Read-only file system
	8802	* EACCES Permission denied
	8803	* EPERM Operation not permitted [no execute]
	8804	* vnode_getattr:ENOMEM Not enough space [only if has filesec]
	8805	* vnode_getattr:???
	8806	* vnode_authorize_opaque:*arg2 ???
	8807	* vnode_authorize_checkimmutable:???
	8808	* vnode_authorize_delete:???
	8809	* vnode_authorize_simple:???
	8810	*/
	8811
	8812
	8813	static int
	8814	vnode_authorize_callback(__unused kauth_cred_t cred, __unused void *idata,
	8815	kauth_action_t action, uintptr_t arg0, uintptr_t arg1, uintptr_t arg2,
	8816	uintptr_t arg3)
	8817	{
	8818	vfs_context_t ctx;
	8819	vnode_t cvp = NULLVP;
	8820	vnode_t vp, dvp;
	8821	int result = KAUTH_RESULT_DENY;
	8822	int parent_iocount = 0;
	8823	int parent_action; /* In case we need to use namedstream's data fork for cached rights*/
	8824
	8825	ctx = (vfs_context_t)arg0;
	8826	vp = (vnode_t)arg1;
	8827	dvp = (vnode_t)arg2;
	8828
	8829	/*
	8830	* if there are 2 vnodes passed in, we don't know at
	8831	* this point which rights to look at based on the
	8832	* combined action being passed in... defer until later...
	8833	* otherwise check the kauth 'rights' cache hung
	8834	* off of the vnode we're interested in... if we've already
	8835	* been granted the right we're currently interested in,
	8836	* we can just return success... otherwise we'll go through
	8837	* the process of authorizing the requested right(s)... if that
	8838	* succeeds, we'll add the right(s) to the cache.
	8839	* VNOP_SETATTR and VNOP_SETXATTR will invalidate this cache
	8840	*/
	8841	if (dvp && vp) {
	8842	goto defer;
	8843	}
	8844	if (dvp) {
	8845	cvp = dvp;
	8846	} else {
	8847	/*
	8848	* For named streams on local-authorization volumes, rights are cached on the parent;
	8849	* authorization is determined by looking at the parent's properties anyway, so storing
	8850	* on the parent means that we don't recompute for the named stream and that if
	8851	* we need to flush rights (e.g. on VNOP_SETATTR()) we don't need to track down the
	8852	* stream to flush its cache separately. If we miss in the cache, then we authorize
	8853	* as if there were no cached rights (passing the named stream vnode and desired rights to
	8854	* vnode_authorize_callback_int()).
	8855	*
	8856	* On an opaquely authorized volume, we don't know the relationship between the
	8857	* data fork's properties and the rights granted on a stream. Thus, named stream vnodes
	8858	* on such a volume are authorized directly (rather than using the parent) and have their
	8859	* own caches. When a named stream vnode is created, we mark the parent as having a named
	8860	* stream. On a VNOP_SETATTR() for the parent that may invalidate cached authorization, we
	8861	* find the stream and flush its cache.
	8862	*/
	8863	if (vnode_isnamedstream(vp) && (!vfs_authopaque(vp->v_mount))) {
	8864	cvp = vnode_getparent(vp);
	8865	if (cvp != NULLVP) {
	8866	parent_iocount = 1;
	8867	} else {
	8868	cvp = NULL;
	8869	goto defer; /* If we can't use the parent, take the slow path */
	8870	}
	8871
	8872	/* Have to translate some actions */
	8873	parent_action = action;
	8874	if (parent_action & KAUTH_VNODE_READ_DATA) {
	8875	parent_action &= ~KAUTH_VNODE_READ_DATA;
	8876	parent_action \|= KAUTH_VNODE_READ_EXTATTRIBUTES;
	8877	}
	8878	if (parent_action & KAUTH_VNODE_WRITE_DATA) {
	8879	parent_action &= ~KAUTH_VNODE_WRITE_DATA;
	8880	parent_action \|= KAUTH_VNODE_WRITE_EXTATTRIBUTES;
	8881	}
	8882	} else {
	8883	cvp = vp;
	8884	}
	8885	}
	8886
	8887	if (vnode_cache_is_authorized(cvp, ctx, parent_iocount ? parent_action : action) == TRUE) {
	8888	result = KAUTH_RESULT_ALLOW;
	8889	goto out;
	8890	}
	8891	defer:
	8892	result = vnode_authorize_callback_int(action, ctx, vp, dvp, (int *)arg3);
	8893
	8894	if (result == KAUTH_RESULT_ALLOW && cvp != NULLVP) {
	8895	KAUTH_DEBUG("%p - caching action = %x", cvp, action);
	8896	vnode_cache_authorized_action(cvp, ctx, action);
	8897	}
	8898
	8899	out:
	8900	if (parent_iocount) {
	8901	vnode_put(cvp);
	8902	}
	8903
	8904	return result;
	8905	}
	8906
	8907	static int
	8908	vnode_attr_authorize_internal(vauth_ctx vcp, mount_t mp,
	8909	kauth_ace_rights_t rights, int is_suser, boolean_t *found_deny,
	8910	int noimmutable, int parent_authorized_for_delete_child)
	8911	{
	8912	int result;
	8913
	8914	/*
	8915	* Check for immutability.
	8916	*
	8917	* In the deletion case, parent directory immutability vetoes specific
	8918	* file rights.
	8919	*/
	8920	if ((result = vnode_authorize_checkimmutable(mp, vcp, vcp->vap, rights,
	8921	noimmutable)) != 0) {
	8922	goto out;
	8923	}
	8924
	8925	if ((rights & KAUTH_VNODE_DELETE) &&
	8926	!parent_authorized_for_delete_child) {
	8927	result = vnode_authorize_checkimmutable(mp, vcp, vcp->dvap,
	8928	KAUTH_VNODE_DELETE_CHILD, 0);
	8929	if (result) {
	8930	goto out;
	8931	}
	8932	}
	8933
	8934	/*
	8935	* Clear rights that have been authorized by reaching this point, bail if nothing left to
	8936	* check.
	8937	*/
	8938	rights &= ~(KAUTH_VNODE_LINKTARGET \| KAUTH_VNODE_CHECKIMMUTABLE);
	8939	if (rights == 0) {
	8940	goto out;
	8941	}
	8942
	8943	/*
	8944	* If we're not the superuser, authorize based on file properties;
	8945	* note that even if parent_authorized_for_delete_child is TRUE, we
	8946	* need to check on the node itself.
	8947	*/
	8948	if (!is_suser) {
	8949	/* process delete rights */
	8950	if ((rights & KAUTH_VNODE_DELETE) &&
	8951	((result = vnode_authorize_delete(vcp, parent_authorized_for_delete_child)) != 0)) {
	8952	goto out;
	8953	}
	8954
	8955	/* process remaining rights */
	8956	if ((rights & ~KAUTH_VNODE_DELETE) &&
	8957	(result = vnode_authorize_simple(vcp, rights, rights & KAUTH_VNODE_DELETE, found_deny)) != 0) {
	8958	goto out;
	8959	}
	8960	} else {
	8961	/*
	8962	* Execute is only granted to root if one of the x bits is set. This check only
	8963	* makes sense if the posix mode bits are actually supported.
	8964	*/
	8965	if ((rights & KAUTH_VNODE_EXECUTE) &&
	8966	(vcp->vap->va_type == VREG) &&
	8967	VATTR_IS_SUPPORTED(vcp->vap, va_mode) &&
	8968	!(vcp->vap->va_mode & (S_IXUSR \| S_IXGRP \| S_IXOTH))) {
	8969	result = EPERM;
	8970	KAUTH_DEBUG("%p DENIED - root execute requires at least one x bit in 0x%x", vcp, vcp->vap->va_mode);
	8971	goto out;
	8972	}
	8973
	8974	/* Assume that there were DENYs so we don't wrongly cache KAUTH_VNODE_SEARCHBYANYONE */
	8975	*found_deny = TRUE;
	8976
	8977	KAUTH_DEBUG("%p ALLOWED - caller is superuser", vcp);
	8978	}
	8979	out:
	8980	return result;
	8981	}
	8982
	8983	static int
	8984	vnode_authorize_callback_int(kauth_action_t action, vfs_context_t ctx,
	8985	vnode_t vp, vnode_t dvp, int *errorp)
	8986	{
	8987	struct _vnode_authorize_context auth_context;
	8988	vauth_ctx vcp;
	8989	kauth_cred_t cred;
	8990	kauth_ace_rights_t rights;
	8991	struct vnode_attr va, dva;
	8992	int result;
	8993	int noimmutable;
	8994	boolean_t parent_authorized_for_delete_child = FALSE;
	8995	boolean_t found_deny = FALSE;
	8996	boolean_t parent_ref = FALSE;
	8997	boolean_t is_suser = FALSE;
	8998
	8999	vcp = &auth_context;
	9000	vcp->ctx = ctx;
	9001	vcp->vp = vp;
	9002	vcp->dvp = dvp;
	9003	/*
	9004	* Note that we authorize against the context, not the passed cred
	9005	* (the same thing anyway)
	9006	*/
	9007	cred = ctx->vc_ucred;
	9008
	9009	VATTR_INIT(&va);
	9010	vcp->vap = &va;
	9011	VATTR_INIT(&dva);
	9012	vcp->dvap = &dva;
	9013
	9014	vcp->flags = vcp->flags_valid = 0;
	9015
	9016	#if DIAGNOSTIC
	9017	if ((ctx == NULL) \|\| (vp == NULL) \|\| (cred == NULL)) {
	9018	panic("vnode_authorize: bad arguments (context %p vp %p cred %p)", ctx, vp, cred);
	9019	}
	9020	#endif
	9021
	9022	KAUTH_DEBUG("%p AUTH - %s %s%s%s%s%s%s%s%s%s%s%s%s%s%s%s on %s '%s' (0x%x:%p/%p)",
	9023	vp, vfs_context_proc(ctx)->p_comm,
	9024	(action & KAUTH_VNODE_ACCESS) ? "access" : "auth",
	9025	(action & KAUTH_VNODE_READ_DATA) ? vnode_isdir(vp) ? " LIST_DIRECTORY" : " READ_DATA" : "",
	9026	(action & KAUTH_VNODE_WRITE_DATA) ? vnode_isdir(vp) ? " ADD_FILE" : " WRITE_DATA" : "",
	9027	(action & KAUTH_VNODE_EXECUTE) ? vnode_isdir(vp) ? " SEARCH" : " EXECUTE" : "",
	9028	(action & KAUTH_VNODE_DELETE) ? " DELETE" : "",
	9029	(action & KAUTH_VNODE_APPEND_DATA) ? vnode_isdir(vp) ? " ADD_SUBDIRECTORY" : " APPEND_DATA" : "",
	9030	(action & KAUTH_VNODE_DELETE_CHILD) ? " DELETE_CHILD" : "",
	9031	(action & KAUTH_VNODE_READ_ATTRIBUTES) ? " READ_ATTRIBUTES" : "",
	9032	(action & KAUTH_VNODE_WRITE_ATTRIBUTES) ? " WRITE_ATTRIBUTES" : "",
	9033	(action & KAUTH_VNODE_READ_EXTATTRIBUTES) ? " READ_EXTATTRIBUTES" : "",
	9034	(action & KAUTH_VNODE_WRITE_EXTATTRIBUTES) ? " WRITE_EXTATTRIBUTES" : "",
	9035	(action & KAUTH_VNODE_READ_SECURITY) ? " READ_SECURITY" : "",
	9036	(action & KAUTH_VNODE_WRITE_SECURITY) ? " WRITE_SECURITY" : "",
	9037	(action & KAUTH_VNODE_CHANGE_OWNER) ? " CHANGE_OWNER" : "",
	9038	(action & KAUTH_VNODE_NOIMMUTABLE) ? " (noimmutable)" : "",
	9039	vnode_isdir(vp) ? "directory" : "file",
	9040	vp->v_name ? vp->v_name : "<NULL>", action, vp, dvp);
	9041
	9042	/*
	9043	* Extract the control bits from the action, everything else is
	9044	* requested rights.
	9045	*/
	9046	noimmutable = (action & KAUTH_VNODE_NOIMMUTABLE) ? 1 : 0;
	9047	rights = action & ~(KAUTH_VNODE_ACCESS \| KAUTH_VNODE_NOIMMUTABLE);
	9048
	9049	if (rights & KAUTH_VNODE_DELETE) {
	9050	#if DIAGNOSTIC
	9051	if (dvp == NULL) {
	9052	panic("vnode_authorize: KAUTH_VNODE_DELETE test requires a directory");
	9053	}
	9054	#endif
	9055	/*
	9056	* check to see if we've already authorized the parent
	9057	* directory for deletion of its children... if so, we
	9058	* can skip a whole bunch of work... we will still have to
	9059	* authorize that this specific child can be removed
	9060	*/
	9061	if (vnode_cache_is_authorized(dvp, ctx, KAUTH_VNODE_DELETE_CHILD) == TRUE) {
	9062	parent_authorized_for_delete_child = TRUE;
	9063	}
	9064	} else {
	9065	vcp->dvp = NULLVP;
	9066	vcp->dvap = NULL;
	9067	}
	9068
	9069	/*
	9070	* Check for read-only filesystems.
	9071	*/
	9072	if ((rights & KAUTH_VNODE_WRITE_RIGHTS) &&
	9073	(vp->v_mount->mnt_flag & MNT_RDONLY) &&
	9074	((vp->v_type == VREG) \|\| (vp->v_type == VDIR) \|\|
	9075	(vp->v_type == VLNK) \|\| (vp->v_type == VCPLX) \|\|
	9076	(rights & KAUTH_VNODE_DELETE) \|\| (rights & KAUTH_VNODE_DELETE_CHILD))) {
	9077	result = EROFS;
	9078	goto out;
	9079	}
	9080
	9081	/*
	9082	* Check for noexec filesystems.
	9083	*/
	9084	if ((rights & KAUTH_VNODE_EXECUTE) && (vp->v_type == VREG) && (vp->v_mount->mnt_flag & MNT_NOEXEC)) {
	9085	result = EACCES;
	9086	goto out;
	9087	}
	9088
	9089	/*
	9090	* Handle cases related to filesystems with non-local enforcement.
	9091	* This call can return 0, in which case we will fall through to perform a
	9092	* check based on VNOP_GETATTR data. Otherwise it returns 1 and sets
	9093	* an appropriate result, at which point we can return immediately.
	9094	*/
	9095	if ((vp->v_mount->mnt_kern_flag & MNTK_AUTH_OPAQUE) && vnode_authorize_opaque(vp, &result, action, ctx)) {
	9096	goto out;
	9097	}
	9098
	9099	/*
	9100	* If the vnode is a namedstream (extended attribute) data vnode (eg.
	9101	* a resource fork), _DATA becomes _EXTATTRIBUTES.
	9102	*/
	9103	if (vnode_isnamedstream(vp)) {
	9104	if (rights & KAUTH_VNODE_READ_DATA) {
	9105	rights &= ~KAUTH_VNODE_READ_DATA;
	9106	rights \|= KAUTH_VNODE_READ_EXTATTRIBUTES;
	9107	}
	9108	if (rights & KAUTH_VNODE_WRITE_DATA) {
	9109	rights &= ~KAUTH_VNODE_WRITE_DATA;
	9110	rights \|= KAUTH_VNODE_WRITE_EXTATTRIBUTES;
	9111	}
	9112
	9113	/*
	9114	* Point 'vp' to the namedstream's parent for ACL checking
	9115	*/
	9116	if ((vp->v_parent != NULL) &&
	9117	(vget_internal(vp->v_parent, 0, VNODE_NODEAD \| VNODE_DRAINO) == 0)) {
	9118	parent_ref = TRUE;
	9119	vcp->vp = vp = vp->v_parent;
	9120	}
	9121	}
	9122
	9123	if (vfs_context_issuser(ctx)) {
	9124	/*
	9125	* if we're not asking for execute permissions or modifications,
	9126	* then we're done, this action is authorized.
	9127	*/
	9128	if (!(rights & (KAUTH_VNODE_EXECUTE \| KAUTH_VNODE_WRITE_RIGHTS))) {
	9129	goto success;
	9130	}
	9131
	9132	is_suser = TRUE;
	9133	}
	9134
	9135	/*
	9136	* Get vnode attributes and extended security information for the vnode
	9137	* and directory if required.
	9138	*
	9139	* If we're root we only want mode bits and flags for checking
	9140	* execute and immutability.
	9141	*/
	9142	VATTR_WANTED(&va, va_mode);
	9143	VATTR_WANTED(&va, va_flags);
	9144	if (!is_suser) {
	9145	VATTR_WANTED(&va, va_uid);
	9146	VATTR_WANTED(&va, va_gid);
	9147	VATTR_WANTED(&va, va_acl);
	9148	}
	9149	if ((result = vnode_getattr(vp, &va, ctx)) != 0) {
	9150	KAUTH_DEBUG("%p ERROR - failed to get vnode attributes - %d", vp, result);
	9151	goto out;
	9152	}
	9153	VATTR_WANTED(&va, va_type);
	9154	VATTR_RETURN(&va, va_type, vnode_vtype(vp));
	9155
	9156	if (vcp->dvp) {
	9157	VATTR_WANTED(&dva, va_mode);
	9158	VATTR_WANTED(&dva, va_flags);
	9159	if (!is_suser) {
	9160	VATTR_WANTED(&dva, va_uid);
	9161	VATTR_WANTED(&dva, va_gid);
	9162	VATTR_WANTED(&dva, va_acl);
	9163	}
	9164	if ((result = vnode_getattr(vcp->dvp, &dva, ctx)) != 0) {
	9165	KAUTH_DEBUG("%p ERROR - failed to get directory vnode attributes - %d", vp, result);
	9166	goto out;
	9167	}
	9168	VATTR_WANTED(&dva, va_type);
	9169	VATTR_RETURN(&dva, va_type, vnode_vtype(vcp->dvp));
	9170	}
	9171
	9172	result = vnode_attr_authorize_internal(vcp, vp->v_mount, rights, is_suser,
	9173	&found_deny, noimmutable, parent_authorized_for_delete_child);
	9174	out:
	9175	if (VATTR_IS_SUPPORTED(&va, va_acl) && (va.va_acl != NULL)) {
	9176	kauth_acl_free(va.va_acl);
	9177	}
	9178	if (VATTR_IS_SUPPORTED(&dva, va_acl) && (dva.va_acl != NULL)) {
	9179	kauth_acl_free(dva.va_acl);
	9180	}
	9181
	9182	if (result) {
	9183	if (parent_ref) {
	9184	vnode_put(vp);
	9185	}
	9186	*errorp = result;
	9187	KAUTH_DEBUG("%p DENIED - auth denied", vp);
	9188	return KAUTH_RESULT_DENY;
	9189	}
	9190	if ((rights & KAUTH_VNODE_SEARCH) && found_deny == FALSE && vp->v_type == VDIR) {
	9191	/*
	9192	* if we were successfully granted the right to search this directory
	9193	* and there were NO ACL DENYs for search and the posix permissions also don't
	9194	* deny execute, we can synthesize a global right that allows anyone to
	9195	* traverse this directory during a pathname lookup without having to
	9196	* match the credential associated with this cache of rights.
	9197	*
	9198	* Note that we can correctly cache KAUTH_VNODE_SEARCHBYANYONE
	9199	* only if we actually check ACLs which we don't for root. As
	9200	* a workaround, the lookup fast path checks for root.
	9201	*/
	9202	if (!VATTR_IS_SUPPORTED(&va, va_mode) \|\|
	9203	((va.va_mode & (S_IXUSR \| S_IXGRP \| S_IXOTH)) ==
	9204	(S_IXUSR \| S_IXGRP \| S_IXOTH))) {
	9205	vnode_cache_authorized_action(vp, ctx, KAUTH_VNODE_SEARCHBYANYONE);
	9206	}
	9207	}
	9208	success:
	9209	if (parent_ref) {
	9210	vnode_put(vp);
	9211	}
	9212
	9213	/*
	9214	* Note that this implies that we will allow requests for no rights, as well as
	9215	* for rights that we do not recognise. There should be none of these.
	9216	*/
	9217	KAUTH_DEBUG("%p ALLOWED - auth granted", vp);
	9218	return KAUTH_RESULT_ALLOW;
	9219	}
	9220
	9221	int
	9222	vnode_attr_authorize_init(struct vnode_attr vap, struct vnode_attr dvap,
	9223	kauth_action_t action, vfs_context_t ctx)
	9224	{
	9225	VATTR_INIT(vap);
	9226	VATTR_WANTED(vap, va_type);
	9227	VATTR_WANTED(vap, va_mode);
	9228	VATTR_WANTED(vap, va_flags);
	9229	if (dvap) {
	9230	VATTR_INIT(dvap);
	9231	if (action & KAUTH_VNODE_DELETE) {
	9232	VATTR_WANTED(dvap, va_type);
	9233	VATTR_WANTED(dvap, va_mode);
	9234	VATTR_WANTED(dvap, va_flags);
	9235	}
	9236	} else if (action & KAUTH_VNODE_DELETE) {
	9237	return EINVAL;
	9238	}
	9239
	9240	if (!vfs_context_issuser(ctx)) {
	9241	VATTR_WANTED(vap, va_uid);
	9242	VATTR_WANTED(vap, va_gid);
	9243	VATTR_WANTED(vap, va_acl);
	9244	if (dvap && (action & KAUTH_VNODE_DELETE)) {
	9245	VATTR_WANTED(dvap, va_uid);
	9246	VATTR_WANTED(dvap, va_gid);
	9247	VATTR_WANTED(dvap, va_acl);
	9248	}
	9249	}
	9250
	9251	return 0;
	9252	}
	9253
	9254	int
	9255	vnode_attr_authorize(struct vnode_attr vap, struct vnode_attr dvap, mount_t mp,
	9256	kauth_action_t action, vfs_context_t ctx)
	9257	{
	9258	struct _vnode_authorize_context auth_context;
	9259	vauth_ctx vcp;
	9260	kauth_ace_rights_t rights;
	9261	int noimmutable;
	9262	boolean_t found_deny;
	9263	boolean_t is_suser = FALSE;
	9264	int result = 0;
	9265
	9266	vcp = &auth_context;
	9267	vcp->ctx = ctx;
	9268	vcp->vp = NULLVP;
	9269	vcp->vap = vap;
	9270	vcp->dvp = NULLVP;
	9271	vcp->dvap = dvap;
	9272	vcp->flags = vcp->flags_valid = 0;
	9273
	9274	noimmutable = (action & KAUTH_VNODE_NOIMMUTABLE) ? 1 : 0;
	9275	rights = action & ~(KAUTH_VNODE_ACCESS \| KAUTH_VNODE_NOIMMUTABLE);
	9276
	9277	/*
	9278	* Check for read-only filesystems.
	9279	*/
	9280	if ((rights & KAUTH_VNODE_WRITE_RIGHTS) &&
	9281	mp && (mp->mnt_flag & MNT_RDONLY) &&
	9282	((vap->va_type == VREG) \|\| (vap->va_type == VDIR) \|\|
	9283	(vap->va_type == VLNK) \|\| (rights & KAUTH_VNODE_DELETE) \|\|
	9284	(rights & KAUTH_VNODE_DELETE_CHILD))) {
	9285	result = EROFS;
	9286	goto out;
	9287	}
	9288
	9289	/*
	9290	* Check for noexec filesystems.
	9291	*/
	9292	if ((rights & KAUTH_VNODE_EXECUTE) &&
	9293	(vap->va_type == VREG) && mp && (mp->mnt_flag & MNT_NOEXEC)) {
	9294	result = EACCES;
	9295	goto out;
	9296	}
	9297
	9298	if (vfs_context_issuser(ctx)) {
	9299	/*
	9300	* if we're not asking for execute permissions or modifications,
	9301	* then we're done, this action is authorized.
	9302	*/
	9303	if (!(rights & (KAUTH_VNODE_EXECUTE \| KAUTH_VNODE_WRITE_RIGHTS))) {
	9304	goto out;
	9305	}
	9306	is_suser = TRUE;
	9307	} else {
	9308	if (!VATTR_IS_SUPPORTED(vap, va_uid) \|\|
	9309	!VATTR_IS_SUPPORTED(vap, va_gid) \|\|
	9310	(mp && vfs_extendedsecurity(mp) && !VATTR_IS_SUPPORTED(vap, va_acl))) {
	9311	panic("vnode attrs not complete for vnode_attr_authorize\n");
	9312	}
	9313	}
	9314
	9315	if (mp) {
	9316	vnode_attr_handle_mnt_ignore_ownership(vap, mp, ctx);
	9317	}
	9318
	9319	result = vnode_attr_authorize_internal(vcp, mp, rights, is_suser,
	9320	&found_deny, noimmutable, FALSE);
	9321
	9322	if (result == EPERM) {
	9323	result = EACCES;
	9324	}
	9325	out:
	9326	return result;
	9327	}
	9328
	9329
	9330	int
	9331	vnode_authattr_new(vnode_t dvp, struct vnode_attr *vap, int noauth, vfs_context_t ctx)
	9332	{
	9333	return vnode_authattr_new_internal(dvp, vap, noauth, NULL, ctx);
	9334	}
	9335
	9336	/*
	9337	* Check that the attribute information in vattr can be legally applied to
	9338	* a new file by the context.
	9339	*/
	9340	static int
	9341	vnode_authattr_new_internal(vnode_t dvp, struct vnode_attr vap, int noauth, uint32_t defaulted_fieldsp, vfs_context_t ctx)
	9342	{
	9343	int error;
	9344	int has_priv_suser, ismember, defaulted_owner, defaulted_group, defaulted_mode;
	9345	uint32_t inherit_flags;
	9346	kauth_cred_t cred;
	9347	guid_t changer;
	9348	mount_t dmp;
	9349	struct vnode_attr dva;
	9350
	9351	error = 0;
	9352
	9353	if (defaulted_fieldsp) {
	9354	*defaulted_fieldsp = 0;
	9355	}
	9356
	9357	defaulted_owner = defaulted_group = defaulted_mode = 0;
	9358
	9359	inherit_flags = 0;
	9360
	9361	/*
	9362	* Require that the filesystem support extended security to apply any.
	9363	*/
	9364	if (!vfs_extendedsecurity(dvp->v_mount) &&
	9365	(VATTR_IS_ACTIVE(vap, va_acl) \|\| VATTR_IS_ACTIVE(vap, va_uuuid) \|\| VATTR_IS_ACTIVE(vap, va_guuid))) {
	9366	error = EINVAL;
	9367	goto out;
	9368	}
	9369
	9370	/*
	9371	* Default some fields.
	9372	*/
	9373	dmp = dvp->v_mount;
	9374
	9375	/*
	9376	* If the filesystem is mounted IGNORE_OWNERSHIP and an explicit owner is set, that
	9377	* owner takes ownership of all new files.
	9378	*/
	9379	if ((dmp->mnt_flag & MNT_IGNORE_OWNERSHIP) && (dmp->mnt_fsowner != KAUTH_UID_NONE)) {
	9380	VATTR_SET(vap, va_uid, dmp->mnt_fsowner);
	9381	defaulted_owner = 1;
	9382	} else {
	9383	if (!VATTR_IS_ACTIVE(vap, va_uid)) {
	9384	/* default owner is current user */
	9385	VATTR_SET(vap, va_uid, kauth_cred_getuid(vfs_context_ucred(ctx)));
	9386	defaulted_owner = 1;
	9387	}
	9388	}
	9389
	9390	/*
	9391	* We need the dvp's va_flags and may need the gid of the directory,
	9392	* we ask for both here.
	9393	*/
	9394	VATTR_INIT(&dva);
	9395	VATTR_WANTED(&dva, va_gid);
	9396	VATTR_WANTED(&dva, va_flags);
	9397	if ((error = vnode_getattr(dvp, &dva, ctx)) != 0) {
	9398	goto out;
	9399	}
	9400
	9401	/*
	9402	* If the filesystem is mounted IGNORE_OWNERSHIP and an explicit grouo is set, that
	9403	* group takes ownership of all new files.
	9404	*/
	9405	if ((dmp->mnt_flag & MNT_IGNORE_OWNERSHIP) && (dmp->mnt_fsgroup != KAUTH_GID_NONE)) {
	9406	VATTR_SET(vap, va_gid, dmp->mnt_fsgroup);
	9407	defaulted_group = 1;
	9408	} else {
	9409	if (!VATTR_IS_ACTIVE(vap, va_gid)) {
	9410	/* default group comes from parent object, fallback to current user */
	9411	if (VATTR_IS_SUPPORTED(&dva, va_gid)) {
	9412	VATTR_SET(vap, va_gid, dva.va_gid);
	9413	} else {
	9414	VATTR_SET(vap, va_gid, kauth_cred_getgid(vfs_context_ucred(ctx)));
	9415	}
	9416	defaulted_group = 1;
	9417	}
	9418	}
	9419
	9420	if (!VATTR_IS_ACTIVE(vap, va_flags)) {
	9421	VATTR_SET(vap, va_flags, 0);
	9422	}
	9423
	9424	/* Determine if SF_RESTRICTED should be inherited from the parent
	9425	* directory. */
	9426	if (VATTR_IS_SUPPORTED(&dva, va_flags)) {
	9427	inherit_flags = dva.va_flags & (UF_DATAVAULT \| SF_RESTRICTED);
	9428	}
	9429
	9430	/* default mode is everything, masked with current umask */
	9431	if (!VATTR_IS_ACTIVE(vap, va_mode)) {
	9432	VATTR_SET(vap, va_mode, ACCESSPERMS & ~vfs_context_proc(ctx)->p_fd->fd_cmask);
	9433	KAUTH_DEBUG("ATTR - defaulting new file mode to %o from umask %o", vap->va_mode, vfs_context_proc(ctx)->p_fd->fd_cmask);
	9434	defaulted_mode = 1;
	9435	}
	9436	/* set timestamps to now */
	9437	if (!VATTR_IS_ACTIVE(vap, va_create_time)) {
	9438	nanotime(&vap->va_create_time);
	9439	VATTR_SET_ACTIVE(vap, va_create_time);
	9440	}
	9441
	9442	/*
	9443	* Check for attempts to set nonsensical fields.
	9444	*/
	9445	if (vap->va_active & ~VNODE_ATTR_NEWOBJ) {
	9446	error = EINVAL;
	9447	KAUTH_DEBUG("ATTR - ERROR - attempt to set unsupported new-file attributes %llx",
	9448	vap->va_active & ~VNODE_ATTR_NEWOBJ);
	9449	goto out;
	9450	}
	9451
	9452	/*
	9453	* Quickly check for the applicability of any enforcement here.
	9454	* Tests below maintain the integrity of the local security model.
	9455	*/
	9456	if (vfs_authopaque(dvp->v_mount)) {
	9457	goto out;
	9458	}
	9459
	9460	/*
	9461	* We need to know if the caller is the superuser, or if the work is
	9462	* otherwise already authorised.
	9463	*/
	9464	cred = vfs_context_ucred(ctx);
	9465	if (noauth) {
	9466	/* doing work for the kernel */
	9467	has_priv_suser = 1;
	9468	} else {
	9469	has_priv_suser = vfs_context_issuser(ctx);
	9470	}
	9471
	9472
	9473	if (VATTR_IS_ACTIVE(vap, va_flags)) {
	9474	vap->va_flags &= ~SF_SYNTHETIC;
	9475	if (has_priv_suser) {
	9476	if ((vap->va_flags & (UF_SETTABLE \| SF_SETTABLE)) != vap->va_flags) {
	9477	error = EPERM;
	9478	KAUTH_DEBUG(" DENIED - superuser attempt to set illegal flag(s)");
	9479	goto out;
	9480	}
	9481	} else {
	9482	if ((vap->va_flags & UF_SETTABLE) != vap->va_flags) {
	9483	error = EPERM;
	9484	KAUTH_DEBUG(" DENIED - user attempt to set illegal flag(s)");
	9485	goto out;
	9486	}
	9487	}
	9488	}
	9489
	9490	/* if not superuser, validate legality of new-item attributes */
	9491	if (!has_priv_suser) {
	9492	if (!defaulted_mode && VATTR_IS_ACTIVE(vap, va_mode)) {
	9493	/* setgid? */
	9494	if (vap->va_mode & S_ISGID) {
	9495	if ((error = kauth_cred_ismember_gid(cred, vap->va_gid, &ismember)) != 0) {
	9496	KAUTH_DEBUG("ATTR - ERROR: got %d checking for membership in %d", error, vap->va_gid);
	9497	goto out;
	9498	}
	9499	if (!ismember) {
	9500	KAUTH_DEBUG(" DENIED - can't set SGID bit, not a member of %d", vap->va_gid);
	9501	error = EPERM;
	9502	goto out;
	9503	}
	9504	}
	9505
	9506	/* setuid? */
	9507	if ((vap->va_mode & S_ISUID) && (vap->va_uid != kauth_cred_getuid(cred))) {
	9508	KAUTH_DEBUG("ATTR - ERROR: illegal attempt to set the setuid bit");
	9509	error = EPERM;
	9510	goto out;
	9511	}
	9512	}
	9513	if (!defaulted_owner && (vap->va_uid != kauth_cred_getuid(cred))) {
	9514	KAUTH_DEBUG(" DENIED - cannot create new item owned by %d", vap->va_uid);
	9515	error = EPERM;
	9516	goto out;
	9517	}
	9518	if (!defaulted_group) {
	9519	if ((error = kauth_cred_ismember_gid(cred, vap->va_gid, &ismember)) != 0) {
	9520	KAUTH_DEBUG(" ERROR - got %d checking for membership in %d", error, vap->va_gid);
	9521	goto out;
	9522	}
	9523	if (!ismember) {
	9524	KAUTH_DEBUG(" DENIED - cannot create new item with group %d - not a member", vap->va_gid);
	9525	error = EPERM;
	9526	goto out;
	9527	}
	9528	}
	9529
	9530	/* initialising owner/group UUID */
	9531	if (VATTR_IS_ACTIVE(vap, va_uuuid)) {
	9532	if ((error = kauth_cred_getguid(cred, &changer)) != 0) {
	9533	KAUTH_DEBUG(" ERROR - got %d trying to get caller UUID", error);
	9534	/* XXX ENOENT here - no GUID - should perhaps become EPERM */
	9535	goto out;
	9536	}
	9537	if (!kauth_guid_equal(&vap->va_uuuid, &changer)) {
	9538	KAUTH_DEBUG(" ERROR - cannot create item with supplied owner UUID - not us");
	9539	error = EPERM;
	9540	goto out;
	9541	}
	9542	}
	9543	if (VATTR_IS_ACTIVE(vap, va_guuid)) {
	9544	if ((error = kauth_cred_ismember_guid(cred, &vap->va_guuid, &ismember)) != 0) {
	9545	KAUTH_DEBUG(" ERROR - got %d trying to check group membership", error);
	9546	goto out;
	9547	}
	9548	if (!ismember) {
	9549	KAUTH_DEBUG(" ERROR - cannot create item with supplied group UUID - not a member");
	9550	error = EPERM;
	9551	goto out;
	9552	}
	9553	}
	9554	}
	9555	out:
	9556	if (inherit_flags) {
	9557	/* Apply SF_RESTRICTED to the file if its parent directory was
	9558	* restricted. This is done at the end so that root is not
	9559	* required if this flag is only set due to inheritance. */
	9560	VATTR_SET(vap, va_flags, (vap->va_flags \| inherit_flags));
	9561	}
	9562	if (defaulted_fieldsp) {
	9563	if (defaulted_mode) {
	9564	*defaulted_fieldsp \|= VATTR_PREPARE_DEFAULTED_MODE;
	9565	}
	9566	if (defaulted_group) {
	9567	*defaulted_fieldsp \|= VATTR_PREPARE_DEFAULTED_GID;
	9568	}
	9569	if (defaulted_owner) {
	9570	*defaulted_fieldsp \|= VATTR_PREPARE_DEFAULTED_UID;
	9571	}
	9572	}
	9573	return error;
	9574	}
	9575
	9576	/*
	9577	* Check that the attribute information in vap can be legally written by the
	9578	* context.
	9579	*
	9580	* Call this when you're not sure about the vnode_attr; either its contents
	9581	* have come from an unknown source, or when they are variable.
	9582	*
	9583	* Returns errno, or zero and sets actionp to the KAUTH_VNODE_ actions that
	9584	* must be authorized to be permitted to write the vattr.
	9585	*/
	9586	int
	9587	vnode_authattr(vnode_t vp, struct vnode_attr vap, kauth_action_t actionp, vfs_context_t ctx)
	9588	{
	9589	struct vnode_attr ova;
	9590	kauth_action_t required_action;
	9591	int error, has_priv_suser, ismember, chowner, chgroup, clear_suid, clear_sgid;
	9592	guid_t changer;
	9593	gid_t group;
	9594	uid_t owner;
	9595	mode_t newmode;
	9596	kauth_cred_t cred;
	9597	uint32_t fdelta;
	9598
	9599	VATTR_INIT(&ova);
	9600	required_action = 0;
	9601	error = 0;
	9602
	9603	/*
	9604	* Quickly check for enforcement applicability.
	9605	*/
	9606	if (vfs_authopaque(vp->v_mount)) {
	9607	goto out;
	9608	}
	9609
	9610	/*
	9611	* Check for attempts to set nonsensical fields.
	9612	*/
	9613	if (vap->va_active & VNODE_ATTR_RDONLY) {
	9614	KAUTH_DEBUG("ATTR - ERROR: attempt to set readonly attribute(s)");
	9615	error = EINVAL;
	9616	goto out;
	9617	}
	9618
	9619	/*
	9620	* We need to know if the caller is the superuser.
	9621	*/
	9622	cred = vfs_context_ucred(ctx);
	9623	has_priv_suser = kauth_cred_issuser(cred);
	9624
	9625	/*
	9626	* If any of the following are changing, we need information from the old file:
	9627	* va_uid
	9628	* va_gid
	9629	* va_mode
	9630	* va_uuuid
	9631	* va_guuid
	9632	*/
	9633	if (VATTR_IS_ACTIVE(vap, va_uid) \|\|
	9634	VATTR_IS_ACTIVE(vap, va_gid) \|\|
	9635	VATTR_IS_ACTIVE(vap, va_mode) \|\|
	9636	VATTR_IS_ACTIVE(vap, va_uuuid) \|\|
	9637	VATTR_IS_ACTIVE(vap, va_guuid)) {
	9638	VATTR_WANTED(&ova, va_mode);
	9639	VATTR_WANTED(&ova, va_uid);
	9640	VATTR_WANTED(&ova, va_gid);
	9641	VATTR_WANTED(&ova, va_uuuid);
	9642	VATTR_WANTED(&ova, va_guuid);
	9643	KAUTH_DEBUG("ATTR - security information changing, fetching existing attributes");
	9644	}
	9645
	9646	/*
	9647	* If timestamps are being changed, we need to know who the file is owned
	9648	* by.
	9649	*/
	9650	if (VATTR_IS_ACTIVE(vap, va_create_time) \|\|
	9651	VATTR_IS_ACTIVE(vap, va_change_time) \|\|
	9652	VATTR_IS_ACTIVE(vap, va_modify_time) \|\|
	9653	VATTR_IS_ACTIVE(vap, va_access_time) \|\|
	9654	VATTR_IS_ACTIVE(vap, va_backup_time) \|\|
	9655	VATTR_IS_ACTIVE(vap, va_addedtime)) {
	9656	VATTR_WANTED(&ova, va_uid);
	9657	#if 0 /* enable this when we support UUIDs as official owners */
	9658	VATTR_WANTED(&ova, va_uuuid);
	9659	#endif
	9660	KAUTH_DEBUG("ATTR - timestamps changing, fetching uid and GUID");
	9661	}
	9662
	9663	/*
	9664	* If flags are being changed, we need the old flags.
	9665	*/
	9666	if (VATTR_IS_ACTIVE(vap, va_flags)) {
	9667	KAUTH_DEBUG("ATTR - flags changing, fetching old flags");
	9668	VATTR_WANTED(&ova, va_flags);
	9669	}
	9670
	9671	/*
	9672	* If ACLs are being changed, we need the old ACLs.
	9673	*/
	9674	if (VATTR_IS_ACTIVE(vap, va_acl)) {
	9675	KAUTH_DEBUG("ATTR - acl changing, fetching old flags");
	9676	VATTR_WANTED(&ova, va_acl);
	9677	}
	9678
	9679	/*
	9680	* If the size is being set, make sure it's not a directory.
	9681	*/
	9682	if (VATTR_IS_ACTIVE(vap, va_data_size)) {
	9683	/* size is only meaningful on regular files, don't permit otherwise */
	9684	if (!vnode_isreg(vp)) {
	9685	KAUTH_DEBUG("ATTR - ERROR: size change requested on non-file");
	9686	error = vnode_isdir(vp) ? EISDIR : EINVAL;
	9687	goto out;
	9688	}
	9689	}
	9690
	9691	/*
	9692	* Get old data.
	9693	*/
	9694	KAUTH_DEBUG("ATTR - fetching old attributes %016llx", ova.va_active);
	9695	if ((error = vnode_getattr(vp, &ova, ctx)) != 0) {
	9696	KAUTH_DEBUG(" ERROR - got %d trying to get attributes", error);
	9697	goto out;
	9698	}
	9699
	9700	/*
	9701	* Size changes require write access to the file data.
	9702	*/
	9703	if (VATTR_IS_ACTIVE(vap, va_data_size)) {
	9704	/* if we can't get the size, or it's different, we need write access */
	9705	KAUTH_DEBUG("ATTR - size change, requiring WRITE_DATA");
	9706	required_action \|= KAUTH_VNODE_WRITE_DATA;
	9707	}
	9708
	9709	/*
	9710	* Changing timestamps?
	9711	*
	9712	* Note that we are only called to authorize user-requested time changes;
	9713	* side-effect time changes are not authorized. Authorisation is only
	9714	* required for existing files.
	9715	*
	9716	* Non-owners are not permitted to change the time on an existing
	9717	* file to anything other than the current time.
	9718	*/
	9719	if (VATTR_IS_ACTIVE(vap, va_create_time) \|\|
	9720	VATTR_IS_ACTIVE(vap, va_change_time) \|\|
	9721	VATTR_IS_ACTIVE(vap, va_modify_time) \|\|
	9722	VATTR_IS_ACTIVE(vap, va_access_time) \|\|
	9723	VATTR_IS_ACTIVE(vap, va_backup_time) \|\|
	9724	VATTR_IS_ACTIVE(vap, va_addedtime)) {
	9725	/*
	9726	* The owner and root may set any timestamps they like,
	9727	* provided that the file is not immutable. The owner still needs
	9728	* WRITE_ATTRIBUTES (implied by ownership but still deniable).
	9729	*/
	9730	if (has_priv_suser \|\| vauth_node_owner(&ova, cred)) {
	9731	KAUTH_DEBUG("ATTR - root or owner changing timestamps");
	9732	required_action \|= KAUTH_VNODE_CHECKIMMUTABLE \| KAUTH_VNODE_WRITE_ATTRIBUTES;
	9733	} else {
	9734	/* just setting the current time? */
	9735	if (vap->va_vaflags & VA_UTIMES_NULL) {
	9736	KAUTH_DEBUG("ATTR - non-root/owner changing timestamps, requiring WRITE_ATTRIBUTES");
	9737	required_action \|= KAUTH_VNODE_WRITE_ATTRIBUTES;
	9738	} else {
	9739	KAUTH_DEBUG("ATTR - ERROR: illegal timestamp modification attempted");
	9740	error = EACCES;
	9741	goto out;
	9742	}
	9743	}
	9744	}
	9745
	9746	/*
	9747	* Changing file mode?
	9748	*/
	9749	if (VATTR_IS_ACTIVE(vap, va_mode) && VATTR_IS_SUPPORTED(&ova, va_mode) && (ova.va_mode != vap->va_mode)) {
	9750	KAUTH_DEBUG("ATTR - mode change from %06o to %06o", ova.va_mode, vap->va_mode);
	9751
	9752	/*
	9753	* Mode changes always have the same basic auth requirements.
	9754	*/
	9755	if (has_priv_suser) {
	9756	KAUTH_DEBUG("ATTR - superuser mode change, requiring immutability check");
	9757	required_action \|= KAUTH_VNODE_CHECKIMMUTABLE;
	9758	} else {
	9759	/* need WRITE_SECURITY */
	9760	KAUTH_DEBUG("ATTR - non-superuser mode change, requiring WRITE_SECURITY");
	9761	required_action \|= KAUTH_VNODE_WRITE_SECURITY;
	9762	}
	9763
	9764	/*
	9765	* Can't set the setgid bit if you're not in the group and not root. Have to have
	9766	* existing group information in the case we're not setting it right now.
	9767	*/
	9768	if (vap->va_mode & S_ISGID) {
	9769	required_action \|= KAUTH_VNODE_CHECKIMMUTABLE; /* always required */
	9770	if (!has_priv_suser) {
	9771	if (VATTR_IS_ACTIVE(vap, va_gid)) {
	9772	group = vap->va_gid;
	9773	} else if (VATTR_IS_SUPPORTED(&ova, va_gid)) {
	9774	group = ova.va_gid;
	9775	} else {
	9776	KAUTH_DEBUG("ATTR - ERROR: setgid but no gid available");
	9777	error = EINVAL;
	9778	goto out;
	9779	}
	9780	/*
	9781	* This might be too restrictive; WRITE_SECURITY might be implied by
	9782	* membership in this case, rather than being an additional requirement.
	9783	*/
	9784	if ((error = kauth_cred_ismember_gid(cred, group, &ismember)) != 0) {
	9785	KAUTH_DEBUG("ATTR - ERROR: got %d checking for membership in %d", error, vap->va_gid);
	9786	goto out;
	9787	}
	9788	if (!ismember) {
	9789	KAUTH_DEBUG(" DENIED - can't set SGID bit, not a member of %d", group);
	9790	error = EPERM;
	9791	goto out;
	9792	}
	9793	}
	9794	}
	9795
	9796	/*
	9797	* Can't set the setuid bit unless you're root or the file's owner.
	9798	*/
	9799	if (vap->va_mode & S_ISUID) {
	9800	required_action \|= KAUTH_VNODE_CHECKIMMUTABLE; /* always required */
	9801	if (!has_priv_suser) {
	9802	if (VATTR_IS_ACTIVE(vap, va_uid)) {
	9803	owner = vap->va_uid;
	9804	} else if (VATTR_IS_SUPPORTED(&ova, va_uid)) {
	9805	owner = ova.va_uid;
	9806	} else {
	9807	KAUTH_DEBUG("ATTR - ERROR: setuid but no uid available");
	9808	error = EINVAL;
	9809	goto out;
	9810	}
	9811	if (owner != kauth_cred_getuid(cred)) {
	9812	/*
	9813	* We could allow this if WRITE_SECURITY is permitted, perhaps.
	9814	*/
	9815	KAUTH_DEBUG("ATTR - ERROR: illegal attempt to set the setuid bit");
	9816	error = EPERM;
	9817	goto out;
	9818	}
	9819	}
	9820	}
	9821	}
	9822
	9823	/*
	9824	* Validate/mask flags changes. This checks that only the flags in
	9825	* the UF_SETTABLE mask are being set, and preserves the flags in
	9826	* the SF_SETTABLE case.
	9827	*
	9828	* Since flags changes may be made in conjunction with other changes,
	9829	* we will ask the auth code to ignore immutability in the case that
	9830	* the SF_* flags are not set and we are only manipulating the file flags.
	9831	*
	9832	*/
	9833	if (VATTR_IS_ACTIVE(vap, va_flags)) {
	9834	/* compute changing flags bits */
	9835	vap->va_flags &= ~SF_SYNTHETIC;
	9836	ova.va_flags &= ~SF_SYNTHETIC;
	9837	if (VATTR_IS_SUPPORTED(&ova, va_flags)) {
	9838	fdelta = vap->va_flags ^ ova.va_flags;
	9839	} else {
	9840	fdelta = vap->va_flags;
	9841	}
	9842
	9843	if (fdelta != 0) {
	9844	KAUTH_DEBUG("ATTR - flags changing, requiring WRITE_SECURITY");
	9845	required_action \|= KAUTH_VNODE_WRITE_SECURITY;
	9846
	9847	/* check that changing bits are legal */
	9848	if (has_priv_suser) {
	9849	/*
	9850	* The immutability check will prevent us from clearing the SF_*
	9851	* flags unless the system securelevel permits it, so just check
	9852	* for legal flags here.
	9853	*/
	9854	if (fdelta & ~(UF_SETTABLE \| SF_SETTABLE)) {
	9855	error = EPERM;
	9856	KAUTH_DEBUG(" DENIED - superuser attempt to set illegal flag(s)");
	9857	goto out;
	9858	}
	9859	} else {
	9860	if (fdelta & ~UF_SETTABLE) {
	9861	error = EPERM;
	9862	KAUTH_DEBUG(" DENIED - user attempt to set illegal flag(s)");
	9863	goto out;
	9864	}
	9865	}
	9866	/*
	9867	* If the caller has the ability to manipulate file flags,
	9868	* security is not reduced by ignoring them for this operation.
	9869	*
	9870	* A more complete test here would consider the 'after' states of the flags
	9871	* to determine whether it would permit the operation, but this becomes
	9872	* very complex.
	9873	*
	9874	* Ignoring immutability is conditional on securelevel; this does not bypass
	9875	* the SF_* flags if securelevel > 0.
	9876	*/
	9877	required_action \|= KAUTH_VNODE_NOIMMUTABLE;
	9878	}
	9879	}
	9880
	9881	/*
	9882	* Validate ownership information.
	9883	*/
	9884	chowner = 0;
	9885	chgroup = 0;
	9886	clear_suid = 0;
	9887	clear_sgid = 0;
	9888
	9889	/*
	9890	* uid changing
	9891	* Note that if the filesystem didn't give us a UID, we expect that it doesn't
	9892	* support them in general, and will ignore it if/when we try to set it.
	9893	* We might want to clear the uid out of vap completely here.
	9894	*/
	9895	if (VATTR_IS_ACTIVE(vap, va_uid)) {
	9896	if (VATTR_IS_SUPPORTED(&ova, va_uid) && (vap->va_uid != ova.va_uid)) {
	9897	if (!has_priv_suser && (kauth_cred_getuid(cred) != vap->va_uid)) {
	9898	KAUTH_DEBUG(" DENIED - non-superuser cannot change ownershipt to a third party");
	9899	error = EPERM;
	9900	goto out;
	9901	}
	9902	chowner = 1;
	9903	}
	9904	clear_suid = 1;
	9905	}
	9906
	9907	/*
	9908	* gid changing
	9909	* Note that if the filesystem didn't give us a GID, we expect that it doesn't
	9910	* support them in general, and will ignore it if/when we try to set it.
	9911	* We might want to clear the gid out of vap completely here.
	9912	*/
	9913	if (VATTR_IS_ACTIVE(vap, va_gid)) {
	9914	if (VATTR_IS_SUPPORTED(&ova, va_gid) && (vap->va_gid != ova.va_gid)) {
	9915	if (!has_priv_suser) {
	9916	if ((error = kauth_cred_ismember_gid(cred, vap->va_gid, &ismember)) != 0) {
	9917	KAUTH_DEBUG(" ERROR - got %d checking for membership in %d", error, vap->va_gid);
	9918	goto out;
	9919	}
	9920	if (!ismember) {
	9921	KAUTH_DEBUG(" DENIED - group change from %d to %d but not a member of target group",
	9922	ova.va_gid, vap->va_gid);
	9923	error = EPERM;
	9924	goto out;
	9925	}
	9926	}
	9927	chgroup = 1;
	9928	}
	9929	clear_sgid = 1;
	9930	}
	9931
	9932	/*
	9933	* Owner UUID being set or changed.
	9934	*/
	9935	if (VATTR_IS_ACTIVE(vap, va_uuuid)) {
	9936	/* if the owner UUID is not actually changing ... */
	9937	if (VATTR_IS_SUPPORTED(&ova, va_uuuid)) {
	9938	if (kauth_guid_equal(&vap->va_uuuid, &ova.va_uuuid)) {
	9939	goto no_uuuid_change;
	9940	}
	9941
	9942	/*
	9943	* If the current owner UUID is a null GUID, check
	9944	* it against the UUID corresponding to the owner UID.
	9945	*/
	9946	if (kauth_guid_equal(&ova.va_uuuid, &kauth_null_guid) &&
	9947	VATTR_IS_SUPPORTED(&ova, va_uid)) {
	9948	guid_t uid_guid;
	9949
	9950	if (kauth_cred_uid2guid(ova.va_uid, &uid_guid) == 0 &&
	9951	kauth_guid_equal(&vap->va_uuuid, &uid_guid)) {
	9952	goto no_uuuid_change;
	9953	}
	9954	}
	9955	}
	9956
	9957	/*
	9958	* The owner UUID cannot be set by a non-superuser to anything other than
	9959	* their own or a null GUID (to "unset" the owner UUID).
	9960	* Note that file systems must be prepared to handle the
	9961	* null UUID case in a manner appropriate for that file
	9962	* system.
	9963	*/
	9964	if (!has_priv_suser) {
	9965	if ((error = kauth_cred_getguid(cred, &changer)) != 0) {
	9966	KAUTH_DEBUG(" ERROR - got %d trying to get caller UUID", error);
	9967	/* XXX ENOENT here - no UUID - should perhaps become EPERM */
	9968	goto out;
	9969	}
	9970	if (!kauth_guid_equal(&vap->va_uuuid, &changer) &&
	9971	!kauth_guid_equal(&vap->va_uuuid, &kauth_null_guid)) {
	9972	KAUTH_DEBUG(" ERROR - cannot set supplied owner UUID - not us / null");
	9973	error = EPERM;
	9974	goto out;
	9975	}
	9976	}
	9977	chowner = 1;
	9978	clear_suid = 1;
	9979	}
	9980	no_uuuid_change:
	9981	/*
	9982	* Group UUID being set or changed.
	9983	*/
	9984	if (VATTR_IS_ACTIVE(vap, va_guuid)) {
	9985	/* if the group UUID is not actually changing ... */
	9986	if (VATTR_IS_SUPPORTED(&ova, va_guuid)) {
	9987	if (kauth_guid_equal(&vap->va_guuid, &ova.va_guuid)) {
	9988	goto no_guuid_change;
	9989	}
	9990
	9991	/*
	9992	* If the current group UUID is a null UUID, check
	9993	* it against the UUID corresponding to the group GID.
	9994	*/
	9995	if (kauth_guid_equal(&ova.va_guuid, &kauth_null_guid) &&
	9996	VATTR_IS_SUPPORTED(&ova, va_gid)) {
	9997	guid_t gid_guid;
	9998
	9999	if (kauth_cred_gid2guid(ova.va_gid, &gid_guid) == 0 &&
	10000	kauth_guid_equal(&vap->va_guuid, &gid_guid)) {
	10001	goto no_guuid_change;
	10002	}
	10003	}
	10004	}
	10005
	10006	/*
	10007	* The group UUID cannot be set by a non-superuser to anything other than
	10008	* one of which they are a member or a null GUID (to "unset"
	10009	* the group UUID).
	10010	* Note that file systems must be prepared to handle the
	10011	* null UUID case in a manner appropriate for that file
	10012	* system.
	10013	*/
	10014	if (!has_priv_suser) {
	10015	if (kauth_guid_equal(&vap->va_guuid, &kauth_null_guid)) {
	10016	ismember = 1;
	10017	} else if ((error = kauth_cred_ismember_guid(cred, &vap->va_guuid, &ismember)) != 0) {
	10018	KAUTH_DEBUG(" ERROR - got %d trying to check group membership", error);
	10019	goto out;
	10020	}
	10021	if (!ismember) {
	10022	KAUTH_DEBUG(" ERROR - cannot set supplied group UUID - not a member / null");
	10023	error = EPERM;
	10024	goto out;
	10025	}
	10026	}
	10027	chgroup = 1;
	10028	}
	10029	no_guuid_change:
	10030
	10031	/*
	10032	* Compute authorisation for group/ownership changes.
	10033	*/
	10034	if (chowner \|\| chgroup \|\| clear_suid \|\| clear_sgid) {
	10035	if (has_priv_suser) {
	10036	KAUTH_DEBUG("ATTR - superuser changing file owner/group, requiring immutability check");
	10037	required_action \|= KAUTH_VNODE_CHECKIMMUTABLE;
	10038	} else {
	10039	if (chowner) {
	10040	KAUTH_DEBUG("ATTR - ownership change, requiring TAKE_OWNERSHIP");
	10041	required_action \|= KAUTH_VNODE_TAKE_OWNERSHIP;
	10042	}
	10043	if (chgroup && !chowner) {
	10044	KAUTH_DEBUG("ATTR - group change, requiring WRITE_SECURITY");
	10045	required_action \|= KAUTH_VNODE_WRITE_SECURITY;
	10046	}
	10047	}
	10048
	10049	/*
	10050	* clear set-uid and set-gid bits. POSIX only requires this for
	10051	* non-privileged processes but we do it even for root.
	10052	*/
	10053	if (VATTR_IS_ACTIVE(vap, va_mode)) {
	10054	newmode = vap->va_mode;
	10055	} else if (VATTR_IS_SUPPORTED(&ova, va_mode)) {
	10056	newmode = ova.va_mode;
	10057	} else {
	10058	KAUTH_DEBUG("CHOWN - trying to change owner but cannot get mode from filesystem to mask setugid bits");
	10059	newmode = 0;
	10060	}
	10061
	10062	/* chown always clears setuid/gid bits. An exception is made for
	10063	* setattrlist which can set both at the same time: <uid, gid, mode> on a file:
	10064	* setattrlist is allowed to set the new mode on the file and change (chown)
	10065	* uid/gid.
	10066	*/
	10067	if (newmode & (S_ISUID \| S_ISGID)) {
	10068	if (!VATTR_IS_ACTIVE(vap, va_mode)) {
	10069	KAUTH_DEBUG("CHOWN - masking setugid bits from mode %o to %o",
	10070	newmode, newmode & ~(S_ISUID \| S_ISGID));
	10071	newmode &= ~(S_ISUID \| S_ISGID);
	10072	}
	10073	VATTR_SET(vap, va_mode, newmode);
	10074	}
	10075	}
	10076
	10077	/*
	10078	* Authorise changes in the ACL.
	10079	*/
	10080	if (VATTR_IS_ACTIVE(vap, va_acl)) {
	10081	/* no existing ACL */
	10082	if (!VATTR_IS_ACTIVE(&ova, va_acl) \|\| (ova.va_acl == NULL)) {
	10083	/* adding an ACL */
	10084	if (vap->va_acl != NULL) {
	10085	required_action \|= KAUTH_VNODE_WRITE_SECURITY;
	10086	KAUTH_DEBUG("CHMOD - adding ACL");
	10087	}
	10088
	10089	/* removing an existing ACL */
	10090	} else if (vap->va_acl == NULL) {
	10091	required_action \|= KAUTH_VNODE_WRITE_SECURITY;
	10092	KAUTH_DEBUG("CHMOD - removing ACL");
	10093
	10094	/* updating an existing ACL */
	10095	} else {
	10096	if (vap->va_acl->acl_entrycount != ova.va_acl->acl_entrycount) {
	10097	/* entry count changed, must be different */
	10098	required_action \|= KAUTH_VNODE_WRITE_SECURITY;
	10099	KAUTH_DEBUG("CHMOD - adding/removing ACL entries");
	10100	} else if (vap->va_acl->acl_entrycount > 0) {
	10101	/* both ACLs have the same ACE count, said count is 1 or more, bitwise compare ACLs */
	10102	if (memcmp(&vap->va_acl->acl_ace[0], &ova.va_acl->acl_ace[0],
	10103	sizeof(struct kauth_ace) * vap->va_acl->acl_entrycount)) {
	10104	required_action \|= KAUTH_VNODE_WRITE_SECURITY;
	10105	KAUTH_DEBUG("CHMOD - changing ACL entries");
	10106	}
	10107	}
	10108	}
	10109	}
	10110
	10111	/*
	10112	* Other attributes that require authorisation.
	10113	*/
	10114	if (VATTR_IS_ACTIVE(vap, va_encoding)) {
	10115	required_action \|= KAUTH_VNODE_WRITE_ATTRIBUTES;
	10116	}
	10117
	10118	out:
	10119	if (VATTR_IS_SUPPORTED(&ova, va_acl) && (ova.va_acl != NULL)) {
	10120	kauth_acl_free(ova.va_acl);
	10121	}
	10122	if (error == 0) {
	10123	*actionp = required_action;
	10124	}
	10125	return error;
	10126	}
	10127
	10128	static int
	10129	setlocklocal_callback(struct vnode vp, __unused void cargs)
	10130	{
	10131	vnode_lock_spin(vp);
	10132	vp->v_flag \|= VLOCKLOCAL;
	10133	vnode_unlock(vp);
	10134
	10135	return VNODE_RETURNED;
	10136	}
	10137
	10138	void
	10139	vfs_setlocklocal(mount_t mp)
	10140	{
	10141	mount_lock_spin(mp);
	10142	mp->mnt_kern_flag \|= MNTK_LOCK_LOCAL;
	10143	mount_unlock(mp);
	10144
	10145	/*
	10146	* The number of active vnodes is expected to be
	10147	* very small when vfs_setlocklocal is invoked.
	10148	*/
	10149	vnode_iterate(mp, 0, setlocklocal_callback, NULL);
	10150	}
	10151
	10152	void
	10153	vfs_setcompoundopen(mount_t mp)
	10154	{
	10155	mount_lock_spin(mp);
	10156	mp->mnt_compound_ops \|= COMPOUND_VNOP_OPEN;
	10157	mount_unlock(mp);
	10158	}
	10159
	10160	void
	10161	vnode_setswapmount(vnode_t vp)
	10162	{
	10163	mount_lock(vp->v_mount);
	10164	vp->v_mount->mnt_kern_flag \|= MNTK_SWAP_MOUNT;
	10165	mount_unlock(vp->v_mount);
	10166	}
	10167
	10168
	10169	int64_t
	10170	vnode_getswappin_avail(vnode_t vp)
	10171	{
	10172	int64_t max_swappin_avail = 0;
	10173
	10174	mount_lock(vp->v_mount);
	10175	if (vp->v_mount->mnt_ioflags & MNT_IOFLAGS_SWAPPIN_SUPPORTED) {
	10176	max_swappin_avail = vp->v_mount->mnt_max_swappin_available;
	10177	}
	10178	mount_unlock(vp->v_mount);
	10179
	10180	return max_swappin_avail;
	10181	}
	10182
	10183
	10184	void
	10185	vn_setunionwait(vnode_t vp)
	10186	{
	10187	vnode_lock_spin(vp);
	10188	vp->v_flag \|= VISUNION;
	10189	vnode_unlock(vp);
	10190	}
	10191
	10192
	10193	void
	10194	vn_checkunionwait(vnode_t vp)
	10195	{
	10196	vnode_lock_spin(vp);
	10197	while ((vp->v_flag & VISUNION) == VISUNION) {
	10198	msleep((caddr_t)&vp->v_flag, &vp->v_lock, 0, 0, 0);
	10199	}
	10200	vnode_unlock(vp);
	10201	}
	10202
	10203	void
	10204	vn_clearunionwait(vnode_t vp, int locked)
	10205	{
	10206	if (!locked) {
	10207	vnode_lock_spin(vp);
	10208	}
	10209	if ((vp->v_flag & VISUNION) == VISUNION) {
	10210	vp->v_flag &= ~VISUNION;
	10211	wakeup((caddr_t)&vp->v_flag);
	10212	}
	10213	if (!locked) {
	10214	vnode_unlock(vp);
	10215	}
	10216	}
	10217
	10218	int
	10219	vnode_materialize_dataless_file(vnode_t vp, uint64_t op_type)
	10220	{
	10221	int error;
	10222
	10223	/* Swap files are special; ignore them */
	10224	if (vnode_isswap(vp)) {
	10225	return 0;
	10226	}
	10227
	10228	error = resolve_nspace_item(vp,
	10229	op_type \| NAMESPACE_HANDLER_NSPACE_EVENT);
	10230
	10231	/*
	10232	* The file resolver owns the logic about what error to return
	10233	* to the caller. We only need to handle a couple of special
	10234	* cases here:
	10235	*/
	10236	if (error == EJUSTRETURN) {
	10237	/*
	10238	* The requesting process is allowed to interact with
	10239	* dataless objects. Make a couple of sanity-checks
	10240	* here to ensure the action makes sense.
	10241	*/
	10242	switch (op_type) {
	10243	case NAMESPACE_HANDLER_WRITE_OP:
	10244	case NAMESPACE_HANDLER_TRUNCATE_OP:
	10245	case NAMESPACE_HANDLER_RENAME_OP:
	10246	/*
	10247	* This handles the case of the resolver itself
	10248	* writing data to the file (or throwing it
	10249	* away).
	10250	*/
	10251	error = 0;
	10252	break;
	10253	case NAMESPACE_HANDLER_READ_OP:
	10254	/*
	10255	* This handles the case of the resolver needing
	10256	* to look up inside of a dataless directory while
	10257	* it's in the process of materializing it (for
	10258	* example, creating files or directories).
	10259	*/
	10260	error = (vnode_vtype(vp) == VDIR) ? 0 : EBADF;
	10261	break;
	10262	default:
	10263	error = EBADF;
	10264	break;
	10265	}
	10266	}
	10267
	10268	return error;
	10269	}
	10270
	10271	/*
	10272	* Removes orphaned apple double files during a rmdir
	10273	* Works by:
	10274	* 1. vnode_suspend().
	10275	* 2. Call VNOP_READDIR() till the end of directory is reached.
	10276	* 3. Check if the directory entries returned are regular files with name starting with "._". If not, return ENOTEMPTY.
	10277	* 4. Continue (2) and (3) till end of directory is reached.
	10278	* 5. If all the entries in the directory were files with "._" name, delete all the files.
	10279	* 6. vnode_resume()
	10280	* 7. If deletion of all files succeeded, call VNOP_RMDIR() again.
	10281	*/
	10282
	10283	errno_t
	10284	rmdir_remove_orphaned_appleDouble(vnode_t vp, vfs_context_t ctx, int * restart_flag)
	10285	{
	10286	#define UIO_BUFF_SIZE 2048
	10287	uio_t auio = NULL;
	10288	int eofflag, siz = UIO_BUFF_SIZE, alloc_size = 0, nentries = 0;
	10289	int open_flag = 0, full_erase_flag = 0;
	10290	char uio_buf[UIO_SIZEOF(1)];
	10291	char *rbuf = NULL;
	10292	void *dir_pos;
	10293	void *dir_end;
	10294	struct dirent *dp;
	10295	errno_t error;
	10296
	10297	error = vnode_suspend(vp);
	10298
	10299	/*
	10300	* restart_flag is set so that the calling rmdir sleeps and resets
	10301	*/
	10302	if (error == EBUSY) {
	10303	*restart_flag = 1;
	10304	}
	10305	if (error != 0) {
	10306	return error;
	10307	}
	10308
	10309	/*
	10310	* Prevent dataless fault materialization while we have
	10311	* a suspended vnode.
	10312	*/
	10313	uthread_t ut = get_bsdthread_info(current_thread());
	10314	bool saved_nodatalessfaults =
	10315	(ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) ? true : false;
	10316	ut->uu_flag \|= UT_NSPACE_NODATALESSFAULTS;
	10317
	10318	/*
	10319	* set up UIO
	10320	*/
	10321	rbuf = kheap_alloc(KHEAP_DATA_BUFFERS, siz, Z_WAITOK);
	10322	alloc_size = siz;
	10323	if (rbuf) {
	10324	auio = uio_createwithbuffer(1, 0, UIO_SYSSPACE, UIO_READ,
	10325	&uio_buf[0], sizeof(uio_buf));
	10326	}
	10327	if (!rbuf \|\| !auio) {
	10328	error = ENOMEM;
	10329	goto outsc;
	10330	}
	10331
	10332	uio_setoffset(auio, 0);
	10333
	10334	eofflag = 0;
	10335
	10336	if ((error = VNOP_OPEN(vp, FREAD, ctx))) {
	10337	goto outsc;
	10338	} else {
	10339	open_flag = 1;
	10340	}
	10341
	10342	/*
	10343	* First pass checks if all files are appleDouble files.
	10344	*/
	10345
	10346	do {
	10347	siz = UIO_BUFF_SIZE;
	10348	uio_reset(auio, uio_offset(auio), UIO_SYSSPACE, UIO_READ);
	10349	uio_addiov(auio, CAST_USER_ADDR_T(rbuf), UIO_BUFF_SIZE);
	10350
	10351	if ((error = VNOP_READDIR(vp, auio, 0, &eofflag, &nentries, ctx))) {
	10352	goto outsc;
	10353	}
	10354
	10355	if (uio_resid(auio) != 0) {
	10356	siz -= uio_resid(auio);
	10357	}
	10358
	10359	/*
	10360	* Iterate through directory
	10361	*/
	10362	dir_pos = (void*) rbuf;
	10363	dir_end = (void*) (rbuf + siz);
	10364	dp = (struct dirent*) (dir_pos);
	10365
	10366	if (dir_pos == dir_end) {
	10367	eofflag = 1;
	10368	}
	10369
	10370	while (dir_pos < dir_end) {
	10371	/*
	10372	* Check for . and .. as well as directories
	10373	*/
	10374	if (dp->d_ino != 0 &&
	10375	!((dp->d_namlen == 1 && dp->d_name[0] == '.') \|\|
	10376	(dp->d_namlen == 2 && dp->d_name[0] == '.' && dp->d_name[1] == '.'))) {
	10377	/*
	10378	* Check for irregular files and ._ files
	10379	* If there is a ._._ file abort the op
	10380	*/
	10381	if (dp->d_namlen < 2 \|\|
	10382	strncmp(dp->d_name, "._", 2) \|\|
	10383	(dp->d_namlen >= 4 && !strncmp(&(dp->d_name[2]), "._", 2))) {
	10384	error = ENOTEMPTY;
	10385	goto outsc;
	10386	}
	10387	}
	10388	dir_pos = (void) ((uint8_t)dir_pos + dp->d_reclen);
	10389	dp = (struct dirent*)dir_pos;
	10390	}
	10391
	10392	/*
	10393	* workaround for HFS/NFS setting eofflag before end of file
	10394	*/
	10395	if (vp->v_tag == VT_HFS && nentries > 2) {
	10396	eofflag = 0;
	10397	}
	10398
	10399	if (vp->v_tag == VT_NFS) {
	10400	if (eofflag && !full_erase_flag) {
	10401	full_erase_flag = 1;
	10402	eofflag = 0;
	10403	uio_reset(auio, 0, UIO_SYSSPACE, UIO_READ);
	10404	} else if (!eofflag && full_erase_flag) {
	10405	full_erase_flag = 0;
	10406	}
	10407	}
	10408	} while (!eofflag);
	10409	/*
	10410	* If we've made it here all the files in the dir are ._ files.
	10411	* We can delete the files even though the node is suspended
	10412	* because we are the owner of the file.
	10413	*/
	10414
	10415	uio_reset(auio, 0, UIO_SYSSPACE, UIO_READ);
	10416	eofflag = 0;
	10417	full_erase_flag = 0;
	10418
	10419	do {
	10420	siz = UIO_BUFF_SIZE;
	10421	uio_reset(auio, uio_offset(auio), UIO_SYSSPACE, UIO_READ);
	10422	uio_addiov(auio, CAST_USER_ADDR_T(rbuf), UIO_BUFF_SIZE);
	10423
	10424	error = VNOP_READDIR(vp, auio, 0, &eofflag, &nentries, ctx);
	10425
	10426	if (error != 0) {
	10427	goto outsc;
	10428	}
	10429
	10430	if (uio_resid(auio) != 0) {
	10431	siz -= uio_resid(auio);
	10432	}
	10433
	10434	/*
	10435	* Iterate through directory
	10436	*/
	10437	dir_pos = (void*) rbuf;
	10438	dir_end = (void*) (rbuf + siz);
	10439	dp = (struct dirent*) dir_pos;
	10440
	10441	if (dir_pos == dir_end) {
	10442	eofflag = 1;
	10443	}
	10444
	10445	while (dir_pos < dir_end) {
	10446	/*
	10447	* Check for . and .. as well as directories
	10448	*/
	10449	if (dp->d_ino != 0 &&
	10450	!((dp->d_namlen == 1 && dp->d_name[0] == '.') \|\|
	10451	(dp->d_namlen == 2 && dp->d_name[0] == '.' && dp->d_name[1] == '.'))
	10452	) {
	10453	error = unlink1(ctx, vp,
	10454	CAST_USER_ADDR_T(dp->d_name), UIO_SYSSPACE,
	10455	VNODE_REMOVE_SKIP_NAMESPACE_EVENT \|
	10456	VNODE_REMOVE_NO_AUDIT_PATH);
	10457
	10458	if (error && error != ENOENT) {
	10459	goto outsc;
	10460	}
	10461	}
	10462	dir_pos = (void) ((uint8_t)dir_pos + dp->d_reclen);
	10463	dp = (struct dirent*)dir_pos;
	10464	}
	10465
	10466	/*
	10467	* workaround for HFS/NFS setting eofflag before end of file
	10468	*/
	10469	if (vp->v_tag == VT_HFS && nentries > 2) {
	10470	eofflag = 0;
	10471	}
	10472
	10473	if (vp->v_tag == VT_NFS) {
	10474	if (eofflag && !full_erase_flag) {
	10475	full_erase_flag = 1;
	10476	eofflag = 0;
	10477	uio_reset(auio, 0, UIO_SYSSPACE, UIO_READ);
	10478	} else if (!eofflag && full_erase_flag) {
	10479	full_erase_flag = 0;
	10480	}
	10481	}
	10482	} while (!eofflag);
	10483
	10484
	10485	error = 0;
	10486
	10487	outsc:
	10488	if (open_flag) {
	10489	VNOP_CLOSE(vp, FREAD, ctx);
	10490	}
	10491
	10492	if (auio) {
	10493	uio_free(auio);
	10494	}
	10495	kheap_free(KHEAP_DATA_BUFFERS, rbuf, alloc_size);
	10496
	10497	if (saved_nodatalessfaults == false) {
	10498	ut->uu_flag &= ~UT_NSPACE_NODATALESSFAULTS;
	10499	}
	10500
	10501	vnode_resume(vp);
	10502
	10503	return error;
	10504	}
	10505
	10506
	10507	void
	10508	lock_vnode_and_post(vnode_t vp, int kevent_num)
	10509	{
	10510	/* Only take the lock if there's something there! */
	10511	if (vp->v_knotes.slh_first != NULL) {
	10512	vnode_lock(vp);
	10513	KNOTE(&vp->v_knotes, kevent_num);
	10514	vnode_unlock(vp);
	10515	}
	10516	}
	10517
	10518	void panic_print_vnodes(void);
	10519
	10520	/* define PANIC_PRINTS_VNODES only if investigation is required. */
	10521	#ifdef PANIC_PRINTS_VNODES
	10522
	10523	static const char *
	10524	__vtype(uint16_t vtype)
	10525	{
	10526	switch (vtype) {
	10527	case VREG:
	10528	return "R";
	10529	case VDIR:
	10530	return "D";
	10531	case VBLK:
	10532	return "B";
	10533	case VCHR:
	10534	return "C";
	10535	case VLNK:
	10536	return "L";
	10537	case VSOCK:
	10538	return "S";
	10539	case VFIFO:
	10540	return "F";
	10541	case VBAD:
	10542	return "x";
	10543	case VSTR:
	10544	return "T";
	10545	case VCPLX:
	10546	return "X";
	10547	default:
	10548	return "?";
	10549	}
	10550	}
	10551
	10552	/*
	10553	* build a path from the bottom up
	10554	* NOTE: called from the panic path - no alloc'ing of memory and no locks!
	10555	*/
	10556	static char *
	10557	__vpath(vnode_t vp, char *str, int len, int depth)
	10558	{
	10559	int vnm_len;
	10560	const char *src;
	10561	char *dst;
	10562
	10563	if (len <= 0) {
	10564	return str;
	10565	}
	10566	/* str + len is the start of the string we created */
	10567	if (!vp->v_name) {
	10568	return str + len;
	10569	}
	10570
	10571	/* follow mount vnodes to get the full path */
	10572	if ((vp->v_flag & VROOT)) {
	10573	if (vp->v_mount != NULL && vp->v_mount->mnt_vnodecovered) {
	10574	return __vpath(vp->v_mount->mnt_vnodecovered,
	10575	str, len, depth + 1);
	10576	}
	10577	return str + len;
	10578	}
	10579
	10580	src = vp->v_name;
	10581	vnm_len = strlen(src);
	10582	if (vnm_len > len) {
	10583	/* truncate the name to fit in the string */
	10584	src += (vnm_len - len);
	10585	vnm_len = len;
	10586	}
	10587
	10588	/* start from the back and copy just characters (no NULLs) */
	10589
	10590	/* this will chop off leaf path (file) names */
	10591	if (depth > 0) {
	10592	dst = str + len - vnm_len;
	10593	memcpy(dst, src, vnm_len);
	10594	len -= vnm_len;
	10595	} else {
	10596	dst = str + len;
	10597	}
	10598
	10599	if (vp->v_parent && len > 1) {
	10600	/* follow parents up the chain */
	10601	len--;
	10602	*(dst - 1) = '/';
	10603	return __vpath(vp->v_parent, str, len, depth + 1);
	10604	}
	10605
	10606	return dst;
	10607	}
	10608
	10609	#define SANE_VNODE_PRINT_LIMIT 5000
	10610	void
	10611	panic_print_vnodes(void)
	10612	{
	10613	mount_t mnt;
	10614	vnode_t vp;
	10615	int nvnodes = 0;
	10616	const char *type;
	10617	char *nm;
	10618	char vname[257];
	10619
	10620	paniclog_append_noflush("\n*** VNODES ***\n"
	10621	"TYPE UREF ICNT PATH\n");
	10622
	10623	/* NULL-terminate the path name */
	10624	vname[sizeof(vname) - 1] = '\0';
	10625
	10626	/*
	10627	* iterate all vnodelist items in all mounts (mntlist) -> mnt_vnodelist
	10628	*/
	10629	TAILQ_FOREACH(mnt, &mountlist, mnt_list) {
	10630	if (!ml_validate_nofault((vm_offset_t)mnt, sizeof(mount_t))) {
	10631	paniclog_append_noflush("Unable to iterate the mount list %p - encountered an invalid mount pointer %p \n",
	10632	&mountlist, mnt);
	10633	break;
	10634	}
	10635
	10636	TAILQ_FOREACH(vp, &mnt->mnt_vnodelist, v_mntvnodes) {
	10637	if (!ml_validate_nofault((vm_offset_t)vp, sizeof(vnode_t))) {
	10638	paniclog_append_noflush("Unable to iterate the vnode list %p - encountered an invalid vnode pointer %p \n",
	10639	&mnt->mnt_vnodelist, vp);
	10640	break;
	10641	}
	10642
	10643	if (++nvnodes > SANE_VNODE_PRINT_LIMIT) {
	10644	return;
	10645	}
	10646	type = __vtype(vp->v_type);
	10647	nm = __vpath(vp, vname, sizeof(vname) - 1, 0);
	10648	paniclog_append_noflush("%s %0d %0d %s\n",
	10649	type, vp->v_usecount, vp->v_iocount, nm);
	10650	}
	10651	}
	10652	}
	10653
	10654	#else /* !PANIC_PRINTS_VNODES */
	10655	void
	10656	panic_print_vnodes(void)
	10657	{
	10658	return;
	10659	}
	10660	#endif
	10661
	10662
	10663	#ifdef JOE_DEBUG
	10664	static void
	10665	record_vp(vnode_t vp, int count)
	10666	{
	10667	struct uthread *ut;
	10668
	10669	#if CONFIG_TRIGGERS
	10670	if (vp->v_resolve) {
	10671	return;
	10672	}
	10673	#endif
	10674	if ((vp->v_flag & VSYSTEM)) {
	10675	return;
	10676	}
	10677
	10678	ut = get_bsdthread_info(current_thread());
	10679	ut->uu_iocount += count;
	10680
	10681	if (count == 1) {
	10682	if (ut->uu_vpindex < 32) {
	10683	OSBacktrace((void **)&ut->uu_pcs[ut->uu_vpindex][0], 10);
	10684
	10685	ut->uu_vps[ut->uu_vpindex] = vp;
	10686	ut->uu_vpindex++;
	10687	}
	10688	}
	10689	}
	10690	#endif
	10691
	10692
	10693	#if CONFIG_TRIGGERS
	10694
	10695	#define TRIG_DEBUG 0
	10696
	10697	#if TRIG_DEBUG
	10698	#define TRIG_LOG(...) do { printf("%s: ", __FUNCTION__); printf(__VA_ARGS__); } while (0)
	10699	#else
	10700	#define TRIG_LOG(...)
	10701	#endif
	10702
	10703	/*
	10704	* Resolver result functions
	10705	*/
	10706
	10707	resolver_result_t
	10708	vfs_resolver_result(uint32_t seq, enum resolver_status stat, int aux)
	10709	{
	10710	/*
	10711	* \|<--- 32 --->\|<--- 28 --->\|<- 4 ->\|
	10712	* sequence auxiliary status
	10713	*/
	10714	return (((uint64_t)seq) << 32) \|
	10715	(((uint64_t)(aux & 0x0fffffff)) << 4) \|
	10716	(uint64_t)(stat & 0x0000000F);
	10717	}
	10718
	10719	enum resolver_status
	10720	vfs_resolver_status(resolver_result_t result)
	10721	{
	10722	/* lower 4 bits is status */
	10723	return result & 0x0000000F;
	10724	}
	10725
	10726	uint32_t
	10727	vfs_resolver_sequence(resolver_result_t result)
	10728	{
	10729	/* upper 32 bits is sequence */
	10730	return (uint32_t)(result >> 32);
	10731	}
	10732
	10733	int
	10734	vfs_resolver_auxiliary(resolver_result_t result)
	10735	{
	10736	/* 28 bits of auxiliary */
	10737	return (int)(((uint32_t)(result & 0xFFFFFFF0)) >> 4);
	10738	}
	10739
	10740	/*
	10741	* SPI
	10742	* Call in for resolvers to update vnode trigger state
	10743	*/
	10744	int
	10745	vnode_trigger_update(vnode_t vp, resolver_result_t result)
	10746	{
	10747	vnode_resolve_t rp;
	10748	uint32_t seq;
	10749	enum resolver_status stat;
	10750
	10751	if (vp->v_resolve == NULL) {
	10752	return EINVAL;
	10753	}
	10754
	10755	stat = vfs_resolver_status(result);
	10756	seq = vfs_resolver_sequence(result);
	10757
	10758	if ((stat != RESOLVER_RESOLVED) && (stat != RESOLVER_UNRESOLVED)) {
	10759	return EINVAL;
	10760	}
	10761
	10762	rp = vp->v_resolve;
	10763	lck_mtx_lock(&rp->vr_lock);
	10764
	10765	if (seq > rp->vr_lastseq) {
	10766	if (stat == RESOLVER_RESOLVED) {
	10767	rp->vr_flags \|= VNT_RESOLVED;
	10768	} else {
	10769	rp->vr_flags &= ~VNT_RESOLVED;
	10770	}
	10771
	10772	rp->vr_lastseq = seq;
	10773	}
	10774
	10775	lck_mtx_unlock(&rp->vr_lock);
	10776
	10777	return 0;
	10778	}
	10779
	10780	static int
	10781	vnode_resolver_attach(vnode_t vp, vnode_resolve_t rp, boolean_t ref)
	10782	{
	10783	int error;
	10784
	10785	vnode_lock_spin(vp);
	10786	if (vp->v_resolve != NULL) {
	10787	vnode_unlock(vp);
	10788	return EINVAL;
	10789	} else {
	10790	vp->v_resolve = rp;
	10791	}
	10792	vnode_unlock(vp);
	10793
	10794	if (ref) {
	10795	error = vnode_ref_ext(vp, O_EVTONLY, VNODE_REF_FORCE);
	10796	if (error != 0) {
	10797	panic("VNODE_REF_FORCE didn't help...");
	10798	}
	10799	}
	10800
	10801	return 0;
	10802	}
	10803
	10804	/*
	10805	* VFS internal interfaces for vnode triggers
	10806	*
	10807	* vnode must already have an io count on entry
	10808	* v_resolve is stable when io count is non-zero
	10809	*/
	10810	static int
	10811	vnode_resolver_create(mount_t mp, vnode_t vp, struct vnode_trigger_param *tinfo, boolean_t external)
	10812	{
	10813	vnode_resolve_t rp;
	10814	int result;
	10815	char byte;
	10816
	10817	#if 1
	10818	/* minimum pointer test (debugging) */
	10819	if (tinfo->vnt_data) {
	10820	byte = ((char )tinfo->vnt_data);
	10821	}
	10822	#endif
	10823	rp = kheap_alloc(KHEAP_DEFAULT, sizeof(struct vnode_resolve), Z_WAITOK);
	10824	if (rp == NULL) {
	10825	return ENOMEM;
	10826	}
	10827
	10828	lck_mtx_init(&rp->vr_lock, &trigger_vnode_lck_grp, &trigger_vnode_lck_attr);
	10829
	10830	rp->vr_resolve_func = tinfo->vnt_resolve_func;
	10831	rp->vr_unresolve_func = tinfo->vnt_unresolve_func;
	10832	rp->vr_rearm_func = tinfo->vnt_rearm_func;
	10833	rp->vr_reclaim_func = tinfo->vnt_reclaim_func;
	10834	rp->vr_data = tinfo->vnt_data;
	10835	rp->vr_lastseq = 0;
	10836	rp->vr_flags = tinfo->vnt_flags & VNT_VALID_MASK;
	10837	if (external) {
	10838	rp->vr_flags \|= VNT_EXTERNAL;
	10839	}
	10840
	10841	result = vnode_resolver_attach(vp, rp, external);
	10842	if (result != 0) {
	10843	goto out;
	10844	}
	10845
	10846	if (mp) {
	10847	OSAddAtomic(1, &mp->mnt_numtriggers);
	10848	}
	10849
	10850	return result;
	10851
	10852	out:
	10853	kheap_free(KHEAP_DEFAULT, rp, sizeof(struct vnode_resolve));
	10854	return result;
	10855	}
	10856
	10857	static void
	10858	vnode_resolver_release(vnode_resolve_t rp)
	10859	{
	10860	/*
	10861	* Give them a chance to free any private data
	10862	*/
	10863	if (rp->vr_data && rp->vr_reclaim_func) {
	10864	rp->vr_reclaim_func(NULLVP, rp->vr_data);
	10865	}
	10866
	10867	lck_mtx_destroy(&rp->vr_lock, &trigger_vnode_lck_grp);
	10868	kheap_free(KHEAP_DEFAULT, rp, sizeof(struct vnode_resolve));
	10869	}
	10870
	10871	/* Called after the vnode has been drained */
	10872	static void
	10873	vnode_resolver_detach(vnode_t vp)
	10874	{
	10875	vnode_resolve_t rp;
	10876	mount_t mp;
	10877
	10878	mp = vnode_mount(vp);
	10879
	10880	vnode_lock(vp);
	10881	rp = vp->v_resolve;
	10882	vp->v_resolve = NULL;
	10883	vnode_unlock(vp);
	10884
	10885	if ((rp->vr_flags & VNT_EXTERNAL) != 0) {
	10886	vnode_rele_ext(vp, O_EVTONLY, 1);
	10887	}
	10888
	10889	vnode_resolver_release(rp);
	10890
	10891	/* Keep count of active trigger vnodes per mount */
	10892	OSAddAtomic(-1, &mp->mnt_numtriggers);
	10893	}
	10894
	10895	__private_extern__
	10896	void
	10897	vnode_trigger_rearm(vnode_t vp, vfs_context_t ctx)
	10898	{
	10899	vnode_resolve_t rp;
	10900	resolver_result_t result;
	10901	enum resolver_status status;
	10902	uint32_t seq;
	10903
	10904	if ((vp->v_resolve == NULL) \|\|
	10905	(vp->v_resolve->vr_rearm_func == NULL) \|\|
	10906	(vp->v_resolve->vr_flags & VNT_AUTO_REARM) == 0) {
	10907	return;
	10908	}
	10909
	10910	rp = vp->v_resolve;
	10911	lck_mtx_lock(&rp->vr_lock);
	10912
	10913	/*
	10914	* Check if VFS initiated this unmount. If so, we'll catch it after the unresolve completes.
	10915	*/
	10916	if (rp->vr_flags & VNT_VFS_UNMOUNTED) {
	10917	lck_mtx_unlock(&rp->vr_lock);
	10918	return;
	10919	}
	10920
	10921	/* Check if this vnode is already armed */
	10922	if ((rp->vr_flags & VNT_RESOLVED) == 0) {
	10923	lck_mtx_unlock(&rp->vr_lock);
	10924	return;
	10925	}
	10926
	10927	lck_mtx_unlock(&rp->vr_lock);
	10928
	10929	result = rp->vr_rearm_func(vp, 0, rp->vr_data, ctx);
	10930	status = vfs_resolver_status(result);
	10931	seq = vfs_resolver_sequence(result);
	10932
	10933	lck_mtx_lock(&rp->vr_lock);
	10934	if (seq > rp->vr_lastseq) {
	10935	if (status == RESOLVER_UNRESOLVED) {
	10936	rp->vr_flags &= ~VNT_RESOLVED;
	10937	}
	10938	rp->vr_lastseq = seq;
	10939	}
	10940	lck_mtx_unlock(&rp->vr_lock);
	10941	}
	10942
	10943	__private_extern__
	10944	int
	10945	vnode_trigger_resolve(vnode_t vp, struct nameidata *ndp, vfs_context_t ctx)
	10946	{
	10947	vnode_resolve_t rp;
	10948	enum path_operation op;
	10949	resolver_result_t result;
	10950	enum resolver_status status;
	10951	uint32_t seq;
	10952
	10953	/*
	10954	* N.B. we cannot call vfs_context_can_resolve_triggers()
	10955	* here because we really only want to suppress that in
	10956	* the event the trigger will be resolved by something in
	10957	* user-space. Any triggers that are resolved by the kernel
	10958	* do not pose a threat of deadlock.
	10959	*/
	10960
	10961	/* Only trigger on topmost vnodes */
	10962	if ((vp->v_resolve == NULL) \|\|
	10963	(vp->v_resolve->vr_resolve_func == NULL) \|\|
	10964	(vp->v_mountedhere != NULL)) {
	10965	return 0;
	10966	}
	10967
	10968	rp = vp->v_resolve;
	10969	lck_mtx_lock(&rp->vr_lock);
	10970
	10971	/* Check if this vnode is already resolved */
	10972	if (rp->vr_flags & VNT_RESOLVED) {
	10973	lck_mtx_unlock(&rp->vr_lock);
	10974	return 0;
	10975	}
	10976
	10977	lck_mtx_unlock(&rp->vr_lock);
	10978
	10979	#if CONFIG_MACF
	10980	if ((rp->vr_flags & VNT_KERN_RESOLVE) == 0) {
	10981	/*
	10982	* VNT_KERN_RESOLVE indicates this trigger has no parameters
	10983	* at the discression of the accessing process other than
	10984	* the act of access. All other triggers must be checked
	10985	*/
	10986	int rv = mac_vnode_check_trigger_resolve(ctx, vp, &ndp->ni_cnd);
	10987	if (rv != 0) {
	10988	return rv;
	10989	}
	10990	}
	10991	#endif
	10992
	10993	/*
	10994	* XXX
	10995	* assumes that resolver will not access this trigger vnode (otherwise the kernel will deadlock)
	10996	* is there anyway to know this???
	10997	* there can also be other legitimate lookups in parallel
	10998	*
	10999	* XXX - should we call this on a separate thread with a timeout?
	11000	*
	11001	* XXX - should we use ISLASTCN to pick the op value??? Perhaps only leafs should
	11002	* get the richer set and non-leafs should get generic OP_LOOKUP? TBD
	11003	*/
	11004	op = (ndp->ni_op < OP_MAXOP) ? ndp->ni_op: OP_LOOKUP;
	11005
	11006	result = rp->vr_resolve_func(vp, &ndp->ni_cnd, op, 0, rp->vr_data, ctx);
	11007	status = vfs_resolver_status(result);
	11008	seq = vfs_resolver_sequence(result);
	11009
	11010	lck_mtx_lock(&rp->vr_lock);
	11011	if (seq > rp->vr_lastseq) {
	11012	if (status == RESOLVER_RESOLVED) {
	11013	rp->vr_flags \|= VNT_RESOLVED;
	11014	}
	11015	rp->vr_lastseq = seq;
	11016	}
	11017	lck_mtx_unlock(&rp->vr_lock);
	11018
	11019	/* On resolver errors, propagate the error back up */
	11020	return status == RESOLVER_ERROR ? vfs_resolver_auxiliary(result) : 0;
	11021	}
	11022
	11023	static int
	11024	vnode_trigger_unresolve(vnode_t vp, int flags, vfs_context_t ctx)
	11025	{
	11026	vnode_resolve_t rp;
	11027	resolver_result_t result;
	11028	enum resolver_status status;
	11029	uint32_t seq;
	11030
	11031	if ((vp->v_resolve == NULL) \|\| (vp->v_resolve->vr_unresolve_func == NULL)) {
	11032	return 0;
	11033	}
	11034
	11035	rp = vp->v_resolve;
	11036	lck_mtx_lock(&rp->vr_lock);
	11037
	11038	/* Check if this vnode is already resolved */
	11039	if ((rp->vr_flags & VNT_RESOLVED) == 0) {
	11040	printf("vnode_trigger_unresolve: not currently resolved\n");
	11041	lck_mtx_unlock(&rp->vr_lock);
	11042	return 0;
	11043	}
	11044
	11045	rp->vr_flags \|= VNT_VFS_UNMOUNTED;
	11046
	11047	lck_mtx_unlock(&rp->vr_lock);
	11048
	11049	/*
	11050	* XXX
	11051	* assumes that resolver will not access this trigger vnode (otherwise the kernel will deadlock)
	11052	* there can also be other legitimate lookups in parallel
	11053	*
	11054	* XXX - should we call this on a separate thread with a timeout?
	11055	*/
	11056
	11057	result = rp->vr_unresolve_func(vp, flags, rp->vr_data, ctx);
	11058	status = vfs_resolver_status(result);
	11059	seq = vfs_resolver_sequence(result);
	11060
	11061	lck_mtx_lock(&rp->vr_lock);
	11062	if (seq > rp->vr_lastseq) {
	11063	if (status == RESOLVER_UNRESOLVED) {
	11064	rp->vr_flags &= ~VNT_RESOLVED;
	11065	}
	11066	rp->vr_lastseq = seq;
	11067	}
	11068	rp->vr_flags &= ~VNT_VFS_UNMOUNTED;
	11069	lck_mtx_unlock(&rp->vr_lock);
	11070
	11071	/* On resolver errors, propagate the error back up */
	11072	return status == RESOLVER_ERROR ? vfs_resolver_auxiliary(result) : 0;
	11073	}
	11074
	11075	static int
	11076	triggerisdescendant(mount_t mp, mount_t rmp)
	11077	{
	11078	int match = FALSE;
	11079
	11080	/*
	11081	* walk up vnode covered chain looking for a match
	11082	*/
	11083	name_cache_lock_shared();
	11084
	11085	while (1) {
	11086	vnode_t vp;
	11087
	11088	/* did we encounter "/" ? */
	11089	if (mp->mnt_flag & MNT_ROOTFS) {
	11090	break;
	11091	}
	11092
	11093	vp = mp->mnt_vnodecovered;
	11094	if (vp == NULLVP) {
	11095	break;
	11096	}
	11097
	11098	mp = vp->v_mount;
	11099	if (mp == rmp) {
	11100	match = TRUE;
	11101	break;
	11102	}
	11103	}
	11104
	11105	name_cache_unlock();
	11106
	11107	return match;
	11108	}
	11109
	11110	struct trigger_unmount_info {
	11111	vfs_context_t ctx;
	11112	mount_t top_mp;
	11113	vnode_t trigger_vp;
	11114	mount_t trigger_mp;
	11115	uint32_t trigger_vid;
	11116	int flags;
	11117	};
	11118
	11119	static int
	11120	trigger_unmount_callback(mount_t mp, void * arg)
	11121	{
	11122	struct trigger_unmount_info * infop = (struct trigger_unmount_info *)arg;
	11123	boolean_t mountedtrigger = FALSE;
	11124
	11125	/*
	11126	* When we encounter the top level mount we're done
	11127	*/
	11128	if (mp == infop->top_mp) {
	11129	return VFS_RETURNED_DONE;
	11130	}
	11131
	11132	if ((mp->mnt_vnodecovered == NULL) \|\|
	11133	(vnode_getwithref(mp->mnt_vnodecovered) != 0)) {
	11134	return VFS_RETURNED;
	11135	}
	11136
	11137	if ((mp->mnt_vnodecovered->v_mountedhere == mp) &&
	11138	(mp->mnt_vnodecovered->v_resolve != NULL) &&
	11139	(mp->mnt_vnodecovered->v_resolve->vr_flags & VNT_RESOLVED)) {
	11140	mountedtrigger = TRUE;
	11141	}
	11142	vnode_put(mp->mnt_vnodecovered);
	11143
	11144	/*
	11145	* When we encounter a mounted trigger, check if its under the top level mount
	11146	*/
	11147	if (!mountedtrigger \|\| !triggerisdescendant(mp, infop->top_mp)) {
	11148	return VFS_RETURNED;
	11149	}
	11150
	11151	/*
	11152	* Process any pending nested mount (now that its not referenced)
	11153	*/
	11154	if ((infop->trigger_vp != NULLVP) &&
	11155	(vnode_getwithvid(infop->trigger_vp, infop->trigger_vid) == 0)) {
	11156	vnode_t vp = infop->trigger_vp;
	11157	int error;
	11158
	11159	infop->trigger_vp = NULLVP;
	11160
	11161	if (mp == vp->v_mountedhere) {
	11162	vnode_put(vp);
	11163	printf("trigger_unmount_callback: unexpected match '%s'\n",
	11164	mp->mnt_vfsstat.f_mntonname);
	11165	return VFS_RETURNED;
	11166	}
	11167	if (infop->trigger_mp != vp->v_mountedhere) {
	11168	vnode_put(vp);
	11169	printf("trigger_unmount_callback: trigger mnt changed! (%p != %p)\n",
	11170	infop->trigger_mp, vp->v_mountedhere);
	11171	goto savenext;
	11172	}
	11173
	11174	error = vnode_trigger_unresolve(vp, infop->flags, infop->ctx);
	11175	vnode_put(vp);
	11176	if (error) {
	11177	printf("unresolving: '%s', err %d\n",
	11178	vp->v_mountedhere ? vp->v_mountedhere->mnt_vfsstat.f_mntonname :
	11179	"???", error);
	11180	return VFS_RETURNED_DONE; /* stop iteration on errors */
	11181	}
	11182	}
	11183	savenext:
	11184	/*
	11185	* We can't call resolver here since we hold a mount iter
	11186	* ref on mp so save its covered vp for later processing
	11187	*/
	11188	infop->trigger_vp = mp->mnt_vnodecovered;
	11189	if ((infop->trigger_vp != NULLVP) &&
	11190	(vnode_getwithref(infop->trigger_vp) == 0)) {
	11191	if (infop->trigger_vp->v_mountedhere == mp) {
	11192	infop->trigger_vid = infop->trigger_vp->v_id;
	11193	infop->trigger_mp = mp;
	11194	}
	11195	vnode_put(infop->trigger_vp);
	11196	}
	11197
	11198	return VFS_RETURNED;
	11199	}
	11200
	11201	/*
	11202	* Attempt to unmount any trigger mounts nested underneath a mount.
	11203	* This is a best effort attempt and no retries are performed here.
	11204	*
	11205	* Note: mp->mnt_rwlock is held exclusively on entry (so be carefull)
	11206	*/
	11207	__private_extern__
	11208	void
	11209	vfs_nested_trigger_unmounts(mount_t mp, int flags, vfs_context_t ctx)
	11210	{
	11211	struct trigger_unmount_info info;
	11212
	11213	/* Must have trigger vnodes */
	11214	if (mp->mnt_numtriggers == 0) {
	11215	return;
	11216	}
	11217	/* Avoid recursive requests (by checking covered vnode) */
	11218	if ((mp->mnt_vnodecovered != NULL) &&
	11219	(vnode_getwithref(mp->mnt_vnodecovered) == 0)) {
	11220	boolean_t recursive = FALSE;
	11221
	11222	if ((mp->mnt_vnodecovered->v_mountedhere == mp) &&
	11223	(mp->mnt_vnodecovered->v_resolve != NULL) &&
	11224	(mp->mnt_vnodecovered->v_resolve->vr_flags & VNT_VFS_UNMOUNTED)) {
	11225	recursive = TRUE;
	11226	}
	11227	vnode_put(mp->mnt_vnodecovered);
	11228	if (recursive) {
	11229	return;
	11230	}
	11231	}
	11232
	11233	/*
	11234	* Attempt to unmount any nested trigger mounts (best effort)
	11235	*/
	11236	info.ctx = ctx;
	11237	info.top_mp = mp;
	11238	info.trigger_vp = NULLVP;
	11239	info.trigger_vid = 0;
	11240	info.trigger_mp = NULL;
	11241	info.flags = flags;
	11242
	11243	(void) vfs_iterate(VFS_ITERATE_TAIL_FIRST, trigger_unmount_callback, &info);
	11244
	11245	/*
	11246	* Process remaining nested mount (now that its not referenced)
	11247	*/
	11248	if ((info.trigger_vp != NULLVP) &&
	11249	(vnode_getwithvid(info.trigger_vp, info.trigger_vid) == 0)) {
	11250	vnode_t vp = info.trigger_vp;
	11251
	11252	if (info.trigger_mp == vp->v_mountedhere) {
	11253	(void) vnode_trigger_unresolve(vp, flags, ctx);
	11254	}
	11255	vnode_put(vp);
	11256	}
	11257	}
	11258
	11259	int
	11260	vfs_addtrigger(mount_t mp, const char relpath, struct vnode_trigger_info vtip, vfs_context_t ctx)
	11261	{
	11262	struct nameidata *ndp;
	11263	int res;
	11264	vnode_t rvp, vp;
	11265	struct vnode_trigger_param vtp;
	11266
	11267	/*
	11268	* Must be called for trigger callback, wherein rwlock is held
	11269	*/
	11270	lck_rw_assert(&mp->mnt_rwlock, LCK_RW_ASSERT_HELD);
	11271
	11272	TRIG_LOG("Adding trigger at %s\n", relpath);
	11273	TRIG_LOG("Trying VFS_ROOT\n");
	11274
	11275	ndp = kheap_alloc(KHEAP_TEMP, sizeof(struct nameidata), Z_WAITOK);
	11276	if (!ndp) {
	11277	return ENOMEM;
	11278	}
	11279
	11280	/*
	11281	* We do a lookup starting at the root of the mountpoint, unwilling
	11282	* to cross into other mountpoints.
	11283	*/
	11284	res = VFS_ROOT(mp, &rvp, ctx);
	11285	if (res != 0) {
	11286	goto out;
	11287	}
	11288
	11289	TRIG_LOG("Trying namei\n");
	11290
	11291	NDINIT(ndp, LOOKUP, OP_LOOKUP, USEDVP \| NOCROSSMOUNT \| FOLLOW, UIO_SYSSPACE,
	11292	CAST_USER_ADDR_T(relpath), ctx);
	11293	ndp->ni_dvp = rvp;
	11294	res = namei(ndp);
	11295	if (res != 0) {
	11296	vnode_put(rvp);
	11297	goto out;
	11298	}
	11299
	11300	vp = ndp->ni_vp;
	11301	nameidone(ndp);
	11302	vnode_put(rvp);
	11303
	11304	TRIG_LOG("Trying vnode_resolver_create()\n");
	11305
	11306	/*
	11307	* Set up blob. vnode_create() takes a larger structure
	11308	* with creation info, and we needed something different
	11309	* for this case. One needs to win, or we need to munge both;
	11310	* vnode_create() wins.
	11311	*/
	11312	bzero(&vtp, sizeof(vtp));
	11313	vtp.vnt_resolve_func = vtip->vti_resolve_func;
	11314	vtp.vnt_unresolve_func = vtip->vti_unresolve_func;
	11315	vtp.vnt_rearm_func = vtip->vti_rearm_func;
	11316	vtp.vnt_reclaim_func = vtip->vti_reclaim_func;
	11317	vtp.vnt_reclaim_func = vtip->vti_reclaim_func;
	11318	vtp.vnt_data = vtip->vti_data;
	11319	vtp.vnt_flags = vtip->vti_flags;
	11320
	11321	res = vnode_resolver_create(mp, vp, &vtp, TRUE);
	11322	vnode_put(vp);
	11323	out:
	11324	kheap_free(KHEAP_TEMP, ndp, sizeof(struct nameidata));
	11325	TRIG_LOG("Returning %d\n", res);
	11326	return res;
	11327	}
	11328
	11329	#endif /* CONFIG_TRIGGERS */
	11330
	11331	vm_offset_t
	11332	kdebug_vnode(vnode_t vp)
	11333	{
	11334	return VM_KERNEL_ADDRPERM(vp);
	11335	}
	11336
	11337	static int flush_cache_on_write = 0;
	11338	SYSCTL_INT(_kern, OID_AUTO, flush_cache_on_write,
	11339	CTLFLAG_RW \| CTLFLAG_LOCKED, &flush_cache_on_write, 0,
	11340	"always flush the drive cache on writes to uncached files");
	11341
	11342	int
	11343	vnode_should_flush_after_write(vnode_t vp, int ioflag)
	11344	{
	11345	return flush_cache_on_write
	11346	&& (ISSET(ioflag, IO_NOCACHE) \|\| vnode_isnocache(vp));
	11347	}
	11348
	11349	/*
	11350	* sysctl for use by disk I/O tracing tools to get the list of existing
	11351	* vnodes' paths
	11352	*/
	11353
	11354	#define NPATH_WORDS (MAXPATHLEN / sizeof(unsigned long))
	11355	struct vnode_trace_paths_context {
	11356	uint64_t count;
	11357	/*
	11358	* Must be a multiple of 4, then -1, for tracing!
	11359	*/
	11360	unsigned long path[NPATH_WORDS + (4 - (NPATH_WORDS % 4)) - 1];
	11361	};
	11362
	11363	static int
	11364	vnode_trace_path_callback(struct vnode vp, void vctx)
	11365	{
	11366	struct vnode_trace_paths_context *ctx = vctx;
	11367	size_t path_len = sizeof(ctx->path);
	11368
	11369	int getpath_len = (int)path_len;
	11370	if (vn_getpath(vp, (char *)ctx->path, &getpath_len) == 0) {
	11371	/* vn_getpath() NUL-terminates, and len includes the NUL. */
	11372	assert(getpath_len >= 0);
	11373	path_len = (size_t)getpath_len;
	11374
	11375	assert(path_len <= sizeof(ctx->path));
	11376	kdebug_vfs_lookup(ctx->path, (int)path_len, vp,
	11377	KDBG_VFS_LOOKUP_FLAG_LOOKUP \| KDBG_VFS_LOOKUP_FLAG_NOPROCFILT);
	11378
	11379	if (++(ctx->count) == 1000) {
	11380	thread_yield_to_preemption();
	11381	ctx->count = 0;
	11382	}
	11383	}
	11384
	11385	return VNODE_RETURNED;
	11386	}
	11387
	11388	static int
	11389	vfs_trace_paths_callback(mount_t mp, void *arg)
	11390	{
	11391	if (mp->mnt_flag & MNT_LOCAL) {
	11392	vnode_iterate(mp, VNODE_ITERATE_ALL, vnode_trace_path_callback, arg);
	11393	}
	11394
	11395	return VFS_RETURNED;
	11396	}
	11397
	11398	static int sysctl_vfs_trace_paths SYSCTL_HANDLER_ARGS {
	11399	struct vnode_trace_paths_context ctx;
	11400
	11401	(void)oidp;
	11402	(void)arg1;
	11403	(void)arg2;
	11404	(void)req;
	11405
	11406	if (!kauth_cred_issuser(kauth_cred_get())) {
	11407	return EPERM;
	11408	}
	11409
	11410	if (!kdebug_enable \|\| !kdebug_debugid_enabled(VFS_LOOKUP)) {
	11411	return EINVAL;
	11412	}
	11413
	11414	bzero(&ctx, sizeof(struct vnode_trace_paths_context));
	11415
	11416	vfs_iterate(0, vfs_trace_paths_callback, &ctx);
	11417
	11418	return 0;
	11419	}
	11420
	11421	SYSCTL_PROC(_vfs_generic, OID_AUTO, trace_paths, CTLFLAG_RD \| CTLFLAG_LOCKED \| CTLFLAG_MASKED, NULL, 0, &sysctl_vfs_trace_paths, "-", "trace_paths");