/*
 * Copyright (c) 2000-2018 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_subr.c	8.31 (Berkeley) 5/26/95
 */
/*
 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
 * support for mandatory and extensible security protections. This notice
 * is included in support of clause 2.2 (b) of the Apple Public License,
 * Version 2.0.
 */

/*
 * External virtual filesystem routines
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc_internal.h>
#include <sys/kauth.h>
#include <sys/mount_internal.h>
#include <sys/time.h>
#include <sys/lock.h>
#include <sys/vnode.h>
#include <sys/vnode_internal.h>
#include <sys/stat.h>
#include <sys/namei.h>
#include <sys/ucred.h>
#include <sys/buf_internal.h>
#include <sys/errno.h>
#include <sys/malloc.h>
#include <sys/uio_internal.h>
#include <sys/uio.h>
#include <sys/domain.h>
#include <sys/mbuf.h>
#include <sys/syslog.h>
#include <sys/ubc_internal.h>
#include <sys/vm.h>
#include <sys/sysctl.h>
#include <sys/filedesc.h>
#include <sys/event.h>
#include <sys/kdebug.h>
#include <sys/kauth.h>
#include <sys/user.h>
#include <sys/systm.h>
#include <sys/kern_memorystatus.h>
#include <sys/lockf.h>
#include <miscfs/fifofs/fifo.h>

#include <string.h>
#include <machine/machine_routines.h>

#include <kern/assert.h>
#include <mach/kern_return.h>
#include <kern/thread.h>
#include <kern/sched_prim.h>

#include <miscfs/specfs/specdev.h>

#include <mach/mach_types.h>
#include <mach/memory_object_types.h>
#include <mach/memory_object_control.h>

#include <kern/kalloc.h>	/* kalloc()/kfree() */
#include <kern/clock.h>		/* delay_for_interval() */
#include <libkern/OSAtomic.h>	/* OSAddAtomic() */
#if !CONFIG_EMBEDDED
#include <console/video_console.h>
#endif

#ifdef JOE_DEBUG
#include <libkern/OSDebug.h>
#endif

#include <vm/vm_protos.h>	/* vnode_pager_vrele() */

#if CONFIG_MACF
#include <security/mac_framework.h>
#endif

#include <vfs/vfs_disk_conditioner.h>
#include <libkern/section_keywords.h>

extern lck_grp_t *vnode_lck_grp;
extern lck_attr_t *vnode_lck_attr;

#if CONFIG_TRIGGERS
extern lck_grp_t *trigger_vnode_lck_grp;
extern lck_attr_t *trigger_vnode_lck_attr;
#endif

extern lck_mtx_t * mnt_list_mtx_lock;

enum vtype iftovt_tab[16] = {
	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
};
int vttoif_tab[9] = {
	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
	S_IFSOCK, S_IFIFO, S_IFMT,
};

/* XXX These should be in a BSD accessible Mach header, but aren't. */
extern void memory_object_mark_used(
	memory_object_control_t control);

extern void memory_object_mark_unused(
	memory_object_control_t control,
	boolean_t rage);

extern void memory_object_mark_io_tracking(
	memory_object_control_t control);

/* XXX next prototype should be from <nfs/nfs.h> */
extern int nfs_vinvalbuf(vnode_t, int, vfs_context_t, int);

extern int paniclog_append_noflush(const char *format, ...);

/* XXX next prototype should be from <libsa/stdlib.h> but conflicts with libkern */
__private_extern__ void qsort(
	void * array,
	size_t nmembers,
	size_t member_size,
	int (*)(const void *, const void *));

__private_extern__ void vntblinit(void);
__private_extern__ int unlink1(vfs_context_t, vnode_t, user_addr_t,
    enum uio_seg, int);

extern int system_inshutdown;

static void vnode_list_add(vnode_t);
static void vnode_async_list_add(vnode_t);
static void vnode_list_remove(vnode_t);
static void vnode_list_remove_locked(vnode_t);

static void vnode_abort_advlocks(vnode_t);
static errno_t vnode_drain(vnode_t);
static void vgone(vnode_t, int flags);
static void vclean(vnode_t vp, int flag);
static void vnode_reclaim_internal(vnode_t, int, int, int);

static void vnode_dropiocount(vnode_t);

static vnode_t checkalias(vnode_t vp, dev_t nvp_rdev);
static int vnode_reload(vnode_t);
static int vnode_isinuse_locked(vnode_t, int, int);

static int unmount_callback(mount_t, __unused void *);

static void insmntque(vnode_t vp, mount_t mp);
static int mount_getvfscnt(void);
static int mount_fillfsids(fsid_t *, int);
static void vnode_iterate_setup(mount_t);
int vnode_umount_preflight(mount_t, vnode_t, int);
static int vnode_iterate_prepare(mount_t);
static int vnode_iterate_reloadq(mount_t);
static void vnode_iterate_clear(mount_t);
static mount_t vfs_getvfs_locked(fsid_t *);
static int vn_create_reg(vnode_t dvp, vnode_t *vpp, struct nameidata *ndp,
    struct vnode_attr *vap, uint32_t flags, int fmode, uint32_t *statusp, vfs_context_t ctx);
static int vnode_authattr_new_internal(vnode_t dvp, struct vnode_attr *vap, int noauth, uint32_t *defaulted_fieldsp, vfs_context_t ctx);

errno_t rmdir_remove_orphaned_appleDouble(vnode_t, vfs_context_t, int *);

#ifdef JOE_DEBUG
static void record_vp(vnode_t vp, int count);
#endif

#if CONFIG_JETSAM && (DEVELOPMENT || DEBUG)
extern int bootarg_no_vnode_jetsam;	/* from bsd_init.c default value is 0 */
#endif /* CONFIG_JETSAM && (DEVELOPMENT || DEBUG) */

boolean_t root_is_CF_drive = FALSE;

#if CONFIG_TRIGGERS
static int vnode_resolver_create(mount_t, vnode_t, struct vnode_trigger_param *, boolean_t external);
static void vnode_resolver_detach(vnode_t);
#endif

TAILQ_HEAD(freelst, vnode) vnode_free_list;	/* vnode free list */
TAILQ_HEAD(deadlst, vnode) vnode_dead_list;	/* vnode dead list */
TAILQ_HEAD(async_work_lst, vnode) vnode_async_work_list;


TAILQ_HEAD(ragelst, vnode) vnode_rage_list;	/* vnode rapid age list */
struct timeval rage_tv;
int rage_limit = 0;
int ragevnodes = 0;

#define RAGE_LIMIT_MIN	100
#define RAGE_TIME_LIMIT	5

struct mntlist mountlist;			/* mounted filesystem list */
static int nummounts = 0;

#if DIAGNOSTIC
#define VLISTCHECK(fun, vp, list)	\
	if ((vp)->v_freelist.tqe_prev == (struct vnode **)0xdeadb) \
		panic("%s: %s vnode not on %slist", (fun), (list), (list));
#else
#define VLISTCHECK(fun, vp, list)
#endif /* DIAGNOSTIC */

#define VLISTNONE(vp)	\
	do {	\
		(vp)->v_freelist.tqe_next = (struct vnode *)0;	\
		(vp)->v_freelist.tqe_prev = (struct vnode **)0xdeadb;	\
	} while(0)

#define VONLIST(vp)	\
	((vp)->v_freelist.tqe_prev != (struct vnode **)0xdeadb)

/* remove a vnode from free vnode list */
#define VREMFREE(fun, vp)	\
	do {	\
		VLISTCHECK((fun), (vp), "free");	\
		TAILQ_REMOVE(&vnode_free_list, (vp), v_freelist);	\
		VLISTNONE((vp));	\
		freevnodes--;	\
	} while(0)


/* remove a vnode from dead vnode list */
#define VREMDEAD(fun, vp)	\
	do {	\
		VLISTCHECK((fun), (vp), "dead");	\
		TAILQ_REMOVE(&vnode_dead_list, (vp), v_freelist);	\
		VLISTNONE((vp));	\
		vp->v_listflag &= ~VLIST_DEAD;	\
		deadvnodes--;	\
	} while(0)


/* remove a vnode from async work vnode list */
#define VREMASYNC_WORK(fun, vp)	\
	do {	\
		VLISTCHECK((fun), (vp), "async_work");	\
		TAILQ_REMOVE(&vnode_async_work_list, (vp), v_freelist);	\
		VLISTNONE((vp));	\
		vp->v_listflag &= ~VLIST_ASYNC_WORK;	\
		async_work_vnodes--;	\
	} while(0)


/* remove a vnode from rage vnode list */
#define VREMRAGE(fun, vp)	\
	do {	\
		if ( !(vp->v_listflag & VLIST_RAGE))	\
			panic("VREMRAGE: vp not on rage list");	\
		VLISTCHECK((fun), (vp), "rage");	\
		TAILQ_REMOVE(&vnode_rage_list, (vp), v_freelist);	\
		VLISTNONE((vp));	\
		vp->v_listflag &= ~VLIST_RAGE;	\
		ragevnodes--;	\
	} while(0)

static void async_work_continue(void);

/*
 * Initialize the vnode management data structures.
 */
__private_extern__ void
vntblinit(void)
{
	thread_t thread = THREAD_NULL;

	TAILQ_INIT(&vnode_free_list);
	TAILQ_INIT(&vnode_rage_list);
	TAILQ_INIT(&vnode_dead_list);
	TAILQ_INIT(&vnode_async_work_list);
	TAILQ_INIT(&mountlist);

	microuptime(&rage_tv);
	rage_limit = desiredvnodes / 100;

	if (rage_limit < RAGE_LIMIT_MIN) {
		rage_limit = RAGE_LIMIT_MIN;
	}

	/*
	 * create worker threads
	 */
	kernel_thread_start((thread_continue_t)async_work_continue, NULL, &thread);
	thread_deallocate(thread);
}
/* the timeout is in 10 msecs */
int
vnode_waitforwrites(vnode_t vp, int output_target, int slpflag, int slptimeout, const char *msg)
{
	int error = 0;
	struct timespec ts;

	KERNEL_DEBUG(0x3010280 | DBG_FUNC_START, (int)vp, output_target, vp->v_numoutput, 0, 0);

	if (vp->v_numoutput > output_target) {
		slpflag |= PDROP;

		vnode_lock_spin(vp);

		while ((vp->v_numoutput > output_target) && error == 0) {
			if (output_target) {
				vp->v_flag |= VTHROTTLED;
			} else {
				vp->v_flag |= VBWAIT;
			}

			/* slptimeout is in 10 ms units; split into seconds and nanoseconds */
			ts.tv_sec = (slptimeout / 100);
			ts.tv_nsec = (slptimeout % 100) * 10 * NSEC_PER_USEC * 1000;
			error = msleep((caddr_t)&vp->v_numoutput, &vp->v_lock, (slpflag | (PRIBIO + 1)), msg, &ts);

			vnode_lock_spin(vp);
		}
		vnode_unlock(vp);
	}
	KERNEL_DEBUG(0x3010280 | DBG_FUNC_END, (int)vp, output_target, vp->v_numoutput, error, 0);

	return error;
}
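
/*
 * Illustrative sketch (not part of this file): a typical caller of
 * vnode_waitforwrites().  The helper name is hypothetical and the flag
 * and timeout values are just examples.  With output_target == 0 the
 * caller sleeps on VBWAIT until every write counted in v_numoutput has
 * completed.
 *
 *	static int
 *	example_wait_for_quiesce(vnode_t vp)
 *	{
 *		// interruptible wait, 500 ticks of 10 ms (about 5 seconds)
 *		return vnode_waitforwrites(vp, 0, PCATCH, 500, "example_quiesce");
 *	}
 */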

void
vnode_startwrite(vnode_t vp)
{
	OSAddAtomic(1, &vp->v_numoutput);
}


void
vnode_writedone(vnode_t vp)
{
	if (vp) {
		int need_wakeup = 0;

		OSAddAtomic(-1, &vp->v_numoutput);

		vnode_lock_spin(vp);

		if (vp->v_numoutput < 0) {
			panic("vnode_writedone: numoutput < 0");
		}

		if ((vp->v_flag & VTHROTTLED)) {
			vp->v_flag &= ~VTHROTTLED;
			need_wakeup = 1;
		}
		if ((vp->v_flag & VBWAIT) && (vp->v_numoutput == 0)) {
			vp->v_flag &= ~VBWAIT;
			need_wakeup = 1;
		}
		vnode_unlock(vp);

		if (need_wakeup) {
			wakeup((caddr_t)&vp->v_numoutput);
		}
	}
}
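
/*
 * Illustrative sketch (an assumption, not part of this file): the intended
 * pairing of the two routines above.  A writer bumps v_numoutput before
 * issuing an async write; the completion path drops it, waking anyone in
 * vnode_waitforwrites().  The helper below is hypothetical.
 *
 *	static void
 *	example_issue_async_write(vnode_t vp, buf_t bp)
 *	{
 *		vnode_startwrite(vp);		// account for the in-flight write
 *		if (buf_bawrite(bp)) {		// if issuing the write fails...
 *			vnode_writedone(vp);	// ...undo the accounting here
 *		}
 *		// on success, vnode_writedone() runs from I/O completion
 *	}
 */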


int
vnode_hasdirtyblks(vnode_t vp)
{
	struct cl_writebehind *wbp;

	/*
	 * Not taking the buf_mtxp as there is little
	 * point doing it. Even if the lock is taken the
	 * state can change right after that. If there
	 * needs to be a synchronization, it must be driven
	 * by the caller
	 */
	if (vp->v_dirtyblkhd.lh_first) {
		return 1;
	}

	if (!UBCINFOEXISTS(vp)) {
		return 0;
	}

	wbp = vp->v_ubcinfo->cl_wbehind;

	if (wbp && (wbp->cl_number || wbp->cl_scmap)) {
		return 1;
	}

	return 0;
}

int
vnode_hascleanblks(vnode_t vp)
{
	/*
	 * Not taking the buf_mtxp as there is little
	 * point doing it. Even if the lock is taken the
	 * state can change right after that. If there
	 * needs to be a synchronization, it must be driven
	 * by the caller
	 */
	if (vp->v_cleanblkhd.lh_first) {
		return 1;
	}
	return 0;
}

void
vnode_iterate_setup(mount_t mp)
{
	mp->mnt_lflag |= MNT_LITER;
}

int
vnode_umount_preflight(mount_t mp, vnode_t skipvp, int flags)
{
	vnode_t vp;

	TAILQ_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) {
		if (vp->v_type == VDIR) {
			continue;
		}
		if (vp == skipvp) {
			continue;
		}
		if ((flags & SKIPSYSTEM) && ((vp->v_flag & VSYSTEM) || (vp->v_flag & VNOFLUSH))) {
			continue;
		}
		if ((flags & SKIPSWAP) && (vp->v_flag & VSWAP)) {
			continue;
		}
		if ((flags & WRITECLOSE) && (vp->v_writecount == 0 || vp->v_type != VREG)) {
			continue;
		}

		/* Look for busy vnode */
		if ((vp->v_usecount != 0) && ((vp->v_usecount - vp->v_kusecount) != 0)) {
			return 1;
		} else if (vp->v_iocount > 0) {
			/* Busy if iocount is > 0 for more than 3 seconds */
			tsleep(&vp->v_iocount, PVFS, "vnode_drain_network", 3 * hz);
			if (vp->v_iocount > 0) {
				return 1;
			}
			continue;
		}
	}

	return 0;
}

/*
 * This routine prepares for iteration by moving all the vnodes to the
 * worker queue.  Called with the mount lock held.
 */
int
vnode_iterate_prepare(mount_t mp)
{
	vnode_t vp;

	if (TAILQ_EMPTY(&mp->mnt_vnodelist)) {
		/* nothing to do */
		return 0;
	}

	vp = TAILQ_FIRST(&mp->mnt_vnodelist);
	vp->v_mntvnodes.tqe_prev = &(mp->mnt_workerqueue.tqh_first);
	mp->mnt_workerqueue.tqh_first = mp->mnt_vnodelist.tqh_first;
	mp->mnt_workerqueue.tqh_last = mp->mnt_vnodelist.tqh_last;

	TAILQ_INIT(&mp->mnt_vnodelist);
	if (mp->mnt_newvnodes.tqh_first != NULL) {
		panic("vnode_iterate_prepare: newvnode when entering vnode");
	}
	TAILQ_INIT(&mp->mnt_newvnodes);

	return 1;
}


/* called with mount lock held */
int
vnode_iterate_reloadq(mount_t mp)
{
	int moved = 0;

	/* add the remaining entries in workerq to the end of mount vnode list */
	if (!TAILQ_EMPTY(&mp->mnt_workerqueue)) {
		struct vnode * mvp;
		mvp = TAILQ_LAST(&mp->mnt_vnodelist, vnodelst);

		/* Join the workerqueue entries to the mount vnode list */
		if (mvp) {
			mvp->v_mntvnodes.tqe_next = mp->mnt_workerqueue.tqh_first;
		} else {
			mp->mnt_vnodelist.tqh_first = mp->mnt_workerqueue.tqh_first;
		}
		mp->mnt_workerqueue.tqh_first->v_mntvnodes.tqe_prev = mp->mnt_vnodelist.tqh_last;
		mp->mnt_vnodelist.tqh_last = mp->mnt_workerqueue.tqh_last;
		TAILQ_INIT(&mp->mnt_workerqueue);
	}

	/* add the newvnodes to the head of mount vnode list */
	if (!TAILQ_EMPTY(&mp->mnt_newvnodes)) {
		struct vnode * nlvp;
		nlvp = TAILQ_LAST(&mp->mnt_newvnodes, vnodelst);

		mp->mnt_newvnodes.tqh_first->v_mntvnodes.tqe_prev = &mp->mnt_vnodelist.tqh_first;
		nlvp->v_mntvnodes.tqe_next = mp->mnt_vnodelist.tqh_first;
		if (mp->mnt_vnodelist.tqh_first) {
			mp->mnt_vnodelist.tqh_first->v_mntvnodes.tqe_prev = &nlvp->v_mntvnodes.tqe_next;
		} else {
			mp->mnt_vnodelist.tqh_last = mp->mnt_newvnodes.tqh_last;
		}
		mp->mnt_vnodelist.tqh_first = mp->mnt_newvnodes.tqh_first;
		TAILQ_INIT(&mp->mnt_newvnodes);
		moved = 1;
	}

	return moved;
}
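
/*
 * Illustrative note (an assumption, not part of this file): the pointer
 * surgery above amounts to two whole-queue splices.  The first half is
 * what the queue(3) concatenation macro would express as:
 *
 *	// append everything left on the worker queue to mnt_vnodelist
 *	TAILQ_CONCAT(&mp->mnt_vnodelist, &mp->mnt_workerqueue, v_mntvnodes);
 *
 * The second half prepends mnt_newvnodes, for which no direct macro
 * exists, so it is open-coded and fixes the tqe_prev back-links directly.
 */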


void
vnode_iterate_clear(mount_t mp)
{
	mp->mnt_lflag &= ~MNT_LITER;
}

#if !CONFIG_EMBEDDED

#include <i386/panic_hooks.h>

struct vnode_iterate_panic_hook {
	panic_hook_t hook;
	mount_t mp;
	struct vnode *vp;
};

static void
vnode_iterate_panic_hook(panic_hook_t *hook_)
{
	struct vnode_iterate_panic_hook *hook = (struct vnode_iterate_panic_hook *)hook_;
	panic_phys_range_t range;
	uint64_t phys;

	if (panic_phys_range_before(hook->mp, &phys, &range)) {
		paniclog_append_noflush("mp = %p, phys = %p, prev (%p: %p-%p)\n",
		    hook->mp, phys, range.type, range.phys_start,
		    range.phys_start + range.len);
	} else {
		paniclog_append_noflush("mp = %p, phys = %p, prev (!)\n", hook->mp, phys);
	}

	if (panic_phys_range_before(hook->vp, &phys, &range)) {
		paniclog_append_noflush("vp = %p, phys = %p, prev (%p: %p-%p)\n",
		    hook->vp, phys, range.type, range.phys_start,
		    range.phys_start + range.len);
	} else {
		paniclog_append_noflush("vp = %p, phys = %p, prev (!)\n", hook->vp, phys);
	}
	panic_dump_mem((void *)(((vm_offset_t)hook->mp - 4096) & ~4095), 12288);
}
#endif //CONFIG_EMBEDDED

int
vnode_iterate(mount_t mp, int flags, int (*callout)(struct vnode *, void *),
    void *arg)
{
	struct vnode *vp;
	int vid, retval;
	int ret = 0;

	/*
	 * The mount iterate mutex is held for the duration of the iteration.
	 * This can be done by a state flag on the mount structure but we can
	 * run into priority inversion issues sometimes.
	 * Using a mutex allows us to benefit from the priority donation
	 * mechanisms in the kernel for locks. This mutex should never be
	 * acquired in spin mode and it should be acquired before attempting to
	 * acquire the mount lock.
	 */
	mount_iterate_lock(mp);

	mount_lock(mp);

	vnode_iterate_setup(mp);

	/* If it returns 0 then there is nothing to do */
	retval = vnode_iterate_prepare(mp);

	if (retval == 0) {
		vnode_iterate_clear(mp);
		mount_unlock(mp);
		mount_iterate_unlock(mp);
		return ret;
	}

#if !CONFIG_EMBEDDED
	struct vnode_iterate_panic_hook hook;
	hook.mp = mp;
	hook.vp = NULL;
	panic_hook(&hook.hook, vnode_iterate_panic_hook);
#endif
	/* iterate over all the vnodes */
	while (!TAILQ_EMPTY(&mp->mnt_workerqueue)) {
		vp = TAILQ_FIRST(&mp->mnt_workerqueue);
#if !CONFIG_EMBEDDED
		hook.vp = vp;
#endif
		TAILQ_REMOVE(&mp->mnt_workerqueue, vp, v_mntvnodes);
		TAILQ_INSERT_TAIL(&mp->mnt_vnodelist, vp, v_mntvnodes);
		vid = vp->v_id;
		if ((vp->v_data == NULL) || (vp->v_type == VNON) || (vp->v_mount != mp)) {
			continue;
		}
		mount_unlock(mp);

		if (vget_internal(vp, vid, (flags | VNODE_NODEAD | VNODE_WITHID | VNODE_NOSUSPEND))) {
			mount_lock(mp);
			continue;
		}
		if (flags & VNODE_RELOAD) {
			/*
			 * we're reloading the filesystem
			 * cast out any inactive vnodes...
			 */
			if (vnode_reload(vp)) {
				/* vnode will be recycled on the refcount drop */
				vnode_put(vp);
				mount_lock(mp);
				continue;
			}
		}

		retval = callout(vp, arg);

		switch (retval) {
		case VNODE_RETURNED:
		case VNODE_RETURNED_DONE:
			vnode_put(vp);
			if (retval == VNODE_RETURNED_DONE) {
				mount_lock(mp);
				ret = 0;
				goto out;
			}
			break;

		case VNODE_CLAIMED_DONE:
			mount_lock(mp);
			ret = 0;
			goto out;
		case VNODE_CLAIMED:
		default:
			break;
		}
		mount_lock(mp);
	}

out:
#if !CONFIG_EMBEDDED
	panic_unhook(&hook.hook);
#endif
	(void)vnode_iterate_reloadq(mp);
	vnode_iterate_clear(mp);
	mount_unlock(mp);
	mount_iterate_unlock(mp);
	return ret;
}
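
/*
 * Illustrative sketch (not part of this file): a minimal vnode_iterate()
 * callout.  The callout runs with an iocount held on vp; returning
 * VNODE_RETURNED hands that iocount back to the iterator, which calls
 * vnode_put() as seen in the switch above.  The counting callout below
 * is an assumption for the example, not an existing xnu function.
 *
 *	static int
 *	example_count_regular(struct vnode *vp, void *arg)
 *	{
 *		int *countp = (int *)arg;
 *
 *		if (vp->v_type == VREG) {
 *			(*countp)++;
 *		}
 *		return VNODE_RETURNED;	// iterator drops the iocount
 *	}
 *
 *	// usage: int count = 0; vnode_iterate(mp, 0, example_count_regular, &count);
 */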

void
mount_lock_renames(mount_t mp)
{
	lck_mtx_lock(&mp->mnt_renamelock);
}

void
mount_unlock_renames(mount_t mp)
{
	lck_mtx_unlock(&mp->mnt_renamelock);
}

void
mount_iterate_lock(mount_t mp)
{
	lck_mtx_lock(&mp->mnt_iter_lock);
}

void
mount_iterate_unlock(mount_t mp)
{
	lck_mtx_unlock(&mp->mnt_iter_lock);
}

void
mount_lock(mount_t mp)
{
	lck_mtx_lock(&mp->mnt_mlock);
}

void
mount_lock_spin(mount_t mp)
{
	lck_mtx_lock_spin(&mp->mnt_mlock);
}

void
mount_unlock(mount_t mp)
{
	lck_mtx_unlock(&mp->mnt_mlock);
}

void
mount_ref(mount_t mp, int locked)
{
	if (!locked) {
		mount_lock_spin(mp);
	}

	mp->mnt_count++;

	if (!locked) {
		mount_unlock(mp);
	}
}


void
mount_drop(mount_t mp, int locked)
{
	if (!locked) {
		mount_lock_spin(mp);
	}

	mp->mnt_count--;

	if (mp->mnt_count == 0 && (mp->mnt_lflag & MNT_LDRAIN)) {
		wakeup(&mp->mnt_lflag);
	}

	if (!locked) {
		mount_unlock(mp);
	}
}

int
mount_iterref(mount_t mp, int locked)
{
	int retval = 0;

	if (!locked) {
		mount_list_lock();
	}
	if (mp->mnt_iterref < 0) {
		retval = 1;
	} else {
		mp->mnt_iterref++;
	}
	if (!locked) {
		mount_list_unlock();
	}
	return retval;
}

int
mount_isdrained(mount_t mp, int locked)
{
	int retval;

	if (!locked) {
		mount_list_lock();
	}
	if (mp->mnt_iterref < 0) {
		retval = 1;
	} else {
		retval = 0;
	}
	if (!locked) {
		mount_list_unlock();
	}
	return retval;
}

void
mount_iterdrop(mount_t mp)
{
	mount_list_lock();
	mp->mnt_iterref--;
	wakeup(&mp->mnt_iterref);
	mount_list_unlock();
}

void
mount_iterdrain(mount_t mp)
{
	mount_list_lock();
	while (mp->mnt_iterref) {
		msleep((caddr_t)&mp->mnt_iterref, mnt_list_mtx_lock, PVFS, "mount_iterdrain", NULL);
	}
	/* mount iterations drained */
	mp->mnt_iterref = -1;
	mount_list_unlock();
}

void
mount_iterreset(mount_t mp)
{
	mount_list_lock();
	if (mp->mnt_iterref == -1) {
		mp->mnt_iterref = 0;
	}
	mount_list_unlock();
}
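
/*
 * Illustrative sketch (not part of this file): the iterref protocol
 * implemented above.  mount_iterref() fails once mount_iterdrain() has
 * parked mnt_iterref at -1, and mount_iterreset() reopens the mount.
 *
 *	// quiesce iterators before exclusive work on the mount:
 *	mount_iterdrain(mp);
 *	// ... exclusive work ...
 *	mount_iterreset(mp);
 *
 *	// iterator side:
 *	if (mount_iterref(mp, 0) == 0) {
 *		// ... walk the mount ...
 *		mount_iterdrop(mp);
 *	}
 */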

/* always called with mount lock held */
int
mount_refdrain(mount_t mp)
{
	if (mp->mnt_lflag & MNT_LDRAIN) {
		panic("already in drain");
	}
	mp->mnt_lflag |= MNT_LDRAIN;

	while (mp->mnt_count) {
		msleep((caddr_t)&mp->mnt_lflag, &mp->mnt_mlock, PVFS, "mount_drain", NULL);
	}

	if (mp->mnt_vnodelist.tqh_first != NULL) {
		panic("mount_refdrain: dangling vnode");
	}

	mp->mnt_lflag &= ~MNT_LDRAIN;

	return 0;
}

/* Tags the mount point as not supporting extended readdir for NFS exports */
void
mount_set_noreaddirext(mount_t mp)
{
	mount_lock(mp);
	mp->mnt_kern_flag |= MNTK_DENY_READDIREXT;
	mount_unlock(mp);
}

/*
 * Mark a mount point as busy. Used to synchronize access and to delay
 * unmounting.
 */
int
vfs_busy(mount_t mp, int flags)
{
restart:
	if (mp->mnt_lflag & MNT_LDEAD) {
		return ENOENT;
	}

	mount_lock(mp);

	if (mp->mnt_lflag & MNT_LUNMOUNT) {
		if (flags & LK_NOWAIT || mp->mnt_lflag & MNT_LDEAD) {
			mount_unlock(mp);
			return ENOENT;
		}

		/*
		 * Since all busy locks are shared except the exclusive
		 * lock granted when unmounting, the only place that a
		 * wakeup needs to be done is at the release of the
		 * exclusive lock at the end of dounmount.
		 */
		mp->mnt_lflag |= MNT_LWAIT;
		msleep((caddr_t)mp, &mp->mnt_mlock, (PVFS | PDROP), "vfsbusy", NULL);
		return ENOENT;
	}

	mount_unlock(mp);

	lck_rw_lock_shared(&mp->mnt_rwlock);

	/*
	 * Until we are granted the rwlock, it's possible for the mount point to
	 * change state, so re-evaluate before granting the vfs_busy.
	 */
	if (mp->mnt_lflag & (MNT_LDEAD | MNT_LUNMOUNT)) {
		lck_rw_done(&mp->mnt_rwlock);
		goto restart;
	}
	return 0;
}

/*
 * Free a busy filesystem.
 */
void
vfs_unbusy(mount_t mp)
{
	lck_rw_done(&mp->mnt_rwlock);
}
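
/*
 * Illustrative sketch (not part of this file): the usual vfs_busy()
 * bracket around work that must not race an unmount.  LK_NOWAIT makes
 * the attempt fail with ENOENT instead of sleeping on an in-progress
 * unmount.
 *
 *	if (vfs_busy(mp, LK_NOWAIT) == 0) {
 *		// mp cannot be unmounted while the shared rwlock is held
 *		// ... inspect or update the mount ...
 *		vfs_unbusy(mp);
 *	}
 */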


static void
vfs_rootmountfailed(mount_t mp)
{
	mount_list_lock();
	mp->mnt_vtable->vfc_refcount--;
	mount_list_unlock();

	vfs_unbusy(mp);

	mount_lock_destroy(mp);

#if CONFIG_MACF
	mac_mount_label_destroy(mp);
#endif

	FREE_ZONE(mp, sizeof(struct mount), M_MOUNT);
}

/*
 * Lookup a filesystem type, and if found allocate and initialize
 * a mount structure for it.
 *
 * Devname is usually updated by mount(8) after booting.
 */
static mount_t
vfs_rootmountalloc_internal(struct vfstable *vfsp, const char *devname)
{
	mount_t mp;

	mp = _MALLOC_ZONE(sizeof(struct mount), M_MOUNT, M_WAITOK);
	bzero((char *)mp, sizeof(struct mount));

	/* Initialize the default IO constraints */
	mp->mnt_maxreadcnt = mp->mnt_maxwritecnt = MAXPHYS;
	mp->mnt_segreadcnt = mp->mnt_segwritecnt = 32;
	mp->mnt_maxsegreadsize = mp->mnt_maxreadcnt;
	mp->mnt_maxsegwritesize = mp->mnt_maxwritecnt;
	mp->mnt_devblocksize = DEV_BSIZE;
	mp->mnt_alignmentmask = PAGE_MASK;
	mp->mnt_ioqueue_depth = MNT_DEFAULT_IOQUEUE_DEPTH;
	mp->mnt_ioscale = 1;
	mp->mnt_ioflags = 0;
	mp->mnt_realrootvp = NULLVP;
	mp->mnt_authcache_ttl = CACHED_LOOKUP_RIGHT_TTL;
	mp->mnt_throttle_mask = LOWPRI_MAX_NUM_DEV - 1;
	mp->mnt_devbsdunit = 0;

	mount_lock_init(mp);
	(void)vfs_busy(mp, LK_NOWAIT);

	TAILQ_INIT(&mp->mnt_vnodelist);
	TAILQ_INIT(&mp->mnt_workerqueue);
	TAILQ_INIT(&mp->mnt_newvnodes);

	mp->mnt_vtable = vfsp;
	mp->mnt_op = vfsp->vfc_vfsops;
	mp->mnt_flag = MNT_RDONLY | MNT_ROOTFS;
	mp->mnt_vnodecovered = NULLVP;
	//mp->mnt_stat.f_type = vfsp->vfc_typenum;
	mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;

	mount_list_lock();
	vfsp->vfc_refcount++;
	mount_list_unlock();

	strlcpy(mp->mnt_vfsstat.f_fstypename, vfsp->vfc_name, MFSTYPENAMELEN);
	mp->mnt_vfsstat.f_mntonname[0] = '/';
	/* XXX const poisoning layering violation */
	(void) copystr((const void *)devname, mp->mnt_vfsstat.f_mntfromname, MAXPATHLEN - 1, NULL);

#if CONFIG_MACF
	mac_mount_label_init(mp);
	mac_mount_label_associate(vfs_context_kernel(), mp);
#endif
	return mp;
}

errno_t
vfs_rootmountalloc(const char *fstypename, const char *devname, mount_t *mpp)
{
	struct vfstable *vfsp;

	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
		if (!strncmp(vfsp->vfc_name, fstypename,
		    sizeof(vfsp->vfc_name))) {
			break;
		}
	}
	if (vfsp == NULL) {
		return ENODEV;
	}

	*mpp = vfs_rootmountalloc_internal(vfsp, devname);

	if (*mpp) {
		return 0;
	}

	return ENOMEM;
}
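
/*
 * Illustrative sketch (not part of this file): how a boot path might
 * obtain a template root mount for a named filesystem type.  "hfs" is
 * just an example type name.
 *
 *	mount_t mp = NULL;
 *
 *	if (vfs_rootmountalloc("hfs", "root_device", &mp) == 0) {
 *		// mp comes back vfs_busy'd, read-only, flagged MNT_ROOTFS;
 *		// the caller finishes the mount or undoes it (compare
 *		// vfs_mountroot() and vfs_rootmountfailed() nearby).
 *	}
 */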

#define DBG_MOUNTROOT (FSDBG_CODE(DBG_MOUNT, 0))

/*
 * Find an appropriate filesystem to use for the root. If a filesystem
 * has not been preselected, walk through the list of known filesystems
 * trying those that have mountroot routines, and try them until one
 * works or we have tried them all.
 */
extern int (*mountroot)(void);

int
vfs_mountroot(void)
{
#if CONFIG_MACF
	struct vnode *vp;
#endif
	struct vfstable *vfsp;
	vfs_context_t ctx = vfs_context_kernel();
	struct vfs_attr vfsattr;
	int error;
	mount_t mp;
	vnode_t bdevvp_rootvp;

	KDBG_RELEASE(DBG_MOUNTROOT | DBG_FUNC_START);
	if (mountroot != NULL) {
		/*
		 * used for netboot which follows a different set of rules
		 */
		error = (*mountroot)();

		KDBG_RELEASE(DBG_MOUNTROOT | DBG_FUNC_END, error, 0);
		return error;
	}
	if ((error = bdevvp(rootdev, &rootvp))) {
		printf("vfs_mountroot: can't setup bdevvp\n");

		KDBG_RELEASE(DBG_MOUNTROOT | DBG_FUNC_END, error, 1);
		return error;
	}
	/*
	 * 4951998 - code we call in vfc_mountroot may replace rootvp
	 * so keep a local copy for some housekeeping.
	 */
	bdevvp_rootvp = rootvp;
1111
1112 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
39037602 1113 if (vfsp->vfc_mountroot == NULL
0a7de745 1114 && !ISSET(vfsp->vfc_vfsflags, VFC_VFSCANMOUNTROOT)) {
91447636 1115 continue;
39037602 1116 }
91447636
A
1117
1118 mp = vfs_rootmountalloc_internal(vfsp, "root_device");
1119 mp->mnt_devvp = rootvp;
1120
0a7de745 1121 if (vfsp->vfc_mountroot) {
39037602 1122 error = (*vfsp->vfc_mountroot)(mp, rootvp, ctx);
0a7de745 1123 } else {
39037602 1124 error = VFS_MOUNT(mp, rootvp, 0, ctx);
0a7de745 1125 }
39037602
A
1126
1127 if (!error) {
0a7de745 1128 if (bdevvp_rootvp != rootvp) {
2d21ac55
A
1129 /*
1130 * rootvp changed...
1131 * bump the iocount and fix up mnt_devvp for the
1132 * new rootvp (it will already have a usecount taken)...
1133 * drop the iocount and the usecount on the orignal
1134 * since we are no longer going to use it...
1135 */
1136 vnode_getwithref(rootvp);
1137 mp->mnt_devvp = rootvp;
1138
0a7de745
A
1139 vnode_rele(bdevvp_rootvp);
1140 vnode_put(bdevvp_rootvp);
2d21ac55
A
1141 }
1142 mp->mnt_devvp->v_specflags |= SI_MOUNTEDON;
91447636 1143
2d21ac55 1144 vfs_unbusy(mp);
91447636
A
1145
1146 mount_list_add(mp);
1147
1148 /*
1149 * cache the IO attributes for the underlying physical media...
1150 * an error return indicates the underlying driver doesn't
1151 * support all the queries necessary... however, reasonable
1152 * defaults will have been set, so no reason to bail or care
1153 */
1154 vfs_init_io_attributes(rootvp, mp);
2d21ac55 1155
39037602 1156 if (mp->mnt_ioflags & MNT_IOFLAGS_FUSION_DRIVE) {
3e170ce0
A
1157 root_is_CF_drive = TRUE;
1158 }
39037602 1159
2d21ac55
A
1160 /*
1161 * Shadow the VFC_VFSNATIVEXATTR flag to MNTK_EXTENDED_ATTRS.
1162 */
1163 if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSNATIVEXATTR) {
1164 mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
1165 }
1166 if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSPREFLIGHT) {
1167 mp->mnt_kern_flag |= MNTK_UNMOUNT_PREFLIGHT;
1168 }
1169
#if !CONFIG_EMBEDDED
			uint32_t speed;

			if (MNTK_VIRTUALDEV & mp->mnt_kern_flag) {
				speed = 128;
			} else if (disk_conditioner_mount_is_ssd(mp)) {
				speed = 7 * 256;
			} else {
				speed = 256;
			}
			vc_progress_setdiskspeed(speed);
#endif
			/*
			 * Probe root file system for additional features.
			 */
			(void)VFS_START(mp, 0, ctx);

			VFSATTR_INIT(&vfsattr);
			VFSATTR_WANTED(&vfsattr, f_capabilities);
			if (vfs_getattr(mp, &vfsattr, ctx) == 0 &&
			    VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
				if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR) &&
				    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR)) {
					mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
				}
#if NAMEDSTREAMS
				if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS) &&
				    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS)) {
					mp->mnt_kern_flag |= MNTK_NAMED_STREAMS;
				}
#endif
				if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID) &&
				    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID)) {
					mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
				}

				if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS) &&
				    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS)) {
					mp->mnt_kern_flag |= MNTK_DIR_HARDLINKS;
				}
			}

			/*
			 * get rid of iocount reference returned
			 * by bdevvp (or picked up by us on the substituted
			 * rootvp)... it (or we) will have also taken
			 * a usecount reference which we want to keep
			 */
			vnode_put(rootvp);

#if CONFIG_MACF
			if ((vfs_flags(mp) & MNT_MULTILABEL) == 0) {
				KDBG_RELEASE(DBG_MOUNTROOT | DBG_FUNC_END, 0, 2);
				return 0;
			}

			error = VFS_ROOT(mp, &vp, ctx);
			if (error) {
				printf("%s() VFS_ROOT() returned %d\n",
				    __func__, error);
				dounmount(mp, MNT_FORCE, 0, ctx);
				goto fail;
			}
			error = vnode_label(mp, NULL, vp, NULL, 0, ctx);
			/*
			 * get rid of reference provided by VFS_ROOT
			 */
			vnode_put(vp);

			if (error) {
				printf("%s() vnode_label() returned %d\n",
				    __func__, error);
				dounmount(mp, MNT_FORCE, 0, ctx);
				goto fail;
			}
#endif
			KDBG_RELEASE(DBG_MOUNTROOT | DBG_FUNC_END, 0, 3);
			return 0;
		}
#if CONFIG_MACF
fail:
#endif
		vfs_rootmountfailed(mp);

		if (error != EINVAL) {
			printf("%s_mountroot failed: %d\n", vfsp->vfc_name, error);
		}
	}
	KDBG_RELEASE(DBG_MOUNTROOT | DBG_FUNC_END, error ? error : ENODEV, 4);
	return ENODEV;
}

/*
 * Lookup a mount point by filesystem identifier.
 */

struct mount *
vfs_getvfs(fsid_t *fsid)
{
	return mount_list_lookupby_fsid(fsid, 0, 0);
}

static struct mount *
vfs_getvfs_locked(fsid_t *fsid)
{
	return mount_list_lookupby_fsid(fsid, 1, 0);
}

struct mount *
vfs_getvfs_by_mntonname(char *path)
{
	mount_t retmp = (mount_t)0;
	mount_t mp;

	mount_list_lock();
	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
		if (!strncmp(mp->mnt_vfsstat.f_mntonname, path,
		    sizeof(mp->mnt_vfsstat.f_mntonname))) {
			retmp = mp;
			if (mount_iterref(retmp, 1)) {
				retmp = NULL;
			}
			goto out;
		}
	}
out:
	mount_list_unlock();
	return retmp;
}

/* generation number for creation of new fsids */
u_short mntid_gen = 0;
/*
 * Get a new unique fsid
 */
void
vfs_getnewfsid(struct mount *mp)
{
	fsid_t tfsid;
	int mtype;

	mount_list_lock();

	/* generate a new fsid */
	mtype = mp->mnt_vtable->vfc_typenum;
	if (++mntid_gen == 0) {
		mntid_gen++;
	}
	tfsid.val[0] = makedev(nblkdev + mtype, mntid_gen);
	tfsid.val[1] = mtype;

	while (vfs_getvfs_locked(&tfsid)) {
		if (++mntid_gen == 0) {
			mntid_gen++;
		}
		tfsid.val[0] = makedev(nblkdev + mtype, mntid_gen);
	}

	mp->mnt_vfsstat.f_fsid.val[0] = tfsid.val[0];
	mp->mnt_vfsstat.f_fsid.val[1] = tfsid.val[1];
	mount_list_unlock();
}
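
/*
 * Worked example (values assumed for illustration): for a filesystem
 * whose vfc_typenum is 17, with nblkdev == 14 and a fresh mntid_gen of 1,
 * the candidate fsid would be
 *
 *	tfsid.val[0] = makedev(14 + 17, 1);	// dev_t with major 31, minor 1
 *	tfsid.val[1] = 17;			// the filesystem type number
 *
 * and the while loop simply bumps mntid_gen until no mounted filesystem
 * already answers to that fsid.
 */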

/*
 * Routines having to do with the management of the vnode table.
 */
extern int(**dead_vnodeop_p)(void *);
long numvnodes, freevnodes, deadvnodes, async_work_vnodes;


int async_work_timed_out = 0;
int async_work_handled = 0;
int dead_vnode_wanted = 0;
int dead_vnode_waited = 0;

/*
 * Move a vnode from one mount queue to another.
 */
static void
insmntque(vnode_t vp, mount_t mp)
{
	mount_t lmp;
	/*
	 * Delete from old mount point vnode list, if on one.
	 */
	if ((lmp = vp->v_mount) != NULL && lmp != dead_mountp) {
		if ((vp->v_lflag & VNAMED_MOUNT) == 0) {
			panic("insmntque: vp not in mount vnode list");
		}
		vp->v_lflag &= ~VNAMED_MOUNT;

		mount_lock_spin(lmp);

		mount_drop(lmp, 1);

		if (vp->v_mntvnodes.tqe_next == NULL) {
			if (TAILQ_LAST(&lmp->mnt_vnodelist, vnodelst) == vp) {
				TAILQ_REMOVE(&lmp->mnt_vnodelist, vp, v_mntvnodes);
			} else if (TAILQ_LAST(&lmp->mnt_newvnodes, vnodelst) == vp) {
				TAILQ_REMOVE(&lmp->mnt_newvnodes, vp, v_mntvnodes);
			} else if (TAILQ_LAST(&lmp->mnt_workerqueue, vnodelst) == vp) {
				TAILQ_REMOVE(&lmp->mnt_workerqueue, vp, v_mntvnodes);
			}
		} else {
			vp->v_mntvnodes.tqe_next->v_mntvnodes.tqe_prev = vp->v_mntvnodes.tqe_prev;
			*vp->v_mntvnodes.tqe_prev = vp->v_mntvnodes.tqe_next;
		}
		vp->v_mntvnodes.tqe_next = NULL;
		vp->v_mntvnodes.tqe_prev = NULL;
		mount_unlock(lmp);
		return;
	}

	/*
	 * Insert into list of vnodes for the new mount point, if available.
	 */
	if ((vp->v_mount = mp) != NULL) {
		mount_lock_spin(mp);
		if ((vp->v_mntvnodes.tqe_next != 0) && (vp->v_mntvnodes.tqe_prev != 0)) {
			panic("vp already in mount list");
		}
		if (mp->mnt_lflag & MNT_LITER) {
			TAILQ_INSERT_HEAD(&mp->mnt_newvnodes, vp, v_mntvnodes);
		} else {
			TAILQ_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes);
		}
		if (vp->v_lflag & VNAMED_MOUNT) {
			panic("insmntque: vp already in mount vnode list");
		}
		vp->v_lflag |= VNAMED_MOUNT;
		mount_ref(mp, 1);
		mount_unlock(mp);
	}
}


/*
 * Create a vnode for a block device.
 * Used for root filesystem, argdev, and swap areas.
 * Also used for memory file system special devices.
 */
int
bdevvp(dev_t dev, vnode_t *vpp)
{
	vnode_t nvp;
	int error;
	struct vnode_fsparam vfsp;
	struct vfs_context context;

	if (dev == NODEV) {
		*vpp = NULLVP;
		return ENODEV;
	}

	context.vc_thread = current_thread();
	context.vc_ucred = FSCRED;

	vfsp.vnfs_mp = (struct mount *)0;
	vfsp.vnfs_vtype = VBLK;
	vfsp.vnfs_str = "bdevvp";
	vfsp.vnfs_dvp = NULL;
	vfsp.vnfs_fsnode = NULL;
	vfsp.vnfs_cnp = NULL;
	vfsp.vnfs_vops = spec_vnodeop_p;
	vfsp.vnfs_rdev = dev;
	vfsp.vnfs_filesize = 0;

	vfsp.vnfs_flags = VNFS_NOCACHE | VNFS_CANTCACHE;

	vfsp.vnfs_marksystem = 0;
	vfsp.vnfs_markroot = 0;

	if ((error = vnode_create(VNCREATE_FLAVOR, VCREATESIZE, &vfsp, &nvp))) {
		*vpp = NULLVP;
		return error;
	}
	vnode_lock_spin(nvp);
	nvp->v_flag |= VBDEVVP;
	nvp->v_tag = VT_NON;	/* set this to VT_NON so during aliasing it can be replaced */
	vnode_unlock(nvp);
	if ((error = vnode_ref(nvp))) {
		panic("bdevvp failed: vnode_ref");
		return error;
	}
	if ((error = VNOP_FSYNC(nvp, MNT_WAIT, &context))) {
		panic("bdevvp failed: fsync");
		return error;
	}
	if ((error = buf_invalidateblks(nvp, BUF_WRITE_DATA, 0, 0))) {
		panic("bdevvp failed: invalidateblks");
		return error;
	}

#if CONFIG_MACF
	/*
	 * XXXMAC: We can't put a MAC check here, the system will
	 * panic without this vnode.
	 */
#endif /* MAC */

	if ((error = VNOP_OPEN(nvp, FREAD, &context))) {
		panic("bdevvp failed: open");
		return error;
	}
	*vpp = nvp;

	return 0;
}
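
/*
 * Illustrative sketch (not part of this file): bdevvp() in use.  This is
 * the shape of the call vfs_mountroot() makes above.
 *
 *	vnode_t devvp = NULLVP;
 *
 *	if (bdevvp(rootdev, &devvp) == 0) {
 *		// devvp is a VBLK spec vnode holding both a usecount
 *		// (from vnode_ref) and an iocount; callers keep the
 *		// usecount and vnode_put() the iocount when done.
 *	}
 */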

/*
 * Check to see if the new vnode represents a special device
 * for which we already have a vnode (either because of
 * bdevvp() or because of a different vnode representing
 * the same block device). If such an alias exists, deallocate
 * the existing contents and return the aliased vnode. The
 * caller is responsible for filling it with its new contents.
 */
static vnode_t
checkalias(struct vnode *nvp, dev_t nvp_rdev)
{
	struct vnode *vp;
	struct vnode **vpp;
	struct specinfo *sin = NULL;
	int vid = 0;

	vpp = &speclisth[SPECHASH(nvp_rdev)];
loop:
	SPECHASH_LOCK();

	for (vp = *vpp; vp; vp = vp->v_specnext) {
		if (nvp_rdev == vp->v_rdev && nvp->v_type == vp->v_type) {
			vid = vp->v_id;
			break;
		}
	}
	SPECHASH_UNLOCK();

	if (vp) {
found_alias:
		if (vnode_getwithvid(vp, vid)) {
			goto loop;
		}
		/*
		 * Termination state is checked in vnode_getwithvid
		 */
		vnode_lock(vp);

		/*
		 * Alias, but not in use, so flush it out.
		 */
		if ((vp->v_iocount == 1) && (vp->v_usecount == 0)) {
			vnode_reclaim_internal(vp, 1, 1, 0);
			vnode_put_locked(vp);
			vnode_unlock(vp);
			goto loop;
		}
	}
	if (vp == NULL || vp->v_tag != VT_NON) {
		if (sin == NULL) {
			MALLOC_ZONE(sin, struct specinfo *, sizeof(struct specinfo),
			    M_SPECINFO, M_WAITOK);
		}

		nvp->v_specinfo = sin;
		bzero(nvp->v_specinfo, sizeof(struct specinfo));
		nvp->v_rdev = nvp_rdev;
		nvp->v_specflags = 0;
		nvp->v_speclastr = -1;
		nvp->v_specinfo->si_opencount = 0;
		nvp->v_specinfo->si_initted = 0;
		nvp->v_specinfo->si_throttleable = 0;

		SPECHASH_LOCK();

		/* We dropped the lock, someone could have added */
		if (vp == NULLVP) {
			for (vp = *vpp; vp; vp = vp->v_specnext) {
				if (nvp_rdev == vp->v_rdev && nvp->v_type == vp->v_type) {
					vid = vp->v_id;
					SPECHASH_UNLOCK();
					goto found_alias;
				}
			}
		}

		nvp->v_hashchain = vpp;
		nvp->v_specnext = *vpp;
		*vpp = nvp;

		if (vp != NULLVP) {
			nvp->v_specflags |= SI_ALIASED;
			vp->v_specflags |= SI_ALIASED;
			SPECHASH_UNLOCK();
			vnode_put_locked(vp);
			vnode_unlock(vp);
		} else {
			SPECHASH_UNLOCK();
		}

		return NULLVP;
	}

	if (sin) {
		FREE_ZONE(sin, sizeof(struct specinfo), M_SPECINFO);
	}

	if ((vp->v_flag & (VBDEVVP | VDEVFLUSH)) != 0) {
		return vp;
	}

	panic("checkalias with VT_NON vp that shouldn't: %p", vp);

	return vp;
}
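
/*
 * Illustrative summary (distilled from the code above, not an official
 * contract): checkalias() has three outcomes for its caller.
 *
 *	vnode_t avp = checkalias(nvp, rdev);
 *	// 1. NULLVP: nvp was given fresh specinfo and entered on the
 *	//    spec hash chain (both vnodes marked SI_ALIASED if an
 *	//    in-use alias exists); the caller keeps using nvp.
 *	// 2. a VT_NON vnode with VBDEVVP/VDEVFLUSH set: reuse that
 *	//    existing placeholder (e.g. from bdevvp()) instead of nvp.
 *	// 3. any other VT_NON vnode on the chain is a bug and panics.
 */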


/*
 * Get a reference on a particular vnode and lock it if requested.
 * If the vnode was on the inactive list, remove it from the list.
 * If the vnode was on the free list, remove it from the list and
 * move it to inactive list as needed.
 * The vnode lock bit is set if the vnode is being eliminated in
 * vgone. The process is awakened when the transition is completed,
 * and an error returned to indicate that the vnode is no longer
 * usable (possibly having been changed to a new file system type).
 */
int
vget_internal(vnode_t vp, int vid, int vflags)
{
	int error = 0;

	vnode_lock_spin(vp);

	if ((vflags & VNODE_WRITEABLE) && (vp->v_writecount == 0)) {
		/*
		 * vnode to be returned only if it has writers opened
		 */
		error = EINVAL;
	} else {
		error = vnode_getiocount(vp, vid, vflags);
	}

	vnode_unlock(vp);

	return error;
}

/*
 * Returns:	0			Success
 *		ENOENT			No such file or directory [terminating]
 */
int
vnode_ref(vnode_t vp)
{
	return vnode_ref_ext(vp, 0, 0);
}

/*
 * Returns:	0			Success
 *		ENOENT			No such file or directory [terminating]
 */
int
vnode_ref_ext(vnode_t vp, int fmode, int flags)
{
	int error = 0;

	vnode_lock_spin(vp);

	/*
	 * once all the current call sites have been fixed to ensure they have
	 * taken an iocount, we can toughen this assert up and insist that the
	 * iocount is non-zero... a non-zero usecount doesn't ensure correctness
	 */
	if (vp->v_iocount <= 0 && vp->v_usecount <= 0) {
		panic("vnode_ref_ext: vp %p has no valid reference %d, %d", vp, vp->v_iocount, vp->v_usecount);
	}

	/*
	 * if you are the owner of drain/termination, can acquire usecount
	 */
	if ((flags & VNODE_REF_FORCE) == 0) {
		if ((vp->v_lflag & (VL_DRAIN | VL_TERMINATE | VL_DEAD))) {
			if (vp->v_owner != current_thread()) {
				error = ENOENT;
				goto out;
			}
		}
	}
	vp->v_usecount++;

	if (fmode & FWRITE) {
		if (++vp->v_writecount <= 0) {
			panic("vnode_ref_ext: v_writecount");
		}
	}
	if (fmode & O_EVTONLY) {
		if (++vp->v_kusecount <= 0) {
			panic("vnode_ref_ext: v_kusecount");
		}
	}
	if (vp->v_flag & VRAGE) {
		struct uthread *ut;

		ut = get_bsdthread_info(current_thread());

		if (!(current_proc()->p_lflag & P_LRAGE_VNODES) &&
		    !(ut->uu_flag & UT_RAGE_VNODES)) {
			/*
			 * a 'normal' process accessed this vnode
			 * so make sure it's no longer marked
			 * for rapid aging... also, make sure
			 * it gets removed from the rage list...
			 * when v_usecount drops back to 0, it
			 * will be put back on the real free list
			 */
			vp->v_flag &= ~VRAGE;
			vp->v_references = 0;
			vnode_list_remove(vp);
		}
	}
	if (vp->v_usecount == 1 && vp->v_type == VREG && !(vp->v_flag & VSYSTEM)) {
		if (vp->v_ubcinfo) {
			vnode_lock_convert(vp);
			memory_object_mark_used(vp->v_ubcinfo->ui_control);
		}
	}
out:
	vnode_unlock(vp);

	return error;
}
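
/*
 * Illustrative sketch (not part of this file): the iocount/usecount
 * discipline the comment above alludes to.  A caller first takes a
 * short-term iocount, converts it to a long-lived usecount with
 * vnode_ref(), then drops the iocount.
 *
 *	if (vnode_getwithref(vp) == 0) {	// short-term iocount
 *		if (vnode_ref(vp) == 0) {	// long-term usecount
 *			// ... stash vp somewhere persistent ...
 *		}
 *		vnode_put(vp);			// drop the iocount
 *	}
 *	// later: vnode_rele(vp) releases the usecount (see below).
 */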


boolean_t
vnode_on_reliable_media(vnode_t vp)
{
	if (!(vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV) && (vp->v_mount->mnt_flag & MNT_LOCAL)) {
		return TRUE;
	}
	return FALSE;
}

static void
vnode_async_list_add(vnode_t vp)
{
	vnode_list_lock();

	if (VONLIST(vp) || (vp->v_lflag & (VL_TERMINATE | VL_DEAD))) {
		panic("vnode_async_list_add: %p is in wrong state", vp);
	}

	TAILQ_INSERT_HEAD(&vnode_async_work_list, vp, v_freelist);
	vp->v_listflag |= VLIST_ASYNC_WORK;

	async_work_vnodes++;

	vnode_list_unlock();

	wakeup(&vnode_async_work_list);
}


/*
 * put the vnode on the appropriate free list.
 * called with vnode LOCKED
 */
static void
vnode_list_add(vnode_t vp)
{
	boolean_t need_dead_wakeup = FALSE;

#if DIAGNOSTIC
	lck_mtx_assert(&vp->v_lock, LCK_MTX_ASSERT_OWNED);
#endif

again:

	/*
	 * if it is already on a list or has non zero references, return
	 */
	if (VONLIST(vp) || (vp->v_usecount != 0) || (vp->v_iocount != 0) || (vp->v_lflag & VL_TERMINATE)) {
		return;
	}

	/*
	 * In vclean, we might have deferred ditching locked buffers
	 * because something was still referencing them (indicated by
	 * usecount). We can ditch them now.
	 */
	if (ISSET(vp->v_lflag, VL_DEAD)
	    && (!LIST_EMPTY(&vp->v_cleanblkhd) || !LIST_EMPTY(&vp->v_dirtyblkhd))) {
		++vp->v_iocount;	// Probably not necessary, but harmless
#ifdef JOE_DEBUG
		record_vp(vp, 1);
#endif
		vnode_unlock(vp);
		buf_invalidateblks(vp, BUF_INVALIDATE_LOCKED, 0, 0);
		vnode_lock(vp);
		vnode_dropiocount(vp);
		goto again;
	}

	vnode_list_lock();

	if ((vp->v_flag & VRAGE) && !(vp->v_lflag & VL_DEAD)) {
		/*
		 * add the new guy to the appropriate end of the RAGE list
		 */
		if ((vp->v_flag & VAGE)) {
			TAILQ_INSERT_HEAD(&vnode_rage_list, vp, v_freelist);
		} else {
			TAILQ_INSERT_TAIL(&vnode_rage_list, vp, v_freelist);
		}

		vp->v_listflag |= VLIST_RAGE;
		ragevnodes++;

		/*
		 * reset the timestamp for the last inserted vp on the RAGE
		 * queue to let new_vnode know that it's not ok to start stealing
		 * from this list... as long as we're actively adding to this list
		 * we'll push out the vnodes we want to donate to the real free list
		 * once we stop pushing, we'll let some time elapse before we start
		 * stealing them in the new_vnode routine
		 */
		microuptime(&rage_tv);
	} else {
		/*
		 * if VL_DEAD, insert it at head of the dead list
		 * else insert at tail of LRU list or at head if VAGE is set
		 */
		if ((vp->v_lflag & VL_DEAD)) {
			TAILQ_INSERT_HEAD(&vnode_dead_list, vp, v_freelist);
			vp->v_listflag |= VLIST_DEAD;
			deadvnodes++;

			if (dead_vnode_wanted) {
				dead_vnode_wanted--;
				need_dead_wakeup = TRUE;
			}
		} else if ((vp->v_flag & VAGE)) {
			TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
			vp->v_flag &= ~VAGE;
			freevnodes++;
		} else {
			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
			freevnodes++;
		}
	}
	vnode_list_unlock();

	if (need_dead_wakeup == TRUE) {
		wakeup_one((caddr_t)&dead_vnode_wanted);
	}
}


/*
 * remove the vnode from the appropriate free list.
 * called with vnode LOCKED and
 * the list lock held
 */
static void
vnode_list_remove_locked(vnode_t vp)
{
    if (VONLIST(vp)) {
        /*
         * the v_listflag field is
         * protected by the vnode_list_lock
         */
        if (vp->v_listflag & VLIST_RAGE) {
            VREMRAGE("vnode_list_remove", vp);
        } else if (vp->v_listflag & VLIST_DEAD) {
            VREMDEAD("vnode_list_remove", vp);
        } else if (vp->v_listflag & VLIST_ASYNC_WORK) {
            VREMASYNC_WORK("vnode_list_remove", vp);
        } else {
            VREMFREE("vnode_list_remove", vp);
        }
    }
}


/*
 * remove the vnode from the appropriate free list.
 * called with vnode LOCKED
 */
static void
vnode_list_remove(vnode_t vp)
{
#if DIAGNOSTIC
    lck_mtx_assert(&vp->v_lock, LCK_MTX_ASSERT_OWNED);
#endif
    /*
     * we want to avoid taking the list lock
     * in the case where we're not on the free
     * list... this will be true for most
     * directories and any currently in use files
     *
     * we're guaranteed that we can't go from
     * the not-on-list state to the on-list
     * state since we hold the vnode lock...
     * all calls to vnode_list_add are done
     * under the vnode lock... so we can
     * check for that condition (the prevalent one)
     * without taking the list lock
     */
    if (VONLIST(vp)) {
        vnode_list_lock();
        /*
         * however, we're not guaranteed that
         * we won't go from the on-list state
         * to the not-on-list state until we
         * hold the vnode_list_lock... this
         * is due to "new_vnode" removing vnodes
         * from the free list under the list_lock
         * w/o the vnode lock... so we need to
         * check again whether we're currently
         * on the free list
         */
        vnode_list_remove_locked(vp);

        vnode_list_unlock();
    }
}


void
vnode_rele(vnode_t vp)
{
    vnode_rele_internal(vp, 0, 0, 0);
}


void
vnode_rele_ext(vnode_t vp, int fmode, int dont_reenter)
{
    vnode_rele_internal(vp, fmode, dont_reenter, 0);
}


void
vnode_rele_internal(vnode_t vp, int fmode, int dont_reenter, int locked)
{
    if (!locked) {
        vnode_lock_spin(vp);
    }
#if DIAGNOSTIC
    else {
        lck_mtx_assert(&vp->v_lock, LCK_MTX_ASSERT_OWNED);
    }
#endif
    if (--vp->v_usecount < 0) {
        panic("vnode_rele_ext: vp %p usecount -ve : %d.  v_tag = %d, v_type = %d, v_flag = %x.", vp, vp->v_usecount, vp->v_tag, vp->v_type, vp->v_flag);
    }

    if (fmode & FWRITE) {
        if (--vp->v_writecount < 0) {
            panic("vnode_rele_ext: vp %p writecount -ve : %d.  v_tag = %d, v_type = %d, v_flag = %x.", vp, vp->v_writecount, vp->v_tag, vp->v_type, vp->v_flag);
        }
    }
    if (fmode & O_EVTONLY) {
        if (--vp->v_kusecount < 0) {
            panic("vnode_rele_ext: vp %p kusecount -ve : %d.  v_tag = %d, v_type = %d, v_flag = %x.", vp, vp->v_kusecount, vp->v_tag, vp->v_type, vp->v_flag);
        }
    }
    if (vp->v_kusecount > vp->v_usecount) {
        panic("vnode_rele_ext: vp %p kusecount(%d) out of balance with usecount(%d).  v_tag = %d, v_type = %d, v_flag = %x.", vp, vp->v_kusecount, vp->v_usecount, vp->v_tag, vp->v_type, vp->v_flag);
    }

    if ((vp->v_iocount > 0) || (vp->v_usecount > 0)) {
        /*
         * vnode is still busy... if we're the last
         * usecount, mark for a future call to VNOP_INACTIVE
         * when the iocount finally drops to 0
         */
        if (vp->v_usecount == 0) {
            vp->v_lflag |= VL_NEEDINACTIVE;
            vp->v_flag &= ~(VNOCACHE_DATA | VRAOFF | VOPENEVT);
        }
        goto done;
    }
    vp->v_flag &= ~(VNOCACHE_DATA | VRAOFF | VOPENEVT);

    if (ISSET(vp->v_lflag, VL_TERMINATE | VL_DEAD) || dont_reenter) {
        /*
         * vnode is being cleaned, or
         * we've requested that we don't reenter
         * the filesystem on this release... in
         * the latter case, we'll mark the vnode aged
         */
        if (dont_reenter) {
            if (!(vp->v_lflag & (VL_TERMINATE | VL_DEAD | VL_MARKTERM))) {
                vp->v_lflag |= VL_NEEDINACTIVE;

                if (vnode_on_reliable_media(vp) == FALSE || vp->v_flag & VISDIRTY) {
                    vnode_async_list_add(vp);
                    goto done;
                }
            }
            vp->v_flag |= VAGE;
        }
        vnode_list_add(vp);

        goto done;
    }
    /*
     * at this point both the iocount and usecount
     * are zero
     * pick up an iocount so that we can call
     * VNOP_INACTIVE with the vnode lock unheld
     */
    vp->v_iocount++;
#ifdef JOE_DEBUG
    record_vp(vp, 1);
#endif
    vp->v_lflag &= ~VL_NEEDINACTIVE;
    vnode_unlock(vp);

    VNOP_INACTIVE(vp, vfs_context_current());

    vnode_lock_spin(vp);
    /*
     * because we dropped the vnode lock to call VNOP_INACTIVE
     * the state of the vnode may have changed... we may have
     * picked up an iocount, usecount or the MARKTERM may have
     * been set... we need to reevaluate the reference counts
     * to determine if we can call vnode_reclaim_internal at
     * this point... if the reference counts are up, we'll pick
     * up the MARKTERM state when they get subsequently dropped
     */
    if ((vp->v_iocount == 1) && (vp->v_usecount == 0) &&
        ((vp->v_lflag & (VL_MARKTERM | VL_TERMINATE | VL_DEAD)) == VL_MARKTERM)) {
        struct uthread *ut;

        ut = get_bsdthread_info(current_thread());

        if (ut->uu_defer_reclaims) {
            vp->v_defer_reclaimlist = ut->uu_vreclaims;
            ut->uu_vreclaims = vp;
            goto done;
        }
        vnode_lock_convert(vp);
        vnode_reclaim_internal(vp, 1, 1, 0);
    }
    vnode_dropiocount(vp);
    vnode_list_add(vp);
done:
    if (vp->v_usecount == 0 && vp->v_type == VREG && !(vp->v_flag & VSYSTEM)) {
        if (vp->v_ubcinfo) {
            vnode_lock_convert(vp);
            memory_object_mark_unused(vp->v_ubcinfo->ui_control, (vp->v_flag & VRAGE) == VRAGE);
        }
    }
    if (!locked) {
        vnode_unlock(vp);
    }
    return;
}
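
/*
 * Editor's illustration (not part of the original source): the counts
 * released above are taken by different interfaces.  An iocount
 * (vnode_get/vnode_getwithvid) is a short-term in-flight reference
 * dropped with vnode_put(); a usecount (vnode_ref) is a long-term
 * reference dropped with vnode_rele(), which is what may trigger the
 * VNOP_INACTIVE path shown in vnode_rele_internal.  A minimal sketch of
 * the pairing, assuming the caller already holds a valid vnode_t; the
 * function name is hypothetical.
 */
#if 0 /* example only */
static int
hold_vnode_example(vnode_t vp)
{
    int error;

    if ((error = vnode_get(vp)) != 0) {     /* take an iocount */
        return error;
    }
    if ((error = vnode_ref(vp)) == 0) {     /* take a usecount */
        /* ... safe to use the vnode across blocking operations ... */
        vnode_rele(vp);                     /* drop usecount; may lead to VNOP_INACTIVE */
    }
    vnode_put(vp);                          /* drop the iocount */
    return error;
}
#endif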

/*
 * Remove any vnodes in the vnode table belonging to mount point mp.
 *
 * If MNT_NOFORCE is specified, there should not be any active ones;
 * return an error if any are found (nb: this is a user error, not a
 * system error).  If MNT_FORCE is specified, detach any active vnodes
 * that are found.
 */
#if DIAGNOSTIC
int busyprt = 0;        /* print out busy vnodes */
#endif

int
vflush(struct mount *mp, struct vnode *skipvp, int flags)
{
    struct vnode *vp;
    int busy = 0;
    int reclaimed = 0;
    int retval;
    unsigned int vid;

    /*
     * See comments in vnode_iterate() for the rationale for this lock
     */
    mount_iterate_lock(mp);

    mount_lock(mp);
    vnode_iterate_setup(mp);
    /*
     * On regular (not forced) unmounts, do a quick check for vnodes
     * in use.  This preserves the caching of vnodes.  The automounter
     * tries unmounting every so often to see whether the volume is
     * still busy or not.
     */
    if (((flags & FORCECLOSE) == 0) && ((mp->mnt_kern_flag & MNTK_UNMOUNT_PREFLIGHT) != 0)) {
        if (vnode_umount_preflight(mp, skipvp, flags)) {
            vnode_iterate_clear(mp);
            mount_unlock(mp);
            mount_iterate_unlock(mp);
            return EBUSY;
        }
    }
loop:
    /* If it returns 0 then there is nothing to do */
    retval = vnode_iterate_prepare(mp);

    if (retval == 0) {
        vnode_iterate_clear(mp);
        mount_unlock(mp);
        mount_iterate_unlock(mp);
        return retval;
    }

    /* iterate over all the vnodes */
    while (!TAILQ_EMPTY(&mp->mnt_workerqueue)) {
        vp = TAILQ_FIRST(&mp->mnt_workerqueue);
        TAILQ_REMOVE(&mp->mnt_workerqueue, vp, v_mntvnodes);
        TAILQ_INSERT_TAIL(&mp->mnt_vnodelist, vp, v_mntvnodes);

        if ((vp->v_mount != mp) || (vp == skipvp)) {
            continue;
        }
        vid = vp->v_id;
        mount_unlock(mp);

        vnode_lock_spin(vp);

        // If vnode is already terminating, wait for it...
        while (vp->v_id == vid && ISSET(vp->v_lflag, VL_TERMINATE)) {
            vp->v_lflag |= VL_TERMWANT;
            msleep(&vp->v_lflag, &vp->v_lock, PVFS, "vflush", NULL);
        }

        if ((vp->v_id != vid) || ISSET(vp->v_lflag, VL_DEAD)) {
            vnode_unlock(vp);
            mount_lock(mp);
            continue;
        }

        /*
         * If requested, skip over vnodes marked VSYSTEM.
         * Skip over all vnodes marked VNOFLUSH.
         */
        if ((flags & SKIPSYSTEM) && ((vp->v_flag & VSYSTEM) ||
            (vp->v_flag & VNOFLUSH))) {
            vnode_unlock(vp);
            mount_lock(mp);
            continue;
        }
        /*
         * If requested, skip over vnodes marked VSWAP.
         */
        if ((flags & SKIPSWAP) && (vp->v_flag & VSWAP)) {
            vnode_unlock(vp);
            mount_lock(mp);
            continue;
        }
        /*
         * If requested, skip over vnodes marked VROOT.
         */
        if ((flags & SKIPROOT) && (vp->v_flag & VROOT)) {
            vnode_unlock(vp);
            mount_lock(mp);
            continue;
        }
        /*
         * If WRITECLOSE is set, only flush out regular file
         * vnodes open for writing.
         */
        if ((flags & WRITECLOSE) &&
            (vp->v_writecount == 0 || vp->v_type != VREG)) {
            vnode_unlock(vp);
            mount_lock(mp);
            continue;
        }
        /*
         * If the real usecount is 0, all we need to do is clear
         * out the vnode data structures and we are done.
         */
        if (((vp->v_usecount == 0) ||
            ((vp->v_usecount - vp->v_kusecount) == 0))) {
            vnode_lock_convert(vp);
            vp->v_iocount++;    /* so that drain waits for other iocounts */
#ifdef JOE_DEBUG
            record_vp(vp, 1);
#endif
            vnode_reclaim_internal(vp, 1, 1, 0);
            vnode_dropiocount(vp);
            vnode_list_add(vp);
            vnode_unlock(vp);

            reclaimed++;
            mount_lock(mp);
            continue;
        }
        /*
         * If FORCECLOSE is set, forcibly close the vnode.
         * For block or character devices, revert to an
         * anonymous device.  For all other files, just kill them.
         */
        if (flags & FORCECLOSE) {
            vnode_lock_convert(vp);

            if (vp->v_type != VBLK && vp->v_type != VCHR) {
                vp->v_iocount++;    /* so that drain waits for other iocounts */
#ifdef JOE_DEBUG
                record_vp(vp, 1);
#endif
                vnode_abort_advlocks(vp);
                vnode_reclaim_internal(vp, 1, 1, 0);
                vnode_dropiocount(vp);
                vnode_list_add(vp);
                vnode_unlock(vp);
            } else {
                vclean(vp, 0);
                vp->v_lflag &= ~VL_DEAD;
                vp->v_op = spec_vnodeop_p;
                vp->v_flag |= VDEVFLUSH;
                vnode_unlock(vp);
            }
            mount_lock(mp);
            continue;
        }
#if DIAGNOSTIC
        if (busyprt) {
            vprint("vflush: busy vnode", vp);
        }
#endif
        vnode_unlock(vp);
        mount_lock(mp);
        busy++;
    }

    /* At this point the worker queue is completed */
    if (busy && ((flags & FORCECLOSE) == 0) && reclaimed) {
        busy = 0;
        reclaimed = 0;
        (void)vnode_iterate_reloadq(mp);
        /* returned with mount lock held */
        goto loop;
    }

    /* if new vnodes were created in between, retry the reclaim */
    if (vnode_iterate_reloadq(mp) != 0) {
        if (!(busy && ((flags & FORCECLOSE) == 0))) {
            goto loop;
        }
    }
    vnode_iterate_clear(mp);
    mount_unlock(mp);
    mount_iterate_unlock(mp);

    if (busy && ((flags & FORCECLOSE) == 0)) {
        return EBUSY;
    }
    return 0;
}
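
/*
 * Editor's illustration (not part of the original source): a typical
 * caller of vflush() is a file system's unmount path, which skips its
 * own root vnode and forces the flush only when the user requested a
 * forced unmount.  A fragment of that pattern; 'root_vp' and 'force'
 * are hypothetical.
 */
#if 0 /* example only */
    error = vflush(mp, root_vp, SKIPSYSTEM | (force ? FORCECLOSE : 0));
    if (error == EBUSY) {
        /* some vnodes are still referenced and FORCECLOSE was not set */
    }
#endif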

long num_recycledvnodes = 0;
/*
 * Disassociate the underlying file system from a vnode.
 * The vnode lock is held on entry.
 */
static void
vclean(vnode_t vp, int flags)
{
    vfs_context_t ctx = vfs_context_current();
    int active;
    int need_inactive;
    int already_terminating;
    int clflags = 0;
#if NAMEDSTREAMS
    int is_namedstream;
#endif

    /*
     * Check to see if the vnode is in use.
     * If so we have to reference it before we clean it out
     * so that its count cannot fall to zero and generate a
     * race against ourselves to recycle it.
     */
    active = vp->v_usecount;

    /*
     * just in case we missed sending a needed
     * VNOP_INACTIVE, we'll do it now
     */
    need_inactive = (vp->v_lflag & VL_NEEDINACTIVE);

    vp->v_lflag &= ~VL_NEEDINACTIVE;

    /*
     * Prevent the vnode from being recycled or
     * brought into use while we clean it out.
     */
    already_terminating = (vp->v_lflag & VL_TERMINATE);

    vp->v_lflag |= VL_TERMINATE;

#if NAMEDSTREAMS
    is_namedstream = vnode_isnamedstream(vp);
#endif

    vnode_unlock(vp);

    OSAddAtomicLong(1, &num_recycledvnodes);

    if (flags & DOCLOSE) {
        clflags |= IO_NDELAY;
    }
    if (flags & REVOKEALL) {
        clflags |= IO_REVOKE;
    }

    if (active && (flags & DOCLOSE)) {
        VNOP_CLOSE(vp, clflags, ctx);
    }

    /*
     * Clean out any buffers associated with the vnode.
     */
    if (flags & DOCLOSE) {
#if NFSCLIENT
        if (vp->v_tag == VT_NFS) {
            nfs_vinvalbuf(vp, V_SAVE, ctx, 0);
        } else
#endif
        {
            VNOP_FSYNC(vp, MNT_WAIT, ctx);

            /*
             * If the vnode is still in use (by the journal for
             * example) we don't want to invalidate locked buffers
             * here.  In that case, either the journal will tidy them
             * up, or we will deal with it when the usecount is
             * finally released in vnode_rele_internal.
             */
            buf_invalidateblks(vp, BUF_WRITE_DATA | (active ? 0 : BUF_INVALIDATE_LOCKED), 0, 0);
        }
        if (UBCINFOEXISTS(vp)) {
            /*
             * Clean the pages in VM.
             */
            (void)ubc_msync(vp, (off_t)0, ubc_getsize(vp), NULL, UBC_PUSHALL | UBC_INVALIDATE | UBC_SYNC);
        }
    }
    if (active || need_inactive) {
        VNOP_INACTIVE(vp, ctx);
    }

#if NAMEDSTREAMS
    if ((is_namedstream != 0) && (vp->v_parent != NULLVP)) {
        vnode_t pvp = vp->v_parent;

        /* Delete the shadow stream file before we reclaim its vnode */
        if (vnode_isshadow(vp)) {
            vnode_relenamedstream(pvp, vp);
        }

        /*
         * No more streams associated with the parent.  We
         * have a ref on it, so its identity is stable.
         * If the parent is on an opaque volume, then we need to know
         * whether it has associated named streams.
         */
        if (vfs_authopaque(pvp->v_mount)) {
            vnode_lock_spin(pvp);
            pvp->v_lflag &= ~VL_HASSTREAMS;
            vnode_unlock(pvp);
        }
    }
#endif

    /*
     * Destroy ubc named reference
     * cluster_release is done on this path
     * along with dropping the reference on the ucred
     * (and in the case of forced unmount of an mmap-ed file,
     * the ubc reference on the vnode is dropped here too).
     */
    ubc_destroy_named(vp);

#if CONFIG_TRIGGERS
    /*
     * cleanup trigger info from vnode (if any)
     */
    if (vp->v_resolve) {
        vnode_resolver_detach(vp);
    }
#endif

    /*
     * Reclaim the vnode.
     */
    if (VNOP_RECLAIM(vp, ctx)) {
        panic("vclean: cannot reclaim");
    }

    // make sure the name & parent ptrs get cleaned out!
    vnode_update_identity(vp, NULLVP, NULL, 0, 0, VNODE_UPDATE_PARENT | VNODE_UPDATE_NAME | VNODE_UPDATE_PURGE);

    vnode_lock(vp);

    /*
     * Remove the vnode from any mount list it might be on.  It is not
     * safe to do this any earlier because unmount needs to wait for
     * any vnodes to terminate and it cannot do that if it cannot find
     * them.
     */
    insmntque(vp, (struct mount *)0);

    vp->v_mount = dead_mountp;
    vp->v_op = dead_vnodeop_p;
    vp->v_tag = VT_NON;
    vp->v_data = NULL;

    vp->v_lflag |= VL_DEAD;
    vp->v_flag &= ~VISDIRTY;

    if (already_terminating == 0) {
        vp->v_lflag &= ~VL_TERMINATE;
        /*
         * Done with purge, notify sleepers of the grim news.
         */
        if (vp->v_lflag & VL_TERMWANT) {
            vp->v_lflag &= ~VL_TERMWANT;
            wakeup(&vp->v_lflag);
        }
    }
}

/*
 * Eliminate all activity associated with the requested vnode
 * and with all vnodes aliased to the requested vnode.
 */
int
#if DIAGNOSTIC
vn_revoke(vnode_t vp, int flags, __unused vfs_context_t a_context)
#else
vn_revoke(vnode_t vp, __unused int flags, __unused vfs_context_t a_context)
#endif
{
    struct vnode *vq;
    int vid;

#if DIAGNOSTIC
    if ((flags & REVOKEALL) == 0) {
        panic("vnop_revoke");
    }
#endif

    if (vnode_isaliased(vp)) {
        /*
         * If a vgone (or vclean) is already in progress,
         * return an immediate error
         */
        if (vp->v_lflag & VL_TERMINATE) {
            return ENOENT;
        }

        /*
         * Ensure that vp will not be vgone'd while we
         * are eliminating its aliases.
         */
        SPECHASH_LOCK();
        while ((vp->v_specflags & SI_ALIASED)) {
            for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
                if (vq->v_rdev != vp->v_rdev ||
                    vq->v_type != vp->v_type || vp == vq) {
                    continue;
                }
                vid = vq->v_id;
                SPECHASH_UNLOCK();
                if (vnode_getwithvid(vq, vid)) {
                    SPECHASH_LOCK();
                    break;
                }
                vnode_lock(vq);
                if (!(vq->v_lflag & VL_TERMINATE)) {
                    vnode_reclaim_internal(vq, 1, 1, 0);
                }
                vnode_put_locked(vq);
                vnode_unlock(vq);
                SPECHASH_LOCK();
                break;
            }
        }
        SPECHASH_UNLOCK();
    }
    vnode_lock(vp);
    if (vp->v_lflag & VL_TERMINATE) {
        vnode_unlock(vp);
        return ENOENT;
    }
    vnode_reclaim_internal(vp, 1, 0, REVOKEALL);
    vnode_unlock(vp);

    return 0;
}

/*
 * Recycle an unused vnode to the front of the free list.
 * Release the passed interlock if the vnode will be recycled.
 */
int
vnode_recycle(struct vnode *vp)
{
    vnode_lock_spin(vp);

    if (vp->v_iocount || vp->v_usecount) {
        vp->v_lflag |= VL_MARKTERM;
        vnode_unlock(vp);
        return 0;
    }
    vnode_lock_convert(vp);
    vnode_reclaim_internal(vp, 1, 0, 0);

    vnode_unlock(vp);

    return 1;
}

static int
vnode_reload(vnode_t vp)
{
    vnode_lock_spin(vp);

    if ((vp->v_iocount > 1) || vp->v_usecount) {
        vnode_unlock(vp);
        return 0;
    }
    if (vp->v_iocount <= 0) {
        panic("vnode_reload with no iocount %d", vp->v_iocount);
    }

    /* mark for release when the iocount is dropped */
    vp->v_lflag |= VL_MARKTERM;
    vnode_unlock(vp);

    return 1;
}


static void
vgone(vnode_t vp, int flags)
{
    struct vnode *vq;
    struct vnode *vx;

    /*
     * Clean out the filesystem specific data.
     * vclean also takes care of removing the
     * vnode from any mount list it might be on
     */
    vclean(vp, flags | DOCLOSE);

    /*
     * If special device, remove it from the special device alias list
     * if it is on one.
     */
    if ((vp->v_type == VBLK || vp->v_type == VCHR) && vp->v_specinfo != 0) {
        SPECHASH_LOCK();
        if (*vp->v_hashchain == vp) {
            *vp->v_hashchain = vp->v_specnext;
        } else {
            for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
                if (vq->v_specnext != vp) {
                    continue;
                }
                vq->v_specnext = vp->v_specnext;
                break;
            }
            if (vq == NULL) {
                panic("missing bdev");
            }
        }
        if (vp->v_specflags & SI_ALIASED) {
            vx = NULL;
            for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
                if (vq->v_rdev != vp->v_rdev ||
                    vq->v_type != vp->v_type) {
                    continue;
                }
                if (vx) {
                    break;
                }
                vx = vq;
            }
            if (vx == NULL) {
                panic("missing alias");
            }
            if (vq == NULL) {
                vx->v_specflags &= ~SI_ALIASED;
            }
            vp->v_specflags &= ~SI_ALIASED;
        }
        SPECHASH_UNLOCK();
        {
            struct specinfo *tmp = vp->v_specinfo;
            vp->v_specinfo = NULL;
            FREE_ZONE(tmp, sizeof(struct specinfo), M_SPECINFO);
        }
    }
}

/*
 * Look up a vnode by device number.
 */
int
check_mountedon(dev_t dev, enum vtype type, int *errorp)
{
    vnode_t vp;
    int rc = 0;
    int vid;

loop:
    SPECHASH_LOCK();
    for (vp = speclisth[SPECHASH(dev)]; vp; vp = vp->v_specnext) {
        if (dev != vp->v_rdev || type != vp->v_type) {
            continue;
        }
        vid = vp->v_id;
        SPECHASH_UNLOCK();
        if (vnode_getwithvid(vp, vid)) {
            goto loop;
        }
        vnode_lock_spin(vp);
        if ((vp->v_usecount > 0) || (vp->v_iocount > 1)) {
            vnode_unlock(vp);
            if ((*errorp = vfs_mountedon(vp)) != 0) {
                rc = 1;
            }
        } else {
            vnode_unlock(vp);
        }
        vnode_put(vp);
        return rc;
    }
    SPECHASH_UNLOCK();
    return 0;
}

/*
 * Calculate the total number of references to a special device.
 */
int
vcount(vnode_t vp)
{
    vnode_t vq, vnext;
    int count;
    int vid;

    if (!vnode_isspec(vp)) {
        return vp->v_usecount - vp->v_kusecount;
    }

loop:
    if (!vnode_isaliased(vp)) {
        return vp->v_specinfo->si_opencount;
    }
    count = 0;

    SPECHASH_LOCK();
    /*
     * Grab first vnode and its vid.
     */
    vq = *vp->v_hashchain;
    vid = vq ? vq->v_id : 0;

    SPECHASH_UNLOCK();

    while (vq) {
        /*
         * Attempt to get the vnode outside the SPECHASH lock.
         */
        if (vnode_getwithvid(vq, vid)) {
            goto loop;
        }
        vnode_lock(vq);

        if (vq->v_rdev == vp->v_rdev && vq->v_type == vp->v_type) {
            if ((vq->v_usecount == 0) && (vq->v_iocount == 1) && vq != vp) {
                /*
                 * Alias, but not in use, so flush it out.
                 */
                vnode_reclaim_internal(vq, 1, 1, 0);
                vnode_put_locked(vq);
                vnode_unlock(vq);
                goto loop;
            }
            count += vq->v_specinfo->si_opencount;
        }
        vnode_unlock(vq);

        SPECHASH_LOCK();
        /*
         * must do this with the reference still held on 'vq'
         * so that it can't be destroyed while we're poking
         * through v_specnext
         */
        vnext = vq->v_specnext;
        vid = vnext ? vnext->v_id : 0;

        SPECHASH_UNLOCK();

        vnode_put(vq);

        vq = vnext;
    }

    return count;
}

int prtactive = 0;      /* 1 => print out reclaim of active vnodes */

/*
 * Print out a description of a vnode.
 */
static const char *typename[] =
{ "VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD" };

void
vprint(const char *label, struct vnode *vp)
{
    char sbuf[64];

    if (label != NULL) {
        printf("%s: ", label);
    }
    printf("type %s, usecount %d, writecount %d",
        typename[vp->v_type], vp->v_usecount, vp->v_writecount);
    sbuf[0] = '\0';
    if (vp->v_flag & VROOT) {
        strlcat(sbuf, "|VROOT", sizeof(sbuf));
    }
    if (vp->v_flag & VTEXT) {
        strlcat(sbuf, "|VTEXT", sizeof(sbuf));
    }
    if (vp->v_flag & VSYSTEM) {
        strlcat(sbuf, "|VSYSTEM", sizeof(sbuf));
    }
    if (vp->v_flag & VNOFLUSH) {
        strlcat(sbuf, "|VNOFLUSH", sizeof(sbuf));
    }
    if (vp->v_flag & VBWAIT) {
        strlcat(sbuf, "|VBWAIT", sizeof(sbuf));
    }
    if (vnode_isaliased(vp)) {
        strlcat(sbuf, "|VALIASED", sizeof(sbuf));
    }
    if (sbuf[0] != '\0') {
        printf(" flags (%s)", &sbuf[1]);
    }
}


int
vn_getpath(struct vnode *vp, char *pathbuf, int *len)
{
    return build_path(vp, pathbuf, *len, len, BUILDPATH_NO_FS_ENTER, vfs_context_current());
}

int
vn_getpath_fsenter(struct vnode *vp, char *pathbuf, int *len)
{
    return build_path(vp, pathbuf, *len, len, 0, vfs_context_current());
}

/*
 * vn_getpath_fsenter_with_parent will reenter the file system to find the path of the
 * vnode.  It requires that there are IO counts on both the vnode and the directory vnode.
 *
 * vn_getpath_fsenter is called by MAC hooks to authorize operations for everything but
 * unlink, rmdir and rename.  For those operations the MAC hook calls vn_getpath.  This presents
 * problems where if the path cannot be found from the name cache, those operations can
 * erroneously fail with EPERM even though the call should succeed.  When removing or moving
 * file system objects with operations such as unlink or rename, those operations need to
 * take IO counts on the target and containing directory.  Calling vn_getpath_fsenter from a
 * MAC hook during forced unmount operations can therefore lead to deadlock.  This happens
 * when the operation starts and IO counts are taken on the containing
 * directories and targets.  Before the MAC hook is called, a forced unmount from another
 * thread takes place and blocks on the ongoing operation's directory vnode in vdrain.
 * After that, the MAC hook gets called and calls vn_getpath_fsenter.  vn_getpath_fsenter
 * is called with the understanding that there is an IO count on the target.  If in
 * build_path the directory vnode is no longer in the cache, then the parent object id is
 * obtained via vnode_getattr from the target and used to call VFS_VGET to get the parent
 * vnode.  The file system's VFS_VGET then looks up by inode in its hash and tries to get
 * an IO count.  But VFS_VGET "sees" the directory vnode is in vdrain and can block,
 * depending on which version and how it calls the vnode_get family of interfaces.
 *
 * N.B.  A reasonable interface to use is vnode_getwithvid.  This interface was modified to
 * call vnode_getiocount with VNODE_DRAINO, so it will happily get an IO count and not
 * cause issues, but there is no guarantee that all or any file systems are doing that.
 *
 * vn_getpath_fsenter_with_parent can enter the file system safely since there is a known
 * IO count on the directory vnode, by calling build_path_with_parent.
 */

int
vn_getpath_fsenter_with_parent(struct vnode *dvp, struct vnode *vp, char *pathbuf, int *len)
{
    return build_path_with_parent(vp, dvp, pathbuf, *len, len, 0, vfs_context_current());
}
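
/*
 * Editor's illustration (not part of the original source): a typical
 * in-kernel caller of vn_getpath() allocates a MAXPATHLEN buffer and
 * passes the buffer size in/out through 'len'; on success, 'len' is
 * updated by build_path.  The helper name log_vnode_path() is
 * hypothetical.
 */
#if 0 /* example only */
static void
log_vnode_path(vnode_t vp)
{
    char *path;
    int len = MAXPATHLEN;

    MALLOC_ZONE(path, char *, MAXPATHLEN, M_NAMEI, M_WAITOK);
    if (path == NULL) {
        return;
    }
    if (vn_getpath(vp, path, &len) == 0) {
        printf("vnode %p -> %s\n", vp, path);
    }
    FREE_ZONE(path, MAXPATHLEN, M_NAMEI);
}
#endif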

int
vn_getcdhash(struct vnode *vp, off_t offset, unsigned char *cdhash)
{
    return ubc_cs_getcdhash(vp, offset, cdhash);
}


static char *extension_table = NULL;
static int nexts;
static int max_ext_width;

static int
extension_cmp(const void *a, const void *b)
{
    return strlen((const char *)a) - strlen((const char *)b);
}


//
// This is the API LaunchServices uses to inform the kernel
// of the list of package extensions to ignore.
//
// Internally we keep the list sorted by the length of the
// extension (shortest to longest, per extension_cmp).  We sort
// the list of extensions so that we can speed up our searches
// when comparing file names -- we only compare extensions
// that could possibly fit into the file name, not all of
// them (i.e. a short 8 character name can't have an 8
// character extension).
//
extern lck_mtx_t *pkg_extensions_lck;

__private_extern__ int
set_package_extensions_table(user_addr_t data, int nentries, int maxwidth)
{
    char *new_exts, *old_exts;
    int error;

    if (nentries <= 0 || nentries > 1024 || maxwidth <= 0 || maxwidth > 255) {
        return EINVAL;
    }


    // allocate one byte extra so we can guarantee null termination
    MALLOC(new_exts, char *, (nentries * maxwidth) + 1, M_TEMP, M_WAITOK);
    if (new_exts == NULL) {
        return ENOMEM;
    }

    error = copyin(data, new_exts, nentries * maxwidth);
    if (error) {
        FREE(new_exts, M_TEMP);
        return error;
    }

    new_exts[(nentries * maxwidth)] = '\0';   // guarantee null termination of the block

    qsort(new_exts, nentries, maxwidth, extension_cmp);

    lck_mtx_lock(pkg_extensions_lck);

    old_exts = extension_table;
    extension_table = new_exts;
    nexts = nentries;
    max_ext_width = maxwidth;

    lck_mtx_unlock(pkg_extensions_lck);

    if (old_exts) {
        FREE(old_exts, M_TEMP);
    }

    return 0;
}
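
/*
 * Editor's illustration (not part of the original source): the table
 * handed in through 'data' is a flat, fixed-width array -- nentries
 * rows of maxwidth bytes, each row a NUL-padded extension string.
 * For example, with maxwidth = 8 the extensions "app" and "bundle"
 * occupy one 8-byte row each, as sketched below; the array name is
 * hypothetical.
 */
#if 0 /* example only */
static const char example_ext_table[2][8] = {
    "app",      /* row 0: "app" followed by five NUL pad bytes */
    "bundle",   /* row 1: "bundle" followed by two NUL pad bytes */
};
/* qsort(example_ext_table, 2, 8, extension_cmp) orders rows by strlen() */
#endif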


int
is_package_name(const char *name, int len)
{
    int i, extlen;
    const char *ptr, *name_ext;

    if (len <= 3) {
        return 0;
    }

    name_ext = NULL;
    for (ptr = name; *ptr != '\0'; ptr++) {
        if (*ptr == '.') {
            name_ext = ptr;
        }
    }

    // if there is no "." extension, it can't match
    if (name_ext == NULL) {
        return 0;
    }

    // advance over the "."
    name_ext++;

    lck_mtx_lock(pkg_extensions_lck);

    // now iterate over all the extensions to see if any match
    ptr = &extension_table[0];
    for (i = 0; i < nexts; i++, ptr += max_ext_width) {
        extlen = strlen(ptr);
        if (strncasecmp(name_ext, ptr, extlen) == 0 && name_ext[extlen] == '\0') {
            // aha, a match!
            lck_mtx_unlock(pkg_extensions_lck);
            return 1;
        }
    }

    lck_mtx_unlock(pkg_extensions_lck);

    // if we get here, no extension matched
    return 0;
}

int
vn_path_package_check(__unused vnode_t vp, char *path, int pathlen, int *component)
{
    char *ptr, *end;
    int comp = 0;

    *component = -1;
    if (*path != '/') {
        return EINVAL;
    }

    end = path + 1;
    while (end < path + pathlen && *end != '\0') {
        while (end < path + pathlen && *end == '/' && *end != '\0') {
            end++;
        }

        ptr = end;

        while (end < path + pathlen && *end != '/' && *end != '\0') {
            end++;
        }

        if (end > path + pathlen) {
            // hmm, string wasn't null terminated
            return EINVAL;
        }

        *end = '\0';
        if (is_package_name(ptr, end - ptr)) {
            *component = comp;
            break;
        }

        end++;
        comp++;
    }

    return 0;
}
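
/*
 * Editor's illustration (not part of the original source): component
 * indices count from 0, so for "/Users/me/Demo.app/Contents" the first
 * package boundary is component 2 ("Demo.app"), assuming "app" is in
 * the extension table.  Note the routine writes NULs into 'path', so a
 * writable copy must be passed; the function name is hypothetical.
 */
#if 0 /* example only */
static void
package_check_example(void)
{
    char pathbuf[] = "/Users/me/Demo.app/Contents";
    int component = -1;

    /* "Users" = 0, "me" = 1, "Demo.app" = 2 */
    if (vn_path_package_check(NULLVP, pathbuf, sizeof(pathbuf), &component) == 0 &&
        component != -1) {
        printf("first package component: %d\n", component);     /* prints 2 */
    }
}
#endif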

/*
 * Determine if a name is inappropriate for a searchfs query.
 * This list consists of /System currently.
 */

int
vn_searchfs_inappropriate_name(const char *name, int len)
{
    const char *bad_names[] = { "System" };
    int bad_len[] = { 6 };
    int i;

    for (i = 0; i < (int) (sizeof(bad_names) / sizeof(bad_names[0])); i++) {
        if (len == bad_len[i] && strncmp(name, bad_names[i], strlen(bad_names[i]) + 1) == 0) {
            return 1;
        }
    }

    // if we get here, no name matched
    return 0;
}

/*
 * Top level filesystem related information gathering.
 */
extern unsigned int vfs_nummntops;

/*
 * The VFS_NUMMNTOPS shouldn't be at name[1] since
 * it is a VFS generic variable.  Since we no longer support
 * VT_UFS, we reserve its value to support this sysctl node.
 *
 * It should have been:
 *    name[0]:  VFS_GENERIC
 *    name[1]:  VFS_NUMMNTOPS
 */
SYSCTL_INT(_vfs, VFS_NUMMNTOPS, nummntops,
    CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED,
    &vfs_nummntops, 0, "");

int
vfs_sysctl(int *name __unused, u_int namelen __unused,
    user_addr_t oldp __unused, size_t *oldlenp __unused,
    user_addr_t newp __unused, size_t newlen __unused, proc_t p __unused);

int
vfs_sysctl(int *name __unused, u_int namelen __unused,
    user_addr_t oldp __unused, size_t *oldlenp __unused,
    user_addr_t newp __unused, size_t newlen __unused, proc_t p __unused)
{
    return EINVAL;
}


//
// The following code disallows specific sysctls that came through
// the direct sysctl interface (vfs_sysctl_node) instead of the newer
// sysctl_vfs_ctlbyfsid() interface.  We cannot allow these selectors
// through vfs_sysctl_node() because it passes the user's oldp pointer
// directly to the file system, which (for these selectors) casts it
// back to a struct sysctl_req and then proceeds to use SYSCTL_IN(),
// which jumps through an arbitrary function pointer.  When called
// through the sysctl_vfs_ctlbyfsid() interface this does not happen
// and so it's safe.
//
// Unfortunately we have to pull in definitions from AFP and SMB and
// perform explicit name checks on the file system to determine if
// these selectors are being used.
//

#define AFPFS_VFS_CTL_GETID            0x00020001
#define AFPFS_VFS_CTL_NETCHANGE        0x00020002
#define AFPFS_VFS_CTL_VOLCHANGE        0x00020003

#define SMBFS_SYSCTL_REMOUNT           1
#define SMBFS_SYSCTL_REMOUNT_INFO      2
#define SMBFS_SYSCTL_GET_SERVER_SHARE  3


static int
is_bad_sysctl_name(struct vfstable *vfsp, int selector_name)
{
    switch (selector_name) {
    case VFS_CTL_QUERY:
    case VFS_CTL_TIMEO:
    case VFS_CTL_NOLOCKS:
    case VFS_CTL_NSTATUS:
    case VFS_CTL_SADDR:
    case VFS_CTL_DISC:
    case VFS_CTL_SERVERINFO:
        return 1;

    default:
        break;
    }

    // the more complicated check for some of SMB's special values
    if (strcmp(vfsp->vfc_name, "smbfs") == 0) {
        switch (selector_name) {
        case SMBFS_SYSCTL_REMOUNT:
        case SMBFS_SYSCTL_REMOUNT_INFO:
        case SMBFS_SYSCTL_GET_SERVER_SHARE:
            return 1;
        }
    } else if (strcmp(vfsp->vfc_name, "afpfs") == 0) {
        switch (selector_name) {
        case AFPFS_VFS_CTL_GETID:
        case AFPFS_VFS_CTL_NETCHANGE:
        case AFPFS_VFS_CTL_VOLCHANGE:
            return 1;
        }
    }

    //
    // If we get here we passed all the checks so the selector is ok
    //
    return 0;
}


int vfs_sysctl_node SYSCTL_HANDLER_ARGS
{
    int *name, namelen;
    struct vfstable *vfsp;
    int error;
    int fstypenum;

    fstypenum = oidp->oid_number;
    name = arg1;
    namelen = arg2;

    /* all sysctl names at this level should have at least one name slot for the FS */
    if (namelen < 1) {
        return EISDIR; /* overloaded */
    }
    mount_list_lock();
    for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
        if (vfsp->vfc_typenum == fstypenum) {
            vfsp->vfc_refcount++;
            break;
        }
    }
    mount_list_unlock();

    if (vfsp == NULL) {
        return ENOTSUP;
    }

    if (is_bad_sysctl_name(vfsp, name[0])) {
        printf("vfs: bad selector 0x%.8x for old-style sysctl().  use the sysctl-by-fsid interface instead\n", name[0]);
        return EPERM;
    }

    error = (vfsp->vfc_vfsops->vfs_sysctl)(name, namelen, req->oldptr, &req->oldlen, req->newptr, req->newlen, vfs_context_current());

    mount_list_lock();
    vfsp->vfc_refcount--;
    mount_list_unlock();

    return error;
}

/*
 * Check to see if a filesystem is mounted on a block device.
 */
int
vfs_mountedon(struct vnode *vp)
{
    struct vnode *vq;
    int error = 0;

    SPECHASH_LOCK();
    if (vp->v_specflags & SI_MOUNTEDON) {
        error = EBUSY;
        goto out;
    }
    if (vp->v_specflags & SI_ALIASED) {
        for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
            if (vq->v_rdev != vp->v_rdev ||
                vq->v_type != vp->v_type) {
                continue;
            }
            if (vq->v_specflags & SI_MOUNTEDON) {
                error = EBUSY;
                break;
            }
        }
    }
out:
    SPECHASH_UNLOCK();
    return error;
}

struct unmount_info {
    int u_errs;     // Total failed unmounts
    int u_busy;     // EBUSY failed unmounts
};

static int
unmount_callback(mount_t mp, void *arg)
{
    int error;
    char *mntname;
    struct unmount_info *uip = arg;

    mount_ref(mp, 0);
    mount_iterdrop(mp);     // avoid vfs_iterate deadlock in dounmount()

    MALLOC_ZONE(mntname, void *, MAXPATHLEN, M_NAMEI, M_WAITOK);
    if (mntname) {
        strlcpy(mntname, mp->mnt_vfsstat.f_mntonname, MAXPATHLEN);
    }

    error = dounmount(mp, MNT_FORCE, 1, vfs_context_current());
    if (error) {
        uip->u_errs++;
        printf("Unmount of %s failed (%d)\n", mntname ? mntname : "?", error);
        if (error == EBUSY) {
            uip->u_busy++;
        }
    }
    if (mntname) {
        FREE_ZONE(mntname, MAXPATHLEN, M_NAMEI);
    }

    return VFS_RETURNED;
}

/*
 * Unmount all filesystems.  The list is traversed in reverse order
 * of mounting to avoid dependencies.
 * Busy mounts are retried.
 */
__private_extern__ void
vfs_unmountall(void)
{
    int mounts, sec = 1;
    struct unmount_info ui;

retry:
    ui.u_errs = ui.u_busy = 0;
    vfs_iterate(VFS_ITERATE_CB_DROPREF | VFS_ITERATE_TAIL_FIRST, unmount_callback, &ui);
    mounts = mount_getvfscnt();
    if (mounts == 0) {
        return;
    }

    if (ui.u_busy > 0) {            // Busy mounts - wait & retry
        tsleep(&nummounts, PVFS, "busy mount", sec * hz);
        sec *= 2;
        if (sec <= 32) {
            goto retry;
        }
        printf("Unmounting timed out\n");
    } else if (ui.u_errs < mounts) {
        // If vfs_iterate missed mounts in progress - wait a bit
        tsleep(&nummounts, PVFS, "missed mount", 2 * hz);
    }
}
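
/*
 * Editor's note on the retry cadence above (not in the original
 * source): the busy-mount path sleeps sec*hz ticks with sec doubling
 * on each pass, i.e. waits of 1, 2, 4, 8, 16 and 32 seconds, so a
 * persistently busy mount is retried for roughly 63 seconds in total
 * before "Unmounting timed out" is printed.
 */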

/*
 * This routine is called from vnode_pager_deallocate out of the VM.
 * The path to vnode_pager_deallocate can only be initiated by ubc_destroy_named
 * on a vnode that has a UBCINFO.
 */
__private_extern__ void
vnode_pager_vrele(vnode_t vp)
{
    struct ubc_info *uip;

    vnode_lock_spin(vp);

    vp->v_lflag &= ~VNAMED_UBC;
    if (vp->v_usecount != 0) {
        /*
         * At the eleventh hour, just before the ubcinfo is
         * destroyed, ensure the ubc-specific v_usecount
         * reference has gone.  We use v_usecount != 0 as a hint;
         * ubc_unmap() does nothing if there's no mapping.
         *
         * This case is caused by coming here via forced unmount,
         * versus the usual vm_object_deallocate() path.
         * In the forced unmount case, ubc_destroy_named()
         * releases the pager before memory_object_last_unmap()
         * can be called.
         */
        vnode_unlock(vp);
        ubc_unmap(vp);
        vnode_lock_spin(vp);
    }

    uip = vp->v_ubcinfo;
    vp->v_ubcinfo = UBC_INFO_NULL;

    vnode_unlock(vp);

    ubc_info_deallocate(uip);
}


#include <sys/disk.h>

u_int32_t rootunit = (u_int32_t)-1;

#if CONFIG_IOSCHED
extern int lowpri_throttle_enabled;
extern int iosched_enabled;
#endif

errno_t
vfs_init_io_attributes(vnode_t devvp, mount_t mp)
{
    int error;
    off_t readblockcnt = 0;
    off_t writeblockcnt = 0;
    off_t readmaxcnt = 0;
    off_t writemaxcnt = 0;
    off_t readsegcnt = 0;
    off_t writesegcnt = 0;
    off_t readsegsize = 0;
    off_t writesegsize = 0;
    off_t alignment = 0;
    u_int32_t minsaturationbytecount = 0;
    u_int32_t ioqueue_depth = 0;
    u_int32_t blksize;
    u_int64_t temp;
    u_int32_t features;
    vfs_context_t ctx = vfs_context_current();
    dk_corestorage_info_t cs_info;
    boolean_t cs_present = FALSE;
    int isssd = 0;
    int isvirtual = 0;


    VNOP_IOCTL(devvp, DKIOCGETTHROTTLEMASK, (caddr_t)&mp->mnt_throttle_mask, 0, NULL);
    /*
     * as a reasonable approximation, only use the lowest bit of the mask
     * to generate a disk unit number
     */
    mp->mnt_devbsdunit = num_trailing_0(mp->mnt_throttle_mask);

    if (devvp == rootvp) {
        rootunit = mp->mnt_devbsdunit;
    }

    if (mp->mnt_devbsdunit == rootunit) {
        /*
         * this mount point exists on the same device as the root
         * partition, so it comes under the hard throttle control...
         * this is true even for the root mount point itself
         */
        mp->mnt_kern_flag |= MNTK_ROOTDEV;
    }
    /*
     * force the spec device to re-cache
     * the underlying block size in case
     * the filesystem overrode the initial value
     */
    set_fsblocksize(devvp);


    if ((error = VNOP_IOCTL(devvp, DKIOCGETBLOCKSIZE,
        (caddr_t)&blksize, 0, ctx))) {
        return error;
    }

    mp->mnt_devblocksize = blksize;

    /*
     * set the maximum possible I/O size
     * this may get clipped to a smaller value
     * based on which constraints are being advertised
     * and if those advertised constraints result in a smaller
     * limit for a given I/O
     */
    mp->mnt_maxreadcnt = MAX_UPL_SIZE_BYTES;
    mp->mnt_maxwritecnt = MAX_UPL_SIZE_BYTES;

    if (VNOP_IOCTL(devvp, DKIOCISVIRTUAL, (caddr_t)&isvirtual, 0, ctx) == 0) {
        if (isvirtual) {
            mp->mnt_kern_flag |= MNTK_VIRTUALDEV;
        }
    }
    if (VNOP_IOCTL(devvp, DKIOCISSOLIDSTATE, (caddr_t)&isssd, 0, ctx) == 0) {
        if (isssd) {
            mp->mnt_kern_flag |= MNTK_SSD;
        }
    }
    if ((error = VNOP_IOCTL(devvp, DKIOCGETFEATURES,
        (caddr_t)&features, 0, ctx))) {
        return error;
    }

    if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXBLOCKCOUNTREAD,
        (caddr_t)&readblockcnt, 0, ctx))) {
        return error;
    }

    if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXBLOCKCOUNTWRITE,
        (caddr_t)&writeblockcnt, 0, ctx))) {
        return error;
    }

    if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXBYTECOUNTREAD,
        (caddr_t)&readmaxcnt, 0, ctx))) {
        return error;
    }

    if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXBYTECOUNTWRITE,
        (caddr_t)&writemaxcnt, 0, ctx))) {
        return error;
    }

    if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXSEGMENTCOUNTREAD,
        (caddr_t)&readsegcnt, 0, ctx))) {
        return error;
    }

    if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXSEGMENTCOUNTWRITE,
        (caddr_t)&writesegcnt, 0, ctx))) {
        return error;
    }

    if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXSEGMENTBYTECOUNTREAD,
        (caddr_t)&readsegsize, 0, ctx))) {
        return error;
    }

    if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXSEGMENTBYTECOUNTWRITE,
        (caddr_t)&writesegsize, 0, ctx))) {
        return error;
    }

    if ((error = VNOP_IOCTL(devvp, DKIOCGETMINSEGMENTALIGNMENTBYTECOUNT,
        (caddr_t)&alignment, 0, ctx))) {
        return error;
    }

    if ((error = VNOP_IOCTL(devvp, DKIOCGETCOMMANDPOOLSIZE,
        (caddr_t)&ioqueue_depth, 0, ctx))) {
        return error;
    }

    if (readmaxcnt) {
        mp->mnt_maxreadcnt = (readmaxcnt > UINT32_MAX) ? UINT32_MAX : readmaxcnt;
    }

    if (readblockcnt) {
        temp = readblockcnt * blksize;
        temp = (temp > UINT32_MAX) ? UINT32_MAX : temp;

        if (temp < mp->mnt_maxreadcnt) {
            mp->mnt_maxreadcnt = (u_int32_t)temp;
        }
    }

    if (writemaxcnt) {
        mp->mnt_maxwritecnt = (writemaxcnt > UINT32_MAX) ? UINT32_MAX : writemaxcnt;
    }

    if (writeblockcnt) {
        temp = writeblockcnt * blksize;
        temp = (temp > UINT32_MAX) ? UINT32_MAX : temp;

        if (temp < mp->mnt_maxwritecnt) {
            mp->mnt_maxwritecnt = (u_int32_t)temp;
        }
    }

    if (readsegcnt) {
        temp = (readsegcnt > UINT16_MAX) ? UINT16_MAX : readsegcnt;
    } else {
        temp = mp->mnt_maxreadcnt / PAGE_SIZE;

        if (temp > UINT16_MAX) {
            temp = UINT16_MAX;
        }
    }
    mp->mnt_segreadcnt = (u_int16_t)temp;

    if (writesegcnt) {
        temp = (writesegcnt > UINT16_MAX) ? UINT16_MAX : writesegcnt;
    } else {
        temp = mp->mnt_maxwritecnt / PAGE_SIZE;

        if (temp > UINT16_MAX) {
            temp = UINT16_MAX;
        }
    }
    mp->mnt_segwritecnt = (u_int16_t)temp;

    if (readsegsize) {
        temp = (readsegsize > UINT32_MAX) ? UINT32_MAX : readsegsize;
    } else {
        temp = mp->mnt_maxreadcnt;
    }
    mp->mnt_maxsegreadsize = (u_int32_t)temp;

    if (writesegsize) {
        temp = (writesegsize > UINT32_MAX) ? UINT32_MAX : writesegsize;
    } else {
        temp = mp->mnt_maxwritecnt;
    }
    mp->mnt_maxsegwritesize = (u_int32_t)temp;

    if (alignment) {
        temp = (alignment > PAGE_SIZE) ? PAGE_MASK : alignment - 1;
    } else {
        temp = 0;
    }
    mp->mnt_alignmentmask = temp;


    if (ioqueue_depth > MNT_DEFAULT_IOQUEUE_DEPTH) {
        temp = ioqueue_depth;
    } else {
        temp = MNT_DEFAULT_IOQUEUE_DEPTH;
    }

    mp->mnt_ioqueue_depth = temp;
    mp->mnt_ioscale = MNT_IOSCALE(mp->mnt_ioqueue_depth);

    if (mp->mnt_ioscale > 1) {
        printf("ioqueue_depth = %d, ioscale = %d\n", (int)mp->mnt_ioqueue_depth, (int)mp->mnt_ioscale);
    }

    if (features & DK_FEATURE_FORCE_UNIT_ACCESS) {
        mp->mnt_ioflags |= MNT_IOFLAGS_FUA_SUPPORTED;
    }

    if (VNOP_IOCTL(devvp, DKIOCGETIOMINSATURATIONBYTECOUNT, (caddr_t)&minsaturationbytecount, 0, ctx) == 0) {
        mp->mnt_minsaturationbytecount = minsaturationbytecount;
    } else {
        mp->mnt_minsaturationbytecount = 0;
    }

    if (VNOP_IOCTL(devvp, DKIOCCORESTORAGE, (caddr_t)&cs_info, 0, ctx) == 0) {
        cs_present = TRUE;
    }

    if (features & DK_FEATURE_UNMAP) {
        mp->mnt_ioflags |= MNT_IOFLAGS_UNMAP_SUPPORTED;

        if (cs_present == TRUE) {
            mp->mnt_ioflags |= MNT_IOFLAGS_CSUNMAP_SUPPORTED;
        }
    }
    if (cs_present == TRUE) {
        /*
         * for now we'll use the following test as a proxy for
         * the underlying drive being FUSION in nature
         */
        if ((cs_info.flags & DK_CORESTORAGE_PIN_YOUR_METADATA)) {
            mp->mnt_ioflags |= MNT_IOFLAGS_FUSION_DRIVE;
        }
    } else {
        /* Check for APFS Fusion */
        dk_apfs_flavour_t flavour;
        if ((VNOP_IOCTL(devvp, DKIOCGETAPFSFLAVOUR, (caddr_t)&flavour, 0, ctx) == 0) &&
            (flavour == DK_APFS_FUSION)) {
            mp->mnt_ioflags |= MNT_IOFLAGS_FUSION_DRIVE;
        }
    }

#if CONFIG_IOSCHED
    if (iosched_enabled && (features & DK_FEATURE_PRIORITY)) {
        mp->mnt_ioflags |= MNT_IOFLAGS_IOSCHED_SUPPORTED;
        throttle_info_disable_throttle(mp->mnt_devbsdunit, (mp->mnt_ioflags & MNT_IOFLAGS_FUSION_DRIVE) != 0);
    }
#endif /* CONFIG_IOSCHED */
    return error;
}
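
/*
 * Editor's worked example (not part of the original source): with a
 * 4 KB device block size and DKIOCGETMAXBLOCKCOUNTREAD reporting 2048,
 * readblockcnt * blksize comes to 8 MB.  mnt_maxreadcnt starts at
 * MAX_UPL_SIZE_BYTES and is only ever clipped downward, so the final
 * read limit is the smallest of MAX_UPL_SIZE_BYTES, the byte-count
 * limit from DKIOCGETMAXBYTECOUNTREAD, and that 8 MB figure; the
 * write limit and the segment counts/sizes are derived the same way.
 */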

static struct klist fs_klist;
lck_grp_t *fs_klist_lck_grp;
lck_mtx_t *fs_klist_lock;

void
vfs_event_init(void)
{
    klist_init(&fs_klist);
    fs_klist_lck_grp = lck_grp_alloc_init("fs_klist", NULL);
    fs_klist_lock = lck_mtx_alloc_init(fs_klist_lck_grp, NULL);
}

void
vfs_event_signal(fsid_t *fsid, u_int32_t event, intptr_t data)
{
    if (event == VQ_DEAD || event == VQ_NOTRESP) {
        struct mount *mp = vfs_getvfs(fsid);
        if (mp) {
            mount_lock_spin(mp);
            if (data) {
                mp->mnt_kern_flag &= ~MNT_LNOTRESP;     // Now responding
            } else {
                mp->mnt_kern_flag |= MNT_LNOTRESP;      // Not responding
            }
            mount_unlock(mp);
        }
    }

    lck_mtx_lock(fs_klist_lock);
    KNOTE(&fs_klist, event);
    lck_mtx_unlock(fs_klist_lock);
}

/*
 * return the number of mounted filesystems.
 */
static int
sysctl_vfs_getvfscnt(void)
{
    return mount_getvfscnt();
}


static int
mount_getvfscnt(void)
{
    int ret;

    mount_list_lock();
    ret = nummounts;
    mount_list_unlock();
    return ret;
}



static int
mount_fillfsids(fsid_t *fsidlst, int count)
{
    struct mount *mp;
    int actual = 0;

    actual = 0;
    mount_list_lock();
    TAILQ_FOREACH(mp, &mountlist, mnt_list) {
        if (actual <= count) {
            fsidlst[actual] = mp->mnt_vfsstat.f_fsid;
            actual++;
        }
    }
    mount_list_unlock();
    return actual;
}
3582
3583/*
3584 * fill in the array of fsid_t's up to a max of 'count', the actual
3585 * number filled in will be set in '*actual'. If there are more fsid_t's
3586 * than room in fsidlst then ENOMEM will be returned and '*actual' will
3587 * have the actual count.
 3588 * Callers depend on '*actual' being filled out even in the error case.
3589 */
3590static int
3591sysctl_vfs_getvfslist(fsid_t *fsidlst, int count, int *actual)
3592{
3593 struct mount *mp;
3594
3595 *actual = 0;
91447636
A
3596 mount_list_lock();
3597 TAILQ_FOREACH(mp, &mountlist, mnt_list) {
55e303ae 3598 (*actual)++;
0a7de745 3599 if (*actual <= count) {
91447636 3600 fsidlst[(*actual) - 1] = mp->mnt_vfsstat.f_fsid;
0a7de745 3601 }
55e303ae 3602 }
91447636 3603 mount_list_unlock();
0a7de745 3604 return *actual <= count ? 0 : ENOMEM;
55e303ae
A
3605}
3606
3607static int
2d21ac55 3608sysctl_vfs_vfslist(__unused struct sysctl_oid *oidp, __unused void *arg1,
0a7de745 3609 __unused int arg2, struct sysctl_req *req)
55e303ae
A
3610{
3611 int actual, error;
3612 size_t space;
3613 fsid_t *fsidlst;
3614
3615 /* This is a readonly node. */
0a7de745
A
3616 if (req->newptr != USER_ADDR_NULL) {
3617 return EPERM;
3618 }
55e303ae
A
3619
3620 /* they are querying us so just return the space required. */
91447636 3621 if (req->oldptr == USER_ADDR_NULL) {
55e303ae
A
3622 req->oldidx = sysctl_vfs_getvfscnt() * sizeof(fsid_t);
3623 return 0;
3624 }
3625again:
3626 /*
3627 * Retrieve an accurate count of the amount of space required to copy
3628 * out all the fsids in the system.
3629 */
3630 space = req->oldlen;
3631 req->oldlen = sysctl_vfs_getvfscnt() * sizeof(fsid_t);
3632
3633 /* they didn't give us enough space. */
0a7de745
A
3634 if (space < req->oldlen) {
3635 return ENOMEM;
3636 }
55e303ae 3637
527f9951 3638 MALLOC(fsidlst, fsid_t *, req->oldlen, M_TEMP, M_WAITOK | M_ZERO);
b0d623f7 3639 if (fsidlst == NULL) {
0a7de745 3640 return ENOMEM;
b0d623f7
A
3641 }
3642
55e303ae
A
3643 error = sysctl_vfs_getvfslist(fsidlst, req->oldlen / sizeof(fsid_t),
3644 &actual);
3645 /*
3646 * If we get back ENOMEM, then another mount has been added while we
3647 * slept in malloc above. If this is the case then try again.
3648 */
3649 if (error == ENOMEM) {
3650 FREE(fsidlst, M_TEMP);
3651 req->oldlen = space;
3652 goto again;
3653 }
3654 if (error == 0) {
3655 error = SYSCTL_OUT(req, fsidlst, actual * sizeof(fsid_t));
3656 }
3657 FREE(fsidlst, M_TEMP);
0a7de745 3658 return error;
55e303ae
A
3659}
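/*
 * User-space view of the handler above: size the buffer with a NULL oldp,
 * then fetch, growing and retrying if a mount races in. A sketch, assuming
 * only that the node is reachable via sysctlbyname(3):
 */
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/sysctl.h>
#include <sys/mount.h>

int
main(void)
{
	size_t len = 0;
	fsid_t *list = NULL;

	/* NULL oldp mirrors the req->oldptr == USER_ADDR_NULL path above */
	if (sysctlbyname("vfs.generic.vfsidlist", NULL, &len, NULL, 0) == -1) {
		return 1;
	}
	for (;;) {
		if ((list = realloc(list, len)) == NULL) {
			return 1;
		}
		if (sysctlbyname("vfs.generic.vfsidlist", list, &len, NULL, 0) == 0) {
			break;
		}
		len += sizeof(fsid_t);  /* a mount raced in; grow and retry */
	}
	printf("%zu mounted filesystems\n", len / sizeof(fsid_t));
	free(list);
	return 0;
}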
3660
3661/*
3662 * Do a sysctl by fsid.
3663 */
3664static int
2d21ac55 3665sysctl_vfs_ctlbyfsid(__unused struct sysctl_oid *oidp, void *arg1, int arg2,
0a7de745 3666 struct sysctl_req *req)
55e303ae 3667{
b0d623f7 3668 union union_vfsidctl vc;
55e303ae 3669 struct mount *mp;
91447636 3670 struct vfsstatfs *sp;
2d21ac55 3671 int *name, flags, namelen;
0a7de745 3672 int error = 0, gotref = 0;
2d21ac55 3673 vfs_context_t ctx = vfs_context_current();
0a7de745 3674 proc_t p = req->p; /* XXX req->p != current_proc()? */
91447636 3675 boolean_t is_64_bit;
55e303ae
A
3676
3677 name = arg1;
3678 namelen = arg2;
91447636 3679 is_64_bit = proc_is64bit(p);
55e303ae 3680
b0d623f7 3681 error = SYSCTL_IN(req, &vc, is_64_bit? sizeof(vc.vc64):sizeof(vc.vc32));
0a7de745 3682 if (error) {
b0d623f7 3683 goto out;
0a7de745 3684 }
b0d623f7
A
3685 if (vc.vc32.vc_vers != VFS_CTL_VERS1) { /* works for 32 and 64 */
3686 error = EINVAL;
3687 goto out;
2d21ac55 3688 }
b0d623f7 3689 mp = mount_list_lookupby_fsid(&vc.vc32.vc_fsid, 0, 1); /* works for 32 and 64 */
2d21ac55
A
3690 if (mp == NULL) {
3691 error = ENOENT;
3692 goto out;
91447636 3693 }
2d21ac55 3694 gotref = 1;
55e303ae
A
3695 /* reset so that the fs specific code can fetch it. */
3696 req->newidx = 0;
3697 /*
3698 * Note if this is a VFS_CTL then we pass the actual sysctl req
3699 * in for "oldp" so that the lower layer can DTRT and use the
3700 * SYSCTL_IN/OUT routines.
3701 */
3702 if (mp->mnt_op->vfs_sysctl != NULL) {
91447636
A
3703 if (is_64_bit) {
3704 if (vfs_64bitready(mp)) {
3705 error = mp->mnt_op->vfs_sysctl(name, namelen,
3706 CAST_USER_ADDR_T(req),
0a7de745 3707 NULL, USER_ADDR_NULL, 0,
2d21ac55 3708 ctx);
0a7de745 3709 } else {
91447636
A
3710 error = ENOTSUP;
3711 }
0a7de745 3712 } else {
91447636
A
3713 error = mp->mnt_op->vfs_sysctl(name, namelen,
3714 CAST_USER_ADDR_T(req),
0a7de745 3715 NULL, USER_ADDR_NULL, 0,
2d21ac55
A
3716 ctx);
3717 }
3718 if (error != ENOTSUP) {
3719 goto out;
91447636 3720 }
55e303ae
A
3721 }
3722 switch (name[0]) {
3723 case VFS_CTL_UMOUNT:
91447636
A
3724 req->newidx = 0;
3725 if (is_64_bit) {
b0d623f7
A
3726 req->newptr = vc.vc64.vc_ptr;
3727 req->newlen = (size_t)vc.vc64.vc_len;
0a7de745 3728 } else {
b0d623f7
A
3729 req->newptr = CAST_USER_ADDR_T(vc.vc32.vc_ptr);
3730 req->newlen = vc.vc32.vc_len;
91447636 3731 }
55e303ae 3732 error = SYSCTL_IN(req, &flags, sizeof(flags));
0a7de745 3733 if (error) {
55e303ae 3734 break;
0a7de745 3735 }
2d21ac55 3736
6601e61a 3737 mount_ref(mp, 0);
2d21ac55
A
3738 mount_iterdrop(mp);
3739 gotref = 0;
6601e61a 3740 /* safedounmount consumes a ref */
2d21ac55 3741 error = safedounmount(mp, flags, ctx);
55e303ae
A
3742 break;
3743 case VFS_CTL_STATFS:
91447636
A
3744 req->newidx = 0;
3745 if (is_64_bit) {
b0d623f7
A
3746 req->newptr = vc.vc64.vc_ptr;
3747 req->newlen = (size_t)vc.vc64.vc_len;
0a7de745 3748 } else {
b0d623f7
A
3749 req->newptr = CAST_USER_ADDR_T(vc.vc32.vc_ptr);
3750 req->newlen = vc.vc32.vc_len;
91447636 3751 }
55e303ae 3752 error = SYSCTL_IN(req, &flags, sizeof(flags));
0a7de745 3753 if (error) {
55e303ae 3754 break;
0a7de745 3755 }
91447636 3756 sp = &mp->mnt_vfsstat;
b0d623f7 3757 if (((flags & MNT_NOWAIT) == 0 || (flags & (MNT_WAIT | MNT_DWAIT))) &&
0a7de745 3758 (error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT))) {
2d21ac55 3759 goto out;
0a7de745 3760 }
91447636 3761 if (is_64_bit) {
b0d623f7 3762 struct user64_statfs sfs;
91447636
A
3763 bzero(&sfs, sizeof(sfs));
3764 sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
3765 sfs.f_type = mp->mnt_vtable->vfc_typenum;
b0d623f7
A
3766 sfs.f_bsize = (user64_long_t)sp->f_bsize;
3767 sfs.f_iosize = (user64_long_t)sp->f_iosize;
3768 sfs.f_blocks = (user64_long_t)sp->f_blocks;
3769 sfs.f_bfree = (user64_long_t)sp->f_bfree;
3770 sfs.f_bavail = (user64_long_t)sp->f_bavail;
3771 sfs.f_files = (user64_long_t)sp->f_files;
3772 sfs.f_ffree = (user64_long_t)sp->f_ffree;
91447636
A
3773 sfs.f_fsid = sp->f_fsid;
3774 sfs.f_owner = sp->f_owner;
39037602 3775#ifdef NFSCLIENT
6d2010ae 3776 if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
5ba3f43e 3777 strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSNAMELEN);
39037602
A
3778 } else
3779#endif
3780 {
6d2010ae
A
3781 strlcpy(sfs.f_fstypename, sp->f_fstypename, MFSNAMELEN);
3782 }
2d21ac55
A
3783 strlcpy(sfs.f_mntonname, sp->f_mntonname, MNAMELEN);
3784 strlcpy(sfs.f_mntfromname, sp->f_mntfromname, MNAMELEN);
0a7de745 3785
91447636 3786 error = SYSCTL_OUT(req, &sfs, sizeof(sfs));
0a7de745 3787 } else {
b0d623f7
A
3788 struct user32_statfs sfs;
3789 bzero(&sfs, sizeof(sfs));
91447636
A
3790 sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
3791 sfs.f_type = mp->mnt_vtable->vfc_typenum;
3792
3793 /*
 3794 * It's possible for there to be more than 2^31 blocks in the filesystem, so we
3795 * have to fudge the numbers here in that case. We inflate the blocksize in order
3796 * to reflect the filesystem size as best we can.
3797 */
b0d623f7 3798 if (sp->f_blocks > INT_MAX) {
0a7de745 3799 int shift;
91447636
A
3800
3801 /*
3802 * Work out how far we have to shift the block count down to make it fit.
3803 * Note that it's possible to have to shift so far that the resulting
3804 * blocksize would be unreportably large. At that point, we will clip
3805 * any values that don't fit.
3806 *
3807 * For safety's sake, we also ensure that f_iosize is never reported as
3808 * being smaller than f_bsize.
3809 */
3810 for (shift = 0; shift < 32; shift++) {
0a7de745 3811 if ((sp->f_blocks >> shift) <= INT_MAX) {
91447636 3812 break;
0a7de745
A
3813 }
3814 if ((((long long)sp->f_bsize) << (shift + 1)) > INT_MAX) {
91447636 3815 break;
0a7de745 3816 }
91447636 3817 }
0a7de745 3818#define __SHIFT_OR_CLIP(x, s) ((((x) >> (s)) > INT_MAX) ? INT_MAX : ((x) >> (s)))
b0d623f7
A
3819 sfs.f_blocks = (user32_long_t)__SHIFT_OR_CLIP(sp->f_blocks, shift);
3820 sfs.f_bfree = (user32_long_t)__SHIFT_OR_CLIP(sp->f_bfree, shift);
3821 sfs.f_bavail = (user32_long_t)__SHIFT_OR_CLIP(sp->f_bavail, shift);
91447636 3822#undef __SHIFT_OR_CLIP
b0d623f7 3823 sfs.f_bsize = (user32_long_t)(sp->f_bsize << shift);
91447636
A
3824 sfs.f_iosize = lmax(sp->f_iosize, sp->f_bsize);
3825 } else {
b0d623f7
A
3826 sfs.f_bsize = (user32_long_t)sp->f_bsize;
3827 sfs.f_iosize = (user32_long_t)sp->f_iosize;
3828 sfs.f_blocks = (user32_long_t)sp->f_blocks;
3829 sfs.f_bfree = (user32_long_t)sp->f_bfree;
3830 sfs.f_bavail = (user32_long_t)sp->f_bavail;
91447636 3831 }
b0d623f7
A
3832 sfs.f_files = (user32_long_t)sp->f_files;
3833 sfs.f_ffree = (user32_long_t)sp->f_ffree;
91447636
A
3834 sfs.f_fsid = sp->f_fsid;
3835 sfs.f_owner = sp->f_owner;
39037602 3836
5ba3f43e 3837#ifdef NFSCLIENT
6d2010ae 3838 if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
5ba3f43e 3839 strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSNAMELEN);
39037602
A
3840 } else
3841#endif
3842 {
6d2010ae
A
3843 strlcpy(sfs.f_fstypename, sp->f_fstypename, MFSNAMELEN);
3844 }
2d21ac55
A
3845 strlcpy(sfs.f_mntonname, sp->f_mntonname, MNAMELEN);
3846 strlcpy(sfs.f_mntfromname, sp->f_mntfromname, MNAMELEN);
0a7de745 3847
91447636
A
3848 error = SYSCTL_OUT(req, &sfs, sizeof(sfs));
3849 }
55e303ae
A
3850 break;
3851 default:
2d21ac55
A
3852 error = ENOTSUP;
3853 goto out;
55e303ae 3854 }
2d21ac55 3855out:
0a7de745 3856 if (gotref != 0) {
2d21ac55 3857 mount_iterdrop(mp);
0a7de745
A
3858 }
3859 return error;
0b4e3aa0
A
3860}
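/*
 * The 32-bit statfs path above scales f_blocks down and f_bsize up by the
 * same power of two so the reported fields still multiply out to the true
 * filesystem size. A standalone restatement of that arithmetic with made-up
 * numbers (a 24 TiB volume with 4 KiB blocks):
 */
#include <limits.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t f_blocks = 6442450944ULL;  /* 24 TiB / 4 KiB */
	long long f_bsize = 4096;
	int shift;

	for (shift = 0; shift < 32; shift++) {
		if ((f_blocks >> shift) <= INT_MAX) {
			break;
		}
		if ((f_bsize << (shift + 1)) > INT_MAX) {
			break;
		}
	}
	/* prints: shift 2, 1610612736 blocks of 16384 bytes == the same 24 TiB */
	printf("shift %d: %llu blocks of %lld bytes\n", shift,
	    (unsigned long long)(f_blocks >> shift), f_bsize << shift);
	return 0;
}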
3861
0a7de745
A
3862static int filt_fsattach(struct knote *kn, struct kevent_internal_s *kev);
3863static void filt_fsdetach(struct knote *kn);
3864static int filt_fsevent(struct knote *kn, long hint);
3865static int filt_fstouch(struct knote *kn, struct kevent_internal_s *kev);
3866static int filt_fsprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev);
5ba3f43e
A
3867SECURITY_READ_ONLY_EARLY(struct filterops) fs_filtops = {
3868 .f_attach = filt_fsattach,
3869 .f_detach = filt_fsdetach,
3870 .f_event = filt_fsevent,
39037602
A
3871 .f_touch = filt_fstouch,
3872 .f_process = filt_fsprocess,
b0d623f7 3873};
55e303ae
A
3874
3875static int
5ba3f43e 3876filt_fsattach(struct knote *kn, __unused struct kevent_internal_s *kev)
55e303ae 3877{
4a3eedf9 3878 lck_mtx_lock(fs_klist_lock);
55e303ae 3879 KNOTE_ATTACH(&fs_klist, kn);
4a3eedf9 3880 lck_mtx_unlock(fs_klist_lock);
39037602 3881
0a7de745
A
3882 /*
3883 * filter only sees future events,
39037602
A
3884 * so it can't be fired already.
3885 */
0a7de745 3886 return 0;
55e303ae
A
3887}
3888
3889static void
3890filt_fsdetach(struct knote *kn)
3891{
4a3eedf9 3892 lck_mtx_lock(fs_klist_lock);
55e303ae 3893 KNOTE_DETACH(&fs_klist, kn);
4a3eedf9 3894 lck_mtx_unlock(fs_klist_lock);
55e303ae
A
3895}
3896
3897static int
3898filt_fsevent(struct knote *kn, long hint)
3899{
2d21ac55
A
3900 /*
3901 * Backwards compatibility:
3902 * Other filters would do nothing if kn->kn_sfflags == 0
3903 */
3904
3905 if ((kn->kn_sfflags == 0) || (kn->kn_sfflags & hint)) {
3906 kn->kn_fflags |= hint;
3907 }
55e303ae 3908
0a7de745 3909 return kn->kn_fflags != 0;
55e303ae
A
3910}
3911
39037602
A
3912static int
3913filt_fstouch(struct knote *kn, struct kevent_internal_s *kev)
3914{
3915 int res;
3916
3917 lck_mtx_lock(fs_klist_lock);
3918
3919 kn->kn_sfflags = kev->fflags;
39037602
A
3920
3921 /*
3922 * the above filter function sets bits even if nobody is looking for them.
 3923 * Just preserve those bits even if the new mask is more selective
3924 * than before.
3925 *
3926 * For compatibility with previous implementations, we leave kn_fflags
3927 * as they were before.
3928 */
3929 //if (kn->kn_sfflags)
3930 // kn->kn_fflags &= kn->kn_sfflags;
3931 res = (kn->kn_fflags != 0);
3932
3933 lck_mtx_unlock(fs_klist_lock);
3934
3935 return res;
3936}
3937
3938static int
3939filt_fsprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev)
3940{
3941#pragma unused(data)
3942 int res;
3943
3944 lck_mtx_lock(fs_klist_lock);
3945 res = (kn->kn_fflags != 0);
3946 if (res) {
3947 *kev = kn->kn_kevent;
3948 kn->kn_flags |= EV_CLEAR; /* automatic */
3949 kn->kn_fflags = 0;
3950 kn->kn_data = 0;
3951 }
3952 lck_mtx_unlock(fs_klist_lock);
3953 return res;
0a7de745 3954}
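/*
 * These filterops back EVFILT_FS. A user-space sketch of attaching to the
 * fs_klist above via kqueue(2) and decoding a couple of the VQ_* bits
 * (error handling trimmed):
 */
#include <stdio.h>
#include <unistd.h>
#include <sys/event.h>
#include <sys/mount.h>

int
main(void)
{
	struct kevent kev;
	int kq = kqueue();

	/* fflags == 0 means "any fs event", per the compatibility note above */
	EV_SET(&kev, 0, EVFILT_FS, EV_ADD, 0, 0, NULL);
	kevent(kq, &kev, 1, NULL, 0, NULL);

	while (kevent(kq, NULL, 0, &kev, 1, NULL) == 1) {
		if (kev.fflags & VQ_MOUNT) {
			printf("mount\n");
		}
		if (kev.fflags & VQ_UNMOUNT) {
			printf("unmount\n");
		}
	}
	close(kq);
	return 0;
}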
39037602 3955
55e303ae 3956static int
2d21ac55 3957sysctl_vfs_noremotehang(__unused struct sysctl_oid *oidp,
0a7de745 3958 __unused void *arg1, __unused int arg2, struct sysctl_req *req)
55e303ae
A
3959{
3960 int out, error;
3961 pid_t pid;
2d21ac55 3962 proc_t p;
55e303ae
A
3963
3964 /* We need a pid. */
0a7de745
A
3965 if (req->newptr == USER_ADDR_NULL) {
3966 return EINVAL;
3967 }
55e303ae
A
3968
3969 error = SYSCTL_IN(req, &pid, sizeof(pid));
0a7de745
A
3970 if (error) {
3971 return error;
3972 }
55e303ae 3973
2d21ac55 3974 p = proc_find(pid < 0 ? -pid : pid);
0a7de745
A
3975 if (p == NULL) {
3976 return ESRCH;
3977 }
55e303ae
A
3978
3979 /*
3980 * Fetching the value is ok, but we only fetch if the old
3981 * pointer is given.
3982 */
91447636 3983 if (req->oldptr != USER_ADDR_NULL) {
55e303ae 3984 out = !((p->p_flag & P_NOREMOTEHANG) == 0);
2d21ac55 3985 proc_rele(p);
55e303ae 3986 error = SYSCTL_OUT(req, &out, sizeof(out));
0a7de745 3987 return error;
55e303ae
A
3988 }
3989
3990 /* cansignal offers us enough security. */
2d21ac55
A
3991 if (p != req->p && proc_suser(req->p) != 0) {
3992 proc_rele(p);
0a7de745 3993 return EPERM;
2d21ac55 3994 }
55e303ae 3995
0a7de745 3996 if (pid < 0) {
b0d623f7 3997 OSBitAndAtomic(~((uint32_t)P_NOREMOTEHANG), &p->p_flag);
0a7de745 3998 } else {
b0d623f7 3999 OSBitOrAtomic(P_NOREMOTEHANG, &p->p_flag);
0a7de745 4000 }
2d21ac55 4001 proc_rele(p);
55e303ae 4002
0a7de745 4003 return 0;
55e303ae 4004}
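/*
 * Sketch of a user-space caller of the handler above: a positive pid sets
 * P_NOREMOTEHANG on the target, a negated pid clears it, and passing a
 * non-NULL oldp instead fetches the current value.
 */
#include <sys/types.h>
#include <sys/sysctl.h>

static int
set_noremotehang(pid_t pid, int enable)
{
	pid_t arg = enable ? pid : -pid;

	return sysctlbyname("vfs.generic.noremotehang", NULL, NULL,
	           &arg, sizeof(arg));
}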
2d21ac55 4005
fe8ab488
A
4006static int
4007sysctl_vfs_generic_conf SYSCTL_HANDLER_ARGS
4008{
4009 int *name, namelen;
4010 struct vfstable *vfsp;
527f9951 4011 struct vfsconf vfsc = {};
0a7de745 4012
fe8ab488
A
4013 (void)oidp;
4014 name = arg1;
4015 namelen = arg2;
0a7de745 4016
fe8ab488 4017 if (namelen < 1) {
0a7de745 4018 return EISDIR;
fe8ab488 4019 } else if (namelen > 1) {
0a7de745 4020 return ENOTDIR;
fe8ab488 4021 }
0a7de745 4022
fe8ab488 4023 mount_list_lock();
0a7de745
A
4024 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
4025 if (vfsp->vfc_typenum == name[0]) {
fe8ab488 4026 break;
0a7de745
A
4027 }
4028 }
4029
fe8ab488
A
4030 if (vfsp == NULL) {
4031 mount_list_unlock();
0a7de745 4032 return ENOTSUP;
fe8ab488 4033 }
0a7de745 4034
fe8ab488
A
4035 vfsc.vfc_reserved1 = 0;
4036 bcopy(vfsp->vfc_name, vfsc.vfc_name, sizeof(vfsc.vfc_name));
4037 vfsc.vfc_typenum = vfsp->vfc_typenum;
4038 vfsc.vfc_refcount = vfsp->vfc_refcount;
4039 vfsc.vfc_flags = vfsp->vfc_flags;
4040 vfsc.vfc_reserved2 = 0;
4041 vfsc.vfc_reserved3 = 0;
0a7de745 4042
fe8ab488 4043 mount_list_unlock();
0a7de745 4044 return SYSCTL_OUT(req, &vfsc, sizeof(struct vfsconf));
fe8ab488
A
4045}
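/*
 * The handler above takes the vfc_typenum as the single trailing name.
 * User-space sketch using the numeric MIB (the typenum shown is just an
 * example value; enumerate the vfsconf entries to find real ones):
 */
#include <stdio.h>
#include <sys/types.h>
#include <sys/sysctl.h>
#include <sys/mount.h>

int
main(void)
{
	int mib[4] = { CTL_VFS, VFS_GENERIC, VFS_CONF, 17 /* example typenum */ };
	struct vfsconf vfsc;
	size_t len = sizeof(vfsc);

	if (sysctl(mib, 4, &vfsc, &len, NULL, 0) == 0) {
		printf("typenum %d is %s (refcount %d)\n",
		    vfsc.vfc_typenum, vfsc.vfc_name, vfsc.vfc_refcount);
	}
	return 0;
}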
4046
55e303ae 4047/* the vfs.generic. branch. */
6d2010ae 4048SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RW | CTLFLAG_LOCKED, NULL, "vfs generic hinge");
55e303ae 4049/* retrieve a list of mounted filesystem fsid_t */
fe8ab488
A
4050SYSCTL_PROC(_vfs_generic, OID_AUTO, vfsidlist,
4051 CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
2d21ac55 4052 NULL, 0, sysctl_vfs_vfslist, "S,fsid", "List of mounted filesystem ids");
55e303ae 4053/* perform operations on filesystem via fsid_t */
6d2010ae 4054SYSCTL_NODE(_vfs_generic, OID_AUTO, ctlbyfsid, CTLFLAG_RW | CTLFLAG_LOCKED,
55e303ae 4055 sysctl_vfs_ctlbyfsid, "ctlbyfsid");
6d2010ae 4056SYSCTL_PROC(_vfs_generic, OID_AUTO, noremotehang, CTLFLAG_RW | CTLFLAG_ANYBODY,
2d21ac55 4057 NULL, 0, sysctl_vfs_noremotehang, "I", "noremotehang");
fe8ab488 4058SYSCTL_INT(_vfs_generic, VFS_MAXTYPENUM, maxtypenum,
0a7de745
A
4059 CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED,
4060 &maxvfstypenum, 0, "");
d9a64523 4061SYSCTL_INT(_vfs_generic, OID_AUTO, sync_timeout, CTLFLAG_RW | CTLFLAG_LOCKED, &sync_timeout_seconds, 0, "");
fe8ab488 4062SYSCTL_NODE(_vfs_generic, VFS_CONF, conf,
0a7de745
A
4063 CTLFLAG_RD | CTLFLAG_LOCKED,
4064 sysctl_vfs_generic_conf, "");
fe8ab488 4065
5ba3f43e
A
4066/* Indicate that the root file system unmounted cleanly */
4067static int vfs_root_unmounted_cleanly = 0;
4068SYSCTL_INT(_vfs_generic, OID_AUTO, root_unmounted_cleanly, CTLFLAG_RD, &vfs_root_unmounted_cleanly, 0, "Root filesystem was unmounted cleanly");
4069
4070void
4071vfs_set_root_unmounted_cleanly(void)
4072{
4073 vfs_root_unmounted_cleanly = 1;
4074}
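/*
 * Sketch of a user-space reader of the node declared above (whether the
 * kernel ever sets it is up to the root filesystem's mount path calling
 * vfs_set_root_unmounted_cleanly()):
 */
#include <sys/sysctl.h>

static int
root_was_unmounted_cleanly(void)
{
	int clean = 0;
	size_t len = sizeof(clean);

	if (sysctlbyname("vfs.generic.root_unmounted_cleanly",
	    &clean, &len, NULL, 0) == -1) {
		return 0;
	}
	return clean;
}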
4075
3e170ce0
A
4076/*
4077 * Print vnode state.
4078 */
4079void
4080vn_print_state(struct vnode *vp, const char *fmt, ...)
4081{
4082 va_list ap;
4083 char perm_str[] = "(VM_KERNEL_ADDRPERM pointer)";
4084 char fs_name[MFSNAMELEN];
4085
4086 va_start(ap, fmt);
4087 vprintf(fmt, ap);
4088 va_end(ap);
4089 printf("vp 0x%0llx %s: ", (uint64_t)VM_KERNEL_ADDRPERM(vp), perm_str);
4090 printf("tag %d, type %d\n", vp->v_tag, vp->v_type);
4091 /* Counts .. */
4092 printf(" iocount %d, usecount %d, kusecount %d references %d\n",
4093 vp->v_iocount, vp->v_usecount, vp->v_kusecount, vp->v_references);
4094 printf(" writecount %d, numoutput %d\n", vp->v_writecount,
4095 vp->v_numoutput);
4096 /* Flags */
4097 printf(" flag 0x%x, lflag 0x%x, listflag 0x%x\n", vp->v_flag,
4098 vp->v_lflag, vp->v_listflag);
4099
4100 if (vp->v_mount == NULL || vp->v_mount == dead_mountp) {
4101 strlcpy(fs_name, "deadfs", MFSNAMELEN);
4102 } else {
4103 vfs_name(vp->v_mount, fs_name);
4104 }
4105
4106 printf(" v_data 0x%0llx %s\n",
4107 (vp->v_data ? (uint64_t)VM_KERNEL_ADDRPERM(vp->v_data) : 0),
4108 perm_str);
4109 printf(" v_mount 0x%0llx %s vfs_name %s\n",
4110 (vp->v_mount ? (uint64_t)VM_KERNEL_ADDRPERM(vp->v_mount) : 0),
4111 perm_str, fs_name);
4112}
4113
b0d623f7 4114long num_reusedvnodes = 0;
55e303ae 4115
316670eb
A
4116
4117static vnode_t
4118process_vp(vnode_t vp, int want_vp, int *deferred)
4119{
4120 unsigned int vpid;
4121
4122 *deferred = 0;
4123
4124 vpid = vp->v_id;
4125
4126 vnode_list_remove_locked(vp);
4127
4128 vnode_list_unlock();
4129
4130 vnode_lock_spin(vp);
4131
0a7de745 4132 /*
316670eb
A
4133 * We could wait for the vnode_lock after removing the vp from the freelist
4134 * and the vid is bumped only at the very end of reclaim. So it is possible
4135 * that we are looking at a vnode that is being terminated. If so skip it.
0a7de745
A
4136 */
4137 if ((vpid != vp->v_id) || (vp->v_usecount != 0) || (vp->v_iocount != 0) ||
316670eb
A
4138 VONLIST(vp) || (vp->v_lflag & VL_TERMINATE)) {
4139 /*
4140 * we lost the race between dropping the list lock
4141 * and picking up the vnode_lock... someone else
4142 * used this vnode and it is now in a new state
4143 */
4144 vnode_unlock(vp);
0a7de745
A
4145
4146 return NULLVP;
316670eb 4147 }
0a7de745
A
4148 if ((vp->v_lflag & (VL_NEEDINACTIVE | VL_MARKTERM)) == VL_NEEDINACTIVE) {
4149 /*
316670eb
A
4150 * we did a vnode_rele_ext that asked for
4151 * us not to reenter the filesystem during
4152 * the release even though VL_NEEDINACTIVE was
4153 * set... we'll do it here by doing a
4154 * vnode_get/vnode_put
4155 *
4156 * pick up an iocount so that we can call
4157 * vnode_put and drive the VNOP_INACTIVE...
0a7de745 4158 * vnode_put will either leave us off
316670eb
A
4159 * the freelist if a new ref comes in,
4160 * or put us back on the end of the freelist
4161 * or recycle us if we were marked for termination...
4162 * so we'll just go grab a new candidate
4163 */
0a7de745 4164 vp->v_iocount++;
316670eb
A
4165#ifdef JOE_DEBUG
4166 record_vp(vp, 1);
4167#endif
4168 vnode_put_locked(vp);
4169 vnode_unlock(vp);
4170
0a7de745 4171 return NULLVP;
316670eb
A
4172 }
4173 /*
4174 * Checks for anyone racing us for recycle
0a7de745 4175 */
316670eb 4176 if (vp->v_type != VBAD) {
fe8ab488 4177 if (want_vp && (vnode_on_reliable_media(vp) == FALSE || (vp->v_flag & VISDIRTY))) {
316670eb
A
4178 vnode_async_list_add(vp);
4179 vnode_unlock(vp);
0a7de745 4180
316670eb
A
4181 *deferred = 1;
4182
0a7de745 4183 return NULLVP;
316670eb 4184 }
0a7de745 4185 if (vp->v_lflag & VL_DEAD) {
316670eb 4186 panic("new_vnode(%p): the vnode is VL_DEAD but not VBAD", vp);
0a7de745 4187 }
316670eb
A
4188
4189 vnode_lock_convert(vp);
4190 (void)vnode_reclaim_internal(vp, 1, want_vp, 0);
4191
4192 if (want_vp) {
0a7de745 4193 if ((VONLIST(vp))) {
316670eb 4194 panic("new_vnode(%p): vp on list", vp);
0a7de745 4195 }
316670eb 4196 if (vp->v_usecount || vp->v_iocount || vp->v_kusecount ||
0a7de745 4197 (vp->v_lflag & (VNAMED_UBC | VNAMED_MOUNT | VNAMED_FSHASH))) {
316670eb 4198 panic("new_vnode(%p): free vnode still referenced", vp);
0a7de745
A
4199 }
4200 if ((vp->v_mntvnodes.tqe_prev != 0) && (vp->v_mntvnodes.tqe_next != 0)) {
316670eb 4201 panic("new_vnode(%p): vnode seems to be on mount list", vp);
0a7de745
A
4202 }
4203 if (!LIST_EMPTY(&vp->v_nclinks) || !TAILQ_EMPTY(&vp->v_ncchildren)) {
316670eb 4204 panic("new_vnode(%p): vnode still hooked into the name cache", vp);
0a7de745 4205 }
316670eb
A
4206 } else {
4207 vnode_unlock(vp);
4208 vp = NULLVP;
4209 }
4210 }
0a7de745 4211 return vp;
316670eb
A
4212}
4213
39037602 4214__attribute__((noreturn))
316670eb
A
4215static void
4216async_work_continue(void)
4217{
4218 struct async_work_lst *q;
0a7de745
A
4219 int deferred;
4220 vnode_t vp;
316670eb
A
4221
4222 q = &vnode_async_work_list;
4223
4224 for (;;) {
316670eb
A
4225 vnode_list_lock();
4226
0a7de745 4227 if (TAILQ_EMPTY(q)) {
316670eb 4228 assert_wait(q, (THREAD_UNINT));
0a7de745 4229
316670eb 4230 vnode_list_unlock();
0a7de745 4231
316670eb
A
4232 thread_block((thread_continue_t)async_work_continue);
4233
4234 continue;
4235 }
4236 async_work_handled++;
4237
4238 vp = TAILQ_FIRST(q);
4239
4240 vp = process_vp(vp, 0, &deferred);
4241
0a7de745 4242 if (vp != NULLVP) {
316670eb 4243 panic("found VBAD vp (%p) on async queue", vp);
0a7de745 4244 }
316670eb
A
4245 }
4246}
4247
4248
91447636
A
4249static int
4250new_vnode(vnode_t *vpp)
4251{
0a7de745
A
4252 vnode_t vp;
 4253 uint32_t retries = 0, max_retries = 100; /* retry in case of tablefull */
2d21ac55 4254 int force_alloc = 0, walk_count = 0;
316670eb
A
4255 boolean_t need_reliable_vp = FALSE;
4256 int deferred;
0a7de745
A
4257 struct timeval initial_tv;
4258 struct timeval current_tv;
2d21ac55 4259 proc_t curproc = current_proc();
91447636 4260
316670eb 4261 initial_tv.tv_sec = 0;
91447636 4262retry:
2d21ac55
A
4263 vp = NULLVP;
4264
91447636
A
4265 vnode_list_lock();
4266
0a7de745 4267 if (need_reliable_vp == TRUE) {
316670eb 4268 async_work_timed_out++;
0a7de745 4269 }
316670eb 4270
6d2010ae 4271 if ((numvnodes - deadvnodes) < desiredvnodes || force_alloc) {
316670eb
A
4272 struct timespec ts;
4273
0a7de745 4274 if (!TAILQ_EMPTY(&vnode_dead_list)) {
6d2010ae
A
4275 /*
4276 * Can always reuse a dead one
4277 */
4278 vp = TAILQ_FIRST(&vnode_dead_list);
4279 goto steal_this_vp;
4280 }
4281 /*
4282 * no dead vnodes available... if we're under
4283 * the limit, we'll create a new vnode
91447636 4284 */
91447636
A
4285 numvnodes++;
4286 vnode_list_unlock();
b0d623f7 4287
2d21ac55
A
4288 MALLOC_ZONE(vp, struct vnode *, sizeof(*vp), M_VNODE, M_WAITOK);
4289 bzero((char *)vp, sizeof(*vp));
0a7de745 4290 VLISTNONE(vp); /* avoid double queue removal */
91447636
A
4291 lck_mtx_init(&vp->v_lock, vnode_lck_grp, vnode_lck_attr);
4292
39037602
A
4293 TAILQ_INIT(&vp->v_ncchildren);
4294
b0d623f7 4295 klist_init(&vp->v_knotes);
91447636
A
4296 nanouptime(&ts);
4297 vp->v_id = ts.tv_nsec;
4298 vp->v_flag = VSTANDARD;
4299
2d21ac55 4300#if CONFIG_MACF
0a7de745 4301 if (mac_vnode_label_init_needed(vp)) {
b0d623f7 4302 mac_vnode_label_init(vp);
0a7de745 4303 }
2d21ac55
A
4304#endif /* MAC */
4305
cf7d32b8 4306 vp->v_iocount = 1;
91447636
A
4307 goto done;
4308 }
316670eb 4309 microuptime(&current_tv);
2d21ac55
A
4310
4311#define MAX_WALK_COUNT 1000
4312
0a7de745
A
4313 if (!TAILQ_EMPTY(&vnode_rage_list) &&
4314 (ragevnodes >= rage_limit ||
4315 (current_tv.tv_sec - rage_tv.tv_sec) >= RAGE_TIME_LIMIT)) {
2d21ac55 4316 TAILQ_FOREACH(vp, &vnode_rage_list, v_freelist) {
0a7de745 4317 if (!(vp->v_listflag & VLIST_RAGE)) {
316670eb 4318 panic("new_vnode: vp (%p) on RAGE list not marked VLIST_RAGE", vp);
0a7de745 4319 }
2d21ac55 4320
316670eb 4321 // if we're a dependency-capable process, skip vnodes that can
6d2010ae
A
4322 // cause recycling deadlocks. (i.e. this process is diskimages
4323 // helper and the vnode is in a disk image). Querying the
4324 // mnt_kern_flag for the mount's virtual device status
4325 // is safer than checking the mnt_dependent_process, which
0a7de745 4326 // may not be updated if there are multiple devnode layers
6d2010ae
A
4327 // in between the disk image and the final consumer.
4328
0a7de745 4329 if ((curproc->p_flag & P_DEPENDENCY_CAPABLE) == 0 || vp->v_mount == NULL ||
316670eb
A
4330 (vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV) == 0) {
4331 /*
4332 * if need_reliable_vp == TRUE, then we've already sent one or more
4333 * non-reliable vnodes to the async thread for processing and timed
4334 * out waiting for a dead vnode to show up. Use the MAX_WALK_COUNT
0a7de745 4335 * mechanism to first scan for a reliable vnode before forcing
316670eb
A
4336 * a new vnode to be created
4337 */
0a7de745 4338 if (need_reliable_vp == FALSE || vnode_on_reliable_media(vp) == TRUE) {
316670eb 4339 break;
0a7de745 4340 }
316670eb 4341 }
2d21ac55 4342
316670eb
A
4343 // don't iterate more than MAX_WALK_COUNT vnodes to
4344 // avoid keeping the vnode list lock held for too long.
4345
4346 if (walk_count++ > MAX_WALK_COUNT) {
6d2010ae 4347 vp = NULL;
316670eb
A
4348 break;
4349 }
2d21ac55 4350 }
2d21ac55
A
4351 }
4352
4353 if (vp == NULL && !TAILQ_EMPTY(&vnode_free_list)) {
0a7de745 4354 /*
2d21ac55
A
4355 * Pick the first vp for possible reuse
4356 */
4357 walk_count = 0;
4358 TAILQ_FOREACH(vp, &vnode_free_list, v_freelist) {
6d2010ae
A
4359 // if we're a dependency-capable process, skip vnodes that can
4360 // cause recycling deadlocks. (i.e. this process is diskimages
4361 // helper and the vnode is in a disk image). Querying the
4362 // mnt_kern_flag for the mount's virtual device status
4363 // is safer than checking the mnt_dependent_process, which
0a7de745 4364 // may not be updated if there are multiple devnode layers
6d2010ae
A
4365 // in between the disk image and the final consumer.
4366
0a7de745 4367 if ((curproc->p_flag & P_DEPENDENCY_CAPABLE) == 0 || vp->v_mount == NULL ||
316670eb
A
4368 (vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV) == 0) {
4369 /*
4370 * if need_reliable_vp == TRUE, then we've already sent one or more
4371 * non-reliable vnodes to the async thread for processing and timed
4372 * out waiting for a dead vnode to show up. Use the MAX_WALK_COUNT
0a7de745 4373 * mechanism to first scan for a reliable vnode before forcing
316670eb
A
4374 * a new vnode to be created
4375 */
0a7de745 4376 if (need_reliable_vp == FALSE || vnode_on_reliable_media(vp) == TRUE) {
316670eb 4377 break;
0a7de745 4378 }
316670eb 4379 }
2d21ac55 4380
316670eb
A
4381 // don't iterate more than MAX_WALK_COUNT vnodes to
4382 // avoid keeping the vnode list lock held for too long.
2d21ac55 4383
316670eb
A
4384 if (walk_count++ > MAX_WALK_COUNT) {
4385 vp = NULL;
4386 break;
4387 }
4388 }
2d21ac55
A
4389 }
4390
4391 //
4392 // if we don't have a vnode and the walk_count is >= MAX_WALK_COUNT
4393 // then we're trying to create a vnode on behalf of a
4394 // process like diskimages-helper that has file systems
4395 // mounted on top of itself (and thus we can't reclaim
4396 // vnodes in the file systems on top of us). if we can't
4397 // find a vnode to reclaim then we'll just have to force
4398 // the allocation.
4399 //
4400 if (vp == NULL && walk_count >= MAX_WALK_COUNT) {
316670eb
A
4401 force_alloc = 1;
4402 vnode_list_unlock();
4403 goto retry;
2d21ac55
A
4404 }
4405
4406 if (vp == NULL) {
d1ecb069 4407 /*
91447636
A
4408 * we've reached the system imposed maximum number of vnodes
4409 * but there isn't a single one available
4410 * wait a bit and then retry... if we can't get a vnode
316670eb 4411 * after our target number of retries, then log a complaint
91447636 4412 */
316670eb 4413 if (++retries <= max_retries) {
91447636 4414 vnode_list_unlock();
2d21ac55 4415 delay_for_interval(1, 1000 * 1000);
91447636
A
4416 goto retry;
4417 }
0a7de745 4418
91447636
A
4419 vnode_list_unlock();
4420 tablefull("vnode");
4421 log(LOG_EMERG, "%d desired, %d numvnodes, "
0a7de745
A
4422 "%d free, %d dead, %d async, %d rage\n",
4423 desiredvnodes, numvnodes, freevnodes, deadvnodes, async_work_vnodes, ragevnodes);
316670eb 4424#if CONFIG_JETSAM
3e170ce0
A
4425
4426#if DEVELOPMENT || DEBUG
0a7de745 4427 if (bootarg_no_vnode_jetsam) {
3e170ce0 4428 panic("vnode table is full\n");
0a7de745 4429 }
3e170ce0
A
4430#endif /* DEVELOPMENT || DEBUG */
4431
593a1d5f 4432 /*
d1ecb069
A
4433 * Running out of vnodes tends to make a system unusable. Start killing
4434 * processes that jetsam knows are killable.
593a1d5f 4435 */
39236c6e 4436 if (memorystatus_kill_on_vnode_limit() == FALSE) {
d1ecb069
A
4437 /*
4438 * If jetsam can't find any more processes to kill and there
4439 * still aren't any free vnodes, panic. Hopefully we'll get a
4440 * panic log to tell us why we ran out.
4441 */
4442 panic("vnode table is full\n");
4443 }
4444
0a7de745
A
4445 /*
4446 * Now that we've killed someone, wait a bit and continue looking
316670eb
A
4447 * (with fewer retries before trying another kill).
4448 */
4449 delay_for_interval(3, 1000 * 1000);
0a7de745 4450 retries = 0;
316670eb 4451 max_retries = 10;
d1ecb069 4452 goto retry;
593a1d5f 4453#endif
d1ecb069 4454
2d21ac55 4455 *vpp = NULL;
0a7de745 4456 return ENFILE;
91447636
A
4457 }
4458steal_this_vp:
316670eb
A
4459 if ((vp = process_vp(vp, 1, &deferred)) == NULLVP) {
4460 if (deferred) {
0a7de745 4461 int elapsed_msecs;
316670eb 4462 struct timeval elapsed_tv;
91447636 4463
0a7de745 4464 if (initial_tv.tv_sec == 0) {
316670eb 4465 microuptime(&initial_tv);
0a7de745 4466 }
91447636 4467
316670eb 4468 vnode_list_lock();
b0d623f7 4469
316670eb
A
4470 dead_vnode_waited++;
4471 dead_vnode_wanted++;
91447636 4472
316670eb
A
4473 /*
4474 * note that we're only going to explicitly wait 10ms
4475 * for a dead vnode to become available, since even if one
4476 * isn't available, a reliable vnode might now be available
4477 * at the head of the VRAGE or free lists... if so, we
 4478 * can satisfy the new_vnode request with less latency than waiting
4479 * for the full 100ms duration we're ultimately willing to tolerate
4480 */
4481 assert_wait_timeout((caddr_t)&dead_vnode_wanted, (THREAD_INTERRUPTIBLE), 10000, NSEC_PER_USEC);
4482
4483 vnode_list_unlock();
4484
4485 thread_block(THREAD_CONTINUE_NULL);
4486
4487 microuptime(&elapsed_tv);
0a7de745 4488
316670eb
A
4489 timevalsub(&elapsed_tv, &initial_tv);
4490 elapsed_msecs = elapsed_tv.tv_sec * 1000 + elapsed_tv.tv_usec / 1000;
4491
4492 if (elapsed_msecs >= 100) {
4493 /*
0a7de745 4494 * we've waited long enough... 100ms is
316670eb
A
4495 * somewhat arbitrary for this case, but the
4496 * normal worst case latency used for UI
4497 * interaction is 100ms, so I've chosen to
4498 * go with that.
4499 *
4500 * setting need_reliable_vp to TRUE
4501 * forces us to find a reliable vnode
4502 * that we can process synchronously, or
4503 * to create a new one if the scan for
4504 * a reliable one hits the scan limit
4505 */
4506 need_reliable_vp = TRUE;
4507 }
4508 }
91447636
A
4509 goto retry;
4510 }
b0d623f7 4511 OSAddAtomicLong(1, &num_reusedvnodes);
91447636 4512
91447636 4513
2d21ac55
A
4514#if CONFIG_MACF
4515 /*
4516 * We should never see VL_LABELWAIT or VL_LABEL here.
4517 * as those operations hold a reference.
4518 */
0a7de745
A
4519 assert((vp->v_lflag & VL_LABELWAIT) != VL_LABELWAIT);
4520 assert((vp->v_lflag & VL_LABEL) != VL_LABEL);
2d21ac55 4521 if (vp->v_lflag & VL_LABELED) {
0a7de745 4522 vnode_lock_convert(vp);
2d21ac55 4523 mac_vnode_label_recycle(vp);
b0d623f7 4524 } else if (mac_vnode_label_init_needed(vp)) {
0a7de745 4525 vnode_lock_convert(vp);
b0d623f7 4526 mac_vnode_label_init(vp);
2d21ac55 4527 }
b0d623f7 4528
2d21ac55
A
4529#endif /* MAC */
4530
cf7d32b8 4531 vp->v_iocount = 1;
91447636
A
4532 vp->v_lflag = 0;
4533 vp->v_writecount = 0;
0a7de745 4534 vp->v_references = 0;
91447636
A
4535 vp->v_iterblkflags = 0;
4536 vp->v_flag = VSTANDARD;
4537 /* vbad vnodes can point to dead_mountp */
2d21ac55 4538 vp->v_mount = NULL;
91447636
A
4539 vp->v_defer_reclaimlist = (vnode_t)0;
4540
4541 vnode_unlock(vp);
2d21ac55 4542
91447636
A
4543done:
4544 *vpp = vp;
4545
0a7de745 4546 return 0;
91447636
A
4547}
4548
4549void
4550vnode_lock(vnode_t vp)
4551{
4552 lck_mtx_lock(&vp->v_lock);
4553}
4554
2d21ac55
A
4555void
4556vnode_lock_spin(vnode_t vp)
4557{
4558 lck_mtx_lock_spin(&vp->v_lock);
4559}
4560
91447636
A
4561void
4562vnode_unlock(vnode_t vp)
4563{
4564 lck_mtx_unlock(&vp->v_lock);
4565}
4566
4567
4568
4569int
4570vnode_get(struct vnode *vp)
4571{
0a7de745 4572 int retval;
91447636 4573
0a7de745 4574 vnode_lock_spin(vp);
2d21ac55
A
4575 retval = vnode_get_locked(vp);
4576 vnode_unlock(vp);
4577
0a7de745 4578 return retval;
2d21ac55
A
4579}
4580
4581int
4582vnode_get_locked(struct vnode *vp)
4583{
b0d623f7
A
4584#if DIAGNOSTIC
4585 lck_mtx_assert(&vp->v_lock, LCK_MTX_ASSERT_OWNED);
4586#endif
2d21ac55 4587 if ((vp->v_iocount == 0) && (vp->v_lflag & (VL_TERMINATE | VL_DEAD))) {
0a7de745 4588 return ENOENT;
91447636 4589 }
0a7de745
A
4590
4591 if (os_add_overflow(vp->v_iocount, 1, &vp->v_iocount)) {
4592 panic("v_iocount overflow");
4593 }
4594
91447636
A
4595#ifdef JOE_DEBUG
4596 record_vp(vp, 1);
4597#endif
0a7de745 4598 return 0;
91447636
A
4599}
4600
6d2010ae
A
4601/*
4602 * vnode_getwithvid() cuts in line in front of a vnode drain (that is,
4603 * while the vnode is draining, but at no point after that) to prevent
4604 * deadlocks when getting vnodes from filesystem hashes while holding
4605 * resources that may prevent other iocounts from being released.
4606 */
91447636 4607int
b0d623f7 4608vnode_getwithvid(vnode_t vp, uint32_t vid)
91447636 4609{
0a7de745 4610 return vget_internal(vp, vid, (VNODE_NODEAD | VNODE_WITHID | VNODE_DRAINO));
6d2010ae
A
4611}
4612
4613/*
4614 * vnode_getwithvid_drainok() is like vnode_getwithvid(), but *does* block behind a vnode
4615 * drain; it exists for use in the VFS name cache, where we really do want to block behind
4616 * vnode drain to prevent holding off an unmount.
4617 */
4618int
4619vnode_getwithvid_drainok(vnode_t vp, uint32_t vid)
4620{
0a7de745 4621 return vget_internal(vp, vid, (VNODE_NODEAD | VNODE_WITHID));
91447636
A
4622}
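/*
 * Sketch of the classic vnode_getwithvid() consumer: a filesystem hash
 * lookup that captured (vp, vid) under its own hash lock and must detect
 * recycling once that lock is dropped. struct example_node and the hash
 * lock call are hypothetical; vnode_vid() is the KPI accessor for v_id.
 */
struct example_node {
	vnode_t n_vnode;
};

static void example_hash_unlock(void);  /* hypothetical */

static vnode_t
example_hash_get(struct example_node *np)
{
	vnode_t vp = np->n_vnode;
	uint32_t vid = vnode_vid(vp);   /* capture before dropping hash lock */

	example_hash_unlock();
	if (vnode_getwithvid(vp, vid) != 0) {
		return NULLVP;          /* recycled (v_id bumped) or dead */
	}
	return vp;                      /* iocount held; pair with vnode_put() */
}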
4623
4624int
4625vnode_getwithref(vnode_t vp)
4626{
0a7de745 4627 return vget_internal(vp, 0, 0);
91447636
A
4628}
4629
4630
593a1d5f
A
4631__private_extern__ int
4632vnode_getalways(vnode_t vp)
4633{
0a7de745 4634 return vget_internal(vp, 0, VNODE_ALWAYS);
593a1d5f
A
4635}
4636
91447636
A
4637int
4638vnode_put(vnode_t vp)
4639{
0a7de745 4640 int retval;
91447636 4641
2d21ac55 4642 vnode_lock_spin(vp);
91447636
A
4643 retval = vnode_put_locked(vp);
4644 vnode_unlock(vp);
4645
0a7de745 4646 return retval;
91447636
A
4647}
4648
3e170ce0
A
4649static inline void
4650vn_set_dead(vnode_t vp)
4651{
4652 vp->v_mount = NULL;
4653 vp->v_op = dead_vnodeop_p;
4654 vp->v_tag = VT_NON;
4655 vp->v_data = NULL;
4656 vp->v_type = VBAD;
4657 vp->v_lflag |= VL_DEAD;
4658}
4659
91447636
A
4660int
4661vnode_put_locked(vnode_t vp)
4662{
0a7de745 4663 vfs_context_t ctx = vfs_context_current(); /* hoist outside loop */
91447636 4664
b0d623f7
A
4665#if DIAGNOSTIC
4666 lck_mtx_assert(&vp->v_lock, LCK_MTX_ASSERT_OWNED);
4667#endif
91447636 4668retry:
0a7de745 4669 if (vp->v_iocount < 1) {
2d21ac55 4670 panic("vnode_put(%p): iocount < 1", vp);
0a7de745 4671 }
91447636 4672
0a7de745 4673 if ((vp->v_usecount > 0) || (vp->v_iocount > 1)) {
2d21ac55 4674 vnode_dropiocount(vp);
0a7de745 4675 return 0;
91447636 4676 }
6d2010ae 4677 if ((vp->v_lflag & (VL_DEAD | VL_NEEDINACTIVE)) == VL_NEEDINACTIVE) {
0a7de745
A
4678 vp->v_lflag &= ~VL_NEEDINACTIVE;
4679 vnode_unlock(vp);
91447636 4680
2d21ac55 4681 VNOP_INACTIVE(vp, ctx);
91447636 4682
2d21ac55 4683 vnode_lock_spin(vp);
91447636
A
4684 /*
4685 * because we had to drop the vnode lock before calling
4686 * VNOP_INACTIVE, the state of this vnode may have changed...
 4687 * we may pick up both VL_MARKTERM and either
4688 * an iocount or a usecount while in the VNOP_INACTIVE call
4689 * we don't want to call vnode_reclaim_internal on a vnode
4690 * that has active references on it... so loop back around
4691 * and reevaluate the state
4692 */
4693 goto retry;
4694 }
0a7de745 4695 vp->v_lflag &= ~VL_NEEDINACTIVE;
91447636 4696
2d21ac55 4697 if ((vp->v_lflag & (VL_MARKTERM | VL_TERMINATE | VL_DEAD)) == VL_MARKTERM) {
0a7de745
A
4698 vnode_lock_convert(vp);
4699 vnode_reclaim_internal(vp, 1, 1, 0);
2d21ac55
A
4700 }
4701 vnode_dropiocount(vp);
91447636
A
4702 vnode_list_add(vp);
4703
0a7de745 4704 return 0;
91447636
A
4705}
4706
4707/* is vnode_t in use by others? */
0a7de745 4708int
91447636
A
4709vnode_isinuse(vnode_t vp, int refcnt)
4710{
0a7de745 4711 return vnode_isinuse_locked(vp, refcnt, 0);
91447636
A
4712}
4713
0a7de745
A
4714int
4715vnode_usecount(vnode_t vp)
39037602
A
4716{
4717 return vp->v_usecount;
4718}
4719
0a7de745
A
4720int
4721vnode_iocount(vnode_t vp)
39037602
A
4722{
4723 return vp->v_iocount;
4724}
91447636 4725
0a7de745 4726static int
91447636
A
4727vnode_isinuse_locked(vnode_t vp, int refcnt, int locked)
4728{
4729 int retval = 0;
4730
0a7de745 4731 if (!locked) {
2d21ac55 4732 vnode_lock_spin(vp);
0a7de745
A
4733 }
4734 if ((vp->v_type != VREG) && ((vp->v_usecount - vp->v_kusecount) > refcnt)) {
91447636
A
4735 retval = 1;
4736 goto out;
4737 }
0a7de745 4738 if (vp->v_type == VREG) {
91447636
A
4739 retval = ubc_isinuse_locked(vp, refcnt, 1);
4740 }
0a7de745 4741
91447636 4742out:
0a7de745 4743 if (!locked) {
91447636 4744 vnode_unlock(vp);
0a7de745
A
4745 }
4746 return retval;
91447636
A
4747}
4748
4749
4750/* resume vnode_t */
0a7de745 4751errno_t
91447636
A
4752vnode_resume(vnode_t vp)
4753{
b0d623f7 4754 if ((vp->v_lflag & VL_SUSPENDED) && vp->v_owner == current_thread()) {
b0d623f7 4755 vnode_lock_spin(vp);
0a7de745 4756 vp->v_lflag &= ~VL_SUSPENDED;
2d21ac55 4757 vp->v_owner = NULL;
91447636 4758 vnode_unlock(vp);
91447636 4759
b0d623f7
A
4760 wakeup(&vp->v_iocount);
4761 }
0a7de745 4762 return 0;
91447636
A
4763}
4764
2d21ac55
A
4765/* suspend vnode_t
4766 * Please do not use on more than one vnode at a time as it may
4767 * cause deadlocks.
 4768 * xxx should we explicitly prevent this from happening?
4769 */
4770
4771errno_t
4772vnode_suspend(vnode_t vp)
4773{
4774 if (vp->v_lflag & VL_SUSPENDED) {
0a7de745 4775 return EBUSY;
2d21ac55
A
4776 }
4777
4778 vnode_lock_spin(vp);
4779
0a7de745
A
4780 /*
4781 * xxx is this sufficient to check if a vnode_drain is
2d21ac55
A
 4782 * in progress?
4783 */
4784
4785 if (vp->v_owner == NULL) {
4786 vp->v_lflag |= VL_SUSPENDED;
4787 vp->v_owner = current_thread();
4788 }
4789 vnode_unlock(vp);
4790
0a7de745 4791 return 0;
2d21ac55 4792}
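/*
 * Sketch of the suspend/resume bracket the two routines above provide (the
 * operation callback is illustrative): vnode_suspend() fails with EBUSY if
 * another thread already holds the suspension, and vnode_resume() wakes
 * anyone parked in vnode_getiocount().
 */
static int
example_with_vnode_suspended(vnode_t vp, int (*op)(vnode_t))
{
	int error;

	if ((error = vnode_suspend(vp)) != 0) {
		return error;   /* EBUSY: already suspended by someone else */
	}
	error = op(vp);
	vnode_resume(vp);
	return error;
}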
0a7de745 4793
316670eb
A
4794/*
4795 * Release any blocked locking requests on the vnode.
4796 * Used for forced-unmounts.
4797 *
4798 * XXX What about network filesystems?
4799 */
4800static void
4801vnode_abort_advlocks(vnode_t vp)
4802{
0a7de745 4803 if (vp->v_flag & VLOCKLOCAL) {
316670eb 4804 lf_abort_advlocks(vp);
0a7de745 4805 }
316670eb 4806}
2d21ac55 4807
0a7de745
A
4808
4809static errno_t
91447636
A
4810vnode_drain(vnode_t vp)
4811{
91447636 4812 if (vp->v_lflag & VL_DRAIN) {
6d2010ae 4813 panic("vnode_drain: recursive drain");
0a7de745 4814 return ENOENT;
91447636
A
4815 }
4816 vp->v_lflag |= VL_DRAIN;
4817 vp->v_owner = current_thread();
4818
0a7de745 4819 while (vp->v_iocount > 1) {
2d21ac55 4820 msleep(&vp->v_iocount, &vp->v_lock, PVFS, "vnode_drain", NULL);
0a7de745 4821 }
6d2010ae
A
4822
4823 vp->v_lflag &= ~VL_DRAIN;
4824
0a7de745 4825 return 0;
91447636
A
4826}
4827
4828
4829/*
4830 * if the number of recent references via vnode_getwithvid or vnode_getwithref
6d2010ae 4831 * exceeds this threshold, then 'UN-AGE' the vnode by removing it from
91447636
A
4832 * the LRU list if it's currently on it... once the iocount and usecount both drop
 4833 * to 0, it will get put back on the end of the list, effectively making it younger.
 4834 * This allows us to keep actively referenced vnodes in the list without having
4835 * to constantly remove and add to the list each time a vnode w/o a usecount is
4836 * referenced which costs us taking and dropping a global lock twice.
fe8ab488 4837 * However, if the vnode is marked DIRTY, we want to pull it out much earlier
91447636 4838 */
fe8ab488 4839#define UNAGE_THRESHHOLD 25
0a7de745 4840#define UNAGE_DIRTYTHRESHHOLD 6
91447636 4841
6d2010ae 4842errno_t
b0d623f7 4843vnode_getiocount(vnode_t vp, unsigned int vid, int vflags)
91447636
A
4844{
4845 int nodead = vflags & VNODE_NODEAD;
4846 int nosusp = vflags & VNODE_NOSUSPEND;
593a1d5f 4847 int always = vflags & VNODE_ALWAYS;
6d2010ae 4848 int beatdrain = vflags & VNODE_DRAINO;
39236c6e 4849 int withvid = vflags & VNODE_WITHID;
91447636 4850
91447636 4851 for (;;) {
3e170ce0
A
4852 int sleepflg = 0;
4853
91447636
A
4854 /*
4855 * if it is a dead vnode with deadfs
4856 */
0a7de745
A
4857 if (nodead && (vp->v_lflag & VL_DEAD) && ((vp->v_type == VBAD) || (vp->v_data == 0))) {
4858 return ENOENT;
91447636
A
4859 }
4860 /*
4861 * will return VL_DEAD ones
4862 */
0a7de745 4863 if ((vp->v_lflag & (VL_SUSPENDED | VL_DRAIN | VL_TERMINATE)) == 0) {
91447636
A
4864 break;
4865 }
4866 /*
4867 * if suspended vnodes are to be failed
4868 */
4869 if (nosusp && (vp->v_lflag & VL_SUSPENDED)) {
0a7de745 4870 return ENOENT;
91447636
A
4871 }
4872 /*
4873 * if you are the owner of drain/suspend/termination , can acquire iocount
4874 * check for VL_TERMINATE; it does not set owner
4875 */
4876 if ((vp->v_lflag & (VL_DRAIN | VL_SUSPENDED | VL_TERMINATE)) &&
4877 (vp->v_owner == current_thread())) {
0a7de745 4878 break;
91447636 4879 }
0a7de745
A
4880
4881 if (always != 0) {
593a1d5f 4882 break;
0a7de745 4883 }
6d2010ae
A
4884
4885 /*
39236c6e 4886 * If this vnode is getting drained, there are some cases where
3e170ce0
A
4887 * we can't block or, in case of tty vnodes, want to be
4888 * interruptible.
6d2010ae 4889 */
39236c6e
A
4890 if (vp->v_lflag & VL_DRAIN) {
4891 /*
4892 * In some situations, we want to get an iocount
4893 * even if the vnode is draining to prevent deadlock,
4894 * e.g. if we're in the filesystem, potentially holding
4895 * resources that could prevent other iocounts from
4896 * being released.
4897 */
0a7de745 4898 if (beatdrain) {
39236c6e 4899 break;
0a7de745 4900 }
39236c6e
A
4901 /*
4902 * Don't block if the vnode's mount point is unmounting as
4903 * we may be the thread the unmount is itself waiting on
4904 * Only callers who pass in vids (at this point, we've already
4905 * handled nosusp and nodead) are expecting error returns
 4906 * from this function, so we can only return errors for
4907 * those. ENODEV is intended to inform callers that the call
4908 * failed because an unmount is in progress.
4909 */
0a7de745
A
4910 if (withvid && (vp->v_mount) && vfs_isunmount(vp->v_mount)) {
4911 return ENODEV;
4912 }
3e170ce0
A
4913
4914 if (vnode_istty(vp)) {
4915 sleepflg = PCATCH;
4916 }
6d2010ae
A
4917 }
4918
2d21ac55
A
4919 vnode_lock_convert(vp);
4920
91447636 4921 if (vp->v_lflag & VL_TERMINATE) {
3e170ce0
A
4922 int error;
4923
91447636
A
4924 vp->v_lflag |= VL_TERMWANT;
4925
0a7de745
A
4926 error = msleep(&vp->v_lflag, &vp->v_lock,
4927 (PVFS | sleepflg), "vnode getiocount", NULL);
4928 if (error) {
4929 return error;
4930 }
4931 } else {
2d21ac55 4932 msleep(&vp->v_iocount, &vp->v_lock, PVFS, "vnode_getiocount", NULL);
0a7de745 4933 }
91447636 4934 }
39236c6e 4935 if (withvid && vid != vp->v_id) {
0a7de745 4936 return ENOENT;
91447636 4937 }
fe8ab488
A
4938 if (++vp->v_references >= UNAGE_THRESHHOLD ||
4939 (vp->v_flag & VISDIRTY && vp->v_references >= UNAGE_DIRTYTHRESHHOLD)) {
0a7de745 4940 vp->v_references = 0;
91447636
A
4941 vnode_list_remove(vp);
4942 }
4943 vp->v_iocount++;
4944#ifdef JOE_DEBUG
4945 record_vp(vp, 1);
4946#endif
0a7de745 4947 return 0;
91447636
A
4948}
4949
4950static void
0a7de745 4951vnode_dropiocount(vnode_t vp)
91447636 4952{
0a7de745 4953 if (vp->v_iocount < 1) {
2d21ac55 4954 panic("vnode_dropiocount(%p): v_iocount < 1", vp);
0a7de745 4955 }
91447636
A
4956
4957 vp->v_iocount--;
4958#ifdef JOE_DEBUG
4959 record_vp(vp, -1);
4960#endif
0a7de745 4961 if ((vp->v_lflag & (VL_DRAIN | VL_SUSPENDED)) && (vp->v_iocount <= 1)) {
91447636 4962 wakeup(&vp->v_iocount);
0a7de745 4963 }
91447636
A
4964}
4965
4966
4967void
4968vnode_reclaim(struct vnode * vp)
4969{
2d21ac55 4970 vnode_reclaim_internal(vp, 0, 0, 0);
91447636
A
4971}
4972
4973__private_extern__
4974void
2d21ac55 4975vnode_reclaim_internal(struct vnode * vp, int locked, int reuse, int flags)
91447636
A
4976{
4977 int isfifo = 0;
4978
0a7de745 4979 if (!locked) {
91447636 4980 vnode_lock(vp);
0a7de745 4981 }
91447636
A
4982
4983 if (vp->v_lflag & VL_TERMINATE) {
4984 panic("vnode reclaim in progress");
4985 }
4986 vp->v_lflag |= VL_TERMINATE;
4987
2d21ac55
A
4988 vn_clearunionwait(vp, 1);
4989
b0d623f7
A
4990 vnode_drain(vp);
4991
91447636
A
4992 isfifo = (vp->v_type == VFIFO);
4993
0a7de745
A
4994 if (vp->v_type != VBAD) {
4995 vgone(vp, flags); /* clean and reclaim the vnode */
4996 }
91447636 4997 /*
4a3eedf9
A
4998 * give the vnode a new identity so that vnode_getwithvid will fail
4999 * on any stale cache accesses...
5000 * grab the list_lock so that if we're in "new_vnode"
5001 * behind the list_lock trying to steal this vnode, the v_id is stable...
5002 * once new_vnode drops the list_lock, it will block trying to take
5003 * the vnode lock until we release it... at that point it will evaluate
 5004 * whether the v_id has changed
cf7d32b8
A
5005 * also need to make sure that the vnode isn't on a list where "new_vnode"
5006 * can find it after the v_id has been bumped until we are completely done
5007 * with the vnode (i.e. putting it back on a list has to be the very last
5008 * thing we do to this vnode... many of the callers of vnode_reclaim_internal
5009 * are holding an io_count on the vnode... they need to drop the io_count
5010 * BEFORE doing a vnode_list_add or make sure to hold the vnode lock until
5011 * they are completely done with the vnode
91447636 5012 */
4a3eedf9 5013 vnode_list_lock();
cf7d32b8
A
5014
5015 vnode_list_remove_locked(vp);
91447636 5016 vp->v_id++;
cf7d32b8 5017
4a3eedf9
A
5018 vnode_list_unlock();
5019
91447636
A
5020 if (isfifo) {
5021 struct fifoinfo * fip;
5022
5023 fip = vp->v_fifoinfo;
5024 vp->v_fifoinfo = NULL;
5025 FREE(fip, M_TEMP);
5026 }
91447636
A
5027 vp->v_type = VBAD;
5028
0a7de745 5029 if (vp->v_data) {
91447636 5030 panic("vnode_reclaim_internal: cleaned vnode isn't");
0a7de745
A
5031 }
5032 if (vp->v_numoutput) {
cf7d32b8 5033 panic("vnode_reclaim_internal: clean vnode has pending I/O's");
0a7de745
A
5034 }
5035 if (UBCINFOEXISTS(vp)) {
91447636 5036 panic("vnode_reclaim_internal: ubcinfo not cleaned");
0a7de745
A
5037 }
5038 if (vp->v_parent) {
5039 panic("vnode_reclaim_internal: vparent not removed");
5040 }
5041 if (vp->v_name) {
5042 panic("vnode_reclaim_internal: vname not removed");
5043 }
91447636 5044
2d21ac55 5045 vp->v_socket = NULL;
91447636
A
5046
5047 vp->v_lflag &= ~VL_TERMINATE;
2d21ac55 5048 vp->v_owner = NULL;
91447636 5049
b0d623f7
A
5050 KNOTE(&vp->v_knotes, NOTE_REVOKE);
5051
5052 /* Make sure that when we reuse the vnode, no knotes left over */
5053 klist_init(&vp->v_knotes);
5054
91447636
A
5055 if (vp->v_lflag & VL_TERMWANT) {
5056 vp->v_lflag &= ~VL_TERMWANT;
5057 wakeup(&vp->v_lflag);
5058 }
cf7d32b8 5059 if (!reuse) {
0a7de745 5060 /*
2d21ac55 5061 * make sure we get on the
cf7d32b8 5062 * dead list if appropriate
2d21ac55 5063 */
0a7de745
A
5064 vnode_list_add(vp);
5065 }
5066 if (!locked) {
5067 vnode_unlock(vp);
2d21ac55 5068 }
91447636
A
5069}
5070
3e170ce0
A
5071static int
5072vnode_create_internal(uint32_t flavor, uint32_t size, void *data, vnode_t *vpp,
5073 int init_vnode)
91447636
A
5074{
5075 int error;
5076 int insert = 1;
3e170ce0 5077 int existing_vnode;
91447636
A
5078 vnode_t vp;
5079 vnode_t nvp;
5080 vnode_t dvp;
0a7de745 5081 struct uthread *ut;
91447636
A
5082 struct componentname *cnp;
5083 struct vnode_fsparam *param = (struct vnode_fsparam *)data;
6d2010ae
A
5084#if CONFIG_TRIGGERS
5085 struct vnode_trigger_param *tinfo = NULL;
5086#endif
3e170ce0
A
5087 if (*vpp) {
5088 vp = *vpp;
5089 *vpp = NULLVP;
5090 existing_vnode = 1;
5091 } else {
5092 existing_vnode = 0;
db609669
A
5093 }
5094
3e170ce0
A
5095 if (init_vnode) {
5096 /* Do quick sanity check on the parameters. */
5097 if ((param == NULL) || (param->vnfs_vtype == VBAD)) {
5098 error = EINVAL;
5099 goto error_out;
5100 }
6d2010ae 5101
3e170ce0
A
5102#if CONFIG_TRIGGERS
5103 if ((flavor == VNCREATE_TRIGGER) && (size == VNCREATE_TRIGGER_SIZE)) {
5104 tinfo = (struct vnode_trigger_param *)data;
5105
5106 /* Validate trigger vnode input */
5107 if ((param->vnfs_vtype != VDIR) ||
5108 (tinfo->vnt_resolve_func == NULL) ||
5109 (tinfo->vnt_flags & ~VNT_VALID_MASK)) {
5110 error = EINVAL;
5111 goto error_out;
5112 }
5113 /* Fall through a normal create (params will be the same) */
5114 flavor = VNCREATE_FLAVOR;
5115 size = VCREATESIZE;
6d2010ae 5116 }
6d2010ae 5117#endif
3e170ce0
A
5118 if ((flavor != VNCREATE_FLAVOR) || (size != VCREATESIZE)) {
5119 error = EINVAL;
5120 goto error_out;
5121 }
5122 }
6d2010ae 5123
3e170ce0 5124 if (!existing_vnode) {
0a7de745
A
5125 if ((error = new_vnode(&vp))) {
5126 return error;
3e170ce0
A
5127 }
5128 if (!init_vnode) {
 5129 /* Make it so that it can be released by a vnode_put() */
5130 vn_set_dead(vp);
5131 *vpp = vp;
0a7de745 5132 return 0;
3e170ce0
A
5133 }
5134 } else {
5135 /*
5136 * A vnode obtained by vnode_create_empty has been passed to
5137 * vnode_initialize - Unset VL_DEAD set by vn_set_dead. After
5138 * this point, it is set back on any error.
5139 *
5140 * N.B. vnode locking - We make the same assumptions as the
5141 * "unsplit" vnode_create did - i.e. it is safe to update the
5142 * vnode's fields without the vnode lock. This vnode has been
5143 * out and about with the filesystem and hopefully nothing
5144 * was done to the vnode between the vnode_create_empty and
5145 * now when it has come in through vnode_initialize.
5146 */
5147 vp->v_lflag &= ~VL_DEAD;
5148 }
91447636 5149
6d2010ae
A
5150 dvp = param->vnfs_dvp;
5151 cnp = param->vnfs_cnp;
5152
5153 vp->v_op = param->vnfs_vops;
5154 vp->v_type = param->vnfs_vtype;
5155 vp->v_data = param->vnfs_fsnode;
5156
0a7de745 5157 if (param->vnfs_markroot) {
6d2010ae 5158 vp->v_flag |= VROOT;
0a7de745
A
5159 }
5160 if (param->vnfs_marksystem) {
6d2010ae 5161 vp->v_flag |= VSYSTEM;
0a7de745 5162 }
6d2010ae
A
5163 if (vp->v_type == VREG) {
5164 error = ubc_info_init_withsize(vp, param->vnfs_filesize);
5165 if (error) {
91447636 5166#ifdef JOE_DEBUG
6d2010ae 5167 record_vp(vp, 1);
91447636 5168#endif
3e170ce0 5169 vn_set_dead(vp);
6d2010ae
A
5170
5171 vnode_put(vp);
0a7de745 5172 return error;
6d2010ae 5173 }
0a7de745 5174 if (param->vnfs_mp->mnt_ioflags & MNT_IOFLAGS_IOSCHED_SUPPORTED) {
fe8ab488 5175 memory_object_mark_io_tracking(vp->v_ubcinfo->ui_control);
0a7de745 5176 }
6d2010ae
A
5177 }
5178#ifdef JOE_DEBUG
5179 record_vp(vp, 1);
5180#endif
5181
5182#if CONFIG_TRIGGERS
5183 /*
5184 * For trigger vnodes, attach trigger info to vnode
5185 */
5186 if ((vp->v_type == VDIR) && (tinfo != NULL)) {
0a7de745 5187 /*
6d2010ae 5188 * Note: has a side effect of incrementing trigger count on the
0a7de745 5189 * mount if successful, which we would need to undo on a
6d2010ae
A
5190 * subsequent failure.
5191 */
5192#ifdef JOE_DEBUG
5193 record_vp(vp, -1);
5194#endif
5195 error = vnode_resolver_create(param->vnfs_mp, vp, tinfo, FALSE);
5196 if (error) {
5197 printf("vnode_create: vnode_resolver_create() err %d\n", error);
3e170ce0 5198 vn_set_dead(vp);
91447636
A
5199#ifdef JOE_DEBUG
5200 record_vp(vp, 1);
5201#endif
6d2010ae 5202 vnode_put(vp);
0a7de745 5203 return error;
6d2010ae
A
5204 }
5205 }
5206#endif
5207 if (vp->v_type == VCHR || vp->v_type == VBLK) {
0a7de745 5208 vp->v_tag = VT_DEVFS; /* callers will reset if needed (bdevvp) */
91447636 5209
0a7de745 5210 if ((nvp = checkalias(vp, param->vnfs_rdev))) {
6d2010ae
A
5211 /*
5212 * if checkalias returns a vnode, it will be locked
5213 *
5214 * first get rid of the unneeded vnode we acquired
5215 */
5216 vp->v_data = NULL;
5217 vp->v_op = spec_vnodeop_p;
5218 vp->v_type = VBAD;
5219 vp->v_lflag = VL_DEAD;
0a7de745 5220 vp->v_data = NULL;
6d2010ae
A
5221 vp->v_tag = VT_NON;
5222 vnode_put(vp);
91447636 5223
6d2010ae
A
5224 /*
5225 * switch to aliased vnode and finish
5226 * preparing it
91447636 5227 */
6d2010ae 5228 vp = nvp;
91447636 5229
6d2010ae
A
5230 vclean(vp, 0);
5231 vp->v_op = param->vnfs_vops;
5232 vp->v_type = param->vnfs_vtype;
5233 vp->v_data = param->vnfs_fsnode;
5234 vp->v_lflag = 0;
5235 vp->v_mount = NULL;
5236 insmntque(vp, param->vnfs_mp);
5237 insert = 0;
5238 vnode_unlock(vp);
5239 }
316670eb
A
5240
5241 if (VCHR == vp->v_type) {
5242 u_int maj = major(vp->v_rdev);
5243
0a7de745 5244 if (maj < (u_int)nchrdev && cdevsw[maj].d_type == D_TTY) {
316670eb 5245 vp->v_flag |= VISTTY;
0a7de745 5246 }
316670eb 5247 }
6d2010ae
A
5248 }
5249
5250 if (vp->v_type == VFIFO) {
5251 struct fifoinfo *fip;
5252
5253 MALLOC(fip, struct fifoinfo *,
0a7de745
A
5254 sizeof(*fip), M_TEMP, M_WAITOK);
5255 bzero(fip, sizeof(struct fifoinfo));
6d2010ae
A
5256 vp->v_fifoinfo = fip;
5257 }
5258 /* The file systems must pass the address of the location where
5259 * they store the vnode pointer. When we add the vnode into the mount
5260 * list and name cache they become discoverable. So the file system node
5261 * must have the connection to vnode setup by then
5262 */
5263 *vpp = vp;
5264
5265 /* Add fs named reference. */
5266 if (param->vnfs_flags & VNFS_ADDFSREF) {
5267 vp->v_lflag |= VNAMED_FSHASH;
5268 }
5269 if (param->vnfs_mp) {
0a7de745
A
5270 if (param->vnfs_mp->mnt_kern_flag & MNTK_LOCK_LOCAL) {
5271 vp->v_flag |= VLOCKLOCAL;
5272 }
6d2010ae 5273 if (insert) {
0a7de745 5274 if ((vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb)) {
6d2010ae 5275 panic("insmntque: vp on the free list\n");
0a7de745 5276 }
6d2010ae
A
5277
5278 /*
5279 * enter in mount vnode list
5280 */
5281 insmntque(vp, param->vnfs_mp);
5282 }
6d2010ae
A
5283 }
5284 if (dvp && vnode_ref(dvp) == 0) {
5285 vp->v_parent = dvp;
5286 }
5287 if (cnp) {
5288 if (dvp && ((param->vnfs_flags & (VNFS_NOCACHE | VNFS_CANTCACHE)) == 0)) {
5289 /*
5290 * enter into name cache
5291 * we've got the info to enter it into the name cache now
5292 * cache_enter_create will pick up an extra reference on
5293 * the name entered into the string cache
5294 */
5295 vp->v_name = cache_enter_create(dvp, vp, cnp);
0a7de745 5296 } else {
6d2010ae 5297 vp->v_name = vfs_addname(cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_hash, 0);
0a7de745 5298 }
b0d623f7 5299
0a7de745 5300 if ((cnp->cn_flags & UNIONCREATED) == UNIONCREATED) {
6d2010ae 5301 vp->v_flag |= VISUNION;
0a7de745 5302 }
6d2010ae
A
5303 }
5304 if ((param->vnfs_flags & VNFS_CANTCACHE) == 0) {
5305 /*
5306 * this vnode is being created as cacheable in the name cache
5307 * this allows us to re-enter it in the cache
5308 */
5309 vp->v_flag |= VNCACHEABLE;
5310 }
5311 ut = get_bsdthread_info(current_thread());
5312
5313 if ((current_proc()->p_lflag & P_LRAGE_VNODES) ||
d9a64523 5314 (ut->uu_flag & (UT_RAGE_VNODES | UT_KERN_RAGE_VNODES))) {
6d2010ae
A
5315 /*
5316 * process has indicated that it wants any
5317 * vnodes created on its behalf to be rapidly
5318 * aged to reduce the impact on the cached set
5319 * of vnodes
d9a64523
A
5320 *
5321 * if UT_KERN_RAGE_VNODES is set, then the
5322 * kernel internally wants vnodes to be rapidly
5323 * aged, even if the process hasn't requested
5324 * this
6d2010ae
A
5325 */
5326 vp->v_flag |= VRAGE;
91447636 5327 }
39037602
A
5328
5329#if CONFIG_SECLUDED_MEMORY
5330 switch (secluded_for_filecache) {
5331 case 0:
5332 /*
5333 * secluded_for_filecache == 0:
5334 * + no file contents in secluded pool
5335 */
5336 break;
5337 case 1:
5338 /*
5339 * secluded_for_filecache == 1:
5340 * + no files from /
5341 * + files from /Applications/ are OK
5342 * + files from /Applications/Camera are not OK
5343 * + no files that are open for write
5344 */
5345 if (vnode_vtype(vp) == VREG &&
5346 vnode_mount(vp) != NULL &&
0a7de745 5347 (!(vfs_flags(vnode_mount(vp)) & MNT_ROOTFS))) {
39037602
A
5348 /* not from root filesystem: eligible for secluded pages */
5349 memory_object_mark_eligible_for_secluded(
5350 ubc_getobject(vp, UBC_FLAGS_NONE),
5351 TRUE);
5352 }
5353 break;
5354 case 2:
5355 /*
5356 * secluded_for_filecache == 2:
5357 * + all read-only files OK, except:
0a7de745
A
5358 * + dyld_shared_cache_arm64*
5359 * + Camera
39037602
A
5360 * + mediaserverd
5361 */
5362 if (vnode_vtype(vp) == VREG) {
5363 memory_object_mark_eligible_for_secluded(
5364 ubc_getobject(vp, UBC_FLAGS_NONE),
5365 TRUE);
5366 }
5367 break;
5368 default:
5369 break;
5370 }
5371#endif /* CONFIG_SECLUDED_MEMORY */
5372
0a7de745 5373 return 0;
3e170ce0
A
5374
5375error_out:
5376 if (existing_vnode) {
5377 vnode_put(vp);
5378 }
0a7de745 5379 return error;
3e170ce0
A
5380}
5381
5382/* USAGE:
5383 * The following API creates a vnode, associates all the parameters specified in the vnode_fsparam
5384 * structure, and returns a vnode handle with a reference. Device aliasing is handled here, so checkalias
5385 * is obsoleted by this.
5386 */
5387int
5388vnode_create(uint32_t flavor, uint32_t size, void *data, vnode_t *vpp)
5389{
5390 *vpp = NULLVP;
0a7de745 5391 return vnode_create_internal(flavor, size, data, vpp, 1);
3e170ce0
A
5392}
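/*
 * Illustrative sketch (not part of the original file): how a filesystem
 * might call vnode_create() when materializing a node. The vnode_fsparam
 * fields are real; "struct myfs_node", "myfs_vnodeops" and the wrapper
 * function itself are hypothetical placeholders.
 */
struct myfs_node;                        /* hypothetical fs-private node */
extern int (**myfs_vnodeops)(void *);    /* hypothetical vnop vector */

static int
myfs_make_vnode(mount_t mp, struct myfs_node *np, vnode_t dvp,
    struct componentname *cnp, vnode_t *vpp)
{
	struct vnode_fsparam vfsp;

	bzero(&vfsp, sizeof(vfsp));
	vfsp.vnfs_mp = mp;               /* mount this vnode belongs to */
	vfsp.vnfs_vtype = VREG;          /* a regular file in this sketch */
	vfsp.vnfs_str = "myfs";
	vfsp.vnfs_dvp = dvp;             /* parent; used for v_parent and the name cache */
	vfsp.vnfs_cnp = cnp;             /* name to enter into the name cache */
	vfsp.vnfs_fsnode = np;           /* becomes vp->v_data */
	vfsp.vnfs_vops = myfs_vnodeops;
	vfsp.vnfs_flags = VNFS_ADDFSREF; /* take the named fs reference */

	/* on success *vpp is returned with an iocount held */
	return vnode_create(VNCREATE_FLAVOR, VCREATESIZE, &vfsp, vpp);
}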
5393
5394int
5395vnode_create_empty(vnode_t *vpp)
5396{
5397 *vpp = NULLVP;
0a7de745
A
5398 return vnode_create_internal(VNCREATE_FLAVOR, VCREATESIZE, NULL,
5399 vpp, 0);
3e170ce0
A
5400}
5401
5402int
5403vnode_initialize(uint32_t flavor, uint32_t size, void *data, vnode_t *vpp)
5404{
5405 if (*vpp == NULLVP) {
5406 panic("NULL vnode passed to vnode_initialize");
5407 }
5408#if DEVELOPMENT || DEBUG
5409 /*
5410 * We lock to check that vnode is fit for unlocked use in
5411 * vnode_create_internal.
5412 */
5413 vnode_lock_spin(*vpp);
5414 VNASSERT(((*vpp)->v_iocount == 1), *vpp,
5415 ("vnode_initialize : iocount not 1, is %d", (*vpp)->v_iocount));
5416 VNASSERT(((*vpp)->v_usecount == 0), *vpp,
5417 ("vnode_initialize : usecount not 0, is %d", (*vpp)->v_usecount));
5418 VNASSERT(((*vpp)->v_lflag & VL_DEAD), *vpp,
5419 ("vnode_initialize : v_lflag does not have VL_DEAD, is 0x%x",
5420 (*vpp)->v_lflag));
5421 VNASSERT(((*vpp)->v_data == NULL), *vpp,
5422 ("vnode_initialize : v_data not NULL"));
5423 vnode_unlock(*vpp);
5424#endif
0a7de745 5425 return vnode_create_internal(flavor, size, data, vpp, 1);
91447636
A
5426}
5427
5428int
5429vnode_addfsref(vnode_t vp)
5430{
2d21ac55 5431 vnode_lock_spin(vp);
0a7de745 5432 if (vp->v_lflag & VNAMED_FSHASH) {
91447636 5433 panic("add_fsref: vp already has named reference");
0a7de745
A
5434 }
5435 if ((vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb)) {
5436 panic("addfsref: vp on the free list\n");
5437 }
91447636
A
5438 vp->v_lflag |= VNAMED_FSHASH;
5439 vnode_unlock(vp);
0a7de745 5440 return 0;
91447636
A
5441}
5442int
5443vnode_removefsref(vnode_t vp)
5444{
2d21ac55 5445 vnode_lock_spin(vp);
0a7de745 5446 if ((vp->v_lflag & VNAMED_FSHASH) == 0) {
91447636 5447 panic("remove_fsref: no named reference");
0a7de745 5448 }
91447636
A
5449 vp->v_lflag &= ~VNAMED_FSHASH;
5450 vnode_unlock(vp);
0a7de745 5451 return 0;
91447636
A
5452}
5453
5454
5455int
6d2010ae 5456vfs_iterate(int flags, int (*callout)(mount_t, void *), void *arg)
91447636 5457{
0a7de745 5458 mount_t mp;
91447636
A
5459 int ret = 0;
5460 fsid_t * fsid_list;
0a7de745 5461 int count, actualcount, i;
91447636 5462 void * allocmem;
6d2010ae 5463 int indx_start, indx_stop, indx_incr;
fe8ab488 5464 int cb_dropref = (flags & VFS_ITERATE_CB_DROPREF);
91447636
A
5465
5466 count = mount_getvfscnt();
5467 count += 10;
5468
5469 fsid_list = (fsid_t *)kalloc(count * sizeof(fsid_t));
5470 allocmem = (void *)fsid_list;
5471
5472 actualcount = mount_fillfsids(fsid_list, count);
5473
6d2010ae
A
5474 /*
5475 * Establish the iteration direction.
5476 * VFS_ITERATE_TAIL_FIRST overrides the default head-first (oldest first) order.
5477 */
5478 if (flags & VFS_ITERATE_TAIL_FIRST) {
5479 indx_start = actualcount - 1;
5480 indx_stop = -1;
5481 indx_incr = -1;
0a7de745 5482 } else { /* Head first by default */
6d2010ae
A
5483 indx_start = 0;
5484 indx_stop = actualcount;
5485 indx_incr = 1;
5486 }
5487
0a7de745 5488 for (i = indx_start; i != indx_stop; i += indx_incr) {
91447636
A
5489 /* obtain the mount point with iteration reference */
5490 mp = mount_list_lookupby_fsid(&fsid_list[i], 0, 1);
5491
0a7de745 5492 if (mp == (struct mount *)0) {
91447636 5493 continue;
0a7de745 5494 }
91447636
A
5495 mount_lock(mp);
5496 if (mp->mnt_lflag & (MNT_LDEAD | MNT_LUNMOUNT)) {
5497 mount_unlock(mp);
5498 mount_iterdrop(mp);
5499 continue;
91447636
A
5500 }
5501 mount_unlock(mp);
5502
5503 /* iterate over all the vnodes */
5504 ret = callout(mp, arg);
5505
fe8ab488
A
5506 /*
5507 * Drop the iterref here if the callback didn't do it.
5508 * Note: If cb_dropref is set the mp may no longer exist.
5509 */
0a7de745 5510 if (!cb_dropref) {
fe8ab488 5511 mount_iterdrop(mp);
0a7de745 5512 }
91447636
A
5513
5514 switch (ret) {
5515 case VFS_RETURNED:
5516 case VFS_RETURNED_DONE:
5517 if (ret == VFS_RETURNED_DONE) {
5518 ret = 0;
5519 goto out;
5520 }
5521 break;
5522
5523 case VFS_CLAIMED_DONE:
5524 ret = 0;
5525 goto out;
5526 case VFS_CLAIMED:
5527 default:
5528 break;
5529 }
5530 ret = 0;
5531 }
5532
5533out:
5534 kfree(allocmem, (count * sizeof(fsid_t)));
0a7de745 5535 return ret;
91447636
A
5536}
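/*
 * Illustrative sketch (not part of the original file): a minimal
 * vfs_iterate() callout. The callback runs once per live mount with an
 * iteration reference held; returning VFS_RETURNED continues the walk and
 * lets vfs_iterate() drop the reference, VFS_RETURNED_DONE stops it.
 * "count_mounts"/"count_cb" are hypothetical names.
 */
static int
count_cb(mount_t mp, void *arg)
{
	int *countp = arg;

	(*countp)++;                     /* could inspect vfs_statfs(mp) here */
	return VFS_RETURNED;
}

static int
count_mounts(void)
{
	int count = 0;

	(void)vfs_iterate(0, count_cb, &count);
	return count;
}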
5537
5538/*
5539 * Update the vfsstatfs structure in the mountpoint.
2d21ac55
A
5540 * MAC: Parameter eventtype added, indicating whether the event that
5541 * triggered this update came from user space, via a system call
5542 * (VFS_USER_EVENT) or an internal kernel call (VFS_KERNEL_EVENT).
91447636
A
5543 */
5544int
2d21ac55 5545vfs_update_vfsstat(mount_t mp, vfs_context_t ctx, __unused int eventtype)
91447636 5546{
0a7de745
A
5547 struct vfs_attr va;
5548 int error;
91447636
A
5549
5550 /*
5551 * Request the attributes we want to propagate into
5552 * the per-mount vfsstat structure.
5553 */
5554 VFSATTR_INIT(&va);
5555 VFSATTR_WANTED(&va, f_iosize);
5556 VFSATTR_WANTED(&va, f_blocks);
5557 VFSATTR_WANTED(&va, f_bfree);
5558 VFSATTR_WANTED(&va, f_bavail);
5559 VFSATTR_WANTED(&va, f_bused);
5560 VFSATTR_WANTED(&va, f_files);
5561 VFSATTR_WANTED(&va, f_ffree);
5562 VFSATTR_WANTED(&va, f_bsize);
5563 VFSATTR_WANTED(&va, f_fssubtype);
743345f9
A
5564
5565 if ((error = vfs_getattr(mp, &va, ctx)) != 0) {
5566 KAUTH_DEBUG("STAT - filesystem returned error %d", error);
0a7de745 5567 return error;
743345f9 5568 }
2d21ac55
A
5569#if CONFIG_MACF
5570 if (eventtype == VFS_USER_EVENT) {
5571 error = mac_mount_check_getattr(ctx, mp, &va);
0a7de745
A
5572 if (error != 0) {
5573 return error;
5574 }
2d21ac55
A
5575 }
5576#endif
91447636
A
5577 /*
5578 * Unpack into the per-mount structure.
5579 *
5580 * We only overwrite these fields, which are likely to change:
5581 * f_blocks
5582 * f_bfree
5583 * f_bavail
5584 * f_bused
5585 * f_files
5586 * f_ffree
5587 *
5588 * And these which are not, but which the FS has no other way
5589 * of providing to us:
5590 * f_bsize
5591 * f_iosize
5592 * f_fssubtype
5593 *
5594 */
5595 if (VFSATTR_IS_SUPPORTED(&va, f_bsize)) {
2d21ac55
A
5596 /* 4822056 - protect against malformed server mount */
5597 mp->mnt_vfsstat.f_bsize = (va.f_bsize > 0 ? va.f_bsize : 512);
91447636 5598 } else {
0a7de745 5599 mp->mnt_vfsstat.f_bsize = mp->mnt_devblocksize; /* default from the device block size */
91447636
A
5600 }
5601 if (VFSATTR_IS_SUPPORTED(&va, f_iosize)) {
5602 mp->mnt_vfsstat.f_iosize = va.f_iosize;
5603 } else {
0a7de745 5604 mp->mnt_vfsstat.f_iosize = 1024 * 1024; /* 1MB sensible I/O size */
91447636 5605 }
0a7de745 5606 if (VFSATTR_IS_SUPPORTED(&va, f_blocks)) {
91447636 5607 mp->mnt_vfsstat.f_blocks = va.f_blocks;
0a7de745
A
5608 }
5609 if (VFSATTR_IS_SUPPORTED(&va, f_bfree)) {
91447636 5610 mp->mnt_vfsstat.f_bfree = va.f_bfree;
0a7de745
A
5611 }
5612 if (VFSATTR_IS_SUPPORTED(&va, f_bavail)) {
91447636 5613 mp->mnt_vfsstat.f_bavail = va.f_bavail;
0a7de745
A
5614 }
5615 if (VFSATTR_IS_SUPPORTED(&va, f_bused)) {
91447636 5616 mp->mnt_vfsstat.f_bused = va.f_bused;
0a7de745
A
5617 }
5618 if (VFSATTR_IS_SUPPORTED(&va, f_files)) {
91447636 5619 mp->mnt_vfsstat.f_files = va.f_files;
0a7de745
A
5620 }
5621 if (VFSATTR_IS_SUPPORTED(&va, f_ffree)) {
91447636 5622 mp->mnt_vfsstat.f_ffree = va.f_ffree;
0a7de745 5623 }
91447636
A
5624
5625 /* this is unlikely to change, but has to be queried for */
0a7de745 5626 if (VFSATTR_IS_SUPPORTED(&va, f_fssubtype)) {
91447636 5627 mp->mnt_vfsstat.f_fssubtype = va.f_fssubtype;
0a7de745 5628 }
91447636 5629
0a7de745 5630 return 0;
91447636
A
5631}
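/*
 * Illustrative sketch (not part of the original file): the same
 * VFSATTR_INIT/VFSATTR_WANTED/vfs_getattr pattern used above, reduced to a
 * single attribute. The helper name is hypothetical.
 */
static int
query_fs_free_blocks(mount_t mp, vfs_context_t ctx, uint64_t *bfreep)
{
	struct vfs_attr va;
	int error;

	VFSATTR_INIT(&va);
	VFSATTR_WANTED(&va, f_bfree);

	if ((error = vfs_getattr(mp, &va, ctx)) != 0) {
		return error;
	}
	/* the filesystem is free to decline any requested attribute */
	if (!VFSATTR_IS_SUPPORTED(&va, f_bfree)) {
		return ENOTSUP;
	}
	*bfreep = va.f_bfree;
	return 0;
}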
5632
b0d623f7 5633int
91447636
A
5634mount_list_add(mount_t mp)
5635{
b0d623f7
A
5636 int res;
5637
91447636 5638 mount_list_lock();
b0d623f7
A
5639 if (system_inshutdown != 0) {
5640 res = -1;
5641 } else {
0a7de745 5642 TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list);
b0d623f7
A
5643 nummounts++;
5644 res = 0;
5645 }
91447636 5646 mount_list_unlock();
b0d623f7
A
5647
5648 return res;
91447636
A
5649}
5650
5651void
5652mount_list_remove(mount_t mp)
5653{
5654 mount_list_lock();
5655 TAILQ_REMOVE(&mountlist, mp, mnt_list);
5656 nummounts--;
2d21ac55
A
5657 mp->mnt_list.tqe_next = NULL;
5658 mp->mnt_list.tqe_prev = NULL;
91447636
A
5659 mount_list_unlock();
5660}
5661
5662mount_t
5663mount_lookupby_volfsid(int volfs_id, int withref)
5664{
5665 mount_t cur_mount = (mount_t)0;
2d21ac55 5666 mount_t mp;
91447636
A
5667
5668 mount_list_lock();
2d21ac55
A
5669 TAILQ_FOREACH(mp, &mountlist, mnt_list) {
5670 if (!(mp->mnt_kern_flag & MNTK_UNMOUNT) &&
5671 (mp->mnt_kern_flag & MNTK_PATH_FROM_ID) &&
5672 (mp->mnt_vfsstat.f_fsid.val[0] == volfs_id)) {
5673 cur_mount = mp;
91447636 5674 if (withref) {
0a7de745 5675 if (mount_iterref(cur_mount, 1)) {
91447636
A
5676 cur_mount = (mount_t)0;
5677 mount_list_unlock();
5678 goto out;
5679 }
5680 }
2d21ac55
A
5681 break;
5682 }
91447636
A
5683 }
5684 mount_list_unlock();
5685 if (withref && (cur_mount != (mount_t)0)) {
5686 mp = cur_mount;
5687 if (vfs_busy(mp, LK_NOWAIT) != 0) {
5688 cur_mount = (mount_t)0;
2d21ac55 5689 }
91447636
A
5690 mount_iterdrop(mp);
5691 }
5692out:
0a7de745 5693 return cur_mount;
91447636 5694}
91447636 5695
0a7de745 5696mount_t
2d21ac55 5697mount_list_lookupby_fsid(fsid_t *fsid, int locked, int withref)
91447636
A
5698{
5699 mount_t retmp = (mount_t)0;
5700 mount_t mp;
5701
0a7de745 5702 if (!locked) {
91447636 5703 mount_list_lock();
0a7de745
A
5704 }
5705 TAILQ_FOREACH(mp, &mountlist, mnt_list)
5706 if (mp->mnt_vfsstat.f_fsid.val[0] == fsid->val[0] &&
5707 mp->mnt_vfsstat.f_fsid.val[1] == fsid->val[1]) {
5708 retmp = mp;
5709 if (withref) {
5710 if (mount_iterref(retmp, 1)) {
5711 retmp = (mount_t)0;
91447636 5712 }
91447636 5713 }
0a7de745
A
5714 goto out;
5715 }
91447636 5716out:
0a7de745 5717 if (!locked) {
91447636 5718 mount_list_unlock();
0a7de745
A
5719 }
5720 return retmp;
91447636
A
5721}
5722
5723errno_t
2d21ac55 5724vnode_lookup(const char *path, int flags, vnode_t *vpp, vfs_context_t ctx)
91447636
A
5725{
5726 struct nameidata nd;
5727 int error;
b0d623f7 5728 u_int32_t ndflags = 0;
91447636 5729
fe8ab488
A
5730 if (ctx == NULL) {
5731 return EINVAL;
91447636
A
5732 }
5733
0a7de745 5734 if (flags & VNODE_LOOKUP_NOFOLLOW) {
91447636 5735 ndflags = NOFOLLOW;
0a7de745 5736 } else {
91447636 5737 ndflags = FOLLOW;
0a7de745 5738 }
91447636 5739
0a7de745 5740 if (flags & VNODE_LOOKUP_NOCROSSMOUNT) {
91447636 5741 ndflags |= NOCROSSMOUNT;
0a7de745 5742 }
91447636 5743
0a7de745 5744 if (flags & VNODE_LOOKUP_CROSSMOUNTNOWAIT) {
3e170ce0 5745 ndflags |= CN_NBMOUNTLOOK;
0a7de745 5746 }
3e170ce0 5747
91447636 5748 /* XXX AUDITVNPATH1 needed ? */
6d2010ae 5749 NDINIT(&nd, LOOKUP, OP_LOOKUP, ndflags, UIO_SYSSPACE,
0a7de745 5750 CAST_USER_ADDR_T(path), ctx);
91447636 5751
0a7de745
A
5752 if ((error = namei(&nd))) {
5753 return error;
5754 }
91447636
A
5755 *vpp = nd.ni_vp;
5756 nameidone(&nd);
0a7de745
A
5757
5758 return 0;
91447636
A
5759}
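/*
 * Illustrative sketch (not part of the original file): vnode_lookup()
 * returns *vpp with an iocount that the caller must release with
 * vnode_put(). The helper name is hypothetical.
 */
static int
path_is_dir(const char *path, int *isdirp)
{
	vnode_t vp;
	int error;

	error = vnode_lookup(path, VNODE_LOOKUP_NOFOLLOW, &vp,
	    vfs_context_current());
	if (error) {
		return error;
	}
	*isdirp = vnode_isdir(vp);
	vnode_put(vp);                   /* drop the iocount from the lookup */
	return 0;
}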
5760
5761errno_t
2d21ac55 5762vnode_open(const char *path, int fmode, int cmode, int flags, vnode_t *vpp, vfs_context_t ctx)
91447636
A
5763{
5764 struct nameidata nd;
5765 int error;
b0d623f7 5766 u_int32_t ndflags = 0;
3a60a9f5 5767 int lflags = flags;
91447636 5768
0a7de745 5769 if (ctx == NULL) { /* XXX technically an error */
2d21ac55 5770 ctx = vfs_context_current();
91447636
A
5771 }
5772
0a7de745 5773 if (fmode & O_NOFOLLOW) {
3a60a9f5 5774 lflags |= VNODE_LOOKUP_NOFOLLOW;
0a7de745 5775 }
3a60a9f5 5776
0a7de745 5777 if (lflags & VNODE_LOOKUP_NOFOLLOW) {
91447636 5778 ndflags = NOFOLLOW;
0a7de745 5779 } else {
91447636 5780 ndflags = FOLLOW;
0a7de745 5781 }
91447636 5782
0a7de745 5783 if (lflags & VNODE_LOOKUP_NOCROSSMOUNT) {
91447636 5784 ndflags |= NOCROSSMOUNT;
0a7de745
A
5785 }
5786
5787 if (lflags & VNODE_LOOKUP_CROSSMOUNTNOWAIT) {
3e170ce0 5788 ndflags |= CN_NBMOUNTLOOK;
0a7de745 5789 }
3e170ce0 5790
91447636 5791 /* XXX AUDITVNPATH1 needed ? */
6d2010ae 5792 NDINIT(&nd, LOOKUP, OP_OPEN, ndflags, UIO_SYSSPACE,
0a7de745 5793 CAST_USER_ADDR_T(path), ctx);
91447636 5794
0a7de745 5795 if ((error = vn_open(&nd, fmode, cmode))) {
91447636 5796 *vpp = NULL;
0a7de745 5797 } else {
91447636 5798 *vpp = nd.ni_vp;
0a7de745
A
5799 }
5800
5801 return error;
91447636
A
5802}
5803
5804errno_t
2d21ac55 5805vnode_close(vnode_t vp, int flags, vfs_context_t ctx)
91447636 5806{
91447636
A
5807 int error;
5808
2d21ac55
A
5809 if (ctx == NULL) {
5810 ctx = vfs_context_current();
91447636 5811 }
0a7de745 5812
2d21ac55 5813 error = vn_close(vp, flags, ctx);
91447636 5814 vnode_put(vp);
0a7de745 5815 return error;
91447636
A
5816}
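/*
 * Illustrative sketch (not part of the original file): the typical
 * vnode_open()/vnode_close() pairing. vnode_open() returns an opened vnode
 * with an iocount; vnode_close() both closes it and drops that iocount via
 * vnode_put(). The helper and its arguments are hypothetical.
 */
static int
append_record(const char *path, caddr_t buf, int len, vfs_context_t ctx)
{
	vnode_t vp;
	int error;

	error = vnode_open(path, O_WRONLY | O_CREAT | O_APPEND, 0644, 0,
	    &vp, ctx);
	if (error) {
		return error;
	}
	error = vn_rdwr(UIO_WRITE, vp, buf, len, /* offset */ 0,
	    UIO_SYSSPACE, IO_APPEND | IO_UNIT, vfs_context_ucred(ctx),
	    NULL, vfs_context_proc(ctx));

	(void)vnode_close(vp, FWRITE, ctx);
	return error;
}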
5817
15129b1c
A
5818errno_t
5819vnode_mtime(vnode_t vp, struct timespec *mtime, vfs_context_t ctx)
5820{
0a7de745
A
5821 struct vnode_attr va;
5822 int error;
15129b1c
A
5823
5824 VATTR_INIT(&va);
5825 VATTR_WANTED(&va, va_modify_time);
5826 error = vnode_getattr(vp, &va, ctx);
0a7de745 5827 if (!error) {
15129b1c 5828 *mtime = va.va_modify_time;
0a7de745 5829 }
15129b1c
A
5830 return error;
5831}
5832
fe8ab488
A
5833errno_t
5834vnode_flags(vnode_t vp, uint32_t *flags, vfs_context_t ctx)
5835{
0a7de745
A
5836 struct vnode_attr va;
5837 int error;
fe8ab488
A
5838
5839 VATTR_INIT(&va);
5840 VATTR_WANTED(&va, va_flags);
5841 error = vnode_getattr(vp, &va, ctx);
0a7de745 5842 if (!error) {
fe8ab488 5843 *flags = va.va_flags;
0a7de745 5844 }
fe8ab488
A
5845 return error;
5846}
5847
2d21ac55
A
5848/*
5849 * Returns: 0 Success
5850 * vnode_getattr:???
5851 */
91447636
A
5852errno_t
5853vnode_size(vnode_t vp, off_t *sizep, vfs_context_t ctx)
5854{
0a7de745
A
5855 struct vnode_attr va;
5856 int error;
91447636
A
5857
5858 VATTR_INIT(&va);
5859 VATTR_WANTED(&va, va_data_size);
5860 error = vnode_getattr(vp, &va, ctx);
0a7de745 5861 if (!error) {
91447636 5862 *sizep = va.va_data_size;
0a7de745
A
5863 }
5864 return error;
91447636
A
5865}
5866
5867errno_t
5868vnode_setsize(vnode_t vp, off_t size, int ioflag, vfs_context_t ctx)
5869{
0a7de745 5870 struct vnode_attr va;
91447636
A
5871
5872 VATTR_INIT(&va);
5873 VATTR_SET(&va, va_data_size, size);
5874 va.va_vaflags = ioflag & 0xffff;
0a7de745 5875 return vnode_setattr(vp, &va, ctx);
91447636
A
5876}
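/*
 * Illustrative sketch (not part of the original file): the wrappers above
 * (vnode_mtime, vnode_flags, vnode_size) all follow one pattern; several
 * attributes can also be fetched in a single vnode_getattr() call. The
 * helper name is hypothetical.
 */
static int
get_size_and_mtime(vnode_t vp, off_t *sizep, struct timespec *mtimep,
    vfs_context_t ctx)
{
	struct vnode_attr va;
	int error;

	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_data_size);
	VATTR_WANTED(&va, va_modify_time);
	error = vnode_getattr(vp, &va, ctx);
	if (!error) {
		*sizep = va.va_data_size;
		*mtimep = va.va_modify_time;
	}
	return error;
}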
5877
fe8ab488
A
5878int
5879vnode_setdirty(vnode_t vp)
5880{
5881 vnode_lock_spin(vp);
5882 vp->v_flag |= VISDIRTY;
5883 vnode_unlock(vp);
5884 return 0;
5885}
5886
5887int
5888vnode_cleardirty(vnode_t vp)
5889{
5890 vnode_lock_spin(vp);
5891 vp->v_flag &= ~VISDIRTY;
5892 vnode_unlock(vp);
5893 return 0;
5894}
5895
0a7de745 5896int
fe8ab488
A
5897vnode_isdirty(vnode_t vp)
5898{
5899 int dirty;
5900
5901 vnode_lock_spin(vp);
5902 dirty = (vp->v_flag & VISDIRTY) ? 1 : 0;
5903 vnode_unlock(vp);
5904
5905 return dirty;
5906}
5907
6d2010ae
A
5908static int
5909vn_create_reg(vnode_t dvp, vnode_t *vpp, struct nameidata *ndp, struct vnode_attr *vap, uint32_t flags, int fmode, uint32_t *statusp, vfs_context_t ctx)
5910{
5911 /* Only use compound VNOP for compound operation */
5912 if (vnode_compound_open_available(dvp) && ((flags & VN_CREATE_DOOPEN) != 0)) {
5913 *vpp = NULLVP;
fe8ab488 5914 return VNOP_COMPOUND_OPEN(dvp, vpp, ndp, O_CREAT, fmode, statusp, vap, ctx);
6d2010ae
A
5915 } else {
5916 return VNOP_CREATE(dvp, vpp, &ndp->ni_cnd, vap, ctx);
5917 }
5918}
5919
0c530ab8
A
5920/*
5921 * Create a filesystem object of arbitrary type with arbitrary attributes in
5922 * the specified directory with the specified name.
5923 *
5924 * Parameters: dvp Pointer to the vnode of the directory
5925 * in which to create the object.
5926 * vpp Pointer to the area into which to
5927 * return the vnode of the created object.
5928 * cnp Component name pointer from the namei
5929 * data structure, containing the name to
5930 * use for the create object.
5931 * vap Pointer to the vnode_attr structure
5932 * describing the object to be created,
5933 * including the type of object.
5934 * flags VN_* flags controlling ACL inheritance
5935 * and whether or not authorization is to
5936 * be required for the operation.
0a7de745 5937 *
0c530ab8
A
5938 * Returns: 0 Success
5939 * !0 errno value
5940 *
5941 * Implicit: *vpp Contains the vnode of the object that
5942 * was created, if successful.
5943 * *cnp May be modified by the underlying VFS.
5944 * *vap May be modified by the underlying VFS,
5945 * whether by ACL inheritance or by the
5946 * filesystem itself; in particular, it may
5947 * be modified, even if the operation is
5948 * unsuccessful.
5949 *
5950 *
5951 * Notes: The kauth_filesec_t in 'vap', if any, is in host byte order.
5952 *
5953 * Modification of '*cnp' and '*vap' by the underlying VFS is
5954 * strongly discouraged.
5955 *
5956 * XXX: This function is a 'vn_*' function; it belongs in vfs_vnops.c
5957 *
5958 * XXX: We should enumerate the possible errno values here, and where
5959 * in the code they originated.
5960 */
91447636 5961errno_t
6d2010ae 5962vn_create(vnode_t dvp, vnode_t *vpp, struct nameidata *ndp, struct vnode_attr *vap, uint32_t flags, int fmode, uint32_t *statusp, vfs_context_t ctx)
91447636 5963{
0a7de745 5964 errno_t error, old_error;
91447636 5965 vnode_t vp = (vnode_t)0;
6d2010ae
A
5966 boolean_t batched;
5967 struct componentname *cnp;
5968 uint32_t defaulted;
91447636 5969
6d2010ae 5970 cnp = &ndp->ni_cnd;
91447636 5971 error = 0;
6d2010ae 5972 batched = namei_compound_available(dvp, ndp) ? TRUE : FALSE;
91447636
A
5973
5974 KAUTH_DEBUG("%p CREATE - '%s'", dvp, cnp->cn_nameptr);
5975
0a7de745 5976 if (flags & VN_CREATE_NOINHERIT) {
6d2010ae 5977 vap->va_vaflags |= VA_NOINHERIT;
0a7de745
A
5978 }
5979 if (flags & VN_CREATE_NOAUTH) {
6d2010ae 5980 vap->va_vaflags |= VA_NOAUTH;
0a7de745 5981 }
91447636 5982 /*
6d2010ae 5983 * Handle ACL inheritance, initialize vap.
91447636 5984 */
6d2010ae
A
5985 error = vn_attribute_prepare(dvp, vap, &defaulted, ctx);
5986 if (error) {
5987 return error;
5988 }
91447636 5989
6d2010ae
A
5990 if (vap->va_type != VREG && (fmode != 0 || (flags & VN_CREATE_DOOPEN) || statusp)) {
5991 panic("Open parameters, but not a regular file.");
91447636 5992 }
6d2010ae
A
5993 if ((fmode != 0) && ((flags & VN_CREATE_DOOPEN) == 0)) {
5994 panic("Mode for open, but not trying to open...");
91447636
A
5995 }
5996
fe8ab488 5997
91447636
A
5998 /*
5999 * Create the requested node.
6000 */
0a7de745 6001 switch (vap->va_type) {
91447636 6002 case VREG:
6d2010ae 6003 error = vn_create_reg(dvp, vpp, ndp, vap, flags, fmode, statusp, ctx);
91447636
A
6004 break;
6005 case VDIR:
6d2010ae 6006 error = vn_mkdir(dvp, vpp, ndp, vap, ctx);
91447636
A
6007 break;
6008 case VSOCK:
6009 case VFIFO:
6010 case VBLK:
6011 case VCHR:
6012 error = VNOP_MKNOD(dvp, vpp, cnp, vap, ctx);
6013 break;
6014 default:
6015 panic("vnode_create: unknown vtype %d", vap->va_type);
6016 }
6017 if (error != 0) {
6018 KAUTH_DEBUG("%p CREATE - error %d returned by filesystem", dvp, error);
6019 goto out;
6020 }
6021
6022 vp = *vpp;
6d2010ae
A
6023 old_error = error;
6024
2d21ac55
A
6025#if CONFIG_MACF
6026 if (!(flags & VN_CREATE_NOLABEL)) {
b0d623f7 6027 error = vnode_label(vnode_mount(vp), dvp, vp, cnp, VNODE_LABEL_CREATE, ctx);
0a7de745 6028 if (error) {
2d21ac55 6029 goto error;
0a7de745 6030 }
2d21ac55
A
6031 }
6032#endif
6033
91447636
A
6034 /*
6035 * If some of the requested attributes weren't handled by the VNOP,
6036 * use our fallback code.
6037 */
6038 if (!VATTR_ALL_SUPPORTED(vap) && *vpp) {
6039 KAUTH_DEBUG(" CREATE - doing fallback with ACL %p", vap->va_acl);
6040 error = vnode_setattr_fallback(*vpp, vap, ctx);
6041 }
2d21ac55
A
6042#if CONFIG_MACF
6043error:
6044#endif
6d2010ae 6045 if ((error != 0) && (vp != (vnode_t)0)) {
6d2010ae
A
6046 /* If we've done a compound open, close */
6047 if (batched && (old_error == 0) && (vap->va_type == VREG)) {
6048 VNOP_CLOSE(vp, fmode, ctx);
6049 }
6050
6051 /* Need to provide notifications if a create succeeded */
6052 if (!batched) {
6053 *vpp = (vnode_t) 0;
6054 vnode_put(vp);
d9a64523 6055 vp = NULLVP;
6d2010ae 6056 }
91447636
A
6057 }
6058
d9a64523
A
6059 /*
6060 * For creation VNOPs, this is the equivalent of
6061 * lookup_handle_found_vnode.
6062 */
0a7de745 6063 if (kdebug_enable && *vpp) {
d9a64523 6064 kdebug_lookup(*vpp, cnp);
0a7de745 6065 }
d9a64523 6066
91447636 6067out:
6d2010ae 6068 vn_attribute_cleanup(vap, defaulted);
91447636 6069
0a7de745 6070 return error;
91447636
A
6071}
6072
0a7de745
A
6073static kauth_scope_t vnode_scope;
6074static int vnode_authorize_callback(kauth_cred_t credential, void *idata, kauth_action_t action,
2d21ac55 6075 uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3);
813fb2f6
A
6076static int vnode_authorize_callback_int(kauth_action_t action, vfs_context_t ctx,
6077 vnode_t vp, vnode_t dvp, int *errorp);
91447636
A
6078
6079typedef struct _vnode_authorize_context {
0a7de745 6080 vnode_t vp;
91447636 6081 struct vnode_attr *vap;
0a7de745 6082 vnode_t dvp;
91447636 6083 struct vnode_attr *dvap;
0a7de745
A
6084 vfs_context_t ctx;
6085 int flags;
6086 int flags_valid;
6087#define _VAC_IS_OWNER (1<<0)
6088#define _VAC_IN_GROUP (1<<1)
6089#define _VAC_IS_DIR_OWNER (1<<2)
6090#define _VAC_IN_DIR_GROUP (1<<3)
6091#define _VAC_NO_VNODE_POINTERS (1<<4)
91447636
A
6092} *vauth_ctx;
6093
6094void
6095vnode_authorize_init(void)
6096{
6097 vnode_scope = kauth_register_scope(KAUTH_SCOPE_VNODE, vnode_authorize_callback, NULL);
6098}
6099
0a7de745
A
6100#define VATTR_PREPARE_DEFAULTED_UID 0x1
6101#define VATTR_PREPARE_DEFAULTED_GID 0x2
6102#define VATTR_PREPARE_DEFAULTED_MODE 0x4
6d2010ae
A
6103
6104int
6105vn_attribute_prepare(vnode_t dvp, struct vnode_attr *vap, uint32_t *defaulted_fieldsp, vfs_context_t ctx)
6106{
6107 kauth_acl_t nacl = NULL, oacl = NULL;
6108 int error;
6109
6110 /*
6111 * Handle ACL inheritance.
6112 */
6113 if (!(vap->va_vaflags & VA_NOINHERIT) && vfs_extendedsecurity(dvp->v_mount)) {
6114 /* save the original filesec */
6115 if (VATTR_IS_ACTIVE(vap, va_acl)) {
6116 oacl = vap->va_acl;
6117 }
6118
6119 vap->va_acl = NULL;
6120 if ((error = kauth_acl_inherit(dvp,
0a7de745
A
6121 oacl,
6122 &nacl,
6123 vap->va_type == VDIR,
6124 ctx)) != 0) {
6d2010ae 6125 KAUTH_DEBUG("%p CREATE - error %d processing inheritance", dvp, error);
0a7de745 6126 return error;
6d2010ae
A
6127 }
6128
6129 /*
6130 * If the generated ACL is NULL, then we can save ourselves some effort
6131 * by clearing the active bit.
6132 */
6133 if (nacl == NULL) {
6134 VATTR_CLEAR_ACTIVE(vap, va_acl);
6135 } else {
6136 vap->va_base_acl = oacl;
6137 VATTR_SET(vap, va_acl, nacl);
6138 }
6139 }
0a7de745 6140
6d2010ae
A
6141 error = vnode_authattr_new_internal(dvp, vap, (vap->va_vaflags & VA_NOAUTH), defaulted_fieldsp, ctx);
6142 if (error) {
6143 vn_attribute_cleanup(vap, *defaulted_fieldsp);
0a7de745 6144 }
6d2010ae
A
6145
6146 return error;
6147}
6148
6149void
6150vn_attribute_cleanup(struct vnode_attr *vap, uint32_t defaulted_fields)
6151{
6152 /*
6153 * If the caller supplied a filesec in vap, it has been replaced
6154 * now by the post-inheritance copy. We need to put the original back
6155 * and free the inherited product.
6156 */
6157 kauth_acl_t nacl, oacl;
6158
6159 if (VATTR_IS_ACTIVE(vap, va_acl)) {
6160 nacl = vap->va_acl;
6161 oacl = vap->va_base_acl;
6162
0a7de745 6163 if (oacl) {
6d2010ae
A
6164 VATTR_SET(vap, va_acl, oacl);
6165 vap->va_base_acl = NULL;
6166 } else {
6167 VATTR_CLEAR_ACTIVE(vap, va_acl);
6168 }
6169
6170 if (nacl != NULL) {
6171 kauth_acl_free(nacl);
6172 }
6173 }
6174
6175 if ((defaulted_fields & VATTR_PREPARE_DEFAULTED_MODE) != 0) {
6176 VATTR_CLEAR_ACTIVE(vap, va_mode);
6177 }
6178 if ((defaulted_fields & VATTR_PREPARE_DEFAULTED_GID) != 0) {
6179 VATTR_CLEAR_ACTIVE(vap, va_gid);
6180 }
6181 if ((defaulted_fields & VATTR_PREPARE_DEFAULTED_UID) != 0) {
6182 VATTR_CLEAR_ACTIVE(vap, va_uid);
6183 }
6184
6185 return;
6186}
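/*
 * Illustrative sketch (not part of the original file): vn_attribute_prepare()
 * and vn_attribute_cleanup() must bracket the creation VNOP, as vn_create()
 * does above. The wrapper is hypothetical.
 */
static int
create_with_inheritance(vnode_t dvp, vnode_t *vpp, struct nameidata *ndp,
    struct vnode_attr *vap, vfs_context_t ctx)
{
	uint32_t defaulted;
	int error;

	/* run ACL inheritance and fill in defaulted uid/gid/mode */
	error = vn_attribute_prepare(dvp, vap, &defaulted, ctx);
	if (error) {
		return error;
	}
	error = VNOP_CREATE(dvp, vpp, &ndp->ni_cnd, vap, ctx);

	/* restore the caller's ACL and clear the defaulted attributes */
	vn_attribute_cleanup(vap, defaulted);
	return error;
}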
6187
6188int
6189vn_authorize_unlink(vnode_t dvp, vnode_t vp, struct componentname *cnp, vfs_context_t ctx, __unused void *reserved)
6190{
39236c6e
A
6191#if !CONFIG_MACF
6192#pragma unused(cnp)
6193#endif
6d2010ae
A
6194 int error = 0;
6195
6196 /*
0a7de745 6197 * Normally, unlinking of directories is not supported.
6d2010ae
A
6198 * However, some file systems may have limited support.
6199 */
6200 if ((vp->v_type == VDIR) &&
0a7de745
A
6201 !(vp->v_mount->mnt_kern_flag & MNTK_DIR_HARDLINKS)) {
6202 return EPERM; /* POSIX */
6d2010ae
A
6203 }
6204
6205 /* authorize the delete operation */
6206#if CONFIG_MACF
0a7de745 6207 if (!error) {
6d2010ae 6208 error = mac_vnode_check_unlink(ctx, dvp, vp, cnp);
0a7de745 6209 }
6d2010ae 6210#endif /* MAC */
0a7de745 6211 if (!error) {
6d2010ae 6212 error = vnode_authorize(vp, dvp, KAUTH_VNODE_DELETE, ctx);
0a7de745 6213 }
6d2010ae
A
6214
6215 return error;
6216}
6217
6218int
6219vn_authorize_open_existing(vnode_t vp, struct componentname *cnp, int fmode, vfs_context_t ctx, void *reserved)
6220{
6221 /* Open of existing case */
6222 kauth_action_t action;
6223 int error = 0;
6d2010ae
A
6224 if (cnp->cn_ndp == NULL) {
6225 panic("NULL ndp");
6226 }
6227 if (reserved != NULL) {
6228 panic("reserved not NULL.");
6229 }
6230
6231#if CONFIG_MACF
6232 /* XXX may do duplicate work here, but ignore that for now (idempotent) */
6233 if (vfs_flags(vnode_mount(vp)) & MNT_MULTILABEL) {
6234 error = vnode_label(vnode_mount(vp), NULL, vp, NULL, 0, ctx);
0a7de745
A
6235 if (error) {
6236 return error;
6237 }
6d2010ae
A
6238 }
6239#endif
6240
0a7de745
A
6241 if ((fmode & O_DIRECTORY) && vp->v_type != VDIR) {
6242 return ENOTDIR;
6d2010ae
A
6243 }
6244
6245 if (vp->v_type == VSOCK && vp->v_tag != VT_FDESC) {
0a7de745 6246 return EOPNOTSUPP; /* Operation not supported on socket */
6d2010ae
A
6247 }
6248
6249 if (vp->v_type == VLNK && (fmode & O_NOFOLLOW) != 0) {
0a7de745 6250 return ELOOP; /* O_NOFOLLOW was specified and the target is a symbolic link */
6d2010ae
A
6251 }
6252
6253 /* disallow write operations on directories */
6254 if (vnode_isdir(vp) && (fmode & (FWRITE | O_TRUNC))) {
0a7de745 6255 return EISDIR;
6d2010ae
A
6256 }
6257
6258 if ((cnp->cn_ndp->ni_flag & NAMEI_TRAILINGSLASH)) {
6259 if (vp->v_type != VDIR) {
0a7de745 6260 return ENOTDIR;
6d2010ae
A
6261 }
6262 }
6263
6264#if CONFIG_MACF
0a7de745
A
6265 /* If a file being opened is a shadow file containing
6266 * namedstream data, ignore the macf checks because it
6267 * is a kernel internal file and access should always
6d2010ae
A
6268 * be allowed.
6269 */
6270 if (!(vnode_isshadow(vp) && vnode_isnamedstream(vp))) {
6271 error = mac_vnode_check_open(ctx, vp, fmode);
6272 if (error) {
0a7de745 6273 return error;
6d2010ae
A
6274 }
6275 }
6276#endif
6277
6278 /* compute action to be authorized */
6279 action = 0;
6280 if (fmode & FREAD) {
6281 action |= KAUTH_VNODE_READ_DATA;
6282 }
6283 if (fmode & (FWRITE | O_TRUNC)) {
6284 /*
6285 * If we are writing, appending, and not truncating,
6286 * indicate that we are appending so that if the
6287 * UF_APPEND or SF_APPEND bits are set, we do not deny
6288 * the open.
6289 */
6290 if ((fmode & O_APPEND) && !(fmode & O_TRUNC)) {
6291 action |= KAUTH_VNODE_APPEND_DATA;
6292 } else {
6293 action |= KAUTH_VNODE_WRITE_DATA;
6294 }
6295 }
4b17d6b6 6296 error = vnode_authorize(vp, NULL, action, ctx);
4b17d6b6
A
6297#if NAMEDSTREAMS
6298 if (error == EACCES) {
6299 /*
6300 * Shadow files may exist on-disk with a different UID/GID
6301 * than that of the current context. Verify that this file
6302 * is really a shadow file. If it was created successfully
6303 * then it should be authorized.
6304 */
0a7de745 6305 if (vnode_isshadow(vp) && vnode_isnamedstream(vp)) {
39236c6e 6306 error = vnode_verifynamedstream(vp);
4b17d6b6
A
6307 }
6308 }
6309#endif
39236c6e 6310
4b17d6b6 6311 return error;
6d2010ae
A
6312}
6313
6314int
6315vn_authorize_create(vnode_t dvp, struct componentname *cnp, struct vnode_attr *vap, vfs_context_t ctx, void *reserved)
6316{
39236c6e
A
6317#if !CONFIG_MACF
6318#pragma unused(vap)
6319#endif
6d2010ae
A
6320 /* Creation case */
6321 int error;
6322
6323 if (cnp->cn_ndp == NULL) {
6324 panic("NULL cn_ndp");
6325 }
6326 if (reserved != NULL) {
6327 panic("reserved not NULL.");
6328 }
6329
6330 /* Only validate path for creation if we didn't do a complete lookup */
6331 if (cnp->cn_ndp->ni_flag & NAMEI_UNFINISHED) {
6332 error = lookup_validate_creation_path(cnp->cn_ndp);
0a7de745
A
6333 if (error) {
6334 return error;
6335 }
6d2010ae
A
6336 }
6337
6338#if CONFIG_MACF
6339 error = mac_vnode_check_create(ctx, dvp, cnp, vap);
0a7de745
A
6340 if (error) {
6341 return error;
6342 }
6d2010ae
A
6343#endif /* CONFIG_MACF */
6344
0a7de745 6345 return vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx);
6d2010ae
A
6346}
6347
39037602 6348int
0a7de745
A
6349vn_authorize_rename(struct vnode *fdvp, struct vnode *fvp, struct componentname *fcnp,
6350 struct vnode *tdvp, struct vnode *tvp, struct componentname *tcnp,
6351 vfs_context_t ctx, void *reserved)
39037602
A
6352{
6353 return vn_authorize_renamex(fdvp, fvp, fcnp, tdvp, tvp, tcnp, ctx, 0, reserved);
6354}
6355
6356int
0a7de745
A
6357vn_authorize_renamex(struct vnode *fdvp, struct vnode *fvp, struct componentname *fcnp,
6358 struct vnode *tdvp, struct vnode *tvp, struct componentname *tcnp,
6359 vfs_context_t ctx, vfs_rename_flags_t flags, void *reserved)
6d2010ae 6360{
d9a64523
A
6361 return vn_authorize_renamex_with_paths(fdvp, fvp, fcnp, NULL, tdvp, tvp, tcnp, NULL, ctx, flags, reserved);
6362}
6363
6364int
0a7de745
A
6365vn_authorize_renamex_with_paths(struct vnode *fdvp, struct vnode *fvp, struct componentname *fcnp, const char *from_path,
6366 struct vnode *tdvp, struct vnode *tvp, struct componentname *tcnp, const char *to_path,
6367 vfs_context_t ctx, vfs_rename_flags_t flags, void *reserved)
d9a64523 6368{
6d2010ae
A
6369 int error = 0;
6370 int moving = 0;
39037602 6371 bool swap = flags & VFS_RENAME_SWAP;
6d2010ae
A
6372
6373 if (reserved != NULL) {
6374 panic("Passed something other than NULL as reserved field!");
6375 }
6376
6377 /*
6378 * Avoid renaming "." and "..".
6379 *
6380 * XXX No need to check for this in the FS. We should always have the leaves
6381 * in VFS in this case.
6382 */
6383 if (fvp->v_type == VDIR &&
6384 ((fdvp == fvp) ||
0a7de745
A
6385 (fcnp->cn_namelen == 1 && fcnp->cn_nameptr[0] == '.') ||
6386 ((fcnp->cn_flags | tcnp->cn_flags) & ISDOTDOT))) {
6d2010ae
A
6387 error = EINVAL;
6388 goto out;
6389 }
6390
6391 if (tvp == NULLVP && vnode_compound_rename_available(tdvp)) {
6392 error = lookup_validate_creation_path(tcnp->cn_ndp);
0a7de745 6393 if (error) {
6d2010ae 6394 goto out;
0a7de745 6395 }
6d2010ae
A
6396 }
6397
6398 /***** <MACF> *****/
6399#if CONFIG_MACF
fe8ab488 6400 error = mac_vnode_check_rename(ctx, fdvp, fvp, fcnp, tdvp, tvp, tcnp);
0a7de745 6401 if (error) {
6d2010ae 6402 goto out;
0a7de745 6403 }
39037602
A
6404 if (swap) {
6405 error = mac_vnode_check_rename(ctx, tdvp, tvp, tcnp, fdvp, fvp, fcnp);
0a7de745 6406 if (error) {
39037602 6407 goto out;
0a7de745 6408 }
39037602 6409 }
6d2010ae
A
6410#endif
6411 /***** </MACF> *****/
6412
6413 /***** <MiscChecks> *****/
6414 if (tvp != NULL) {
39037602
A
6415 if (!swap) {
6416 if (fvp->v_type == VDIR && tvp->v_type != VDIR) {
6417 error = ENOTDIR;
6418 goto out;
6419 } else if (fvp->v_type != VDIR && tvp->v_type == VDIR) {
6420 error = EISDIR;
6421 goto out;
6422 }
6d2010ae 6423 }
39037602
A
6424 } else if (swap) {
6425 /*
6426 * Caller should have already checked this and returned
6427 * ENOENT. If we send back ENOENT here, caller will retry
6428 * which isn't what we want so we send back EINVAL here
6429 * instead.
6430 */
6431 error = EINVAL;
6432 goto out;
6d2010ae
A
6433 }
6434
6435 if (fvp == tdvp) {
6436 error = EINVAL;
6437 goto out;
6438 }
6439
6440 /*
6441 * The following edge case is caught here:
6442 * (to cannot be a descendent of from)
6443 *
6444 * o fdvp
6445 * /
6446 * /
6447 * o fvp
6448 * \
6449 * \
6450 * o tdvp
6451 * /
6452 * /
6453 * o tvp
6454 */
6455 if (tdvp->v_parent == fvp) {
6456 error = EINVAL;
6457 goto out;
6458 }
39037602
A
6459
6460 if (swap && fdvp->v_parent == tvp) {
6461 error = EINVAL;
6462 goto out;
6463 }
6d2010ae
A
6464 /***** </MiscChecks> *****/
6465
6466 /***** <Kauth> *****/
6467
d9a64523
A
6468 /*
6469 * As part of the Kauth step, we call out to allow 3rd-party
6470 * fileop notification of "about to rename". This is needed
6471 * in the event that 3rd-parties need to know that the DELETE
6472 * authorization is actually part of a rename. It's important
6473 * that we guarantee that the DELETE call-out will always be
6474 * made if the WILL_RENAME call-out is made. Another fileop
6475 * call-out will be performed once the operation is completed.
6476 * We can ignore the result of kauth_authorize_fileop().
6477 *
6478 * N.B. We are passing the vnode and *both* paths to each
6479 * call; kauth_authorize_fileop() extracts the "from" path
6480 * when posting a KAUTH_FILEOP_WILL_RENAME notification.
6481 * As such, we only post these notifications if all of the
6482 * information we need is provided.
6483 */
6484
39037602
A
6485 if (swap) {
6486 kauth_action_t f = 0, t = 0;
6d2010ae 6487
39037602
A
6488 /*
6489 * Directories changing parents need ...ADD_SUBDIR... to
6490 * permit changing ".."
6491 */
6492 if (fdvp != tdvp) {
0a7de745 6493 if (vnode_isdir(fvp)) {
39037602 6494 f = KAUTH_VNODE_ADD_SUBDIRECTORY;
0a7de745
A
6495 }
6496 if (vnode_isdir(tvp)) {
39037602 6497 t = KAUTH_VNODE_ADD_SUBDIRECTORY;
0a7de745 6498 }
39037602 6499 }
0a7de745 6500 if (to_path != NULL) {
d9a64523 6501 kauth_authorize_fileop(vfs_context_ucred(ctx),
0a7de745
A
6502 KAUTH_FILEOP_WILL_RENAME,
6503 (uintptr_t)fvp,
6504 (uintptr_t)to_path);
6505 }
39037602 6506 error = vnode_authorize(fvp, fdvp, KAUTH_VNODE_DELETE | f, ctx);
0a7de745 6507 if (error) {
6d2010ae 6508 goto out;
0a7de745
A
6509 }
6510 if (from_path != NULL) {
d9a64523 6511 kauth_authorize_fileop(vfs_context_ucred(ctx),
0a7de745
A
6512 KAUTH_FILEOP_WILL_RENAME,
6513 (uintptr_t)tvp,
6514 (uintptr_t)from_path);
6515 }
39037602 6516 error = vnode_authorize(tvp, tdvp, KAUTH_VNODE_DELETE | t, ctx);
0a7de745 6517 if (error) {
6d2010ae 6518 goto out;
0a7de745 6519 }
39037602
A
6520 f = vnode_isdir(fvp) ? KAUTH_VNODE_ADD_SUBDIRECTORY : KAUTH_VNODE_ADD_FILE;
6521 t = vnode_isdir(tvp) ? KAUTH_VNODE_ADD_SUBDIRECTORY : KAUTH_VNODE_ADD_FILE;
0a7de745 6522 if (fdvp == tdvp) {
39037602 6523 error = vnode_authorize(fdvp, NULL, f | t, ctx);
0a7de745 6524 } else {
39037602 6525 error = vnode_authorize(fdvp, NULL, t, ctx);
0a7de745 6526 if (error) {
39037602 6527 goto out;
0a7de745 6528 }
39037602 6529 error = vnode_authorize(tdvp, NULL, f, ctx);
6d2010ae 6530 }
0a7de745 6531 if (error) {
39037602 6532 goto out;
0a7de745 6533 }
6d2010ae 6534 } else {
39037602
A
6535 error = 0;
6536 if ((tvp != NULL) && vnode_isdir(tvp)) {
0a7de745 6537 if (tvp != fdvp) {
39037602 6538 moving = 1;
0a7de745 6539 }
39037602
A
6540 } else if (tdvp != fdvp) {
6541 moving = 1;
6542 }
6543
6544 /*
6545 * must have delete rights to remove the old name even in
6546 * the simple case of fdvp == tdvp.
6547 *
6548 * If fvp is a directory, and we are changing its parent,
6549 * then we also need rights to rewrite its ".." entry as well.
6550 */
0a7de745 6551 if (to_path != NULL) {
d9a64523 6552 kauth_authorize_fileop(vfs_context_ucred(ctx),
0a7de745
A
6553 KAUTH_FILEOP_WILL_RENAME,
6554 (uintptr_t)fvp,
6555 (uintptr_t)to_path);
6556 }
39037602 6557 if (vnode_isdir(fvp)) {
0a7de745 6558 if ((error = vnode_authorize(fvp, fdvp, KAUTH_VNODE_DELETE | KAUTH_VNODE_ADD_SUBDIRECTORY, ctx)) != 0) {
39037602 6559 goto out;
0a7de745 6560 }
39037602 6561 } else {
0a7de745 6562 if ((error = vnode_authorize(fvp, fdvp, KAUTH_VNODE_DELETE, ctx)) != 0) {
39037602 6563 goto out;
0a7de745 6564 }
39037602
A
6565 }
6566 if (moving) {
6567 /* moving into tdvp or tvp, must have rights to add */
6568 if ((error = vnode_authorize(((tvp != NULL) && vnode_isdir(tvp)) ? tvp : tdvp,
0a7de745
A
6569 NULL,
6570 vnode_isdir(fvp) ? KAUTH_VNODE_ADD_SUBDIRECTORY : KAUTH_VNODE_ADD_FILE,
6571 ctx)) != 0) {
39037602
A
6572 goto out;
6573 }
6574 } else {
6575 /* node staying in same directory, must be allowed to add new name */
6576 if ((error = vnode_authorize(fdvp, NULL,
0a7de745 6577 vnode_isdir(fvp) ? KAUTH_VNODE_ADD_SUBDIRECTORY : KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
39037602 6578 goto out;
0a7de745 6579 }
39037602
A
6580 }
6581 /* overwriting tvp */
6582 if ((tvp != NULL) && !vnode_isdir(tvp) &&
0a7de745 6583 ((error = vnode_authorize(tvp, tdvp, KAUTH_VNODE_DELETE, ctx)) != 0)) {
6d2010ae 6584 goto out;
39037602 6585 }
6d2010ae
A
6586 }
6587
6588 /***** </Kauth> *****/
6589
6590 /* XXX more checks? */
6591out:
6592 return error;
6593}
6594
6595int
6596vn_authorize_mkdir(vnode_t dvp, struct componentname *cnp, struct vnode_attr *vap, vfs_context_t ctx, void *reserved)
6597{
39236c6e
A
6598#if !CONFIG_MACF
6599#pragma unused(vap)
6600#endif
6d2010ae
A
6601 int error;
6602
6603 if (reserved != NULL) {
0a7de745 6604 panic("reserved not NULL in vn_authorize_mkdir()");
6d2010ae
A
6605 }
6606
6607 /* XXX A hack for now, to make shadow files work */
6608 if (cnp->cn_ndp == NULL) {
6609 return 0;
6610 }
6611
6612 if (vnode_compound_mkdir_available(dvp)) {
6613 error = lookup_validate_creation_path(cnp->cn_ndp);
0a7de745 6614 if (error) {
6d2010ae 6615 goto out;
0a7de745 6616 }
6d2010ae
A
6617 }
6618
6619#if CONFIG_MACF
6620 error = mac_vnode_check_create(ctx,
6621 dvp, cnp, vap);
0a7de745 6622 if (error) {
6d2010ae 6623 goto out;
0a7de745 6624 }
6d2010ae
A
6625#endif
6626
0a7de745
A
6627 /* authorize addition of a directory to the parent */
6628 if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_SUBDIRECTORY, ctx)) != 0) {
6629 goto out;
6630 }
6631
6d2010ae
A
6632out:
6633 return error;
6634}
6635
6636int
6637vn_authorize_rmdir(vnode_t dvp, vnode_t vp, struct componentname *cnp, vfs_context_t ctx, void *reserved)
6638{
39236c6e 6639#if CONFIG_MACF
6d2010ae 6640 int error;
39236c6e
A
6641#else
6642#pragma unused(cnp)
6643#endif
6d2010ae
A
6644 if (reserved != NULL) {
6645 panic("Non-NULL reserved argument to vn_authorize_rmdir()");
6646 }
6647
6648 if (vp->v_type != VDIR) {
6649 /*
6650 * rmdir only deals with directories
6651 */
6652 return ENOTDIR;
0a7de745
A
6653 }
6654
6d2010ae
A
6655 if (dvp == vp) {
6656 /*
6657 * No rmdir "." please.
6658 */
6659 return EINVAL;
0a7de745
A
6660 }
6661
6d2010ae
A
6662#if CONFIG_MACF
6663 error = mac_vnode_check_unlink(ctx, dvp,
0a7de745
A
6664 vp, cnp);
6665 if (error) {
6d2010ae 6666 return error;
0a7de745 6667 }
6d2010ae
A
6668#endif
6669
6670 return vnode_authorize(vp, dvp, KAUTH_VNODE_DELETE, ctx);
6671}
6672
813fb2f6
A
6673/*
6674 * Authorizer for directory cloning. This does not use vnodes but instead
6675 * uses prefilled vnode attributes from the filesystem.
6676 *
6677 * The same function is called to set up the attributes required, perform the
6678 * authorization and cleanup (if required)
6679 */
6680int
6681vnode_attr_authorize_dir_clone(struct vnode_attr *vap, kauth_action_t action,
6682 struct vnode_attr *dvap, __unused vnode_t sdvp, mount_t mp,
5ba3f43e 6683 dir_clone_authorizer_op_t vattr_op, uint32_t flags, vfs_context_t ctx,
813fb2f6
A
6684 __unused void *reserved)
6685{
6686 int error;
6687 int is_suser = vfs_context_issuser(ctx);
6688
6689 if (vattr_op == OP_VATTR_SETUP) {
6690 VATTR_INIT(vap);
6691
6692 /*
6693 * When ACL inheritance is implemented, both vap->va_acl and
6694 * dvap->va_acl will be required (even as superuser).
6695 */
6696 VATTR_WANTED(vap, va_type);
6697 VATTR_WANTED(vap, va_mode);
6698 VATTR_WANTED(vap, va_flags);
6699 VATTR_WANTED(vap, va_uid);
6700 VATTR_WANTED(vap, va_gid);
6701 if (dvap) {
6702 VATTR_INIT(dvap);
6703 VATTR_WANTED(dvap, va_flags);
6704 }
6705
6706 if (!is_suser) {
6707 /*
6708 * If not superuser, we have to evaluate ACLs and
6709 * need the target directory gid to set the initial
6710 * gid of the new object.
6711 */
6712 VATTR_WANTED(vap, va_acl);
0a7de745 6713 if (dvap) {
813fb2f6 6714 VATTR_WANTED(dvap, va_gid);
0a7de745 6715 }
5ba3f43e
A
6716 } else if (dvap && (flags & VNODE_CLONEFILE_NOOWNERCOPY)) {
6717 VATTR_WANTED(dvap, va_gid);
813fb2f6 6718 }
0a7de745 6719 return 0;
813fb2f6 6720 } else if (vattr_op == OP_VATTR_CLEANUP) {
0a7de745 6721 return 0; /* Nothing to do for now */
813fb2f6
A
6722 }
6723
6724 /* dvap isn't used for authorization */
6725 error = vnode_attr_authorize(vap, NULL, mp, action, ctx);
6726
0a7de745
A
6727 if (error) {
6728 return error;
6729 }
813fb2f6
A
6730
6731 /*
6732 * vn_attribute_prepare should be able to accept attributes as well as
6733 * vnodes but for now we do this inline.
6734 */
5ba3f43e 6735 if (!is_suser || (flags & VNODE_CLONEFILE_NOOWNERCOPY)) {
813fb2f6
A
6736 /*
6737 * If the filesystem is mounted IGNORE_OWNERSHIP and an explicit
6738 * owner is set, that owner takes ownership of all new files.
6739 */
6740 if ((mp->mnt_flag & MNT_IGNORE_OWNERSHIP) &&
6741 (mp->mnt_fsowner != KAUTH_UID_NONE)) {
6742 VATTR_SET(vap, va_uid, mp->mnt_fsowner);
6743 } else {
6744 /* default owner is current user */
6745 VATTR_SET(vap, va_uid,
6746 kauth_cred_getuid(vfs_context_ucred(ctx)));
6747 }
6748
6749 if ((mp->mnt_flag & MNT_IGNORE_OWNERSHIP) &&
6750 (mp->mnt_fsgroup != KAUTH_GID_NONE)) {
6751 VATTR_SET(vap, va_gid, mp->mnt_fsgroup);
6752 } else {
6753 /*
6754 * default group comes from parent object,
6755 * fallback to current user
6756 */
6757 if (VATTR_IS_SUPPORTED(dvap, va_gid)) {
6758 VATTR_SET(vap, va_gid, dvap->va_gid);
6759 } else {
6760 VATTR_SET(vap, va_gid,
6761 kauth_cred_getgid(vfs_context_ucred(ctx)));
6762 }
6763 }
6764 }
6765
6766 /* Inherit SF_RESTRICTED bit from destination directory only */
6767 if (VATTR_IS_ACTIVE(vap, va_flags)) {
6768 VATTR_SET(vap, va_flags,
5ba3f43e 6769 ((vap->va_flags & ~(UF_DATAVAULT | SF_RESTRICTED)))); /* Turn off from source */
0a7de745 6770 if (VATTR_IS_ACTIVE(dvap, va_flags)) {
813fb2f6 6771 VATTR_SET(vap, va_flags,
5ba3f43e 6772 vap->va_flags | (dvap->va_flags & (UF_DATAVAULT | SF_RESTRICTED)));
0a7de745 6773 }
813fb2f6 6774 } else if (VATTR_IS_ACTIVE(dvap, va_flags)) {
5ba3f43e 6775 VATTR_SET(vap, va_flags, (dvap->va_flags & (UF_DATAVAULT | SF_RESTRICTED)));
813fb2f6
A
6776 }
6777
0a7de745 6778 return 0;
813fb2f6
A
6779}
6780
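/*
 * Illustrative sketch (not part of the original file): the dir-clone
 * authorizer above is driven in three phases. The caller first asks it to
 * mark the attributes it needs, then supplies prefilled attributes for the
 * authorization proper, then asks for cleanup. OP_AUTHORIZE is assumed here
 * to be the enum value for the authorization phase; "myfs_fill_attrs" is a
 * hypothetical stand-in for however the caller prefills vap/dvap.
 */
static int
authorize_dir_clone_sketch(struct vnode_attr *vap, struct vnode_attr *dvap,
    mount_t mp, vfs_context_t ctx)
{
	int error;

	/* phase 1: mark which attributes must be prefilled */
	error = vnode_attr_authorize_dir_clone(vap, KAUTH_VNODE_ADD_FILE,
	    dvap, NULLVP, mp, OP_VATTR_SETUP, 0, ctx, NULL);
	if (error) {
		return error;
	}

	/* ... myfs_fill_attrs(vap, dvap) would run here (hypothetical) ... */

	/* phase 2: authorize using the prefilled attributes */
	error = vnode_attr_authorize_dir_clone(vap, KAUTH_VNODE_ADD_FILE,
	    dvap, NULLVP, mp, OP_AUTHORIZE, 0, ctx, NULL);

	/* phase 3: cleanup (currently a no-op, but part of the contract) */
	(void)vnode_attr_authorize_dir_clone(vap, KAUTH_VNODE_ADD_FILE,
	    dvap, NULLVP, mp, OP_VATTR_CLEANUP, 0, ctx, NULL);
	return error;
}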
6781
91447636
A
6782/*
6783 * Authorize an operation on a vnode.
6784 *
6785 * This is KPI, but here because it needs vnode_scope.
2d21ac55
A
6786 *
6787 * Returns: 0 Success
6788 * kauth_authorize_action:EPERM ...
6789 * xlate => EACCES Permission denied
6790 * kauth_authorize_action:0 Success
6791 * kauth_authorize_action: Depends on callback return; this is
6792 * usually only vnode_authorize_callback(),
6793 * but may include other listeners, if any
6794 * exist.
6795 * EROFS
6796 * EACCES
6797 * EPERM
6798 * ???
91447636
A
6799 */
6800int
2d21ac55 6801vnode_authorize(vnode_t vp, vnode_t dvp, kauth_action_t action, vfs_context_t ctx)
91447636 6802{
0a7de745 6803 int error, result;
91447636
A
6804
6805 /*
6806 * We can't authorize against a dead vnode; allow all operations through so that
6807 * the correct error can be returned.
6808 */
0a7de745
A
6809 if (vp->v_type == VBAD) {
6810 return 0;
6811 }
6812
91447636 6813 error = 0;
2d21ac55 6814 result = kauth_authorize_action(vnode_scope, vfs_context_ucred(ctx), action,
0a7de745
A
6815 (uintptr_t)ctx, (uintptr_t)vp, (uintptr_t)dvp, (uintptr_t)&error);
6816 if (result == EPERM) { /* traditional behaviour */
91447636 6817 result = EACCES;
0a7de745 6818 }
91447636 6819 /* did the lower layers give a better error return? */
0a7de745
A
6820 if ((result != 0) && (error != 0)) {
6821 return error;
6822 }
6823 return result;
91447636
A
6824}
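/*
 * Illustrative sketch (not part of the original file): a typical
 * vnode_authorize() call, as made by the vn_authorize_* helpers above.
 * The action is a mask of KAUTH_VNODE_* rights; dvp is only needed for
 * directory-relative rights such as KAUTH_VNODE_DELETE. The helper name
 * is hypothetical.
 */
static int
can_read_file(vnode_t vp, vfs_context_t ctx)
{
	/* returns 0 on success, EACCES (or a better FS error) on denial */
	return vnode_authorize(vp, NULLVP, KAUTH_VNODE_READ_DATA, ctx);
}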
6825
6826/*
6827 * Test for vnode immutability.
6828 *
6829 * The 'append' flag is set when the authorization request is constrained
6830 * to operations which only request the right to append to a file.
6831 *
6832 * The 'ignore' flag is set when an operation modifying the immutability flags
6833 * is being authorized. We check the system securelevel to determine which
6834 * immutability flags we can ignore.
6835 */
6836static int
6837vnode_immutable(struct vnode_attr *vap, int append, int ignore)
6838{
0a7de745 6839 int mask;
91447636
A
6840
6841 /* start with all bits precluding the operation */
6842 mask = IMMUTABLE | APPEND;
6843
6844 /* if appending only, remove the append-only bits */
0a7de745 6845 if (append) {
91447636 6846 mask &= ~APPEND;
0a7de745 6847 }
91447636
A
6848
6849 /* ignore only set when authorizing flags changes */
6850 if (ignore) {
6851 if (securelevel <= 0) {
6852 /* in insecure state, flags do not inhibit changes */
6853 mask = 0;
6854 } else {
6855 /* in secure state, user flags don't inhibit */
6856 mask &= ~(UF_IMMUTABLE | UF_APPEND);
6857 }
6858 }
6859 KAUTH_DEBUG("IMMUTABLE - file flags 0x%x mask 0x%x append = %d ignore = %d", vap->va_flags, mask, append, ignore);
0a7de745
A
6860 if ((vap->va_flags & mask) != 0) {
6861 return EPERM;
6862 }
6863 return 0;
91447636
A
6864}
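/*
 * Worked example for the securelevel interaction above (illustrative only):
 * with 'ignore' set and securelevel <= 0, mask drops to 0 and any flag
 * combination passes. With 'ignore' set and securelevel > 0, only the user
 * bits are removed, so a node flagged SF_IMMUTABLE still fails with EPERM
 * while one flagged only UF_IMMUTABLE does not.
 */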
6865
6866static int
6867vauth_node_owner(struct vnode_attr *vap, kauth_cred_t cred)
6868{
6869 int result;
6870
6871 /* default assumption is not-owner */
6872 result = 0;
6873
6874 /*
6875 * If the filesystem has given us a UID, we treat this as authoritative.
6876 */
6877 if (vap && VATTR_IS_SUPPORTED(vap, va_uid)) {
6878 result = (vap->va_uid == kauth_cred_getuid(cred)) ? 1 : 0;
6879 }
6880 /* we could test the owner UUID here if we had a policy for it */
0a7de745
A
6881
6882 return result;
91447636
A
6883}
6884
0b4c1975
A
6885/*
6886 * vauth_node_group
6887 *
6888 * Description: Ask if a cred is a member of the group owning the vnode object
6889 *
6890 * Parameters: vap vnode attribute
6891 * vap->va_gid group owner of vnode object
6892 * cred credential to check
6893 * ismember pointer to where to put the answer
6894 * idontknow Return this if we can't get an answer
6895 *
6896 * Returns: 0 Success
6897 * idontknow Can't get information
6898 * kauth_cred_ismember_gid:? Error from kauth subsystem
6899 * kauth_cred_ismember_gid:? Error from kauth subsystem
6900 */
91447636 6901static int
0b4c1975 6902vauth_node_group(struct vnode_attr *vap, kauth_cred_t cred, int *ismember, int idontknow)
91447636 6903{
0a7de745
A
6904 int error;
6905 int result;
91447636
A
6906
6907 error = 0;
6908 result = 0;
6909
0b4c1975
A
6910 /*
6911 * The caller is expected to have asked the filesystem for a group
6912 * at some point prior to calling this function. The answer may
6913 * have been that there is no group ownership supported for the
6914 * vnode object, in which case we return success and report the credential as not a member.
6915 */
91447636
A
6916 if (vap && VATTR_IS_SUPPORTED(vap, va_gid)) {
6917 error = kauth_cred_ismember_gid(cred, vap->va_gid, &result);
0b4c1975
A
6918 /*
6919 * Credentials which are opted into external group membership
6920 * resolution which are not known to the external resolver
6921 * will result in an ENOENT error. We translate this into
6922 * the appropriate 'idontknow' response for our caller.
6923 *
6924 * XXX We do not make a distinction here between an ENOENT
6925 * XXX arising from a response from the external resolver,
6926 * XXX and an ENOENT which is internally generated. This is
6927 * XXX a deficiency of the published kauth_cred_ismember_gid()
6928 * XXX KPI which can not be overcome without new KPI. For
6929 * XXX all currently known cases, however, this will result
6930 * XXX in correct behaviour.
6931 */
0a7de745 6932 if (error == ENOENT) {
0b4c1975 6933 error = idontknow;
0a7de745 6934 }
91447636 6935 }
0b4c1975
A
6936 /*
6937 * XXX We could test the group UUID here if we had a policy for it,
6938 * XXX but this is problematic from the perspective of synchronizing
6939 * XXX group UUID and POSIX GID ownership of a file and keeping the
6940 * XXX values coherent over time. The problem is that the local
6941 * XXX system will vend transient group UUIDs for unknown POSIX GID
6942 * XXX values, and these are not persistent, whereas storage of values
6943 * XXX is persistent. One potential solution to this is a local
6944 * XXX (persistent) replica of remote directory entries and vended
6945 * XXX local ids in a local directory server (think in terms of a
6946 * XXX caching DNS server).
6947 */
91447636 6948
0a7de745 6949 if (!error) {
91447636 6950 *ismember = result;
0a7de745
A
6951 }
6952 return error;
91447636
A
6953}
6954
6955static int
6956vauth_file_owner(vauth_ctx vcp)
6957{
6958 int result;
6959
6960 if (vcp->flags_valid & _VAC_IS_OWNER) {
6961 result = (vcp->flags & _VAC_IS_OWNER) ? 1 : 0;
6962 } else {
6963 result = vauth_node_owner(vcp->vap, vcp->ctx->vc_ucred);
6964
6965 /* cache our result */
6966 vcp->flags_valid |= _VAC_IS_OWNER;
6967 if (result) {
6968 vcp->flags |= _VAC_IS_OWNER;
6969 } else {
6970 vcp->flags &= ~_VAC_IS_OWNER;
6971 }
6972 }
0a7de745 6973 return result;
91447636
A
6974}
6975
0b4c1975
A
6976
6977/*
6978 * vauth_file_ingroup
6979 *
6980 * Description: Ask if a user is a member of the group owning the file
6981 *
6982 * Parameters: vcp The vnode authorization context that
6983 * contains the user and file info
6984 * vcp->flags_valid Valid flags
6985 * vcp->flags Flags values
6986 * vcp->vap File vnode attributes
6987 * vcp->ctx VFS Context (for user)
6988 * ismember pointer to where to put the answer
6989 * idontknow Return this if we can't get an answer
6990 *
6991 * Returns: 0 Success
6992 * vauth_node_group:? Error from vauth_node_group()
6993 *
6994 * Implicit returns: *ismember 0 The user is not a group member
6995 * 1 The user is a group member
6996 */
91447636 6997static int
0b4c1975 6998vauth_file_ingroup(vauth_ctx vcp, int *ismember, int idontknow)
91447636 6999{
0a7de745 7000 int error;
91447636 7001
0b4c1975 7002 /* Check for a cached answer first, to avoid the check if possible */
91447636
A
7003 if (vcp->flags_valid & _VAC_IN_GROUP) {
7004 *ismember = (vcp->flags & _VAC_IN_GROUP) ? 1 : 0;
7005 error = 0;
7006 } else {
0b4c1975
A
7007 /* Otherwise, go look for it */
7008 error = vauth_node_group(vcp->vap, vcp->ctx->vc_ucred, ismember, idontknow);
91447636
A
7009
7010 if (!error) {
7011 /* cache our result */
7012 vcp->flags_valid |= _VAC_IN_GROUP;
7013 if (*ismember) {
7014 vcp->flags |= _VAC_IN_GROUP;
7015 } else {
7016 vcp->flags &= ~_VAC_IN_GROUP;
7017 }
7018 }
91447636 7019 }
0a7de745 7020 return error;
91447636
A
7021}
7022
7023static int
7024vauth_dir_owner(vauth_ctx vcp)
7025{
7026 int result;
7027
7028 if (vcp->flags_valid & _VAC_IS_DIR_OWNER) {
7029 result = (vcp->flags & _VAC_IS_DIR_OWNER) ? 1 : 0;
7030 } else {
7031 result = vauth_node_owner(vcp->dvap, vcp->ctx->vc_ucred);
7032
7033 /* cache our result */
7034 vcp->flags_valid |= _VAC_IS_DIR_OWNER;
7035 if (result) {
7036 vcp->flags |= _VAC_IS_DIR_OWNER;
7037 } else {
7038 vcp->flags &= ~_VAC_IS_DIR_OWNER;
7039 }
7040 }
0a7de745 7041 return result;
91447636
A
7042}
7043
0b4c1975
A
7044/*
7045 * vauth_dir_ingroup
7046 *
7047 * Description: Ask if a user is a member of the group owning the directory
7048 *
7049 * Parameters: vcp The vnode authorization context that
7050 * contains the user and directory info
7051 * vcp->flags_valid Valid flags
7052 * vcp->flags Flags values
7053 * vcp->dvap Dir vnode attributes
7054 * vcp->ctx VFS Context (for user)
7055 * ismember pointer to where to put the answer
7056 * idontknow Return this if we can't get an answer
7057 *
7058 * Returns: 0 Success
7059 * vauth_node_group:? Error from vauth_node_group()
7060 *
7061 * Implicit returns: *ismember 0 The user is not a group member
7062 * 1 The user is a group member
7063 */
91447636 7064static int
0b4c1975 7065vauth_dir_ingroup(vauth_ctx vcp, int *ismember, int idontknow)
91447636 7066{
0a7de745 7067 int error;
91447636 7068
0b4c1975 7069 /* Check for a cached answer first, to avoid the check if possible */
91447636
A
7070 if (vcp->flags_valid & _VAC_IN_DIR_GROUP) {
7071 *ismember = (vcp->flags & _VAC_IN_DIR_GROUP) ? 1 : 0;
7072 error = 0;
7073 } else {
0b4c1975
A
7074 /* Otherwise, go look for it */
7075 error = vauth_node_group(vcp->dvap, vcp->ctx->vc_ucred, ismember, idontknow);
91447636
A
7076
7077 if (!error) {
7078 /* cache our result */
7079 vcp->flags_valid |= _VAC_IN_DIR_GROUP;
7080 if (*ismember) {
7081 vcp->flags |= _VAC_IN_DIR_GROUP;
7082 } else {
7083 vcp->flags &= ~_VAC_IN_DIR_GROUP;
7084 }
7085 }
7086 }
0a7de745 7087 return error;
91447636
A
7088}

/*
 * Test the posix permissions in (vap) to determine whether (credential)
 * may perform (action)
 */
static int
vnode_authorize_posix(vauth_ctx vcp, int action, int on_dir)
{
	struct vnode_attr *vap;
	int needed, error, owner_ok, group_ok, world_ok, ismember;
#ifdef KAUTH_DEBUG_ENABLE
	const char *where = "uninitialized";
# define _SETWHERE(c)   where = c;
#else
# define _SETWHERE(c)
#endif

	/* checking file or directory? */
	if (on_dir) {
		vap = vcp->dvap;
	} else {
		vap = vcp->vap;
	}

	error = 0;

	/*
	 * We want to do as little work here as possible. So first we check
	 * which sets of permissions grant us the access we need, and avoid checking
	 * whether specific permissions grant access when more generic ones would.
	 */

	/* owner permissions */
	needed = 0;
	if (action & VREAD) {
		needed |= S_IRUSR;
	}
	if (action & VWRITE) {
		needed |= S_IWUSR;
	}
	if (action & VEXEC) {
		needed |= S_IXUSR;
	}
	owner_ok = (needed & vap->va_mode) == needed;

	/* group permissions */
	needed = 0;
	if (action & VREAD) {
		needed |= S_IRGRP;
	}
	if (action & VWRITE) {
		needed |= S_IWGRP;
	}
	if (action & VEXEC) {
		needed |= S_IXGRP;
	}
	group_ok = (needed & vap->va_mode) == needed;

	/* world permissions */
	needed = 0;
	if (action & VREAD) {
		needed |= S_IROTH;
	}
	if (action & VWRITE) {
		needed |= S_IWOTH;
	}
	if (action & VEXEC) {
		needed |= S_IXOTH;
	}
	world_ok = (needed & vap->va_mode) == needed;

	/* If granted/denied by all three, we're done */
	if (owner_ok && group_ok && world_ok) {
		_SETWHERE("all");
		goto out;
	}
	if (!owner_ok && !group_ok && !world_ok) {
		_SETWHERE("all");
		error = EACCES;
		goto out;
	}

	/* Check ownership (relatively cheap) */
	if ((on_dir && vauth_dir_owner(vcp)) ||
	    (!on_dir && vauth_file_owner(vcp))) {
		_SETWHERE("user");
		if (!owner_ok) {
			error = EACCES;
		}
		goto out;
	}

	/* Not owner; if group and world both grant it we're done */
	if (group_ok && world_ok) {
		_SETWHERE("group/world");
		goto out;
	}
	if (!group_ok && !world_ok) {
		_SETWHERE("group/world");
		error = EACCES;
		goto out;
	}

	/* Check group membership (most expensive) */
	ismember = 0;   /* Default to allow, if the target has no group owner */

	/*
	 * In the case we can't get an answer about the user from the call to
	 * vauth_dir_ingroup() or vauth_file_ingroup(), we want to fail on
	 * the side of caution, rather than simply granting access, or we will
	 * fail to correctly implement exclusion groups, so we set the third
	 * parameter on the basis of the state of 'group_ok'.
	 */
	if (on_dir) {
		error = vauth_dir_ingroup(vcp, &ismember, (!group_ok ? EACCES : 0));
	} else {
		error = vauth_file_ingroup(vcp, &ismember, (!group_ok ? EACCES : 0));
	}
	if (error) {
		if (!group_ok) {
			ismember = 1;
		}
		error = 0;
	}
	if (ismember) {
		_SETWHERE("group");
		if (!group_ok) {
			error = EACCES;
		}
		goto out;
	}

	/* Not owner, not in group, use world result */
	_SETWHERE("world");
	if (!world_ok) {
		error = EACCES;
	}

	/* FALLTHROUGH */

out:
	KAUTH_DEBUG("%p    %s - posix %s permissions : need %s%s%s %x have %s%s%s%s%s%s%s%s%s UID = %d file = %d,%d",
	    vcp->vp, (error == 0) ? "ALLOWED" : "DENIED", where,
	    (action & VREAD) ? "r" : "-",
	    (action & VWRITE) ? "w" : "-",
	    (action & VEXEC) ? "x" : "-",
	    needed,
	    (vap->va_mode & S_IRUSR) ? "r" : "-",
	    (vap->va_mode & S_IWUSR) ? "w" : "-",
	    (vap->va_mode & S_IXUSR) ? "x" : "-",
	    (vap->va_mode & S_IRGRP) ? "r" : "-",
	    (vap->va_mode & S_IWGRP) ? "w" : "-",
	    (vap->va_mode & S_IXGRP) ? "x" : "-",
	    (vap->va_mode & S_IROTH) ? "r" : "-",
	    (vap->va_mode & S_IWOTH) ? "w" : "-",
	    (vap->va_mode & S_IXOTH) ? "x" : "-",
	    kauth_cred_getuid(vcp->ctx->vc_ucred),
	    on_dir ? vcp->dvap->va_uid : vcp->vap->va_uid,
	    on_dir ? vcp->dvap->va_gid : vcp->vap->va_gid);
	return error;
}
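
/*
 * Worked example for the checks above (illustrative only, not part of the
 * original source): suppose action = VREAD | VWRITE and va_mode = 0644
 * (rw-r--r--). For the owner class, needed = S_IRUSR | S_IWUSR = 0600 and
 * (needed & va_mode) == needed, so owner_ok = 1. For the group and world
 * classes, needed = 0060 and 0006 respectively, but va_mode supplies only
 * the read bits, so group_ok = world_ok = 0. Since the three classes
 * disagree, the function falls through to the ownership test: the owner is
 * granted access, and everyone else is denied with EACCES.
 */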

/*
 * Authorize the deletion of the node vp from the directory dvp.
 *
 * We assume that:
 * - Neither the node nor the directory are immutable.
 * - The user is not the superuser.
 *
 * The precedence of factors for authorizing or denying delete for a
 * credential is:
 *
 * 1) Explicit ACE on the node. (allow or deny DELETE)
 * 2) Explicit ACE on the directory (allow or deny DELETE_CHILD).
 *
 *    If there are conflicting ACEs on the node and the directory, the node
 *    ACE wins.
 *
 * 3) Sticky bit on the directory.
 *    Deletion is not permitted if the directory is sticky and the caller is
 *    not owner of the node or directory. The sticky bit rules are like a deny
 *    delete ACE except lower in priority than ACLs either allowing or denying
 *    delete.
 *
 * 4) POSIX permissions on the directory.
 *
 * As an optimization, we cache whether or not delete child is permitted
 * on directories. This enables us to skip directory ACL and POSIX checks
 * as we already have the result from those checks. However, we always check the
 * node ACL and, if the directory has the sticky bit set, we always check its
 * ACL (even for a directory with an authorized delete child). Furthermore,
 * caching the delete child authorization is independent of the sticky bit
 * being set as it is only applicable in determining whether the node can be
 * deleted or not.
 */
static int
vnode_authorize_delete(vauth_ctx vcp, boolean_t cached_delete_child)
{
	struct vnode_attr *vap = vcp->vap;
	struct vnode_attr *dvap = vcp->dvap;
	kauth_cred_t cred = vcp->ctx->vc_ucred;
	struct kauth_acl_eval eval;
	int error, ismember;

	/* Check the ACL on the node first */
	if (VATTR_IS_NOT(vap, va_acl, NULL)) {
		eval.ae_requested = KAUTH_VNODE_DELETE;
		eval.ae_acl = &vap->va_acl->acl_ace[0];
		eval.ae_count = vap->va_acl->acl_entrycount;
		eval.ae_options = 0;
		if (vauth_file_owner(vcp)) {
			eval.ae_options |= KAUTH_AEVAL_IS_OWNER;
		}
		/*
		 * We use ENOENT as a marker to indicate we could not get
		 * information in order to delay evaluation until after we
		 * have the ACL evaluation answer. Previously, we would
		 * always deny the operation at this point.
		 */
		if ((error = vauth_file_ingroup(vcp, &ismember, ENOENT)) != 0 && error != ENOENT) {
			return error;
		}
		if (error == ENOENT) {
			eval.ae_options |= KAUTH_AEVAL_IN_GROUP_UNKNOWN;
		} else if (ismember) {
			eval.ae_options |= KAUTH_AEVAL_IN_GROUP;
		}
		eval.ae_exp_gall = KAUTH_VNODE_GENERIC_ALL_BITS;
		eval.ae_exp_gread = KAUTH_VNODE_GENERIC_READ_BITS;
		eval.ae_exp_gwrite = KAUTH_VNODE_GENERIC_WRITE_BITS;
		eval.ae_exp_gexec = KAUTH_VNODE_GENERIC_EXECUTE_BITS;

		if ((error = kauth_acl_evaluate(cred, &eval)) != 0) {
			KAUTH_DEBUG("%p    ERROR during ACL processing - %d", vcp->vp, error);
			return error;
		}

		switch (eval.ae_result) {
		case KAUTH_RESULT_DENY:
			KAUTH_DEBUG("%p    DENIED - denied by ACL", vcp->vp);
			return EACCES;
		case KAUTH_RESULT_ALLOW:
			KAUTH_DEBUG("%p    ALLOWED - granted by ACL", vcp->vp);
			return 0;
		case KAUTH_RESULT_DEFER:
		default:
			/* Defer to directory */
			KAUTH_DEBUG("%p    DEFERRED - by file ACL", vcp->vp);
			break;
		}
	}

	/*
	 * Without a sticky bit, a previously authorized delete child is
	 * sufficient to authorize this delete.
	 *
	 * If the sticky bit is set, a directory ACL which allows delete child
	 * overrides a (potential) sticky bit deny. The authorized delete child
	 * cannot tell us if it was authorized because of an explicit delete
	 * child allow ACE or because of POSIX permissions so we have to check
	 * the directory ACL every time if the directory has a sticky bit.
	 */
	if (!(dvap->va_mode & S_ISTXT) && cached_delete_child) {
		KAUTH_DEBUG("%p    ALLOWED - granted by directory ACL or POSIX permissions and no sticky bit on directory", vcp->vp);
		return 0;
	}

	/* check the ACL on the directory */
	if (VATTR_IS_NOT(dvap, va_acl, NULL)) {
		eval.ae_requested = KAUTH_VNODE_DELETE_CHILD;
		eval.ae_acl = &dvap->va_acl->acl_ace[0];
		eval.ae_count = dvap->va_acl->acl_entrycount;
		eval.ae_options = 0;
		if (vauth_dir_owner(vcp)) {
			eval.ae_options |= KAUTH_AEVAL_IS_OWNER;
		}
		/*
		 * We use ENOENT as a marker to indicate we could not get
		 * information in order to delay evaluation until after we
		 * have the ACL evaluation answer. Previously, we would
		 * always deny the operation at this point.
		 */
		if ((error = vauth_dir_ingroup(vcp, &ismember, ENOENT)) != 0 && error != ENOENT) {
			return error;
		}
		if (error == ENOENT) {
			eval.ae_options |= KAUTH_AEVAL_IN_GROUP_UNKNOWN;
		} else if (ismember) {
			eval.ae_options |= KAUTH_AEVAL_IN_GROUP;
		}
		eval.ae_exp_gall = KAUTH_VNODE_GENERIC_ALL_BITS;
		eval.ae_exp_gread = KAUTH_VNODE_GENERIC_READ_BITS;
		eval.ae_exp_gwrite = KAUTH_VNODE_GENERIC_WRITE_BITS;
		eval.ae_exp_gexec = KAUTH_VNODE_GENERIC_EXECUTE_BITS;

		/*
		 * If there is no entry, we are going to defer to other
		 * authorization mechanisms.
		 */
		error = kauth_acl_evaluate(cred, &eval);

		if (error != 0) {
			KAUTH_DEBUG("%p    ERROR during ACL processing - %d", vcp->vp, error);
			return error;
		}
		switch (eval.ae_result) {
		case KAUTH_RESULT_DENY:
			KAUTH_DEBUG("%p    DENIED - denied by directory ACL", vcp->vp);
			return EACCES;
		case KAUTH_RESULT_ALLOW:
			KAUTH_DEBUG("%p    ALLOWED - granted by directory ACL", vcp->vp);
			if (!cached_delete_child && vcp->dvp) {
				vnode_cache_authorized_action(vcp->dvp,
				    vcp->ctx, KAUTH_VNODE_DELETE_CHILD);
			}
			return 0;
		case KAUTH_RESULT_DEFER:
		default:
			/* Deferred by directory ACL */
			KAUTH_DEBUG("%p    DEFERRED - directory ACL", vcp->vp);
			break;
		}
	}

	/*
	 * From this point, we can't explicitly allow and if we reach the end
	 * of the function without a denial, then the delete is authorized.
	 */
	if (!cached_delete_child) {
		if (vnode_authorize_posix(vcp, VWRITE, 1 /* on_dir */) != 0) {
			KAUTH_DEBUG("%p    DENIED - denied by posix permissions", vcp->vp);
			return EACCES;
		}
		/*
		 * Cache the authorized action on the vnode if allowed by the
		 * directory ACL or POSIX permissions. It is correct to cache
		 * this action even if sticky bit would deny deleting the node.
		 */
		if (vcp->dvp) {
			vnode_cache_authorized_action(vcp->dvp, vcp->ctx,
			    KAUTH_VNODE_DELETE_CHILD);
		}
	}

	/* enforce sticky bit behaviour */
	if ((dvap->va_mode & S_ISTXT) && !vauth_file_owner(vcp) && !vauth_dir_owner(vcp)) {
		KAUTH_DEBUG("%p    DENIED - sticky bit rules (user %d  file %d  dir %d)",
		    vcp->vp, cred->cr_posix.cr_uid, vap->va_uid, dvap->va_uid);
		return EACCES;
	}

	/* not denied, must be OK */
	return 0;
}
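
/*
 * Illustrative example of the sticky bit rule enforced above (not part of
 * the original source): in a mode 01777 directory such as /tmp, any user
 * passes the directory's POSIX write check, so KAUTH_VNODE_DELETE_CHILD can
 * be cached; but the final S_ISTXT test still returns EACCES unless the
 * caller owns either the node being deleted or the directory itself (or an
 * ACL explicitly allowed the delete earlier in the function).
 */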


/*
 * Authorize an operation based on the node's attributes.
 */
static int
vnode_authorize_simple(vauth_ctx vcp, kauth_ace_rights_t acl_rights, kauth_ace_rights_t preauth_rights, boolean_t *found_deny)
{
	struct vnode_attr *vap = vcp->vap;
	kauth_cred_t cred = vcp->ctx->vc_ucred;
	struct kauth_acl_eval eval;
	int error, ismember;
	mode_t posix_action;

	/*
	 * If we are the file owner, we automatically have some rights.
	 *
	 * Do we need to expand this to support group ownership?
	 */
	if (vauth_file_owner(vcp)) {
		acl_rights &= ~(KAUTH_VNODE_WRITE_SECURITY);
	}

	/*
	 * If we are checking both TAKE_OWNERSHIP and WRITE_SECURITY, we can
	 * mask the latter. If TAKE_OWNERSHIP is requested the caller is about to
	 * change ownership to themselves, and WRITE_SECURITY is implicitly
	 * granted to the owner. We need to do this because at this point
	 * WRITE_SECURITY may not be granted as the caller is not currently
	 * the owner.
	 */
	if ((acl_rights & KAUTH_VNODE_TAKE_OWNERSHIP) &&
	    (acl_rights & KAUTH_VNODE_WRITE_SECURITY)) {
		acl_rights &= ~KAUTH_VNODE_WRITE_SECURITY;
	}

	if (acl_rights == 0) {
		KAUTH_DEBUG("%p    ALLOWED - implicit or no rights required", vcp->vp);
		return 0;
	}

	/* if we have an ACL, evaluate it */
	if (VATTR_IS_NOT(vap, va_acl, NULL)) {
		eval.ae_requested = acl_rights;
		eval.ae_acl = &vap->va_acl->acl_ace[0];
		eval.ae_count = vap->va_acl->acl_entrycount;
		eval.ae_options = 0;
		if (vauth_file_owner(vcp)) {
			eval.ae_options |= KAUTH_AEVAL_IS_OWNER;
		}
		/*
		 * We use ENOENT as a marker to indicate we could not get
		 * information in order to delay evaluation until after we
		 * have the ACL evaluation answer. Previously, we would
		 * always deny the operation at this point.
		 */
		if ((error = vauth_file_ingroup(vcp, &ismember, ENOENT)) != 0 && error != ENOENT) {
			return error;
		}
		if (error == ENOENT) {
			eval.ae_options |= KAUTH_AEVAL_IN_GROUP_UNKNOWN;
		} else if (ismember) {
			eval.ae_options |= KAUTH_AEVAL_IN_GROUP;
		}
		eval.ae_exp_gall = KAUTH_VNODE_GENERIC_ALL_BITS;
		eval.ae_exp_gread = KAUTH_VNODE_GENERIC_READ_BITS;
		eval.ae_exp_gwrite = KAUTH_VNODE_GENERIC_WRITE_BITS;
		eval.ae_exp_gexec = KAUTH_VNODE_GENERIC_EXECUTE_BITS;

		if ((error = kauth_acl_evaluate(cred, &eval)) != 0) {
			KAUTH_DEBUG("%p    ERROR during ACL processing - %d", vcp->vp, error);
			return error;
		}

		switch (eval.ae_result) {
		case KAUTH_RESULT_DENY:
			KAUTH_DEBUG("%p    DENIED - by ACL", vcp->vp);
			return EACCES;          /* deny, deny, counter-allege */
		case KAUTH_RESULT_ALLOW:
			KAUTH_DEBUG("%p    ALLOWED - all rights granted by ACL", vcp->vp);
			return 0;
		case KAUTH_RESULT_DEFER:
		default:
			/* Effectively the same as !delete_child_denied */
			KAUTH_DEBUG("%p    DEFERRED - directory ACL", vcp->vp);
			break;
		}

		*found_deny = eval.ae_found_deny;

		/* fall through and evaluate residual rights */
	} else {
		/* no ACL, everything is residual */
		eval.ae_residual = acl_rights;
	}

	/*
	 * Grant residual rights that have been pre-authorized.
	 */
	eval.ae_residual &= ~preauth_rights;

	/*
	 * We grant WRITE_ATTRIBUTES to the owner if it hasn't been denied.
	 */
	if (vauth_file_owner(vcp)) {
		eval.ae_residual &= ~KAUTH_VNODE_WRITE_ATTRIBUTES;
	}

	if (eval.ae_residual == 0) {
		KAUTH_DEBUG("%p    ALLOWED - rights already authorized", vcp->vp);
		return 0;
	}

	/*
	 * Bail if we have residual rights that can't be granted by posix permissions,
	 * or aren't presumed granted at this point.
	 *
	 * XXX these can be collapsed for performance
	 */
	if (eval.ae_residual & KAUTH_VNODE_CHANGE_OWNER) {
		KAUTH_DEBUG("%p    DENIED - CHANGE_OWNER not permitted", vcp->vp);
		return EACCES;
	}
	if (eval.ae_residual & KAUTH_VNODE_WRITE_SECURITY) {
		KAUTH_DEBUG("%p    DENIED - WRITE_SECURITY not permitted", vcp->vp);
		return EACCES;
	}

#if DIAGNOSTIC
	if (eval.ae_residual & KAUTH_VNODE_DELETE) {
		panic("vnode_authorize: can't be checking delete permission here");
	}
#endif

	/*
	 * Compute the fallback posix permissions that will satisfy the remaining
	 * rights.
	 */
	posix_action = 0;
	if (eval.ae_residual & (KAUTH_VNODE_READ_DATA |
	    KAUTH_VNODE_LIST_DIRECTORY |
	    KAUTH_VNODE_READ_EXTATTRIBUTES)) {
		posix_action |= VREAD;
	}
	if (eval.ae_residual & (KAUTH_VNODE_WRITE_DATA |
	    KAUTH_VNODE_ADD_FILE |
	    KAUTH_VNODE_ADD_SUBDIRECTORY |
	    KAUTH_VNODE_DELETE_CHILD |
	    KAUTH_VNODE_WRITE_ATTRIBUTES |
	    KAUTH_VNODE_WRITE_EXTATTRIBUTES)) {
		posix_action |= VWRITE;
	}
	if (eval.ae_residual & (KAUTH_VNODE_EXECUTE |
	    KAUTH_VNODE_SEARCH)) {
		posix_action |= VEXEC;
	}

	if (posix_action != 0) {
		return vnode_authorize_posix(vcp, posix_action, 0 /* !on_dir */);
	} else {
		KAUTH_DEBUG("%p    ALLOWED - residual rights %s%s%s%s%s%s%s%s%s%s%s%s%s%s granted due to no posix mapping",
		    vcp->vp,
		    (eval.ae_residual & KAUTH_VNODE_READ_DATA)
		    ? vnode_isdir(vcp->vp) ? " LIST_DIRECTORY" : " READ_DATA" : "",
		    (eval.ae_residual & KAUTH_VNODE_WRITE_DATA)
		    ? vnode_isdir(vcp->vp) ? " ADD_FILE" : " WRITE_DATA" : "",
		    (eval.ae_residual & KAUTH_VNODE_EXECUTE)
		    ? vnode_isdir(vcp->vp) ? " SEARCH" : " EXECUTE" : "",
		    (eval.ae_residual & KAUTH_VNODE_DELETE)
		    ? " DELETE" : "",
		    (eval.ae_residual & KAUTH_VNODE_APPEND_DATA)
		    ? vnode_isdir(vcp->vp) ? " ADD_SUBDIRECTORY" : " APPEND_DATA" : "",
		    (eval.ae_residual & KAUTH_VNODE_DELETE_CHILD)
		    ? " DELETE_CHILD" : "",
		    (eval.ae_residual & KAUTH_VNODE_READ_ATTRIBUTES)
		    ? " READ_ATTRIBUTES" : "",
		    (eval.ae_residual & KAUTH_VNODE_WRITE_ATTRIBUTES)
		    ? " WRITE_ATTRIBUTES" : "",
		    (eval.ae_residual & KAUTH_VNODE_READ_EXTATTRIBUTES)
		    ? " READ_EXTATTRIBUTES" : "",
		    (eval.ae_residual & KAUTH_VNODE_WRITE_EXTATTRIBUTES)
		    ? " WRITE_EXTATTRIBUTES" : "",
		    (eval.ae_residual & KAUTH_VNODE_READ_SECURITY)
		    ? " READ_SECURITY" : "",
		    (eval.ae_residual & KAUTH_VNODE_WRITE_SECURITY)
		    ? " WRITE_SECURITY" : "",
		    (eval.ae_residual & KAUTH_VNODE_CHECKIMMUTABLE)
		    ? " CHECKIMMUTABLE" : "",
		    (eval.ae_residual & KAUTH_VNODE_CHANGE_OWNER)
		    ? " CHANGE_OWNER" : "");
	}

	/*
	 * Lack of required Posix permissions implies no reason to deny access.
	 */
	return 0;
}
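
/*
 * Example of the residual-rights mapping performed above (illustrative
 * only, not part of the original source): a request for
 * KAUTH_VNODE_WRITE_DATA | KAUTH_VNODE_READ_ATTRIBUTES against a file with
 * no ACL leaves both bits residual. WRITE_DATA maps to VWRITE, while
 * READ_ATTRIBUTES has no posix equivalent and is simply never added to
 * posix_action. The delegated vnode_authorize_posix() call then decides on
 * VWRITE alone, and READ_ATTRIBUTES rides along under the "no reason to
 * deny" rule at the end of the function.
 */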

/*
 * Check for file immutability.
 */
static int
vnode_authorize_checkimmutable(mount_t mp, struct vnode_attr *vap, int rights, int ignore)
{
	int error;
	int append;

	/*
	 * Perform immutability checks for operations that change data.
	 *
	 * Sockets, fifos and devices require special handling.
	 */
	switch (vap->va_type) {
	case VSOCK:
	case VFIFO:
	case VBLK:
	case VCHR:
		/*
		 * Writing to these nodes does not change the filesystem data,
		 * so forget that it's being tried.
		 */
		rights &= ~KAUTH_VNODE_WRITE_DATA;
		break;
	default:
		break;
	}

	error = 0;
	if (rights & KAUTH_VNODE_WRITE_RIGHTS) {
		/* check per-filesystem options if possible */
		if (mp != NULL) {
			/* check for no-EA filesystems */
			if ((rights & KAUTH_VNODE_WRITE_EXTATTRIBUTES) &&
			    (vfs_flags(mp) & MNT_NOUSERXATTR)) {
				/* note: this function has no vnode in scope, so log the attrs */
				KAUTH_DEBUG("%p    DENIED - filesystem disallowed extended attributes", vap);
				error = EACCES;  /* User attributes disabled */
				goto out;
			}
		}

		/*
		 * check for file immutability. first, check if the requested rights are
		 * allowable for a UF_APPEND file.
		 */
		append = 0;
		if (vap->va_type == VDIR) {
			if ((rights & (KAUTH_VNODE_ADD_FILE | KAUTH_VNODE_ADD_SUBDIRECTORY | KAUTH_VNODE_WRITE_EXTATTRIBUTES)) == rights) {
				append = 1;
			}
		} else {
			if ((rights & (KAUTH_VNODE_APPEND_DATA | KAUTH_VNODE_WRITE_EXTATTRIBUTES)) == rights) {
				append = 1;
			}
		}
		if ((error = vnode_immutable(vap, append, ignore)) != 0) {
			KAUTH_DEBUG("%p    DENIED - file is immutable", vap);
			goto out;
		}
	}
out:
	return error;
}
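
/*
 * Illustrative note on the append classification above (not part of the
 * original source): for a regular file with UF_APPEND set, a request for
 * KAUTH_VNODE_APPEND_DATA alone satisfies
 * (rights & (KAUTH_VNODE_APPEND_DATA | KAUTH_VNODE_WRITE_EXTATTRIBUTES)) == rights,
 * so append = 1 and vnode_immutable() can permit the operation; adding
 * KAUTH_VNODE_WRITE_DATA to the request breaks that equality, append stays
 * 0, and the same call reports the file as immutable.
 */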

/*
 * Handle authorization actions for filesystems that advertise that the
 * server will be enforcing.
 *
 * Returns:	0	Authorization should be handled locally
 *		1	Authorization was handled by the FS
 *
 * Note:	Imputed returns will only occur if the authorization request
 *		was handled by the FS.
 *
 * Imputed:	*resultp, modified	Return code from FS when the request is
 *					handled by the FS.
 *		VNOP_ACCESS:???
 *		VNOP_OPEN:???
 */
static int
vnode_authorize_opaque(vnode_t vp, int *resultp, kauth_action_t action, vfs_context_t ctx)
{
	int error;

	/*
	 * If the vp is a device node, socket or FIFO it actually represents a local
	 * endpoint, so we need to handle it locally.
	 */
	switch (vp->v_type) {
	case VBLK:
	case VCHR:
	case VSOCK:
	case VFIFO:
		return 0;
	default:
		break;
	}

	/*
	 * In the advisory request case, if the filesystem doesn't think it's reliable
	 * we will attempt to formulate a result ourselves based on VNOP_GETATTR data.
	 */
	if ((action & KAUTH_VNODE_ACCESS) && !vfs_authopaqueaccess(vp->v_mount)) {
		return 0;
	}

	/*
	 * Let the filesystem have a say in the matter. It's OK for it to not implement
	 * VNOP_ACCESS, as most will authorise inline with the actual request.
	 */
	if ((error = VNOP_ACCESS(vp, action, ctx)) != ENOTSUP) {
		*resultp = error;
		KAUTH_DEBUG("%p    DENIED - opaque filesystem VNOP_ACCESS denied access", vp);
		return 1;
	}

	/*
	 * Typically opaque filesystems do authorisation in-line, but exec is a special case. In
	 * order to be reasonably sure that exec will be permitted, we try a bit harder here.
	 */
	if ((action & KAUTH_VNODE_EXECUTE) && (vp->v_type == VREG)) {
		/* try a VNOP_OPEN for readonly access */
		if ((error = VNOP_OPEN(vp, FREAD, ctx)) != 0) {
			*resultp = error;
			KAUTH_DEBUG("%p    DENIED - EXECUTE denied because file could not be opened readonly", vp);
			return 1;
		}
		VNOP_CLOSE(vp, FREAD, ctx);
	}

	/*
	 * We don't have any reason to believe that the request has to be denied at this point,
	 * so go ahead and allow it.
	 */
	*resultp = 0;
	KAUTH_DEBUG("%p    ALLOWED - bypassing access check for non-local filesystem", vp);
	return 1;
}



/*
 * Returns:	KAUTH_RESULT_ALLOW
 *		KAUTH_RESULT_DENY
 *
 * Imputed:	*arg3, modified		Error code in the deny case
 *		EROFS			Read-only file system
 *		EACCES			Permission denied
 *		EPERM			Operation not permitted [no execute]
 *		vnode_getattr:ENOMEM	Not enough space [only if has filesec]
 *		vnode_getattr:???
 *		vnode_authorize_opaque:*arg2	???
 *		vnode_authorize_checkimmutable:???
 *		vnode_authorize_delete:???
 *		vnode_authorize_simple:???
 */
static int
vnode_authorize_callback(__unused kauth_cred_t cred, __unused void *idata,
    kauth_action_t action, uintptr_t arg0, uintptr_t arg1, uintptr_t arg2,
    uintptr_t arg3)
{
	vfs_context_t ctx;
	vnode_t cvp = NULLVP;
	vnode_t vp, dvp;
	int result = KAUTH_RESULT_DENY;
	int parent_iocount = 0;
	int parent_action; /* In case we need to use namedstream's data fork for cached rights */

	ctx = (vfs_context_t)arg0;
	vp = (vnode_t)arg1;
	dvp = (vnode_t)arg2;

	/*
	 * If there are two vnodes passed in, we don't know at
	 * this point which rights to look at based on the
	 * combined action being passed in... defer until later.
	 * Otherwise check the kauth 'rights' cache hung
	 * off of the vnode we're interested in... if we've already
	 * been granted the right we're currently interested in,
	 * we can just return success... otherwise we'll go through
	 * the process of authorizing the requested right(s)... if that
	 * succeeds, we'll add the right(s) to the cache.
	 * VNOP_SETATTR and VNOP_SETXATTR will invalidate this cache.
	 */
	if (dvp && vp) {
		goto defer;
	}
	if (dvp) {
		cvp = dvp;
	} else {
		/*
		 * For named streams on local-authorization volumes, rights are cached on the parent;
		 * authorization is determined by looking at the parent's properties anyway, so storing
		 * on the parent means that we don't recompute for the named stream and that if
		 * we need to flush rights (e.g. on VNOP_SETATTR()) we don't need to track down the
		 * stream to flush its cache separately. If we miss in the cache, then we authorize
		 * as if there were no cached rights (passing the named stream vnode and desired rights to
		 * vnode_authorize_callback_int()).
		 *
		 * On an opaquely authorized volume, we don't know the relationship between the
		 * data fork's properties and the rights granted on a stream. Thus, named stream vnodes
		 * on such a volume are authorized directly (rather than using the parent) and have their
		 * own caches. When a named stream vnode is created, we mark the parent as having a named
		 * stream. On a VNOP_SETATTR() for the parent that may invalidate cached authorization, we
		 * find the stream and flush its cache.
		 */
		if (vnode_isnamedstream(vp) && (!vfs_authopaque(vp->v_mount))) {
			cvp = vnode_getparent(vp);
			if (cvp != NULLVP) {
				parent_iocount = 1;
			} else {
				cvp = NULL;
				goto defer; /* If we can't use the parent, take the slow path */
			}

			/* Have to translate some actions */
			parent_action = action;
			if (parent_action & KAUTH_VNODE_READ_DATA) {
				parent_action &= ~KAUTH_VNODE_READ_DATA;
				parent_action |= KAUTH_VNODE_READ_EXTATTRIBUTES;
			}
			if (parent_action & KAUTH_VNODE_WRITE_DATA) {
				parent_action &= ~KAUTH_VNODE_WRITE_DATA;
				parent_action |= KAUTH_VNODE_WRITE_EXTATTRIBUTES;
			}
		} else {
			cvp = vp;
		}
	}

	if (vnode_cache_is_authorized(cvp, ctx, parent_iocount ? parent_action : action) == TRUE) {
		result = KAUTH_RESULT_ALLOW;
		goto out;
	}
defer:
	result = vnode_authorize_callback_int(action, ctx, vp, dvp, (int *)arg3);

	if (result == KAUTH_RESULT_ALLOW && cvp != NULLVP) {
		KAUTH_DEBUG("%p - caching action = %x", cvp, action);
		vnode_cache_authorized_action(cvp, ctx, action);
	}

out:
	if (parent_iocount) {
		vnode_put(cvp);
	}

	return result;
}
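
/*
 * Illustrative example of the action translation above (not part of the
 * original source): a KAUTH_VNODE_READ_DATA request against a file's
 * "com.apple.ResourceFork" named stream is checked against the parent
 * file's rights cache as KAUTH_VNODE_READ_EXTATTRIBUTES, since reading a
 * stream is, from the data fork's point of view, a read of its extended
 * attributes rather than of its data.
 */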

static int
vnode_attr_authorize_internal(vauth_ctx vcp, mount_t mp,
    kauth_ace_rights_t rights, int is_suser, boolean_t *found_deny,
    int noimmutable, int parent_authorized_for_delete_child)
{
	int result;

	/*
	 * Check for immutability.
	 *
	 * In the deletion case, parent directory immutability vetoes specific
	 * file rights.
	 */
	if ((result = vnode_authorize_checkimmutable(mp, vcp->vap, rights,
	    noimmutable)) != 0) {
		goto out;
	}

	if ((rights & KAUTH_VNODE_DELETE) &&
	    !parent_authorized_for_delete_child) {
		result = vnode_authorize_checkimmutable(mp, vcp->dvap,
		    KAUTH_VNODE_DELETE_CHILD, 0);
		if (result) {
			goto out;
		}
	}

	/*
	 * Clear rights that have been authorized by reaching this point, bail if nothing left to
	 * check.
	 */
	rights &= ~(KAUTH_VNODE_LINKTARGET | KAUTH_VNODE_CHECKIMMUTABLE);
	if (rights == 0) {
		goto out;
	}

	/*
	 * If we're not the superuser, authorize based on file properties;
	 * note that even if parent_authorized_for_delete_child is TRUE, we
	 * need to check on the node itself.
	 */
	if (!is_suser) {
		/* process delete rights */
		if ((rights & KAUTH_VNODE_DELETE) &&
		    ((result = vnode_authorize_delete(vcp, parent_authorized_for_delete_child)) != 0)) {
			goto out;
		}

		/* process remaining rights */
		if ((rights & ~KAUTH_VNODE_DELETE) &&
		    (result = vnode_authorize_simple(vcp, rights, rights & KAUTH_VNODE_DELETE, found_deny)) != 0) {
			goto out;
		}
	} else {
		/*
		 * Execute is only granted to root if one of the x bits is set. This check only
		 * makes sense if the posix mode bits are actually supported.
		 */
		if ((rights & KAUTH_VNODE_EXECUTE) &&
		    (vcp->vap->va_type == VREG) &&
		    VATTR_IS_SUPPORTED(vcp->vap, va_mode) &&
		    !(vcp->vap->va_mode & (S_IXUSR | S_IXGRP | S_IXOTH))) {
			result = EPERM;
			KAUTH_DEBUG("%p    DENIED - root execute requires at least one x bit in 0x%x", vcp->vp, vcp->vap->va_mode);
			goto out;
		}

		/* Assume that there were DENYs so we don't wrongly cache KAUTH_VNODE_SEARCHBYANYONE */
		*found_deny = TRUE;

		KAUTH_DEBUG("%p    ALLOWED - caller is superuser", vcp->vp);
	}
out:
	return result;
}

static int
vnode_authorize_callback_int(kauth_action_t action, vfs_context_t ctx,
    vnode_t vp, vnode_t dvp, int *errorp)
{
	struct _vnode_authorize_context auth_context;
	vauth_ctx vcp;
	kauth_cred_t cred;
	kauth_ace_rights_t rights;
	struct vnode_attr va, dva;
	int result;
	int noimmutable;
	boolean_t parent_authorized_for_delete_child = FALSE;
	boolean_t found_deny = FALSE;
	boolean_t parent_ref = FALSE;
	boolean_t is_suser = FALSE;

	vcp = &auth_context;
	vcp->ctx = ctx;
	vcp->vp = vp;
	vcp->dvp = dvp;
	/*
	 * Note that we authorize against the context, not the passed cred
	 * (the same thing anyway)
	 */
	cred = ctx->vc_ucred;

	VATTR_INIT(&va);
	vcp->vap = &va;
	VATTR_INIT(&dva);
	vcp->dvap = &dva;

	vcp->flags = vcp->flags_valid = 0;

#if DIAGNOSTIC
	if ((ctx == NULL) || (vp == NULL) || (cred == NULL)) {
		panic("vnode_authorize: bad arguments (context %p  vp %p  cred %p)", ctx, vp, cred);
	}
#endif

	KAUTH_DEBUG("%p AUTH - %s %s%s%s%s%s%s%s%s%s%s%s%s%s%s%s on %s '%s' (0x%x:%p/%p)",
	    vp, vfs_context_proc(ctx)->p_comm,
	    (action & KAUTH_VNODE_ACCESS) ? "access" : "auth",
	    (action & KAUTH_VNODE_READ_DATA) ? vnode_isdir(vp) ? " LIST_DIRECTORY" : " READ_DATA" : "",
	    (action & KAUTH_VNODE_WRITE_DATA) ? vnode_isdir(vp) ? " ADD_FILE" : " WRITE_DATA" : "",
	    (action & KAUTH_VNODE_EXECUTE) ? vnode_isdir(vp) ? " SEARCH" : " EXECUTE" : "",
	    (action & KAUTH_VNODE_DELETE) ? " DELETE" : "",
	    (action & KAUTH_VNODE_APPEND_DATA) ? vnode_isdir(vp) ? " ADD_SUBDIRECTORY" : " APPEND_DATA" : "",
	    (action & KAUTH_VNODE_DELETE_CHILD) ? " DELETE_CHILD" : "",
	    (action & KAUTH_VNODE_READ_ATTRIBUTES) ? " READ_ATTRIBUTES" : "",
	    (action & KAUTH_VNODE_WRITE_ATTRIBUTES) ? " WRITE_ATTRIBUTES" : "",
	    (action & KAUTH_VNODE_READ_EXTATTRIBUTES) ? " READ_EXTATTRIBUTES" : "",
	    (action & KAUTH_VNODE_WRITE_EXTATTRIBUTES) ? " WRITE_EXTATTRIBUTES" : "",
	    (action & KAUTH_VNODE_READ_SECURITY) ? " READ_SECURITY" : "",
	    (action & KAUTH_VNODE_WRITE_SECURITY) ? " WRITE_SECURITY" : "",
	    (action & KAUTH_VNODE_CHANGE_OWNER) ? " CHANGE_OWNER" : "",
	    (action & KAUTH_VNODE_NOIMMUTABLE) ? " (noimmutable)" : "",
	    vnode_isdir(vp) ? "directory" : "file",
	    vp->v_name ? vp->v_name : "<NULL>", action, vp, dvp);

	/*
	 * Extract the control bits from the action, everything else is
	 * requested rights.
	 */
	noimmutable = (action & KAUTH_VNODE_NOIMMUTABLE) ? 1 : 0;
	rights = action & ~(KAUTH_VNODE_ACCESS | KAUTH_VNODE_NOIMMUTABLE);

	if (rights & KAUTH_VNODE_DELETE) {
#if DIAGNOSTIC
		if (dvp == NULL) {
			panic("vnode_authorize: KAUTH_VNODE_DELETE test requires a directory");
		}
#endif
		/*
		 * check to see if we've already authorized the parent
		 * directory for deletion of its children... if so, we
		 * can skip a whole bunch of work... we will still have to
		 * authorize that this specific child can be removed
		 */
		if (vnode_cache_is_authorized(dvp, ctx, KAUTH_VNODE_DELETE_CHILD) == TRUE) {
			parent_authorized_for_delete_child = TRUE;
		}
	} else {
		vcp->dvp = NULLVP;
		vcp->dvap = NULL;
	}

	/*
	 * Check for read-only filesystems.
	 */
	if ((rights & KAUTH_VNODE_WRITE_RIGHTS) &&
	    (vp->v_mount->mnt_flag & MNT_RDONLY) &&
	    ((vp->v_type == VREG) || (vp->v_type == VDIR) ||
	    (vp->v_type == VLNK) || (vp->v_type == VCPLX) ||
	    (rights & KAUTH_VNODE_DELETE) || (rights & KAUTH_VNODE_DELETE_CHILD))) {
		result = EROFS;
		goto out;
	}

	/*
	 * Check for noexec filesystems.
	 */
	if ((rights & KAUTH_VNODE_EXECUTE) && (vp->v_type == VREG) && (vp->v_mount->mnt_flag & MNT_NOEXEC)) {
		result = EACCES;
		goto out;
	}

	/*
	 * Handle cases related to filesystems with non-local enforcement.
	 * This call can return 0, in which case we will fall through to perform a
	 * check based on VNOP_GETATTR data. Otherwise it returns 1 and sets
	 * an appropriate result, at which point we can return immediately.
	 */
	if ((vp->v_mount->mnt_kern_flag & MNTK_AUTH_OPAQUE) && vnode_authorize_opaque(vp, &result, action, ctx)) {
		goto out;
	}

	/*
	 * If the vnode is a namedstream (extended attribute) data vnode (e.g.
	 * a resource fork), *_DATA becomes *_EXTATTRIBUTES.
	 */
	if (vnode_isnamedstream(vp)) {
		if (rights & KAUTH_VNODE_READ_DATA) {
			rights &= ~KAUTH_VNODE_READ_DATA;
			rights |= KAUTH_VNODE_READ_EXTATTRIBUTES;
		}
		if (rights & KAUTH_VNODE_WRITE_DATA) {
			rights &= ~KAUTH_VNODE_WRITE_DATA;
			rights |= KAUTH_VNODE_WRITE_EXTATTRIBUTES;
		}

		/*
		 * Point 'vp' to the namedstream's parent for ACL checking
		 */
		if ((vp->v_parent != NULL) &&
		    (vget_internal(vp->v_parent, 0, VNODE_NODEAD | VNODE_DRAINO) == 0)) {
			parent_ref = TRUE;
			vcp->vp = vp = vp->v_parent;
		}
	}

	if (vfs_context_issuser(ctx)) {
		/*
		 * if we're not asking for execute permissions or modifications,
		 * then we're done, this action is authorized.
		 */
		if (!(rights & (KAUTH_VNODE_EXECUTE | KAUTH_VNODE_WRITE_RIGHTS))) {
			goto success;
		}

		is_suser = TRUE;
	}

	/*
	 * Get vnode attributes and extended security information for the vnode
	 * and directory if required.
	 *
	 * If we're root we only want mode bits and flags for checking
	 * execute and immutability.
	 */
	VATTR_WANTED(&va, va_mode);
	VATTR_WANTED(&va, va_flags);
	if (!is_suser) {
		VATTR_WANTED(&va, va_uid);
		VATTR_WANTED(&va, va_gid);
		VATTR_WANTED(&va, va_acl);
	}
	if ((result = vnode_getattr(vp, &va, ctx)) != 0) {
		KAUTH_DEBUG("%p    ERROR - failed to get vnode attributes - %d", vp, result);
		goto out;
	}
	VATTR_WANTED(&va, va_type);
	VATTR_RETURN(&va, va_type, vnode_vtype(vp));

	if (vcp->dvp) {
		VATTR_WANTED(&dva, va_mode);
		VATTR_WANTED(&dva, va_flags);
		if (!is_suser) {
			VATTR_WANTED(&dva, va_uid);
			VATTR_WANTED(&dva, va_gid);
			VATTR_WANTED(&dva, va_acl);
		}
		if ((result = vnode_getattr(vcp->dvp, &dva, ctx)) != 0) {
			KAUTH_DEBUG("%p    ERROR - failed to get directory vnode attributes - %d", vp, result);
			goto out;
		}
		VATTR_WANTED(&dva, va_type);
		VATTR_RETURN(&dva, va_type, vnode_vtype(vcp->dvp));
	}

	result = vnode_attr_authorize_internal(vcp, vp->v_mount, rights, is_suser,
	    &found_deny, noimmutable, parent_authorized_for_delete_child);
out:
	if (VATTR_IS_SUPPORTED(&va, va_acl) && (va.va_acl != NULL)) {
		kauth_acl_free(va.va_acl);
	}
	if (VATTR_IS_SUPPORTED(&dva, va_acl) && (dva.va_acl != NULL)) {
		kauth_acl_free(dva.va_acl);
	}

	if (result) {
		if (parent_ref) {
			vnode_put(vp);
		}
		*errorp = result;
		KAUTH_DEBUG("%p    DENIED - auth denied", vp);
		return KAUTH_RESULT_DENY;
	}
	if ((rights & KAUTH_VNODE_SEARCH) && found_deny == FALSE && vp->v_type == VDIR) {
		/*
		 * if we were successfully granted the right to search this directory
		 * and there were NO ACL DENYs for search and the posix permissions also don't
		 * deny execute, we can synthesize a global right that allows anyone to
		 * traverse this directory during a pathname lookup without having to
		 * match the credential associated with this cache of rights.
		 *
		 * Note that we can correctly cache KAUTH_VNODE_SEARCHBYANYONE
		 * only if we actually check ACLs which we don't for root. As
		 * a workaround, the lookup fast path checks for root.
		 */
		if (!VATTR_IS_SUPPORTED(&va, va_mode) ||
		    ((va.va_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) ==
		    (S_IXUSR | S_IXGRP | S_IXOTH))) {
			vnode_cache_authorized_action(vp, ctx, KAUTH_VNODE_SEARCHBYANYONE);
		}
	}
success:
	if (parent_ref) {
		vnode_put(vp);
	}

	/*
	 * Note that this implies that we will allow requests for no rights, as well as
	 * for rights that we do not recognise. There should be none of these.
	 */
	KAUTH_DEBUG("%p    ALLOWED - auth granted", vp);
	return KAUTH_RESULT_ALLOW;
}
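
/*
 * Worked example for the SEARCHBYANYONE optimization above (illustrative
 * only, not part of the original source): a directory with mode 0755 has
 * all of S_IXUSR | S_IXGRP | S_IXOTH set, so once one credential is granted
 * SEARCH with no ACL denies, KAUTH_VNODE_SEARCHBYANYONE is cached and later
 * pathname lookups can traverse the directory without re-matching
 * credentials. A 0750 directory lacks S_IXOTH, so the synthesized right is
 * never cached for it.
 */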

int
vnode_attr_authorize_init(struct vnode_attr *vap, struct vnode_attr *dvap,
    kauth_action_t action, vfs_context_t ctx)
{
	VATTR_INIT(vap);
	VATTR_WANTED(vap, va_type);
	VATTR_WANTED(vap, va_mode);
	VATTR_WANTED(vap, va_flags);
	if (dvap) {
		VATTR_INIT(dvap);
		if (action & KAUTH_VNODE_DELETE) {
			VATTR_WANTED(dvap, va_type);
			VATTR_WANTED(dvap, va_mode);
			VATTR_WANTED(dvap, va_flags);
		}
	} else if (action & KAUTH_VNODE_DELETE) {
		return EINVAL;
	}

	if (!vfs_context_issuser(ctx)) {
		VATTR_WANTED(vap, va_uid);
		VATTR_WANTED(vap, va_gid);
		VATTR_WANTED(vap, va_acl);
		if (dvap && (action & KAUTH_VNODE_DELETE)) {
			VATTR_WANTED(dvap, va_uid);
			VATTR_WANTED(dvap, va_gid);
			VATTR_WANTED(dvap, va_acl);
		}
	}

	return 0;
}

int
vnode_attr_authorize(struct vnode_attr *vap, struct vnode_attr *dvap, mount_t mp,
    kauth_action_t action, vfs_context_t ctx)
{
	struct _vnode_authorize_context auth_context;
	vauth_ctx vcp;
	kauth_ace_rights_t rights;
	int noimmutable;
	boolean_t found_deny;
	boolean_t is_suser = FALSE;
	int result = 0;

	vcp = &auth_context;
	vcp->ctx = ctx;
	vcp->vp = NULLVP;
	vcp->vap = vap;
	vcp->dvp = NULLVP;
	vcp->dvap = dvap;
	vcp->flags = vcp->flags_valid = 0;

	noimmutable = (action & KAUTH_VNODE_NOIMMUTABLE) ? 1 : 0;
	rights = action & ~(KAUTH_VNODE_ACCESS | KAUTH_VNODE_NOIMMUTABLE);

	/*
	 * Check for read-only filesystems.
	 */
	if ((rights & KAUTH_VNODE_WRITE_RIGHTS) &&
	    mp && (mp->mnt_flag & MNT_RDONLY) &&
	    ((vap->va_type == VREG) || (vap->va_type == VDIR) ||
	    (vap->va_type == VLNK) || (rights & KAUTH_VNODE_DELETE) ||
	    (rights & KAUTH_VNODE_DELETE_CHILD))) {
		result = EROFS;
		goto out;
	}

	/*
	 * Check for noexec filesystems.
	 */
	if ((rights & KAUTH_VNODE_EXECUTE) &&
	    (vap->va_type == VREG) && mp && (mp->mnt_flag & MNT_NOEXEC)) {
		result = EACCES;
		goto out;
	}

	if (vfs_context_issuser(ctx)) {
		/*
		 * if we're not asking for execute permissions or modifications,
		 * then we're done, this action is authorized.
		 */
		if (!(rights & (KAUTH_VNODE_EXECUTE | KAUTH_VNODE_WRITE_RIGHTS))) {
			goto out;
		}
		is_suser = TRUE;
	} else {
		if (!VATTR_IS_SUPPORTED(vap, va_uid) ||
		    !VATTR_IS_SUPPORTED(vap, va_gid) ||
		    (mp && vfs_extendedsecurity(mp) && !VATTR_IS_SUPPORTED(vap, va_acl))) {
			panic("vnode attrs not complete for vnode_attr_authorize\n");
		}
	}

	result = vnode_attr_authorize_internal(vcp, mp, rights, is_suser,
	    &found_deny, noimmutable, FALSE);

	if (result == EPERM) {
		result = EACCES;
	}
out:
	return result;
}
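
/*
 * Minimal usage sketch for the two functions above (illustrative only; the
 * attribute source and error handling are assumptions, not part of this
 * file). A caller holding vnode attributes from an external source is
 * expected to set up the vnode_attr structures with
 * vnode_attr_authorize_init(), populate every attribute marked wanted,
 * and only then call vnode_attr_authorize():
 *
 *	struct vnode_attr va, dva;
 *	int error;
 *
 *	error = vnode_attr_authorize_init(&va, &dva, KAUTH_VNODE_DELETE, ctx);
 *	if (error == 0) {
 *		// fill in the va/dva fields requested via VATTR_WANTED,
 *		// e.g. from an on-disk inode image (hypothetical source)
 *		error = vnode_attr_authorize(&va, &dva, mp, KAUTH_VNODE_DELETE, ctx);
 *	}
 *
 * Passing dvap == NULL with KAUTH_VNODE_DELETE fails the init with EINVAL,
 * and vnode_attr_authorize() panics if the non-superuser path is entered
 * without the uid/gid (and, on extended-security mounts, ACL) attributes
 * populated.
 */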


int
vnode_authattr_new(vnode_t dvp, struct vnode_attr *vap, int noauth, vfs_context_t ctx)
{
	return vnode_authattr_new_internal(dvp, vap, noauth, NULL, ctx);
}

/*
 * Check that the attribute information in vattr can be legally applied to
 * a new file by the context.
 */
static int
vnode_authattr_new_internal(vnode_t dvp, struct vnode_attr *vap, int noauth, uint32_t *defaulted_fieldsp, vfs_context_t ctx)
{
	int error;
	int has_priv_suser, ismember, defaulted_owner, defaulted_group, defaulted_mode;
	uint32_t inherit_flags;
	kauth_cred_t cred;
	guid_t changer;
	mount_t dmp;
	struct vnode_attr dva;

	error = 0;

	if (defaulted_fieldsp) {
		*defaulted_fieldsp = 0;
	}

	defaulted_owner = defaulted_group = defaulted_mode = 0;

	inherit_flags = 0;

	/*
	 * Require that the filesystem support extended security to apply any.
	 */
	if (!vfs_extendedsecurity(dvp->v_mount) &&
	    (VATTR_IS_ACTIVE(vap, va_acl) || VATTR_IS_ACTIVE(vap, va_uuuid) || VATTR_IS_ACTIVE(vap, va_guuid))) {
		error = EINVAL;
		goto out;
	}

	/*
	 * Default some fields.
	 */
	dmp = dvp->v_mount;

	/*
	 * If the filesystem is mounted IGNORE_OWNERSHIP and an explicit owner is set, that
	 * owner takes ownership of all new files.
	 */
	if ((dmp->mnt_flag & MNT_IGNORE_OWNERSHIP) && (dmp->mnt_fsowner != KAUTH_UID_NONE)) {
		VATTR_SET(vap, va_uid, dmp->mnt_fsowner);
		defaulted_owner = 1;
	} else {
		if (!VATTR_IS_ACTIVE(vap, va_uid)) {
			/* default owner is current user */
			VATTR_SET(vap, va_uid, kauth_cred_getuid(vfs_context_ucred(ctx)));
			defaulted_owner = 1;
		}
	}

	/*
	 * We need the dvp's va_flags and *may* need the gid of the directory,
	 * so we ask for both here.
	 */
	VATTR_INIT(&dva);
	VATTR_WANTED(&dva, va_gid);
	VATTR_WANTED(&dva, va_flags);
	if ((error = vnode_getattr(dvp, &dva, ctx)) != 0) {
		goto out;
	}

	/*
	 * If the filesystem is mounted IGNORE_OWNERSHIP and an explicit group is set, that
	 * group takes ownership of all new files.
	 */
	if ((dmp->mnt_flag & MNT_IGNORE_OWNERSHIP) && (dmp->mnt_fsgroup != KAUTH_GID_NONE)) {
		VATTR_SET(vap, va_gid, dmp->mnt_fsgroup);
		defaulted_group = 1;
	} else {
		if (!VATTR_IS_ACTIVE(vap, va_gid)) {
			/* default group comes from parent object, fallback to current user */
			if (VATTR_IS_SUPPORTED(&dva, va_gid)) {
				VATTR_SET(vap, va_gid, dva.va_gid);
			} else {
				VATTR_SET(vap, va_gid, kauth_cred_getgid(vfs_context_ucred(ctx)));
			}
			defaulted_group = 1;
		}
	}

	if (!VATTR_IS_ACTIVE(vap, va_flags)) {
		VATTR_SET(vap, va_flags, 0);
	}

	/* Determine if SF_RESTRICTED should be inherited from the parent
	 * directory. */
	if (VATTR_IS_SUPPORTED(&dva, va_flags)) {
		inherit_flags = dva.va_flags & (UF_DATAVAULT | SF_RESTRICTED);
	}

	/* default mode is everything, masked with current umask */
	if (!VATTR_IS_ACTIVE(vap, va_mode)) {
		VATTR_SET(vap, va_mode, ACCESSPERMS & ~vfs_context_proc(ctx)->p_fd->fd_cmask);
		KAUTH_DEBUG("ATTR - defaulting new file mode to %o from umask %o", vap->va_mode, vfs_context_proc(ctx)->p_fd->fd_cmask);
		defaulted_mode = 1;
	}
	/* set timestamps to now */
	if (!VATTR_IS_ACTIVE(vap, va_create_time)) {
		nanotime(&vap->va_create_time);
		VATTR_SET_ACTIVE(vap, va_create_time);
	}

	/*
	 * Check for attempts to set nonsensical fields.
	 */
	if (vap->va_active & ~VNODE_ATTR_NEWOBJ) {
		error = EINVAL;
		KAUTH_DEBUG("ATTR - ERROR - attempt to set unsupported new-file attributes %llx",
		    vap->va_active & ~VNODE_ATTR_NEWOBJ);
		goto out;
	}

	/*
	 * Quickly check for the applicability of any enforcement here.
	 * Tests below maintain the integrity of the local security model.
	 */
	if (vfs_authopaque(dvp->v_mount)) {
		goto out;
	}

	/*
	 * We need to know if the caller is the superuser, or if the work is
	 * otherwise already authorised.
	 */
	cred = vfs_context_ucred(ctx);
	if (noauth) {
		/* doing work for the kernel */
		has_priv_suser = 1;
	} else {
		has_priv_suser = vfs_context_issuser(ctx);
	}


	if (VATTR_IS_ACTIVE(vap, va_flags)) {
		if (has_priv_suser) {
			if ((vap->va_flags & (UF_SETTABLE | SF_SETTABLE)) != vap->va_flags) {
				error = EPERM;
				KAUTH_DEBUG("  DENIED - superuser attempt to set illegal flag(s)");
				goto out;
			}
		} else {
			if ((vap->va_flags & UF_SETTABLE) != vap->va_flags) {
				error = EPERM;
				KAUTH_DEBUG("  DENIED - user attempt to set illegal flag(s)");
				goto out;
			}
		}
	}

	/* if not superuser, validate legality of new-item attributes */
	if (!has_priv_suser) {
		if (!defaulted_mode && VATTR_IS_ACTIVE(vap, va_mode)) {
			/* setgid? */
			if (vap->va_mode & S_ISGID) {
				if ((error = kauth_cred_ismember_gid(cred, vap->va_gid, &ismember)) != 0) {
					KAUTH_DEBUG("ATTR - ERROR: got %d checking for membership in %d", error, vap->va_gid);
					goto out;
				}
				if (!ismember) {
					KAUTH_DEBUG("  DENIED - can't set SGID bit, not a member of %d", vap->va_gid);
					error = EPERM;
					goto out;
				}
			}

			/* setuid? */
			if ((vap->va_mode & S_ISUID) && (vap->va_uid != kauth_cred_getuid(cred))) {
				KAUTH_DEBUG("ATTR - ERROR: illegal attempt to set the setuid bit");
				error = EPERM;
				goto out;
			}
		}
		if (!defaulted_owner && (vap->va_uid != kauth_cred_getuid(cred))) {
			KAUTH_DEBUG("  DENIED - cannot create new item owned by %d", vap->va_uid);
			error = EPERM;
			goto out;
		}
		if (!defaulted_group) {
			if ((error = kauth_cred_ismember_gid(cred, vap->va_gid, &ismember)) != 0) {
				KAUTH_DEBUG("  ERROR - got %d checking for membership in %d", error, vap->va_gid);
				goto out;
			}
			if (!ismember) {
				KAUTH_DEBUG("  DENIED - cannot create new item with group %d - not a member", vap->va_gid);
				error = EPERM;
				goto out;
			}
		}

		/* initialising owner/group UUID */
		if (VATTR_IS_ACTIVE(vap, va_uuuid)) {
			if ((error = kauth_cred_getguid(cred, &changer)) != 0) {
				KAUTH_DEBUG("  ERROR - got %d trying to get caller UUID", error);
				/* XXX ENOENT here - no GUID - should perhaps become EPERM */
				goto out;
			}
			if (!kauth_guid_equal(&vap->va_uuuid, &changer)) {
				KAUTH_DEBUG("  ERROR - cannot create item with supplied owner UUID - not us");
				error = EPERM;
				goto out;
			}
		}
		if (VATTR_IS_ACTIVE(vap, va_guuid)) {
			if ((error = kauth_cred_ismember_guid(cred, &vap->va_guuid, &ismember)) != 0) {
				KAUTH_DEBUG("  ERROR - got %d trying to check group membership", error);
				goto out;
			}
			if (!ismember) {
				KAUTH_DEBUG("  ERROR - cannot create item with supplied group UUID - not a member");
				error = EPERM;
				goto out;
			}
		}
	}
out:
	if (inherit_flags) {
		/* Apply SF_RESTRICTED to the file if its parent directory was
		 * restricted. This is done at the end so that root is not
		 * required if this flag is only set due to inheritance. */
		VATTR_SET(vap, va_flags, (vap->va_flags | inherit_flags));
	}
	if (defaulted_fieldsp) {
		if (defaulted_mode) {
			*defaulted_fieldsp |= VATTR_PREPARE_DEFAULTED_MODE;
		}
		if (defaulted_group) {
			*defaulted_fieldsp |= VATTR_PREPARE_DEFAULTED_GID;
		}
		if (defaulted_owner) {
			*defaulted_fieldsp |= VATTR_PREPARE_DEFAULTED_UID;
		}
	}
	return error;
}
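
/*
 * Worked example for the mode defaulting above (illustrative only, not part
 * of the original source): with ACCESSPERMS = 0777 and a process umask
 * (fd_cmask) of 022, a create request that leaves va_mode inactive gets
 * va_mode = 0777 & ~022 = 0755, and the defaulting is reported back to the
 * caller via VATTR_PREPARE_DEFAULTED_MODE in *defaulted_fieldsp.
 */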
8557
8558/*
2d21ac55
A
8559 * Check that the attribute information in vap can be legally written by the
8560 * context.
91447636 8561 *
2d21ac55
A
8562 * Call this when you're not sure about the vnode_attr; either its contents
8563 * have come from an unknown source, or when they are variable.
91447636
A
8564 *
8565 * Returns errno, or zero and sets *actionp to the KAUTH_VNODE_* actions that
8566 * must be authorized to be permitted to write the vattr.
8567 */
8568int
8569vnode_authattr(vnode_t vp, struct vnode_attr *vap, kauth_action_t *actionp, vfs_context_t ctx)
8570{
8571 struct vnode_attr ova;
0a7de745
A
8572 kauth_action_t required_action;
8573 int error, has_priv_suser, ismember, chowner, chgroup, clear_suid, clear_sgid;
8574 guid_t changer;
8575 gid_t group;
8576 uid_t owner;
8577 mode_t newmode;
8578 kauth_cred_t cred;
8579 uint32_t fdelta;
91447636
A
8580
8581 VATTR_INIT(&ova);
8582 required_action = 0;
8583 error = 0;
8584
8585 /*
8586 * Quickly check for enforcement applicability.
8587 */
0a7de745 8588 if (vfs_authopaque(vp->v_mount)) {
91447636 8589 goto out;
0a7de745
A
8590 }
8591
91447636
A
8592 /*
8593 * Check for attempts to set nonsensical fields.
8594 */
8595 if (vap->va_active & VNODE_ATTR_RDONLY) {
8596 KAUTH_DEBUG("ATTR - ERROR: attempt to set readonly attribute(s)");
8597 error = EINVAL;
8598 goto out;
8599 }
8600
8601 /*
8602 * We need to know if the caller is the superuser.
8603 */
8604 cred = vfs_context_ucred(ctx);
2d21ac55 8605 has_priv_suser = kauth_cred_issuser(cred);
0a7de745 8606
91447636
A
8607 /*
8608 * If any of the following are changing, we need information from the old file:
8609 * va_uid
8610 * va_gid
8611 * va_mode
8612 * va_uuuid
8613 * va_guuid
8614 */
8615 if (VATTR_IS_ACTIVE(vap, va_uid) ||
8616 VATTR_IS_ACTIVE(vap, va_gid) ||
8617 VATTR_IS_ACTIVE(vap, va_mode) ||
8618 VATTR_IS_ACTIVE(vap, va_uuuid) ||
8619 VATTR_IS_ACTIVE(vap, va_guuid)) {
8620 VATTR_WANTED(&ova, va_mode);
8621 VATTR_WANTED(&ova, va_uid);
8622 VATTR_WANTED(&ova, va_gid);
8623 VATTR_WANTED(&ova, va_uuuid);
8624 VATTR_WANTED(&ova, va_guuid);
8625 KAUTH_DEBUG("ATTR - security information changing, fetching existing attributes");
8626 }
8627
8628 /*
8629 * If timestamps are being changed, we need to know who the file is owned
8630 * by.
8631 */
8632 if (VATTR_IS_ACTIVE(vap, va_create_time) ||
8633 VATTR_IS_ACTIVE(vap, va_change_time) ||
8634 VATTR_IS_ACTIVE(vap, va_modify_time) ||
8635 VATTR_IS_ACTIVE(vap, va_access_time) ||
5ba3f43e
A
8636 VATTR_IS_ACTIVE(vap, va_backup_time) ||
8637 VATTR_IS_ACTIVE(vap, va_addedtime)) {
91447636 8638 VATTR_WANTED(&ova, va_uid);
0a7de745 8639#if 0 /* enable this when we support UUIDs as official owners */
91447636
A
8640 VATTR_WANTED(&ova, va_uuuid);
8641#endif
8642 KAUTH_DEBUG("ATTR - timestamps changing, fetching uid and GUID");
8643 }
0a7de745 8644
91447636
A
8645 /*
8646 * If flags are being changed, we need the old flags.
8647 */
8648 if (VATTR_IS_ACTIVE(vap, va_flags)) {
8649 KAUTH_DEBUG("ATTR - flags changing, fetching old flags");
8650 VATTR_WANTED(&ova, va_flags);
8651 }
8652
6d2010ae
A
8653 /*
8654 * If ACLs are being changed, we need the old ACLs.
8655 */
8656 if (VATTR_IS_ACTIVE(vap, va_acl)) {
8657 KAUTH_DEBUG("ATTR - acl changing, fetching old flags");
8658 VATTR_WANTED(&ova, va_acl);
8659 }
8660
91447636
A
8661 /*
8662 * If the size is being set, make sure it's not a directory.
8663 */
8664 if (VATTR_IS_ACTIVE(vap, va_data_size)) {
4bd07ac2
A
8665 /* size is only meaningful on regular files, don't permit otherwise */
8666 if (!vnode_isreg(vp)) {
8667 KAUTH_DEBUG("ATTR - ERROR: size change requested on non-file");
8668 error = vnode_isdir(vp) ? EISDIR : EINVAL;
91447636
A
8669 goto out;
8670 }
8671 }
8672
8673 /*
8674 * Get old data.
8675 */
8676 KAUTH_DEBUG("ATTR - fetching old attributes %016llx", ova.va_active);
8677 if ((error = vnode_getattr(vp, &ova, ctx)) != 0) {
8678 KAUTH_DEBUG(" ERROR - got %d trying to get attributes", error);
8679 goto out;
8680 }
8681
8682 /*
8683 * Size changes require write access to the file data.
8684 */
8685 if (VATTR_IS_ACTIVE(vap, va_data_size)) {
8686 /* if we can't get the size, or it's different, we need write access */
0a7de745
A
8687 KAUTH_DEBUG("ATTR - size change, requiring WRITE_DATA");
		required_action |= KAUTH_VNODE_WRITE_DATA;
	}

	/*
	 * Changing timestamps?
	 *
	 * Note that we are only called to authorize user-requested time changes;
	 * side-effect time changes are not authorized.  Authorisation is only
	 * required for existing files.
	 *
	 * Non-owners are not permitted to change the time on an existing
	 * file to anything other than the current time.
	 */
	if (VATTR_IS_ACTIVE(vap, va_create_time) ||
	    VATTR_IS_ACTIVE(vap, va_change_time) ||
	    VATTR_IS_ACTIVE(vap, va_modify_time) ||
	    VATTR_IS_ACTIVE(vap, va_access_time) ||
	    VATTR_IS_ACTIVE(vap, va_backup_time) ||
	    VATTR_IS_ACTIVE(vap, va_addedtime)) {
		/*
		 * The owner and root may set any timestamps they like,
		 * provided that the file is not immutable.  The owner still needs
		 * WRITE_ATTRIBUTES (implied by ownership but still deniable).
		 */
		if (has_priv_suser || vauth_node_owner(&ova, cred)) {
			KAUTH_DEBUG("ATTR - root or owner changing timestamps");
			required_action |= KAUTH_VNODE_CHECKIMMUTABLE | KAUTH_VNODE_WRITE_ATTRIBUTES;
		} else {
			/* just setting the current time? */
			if (vap->va_vaflags & VA_UTIMES_NULL) {
				KAUTH_DEBUG("ATTR - non-root/owner changing timestamps, requiring WRITE_ATTRIBUTES");
				required_action |= KAUTH_VNODE_WRITE_ATTRIBUTES;
			} else {
				KAUTH_DEBUG("ATTR - ERROR: illegal timestamp modification attempted");
				error = EACCES;
				goto out;
			}
		}
	}

	/*
	 * Changing file mode?
	 */
	if (VATTR_IS_ACTIVE(vap, va_mode) && VATTR_IS_SUPPORTED(&ova, va_mode) && (ova.va_mode != vap->va_mode)) {
		KAUTH_DEBUG("ATTR - mode change from %06o to %06o", ova.va_mode, vap->va_mode);

		/*
		 * Mode changes always have the same basic auth requirements.
		 */
		if (has_priv_suser) {
			KAUTH_DEBUG("ATTR - superuser mode change, requiring immutability check");
			required_action |= KAUTH_VNODE_CHECKIMMUTABLE;
		} else {
			/* need WRITE_SECURITY */
			KAUTH_DEBUG("ATTR - non-superuser mode change, requiring WRITE_SECURITY");
			required_action |= KAUTH_VNODE_WRITE_SECURITY;
		}

		/*
		 * Can't set the setgid bit if you're not in the group and not root.  We must
		 * have existing group information in the case where we're not setting it now.
		 */
		if (vap->va_mode & S_ISGID) {
			required_action |= KAUTH_VNODE_CHECKIMMUTABLE;  /* always required */
			if (!has_priv_suser) {
				if (VATTR_IS_ACTIVE(vap, va_gid)) {
					group = vap->va_gid;
				} else if (VATTR_IS_SUPPORTED(&ova, va_gid)) {
					group = ova.va_gid;
				} else {
					KAUTH_DEBUG("ATTR - ERROR: setgid but no gid available");
					error = EINVAL;
					goto out;
				}
				/*
				 * This might be too restrictive; WRITE_SECURITY might be implied by
				 * membership in this case, rather than being an additional requirement.
				 */
				if ((error = kauth_cred_ismember_gid(cred, group, &ismember)) != 0) {
					KAUTH_DEBUG("ATTR - ERROR: got %d checking for membership in %d", error, vap->va_gid);
					goto out;
				}
				if (!ismember) {
					KAUTH_DEBUG("  DENIED - can't set SGID bit, not a member of %d", group);
					error = EPERM;
					goto out;
				}
			}
		}

		/*
		 * Can't set the setuid bit unless you're root or the file's owner.
		 */
		if (vap->va_mode & S_ISUID) {
			required_action |= KAUTH_VNODE_CHECKIMMUTABLE;  /* always required */
			if (!has_priv_suser) {
				if (VATTR_IS_ACTIVE(vap, va_uid)) {
					owner = vap->va_uid;
				} else if (VATTR_IS_SUPPORTED(&ova, va_uid)) {
					owner = ova.va_uid;
				} else {
					KAUTH_DEBUG("ATTR - ERROR: setuid but no uid available");
					error = EINVAL;
					goto out;
				}
				if (owner != kauth_cred_getuid(cred)) {
					/*
					 * We could allow this if WRITE_SECURITY is permitted, perhaps.
					 */
					KAUTH_DEBUG("ATTR - ERROR: illegal attempt to set the setuid bit");
					error = EPERM;
					goto out;
				}
			}
		}
	}

	/*
	 * Validate/mask flags changes.  This checks that only the flags in
	 * the UF_SETTABLE mask are being set, and preserves the flags in
	 * the SF_SETTABLE case.
	 *
	 * Since flags changes may be made in conjunction with other changes,
	 * we will ask the auth code to ignore immutability in the case that
	 * the SF_* flags are not set and we are only manipulating the file flags.
	 */
	if (VATTR_IS_ACTIVE(vap, va_flags)) {
		/* compute changing flags bits */
		if (VATTR_IS_SUPPORTED(&ova, va_flags)) {
			fdelta = vap->va_flags ^ ova.va_flags;
		} else {
			fdelta = vap->va_flags;
		}

		if (fdelta != 0) {
			KAUTH_DEBUG("ATTR - flags changing, requiring WRITE_SECURITY");
			required_action |= KAUTH_VNODE_WRITE_SECURITY;

			/* check that changing bits are legal */
			if (has_priv_suser) {
				/*
				 * The immutability check will prevent us from clearing the SF_*
				 * flags unless the system securelevel permits it, so just check
				 * for legal flags here.
				 */
				if (fdelta & ~(UF_SETTABLE | SF_SETTABLE)) {
					error = EPERM;
					KAUTH_DEBUG("  DENIED - superuser attempt to set illegal flag(s)");
					goto out;
				}
			} else {
				if (fdelta & ~UF_SETTABLE) {
					error = EPERM;
					KAUTH_DEBUG("  DENIED - user attempt to set illegal flag(s)");
					goto out;
				}
			}
			/*
			 * If the caller has the ability to manipulate file flags,
			 * security is not reduced by ignoring them for this operation.
			 *
			 * A more complete test here would consider the 'after' states of the flags
			 * to determine whether it would permit the operation, but this becomes
			 * very complex.
			 *
			 * Ignoring immutability is conditional on securelevel; this does not bypass
			 * the SF_* flags if securelevel > 0.
			 */
			required_action |= KAUTH_VNODE_NOIMMUTABLE;
		}
	}

	/*
	 * Validate ownership information.
	 */
	chowner = 0;
	chgroup = 0;
	clear_suid = 0;
	clear_sgid = 0;

	/*
	 * uid changing
	 * Note that if the filesystem didn't give us a UID, we expect that it doesn't
	 * support them in general, and will ignore it if/when we try to set it.
	 * We might want to clear the uid out of vap completely here.
	 */
	if (VATTR_IS_ACTIVE(vap, va_uid)) {
		if (VATTR_IS_SUPPORTED(&ova, va_uid) && (vap->va_uid != ova.va_uid)) {
			if (!has_priv_suser && (kauth_cred_getuid(cred) != vap->va_uid)) {
				KAUTH_DEBUG("  DENIED - non-superuser cannot change ownership to a third party");
				error = EPERM;
				goto out;
			}
			chowner = 1;
		}
		clear_suid = 1;
	}

	/*
	 * gid changing
	 * Note that if the filesystem didn't give us a GID, we expect that it doesn't
	 * support them in general, and will ignore it if/when we try to set it.
	 * We might want to clear the gid out of vap completely here.
	 */
	if (VATTR_IS_ACTIVE(vap, va_gid)) {
		if (VATTR_IS_SUPPORTED(&ova, va_gid) && (vap->va_gid != ova.va_gid)) {
			if (!has_priv_suser) {
				if ((error = kauth_cred_ismember_gid(cred, vap->va_gid, &ismember)) != 0) {
					KAUTH_DEBUG("  ERROR - got %d checking for membership in %d", error, vap->va_gid);
					goto out;
				}
				if (!ismember) {
					KAUTH_DEBUG("  DENIED - group change from %d to %d but not a member of target group",
					    ova.va_gid, vap->va_gid);
					error = EPERM;
					goto out;
				}
			}
			chgroup = 1;
		}
		clear_sgid = 1;
	}

	/*
	 * Owner UUID being set or changed.
	 */
	if (VATTR_IS_ACTIVE(vap, va_uuuid)) {
		/* if the owner UUID is not actually changing ... */
		if (VATTR_IS_SUPPORTED(&ova, va_uuuid)) {
			if (kauth_guid_equal(&vap->va_uuuid, &ova.va_uuuid)) {
				goto no_uuuid_change;
			}

			/*
			 * If the current owner UUID is a null GUID, check
			 * it against the UUID corresponding to the owner UID.
			 */
			if (kauth_guid_equal(&ova.va_uuuid, &kauth_null_guid) &&
			    VATTR_IS_SUPPORTED(&ova, va_uid)) {
				guid_t uid_guid;

				if (kauth_cred_uid2guid(ova.va_uid, &uid_guid) == 0 &&
				    kauth_guid_equal(&vap->va_uuuid, &uid_guid)) {
					goto no_uuuid_change;
				}
			}
		}

		/*
		 * The owner UUID cannot be set by a non-superuser to anything other than
		 * their own or a null GUID (to "unset" the owner UUID).
		 * Note that file systems must be prepared to handle the
		 * null UUID case in a manner appropriate for that file
		 * system.
		 */
		if (!has_priv_suser) {
			if ((error = kauth_cred_getguid(cred, &changer)) != 0) {
				KAUTH_DEBUG("  ERROR - got %d trying to get caller UUID", error);
				/* XXX ENOENT here - no UUID - should perhaps become EPERM */
				goto out;
			}
			if (!kauth_guid_equal(&vap->va_uuuid, &changer) &&
			    !kauth_guid_equal(&vap->va_uuuid, &kauth_null_guid)) {
				KAUTH_DEBUG("  ERROR - cannot set supplied owner UUID - not us / null");
				error = EPERM;
				goto out;
			}
		}
		chowner = 1;
		clear_suid = 1;
	}
no_uuuid_change:
	/*
	 * Group UUID being set or changed.
	 */
	if (VATTR_IS_ACTIVE(vap, va_guuid)) {
		/* if the group UUID is not actually changing ... */
		if (VATTR_IS_SUPPORTED(&ova, va_guuid)) {
			if (kauth_guid_equal(&vap->va_guuid, &ova.va_guuid)) {
				goto no_guuid_change;
			}

			/*
			 * If the current group UUID is a null UUID, check
			 * it against the UUID corresponding to the group GID.
			 */
			if (kauth_guid_equal(&ova.va_guuid, &kauth_null_guid) &&
			    VATTR_IS_SUPPORTED(&ova, va_gid)) {
				guid_t gid_guid;

				if (kauth_cred_gid2guid(ova.va_gid, &gid_guid) == 0 &&
				    kauth_guid_equal(&vap->va_guuid, &gid_guid)) {
					goto no_guuid_change;
				}
			}
		}

		/*
		 * The group UUID cannot be set by a non-superuser to anything other than
		 * one of which they are a member or a null GUID (to "unset"
		 * the group UUID).
		 * Note that file systems must be prepared to handle the
		 * null UUID case in a manner appropriate for that file
		 * system.
		 */
		if (!has_priv_suser) {
			if (kauth_guid_equal(&vap->va_guuid, &kauth_null_guid)) {
				ismember = 1;
			} else if ((error = kauth_cred_ismember_guid(cred, &vap->va_guuid, &ismember)) != 0) {
				KAUTH_DEBUG("  ERROR - got %d trying to check group membership", error);
				goto out;
			}
			if (!ismember) {
				KAUTH_DEBUG("  ERROR - cannot set supplied group UUID - not a member / null");
				error = EPERM;
				goto out;
			}
		}
		chgroup = 1;
	}
no_guuid_change:

	/*
	 * Compute authorisation for group/ownership changes.
	 */
	if (chowner || chgroup || clear_suid || clear_sgid) {
		if (has_priv_suser) {
			KAUTH_DEBUG("ATTR - superuser changing file owner/group, requiring immutability check");
			required_action |= KAUTH_VNODE_CHECKIMMUTABLE;
		} else {
			if (chowner) {
				KAUTH_DEBUG("ATTR - ownership change, requiring TAKE_OWNERSHIP");
				required_action |= KAUTH_VNODE_TAKE_OWNERSHIP;
			}
			if (chgroup && !chowner) {
				KAUTH_DEBUG("ATTR - group change, requiring WRITE_SECURITY");
				required_action |= KAUTH_VNODE_WRITE_SECURITY;
			}
		}

		/*
		 * clear set-uid and set-gid bits.  POSIX only requires this for
		 * non-privileged processes but we do it even for root.
		 */
		if (VATTR_IS_ACTIVE(vap, va_mode)) {
			newmode = vap->va_mode;
		} else if (VATTR_IS_SUPPORTED(&ova, va_mode)) {
			newmode = ova.va_mode;
		} else {
			KAUTH_DEBUG("CHOWN - trying to change owner but cannot get mode from filesystem to mask setugid bits");
			newmode = 0;
		}

		/* chown always clears setuid/gid bits.  An exception is made for
		 * setattrlist executed by a root process to set <uid, gid, mode> on a file:
		 * setattrlist is allowed to set the new mode on the file and change (chown)
		 * uid/gid.
		 */
		if (newmode & (S_ISUID | S_ISGID)) {
			if (!VATTR_IS_ACTIVE(vap, va_mode) || !has_priv_suser) {
				KAUTH_DEBUG("CHOWN - masking setugid bits from mode %o to %o",
				    newmode, newmode & ~(S_ISUID | S_ISGID));
				newmode &= ~(S_ISUID | S_ISGID);
			}
			VATTR_SET(vap, va_mode, newmode);
		}
	}

	/*
	 * Authorise changes in the ACL.
	 */
	if (VATTR_IS_ACTIVE(vap, va_acl)) {
		/* no existing ACL */
		if (!VATTR_IS_ACTIVE(&ova, va_acl) || (ova.va_acl == NULL)) {
			/* adding an ACL */
			if (vap->va_acl != NULL) {
				required_action |= KAUTH_VNODE_WRITE_SECURITY;
				KAUTH_DEBUG("CHMOD - adding ACL");
			}

			/* removing an existing ACL */
		} else if (vap->va_acl == NULL) {
			required_action |= KAUTH_VNODE_WRITE_SECURITY;
			KAUTH_DEBUG("CHMOD - removing ACL");

			/* updating an existing ACL */
		} else {
			if (vap->va_acl->acl_entrycount != ova.va_acl->acl_entrycount) {
				/* entry count changed, must be different */
				required_action |= KAUTH_VNODE_WRITE_SECURITY;
				KAUTH_DEBUG("CHMOD - adding/removing ACL entries");
			} else if (vap->va_acl->acl_entrycount > 0) {
				/* both ACLs have the same ACE count, said count is 1 or more, bitwise compare ACLs */
				if (memcmp(&vap->va_acl->acl_ace[0], &ova.va_acl->acl_ace[0],
				    sizeof(struct kauth_ace) * vap->va_acl->acl_entrycount)) {
					required_action |= KAUTH_VNODE_WRITE_SECURITY;
					KAUTH_DEBUG("CHMOD - changing ACL entries");
				}
			}
		}
	}

	/*
	 * Other attributes that require authorisation.
	 */
	if (VATTR_IS_ACTIVE(vap, va_encoding)) {
		required_action |= KAUTH_VNODE_WRITE_ATTRIBUTES;
	}

out:
	if (VATTR_IS_SUPPORTED(&ova, va_acl) && (ova.va_acl != NULL)) {
		kauth_acl_free(ova.va_acl);
	}
	if (error == 0) {
		*actionp = required_action;
	}
	return error;
}

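/*
 * Illustrative sketch (not part of the original source): how a setattr
 * path might consume the action mask computed above.  Assumes the function
 * above is vnode_authattr() and that the caller already holds an iocount
 * on "vp"; a real caller (e.g. vnode_setattr()) does more work than this.
 */
#if 0 /* example only */
static int
setattr_example(vnode_t vp, struct vnode_attr *vap, vfs_context_t ctx)
{
	kauth_action_t action = 0;
	int error;

	/* derive the rights these attribute changes require ... */
	if ((error = vnode_authattr(vp, vap, &action, ctx)) != 0) {
		return error;
	}
	/* ... check them against the vnode before changing anything ... */
	if (action != 0 && (error = vnode_authorize(vp, NULLVP, action, ctx)) != 0) {
		return error;
	}
	/* ... and only then push the attributes down to the filesystem */
	return VNOP_SETATTR(vp, vap, ctx);
}
#endif
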
static int
setlocklocal_callback(struct vnode *vp, __unused void *cargs)
{
	vnode_lock_spin(vp);
	vp->v_flag |= VLOCKLOCAL;
	vnode_unlock(vp);

	return VNODE_RETURNED;
}

void
vfs_setlocklocal(mount_t mp)
{
	mount_lock_spin(mp);
	mp->mnt_kern_flag |= MNTK_LOCK_LOCAL;
	mount_unlock(mp);

	/*
	 * The number of active vnodes is expected to be
	 * very small when vfs_setlocklocal is invoked.
	 */
	vnode_iterate(mp, 0, setlocklocal_callback, NULL);
}

void
vfs_setcompoundopen(mount_t mp)
{
	mount_lock_spin(mp);
	mp->mnt_compound_ops |= COMPOUND_VNOP_OPEN;
	mount_unlock(mp);
}

void
vnode_setswapmount(vnode_t vp)
{
	mount_lock(vp->v_mount);
	vp->v_mount->mnt_kern_flag |= MNTK_SWAP_MOUNT;
	mount_unlock(vp->v_mount);
}


int64_t
vnode_getswappin_avail(vnode_t vp)
{
	int64_t max_swappin_avail = 0;

	mount_lock(vp->v_mount);
	if (vp->v_mount->mnt_ioflags & MNT_IOFLAGS_SWAPPIN_SUPPORTED) {
		max_swappin_avail = vp->v_mount->mnt_max_swappin_available;
	}
	mount_unlock(vp->v_mount);

	return max_swappin_avail;
}


void
vn_setunionwait(vnode_t vp)
{
	vnode_lock_spin(vp);
	vp->v_flag |= VISUNION;
	vnode_unlock(vp);
}


void
vn_checkunionwait(vnode_t vp)
{
	vnode_lock_spin(vp);
	while ((vp->v_flag & VISUNION) == VISUNION) {
		msleep((caddr_t)&vp->v_flag, &vp->v_lock, 0, 0, 0);
	}
	vnode_unlock(vp);
}

void
vn_clearunionwait(vnode_t vp, int locked)
{
	if (!locked) {
		vnode_lock_spin(vp);
	}
	if ((vp->v_flag & VISUNION) == VISUNION) {
		vp->v_flag &= ~VISUNION;
		wakeup((caddr_t)&vp->v_flag);
	}
	if (!locked) {
		vnode_unlock(vp);
	}
}

/*
 * Removes orphaned AppleDouble files during an rmdir.
 * Works by:
 * 1. vnode_suspend().
 * 2. Call VNOP_READDIR() till the end of directory is reached.
 * 3. Check if the directory entries returned are regular files with names starting with "._".  If not, return ENOTEMPTY.
 * 4. Continue (2) and (3) till end of directory is reached.
 * 5. If all the entries in the directory were files with "._" names, delete all the files.
 * 6. vnode_resume().
 * 7. If deletion of all files succeeded, call VNOP_RMDIR() again.
 */

errno_t
rmdir_remove_orphaned_appleDouble(vnode_t vp, vfs_context_t ctx, int * restart_flag)
{
#define UIO_BUFF_SIZE 2048
	uio_t auio = NULL;
	int eofflag, siz = UIO_BUFF_SIZE, nentries = 0;
	int open_flag = 0, full_erase_flag = 0;
	char uio_buf[UIO_SIZEOF(1)];
	char *rbuf = NULL;
	void *dir_pos;
	void *dir_end;
	struct dirent *dp;
	errno_t error;

	error = vnode_suspend(vp);

	/*
	 * restart_flag is set so that the calling rmdir sleeps and resets
	 */
	if (error == EBUSY) {
		*restart_flag = 1;
	}
	if (error != 0) {
		return error;
	}

	/*
	 * set up UIO
	 */
	MALLOC(rbuf, caddr_t, siz, M_TEMP, M_WAITOK);
	if (rbuf) {
		auio = uio_createwithbuffer(1, 0, UIO_SYSSPACE, UIO_READ,
		    &uio_buf[0], sizeof(uio_buf));
	}
	if (!rbuf || !auio) {
		error = ENOMEM;
		goto outsc;
	}

	uio_setoffset(auio, 0);

	eofflag = 0;

	if ((error = VNOP_OPEN(vp, FREAD, ctx))) {
		goto outsc;
	} else {
		open_flag = 1;
	}

	/*
	 * First pass checks if all files are AppleDouble files.
	 */

	do {
		siz = UIO_BUFF_SIZE;
		uio_reset(auio, uio_offset(auio), UIO_SYSSPACE, UIO_READ);
		uio_addiov(auio, CAST_USER_ADDR_T(rbuf), UIO_BUFF_SIZE);

		if ((error = VNOP_READDIR(vp, auio, 0, &eofflag, &nentries, ctx))) {
			goto outsc;
		}

		if (uio_resid(auio) != 0) {
			siz -= uio_resid(auio);
		}

		/*
		 * Iterate through directory
		 */
		dir_pos = (void*) rbuf;
		dir_end = (void*) (rbuf + siz);
		dp = (struct dirent*) (dir_pos);

		if (dir_pos == dir_end) {
			eofflag = 1;
		}

		while (dir_pos < dir_end) {
			/*
			 * Check for . and .. as well as directories
			 */
			if (dp->d_ino != 0 &&
			    !((dp->d_namlen == 1 && dp->d_name[0] == '.') ||
			    (dp->d_namlen == 2 && dp->d_name[0] == '.' && dp->d_name[1] == '.'))) {
				/*
				 * Check for irregular files and ._ files
				 * If there is a ._._ file abort the op
				 */
				if (dp->d_namlen < 2 ||
				    strncmp(dp->d_name, "._", 2) ||
				    (dp->d_namlen >= 4 && !strncmp(&(dp->d_name[2]), "._", 2))) {
					error = ENOTEMPTY;
					goto outsc;
				}
			}
			dir_pos = (void*) ((uint8_t*)dir_pos + dp->d_reclen);
			dp = (struct dirent*)dir_pos;
		}

		/*
		 * workaround for HFS/NFS setting eofflag before end of file
		 */
		if (vp->v_tag == VT_HFS && nentries > 2) {
			eofflag = 0;
		}

		if (vp->v_tag == VT_NFS) {
			if (eofflag && !full_erase_flag) {
				full_erase_flag = 1;
				eofflag = 0;
				uio_reset(auio, 0, UIO_SYSSPACE, UIO_READ);
			} else if (!eofflag && full_erase_flag) {
				full_erase_flag = 0;
			}
		}
	} while (!eofflag);
	/*
	 * If we've made it here all the files in the dir are ._ files.
	 * We can delete the files even though the node is suspended
	 * because we are the owner of the file.
	 */

	uio_reset(auio, 0, UIO_SYSSPACE, UIO_READ);
	eofflag = 0;
	full_erase_flag = 0;

	do {
		siz = UIO_BUFF_SIZE;
		uio_reset(auio, uio_offset(auio), UIO_SYSSPACE, UIO_READ);
		uio_addiov(auio, CAST_USER_ADDR_T(rbuf), UIO_BUFF_SIZE);

		error = VNOP_READDIR(vp, auio, 0, &eofflag, &nentries, ctx);

		if (error != 0) {
			goto outsc;
		}

		if (uio_resid(auio) != 0) {
			siz -= uio_resid(auio);
		}

		/*
		 * Iterate through directory
		 */
		dir_pos = (void*) rbuf;
		dir_end = (void*) (rbuf + siz);
		dp = (struct dirent*) dir_pos;

		if (dir_pos == dir_end) {
			eofflag = 1;
		}

		while (dir_pos < dir_end) {
			/*
			 * Check for . and .. as well as directories
			 */
			if (dp->d_ino != 0 &&
			    !((dp->d_namlen == 1 && dp->d_name[0] == '.') ||
			    (dp->d_namlen == 2 && dp->d_name[0] == '.' && dp->d_name[1] == '.'))
			    ) {
				error = unlink1(ctx, vp,
				    CAST_USER_ADDR_T(dp->d_name), UIO_SYSSPACE,
				    VNODE_REMOVE_SKIP_NAMESPACE_EVENT |
				    VNODE_REMOVE_NO_AUDIT_PATH);

				if (error && error != ENOENT) {
					goto outsc;
				}
			}
			dir_pos = (void*) ((uint8_t*)dir_pos + dp->d_reclen);
			dp = (struct dirent*)dir_pos;
		}

		/*
		 * workaround for HFS/NFS setting eofflag before end of file
		 */
		if (vp->v_tag == VT_HFS && nentries > 2) {
			eofflag = 0;
		}

		if (vp->v_tag == VT_NFS) {
			if (eofflag && !full_erase_flag) {
				full_erase_flag = 1;
				eofflag = 0;
				uio_reset(auio, 0, UIO_SYSSPACE, UIO_READ);
			} else if (!eofflag && full_erase_flag) {
				full_erase_flag = 0;
			}
		}
	} while (!eofflag);


	error = 0;

outsc:
	if (open_flag) {
		VNOP_CLOSE(vp, FREAD, ctx);
	}

	if (auio) {
		uio_free(auio);
	}
	FREE(rbuf, M_TEMP);

	vnode_resume(vp);


	return error;
}


void
lock_vnode_and_post(vnode_t vp, int kevent_num)
{
	/* Only take the lock if there's something there! */
	if (vp->v_knotes.slh_first != NULL) {
		vnode_lock(vp);
		KNOTE(&vp->v_knotes, kevent_num);
		vnode_unlock(vp);
	}
}

void panic_print_vnodes(void);

/* define PANIC_PRINTS_VNODES only if investigation is required. */
#ifdef PANIC_PRINTS_VNODES

static const char *
__vtype(uint16_t vtype)
{
	switch (vtype) {
	case VREG:
		return "R";
	case VDIR:
		return "D";
	case VBLK:
		return "B";
	case VCHR:
		return "C";
	case VLNK:
		return "L";
	case VSOCK:
		return "S";
	case VFIFO:
		return "F";
	case VBAD:
		return "x";
	case VSTR:
		return "T";
	case VCPLX:
		return "X";
	default:
		return "?";
	}
}

/*
 * build a path from the bottom up
 * NOTE: called from the panic path - no alloc'ing of memory and no locks!
 */
static char *
__vpath(vnode_t vp, char *str, int len, int depth)
{
	int vnm_len;
	const char *src;
	char *dst;

	if (len <= 0) {
		return str;
	}
	/* str + len is the start of the string we created */
	if (!vp->v_name) {
		return str + len;
	}

	/* follow mount vnodes to get the full path */
	if ((vp->v_flag & VROOT)) {
		if (vp->v_mount != NULL && vp->v_mount->mnt_vnodecovered) {
			return __vpath(vp->v_mount->mnt_vnodecovered,
			           str, len, depth + 1);
		}
		return str + len;
	}

	src = vp->v_name;
	vnm_len = strlen(src);
	if (vnm_len > len) {
		/* truncate the name to fit in the string */
		src += (vnm_len - len);
		vnm_len = len;
	}

	/* start from the back and copy just characters (no NULLs) */

	/* this will chop off leaf path (file) names */
	if (depth > 0) {
		dst = str + len - vnm_len;
		memcpy(dst, src, vnm_len);
		len -= vnm_len;
	} else {
		dst = str + len;
	}

	if (vp->v_parent && len > 1) {
		/* follow parents up the chain */
		len--;
		*(dst - 1) = '/';
		return __vpath(vp->v_parent, str, len, depth + 1);
	}

	return dst;
}
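
/*
 * Illustrative sketch (not part of the original source): __vpath() fills the
 * buffer from the back and returns a pointer into it, so callers print from
 * the returned pointer rather than from the start of the buffer, exactly as
 * panic_print_vnodes() does below.  "vp" here is a hypothetical vnode.
 */
#if 0 /* example only */
	char buf[257];
	char *nm;

	buf[sizeof(buf) - 1] = '\0';               /* NUL lives at the very end */
	nm = __vpath(vp, buf, sizeof(buf) - 1, 0); /* path grows toward the front */
	printf("path: %s\n", nm);
#endif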

#define SANE_VNODE_PRINT_LIMIT 5000
void
panic_print_vnodes(void)
{
	mount_t mnt;
	vnode_t vp;
	int nvnodes = 0;
	const char *type;
	char *nm;
	char vname[257];

	paniclog_append_noflush("\n***** VNODES *****\n"
	    "TYPE UREF ICNT PATH\n");

	/* NUL-terminate the path name */
	vname[sizeof(vname) - 1] = '\0';

	/*
	 * iterate all vnodelist items in all mounts (mntlist) -> mnt_vnodelist
	 */
	TAILQ_FOREACH(mnt, &mountlist, mnt_list) {
		if (!ml_validate_nofault((vm_offset_t)mnt, sizeof(mount_t))) {
			paniclog_append_noflush("Unable to iterate the mount list %p - encountered an invalid mount pointer %p \n",
			    &mountlist, mnt);
			break;
		}

		TAILQ_FOREACH(vp, &mnt->mnt_vnodelist, v_mntvnodes) {
			if (!ml_validate_nofault((vm_offset_t)vp, sizeof(vnode_t))) {
				paniclog_append_noflush("Unable to iterate the vnode list %p - encountered an invalid vnode pointer %p \n",
				    &mnt->mnt_vnodelist, vp);
				break;
			}

			if (++nvnodes > SANE_VNODE_PRINT_LIMIT) {
				return;
			}
			type = __vtype(vp->v_type);
			nm = __vpath(vp, vname, sizeof(vname) - 1, 0);
			paniclog_append_noflush("%s %0d %0d %s\n",
			    type, vp->v_usecount, vp->v_iocount, nm);
		}
	}
}

#else /* !PANIC_PRINTS_VNODES */
void
panic_print_vnodes(void)
{
	return;
}
#endif


#ifdef JOE_DEBUG
static void
record_vp(vnode_t vp, int count)
{
	struct uthread *ut;

#if CONFIG_TRIGGERS
	if (vp->v_resolve) {
		return;
	}
#endif
	if ((vp->v_flag & VSYSTEM)) {
		return;
	}

	ut = get_bsdthread_info(current_thread());
	ut->uu_iocount += count;

	if (count == 1) {
		if (ut->uu_vpindex < 32) {
			OSBacktrace((void **)&ut->uu_pcs[ut->uu_vpindex][0], 10);

			ut->uu_vps[ut->uu_vpindex] = vp;
			ut->uu_vpindex++;
		}
	}
}
#endif


#if CONFIG_TRIGGERS

#define TRIG_DEBUG 0

#if TRIG_DEBUG
#define TRIG_LOG(...) do { printf("%s: ", __FUNCTION__); printf(__VA_ARGS__); } while (0)
#else
#define TRIG_LOG(...)
#endif

/*
 * Resolver result functions
 */

resolver_result_t
vfs_resolver_result(uint32_t seq, enum resolver_status stat, int aux)
{
	/*
	 * |<--- 32 --->|<--- 28 --->|<- 4 ->|
	 *     sequence    auxiliary   status
	 */
	return (((uint64_t)seq) << 32) |
	       (((uint64_t)(aux & 0x0fffffff)) << 4) |
	       (uint64_t)(stat & 0x0000000F);
}

enum resolver_status
vfs_resolver_status(resolver_result_t result)
{
	/* lower 4 bits is status */
	return result & 0x0000000F;
}

uint32_t
vfs_resolver_sequence(resolver_result_t result)
{
	/* upper 32 bits is sequence */
	return (uint32_t)(result >> 32);
}

int
vfs_resolver_auxiliary(resolver_result_t result)
{
	/* 28 bits of auxiliary */
	return (int)(((uint32_t)(result & 0xFFFFFFF0)) >> 4);
}
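
/*
 * Illustrative sketch (not part of the original source): packing a result
 * with vfs_resolver_result() and recovering each field with the accessors
 * above.  The sequence number and errno value are made up for the example.
 */
#if 0 /* example only */
	uint32_t seq = 42;              /* per-vnode, monotonically increasing */
	resolver_result_t r;

	/* report a failed resolve, carrying ENOENT in the auxiliary field */
	r = vfs_resolver_result(seq, RESOLVER_ERROR, ENOENT);

	assert(vfs_resolver_status(r) == RESOLVER_ERROR);
	assert(vfs_resolver_sequence(r) == seq);
	assert(vfs_resolver_auxiliary(r) == ENOENT);
#endif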

/*
 * SPI
 * Call in for resolvers to update vnode trigger state
 */
int
vnode_trigger_update(vnode_t vp, resolver_result_t result)
{
	vnode_resolve_t rp;
	uint32_t seq;
	enum resolver_status stat;

	if (vp->v_resolve == NULL) {
		return EINVAL;
	}

	stat = vfs_resolver_status(result);
	seq = vfs_resolver_sequence(result);

	if ((stat != RESOLVER_RESOLVED) && (stat != RESOLVER_UNRESOLVED)) {
		return EINVAL;
	}

	rp = vp->v_resolve;
	lck_mtx_lock(&rp->vr_lock);

	if (seq > rp->vr_lastseq) {
		if (stat == RESOLVER_RESOLVED) {
			rp->vr_flags |= VNT_RESOLVED;
		} else {
			rp->vr_flags &= ~VNT_RESOLVED;
		}

		rp->vr_lastseq = seq;
	}

	lck_mtx_unlock(&rp->vr_lock);

	return 0;
}
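
/*
 * Illustrative sketch (not part of the original source): an asynchronous
 * resolver completing and reporting its new state through the SPI above.
 * "my_vp", "my_seq" and "err" are hypothetical; a real resolver would keep
 * them in its private state (vr_data).
 */
#if 0 /* example only */
static void
my_resolver_done(vnode_t my_vp, uint32_t my_seq, int err)
{
	resolver_result_t result;

	result = (err == 0)
	    ? vfs_resolver_result(my_seq, RESOLVER_RESOLVED, 0)
	    : vfs_resolver_result(my_seq, RESOLVER_UNRESOLVED, err);

	/* stale sequence numbers (seq <= vr_lastseq) are ignored */
	(void) vnode_trigger_update(my_vp, result);
}
#endif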

static int
vnode_resolver_attach(vnode_t vp, vnode_resolve_t rp, boolean_t ref)
{
	int error;

	vnode_lock_spin(vp);
	if (vp->v_resolve != NULL) {
		vnode_unlock(vp);
		return EINVAL;
	} else {
		vp->v_resolve = rp;
	}
	vnode_unlock(vp);

	if (ref) {
		error = vnode_ref_ext(vp, O_EVTONLY, VNODE_REF_FORCE);
		if (error != 0) {
			panic("VNODE_REF_FORCE didn't help...");
		}
	}

	return 0;
}

/*
 * VFS internal interfaces for vnode triggers
 *
 * vnode must already have an io count on entry
 * v_resolve is stable when io count is non-zero
 */
static int
vnode_resolver_create(mount_t mp, vnode_t vp, struct vnode_trigger_param *tinfo, boolean_t external)
{
	vnode_resolve_t rp;
	int result;
	char byte;

#if 1
	/* minimum pointer test (debugging) */
	if (tinfo->vnt_data) {
		byte = *((char *)tinfo->vnt_data);
	}
#endif
	MALLOC(rp, vnode_resolve_t, sizeof(*rp), M_TEMP, M_WAITOK);
	if (rp == NULL) {
		return ENOMEM;
	}

	lck_mtx_init(&rp->vr_lock, trigger_vnode_lck_grp, trigger_vnode_lck_attr);

	rp->vr_resolve_func = tinfo->vnt_resolve_func;
	rp->vr_unresolve_func = tinfo->vnt_unresolve_func;
	rp->vr_rearm_func = tinfo->vnt_rearm_func;
	rp->vr_reclaim_func = tinfo->vnt_reclaim_func;
	rp->vr_data = tinfo->vnt_data;
	rp->vr_lastseq = 0;
	rp->vr_flags = tinfo->vnt_flags & VNT_VALID_MASK;
	if (external) {
		rp->vr_flags |= VNT_EXTERNAL;
	}

	result = vnode_resolver_attach(vp, rp, external);
	if (result != 0) {
		goto out;
	}

	if (mp) {
		OSAddAtomic(1, &mp->mnt_numtriggers);
	}

	return result;

out:
	FREE(rp, M_TEMP);
	return result;
}

static void
vnode_resolver_release(vnode_resolve_t rp)
{
	/*
	 * Give them a chance to free any private data
	 */
	if (rp->vr_data && rp->vr_reclaim_func) {
		rp->vr_reclaim_func(NULLVP, rp->vr_data);
	}

	lck_mtx_destroy(&rp->vr_lock, trigger_vnode_lck_grp);
	FREE(rp, M_TEMP);
}

/* Called after the vnode has been drained */
static void
vnode_resolver_detach(vnode_t vp)
{
	vnode_resolve_t rp;
	mount_t mp;

	mp = vnode_mount(vp);

	vnode_lock(vp);
	rp = vp->v_resolve;
	vp->v_resolve = NULL;
	vnode_unlock(vp);

	if ((rp->vr_flags & VNT_EXTERNAL) != 0) {
		vnode_rele_ext(vp, O_EVTONLY, 1);
	}

	vnode_resolver_release(rp);

	/* Keep count of active trigger vnodes per mount */
	OSAddAtomic(-1, &mp->mnt_numtriggers);
}


__private_extern__
void
vnode_trigger_rearm(vnode_t vp, vfs_context_t ctx)
{
	vnode_resolve_t rp;
	resolver_result_t result;
	enum resolver_status status;
	uint32_t seq;

	if ((vp->v_resolve == NULL) ||
	    (vp->v_resolve->vr_rearm_func == NULL) ||
	    (vp->v_resolve->vr_flags & VNT_AUTO_REARM) == 0) {
		return;
	}

	rp = vp->v_resolve;
	lck_mtx_lock(&rp->vr_lock);

	/*
	 * Check if VFS initiated this unmount.  If so, we'll catch it after the unresolve completes.
	 */
	if (rp->vr_flags & VNT_VFS_UNMOUNTED) {
		lck_mtx_unlock(&rp->vr_lock);
		return;
	}

	/* Check if this vnode is already armed */
	if ((rp->vr_flags & VNT_RESOLVED) == 0) {
		lck_mtx_unlock(&rp->vr_lock);
		return;
	}

	lck_mtx_unlock(&rp->vr_lock);

	result = rp->vr_rearm_func(vp, 0, rp->vr_data, ctx);
	status = vfs_resolver_status(result);
	seq = vfs_resolver_sequence(result);

	lck_mtx_lock(&rp->vr_lock);
	if (seq > rp->vr_lastseq) {
		if (status == RESOLVER_UNRESOLVED) {
			rp->vr_flags &= ~VNT_RESOLVED;
		}
		rp->vr_lastseq = seq;
	}
	lck_mtx_unlock(&rp->vr_lock);
}

__private_extern__
int
vnode_trigger_resolve(vnode_t vp, struct nameidata *ndp, vfs_context_t ctx)
{
	vnode_resolve_t rp;
	enum path_operation op;
	resolver_result_t result;
	enum resolver_status status;
	uint32_t seq;

	/* Only trigger on topmost vnodes */
	if ((vp->v_resolve == NULL) ||
	    (vp->v_resolve->vr_resolve_func == NULL) ||
	    (vp->v_mountedhere != NULL)) {
		return 0;
	}

	rp = vp->v_resolve;
	lck_mtx_lock(&rp->vr_lock);

	/* Check if this vnode is already resolved */
	if (rp->vr_flags & VNT_RESOLVED) {
		lck_mtx_unlock(&rp->vr_lock);
		return 0;
	}

	lck_mtx_unlock(&rp->vr_lock);

#if CONFIG_MACF
	int rv = mac_vnode_check_trigger_resolve(ctx, vp, &ndp->ni_cnd);
	if (rv != 0) {
		return rv;
	}
#endif

	/*
	 * XXX
	 * assumes that the resolver will not access this trigger vnode (otherwise the kernel will deadlock)
	 * is there any way to know this???
	 * there can also be other legitimate lookups in parallel
	 *
	 * XXX - should we call this on a separate thread with a timeout?
	 *
	 * XXX - should we use ISLASTCN to pick the op value???  Perhaps only leafs should
	 * get the richer set and non-leafs should get generic OP_LOOKUP?  TBD
	 */
	op = (ndp->ni_op < OP_MAXOP) ? ndp->ni_op : OP_LOOKUP;

	result = rp->vr_resolve_func(vp, &ndp->ni_cnd, op, 0, rp->vr_data, ctx);
	status = vfs_resolver_status(result);
	seq = vfs_resolver_sequence(result);

	lck_mtx_lock(&rp->vr_lock);
	if (seq > rp->vr_lastseq) {
		if (status == RESOLVER_RESOLVED) {
			rp->vr_flags |= VNT_RESOLVED;
		}
		rp->vr_lastseq = seq;
	}
	lck_mtx_unlock(&rp->vr_lock);

	/* On resolver errors, propagate the error back up */
	return status == RESOLVER_ERROR ? vfs_resolver_auxiliary(result) : 0;
}

static int
vnode_trigger_unresolve(vnode_t vp, int flags, vfs_context_t ctx)
{
	vnode_resolve_t rp;
	resolver_result_t result;
	enum resolver_status status;
	uint32_t seq;

	if ((vp->v_resolve == NULL) || (vp->v_resolve->vr_unresolve_func == NULL)) {
		return 0;
	}

	rp = vp->v_resolve;
	lck_mtx_lock(&rp->vr_lock);

	/* Check if this vnode is already resolved */
	if ((rp->vr_flags & VNT_RESOLVED) == 0) {
		printf("vnode_trigger_unresolve: not currently resolved\n");
		lck_mtx_unlock(&rp->vr_lock);
		return 0;
	}

	rp->vr_flags |= VNT_VFS_UNMOUNTED;

	lck_mtx_unlock(&rp->vr_lock);

	/*
	 * XXX
	 * assumes that the resolver will not access this trigger vnode (otherwise the kernel will deadlock)
	 * there can also be other legitimate lookups in parallel
	 *
	 * XXX - should we call this on a separate thread with a timeout?
	 */

	result = rp->vr_unresolve_func(vp, flags, rp->vr_data, ctx);
	status = vfs_resolver_status(result);
	seq = vfs_resolver_sequence(result);

	lck_mtx_lock(&rp->vr_lock);
	if (seq > rp->vr_lastseq) {
		if (status == RESOLVER_UNRESOLVED) {
			rp->vr_flags &= ~VNT_RESOLVED;
		}
		rp->vr_lastseq = seq;
	}
	rp->vr_flags &= ~VNT_VFS_UNMOUNTED;
	lck_mtx_unlock(&rp->vr_lock);

	/* On resolver errors, propagate the error back up */
	return status == RESOLVER_ERROR ? vfs_resolver_auxiliary(result) : 0;
}

static int
triggerisdescendant(mount_t mp, mount_t rmp)
{
	int match = FALSE;

	/*
	 * walk up vnode covered chain looking for a match
	 */
	name_cache_lock_shared();

	while (1) {
		vnode_t vp;

		/* did we encounter "/" ? */
		if (mp->mnt_flag & MNT_ROOTFS) {
			break;
		}

		vp = mp->mnt_vnodecovered;
		if (vp == NULLVP) {
			break;
		}

		mp = vp->v_mount;
		if (mp == rmp) {
			match = TRUE;
			break;
		}
	}

	name_cache_unlock();

	return match;
}

struct trigger_unmount_info {
	vfs_context_t ctx;
	mount_t top_mp;
	vnode_t trigger_vp;
	mount_t trigger_mp;
	uint32_t trigger_vid;
	int flags;
};

static int
trigger_unmount_callback(mount_t mp, void * arg)
{
	struct trigger_unmount_info * infop = (struct trigger_unmount_info *)arg;
	boolean_t mountedtrigger = FALSE;

	/*
	 * When we encounter the top level mount we're done
	 */
	if (mp == infop->top_mp) {
		return VFS_RETURNED_DONE;
	}

	if ((mp->mnt_vnodecovered == NULL) ||
	    (vnode_getwithref(mp->mnt_vnodecovered) != 0)) {
		return VFS_RETURNED;
	}

	if ((mp->mnt_vnodecovered->v_mountedhere == mp) &&
	    (mp->mnt_vnodecovered->v_resolve != NULL) &&
	    (mp->mnt_vnodecovered->v_resolve->vr_flags & VNT_RESOLVED)) {
		mountedtrigger = TRUE;
	}
	vnode_put(mp->mnt_vnodecovered);

	/*
	 * When we encounter a mounted trigger, check if it's under the top level mount
	 */
	if (!mountedtrigger || !triggerisdescendant(mp, infop->top_mp)) {
		return VFS_RETURNED;
	}

	/*
	 * Process any pending nested mount (now that it's not referenced)
	 */
	if ((infop->trigger_vp != NULLVP) &&
	    (vnode_getwithvid(infop->trigger_vp, infop->trigger_vid) == 0)) {
		vnode_t vp = infop->trigger_vp;
		int error;

		infop->trigger_vp = NULLVP;

		if (mp == vp->v_mountedhere) {
			vnode_put(vp);
			printf("trigger_unmount_callback: unexpected match '%s'\n",
			    mp->mnt_vfsstat.f_mntonname);
			return VFS_RETURNED;
		}
		if (infop->trigger_mp != vp->v_mountedhere) {
			vnode_put(vp);
			printf("trigger_unmount_callback: trigger mnt changed! (%p != %p)\n",
			    infop->trigger_mp, vp->v_mountedhere);
			goto savenext;
		}

		error = vnode_trigger_unresolve(vp, infop->flags, infop->ctx);
		vnode_put(vp);
		if (error) {
			printf("unresolving: '%s', err %d\n",
			    vp->v_mountedhere ? vp->v_mountedhere->mnt_vfsstat.f_mntonname :
			    "???", error);
			return VFS_RETURNED_DONE; /* stop iteration on errors */
		}
	}
savenext:
	/*
	 * We can't call the resolver here since we hold a mount iter
	 * ref on mp, so save its covered vp for later processing
	 */
	infop->trigger_vp = mp->mnt_vnodecovered;
	if ((infop->trigger_vp != NULLVP) &&
	    (vnode_getwithref(infop->trigger_vp) == 0)) {
		if (infop->trigger_vp->v_mountedhere == mp) {
			infop->trigger_vid = infop->trigger_vp->v_id;
			infop->trigger_mp = mp;
		}
		vnode_put(infop->trigger_vp);
	}

	return VFS_RETURNED;
}

/*
 * Attempt to unmount any trigger mounts nested underneath a mount.
 * This is a best effort attempt and no retries are performed here.
 *
 * Note: mp->mnt_rwlock is held exclusively on entry (so be careful)
 */
__private_extern__
void
vfs_nested_trigger_unmounts(mount_t mp, int flags, vfs_context_t ctx)
{
	struct trigger_unmount_info info;

	/* Must have trigger vnodes */
	if (mp->mnt_numtriggers == 0) {
		return;
	}
	/* Avoid recursive requests (by checking covered vnode) */
	if ((mp->mnt_vnodecovered != NULL) &&
	    (vnode_getwithref(mp->mnt_vnodecovered) == 0)) {
		boolean_t recursive = FALSE;

		if ((mp->mnt_vnodecovered->v_mountedhere == mp) &&
		    (mp->mnt_vnodecovered->v_resolve != NULL) &&
		    (mp->mnt_vnodecovered->v_resolve->vr_flags & VNT_VFS_UNMOUNTED)) {
			recursive = TRUE;
		}
		vnode_put(mp->mnt_vnodecovered);
		if (recursive) {
			return;
		}
	}

	/*
	 * Attempt to unmount any nested trigger mounts (best effort)
	 */
	info.ctx = ctx;
	info.top_mp = mp;
	info.trigger_vp = NULLVP;
	info.trigger_vid = 0;
	info.trigger_mp = NULL;
	info.flags = flags;

	(void) vfs_iterate(VFS_ITERATE_TAIL_FIRST, trigger_unmount_callback, &info);

	/*
	 * Process remaining nested mount (now that it's not referenced)
	 */
	if ((info.trigger_vp != NULLVP) &&
	    (vnode_getwithvid(info.trigger_vp, info.trigger_vid) == 0)) {
		vnode_t vp = info.trigger_vp;

		if (info.trigger_mp == vp->v_mountedhere) {
			(void) vnode_trigger_unresolve(vp, flags, ctx);
		}
		vnode_put(vp);
	}
}

int
vfs_addtrigger(mount_t mp, const char *relpath, struct vnode_trigger_info *vtip, vfs_context_t ctx)
{
	struct nameidata nd;
	int res;
	vnode_t rvp, vp;
	struct vnode_trigger_param vtp;

	/*
	 * Must be called from a trigger callback, wherein rwlock is held
	 */
	lck_rw_assert(&mp->mnt_rwlock, LCK_RW_ASSERT_HELD);

	TRIG_LOG("Adding trigger at %s\n", relpath);
	TRIG_LOG("Trying VFS_ROOT\n");

	/*
	 * We do a lookup starting at the root of the mountpoint, unwilling
	 * to cross into other mountpoints.
	 */
	res = VFS_ROOT(mp, &rvp, ctx);
	if (res != 0) {
		goto out;
	}

	TRIG_LOG("Trying namei\n");

	NDINIT(&nd, LOOKUP, OP_LOOKUP, USEDVP | NOCROSSMOUNT | FOLLOW, UIO_SYSSPACE,
	    CAST_USER_ADDR_T(relpath), ctx);
	nd.ni_dvp = rvp;
	res = namei(&nd);
	if (res != 0) {
		vnode_put(rvp);
		goto out;
	}

	vp = nd.ni_vp;
	nameidone(&nd);
	vnode_put(rvp);

	TRIG_LOG("Trying vnode_resolver_create()\n");

	/*
	 * Set up blob.  vnode_create() takes a larger structure
	 * with creation info, and we needed something different
	 * for this case.  One needs to win, or we need to munge both;
	 * vnode_create() wins.
	 */
	bzero(&vtp, sizeof(vtp));
	vtp.vnt_resolve_func = vtip->vti_resolve_func;
	vtp.vnt_unresolve_func = vtip->vti_unresolve_func;
	vtp.vnt_rearm_func = vtip->vti_rearm_func;
	vtp.vnt_reclaim_func = vtip->vti_reclaim_func;
	vtp.vnt_data = vtip->vti_data;
	vtp.vnt_flags = vtip->vti_flags;

	res = vnode_resolver_create(mp, vp, &vtp, TRUE);
	vnode_put(vp);
out:
	TRIG_LOG("Returning %d\n", res);
	return res;
}
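
/*
 * Illustrative sketch (not part of the original source): registering a
 * trigger at a path relative to the root of "mp".  The callbacks and
 * "my_state" are hypothetical; vfs_addtrigger() must be called from a
 * trigger callback context, with mnt_rwlock held (see the assert above).
 */
#if 0 /* example only */
	struct vnode_trigger_info vti;
	int error;

	bzero(&vti, sizeof(vti));
	vti.vti_resolve_func = my_resolve;     /* mounts something at the trigger */
	vti.vti_unresolve_func = my_unresolve; /* undoes the mount */
	vti.vti_reclaim_func = my_reclaim;     /* frees vti_data at vnode reclaim */
	vti.vti_data = my_state;

	error = vfs_addtrigger(mp, "some/relative/path", &vti, ctx);
#endif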

#endif /* CONFIG_TRIGGERS */

vm_offset_t
kdebug_vnode(vnode_t vp)
{
	return VM_KERNEL_ADDRPERM(vp);
}

static int flush_cache_on_write = 0;
SYSCTL_INT(_kern, OID_AUTO, flush_cache_on_write,
    CTLFLAG_RW | CTLFLAG_LOCKED, &flush_cache_on_write, 0,
    "always flush the drive cache on writes to uncached files");

int
vnode_should_flush_after_write(vnode_t vp, int ioflag)
{
	return flush_cache_on_write
	       && (ISSET(ioflag, IO_NOCACHE) || vnode_isnocache(vp));
}
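
/*
 * Usage note (illustrative, not part of the original source): the knob
 * above is published as "kern.flush_cache_on_write"; e.g. from user space:
 *
 *	sysctl kern.flush_cache_on_write=1
 *
 * after which writes to uncached (IO_NOCACHE) files also flush the drive's
 * write cache.
 */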

/*
 * sysctl for use by disk I/O tracing tools to get the list of existing
 * vnodes' paths
 */

struct vnode_trace_paths_context {
	uint64_t count;
	long path[MAXPATHLEN / sizeof(long) + 1]; /* + 1 in case sizeof (long) does not divide MAXPATHLEN */
};

static int
vnode_trace_path_callback(struct vnode *vp, void *arg)
{
	int len, rv;
	struct vnode_trace_paths_context *ctx;

	ctx = arg;

	len = sizeof(ctx->path);
	rv = vn_getpath(vp, (char *)ctx->path, &len);
	/* vn_getpath() NUL-terminates, and len includes the NUL */

	if (!rv) {
		kdebug_vfs_lookup(ctx->path, len, vp,
		    KDBG_VFS_LOOKUP_FLAG_LOOKUP | KDBG_VFS_LOOKUP_FLAG_NOPROCFILT);

		if (++(ctx->count) == 1000) {
			thread_yield_to_preemption();
			ctx->count = 0;
		}
	}

	return VNODE_RETURNED;
}

static int
vfs_trace_paths_callback(mount_t mp, void *arg)
{
	if (mp->mnt_flag & MNT_LOCAL) {
		vnode_iterate(mp, VNODE_ITERATE_ALL, vnode_trace_path_callback, arg);
	}

	return VFS_RETURNED;
}

static int sysctl_vfs_trace_paths SYSCTL_HANDLER_ARGS {
	struct vnode_trace_paths_context ctx;

	(void)oidp;
	(void)arg1;
	(void)arg2;
	(void)req;

	if (!kauth_cred_issuser(kauth_cred_get())) {
		return EPERM;
	}

	if (!kdebug_enable || !kdebug_debugid_enabled(VFS_LOOKUP)) {
		return EINVAL;
	}

	bzero(&ctx, sizeof(struct vnode_trace_paths_context));

	vfs_iterate(0, vfs_trace_paths_callback, &ctx);

	return 0;
}

SYSCTL_PROC(_vfs_generic, OID_AUTO, trace_paths, CTLFLAG_RD | CTLFLAG_LOCKED | CTLFLAG_MASKED, NULL, 0, &sysctl_vfs_trace_paths, "-", "trace_paths");
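
/*
 * Usage note (illustrative, not part of the original source): the handler
 * above registers as "vfs.generic.trace_paths".  Reading it requires root
 * and an active kdebug session with the VFS_LOOKUP debugid enabled; e.g.
 * a tracing tool starts kdebug, then does
 *
 *	sysctl vfs.generic.trace_paths
 *
 * to emit one VFS_LOOKUP event per vnode path on all local mounts.
 */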