/*
 * Copyright (c) 2000-2018 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California. All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_subr.c	8.31 (Berkeley) 5/26/95
 */
/*
 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
 * support for mandatory and extensible security protections. This notice
 * is included in support of clause 2.2 (b) of the Apple Public License,
 * Version 2.0.
 */

/*
 * External virtual filesystem routines
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc_internal.h>
#include <sys/kauth.h>
#include <sys/mount_internal.h>
#include <sys/time.h>
#include <sys/lock.h>
#include <sys/vnode.h>
#include <sys/vnode_internal.h>
#include <sys/stat.h>
#include <sys/namei.h>
#include <sys/ucred.h>
#include <sys/buf_internal.h>
#include <sys/errno.h>
#include <sys/malloc.h>
#include <sys/uio_internal.h>
#include <sys/uio.h>
#include <sys/domain.h>
#include <sys/mbuf.h>
#include <sys/syslog.h>
#include <sys/ubc_internal.h>
#include <sys/vm.h>
#include <sys/sysctl.h>
#include <sys/filedesc.h>
#include <sys/event.h>
#include <sys/kdebug.h>
#include <sys/user.h>
#include <sys/kern_memorystatus.h>
#include <sys/lockf.h>
#include <miscfs/fifofs/fifo.h>

#include <string.h>
#include <machine/machine_routines.h>

#include <kern/assert.h>
#include <mach/kern_return.h>
#include <kern/thread.h>
#include <kern/sched_prim.h>

#include <miscfs/specfs/specdev.h>

#include <mach/mach_types.h>
#include <mach/memory_object_types.h>
#include <mach/memory_object_control.h>

#include <kern/kalloc.h>	/* kalloc()/kfree() */
#include <kern/clock.h>		/* delay_for_interval() */
#include <libkern/OSAtomic.h>	/* OSAddAtomic() */
#if !CONFIG_EMBEDDED
#include <console/video_console.h>
#endif

#ifdef JOE_DEBUG
#include <libkern/OSDebug.h>
#endif

#include <vm/vm_protos.h>	/* vnode_pager_vrele() */

#if CONFIG_MACF
#include <security/mac_framework.h>
#endif

#include <vfs/vfs_disk_conditioner.h>
#include <libkern/section_keywords.h>

extern lck_grp_t *vnode_lck_grp;
extern lck_attr_t *vnode_lck_attr;

#if CONFIG_TRIGGERS
extern lck_grp_t *trigger_vnode_lck_grp;
extern lck_attr_t *trigger_vnode_lck_attr;
#endif

extern lck_mtx_t * mnt_list_mtx_lock;

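/*
 * Translation tables between the file-type bits of st_mode (S_IFMT) and
 * enum vtype: iftovt_tab is indexed by (mode & S_IFMT) >> 12, and
 * vttoif_tab is indexed by enum vtype to recover the S_IF* bits.
 */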
enum vtype iftovt_tab[16] = {
    VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
    VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
};
int vttoif_tab[9] = {
    0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
    S_IFSOCK, S_IFIFO, S_IFMT,
};


/* XXX These should be in a BSD accessible Mach header, but aren't. */
extern void memory_object_mark_used(
    memory_object_control_t control);

extern void memory_object_mark_unused(
    memory_object_control_t control,
    boolean_t rage);

extern void memory_object_mark_io_tracking(
    memory_object_control_t control);

/* XXX next prototype should be from <nfs/nfs.h> */
extern int nfs_vinvalbuf(vnode_t, int, vfs_context_t, int);

extern int paniclog_append_noflush(const char *format, ...);

/* XXX next prototype should be from <libsa/stdlib.h> but conflicts with libkern */
__private_extern__ void qsort(
    void * array,
    size_t nmembers,
    size_t member_size,
    int (*)(const void *, const void *));

__private_extern__ void vntblinit(void);
__private_extern__ int unlink1(vfs_context_t, vnode_t, user_addr_t,
    enum uio_seg, int);

extern int system_inshutdown;

static void vnode_list_add(vnode_t);
static void vnode_async_list_add(vnode_t);
static void vnode_list_remove(vnode_t);
static void vnode_list_remove_locked(vnode_t);

static void vnode_abort_advlocks(vnode_t);
static errno_t vnode_drain(vnode_t);
static void vgone(vnode_t, int flags);
static void vclean(vnode_t vp, int flag);
static void vnode_reclaim_internal(vnode_t, int, int, int);

static void vnode_dropiocount(vnode_t);

static vnode_t checkalias(vnode_t vp, dev_t nvp_rdev);
static int vnode_reload(vnode_t);
static int vnode_isinuse_locked(vnode_t, int, int);

static int unmount_callback(mount_t, __unused void *);

static void insmntque(vnode_t vp, mount_t mp);
static int mount_getvfscnt(void);
static int mount_fillfsids(fsid_t *, int);
static void vnode_iterate_setup(mount_t);
int vnode_umount_preflight(mount_t, vnode_t, int);
static int vnode_iterate_prepare(mount_t);
static int vnode_iterate_reloadq(mount_t);
static void vnode_iterate_clear(mount_t);
static mount_t vfs_getvfs_locked(fsid_t *);
static int vn_create_reg(vnode_t dvp, vnode_t *vpp, struct nameidata *ndp,
    struct vnode_attr *vap, uint32_t flags, int fmode, uint32_t *statusp, vfs_context_t ctx);
static int vnode_authattr_new_internal(vnode_t dvp, struct vnode_attr *vap, int noauth, uint32_t *defaulted_fieldsp, vfs_context_t ctx);

errno_t rmdir_remove_orphaned_appleDouble(vnode_t, vfs_context_t, int *);

#ifdef JOE_DEBUG
static void record_vp(vnode_t vp, int count);
#endif

#if CONFIG_JETSAM && (DEVELOPMENT || DEBUG)
extern int bootarg_no_vnode_jetsam; /* from bsd_init.c default value is 0 */
#endif /* CONFIG_JETSAM && (DEVELOPMENT || DEBUG) */

boolean_t root_is_CF_drive = FALSE;

#if CONFIG_TRIGGERS
static int vnode_resolver_create(mount_t, vnode_t, struct vnode_trigger_param *, boolean_t external);
static void vnode_resolver_detach(vnode_t);
#endif

TAILQ_HEAD(freelst, vnode) vnode_free_list;	/* vnode free list */
TAILQ_HEAD(deadlst, vnode) vnode_dead_list;	/* vnode dead list */
TAILQ_HEAD(async_work_lst, vnode) vnode_async_work_list;


TAILQ_HEAD(ragelst, vnode) vnode_rage_list;	/* vnode rapid age list */
struct timeval rage_tv;
int rage_limit = 0;
int ragevnodes = 0;

#define RAGE_LIMIT_MIN  100
#define RAGE_TIME_LIMIT 5

struct mntlist mountlist;	/* mounted filesystem list */
static int nummounts = 0;

#if DIAGNOSTIC
#define VLISTCHECK(fun, vp, list) \
    if ((vp)->v_freelist.tqe_prev == (struct vnode **)0xdeadb) \
        panic("%s: %s vnode not on %slist", (fun), (list), (list));
#else
#define VLISTCHECK(fun, vp, list)
#endif /* DIAGNOSTIC */

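/*
 * A vnode that is on none of the free/dead/rage/async lists has its
 * v_freelist.tqe_prev link set to the sentinel value 0xdeadb;
 * VLISTNONE() stores the sentinel and VONLIST() tests for it.
 */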
#define VLISTNONE(vp) \
    do { \
        (vp)->v_freelist.tqe_next = (struct vnode *)0; \
        (vp)->v_freelist.tqe_prev = (struct vnode **)0xdeadb; \
    } while(0)

#define VONLIST(vp) \
    ((vp)->v_freelist.tqe_prev != (struct vnode **)0xdeadb)

/* remove a vnode from free vnode list */
#define VREMFREE(fun, vp) \
    do { \
        VLISTCHECK((fun), (vp), "free"); \
        TAILQ_REMOVE(&vnode_free_list, (vp), v_freelist); \
        VLISTNONE((vp)); \
        freevnodes--; \
    } while(0)


/* remove a vnode from dead vnode list */
#define VREMDEAD(fun, vp) \
    do { \
        VLISTCHECK((fun), (vp), "dead"); \
        TAILQ_REMOVE(&vnode_dead_list, (vp), v_freelist); \
        VLISTNONE((vp)); \
        vp->v_listflag &= ~VLIST_DEAD; \
        deadvnodes--; \
    } while(0)


/* remove a vnode from async work vnode list */
#define VREMASYNC_WORK(fun, vp) \
    do { \
        VLISTCHECK((fun), (vp), "async_work"); \
        TAILQ_REMOVE(&vnode_async_work_list, (vp), v_freelist); \
        VLISTNONE((vp)); \
        vp->v_listflag &= ~VLIST_ASYNC_WORK; \
        async_work_vnodes--; \
    } while(0)


/* remove a vnode from rage vnode list */
#define VREMRAGE(fun, vp) \
    do { \
        if (!(vp->v_listflag & VLIST_RAGE)) \
            panic("VREMRAGE: vp not on rage list"); \
        VLISTCHECK((fun), (vp), "rage"); \
        TAILQ_REMOVE(&vnode_rage_list, (vp), v_freelist); \
        VLISTNONE((vp)); \
        vp->v_listflag &= ~VLIST_RAGE; \
        ragevnodes--; \
    } while(0)

static void async_work_continue(void);

/*
 * Initialize the vnode management data structures.
 */
__private_extern__ void
vntblinit(void)
{
    thread_t thread = THREAD_NULL;

    TAILQ_INIT(&vnode_free_list);
    TAILQ_INIT(&vnode_rage_list);
    TAILQ_INIT(&vnode_dead_list);
    TAILQ_INIT(&vnode_async_work_list);
    TAILQ_INIT(&mountlist);

    microuptime(&rage_tv);
    rage_limit = desiredvnodes / 100;

    if (rage_limit < RAGE_LIMIT_MIN) {
        rage_limit = RAGE_LIMIT_MIN;
    }

    /*
     * create worker threads
     */
    kernel_thread_start((thread_continue_t)async_work_continue, NULL, &thread);
    thread_deallocate(thread);
}

/* the timeout is in units of 10 ms */
int
vnode_waitforwrites(vnode_t vp, int output_target, int slpflag, int slptimeout, const char *msg)
{
    int error = 0;
    struct timespec ts;

    KERNEL_DEBUG(0x3010280 | DBG_FUNC_START, (int)vp, output_target, vp->v_numoutput, 0, 0);

    if (vp->v_numoutput > output_target) {
        slpflag |= PDROP;

        vnode_lock_spin(vp);

        while ((vp->v_numoutput > output_target) && error == 0) {
            if (output_target) {
                vp->v_flag |= VTHROTTLED;
            } else {
                vp->v_flag |= VBWAIT;
            }

            /* split the 10 ms units into whole seconds and the remainder */
            ts.tv_sec = (slptimeout / 100);
            ts.tv_nsec = (slptimeout % 100) * 10 * NSEC_PER_USEC * 1000;
            error = msleep((caddr_t)&vp->v_numoutput, &vp->v_lock, (slpflag | (PRIBIO + 1)), msg, &ts);

            vnode_lock_spin(vp);
        }
        vnode_unlock(vp);
    }
    KERNEL_DEBUG(0x3010280 | DBG_FUNC_END, (int)vp, output_target, vp->v_numoutput, error, 0);

    return error;
}


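/*
 * vnode_startwrite()/vnode_writedone() bracket an in-flight write:
 * the former bumps v_numoutput, the latter drops it and wakes any
 * thread sleeping in vnode_waitforwrites().
 */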
void
vnode_startwrite(vnode_t vp)
{
    OSAddAtomic(1, &vp->v_numoutput);
}


void
vnode_writedone(vnode_t vp)
{
    if (vp) {
        int need_wakeup = 0;

        OSAddAtomic(-1, &vp->v_numoutput);

        vnode_lock_spin(vp);

        if (vp->v_numoutput < 0) {
            panic("vnode_writedone: numoutput < 0");
        }

        if ((vp->v_flag & VTHROTTLED)) {
            vp->v_flag &= ~VTHROTTLED;
            need_wakeup = 1;
        }
        if ((vp->v_flag & VBWAIT) && (vp->v_numoutput == 0)) {
            vp->v_flag &= ~VBWAIT;
            need_wakeup = 1;
        }
        vnode_unlock(vp);

        if (need_wakeup) {
            wakeup((caddr_t)&vp->v_numoutput);
        }
    }
}


int
vnode_hasdirtyblks(vnode_t vp)
{
    struct cl_writebehind *wbp;

    /*
     * Not taking the buf_mtxp as there is little
     * point in doing it. Even if the lock is taken the
     * state can change right after that. If there
     * needs to be synchronization, it must be driven
     * by the caller.
     */
    if (vp->v_dirtyblkhd.lh_first) {
        return 1;
    }

    if (!UBCINFOEXISTS(vp)) {
        return 0;
    }

    wbp = vp->v_ubcinfo->cl_wbehind;

    if (wbp && (wbp->cl_number || wbp->cl_scmap)) {
        return 1;
    }

    return 0;
}

int
vnode_hascleanblks(vnode_t vp)
{
    /*
     * Not taking the buf_mtxp as there is little
     * point in doing it. Even if the lock is taken the
     * state can change right after that. If there
     * needs to be synchronization, it must be driven
     * by the caller.
     */
    if (vp->v_cleanblkhd.lh_first) {
        return 1;
    }
    return 0;
}

void
vnode_iterate_setup(mount_t mp)
{
    mp->mnt_lflag |= MNT_LITER;
}

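/*
 * Scan the mount's vnode list for vnodes that would prevent an unmount.
 * Returns 1 as soon as a busy vnode is found, 0 otherwise.
 * Called from the unmount path with the mount lock held.
 */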
int
vnode_umount_preflight(mount_t mp, vnode_t skipvp, int flags)
{
    vnode_t vp;

    TAILQ_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) {
        if (vp->v_type == VDIR) {
            continue;
        }
        if (vp == skipvp) {
            continue;
        }
        if ((flags & SKIPSYSTEM) && ((vp->v_flag & VSYSTEM) || (vp->v_flag & VNOFLUSH))) {
            continue;
        }
        if ((flags & SKIPSWAP) && (vp->v_flag & VSWAP)) {
            continue;
        }
        if ((flags & WRITECLOSE) && (vp->v_writecount == 0 || vp->v_type != VREG)) {
            continue;
        }

        /* Look for busy vnode */
        if ((vp->v_usecount != 0) && ((vp->v_usecount - vp->v_kusecount) != 0)) {
            return 1;
        } else if (vp->v_iocount > 0) {
            /* Busy if iocount is > 0 for more than 3 seconds */
            tsleep(&vp->v_iocount, PVFS, "vnode_drain_network", 3 * hz);
            if (vp->v_iocount > 0) {
                return 1;
            }
            continue;
        }
    }

    return 0;
}

/*
 * This routine prepares for iteration by moving all the vnodes to the
 * worker queue. Called with the mount lock held.
 */
int
vnode_iterate_prepare(mount_t mp)
{
    vnode_t vp;

    if (TAILQ_EMPTY(&mp->mnt_vnodelist)) {
        /* nothing to do */
        return 0;
    }

    vp = TAILQ_FIRST(&mp->mnt_vnodelist);
    vp->v_mntvnodes.tqe_prev = &(mp->mnt_workerqueue.tqh_first);
    mp->mnt_workerqueue.tqh_first = mp->mnt_vnodelist.tqh_first;
    mp->mnt_workerqueue.tqh_last = mp->mnt_vnodelist.tqh_last;

    TAILQ_INIT(&mp->mnt_vnodelist);
    if (mp->mnt_newvnodes.tqh_first != NULL) {
        panic("vnode_iterate_prepare: newvnode when entering vnode");
    }
    TAILQ_INIT(&mp->mnt_newvnodes);

    return 1;
}


/* called with mount lock held */
int
vnode_iterate_reloadq(mount_t mp)
{
    int moved = 0;

    /* add the remaining entries in workerq to the end of mount vnode list */
    if (!TAILQ_EMPTY(&mp->mnt_workerqueue)) {
        struct vnode * mvp;
        mvp = TAILQ_LAST(&mp->mnt_vnodelist, vnodelst);

        /* Joining the workerqueue entries to the mount vnode list */
        if (mvp) {
            mvp->v_mntvnodes.tqe_next = mp->mnt_workerqueue.tqh_first;
        } else {
            mp->mnt_vnodelist.tqh_first = mp->mnt_workerqueue.tqh_first;
        }
        mp->mnt_workerqueue.tqh_first->v_mntvnodes.tqe_prev = mp->mnt_vnodelist.tqh_last;
        mp->mnt_vnodelist.tqh_last = mp->mnt_workerqueue.tqh_last;
        TAILQ_INIT(&mp->mnt_workerqueue);
    }

    /* add the newvnodes to the head of mount vnode list */
    if (!TAILQ_EMPTY(&mp->mnt_newvnodes)) {
        struct vnode * nlvp;
        nlvp = TAILQ_LAST(&mp->mnt_newvnodes, vnodelst);

        mp->mnt_newvnodes.tqh_first->v_mntvnodes.tqe_prev = &mp->mnt_vnodelist.tqh_first;
        nlvp->v_mntvnodes.tqe_next = mp->mnt_vnodelist.tqh_first;
        if (mp->mnt_vnodelist.tqh_first) {
            mp->mnt_vnodelist.tqh_first->v_mntvnodes.tqe_prev = &nlvp->v_mntvnodes.tqe_next;
        } else {
            mp->mnt_vnodelist.tqh_last = mp->mnt_newvnodes.tqh_last;
        }
        mp->mnt_vnodelist.tqh_first = mp->mnt_newvnodes.tqh_first;
        TAILQ_INIT(&mp->mnt_newvnodes);
        moved = 1;
    }

    return moved;
}


void
vnode_iterate_clear(mount_t mp)
{
    mp->mnt_lflag &= ~MNT_LITER;
}

#if !CONFIG_EMBEDDED

#include <i386/panic_hooks.h>

struct vnode_iterate_panic_hook {
    panic_hook_t hook;
    mount_t mp;
    struct vnode *vp;
};

static void
vnode_iterate_panic_hook(panic_hook_t *hook_)
{
    struct vnode_iterate_panic_hook *hook = (struct vnode_iterate_panic_hook *)hook_;
    panic_phys_range_t range;
    uint64_t phys;

    if (panic_phys_range_before(hook->mp, &phys, &range)) {
        paniclog_append_noflush("mp = %p, phys = %p, prev (%p: %p-%p)\n",
            hook->mp, phys, range.type, range.phys_start,
            range.phys_start + range.len);
    } else {
        paniclog_append_noflush("mp = %p, phys = %p, prev (!)\n", hook->mp, phys);
    }

    if (panic_phys_range_before(hook->vp, &phys, &range)) {
        paniclog_append_noflush("vp = %p, phys = %p, prev (%p: %p-%p)\n",
            hook->vp, phys, range.type, range.phys_start,
            range.phys_start + range.len);
    } else {
        paniclog_append_noflush("vp = %p, phys = %p, prev (!)\n", hook->vp, phys);
    }
    panic_dump_mem((void *)(((vm_offset_t)hook->mp - 4096) & ~4095), 12288);
}
#endif /* !CONFIG_EMBEDDED */

int
vnode_iterate(mount_t mp, int flags, int (*callout)(struct vnode *, void *),
    void *arg)
{
    struct vnode *vp;
    int vid, retval;
    int ret = 0;

    /*
     * The mount iterate mutex is held for the duration of the iteration.
     * This can be done by a state flag on the mount structure but we can
     * run into priority inversion issues sometimes.
     * Using a mutex allows us to benefit from the priority donation
     * mechanisms in the kernel for locks. This mutex should never be
     * acquired in spin mode and it should be acquired before attempting to
     * acquire the mount lock.
     */
    mount_iterate_lock(mp);

    mount_lock(mp);

    vnode_iterate_setup(mp);

    /* If it returns 0 then there is nothing to do */
    retval = vnode_iterate_prepare(mp);

    if (retval == 0) {
        vnode_iterate_clear(mp);
        mount_unlock(mp);
        mount_iterate_unlock(mp);
        return ret;
    }

#if !CONFIG_EMBEDDED
    struct vnode_iterate_panic_hook hook;
    hook.mp = mp;
    hook.vp = NULL;
    panic_hook(&hook.hook, vnode_iterate_panic_hook);
#endif
    /* iterate over all the vnodes */
    while (!TAILQ_EMPTY(&mp->mnt_workerqueue)) {
        vp = TAILQ_FIRST(&mp->mnt_workerqueue);
#if !CONFIG_EMBEDDED
        hook.vp = vp;
#endif
        TAILQ_REMOVE(&mp->mnt_workerqueue, vp, v_mntvnodes);
        TAILQ_INSERT_TAIL(&mp->mnt_vnodelist, vp, v_mntvnodes);
        vid = vp->v_id;
        if ((vp->v_data == NULL) || (vp->v_type == VNON) || (vp->v_mount != mp)) {
            continue;
        }
        mount_unlock(mp);

        if (vget_internal(vp, vid, (flags | VNODE_NODEAD | VNODE_WITHID | VNODE_NOSUSPEND))) {
            mount_lock(mp);
            continue;
        }
        if (flags & VNODE_RELOAD) {
            /*
             * we're reloading the filesystem
             * cast out any inactive vnodes...
             */
            if (vnode_reload(vp)) {
                /* vnode will be recycled on the refcount drop */
                vnode_put(vp);
                mount_lock(mp);
                continue;
            }
        }

        retval = callout(vp, arg);

        switch (retval) {
        case VNODE_RETURNED:
        case VNODE_RETURNED_DONE:
            vnode_put(vp);
            if (retval == VNODE_RETURNED_DONE) {
                mount_lock(mp);
                ret = 0;
                goto out;
            }
            break;

        case VNODE_CLAIMED_DONE:
            mount_lock(mp);
            ret = 0;
            goto out;
        case VNODE_CLAIMED:
        default:
            break;
        }
        mount_lock(mp);
    }

out:
#if !CONFIG_EMBEDDED
    panic_unhook(&hook.hook);
#endif
    (void)vnode_iterate_reloadq(mp);
    vnode_iterate_clear(mp);
    mount_unlock(mp);
    mount_iterate_unlock(mp);
    return ret;
}
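
/*
 * Usage sketch: a minimal callout inspects the vnode it is handed and
 * returns VNODE_RETURNED, which makes vnode_iterate() drop the iocount
 * it took and move on:
 *
 *	static int
 *	count_vnodes_callout(struct vnode *vp, void *arg)  // hypothetical
 *	{
 *		(*(int *)arg)++;
 *		return VNODE_RETURNED;
 *	}
 *
 *	int count = 0;
 *	vnode_iterate(mp, 0, count_vnodes_callout, &count);
 */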

void
mount_lock_renames(mount_t mp)
{
    lck_mtx_lock(&mp->mnt_renamelock);
}

void
mount_unlock_renames(mount_t mp)
{
    lck_mtx_unlock(&mp->mnt_renamelock);
}

void
mount_iterate_lock(mount_t mp)
{
    lck_mtx_lock(&mp->mnt_iter_lock);
}

void
mount_iterate_unlock(mount_t mp)
{
    lck_mtx_unlock(&mp->mnt_iter_lock);
}

void
mount_lock(mount_t mp)
{
    lck_mtx_lock(&mp->mnt_mlock);
}

void
mount_lock_spin(mount_t mp)
{
    lck_mtx_lock_spin(&mp->mnt_mlock);
}

void
mount_unlock(mount_t mp)
{
    lck_mtx_unlock(&mp->mnt_mlock);
}

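/*
 * mnt_count holds off teardown of the mount: mount_refdrain() sleeps
 * until it drops to zero. Pass locked != 0 when the caller already
 * holds the mount lock.
 */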
void
mount_ref(mount_t mp, int locked)
{
    if (!locked) {
        mount_lock_spin(mp);
    }

    mp->mnt_count++;

    if (!locked) {
        mount_unlock(mp);
    }
}


void
mount_drop(mount_t mp, int locked)
{
    if (!locked) {
        mount_lock_spin(mp);
    }

    mp->mnt_count--;

    if (mp->mnt_count == 0 && (mp->mnt_lflag & MNT_LDRAIN)) {
        wakeup(&mp->mnt_lflag);
    }

    if (!locked) {
        mount_unlock(mp);
    }
}


int
mount_iterref(mount_t mp, int locked)
{
    int retval = 0;

    if (!locked) {
        mount_list_lock();
    }
    if (mp->mnt_iterref < 0) {
        retval = 1;
    } else {
        mp->mnt_iterref++;
    }
    if (!locked) {
        mount_list_unlock();
    }
    return retval;
}

int
mount_isdrained(mount_t mp, int locked)
{
    int retval;

    if (!locked) {
        mount_list_lock();
    }
    if (mp->mnt_iterref < 0) {
        retval = 1;
    } else {
        retval = 0;
    }
    if (!locked) {
        mount_list_unlock();
    }
    return retval;
}

void
mount_iterdrop(mount_t mp)
{
    mount_list_lock();
    mp->mnt_iterref--;
    wakeup(&mp->mnt_iterref);
    mount_list_unlock();
}

void
mount_iterdrain(mount_t mp)
{
    mount_list_lock();
    while (mp->mnt_iterref) {
        msleep((caddr_t)&mp->mnt_iterref, mnt_list_mtx_lock, PVFS, "mount_iterdrain", NULL);
    }
    /* mount iterations drained */
    mp->mnt_iterref = -1;
    mount_list_unlock();
}

void
mount_iterreset(mount_t mp)
{
    mount_list_lock();
    if (mp->mnt_iterref == -1) {
        mp->mnt_iterref = 0;
    }
    mount_list_unlock();
}

/* always called with mount lock held */
int
mount_refdrain(mount_t mp)
{
    if (mp->mnt_lflag & MNT_LDRAIN) {
        panic("already in drain");
    }
    mp->mnt_lflag |= MNT_LDRAIN;

    while (mp->mnt_count) {
        msleep((caddr_t)&mp->mnt_lflag, &mp->mnt_mlock, PVFS, "mount_drain", NULL);
    }

    if (mp->mnt_vnodelist.tqh_first != NULL) {
        panic("mount_refdrain: dangling vnode");
    }

    mp->mnt_lflag &= ~MNT_LDRAIN;

    return 0;
}

/* Tags the mount point as not supporting extended readdir for NFS exports */
void
mount_set_noreaddirext(mount_t mp)
{
    mount_lock(mp);
    mp->mnt_kern_flag |= MNTK_DENY_READDIREXT;
    mount_unlock(mp);
}

/*
 * Mark a mount point as busy. Used to synchronize access and to delay
 * unmounting.
 */
int
vfs_busy(mount_t mp, int flags)
{
restart:
    if (mp->mnt_lflag & MNT_LDEAD) {
        return ENOENT;
    }

    mount_lock(mp);

    if (mp->mnt_lflag & MNT_LUNMOUNT) {
        if (flags & LK_NOWAIT || mp->mnt_lflag & MNT_LDEAD) {
            mount_unlock(mp);
            return ENOENT;
        }

        /*
         * Since all busy locks are shared except the exclusive
         * lock granted when unmounting, the only place that a
         * wakeup needs to be done is at the release of the
         * exclusive lock at the end of dounmount.
         */
        mp->mnt_lflag |= MNT_LWAIT;
        msleep((caddr_t)mp, &mp->mnt_mlock, (PVFS | PDROP), "vfsbusy", NULL);
        return ENOENT;
    }

    mount_unlock(mp);

    lck_rw_lock_shared(&mp->mnt_rwlock);

    /*
     * Until we are granted the rwlock, it's possible for the mount point to
     * change state, so re-evaluate before granting the vfs_busy.
     */
    if (mp->mnt_lflag & (MNT_LDEAD | MNT_LUNMOUNT)) {
        lck_rw_done(&mp->mnt_rwlock);
        goto restart;
    }
    return 0;
}

/*
 * Free a busy filesystem.
 */
void
vfs_unbusy(mount_t mp)
{
    lck_rw_done(&mp->mnt_rwlock);
}
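
/*
 * Usage sketch: callers bracket work on a mount with vfs_busy()/vfs_unbusy()
 * to hold off a concurrent unmount:
 *
 *	if (vfs_busy(mp, LK_NOWAIT) == 0) {
 *		... operate on mp ...
 *		vfs_unbusy(mp);
 *	}
 */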


static void
vfs_rootmountfailed(mount_t mp)
{
    mount_list_lock();
    mp->mnt_vtable->vfc_refcount--;
    mount_list_unlock();

    vfs_unbusy(mp);

    mount_lock_destroy(mp);

#if CONFIG_MACF
    mac_mount_label_destroy(mp);
#endif

    FREE_ZONE(mp, sizeof(struct mount), M_MOUNT);
}

/*
 * Lookup a filesystem type, and if found allocate and initialize
 * a mount structure for it.
 *
 * Devname is usually updated by mount(8) after booting.
 */
static mount_t
vfs_rootmountalloc_internal(struct vfstable *vfsp, const char *devname)
{
    mount_t mp;

    mp = _MALLOC_ZONE(sizeof(struct mount), M_MOUNT, M_WAITOK);
    bzero((char *)mp, sizeof(struct mount));

    /* Initialize the default IO constraints */
    mp->mnt_maxreadcnt = mp->mnt_maxwritecnt = MAXPHYS;
    mp->mnt_segreadcnt = mp->mnt_segwritecnt = 32;
    mp->mnt_maxsegreadsize = mp->mnt_maxreadcnt;
    mp->mnt_maxsegwritesize = mp->mnt_maxwritecnt;
    mp->mnt_devblocksize = DEV_BSIZE;
    mp->mnt_alignmentmask = PAGE_MASK;
    mp->mnt_ioqueue_depth = MNT_DEFAULT_IOQUEUE_DEPTH;
    mp->mnt_ioscale = 1;
    mp->mnt_ioflags = 0;
    mp->mnt_realrootvp = NULLVP;
    mp->mnt_authcache_ttl = CACHED_LOOKUP_RIGHT_TTL;
    mp->mnt_throttle_mask = LOWPRI_MAX_NUM_DEV - 1;
    mp->mnt_devbsdunit = 0;

    mount_lock_init(mp);
    (void)vfs_busy(mp, LK_NOWAIT);

    TAILQ_INIT(&mp->mnt_vnodelist);
    TAILQ_INIT(&mp->mnt_workerqueue);
    TAILQ_INIT(&mp->mnt_newvnodes);

    mp->mnt_vtable = vfsp;
    mp->mnt_op = vfsp->vfc_vfsops;
    mp->mnt_flag = MNT_RDONLY | MNT_ROOTFS;
    mp->mnt_vnodecovered = NULLVP;
    //mp->mnt_stat.f_type = vfsp->vfc_typenum;
    mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;

    mount_list_lock();
    vfsp->vfc_refcount++;
    mount_list_unlock();

    strlcpy(mp->mnt_vfsstat.f_fstypename, vfsp->vfc_name, MFSTYPENAMELEN);
    mp->mnt_vfsstat.f_mntonname[0] = '/';
    /* XXX const poisoning layering violation */
    (void) copystr((const void *)devname, mp->mnt_vfsstat.f_mntfromname, MAXPATHLEN - 1, NULL);

#if CONFIG_MACF
    mac_mount_label_init(mp);
    mac_mount_label_associate(vfs_context_kernel(), mp);
#endif
    return mp;
}

errno_t
vfs_rootmountalloc(const char *fstypename, const char *devname, mount_t *mpp)
{
    struct vfstable *vfsp;

    for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
        if (!strncmp(vfsp->vfc_name, fstypename,
            sizeof(vfsp->vfc_name))) {
            break;
        }
    }
    if (vfsp == NULL) {
        return ENODEV;
    }

    *mpp = vfs_rootmountalloc_internal(vfsp, devname);

    if (*mpp) {
        return 0;
    }

    return ENOMEM;
}

#define DBG_MOUNTROOT (FSDBG_CODE(DBG_MOUNT, 0))

/*
 * Find an appropriate filesystem to use for the root. If a filesystem
 * has not been preselected, walk through the list of known filesystems
 * trying those that have mountroot routines, and try them until one
 * works or we have tried them all.
 */
extern int (*mountroot)(void);

int
vfs_mountroot(void)
{
#if CONFIG_MACF
    struct vnode *vp;
#endif
    struct vfstable *vfsp;
    vfs_context_t ctx = vfs_context_kernel();
    struct vfs_attr vfsattr;
    int error;
    mount_t mp;
    vnode_t bdevvp_rootvp;

    KDBG_RELEASE(DBG_MOUNTROOT | DBG_FUNC_START);
    if (mountroot != NULL) {
        /*
         * used for netboot which follows a different set of rules
         */
        error = (*mountroot)();

        KDBG_RELEASE(DBG_MOUNTROOT | DBG_FUNC_END, error, 0);
        return error;
    }
    if ((error = bdevvp(rootdev, &rootvp))) {
        printf("vfs_mountroot: can't setup bdevvp\n");

        KDBG_RELEASE(DBG_MOUNTROOT | DBG_FUNC_END, error, 1);
        return error;
    }
    /*
     * 4951998 - code we call in vfc_mountroot may replace rootvp
     * so keep a local copy for some housekeeping.
     */
    bdevvp_rootvp = rootvp;

    for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
        if (vfsp->vfc_mountroot == NULL
            && !ISSET(vfsp->vfc_vfsflags, VFC_VFSCANMOUNTROOT)) {
            continue;
        }

        mp = vfs_rootmountalloc_internal(vfsp, "root_device");
        mp->mnt_devvp = rootvp;

        if (vfsp->vfc_mountroot) {
            error = (*vfsp->vfc_mountroot)(mp, rootvp, ctx);
        } else {
            error = VFS_MOUNT(mp, rootvp, 0, ctx);
        }

        if (!error) {
            if (bdevvp_rootvp != rootvp) {
                /*
                 * rootvp changed...
                 * bump the iocount and fix up mnt_devvp for the
                 * new rootvp (it will already have a usecount taken)...
                 * drop the iocount and the usecount on the original
                 * since we are no longer going to use it...
                 */
                vnode_getwithref(rootvp);
                mp->mnt_devvp = rootvp;

                vnode_rele(bdevvp_rootvp);
                vnode_put(bdevvp_rootvp);
            }
            mp->mnt_devvp->v_specflags |= SI_MOUNTEDON;

            vfs_unbusy(mp);

            mount_list_add(mp);

            /*
             * cache the IO attributes for the underlying physical media...
             * an error return indicates the underlying driver doesn't
             * support all the queries necessary... however, reasonable
             * defaults will have been set, so no reason to bail or care
             */
            vfs_init_io_attributes(rootvp, mp);

            if (mp->mnt_ioflags & MNT_IOFLAGS_FUSION_DRIVE) {
                root_is_CF_drive = TRUE;
            }

            /*
             * Shadow the VFC_VFSNATIVEXATTR flag to MNTK_EXTENDED_ATTRS.
             */
            if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSNATIVEXATTR) {
                mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
            }
            if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSPREFLIGHT) {
                mp->mnt_kern_flag |= MNTK_UNMOUNT_PREFLIGHT;
            }

#if !CONFIG_EMBEDDED
            uint32_t speed;

            if (MNTK_VIRTUALDEV & mp->mnt_kern_flag) {
                speed = 128;
            } else if (disk_conditioner_mount_is_ssd(mp)) {
                speed = 7 * 256;
            } else {
                speed = 256;
            }
            vc_progress_setdiskspeed(speed);
#endif
            /*
             * Probe root file system for additional features.
             */
            (void)VFS_START(mp, 0, ctx);

            VFSATTR_INIT(&vfsattr);
            VFSATTR_WANTED(&vfsattr, f_capabilities);
            if (vfs_getattr(mp, &vfsattr, ctx) == 0 &&
                VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
                if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR) &&
                    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR)) {
                    mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
                }
#if NAMEDSTREAMS
                if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS) &&
                    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS)) {
                    mp->mnt_kern_flag |= MNTK_NAMED_STREAMS;
                }
#endif
                if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID) &&
                    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID)) {
                    mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
                }

                if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS) &&
                    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS)) {
                    mp->mnt_kern_flag |= MNTK_DIR_HARDLINKS;
                }
            }

            /*
             * get rid of iocount reference returned
             * by bdevvp (or picked up by us on the substituted
             * rootvp)... it (or we) will have also taken
             * a usecount reference which we want to keep
             */
            vnode_put(rootvp);

#if CONFIG_MACF
            if ((vfs_flags(mp) & MNT_MULTILABEL) == 0) {
                KDBG_RELEASE(DBG_MOUNTROOT | DBG_FUNC_END, 0, 2);
                return 0;
            }

            error = VFS_ROOT(mp, &vp, ctx);
            if (error) {
                printf("%s() VFS_ROOT() returned %d\n",
                    __func__, error);
                dounmount(mp, MNT_FORCE, 0, ctx);
                goto fail;
            }
            error = vnode_label(mp, NULL, vp, NULL, 0, ctx);
            /*
             * get rid of reference provided by VFS_ROOT
             */
            vnode_put(vp);

            if (error) {
                printf("%s() vnode_label() returned %d\n",
                    __func__, error);
                dounmount(mp, MNT_FORCE, 0, ctx);
                goto fail;
            }
#endif
            KDBG_RELEASE(DBG_MOUNTROOT | DBG_FUNC_END, 0, 3);
            return 0;
        }
#if CONFIG_MACF
fail:
#endif
        vfs_rootmountfailed(mp);

        if (error != EINVAL) {
            printf("%s_mountroot failed: %d\n", vfsp->vfc_name, error);
        }
    }
    KDBG_RELEASE(DBG_MOUNTROOT | DBG_FUNC_END, error ? error : ENODEV, 4);
    return ENODEV;
}

/*
 * Lookup a mount point by filesystem identifier.
 */

struct mount *
vfs_getvfs(fsid_t *fsid)
{
    return mount_list_lookupby_fsid(fsid, 0, 0);
}

static struct mount *
vfs_getvfs_locked(fsid_t *fsid)
{
    return mount_list_lookupby_fsid(fsid, 1, 0);
}

struct mount *
vfs_getvfs_by_mntonname(char *path)
{
    mount_t retmp = (mount_t)0;
    mount_t mp;

    mount_list_lock();
    TAILQ_FOREACH(mp, &mountlist, mnt_list) {
        if (!strncmp(mp->mnt_vfsstat.f_mntonname, path,
            sizeof(mp->mnt_vfsstat.f_mntonname))) {
            retmp = mp;
            if (mount_iterref(retmp, 1)) {
                retmp = NULL;
            }
            goto out;
        }
    }
out:
    mount_list_unlock();
    return retmp;
}

/* generation number for creation of new fsids */
u_short mntid_gen = 0;
/*
 * Get a new unique fsid
 */
void
vfs_getnewfsid(struct mount *mp)
{
    fsid_t tfsid;
    int mtype;

    mount_list_lock();

    /* generate a new fsid */
    mtype = mp->mnt_vtable->vfc_typenum;
    if (++mntid_gen == 0) {
        mntid_gen++;
    }
    tfsid.val[0] = makedev(nblkdev + mtype, mntid_gen);
    tfsid.val[1] = mtype;

    while (vfs_getvfs_locked(&tfsid)) {
        if (++mntid_gen == 0) {
            mntid_gen++;
        }
        tfsid.val[0] = makedev(nblkdev + mtype, mntid_gen);
    }

    mp->mnt_vfsstat.f_fsid.val[0] = tfsid.val[0];
    mp->mnt_vfsstat.f_fsid.val[1] = tfsid.val[1];
    mount_list_unlock();
}

/*
 * Routines having to do with the management of the vnode table.
 */
extern int(**dead_vnodeop_p)(void *);
long numvnodes, freevnodes, deadvnodes, async_work_vnodes;


int async_work_timed_out = 0;
int async_work_handled = 0;
int dead_vnode_wanted = 0;
int dead_vnode_waited = 0;

/*
 * Move a vnode from one mount queue to another.
 */
static void
insmntque(vnode_t vp, mount_t mp)
{
    mount_t lmp;
    /*
     * Delete from old mount point vnode list, if on one.
     */
    if ((lmp = vp->v_mount) != NULL && lmp != dead_mountp) {
        if ((vp->v_lflag & VNAMED_MOUNT) == 0) {
            panic("insmntque: vp not in mount vnode list");
        }
        vp->v_lflag &= ~VNAMED_MOUNT;

        mount_lock_spin(lmp);

        mount_drop(lmp, 1);

        if (vp->v_mntvnodes.tqe_next == NULL) {
            if (TAILQ_LAST(&lmp->mnt_vnodelist, vnodelst) == vp) {
                TAILQ_REMOVE(&lmp->mnt_vnodelist, vp, v_mntvnodes);
            } else if (TAILQ_LAST(&lmp->mnt_newvnodes, vnodelst) == vp) {
                TAILQ_REMOVE(&lmp->mnt_newvnodes, vp, v_mntvnodes);
            } else if (TAILQ_LAST(&lmp->mnt_workerqueue, vnodelst) == vp) {
                TAILQ_REMOVE(&lmp->mnt_workerqueue, vp, v_mntvnodes);
            }
        } else {
            vp->v_mntvnodes.tqe_next->v_mntvnodes.tqe_prev = vp->v_mntvnodes.tqe_prev;
            *vp->v_mntvnodes.tqe_prev = vp->v_mntvnodes.tqe_next;
        }
        vp->v_mntvnodes.tqe_next = NULL;
        vp->v_mntvnodes.tqe_prev = NULL;
        mount_unlock(lmp);
        return;
    }

    /*
     * Insert into list of vnodes for the new mount point, if available.
     */
    if ((vp->v_mount = mp) != NULL) {
        mount_lock_spin(mp);
        if ((vp->v_mntvnodes.tqe_next != 0) && (vp->v_mntvnodes.tqe_prev != 0)) {
            panic("vp already in mount list");
        }
        if (mp->mnt_lflag & MNT_LITER) {
            TAILQ_INSERT_HEAD(&mp->mnt_newvnodes, vp, v_mntvnodes);
        } else {
            TAILQ_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes);
        }
        if (vp->v_lflag & VNAMED_MOUNT) {
            panic("insmntque: vp already in mount vnode list");
        }
        vp->v_lflag |= VNAMED_MOUNT;
        mount_ref(mp, 1);
        mount_unlock(mp);
    }
}


/*
 * Create a vnode for a block device.
 * Used for root filesystem, argdev, and swap areas.
 * Also used for memory file system special devices.
 */
int
bdevvp(dev_t dev, vnode_t *vpp)
{
    vnode_t nvp;
    int error;
    struct vnode_fsparam vfsp;
    struct vfs_context context;

    if (dev == NODEV) {
        *vpp = NULLVP;
        return ENODEV;
    }

    context.vc_thread = current_thread();
    context.vc_ucred = FSCRED;

    vfsp.vnfs_mp = (struct mount *)0;
    vfsp.vnfs_vtype = VBLK;
    vfsp.vnfs_str = "bdevvp";
    vfsp.vnfs_dvp = NULL;
    vfsp.vnfs_fsnode = NULL;
    vfsp.vnfs_cnp = NULL;
    vfsp.vnfs_vops = spec_vnodeop_p;
    vfsp.vnfs_rdev = dev;
    vfsp.vnfs_filesize = 0;

    vfsp.vnfs_flags = VNFS_NOCACHE | VNFS_CANTCACHE;

    vfsp.vnfs_marksystem = 0;
    vfsp.vnfs_markroot = 0;

    if ((error = vnode_create(VNCREATE_FLAVOR, VCREATESIZE, &vfsp, &nvp))) {
        *vpp = NULLVP;
        return error;
    }
    vnode_lock_spin(nvp);
    nvp->v_flag |= VBDEVVP;
    nvp->v_tag = VT_NON;	/* set this to VT_NON so during aliasing it can be replaced */
    vnode_unlock(nvp);
    if ((error = vnode_ref(nvp))) {
        panic("bdevvp failed: vnode_ref");
        return error;
    }
    if ((error = VNOP_FSYNC(nvp, MNT_WAIT, &context))) {
        panic("bdevvp failed: fsync");
        return error;
    }
    if ((error = buf_invalidateblks(nvp, BUF_WRITE_DATA, 0, 0))) {
        panic("bdevvp failed: invalidateblks");
        return error;
    }

#if CONFIG_MACF
    /*
     * XXXMAC: We can't put a MAC check here, the system will
     * panic without this vnode.
     */
#endif /* MAC */

    if ((error = VNOP_OPEN(nvp, FREAD, &context))) {
        panic("bdevvp failed: open");
        return error;
    }
    *vpp = nvp;

    return 0;
}
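
/*
 * Note: the vnode returned by bdevvp() holds both an iocount (from
 * vnode_create) and a usecount (from vnode_ref); vfs_mountroot() relies
 * on that when it later does vnode_put()/vnode_rele() on rootvp.
 */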

/*
 * Check to see if the new vnode represents a special device
 * for which we already have a vnode (either because of
 * bdevvp() or because of a different vnode representing
 * the same block device). If such an alias exists, deallocate
 * the existing contents and return the aliased vnode. The
 * caller is responsible for filling it with its new contents.
 */
static vnode_t
checkalias(struct vnode *nvp, dev_t nvp_rdev)
{
    struct vnode *vp;
    struct vnode **vpp;
    struct specinfo *sin = NULL;
    int vid = 0;

    vpp = &speclisth[SPECHASH(nvp_rdev)];
loop:
    SPECHASH_LOCK();

    for (vp = *vpp; vp; vp = vp->v_specnext) {
        if (nvp_rdev == vp->v_rdev && nvp->v_type == vp->v_type) {
            vid = vp->v_id;
            break;
        }
    }
    SPECHASH_UNLOCK();

    if (vp) {
found_alias:
        if (vnode_getwithvid(vp, vid)) {
            goto loop;
        }
        /*
         * Termination state is checked in vnode_getwithvid
         */
        vnode_lock(vp);

        /*
         * Alias, but not in use, so flush it out.
         */
        if ((vp->v_iocount == 1) && (vp->v_usecount == 0)) {
            vnode_reclaim_internal(vp, 1, 1, 0);
            vnode_put_locked(vp);
            vnode_unlock(vp);
            goto loop;
        }
    }
    if (vp == NULL || vp->v_tag != VT_NON) {
        if (sin == NULL) {
            MALLOC_ZONE(sin, struct specinfo *, sizeof(struct specinfo),
                M_SPECINFO, M_WAITOK);
        }

        nvp->v_specinfo = sin;
        bzero(nvp->v_specinfo, sizeof(struct specinfo));
        nvp->v_rdev = nvp_rdev;
        nvp->v_specflags = 0;
        nvp->v_speclastr = -1;
        nvp->v_specinfo->si_opencount = 0;
        nvp->v_specinfo->si_initted = 0;
        nvp->v_specinfo->si_throttleable = 0;

        SPECHASH_LOCK();

        /* We dropped the lock; another thread could have added an alias */
        if (vp == NULLVP) {
            for (vp = *vpp; vp; vp = vp->v_specnext) {
                if (nvp_rdev == vp->v_rdev && nvp->v_type == vp->v_type) {
                    vid = vp->v_id;
                    SPECHASH_UNLOCK();
                    goto found_alias;
                }
            }
        }

        nvp->v_hashchain = vpp;
        nvp->v_specnext = *vpp;
        *vpp = nvp;

        if (vp != NULLVP) {
            nvp->v_specflags |= SI_ALIASED;
            vp->v_specflags |= SI_ALIASED;
            SPECHASH_UNLOCK();
            vnode_put_locked(vp);
            vnode_unlock(vp);
        } else {
            SPECHASH_UNLOCK();
        }

        return NULLVP;
    }

    if (sin) {
        FREE_ZONE(sin, sizeof(struct specinfo), M_SPECINFO);
    }

    if ((vp->v_flag & (VBDEVVP | VDEVFLUSH)) != 0) {
        return vp;
    }

    panic("checkalias with VT_NON vp that shouldn't: %p", vp);

    return vp;
}


/*
 * Get a reference on a particular vnode and lock it if requested.
 * If the vnode was on the inactive list, remove it from the list.
 * If the vnode was on the free list, remove it from the list and
 * move it to inactive list as needed.
 * The vnode lock bit is set if the vnode is being eliminated in
 * vgone. The process is awakened when the transition is completed,
 * and an error returned to indicate that the vnode is no longer
 * usable (possibly having been changed to a new file system type).
 */
int
vget_internal(vnode_t vp, int vid, int vflags)
{
    int error = 0;

    vnode_lock_spin(vp);

    if ((vflags & VNODE_WRITEABLE) && (vp->v_writecount == 0)) {
        /*
         * vnode to be returned only if it has writers opened
         */
        error = EINVAL;
    } else {
        error = vnode_getiocount(vp, vid, vflags);
    }

    vnode_unlock(vp);

    return error;
}

/*
 * Returns:	0		Success
 *		ENOENT		No such file or directory [terminating]
 */
int
vnode_ref(vnode_t vp)
{
    return vnode_ref_ext(vp, 0, 0);
}

/*
 * Returns:	0		Success
 *		ENOENT		No such file or directory [terminating]
 */
int
vnode_ref_ext(vnode_t vp, int fmode, int flags)
{
    int error = 0;

    vnode_lock_spin(vp);

    /*
     * once all the current call sites have been fixed to ensure they have
     * taken an iocount, we can toughen this assert up and insist that the
     * iocount is non-zero... a non-zero usecount doesn't ensure correctness
     */
    if (vp->v_iocount <= 0 && vp->v_usecount <= 0) {
        panic("vnode_ref_ext: vp %p has no valid reference %d, %d", vp, vp->v_iocount, vp->v_usecount);
    }

    /*
     * if you are the owner of drain/termination, can acquire usecount
     */
    if ((flags & VNODE_REF_FORCE) == 0) {
        if ((vp->v_lflag & (VL_DRAIN | VL_TERMINATE | VL_DEAD))) {
            if (vp->v_owner != current_thread()) {
                error = ENOENT;
                goto out;
            }
        }
    }
    vp->v_usecount++;

    if (fmode & FWRITE) {
        if (++vp->v_writecount <= 0) {
            panic("vnode_ref_ext: v_writecount");
        }
    }
    if (fmode & O_EVTONLY) {
        if (++vp->v_kusecount <= 0) {
            panic("vnode_ref_ext: v_kusecount");
        }
    }
    if (vp->v_flag & VRAGE) {
        struct uthread *ut;

        ut = get_bsdthread_info(current_thread());

        if (!(current_proc()->p_lflag & P_LRAGE_VNODES) &&
            !(ut->uu_flag & UT_RAGE_VNODES)) {
            /*
             * a 'normal' process accessed this vnode
             * so make sure it's no longer marked
             * for rapid aging... also, make sure
             * it gets removed from the rage list...
             * when v_usecount drops back to 0, it
             * will be put back on the real free list
             */
            vp->v_flag &= ~VRAGE;
            vp->v_references = 0;
            vnode_list_remove(vp);
        }
    }
    if (vp->v_usecount == 1 && vp->v_type == VREG && !(vp->v_flag & VSYSTEM)) {
        if (vp->v_ubcinfo) {
            vnode_lock_convert(vp);
            memory_object_mark_used(vp->v_ubcinfo->ui_control);
        }
    }
out:
    vnode_unlock(vp);

    return error;
}
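
/*
 * Usage sketch: a long-lived reference is normally built on top of a
 * transient iocount:
 *
 *	if (vnode_getwithvid(vp, vid) == 0) {	// take an iocount
 *		if (vnode_ref(vp) == 0) {
 *			... vp is now pinned by a usecount ...
 *		}
 *		vnode_put(vp);			// drop the iocount
 *	}
 */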


boolean_t
vnode_on_reliable_media(vnode_t vp)
{
    if (!(vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV) && (vp->v_mount->mnt_flag & MNT_LOCAL)) {
        return TRUE;
    }
    return FALSE;
}

static void
vnode_async_list_add(vnode_t vp)
{
    vnode_list_lock();

    if (VONLIST(vp) || (vp->v_lflag & (VL_TERMINATE | VL_DEAD))) {
        panic("vnode_async_list_add: %p is in wrong state", vp);
    }

    TAILQ_INSERT_HEAD(&vnode_async_work_list, vp, v_freelist);
    vp->v_listflag |= VLIST_ASYNC_WORK;

    async_work_vnodes++;

    vnode_list_unlock();

    wakeup(&vnode_async_work_list);
}


/*
 * put the vnode on the appropriate free list.
 * called with vnode LOCKED
 */
static void
vnode_list_add(vnode_t vp)
{
    boolean_t need_dead_wakeup = FALSE;

#if DIAGNOSTIC
    lck_mtx_assert(&vp->v_lock, LCK_MTX_ASSERT_OWNED);
#endif

again:

    /*
     * if it is already on a list or has non-zero references, return
     */
    if (VONLIST(vp) || (vp->v_usecount != 0) || (vp->v_iocount != 0) || (vp->v_lflag & VL_TERMINATE)) {
        return;
    }

    /*
     * In vclean, we might have deferred ditching locked buffers
     * because something was still referencing them (indicated by
     * usecount). We can ditch them now.
     */
    if (ISSET(vp->v_lflag, VL_DEAD)
        && (!LIST_EMPTY(&vp->v_cleanblkhd) || !LIST_EMPTY(&vp->v_dirtyblkhd))) {
        ++vp->v_iocount;	// Probably not necessary, but harmless
#ifdef JOE_DEBUG
        record_vp(vp, 1);
#endif
        vnode_unlock(vp);
        buf_invalidateblks(vp, BUF_INVALIDATE_LOCKED, 0, 0);
        vnode_lock(vp);
        vnode_dropiocount(vp);
        goto again;
    }

    vnode_list_lock();

    if ((vp->v_flag & VRAGE) && !(vp->v_lflag & VL_DEAD)) {
        /*
         * add the new guy to the appropriate end of the RAGE list
         */
        if ((vp->v_flag & VAGE)) {
            TAILQ_INSERT_HEAD(&vnode_rage_list, vp, v_freelist);
        } else {
            TAILQ_INSERT_TAIL(&vnode_rage_list, vp, v_freelist);
        }

        vp->v_listflag |= VLIST_RAGE;
        ragevnodes++;

        /*
         * reset the timestamp for the last inserted vp on the RAGE
         * queue to let new_vnode know that it's not ok to start stealing
         * from this list... as long as we're actively adding to this list
         * we'll push out the vnodes we want to donate to the real free list
         * once we stop pushing, we'll let some time elapse before we start
         * stealing them in the new_vnode routine
         */
        microuptime(&rage_tv);
    } else {
        /*
         * if VL_DEAD, insert it at head of the dead list
         * else insert at tail of LRU list or at head if VAGE is set
         */
        if ((vp->v_lflag & VL_DEAD)) {
            TAILQ_INSERT_HEAD(&vnode_dead_list, vp, v_freelist);
            vp->v_listflag |= VLIST_DEAD;
            deadvnodes++;

            if (dead_vnode_wanted) {
                dead_vnode_wanted--;
                need_dead_wakeup = TRUE;
            }
        } else if ((vp->v_flag & VAGE)) {
            TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
            vp->v_flag &= ~VAGE;
            freevnodes++;
        } else {
            TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
            freevnodes++;
        }
    }
    vnode_list_unlock();

    if (need_dead_wakeup == TRUE) {
        wakeup_one((caddr_t)&dead_vnode_wanted);
    }
}


/*
 * remove the vnode from the appropriate free list.
 * called with vnode LOCKED and
 * the list lock held
 */
static void
vnode_list_remove_locked(vnode_t vp)
{
    if (VONLIST(vp)) {
        /*
         * the v_listflag field is
         * protected by the vnode_list_lock
         */
        if (vp->v_listflag & VLIST_RAGE) {
            VREMRAGE("vnode_list_remove", vp);
        } else if (vp->v_listflag & VLIST_DEAD) {
            VREMDEAD("vnode_list_remove", vp);
        } else if (vp->v_listflag & VLIST_ASYNC_WORK) {
            VREMASYNC_WORK("vnode_list_remove", vp);
        } else {
            VREMFREE("vnode_list_remove", vp);
        }
    }
}


/*
 * remove the vnode from the appropriate free list.
 * called with vnode LOCKED
 */
static void
vnode_list_remove(vnode_t vp)
{
#if DIAGNOSTIC
    lck_mtx_assert(&vp->v_lock, LCK_MTX_ASSERT_OWNED);
#endif
    /*
     * we want to avoid taking the list lock
     * in the case where we're not on the free
     * list... this will be true for most
     * directories and any currently in use files
     *
     * we're guaranteed that we can't go from
     * the not-on-list state to the on-list
     * state since we hold the vnode lock...
     * all calls to vnode_list_add are done
     * under the vnode lock... so we can
     * check for that condition (the prevalent one)
     * without taking the list lock
     */
    if (VONLIST(vp)) {
        vnode_list_lock();
        /*
         * however, we're not guaranteed that
         * we won't go from the on-list state
         * to the not-on-list state until we
         * hold the vnode_list_lock... this
         * is due to "new_vnode" removing vnodes
         * from the free list under the list_lock
         * without the vnode lock... so we need to
         * check again whether we're currently
         * on the free list
         */
        vnode_list_remove_locked(vp);

        vnode_list_unlock();
    }
}


void
vnode_rele(vnode_t vp)
{
    vnode_rele_internal(vp, 0, 0, 0);
}


void
vnode_rele_ext(vnode_t vp, int fmode, int dont_reenter)
{
    vnode_rele_internal(vp, fmode, dont_reenter, 0);
}


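/*
 * Drop a usecount reference (and, per fmode, the matching writecount or
 * kusecount). When the usecount and iocount both reach zero this may
 * call VNOP_INACTIVE and reclaim the vnode, then requeue it on the
 * appropriate free list.
 */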
void
vnode_rele_internal(vnode_t vp, int fmode, int dont_reenter, int locked)
{
    if (!locked) {
        vnode_lock_spin(vp);
    }
#if DIAGNOSTIC
    else {
        lck_mtx_assert(&vp->v_lock, LCK_MTX_ASSERT_OWNED);
    }
#endif
    if (--vp->v_usecount < 0) {
        panic("vnode_rele_ext: vp %p usecount -ve : %d. v_tag = %d, v_type = %d, v_flag = %x.", vp, vp->v_usecount, vp->v_tag, vp->v_type, vp->v_flag);
    }

    if (fmode & FWRITE) {
        if (--vp->v_writecount < 0) {
            panic("vnode_rele_ext: vp %p writecount -ve : %d. v_tag = %d, v_type = %d, v_flag = %x.", vp, vp->v_writecount, vp->v_tag, vp->v_type, vp->v_flag);
        }
    }
    if (fmode & O_EVTONLY) {
        if (--vp->v_kusecount < 0) {
            panic("vnode_rele_ext: vp %p kusecount -ve : %d. v_tag = %d, v_type = %d, v_flag = %x.", vp, vp->v_kusecount, vp->v_tag, vp->v_type, vp->v_flag);
        }
    }
    if (vp->v_kusecount > vp->v_usecount) {
        panic("vnode_rele_ext: vp %p kusecount(%d) out of balance with usecount(%d). v_tag = %d, v_type = %d, v_flag = %x.", vp, vp->v_kusecount, vp->v_usecount, vp->v_tag, vp->v_type, vp->v_flag);
    }

    if ((vp->v_iocount > 0) || (vp->v_usecount > 0)) {
        /*
         * vnode is still busy... if we're the last
         * usecount, mark for a future call to VNOP_INACTIVE
         * when the iocount finally drops to 0
         */
        if (vp->v_usecount == 0) {
            vp->v_lflag |= VL_NEEDINACTIVE;
            vp->v_flag &= ~(VNOCACHE_DATA | VRAOFF | VOPENEVT);
        }
        goto done;
    }
    vp->v_flag &= ~(VNOCACHE_DATA | VRAOFF | VOPENEVT);

    if (ISSET(vp->v_lflag, VL_TERMINATE | VL_DEAD) || dont_reenter) {
        /*
         * vnode is being cleaned, or
         * we've requested that we don't reenter
         * the filesystem on this release... in
         * the latter case, we'll mark the vnode aged
         */
        if (dont_reenter) {
            if (!(vp->v_lflag & (VL_TERMINATE | VL_DEAD | VL_MARKTERM))) {
                vp->v_lflag |= VL_NEEDINACTIVE;

                if (vnode_on_reliable_media(vp) == FALSE || vp->v_flag & VISDIRTY) {
                    vnode_async_list_add(vp);
                    goto done;
                }
            }
            vp->v_flag |= VAGE;
        }
        vnode_list_add(vp);

        goto done;
    }
    /*
     * at this point both the iocount and usecount
     * are zero
     * pick up an iocount so that we can call
     * VNOP_INACTIVE with the vnode lock unheld
     */
    vp->v_iocount++;
#ifdef JOE_DEBUG
    record_vp(vp, 1);
#endif
    vp->v_lflag &= ~VL_NEEDINACTIVE;
    vnode_unlock(vp);

    VNOP_INACTIVE(vp, vfs_context_current());

    vnode_lock_spin(vp);
    /*
     * because we dropped the vnode lock to call VNOP_INACTIVE
     * the state of the vnode may have changed... we may have
     * picked up an iocount, usecount or the MARKTERM may have
     * been set... we need to reevaluate the reference counts
     * to determine if we can call vnode_reclaim_internal at
     * this point... if the reference counts are up, we'll pick
     * up the MARKTERM state when they get subsequently dropped
     */
    if ((vp->v_iocount == 1) && (vp->v_usecount == 0) &&
        ((vp->v_lflag & (VL_MARKTERM | VL_TERMINATE | VL_DEAD)) == VL_MARKTERM)) {
        struct uthread *ut;

        ut = get_bsdthread_info(current_thread());

        if (ut->uu_defer_reclaims) {
            vp->v_defer_reclaimlist = ut->uu_vreclaims;
            ut->uu_vreclaims = vp;
            goto done;
        }
        vnode_lock_convert(vp);
        vnode_reclaim_internal(vp, 1, 1, 0);
    }
    vnode_dropiocount(vp);
    vnode_list_add(vp);
done:
    if (vp->v_usecount == 0 && vp->v_type == VREG && !(vp->v_flag & VSYSTEM)) {
        if (vp->v_ubcinfo) {
            vnode_lock_convert(vp);
            memory_object_mark_unused(vp->v_ubcinfo->ui_control, (vp->v_flag & VRAGE) == VRAGE);
        }
    }
    if (!locked) {
        vnode_unlock(vp);
    }
    return;
}

/*
 * Remove any vnodes in the vnode table belonging to mount point mp.
 *
 * If MNT_NOFORCE is specified, there should not be any active ones,
 * return error if any are found (nb: this is a user error, not a
 * system error). If MNT_FORCE is specified, detach any active vnodes
 * that are found.
 */
#if DIAGNOSTIC
int busyprt = 0;	/* print out busy vnodes */
#endif

2042 int
2043 vflush(struct mount *mp, struct vnode *skipvp, int flags)
2044 {
2045 struct vnode *vp;
2046 int busy = 0;
2047 int reclaimed = 0;
2048 int retval;
2049 unsigned int vid;
2050
2051 /*
2052 * See comments in vnode_iterate() for the rationale for this lock
2053 */
2054 mount_iterate_lock(mp);
2055
2056 mount_lock(mp);
2057 vnode_iterate_setup(mp);
2058 /*
2059 * On regular unmounts (not forced), do a
2060 * quick check for vnodes in use. This
2061 * preserves the caching of vnodes. The automounter
2062 * tries unmounting every so often to see whether
2063 * a mount is still busy or not.
2064 */
2065 if (((flags & FORCECLOSE) == 0) && ((mp->mnt_kern_flag & MNTK_UNMOUNT_PREFLIGHT) != 0)) {
2066 if (vnode_umount_preflight(mp, skipvp, flags)) {
2067 vnode_iterate_clear(mp);
2068 mount_unlock(mp);
2069 mount_iterate_unlock(mp);
2070 return EBUSY;
2071 }
2072 }
2073 loop:
2074 /* If it returns 0 then there is nothing to do */
2075 retval = vnode_iterate_prepare(mp);
2076
2077 if (retval == 0) {
2078 vnode_iterate_clear(mp);
2079 mount_unlock(mp);
2080 mount_iterate_unlock(mp);
2081 return retval;
2082 }
2083
2084 /* iterate over all the vnodes */
2085 while (!TAILQ_EMPTY(&mp->mnt_workerqueue)) {
2086 vp = TAILQ_FIRST(&mp->mnt_workerqueue);
2087 TAILQ_REMOVE(&mp->mnt_workerqueue, vp, v_mntvnodes);
2088 TAILQ_INSERT_TAIL(&mp->mnt_vnodelist, vp, v_mntvnodes);
2089
2090 if ((vp->v_mount != mp) || (vp == skipvp)) {
2091 continue;
2092 }
2093 vid = vp->v_id;
2094 mount_unlock(mp);
2095
2096 vnode_lock_spin(vp);
2097
2098 // If vnode is already terminating, wait for it...
2099 while (vp->v_id == vid && ISSET(vp->v_lflag, VL_TERMINATE)) {
2100 vp->v_lflag |= VL_TERMWANT;
2101 msleep(&vp->v_lflag, &vp->v_lock, PVFS, "vflush", NULL);
2102 }
2103
2104 if ((vp->v_id != vid) || ISSET(vp->v_lflag, VL_DEAD)) {
2105 vnode_unlock(vp);
2106 mount_lock(mp);
2107 continue;
2108 }
2109
2110 /*
2111 * If requested, skip over vnodes marked VSYSTEM.
2112 * Skip over all vnodes marked VNOFLUSH.
2113 */
2114 if ((flags & SKIPSYSTEM) && ((vp->v_flag & VSYSTEM) ||
2115 (vp->v_flag & VNOFLUSH))) {
2116 vnode_unlock(vp);
2117 mount_lock(mp);
2118 continue;
2119 }
2120 /*
2121 * If requested, skip over vnodes marked VSWAP.
2122 */
2123 if ((flags & SKIPSWAP) && (vp->v_flag & VSWAP)) {
2124 vnode_unlock(vp);
2125 mount_lock(mp);
2126 continue;
2127 }
2128 /*
2129 * If requested, skip over vnodes marked VROOT.
2130 */
2131 if ((flags & SKIPROOT) && (vp->v_flag & VROOT)) {
2132 vnode_unlock(vp);
2133 mount_lock(mp);
2134 continue;
2135 }
2136 /*
2137 * If WRITECLOSE is set, only flush out regular file
2138 * vnodes open for writing.
2139 */
2140 if ((flags & WRITECLOSE) &&
2141 (vp->v_writecount == 0 || vp->v_type != VREG)) {
2142 vnode_unlock(vp);
2143 mount_lock(mp);
2144 continue;
2145 }
2146 /*
2147 * If the real usecount is 0, all we need to do is clear
2148 * out the vnode data structures and we are done.
2149 */
2150 if (((vp->v_usecount == 0) ||
2151 ((vp->v_usecount - vp->v_kusecount) == 0))) {
2152 vnode_lock_convert(vp);
2153 vp->v_iocount++; /* so that drain waits for other iocounts */
2154 #ifdef JOE_DEBUG
2155 record_vp(vp, 1);
2156 #endif
2157 vnode_reclaim_internal(vp, 1, 1, 0);
2158 vnode_dropiocount(vp);
2159 vnode_list_add(vp);
2160 vnode_unlock(vp);
2161
2162 reclaimed++;
2163 mount_lock(mp);
2164 continue;
2165 }
2166 /*
2167 * If FORCECLOSE is set, forcibly close the vnode.
2168 * For block or character devices, revert to an
2169 * anonymous device. For all other files, just kill them.
2170 */
2171 if (flags & FORCECLOSE) {
2172 vnode_lock_convert(vp);
2173
2174 if (vp->v_type != VBLK && vp->v_type != VCHR) {
2175 vp->v_iocount++; /* so that drain waits for other iocounts */
2176 #ifdef JOE_DEBUG
2177 record_vp(vp, 1);
2178 #endif
2179 vnode_abort_advlocks(vp);
2180 vnode_reclaim_internal(vp, 1, 1, 0);
2181 vnode_dropiocount(vp);
2182 vnode_list_add(vp);
2183 vnode_unlock(vp);
2184 } else {
2185 vclean(vp, 0);
2186 vp->v_lflag &= ~VL_DEAD;
2187 vp->v_op = spec_vnodeop_p;
2188 vp->v_flag |= VDEVFLUSH;
2189 vnode_unlock(vp);
2190 }
2191 mount_lock(mp);
2192 continue;
2193 }
2194 #if DIAGNOSTIC
2195 if (busyprt) {
2196 vprint("vflush: busy vnode", vp);
2197 }
2198 #endif
2199 vnode_unlock(vp);
2200 mount_lock(mp);
2201 busy++;
2202 }
2203
2204 /* At this point the worker queue is completed */
2205 if (busy && ((flags & FORCECLOSE) == 0) && reclaimed) {
2206 busy = 0;
2207 reclaimed = 0;
2208 (void)vnode_iterate_reloadq(mp);
2209 /* returned with mount lock held */
2210 goto loop;
2211 }
2212
2213 /* if new vnodes were created in between, retry the reclaim */
2214 if (vnode_iterate_reloadq(mp) != 0) {
2215 if (!(busy && ((flags & FORCECLOSE) == 0))) {
2216 goto loop;
2217 }
2218 }
2219 vnode_iterate_clear(mp);
2220 mount_unlock(mp);
2221 mount_iterate_unlock(mp);
2222
2223 if (busy && ((flags & FORCECLOSE) == 0)) {
2224 return EBUSY;
2225 }
2226 return 0;
2227 }
2228
2229 long num_recycledvnodes = 0;
2230 /*
2231 * Disassociate the underlying file system from a vnode.
2232 * The vnode lock is held on entry.
2233 */
2234 static void
2235 vclean(vnode_t vp, int flags)
2236 {
2237 vfs_context_t ctx = vfs_context_current();
2238 int active;
2239 int need_inactive;
2240 int already_terminating;
2241 int clflags = 0;
2242 #if NAMEDSTREAMS
2243 int is_namedstream;
2244 #endif
2245
2246 /*
2247 * Check to see if the vnode is in use.
2248 * If so we have to reference it before we clean it out
2249 * so that its count cannot fall to zero and generate a
2250 * race against ourselves to recycle it.
2251 */
2252 active = vp->v_usecount;
2253
2254 /*
2255 * just in case we missed sending a needed
2256 * VNOP_INACTIVE, we'll do it now
2257 */
2258 need_inactive = (vp->v_lflag & VL_NEEDINACTIVE);
2259
2260 vp->v_lflag &= ~VL_NEEDINACTIVE;
2261
2262 /*
2263 * Prevent the vnode from being recycled or
2264 * brought into use while we clean it out.
2265 */
2266 already_terminating = (vp->v_lflag & VL_TERMINATE);
2267
2268 vp->v_lflag |= VL_TERMINATE;
2269
2270 #if NAMEDSTREAMS
2271 is_namedstream = vnode_isnamedstream(vp);
2272 #endif
2273
2274 vnode_unlock(vp);
2275
2276 OSAddAtomicLong(1, &num_recycledvnodes);
2277
2278 if (flags & DOCLOSE) {
2279 clflags |= IO_NDELAY;
2280 }
2281 if (flags & REVOKEALL) {
2282 clflags |= IO_REVOKE;
2283 }
2284
2285 if (active && (flags & DOCLOSE)) {
2286 VNOP_CLOSE(vp, clflags, ctx);
2287 }
2288
2289 /*
2290 * Clean out any buffers associated with the vnode.
2291 */
2292 if (flags & DOCLOSE) {
2293 #if NFSCLIENT
2294 if (vp->v_tag == VT_NFS) {
2295 nfs_vinvalbuf(vp, V_SAVE, ctx, 0);
2296 } else
2297 #endif
2298 {
2299 VNOP_FSYNC(vp, MNT_WAIT, ctx);
2300
2301 /*
2302 * If the vnode is still in use (by the journal for
2303 * example) we don't want to invalidate locked buffers
2304 * here. In that case, either the journal will tidy them
2305 * up, or we will deal with it when the usecount is
2306 * finally released in vnode_rele_internal.
2307 */
2308 buf_invalidateblks(vp, BUF_WRITE_DATA | (active ? 0 : BUF_INVALIDATE_LOCKED), 0, 0);
2309 }
2310 if (UBCINFOEXISTS(vp)) {
2311 /*
2312 * Clean the pages in VM.
2313 */
2314 (void)ubc_msync(vp, (off_t)0, ubc_getsize(vp), NULL, UBC_PUSHALL | UBC_INVALIDATE | UBC_SYNC);
2315 }
2316 }
2317 if (active || need_inactive) {
2318 VNOP_INACTIVE(vp, ctx);
2319 }
2320
2321 #if NAMEDSTREAMS
2322 if ((is_namedstream != 0) && (vp->v_parent != NULLVP)) {
2323 vnode_t pvp = vp->v_parent;
2324
2325 /* Delete the shadow stream file before we reclaim its vnode */
2326 if (vnode_isshadow(vp)) {
2327 vnode_relenamedstream(pvp, vp);
2328 }
2329
2330 /*
2331 * No more streams associated with the parent. We
2332 * have a ref on it, so its identity is stable.
2333 * If the parent is on an opaque volume, then we need to know
2334 * whether it has associated named streams.
2335 */
2336 if (vfs_authopaque(pvp->v_mount)) {
2337 vnode_lock_spin(pvp);
2338 pvp->v_lflag &= ~VL_HASSTREAMS;
2339 vnode_unlock(pvp);
2340 }
2341 }
2342 #endif
2343
2344 /*
2345 * Destroy ubc named reference
2346 * cluster_release is done on this path
2347 * along with dropping the reference on the ucred
2348 * (and in the case of forced unmount of an mmap-ed file,
2349 * the ubc reference on the vnode is dropped here too).
2350 */
2351 ubc_destroy_named(vp);
2352
2353 #if CONFIG_TRIGGERS
2354 /*
2355 * cleanup trigger info from vnode (if any)
2356 */
2357 if (vp->v_resolve) {
2358 vnode_resolver_detach(vp);
2359 }
2360 #endif
2361
2362 /*
2363 * Reclaim the vnode.
2364 */
2365 if (VNOP_RECLAIM(vp, ctx)) {
2366 panic("vclean: cannot reclaim");
2367 }
2368
2369 // make sure the name & parent ptrs get cleaned out!
2370 vnode_update_identity(vp, NULLVP, NULL, 0, 0, VNODE_UPDATE_PARENT | VNODE_UPDATE_NAME | VNODE_UPDATE_PURGE);
2371
2372 vnode_lock(vp);
2373
2374 /*
2375 * Remove the vnode from any mount list it might be on. It is not
2376 * safe to do this any earlier because unmount needs to wait for
2377 * any vnodes to terminate and it cannot do that if it cannot find
2378 * them.
2379 */
2380 insmntque(vp, (struct mount *)0);
2381
2382 vp->v_mount = dead_mountp;
2383 vp->v_op = dead_vnodeop_p;
2384 vp->v_tag = VT_NON;
2385 vp->v_data = NULL;
2386
2387 vp->v_lflag |= VL_DEAD;
2388 vp->v_flag &= ~VISDIRTY;
2389
2390 if (already_terminating == 0) {
2391 vp->v_lflag &= ~VL_TERMINATE;
2392 /*
2393 * Done with purge, notify sleepers of the grim news.
2394 */
2395 if (vp->v_lflag & VL_TERMWANT) {
2396 vp->v_lflag &= ~VL_TERMWANT;
2397 wakeup(&vp->v_lflag);
2398 }
2399 }
2400 }
2401
2402 /*
2403 * Eliminate all activity associated with the requested vnode
2404 * and with all vnodes aliased to the requested vnode.
2405 */
2406 int
2407 #if DIAGNOSTIC
2408 vn_revoke(vnode_t vp, int flags, __unused vfs_context_t a_context)
2409 #else
2410 vn_revoke(vnode_t vp, __unused int flags, __unused vfs_context_t a_context)
2411 #endif
2412 {
2413 struct vnode *vq;
2414 int vid;
2415
2416 #if DIAGNOSTIC
2417 if ((flags & REVOKEALL) == 0) {
2418 panic("vnop_revoke");
2419 }
2420 #endif
2421
2422 if (vnode_isaliased(vp)) {
2423 /*
2424 * If a vgone (or vclean) is already in progress,
2425 * return an immediate error
2426 */
2427 if (vp->v_lflag & VL_TERMINATE) {
2428 return ENOENT;
2429 }
2430
2431 /*
2432 * Ensure that vp will not be vgone'd while we
2433 * are eliminating its aliases.
2434 */
2435 SPECHASH_LOCK();
2436 while ((vp->v_specflags & SI_ALIASED)) {
2437 for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
2438 if (vq->v_rdev != vp->v_rdev ||
2439 vq->v_type != vp->v_type || vp == vq) {
2440 continue;
2441 }
2442 vid = vq->v_id;
2443 SPECHASH_UNLOCK();
2444 if (vnode_getwithvid(vq, vid)) {
2445 SPECHASH_LOCK();
2446 break;
2447 }
2448 vnode_lock(vq);
2449 if (!(vq->v_lflag & VL_TERMINATE)) {
2450 vnode_reclaim_internal(vq, 1, 1, 0);
2451 }
2452 vnode_put_locked(vq);
2453 vnode_unlock(vq);
2454 SPECHASH_LOCK();
2455 break;
2456 }
2457 }
2458 SPECHASH_UNLOCK();
2459 }
2460 vnode_lock(vp);
2461 if (vp->v_lflag & VL_TERMINATE) {
2462 vnode_unlock(vp);
2463 return ENOENT;
2464 }
2465 vnode_reclaim_internal(vp, 1, 0, REVOKEALL);
2466 vnode_unlock(vp);
2467
2468 return 0;
2469 }
2470
2471 /*
2472 * Recycle an unused vnode: reclaim it immediately if it is idle;
2473 * otherwise mark it for termination when its counts drop to zero.
2474 */
2475 int
2476 vnode_recycle(struct vnode *vp)
2477 {
2478 vnode_lock_spin(vp);
2479
2480 if (vp->v_iocount || vp->v_usecount) {
2481 vp->v_lflag |= VL_MARKTERM;
2482 vnode_unlock(vp);
2483 return 0;
2484 }
2485 vnode_lock_convert(vp);
2486 vnode_reclaim_internal(vp, 1, 0, 0);
2487
2488 vnode_unlock(vp);
2489
2490 return 1;
2491 }
2492
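/*
 * Mark a vnode that already holds an iocount for termination once that
 * iocount is dropped; returns 1 on success, 0 if the vnode is otherwise
 * in use.
 */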
2493 static int
2494 vnode_reload(vnode_t vp)
2495 {
2496 vnode_lock_spin(vp);
2497
2498 if ((vp->v_iocount > 1) || vp->v_usecount) {
2499 vnode_unlock(vp);
2500 return 0;
2501 }
2502 if (vp->v_iocount <= 0) {
2503 panic("vnode_reload with no iocount %d", vp->v_iocount);
2504 }
2505
2506 /* mark for release when iocount is dropped */
2507 vp->v_lflag |= VL_MARKTERM;
2508 vnode_unlock(vp);
2509
2510 return 1;
2511 }
2512
2513
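/*
 * Kill a vnode: disassociate it from its filesystem via vclean and,
 * for block/character devices, remove it from the special device
 * alias chain (freeing its specinfo).
 */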
2514 static void
2515 vgone(vnode_t vp, int flags)
2516 {
2517 struct vnode *vq;
2518 struct vnode *vx;
2519
2520 /*
2521 * Clean out the filesystem specific data.
2522 * vclean also takes care of removing the
2523 * vnode from any mount list it might be on
2524 */
2525 vclean(vp, flags | DOCLOSE);
2526
2527 /*
2528 * If special device, remove it from special device alias list
2529 * if it is on one.
2530 */
2531 if ((vp->v_type == VBLK || vp->v_type == VCHR) && vp->v_specinfo != 0) {
2532 SPECHASH_LOCK();
2533 if (*vp->v_hashchain == vp) {
2534 *vp->v_hashchain = vp->v_specnext;
2535 } else {
2536 for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
2537 if (vq->v_specnext != vp) {
2538 continue;
2539 }
2540 vq->v_specnext = vp->v_specnext;
2541 break;
2542 }
2543 if (vq == NULL) {
2544 panic("missing bdev");
2545 }
2546 }
2547 if (vp->v_specflags & SI_ALIASED) {
2548 vx = NULL;
2549 for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
2550 if (vq->v_rdev != vp->v_rdev ||
2551 vq->v_type != vp->v_type) {
2552 continue;
2553 }
2554 if (vx) {
2555 break;
2556 }
2557 vx = vq;
2558 }
2559 if (vx == NULL) {
2560 panic("missing alias");
2561 }
2562 if (vq == NULL) {
2563 vx->v_specflags &= ~SI_ALIASED;
2564 }
2565 vp->v_specflags &= ~SI_ALIASED;
2566 }
2567 SPECHASH_UNLOCK();
2568 {
2569 struct specinfo *tmp = vp->v_specinfo;
2570 vp->v_specinfo = NULL;
2571 FREE_ZONE(tmp, sizeof(struct specinfo), M_SPECINFO);
2572 }
2573 }
2574 }
2575
2576 /*
2577 * Check whether a filesystem is mounted on the device given by (dev, type).
2578 */
2579 int
2580 check_mountedon(dev_t dev, enum vtype type, int *errorp)
2581 {
2582 vnode_t vp;
2583 int rc = 0;
2584 int vid;
2585
2586 loop:
2587 SPECHASH_LOCK();
2588 for (vp = speclisth[SPECHASH(dev)]; vp; vp = vp->v_specnext) {
2589 if (dev != vp->v_rdev || type != vp->v_type) {
2590 continue;
2591 }
2592 vid = vp->v_id;
2593 SPECHASH_UNLOCK();
2594 if (vnode_getwithvid(vp, vid)) {
2595 goto loop;
2596 }
2597 vnode_lock_spin(vp);
2598 if ((vp->v_usecount > 0) || (vp->v_iocount > 1)) {
2599 vnode_unlock(vp);
2600 if ((*errorp = vfs_mountedon(vp)) != 0) {
2601 rc = 1;
2602 }
2603 } else {
2604 vnode_unlock(vp);
2605 }
2606 vnode_put(vp);
2607 return rc;
2608 }
2609 SPECHASH_UNLOCK();
2610 return 0;
2611 }
2612
2613 /*
2614 * Calculate the total number of references to a special device.
2615 */
2616 int
2617 vcount(vnode_t vp)
2618 {
2619 vnode_t vq, vnext;
2620 int count;
2621 int vid;
2622
2623 if (!vnode_isspec(vp)) {
2624 return vp->v_usecount - vp->v_kusecount;
2625 }
2626
2627 loop:
2628 if (!vnode_isaliased(vp)) {
2629 return vp->v_specinfo->si_opencount;
2630 }
2631 count = 0;
2632
2633 SPECHASH_LOCK();
2634 /*
2635 * Grab first vnode and its vid.
2636 */
2637 vq = *vp->v_hashchain;
2638 vid = vq ? vq->v_id : 0;
2639
2640 SPECHASH_UNLOCK();
2641
2642 while (vq) {
2643 /*
2644 * Attempt to get the vnode outside the SPECHASH lock.
2645 */
2646 if (vnode_getwithvid(vq, vid)) {
2647 goto loop;
2648 }
2649 vnode_lock(vq);
2650
2651 if (vq->v_rdev == vp->v_rdev && vq->v_type == vp->v_type) {
2652 if ((vq->v_usecount == 0) && (vq->v_iocount == 1) && vq != vp) {
2653 /*
2654 * Alias, but not in use, so flush it out.
2655 */
2656 vnode_reclaim_internal(vq, 1, 1, 0);
2657 vnode_put_locked(vq);
2658 vnode_unlock(vq);
2659 goto loop;
2660 }
2661 count += vq->v_specinfo->si_opencount;
2662 }
2663 vnode_unlock(vq);
2664
2665 SPECHASH_LOCK();
2666 /*
2667 * must do this with the reference still held on 'vq'
2668 * so that it can't be destroyed while we're poking
2669 * through v_specnext
2670 */
2671 vnext = vq->v_specnext;
2672 vid = vnext ? vnext->v_id : 0;
2673
2674 SPECHASH_UNLOCK();
2675
2676 vnode_put(vq);
2677
2678 vq = vnext;
2679 }
2680
2681 return count;
2682 }
2683
2684 int prtactive = 0; /* 1 => print out reclaim of active vnodes */
2685
2686 /*
2687 * Print out a description of a vnode.
2688 */
2689 static const char *typename[] =
2690 { "VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD" };
2691
2692 void
2693 vprint(const char *label, struct vnode *vp)
2694 {
2695 char sbuf[64];
2696
2697 if (label != NULL) {
2698 printf("%s: ", label);
2699 }
2700 printf("type %s, usecount %d, writecount %d",
2701 typename[vp->v_type], vp->v_usecount, vp->v_writecount);
2702 sbuf[0] = '\0';
2703 if (vp->v_flag & VROOT) {
2704 strlcat(sbuf, "|VROOT", sizeof(sbuf));
2705 }
2706 if (vp->v_flag & VTEXT) {
2707 strlcat(sbuf, "|VTEXT", sizeof(sbuf));
2708 }
2709 if (vp->v_flag & VSYSTEM) {
2710 strlcat(sbuf, "|VSYSTEM", sizeof(sbuf));
2711 }
2712 if (vp->v_flag & VNOFLUSH) {
2713 strlcat(sbuf, "|VNOFLUSH", sizeof(sbuf));
2714 }
2715 if (vp->v_flag & VBWAIT) {
2716 strlcat(sbuf, "|VBWAIT", sizeof(sbuf));
2717 }
2718 if (vnode_isaliased(vp)) {
2719 strlcat(sbuf, "|VALIASED", sizeof(sbuf));
2720 }
2721 if (sbuf[0] != '\0') {
2722 printf(" flags (%s)", &sbuf[1]);
2723 }
2724 }
2725
2726
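/*
 * Build the path for a vnode from the name cache only; the _fsenter
 * variant below may re-enter the filesystem to resolve the path.
 */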
2727 int
2728 vn_getpath(struct vnode *vp, char *pathbuf, int *len)
2729 {
2730 return build_path(vp, pathbuf, *len, len, BUILDPATH_NO_FS_ENTER, vfs_context_current());
2731 }
2732
2733 int
2734 vn_getpath_fsenter(struct vnode *vp, char *pathbuf, int *len)
2735 {
2736 return build_path(vp, pathbuf, *len, len, 0, vfs_context_current());
2737 }
2738
2739 /*
2740 * vn_getpath_fsenter_with_parent will reenter the file system to find the path of the
2741 * vnode. It requires that there are IO counts on both the vnode and the directory vnode.
2742 *
2743 * vn_getpath_fsenter is called by MAC hooks to authorize operations for everything but
2744 * unlink, rmdir and rename. For those operations the MAC hook calls vn_getpath. This presents
2745 * a problem: if the path cannot be found from the name cache, those operations can
2746 * erroneously fail with EPERM even though the call should succeed. When removing or moving
2747 * file system objects with operations such as unlink or rename, those operations need to
2748 * take IO counts on the target and containing directory. Calling vn_getpath_fsenter from a
2749 * MAC hook for these operations during a forced unmount can lead to deadlock.
2750 * This happens when the operation starts and IO counts are taken on the containing
2751 * directories and targets. Before the MAC hook is called, a forced unmount from another
2752 * thread takes place and blocks on the ongoing operation's directory vnode in vdrain.
2753 * The MAC hook then gets called and calls vn_getpath_fsenter. vn_getpath_fsenter
2754 * is called with the understanding that there is an IO count on the target. If, in
2755 * build_path, the directory vnode is no longer in the cache, then the parent object id
2756 * obtained via vnode_getattr from the target is used to call VFS_VGET to get the parent
2757 * vnode. The file system's VFS_VGET then looks up by inode in its hash and tries to get
2758 * an IO count. But VFS_VGET "sees" that the directory vnode is in vdrain and can block,
2759 * depending on which version of the vnode_get family of interfaces it calls and how.
2760 *
2761 * N.B. A reasonable interface to use is vnode_getwithvid. This interface was modified to
2762 * call vnode_getiocount with VNODE_DRAINO, so it will happily get an IO count and not
2763 * cause issues, but there is no guarantee that all or any file systems are doing that.
2764 *
2765 * vn_getpath_fsenter_with_parent can enter the file system safely since there is a known
2766 * IO count on the directory vnode; it does so by calling build_path_with_parent.
2767 */
2768
2769 int
2770 vn_getpath_fsenter_with_parent(struct vnode *dvp, struct vnode *vp, char *pathbuf, int *len)
2771 {
2772 return build_path_with_parent(vp, dvp, pathbuf, *len, len, 0, vfs_context_current());
2773 }
2774
2775 int
2776 vn_getcdhash(struct vnode *vp, off_t offset, unsigned char *cdhash)
2777 {
2778 return ubc_cs_getcdhash(vp, offset, cdhash);
2779 }
2780
2781
2782 static char *extension_table = NULL;
2783 static int nexts;
2784 static int max_ext_width;
2785
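/* qsort comparator: orders package extensions by increasing string length */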
2786 static int
2787 extension_cmp(const void *a, const void *b)
2788 {
2789 return strlen((const char *)a) - strlen((const char *)b);
2790 }
2791
2792
2793 //
2794 // This is the api LaunchServices uses to inform the kernel
2795 // the list of package extensions to ignore.
2796 //
2797 // Internally we keep the list sorted by the length of
2798 // the extension (from shortest to longest). We sort the
2799 // list of extensions so that we can speed up our searches
2800 // when comparing file names -- we only compare extensions
2801 // that could possibly fit into the file name, not all of
2802 // them (i.e. a short 8 character name can't have an 8
2803 // character extension).
2804 //
2805 extern lck_mtx_t *pkg_extensions_lck;
2806
2807 __private_extern__ int
2808 set_package_extensions_table(user_addr_t data, int nentries, int maxwidth)
2809 {
2810 char *new_exts, *old_exts;
2811 int error;
2812
2813 if (nentries <= 0 || nentries > 1024 || maxwidth <= 0 || maxwidth > 255) {
2814 return EINVAL;
2815 }
2816
2817
2818 // allocate one byte extra so we can guarantee null termination
2819 MALLOC(new_exts, char *, (nentries * maxwidth) + 1, M_TEMP, M_WAITOK);
2820 if (new_exts == NULL) {
2821 return ENOMEM;
2822 }
2823
2824 error = copyin(data, new_exts, nentries * maxwidth);
2825 if (error) {
2826 FREE(new_exts, M_TEMP);
2827 return error;
2828 }
2829
2830 new_exts[(nentries * maxwidth)] = '\0'; // guarantee null termination of the block
2831
2832 qsort(new_exts, nentries, maxwidth, extension_cmp);
2833
2834 lck_mtx_lock(pkg_extensions_lck);
2835
2836 old_exts = extension_table;
2837 extension_table = new_exts;
2838 nexts = nentries;
2839 max_ext_width = maxwidth;
2840
2841 lck_mtx_unlock(pkg_extensions_lck);
2842
2843 if (old_exts) {
2844 FREE(old_exts, M_TEMP);
2845 }
2846
2847 return 0;
2848 }
2849
2850
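/*
 * Return 1 if the final '.'-extension of 'name' matches one of the
 * registered package extensions (case-insensitive), 0 otherwise.
 */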
2851 int
2852 is_package_name(const char *name, int len)
2853 {
2854 int i, extlen;
2855 const char *ptr, *name_ext;
2856
2857 if (len <= 3) {
2858 return 0;
2859 }
2860
2861 name_ext = NULL;
2862 for (ptr = name; *ptr != '\0'; ptr++) {
2863 if (*ptr == '.') {
2864 name_ext = ptr;
2865 }
2866 }
2867
2868 // if there is no "." extension, it can't match
2869 if (name_ext == NULL) {
2870 return 0;
2871 }
2872
2873 // advance over the "."
2874 name_ext++;
2875
2876 lck_mtx_lock(pkg_extensions_lck);
2877
2878 // now iterate over all the extensions to see if any match
2879 ptr = &extension_table[0];
2880 for (i = 0; i < nexts; i++, ptr += max_ext_width) {
2881 extlen = strlen(ptr);
2882 if (strncasecmp(name_ext, ptr, extlen) == 0 && name_ext[extlen] == '\0') {
2883 // aha, a match!
2884 lck_mtx_unlock(pkg_extensions_lck);
2885 return 1;
2886 }
2887 }
2888
2889 lck_mtx_unlock(pkg_extensions_lck);
2890
2891 // if we get here, no extension matched
2892 return 0;
2893 }
2894
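/*
 * Walk each '/'-separated component of 'path' and set *component to the
 * index of the first component that is a package name (-1 if none).
 */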
2895 int
2896 vn_path_package_check(__unused vnode_t vp, char *path, int pathlen, int *component)
2897 {
2898 char *ptr, *end;
2899 int comp = 0;
2900
2901 *component = -1;
2902 if (*path != '/') {
2903 return EINVAL;
2904 }
2905
2906 end = path + 1;
2907 while (end < path + pathlen && *end != '\0') {
2908 while (end < path + pathlen && *end == '/' && *end != '\0') {
2909 end++;
2910 }
2911
2912 ptr = end;
2913
2914 while (end < path + pathlen && *end != '/' && *end != '\0') {
2915 end++;
2916 }
2917
2918 if (end > path + pathlen) {
2919 // hmm, string wasn't null terminated
2920 return EINVAL;
2921 }
2922
2923 *end = '\0';
2924 if (is_package_name(ptr, end - ptr)) {
2925 *component = comp;
2926 break;
2927 }
2928
2929 end++;
2930 comp++;
2931 }
2932
2933 return 0;
2934 }
2935
2936 /*
2937 * Determine if a name is inappropriate for a searchfs query.
2938 * This list consists of /System currently.
2939 */
2940
2941 int
2942 vn_searchfs_inappropriate_name(const char *name, int len)
2943 {
2944 const char *bad_names[] = { "System" };
2945 int bad_len[] = { 6 };
2946 int i;
2947
2948 for (i = 0; i < (int) (sizeof(bad_names) / sizeof(bad_names[0])); i++) {
2949 if (len == bad_len[i] && strncmp(name, bad_names[i], strlen(bad_names[i]) + 1) == 0) {
2950 return 1;
2951 }
2952 }
2953
2954 // if we get here, no name matched
2955 return 0;
2956 }
2957
2958 /*
2959 * Top level filesystem related information gathering.
2960 */
2961 extern unsigned int vfs_nummntops;
2962
2963 /*
2964 * VFS_NUMMNTOPS shouldn't be at name[1] since it
2965 * is a VFS generic variable. Since we no longer support
2966 * VT_UFS, we reserve its value to support this sysctl node.
2967 *
2968 * It should have been:
2969 * name[0]: VFS_GENERIC
2970 * name[1]: VFS_NUMMNTOPS
2971 */
2972 SYSCTL_INT(_vfs, VFS_NUMMNTOPS, nummntops,
2973 CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED,
2974 &vfs_nummntops, 0, "");
2975
2976 int
2977 vfs_sysctl(int *name __unused, u_int namelen __unused,
2978 user_addr_t oldp __unused, size_t *oldlenp __unused,
2979 user_addr_t newp __unused, size_t newlen __unused, proc_t p __unused);
2980
2981 int
2982 vfs_sysctl(int *name __unused, u_int namelen __unused,
2983 user_addr_t oldp __unused, size_t *oldlenp __unused,
2984 user_addr_t newp __unused, size_t newlen __unused, proc_t p __unused)
2985 {
2986 return EINVAL;
2987 }
2988
2989
2990 //
2991 // The following code disallows specific sysctl's that came through
2992 // the direct sysctl interface (vfs_sysctl_node) instead of the newer
2993 // sysctl_vfs_ctlbyfsid() interface. We cannot allow these selectors
2994 // through vfs_sysctl_node() because it passes the user's oldp pointer
2995 // directly to the file system, which (for these selectors) casts it
2996 // back to a struct sysctl_req and then proceeds to use SYSCTL_IN(),
2997 // which jumps through an arbitrary function pointer. When called
2998 // through the sysctl_vfs_ctlbyfsid() interface this does not happen
2999 // and so it's safe.
3000 //
3001 // Unfortunately we have to pull in definitions from AFP and SMB and
3002 // perform explicit name checks on the file system to determine if
3003 // these selectors are being used.
3004 //
3005
3006 #define AFPFS_VFS_CTL_GETID 0x00020001
3007 #define AFPFS_VFS_CTL_NETCHANGE 0x00020002
3008 #define AFPFS_VFS_CTL_VOLCHANGE 0x00020003
3009
3010 #define SMBFS_SYSCTL_REMOUNT 1
3011 #define SMBFS_SYSCTL_REMOUNT_INFO 2
3012 #define SMBFS_SYSCTL_GET_SERVER_SHARE 3
3013
3014
3015 static int
3016 is_bad_sysctl_name(struct vfstable *vfsp, int selector_name)
3017 {
3018 switch (selector_name) {
3019 case VFS_CTL_QUERY:
3020 case VFS_CTL_TIMEO:
3021 case VFS_CTL_NOLOCKS:
3022 case VFS_CTL_NSTATUS:
3023 case VFS_CTL_SADDR:
3024 case VFS_CTL_DISC:
3025 case VFS_CTL_SERVERINFO:
3026 return 1;
3027
3028 default:
3029 break;
3030 }
3031
3032 // the more complicated check for some of SMB's special values
3033 if (strcmp(vfsp->vfc_name, "smbfs") == 0) {
3034 switch (selector_name) {
3035 case SMBFS_SYSCTL_REMOUNT:
3036 case SMBFS_SYSCTL_REMOUNT_INFO:
3037 case SMBFS_SYSCTL_GET_SERVER_SHARE:
3038 return 1;
3039 }
3040 } else if (strcmp(vfsp->vfc_name, "afpfs") == 0) {
3041 switch (selector_name) {
3042 case AFPFS_VFS_CTL_GETID:
3043 case AFPFS_VFS_CTL_NETCHANGE:
3044 case AFPFS_VFS_CTL_VOLCHANGE:
3045 return 1;
3046 }
3047 }
3048
3049 //
3050 // If we get here we passed all the checks so the selector is ok
3051 //
3052 return 0;
3053 }
3054
3055
3056 int vfs_sysctl_node SYSCTL_HANDLER_ARGS
3057 {
3058 int *name, namelen;
3059 struct vfstable *vfsp;
3060 int error;
3061 int fstypenum;
3062
3063 fstypenum = oidp->oid_number;
3064 name = arg1;
3065 namelen = arg2;
3066
3067 /* all sysctl names at this level should have at least one name slot for the FS */
3068 if (namelen < 1) {
3069 return EISDIR; /* overloaded */
3070 }
3071 mount_list_lock();
3072 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
3073 if (vfsp->vfc_typenum == fstypenum) {
3074 vfsp->vfc_refcount++;
3075 break;
3076 }
3077 }
3078 mount_list_unlock();
3079
3080 if (vfsp == NULL) {
3081 return ENOTSUP;
3082 }
3083
3084 if (is_bad_sysctl_name(vfsp, name[0])) {
3085 printf("vfs: bad selector 0x%.8x for old-style sysctl(). use the sysctl-by-fsid interface instead\n", name[0]);
3086 return EPERM;
3087 }
3088
3089 error = (vfsp->vfc_vfsops->vfs_sysctl)(name, namelen, req->oldptr, &req->oldlen, req->newptr, req->newlen, vfs_context_current());
3090
3091 mount_list_lock();
3092 vfsp->vfc_refcount--;
3093 mount_list_unlock();
3094
3095 return error;
3096 }
3097
3098 /*
3099 * Check to see if a filesystem is mounted on a block device.
3100 */
3101 int
3102 vfs_mountedon(struct vnode *vp)
3103 {
3104 struct vnode *vq;
3105 int error = 0;
3106
3107 SPECHASH_LOCK();
3108 if (vp->v_specflags & SI_MOUNTEDON) {
3109 error = EBUSY;
3110 goto out;
3111 }
3112 if (vp->v_specflags & SI_ALIASED) {
3113 for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
3114 if (vq->v_rdev != vp->v_rdev ||
3115 vq->v_type != vp->v_type) {
3116 continue;
3117 }
3118 if (vq->v_specflags & SI_MOUNTEDON) {
3119 error = EBUSY;
3120 break;
3121 }
3122 }
3123 }
3124 out:
3125 SPECHASH_UNLOCK();
3126 return error;
3127 }
3128
3129 struct unmount_info {
3130 int u_errs; // Total failed unmounts
3131 int u_busy; // EBUSY failed unmounts
3132 };
3133
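/*
 * vfs_iterate callback: force-unmount one mount, tallying total and
 * EBUSY failures in the unmount_info accumulator.
 */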
3134 static int
3135 unmount_callback(mount_t mp, void *arg)
3136 {
3137 int error;
3138 char *mntname;
3139 struct unmount_info *uip = arg;
3140
3141 mount_ref(mp, 0);
3142 mount_iterdrop(mp); // avoid vfs_iterate deadlock in dounmount()
3143
3144 MALLOC_ZONE(mntname, void *, MAXPATHLEN, M_NAMEI, M_WAITOK);
3145 if (mntname) {
3146 strlcpy(mntname, mp->mnt_vfsstat.f_mntonname, MAXPATHLEN);
3147 }
3148
3149 error = dounmount(mp, MNT_FORCE, 1, vfs_context_current());
3150 if (error) {
3151 uip->u_errs++;
3152 printf("Unmount of %s failed (%d)\n", mntname ? mntname:"?", error);
3153 if (error == EBUSY) {
3154 uip->u_busy++;
3155 }
3156 }
3157 if (mntname) {
3158 FREE_ZONE(mntname, MAXPATHLEN, M_NAMEI);
3159 }
3160
3161 return VFS_RETURNED;
3162 }
3163
3164 /*
3165 * Unmount all filesystems. The list is traversed in reverse order
3166 * of mounting to avoid dependencies.
3167 * Busy mounts are retried.
3168 */
3169 __private_extern__ void
3170 vfs_unmountall(void)
3171 {
3172 int mounts, sec = 1;
3173 struct unmount_info ui;
3174
3175 retry:
3176 ui.u_errs = ui.u_busy = 0;
3177 vfs_iterate(VFS_ITERATE_CB_DROPREF | VFS_ITERATE_TAIL_FIRST, unmount_callback, &ui);
3178 mounts = mount_getvfscnt();
3179 if (mounts == 0) {
3180 return;
3181 }
3182
3183 if (ui.u_busy > 0) { // Busy mounts - wait & retry
3184 tsleep(&nummounts, PVFS, "busy mount", sec * hz);
3185 sec *= 2;
3186 if (sec <= 32) {
3187 goto retry;
3188 }
3189 printf("Unmounting timed out\n");
3190 } else if (ui.u_errs < mounts) {
3191 // If the vfs_iterate missed mounts in progress - wait a bit
3192 tsleep(&nummounts, PVFS, "missed mount", 2 * hz);
3193 }
3194 }
3195
3196 /*
3197 * This routine is called from vnode_pager_deallocate out of the VM
3198 * The path to vnode_pager_deallocate can only be initiated by ubc_destroy_named
3199 * on a vnode that has a UBCINFO
3200 */
3201 __private_extern__ void
3202 vnode_pager_vrele(vnode_t vp)
3203 {
3204 struct ubc_info *uip;
3205
3206 vnode_lock_spin(vp);
3207
3208 vp->v_lflag &= ~VNAMED_UBC;
3209 if (vp->v_usecount != 0) {
3210 /*
3211 * At the eleventh hour, just before the ubcinfo is
3212 * destroyed, ensure the ubc-specific v_usecount
3213 * reference has gone. We use v_usecount != 0 as a hint;
3214 * ubc_unmap() does nothing if there's no mapping.
3215 *
3216 * This case is caused by coming here via forced unmount,
3217 * versus the usual vm_object_deallocate() path.
3218 * In the forced unmount case, ubc_destroy_named()
3219 * releases the pager before memory_object_last_unmap()
3220 * can be called.
3221 */
3222 vnode_unlock(vp);
3223 ubc_unmap(vp);
3224 vnode_lock_spin(vp);
3225 }
3226
3227 uip = vp->v_ubcinfo;
3228 vp->v_ubcinfo = UBC_INFO_NULL;
3229
3230 vnode_unlock(vp);
3231
3232 ubc_info_deallocate(uip);
3233 }
3234
3235
3236 #include <sys/disk.h>
3237
3238 u_int32_t rootunit = (u_int32_t)-1;
3239
3240 #if CONFIG_IOSCHED
3241 extern int lowpri_throttle_enabled;
3242 extern int iosched_enabled;
3243 #endif
3244
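/*
 * Query the underlying device with DKIOC* ioctls and record its I/O
 * geometry and capabilities (max transfer sizes, segment limits,
 * alignment, queue depth, feature flags) in the mount structure.
 */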
3245 errno_t
3246 vfs_init_io_attributes(vnode_t devvp, mount_t mp)
3247 {
3248 int error;
3249 off_t readblockcnt = 0;
3250 off_t writeblockcnt = 0;
3251 off_t readmaxcnt = 0;
3252 off_t writemaxcnt = 0;
3253 off_t readsegcnt = 0;
3254 off_t writesegcnt = 0;
3255 off_t readsegsize = 0;
3256 off_t writesegsize = 0;
3257 off_t alignment = 0;
3258 u_int32_t minsaturationbytecount = 0;
3259 u_int32_t ioqueue_depth = 0;
3260 u_int32_t blksize;
3261 u_int64_t temp;
3262 u_int32_t features;
3263 vfs_context_t ctx = vfs_context_current();
3264 dk_corestorage_info_t cs_info;
3265 boolean_t cs_present = FALSE;
3266 int isssd = 0;
3267 int isvirtual = 0;
3268
3269
3270 VNOP_IOCTL(devvp, DKIOCGETTHROTTLEMASK, (caddr_t)&mp->mnt_throttle_mask, 0, NULL);
3271 /*
3272 * as a reasonable approximation, only use the lowest bit of the mask
3273 * to generate a disk unit number
3274 */
3275 mp->mnt_devbsdunit = num_trailing_0(mp->mnt_throttle_mask);
3276
3277 if (devvp == rootvp) {
3278 rootunit = mp->mnt_devbsdunit;
3279 }
3280
3281 if (mp->mnt_devbsdunit == rootunit) {
3282 /*
3283 * this mount point exists on the same device as the root
3284 * partition, so it comes under the hard throttle control...
3285 * this is true even for the root mount point itself
3286 */
3287 mp->mnt_kern_flag |= MNTK_ROOTDEV;
3288 }
3289 /*
3290 * force the spec device to re-cache
3291 * the underlying block size in case
3292 * the filesystem overrode the initial value
3293 */
3294 set_fsblocksize(devvp);
3295
3296
3297 if ((error = VNOP_IOCTL(devvp, DKIOCGETBLOCKSIZE,
3298 (caddr_t)&blksize, 0, ctx))) {
3299 return error;
3300 }
3301
3302 mp->mnt_devblocksize = blksize;
3303
3304 /*
3305 * set the maximum possible I/O size
3306 * this may get clipped to a smaller value
3307 * based on which constraints are being advertised
3308 * and if those advertised constraints result in a smaller
3309 * limit for a given I/O
3310 */
3311 mp->mnt_maxreadcnt = MAX_UPL_SIZE_BYTES;
3312 mp->mnt_maxwritecnt = MAX_UPL_SIZE_BYTES;
3313
3314 if (VNOP_IOCTL(devvp, DKIOCISVIRTUAL, (caddr_t)&isvirtual, 0, ctx) == 0) {
3315 if (isvirtual) {
3316 mp->mnt_kern_flag |= MNTK_VIRTUALDEV;
3317 }
3318 }
3319 if (VNOP_IOCTL(devvp, DKIOCISSOLIDSTATE, (caddr_t)&isssd, 0, ctx) == 0) {
3320 if (isssd) {
3321 mp->mnt_kern_flag |= MNTK_SSD;
3322 }
3323 }
3324 if ((error = VNOP_IOCTL(devvp, DKIOCGETFEATURES,
3325 (caddr_t)&features, 0, ctx))) {
3326 return error;
3327 }
3328
3329 if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXBLOCKCOUNTREAD,
3330 (caddr_t)&readblockcnt, 0, ctx))) {
3331 return error;
3332 }
3333
3334 if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXBLOCKCOUNTWRITE,
3335 (caddr_t)&writeblockcnt, 0, ctx))) {
3336 return error;
3337 }
3338
3339 if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXBYTECOUNTREAD,
3340 (caddr_t)&readmaxcnt, 0, ctx))) {
3341 return error;
3342 }
3343
3344 if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXBYTECOUNTWRITE,
3345 (caddr_t)&writemaxcnt, 0, ctx))) {
3346 return error;
3347 }
3348
3349 if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXSEGMENTCOUNTREAD,
3350 (caddr_t)&readsegcnt, 0, ctx))) {
3351 return error;
3352 }
3353
3354 if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXSEGMENTCOUNTWRITE,
3355 (caddr_t)&writesegcnt, 0, ctx))) {
3356 return error;
3357 }
3358
3359 if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXSEGMENTBYTECOUNTREAD,
3360 (caddr_t)&readsegsize, 0, ctx))) {
3361 return error;
3362 }
3363
3364 if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXSEGMENTBYTECOUNTWRITE,
3365 (caddr_t)&writesegsize, 0, ctx))) {
3366 return error;
3367 }
3368
3369 if ((error = VNOP_IOCTL(devvp, DKIOCGETMINSEGMENTALIGNMENTBYTECOUNT,
3370 (caddr_t)&alignment, 0, ctx))) {
3371 return error;
3372 }
3373
3374 if ((error = VNOP_IOCTL(devvp, DKIOCGETCOMMANDPOOLSIZE,
3375 (caddr_t)&ioqueue_depth, 0, ctx))) {
3376 return error;
3377 }
3378
3379 if (readmaxcnt) {
3380 mp->mnt_maxreadcnt = (readmaxcnt > UINT32_MAX) ? UINT32_MAX : readmaxcnt;
3381 }
3382
3383 if (readblockcnt) {
3384 temp = readblockcnt * blksize;
3385 temp = (temp > UINT32_MAX) ? UINT32_MAX : temp;
3386
3387 if (temp < mp->mnt_maxreadcnt) {
3388 mp->mnt_maxreadcnt = (u_int32_t)temp;
3389 }
3390 }
3391
3392 if (writemaxcnt) {
3393 mp->mnt_maxwritecnt = (writemaxcnt > UINT32_MAX) ? UINT32_MAX : writemaxcnt;
3394 }
3395
3396 if (writeblockcnt) {
3397 temp = writeblockcnt * blksize;
3398 temp = (temp > UINT32_MAX) ? UINT32_MAX : temp;
3399
3400 if (temp < mp->mnt_maxwritecnt) {
3401 mp->mnt_maxwritecnt = (u_int32_t)temp;
3402 }
3403 }
3404
3405 if (readsegcnt) {
3406 temp = (readsegcnt > UINT16_MAX) ? UINT16_MAX : readsegcnt;
3407 } else {
3408 temp = mp->mnt_maxreadcnt / PAGE_SIZE;
3409
3410 if (temp > UINT16_MAX) {
3411 temp = UINT16_MAX;
3412 }
3413 }
3414 mp->mnt_segreadcnt = (u_int16_t)temp;
3415
3416 if (writesegcnt) {
3417 temp = (writesegcnt > UINT16_MAX) ? UINT16_MAX : writesegcnt;
3418 } else {
3419 temp = mp->mnt_maxwritecnt / PAGE_SIZE;
3420
3421 if (temp > UINT16_MAX) {
3422 temp = UINT16_MAX;
3423 }
3424 }
3425 mp->mnt_segwritecnt = (u_int16_t)temp;
3426
3427 if (readsegsize) {
3428 temp = (readsegsize > UINT32_MAX) ? UINT32_MAX : readsegsize;
3429 } else {
3430 temp = mp->mnt_maxreadcnt;
3431 }
3432 mp->mnt_maxsegreadsize = (u_int32_t)temp;
3433
3434 if (writesegsize) {
3435 temp = (writesegsize > UINT32_MAX) ? UINT32_MAX : writesegsize;
3436 } else {
3437 temp = mp->mnt_maxwritecnt;
3438 }
3439 mp->mnt_maxsegwritesize = (u_int32_t)temp;
3440
3441 if (alignment) {
3442 temp = (alignment > PAGE_SIZE) ? PAGE_MASK : alignment - 1;
3443 } else {
3444 temp = 0;
3445 }
3446 mp->mnt_alignmentmask = temp;
3447
3448
3449 if (ioqueue_depth > MNT_DEFAULT_IOQUEUE_DEPTH) {
3450 temp = ioqueue_depth;
3451 } else {
3452 temp = MNT_DEFAULT_IOQUEUE_DEPTH;
3453 }
3454
3455 mp->mnt_ioqueue_depth = temp;
3456 mp->mnt_ioscale = MNT_IOSCALE(mp->mnt_ioqueue_depth);
3457
3458 if (mp->mnt_ioscale > 1) {
3459 printf("ioqueue_depth = %d, ioscale = %d\n", (int)mp->mnt_ioqueue_depth, (int)mp->mnt_ioscale);
3460 }
3461
3462 if (features & DK_FEATURE_FORCE_UNIT_ACCESS) {
3463 mp->mnt_ioflags |= MNT_IOFLAGS_FUA_SUPPORTED;
3464 }
3465
3466 if (VNOP_IOCTL(devvp, DKIOCGETIOMINSATURATIONBYTECOUNT, (caddr_t)&minsaturationbytecount, 0, ctx) == 0) {
3467 mp->mnt_minsaturationbytecount = minsaturationbytecount;
3468 } else {
3469 mp->mnt_minsaturationbytecount = 0;
3470 }
3471
3472 if (VNOP_IOCTL(devvp, DKIOCCORESTORAGE, (caddr_t)&cs_info, 0, ctx) == 0) {
3473 cs_present = TRUE;
3474 }
3475
3476 if (features & DK_FEATURE_UNMAP) {
3477 mp->mnt_ioflags |= MNT_IOFLAGS_UNMAP_SUPPORTED;
3478
3479 if (cs_present == TRUE) {
3480 mp->mnt_ioflags |= MNT_IOFLAGS_CSUNMAP_SUPPORTED;
3481 }
3482 }
3483 if (cs_present == TRUE) {
3484 /*
3485 * for now we'll use the following test as a proxy for
3486 * the underlying drive being FUSION in nature
3487 */
3488 if ((cs_info.flags & DK_CORESTORAGE_PIN_YOUR_METADATA)) {
3489 mp->mnt_ioflags |= MNT_IOFLAGS_FUSION_DRIVE;
3490 }
3491 } else {
3492 /* Check for APFS Fusion */
3493 dk_apfs_flavour_t flavour;
3494 if ((VNOP_IOCTL(devvp, DKIOCGETAPFSFLAVOUR, (caddr_t)&flavour, 0, ctx) == 0) &&
3495 (flavour == DK_APFS_FUSION)) {
3496 mp->mnt_ioflags |= MNT_IOFLAGS_FUSION_DRIVE;
3497 }
3498 }
3499
3500 #if CONFIG_IOSCHED
3501 if (iosched_enabled && (features & DK_FEATURE_PRIORITY)) {
3502 mp->mnt_ioflags |= MNT_IOFLAGS_IOSCHED_SUPPORTED;
3503 throttle_info_disable_throttle(mp->mnt_devbsdunit, (mp->mnt_ioflags & MNT_IOFLAGS_FUSION_DRIVE) != 0);
3504 }
3505 #endif /* CONFIG_IOSCHED */
3506 return error;
3507 }
3508
3509 static struct klist fs_klist;
3510 lck_grp_t *fs_klist_lck_grp;
3511 lck_mtx_t *fs_klist_lock;
3512
3513 void
3514 vfs_event_init(void)
3515 {
3516 klist_init(&fs_klist);
3517 fs_klist_lck_grp = lck_grp_alloc_init("fs_klist", NULL);
3518 fs_klist_lock = lck_mtx_alloc_init(fs_klist_lck_grp, NULL);
3519 }
3520
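/*
 * Post a VFS kevent to interested listeners; for VQ_DEAD/VQ_NOTRESP
 * also update the mount's "not responding" state.
 */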
3521 void
3522 vfs_event_signal(fsid_t *fsid, u_int32_t event, intptr_t data)
3523 {
3524 if (event == VQ_DEAD || event == VQ_NOTRESP) {
3525 struct mount *mp = vfs_getvfs(fsid);
3526 if (mp) {
3527 mount_lock_spin(mp);
3528 if (data) {
3529 mp->mnt_kern_flag &= ~MNT_LNOTRESP; // Now responding
3530 } else {
3531 mp->mnt_kern_flag |= MNT_LNOTRESP; // Not responding
3532 }
3533 mount_unlock(mp);
3534 }
3535 }
3536
3537 lck_mtx_lock(fs_klist_lock);
3538 KNOTE(&fs_klist, event);
3539 lck_mtx_unlock(fs_klist_lock);
3540 }
3541
3542 /*
3543 * return the number of mounted filesystems.
3544 */
3545 static int
3546 sysctl_vfs_getvfscnt(void)
3547 {
3548 return mount_getvfscnt();
3549 }
3550
3551
3552 static int
3553 mount_getvfscnt(void)
3554 {
3555 int ret;
3556
3557 mount_list_lock();
3558 ret = nummounts;
3559 mount_list_unlock();
3560 return ret;
3561 }
3562
3563
3564
3565 static int
3566 mount_fillfsids(fsid_t *fsidlst, int count)
3567 {
3568 struct mount *mp;
3569 int actual = 0;
3570
3572 mount_list_lock();
3573 TAILQ_FOREACH(mp, &mountlist, mnt_list) {
3574 if (actual < count) { /* stay within the caller-supplied array */
3575 fsidlst[actual] = mp->mnt_vfsstat.f_fsid;
3576 actual++;
3577 }
3578 }
3579 mount_list_unlock();
3580 return actual;
3581 }
3582
3583 /*
3584 * Fill in the array of fsid_t's up to a max of 'count'; the actual
3585 * number filled in will be set in '*actual'. If there are more fsid_t's
3586 * than room in fsidlst then ENOMEM will be returned and '*actual' will
3587 * have the actual count.
3588 * Callers depend on *actual being filled out even in the error case.
3589 */
3590 static int
3591 sysctl_vfs_getvfslist(fsid_t *fsidlst, int count, int *actual)
3592 {
3593 struct mount *mp;
3594
3595 *actual = 0;
3596 mount_list_lock();
3597 TAILQ_FOREACH(mp, &mountlist, mnt_list) {
3598 (*actual)++;
3599 if (*actual <= count) {
3600 fsidlst[(*actual) - 1] = mp->mnt_vfsstat.f_fsid;
3601 }
3602 }
3603 mount_list_unlock();
3604 return *actual <= count ? 0 : ENOMEM;
3605 }
3606
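/*
 * Handler for the vfs.generic.vfsidlist sysctl: copy out one fsid_t per
 * mounted filesystem, retrying if a mount is added while we allocate.
 */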
3607 static int
3608 sysctl_vfs_vfslist(__unused struct sysctl_oid *oidp, __unused void *arg1,
3609 __unused int arg2, struct sysctl_req *req)
3610 {
3611 int actual, error;
3612 size_t space;
3613 fsid_t *fsidlst;
3614
3615 /* This is a readonly node. */
3616 if (req->newptr != USER_ADDR_NULL) {
3617 return EPERM;
3618 }
3619
3620 /* they are querying us so just return the space required. */
3621 if (req->oldptr == USER_ADDR_NULL) {
3622 req->oldidx = sysctl_vfs_getvfscnt() * sizeof(fsid_t);
3623 return 0;
3624 }
3625 again:
3626 /*
3627 * Retrieve an accurate count of the amount of space required to copy
3628 * out all the fsids in the system.
3629 */
3630 space = req->oldlen;
3631 req->oldlen = sysctl_vfs_getvfscnt() * sizeof(fsid_t);
3632
3633 /* they didn't give us enough space. */
3634 if (space < req->oldlen) {
3635 return ENOMEM;
3636 }
3637
3638 MALLOC(fsidlst, fsid_t *, req->oldlen, M_TEMP, M_WAITOK | M_ZERO);
3639 if (fsidlst == NULL) {
3640 return ENOMEM;
3641 }
3642
3643 error = sysctl_vfs_getvfslist(fsidlst, req->oldlen / sizeof(fsid_t),
3644 &actual);
3645 /*
3646 * If we get back ENOMEM, then another mount has been added while we
3647 * slept in malloc above. If this is the case then try again.
3648 */
3649 if (error == ENOMEM) {
3650 FREE(fsidlst, M_TEMP);
3651 req->oldlen = space;
3652 goto again;
3653 }
3654 if (error == 0) {
3655 error = SYSCTL_OUT(req, fsidlst, actual * sizeof(fsid_t));
3656 }
3657 FREE(fsidlst, M_TEMP);
3658 return error;
3659 }
3660
3661 /*
3662 * Do a sysctl by fsid.
3663 */
3664 static int
3665 sysctl_vfs_ctlbyfsid(__unused struct sysctl_oid *oidp, void *arg1, int arg2,
3666 struct sysctl_req *req)
3667 {
3668 union union_vfsidctl vc;
3669 struct mount *mp;
3670 struct vfsstatfs *sp;
3671 int *name, flags, namelen;
3672 int error = 0, gotref = 0;
3673 vfs_context_t ctx = vfs_context_current();
3674 proc_t p = req->p; /* XXX req->p != current_proc()? */
3675 boolean_t is_64_bit;
3676
3677 name = arg1;
3678 namelen = arg2;
3679 is_64_bit = proc_is64bit(p);
3680
3681 error = SYSCTL_IN(req, &vc, is_64_bit? sizeof(vc.vc64):sizeof(vc.vc32));
3682 if (error) {
3683 goto out;
3684 }
3685 if (vc.vc32.vc_vers != VFS_CTL_VERS1) { /* works for 32 and 64 */
3686 error = EINVAL;
3687 goto out;
3688 }
3689 mp = mount_list_lookupby_fsid(&vc.vc32.vc_fsid, 0, 1); /* works for 32 and 64 */
3690 if (mp == NULL) {
3691 error = ENOENT;
3692 goto out;
3693 }
3694 gotref = 1;
3695 /* reset so that the fs specific code can fetch it. */
3696 req->newidx = 0;
3697 /*
3698 * Note if this is a VFS_CTL then we pass the actual sysctl req
3699 * in for "oldp" so that the lower layer can DTRT and use the
3700 * SYSCTL_IN/OUT routines.
3701 */
3702 if (mp->mnt_op->vfs_sysctl != NULL) {
3703 if (is_64_bit) {
3704 if (vfs_64bitready(mp)) {
3705 error = mp->mnt_op->vfs_sysctl(name, namelen,
3706 CAST_USER_ADDR_T(req),
3707 NULL, USER_ADDR_NULL, 0,
3708 ctx);
3709 } else {
3710 error = ENOTSUP;
3711 }
3712 } else {
3713 error = mp->mnt_op->vfs_sysctl(name, namelen,
3714 CAST_USER_ADDR_T(req),
3715 NULL, USER_ADDR_NULL, 0,
3716 ctx);
3717 }
3718 if (error != ENOTSUP) {
3719 goto out;
3720 }
3721 }
3722 switch (name[0]) {
3723 case VFS_CTL_UMOUNT:
3724 req->newidx = 0;
3725 if (is_64_bit) {
3726 req->newptr = vc.vc64.vc_ptr;
3727 req->newlen = (size_t)vc.vc64.vc_len;
3728 } else {
3729 req->newptr = CAST_USER_ADDR_T(vc.vc32.vc_ptr);
3730 req->newlen = vc.vc32.vc_len;
3731 }
3732 error = SYSCTL_IN(req, &flags, sizeof(flags));
3733 if (error) {
3734 break;
3735 }
3736
3737 mount_ref(mp, 0);
3738 mount_iterdrop(mp);
3739 gotref = 0;
3740 /* safedounmount consumes a ref */
3741 error = safedounmount(mp, flags, ctx);
3742 break;
3743 case VFS_CTL_STATFS:
3744 req->newidx = 0;
3745 if (is_64_bit) {
3746 req->newptr = vc.vc64.vc_ptr;
3747 req->newlen = (size_t)vc.vc64.vc_len;
3748 } else {
3749 req->newptr = CAST_USER_ADDR_T(vc.vc32.vc_ptr);
3750 req->newlen = vc.vc32.vc_len;
3751 }
3752 error = SYSCTL_IN(req, &flags, sizeof(flags));
3753 if (error) {
3754 break;
3755 }
3756 sp = &mp->mnt_vfsstat;
3757 if (((flags & MNT_NOWAIT) == 0 || (flags & (MNT_WAIT | MNT_DWAIT))) &&
3758 (error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT))) {
3759 goto out;
3760 }
3761 if (is_64_bit) {
3762 struct user64_statfs sfs;
3763 bzero(&sfs, sizeof(sfs));
3764 sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
3765 sfs.f_type = mp->mnt_vtable->vfc_typenum;
3766 sfs.f_bsize = (user64_long_t)sp->f_bsize;
3767 sfs.f_iosize = (user64_long_t)sp->f_iosize;
3768 sfs.f_blocks = (user64_long_t)sp->f_blocks;
3769 sfs.f_bfree = (user64_long_t)sp->f_bfree;
3770 sfs.f_bavail = (user64_long_t)sp->f_bavail;
3771 sfs.f_files = (user64_long_t)sp->f_files;
3772 sfs.f_ffree = (user64_long_t)sp->f_ffree;
3773 sfs.f_fsid = sp->f_fsid;
3774 sfs.f_owner = sp->f_owner;
3775 #ifdef NFSCLIENT
3776 if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
3777 strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSNAMELEN);
3778 } else
3779 #endif
3780 {
3781 strlcpy(sfs.f_fstypename, sp->f_fstypename, MFSNAMELEN);
3782 }
3783 strlcpy(sfs.f_mntonname, sp->f_mntonname, MNAMELEN);
3784 strlcpy(sfs.f_mntfromname, sp->f_mntfromname, MNAMELEN);
3785
3786 error = SYSCTL_OUT(req, &sfs, sizeof(sfs));
3787 } else {
3788 struct user32_statfs sfs;
3789 bzero(&sfs, sizeof(sfs));
3790 sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
3791 sfs.f_type = mp->mnt_vtable->vfc_typenum;
3792
3793 /*
3794 * It's possible for there to be more than 2^31 blocks in the filesystem, so we
3795 * have to fudge the numbers here in that case. We inflate the blocksize in order
3796 * to reflect the filesystem size as best we can.
3797 */
3798 if (sp->f_blocks > INT_MAX) {
3799 int shift;
3800
3801 /*
3802 * Work out how far we have to shift the block count down to make it fit.
3803 * Note that it's possible to have to shift so far that the resulting
3804 * blocksize would be unreportably large. At that point, we will clip
3805 * any values that don't fit.
3806 *
3807 * For safety's sake, we also ensure that f_iosize is never reported as
3808 * being smaller than f_bsize.
3809 */
3810 for (shift = 0; shift < 32; shift++) {
3811 if ((sp->f_blocks >> shift) <= INT_MAX) {
3812 break;
3813 }
3814 if ((((long long)sp->f_bsize) << (shift + 1)) > INT_MAX) {
3815 break;
3816 }
3817 }
3818 #define __SHIFT_OR_CLIP(x, s) ((((x) >> (s)) > INT_MAX) ? INT_MAX : ((x) >> (s)))
3819 sfs.f_blocks = (user32_long_t)__SHIFT_OR_CLIP(sp->f_blocks, shift);
3820 sfs.f_bfree = (user32_long_t)__SHIFT_OR_CLIP(sp->f_bfree, shift);
3821 sfs.f_bavail = (user32_long_t)__SHIFT_OR_CLIP(sp->f_bavail, shift);
3822 #undef __SHIFT_OR_CLIP
3823 sfs.f_bsize = (user32_long_t)(sp->f_bsize << shift);
3824 sfs.f_iosize = lmax(sp->f_iosize, sp->f_bsize);
3825 } else {
3826 sfs.f_bsize = (user32_long_t)sp->f_bsize;
3827 sfs.f_iosize = (user32_long_t)sp->f_iosize;
3828 sfs.f_blocks = (user32_long_t)sp->f_blocks;
3829 sfs.f_bfree = (user32_long_t)sp->f_bfree;
3830 sfs.f_bavail = (user32_long_t)sp->f_bavail;
3831 }
3832 sfs.f_files = (user32_long_t)sp->f_files;
3833 sfs.f_ffree = (user32_long_t)sp->f_ffree;
3834 sfs.f_fsid = sp->f_fsid;
3835 sfs.f_owner = sp->f_owner;
3836
3837 #ifdef NFSCLIENT
3838 if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
3839 strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSNAMELEN);
3840 } else
3841 #endif
3842 {
3843 strlcpy(sfs.f_fstypename, sp->f_fstypename, MFSNAMELEN);
3844 }
3845 strlcpy(sfs.f_mntonname, sp->f_mntonname, MNAMELEN);
3846 strlcpy(sfs.f_mntfromname, sp->f_mntfromname, MNAMELEN);
3847
3848 error = SYSCTL_OUT(req, &sfs, sizeof(sfs));
3849 }
3850 break;
3851 default:
3852 error = ENOTSUP;
3853 goto out;
3854 }
3855 out:
3856 if (gotref != 0) {
3857 mount_iterdrop(mp);
3858 }
3859 return error;
3860 }
3861
3862 static int filt_fsattach(struct knote *kn, struct kevent_internal_s *kev);
3863 static void filt_fsdetach(struct knote *kn);
3864 static int filt_fsevent(struct knote *kn, long hint);
3865 static int filt_fstouch(struct knote *kn, struct kevent_internal_s *kev);
3866 static int filt_fsprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev);
3867 SECURITY_READ_ONLY_EARLY(struct filterops) fs_filtops = {
3868 .f_attach = filt_fsattach,
3869 .f_detach = filt_fsdetach,
3870 .f_event = filt_fsevent,
3871 .f_touch = filt_fstouch,
3872 .f_process = filt_fsprocess,
3873 };
3874
3875 static int
3876 filt_fsattach(struct knote *kn, __unused struct kevent_internal_s *kev)
3877 {
3878 lck_mtx_lock(fs_klist_lock);
3879 KNOTE_ATTACH(&fs_klist, kn);
3880 lck_mtx_unlock(fs_klist_lock);
3881
3882 /*
3883 * filter only sees future events,
3884 * so it can't be fired already.
3885 */
3886 return 0;
3887 }
3888
3889 static void
3890 filt_fsdetach(struct knote *kn)
3891 {
3892 lck_mtx_lock(fs_klist_lock);
3893 KNOTE_DETACH(&fs_klist, kn);
3894 lck_mtx_unlock(fs_klist_lock);
3895 }
3896
3897 static int
3898 filt_fsevent(struct knote *kn, long hint)
3899 {
3900 /*
3901 * Backwards compatibility:
3902 * Other filters would do nothing if kn->kn_sfflags == 0
3903 */
3904
3905 if ((kn->kn_sfflags == 0) || (kn->kn_sfflags & hint)) {
3906 kn->kn_fflags |= hint;
3907 }
3908
3909 return kn->kn_fflags != 0;
3910 }
3911
3912 static int
3913 filt_fstouch(struct knote *kn, struct kevent_internal_s *kev)
3914 {
3915 int res;
3916
3917 lck_mtx_lock(fs_klist_lock);
3918
3919 kn->kn_sfflags = kev->fflags;
3920
3921 /*
3922 * The filter function above sets bits even if nobody is looking for them.
3923 * Just preserve those bits even if the new mask is more selective
3924 * than before.
3925 *
3926 * For compatibility with previous implementations, we leave kn_fflags
3927 * as it was before.
3928 */
3929 //if (kn->kn_sfflags)
3930 // kn->kn_fflags &= kn->kn_sfflags;
3931 res = (kn->kn_fflags != 0);
3932
3933 lck_mtx_unlock(fs_klist_lock);
3934
3935 return res;
3936 }
3937
3938 static int
3939 filt_fsprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev)
3940 {
3941 #pragma unused(data)
3942 int res;
3943
3944 lck_mtx_lock(fs_klist_lock);
3945 res = (kn->kn_fflags != 0);
3946 if (res) {
3947 *kev = kn->kn_kevent;
3948 kn->kn_flags |= EV_CLEAR; /* automatic */
3949 kn->kn_fflags = 0;
3950 kn->kn_data = 0;
3951 }
3952 lck_mtx_unlock(fs_klist_lock);
3953 return res;
3954 }
3955
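/*
 * Get or set P_NOREMOTEHANG on the process named by the pid passed in:
 * a negative pid clears the flag, a positive pid sets it.
 */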
3956 static int
3957 sysctl_vfs_noremotehang(__unused struct sysctl_oid *oidp,
3958 __unused void *arg1, __unused int arg2, struct sysctl_req *req)
3959 {
3960 int out, error;
3961 pid_t pid;
3962 proc_t p;
3963
3964 /* We need a pid. */
3965 if (req->newptr == USER_ADDR_NULL) {
3966 return EINVAL;
3967 }
3968
3969 error = SYSCTL_IN(req, &pid, sizeof(pid));
3970 if (error) {
3971 return error;
3972 }
3973
3974 p = proc_find(pid < 0 ? -pid : pid);
3975 if (p == NULL) {
3976 return ESRCH;
3977 }
3978
3979 /*
3980 * Fetching the value is ok, but we only fetch if the old
3981 * pointer is given.
3982 */
3983 if (req->oldptr != USER_ADDR_NULL) {
3984 out = !((p->p_flag & P_NOREMOTEHANG) == 0);
3985 proc_rele(p);
3986 error = SYSCTL_OUT(req, &out, sizeof(out));
3987 return error;
3988 }
3989
3990 /* cansignal offers us enough security. */
3991 if (p != req->p && proc_suser(req->p) != 0) {
3992 proc_rele(p);
3993 return EPERM;
3994 }
3995
3996 if (pid < 0) {
3997 OSBitAndAtomic(~((uint32_t)P_NOREMOTEHANG), &p->p_flag);
3998 } else {
3999 OSBitOrAtomic(P_NOREMOTEHANG, &p->p_flag);
4000 }
4001 proc_rele(p);
4002
4003 return 0;
4004 }
4005
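/*
 * Handler for the vfs.generic.conf sysctl: return the vfsconf for the
 * filesystem type number given in name[0].
 */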
4006 static int
4007 sysctl_vfs_generic_conf SYSCTL_HANDLER_ARGS
4008 {
4009 int *name, namelen;
4010 struct vfstable *vfsp;
4011 struct vfsconf vfsc = {};
4012
4013 (void)oidp;
4014 name = arg1;
4015 namelen = arg2;
4016
4017 if (namelen < 1) {
4018 return EISDIR;
4019 } else if (namelen > 1) {
4020 return ENOTDIR;
4021 }
4022
4023 mount_list_lock();
4024 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
4025 if (vfsp->vfc_typenum == name[0]) {
4026 break;
4027 }
4028 }
4029
4030 if (vfsp == NULL) {
4031 mount_list_unlock();
4032 return ENOTSUP;
4033 }
4034
4035 vfsc.vfc_reserved1 = 0;
4036 bcopy(vfsp->vfc_name, vfsc.vfc_name, sizeof(vfsc.vfc_name));
4037 vfsc.vfc_typenum = vfsp->vfc_typenum;
4038 vfsc.vfc_refcount = vfsp->vfc_refcount;
4039 vfsc.vfc_flags = vfsp->vfc_flags;
4040 vfsc.vfc_reserved2 = 0;
4041 vfsc.vfc_reserved3 = 0;
4042
4043 mount_list_unlock();
4044 return SYSCTL_OUT(req, &vfsc, sizeof(struct vfsconf));
4045 }
4046
4047 /* the vfs.generic. branch. */
4048 SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RW | CTLFLAG_LOCKED, NULL, "vfs generic hinge");
4049 /* retrieve a list of mounted filesystem fsid_t */
4050 SYSCTL_PROC(_vfs_generic, OID_AUTO, vfsidlist,
4051 CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
4052 NULL, 0, sysctl_vfs_vfslist, "S,fsid", "List of mounted filesystem ids");
4053 /* perform operations on filesystem via fsid_t */
4054 SYSCTL_NODE(_vfs_generic, OID_AUTO, ctlbyfsid, CTLFLAG_RW | CTLFLAG_LOCKED,
4055 sysctl_vfs_ctlbyfsid, "ctlbyfsid");
4056 SYSCTL_PROC(_vfs_generic, OID_AUTO, noremotehang, CTLFLAG_RW | CTLFLAG_ANYBODY,
4057 NULL, 0, sysctl_vfs_noremotehang, "I", "noremotehang");
4058 SYSCTL_INT(_vfs_generic, VFS_MAXTYPENUM, maxtypenum,
4059 CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED,
4060 &maxvfstypenum, 0, "");
4061 SYSCTL_INT(_vfs_generic, OID_AUTO, sync_timeout, CTLFLAG_RW | CTLFLAG_LOCKED, &sync_timeout_seconds, 0, "");
4062 SYSCTL_NODE(_vfs_generic, VFS_CONF, conf,
4063 CTLFLAG_RD | CTLFLAG_LOCKED,
4064 sysctl_vfs_generic_conf, "");
4065
4066 /* Indicate that the root file system unmounted cleanly */
4067 static int vfs_root_unmounted_cleanly = 0;
4068 SYSCTL_INT(_vfs_generic, OID_AUTO, root_unmounted_cleanly, CTLFLAG_RD, &vfs_root_unmounted_cleanly, 0, "Root filesystem was unmounted cleanly");
4069
4070 void
4071 vfs_set_root_unmounted_cleanly(void)
4072 {
4073 vfs_root_unmounted_cleanly = 1;
4074 }
4075
4076 /*
4077 * Print vnode state.
4078 */
4079 void
4080 vn_print_state(struct vnode *vp, const char *fmt, ...)
4081 {
4082 va_list ap;
4083 char perm_str[] = "(VM_KERNEL_ADDRPERM pointer)";
4084 char fs_name[MFSNAMELEN];
4085
4086 va_start(ap, fmt);
4087 vprintf(fmt, ap);
4088 va_end(ap);
4089 printf("vp 0x%0llx %s: ", (uint64_t)VM_KERNEL_ADDRPERM(vp), perm_str);
4090 printf("tag %d, type %d\n", vp->v_tag, vp->v_type);
4091 /* Counts .. */
4092 printf(" iocount %d, usecount %d, kusecount %d references %d\n",
4093 vp->v_iocount, vp->v_usecount, vp->v_kusecount, vp->v_references);
4094 printf(" writecount %d, numoutput %d\n", vp->v_writecount,
4095 vp->v_numoutput);
4096 /* Flags */
4097 printf(" flag 0x%x, lflag 0x%x, listflag 0x%x\n", vp->v_flag,
4098 vp->v_lflag, vp->v_listflag);
4099
4100 if (vp->v_mount == NULL || vp->v_mount == dead_mountp) {
4101 strlcpy(fs_name, "deadfs", MFSNAMELEN);
4102 } else {
4103 vfs_name(vp->v_mount, fs_name);
4104 }
4105
4106 printf(" v_data 0x%0llx %s\n",
4107 (vp->v_data ? (uint64_t)VM_KERNEL_ADDRPERM(vp->v_data) : 0),
4108 perm_str);
4109 printf(" v_mount 0x%0llx %s vfs_name %s\n",
4110 (vp->v_mount ? (uint64_t)VM_KERNEL_ADDRPERM(vp->v_mount) : 0),
4111 perm_str, fs_name);
4112 }
4113
4114 long num_reusedvnodes = 0;
4115
4116
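/*
 * Attempt to claim 'vp' (just removed from a free list) for reuse.
 * Returns the reclaimed vnode, or NULLVP if we lost the race, pushed
 * the work to the async list (*deferred set), or want_vp was 0.
 */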
4117 static vnode_t
4118 process_vp(vnode_t vp, int want_vp, int *deferred)
4119 {
4120 unsigned int vpid;
4121
4122 *deferred = 0;
4123
4124 vpid = vp->v_id;
4125
4126 vnode_list_remove_locked(vp);
4127
4128 vnode_list_unlock();
4129
4130 vnode_lock_spin(vp);
4131
4132 /*
4133 * We may block waiting for the vnode_lock after removing the vp from the
4134 * freelist, and the vid is bumped only at the very end of reclaim. So it is
4135 * possible that we are looking at a vnode that is being terminated. If so, skip it.
4136 */
4137 if ((vpid != vp->v_id) || (vp->v_usecount != 0) || (vp->v_iocount != 0) ||
4138 VONLIST(vp) || (vp->v_lflag & VL_TERMINATE)) {
4139 /*
4140 * we lost the race between dropping the list lock
4141 * and picking up the vnode_lock... someone else
4142 * used this vnode and it is now in a new state
4143 */
4144 vnode_unlock(vp);
4145
4146 return NULLVP;
4147 }
4148 if ((vp->v_lflag & (VL_NEEDINACTIVE | VL_MARKTERM)) == VL_NEEDINACTIVE) {
4149 /*
4150 * we did a vnode_rele_ext that asked for
4151 * us not to reenter the filesystem during
4152 * the release even though VL_NEEDINACTIVE was
4153 * set... we'll do it here by doing a
4154 * vnode_get/vnode_put
4155 *
4156 * pick up an iocount so that we can call
4157 * vnode_put and drive the VNOP_INACTIVE...
4158 * vnode_put will either leave us off
4159 * the freelist if a new ref comes in,
4160 * or put us back on the end of the freelist
4161 * or recycle us if we were marked for termination...
4162 * so we'll just go grab a new candidate
4163 */
4164 vp->v_iocount++;
4165 #ifdef JOE_DEBUG
4166 record_vp(vp, 1);
4167 #endif
4168 vnode_put_locked(vp);
4169 vnode_unlock(vp);
4170
4171 return NULLVP;
4172 }
4173 /*
4174 * Checks for anyone racing us for recycle
4175 */
4176 if (vp->v_type != VBAD) {
4177 if (want_vp && (vnode_on_reliable_media(vp) == FALSE || (vp->v_flag & VISDIRTY))) {
4178 vnode_async_list_add(vp);
4179 vnode_unlock(vp);
4180
4181 *deferred = 1;
4182
4183 return NULLVP;
4184 }
4185 if (vp->v_lflag & VL_DEAD) {
4186 panic("new_vnode(%p): the vnode is VL_DEAD but not VBAD", vp);
4187 }
4188
4189 vnode_lock_convert(vp);
4190 (void)vnode_reclaim_internal(vp, 1, want_vp, 0);
4191
4192 if (want_vp) {
4193 if ((VONLIST(vp))) {
4194 panic("new_vnode(%p): vp on list", vp);
4195 }
4196 if (vp->v_usecount || vp->v_iocount || vp->v_kusecount ||
4197 (vp->v_lflag & (VNAMED_UBC | VNAMED_MOUNT | VNAMED_FSHASH))) {
4198 panic("new_vnode(%p): free vnode still referenced", vp);
4199 }
4200 if ((vp->v_mntvnodes.tqe_prev != 0) && (vp->v_mntvnodes.tqe_next != 0)) {
4201 panic("new_vnode(%p): vnode seems to be on mount list", vp);
4202 }
4203 if (!LIST_EMPTY(&vp->v_nclinks) || !TAILQ_EMPTY(&vp->v_ncchildren)) {
4204 panic("new_vnode(%p): vnode still hooked into the name cache", vp);
4205 }
4206 } else {
4207 vnode_unlock(vp);
4208 vp = NULLVP;
4209 }
4210 }
4211 return vp;
4212 }
4213
4214 __attribute__((noreturn))
4215 static void
4216 async_work_continue(void)
4217 {
4218 struct async_work_lst *q;
4219 int deferred;
4220 vnode_t vp;
4221
4222 q = &vnode_async_work_list;
4223
4224 for (;;) {
4225 vnode_list_lock();
4226
4227 if (TAILQ_EMPTY(q)) {
4228 assert_wait(q, (THREAD_UNINT));
4229
4230 vnode_list_unlock();
4231
4232 thread_block((thread_continue_t)async_work_continue);
4233
4234 continue;
4235 }
4236 async_work_handled++;
4237
4238 vp = TAILQ_FIRST(q);
4239
4240 vp = process_vp(vp, 0, &deferred);
4241
4242 if (vp != NULLVP) {
4243 panic("found VBAD vp (%p) on async queue", vp);
4244 }
4245 }
4246 }
4247
4248
4249 static int
4250 new_vnode(vnode_t *vpp)
4251 {
4252 vnode_t vp;
4253 uint32_t retries = 0, max_retries = 100; /* retry in case of tablefull */
4254 int force_alloc = 0, walk_count = 0;
4255 boolean_t need_reliable_vp = FALSE;
4256 int deferred;
4257 struct timeval initial_tv;
4258 struct timeval current_tv;
4259 proc_t curproc = current_proc();
4260
4261 initial_tv.tv_sec = 0;
4262 retry:
4263 vp = NULLVP;
4264
4265 vnode_list_lock();
4266
4267 if (need_reliable_vp == TRUE) {
4268 async_work_timed_out++;
4269 }
4270
4271 if ((numvnodes - deadvnodes) < desiredvnodes || force_alloc) {
4272 struct timespec ts;
4273
4274 if (!TAILQ_EMPTY(&vnode_dead_list)) {
4275 /*
4276 * Can always reuse a dead one
4277 */
4278 vp = TAILQ_FIRST(&vnode_dead_list);
4279 goto steal_this_vp;
4280 }
4281 /*
4282 * no dead vnodes available... if we're under
4283 * the limit, we'll create a new vnode
4284 */
4285 numvnodes++;
4286 vnode_list_unlock();
4287
4288 MALLOC_ZONE(vp, struct vnode *, sizeof(*vp), M_VNODE, M_WAITOK);
4289 bzero((char *)vp, sizeof(*vp));
4290 VLISTNONE(vp); /* avoid double queue removal */
4291 lck_mtx_init(&vp->v_lock, vnode_lck_grp, vnode_lck_attr);
4292
4293 TAILQ_INIT(&vp->v_ncchildren);
4294
4295 klist_init(&vp->v_knotes);
4296 nanouptime(&ts);
4297 vp->v_id = ts.tv_nsec;
4298 vp->v_flag = VSTANDARD;
4299
4300 #if CONFIG_MACF
4301 if (mac_vnode_label_init_needed(vp)) {
4302 mac_vnode_label_init(vp);
4303 }
4304 #endif /* MAC */
4305
4306 vp->v_iocount = 1;
4307 goto done;
4308 }
4309 microuptime(&current_tv);
4310
4311 #define MAX_WALK_COUNT 1000
4312
4313 if (!TAILQ_EMPTY(&vnode_rage_list) &&
4314 (ragevnodes >= rage_limit ||
4315 (current_tv.tv_sec - rage_tv.tv_sec) >= RAGE_TIME_LIMIT)) {
4316 TAILQ_FOREACH(vp, &vnode_rage_list, v_freelist) {
4317 if (!(vp->v_listflag & VLIST_RAGE)) {
4318 panic("new_vnode: vp (%p) on RAGE list not marked VLIST_RAGE", vp);
4319 }
4320
4321 // if we're a dependency-capable process, skip vnodes that can
4322 // cause recycling deadlocks. (i.e. this process is diskimages
4323 // helper and the vnode is in a disk image). Querying the
4324 // mnt_kern_flag for the mount's virtual device status
4325 // is safer than checking the mnt_dependent_process, which
4326 // may not be updated if there are multiple devnode layers
4327 // in between the disk image and the final consumer.
4328
4329 if ((curproc->p_flag & P_DEPENDENCY_CAPABLE) == 0 || vp->v_mount == NULL ||
4330 (vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV) == 0) {
4331 /*
4332 * if need_reliable_vp == TRUE, then we've already sent one or more
4333 * non-reliable vnodes to the async thread for processing and timed
4334 * out waiting for a dead vnode to show up. Use the MAX_WALK_COUNT
4335 * mechanism to first scan for a reliable vnode before forcing
4336 * a new vnode to be created
4337 */
4338 if (need_reliable_vp == FALSE || vnode_on_reliable_media(vp) == TRUE) {
4339 break;
4340 }
4341 }
4342
4343 // don't iterate more than MAX_WALK_COUNT vnodes to
4344 // avoid keeping the vnode list lock held for too long.
4345
4346 if (walk_count++ > MAX_WALK_COUNT) {
4347 vp = NULL;
4348 break;
4349 }
4350 }
4351 }
4352
4353 if (vp == NULL && !TAILQ_EMPTY(&vnode_free_list)) {
4354 /*
4355 * Pick the first vp for possible reuse
4356 */
4357 walk_count = 0;
4358 TAILQ_FOREACH(vp, &vnode_free_list, v_freelist) {
4359 // if we're a dependency-capable process, skip vnodes that can
4360 // cause recycling deadlocks. (i.e. this process is diskimages
4361 // helper and the vnode is in a disk image). Querying the
4362 // mnt_kern_flag for the mount's virtual device status
4363 // is safer than checking the mnt_dependent_process, which
4364 // may not be updated if there are multiple devnode layers
4365 // in between the disk image and the final consumer.
4366
4367 if ((curproc->p_flag & P_DEPENDENCY_CAPABLE) == 0 || vp->v_mount == NULL ||
4368 (vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV) == 0) {
4369 /*
4370 * if need_reliable_vp == TRUE, then we've already sent one or more
4371 * non-reliable vnodes to the async thread for processing and timed
4372 * out waiting for a dead vnode to show up. Use the MAX_WALK_COUNT
4373 * mechanism to first scan for a reliable vnode before forcing
4374 * a new vnode to be created
4375 */
4376 if (need_reliable_vp == FALSE || vnode_on_reliable_media(vp) == TRUE) {
4377 break;
4378 }
4379 }
4380
4381 // don't iterate more than MAX_WALK_COUNT vnodes to
4382 // avoid keeping the vnode list lock held for too long.
4383
4384 if (walk_count++ > MAX_WALK_COUNT) {
4385 vp = NULL;
4386 break;
4387 }
4388 }
4389 }
4390
4391 //
4392 // if we don't have a vnode and the walk_count is >= MAX_WALK_COUNT
4393 // then we're trying to create a vnode on behalf of a
4394 // process like diskimages-helper that has file systems
4395 // mounted on top of itself (and thus we can't reclaim
4396 // vnodes in the file systems on top of us). if we can't
4397 // find a vnode to reclaim then we'll just have to force
4398 // the allocation.
4399 //
4400 if (vp == NULL && walk_count >= MAX_WALK_COUNT) {
4401 force_alloc = 1;
4402 vnode_list_unlock();
4403 goto retry;
4404 }
4405
4406 if (vp == NULL) {
4407 /*
4408 * we've reached the system-imposed maximum number of vnodes,
4409 * but there isn't a single one available...
4410 * wait a bit and then retry... if we can't get a vnode
4411 * after our target number of retries, then log a complaint
4412 */
4413 if (++retries <= max_retries) {
4414 vnode_list_unlock();
4415 delay_for_interval(1, 1000 * 1000);
4416 goto retry;
4417 }
4418
4419 vnode_list_unlock();
4420 tablefull("vnode");
4421 log(LOG_EMERG, "%d desired, %d numvnodes, "
4422 "%d free, %d dead, %d async, %d rage\n",
4423 desiredvnodes, numvnodes, freevnodes, deadvnodes, async_work_vnodes, ragevnodes);
4424 #if CONFIG_JETSAM
4425
4426 #if DEVELOPMENT || DEBUG
4427 if (bootarg_no_vnode_jetsam) {
4428 panic("vnode table is full\n");
4429 }
4430 #endif /* DEVELOPMENT || DEBUG */
4431
4432 /*
4433 * Running out of vnodes tends to make a system unusable. Start killing
4434 * processes that jetsam knows are killable.
4435 */
4436 if (memorystatus_kill_on_vnode_limit() == FALSE) {
4437 /*
4438 * If jetsam can't find any more processes to kill and there
4439 * still aren't any free vnodes, panic. Hopefully we'll get a
4440 * panic log to tell us why we ran out.
4441 */
4442 panic("vnode table is full\n");
4443 }
4444
4445 /*
4446 * Now that we've killed someone, wait a bit and continue looking
4447 * (with fewer retries before trying another kill).
4448 */
4449 delay_for_interval(3, 1000 * 1000);
4450 retries = 0;
4451 max_retries = 10;
4452 goto retry;
4453 #endif
4454
4455 *vpp = NULL;
4456 return ENFILE;
4457 }
4458 steal_this_vp:
4459 if ((vp = process_vp(vp, 1, &deferred)) == NULLVP) {
4460 if (deferred) {
4461 int elapsed_msecs;
4462 struct timeval elapsed_tv;
4463
4464 if (initial_tv.tv_sec == 0) {
4465 microuptime(&initial_tv);
4466 }
4467
4468 vnode_list_lock();
4469
4470 dead_vnode_waited++;
4471 dead_vnode_wanted++;
4472
4473 /*
4474 * note that we're only going to explicitly wait 10ms
4475 * for a dead vnode to become available, since even if one
4476 * isn't available, a reliable vnode might now be available
4477 * at the head of the VRAGE or free lists... if so, we
4478 * can satisfy the new_vnode request with less latency than waiting
4479 * for the full 100ms duration we're ultimately willing to tolerate
4480 */
4481 assert_wait_timeout((caddr_t)&dead_vnode_wanted, (THREAD_INTERRUPTIBLE), 10000, NSEC_PER_USEC);
4482
4483 vnode_list_unlock();
4484
4485 thread_block(THREAD_CONTINUE_NULL);
4486
4487 microuptime(&elapsed_tv);
4488
4489 timevalsub(&elapsed_tv, &initial_tv);
4490 elapsed_msecs = elapsed_tv.tv_sec * 1000 + elapsed_tv.tv_usec / 1000;
4491
4492 if (elapsed_msecs >= 100) {
4493 /*
4494 * we've waited long enough... 100ms is
4495 * somewhat arbitrary for this case, but the
4496 * normal worst case latency used for UI
4497 * interaction is 100ms, so I've chosen to
4498 * go with that.
4499 *
4500 * setting need_reliable_vp to TRUE
4501 * forces us to find a reliable vnode
4502 * that we can process synchronously, or
4503 * to create a new one if the scan for
4504 * a reliable one hits the scan limit
4505 */
4506 need_reliable_vp = TRUE;
4507 }
4508 }
4509 goto retry;
4510 }
4511 OSAddAtomicLong(1, &num_reusedvnodes);
4512
4513
4514 #if CONFIG_MACF
4515 /*
4516 * We should never see VL_LABELWAIT or VL_LABEL here.
4517 * as those operations hold a reference.
4518 */
4519 assert((vp->v_lflag & VL_LABELWAIT) != VL_LABELWAIT);
4520 assert((vp->v_lflag & VL_LABEL) != VL_LABEL);
4521 if (vp->v_lflag & VL_LABELED) {
4522 vnode_lock_convert(vp);
4523 mac_vnode_label_recycle(vp);
4524 } else if (mac_vnode_label_init_needed(vp)) {
4525 vnode_lock_convert(vp);
4526 mac_vnode_label_init(vp);
4527 }
4528
4529 #endif /* MAC */
4530
4531 vp->v_iocount = 1;
4532 vp->v_lflag = 0;
4533 vp->v_writecount = 0;
4534 vp->v_references = 0;
4535 vp->v_iterblkflags = 0;
4536 vp->v_flag = VSTANDARD;
4537 /* vbad vnodes can point to dead_mountp */
4538 vp->v_mount = NULL;
4539 vp->v_defer_reclaimlist = (vnode_t)0;
4540
4541 vnode_unlock(vp);
4542
4543 done:
4544 *vpp = vp;
4545
4546 return 0;
4547 }
4548
4549 void
4550 vnode_lock(vnode_t vp)
4551 {
4552 lck_mtx_lock(&vp->v_lock);
4553 }
4554
4555 void
4556 vnode_lock_spin(vnode_t vp)
4557 {
4558 lck_mtx_lock_spin(&vp->v_lock);
4559 }
4560
4561 void
4562 vnode_unlock(vnode_t vp)
4563 {
4564 lck_mtx_unlock(&vp->v_lock);
4565 }
4566
4567
4568
4569 int
4570 vnode_get(struct vnode *vp)
4571 {
4572 int retval;
4573
4574 vnode_lock_spin(vp);
4575 retval = vnode_get_locked(vp);
4576 vnode_unlock(vp);
4577
4578 return retval;
4579 }
4580
4581 int
4582 vnode_get_locked(struct vnode *vp)
4583 {
4584 #if DIAGNOSTIC
4585 lck_mtx_assert(&vp->v_lock, LCK_MTX_ASSERT_OWNED);
4586 #endif
4587 if ((vp->v_iocount == 0) && (vp->v_lflag & (VL_TERMINATE | VL_DEAD))) {
4588 return ENOENT;
4589 }
4590
4591 if (os_add_overflow(vp->v_iocount, 1, &vp->v_iocount)) {
4592 panic("v_iocount overflow");
4593 }
4594
4595 #ifdef JOE_DEBUG
4596 record_vp(vp, 1);
4597 #endif
4598 return 0;
4599 }
4600
4601 /*
4602 * vnode_getwithvid() cuts in line in front of a vnode drain (that is,
4603 * while the vnode is draining, but at no point after that) to prevent
4604 * deadlocks when getting vnodes from filesystem hashes while holding
4605 * resources that may prevent other iocounts from being released.
4606 */
4607 int
4608 vnode_getwithvid(vnode_t vp, uint32_t vid)
4609 {
4610 return vget_internal(vp, vid, (VNODE_NODEAD | VNODE_WITHID | VNODE_DRAINO));
4611 }
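/*
 * Sketch of the vid-revalidation pattern vnode_getwithvid supports
 * (illustrative; myfs_hash_lookup and struct myfs_key are assumptions, not
 * part of xnu). A filesystem finds a vnode in its hash without holding an
 * iocount, captures the identity with vnode_vid, and then revalidates:
 */
#if 0
static vnode_t
myfs_lookup_vnode(struct myfs_key *key)
{
	vnode_t vp;
	uint32_t vid;

retry:
	vp = myfs_hash_lookup(key);     /* assumed FS hash; no iocount held yet */
	if (vp == NULLVP) {
		return NULLVP;
	}
	vid = vnode_vid(vp);
	/* we may block below, so vp can be recycled out from under us... */
	if (vnode_getwithvid(vp, vid) != 0) {
		goto retry;             /* v_id was bumped by a reclaim: stale vnode */
	}
	return vp;                      /* returned with an iocount held */
}
#endif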
4612
4613 /*
4614 * vnode_getwithvid_drainok() is like vnode_getwithvid(), but *does* block behind a vnode
4615 * drain; it exists for use in the VFS name cache, where we really do want to block behind
4616 * vnode drain to prevent holding off an unmount.
4617 */
4618 int
4619 vnode_getwithvid_drainok(vnode_t vp, uint32_t vid)
4620 {
4621 return vget_internal(vp, vid, (VNODE_NODEAD | VNODE_WITHID));
4622 }
4623
4624 int
4625 vnode_getwithref(vnode_t vp)
4626 {
4627 return vget_internal(vp, 0, 0);
4628 }
4629
4630
4631 __private_extern__ int
4632 vnode_getalways(vnode_t vp)
4633 {
4634 return vget_internal(vp, 0, VNODE_ALWAYS);
4635 }
4636
4637 int
4638 vnode_put(vnode_t vp)
4639 {
4640 int retval;
4641
4642 vnode_lock_spin(vp);
4643 retval = vnode_put_locked(vp);
4644 vnode_unlock(vp);
4645
4646 return retval;
4647 }
4648
4649 static inline void
4650 vn_set_dead(vnode_t vp)
4651 {
4652 vp->v_mount = NULL;
4653 vp->v_op = dead_vnodeop_p;
4654 vp->v_tag = VT_NON;
4655 vp->v_data = NULL;
4656 vp->v_type = VBAD;
4657 vp->v_lflag |= VL_DEAD;
4658 }
4659
4660 int
4661 vnode_put_locked(vnode_t vp)
4662 {
4663 vfs_context_t ctx = vfs_context_current(); /* hoist outside loop */
4664
4665 #if DIAGNOSTIC
4666 lck_mtx_assert(&vp->v_lock, LCK_MTX_ASSERT_OWNED);
4667 #endif
4668 retry:
4669 if (vp->v_iocount < 1) {
4670 panic("vnode_put(%p): iocount < 1", vp);
4671 }
4672
4673 if ((vp->v_usecount > 0) || (vp->v_iocount > 1)) {
4674 vnode_dropiocount(vp);
4675 return 0;
4676 }
4677 if ((vp->v_lflag & (VL_DEAD | VL_NEEDINACTIVE)) == VL_NEEDINACTIVE) {
4678 vp->v_lflag &= ~VL_NEEDINACTIVE;
4679 vnode_unlock(vp);
4680
4681 VNOP_INACTIVE(vp, ctx);
4682
4683 vnode_lock_spin(vp);
4684 /*
4685 * because we had to drop the vnode lock before calling
4686 * VNOP_INACTIVE, the state of this vnode may have changed...
4687 * we may pick up both VL_MARKTERM and either
4688 * an iocount or a usecount while in the VNOP_INACTIVE call
4689 * we don't want to call vnode_reclaim_internal on a vnode
4690 * that has active references on it... so loop back around
4691 * and reevaluate the state
4692 */
4693 goto retry;
4694 }
4695 vp->v_lflag &= ~VL_NEEDINACTIVE;
4696
4697 if ((vp->v_lflag & (VL_MARKTERM | VL_TERMINATE | VL_DEAD)) == VL_MARKTERM) {
4698 vnode_lock_convert(vp);
4699 vnode_reclaim_internal(vp, 1, 1, 0);
4700 }
4701 vnode_dropiocount(vp);
4702 vnode_list_add(vp);
4703
4704 return 0;
4705 }
4706
4707 /* is vnode_t in use by others? */
4708 int
4709 vnode_isinuse(vnode_t vp, int refcnt)
4710 {
4711 return vnode_isinuse_locked(vp, refcnt, 0);
4712 }
4713
4714 int
4715 vnode_usecount(vnode_t vp)
4716 {
4717 return vp->v_usecount;
4718 }
4719
4720 int
4721 vnode_iocount(vnode_t vp)
4722 {
4723 return vp->v_iocount;
4724 }
4725
4726 static int
4727 vnode_isinuse_locked(vnode_t vp, int refcnt, int locked)
4728 {
4729 int retval = 0;
4730
4731 if (!locked) {
4732 vnode_lock_spin(vp);
4733 }
4734 if ((vp->v_type != VREG) && ((vp->v_usecount - vp->v_kusecount) > refcnt)) {
4735 retval = 1;
4736 goto out;
4737 }
4738 if (vp->v_type == VREG) {
4739 retval = ubc_isinuse_locked(vp, refcnt, 1);
4740 }
4741
4742 out:
4743 if (!locked) {
4744 vnode_unlock(vp);
4745 }
4746 return retval;
4747 }
4748
4749
4750 /* resume vnode_t */
4751 errno_t
4752 vnode_resume(vnode_t vp)
4753 {
4754 if ((vp->v_lflag & VL_SUSPENDED) && vp->v_owner == current_thread()) {
4755 vnode_lock_spin(vp);
4756 vp->v_lflag &= ~VL_SUSPENDED;
4757 vp->v_owner = NULL;
4758 vnode_unlock(vp);
4759
4760 wakeup(&vp->v_iocount);
4761 }
4762 return 0;
4763 }
4764
4765 /* suspend vnode_t
4766 * Please do not use on more than one vnode at a time as it may
4767 * cause deadlocks.
4768 * xxx should we explicitly prevent this from happening?
4769 */
4770
4771 errno_t
4772 vnode_suspend(vnode_t vp)
4773 {
4774 if (vp->v_lflag & VL_SUSPENDED) {
4775 return EBUSY;
4776 }
4777
4778 vnode_lock_spin(vp);
4779
4780 /*
4781 * xxx is this sufficient to check if a vnode_drain is
4782 * in progress?
4783 */
4784
4785 if (vp->v_owner == NULL) {
4786 vp->v_lflag |= VL_SUSPENDED;
4787 vp->v_owner = current_thread();
4788 }
4789 vnode_unlock(vp);
4790
4791 return 0;
4792 }
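/*
 * Sketch of how vnode_suspend/vnode_resume are intended to pair up
 * (illustrative only). Per the warning above, only one vnode should be
 * suspended at a time, and the resume must come from the suspending thread:
 */
#if 0
static errno_t
myfs_do_exclusive_work(vnode_t vp)
{
	errno_t error;

	if ((error = vnode_suspend(vp)) != 0) {
		return error;           /* EBUSY: someone else already suspended it */
	}
	/* ... work that must not race with new iocount holders ... */
	return vnode_resume(vp);        /* clears VL_SUSPENDED, wakes waiters */
}
#endif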
4793
4794 /*
4795 * Release any blocked locking requests on the vnode.
4796 * Used for forced-unmounts.
4797 *
4798 * XXX What about network filesystems?
4799 */
4800 static void
4801 vnode_abort_advlocks(vnode_t vp)
4802 {
4803 if (vp->v_flag & VLOCKLOCAL) {
4804 lf_abort_advlocks(vp);
4805 }
4806 }
4807
4808
4809 static errno_t
4810 vnode_drain(vnode_t vp)
4811 {
4812 if (vp->v_lflag & VL_DRAIN) {
4813 panic("vnode_drain: recursive drain");
4814 return ENOENT;
4815 }
4816 vp->v_lflag |= VL_DRAIN;
4817 vp->v_owner = current_thread();
4818
4819 while (vp->v_iocount > 1) {
4820 msleep(&vp->v_iocount, &vp->v_lock, PVFS, "vnode_drain", NULL);
4821 }
4822
4823 vp->v_lflag &= ~VL_DRAIN;
4824
4825 return 0;
4826 }
4827
4828
4829 /*
4830 * if the number of recent references via vnode_getwithvid or vnode_getwithref
4831 * exceeds this threshold, then 'UN-AGE' the vnode by removing it from
4832 * the LRU list if it's currently on it... once the iocount and usecount both drop
4833 * to 0, it will get put back on the end of the list, effectively making it younger.
4834 * This allows us to keep actively referenced vnodes in the list without having
4835 * to constantly remove and add to the list each time a vnode w/o a usecount is
4836 * referenced, which costs us taking and dropping a global lock twice.
4837 * However, if the vnode is marked DIRTY, we want to pull it out much earlier
4838 */
4839 #define UNAGE_THRESHHOLD 25
4840 #define UNAGE_DIRTYTHRESHHOLD 6
4841
4842 errno_t
4843 vnode_getiocount(vnode_t vp, unsigned int vid, int vflags)
4844 {
4845 int nodead = vflags & VNODE_NODEAD;
4846 int nosusp = vflags & VNODE_NOSUSPEND;
4847 int always = vflags & VNODE_ALWAYS;
4848 int beatdrain = vflags & VNODE_DRAINO;
4849 int withvid = vflags & VNODE_WITHID;
4850
4851 for (;;) {
4852 int sleepflg = 0;
4853
4854 /*
4855 * if it is a dead vnode with deadfs
4856 */
4857 if (nodead && (vp->v_lflag & VL_DEAD) && ((vp->v_type == VBAD) || (vp->v_data == 0))) {
4858 return ENOENT;
4859 }
4860 /*
4861 * will return VL_DEAD ones
4862 */
4863 if ((vp->v_lflag & (VL_SUSPENDED | VL_DRAIN | VL_TERMINATE)) == 0) {
4864 break;
4865 }
4866 /*
4867 * if suspended vnodes are to be failed
4868 */
4869 if (nosusp && (vp->v_lflag & VL_SUSPENDED)) {
4870 return ENOENT;
4871 }
4872 /*
4873 * if you are the owner of the drain/suspend/termination, you can acquire an iocount;
4874 * check for VL_TERMINATE explicitly, since it does not set the owner
4875 */
4876 if ((vp->v_lflag & (VL_DRAIN | VL_SUSPENDED | VL_TERMINATE)) &&
4877 (vp->v_owner == current_thread())) {
4878 break;
4879 }
4880
4881 if (always != 0) {
4882 break;
4883 }
4884
4885 /*
4886 * If this vnode is getting drained, there are some cases where
4887 * we can't block or, in case of tty vnodes, want to be
4888 * interruptible.
4889 */
4890 if (vp->v_lflag & VL_DRAIN) {
4891 /*
4892 * In some situations, we want to get an iocount
4893 * even if the vnode is draining to prevent deadlock,
4894 * e.g. if we're in the filesystem, potentially holding
4895 * resources that could prevent other iocounts from
4896 * being released.
4897 */
4898 if (beatdrain) {
4899 break;
4900 }
4901 /*
4902 * Don't block if the vnode's mount point is unmounting, as
4903 * we may be the thread the unmount is itself waiting on.
4904 * Only callers who pass in vids (at this point, we've already
4905 * handled nosusp and nodead) are expecting error returns
4906 * from this function, so we can only return errors for
4907 * those. ENODEV is intended to inform callers that the call
4908 * failed because an unmount is in progress.
4909 */
4910 if (withvid && (vp->v_mount) && vfs_isunmount(vp->v_mount)) {
4911 return ENODEV;
4912 }
4913
4914 if (vnode_istty(vp)) {
4915 sleepflg = PCATCH;
4916 }
4917 }
4918
4919 vnode_lock_convert(vp);
4920
4921 if (vp->v_lflag & VL_TERMINATE) {
4922 int error;
4923
4924 vp->v_lflag |= VL_TERMWANT;
4925
4926 error = msleep(&vp->v_lflag, &vp->v_lock,
4927 (PVFS | sleepflg), "vnode getiocount", NULL);
4928 if (error) {
4929 return error;
4930 }
4931 } else {
4932 msleep(&vp->v_iocount, &vp->v_lock, PVFS, "vnode_getiocount", NULL);
4933 }
4934 }
4935 if (withvid && vid != vp->v_id) {
4936 return ENOENT;
4937 }
4938 if (++vp->v_references >= UNAGE_THRESHHOLD ||
4939 (vp->v_flag & VISDIRTY && vp->v_references >= UNAGE_DIRTYTHRESHHOLD)) {
4940 vp->v_references = 0;
4941 vnode_list_remove(vp);
4942 }
4943 vp->v_iocount++;
4944 #ifdef JOE_DEBUG
4945 record_vp(vp, 1);
4946 #endif
4947 return 0;
4948 }
4949
4950 static void
4951 vnode_dropiocount(vnode_t vp)
4952 {
4953 if (vp->v_iocount < 1) {
4954 panic("vnode_dropiocount(%p): v_iocount < 1", vp);
4955 }
4956
4957 vp->v_iocount--;
4958 #ifdef JOE_DEBUG
4959 record_vp(vp, -1);
4960 #endif
4961 if ((vp->v_lflag & (VL_DRAIN | VL_SUSPENDED)) && (vp->v_iocount <= 1)) {
4962 wakeup(&vp->v_iocount);
4963 }
4964 }
4965
4966
4967 void
4968 vnode_reclaim(struct vnode * vp)
4969 {
4970 vnode_reclaim_internal(vp, 0, 0, 0);
4971 }
4972
4973 __private_extern__
4974 void
4975 vnode_reclaim_internal(struct vnode * vp, int locked, int reuse, int flags)
4976 {
4977 int isfifo = 0;
4978
4979 if (!locked) {
4980 vnode_lock(vp);
4981 }
4982
4983 if (vp->v_lflag & VL_TERMINATE) {
4984 panic("vnode reclaim in progress");
4985 }
4986 vp->v_lflag |= VL_TERMINATE;
4987
4988 vn_clearunionwait(vp, 1);
4989
4990 vnode_drain(vp);
4991
4992 isfifo = (vp->v_type == VFIFO);
4993
4994 if (vp->v_type != VBAD) {
4995 vgone(vp, flags); /* clean and reclaim the vnode */
4996 }
4997 /*
4998 * give the vnode a new identity so that vnode_getwithvid will fail
4999 * on any stale cache accesses...
5000 * grab the list_lock so that if we're in "new_vnode"
5001 * behind the list_lock trying to steal this vnode, the v_id is stable...
5002 * once new_vnode drops the list_lock, it will block trying to take
5003 * the vnode lock until we release it... at that point it will evaluate
5004 * whether the v_id has changed
5005 * also need to make sure that the vnode isn't on a list where "new_vnode"
5006 * can find it after the v_id has been bumped until we are completely done
5007 * with the vnode (i.e. putting it back on a list has to be the very last
5008 * thing we do to this vnode... many of the callers of vnode_reclaim_internal
5009 * are holding an io_count on the vnode... they need to drop the io_count
5010 * BEFORE doing a vnode_list_add or make sure to hold the vnode lock until
5011 * they are completely done with the vnode
5012 */
5013 vnode_list_lock();
5014
5015 vnode_list_remove_locked(vp);
5016 vp->v_id++;
5017
5018 vnode_list_unlock();
5019
5020 if (isfifo) {
5021 struct fifoinfo * fip;
5022
5023 fip = vp->v_fifoinfo;
5024 vp->v_fifoinfo = NULL;
5025 FREE(fip, M_TEMP);
5026 }
5027 vp->v_type = VBAD;
5028
5029 if (vp->v_data) {
5030 panic("vnode_reclaim_internal: cleaned vnode isn't");
5031 }
5032 if (vp->v_numoutput) {
5033 panic("vnode_reclaim_internal: clean vnode has pending I/O's");
5034 }
5035 if (UBCINFOEXISTS(vp)) {
5036 panic("vnode_reclaim_internal: ubcinfo not cleaned");
5037 }
5038 if (vp->v_parent) {
5039 panic("vnode_reclaim_internal: vparent not removed");
5040 }
5041 if (vp->v_name) {
5042 panic("vnode_reclaim_internal: vname not removed");
5043 }
5044
5045 vp->v_socket = NULL;
5046
5047 vp->v_lflag &= ~VL_TERMINATE;
5048 vp->v_owner = NULL;
5049
5050 KNOTE(&vp->v_knotes, NOTE_REVOKE);
5051
5052 /* Make sure that when we reuse the vnode, no knotes are left over */
5053 klist_init(&vp->v_knotes);
5054
5055 if (vp->v_lflag & VL_TERMWANT) {
5056 vp->v_lflag &= ~VL_TERMWANT;
5057 wakeup(&vp->v_lflag);
5058 }
5059 if (!reuse) {
5060 /*
5061 * make sure we get on the
5062 * dead list if appropriate
5063 */
5064 vnode_list_add(vp);
5065 }
5066 if (!locked) {
5067 vnode_unlock(vp);
5068 }
5069 }
5070
5071 static int
5072 vnode_create_internal(uint32_t flavor, uint32_t size, void *data, vnode_t *vpp,
5073 int init_vnode)
5074 {
5075 int error;
5076 int insert = 1;
5077 int existing_vnode;
5078 vnode_t vp;
5079 vnode_t nvp;
5080 vnode_t dvp;
5081 struct uthread *ut;
5082 struct componentname *cnp;
5083 struct vnode_fsparam *param = (struct vnode_fsparam *)data;
5084 #if CONFIG_TRIGGERS
5085 struct vnode_trigger_param *tinfo = NULL;
5086 #endif
5087 if (*vpp) {
5088 vp = *vpp;
5089 *vpp = NULLVP;
5090 existing_vnode = 1;
5091 } else {
5092 existing_vnode = 0;
5093 }
5094
5095 if (init_vnode) {
5096 /* Do quick sanity check on the parameters. */
5097 if ((param == NULL) || (param->vnfs_vtype == VBAD)) {
5098 error = EINVAL;
5099 goto error_out;
5100 }
5101
5102 #if CONFIG_TRIGGERS
5103 if ((flavor == VNCREATE_TRIGGER) && (size == VNCREATE_TRIGGER_SIZE)) {
5104 tinfo = (struct vnode_trigger_param *)data;
5105
5106 /* Validate trigger vnode input */
5107 if ((param->vnfs_vtype != VDIR) ||
5108 (tinfo->vnt_resolve_func == NULL) ||
5109 (tinfo->vnt_flags & ~VNT_VALID_MASK)) {
5110 error = EINVAL;
5111 goto error_out;
5112 }
5113 /* Fall through to a normal create (params will be the same) */
5114 flavor = VNCREATE_FLAVOR;
5115 size = VCREATESIZE;
5116 }
5117 #endif
5118 if ((flavor != VNCREATE_FLAVOR) || (size != VCREATESIZE)) {
5119 error = EINVAL;
5120 goto error_out;
5121 }
5122 }
5123
5124 if (!existing_vnode) {
5125 if ((error = new_vnode(&vp))) {
5126 return error;
5127 }
5128 if (!init_vnode) {
5129 /* Make it so that it can be released by a vnode_put */
5130 vn_set_dead(vp);
5131 *vpp = vp;
5132 return 0;
5133 }
5134 } else {
5135 /*
5136 * A vnode obtained by vnode_create_empty has been passed to
5137 * vnode_initialize - Unset VL_DEAD set by vn_set_dead. After
5138 * this point, it is set back on any error.
5139 *
5140 * N.B. vnode locking - We make the same assumptions as the
5141 * "unsplit" vnode_create did - i.e. it is safe to update the
5142 * vnode's fields without the vnode lock. This vnode has been
5143 * out and about with the filesystem and hopefully nothing
5144 * was done to the vnode between the vnode_create_empty and
5145 * now when it has come in through vnode_initialize.
5146 */
5147 vp->v_lflag &= ~VL_DEAD;
5148 }
5149
5150 dvp = param->vnfs_dvp;
5151 cnp = param->vnfs_cnp;
5152
5153 vp->v_op = param->vnfs_vops;
5154 vp->v_type = param->vnfs_vtype;
5155 vp->v_data = param->vnfs_fsnode;
5156
5157 if (param->vnfs_markroot) {
5158 vp->v_flag |= VROOT;
5159 }
5160 if (param->vnfs_marksystem) {
5161 vp->v_flag |= VSYSTEM;
5162 }
5163 if (vp->v_type == VREG) {
5164 error = ubc_info_init_withsize(vp, param->vnfs_filesize);
5165 if (error) {
5166 #ifdef JOE_DEBUG
5167 record_vp(vp, 1);
5168 #endif
5169 vn_set_dead(vp);
5170
5171 vnode_put(vp);
5172 return error;
5173 }
5174 if (param->vnfs_mp->mnt_ioflags & MNT_IOFLAGS_IOSCHED_SUPPORTED) {
5175 memory_object_mark_io_tracking(vp->v_ubcinfo->ui_control);
5176 }
5177 }
5178 #ifdef JOE_DEBUG
5179 record_vp(vp, 1);
5180 #endif
5181
5182 #if CONFIG_TRIGGERS
5183 /*
5184 * For trigger vnodes, attach trigger info to vnode
5185 */
5186 if ((vp->v_type == VDIR) && (tinfo != NULL)) {
5187 /*
5188 * Note: has a side effect of incrementing trigger count on the
5189 * mount if successful, which we would need to undo on a
5190 * subsequent failure.
5191 */
5192 #ifdef JOE_DEBUG
5193 record_vp(vp, -1);
5194 #endif
5195 error = vnode_resolver_create(param->vnfs_mp, vp, tinfo, FALSE);
5196 if (error) {
5197 printf("vnode_create: vnode_resolver_create() err %d\n", error);
5198 vn_set_dead(vp);
5199 #ifdef JOE_DEBUG
5200 record_vp(vp, 1);
5201 #endif
5202 vnode_put(vp);
5203 return error;
5204 }
5205 }
5206 #endif
5207 if (vp->v_type == VCHR || vp->v_type == VBLK) {
5208 vp->v_tag = VT_DEVFS; /* callers will reset if needed (bdevvp) */
5209
5210 if ((nvp = checkalias(vp, param->vnfs_rdev))) {
5211 /*
5212 * if checkalias returns a vnode, it will be locked
5213 *
5214 * first get rid of the unneeded vnode we acquired
5215 */
5216 vp->v_data = NULL;
5217 vp->v_op = spec_vnodeop_p;
5218 vp->v_type = VBAD;
5219 vp->v_lflag = VL_DEAD;
5220 vp->v_data = NULL;
5221 vp->v_tag = VT_NON;
5222 vnode_put(vp);
5223
5224 /*
5225 * switch to aliased vnode and finish
5226 * preparing it
5227 */
5228 vp = nvp;
5229
5230 vclean(vp, 0);
5231 vp->v_op = param->vnfs_vops;
5232 vp->v_type = param->vnfs_vtype;
5233 vp->v_data = param->vnfs_fsnode;
5234 vp->v_lflag = 0;
5235 vp->v_mount = NULL;
5236 insmntque(vp, param->vnfs_mp);
5237 insert = 0;
5238 vnode_unlock(vp);
5239 }
5240
5241 if (VCHR == vp->v_type) {
5242 u_int maj = major(vp->v_rdev);
5243
5244 if (maj < (u_int)nchrdev && cdevsw[maj].d_type == D_TTY) {
5245 vp->v_flag |= VISTTY;
5246 }
5247 }
5248 }
5249
5250 if (vp->v_type == VFIFO) {
5251 struct fifoinfo *fip;
5252
5253 MALLOC(fip, struct fifoinfo *,
5254 sizeof(*fip), M_TEMP, M_WAITOK);
5255 bzero(fip, sizeof(struct fifoinfo));
5256 vp->v_fifoinfo = fip;
5257 }
5258 /* The file systems must pass the address of the location where
5259 * they store the vnode pointer. When we add the vnode into the mount
5260 * list and name cache it becomes discoverable. So the file system node
5261 * must have the connection to the vnode set up by then
5262 */
5263 *vpp = vp;
5264
5265 /* Add fs named reference. */
5266 if (param->vnfs_flags & VNFS_ADDFSREF) {
5267 vp->v_lflag |= VNAMED_FSHASH;
5268 }
5269 if (param->vnfs_mp) {
5270 if (param->vnfs_mp->mnt_kern_flag & MNTK_LOCK_LOCAL) {
5271 vp->v_flag |= VLOCKLOCAL;
5272 }
5273 if (insert) {
5274 if ((vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb)) {
5275 panic("insmntque: vp on the free list\n");
5276 }
5277
5278 /*
5279 * enter in mount vnode list
5280 */
5281 insmntque(vp, param->vnfs_mp);
5282 }
5283 }
5284 if (dvp && vnode_ref(dvp) == 0) {
5285 vp->v_parent = dvp;
5286 }
5287 if (cnp) {
5288 if (dvp && ((param->vnfs_flags & (VNFS_NOCACHE | VNFS_CANTCACHE)) == 0)) {
5289 /*
5290 * enter into name cache
5291 * we've got the info to enter it into the name cache now
5292 * cache_enter_create will pick up an extra reference on
5293 * the name entered into the string cache
5294 */
5295 vp->v_name = cache_enter_create(dvp, vp, cnp);
5296 } else {
5297 vp->v_name = vfs_addname(cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_hash, 0);
5298 }
5299
5300 if ((cnp->cn_flags & UNIONCREATED) == UNIONCREATED) {
5301 vp->v_flag |= VISUNION;
5302 }
5303 }
5304 if ((param->vnfs_flags & VNFS_CANTCACHE) == 0) {
5305 /*
5306 * this vnode is being created as cacheable in the name cache
5307 * this allows us to re-enter it in the cache
5308 */
5309 vp->v_flag |= VNCACHEABLE;
5310 }
5311 ut = get_bsdthread_info(current_thread());
5312
5313 if ((current_proc()->p_lflag & P_LRAGE_VNODES) ||
5314 (ut->uu_flag & (UT_RAGE_VNODES | UT_KERN_RAGE_VNODES))) {
5315 /*
5316 * process has indicated that it wants any
5317 * vnodes created on its behalf to be rapidly
5318 * aged to reduce the impact on the cached set
5319 * of vnodes
5320 *
5321 * if UT_KERN_RAGE_VNODES is set, then the
5322 * kernel internally wants vnodes to be rapidly
5323 * aged, even if the process hasn't requested
5324 * this
5325 */
5326 vp->v_flag |= VRAGE;
5327 }
5328
5329 #if CONFIG_SECLUDED_MEMORY
5330 switch (secluded_for_filecache) {
5331 case 0:
5332 /*
5333 * secluded_for_filecache == 0:
5334 * + no file contents in secluded pool
5335 */
5336 break;
5337 case 1:
5338 /*
5339 * secluded_for_filecache == 1:
5340 * + no files from /
5341 * + files from /Applications/ are OK
5342 * + files from /Applications/Camera are not OK
5343 * + no files that are open for write
5344 */
5345 if (vnode_vtype(vp) == VREG &&
5346 vnode_mount(vp) != NULL &&
5347 (!(vfs_flags(vnode_mount(vp)) & MNT_ROOTFS))) {
5348 /* not from root filesystem: eligible for secluded pages */
5349 memory_object_mark_eligible_for_secluded(
5350 ubc_getobject(vp, UBC_FLAGS_NONE),
5351 TRUE);
5352 }
5353 break;
5354 case 2:
5355 /*
5356 * secluded_for_filecache == 2:
5357 * + all read-only files OK, except:
5358 * + dyld_shared_cache_arm64*
5359 * + Camera
5360 * + mediaserverd
5361 */
5362 if (vnode_vtype(vp) == VREG) {
5363 memory_object_mark_eligible_for_secluded(
5364 ubc_getobject(vp, UBC_FLAGS_NONE),
5365 TRUE);
5366 }
5367 break;
5368 default:
5369 break;
5370 }
5371 #endif /* CONFIG_SECLUDED_MEMORY */
5372
5373 return 0;
5374
5375 error_out:
5376 if (existing_vnode) {
5377 vnode_put(vp);
5378 }
5379 return error;
5380 }
5381
5382 /* USAGE:
5383 * The following API creates a vnode and associates all the parameters specified in the vnode_fsparam
5384 * structure, and returns a vnode handle with a reference. Device aliasing is handled here, so checkalias
5385 * is obsoleted by this.
5386 */
5387 int
5388 vnode_create(uint32_t flavor, uint32_t size, void *data, vnode_t *vpp)
5389 {
5390 *vpp = NULLVP;
5391 return vnode_create_internal(flavor, size, data, vpp, 1);
5392 }
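/*
 * A minimal sketch of a filesystem calling vnode_create (illustrative;
 * myfs_node and myfs_vnodeop_p are assumptions standing in for a real
 * filesystem's node type and vnode-operations vector):
 */
#if 0
static int
myfs_get_vnode(mount_t mp, vnode_t dvp, struct componentname *cnp,
    struct myfs_node *np, off_t filesize, vnode_t *vpp)
{
	struct vnode_fsparam vfsp;

	bzero(&vfsp, sizeof(vfsp));
	vfsp.vnfs_mp = mp;
	vfsp.vnfs_vtype = VREG;
	vfsp.vnfs_str = "myfs";
	vfsp.vnfs_dvp = dvp;
	vfsp.vnfs_fsnode = np;
	vfsp.vnfs_vops = myfs_vnodeop_p;
	vfsp.vnfs_cnp = cnp;
	vfsp.vnfs_filesize = filesize;
	vfsp.vnfs_flags = VNFS_ADDFSREF;        /* take the VNAMED_FSHASH reference */

	/* on success, *vpp comes back with an iocount held */
	return vnode_create(VNCREATE_FLAVOR, VCREATESIZE, &vfsp, vpp);
}
#endif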
5393
5394 int
5395 vnode_create_empty(vnode_t *vpp)
5396 {
5397 *vpp = NULLVP;
5398 return vnode_create_internal(VNCREATE_FLAVOR, VCREATESIZE, NULL,
5399 vpp, 0);
5400 }
5401
5402 int
5403 vnode_initialize(uint32_t flavor, uint32_t size, void *data, vnode_t *vpp)
5404 {
5405 if (*vpp == NULLVP) {
5406 panic("NULL vnode passed to vnode_initialize");
5407 }
5408 #if DEVELOPMENT || DEBUG
5409 /*
5410 * We lock to check that vnode is fit for unlocked use in
5411 * vnode_create_internal.
5412 */
5413 vnode_lock_spin(*vpp);
5414 VNASSERT(((*vpp)->v_iocount == 1), *vpp,
5415 ("vnode_initialize : iocount not 1, is %d", (*vpp)->v_iocount));
5416 VNASSERT(((*vpp)->v_usecount == 0), *vpp,
5417 ("vnode_initialize : usecount not 0, is %d", (*vpp)->v_usecount));
5418 VNASSERT(((*vpp)->v_lflag & VL_DEAD), *vpp,
5419 ("vnode_initialize : v_lflag does not have VL_DEAD, is 0x%x",
5420 (*vpp)->v_lflag));
5421 VNASSERT(((*vpp)->v_data == NULL), *vpp,
5422 ("vnode_initialize : v_data not NULL"));
5423 vnode_unlock(*vpp);
5424 #endif
5425 return vnode_create_internal(flavor, size, data, vpp, 1);
5426 }
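/*
 * Sketch of the two-phase pattern vnode_create_empty/vnode_initialize enable
 * (illustrative; myfs_node, myfs_setup_node and myfs_vnodeop_p are
 * assumptions): reserve a vnode up front, do the fallible filesystem setup,
 * then initialize. On a setup failure the reserved vnode is simply released
 * with vnode_put, since vnode_create_empty returns it marked dead:
 */
#if 0
static int
myfs_make_vnode(mount_t mp, struct myfs_node *np, vnode_t *vpp)
{
	struct vnode_fsparam vfsp;
	vnode_t vp = NULLVP;
	int error;

	if ((error = vnode_create_empty(&vp))) {
		return error;
	}
	if ((error = myfs_setup_node(np))) {    /* assumed fallible FS work */
		vnode_put(vp);                  /* safe: vp is dead until initialized */
		return error;
	}
	bzero(&vfsp, sizeof(vfsp));
	vfsp.vnfs_mp = mp;
	vfsp.vnfs_vtype = VDIR;
	vfsp.vnfs_fsnode = np;
	vfsp.vnfs_vops = myfs_vnodeop_p;

	/* on error, vnode_initialize has already released vp for us */
	error = vnode_initialize(VNCREATE_FLAVOR, VCREATESIZE, &vfsp, &vp);
	if (error == 0) {
		*vpp = vp;
	}
	return error;
}
#endif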
5427
5428 int
5429 vnode_addfsref(vnode_t vp)
5430 {
5431 vnode_lock_spin(vp);
5432 if (vp->v_lflag & VNAMED_FSHASH) {
5433 panic("add_fsref: vp already has named reference");
5434 }
5435 if ((vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb)) {
5436 panic("addfsref: vp on the free list\n");
5437 }
5438 vp->v_lflag |= VNAMED_FSHASH;
5439 vnode_unlock(vp);
5440 return 0;
5441 }
5442 int
5443 vnode_removefsref(vnode_t vp)
5444 {
5445 vnode_lock_spin(vp);
5446 if ((vp->v_lflag & VNAMED_FSHASH) == 0) {
5447 panic("remove_fsref: no named reference");
5448 }
5449 vp->v_lflag &= ~VNAMED_FSHASH;
5450 vnode_unlock(vp);
5451 return 0;
5452 }
5453
5454
5455 int
5456 vfs_iterate(int flags, int (*callout)(mount_t, void *), void *arg)
5457 {
5458 mount_t mp;
5459 int ret = 0;
5460 fsid_t * fsid_list;
5461 int count, actualcount, i;
5462 void * allocmem;
5463 int indx_start, indx_stop, indx_incr;
5464 int cb_dropref = (flags & VFS_ITERATE_CB_DROPREF);
5465
5466 count = mount_getvfscnt();
5467 count += 10;
5468
5469 fsid_list = (fsid_t *)kalloc(count * sizeof(fsid_t));
5470 allocmem = (void *)fsid_list;
5471
5472 actualcount = mount_fillfsids(fsid_list, count);
5473
5474 /*
5475 * Establish the iteration direction
5476 * VFS_ITERATE_TAIL_FIRST overrides default head first order (oldest first)
5477 */
5478 if (flags & VFS_ITERATE_TAIL_FIRST) {
5479 indx_start = actualcount - 1;
5480 indx_stop = -1;
5481 indx_incr = -1;
5482 } else { /* Head first by default */
5483 indx_start = 0;
5484 indx_stop = actualcount;
5485 indx_incr = 1;
5486 }
5487
5488 for (i = indx_start; i != indx_stop; i += indx_incr) {
5489 /* obtain the mount point with iteration reference */
5490 mp = mount_list_lookupby_fsid(&fsid_list[i], 0, 1);
5491
5492 if (mp == (struct mount *)0) {
5493 continue;
5494 }
5495 mount_lock(mp);
5496 if (mp->mnt_lflag & (MNT_LDEAD | MNT_LUNMOUNT)) {
5497 mount_unlock(mp);
5498 mount_iterdrop(mp);
5499 continue;
5500 }
5501 mount_unlock(mp);
5502
5503 /* iterate over all the vnodes */
5504 ret = callout(mp, arg);
5505
5506 /*
5507 * Drop the iterref here if the callback didn't do it.
5508 * Note: If cb_dropref is set the mp may no longer exist.
5509 */
5510 if (!cb_dropref) {
5511 mount_iterdrop(mp);
5512 }
5513
5514 switch (ret) {
5515 case VFS_RETURNED:
5516 case VFS_RETURNED_DONE:
5517 if (ret == VFS_RETURNED_DONE) {
5518 ret = 0;
5519 goto out;
5520 }
5521 break;
5522
5523 case VFS_CLAIMED_DONE:
5524 ret = 0;
5525 goto out;
5526 case VFS_CLAIMED:
5527 default:
5528 break;
5529 }
5530 ret = 0;
5531 }
5532
5533 out:
5534 kfree(allocmem, (count * sizeof(fsid_t)));
5535 return ret;
5536 }
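/*
 * A minimal vfs_iterate callout sketch (illustrative only). With no flags,
 * vfs_iterate itself drops the per-mount iteration reference, so the callout
 * just inspects the mount and returns one of the VFS_* codes handled above:
 */
#if 0
static int
print_mount_callout(mount_t mp, __unused void *arg)
{
	printf("mounted on %s\n", vfs_statfs(mp)->f_mntonname);
	return VFS_RETURNED;    /* keep iterating; VFS_RETURNED_DONE would stop */
}

static void
print_all_mounts(void)
{
	(void)vfs_iterate(0, print_mount_callout, NULL);
}
#endif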
5537
5538 /*
5539 * Update the vfsstatfs structure in the mountpoint.
5540 * MAC: Parameter eventtype added, indicating whether the event that
5541 * triggered this update came from user space, via a system call
5542 * (VFS_USER_EVENT) or an internal kernel call (VFS_KERNEL_EVENT).
5543 */
5544 int
5545 vfs_update_vfsstat(mount_t mp, vfs_context_t ctx, __unused int eventtype)
5546 {
5547 struct vfs_attr va;
5548 int error;
5549
5550 /*
5551 * Request the attributes we want to propagate into
5552 * the per-mount vfsstat structure.
5553 */
5554 VFSATTR_INIT(&va);
5555 VFSATTR_WANTED(&va, f_iosize);
5556 VFSATTR_WANTED(&va, f_blocks);
5557 VFSATTR_WANTED(&va, f_bfree);
5558 VFSATTR_WANTED(&va, f_bavail);
5559 VFSATTR_WANTED(&va, f_bused);
5560 VFSATTR_WANTED(&va, f_files);
5561 VFSATTR_WANTED(&va, f_ffree);
5562 VFSATTR_WANTED(&va, f_bsize);
5563 VFSATTR_WANTED(&va, f_fssubtype);
5564
5565 if ((error = vfs_getattr(mp, &va, ctx)) != 0) {
5566 KAUTH_DEBUG("STAT - filesystem returned error %d", error);
5567 return error;
5568 }
5569 #if CONFIG_MACF
5570 if (eventtype == VFS_USER_EVENT) {
5571 error = mac_mount_check_getattr(ctx, mp, &va);
5572 if (error != 0) {
5573 return error;
5574 }
5575 }
5576 #endif
5577 /*
5578 * Unpack into the per-mount structure.
5579 *
5580 * We only overwrite these fields, which are likely to change:
5581 * f_blocks
5582 * f_bfree
5583 * f_bavail
5584 * f_bused
5585 * f_files
5586 * f_ffree
5587 *
5588 * And these which are not, but which the FS has no other way
5589 * of providing to us:
5590 * f_bsize
5591 * f_iosize
5592 * f_fssubtype
5593 *
5594 */
5595 if (VFSATTR_IS_SUPPORTED(&va, f_bsize)) {
5596 /* 4822056 - protect against malformed server mount */
5597 mp->mnt_vfsstat.f_bsize = (va.f_bsize > 0 ? va.f_bsize : 512);
5598 } else {
5599 mp->mnt_vfsstat.f_bsize = mp->mnt_devblocksize; /* default from the device block size */
5600 }
5601 if (VFSATTR_IS_SUPPORTED(&va, f_iosize)) {
5602 mp->mnt_vfsstat.f_iosize = va.f_iosize;
5603 } else {
5604 mp->mnt_vfsstat.f_iosize = 1024 * 1024; /* 1MB sensible I/O size */
5605 }
5606 if (VFSATTR_IS_SUPPORTED(&va, f_blocks)) {
5607 mp->mnt_vfsstat.f_blocks = va.f_blocks;
5608 }
5609 if (VFSATTR_IS_SUPPORTED(&va, f_bfree)) {
5610 mp->mnt_vfsstat.f_bfree = va.f_bfree;
5611 }
5612 if (VFSATTR_IS_SUPPORTED(&va, f_bavail)) {
5613 mp->mnt_vfsstat.f_bavail = va.f_bavail;
5614 }
5615 if (VFSATTR_IS_SUPPORTED(&va, f_bused)) {
5616 mp->mnt_vfsstat.f_bused = va.f_bused;
5617 }
5618 if (VFSATTR_IS_SUPPORTED(&va, f_files)) {
5619 mp->mnt_vfsstat.f_files = va.f_files;
5620 }
5621 if (VFSATTR_IS_SUPPORTED(&va, f_ffree)) {
5622 mp->mnt_vfsstat.f_ffree = va.f_ffree;
5623 }
5624
5625 /* this is unlikely to change, but has to be queried for */
5626 if (VFSATTR_IS_SUPPORTED(&va, f_fssubtype)) {
5627 mp->mnt_vfsstat.f_fssubtype = va.f_fssubtype;
5628 }
5629
5630 return 0;
5631 }
5632
5633 int
5634 mount_list_add(mount_t mp)
5635 {
5636 int res;
5637
5638 mount_list_lock();
5639 if (system_inshutdown != 0) {
5640 res = -1;
5641 } else {
5642 TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list);
5643 nummounts++;
5644 res = 0;
5645 }
5646 mount_list_unlock();
5647
5648 return res;
5649 }
5650
5651 void
5652 mount_list_remove(mount_t mp)
5653 {
5654 mount_list_lock();
5655 TAILQ_REMOVE(&mountlist, mp, mnt_list);
5656 nummounts--;
5657 mp->mnt_list.tqe_next = NULL;
5658 mp->mnt_list.tqe_prev = NULL;
5659 mount_list_unlock();
5660 }
5661
5662 mount_t
5663 mount_lookupby_volfsid(int volfs_id, int withref)
5664 {
5665 mount_t cur_mount = (mount_t)0;
5666 mount_t mp;
5667
5668 mount_list_lock();
5669 TAILQ_FOREACH(mp, &mountlist, mnt_list) {
5670 if (!(mp->mnt_kern_flag & MNTK_UNMOUNT) &&
5671 (mp->mnt_kern_flag & MNTK_PATH_FROM_ID) &&
5672 (mp->mnt_vfsstat.f_fsid.val[0] == volfs_id)) {
5673 cur_mount = mp;
5674 if (withref) {
5675 if (mount_iterref(cur_mount, 1)) {
5676 cur_mount = (mount_t)0;
5677 mount_list_unlock();
5678 goto out;
5679 }
5680 }
5681 break;
5682 }
5683 }
5684 mount_list_unlock();
5685 if (withref && (cur_mount != (mount_t)0)) {
5686 mp = cur_mount;
5687 if (vfs_busy(mp, LK_NOWAIT) != 0) {
5688 cur_mount = (mount_t)0;
5689 }
5690 mount_iterdrop(mp);
5691 }
5692 out:
5693 return cur_mount;
5694 }
5695
5696 mount_t
5697 mount_list_lookupby_fsid(fsid_t *fsid, int locked, int withref)
5698 {
5699 mount_t retmp = (mount_t)0;
5700 mount_t mp;
5701
5702 if (!locked) {
5703 mount_list_lock();
5704 }
5705 TAILQ_FOREACH(mp, &mountlist, mnt_list)
5706 if (mp->mnt_vfsstat.f_fsid.val[0] == fsid->val[0] &&
5707 mp->mnt_vfsstat.f_fsid.val[1] == fsid->val[1]) {
5708 retmp = mp;
5709 if (withref) {
5710 if (mount_iterref(retmp, 1)) {
5711 retmp = (mount_t)0;
5712 }
5713 }
5714 goto out;
5715 }
5716 out:
5717 if (!locked) {
5718 mount_list_unlock();
5719 }
5720 return retmp;
5721 }
5722
5723 errno_t
5724 vnode_lookup(const char *path, int flags, vnode_t *vpp, vfs_context_t ctx)
5725 {
5726 struct nameidata nd;
5727 int error;
5728 u_int32_t ndflags = 0;
5729
5730 if (ctx == NULL) {
5731 return EINVAL;
5732 }
5733
5734 if (flags & VNODE_LOOKUP_NOFOLLOW) {
5735 ndflags = NOFOLLOW;
5736 } else {
5737 ndflags = FOLLOW;
5738 }
5739
5740 if (flags & VNODE_LOOKUP_NOCROSSMOUNT) {
5741 ndflags |= NOCROSSMOUNT;
5742 }
5743
5744 if (flags & VNODE_LOOKUP_CROSSMOUNTNOWAIT) {
5745 ndflags |= CN_NBMOUNTLOOK;
5746 }
5747
5748 /* XXX AUDITVNPATH1 needed ? */
5749 NDINIT(&nd, LOOKUP, OP_LOOKUP, ndflags, UIO_SYSSPACE,
5750 CAST_USER_ADDR_T(path), ctx);
5751
5752 if ((error = namei(&nd))) {
5753 return error;
5754 }
5755 *vpp = nd.ni_vp;
5756 nameidone(&nd);
5757
5758 return 0;
5759 }
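/*
 * Typical vnode_lookup usage (a sketch; the path is arbitrary). The returned
 * vnode carries an iocount that the caller must drop with vnode_put:
 */
#if 0
static int
myfs_probe_path(vfs_context_t ctx)
{
	vnode_t vp = NULLVP;
	int error;

	error = vnode_lookup("/etc/hosts", VNODE_LOOKUP_NOFOLLOW, &vp, ctx);
	if (error == 0) {
		/* ... examine vp, e.g. via vnode_size() or vnode_mtime() ... */
		vnode_put(vp);
	}
	return error;
}
#endif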
5760
5761 errno_t
5762 vnode_open(const char *path, int fmode, int cmode, int flags, vnode_t *vpp, vfs_context_t ctx)
5763 {
5764 struct nameidata nd;
5765 int error;
5766 u_int32_t ndflags = 0;
5767 int lflags = flags;
5768
5769 if (ctx == NULL) { /* XXX technically an error */
5770 ctx = vfs_context_current();
5771 }
5772
5773 if (fmode & O_NOFOLLOW) {
5774 lflags |= VNODE_LOOKUP_NOFOLLOW;
5775 }
5776
5777 if (lflags & VNODE_LOOKUP_NOFOLLOW) {
5778 ndflags = NOFOLLOW;
5779 } else {
5780 ndflags = FOLLOW;
5781 }
5782
5783 if (lflags & VNODE_LOOKUP_NOCROSSMOUNT) {
5784 ndflags |= NOCROSSMOUNT;
5785 }
5786
5787 if (lflags & VNODE_LOOKUP_CROSSMOUNTNOWAIT) {
5788 ndflags |= CN_NBMOUNTLOOK;
5789 }
5790
5791 /* XXX AUDITVNPATH1 needed ? */
5792 NDINIT(&nd, LOOKUP, OP_OPEN, ndflags, UIO_SYSSPACE,
5793 CAST_USER_ADDR_T(path), ctx);
5794
5795 if ((error = vn_open(&nd, fmode, cmode))) {
5796 *vpp = NULL;
5797 } else {
5798 *vpp = nd.ni_vp;
5799 }
5800
5801 return error;
5802 }
5803
5804 errno_t
5805 vnode_close(vnode_t vp, int flags, vfs_context_t ctx)
5806 {
5807 int error;
5808
5809 if (ctx == NULL) {
5810 ctx = vfs_context_current();
5811 }
5812
5813 error = vn_close(vp, flags, ctx);
5814 vnode_put(vp);
5815 return error;
5816 }
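/*
 * Matching vnode_open/vnode_close sketch (illustrative; the path and modes
 * are arbitrary). vnode_close both closes the file and drops the iocount
 * that vnode_open returned, so no separate vnode_put is needed:
 */
#if 0
static int
myfs_touch_file(vfs_context_t ctx)
{
	vnode_t vp = NULLVP;
	int error;

	error = vnode_open("/var/tmp/myfs.state", O_CREAT | FWRITE, 0600, 0, &vp, ctx);
	if (error == 0) {
		/* ... write via vn_rdwr() etc. ... */
		error = vnode_close(vp, FWRITE, ctx);
	}
	return error;
}
#endif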
5817
5818 errno_t
5819 vnode_mtime(vnode_t vp, struct timespec *mtime, vfs_context_t ctx)
5820 {
5821 struct vnode_attr va;
5822 int error;
5823
5824 VATTR_INIT(&va);
5825 VATTR_WANTED(&va, va_modify_time);
5826 error = vnode_getattr(vp, &va, ctx);
5827 if (!error) {
5828 *mtime = va.va_modify_time;
5829 }
5830 return error;
5831 }
5832
5833 errno_t
5834 vnode_flags(vnode_t vp, uint32_t *flags, vfs_context_t ctx)
5835 {
5836 struct vnode_attr va;
5837 int error;
5838
5839 VATTR_INIT(&va);
5840 VATTR_WANTED(&va, va_flags);
5841 error = vnode_getattr(vp, &va, ctx);
5842 if (!error) {
5843 *flags = va.va_flags;
5844 }
5845 return error;
5846 }
5847
5848 /*
5849 * Returns: 0 Success
5850 * vnode_getattr:???
5851 */
5852 errno_t
5853 vnode_size(vnode_t vp, off_t *sizep, vfs_context_t ctx)
5854 {
5855 struct vnode_attr va;
5856 int error;
5857
5858 VATTR_INIT(&va);
5859 VATTR_WANTED(&va, va_data_size);
5860 error = vnode_getattr(vp, &va, ctx);
5861 if (!error) {
5862 *sizep = va.va_data_size;
5863 }
5864 return error;
5865 }
5866
5867 errno_t
5868 vnode_setsize(vnode_t vp, off_t size, int ioflag, vfs_context_t ctx)
5869 {
5870 struct vnode_attr va;
5871
5872 VATTR_INIT(&va);
5873 VATTR_SET(&va, va_data_size, size);
5874 va.va_vaflags = ioflag & 0xffff;
5875 return vnode_setattr(vp, &va, ctx);
5876 }
5877
5878 int
5879 vnode_setdirty(vnode_t vp)
5880 {
5881 vnode_lock_spin(vp);
5882 vp->v_flag |= VISDIRTY;
5883 vnode_unlock(vp);
5884 return 0;
5885 }
5886
5887 int
5888 vnode_cleardirty(vnode_t vp)
5889 {
5890 vnode_lock_spin(vp);
5891 vp->v_flag &= ~VISDIRTY;
5892 vnode_unlock(vp);
5893 return 0;
5894 }
5895
5896 int
5897 vnode_isdirty(vnode_t vp)
5898 {
5899 int dirty;
5900
5901 vnode_lock_spin(vp);
5902 dirty = (vp->v_flag & VISDIRTY) ? 1 : 0;
5903 vnode_unlock(vp);
5904
5905 return dirty;
5906 }
5907
5908 static int
5909 vn_create_reg(vnode_t dvp, vnode_t *vpp, struct nameidata *ndp, struct vnode_attr *vap, uint32_t flags, int fmode, uint32_t *statusp, vfs_context_t ctx)
5910 {
5911 /* Only use compound VNOP for compound operation */
5912 if (vnode_compound_open_available(dvp) && ((flags & VN_CREATE_DOOPEN) != 0)) {
5913 *vpp = NULLVP;
5914 return VNOP_COMPOUND_OPEN(dvp, vpp, ndp, O_CREAT, fmode, statusp, vap, ctx);
5915 } else {
5916 return VNOP_CREATE(dvp, vpp, &ndp->ni_cnd, vap, ctx);
5917 }
5918 }
5919
5920 /*
5921 * Create a filesystem object of arbitrary type with arbitrary attributes in
5922 * the specified directory with the specified name.
5923 *
5924 * Parameters: dvp Pointer to the vnode of the directory
5925 * in which to create the object.
5926 * vpp Pointer to the area into which to
5927 * return the vnode of the created object.
5928 * cnp Component name pointer from the namei
5929 * data structure, containing the name to
5930 * use for the object being created.
5931 * vap Pointer to the vnode_attr structure
5932 * describing the object to be created,
5933 * including the type of object.
5934 * flags VN_* flags controlling ACL inheritance
5935 * and whether or not authorization is to
5936 * be required for the operation.
5937 *
5938 * Returns: 0 Success
5939 * !0 errno value
5940 *
5941 * Implicit: *vpp Contains the vnode of the object that
5942 * was created, if successful.
5943 * *cnp May be modified by the underlying VFS.
5944 * *vap May be modified by the underlying VFS,
5945 * e.g. by ACL inheritance, and may
5946 * be modified even if the operation is
5947 * unsuccessful.
5948 *
5949 *
5950 *
5951 * Notes: The kauth_filesec_t in 'vap', if any, is in host byte order.
5952 *
5953 * Modification of '*cnp' and '*vap' by the underlying VFS is
5954 * strongly discouraged.
5955 *
5956 * XXX: This function is a 'vn_*' function; it belongs in vfs_vnops.c
5957 *
5958 * XXX: We should enumerate the possible errno values here, and where
5959 * in the code they originated.
5960 */
5961 errno_t
5962 vn_create(vnode_t dvp, vnode_t *vpp, struct nameidata *ndp, struct vnode_attr *vap, uint32_t flags, int fmode, uint32_t *statusp, vfs_context_t ctx)
5963 {
5964 errno_t error, old_error;
5965 vnode_t vp = (vnode_t)0;
5966 boolean_t batched;
5967 struct componentname *cnp;
5968 uint32_t defaulted;
5969
5970 cnp = &ndp->ni_cnd;
5971 error = 0;
5972 batched = namei_compound_available(dvp, ndp) ? TRUE : FALSE;
5973
5974 KAUTH_DEBUG("%p CREATE - '%s'", dvp, cnp->cn_nameptr);
5975
5976 if (flags & VN_CREATE_NOINHERIT) {
5977 vap->va_vaflags |= VA_NOINHERIT;
5978 }
5979 if (flags & VN_CREATE_NOAUTH) {
5980 vap->va_vaflags |= VA_NOAUTH;
5981 }
5982 /*
5983 * Handle ACL inheritance, initialize vap.
5984 */
5985 error = vn_attribute_prepare(dvp, vap, &defaulted, ctx);
5986 if (error) {
5987 return error;
5988 }
5989
5990 if (vap->va_type != VREG && (fmode != 0 || (flags & VN_CREATE_DOOPEN) || statusp)) {
5991 panic("Open parameters, but not a regular file.");
5992 }
5993 if ((fmode != 0) && ((flags & VN_CREATE_DOOPEN) == 0)) {
5994 panic("Mode for open, but not trying to open...");
5995 }
5996
5997
5998 /*
5999 * Create the requested node.
6000 */
6001 switch (vap->va_type) {
6002 case VREG:
6003 error = vn_create_reg(dvp, vpp, ndp, vap, flags, fmode, statusp, ctx);
6004 break;
6005 case VDIR:
6006 error = vn_mkdir(dvp, vpp, ndp, vap, ctx);
6007 break;
6008 case VSOCK:
6009 case VFIFO:
6010 case VBLK:
6011 case VCHR:
6012 error = VNOP_MKNOD(dvp, vpp, cnp, vap, ctx);
6013 break;
6014 default:
6015 panic("vnode_create: unknown vtype %d", vap->va_type);
6016 }
6017 if (error != 0) {
6018 KAUTH_DEBUG("%p CREATE - error %d returned by filesystem", dvp, error);
6019 goto out;
6020 }
6021
6022 vp = *vpp;
6023 old_error = error;
6024
6025 #if CONFIG_MACF
6026 if (!(flags & VN_CREATE_NOLABEL)) {
6027 error = vnode_label(vnode_mount(vp), dvp, vp, cnp, VNODE_LABEL_CREATE, ctx);
6028 if (error) {
6029 goto error;
6030 }
6031 }
6032 #endif
6033
6034 /*
6035 * If some of the requested attributes weren't handled by the VNOP,
6036 * use our fallback code.
6037 */
6038 if (!VATTR_ALL_SUPPORTED(vap) && *vpp) {
6039 KAUTH_DEBUG(" CREATE - doing fallback with ACL %p", vap->va_acl);
6040 error = vnode_setattr_fallback(*vpp, vap, ctx);
6041 }
6042 #if CONFIG_MACF
6043 error:
6044 #endif
6045 if ((error != 0) && (vp != (vnode_t)0)) {
6046 /* If we've done a compound open, close */
6047 if (batched && (old_error == 0) && (vap->va_type == VREG)) {
6048 VNOP_CLOSE(vp, fmode, ctx);
6049 }
6050
6051 /* Need to provide notifications if a create succeeded */
6052 if (!batched) {
6053 *vpp = (vnode_t) 0;
6054 vnode_put(vp);
6055 vp = NULLVP;
6056 }
6057 }
6058
6059 /*
6060 * For creation VNOPs, this is the equivalent of
6061 * lookup_handle_found_vnode.
6062 */
6063 if (kdebug_enable && *vpp) {
6064 kdebug_lookup(*vpp, cnp);
6065 }
6066
6067 out:
6068 vn_attribute_cleanup(vap, defaulted);
6069
6070 return error;
6071 }
6072
6073 static kauth_scope_t vnode_scope;
6074 static int vnode_authorize_callback(kauth_cred_t credential, void *idata, kauth_action_t action,
6075 uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3);
6076 static int vnode_authorize_callback_int(kauth_action_t action, vfs_context_t ctx,
6077 vnode_t vp, vnode_t dvp, int *errorp);
6078
6079 typedef struct _vnode_authorize_context {
6080 vnode_t vp;
6081 struct vnode_attr *vap;
6082 vnode_t dvp;
6083 struct vnode_attr *dvap;
6084 vfs_context_t ctx;
6085 int flags;
6086 int flags_valid;
6087 #define _VAC_IS_OWNER (1<<0)
6088 #define _VAC_IN_GROUP (1<<1)
6089 #define _VAC_IS_DIR_OWNER (1<<2)
6090 #define _VAC_IN_DIR_GROUP (1<<3)
6091 #define _VAC_NO_VNODE_POINTERS (1<<4)
6092 } *vauth_ctx;
6093
6094 void
6095 vnode_authorize_init(void)
6096 {
6097 vnode_scope = kauth_register_scope(KAUTH_SCOPE_VNODE, vnode_authorize_callback, NULL);
6098 }
6099
6100 #define VATTR_PREPARE_DEFAULTED_UID 0x1
6101 #define VATTR_PREPARE_DEFAULTED_GID 0x2
6102 #define VATTR_PREPARE_DEFAULTED_MODE 0x4
6103
6104 int
6105 vn_attribute_prepare(vnode_t dvp, struct vnode_attr *vap, uint32_t *defaulted_fieldsp, vfs_context_t ctx)
6106 {
6107 kauth_acl_t nacl = NULL, oacl = NULL;
6108 int error;
6109
6110 /*
6111 * Handle ACL inheritance.
6112 */
6113 if (!(vap->va_vaflags & VA_NOINHERIT) && vfs_extendedsecurity(dvp->v_mount)) {
6114 /* save the original filesec */
6115 if (VATTR_IS_ACTIVE(vap, va_acl)) {
6116 oacl = vap->va_acl;
6117 }
6118
6119 vap->va_acl = NULL;
6120 if ((error = kauth_acl_inherit(dvp,
6121 oacl,
6122 &nacl,
6123 vap->va_type == VDIR,
6124 ctx)) != 0) {
6125 KAUTH_DEBUG("%p CREATE - error %d processing inheritance", dvp, error);
6126 return error;
6127 }
6128
6129 /*
6130 * If the generated ACL is NULL, then we can save ourselves some effort
6131 * by clearing the active bit.
6132 */
6133 if (nacl == NULL) {
6134 VATTR_CLEAR_ACTIVE(vap, va_acl);
6135 } else {
6136 vap->va_base_acl = oacl;
6137 VATTR_SET(vap, va_acl, nacl);
6138 }
6139 }
6140
6141 error = vnode_authattr_new_internal(dvp, vap, (vap->va_vaflags & VA_NOAUTH), defaulted_fieldsp, ctx);
6142 if (error) {
6143 vn_attribute_cleanup(vap, *defaulted_fieldsp);
6144 }
6145
6146 return error;
6147 }
6148
6149 void
6150 vn_attribute_cleanup(struct vnode_attr *vap, uint32_t defaulted_fields)
6151 {
6152 /*
6153 * If the caller supplied a filesec in vap, it has been replaced
6154 * now by the post-inheritance copy. We need to put the original back
6155 * and free the inherited product.
6156 */
6157 kauth_acl_t nacl, oacl;
6158
6159 if (VATTR_IS_ACTIVE(vap, va_acl)) {
6160 nacl = vap->va_acl;
6161 oacl = vap->va_base_acl;
6162
6163 if (oacl) {
6164 VATTR_SET(vap, va_acl, oacl);
6165 vap->va_base_acl = NULL;
6166 } else {
6167 VATTR_CLEAR_ACTIVE(vap, va_acl);
6168 }
6169
6170 if (nacl != NULL) {
6171 kauth_acl_free(nacl);
6172 }
6173 }
6174
6175 if ((defaulted_fields & VATTR_PREPARE_DEFAULTED_MODE) != 0) {
6176 VATTR_CLEAR_ACTIVE(vap, va_mode);
6177 }
6178 if ((defaulted_fields & VATTR_PREPARE_DEFAULTED_GID) != 0) {
6179 VATTR_CLEAR_ACTIVE(vap, va_gid);
6180 }
6181 if ((defaulted_fields & VATTR_PREPARE_DEFAULTED_UID) != 0) {
6182 VATTR_CLEAR_ACTIVE(vap, va_uid);
6183 }
6184
6185 return;
6186 }
6187
6188 int
6189 vn_authorize_unlink(vnode_t dvp, vnode_t vp, struct componentname *cnp, vfs_context_t ctx, __unused void *reserved)
6190 {
6191 #if !CONFIG_MACF
6192 #pragma unused(cnp)
6193 #endif
6194 int error = 0;
6195
6196 /*
6197 * Normally, unlinking of directories is not supported.
6198 * However, some file systems may have limited support.
6199 */
6200 if ((vp->v_type == VDIR) &&
6201 !(vp->v_mount->mnt_kern_flag & MNTK_DIR_HARDLINKS)) {
6202 return EPERM; /* POSIX */
6203 }
6204
6205 /* authorize the delete operation */
6206 #if CONFIG_MACF
6207 if (!error) {
6208 error = mac_vnode_check_unlink(ctx, dvp, vp, cnp);
6209 }
6210 #endif /* MAC */
6211 if (!error) {
6212 error = vnode_authorize(vp, dvp, KAUTH_VNODE_DELETE, ctx);
6213 }
6214
6215 return error;
6216 }
6217
6218 int
6219 vn_authorize_open_existing(vnode_t vp, struct componentname *cnp, int fmode, vfs_context_t ctx, void *reserved)
6220 {
6221 /* Open of existing case */
6222 kauth_action_t action;
6223 int error = 0;
6224 if (cnp->cn_ndp == NULL) {
6225 panic("NULL ndp");
6226 }
6227 if (reserved != NULL) {
6228 panic("reserved not NULL.");
6229 }
6230
6231 #if CONFIG_MACF
6232 /* XXX may do duplicate work here, but ignore that for now (idempotent) */
6233 if (vfs_flags(vnode_mount(vp)) & MNT_MULTILABEL) {
6234 error = vnode_label(vnode_mount(vp), NULL, vp, NULL, 0, ctx);
6235 if (error) {
6236 return error;
6237 }
6238 }
6239 #endif
6240
6241 if ((fmode & O_DIRECTORY) && vp->v_type != VDIR) {
6242 return ENOTDIR;
6243 }
6244
6245 if (vp->v_type == VSOCK && vp->v_tag != VT_FDESC) {
6246 return EOPNOTSUPP; /* Operation not supported on socket */
6247 }
6248
6249 if (vp->v_type == VLNK && (fmode & O_NOFOLLOW) != 0) {
6250 return ELOOP; /* O_NOFOLLOW was specified and the target is a symbolic link */
6251 }
6252
6253 /* disallow write operations on directories */
6254 if (vnode_isdir(vp) && (fmode & (FWRITE | O_TRUNC))) {
6255 return EISDIR;
6256 }
6257
6258 if ((cnp->cn_ndp->ni_flag & NAMEI_TRAILINGSLASH)) {
6259 if (vp->v_type != VDIR) {
6260 return ENOTDIR;
6261 }
6262 }
6263
6264 #if CONFIG_MACF
6265 /* If a file being opened is a shadow file containing
6266 * namedstream data, ignore the MACF checks because it
6267 * is a kernel internal file and access should always
6268 * be allowed.
6269 */
6270 if (!(vnode_isshadow(vp) && vnode_isnamedstream(vp))) {
6271 error = mac_vnode_check_open(ctx, vp, fmode);
6272 if (error) {
6273 return error;
6274 }
6275 }
6276 #endif
6277
6278 /* compute action to be authorized */
6279 action = 0;
6280 if (fmode & FREAD) {
6281 action |= KAUTH_VNODE_READ_DATA;
6282 }
6283 if (fmode & (FWRITE | O_TRUNC)) {
6284 /*
6285 * If we are writing, appending, and not truncating,
6286 * indicate that we are appending so that if the
6287 * UF_APPEND or SF_APPEND bits are set, we do not deny
6288 * the open.
6289 */
6290 if ((fmode & O_APPEND) && !(fmode & O_TRUNC)) {
6291 action |= KAUTH_VNODE_APPEND_DATA;
6292 } else {
6293 action |= KAUTH_VNODE_WRITE_DATA;
6294 }
6295 }
6296 error = vnode_authorize(vp, NULL, action, ctx);
6297 #if NAMEDSTREAMS
6298 if (error == EACCES) {
6299 /*
6300 * Shadow files may exist on-disk with a different UID/GID
6301 * than that of the current context. Verify that this file
6302 * is really a shadow file. If it was created successfully
6303 * then it should be authorized.
6304 */
6305 if (vnode_isshadow(vp) && vnode_isnamedstream(vp)) {
6306 error = vnode_verifynamedstream(vp);
6307 }
6308 }
6309 #endif
6310
6311 return error;
6312 }
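/*
 * Editorial sketch (not part of the original source): the fmode ->
 * kauth_action_t mapping performed above, lifted into a hypothetical
 * standalone helper. Note how O_APPEND without O_TRUNC requests
 * APPEND_DATA rather than WRITE_DATA so that files marked UF_APPEND or
 * SF_APPEND can still be opened for append.
 */
#if 0 /* illustrative only */
static kauth_action_t
example_open_action(int fmode)
{
	kauth_action_t action = 0;

	if (fmode & FREAD) {
		action |= KAUTH_VNODE_READ_DATA;
	}
	if (fmode & (FWRITE | O_TRUNC)) {
		if ((fmode & O_APPEND) && !(fmode & O_TRUNC)) {
			action |= KAUTH_VNODE_APPEND_DATA;
		} else {
			action |= KAUTH_VNODE_WRITE_DATA;
		}
	}
	return action;
}
#endif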
6313
6314 int
6315 vn_authorize_create(vnode_t dvp, struct componentname *cnp, struct vnode_attr *vap, vfs_context_t ctx, void *reserved)
6316 {
6317 #if !CONFIG_MACF
6318 #pragma unused(vap)
6319 #endif
6320 /* Creation case */
6321 int error;
6322
6323 if (cnp->cn_ndp == NULL) {
6324 panic("NULL cn_ndp");
6325 }
6326 if (reserved != NULL) {
6327 panic("reserved not NULL.");
6328 }
6329
6330 /* Only validate path for creation if we didn't do a complete lookup */
6331 if (cnp->cn_ndp->ni_flag & NAMEI_UNFINISHED) {
6332 error = lookup_validate_creation_path(cnp->cn_ndp);
6333 if (error) {
6334 return error;
6335 }
6336 }
6337
6338 #if CONFIG_MACF
6339 error = mac_vnode_check_create(ctx, dvp, cnp, vap);
6340 if (error) {
6341 return error;
6342 }
6343 #endif /* CONFIG_MACF */
6344
6345 return vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx);
6346 }
6347
6348 int
6349 vn_authorize_rename(struct vnode *fdvp, struct vnode *fvp, struct componentname *fcnp,
6350 struct vnode *tdvp, struct vnode *tvp, struct componentname *tcnp,
6351 vfs_context_t ctx, void *reserved)
6352 {
6353 return vn_authorize_renamex(fdvp, fvp, fcnp, tdvp, tvp, tcnp, ctx, 0, reserved);
6354 }
6355
6356 int
6357 vn_authorize_renamex(struct vnode *fdvp, struct vnode *fvp, struct componentname *fcnp,
6358 struct vnode *tdvp, struct vnode *tvp, struct componentname *tcnp,
6359 vfs_context_t ctx, vfs_rename_flags_t flags, void *reserved)
6360 {
6361 return vn_authorize_renamex_with_paths(fdvp, fvp, fcnp, NULL, tdvp, tvp, tcnp, NULL, ctx, flags, reserved);
6362 }
6363
6364 int
6365 vn_authorize_renamex_with_paths(struct vnode *fdvp, struct vnode *fvp, struct componentname *fcnp, const char *from_path,
6366 struct vnode *tdvp, struct vnode *tvp, struct componentname *tcnp, const char *to_path,
6367 vfs_context_t ctx, vfs_rename_flags_t flags, void *reserved)
6368 {
6369 int error = 0;
6370 int moving = 0;
6371 bool swap = flags & VFS_RENAME_SWAP;
6372
6373 if (reserved != NULL) {
6374 panic("Passed something other than NULL as reserved field!");
6375 }
6376
6377 /*
6378 * Avoid renaming "." and "..".
6379 *
6380 * XXX No need to check for this in the FS. We should always have the leaves
6381 * in VFS in this case.
6382 */
6383 if (fvp->v_type == VDIR &&
6384 ((fdvp == fvp) ||
6385 (fcnp->cn_namelen == 1 && fcnp->cn_nameptr[0] == '.') ||
6386 ((fcnp->cn_flags | tcnp->cn_flags) & ISDOTDOT))) {
6387 error = EINVAL;
6388 goto out;
6389 }
6390
6391 if (tvp == NULLVP && vnode_compound_rename_available(tdvp)) {
6392 error = lookup_validate_creation_path(tcnp->cn_ndp);
6393 if (error) {
6394 goto out;
6395 }
6396 }
6397
6398 /***** <MACF> *****/
6399 #if CONFIG_MACF
6400 error = mac_vnode_check_rename(ctx, fdvp, fvp, fcnp, tdvp, tvp, tcnp);
6401 if (error) {
6402 goto out;
6403 }
6404 if (swap) {
6405 error = mac_vnode_check_rename(ctx, tdvp, tvp, tcnp, fdvp, fvp, fcnp);
6406 if (error) {
6407 goto out;
6408 }
6409 }
6410 #endif
6411 /***** </MACF> *****/
6412
6413 /***** <MiscChecks> *****/
6414 if (tvp != NULL) {
6415 if (!swap) {
6416 if (fvp->v_type == VDIR && tvp->v_type != VDIR) {
6417 error = ENOTDIR;
6418 goto out;
6419 } else if (fvp->v_type != VDIR && tvp->v_type == VDIR) {
6420 error = EISDIR;
6421 goto out;
6422 }
6423 }
6424 } else if (swap) {
6425 /*
6426 * Caller should have already checked this and returned
6427 * ENOENT. If we send back ENOENT here, the caller will
6428 * retry, which isn't what we want, so we send back
6429 * EINVAL instead.
6430 */
6431 error = EINVAL;
6432 goto out;
6433 }
6434
6435 if (fvp == tdvp) {
6436 error = EINVAL;
6437 goto out;
6438 }
6439
6440 /*
6441 * The following edge case is caught here:
6442 * (to cannot be a descendant of from)
6443 *
6444 * o fdvp
6445 * /
6446 * /
6447 * o fvp
6448 * \
6449 * \
6450 * o tdvp
6451 * /
6452 * /
6453 * o tvp
6454 */
6455 if (tdvp->v_parent == fvp) {
6456 error = EINVAL;
6457 goto out;
6458 }
6459
6460 if (swap && fdvp->v_parent == tvp) {
6461 error = EINVAL;
6462 goto out;
6463 }
6464 /***** </MiscChecks> *****/
6465
6466 /***** <Kauth> *****/
6467
6468 /*
6469 * As part of the Kauth step, we call out to allow 3rd-party
6470 * fileop notification of "about to rename". This is needed
6471 * in the event that 3rd-parties need to know that the DELETE
6472 * authorization is actually part of a rename. It's important
6473 * that we guarantee that the DELETE call-out will always be
6474 * made if the WILL_RENAME call-out is made. Another fileop
6475 * call-out will be performed once the operation is completed.
6476 * We can ignore the result of kauth_authorize_fileop().
6477 *
6478 * N.B. We are passing the vnode and *both* paths to each
6479 * call; kauth_authorize_fileop() extracts the "from" path
6480 * when posting a KAUTH_FILEOP_WILL_RENAME notification.
6481 * As such, we only post these notifications if all of the
6482 * information we need is provided.
6483 */
6484
6485 if (swap) {
6486 kauth_action_t f = 0, t = 0;
6487
6488 /*
6489 * Directories changing parents need ...ADD_SUBDIR... to
6490 * permit changing ".."
6491 */
6492 if (fdvp != tdvp) {
6493 if (vnode_isdir(fvp)) {
6494 f = KAUTH_VNODE_ADD_SUBDIRECTORY;
6495 }
6496 if (vnode_isdir(tvp)) {
6497 t = KAUTH_VNODE_ADD_SUBDIRECTORY;
6498 }
6499 }
6500 if (to_path != NULL) {
6501 kauth_authorize_fileop(vfs_context_ucred(ctx),
6502 KAUTH_FILEOP_WILL_RENAME,
6503 (uintptr_t)fvp,
6504 (uintptr_t)to_path);
6505 }
6506 error = vnode_authorize(fvp, fdvp, KAUTH_VNODE_DELETE | f, ctx);
6507 if (error) {
6508 goto out;
6509 }
6510 if (from_path != NULL) {
6511 kauth_authorize_fileop(vfs_context_ucred(ctx),
6512 KAUTH_FILEOP_WILL_RENAME,
6513 (uintptr_t)tvp,
6514 (uintptr_t)from_path);
6515 }
6516 error = vnode_authorize(tvp, tdvp, KAUTH_VNODE_DELETE | t, ctx);
6517 if (error) {
6518 goto out;
6519 }
6520 f = vnode_isdir(fvp) ? KAUTH_VNODE_ADD_SUBDIRECTORY : KAUTH_VNODE_ADD_FILE;
6521 t = vnode_isdir(tvp) ? KAUTH_VNODE_ADD_SUBDIRECTORY : KAUTH_VNODE_ADD_FILE;
6522 if (fdvp == tdvp) {
6523 error = vnode_authorize(fdvp, NULL, f | t, ctx);
6524 } else {
6525 error = vnode_authorize(fdvp, NULL, t, ctx);
6526 if (error) {
6527 goto out;
6528 }
6529 error = vnode_authorize(tdvp, NULL, f, ctx);
6530 }
6531 if (error) {
6532 goto out;
6533 }
6534 } else {
6535 error = 0;
6536 if ((tvp != NULL) && vnode_isdir(tvp)) {
6537 if (tvp != fdvp) {
6538 moving = 1;
6539 }
6540 } else if (tdvp != fdvp) {
6541 moving = 1;
6542 }
6543
6544 /*
6545 * must have delete rights to remove the old name even in
6546 * the simple case of fdvp == tdvp.
6547 *
6548 * If fvp is a directory, and we are changing its parent,
6549 * then we also need rights to rewrite its ".." entry as well.
6550 */
6551 if (to_path != NULL) {
6552 kauth_authorize_fileop(vfs_context_ucred(ctx),
6553 KAUTH_FILEOP_WILL_RENAME,
6554 (uintptr_t)fvp,
6555 (uintptr_t)to_path);
6556 }
6557 if (vnode_isdir(fvp)) {
6558 if ((error = vnode_authorize(fvp, fdvp, KAUTH_VNODE_DELETE | KAUTH_VNODE_ADD_SUBDIRECTORY, ctx)) != 0) {
6559 goto out;
6560 }
6561 } else {
6562 if ((error = vnode_authorize(fvp, fdvp, KAUTH_VNODE_DELETE, ctx)) != 0) {
6563 goto out;
6564 }
6565 }
6566 if (moving) {
6567 /* moving into tdvp or tvp, must have rights to add */
6568 if ((error = vnode_authorize(((tvp != NULL) && vnode_isdir(tvp)) ? tvp : tdvp,
6569 NULL,
6570 vnode_isdir(fvp) ? KAUTH_VNODE_ADD_SUBDIRECTORY : KAUTH_VNODE_ADD_FILE,
6571 ctx)) != 0) {
6572 goto out;
6573 }
6574 } else {
6575 /* node staying in same directory, must be allowed to add new name */
6576 if ((error = vnode_authorize(fdvp, NULL,
6577 vnode_isdir(fvp) ? KAUTH_VNODE_ADD_SUBDIRECTORY : KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
6578 goto out;
6579 }
6580 }
6581 /* overwriting tvp */
6582 if ((tvp != NULL) && !vnode_isdir(tvp) &&
6583 ((error = vnode_authorize(tvp, tdvp, KAUTH_VNODE_DELETE, ctx)) != 0)) {
6584 goto out;
6585 }
6586 }
6587
6588 /***** </Kauth> *****/
6589
6590 /* XXX more checks? */
6591 out:
6592 return error;
6593 }
6594
6595 int
6596 vn_authorize_mkdir(vnode_t dvp, struct componentname *cnp, struct vnode_attr *vap, vfs_context_t ctx, void *reserved)
6597 {
6598 #if !CONFIG_MACF
6599 #pragma unused(vap)
6600 #endif
6601 int error;
6602
6603 if (reserved != NULL) {
6604 panic("reserved not NULL in vn_authorize_mkdir()");
6605 }
6606
6607 /* XXX A hack for now, to make shadow files work */
6608 if (cnp->cn_ndp == NULL) {
6609 return 0;
6610 }
6611
6612 if (vnode_compound_mkdir_available(dvp)) {
6613 error = lookup_validate_creation_path(cnp->cn_ndp);
6614 if (error) {
6615 goto out;
6616 }
6617 }
6618
6619 #if CONFIG_MACF
6620 error = mac_vnode_check_create(ctx,
6621 dvp, cnp, vap);
6622 if (error) {
6623 goto out;
6624 }
6625 #endif
6626
6627 /* authorize addition of a directory to the parent */
6628 if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_SUBDIRECTORY, ctx)) != 0) {
6629 goto out;
6630 }
6631
6632 out:
6633 return error;
6634 }
6635
6636 int
6637 vn_authorize_rmdir(vnode_t dvp, vnode_t vp, struct componentname *cnp, vfs_context_t ctx, void *reserved)
6638 {
6639 #if CONFIG_MACF
6640 int error;
6641 #else
6642 #pragma unused(cnp)
6643 #endif
6644 if (reserved != NULL) {
6645 panic("Non-NULL reserved argument to vn_authorize_rmdir()");
6646 }
6647
6648 if (vp->v_type != VDIR) {
6649 /*
6650 * rmdir only deals with directories
6651 */
6652 return ENOTDIR;
6653 }
6654
6655 if (dvp == vp) {
6656 /*
6657 * No rmdir "." please.
6658 */
6659 return EINVAL;
6660 }
6661
6662 #if CONFIG_MACF
6663 error = mac_vnode_check_unlink(ctx, dvp,
6664 vp, cnp);
6665 if (error) {
6666 return error;
6667 }
6668 #endif
6669
6670 return vnode_authorize(vp, dvp, KAUTH_VNODE_DELETE, ctx);
6671 }
6672
6673 /*
6674 * Authorizer for directory cloning. This does not use vnodes but instead
6675 * uses prefilled vnode attributes from the filesystem.
6676 *
6677 * The same function is called to set up the attributes required, perform the
6678 * authorization and cleanup (if required)
6679 */
6680 int
6681 vnode_attr_authorize_dir_clone(struct vnode_attr *vap, kauth_action_t action,
6682 struct vnode_attr *dvap, __unused vnode_t sdvp, mount_t mp,
6683 dir_clone_authorizer_op_t vattr_op, uint32_t flags, vfs_context_t ctx,
6684 __unused void *reserved)
6685 {
6686 int error;
6687 int is_suser = vfs_context_issuser(ctx);
6688
6689 if (vattr_op == OP_VATTR_SETUP) {
6690 VATTR_INIT(vap);
6691
6692 /*
6693 * When ACL inheritance is implemented, both vap->va_acl and
6694 * dvap->va_acl will be required (even as superuser).
6695 */
6696 VATTR_WANTED(vap, va_type);
6697 VATTR_WANTED(vap, va_mode);
6698 VATTR_WANTED(vap, va_flags);
6699 VATTR_WANTED(vap, va_uid);
6700 VATTR_WANTED(vap, va_gid);
6701 if (dvap) {
6702 VATTR_INIT(dvap);
6703 VATTR_WANTED(dvap, va_flags);
6704 }
6705
6706 if (!is_suser) {
6707 /*
6708 * If not superuser, we have to evaluate ACLs and
6709 * need the target directory gid to set the initial
6710 * gid of the new object.
6711 */
6712 VATTR_WANTED(vap, va_acl);
6713 if (dvap) {
6714 VATTR_WANTED(dvap, va_gid);
6715 }
6716 } else if (dvap && (flags & VNODE_CLONEFILE_NOOWNERCOPY)) {
6717 VATTR_WANTED(dvap, va_gid);
6718 }
6719 return 0;
6720 } else if (vattr_op == OP_VATTR_CLEANUP) {
6721 return 0; /* Nothing to do for now */
6722 }
6723
6724 /* dvap isn't used for authorization */
6725 error = vnode_attr_authorize(vap, NULL, mp, action, ctx);
6726
6727 if (error) {
6728 return error;
6729 }
6730
6731 /*
6732 * vn_attribute_prepare should be able to accept attributes as well as
6733 * vnodes but for now we do this inline.
6734 */
6735 if (!is_suser || (flags & VNODE_CLONEFILE_NOOWNERCOPY)) {
6736 /*
6737 * If the filesystem is mounted IGNORE_OWNERSHIP and an explicit
6738 * owner is set, that owner takes ownership of all new files.
6739 */
6740 if ((mp->mnt_flag & MNT_IGNORE_OWNERSHIP) &&
6741 (mp->mnt_fsowner != KAUTH_UID_NONE)) {
6742 VATTR_SET(vap, va_uid, mp->mnt_fsowner);
6743 } else {
6744 /* default owner is current user */
6745 VATTR_SET(vap, va_uid,
6746 kauth_cred_getuid(vfs_context_ucred(ctx)));
6747 }
6748
6749 if ((mp->mnt_flag & MNT_IGNORE_OWNERSHIP) &&
6750 (mp->mnt_fsgroup != KAUTH_GID_NONE)) {
6751 VATTR_SET(vap, va_gid, mp->mnt_fsgroup);
6752 } else {
6753 /*
6754 * default group comes from parent object,
6755 * fallback to current user
6756 */
6757 if (VATTR_IS_SUPPORTED(dvap, va_gid)) {
6758 VATTR_SET(vap, va_gid, dvap->va_gid);
6759 } else {
6760 VATTR_SET(vap, va_gid,
6761 kauth_cred_getgid(vfs_context_ucred(ctx)));
6762 }
6763 }
6764 }
6765
6766 /* Inherit SF_RESTRICTED bit from destination directory only */
6767 if (VATTR_IS_ACTIVE(vap, va_flags)) {
6768 VATTR_SET(vap, va_flags,
6769 ((vap->va_flags & ~(UF_DATAVAULT | SF_RESTRICTED)))); /* Turn off from source */
6770 if (VATTR_IS_ACTIVE(dvap, va_flags)) {
6771 VATTR_SET(vap, va_flags,
6772 vap->va_flags | (dvap->va_flags & (UF_DATAVAULT | SF_RESTRICTED)));
6773 }
6774 } else if (VATTR_IS_ACTIVE(dvap, va_flags)) {
6775 VATTR_SET(vap, va_flags, (dvap->va_flags & (UF_DATAVAULT | SF_RESTRICTED)));
6776 }
6777
6778 return 0;
6779 }
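/*
 * Editorial sketch (not part of the original source): the three-phase
 * calling convention described in the block comment above. The caller,
 * the attribute-filling step, and the OP_AUTHORIZE constant name are
 * assumptions; per the code above, any op other than OP_VATTR_SETUP and
 * OP_VATTR_CLEANUP performs the authorization.
 */
#if 0 /* illustrative only */
static int
example_clone_authorize(vnode_t sdvp, mount_t mp, vfs_context_t ctx)
{
	struct vnode_attr va, dva;
	int error;

	/* phase 1: ask which attributes the authorizer will need */
	error = vnode_attr_authorize_dir_clone(&va, KAUTH_VNODE_ADD_FILE,
	    &dva, sdvp, mp, OP_VATTR_SETUP, 0, ctx, NULL);
	if (error) {
		return error;
	}

	/* ... the filesystem fills in the VATTR_WANTED fields of va/dva ... */

	/* phase 2: authorize and fix up ownership and restricted flags */
	error = vnode_attr_authorize_dir_clone(&va, KAUTH_VNODE_ADD_FILE,
	    &dva, sdvp, mp, OP_AUTHORIZE, 0, ctx, NULL);

	/* phase 3: cleanup (currently a no-op, see above) */
	(void)vnode_attr_authorize_dir_clone(&va, KAUTH_VNODE_ADD_FILE,
	    &dva, sdvp, mp, OP_VATTR_CLEANUP, 0, ctx, NULL);
	return error;
}
#endif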
6780
6781
6782 /*
6783 * Authorize an operation on a vnode.
6784 *
6785 * This is KPI, but here because it needs vnode_scope.
6786 *
6787 * Returns: 0 Success
6788 * kauth_authorize_action:EPERM ...
6789 * xlate => EACCES Permission denied
6790 * kauth_authorize_action:0 Success
6791 * kauth_authorize_action: Depends on callback return; this is
6792 * usually only vnode_authorize_callback(),
6793 * but may include other listeners, if any
6794 * exist.
6795 * EROFS
6796 * EACCES
6797 * EPERM
6798 * ???
6799 */
6800 int
6801 vnode_authorize(vnode_t vp, vnode_t dvp, kauth_action_t action, vfs_context_t ctx)
6802 {
6803 int error, result;
6804
6805 /*
6806 * We can't authorize against a dead vnode; allow all operations through so that
6807 * the correct error can be returned.
6808 */
6809 if (vp->v_type == VBAD) {
6810 return 0;
6811 }
6812
6813 error = 0;
6814 result = kauth_authorize_action(vnode_scope, vfs_context_ucred(ctx), action,
6815 (uintptr_t)ctx, (uintptr_t)vp, (uintptr_t)dvp, (uintptr_t)&error);
6816 if (result == EPERM) { /* traditional behaviour */
6817 result = EACCES;
6818 }
6819 /* did the lower layers give a better error return? */
6820 if ((result != 0) && (error != 0)) {
6821 return error;
6822 }
6823 return result;
6824 }
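/*
 * Editorial usage sketch (not part of the original source) for the KPI
 * above: ask whether the caller may append to vp. The wrapper is
 * hypothetical; note that listener EPERM results come back as EACCES.
 */
#if 0 /* illustrative only */
static int
example_may_append(vnode_t vp, vfs_context_t ctx)
{
	return vnode_authorize(vp, NULLVP, KAUTH_VNODE_APPEND_DATA, ctx);
}
#endif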
6825
6826 /*
6827 * Test for vnode immutability.
6828 *
6829 * The 'append' flag is set when the authorization request is constrained
6830 * to operations which only request the right to append to a file.
6831 *
6832 * The 'ignore' flag is set when an operation modifying the immutability flags
6833 * is being authorized. We check the system securelevel to determine which
6834 * immutability flags we can ignore.
6835 */
6836 static int
6837 vnode_immutable(struct vnode_attr *vap, int append, int ignore)
6838 {
6839 int mask;
6840
6841 /* start with all bits precluding the operation */
6842 mask = IMMUTABLE | APPEND;
6843
6844 /* if appending only, remove the append-only bits */
6845 if (append) {
6846 mask &= ~APPEND;
6847 }
6848
6849 /* ignore only set when authorizing flags changes */
6850 if (ignore) {
6851 if (securelevel <= 0) {
6852 /* in insecure state, flags do not inhibit changes */
6853 mask = 0;
6854 } else {
6855 /* in secure state, user flags don't inhibit */
6856 mask &= ~(UF_IMMUTABLE | UF_APPEND);
6857 }
6858 }
6859 KAUTH_DEBUG("IMMUTABLE - file flags 0x%x mask 0x%x append = %d ignore = %d", vap->va_flags, mask, append, ignore);
6860 if ((vap->va_flags & mask) != 0) {
6861 return EPERM;
6862 }
6863 return 0;
6864 }
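/*
 * Editorial worked example (not part of the original source) of the mask
 * logic above, with IMMUTABLE and APPEND each covering both their SF_ and
 * UF_ variants:
 *
 *   append=0, ignore=0  ->  mask = IMMUTABLE | APPEND   (any bit => EPERM)
 *   append=1, ignore=0  ->  mask = IMMUTABLE            (append-only OK)
 *   ignore=1, securelevel <= 0  ->  mask = 0            (changes allowed)
 *   ignore=1, securelevel  > 0  ->  only SF_IMMUTABLE/SF_APPEND inhibit
 */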
6865
6866 static int
6867 vauth_node_owner(struct vnode_attr *vap, kauth_cred_t cred)
6868 {
6869 int result;
6870
6871 /* default assumption is not-owner */
6872 result = 0;
6873
6874 /*
6875 * If the filesystem has given us a UID, we treat this as authoritative.
6876 */
6877 if (vap && VATTR_IS_SUPPORTED(vap, va_uid)) {
6878 result = (vap->va_uid == kauth_cred_getuid(cred)) ? 1 : 0;
6879 }
6880 /* we could test the owner UUID here if we had a policy for it */
6881
6882 return result;
6883 }
6884
6885 /*
6886 * vauth_node_group
6887 *
6888 * Description: Ask if a cred is a member of the group owning the vnode object
6889 *
6890 * Parameters: vap vnode attribute
6891 * vap->va_gid group owner of vnode object
6892 * cred credential to check
6893 * ismember pointer to where to put the answer
6894 * idontknow Return this if we can't get an answer
6895 *
6896 * Returns: 0 Success
6897 * idontknow Can't get information
6898 * kauth_cred_ismember_gid:? Error from kauth subsystem
6899 * kauth_cred_ismember_gid:? Error from kauth subsystem
6900 */
6901 static int
6902 vauth_node_group(struct vnode_attr *vap, kauth_cred_t cred, int *ismember, int idontknow)
6903 {
6904 int error;
6905 int result;
6906
6907 error = 0;
6908 result = 0;
6909
6910 /*
6911 * The caller is expected to have asked the filesystem for a group
6912 * at some point prior to calling this function. The answer may
6913 * have been that there is no group ownership supported for the
6914 * vnode object, in which case we simply report non-membership.
6915 */
6916 if (vap && VATTR_IS_SUPPORTED(vap, va_gid)) {
6917 error = kauth_cred_ismember_gid(cred, vap->va_gid, &result);
6918 /*
6919 * Credentials which are opted into external group membership
6920 * resolution which are not known to the external resolver
6921 * will result in an ENOENT error. We translate this into
6922 * the appropriate 'idontknow' response for our caller.
6923 *
6924 * XXX We do not make a distinction here between an ENOENT
6925 * XXX arising from a response from the external resolver,
6926 * XXX and an ENOENT which is internally generated. This is
6927 * XXX a deficiency of the published kauth_cred_ismember_gid()
6928 * XXX KPI which can not be overcome without new KPI. For
6929 * XXX all currently known cases, however, this will result
6930 * XXX in correct behaviour.
6931 */
6932 if (error == ENOENT) {
6933 error = idontknow;
6934 }
6935 }
6936 /*
6937 * XXX We could test the group UUID here if we had a policy for it,
6938 * XXX but this is problematic from the perspective of synchronizing
6939 * XXX group UUID and POSIX GID ownership of a file and keeping the
6940 * XXX values coherent over time. The problem is that the local
6941 * XXX system will vend transient group UUIDs for unknown POSIX GID
6942 * XXX values, and these are not persistent, whereas storage of values
6943 * XXX is persistent. One potential solution to this is a local
6944 * XXX (persistent) replica of remote directory entries and vended
6945 * XXX local ids in a local directory server (think in terms of a
6946 * XXX caching DNS server).
6947 */
6948
6949 if (!error) {
6950 *ismember = result;
6951 }
6952 return error;
6953 }
6954
6955 static int
6956 vauth_file_owner(vauth_ctx vcp)
6957 {
6958 int result;
6959
6960 if (vcp->flags_valid & _VAC_IS_OWNER) {
6961 result = (vcp->flags & _VAC_IS_OWNER) ? 1 : 0;
6962 } else {
6963 result = vauth_node_owner(vcp->vap, vcp->ctx->vc_ucred);
6964
6965 /* cache our result */
6966 vcp->flags_valid |= _VAC_IS_OWNER;
6967 if (result) {
6968 vcp->flags |= _VAC_IS_OWNER;
6969 } else {
6970 vcp->flags &= ~_VAC_IS_OWNER;
6971 }
6972 }
6973 return result;
6974 }
6975
6976
6977 /*
6978 * vauth_file_ingroup
6979 *
6980 * Description: Ask if a user is a member of the group owning the file
6981 *
6982 * Parameters: vcp The vnode authorization context that
6983 * contains the user and file info
6984 * vcp->flags_valid Valid flags
6985 * vcp->flags Flags values
6986 * vcp->vap File vnode attributes
6987 * vcp->ctx VFS Context (for user)
6988 * ismember pointer to where to put the answer
6989 * idontknow Return this if we can't get an answer
6990 *
6991 * Returns: 0 Success
6992 * vauth_node_group:? Error from vauth_node_group()
6993 *
6994 * Implicit returns: *ismember 0 The user is not a group member
6995 * 1 The user is a group member
6996 */
6997 static int
6998 vauth_file_ingroup(vauth_ctx vcp, int *ismember, int idontknow)
6999 {
7000 int error;
7001
7002 /* Check for a cached answer first, to avoid the check if possible */
7003 if (vcp->flags_valid & _VAC_IN_GROUP) {
7004 *ismember = (vcp->flags & _VAC_IN_GROUP) ? 1 : 0;
7005 error = 0;
7006 } else {
7007 /* Otherwise, go look for it */
7008 error = vauth_node_group(vcp->vap, vcp->ctx->vc_ucred, ismember, idontknow);
7009
7010 if (!error) {
7011 /* cache our result */
7012 vcp->flags_valid |= _VAC_IN_GROUP;
7013 if (*ismember) {
7014 vcp->flags |= _VAC_IN_GROUP;
7015 } else {
7016 vcp->flags &= ~_VAC_IN_GROUP;
7017 }
7018 }
7019 }
7020 return error;
7021 }
7022
7023 static int
7024 vauth_dir_owner(vauth_ctx vcp)
7025 {
7026 int result;
7027
7028 if (vcp->flags_valid & _VAC_IS_DIR_OWNER) {
7029 result = (vcp->flags & _VAC_IS_DIR_OWNER) ? 1 : 0;
7030 } else {
7031 result = vauth_node_owner(vcp->dvap, vcp->ctx->vc_ucred);
7032
7033 /* cache our result */
7034 vcp->flags_valid |= _VAC_IS_DIR_OWNER;
7035 if (result) {
7036 vcp->flags |= _VAC_IS_DIR_OWNER;
7037 } else {
7038 vcp->flags &= ~_VAC_IS_DIR_OWNER;
7039 }
7040 }
7041 return result;
7042 }
7043
7044 /*
7045 * vauth_dir_ingroup
7046 *
7047 * Description: Ask if a user is a member of the group owning the directory
7048 *
7049 * Parameters: vcp The vnode authorization context that
7050 * contains the user and directory info
7051 * vcp->flags_valid Valid flags
7052 * vcp->flags Flags values
7053 * vcp->dvap Dir vnode attributes
7054 * vcp->ctx VFS Context (for user)
7055 * ismember pointer to where to put the answer
7056 * idontknow Return this if we can't get an answer
7057 *
7058 * Returns: 0 Success
7059 * vauth_node_group:? Error from vauth_node_group()
7060 *
7061 * Implicit returns: *ismember 0 The user is not a group member
7062 * 1 The user is a group member
7063 */
7064 static int
7065 vauth_dir_ingroup(vauth_ctx vcp, int *ismember, int idontknow)
7066 {
7067 int error;
7068
7069 /* Check for a cached answer first, to avoid the check if possible */
7070 if (vcp->flags_valid & _VAC_IN_DIR_GROUP) {
7071 *ismember = (vcp->flags & _VAC_IN_DIR_GROUP) ? 1 : 0;
7072 error = 0;
7073 } else {
7074 /* Otherwise, go look for it */
7075 error = vauth_node_group(vcp->dvap, vcp->ctx->vc_ucred, ismember, idontknow);
7076
7077 if (!error) {
7078 /* cache our result */
7079 vcp->flags_valid |= _VAC_IN_DIR_GROUP;
7080 if (*ismember) {
7081 vcp->flags |= _VAC_IN_DIR_GROUP;
7082 } else {
7083 vcp->flags &= ~_VAC_IN_DIR_GROUP;
7084 }
7085 }
7086 }
7087 return error;
7088 }
7089
7090 /*
7091 * Test the posix permissions in (vap) to determine whether (credential)
7092 * may perform (action)
7093 */
7094 static int
7095 vnode_authorize_posix(vauth_ctx vcp, int action, int on_dir)
7096 {
7097 struct vnode_attr *vap;
7098 int needed, error, owner_ok, group_ok, world_ok, ismember;
7099 #ifdef KAUTH_DEBUG_ENABLE
7100 const char *where = "uninitialized";
7101 # define _SETWHERE(c) where = c;
7102 #else
7103 # define _SETWHERE(c)
7104 #endif
7105
7106 /* checking file or directory? */
7107 if (on_dir) {
7108 vap = vcp->dvap;
7109 } else {
7110 vap = vcp->vap;
7111 }
7112
7113 error = 0;
7114
7115 /*
7116 * We want to do as little work here as possible. So first we check
7117 * which sets of permissions grant us the access we need, and avoid checking
7118 * whether specific permissions grant access when more generic ones would.
7119 */
7120
7121 /* owner permissions */
7122 needed = 0;
7123 if (action & VREAD) {
7124 needed |= S_IRUSR;
7125 }
7126 if (action & VWRITE) {
7127 needed |= S_IWUSR;
7128 }
7129 if (action & VEXEC) {
7130 needed |= S_IXUSR;
7131 }
7132 owner_ok = (needed & vap->va_mode) == needed;
7133
7134 /* group permissions */
7135 needed = 0;
7136 if (action & VREAD) {
7137 needed |= S_IRGRP;
7138 }
7139 if (action & VWRITE) {
7140 needed |= S_IWGRP;
7141 }
7142 if (action & VEXEC) {
7143 needed |= S_IXGRP;
7144 }
7145 group_ok = (needed & vap->va_mode) == needed;
7146
7147 /* world permissions */
7148 needed = 0;
7149 if (action & VREAD) {
7150 needed |= S_IROTH;
7151 }
7152 if (action & VWRITE) {
7153 needed |= S_IWOTH;
7154 }
7155 if (action & VEXEC) {
7156 needed |= S_IXOTH;
7157 }
7158 world_ok = (needed & vap->va_mode) == needed;
7159
7160 /* If granted/denied by all three, we're done */
7161 if (owner_ok && group_ok && world_ok) {
7162 _SETWHERE("all");
7163 goto out;
7164 }
7165 if (!owner_ok && !group_ok && !world_ok) {
7166 _SETWHERE("all");
7167 error = EACCES;
7168 goto out;
7169 }
7170
7171 /* Check ownership (relatively cheap) */
7172 if ((on_dir && vauth_dir_owner(vcp)) ||
7173 (!on_dir && vauth_file_owner(vcp))) {
7174 _SETWHERE("user");
7175 if (!owner_ok) {
7176 error = EACCES;
7177 }
7178 goto out;
7179 }
7180
7181 /* Not owner; if group and world both grant it we're done */
7182 if (group_ok && world_ok) {
7183 _SETWHERE("group/world");
7184 goto out;
7185 }
7186 if (!group_ok && !world_ok) {
7187 _SETWHERE("group/world");
7188 error = EACCES;
7189 goto out;
7190 }
7191
7192 /* Check group membership (most expensive) */
7193 ismember = 0; /* Default to allow, if the target has no group owner */
7194
7195 /*
7196 * In the case we can't get an answer about the user from the call to
7197 * vauth_dir_ingroup() or vauth_file_ingroup(), we want to fail on
7198 * the side of caution, rather than simply granting access, or we will
7199 * fail to correctly implement exclusion groups, so we set the third
7200 * parameter on the basis of the state of 'group_ok'.
7201 */
7202 if (on_dir) {
7203 error = vauth_dir_ingroup(vcp, &ismember, (!group_ok ? EACCES : 0));
7204 } else {
7205 error = vauth_file_ingroup(vcp, &ismember, (!group_ok ? EACCES : 0));
7206 }
7207 if (error) {
7208 if (!group_ok) {
7209 ismember = 1;
7210 }
7211 error = 0;
7212 }
7213 if (ismember) {
7214 _SETWHERE("group");
7215 if (!group_ok) {
7216 error = EACCES;
7217 }
7218 goto out;
7219 }
7220
7221 /* Not owner, not in group, use world result */
7222 _SETWHERE("world");
7223 if (!world_ok) {
7224 error = EACCES;
7225 }
7226
7227 /* FALLTHROUGH */
7228
7229 out:
7230 KAUTH_DEBUG("%p %s - posix %s permissions : need %s%s%s %x have %s%s%s%s%s%s%s%s%s UID = %d file = %d,%d",
7231 vcp->vp, (error == 0) ? "ALLOWED" : "DENIED", where,
7232 (action & VREAD) ? "r" : "-",
7233 (action & VWRITE) ? "w" : "-",
7234 (action & VEXEC) ? "x" : "-",
7235 needed,
7236 (vap->va_mode & S_IRUSR) ? "r" : "-",
7237 (vap->va_mode & S_IWUSR) ? "w" : "-",
7238 (vap->va_mode & S_IXUSR) ? "x" : "-",
7239 (vap->va_mode & S_IRGRP) ? "r" : "-",
7240 (vap->va_mode & S_IWGRP) ? "w" : "-",
7241 (vap->va_mode & S_IXGRP) ? "x" : "-",
7242 (vap->va_mode & S_IROTH) ? "r" : "-",
7243 (vap->va_mode & S_IWOTH) ? "w" : "-",
7244 (vap->va_mode & S_IXOTH) ? "x" : "-",
7245 kauth_cred_getuid(vcp->ctx->vc_ucred),
7246 on_dir ? vcp->dvap->va_uid : vcp->vap->va_uid,
7247 on_dir ? vcp->dvap->va_gid : vcp->vap->va_gid);
7248 return error;
7249 }
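/*
 * Editorial sketch (not part of the original source): the per-class mode
 * test above, extracted as a hypothetical pure helper. Each class passes
 * an (r, w, x) triplet such as S_IRUSR/S_IWUSR/S_IXUSR for the owner.
 */
#if 0 /* illustrative only */
static int
example_class_ok(mode_t va_mode, int action, mode_t r, mode_t w, mode_t x)
{
	mode_t needed = 0;

	if (action & VREAD) {
		needed |= r;
	}
	if (action & VWRITE) {
		needed |= w;
	}
	if (action & VEXEC) {
		needed |= x;
	}
	return (va_mode & needed) == needed;
}
/* e.g. owner_ok = example_class_ok(vap->va_mode, action,
 *                                  S_IRUSR, S_IWUSR, S_IXUSR); */
#endif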
7250
7251 /*
7252 * Authorize the deletion of the node vp from the directory dvp.
7253 *
7254 * We assume that:
7255 * - Neither the node nor the directory are immutable.
7256 * - The user is not the superuser.
7257 *
7258 * The precedence of factors for authorizing or denying delete for a credential
7259 *
7260 * 1) Explicit ACE on the node. (allow or deny DELETE)
7261 * 2) Explicit ACE on the directory (allow or deny DELETE_CHILD).
7262 *
7263 * If there are conflicting ACEs on the node and the directory, the node
7264 * ACE wins.
7265 *
7266 * 3) Sticky bit on the directory.
7267 * Deletion is not permitted if the directory is sticky and the caller is
7268 * not owner of the node or directory. The sticky bit rules are like a deny
7269 * delete ACE, except lower in priority than ACLs either allowing or denying
7270 * delete.
7271 *
7272 * 4) POSIX permissions on the directory.
7273 *
7274 * As an optimization, we cache whether or not delete child is permitted
7275 * on directories. This enables us to skip directory ACL and POSIX checks
7276 * as we already have the result from those checks. However, we always check the
7277 * node ACL and, if the directory has the sticky bit set, we always check its
7278 * ACL (even for a directory with an authorized delete child). Furthermore,
7279 * caching the delete child authorization is independent of the sticky bit
7280 * being set as it is only applicable in determining whether the node can be
7281 * deleted or not.
7282 */
7283 static int
7284 vnode_authorize_delete(vauth_ctx vcp, boolean_t cached_delete_child)
7285 {
7286 struct vnode_attr *vap = vcp->vap;
7287 struct vnode_attr *dvap = vcp->dvap;
7288 kauth_cred_t cred = vcp->ctx->vc_ucred;
7289 struct kauth_acl_eval eval;
7290 int error, ismember;
7291
7292 /* Check the ACL on the node first */
7293 if (VATTR_IS_NOT(vap, va_acl, NULL)) {
7294 eval.ae_requested = KAUTH_VNODE_DELETE;
7295 eval.ae_acl = &vap->va_acl->acl_ace[0];
7296 eval.ae_count = vap->va_acl->acl_entrycount;
7297 eval.ae_options = 0;
7298 if (vauth_file_owner(vcp)) {
7299 eval.ae_options |= KAUTH_AEVAL_IS_OWNER;
7300 }
7301 /*
7302 * We use ENOENT as a marker to indicate we could not get
7303 * information in order to delay evaluation until after we
7304 * have the ACL evaluation answer. Previously, we would
7305 * always deny the operation at this point.
7306 */
7307 if ((error = vauth_file_ingroup(vcp, &ismember, ENOENT)) != 0 && error != ENOENT) {
7308 return error;
7309 }
7310 if (error == ENOENT) {
7311 eval.ae_options |= KAUTH_AEVAL_IN_GROUP_UNKNOWN;
7312 } else if (ismember) {
7313 eval.ae_options |= KAUTH_AEVAL_IN_GROUP;
7314 }
7315 eval.ae_exp_gall = KAUTH_VNODE_GENERIC_ALL_BITS;
7316 eval.ae_exp_gread = KAUTH_VNODE_GENERIC_READ_BITS;
7317 eval.ae_exp_gwrite = KAUTH_VNODE_GENERIC_WRITE_BITS;
7318 eval.ae_exp_gexec = KAUTH_VNODE_GENERIC_EXECUTE_BITS;
7319
7320 if ((error = kauth_acl_evaluate(cred, &eval)) != 0) {
7321 KAUTH_DEBUG("%p ERROR during ACL processing - %d", vcp->vp, error);
7322 return error;
7323 }
7324
7325 switch (eval.ae_result) {
7326 case KAUTH_RESULT_DENY:
7327 KAUTH_DEBUG("%p DENIED - denied by ACL", vcp->vp);
7328 return EACCES;
7329 case KAUTH_RESULT_ALLOW:
7330 KAUTH_DEBUG("%p ALLOWED - granted by ACL", vcp->vp);
7331 return 0;
7332 case KAUTH_RESULT_DEFER:
7333 default:
7334 /* Defer to directory */
7335 KAUTH_DEBUG("%p DEFERRED - by file ACL", vcp->vp);
7336 break;
7337 }
7338 }
7339
7340 /*
7341 * Without a sticky bit, a previously authorized delete child is
7342 * sufficient to authorize this delete.
7343 *
7344 * If the sticky bit is set, a directory ACL which allows delete child
7345 * overrides a (potential) sticky bit deny. The authorized delete child
7346 * cannot tell us if it was authorized because of an explicit delete
7347 * child allow ACE or because of POSIX permissions, so we have to check
7348 * the directory ACL every time if the directory has a sticky bit.
7349 */
7350 if (!(dvap->va_mode & S_ISTXT) && cached_delete_child) {
7351 KAUTH_DEBUG("%p ALLOWED - granted by directory ACL or POSIX permissions and no sticky bit on directory", vcp->vp);
7352 return 0;
7353 }
7354
7355 /* check the ACL on the directory */
7356 if (VATTR_IS_NOT(dvap, va_acl, NULL)) {
7357 eval.ae_requested = KAUTH_VNODE_DELETE_CHILD;
7358 eval.ae_acl = &dvap->va_acl->acl_ace[0];
7359 eval.ae_count = dvap->va_acl->acl_entrycount;
7360 eval.ae_options = 0;
7361 if (vauth_dir_owner(vcp)) {
7362 eval.ae_options |= KAUTH_AEVAL_IS_OWNER;
7363 }
7364 /*
7365 * We use ENOENT as a marker to indicate we could not get
7366 * information in order to delay evaluation until after we
7367 * have the ACL evaluation answer. Previously, we would
7368 * always deny the operation at this point.
7369 */
7370 if ((error = vauth_dir_ingroup(vcp, &ismember, ENOENT)) != 0 && error != ENOENT) {
7371 return error;
7372 }
7373 if (error == ENOENT) {
7374 eval.ae_options |= KAUTH_AEVAL_IN_GROUP_UNKNOWN;
7375 } else if (ismember) {
7376 eval.ae_options |= KAUTH_AEVAL_IN_GROUP;
7377 }
7378 eval.ae_exp_gall = KAUTH_VNODE_GENERIC_ALL_BITS;
7379 eval.ae_exp_gread = KAUTH_VNODE_GENERIC_READ_BITS;
7380 eval.ae_exp_gwrite = KAUTH_VNODE_GENERIC_WRITE_BITS;
7381 eval.ae_exp_gexec = KAUTH_VNODE_GENERIC_EXECUTE_BITS;
7382
7383 /*
7384 * If there is no entry, we are going to defer to other
7385 * authorization mechanisms.
7386 */
7387 error = kauth_acl_evaluate(cred, &eval);
7388
7389 if (error != 0) {
7390 KAUTH_DEBUG("%p ERROR during ACL processing - %d", vcp->vp, error);
7391 return error;
7392 }
7393 switch (eval.ae_result) {
7394 case KAUTH_RESULT_DENY:
7395 KAUTH_DEBUG("%p DENIED - denied by directory ACL", vcp->vp);
7396 return EACCES;
7397 case KAUTH_RESULT_ALLOW:
7398 KAUTH_DEBUG("%p ALLOWED - granted by directory ACL", vcp->vp);
7399 if (!cached_delete_child && vcp->dvp) {
7400 vnode_cache_authorized_action(vcp->dvp,
7401 vcp->ctx, KAUTH_VNODE_DELETE_CHILD);
7402 }
7403 return 0;
7404 case KAUTH_RESULT_DEFER:
7405 default:
7406 /* Deferred by directory ACL */
7407 KAUTH_DEBUG("%p DEFERRED - directory ACL", vcp->vp);
7408 break;
7409 }
7410 }
7411
7412 /*
7413 * From this point, we can't explicitly allow and if we reach the end
7414 * of the function without a denial, then the delete is authorized.
7415 */
7416 if (!cached_delete_child) {
7417 if (vnode_authorize_posix(vcp, VWRITE, 1 /* on_dir */) != 0) {
7418 KAUTH_DEBUG("%p DENIED - denied by posix permissions", vcp->vp);
7419 return EACCES;
7420 }
7421 /*
7422 * Cache the authorized action on the vnode if allowed by the
7423 * directory ACL or POSIX permissions. It is correct to cache
7424 * this action even if sticky bit would deny deleting the node.
7425 */
7426 if (vcp->dvp) {
7427 vnode_cache_authorized_action(vcp->dvp, vcp->ctx,
7428 KAUTH_VNODE_DELETE_CHILD);
7429 }
7430 }
7431
7432 /* enforce sticky bit behaviour */
7433 if ((dvap->va_mode & S_ISTXT) && !vauth_file_owner(vcp) && !vauth_dir_owner(vcp)) {
7434 KAUTH_DEBUG("%p DENIED - sticky bit rules (user %d file %d dir %d)",
7435 vcp->vp, cred->cr_posix.cr_uid, vap->va_uid, dvap->va_uid);
7436 return EACCES;
7437 }
7438
7439 /* not denied, must be OK */
7440 return 0;
7441 }
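/*
 * Editorial summary (not part of the original source) of the precedence
 * implemented above, as uncompiled pseudocode:
 *
 *   node ACL:       DENY -> EACCES,  ALLOW -> 0,  DEFER -> continue
 *   !sticky && cached_delete_child  -> 0
 *   directory ACL:  DENY -> EACCES,  ALLOW -> 0 (and cache delete_child)
 *   !cached: POSIX write on directory fails  -> EACCES (else cache)
 *   sticky && caller owns neither node nor directory  -> EACCES
 *   otherwise  -> 0
 */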
7442
7443
7444 /*
7445 * Authorize an operation based on the node's attributes.
7446 */
7447 static int
7448 vnode_authorize_simple(vauth_ctx vcp, kauth_ace_rights_t acl_rights, kauth_ace_rights_t preauth_rights, boolean_t *found_deny)
7449 {
7450 struct vnode_attr *vap = vcp->vap;
7451 kauth_cred_t cred = vcp->ctx->vc_ucred;
7452 struct kauth_acl_eval eval;
7453 int error, ismember;
7454 mode_t posix_action;
7455
7456 /*
7457 * If we are the file owner, we automatically have some rights.
7458 *
7459 * Do we need to expand this to support group ownership?
7460 */
7461 if (vauth_file_owner(vcp)) {
7462 acl_rights &= ~(KAUTH_VNODE_WRITE_SECURITY);
7463 }
7464
7465 /*
7466 * If we are checking both TAKE_OWNERSHIP and WRITE_SECURITY, we can
7467 * mask the latter. If TAKE_OWNERSHIP is requested the caller is about to
7468 * change ownership to themselves, and WRITE_SECURITY is implicitly
7469 * granted to the owner. We need to do this because at this point
7470 * WRITE_SECURITY may not be granted as the caller is not currently
7471 * the owner.
7472 */
7473 if ((acl_rights & KAUTH_VNODE_TAKE_OWNERSHIP) &&
7474 (acl_rights & KAUTH_VNODE_WRITE_SECURITY)) {
7475 acl_rights &= ~KAUTH_VNODE_WRITE_SECURITY;
7476 }
7477
7478 if (acl_rights == 0) {
7479 KAUTH_DEBUG("%p ALLOWED - implicit or no rights required", vcp->vp);
7480 return 0;
7481 }
7482
7483 /* if we have an ACL, evaluate it */
7484 if (VATTR_IS_NOT(vap, va_acl, NULL)) {
7485 eval.ae_requested = acl_rights;
7486 eval.ae_acl = &vap->va_acl->acl_ace[0];
7487 eval.ae_count = vap->va_acl->acl_entrycount;
7488 eval.ae_options = 0;
7489 if (vauth_file_owner(vcp)) {
7490 eval.ae_options |= KAUTH_AEVAL_IS_OWNER;
7491 }
7492 /*
7493 * We use ENOENT as a marker to indicate we could not get
7494 * information in order to delay evaluation until after we
7495 * have the ACL evaluation answer. Previously, we would
7496 * always deny the operation at this point.
7497 */
7498 if ((error = vauth_file_ingroup(vcp, &ismember, ENOENT)) != 0 && error != ENOENT) {
7499 return error;
7500 }
7501 if (error == ENOENT) {
7502 eval.ae_options |= KAUTH_AEVAL_IN_GROUP_UNKNOWN;
7503 } else if (ismember) {
7504 eval.ae_options |= KAUTH_AEVAL_IN_GROUP;
7505 }
7506 eval.ae_exp_gall = KAUTH_VNODE_GENERIC_ALL_BITS;
7507 eval.ae_exp_gread = KAUTH_VNODE_GENERIC_READ_BITS;
7508 eval.ae_exp_gwrite = KAUTH_VNODE_GENERIC_WRITE_BITS;
7509 eval.ae_exp_gexec = KAUTH_VNODE_GENERIC_EXECUTE_BITS;
7510
7511 if ((error = kauth_acl_evaluate(cred, &eval)) != 0) {
7512 KAUTH_DEBUG("%p ERROR during ACL processing - %d", vcp->vp, error);
7513 return error;
7514 }
7515
7516 switch (eval.ae_result) {
7517 case KAUTH_RESULT_DENY:
7518 KAUTH_DEBUG("%p DENIED - by ACL", vcp->vp);
7519 return EACCES; /* deny, deny, counter-allege */
7520 case KAUTH_RESULT_ALLOW:
7521 KAUTH_DEBUG("%p ALLOWED - all rights granted by ACL", vcp->vp);
7522 return 0;
7523 case KAUTH_RESULT_DEFER:
7524 default:
7525 /* Effectively the same as !delete_child_denied */
7526 KAUTH_DEBUG("%p DEFERRED - by ACL", vcp->vp);
7527 break;
7528 }
7529
7530 *found_deny = eval.ae_found_deny;
7531
7532 /* fall through and evaluate residual rights */
7533 } else {
7534 /* no ACL, everything is residual */
7535 eval.ae_residual = acl_rights;
7536 }
7537
7538 /*
7539 * Grant residual rights that have been pre-authorized.
7540 */
7541 eval.ae_residual &= ~preauth_rights;
7542
7543 /*
7544 * We grant WRITE_ATTRIBUTES to the owner if it hasn't been denied.
7545 */
7546 if (vauth_file_owner(vcp)) {
7547 eval.ae_residual &= ~KAUTH_VNODE_WRITE_ATTRIBUTES;
7548 }
7549
7550 if (eval.ae_residual == 0) {
7551 KAUTH_DEBUG("%p ALLOWED - rights already authorized", vcp->vp);
7552 return 0;
7553 }
7554
7555 /*
7556 * Bail if we have residual rights that can't be granted by posix permissions,
7557 * or aren't presumed granted at this point.
7558 *
7559 * XXX these can be collapsed for performance
7560 */
7561 if (eval.ae_residual & KAUTH_VNODE_CHANGE_OWNER) {
7562 KAUTH_DEBUG("%p DENIED - CHANGE_OWNER not permitted", vcp->vp);
7563 return EACCES;
7564 }
7565 if (eval.ae_residual & KAUTH_VNODE_WRITE_SECURITY) {
7566 KAUTH_DEBUG("%p DENIED - WRITE_SECURITY not permitted", vcp->vp);
7567 return EACCES;
7568 }
7569
7570 #if DIAGNOSTIC
7571 if (eval.ae_residual & KAUTH_VNODE_DELETE) {
7572 panic("vnode_authorize: can't be checking delete permission here");
7573 }
7574 #endif
7575
7576 /*
7577 * Compute the fallback posix permissions that will satisfy the remaining
7578 * rights.
7579 */
7580 posix_action = 0;
7581 if (eval.ae_residual & (KAUTH_VNODE_READ_DATA |
7582 KAUTH_VNODE_LIST_DIRECTORY |
7583 KAUTH_VNODE_READ_EXTATTRIBUTES)) {
7584 posix_action |= VREAD;
7585 }
7586 if (eval.ae_residual & (KAUTH_VNODE_WRITE_DATA |
7587 KAUTH_VNODE_ADD_FILE |
7588 KAUTH_VNODE_ADD_SUBDIRECTORY |
7589 KAUTH_VNODE_DELETE_CHILD |
7590 KAUTH_VNODE_WRITE_ATTRIBUTES |
7591 KAUTH_VNODE_WRITE_EXTATTRIBUTES)) {
7592 posix_action |= VWRITE;
7593 }
7594 if (eval.ae_residual & (KAUTH_VNODE_EXECUTE |
7595 KAUTH_VNODE_SEARCH)) {
7596 posix_action |= VEXEC;
7597 }
7598
7599 if (posix_action != 0) {
7600 return vnode_authorize_posix(vcp, posix_action, 0 /* !on_dir */);
7601 } else {
7602 KAUTH_DEBUG("%p ALLOWED - residual rights %s%s%s%s%s%s%s%s%s%s%s%s%s%s granted due to no posix mapping",
7603 vcp->vp,
7604 (eval.ae_residual & KAUTH_VNODE_READ_DATA)
7605 ? vnode_isdir(vcp->vp) ? " LIST_DIRECTORY" : " READ_DATA" : "",
7606 (eval.ae_residual & KAUTH_VNODE_WRITE_DATA)
7607 ? vnode_isdir(vcp->vp) ? " ADD_FILE" : " WRITE_DATA" : "",
7608 (eval.ae_residual & KAUTH_VNODE_EXECUTE)
7609 ? vnode_isdir(vcp->vp) ? " SEARCH" : " EXECUTE" : "",
7610 (eval.ae_residual & KAUTH_VNODE_DELETE)
7611 ? " DELETE" : "",
7612 (eval.ae_residual & KAUTH_VNODE_APPEND_DATA)
7613 ? vnode_isdir(vcp->vp) ? " ADD_SUBDIRECTORY" : " APPEND_DATA" : "",
7614 (eval.ae_residual & KAUTH_VNODE_DELETE_CHILD)
7615 ? " DELETE_CHILD" : "",
7616 (eval.ae_residual & KAUTH_VNODE_READ_ATTRIBUTES)
7617 ? " READ_ATTRIBUTES" : "",
7618 (eval.ae_residual & KAUTH_VNODE_WRITE_ATTRIBUTES)
7619 ? " WRITE_ATTRIBUTES" : "",
7620 (eval.ae_residual & KAUTH_VNODE_READ_EXTATTRIBUTES)
7621 ? " READ_EXTATTRIBUTES" : "",
7622 (eval.ae_residual & KAUTH_VNODE_WRITE_EXTATTRIBUTES)
7623 ? " WRITE_EXTATTRIBUTES" : "",
7624 (eval.ae_residual & KAUTH_VNODE_READ_SECURITY)
7625 ? " READ_SECURITY" : "",
7626 (eval.ae_residual & KAUTH_VNODE_WRITE_SECURITY)
7627 ? " WRITE_SECURITY" : "",
7628 (eval.ae_residual & KAUTH_VNODE_CHECKIMMUTABLE)
7629 ? " CHECKIMMUTABLE" : "",
7630 (eval.ae_residual & KAUTH_VNODE_CHANGE_OWNER)
7631 ? " CHANGE_OWNER" : "");
7632 }
7633
7634 /*
7635 * Lack of required Posix permissions implies no reason to deny access.
7636 */
7637 return 0;
7638 }
7639
7640 /*
7641 * Check for file immutability.
7642 */
7643 static int
7644 vnode_authorize_checkimmutable(mount_t mp, struct vnode_attr *vap, int rights, int ignore)
7645 {
7646 int error;
7647 int append;
7648
7649 /*
7650 * Perform immutability checks for operations that change data.
7651 *
7652 * Sockets, fifos and devices require special handling.
7653 */
7654 switch (vap->va_type) {
7655 case VSOCK:
7656 case VFIFO:
7657 case VBLK:
7658 case VCHR:
7659 /*
7660 * Writing to these nodes does not change the filesystem data,
7661 * so forget that it's being tried.
7662 */
7663 rights &= ~KAUTH_VNODE_WRITE_DATA;
7664 break;
7665 default:
7666 break;
7667 }
7668
7669 error = 0;
7670 if (rights & KAUTH_VNODE_WRITE_RIGHTS) {
7671 /* check per-filesystem options if possible */
7672 if (mp != NULL) {
7673 /* check for no-EA filesystems */
7674 if ((rights & KAUTH_VNODE_WRITE_EXTATTRIBUTES) &&
7675 (vfs_flags(mp) & MNT_NOUSERXATTR)) {
7676 KAUTH_DEBUG("%p DENIED - filesystem disallowed extended attributes", mp);
7677 error = EACCES; /* User attributes disabled */
7678 goto out;
7679 }
7680 }
7681
7682 /*
7683 * check for file immutability. first, check if the requested rights are
7684 * allowable for a UF_APPEND file.
7685 */
7686 append = 0;
7687 if (vap->va_type == VDIR) {
7688 if ((rights & (KAUTH_VNODE_ADD_FILE | KAUTH_VNODE_ADD_SUBDIRECTORY | KAUTH_VNODE_WRITE_EXTATTRIBUTES)) == rights) {
7689 append = 1;
7690 }
7691 } else {
7692 if ((rights & (KAUTH_VNODE_APPEND_DATA | KAUTH_VNODE_WRITE_EXTATTRIBUTES)) == rights) {
7693 append = 1;
7694 }
7695 }
7696 if ((error = vnode_immutable(vap, append, ignore)) != 0) {
7697 KAUTH_DEBUG("%p DENIED - file is immutable", vap);
7698 goto out;
7699 }
7700 }
7701 out:
7702 return error;
7703 }
7704
7705 /*
7706 * Handle authorization actions for filesystems that advertise that the
7707 * server will be enforcing.
7708 *
7709 * Returns: 0 Authorization should be handled locally
7710 * 1 Authorization was handled by the FS
7711 *
7712 * Note: Imputed returns will only occur if the authorization request
7713 * was handled by the FS.
7714 *
7715 * Imputed: *resultp, modified Return code from FS when the request is
7716 * handled by the FS.
7717 * VNOP_ACCESS:???
7718 * VNOP_OPEN:???
7719 */
7720 static int
7721 vnode_authorize_opaque(vnode_t vp, int *resultp, kauth_action_t action, vfs_context_t ctx)
7722 {
7723 int error;
7724
7725 /*
7726 * If the vp is a device node, socket or FIFO it actually represents a local
7727 * endpoint, so we need to handle it locally.
7728 */
7729 switch (vp->v_type) {
7730 case VBLK:
7731 case VCHR:
7732 case VSOCK:
7733 case VFIFO:
7734 return 0;
7735 default:
7736 break;
7737 }
7738
7739 /*
7740 * In the advisory request case, if the filesystem doesn't think it's reliable
7741 * we will attempt to formulate a result ourselves based on VNOP_GETATTR data.
7742 */
7743 if ((action & KAUTH_VNODE_ACCESS) && !vfs_authopaqueaccess(vp->v_mount)) {
7744 return 0;
7745 }
7746
7747 /*
7748 * Let the filesystem have a say in the matter. It's OK for it to not implement
7749 * VNOP_ACCESS, as most will authorise inline with the actual request.
7750 */
7751 if ((error = VNOP_ACCESS(vp, action, ctx)) != ENOTSUP) {
7752 *resultp = error;
7753 KAUTH_DEBUG("%p DENIED - opaque filesystem VNOP_ACCESS denied access", vp);
7754 return 1;
7755 }
7756
7757 /*
7758 * Typically opaque filesystems do authorisation in-line, but exec is a special case. In
7759 * order to be reasonably sure that exec will be permitted, we try a bit harder here.
7760 */
7761 if ((action & KAUTH_VNODE_EXECUTE) && (vp->v_type == VREG)) {
7762 /* try a VNOP_OPEN for readonly access */
7763 if ((error = VNOP_OPEN(vp, FREAD, ctx)) != 0) {
7764 *resultp = error;
7765 KAUTH_DEBUG("%p DENIED - EXECUTE denied because file could not be opened readonly", vp);
7766 return 1;
7767 }
7768 VNOP_CLOSE(vp, FREAD, ctx);
7769 }
7770
7771 /*
7772 * We don't have any reason to believe that the request has to be denied at this point,
7773 * so go ahead and allow it.
7774 */
7775 *resultp = 0;
7776 KAUTH_DEBUG("%p ALLOWED - bypassing access check for non-local filesystem", vp);
7777 return 1;
7778 }
7779
7780
7781
7782
7783 /*
7784 * Returns: KAUTH_RESULT_ALLOW
7785 * KAUTH_RESULT_DENY
7786 *
7787 * Imputed: *arg3, modified Error code in the deny case
7788 * EROFS Read-only file system
7789 * EACCES Permission denied
7790 * EPERM Operation not permitted [no execute]
7791 * vnode_getattr:ENOMEM Not enough space [only if has filesec]
7792 * vnode_getattr:???
7793 * vnode_authorize_opaque:*arg2 ???
7794 * vnode_authorize_checkimmutable:???
7795 * vnode_authorize_delete:???
7796 * vnode_authorize_simple:???
7797 */
7798
7799
7800 static int
7801 vnode_authorize_callback(__unused kauth_cred_t cred, __unused void *idata,
7802 kauth_action_t action, uintptr_t arg0, uintptr_t arg1, uintptr_t arg2,
7803 uintptr_t arg3)
7804 {
7805 vfs_context_t ctx;
7806 vnode_t cvp = NULLVP;
7807 vnode_t vp, dvp;
7808 int result = KAUTH_RESULT_DENY;
7809 int parent_iocount = 0;
7810 int parent_action; /* In case we need to use namedstream's data fork for cached rights */
7811
7812 ctx = (vfs_context_t)arg0;
7813 vp = (vnode_t)arg1;
7814 dvp = (vnode_t)arg2;
7815
7816 /*
7817 * if there are 2 vnodes passed in, we don't know at
7818 * this point which rights to look at based on the
7819 * combined action being passed in... defer until later...
7820 * otherwise check the kauth 'rights' cache hung
7821 * off of the vnode we're interested in... if we've already
7822 * been granted the right we're currently interested in,
7823 * we can just return success... otherwise we'll go through
7824 * the process of authorizing the requested right(s)... if that
7825 * succeeds, we'll add the right(s) to the cache.
7826 * VNOP_SETATTR and VNOP_SETXATTR will invalidate this cache
7827 */
7828 if (dvp && vp) {
7829 goto defer;
7830 }
7831 if (dvp) {
7832 cvp = dvp;
7833 } else {
7834 /*
7835 * For named streams on local-authorization volumes, rights are cached on the parent;
7836 * authorization is determined by looking at the parent's properties anyway, so storing
7837 * on the parent means that we don't recompute for the named stream and that if
7838 * we need to flush rights (e.g. on VNOP_SETATTR()) we don't need to track down the
7839 * stream to flush its cache separately. If we miss in the cache, then we authorize
7840 * as if there were no cached rights (passing the named stream vnode and desired rights to
7841 * vnode_authorize_callback_int()).
7842 *
7843 * On an opaquely authorized volume, we don't know the relationship between the
7844 * data fork's properties and the rights granted on a stream. Thus, named stream vnodes
7845 * on such a volume are authorized directly (rather than using the parent) and have their
7846 * own caches. When a named stream vnode is created, we mark the parent as having a named
7847 * stream. On a VNOP_SETATTR() for the parent that may invalidate cached authorization, we
7848 * find the stream and flush its cache.
7849 */
7850 if (vnode_isnamedstream(vp) && (!vfs_authopaque(vp->v_mount))) {
7851 cvp = vnode_getparent(vp);
7852 if (cvp != NULLVP) {
7853 parent_iocount = 1;
7854 } else {
7855 cvp = NULL;
7856 goto defer; /* If we can't use the parent, take the slow path */
7857 }
7858
7859 /* Have to translate some actions */
7860 parent_action = action;
7861 if (parent_action & KAUTH_VNODE_READ_DATA) {
7862 parent_action &= ~KAUTH_VNODE_READ_DATA;
7863 parent_action |= KAUTH_VNODE_READ_EXTATTRIBUTES;
7864 }
7865 if (parent_action & KAUTH_VNODE_WRITE_DATA) {
7866 parent_action &= ~KAUTH_VNODE_WRITE_DATA;
7867 parent_action |= KAUTH_VNODE_WRITE_EXTATTRIBUTES;
7868 }
7869 } else {
7870 cvp = vp;
7871 }
7872 }
7873
7874 if (vnode_cache_is_authorized(cvp, ctx, parent_iocount ? parent_action : action) == TRUE) {
7875 result = KAUTH_RESULT_ALLOW;
7876 goto out;
7877 }
7878 defer:
7879 result = vnode_authorize_callback_int(action, ctx, vp, dvp, (int *)arg3);
7880
7881 if (result == KAUTH_RESULT_ALLOW && cvp != NULLVP) {
7882 KAUTH_DEBUG("%p - caching action = %x", cvp, action);
7883 vnode_cache_authorized_action(cvp, ctx, action);
7884 }
7885
7886 out:
7887 if (parent_iocount) {
7888 vnode_put(cvp);
7889 }
7890
7891 return result;
7892 }
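/*
 * Editorial sketch (not part of the original source): the named-stream
 * action translation performed in-line above, as a hypothetical helper.
 * When rights are cached on a stream's parent, data access on the stream
 * is re-expressed as extended-attribute access on the parent.
 */
#if 0 /* illustrative only */
static kauth_action_t
example_stream_parent_action(kauth_action_t action)
{
	if (action & KAUTH_VNODE_READ_DATA) {
		action &= ~KAUTH_VNODE_READ_DATA;
		action |= KAUTH_VNODE_READ_EXTATTRIBUTES;
	}
	if (action & KAUTH_VNODE_WRITE_DATA) {
		action &= ~KAUTH_VNODE_WRITE_DATA;
		action |= KAUTH_VNODE_WRITE_EXTATTRIBUTES;
	}
	return action;
}
#endif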
7893
7894 static int
7895 vnode_attr_authorize_internal(vauth_ctx vcp, mount_t mp,
7896 kauth_ace_rights_t rights, int is_suser, boolean_t *found_deny,
7897 int noimmutable, int parent_authorized_for_delete_child)
7898 {
7899 int result;
7900
7901 /*
7902 * Check for immutability.
7903 *
7904 * In the deletion case, parent directory immutability vetoes specific
7905 * file rights.
7906 */
7907 if ((result = vnode_authorize_checkimmutable(mp, vcp->vap, rights,
7908 noimmutable)) != 0) {
7909 goto out;
7910 }
7911
7912 if ((rights & KAUTH_VNODE_DELETE) &&
7913 !parent_authorized_for_delete_child) {
7914 result = vnode_authorize_checkimmutable(mp, vcp->dvap,
7915 KAUTH_VNODE_DELETE_CHILD, 0);
7916 if (result) {
7917 goto out;
7918 }
7919 }
7920
7921 /*
7922 * Clear rights that have been authorized by reaching this point, bail if nothing left to
7923 * check.
7924 */
7925 rights &= ~(KAUTH_VNODE_LINKTARGET | KAUTH_VNODE_CHECKIMMUTABLE);
7926 if (rights == 0) {
7927 goto out;
7928 }
7929
7930 /*
7931 * If we're not the superuser, authorize based on file properties;
7932 * note that even if parent_authorized_for_delete_child is TRUE, we
7933 * need to check on the node itself.
7934 */
7935 if (!is_suser) {
7936 /* process delete rights */
7937 if ((rights & KAUTH_VNODE_DELETE) &&
7938 ((result = vnode_authorize_delete(vcp, parent_authorized_for_delete_child)) != 0)) {
7939 goto out;
7940 }
7941
7942 /* process remaining rights */
7943 if ((rights & ~KAUTH_VNODE_DELETE) &&
7944 (result = vnode_authorize_simple(vcp, rights, rights & KAUTH_VNODE_DELETE, found_deny)) != 0) {
7945 goto out;
7946 }
7947 } else {
7948 /*
7949 * Execute is only granted to root if one of the x bits is set. This check only
7950 * makes sense if the posix mode bits are actually supported.
7951 */
7952 if ((rights & KAUTH_VNODE_EXECUTE) &&
7953 (vcp->vap->va_type == VREG) &&
7954 VATTR_IS_SUPPORTED(vcp->vap, va_mode) &&
7955 !(vcp->vap->va_mode & (S_IXUSR | S_IXGRP | S_IXOTH))) {
7956 result = EPERM;
7957 KAUTH_DEBUG("%p DENIED - root execute requires at least one x bit in 0x%x", vcp->vp, vcp->vap->va_mode);
7958 goto out;
7959 }
7960
7961 /* Assume that there were DENYs so we don't wrongly cache KAUTH_VNODE_SEARCHBYANYONE */
7962 *found_deny = TRUE;
7963
7964 KAUTH_DEBUG("%p ALLOWED - caller is superuser", vcp->vp);
7965 }
7966 out:
7967 return result;
7968 }
7969
7970 static int
7971 vnode_authorize_callback_int(kauth_action_t action, vfs_context_t ctx,
7972 vnode_t vp, vnode_t dvp, int *errorp)
7973 {
7974 struct _vnode_authorize_context auth_context;
7975 vauth_ctx vcp;
7976 kauth_cred_t cred;
7977 kauth_ace_rights_t rights;
7978 struct vnode_attr va, dva;
7979 int result;
7980 int noimmutable;
7981 boolean_t parent_authorized_for_delete_child = FALSE;
7982 boolean_t found_deny = FALSE;
7983 boolean_t parent_ref = FALSE;
7984 boolean_t is_suser = FALSE;
7985
7986 vcp = &auth_context;
7987 vcp->ctx = ctx;
7988 vcp->vp = vp;
7989 vcp->dvp = dvp;
7990 /*
7991 * Note that we authorize against the context, not the passed cred
7992 * (the same thing anyway)
7993 */
7994 cred = ctx->vc_ucred;
7995
7996 VATTR_INIT(&va);
7997 vcp->vap = &va;
7998 VATTR_INIT(&dva);
7999 vcp->dvap = &dva;
8000
8001 vcp->flags = vcp->flags_valid = 0;
8002
8003 #if DIAGNOSTIC
8004 if ((ctx == NULL) || (vp == NULL) || (cred == NULL)) {
8005 panic("vnode_authorize: bad arguments (context %p vp %p cred %p)", ctx, vp, cred);
8006 }
8007 #endif
8008
8009 KAUTH_DEBUG("%p AUTH - %s %s%s%s%s%s%s%s%s%s%s%s%s%s%s%s on %s '%s' (0x%x:%p/%p)",
8010 vp, vfs_context_proc(ctx)->p_comm,
8011 (action & KAUTH_VNODE_ACCESS) ? "access" : "auth",
8012 (action & KAUTH_VNODE_READ_DATA) ? vnode_isdir(vp) ? " LIST_DIRECTORY" : " READ_DATA" : "",
8013 (action & KAUTH_VNODE_WRITE_DATA) ? vnode_isdir(vp) ? " ADD_FILE" : " WRITE_DATA" : "",
8014 (action & KAUTH_VNODE_EXECUTE) ? vnode_isdir(vp) ? " SEARCH" : " EXECUTE" : "",
8015 (action & KAUTH_VNODE_DELETE) ? " DELETE" : "",
8016 (action & KAUTH_VNODE_APPEND_DATA) ? vnode_isdir(vp) ? " ADD_SUBDIRECTORY" : " APPEND_DATA" : "",
8017 (action & KAUTH_VNODE_DELETE_CHILD) ? " DELETE_CHILD" : "",
8018 (action & KAUTH_VNODE_READ_ATTRIBUTES) ? " READ_ATTRIBUTES" : "",
8019 (action & KAUTH_VNODE_WRITE_ATTRIBUTES) ? " WRITE_ATTRIBUTES" : "",
8020 (action & KAUTH_VNODE_READ_EXTATTRIBUTES) ? " READ_EXTATTRIBUTES" : "",
8021 (action & KAUTH_VNODE_WRITE_EXTATTRIBUTES) ? " WRITE_EXTATTRIBUTES" : "",
8022 (action & KAUTH_VNODE_READ_SECURITY) ? " READ_SECURITY" : "",
8023 (action & KAUTH_VNODE_WRITE_SECURITY) ? " WRITE_SECURITY" : "",
8024 (action & KAUTH_VNODE_CHANGE_OWNER) ? " CHANGE_OWNER" : "",
8025 (action & KAUTH_VNODE_NOIMMUTABLE) ? " (noimmutable)" : "",
8026 vnode_isdir(vp) ? "directory" : "file",
8027 vp->v_name ? vp->v_name : "<NULL>", action, vp, dvp);
8028
8029 /*
8030 * Extract the control bits from the action, everything else is
8031 * requested rights.
8032 */
8033 noimmutable = (action & KAUTH_VNODE_NOIMMUTABLE) ? 1 : 0;
8034 rights = action & ~(KAUTH_VNODE_ACCESS | KAUTH_VNODE_NOIMMUTABLE);
8035
8036 if (rights & KAUTH_VNODE_DELETE) {
8037 #if DIAGNOSTIC
8038 if (dvp == NULL) {
8039 panic("vnode_authorize: KAUTH_VNODE_DELETE test requires a directory");
8040 }
8041 #endif
8042 /*
8043 * check to see if we've already authorized the parent
8044 * directory for deletion of its children... if so, we
8045 * can skip a whole bunch of work... we will still have to
8046 * authorize that this specific child can be removed
8047 */
8048 if (vnode_cache_is_authorized(dvp, ctx, KAUTH_VNODE_DELETE_CHILD) == TRUE) {
8049 parent_authorized_for_delete_child = TRUE;
8050 }
8051 } else {
8052 vcp->dvp = NULLVP;
8053 vcp->dvap = NULL;
8054 }
8055
8056 /*
8057 * Check for read-only filesystems.
8058 */
8059 if ((rights & KAUTH_VNODE_WRITE_RIGHTS) &&
8060 (vp->v_mount->mnt_flag & MNT_RDONLY) &&
8061 ((vp->v_type == VREG) || (vp->v_type == VDIR) ||
8062 (vp->v_type == VLNK) || (vp->v_type == VCPLX) ||
8063 (rights & KAUTH_VNODE_DELETE) || (rights & KAUTH_VNODE_DELETE_CHILD))) {
8064 result = EROFS;
8065 goto out;
8066 }
8067
8068 /*
8069 * Check for noexec filesystems.
8070 */
8071 if ((rights & KAUTH_VNODE_EXECUTE) && (vp->v_type == VREG) && (vp->v_mount->mnt_flag & MNT_NOEXEC)) {
8072 result = EACCES;
8073 goto out;
8074 }
8075
8076 /*
8077 * Handle cases related to filesystems with non-local enforcement.
8078 * This call can return 0, in which case we will fall through to perform a
8079 * check based on VNOP_GETATTR data. Otherwise it returns 1 and sets
8080 * an appropriate result, at which point we can return immediately.
8081 */
8082 if ((vp->v_mount->mnt_kern_flag & MNTK_AUTH_OPAQUE) && vnode_authorize_opaque(vp, &result, action, ctx)) {
8083 goto out;
8084 }
8085
8086 /*
8087 * If the vnode is a namedstream (extended attribute) data vnode (e.g.
8088 * a resource fork), *_DATA becomes *_EXTATTRIBUTES.
8089 */
8090 if (vnode_isnamedstream(vp)) {
8091 if (rights & KAUTH_VNODE_READ_DATA) {
8092 rights &= ~KAUTH_VNODE_READ_DATA;
8093 rights |= KAUTH_VNODE_READ_EXTATTRIBUTES;
8094 }
8095 if (rights & KAUTH_VNODE_WRITE_DATA) {
8096 rights &= ~KAUTH_VNODE_WRITE_DATA;
8097 rights |= KAUTH_VNODE_WRITE_EXTATTRIBUTES;
8098 }
8099
8100 /*
8101 * Point 'vp' to the namedstream's parent for ACL checking
8102 */
8103 if ((vp->v_parent != NULL) &&
8104 (vget_internal(vp->v_parent, 0, VNODE_NODEAD | VNODE_DRAINO) == 0)) {
8105 parent_ref = TRUE;
8106 vcp->vp = vp = vp->v_parent;
8107 }
8108 }
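
/*
 * For example (illustrative): a read of a resource fork opened via a
 * path like "file/..namedfork/rsrc" arrives here as
 * KAUTH_VNODE_READ_DATA on the stream vnode and is evaluated as
 * KAUTH_VNODE_READ_EXTATTRIBUTES against the parent file, which is
 * where the ACL actually lives.
 */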
8109
8110 if (vfs_context_issuser(ctx)) {
8111 /*
8112 * if we're not asking for execute permissions or modifications,
8113 * then we're done, this action is authorized.
8114 */
8115 if (!(rights & (KAUTH_VNODE_EXECUTE | KAUTH_VNODE_WRITE_RIGHTS))) {
8116 goto success;
8117 }
8118
8119 is_suser = TRUE;
8120 }
8121
8122 /*
8123 * Get vnode attributes and extended security information for the vnode
8124 * and directory if required.
8125 *
8126 * If we're root we only want mode bits and flags for checking
8127 * execute and immutability.
8128 */
8129 VATTR_WANTED(&va, va_mode);
8130 VATTR_WANTED(&va, va_flags);
8131 if (!is_suser) {
8132 VATTR_WANTED(&va, va_uid);
8133 VATTR_WANTED(&va, va_gid);
8134 VATTR_WANTED(&va, va_acl);
8135 }
8136 if ((result = vnode_getattr(vp, &va, ctx)) != 0) {
8137 KAUTH_DEBUG("%p ERROR - failed to get vnode attributes - %d", vp, result);
8138 goto out;
8139 }
8140 VATTR_WANTED(&va, va_type);
8141 VATTR_RETURN(&va, va_type, vnode_vtype(vp));
8142
8143 if (vcp->dvp) {
8144 VATTR_WANTED(&dva, va_mode);
8145 VATTR_WANTED(&dva, va_flags);
8146 if (!is_suser) {
8147 VATTR_WANTED(&dva, va_uid);
8148 VATTR_WANTED(&dva, va_gid);
8149 VATTR_WANTED(&dva, va_acl);
8150 }
8151 if ((result = vnode_getattr(vcp->dvp, &dva, ctx)) != 0) {
8152 KAUTH_DEBUG("%p ERROR - failed to get directory vnode attributes - %d", vp, result);
8153 goto out;
8154 }
8155 VATTR_WANTED(&dva, va_type);
8156 VATTR_RETURN(&dva, va_type, vnode_vtype(vcp->dvp));
8157 }
8158
8159 result = vnode_attr_authorize_internal(vcp, vp->v_mount, rights, is_suser,
8160 &found_deny, noimmutable, parent_authorized_for_delete_child);
8161 out:
8162 if (VATTR_IS_SUPPORTED(&va, va_acl) && (va.va_acl != NULL)) {
8163 kauth_acl_free(va.va_acl);
8164 }
8165 if (VATTR_IS_SUPPORTED(&dva, va_acl) && (dva.va_acl != NULL)) {
8166 kauth_acl_free(dva.va_acl);
8167 }
8168
8169 if (result) {
8170 if (parent_ref) {
8171 vnode_put(vp);
8172 }
8173 *errorp = result;
8174 KAUTH_DEBUG("%p DENIED - auth denied", vp);
8175 return KAUTH_RESULT_DENY;
8176 }
8177 if ((rights & KAUTH_VNODE_SEARCH) && found_deny == FALSE && vp->v_type == VDIR) {
8178 /*
8179 * if we were successfully granted the right to search this directory
8180 * and there were NO ACL DENYs for search and the posix permissions also don't
8181 * deny execute, we can synthesize a global right that allows anyone to
8182 * traverse this directory during a pathname lookup without having to
8183 * match the credential associated with this cache of rights.
8184 *
8185 * Note that we can correctly cache KAUTH_VNODE_SEARCHBYANYONE
8186 * only if we actually check ACLs which we don't for root. As
8187 * a workaround, the lookup fast path checks for root.
8188 */
8189 if (!VATTR_IS_SUPPORTED(&va, va_mode) ||
8190 ((va.va_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) ==
8191 (S_IXUSR | S_IXGRP | S_IXOTH))) {
8192 vnode_cache_authorized_action(vp, ctx, KAUTH_VNODE_SEARCHBYANYONE);
8193 }
8194 }
8195 success:
8196 if (parent_ref) {
8197 vnode_put(vp);
8198 }
8199
8200 /*
8201 * Note that this implies that we will allow requests for no rights, as well as
8202 * for rights that we do not recognise. There should be none of these.
8203 */
8204 KAUTH_DEBUG("%p ALLOWED - auth granted", vp);
8205 return KAUTH_RESULT_ALLOW;
8206 }
8207
8208 int
8209 vnode_attr_authorize_init(struct vnode_attr *vap, struct vnode_attr *dvap,
8210 kauth_action_t action, vfs_context_t ctx)
8211 {
8212 VATTR_INIT(vap);
8213 VATTR_WANTED(vap, va_type);
8214 VATTR_WANTED(vap, va_mode);
8215 VATTR_WANTED(vap, va_flags);
8216 if (dvap) {
8217 VATTR_INIT(dvap);
8218 if (action & KAUTH_VNODE_DELETE) {
8219 VATTR_WANTED(dvap, va_type);
8220 VATTR_WANTED(dvap, va_mode);
8221 VATTR_WANTED(dvap, va_flags);
8222 }
8223 } else if (action & KAUTH_VNODE_DELETE) {
8224 return EINVAL;
8225 }
8226
8227 if (!vfs_context_issuser(ctx)) {
8228 VATTR_WANTED(vap, va_uid);
8229 VATTR_WANTED(vap, va_gid);
8230 VATTR_WANTED(vap, va_acl);
8231 if (dvap && (action & KAUTH_VNODE_DELETE)) {
8232 VATTR_WANTED(dvap, va_uid);
8233 VATTR_WANTED(dvap, va_gid);
8234 VATTR_WANTED(dvap, va_acl);
8235 }
8236 }
8237
8238 return 0;
8239 }
8240
8241 int
8242 vnode_attr_authorize(struct vnode_attr *vap, struct vnode_attr *dvap, mount_t mp,
8243 kauth_action_t action, vfs_context_t ctx)
8244 {
8245 struct _vnode_authorize_context auth_context;
8246 vauth_ctx vcp;
8247 kauth_ace_rights_t rights;
8248 int noimmutable;
8249 boolean_t found_deny;
8250 boolean_t is_suser = FALSE;
8251 int result = 0;
8252
8253 vcp = &auth_context;
8254 vcp->ctx = ctx;
8255 vcp->vp = NULLVP;
8256 vcp->vap = vap;
8257 vcp->dvp = NULLVP;
8258 vcp->dvap = dvap;
8259 vcp->flags = vcp->flags_valid = 0;
8260
8261 noimmutable = (action & KAUTH_VNODE_NOIMMUTABLE) ? 1 : 0;
8262 rights = action & ~(KAUTH_VNODE_ACCESS | KAUTH_VNODE_NOIMMUTABLE);
8263
8264 /*
8265 * Check for read-only filesystems.
8266 */
8267 if ((rights & KAUTH_VNODE_WRITE_RIGHTS) &&
8268 mp && (mp->mnt_flag & MNT_RDONLY) &&
8269 ((vap->va_type == VREG) || (vap->va_type == VDIR) ||
8270 (vap->va_type == VLNK) || (rights & KAUTH_VNODE_DELETE) ||
8271 (rights & KAUTH_VNODE_DELETE_CHILD))) {
8272 result = EROFS;
8273 goto out;
8274 }
8275
8276 /*
8277 * Check for noexec filesystems.
8278 */
8279 if ((rights & KAUTH_VNODE_EXECUTE) &&
8280 (vap->va_type == VREG) && mp && (mp->mnt_flag & MNT_NOEXEC)) {
8281 result = EACCES;
8282 goto out;
8283 }
8284
8285 if (vfs_context_issuser(ctx)) {
8286 /*
8287 * if we're not asking for execute permissions or modifications,
8288 * then we're done, this action is authorized.
8289 */
8290 if (!(rights & (KAUTH_VNODE_EXECUTE | KAUTH_VNODE_WRITE_RIGHTS))) {
8291 goto out;
8292 }
8293 is_suser = TRUE;
8294 } else {
8295 if (!VATTR_IS_SUPPORTED(vap, va_uid) ||
8296 !VATTR_IS_SUPPORTED(vap, va_gid) ||
8297 (mp && vfs_extendedsecurity(mp) && !VATTR_IS_SUPPORTED(vap, va_acl))) {
8298 panic("vnode attrs not complete for vnode_attr_authorize\n");
8299 }
8300 }
8301
8302 result = vnode_attr_authorize_internal(vcp, mp, rights, is_suser,
8303 &found_deny, noimmutable, FALSE);
8304
8305 if (result == EPERM) {
8306 result = EACCES;
8307 }
8308 out:
8309 return result;
8310 }
8311
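/*
 * A minimal usage sketch (illustrative only, not part of the original
 * source): authorizing a delete against caller-fetched attributes.
 * 'vp', 'dvp' and 'ctx' are assumed to be provided by the caller.
 */
#if 0
struct vnode_attr va, dva;
kauth_action_t action = KAUTH_VNODE_DELETE;
int err;

if ((err = vnode_attr_authorize_init(&va, &dva, action, ctx)) == 0 &&
    (err = vnode_getattr(vp, &va, ctx)) == 0 &&
    (err = vnode_getattr(dvp, &dva, ctx)) == 0) {
	err = vnode_attr_authorize(&va, &dva, vnode_mount(vp), action, ctx);
}
#endif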
8312
8313 int
8314 vnode_authattr_new(vnode_t dvp, struct vnode_attr *vap, int noauth, vfs_context_t ctx)
8315 {
8316 return vnode_authattr_new_internal(dvp, vap, noauth, NULL, ctx);
8317 }
8318
8319 /*
8320 * Check that the attribute information in vattr can be legally applied to
8321 * a new file by the context.
8322 */
8323 static int
8324 vnode_authattr_new_internal(vnode_t dvp, struct vnode_attr *vap, int noauth, uint32_t *defaulted_fieldsp, vfs_context_t ctx)
8325 {
8326 int error;
8327 int has_priv_suser, ismember, defaulted_owner, defaulted_group, defaulted_mode;
8328 uint32_t inherit_flags;
8329 kauth_cred_t cred;
8330 guid_t changer;
8331 mount_t dmp;
8332 struct vnode_attr dva;
8333
8334 error = 0;
8335
8336 if (defaulted_fieldsp) {
8337 *defaulted_fieldsp = 0;
8338 }
8339
8340 defaulted_owner = defaulted_group = defaulted_mode = 0;
8341
8342 inherit_flags = 0;
8343
8344 /*
8345 * Require that the filesystem support extended security to apply any.
8346 */
8347 if (!vfs_extendedsecurity(dvp->v_mount) &&
8348 (VATTR_IS_ACTIVE(vap, va_acl) || VATTR_IS_ACTIVE(vap, va_uuuid) || VATTR_IS_ACTIVE(vap, va_guuid))) {
8349 error = EINVAL;
8350 goto out;
8351 }
8352
8353 /*
8354 * Default some fields.
8355 */
8356 dmp = dvp->v_mount;
8357
8358 /*
8359 * If the filesystem is mounted IGNORE_OWNERSHIP and an explicit owner is set, that
8360 * owner takes ownership of all new files.
8361 */
8362 if ((dmp->mnt_flag & MNT_IGNORE_OWNERSHIP) && (dmp->mnt_fsowner != KAUTH_UID_NONE)) {
8363 VATTR_SET(vap, va_uid, dmp->mnt_fsowner);
8364 defaulted_owner = 1;
8365 } else {
8366 if (!VATTR_IS_ACTIVE(vap, va_uid)) {
8367 /* default owner is current user */
8368 VATTR_SET(vap, va_uid, kauth_cred_getuid(vfs_context_ucred(ctx)));
8369 defaulted_owner = 1;
8370 }
8371 }
8372
8373 /*
8374 * We need the dvp's va_flags and *may* need the gid of the directory,
8375 * we ask for both here.
8376 */
8377 VATTR_INIT(&dva);
8378 VATTR_WANTED(&dva, va_gid);
8379 VATTR_WANTED(&dva, va_flags);
8380 if ((error = vnode_getattr(dvp, &dva, ctx)) != 0) {
8381 goto out;
8382 }
8383
8384 /*
8385 * If the filesystem is mounted IGNORE_OWNERSHIP and an explicit group is set, that
8386 * group takes ownership of all new files.
8387 */
8388 if ((dmp->mnt_flag & MNT_IGNORE_OWNERSHIP) && (dmp->mnt_fsgroup != KAUTH_GID_NONE)) {
8389 VATTR_SET(vap, va_gid, dmp->mnt_fsgroup);
8390 defaulted_group = 1;
8391 } else {
8392 if (!VATTR_IS_ACTIVE(vap, va_gid)) {
8393 /* default group comes from parent object, fallback to current user */
8394 if (VATTR_IS_SUPPORTED(&dva, va_gid)) {
8395 VATTR_SET(vap, va_gid, dva.va_gid);
8396 } else {
8397 VATTR_SET(vap, va_gid, kauth_cred_getgid(vfs_context_ucred(ctx)));
8398 }
8399 defaulted_group = 1;
8400 }
8401 }
8402
8403 if (!VATTR_IS_ACTIVE(vap, va_flags)) {
8404 VATTR_SET(vap, va_flags, 0);
8405 }
8406
8407 /* Determine if SF_RESTRICTED should be inherited from the parent
8408 * directory. */
8409 if (VATTR_IS_SUPPORTED(&dva, va_flags)) {
8410 inherit_flags = dva.va_flags & (UF_DATAVAULT | SF_RESTRICTED);
8411 }
8412
8413 /* default mode is everything, masked with current umask */
8414 if (!VATTR_IS_ACTIVE(vap, va_mode)) {
8415 VATTR_SET(vap, va_mode, ACCESSPERMS & ~vfs_context_proc(ctx)->p_fd->fd_cmask);
8416 KAUTH_DEBUG("ATTR - defaulting new file mode to %o from umask %o", vap->va_mode, vfs_context_proc(ctx)->p_fd->fd_cmask);
8417 defaulted_mode = 1;
8418 }
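/* Worked example (illustrative): with the common umask of 022,
 * ACCESSPERMS (0777) & ~022 == 0755, i.e. rwxr-xr-x. */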
8419 /* set timestamps to now */
8420 if (!VATTR_IS_ACTIVE(vap, va_create_time)) {
8421 nanotime(&vap->va_create_time);
8422 VATTR_SET_ACTIVE(vap, va_create_time);
8423 }
8424
8425 /*
8426 * Check for attempts to set nonsensical fields.
8427 */
8428 if (vap->va_active & ~VNODE_ATTR_NEWOBJ) {
8429 error = EINVAL;
8430 KAUTH_DEBUG("ATTR - ERROR - attempt to set unsupported new-file attributes %llx",
8431 vap->va_active & ~VNODE_ATTR_NEWOBJ);
8432 goto out;
8433 }
8434
8435 /*
8436 * Quickly check for the applicability of any enforcement here.
8437 * Tests below maintain the integrity of the local security model.
8438 */
8439 if (vfs_authopaque(dvp->v_mount)) {
8440 goto out;
8441 }
8442
8443 /*
8444 * We need to know if the caller is the superuser, or if the work is
8445 * otherwise already authorised.
8446 */
8447 cred = vfs_context_ucred(ctx);
8448 if (noauth) {
8449 /* doing work for the kernel */
8450 has_priv_suser = 1;
8451 } else {
8452 has_priv_suser = vfs_context_issuser(ctx);
8453 }
8454
8455
8456 if (VATTR_IS_ACTIVE(vap, va_flags)) {
8457 if (has_priv_suser) {
8458 if ((vap->va_flags & (UF_SETTABLE | SF_SETTABLE)) != vap->va_flags) {
8459 error = EPERM;
8460 KAUTH_DEBUG(" DENIED - superuser attempt to set illegal flag(s)");
8461 goto out;
8462 }
8463 } else {
8464 if ((vap->va_flags & UF_SETTABLE) != vap->va_flags) {
8465 error = EPERM;
8466 KAUTH_DEBUG(" DENIED - user attempt to set illegal flag(s)");
8467 goto out;
8468 }
8469 }
8470 }
8471
8472 /* if not superuser, validate legality of new-item attributes */
8473 if (!has_priv_suser) {
8474 if (!defaulted_mode && VATTR_IS_ACTIVE(vap, va_mode)) {
8475 /* setgid? */
8476 if (vap->va_mode & S_ISGID) {
8477 if ((error = kauth_cred_ismember_gid(cred, vap->va_gid, &ismember)) != 0) {
8478 KAUTH_DEBUG("ATTR - ERROR: got %d checking for membership in %d", error, vap->va_gid);
8479 goto out;
8480 }
8481 if (!ismember) {
8482 KAUTH_DEBUG(" DENIED - can't set SGID bit, not a member of %d", vap->va_gid);
8483 error = EPERM;
8484 goto out;
8485 }
8486 }
8487
8488 /* setuid? */
8489 if ((vap->va_mode & S_ISUID) && (vap->va_uid != kauth_cred_getuid(cred))) {
8490 KAUTH_DEBUG("ATTR - ERROR: illegal attempt to set the setuid bit");
8491 error = EPERM;
8492 goto out;
8493 }
8494 }
8495 if (!defaulted_owner && (vap->va_uid != kauth_cred_getuid(cred))) {
8496 KAUTH_DEBUG(" DENIED - cannot create new item owned by %d", vap->va_uid);
8497 error = EPERM;
8498 goto out;
8499 }
8500 if (!defaulted_group) {
8501 if ((error = kauth_cred_ismember_gid(cred, vap->va_gid, &ismember)) != 0) {
8502 KAUTH_DEBUG(" ERROR - got %d checking for membership in %d", error, vap->va_gid);
8503 goto out;
8504 }
8505 if (!ismember) {
8506 KAUTH_DEBUG(" DENIED - cannot create new item with group %d - not a member", vap->va_gid);
8507 error = EPERM;
8508 goto out;
8509 }
8510 }
8511
8512 /* initialising owner/group UUID */
8513 if (VATTR_IS_ACTIVE(vap, va_uuuid)) {
8514 if ((error = kauth_cred_getguid(cred, &changer)) != 0) {
8515 KAUTH_DEBUG(" ERROR - got %d trying to get caller UUID", error);
8516 /* XXX ENOENT here - no GUID - should perhaps become EPERM */
8517 goto out;
8518 }
8519 if (!kauth_guid_equal(&vap->va_uuuid, &changer)) {
8520 KAUTH_DEBUG(" ERROR - cannot create item with supplied owner UUID - not us");
8521 error = EPERM;
8522 goto out;
8523 }
8524 }
8525 if (VATTR_IS_ACTIVE(vap, va_guuid)) {
8526 if ((error = kauth_cred_ismember_guid(cred, &vap->va_guuid, &ismember)) != 0) {
8527 KAUTH_DEBUG(" ERROR - got %d trying to check group membership", error);
8528 goto out;
8529 }
8530 if (!ismember) {
8531 KAUTH_DEBUG(" ERROR - cannot create item with supplied group UUID - not a member");
8532 error = EPERM;
8533 goto out;
8534 }
8535 }
8536 }
8537 out:
8538 if (inherit_flags) {
8539 /* Apply SF_RESTRICTED to the file if its parent directory was
8540 * restricted. This is done at the end so that root is not
8541 * required if this flag is only set due to inheritance. */
8542 VATTR_SET(vap, va_flags, (vap->va_flags | inherit_flags));
8543 }
8544 if (defaulted_fieldsp) {
8545 if (defaulted_mode) {
8546 *defaulted_fieldsp |= VATTR_PREPARE_DEFAULTED_MODE;
8547 }
8548 if (defaulted_group) {
8549 *defaulted_fieldsp |= VATTR_PREPARE_DEFAULTED_GID;
8550 }
8551 if (defaulted_owner) {
8552 *defaulted_fieldsp |= VATTR_PREPARE_DEFAULTED_UID;
8553 }
8554 }
8555 return error;
8556 }
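
/*
 * Sketch of how a caller might consume the defaulted-fields mask
 * (illustrative only; 'dvp', 'vap' and 'ctx' are assumed to be
 * supplied by the caller):
 */
#if 0
uint32_t defaulted;

if (vnode_authattr_new_internal(dvp, vap, 0, &defaulted, ctx) == 0 &&
    (defaulted & VATTR_PREPARE_DEFAULTED_MODE)) {
	/* va_mode came from the umask, not from the caller */
}
#endif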
8557
8558 /*
8559 * Check that the attribute information in vap can be legally written by the
8560 * context.
8561 *
8562 * Call this when you're not sure about the vnode_attr; either its contents
8563 * have come from an unknown source, or they are variable.
8564 *
8565 * Returns errno, or zero and sets *actionp to the KAUTH_VNODE_* actions that
8566 * must be authorized to be permitted to write the vattr.
8567 */
8568 int
8569 vnode_authattr(vnode_t vp, struct vnode_attr *vap, kauth_action_t *actionp, vfs_context_t ctx)
8570 {
8571 struct vnode_attr ova;
8572 kauth_action_t required_action;
8573 int error, has_priv_suser, ismember, chowner, chgroup, clear_suid, clear_sgid;
8574 guid_t changer;
8575 gid_t group;
8576 uid_t owner;
8577 mode_t newmode;
8578 kauth_cred_t cred;
8579 uint32_t fdelta;
8580
8581 VATTR_INIT(&ova);
8582 required_action = 0;
8583 error = 0;
8584
8585 /*
8586 * Quickly check for enforcement applicability.
8587 */
8588 if (vfs_authopaque(vp->v_mount)) {
8589 goto out;
8590 }
8591
8592 /*
8593 * Check for attempts to set nonsensical fields.
8594 */
8595 if (vap->va_active & VNODE_ATTR_RDONLY) {
8596 KAUTH_DEBUG("ATTR - ERROR: attempt to set readonly attribute(s)");
8597 error = EINVAL;
8598 goto out;
8599 }
8600
8601 /*
8602 * We need to know if the caller is the superuser.
8603 */
8604 cred = vfs_context_ucred(ctx);
8605 has_priv_suser = kauth_cred_issuser(cred);
8606
8607 /*
8608 * If any of the following are changing, we need information from the old file:
8609 * va_uid
8610 * va_gid
8611 * va_mode
8612 * va_uuuid
8613 * va_guuid
8614 */
8615 if (VATTR_IS_ACTIVE(vap, va_uid) ||
8616 VATTR_IS_ACTIVE(vap, va_gid) ||
8617 VATTR_IS_ACTIVE(vap, va_mode) ||
8618 VATTR_IS_ACTIVE(vap, va_uuuid) ||
8619 VATTR_IS_ACTIVE(vap, va_guuid)) {
8620 VATTR_WANTED(&ova, va_mode);
8621 VATTR_WANTED(&ova, va_uid);
8622 VATTR_WANTED(&ova, va_gid);
8623 VATTR_WANTED(&ova, va_uuuid);
8624 VATTR_WANTED(&ova, va_guuid);
8625 KAUTH_DEBUG("ATTR - security information changing, fetching existing attributes");
8626 }
8627
8628 /*
8629 * If timestamps are being changed, we need to know who the file is owned
8630 * by.
8631 */
8632 if (VATTR_IS_ACTIVE(vap, va_create_time) ||
8633 VATTR_IS_ACTIVE(vap, va_change_time) ||
8634 VATTR_IS_ACTIVE(vap, va_modify_time) ||
8635 VATTR_IS_ACTIVE(vap, va_access_time) ||
8636 VATTR_IS_ACTIVE(vap, va_backup_time) ||
8637 VATTR_IS_ACTIVE(vap, va_addedtime)) {
8638 VATTR_WANTED(&ova, va_uid);
8639 #if 0 /* enable this when we support UUIDs as official owners */
8640 VATTR_WANTED(&ova, va_uuuid);
8641 #endif
8642 KAUTH_DEBUG("ATTR - timestamps changing, fetching uid and GUID");
8643 }
8644
8645 /*
8646 * If flags are being changed, we need the old flags.
8647 */
8648 if (VATTR_IS_ACTIVE(vap, va_flags)) {
8649 KAUTH_DEBUG("ATTR - flags changing, fetching old flags");
8650 VATTR_WANTED(&ova, va_flags);
8651 }
8652
8653 /*
8654 * If ACLs are being changed, we need the old ACLs.
8655 */
8656 if (VATTR_IS_ACTIVE(vap, va_acl)) {
8657 KAUTH_DEBUG("ATTR - acl changing, fetching old flags");
8658 VATTR_WANTED(&ova, va_acl);
8659 }
8660
8661 /*
8662 * If the size is being set, make sure it's not a directory.
8663 */
8664 if (VATTR_IS_ACTIVE(vap, va_data_size)) {
8665 /* size is only meaningful on regular files, don't permit otherwise */
8666 if (!vnode_isreg(vp)) {
8667 KAUTH_DEBUG("ATTR - ERROR: size change requested on non-file");
8668 error = vnode_isdir(vp) ? EISDIR : EINVAL;
8669 goto out;
8670 }
8671 }
8672
8673 /*
8674 * Get old data.
8675 */
8676 KAUTH_DEBUG("ATTR - fetching old attributes %016llx", ova.va_active);
8677 if ((error = vnode_getattr(vp, &ova, ctx)) != 0) {
8678 KAUTH_DEBUG(" ERROR - got %d trying to get attributes", error);
8679 goto out;
8680 }
8681
8682 /*
8683 * Size changes require write access to the file data.
8684 */
8685 if (VATTR_IS_ACTIVE(vap, va_data_size)) {
8686 /* if we can't get the size, or it's different, we need write access */
8687 KAUTH_DEBUG("ATTR - size change, requiring WRITE_DATA");
8688 required_action |= KAUTH_VNODE_WRITE_DATA;
8689 }
8690
8691 /*
8692 * Changing timestamps?
8693 *
8694 * Note that we are only called to authorize user-requested time changes;
8695 * side-effect time changes are not authorized. Authorisation is only
8696 * required for existing files.
8697 *
8698 * Non-owners are not permitted to change the time on an existing
8699 * file to anything other than the current time.
8700 */
8701 if (VATTR_IS_ACTIVE(vap, va_create_time) ||
8702 VATTR_IS_ACTIVE(vap, va_change_time) ||
8703 VATTR_IS_ACTIVE(vap, va_modify_time) ||
8704 VATTR_IS_ACTIVE(vap, va_access_time) ||
8705 VATTR_IS_ACTIVE(vap, va_backup_time) ||
8706 VATTR_IS_ACTIVE(vap, va_addedtime)) {
8707 /*
8708 * The owner and root may set any timestamps they like,
8709 * provided that the file is not immutable. The owner still needs
8710 * WRITE_ATTRIBUTES (implied by ownership but still deniable).
8711 */
8712 if (has_priv_suser || vauth_node_owner(&ova, cred)) {
8713 KAUTH_DEBUG("ATTR - root or owner changing timestamps");
8714 required_action |= KAUTH_VNODE_CHECKIMMUTABLE | KAUTH_VNODE_WRITE_ATTRIBUTES;
8715 } else {
8716 /* just setting the current time? */
8717 if (vap->va_vaflags & VA_UTIMES_NULL) {
8718 KAUTH_DEBUG("ATTR - non-root/owner changing timestamps, requiring WRITE_ATTRIBUTES");
8719 required_action |= KAUTH_VNODE_WRITE_ATTRIBUTES;
8720 } else {
8721 KAUTH_DEBUG("ATTR - ERROR: illegal timestamp modification attempted");
8722 error = EACCES;
8723 goto out;
8724 }
8725 }
8726 }
8727
8728 /*
8729 * Changing file mode?
8730 */
8731 if (VATTR_IS_ACTIVE(vap, va_mode) && VATTR_IS_SUPPORTED(&ova, va_mode) && (ova.va_mode != vap->va_mode)) {
8732 KAUTH_DEBUG("ATTR - mode change from %06o to %06o", ova.va_mode, vap->va_mode);
8733
8734 /*
8735 * Mode changes always have the same basic auth requirements.
8736 */
8737 if (has_priv_suser) {
8738 KAUTH_DEBUG("ATTR - superuser mode change, requiring immutability check");
8739 required_action |= KAUTH_VNODE_CHECKIMMUTABLE;
8740 } else {
8741 /* need WRITE_SECURITY */
8742 KAUTH_DEBUG("ATTR - non-superuser mode change, requiring WRITE_SECURITY");
8743 required_action |= KAUTH_VNODE_WRITE_SECURITY;
8744 }
8745
8746 /*
8747 * Can't set the setgid bit if you're not in the group and not root. We must
8748 * have existing group information if we're not setting the group right now.
8749 */
8750 if (vap->va_mode & S_ISGID) {
8751 required_action |= KAUTH_VNODE_CHECKIMMUTABLE; /* always required */
8752 if (!has_priv_suser) {
8753 if (VATTR_IS_ACTIVE(vap, va_gid)) {
8754 group = vap->va_gid;
8755 } else if (VATTR_IS_SUPPORTED(&ova, va_gid)) {
8756 group = ova.va_gid;
8757 } else {
8758 KAUTH_DEBUG("ATTR - ERROR: setgid but no gid available");
8759 error = EINVAL;
8760 goto out;
8761 }
8762 /*
8763 * This might be too restrictive; WRITE_SECURITY might be implied by
8764 * membership in this case, rather than being an additional requirement.
8765 */
8766 if ((error = kauth_cred_ismember_gid(cred, group, &ismember)) != 0) {
8767 KAUTH_DEBUG("ATTR - ERROR: got %d checking for membership in %d", error, vap->va_gid);
8768 goto out;
8769 }
8770 if (!ismember) {
8771 KAUTH_DEBUG(" DENIED - can't set SGID bit, not a member of %d", group);
8772 error = EPERM;
8773 goto out;
8774 }
8775 }
8776 }
8777
8778 /*
8779 * Can't set the setuid bit unless you're root or the file's owner.
8780 */
8781 if (vap->va_mode & S_ISUID) {
8782 required_action |= KAUTH_VNODE_CHECKIMMUTABLE; /* always required */
8783 if (!has_priv_suser) {
8784 if (VATTR_IS_ACTIVE(vap, va_uid)) {
8785 owner = vap->va_uid;
8786 } else if (VATTR_IS_SUPPORTED(&ova, va_uid)) {
8787 owner = ova.va_uid;
8788 } else {
8789 KAUTH_DEBUG("ATTR - ERROR: setuid but no uid available");
8790 error = EINVAL;
8791 goto out;
8792 }
8793 if (owner != kauth_cred_getuid(cred)) {
8794 /*
8795 * We could allow this if WRITE_SECURITY is permitted, perhaps.
8796 */
8797 KAUTH_DEBUG("ATTR - ERROR: illegal attempt to set the setuid bit");
8798 error = EPERM;
8799 goto out;
8800 }
8801 }
8802 }
8803 }
8804
8805 /*
8806 * Validate/mask flags changes. This checks that only the flags in
8807 * the UF_SETTABLE mask are being set, and preserves the flags in
8808 * the SF_SETTABLE case.
8809 *
8810 * Since flags changes may be made in conjunction with other changes,
8811 * we will ask the auth code to ignore immutability in the case that
8812 * the SF_* flags are not set and we are only manipulating the file flags.
8813 *
8814 */
8815 if (VATTR_IS_ACTIVE(vap, va_flags)) {
8816 /* compute changing flags bits */
8817 if (VATTR_IS_SUPPORTED(&ova, va_flags)) {
8818 fdelta = vap->va_flags ^ ova.va_flags;
8819 } else {
8820 fdelta = vap->va_flags;
8821 }
8822
8823 if (fdelta != 0) {
8824 KAUTH_DEBUG("ATTR - flags changing, requiring WRITE_SECURITY");
8825 required_action |= KAUTH_VNODE_WRITE_SECURITY;
8826
8827 /* check that changing bits are legal */
8828 if (has_priv_suser) {
8829 /*
8830 * The immutability check will prevent us from clearing the SF_*
8831 * flags unless the system securelevel permits it, so just check
8832 * for legal flags here.
8833 */
8834 if (fdelta & ~(UF_SETTABLE | SF_SETTABLE)) {
8835 error = EPERM;
8836 KAUTH_DEBUG(" DENIED - superuser attempt to set illegal flag(s)");
8837 goto out;
8838 }
8839 } else {
8840 if (fdelta & ~UF_SETTABLE) {
8841 error = EPERM;
8842 KAUTH_DEBUG(" DENIED - user attempt to set illegal flag(s)");
8843 goto out;
8844 }
8845 }
8846 /*
8847 * If the caller has the ability to manipulate file flags,
8848 * security is not reduced by ignoring them for this operation.
8849 *
8850 * A more complete test here would consider the 'after' states of the flags
8851 * to determine whether it would permit the operation, but this becomes
8852 * very complex.
8853 *
8854 * Ignoring immutability is conditional on securelevel; this does not bypass
8855 * the SF_* flags if securelevel > 0.
8856 */
8857 required_action |= KAUTH_VNODE_NOIMMUTABLE;
8858 }
8859 }
8860
8861 /*
8862 * Validate ownership information.
8863 */
8864 chowner = 0;
8865 chgroup = 0;
8866 clear_suid = 0;
8867 clear_sgid = 0;
8868
8869 /*
8870 * uid changing
8871 * Note that if the filesystem didn't give us a UID, we expect that it doesn't
8872 * support UIDs in general, and will ignore the value if/when we try to set it.
8873 * We might want to clear the uid out of vap completely here.
8874 */
8875 if (VATTR_IS_ACTIVE(vap, va_uid)) {
8876 if (VATTR_IS_SUPPORTED(&ova, va_uid) && (vap->va_uid != ova.va_uid)) {
8877 if (!has_priv_suser && (kauth_cred_getuid(cred) != vap->va_uid)) {
8878 KAUTH_DEBUG(" DENIED - non-superuser cannot change ownershipt to a third party");
8879 error = EPERM;
8880 goto out;
8881 }
8882 chowner = 1;
8883 }
8884 clear_suid = 1;
8885 }
8886
8887 /*
8888 * gid changing
8889 * Note that if the filesystem didn't give us a GID, we expect that it doesn't
8890 * support GIDs in general, and will ignore the value if/when we try to set it.
8891 * We might want to clear the gid out of vap completely here.
8892 */
8893 if (VATTR_IS_ACTIVE(vap, va_gid)) {
8894 if (VATTR_IS_SUPPORTED(&ova, va_gid) && (vap->va_gid != ova.va_gid)) {
8895 if (!has_priv_suser) {
8896 if ((error = kauth_cred_ismember_gid(cred, vap->va_gid, &ismember)) != 0) {
8897 KAUTH_DEBUG(" ERROR - got %d checking for membership in %d", error, vap->va_gid);
8898 goto out;
8899 }
8900 if (!ismember) {
8901 KAUTH_DEBUG(" DENIED - group change from %d to %d but not a member of target group",
8902 ova.va_gid, vap->va_gid);
8903 error = EPERM;
8904 goto out;
8905 }
8906 }
8907 chgroup = 1;
8908 }
8909 clear_sgid = 1;
8910 }
8911
8912 /*
8913 * Owner UUID being set or changed.
8914 */
8915 if (VATTR_IS_ACTIVE(vap, va_uuuid)) {
8916 /* if the owner UUID is not actually changing ... */
8917 if (VATTR_IS_SUPPORTED(&ova, va_uuuid)) {
8918 if (kauth_guid_equal(&vap->va_uuuid, &ova.va_uuuid)) {
8919 goto no_uuuid_change;
8920 }
8921
8922 /*
8923 * If the current owner UUID is a null GUID, check
8924 * it against the UUID corresponding to the owner UID.
8925 */
8926 if (kauth_guid_equal(&ova.va_uuuid, &kauth_null_guid) &&
8927 VATTR_IS_SUPPORTED(&ova, va_uid)) {
8928 guid_t uid_guid;
8929
8930 if (kauth_cred_uid2guid(ova.va_uid, &uid_guid) == 0 &&
8931 kauth_guid_equal(&vap->va_uuuid, &uid_guid)) {
8932 goto no_uuuid_change;
8933 }
8934 }
8935 }
8936
8937 /*
8938 * The owner UUID cannot be set by a non-superuser to anything other than
8939 * their own or a null GUID (to "unset" the owner UUID).
8940 * Note that file systems must be prepared to handle the
8941 * null UUID case in a manner appropriate for that file
8942 * system.
8943 */
8944 if (!has_priv_suser) {
8945 if ((error = kauth_cred_getguid(cred, &changer)) != 0) {
8946 KAUTH_DEBUG(" ERROR - got %d trying to get caller UUID", error);
8947 /* XXX ENOENT here - no UUID - should perhaps become EPERM */
8948 goto out;
8949 }
8950 if (!kauth_guid_equal(&vap->va_uuuid, &changer) &&
8951 !kauth_guid_equal(&vap->va_uuuid, &kauth_null_guid)) {
8952 KAUTH_DEBUG(" ERROR - cannot set supplied owner UUID - not us / null");
8953 error = EPERM;
8954 goto out;
8955 }
8956 }
8957 chowner = 1;
8958 clear_suid = 1;
8959 }
8960 no_uuuid_change:
8961 /*
8962 * Group UUID being set or changed.
8963 */
8964 if (VATTR_IS_ACTIVE(vap, va_guuid)) {
8965 /* if the group UUID is not actually changing ... */
8966 if (VATTR_IS_SUPPORTED(&ova, va_guuid)) {
8967 if (kauth_guid_equal(&vap->va_guuid, &ova.va_guuid)) {
8968 goto no_guuid_change;
8969 }
8970
8971 /*
8972 * If the current group UUID is a null UUID, check
8973 * it against the UUID corresponding to the group GID.
8974 */
8975 if (kauth_guid_equal(&ova.va_guuid, &kauth_null_guid) &&
8976 VATTR_IS_SUPPORTED(&ova, va_gid)) {
8977 guid_t gid_guid;
8978
8979 if (kauth_cred_gid2guid(ova.va_gid, &gid_guid) == 0 &&
8980 kauth_guid_equal(&vap->va_guuid, &gid_guid)) {
8981 goto no_guuid_change;
8982 }
8983 }
8984 }
8985
8986 /*
8987 * The group UUID cannot be set by a non-superuser to anything other than
8988 * one of which they are a member or a null GUID (to "unset"
8989 * the group UUID).
8990 * Note that file systems must be prepared to handle the
8991 * null UUID case in a manner appropriate for that file
8992 * system.
8993 */
8994 if (!has_priv_suser) {
8995 if (kauth_guid_equal(&vap->va_guuid, &kauth_null_guid)) {
8996 ismember = 1;
8997 } else if ((error = kauth_cred_ismember_guid(cred, &vap->va_guuid, &ismember)) != 0) {
8998 KAUTH_DEBUG(" ERROR - got %d trying to check group membership", error);
8999 goto out;
9000 }
9001 if (!ismember) {
9002 KAUTH_DEBUG(" ERROR - cannot set supplied group UUID - not a member / null");
9003 error = EPERM;
9004 goto out;
9005 }
9006 }
9007 chgroup = 1;
9008 }
9009 no_guuid_change:
9010
9011 /*
9012 * Compute authorisation for group/ownership changes.
9013 */
9014 if (chowner || chgroup || clear_suid || clear_sgid) {
9015 if (has_priv_suser) {
9016 KAUTH_DEBUG("ATTR - superuser changing file owner/group, requiring immutability check");
9017 required_action |= KAUTH_VNODE_CHECKIMMUTABLE;
9018 } else {
9019 if (chowner) {
9020 KAUTH_DEBUG("ATTR - ownership change, requiring TAKE_OWNERSHIP");
9021 required_action |= KAUTH_VNODE_TAKE_OWNERSHIP;
9022 }
9023 if (chgroup && !chowner) {
9024 KAUTH_DEBUG("ATTR - group change, requiring WRITE_SECURITY");
9025 required_action |= KAUTH_VNODE_WRITE_SECURITY;
9026 }
9027 }
9028
9029 /*
9030 * clear set-uid and set-gid bits. POSIX only requires this for
9031 * non-privileged processes but we do it even for root.
9032 */
9033 if (VATTR_IS_ACTIVE(vap, va_mode)) {
9034 newmode = vap->va_mode;
9035 } else if (VATTR_IS_SUPPORTED(&ova, va_mode)) {
9036 newmode = ova.va_mode;
9037 } else {
9038 KAUTH_DEBUG("CHOWN - trying to change owner but cannot get mode from filesystem to mask setugid bits");
9039 newmode = 0;
9040 }
9041
9042 /* chown always clears setuid/gid bits. An exception is made for
9043 * setattrlist executed by a root process to set <uid, gid, mode> on a file:
9044 * setattrlist is allowed to set the new mode on the file and change (chown)
9045 * uid/gid.
9046 */
9047 if (newmode & (S_ISUID | S_ISGID)) {
9048 if (!VATTR_IS_ACTIVE(vap, va_mode) || !has_priv_suser) {
9049 KAUTH_DEBUG("CHOWN - masking setugid bits from mode %o to %o",
9050 newmode, newmode & ~(S_ISUID | S_ISGID));
9051 newmode &= ~(S_ISUID | S_ISGID);
9052 }
9053 VATTR_SET(vap, va_mode, newmode);
9054 }
9055 }
9056
9057 /*
9058 * Authorise changes in the ACL.
9059 */
9060 if (VATTR_IS_ACTIVE(vap, va_acl)) {
9061 /* no existing ACL */
9062 if (!VATTR_IS_ACTIVE(&ova, va_acl) || (ova.va_acl == NULL)) {
9063 /* adding an ACL */
9064 if (vap->va_acl != NULL) {
9065 required_action |= KAUTH_VNODE_WRITE_SECURITY;
9066 KAUTH_DEBUG("CHMOD - adding ACL");
9067 }
9068
9069 /* removing an existing ACL */
9070 } else if (vap->va_acl == NULL) {
9071 required_action |= KAUTH_VNODE_WRITE_SECURITY;
9072 KAUTH_DEBUG("CHMOD - removing ACL");
9073
9074 /* updating an existing ACL */
9075 } else {
9076 if (vap->va_acl->acl_entrycount != ova.va_acl->acl_entrycount) {
9077 /* entry count changed, must be different */
9078 required_action |= KAUTH_VNODE_WRITE_SECURITY;
9079 KAUTH_DEBUG("CHMOD - adding/removing ACL entries");
9080 } else if (vap->va_acl->acl_entrycount > 0) {
9081 /* both ACLs have the same ACE count, said count is 1 or more, bitwise compare ACLs */
9082 if (memcmp(&vap->va_acl->acl_ace[0], &ova.va_acl->acl_ace[0],
9083 sizeof(struct kauth_ace) * vap->va_acl->acl_entrycount)) {
9084 required_action |= KAUTH_VNODE_WRITE_SECURITY;
9085 KAUTH_DEBUG("CHMOD - changing ACL entries");
9086 }
9087 }
9088 }
9089 }
9090
9091 /*
9092 * Other attributes that require authorisation.
9093 */
9094 if (VATTR_IS_ACTIVE(vap, va_encoding)) {
9095 required_action |= KAUTH_VNODE_WRITE_ATTRIBUTES;
9096 }
9097
9098 out:
9099 if (VATTR_IS_SUPPORTED(&ova, va_acl) && (ova.va_acl != NULL)) {
9100 kauth_acl_free(ova.va_acl);
9101 }
9102 if (error == 0) {
9103 *actionp = required_action;
9104 }
9105 return error;
9106 }
9107
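/*
 * Typical call sequence (illustrative sketch; 'vp' and 'ctx' are
 * assumed): validate the caller-supplied attributes, authorize the
 * derived actions, then apply the change.
 */
#if 0
struct vnode_attr va;
kauth_action_t action;
int err;

VATTR_INIT(&va);
VATTR_SET(&va, va_mode, 0644);

err = vnode_authattr(vp, &va, &action, ctx);
if (err == 0 && action != 0) {
	err = vnode_authorize(vp, NULLVP, action, ctx);
}
if (err == 0) {
	err = vnode_setattr(vp, &va, ctx);
}
#endif
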
9108 static int
9109 setlocklocal_callback(struct vnode *vp, __unused void *cargs)
9110 {
9111 vnode_lock_spin(vp);
9112 vp->v_flag |= VLOCKLOCAL;
9113 vnode_unlock(vp);
9114
9115 return VNODE_RETURNED;
9116 }
9117
9118 void
9119 vfs_setlocklocal(mount_t mp)
9120 {
9121 mount_lock_spin(mp);
9122 mp->mnt_kern_flag |= MNTK_LOCK_LOCAL;
9123 mount_unlock(mp);
9124
9125 /*
9126 * The number of active vnodes is expected to be
9127 * very small when vfs_setlocklocal is invoked.
9128 */
9129 vnode_iterate(mp, 0, setlocklocal_callback, NULL);
9130 }
9131
9132 void
9133 vfs_setcompoundopen(mount_t mp)
9134 {
9135 mount_lock_spin(mp);
9136 mp->mnt_compound_ops |= COMPOUND_VNOP_OPEN;
9137 mount_unlock(mp);
9138 }
9139
9140 void
9141 vnode_setswapmount(vnode_t vp)
9142 {
9143 mount_lock(vp->v_mount);
9144 vp->v_mount->mnt_kern_flag |= MNTK_SWAP_MOUNT;
9145 mount_unlock(vp->v_mount);
9146 }
9147
9148
9149 int64_t
9150 vnode_getswappin_avail(vnode_t vp)
9151 {
9152 int64_t max_swappin_avail = 0;
9153
9154 mount_lock(vp->v_mount);
9155 if (vp->v_mount->mnt_ioflags & MNT_IOFLAGS_SWAPPIN_SUPPORTED) {
9156 max_swappin_avail = vp->v_mount->mnt_max_swappin_available;
9157 }
9158 mount_unlock(vp->v_mount);
9159
9160 return max_swappin_avail;
9161 }
9162
9163
9164 void
9165 vn_setunionwait(vnode_t vp)
9166 {
9167 vnode_lock_spin(vp);
9168 vp->v_flag |= VISUNION;
9169 vnode_unlock(vp);
9170 }
9171
9172
9173 void
9174 vn_checkunionwait(vnode_t vp)
9175 {
9176 vnode_lock_spin(vp);
9177 while ((vp->v_flag & VISUNION) == VISUNION) {
9178 msleep((caddr_t)&vp->v_flag, &vp->v_lock, 0, 0, 0);
9179 }
9180 vnode_unlock(vp);
9181 }
9182
9183 void
9184 vn_clearunionwait(vnode_t vp, int locked)
9185 {
9186 if (!locked) {
9187 vnode_lock_spin(vp);
9188 }
9189 if ((vp->v_flag & VISUNION) == VISUNION) {
9190 vp->v_flag &= ~VISUNION;
9191 wakeup((caddr_t)&vp->v_flag);
9192 }
9193 if (!locked) {
9194 vnode_unlock(vp);
9195 }
9196 }
9197
9198 /*
9199 * Removes orphaned apple double files during a rmdir
9200 * Works by:
9201 * 1. vnode_suspend().
9202 * 2. Call VNOP_READDIR() till the end of directory is reached.
9203 * 3. Check if the directory entries returned are regular files with names starting with "._". If not, return ENOTEMPTY.
9204 * 4. Continue (2) and (3) till end of directory is reached.
9205 * 5. If all the entries in the directory were files with "._" name, delete all the files.
9206 * 6. vnode_resume()
9207 * 7. If deletion of all files succeeded, call VNOP_RMDIR() again.
9208 */
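
/*
 * For example (file names illustrative): a directory containing only
 * "._photo.jpg" and "._notes.txt" is emptied and the rmdir retried,
 * while one containing "report.txt" or a "._._junk" entry fails with
 * ENOTEMPTY.
 */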
9209
9210 errno_t
9211 rmdir_remove_orphaned_appleDouble(vnode_t vp, vfs_context_t ctx, int * restart_flag)
9212 {
9213 #define UIO_BUFF_SIZE 2048
9214 uio_t auio = NULL;
9215 int eofflag, siz = UIO_BUFF_SIZE, nentries = 0;
9216 int open_flag = 0, full_erase_flag = 0;
9217 char uio_buf[UIO_SIZEOF(1)];
9218 char *rbuf = NULL;
9219 void *dir_pos;
9220 void *dir_end;
9221 struct dirent *dp;
9222 errno_t error;
9223
9224 error = vnode_suspend(vp);
9225
9226 /*
9227 * restart_flag is set so that the calling rmdir sleeps and resets
9228 */
9229 if (error == EBUSY) {
9230 *restart_flag = 1;
9231 }
9232 if (error != 0) {
9233 return error;
9234 }
9235
9236 /*
9237 * set up UIO
9238 */
9239 MALLOC(rbuf, caddr_t, siz, M_TEMP, M_WAITOK);
9240 if (rbuf) {
9241 auio = uio_createwithbuffer(1, 0, UIO_SYSSPACE, UIO_READ,
9242 &uio_buf[0], sizeof(uio_buf));
9243 }
9244 if (!rbuf || !auio) {
9245 error = ENOMEM;
9246 goto outsc;
9247 }
9248
9249 uio_setoffset(auio, 0);
9250
9251 eofflag = 0;
9252
9253 if ((error = VNOP_OPEN(vp, FREAD, ctx))) {
9254 goto outsc;
9255 } else {
9256 open_flag = 1;
9257 }
9258
9259 /*
9260 * First pass checks if all files are appleDouble files.
9261 */
9262
9263 do {
9264 siz = UIO_BUFF_SIZE;
9265 uio_reset(auio, uio_offset(auio), UIO_SYSSPACE, UIO_READ);
9266 uio_addiov(auio, CAST_USER_ADDR_T(rbuf), UIO_BUFF_SIZE);
9267
9268 if ((error = VNOP_READDIR(vp, auio, 0, &eofflag, &nentries, ctx))) {
9269 goto outsc;
9270 }
9271
9272 if (uio_resid(auio) != 0) {
9273 siz -= uio_resid(auio);
9274 }
9275
9276 /*
9277 * Iterate through directory
9278 */
9279 dir_pos = (void*) rbuf;
9280 dir_end = (void*) (rbuf + siz);
9281 dp = (struct dirent*) (dir_pos);
9282
9283 if (dir_pos == dir_end) {
9284 eofflag = 1;
9285 }
9286
9287 while (dir_pos < dir_end) {
9288 /*
9289 * Check for . and .. as well as directories
9290 */
9291 if (dp->d_ino != 0 &&
9292 !((dp->d_namlen == 1 && dp->d_name[0] == '.') ||
9293 (dp->d_namlen == 2 && dp->d_name[0] == '.' && dp->d_name[1] == '.'))) {
9294 /*
9295 * Check for irregular files and ._ files.
9296 * If there is a ._._ file, abort the op.
9297 */
9298 if (dp->d_namlen < 2 ||
9299 strncmp(dp->d_name, "._", 2) ||
9300 (dp->d_namlen >= 4 && !strncmp(&(dp->d_name[2]), "._", 2))) {
9301 error = ENOTEMPTY;
9302 goto outsc;
9303 }
9304 }
9305 dir_pos = (void*) ((uint8_t*)dir_pos + dp->d_reclen);
9306 dp = (struct dirent*)dir_pos;
9307 }
9308
9309 /*
9310 * workaround for HFS/NFS setting eofflag before end of file
9311 */
9312 if (vp->v_tag == VT_HFS && nentries > 2) {
9313 eofflag = 0;
9314 }
9315
9316 if (vp->v_tag == VT_NFS) {
9317 if (eofflag && !full_erase_flag) {
9318 full_erase_flag = 1;
9319 eofflag = 0;
9320 uio_reset(auio, 0, UIO_SYSSPACE, UIO_READ);
9321 } else if (!eofflag && full_erase_flag) {
9322 full_erase_flag = 0;
9323 }
9324 }
9325 } while (!eofflag);
9326 /*
9327 * If we've made it here, all the files in the dir are ._ files.
9328 * We can delete the files even though the node is suspended
9329 * because we are the owner of the files.
9330 */
9331
9332 uio_reset(auio, 0, UIO_SYSSPACE, UIO_READ);
9333 eofflag = 0;
9334 full_erase_flag = 0;
9335
9336 do {
9337 siz = UIO_BUFF_SIZE;
9338 uio_reset(auio, uio_offset(auio), UIO_SYSSPACE, UIO_READ);
9339 uio_addiov(auio, CAST_USER_ADDR_T(rbuf), UIO_BUFF_SIZE);
9340
9341 error = VNOP_READDIR(vp, auio, 0, &eofflag, &nentries, ctx);
9342
9343 if (error != 0) {
9344 goto outsc;
9345 }
9346
9347 if (uio_resid(auio) != 0) {
9348 siz -= uio_resid(auio);
9349 }
9350
9351 /*
9352 * Iterate through directory
9353 */
9354 dir_pos = (void*) rbuf;
9355 dir_end = (void*) (rbuf + siz);
9356 dp = (struct dirent*) dir_pos;
9357
9358 if (dir_pos == dir_end) {
9359 eofflag = 1;
9360 }
9361
9362 while (dir_pos < dir_end) {
9363 /*
9364 * Check for . and .. as well as directories
9365 */
9366 if (dp->d_ino != 0 &&
9367 !((dp->d_namlen == 1 && dp->d_name[0] == '.') ||
9368 (dp->d_namlen == 2 && dp->d_name[0] == '.' && dp->d_name[1] == '.'))
9369 ) {
9370 error = unlink1(ctx, vp,
9371 CAST_USER_ADDR_T(dp->d_name), UIO_SYSSPACE,
9372 VNODE_REMOVE_SKIP_NAMESPACE_EVENT |
9373 VNODE_REMOVE_NO_AUDIT_PATH);
9374
9375 if (error && error != ENOENT) {
9376 goto outsc;
9377 }
9378 }
9379 dir_pos = (void*) ((uint8_t*)dir_pos + dp->d_reclen);
9380 dp = (struct dirent*)dir_pos;
9381 }
9382
9383 /*
9384 * workaround for HFS/NFS setting eofflag before end of file
9385 */
9386 if (vp->v_tag == VT_HFS && nentries > 2) {
9387 eofflag = 0;
9388 }
9389
9390 if (vp->v_tag == VT_NFS) {
9391 if (eofflag && !full_erase_flag) {
9392 full_erase_flag = 1;
9393 eofflag = 0;
9394 uio_reset(auio, 0, UIO_SYSSPACE, UIO_READ);
9395 } else if (!eofflag && full_erase_flag) {
9396 full_erase_flag = 0;
9397 }
9398 }
9399 } while (!eofflag);
9400
9401
9402 error = 0;
9403
9404 outsc:
9405 if (open_flag) {
9406 VNOP_CLOSE(vp, FREAD, ctx);
9407 }
9408
9409 if (auio) {
9410 uio_free(auio);
9411 }
9412 FREE(rbuf, M_TEMP);
9413
9414 vnode_resume(vp);
9415
9416
9417 return error;
9418 }
9419
9420
9421 void
9422 lock_vnode_and_post(vnode_t vp, int kevent_num)
9423 {
9424 /* Only take the lock if there's something there! */
9425 if (vp->v_knotes.slh_first != NULL) {
9426 vnode_lock(vp);
9427 KNOTE(&vp->v_knotes, kevent_num);
9428 vnode_unlock(vp);
9429 }
9430 }
9431
9432 void panic_print_vnodes(void);
9433
9434 /* define PANIC_PRINTS_VNODES only if investigation is required. */
9435 #ifdef PANIC_PRINTS_VNODES
9436
9437 static const char *
9438 __vtype(uint16_t vtype)
9439 {
9440 switch (vtype) {
9441 case VREG:
9442 return "R";
9443 case VDIR:
9444 return "D";
9445 case VBLK:
9446 return "B";
9447 case VCHR:
9448 return "C";
9449 case VLNK:
9450 return "L";
9451 case VSOCK:
9452 return "S";
9453 case VFIFO:
9454 return "F";
9455 case VBAD:
9456 return "x";
9457 case VSTR:
9458 return "T";
9459 case VCPLX:
9460 return "X";
9461 default:
9462 return "?";
9463 }
9464 }
9465
9466 /*
9467 * build a path from the bottom up
9468 * NOTE: called from the panic path - no alloc'ing of memory and no locks!
9469 */
9470 static char *
9471 __vpath(vnode_t vp, char *str, int len, int depth)
9472 {
9473 int vnm_len;
9474 const char *src;
9475 char *dst;
9476
9477 if (len <= 0) {
9478 return str;
9479 }
9480 /* str + len is the start of the string we created */
9481 if (!vp->v_name) {
9482 return str + len;
9483 }
9484
9485 /* follow mount vnodes to get the full path */
9486 if ((vp->v_flag & VROOT)) {
9487 if (vp->v_mount != NULL && vp->v_mount->mnt_vnodecovered) {
9488 return __vpath(vp->v_mount->mnt_vnodecovered,
9489 str, len, depth + 1);
9490 }
9491 return str + len;
9492 }
9493
9494 src = vp->v_name;
9495 vnm_len = strlen(src);
9496 if (vnm_len > len) {
9497 /* truncate the name to fit in the string */
9498 src += (vnm_len - len);
9499 vnm_len = len;
9500 }
9501
9502 /* start from the back and copy just characters (no NULLs) */
9503
9504 /* this will chop off leaf path (file) names */
9505 if (depth > 0) {
9506 dst = str + len - vnm_len;
9507 memcpy(dst, src, vnm_len);
9508 len -= vnm_len;
9509 } else {
9510 dst = str + len;
9511 }
9512
9513 if (vp->v_parent && len > 1) {
9514 /* follow parents up the chain */
9515 len--;
9516 *(dst - 1) = '/';
9517 return __vpath(vp->v_parent, str, len, depth + 1);
9518 }
9519
9520 return dst;
9521 }
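
/*
 * Example (illustrative): called at depth 0 on a vnode named "bin"
 * whose parent chain is "local" -> "usr", the recursion fills 'str'
 * from the tail and returns a pointer at which the string reads
 * "usr/local/"; the leaf name itself is deliberately chopped at
 * depth 0.
 */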
9522
9523 #define SANE_VNODE_PRINT_LIMIT 5000
9524 void
9525 panic_print_vnodes(void)
9526 {
9527 mount_t mnt;
9528 vnode_t vp;
9529 int nvnodes = 0;
9530 const char *type;
9531 char *nm;
9532 char vname[257];
9533
9534 paniclog_append_noflush("\n***** VNODES *****\n"
9535 "TYPE UREF ICNT PATH\n");
9536
9537 /* NULL-terminate the path name */
9538 vname[sizeof(vname) - 1] = '\0';
9539
9540 /*
9541 * iterate all vnodelist items in all mounts (mntlist) -> mnt_vnodelist
9542 */
9543 TAILQ_FOREACH(mnt, &mountlist, mnt_list) {
9544 if (!ml_validate_nofault((vm_offset_t)mnt, sizeof(mount_t))) {
9545 paniclog_append_noflush("Unable to iterate the mount list %p - encountered an invalid mount pointer %p \n",
9546 &mountlist, mnt);
9547 break;
9548 }
9549
9550 TAILQ_FOREACH(vp, &mnt->mnt_vnodelist, v_mntvnodes) {
9551 if (!ml_validate_nofault((vm_offset_t)vp, sizeof(vnode_t))) {
9552 paniclog_append_noflush("Unable to iterate the vnode list %p - encountered an invalid vnode pointer %p \n",
9553 &mnt->mnt_vnodelist, vp);
9554 break;
9555 }
9556
9557 if (++nvnodes > SANE_VNODE_PRINT_LIMIT) {
9558 return;
9559 }
9560 type = __vtype(vp->v_type);
9561 nm = __vpath(vp, vname, sizeof(vname) - 1, 0);
9562 paniclog_append_noflush("%s %0d %0d %s\n",
9563 type, vp->v_usecount, vp->v_iocount, nm);
9564 }
9565 }
9566 }
9567
9568 #else /* !PANIC_PRINTS_VNODES */
9569 void
9570 panic_print_vnodes(void)
9571 {
9572 return;
9573 }
9574 #endif
9575
9576
9577 #ifdef JOE_DEBUG
9578 static void
9579 record_vp(vnode_t vp, int count)
9580 {
9581 struct uthread *ut;
9582
9583 #if CONFIG_TRIGGERS
9584 if (vp->v_resolve) {
9585 return;
9586 }
9587 #endif
9588 if ((vp->v_flag & VSYSTEM)) {
9589 return;
9590 }
9591
9592 ut = get_bsdthread_info(current_thread());
9593 ut->uu_iocount += count;
9594
9595 if (count == 1) {
9596 if (ut->uu_vpindex < 32) {
9597 OSBacktrace((void **)&ut->uu_pcs[ut->uu_vpindex][0], 10);
9598
9599 ut->uu_vps[ut->uu_vpindex] = vp;
9600 ut->uu_vpindex++;
9601 }
9602 }
9603 }
9604 #endif
9605
9606
9607 #if CONFIG_TRIGGERS
9608
9609 #define TRIG_DEBUG 0
9610
9611 #if TRIG_DEBUG
9612 #define TRIG_LOG(...) do { printf("%s: ", __FUNCTION__); printf(__VA_ARGS__); } while (0)
9613 #else
9614 #define TRIG_LOG(...)
9615 #endif
9616
9617 /*
9618 * Resolver result functions
9619 */
9620
9621 resolver_result_t
9622 vfs_resolver_result(uint32_t seq, enum resolver_status stat, int aux)
9623 {
9624 /*
9625 * |<--- 32 --->|<--- 28 --->|<- 4 ->|
9626 * sequence auxiliary status
9627 */
9628 return (((uint64_t)seq) << 32) |
9629 (((uint64_t)(aux & 0x0fffffff)) << 4) |
9630 (uint64_t)(stat & 0x0000000F);
9631 }
9632
9633 enum resolver_status
9634 vfs_resolver_status(resolver_result_t result)
9635 {
9636 /* lower 4 bits is status */
9637 return result & 0x0000000F;
9638 }
9639
9640 uint32_t
9641 vfs_resolver_sequence(resolver_result_t result)
9642 {
9643 /* upper 32 bits is sequence */
9644 return (uint32_t)(result >> 32);
9645 }
9646
9647 int
9648 vfs_resolver_auxiliary(resolver_result_t result)
9649 {
9650 /* 28 bits of auxiliary */
9651 return (int)(((uint32_t)(result & 0xFFFFFFF0)) >> 4);
9652 }
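
/*
 * Round-trip sketch (illustrative): the packing above guarantees that
 * for any sequence number, 4-bit status and 28-bit auxiliary value:
 */
#if 0
resolver_result_t r = vfs_resolver_result(seq, stat, aux);

assert(vfs_resolver_sequence(r) == seq);
assert(vfs_resolver_status(r) == (stat & 0x0000000F));
assert(vfs_resolver_auxiliary(r) == (aux & 0x0fffffff));
#endif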
9653
9654 /*
9655 * SPI
9656 * Call in for resolvers to update vnode trigger state
9657 */
9658 int
9659 vnode_trigger_update(vnode_t vp, resolver_result_t result)
9660 {
9661 vnode_resolve_t rp;
9662 uint32_t seq;
9663 enum resolver_status stat;
9664
9665 if (vp->v_resolve == NULL) {
9666 return EINVAL;
9667 }
9668
9669 stat = vfs_resolver_status(result);
9670 seq = vfs_resolver_sequence(result);
9671
9672 if ((stat != RESOLVER_RESOLVED) && (stat != RESOLVER_UNRESOLVED)) {
9673 return EINVAL;
9674 }
9675
9676 rp = vp->v_resolve;
9677 lck_mtx_lock(&rp->vr_lock);
9678
9679 if (seq > rp->vr_lastseq) {
9680 if (stat == RESOLVER_RESOLVED) {
9681 rp->vr_flags |= VNT_RESOLVED;
9682 } else {
9683 rp->vr_flags &= ~VNT_RESOLVED;
9684 }
9685
9686 rp->vr_lastseq = seq;
9687 }
9688
9689 lck_mtx_unlock(&rp->vr_lock);
9690
9691 return 0;
9692 }
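
/*
 * Sketch of a resolver completing asynchronously (illustrative; 'vp'
 * and 'my_seq' are hypothetical names for the resolver's own state):
 */
#if 0
resolver_result_t result;

result = vfs_resolver_result(my_seq, RESOLVER_RESOLVED, 0);
(void) vnode_trigger_update(vp, result);
#endif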
9693
9694 static int
9695 vnode_resolver_attach(vnode_t vp, vnode_resolve_t rp, boolean_t ref)
9696 {
9697 int error;
9698
9699 vnode_lock_spin(vp);
9700 if (vp->v_resolve != NULL) {
9701 vnode_unlock(vp);
9702 return EINVAL;
9703 } else {
9704 vp->v_resolve = rp;
9705 }
9706 vnode_unlock(vp);
9707
9708 if (ref) {
9709 error = vnode_ref_ext(vp, O_EVTONLY, VNODE_REF_FORCE);
9710 if (error != 0) {
9711 panic("VNODE_REF_FORCE didn't help...");
9712 }
9713 }
9714
9715 return 0;
9716 }
9717
9718 /*
9719 * VFS internal interfaces for vnode triggers
9720 *
9721 * vnode must already have an io count on entry
9722 * v_resolve is stable when io count is non-zero
9723 */
9724 static int
9725 vnode_resolver_create(mount_t mp, vnode_t vp, struct vnode_trigger_param *tinfo, boolean_t external)
9726 {
9727 vnode_resolve_t rp;
9728 int result;
9729 char byte;
9730
9731 #if 1
9732 /* minimum pointer test (debugging) */
9733 if (tinfo->vnt_data) {
9734 byte = *((char *)tinfo->vnt_data);
9735 }
9736 #endif
9737 MALLOC(rp, vnode_resolve_t, sizeof(*rp), M_TEMP, M_WAITOK);
9738 if (rp == NULL) {
9739 return ENOMEM;
9740 }
9741
9742 lck_mtx_init(&rp->vr_lock, trigger_vnode_lck_grp, trigger_vnode_lck_attr);
9743
9744 rp->vr_resolve_func = tinfo->vnt_resolve_func;
9745 rp->vr_unresolve_func = tinfo->vnt_unresolve_func;
9746 rp->vr_rearm_func = tinfo->vnt_rearm_func;
9747 rp->vr_reclaim_func = tinfo->vnt_reclaim_func;
9748 rp->vr_data = tinfo->vnt_data;
9749 rp->vr_lastseq = 0;
9750 rp->vr_flags = tinfo->vnt_flags & VNT_VALID_MASK;
9751 if (external) {
9752 rp->vr_flags |= VNT_EXTERNAL;
9753 }
9754
9755 result = vnode_resolver_attach(vp, rp, external);
9756 if (result != 0) {
9757 goto out;
9758 }
9759
9760 if (mp) {
9761 OSAddAtomic(1, &mp->mnt_numtriggers);
9762 }
9763
9764 return result;
9765
9766 out:
9767 FREE(rp, M_TEMP);
9768 return result;
9769 }
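
/*
 * Sketch of the trigger info a resolver supplies (illustrative; the
 * my_* callbacks and my_state are hypothetical):
 */
#if 0
struct vnode_trigger_param vtp = {
	.vnt_resolve_func = my_resolve,
	.vnt_unresolve_func = my_unresolve,
	.vnt_rearm_func = my_rearm,
	.vnt_reclaim_func = my_reclaim,
	.vnt_data = my_state,
	.vnt_flags = VNT_AUTO_REARM,
};
#endif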
9770
9771 static void
9772 vnode_resolver_release(vnode_resolve_t rp)
9773 {
9774 /*
9775 * Give them a chance to free any private data
9776 */
9777 if (rp->vr_data && rp->vr_reclaim_func) {
9778 rp->vr_reclaim_func(NULLVP, rp->vr_data);
9779 }
9780
9781 lck_mtx_destroy(&rp->vr_lock, trigger_vnode_lck_grp);
9782 FREE(rp, M_TEMP);
9783 }
9784
9785 /* Called after the vnode has been drained */
9786 static void
9787 vnode_resolver_detach(vnode_t vp)
9788 {
9789 vnode_resolve_t rp;
9790 mount_t mp;
9791
9792 mp = vnode_mount(vp);
9793
9794 vnode_lock(vp);
9795 rp = vp->v_resolve;
9796 vp->v_resolve = NULL;
9797 vnode_unlock(vp);
9798
9799 if ((rp->vr_flags & VNT_EXTERNAL) != 0) {
9800 vnode_rele_ext(vp, O_EVTONLY, 1);
9801 }
9802
9803 vnode_resolver_release(rp);
9804
9805 /* Keep count of active trigger vnodes per mount */
9806 OSAddAtomic(-1, &mp->mnt_numtriggers);
9807 }
9808
9809 __private_extern__
9810 void
9811 vnode_trigger_rearm(vnode_t vp, vfs_context_t ctx)
9812 {
9813 vnode_resolve_t rp;
9814 resolver_result_t result;
9815 enum resolver_status status;
9816 uint32_t seq;
9817
9818 if ((vp->v_resolve == NULL) ||
9819 (vp->v_resolve->vr_rearm_func == NULL) ||
9820 (vp->v_resolve->vr_flags & VNT_AUTO_REARM) == 0) {
9821 return;
9822 }
9823
9824 rp = vp->v_resolve;
9825 lck_mtx_lock(&rp->vr_lock);
9826
9827 /*
9828 * Check if VFS initiated this unmount. If so, we'll catch it after the unresolve completes.
9829 */
9830 if (rp->vr_flags & VNT_VFS_UNMOUNTED) {
9831 lck_mtx_unlock(&rp->vr_lock);
9832 return;
9833 }
9834
9835 /* Check if this vnode is already armed */
9836 if ((rp->vr_flags & VNT_RESOLVED) == 0) {
9837 lck_mtx_unlock(&rp->vr_lock);
9838 return;
9839 }
9840
9841 lck_mtx_unlock(&rp->vr_lock);
9842
9843 result = rp->vr_rearm_func(vp, 0, rp->vr_data, ctx);
9844 status = vfs_resolver_status(result);
9845 seq = vfs_resolver_sequence(result);
9846
9847 lck_mtx_lock(&rp->vr_lock);
9848 if (seq > rp->vr_lastseq) {
9849 if (status == RESOLVER_UNRESOLVED) {
9850 rp->vr_flags &= ~VNT_RESOLVED;
9851 }
9852 rp->vr_lastseq = seq;
9853 }
9854 lck_mtx_unlock(&rp->vr_lock);
9855 }
9856
9857 __private_extern__
9858 int
9859 vnode_trigger_resolve(vnode_t vp, struct nameidata *ndp, vfs_context_t ctx)
9860 {
9861 vnode_resolve_t rp;
9862 enum path_operation op;
9863 resolver_result_t result;
9864 enum resolver_status status;
9865 uint32_t seq;
9866
9867 /* Only trigger on topmost vnodes */
9868 if ((vp->v_resolve == NULL) ||
9869 (vp->v_resolve->vr_resolve_func == NULL) ||
9870 (vp->v_mountedhere != NULL)) {
9871 return 0;
9872 }
9873
9874 rp = vp->v_resolve;
9875 lck_mtx_lock(&rp->vr_lock);
9876
9877 /* Check if this vnode is already resolved */
9878 if (rp->vr_flags & VNT_RESOLVED) {
9879 lck_mtx_unlock(&rp->vr_lock);
9880 return 0;
9881 }
9882
9883 lck_mtx_unlock(&rp->vr_lock);
9884
9885 #if CONFIG_MACF
9886 int rv = mac_vnode_check_trigger_resolve(ctx, vp, &ndp->ni_cnd);
9887 if (rv != 0) {
9888 return rv;
9889 }
9890 #endif
9891
9892 /*
9893 * XXX
9894 * assumes that the resolver will not access this trigger vnode (otherwise the kernel will deadlock)
9895 * is there any way to know this???
9896 * there can also be other legitimate lookups in parallel
9897 *
9898 * XXX - should we call this on a separate thread with a timeout?
9899 *
9900 * XXX - should we use ISLASTCN to pick the op value??? Perhaps only leaves should
9901 * get the richer set and non-leaves should get generic OP_LOOKUP? TBD
9902 */
9903 op = (ndp->ni_op < OP_MAXOP) ? ndp->ni_op: OP_LOOKUP;
9904
9905 result = rp->vr_resolve_func(vp, &ndp->ni_cnd, op, 0, rp->vr_data, ctx);
9906 status = vfs_resolver_status(result);
9907 seq = vfs_resolver_sequence(result);
9908
9909 lck_mtx_lock(&rp->vr_lock);
9910 if (seq > rp->vr_lastseq) {
9911 if (status == RESOLVER_RESOLVED) {
9912 rp->vr_flags |= VNT_RESOLVED;
9913 }
9914 rp->vr_lastseq = seq;
9915 }
9916 lck_mtx_unlock(&rp->vr_lock);
9917
9918 /* On resolver errors, propagate the error back up */
9919 return status == RESOLVER_ERROR ? vfs_resolver_auxiliary(result) : 0;
9920 }
9921
9922 static int
9923 vnode_trigger_unresolve(vnode_t vp, int flags, vfs_context_t ctx)
9924 {
9925 vnode_resolve_t rp;
9926 resolver_result_t result;
9927 enum resolver_status status;
9928 uint32_t seq;
9929
9930 if ((vp->v_resolve == NULL) || (vp->v_resolve->vr_unresolve_func == NULL)) {
9931 return 0;
9932 }
9933
9934 rp = vp->v_resolve;
9935 lck_mtx_lock(&rp->vr_lock);
9936
9937 /* Check if this vnode is already resolved */
9938 if ((rp->vr_flags & VNT_RESOLVED) == 0) {
9939 printf("vnode_trigger_unresolve: not currently resolved\n");
9940 lck_mtx_unlock(&rp->vr_lock);
9941 return 0;
9942 }
9943
9944 rp->vr_flags |= VNT_VFS_UNMOUNTED;
9945
9946 lck_mtx_unlock(&rp->vr_lock);
9947
9948 /*
9949 * XXX
9950 * assumes that the resolver will not access this trigger vnode (otherwise the kernel will deadlock)
9951 * there can also be other legitimate lookups in parallel
9952 *
9953 * XXX - should we call this on a separate thread with a timeout?
9954 */
9955
9956 result = rp->vr_unresolve_func(vp, flags, rp->vr_data, ctx);
9957 status = vfs_resolver_status(result);
9958 seq = vfs_resolver_sequence(result);
9959
9960 lck_mtx_lock(&rp->vr_lock);
9961 if (seq > rp->vr_lastseq) {
9962 if (status == RESOLVER_UNRESOLVED) {
9963 rp->vr_flags &= ~VNT_RESOLVED;
9964 }
9965 rp->vr_lastseq = seq;
9966 }
9967 rp->vr_flags &= ~VNT_VFS_UNMOUNTED;
9968 lck_mtx_unlock(&rp->vr_lock);
9969
9970 /* On resolver errors, propagate the error back up */
9971 return status == RESOLVER_ERROR ? vfs_resolver_auxiliary(result) : 0;
9972 }
9973
9974 static int
9975 triggerisdescendant(mount_t mp, mount_t rmp)
9976 {
9977 int match = FALSE;
9978
9979 /*
9980 * walk up vnode covered chain looking for a match
9981 */
9982 name_cache_lock_shared();
9983
9984 while (1) {
9985 vnode_t vp;
9986
9987 /* did we encounter "/" ? */
9988 if (mp->mnt_flag & MNT_ROOTFS) {
9989 break;
9990 }
9991
9992 vp = mp->mnt_vnodecovered;
9993 if (vp == NULLVP) {
9994 break;
9995 }
9996
9997 mp = vp->v_mount;
9998 if (mp == rmp) {
9999 match = TRUE;
10000 break;
10001 }
10002 }
10003
10004 name_cache_unlock();
10005
10006 return match;
10007 }
10008
10009 struct trigger_unmount_info {
10010 vfs_context_t ctx;
10011 mount_t top_mp;
10012 vnode_t trigger_vp;
10013 mount_t trigger_mp;
10014 uint32_t trigger_vid;
10015 int flags;
10016 };
10017
10018 static int
10019 trigger_unmount_callback(mount_t mp, void * arg)
10020 {
10021 struct trigger_unmount_info * infop = (struct trigger_unmount_info *)arg;
10022 boolean_t mountedtrigger = FALSE;
10023
10024 /*
10025 * When we encounter the top level mount, we're done
10026 */
10027 if (mp == infop->top_mp) {
10028 return VFS_RETURNED_DONE;
10029 }
10030
10031 if ((mp->mnt_vnodecovered == NULL) ||
10032 (vnode_getwithref(mp->mnt_vnodecovered) != 0)) {
10033 return VFS_RETURNED;
10034 }
10035
10036 if ((mp->mnt_vnodecovered->v_mountedhere == mp) &&
10037 (mp->mnt_vnodecovered->v_resolve != NULL) &&
10038 (mp->mnt_vnodecovered->v_resolve->vr_flags & VNT_RESOLVED)) {
10039 mountedtrigger = TRUE;
10040 }
10041 vnode_put(mp->mnt_vnodecovered);
10042
10043 /*
10044 * When we encounter a mounted trigger, check if it's under the top level mount
10045 */
10046 if (!mountedtrigger || !triggerisdescendant(mp, infop->top_mp)) {
10047 return VFS_RETURNED;
10048 }
10049
10050 /*
10051 * Process any pending nested mount (now that it's not referenced)
10052 */
10053 if ((infop->trigger_vp != NULLVP) &&
10054 (vnode_getwithvid(infop->trigger_vp, infop->trigger_vid) == 0)) {
10055 vnode_t vp = infop->trigger_vp;
10056 int error;
10057
10058 infop->trigger_vp = NULLVP;
10059
10060 if (mp == vp->v_mountedhere) {
10061 vnode_put(vp);
10062 printf("trigger_unmount_callback: unexpected match '%s'\n",
10063 mp->mnt_vfsstat.f_mntonname);
10064 return VFS_RETURNED;
10065 }
10066 if (infop->trigger_mp != vp->v_mountedhere) {
10067 vnode_put(vp);
10068 printf("trigger_unmount_callback: trigger mnt changed! (%p != %p)\n",
10069 infop->trigger_mp, vp->v_mountedhere);
10070 goto savenext;
10071 }
10072
10073 error = vnode_trigger_unresolve(vp, infop->flags, infop->ctx);
10074 vnode_put(vp);
10075 if (error) {
10076 printf("unresolving: '%s', err %d\n",
10077 vp->v_mountedhere ? vp->v_mountedhere->mnt_vfsstat.f_mntonname :
10078 "???", error);
10079 return VFS_RETURNED_DONE; /* stop iteration on errors */
10080 }
10081 }
10082 savenext:
10083 /*
10084 * We can't call resolver here since we hold a mount iter
10085 * ref on mp so save its covered vp for later processing
10086 */
10087 infop->trigger_vp = mp->mnt_vnodecovered;
10088 if ((infop->trigger_vp != NULLVP) &&
10089 (vnode_getwithref(infop->trigger_vp) == 0)) {
10090 if (infop->trigger_vp->v_mountedhere == mp) {
10091 infop->trigger_vid = infop->trigger_vp->v_id;
10092 infop->trigger_mp = mp;
10093 }
10094 vnode_put(infop->trigger_vp);
10095 }
10096
10097 return VFS_RETURNED;
10098 }

/*
 * Attempt to unmount any trigger mounts nested underneath a mount.
 * This is a best-effort attempt and no retries are performed here.
 *
 * Note: mp->mnt_rwlock is held exclusively on entry (so be careful)
 */
__private_extern__
void
vfs_nested_trigger_unmounts(mount_t mp, int flags, vfs_context_t ctx)
{
	struct trigger_unmount_info info;

	/* Must have trigger vnodes */
	if (mp->mnt_numtriggers == 0) {
		return;
	}
	/* Avoid recursive requests (by checking covered vnode) */
	if ((mp->mnt_vnodecovered != NULL) &&
	    (vnode_getwithref(mp->mnt_vnodecovered) == 0)) {
		boolean_t recursive = FALSE;

		if ((mp->mnt_vnodecovered->v_mountedhere == mp) &&
		    (mp->mnt_vnodecovered->v_resolve != NULL) &&
		    (mp->mnt_vnodecovered->v_resolve->vr_flags & VNT_VFS_UNMOUNTED)) {
			recursive = TRUE;
		}
		vnode_put(mp->mnt_vnodecovered);
		if (recursive) {
			return;
		}
	}

	/*
	 * Attempt to unmount any nested trigger mounts (best effort)
	 */
	info.ctx = ctx;
	info.top_mp = mp;
	info.trigger_vp = NULLVP;
	info.trigger_vid = 0;
	info.trigger_mp = NULL;
	info.flags = flags;

	(void) vfs_iterate(VFS_ITERATE_TAIL_FIRST, trigger_unmount_callback, &info);

	/*
	 * Process the remaining nested mount (now that it's not referenced)
	 */
	if ((info.trigger_vp != NULLVP) &&
	    (vnode_getwithvid(info.trigger_vp, info.trigger_vid) == 0)) {
		vnode_t vp = info.trigger_vp;

		if (info.trigger_mp == vp->v_mountedhere) {
			(void) vnode_trigger_unresolve(vp, flags, ctx);
		}
		vnode_put(vp);
	}
}
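
/*
 * Illustrative sketch (not part of this file): how an unmount path might
 * drive vfs_nested_trigger_unmounts(). In practice dounmount() already
 * holds mnt_rwlock exclusively at this point; the standalone locking
 * below only makes that entry condition explicit, and
 * example_unmount_prep() is hypothetical.
 */
#if 0 /* example only */
static void
example_unmount_prep(mount_t mp, int flags, vfs_context_t ctx)
{
	lck_rw_lock_exclusive(&mp->mnt_rwlock);
	vfs_nested_trigger_unmounts(mp, flags, ctx);
	lck_rw_done(&mp->mnt_rwlock);
}
#endif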

int
vfs_addtrigger(mount_t mp, const char *relpath, struct vnode_trigger_info *vtip, vfs_context_t ctx)
{
	struct nameidata nd;
	int res;
	vnode_t rvp, vp;
	struct vnode_trigger_param vtp;

	/*
	 * Must be called from a trigger callback, during which mnt_rwlock is held
	 */
	lck_rw_assert(&mp->mnt_rwlock, LCK_RW_ASSERT_HELD);

	TRIG_LOG("Adding trigger at %s\n", relpath);
	TRIG_LOG("Trying VFS_ROOT\n");

	/*
	 * We do a lookup starting at the root of the mountpoint, unwilling
	 * to cross into other mountpoints.
	 */
	res = VFS_ROOT(mp, &rvp, ctx);
	if (res != 0) {
		goto out;
	}

	TRIG_LOG("Trying namei\n");

	NDINIT(&nd, LOOKUP, OP_LOOKUP, USEDVP | NOCROSSMOUNT | FOLLOW, UIO_SYSSPACE,
	    CAST_USER_ADDR_T(relpath), ctx);
	nd.ni_dvp = rvp;
	res = namei(&nd);
	if (res != 0) {
		vnode_put(rvp);
		goto out;
	}

	vp = nd.ni_vp;
	nameidone(&nd);
	vnode_put(rvp);

	TRIG_LOG("Trying vnode_resolver_create()\n");

	/*
	 * Set up the trigger parameter blob. vnode_create() takes a larger
	 * structure with creation info, and we need something different for
	 * this case; one layout had to win, and vnode_create()'s did.
	 */
	bzero(&vtp, sizeof(vtp));
	vtp.vnt_resolve_func = vtip->vti_resolve_func;
	vtp.vnt_unresolve_func = vtip->vti_unresolve_func;
	vtp.vnt_rearm_func = vtip->vti_rearm_func;
	vtp.vnt_reclaim_func = vtip->vti_reclaim_func;
	vtp.vnt_data = vtip->vti_data;
	vtp.vnt_flags = vtip->vti_flags;

	res = vnode_resolver_create(mp, vp, &vtp, TRUE);
	vnode_put(vp);
out:
	TRIG_LOG("Returning %d\n", res);
	return res;
}
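
/*
 * Illustrative sketch (not part of this file): how a trigger-aware
 * filesystem might register a nested trigger from within one of its own
 * trigger callbacks (where mnt_rwlock is held, satisfying the assertion
 * above). example_resolve, example_unresolve, and the "host/share"
 * relative path are hypothetical.
 */
#if 0 /* example only */
static int
example_add_nested_trigger(mount_t mp, vfs_context_t ctx)
{
	struct vnode_trigger_info vti;

	bzero(&vti, sizeof(vti));
	vti.vti_resolve_func = example_resolve;         /* hypothetical */
	vti.vti_unresolve_func = example_unresolve;     /* hypothetical */
	vti.vti_data = NULL;
	vti.vti_flags = 0;

	/* relpath is looked up from the root of mp, without crossing mounts */
	return vfs_addtrigger(mp, "host/share", &vti, ctx);
}
#endif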

#endif /* CONFIG_TRIGGERS */

vm_offset_t
kdebug_vnode(vnode_t vp)
{
	return VM_KERNEL_ADDRPERM(vp);
}

static int flush_cache_on_write = 0;
SYSCTL_INT(_kern, OID_AUTO, flush_cache_on_write,
    CTLFLAG_RW | CTLFLAG_LOCKED, &flush_cache_on_write, 0,
    "always flush the drive cache on writes to uncached files");

int
vnode_should_flush_after_write(vnode_t vp, int ioflag)
{
	return flush_cache_on_write
	       && (ISSET(ioflag, IO_NOCACHE) || vnode_isnocache(vp));
}
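
/*
 * Illustrative sketch (not part of this file): how a filesystem's write
 * path might honor this policy by flushing the underlying device's cache
 * after an uncached write. devvp (the device vnode) is hypothetical;
 * DKIOCSYNCHRONIZECACHE is the standard <sys/disk.h> ioctl for this.
 */
#if 0 /* example only */
	if (vnode_should_flush_after_write(vp, ioflag)) {
		(void) VNOP_IOCTL(devvp, DKIOCSYNCHRONIZECACHE,
		    (caddr_t)NULL, FWRITE, vfs_context_current());
	}
#endif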

/*
 * sysctl for use by disk I/O tracing tools to get the list of existing
 * vnodes' paths
 */

struct vnode_trace_paths_context {
	uint64_t count;
	long path[MAXPATHLEN / sizeof(long) + 1]; /* + 1 in case sizeof(long) does not divide MAXPATHLEN */
};

static int
vnode_trace_path_callback(struct vnode *vp, void *arg)
{
	int len, rv;
	struct vnode_trace_paths_context *ctx;

	ctx = arg;

	len = sizeof(ctx->path);
	rv = vn_getpath(vp, (char *)ctx->path, &len);
	/* vn_getpath() NUL-terminates, and len includes the NUL */

	if (!rv) {
		kdebug_vfs_lookup(ctx->path, len, vp,
		    KDBG_VFS_LOOKUP_FLAG_LOOKUP | KDBG_VFS_LOOKUP_FLAG_NOPROCFILT);

		if (++(ctx->count) == 1000) {
			thread_yield_to_preemption();
			ctx->count = 0;
		}
	}

	return VNODE_RETURNED;
}

static int
vfs_trace_paths_callback(mount_t mp, void *arg)
{
	if (mp->mnt_flag & MNT_LOCAL) {
		vnode_iterate(mp, VNODE_ITERATE_ALL, vnode_trace_path_callback, arg);
	}

	return VFS_RETURNED;
}

static int
sysctl_vfs_trace_paths SYSCTL_HANDLER_ARGS
{
	struct vnode_trace_paths_context ctx;

	(void)oidp;
	(void)arg1;
	(void)arg2;
	(void)req;

	if (!kauth_cred_issuser(kauth_cred_get())) {
		return EPERM;
	}

	if (!kdebug_enable || !kdebug_debugid_enabled(VFS_LOOKUP)) {
		return EINVAL;
	}

	bzero(&ctx, sizeof(struct vnode_trace_paths_context));

	vfs_iterate(0, vfs_trace_paths_callback, &ctx);

	return 0;
}

SYSCTL_PROC(_vfs_generic, OID_AUTO, trace_paths, CTLFLAG_RD | CTLFLAG_LOCKED | CTLFLAG_MASKED, NULL, 0, &sysctl_vfs_trace_paths, "-", "trace_paths");
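
/*
 * Illustrative sketch (not part of this file): because the handler above
 * is read-only (CTLFLAG_RD) and masked from sysctl listings
 * (CTLFLAG_MASKED), a privileged userland tool triggers the iteration
 * simply by reading the OID by name, after enabling kdebug tracing for
 * VFS_LOOKUP. Error handling is omitted.
 */
#if 0 /* example only; userland code */
#include <sys/sysctl.h>

static int
trace_all_vnode_paths(void)
{
	/* the read itself runs the vnode path-tracing pass */
	return sysctlbyname("vfs.generic.trace_paths", NULL, NULL, NULL, 0);
}
#endif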