/*
 * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1989, 1993
 *    The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *    This product includes software developed by the University of
 *    California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *    @(#)vfs_subr.c  8.31 (Berkeley) 5/26/95
 */

/*
 * External virtual filesystem routines
 */

#undef  DIAGNOSTIC
#define DIAGNOSTIC 1

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc_internal.h>
#include <sys/kauth.h>
#include <sys/mount_internal.h>
#include <sys/time.h>
#include <sys/lock.h>
#include <sys/vnode_internal.h>
#include <sys/stat.h>
#include <sys/namei.h>
#include <sys/ucred.h>
#include <sys/buf_internal.h>
#include <sys/errno.h>
#include <sys/malloc.h>
#include <sys/domain.h>
#include <sys/mbuf.h>
#include <sys/syslog.h>
#include <sys/ubc_internal.h>
#include <sys/vm.h>
#include <sys/sysctl.h>
#include <sys/filedesc.h>
#include <sys/event.h>
#include <sys/kdebug.h>
#include <sys/kauth.h>
#include <sys/user.h>
#include <miscfs/fifofs/fifo.h>

#include <string.h>
#include <machine/spl.h>


#include <kern/assert.h>

#include <miscfs/specfs/specdev.h>

#include <mach/mach_types.h>
#include <mach/memory_object_types.h>

extern lck_grp_t *vnode_lck_grp;
extern lck_attr_t *vnode_lck_attr;


extern lck_mtx_t *mnt_list_mtx_lock;

enum vtype iftovt_tab[16] = {
    VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
    VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
};
int vttoif_tab[9] = {
    0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
    S_IFSOCK, S_IFIFO, S_IFMT,
};

extern int ubc_isinuse_locked(vnode_t, int, int);
extern kern_return_t adjust_vm_object_cache(vm_size_t oval, vm_size_t nval);

static void vnode_list_add(vnode_t);
static void vnode_list_remove(vnode_t);

static errno_t vnode_drain(vnode_t);
static void vgone(vnode_t);
static void vclean(vnode_t vp, int flag, proc_t p);
static void vnode_reclaim_internal(vnode_t, int, int);

static void vnode_dropiocount(vnode_t, int);
static errno_t vnode_getiocount(vnode_t vp, int locked, int vid, int vflags);
static int vget_internal(vnode_t, int, int);

static vnode_t checkalias(vnode_t vp, dev_t nvp_rdev);
static int vnode_reload(vnode_t);
static int vnode_isinuse_locked(vnode_t, int, int);

static void insmntque(vnode_t vp, mount_t mp);
mount_t mount_list_lookupby_fsid(fsid_t *, int, int);
static int mount_getvfscnt(void);
static int mount_fillfsids(fsid_t *, int);
static void vnode_iterate_setup(mount_t);
static int vnode_umount_preflight(mount_t, vnode_t, int);
static int vnode_iterate_prepare(mount_t);
static int vnode_iterate_reloadq(mount_t);
static void vnode_iterate_clear(mount_t);

TAILQ_HEAD(freelst, vnode) vnode_free_list;             /* vnode free list */
TAILQ_HEAD(inactivelst, vnode) vnode_inactive_list;     /* vnode inactive list */
struct mntlist mountlist;                               /* mounted filesystem list */
static int nummounts = 0;

#if DIAGNOSTIC
#define VLISTCHECK(fun, vp, list)    \
    if ((vp)->v_freelist.tqe_prev == (struct vnode **)0xdeadb) \
        panic("%s: %s vnode not on %slist", (fun), (list), (list));

#define VINACTIVECHECK(fun, vp, expected)    \
    do { \
        int __is_inactive = ISSET((vp)->v_flag, VUINACTIVE); \
        if (__is_inactive ^ expected) \
            panic("%s: %sinactive vnode, expected %s", (fun), \
                __is_inactive ? "" : "not ", \
                expected ? "inactive" : "not inactive"); \
    } while (0)
#else
#define VLISTCHECK(fun, vp, list)
#define VINACTIVECHECK(fun, vp, expected)
#endif /* DIAGNOSTIC */

#define VLISTNONE(vp)    \
    do { \
        (vp)->v_freelist.tqe_next = (struct vnode *)0; \
        (vp)->v_freelist.tqe_prev = (struct vnode **)0xdeadb; \
    } while (0)

#define VONLIST(vp)    \
    ((vp)->v_freelist.tqe_prev != (struct vnode **)0xdeadb)

/* remove a vnode from free vnode list */
#define VREMFREE(fun, vp)    \
    do { \
        VLISTCHECK((fun), (vp), "free"); \
        TAILQ_REMOVE(&vnode_free_list, (vp), v_freelist); \
        VLISTNONE((vp)); \
        freevnodes--; \
    } while (0)

/* remove a vnode from inactive vnode list */
#define VREMINACTIVE(fun, vp)    \
    do { \
        VLISTCHECK((fun), (vp), "inactive"); \
        VINACTIVECHECK((fun), (vp), VUINACTIVE); \
        TAILQ_REMOVE(&vnode_inactive_list, (vp), v_freelist); \
        CLR((vp)->v_flag, VUINACTIVE); \
        VLISTNONE((vp)); \
        inactivevnodes--; \
    } while (0)

/*
 * Have to declare the first two locks as actual data even if !MACH_SLOCKS,
 * since pointers to them get passed around.
 */
void *mntvnode_slock;
void *mntid_slock;
void *spechash_slock;

/*
 * vnodetarget is the number of vnodes we expect to get back
 * from the inactive vnode list and the VM object cache.
 * Since vnreclaim() is mainly a CPU-bound operation, this number
 * could be higher on faster processors.
 * Setting this number too high introduces longer delays in
 * the execution of new_vnode().
 */
unsigned long vnodetarget;        /* target for vnreclaim() */
#define VNODE_FREE_TARGET    20   /* Default value for vnodetarget */

/*
 * We need quite a few vnodes on the free list to sustain the
 * rapid stat() the compilation process does, and still benefit from the name
 * cache. Having too few vnodes on the free list causes serious disk
 * thrashing as we cycle through them.
 */
#define VNODE_FREE_MIN       300  /* freelist should have at least this many */

/*
 * We need to get vnodes back from the VM object cache when a certain #
 * of vnodes are reused from the freelist. This is essential for the
 * caching to be effective in the namecache and the buffer cache [for the
 * metadata].
 */
#define VNODE_TOOMANY_REUSED (VNODE_FREE_MIN/4)

/*
 * If we have enough vnodes on the freelist we do not want to reclaim
 * the vnodes from the VM object cache.
 */
#define VNODE_FREE_ENOUGH    (VNODE_FREE_MIN + (VNODE_FREE_MIN/2))

/*
 * Initialize the vnode management data structures.
 */
__private_extern__ void
vntblinit(void)
{
    TAILQ_INIT(&vnode_free_list);
    TAILQ_INIT(&vnode_inactive_list);
    TAILQ_INIT(&mountlist);

    if (!vnodetarget)
        vnodetarget = VNODE_FREE_TARGET;

    /*
     * Scale the vm_object_cache to accommodate the vnodes
     * we want to cache.
     */
    (void) adjust_vm_object_cache(0, desiredvnodes - VNODE_FREE_MIN);
}

/* Reset the VM Object Cache with the values passed in */
__private_extern__ kern_return_t
reset_vmobjectcache(unsigned int val1, unsigned int val2)
{
    vm_size_t oval = val1 - VNODE_FREE_MIN;
    vm_size_t nval;

    if (val2 < VNODE_FREE_MIN)
        nval = 0;
    else
        nval = val2 - VNODE_FREE_MIN;

    return (adjust_vm_object_cache(oval, nval));
}

/* the timeout is expressed in units of 10 msec */
int
vnode_waitforwrites(vnode_t vp, int output_target, int slpflag, int slptimeout, char *msg) {
    int error = 0;
    struct timespec ts;

    KERNEL_DEBUG(0x3010280 | DBG_FUNC_START, (int)vp, output_target, vp->v_numoutput, 0, 0);

    if (vp->v_numoutput > output_target) {

        slpflag &= ~PDROP;

        vnode_lock(vp);

        while ((vp->v_numoutput > output_target) && error == 0) {
            if (output_target)
                vp->v_flag |= VTHROTTLED;
            else
                vp->v_flag |= VBWAIT;
            ts.tv_sec = (slptimeout / 100);
            /* leftover 10 msec units, converted to nanoseconds */
            ts.tv_nsec = (slptimeout % 100) * 10 * NSEC_PER_USEC * 1000;
            error = msleep((caddr_t)&vp->v_numoutput, &vp->v_lock, (slpflag | (PRIBIO + 1)), msg, &ts);
        }
        vnode_unlock(vp);
    }
    KERNEL_DEBUG(0x3010280 | DBG_FUNC_END, (int)vp, output_target, vp->v_numoutput, error, 0);

    return error;
}


void
vnode_startwrite(vnode_t vp) {

    OSAddAtomic(1, &vp->v_numoutput);
}


void
vnode_writedone(vnode_t vp)
{
    if (vp) {
        int need_wakeup = 0;

        OSAddAtomic(-1, &vp->v_numoutput);

        vnode_lock(vp);

        if (vp->v_numoutput < 0)
            panic("vnode_writedone: numoutput < 0");

        if ((vp->v_flag & VTHROTTLED) && (vp->v_numoutput < (VNODE_ASYNC_THROTTLE / 3))) {
            vp->v_flag &= ~VTHROTTLED;
            need_wakeup = 1;
        }
        if ((vp->v_flag & VBWAIT) && (vp->v_numoutput == 0)) {
            vp->v_flag &= ~VBWAIT;
            need_wakeup = 1;
        }
        vnode_unlock(vp);

        if (need_wakeup)
            wakeup((caddr_t)&vp->v_numoutput);
    }
}
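
/*
 * Example (illustrative sketch, not part of the original source): a
 * filesystem's async write path would typically bracket each I/O with
 * vnode_startwrite()/vnode_writedone() so that v_numoutput tracks the
 * writes in flight, and a sync path can then drain them with
 * vnode_waitforwrites(). The helpers example_issue_write() and
 * example_drain_writes() are hypothetical names.
 */
#if 0   /* example only */
static void
example_issue_write(vnode_t vp, buf_t bp)
{
    vnode_startwrite(vp);   /* account for one pending write */
    VNOP_STRATEGY(bp);      /* assumed: completion path calls vnode_writedone(vp) */
}

static int
example_drain_writes(vnode_t vp)
{
    /* block until no writes remain; slptimeout of 0 means wait indefinitely */
    return (vnode_waitforwrites(vp, 0, 0, 0, "example_drain_writes"));
}
#endif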


int
vnode_hasdirtyblks(vnode_t vp)
{
    struct cl_writebehind *wbp;

    /*
     * Not taking the buf_mtxp as there is little
     * point doing it. Even if the lock is taken the
     * state can change right after that. If there
     * needs to be a synchronization, it must be driven
     * by the caller.
     */
    if (vp->v_dirtyblkhd.lh_first)
        return (1);

    if (!UBCINFOEXISTS(vp))
        return (0);

    wbp = vp->v_ubcinfo->cl_wbehind;

    if (wbp && (wbp->cl_number || wbp->cl_scmap))
        return (1);

    return (0);
}

int
vnode_hascleanblks(vnode_t vp)
{
    /*
     * Not taking the buf_mtxp as there is little
     * point doing it. Even if the lock is taken the
     * state can change right after that. If there
     * needs to be a synchronization, it must be driven
     * by the caller.
     */
    if (vp->v_cleanblkhd.lh_first)
        return (1);
    return (0);
}

void
vnode_iterate_setup(mount_t mp)
{
    while (mp->mnt_lflag & MNT_LITER) {
        mp->mnt_lflag |= MNT_LITERWAIT;
        msleep((caddr_t)mp, &mp->mnt_mlock, PVFS, "vnode_iterate_setup", 0);
    }

    mp->mnt_lflag |= MNT_LITER;

}

static int
vnode_umount_preflight(mount_t mp, vnode_t skipvp, int flags)
{
    vnode_t vp;

    TAILQ_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) {
        if (vp->v_type == VDIR)
            continue;
        if (vp == skipvp)
            continue;
        if ((flags & SKIPSYSTEM) && ((vp->v_flag & VSYSTEM) ||
            (vp->v_flag & VNOFLUSH)))
            continue;
        if ((flags & SKIPSWAP) && (vp->v_flag & VSWAP))
            continue;
        if ((flags & WRITECLOSE) &&
            (vp->v_writecount == 0 || vp->v_type != VREG))
            continue;
        /* Look for busy vnode */
        if (((vp->v_usecount != 0) &&
            ((vp->v_usecount - vp->v_kusecount) != 0)))
            return (1);
    }

    return (0);
}

/*
 * This routine prepares iteration by moving all the vnodes to the worker
 * queue. Called with the mount lock held.
 */
int
vnode_iterate_prepare(mount_t mp)
{
    vnode_t vp;

    if (TAILQ_EMPTY(&mp->mnt_vnodelist)) {
        /* nothing to do */
        return (0);
    }

    vp = TAILQ_FIRST(&mp->mnt_vnodelist);
    vp->v_mntvnodes.tqe_prev = &(mp->mnt_workerqueue.tqh_first);
    mp->mnt_workerqueue.tqh_first = mp->mnt_vnodelist.tqh_first;
    mp->mnt_workerqueue.tqh_last = mp->mnt_vnodelist.tqh_last;

    TAILQ_INIT(&mp->mnt_vnodelist);
    if (mp->mnt_newvnodes.tqh_first != NULL)
        panic("vnode_iterate_prepare: newvnode when entering vnode");
    TAILQ_INIT(&mp->mnt_newvnodes);

    return (1);
}


/* called with mount lock held */
int
vnode_iterate_reloadq(mount_t mp)
{
    int moved = 0;

    /* add the remaining entries in workerq to the end of mount vnode list */
    if (!TAILQ_EMPTY(&mp->mnt_workerqueue)) {
        struct vnode *mvp;
        mvp = TAILQ_LAST(&mp->mnt_vnodelist, vnodelst);

        /* Joining the workerqueue entries to the mount vnode list */
        if (mvp)
            mvp->v_mntvnodes.tqe_next = mp->mnt_workerqueue.tqh_first;
        else
            mp->mnt_vnodelist.tqh_first = mp->mnt_workerqueue.tqh_first;
        mp->mnt_workerqueue.tqh_first->v_mntvnodes.tqe_prev = mp->mnt_vnodelist.tqh_last;
        mp->mnt_vnodelist.tqh_last = mp->mnt_workerqueue.tqh_last;
        TAILQ_INIT(&mp->mnt_workerqueue);
    }

    /* add the newvnodes to the head of mount vnode list */
    if (!TAILQ_EMPTY(&mp->mnt_newvnodes)) {
        struct vnode *nlvp;
        nlvp = TAILQ_LAST(&mp->mnt_newvnodes, vnodelst);

        mp->mnt_newvnodes.tqh_first->v_mntvnodes.tqe_prev = &mp->mnt_vnodelist.tqh_first;
        nlvp->v_mntvnodes.tqe_next = mp->mnt_vnodelist.tqh_first;
        if (mp->mnt_vnodelist.tqh_first)
            mp->mnt_vnodelist.tqh_first->v_mntvnodes.tqe_prev = &nlvp->v_mntvnodes.tqe_next;
        else
            mp->mnt_vnodelist.tqh_last = mp->mnt_newvnodes.tqh_last;
        mp->mnt_vnodelist.tqh_first = mp->mnt_newvnodes.tqh_first;
        TAILQ_INIT(&mp->mnt_newvnodes);
        moved = 1;
    }

    return (moved);
}


void
vnode_iterate_clear(mount_t mp)
{
    mp->mnt_lflag &= ~MNT_LITER;
    if (mp->mnt_lflag & MNT_LITERWAIT) {
        mp->mnt_lflag &= ~MNT_LITERWAIT;
        wakeup(mp);
    }
}


int
vnode_iterate(mp, flags, callout, arg)
    mount_t mp;
    int flags;
    int (*callout)(struct vnode *, void *);
    void *arg;
{
    struct vnode *vp;
    int vid, retval;
    int ret = 0;

    mount_lock(mp);

    vnode_iterate_setup(mp);

    /* if it returns 0 then there is nothing to do */
    retval = vnode_iterate_prepare(mp);

    if (retval == 0) {
        vnode_iterate_clear(mp);
        mount_unlock(mp);
        return (ret);
    }

    /* iterate over all the vnodes */
    while (!TAILQ_EMPTY(&mp->mnt_workerqueue)) {
        vp = TAILQ_FIRST(&mp->mnt_workerqueue);
        TAILQ_REMOVE(&mp->mnt_workerqueue, vp, v_mntvnodes);
        TAILQ_INSERT_TAIL(&mp->mnt_vnodelist, vp, v_mntvnodes);
        vid = vp->v_id;
        if ((vp->v_data == NULL) || (vp->v_type == VNON) || (vp->v_mount != mp)) {
            continue;
        }
        mount_unlock(mp);

        if (vget_internal(vp, vid, (flags | VNODE_NODEAD | VNODE_WITHID | VNODE_NOSUSPEND))) {
            mount_lock(mp);
            continue;
        }
        if (flags & VNODE_RELOAD) {
            /*
             * we're reloading the filesystem
             * cast out any inactive vnodes...
             */
            if (vnode_reload(vp)) {
                /* vnode will be recycled on the refcount drop */
                vnode_put(vp);
                mount_lock(mp);
                continue;
            }
        }

        retval = callout(vp, arg);

        switch (retval) {
        case VNODE_RETURNED:
        case VNODE_RETURNED_DONE:
            vnode_put(vp);
            if (retval == VNODE_RETURNED_DONE) {
                mount_lock(mp);
                ret = 0;
                goto out;
            }
            break;

        case VNODE_CLAIMED_DONE:
            mount_lock(mp);
            ret = 0;
            goto out;
        case VNODE_CLAIMED:
        default:
            break;
        }
        mount_lock(mp);
    }

out:
    (void)vnode_iterate_reloadq(mp);
    vnode_iterate_clear(mp);
    mount_unlock(mp);
    return (ret);
}
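
/*
 * Example (illustrative sketch, not part of the original source): a
 * filesystem could walk its vnodes with vnode_iterate(), returning
 * VNODE_RETURNED from the callout so the iterator drops the iocount it
 * holds and continues. example_flush_one()/example_flush_all() and the
 * cast of arg are hypothetical.
 */
#if 0   /* example only */
static int
example_flush_one(struct vnode *vp, void *arg)
{
    struct vfs_context *ctx = (struct vfs_context *)arg;

    /* the iterator holds an iocount on vp while the callout runs */
    VNOP_FSYNC(vp, MNT_NOWAIT, ctx);

    return (VNODE_RETURNED);    /* iterator calls vnode_put() for us */
}

static int
example_flush_all(mount_t mp, struct vfs_context *ctx)
{
    return (vnode_iterate(mp, 0, example_flush_one, ctx));
}
#endif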

void
mount_lock_renames(mount_t mp)
{
    lck_mtx_lock(&mp->mnt_renamelock);
}

void
mount_unlock_renames(mount_t mp)
{
    lck_mtx_unlock(&mp->mnt_renamelock);
}

void
mount_lock(mount_t mp)
{
    lck_mtx_lock(&mp->mnt_mlock);
}

void
mount_unlock(mount_t mp)
{
    lck_mtx_unlock(&mp->mnt_mlock);
}


void
mount_ref(mount_t mp, int locked)
{
    if ( !locked)
        mount_lock(mp);

    mp->mnt_count++;

    if ( !locked)
        mount_unlock(mp);
}


void
mount_drop(mount_t mp, int locked)
{
    if ( !locked)
        mount_lock(mp);

    mp->mnt_count--;

    if (mp->mnt_count == 0 && (mp->mnt_lflag & MNT_LDRAIN))
        wakeup(&mp->mnt_lflag);

    if ( !locked)
        mount_unlock(mp);
}


int
mount_iterref(mount_t mp, int locked)
{
    int retval = 0;

    if (!locked)
        mount_list_lock();
    if (mp->mnt_iterref < 0) {
        retval = 1;
    } else {
        mp->mnt_iterref++;
    }
    if (!locked)
        mount_list_unlock();
    return (retval);
}

int
mount_isdrained(mount_t mp, int locked)
{
    int retval;

    if (!locked)
        mount_list_lock();
    if (mp->mnt_iterref < 0)
        retval = 1;
    else
        retval = 0;
    if (!locked)
        mount_list_unlock();
    return (retval);
}

void
mount_iterdrop(mount_t mp)
{
    mount_list_lock();
    mp->mnt_iterref--;
    wakeup(&mp->mnt_iterref);
    mount_list_unlock();
}

void
mount_iterdrain(mount_t mp)
{
    mount_list_lock();
    while (mp->mnt_iterref)
        msleep((caddr_t)&mp->mnt_iterref, mnt_list_mtx_lock, PVFS, "mount_iterdrain", 0);
    /* mount iterations drained */
    mp->mnt_iterref = -1;
    mount_list_unlock();
}

void
mount_iterreset(mount_t mp)
{
    mount_list_lock();
    if (mp->mnt_iterref == -1)
        mp->mnt_iterref = 0;
    mount_list_unlock();
}

/* always called with mount lock held */
int
mount_refdrain(mount_t mp)
{
    if (mp->mnt_lflag & MNT_LDRAIN)
        panic("already in drain");
    mp->mnt_lflag |= MNT_LDRAIN;

    while (mp->mnt_count)
        msleep((caddr_t)&mp->mnt_lflag, &mp->mnt_mlock, PVFS, "mount_drain", 0);

    if (mp->mnt_vnodelist.tqh_first != NULL)
        panic("mount_refdrain: dangling vnode");

    mp->mnt_lflag &= ~MNT_LDRAIN;

    return (0);
}


/*
 * Mark a mount point as busy. Used to synchronize access and to delay
 * unmounting.
 */
int
vfs_busy(mount_t mp, int flags)
{

restart:
    if (mp->mnt_lflag & MNT_LDEAD)
        return (ENOENT);

    if (mp->mnt_lflag & MNT_LUNMOUNT) {
        if (flags & LK_NOWAIT)
            return (ENOENT);

        mount_lock(mp);

        if (mp->mnt_lflag & MNT_LDEAD) {
            mount_unlock(mp);
            return (ENOENT);
        }
        if (mp->mnt_lflag & MNT_LUNMOUNT) {
            mp->mnt_lflag |= MNT_LWAIT;
            /*
             * Since all busy locks are shared except the exclusive
             * lock granted when unmounting, the only place that a
             * wakeup needs to be done is at the release of the
             * exclusive lock at the end of dounmount.
             */
            msleep((caddr_t)mp, &mp->mnt_mlock, (PVFS | PDROP), "vfsbusy", 0);
            return (ENOENT);
        }
        mount_unlock(mp);
    }

    lck_rw_lock_shared(&mp->mnt_rwlock);

    /*
     * until we are granted the rwlock, it's possible for the mount point to
     * change state, so reevaluate before granting the vfs_busy
     */
    if (mp->mnt_lflag & (MNT_LDEAD | MNT_LUNMOUNT)) {
        lck_rw_done(&mp->mnt_rwlock);
        goto restart;
    }
    return (0);
}

/*
 * Free a busy filesystem.
 */

void
vfs_unbusy(mount_t mp)
{
    lck_rw_done(&mp->mnt_rwlock);
}
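
/*
 * Example (illustrative sketch, not part of the original source): callers
 * that need a mount to stay alive across an operation bracket the work
 * with vfs_busy()/vfs_unbusy(); the shared rwlock holds off unmount until
 * the caller is done. example_with_busy_mount() is a hypothetical helper.
 */
#if 0   /* example only */
static int
example_with_busy_mount(mount_t mp)
{
    int error;

    if ((error = vfs_busy(mp, LK_NOWAIT)))
        return (error);     /* mount is dead or being unmounted */

    /* ... operate on the mount knowing unmount is held off ... */

    vfs_unbusy(mp);
    return (0);
}
#endif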


static void
vfs_rootmountfailed(mount_t mp) {

    mount_list_lock();
    mp->mnt_vtable->vfc_refcount--;
    mount_list_unlock();

    vfs_unbusy(mp);

    mount_lock_destroy(mp);

    FREE_ZONE(mp, sizeof(struct mount), M_MOUNT);
}

/*
 * Lookup a filesystem type, and if found allocate and initialize
 * a mount structure for it.
 *
 * Devname is usually updated by mount(8) after booting.
 */
static mount_t
vfs_rootmountalloc_internal(struct vfstable *vfsp, const char *devname)
{
    mount_t mp;

    mp = _MALLOC_ZONE((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK);
    bzero((char *)mp, (u_long)sizeof(struct mount));

    /* Initialize the default IO constraints */
    mp->mnt_maxreadcnt = mp->mnt_maxwritecnt = MAXPHYS;
    mp->mnt_segreadcnt = mp->mnt_segwritecnt = 32;
    mp->mnt_maxsegreadsize = mp->mnt_maxreadcnt;
    mp->mnt_maxsegwritesize = mp->mnt_maxwritecnt;
    mp->mnt_devblocksize = DEV_BSIZE;

    mount_lock_init(mp);
    (void)vfs_busy(mp, LK_NOWAIT);

    TAILQ_INIT(&mp->mnt_vnodelist);
    TAILQ_INIT(&mp->mnt_workerqueue);
    TAILQ_INIT(&mp->mnt_newvnodes);

    mp->mnt_vtable = vfsp;
    mp->mnt_op = vfsp->vfc_vfsops;
    mp->mnt_flag = MNT_RDONLY | MNT_ROOTFS;
    mp->mnt_vnodecovered = NULLVP;
    //mp->mnt_stat.f_type = vfsp->vfc_typenum;
    mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;

    mount_list_lock();
    vfsp->vfc_refcount++;
    mount_list_unlock();

    strncpy(mp->mnt_vfsstat.f_fstypename, vfsp->vfc_name, MFSTYPENAMELEN);
    mp->mnt_vfsstat.f_mntonname[0] = '/';
    (void) copystr((char *)devname, mp->mnt_vfsstat.f_mntfromname, MAXPATHLEN - 1, 0);

    return (mp);
}

errno_t
vfs_rootmountalloc(const char *fstypename, const char *devname, mount_t *mpp)
{
    struct vfstable *vfsp;

    for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
        if (!strcmp(vfsp->vfc_name, fstypename))
            break;
    if (vfsp == NULL)
        return (ENODEV);

    *mpp = vfs_rootmountalloc_internal(vfsp, devname);

    if (*mpp)
        return (0);

    return (ENOMEM);
}


/*
 * Find an appropriate filesystem to use for the root. If a filesystem
 * has not been preselected, walk through the list of known filesystems
 * trying those that have mountroot routines, and try them until one
 * works or we have tried them all.
 */
extern int (*mountroot)(void);

int
vfs_mountroot()
{
    struct vfstable *vfsp;
    struct vfs_context context;
    int error;
    mount_t mp;

    if (mountroot != NULL) {
        /*
         * used for netboot which follows a different set of rules
         */
        error = (*mountroot)();
        return (error);
    }
    if ((error = bdevvp(rootdev, &rootvp))) {
        printf("vfs_mountroot: can't setup bdevvp\n");
        return (error);
    }
    context.vc_proc = current_proc();
    context.vc_ucred = kauth_cred_get();

    for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
        if (vfsp->vfc_mountroot == NULL)
            continue;

        mp = vfs_rootmountalloc_internal(vfsp, "root_device");
        mp->mnt_devvp = rootvp;

        if ((error = (*vfsp->vfc_mountroot)(mp, rootvp, &context)) == 0) {
            mp->mnt_devvp->v_specflags |= SI_MOUNTEDON;

            vfs_unbusy(mp);

            mount_list_add(mp);

            /*
             * cache the IO attributes for the underlying physical media...
             * an error return indicates the underlying driver doesn't
             * support all the queries necessary... however, reasonable
             * defaults will have been set, so no reason to bail or care
             */
            vfs_init_io_attributes(rootvp, mp);
            /*
             * get rid of iocount reference returned
             * by bdevvp... it will have also taken
             * a usecount reference which we want to keep
             */
            vnode_put(rootvp);

            return (0);
        }
        vfs_rootmountfailed(mp);

        if (error != EINVAL)
            printf("%s_mountroot failed: %d\n", vfsp->vfc_name, error);
    }
    return (ENODEV);
}

/*
 * Lookup a mount point by filesystem identifier.
 */
extern mount_t vfs_getvfs_locked(fsid_t *);

struct mount *
vfs_getvfs(fsid)
    fsid_t *fsid;
{
    return (mount_list_lookupby_fsid(fsid, 0, 0));
}

struct mount *
vfs_getvfs_locked(fsid)
    fsid_t *fsid;
{
    return (mount_list_lookupby_fsid(fsid, 1, 0));
}

struct mount *
vfs_getvfs_by_mntonname(u_char *path)
{
    mount_t retmp = (mount_t)0;
    mount_t mp;

    mount_list_lock();
    TAILQ_FOREACH(mp, &mountlist, mnt_list) {
        if (!strcmp(mp->mnt_vfsstat.f_mntonname, path)) {
            retmp = mp;
            goto out;
        }
    }
out:
    mount_list_unlock();
    return (retmp);
}

/* generation number for creation of new fsids */
u_short mntid_gen = 0;
/*
 * Get a new unique fsid
 */
void
vfs_getnewfsid(mp)
    struct mount *mp;
{

    fsid_t tfsid;
    int mtype;
    mount_t nmp;

    mount_list_lock();

    /* generate a new fsid */
    mtype = mp->mnt_vtable->vfc_typenum;
    if (++mntid_gen == 0)
        mntid_gen++;
    tfsid.val[0] = makedev(nblkdev + mtype, mntid_gen);
    tfsid.val[1] = mtype;

    TAILQ_FOREACH(nmp, &mountlist, mnt_list) {
        while (vfs_getvfs_locked(&tfsid)) {
            if (++mntid_gen == 0)
                mntid_gen++;
            tfsid.val[0] = makedev(nblkdev + mtype, mntid_gen);
        }
    }
    mp->mnt_vfsstat.f_fsid.val[0] = tfsid.val[0];
    mp->mnt_vfsstat.f_fsid.val[1] = tfsid.val[1];
    mount_list_unlock();
}

/*
 * Routines having to do with the management of the vnode table.
 */
extern int (**dead_vnodeop_p)(void *);
long numvnodes, freevnodes;
long inactivevnodes;


/*
 * Move a vnode from one mount queue to another.
 */
static void
insmntque(vnode_t vp, mount_t mp)
{
    mount_t lmp;
    /*
     * Delete from old mount point vnode list, if on one.
     */
    if ((lmp = vp->v_mount) != NULL && lmp != dead_mountp) {
        if ((vp->v_lflag & VNAMED_MOUNT) == 0)
            panic("insmntque: vp not in mount vnode list");
        vp->v_lflag &= ~VNAMED_MOUNT;

        mount_lock(lmp);

        mount_drop(lmp, 1);

        if (vp->v_mntvnodes.tqe_next == NULL) {
            if (TAILQ_LAST(&lmp->mnt_vnodelist, vnodelst) == vp)
                TAILQ_REMOVE(&lmp->mnt_vnodelist, vp, v_mntvnodes);
            else if (TAILQ_LAST(&lmp->mnt_newvnodes, vnodelst) == vp)
                TAILQ_REMOVE(&lmp->mnt_newvnodes, vp, v_mntvnodes);
            else if (TAILQ_LAST(&lmp->mnt_workerqueue, vnodelst) == vp)
                TAILQ_REMOVE(&lmp->mnt_workerqueue, vp, v_mntvnodes);
        } else {
            vp->v_mntvnodes.tqe_next->v_mntvnodes.tqe_prev = vp->v_mntvnodes.tqe_prev;
            *vp->v_mntvnodes.tqe_prev = vp->v_mntvnodes.tqe_next;
        }
        vp->v_mntvnodes.tqe_next = 0;
        vp->v_mntvnodes.tqe_prev = 0;
        mount_unlock(lmp);
        return;
    }

    /*
     * Insert into list of vnodes for the new mount point, if available.
     */
    if ((vp->v_mount = mp) != NULL) {
        mount_lock(mp);
        if ((vp->v_mntvnodes.tqe_next != 0) && (vp->v_mntvnodes.tqe_prev != 0))
            panic("vp already in mount list");
        if (mp->mnt_lflag & MNT_LITER)
            TAILQ_INSERT_HEAD(&mp->mnt_newvnodes, vp, v_mntvnodes);
        else
            TAILQ_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes);
        if (vp->v_lflag & VNAMED_MOUNT)
            panic("insmntque: vp already in mount vnode list");
        if ((vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb))
            panic("insmntque: vp on the free list\n");
        vp->v_lflag |= VNAMED_MOUNT;
        mount_ref(mp, 1);
        mount_unlock(mp);
    }
}


/*
 * Create a vnode for a block device.
 * Used for root filesystem, argdev, and swap areas.
 * Also used for memory file system special devices.
 */
int
bdevvp(dev_t dev, vnode_t *vpp)
{
    vnode_t nvp;
    int error;
    struct vnode_fsparam vfsp;
    struct vfs_context context;

    if (dev == NODEV) {
        *vpp = NULLVP;
        return (ENODEV);
    }

    context.vc_proc = current_proc();
    context.vc_ucred = FSCRED;

    vfsp.vnfs_mp = (struct mount *)0;
    vfsp.vnfs_vtype = VBLK;
    vfsp.vnfs_str = "bdevvp";
    vfsp.vnfs_dvp = 0;
    vfsp.vnfs_fsnode = 0;
    vfsp.vnfs_cnp = 0;
    vfsp.vnfs_vops = spec_vnodeop_p;
    vfsp.vnfs_rdev = dev;
    vfsp.vnfs_filesize = 0;

    vfsp.vnfs_flags = VNFS_NOCACHE | VNFS_CANTCACHE;

    vfsp.vnfs_marksystem = 0;
    vfsp.vnfs_markroot = 0;

    if ((error = vnode_create(VNCREATE_FLAVOR, VCREATESIZE, &vfsp, &nvp))) {
        *vpp = NULLVP;
        return (error);
    }
    if ((error = vnode_ref(nvp))) {
        panic("bdevvp failed: vnode_ref");
        return (error);
    }
    if ((error = VNOP_FSYNC(nvp, MNT_WAIT, &context))) {
        panic("bdevvp failed: fsync");
        return (error);
    }
    if ((error = buf_invalidateblks(nvp, BUF_WRITE_DATA, 0, 0))) {
        panic("bdevvp failed: invalidateblks");
        return (error);
    }
    if ((error = VNOP_OPEN(nvp, FREAD, &context))) {
        panic("bdevvp failed: open");
        return (error);
    }
    *vpp = nvp;

    return (0);
}

/*
 * Check to see if the new vnode represents a special device
 * for which we already have a vnode (either because of
 * bdevvp() or because of a different vnode representing
 * the same block device). If such an alias exists, deallocate
 * the existing contents and return the aliased vnode. The
 * caller is responsible for filling it with its new contents.
 */
static vnode_t
checkalias(nvp, nvp_rdev)
    register struct vnode *nvp;
    dev_t nvp_rdev;
{
    struct vnode *vp;
    struct vnode **vpp;
    int vid = 0;

    vpp = &speclisth[SPECHASH(nvp_rdev)];
loop:
    SPECHASH_LOCK();

    for (vp = *vpp; vp; vp = vp->v_specnext) {
        if (nvp_rdev == vp->v_rdev && nvp->v_type == vp->v_type) {
            vid = vp->v_id;
            break;
        }
    }
    SPECHASH_UNLOCK();

    if (vp) {
        if (vnode_getwithvid(vp, vid)) {
            goto loop;
        }
        /*
         * Termination state is checked in vnode_getwithvid
         */
        vnode_lock(vp);

        /*
         * Alias, but not in use, so flush it out.
         */
        if ((vp->v_iocount == 1) && (vp->v_usecount == 0)) {
            vnode_reclaim_internal(vp, 1, 0);
            vnode_unlock(vp);
            vnode_put(vp);
            goto loop;
        }
    }
    if (vp == NULL || vp->v_tag != VT_NON) {
        MALLOC_ZONE(nvp->v_specinfo, struct specinfo *, sizeof(struct specinfo),
            M_SPECINFO, M_WAITOK);
        bzero(nvp->v_specinfo, sizeof(struct specinfo));
        nvp->v_rdev = nvp_rdev;
        nvp->v_specflags = 0;
        nvp->v_speclastr = -1;

        SPECHASH_LOCK();
        nvp->v_hashchain = vpp;
        nvp->v_specnext = *vpp;
        *vpp = nvp;
        SPECHASH_UNLOCK();

        if (vp != NULLVP) {
            nvp->v_flag |= VALIASED;
            vp->v_flag |= VALIASED;
            vnode_unlock(vp);
            vnode_put(vp);
        }
        return (NULLVP);
    }
    return (vp);
}


/*
 * Get a reference on a particular vnode and lock it if requested.
 * If the vnode was on the inactive list, remove it from the list.
 * If the vnode was on the free list, remove it from the list and
 * move it to inactive list as needed.
 * The vnode lock bit is set if the vnode is being eliminated in
 * vgone. The process is awakened when the transition is completed,
 * and an error returned to indicate that the vnode is no longer
 * usable (possibly having been changed to a new file system type).
 */
static int
vget_internal(vnode_t vp, int vid, int vflags)
{
    int error = 0;
    u_long vpid;

    vnode_lock(vp);

    if (vflags & VNODE_WITHID)
        vpid = vid;
    else
        vpid = vp->v_id;    // save off the original v_id

    if ((vflags & VNODE_WRITEABLE) && (vp->v_writecount == 0))
        /*
         * vnode to be returned only if it has writers opened
         */
        error = EINVAL;
    else
        error = vnode_getiocount(vp, 1, vpid, vflags);

    vnode_unlock(vp);

    return (error);
}

int
vnode_ref(vnode_t vp)
{

    return (vnode_ref_ext(vp, 0));
}

int
vnode_ref_ext(vnode_t vp, int fmode)
{
    int error = 0;

    vnode_lock(vp);

    /*
     * once all the current call sites have been fixed to ensure they have
     * taken an iocount, we can toughen this assert up and insist that the
     * iocount is non-zero... a non-zero usecount doesn't ensure correctness
     */
    if (vp->v_iocount <= 0 && vp->v_usecount <= 0)
        panic("vnode_ref_ext: vp %x has no valid reference %d, %d", vp, vp->v_iocount, vp->v_usecount);

    /*
     * if you are the owner of drain/termination, can acquire usecount
     */
    if ((vp->v_lflag & (VL_DRAIN | VL_TERMINATE | VL_DEAD))) {
        if (vp->v_owner != current_thread()) {
            error = ENOENT;
            goto out;
        }
    }
    vp->v_usecount++;

    if (fmode & FWRITE) {
        if (++vp->v_writecount <= 0)
            panic("vnode_ref_ext: v_writecount");
    }
    if (fmode & O_EVTONLY) {
        if (++vp->v_kusecount <= 0)
            panic("vnode_ref_ext: v_kusecount");
    }
out:
    vnode_unlock(vp);

    return (error);
}
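
/*
 * Example (illustrative sketch, not part of the original source): the
 * usual pattern is to already hold a short-term iocount (e.g. from a
 * lookup or vnode_getwithvid), take a long-term usecount with
 * vnode_ref(), and later balance it with vnode_rele(). example_hold()
 * and example_unhold() are hypothetical helpers.
 */
#if 0   /* example only */
static int
example_hold(vnode_t vp)
{
    int error;

    /* caller already holds an iocount on vp */
    if ((error = vnode_ref(vp)) == 0) {
        /* vp now has a persistent usecount; safe to cache the pointer */
    }
    return (error);
}

static void
example_unhold(vnode_t vp)
{
    vnode_rele(vp);     /* drop the usecount taken above */
}
#endif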


/*
 * put the vnode on appropriate free list.
 * called with vnode LOCKED
 */
static void
vnode_list_add(vnode_t vp)
{

    /*
     * if it is already on a list or has non zero references, return
     */
    if (VONLIST(vp) || (vp->v_usecount != 0) || (vp->v_iocount != 0))
        return;
    vnode_list_lock();

    /*
     * insert at tail of LRU list or at head if VAGE or VL_DEAD is set
     */
    if ((vp->v_flag & VAGE) || (vp->v_lflag & VL_DEAD)) {
        TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
        vp->v_flag &= ~VAGE;
    } else {
        TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
    }
    freevnodes++;

    vnode_list_unlock();
}

/*
 * remove the vnode from appropriate free list.
 */
static void
vnode_list_remove(vnode_t vp)
{
    /*
     * we want to avoid taking the list lock
     * in the case where we're not on the free
     * list... this will be true for most
     * directories and any currently in use files
     *
     * we're guaranteed that we can't go from
     * the not-on-list state to the on-list
     * state since we hold the vnode lock...
     * all calls to vnode_list_add are done
     * under the vnode lock... so we can
     * check for that condition (the prevalent one)
     * without taking the list lock
     */
    if (VONLIST(vp)) {
        vnode_list_lock();
        /*
         * however, we're not guaranteed that
         * we won't go from the on-list state
         * to the not-on-list state until we
         * hold the vnode_list_lock... this
         * is due to new_vnode removing vnodes
         * from the free list under the list_lock
         * w/o the vnode lock... so we need to
         * check again whether we're currently
         * on the free list
         */
        if (VONLIST(vp)) {
            VREMFREE("vnode_list_remove", vp);
            VLISTNONE(vp);
        }
        vnode_list_unlock();
    }
}


void
vnode_rele(vnode_t vp)
{
    vnode_rele_internal(vp, 0, 0, 0);
}


void
vnode_rele_ext(vnode_t vp, int fmode, int dont_reenter)
{
    vnode_rele_internal(vp, fmode, dont_reenter, 0);
}


void
vnode_rele_internal(vnode_t vp, int fmode, int dont_reenter, int locked)
{
    struct vfs_context context;

    if ( !locked)
        vnode_lock(vp);

    if (--vp->v_usecount < 0)
        panic("vnode_rele_ext: vp %x usecount -ve : %d", vp, vp->v_usecount);

    if (fmode & FWRITE) {
        if (--vp->v_writecount < 0)
            panic("vnode_rele_ext: vp %x writecount -ve : %d", vp, vp->v_writecount);
    }
    if (fmode & O_EVTONLY) {
        if (--vp->v_kusecount < 0)
            panic("vnode_rele_ext: vp %x kusecount -ve : %d", vp, vp->v_kusecount);
    }
    if ((vp->v_iocount > 0) || (vp->v_usecount > 0)) {
        /*
         * vnode is still busy... if we're the last
         * usecount, mark for a future call to VNOP_INACTIVE
         * when the iocount finally drops to 0
         */
        if (vp->v_usecount == 0) {
            vp->v_lflag |= VL_NEEDINACTIVE;
            vp->v_flag &= ~(VNOCACHE_DATA | VRAOFF);
        }
        if ( !locked)
            vnode_unlock(vp);
        return;
    }
    vp->v_flag &= ~(VNOCACHE_DATA | VRAOFF);

    if ((vp->v_lflag & (VL_TERMINATE | VL_DEAD)) || dont_reenter) {
        /*
         * vnode is being cleaned, or
         * we've requested that we don't reenter
         * the filesystem on this release... in
         * this case, we'll mark the vnode aged
         * if it's been marked for termination
         */
        if (dont_reenter) {
            if ( !(vp->v_lflag & (VL_TERMINATE | VL_DEAD | VL_MARKTERM)))
                vp->v_lflag |= VL_NEEDINACTIVE;
            vp->v_flag |= VAGE;
        }
        vnode_list_add(vp);
        if ( !locked)
            vnode_unlock(vp);
        return;
    }
    /*
     * at this point both the iocount and usecount
     * are zero
     * pick up an iocount so that we can call
     * VNOP_INACTIVE with the vnode lock unheld
     */
    vp->v_iocount++;
#ifdef JOE_DEBUG
    record_vp(vp, 1);
#endif
    vp->v_lflag &= ~VL_NEEDINACTIVE;
    vnode_unlock(vp);

    context.vc_proc = current_proc();
    context.vc_ucred = kauth_cred_get();
    VNOP_INACTIVE(vp, &context);

    vnode_lock(vp);
    /*
     * because we dropped the vnode lock to call VNOP_INACTIVE
     * the state of the vnode may have changed... we may have
     * picked up an iocount, usecount or the MARKTERM may have
     * been set... we need to reevaluate the reference counts
     * to determine if we can call vnode_reclaim_internal at
     * this point... if the reference counts are up, we'll pick
     * up the MARKTERM state when they get subsequently dropped
     */
    if ((vp->v_iocount == 1) && (vp->v_usecount == 0) &&
        ((vp->v_lflag & (VL_MARKTERM | VL_TERMINATE | VL_DEAD)) == VL_MARKTERM)) {
        struct uthread *ut;

        ut = get_bsdthread_info(current_thread());

        if (ut->uu_defer_reclaims) {
            vp->v_defer_reclaimlist = ut->uu_vreclaims;
            ut->uu_vreclaims = vp;
            goto defer_reclaim;
        }
        vnode_reclaim_internal(vp, 1, 0);
    }
    vnode_dropiocount(vp, 1);
    vnode_list_add(vp);
defer_reclaim:
    if ( !locked)
        vnode_unlock(vp);
    return;
}

/*
 * Remove any vnodes in the vnode table belonging to mount point mp.
 *
 * If MNT_NOFORCE is specified, there should not be any active ones,
 * return error if any are found (nb: this is a user error, not a
 * system error). If MNT_FORCE is specified, detach any active vnodes
 * that are found.
 */
#if DIAGNOSTIC
int busyprt = 0;    /* print out busy vnodes */
#if 0
struct ctldebug debug1 = { "busyprt", &busyprt };
#endif /* 0 */
#endif

int
vflush(mp, skipvp, flags)
    struct mount *mp;
    struct vnode *skipvp;
    int flags;
{
    struct proc *p = current_proc();
    struct vnode *vp;
    int busy = 0;
    int reclaimed = 0;
    int vid, retval;

    mount_lock(mp);
    vnode_iterate_setup(mp);
    /*
     * On regular unmounts (not forced) do a
     * quick check for vnodes in use. This
     * preserves the caching of vnodes. The automounter
     * tries unmounting every so often to see whether
     * the filesystem is still busy or not.
     */
    if ((flags & FORCECLOSE) == 0) {
        if (vnode_umount_preflight(mp, skipvp, flags)) {
            vnode_iterate_clear(mp);
            mount_unlock(mp);
            return (EBUSY);
        }
    }
loop:
    /* if it returns 0 then there is nothing to do */
    retval = vnode_iterate_prepare(mp);

    if (retval == 0) {
        vnode_iterate_clear(mp);
        mount_unlock(mp);
        return (retval);
    }

    /* iterate over all the vnodes */
    while (!TAILQ_EMPTY(&mp->mnt_workerqueue)) {
        vp = TAILQ_FIRST(&mp->mnt_workerqueue);
        TAILQ_REMOVE(&mp->mnt_workerqueue, vp, v_mntvnodes);
        TAILQ_INSERT_TAIL(&mp->mnt_vnodelist, vp, v_mntvnodes);
        if ((vp->v_mount != mp) || (vp == skipvp)) {
            continue;
        }
        vid = vp->v_id;
        mount_unlock(mp);
        vnode_lock(vp);

        if ((vp->v_id != vid) || ((vp->v_lflag & (VL_DEAD | VL_TERMINATE)))) {
            vnode_unlock(vp);
            mount_lock(mp);
            continue;
        }

        /*
         * If requested, skip over vnodes marked VSYSTEM.
         * Skip over all vnodes marked VNOFLUSH.
         */
        if ((flags & SKIPSYSTEM) && ((vp->v_flag & VSYSTEM) ||
            (vp->v_flag & VNOFLUSH))) {
            vnode_unlock(vp);
            mount_lock(mp);
            continue;
        }
        /*
         * If requested, skip over vnodes marked VSWAP.
         */
        if ((flags & SKIPSWAP) && (vp->v_flag & VSWAP)) {
            vnode_unlock(vp);
            mount_lock(mp);
            continue;
        }
        /*
         * If requested, skip over vnodes marked VROOT.
         */
        if ((flags & SKIPROOT) && (vp->v_flag & VROOT)) {
            vnode_unlock(vp);
            mount_lock(mp);
            continue;
        }
        /*
         * If WRITECLOSE is set, only flush out regular file
         * vnodes open for writing.
         */
        if ((flags & WRITECLOSE) &&
            (vp->v_writecount == 0 || vp->v_type != VREG)) {
            vnode_unlock(vp);
            mount_lock(mp);
            continue;
        }
        /*
         * If the real usecount is 0, all we need to do is clear
         * out the vnode data structures and we are done.
         */
        if (((vp->v_usecount == 0) ||
            ((vp->v_usecount - vp->v_kusecount) == 0))) {
            vp->v_iocount++;    /* so that drain waits for other iocounts */
#ifdef JOE_DEBUG
            record_vp(vp, 1);
#endif
            vnode_reclaim_internal(vp, 1, 0);
            vnode_dropiocount(vp, 1);
            vnode_list_add(vp);

            vnode_unlock(vp);
            reclaimed++;
            mount_lock(mp);
            continue;
        }
        /*
         * If FORCECLOSE is set, forcibly close the vnode.
         * For block or character devices, revert to an
         * anonymous device. For all other files, just kill them.
         */
        if (flags & FORCECLOSE) {
            if (vp->v_type != VBLK && vp->v_type != VCHR) {
                vp->v_iocount++;    /* so that drain waits for other iocounts */
#ifdef JOE_DEBUG
                record_vp(vp, 1);
#endif
                vnode_reclaim_internal(vp, 1, 0);
                vnode_dropiocount(vp, 1);
                vnode_list_add(vp);
                vnode_unlock(vp);
            } else {
                vclean(vp, 0, p);
                vp->v_lflag &= ~VL_DEAD;
                vp->v_op = spec_vnodeop_p;
                vnode_unlock(vp);
            }
            mount_lock(mp);
            continue;
        }
#if DIAGNOSTIC
        if (busyprt)
            vprint("vflush: busy vnode", vp);
#endif
        vnode_unlock(vp);
        mount_lock(mp);
        busy++;
    }

    /* At this point the worker queue is completed */
    if (busy && ((flags & FORCECLOSE) == 0) && reclaimed) {
        busy = 0;
        reclaimed = 0;
        (void)vnode_iterate_reloadq(mp);
        /* returned with mount lock held */
        goto loop;
    }

    /* if new vnodes were created in between, retry the reclaim */
    if (vnode_iterate_reloadq(mp) != 0) {
        if (!(busy && ((flags & FORCECLOSE) == 0)))
            goto loop;
    }
    vnode_iterate_clear(mp);
    mount_unlock(mp);

    if (busy && ((flags & FORCECLOSE) == 0))
        return (EBUSY);
    return (0);
}
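
/*
 * Example (illustrative sketch, not part of the original source): an
 * unmount path would typically flush everything except the covered device
 * vnode, adding FORCECLOSE for a forced unmount. example_unmount_flush()
 * and the exact flag mix shown here are hypothetical.
 */
#if 0   /* example only */
static int
example_unmount_flush(mount_t mp, vnode_t devvp, int forced)
{
    int flags = SKIPSWAP | SKIPSYSTEM;

    if (forced)
        flags |= FORCECLOSE;

    /* returns EBUSY if any vnode (other than devvp) is still in use */
    return (vflush(mp, devvp, flags));
}
#endif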

int num_recycledvnodes = 0;
/*
 * Disassociate the underlying file system from a vnode.
 * The vnode lock is held on entry.
 */
static void
vclean(vnode_t vp, int flags, proc_t p)
{
    struct vfs_context context;
    int active;
    int need_inactive;
    int already_terminating;
    kauth_cred_t ucred = NULL;

    context.vc_proc = p;
    context.vc_ucred = kauth_cred_get();

    /*
     * Check to see if the vnode is in use.
     * If so we have to reference it before we clean it out
     * so that its count cannot fall to zero and generate a
     * race against ourselves to recycle it.
     */
    active = vp->v_usecount;

    /*
     * just in case we missed sending a needed
     * VNOP_INACTIVE, we'll do it now
     */
    need_inactive = (vp->v_lflag & VL_NEEDINACTIVE);

    vp->v_lflag &= ~VL_NEEDINACTIVE;

    /*
     * Prevent the vnode from being recycled or
     * brought into use while we clean it out.
     */
    already_terminating = (vp->v_lflag & VL_TERMINATE);

    vp->v_lflag |= VL_TERMINATE;

    /*
     * remove the vnode from any mount list
     * it might be on...
     */
    insmntque(vp, (struct mount *)0);

    ucred = vp->v_cred;
    vp->v_cred = NULL;

    vnode_unlock(vp);

    if (ucred)
        kauth_cred_rele(ucred);

    OSAddAtomic(1, &num_recycledvnodes);
    /*
     * purge from the name cache as early as possible...
     */
    cache_purge(vp);

    if (active && (flags & DOCLOSE))
        VNOP_CLOSE(vp, IO_NDELAY, &context);

    /*
     * Clean out any buffers associated with the vnode.
     */
    if (flags & DOCLOSE) {
#if NFSCLIENT
        if (vp->v_tag == VT_NFS)
            nfs_vinvalbuf(vp, V_SAVE, NOCRED, p, 0);
        else
#endif
        {
            VNOP_FSYNC(vp, MNT_WAIT, &context);
            buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0);
        }
        if (UBCINFOEXISTS(vp))
            /*
             * Clean the pages in VM.
             */
            (void)ubc_sync_range(vp, (off_t)0, ubc_getsize(vp), UBC_PUSHALL);
    }
    if (UBCINFOEXISTS(vp))
        cluster_release(vp->v_ubcinfo);

    if (active || need_inactive)
        VNOP_INACTIVE(vp, &context);

    /* Destroy ubc named reference */
    ubc_destroy_named(vp);

    /*
     * Reclaim the vnode.
     */
    if (VNOP_RECLAIM(vp, &context))
        panic("vclean: cannot reclaim");

    // make sure the name & parent ptrs get cleaned out!
    vnode_update_identity(vp, NULLVP, NULL, 0, 0, VNODE_UPDATE_PARENT | VNODE_UPDATE_NAME);

    vnode_lock(vp);

    vp->v_mount = dead_mountp;
    vp->v_op = dead_vnodeop_p;
    vp->v_tag = VT_NON;
    vp->v_data = NULL;

    vp->v_lflag |= VL_DEAD;

    if (already_terminating == 0) {
        vp->v_lflag &= ~VL_TERMINATE;
        /*
         * Done with purge, notify sleepers of the grim news.
         */
        if (vp->v_lflag & VL_TERMWANT) {
            vp->v_lflag &= ~VL_TERMWANT;
            wakeup(&vp->v_lflag);
        }
    }
}

/*
 * Eliminate all activity associated with the requested vnode
 * and with all vnodes aliased to the requested vnode.
 */
int
vn_revoke(vnode_t vp, int flags, __unused vfs_context_t a_context)
{
    struct vnode *vq;
    int vid;

#if DIAGNOSTIC
    if ((flags & REVOKEALL) == 0)
        panic("vnop_revoke");
#endif

    if (vp->v_flag & VALIASED) {
        /*
         * If a vgone (or vclean) is already in progress,
         * wait until it is done and return.
         */
        vnode_lock(vp);
        if (vp->v_lflag & VL_TERMINATE) {
            vnode_unlock(vp);
            return (ENOENT);
        }
        vnode_unlock(vp);
        /*
         * Ensure that vp will not be vgone'd while we
         * are eliminating its aliases.
         */
        SPECHASH_LOCK();
        while (vp->v_flag & VALIASED) {
            for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
                if (vq->v_rdev != vp->v_rdev ||
                    vq->v_type != vp->v_type || vp == vq)
                    continue;
                vid = vq->v_id;
                SPECHASH_UNLOCK();
                if (vnode_getwithvid(vq, vid)) {
                    SPECHASH_LOCK();
                    break;
                }
                vnode_reclaim_internal(vq, 0, 0);
                vnode_put(vq);
                SPECHASH_LOCK();
                break;
            }
        }
        SPECHASH_UNLOCK();
    }
    vnode_reclaim_internal(vp, 0, 0);

    return (0);
}

/*
 * Recycle an unused vnode to the front of the free list.
 * Release the passed interlock if the vnode will be recycled.
 */
int
vnode_recycle(vp)
    struct vnode *vp;
{
    vnode_lock(vp);

    if (vp->v_iocount || vp->v_usecount) {
        vp->v_lflag |= VL_MARKTERM;
        vnode_unlock(vp);
        return (0);
    }
    vnode_reclaim_internal(vp, 1, 0);
    vnode_unlock(vp);

    return (1);
}

static int
vnode_reload(vnode_t vp)
{
    vnode_lock(vp);

    if ((vp->v_iocount > 1) || vp->v_usecount) {
        vnode_unlock(vp);
        return (0);
    }
    if (vp->v_iocount <= 0)
        panic("vnode_reload with no iocount %d", vp->v_iocount);

    /* mark for release when iocount is dropped */
    vp->v_lflag |= VL_MARKTERM;
    vnode_unlock(vp);

    return (1);
}


static void
vgone(vnode_t vp)
{
    struct vnode *vq;
    struct vnode *vx;

    /*
     * Clean out the filesystem specific data.
     * vclean also takes care of removing the
     * vnode from any mount list it might be on
     */
    vclean(vp, DOCLOSE, current_proc());

    /*
     * If special device, remove it from special device alias list
     * if it is on one.
     */
    if ((vp->v_type == VBLK || vp->v_type == VCHR) && vp->v_specinfo != 0) {
        SPECHASH_LOCK();
        if (*vp->v_hashchain == vp) {
            *vp->v_hashchain = vp->v_specnext;
        } else {
            for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
                if (vq->v_specnext != vp)
                    continue;
                vq->v_specnext = vp->v_specnext;
                break;
            }
            if (vq == NULL)
                panic("missing bdev");
        }
        if (vp->v_flag & VALIASED) {
            vx = NULL;
            for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
                if (vq->v_rdev != vp->v_rdev ||
                    vq->v_type != vp->v_type)
                    continue;
                if (vx)
                    break;
                vx = vq;
            }
            if (vx == NULL)
                panic("missing alias");
            if (vq == NULL)
                vx->v_flag &= ~VALIASED;
            vp->v_flag &= ~VALIASED;
        }
        SPECHASH_UNLOCK();
        {
            struct specinfo *tmp = vp->v_specinfo;
            vp->v_specinfo = NULL;
            FREE_ZONE((void *)tmp, sizeof(struct specinfo), M_SPECINFO);
        }
    }
}

/*
 * Lookup a vnode by device number.
 */
int
check_mountedon(dev_t dev, enum vtype type, int *errorp)
{
    vnode_t vp;
    int rc = 0;
    int vid;

loop:
    SPECHASH_LOCK();
    for (vp = speclisth[SPECHASH(dev)]; vp; vp = vp->v_specnext) {
        if (dev != vp->v_rdev || type != vp->v_type)
            continue;
        vid = vp->v_id;
        SPECHASH_UNLOCK();
        if (vnode_getwithvid(vp, vid))
            goto loop;
        vnode_lock(vp);
        if ((vp->v_usecount > 0) || (vp->v_iocount > 1)) {
            vnode_unlock(vp);
            if ((*errorp = vfs_mountedon(vp)) != 0)
                rc = 1;
        } else
            vnode_unlock(vp);
        vnode_put(vp);
        return (rc);
    }
    SPECHASH_UNLOCK();
    return (0);
}

/*
 * Calculate the total number of references to a special device.
 */
int
vcount(vnode_t vp)
{
    vnode_t vq, vnext;
    int count;
    int vid;

loop:
    if ((vp->v_flag & VALIASED) == 0)
        return (vp->v_usecount - vp->v_kusecount);

    SPECHASH_LOCK();
    for (count = 0, vq = *vp->v_hashchain; vq; vq = vnext) {
        vnext = vq->v_specnext;
        if (vq->v_rdev != vp->v_rdev || vq->v_type != vp->v_type)
            continue;
        vid = vq->v_id;
        SPECHASH_UNLOCK();

        if (vnode_getwithvid(vq, vid)) {
            goto loop;
        }
        /*
         * Alias, but not in use, so flush it out.
         */
        vnode_lock(vq);
        if ((vq->v_usecount == 0) && (vq->v_iocount == 1) && vq != vp) {
            vnode_reclaim_internal(vq, 1, 0);
            vnode_unlock(vq);
            vnode_put(vq);
            goto loop;
        }
        count += (vq->v_usecount - vq->v_kusecount);
        vnode_unlock(vq);
        vnode_put(vq);

        SPECHASH_LOCK();
    }
    SPECHASH_UNLOCK();

    return (count);
}

int prtactive = 0;    /* 1 => print out reclaim of active vnodes */


/*
 * Print out a description of a vnode.
 */
static char *typename[] =
    { "VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD" };

void
vprint(const char *label, struct vnode *vp)
{
    char sbuf[64];

    if (label != NULL)
        printf("%s: ", label);
    printf("type %s, usecount %d, writecount %d",
        typename[vp->v_type], vp->v_usecount, vp->v_writecount);
    sbuf[0] = '\0';
    if (vp->v_flag & VROOT)
        strcat(sbuf, "|VROOT");
    if (vp->v_flag & VTEXT)
        strcat(sbuf, "|VTEXT");
    if (vp->v_flag & VSYSTEM)
        strcat(sbuf, "|VSYSTEM");
    if (vp->v_flag & VNOFLUSH)
        strcat(sbuf, "|VNOFLUSH");
    if (vp->v_flag & VBWAIT)
        strcat(sbuf, "|VBWAIT");
    if (vp->v_flag & VALIASED)
        strcat(sbuf, "|VALIASED");
    if (sbuf[0] != '\0')
        printf(" flags (%s)", &sbuf[1]);
}


int
vn_getpath(struct vnode *vp, char *pathbuf, int *len)
{
    return build_path(vp, pathbuf, *len, len);
}
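
/*
 * Example (illustrative sketch, not part of the original source): callers
 * pass vn_getpath() a buffer with *len set to the buffer size; on success
 * *len comes back as the length actually used. example_log_path() is a
 * hypothetical helper.
 */
#if 0   /* example only */
static void
example_log_path(vnode_t vp)
{
    char path[MAXPATHLEN];
    int  len = sizeof(path);

    if (vn_getpath(vp, path, &len) == 0)
        printf("vnode path: %s (len %d)\n", path, len);
}
#endif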


static char *extension_table = NULL;
static int nexts;
static int max_ext_width;

static int
extension_cmp(void *a, void *b)
{
    return (strlen((char *)a) - strlen((char *)b));
}


//
// This is the api LaunchServices uses to inform the kernel
// of the list of package extensions to ignore.
//
// Internally we keep the list sorted by the length of the
// extension (from longest to shortest). We sort the
// list of extensions so that we can speed up our searches
// when comparing file names -- we only compare extensions
// that could possibly fit into the file name, not all of
// them (i.e. a short 8 character name can't have an 8
// character extension).
//
__private_extern__ int
set_package_extensions_table(void *data, int nentries, int maxwidth)
{
    char *new_exts, *ptr;
    int error, i, len;

    if (nentries <= 0 || nentries > 1024 || maxwidth <= 0 || maxwidth > 255) {
        return EINVAL;
    }

    MALLOC(new_exts, char *, nentries * maxwidth, M_TEMP, M_WAITOK);

    error = copyin(CAST_USER_ADDR_T(data), new_exts, nentries * maxwidth);
    if (error) {
        FREE(new_exts, M_TEMP);
        return error;
    }

    if (extension_table) {
        FREE(extension_table, M_TEMP);
    }
    extension_table = new_exts;
    nexts = nentries;
    max_ext_width = maxwidth;

    qsort(extension_table, nexts, maxwidth, extension_cmp);

    return 0;
}


__private_extern__ int
is_package_name(char *name, int len)
{
    int i, extlen;
    char *ptr, *name_ext;

    if (len <= 3) {
        return 0;
    }

    name_ext = NULL;
    for (ptr = name; *ptr != '\0'; ptr++) {
        if (*ptr == '.') {
            name_ext = ptr;
        }
    }

    // if there is no "." extension, it can't match
    if (name_ext == NULL) {
        return 0;
    }

    // advance over the "."
    name_ext++;

    // now iterate over all the extensions to see if any match
    ptr = &extension_table[0];
    for (i = 0; i < nexts; i++, ptr += max_ext_width) {
        extlen = strlen(ptr);
        if (strncasecmp(name_ext, ptr, extlen) == 0 && name_ext[extlen] == '\0') {
            // aha, a match!
            return 1;
        }
    }

    // if we get here, no extension matched
    return 0;
}
2153
2154 int
2155 vn_path_package_check(__unused vnode_t vp, char *path, int pathlen, int *component)
2156 {
2157 char *ptr, *end;
2158 int comp=0;
2159
2160 *component = -1;
2161 if (*path != '/') {
2162 return EINVAL;
2163 }
2164
2165 end = path + 1;
2166 while(end < path + pathlen && *end != '\0') {
2167 while(end < path + pathlen && *end == '/' && *end != '\0') {
2168 end++;
2169 }
2170
2171 ptr = end;
2172
2173 while(end < path + pathlen && *end != '/' && *end != '\0') {
2174 end++;
2175 }
2176
2177 if (end > path + pathlen) {
2178 // hmm, string wasn't null terminated
2179 return EINVAL;
2180 }
2181
2182 *end = '\0';
2183 if (is_package_name(ptr, end - ptr)) {
2184 *component = comp;
2185 break;
2186 }
2187
2188 end++;
2189 comp++;
2190 }
2191
2192 return 0;
2193 }
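
/*
 * Example (illustrative only): for the path "/Users/me/Test.app/Contents",
 * and assuming ".app" is in the extension table, vn_path_package_check()
 * reports component 2 ("Test.app") as the first package-named component.
 * Note that it writes NUL terminators into the path buffer as it walks it,
 * so the buffer must be writable. A hypothetical caller:
 */
#if 0	/* sketch, not compiled */
static void
example_package_check(vnode_t vp, char *path, int pathlen)
{
	int component;

	if (vn_path_package_check(vp, path, pathlen, &component) == 0 &&
	    component >= 0)
		printf("component %d is a package\n", component);
}
#endif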
2194
2195
2196 /*
2197 * Top level filesystem related information gathering.
2198 */
2199 extern unsigned int vfs_nummntops;
2200
2201 int
2202 vfs_sysctl(int *name, u_int namelen, user_addr_t oldp, size_t *oldlenp,
2203 user_addr_t newp, size_t newlen, struct proc *p)
2204 {
2205 struct vfstable *vfsp;
2206 int *username;
2207 u_int usernamelen;
2208 int error;
2209 struct vfsconf *vfsc;
2210
2211 /*
2212 * The VFS_NUMMNTOPS shouldn't be at name[0] since
2213 * it is a VFS generic variable. So now we must check
2214 * namelen so we don't end up covering any UFS
2215 * variables (since UFS vfc_typenum is 1).
2216 *
2217 * It should have been:
2218 * name[0]: VFS_GENERIC
2219 * name[1]: VFS_NUMMNTOPS
2220 */
2221 if (namelen == 1 && name[0] == VFS_NUMMNTOPS) {
2222 return (sysctl_rdint(oldp, oldlenp, newp, vfs_nummntops));
2223 }
2224
2225 /* all sysctl names at this level are at least name and field */
2226 if (namelen < 2)
2227 return (EISDIR); /* overloaded */
2228 if (name[0] != VFS_GENERIC) {
2229 struct vfs_context context;
2230
2231 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
2232 if (vfsp->vfc_typenum == name[0])
2233 break;
2234 if (vfsp == NULL)
2235 return (ENOTSUP);
2236 context.vc_proc = p;
2237 context.vc_ucred = kauth_cred_get();
2238
2239 return ((*vfsp->vfc_vfsops->vfs_sysctl)(&name[1], namelen - 1,
2240 oldp, oldlenp, newp, newlen, &context));
2241 }
2242 switch (name[1]) {
2243 case VFS_MAXTYPENUM:
2244 return (sysctl_rdint(oldp, oldlenp, newp, maxvfsconf));
2245 case VFS_CONF:
2246 if (namelen < 3)
2247 return (ENOTDIR); /* overloaded */
2248 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
2249 if (vfsp->vfc_typenum == name[2])
2250 break;
2251 if (vfsp == NULL)
2252 return (ENOTSUP);
2253 vfsc = (struct vfsconf *)vfsp;
2254 if (proc_is64bit(p)) {
2255 struct user_vfsconf usr_vfsc;
2256 usr_vfsc.vfc_vfsops = CAST_USER_ADDR_T(vfsc->vfc_vfsops);
2257 bcopy(vfsc->vfc_name, usr_vfsc.vfc_name, sizeof(usr_vfsc.vfc_name));
2258 usr_vfsc.vfc_typenum = vfsc->vfc_typenum;
2259 usr_vfsc.vfc_refcount = vfsc->vfc_refcount;
2260 usr_vfsc.vfc_flags = vfsc->vfc_flags;
2261 usr_vfsc.vfc_mountroot = CAST_USER_ADDR_T(vfsc->vfc_mountroot);
2262 usr_vfsc.vfc_next = CAST_USER_ADDR_T(vfsc->vfc_next);
2263 return (sysctl_rdstruct(oldp, oldlenp, newp, &usr_vfsc,
2264 sizeof(usr_vfsc)));
2265 }
2266 else {
2267 return (sysctl_rdstruct(oldp, oldlenp, newp, vfsc,
2268 sizeof(struct vfsconf)));
2269 }
2270
2271 case VFS_SET_PACKAGE_EXTS:
2272 return set_package_extensions_table((void *)name[1], name[2], name[3]);
2273 }
2274 /*
2275 * We need to get back into the general MIB, so we need to re-prepend
2276 * CTL_VFS to our name and try userland_sysctl().
2277 */
2278 usernamelen = namelen + 1;
2279 MALLOC(username, int *, usernamelen * sizeof(*username),
2280 M_TEMP, M_WAITOK);
2281 bcopy(name, username + 1, namelen * sizeof(*name));
2282 username[0] = CTL_VFS;
2283 error = userland_sysctl(p, username, usernamelen, oldp,
2284 oldlenp, 1, newp, newlen, oldlenp);
2285 FREE(username, M_TEMP);
2286 return (error);
2287 }
2288
2289 int kinfo_vdebug = 1;
2290 #define KINFO_VNODESLOP 10
2291 /*
2292 * Dump vnode list (via sysctl).
2293 * Copyout address of vnode followed by vnode.
2294 */
2295 /* ARGSUSED */
2296 int
2297 sysctl_vnode(__unused user_addr_t where, __unused size_t *sizep)
2298 {
2299 #if 0
2300 struct mount *mp, *nmp;
2301 struct vnode *nvp, *vp;
2302 char *bp = where, *savebp;
2303 char *ewhere;
2304 int error;
2305
2306 #define VPTRSZ sizeof (struct vnode *)
2307 #define VNODESZ sizeof (struct vnode)
2308 if (where == NULL) {
2309 *sizep = (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ);
2310 return (0);
2311 }
2312 ewhere = where + *sizep;
2313
2314 for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) {
2315 if (vfs_busy(mp, LK_NOWAIT)) {
2316 nmp = mp->mnt_list.cqe_next;
2317 continue;
2318 }
2319 savebp = bp;
2320 again:
2321 TAILQ_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) {
2322 /*
2323 * Check that the vp is still associated with
2324 * this filesystem. RACE: could have been
2325 * recycled onto the same filesystem.
2326 */
2327 if (vp->v_mount != mp) {
2328 if (kinfo_vdebug)
2329 printf("kinfo: vp changed\n");
2330 bp = savebp;
2331 goto again;
2332 }
2333 if (bp + VPTRSZ + VNODESZ > ewhere) {
2334 vfs_unbusy(mp);
2335 *sizep = bp - where;
2336 return (ENOMEM);
2337 }
2338 if ((error = copyout((caddr_t)&vp, bp, VPTRSZ)) ||
2339 (error = copyout((caddr_t)vp, bp + VPTRSZ, VNODESZ))) {
2340 vfs_unbusy(mp);
2341 return (error);
2342 }
2343 bp += VPTRSZ + VNODESZ;
2344 }
2345 nmp = mp->mnt_list.cqe_next;
2346 vfs_unbusy(mp);
2347 }
2348
2349 *sizep = bp - where;
2350 return (0);
2351 #else
2352 return(EINVAL);
2353 #endif
2354 }
2355
2356 /*
2357 * Check to see if a filesystem is mounted on a block device.
2358 */
2359 int
2360 vfs_mountedon(struct vnode *vp)
2362 {
2363 struct vnode *vq;
2364 int error = 0;
2365
2366 SPECHASH_LOCK();
2367 if (vp->v_specflags & SI_MOUNTEDON) {
2368 error = EBUSY;
2369 goto out;
2370 }
2371 if (vp->v_flag & VALIASED) {
2372 for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
2373 if (vq->v_rdev != vp->v_rdev ||
2374 vq->v_type != vp->v_type)
2375 continue;
2376 if (vq->v_specflags & SI_MOUNTEDON) {
2377 error = EBUSY;
2378 break;
2379 }
2380 }
2381 }
2382 out:
2383 SPECHASH_UNLOCK();
2384 return (error);
2385 }
2386
2387 /*
2388 * Unmount all filesystems. The list is traversed in reverse order
2389 * of mounting to avoid dependencies.
2390 */
2391 __private_extern__ void
2392 vfs_unmountall()
2393 {
2394 struct mount *mp;
2395 struct proc *p = current_proc();
2396 int error;
2397
2398 /*
2399 * Since this only runs when rebooting, it is not interlocked.
2400 */
2401 mount_list_lock();
2402 while(!TAILQ_EMPTY(&mountlist)) {
2403 mp = TAILQ_LAST(&mountlist, mntlist);
2404 mount_list_unlock();
2405 error = dounmount(mp, MNT_FORCE, p);
2406 if (error) {
2407 mount_list_lock();
2408 TAILQ_REMOVE(&mountlist, mp, mnt_list);
2409 printf("unmount of %s failed (", mp->mnt_vfsstat.f_mntonname);
2410 if (error == EBUSY)
2411 printf("BUSY)\n");
2412 else
2413 printf("%d)\n", error);
2414 continue;
2415 }
2416 mount_list_lock();
2417 }
2418 mount_list_unlock();
2419 }
2420
2421
2422 /*
2423 * This routine is called from vnode_pager_no_senders(),
2424 * which in turn can be called with the vnode locked by vnode_uncache().
2425 * But it could also get called as a result of vm_object_cache_trim(),
2426 * in which case the lock state is unknown.
2427 * AGE the vnode so that it gets recycled quickly.
2428 */
2429 __private_extern__ void
2430 vnode_pager_vrele(struct vnode *vp)
2431 {
2432 vnode_lock(vp);
2433
2434 if (!ISSET(vp->v_lflag, VL_TERMINATE))
2435 panic("vnode_pager_vrele: vp not in termination");
2436 vp->v_lflag &= ~VNAMED_UBC;
2437
2438 if (UBCINFOEXISTS(vp)) {
2439 struct ubc_info *uip = vp->v_ubcinfo;
2440
2441 if (ISSET(uip->ui_flags, UI_WASMAPPED))
2442 SET(vp->v_flag, VWASMAPPED);
2443 vp->v_ubcinfo = UBC_INFO_NULL;
2444
2445 ubc_info_deallocate(uip);
2446 } else {
2447 panic("NO ubcinfo in vnode_pager_vrele");
2448 }
2449 vnode_unlock(vp);
2450
2451 wakeup(&vp->v_lflag);
2452 }
2453
2454
2455 #include <sys/disk.h>
2456
2457 errno_t
2458 vfs_init_io_attributes(vnode_t devvp, mount_t mp)
2459 {
2460 int error;
2461 off_t readblockcnt;
2462 off_t writeblockcnt;
2463 off_t readmaxcnt;
2464 off_t writemaxcnt;
2465 off_t readsegcnt;
2466 off_t writesegcnt;
2467 off_t readsegsize;
2468 off_t writesegsize;
2469 u_long blksize;
2470 u_int64_t temp;
2471 struct vfs_context context;
2472
2473 proc_t p = current_proc();
2474
2475 context.vc_proc = p;
2476 context.vc_ucred = kauth_cred_get();
2477
2478 int isvirtual = 0;
2479 /*
2480 * determine if this mount point exists on the same device as the root
2481 * partition... if so, then it comes under the hard throttle control
2482 */
2483 int thisunit = -1;
2484 static int rootunit = -1;
2485
2486 if (rootunit == -1) {
2487 if (VNOP_IOCTL(rootvp, DKIOCGETBSDUNIT, (caddr_t)&rootunit, 0, &context))
2488 rootunit = -1;
2489 else if (rootvp == devvp)
2490 mp->mnt_kern_flag |= MNTK_ROOTDEV;
2491 }
2492 if (devvp != rootvp && rootunit != -1) {
2493 if (VNOP_IOCTL(devvp, DKIOCGETBSDUNIT, (caddr_t)&thisunit, 0, &context) == 0) {
2494 if (thisunit == rootunit)
2495 mp->mnt_kern_flag |= MNTK_ROOTDEV;
2496 }
2497 }
2498 /*
2499 * force the spec device to re-cache
2500 * the underlying block size in case
2501 * the filesystem overrode the initial value
2502 */
2503 set_fsblocksize(devvp);
2504
2505
2506 if ((error = VNOP_IOCTL(devvp, DKIOCGETBLOCKSIZE,
2507 (caddr_t)&blksize, 0, &context)))
2508 return (error);
2509
2510 mp->mnt_devblocksize = blksize;
2511
2512 if (VNOP_IOCTL(devvp, DKIOCISVIRTUAL, (caddr_t)&isvirtual, 0, &context) == 0) {
2513 if (isvirtual)
2514 mp->mnt_kern_flag |= MNTK_VIRTUALDEV;
2515 }
2516
2517 if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXBLOCKCOUNTREAD,
2518 (caddr_t)&readblockcnt, 0, &context)))
2519 return (error);
2520
2521 if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXBLOCKCOUNTWRITE,
2522 (caddr_t)&writeblockcnt, 0, &context)))
2523 return (error);
2524
2525 if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXBYTECOUNTREAD,
2526 (caddr_t)&readmaxcnt, 0, &context)))
2527 return (error);
2528
2529 if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXBYTECOUNTWRITE,
2530 (caddr_t)&writemaxcnt, 0, &context)))
2531 return (error);
2532
2533 if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXSEGMENTCOUNTREAD,
2534 (caddr_t)&readsegcnt, 0, &context)))
2535 return (error);
2536
2537 if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXSEGMENTCOUNTWRITE,
2538 (caddr_t)&writesegcnt, 0, &context)))
2539 return (error);
2540
2541 if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXSEGMENTBYTECOUNTREAD,
2542 (caddr_t)&readsegsize, 0, &context)))
2543 return (error);
2544
2545 if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXSEGMENTBYTECOUNTWRITE,
2546 (caddr_t)&writesegsize, 0, &context)))
2547 return (error);
2548
2549 if (readmaxcnt)
2550 temp = (readmaxcnt > UINT32_MAX) ? UINT32_MAX : readmaxcnt;
2551 else {
2552 if (readblockcnt) {
2553 temp = readblockcnt * blksize;
2554 temp = (temp > UINT32_MAX) ? UINT32_MAX : temp;
2555 } else
2556 temp = MAXPHYS;
2557 }
2558 mp->mnt_maxreadcnt = (u_int32_t)temp;
2559
2560 if (writemaxcnt)
2561 temp = (writemaxcnt > UINT32_MAX) ? UINT32_MAX : writemaxcnt;
2562 else {
2563 if (writeblockcnt) {
2564 temp = writeblockcnt * blksize;
2565 temp = (temp > UINT32_MAX) ? UINT32_MAX : temp;
2566 } else
2567 temp = MAXPHYS;
2568 }
2569 mp->mnt_maxwritecnt = (u_int32_t)temp;
2570
2571 if (readsegcnt) {
2572 temp = (readsegcnt > UINT16_MAX) ? UINT16_MAX : readsegcnt;
2573 mp->mnt_segreadcnt = (u_int16_t)temp;
2574 }
2575 if (writesegcnt) {
2576 temp = (writesegcnt > UINT16_MAX) ? UINT16_MAX : writesegcnt;
2577 mp->mnt_segwritecnt = (u_int16_t)temp;
2578 }
2579 if (readsegsize)
2580 temp = (readsegsize > UINT32_MAX) ? UINT32_MAX : readsegsize;
2581 else
2582 temp = mp->mnt_maxreadcnt;
2583 mp->mnt_maxsegreadsize = (u_int32_t)temp;
2584
2585 if (writesegsize)
2586 temp = (writesegsize > UINT32_MAX) ? UINT32_MAX : writesegsize;
2587 else
2588 temp = mp->mnt_maxwritecnt;
2589 mp->mnt_maxsegwritesize = (u_int32_t)temp;
2590
2591 return (error);
2592 }
2593
2594 static struct klist fs_klist;
2595
2596 void
2597 vfs_event_init(void)
2598 {
2599
2600 klist_init(&fs_klist);
2601 }
2602
2603 void
2604 vfs_event_signal(__unused fsid_t *fsid, u_int32_t event, __unused intptr_t data)
2605 {
2606
2607 KNOTE(&fs_klist, event);
2608 }
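
/*
 * Example (illustrative only): a filesystem that notices free space
 * running low can post a VQ_LOWDISK event, waking any EVFILT_FS kqueue
 * listeners attached through fs_filtops below:
 *
 *	vfs_event_signal(&mp->mnt_vfsstat.f_fsid, VQ_LOWDISK, 0);
 */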
2609
2610 /*
2611 * return the number of mounted filesystems.
2612 */
2613 static int
2614 sysctl_vfs_getvfscnt(void)
2615 {
2616 return(mount_getvfscnt());
2617 }
2618
2619
2620 static int
2621 mount_getvfscnt(void)
2622 {
2623 int ret;
2624
2625 mount_list_lock();
2626 ret = nummounts;
2627 mount_list_unlock();
2628 return (ret);
2629
2630 }
2631
2632
2633
2634 static int
2635 mount_fillfsids(fsid_t *fsidlst, int count)
2636 {
2637 struct mount *mp;
2638 int actual;
2639
2640 actual = 0;
2641 mount_list_lock();
2642 TAILQ_FOREACH(mp, &mountlist, mnt_list) {
2643 if (actual < count) {	/* don't run past the end of fsidlst */
2644 fsidlst[actual] = mp->mnt_vfsstat.f_fsid;
2645 actual++;
2646 }
2647 }
2648 mount_list_unlock();
2649 return (actual);
2650
2651 }
2652
2653 /*
2654 * Fill in the array of fsid_t's up to a max of 'count'; the actual
2655 * number filled in is returned in '*actual'. If there are more fsid_t's
2656 * than will fit in fsidlst, ENOMEM is returned and '*actual' still
2657 * holds the full count.
2658 * Callers depend on *actual being filled out even in the error case.
2659 */
2660 static int
2661 sysctl_vfs_getvfslist(fsid_t *fsidlst, int count, int *actual)
2662 {
2663 struct mount *mp;
2664
2665 *actual = 0;
2666 mount_list_lock();
2667 TAILQ_FOREACH(mp, &mountlist, mnt_list) {
2668 (*actual)++;
2669 if (*actual <= count)
2670 fsidlst[(*actual) - 1] = mp->mnt_vfsstat.f_fsid;
2671 }
2672 mount_list_unlock();
2673 return (*actual <= count ? 0 : ENOMEM);
2674 }
2675
2676 static int
2677 sysctl_vfs_vfslist SYSCTL_HANDLER_ARGS
2678 {
2679 int actual, error;
2680 size_t space;
2681 fsid_t *fsidlst;
2682
2683 /* This is a readonly node. */
2684 if (req->newptr != USER_ADDR_NULL)
2685 return (EPERM);
2686
2687 /* they are querying us so just return the space required. */
2688 if (req->oldptr == USER_ADDR_NULL) {
2689 req->oldidx = sysctl_vfs_getvfscnt() * sizeof(fsid_t);
2690 return 0;
2691 }
2692 again:
2693 /*
2694 * Retrieve an accurate count of the amount of space required to copy
2695 * out all the fsids in the system.
2696 */
2697 space = req->oldlen;
2698 req->oldlen = sysctl_vfs_getvfscnt() * sizeof(fsid_t);
2699
2700 /* they didn't give us enough space. */
2701 if (space < req->oldlen)
2702 return (ENOMEM);
2703
2704 MALLOC(fsidlst, fsid_t *, req->oldlen, M_TEMP, M_WAITOK);
2705 error = sysctl_vfs_getvfslist(fsidlst, req->oldlen / sizeof(fsid_t),
2706 &actual);
2707 /*
2708 * If we get back ENOMEM, then another mount has been added while we
2709 * slept in malloc above. If this is the case then try again.
2710 */
2711 if (error == ENOMEM) {
2712 FREE(fsidlst, M_TEMP);
2713 req->oldlen = space;
2714 goto again;
2715 }
2716 if (error == 0) {
2717 error = SYSCTL_OUT(req, fsidlst, actual * sizeof(fsid_t));
2718 }
2719 FREE(fsidlst, M_TEMP);
2720 return (error);
2721 }
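
/*
 * Example (illustrative only, userland): the handler above backs the
 * "vfs.generic.vfsidlist" node registered below; a user process can size
 * the buffer by passing a NULL old pointer first. The helper name is
 * hypothetical.
 */
#if 0	/* userland sketch, not part of the kernel build */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdlib.h>

static fsid_t *
example_get_fsidlist(size_t *countp)
{
	size_t len = 0;
	fsid_t *list;

	/* first call sizes the buffer, second call fills it */
	if (sysctlbyname("vfs.generic.vfsidlist", NULL, &len, NULL, 0) < 0)
		return (NULL);
	if ((list = malloc(len)) == NULL)
		return (NULL);
	if (sysctlbyname("vfs.generic.vfsidlist", list, &len, NULL, 0) < 0) {
		free(list);
		return (NULL);
	}
	*countp = len / sizeof(fsid_t);
	return (list);
}
#endif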
2722
2723 /*
2724 * Do a sysctl by fsid.
2725 */
2726 static int
2727 sysctl_vfs_ctlbyfsid SYSCTL_HANDLER_ARGS
2728 {
2729 struct vfsidctl vc;
2730 struct user_vfsidctl user_vc;
2731 struct mount *mp;
2732 struct vfsstatfs *sp;
2733 struct proc *p;
2734 int *name;
2735 int error, flags, namelen;
2736 struct vfs_context context;
2737 boolean_t is_64_bit;
2738
2739 name = arg1;
2740 namelen = arg2;
2741 p = req->p;
2742 context.vc_proc = p;
2743 context.vc_ucred = kauth_cred_get();
2744 is_64_bit = proc_is64bit(p);
2745
2746 if (is_64_bit) {
2747 error = SYSCTL_IN(req, &user_vc, sizeof(user_vc));
2748 if (error)
2749 return (error);
2750 if (user_vc.vc_vers != VFS_CTL_VERS1)
2751 return (EINVAL);
2752 mp = mount_list_lookupby_fsid(&user_vc.vc_fsid, 0, 0);
2753 }
2754 else {
2755 error = SYSCTL_IN(req, &vc, sizeof(vc));
2756 if (error)
2757 return (error);
2758 if (vc.vc_vers != VFS_CTL_VERS1)
2759 return (EINVAL);
2760 mp = mount_list_lookupby_fsid(&vc.vc_fsid, 0, 0);
2761 }
2762 if (mp == NULL)
2763 return (ENOENT);
2764 /* reset so that the fs specific code can fetch it. */
2765 req->newidx = 0;
2766 /*
2767 * Note if this is a VFS_CTL then we pass the actual sysctl req
2768 * in for "oldp" so that the lower layer can DTRT and use the
2769 * SYSCTL_IN/OUT routines.
2770 */
2771 if (mp->mnt_op->vfs_sysctl != NULL) {
2772 if (is_64_bit) {
2773 if (vfs_64bitready(mp)) {
2774 error = mp->mnt_op->vfs_sysctl(name, namelen,
2775 CAST_USER_ADDR_T(req),
2776 NULL, USER_ADDR_NULL, 0,
2777 &context);
2778 }
2779 else {
2780 error = ENOTSUP;
2781 }
2782 }
2783 else {
2784 error = mp->mnt_op->vfs_sysctl(name, namelen,
2785 CAST_USER_ADDR_T(req),
2786 NULL, USER_ADDR_NULL, 0,
2787 &context);
2788 }
2789 if (error != ENOTSUP)
2790 return (error);
2791 }
2792 switch (name[0]) {
2793 case VFS_CTL_UMOUNT:
2794 req->newidx = 0;
2795 if (is_64_bit) {
2796 req->newptr = user_vc.vc_ptr;
2797 req->newlen = (size_t)user_vc.vc_len;
2798 }
2799 else {
2800 req->newptr = CAST_USER_ADDR_T(vc.vc_ptr);
2801 req->newlen = vc.vc_len;
2802 }
2803 error = SYSCTL_IN(req, &flags, sizeof(flags));
2804 if (error)
2805 break;
2806 error = safedounmount(mp, flags, p);
2807 break;
2808 case VFS_CTL_STATFS:
2809 req->newidx = 0;
2810 if (is_64_bit) {
2811 req->newptr = user_vc.vc_ptr;
2812 req->newlen = (size_t)user_vc.vc_len;
2813 }
2814 else {
2815 req->newptr = CAST_USER_ADDR_T(vc.vc_ptr);
2816 req->newlen = vc.vc_len;
2817 }
2818 error = SYSCTL_IN(req, &flags, sizeof(flags));
2819 if (error)
2820 break;
2821 sp = &mp->mnt_vfsstat;
2822 if (((flags & MNT_NOWAIT) == 0 || (flags & MNT_WAIT)) &&
2823 (error = vfs_update_vfsstat(mp, &context)))
2824 return (error);
2825 if (is_64_bit) {
2826 struct user_statfs sfs;
2827 bzero(&sfs, sizeof(sfs));
2828 sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
2829 sfs.f_type = mp->mnt_vtable->vfc_typenum;
2830 sfs.f_bsize = (user_long_t)sp->f_bsize;
2831 sfs.f_iosize = (user_long_t)sp->f_iosize;
2832 sfs.f_blocks = (user_long_t)sp->f_blocks;
2833 sfs.f_bfree = (user_long_t)sp->f_bfree;
2834 sfs.f_bavail = (user_long_t)sp->f_bavail;
2835 sfs.f_files = (user_long_t)sp->f_files;
2836 sfs.f_ffree = (user_long_t)sp->f_ffree;
2837 sfs.f_fsid = sp->f_fsid;
2838 sfs.f_owner = sp->f_owner;
2839
2840 strncpy(sfs.f_fstypename, sp->f_fstypename, MFSNAMELEN-1);
2841 strncpy(sfs.f_mntonname, sp->f_mntonname, MNAMELEN-1);
2842 strncpy(sfs.f_mntfromname, sp->f_mntfromname, MNAMELEN-1);
2843
2844 error = SYSCTL_OUT(req, &sfs, sizeof(sfs));
2845 }
2846 else {
2847 struct statfs sfs;
2848 bzero(&sfs, sizeof(struct statfs));
2849 sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
2850 sfs.f_type = mp->mnt_vtable->vfc_typenum;
2851
2852 /*
2853 * It's possible for there to be more than 2^31 blocks in the filesystem, so we
2854 * have to fudge the numbers here in that case. We inflate the blocksize in order
2855 * to reflect the filesystem size as best we can.
2856 */
2857 if (sp->f_blocks > LONG_MAX) {
2858 int shift;
2859
2860 /*
2861 * Work out how far we have to shift the block count down to make it fit.
2862 * Note that it's possible to have to shift so far that the resulting
2863 * blocksize would be unreportably large. At that point, we will clip
2864 * any values that don't fit.
2865 *
2866 * For safety's sake, we also ensure that f_iosize is never reported as
2867 * being smaller than f_bsize.
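 *
 * Hypothetical worked example: with f_bsize = 4096 and
 * f_blocks = 6442450944 (3 * 2^31), the loop below stops at shift = 2,
 * so we report f_bsize = 16384 and f_blocks = 1610612736 (3 * 2^29);
 * the product, i.e. the total size in bytes, is unchanged.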
2868 */
2869 for (shift = 0; shift < 32; shift++) {
2870 if ((sp->f_blocks >> shift) <= LONG_MAX)
2871 break;
2872 if ((sp->f_bsize << (shift + 1)) > LONG_MAX)
2873 break;
2874 }
2875 #define __SHIFT_OR_CLIP(x, s) ((((x) >> (s)) > LONG_MAX) ? LONG_MAX : ((x) >> (s)))
2876 sfs.f_blocks = (long)__SHIFT_OR_CLIP(sp->f_blocks, shift);
2877 sfs.f_bfree = (long)__SHIFT_OR_CLIP(sp->f_bfree, shift);
2878 sfs.f_bavail = (long)__SHIFT_OR_CLIP(sp->f_bavail, shift);
2879 #undef __SHIFT_OR_CLIP
2880 sfs.f_bsize = (long)(sp->f_bsize << shift);
2881 sfs.f_iosize = lmax(sp->f_iosize, sp->f_bsize);
2882 } else {
2883 sfs.f_bsize = (long)sp->f_bsize;
2884 sfs.f_iosize = (long)sp->f_iosize;
2885 sfs.f_blocks = (long)sp->f_blocks;
2886 sfs.f_bfree = (long)sp->f_bfree;
2887 sfs.f_bavail = (long)sp->f_bavail;
2888 }
2889 sfs.f_files = (long)sp->f_files;
2890 sfs.f_ffree = (long)sp->f_ffree;
2891 sfs.f_fsid = sp->f_fsid;
2892 sfs.f_owner = sp->f_owner;
2893
2894 strncpy(sfs.f_fstypename, sp->f_fstypename, MFSNAMELEN-1);
2895 strncpy(sfs.f_mntonname, sp->f_mntonname, MNAMELEN-1);
2896 strncpy(sfs.f_mntfromname, sp->f_mntfromname, MNAMELEN-1);
2897
2898 error = SYSCTL_OUT(req, &sfs, sizeof(sfs));
2899 }
2900 break;
2901 default:
2902 return (ENOTSUP);
2903 }
2904 return (error);
2905 }
2906
2907 static int filt_fsattach(struct knote *kn);
2908 static void filt_fsdetach(struct knote *kn);
2909 static int filt_fsevent(struct knote *kn, long hint);
2910
2911 struct filterops fs_filtops =
2912 { 0, filt_fsattach, filt_fsdetach, filt_fsevent };
2913
2914 static int
2915 filt_fsattach(struct knote *kn)
2916 {
2917
2918 kn->kn_flags |= EV_CLEAR;
2919 KNOTE_ATTACH(&fs_klist, kn);
2920 return (0);
2921 }
2922
2923 static void
2924 filt_fsdetach(struct knote *kn)
2925 {
2926
2927 KNOTE_DETACH(&fs_klist, kn);
2928 }
2929
2930 static int
2931 filt_fsevent(struct knote *kn, long hint)
2932 {
2933
2934 kn->kn_fflags |= hint;
2935 return (kn->kn_fflags != 0);
2936 }
2937
2938 static int
2939 sysctl_vfs_noremotehang SYSCTL_HANDLER_ARGS
2940 {
2941 int out, error;
2942 pid_t pid;
2943 size_t space;
2944 struct proc *p;
2945
2946 /* We need a pid. */
2947 if (req->newptr == USER_ADDR_NULL)
2948 return (EINVAL);
2949
2950 error = SYSCTL_IN(req, &pid, sizeof(pid));
2951 if (error)
2952 return (error);
2953
2954 p = pfind(pid < 0 ? -pid : pid);
2955 if (p == NULL)
2956 return (ESRCH);
2957
2958 /*
2959 * Fetching the value is ok, but we only fetch if the old
2960 * pointer is given.
2961 */
2962 if (req->oldptr != USER_ADDR_NULL) {
2963 out = (p->p_flag & P_NOREMOTEHANG) ? 1 : 0;
2964 error = SYSCTL_OUT(req, &out, sizeof(out));
2965 return (error);
2966 }
2967
2968 /* XXX req->p->p_ucred -> kauth_cred_get() ??? */
2969 /* cansignal offers us enough security. */
2970 if (p != req->p && suser(req->p->p_ucred, &req->p->p_acflag) != 0)
2971 return (EPERM);
2972
2973 if (pid < 0)
2974 p->p_flag &= ~P_NOREMOTEHANG;
2975 else
2976 p->p_flag |= P_NOREMOTEHANG;
2977
2978 return (0);
2979 }
2980 /* the vfs.generic. branch. */
2981 SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RW, 0, "vfs generic hinge");
2982 /* retrieve a list of mounted filesystem fsid_t's */
2983 SYSCTL_PROC(_vfs_generic, OID_AUTO, vfsidlist, CTLFLAG_RD,
2984 0, 0, sysctl_vfs_vfslist, "S,fsid", "List of mounted filesystem ids");
2985 /* perform operations on filesystem via fsid_t */
2986 SYSCTL_NODE(_vfs_generic, OID_AUTO, ctlbyfsid, CTLFLAG_RW,
2987 sysctl_vfs_ctlbyfsid, "ctlbyfsid");
2988 SYSCTL_PROC(_vfs_generic, OID_AUTO, noremotehang, CTLFLAG_RW,
2989 0, 0, sysctl_vfs_noremotehang, "I", "noremotehang");
2990
2991
2992 int num_reusedvnodes=0;
2993
2994 static int
2995 new_vnode(vnode_t *vpp)
2996 {
2997 vnode_t vp;
2998 int retries = 0; /* retry in case the table is full */
2999 int vpid;
3000 struct timespec ts;
3001
3002 retry:
3003 vnode_list_lock();
3004
3005 if ( !TAILQ_EMPTY(&vnode_free_list)) {
3006 /*
3007 * Pick the first vp for possible reuse
3008 */
3009 vp = TAILQ_FIRST(&vnode_free_list);
3010
3011 if (vp->v_lflag & VL_DEAD)
3012 goto steal_this_vp;
3013 } else
3014 vp = NULL;
3015
3016 /*
3017 * we're either empty, or the next guy on the
3018 * list is a valid vnode... if we're under the
3019 * limit, we'll create a new vnode
3020 */
3021 if (numvnodes < desiredvnodes) {
3022 numvnodes++;
3023 vnode_list_unlock();
3024 MALLOC_ZONE(vp, struct vnode *, sizeof *vp, M_VNODE, M_WAITOK);
3025 bzero((char *)vp, sizeof *vp);
3026 VLISTNONE(vp); /* avoid double queue removal */
3027 lck_mtx_init(&vp->v_lock, vnode_lck_grp, vnode_lck_attr);
3028
3029 nanouptime(&ts);
3030 vp->v_id = ts.tv_nsec;
3031 vp->v_flag = VSTANDARD;
3032
3033 goto done;
3034 }
3035 if (vp == NULL) {
3036 /*
3037 * we've reached the system-imposed maximum number of vnodes
3038 * but there isn't a single one available...
3039 * wait a bit and then retry; if we can't get a vnode
3040 * after 100 retries, then log a complaint
3041 */
3042 if (++retries <= 100) {
3043 vnode_list_unlock();
3044 IOSleep(1);
3045 goto retry;
3046 }
3047
3048 vnode_list_unlock();
3049 tablefull("vnode");
3050 log(LOG_EMERG, "%d desired, %d numvnodes, "
3051 "%d free, %d inactive\n",
3052 desiredvnodes, numvnodes, freevnodes, inactivevnodes);
3053 *vpp = 0;
3054 return (ENFILE);
3055 }
3056 steal_this_vp:
3057 vpid = vp->v_id;
3058
3059 VREMFREE("new_vnode", vp);
3060 VLISTNONE(vp);
3061
3062 vnode_list_unlock();
3063 vnode_lock(vp);
3064
3065 /*
3066 * We may have blocked on the vnode_lock after removing the vp from the
3067 * freelist, and the vid is bumped only at the very end of reclaim. So it is
3068 * possible that we are looking at a vnode that is being terminated. If so, skip it.
3069 */
3070 if ((vpid != vp->v_id) || (vp->v_usecount != 0) || (vp->v_iocount != 0) ||
3071 VONLIST(vp) || (vp->v_lflag & VL_TERMINATE)) {
3072 /*
3073 * we lost the race between dropping the list lock
3074 * and picking up the vnode_lock... someone else
3075 * used this vnode and it is now in a new state
3076 * so we need to go back and try again
3077 */
3078 vnode_unlock(vp);
3079 goto retry;
3080 }
3081 if ( (vp->v_lflag & (VL_NEEDINACTIVE | VL_MARKTERM)) == VL_NEEDINACTIVE ) {
3082 /*
3083 * we did a vnode_rele_ext that asked for
3084 * us not to reenter the filesystem during
3085 * the release even though VL_NEEDINACTIVE was
3086 * set... we'll do it here by doing a
3087 * vnode_get/vnode_put
3088 *
3089 * pick up an iocount so that we can call
3090 * vnode_put and drive the VNOP_INACTIVE...
3091 * vnode_put will either leave us off
3092 * the freelist if a new ref comes in,
3093 * or put us back on the end of the freelist
3094 * or recycle us if we were marked for termination...
3095 * so we'll just go grab a new candidate
3096 */
3097 vp->v_iocount++;
3098 #ifdef JOE_DEBUG
3099 record_vp(vp, 1);
3100 #endif
3101 vnode_put_locked(vp);
3102 vnode_unlock(vp);
3103 goto retry;
3104 }
3105 OSAddAtomic(1, &num_reusedvnodes);
3106
3107 /* Check for anyone racing us for recycle */
3108 if (vp->v_type != VBAD) {
3109 if (vp->v_lflag & VL_DEAD)
3110 panic("new_vnode: the vnode is VL_DEAD but not VBAD");
3111
3112 (void)vnode_reclaim_internal(vp, 1, 1);
3113
3114 if ((VONLIST(vp)))
3115 panic("new_vnode: vp on list ");
3116 if (vp->v_usecount || vp->v_iocount || vp->v_kusecount ||
3117 (vp->v_lflag & (VNAMED_UBC | VNAMED_MOUNT | VNAMED_FSHASH)))
3118 panic("new_vnode: free vnode still referenced\n");
3119 if ((vp->v_mntvnodes.tqe_prev != 0) && (vp->v_mntvnodes.tqe_next != 0))
3120 panic("new_vnode: vnode seems to be on mount list ");
3121 if ( !LIST_EMPTY(&vp->v_nclinks) || !LIST_EMPTY(&vp->v_ncchildren))
3122 panic("new_vnode: vnode still hooked into the name cache");
3123 }
3124 if (vp->v_unsafefs) {
3125 lck_mtx_destroy(&vp->v_unsafefs->fsnodelock, vnode_lck_grp);
3126 FREE_ZONE((void *)vp->v_unsafefs, sizeof(struct unsafe_fsnode), M_UNSAFEFS);
3127 vp->v_unsafefs = (struct unsafe_fsnode *)NULL;
3128 }
3129 vp->v_lflag = 0;
3130 vp->v_writecount = 0;
3131 vp->v_references = 0;
3132 vp->v_iterblkflags = 0;
3133 vp->v_flag = VSTANDARD;
3134 /* vbad vnodes can point to dead_mountp */
3135 vp->v_mount = 0;
3136 vp->v_defer_reclaimlist = (vnode_t)0;
3137
3138 vnode_unlock(vp);
3139 done:
3140 *vpp = vp;
3141
3142 return (0);
3143 }
3144
3145 void
3146 vnode_lock(vnode_t vp)
3147 {
3148 lck_mtx_lock(&vp->v_lock);
3149 }
3150
3151 void
3152 vnode_unlock(vnode_t vp)
3153 {
3154 lck_mtx_unlock(&vp->v_lock);
3155 }
3156
3157
3158
3159 int
3160 vnode_get(struct vnode *vp)
3161 {
3162 vnode_lock(vp);
3163
3164 if ( (vp->v_iocount == 0) && (vp->v_lflag & (VL_TERMINATE | VL_DEAD)) ) {
3165 vnode_unlock(vp);
3166 return(ENOENT);
3167 }
3168 vp->v_iocount++;
3169 #ifdef JOE_DEBUG
3170 record_vp(vp, 1);
3171 #endif
3172 vnode_unlock(vp);
3173
3174 return(0);
3175 }
3176
3177 int
3178 vnode_getwithvid(vnode_t vp, int vid)
3179 {
3180 return(vget_internal(vp, vid, ( VNODE_NODEAD| VNODE_WITHID)));
3181 }
3182
3183 int
3184 vnode_getwithref(vnode_t vp)
3185 {
3186 return(vget_internal(vp, 0, 0));
3187 }
3188
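
/*
 * Example (illustrative only): the typical use of vnode_getwithvid() is
 * to revalidate a (vnode, v_id) pair captured earlier, e.g. from a cache,
 * before using the vnode. The helper name is hypothetical.
 */
#if 0	/* sketch, not compiled */
static int
example_use_cached_vnode(vnode_t vp, int cached_vid)
{
	int error;

	if ((error = vnode_getwithvid(vp, cached_vid)))
		return (error);	/* vnode was recycled since we cached it */
	/* ... safe to use vp here, an iocount is held ... */
	vnode_put(vp);
	return (0);
}
#endif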
3189
3190 int
3191 vnode_put(vnode_t vp)
3192 {
3193 int retval;
3194
3195 vnode_lock(vp);
3196 retval = vnode_put_locked(vp);
3197 vnode_unlock(vp);
3198
3199 return(retval);
3200 }
3201
3202 int
3203 vnode_put_locked(vnode_t vp)
3204 {
3205 struct vfs_context context;
3206
3207 retry:
3208 if (vp->v_iocount < 1)
3209 panic("vnode_put(%x): iocount < 1", vp);
3210
3211 if ((vp->v_usecount > 0) || (vp->v_iocount > 1)) {
3212 vnode_dropiocount(vp, 1);
3213 return(0);
3214 }
3215 if ((vp->v_lflag & (VL_MARKTERM | VL_TERMINATE | VL_DEAD | VL_NEEDINACTIVE)) == VL_NEEDINACTIVE) {
3216
3217 vp->v_lflag &= ~VL_NEEDINACTIVE;
3218 vnode_unlock(vp);
3219
3220 context.vc_proc = current_proc();
3221 context.vc_ucred = kauth_cred_get();
3222 VNOP_INACTIVE(vp, &context);
3223
3224 vnode_lock(vp);
3225 /*
3226 * because we had to drop the vnode lock before calling
3227 * VNOP_INACTIVE, the state of this vnode may have changed...
3228 * we may pick up both VL_MARKTERM and either
3229 * an iocount or a usecount while in the VNOP_INACTIVE call
3230 * we don't want to call vnode_reclaim_internal on a vnode
3231 * that has active references on it... so loop back around
3232 * and reevaluate the state
3233 */
3234 goto retry;
3235 }
3236 vp->v_lflag &= ~VL_NEEDINACTIVE;
3237
3238 if ((vp->v_lflag & (VL_MARKTERM | VL_TERMINATE | VL_DEAD)) == VL_MARKTERM)
3239 vnode_reclaim_internal(vp, 1, 0);
3240
3241 vnode_dropiocount(vp, 1);
3242 vnode_list_add(vp);
3243
3244 return(0);
3245 }
3246
3247 /* is vnode_t in use by others? */
3248 int
3249 vnode_isinuse(vnode_t vp, int refcnt)
3250 {
3251 return(vnode_isinuse_locked(vp, refcnt, 0));
3252 }
3253
3254
3255 static int
3256 vnode_isinuse_locked(vnode_t vp, int refcnt, int locked)
3257 {
3258 int retval = 0;
3259
3260 if (!locked)
3261 vnode_lock(vp);
3262 if ((vp->v_type != VREG) && (vp->v_usecount > refcnt)) {
3263 retval = 1;
3264 goto out;
3265 }
3266 if (vp->v_type == VREG) {
3267 retval = ubc_isinuse_locked(vp, refcnt, 1);
3268 }
3269
3270 out:
3271 if (!locked)
3272 vnode_unlock(vp);
3273 return(retval);
3274 }
3275
3276
3277 /* resume vnode_t */
3278 errno_t
3279 vnode_resume(vnode_t vp)
3280 {
3281
3282 vnode_lock(vp);
3283
3284 if (vp->v_owner == current_thread()) {
3285 vp->v_lflag &= ~VL_SUSPENDED;
3286 vp->v_owner = 0;
3287 vnode_unlock(vp);
3288 wakeup(&vp->v_iocount);
3289 } else
3290 vnode_unlock(vp);
3291
3292 return(0);
3293 }
3294
3295 static errno_t
3296 vnode_drain(vnode_t vp)
3297 {
3298
3299 if (vp->v_lflag & VL_DRAIN) {
3300 panic("vnode_drain: recursive drain");
3301 return(ENOENT);
3302 }
3303 vp->v_lflag |= VL_DRAIN;
3304 vp->v_owner = current_thread();
3305
3306 while (vp->v_iocount > 1)
3307 msleep(&vp->v_iocount, &vp->v_lock, PVFS, "vnode_drain", 0);
3308 return(0);
3309 }
3310
3311
3312 /*
3313 * if the number of recent references via vnode_getwithvid or vnode_getwithref
3314 * exceeds this threshold, then 'UN-AGE' the vnode by removing it from
3315 * the LRU list if it's currently on it... once the iocount and usecount both drop
3316 * to 0, it will get put back on the end of the list, effectively making it younger.
3317 * This allows us to keep actively referenced vnodes in the list without having
3318 * to constantly remove and re-add them each time a vnode w/o a usecount is
3319 * referenced, which would cost us taking and dropping a global lock twice.
3320 */
3321 #define UNAGE_THRESHHOLD 10
3322
3323 errno_t
3324 vnode_getiocount(vnode_t vp, int locked, int vid, int vflags)
3325 {
3326 int nodead = vflags & VNODE_NODEAD;
3327 int nosusp = vflags & VNODE_NOSUSPEND;
3328
3329 if (!locked)
3330 vnode_lock(vp);
3331
3332 for (;;) {
3333 /*
3334 * if it is a dead vnode with deadfs
3335 */
3336 if (nodead && (vp->v_lflag & VL_DEAD) && ((vp->v_type == VBAD) || (vp->v_data == 0))) {
3337 if (!locked)
3338 vnode_unlock(vp);
3339 return(ENOENT);
3340 }
3341 /*
3342 * will return VL_DEAD ones
3343 */
3344 if ((vp->v_lflag & (VL_SUSPENDED | VL_DRAIN | VL_TERMINATE)) == 0 ) {
3345 break;
3346 }
3347 /*
3348 * if suspended vnodes are to be failed
3349 */
3350 if (nosusp && (vp->v_lflag & VL_SUSPENDED)) {
3351 if (!locked)
3352 vnode_unlock(vp);
3353 return(ENOENT);
3354 }
3355 /*
3356 * if you are the owner of the drain/suspend/termination, you can acquire the iocount;
3357 * check VL_TERMINATE explicitly, since it does not set the owner
3358 */
3359 if ((vp->v_lflag & (VL_DRAIN | VL_SUSPENDED | VL_TERMINATE)) &&
3360 (vp->v_owner == current_thread())) {
3361 break;
3362 }
3363 if (vp->v_lflag & VL_TERMINATE) {
3364 vp->v_lflag |= VL_TERMWANT;
3365
3366 msleep(&vp->v_lflag, &vp->v_lock, PVFS, "vnode getiocount", 0);
3367 } else
3368 msleep(&vp->v_iocount, &vp->v_lock, PVFS, "vnode_getiocount", 0);
3369 }
3370 if (vid != vp->v_id) {
3371 if (!locked)
3372 vnode_unlock(vp);
3373 return(ENOENT);
3374 }
3375 if (++vp->v_references >= UNAGE_THRESHHOLD) {
3376 vp->v_references = 0;
3377 vnode_list_remove(vp);
3378 }
3379 vp->v_iocount++;
3380 #ifdef JOE_DEBUG
3381 record_vp(vp, 1);
3382 #endif
3383 if (!locked)
3384 vnode_unlock(vp);
3385 return(0);
3386 }
3387
3388 static void
3389 vnode_dropiocount (vnode_t vp, int locked)
3390 {
3391 if (!locked)
3392 vnode_lock(vp);
3393 if (vp->v_iocount < 1)
3394 panic("vnode_dropiocount(%x): v_iocount < 1", vp);
3395
3396 vp->v_iocount--;
3397 #ifdef JOE_DEBUG
3398 record_vp(vp, -1);
3399 #endif
3400 if ((vp->v_lflag & (VL_DRAIN | VL_SUSPENDED)) && (vp->v_iocount <= 1))
3401 wakeup(&vp->v_iocount);
3402
3403 if (!locked)
3404 vnode_unlock(vp);
3405 }
3406
3407
3408 void
3409 vnode_reclaim(struct vnode * vp)
3410 {
3411 vnode_reclaim_internal(vp, 0, 0);
3412 }
3413
3414 __private_extern__
3415 void
3416 vnode_reclaim_internal(struct vnode * vp, int locked, int reuse)
3417 {
3418 int isfifo = 0;
3419
3420 if (!locked)
3421 vnode_lock(vp);
3422
3423 if (vp->v_lflag & VL_TERMINATE) {
3424 panic("vnode reclaim in progress");
3425 }
3426 vp->v_lflag |= VL_TERMINATE;
3427
3428 if (vnode_drain(vp)) {
3429 panic("vnode drain failed");
3430 vnode_unlock(vp);
3431 return;
3432 }
3433 isfifo = (vp->v_type == VFIFO);
3434
3435 if (vp->v_type != VBAD)
3436 vgone(vp); /* clean and reclaim the vnode */
3437
3438 /*
3439 * give the vnode a new identity so
3440 * that vnode_getwithvid will fail
3441 * on any stale cache accesses
3442 */
3443 vp->v_id++;
3444 if (isfifo) {
3445 struct fifoinfo * fip;
3446
3447 fip = vp->v_fifoinfo;
3448 vp->v_fifoinfo = NULL;
3449 FREE(fip, M_TEMP);
3450 }
3451
3452 vp->v_type = VBAD;
3453
3454 if (vp->v_data)
3455 panic("vnode_reclaim_internal: cleaned vnode isn't");
3456 if (vp->v_numoutput)
3457 panic("vnode_reclaim_internal: Clean vnode has pending I/O's");
3458 if (UBCINFOEXISTS(vp))
3459 panic("vnode_reclaim_internal: ubcinfo not cleaned");
3460 if (vp->v_parent)
3461 panic("vnode_reclaim_internal: vparent not removed");
3462 if (vp->v_name)
3463 panic("vnode_reclaim_internal: vname not removed");
3464
3465 vp->v_socket = 0;
3466
3467 vp->v_lflag &= ~VL_TERMINATE;
3468 vp->v_lflag &= ~VL_DRAIN;
3469 vp->v_owner = 0;
3470
3471 if (vp->v_lflag & VL_TERMWANT) {
3472 vp->v_lflag &= ~VL_TERMWANT;
3473 wakeup(&vp->v_lflag);
3474 }
3475 if (!reuse && vp->v_usecount == 0)
3476 vnode_list_add(vp);
3477 if (!locked)
3478 vnode_unlock(vp);
3479 }
3480
3481 /* USAGE:
3482 * The following API creates a vnode, associates all the parameters specified in the
3483 * vnode_fsparam structure, and returns a vnode handle with a reference. Device
3484 * aliasing is handled here, so this obsoletes checkalias.
3485 * vnode_create(int flavor, size_t size, void * param, vnode_t *vp)
3486 */
3487 int
3488 vnode_create(int flavor, size_t size, void *data, vnode_t *vpp)
3489 {
3490 int error;
3491 int insert = 1;
3492 vnode_t vp;
3493 vnode_t nvp;
3494 vnode_t dvp;
3495 struct componentname *cnp;
3496 struct vnode_fsparam *param = (struct vnode_fsparam *)data;
3497
3498 if (flavor == VNCREATE_FLAVOR && (size == VCREATESIZE) && param) {
3499 if ( (error = new_vnode(&vp)) ) {
3500 return(error);
3501 } else {
3502 dvp = param->vnfs_dvp;
3503 cnp = param->vnfs_cnp;
3504
3505 vp->v_op = param->vnfs_vops;
3506 vp->v_type = param->vnfs_vtype;
3507 vp->v_data = param->vnfs_fsnode;
3508 vp->v_iocount = 1;
3509
3510 if (param->vnfs_markroot)
3511 vp->v_flag |= VROOT;
3512 if (param->vnfs_marksystem)
3513 vp->v_flag |= VSYSTEM;
3514 else if (vp->v_type == VREG) {
3515 /*
3516 * only non SYSTEM vp
3517 */
3518 error = ubc_info_init_withsize(vp, param->vnfs_filesize);
3519 if (error) {
3520 #ifdef JOE_DEBUG
3521 record_vp(vp, 1);
3522 #endif
3523 vp->v_mount = 0;
3524 vp->v_op = dead_vnodeop_p;
3525 vp->v_tag = VT_NON;
3526 vp->v_data = NULL;
3527 vp->v_type = VBAD;
3528 vp->v_lflag |= VL_DEAD;
3529
3530 vnode_put(vp);
3531 return(error);
3532 }
3533 }
3534 #ifdef JOE_DEBUG
3535 record_vp(vp, 1);
3536 #endif
3537 if (vp->v_type == VCHR || vp->v_type == VBLK) {
3538
3539 if ( (nvp = checkalias(vp, param->vnfs_rdev)) ) {
3540 /*
3541 * if checkalias returns a vnode, it will be locked
3542 *
3543 * first get rid of the unneeded vnode we acquired
3544 */
3545 vp->v_data = NULL;
3546 vp->v_op = spec_vnodeop_p;
3547 vp->v_type = VBAD;
3548 vp->v_lflag = VL_DEAD;
3549 vp->v_data = NULL;
3550 vp->v_tag = VT_NON;
3551 vnode_put(vp);
3552
3553 /*
3554 * switch to aliased vnode and finish
3555 * preparing it
3556 */
3557 vp = nvp;
3558
3559 vclean(vp, 0, current_proc());
3560 vp->v_op = param->vnfs_vops;
3561 vp->v_type = param->vnfs_vtype;
3562 vp->v_data = param->vnfs_fsnode;
3563 vp->v_lflag = 0;
3564 vp->v_mount = NULL;
3565 insmntque(vp, param->vnfs_mp);
3566 insert = 0;
3567 vnode_unlock(vp);
3568 }
3569 }
3570
3571 if (vp->v_type == VFIFO) {
3572 struct fifoinfo *fip;
3573
3574 MALLOC(fip, struct fifoinfo *,
3575 sizeof(*fip), M_TEMP, M_WAITOK);
3576 bzero(fip, sizeof(struct fifoinfo ));
3577 vp->v_fifoinfo = fip;
3578 }
3579 /* File systems usually pass the address of the location where
3580 * they store the vnode pointer. Once we add the vnode to the mount
3581 * point and the name cache it becomes discoverable, so the file system
3582 * node will have its connection to the vnode set up by then.
3583 */
3584 *vpp = vp;
3585
3586 if (param->vnfs_mp) {
3587 if (param->vnfs_mp->mnt_kern_flag & MNTK_LOCK_LOCAL)
3588 vp->v_flag |= VLOCKLOCAL;
3589 if (insert) {
3590 /*
3591 * enter in mount vnode list
3592 */
3593 insmntque(vp, param->vnfs_mp);
3594 }
3595 #ifdef INTERIM_FSNODE_LOCK
3596 if (param->vnfs_mp->mnt_vtable->vfc_threadsafe == 0) {
3597 MALLOC_ZONE(vp->v_unsafefs, struct unsafe_fsnode *,
3598 sizeof(struct unsafe_fsnode), M_UNSAFEFS, M_WAITOK);
3599 vp->v_unsafefs->fsnode_count = 0;
3600 vp->v_unsafefs->fsnodeowner = (void *)NULL;
3601 lck_mtx_init(&vp->v_unsafefs->fsnodelock, vnode_lck_grp, vnode_lck_attr);
3602 }
3603 #endif /* INTERIM_FSNODE_LOCK */
3604 }
3605 if (dvp && vnode_ref(dvp) == 0) {
3606 vp->v_parent = dvp;
3607 }
3608 if (cnp) {
3609 if (dvp && ((param->vnfs_flags & (VNFS_NOCACHE | VNFS_CANTCACHE)) == 0)) {
3610 /*
3611 * enter into name cache
3612 * we've got the info to enter it into the name cache now
3613 */
3614 cache_enter(dvp, vp, cnp);
3615 }
3616 vp->v_name = vfs_addname(cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_hash, 0);
3617 }
3618 if ((param->vnfs_flags & VNFS_CANTCACHE) == 0) {
3619 /*
3620 * this vnode is being created as cacheable in the name cache
3621 * this allows us to re-enter it in the cache
3622 */
3623 vp->v_flag |= VNCACHEABLE;
3624 }
3625 if ((vp->v_flag & VSYSTEM) && (vp->v_type != VREG))
3626 panic("incorrect vnode setup");
3627
3628 return(0);
3629 }
3630 }
3631 return (EINVAL);
3632 }
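
/*
 * Example (illustrative only): how a filesystem might call vnode_create()
 * for a regular file. Field values and the vop vector name
 * (example_vnodeop_p) are hypothetical; only parameters referenced by the
 * function above are shown.
 */
#if 0	/* sketch, not compiled */
static int
example_make_vnode(mount_t mp, vnode_t dvp, struct componentname *cnp,
    void *fsnode, off_t filesize, vnode_t *vpp)
{
	struct vnode_fsparam param;

	bzero(&param, sizeof(param));
	param.vnfs_mp = mp;
	param.vnfs_vtype = VREG;
	param.vnfs_fsnode = fsnode;
	param.vnfs_vops = example_vnodeop_p;	/* hypothetical vop vector */
	param.vnfs_dvp = dvp;
	param.vnfs_cnp = cnp;
	param.vnfs_filesize = filesize;
	param.vnfs_flags = 0;			/* cacheable in the name cache */

	return (vnode_create(VNCREATE_FLAVOR, VCREATESIZE, &param, vpp));
}
#endif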
3633
3634 int
3635 vnode_addfsref(vnode_t vp)
3636 {
3637 vnode_lock(vp);
3638 if (vp->v_lflag & VNAMED_FSHASH)
3639 panic("add_fsref: vp already has named reference");
3640 if ((vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb))
3641 panic("addfsref: vp on the free list\n");
3642 vp->v_lflag |= VNAMED_FSHASH;
3643 vnode_unlock(vp);
3644 return(0);
3645
3646 }
3647 int
3648 vnode_removefsref(vnode_t vp)
3649 {
3650 vnode_lock(vp);
3651 if ((vp->v_lflag & VNAMED_FSHASH) == 0)
3652 panic("remove_fsref: no named reference");
3653 vp->v_lflag &= ~VNAMED_FSHASH;
3654 vnode_unlock(vp);
3655 return(0);
3656
3657 }
3658
3659
3660 int
3661 vfs_iterate(__unused int flags, int (*callout)(mount_t, void *), void *arg)
3662 {
3663 mount_t mp;
3664 int ret = 0;
3665 fsid_t * fsid_list;
3666 int count, actualcount, i;
3667 void * allocmem;
3668
3669 count = mount_getvfscnt();
3670 count += 10;
3671
3672 fsid_list = (fsid_t *)kalloc(count * sizeof(fsid_t));
3673 allocmem = (void *)fsid_list;
3674
3675 actualcount = mount_fillfsids(fsid_list, count);
3676
3677 for (i=0; i< actualcount; i++) {
3678
3679 /* obtain the mount point with iteration reference */
3680 mp = mount_list_lookupby_fsid(&fsid_list[i], 0, 1);
3681
3682 if(mp == (struct mount *)0)
3683 continue;
3684 mount_lock(mp);
3685 if (mp->mnt_lflag & (MNT_LDEAD | MNT_LUNMOUNT)) {
3686 mount_unlock(mp);
3687 mount_iterdrop(mp);
3688 continue;
3689
3690 }
3691 mount_unlock(mp);
3692
3693 /* iterate over all the vnodes */
3694 ret = callout(mp, arg);
3695
3696 mount_iterdrop(mp);
3697
3698 switch (ret) {
3699 case VFS_RETURNED:
3700 case VFS_RETURNED_DONE:
3701 if (ret == VFS_RETURNED_DONE) {
3702 ret = 0;
3703 goto out;
3704 }
3705 break;
3706
3707 case VFS_CLAIMED_DONE:
3708 ret = 0;
3709 goto out;
3710 case VFS_CLAIMED:
3711 default:
3712 break;
3713 }
3714 ret = 0;
3715 }
3716
3717 out:
3718 kfree(allocmem, (count * sizeof(fsid_t)));
3719 return (ret);
3720 }
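
/*
 * Example (illustrative only): a vfs_iterate() callout that counts local
 * filesystems. The callout returns VFS_RETURNED so iteration continues
 * over the remaining mounts; the names are hypothetical.
 */
#if 0	/* sketch, not compiled */
static int
example_count_local_callout(mount_t mp, void *arg)
{
	int *countp = (int *)arg;

	if (mp->mnt_flag & MNT_LOCAL)
		(*countp)++;
	return (VFS_RETURNED);
}

static int
example_count_local_mounts(void)
{
	int count = 0;

	(void) vfs_iterate(0, example_count_local_callout, &count);
	return (count);
}
#endif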
3721
3722 /*
3723 * Update the vfsstatfs structure in the mountpoint.
3724 */
3725 int
3726 vfs_update_vfsstat(mount_t mp, vfs_context_t ctx)
3727 {
3728 struct vfs_attr va;
3729 int error;
3730
3731 /*
3732 * Request the attributes we want to propagate into
3733 * the per-mount vfsstat structure.
3734 */
3735 VFSATTR_INIT(&va);
3736 VFSATTR_WANTED(&va, f_iosize);
3737 VFSATTR_WANTED(&va, f_blocks);
3738 VFSATTR_WANTED(&va, f_bfree);
3739 VFSATTR_WANTED(&va, f_bavail);
3740 VFSATTR_WANTED(&va, f_bused);
3741 VFSATTR_WANTED(&va, f_files);
3742 VFSATTR_WANTED(&va, f_ffree);
3743 VFSATTR_WANTED(&va, f_bsize);
3744 VFSATTR_WANTED(&va, f_fssubtype);
3745 if ((error = vfs_getattr(mp, &va, ctx)) != 0) {
3746 KAUTH_DEBUG("STAT - filesystem returned error %d", error);
3747 return(error);
3748 }
3749
3750 /*
3751 * Unpack into the per-mount structure.
3752 *
3753 * We only overwrite these fields, which are likely to change:
3754 * f_blocks
3755 * f_bfree
3756 * f_bavail
3757 * f_bused
3758 * f_files
3759 * f_ffree
3760 *
3761 * And these which are not, but which the FS has no other way
3762 * of providing to us:
3763 * f_bsize
3764 * f_iosize
3765 * f_fssubtype
3766 *
3767 */
3768 if (VFSATTR_IS_SUPPORTED(&va, f_bsize)) {
3769 mp->mnt_vfsstat.f_bsize = va.f_bsize;
3770 } else {
3771 mp->mnt_vfsstat.f_bsize = mp->mnt_devblocksize; /* default from the device block size */
3772 }
3773 if (VFSATTR_IS_SUPPORTED(&va, f_iosize)) {
3774 mp->mnt_vfsstat.f_iosize = va.f_iosize;
3775 } else {
3776 mp->mnt_vfsstat.f_iosize = 1024 * 1024; /* 1MB sensible I/O size */
3777 }
3778 if (VFSATTR_IS_SUPPORTED(&va, f_blocks))
3779 mp->mnt_vfsstat.f_blocks = va.f_blocks;
3780 if (VFSATTR_IS_SUPPORTED(&va, f_bfree))
3781 mp->mnt_vfsstat.f_bfree = va.f_bfree;
3782 if (VFSATTR_IS_SUPPORTED(&va, f_bavail))
3783 mp->mnt_vfsstat.f_bavail = va.f_bavail;
3784 if (VFSATTR_IS_SUPPORTED(&va, f_bused))
3785 mp->mnt_vfsstat.f_bused = va.f_bused;
3786 if (VFSATTR_IS_SUPPORTED(&va, f_files))
3787 mp->mnt_vfsstat.f_files = va.f_files;
3788 if (VFSATTR_IS_SUPPORTED(&va, f_ffree))
3789 mp->mnt_vfsstat.f_ffree = va.f_ffree;
3790
3791 /* this is unlikely to change, but has to be queried for */
3792 if (VFSATTR_IS_SUPPORTED(&va, f_fssubtype))
3793 mp->mnt_vfsstat.f_fssubtype = va.f_fssubtype;
3794
3795 return(0);
3796 }
3797
3798 void
3799 mount_list_add(mount_t mp)
3800 {
3801 mount_list_lock();
3802 TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list);
3803 nummounts++;
3804 mount_list_unlock();
3805 }
3806
3807 void
3808 mount_list_remove(mount_t mp)
3809 {
3810 mount_list_lock();
3811 TAILQ_REMOVE(&mountlist, mp, mnt_list);
3812 nummounts--;
3813 mp->mnt_list.tqe_next = 0;
3814 mp->mnt_list.tqe_prev = 0;
3815 mount_list_unlock();
3816 }
3817
3818 mount_t
3819 mount_lookupby_volfsid(int volfs_id, int withref)
3820 {
3821 mount_t cur_mount = (mount_t)0;
3822 mount_t mp ;
3823
3824 mount_list_lock();
3825 TAILQ_FOREACH(mp, &mountlist, mnt_list) {
3826 if (validfsnode(mp) && mp->mnt_vfsstat.f_fsid.val[0] == volfs_id) {
3827 cur_mount = mp;
3828 if (withref) {
3829 if (mount_iterref(cur_mount, 1)) {
3830 cur_mount = (mount_t)0;
3831 mount_list_unlock();
3832 goto out;
3833 }
3834 }
3835 break;
3836 }
3837 }
3838 mount_list_unlock();
3839 if (withref && (cur_mount != (mount_t)0)) {
3840 mp = cur_mount;
3841 if (vfs_busy(mp, LK_NOWAIT) != 0) {
3842 cur_mount = (mount_t)0;
3843 }
3844 mount_iterdrop(mp);
3845 }
3846 out:
3847 return(cur_mount);
3848 }
3849
3850
3851 mount_t
3852 mount_list_lookupby_fsid(fsid_t *fsid, int locked, int withref)
3856 {
3857 mount_t retmp = (mount_t)0;
3858 mount_t mp;
3859
3860 if (!locked)
3861 mount_list_lock();
3862 TAILQ_FOREACH(mp, &mountlist, mnt_list)
3863 if (mp->mnt_vfsstat.f_fsid.val[0] == fsid->val[0] &&
3864 mp->mnt_vfsstat.f_fsid.val[1] == fsid->val[1]) {
3865 retmp = mp;
3866 if (withref) {
3867 if (mount_iterref(retmp, 1))
3868 retmp = (mount_t)0;
3869 }
3870 goto out;
3871 }
3872 out:
3873 if (!locked)
3874 mount_list_unlock();
3875 return (retmp);
3876 }
3877
3878 errno_t
3879 vnode_lookup(const char *path, int flags, vnode_t *vpp, vfs_context_t context)
3880 {
3881 struct nameidata nd;
3882 int error;
3883 struct vfs_context context2;
3884 vfs_context_t ctx = context;
3885 u_long ndflags = 0;
3886
3887 if (context == NULL) { /* XXX technically an error */
3888 context2.vc_proc = current_proc();
3889 context2.vc_ucred = kauth_cred_get();
3890 ctx = &context2;
3891 }
3892
3893 if (flags & VNODE_LOOKUP_NOFOLLOW)
3894 ndflags = NOFOLLOW;
3895 else
3896 ndflags = FOLLOW;
3897
3898 if (flags & VNODE_LOOKUP_NOCROSSMOUNT)
3899 ndflags |= NOCROSSMOUNT;
3900 if (flags & VNODE_LOOKUP_DOWHITEOUT)
3901 ndflags |= DOWHITEOUT;
3902
3903 /* XXX AUDITVNPATH1 needed ? */
3904 NDINIT(&nd, LOOKUP, ndflags, UIO_SYSSPACE, CAST_USER_ADDR_T(path), ctx);
3905
3906 if ((error = namei(&nd)))
3907 return (error);
3908 *vpp = nd.ni_vp;
3909 nameidone(&nd);
3910
3911 return (0);
3912 }
3913
3914 errno_t
3915 vnode_open(const char *path, int fmode, int cmode, int flags, vnode_t *vpp, vfs_context_t context)
3916 {
3917 struct nameidata nd;
3918 int error;
3919 struct vfs_context context2;
3920 vfs_context_t ctx = context;
3921 u_long ndflags = 0;
3922 int lflags = flags;
3923
3924 if (context == NULL) { /* XXX technically an error */
3925 context2.vc_proc = current_proc();
3926 context2.vc_ucred = kauth_cred_get();
3927 ctx = &context2;
3928 }
3929
3930 if (fmode & O_NOFOLLOW)
3931 lflags |= VNODE_LOOKUP_NOFOLLOW;
3932
3933 if (lflags & VNODE_LOOKUP_NOFOLLOW)
3934 ndflags = NOFOLLOW;
3935 else
3936 ndflags = FOLLOW;
3937
3938 if (lflags & VNODE_LOOKUP_NOCROSSMOUNT)
3939 ndflags |= NOCROSSMOUNT;
3940 if (lflags & VNODE_LOOKUP_DOWHITEOUT)
3941 ndflags |= DOWHITEOUT;
3942
3943 /* XXX AUDITVNPATH1 needed ? */
3944 NDINIT(&nd, LOOKUP, ndflags, UIO_SYSSPACE, CAST_USER_ADDR_T(path), ctx);
3945
3946 if ((error = vn_open(&nd, fmode, cmode)))
3947 *vpp = NULL;
3948 else
3949 *vpp = nd.ni_vp;
3950
3951 return (error);
3952 }
3953
3954 errno_t
3955 vnode_close(vnode_t vp, int flags, vfs_context_t context)
3956 {
3957 kauth_cred_t cred;
3958 struct proc *p;
3959 int error;
3960
3961 if (context) {
3962 p = context->vc_proc;
3963 cred = context->vc_ucred;
3964 } else {
3965 p = current_proc();
3966 cred = kauth_cred_get();
3967 }
3968
3969 error = vn_close(vp, flags, cred, p);
3970 vnode_put(vp);
3971 return (error);
3972 }
3973
3974 errno_t
3975 vnode_size(vnode_t vp, off_t *sizep, vfs_context_t ctx)
3976 {
3977 struct vnode_attr va;
3978 int error;
3979
3980 VATTR_INIT(&va);
3981 VATTR_WANTED(&va, va_data_size);
3982 error = vnode_getattr(vp, &va, ctx);
3983 if (!error)
3984 *sizep = va.va_data_size;
3985 return(error);
3986 }
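
/*
 * Example (illustrative only): open a file by path, fetch its size with
 * vnode_size(), and close it. The helper name is hypothetical; note that
 * vnode_close() also drops the iocount that vnode_open() returned with.
 */
#if 0	/* sketch, not compiled */
static int
example_size_of(const char *path, off_t *sizep, vfs_context_t ctx)
{
	vnode_t vp;
	int error;

	if ((error = vnode_open(path, FREAD, 0, 0, &vp, ctx)))
		return (error);
	error = vnode_size(vp, sizep, ctx);
	(void) vnode_close(vp, FREAD, ctx);
	return (error);
}
#endif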
3987
3988 errno_t
3989 vnode_setsize(vnode_t vp, off_t size, int ioflag, vfs_context_t ctx)
3990 {
3991 struct vnode_attr va;
3992
3993 VATTR_INIT(&va);
3994 VATTR_SET(&va, va_data_size, size);
3995 va.va_vaflags = ioflag & 0xffff;
3996 return(vnode_setattr(vp, &va, ctx));
3997 }
3998
3999 errno_t
4000 vn_create(vnode_t dvp, vnode_t *vpp, struct componentname *cnp, struct vnode_attr *vap, int flags, vfs_context_t ctx)
4001 {
4002 kauth_acl_t oacl, nacl;
4003 int initial_acl;
4004 errno_t error;
4005 vnode_t vp = (vnode_t)0;
4006
4007 error = 0;
4008 oacl = nacl = NULL;
4009 initial_acl = 0;
4010
4011 KAUTH_DEBUG("%p CREATE - '%s'", dvp, cnp->cn_nameptr);
4012
4013 /*
4014 * Handle ACL inheritance.
4015 */
4016 if (!(flags & VN_CREATE_NOINHERIT) && vfs_extendedsecurity(dvp->v_mount)) {
4017 /* save the original filesec */
4018 if (VATTR_IS_ACTIVE(vap, va_acl)) {
4019 initial_acl = 1;
4020 oacl = vap->va_acl;
4021 }
4022
4023 vap->va_acl = NULL;
4024 if ((error = kauth_acl_inherit(dvp,
4025 oacl,
4026 &nacl,
4027 vap->va_type == VDIR,
4028 ctx)) != 0) {
4029 KAUTH_DEBUG("%p CREATE - error %d processing inheritance", dvp, error);
4030 return(error);
4031 }
4032
4033 /*
4034 * If the generated ACL is NULL, then we can save ourselves some effort
4035 * by clearing the active bit.
4036 */
4037 if (nacl == NULL) {
4038 VATTR_CLEAR_ACTIVE(vap, va_acl);
4039 } else {
4040 VATTR_SET(vap, va_acl, nacl);
4041 }
4042 }
4043
4044 /*
4045 * Check and default new attributes.
4046 * This will set va_uid, va_gid, va_mode and va_create_time at least, if the caller
4047 * hasn't supplied them.
4048 */
4049 if ((error = vnode_authattr_new(dvp, vap, flags & VN_CREATE_NOAUTH, ctx)) != 0) {
4050 KAUTH_DEBUG("%p CREATE - error %d handing/defaulting attributes", dvp, error);
4051 goto out;
4052 }
4053
4054
4055 /*
4056 * Create the requested node.
4057 */
4058 switch(vap->va_type) {
4059 case VREG:
4060 error = VNOP_CREATE(dvp, vpp, cnp, vap, ctx);
4061 break;
4062 case VDIR:
4063 error = VNOP_MKDIR(dvp, vpp, cnp, vap, ctx);
4064 break;
4065 case VSOCK:
4066 case VFIFO:
4067 case VBLK:
4068 case VCHR:
4069 error = VNOP_MKNOD(dvp, vpp, cnp, vap, ctx);
4070 break;
4071 default:
4072 panic("vnode_create: unknown vtype %d", vap->va_type);
4073 }
4074 if (error != 0) {
4075 KAUTH_DEBUG("%p CREATE - error %d returned by filesystem", dvp, error);
4076 goto out;
4077 }
4078
4079 vp = *vpp;
4080 /*
4081 * If some of the requested attributes weren't handled by the VNOP,
4082 * use our fallback code.
4083 */
4084 if (!VATTR_ALL_SUPPORTED(vap) && *vpp) {
4085 KAUTH_DEBUG(" CREATE - doing fallback with ACL %p", vap->va_acl);
4086 error = vnode_setattr_fallback(*vpp, vap, ctx);
4087 }
4088 if ((error != 0 ) && (vp != (vnode_t)0)) {
4089 *vpp = (vnode_t) 0;
4090 vnode_put(vp);
4091 }
4092
4093 out:
4094 /*
4095 * If the caller supplied a filesec in vap, it has been replaced
4096 * now by the post-inheritance copy. We need to put the original back
4097 * and free the inherited product.
4098 */
4099 if (initial_acl) {
4100 VATTR_SET(vap, va_acl, oacl);
4101 } else {
4102 VATTR_CLEAR_ACTIVE(vap, va_acl);
4103 }
4104 if (nacl != NULL)
4105 kauth_acl_free(nacl);
4106
4107 return(error);
4108 }
4109
4110 static kauth_scope_t vnode_scope;
4111 static int vnode_authorize_callback(kauth_cred_t credential, __unused void *idata, kauth_action_t action,
4112 uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3);
4113
4114 typedef struct _vnode_authorize_context {
4115 vnode_t vp;
4116 struct vnode_attr *vap;
4117 vnode_t dvp;
4118 struct vnode_attr *dvap;
4119 vfs_context_t ctx;
4120 int flags;
4121 int flags_valid;
4122 #define _VAC_IS_OWNER (1<<0)
4123 #define _VAC_IN_GROUP (1<<1)
4124 #define _VAC_IS_DIR_OWNER (1<<2)
4125 #define _VAC_IN_DIR_GROUP (1<<3)
4126 } *vauth_ctx;
4127
4128 void
4129 vnode_authorize_init(void)
4130 {
4131 vnode_scope = kauth_register_scope(KAUTH_SCOPE_VNODE, vnode_authorize_callback, NULL);
4132 }
4133
4134 /*
4135 * Authorize an operation on a vnode.
4136 *
4137 * This is KPI, but here because it needs vnode_scope.
4138 */
4139 int
4140 vnode_authorize(vnode_t vp, vnode_t dvp, kauth_action_t action, vfs_context_t context)
4141 {
4142 int error, result;
4143
4144 /*
4145 * We can't authorize against a dead vnode; allow all operations through so that
4146 * the correct error can be returned.
4147 */
4148 if (vp->v_type == VBAD)
4149 return(0);
4150
4151 error = 0;
4152 result = kauth_authorize_action(vnode_scope, vfs_context_ucred(context), action,
4153 (uintptr_t)context, (uintptr_t)vp, (uintptr_t)dvp, (uintptr_t)&error);
4154 if (result == EPERM) /* traditional behaviour */
4155 result = EACCES;
4156 /* did the lower layers give a better error return? */
4157 if ((result != 0) && (error != 0))
4158 return(error);
4159 return(result);
4160 }
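
/*
 * Example (illustrative only): a caller checking for permission to read
 * file data before doing I/O; KAUTH_VNODE_READ_DATA is the action bit
 * for reading the data fork. The helper name is hypothetical.
 */
#if 0	/* sketch, not compiled */
static int
example_can_read(vnode_t vp, vfs_context_t ctx)
{
	return (vnode_authorize(vp, NULL, KAUTH_VNODE_READ_DATA, ctx));
}
#endif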
4161
4162 /*
4163 * Test for vnode immutability.
4164 *
4165 * The 'append' flag is set when the authorization request is constrained
4166 * to operations which only request the right to append to a file.
4167 *
4168 * The 'ignore' flag is set when an operation modifying the immutability flags
4169 * is being authorized. We check the system securelevel to determine which
4170 * immutability flags we can ignore.
4171 */
4172 static int
4173 vnode_immutable(struct vnode_attr *vap, int append, int ignore)
4174 {
4175 int mask;
4176
4177 /* start with all bits precluding the operation */
4178 mask = IMMUTABLE | APPEND;
4179
4180 /* if appending only, remove the append-only bits */
4181 if (append)
4182 mask &= ~APPEND;
4183
4184 /* ignore only set when authorizing flags changes */
4185 if (ignore) {
4186 if (securelevel <= 0) {
4187 /* in insecure state, flags do not inhibit changes */
4188 mask = 0;
4189 } else {
4190 /* in secure state, user flags don't inhibit */
4191 mask &= ~(UF_IMMUTABLE | UF_APPEND);
4192 }
4193 }
4194 KAUTH_DEBUG("IMMUTABLE - file flags 0x%x mask 0x%x append = %d ignore = %d", vap->va_flags, mask, append, ignore);
4195 if ((vap->va_flags & mask) != 0)
4196 return(EPERM);
4197 return(0);
4198 }
4199
4200 static int
4201 vauth_node_owner(struct vnode_attr *vap, kauth_cred_t cred)
4202 {
4203 int result;
4204
4205 /* default assumption is not-owner */
4206 result = 0;
4207
4208 /*
4209 * If the filesystem has given us a UID, we treat this as authoritative.
4210 */
4211 if (vap && VATTR_IS_SUPPORTED(vap, va_uid)) {
4212 result = (vap->va_uid == kauth_cred_getuid(cred)) ? 1 : 0;
4213 }
4214 /* we could test the owner UUID here if we had a policy for it */
4215
4216 return(result);
4217 }
4218
4219 static int
4220 vauth_node_group(struct vnode_attr *vap, kauth_cred_t cred, int *ismember)
4221 {
4222 int error;
4223 int result;
4224
4225 error = 0;
4226 result = 0;
4227
4228 /* the caller is expected to have asked the filesystem for a group at some point */
4229 if (vap && VATTR_IS_SUPPORTED(vap, va_gid)) {
4230 error = kauth_cred_ismember_gid(cred, vap->va_gid, &result);
4231 }
4232 /* we could test the group UUID here if we had a policy for it */
4233
4234 if (!error)
4235 *ismember = result;
4236 return(error);
4237 }
4238
4239 static int
4240 vauth_file_owner(vauth_ctx vcp)
4241 {
4242 int result;
4243
4244 if (vcp->flags_valid & _VAC_IS_OWNER) {
4245 result = (vcp->flags & _VAC_IS_OWNER) ? 1 : 0;
4246 } else {
4247 result = vauth_node_owner(vcp->vap, vcp->ctx->vc_ucred);
4248
4249 /* cache our result */
4250 vcp->flags_valid |= _VAC_IS_OWNER;
4251 if (result) {
4252 vcp->flags |= _VAC_IS_OWNER;
4253 } else {
4254 vcp->flags &= ~_VAC_IS_OWNER;
4255 }
4256 }
4257 return(result);
4258 }
4259
4260 static int
4261 vauth_file_ingroup(vauth_ctx vcp, int *ismember)
4262 {
4263 int error;
4264
4265 if (vcp->flags_valid & _VAC_IN_GROUP) {
4266 *ismember = (vcp->flags & _VAC_IN_GROUP) ? 1 : 0;
4267 error = 0;
4268 } else {
4269 error = vauth_node_group(vcp->vap, vcp->ctx->vc_ucred, ismember);
4270
4271 if (!error) {
4272 /* cache our result */
4273 vcp->flags_valid |= _VAC_IN_GROUP;
4274 if (*ismember) {
4275 vcp->flags |= _VAC_IN_GROUP;
4276 } else {
4277 vcp->flags &= ~_VAC_IN_GROUP;
4278 }
4279 }
4280
4281 }
4282 return(error);
4283 }
4284
4285 static int
4286 vauth_dir_owner(vauth_ctx vcp)
4287 {
4288 int result;
4289
4290 if (vcp->flags_valid & _VAC_IS_DIR_OWNER) {
4291 result = (vcp->flags & _VAC_IS_DIR_OWNER) ? 1 : 0;
4292 } else {
4293 result = vauth_node_owner(vcp->dvap, vcp->ctx->vc_ucred);
4294
4295 /* cache our result */
4296 vcp->flags_valid |= _VAC_IS_DIR_OWNER;
4297 if (result) {
4298 vcp->flags |= _VAC_IS_DIR_OWNER;
4299 } else {
4300 vcp->flags &= ~_VAC_IS_DIR_OWNER;
4301 }
4302 }
4303 return(result);
4304 }
4305
4306 static int
4307 vauth_dir_ingroup(vauth_ctx vcp, int *ismember)
4308 {
4309 int error;
4310
4311 if (vcp->flags_valid & _VAC_IN_DIR_GROUP) {
4312 *ismember = (vcp->flags & _VAC_IN_DIR_GROUP) ? 1 : 0;
4313 error = 0;
4314 } else {
4315 error = vauth_node_group(vcp->dvap, vcp->ctx->vc_ucred, ismember);
4316
4317 if (!error) {
4318 /* cache our result */
4319 vcp->flags_valid |= _VAC_IN_DIR_GROUP;
4320 if (*ismember) {
4321 vcp->flags |= _VAC_IN_DIR_GROUP;
4322 } else {
4323 vcp->flags &= ~_VAC_IN_DIR_GROUP;
4324 }
4325 }
4326 }
4327 return(error);
4328 }
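/*
 * Note on the caching pattern used by the four wrappers above
 * (illustrative): one bit in vcp->flags_valid records that an answer has
 * been computed, and the matching bit in vcp->flags records the answer,
 * so repeated owner/group tests during a single authorization cost at
 * most one membership lookup each.
 */
#if 0	/* sketch only; 'have_answer' and 'answer' are assumed locals */
	int have_answer = (vcp->flags_valid & _VAC_IN_GROUP) != 0;
	int answer = (vcp->flags & _VAC_IN_GROUP) != 0;
#endif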
4329
4330 /*
4331 * Test the posix permissions in (vap) to determine whether (credential)
4332 * may perform (action)
4333 */
4334 static int
4335 vnode_authorize_posix(vauth_ctx vcp, int action, int on_dir)
4336 {
4337 struct vnode_attr *vap;
4338 int needed, error, owner_ok, group_ok, world_ok, ismember;
4339 #ifdef KAUTH_DEBUG_ENABLE
4340 	const char *where = "uninitialized";
4341 # define _SETWHERE(c) where = c;
4342 #else
4343 # define _SETWHERE(c)
4344 #endif
4345
4346 /* checking file or directory? */
4347 if (on_dir) {
4348 vap = vcp->dvap;
4349 } else {
4350 vap = vcp->vap;
4351 }
4352
4353 error = 0;
4354
4355 /*
4356 * We want to do as little work here as possible. So first we check
4357 * which sets of permissions grant us the access we need, and avoid checking
4358 * whether specific permissions grant access when more generic ones would.
4359 */
4360
4361 /* owner permissions */
4362 needed = 0;
4363 if (action & VREAD)
4364 needed |= S_IRUSR;
4365 if (action & VWRITE)
4366 needed |= S_IWUSR;
4367 if (action & VEXEC)
4368 needed |= S_IXUSR;
4369 owner_ok = (needed & vap->va_mode) == needed;
4370
4371 /* group permissions */
4372 needed = 0;
4373 if (action & VREAD)
4374 needed |= S_IRGRP;
4375 if (action & VWRITE)
4376 needed |= S_IWGRP;
4377 if (action & VEXEC)
4378 needed |= S_IXGRP;
4379 group_ok = (needed & vap->va_mode) == needed;
4380
4381 /* world permissions */
4382 needed = 0;
4383 if (action & VREAD)
4384 needed |= S_IROTH;
4385 if (action & VWRITE)
4386 needed |= S_IWOTH;
4387 if (action & VEXEC)
4388 needed |= S_IXOTH;
4389 world_ok = (needed & vap->va_mode) == needed;
4390
4391 /* If granted/denied by all three, we're done */
4392 if (owner_ok && group_ok && world_ok) {
4393 _SETWHERE("all");
4394 goto out;
4395 }
4396 if (!owner_ok && !group_ok && !world_ok) {
4397 _SETWHERE("all");
4398 error = EACCES;
4399 goto out;
4400 }
4401
4402 /* Check ownership (relatively cheap) */
4403 if ((on_dir && vauth_dir_owner(vcp)) ||
4404 (!on_dir && vauth_file_owner(vcp))) {
4405 _SETWHERE("user");
4406 if (!owner_ok)
4407 error = EACCES;
4408 goto out;
4409 }
4410
4411 /* Not owner; if group and world both grant it we're done */
4412 if (group_ok && world_ok) {
4413 _SETWHERE("group/world");
4414 goto out;
4415 }
4416 if (!group_ok && !world_ok) {
4417 _SETWHERE("group/world");
4418 error = EACCES;
4419 goto out;
4420 }
4421
4422 /* Check group membership (most expensive) */
4423 ismember = 0;
4424 if (on_dir) {
4425 error = vauth_dir_ingroup(vcp, &ismember);
4426 } else {
4427 error = vauth_file_ingroup(vcp, &ismember);
4428 }
4429 if (error)
4430 goto out;
4431 if (ismember) {
4432 _SETWHERE("group");
4433 if (!group_ok)
4434 error = EACCES;
4435 goto out;
4436 }
4437
4438 /* Not owner, not in group, use world result */
4439 _SETWHERE("world");
4440 if (!world_ok)
4441 error = EACCES;
4442
4443 /* FALLTHROUGH */
4444
4445 out:
4446 KAUTH_DEBUG("%p %s - posix %s permissions : need %s%s%s %x have %s%s%s%s%s%s%s%s%s UID = %d file = %d,%d",
4447 vcp->vp, (error == 0) ? "ALLOWED" : "DENIED", where,
4448 (action & VREAD) ? "r" : "-",
4449 (action & VWRITE) ? "w" : "-",
4450 (action & VEXEC) ? "x" : "-",
4451 needed,
4452 (vap->va_mode & S_IRUSR) ? "r" : "-",
4453 (vap->va_mode & S_IWUSR) ? "w" : "-",
4454 (vap->va_mode & S_IXUSR) ? "x" : "-",
4455 (vap->va_mode & S_IRGRP) ? "r" : "-",
4456 (vap->va_mode & S_IWGRP) ? "w" : "-",
4457 (vap->va_mode & S_IXGRP) ? "x" : "-",
4458 (vap->va_mode & S_IROTH) ? "r" : "-",
4459 (vap->va_mode & S_IWOTH) ? "w" : "-",
4460 (vap->va_mode & S_IXOTH) ? "x" : "-",
4461 kauth_cred_getuid(vcp->ctx->vc_ucred),
4462 on_dir ? vcp->dvap->va_uid : vcp->vap->va_uid,
4463 on_dir ? vcp->dvap->va_gid : vcp->vap->va_gid);
4464 return(error);
4465 }
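/*
 * Illustrative model (not compiled) of the class selection performed
 * above: exactly one permission class applies - owner beats group beats
 * world - even when a broader class would grant more. All names here are
 * assumptions for the sketch, not part of this file.
 */
#if 0
static int
posix_class_grants(mode_t mode, int is_owner, int in_group, mode_t rwx)
{
	/* 'rwx' is an S_I?OTH-style 3-bit mask, e.g. 04 for read */
	if (is_owner)
		return((((mode >> 6) & 07) & rwx) == rwx ? 0 : EACCES);
	if (in_group)
		return((((mode >> 3) & 07) & rwx) == rwx ? 0 : EACCES);
	return(((mode & 07) & rwx) == rwx ? 0 : EACCES);
}
#endif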
4466
4467 /*
4468 * Authorize the deletion of the node vp from the directory dvp.
4469 *
4470 * We assume that:
4471 * - Neither the node nor the directory are immutable.
4472 * - The user is not the superuser.
4473 *
4474 * Deletion is not permitted if the directory is sticky and the caller is not owner of the
4475 * node or directory.
4476 *
4477 * If either the node grants DELETE, or the directory grants DELETE_CHILD, the node may be
4478 * deleted. If neither denies the permission, and the caller has Posix write access to the
4479 * directory, then the node may be deleted.
4480 */
4481 static int
4482 vnode_authorize_delete(vauth_ctx vcp)
4483 {
4484 struct vnode_attr *vap = vcp->vap;
4485 struct vnode_attr *dvap = vcp->dvap;
4486 kauth_cred_t cred = vcp->ctx->vc_ucred;
4487 struct kauth_acl_eval eval;
4488 int error, delete_denied, delete_child_denied, ismember;
4489
4490 /* check the ACL on the directory */
4491 delete_child_denied = 0;
4492 if (VATTR_IS_NOT(dvap, va_acl, NULL)) {
4493 eval.ae_requested = KAUTH_VNODE_DELETE_CHILD;
4494 eval.ae_acl = &dvap->va_acl->acl_ace[0];
4495 eval.ae_count = dvap->va_acl->acl_entrycount;
4496 eval.ae_options = 0;
4497 if (vauth_dir_owner(vcp))
4498 eval.ae_options |= KAUTH_AEVAL_IS_OWNER;
4499 if ((error = vauth_dir_ingroup(vcp, &ismember)) != 0)
4500 return(error);
4501 if (ismember)
4502 eval.ae_options |= KAUTH_AEVAL_IN_GROUP;
4503 eval.ae_exp_gall = KAUTH_VNODE_GENERIC_ALL_BITS;
4504 eval.ae_exp_gread = KAUTH_VNODE_GENERIC_READ_BITS;
4505 eval.ae_exp_gwrite = KAUTH_VNODE_GENERIC_WRITE_BITS;
4506 eval.ae_exp_gexec = KAUTH_VNODE_GENERIC_EXECUTE_BITS;
4507
4508 error = kauth_acl_evaluate(cred, &eval);
4509
4510 if (error != 0) {
4511 KAUTH_DEBUG("%p ERROR during ACL processing - %d", vcp->vp, error);
4512 return(error);
4513 }
4514 if (eval.ae_result == KAUTH_RESULT_DENY)
4515 delete_child_denied = 1;
4516 if (eval.ae_result == KAUTH_RESULT_ALLOW) {
4517 KAUTH_DEBUG("%p ALLOWED - granted by directory ACL", vcp->vp);
4518 return(0);
4519 }
4520 }
4521
4522 /* check the ACL on the node */
4523 delete_denied = 0;
4524 if (VATTR_IS_NOT(vap, va_acl, NULL)) {
4525 eval.ae_requested = KAUTH_VNODE_DELETE;
4526 eval.ae_acl = &vap->va_acl->acl_ace[0];
4527 eval.ae_count = vap->va_acl->acl_entrycount;
4528 eval.ae_options = 0;
4529 if (vauth_file_owner(vcp))
4530 eval.ae_options |= KAUTH_AEVAL_IS_OWNER;
4531 if ((error = vauth_file_ingroup(vcp, &ismember)) != 0)
4532 return(error);
4533 if (ismember)
4534 eval.ae_options |= KAUTH_AEVAL_IN_GROUP;
4535 eval.ae_exp_gall = KAUTH_VNODE_GENERIC_ALL_BITS;
4536 eval.ae_exp_gread = KAUTH_VNODE_GENERIC_READ_BITS;
4537 eval.ae_exp_gwrite = KAUTH_VNODE_GENERIC_WRITE_BITS;
4538 eval.ae_exp_gexec = KAUTH_VNODE_GENERIC_EXECUTE_BITS;
4539
4540 if ((error = kauth_acl_evaluate(cred, &eval)) != 0) {
4541 KAUTH_DEBUG("%p ERROR during ACL processing - %d", vcp->vp, error);
4542 return(error);
4543 }
4544 if (eval.ae_result == KAUTH_RESULT_DENY)
4545 delete_denied = 1;
4546 if (eval.ae_result == KAUTH_RESULT_ALLOW) {
4547 KAUTH_DEBUG("%p ALLOWED - granted by file ACL", vcp->vp);
4548 return(0);
4549 }
4550 }
4551
4552 /* if denied by ACL on directory or node, return denial */
4553 if (delete_denied || delete_child_denied) {
4554 KAUTH_DEBUG("%p ALLOWED - denied by ACL", vcp->vp);
4555 return(EACCES);
4556 }
4557
4558 /* enforce sticky bit behaviour */
4559 if ((dvap->va_mode & S_ISTXT) && !vauth_file_owner(vcp) && !vauth_dir_owner(vcp)) {
4560 KAUTH_DEBUG("%p DENIED - sticky bit rules (user %d file %d dir %d)",
4561 vcp->vp, cred->cr_uid, vap->va_uid, dvap->va_uid);
4562 return(EACCES);
4563 }
4564
4565 /* check the directory */
4566 if ((error = vnode_authorize_posix(vcp, VWRITE, 1 /* on_dir */)) != 0) {
4567 		KAUTH_DEBUG("%p DENIED - denied by posix permissions", vcp->vp);
4568 return(error);
4569 }
4570
4571 /* not denied, must be OK */
4572 return(0);
4573 }
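/*
 * Worked example (illustrative): unlinking another user's file in a
 * mode 1777 directory such as /tmp, with no ACLs present. Both ACL
 * checks fall through, the sticky bit test fires because the caller owns
 * neither the file nor the directory, and EACCES is returned before
 * directory write permission is ever consulted.
 */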
4574
4575
4576 /*
4577 * Authorize an operation based on the node's attributes.
4578 */
4579 static int
4580 vnode_authorize_simple(vauth_ctx vcp, kauth_ace_rights_t acl_rights, kauth_ace_rights_t preauth_rights)
4581 {
4582 struct vnode_attr *vap = vcp->vap;
4583 kauth_cred_t cred = vcp->ctx->vc_ucred;
4584 struct kauth_acl_eval eval;
4585 int error, ismember;
4586 mode_t posix_action;
4587
4588 /*
4589 * If we are the file owner, we automatically have some rights.
4590 *
4591 * Do we need to expand this to support group ownership?
4592 */
4593 if (vauth_file_owner(vcp))
4594 acl_rights &= ~(KAUTH_VNODE_WRITE_SECURITY);
4595
4596 /*
4597 * If we are checking both TAKE_OWNERSHIP and WRITE_SECURITY, we can
4598 * mask the latter. If TAKE_OWNERSHIP is requested the caller is about to
4599 * change ownership to themselves, and WRITE_SECURITY is implicitly
4600 * granted to the owner. We need to do this because at this point
4601 * WRITE_SECURITY may not be granted as the caller is not currently
4602 * the owner.
4603 */
4604 if ((acl_rights & KAUTH_VNODE_TAKE_OWNERSHIP) &&
4605 (acl_rights & KAUTH_VNODE_WRITE_SECURITY))
4606 acl_rights &= ~KAUTH_VNODE_WRITE_SECURITY;
4607
4608 if (acl_rights == 0) {
4609 KAUTH_DEBUG("%p ALLOWED - implicit or no rights required", vcp->vp);
4610 return(0);
4611 }
4612
4613 /* if we have an ACL, evaluate it */
4614 if (VATTR_IS_NOT(vap, va_acl, NULL)) {
4615 eval.ae_requested = acl_rights;
4616 eval.ae_acl = &vap->va_acl->acl_ace[0];
4617 eval.ae_count = vap->va_acl->acl_entrycount;
4618 eval.ae_options = 0;
4619 if (vauth_file_owner(vcp))
4620 eval.ae_options |= KAUTH_AEVAL_IS_OWNER;
4621 if ((error = vauth_file_ingroup(vcp, &ismember)) != 0)
4622 return(error);
4623 if (ismember)
4624 eval.ae_options |= KAUTH_AEVAL_IN_GROUP;
4625 eval.ae_exp_gall = KAUTH_VNODE_GENERIC_ALL_BITS;
4626 eval.ae_exp_gread = KAUTH_VNODE_GENERIC_READ_BITS;
4627 eval.ae_exp_gwrite = KAUTH_VNODE_GENERIC_WRITE_BITS;
4628 eval.ae_exp_gexec = KAUTH_VNODE_GENERIC_EXECUTE_BITS;
4629
4630 if ((error = kauth_acl_evaluate(cred, &eval)) != 0) {
4631 KAUTH_DEBUG("%p ERROR during ACL processing - %d", vcp->vp, error);
4632 return(error);
4633 }
4634
4635 if (eval.ae_result == KAUTH_RESULT_DENY) {
4636 KAUTH_DEBUG("%p DENIED - by ACL", vcp->vp);
4637 return(EACCES); /* deny, deny, counter-allege */
4638 }
4639 if (eval.ae_result == KAUTH_RESULT_ALLOW) {
4640 KAUTH_DEBUG("%p ALLOWED - all rights granted by ACL", vcp->vp);
4641 return(0);
4642 }
4643 /* fall through and evaluate residual rights */
4644 } else {
4645 /* no ACL, everything is residual */
4646 eval.ae_residual = acl_rights;
4647 }
4648
4649 /*
4650 * Grant residual rights that have been pre-authorized.
4651 */
4652 eval.ae_residual &= ~preauth_rights;
4653
4654 /*
4655 * We grant WRITE_ATTRIBUTES to the owner if it hasn't been denied.
4656 */
4657 if (vauth_file_owner(vcp))
4658 eval.ae_residual &= ~KAUTH_VNODE_WRITE_ATTRIBUTES;
4659
4660 if (eval.ae_residual == 0) {
4661 KAUTH_DEBUG("%p ALLOWED - rights already authorized", vcp->vp);
4662 return(0);
4663 }
4664
4665 /*
4666 * Bail if we have residual rights that can't be granted by posix permissions,
4667 * or aren't presumed granted at this point.
4668 *
4669 * XXX these can be collapsed for performance
4670 */
4671 if (eval.ae_residual & KAUTH_VNODE_CHANGE_OWNER) {
4672 KAUTH_DEBUG("%p DENIED - CHANGE_OWNER not permitted", vcp->vp);
4673 return(EACCES);
4674 }
4675 if (eval.ae_residual & KAUTH_VNODE_WRITE_SECURITY) {
4676 KAUTH_DEBUG("%p DENIED - WRITE_SECURITY not permitted", vcp->vp);
4677 return(EACCES);
4678 }
4679
4680 #if DIAGNOSTIC
4681 if (eval.ae_residual & KAUTH_VNODE_DELETE)
4682 panic("vnode_authorize: can't be checking delete permission here");
4683 #endif
4684
4685 /*
4686 * Compute the fallback posix permissions that will satisfy the remaining
4687 * rights.
4688 */
4689 posix_action = 0;
4690 if (eval.ae_residual & (KAUTH_VNODE_READ_DATA |
4691 KAUTH_VNODE_LIST_DIRECTORY |
4692 KAUTH_VNODE_READ_EXTATTRIBUTES))
4693 posix_action |= VREAD;
4694 if (eval.ae_residual & (KAUTH_VNODE_WRITE_DATA |
4695 KAUTH_VNODE_ADD_FILE |
4696 KAUTH_VNODE_ADD_SUBDIRECTORY |
4697 KAUTH_VNODE_DELETE_CHILD |
4698 KAUTH_VNODE_WRITE_ATTRIBUTES |
4699 KAUTH_VNODE_WRITE_EXTATTRIBUTES))
4700 posix_action |= VWRITE;
4701 if (eval.ae_residual & (KAUTH_VNODE_EXECUTE |
4702 KAUTH_VNODE_SEARCH))
4703 posix_action |= VEXEC;
4704
4705 if (posix_action != 0) {
4706 return(vnode_authorize_posix(vcp, posix_action, 0 /* !on_dir */));
4707 } else {
4708 KAUTH_DEBUG("%p ALLOWED - residual rights %s%s%s%s%s%s%s%s%s%s%s%s%s%s granted due to no posix mapping",
4709 vcp->vp,
4710 (eval.ae_residual & KAUTH_VNODE_READ_DATA)
4711 ? vnode_isdir(vcp->vp) ? " LIST_DIRECTORY" : " READ_DATA" : "",
4712 (eval.ae_residual & KAUTH_VNODE_WRITE_DATA)
4713 ? vnode_isdir(vcp->vp) ? " ADD_FILE" : " WRITE_DATA" : "",
4714 (eval.ae_residual & KAUTH_VNODE_EXECUTE)
4715 ? vnode_isdir(vcp->vp) ? " SEARCH" : " EXECUTE" : "",
4716 (eval.ae_residual & KAUTH_VNODE_DELETE)
4717 ? " DELETE" : "",
4718 (eval.ae_residual & KAUTH_VNODE_APPEND_DATA)
4719 ? vnode_isdir(vcp->vp) ? " ADD_SUBDIRECTORY" : " APPEND_DATA" : "",
4720 (eval.ae_residual & KAUTH_VNODE_DELETE_CHILD)
4721 ? " DELETE_CHILD" : "",
4722 (eval.ae_residual & KAUTH_VNODE_READ_ATTRIBUTES)
4723 ? " READ_ATTRIBUTES" : "",
4724 (eval.ae_residual & KAUTH_VNODE_WRITE_ATTRIBUTES)
4725 ? " WRITE_ATTRIBUTES" : "",
4726 (eval.ae_residual & KAUTH_VNODE_READ_EXTATTRIBUTES)
4727 ? " READ_EXTATTRIBUTES" : "",
4728 (eval.ae_residual & KAUTH_VNODE_WRITE_EXTATTRIBUTES)
4729 ? " WRITE_EXTATTRIBUTES" : "",
4730 (eval.ae_residual & KAUTH_VNODE_READ_SECURITY)
4731 ? " READ_SECURITY" : "",
4732 (eval.ae_residual & KAUTH_VNODE_WRITE_SECURITY)
4733 ? " WRITE_SECURITY" : "",
4734 (eval.ae_residual & KAUTH_VNODE_CHECKIMMUTABLE)
4735 ? " CHECKIMMUTABLE" : "",
4736 (eval.ae_residual & KAUTH_VNODE_CHANGE_OWNER)
4737 ? " CHANGE_OWNER" : "");
4738 }
4739
4740 /*
4741 * Lack of required Posix permissions implies no reason to deny access.
4742 */
4743 return(0);
4744 }
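/*
 * Example (illustrative): with no ACL present, a request for
 * KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_ATTRIBUTES by a non-owner
 * leaves both bits residual; the mapping above folds them into
 * VREAD | VWRITE and defers to vnode_authorize_posix(). For the owner,
 * WRITE_ATTRIBUTES is stripped from the residual first, leaving only
 * VREAD to check.
 */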
4745
4746 /*
4747 * Check for file immutability.
4748 */
4749 static int
4750 vnode_authorize_checkimmutable(vnode_t vp, struct vnode_attr *vap, int rights, int ignore)
4751 {
4752 mount_t mp;
4753 int error;
4754 int append;
4755
4756 /*
4757 * Perform immutability checks for operations that change data.
4758 *
4759 * Sockets, fifos and devices require special handling.
4760 */
4761 switch(vp->v_type) {
4762 case VSOCK:
4763 case VFIFO:
4764 case VBLK:
4765 case VCHR:
4766 /*
4767 * Writing to these nodes does not change the filesystem data,
4768 * so forget that it's being tried.
4769 */
4770 rights &= ~KAUTH_VNODE_WRITE_DATA;
4771 break;
4772 default:
4773 break;
4774 }
4775
4776 error = 0;
4777 if (rights & KAUTH_VNODE_WRITE_RIGHTS) {
4778
4779 /* check per-filesystem options if possible */
4780 mp = vnode_mount(vp);
4781 if (mp != NULL) {
4782
4783 /* check for no-EA filesystems */
4784 if ((rights & KAUTH_VNODE_WRITE_EXTATTRIBUTES) &&
4785 (vfs_flags(mp) & MNT_NOUSERXATTR)) {
4786 KAUTH_DEBUG("%p DENIED - filesystem disallowed extended attributes", vp);
4787 error = EACCES; /* User attributes disabled */
4788 goto out;
4789 }
4790 }
4791
4792 /* check for file immutability */
4793 append = 0;
4794 if (vp->v_type == VDIR) {
4795 if ((rights & (KAUTH_VNODE_ADD_FILE | KAUTH_VNODE_ADD_SUBDIRECTORY)) == rights)
4796 append = 1;
4797 } else {
4798 if ((rights & KAUTH_VNODE_APPEND_DATA) == rights)
4799 append = 1;
4800 }
4801 if ((error = vnode_immutable(vap, append, ignore)) != 0) {
4802 KAUTH_DEBUG("%p DENIED - file is immutable", vp);
4803 goto out;
4804 }
4805 }
4806 out:
4807 return(error);
4808 }
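/*
 * Example (illustrative): a request consisting solely of
 * KAUTH_VNODE_APPEND_DATA on a regular file sets 'append' above, so a
 * file marked append-only (UF_APPEND/SF_APPEND) still admits the
 * operation; any additional write right in 'rights' clears that
 * exemption and the immutability test then fails with EPERM.
 */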
4809
4810 /*
4811 * Handle authorization actions for filesystems that advertise that the server will
4812 * be enforcing.
4813 */
4814 static int
4815 vnode_authorize_opaque(vnode_t vp, int *resultp, kauth_action_t action, vfs_context_t ctx)
4816 {
4817 int error;
4818
4819 /*
4820 * If the vp is a device node, socket or FIFO it actually represents a local
4821 * endpoint, so we need to handle it locally.
4822 */
4823 switch(vp->v_type) {
4824 case VBLK:
4825 case VCHR:
4826 case VSOCK:
4827 case VFIFO:
4828 return(0);
4829 default:
4830 break;
4831 }
4832
4833 /*
4834 * In the advisory request case, if the filesystem doesn't think it's reliable
4835 * we will attempt to formulate a result ourselves based on VNOP_GETATTR data.
4836 */
4837 if ((action & KAUTH_VNODE_ACCESS) && !vfs_authopaqueaccess(vnode_mount(vp)))
4838 return(0);
4839
4840 /*
4841 	 * Let the filesystem have a say in the matter.  It's OK for it to not implement
4842 * VNOP_ACCESS, as most will authorise inline with the actual request.
4843 */
4844 if ((error = VNOP_ACCESS(vp, action, ctx)) != ENOTSUP) {
4845 *resultp = error;
4846 KAUTH_DEBUG("%p DENIED - opaque filesystem VNOP_ACCESS denied access", vp);
4847 return(1);
4848 }
4849
4850 /*
4851 * Typically opaque filesystems do authorisation in-line, but exec is a special case. In
4852 * order to be reasonably sure that exec will be permitted, we try a bit harder here.
4853 */
4854 if ((action & KAUTH_VNODE_EXECUTE) && vnode_isreg(vp)) {
4855 /* try a VNOP_OPEN for readonly access */
4856 if ((error = VNOP_OPEN(vp, FREAD, ctx)) != 0) {
4857 *resultp = error;
4858 KAUTH_DEBUG("%p DENIED - EXECUTE denied because file could not be opened readonly", vp);
4859 return(1);
4860 }
4861 VNOP_CLOSE(vp, FREAD, ctx);
4862 }
4863
4864 /*
4865 * We don't have any reason to believe that the request has to be denied at this point,
4866 * so go ahead and allow it.
4867 */
4868 *resultp = 0;
4869 KAUTH_DEBUG("%p ALLOWED - bypassing access check for non-local filesystem", vp);
4870 return(1);
4871 }
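/*
 * Worked example (illustrative): on a network filesystem that maps the
 * caller to an unprivileged identity, the FREAD open probe above fails,
 * so an exec attempt is denied up front rather than faulting after the
 * image loader has committed to the vnode. The probe is a heuristic: a
 * server that would refuse the read at exec time will likely refuse
 * this open too.
 */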
4872
4873 static int
4874 vnode_authorize_callback(__unused kauth_cred_t unused_cred, __unused void *idata, kauth_action_t action,
4875 uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3)
4876 {
4877 struct _vnode_authorize_context auth_context;
4878 vauth_ctx vcp;
4879 vfs_context_t ctx;
4880 vnode_t vp, dvp;
4881 kauth_cred_t cred;
4882 kauth_ace_rights_t rights;
4883 struct vnode_attr va, dva;
4884 int result;
4885 int *errorp;
4886 int noimmutable;
4887
4888 vcp = &auth_context;
4889 ctx = vcp->ctx = (vfs_context_t)arg0;
4890 vp = vcp->vp = (vnode_t)arg1;
4891 dvp = vcp->dvp = (vnode_t)arg2;
4892 errorp = (int *)arg3;
4893 /* note that we authorize against the context, not the passed cred (the same thing anyway) */
4894 cred = ctx->vc_ucred;
4895
4896 VATTR_INIT(&va);
4897 vcp->vap = &va;
4898 VATTR_INIT(&dva);
4899 vcp->dvap = &dva;
4900
4901 vcp->flags = vcp->flags_valid = 0;
4902
4903 #if DIAGNOSTIC
4904 if ((ctx == NULL) || (vp == NULL) || (cred == NULL))
4905 panic("vnode_authorize: bad arguments (context %p vp %p cred %p)", ctx, vp, cred);
4906 #endif
4907
4908 KAUTH_DEBUG("%p AUTH - %s %s%s%s%s%s%s%s%s%s%s%s%s%s%s%s on %s '%s' (0x%x:%p/%p)",
4909 vp, vfs_context_proc(ctx)->p_comm,
4910 (action & KAUTH_VNODE_ACCESS) ? "access" : "auth",
4911 (action & KAUTH_VNODE_READ_DATA) ? vnode_isdir(vp) ? " LIST_DIRECTORY" : " READ_DATA" : "",
4912 (action & KAUTH_VNODE_WRITE_DATA) ? vnode_isdir(vp) ? " ADD_FILE" : " WRITE_DATA" : "",
4913 (action & KAUTH_VNODE_EXECUTE) ? vnode_isdir(vp) ? " SEARCH" : " EXECUTE" : "",
4914 (action & KAUTH_VNODE_DELETE) ? " DELETE" : "",
4915 (action & KAUTH_VNODE_APPEND_DATA) ? vnode_isdir(vp) ? " ADD_SUBDIRECTORY" : " APPEND_DATA" : "",
4916 (action & KAUTH_VNODE_DELETE_CHILD) ? " DELETE_CHILD" : "",
4917 (action & KAUTH_VNODE_READ_ATTRIBUTES) ? " READ_ATTRIBUTES" : "",
4918 (action & KAUTH_VNODE_WRITE_ATTRIBUTES) ? " WRITE_ATTRIBUTES" : "",
4919 (action & KAUTH_VNODE_READ_EXTATTRIBUTES) ? " READ_EXTATTRIBUTES" : "",
4920 (action & KAUTH_VNODE_WRITE_EXTATTRIBUTES) ? " WRITE_EXTATTRIBUTES" : "",
4921 (action & KAUTH_VNODE_READ_SECURITY) ? " READ_SECURITY" : "",
4922 (action & KAUTH_VNODE_WRITE_SECURITY) ? " WRITE_SECURITY" : "",
4923 (action & KAUTH_VNODE_CHANGE_OWNER) ? " CHANGE_OWNER" : "",
4924 (action & KAUTH_VNODE_NOIMMUTABLE) ? " (noimmutable)" : "",
4925 vnode_isdir(vp) ? "directory" : "file",
4926 vp->v_name ? vp->v_name : "<NULL>", action, vp, dvp);
4927
4928 /*
4929 * Extract the control bits from the action, everything else is
4930 * requested rights.
4931 */
4932 noimmutable = (action & KAUTH_VNODE_NOIMMUTABLE) ? 1 : 0;
4933 rights = action & ~(KAUTH_VNODE_ACCESS | KAUTH_VNODE_NOIMMUTABLE);
4934
4935 if (rights & KAUTH_VNODE_DELETE) {
4936 #if DIAGNOSTIC
4937 if (dvp == NULL)
4938 panic("vnode_authorize: KAUTH_VNODE_DELETE test requires a directory");
4939 #endif
4940 } else {
4941 dvp = NULL;
4942 }
4943
4944 /*
4945 * Check for read-only filesystems.
4946 */
4947 if ((rights & KAUTH_VNODE_WRITE_RIGHTS) &&
4948 (vp->v_mount->mnt_flag & MNT_RDONLY) &&
4949 ((vp->v_type == VREG) || (vp->v_type == VDIR) ||
4950 (vp->v_type == VLNK) || (vp->v_type == VCPLX) ||
4951 (rights & KAUTH_VNODE_DELETE) || (rights & KAUTH_VNODE_DELETE_CHILD))) {
4952 result = EROFS;
4953 goto out;
4954 }
4955
4956 /*
4957 * Check for noexec filesystems.
4958 */
4959 if ((rights & KAUTH_VNODE_EXECUTE) && vnode_isreg(vp) && (vp->v_mount->mnt_flag & MNT_NOEXEC)) {
4960 result = EACCES;
4961 goto out;
4962 }
4963
4964 /*
4965 * Handle cases related to filesystems with non-local enforcement.
4966 * This call can return 0, in which case we will fall through to perform a
4967 * check based on VNOP_GETATTR data. Otherwise it returns 1 and sets
4968 * an appropriate result, at which point we can return immediately.
4969 */
4970 if (vfs_authopaque(vp->v_mount) && vnode_authorize_opaque(vp, &result, action, ctx))
4971 goto out;
4972
4973 /*
4974 * Get vnode attributes and extended security information for the vnode
4975 * and directory if required.
4976 */
4977 VATTR_WANTED(&va, va_mode);
4978 VATTR_WANTED(&va, va_uid);
4979 VATTR_WANTED(&va, va_gid);
4980 VATTR_WANTED(&va, va_flags);
4981 VATTR_WANTED(&va, va_acl);
4982 if ((result = vnode_getattr(vp, &va, ctx)) != 0) {
4983 KAUTH_DEBUG("%p ERROR - failed to get vnode attributes - %d", vp, result);
4984 goto out;
4985 }
4986 if (dvp) {
4987 VATTR_WANTED(&dva, va_mode);
4988 VATTR_WANTED(&dva, va_uid);
4989 VATTR_WANTED(&dva, va_gid);
4990 VATTR_WANTED(&dva, va_flags);
4991 VATTR_WANTED(&dva, va_acl);
4992 if ((result = vnode_getattr(dvp, &dva, ctx)) != 0) {
4993 KAUTH_DEBUG("%p ERROR - failed to get directory vnode attributes - %d", vp, result);
4994 goto out;
4995 }
4996 }
4997
4998 /*
4999 	 * If the vnode is an extended attribute data vnode (e.g. a resource fork), *_DATA becomes
5000 * *_EXTATTRIBUTES.
5001 */
5002 if (S_ISXATTR(va.va_mode)) {
5003 if (rights & KAUTH_VNODE_READ_DATA) {
5004 rights &= ~KAUTH_VNODE_READ_DATA;
5005 rights |= KAUTH_VNODE_READ_EXTATTRIBUTES;
5006 }
5007 if (rights & KAUTH_VNODE_WRITE_DATA) {
5008 rights &= ~KAUTH_VNODE_WRITE_DATA;
5009 rights |= KAUTH_VNODE_WRITE_EXTATTRIBUTES;
5010 }
5011 }
5012
5013 /*
5014 * Check for immutability.
5015 *
5016 * In the deletion case, parent directory immutability vetoes specific
5017 * file rights.
5018 */
5019 if ((result = vnode_authorize_checkimmutable(vp, &va, rights, noimmutable)) != 0)
5020 goto out;
5021 if ((rights & KAUTH_VNODE_DELETE) &&
5022 ((result = vnode_authorize_checkimmutable(dvp, &dva, KAUTH_VNODE_DELETE_CHILD, 0)) != 0))
5023 goto out;
5024
5025 /*
5026 * Clear rights that have been authorized by reaching this point, bail if nothing left to
5027 * check.
5028 */
5029 rights &= ~(KAUTH_VNODE_LINKTARGET | KAUTH_VNODE_CHECKIMMUTABLE);
5030 if (rights == 0)
5031 goto out;
5032
5033 /*
5034 * If we're not the superuser, authorize based on file properties.
5035 */
5036 if (!vfs_context_issuser(ctx)) {
5037 /* process delete rights */
5038 if ((rights & KAUTH_VNODE_DELETE) &&
5039 ((result = vnode_authorize_delete(vcp)) != 0))
5040 goto out;
5041
5042 /* process remaining rights */
5043 if ((rights & ~KAUTH_VNODE_DELETE) &&
5044 ((result = vnode_authorize_simple(vcp, rights, rights & KAUTH_VNODE_DELETE)) != 0))
5045 goto out;
5046 } else {
5047
5048 /*
5049 * Execute is only granted to root if one of the x bits is set. This check only
5050 * makes sense if the posix mode bits are actually supported.
5051 */
5052 if ((rights & KAUTH_VNODE_EXECUTE) &&
5053 (vp->v_type == VREG) &&
5054 VATTR_IS_SUPPORTED(&va, va_mode) &&
5055 !(va.va_mode & (S_IXUSR | S_IXGRP | S_IXOTH))) {
5056 result = EPERM;
5057 KAUTH_DEBUG("%p DENIED - root execute requires at least one x bit in 0x%x", vp, va.va_mode);
5058 goto out;
5059 }
5060
5061 KAUTH_DEBUG("%p ALLOWED - caller is superuser", vp);
5062 }
5063
5064 out:
5065 if (VATTR_IS_SUPPORTED(&va, va_acl) && (va.va_acl != NULL))
5066 kauth_acl_free(va.va_acl);
5067 if (VATTR_IS_SUPPORTED(&dva, va_acl) && (dva.va_acl != NULL))
5068 kauth_acl_free(dva.va_acl);
5069 if (result) {
5070 *errorp = result;
5071 KAUTH_DEBUG("%p DENIED - auth denied", vp);
5072 return(KAUTH_RESULT_DENY);
5073 }
5074
5075 /*
5076 * Note that this implies that we will allow requests for no rights, as well as
5077 * for rights that we do not recognise. There should be none of these.
5078 */
5079 KAUTH_DEBUG("%p ALLOWED - auth granted", vp);
5080 return(KAUTH_RESULT_ALLOW);
5081 }
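/*
 * Illustrative sketch (not compiled): an access(2)-style advisory query
 * arriving at the callback above. KAUTH_VNODE_ACCESS marks the request
 * advisory; for such requests vnode_authorize_opaque() defers to a
 * locally computed answer unless the filesystem claims reliable access
 * checking. 'vp', 'ctx' and 'error' are assumed caller state.
 */
#if 0
	error = vnode_authorize(vp, NULL,
	    KAUTH_VNODE_READ_DATA | KAUTH_VNODE_ACCESS, ctx);
#endif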
5082
5083 /*
5084 * Check that the attribute information in vattr can be legally applied to
5085 * a new file by the context.
5086 */
5087 int
5088 vnode_authattr_new(vnode_t dvp, struct vnode_attr *vap, int noauth, vfs_context_t ctx)
5089 {
5090 int error;
5091 int is_suser, ismember, defaulted_owner, defaulted_group, defaulted_mode;
5092 kauth_cred_t cred;
5093 guid_t changer;
5094 mount_t dmp;
5095
5096 error = 0;
5097 defaulted_owner = defaulted_group = defaulted_mode = 0;
5098
5099 /*
5100 * Require that the filesystem support extended security to apply any.
5101 */
5102 if (!vfs_extendedsecurity(dvp->v_mount) &&
5103 (VATTR_IS_ACTIVE(vap, va_acl) || VATTR_IS_ACTIVE(vap, va_uuuid) || VATTR_IS_ACTIVE(vap, va_guuid))) {
5104 error = EINVAL;
5105 goto out;
5106 }
5107
5108 /*
5109 * Default some fields.
5110 */
5111 dmp = dvp->v_mount;
5112
5113 /*
5114 * If the filesystem is mounted IGNORE_OWNERSHIP and an explicit owner is set, that
5115 * owner takes ownership of all new files.
5116 */
5117 if ((dmp->mnt_flag & MNT_IGNORE_OWNERSHIP) && (dmp->mnt_fsowner != KAUTH_UID_NONE)) {
5118 VATTR_SET(vap, va_uid, dmp->mnt_fsowner);
5119 defaulted_owner = 1;
5120 } else {
5121 if (!VATTR_IS_ACTIVE(vap, va_uid)) {
5122 /* default owner is current user */
5123 VATTR_SET(vap, va_uid, kauth_cred_getuid(vfs_context_ucred(ctx)));
5124 defaulted_owner = 1;
5125 }
5126 }
5127
5128 /*
5129 	 * If the filesystem is mounted IGNORE_OWNERSHIP and an explicit group is set, that
5130 * group takes ownership of all new files.
5131 */
5132 if ((dmp->mnt_flag & MNT_IGNORE_OWNERSHIP) && (dmp->mnt_fsgroup != KAUTH_GID_NONE)) {
5133 VATTR_SET(vap, va_gid, dmp->mnt_fsgroup);
5134 defaulted_group = 1;
5135 } else {
5136 if (!VATTR_IS_ACTIVE(vap, va_gid)) {
5137 /* default group comes from parent object, fallback to current user */
5138 struct vnode_attr dva;
5139 VATTR_INIT(&dva);
5140 VATTR_WANTED(&dva, va_gid);
5141 if ((error = vnode_getattr(dvp, &dva, ctx)) != 0)
5142 goto out;
5143 if (VATTR_IS_SUPPORTED(&dva, va_gid)) {
5144 VATTR_SET(vap, va_gid, dva.va_gid);
5145 } else {
5146 VATTR_SET(vap, va_gid, kauth_cred_getgid(vfs_context_ucred(ctx)));
5147 }
5148 defaulted_group = 1;
5149 }
5150 }
5151
5152 if (!VATTR_IS_ACTIVE(vap, va_flags))
5153 VATTR_SET(vap, va_flags, 0);
5154
5155 /* default mode is everything, masked with current umask */
5156 if (!VATTR_IS_ACTIVE(vap, va_mode)) {
5157 VATTR_SET(vap, va_mode, ACCESSPERMS & ~vfs_context_proc(ctx)->p_fd->fd_cmask);
5158 KAUTH_DEBUG("ATTR - defaulting new file mode to %o from umask %o", vap->va_mode, vfs_context_proc(ctx)->p_fd->fd_cmask);
5159 defaulted_mode = 1;
5160 }
5161 /* set timestamps to now */
5162 if (!VATTR_IS_ACTIVE(vap, va_create_time)) {
5163 nanotime(&vap->va_create_time);
5164 VATTR_SET_ACTIVE(vap, va_create_time);
5165 }
5166
5167 /*
5168 * Check for attempts to set nonsensical fields.
5169 */
5170 if (vap->va_active & ~VNODE_ATTR_NEWOBJ) {
5171 error = EINVAL;
5172 KAUTH_DEBUG("ATTR - ERROR - attempt to set unsupported new-file attributes %llx",
5173 vap->va_active & ~VNODE_ATTR_NEWOBJ);
5174 goto out;
5175 }
5176
5177 /*
5178 * Quickly check for the applicability of any enforcement here.
5179 * Tests below maintain the integrity of the local security model.
5180 */
5181 if (vfs_authopaque(vnode_mount(dvp)))
5182 goto out;
5183
5184 /*
5185 * We need to know if the caller is the superuser, or if the work is
5186 * otherwise already authorised.
5187 */
5188 cred = vfs_context_ucred(ctx);
5189 if (noauth) {
5190 /* doing work for the kernel */
5191 is_suser = 1;
5192 } else {
5193 is_suser = vfs_context_issuser(ctx);
5194 }
5195
5196
5197 if (VATTR_IS_ACTIVE(vap, va_flags)) {
5198 if (is_suser) {
5199 if ((vap->va_flags & (UF_SETTABLE | SF_SETTABLE)) != vap->va_flags) {
5200 error = EPERM;
5201 KAUTH_DEBUG(" DENIED - superuser attempt to set illegal flag(s)");
5202 goto out;
5203 }
5204 } else {
5205 if ((vap->va_flags & UF_SETTABLE) != vap->va_flags) {
5206 error = EPERM;
5207 KAUTH_DEBUG(" DENIED - user attempt to set illegal flag(s)");
5208 goto out;
5209 }
5210 }
5211 }
5212
5213 /* if not superuser, validate legality of new-item attributes */
5214 if (!is_suser) {
5215 if (!defaulted_mode && VATTR_IS_ACTIVE(vap, va_mode)) {
5216 /* setgid? */
5217 if (vap->va_mode & S_ISGID) {
5218 if ((error = kauth_cred_ismember_gid(cred, vap->va_gid, &ismember)) != 0) {
5219 KAUTH_DEBUG("ATTR - ERROR: got %d checking for membership in %d", error, vap->va_gid);
5220 goto out;
5221 }
5222 if (!ismember) {
5223 KAUTH_DEBUG(" DENIED - can't set SGID bit, not a member of %d", vap->va_gid);
5224 error = EPERM;
5225 goto out;
5226 }
5227 }
5228
5229 /* setuid? */
5230 if ((vap->va_mode & S_ISUID) && (vap->va_uid != kauth_cred_getuid(cred))) {
5231 KAUTH_DEBUG("ATTR - ERROR: illegal attempt to set the setuid bit");
5232 error = EPERM;
5233 goto out;
5234 }
5235 }
5236 if (!defaulted_owner && (vap->va_uid != kauth_cred_getuid(cred))) {
5237 KAUTH_DEBUG(" DENIED - cannot create new item owned by %d", vap->va_uid);
5238 error = EPERM;
5239 goto out;
5240 }
5241 if (!defaulted_group) {
5242 if ((error = kauth_cred_ismember_gid(cred, vap->va_gid, &ismember)) != 0) {
5243 KAUTH_DEBUG(" ERROR - got %d checking for membership in %d", error, vap->va_gid);
5244 goto out;
5245 }
5246 if (!ismember) {
5247 KAUTH_DEBUG(" DENIED - cannot create new item with group %d - not a member", vap->va_gid);
5248 error = EPERM;
5249 goto out;
5250 }
5251 }
5252
5253 /* initialising owner/group UUID */
5254 if (VATTR_IS_ACTIVE(vap, va_uuuid)) {
5255 if ((error = kauth_cred_getguid(cred, &changer)) != 0) {
5256 KAUTH_DEBUG(" ERROR - got %d trying to get caller UUID", error);
5257 /* XXX ENOENT here - no GUID - should perhaps become EPERM */
5258 goto out;
5259 }
5260 if (!kauth_guid_equal(&vap->va_uuuid, &changer)) {
5261 KAUTH_DEBUG(" ERROR - cannot create item with supplied owner UUID - not us");
5262 error = EPERM;
5263 goto out;
5264 }
5265 }
5266 if (VATTR_IS_ACTIVE(vap, va_guuid)) {
5267 if ((error = kauth_cred_ismember_guid(cred, &vap->va_guuid, &ismember)) != 0) {
5268 KAUTH_DEBUG(" ERROR - got %d trying to check group membership", error);
5269 goto out;
5270 }
5271 if (!ismember) {
5272 KAUTH_DEBUG(" ERROR - cannot create item with supplied group UUID - not a member");
5273 error = EPERM;
5274 goto out;
5275 }
5276 }
5277 }
5278 out:
5279 return(error);
5280 }
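/*
 * Illustrative sketch (not compiled): the typical create path pairs this
 * routine with a directory ADD_FILE authorization before VNOP_CREATE.
 * 'create_mode', 'dvp', 'ctx' and 'error' are assumed names for the
 * sketch, not part of this file.
 */
#if 0
	struct vnode_attr va;
	int error;

	VATTR_INIT(&va);
	VATTR_SET(&va, va_mode, create_mode & ~S_IFMT);	/* 'create_mode' is assumed */
	if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) == 0)
		error = vnode_authattr_new(dvp, &va, 0, ctx);
#endif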
5281
5282 /*
5283 * Check that the attribute information in vap can be legally written by the context.
5284 *
5285 * Call this when you're not sure about the vnode_attr; either its contents have come
5286 * from an unknown source, or when they are variable.
5287 *
5288 * Returns errno, or zero and sets *actionp to the KAUTH_VNODE_* actions that
5289 * must be authorized to be permitted to write the vattr.
5290 */
5291 int
5292 vnode_authattr(vnode_t vp, struct vnode_attr *vap, kauth_action_t *actionp, vfs_context_t ctx)
5293 {
5294 struct vnode_attr ova;
5295 kauth_action_t required_action;
5296 int error, is_suser, ismember, chowner, chgroup;
5297 guid_t changer;
5298 gid_t group;
5299 uid_t owner;
5300 mode_t newmode;
5301 kauth_cred_t cred;
5302 uint32_t fdelta;
5303
5304 VATTR_INIT(&ova);
5305 required_action = 0;
5306 error = 0;
5307
5308 /*
5309 * Quickly check for enforcement applicability.
5310 */
5311 if (vfs_authopaque(vnode_mount(vp)))
5312 goto out;
5313
5314 /*
5315 * Check for attempts to set nonsensical fields.
5316 */
5317 if (vap->va_active & VNODE_ATTR_RDONLY) {
5318 KAUTH_DEBUG("ATTR - ERROR: attempt to set readonly attribute(s)");
5319 error = EINVAL;
5320 goto out;
5321 }
5322
5323 /*
5324 * We need to know if the caller is the superuser.
5325 */
5326 cred = vfs_context_ucred(ctx);
5327 is_suser = kauth_cred_issuser(cred);
5328
5329 /*
5330 * If any of the following are changing, we need information from the old file:
5331 * va_uid
5332 * va_gid
5333 * va_mode
5334 * va_uuuid
5335 * va_guuid
5336 */
5337 if (VATTR_IS_ACTIVE(vap, va_uid) ||
5338 VATTR_IS_ACTIVE(vap, va_gid) ||
5339 VATTR_IS_ACTIVE(vap, va_mode) ||
5340 VATTR_IS_ACTIVE(vap, va_uuuid) ||
5341 VATTR_IS_ACTIVE(vap, va_guuid)) {
5342 VATTR_WANTED(&ova, va_mode);
5343 VATTR_WANTED(&ova, va_uid);
5344 VATTR_WANTED(&ova, va_gid);
5345 VATTR_WANTED(&ova, va_uuuid);
5346 VATTR_WANTED(&ova, va_guuid);
5347 KAUTH_DEBUG("ATTR - security information changing, fetching existing attributes");
5348 }
5349
5350 /*
5351 * If timestamps are being changed, we need to know who the file is owned
5352 * by.
5353 */
5354 if (VATTR_IS_ACTIVE(vap, va_create_time) ||
5355 VATTR_IS_ACTIVE(vap, va_change_time) ||
5356 VATTR_IS_ACTIVE(vap, va_modify_time) ||
5357 VATTR_IS_ACTIVE(vap, va_access_time) ||
5358 VATTR_IS_ACTIVE(vap, va_backup_time)) {
5359
5360 VATTR_WANTED(&ova, va_uid);
5361 #if 0 /* enable this when we support UUIDs as official owners */
5362 VATTR_WANTED(&ova, va_uuuid);
5363 #endif
5364 KAUTH_DEBUG("ATTR - timestamps changing, fetching uid and GUID");
5365 }
5366
5367 /*
5368 * If flags are being changed, we need the old flags.
5369 */
5370 if (VATTR_IS_ACTIVE(vap, va_flags)) {
5371 KAUTH_DEBUG("ATTR - flags changing, fetching old flags");
5372 VATTR_WANTED(&ova, va_flags);
5373 }
5374
5375 /*
5376 * If the size is being set, make sure it's not a directory.
5377 */
5378 if (VATTR_IS_ACTIVE(vap, va_data_size)) {
5379 /* size is meaningless on a directory, don't permit this */
5380 if (vnode_isdir(vp)) {
5381 KAUTH_DEBUG("ATTR - ERROR: size change requested on a directory");
5382 error = EISDIR;
5383 goto out;
5384 }
5385 }
5386
5387 /*
5388 * Get old data.
5389 */
5390 KAUTH_DEBUG("ATTR - fetching old attributes %016llx", ova.va_active);
5391 if ((error = vnode_getattr(vp, &ova, ctx)) != 0) {
5392 KAUTH_DEBUG(" ERROR - got %d trying to get attributes", error);
5393 goto out;
5394 }
5395
5396 /*
5397 * Size changes require write access to the file data.
5398 */
5399 if (VATTR_IS_ACTIVE(vap, va_data_size)) {
5400 /* if we can't get the size, or it's different, we need write access */
5401 KAUTH_DEBUG("ATTR - size change, requiring WRITE_DATA");
5402 required_action |= KAUTH_VNODE_WRITE_DATA;
5403 }
5404
5405 /*
5406 * Changing timestamps?
5407 *
5408 * Note that we are only called to authorize user-requested time changes;
5409 * side-effect time changes are not authorized. Authorisation is only
5410 * required for existing files.
5411 *
5412 * Non-owners are not permitted to change the time on an existing
5413 * file to anything other than the current time.
5414 */
5415 if (VATTR_IS_ACTIVE(vap, va_create_time) ||
5416 VATTR_IS_ACTIVE(vap, va_change_time) ||
5417 VATTR_IS_ACTIVE(vap, va_modify_time) ||
5418 VATTR_IS_ACTIVE(vap, va_access_time) ||
5419 VATTR_IS_ACTIVE(vap, va_backup_time)) {
5420 /*
5421 * The owner and root may set any timestamps they like,
5422 * provided that the file is not immutable. The owner still needs
5423 * WRITE_ATTRIBUTES (implied by ownership but still deniable).
5424 */
5425 if (is_suser || vauth_node_owner(&ova, cred)) {
5426 KAUTH_DEBUG("ATTR - root or owner changing timestamps");
5427 required_action |= KAUTH_VNODE_CHECKIMMUTABLE | KAUTH_VNODE_WRITE_ATTRIBUTES;
5428 } else {
5429 /* just setting the current time? */
5430 if (vap->va_vaflags & VA_UTIMES_NULL) {
5431 KAUTH_DEBUG("ATTR - non-root/owner changing timestamps, requiring WRITE_ATTRIBUTES");
5432 required_action |= KAUTH_VNODE_WRITE_ATTRIBUTES;
5433 } else {
5434 KAUTH_DEBUG("ATTR - ERROR: illegal timestamp modification attempted");
5435 error = EACCES;
5436 goto out;
5437 }
5438 }
5439 }
5440
5441 /*
5442 * Changing file mode?
5443 */
5444 if (VATTR_IS_ACTIVE(vap, va_mode) && VATTR_IS_SUPPORTED(&ova, va_mode) && (ova.va_mode != vap->va_mode)) {
5445 KAUTH_DEBUG("ATTR - mode change from %06o to %06o", ova.va_mode, vap->va_mode);
5446
5447 /*
5448 * Mode changes always have the same basic auth requirements.
5449 */
5450 if (is_suser) {
5451 KAUTH_DEBUG("ATTR - superuser mode change, requiring immutability check");
5452 required_action |= KAUTH_VNODE_CHECKIMMUTABLE;
5453 } else {
5454 /* need WRITE_SECURITY */
5455 KAUTH_DEBUG("ATTR - non-superuser mode change, requiring WRITE_SECURITY");
5456 required_action |= KAUTH_VNODE_WRITE_SECURITY;
5457 }
5458
5459 /*
5460 * Can't set the setgid bit if you're not in the group and not root. Have to have
5461 * existing group information in the case we're not setting it right now.
5462 */
5463 if (vap->va_mode & S_ISGID) {
5464 required_action |= KAUTH_VNODE_CHECKIMMUTABLE; /* always required */
5465 if (!is_suser) {
5466 if (VATTR_IS_ACTIVE(vap, va_gid)) {
5467 group = vap->va_gid;
5468 } else if (VATTR_IS_SUPPORTED(&ova, va_gid)) {
5469 group = ova.va_gid;
5470 } else {
5471 KAUTH_DEBUG("ATTR - ERROR: setgid but no gid available");
5472 error = EINVAL;
5473 goto out;
5474 }
5475 /*
5476 * This might be too restrictive; WRITE_SECURITY might be implied by
5477 * membership in this case, rather than being an additional requirement.
5478 */
5479 if ((error = kauth_cred_ismember_gid(cred, group, &ismember)) != 0) {
5480 KAUTH_DEBUG("ATTR - ERROR: got %d checking for membership in %d", error, vap->va_gid);
5481 goto out;
5482 }
5483 if (!ismember) {
5484 KAUTH_DEBUG(" DENIED - can't set SGID bit, not a member of %d", group);
5485 error = EPERM;
5486 goto out;
5487 }
5488 }
5489 }
5490
5491 /*
5492 * Can't set the setuid bit unless you're root or the file's owner.
5493 */
5494 if (vap->va_mode & S_ISUID) {
5495 required_action |= KAUTH_VNODE_CHECKIMMUTABLE; /* always required */
5496 if (!is_suser) {
5497 if (VATTR_IS_ACTIVE(vap, va_uid)) {
5498 owner = vap->va_uid;
5499 } else if (VATTR_IS_SUPPORTED(&ova, va_uid)) {
5500 owner = ova.va_uid;
5501 } else {
5502 KAUTH_DEBUG("ATTR - ERROR: setuid but no uid available");
5503 error = EINVAL;
5504 goto out;
5505 }
5506 if (owner != kauth_cred_getuid(cred)) {
5507 /*
5508 * We could allow this if WRITE_SECURITY is permitted, perhaps.
5509 */
5510 KAUTH_DEBUG("ATTR - ERROR: illegal attempt to set the setuid bit");
5511 error = EPERM;
5512 goto out;
5513 }
5514 }
5515 }
5516 }
5517
5518 /*
5519 * Validate/mask flags changes. This checks that only the flags in
5520 * the UF_SETTABLE mask are being set, and preserves the flags in
5521 * the SF_SETTABLE case.
5522 *
5523 * Since flags changes may be made in conjunction with other changes,
5524 * we will ask the auth code to ignore immutability in the case that
5525 * the SF_* flags are not set and we are only manipulating the file flags.
5526 *
5527 */
5528 if (VATTR_IS_ACTIVE(vap, va_flags)) {
5529 /* compute changing flags bits */
5530 if (VATTR_IS_SUPPORTED(&ova, va_flags)) {
5531 fdelta = vap->va_flags ^ ova.va_flags;
5532 } else {
5533 fdelta = vap->va_flags;
5534 }
5535
5536 if (fdelta != 0) {
5537 KAUTH_DEBUG("ATTR - flags changing, requiring WRITE_SECURITY");
5538 required_action |= KAUTH_VNODE_WRITE_SECURITY;
5539
5540 /* check that changing bits are legal */
5541 if (is_suser) {
5542 /*
5543 * The immutability check will prevent us from clearing the SF_*
5544 * flags unless the system securelevel permits it, so just check
5545 * for legal flags here.
5546 */
5547 if (fdelta & ~(UF_SETTABLE | SF_SETTABLE)) {
5548 error = EPERM;
5549 KAUTH_DEBUG(" DENIED - superuser attempt to set illegal flag(s)");
5550 goto out;
5551 }
5552 } else {
5553 if (fdelta & ~UF_SETTABLE) {
5554 error = EPERM;
5555 KAUTH_DEBUG(" DENIED - user attempt to set illegal flag(s)");
5556 goto out;
5557 }
5558 }
5559 /*
5560 * If the caller has the ability to manipulate file flags,
5561 * security is not reduced by ignoring them for this operation.
5562 *
5563 * A more complete test here would consider the 'after' states of the flags
5564 * to determine whether it would permit the operation, but this becomes
5565 * very complex.
5566 *
5567 * Ignoring immutability is conditional on securelevel; this does not bypass
5568 * the SF_* flags if securelevel > 0.
5569 */
5570 required_action |= KAUTH_VNODE_NOIMMUTABLE;
5571 }
5572 }
5573
5574 /*
5575 * Validate ownership information.
5576 */
5577 chowner = 0;
5578 chgroup = 0;
5579
5580 /*
5581 * uid changing
5582 * Note that if the filesystem didn't give us a UID, we expect that it doesn't
5583 * support them in general, and will ignore it if/when we try to set it.
5584 * We might want to clear the uid out of vap completely here.
5585 */
5586 if (VATTR_IS_ACTIVE(vap, va_uid) && VATTR_IS_SUPPORTED(&ova, va_uid) && (vap->va_uid != ova.va_uid)) {
5587 if (!is_suser && (kauth_cred_getuid(cred) != vap->va_uid)) {
5588 KAUTH_DEBUG(" DENIED - non-superuser cannot change ownershipt to a third party");
5589 error = EPERM;
5590 goto out;
5591 }
5592 chowner = 1;
5593 }
5594
5595 /*
5596 * gid changing
5597 * Note that if the filesystem didn't give us a GID, we expect that it doesn't
5598 * support them in general, and will ignore it if/when we try to set it.
5599 * We might want to clear the gid out of vap completely here.
5600 */
5601 if (VATTR_IS_ACTIVE(vap, va_gid) && VATTR_IS_SUPPORTED(&ova, va_gid) && (vap->va_gid != ova.va_gid)) {
5602 if (!is_suser) {
5603 if ((error = kauth_cred_ismember_gid(cred, vap->va_gid, &ismember)) != 0) {
5604 KAUTH_DEBUG(" ERROR - got %d checking for membership in %d", error, vap->va_gid);
5605 goto out;
5606 }
5607 if (!ismember) {
5608 KAUTH_DEBUG(" DENIED - group change from %d to %d but not a member of target group",
5609 ova.va_gid, vap->va_gid);
5610 error = EPERM;
5611 goto out;
5612 }
5613 }
5614 chgroup = 1;
5615 }
5616
5617 /*
5618 * Owner UUID being set or changed.
5619 */
5620 if (VATTR_IS_ACTIVE(vap, va_uuuid)) {
5621 /* if the owner UUID is not actually changing ... */
5622 if (VATTR_IS_SUPPORTED(&ova, va_uuuid) && kauth_guid_equal(&vap->va_uuuid, &ova.va_uuuid))
5623 goto no_uuuid_change;
5624
5625 /*
5626 * The owner UUID cannot be set by a non-superuser to anything other than
5627 * their own.
5628 */
5629 if (!is_suser) {
5630 if ((error = kauth_cred_getguid(cred, &changer)) != 0) {
5631 KAUTH_DEBUG(" ERROR - got %d trying to get caller UUID", error);
5632 /* XXX ENOENT here - no UUID - should perhaps become EPERM */
5633 goto out;
5634 }
5635 if (!kauth_guid_equal(&vap->va_uuuid, &changer)) {
5636 KAUTH_DEBUG(" ERROR - cannot set supplied owner UUID - not us");
5637 error = EPERM;
5638 goto out;
5639 }
5640 }
5641 chowner = 1;
5642 }
5643 no_uuuid_change:
5644 /*
5645 * Group UUID being set or changed.
5646 */
5647 if (VATTR_IS_ACTIVE(vap, va_guuid)) {
5648 /* if the group UUID is not actually changing ... */
5649 if (VATTR_IS_SUPPORTED(&ova, va_guuid) && kauth_guid_equal(&vap->va_guuid, &ova.va_guuid))
5650 goto no_guuid_change;
5651
5652 /*
5653 * The group UUID cannot be set by a non-superuser to anything other than
5654 * one of which they are a member.
5655 */
5656 if (!is_suser) {
5657 if ((error = kauth_cred_ismember_guid(cred, &vap->va_guuid, &ismember)) != 0) {
5658 KAUTH_DEBUG(" ERROR - got %d trying to check group membership", error);
5659 goto out;
5660 }
5661 if (!ismember) {
5662 KAUTH_DEBUG(" ERROR - cannot create item with supplied group UUID - not a member");
5663 error = EPERM;
5664 goto out;
5665 }
5666 }
5667 chgroup = 1;
5668 }
5669 no_guuid_change:
5670
5671 /*
5672 * Compute authorisation for group/ownership changes.
5673 */
5674 if (chowner || chgroup) {
5675 if (is_suser) {
5676 KAUTH_DEBUG("ATTR - superuser changing file owner/group, requiring immutability check");
5677 required_action |= KAUTH_VNODE_CHECKIMMUTABLE;
5678 } else {
5679 if (chowner) {
5680 KAUTH_DEBUG("ATTR - ownership change, requiring TAKE_OWNERSHIP");
5681 required_action |= KAUTH_VNODE_TAKE_OWNERSHIP;
5682 }
5683 if (chgroup && !chowner) {
5684 KAUTH_DEBUG("ATTR - group change, requiring WRITE_SECURITY");
5685 required_action |= KAUTH_VNODE_WRITE_SECURITY;
5686 }
5687
5688 /* clear set-uid and set-gid bits as required by Posix */
5689 if (VATTR_IS_ACTIVE(vap, va_mode)) {
5690 newmode = vap->va_mode;
5691 } else if (VATTR_IS_SUPPORTED(&ova, va_mode)) {
5692 newmode = ova.va_mode;
5693 } else {
5694 KAUTH_DEBUG("CHOWN - trying to change owner but cannot get mode from filesystem to mask setugid bits");
5695 newmode = 0;
5696 }
5697 if (newmode & (S_ISUID | S_ISGID)) {
5698 VATTR_SET(vap, va_mode, newmode & ~(S_ISUID | S_ISGID));
5699 KAUTH_DEBUG("CHOWN - masking setugid bits from mode %o to %o", newmode, vap->va_mode);
5700 }
5701 }
5702 }
5703
5704 /*
5705 * Authorise changes in the ACL.
5706 */
5707 if (VATTR_IS_ACTIVE(vap, va_acl)) {
5708
5709 /* no existing ACL */
5710 if (!VATTR_IS_ACTIVE(&ova, va_acl) || (ova.va_acl == NULL)) {
5711
5712 /* adding an ACL */
5713 if (vap->va_acl != NULL) {
5714 required_action |= KAUTH_VNODE_WRITE_SECURITY;
5715 KAUTH_DEBUG("CHMOD - adding ACL");
5716 }
5717
5718 /* removing an existing ACL */
5719 } else if (vap->va_acl == NULL) {
5720 required_action |= KAUTH_VNODE_WRITE_SECURITY;
5721 KAUTH_DEBUG("CHMOD - removing ACL");
5722
5723 /* updating an existing ACL */
5724 } else {
5725 if (vap->va_acl->acl_entrycount != ova.va_acl->acl_entrycount) {
5726 /* entry count changed, must be different */
5727 required_action |= KAUTH_VNODE_WRITE_SECURITY;
5728 KAUTH_DEBUG("CHMOD - adding/removing ACL entries");
5729 } else if (vap->va_acl->acl_entrycount > 0) {
5730 /* both ACLs have the same ACE count, said count is 1 or more, bitwise compare ACLs */
5731 				if (memcmp(&vap->va_acl->acl_ace[0], &ova.va_acl->acl_ace[0],
5732 sizeof(struct kauth_ace) * vap->va_acl->acl_entrycount)) {
5733 required_action |= KAUTH_VNODE_WRITE_SECURITY;
5734 KAUTH_DEBUG("CHMOD - changing ACL entries");
5735 }
5736 }
5737 }
5738 }
5739
5740 /*
5741 * Other attributes that require authorisation.
5742 */
5743 if (VATTR_IS_ACTIVE(vap, va_encoding))
5744 required_action |= KAUTH_VNODE_WRITE_ATTRIBUTES;
5745
5746 out:
5747 if (VATTR_IS_SUPPORTED(&ova, va_acl) && (ova.va_acl != NULL))
5748 kauth_acl_free(ova.va_acl);
5749 if (error == 0)
5750 *actionp = required_action;
5751 return(error);
5752 }
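/*
 * Illustrative sketch (not compiled): the canonical setattr pattern built
 * on this routine - compute the required rights, authorize them, then
 * apply the change. 'set_attrs' is an assumed name, not part of this
 * file.
 */
#if 0
static int
set_attrs(vnode_t vp, struct vnode_attr *vap, vfs_context_t ctx)
{
	kauth_action_t action;
	int error;

	if ((error = vnode_authattr(vp, vap, &action, ctx)) != 0)
		return(error);
	if ((action != 0) &&
	    ((error = vnode_authorize(vp, NULL, action, ctx)) != 0))
		return(error);
	return(vnode_setattr(vp, vap, ctx));
}
#endif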
5753
5754
5755 void
5756 vfs_setlocklocal(mount_t mp)
5757 {
5758 vnode_t vp;
5759
5760 mount_lock(mp);
5761 mp->mnt_kern_flag |= MNTK_LOCK_LOCAL;
5762
5763 /*
5764 * We do not expect anyone to be using any vnodes at the
5765 	 * time this routine is called, so no vnode locking is needed.
5766 */
5767 TAILQ_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) {
5768 vp->v_flag |= VLOCKLOCAL;
5769 }
5770 TAILQ_FOREACH(vp, &mp->mnt_workerqueue, v_mntvnodes) {
5771 vp->v_flag |= VLOCKLOCAL;
5772 }
5773 TAILQ_FOREACH(vp, &mp->mnt_newvnodes, v_mntvnodes) {
5774 vp->v_flag |= VLOCKLOCAL;
5775 }
5776 mount_unlock(mp);
5777 }
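/*
 * Usage note (illustrative): a filesystem typically calls this once from
 * its mount entry point to request that the VFS advisory locking code
 * handle byte-range locks for its vnodes, which is why any vnodes already
 * on the three mount queues are swept above.
 */
#if 0
	vfs_setlocklocal(mp);	/* e.g. from a filesystem's VFS_MOUNT path */
#endif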
5778
5779
5780 #ifdef JOE_DEBUG
5781
5782 void record_vp(vnode_t vp, int count) {
5783 struct uthread *ut;
5784 int i;
5785
5786 if ((vp->v_flag & VSYSTEM))
5787 return;
5788
5789 ut = get_bsdthread_info(current_thread());
5790 ut->uu_iocount += count;
5791
5792 if (ut->uu_vpindex < 32) {
5793 for (i = 0; i < ut->uu_vpindex; i++) {
5794 if (ut->uu_vps[i] == vp)
5795 return;
5796 }
5797 ut->uu_vps[ut->uu_vpindex] = vp;
5798 ut->uu_vpindex++;
5799 }
5800 }
5801 #endif