]> git.saurik.com Git - apple/xnu.git/blame - bsd/vfs/vfs_subr.c
xnu-792.18.15.tar.gz
[apple/xnu.git] / bsd / vfs / vfs_subr.c
CommitLineData
1c79356b 1/*
5d5c5d0d
A
2 * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved.
3 *
8f6c56a5 4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
1c79356b 5 *
8f6c56a5
A
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
8ad349bb 24 * limitations under the License.
8f6c56a5
A
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
1c79356b
A
27 */
28/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29/*
30 * Copyright (c) 1989, 1993
31 * The Regents of the University of California. All rights reserved.
32 * (c) UNIX System Laboratories, Inc.
33 * All or some portions of this file are derived from material licensed
34 * to the University of California by American Telephone and Telegraph
35 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
36 * the permission of UNIX System Laboratories, Inc.
37 *
38 * Redistribution and use in source and binary forms, with or without
39 * modification, are permitted provided that the following conditions
40 * are met:
41 * 1. Redistributions of source code must retain the above copyright
42 * notice, this list of conditions and the following disclaimer.
43 * 2. Redistributions in binary form must reproduce the above copyright
44 * notice, this list of conditions and the following disclaimer in the
45 * documentation and/or other materials provided with the distribution.
46 * 3. All advertising materials mentioning features or use of this software
47 * must display the following acknowledgement:
48 * This product includes software developed by the University of
49 * California, Berkeley and its contributors.
50 * 4. Neither the name of the University nor the names of its contributors
51 * may be used to endorse or promote products derived from this software
52 * without specific prior written permission.
53 *
54 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
55 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
56 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
57 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
58 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
59 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
60 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
61 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
62 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
63 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
64 * SUCH DAMAGE.
65 *
66 * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95
67 */
68
69/*
70 * External virtual filesystem routines
71 */
72
9bccf70c 73#undef DIAGNOSTIC
1c79356b
A
74#define DIAGNOSTIC 1
75
76#include <sys/param.h>
77#include <sys/systm.h>
91447636
A
78#include <sys/proc_internal.h>
79#include <sys/kauth.h>
80#include <sys/mount_internal.h>
1c79356b 81#include <sys/time.h>
91447636
A
82#include <sys/lock.h>
83#include <sys/vnode_internal.h>
1c79356b
A
84#include <sys/stat.h>
85#include <sys/namei.h>
86#include <sys/ucred.h>
91447636 87#include <sys/buf_internal.h>
1c79356b
A
88#include <sys/errno.h>
89#include <sys/malloc.h>
90#include <sys/domain.h>
91#include <sys/mbuf.h>
92#include <sys/syslog.h>
91447636 93#include <sys/ubc_internal.h>
1c79356b
A
94#include <sys/vm.h>
95#include <sys/sysctl.h>
55e303ae
A
96#include <sys/filedesc.h>
97#include <sys/event.h>
91447636
A
98#include <sys/kdebug.h>
99#include <sys/kauth.h>
100#include <sys/user.h>
101#include <miscfs/fifofs/fifo.h>
55e303ae
A
102
103#include <string.h>
104#include <machine/spl.h>
105
1c79356b
A
106
107#include <kern/assert.h>
108
109#include <miscfs/specfs/specdev.h>
110
0b4e3aa0
A
111#include <mach/mach_types.h>
112#include <mach/memory_object_types.h>
113
91447636
A
114extern lck_grp_t *vnode_lck_grp;
115extern lck_attr_t *vnode_lck_attr;
116
117
118extern lck_mtx_t * mnt_list_mtx_lock;
0b4e3aa0 119
1c79356b
A
120enum vtype iftovt_tab[16] = {
121 VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
122 VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
123};
124int vttoif_tab[9] = {
125 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
126 S_IFSOCK, S_IFIFO, S_IFMT,
127};
128
91447636
A
129extern int ubc_isinuse_locked(vnode_t, int, int);
130extern kern_return_t adjust_vm_object_cache(vm_size_t oval, vm_size_t nval);
131
132static void vnode_list_add(vnode_t);
133static void vnode_list_remove(vnode_t);
134
135static errno_t vnode_drain(vnode_t);
136static void vgone(vnode_t);
137static void vclean(vnode_t vp, int flag, proc_t p);
138static void vnode_reclaim_internal(vnode_t, int, int);
139
140static void vnode_dropiocount (vnode_t, int);
141static errno_t vnode_getiocount(vnode_t vp, int locked, int vid, int vflags);
142static int vget_internal(vnode_t, int, int);
143
144static vnode_t checkalias(vnode_t vp, dev_t nvp_rdev);
145static int vnode_reload(vnode_t);
146static int vnode_isinuse_locked(vnode_t, int, int);
147
148static void insmntque(vnode_t vp, mount_t mp);
149mount_t mount_list_lookupby_fsid(fsid_t *, int, int);
150static int mount_getvfscnt(void);
151static int mount_fillfsids(fsid_t *, int );
152static void vnode_iterate_setup(mount_t);
153static int vnode_umount_preflight(mount_t, vnode_t, int);
154static int vnode_iterate_prepare(mount_t);
155static int vnode_iterate_reloadq(mount_t);
156static void vnode_iterate_clear(mount_t);
1c79356b 157
1c79356b
A
158TAILQ_HEAD(freelst, vnode) vnode_free_list; /* vnode free list */
159TAILQ_HEAD(inactivelst, vnode) vnode_inactive_list; /* vnode inactive list */
160struct mntlist mountlist; /* mounted filesystem list */
91447636 161static int nummounts = 0;
1c79356b
A
162
163#if DIAGNOSTIC
164#define VLISTCHECK(fun, vp, list) \
165 if ((vp)->v_freelist.tqe_prev == (struct vnode **)0xdeadb) \
166 panic("%s: %s vnode not on %slist", (fun), (list), (list));
167
168#define VINACTIVECHECK(fun, vp, expected) \
169 do { \
170 int __is_inactive = ISSET((vp)->v_flag, VUINACTIVE); \
171 if (__is_inactive ^ expected) \
172 panic("%s: %sinactive vnode, expected %s", (fun), \
173 __is_inactive? "" : "not ", \
174 expected? "inactive": "not inactive"); \
175 } while(0)
176#else
177#define VLISTCHECK(fun, vp, list)
178#define VINACTIVECHECK(fun, vp, expected)
179#endif /* DIAGNOSTIC */
180
181#define VLISTNONE(vp) \
182 do { \
183 (vp)->v_freelist.tqe_next = (struct vnode *)0; \
184 (vp)->v_freelist.tqe_prev = (struct vnode **)0xdeadb; \
185 } while(0)
186
187#define VONLIST(vp) \
188 ((vp)->v_freelist.tqe_prev != (struct vnode **)0xdeadb)
189
190/* remove a vnode from free vnode list */
191#define VREMFREE(fun, vp) \
192 do { \
193 VLISTCHECK((fun), (vp), "free"); \
194 TAILQ_REMOVE(&vnode_free_list, (vp), v_freelist); \
195 VLISTNONE((vp)); \
196 freevnodes--; \
197 } while(0)
198
199/* remove a vnode from inactive vnode list */
200#define VREMINACTIVE(fun, vp) \
201 do { \
202 VLISTCHECK((fun), (vp), "inactive"); \
203 VINACTIVECHECK((fun), (vp), VUINACTIVE); \
204 TAILQ_REMOVE(&vnode_inactive_list, (vp), v_freelist); \
205 CLR((vp)->v_flag, VUINACTIVE); \
206 VLISTNONE((vp)); \
207 inactivevnodes--; \
208 } while(0)
209
1c79356b
A
210/*
211 * Have to declare first two locks as actual data even if !MACH_SLOCKS, since
212 * a pointers to them get passed around.
213 */
91447636
A
214void * mntvnode_slock;
215void * mntid_slock;
216void * spechash_slock;
1c79356b
A
217
218/*
219 * vnodetarget is the amount of vnodes we expect to get back
220 * from the the inactive vnode list and VM object cache.
221 * As vnreclaim() is a mainly cpu bound operation for faster
222 * processers this number could be higher.
223 * Having this number too high introduces longer delays in
91447636 224 * the execution of new_vnode().
1c79356b
A
225 */
226unsigned long vnodetarget; /* target for vnreclaim() */
227#define VNODE_FREE_TARGET 20 /* Default value for vnodetarget */
228
229/*
230 * We need quite a few vnodes on the free list to sustain the
231 * rapid stat() the compilation process does, and still benefit from the name
232 * cache. Having too few vnodes on the free list causes serious disk
233 * thrashing as we cycle through them.
234 */
0b4e3aa0 235#define VNODE_FREE_MIN 300 /* freelist should have at least these many */
1c79356b
A
236
237/*
238 * We need to get vnodes back from the VM object cache when a certain #
239 * of vnodes are reused from the freelist. This is essential for the
240 * caching to be effective in the namecache and the buffer cache [for the
241 * metadata].
242 */
243#define VNODE_TOOMANY_REUSED (VNODE_FREE_MIN/4)
244
245/*
246 * If we have enough vnodes on the freelist we do not want to reclaim
247 * the vnodes from the VM object cache.
248 */
249#define VNODE_FREE_ENOUGH (VNODE_FREE_MIN + (VNODE_FREE_MIN/2))
250
251/*
252 * Initialize the vnode management data structures.
253 */
0b4e3aa0 254__private_extern__ void
91447636 255vntblinit(void)
1c79356b 256{
1c79356b 257 TAILQ_INIT(&vnode_free_list);
1c79356b 258 TAILQ_INIT(&vnode_inactive_list);
91447636 259 TAILQ_INIT(&mountlist);
1c79356b
A
260
261 if (!vnodetarget)
262 vnodetarget = VNODE_FREE_TARGET;
263
264 /*
265 * Scale the vm_object_cache to accomodate the vnodes
266 * we want to cache
267 */
268 (void) adjust_vm_object_cache(0, desiredvnodes - VNODE_FREE_MIN);
269}
270
271/* Reset the VM Object Cache with the values passed in */
0b4e3aa0 272__private_extern__ kern_return_t
1c79356b
A
273reset_vmobjectcache(unsigned int val1, unsigned int val2)
274{
275 vm_size_t oval = val1 - VNODE_FREE_MIN;
9bccf70c
A
276 vm_size_t nval;
277
278 if(val2 < VNODE_FREE_MIN)
279 nval = 0;
280 else
281 nval = val2 - VNODE_FREE_MIN;
1c79356b
A
282
283 return(adjust_vm_object_cache(oval, nval));
284}
285
91447636
A
286
287/* the timeout is in 10 msecs */
1c79356b 288int
91447636
A
289vnode_waitforwrites(vnode_t vp, int output_target, int slpflag, int slptimeout, char *msg) {
290 int error = 0;
291 struct timespec ts;
1c79356b 292
91447636
A
293 KERNEL_DEBUG(0x3010280 | DBG_FUNC_START, (int)vp, output_target, vp->v_numoutput, 0, 0);
294
295 if (vp->v_numoutput > output_target) {
296
297 slpflag &= ~PDROP;
298
299 vnode_lock(vp);
300
301 while ((vp->v_numoutput > output_target) && error == 0) {
302 if (output_target)
303 vp->v_flag |= VTHROTTLED;
304 else
305 vp->v_flag |= VBWAIT;
306 ts.tv_sec = (slptimeout/100);
307 ts.tv_nsec = (slptimeout % 1000) * 10 * NSEC_PER_USEC * 1000 ;
308 error = msleep((caddr_t)&vp->v_numoutput, &vp->v_lock, (slpflag | (PRIBIO + 1)), msg, &ts);
309 }
310 vnode_unlock(vp);
1c79356b 311 }
91447636
A
312 KERNEL_DEBUG(0x3010280 | DBG_FUNC_END, (int)vp, output_target, vp->v_numoutput, error, 0);
313
314 return error;
1c79356b
A
315}
316
91447636 317
1c79356b 318void
91447636
A
319vnode_startwrite(vnode_t vp) {
320
321 OSAddAtomic(1, &vp->v_numoutput);
322}
323
324
325void
326vnode_writedone(vnode_t vp)
1c79356b 327{
91447636
A
328 if (vp) {
329 int need_wakeup = 0;
330
331 OSAddAtomic(-1, &vp->v_numoutput);
332
333 vnode_lock(vp);
1c79356b 334
91447636
A
335 if (vp->v_numoutput < 0)
336 panic("vnode_writedone: numoutput < 0");
337
338 if ((vp->v_flag & VTHROTTLED) && (vp->v_numoutput < (VNODE_ASYNC_THROTTLE / 3))) {
339 vp->v_flag &= ~VTHROTTLED;
340 need_wakeup = 1;
341 }
342 if ((vp->v_flag & VBWAIT) && (vp->v_numoutput == 0)) {
343 vp->v_flag &= ~VBWAIT;
344 need_wakeup = 1;
345 }
346 vnode_unlock(vp);
347
348 if (need_wakeup)
349 wakeup((caddr_t)&vp->v_numoutput);
350 }
1c79356b
A
351}
352
91447636
A
353
354
1c79356b 355int
91447636 356vnode_hasdirtyblks(vnode_t vp)
1c79356b 357{
91447636 358 struct cl_writebehind *wbp;
1c79356b 359
91447636
A
360 /*
361 * Not taking the buf_mtxp as there is little
362 * point doing it. Even if the lock is taken the
363 * state can change right after that. If their
364 * needs to be a synchronization, it must be driven
365 * by the caller
366 */
367 if (vp->v_dirtyblkhd.lh_first)
368 return (1);
369
370 if (!UBCINFOEXISTS(vp))
371 return (0);
0b4e3aa0 372
91447636
A
373 wbp = vp->v_ubcinfo->cl_wbehind;
374
375 if (wbp && (wbp->cl_number || wbp->cl_scmap))
376 return (1);
0b4e3aa0 377
1c79356b
A
378 return (0);
379}
380
1c79356b 381int
91447636 382vnode_hascleanblks(vnode_t vp)
1c79356b 383{
91447636
A
384 /*
385 * Not taking the buf_mtxp as there is little
386 * point doing it. Even if the lock is taken the
387 * state can change right after that. If their
388 * needs to be a synchronization, it must be driven
389 * by the caller
390 */
391 if (vp->v_cleanblkhd.lh_first)
392 return (1);
393 return (0);
394}
1c79356b 395
91447636
A
396void
397vnode_iterate_setup(mount_t mp)
398{
399 while (mp->mnt_lflag & MNT_LITER) {
400 mp->mnt_lflag |= MNT_LITERWAIT;
401 msleep((caddr_t)mp, &mp->mnt_mlock, PVFS, "vnode_iterate_setup", 0);
1c79356b 402 }
91447636
A
403
404 mp->mnt_lflag |= MNT_LITER;
405
1c79356b
A
406}
407
91447636
A
408static int
409vnode_umount_preflight(mount_t mp, vnode_t skipvp, int flags)
1c79356b 410{
91447636 411 vnode_t vp;
1c79356b 412
91447636
A
413 TAILQ_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) {
414 if (vp->v_type == VDIR)
415 continue;
416 if (vp == skipvp)
417 continue;
418 if ((flags & SKIPSYSTEM) && ((vp->v_flag & VSYSTEM) ||
419 (vp->v_flag & VNOFLUSH)))
420 continue;
421 if ((flags & SKIPSWAP) && (vp->v_flag & VSWAP))
422 continue;
423 if ((flags & WRITECLOSE) &&
424 (vp->v_writecount == 0 || vp->v_type != VREG))
425 continue;
426 /* Look for busy vnode */
427 if (((vp->v_usecount != 0) &&
428 ((vp->v_usecount - vp->v_kusecount) != 0)))
429 return(1);
1c79356b 430 }
91447636
A
431
432 return(0);
1c79356b
A
433}
434
91447636
A
435/*
436 * This routine prepares iteration by moving all the vnodes to worker queue
437 * called with mount lock held
1c79356b 438 */
91447636
A
439int
440vnode_iterate_prepare(mount_t mp)
1c79356b 441{
91447636 442 vnode_t vp;
1c79356b 443
91447636
A
444 if (TAILQ_EMPTY(&mp->mnt_vnodelist)) {
445 /* nothing to do */
446 return (0);
447 }
1c79356b 448
91447636
A
449 vp = TAILQ_FIRST(&mp->mnt_vnodelist);
450 vp->v_mntvnodes.tqe_prev = &(mp->mnt_workerqueue.tqh_first);
451 mp->mnt_workerqueue.tqh_first = mp->mnt_vnodelist.tqh_first;
452 mp->mnt_workerqueue.tqh_last = mp->mnt_vnodelist.tqh_last;
453
454 TAILQ_INIT(&mp->mnt_vnodelist);
455 if (mp->mnt_newvnodes.tqh_first != NULL)
456 panic("vnode_iterate_prepare: newvnode when entering vnode");
457 TAILQ_INIT(&mp->mnt_newvnodes);
458
459 return (1);
1c79356b
A
460}
461
91447636
A
462
463/* called with mount lock held */
464int
465vnode_iterate_reloadq(mount_t mp)
1c79356b 466{
91447636
A
467 int moved = 0;
468
469 /* add the remaining entries in workerq to the end of mount vnode list */
470 if (!TAILQ_EMPTY(&mp->mnt_workerqueue)) {
471 struct vnode * mvp;
472 mvp = TAILQ_LAST(&mp->mnt_vnodelist, vnodelst);
473
474 /* Joining the workerque entities to mount vnode list */
475 if (mvp)
476 mvp->v_mntvnodes.tqe_next = mp->mnt_workerqueue.tqh_first;
477 else
478 mp->mnt_vnodelist.tqh_first = mp->mnt_workerqueue.tqh_first;
479 mp->mnt_workerqueue.tqh_first->v_mntvnodes.tqe_prev = mp->mnt_vnodelist.tqh_last;
480 mp->mnt_vnodelist.tqh_last = mp->mnt_workerqueue.tqh_last;
481 TAILQ_INIT(&mp->mnt_workerqueue);
482 }
483
484 /* add the newvnodes to the head of mount vnode list */
485 if (!TAILQ_EMPTY(&mp->mnt_newvnodes)) {
486 struct vnode * nlvp;
487 nlvp = TAILQ_LAST(&mp->mnt_newvnodes, vnodelst);
488
489 mp->mnt_newvnodes.tqh_first->v_mntvnodes.tqe_prev = &mp->mnt_vnodelist.tqh_first;
490 nlvp->v_mntvnodes.tqe_next = mp->mnt_vnodelist.tqh_first;
491 if(mp->mnt_vnodelist.tqh_first)
492 mp->mnt_vnodelist.tqh_first->v_mntvnodes.tqe_prev = &nlvp->v_mntvnodes.tqe_next;
493 else
494 mp->mnt_vnodelist.tqh_last = mp->mnt_newvnodes.tqh_last;
495 mp->mnt_vnodelist.tqh_first = mp->mnt_newvnodes.tqh_first;
496 TAILQ_INIT(&mp->mnt_newvnodes);
497 moved = 1;
498 }
1c79356b 499
91447636 500 return(moved);
1c79356b
A
501}
502
1c79356b 503
91447636
A
504void
505vnode_iterate_clear(mount_t mp)
506{
507 mp->mnt_lflag &= ~MNT_LITER;
508 if (mp->mnt_lflag & MNT_LITERWAIT) {
509 mp->mnt_lflag &= ~MNT_LITERWAIT;
510 wakeup(mp);
511 }
512}
1c79356b 513
1c79356b 514
1c79356b 515int
91447636
A
516vnode_iterate(mp, flags, callout, arg)
517 mount_t mp;
518 int flags;
519 int (*callout)(struct vnode *, void *);
520 void * arg;
1c79356b 521{
1c79356b 522 struct vnode *vp;
91447636
A
523 int vid, retval;
524 int ret = 0;
1c79356b 525
91447636 526 mount_lock(mp);
1c79356b 527
91447636 528 vnode_iterate_setup(mp);
1c79356b 529
91447636
A
530 /* it is returns 0 then there is nothing to do */
531 retval = vnode_iterate_prepare(mp);
1c79356b 532
91447636
A
533 if (retval == 0) {
534 vnode_iterate_clear(mp);
535 mount_unlock(mp);
536 return(ret);
1c79356b 537 }
91447636
A
538
539 /* iterate over all the vnodes */
540 while (!TAILQ_EMPTY(&mp->mnt_workerqueue)) {
541 vp = TAILQ_FIRST(&mp->mnt_workerqueue);
542 TAILQ_REMOVE(&mp->mnt_workerqueue, vp, v_mntvnodes);
543 TAILQ_INSERT_TAIL(&mp->mnt_vnodelist, vp, v_mntvnodes);
544 vid = vp->v_id;
545 if ((vp->v_data == NULL) || (vp->v_type == VNON) || (vp->v_mount != mp)) {
546 continue;
547 }
548 mount_unlock(mp);
1c79356b 549
91447636
A
550 if ( vget_internal(vp, vid, (flags | VNODE_NODEAD| VNODE_WITHID | VNODE_NOSUSPEND))) {
551 mount_lock(mp);
552 continue;
553 }
554 if (flags & VNODE_RELOAD) {
555 /*
556 * we're reloading the filesystem
557 * cast out any inactive vnodes...
558 */
559 if (vnode_reload(vp)) {
560 /* vnode will be recycled on the refcount drop */
561 vnode_put(vp);
562 mount_lock(mp);
563 continue;
564 }
565 }
55e303ae 566
91447636
A
567 retval = callout(vp, arg);
568
569 switch (retval) {
570 case VNODE_RETURNED:
571 case VNODE_RETURNED_DONE:
572 vnode_put(vp);
573 if (retval == VNODE_RETURNED_DONE) {
574 mount_lock(mp);
575 ret = 0;
576 goto out;
577 }
578 break;
579
580 case VNODE_CLAIMED_DONE:
581 mount_lock(mp);
582 ret = 0;
583 goto out;
584 case VNODE_CLAIMED:
585 default:
586 break;
587 }
588 mount_lock(mp);
55e303ae 589 }
1c79356b 590
91447636
A
591out:
592 (void)vnode_iterate_reloadq(mp);
593 vnode_iterate_clear(mp);
594 mount_unlock(mp);
595 return (ret);
596}
55e303ae 597
91447636
A
598void
599mount_lock_renames(mount_t mp)
600{
601 lck_mtx_lock(&mp->mnt_renamelock);
1c79356b
A
602}
603
1c79356b 604void
91447636 605mount_unlock_renames(mount_t mp)
1c79356b 606{
91447636
A
607 lck_mtx_unlock(&mp->mnt_renamelock);
608}
1c79356b 609
91447636
A
610void
611mount_lock(mount_t mp)
612{
613 lck_mtx_lock(&mp->mnt_mlock);
1c79356b
A
614}
615
91447636
A
616void
617mount_unlock(mount_t mp)
fa4905b1 618{
91447636 619 lck_mtx_unlock(&mp->mnt_mlock);
fa4905b1
A
620}
621
91447636 622
1c79356b 623void
91447636 624mount_ref(mount_t mp, int locked)
1c79356b 625{
91447636
A
626 if ( !locked)
627 mount_lock(mp);
628
629 mp->mnt_count++;
630
631 if ( !locked)
632 mount_unlock(mp);
1c79356b
A
633}
634
91447636
A
635
636void
637mount_drop(mount_t mp, int locked)
638{
639 if ( !locked)
640 mount_lock(mp);
641
642 mp->mnt_count--;
643
644 if (mp->mnt_count == 0 && (mp->mnt_lflag & MNT_LDRAIN))
645 wakeup(&mp->mnt_lflag);
646
647 if ( !locked)
648 mount_unlock(mp);
649}
650
651
1c79356b 652int
91447636 653mount_iterref(mount_t mp, int locked)
1c79356b 654{
91447636 655 int retval = 0;
1c79356b 656
91447636
A
657 if (!locked)
658 mount_list_lock();
659 if (mp->mnt_iterref < 0) {
660 retval = 1;
661 } else {
662 mp->mnt_iterref++;
1c79356b 663 }
91447636
A
664 if (!locked)
665 mount_list_unlock();
666 return(retval);
667}
1c79356b 668
91447636
A
669int
670mount_isdrained(mount_t mp, int locked)
671{
672 int retval;
1c79356b 673
91447636
A
674 if (!locked)
675 mount_list_lock();
676 if (mp->mnt_iterref < 0)
677 retval = 1;
678 else
679 retval = 0;
680 if (!locked)
681 mount_list_unlock();
682 return(retval);
683}
684
685void
686mount_iterdrop(mount_t mp)
687{
688 mount_list_lock();
689 mp->mnt_iterref--;
690 wakeup(&mp->mnt_iterref);
691 mount_list_unlock();
692}
693
694void
695mount_iterdrain(mount_t mp)
696{
697 mount_list_lock();
698 while (mp->mnt_iterref)
699 msleep((caddr_t)&mp->mnt_iterref, mnt_list_mtx_lock, PVFS, "mount_iterdrain", 0 );
700 /* mount iterations drained */
701 mp->mnt_iterref = -1;
702 mount_list_unlock();
703}
704void
705mount_iterreset(mount_t mp)
706{
707 mount_list_lock();
708 if (mp->mnt_iterref == -1)
709 mp->mnt_iterref = 0;
710 mount_list_unlock();
711}
712
713/* always called with mount lock held */
714int
715mount_refdrain(mount_t mp)
716{
717 if (mp->mnt_lflag & MNT_LDRAIN)
718 panic("already in drain");
719 mp->mnt_lflag |= MNT_LDRAIN;
720
721 while (mp->mnt_count)
722 msleep((caddr_t)&mp->mnt_lflag, &mp->mnt_mlock, PVFS, "mount_drain", 0 );
723
724 if (mp->mnt_vnodelist.tqh_first != NULL)
725 panic("mount_refdrain: dangling vnode");
726
727 mp->mnt_lflag &= ~MNT_LDRAIN;
728
729 return(0);
730}
731
732
733/*
734 * Mark a mount point as busy. Used to synchronize access and to delay
735 * unmounting.
736 */
737int
738vfs_busy(mount_t mp, int flags)
739{
740
741restart:
742 if (mp->mnt_lflag & MNT_LDEAD)
743 return(ENOENT);
744
745 if (mp->mnt_lflag & MNT_LUNMOUNT) {
746 if (flags & LK_NOWAIT)
747 return (ENOENT);
748
749 mount_lock(mp);
750
751 if (mp->mnt_lflag & MNT_LDEAD) {
752 mount_unlock(mp);
753 return(ENOENT);
754 }
755 if (mp->mnt_lflag & MNT_LUNMOUNT) {
756 mp->mnt_lflag |= MNT_LWAIT;
1c79356b 757 /*
91447636
A
758 * Since all busy locks are shared except the exclusive
759 * lock granted when unmounting, the only place that a
760 * wakeup needs to be done is at the release of the
761 * exclusive lock at the end of dounmount.
1c79356b 762 */
91447636
A
763 msleep((caddr_t)mp, &mp->mnt_mlock, (PVFS | PDROP), "vfsbusy", 0 );
764 return (ENOENT);
1c79356b 765 }
91447636
A
766 mount_unlock(mp);
767 }
768
769 lck_rw_lock_shared(&mp->mnt_rwlock);
770
771 /*
772 * until we are granted the rwlock, it's possible for the mount point to
773 * change state, so reevaluate before granting the vfs_busy
774 */
775 if (mp->mnt_lflag & (MNT_LDEAD | MNT_LUNMOUNT)) {
776 lck_rw_done(&mp->mnt_rwlock);
777 goto restart;
1c79356b 778 }
1c79356b
A
779 return (0);
780}
781
91447636
A
782/*
783 * Free a busy filesystem.
784 */
785
786void
787vfs_unbusy(mount_t mp)
788{
789 lck_rw_done(&mp->mnt_rwlock);
790}
791
792
793
794static void
795vfs_rootmountfailed(mount_t mp) {
796
797 mount_list_lock();
798 mp->mnt_vtable->vfc_refcount--;
799 mount_list_unlock();
800
801 vfs_unbusy(mp);
802
803 mount_lock_destroy(mp);
804
805 FREE_ZONE(mp, sizeof(struct mount), M_MOUNT);
806}
807
808/*
809 * Lookup a filesystem type, and if found allocate and initialize
810 * a mount structure for it.
811 *
812 * Devname is usually updated by mount(8) after booting.
813 */
814static mount_t
815vfs_rootmountalloc_internal(struct vfstable *vfsp, const char *devname)
816{
817 mount_t mp;
818
819 mp = _MALLOC_ZONE((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK);
820 bzero((char *)mp, (u_long)sizeof(struct mount));
821
822 /* Initialize the default IO constraints */
823 mp->mnt_maxreadcnt = mp->mnt_maxwritecnt = MAXPHYS;
824 mp->mnt_segreadcnt = mp->mnt_segwritecnt = 32;
825 mp->mnt_maxsegreadsize = mp->mnt_maxreadcnt;
826 mp->mnt_maxsegwritesize = mp->mnt_maxwritecnt;
827 mp->mnt_devblocksize = DEV_BSIZE;
828
829 mount_lock_init(mp);
830 (void)vfs_busy(mp, LK_NOWAIT);
831
832 TAILQ_INIT(&mp->mnt_vnodelist);
833 TAILQ_INIT(&mp->mnt_workerqueue);
834 TAILQ_INIT(&mp->mnt_newvnodes);
835
836 mp->mnt_vtable = vfsp;
837 mp->mnt_op = vfsp->vfc_vfsops;
838 mp->mnt_flag = MNT_RDONLY | MNT_ROOTFS;
839 mp->mnt_vnodecovered = NULLVP;
840 //mp->mnt_stat.f_type = vfsp->vfc_typenum;
841 mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
842
843 mount_list_lock();
844 vfsp->vfc_refcount++;
845 mount_list_unlock();
846
847 strncpy(mp->mnt_vfsstat.f_fstypename, vfsp->vfc_name, MFSTYPENAMELEN);
848 mp->mnt_vfsstat.f_mntonname[0] = '/';
849 (void) copystr((char *)devname, mp->mnt_vfsstat.f_mntfromname, MAXPATHLEN - 1, 0);
850
851 return (mp);
852}
853
854errno_t
855vfs_rootmountalloc(const char *fstypename, const char *devname, mount_t *mpp)
856{
857 struct vfstable *vfsp;
858
859 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
860 if (!strcmp(vfsp->vfc_name, fstypename))
861 break;
862 if (vfsp == NULL)
863 return (ENODEV);
864
865 *mpp = vfs_rootmountalloc_internal(vfsp, devname);
866
867 if (*mpp)
868 return (0);
869
870 return (ENOMEM);
871}
872
873
874/*
875 * Find an appropriate filesystem to use for the root. If a filesystem
876 * has not been preselected, walk through the list of known filesystems
877 * trying those that have mountroot routines, and try them until one
878 * works or we have tried them all.
879 */
880extern int (*mountroot)(void);
881
882int
883vfs_mountroot()
884{
885 struct vfstable *vfsp;
886 struct vfs_context context;
887 int error;
888 mount_t mp;
889
890 if (mountroot != NULL) {
891 /*
892 * used for netboot which follows a different set of rules
893 */
894 error = (*mountroot)();
895 return (error);
896 }
897 if ((error = bdevvp(rootdev, &rootvp))) {
898 printf("vfs_mountroot: can't setup bdevvp\n");
899 return (error);
900 }
901 context.vc_proc = current_proc();
902 context.vc_ucred = kauth_cred_get();
903
904 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
905 if (vfsp->vfc_mountroot == NULL)
906 continue;
907
908 mp = vfs_rootmountalloc_internal(vfsp, "root_device");
909 mp->mnt_devvp = rootvp;
910
911 if ((error = (*vfsp->vfc_mountroot)(mp, rootvp, &context)) == 0) {
912 mp->mnt_devvp->v_specflags |= SI_MOUNTEDON;
913
914 vfs_unbusy(mp);
915
916 mount_list_add(mp);
917
918 /*
919 * cache the IO attributes for the underlying physical media...
920 * an error return indicates the underlying driver doesn't
921 * support all the queries necessary... however, reasonable
922 * defaults will have been set, so no reason to bail or care
923 */
924 vfs_init_io_attributes(rootvp, mp);
925 /*
926 * get rid of iocount reference returned
927 * by bdevvp... it will have also taken
928 * a usecount reference which we want to keep
929 */
930 vnode_put(rootvp);
931
932 return (0);
933 }
934 vfs_rootmountfailed(mp);
935
936 if (error != EINVAL)
937 printf("%s_mountroot failed: %d\n", vfsp->vfc_name, error);
938 }
939 return (ENODEV);
940}
941
942/*
943 * Lookup a mount point by filesystem identifier.
944 */
945extern mount_t vfs_getvfs_locked(fsid_t *);
946
947struct mount *
948vfs_getvfs(fsid)
949 fsid_t *fsid;
950{
951 return (mount_list_lookupby_fsid(fsid, 0, 0));
952}
953
954struct mount *
955vfs_getvfs_locked(fsid)
956 fsid_t *fsid;
957{
958 return(mount_list_lookupby_fsid(fsid, 1, 0));
959}
960
961struct mount *
962vfs_getvfs_by_mntonname(u_char *path)
963{
964 mount_t retmp = (mount_t)0;
965 mount_t mp;
966
967 mount_list_lock();
968 TAILQ_FOREACH(mp, &mountlist, mnt_list) {
969 if (!strcmp(mp->mnt_vfsstat.f_mntonname, path)) {
970 retmp = mp;
971 goto out;
972 }
973 }
974out:
975 mount_list_unlock();
976 return (retmp);
977}
978
979/* generation number for creation of new fsids */
980u_short mntid_gen = 0;
981/*
982 * Get a new unique fsid
983 */
984void
985vfs_getnewfsid(mp)
986 struct mount *mp;
987{
988
989 fsid_t tfsid;
990 int mtype;
991 mount_t nmp;
992
993 mount_list_lock();
994
995 /* generate a new fsid */
996 mtype = mp->mnt_vtable->vfc_typenum;
997 if (++mntid_gen == 0)
998 mntid_gen++;
999 tfsid.val[0] = makedev(nblkdev + mtype, mntid_gen);
1000 tfsid.val[1] = mtype;
1001
1002 TAILQ_FOREACH(nmp, &mountlist, mnt_list) {
1003 while (vfs_getvfs_locked(&tfsid)) {
1004 if (++mntid_gen == 0)
1005 mntid_gen++;
1006 tfsid.val[0] = makedev(nblkdev + mtype, mntid_gen);
1007 }
1008 }
1009 mp->mnt_vfsstat.f_fsid.val[0] = tfsid.val[0];
1010 mp->mnt_vfsstat.f_fsid.val[1] = tfsid.val[1];
1011 mount_list_unlock();
1012}
1013
1014/*
1015 * Routines having to do with the management of the vnode table.
1016 */
1017extern int (**dead_vnodeop_p)(void *);
1018long numvnodes, freevnodes;
1019long inactivevnodes;
1020
1021
1022/*
1023 * Move a vnode from one mount queue to another.
1024 */
1025static void
1026insmntque(vnode_t vp, mount_t mp)
1027{
1028 mount_t lmp;
1029 /*
1030 * Delete from old mount point vnode list, if on one.
1031 */
3a60a9f5 1032 if ( (lmp = vp->v_mount) != NULL && lmp != dead_mountp) {
91447636
A
1033 if ((vp->v_lflag & VNAMED_MOUNT) == 0)
1034 panic("insmntque: vp not in mount vnode list");
1035 vp->v_lflag &= ~VNAMED_MOUNT;
1036
1037 mount_lock(lmp);
1038
1039 mount_drop(lmp, 1);
1040
1041 if (vp->v_mntvnodes.tqe_next == NULL) {
1042 if (TAILQ_LAST(&lmp->mnt_vnodelist, vnodelst) == vp)
1043 TAILQ_REMOVE(&lmp->mnt_vnodelist, vp, v_mntvnodes);
1044 else if (TAILQ_LAST(&lmp->mnt_newvnodes, vnodelst) == vp)
1045 TAILQ_REMOVE(&lmp->mnt_newvnodes, vp, v_mntvnodes);
1046 else if (TAILQ_LAST(&lmp->mnt_workerqueue, vnodelst) == vp)
1047 TAILQ_REMOVE(&lmp->mnt_workerqueue, vp, v_mntvnodes);
1048 } else {
1049 vp->v_mntvnodes.tqe_next->v_mntvnodes.tqe_prev = vp->v_mntvnodes.tqe_prev;
1050 *vp->v_mntvnodes.tqe_prev = vp->v_mntvnodes.tqe_next;
1051 }
1052 vp->v_mntvnodes.tqe_next = 0;
1053 vp->v_mntvnodes.tqe_prev = 0;
1054 mount_unlock(lmp);
1055 return;
1056 }
1057
1058 /*
1059 * Insert into list of vnodes for the new mount point, if available.
1060 */
1061 if ((vp->v_mount = mp) != NULL) {
1062 mount_lock(mp);
1063 if ((vp->v_mntvnodes.tqe_next != 0) && (vp->v_mntvnodes.tqe_prev != 0))
1064 panic("vp already in mount list");
1065 if (mp->mnt_lflag & MNT_LITER)
1066 TAILQ_INSERT_HEAD(&mp->mnt_newvnodes, vp, v_mntvnodes);
1067 else
1068 TAILQ_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes);
1069 if (vp->v_lflag & VNAMED_MOUNT)
1070 panic("insmntque: vp already in mount vnode list");
1071 if ((vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb))
1072 panic("insmntque: vp on the free list\n");
1073 vp->v_lflag |= VNAMED_MOUNT;
1074 mount_ref(mp, 1);
1075 mount_unlock(mp);
1076 }
1077}
1078
1079
1c79356b
A
1080/*
1081 * Create a vnode for a block device.
1082 * Used for root filesystem, argdev, and swap areas.
1083 * Also used for memory file system special devices.
1084 */
1085int
91447636 1086bdevvp(dev_t dev, vnode_t *vpp)
1c79356b 1087{
91447636
A
1088 vnode_t nvp;
1089 int error;
1090 struct vnode_fsparam vfsp;
1091 struct vfs_context context;
1c79356b
A
1092
1093 if (dev == NODEV) {
1094 *vpp = NULLVP;
1095 return (ENODEV);
1096 }
91447636
A
1097
1098 context.vc_proc = current_proc();
1099 context.vc_ucred = FSCRED;
1100
1101 vfsp.vnfs_mp = (struct mount *)0;
1102 vfsp.vnfs_vtype = VBLK;
1103 vfsp.vnfs_str = "bdevvp";
1104 vfsp.vnfs_dvp = 0;
1105 vfsp.vnfs_fsnode = 0;
1106 vfsp.vnfs_cnp = 0;
1107 vfsp.vnfs_vops = spec_vnodeop_p;
1108 vfsp.vnfs_rdev = dev;
1109 vfsp.vnfs_filesize = 0;
1110
1111 vfsp.vnfs_flags = VNFS_NOCACHE | VNFS_CANTCACHE;
1112
1113 vfsp.vnfs_marksystem = 0;
1114 vfsp.vnfs_markroot = 0;
1115
1116 if ( (error = vnode_create(VNCREATE_FLAVOR, VCREATESIZE, &vfsp, &nvp)) ) {
1c79356b
A
1117 *vpp = NULLVP;
1118 return (error);
1119 }
91447636
A
1120 if ( (error = vnode_ref(nvp)) ) {
1121 panic("bdevvp failed: vnode_ref");
1122 return (error);
1c79356b 1123 }
91447636
A
1124 if ( (error = VNOP_FSYNC(nvp, MNT_WAIT, &context)) ) {
1125 panic("bdevvp failed: fsync");
1126 return (error);
1127 }
1128 if ( (error = buf_invalidateblks(nvp, BUF_WRITE_DATA, 0, 0)) ) {
1129 panic("bdevvp failed: invalidateblks");
1130 return (error);
1131 }
1132 if ( (error = VNOP_OPEN(nvp, FREAD, &context)) ) {
1133 panic("bdevvp failed: open");
1134 return (error);
1135 }
1136 *vpp = nvp;
1137
1c79356b
A
1138 return (0);
1139}
1140
/*
 * Check to see if the new vnode represents a special device
 * for which we already have a vnode (either because of
 * bdevvp() or because of a different vnode representing
 * the same block device). If such an alias exists, deallocate
 * the existing contents and return the aliased vnode. The
 * caller is responsible for filling it with its new contents.
 *
 * Returns NULLVP when nvp was inserted into the spec hash (no usable
 * alias existed), or the existing aliased vnode (with an iocount and
 * the vnode lock held by vnode_getwithvid/vnode_lock released before
 * return? -- NOTE(review): on the non-NULL return path the vnode is
 * returned locked with an iocount; confirm against callers).
 */
static	vnode_t
checkalias(nvp, nvp_rdev)
	register struct vnode *nvp;
	dev_t nvp_rdev;
{
	struct vnode *vp;
	struct vnode **vpp;
	int vid = 0;

	vpp = &speclisth[SPECHASH(nvp_rdev)];
loop:
	SPECHASH_LOCK();

	/* scan the hash chain for an existing vnode of the same dev/type */
	for (vp = *vpp; vp; vp = vp->v_specnext) {
		if (nvp_rdev == vp->v_rdev && nvp->v_type == vp->v_type) {
			vid = vp->v_id;
			break;
		}
	}
	SPECHASH_UNLOCK();

	if (vp) {
		/* vid guards against the vnode being recycled after we
		 * dropped the hash lock; on mismatch start over */
		if (vnode_getwithvid(vp,vid)) {
			goto loop;
		}
		/*
		 * Termination state is checked in vnode_getwithvid
		 */
		vnode_lock(vp);

		/*
		 * Alias, but not in use, so flush it out.
		 */
		if ((vp->v_iocount == 1) && (vp->v_usecount == 0)) {
			vnode_reclaim_internal(vp, 1, 0);
			vnode_unlock(vp);
			vnode_put(vp);
			goto loop;
		}
	}
	if (vp == NULL || vp->v_tag != VT_NON) {
		/* no live alias (or alias already claimed by a filesystem):
		 * give nvp its own specinfo and insert it on the hash chain */
		MALLOC_ZONE(nvp->v_specinfo, struct specinfo *, sizeof(struct specinfo),
			     M_SPECINFO, M_WAITOK);
		bzero(nvp->v_specinfo, sizeof(struct specinfo));
		nvp->v_rdev = nvp_rdev;
		nvp->v_specflags = 0;
		nvp->v_speclastr = -1;

		SPECHASH_LOCK();
		nvp->v_hashchain = vpp;
		nvp->v_specnext = *vpp;
		*vpp = nvp;
		SPECHASH_UNLOCK();

		if (vp != NULLVP) {
			/* both vnodes now alias the same device */
			nvp->v_flag |= VALIASED;
			vp->v_flag |= VALIASED;
			vnode_unlock(vp);
			vnode_put(vp);
		}
		return (NULLVP);
	}
	return (vp);
}
1213
91447636 1214
1c79356b 1215/*
0b4e3aa0
A
1216 * Get a reference on a particular vnode and lock it if requested.
1217 * If the vnode was on the inactive list, remove it from the list.
1218 * If the vnode was on the free list, remove it from the list and
1219 * move it to inactive list as needed.
1220 * The vnode lock bit is set if the vnode is being eliminated in
1221 * vgone. The process is awakened when the transition is completed,
1222 * and an error returned to indicate that the vnode is no longer
1223 * usable (possibly having been changed to a new file system type).
1c79356b 1224 */
91447636
A
1225static int
1226vget_internal(vnode_t vp, int vid, int vflags)
1c79356b
A
1227{
1228 int error = 0;
55e303ae
A
1229 u_long vpid;
1230
91447636 1231 vnode_lock(vp);
55e303ae 1232
91447636
A
1233 if (vflags & VNODE_WITHID)
1234 vpid = vid;
1235 else
1236 vpid = vp->v_id; // save off the original v_id
0b4e3aa0 1237
91447636
A
1238 if ((vflags & VNODE_WRITEABLE) && (vp->v_writecount == 0))
1239 /*
1240 * vnode to be returned only if it has writers opened
1241 */
1242 error = EINVAL;
1243 else
1244 error = vnode_getiocount(vp, 1, vpid, vflags);
55e303ae 1245
91447636 1246 vnode_unlock(vp);
55e303ae 1247
0b4e3aa0
A
1248 return (error);
1249}
1250
1c79356b 1251int
91447636 1252vnode_ref(vnode_t vp)
1c79356b 1253{
1c79356b 1254
91447636 1255 return (vnode_ref_ext(vp, 0));
1c79356b
A
1256}
1257
1c79356b 1258int
91447636 1259vnode_ref_ext(vnode_t vp, int fmode)
1c79356b 1260{
91447636 1261 int error = 0;
1c79356b 1262
91447636 1263 vnode_lock(vp);
1c79356b 1264
91447636
A
1265 /*
1266 * once all the current call sites have been fixed to insure they have
1267 * taken an iocount, we can toughen this assert up and insist that the
1268 * iocount is non-zero... a non-zero usecount doesn't insure correctness
1269 */
1270 if (vp->v_iocount <= 0 && vp->v_usecount <= 0)
1271 panic("vnode_ref_ext: vp %x has no valid reference %d, %d", vp, vp->v_iocount, vp->v_usecount);
1c79356b 1272
91447636
A
1273 /*
1274 * if you are the owner of drain/termination, can acquire usecount
1275 */
1276 if ((vp->v_lflag & (VL_DRAIN | VL_TERMINATE | VL_DEAD))) {
1277 if (vp->v_owner != current_thread()) {
1278 error = ENOENT;
1279 goto out;
1280 }
1281 }
1282 vp->v_usecount++;
1c79356b 1283
91447636
A
1284 if (fmode & FWRITE) {
1285 if (++vp->v_writecount <= 0)
1286 panic("vnode_ref_ext: v_writecount");
55e303ae 1287 }
91447636
A
1288 if (fmode & O_EVTONLY) {
1289 if (++vp->v_kusecount <= 0)
1290 panic("vnode_ref_ext: v_kusecount");
55e303ae 1291 }
91447636
A
1292out:
1293 vnode_unlock(vp);
1294
1295 return (error);
55e303ae
A
1296}
1297
1298
1c79356b
A
/*
 * put the vnode on appropriate free list.
 * called with vnode LOCKED
 */
static void
vnode_list_add(vnode_t vp)
{
	/*
	 * if it is already on a list or non zero references return
	 * (only completely idle vnodes belong on the free list)
	 */
	if (VONLIST(vp) || (vp->v_usecount != 0) || (vp->v_iocount != 0))
		return;
	vnode_list_lock();

	/*
	 * insert at tail of LRU list or at head if VAGE or VL_DEAD is set
	 * (aged/dead vnodes go to the head so they are recycled first)
	 */
	if ((vp->v_flag & VAGE) || (vp->v_lflag & VL_DEAD)) {
		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
		vp->v_flag &= ~VAGE;	/* VAGE is a one-shot hint */
	} else {
		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
	}
	freevnodes++;

	vnode_list_unlock();
}
1327
/*
 * remove the vnode from appropriate free list.
 * called with the vnode lock held.
 */
static void
vnode_list_remove(vnode_t vp)
{
	/*
	 * we want to avoid taking the list lock
	 * in the case where we're not on the free
	 * list... this will be true for most
	 * directories and any currently in use files
	 *
	 * we're guaranteed that we can't go from
	 * the not-on-list state to the on-list
	 * state since we hold the vnode lock...
	 * all calls to vnode_list_add are done
	 * under the vnode lock... so we can
	 * check for that condition (the prevelant one)
	 * without taking the list lock
	 */
	if (VONLIST(vp)) {
		vnode_list_lock();
		/*
		 * however, we're not guaranteed that
		 * we won't go from the on-list state
		 * to the non-on-list state until we
		 * hold the vnode_list_lock... this
		 * is due to new_vnode removing vnodes
		 * from the free list uder the list_lock
		 * w/o the vnode lock... so we need to
		 * check again whether we're currently
		 * on the free list
		 */
		if (VONLIST(vp)) {
			VREMFREE("vnode_list_remove", vp);
			VLISTNONE(vp);
		}
		vnode_list_unlock();
	}
}
1368
1369
/*
 * Drop a plain usecount reference taken with vnode_ref():
 * no open-mode accounting, reentry into the filesystem allowed.
 */
void
vnode_rele(vnode_t vp)
{
	vnode_rele_internal(vp, 0, 0, 0);
}
1c79356b 1375
1c79356b 1376
91447636
A
/*
 * Drop a usecount reference taken with open mode 'fmode'
 * (FWRITE / O_EVTONLY accounting mirrors vnode_ref_ext).
 * A non-zero 'dont_reenter' asks vnode_rele_internal not to call
 * back into the filesystem (VNOP_INACTIVE) from this release.
 */
void
vnode_rele_ext(vnode_t vp, int fmode, int dont_reenter)
{
	vnode_rele_internal(vp, fmode, dont_reenter, 0);
}
1382
91447636 1383
1c79356b 1384void
91447636 1385vnode_rele_internal(vnode_t vp, int fmode, int dont_reenter, int locked)
1c79356b 1386{
91447636 1387 struct vfs_context context;
55e303ae 1388
91447636
A
1389 if ( !locked)
1390 vnode_lock(vp);
1c79356b 1391
91447636
A
1392 if (--vp->v_usecount < 0)
1393 panic("vnode_rele_ext: vp %x usecount -ve : %d", vp, vp->v_usecount);
1394
1395 if (fmode & FWRITE) {
1396 if (--vp->v_writecount < 0)
1397 panic("vnode_rele_ext: vp %x writecount -ve : %d", vp, vp->v_writecount);
1c79356b 1398 }
91447636
A
1399 if (fmode & O_EVTONLY) {
1400 if (--vp->v_kusecount < 0)
1401 panic("vnode_rele_ext: vp %x kusecount -ve : %d", vp, vp->v_kusecount);
1c79356b 1402 }
91447636
A
1403 if ((vp->v_iocount > 0) || (vp->v_usecount > 0)) {
1404 /*
1405 * vnode is still busy... if we're the last
1406 * usecount, mark for a future call to VNOP_INACTIVE
1407 * when the iocount finally drops to 0
1408 */
1409 if (vp->v_usecount == 0) {
1410 vp->v_lflag |= VL_NEEDINACTIVE;
1411 vp->v_flag &= ~(VNOCACHE_DATA | VRAOFF);
1412 }
1413 if ( !locked)
1414 vnode_unlock(vp);
1c79356b
A
1415 return;
1416 }
91447636
A
1417 vp->v_flag &= ~(VNOCACHE_DATA | VRAOFF);
1418
1419 if ( (vp->v_lflag & (VL_TERMINATE | VL_DEAD)) || dont_reenter) {
1420 /*
1421 * vnode is being cleaned, or
1422 * we've requested that we don't reenter
1423 * the filesystem on this release... in
1424 * this case, we'll mark the vnode aged
1425 * if it's been marked for termination
1c79356b 1426 */
91447636
A
1427 if (dont_reenter) {
1428 if ( !(vp->v_lflag & (VL_TERMINATE | VL_DEAD | VL_MARKTERM)) )
1429 vp->v_lflag |= VL_NEEDINACTIVE;
1430 vp->v_flag |= VAGE;
1c79356b 1431 }
91447636
A
1432 vnode_list_add(vp);
1433 if ( !locked)
1434 vnode_unlock(vp);
1435 return;
1c79356b 1436 }
91447636
A
1437 /*
1438 * at this point both the iocount and usecount
1439 * are zero
1440 * pick up an iocount so that we can call
1441 * VNOP_INACTIVE with the vnode lock unheld
1442 */
1443 vp->v_iocount++;
1444#ifdef JOE_DEBUG
1445 record_vp(vp, 1);
1c79356b 1446#endif
91447636
A
1447 vp->v_lflag &= ~VL_NEEDINACTIVE;
1448 vnode_unlock(vp);
1c79356b 1449
91447636
A
1450 context.vc_proc = current_proc();
1451 context.vc_ucred = kauth_cred_get();
1452 VNOP_INACTIVE(vp, &context);
1c79356b 1453
91447636
A
1454 vnode_lock(vp);
1455 /*
1456 * because we dropped the vnode lock to call VNOP_INACTIVE
1457 * the state of the vnode may have changed... we may have
1458 * picked up an iocount, usecount or the MARKTERM may have
1459 * been set... we need to reevaluate the reference counts
1460 * to determine if we can call vnode_reclaim_internal at
1461 * this point... if the reference counts are up, we'll pick
1462 * up the MARKTERM state when they get subsequently dropped
1463 */
1464 if ( (vp->v_iocount == 1) && (vp->v_usecount == 0) &&
1465 ((vp->v_lflag & (VL_MARKTERM | VL_TERMINATE | VL_DEAD)) == VL_MARKTERM)) {
1466 struct uthread *ut;
1c79356b 1467
91447636
A
1468 ut = get_bsdthread_info(current_thread());
1469
1470 if (ut->uu_defer_reclaims) {
1471 vp->v_defer_reclaimlist = ut->uu_vreclaims;
1472 ut->uu_vreclaims = vp;
1473 goto defer_reclaim;
1474 }
1475 vnode_reclaim_internal(vp, 1, 0);
1476 }
1477 vnode_dropiocount(vp, 1);
1478 vnode_list_add(vp);
1479defer_reclaim:
1480 if ( !locked)
1481 vnode_unlock(vp);
1482 return;
1c79356b
A
1483}
1484
/*
 * Remove any vnodes in the vnode table belonging to mount point mp.
 *
 * If MNT_NOFORCE is specified, there should not be any active ones,
 * return error if any are found (nb: this is a user error, not a
 * system error). If MNT_FORCE is specified, detach any active vnodes
 * that are found.
 */
#if DIAGNOSTIC
int busyprt = 0;	/* print out busy vnodes */
#if 0
struct ctldebug debug1 = { "busyprt", &busyprt };
#endif /* 0 */
#endif

int
vflush(mp, skipvp, flags)
	struct mount *mp;
	struct vnode *skipvp;
	int flags;
{
	struct proc *p = current_proc();
	struct vnode *vp;
	int busy = 0;
	int reclaimed = 0;
	int vid, retval;

	mount_lock(mp);
	vnode_iterate_setup(mp);
	/*
	 * On regular unmounts(not forced) do a
	 * quick check for vnodes to be in use. This
	 * preserves the caching of vnodes. automounter
	 * tries unmounting every so often to see whether
	 * it is still busy or not.
	 */
	if ((flags & FORCECLOSE)==0) {
		if (vnode_umount_preflight(mp, skipvp, flags)) {
			vnode_iterate_clear(mp);
			mount_unlock(mp);
			return(EBUSY);
		}
	}
loop:
	/* if it returns 0 then there is nothing to do */
	retval = vnode_iterate_prepare(mp);

	if (retval == 0)  {
		vnode_iterate_clear(mp);
		mount_unlock(mp);
		return(retval);
	}

	/* iterate over all the vnodes */
	while (!TAILQ_EMPTY(&mp->mnt_workerqueue)) {
		vp = TAILQ_FIRST(&mp->mnt_workerqueue);
		TAILQ_REMOVE(&mp->mnt_workerqueue, vp, v_mntvnodes);
		TAILQ_INSERT_TAIL(&mp->mnt_vnodelist, vp, v_mntvnodes);
		if ( (vp->v_mount != mp) || (vp == skipvp)) {
			continue;
		}
		/* capture v_id so we can detect recycling once mp is unlocked */
		vid = vp->v_id;
		mount_unlock(mp);
		vnode_lock(vp);

		if ((vp->v_id != vid) || ((vp->v_lflag & (VL_DEAD | VL_TERMINATE)))) {
			vnode_unlock(vp);
			mount_lock(mp);
			continue;
		}

		/*
		 * If requested, skip over vnodes marked VSYSTEM.
		 * Skip over all vnodes marked VNOFLUSH.
		 */
		if ((flags & SKIPSYSTEM) && ((vp->v_flag & VSYSTEM) ||
		    (vp->v_flag & VNOFLUSH))) {
			vnode_unlock(vp);
			mount_lock(mp);
			continue;
		}
		/*
		 * If requested, skip over vnodes marked VSWAP.
		 */
		if ((flags & SKIPSWAP) && (vp->v_flag & VSWAP)) {
			vnode_unlock(vp);
			mount_lock(mp);
			continue;
		}
		/*
		 * If requested, skip over vnodes marked VROOT.
		 */
		if ((flags & SKIPROOT) && (vp->v_flag & VROOT)) {
			vnode_unlock(vp);
			mount_lock(mp);
			continue;
		}
		/*
		 * If WRITECLOSE is set, only flush out regular file
		 * vnodes open for writing.
		 */
		if ((flags & WRITECLOSE) &&
		    (vp->v_writecount == 0 || vp->v_type != VREG)) {
			vnode_unlock(vp);
			mount_lock(mp);
			continue;
		}
		/*
		 * If the real usecount is 0, all we need to do is clear
		 * out the vnode data structures and we are done.
		 */
		if (((vp->v_usecount == 0) ||
		    ((vp->v_usecount - vp->v_kusecount) == 0))) {
			vp->v_iocount++;	/* so that drain waits for * other iocounts */
#ifdef JOE_DEBUG
			record_vp(vp, 1);
#endif
			vnode_reclaim_internal(vp, 1, 0);
			vnode_dropiocount(vp, 1);
			vnode_list_add(vp);

			vnode_unlock(vp);
			reclaimed++;
			mount_lock(mp);
			continue;
		}
		/*
		 * If FORCECLOSE is set, forcibly close the vnode.
		 * For block or character devices, revert to an
		 * anonymous device. For all other files, just kill them.
		 */
		if (flags & FORCECLOSE) {
			if (vp->v_type != VBLK && vp->v_type != VCHR) {
				vp->v_iocount++;	/* so that drain waits * for other iocounts */
#ifdef JOE_DEBUG
				record_vp(vp, 1);
#endif
				vnode_reclaim_internal(vp, 1, 0);
				vnode_dropiocount(vp, 1);
				vnode_list_add(vp);
				vnode_unlock(vp);
			} else {
				/* device node: clean it but keep it usable as
				 * an anonymous spec vnode */
				vclean(vp, 0, p);
				vp->v_lflag &= ~VL_DEAD;
				vp->v_op = spec_vnodeop_p;
				vnode_unlock(vp);
			}
			mount_lock(mp);
			continue;
		}
#if DIAGNOSTIC
		if (busyprt)
			vprint("vflush: busy vnode", vp);
#endif
		vnode_unlock(vp);
		mount_lock(mp);
		busy++;
	}

	/* At this point the worker queue is completed */
	if (busy && ((flags & FORCECLOSE)==0) && reclaimed) {
		/* we reclaimed something this pass, so busy vnodes may
		 * have become idle; rebuild the queue and retry */
		busy = 0;
		reclaimed = 0;
		(void)vnode_iterate_reloadq(mp);
		/* returned with mount lock held */
		goto loop;
	}

	/* if new vnodes were created in between retry the reclaim */
	if ( vnode_iterate_reloadq(mp) != 0) {
		if (!(busy && ((flags & FORCECLOSE)==0)))
			goto loop;
	}
	vnode_iterate_clear(mp);
	mount_unlock(mp);

	if (busy && ((flags & FORCECLOSE)==0))
		return (EBUSY);
	return (0);
}
1665
int num_recycledvnodes=0;	/* statistic: vnodes put through vclean */
/*
 * Disassociate the underlying file system from a vnode.
 * The vnode lock is held on entry and on exit; it is dropped
 * internally around the calls back into the filesystem.
 */
static void
vclean(vnode_t vp, int flags, proc_t p)
{
	struct vfs_context context;
	int active;
	int need_inactive;
	int already_terminating;
	kauth_cred_t ucred = NULL;

	context.vc_proc = p;
	context.vc_ucred = kauth_cred_get();

	/*
	 * Check to see if the vnode is in use.
	 * If so we have to reference it before we clean it out
	 * so that its count cannot fall to zero and generate a
	 * race against ourselves to recycle it.
	 */
	active = vp->v_usecount;

	/*
	 * just in case we missed sending a needed
	 * VNOP_INACTIVE, we'll do it now
	 */
	need_inactive = (vp->v_lflag & VL_NEEDINACTIVE);

	vp->v_lflag &= ~VL_NEEDINACTIVE;

	/*
	 * Prevent the vnode from being recycled or
	 * brought into use while we clean it out.
	 */
	already_terminating = (vp->v_lflag & VL_TERMINATE);

	vp->v_lflag |= VL_TERMINATE;

	/*
	 * remove the vnode from any mount list
	 * it might be on...
	 */
	insmntque(vp, (struct mount *)0);

	/* detach the cached credential; released below once unlocked */
	ucred = vp->v_cred;
	vp->v_cred = NOCRED;

	vnode_unlock(vp);

	if (IS_VALID_CRED(ucred))
		kauth_cred_unref(&ucred);

	OSAddAtomic(1, &num_recycledvnodes);
	/*
	 * purge from the name cache as early as possible...
	 */
	cache_purge(vp);

	if (active && (flags & DOCLOSE))
		VNOP_CLOSE(vp, IO_NDELAY, &context);

	/*
	 * Clean out any buffers associated with the vnode.
	 */
	if (flags & DOCLOSE) {
#if NFSCLIENT
		if (vp->v_tag == VT_NFS)
			nfs_vinvalbuf(vp, V_SAVE, NOCRED, p, 0);
		else
#endif
		{
			VNOP_FSYNC(vp, MNT_WAIT, &context);
			buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0);
		}
		if (UBCINFOEXISTS(vp))
			/*
			 * Clean the pages in VM.
			 */
			(void)ubc_sync_range(vp, (off_t)0, ubc_getsize(vp), UBC_PUSHALL);
	}
	if (UBCINFOEXISTS(vp))
		cluster_release(vp->v_ubcinfo);

	if (active || need_inactive)
		VNOP_INACTIVE(vp, &context);

	/* Destroy ubc named reference */
	ubc_destroy_named(vp);

	/*
	 * Reclaim the vnode.
	 */
	if (VNOP_RECLAIM(vp, &context))
		panic("vclean: cannot reclaim");

	// make sure the name & parent ptrs get cleaned out!
	vnode_update_identity(vp, NULLVP, NULL, 0, 0, VNODE_UPDATE_PARENT | VNODE_UPDATE_NAME);

	vnode_lock(vp);

	/* point the vnode at the dead filesystem */
	vp->v_mount = dead_mountp;
	vp->v_op = dead_vnodeop_p;
	vp->v_tag = VT_NON;
	vp->v_data = NULL;

	vp->v_lflag |= VL_DEAD;

	if (already_terminating == 0) {
		vp->v_lflag &= ~VL_TERMINATE;
		/*
		 * Done with purge, notify sleepers of the grim news.
		 */
		if (vp->v_lflag & VL_TERMWANT) {
			vp->v_lflag &= ~VL_TERMWANT;
			wakeup(&vp->v_lflag);
		}
	}
}
1787
/*
 * Eliminate all activity associated with the requested vnode
 * and with all vnodes aliased to the requested vnode.
 * Returns ENOENT if the vnode is already being terminated, 0 otherwise.
 */
int
vn_revoke(vnode_t vp, int flags, __unused vfs_context_t a_context)
{
	struct vnode *vq;
	int vid;

#if DIAGNOSTIC
	if ((flags & REVOKEALL) == 0)
		panic("vnop_revoke");
#endif

	if (vp->v_flag & VALIASED) {
		/*
		 * If a vgone (or vclean) is already in progress,
		 * wait until it is done and return.
		 */
		vnode_lock(vp);
		if (vp->v_lflag & VL_TERMINATE) {
			vnode_unlock(vp);
			return(ENOENT);
		}
		vnode_unlock(vp);
		/*
		 * Ensure that vp will not be vgone'd while we
		 * are eliminating its aliases.
		 */
		SPECHASH_LOCK();
		while (vp->v_flag & VALIASED) {
			/* reclaim one alias per pass; the hash lock is
			 * dropped around the reclaim, so rescan each time */
			for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
				if (vq->v_rdev != vp->v_rdev ||
				    vq->v_type != vp->v_type || vp == vq)
					continue;
				vid = vq->v_id;
				SPECHASH_UNLOCK();
				if (vnode_getwithvid(vq,vid)){
					SPECHASH_LOCK();
					break;
				}
				vnode_reclaim_internal(vq, 0, 0);
				vnode_put(vq);
				SPECHASH_LOCK();
				break;
			}
		}
		SPECHASH_UNLOCK();
	}
	vnode_reclaim_internal(vp, 0, 0);

	return (0);
}
1842
1843/*
1844 * Recycle an unused vnode to the front of the free list.
1845 * Release the passed interlock if the vnode will be recycled.
1846 */
1847int
91447636 1848vnode_recycle(vp)
1c79356b 1849 struct vnode *vp;
1c79356b 1850{
91447636 1851 vnode_lock(vp);
1c79356b 1852
91447636
A
1853 if (vp->v_iocount || vp->v_usecount) {
1854 vp->v_lflag |= VL_MARKTERM;
1855 vnode_unlock(vp);
1856 return(0);
1857 }
1858 vnode_reclaim_internal(vp, 1, 0);
1859 vnode_unlock(vp);
1860
1861 return (1);
1c79356b
A
1862}
1863
91447636
A
1864static int
1865vnode_reload(vnode_t vp)
1c79356b 1866{
91447636 1867 vnode_lock(vp);
1c79356b 1868
91447636
A
1869 if ((vp->v_iocount > 1) || vp->v_usecount) {
1870 vnode_unlock(vp);
1871 return(0);
1872 }
1873 if (vp->v_iocount <= 0)
1874 panic("vnode_reload with no iocount %d", vp->v_iocount);
1875
1876 /* mark for release when iocount is dopped */
1877 vp->v_lflag |= VL_MARKTERM;
1878 vnode_unlock(vp);
1879
1880 return (1);
1c79356b
A
1881}
1882
91447636
A
1883
/*
 * Eliminate all activity associated with a vnode: clean it, and if it
 * is a special device, unlink it from the spec hash chain and release
 * its specinfo.
 */
static void
vgone(vnode_t vp)
{
	struct vnode *vq;
	struct vnode *vx;

	/*
	 * Clean out the filesystem specific data.
	 * vclean also takes care of removing the
	 * vnode from any mount list it might be on
	 */
	vclean(vp, DOCLOSE, current_proc());

	/*
	 * If special device, remove it from special device alias list
	 * if it is on one.
	 */
	if ((vp->v_type == VBLK || vp->v_type == VCHR) && vp->v_specinfo != 0) {
			SPECHASH_LOCK();
			/* unlink vp from its hash chain */
			if (*vp->v_hashchain == vp) {
				*vp->v_hashchain = vp->v_specnext;
			} else {
				for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
					if (vq->v_specnext != vp)
						continue;
					vq->v_specnext = vp->v_specnext;
					break;
				}
			if (vq == NULL)
				panic("missing bdev");
			}
			if (vp->v_flag & VALIASED) {
				/*
				 * find the remaining aliases of this dev/type;
				 * if only one is left, it is no longer aliased
				 */
				vx = NULL;
				for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
					if (vq->v_rdev != vp->v_rdev ||
					    vq->v_type != vp->v_type)
						continue;
					if (vx)
						break;
					vx = vq;
				}
				if (vx == NULL)
					panic("missing alias");
				if (vq == NULL)
					vx->v_flag &= ~VALIASED;
				vp->v_flag &= ~VALIASED;
			}
			SPECHASH_UNLOCK();
			{
			struct specinfo *tmp = vp->v_specinfo;
			vp->v_specinfo = NULL;
			FREE_ZONE((void *)tmp, sizeof(struct specinfo), M_SPECINFO);
			}
	}
}
1939
/*
 * Lookup a vnode by device number.
 * Returns 1 (and sets *errorp from vfs_mountedon) when an in-use
 * vnode for 'dev' of type 'type' is mounted on, 0 otherwise.
 */
int
check_mountedon(dev_t dev, enum vtype type, int  *errorp)
{
	vnode_t	vp;
	int rc = 0;
	int vid;

loop:
	SPECHASH_LOCK();
	for (vp = speclisth[SPECHASH(dev)]; vp; vp = vp->v_specnext) {
		if (dev != vp->v_rdev || type != vp->v_type)
			continue;
		/* vid revalidates the vnode after dropping the hash lock */
		vid = vp->v_id;
		SPECHASH_UNLOCK();
		if (vnode_getwithvid(vp,vid))
			goto loop;
		vnode_lock(vp);
		/* "in use" means references beyond the iocount we just took */
		if ((vp->v_usecount > 0) || (vp->v_iocount > 1)) {
			vnode_unlock(vp);
			if ((*errorp = vfs_mountedon(vp)) != 0)
				rc = 1;
		} else
			vnode_unlock(vp);
		vnode_put(vp);
		return(rc);
	}
	SPECHASH_UNLOCK();
	return (0);
}
1972
/*
 * Calculate the total number of references to a special device,
 * summed across all aliases of the device (kusecounts excluded).
 * Unused aliases encountered along the way are reclaimed.
 */
int
vcount(vnode_t vp)
{
	vnode_t vq, vnext;
	int count;
	int vid;

loop:
	if ((vp->v_flag & VALIASED) == 0)
		return (vp->v_usecount - vp->v_kusecount);

	SPECHASH_LOCK();
	for (count = 0, vq = *vp->v_hashchain; vq; vq = vnext) {
		vnext = vq->v_specnext;
		if (vq->v_rdev != vp->v_rdev || vq->v_type != vp->v_type)
			continue;
		vid = vq->v_id;
		SPECHASH_UNLOCK();

		/* the chain may have changed while unlocked; restart on
		 * id mismatch so the count stays consistent */
		if (vnode_getwithvid(vq, vid)) {
			goto loop;
		}
		/*
		 * Alias, but not in use, so flush it out.
		 */
		vnode_lock(vq);
		if ((vq->v_usecount == 0) && (vq->v_iocount == 1)  && vq != vp) {
			vnode_reclaim_internal(vq, 1, 0);
			vnode_unlock(vq);
			vnode_put(vq);
			goto loop;
		}
		count += (vq->v_usecount - vq->v_kusecount);
		vnode_unlock(vq);
		vnode_put(vq);

		SPECHASH_LOCK();
	}
	SPECHASH_UNLOCK();

	return (count);
}
2018
int prtactive = 0;		/* 1 => print out reclaim of active vnodes */

/*
 * Print out a description of a vnode.
 */
static char *typename[] =
   { "VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD" };

void
vprint(const char *label, struct vnode *vp)
{
	/* 64 bytes is enough for all six flag names plus separators */
	char sbuf[64];

	if (label != NULL)
		printf("%s: ", label);
	printf("type %s, usecount %d, writecount %d",
	       typename[vp->v_type], vp->v_usecount, vp->v_writecount);
	/* build a '|'-separated flag list; the leading '|' is skipped below */
	sbuf[0] = '\0';
	if (vp->v_flag & VROOT)
		strcat(sbuf, "|VROOT");
	if (vp->v_flag & VTEXT)
		strcat(sbuf, "|VTEXT");
	if (vp->v_flag & VSYSTEM)
		strcat(sbuf, "|VSYSTEM");
	if (vp->v_flag & VNOFLUSH)
		strcat(sbuf, "|VNOFLUSH");
	if (vp->v_flag & VBWAIT)
		strcat(sbuf, "|VBWAIT");
	if (vp->v_flag & VALIASED)
		strcat(sbuf, "|VALIASED");
	if (sbuf[0] != '\0')
		printf(" flags (%s)", &sbuf[1]);
}
2052
1c79356b 2053
91447636
A
/*
 * Build the path for a vnode into pathbuf.  On entry *len is the
 * buffer size; on return it holds the length produced by build_path.
 */
int
vn_getpath(struct vnode *vp, char *pathbuf, int *len)
{
	int buflen = *len;

	return build_path(vp, pathbuf, buflen, len);
}
91447636
A
2059
2060
static char *extension_table=NULL;	/* nexts entries, each max_ext_width bytes */
static int   nexts;
static int   max_ext_width;

/*
 * qsort comparator for the package-extension table: orders entries by
 * ascending extension length.
 *
 * FIX: the parameters are now (const void *, const void *) so the
 * function pointer matches the type qsort expects, and the strlen
 * results are converted to int before subtracting so the size_t
 * difference cannot wrap for a shorter-minus-longer pair.
 */
static int
extension_cmp(const void *a, const void *b)
{
    return (int)strlen((const char *)a) - (int)strlen((const char *)b);
}
55e303ae 2070
55e303ae 2071
91447636
A
2072//
2073// This is the api LaunchServices uses to inform the kernel
2074// the list of package extensions to ignore.
2075//
2076// Internally we keep the list sorted by the length of the
2077// the extension (from longest to shortest). We sort the
2078// list of extensions so that we can speed up our searches
2079// when comparing file names -- we only compare extensions
2080// that could possibly fit into the file name, not all of
2081// them (i.e. a short 8 character name can't have an 8
2082// character extension).
2083//
2084__private_extern__ int
2085set_package_extensions_table(void *data, int nentries, int maxwidth)
2086{
2087 char *new_exts, *ptr;
2088 int error, i, len;
2089
2090 if (nentries <= 0 || nentries > 1024 || maxwidth <= 0 || maxwidth > 255) {
2091 return EINVAL;
2092 }
2093
2094 MALLOC(new_exts, char *, nentries * maxwidth, M_TEMP, M_WAITOK);
2095
2096 error = copyin(CAST_USER_ADDR_T(data), new_exts, nentries * maxwidth);
2097 if (error) {
2098 FREE(new_exts, M_TEMP);
2099 return error;
2100 }
2101
2102 if (extension_table) {
2103 FREE(extension_table, M_TEMP);
2104 }
2105 extension_table = new_exts;
2106 nexts = nentries;
2107 max_ext_width = maxwidth;
2108
2109 qsort(extension_table, nexts, maxwidth, extension_cmp);
2110
2111 return 0;
2112}
2113
2114
2115__private_extern__ int
2116is_package_name(char *name, int len)
2117{
2118 int i, extlen;
2119 char *ptr, *name_ext;
2120
2121 if (len <= 3) {
2122 return 0;
2123 }
2124
2125 name_ext = NULL;
2126 for(ptr=name; *ptr != '\0'; ptr++) {
2127 if (*ptr == '.') {
2128 name_ext = ptr;
55e303ae 2129 }
91447636 2130 }
55e303ae 2131
91447636
A
2132 // if there is no "." extension, it can't match
2133 if (name_ext == NULL) {
2134 return 0;
2135 }
55e303ae 2136
91447636
A
2137 // advance over the "."
2138 name_ext++;
55e303ae 2139
91447636
A
2140 // now iterate over all the extensions to see if any match
2141 ptr = &extension_table[0];
2142 for(i=0; i < nexts; i++, ptr+=max_ext_width) {
2143 extlen = strlen(ptr);
2144 if (strncasecmp(name_ext, ptr, extlen) == 0 && name_ext[extlen] == '\0') {
2145 // aha, a match!
2146 return 1;
55e303ae
A
2147 }
2148 }
2149
91447636
A
2150 // if we get here, no extension matched
2151 return 0;
55e303ae
A
2152}
2153
91447636
A
/*
 * Walk the absolute path 'path' (pathlen bytes) component by component
 * and report, via *component, the index of the first component whose
 * name matches a registered package extension (-1 if none).
 * Returns EINVAL when the path is not absolute or not NUL-terminated
 * within pathlen; 0 otherwise.
 *
 * NOTE(review): this routine overwrites each component separator in
 * the caller's buffer with '\0' (see "*end = '\0'" below) and never
 * restores it, so 'path' is mutated: truncated at a matching
 * component, or left with embedded NULs otherwise.  Callers appear to
 * pass scratch buffers -- confirm before reusing the buffer afterwards.
 */
int
vn_path_package_check(__unused vnode_t vp, char *path, int pathlen, int *component)
{
	char *ptr, *end;
	int comp=0;

	*component = -1;
	if (*path != '/') {
		return EINVAL;
	}

	end = path + 1;
	while(end < path + pathlen && *end != '\0') {
		/* skip any run of separators */
		while(end < path + pathlen && *end == '/' && *end != '\0') {
			end++;
		}

		ptr = end;

		/* scan to the end of this component */
		while(end < path + pathlen && *end != '/' && *end != '\0') {
			end++;
		}

		if (end > path + pathlen) {
			// hmm, string wasn't null terminated
			return EINVAL;
		}

		/* terminate the component in place so is_package_name
		 * sees just this component (destructive -- see NOTE above) */
		*end = '\0';
		if (is_package_name(ptr, end - ptr)) {
			*component = comp;
			break;
		}

		end++;
		comp++;
	}

	return 0;
}
55e303ae
A
2194
2195
1c79356b
A
2196/*
2197 * Top level filesystem related information gathering.
2198 */
91447636
A
2199extern unsigned int vfs_nummntops;
2200
1c79356b 2201int
91447636
A
2202vfs_sysctl(int *name, u_int namelen, user_addr_t oldp, size_t *oldlenp,
2203 user_addr_t newp, size_t newlen, struct proc *p)
1c79356b 2204{
91447636 2205 struct vfstable *vfsp;
55e303ae
A
2206 int *username;
2207 u_int usernamelen;
2208 int error;
91447636 2209 struct vfsconf *vfsc;
1c79356b 2210
9bccf70c
A
2211 /*
2212 * The VFS_NUMMNTOPS shouldn't be at name[0] since
2213 * is a VFS generic variable. So now we must check
2214 * namelen so we don't end up covering any UFS
2215 * variables (sinc UFS vfc_typenum is 1).
2216 *
2217 * It should have been:
2218 * name[0]: VFS_GENERIC
2219 * name[1]: VFS_NUMMNTOPS
2220 */
2221 if (namelen == 1 && name[0] == VFS_NUMMNTOPS) {
1c79356b
A
2222 return (sysctl_rdint(oldp, oldlenp, newp, vfs_nummntops));
2223 }
2224
2225 /* all sysctl names at this level are at least name and field */
2226 if (namelen < 2)
55e303ae 2227 return (EISDIR); /* overloaded */
1c79356b 2228 if (name[0] != VFS_GENERIC) {
91447636
A
2229 struct vfs_context context;
2230
1c79356b
A
2231 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
2232 if (vfsp->vfc_typenum == name[0])
2233 break;
2234 if (vfsp == NULL)
91447636
A
2235 return (ENOTSUP);
2236 context.vc_proc = p;
2237 context.vc_ucred = kauth_cred_get();
2238
1c79356b 2239 return ((*vfsp->vfc_vfsops->vfs_sysctl)(&name[1], namelen - 1,
91447636 2240 oldp, oldlenp, newp, newlen, &context));
1c79356b
A
2241 }
2242 switch (name[1]) {
2243 case VFS_MAXTYPENUM:
2244 return (sysctl_rdint(oldp, oldlenp, newp, maxvfsconf));
2245 case VFS_CONF:
2246 if (namelen < 3)
2247 return (ENOTDIR); /* overloaded */
2248 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
2249 if (vfsp->vfc_typenum == name[2])
2250 break;
2251 if (vfsp == NULL)
91447636
A
2252 return (ENOTSUP);
2253 vfsc = (struct vfsconf *)vfsp;
2254 if (proc_is64bit(p)) {
2255 struct user_vfsconf usr_vfsc;
2256 usr_vfsc.vfc_vfsops = CAST_USER_ADDR_T(vfsc->vfc_vfsops);
2257 bcopy(vfsc->vfc_name, usr_vfsc.vfc_name, sizeof(usr_vfsc.vfc_name));
2258 usr_vfsc.vfc_typenum = vfsc->vfc_typenum;
2259 usr_vfsc.vfc_refcount = vfsc->vfc_refcount;
2260 usr_vfsc.vfc_flags = vfsc->vfc_flags;
2261 usr_vfsc.vfc_mountroot = CAST_USER_ADDR_T(vfsc->vfc_mountroot);
2262 usr_vfsc.vfc_next = CAST_USER_ADDR_T(vfsc->vfc_next);
2263 return (sysctl_rdstruct(oldp, oldlenp, newp, &usr_vfsc,
2264 sizeof(usr_vfsc)));
2265 }
2266 else {
2267 return (sysctl_rdstruct(oldp, oldlenp, newp, vfsc,
2268 sizeof(struct vfsconf)));
2269 }
2270
2271 case VFS_SET_PACKAGE_EXTS:
2272 return set_package_extensions_table((void *)name[1], name[2], name[3]);
1c79356b 2273 }
55e303ae
A
2274 /*
2275 * We need to get back into the general MIB, so we need to re-prepend
2276 * CTL_VFS to our name and try userland_sysctl().
2277 */
2278 usernamelen = namelen + 1;
2279 MALLOC(username, int *, usernamelen * sizeof(*username),
2280 M_TEMP, M_WAITOK);
2281 bcopy(name, username + 1, namelen * sizeof(*name));
2282 username[0] = CTL_VFS;
91447636
A
2283 error = userland_sysctl(p, username, usernamelen, oldp,
2284 oldlenp, 1, newp, newlen, oldlenp);
55e303ae
A
2285 FREE(username, M_TEMP);
2286 return (error);
1c79356b
A
2287}
2288
2289int kinfo_vdebug = 1;
2290#define KINFO_VNODESLOP 10
2291/*
2292 * Dump vnode list (via sysctl).
2293 * Copyout address of vnode followed by vnode.
2294 */
2295/* ARGSUSED */
2296int
91447636 2297sysctl_vnode(__unused user_addr_t where, __unused size_t *sizep)
1c79356b 2298{
91447636 2299#if 0
1c79356b
A
2300 struct mount *mp, *nmp;
2301 struct vnode *nvp, *vp;
2302 char *bp = where, *savebp;
2303 char *ewhere;
2304 int error;
2305
2306#define VPTRSZ sizeof (struct vnode *)
2307#define VNODESZ sizeof (struct vnode)
2308 if (where == NULL) {
2309 *sizep = (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ);
2310 return (0);
2311 }
2312 ewhere = where + *sizep;
2313
1c79356b 2314 for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) {
91447636 2315 if (vfs_busy(mp, LK_NOWAIT)) {
1c79356b
A
2316 nmp = mp->mnt_list.cqe_next;
2317 continue;
2318 }
2319 savebp = bp;
2320again:
91447636 2321 TAILQ_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) {
1c79356b
A
2322 /*
2323 * Check that the vp is still associated with
2324 * this filesystem. RACE: could have been
2325 * recycled onto the same filesystem.
2326 */
2327 if (vp->v_mount != mp) {
1c79356b
A
2328 if (kinfo_vdebug)
2329 printf("kinfo: vp changed\n");
2330 bp = savebp;
2331 goto again;
2332 }
1c79356b 2333 if (bp + VPTRSZ + VNODESZ > ewhere) {
91447636 2334 vfs_unbusy(mp);
1c79356b
A
2335 *sizep = bp - where;
2336 return (ENOMEM);
2337 }
1c79356b 2338 if ((error = copyout((caddr_t)&vp, bp, VPTRSZ)) ||
55e303ae 2339 (error = copyout((caddr_t)vp, bp + VPTRSZ, VNODESZ))) {
91447636 2340 vfs_unbusy(mp);
1c79356b 2341 return (error);
55e303ae 2342 }
1c79356b 2343 bp += VPTRSZ + VNODESZ;
1c79356b 2344 }
1c79356b 2345 nmp = mp->mnt_list.cqe_next;
91447636 2346 vfs_unbusy(mp);
1c79356b 2347 }
1c79356b
A
2348
2349 *sizep = bp - where;
2350 return (0);
91447636
A
2351#else
2352 return(EINVAL);
2353#endif
1c79356b
A
2354}
2355
2356/*
2357 * Check to see if a filesystem is mounted on a block device.
2358 */
2359int
2360vfs_mountedon(vp)
2361 struct vnode *vp;
2362{
2363 struct vnode *vq;
2364 int error = 0;
2365
91447636
A
2366 SPECHASH_LOCK();
2367 if (vp->v_specflags & SI_MOUNTEDON) {
2368 error = EBUSY;
2369 goto out;
2370 }
1c79356b 2371 if (vp->v_flag & VALIASED) {
1c79356b
A
2372 for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
2373 if (vq->v_rdev != vp->v_rdev ||
2374 vq->v_type != vp->v_type)
2375 continue;
2376 if (vq->v_specflags & SI_MOUNTEDON) {
2377 error = EBUSY;
2378 break;
2379 }
2380 }
1c79356b 2381 }
91447636
A
2382out:
2383 SPECHASH_UNLOCK();
1c79356b
A
2384 return (error);
2385}
2386
2387/*
2388 * Unmount all filesystems. The list is traversed in reverse order
2389 * of mounting to avoid dependencies.
2390 */
0b4e3aa0 2391__private_extern__ void
1c79356b
A
2392vfs_unmountall()
2393{
91447636 2394 struct mount *mp;
0b4e3aa0 2395 struct proc *p = current_proc();
91447636 2396 int error;
89b3af67 2397 int skip_listremove;
1c79356b
A
2398
2399 /*
2400 * Since this only runs when rebooting, it is not interlocked.
2401 */
91447636
A
2402 mount_list_lock();
2403 while(!TAILQ_EMPTY(&mountlist)) {
2404 mp = TAILQ_LAST(&mountlist, mntlist);
2405 mount_list_unlock();
89b3af67
A
2406 skip_listremove = 0;
2407 error = dounmount(mp, MNT_FORCE, &skip_listremove, p);
91447636
A
2408 if (error) {
2409 mount_list_lock();
89b3af67
A
2410 if (skip_listremove == 0) {
2411 TAILQ_REMOVE(&mountlist, mp, mnt_list);
2412 printf("unmount of %s failed (", mp->mnt_vfsstat.f_mntonname);
2413 }
2414
91447636
A
2415 if (error == EBUSY)
2416 printf("BUSY)\n");
2417 else
2418 printf("%d)\n", error);
2419 continue;
1c79356b 2420 }
91447636 2421 mount_list_lock();
1c79356b 2422 }
91447636 2423 mount_list_unlock();
1c79356b
A
2424}
2425
1c79356b 2426
91447636
A
2427/*
2428 * This routine is called from vnode_pager_no_senders()
2429 * which in turn can be called with vnode locked by vnode_uncache()
2430 * But it could also get called as a result of vm_object_cache_trim().
2431 * In that case lock state is unknown.
2432 * AGE the vnode so that it gets recycled quickly.
1c79356b 2433 */
91447636
A
2434__private_extern__ void
2435vnode_pager_vrele(struct vnode *vp)
1c79356b 2436{
91447636 2437 vnode_lock(vp);
1c79356b 2438
91447636
A
2439 if (!ISSET(vp->v_lflag, VL_TERMINATE))
2440 panic("vnode_pager_vrele: vp not in termination");
2441 vp->v_lflag &= ~VNAMED_UBC;
1c79356b 2442
91447636
A
2443 if (UBCINFOEXISTS(vp)) {
2444 struct ubc_info *uip = vp->v_ubcinfo;
1c79356b 2445
91447636
A
2446 if (ISSET(uip->ui_flags, UI_WASMAPPED))
2447 SET(vp->v_flag, VWASMAPPED);
2448 vp->v_ubcinfo = UBC_INFO_NULL;
2449
2450 ubc_info_deallocate(uip);
2451 } else {
2452 panic("NO ubcinfo in vnode_pager_vrele");
1c79356b 2453 }
91447636
A
2454 vnode_unlock(vp);
2455
2456 wakeup(&vp->v_lflag);
1c79356b
A
2457}
2458
91447636
A
2459
2460#include <sys/disk.h>
2461
2462errno_t
2463vfs_init_io_attributes(vnode_t devvp, mount_t mp)
1c79356b 2464{
91447636
A
2465 int error;
2466 off_t readblockcnt;
2467 off_t writeblockcnt;
2468 off_t readmaxcnt;
2469 off_t writemaxcnt;
2470 off_t readsegcnt;
2471 off_t writesegcnt;
2472 off_t readsegsize;
2473 off_t writesegsize;
2474 u_long blksize;
2475 u_int64_t temp;
2476 struct vfs_context context;
1c79356b 2477
91447636 2478 proc_t p = current_proc();
0b4e3aa0 2479
91447636
A
2480 context.vc_proc = p;
2481 context.vc_ucred = kauth_cred_get();
0b4e3aa0 2482
55e303ae
A
2483 int isvirtual = 0;
2484 /*
2485 * determine if this mount point exists on the same device as the root
2486 * partition... if so, then it comes under the hard throttle control
2487 */
2488 int thisunit = -1;
2489 static int rootunit = -1;
55e303ae
A
2490
2491 if (rootunit == -1) {
91447636 2492 if (VNOP_IOCTL(rootvp, DKIOCGETBSDUNIT, (caddr_t)&rootunit, 0, &context))
55e303ae
A
2493 rootunit = -1;
2494 else if (rootvp == devvp)
2495 mp->mnt_kern_flag |= MNTK_ROOTDEV;
2496 }
2497 if (devvp != rootvp && rootunit != -1) {
91447636 2498 if (VNOP_IOCTL(devvp, DKIOCGETBSDUNIT, (caddr_t)&thisunit, 0, &context) == 0) {
55e303ae
A
2499 if (thisunit == rootunit)
2500 mp->mnt_kern_flag |= MNTK_ROOTDEV;
2501 }
2502 }
91447636
A
2503 /*
2504 * force the spec device to re-cache
2505 * the underlying block size in case
2506 * the filesystem overrode the initial value
2507 */
2508 set_fsblocksize(devvp);
2509
2510
2511 if ((error = VNOP_IOCTL(devvp, DKIOCGETBLOCKSIZE,
2512 (caddr_t)&blksize, 0, &context)))
2513 return (error);
2514
2515 mp->mnt_devblocksize = blksize;
2516
2517 if (VNOP_IOCTL(devvp, DKIOCISVIRTUAL, (caddr_t)&isvirtual, 0, &context) == 0) {
55e303ae
A
2518 if (isvirtual)
2519 mp->mnt_kern_flag |= MNTK_VIRTUALDEV;
2520 }
2521
91447636
A
2522 if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXBLOCKCOUNTREAD,
2523 (caddr_t)&readblockcnt, 0, &context)))
0b4e3aa0
A
2524 return (error);
2525
91447636
A
2526 if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXBLOCKCOUNTWRITE,
2527 (caddr_t)&writeblockcnt, 0, &context)))
55e303ae
A
2528 return (error);
2529
91447636
A
2530 if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXBYTECOUNTREAD,
2531 (caddr_t)&readmaxcnt, 0, &context)))
55e303ae
A
2532 return (error);
2533
91447636
A
2534 if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXBYTECOUNTWRITE,
2535 (caddr_t)&writemaxcnt, 0, &context)))
0b4e3aa0
A
2536 return (error);
2537
91447636
A
2538 if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXSEGMENTCOUNTREAD,
2539 (caddr_t)&readsegcnt, 0, &context)))
0b4e3aa0
A
2540 return (error);
2541
91447636
A
2542 if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXSEGMENTCOUNTWRITE,
2543 (caddr_t)&writesegcnt, 0, &context)))
55e303ae
A
2544 return (error);
2545
91447636
A
2546 if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXSEGMENTBYTECOUNTREAD,
2547 (caddr_t)&readsegsize, 0, &context)))
55e303ae
A
2548 return (error);
2549
91447636
A
2550 if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXSEGMENTBYTECOUNTWRITE,
2551 (caddr_t)&writesegsize, 0, &context)))
0b4e3aa0
A
2552 return (error);
2553
55e303ae
A
2554 if (readmaxcnt)
2555 temp = (readmaxcnt > UINT32_MAX) ? UINT32_MAX : readmaxcnt;
2556 else {
2557 if (readblockcnt) {
2558 temp = readblockcnt * blksize;
2559 temp = (temp > UINT32_MAX) ? UINT32_MAX : temp;
2560 } else
2561 temp = MAXPHYS;
2562 }
91447636 2563 mp->mnt_maxreadcnt = (u_int32_t)temp;
55e303ae
A
2564
2565 if (writemaxcnt)
2566 temp = (writemaxcnt > UINT32_MAX) ? UINT32_MAX : writemaxcnt;
2567 else {
2568 if (writeblockcnt) {
2569 temp = writeblockcnt * blksize;
2570 temp = (temp > UINT32_MAX) ? UINT32_MAX : temp;
2571 } else
2572 temp = MAXPHYS;
2573 }
0b4e3aa0
A
2574 mp->mnt_maxwritecnt = (u_int32_t)temp;
2575
55e303ae
A
2576 if (readsegcnt) {
2577 temp = (readsegcnt > UINT16_MAX) ? UINT16_MAX : readsegcnt;
2578 mp->mnt_segreadcnt = (u_int16_t)temp;
2579 }
2580 if (writesegcnt) {
2581 temp = (writesegcnt > UINT16_MAX) ? UINT16_MAX : writesegcnt;
2582 mp->mnt_segwritecnt = (u_int16_t)temp;
2583 }
2584 if (readsegsize)
2585 temp = (readsegsize > UINT32_MAX) ? UINT32_MAX : readsegsize;
2586 else
2587 temp = mp->mnt_maxreadcnt;
91447636 2588 mp->mnt_maxsegreadsize = (u_int32_t)temp;
0b4e3aa0 2589
55e303ae
A
2590 if (writesegsize)
2591 temp = (writesegsize > UINT32_MAX) ? UINT32_MAX : writesegsize;
2592 else
2593 temp = mp->mnt_maxwritecnt;
91447636 2594 mp->mnt_maxsegwritesize = (u_int32_t)temp;
0b4e3aa0 2595
55e303ae
A
2596 return (error);
2597}
2598
2599static struct klist fs_klist;
2600
2601void
2602vfs_event_init(void)
2603{
2604
2605 klist_init(&fs_klist);
2606}
2607
2608void
91447636 2609vfs_event_signal(__unused fsid_t *fsid, u_int32_t event, __unused intptr_t data)
55e303ae
A
2610{
2611
2612 KNOTE(&fs_klist, event);
2613}
2614
/*
 * return the number of mounted filesystems.
 */
static int
sysctl_vfs_getvfscnt(void)
{
	return (mount_getvfscnt());
}
2623
0b4e3aa0 2624
91447636
A
2625static int
2626mount_getvfscnt(void)
2627{
2628 int ret;
2629
2630 mount_list_lock();
2631 ret = nummounts;
2632 mount_list_unlock();
55e303ae 2633 return (ret);
91447636
A
2634
2635}
2636
2637
2638
2639static int
2640mount_fillfsids(fsid_t *fsidlst, int count)
2641{
2642 struct mount *mp;
2643 int actual=0;
2644
2645 actual = 0;
2646 mount_list_lock();
2647 TAILQ_FOREACH(mp, &mountlist, mnt_list) {
2648 if (actual <= count) {
2649 fsidlst[actual] = mp->mnt_vfsstat.f_fsid;
2650 actual++;
2651 }
2652 }
2653 mount_list_unlock();
2654 return (actual);
2655
55e303ae
A
2656}
2657
2658/*
2659 * fill in the array of fsid_t's up to a max of 'count', the actual
2660 * number filled in will be set in '*actual'. If there are more fsid_t's
2661 * than room in fsidlst then ENOMEM will be returned and '*actual' will
2662 * have the actual count.
2663 * having *actual filled out even in the error case is depended upon.
2664 */
2665static int
2666sysctl_vfs_getvfslist(fsid_t *fsidlst, int count, int *actual)
2667{
2668 struct mount *mp;
2669
2670 *actual = 0;
91447636
A
2671 mount_list_lock();
2672 TAILQ_FOREACH(mp, &mountlist, mnt_list) {
55e303ae
A
2673 (*actual)++;
2674 if (*actual <= count)
91447636 2675 fsidlst[(*actual) - 1] = mp->mnt_vfsstat.f_fsid;
55e303ae 2676 }
91447636 2677 mount_list_unlock();
55e303ae
A
2678 return (*actual <= count ? 0 : ENOMEM);
2679}
2680
2681static int
2682sysctl_vfs_vfslist SYSCTL_HANDLER_ARGS
2683{
2684 int actual, error;
2685 size_t space;
2686 fsid_t *fsidlst;
2687
2688 /* This is a readonly node. */
91447636 2689 if (req->newptr != USER_ADDR_NULL)
55e303ae
A
2690 return (EPERM);
2691
2692 /* they are querying us so just return the space required. */
91447636 2693 if (req->oldptr == USER_ADDR_NULL) {
55e303ae
A
2694 req->oldidx = sysctl_vfs_getvfscnt() * sizeof(fsid_t);
2695 return 0;
2696 }
2697again:
2698 /*
2699 * Retrieve an accurate count of the amount of space required to copy
2700 * out all the fsids in the system.
2701 */
2702 space = req->oldlen;
2703 req->oldlen = sysctl_vfs_getvfscnt() * sizeof(fsid_t);
2704
2705 /* they didn't give us enough space. */
2706 if (space < req->oldlen)
2707 return (ENOMEM);
2708
2709 MALLOC(fsidlst, fsid_t *, req->oldlen, M_TEMP, M_WAITOK);
2710 error = sysctl_vfs_getvfslist(fsidlst, req->oldlen / sizeof(fsid_t),
2711 &actual);
2712 /*
2713 * If we get back ENOMEM, then another mount has been added while we
2714 * slept in malloc above. If this is the case then try again.
2715 */
2716 if (error == ENOMEM) {
2717 FREE(fsidlst, M_TEMP);
2718 req->oldlen = space;
2719 goto again;
2720 }
2721 if (error == 0) {
2722 error = SYSCTL_OUT(req, fsidlst, actual * sizeof(fsid_t));
2723 }
2724 FREE(fsidlst, M_TEMP);
2725 return (error);
2726}
2727
2728/*
2729 * Do a sysctl by fsid.
2730 */
2731static int
2732sysctl_vfs_ctlbyfsid SYSCTL_HANDLER_ARGS
2733{
2734 struct vfsidctl vc;
91447636 2735 struct user_vfsidctl user_vc;
55e303ae 2736 struct mount *mp;
91447636 2737 struct vfsstatfs *sp;
55e303ae
A
2738 struct proc *p;
2739 int *name;
2740 int error, flags, namelen;
91447636
A
2741 struct vfs_context context;
2742 boolean_t is_64_bit;
55e303ae
A
2743
2744 name = arg1;
2745 namelen = arg2;
2746 p = req->p;
91447636
A
2747 context.vc_proc = p;
2748 context.vc_ucred = kauth_cred_get();
2749 is_64_bit = proc_is64bit(p);
55e303ae 2750
91447636
A
2751 if (is_64_bit) {
2752 error = SYSCTL_IN(req, &user_vc, sizeof(user_vc));
2753 if (error)
2754 return (error);
2755 if (user_vc.vc_vers != VFS_CTL_VERS1)
2756 return (EINVAL);
2757 mp = mount_list_lookupby_fsid(&user_vc.vc_fsid, 0, 0);
2758 }
2759 else {
2760 error = SYSCTL_IN(req, &vc, sizeof(vc));
2761 if (error)
2762 return (error);
2763 if (vc.vc_vers != VFS_CTL_VERS1)
2764 return (EINVAL);
2765 mp = mount_list_lookupby_fsid(&vc.vc_fsid, 0, 0);
2766 }
55e303ae
A
2767 if (mp == NULL)
2768 return (ENOENT);
2769 /* reset so that the fs specific code can fetch it. */
2770 req->newidx = 0;
2771 /*
2772 * Note if this is a VFS_CTL then we pass the actual sysctl req
2773 * in for "oldp" so that the lower layer can DTRT and use the
2774 * SYSCTL_IN/OUT routines.
2775 */
2776 if (mp->mnt_op->vfs_sysctl != NULL) {
91447636
A
2777 if (is_64_bit) {
2778 if (vfs_64bitready(mp)) {
2779 error = mp->mnt_op->vfs_sysctl(name, namelen,
2780 CAST_USER_ADDR_T(req),
2781 NULL, USER_ADDR_NULL, 0,
2782 &context);
2783 }
2784 else {
2785 error = ENOTSUP;
2786 }
2787 }
2788 else {
2789 error = mp->mnt_op->vfs_sysctl(name, namelen,
2790 CAST_USER_ADDR_T(req),
2791 NULL, USER_ADDR_NULL, 0,
2792 &context);
2793 }
2794 if (error != ENOTSUP)
55e303ae
A
2795 return (error);
2796 }
2797 switch (name[0]) {
2798 case VFS_CTL_UMOUNT:
91447636
A
2799 req->newidx = 0;
2800 if (is_64_bit) {
2801 req->newptr = user_vc.vc_ptr;
2802 req->newlen = (size_t)user_vc.vc_len;
2803 }
2804 else {
2805 req->newptr = CAST_USER_ADDR_T(vc.vc_ptr);
2806 req->newlen = vc.vc_len;
2807 }
55e303ae
A
2808 error = SYSCTL_IN(req, &flags, sizeof(flags));
2809 if (error)
2810 break;
2811 error = safedounmount(mp, flags, p);
2812 break;
2813 case VFS_CTL_STATFS:
91447636
A
2814 req->newidx = 0;
2815 if (is_64_bit) {
2816 req->newptr = user_vc.vc_ptr;
2817 req->newlen = (size_t)user_vc.vc_len;
2818 }
2819 else {
2820 req->newptr = CAST_USER_ADDR_T(vc.vc_ptr);
2821 req->newlen = vc.vc_len;
2822 }
55e303ae
A
2823 error = SYSCTL_IN(req, &flags, sizeof(flags));
2824 if (error)
2825 break;
91447636 2826 sp = &mp->mnt_vfsstat;
55e303ae 2827 if (((flags & MNT_NOWAIT) == 0 || (flags & MNT_WAIT)) &&
91447636 2828 (error = vfs_update_vfsstat(mp, &context)))
55e303ae 2829 return (error);
91447636
A
2830 if (is_64_bit) {
2831 struct user_statfs sfs;
2832 bzero(&sfs, sizeof(sfs));
2833 sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
2834 sfs.f_type = mp->mnt_vtable->vfc_typenum;
2835 sfs.f_bsize = (user_long_t)sp->f_bsize;
2836 sfs.f_iosize = (user_long_t)sp->f_iosize;
2837 sfs.f_blocks = (user_long_t)sp->f_blocks;
2838 sfs.f_bfree = (user_long_t)sp->f_bfree;
2839 sfs.f_bavail = (user_long_t)sp->f_bavail;
2840 sfs.f_files = (user_long_t)sp->f_files;
2841 sfs.f_ffree = (user_long_t)sp->f_ffree;
2842 sfs.f_fsid = sp->f_fsid;
2843 sfs.f_owner = sp->f_owner;
2844
2845 strncpy(&sfs.f_fstypename, &sp->f_fstypename, MFSNAMELEN-1);
2846 strncpy(&sfs.f_mntonname, &sp->f_mntonname, MNAMELEN-1);
2847 strncpy(&sfs.f_mntfromname, &sp->f_mntfromname, MNAMELEN-1);
2848
2849 error = SYSCTL_OUT(req, &sfs, sizeof(sfs));
2850 }
2851 else {
2852 struct statfs sfs;
2853 bzero(&sfs, sizeof(struct statfs));
2854 sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
2855 sfs.f_type = mp->mnt_vtable->vfc_typenum;
2856
2857 /*
2858 * It's possible for there to be more than 2^^31 blocks in the filesystem, so we
2859 * have to fudge the numbers here in that case. We inflate the blocksize in order
2860 * to reflect the filesystem size as best we can.
2861 */
2862 if (sp->f_blocks > LONG_MAX) {
2863 int shift;
2864
2865 /*
2866 * Work out how far we have to shift the block count down to make it fit.
2867 * Note that it's possible to have to shift so far that the resulting
2868 * blocksize would be unreportably large. At that point, we will clip
2869 * any values that don't fit.
2870 *
2871 * For safety's sake, we also ensure that f_iosize is never reported as
2872 * being smaller than f_bsize.
2873 */
2874 for (shift = 0; shift < 32; shift++) {
2875 if ((sp->f_blocks >> shift) <= LONG_MAX)
2876 break;
2877 if ((sp->f_bsize << (shift + 1)) > LONG_MAX)
2878 break;
2879 }
2880#define __SHIFT_OR_CLIP(x, s) ((((x) >> (s)) > LONG_MAX) ? LONG_MAX : ((x) >> (s)))
2881 sfs.f_blocks = (long)__SHIFT_OR_CLIP(sp->f_blocks, shift);
2882 sfs.f_bfree = (long)__SHIFT_OR_CLIP(sp->f_bfree, shift);
2883 sfs.f_bavail = (long)__SHIFT_OR_CLIP(sp->f_bavail, shift);
2884#undef __SHIFT_OR_CLIP
2885 sfs.f_bsize = (long)(sp->f_bsize << shift);
2886 sfs.f_iosize = lmax(sp->f_iosize, sp->f_bsize);
2887 } else {
2888 sfs.f_bsize = (long)sp->f_bsize;
2889 sfs.f_iosize = (long)sp->f_iosize;
2890 sfs.f_blocks = (long)sp->f_blocks;
2891 sfs.f_bfree = (long)sp->f_bfree;
2892 sfs.f_bavail = (long)sp->f_bavail;
2893 }
2894 sfs.f_files = (long)sp->f_files;
2895 sfs.f_ffree = (long)sp->f_ffree;
2896 sfs.f_fsid = sp->f_fsid;
2897 sfs.f_owner = sp->f_owner;
2898
2899 strncpy(&sfs.f_fstypename, &sp->f_fstypename, MFSNAMELEN-1);
2900 strncpy(&sfs.f_mntonname, &sp->f_mntonname, MNAMELEN-1);
2901 strncpy(&sfs.f_mntfromname, &sp->f_mntfromname, MNAMELEN-1);
2902
2903 error = SYSCTL_OUT(req, &sfs, sizeof(sfs));
2904 }
55e303ae
A
2905 break;
2906 default:
91447636 2907 return (ENOTSUP);
55e303ae 2908 }
0b4e3aa0
A
2909 return (error);
2910}
2911
55e303ae
A
2912static int filt_fsattach(struct knote *kn);
2913static void filt_fsdetach(struct knote *kn);
2914static int filt_fsevent(struct knote *kn, long hint);
2915
2916struct filterops fs_filtops =
2917 { 0, filt_fsattach, filt_fsdetach, filt_fsevent };
2918
2919static int
2920filt_fsattach(struct knote *kn)
2921{
2922
2923 kn->kn_flags |= EV_CLEAR;
2924 KNOTE_ATTACH(&fs_klist, kn);
2925 return (0);
2926}
2927
2928static void
2929filt_fsdetach(struct knote *kn)
2930{
2931
2932 KNOTE_DETACH(&fs_klist, kn);
2933}
2934
2935static int
2936filt_fsevent(struct knote *kn, long hint)
2937{
2938
2939 kn->kn_fflags |= hint;
2940 return (kn->kn_fflags != 0);
2941}
2942
2943static int
2944sysctl_vfs_noremotehang SYSCTL_HANDLER_ARGS
2945{
2946 int out, error;
2947 pid_t pid;
2948 size_t space;
2949 struct proc *p;
2950
2951 /* We need a pid. */
91447636 2952 if (req->newptr == USER_ADDR_NULL)
55e303ae
A
2953 return (EINVAL);
2954
2955 error = SYSCTL_IN(req, &pid, sizeof(pid));
2956 if (error)
2957 return (error);
2958
2959 p = pfind(pid < 0 ? -pid : pid);
2960 if (p == NULL)
2961 return (ESRCH);
2962
2963 /*
2964 * Fetching the value is ok, but we only fetch if the old
2965 * pointer is given.
2966 */
91447636 2967 if (req->oldptr != USER_ADDR_NULL) {
55e303ae
A
2968 out = !((p->p_flag & P_NOREMOTEHANG) == 0);
2969 error = SYSCTL_OUT(req, &out, sizeof(out));
2970 return (error);
2971 }
2972
89b3af67 2973 /* XXX req->p->p_ucred -> kauth_cred_get() - current unsafe ??? */
55e303ae
A
2974 /* cansignal offers us enough security. */
2975 if (p != req->p && suser(req->p->p_ucred, &req->p->p_acflag) != 0)
2976 return (EPERM);
2977
2978 if (pid < 0)
2979 p->p_flag &= ~P_NOREMOTEHANG;
2980 else
2981 p->p_flag |= P_NOREMOTEHANG;
2982
2983 return (0);
2984}
2985/* the vfs.generic. branch. */
2986SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RW, 0, "vfs generic hinge");
/* retrieve a list of mounted filesystem fsid_t */
2988SYSCTL_PROC(_vfs_generic, OID_AUTO, vfsidlist, CTLFLAG_RD,
2989 0, 0, sysctl_vfs_vfslist, "S,fsid", "List of mounted filesystem ids");
2990/* perform operations on filesystem via fsid_t */
2991SYSCTL_NODE(_vfs_generic, OID_AUTO, ctlbyfsid, CTLFLAG_RW,
2992 sysctl_vfs_ctlbyfsid, "ctlbyfsid");
2993SYSCTL_PROC(_vfs_generic, OID_AUTO, noremotehang, CTLFLAG_RW,
2994 0, 0, sysctl_vfs_noremotehang, "I", "noremotehang");
91447636
A
2995
2996
2997int num_reusedvnodes=0;
55e303ae 2998
91447636
A
2999static int
3000new_vnode(vnode_t *vpp)
3001{
3002 vnode_t vp;
3003 int retries = 0; /* retry incase of tablefull */
3004 int vpid;
3005 struct timespec ts;
3006
3007retry:
3008 vnode_list_lock();
3009
3010 if ( !TAILQ_EMPTY(&vnode_free_list)) {
3011 /*
3012 * Pick the first vp for possible reuse
3013 */
3014 vp = TAILQ_FIRST(&vnode_free_list);
3015
3016 if (vp->v_lflag & VL_DEAD)
3017 goto steal_this_vp;
3018 } else
3019 vp = NULL;
3020
3021 /*
3022 * we're either empty, or the next guy on the
3023 * list is a valid vnode... if we're under the
3024 * limit, we'll create a new vnode
3025 */
3026 if (numvnodes < desiredvnodes) {
3027 numvnodes++;
3028 vnode_list_unlock();
3029 MALLOC_ZONE(vp, struct vnode *, sizeof *vp, M_VNODE, M_WAITOK);
3030 bzero((char *)vp, sizeof *vp);
3031 VLISTNONE(vp); /* avoid double queue removal */
3032 lck_mtx_init(&vp->v_lock, vnode_lck_grp, vnode_lck_attr);
3033
3034 nanouptime(&ts);
3035 vp->v_id = ts.tv_nsec;
3036 vp->v_flag = VSTANDARD;
3037
3038 goto done;
3039 }
3040 if (vp == NULL) {
3041 /*
3042 * we've reached the system imposed maximum number of vnodes
3043 * but there isn't a single one available
3044 * wait a bit and then retry... if we can't get a vnode
3045 * after 100 retries, than log a complaint
3046 */
3047 if (++retries <= 100) {
3048 vnode_list_unlock();
3049 IOSleep(1);
3050 goto retry;
3051 }
3052
3053 vnode_list_unlock();
3054 tablefull("vnode");
3055 log(LOG_EMERG, "%d desired, %d numvnodes, "
3056 "%d free, %d inactive\n",
3057 desiredvnodes, numvnodes, freevnodes, inactivevnodes);
3058 *vpp = 0;
3059 return (ENFILE);
3060 }
3061steal_this_vp:
3062 vpid = vp->v_id;
3063
3064 VREMFREE("new_vnode", vp);
3065 VLISTNONE(vp);
3066
3067 vnode_list_unlock();
3068 vnode_lock(vp);
3069
3070 /*
3071 * We could wait for the vnode_lock after removing the vp from the freelist
3072 * and the vid is bumped only at the very end of reclaim. So it is possible
3073 * that we are looking at a vnode that is being terminated. If so skip it.
3074 */
3075 if ((vpid != vp->v_id) || (vp->v_usecount != 0) || (vp->v_iocount != 0) ||
3076 VONLIST(vp) || (vp->v_lflag & VL_TERMINATE)) {
3077 /*
3078 * we lost the race between dropping the list lock
3079 * and picking up the vnode_lock... someone else
3080 * used this vnode and it is now in a new state
3081 * so we need to go back and try again
3082 */
3083 vnode_unlock(vp);
3084 goto retry;
3085 }
3086 if ( (vp->v_lflag & (VL_NEEDINACTIVE | VL_MARKTERM)) == VL_NEEDINACTIVE ) {
3087 /*
3088 * we did a vnode_rele_ext that asked for
3089 * us not to reenter the filesystem during
3090 * the release even though VL_NEEDINACTIVE was
3091 * set... we'll do it here by doing a
3092 * vnode_get/vnode_put
3093 *
3094 * pick up an iocount so that we can call
3095 * vnode_put and drive the VNOP_INACTIVE...
3096 * vnode_put will either leave us off
3097 * the freelist if a new ref comes in,
3098 * or put us back on the end of the freelist
3099 * or recycle us if we were marked for termination...
3100 * so we'll just go grab a new candidate
3101 */
3102 vp->v_iocount++;
3103#ifdef JOE_DEBUG
3104 record_vp(vp, 1);
3105#endif
3106 vnode_put_locked(vp);
3107 vnode_unlock(vp);
3108 goto retry;
3109 }
3110 OSAddAtomic(1, &num_reusedvnodes);
3111
3112 /* Checks for anyone racing us for recycle */
3113 if (vp->v_type != VBAD) {
3114 if (vp->v_lflag & VL_DEAD)
3115 panic("new_vnode: the vnode is VL_DEAD but not VBAD");
3116
3117 (void)vnode_reclaim_internal(vp, 1, 1);
3118
3119 if ((VONLIST(vp)))
3120 panic("new_vnode: vp on list ");
3121 if (vp->v_usecount || vp->v_iocount || vp->v_kusecount ||
3122 (vp->v_lflag & (VNAMED_UBC | VNAMED_MOUNT | VNAMED_FSHASH)))
3123 panic("new_vnode: free vnode still referenced\n");
3124 if ((vp->v_mntvnodes.tqe_prev != 0) && (vp->v_mntvnodes.tqe_next != 0))
3125 panic("new_vnode: vnode seems to be on mount list ");
3126 if ( !LIST_EMPTY(&vp->v_nclinks) || !LIST_EMPTY(&vp->v_ncchildren))
3127 panic("new_vnode: vnode still hooked into the name cache");
3128 }
3129 if (vp->v_unsafefs) {
3130 lck_mtx_destroy(&vp->v_unsafefs->fsnodelock, vnode_lck_grp);
3131 FREE_ZONE((void *)vp->v_unsafefs, sizeof(struct unsafe_fsnode), M_UNSAFEFS);
3132 vp->v_unsafefs = (struct unsafe_fsnode *)NULL;
3133 }
3134 vp->v_lflag = 0;
3135 vp->v_writecount = 0;
3136 vp->v_references = 0;
3137 vp->v_iterblkflags = 0;
3138 vp->v_flag = VSTANDARD;
3139 /* vbad vnodes can point to dead_mountp */
3140 vp->v_mount = 0;
3141 vp->v_defer_reclaimlist = (vnode_t)0;
3142
3143 vnode_unlock(vp);
3144done:
3145 *vpp = vp;
3146
3147 return (0);
3148}
3149
3150void
3151vnode_lock(vnode_t vp)
3152{
3153 lck_mtx_lock(&vp->v_lock);
3154}
3155
3156void
3157vnode_unlock(vnode_t vp)
3158{
3159 lck_mtx_unlock(&vp->v_lock);
3160}
3161
3162
3163
3164int
3165vnode_get(struct vnode *vp)
3166{
3167 vnode_lock(vp);
3168
3169 if ( (vp->v_iocount == 0) && (vp->v_lflag & (VL_TERMINATE | VL_DEAD)) ) {
3170 vnode_unlock(vp);
3171 return(ENOENT);
3172 }
3173 vp->v_iocount++;
3174#ifdef JOE_DEBUG
3175 record_vp(vp, 1);
3176#endif
3177 vnode_unlock(vp);
3178
3179 return(0);
3180}
3181
3182int
3183vnode_getwithvid(vnode_t vp, int vid)
3184{
3185 return(vget_internal(vp, vid, ( VNODE_NODEAD| VNODE_WITHID)));
3186}
3187
3188int
3189vnode_getwithref(vnode_t vp)
3190{
3191 return(vget_internal(vp, 0, 0));
3192}
3193
3194
3195int
3196vnode_put(vnode_t vp)
3197{
3198 int retval;
3199
3200 vnode_lock(vp);
3201 retval = vnode_put_locked(vp);
3202 vnode_unlock(vp);
3203
3204 return(retval);
3205}
3206
3207int
3208vnode_put_locked(vnode_t vp)
3209{
3210 struct vfs_context context;
3211
3212retry:
3213 if (vp->v_iocount < 1)
3214 panic("vnode_put(%x): iocount < 1", vp);
3215
3216 if ((vp->v_usecount > 0) || (vp->v_iocount > 1)) {
3217 vnode_dropiocount(vp, 1);
3218 return(0);
3219 }
3220 if ((vp->v_lflag & (VL_MARKTERM | VL_TERMINATE | VL_DEAD | VL_NEEDINACTIVE)) == VL_NEEDINACTIVE) {
3221
3222 vp->v_lflag &= ~VL_NEEDINACTIVE;
3223 vnode_unlock(vp);
3224
3225 context.vc_proc = current_proc();
3226 context.vc_ucred = kauth_cred_get();
3227 VNOP_INACTIVE(vp, &context);
3228
3229 vnode_lock(vp);
3230 /*
3231 * because we had to drop the vnode lock before calling
3232 * VNOP_INACTIVE, the state of this vnode may have changed...
3233 * we may pick up both VL_MARTERM and either
3234 * an iocount or a usecount while in the VNOP_INACTIVE call
3235 * we don't want to call vnode_reclaim_internal on a vnode
3236 * that has active references on it... so loop back around
3237 * and reevaluate the state
3238 */
3239 goto retry;
3240 }
3241 vp->v_lflag &= ~VL_NEEDINACTIVE;
3242
3243 if ((vp->v_lflag & (VL_MARKTERM | VL_TERMINATE | VL_DEAD)) == VL_MARKTERM)
3244 vnode_reclaim_internal(vp, 1, 0);
3245
3246 vnode_dropiocount(vp, 1);
3247 vnode_list_add(vp);
3248
3249 return(0);
3250}
3251
3252/* is vnode_t in use by others? */
3253int
3254vnode_isinuse(vnode_t vp, int refcnt)
3255{
3256 return(vnode_isinuse_locked(vp, refcnt, 0));
3257}
3258
3259
3260static int
3261vnode_isinuse_locked(vnode_t vp, int refcnt, int locked)
3262{
3263 int retval = 0;
3264
3265 if (!locked)
3266 vnode_lock(vp);
3267 if ((vp->v_type != VREG) && (vp->v_usecount > refcnt)) {
3268 retval = 1;
3269 goto out;
3270 }
3271 if (vp->v_type == VREG) {
3272 retval = ubc_isinuse_locked(vp, refcnt, 1);
3273 }
3274
3275out:
3276 if (!locked)
3277 vnode_unlock(vp);
3278 return(retval);
3279}
3280
3281
3282/* resume vnode_t */
3283errno_t
3284vnode_resume(vnode_t vp)
3285{
3286
3287 vnode_lock(vp);
3288
3289 if (vp->v_owner == current_thread()) {
3290 vp->v_lflag &= ~VL_SUSPENDED;
3291 vp->v_owner = 0;
3292 vnode_unlock(vp);
3293 wakeup(&vp->v_iocount);
3294 } else
3295 vnode_unlock(vp);
3296
3297 return(0);
3298}
3299
3300static errno_t
3301vnode_drain(vnode_t vp)
3302{
3303
3304 if (vp->v_lflag & VL_DRAIN) {
3305 panic("vnode_drain: recursuve drain");
3306 return(ENOENT);
3307 }
3308 vp->v_lflag |= VL_DRAIN;
3309 vp->v_owner = current_thread();
3310
3311 while (vp->v_iocount > 1)
3312 msleep(&vp->v_iocount, &vp->v_lock, PVFS, "vnode_drain", 0);
3313 return(0);
3314}
3315
3316
/*
 * if the number of recent references via vnode_getwithvid or vnode_getwithref
 * exceeds this threshold, then 'UN-AGE' the vnode by removing it from
 * the LRU list if it's currently on it... once the iocount and usecount both drop
 * to 0, it will get put back on the end of the list, effectively making it younger
 * this allows us to keep actively referenced vnodes in the list without having
 * to constantly remove and add to the list each time a vnode w/o a usecount is
 * referenced which costs us taking and dropping a global lock twice.
 */
#define UNAGE_THRESHHOLD 10

/*
 * Take an iocount (I/O reference) on vp, honoring the drain/suspend/
 * terminate gates encoded in v_lflag.
 *
 * Parameters:	locked		caller already holds the vnode lock
 *		vid		expected vnode identity; mismatch after any
 *				sleep means the caller's reference is stale
 *		vflags		VNODE_NODEAD and/or VNODE_NOSUSPEND
 *
 * Returns:	0		iocount taken
 *		ENOENT		vnode is dead, suspended, or was recycled
 */
errno_t
vnode_getiocount(vnode_t vp, int locked, int vid, int vflags)
{
	int nodead = vflags & VNODE_NODEAD;
	int nosusp = vflags & VNODE_NOSUSPEND;

	if (!locked)
		vnode_lock(vp);

	for (;;) {
		/*
		 * if it is a dead vnode with deadfs
		 */
		if (nodead && (vp->v_lflag & VL_DEAD) && ((vp->v_type == VBAD) || (vp->v_data == 0))) {
			if (!locked)
				vnode_unlock(vp);
			return(ENOENT);
		}
		/*
		 * will return VL_DEAD ones
		 */
		if ((vp->v_lflag & (VL_SUSPENDED | VL_DRAIN | VL_TERMINATE)) == 0 ) {
			break;
		}
		/*
		 * if suspended vnodes are to be failed
		 */
		if (nosusp && (vp->v_lflag & VL_SUSPENDED)) {
			if (!locked)
				vnode_unlock(vp);
			return(ENOENT);
		}
		/*
		 * if you are the owner of drain/suspend/termination , can acquire iocount
		 * check for VL_TERMINATE; it does not set owner
		 */
		if ((vp->v_lflag & (VL_DRAIN | VL_SUSPENDED | VL_TERMINATE)) &&
		    (vp->v_owner == current_thread())) {
			break;
		}
		/* block until the gate clears; msleep drops/retakes v_lock */
		if (vp->v_lflag & VL_TERMINATE) {
			vp->v_lflag |= VL_TERMWANT;

			msleep(&vp->v_lflag, &vp->v_lock, PVFS, "vnode getiocount", 0);
		} else
			msleep(&vp->v_iocount, &vp->v_lock, PVFS, "vnode_getiocount", 0);
	}
	/* identity changed while we slept: caller's reference is stale */
	if (vid != vp->v_id) {
		if (!locked)
			vnode_unlock(vp);
		return(ENOENT);
	}
	/* heavily referenced: 'un-age' by pulling off the LRU list (see above) */
	if (++vp->v_references >= UNAGE_THRESHHOLD) {
		vp->v_references = 0;
		vnode_list_remove(vp);
	}
	vp->v_iocount++;
#ifdef JOE_DEBUG
	record_vp(vp, 1);
#endif
	if (!locked)
		vnode_unlock(vp);
	return(0);
}
3392
3393static void
3394vnode_dropiocount (vnode_t vp, int locked)
3395{
3396 if (!locked)
3397 vnode_lock(vp);
3398 if (vp->v_iocount < 1)
3399 panic("vnode_dropiocount(%x): v_iocount < 1", vp);
3400
3401 vp->v_iocount--;
3402#ifdef JOE_DEBUG
3403 record_vp(vp, -1);
3404#endif
3405 if ((vp->v_lflag & (VL_DRAIN | VL_SUSPENDED)) && (vp->v_iocount <= 1))
3406 wakeup(&vp->v_iocount);
3407
3408 if (!locked)
3409 vnode_unlock(vp);
3410}
3411
3412
/*
 * Reclaim an unlocked vnode: tear down its filesystem association and
 * give it a new identity (see vnode_reclaim_internal).
 */
void
vnode_reclaim(struct vnode * vp)
{
	vnode_reclaim_internal(vp, 0, 0);
}
3418
/*
 * Reclaim a vnode: drain outstanding iocounts, have the filesystem
 * clean/reclaim it (vgone), bump v_id so stale vnode_getwithvid
 * callers fail, and return it to the free list unless 'reuse' is set.
 *
 * Parameters:	locked	caller already holds the vnode lock
 *		reuse	caller will reuse the vnode; skip the free list
 */
__private_extern__
void
vnode_reclaim_internal(struct vnode * vp, int locked, int reuse)
{
	int isfifo = 0;

	if (!locked)
		vnode_lock(vp);

	if (vp->v_lflag & VL_TERMINATE) {
		panic("vnode reclaim in progress");
	}
	vp->v_lflag |= VL_TERMINATE;

	/* vnode_drain panics rather than failing; this path is defensive */
	if (vnode_drain(vp)) {
		panic("vnode drain failed");
		vnode_unlock(vp);
		return;
	}
	isfifo = (vp->v_type == VFIFO);

	if (vp->v_type != VBAD)
		vgone(vp);		/* clean and reclaim the vnode */

	/*
	 * give the vnode a new identity so
	 * that vnode_getwithvid will fail
	 * on any stale cache accesses
	 */
	vp->v_id++;
	if (isfifo) {
		/* release the fifo bookkeeping allocated at create time */
		struct fifoinfo * fip;

		fip = vp->v_fifoinfo;
		vp->v_fifoinfo = NULL;
		FREE(fip, M_TEMP);
	}

	vp->v_type = VBAD;

	/* the filesystem must have fully detached by now */
	if (vp->v_data)
		panic("vnode_reclaim_internal: cleaned vnode isn't");
	if (vp->v_numoutput)
		panic("vnode_reclaim_internal: Clean vnode has pending I/O's");
	if (UBCINFOEXISTS(vp))
		panic("vnode_reclaim_internal: ubcinfo not cleaned");
	if (vp->v_parent)
		panic("vnode_reclaim_internal: vparent not removed");
	if (vp->v_name)
		panic("vnode_reclaim_internal: vname not removed");

	vp->v_socket = 0;

	/* clear termination state and release ownership */
	vp->v_lflag &= ~VL_TERMINATE;
	vp->v_lflag &= ~VL_DRAIN;
	vp->v_owner = 0;

	if (vp->v_lflag & VL_TERMWANT) {
		vp->v_lflag &= ~VL_TERMWANT;
		wakeup(&vp->v_lflag);
	}
	if (!reuse && vp->v_usecount == 0)
		vnode_list_add(vp);
	if (!locked)
		vnode_unlock(vp);
}
3485
/* USAGE:
 * The following api creates a vnode and associates all the parameters specified in the vnode_fsparam
 * structure and returns a vnode handle with a reference. device aliasing is handled here so checkalias
 * is obsoleted by this.
 * vnode_create(int flavor, size_t size, void * param, vnode_t *vp)
 */
int
vnode_create(int flavor, size_t size, void *data, vnode_t *vpp)
{
	int error;
	int insert = 1;
	vnode_t vp;
	vnode_t nvp;
	vnode_t dvp;
	struct  componentname *cnp;
	struct  vnode_fsparam *param = (struct vnode_fsparam *)data;

	if (flavor == VNCREATE_FLAVOR && (size == VCREATESIZE) && param) {
		if ( (error = new_vnode(&vp)) ) {
			return(error);
		} else {
			dvp = param->vnfs_dvp;
			cnp = param->vnfs_cnp;

			/* wire up the filesystem's identity; vnode starts with one iocount */
			vp->v_op = param->vnfs_vops;
			vp->v_type = param->vnfs_vtype;
			vp->v_data = param->vnfs_fsnode;
			vp->v_iocount = 1;

			if (param->vnfs_markroot)
				vp->v_flag |= VROOT;
			if (param->vnfs_marksystem)
				vp->v_flag |= VSYSTEM;
			else if (vp->v_type == VREG) {
				/*
				 * only non SYSTEM vp
				 */
				error = ubc_info_init_withsize(vp, param->vnfs_filesize);
				if (error) {
#ifdef JOE_DEBUG
					record_vp(vp, 1);
#endif
					/* UBC setup failed: mark the vnode dead and release it */
					vp->v_mount = 0;
					vp->v_op = dead_vnodeop_p;
					vp->v_tag = VT_NON;
					vp->v_data = NULL;
					vp->v_type = VBAD;
					vp->v_lflag |= VL_DEAD;

					vnode_put(vp);
					return(error);
				}
			}
#ifdef JOE_DEBUG
			record_vp(vp, 1);
#endif
			if (vp->v_type == VCHR || vp->v_type == VBLK) {

				if ( (nvp = checkalias(vp, param->vnfs_rdev)) ) {
					/*
					 * if checkalias returns a vnode, it will be locked
					 *
					 * first get rid of the unneeded vnode we acquired
					 */
					vp->v_data = NULL;
					vp->v_op = spec_vnodeop_p;
					vp->v_type = VBAD;
					vp->v_lflag = VL_DEAD;
					vp->v_data = NULL;
					vp->v_tag = VT_NON;
					vnode_put(vp);

					/*
					 * switch to aliased vnode and finish
					 * preparing it
					 */
					vp = nvp;

					vclean(vp, 0, current_proc());
					vp->v_op = param->vnfs_vops;
					vp->v_type = param->vnfs_vtype;
					vp->v_data = param->vnfs_fsnode;
					vp->v_lflag = 0;
					vp->v_mount = NULL;
					insmntque(vp, param->vnfs_mp);
					insert = 0;
					vnode_unlock(vp);
				}
			}

			if (vp->v_type == VFIFO) {
				/* fifos carry per-vnode bookkeeping, freed in vnode_reclaim_internal */
				struct fifoinfo *fip;

				MALLOC(fip, struct fifoinfo *,
					sizeof(*fip), M_TEMP, M_WAITOK);
				bzero(fip, sizeof(struct fifoinfo ));
				vp->v_fifoinfo = fip;
			}
			/* The file systems usually pass the address of the location where
			 * they store the vnode pointer. When we add the vnode to the mount
			 * point and name cache they are discoverable. So the file system node
			 * will have the connection to vnode setup by then
			 */
			*vpp = vp;

			if (param->vnfs_mp) {
					if (param->vnfs_mp->mnt_kern_flag & MNTK_LOCK_LOCAL)
						vp->v_flag |= VLOCKLOCAL;
				if (insert) {
					/*
					 * enter in mount vnode list
					 */
					insmntque(vp, param->vnfs_mp);
				}
#ifdef INTERIM_FSNODE_LOCK
				if (param->vnfs_mp->mnt_vtable->vfc_threadsafe == 0) {
					/* non-threadsafe FS: attach a per-vnode funnel substitute */
					MALLOC_ZONE(vp->v_unsafefs, struct unsafe_fsnode *,
						    sizeof(struct unsafe_fsnode), M_UNSAFEFS, M_WAITOK);
					vp->v_unsafefs->fsnode_count = 0;
					vp->v_unsafefs->fsnodeowner  = (void *)NULL;
					lck_mtx_init(&vp->v_unsafefs->fsnodelock, vnode_lck_grp, vnode_lck_attr);
				}
#endif /* INTERIM_FSNODE_LOCK */
			}
			if (dvp && vnode_ref(dvp) == 0) {
				vp->v_parent = dvp;
			}
			if (cnp) {
				if (dvp && ((param->vnfs_flags & (VNFS_NOCACHE | VNFS_CANTCACHE)) == 0)) {
					/*
					 * enter into name cache
					 * we've got the info to enter it into the name cache now
					 */
					cache_enter(dvp, vp, cnp);
				}
				vp->v_name = vfs_addname(cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_hash, 0);
			}
			if ((param->vnfs_flags & VNFS_CANTCACHE) == 0) {
				/*
				 * this vnode is being created as cacheable in the name cache
				 * this allows us to re-enter it in the cache
				 */
				vp->v_flag |= VNCACHEABLE;
			}
			if ((vp->v_flag & VSYSTEM) && (vp->v_type != VREG))
				panic("incorrect vnode setup");

			return(0);
		}
	}
	return (EINVAL);
}
3638
/*
 * Mark vp as holding a named filesystem hash reference (VNAMED_FSHASH).
 * The vnode must not already carry one and must not be on the free list.
 */
int
vnode_addfsref(vnode_t vp)
{
	vnode_lock(vp);
	if (vp->v_lflag & VNAMED_FSHASH)
		panic("add_fsref: vp already has named reference");
	/* 0xdeadb: presumably the "not on free list" sentinel for tqe_prev -- confirm */
	if ((vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb))
	        panic("addfsref: vp on the free list\n");
	vp->v_lflag |= VNAMED_FSHASH;
	vnode_unlock(vp);
	return(0);

}
/*
 * Drop the named filesystem hash reference previously taken with
 * vnode_addfsref(); panics if none is held.
 */
int
vnode_removefsref(vnode_t vp)
{
	vnode_lock(vp);
	if ((vp->v_lflag & VNAMED_FSHASH) == 0)
		panic("remove_fsref: no named reference");
	vp->v_lflag &= ~VNAMED_FSHASH;
	vnode_unlock(vp);
	return(0);

}
3663
3664
3665int
3666vfs_iterate(__unused int flags, int (*callout)(mount_t, void *), void *arg)
3667{
3668 mount_t mp;
3669 int ret = 0;
3670 fsid_t * fsid_list;
3671 int count, actualcount, i;
3672 void * allocmem;
3673
3674 count = mount_getvfscnt();
3675 count += 10;
3676
3677 fsid_list = (fsid_t *)kalloc(count * sizeof(fsid_t));
3678 allocmem = (void *)fsid_list;
3679
3680 actualcount = mount_fillfsids(fsid_list, count);
3681
3682 for (i=0; i< actualcount; i++) {
3683
3684 /* obtain the mount point with iteration reference */
3685 mp = mount_list_lookupby_fsid(&fsid_list[i], 0, 1);
3686
3687 if(mp == (struct mount *)0)
3688 continue;
3689 mount_lock(mp);
3690 if (mp->mnt_lflag & (MNT_LDEAD | MNT_LUNMOUNT)) {
3691 mount_unlock(mp);
3692 mount_iterdrop(mp);
3693 continue;
3694
3695 }
3696 mount_unlock(mp);
3697
3698 /* iterate over all the vnodes */
3699 ret = callout(mp, arg);
3700
3701 mount_iterdrop(mp);
3702
3703 switch (ret) {
3704 case VFS_RETURNED:
3705 case VFS_RETURNED_DONE:
3706 if (ret == VFS_RETURNED_DONE) {
3707 ret = 0;
3708 goto out;
3709 }
3710 break;
3711
3712 case VFS_CLAIMED_DONE:
3713 ret = 0;
3714 goto out;
3715 case VFS_CLAIMED:
3716 default:
3717 break;
3718 }
3719 ret = 0;
3720 }
3721
3722out:
3723 kfree(allocmem, (count * sizeof(fsid_t)));
3724 return (ret);
3725}
3726
/*
 * Update the vfsstatfs structure in the mountpoint.
 *
 * Queries the filesystem via vfs_getattr() and copies the supported
 * attributes into mp->mnt_vfsstat, substituting sensible defaults
 * for f_bsize and f_iosize when the FS does not provide them.
 *
 * Returns: 0 on success, or the error from vfs_getattr().
 */
int
vfs_update_vfsstat(mount_t mp, vfs_context_t ctx)
{
	struct vfs_attr	va;
	int		error;

	/*
	 * Request the attributes we want to propagate into
	 * the per-mount vfsstat structure.
	 */
	VFSATTR_INIT(&va);
	VFSATTR_WANTED(&va, f_iosize);
	VFSATTR_WANTED(&va, f_blocks);
	VFSATTR_WANTED(&va, f_bfree);
	VFSATTR_WANTED(&va, f_bavail);
	VFSATTR_WANTED(&va, f_bused);
	VFSATTR_WANTED(&va, f_files);
	VFSATTR_WANTED(&va, f_ffree);
	VFSATTR_WANTED(&va, f_bsize);
	VFSATTR_WANTED(&va, f_fssubtype);
	if ((error = vfs_getattr(mp, &va, ctx)) != 0) {
		KAUTH_DEBUG("STAT - filesystem returned error %d", error);
		return(error);
	}

	/*
	 * Unpack into the per-mount structure.
	 *
	 * We only overwrite these fields, which are likely to change:
	 *	f_blocks
	 *	f_bfree
	 *	f_bavail
	 *	f_bused
	 *	f_files
	 *	f_ffree
	 *
	 * And these which are not, but which the FS has no other way
	 * of providing to us:
	 *	f_bsize
	 *	f_iosize
	 *	f_fssubtype
	 *
	 */
	if (VFSATTR_IS_SUPPORTED(&va, f_bsize)) {
		mp->mnt_vfsstat.f_bsize = va.f_bsize;
	} else {
		mp->mnt_vfsstat.f_bsize = mp->mnt_devblocksize;	/* default from the device block size */
	}
	if (VFSATTR_IS_SUPPORTED(&va, f_iosize)) {
		mp->mnt_vfsstat.f_iosize = va.f_iosize;
	} else {
		mp->mnt_vfsstat.f_iosize = 1024 * 1024;		/* 1MB sensible I/O size */
	}
	if (VFSATTR_IS_SUPPORTED(&va, f_blocks))
		mp->mnt_vfsstat.f_blocks = va.f_blocks;
	if (VFSATTR_IS_SUPPORTED(&va, f_bfree))
		mp->mnt_vfsstat.f_bfree = va.f_bfree;
	if (VFSATTR_IS_SUPPORTED(&va, f_bavail))
		mp->mnt_vfsstat.f_bavail = va.f_bavail;
	if (VFSATTR_IS_SUPPORTED(&va, f_bused))
		mp->mnt_vfsstat.f_bused = va.f_bused;
	if (VFSATTR_IS_SUPPORTED(&va, f_files))
		mp->mnt_vfsstat.f_files = va.f_files;
	if (VFSATTR_IS_SUPPORTED(&va, f_ffree))
		mp->mnt_vfsstat.f_ffree = va.f_ffree;

	/* this is unlikely to change, but has to be queried for */
	if (VFSATTR_IS_SUPPORTED(&va, f_fssubtype))
		mp->mnt_vfsstat.f_fssubtype = va.f_fssubtype;

	return(0);
}
3802
/*
 * Append mp to the global mount list under the mount list lock
 * and bump the mount count.
 */
void
mount_list_add(mount_t mp)
{
	mount_list_lock();
	TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list);
	nummounts++;
	mount_list_unlock();
}
3811
/*
 * Unlink mp from the global mount list under the mount list lock,
 * poisoning its list linkage so stale traversals fault early.
 */
void
mount_list_remove(mount_t mp)
{
	mount_list_lock();
	TAILQ_REMOVE(&mountlist, mp, mnt_list);
	nummounts--;
	mp->mnt_list.tqe_next = 0;
	mp->mnt_list.tqe_prev = 0;
	mount_list_unlock();
}
3822
/*
 * Look up a mount point by its volfs id (first word of the fsid).
 *
 * Parameters:	withref		if set, additionally take an iteration
 *				reference and require vfs_busy() to
 *				succeed; the iteration reference is
 *				dropped before returning.
 *
 * Returns the mount_t, or (mount_t)0 if not found / not busyable.
 */
mount_t
mount_lookupby_volfsid(int volfs_id, int withref)
{
	mount_t cur_mount = (mount_t)0;
	mount_t mp ;

	mount_list_lock();
	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
		if (validfsnode(mp) && mp->mnt_vfsstat.f_fsid.val[0] == volfs_id) {
			cur_mount = mp;
			if (withref) {
				if (mount_iterref(cur_mount, 1))  {
					cur_mount = (mount_t)0;
					mount_list_unlock();
					goto out;
				}
			}
			break;
		}
	}
	mount_list_unlock();
	if (withref && (cur_mount != (mount_t)0)) {
		/* re-validate outside the list lock: mount must be busyable */
		mp = cur_mount;
		if (vfs_busy(mp, LK_NOWAIT) != 0) {
			cur_mount = (mount_t)0;
		}
		mount_iterdrop(mp);
	}
out:
	return(cur_mount);
}
3854
3855
3856mount_t
3857mount_list_lookupby_fsid(fsid, locked, withref)
3858 fsid_t *fsid;
3859 int locked;
3860 int withref;
3861{
3862 mount_t retmp = (mount_t)0;
3863 mount_t mp;
3864
3865 if (!locked)
3866 mount_list_lock();
3867 TAILQ_FOREACH(mp, &mountlist, mnt_list)
3868 if (mp->mnt_vfsstat.f_fsid.val[0] == fsid->val[0] &&
3869 mp->mnt_vfsstat.f_fsid.val[1] == fsid->val[1]) {
3870 retmp = mp;
3871 if (withref) {
3872 if (mount_iterref(retmp, 1))
3873 retmp = (mount_t)0;
3874 }
3875 goto out;
3876 }
3877out:
3878 if (!locked)
3879 mount_list_unlock();
3880 return (retmp);
3881}
3882
3883errno_t
3884vnode_lookup(const char *path, int flags, vnode_t *vpp, vfs_context_t context)
3885{
3886 struct nameidata nd;
3887 int error;
3888 struct vfs_context context2;
3889 vfs_context_t ctx = context;
3890 u_long ndflags = 0;
3891
3892 if (context == NULL) { /* XXX technically an error */
3893 context2.vc_proc = current_proc();
3894 context2.vc_ucred = kauth_cred_get();
3895 ctx = &context2;
3896 }
3897
3898 if (flags & VNODE_LOOKUP_NOFOLLOW)
3899 ndflags = NOFOLLOW;
3900 else
3901 ndflags = FOLLOW;
3902
3903 if (flags & VNODE_LOOKUP_NOCROSSMOUNT)
3904 ndflags |= NOCROSSMOUNT;
3905 if (flags & VNODE_LOOKUP_DOWHITEOUT)
3906 ndflags |= DOWHITEOUT;
3907
3908 /* XXX AUDITVNPATH1 needed ? */
3909 NDINIT(&nd, LOOKUP, ndflags, UIO_SYSSPACE, CAST_USER_ADDR_T(path), ctx);
3910
3911 if ((error = namei(&nd)))
3912 return (error);
3913 *vpp = nd.ni_vp;
3914 nameidone(&nd);
3915
3916 return (0);
3917}
3918
/*
 * Open (creating if requested via fmode/cmode) the file at 'path' and
 * return its vnode in *vpp with an iocount.  O_NOFOLLOW in fmode is
 * mapped onto VNODE_LOOKUP_NOFOLLOW; the VNODE_LOOKUP_* bits in
 * 'flags' are honored as in vnode_lookup().  A NULL context is
 * tolerated by fabricating one from the current thread.
 */
errno_t
vnode_open(const char *path, int fmode, int cmode, int flags, vnode_t *vpp, vfs_context_t context)
{
	struct nameidata nd;
	int error;
	struct vfs_context context2;
	vfs_context_t ctx = context;
	u_long ndflags = 0;
	int lflags = flags;

	if (context == NULL) {		/* XXX technically an error */
		context2.vc_proc = current_proc();
		context2.vc_ucred = kauth_cred_get();
		ctx = &context2;
	}

	/* O_NOFOLLOW at open time implies a no-follow lookup */
	if (fmode & O_NOFOLLOW)
		lflags |= VNODE_LOOKUP_NOFOLLOW;

	if (lflags & VNODE_LOOKUP_NOFOLLOW)
		ndflags = NOFOLLOW;
	else
		ndflags = FOLLOW;

	if (lflags & VNODE_LOOKUP_NOCROSSMOUNT)
		ndflags |= NOCROSSMOUNT;
	if (lflags & VNODE_LOOKUP_DOWHITEOUT)
		ndflags |= DOWHITEOUT;

	/* XXX AUDITVNPATH1 needed ? */
	NDINIT(&nd, LOOKUP, ndflags, UIO_SYSSPACE, CAST_USER_ADDR_T(path), ctx);

	if ((error = vn_open(&nd, fmode, cmode)))
		*vpp = NULL;
	else
		*vpp = nd.ni_vp;

	return (error);
}
3958
3959errno_t
3960vnode_close(vnode_t vp, int flags, vfs_context_t context)
3961{
3962 kauth_cred_t cred;
3963 struct proc *p;
3964 int error;
3965
3966 if (context) {
3967 p = context->vc_proc;
3968 cred = context->vc_ucred;
3969 } else {
3970 p = current_proc();
3971 cred = kauth_cred_get();
3972 }
3973
3974 error = vn_close(vp, flags, cred, p);
3975 vnode_put(vp);
3976 return (error);
3977}
3978
3979errno_t
3980vnode_size(vnode_t vp, off_t *sizep, vfs_context_t ctx)
3981{
3982 struct vnode_attr va;
3983 int error;
3984
3985 VATTR_INIT(&va);
3986 VATTR_WANTED(&va, va_data_size);
3987 error = vnode_getattr(vp, &va, ctx);
3988 if (!error)
3989 *sizep = va.va_data_size;
3990 return(error);
3991}
3992
/*
 * Set the data size of vp (truncate/extend) via vnode_setattr().
 * The low 16 bits of 'ioflag' are passed through as va_vaflags.
 */
errno_t
vnode_setsize(vnode_t vp, off_t size, int ioflag, vfs_context_t ctx)
{
	struct vnode_attr	va;

	VATTR_INIT(&va);
	VATTR_SET(&va, va_data_size, size);
	va.va_vaflags = ioflag & 0xffff;
	return(vnode_setattr(vp, &va, ctx));
}
4003
/*
 * Create a filesystem object of arbitrary type with arbitrary attributes in
 * the specified directory with the specified name.
 *
 * Parameters:	dvp			Pointer to the vnode of the directory
 *					in which to create the object.
 *		vpp			Pointer to the area into which to
 *					return the vnode of the created object.
 *		cnp			Component name pointer from the namei
 *					data structure, containing the name to
 *					use for the created object.
 *		vap			Pointer to the vnode_attr structure
 *					describing the object to be created,
 *					including the type of object.
 *		flags			VN_* flags controlling ACL inheritance
 *					and whether or not authorization is to
 *					be required for the operation.
 *
 * Returns:	0			Success
 *		!0			errno value
 *
 * Implicit:	*vpp			Contains the vnode of the object that
 *					was created, if successful.
 *		*cnp			May be modified by the underlying VFS.
 *		*vap			May be modified by the underlying VFS;
 *					in particular, the filesec may be
 *					modified by either ACL inheritance or
 *					the filesystem, so the caller's copy
 *					should be assumed to have been
 *					modified, even if the operation is
 *					unsuccessful.
 *
 * Notes:	The kauth_filesec_t in 'vap', if any, is in host byte order.
 *
 *		Modification of '*cnp' and '*vap' by the underlying VFS is
 *		strongly discouraged.
 *
 *		XXX: This function is a 'vn_*' function; it belongs in vfs_vnops.c
 *
 *		XXX: We should enumerate the possible errno values here, and where
 *		in the code they originated.
 */
4045errno_t
4046vn_create(vnode_t dvp, vnode_t *vpp, struct componentname *cnp, struct vnode_attr *vap, int flags, vfs_context_t ctx)
4047{
4048 kauth_acl_t oacl, nacl;
4049 int initial_acl;
4050 errno_t error;
4051 vnode_t vp = (vnode_t)0;
4052
4053 error = 0;
4054 oacl = nacl = NULL;
4055 initial_acl = 0;
4056
4057 KAUTH_DEBUG("%p CREATE - '%s'", dvp, cnp->cn_nameptr);
4058
4059 /*
4060 * Handle ACL inheritance.
4061 */
4062 if (!(flags & VN_CREATE_NOINHERIT) && vfs_extendedsecurity(dvp->v_mount)) {
4063 /* save the original filesec */
4064 if (VATTR_IS_ACTIVE(vap, va_acl)) {
4065 initial_acl = 1;
4066 oacl = vap->va_acl;
4067 }
4068
4069 vap->va_acl = NULL;
4070 if ((error = kauth_acl_inherit(dvp,
4071 oacl,
4072 &nacl,
4073 vap->va_type == VDIR,
4074 ctx)) != 0) {
4075 KAUTH_DEBUG("%p CREATE - error %d processing inheritance", dvp, error);
4076 return(error);
4077 }
4078
4079 /*
4080 * If the generated ACL is NULL, then we can save ourselves some effort
4081 * by clearing the active bit.
4082 */
4083 if (nacl == NULL) {
4084 VATTR_CLEAR_ACTIVE(vap, va_acl);
4085 } else {
4086 VATTR_SET(vap, va_acl, nacl);
4087 }
4088 }
4089
4090 /*
4091 * Check and default new attributes.
4092 * This will set va_uid, va_gid, va_mode and va_create_time at least, if the caller
4093 * hasn't supplied them.
4094 */
4095 if ((error = vnode_authattr_new(dvp, vap, flags & VN_CREATE_NOAUTH, ctx)) != 0) {
4096 KAUTH_DEBUG("%p CREATE - error %d handing/defaulting attributes", dvp, error);
4097 goto out;
4098 }
4099
4100
4101 /*
4102 * Create the requested node.
4103 */
4104 switch(vap->va_type) {
4105 case VREG:
4106 error = VNOP_CREATE(dvp, vpp, cnp, vap, ctx);
4107 break;
4108 case VDIR:
4109 error = VNOP_MKDIR(dvp, vpp, cnp, vap, ctx);
4110 break;
4111 case VSOCK:
4112 case VFIFO:
4113 case VBLK:
4114 case VCHR:
4115 error = VNOP_MKNOD(dvp, vpp, cnp, vap, ctx);
4116 break;
4117 default:
4118 panic("vnode_create: unknown vtype %d", vap->va_type);
4119 }
4120 if (error != 0) {
4121 KAUTH_DEBUG("%p CREATE - error %d returned by filesystem", dvp, error);
4122 goto out;
4123 }
4124
4125 vp = *vpp;
4126 /*
4127 * If some of the requested attributes weren't handled by the VNOP,
4128 * use our fallback code.
4129 */
4130 if (!VATTR_ALL_SUPPORTED(vap) && *vpp) {
4131 KAUTH_DEBUG(" CREATE - doing fallback with ACL %p", vap->va_acl);
4132 error = vnode_setattr_fallback(*vpp, vap, ctx);
4133 }
4134 if ((error != 0 ) && (vp != (vnode_t)0)) {
4135 *vpp = (vnode_t) 0;
4136 vnode_put(vp);
4137 }
4138
4139out:
4140 /*
4141 * If the caller supplied a filesec in vap, it has been replaced
4142 * now by the post-inheritance copy. We need to put the original back
4143 * and free the inherited product.
4144 */
4145 if (initial_acl) {
4146 VATTR_SET(vap, va_acl, oacl);
4147 } else {
4148 VATTR_CLEAR_ACTIVE(vap, va_acl);
4149 }
4150 if (nacl != NULL)
4151 kauth_acl_free(nacl);
4152
4153 return(error);
4154}
4155
4156static kauth_scope_t vnode_scope;
4157static int vnode_authorize_callback(kauth_cred_t credential, __unused void *idata, kauth_action_t action,
4158 uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3);
4159
/*
 * Per-authorization-request state, caching ownership and group
 * membership results for both the object vnode and its parent
 * directory so they are computed at most once per request.
 */
typedef struct _vnode_authorize_context {
	vnode_t		vp;		/* object vnode */
	struct vnode_attr	*vap;	/* attributes of vp */
	vnode_t		dvp;		/* parent directory vnode */
	struct vnode_attr	*dvap;	/* attributes of dvp */
	vfs_context_t	ctx;		/* requesting context */
	int		flags;		/* cached _VAC_* results */
	int		flags_valid;	/* which _VAC_* bits in 'flags' are valid */
#define _VAC_IS_OWNER		(1<<0)
#define _VAC_IN_GROUP		(1<<1)
#define _VAC_IS_DIR_OWNER	(1<<2)
#define _VAC_IN_DIR_GROUP	(1<<3)
} *vauth_ctx;
4173
/*
 * Register the vnode authorization scope with kauth; called once at
 * VFS initialization so vnode_authorize() can dispatch through it.
 */
void
vnode_authorize_init(void)
{
	vnode_scope = kauth_register_scope(KAUTH_SCOPE_VNODE, vnode_authorize_callback, NULL);
}
4179
/*
 * Authorize an operation on a vnode.
 *
 * This is KPI, but here because it needs vnode_scope.
 *
 * Parameters:	vp	object vnode
 *		dvp	parent directory vnode (operation-dependent)
 *		action	KAUTH_VNODE_* rights being requested
 *
 * Returns: 0 if allowed; otherwise an errno, preferring any more
 * specific error supplied by the lower layers over the generic
 * scope result (EPERM is mapped to EACCES for tradition).
 */
int
vnode_authorize(vnode_t vp, vnode_t dvp, kauth_action_t action, vfs_context_t context)
{
	int	error, result;

	/*
	 * We can't authorize against a dead vnode; allow all operations through so that
	 * the correct error can be returned.
	 */
	if (vp->v_type == VBAD)
		return(0);
	
	error = 0;
	result = kauth_authorize_action(vnode_scope, vfs_context_ucred(context), action,
	    (uintptr_t)context, (uintptr_t)vp, (uintptr_t)dvp, (uintptr_t)&error);
	if (result == EPERM)		/* traditional behaviour */
		result = EACCES;
	/* did the lower layers give a better error return? */
	if ((result != 0) && (error != 0))
		return(error);
	return(result);
}
4207
4208/*
4209 * Test for vnode immutability.
4210 *
4211 * The 'append' flag is set when the authorization request is constrained
4212 * to operations which only request the right to append to a file.
4213 *
4214 * The 'ignore' flag is set when an operation modifying the immutability flags
4215 * is being authorized. We check the system securelevel to determine which
4216 * immutability flags we can ignore.
4217 */
4218static int
4219vnode_immutable(struct vnode_attr *vap, int append, int ignore)
4220{
4221 int mask;
4222
4223 /* start with all bits precluding the operation */
4224 mask = IMMUTABLE | APPEND;
4225
4226 /* if appending only, remove the append-only bits */
4227 if (append)
4228 mask &= ~APPEND;
4229
4230 /* ignore only set when authorizing flags changes */
4231 if (ignore) {
4232 if (securelevel <= 0) {
4233 /* in insecure state, flags do not inhibit changes */
4234 mask = 0;
4235 } else {
4236 /* in secure state, user flags don't inhibit */
4237 mask &= ~(UF_IMMUTABLE | UF_APPEND);
4238 }
4239 }
4240 KAUTH_DEBUG("IMMUTABLE - file flags 0x%x mask 0x%x append = %d ignore = %d", vap->va_flags, mask, append, ignore);
4241 if ((vap->va_flags & mask) != 0)
4242 return(EPERM);
4243 return(0);
4244}
4245
4246static int
4247vauth_node_owner(struct vnode_attr *vap, kauth_cred_t cred)
4248{
4249 int result;
4250
4251 /* default assumption is not-owner */
4252 result = 0;
4253
4254 /*
4255 * If the filesystem has given us a UID, we treat this as authoritative.
4256 */
4257 if (vap && VATTR_IS_SUPPORTED(vap, va_uid)) {
4258 result = (vap->va_uid == kauth_cred_getuid(cred)) ? 1 : 0;
4259 }
4260 /* we could test the owner UUID here if we had a policy for it */
4261
4262 return(result);
4263}
4264
4265static int
4266vauth_node_group(struct vnode_attr *vap, kauth_cred_t cred, int *ismember)
4267{
4268 int error;
4269 int result;
4270
4271 error = 0;
4272 result = 0;
4273
4274 /* the caller is expected to have asked the filesystem for a group at some point */
4275 if (vap && VATTR_IS_SUPPORTED(vap, va_gid)) {
4276 error = kauth_cred_ismember_gid(cred, vap->va_gid, &result);
4277 }
4278 /* we could test the group UUID here if we had a policy for it */
4279
4280 if (!error)
4281 *ismember = result;
4282 return(error);
4283}
4284
4285static int
4286vauth_file_owner(vauth_ctx vcp)
4287{
4288 int result;
4289
4290 if (vcp->flags_valid & _VAC_IS_OWNER) {
4291 result = (vcp->flags & _VAC_IS_OWNER) ? 1 : 0;
4292 } else {
4293 result = vauth_node_owner(vcp->vap, vcp->ctx->vc_ucred);
4294
4295 /* cache our result */
4296 vcp->flags_valid |= _VAC_IS_OWNER;
4297 if (result) {
4298 vcp->flags |= _VAC_IS_OWNER;
4299 } else {
4300 vcp->flags &= ~_VAC_IS_OWNER;
4301 }
4302 }
4303 return(result);
4304}
4305
4306static int
4307vauth_file_ingroup(vauth_ctx vcp, int *ismember)
4308{
4309 int error;
4310
4311 if (vcp->flags_valid & _VAC_IN_GROUP) {
4312 *ismember = (vcp->flags & _VAC_IN_GROUP) ? 1 : 0;
4313 error = 0;
4314 } else {
4315 error = vauth_node_group(vcp->vap, vcp->ctx->vc_ucred, ismember);
4316
4317 if (!error) {
4318 /* cache our result */
4319 vcp->flags_valid |= _VAC_IN_GROUP;
4320 if (*ismember) {
4321 vcp->flags |= _VAC_IN_GROUP;
4322 } else {
4323 vcp->flags &= ~_VAC_IN_GROUP;
4324 }
4325 }
4326
4327 }
4328 return(error);
4329}
4330
4331static int
4332vauth_dir_owner(vauth_ctx vcp)
4333{
4334 int result;
4335
4336 if (vcp->flags_valid & _VAC_IS_DIR_OWNER) {
4337 result = (vcp->flags & _VAC_IS_DIR_OWNER) ? 1 : 0;
4338 } else {
4339 result = vauth_node_owner(vcp->dvap, vcp->ctx->vc_ucred);
4340
4341 /* cache our result */
4342 vcp->flags_valid |= _VAC_IS_DIR_OWNER;
4343 if (result) {
4344 vcp->flags |= _VAC_IS_DIR_OWNER;
4345 } else {
4346 vcp->flags &= ~_VAC_IS_DIR_OWNER;
4347 }
4348 }
4349 return(result);
4350}
4351
4352static int
4353vauth_dir_ingroup(vauth_ctx vcp, int *ismember)
4354{
4355 int error;
4356
4357 if (vcp->flags_valid & _VAC_IN_DIR_GROUP) {
4358 *ismember = (vcp->flags & _VAC_IN_DIR_GROUP) ? 1 : 0;
4359 error = 0;
4360 } else {
4361 error = vauth_node_group(vcp->dvap, vcp->ctx->vc_ucred, ismember);
4362
4363 if (!error) {
4364 /* cache our result */
4365 vcp->flags_valid |= _VAC_IN_DIR_GROUP;
4366 if (*ismember) {
4367 vcp->flags |= _VAC_IN_DIR_GROUP;
4368 } else {
4369 vcp->flags &= ~_VAC_IN_DIR_GROUP;
4370 }
4371 }
4372 }
4373 return(error);
4374}
4375
4376/*
4377 * Test the posix permissions in (vap) to determine whether (credential)
4378 * may perform (action)
4379 */
/*
 * Evaluate the classic Posix rwx permission bits in (vap) against the
 * requested 'action' (VREAD/VWRITE/VEXEC bits) for the credential in
 * vcp->ctx.  'on_dir' selects the directory's attributes (dvap)
 * instead of the object's (vap).
 *
 * Cheap checks are done first: if owner, group and world all agree,
 * no identity lookup is needed; ownership is checked before the more
 * expensive group-membership lookup.
 *
 * Returns 0 if permitted, EACCES if denied, or an error from the
 * group-membership lookup.
 */
static int
vnode_authorize_posix(vauth_ctx vcp, int action, int on_dir)
{
	struct vnode_attr *vap;
	int needed, error, owner_ok, group_ok, world_ok, ismember;
#ifdef KAUTH_DEBUG_ENABLE
	const char *where;
# define _SETWHERE(c)	where = c;
#else
# define _SETWHERE(c)
#endif

	/* checking file or directory? */
	if (on_dir) {
		vap = vcp->dvap;
	} else {
		vap = vcp->vap;
	}

	error = 0;

	/*
	 * We want to do as little work here as possible.  So first we check
	 * which sets of permissions grant us the access we need, and avoid checking
	 * whether specific permissions grant access when more generic ones would.
	 */

	/* owner permissions */
	needed = 0;
	if (action & VREAD)
		needed |= S_IRUSR;
	if (action & VWRITE)
		needed |= S_IWUSR;
	if (action & VEXEC)
		needed |= S_IXUSR;
	owner_ok = (needed & vap->va_mode) == needed;

	/* group permissions */
	needed = 0;
	if (action & VREAD)
		needed |= S_IRGRP;
	if (action & VWRITE)
		needed |= S_IWGRP;
	if (action & VEXEC)
		needed |= S_IXGRP;
	group_ok = (needed & vap->va_mode) == needed;

	/* world permissions */
	needed = 0;
	if (action & VREAD)
		needed |= S_IROTH;
	if (action & VWRITE)
		needed |= S_IWOTH;
	if (action & VEXEC)
		needed |= S_IXOTH;
	world_ok = (needed & vap->va_mode) == needed;

	/* If granted/denied by all three, we're done */
	if (owner_ok && group_ok && world_ok) {
		_SETWHERE("all");
		goto out;
	}
	if (!owner_ok && !group_ok && !world_ok) {
		_SETWHERE("all");
		error = EACCES;
		goto out;
	}

	/* Check ownership (relatively cheap) */
	if ((on_dir && vauth_dir_owner(vcp)) ||
	    (!on_dir && vauth_file_owner(vcp))) {
		_SETWHERE("user");
		if (!owner_ok)
			error = EACCES;
		goto out;
	}

	/* Not owner; if group and world both grant it we're done */
	if (group_ok && world_ok) {
		_SETWHERE("group/world");
		goto out;
	}
	if (!group_ok && !world_ok) {
		_SETWHERE("group/world");
		error = EACCES;
		goto out;
	}

	/* Check group membership (most expensive) */
	ismember = 0;
	if (on_dir) {
		error = vauth_dir_ingroup(vcp, &ismember);
	} else {
		error = vauth_file_ingroup(vcp, &ismember);
	}
	if (error)
		goto out;
	if (ismember) {
		_SETWHERE("group");
		if (!group_ok)
			error = EACCES;
		goto out;
	}

	/* Not owner, not in group, use world result */
	_SETWHERE("world");
	if (!world_ok)
		error = EACCES;

	/* FALLTHROUGH */

out:
	KAUTH_DEBUG("%p    %s - posix %s permissions : need %s%s%s %x have %s%s%s%s%s%s%s%s%s UID = %d file = %d,%d",
	    vcp->vp, (error == 0) ? "ALLOWED" : "DENIED", where,
	    (action & VREAD)  ? "r" : "-",
	    (action & VWRITE) ? "w" : "-",
	    (action & VEXEC)  ? "x" : "-",
	    needed,
	    (vap->va_mode & S_IRUSR) ? "r" : "-",
	    (vap->va_mode & S_IWUSR) ? "w" : "-",
	    (vap->va_mode & S_IXUSR) ? "x" : "-",
	    (vap->va_mode & S_IRGRP) ? "r" : "-",
	    (vap->va_mode & S_IWGRP) ? "w" : "-",
	    (vap->va_mode & S_IXGRP) ? "x" : "-",
	    (vap->va_mode & S_IROTH) ? "r" : "-",
	    (vap->va_mode & S_IWOTH) ? "w" : "-",
	    (vap->va_mode & S_IXOTH) ? "x" : "-",
	    kauth_cred_getuid(vcp->ctx->vc_ucred),
	    on_dir ? vcp->dvap->va_uid : vcp->vap->va_uid,
	    on_dir ? vcp->dvap->va_gid : vcp->vap->va_gid);
	return(error);
}
4512
4513/*
4514 * Authorize the deletion of the node vp from the directory dvp.
4515 *
4516 * We assume that:
4517 * - Neither the node nor the directory are immutable.
4518 * - The user is not the superuser.
4519 *
4520 * Deletion is not permitted if the directory is sticky and the caller is not owner of the
4521 * node or directory.
4522 *
4523 * If either the node grants DELETE, or the directory grants DELETE_CHILD, the node may be
4524 * deleted. If neither denies the permission, and the caller has Posix write access to the
4525 * directory, then the node may be deleted.
4526 */
4527static int
4528vnode_authorize_delete(vauth_ctx vcp)
4529{
4530 struct vnode_attr *vap = vcp->vap;
4531 struct vnode_attr *dvap = vcp->dvap;
4532 kauth_cred_t cred = vcp->ctx->vc_ucred;
4533 struct kauth_acl_eval eval;
4534 int error, delete_denied, delete_child_denied, ismember;
4535
4536 /* check the ACL on the directory */
4537 delete_child_denied = 0;
4538 if (VATTR_IS_NOT(dvap, va_acl, NULL)) {
4539 eval.ae_requested = KAUTH_VNODE_DELETE_CHILD;
4540 eval.ae_acl = &dvap->va_acl->acl_ace[0];
4541 eval.ae_count = dvap->va_acl->acl_entrycount;
4542 eval.ae_options = 0;
4543 if (vauth_dir_owner(vcp))
4544 eval.ae_options |= KAUTH_AEVAL_IS_OWNER;
4545 if ((error = vauth_dir_ingroup(vcp, &ismember)) != 0)
4546 return(error);
4547 if (ismember)
4548 eval.ae_options |= KAUTH_AEVAL_IN_GROUP;
4549 eval.ae_exp_gall = KAUTH_VNODE_GENERIC_ALL_BITS;
4550 eval.ae_exp_gread = KAUTH_VNODE_GENERIC_READ_BITS;
4551 eval.ae_exp_gwrite = KAUTH_VNODE_GENERIC_WRITE_BITS;
4552 eval.ae_exp_gexec = KAUTH_VNODE_GENERIC_EXECUTE_BITS;
4553
4554 error = kauth_acl_evaluate(cred, &eval);
4555
4556 if (error != 0) {
4557 KAUTH_DEBUG("%p ERROR during ACL processing - %d", vcp->vp, error);
4558 return(error);
4559 }
4560 if (eval.ae_result == KAUTH_RESULT_DENY)
4561 delete_child_denied = 1;
4562 if (eval.ae_result == KAUTH_RESULT_ALLOW) {
4563 KAUTH_DEBUG("%p ALLOWED - granted by directory ACL", vcp->vp);
4564 return(0);
4565 }
4566 }
4567
4568 /* check the ACL on the node */
4569 delete_denied = 0;
4570 if (VATTR_IS_NOT(vap, va_acl, NULL)) {
4571 eval.ae_requested = KAUTH_VNODE_DELETE;
4572 eval.ae_acl = &vap->va_acl->acl_ace[0];
4573 eval.ae_count = vap->va_acl->acl_entrycount;
4574 eval.ae_options = 0;
4575 if (vauth_file_owner(vcp))
4576 eval.ae_options |= KAUTH_AEVAL_IS_OWNER;
4577 if ((error = vauth_file_ingroup(vcp, &ismember)) != 0)
4578 return(error);
4579 if (ismember)
4580 eval.ae_options |= KAUTH_AEVAL_IN_GROUP;
4581 eval.ae_exp_gall = KAUTH_VNODE_GENERIC_ALL_BITS;
4582 eval.ae_exp_gread = KAUTH_VNODE_GENERIC_READ_BITS;
4583 eval.ae_exp_gwrite = KAUTH_VNODE_GENERIC_WRITE_BITS;
4584 eval.ae_exp_gexec = KAUTH_VNODE_GENERIC_EXECUTE_BITS;
4585
4586 if ((error = kauth_acl_evaluate(cred, &eval)) != 0) {
4587 KAUTH_DEBUG("%p ERROR during ACL processing - %d", vcp->vp, error);
4588 return(error);
4589 }
4590 if (eval.ae_result == KAUTH_RESULT_DENY)
4591 delete_denied = 1;
4592 if (eval.ae_result == KAUTH_RESULT_ALLOW) {
4593 KAUTH_DEBUG("%p ALLOWED - granted by file ACL", vcp->vp);
4594 return(0);
4595 }
4596 }
4597
4598 /* if denied by ACL on directory or node, return denial */
4599 if (delete_denied || delete_child_denied) {
4600 KAUTH_DEBUG("%p ALLOWED - denied by ACL", vcp->vp);
4601 return(EACCES);
4602 }
4603
4604 /* enforce sticky bit behaviour */
4605 if ((dvap->va_mode & S_ISTXT) && !vauth_file_owner(vcp) && !vauth_dir_owner(vcp)) {
4606 KAUTH_DEBUG("%p DENIED - sticky bit rules (user %d file %d dir %d)",
4607 vcp->vp, cred->cr_uid, vap->va_uid, dvap->va_uid);
4608 return(EACCES);
4609 }
4610
4611 /* check the directory */
4612 if ((error = vnode_authorize_posix(vcp, VWRITE, 1 /* on_dir */)) != 0) {
4613 KAUTH_DEBUG("%p ALLOWED - granted by posix permisssions", vcp->vp);
4614 return(error);
4615 }
4616
4617 /* not denied, must be OK */
4618 return(0);
4619}
4620
4621
4622/*
4623 * Authorize an operation based on the node's attributes.
4624 */
/*
 * acl_rights:     rights to be checked against the node's ACL and, for any
 *                 residue, the posix mode bits.
 * preauth_rights: rights the caller has already authorized elsewhere; these
 *                 are removed from the residual before posix evaluation.
 * Returns 0 if permitted, EACCES (or an evaluation errno) otherwise.
 */
static int
vnode_authorize_simple(vauth_ctx vcp, kauth_ace_rights_t acl_rights, kauth_ace_rights_t preauth_rights)
{
	struct vnode_attr	*vap = vcp->vap;
	kauth_cred_t		cred = vcp->ctx->vc_ucred;
	struct kauth_acl_eval	eval;
	int			error, ismember;
	mode_t			posix_action;

	/*
	 * If we are the file owner, we automatically have some rights.
	 *
	 * Do we need to expand this to support group ownership?
	 */
	if (vauth_file_owner(vcp))
		acl_rights &= ~(KAUTH_VNODE_WRITE_SECURITY);

	/*
	 * If we are checking both TAKE_OWNERSHIP and WRITE_SECURITY, we can
	 * mask the latter. If TAKE_OWNERSHIP is requested the caller is about to
	 * change ownership to themselves, and WRITE_SECURITY is implicitly
	 * granted to the owner. We need to do this because at this point
	 * WRITE_SECURITY may not be granted as the caller is not currently
	 * the owner.
	 */
	if ((acl_rights & KAUTH_VNODE_TAKE_OWNERSHIP) &&
	    (acl_rights & KAUTH_VNODE_WRITE_SECURITY))
		acl_rights &= ~KAUTH_VNODE_WRITE_SECURITY;

	/* nothing left to check after the implicit owner grants? */
	if (acl_rights == 0) {
		KAUTH_DEBUG("%p    ALLOWED - implicit or no rights required", vcp->vp);
		return(0);
	}

	/* if we have an ACL, evaluate it */
	if (VATTR_IS_NOT(vap, va_acl, NULL)) {
		eval.ae_requested = acl_rights;
		eval.ae_acl = &vap->va_acl->acl_ace[0];
		eval.ae_count = vap->va_acl->acl_entrycount;
		eval.ae_options = 0;
		if (vauth_file_owner(vcp))
			eval.ae_options |= KAUTH_AEVAL_IS_OWNER;
		if ((error = vauth_file_ingroup(vcp, &ismember)) != 0)
			return(error);
		if (ismember)
			eval.ae_options |= KAUTH_AEVAL_IN_GROUP;
		eval.ae_exp_gall = KAUTH_VNODE_GENERIC_ALL_BITS;
		eval.ae_exp_gread = KAUTH_VNODE_GENERIC_READ_BITS;
		eval.ae_exp_gwrite = KAUTH_VNODE_GENERIC_WRITE_BITS;
		eval.ae_exp_gexec = KAUTH_VNODE_GENERIC_EXECUTE_BITS;

		if ((error = kauth_acl_evaluate(cred, &eval)) != 0) {
			KAUTH_DEBUG("%p    ERROR during ACL processing - %d", vcp->vp, error);
			return(error);
		}

		if (eval.ae_result == KAUTH_RESULT_DENY) {
			KAUTH_DEBUG("%p    DENIED - by ACL", vcp->vp);
			return(EACCES);			/* deny, deny, counter-allege */
		}
		if (eval.ae_result == KAUTH_RESULT_ALLOW) {
			KAUTH_DEBUG("%p    ALLOWED - all rights granted by ACL", vcp->vp);
			return(0);
		}
		/*
		 * fall through and evaluate residual rights
		 * NOTE(review): ae_residual is presumed to have been filled in
		 * by kauth_acl_evaluate() on the undecided path — confirm.
		 */
	} else {
		/* no ACL, everything is residual */
		eval.ae_residual = acl_rights;
	}

	/*
	 * Grant residual rights that have been pre-authorized.
	 */
	eval.ae_residual &= ~preauth_rights;

	/*
	 * We grant WRITE_ATTRIBUTES to the owner if it hasn't been denied.
	 */
	if (vauth_file_owner(vcp))
		eval.ae_residual &= ~KAUTH_VNODE_WRITE_ATTRIBUTES;

	if (eval.ae_residual == 0) {
		KAUTH_DEBUG("%p    ALLOWED - rights already authorized", vcp->vp);
		return(0);
	}

	/*
	 * Bail if we have residual rights that can't be granted by posix permissions,
	 * or aren't presumed granted at this point.
	 *
	 * XXX these can be collapsed for performance
	 */
	if (eval.ae_residual & KAUTH_VNODE_CHANGE_OWNER) {
		KAUTH_DEBUG("%p    DENIED - CHANGE_OWNER not permitted", vcp->vp);
		return(EACCES);
	}
	if (eval.ae_residual & KAUTH_VNODE_WRITE_SECURITY) {
		KAUTH_DEBUG("%p    DENIED - WRITE_SECURITY not permitted", vcp->vp);
		return(EACCES);
	}

#if DIAGNOSTIC
	/* delete rights are handled separately in vnode_authorize_delete() */
	if (eval.ae_residual & KAUTH_VNODE_DELETE)
		panic("vnode_authorize: can't be checking delete permission here");
#endif

	/*
	 * Compute the fallback posix permissions that will satisfy the remaining
	 * rights.
	 */
	posix_action = 0;
	if (eval.ae_residual & (KAUTH_VNODE_READ_DATA |
		KAUTH_VNODE_LIST_DIRECTORY |
		KAUTH_VNODE_READ_EXTATTRIBUTES))
		posix_action |= VREAD;
	if (eval.ae_residual & (KAUTH_VNODE_WRITE_DATA |
		KAUTH_VNODE_ADD_FILE |
		KAUTH_VNODE_ADD_SUBDIRECTORY |
		KAUTH_VNODE_DELETE_CHILD |
		KAUTH_VNODE_WRITE_ATTRIBUTES |
		KAUTH_VNODE_WRITE_EXTATTRIBUTES))
		posix_action |= VWRITE;
	if (eval.ae_residual & (KAUTH_VNODE_EXECUTE |
		KAUTH_VNODE_SEARCH))
		posix_action |= VEXEC;

	if (posix_action != 0) {
		return(vnode_authorize_posix(vcp, posix_action, 0 /* !on_dir */));
	} else {
		/* residual rights with no posix mapping are granted; log them */
		KAUTH_DEBUG("%p    ALLOWED - residual rights %s%s%s%s%s%s%s%s%s%s%s%s%s%s granted due to no posix mapping",
		    vcp->vp,
		    (eval.ae_residual & KAUTH_VNODE_READ_DATA)
		    ? vnode_isdir(vcp->vp) ? " LIST_DIRECTORY" : " READ_DATA" : "",
		    (eval.ae_residual & KAUTH_VNODE_WRITE_DATA)
		    ? vnode_isdir(vcp->vp) ? " ADD_FILE" : " WRITE_DATA" : "",
		    (eval.ae_residual & KAUTH_VNODE_EXECUTE)
		    ? vnode_isdir(vcp->vp) ? " SEARCH" : " EXECUTE" : "",
		    (eval.ae_residual & KAUTH_VNODE_DELETE)
		    ? " DELETE" : "",
		    (eval.ae_residual & KAUTH_VNODE_APPEND_DATA)
		    ? vnode_isdir(vcp->vp) ? " ADD_SUBDIRECTORY" : " APPEND_DATA" : "",
		    (eval.ae_residual & KAUTH_VNODE_DELETE_CHILD)
		    ? " DELETE_CHILD" : "",
		    (eval.ae_residual & KAUTH_VNODE_READ_ATTRIBUTES)
		    ? " READ_ATTRIBUTES" : "",
		    (eval.ae_residual & KAUTH_VNODE_WRITE_ATTRIBUTES)
		    ? " WRITE_ATTRIBUTES" : "",
		    (eval.ae_residual & KAUTH_VNODE_READ_EXTATTRIBUTES)
		    ? " READ_EXTATTRIBUTES" : "",
		    (eval.ae_residual & KAUTH_VNODE_WRITE_EXTATTRIBUTES)
		    ? " WRITE_EXTATTRIBUTES" : "",
		    (eval.ae_residual & KAUTH_VNODE_READ_SECURITY)
		    ? " READ_SECURITY" : "",
		    (eval.ae_residual & KAUTH_VNODE_WRITE_SECURITY)
		    ? " WRITE_SECURITY" : "",
		    (eval.ae_residual & KAUTH_VNODE_CHECKIMMUTABLE)
		    ? " CHECKIMMUTABLE" : "",
		    (eval.ae_residual & KAUTH_VNODE_CHANGE_OWNER)
		    ? " CHANGE_OWNER" : "");
	}

	/*
	 * Lack of required Posix permissions implies no reason to deny access.
	 */
	return(0);
}
4791
4792/*
4793 * Check for file immutability.
4794 */
4795static int
4796vnode_authorize_checkimmutable(vnode_t vp, struct vnode_attr *vap, int rights, int ignore)
4797{
4798 mount_t mp;
4799 int error;
4800 int append;
4801
4802 /*
4803 * Perform immutability checks for operations that change data.
4804 *
4805 * Sockets, fifos and devices require special handling.
4806 */
4807 switch(vp->v_type) {
4808 case VSOCK:
4809 case VFIFO:
4810 case VBLK:
4811 case VCHR:
4812 /*
4813 * Writing to these nodes does not change the filesystem data,
4814 * so forget that it's being tried.
4815 */
4816 rights &= ~KAUTH_VNODE_WRITE_DATA;
4817 break;
4818 default:
4819 break;
4820 }
4821
4822 error = 0;
4823 if (rights & KAUTH_VNODE_WRITE_RIGHTS) {
4824
4825 /* check per-filesystem options if possible */
4826 mp = vnode_mount(vp);
4827 if (mp != NULL) {
4828
4829 /* check for no-EA filesystems */
4830 if ((rights & KAUTH_VNODE_WRITE_EXTATTRIBUTES) &&
4831 (vfs_flags(mp) & MNT_NOUSERXATTR)) {
4832 KAUTH_DEBUG("%p DENIED - filesystem disallowed extended attributes", vp);
4833 error = EACCES; /* User attributes disabled */
4834 goto out;
4835 }
4836 }
4837
4838 /* check for file immutability */
4839 append = 0;
4840 if (vp->v_type == VDIR) {
4841 if ((rights & (KAUTH_VNODE_ADD_FILE | KAUTH_VNODE_ADD_SUBDIRECTORY)) == rights)
4842 append = 1;
4843 } else {
4844 if ((rights & KAUTH_VNODE_APPEND_DATA) == rights)
4845 append = 1;
4846 }
4847 if ((error = vnode_immutable(vap, append, ignore)) != 0) {
4848 KAUTH_DEBUG("%p DENIED - file is immutable", vp);
4849 goto out;
4850 }
4851 }
4852out:
4853 return(error);
4854}
4855
4856/*
4857 * Handle authorization actions for filesystems that advertise that the server will
4858 * be enforcing.
4859 */
4860static int
4861vnode_authorize_opaque(vnode_t vp, int *resultp, kauth_action_t action, vfs_context_t ctx)
4862{
4863 int error;
4864
4865 /*
4866 * If the vp is a device node, socket or FIFO it actually represents a local
4867 * endpoint, so we need to handle it locally.
4868 */
4869 switch(vp->v_type) {
4870 case VBLK:
4871 case VCHR:
4872 case VSOCK:
4873 case VFIFO:
4874 return(0);
4875 default:
4876 break;
4877 }
4878
4879 /*
4880 * In the advisory request case, if the filesystem doesn't think it's reliable
4881 * we will attempt to formulate a result ourselves based on VNOP_GETATTR data.
4882 */
4883 if ((action & KAUTH_VNODE_ACCESS) && !vfs_authopaqueaccess(vnode_mount(vp)))
4884 return(0);
4885
4886 /*
4887 * Let the filesystem have a say in the matter. It's OK for it to not implemnent
4888 * VNOP_ACCESS, as most will authorise inline with the actual request.
4889 */
4890 if ((error = VNOP_ACCESS(vp, action, ctx)) != ENOTSUP) {
4891 *resultp = error;
4892 KAUTH_DEBUG("%p DENIED - opaque filesystem VNOP_ACCESS denied access", vp);
4893 return(1);
4894 }
4895
4896 /*
4897 * Typically opaque filesystems do authorisation in-line, but exec is a special case. In
4898 * order to be reasonably sure that exec will be permitted, we try a bit harder here.
4899 */
4900 if ((action & KAUTH_VNODE_EXECUTE) && vnode_isreg(vp)) {
4901 /* try a VNOP_OPEN for readonly access */
4902 if ((error = VNOP_OPEN(vp, FREAD, ctx)) != 0) {
4903 *resultp = error;
4904 KAUTH_DEBUG("%p DENIED - EXECUTE denied because file could not be opened readonly", vp);
4905 return(1);
4906 }
4907 VNOP_CLOSE(vp, FREAD, ctx);
4908 }
4909
4910 /*
4911 * We don't have any reason to believe that the request has to be denied at this point,
4912 * so go ahead and allow it.
4913 */
4914 *resultp = 0;
4915 KAUTH_DEBUG("%p ALLOWED - bypassing access check for non-local filesystem", vp);
4916 return(1);
4917}
4918
/*
 * kauth listener for the vnode scope: authorizes "action" against the vnode
 * in arg1 (with parent directory arg2 for DELETE checks) under the context
 * in arg0.  Returns KAUTH_RESULT_ALLOW or KAUTH_RESULT_DENY, storing the
 * errno through the int pointer in arg3 on denial.
 */
static int
vnode_authorize_callback(__unused kauth_cred_t unused_cred, __unused void *idata, kauth_action_t action,
    uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3)
{
	struct _vnode_authorize_context auth_context;
	vauth_ctx		vcp;
	vfs_context_t		ctx;
	vnode_t			vp, dvp;
	kauth_cred_t		cred;
	kauth_ace_rights_t	rights;
	struct vnode_attr	va, dva;
	int			result;
	int			*errorp;
	int			noimmutable;

	/* unpack the listener arguments into the local authorization context */
	vcp = &auth_context;
	ctx = vcp->ctx = (vfs_context_t)arg0;
	vp = vcp->vp = (vnode_t)arg1;
	dvp = vcp->dvp = (vnode_t)arg2;
	errorp = (int *)arg3;
	/* note that we authorize against the context, not the passed cred (the same thing anyway) */
	cred = ctx->vc_ucred;

	VATTR_INIT(&va);
	vcp->vap = &va;
	VATTR_INIT(&dva);
	vcp->dvap = &dva;

	/* reset cached ownership/membership results for this evaluation */
	vcp->flags = vcp->flags_valid = 0;

#if DIAGNOSTIC
	if ((ctx == NULL) || (vp == NULL) || (cred == NULL))
		panic("vnode_authorize: bad arguments (context %p  vp %p  cred %p)", ctx, vp, cred);
#endif

	KAUTH_DEBUG("%p    AUTH - %s %s%s%s%s%s%s%s%s%s%s%s%s%s%s%s on %s '%s' (0x%x:%p/%p)",
	    vp, vfs_context_proc(ctx)->p_comm,
	    (action & KAUTH_VNODE_ACCESS)		? "access" : "auth",
	    (action & KAUTH_VNODE_READ_DATA)		? vnode_isdir(vp) ? " LIST_DIRECTORY" : " READ_DATA" : "",
	    (action & KAUTH_VNODE_WRITE_DATA)		? vnode_isdir(vp) ? " ADD_FILE" : " WRITE_DATA" : "",
	    (action & KAUTH_VNODE_EXECUTE)		? vnode_isdir(vp) ? " SEARCH" : " EXECUTE" : "",
	    (action & KAUTH_VNODE_DELETE)		? " DELETE" : "",
	    (action & KAUTH_VNODE_APPEND_DATA)		? vnode_isdir(vp) ? " ADD_SUBDIRECTORY" : " APPEND_DATA" : "",
	    (action & KAUTH_VNODE_DELETE_CHILD)		? " DELETE_CHILD" : "",
	    (action & KAUTH_VNODE_READ_ATTRIBUTES)	? " READ_ATTRIBUTES" : "",
	    (action & KAUTH_VNODE_WRITE_ATTRIBUTES)	? " WRITE_ATTRIBUTES" : "",
	    (action & KAUTH_VNODE_READ_EXTATTRIBUTES)	? " READ_EXTATTRIBUTES" : "",
	    (action & KAUTH_VNODE_WRITE_EXTATTRIBUTES)	? " WRITE_EXTATTRIBUTES" : "",
	    (action & KAUTH_VNODE_READ_SECURITY)	? " READ_SECURITY" : "",
	    (action & KAUTH_VNODE_WRITE_SECURITY)	? " WRITE_SECURITY" : "",
	    (action & KAUTH_VNODE_CHANGE_OWNER)		? " CHANGE_OWNER" : "",
	    (action & KAUTH_VNODE_NOIMMUTABLE)		? " (noimmutable)" : "",
	    vnode_isdir(vp) ? "directory" : "file",
	    vp->v_name ? vp->v_name : "<NULL>", action, vp, dvp);

	/*
	 * Extract the control bits from the action, everything else is
	 * requested rights.
	 */
	noimmutable = (action & KAUTH_VNODE_NOIMMUTABLE) ? 1 : 0;
	rights = action & ~(KAUTH_VNODE_ACCESS | KAUTH_VNODE_NOIMMUTABLE);

	/* the directory vnode is only needed (and only valid) for DELETE */
	if (rights & KAUTH_VNODE_DELETE) {
#if DIAGNOSTIC
		if (dvp == NULL)
			panic("vnode_authorize: KAUTH_VNODE_DELETE test requires a directory");
#endif
	} else {
		dvp = NULL;
	}

	/*
	 * Check for read-only filesystems.
	 */
	if ((rights & KAUTH_VNODE_WRITE_RIGHTS) &&
	    (vp->v_mount->mnt_flag & MNT_RDONLY) &&
	    ((vp->v_type == VREG) || (vp->v_type == VDIR) ||
	     (vp->v_type == VLNK) || (vp->v_type == VCPLX) ||
	     (rights & KAUTH_VNODE_DELETE) || (rights & KAUTH_VNODE_DELETE_CHILD))) {
		result = EROFS;
		goto out;
	}

	/*
	 * Check for noexec filesystems.
	 */
	if ((rights & KAUTH_VNODE_EXECUTE) && vnode_isreg(vp) && (vp->v_mount->mnt_flag & MNT_NOEXEC)) {
		result = EACCES;
		goto out;
	}

	/*
	 * Handle cases related to filesystems with non-local enforcement.
	 * This call can return 0, in which case we will fall through to perform a
	 * check based on VNOP_GETATTR data.  Otherwise it returns 1 and sets
	 * an appropriate result, at which point we can return immediately.
	 */
	if (vfs_authopaque(vp->v_mount) && vnode_authorize_opaque(vp, &result, action, ctx))
		goto out;

	/*
	 * Get vnode attributes and extended security information for the vnode
	 * and directory if required.
	 */
	VATTR_WANTED(&va, va_mode);
	VATTR_WANTED(&va, va_uid);
	VATTR_WANTED(&va, va_gid);
	VATTR_WANTED(&va, va_flags);
	VATTR_WANTED(&va, va_acl);
	if ((result = vnode_getattr(vp, &va, ctx)) != 0) {
		KAUTH_DEBUG("%p    ERROR - failed to get vnode attributes - %d", vp, result);
		goto out;
	}
	if (dvp) {
		VATTR_WANTED(&dva, va_mode);
		VATTR_WANTED(&dva, va_uid);
		VATTR_WANTED(&dva, va_gid);
		VATTR_WANTED(&dva, va_flags);
		VATTR_WANTED(&dva, va_acl);
		if ((result = vnode_getattr(dvp, &dva, ctx)) != 0) {
			KAUTH_DEBUG("%p    ERROR - failed to get directory vnode attributes - %d", vp, result);
			goto out;
		}
	}

	/*
	 * If the vnode is an extended attribute data vnode (eg. a resource fork), *_DATA becomes
	 * *_EXTATTRIBUTES.
	 */
	if (S_ISXATTR(va.va_mode)) {
		if (rights & KAUTH_VNODE_READ_DATA) {
			rights &= ~KAUTH_VNODE_READ_DATA;
			rights |= KAUTH_VNODE_READ_EXTATTRIBUTES;
		}
		if (rights & KAUTH_VNODE_WRITE_DATA) {
			rights &= ~KAUTH_VNODE_WRITE_DATA;
			rights |= KAUTH_VNODE_WRITE_EXTATTRIBUTES;
		}
	}

	/*
	 * Check for immutability.
	 *
	 * In the deletion case, parent directory immutability vetoes specific
	 * file rights.
	 */
	if ((result = vnode_authorize_checkimmutable(vp, &va, rights, noimmutable)) != 0)
		goto out;
	if ((rights & KAUTH_VNODE_DELETE) &&
	    ((result = vnode_authorize_checkimmutable(dvp, &dva, KAUTH_VNODE_DELETE_CHILD, 0)) != 0))
		goto out;

	/*
	 * Clear rights that have been authorized by reaching this point, bail if nothing left to
	 * check.
	 */
	rights &= ~(KAUTH_VNODE_LINKTARGET | KAUTH_VNODE_CHECKIMMUTABLE);
	if (rights == 0)
		goto out;

	/*
	 * If we're not the superuser, authorize based on file properties.
	 */
	if (!vfs_context_issuser(ctx)) {
		/* process delete rights */
		if ((rights & KAUTH_VNODE_DELETE) &&
		    ((result = vnode_authorize_delete(vcp)) != 0))
			goto out;

		/* process remaining rights; a granted DELETE pre-authorizes it for the simple check */
		if ((rights & ~KAUTH_VNODE_DELETE) &&
		    ((result = vnode_authorize_simple(vcp, rights, rights & KAUTH_VNODE_DELETE)) != 0))
			goto out;
	} else {

		/*
		 * Execute is only granted to root if one of the x bits is set.  This check only
		 * makes sense if the posix mode bits are actually supported.
		 */
		if ((rights & KAUTH_VNODE_EXECUTE) &&
		    (vp->v_type == VREG) &&
		    VATTR_IS_SUPPORTED(&va, va_mode) &&
		    !(va.va_mode & (S_IXUSR | S_IXGRP | S_IXOTH))) {
			result = EPERM;
			KAUTH_DEBUG("%p    DENIED - root execute requires at least one x bit in 0x%x", vp, va.va_mode);
			goto out;
		}

		KAUTH_DEBUG("%p    ALLOWED - caller is superuser", vp);
	}

out:
	/* free any ACLs fetched by vnode_getattr above */
	if (VATTR_IS_SUPPORTED(&va, va_acl) && (va.va_acl != NULL))
		kauth_acl_free(va.va_acl);
	if (VATTR_IS_SUPPORTED(&dva, va_acl) && (dva.va_acl != NULL))
		kauth_acl_free(dva.va_acl);
	if (result) {
		*errorp = result;
		KAUTH_DEBUG("%p    DENIED - auth denied", vp);
		return(KAUTH_RESULT_DENY);
	}

	/*
	 * Note that this implies that we will allow requests for no rights, as well as
	 * for rights that we do not recognise.  There should be none of these.
	 */
	KAUTH_DEBUG("%p    ALLOWED - auth granted", vp);
	return(KAUTH_RESULT_ALLOW);
}
5128
5129/*
5130 * Check that the attribute information in vattr can be legally applied to
5131 * a new file by the context.
5132 */
int
vnode_authattr_new(vnode_t dvp, struct vnode_attr *vap, int noauth, vfs_context_t ctx)
{
	int		error;
	int		is_suser, ismember, defaulted_owner, defaulted_group, defaulted_mode;
	kauth_cred_t	cred;
	guid_t		changer;
	mount_t		dmp;

	error = 0;
	defaulted_owner = defaulted_group = defaulted_mode = 0;

	/*
	 * Require that the filesystem support extended security to apply any.
	 */
	if (!vfs_extendedsecurity(dvp->v_mount) &&
	    (VATTR_IS_ACTIVE(vap, va_acl) || VATTR_IS_ACTIVE(vap, va_uuuid) || VATTR_IS_ACTIVE(vap, va_guuid))) {
		error = EINVAL;
		goto out;
	}

	/*
	 * Default some fields.
	 */
	dmp = dvp->v_mount;

	/*
	 * If the filesystem is mounted IGNORE_OWNERSHIP and an explicit owner is set, that
	 * owner takes ownership of all new files.
	 */
	if ((dmp->mnt_flag & MNT_IGNORE_OWNERSHIP) && (dmp->mnt_fsowner != KAUTH_UID_NONE)) {
		VATTR_SET(vap, va_uid, dmp->mnt_fsowner);
		defaulted_owner = 1;
	} else {
		if (!VATTR_IS_ACTIVE(vap, va_uid)) {
			/* default owner is current user */
			VATTR_SET(vap, va_uid, kauth_cred_getuid(vfs_context_ucred(ctx)));
			defaulted_owner = 1;
		}
	}

	/*
	 * If the filesystem is mounted IGNORE_OWNERSHIP and an explicit group is set, that
	 * group takes ownership of all new files.
	 */
	if ((dmp->mnt_flag & MNT_IGNORE_OWNERSHIP) && (dmp->mnt_fsgroup != KAUTH_GID_NONE)) {
		VATTR_SET(vap, va_gid, dmp->mnt_fsgroup);
		defaulted_group = 1;
	} else {
		if (!VATTR_IS_ACTIVE(vap, va_gid)) {
			/* default group comes from parent object, fallback to current user */
			struct vnode_attr dva;
			VATTR_INIT(&dva);
			VATTR_WANTED(&dva, va_gid);
			if ((error = vnode_getattr(dvp, &dva, ctx)) != 0)
				goto out;
			if (VATTR_IS_SUPPORTED(&dva, va_gid)) {
				VATTR_SET(vap, va_gid, dva.va_gid);
			} else {
				VATTR_SET(vap, va_gid, kauth_cred_getgid(vfs_context_ucred(ctx)));
			}
			defaulted_group = 1;
		}
	}

	/* new objects default to no BSD flags */
	if (!VATTR_IS_ACTIVE(vap, va_flags))
		VATTR_SET(vap, va_flags, 0);

	/* default mode is everything, masked with current umask */
	if (!VATTR_IS_ACTIVE(vap, va_mode)) {
		VATTR_SET(vap, va_mode, ACCESSPERMS & ~vfs_context_proc(ctx)->p_fd->fd_cmask);
		KAUTH_DEBUG("ATTR - defaulting new file mode to %o from umask %o", vap->va_mode, vfs_context_proc(ctx)->p_fd->fd_cmask);
		defaulted_mode = 1;
	}
	/* set timestamps to now */
	if (!VATTR_IS_ACTIVE(vap, va_create_time)) {
		nanotime(&vap->va_create_time);
		VATTR_SET_ACTIVE(vap, va_create_time);
	}

	/*
	 * Check for attempts to set nonsensical fields.
	 */
	if (vap->va_active & ~VNODE_ATTR_NEWOBJ) {
		error = EINVAL;
		KAUTH_DEBUG("ATTR - ERROR - attempt to set unsupported new-file attributes %llx",
		    vap->va_active & ~VNODE_ATTR_NEWOBJ);
		goto out;
	}

	/*
	 * Quickly check for the applicability of any enforcement here.
	 * Tests below maintain the integrity of the local security model.
	 */
	if (vfs_authopaque(vnode_mount(dvp)))
		goto out;

	/*
	 * We need to know if the caller is the superuser, or if the work is
	 * otherwise already authorised.
	 */
	cred = vfs_context_ucred(ctx);
	if (noauth) {
		/* doing work for the kernel */
		is_suser = 1;
	} else {
		is_suser = vfs_context_issuser(ctx);
	}


	/* validate requested BSD flags against what the caller may legally set */
	if (VATTR_IS_ACTIVE(vap, va_flags)) {
		if (is_suser) {
			if ((vap->va_flags & (UF_SETTABLE | SF_SETTABLE)) != vap->va_flags) {
				error = EPERM;
				KAUTH_DEBUG("  DENIED - superuser attempt to set illegal flag(s)");
				goto out;
			}
		} else {
			if ((vap->va_flags & UF_SETTABLE) != vap->va_flags) {
				error = EPERM;
				KAUTH_DEBUG("  DENIED - user attempt to set illegal flag(s)");
				goto out;
			}
		}
	}

	/* if not superuser, validate legality of new-item attributes */
	if (!is_suser) {
		if (!defaulted_mode && VATTR_IS_ACTIVE(vap, va_mode)) {
			/* setgid? */
			if (vap->va_mode & S_ISGID) {
				if ((error = kauth_cred_ismember_gid(cred, vap->va_gid, &ismember)) != 0) {
					KAUTH_DEBUG("ATTR - ERROR: got %d checking for membership in %d", error, vap->va_gid);
					goto out;
				}
				if (!ismember) {
					KAUTH_DEBUG("  DENIED - can't set SGID bit, not a member of %d", vap->va_gid);
					error = EPERM;
					goto out;
				}
			}

			/* setuid? */
			if ((vap->va_mode & S_ISUID) && (vap->va_uid != kauth_cred_getuid(cred))) {
				KAUTH_DEBUG("ATTR - ERROR: illegal attempt to set the setuid bit");
				error = EPERM;
				goto out;
			}
		}
		/* an explicitly-requested owner must be the caller */
		if (!defaulted_owner && (vap->va_uid != kauth_cred_getuid(cred))) {
			KAUTH_DEBUG("  DENIED - cannot create new item owned by %d", vap->va_uid);
			error = EPERM;
			goto out;
		}
		/* an explicitly-requested group must be one the caller belongs to */
		if (!defaulted_group) {
			if ((error = kauth_cred_ismember_gid(cred, vap->va_gid, &ismember)) != 0) {
				KAUTH_DEBUG("  ERROR - got %d checking for membership in %d", error, vap->va_gid);
				goto out;
			}
			if (!ismember) {
				KAUTH_DEBUG("  DENIED - cannot create new item with group %d - not a member", vap->va_gid);
				error = EPERM;
				goto out;
			}
		}

		/* initialising owner/group UUID */
		if (VATTR_IS_ACTIVE(vap, va_uuuid)) {
			if ((error = kauth_cred_getguid(cred, &changer)) != 0) {
				KAUTH_DEBUG("  ERROR - got %d trying to get caller UUID", error);
				/* XXX ENOENT here - no GUID - should perhaps become EPERM */
				goto out;
			}
			if (!kauth_guid_equal(&vap->va_uuuid, &changer)) {
				KAUTH_DEBUG("  ERROR - cannot create item with supplied owner UUID - not us");
				error = EPERM;
				goto out;
			}
		}
		if (VATTR_IS_ACTIVE(vap, va_guuid)) {
			if ((error = kauth_cred_ismember_guid(cred, &vap->va_guuid, &ismember)) != 0) {
				KAUTH_DEBUG("  ERROR - got %d trying to check group membership", error);
				goto out;
			}
			if (!ismember) {
				KAUTH_DEBUG("  ERROR - cannot create item with supplied group UUID - not a member");
				error = EPERM;
				goto out;
			}
		}
	}
out:
	return(error);
}
5327
5328/*
5329 * Check that the attribute information in vap can be legally written by the context.
5330 *
5331 * Call this when you're not sure about the vnode_attr; either its contents have come
5332 * from an unknown source, or when they are variable.
5333 *
5334 * Returns errno, or zero and sets *actionp to the KAUTH_VNODE_* actions that
5335 * must be authorized to be permitted to write the vattr.
5336 */
5337int
5338vnode_authattr(vnode_t vp, struct vnode_attr *vap, kauth_action_t *actionp, vfs_context_t ctx)
5339{
5340 struct vnode_attr ova;
5341 kauth_action_t required_action;
5342 int error, is_suser, ismember, chowner, chgroup;
5343 guid_t changer;
5344 gid_t group;
5345 uid_t owner;
5346 mode_t newmode;
5347 kauth_cred_t cred;
5348 uint32_t fdelta;
5349
5350 VATTR_INIT(&ova);
5351 required_action = 0;
5352 error = 0;
5353
5354 /*
5355 * Quickly check for enforcement applicability.
5356 */
5357 if (vfs_authopaque(vnode_mount(vp)))
5358 goto out;
5359
5360 /*
5361 * Check for attempts to set nonsensical fields.
5362 */
5363 if (vap->va_active & VNODE_ATTR_RDONLY) {
5364 KAUTH_DEBUG("ATTR - ERROR: attempt to set readonly attribute(s)");
5365 error = EINVAL;
5366 goto out;
5367 }
5368
5369 /*
5370 * We need to know if the caller is the superuser.
5371 */
5372 cred = vfs_context_ucred(ctx);
5373 is_suser = kauth_cred_issuser(cred);
5374
5375 /*
5376 * If any of the following are changing, we need information from the old file:
5377 * va_uid
5378 * va_gid
5379 * va_mode
5380 * va_uuuid
5381 * va_guuid
5382 */
5383 if (VATTR_IS_ACTIVE(vap, va_uid) ||
5384 VATTR_IS_ACTIVE(vap, va_gid) ||
5385 VATTR_IS_ACTIVE(vap, va_mode) ||
5386 VATTR_IS_ACTIVE(vap, va_uuuid) ||
5387 VATTR_IS_ACTIVE(vap, va_guuid)) {
5388 VATTR_WANTED(&ova, va_mode);
5389 VATTR_WANTED(&ova, va_uid);
5390 VATTR_WANTED(&ova, va_gid);
5391 VATTR_WANTED(&ova, va_uuuid);
5392 VATTR_WANTED(&ova, va_guuid);
5393 KAUTH_DEBUG("ATTR - security information changing, fetching existing attributes");
5394 }
5395
5396 /*
5397 * If timestamps are being changed, we need to know who the file is owned
5398 * by.
5399 */
5400 if (VATTR_IS_ACTIVE(vap, va_create_time) ||
5401 VATTR_IS_ACTIVE(vap, va_change_time) ||
5402 VATTR_IS_ACTIVE(vap, va_modify_time) ||
5403 VATTR_IS_ACTIVE(vap, va_access_time) ||
5404 VATTR_IS_ACTIVE(vap, va_backup_time)) {
5405
5406 VATTR_WANTED(&ova, va_uid);
5407#if 0 /* enable this when we support UUIDs as official owners */
5408 VATTR_WANTED(&ova, va_uuuid);
5409#endif
5410 KAUTH_DEBUG("ATTR - timestamps changing, fetching uid and GUID");
5411 }
5412
5413 /*
5414 * If flags are being changed, we need the old flags.
5415 */
5416 if (VATTR_IS_ACTIVE(vap, va_flags)) {
5417 KAUTH_DEBUG("ATTR - flags changing, fetching old flags");
5418 VATTR_WANTED(&ova, va_flags);
5419 }
5420
5421 /*
5422 * If the size is being set, make sure it's not a directory.
5423 */
5424 if (VATTR_IS_ACTIVE(vap, va_data_size)) {
5425 /* size is meaningless on a directory, don't permit this */
5426 if (vnode_isdir(vp)) {
5427 KAUTH_DEBUG("ATTR - ERROR: size change requested on a directory");
5428 error = EISDIR;
5429 goto out;
5430 }
5431 }
5432
5433 /*
5434 * Get old data.
5435 */
5436 KAUTH_DEBUG("ATTR - fetching old attributes %016llx", ova.va_active);
5437 if ((error = vnode_getattr(vp, &ova, ctx)) != 0) {
5438 KAUTH_DEBUG(" ERROR - got %d trying to get attributes", error);
5439 goto out;
5440 }
5441
5442 /*
5443 * Size changes require write access to the file data.
5444 */
5445 if (VATTR_IS_ACTIVE(vap, va_data_size)) {
5446 /* if we can't get the size, or it's different, we need write access */
5447 KAUTH_DEBUG("ATTR - size change, requiring WRITE_DATA");
5448 required_action |= KAUTH_VNODE_WRITE_DATA;
5449 }
5450
5451 /*
5452 * Changing timestamps?
5453 *
5454 * Note that we are only called to authorize user-requested time changes;
5455 * side-effect time changes are not authorized. Authorisation is only
5456 * required for existing files.
5457 *
5458 * Non-owners are not permitted to change the time on an existing
5459 * file to anything other than the current time.
5460 */
5461 if (VATTR_IS_ACTIVE(vap, va_create_time) ||
5462 VATTR_IS_ACTIVE(vap, va_change_time) ||
5463 VATTR_IS_ACTIVE(vap, va_modify_time) ||
5464 VATTR_IS_ACTIVE(vap, va_access_time) ||
5465 VATTR_IS_ACTIVE(vap, va_backup_time)) {
5466 /*
5467 * The owner and root may set any timestamps they like,
5468 * provided that the file is not immutable. The owner still needs
5469 * WRITE_ATTRIBUTES (implied by ownership but still deniable).
5470 */
5471 if (is_suser || vauth_node_owner(&ova, cred)) {
5472 KAUTH_DEBUG("ATTR - root or owner changing timestamps");
5473 required_action |= KAUTH_VNODE_CHECKIMMUTABLE | KAUTH_VNODE_WRITE_ATTRIBUTES;
5474 } else {
5475 /* just setting the current time? */
5476 if (vap->va_vaflags & VA_UTIMES_NULL) {
5477 KAUTH_DEBUG("ATTR - non-root/owner changing timestamps, requiring WRITE_ATTRIBUTES");
5478 required_action |= KAUTH_VNODE_WRITE_ATTRIBUTES;
5479 } else {
5480 KAUTH_DEBUG("ATTR - ERROR: illegal timestamp modification attempted");
5481 error = EACCES;
5482 goto out;
5483 }
5484 }
5485 }
5486
5487 /*
5488 * Changing file mode?
5489 */
5490 if (VATTR_IS_ACTIVE(vap, va_mode) && VATTR_IS_SUPPORTED(&ova, va_mode) && (ova.va_mode != vap->va_mode)) {
5491 KAUTH_DEBUG("ATTR - mode change from %06o to %06o", ova.va_mode, vap->va_mode);
5492
5493 /*
5494 * Mode changes always have the same basic auth requirements.
5495 */
5496 if (is_suser) {
5497 KAUTH_DEBUG("ATTR - superuser mode change, requiring immutability check");
5498 required_action |= KAUTH_VNODE_CHECKIMMUTABLE;
5499 } else {
5500 /* need WRITE_SECURITY */
5501 KAUTH_DEBUG("ATTR - non-superuser mode change, requiring WRITE_SECURITY");
5502 required_action |= KAUTH_VNODE_WRITE_SECURITY;
5503 }
5504
5505 /*
5506 * Can't set the setgid bit if you're not in the group and not root. Have to have
5507 * existing group information in the case we're not setting it right now.
5508 */
5509 if (vap->va_mode & S_ISGID) {
5510 required_action |= KAUTH_VNODE_CHECKIMMUTABLE; /* always required */
5511 if (!is_suser) {
5512 if (VATTR_IS_ACTIVE(vap, va_gid)) {
5513 group = vap->va_gid;
5514 } else if (VATTR_IS_SUPPORTED(&ova, va_gid)) {
5515 group = ova.va_gid;
5516 } else {
5517 KAUTH_DEBUG("ATTR - ERROR: setgid but no gid available");
5518 error = EINVAL;
5519 goto out;
5520 }
5521 /*
5522 * This might be too restrictive; WRITE_SECURITY might be implied by
5523 * membership in this case, rather than being an additional requirement.
5524 */
5525 if ((error = kauth_cred_ismember_gid(cred, group, &ismember)) != 0) {
5526 KAUTH_DEBUG("ATTR - ERROR: got %d checking for membership in %d", error, vap->va_gid);
5527 goto out;
5528 }
5529 if (!ismember) {
5530 KAUTH_DEBUG(" DENIED - can't set SGID bit, not a member of %d", group);
5531 error = EPERM;
5532 goto out;
5533 }
5534 }
5535 }
5536
5537 /*
5538 * Can't set the setuid bit unless you're root or the file's owner.
5539 */
5540 if (vap->va_mode & S_ISUID) {
5541 required_action |= KAUTH_VNODE_CHECKIMMUTABLE; /* always required */
5542 if (!is_suser) {
5543 if (VATTR_IS_ACTIVE(vap, va_uid)) {
5544 owner = vap->va_uid;
5545 } else if (VATTR_IS_SUPPORTED(&ova, va_uid)) {
5546 owner = ova.va_uid;
5547 } else {
5548 KAUTH_DEBUG("ATTR - ERROR: setuid but no uid available");
5549 error = EINVAL;
5550 goto out;
5551 }
5552 if (owner != kauth_cred_getuid(cred)) {
5553 /*
5554 * We could allow this if WRITE_SECURITY is permitted, perhaps.
5555 */
5556 KAUTH_DEBUG("ATTR - ERROR: illegal attempt to set the setuid bit");
5557 error = EPERM;
5558 goto out;
5559 }
5560 }
5561 }
5562 }
5563
5564 /*
5565 * Validate/mask flags changes. This checks that only the flags in
5566 * the UF_SETTABLE mask are being set, and preserves the flags in
5567 * the SF_SETTABLE case.
5568 *
5569 * Since flags changes may be made in conjunction with other changes,
5570 * we will ask the auth code to ignore immutability in the case that
5571 * the SF_* flags are not set and we are only manipulating the file flags.
5572 *
5573 */
5574 if (VATTR_IS_ACTIVE(vap, va_flags)) {
5575 /* compute changing flags bits */
5576 if (VATTR_IS_SUPPORTED(&ova, va_flags)) {
5577 fdelta = vap->va_flags ^ ova.va_flags;
5578 } else {
5579 fdelta = vap->va_flags;
5580 }
5581
5582 if (fdelta != 0) {
5583 KAUTH_DEBUG("ATTR - flags changing, requiring WRITE_SECURITY");
5584 required_action |= KAUTH_VNODE_WRITE_SECURITY;
5585
5586 /* check that changing bits are legal */
5587 if (is_suser) {
5588 /*
5589 * The immutability check will prevent us from clearing the SF_*
5590 * flags unless the system securelevel permits it, so just check
5591 * for legal flags here.
5592 */
5593 if (fdelta & ~(UF_SETTABLE | SF_SETTABLE)) {
5594 error = EPERM;
5595 KAUTH_DEBUG(" DENIED - superuser attempt to set illegal flag(s)");
5596 goto out;
5597 }
5598 } else {
5599 if (fdelta & ~UF_SETTABLE) {
5600 error = EPERM;
5601 KAUTH_DEBUG(" DENIED - user attempt to set illegal flag(s)");
5602 goto out;
5603 }
5604 }
5605 /*
5606 * If the caller has the ability to manipulate file flags,
5607 * security is not reduced by ignoring them for this operation.
5608 *
5609 * A more complete test here would consider the 'after' states of the flags
5610 * to determine whether it would permit the operation, but this becomes
5611 * very complex.
5612 *
5613 * Ignoring immutability is conditional on securelevel; this does not bypass
5614 * the SF_* flags if securelevel > 0.
5615 */
5616 required_action |= KAUTH_VNODE_NOIMMUTABLE;
5617 }
5618 }
5619
5620 /*
5621 * Validate ownership information.
5622 */
5623 chowner = 0;
5624 chgroup = 0;
5625
5626 /*
5627 * uid changing
5628 * Note that if the filesystem didn't give us a UID, we expect that it doesn't
5629 * support them in general, and will ignore it if/when we try to set it.
5630 * We might want to clear the uid out of vap completely here.
5631 */
5632 if (VATTR_IS_ACTIVE(vap, va_uid) && VATTR_IS_SUPPORTED(&ova, va_uid) && (vap->va_uid != ova.va_uid)) {
5633 if (!is_suser && (kauth_cred_getuid(cred) != vap->va_uid)) {
5634 KAUTH_DEBUG(" DENIED - non-superuser cannot change ownershipt to a third party");
5635 error = EPERM;
5636 goto out;
5637 }
5638 chowner = 1;
5639 }
5640
5641 /*
5642 * gid changing
5643 * Note that if the filesystem didn't give us a GID, we expect that it doesn't
5644 * support them in general, and will ignore it if/when we try to set it.
5645 * We might want to clear the gid out of vap completely here.
5646 */
5647 if (VATTR_IS_ACTIVE(vap, va_gid) && VATTR_IS_SUPPORTED(&ova, va_gid) && (vap->va_gid != ova.va_gid)) {
5648 if (!is_suser) {
5649 if ((error = kauth_cred_ismember_gid(cred, vap->va_gid, &ismember)) != 0) {
5650 KAUTH_DEBUG(" ERROR - got %d checking for membership in %d", error, vap->va_gid);
5651 goto out;
5652 }
5653 if (!ismember) {
5654 KAUTH_DEBUG(" DENIED - group change from %d to %d but not a member of target group",
5655 ova.va_gid, vap->va_gid);
5656 error = EPERM;
5657 goto out;
5658 }
5659 }
5660 chgroup = 1;
5661 }
5662
5663 /*
5664 * Owner UUID being set or changed.
5665 */
5666 if (VATTR_IS_ACTIVE(vap, va_uuuid)) {
5667 /* if the owner UUID is not actually changing ... */
5668 if (VATTR_IS_SUPPORTED(&ova, va_uuuid) && kauth_guid_equal(&vap->va_uuuid, &ova.va_uuuid))
5669 goto no_uuuid_change;
5670
5671 /*
5672 * The owner UUID cannot be set by a non-superuser to anything other than
5673 * their own.
5674 */
5675 if (!is_suser) {
5676 if ((error = kauth_cred_getguid(cred, &changer)) != 0) {
5677 KAUTH_DEBUG(" ERROR - got %d trying to get caller UUID", error);
5678 /* XXX ENOENT here - no UUID - should perhaps become EPERM */
5679 goto out;
5680 }
5681 if (!kauth_guid_equal(&vap->va_uuuid, &changer)) {
5682 KAUTH_DEBUG(" ERROR - cannot set supplied owner UUID - not us");
5683 error = EPERM;
5684 goto out;
5685 }
5686 }
5687 chowner = 1;
5688 }
5689no_uuuid_change:
5690 /*
5691 * Group UUID being set or changed.
5692 */
5693 if (VATTR_IS_ACTIVE(vap, va_guuid)) {
5694 /* if the group UUID is not actually changing ... */
5695 if (VATTR_IS_SUPPORTED(&ova, va_guuid) && kauth_guid_equal(&vap->va_guuid, &ova.va_guuid))
5696 goto no_guuid_change;
5697
5698 /*
5699 * The group UUID cannot be set by a non-superuser to anything other than
5700 * one of which they are a member.
5701 */
5702 if (!is_suser) {
5703 if ((error = kauth_cred_ismember_guid(cred, &vap->va_guuid, &ismember)) != 0) {
5704 KAUTH_DEBUG(" ERROR - got %d trying to check group membership", error);
5705 goto out;
5706 }
5707 if (!ismember) {
5708 KAUTH_DEBUG(" ERROR - cannot create item with supplied group UUID - not a member");
5709 error = EPERM;
5710 goto out;
5711 }
5712 }
5713 chgroup = 1;
5714 }
5715no_guuid_change:
5716
5717 /*
5718 * Compute authorisation for group/ownership changes.
5719 */
5720 if (chowner || chgroup) {
5721 if (is_suser) {
5722 KAUTH_DEBUG("ATTR - superuser changing file owner/group, requiring immutability check");
5723 required_action |= KAUTH_VNODE_CHECKIMMUTABLE;
5724 } else {
5725 if (chowner) {
5726 KAUTH_DEBUG("ATTR - ownership change, requiring TAKE_OWNERSHIP");
5727 required_action |= KAUTH_VNODE_TAKE_OWNERSHIP;
5728 }
5729 if (chgroup && !chowner) {
5730 KAUTH_DEBUG("ATTR - group change, requiring WRITE_SECURITY");
5731 required_action |= KAUTH_VNODE_WRITE_SECURITY;
5732 }
5733
5734 /* clear set-uid and set-gid bits as required by Posix */
5735 if (VATTR_IS_ACTIVE(vap, va_mode)) {
5736 newmode = vap->va_mode;
5737 } else if (VATTR_IS_SUPPORTED(&ova, va_mode)) {
5738 newmode = ova.va_mode;
5739 } else {
5740 KAUTH_DEBUG("CHOWN - trying to change owner but cannot get mode from filesystem to mask setugid bits");
5741 newmode = 0;
5742 }
5743 if (newmode & (S_ISUID | S_ISGID)) {
5744 VATTR_SET(vap, va_mode, newmode & ~(S_ISUID | S_ISGID));
5745 KAUTH_DEBUG("CHOWN - masking setugid bits from mode %o to %o", newmode, vap->va_mode);
5746 }
5747 }
5748 }
5749
5750 /*
5751 * Authorise changes in the ACL.
5752 */
5753 if (VATTR_IS_ACTIVE(vap, va_acl)) {
5754
5755 /* no existing ACL */
5756 if (!VATTR_IS_ACTIVE(&ova, va_acl) || (ova.va_acl == NULL)) {
5757
5758 /* adding an ACL */
5759 if (vap->va_acl != NULL) {
5760 required_action |= KAUTH_VNODE_WRITE_SECURITY;
5761 KAUTH_DEBUG("CHMOD - adding ACL");
5762 }
5763
5764 /* removing an existing ACL */
5765 } else if (vap->va_acl == NULL) {
5766 required_action |= KAUTH_VNODE_WRITE_SECURITY;
5767 KAUTH_DEBUG("CHMOD - removing ACL");
5768
5769 /* updating an existing ACL */
5770 } else {
5771 if (vap->va_acl->acl_entrycount != ova.va_acl->acl_entrycount) {
5772 /* entry count changed, must be different */
5773 required_action |= KAUTH_VNODE_WRITE_SECURITY;
5774 KAUTH_DEBUG("CHMOD - adding/removing ACL entries");
5775 } else if (vap->va_acl->acl_entrycount > 0) {
5776 /* both ACLs have the same ACE count, said count is 1 or more, bitwise compare ACLs */
5777 if (!memcmp(&vap->va_acl->acl_ace[0], &ova.va_acl->acl_ace[0],
5778 sizeof(struct kauth_ace) * vap->va_acl->acl_entrycount)) {
5779 required_action |= KAUTH_VNODE_WRITE_SECURITY;
5780 KAUTH_DEBUG("CHMOD - changing ACL entries");
5781 }
5782 }
5783 }
5784 }
5785
5786 /*
5787 * Other attributes that require authorisation.
5788 */
5789 if (VATTR_IS_ACTIVE(vap, va_encoding))
5790 required_action |= KAUTH_VNODE_WRITE_ATTRIBUTES;
5791
5792out:
5793 if (VATTR_IS_SUPPORTED(&ova, va_acl) && (ova.va_acl != NULL))
5794 kauth_acl_free(ova.va_acl);
5795 if (error == 0)
5796 *actionp = required_action;
5797 return(error);
5798}
5799
5800
5801void
5802vfs_setlocklocal(mount_t mp)
5803{
5804 vnode_t vp;
5805
5806 mount_lock(mp);
5807 mp->mnt_kern_flag |= MNTK_LOCK_LOCAL;
5808
5809 /*
5810 * We do not expect anyone to be using any vnodes at the
5811 * time this routine is called. So no need for vnode locking
5812 */
5813 TAILQ_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) {
5814 vp->v_flag |= VLOCKLOCAL;
5815 }
5816 TAILQ_FOREACH(vp, &mp->mnt_workerqueue, v_mntvnodes) {
5817 vp->v_flag |= VLOCKLOCAL;
5818 }
5819 TAILQ_FOREACH(vp, &mp->mnt_newvnodes, v_mntvnodes) {
5820 vp->v_flag |= VLOCKLOCAL;
5821 }
5822 mount_unlock(mp);
5823}
5824
5825
#ifdef JOE_DEBUG

/*
 * Debug-only helper: account an iocount delta of 'count' against the
 * current thread, and remember up to 32 distinct vnodes the thread has
 * touched so that leaked iocounts can be traced back to their vnodes.
 *
 * System vnodes (VSYSTEM) are excluded from tracking.
 *
 * Declared 'void': the original used an implicit int return type
 * (invalid since C99) and never returned a value.
 */
void
record_vp(vnode_t vp, int count) {
	struct uthread *ut;
	int i;

	/* system vnodes are not interesting for leak tracking */
	if ((vp->v_flag & VSYSTEM))
	        return;

	ut = get_bsdthread_info(current_thread());
	ut->uu_iocount += count;

	/* NOTE(review): assumes ut->uu_vps has at least 32 slots — confirm in uthread */
	if (ut->uu_vpindex < 32) {
	        /* only record a given vnode once per thread */
	        for (i = 0; i < ut->uu_vpindex; i++) {
		        if (ut->uu_vps[i] == vp)
			        return;
		}
	        ut->uu_vps[ut->uu_vpindex] = vp;
		ut->uu_vpindex++;
	}
}
#endif