1 /*
2 * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29 /*
30 * Copyright (c) 1989, 1993
31 * The Regents of the University of California. All rights reserved.
32 * (c) UNIX System Laboratories, Inc.
33 * All or some portions of this file are derived from material licensed
34 * to the University of California by American Telephone and Telegraph
35 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
36 * the permission of UNIX System Laboratories, Inc.
37 *
38 * Redistribution and use in source and binary forms, with or without
39 * modification, are permitted provided that the following conditions
40 * are met:
41 * 1. Redistributions of source code must retain the above copyright
42 * notice, this list of conditions and the following disclaimer.
43 * 2. Redistributions in binary form must reproduce the above copyright
44 * notice, this list of conditions and the following disclaimer in the
45 * documentation and/or other materials provided with the distribution.
46 * 3. All advertising materials mentioning features or use of this software
47 * must display the following acknowledgement:
48 * This product includes software developed by the University of
49 * California, Berkeley and its contributors.
50 * 4. Neither the name of the University nor the names of its contributors
51 * may be used to endorse or promote products derived from this software
52 * without specific prior written permission.
53 *
54 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
55 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
56 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
57 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
58 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
59 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
60 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
61 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
62 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
63 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
64 * SUCH DAMAGE.
65 *
66 * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95
67 */
68 /*
69 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
70 * support for mandatory and extensible security protections. This notice
71 * is included in support of clause 2.2 (b) of the Apple Public License,
72 * Version 2.0.
73 */
74
75 /*
76 * External virtual filesystem routines
77 */
78
79
80 #include <sys/param.h>
81 #include <sys/systm.h>
82 #include <sys/proc_internal.h>
83 #include <sys/kauth.h>
84 #include <sys/mount_internal.h>
85 #include <sys/time.h>
86 #include <sys/lock.h>
87 #include <sys/vnode.h>
88 #include <sys/vnode_internal.h>
89 #include <sys/stat.h>
90 #include <sys/namei.h>
91 #include <sys/ucred.h>
92 #include <sys/buf_internal.h>
93 #include <sys/errno.h>
94 #include <sys/malloc.h>
95 #include <sys/uio_internal.h>
96 #include <sys/uio.h>
97 #include <sys/domain.h>
98 #include <sys/mbuf.h>
99 #include <sys/syslog.h>
100 #include <sys/ubc_internal.h>
101 #include <sys/vm.h>
102 #include <sys/sysctl.h>
103 #include <sys/filedesc.h>
104 #include <sys/event.h>
105 #include <sys/kdebug.h>
106 #include <sys/kauth.h>
107 #include <sys/user.h>
108 #include <miscfs/fifofs/fifo.h>
109
110 #include <string.h>
111 #include <machine/spl.h>
112
113
114 #include <kern/assert.h>
115
116 #include <miscfs/specfs/specdev.h>
117
118 #include <mach/mach_types.h>
119 #include <mach/memory_object_types.h>
120
121 #include <kern/kalloc.h> /* kalloc()/kfree() */
122 #include <kern/clock.h> /* delay_for_interval() */
123 #include <libkern/OSAtomic.h> /* OSAddAtomic() */
124
125
126 #include <vm/vm_protos.h> /* vnode_pager_vrele() */
127
128 #if CONFIG_MACF
129 #include <security/mac_framework.h>
130 #endif
131
132 extern lck_grp_t *vnode_lck_grp;
133 extern lck_attr_t *vnode_lck_attr;
134
135
136 extern lck_mtx_t * mnt_list_mtx_lock;
137
138 enum vtype iftovt_tab[16] = {
139 VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
140 VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
141 };
142 int vttoif_tab[9] = {
143 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
144 S_IFSOCK, S_IFIFO, S_IFMT,
145 };
146
147 /* XXX next prototype should be from <nfs/nfs.h> */
148 extern int nfs_vinvalbuf(vnode_t, int, vfs_context_t, int);
149
150 /* XXX next prototype should be from <libsa/stdlib.h> but conflicts with libkern */
151 __private_extern__ void qsort(
152 void * array,
153 size_t nmembers,
154 size_t member_size,
155 int (*)(const void *, const void *));
156
157 extern kern_return_t adjust_vm_object_cache(vm_size_t oval, vm_size_t nval);
158 __private_extern__ void vntblinit(void);
159 __private_extern__ kern_return_t reset_vmobjectcache(unsigned int val1,
160 unsigned int val2);
161 __private_extern__ int unlink1(vfs_context_t, struct nameidata *, int);
162
163 static void vnode_list_add(vnode_t);
164 static void vnode_list_remove(vnode_t);
165 static void vnode_list_remove_locked(vnode_t);
166
167 static errno_t vnode_drain(vnode_t);
168 static void vgone(vnode_t, int flags);
169 static void vclean(vnode_t vp, int flag);
170 static void vnode_reclaim_internal(vnode_t, int, int, int);
171
172 static void vnode_dropiocount (vnode_t);
173 static errno_t vnode_getiocount(vnode_t vp, int vid, int vflags);
174 static int vget_internal(vnode_t, int, int);
175
176 static vnode_t checkalias(vnode_t vp, dev_t nvp_rdev);
177 static int vnode_reload(vnode_t);
178 static int vnode_isinuse_locked(vnode_t, int, int);
179
180 static void insmntque(vnode_t vp, mount_t mp);
181 static int mount_getvfscnt(void);
182 static int mount_fillfsids(fsid_t *, int );
183 static void vnode_iterate_setup(mount_t);
184 static int vnode_umount_preflight(mount_t, vnode_t, int);
185 static int vnode_iterate_prepare(mount_t);
186 static int vnode_iterate_reloadq(mount_t);
187 static void vnode_iterate_clear(mount_t);
188
189 errno_t rmdir_remove_orphaned_appleDouble(vnode_t, vfs_context_t, int *);
190
191 TAILQ_HEAD(freelst, vnode) vnode_free_list; /* vnode free list */
192 TAILQ_HEAD(deadlst, vnode) vnode_dead_list; /* vnode dead list */
193
194 TAILQ_HEAD(ragelst, vnode) vnode_rage_list; /* vnode rapid age list */
195 struct timeval rage_tv;
196 int rage_limit = 0;
197 int ragevnodes = 0;
198
199 #define RAGE_LIMIT_MIN 100
200 #define RAGE_TIME_LIMIT 5
201
202 struct mntlist mountlist; /* mounted filesystem list */
203 static int nummounts = 0;
204
205 #if DIAGNOSTIC
206 #define VLISTCHECK(fun, vp, list) \
207 if ((vp)->v_freelist.tqe_prev == (struct vnode **)0xdeadb) \
208 panic("%s: %s vnode not on %slist", (fun), (list), (list));
209 #else
210 #define VLISTCHECK(fun, vp, list)
211 #endif /* DIAGNOSTIC */
212
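/*
 * A vnode's free-list linkage doubles as a membership flag: VLISTNONE
 * poisons tqe_prev with the sentinel 0xdeadb, so VONLIST can test list
 * membership without walking any list.
 */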
213 #define VLISTNONE(vp) \
214 do { \
215 (vp)->v_freelist.tqe_next = (struct vnode *)0; \
216 (vp)->v_freelist.tqe_prev = (struct vnode **)0xdeadb; \
217 } while(0)
218
219 #define VONLIST(vp) \
220 ((vp)->v_freelist.tqe_prev != (struct vnode **)0xdeadb)
221
222 /* remove a vnode from free vnode list */
223 #define VREMFREE(fun, vp) \
224 do { \
225 VLISTCHECK((fun), (vp), "free"); \
226 TAILQ_REMOVE(&vnode_free_list, (vp), v_freelist); \
227 VLISTNONE((vp)); \
228 freevnodes--; \
229 } while(0)
230
231
232
233 /* remove a vnode from dead vnode list */
234 #define VREMDEAD(fun, vp) \
235 do { \
236 VLISTCHECK((fun), (vp), "dead"); \
237 TAILQ_REMOVE(&vnode_dead_list, (vp), v_freelist); \
238 VLISTNONE((vp)); \
239 vp->v_listflag &= ~VLIST_DEAD; \
240 deadvnodes--; \
241 } while(0)
242
243
244 /* remove a vnode from rage vnode list */
245 #define VREMRAGE(fun, vp) \
246 do { \
247 if ( !(vp->v_listflag & VLIST_RAGE)) \
248 panic("VREMRAGE: vp not on rage list"); \
249 VLISTCHECK((fun), (vp), "rage"); \
250 TAILQ_REMOVE(&vnode_rage_list, (vp), v_freelist); \
251 VLISTNONE((vp)); \
252 vp->v_listflag &= ~VLIST_RAGE; \
253 ragevnodes--; \
254 } while(0)
255
256
257 /*
258 * vnodetarget hasn't been used in a long time, but
259 * it was exported for some reason... I'm leaving it in
260 * place for now... it should be deprecated out of the
261 * exports and removed eventually.
262 */
263 unsigned long vnodetarget; /* target for vnreclaim() */
264 #define VNODE_FREE_TARGET 20 /* Default value for vnodetarget */
265
266 /*
267 * We need quite a few vnodes on the free list to sustain the
268 * rapid stat() the compilation process does, and still benefit from the name
269 * cache. Having too few vnodes on the free list causes serious disk
270 * thrashing as we cycle through them.
271 */
272 #define VNODE_FREE_MIN CONFIG_VNODE_FREE_MIN /* freelist should have at least this many */
273
274 /*
275 * Initialize the vnode management data structures.
276 */
277 __private_extern__ void
278 vntblinit(void)
279 {
280 TAILQ_INIT(&vnode_free_list);
281 TAILQ_INIT(&vnode_rage_list);
282 TAILQ_INIT(&vnode_dead_list);
283 TAILQ_INIT(&mountlist);
284
285 if (!vnodetarget)
286 vnodetarget = VNODE_FREE_TARGET;
287
288 microuptime(&rage_tv);
289 rage_limit = desiredvnodes / 100;
290
291 if (rage_limit < RAGE_LIMIT_MIN)
292 rage_limit = RAGE_LIMIT_MIN;
293
294 /*
295 * Scale the vm_object_cache to accommodate the vnodes
296 * we want to cache
297 */
298 (void) adjust_vm_object_cache(0, desiredvnodes - VNODE_FREE_MIN);
299 }
300
301 /* Reset the VM Object Cache with the values passed in */
302 __private_extern__ kern_return_t
303 reset_vmobjectcache(unsigned int val1, unsigned int val2)
304 {
305 vm_size_t oval = val1 - VNODE_FREE_MIN;
306 vm_size_t nval;
307
308 if(val2 < VNODE_FREE_MIN)
309 nval = 0;
310 else
311 nval = val2 - VNODE_FREE_MIN;
312
313 return(adjust_vm_object_cache(oval, nval));
314 }
315
316
317 /* the timeout (slptimeout) is in units of 10 msec */
318 int
319 vnode_waitforwrites(vnode_t vp, int output_target, int slpflag, int slptimeout, const char *msg) {
320 int error = 0;
321 struct timespec ts;
322
323 KERNEL_DEBUG(0x3010280 | DBG_FUNC_START, (int)vp, output_target, vp->v_numoutput, 0, 0);
324
325 if (vp->v_numoutput > output_target) {
326
327 slpflag &= ~PDROP;
328
329 vnode_lock(vp);
330
331 while ((vp->v_numoutput > output_target) && error == 0) {
332 if (output_target)
333 vp->v_flag |= VTHROTTLED;
334 else
335 vp->v_flag |= VBWAIT;
336
337 ts.tv_sec = (slptimeout/100);
338 ts.tv_nsec = (slptimeout % 100) * 10 * NSEC_PER_USEC * 1000; /* remainder in 10-msec ticks */
339 error = msleep((caddr_t)&vp->v_numoutput, &vp->v_lock, (slpflag | (PRIBIO + 1)), msg, &ts);
340 }
341 vnode_unlock(vp);
342 }
343 KERNEL_DEBUG(0x3010280 | DBG_FUNC_END, (int)vp, output_target, vp->v_numoutput, error, 0);
344
345 return error;
346 }
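/*
 * Usage sketch (illustrative only, not part of the original source; the
 * example_* names are hypothetical): a writer that wants to throttle
 * itself bumps v_numoutput via vnode_startwrite() before issuing I/O and
 * blocks here until the in-flight count drains below its target.
 */
#if 0
static int
example_throttled_write(vnode_t vp)
{
	int error;

	/* wait up to 100 ten-msec ticks (~1 sec) for fewer than 16 in-flight writes */
	if ((error = vnode_waitforwrites(vp, 16, 0, 100, "example_throttle")))
		return (error);

	vnode_startwrite(vp);	/* account for the write about to be issued */
	/* ... issue the I/O; the completion path calls vnode_writedone(vp) ... */
	return (0);
}
#endif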
347
348
349 void
350 vnode_startwrite(vnode_t vp) {
351
352 OSAddAtomic(1, &vp->v_numoutput);
353 }
354
355
356 void
357 vnode_writedone(vnode_t vp)
358 {
359 if (vp) {
360 OSAddAtomic(-1, &vp->v_numoutput);
361
362 if (vp->v_numoutput <= 1) {
363 int need_wakeup = 0;
364
365 vnode_lock_spin(vp);
366
367 if (vp->v_numoutput < 0)
368 panic("vnode_writedone: numoutput < 0");
369
370 if ((vp->v_flag & VTHROTTLED) && (vp->v_numoutput <= 1)) {
371 vp->v_flag &= ~VTHROTTLED;
372 need_wakeup = 1;
373 }
374 if ((vp->v_flag & VBWAIT) && (vp->v_numoutput == 0)) {
375 vp->v_flag &= ~VBWAIT;
376 need_wakeup = 1;
377 }
378 vnode_unlock(vp);
379
380 if (need_wakeup)
381 wakeup((caddr_t)&vp->v_numoutput);
382 }
383 }
384 }
385
386
387
388 int
389 vnode_hasdirtyblks(vnode_t vp)
390 {
391 struct cl_writebehind *wbp;
392
393 /*
394 * Not taking the buf_mtxp as there is little
395 * point doing it. Even if the lock is taken the
396 * state can change right after that. If there
397 * needs to be synchronization, it must be driven
398 * by the caller
399 */
400 if (vp->v_dirtyblkhd.lh_first)
401 return (1);
402
403 if (!UBCINFOEXISTS(vp))
404 return (0);
405
406 wbp = vp->v_ubcinfo->cl_wbehind;
407
408 if (wbp && (wbp->cl_number || wbp->cl_scmap))
409 return (1);
410
411 return (0);
412 }
413
414 int
415 vnode_hascleanblks(vnode_t vp)
416 {
417 /*
418 * Not taking the buf_mtxp as there is little
419 * point doing it. Even if the lock is taken the
420 * state can change right after that. If there
421 * needs to be synchronization, it must be driven
422 * by the caller
423 */
424 if (vp->v_cleanblkhd.lh_first)
425 return (1);
426 return (0);
427 }
428
429 void
430 vnode_iterate_setup(mount_t mp)
431 {
432 while (mp->mnt_lflag & MNT_LITER) {
433 mp->mnt_lflag |= MNT_LITERWAIT;
434 msleep((caddr_t)mp, &mp->mnt_mlock, PVFS, "vnode_iterate_setup", NULL);
435 }
436
437 mp->mnt_lflag |= MNT_LITER;
438
439 }
440
441 static int
442 vnode_umount_preflight(mount_t mp, vnode_t skipvp, int flags)
443 {
444 vnode_t vp;
445
446 TAILQ_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) {
447 /* disable preflight only for udf, a hack to be removed after 4073176 is fixed */
448 if (vp->v_tag == VT_UDF)
449 return 0;
450 if (vp->v_type == VDIR)
451 continue;
452 if (vp == skipvp)
453 continue;
454 if ((flags & SKIPSYSTEM) && ((vp->v_flag & VSYSTEM) ||
455 (vp->v_flag & VNOFLUSH)))
456 continue;
457 if ((flags & SKIPSWAP) && (vp->v_flag & VSWAP))
458 continue;
459 if ((flags & WRITECLOSE) &&
460 (vp->v_writecount == 0 || vp->v_type != VREG))
461 continue;
462 /* Look for busy vnode */
463 if (((vp->v_usecount != 0) &&
464 ((vp->v_usecount - vp->v_kusecount) != 0)))
465 return(1);
466 }
467
468 return(0);
469 }
470
471 /*
472 * This routine prepares for iteration by moving all the vnodes to the worker queue.
473 * Called with the mount lock held.
474 */
475 int
476 vnode_iterate_prepare(mount_t mp)
477 {
478 vnode_t vp;
479
480 if (TAILQ_EMPTY(&mp->mnt_vnodelist)) {
481 /* nothing to do */
482 return (0);
483 }
484
485 vp = TAILQ_FIRST(&mp->mnt_vnodelist);
486 vp->v_mntvnodes.tqe_prev = &(mp->mnt_workerqueue.tqh_first);
487 mp->mnt_workerqueue.tqh_first = mp->mnt_vnodelist.tqh_first;
488 mp->mnt_workerqueue.tqh_last = mp->mnt_vnodelist.tqh_last;
489
490 TAILQ_INIT(&mp->mnt_vnodelist);
491 if (mp->mnt_newvnodes.tqh_first != NULL)
492 panic("vnode_iterate_prepare: newvnode when entering vnode");
493 TAILQ_INIT(&mp->mnt_newvnodes);
494
495 return (1);
496 }
497
498
499 /* called with mount lock held */
500 int
501 vnode_iterate_reloadq(mount_t mp)
502 {
503 int moved = 0;
504
505 /* add the remaining entries in workerq to the end of mount vnode list */
506 if (!TAILQ_EMPTY(&mp->mnt_workerqueue)) {
507 struct vnode * mvp;
508 mvp = TAILQ_LAST(&mp->mnt_vnodelist, vnodelst);
509
510 /* Join the worker queue entries to the mount vnode list */
511 if (mvp)
512 mvp->v_mntvnodes.tqe_next = mp->mnt_workerqueue.tqh_first;
513 else
514 mp->mnt_vnodelist.tqh_first = mp->mnt_workerqueue.tqh_first;
515 mp->mnt_workerqueue.tqh_first->v_mntvnodes.tqe_prev = mp->mnt_vnodelist.tqh_last;
516 mp->mnt_vnodelist.tqh_last = mp->mnt_workerqueue.tqh_last;
517 TAILQ_INIT(&mp->mnt_workerqueue);
518 }
519
520 /* add the newvnodes to the head of mount vnode list */
521 if (!TAILQ_EMPTY(&mp->mnt_newvnodes)) {
522 struct vnode * nlvp;
523 nlvp = TAILQ_LAST(&mp->mnt_newvnodes, vnodelst);
524
525 mp->mnt_newvnodes.tqh_first->v_mntvnodes.tqe_prev = &mp->mnt_vnodelist.tqh_first;
526 nlvp->v_mntvnodes.tqe_next = mp->mnt_vnodelist.tqh_first;
527 if(mp->mnt_vnodelist.tqh_first)
528 mp->mnt_vnodelist.tqh_first->v_mntvnodes.tqe_prev = &nlvp->v_mntvnodes.tqe_next;
529 else
530 mp->mnt_vnodelist.tqh_last = mp->mnt_newvnodes.tqh_last;
531 mp->mnt_vnodelist.tqh_first = mp->mnt_newvnodes.tqh_first;
532 TAILQ_INIT(&mp->mnt_newvnodes);
533 moved = 1;
534 }
535
536 return(moved);
537 }
538
539
540 void
541 vnode_iterate_clear(mount_t mp)
542 {
543 mp->mnt_lflag &= ~MNT_LITER;
544 if (mp->mnt_lflag & MNT_LITERWAIT) {
545 mp->mnt_lflag &= ~MNT_LITERWAIT;
546 wakeup(mp);
547 }
548 }
549
550
551 int
552 vnode_iterate(mount_t mp, int flags, int (*callout)(struct vnode *, void *),
553 void *arg)
554 {
555 struct vnode *vp;
556 int vid, retval;
557 int ret = 0;
558
559 mount_lock(mp);
560
561 vnode_iterate_setup(mp);
562
563 /* if it returns 0 then there is nothing to do */
564 retval = vnode_iterate_prepare(mp);
565
566 if (retval == 0) {
567 vnode_iterate_clear(mp);
568 mount_unlock(mp);
569 return(ret);
570 }
571
572 /* iterate over all the vnodes */
573 while (!TAILQ_EMPTY(&mp->mnt_workerqueue)) {
574 vp = TAILQ_FIRST(&mp->mnt_workerqueue);
575 TAILQ_REMOVE(&mp->mnt_workerqueue, vp, v_mntvnodes);
576 TAILQ_INSERT_TAIL(&mp->mnt_vnodelist, vp, v_mntvnodes);
577 vid = vp->v_id;
578 if ((vp->v_data == NULL) || (vp->v_type == VNON) || (vp->v_mount != mp)) {
579 continue;
580 }
581 mount_unlock(mp);
582
583 if ( vget_internal(vp, vid, (flags | VNODE_NODEAD| VNODE_WITHID | VNODE_NOSUSPEND))) {
584 mount_lock(mp);
585 continue;
586 }
587 if (flags & VNODE_RELOAD) {
588 /*
589 * we're reloading the filesystem
590 * cast out any inactive vnodes...
591 */
592 if (vnode_reload(vp)) {
593 /* vnode will be recycled on the refcount drop */
594 vnode_put(vp);
595 mount_lock(mp);
596 continue;
597 }
598 }
599
600 retval = callout(vp, arg);
601
602 switch (retval) {
603 case VNODE_RETURNED:
604 case VNODE_RETURNED_DONE:
605 vnode_put(vp);
606 if (retval == VNODE_RETURNED_DONE) {
607 mount_lock(mp);
608 ret = 0;
609 goto out;
610 }
611 break;
612
613 case VNODE_CLAIMED_DONE:
614 mount_lock(mp);
615 ret = 0;
616 goto out;
617 case VNODE_CLAIMED:
618 default:
619 break;
620 }
621 mount_lock(mp);
622 }
623
624 out:
625 (void)vnode_iterate_reloadq(mp);
626 vnode_iterate_clear(mp);
627 mount_unlock(mp);
628 return (ret);
629 }
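/*
 * Usage sketch (illustrative only, not part of the original source; the
 * example_* names are hypothetical): a vnode_iterate() callout returns
 * VNODE_RETURNED to continue or VNODE_RETURNED_DONE to stop early; the
 * iterator itself takes and drops the per-vnode iocount.
 */
#if 0
static int
example_count_regular(struct vnode *vp, void *arg)
{
	int *count = (int *)arg;

	if (vp->v_type == VREG)
		(*count)++;
	return (VNODE_RETURNED);	/* iterator drops the iocount */
}

static int
example_count_mount(mount_t mp)
{
	int count = 0;

	(void)vnode_iterate(mp, 0, example_count_regular, &count);
	return (count);
}
#endif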
630
631 void
632 mount_lock_renames(mount_t mp)
633 {
634 lck_mtx_lock(&mp->mnt_renamelock);
635 }
636
637 void
638 mount_unlock_renames(mount_t mp)
639 {
640 lck_mtx_unlock(&mp->mnt_renamelock);
641 }
642
643 void
644 mount_lock(mount_t mp)
645 {
646 lck_mtx_lock(&mp->mnt_mlock);
647 }
648
649 void
650 mount_unlock(mount_t mp)
651 {
652 lck_mtx_unlock(&mp->mnt_mlock);
653 }
654
655
656 void
657 mount_ref(mount_t mp, int locked)
658 {
659 if ( !locked)
660 mount_lock(mp);
661
662 mp->mnt_count++;
663
664 if ( !locked)
665 mount_unlock(mp);
666 }
667
668
669 void
670 mount_drop(mount_t mp, int locked)
671 {
672 if ( !locked)
673 mount_lock(mp);
674
675 mp->mnt_count--;
676
677 if (mp->mnt_count == 0 && (mp->mnt_lflag & MNT_LDRAIN))
678 wakeup(&mp->mnt_lflag);
679
680 if ( !locked)
681 mount_unlock(mp);
682 }
683
684
685 int
686 mount_iterref(mount_t mp, int locked)
687 {
688 int retval = 0;
689
690 if (!locked)
691 mount_list_lock();
692 if (mp->mnt_iterref < 0) {
693 retval = 1;
694 } else {
695 mp->mnt_iterref++;
696 }
697 if (!locked)
698 mount_list_unlock();
699 return(retval);
700 }
701
702 int
703 mount_isdrained(mount_t mp, int locked)
704 {
705 int retval;
706
707 if (!locked)
708 mount_list_lock();
709 if (mp->mnt_iterref < 0)
710 retval = 1;
711 else
712 retval = 0;
713 if (!locked)
714 mount_list_unlock();
715 return(retval);
716 }
717
718 void
719 mount_iterdrop(mount_t mp)
720 {
721 mount_list_lock();
722 mp->mnt_iterref--;
723 wakeup(&mp->mnt_iterref);
724 mount_list_unlock();
725 }
726
727 void
728 mount_iterdrain(mount_t mp)
729 {
730 mount_list_lock();
731 while (mp->mnt_iterref)
732 msleep((caddr_t)&mp->mnt_iterref, mnt_list_mtx_lock, PVFS, "mount_iterdrain", NULL);
733 /* mount iterations drained */
734 mp->mnt_iterref = -1;
735 mount_list_unlock();
736 }
737 void
738 mount_iterreset(mount_t mp)
739 {
740 mount_list_lock();
741 if (mp->mnt_iterref == -1)
742 mp->mnt_iterref = 0;
743 mount_list_unlock();
744 }
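/*
 * Usage sketch (illustrative only, not part of the original source): the
 * mnt_iterref counter gates mount-list walkers -- draining parks it at
 * -1 so mount_iterref() fails until a matching mount_iterreset().
 */
#if 0
static void
example_quiesce_iterators(mount_t mp)	/* hypothetical helper */
{
	mount_iterdrain(mp);	/* wait out walkers, then block new ones */
	/* ... mutate state that iterators must not observe ... */
	mount_iterreset(mp);	/* allow mount_iterref() to succeed again */
}
#endif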
745
746 /* always called with mount lock held */
747 int
748 mount_refdrain(mount_t mp)
749 {
750 if (mp->mnt_lflag & MNT_LDRAIN)
751 panic("already in drain");
752 mp->mnt_lflag |= MNT_LDRAIN;
753
754 while (mp->mnt_count)
755 msleep((caddr_t)&mp->mnt_lflag, &mp->mnt_mlock, PVFS, "mount_drain", NULL);
756
757 if (mp->mnt_vnodelist.tqh_first != NULL)
758 panic("mount_refdrain: dangling vnode");
759
760 mp->mnt_lflag &= ~MNT_LDRAIN;
761
762 return(0);
763 }
764
765
766 /*
767 * Mark a mount point as busy. Used to synchronize access and to delay
768 * unmounting.
769 */
770 int
771 vfs_busy(mount_t mp, int flags)
772 {
773
774 restart:
775 if (mp->mnt_lflag & MNT_LDEAD)
776 return(ENOENT);
777
778 if (mp->mnt_lflag & MNT_LUNMOUNT) {
779 if (flags & LK_NOWAIT)
780 return (ENOENT);
781
782 mount_lock(mp);
783
784 if (mp->mnt_lflag & MNT_LDEAD) {
785 mount_unlock(mp);
786 return(ENOENT);
787 }
788 if (mp->mnt_lflag & MNT_LUNMOUNT) {
789 mp->mnt_lflag |= MNT_LWAIT;
790 /*
791 * Since all busy locks are shared except the exclusive
792 * lock granted when unmounting, the only place that a
793 * wakeup needs to be done is at the release of the
794 * exclusive lock at the end of dounmount.
795 */
796 msleep((caddr_t)mp, &mp->mnt_mlock, (PVFS | PDROP), "vfsbusy", NULL);
797 return (ENOENT);
798 }
799 mount_unlock(mp);
800 }
801
802 lck_rw_lock_shared(&mp->mnt_rwlock);
803
804 /*
805 * until we are granted the rwlock, it's possible for the mount point to
806 * change state, so reevaluate before granting the vfs_busy
807 */
808 if (mp->mnt_lflag & (MNT_LDEAD | MNT_LUNMOUNT)) {
809 lck_rw_done(&mp->mnt_rwlock);
810 goto restart;
811 }
812 return (0);
813 }
814
815 /*
816 * Free a busy filesystem.
817 */
818
819 void
820 vfs_unbusy(mount_t mp)
821 {
822 lck_rw_done(&mp->mnt_rwlock);
823 }
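/*
 * Usage sketch (illustrative only, not part of the original source):
 * vfs_busy()/vfs_unbusy() bracket work that must not race an unmount.
 */
#if 0
static int
example_with_mount(mount_t mp)	/* hypothetical helper */
{
	int error;

	if ((error = vfs_busy(mp, LK_NOWAIT)))
		return (error);	/* mount is dead or being unmounted */

	/* ... safely inspect mount state here ... */

	vfs_unbusy(mp);
	return (0);
}
#endif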
824
825
826
827 static void
828 vfs_rootmountfailed(mount_t mp) {
829
830 mount_list_lock();
831 mp->mnt_vtable->vfc_refcount--;
832 mount_list_unlock();
833
834 vfs_unbusy(mp);
835
836 mount_lock_destroy(mp);
837
838 #if CONFIG_MACF
839 mac_mount_label_destroy(mp);
840 #endif
841
842 FREE_ZONE(mp, sizeof(struct mount), M_MOUNT);
843 }
844
845 /*
846 * Lookup a filesystem type, and if found allocate and initialize
847 * a mount structure for it.
848 *
849 * Devname is usually updated by mount(8) after booting.
850 */
851 static mount_t
852 vfs_rootmountalloc_internal(struct vfstable *vfsp, const char *devname)
853 {
854 mount_t mp;
855
856 mp = _MALLOC_ZONE((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK);
857 bzero((char *)mp, (u_long)sizeof(struct mount));
858
859 /* Initialize the default IO constraints */
860 mp->mnt_maxreadcnt = mp->mnt_maxwritecnt = MAXPHYS;
861 mp->mnt_segreadcnt = mp->mnt_segwritecnt = 32;
862 mp->mnt_maxsegreadsize = mp->mnt_maxreadcnt;
863 mp->mnt_maxsegwritesize = mp->mnt_maxwritecnt;
864 mp->mnt_devblocksize = DEV_BSIZE;
865 mp->mnt_alignmentmask = PAGE_MASK;
866 mp->mnt_ioflags = 0;
867 mp->mnt_realrootvp = NULLVP;
868 mp->mnt_authcache_ttl = CACHED_LOOKUP_RIGHT_TTL;
869
870 mount_lock_init(mp);
871 (void)vfs_busy(mp, LK_NOWAIT);
872
873 TAILQ_INIT(&mp->mnt_vnodelist);
874 TAILQ_INIT(&mp->mnt_workerqueue);
875 TAILQ_INIT(&mp->mnt_newvnodes);
876
877 mp->mnt_vtable = vfsp;
878 mp->mnt_op = vfsp->vfc_vfsops;
879 mp->mnt_flag = MNT_RDONLY | MNT_ROOTFS;
880 mp->mnt_vnodecovered = NULLVP;
881 //mp->mnt_stat.f_type = vfsp->vfc_typenum;
882 mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
883
884 mount_list_lock();
885 vfsp->vfc_refcount++;
886 mount_list_unlock();
887
888 strncpy(mp->mnt_vfsstat.f_fstypename, vfsp->vfc_name, MFSTYPENAMELEN);
889 mp->mnt_vfsstat.f_mntonname[0] = '/';
890 /* XXX const poisoning layering violation */
891 (void) copystr((const void *)devname, mp->mnt_vfsstat.f_mntfromname, MAXPATHLEN - 1, NULL);
892
893 #if CONFIG_MACF
894 mac_mount_label_init(mp);
895 mac_mount_label_associate(vfs_context_kernel(), mp);
896 #endif
897 return (mp);
898 }
899
900 errno_t
901 vfs_rootmountalloc(const char *fstypename, const char *devname, mount_t *mpp)
902 {
903 struct vfstable *vfsp;
904
905 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
906 if (!strncmp(vfsp->vfc_name, fstypename,
907 sizeof(vfsp->vfc_name)))
908 break;
909 if (vfsp == NULL)
910 return (ENODEV);
911
912 *mpp = vfs_rootmountalloc_internal(vfsp, devname);
913
914 if (*mpp)
915 return (0);
916
917 return (ENOMEM);
918 }
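/*
 * Usage sketch (illustrative only, not part of the original source; the
 * filesystem name here is an arbitrary choice): netboot-style callers
 * allocate a root mount by fs type name; the mount comes back with
 * vfs_busy() already taken, to be released once mounted or on failure.
 */
#if 0
static int
example_alloc_root(mount_t *mpp)
{
	int error;

	if ((error = vfs_rootmountalloc("hfs", "root_device", mpp)))
		return (error);		/* ENODEV if the fs type is unknown */
	/* ... attempt the mount; call vfs_unbusy(*mpp) when done ... */
	return (0);
}
#endif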
919
920
921 /*
922 * Find an appropriate filesystem to use for the root. If a filesystem
923 * has not been preselected, walk through the list of known filesystems
924 * trying those that have mountroot routines, and try them until one
925 * works or we have tried them all.
926 */
927 extern int (*mountroot)(void);
928
929 int
930 vfs_mountroot(void)
931 {
932 #if CONFIG_MACF
933 struct vnode *vp;
934 #endif
935 struct vfstable *vfsp;
936 vfs_context_t ctx = vfs_context_kernel();
937 struct vfs_attr vfsattr;
938 int error;
939 mount_t mp;
940 vnode_t bdevvp_rootvp;
941
942 if (mountroot != NULL) {
943 /*
944 * used for netboot which follows a different set of rules
945 */
946 error = (*mountroot)();
947 return (error);
948 }
949 if ((error = bdevvp(rootdev, &rootvp))) {
950 printf("vfs_mountroot: can't setup bdevvp\n");
951 return (error);
952 }
953 /*
954 * 4951998 - code we call in vfc_mountroot may replace rootvp
955 * so keep a local copy for some house keeping.
956 */
957 bdevvp_rootvp = rootvp;
958
959 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
960 if (vfsp->vfc_mountroot == NULL)
961 continue;
962
963 mp = vfs_rootmountalloc_internal(vfsp, "root_device");
964 mp->mnt_devvp = rootvp;
965
966 if ((error = (*vfsp->vfc_mountroot)(mp, rootvp, ctx)) == 0) {
967 if ( bdevvp_rootvp != rootvp ) {
968 /*
969 * rootvp changed...
970 * bump the iocount and fix up mnt_devvp for the
971 * new rootvp (it will already have a usecount taken)...
972 * drop the iocount and the usecount on the original
973 * since we are no longer going to use it...
974 */
975 vnode_getwithref(rootvp);
976 mp->mnt_devvp = rootvp;
977
978 vnode_rele(bdevvp_rootvp);
979 vnode_put(bdevvp_rootvp);
980 }
981 mp->mnt_devvp->v_specflags |= SI_MOUNTEDON;
982
983 vfs_unbusy(mp);
984
985 mount_list_add(mp);
986
987 /*
988 * cache the IO attributes for the underlying physical media...
989 * an error return indicates the underlying driver doesn't
990 * support all the queries necessary... however, reasonable
991 * defaults will have been set, so no reason to bail or care
992 */
993 vfs_init_io_attributes(rootvp, mp);
994
995 /*
996 * Shadow the VFC_VFSNATIVEXATTR flag to MNTK_EXTENDED_ATTRS.
997 */
998 if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSNATIVEXATTR) {
999 mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
1000 }
1001 if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSPREFLIGHT) {
1002 mp->mnt_kern_flag |= MNTK_UNMOUNT_PREFLIGHT;
1003 }
1004
1005 /*
1006 * Probe root file system for additional features.
1007 */
1008 (void)VFS_START(mp, 0, ctx);
1009
1010 VFSATTR_INIT(&vfsattr);
1011 VFSATTR_WANTED(&vfsattr, f_capabilities);
1012 if (vfs_getattr(mp, &vfsattr, ctx) == 0 &&
1013 VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
1014 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR) &&
1015 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR)) {
1016 mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
1017 }
1018 #if NAMEDSTREAMS
1019 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS) &&
1020 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS)) {
1021 mp->mnt_kern_flag |= MNTK_NAMED_STREAMS;
1022 }
1023 #endif
1024 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID) &&
1025 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID)) {
1026 mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
1027 }
1028 }
1029
1030 /*
1031 * get rid of iocount reference returned
1032 * by bdevvp (or picked up by us on the substituted
1033 * rootvp)... it (or we) will have also taken
1034 * a usecount reference which we want to keep
1035 */
1036 vnode_put(rootvp);
1037
1038 #if CONFIG_MACF
1039 if ((vfs_flags(mp) & MNT_MULTILABEL) == 0)
1040 return (0);
1041
1042 error = VFS_ROOT(mp, &vp, ctx);
1043 if (error) {
1044 printf("%s() VFS_ROOT() returned %d\n",
1045 __func__, error);
1046 dounmount(mp, MNT_FORCE, 0, ctx);
1047 goto fail;
1048 }
1049
1050 /* VFS_ROOT provides reference so flags = 0 */
1051 error = vnode_label(mp, NULL, vp, NULL, 0, ctx);
1052 if (error) {
1053 printf("%s() vnode_label() returned %d\n",
1054 __func__, error);
1055 dounmount(mp, MNT_FORCE, 0, ctx);
1056 goto fail;
1057 }
1058 #endif
1059 return (0);
1060 }
1061 #if CONFIG_MACF
1062 fail:
1063 #endif
1064 vfs_rootmountfailed(mp);
1065
1066 if (error != EINVAL)
1067 printf("%s_mountroot failed: %d\n", vfsp->vfc_name, error);
1068 }
1069 return (ENODEV);
1070 }
1071
1072 /*
1073 * Lookup a mount point by filesystem identifier.
1074 */
1075 extern mount_t vfs_getvfs_locked(fsid_t *);
1076
1077 struct mount *
1078 vfs_getvfs(fsid_t *fsid)
1079 {
1080 return (mount_list_lookupby_fsid(fsid, 0, 0));
1081 }
1082
1083 struct mount *
1084 vfs_getvfs_locked(fsid_t *fsid)
1085 {
1086 return(mount_list_lookupby_fsid(fsid, 1, 0));
1087 }
1088
1089 struct mount *
1090 vfs_getvfs_by_mntonname(char *path)
1091 {
1092 mount_t retmp = (mount_t)0;
1093 mount_t mp;
1094
1095 mount_list_lock();
1096 TAILQ_FOREACH(mp, &mountlist, mnt_list) {
1097 if (!strncmp(mp->mnt_vfsstat.f_mntonname, path,
1098 sizeof(mp->mnt_vfsstat.f_mntonname))) {
1099 retmp = mp;
1100 goto out;
1101 }
1102 }
1103 out:
1104 mount_list_unlock();
1105 return (retmp);
1106 }
1107
1108 /* generation number for creation of new fsids */
1109 u_short mntid_gen = 0;
1110 /*
1111 * Get a new unique fsid
1112 */
1113 void
1114 vfs_getnewfsid(struct mount *mp)
1115 {
1116
1117 fsid_t tfsid;
1118 int mtype;
1119 mount_t nmp;
1120
1121 mount_list_lock();
1122
1123 /* generate a new fsid */
1124 mtype = mp->mnt_vtable->vfc_typenum;
1125 if (++mntid_gen == 0)
1126 mntid_gen++;
1127 tfsid.val[0] = makedev(nblkdev + mtype, mntid_gen);
1128 tfsid.val[1] = mtype;
1129
1130 TAILQ_FOREACH(nmp, &mountlist, mnt_list) {
1131 while (vfs_getvfs_locked(&tfsid)) {
1132 if (++mntid_gen == 0)
1133 mntid_gen++;
1134 tfsid.val[0] = makedev(nblkdev + mtype, mntid_gen);
1135 }
1136 }
1137 mp->mnt_vfsstat.f_fsid.val[0] = tfsid.val[0];
1138 mp->mnt_vfsstat.f_fsid.val[1] = tfsid.val[1];
1139 mount_list_unlock();
1140 }
1141
1142 /*
1143 * Routines having to do with the management of the vnode table.
1144 */
1145 extern int (**dead_vnodeop_p)(void *);
1146 long numvnodes, freevnodes, deadvnodes;
1147
1148
1149 /*
1150 * Move a vnode from one mount queue to another.
1151 */
1152 static void
1153 insmntque(vnode_t vp, mount_t mp)
1154 {
1155 mount_t lmp;
1156 /*
1157 * Delete from old mount point vnode list, if on one.
1158 */
1159 if ( (lmp = vp->v_mount) != NULL && lmp != dead_mountp) {
1160 if ((vp->v_lflag & VNAMED_MOUNT) == 0)
1161 panic("insmntque: vp not in mount vnode list");
1162 vp->v_lflag &= ~VNAMED_MOUNT;
1163
1164 mount_lock(lmp);
1165
1166 mount_drop(lmp, 1);
1167
1168 if (vp->v_mntvnodes.tqe_next == NULL) {
1169 if (TAILQ_LAST(&lmp->mnt_vnodelist, vnodelst) == vp)
1170 TAILQ_REMOVE(&lmp->mnt_vnodelist, vp, v_mntvnodes);
1171 else if (TAILQ_LAST(&lmp->mnt_newvnodes, vnodelst) == vp)
1172 TAILQ_REMOVE(&lmp->mnt_newvnodes, vp, v_mntvnodes);
1173 else if (TAILQ_LAST(&lmp->mnt_workerqueue, vnodelst) == vp)
1174 TAILQ_REMOVE(&lmp->mnt_workerqueue, vp, v_mntvnodes);
1175 } else {
1176 vp->v_mntvnodes.tqe_next->v_mntvnodes.tqe_prev = vp->v_mntvnodes.tqe_prev;
1177 *vp->v_mntvnodes.tqe_prev = vp->v_mntvnodes.tqe_next;
1178 }
1179 vp->v_mntvnodes.tqe_next = NULL;
1180 vp->v_mntvnodes.tqe_prev = NULL;
1181 mount_unlock(lmp);
1182 return;
1183 }
1184
1185 /*
1186 * Insert into list of vnodes for the new mount point, if available.
1187 */
1188 if ((vp->v_mount = mp) != NULL) {
1189 mount_lock(mp);
1190 if ((vp->v_mntvnodes.tqe_next != 0) && (vp->v_mntvnodes.tqe_prev != 0))
1191 panic("vp already in mount list");
1192 if (mp->mnt_lflag & MNT_LITER)
1193 TAILQ_INSERT_HEAD(&mp->mnt_newvnodes, vp, v_mntvnodes);
1194 else
1195 TAILQ_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes);
1196 if (vp->v_lflag & VNAMED_MOUNT)
1197 panic("insmntque: vp already in mount vnode list");
1198 vp->v_lflag |= VNAMED_MOUNT;
1199 mount_ref(mp, 1);
1200 mount_unlock(mp);
1201 }
1202 }
1203
1204
1205 /*
1206 * Create a vnode for a block device.
1207 * Used for root filesystem, argdev, and swap areas.
1208 * Also used for memory file system special devices.
1209 */
1210 int
1211 bdevvp(dev_t dev, vnode_t *vpp)
1212 {
1213 vnode_t nvp;
1214 int error;
1215 struct vnode_fsparam vfsp;
1216 struct vfs_context context;
1217
1218 if (dev == NODEV) {
1219 *vpp = NULLVP;
1220 return (ENODEV);
1221 }
1222
1223 context.vc_thread = current_thread();
1224 context.vc_ucred = FSCRED;
1225
1226 vfsp.vnfs_mp = (struct mount *)0;
1227 vfsp.vnfs_vtype = VBLK;
1228 vfsp.vnfs_str = "bdevvp";
1229 vfsp.vnfs_dvp = NULL;
1230 vfsp.vnfs_fsnode = NULL;
1231 vfsp.vnfs_cnp = NULL;
1232 vfsp.vnfs_vops = spec_vnodeop_p;
1233 vfsp.vnfs_rdev = dev;
1234 vfsp.vnfs_filesize = 0;
1235
1236 vfsp.vnfs_flags = VNFS_NOCACHE | VNFS_CANTCACHE;
1237
1238 vfsp.vnfs_marksystem = 0;
1239 vfsp.vnfs_markroot = 0;
1240
1241 if ( (error = vnode_create(VNCREATE_FLAVOR, VCREATESIZE, &vfsp, &nvp)) ) {
1242 *vpp = NULLVP;
1243 return (error);
1244 }
1245 vnode_lock_spin(nvp);
1246 nvp->v_flag |= VBDEVVP;
1247 nvp->v_tag = VT_NON; /* set this to VT_NON so during aliasing it can be replaced */
1248 vnode_unlock(nvp);
1249 if ( (error = vnode_ref(nvp)) ) {
1250 panic("bdevvp failed: vnode_ref");
1251 return (error);
1252 }
1253 if ( (error = VNOP_FSYNC(nvp, MNT_WAIT, &context)) ) {
1254 panic("bdevvp failed: fsync");
1255 return (error);
1256 }
1257 if ( (error = buf_invalidateblks(nvp, BUF_WRITE_DATA, 0, 0)) ) {
1258 panic("bdevvp failed: invalidateblks");
1259 return (error);
1260 }
1261
1262 #if CONFIG_MACF
1263 /*
1264 * XXXMAC: We can't put a MAC check here, the system will
1265 * panic without this vnode.
1266 */
1267 #endif /* MAC */
1268
1269 if ( (error = VNOP_OPEN(nvp, FREAD, &context)) ) {
1270 panic("bdevvp failed: open");
1271 return (error);
1272 }
1273 *vpp = nvp;
1274
1275 return (0);
1276 }
1277
1278 /*
1279 * Check to see if the new vnode represents a special device
1280 * for which we already have a vnode (either because of
1281 * bdevvp() or because of a different vnode representing
1282 * the same block device). If such an alias exists, deallocate
1283 * the existing contents and return the aliased vnode. The
1284 * caller is responsible for filling it with its new contents.
1285 */
1286 static vnode_t
1287 checkalias(struct vnode *nvp, dev_t nvp_rdev)
1288 {
1289 struct vnode *vp;
1290 struct vnode **vpp;
1291 int vid = 0;
1292
1293 vpp = &speclisth[SPECHASH(nvp_rdev)];
1294 loop:
1295 SPECHASH_LOCK();
1296
1297 for (vp = *vpp; vp; vp = vp->v_specnext) {
1298 if (nvp_rdev == vp->v_rdev && nvp->v_type == vp->v_type) {
1299 vid = vp->v_id;
1300 break;
1301 }
1302 }
1303 SPECHASH_UNLOCK();
1304
1305 if (vp) {
1306 if (vnode_getwithvid(vp,vid)) {
1307 goto loop;
1308 }
1309 /*
1310 * Termination state is checked in vnode_getwithvid
1311 */
1312 vnode_lock(vp);
1313
1314 /*
1315 * Alias, but not in use, so flush it out.
1316 */
1317 if ((vp->v_iocount == 1) && (vp->v_usecount == 0)) {
1318 vnode_reclaim_internal(vp, 1, 1, 0);
1319 vnode_put_locked(vp);
1320 vnode_unlock(vp);
1321 goto loop;
1322 }
1323 }
1324 if (vp == NULL || vp->v_tag != VT_NON) {
1325 retnullvp:
1326 MALLOC_ZONE(nvp->v_specinfo, struct specinfo *, sizeof(struct specinfo),
1327 M_SPECINFO, M_WAITOK);
1328 bzero(nvp->v_specinfo, sizeof(struct specinfo));
1329 nvp->v_rdev = nvp_rdev;
1330 nvp->v_specflags = 0;
1331 nvp->v_speclastr = -1;
1332
1333 SPECHASH_LOCK();
1334 nvp->v_hashchain = vpp;
1335 nvp->v_specnext = *vpp;
1336 *vpp = nvp;
1337 SPECHASH_UNLOCK();
1338
1339 if (vp != NULLVP) {
1340 nvp->v_flag |= VALIASED;
1341 vp->v_flag |= VALIASED;
1342 vnode_put_locked(vp);
1343 vnode_unlock(vp);
1344 }
1345 return (NULLVP);
1346 }
1347 if ((vp->v_flag & (VBDEVVP | VDEVFLUSH)) != 0)
1348 return(vp);
1349 else {
1350 panic("checkalias with VT_NON vp that shouldn't: %x", (unsigned int)vp);
1351 goto retnullvp;
1352 }
1353 return (vp);
1354 }
1355
1356
1357 /*
1358 * Get a reference on a particular vnode and lock it if requested.
1359 * If the vnode was on the inactive list, remove it from the list.
1360 * If the vnode was on the free list, remove it from the list and
1361 * move it to inactive list as needed.
1362 * The vnode lock bit is set if the vnode is being eliminated in
1363 * vgone. The process is awakened when the transition is completed,
1364 * and an error returned to indicate that the vnode is no longer
1365 * usable (possibly having been changed to a new file system type).
1366 */
1367 static int
1368 vget_internal(vnode_t vp, int vid, int vflags)
1369 {
1370 int error = 0;
1371 int vpid;
1372
1373 vnode_lock_spin(vp);
1374
1375 if (vflags & VNODE_WITHID)
1376 vpid = vid;
1377 else
1378 vpid = vp->v_id; // save off the original v_id
1379
1380 if ((vflags & VNODE_WRITEABLE) && (vp->v_writecount == 0))
1381 /*
1382 * vnode to be returned only if it has writers opened
1383 */
1384 error = EINVAL;
1385 else
1386 error = vnode_getiocount(vp, vpid, vflags);
1387
1388 vnode_unlock(vp);
1389
1390 return (error);
1391 }
1392
1393 /*
1394 * Returns: 0 Success
1395 * ENOENT No such file or directory [terminating]
1396 */
1397 int
1398 vnode_ref(vnode_t vp)
1399 {
1400
1401 return (vnode_ref_ext(vp, 0));
1402 }
1403
1404 /*
1405 * Returns: 0 Success
1406 * ENOENT No such file or directory [terminating]
1407 */
1408 int
1409 vnode_ref_ext(vnode_t vp, int fmode)
1410 {
1411 int error = 0;
1412
1413 vnode_lock_spin(vp);
1414
1415 /*
1416 * once all the current call sites have been fixed to ensure they have
1417 * taken an iocount, we can toughen this assert up and insist that the
1418 * iocount is non-zero... a non-zero usecount doesn't ensure correctness
1419 */
1420 if (vp->v_iocount <= 0 && vp->v_usecount <= 0)
1421 panic("vnode_ref_ext: vp %p has no valid reference %d, %d", vp, vp->v_iocount, vp->v_usecount);
1422
1423 /*
1424 * if you are the owner of drain/termination, can acquire usecount
1425 */
1426 if ((vp->v_lflag & (VL_DRAIN | VL_TERMINATE | VL_DEAD))) {
1427 if (vp->v_owner != current_thread()) {
1428 error = ENOENT;
1429 goto out;
1430 }
1431 }
1432 vp->v_usecount++;
1433
1434 if (fmode & FWRITE) {
1435 if (++vp->v_writecount <= 0)
1436 panic("vnode_ref_ext: v_writecount");
1437 }
1438 if (fmode & O_EVTONLY) {
1439 if (++vp->v_kusecount <= 0)
1440 panic("vnode_ref_ext: v_kusecount");
1441 }
1442 if (vp->v_flag & VRAGE) {
1443 struct uthread *ut;
1444
1445 ut = get_bsdthread_info(current_thread());
1446
1447 if ( !(current_proc()->p_lflag & P_LRAGE_VNODES) &&
1448 !(ut->uu_flag & UT_RAGE_VNODES)) {
1449 /*
1450 * a 'normal' process accessed this vnode
1451 * so make sure its no longer marked
1452 * for rapid aging... also, make sure
1453 * it gets removed from the rage list...
1454 * when v_usecount drops back to 0, it
1455 * will be put back on the real free list
1456 */
1457 vp->v_flag &= ~VRAGE;
1458 vp->v_references = 0;
1459 vnode_list_remove(vp);
1460 }
1461 }
1462 out:
1463 vnode_unlock(vp);
1464
1465 return (error);
1466 }
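/*
 * Usage sketch (illustrative only, not part of the original source):
 * vnode_ref() must be called with an iocount already held (e.g. from
 * vnode_get()); the resulting usecount is what keeps the vnode from
 * being reclaimed after the transient iocount is dropped.
 */
#if 0
static int
example_take_longterm_ref(vnode_t vp)	/* hypothetical helper */
{
	int error;

	if ((error = vnode_get(vp)))	/* transient iocount */
		return (error);
	error = vnode_ref(vp);		/* long-term usecount */
	vnode_put(vp);			/* drop the iocount */
	/* ... a later vnode_rele(vp) balances the vnode_ref() ... */
	return (error);
}
#endif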
1467
1468
1469 /*
1470 * put the vnode on appropriate free list.
1471 * called with vnode LOCKED
1472 */
1473 static void
1474 vnode_list_add(vnode_t vp)
1475 {
1476 /*
1477 * if it is already on a list or non zero references return
1478 */
1479 if (VONLIST(vp) || (vp->v_usecount != 0) || (vp->v_iocount != 0))
1480 return;
1481 vnode_list_lock();
1482
1483 if ((vp->v_flag & VRAGE) && !(vp->v_lflag & VL_DEAD)) {
1484 /*
1485 * add the new guy to the appropriate end of the RAGE list
1486 */
1487 if ((vp->v_flag & VAGE))
1488 TAILQ_INSERT_HEAD(&vnode_rage_list, vp, v_freelist);
1489 else
1490 TAILQ_INSERT_TAIL(&vnode_rage_list, vp, v_freelist);
1491
1492 vp->v_listflag |= VLIST_RAGE;
1493 ragevnodes++;
1494
1495 /*
1496 * reset the timestamp for the last inserted vp on the RAGE
1497 * queue to let new_vnode know that it's not ok to start stealing
1498 * from this list... as long as we're actively adding to this list
1499 * we'll push out the vnodes we want to donate to the real free list
1500 * once we stop pushing, we'll let some time elapse before we start
1501 * stealing them in the new_vnode routine
1502 */
1503 microuptime(&rage_tv);
1504 } else {
1505 /*
1506 * if VL_DEAD, insert it at head of the dead list
1507 * else insert at tail of LRU list or at head if VAGE is set
1508 */
1509 if ( (vp->v_lflag & VL_DEAD)) {
1510 TAILQ_INSERT_HEAD(&vnode_dead_list, vp, v_freelist);
1511 vp->v_listflag |= VLIST_DEAD;
1512 deadvnodes++;
1513 } else if ((vp->v_flag & VAGE)) {
1514 TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
1515 vp->v_flag &= ~VAGE;
1516 freevnodes++;
1517 } else {
1518 TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
1519 freevnodes++;
1520 }
1521 }
1522 vnode_list_unlock();
1523 }
1524
1525
1526 /*
1527 * remove the vnode from appropriate free list.
1528 * called with vnode LOCKED and
1529 * the list lock held
1530 */
1531 static void
1532 vnode_list_remove_locked(vnode_t vp)
1533 {
1534 if (VONLIST(vp)) {
1535 /*
1536 * the v_listflag field is
1537 * protected by the vnode_list_lock
1538 */
1539 if (vp->v_listflag & VLIST_RAGE)
1540 VREMRAGE("vnode_list_remove", vp);
1541 else if (vp->v_listflag & VLIST_DEAD)
1542 VREMDEAD("vnode_list_remove", vp);
1543 else
1544 VREMFREE("vnode_list_remove", vp);
1545 }
1546 }
1547
1548
1549 /*
1550 * remove the vnode from appropriate free list.
1551 * called with vnode LOCKED
1552 */
1553 static void
1554 vnode_list_remove(vnode_t vp)
1555 {
1556 /*
1557 * we want to avoid taking the list lock
1558 * in the case where we're not on the free
1559 * list... this will be true for most
1560 * directories and any currently in use files
1561 *
1562 * we're guaranteed that we can't go from
1563 * the not-on-list state to the on-list
1564 * state since we hold the vnode lock...
1565 * all calls to vnode_list_add are done
1566 * under the vnode lock... so we can
1567 * check for that condition (the prevalent one)
1568 * without taking the list lock
1569 */
1570 if (VONLIST(vp)) {
1571 vnode_list_lock();
1572 /*
1573 * however, we're not guaranteed that
1574 * we won't go from the on-list state
1575 * to the not-on-list state until we
1576 * hold the vnode_list_lock... this
1577 * is due to "new_vnode" removing vnodes
1578 * from the free list under the list_lock
1579 * w/o the vnode lock... so we need to
1580 * check again whether we're currently
1581 * on the free list
1582 */
1583 vnode_list_remove_locked(vp);
1584
1585 vnode_list_unlock();
1586 }
1587 }
1588
1589
1590 void
1591 vnode_rele(vnode_t vp)
1592 {
1593 vnode_rele_internal(vp, 0, 0, 0);
1594 }
1595
1596
1597 void
1598 vnode_rele_ext(vnode_t vp, int fmode, int dont_reenter)
1599 {
1600 vnode_rele_internal(vp, fmode, dont_reenter, 0);
1601 }
1602
1603
1604 void
1605 vnode_rele_internal(vnode_t vp, int fmode, int dont_reenter, int locked)
1606 {
1607 if ( !locked)
1608 vnode_lock_spin(vp);
1609
1610 if (--vp->v_usecount < 0)
1611 panic("vnode_rele_ext: vp %p usecount -ve : %d", vp, vp->v_usecount);
1612
1613 if (fmode & FWRITE) {
1614 if (--vp->v_writecount < 0)
1615 panic("vnode_rele_ext: vp %p writecount -ve : %ld", vp, vp->v_writecount);
1616 }
1617 if (fmode & O_EVTONLY) {
1618 if (--vp->v_kusecount < 0)
1619 panic("vnode_rele_ext: vp %p kusecount -ve : %d", vp, vp->v_kusecount);
1620 }
1621 if (vp->v_kusecount > vp->v_usecount)
1622 panic("vnode_rele_ext: vp %p kusecount(%d) out of balance with usecount(%d)\n",vp, vp->v_kusecount, vp->v_usecount);
1623 if ((vp->v_iocount > 0) || (vp->v_usecount > 0)) {
1624 /*
1625 * vnode is still busy... if we're the last
1626 * usecount, mark for a future call to VNOP_INACTIVE
1627 * when the iocount finally drops to 0
1628 */
1629 if (vp->v_usecount == 0) {
1630 vp->v_lflag |= VL_NEEDINACTIVE;
1631 vp->v_flag &= ~(VNOCACHE_DATA | VRAOFF | VOPENEVT);
1632 }
1633 if ( !locked)
1634 vnode_unlock(vp);
1635 return;
1636 }
1637 vp->v_flag &= ~(VNOCACHE_DATA | VRAOFF | VOPENEVT);
1638
1639 if ( (vp->v_lflag & (VL_TERMINATE | VL_DEAD)) || dont_reenter) {
1640 /*
1641 * vnode is being cleaned, or
1642 * we've requested that we don't reenter
1643 * the filesystem on this release... in
1644 * this case, we'll mark the vnode aged
1645 * if it's been marked for termination
1646 */
1647 if (dont_reenter) {
1648 if ( !(vp->v_lflag & (VL_TERMINATE | VL_DEAD | VL_MARKTERM)) )
1649 vp->v_lflag |= VL_NEEDINACTIVE;
1650 vp->v_flag |= VAGE;
1651 }
1652 vnode_list_add(vp);
1653 if ( !locked)
1654 vnode_unlock(vp);
1655 return;
1656 }
1657 /*
1658 * at this point both the iocount and usecount
1659 * are zero
1660 * pick up an iocount so that we can call
1661 * VNOP_INACTIVE with the vnode lock unheld
1662 */
1663 vp->v_iocount++;
1664 #ifdef JOE_DEBUG
1665 record_vp(vp, 1);
1666 #endif
1667 vp->v_lflag &= ~VL_NEEDINACTIVE;
1668 vnode_unlock(vp);
1669
1670 VNOP_INACTIVE(vp, vfs_context_current());
1671
1672 vnode_lock_spin(vp);
1673 /*
1674 * because we dropped the vnode lock to call VNOP_INACTIVE
1675 * the state of the vnode may have changed... we may have
1676 * picked up an iocount, usecount or the MARKTERM may have
1677 * been set... we need to reevaluate the reference counts
1678 * to determine if we can call vnode_reclaim_internal at
1679 * this point... if the reference counts are up, we'll pick
1680 * up the MARKTERM state when they get subsequently dropped
1681 */
1682 if ( (vp->v_iocount == 1) && (vp->v_usecount == 0) &&
1683 ((vp->v_lflag & (VL_MARKTERM | VL_TERMINATE | VL_DEAD)) == VL_MARKTERM)) {
1684 struct uthread *ut;
1685
1686 ut = get_bsdthread_info(current_thread());
1687
1688 if (ut->uu_defer_reclaims) {
1689 vp->v_defer_reclaimlist = ut->uu_vreclaims;
1690 ut->uu_vreclaims = vp;
1691 goto defer_reclaim;
1692 }
1693 vnode_lock_convert(vp);
1694 vnode_reclaim_internal(vp, 1, 1, 0);
1695 }
1696 vnode_dropiocount(vp);
1697 vnode_list_add(vp);
1698 defer_reclaim:
1699 if ( !locked)
1700 vnode_unlock(vp);
1701 return;
1702 }
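/*
 * Usage sketch (illustrative only, not part of the original source):
 * close paths hand the original open mode back so the matching counters
 * (v_writecount for FWRITE, v_kusecount for O_EVTONLY) stay balanced.
 */
#if 0
static void
example_close_write_ref(vnode_t vp)	/* hypothetical helper */
{
	vnode_rele_ext(vp, FWRITE, 0);	/* balances vnode_ref_ext(vp, FWRITE) */
}
#endif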
1703
1704 /*
1705 * Remove any vnodes in the vnode table belonging to mount point mp.
1706 *
1707 * If MNT_NOFORCE is specified, there should not be any active ones,
1708 * return error if any are found (nb: this is a user error, not a
1709 * system error). If MNT_FORCE is specified, detach any active vnodes
1710 * that are found.
1711 */
1712 #if DIAGNOSTIC
1713 int busyprt = 0; /* print out busy vnodes */
1714 #if 0
1715 struct ctldebug debug1 = { "busyprt", &busyprt };
1716 #endif /* 0 */
1717 #endif
1718
1719 int
1720 vflush(struct mount *mp, struct vnode *skipvp, int flags)
1721 {
1722 struct vnode *vp;
1723 int busy = 0;
1724 int reclaimed = 0;
1725 int retval;
1726 int vid;
1727
1728 mount_lock(mp);
1729 vnode_iterate_setup(mp);
1730 /*
1731 * On regular unmounts (not forced) do a
1732 * quick check for vnodes to be in use. This
1733 * preserves the caching of vnodes. The automounter
1734 * tries unmounting every so often to see whether
1735 * the filesystem is still busy or not.
1736 */
1737 if (((flags & FORCECLOSE)==0) && ((mp->mnt_kern_flag & MNTK_UNMOUNT_PREFLIGHT) != 0)) {
1738 if (vnode_umount_preflight(mp, skipvp, flags)) {
1739 vnode_iterate_clear(mp);
1740 mount_unlock(mp);
1741 return(EBUSY);
1742 }
1743 }
1744 loop:
1745 /* if it returns 0 then there is nothing to do */
1746 retval = vnode_iterate_prepare(mp);
1747
1748 if (retval == 0) {
1749 vnode_iterate_clear(mp);
1750 mount_unlock(mp);
1751 return(retval);
1752 }
1753
1754 /* iterate over all the vnodes */
1755 while (!TAILQ_EMPTY(&mp->mnt_workerqueue)) {
1756 vp = TAILQ_FIRST(&mp->mnt_workerqueue);
1757 TAILQ_REMOVE(&mp->mnt_workerqueue, vp, v_mntvnodes);
1758 TAILQ_INSERT_TAIL(&mp->mnt_vnodelist, vp, v_mntvnodes);
1759 if ( (vp->v_mount != mp) || (vp == skipvp)) {
1760 continue;
1761 }
1762 vid = vp->v_id;
1763 mount_unlock(mp);
1764 vnode_lock(vp);
1765
1766 if ((vp->v_id != vid) || ((vp->v_lflag & (VL_DEAD | VL_TERMINATE)))) {
1767 vnode_unlock(vp);
1768 mount_lock(mp);
1769 continue;
1770 }
1771
1772 /*
1773 * If requested, skip over vnodes marked VSYSTEM.
1774 * Skip over all vnodes marked VNOFLUSH.
1775 */
1776 if ((flags & SKIPSYSTEM) && ((vp->v_flag & VSYSTEM) ||
1777 (vp->v_flag & VNOFLUSH))) {
1778 vnode_unlock(vp);
1779 mount_lock(mp);
1780 continue;
1781 }
1782 /*
1783 * If requested, skip over vnodes marked VSWAP.
1784 */
1785 if ((flags & SKIPSWAP) && (vp->v_flag & VSWAP)) {
1786 vnode_unlock(vp);
1787 mount_lock(mp);
1788 continue;
1789 }
1790 /*
1791 * If requested, skip over vnodes marked VROOT.
1792 */
1793 if ((flags & SKIPROOT) && (vp->v_flag & VROOT)) {
1794 vnode_unlock(vp);
1795 mount_lock(mp);
1796 continue;
1797 }
1798 /*
1799 * If WRITECLOSE is set, only flush out regular file
1800 * vnodes open for writing.
1801 */
1802 if ((flags & WRITECLOSE) &&
1803 (vp->v_writecount == 0 || vp->v_type != VREG)) {
1804 vnode_unlock(vp);
1805 mount_lock(mp);
1806 continue;
1807 }
1808 /*
1809 * If the real usecount is 0, all we need to do is clear
1810 * out the vnode data structures and we are done.
1811 */
1812 if (((vp->v_usecount == 0) ||
1813 ((vp->v_usecount - vp->v_kusecount) == 0))) {
1814 vp->v_iocount++; /* so that drain waits for other iocounts */
1815 #ifdef JOE_DEBUG
1816 record_vp(vp, 1);
1817 #endif
1818 vnode_reclaim_internal(vp, 1, 1, 0);
1819 vnode_dropiocount(vp);
1820 vnode_list_add(vp);
1821 vnode_unlock(vp);
1822
1823 reclaimed++;
1824 mount_lock(mp);
1825 continue;
1826 }
1827 /*
1828 * If FORCECLOSE is set, forcibly close the vnode.
1829 * For block or character devices, revert to an
1830 * anonymous device. For all other files, just kill them.
1831 */
1832 if (flags & FORCECLOSE) {
1833 if (vp->v_type != VBLK && vp->v_type != VCHR) {
1834 vp->v_iocount++; /* so that drain waits for other iocounts */
1835 #ifdef JOE_DEBUG
1836 record_vp(vp, 1);
1837 #endif
1838 vnode_reclaim_internal(vp, 1, 1, 0);
1839 vnode_dropiocount(vp);
1840 vnode_list_add(vp);
1841 vnode_unlock(vp);
1842 } else {
1843 vclean(vp, 0);
1844 vp->v_lflag &= ~VL_DEAD;
1845 vp->v_op = spec_vnodeop_p;
1846 vp->v_flag |= VDEVFLUSH;
1847 vnode_unlock(vp);
1848 }
1849 mount_lock(mp);
1850 continue;
1851 }
1852 #if DIAGNOSTIC
1853 if (busyprt)
1854 vprint("vflush: busy vnode", vp);
1855 #endif
1856 vnode_unlock(vp);
1857 mount_lock(mp);
1858 busy++;
1859 }
1860
1861 /* At this point the worker queue is completed */
1862 if (busy && ((flags & FORCECLOSE)==0) && reclaimed) {
1863 busy = 0;
1864 reclaimed = 0;
1865 (void)vnode_iterate_reloadq(mp);
1866 /* returned with mount lock held */
1867 goto loop;
1868 }
1869
1870 /* if new vnodes were created in between retry the reclaim */
1871 if ( vnode_iterate_reloadq(mp) != 0) {
1872 if (!(busy && ((flags & FORCECLOSE)==0)))
1873 goto loop;
1874 }
1875 vnode_iterate_clear(mp);
1876 mount_unlock(mp);
1877
1878 if (busy && ((flags & FORCECLOSE)==0))
1879 return (EBUSY);
1880 return (0);
1881 }
1882
1883 long num_recycledvnodes = 0; /* long for OSAddAtomic */
1884 /*
1885 * Disassociate the underlying file system from a vnode.
1886 * The vnode lock is held on entry.
1887 */
1888 static void
1889 vclean(vnode_t vp, int flags)
1890 {
1891 vfs_context_t ctx = vfs_context_current();
1892 int active;
1893 int need_inactive;
1894 int already_terminating;
1895 int clflags = 0;
1896
1897 #if NAMEDSTREAMS
1898 int is_namedstream;
1899 #endif
1900
1901 /*
1902 * Check to see if the vnode is in use.
1903 * If so we have to reference it before we clean it out
1904 * so that its count cannot fall to zero and generate a
1905 * race against ourselves to recycle it.
1906 */
1907 active = vp->v_usecount;
1908
1909 /*
1910 * just in case we missed sending a needed
1911 * VNOP_INACTIVE, we'll do it now
1912 */
1913 need_inactive = (vp->v_lflag & VL_NEEDINACTIVE);
1914
1915 vp->v_lflag &= ~VL_NEEDINACTIVE;
1916
1917 /*
1918 * Prevent the vnode from being recycled or
1919 * brought into use while we clean it out.
1920 */
1921 already_terminating = (vp->v_lflag & VL_TERMINATE);
1922
1923 vp->v_lflag |= VL_TERMINATE;
1924
1925 /*
1926 * remove the vnode from any mount list
1927 * it might be on...
1928 */
1929 insmntque(vp, (struct mount *)0);
1930
1931 #if NAMEDSTREAMS
1932 is_namedstream = vnode_isnamedstream(vp);
1933 #endif
1934
1935 vnode_unlock(vp);
1936
1937 OSAddAtomic(1, &num_recycledvnodes);
1938 /*
1939 * purge from the name cache as early as possible...
1940 */
1941 cache_purge(vp);
1942
1943 if (flags & DOCLOSE)
1944 clflags |= IO_NDELAY;
1945 if (flags & REVOKEALL)
1946 clflags |= IO_REVOKE;
1947
1948 if (active && (flags & DOCLOSE))
1949 VNOP_CLOSE(vp, clflags, ctx);
1950
1951 /*
1952 * Clean out any buffers associated with the vnode.
1953 */
1954 if (flags & DOCLOSE) {
1955 #if NFSCLIENT
1956 if (vp->v_tag == VT_NFS)
1957 nfs_vinvalbuf(vp, V_SAVE, ctx, 0);
1958 else
1959 #endif
1960 {
1961 VNOP_FSYNC(vp, MNT_WAIT, ctx);
1962 buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0);
1963 }
1964 if (UBCINFOEXISTS(vp))
1965 /*
1966 * Clean the pages in VM.
1967 */
1968 (void)ubc_sync_range(vp, (off_t)0, ubc_getsize(vp), UBC_PUSHALL);
1969 }
1970 if (active || need_inactive)
1971 VNOP_INACTIVE(vp, ctx);
1972
1973 #if NAMEDSTREAMS
1974 /* Delete the shadow stream file before we reclaim its vnode */
1975 if ((is_namedstream != 0) &&
1976 (vp->v_parent != NULLVP) &&
1977 (vnode_isshadow(vp))) {
1978 vnode_relenamedstream(vp->v_parent, vp, ctx);
1979 }
1980 #endif
1981
1982 /*
1983 * Destroy ubc named reference
1984 * cluster_release is done on this path
1985 * along with dropping the reference on the ucred
1986 */
1987 ubc_destroy_named(vp);
1988
1989 /*
1990 * Reclaim the vnode.
1991 */
1992 if (VNOP_RECLAIM(vp, ctx))
1993 panic("vclean: cannot reclaim");
1994
1995 // make sure the name & parent ptrs get cleaned out!
1996 vnode_update_identity(vp, NULLVP, NULL, 0, 0, VNODE_UPDATE_PARENT | VNODE_UPDATE_NAME);
1997
1998 vnode_lock(vp);
1999
2000 vp->v_mount = dead_mountp;
2001 vp->v_op = dead_vnodeop_p;
2002 vp->v_tag = VT_NON;
2003 vp->v_data = NULL;
2004
2005 vp->v_lflag |= VL_DEAD;
2006
2007 if (already_terminating == 0) {
2008 vp->v_lflag &= ~VL_TERMINATE;
2009 /*
2010 * Done with purge, notify sleepers of the grim news.
2011 */
2012 if (vp->v_lflag & VL_TERMWANT) {
2013 vp->v_lflag &= ~VL_TERMWANT;
2014 wakeup(&vp->v_lflag);
2015 }
2016 }
2017 }
2018
2019 /*
2020 * Eliminate all activity associated with the requested vnode
2021 * and with all vnodes aliased to the requested vnode.
2022 */
2023 int
2024 #if DIAGNOSTIC
2025 vn_revoke(vnode_t vp, int flags, __unused vfs_context_t a_context)
2026 #else
2027 vn_revoke(vnode_t vp, __unused int flags, __unused vfs_context_t a_context)
2028 #endif
2029 {
2030 struct vnode *vq;
2031 int vid;
2032
2033 #if DIAGNOSTIC
2034 if ((flags & REVOKEALL) == 0)
2035 panic("vnop_revoke");
2036 #endif
2037
2038 if (vp->v_flag & VALIASED) {
2039 /*
2040 * If a vgone (or vclean) is already in progress,
2041 * wait until it is done and return.
2042 */
2043 vnode_lock(vp);
2044 if (vp->v_lflag & VL_TERMINATE) {
2045 vnode_unlock(vp);
2046 return(ENOENT);
2047 }
2048 vnode_unlock(vp);
2049 /*
2050 * Ensure that vp will not be vgone'd while we
2051 * are eliminating its aliases.
2052 */
2053 SPECHASH_LOCK();
2054 while (vp->v_flag & VALIASED) {
2055 for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
2056 if (vq->v_rdev != vp->v_rdev ||
2057 vq->v_type != vp->v_type || vp == vq)
2058 continue;
2059 vid = vq->v_id;
2060 SPECHASH_UNLOCK();
2061 if (vnode_getwithvid(vq,vid)){
2062 SPECHASH_LOCK();
2063 break;
2064 }
2065 vnode_reclaim_internal(vq, 0, 1, 0);
2066 vnode_put(vq);
2067 SPECHASH_LOCK();
2068 break;
2069 }
2070 }
2071 SPECHASH_UNLOCK();
2072 }
2073 vnode_reclaim_internal(vp, 0, 0, REVOKEALL);
2074
2075 return (0);
2076 }
2077
2078 /*
2079 * Recycle an unused vnode, reclaiming it immediately; if the vnode
2080 * is still in use, mark it for termination when its counts drop.
2081 */
2082 int
2083 vnode_recycle(struct vnode *vp)
2084 {
2085 vnode_lock(vp);
2086
2087 if (vp->v_iocount || vp->v_usecount) {
2088 vp->v_lflag |= VL_MARKTERM;
2089 vnode_unlock(vp);
2090 return(0);
2091 }
2092 vnode_reclaim_internal(vp, 1, 0, 0);
2093
2094 vnode_unlock(vp);
2095
2096 return (1);
2097 }
2098
2099 static int
2100 vnode_reload(vnode_t vp)
2101 {
2102 vnode_lock_spin(vp);
2103
2104 if ((vp->v_iocount > 1) || vp->v_usecount) {
2105 vnode_unlock(vp);
2106 return(0);
2107 }
2108 if (vp->v_iocount <= 0)
2109 panic("vnode_reload with no iocount %d", vp->v_iocount);
2110
2111 /* mark for release when iocount is dropped */
2112 vp->v_lflag |= VL_MARKTERM;
2113 vnode_unlock(vp);
2114
2115 return (1);
2116 }
2117
2118
2119 static void
2120 vgone(vnode_t vp, int flags)
2121 {
2122 struct vnode *vq;
2123 struct vnode *vx;
2124
2125 /*
2126 * Clean out the filesystem specific data.
2127 * vclean also takes care of removing the
2128 * vnode from any mount list it might be on
2129 */
2130 vclean(vp, flags | DOCLOSE);
2131
2132 /*
2133 * If special device, remove it from special device alias list
2134 * if it is on one.
2135 */
2136 if ((vp->v_type == VBLK || vp->v_type == VCHR) && vp->v_specinfo != 0) {
2137 SPECHASH_LOCK();
2138 if (*vp->v_hashchain == vp) {
2139 *vp->v_hashchain = vp->v_specnext;
2140 } else {
2141 for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
2142 if (vq->v_specnext != vp)
2143 continue;
2144 vq->v_specnext = vp->v_specnext;
2145 break;
2146 }
2147 if (vq == NULL)
2148 panic("missing bdev");
2149 }
2150 if (vp->v_flag & VALIASED) {
2151 vx = NULL;
2152 for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
2153 if (vq->v_rdev != vp->v_rdev ||
2154 vq->v_type != vp->v_type)
2155 continue;
2156 if (vx)
2157 break;
2158 vx = vq;
2159 }
2160 if (vx == NULL)
2161 panic("missing alias");
2162 if (vq == NULL)
2163 vx->v_flag &= ~VALIASED;
2164 vp->v_flag &= ~VALIASED;
2165 }
2166 SPECHASH_UNLOCK();
2167 {
2168 struct specinfo *tmp = vp->v_specinfo;
2169 vp->v_specinfo = NULL;
2170 FREE_ZONE((void *)tmp, sizeof(struct specinfo), M_SPECINFO);
2171 }
2172 }
2173 }
2174
2175 /*
2176 * Check whether a device (given dev_t and type) has a filesystem mounted on it.
2177 */
2178 int
2179 check_mountedon(dev_t dev, enum vtype type, int *errorp)
2180 {
2181 vnode_t vp;
2182 int rc = 0;
2183 int vid;
2184
2185 loop:
2186 SPECHASH_LOCK();
2187 for (vp = speclisth[SPECHASH(dev)]; vp; vp = vp->v_specnext) {
2188 if (dev != vp->v_rdev || type != vp->v_type)
2189 continue;
2190 vid = vp->v_id;
2191 SPECHASH_UNLOCK();
2192 if (vnode_getwithvid(vp,vid))
2193 goto loop;
2194 vnode_lock_spin(vp);
2195 if ((vp->v_usecount > 0) || (vp->v_iocount > 1)) {
2196 vnode_unlock(vp);
2197 if ((*errorp = vfs_mountedon(vp)) != 0)
2198 rc = 1;
2199 } else
2200 vnode_unlock(vp);
2201 vnode_put(vp);
2202 return(rc);
2203 }
2204 SPECHASH_UNLOCK();
2205 return (0);
2206 }
2207
2208 /*
2209 * Calculate the total number of references to a special device.
2210 */
2211 int
2212 vcount(vnode_t vp)
2213 {
2214 vnode_t vq, vnext;
2215 int count;
2216 int vid;
2217
2218 loop:
2219 if ((vp->v_flag & VALIASED) == 0)
2220 return (vp->v_usecount - vp->v_kusecount);
2221 count = 0;
2222
2223 SPECHASH_LOCK();
2224 /*
2225 * Grab first vnode and its vid.
2226 */
2227 vq = *vp->v_hashchain;
2228 vid = vq ? vq->v_id : 0;
2229
2230 SPECHASH_UNLOCK();
2231
2232 while (vq) {
2233 /*
2234 * Attempt to get the vnode outside the SPECHASH lock.
2235 */
2236 if (vnode_getwithvid(vq, vid)) {
2237 goto loop;
2238 }
2239 vnode_lock(vq);
2240
2241 if (vq->v_rdev == vp->v_rdev && vq->v_type == vp->v_type) {
2242 if ((vq->v_usecount == 0) && (vq->v_iocount == 1) && vq != vp) {
2243 /*
2244 * Alias, but not in use, so flush it out.
2245 */
2246 vnode_reclaim_internal(vq, 1, 1, 0);
2247 vnode_put_locked(vq);
2248 vnode_unlock(vq);
2249 goto loop;
2250 }
2251 count += (vq->v_usecount - vq->v_kusecount);
2252 }
2253 vnode_unlock(vq);
2254
2255 SPECHASH_LOCK();
2256 /*
2257 * must do this with the reference still held on 'vq'
2258 * so that it can't be destroyed while we're poking
2259 * through v_specnext
2260 */
2261 vnext = vq->v_specnext;
2262 vid = vnext ? vnext->v_id : 0;
2263
2264 SPECHASH_UNLOCK();
2265
2266 vnode_put(vq);
2267
2268 vq = vnext;
2269 }
2270
2271 return (count);
2272 }
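/*
 * Example (hypothetical counts): a block device aliased across three
 * vnodes with usecounts 2, 1 and 0 and no kernel usecounts reports
 * vcount() == 3; the unreferenced alias is reclaimed during the walk
 * above and the scan restarts from the head of the hash chain.
 */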
2273
2274 int prtactive = 0; /* 1 => print out reclaim of active vnodes */
2275
2276 /*
2277 * Print out a description of a vnode.
2278 */
2279 #if !CONFIG_NO_PRINTF_STRINGS
2280 static const char *typename[] =
2281 { "VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD" };
2282 #endif
2283
2284 void
2285 vprint(const char *label, struct vnode *vp)
2286 {
2287 char sbuf[64];
2288
2289 if (label != NULL)
2290 printf("%s: ", label);
2291 printf("type %s, usecount %d, writecount %ld",
2292 typename[vp->v_type], vp->v_usecount, vp->v_writecount);
2293 sbuf[0] = '\0';
2294 if (vp->v_flag & VROOT)
2295 strlcat(sbuf, "|VROOT", sizeof(sbuf));
2296 if (vp->v_flag & VTEXT)
2297 strlcat(sbuf, "|VTEXT", sizeof(sbuf));
2298 if (vp->v_flag & VSYSTEM)
2299 strlcat(sbuf, "|VSYSTEM", sizeof(sbuf));
2300 if (vp->v_flag & VNOFLUSH)
2301 strlcat(sbuf, "|VNOFLUSH", sizeof(sbuf));
2302 if (vp->v_flag & VBWAIT)
2303 strlcat(sbuf, "|VBWAIT", sizeof(sbuf));
2304 if (vp->v_flag & VALIASED)
2305 strlcat(sbuf, "|VALIASED", sizeof(sbuf));
2306 if (sbuf[0] != '\0')
2307 printf(" flags (%s)", &sbuf[1]);
2308 }
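/*
 * Example output (hypothetical vnode): for the root directory of a
 * volume, vprint("vflush", vp) would print something like
 * "vflush: type VDIR, usecount 1, writecount 0 flags (VROOT)".
 */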
2309
2310
2311 int
2312 vn_getpath(struct vnode *vp, char *pathbuf, int *len)
2313 {
2314 return build_path(vp, pathbuf, *len, len, BUILDPATH_NO_FS_ENTER, vfs_context_current());
2315 }
2316
2317
2318 int
2319 vn_getcdhash(struct vnode *vp, off_t offset, unsigned char *cdhash)
2320 {
2321 return ubc_cs_getcdhash(vp, offset, cdhash);
2322 }
2323
2324
2325 static char *extension_table=NULL;
2326 static int nexts;
2327 static int max_ext_width;
2328
2329 static int
2330 extension_cmp(const void *a, const void *b)
2331 {
2332 return (strlen((const char *)a) - strlen((const char *)b));
2333 }
2334
2335
2336 //
2337 // This is the api LaunchServices uses to inform the kernel
2338 // the list of package extensions to ignore.
2339 //
2340 // Internally we keep the list sorted by the length of
2341 // the extension (shortest to longest, per extension_cmp). We sort the
2342 // list of extensions so that we can speed up our searches
2343 // when comparing file names -- we only compare extensions
2344 // that could possibly fit into the file name, not all of
2345 // them (i.e. a short 8 character name can't have an 8
2346 // character extension).
2347 //
2348 __private_extern__ int
2349 set_package_extensions_table(void *data, int nentries, int maxwidth)
2350 {
2351 char *new_exts;
2352 int error;
2353
2354 if (nentries <= 0 || nentries > 1024 || maxwidth <= 0 || maxwidth > 255) {
2355 return EINVAL;
2356 }
2357
2358 MALLOC(new_exts, char *, nentries * maxwidth, M_TEMP, M_WAITOK);
2359
2360 error = copyin(CAST_USER_ADDR_T(data), new_exts, nentries * maxwidth);
2361 if (error) {
2362 FREE(new_exts, M_TEMP);
2363 return error;
2364 }
2365
2366 if (extension_table) {
2367 FREE(extension_table, M_TEMP);
2368 }
2369 extension_table = new_exts;
2370 nexts = nentries;
2371 max_ext_width = maxwidth;
2372
2373 qsort(extension_table, nexts, maxwidth, extension_cmp);
2374
2375 return 0;
2376 }
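/*
 * Sketch of the resulting table layout (hypothetical entries): with
 * nentries == 3 and maxwidth == 16, the table is a flat
 * nentries * maxwidth character array holding one NUL-terminated
 * extension per fixed-width slot, ordered by extension_cmp:
 *
 *	offset  0: "app"
 *	offset 16: "pages"
 *	offset 32: "download"
 */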
2377
2378
2379 __private_extern__ int
2380 is_package_name(const char *name, int len)
2381 {
2382 int i, extlen;
2383 const char *ptr, *name_ext;
2384
2385 if (len <= 3) {
2386 return 0;
2387 }
2388
2389 name_ext = NULL;
2390 for(ptr=name; *ptr != '\0'; ptr++) {
2391 if (*ptr == '.') {
2392 name_ext = ptr;
2393 }
2394 }
2395
2396 // if there is no "." extension, it can't match
2397 if (name_ext == NULL) {
2398 return 0;
2399 }
2400
2401 // advance over the "."
2402 name_ext++;
2403
2404 // now iterate over all the extensions to see if any match
2405 ptr = &extension_table[0];
2406 for(i=0; i < nexts; i++, ptr+=max_ext_width) {
2407 extlen = strlen(ptr);
2408 if (strncasecmp(name_ext, ptr, extlen) == 0 && name_ext[extlen] == '\0') {
2409 // aha, a match!
2410 return 1;
2411 }
2412 }
2413
2414 // if we get here, no extension matched
2415 return 0;
2416 }
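/*
 * Example (hypothetical table entry): with "pages" in the table,
 * is_package_name("Report.pages", 12) finds the final '.', advances
 * past it, and strncasecmp() matches "pages" against the entry; the
 * name_ext[extlen] == '\0' check rejects mere prefixes such as
 * "Report.pagestmp", so only the exact extension returns 1.
 */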
2417
2418 int
2419 vn_path_package_check(__unused vnode_t vp, char *path, int pathlen, int *component)
2420 {
2421 char *ptr, *end;
2422 int comp=0;
2423
2424 *component = -1;
2425 if (*path != '/') {
2426 return EINVAL;
2427 }
2428
2429 end = path + 1;
2430 while(end < path + pathlen && *end != '\0') {
2431 while(end < path + pathlen && *end == '/' && *end != '\0') {
2432 end++;
2433 }
2434
2435 ptr = end;
2436
2437 while(end < path + pathlen && *end != '/' && *end != '\0') {
2438 end++;
2439 }
2440
2441 if (end > path + pathlen) {
2442 // hmm, string wasn't null terminated
2443 return EINVAL;
2444 }
2445
2446 *end = '\0';
2447 if (is_package_name(ptr, end - ptr)) {
2448 *component = comp;
2449 break;
2450 }
2451
2452 end++;
2453 comp++;
2454 }
2455
2456 return 0;
2457 }
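/*
 * Example (hypothetical path): with "app" in the extension table,
 * "/Users/me/Widget.app/Contents" is walked as "Users" (comp 0),
 * "me" (comp 1) and "Widget.app" (comp 2); the third component
 * matches, so *component is set to 2. Note that the path buffer is
 * modified in place: each component is NUL-terminated as it is
 * examined.
 */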
2458
2459
2460 /*
2461 * Top level filesystem related information gathering.
2462 */
2463 extern unsigned int vfs_nummntops;
2464
2465 int
2466 vfs_sysctl(int *name, u_int namelen, user_addr_t oldp, size_t *oldlenp,
2467 user_addr_t newp, size_t newlen, proc_t p)
2468 {
2469 struct vfstable *vfsp;
2470 int *username;
2471 u_int usernamelen;
2472 int error;
2473 struct vfsconf *vfsc;
2474
2475 /* All non-VFS_GENERIC names, and within VFS_GENERIC the
2476 * VFS_MAXTYPENUM, VFS_CONF and VFS_SET_PACKAGE_EXTS selectors,
2477 * require root privilege in order to apply modifications.
2478 * The rest are covered by userland_sysctl (CTLFLAG_ANYBODY).
2479 */
2480 if ((newp != USER_ADDR_NULL) && ((name[0] != VFS_GENERIC) ||
2481 ((name[1] == VFS_MAXTYPENUM) ||
2482 (name[1] == VFS_CONF) ||
2483 (name[1] == VFS_SET_PACKAGE_EXTS)))
2484 && (error = suser(kauth_cred_get(), &p->p_acflag))) {
2485 return(error);
2486 }
2487 /*
2488 * The VFS_NUMMNTOPS shouldn't be at name[0] since
2489 * it is a VFS generic variable. So now we must check
2490 * namelen so we don't end up covering any UFS
2491 * variables (since UFS vfc_typenum is 1).
2492 *
2493 * It should have been:
2494 * name[0]: VFS_GENERIC
2495 * name[1]: VFS_NUMMNTOPS
2496 */
2497 if (namelen == 1 && name[0] == VFS_NUMMNTOPS) {
2498 return (sysctl_rdint(oldp, oldlenp, newp, vfs_nummntops));
2499 }
2500
2501 /* all sysctl names at this level are at least name and field */
2502 if (namelen < 2)
2503 return (EISDIR); /* overloaded */
2504 if (name[0] != VFS_GENERIC) {
2505 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
2506 if (vfsp->vfc_typenum == name[0])
2507 break;
2508 if (vfsp == NULL)
2509 return (ENOTSUP);
2510
2511 /* XXX current context proxy for proc p? */
2512 return ((*vfsp->vfc_vfsops->vfs_sysctl)(&name[1], namelen - 1,
2513 oldp, oldlenp, newp, newlen,
2514 vfs_context_current()));
2515 }
2516 switch (name[1]) {
2517 case VFS_MAXTYPENUM:
2518 return (sysctl_rdint(oldp, oldlenp, newp, maxvfsconf));
2519 case VFS_CONF:
2520 if (namelen < 3)
2521 return (ENOTDIR); /* overloaded */
2522 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
2523 if (vfsp->vfc_typenum == name[2])
2524 break;
2525 if (vfsp == NULL)
2526 return (ENOTSUP);
2527 vfsc = (struct vfsconf *)vfsp;
2528 if (proc_is64bit(p)) {
2529 struct user_vfsconf usr_vfsc;
2530 usr_vfsc.vfc_vfsops = CAST_USER_ADDR_T(vfsc->vfc_vfsops);
2531 bcopy(vfsc->vfc_name, usr_vfsc.vfc_name, sizeof(usr_vfsc.vfc_name));
2532 usr_vfsc.vfc_typenum = vfsc->vfc_typenum;
2533 usr_vfsc.vfc_refcount = vfsc->vfc_refcount;
2534 usr_vfsc.vfc_flags = vfsc->vfc_flags;
2535 usr_vfsc.vfc_mountroot = CAST_USER_ADDR_T(vfsc->vfc_mountroot);
2536 usr_vfsc.vfc_next = CAST_USER_ADDR_T(vfsc->vfc_next);
2537 return (sysctl_rdstruct(oldp, oldlenp, newp, &usr_vfsc,
2538 sizeof(usr_vfsc)));
2539 }
2540 else {
2541 return (sysctl_rdstruct(oldp, oldlenp, newp, vfsc,
2542 sizeof(struct vfsconf)));
2543 }
2544
2545 case VFS_SET_PACKAGE_EXTS:
2546 return set_package_extensions_table((void *)name[2], name[3], name[4]);
2547 }
2548 /*
2549 * We need to get back into the general MIB, so we need to re-prepend
2550 * CTL_VFS to our name and try userland_sysctl().
2551 */
2552 usernamelen = namelen + 1;
2553 MALLOC(username, int *, usernamelen * sizeof(*username),
2554 M_TEMP, M_WAITOK);
2555 bcopy(name, username + 1, namelen * sizeof(*name));
2556 username[0] = CTL_VFS;
2557 error = userland_sysctl(p, username, usernamelen, oldp,
2558 oldlenp, 1, newp, newlen, oldlenp);
2559 FREE(username, M_TEMP);
2560 return (error);
2561 }
2562
2563 /*
2564 * Dump vnode list (via sysctl) - defunct
2565 * use "pstat" instead
2566 */
2567 /* ARGSUSED */
2568 int
2569 sysctl_vnode
2570 (__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, __unused struct sysctl_req *req)
2571 {
2572 return(EINVAL);
2573 }
2574
2575 SYSCTL_PROC(_kern, KERN_VNODE, vnode,
2576 CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_MASKED,
2577 0, 0, sysctl_vnode, "S,", "");
2578
2579
2580 /*
2581 * Check to see if a filesystem is mounted on a block device.
2582 */
2583 int
2584 vfs_mountedon(struct vnode *vp)
2585 {
2586 struct vnode *vq;
2587 int error = 0;
2588
2589 SPECHASH_LOCK();
2590 if (vp->v_specflags & SI_MOUNTEDON) {
2591 error = EBUSY;
2592 goto out;
2593 }
2594 if (vp->v_flag & VALIASED) {
2595 for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
2596 if (vq->v_rdev != vp->v_rdev ||
2597 vq->v_type != vp->v_type)
2598 continue;
2599 if (vq->v_specflags & SI_MOUNTEDON) {
2600 error = EBUSY;
2601 break;
2602 }
2603 }
2604 }
2605 out:
2606 SPECHASH_UNLOCK();
2607 return (error);
2608 }
2609
2610 /*
2611 * Unmount all filesystems. The list is traversed in reverse order
2612 * of mounting to avoid dependencies.
2613 */
2614 __private_extern__ void
2615 vfs_unmountall(void)
2616 {
2617 struct mount *mp;
2618 int error;
2619
2620 /*
2621 * Since this only runs when rebooting, it is not interlocked.
2622 */
2623 mount_list_lock();
2624 while(!TAILQ_EMPTY(&mountlist)) {
2625 mp = TAILQ_LAST(&mountlist, mntlist);
2626 mount_list_unlock();
2627 error = dounmount(mp, MNT_FORCE, 0, vfs_context_current());
2628 if ((error != 0) && (error != EBUSY)) {
2629 printf("unmount of %s failed (", mp->mnt_vfsstat.f_mntonname);
2630 printf("%d)\n", error);
2631 mount_list_lock();
2632 TAILQ_REMOVE(&mountlist, mp, mnt_list);
2633 continue;
2634 } else if (error == EBUSY) {
2635 /* If EBUSY is returned, the unmount was already in progress */
2636 printf("unmount of %x failed (", (unsigned int)mp);
2637 printf("BUSY)\n");
2638 }
2639 mount_list_lock();
2640 }
2641 mount_list_unlock();
2642 }
2643
2644
2645 /*
2646 * This routine is called from vnode_pager_deallocate out of the VM
2647 * The path to vnode_pager_deallocate can only be initiated by ubc_destroy_named
2648 * on a vnode that has a UBCINFO
2649 */
2650 __private_extern__ void
2651 vnode_pager_vrele(vnode_t vp)
2652 {
2653 struct ubc_info *uip;
2654
2655 vnode_lock(vp);
2656
2657 vp->v_lflag &= ~VNAMED_UBC;
2658
2659 uip = vp->v_ubcinfo;
2660 vp->v_ubcinfo = UBC_INFO_NULL;
2661
2662 ubc_info_deallocate(uip);
2663
2664 vnode_unlock(vp);
2665 }
2666
2667
2668 #include <sys/disk.h>
2669
2670 errno_t
2671 vfs_init_io_attributes(vnode_t devvp, mount_t mp)
2672 {
2673 int error;
2674 off_t readblockcnt;
2675 off_t writeblockcnt;
2676 off_t readmaxcnt;
2677 off_t writemaxcnt;
2678 off_t readsegcnt;
2679 off_t writesegcnt;
2680 off_t readsegsize;
2681 off_t writesegsize;
2682 off_t alignment;
2683 u_long blksize;
2684 u_int64_t temp;
2685 u_int32_t features;
2686 vfs_context_t ctx = vfs_context_current();
2687
2688 int isvirtual = 0;
2689 /*
2690 * determine if this mount point exists on the same device as the root
2691 * partition... if so, then it comes under the hard throttle control
2692 */
2693 int thisunit = -1;
2694 static int rootunit = -1;
2695
2696 if (rootunit == -1) {
2697 if (VNOP_IOCTL(rootvp, DKIOCGETBSDUNIT, (caddr_t)&rootunit, 0, ctx))
2698 rootunit = -1;
2699 else if (rootvp == devvp)
2700 mp->mnt_kern_flag |= MNTK_ROOTDEV;
2701 }
2702 if (devvp != rootvp && rootunit != -1) {
2703 if (VNOP_IOCTL(devvp, DKIOCGETBSDUNIT, (caddr_t)&thisunit, 0, ctx) == 0) {
2704 if (thisunit == rootunit)
2705 mp->mnt_kern_flag |= MNTK_ROOTDEV;
2706 }
2707 }
2708 /*
2709 * force the spec device to re-cache
2710 * the underlying block size in case
2711 * the filesystem overrode the initial value
2712 */
2713 set_fsblocksize(devvp);
2714
2715
2716 if ((error = VNOP_IOCTL(devvp, DKIOCGETBLOCKSIZE,
2717 (caddr_t)&blksize, 0, ctx)))
2718 return (error);
2719
2720 mp->mnt_devblocksize = blksize;
2721
2722 if (VNOP_IOCTL(devvp, DKIOCISVIRTUAL, (caddr_t)&isvirtual, 0, ctx) == 0) {
2723 if (isvirtual)
2724 mp->mnt_kern_flag |= MNTK_VIRTUALDEV;
2725 }
2726
2727 if ((error = VNOP_IOCTL(devvp, DKIOCGETFEATURES,
2728 (caddr_t)&features, 0, ctx)))
2729 return (error);
2730
2731 if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXBLOCKCOUNTREAD,
2732 (caddr_t)&readblockcnt, 0, ctx)))
2733 return (error);
2734
2735 if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXBLOCKCOUNTWRITE,
2736 (caddr_t)&writeblockcnt, 0, ctx)))
2737 return (error);
2738
2739 if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXBYTECOUNTREAD,
2740 (caddr_t)&readmaxcnt, 0, ctx)))
2741 return (error);
2742
2743 if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXBYTECOUNTWRITE,
2744 (caddr_t)&writemaxcnt, 0, ctx)))
2745 return (error);
2746
2747 if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXSEGMENTCOUNTREAD,
2748 (caddr_t)&readsegcnt, 0, ctx)))
2749 return (error);
2750
2751 if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXSEGMENTCOUNTWRITE,
2752 (caddr_t)&writesegcnt, 0, ctx)))
2753 return (error);
2754
2755 if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXSEGMENTBYTECOUNTREAD,
2756 (caddr_t)&readsegsize, 0, ctx)))
2757 return (error);
2758
2759 if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXSEGMENTBYTECOUNTWRITE,
2760 (caddr_t)&writesegsize, 0, ctx)))
2761 return (error);
2762
2763 if ((error = VNOP_IOCTL(devvp, DKIOCGETMINSEGMENTALIGNMENTBYTECOUNT,
2764 (caddr_t)&alignment, 0, ctx)))
2765 return (error);
2766
2767 if (readmaxcnt)
2768 temp = (readmaxcnt > UINT32_MAX) ? UINT32_MAX : readmaxcnt;
2769 else {
2770 if (readblockcnt) {
2771 temp = readblockcnt * blksize;
2772 temp = (temp > UINT32_MAX) ? UINT32_MAX : temp;
2773 } else
2774 temp = MAXPHYS;
2775 }
2776 mp->mnt_maxreadcnt = (u_int32_t)temp;
2777
2778 if (writemaxcnt)
2779 temp = (writemaxcnt > UINT32_MAX) ? UINT32_MAX : writemaxcnt;
2780 else {
2781 if (writeblockcnt) {
2782 temp = writeblockcnt * blksize;
2783 temp = (temp > UINT32_MAX) ? UINT32_MAX : temp;
2784 } else
2785 temp = MAXPHYS;
2786 }
2787 mp->mnt_maxwritecnt = (u_int32_t)temp;
2788
2789 if (readsegcnt) {
2790 temp = (readsegcnt > UINT16_MAX) ? UINT16_MAX : readsegcnt;
2791 mp->mnt_segreadcnt = (u_int16_t)temp;
2792 }
2793 if (writesegcnt) {
2794 temp = (writesegcnt > UINT16_MAX) ? UINT16_MAX : writesegcnt;
2795 mp->mnt_segwritecnt = (u_int16_t)temp;
2796 }
2797 if (readsegsize)
2798 temp = (readsegsize > UINT32_MAX) ? UINT32_MAX : readsegsize;
2799 else
2800 temp = mp->mnt_maxreadcnt;
2801 mp->mnt_maxsegreadsize = (u_int32_t)temp;
2802
2803 if (writesegsize)
2804 temp = (writesegsize > UINT32_MAX) ? UINT32_MAX : writesegsize;
2805 else
2806 temp = mp->mnt_maxwritecnt;
2807 mp->mnt_maxsegwritesize = (u_int32_t)temp;
2808
2809 if (alignment)
2810 temp = (alignment > PAGE_SIZE) ? PAGE_MASK : alignment - 1;
2811 else
2812 temp = 0;
2813 mp->mnt_alignmentmask = temp;
2814
2815 if (features & DK_FEATURE_FORCE_UNIT_ACCESS)
2816 mp->mnt_ioflags |= MNT_IOFLAGS_FUA_SUPPORTED;
2817
2818 return (error);
2819 }
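/*
 * Example of the clamping above (hypothetical device): a disk that
 * reports no DKIOCGETMAXBYTECOUNTREAD limit but a
 * DKIOCGETMAXBLOCKCOUNTREAD of 2048 with a 512-byte block size gets
 * mnt_maxreadcnt = 2048 * 512 = 1MB; a device reporting neither
 * limit falls back to MAXPHYS.
 */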
2820
2821 static struct klist fs_klist;
2822 lck_grp_t *fs_klist_lck_grp;
2823 lck_mtx_t *fs_klist_lock;
2824
2825 void
2826 vfs_event_init(void)
2827 {
2828 klist_init(&fs_klist);
2829 fs_klist_lck_grp = lck_grp_alloc_init("fs_klist", NULL);
2830 fs_klist_lock = lck_mtx_alloc_init(fs_klist_lck_grp, NULL);
2831 }
2832
2833 void
2834 vfs_event_signal(__unused fsid_t *fsid, u_int32_t event, __unused intptr_t data)
2835 {
2836 lck_mtx_lock(fs_klist_lock);
2837 KNOTE(&fs_klist, event);
2838 lck_mtx_unlock(fs_klist_lock);
2839 }
2840
2841 /*
2842 * return the number of mounted filesystems.
2843 */
2844 static int
2845 sysctl_vfs_getvfscnt(void)
2846 {
2847 return(mount_getvfscnt());
2848 }
2849
2850
2851 static int
2852 mount_getvfscnt(void)
2853 {
2854 int ret;
2855
2856 mount_list_lock();
2857 ret = nummounts;
2858 mount_list_unlock();
2859 return (ret);
2860
2861 }
2862
2863
2864
2865 static int
2866 mount_fillfsids(fsid_t *fsidlst, int count)
2867 {
2868 struct mount *mp;
2869 int actual = 0;
2870
2872 mount_list_lock();
2873 TAILQ_FOREACH(mp, &mountlist, mnt_list) {
2874 if (actual < count) {
2875 fsidlst[actual] = mp->mnt_vfsstat.f_fsid;
2876 actual++;
2877 }
2878 }
2879 mount_list_unlock();
2880 return (actual);
2881
2882 }
2883
2884 /*
2885 * Fill in the array of fsid_t's up to a max of 'count'; the number
2886 * actually filled in is returned in '*actual'. If there are more
2887 * fsid_t's than fit in fsidlst, ENOMEM is returned and '*actual'
2888 * holds the full count.
2889 * Callers depend on *actual being filled out even in the error case.
2890 */
2891 static int
2892 sysctl_vfs_getvfslist(fsid_t *fsidlst, int count, int *actual)
2893 {
2894 struct mount *mp;
2895
2896 *actual = 0;
2897 mount_list_lock();
2898 TAILQ_FOREACH(mp, &mountlist, mnt_list) {
2899 (*actual)++;
2900 if (*actual <= count)
2901 fsidlst[(*actual) - 1] = mp->mnt_vfsstat.f_fsid;
2902 }
2903 mount_list_unlock();
2904 return (*actual <= count ? 0 : ENOMEM);
2905 }
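/*
 * Example (hypothetical counts): with 12 filesystems mounted and a
 * buffer with room for only 8 fsid_t's, *actual is driven to 12 and
 * ENOMEM is returned; sysctl_vfs_vfslist() below responds by freeing
 * its buffer and retrying with a recomputed size.
 */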
2906
2907 static int
2908 sysctl_vfs_vfslist(__unused struct sysctl_oid *oidp, __unused void *arg1,
2909 __unused int arg2, struct sysctl_req *req)
2910 {
2911 int actual, error;
2912 size_t space;
2913 fsid_t *fsidlst;
2914
2915 /* This is a readonly node. */
2916 if (req->newptr != USER_ADDR_NULL)
2917 return (EPERM);
2918
2919 /* they are querying us so just return the space required. */
2920 if (req->oldptr == USER_ADDR_NULL) {
2921 req->oldidx = sysctl_vfs_getvfscnt() * sizeof(fsid_t);
2922 return 0;
2923 }
2924 again:
2925 /*
2926 * Retrieve an accurate count of the amount of space required to copy
2927 * out all the fsids in the system.
2928 */
2929 space = req->oldlen;
2930 req->oldlen = sysctl_vfs_getvfscnt() * sizeof(fsid_t);
2931
2932 /* they didn't give us enough space. */
2933 if (space < req->oldlen)
2934 return (ENOMEM);
2935
2936 MALLOC(fsidlst, fsid_t *, req->oldlen, M_TEMP, M_WAITOK);
2937 error = sysctl_vfs_getvfslist(fsidlst, req->oldlen / sizeof(fsid_t),
2938 &actual);
2939 /*
2940 * If we get back ENOMEM, then another mount has been added while we
2941 * slept in malloc above. If this is the case then try again.
2942 */
2943 if (error == ENOMEM) {
2944 FREE(fsidlst, M_TEMP);
2945 req->oldlen = space;
2946 goto again;
2947 }
2948 if (error == 0) {
2949 error = SYSCTL_OUT(req, fsidlst, actual * sizeof(fsid_t));
2950 }
2951 FREE(fsidlst, M_TEMP);
2952 return (error);
2953 }
2954
2955 /*
2956 * Do a sysctl by fsid.
2957 */
2958 static int
2959 sysctl_vfs_ctlbyfsid(__unused struct sysctl_oid *oidp, void *arg1, int arg2,
2960 struct sysctl_req *req)
2961 {
2962 struct vfsidctl vc;
2963 struct user_vfsidctl user_vc;
2964 struct mount *mp;
2965 struct vfsstatfs *sp;
2966 int *name, flags, namelen;
2967 int error=0, gotref=0;
2968 vfs_context_t ctx = vfs_context_current();
2969 proc_t p = req->p; /* XXX req->p != current_proc()? */
2970 boolean_t is_64_bit;
2971
2972 name = arg1;
2973 namelen = arg2;
2974 is_64_bit = proc_is64bit(p);
2975
2976 if (is_64_bit) {
2977 error = SYSCTL_IN(req, &user_vc, sizeof(user_vc));
2978 if (error)
2979 goto out;
2980 if (user_vc.vc_vers != VFS_CTL_VERS1) {
2981 error = EINVAL;
2982 goto out;
2983 }
2984 mp = mount_list_lookupby_fsid(&user_vc.vc_fsid, 0, 1);
2985 }
2986 else {
2987 error = SYSCTL_IN(req, &vc, sizeof(vc));
2988 if (error)
2989 goto out;
2990 if (vc.vc_vers != VFS_CTL_VERS1) {
2991 error = EINVAL;
2992 goto out;
2993 }
2994 mp = mount_list_lookupby_fsid(&vc.vc_fsid, 0, 1);
2995 }
2996 if (mp == NULL) {
2997 error = ENOENT;
2998 goto out;
2999 }
3000 gotref = 1;
3001 /* reset so that the fs specific code can fetch it. */
3002 req->newidx = 0;
3003 /*
3004 * Note if this is a VFS_CTL then we pass the actual sysctl req
3005 * in for "oldp" so that the lower layer can DTRT and use the
3006 * SYSCTL_IN/OUT routines.
3007 */
3008 if (mp->mnt_op->vfs_sysctl != NULL) {
3009 if (is_64_bit) {
3010 if (vfs_64bitready(mp)) {
3011 error = mp->mnt_op->vfs_sysctl(name, namelen,
3012 CAST_USER_ADDR_T(req),
3013 NULL, USER_ADDR_NULL, 0,
3014 ctx);
3015 }
3016 else {
3017 error = ENOTSUP;
3018 }
3019 }
3020 else {
3021 error = mp->mnt_op->vfs_sysctl(name, namelen,
3022 CAST_USER_ADDR_T(req),
3023 NULL, USER_ADDR_NULL, 0,
3024 ctx);
3025 }
3026 if (error != ENOTSUP) {
3027 goto out;
3028 }
3029 }
3030 switch (name[0]) {
3031 case VFS_CTL_UMOUNT:
3032 req->newidx = 0;
3033 if (is_64_bit) {
3034 req->newptr = user_vc.vc_ptr;
3035 req->newlen = (size_t)user_vc.vc_len;
3036 }
3037 else {
3038 req->newptr = CAST_USER_ADDR_T(vc.vc_ptr);
3039 req->newlen = vc.vc_len;
3040 }
3041 error = SYSCTL_IN(req, &flags, sizeof(flags));
3042 if (error)
3043 break;
3044
3045 mount_ref(mp, 0);
3046 mount_iterdrop(mp);
3047 gotref = 0;
3048 /* safedounmount consumes a ref */
3049 error = safedounmount(mp, flags, ctx);
3050 break;
3051 case VFS_CTL_STATFS:
3052 req->newidx = 0;
3053 if (is_64_bit) {
3054 req->newptr = user_vc.vc_ptr;
3055 req->newlen = (size_t)user_vc.vc_len;
3056 }
3057 else {
3058 req->newptr = CAST_USER_ADDR_T(vc.vc_ptr);
3059 req->newlen = vc.vc_len;
3060 }
3061 error = SYSCTL_IN(req, &flags, sizeof(flags));
3062 if (error)
3063 break;
3064 sp = &mp->mnt_vfsstat;
3065 if (((flags & MNT_NOWAIT) == 0 || (flags & MNT_WAIT)) &&
3066 (error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT)))
3067 goto out;
3068 if (is_64_bit) {
3069 struct user_statfs sfs;
3070 bzero(&sfs, sizeof(sfs));
3071 sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
3072 sfs.f_type = mp->mnt_vtable->vfc_typenum;
3073 sfs.f_bsize = (user_long_t)sp->f_bsize;
3074 sfs.f_iosize = (user_long_t)sp->f_iosize;
3075 sfs.f_blocks = (user_long_t)sp->f_blocks;
3076 sfs.f_bfree = (user_long_t)sp->f_bfree;
3077 sfs.f_bavail = (user_long_t)sp->f_bavail;
3078 sfs.f_files = (user_long_t)sp->f_files;
3079 sfs.f_ffree = (user_long_t)sp->f_ffree;
3080 sfs.f_fsid = sp->f_fsid;
3081 sfs.f_owner = sp->f_owner;
3082
3083 strlcpy(sfs.f_fstypename, sp->f_fstypename, MFSNAMELEN);
3084 strlcpy(sfs.f_mntonname, sp->f_mntonname, MNAMELEN);
3085 strlcpy(sfs.f_mntfromname, sp->f_mntfromname, MNAMELEN);
3086
3087 error = SYSCTL_OUT(req, &sfs, sizeof(sfs));
3088 }
3089 else {
3090 struct statfs sfs;
3091 bzero(&sfs, sizeof(struct statfs));
3092 sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
3093 sfs.f_type = mp->mnt_vtable->vfc_typenum;
3094
3095 /*
3096 * It's possible for there to be more than 2^31 blocks in the filesystem, so we
3097 * have to fudge the numbers here in that case. We inflate the blocksize in order
3098 * to reflect the filesystem size as best we can.
3099 */
3100 if (sp->f_blocks > LONG_MAX) {
3101 int shift;
3102
3103 /*
3104 * Work out how far we have to shift the block count down to make it fit.
3105 * Note that it's possible to have to shift so far that the resulting
3106 * blocksize would be unreportably large. At that point, we will clip
3107 * any values that don't fit.
3108 *
3109 * For safety's sake, we also ensure that f_iosize is never reported as
3110 * being smaller than f_bsize.
3111 */
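 /*
  * Worked example (hypothetical volume): with f_bsize 4096 and
  * f_blocks 6442450944 (roughly 24TB), the loop below settles at
  * shift == 2, since 6442450944 >> 2 == 1610612736 fits in a signed
  * 32-bit long; f_bsize is then reported as 4096 << 2 == 16384.
  */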
3112 for (shift = 0; shift < 32; shift++) {
3113 if ((sp->f_blocks >> shift) <= LONG_MAX)
3114 break;
3115 if ((sp->f_bsize << (shift + 1)) > LONG_MAX)
3116 break;
3117 }
3118 #define __SHIFT_OR_CLIP(x, s) ((((x) >> (s)) > LONG_MAX) ? LONG_MAX : ((x) >> (s)))
3119 sfs.f_blocks = (long)__SHIFT_OR_CLIP(sp->f_blocks, shift);
3120 sfs.f_bfree = (long)__SHIFT_OR_CLIP(sp->f_bfree, shift);
3121 sfs.f_bavail = (long)__SHIFT_OR_CLIP(sp->f_bavail, shift);
3122 #undef __SHIFT_OR_CLIP
3123 sfs.f_bsize = (long)(sp->f_bsize << shift);
3124 sfs.f_iosize = lmax(sp->f_iosize, sp->f_bsize);
3125 } else {
3126 sfs.f_bsize = (long)sp->f_bsize;
3127 sfs.f_iosize = (long)sp->f_iosize;
3128 sfs.f_blocks = (long)sp->f_blocks;
3129 sfs.f_bfree = (long)sp->f_bfree;
3130 sfs.f_bavail = (long)sp->f_bavail;
3131 }
3132 sfs.f_files = (long)sp->f_files;
3133 sfs.f_ffree = (long)sp->f_ffree;
3134 sfs.f_fsid = sp->f_fsid;
3135 sfs.f_owner = sp->f_owner;
3136
3137 strlcpy(sfs.f_fstypename, sp->f_fstypename, MFSNAMELEN);
3138 strlcpy(sfs.f_mntonname, sp->f_mntonname, MNAMELEN);
3139 strlcpy(sfs.f_mntfromname, sp->f_mntfromname, MNAMELEN);
3140
3141 error = SYSCTL_OUT(req, &sfs, sizeof(sfs));
3142 }
3143 break;
3144 default:
3145 error = ENOTSUP;
3146 goto out;
3147 }
3148 out:
3149 if(gotref != 0)
3150 mount_iterdrop(mp);
3151 return (error);
3152 }
3153
3154 static int filt_fsattach(struct knote *kn);
3155 static void filt_fsdetach(struct knote *kn);
3156 static int filt_fsevent(struct knote *kn, long hint);
3157
3158 struct filterops fs_filtops =
3159 { 0, filt_fsattach, filt_fsdetach, filt_fsevent };
3160
3161 static int
3162 filt_fsattach(struct knote *kn)
3163 {
3164
3165 lck_mtx_lock(fs_klist_lock);
3166 kn->kn_flags |= EV_CLEAR;
3167 KNOTE_ATTACH(&fs_klist, kn);
3168 lck_mtx_unlock(fs_klist_lock);
3169 return (0);
3170 }
3171
3172 static void
3173 filt_fsdetach(struct knote *kn)
3174 {
3175 lck_mtx_lock(fs_klist_lock);
3176 KNOTE_DETACH(&fs_klist, kn);
3177 lck_mtx_unlock(fs_klist_lock);
3178 }
3179
3180 static int
3181 filt_fsevent(struct knote *kn, long hint)
3182 {
3183 /*
3184 * Backwards compatibility:
3185 * Other filters would do nothing if kn->kn_sfflags == 0
3186 */
3187
3188 if ((kn->kn_sfflags == 0) || (kn->kn_sfflags & hint)) {
3189 kn->kn_fflags |= hint;
3190 }
3191
3192 return (kn->kn_fflags != 0);
3193 }
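/*
 * Example: a knote registered with kn_sfflags == VQ_UNMOUNT is
 * triggered only by unmount notifications, while one registered with
 * kn_sfflags == 0 accumulates every hint posted via
 * vfs_event_signal(), per the backwards compatibility note above.
 */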
3194
3195 static int
3196 sysctl_vfs_noremotehang(__unused struct sysctl_oid *oidp,
3197 __unused void *arg1, __unused int arg2, struct sysctl_req *req)
3198 {
3199 int out, error;
3200 pid_t pid;
3201 proc_t p;
3202
3203 /* We need a pid. */
3204 if (req->newptr == USER_ADDR_NULL)
3205 return (EINVAL);
3206
3207 error = SYSCTL_IN(req, &pid, sizeof(pid));
3208 if (error)
3209 return (error);
3210
3211 p = proc_find(pid < 0 ? -pid : pid);
3212 if (p == NULL)
3213 return (ESRCH);
3214
3215 /*
3216 * Fetching the value is ok, but we only fetch if the old
3217 * pointer is given.
3218 */
3219 if (req->oldptr != USER_ADDR_NULL) {
3220 out = !((p->p_flag & P_NOREMOTEHANG) == 0);
3221 proc_rele(p);
3222 error = SYSCTL_OUT(req, &out, sizeof(out));
3223 return (error);
3224 }
3225
3226 /* cansignal offers us enough security. */
3227 if (p != req->p && proc_suser(req->p) != 0) {
3228 proc_rele(p);
3229 return (EPERM);
3230 }
3231
3232 if (pid < 0)
3233 OSBitAndAtomic(~((uint32_t)P_NOREMOTEHANG), (UInt32 *)&p->p_flag);
3234 else
3235 OSBitOrAtomic(P_NOREMOTEHANG, (UInt32 *)&p->p_flag);
3236 proc_rele(p);
3237
3238 return (0);
3239 }
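/*
 * Example: writing a positive pid to vfs.generic.noremotehang sets
 * P_NOREMOTEHANG on that process, which is intended to make remote
 * filesystem operations fail rather than hang; writing the negated
 * pid clears the flag. Supplying an old pointer reads back the
 * current setting without changing it.
 */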
3240
3241 /* the vfs.generic. branch. */
3242 SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RW|CTLFLAG_LOCKED, NULL, "vfs generic hinge");
3243 /* retrieve a list of mounted filesystem fsid_t */
3244 SYSCTL_PROC(_vfs_generic, OID_AUTO, vfsidlist, CTLFLAG_RD,
3245 NULL, 0, sysctl_vfs_vfslist, "S,fsid", "List of mounted filesystem ids");
3246 /* perform operations on filesystem via fsid_t */
3247 SYSCTL_NODE(_vfs_generic, OID_AUTO, ctlbyfsid, CTLFLAG_RW|CTLFLAG_LOCKED,
3248 sysctl_vfs_ctlbyfsid, "ctlbyfsid");
3249 SYSCTL_PROC(_vfs_generic, OID_AUTO, noremotehang, CTLFLAG_RW|CTLFLAG_ANYBODY,
3250 NULL, 0, sysctl_vfs_noremotehang, "I", "noremotehang");
3251
3252
3253 long num_reusedvnodes = 0; /* long for OSAddAtomic */
3254
3255 static int
3256 new_vnode(vnode_t *vpp)
3257 {
3258 vnode_t vp;
3259 int retries = 0; /* retry in case the table is full */
3260 int force_alloc = 0, walk_count = 0;
3261 int vpid;
3262 struct timespec ts;
3263 struct timeval current_tv;
3264 struct unsafe_fsnode *l_unsafefs = 0;
3265 proc_t curproc = current_proc();
3266
3267 retry:
3268 microuptime(&current_tv);
3269
3270 vp = NULLVP;
3271
3272 vnode_list_lock();
3273
3274 if ( !TAILQ_EMPTY(&vnode_dead_list)) {
3275 /*
3276 * Can always reuse a dead one
3277 */
3278 vp = TAILQ_FIRST(&vnode_dead_list);
3279 goto steal_this_vp;
3280 }
3281 /*
3282 * no dead vnodes available... if we're under
3283 * the limit, we'll create a new vnode
3284 */
3285 if (numvnodes < desiredvnodes || force_alloc) {
3286 numvnodes++;
3287 vnode_list_unlock();
3288 MALLOC_ZONE(vp, struct vnode *, sizeof(*vp), M_VNODE, M_WAITOK);
3289 bzero((char *)vp, sizeof(*vp));
3290 VLISTNONE(vp); /* avoid double queue removal */
3291 lck_mtx_init(&vp->v_lock, vnode_lck_grp, vnode_lck_attr);
3292
3293 nanouptime(&ts);
3294 vp->v_id = ts.tv_nsec;
3295 vp->v_flag = VSTANDARD;
3296
3297 #if CONFIG_MACF
3298 mac_vnode_label_init(vp);
3299 #endif /* MAC */
3300
3301 vp->v_iocount = 1;
3302 goto done;
3303 }
3304
3305 #define MAX_WALK_COUNT 1000
3306
3307 if ( !TAILQ_EMPTY(&vnode_rage_list) &&
3308 (ragevnodes >= rage_limit ||
3309 (current_tv.tv_sec - rage_tv.tv_sec) >= RAGE_TIME_LIMIT)) {
3310
3311 TAILQ_FOREACH(vp, &vnode_rage_list, v_freelist) {
3312 if ( !(vp->v_listflag & VLIST_RAGE) || !(vp->v_flag & VRAGE))
3313 panic("new_vnode: vp on RAGE list not marked both VLIST_RAGE and VRAGE");
3314
3315 // if we're a dependency-capable process, skip vnodes that can
3316 // cause recycling deadlocks. (i.e. this process is diskimages
3317 // helper and the vnode is in a disk image).
3318 //
3319 if ((curproc->p_flag & P_DEPENDENCY_CAPABLE) == 0 || vp->v_mount == NULL || vp->v_mount->mnt_dependent_process == NULL) {
3320 break;
3321 }
3322
3323 // don't iterate more than MAX_WALK_COUNT vnodes to
3324 // avoid keeping the vnode list lock held for too long.
3325 if (walk_count++ > MAX_WALK_COUNT) {
3326 vp = NULL;
3327 break;
3328 }
3329 }
3330
3331 }
3332
3333 if (vp == NULL && !TAILQ_EMPTY(&vnode_free_list)) {
3334 /*
3335 * Pick the first vp for possible reuse
3336 */
3337 walk_count = 0;
3338 TAILQ_FOREACH(vp, &vnode_free_list, v_freelist) {
3339 // if we're a dependency-capable process, skip vnodes that can
3340 // cause recycling deadlocks. (i.e. this process is diskimages
3341 // helper and the vnode is in a disk image)
3342 //
3343 if ((curproc->p_flag & P_DEPENDENCY_CAPABLE) == 0 || vp->v_mount == NULL || vp->v_mount->mnt_dependent_process == NULL) {
3344 break;
3345 }
3346
3347 // don't iterate more than MAX_WALK_COUNT vnodes to
3348 // avoid keeping the vnode list lock held for too long.
3349 if (walk_count++ > MAX_WALK_COUNT) {
3350 vp = NULL;
3351 break;
3352 }
3353 }
3354
3355 }
3356
3357 //
3358 // if we don't have a vnode and the walk_count is >= MAX_WALK_COUNT
3359 // then we're trying to create a vnode on behalf of a
3360 // process like diskimages-helper that has file systems
3361 // mounted on top of itself (and thus we can't reclaim
3362 // vnodes in the file systems on top of us). if we can't
3363 // find a vnode to reclaim then we'll just have to force
3364 // the allocation.
3365 //
3366 if (vp == NULL && walk_count >= MAX_WALK_COUNT) {
3367 force_alloc = 1;
3368 vnode_list_unlock();
3369 goto retry;
3370 }
3371
3372 if (vp == NULL) {
3373 /*
3374 * we've reached the system imposed maximum number of vnodes
3375 * but there isn't a single one available
3376 * wait a bit and then retry... if we can't get a vnode
3377 * after 100 retries, then log a complaint
3378 */
3379 if (++retries <= 100) {
3380 vnode_list_unlock();
3381 delay_for_interval(1, 1000 * 1000);
3382 goto retry;
3383 }
3384
3385 vnode_list_unlock();
3386 tablefull("vnode");
3387 log(LOG_EMERG, "%d desired, %d numvnodes, "
3388 "%d free, %d dead, %d rage\n",
3389 desiredvnodes, numvnodes, freevnodes, deadvnodes, ragevnodes);
3390 #if CONFIG_EMBEDDED
3391 /*
3392 * Running out of vnodes tends to make a system unusable. On an
3393 * embedded system, it's unlikely that the user can do anything
3394 * about it (or would know what to do, if they could). So panic
3395 * the system so it will automatically restart (and hopefully we
3396 * can get a panic log that tells us why we ran out).
3397 */
3398 panic("vnode table is full\n");
3399 #endif
3400 *vpp = NULL;
3401 return (ENFILE);
3402 }
3403 steal_this_vp:
3404 vpid = vp->v_id;
3405
3406 vnode_list_remove_locked(vp);
3407
3408 vnode_list_unlock();
3409 vnode_lock_spin(vp);
3410
3411 /*
3412 * We could wait for the vnode_lock after removing the vp from the freelist
3413 * and the vid is bumped only at the very end of reclaim. So it is possible
3414 * that we are looking at a vnode that is being terminated. If so skip it.
3415 */
3416 if ((vpid != vp->v_id) || (vp->v_usecount != 0) || (vp->v_iocount != 0) ||
3417 VONLIST(vp) || (vp->v_lflag & VL_TERMINATE)) {
3418 /*
3419 * we lost the race between dropping the list lock
3420 * and picking up the vnode_lock... someone else
3421 * used this vnode and it is now in a new state
3422 * so we need to go back and try again
3423 */
3424 vnode_unlock(vp);
3425 goto retry;
3426 }
3427 if ( (vp->v_lflag & (VL_NEEDINACTIVE | VL_MARKTERM)) == VL_NEEDINACTIVE ) {
3428 /*
3429 * we did a vnode_rele_ext that asked for
3430 * us not to reenter the filesystem during
3431 * the release even though VL_NEEDINACTIVE was
3432 * set... we'll do it here by doing a
3433 * vnode_get/vnode_put
3434 *
3435 * pick up an iocount so that we can call
3436 * vnode_put and drive the VNOP_INACTIVE...
3437 * vnode_put will either leave us off
3438 * the freelist if a new ref comes in,
3439 * or put us back on the end of the freelist
3440 * or recycle us if we were marked for termination...
3441 * so we'll just go grab a new candidate
3442 */
3443 vp->v_iocount++;
3444 #ifdef JOE_DEBUG
3445 record_vp(vp, 1);
3446 #endif
3447 vnode_put_locked(vp);
3448 vnode_unlock(vp);
3449 goto retry;
3450 }
3451 OSAddAtomic(1, &num_reusedvnodes);
3452
3453 /* Checks for anyone racing us for recycle */
3454 if (vp->v_type != VBAD) {
3455 if (vp->v_lflag & VL_DEAD)
3456 panic("new_vnode: the vnode is VL_DEAD but not VBAD");
3457 vnode_lock_convert(vp);
3458 (void)vnode_reclaim_internal(vp, 1, 1, 0);
3459
3460 if ((VONLIST(vp)))
3461 panic("new_vnode: vp on list ");
3462 if (vp->v_usecount || vp->v_iocount || vp->v_kusecount ||
3463 (vp->v_lflag & (VNAMED_UBC | VNAMED_MOUNT | VNAMED_FSHASH)))
3464 panic("new_vnode: free vnode still referenced\n");
3465 if ((vp->v_mntvnodes.tqe_prev != 0) && (vp->v_mntvnodes.tqe_next != 0))
3466 panic("new_vnode: vnode seems to be on mount list ");
3467 if ( !LIST_EMPTY(&vp->v_nclinks) || !LIST_EMPTY(&vp->v_ncchildren))
3468 panic("new_vnode: vnode still hooked into the name cache");
3469 }
3470 if (vp->v_unsafefs) {
3471 l_unsafefs = vp->v_unsafefs;
3472 vp->v_unsafefs = (struct unsafe_fsnode *)NULL;
3473 }
3474
3475 #if CONFIG_MACF
3476 /*
3477 * We should never see VL_LABELWAIT or VL_LABEL here.
3478 * as those operations hold a reference.
3479 */
3480 assert ((vp->v_lflag & VL_LABELWAIT) != VL_LABELWAIT);
3481 assert ((vp->v_lflag & VL_LABEL) != VL_LABEL);
3482 if (vp->v_lflag & VL_LABELED) {
3483 vnode_lock_convert(vp);
3484 mac_vnode_label_recycle(vp);
3485 }
3486 #endif /* MAC */
3487
3488 vp->v_iocount = 1;
3489 vp->v_lflag = 0;
3490 vp->v_writecount = 0;
3491 vp->v_references = 0;
3492 vp->v_iterblkflags = 0;
3493 vp->v_flag = VSTANDARD;
3494 /* vbad vnodes can point to dead_mountp */
3495 vp->v_mount = NULL;
3496 vp->v_defer_reclaimlist = (vnode_t)0;
3497
3498 vnode_unlock(vp);
3499
3500 if (l_unsafefs) {
3501 lck_mtx_destroy(&l_unsafefs->fsnodelock, vnode_lck_grp);
3502 FREE_ZONE((void *)l_unsafefs, sizeof(struct unsafe_fsnode), M_UNSAFEFS);
3503 }
3504 done:
3505 *vpp = vp;
3506
3507 return (0);
3508 }
3509
3510 void
3511 vnode_lock(vnode_t vp)
3512 {
3513 lck_mtx_lock(&vp->v_lock);
3514 }
3515
3516 void
3517 vnode_lock_spin(vnode_t vp)
3518 {
3519 lck_mtx_lock_spin(&vp->v_lock);
3520 }
3521
3522 void
3523 vnode_unlock(vnode_t vp)
3524 {
3525 lck_mtx_unlock(&vp->v_lock);
3526 }
3527
3528
3529
3530 int
3531 vnode_get(struct vnode *vp)
3532 {
3533 int retval;
3534
3535 vnode_lock_spin(vp);
3536 retval = vnode_get_locked(vp);
3537 vnode_unlock(vp);
3538
3539 return(retval);
3540 }
3541
3542 int
3543 vnode_get_locked(struct vnode *vp)
3544 {
3545
3546 if ((vp->v_iocount == 0) && (vp->v_lflag & (VL_TERMINATE | VL_DEAD))) {
3547 return(ENOENT);
3548 }
3549 vp->v_iocount++;
3550 #ifdef JOE_DEBUG
3551 record_vp(vp, 1);
3552 #endif
3553 return (0);
3554 }
3555
3556 int
3557 vnode_getwithvid(vnode_t vp, int vid)
3558 {
3559 return(vget_internal(vp, vid, ( VNODE_NODEAD| VNODE_WITHID)));
3560 }
3561
3562 int
3563 vnode_getwithref(vnode_t vp)
3564 {
3565 return(vget_internal(vp, 0, 0));
3566 }
3567
3568
3569 __private_extern__ int
3570 vnode_getalways(vnode_t vp)
3571 {
3572 return(vget_internal(vp, 0, VNODE_ALWAYS));
3573 }
3574
3575 int
3576 vnode_put(vnode_t vp)
3577 {
3578 int retval;
3579
3580 vnode_lock_spin(vp);
3581 retval = vnode_put_locked(vp);
3582 vnode_unlock(vp);
3583
3584 return(retval);
3585 }
3586
3587 int
3588 vnode_put_locked(vnode_t vp)
3589 {
3590 vfs_context_t ctx = vfs_context_current(); /* hoist outside loop */
3591
3592 retry:
3593 if (vp->v_iocount < 1)
3594 panic("vnode_put(%p): iocount < 1", vp);
3595
3596 if ((vp->v_usecount > 0) || (vp->v_iocount > 1)) {
3597 vnode_dropiocount(vp);
3598 return(0);
3599 }
3600 if ((vp->v_lflag & (VL_MARKTERM | VL_TERMINATE | VL_DEAD | VL_NEEDINACTIVE)) == VL_NEEDINACTIVE) {
3601
3602 vp->v_lflag &= ~VL_NEEDINACTIVE;
3603 vnode_unlock(vp);
3604
3605 VNOP_INACTIVE(vp, ctx);
3606
3607 vnode_lock_spin(vp);
3608 /*
3609 * because we had to drop the vnode lock before calling
3610 * VNOP_INACTIVE, the state of this vnode may have changed...
3611 * we may pick up both VL_MARKTERM and either
3612 * an iocount or a usecount while in the VNOP_INACTIVE call
3613 * we don't want to call vnode_reclaim_internal on a vnode
3614 * that has active references on it... so loop back around
3615 * and reevaluate the state
3616 */
3617 goto retry;
3618 }
3619 vp->v_lflag &= ~VL_NEEDINACTIVE;
3620
3621 if ((vp->v_lflag & (VL_MARKTERM | VL_TERMINATE | VL_DEAD)) == VL_MARKTERM) {
3622 vnode_lock_convert(vp);
3623 vnode_reclaim_internal(vp, 1, 1, 0);
3624 }
3625 vnode_dropiocount(vp);
3626 vnode_list_add(vp);
3627
3628 return(0);
3629 }
3630
3631 /* is vnode_t in use by others? */
3632 int
3633 vnode_isinuse(vnode_t vp, int refcnt)
3634 {
3635 return(vnode_isinuse_locked(vp, refcnt, 0));
3636 }
3637
3638
3639 static int
3640 vnode_isinuse_locked(vnode_t vp, int refcnt, int locked)
3641 {
3642 int retval = 0;
3643
3644 if (!locked)
3645 vnode_lock_spin(vp);
3646 if ((vp->v_type != VREG) && ((vp->v_usecount - vp->v_kusecount) > refcnt)) {
3647 retval = 1;
3648 goto out;
3649 }
3650 if (vp->v_type == VREG) {
3651 retval = ubc_isinuse_locked(vp, refcnt, 1);
3652 }
3653
3654 out:
3655 if (!locked)
3656 vnode_unlock(vp);
3657 return(retval);
3658 }
3659
3660
3661 /* resume vnode_t */
3662 errno_t
3663 vnode_resume(vnode_t vp)
3664 {
3665
3666 vnode_lock_spin(vp);
3667
3668 if (vp->v_owner == current_thread()) {
3669 vp->v_lflag &= ~VL_SUSPENDED;
3670 vp->v_owner = NULL;
3671 vnode_unlock(vp);
3672 wakeup(&vp->v_iocount);
3673 } else
3674 vnode_unlock(vp);
3675
3676 return(0);
3677 }
3678
3679 /* suspend vnode_t
3680 * Please do not use on more than one vnode at a time as it may
3681 * cause deadlocks.
3682 * xxx should we explicitly prevent this from happening?
3683 */
3684
3685 errno_t
3686 vnode_suspend(vnode_t vp)
3687 {
3688 if (vp->v_lflag & VL_SUSPENDED) {
3689 return(EBUSY);
3690 }
3691
3692 vnode_lock_spin(vp);
3693
3694 /*
3695 * xxx is this sufficient to check if a vnode_drain is
3696 * in progress?
3697 */
3698
3699 if (vp->v_owner == NULL) {
3700 vp->v_lflag |= VL_SUSPENDED;
3701 vp->v_owner = current_thread();
3702 }
3703 vnode_unlock(vp);
3704
3705 return(0);
3706 }
3707
3708
3709
3710 static errno_t
3711 vnode_drain(vnode_t vp)
3712 {
3713
3714 if (vp->v_lflag & VL_DRAIN) {
3715 panic("vnode_drain: recursive drain");
3716 return(ENOENT);
3717 }
3718 vp->v_lflag |= VL_DRAIN;
3719 vp->v_owner = current_thread();
3720
3721 while (vp->v_iocount > 1)
3722 msleep(&vp->v_iocount, &vp->v_lock, PVFS, "vnode_drain", NULL);
3723 return(0);
3724 }
3725
3726
3727 /*
3728 * if the number of recent references via vnode_getwithvid or vnode_getwithref
3729 * exceeds this threshold, then 'UN-AGE' the vnode by removing it from
3730 * the LRU list if it's currently on it... once the iocount and usecount both drop
3731 * to 0, it will get put back on the end of the list, effectively making it younger
3732 * this allows us to keep actively referenced vnodes in the list without having
3733 * to constantly remove and add to the list each time a vnode w/o a usecount is
3734 * referenced which costs us taking and dropping a global lock twice.
3735 */
3736 #define UNAGE_THRESHHOLD 25
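/*
 * Example: the 25th vnode_getwithref() against a vnode parked on the
 * LRU resets v_references and removes the vnode from the list; a
 * later vnode_put() re-adds it at the tail (youngest end) once the
 * iocount and usecount both drop back to 0.
 */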
3737
3738 static errno_t
3739 vnode_getiocount(vnode_t vp, int vid, int vflags)
3740 {
3741 int nodead = vflags & VNODE_NODEAD;
3742 int nosusp = vflags & VNODE_NOSUSPEND;
3743 int always = vflags & VNODE_ALWAYS;
3744
3745 for (;;) {
3746 /*
3747 * if it is a dead vnode with deadfs
3748 */
3749 if (nodead && (vp->v_lflag & VL_DEAD) && ((vp->v_type == VBAD) || (vp->v_data == 0))) {
3750 return(ENOENT);
3751 }
3752 /*
3753 * will return VL_DEAD ones
3754 */
3755 if ((vp->v_lflag & (VL_SUSPENDED | VL_DRAIN | VL_TERMINATE)) == 0 ) {
3756 break;
3757 }
3758 /*
3759 * if suspended vnodes are to be failed
3760 */
3761 if (nosusp && (vp->v_lflag & VL_SUSPENDED)) {
3762 return(ENOENT);
3763 }
3764 /*
3765 * if you are the owner of drain/suspend/termination , can acquire iocount
3766 * check for VL_TERMINATE; it does not set owner
3767 */
3768 if ((vp->v_lflag & (VL_DRAIN | VL_SUSPENDED | VL_TERMINATE)) &&
3769 (vp->v_owner == current_thread())) {
3770 break;
3771 }
3772 if (always != 0)
3773 break;
3774 vnode_lock_convert(vp);
3775
3776 if (vp->v_lflag & VL_TERMINATE) {
3777 vp->v_lflag |= VL_TERMWANT;
3778
3779 msleep(&vp->v_lflag, &vp->v_lock, PVFS, "vnode getiocount", NULL);
3780 } else
3781 msleep(&vp->v_iocount, &vp->v_lock, PVFS, "vnode_getiocount", NULL);
3782 }
3783 if (vid != vp->v_id) {
3784 return(ENOENT);
3785 }
3786 if (++vp->v_references >= UNAGE_THRESHHOLD) {
3787 vp->v_references = 0;
3788 vnode_list_remove(vp);
3789 }
3790 vp->v_iocount++;
3791 #ifdef JOE_DEBUG
3792 record_vp(vp, 1);
3793 #endif
3794 return(0);
3795 }
3796
3797 static void
3798 vnode_dropiocount (vnode_t vp)
3799 {
3800 if (vp->v_iocount < 1)
3801 panic("vnode_dropiocount(%p): v_iocount < 1", vp);
3802
3803 vp->v_iocount--;
3804 #ifdef JOE_DEBUG
3805 record_vp(vp, -1);
3806 #endif
3807 if ((vp->v_lflag & (VL_DRAIN | VL_SUSPENDED)) && (vp->v_iocount <= 1)) {
3808 vnode_lock_convert(vp);
3809 wakeup(&vp->v_iocount);
3810 }
3811 }
3812
3813
3814 void
3815 vnode_reclaim(struct vnode * vp)
3816 {
3817 vnode_reclaim_internal(vp, 0, 0, 0);
3818 }
3819
3820 __private_extern__
3821 void
3822 vnode_reclaim_internal(struct vnode * vp, int locked, int reuse, int flags)
3823 {
3824 int isfifo = 0;
3825
3826 if (!locked)
3827 vnode_lock(vp);
3828
3829 if (vp->v_lflag & VL_TERMINATE) {
3830 panic("vnode reclaim in progress");
3831 }
3832 vp->v_lflag |= VL_TERMINATE;
3833
3834 vn_clearunionwait(vp, 1);
3835
3836 if (vnode_drain(vp)) {
3837 panic("vnode drain failed");
3838 vnode_unlock(vp);
3839 return;
3840 }
3841 isfifo = (vp->v_type == VFIFO);
3842
3843 if (vp->v_type != VBAD)
3844 vgone(vp, flags); /* clean and reclaim the vnode */
3845
3846 /*
3847 * give the vnode a new identity so that vnode_getwithvid will fail
3848 * on any stale cache accesses...
3849 * grab the list_lock so that if we're in "new_vnode"
3850 * behind the list_lock trying to steal this vnode, the v_id is stable...
3851 * once new_vnode drops the list_lock, it will block trying to take
3852 * the vnode lock until we release it... at that point it will evaluate
3853 * whether the v_id has changed
3854 * also need to make sure that the vnode isn't on a list where "new_vnode"
3855 * can find it after the v_id has been bumped until we are completely done
3856 * with the vnode (i.e. putting it back on a list has to be the very last
3857 * thing we do to this vnode... many of the callers of vnode_reclaim_internal
3858 * are holding an io_count on the vnode... they need to drop the io_count
3859 * BEFORE doing a vnode_list_add or make sure to hold the vnode lock until
3860 * they are completely done with the vnode
3861 */
3862 vnode_list_lock();
3863
3864 vnode_list_remove_locked(vp);
3865 vp->v_id++;
3866
3867 vnode_list_unlock();
3868
3869 if (isfifo) {
3870 struct fifoinfo * fip;
3871
3872 fip = vp->v_fifoinfo;
3873 vp->v_fifoinfo = NULL;
3874 FREE(fip, M_TEMP);
3875 }
3876
3877 vp->v_type = VBAD;
3878
3879 if (vp->v_data)
3880 panic("vnode_reclaim_internal: cleaned vnode isn't");
3881 if (vp->v_numoutput)
3882 panic("vnode_reclaim_internal: clean vnode has pending I/O's");
3883 if (UBCINFOEXISTS(vp))
3884 panic("vnode_reclaim_internal: ubcinfo not cleaned");
3885 if (vp->v_parent)
3886 panic("vnode_reclaim_internal: vparent not removed");
3887 if (vp->v_name)
3888 panic("vnode_reclaim_internal: vname not removed");
3889
3890 vp->v_socket = NULL;
3891
3892 vp->v_lflag &= ~VL_TERMINATE;
3893 vp->v_lflag &= ~VL_DRAIN;
3894 vp->v_owner = NULL;
3895
3896 if (vp->v_lflag & VL_TERMWANT) {
3897 vp->v_lflag &= ~VL_TERMWANT;
3898 wakeup(&vp->v_lflag);
3899 }
3900 if (!reuse) {
3901 /*
3902 * make sure we get on the
3903 * dead list if appropriate
3904 */
3905 vnode_list_add(vp);
3906 }
3907 if (!locked)
3908 vnode_unlock(vp);
3909 }
3910
3911 /* USAGE:
3912 * vnode_create(int flavor, size_t size, void * param, vnode_t *vp)
3913 */
3914 int
3915 vnode_create(int flavor, size_t size, void *data, vnode_t *vpp)
3916 {
3917 int error;
3918 int insert = 1;
3919 vnode_t vp;
3920 vnode_t nvp;
3921 vnode_t dvp;
3922 struct uthread *ut;
3923 struct componentname *cnp;
3924 struct vnode_fsparam *param = (struct vnode_fsparam *)data;
3925
3926 if (flavor == VNCREATE_FLAVOR && (size == VCREATESIZE) && param) {
3927 if ( (error = new_vnode(&vp)) ) {
3928 return(error);
3929 } else {
3930 dvp = param->vnfs_dvp;
3931 cnp = param->vnfs_cnp;
3932
3933 vp->v_op = param->vnfs_vops;
3934 vp->v_type = param->vnfs_vtype;
3935 vp->v_data = param->vnfs_fsnode;
3936
3937 if (param->vnfs_markroot)
3938 vp->v_flag |= VROOT;
3939 if (param->vnfs_marksystem)
3940 vp->v_flag |= VSYSTEM;
3941 if (vp->v_type == VREG) {
3942 error = ubc_info_init_withsize(vp, param->vnfs_filesize);
3943 if (error) {
3944 #ifdef JOE_DEBUG
3945 record_vp(vp, 1);
3946 #endif
3947 vp->v_mount = NULL;
3948 vp->v_op = dead_vnodeop_p;
3949 vp->v_tag = VT_NON;
3950 vp->v_data = NULL;
3951 vp->v_type = VBAD;
3952 vp->v_lflag |= VL_DEAD;
3953
3954 vnode_put(vp);
3955 return(error);
3956 }
3957 }
3958 #ifdef JOE_DEBUG
3959 record_vp(vp, 1);
3960 #endif
3961 if (vp->v_type == VCHR || vp->v_type == VBLK) {
3962
3963 vp->v_tag = VT_DEVFS; /* callers will reset if needed (bdevvp) */
3964
3965 if ( (nvp = checkalias(vp, param->vnfs_rdev)) ) {
3966 /*
3967 * if checkalias returns a vnode, it will be locked
3968 *
3969 * first get rid of the unneeded vnode we acquired
3970 */
3971 vp->v_data = NULL;
3972 vp->v_op = spec_vnodeop_p;
3973 vp->v_type = VBAD;
3974 vp->v_lflag = VL_DEAD;
3975 vp->v_data = NULL;
3976 vp->v_tag = VT_NON;
3977 vnode_put(vp);
3978
3979 /*
3980 * switch to aliased vnode and finish
3981 * preparing it
3982 */
3983 vp = nvp;
3984
3985 vclean(vp, 0);
3986 vp->v_op = param->vnfs_vops;
3987 vp->v_type = param->vnfs_vtype;
3988 vp->v_data = param->vnfs_fsnode;
3989 vp->v_lflag = 0;
3990 vp->v_mount = NULL;
3991 insmntque(vp, param->vnfs_mp);
3992 insert = 0;
3993 vnode_unlock(vp);
3994 }
3995 }
3996
3997 if (vp->v_type == VFIFO) {
3998 struct fifoinfo *fip;
3999
4000 MALLOC(fip, struct fifoinfo *,
4001 sizeof(*fip), M_TEMP, M_WAITOK);
4002 bzero(fip, sizeof(struct fifoinfo ));
4003 vp->v_fifoinfo = fip;
4004 }
4005 /* The file systems usually pass the address of the location
4006 * where they store the vnode pointer. Once we add the vnode to the
4007 * mount point and the name cache it becomes discoverable, so the
4008 * file system node must have its connection to the vnode set up by then
4009 */
4010 *vpp = vp;
4011
4012 /* Add fs named reference. */
4013 if (param->vnfs_flags & VNFS_ADDFSREF) {
4014 vp->v_lflag |= VNAMED_FSHASH;
4015 }
4016 if (param->vnfs_mp) {
4017 if (param->vnfs_mp->mnt_kern_flag & MNTK_LOCK_LOCAL)
4018 vp->v_flag |= VLOCKLOCAL;
4019 if (insert) {
4020 if ((vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb))
4021 panic("insmntque: vp on the free list\n");
4022
4023 /*
4024 * enter in mount vnode list
4025 */
4026 insmntque(vp, param->vnfs_mp);
4027 }
4028 #ifdef INTERIM_FSNODE_LOCK
4029 if (param->vnfs_mp->mnt_vtable->vfc_threadsafe == 0) {
4030 MALLOC_ZONE(vp->v_unsafefs, struct unsafe_fsnode *,
4031 sizeof(struct unsafe_fsnode), M_UNSAFEFS, M_WAITOK);
4032 vp->v_unsafefs->fsnode_count = 0;
4033 vp->v_unsafefs->fsnodeowner = (void *)NULL;
4034 lck_mtx_init(&vp->v_unsafefs->fsnodelock, vnode_lck_grp, vnode_lck_attr);
4035 }
4036 #endif /* INTERIM_FSNODE_LOCK */
4037 }
4038 if (dvp && vnode_ref(dvp) == 0) {
4039 vp->v_parent = dvp;
4040 }
4041 if (cnp) {
4042 if (dvp && ((param->vnfs_flags & (VNFS_NOCACHE | VNFS_CANTCACHE)) == 0)) {
4043 /*
4044 * enter into name cache
4045 * we've got the info to enter it into the name cache now
4046 */
4047 cache_enter(dvp, vp, cnp);
4048 }
4049 vp->v_name = vfs_addname(cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_hash, 0);
4050 if ((cnp->cn_flags & UNIONCREATED) == UNIONCREATED)
4051 vp->v_flag |= VISUNION;
4052 }
4053 if ((param->vnfs_flags & VNFS_CANTCACHE) == 0) {
4054 /*
4055 * this vnode is being created as cacheable in the name cache
4056 * this allows us to re-enter it in the cache
4057 */
4058 vp->v_flag |= VNCACHEABLE;
4059 }
4060 ut = get_bsdthread_info(current_thread());
4061
4062 if ((current_proc()->p_lflag & P_LRAGE_VNODES) ||
4063 (ut->uu_flag & UT_RAGE_VNODES)) {
4064 /*
4065 * process has indicated that it wants any
4066 * vnodes created on its behalf to be rapidly
4067 * aged to reduce the impact on the cached set
4068 * of vnodes
4069 */
4070 vp->v_flag |= VRAGE;
4071 }
4072 return(0);
4073 }
4074 }
4075 return (EINVAL);
4076 }
4077
4078 int
4079 vnode_addfsref(vnode_t vp)
4080 {
4081 vnode_lock_spin(vp);
4082 if (vp->v_lflag & VNAMED_FSHASH)
4083 panic("add_fsref: vp already has named reference");
4084 if ((vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb))
4085 panic("addfsref: vp on the free list\n");
4086 vp->v_lflag |= VNAMED_FSHASH;
4087 vnode_unlock(vp);
4088 return(0);
4089
4090 }
4091 int
4092 vnode_removefsref(vnode_t vp)
4093 {
4094 vnode_lock_spin(vp);
4095 if ((vp->v_lflag & VNAMED_FSHASH) == 0)
4096 panic("remove_fsref: no named reference");
4097 vp->v_lflag &= ~VNAMED_FSHASH;
4098 vnode_unlock(vp);
4099 return(0);
4100
4101 }
4102
4103
4104 int
4105 vfs_iterate(__unused int flags, int (*callout)(mount_t, void *), void *arg)
4106 {
4107 mount_t mp;
4108 int ret = 0;
4109 fsid_t * fsid_list;
4110 int count, actualcount, i;
4111 void * allocmem;
4112
4113 count = mount_getvfscnt();
4114 count += 10;
4115
4116 fsid_list = (fsid_t *)kalloc(count * sizeof(fsid_t));
4117 allocmem = (void *)fsid_list;
4118
4119 actualcount = mount_fillfsids(fsid_list, count);
4120
4121 for (i=0; i< actualcount; i++) {
4122
4123 /* obtain the mount point with iteration reference */
4124 mp = mount_list_lookupby_fsid(&fsid_list[i], 0, 1);
4125
4126 if(mp == (struct mount *)0)
4127 continue;
4128 mount_lock(mp);
4129 if (mp->mnt_lflag & (MNT_LDEAD | MNT_LUNMOUNT)) {
4130 mount_unlock(mp);
4131 mount_iterdrop(mp);
4132 continue;
4133
4134 }
4135 mount_unlock(mp);
4136
4137 /* iterate over all the vnodes */
4138 ret = callout(mp, arg);
4139
4140 mount_iterdrop(mp);
4141
4142 switch (ret) {
4143 case VFS_RETURNED:
4144 case VFS_RETURNED_DONE:
4145 if (ret == VFS_RETURNED_DONE) {
4146 ret = 0;
4147 goto out;
4148 }
4149 break;
4150
4151 case VFS_CLAIMED_DONE:
4152 ret = 0;
4153 goto out;
4154 case VFS_CLAIMED:
4155 default:
4156 break;
4157 }
4158 ret = 0;
4159 }
4160
4161 out:
4162 kfree(allocmem, (count * sizeof(fsid_t)));
4163 return (ret);
4164 }
4165
4166 /*
4167 * Update the vfsstatfs structure in the mountpoint.
4168 * MAC: Parameter eventtype added, indicating whether the event that
4169 * triggered this update came from user space, via a system call
4170 * (VFS_USER_EVENT) or an internal kernel call (VFS_KERNEL_EVENT).
4171 */
4172 int
4173 vfs_update_vfsstat(mount_t mp, vfs_context_t ctx, __unused int eventtype)
4174 {
4175 struct vfs_attr va;
4176 int error;
4177
4178 /*
4179 * Request the attributes we want to propagate into
4180 * the per-mount vfsstat structure.
4181 */
4182 VFSATTR_INIT(&va);
4183 VFSATTR_WANTED(&va, f_iosize);
4184 VFSATTR_WANTED(&va, f_blocks);
4185 VFSATTR_WANTED(&va, f_bfree);
4186 VFSATTR_WANTED(&va, f_bavail);
4187 VFSATTR_WANTED(&va, f_bused);
4188 VFSATTR_WANTED(&va, f_files);
4189 VFSATTR_WANTED(&va, f_ffree);
4190 VFSATTR_WANTED(&va, f_bsize);
4191 VFSATTR_WANTED(&va, f_fssubtype);
4192 #if CONFIG_MACF
4193 if (eventtype == VFS_USER_EVENT) {
4194 error = mac_mount_check_getattr(ctx, mp, &va);
4195 if (error != 0)
4196 return (error);
4197 }
4198 #endif
4199
4200 if ((error = vfs_getattr(mp, &va, ctx)) != 0) {
4201 KAUTH_DEBUG("STAT - filesystem returned error %d", error);
4202 return(error);
4203 }
4204
4205 /*
4206 * Unpack into the per-mount structure.
4207 *
4208 * We only overwrite these fields, which are likely to change:
4209 * f_blocks
4210 * f_bfree
4211 * f_bavail
4212 * f_bused
4213 * f_files
4214 * f_ffree
4215 *
4216 * And these which are not, but which the FS has no other way
4217 * of providing to us:
4218 * f_bsize
4219 * f_iosize
4220 * f_fssubtype
4221 *
4222 */
4223 if (VFSATTR_IS_SUPPORTED(&va, f_bsize)) {
4224 /* 4822056 - protect against malformed server mount */
4225 mp->mnt_vfsstat.f_bsize = (va.f_bsize > 0 ? va.f_bsize : 512);
4226 } else {
4227 mp->mnt_vfsstat.f_bsize = mp->mnt_devblocksize; /* default from the device block size */
4228 }
4229 if (VFSATTR_IS_SUPPORTED(&va, f_iosize)) {
4230 mp->mnt_vfsstat.f_iosize = va.f_iosize;
4231 } else {
4232 mp->mnt_vfsstat.f_iosize = 1024 * 1024; /* 1MB sensible I/O size */
4233 }
4234 if (VFSATTR_IS_SUPPORTED(&va, f_blocks))
4235 mp->mnt_vfsstat.f_blocks = va.f_blocks;
4236 if (VFSATTR_IS_SUPPORTED(&va, f_bfree))
4237 mp->mnt_vfsstat.f_bfree = va.f_bfree;
4238 if (VFSATTR_IS_SUPPORTED(&va, f_bavail))
4239 mp->mnt_vfsstat.f_bavail = va.f_bavail;
4240 if (VFSATTR_IS_SUPPORTED(&va, f_bused))
4241 mp->mnt_vfsstat.f_bused = va.f_bused;
4242 if (VFSATTR_IS_SUPPORTED(&va, f_files))
4243 mp->mnt_vfsstat.f_files = va.f_files;
4244 if (VFSATTR_IS_SUPPORTED(&va, f_ffree))
4245 mp->mnt_vfsstat.f_ffree = va.f_ffree;
4246
4247 /* this is unlikely to change, but has to be queried for */
4248 if (VFSATTR_IS_SUPPORTED(&va, f_fssubtype))
4249 mp->mnt_vfsstat.f_fssubtype = va.f_fssubtype;
4250
4251 return(0);
4252 }
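
/*
 * Illustrative sketch (annotation, not part of the original source):
 * how a filesystem's vfsop getattr might satisfy the request built
 * above.  Attributes it fills in are marked with VFSATTR_RETURN(),
 * which is what makes VFSATTR_IS_SUPPORTED() true in the unpacking
 * code; anything left unmarked falls back to the defaults chosen here
 * (device block size, 1MB I/O size).  The 'myfs_*' names and fields
 * are hypothetical.
 */
#if 0	/* example only */
static int
myfs_vfs_getattr(mount_t mp, struct vfs_attr *fsap, __unused vfs_context_t ctx)
{
	struct myfs_mount *mmp = vfs_fsprivate(mp);

	if (VFSATTR_IS_ACTIVE(fsap, f_blocks))
		VFSATTR_RETURN(fsap, f_blocks, mmp->myfs_total_blocks);
	if (VFSATTR_IS_ACTIVE(fsap, f_bfree))
		VFSATTR_RETURN(fsap, f_bfree, mmp->myfs_free_blocks);
	/* f_bsize, f_iosize etc. deliberately left unsupported */
	return (0);
}
#endif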
4253
4254 void
4255 mount_list_add(mount_t mp)
4256 {
4257 mount_list_lock();
4258 TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list);
4259 nummounts++;
4260 mount_list_unlock();
4261 }
4262
4263 void
4264 mount_list_remove(mount_t mp)
4265 {
4266 mount_list_lock();
4267 TAILQ_REMOVE(&mountlist, mp, mnt_list);
4268 nummounts--;
4269 mp->mnt_list.tqe_next = NULL;
4270 mp->mnt_list.tqe_prev = NULL;
4271 mount_list_unlock();
4272 }
4273
4274 #if CONFIG_VOLFS
4275 mount_t
4276 mount_lookupby_volfsid(int volfs_id, int withref)
4277 {
4278 mount_t cur_mount = (mount_t)0;
4279 mount_t mp;
4280
4281 mount_list_lock();
4282 TAILQ_FOREACH(mp, &mountlist, mnt_list) {
4283 if (!(mp->mnt_kern_flag & MNTK_UNMOUNT) &&
4284 (mp->mnt_kern_flag & MNTK_PATH_FROM_ID) &&
4285 (mp->mnt_vfsstat.f_fsid.val[0] == volfs_id)) {
4286 cur_mount = mp;
4287 if (withref) {
4288 if (mount_iterref(cur_mount, 1)) {
4289 cur_mount = (mount_t)0;
4290 mount_list_unlock();
4291 goto out;
4292 }
4293 }
4294 break;
4295 }
4296 }
4297 mount_list_unlock();
4298 if (withref && (cur_mount != (mount_t)0)) {
4299 mp = cur_mount;
4300 if (vfs_busy(mp, LK_NOWAIT) != 0) {
4301 cur_mount = (mount_t)0;
4302 }
4303 mount_iterdrop(mp);
4304 }
4305 out:
4306 return(cur_mount);
4307 }
4308 #endif
4309
4310
4311 mount_t
4312 mount_list_lookupby_fsid(fsid_t *fsid, int locked, int withref)
4313 {
4314 mount_t retmp = (mount_t)0;
4315 mount_t mp;
4316
4317 if (!locked)
4318 mount_list_lock();
4319 TAILQ_FOREACH(mp, &mountlist, mnt_list)
4320 if (mp->mnt_vfsstat.f_fsid.val[0] == fsid->val[0] &&
4321 mp->mnt_vfsstat.f_fsid.val[1] == fsid->val[1]) {
4322 retmp = mp;
4323 if (withref) {
4324 if (mount_iterref(retmp, 1))
4325 retmp = (mount_t)0;
4326 }
4327 goto out;
4328 }
4329 out:
4330 if (!locked)
4331 mount_list_unlock();
4332 return (retmp);
4333 }
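
/*
 * Illustrative sketch (annotation, not part of the original source):
 * pairing the lookup with mount_iterdrop(), as vfs_iterate() does
 * above.  The wrapper name is hypothetical; the iteration reference
 * keeps the mount from being reclaimed while it is examined.
 */
#if 0	/* example only */
static void
example_inspect_mount(fsid_t *fsid)
{
	mount_t mp;

	if ((mp = mount_list_lookupby_fsid(fsid, 0 /* !locked */, 1 /* withref */)) != NULL) {
		/* ... inspect mp ... */
		mount_iterdrop(mp);
	}
}
#endif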
4334
4335 errno_t
4336 vnode_lookup(const char *path, int flags, vnode_t *vpp, vfs_context_t ctx)
4337 {
4338 struct nameidata nd;
4339 int error;
4340 u_long ndflags = 0;
4341
4342 if (ctx == NULL) { /* XXX technically an error */
4343 ctx = vfs_context_current();
4344 }
4345
4346 if (flags & VNODE_LOOKUP_NOFOLLOW)
4347 ndflags = NOFOLLOW;
4348 else
4349 ndflags = FOLLOW;
4350
4351 if (flags & VNODE_LOOKUP_NOCROSSMOUNT)
4352 ndflags |= NOCROSSMOUNT;
4353 if (flags & VNODE_LOOKUP_DOWHITEOUT)
4354 ndflags |= DOWHITEOUT;
4355
4356 /* XXX AUDITVNPATH1 needed ? */
4357 NDINIT(&nd, LOOKUP, ndflags, UIO_SYSSPACE, CAST_USER_ADDR_T(path), ctx);
4358
4359 if ((error = namei(&nd)))
4360 return (error);
4361 *vpp = nd.ni_vp;
4362 nameidone(&nd);
4363
4364 return (0);
4365 }
4366
4367 errno_t
4368 vnode_open(const char *path, int fmode, int cmode, int flags, vnode_t *vpp, vfs_context_t ctx)
4369 {
4370 struct nameidata nd;
4371 int error;
4372 u_long ndflags = 0;
4373 int lflags = flags;
4374
4375 if (ctx == NULL) { /* XXX technically an error */
4376 ctx = vfs_context_current();
4377 }
4378
4379 if (fmode & O_NOFOLLOW)
4380 lflags |= VNODE_LOOKUP_NOFOLLOW;
4381
4382 if (lflags & VNODE_LOOKUP_NOFOLLOW)
4383 ndflags = NOFOLLOW;
4384 else
4385 ndflags = FOLLOW;
4386
4387 if (lflags & VNODE_LOOKUP_NOCROSSMOUNT)
4388 ndflags |= NOCROSSMOUNT;
4389 if (lflags & VNODE_LOOKUP_DOWHITEOUT)
4390 ndflags |= DOWHITEOUT;
4391
4392 /* XXX AUDITVNPATH1 needed ? */
4393 NDINIT(&nd, LOOKUP, ndflags, UIO_SYSSPACE, CAST_USER_ADDR_T(path), ctx);
4394
4395 if ((error = vn_open(&nd, fmode, cmode)))
4396 *vpp = NULL;
4397 else
4398 *vpp = nd.ni_vp;
4399
4400 return (error);
4401 }
4402
4403 errno_t
4404 vnode_close(vnode_t vp, int flags, vfs_context_t ctx)
4405 {
4406 int error;
4407
4408 if (ctx == NULL) {
4409 ctx = vfs_context_current();
4410 }
4411
4412 error = vn_close(vp, flags, ctx);
4413 vnode_put(vp);
4414 return (error);
4415 }
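
/*
 * Illustrative sketch (annotation, not part of the original source):
 * a typical in-kernel open/close pairing using the KPI above.  The
 * path and wrapper name are hypothetical.  vnode_open() returns the
 * vnode with both an iocount and an open count; vnode_close() undoes
 * both (vn_close plus vnode_put).
 */
#if 0	/* example only */
static int
example_open_close(void)
{
	vfs_context_t ctx = vfs_context_current();
	vnode_t vp;
	int error;

	error = vnode_open("/var/tmp/example", FREAD, 0 /* cmode */, 0 /* flags */, &vp, ctx);
	if (error == 0) {
		/* ... read from vp ... */
		error = vnode_close(vp, FREAD, ctx);
	}
	return (error);
}
#endif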
4416
4417 /*
4418 * Returns: 0 Success
4419 * vnode_getattr:???
4420 */
4421 errno_t
4422 vnode_size(vnode_t vp, off_t *sizep, vfs_context_t ctx)
4423 {
4424 struct vnode_attr va;
4425 int error;
4426
4427 VATTR_INIT(&va);
4428 VATTR_WANTED(&va, va_data_size);
4429 error = vnode_getattr(vp, &va, ctx);
4430 if (!error)
4431 *sizep = va.va_data_size;
4432 return(error);
4433 }
4434
4435 errno_t
4436 vnode_setsize(vnode_t vp, off_t size, int ioflag, vfs_context_t ctx)
4437 {
4438 struct vnode_attr va;
4439
4440 VATTR_INIT(&va);
4441 VATTR_SET(&va, va_data_size, size);
4442 va.va_vaflags = ioflag & 0xffff;
4443 return(vnode_setattr(vp, &va, ctx));
4444 }
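
/*
 * Illustrative sketch (annotation, not part of the original source):
 * using the two helpers above to shrink an oversized file.  The
 * wrapper name and 4096-byte limit are hypothetical; 'vp' is assumed
 * to carry an iocount (e.g. from vnode_open()).
 */
#if 0	/* example only */
static int
example_trim(vnode_t vp, vfs_context_t ctx)
{
	off_t cursize;
	int error;

	if ((error = vnode_size(vp, &cursize, ctx)) != 0)
		return (error);
	if (cursize > 4096)
		error = vnode_setsize(vp, 4096, 0 /* no ioflags */, ctx);
	return (error);
}
#endif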
4445
4446 /*
4447 * Create a filesystem object of arbitrary type with arbitrary attributes in
4448 * the specified directory with the specified name.
4449 *
4450 * Parameters: dvp Pointer to the vnode of the directory
4451 * in which to create the object.
4452 * vpp Pointer to the area into which to
4453 * return the vnode of the created object.
4454 * cnp Component name pointer from the namei
4455 * data structure, containing the name to
4456 * use for the create object.
4457 * vap Pointer to the vnode_attr structure
4458 * describing the object to be created,
4459 * including the type of object.
4460 * flags VN_* flags controlling ACL inheritance
4461 * and whether or not authorization is to
4462 * be required for the operation.
4463 *
4464 * Returns: 0 Success
4465 * !0 errno value
4466 *
4467 * Implicit: *vpp Contains the vnode of the object that
4468 * was created, if successful.
4469 * *cnp May be modified by the underlying VFS.
4470 * *vap May be modified by the underlying VFS;
4471 * in particular, va_acl may be
4472 * modified by either ACL inheritance or
4473 * attribute defaulting, and the
4474 * resulting attributes may
4475 * be modified, even if the operation is
4476 * unsuccessful.
4477 * Notes: The kauth_filesec_t in 'vap', if any, is in host byte order.
4478 *
4479 * Modification of '*cnp' and '*vap' by the underlying VFS is
4480 * strongly discouraged.
4481 *
4482 * XXX: This function is a 'vn_*' function; it belongs in vfs_vnops.c
4483 *
4484 * XXX: We should enumerate the possible errno values here, and where
4485 * in the code they originated.
4486 */
4487 errno_t
4488 vn_create(vnode_t dvp, vnode_t *vpp, struct componentname *cnp, struct vnode_attr *vap, int flags, vfs_context_t ctx)
4489 {
4490 kauth_acl_t oacl, nacl;
4491 int initial_acl;
4492 errno_t error;
4493 vnode_t vp = (vnode_t)0;
4494
4495 error = 0;
4496 oacl = nacl = NULL;
4497 initial_acl = 0;
4498
4499 KAUTH_DEBUG("%p CREATE - '%s'", dvp, cnp->cn_nameptr);
4500
4501 /*
4502 * Handle ACL inheritance.
4503 */
4504 if (!(flags & VN_CREATE_NOINHERIT) && vfs_extendedsecurity(dvp->v_mount)) {
4505 /* save the original filesec */
4506 if (VATTR_IS_ACTIVE(vap, va_acl)) {
4507 initial_acl = 1;
4508 oacl = vap->va_acl;
4509 }
4510
4511 vap->va_acl = NULL;
4512 if ((error = kauth_acl_inherit(dvp,
4513 oacl,
4514 &nacl,
4515 vap->va_type == VDIR,
4516 ctx)) != 0) {
4517 KAUTH_DEBUG("%p CREATE - error %d processing inheritance", dvp, error);
4518 return(error);
4519 }
4520
4521 /*
4522 * If the generated ACL is NULL, then we can save ourselves some effort
4523 * by clearing the active bit.
4524 */
4525 if (nacl == NULL) {
4526 VATTR_CLEAR_ACTIVE(vap, va_acl);
4527 } else {
4528 VATTR_SET(vap, va_acl, nacl);
4529 }
4530 }
4531
4532 /*
4533 * Check and default new attributes.
4534 * This will set va_uid, va_gid, va_mode and va_create_time at least, if the caller
4535 * hasn't supplied them.
4536 */
4537 if ((error = vnode_authattr_new(dvp, vap, flags & VN_CREATE_NOAUTH, ctx)) != 0) {
4538 KAUTH_DEBUG("%p CREATE - error %d handing/defaulting attributes", dvp, error);
4539 goto out;
4540 }
4541
4542
4543 /*
4544 * Create the requested node.
4545 */
4546 switch(vap->va_type) {
4547 case VREG:
4548 error = VNOP_CREATE(dvp, vpp, cnp, vap, ctx);
4549 break;
4550 case VDIR:
4551 error = VNOP_MKDIR(dvp, vpp, cnp, vap, ctx);
4552 break;
4553 case VSOCK:
4554 case VFIFO:
4555 case VBLK:
4556 case VCHR:
4557 error = VNOP_MKNOD(dvp, vpp, cnp, vap, ctx);
4558 break;
4559 default:
4560 panic("vnode_create: unknown vtype %d", vap->va_type);
4561 }
4562 if (error != 0) {
4563 KAUTH_DEBUG("%p CREATE - error %d returned by filesystem", dvp, error);
4564 goto out;
4565 }
4566
4567 vp = *vpp;
4568 #if CONFIG_MACF
4569 if (!(flags & VN_CREATE_NOLABEL)) {
4570 error = vnode_label(vnode_mount(vp), dvp, vp, cnp,
4571 VNODE_LABEL_CREATE|VNODE_LABEL_NEEDREF, ctx);
4572 if (error)
4573 goto error;
4574 }
4575 #endif
4576
4577 /*
4578 * If some of the requested attributes weren't handled by the VNOP,
4579 * use our fallback code.
4580 */
4581 if (!VATTR_ALL_SUPPORTED(vap) && *vpp) {
4582 KAUTH_DEBUG(" CREATE - doing fallback with ACL %p", vap->va_acl);
4583 error = vnode_setattr_fallback(*vpp, vap, ctx);
4584 }
4585 #if CONFIG_MACF
4586 error:
4587 #endif
4588 if ((error != 0 ) && (vp != (vnode_t)0)) {
4589 *vpp = (vnode_t) 0;
4590 vnode_put(vp);
4591 }
4592
4593 out:
4594 /*
4595 * If the caller supplied a filesec in vap, it has been replaced
4596 * now by the post-inheritance copy. We need to put the original back
4597 * and free the inherited product.
4598 */
4599 if (initial_acl) {
4600 VATTR_SET(vap, va_acl, oacl);
4601 } else {
4602 VATTR_CLEAR_ACTIVE(vap, va_acl);
4603 }
4604 if (nacl != NULL)
4605 kauth_acl_free(nacl);
4606
4607 return(error);
4608 }
4609
4610 static kauth_scope_t vnode_scope;
4611 static int vnode_authorize_callback(kauth_cred_t credential, void *idata, kauth_action_t action,
4612 uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3);
4613 static int vnode_authorize_callback_int(__unused kauth_cred_t credential, __unused void *idata, kauth_action_t action,
4614 uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3);
4615
4616 typedef struct _vnode_authorize_context {
4617 vnode_t vp;
4618 struct vnode_attr *vap;
4619 vnode_t dvp;
4620 struct vnode_attr *dvap;
4621 vfs_context_t ctx;
4622 int flags;
4623 int flags_valid;
4624 #define _VAC_IS_OWNER (1<<0)
4625 #define _VAC_IN_GROUP (1<<1)
4626 #define _VAC_IS_DIR_OWNER (1<<2)
4627 #define _VAC_IN_DIR_GROUP (1<<3)
4628 } *vauth_ctx;
4629
4630 void
4631 vnode_authorize_init(void)
4632 {
4633 vnode_scope = kauth_register_scope(KAUTH_SCOPE_VNODE, vnode_authorize_callback, NULL);
4634 }
4635
4636 /*
4637 * Authorize an operation on a vnode.
4638 *
4639 * This is KPI, but here because it needs vnode_scope.
4640 *
4641 * Returns: 0 Success
4642 * kauth_authorize_action:EPERM ...
4643 * xlate => EACCES Permission denied
4644 * kauth_authorize_action:0 Success
4645 * kauth_authorize_action: Depends on callback return; this is
4646 * usually only vnode_authorize_callback(),
4647 * but may include other listeners, if any
4648 * exist.
4649 * EROFS
4650 * EACCES
4651 * EPERM
4652 * ???
4653 */
4654 int
4655 vnode_authorize(vnode_t vp, vnode_t dvp, kauth_action_t action, vfs_context_t ctx)
4656 {
4657 int error, result;
4658
4659 /*
4660 * We can't authorize against a dead vnode; allow all operations through so that
4661 * the correct error can be returned.
4662 */
4663 if (vp->v_type == VBAD)
4664 return(0);
4665
4666 error = 0;
4667 result = kauth_authorize_action(vnode_scope, vfs_context_ucred(ctx), action,
4668 (uintptr_t)ctx, (uintptr_t)vp, (uintptr_t)dvp, (uintptr_t)&error);
4669 if (result == EPERM) /* traditional behaviour */
4670 result = EACCES;
4671 /* did the lower layers give a better error return? */
4672 if ((result != 0) && (error != 0))
4673 return(error);
4674 return(result);
4675 }
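
/*
 * Illustrative sketch (annotation, not part of the original source):
 * a caller asking for write authorization before modifying file data.
 * The helper name is hypothetical; KAUTH_VNODE_WRITE_DATA is one of
 * the rights evaluated by vnode_authorize_callback() below, and a
 * denial surfaces as EACCES (or EROFS/EPERM, per the comment above).
 */
#if 0	/* example only */
static int
example_check_write(vnode_t vp, vfs_context_t ctx)
{
	/* no parent directory is involved, so dvp is NULLVP */
	return (vnode_authorize(vp, NULLVP, KAUTH_VNODE_WRITE_DATA, ctx));
}
#endif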
4676
4677 /*
4678 * Test for vnode immutability.
4679 *
4680 * The 'append' flag is set when the authorization request is constrained
4681 * to operations which only request the right to append to a file.
4682 *
4683 * The 'ignore' flag is set when an operation modifying the immutability flags
4684 * is being authorized. We check the system securelevel to determine which
4685 * immutability flags we can ignore.
4686 */
4687 static int
4688 vnode_immutable(struct vnode_attr *vap, int append, int ignore)
4689 {
4690 int mask;
4691
4692 /* start with all bits precluding the operation */
4693 mask = IMMUTABLE | APPEND;
4694
4695 /* if appending only, remove the append-only bits */
4696 if (append)
4697 mask &= ~APPEND;
4698
4699 /* ignore only set when authorizing flags changes */
4700 if (ignore) {
4701 if (securelevel <= 0) {
4702 /* in insecure state, flags do not inhibit changes */
4703 mask = 0;
4704 } else {
4705 /* in secure state, user flags don't inhibit */
4706 mask &= ~(UF_IMMUTABLE | UF_APPEND);
4707 }
4708 }
4709 KAUTH_DEBUG("IMMUTABLE - file flags 0x%x mask 0x%x append = %d ignore = %d", vap->va_flags, mask, append, ignore);
4710 if ((vap->va_flags & mask) != 0)
4711 return(EPERM);
4712 return(0);
4713 }
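
/*
 * Worked example (annotation, not part of the original source): for a
 * file with UF_APPEND set, an append-only request arrives with
 * append == 1, so APPEND is dropped from the mask and only IMMUTABLE
 * remains; (va_flags & IMMUTABLE) == 0, so the operation is permitted.
 * The same request with append == 0 keeps APPEND in the mask, the
 * UF_APPEND bit matches, and the result is EPERM.
 */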
4714
4715 static int
4716 vauth_node_owner(struct vnode_attr *vap, kauth_cred_t cred)
4717 {
4718 int result;
4719
4720 /* default assumption is not-owner */
4721 result = 0;
4722
4723 /*
4724 * If the filesystem has given us a UID, we treat this as authoritative.
4725 */
4726 if (vap && VATTR_IS_SUPPORTED(vap, va_uid)) {
4727 result = (vap->va_uid == kauth_cred_getuid(cred)) ? 1 : 0;
4728 }
4729 /* we could test the owner UUID here if we had a policy for it */
4730
4731 return(result);
4732 }
4733
4734 static int
4735 vauth_node_group(struct vnode_attr *vap, kauth_cred_t cred, int *ismember)
4736 {
4737 int error;
4738 int result;
4739
4740 error = 0;
4741 result = 0;
4742
4743 /* the caller is expected to have asked the filesystem for a group at some point */
4744 if (vap && VATTR_IS_SUPPORTED(vap, va_gid)) {
4745 error = kauth_cred_ismember_gid(cred, vap->va_gid, &result);
4746 }
4747 /* we could test the group UUID here if we had a policy for it */
4748
4749 if (!error)
4750 *ismember = result;
4751 return(error);
4752 }
4753
4754 static int
4755 vauth_file_owner(vauth_ctx vcp)
4756 {
4757 int result;
4758
4759 if (vcp->flags_valid & _VAC_IS_OWNER) {
4760 result = (vcp->flags & _VAC_IS_OWNER) ? 1 : 0;
4761 } else {
4762 result = vauth_node_owner(vcp->vap, vcp->ctx->vc_ucred);
4763
4764 /* cache our result */
4765 vcp->flags_valid |= _VAC_IS_OWNER;
4766 if (result) {
4767 vcp->flags |= _VAC_IS_OWNER;
4768 } else {
4769 vcp->flags &= ~_VAC_IS_OWNER;
4770 }
4771 }
4772 return(result);
4773 }
4774
4775 static int
4776 vauth_file_ingroup(vauth_ctx vcp, int *ismember)
4777 {
4778 int error;
4779
4780 if (vcp->flags_valid & _VAC_IN_GROUP) {
4781 *ismember = (vcp->flags & _VAC_IN_GROUP) ? 1 : 0;
4782 error = 0;
4783 } else {
4784 error = vauth_node_group(vcp->vap, vcp->ctx->vc_ucred, ismember);
4785
4786 if (!error) {
4787 /* cache our result */
4788 vcp->flags_valid |= _VAC_IN_GROUP;
4789 if (*ismember) {
4790 vcp->flags |= _VAC_IN_GROUP;
4791 } else {
4792 vcp->flags &= ~_VAC_IN_GROUP;
4793 }
4794 }
4795
4796 }
4797 return(error);
4798 }
4799
4800 static int
4801 vauth_dir_owner(vauth_ctx vcp)
4802 {
4803 int result;
4804
4805 if (vcp->flags_valid & _VAC_IS_DIR_OWNER) {
4806 result = (vcp->flags & _VAC_IS_DIR_OWNER) ? 1 : 0;
4807 } else {
4808 result = vauth_node_owner(vcp->dvap, vcp->ctx->vc_ucred);
4809
4810 /* cache our result */
4811 vcp->flags_valid |= _VAC_IS_DIR_OWNER;
4812 if (result) {
4813 vcp->flags |= _VAC_IS_DIR_OWNER;
4814 } else {
4815 vcp->flags &= ~_VAC_IS_DIR_OWNER;
4816 }
4817 }
4818 return(result);
4819 }
4820
4821 static int
4822 vauth_dir_ingroup(vauth_ctx vcp, int *ismember)
4823 {
4824 int error;
4825
4826 if (vcp->flags_valid & _VAC_IN_DIR_GROUP) {
4827 *ismember = (vcp->flags & _VAC_IN_DIR_GROUP) ? 1 : 0;
4828 error = 0;
4829 } else {
4830 error = vauth_node_group(vcp->dvap, vcp->ctx->vc_ucred, ismember);
4831
4832 if (!error) {
4833 /* cache our result */
4834 vcp->flags_valid |= _VAC_IN_DIR_GROUP;
4835 if (*ismember) {
4836 vcp->flags |= _VAC_IN_DIR_GROUP;
4837 } else {
4838 vcp->flags &= ~_VAC_IN_DIR_GROUP;
4839 }
4840 }
4841 }
4842 return(error);
4843 }
4844
4845 /*
4846 * Test the posix permissions in (vap) to determine whether (credential)
4847 * may perform (action)
4848 */
4849 static int
4850 vnode_authorize_posix(vauth_ctx vcp, int action, int on_dir)
4851 {
4852 struct vnode_attr *vap;
4853 int needed, error, owner_ok, group_ok, world_ok, ismember;
4854 #ifdef KAUTH_DEBUG_ENABLE
4855 const char *where = "uninitialized";
4856 # define _SETWHERE(c) where = c;
4857 #else
4858 # define _SETWHERE(c)
4859 #endif
4860
4861 /* checking file or directory? */
4862 if (on_dir) {
4863 vap = vcp->dvap;
4864 } else {
4865 vap = vcp->vap;
4866 }
4867
4868 error = 0;
4869
4870 /*
4871 * We want to do as little work here as possible. So first we check
4872 * which sets of permissions grant us the access we need, and avoid checking
4873 * whether specific permissions grant access when more generic ones would.
4874 */
4875
4876 /* owner permissions */
4877 needed = 0;
4878 if (action & VREAD)
4879 needed |= S_IRUSR;
4880 if (action & VWRITE)
4881 needed |= S_IWUSR;
4882 if (action & VEXEC)
4883 needed |= S_IXUSR;
4884 owner_ok = (needed & vap->va_mode) == needed;
4885
4886 /* group permissions */
4887 needed = 0;
4888 if (action & VREAD)
4889 needed |= S_IRGRP;
4890 if (action & VWRITE)
4891 needed |= S_IWGRP;
4892 if (action & VEXEC)
4893 needed |= S_IXGRP;
4894 group_ok = (needed & vap->va_mode) == needed;
4895
4896 /* world permissions */
4897 needed = 0;
4898 if (action & VREAD)
4899 needed |= S_IROTH;
4900 if (action & VWRITE)
4901 needed |= S_IWOTH;
4902 if (action & VEXEC)
4903 needed |= S_IXOTH;
4904 world_ok = (needed & vap->va_mode) == needed;
4905
4906 /* If granted/denied by all three, we're done */
4907 if (owner_ok && group_ok && world_ok) {
4908 _SETWHERE("all");
4909 goto out;
4910 }
4911 if (!owner_ok && !group_ok && !world_ok) {
4912 _SETWHERE("all");
4913 error = EACCES;
4914 goto out;
4915 }
4916
4917 /* Check ownership (relatively cheap) */
4918 if ((on_dir && vauth_dir_owner(vcp)) ||
4919 (!on_dir && vauth_file_owner(vcp))) {
4920 _SETWHERE("user");
4921 if (!owner_ok)
4922 error = EACCES;
4923 goto out;
4924 }
4925
4926 /* Not owner; if group and world both grant it we're done */
4927 if (group_ok && world_ok) {
4928 _SETWHERE("group/world");
4929 goto out;
4930 }
4931 if (!group_ok && !world_ok) {
4932 _SETWHERE("group/world");
4933 error = EACCES;
4934 goto out;
4935 }
4936
4937 /* Check group membership (most expensive) */
4938 ismember = 0;
4939 if (on_dir) {
4940 error = vauth_dir_ingroup(vcp, &ismember);
4941 } else {
4942 error = vauth_file_ingroup(vcp, &ismember);
4943 }
4944 if (error)
4945 goto out;
4946 if (ismember) {
4947 _SETWHERE("group");
4948 if (!group_ok)
4949 error = EACCES;
4950 goto out;
4951 }
4952
4953 /* Not owner, not in group, use world result */
4954 _SETWHERE("world");
4955 if (!world_ok)
4956 error = EACCES;
4957
4958 /* FALLTHROUGH */
4959
4960 out:
4961 KAUTH_DEBUG("%p %s - posix %s permissions : need %s%s%s %x have %s%s%s%s%s%s%s%s%s UID = %d file = %d,%d",
4962 vcp->vp, (error == 0) ? "ALLOWED" : "DENIED", where,
4963 (action & VREAD) ? "r" : "-",
4964 (action & VWRITE) ? "w" : "-",
4965 (action & VEXEC) ? "x" : "-",
4966 needed,
4967 (vap->va_mode & S_IRUSR) ? "r" : "-",
4968 (vap->va_mode & S_IWUSR) ? "w" : "-",
4969 (vap->va_mode & S_IXUSR) ? "x" : "-",
4970 (vap->va_mode & S_IRGRP) ? "r" : "-",
4971 (vap->va_mode & S_IWGRP) ? "w" : "-",
4972 (vap->va_mode & S_IXGRP) ? "x" : "-",
4973 (vap->va_mode & S_IROTH) ? "r" : "-",
4974 (vap->va_mode & S_IWOTH) ? "w" : "-",
4975 (vap->va_mode & S_IXOTH) ? "x" : "-",
4976 kauth_cred_getuid(vcp->ctx->vc_ucred),
4977 on_dir ? vcp->dvap->va_uid : vcp->vap->va_uid,
4978 on_dir ? vcp->dvap->va_gid : vcp->vap->va_gid);
4979 return(error);
4980 }
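
/*
 * Worked example (annotation, not part of the original source):
 * va_mode = 0754, caller is neither the owner nor a known group
 * member, action = VWRITE.  owner_ok = 1 (7 grants w), group_ok = 0
 * (5 lacks w), world_ok = 0 (4 lacks w).  The ownership test fails,
 * and since group and world agree in denying, EACCES is returned
 * without ever paying for the group-membership lookup.
 */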
4981
4982 /*
4983 * Authorize the deletion of the node vp from the directory dvp.
4984 *
4985 * We assume that:
4986 * - Neither the node nor the directory are immutable.
4987 * - The user is not the superuser.
4988 *
4989 * Deletion is not permitted if the directory is sticky and the caller is
4990 * not owner of the node or directory.
4991 *
4992 * If either the node grants DELETE, or the directory grants DELETE_CHILD,
4993 * the node may be deleted. If neither denies the permission, and the
4994 * caller has Posix write access to the directory, then the node may be
4995 * deleted.
4996 *
4997 * As an optimization, we cache whether or not delete child is permitted
4998 * on directories without the sticky bit set.
4999 */
5000 int
5001 vnode_authorize_delete(vauth_ctx vcp, boolean_t cached_delete_child);
5002 /*static*/ int
5003 vnode_authorize_delete(vauth_ctx vcp, boolean_t cached_delete_child)
5004 {
5005 struct vnode_attr *vap = vcp->vap;
5006 struct vnode_attr *dvap = vcp->dvap;
5007 kauth_cred_t cred = vcp->ctx->vc_ucred;
5008 struct kauth_acl_eval eval;
5009 int error, delete_denied, delete_child_denied, ismember;
5010
5011 /* check the ACL on the directory */
5012 delete_child_denied = 0;
5013 if (!cached_delete_child && VATTR_IS_NOT(dvap, va_acl, NULL)) {
5014 eval.ae_requested = KAUTH_VNODE_DELETE_CHILD;
5015 eval.ae_acl = &dvap->va_acl->acl_ace[0];
5016 eval.ae_count = dvap->va_acl->acl_entrycount;
5017 eval.ae_options = 0;
5018 if (vauth_dir_owner(vcp))
5019 eval.ae_options |= KAUTH_AEVAL_IS_OWNER;
5020 if ((error = vauth_dir_ingroup(vcp, &ismember)) != 0)
5021 return(error);
5022 if (ismember)
5023 eval.ae_options |= KAUTH_AEVAL_IN_GROUP;
5024 eval.ae_exp_gall = KAUTH_VNODE_GENERIC_ALL_BITS;
5025 eval.ae_exp_gread = KAUTH_VNODE_GENERIC_READ_BITS;
5026 eval.ae_exp_gwrite = KAUTH_VNODE_GENERIC_WRITE_BITS;
5027 eval.ae_exp_gexec = KAUTH_VNODE_GENERIC_EXECUTE_BITS;
5028
5029 error = kauth_acl_evaluate(cred, &eval);
5030
5031 if (error != 0) {
5032 KAUTH_DEBUG("%p ERROR during ACL processing - %d", vcp->vp, error);
5033 return(error);
5034 }
5035 if (eval.ae_result == KAUTH_RESULT_DENY)
5036 delete_child_denied = 1;
5037 if (eval.ae_result == KAUTH_RESULT_ALLOW) {
5038 KAUTH_DEBUG("%p ALLOWED - granted by directory ACL", vcp->vp);
5039 return(0);
5040 }
5041 }
5042
5043 /* check the ACL on the node */
5044 delete_denied = 0;
5045 if (VATTR_IS_NOT(vap, va_acl, NULL)) {
5046 eval.ae_requested = KAUTH_VNODE_DELETE;
5047 eval.ae_acl = &vap->va_acl->acl_ace[0];
5048 eval.ae_count = vap->va_acl->acl_entrycount;
5049 eval.ae_options = 0;
5050 if (vauth_file_owner(vcp))
5051 eval.ae_options |= KAUTH_AEVAL_IS_OWNER;
5052 if ((error = vauth_file_ingroup(vcp, &ismember)) != 0)
5053 return(error);
5054 if (ismember)
5055 eval.ae_options |= KAUTH_AEVAL_IN_GROUP;
5056 eval.ae_exp_gall = KAUTH_VNODE_GENERIC_ALL_BITS;
5057 eval.ae_exp_gread = KAUTH_VNODE_GENERIC_READ_BITS;
5058 eval.ae_exp_gwrite = KAUTH_VNODE_GENERIC_WRITE_BITS;
5059 eval.ae_exp_gexec = KAUTH_VNODE_GENERIC_EXECUTE_BITS;
5060
5061 if ((error = kauth_acl_evaluate(cred, &eval)) != 0) {
5062 KAUTH_DEBUG("%p ERROR during ACL processing - %d", vcp->vp, error);
5063 return(error);
5064 }
5065 if (eval.ae_result == KAUTH_RESULT_DENY)
5066 delete_denied = 1;
5067 if (eval.ae_result == KAUTH_RESULT_ALLOW) {
5068 KAUTH_DEBUG("%p ALLOWED - granted by file ACL", vcp->vp);
5069 return(0);
5070 }
5071 }
5072
5073 /* if denied by ACL on directory or node, return denial */
5074 if (delete_denied || delete_child_denied) {
5075 KAUTH_DEBUG("%p ALLOWED - denied by ACL", vcp->vp);
5076 return(EACCES);
5077 }
5078
5079 /*
5080 * enforce sticky bit behaviour; the cached_delete_child property will
5081 * be false and the dvap contents valid for sticky bit directories;
5082 * this makes us check the directory each time, but it's unavoidable,
5083 * as sticky bit is an exception to caching.
5084 */
5085 if (!cached_delete_child && (dvap->va_mode & S_ISTXT) && !vauth_file_owner(vcp) && !vauth_dir_owner(vcp)) {
5086 KAUTH_DEBUG("%p DENIED - sticky bit rules (user %d file %d dir %d)",
5087 vcp->vp, cred->cr_uid, vap->va_uid, dvap->va_uid);
5088 return(EACCES);
5089 }
5090
5091 /* check the directory */
5092 if (!cached_delete_child && (error = vnode_authorize_posix(vcp, VWRITE, 1 /* on_dir */)) != 0) {
5093 KAUTH_DEBUG("%p ALLOWED - granted by posix permisssions", vcp->vp);
5094 return(error);
5095 }
5096
5097 /* not denied, must be OK */
5098 return(0);
5099 }
5100
5101
5102 /*
5103 * Authorize an operation based on the node's attributes.
5104 */
5105 static int
5106 vnode_authorize_simple(vauth_ctx vcp, kauth_ace_rights_t acl_rights, kauth_ace_rights_t preauth_rights, boolean_t *found_deny)
5107 {
5108 struct vnode_attr *vap = vcp->vap;
5109 kauth_cred_t cred = vcp->ctx->vc_ucred;
5110 struct kauth_acl_eval eval;
5111 int error, ismember;
5112 mode_t posix_action;
5113
5114 /*
5115 * If we are the file owner, we automatically have some rights.
5116 *
5117 * Do we need to expand this to support group ownership?
5118 */
5119 if (vauth_file_owner(vcp))
5120 acl_rights &= ~(KAUTH_VNODE_WRITE_SECURITY);
5121
5122 /*
5123 * If we are checking both TAKE_OWNERSHIP and WRITE_SECURITY, we can
5124 * mask the latter. If TAKE_OWNERSHIP is requested the caller is about to
5125 * change ownership to themselves, and WRITE_SECURITY is implicitly
5126 * granted to the owner. We need to do this because at this point
5127 * WRITE_SECURITY may not be granted as the caller is not currently
5128 * the owner.
5129 */
5130 if ((acl_rights & KAUTH_VNODE_TAKE_OWNERSHIP) &&
5131 (acl_rights & KAUTH_VNODE_WRITE_SECURITY))
5132 acl_rights &= ~KAUTH_VNODE_WRITE_SECURITY;
5133
5134 if (acl_rights == 0) {
5135 KAUTH_DEBUG("%p ALLOWED - implicit or no rights required", vcp->vp);
5136 return(0);
5137 }
5138
5139 /* if we have an ACL, evaluate it */
5140 if (VATTR_IS_NOT(vap, va_acl, NULL)) {
5141 eval.ae_requested = acl_rights;
5142 eval.ae_acl = &vap->va_acl->acl_ace[0];
5143 eval.ae_count = vap->va_acl->acl_entrycount;
5144 eval.ae_options = 0;
5145 if (vauth_file_owner(vcp))
5146 eval.ae_options |= KAUTH_AEVAL_IS_OWNER;
5147 if ((error = vauth_file_ingroup(vcp, &ismember)) != 0)
5148 return(error);
5149 if (ismember)
5150 eval.ae_options |= KAUTH_AEVAL_IN_GROUP;
5151 eval.ae_exp_gall = KAUTH_VNODE_GENERIC_ALL_BITS;
5152 eval.ae_exp_gread = KAUTH_VNODE_GENERIC_READ_BITS;
5153 eval.ae_exp_gwrite = KAUTH_VNODE_GENERIC_WRITE_BITS;
5154 eval.ae_exp_gexec = KAUTH_VNODE_GENERIC_EXECUTE_BITS;
5155
5156 if ((error = kauth_acl_evaluate(cred, &eval)) != 0) {
5157 KAUTH_DEBUG("%p ERROR during ACL processing - %d", vcp->vp, error);
5158 return(error);
5159 }
5160
5161 if (eval.ae_result == KAUTH_RESULT_DENY) {
5162 KAUTH_DEBUG("%p DENIED - by ACL", vcp->vp);
5163 return(EACCES); /* deny, deny, counter-allege */
5164 }
5165 if (eval.ae_result == KAUTH_RESULT_ALLOW) {
5166 KAUTH_DEBUG("%p ALLOWED - all rights granted by ACL", vcp->vp);
5167 return(0);
5168 }
5169 *found_deny = eval.ae_found_deny;
5170
5171 /* fall through and evaluate residual rights */
5172 } else {
5173 /* no ACL, everything is residual */
5174 eval.ae_residual = acl_rights;
5175 }
5176
5177 /*
5178 * Grant residual rights that have been pre-authorized.
5179 */
5180 eval.ae_residual &= ~preauth_rights;
5181
5182 /*
5183 * We grant WRITE_ATTRIBUTES to the owner if it hasn't been denied.
5184 */
5185 if (vauth_file_owner(vcp))
5186 eval.ae_residual &= ~KAUTH_VNODE_WRITE_ATTRIBUTES;
5187
5188 if (eval.ae_residual == 0) {
5189 KAUTH_DEBUG("%p ALLOWED - rights already authorized", vcp->vp);
5190 return(0);
5191 }
5192
5193 /*
5194 * Bail if we have residual rights that can't be granted by posix permissions,
5195 * or aren't presumed granted at this point.
5196 *
5197 * XXX these can be collapsed for performance
5198 */
5199 if (eval.ae_residual & KAUTH_VNODE_CHANGE_OWNER) {
5200 KAUTH_DEBUG("%p DENIED - CHANGE_OWNER not permitted", vcp->vp);
5201 return(EACCES);
5202 }
5203 if (eval.ae_residual & KAUTH_VNODE_WRITE_SECURITY) {
5204 KAUTH_DEBUG("%p DENIED - WRITE_SECURITY not permitted", vcp->vp);
5205 return(EACCES);
5206 }
5207
5208 #if DIAGNOSTIC
5209 if (eval.ae_residual & KAUTH_VNODE_DELETE)
5210 panic("vnode_authorize: can't be checking delete permission here");
5211 #endif
5212
5213 /*
5214 * Compute the fallback posix permissions that will satisfy the remaining
5215 * rights.
5216 */
5217 posix_action = 0;
5218 if (eval.ae_residual & (KAUTH_VNODE_READ_DATA |
5219 KAUTH_VNODE_LIST_DIRECTORY |
5220 KAUTH_VNODE_READ_EXTATTRIBUTES))
5221 posix_action |= VREAD;
5222 if (eval.ae_residual & (KAUTH_VNODE_WRITE_DATA |
5223 KAUTH_VNODE_ADD_FILE |
5224 KAUTH_VNODE_ADD_SUBDIRECTORY |
5225 KAUTH_VNODE_DELETE_CHILD |
5226 KAUTH_VNODE_WRITE_ATTRIBUTES |
5227 KAUTH_VNODE_WRITE_EXTATTRIBUTES))
5228 posix_action |= VWRITE;
5229 if (eval.ae_residual & (KAUTH_VNODE_EXECUTE |
5230 KAUTH_VNODE_SEARCH))
5231 posix_action |= VEXEC;
5232
5233 if (posix_action != 0) {
5234 return(vnode_authorize_posix(vcp, posix_action, 0 /* !on_dir */));
5235 } else {
5236 KAUTH_DEBUG("%p ALLOWED - residual rights %s%s%s%s%s%s%s%s%s%s%s%s%s%s granted due to no posix mapping",
5237 vcp->vp,
5238 (eval.ae_residual & KAUTH_VNODE_READ_DATA)
5239 ? vnode_isdir(vcp->vp) ? " LIST_DIRECTORY" : " READ_DATA" : "",
5240 (eval.ae_residual & KAUTH_VNODE_WRITE_DATA)
5241 ? vnode_isdir(vcp->vp) ? " ADD_FILE" : " WRITE_DATA" : "",
5242 (eval.ae_residual & KAUTH_VNODE_EXECUTE)
5243 ? vnode_isdir(vcp->vp) ? " SEARCH" : " EXECUTE" : "",
5244 (eval.ae_residual & KAUTH_VNODE_DELETE)
5245 ? " DELETE" : "",
5246 (eval.ae_residual & KAUTH_VNODE_APPEND_DATA)
5247 ? vnode_isdir(vcp->vp) ? " ADD_SUBDIRECTORY" : " APPEND_DATA" : "",
5248 (eval.ae_residual & KAUTH_VNODE_DELETE_CHILD)
5249 ? " DELETE_CHILD" : "",
5250 (eval.ae_residual & KAUTH_VNODE_READ_ATTRIBUTES)
5251 ? " READ_ATTRIBUTES" : "",
5252 (eval.ae_residual & KAUTH_VNODE_WRITE_ATTRIBUTES)
5253 ? " WRITE_ATTRIBUTES" : "",
5254 (eval.ae_residual & KAUTH_VNODE_READ_EXTATTRIBUTES)
5255 ? " READ_EXTATTRIBUTES" : "",
5256 (eval.ae_residual & KAUTH_VNODE_WRITE_EXTATTRIBUTES)
5257 ? " WRITE_EXTATTRIBUTES" : "",
5258 (eval.ae_residual & KAUTH_VNODE_READ_SECURITY)
5259 ? " READ_SECURITY" : "",
5260 (eval.ae_residual & KAUTH_VNODE_WRITE_SECURITY)
5261 ? " WRITE_SECURITY" : "",
5262 (eval.ae_residual & KAUTH_VNODE_CHECKIMMUTABLE)
5263 ? " CHECKIMMUTABLE" : "",
5264 (eval.ae_residual & KAUTH_VNODE_CHANGE_OWNER)
5265 ? " CHANGE_OWNER" : "");
5266 }
5267
5268 /*
5269 * None of the remaining rights map onto Posix permissions, so there is no basis to deny access.
5270 */
5271 return(0);
5272 }
5273
5274 /*
5275 * Check for file immutability.
5276 */
5277 static int
5278 vnode_authorize_checkimmutable(vnode_t vp, struct vnode_attr *vap, int rights, int ignore)
5279 {
5280 mount_t mp;
5281 int error;
5282 int append;
5283
5284 /*
5285 * Perform immutability checks for operations that change data.
5286 *
5287 * Sockets, fifos and devices require special handling.
5288 */
5289 switch(vp->v_type) {
5290 case VSOCK:
5291 case VFIFO:
5292 case VBLK:
5293 case VCHR:
5294 /*
5295 * Writing to these nodes does not change the filesystem data,
5296 * so forget that it's being tried.
5297 */
5298 rights &= ~KAUTH_VNODE_WRITE_DATA;
5299 break;
5300 default:
5301 break;
5302 }
5303
5304 error = 0;
5305 if (rights & KAUTH_VNODE_WRITE_RIGHTS) {
5306
5307 /* check per-filesystem options if possible */
5308 mp = vp->v_mount;
5309 if (mp != NULL) {
5310
5311 /* check for no-EA filesystems */
5312 if ((rights & KAUTH_VNODE_WRITE_EXTATTRIBUTES) &&
5313 (vfs_flags(mp) & MNT_NOUSERXATTR)) {
5314 KAUTH_DEBUG("%p DENIED - filesystem disallowed extended attributes", vp);
5315 error = EACCES; /* User attributes disabled */
5316 goto out;
5317 }
5318 }
5319
5320 /* check for file immutability */
5321 append = 0;
5322 if (vp->v_type == VDIR) {
5323 if ((rights & (KAUTH_VNODE_ADD_FILE | KAUTH_VNODE_ADD_SUBDIRECTORY)) == rights)
5324 append = 1;
5325 } else {
5326 if ((rights & KAUTH_VNODE_APPEND_DATA) == rights)
5327 append = 1;
5328 }
5329 if ((error = vnode_immutable(vap, append, ignore)) != 0) {
5330 KAUTH_DEBUG("%p DENIED - file is immutable", vp);
5331 goto out;
5332 }
5333 }
5334 out:
5335 return(error);
5336 }
5337
5338 /*
5339 * Handle authorization actions for filesystems that advertise that the
5340 * server will be enforcing.
5341 *
5342 * Returns: 0 Authorization should be handled locally
5343 * 1 Authorization was handled by the FS
5344 *
5345 * Note: Imputed returns will only occur if the authorization request
5346 * was handled by the FS.
5347 *
5348 * Imputed: *resultp, modified Return code from FS when the request is
5349 * handled by the FS.
5350 * VNOP_ACCESS:???
5351 * VNOP_OPEN:???
5352 */
5353 static int
5354 vnode_authorize_opaque(vnode_t vp, int *resultp, kauth_action_t action, vfs_context_t ctx)
5355 {
5356 int error;
5357
5358 /*
5359 * If the vp is a device node, socket or FIFO it actually represents a local
5360 * endpoint, so we need to handle it locally.
5361 */
5362 switch(vp->v_type) {
5363 case VBLK:
5364 case VCHR:
5365 case VSOCK:
5366 case VFIFO:
5367 return(0);
5368 default:
5369 break;
5370 }
5371
5372 /*
5373 * In the advisory request case, if the filesystem doesn't think it's reliable
5374 * we will attempt to formulate a result ourselves based on VNOP_GETATTR data.
5375 */
5376 if ((action & KAUTH_VNODE_ACCESS) && !vfs_authopaqueaccess(vp->v_mount))
5377 return(0);
5378
5379 /*
5380 * Let the filesystem have a say in the matter. It's OK for it to not implement
5381 * VNOP_ACCESS, as most will authorise inline with the actual request.
5382 */
5383 if ((error = VNOP_ACCESS(vp, action, ctx)) != ENOTSUP) {
5384 *resultp = error;
5385 KAUTH_DEBUG("%p DENIED - opaque filesystem VNOP_ACCESS denied access", vp);
5386 return(1);
5387 }
5388
5389 /*
5390 * Typically opaque filesystems do authorisation in-line, but exec is a special case. In
5391 * order to be reasonably sure that exec will be permitted, we try a bit harder here.
5392 */
5393 if ((action & KAUTH_VNODE_EXECUTE) && (vp->v_type == VREG)) {
5394 /* try a VNOP_OPEN for readonly access */
5395 if ((error = VNOP_OPEN(vp, FREAD, ctx)) != 0) {
5396 *resultp = error;
5397 KAUTH_DEBUG("%p DENIED - EXECUTE denied because file could not be opened readonly", vp);
5398 return(1);
5399 }
5400 VNOP_CLOSE(vp, FREAD, ctx);
5401 }
5402
5403 /*
5404 * We don't have any reason to believe that the request has to be denied at this point,
5405 * so go ahead and allow it.
5406 */
5407 *resultp = 0;
5408 KAUTH_DEBUG("%p ALLOWED - bypassing access check for non-local filesystem", vp);
5409 return(1);
5410 }
5411
5412
5413
5414
5415 /*
5416 * Returns: KAUTH_RESULT_ALLOW
5417 * KAUTH_RESULT_DENY
5418 *
5419 * Imputed: *arg3, modified Error code in the deny case
5420 * EROFS Read-only file system
5421 * EACCES Permission denied
5422 * EPERM Operation not permitted [no execute]
5423 * vnode_getattr:ENOMEM Not enough space [only if has filesec]
5424 * vnode_getattr:???
5425 * vnode_authorize_opaque:*arg2 ???
5426 * vnode_authorize_checkimmutable:???
5427 * vnode_authorize_delete:???
5428 * vnode_authorize_simple:???
5429 */
5430
5431
5432 static int
5433 vnode_authorize_callback(kauth_cred_t cred, void *idata, kauth_action_t action,
5434 uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3)
5435 {
5436 vfs_context_t ctx;
5437 vnode_t cvp = NULLVP;
5438 vnode_t vp, dvp;
5439 int result;
5440
5441 ctx = (vfs_context_t)arg0;
5442 vp = (vnode_t)arg1;
5443 dvp = (vnode_t)arg2;
5444
5445 /*
5446 * if there are 2 vnodes passed in, we don't know at
5447 * this point which rights to look at based on the
5448 * combined action being passed in... defer until later...
5449 * otherwise check the kauth 'rights' cache hung
5450 * off of the vnode we're interested in... if we've already
5451 * been granted the right we're currently interested in,
5452 * we can just return success... otherwise we'll go through
5453 * the process of authorizing the requested right(s)... if that
5454 * succeeds, we'll add the right(s) to the cache.
5455 * VNOP_SETATTR and VNOP_SETXATTR will invalidate this cache
5456 */
5457 if (dvp && vp)
5458 goto defer;
5459 if (dvp)
5460 cvp = dvp;
5461 else
5462 cvp = vp;
5463
5464 if (vnode_cache_is_authorized(cvp, ctx, action) == TRUE)
5465 return KAUTH_RESULT_ALLOW;
5466 defer:
5467 result = vnode_authorize_callback_int(cred, idata, action, arg0, arg1, arg2, arg3);
5468
5469 if (result == KAUTH_RESULT_ALLOW && cvp != NULLVP)
5470 vnode_cache_authorized_action(cvp, ctx, action);
5471
5472 return result;
5473 }
5474
5475
5476 static int
5477 vnode_authorize_callback_int(__unused kauth_cred_t unused_cred, __unused void *idata, kauth_action_t action,
5478 uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3)
5479 {
5480 struct _vnode_authorize_context auth_context;
5481 vauth_ctx vcp;
5482 vfs_context_t ctx;
5483 vnode_t vp, dvp;
5484 kauth_cred_t cred;
5485 kauth_ace_rights_t rights;
5486 struct vnode_attr va, dva;
5487 int result;
5488 int *errorp;
5489 int noimmutable;
5490 boolean_t parent_authorized_for_delete_child = FALSE;
5491 boolean_t found_deny = FALSE;
5492 boolean_t parent_ref = FALSE;
5493
5494 vcp = &auth_context;
5495 ctx = vcp->ctx = (vfs_context_t)arg0;
5496 vp = vcp->vp = (vnode_t)arg1;
5497 dvp = vcp->dvp = (vnode_t)arg2;
5498 errorp = (int *)arg3;
5499 /*
5500 * Note that we authorize against the context, not the passed cred
5501 * (the same thing anyway)
5502 */
5503 cred = ctx->vc_ucred;
5504
5505 VATTR_INIT(&va);
5506 vcp->vap = &va;
5507 VATTR_INIT(&dva);
5508 vcp->dvap = &dva;
5509
5510 vcp->flags = vcp->flags_valid = 0;
5511
5512 #if DIAGNOSTIC
5513 if ((ctx == NULL) || (vp == NULL) || (cred == NULL))
5514 panic("vnode_authorize: bad arguments (context %p vp %p cred %p)", ctx, vp, cred);
5515 #endif
5516
5517 KAUTH_DEBUG("%p AUTH - %s %s%s%s%s%s%s%s%s%s%s%s%s%s%s%s on %s '%s' (0x%x:%p/%p)",
5518 vp, vfs_context_proc(ctx)->p_comm,
5519 (action & KAUTH_VNODE_ACCESS) ? "access" : "auth",
5520 (action & KAUTH_VNODE_READ_DATA) ? vnode_isdir(vp) ? " LIST_DIRECTORY" : " READ_DATA" : "",
5521 (action & KAUTH_VNODE_WRITE_DATA) ? vnode_isdir(vp) ? " ADD_FILE" : " WRITE_DATA" : "",
5522 (action & KAUTH_VNODE_EXECUTE) ? vnode_isdir(vp) ? " SEARCH" : " EXECUTE" : "",
5523 (action & KAUTH_VNODE_DELETE) ? " DELETE" : "",
5524 (action & KAUTH_VNODE_APPEND_DATA) ? vnode_isdir(vp) ? " ADD_SUBDIRECTORY" : " APPEND_DATA" : "",
5525 (action & KAUTH_VNODE_DELETE_CHILD) ? " DELETE_CHILD" : "",
5526 (action & KAUTH_VNODE_READ_ATTRIBUTES) ? " READ_ATTRIBUTES" : "",
5527 (action & KAUTH_VNODE_WRITE_ATTRIBUTES) ? " WRITE_ATTRIBUTES" : "",
5528 (action & KAUTH_VNODE_READ_EXTATTRIBUTES) ? " READ_EXTATTRIBUTES" : "",
5529 (action & KAUTH_VNODE_WRITE_EXTATTRIBUTES) ? " WRITE_EXTATTRIBUTES" : "",
5530 (action & KAUTH_VNODE_READ_SECURITY) ? " READ_SECURITY" : "",
5531 (action & KAUTH_VNODE_WRITE_SECURITY) ? " WRITE_SECURITY" : "",
5532 (action & KAUTH_VNODE_CHANGE_OWNER) ? " CHANGE_OWNER" : "",
5533 (action & KAUTH_VNODE_NOIMMUTABLE) ? " (noimmutable)" : "",
5534 vnode_isdir(vp) ? "directory" : "file",
5535 vp->v_name ? vp->v_name : "<NULL>", action, vp, dvp);
5536
5537 /*
5538 * Extract the control bits from the action, everything else is
5539 * requested rights.
5540 */
5541 noimmutable = (action & KAUTH_VNODE_NOIMMUTABLE) ? 1 : 0;
5542 rights = action & ~(KAUTH_VNODE_ACCESS | KAUTH_VNODE_NOIMMUTABLE);
5543
5544 if (rights & KAUTH_VNODE_DELETE) {
5545 #if DIAGNOSTIC
5546 if (dvp == NULL)
5547 panic("vnode_authorize: KAUTH_VNODE_DELETE test requires a directory");
5548 #endif
5549 /*
5550 * check to see if we've already authorized the parent
5551 * directory for deletion of its children... if so, we
5552 * can skip a whole bunch of work... we will still have to
5553 * authorize that this specific child can be removed
5554 */
5555 if (vnode_cache_is_authorized(dvp, ctx, KAUTH_VNODE_DELETE_CHILD) == TRUE)
5556 parent_authorized_for_delete_child = TRUE;
5557 } else {
5558 dvp = NULL;
5559 }
5560
5561 /*
5562 * Check for read-only filesystems.
5563 */
5564 if ((rights & KAUTH_VNODE_WRITE_RIGHTS) &&
5565 (vp->v_mount->mnt_flag & MNT_RDONLY) &&
5566 ((vp->v_type == VREG) || (vp->v_type == VDIR) ||
5567 (vp->v_type == VLNK) || (vp->v_type == VCPLX) ||
5568 (rights & KAUTH_VNODE_DELETE) || (rights & KAUTH_VNODE_DELETE_CHILD))) {
5569 result = EROFS;
5570 goto out;
5571 }
5572
5573 /*
5574 * Check for noexec filesystems.
5575 */
5576 if ((rights & KAUTH_VNODE_EXECUTE) && (vp->v_type == VREG) && (vp->v_mount->mnt_flag & MNT_NOEXEC)) {
5577 result = EACCES;
5578 goto out;
5579 }
5580
5581 /*
5582 * Handle cases related to filesystems with non-local enforcement.
5583 * This call can return 0, in which case we will fall through to perform a
5584 * check based on VNOP_GETATTR data. Otherwise it returns 1 and sets
5585 * an appropriate result, at which point we can return immediately.
5586 */
5587 if ((vp->v_mount->mnt_kern_flag & MNTK_AUTH_OPAQUE) && vnode_authorize_opaque(vp, &result, action, ctx))
5588 goto out;
5589
5590 /*
5591 * Get vnode attributes and extended security information for the vnode
5592 * and directory if required.
5593 */
5594 VATTR_WANTED(&va, va_mode);
5595 VATTR_WANTED(&va, va_uid);
5596 VATTR_WANTED(&va, va_gid);
5597 VATTR_WANTED(&va, va_flags);
5598 VATTR_WANTED(&va, va_acl);
5599 if ((result = vnode_getattr(vp, &va, ctx)) != 0) {
5600 KAUTH_DEBUG("%p ERROR - failed to get vnode attributes - %d", vp, result);
5601 goto out;
5602 }
5603 if (dvp && parent_authorized_for_delete_child == FALSE) {
5604 VATTR_WANTED(&dva, va_mode);
5605 VATTR_WANTED(&dva, va_uid);
5606 VATTR_WANTED(&dva, va_gid);
5607 VATTR_WANTED(&dva, va_flags);
5608 VATTR_WANTED(&dva, va_acl);
5609 if ((result = vnode_getattr(dvp, &dva, ctx)) != 0) {
5610 KAUTH_DEBUG("%p ERROR - failed to get directory vnode attributes - %d", vp, result);
5611 goto out;
5612 }
5613 }
5614
5615 /*
5616 * If the vnode is an extended attribute data vnode (eg. a resource fork), *_DATA becomes
5617 * *_EXTATTRIBUTES.
5618 */
5619 if (S_ISXATTR(va.va_mode) || vnode_isnamedstream(vp)) {
5620 if (rights & KAUTH_VNODE_READ_DATA) {
5621 rights &= ~KAUTH_VNODE_READ_DATA;
5622 rights |= KAUTH_VNODE_READ_EXTATTRIBUTES;
5623 }
5624 if (rights & KAUTH_VNODE_WRITE_DATA) {
5625 rights &= ~KAUTH_VNODE_WRITE_DATA;
5626 rights |= KAUTH_VNODE_WRITE_EXTATTRIBUTES;
5627 }
5628 }
5629
5630 /*
5631 * Point 'vp' to the resource fork's parent for ACL checking
5632 */
5633 if (vnode_isnamedstream(vp) &&
5634 (vp->v_parent != NULL) &&
5635 (vget_internal(vp->v_parent, 0, VNODE_NODEAD) == 0)) {
5636 parent_ref = TRUE;
5637 vcp->vp = vp = vp->v_parent;
5638 if (VATTR_IS_SUPPORTED(&va, va_acl) && (va.va_acl != NULL))
5639 kauth_acl_free(va.va_acl);
5640 VATTR_INIT(&va);
5641 VATTR_WANTED(&va, va_mode);
5642 VATTR_WANTED(&va, va_uid);
5643 VATTR_WANTED(&va, va_gid);
5644 VATTR_WANTED(&va, va_flags);
5645 VATTR_WANTED(&va, va_acl);
5646 if ((result = vnode_getattr(vp, &va, ctx)) != 0)
5647 goto out;
5648 }
5649
5650 /*
5651 * Check for immutability.
5652 *
5653 * In the deletion case, parent directory immutability vetoes specific
5654 * file rights.
5655 */
5656 if ((result = vnode_authorize_checkimmutable(vp, &va, rights, noimmutable)) != 0)
5657 goto out;
5658 if ((rights & KAUTH_VNODE_DELETE) &&
5659 parent_authorized_for_delete_child == FALSE &&
5660 ((result = vnode_authorize_checkimmutable(dvp, &dva, KAUTH_VNODE_DELETE_CHILD, 0)) != 0))
5661 goto out;
5662
5663 /*
5664 * Clear rights that have been authorized by reaching this point, bail if nothing left to
5665 * check.
5666 */
5667 rights &= ~(KAUTH_VNODE_LINKTARGET | KAUTH_VNODE_CHECKIMMUTABLE);
5668 if (rights == 0)
5669 goto out;
5670
5671 /*
5672 * If we're not the superuser, authorize based on file properties;
5673 * note that even if parent_authorized_for_delete_child is TRUE, we
5674 * need to check on the node itself.
5675 */
5676 if (!vfs_context_issuser(ctx)) {
5677 /* process delete rights */
5678 if ((rights & KAUTH_VNODE_DELETE) &&
5679 ((result = vnode_authorize_delete(vcp, parent_authorized_for_delete_child)) != 0))
5680 goto out;
5681
5682 /* process remaining rights */
5683 if ((rights & ~KAUTH_VNODE_DELETE) &&
5684 (result = vnode_authorize_simple(vcp, rights, rights & KAUTH_VNODE_DELETE, &found_deny)) != 0)
5685 goto out;
5686 } else {
5687
5688 /*
5689 * Execute is only granted to root if one of the x bits is set. This check only
5690 * makes sense if the posix mode bits are actually supported.
5691 */
5692 if ((rights & KAUTH_VNODE_EXECUTE) &&
5693 (vp->v_type == VREG) &&
5694 VATTR_IS_SUPPORTED(&va, va_mode) &&
5695 !(va.va_mode & (S_IXUSR | S_IXGRP | S_IXOTH))) {
5696 result = EPERM;
5697 KAUTH_DEBUG("%p DENIED - root execute requires at least one x bit in 0x%x", vp, va.va_mode);
5698 goto out;
5699 }
5700
5701 KAUTH_DEBUG("%p ALLOWED - caller is superuser", vp);
5702 }
5703 out:
5704 if (VATTR_IS_SUPPORTED(&va, va_acl) && (va.va_acl != NULL))
5705 kauth_acl_free(va.va_acl);
5706 if (VATTR_IS_SUPPORTED(&dva, va_acl) && (dva.va_acl != NULL))
5707 kauth_acl_free(dva.va_acl);
5708
5709 if (result) {
5710 if (parent_ref)
5711 vnode_put(vp);
5712 *errorp = result;
5713 KAUTH_DEBUG("%p DENIED - auth denied", vp);
5714 return(KAUTH_RESULT_DENY);
5715 }
5716 if ((rights & KAUTH_VNODE_SEARCH) && found_deny == FALSE && vp->v_type == VDIR) {
5717 /*
5718 * if we were successfully granted the right to search this directory
5719 * and there were NO ACL DENYs for search and the posix permissions also don't
5720 * deny execute, we can synthesize a global right that allows anyone to
5721 * traverse this directory during a pathname lookup without having to
5722 * match the credential associated with this cache of rights.
5723 */
5724 if (!VATTR_IS_SUPPORTED(&va, va_mode) ||
5725 ((va.va_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) ==
5726 (S_IXUSR | S_IXGRP | S_IXOTH))) {
5727 vnode_cache_authorized_action(vp, ctx, KAUTH_VNODE_SEARCHBYANYONE);
5728 }
5729 }
5730 if ((rights & KAUTH_VNODE_DELETE) && parent_authorized_for_delete_child == FALSE) {
5731 /*
5732 * parent was successfully and newly authorized for content deletions
5733 * add it to the cache, but only if it doesn't have the sticky
5734 * bit set on it. This same check is done earlier guarding
5735 * fetching of dva, and if we jumped to out without having done
5736 * this, we will have returned already because of a non-zero
5737 * 'result' value.
5738 */
5739 if (VATTR_IS_SUPPORTED(&dva, va_mode) &&
5740 !(dva.va_mode & (S_ISVTX))) {
5741 /* OK to cache delete rights */
5742 vnode_cache_authorized_action(dvp, ctx, KAUTH_VNODE_DELETE_CHILD);
5743 }
5744 }
5745 if (parent_ref)
5746 vnode_put(vp);
5747 /*
5748 * Note that this implies that we will allow requests for no rights, as well as
5749 * for rights that we do not recognise. There should be none of these.
5750 */
5751 KAUTH_DEBUG("%p ALLOWED - auth granted", vp);
5752 return(KAUTH_RESULT_ALLOW);
5753 }
5754
5755 /*
5756 * Check that the attribute information in vattr can be legally applied to
5757 * a new file by the context.
5758 */
5759 int
5760 vnode_authattr_new(vnode_t dvp, struct vnode_attr *vap, int noauth, vfs_context_t ctx)
5761 {
5762 int error;
5763 int has_priv_suser, ismember, defaulted_owner, defaulted_group, defaulted_mode;
5764 kauth_cred_t cred;
5765 guid_t changer;
5766 mount_t dmp;
5767
5768 error = 0;
5769 defaulted_owner = defaulted_group = defaulted_mode = 0;
5770
5771 /*
5772 * Require that the filesystem support extended security to apply any.
5773 */
5774 if (!vfs_extendedsecurity(dvp->v_mount) &&
5775 (VATTR_IS_ACTIVE(vap, va_acl) || VATTR_IS_ACTIVE(vap, va_uuuid) || VATTR_IS_ACTIVE(vap, va_guuid))) {
5776 error = EINVAL;
5777 goto out;
5778 }
5779
5780 /*
5781 * Default some fields.
5782 */
5783 dmp = dvp->v_mount;
5784
5785 /*
5786 * If the filesystem is mounted IGNORE_OWNERSHIP and an explicit owner is set, that
5787 * owner takes ownership of all new files.
5788 */
5789 if ((dmp->mnt_flag & MNT_IGNORE_OWNERSHIP) && (dmp->mnt_fsowner != KAUTH_UID_NONE)) {
5790 VATTR_SET(vap, va_uid, dmp->mnt_fsowner);
5791 defaulted_owner = 1;
5792 } else {
5793 if (!VATTR_IS_ACTIVE(vap, va_uid)) {
5794 /* default owner is current user */
5795 VATTR_SET(vap, va_uid, kauth_cred_getuid(vfs_context_ucred(ctx)));
5796 defaulted_owner = 1;
5797 }
5798 }
5799
5800 /*
5801 * If the filesystem is mounted IGNORE_OWNERSHIP and an explicit group is set, that
5802 * group takes ownership of all new files.
5803 */
5804 if ((dmp->mnt_flag & MNT_IGNORE_OWNERSHIP) && (dmp->mnt_fsgroup != KAUTH_GID_NONE)) {
5805 VATTR_SET(vap, va_gid, dmp->mnt_fsgroup);
5806 defaulted_group = 1;
5807 } else {
5808 if (!VATTR_IS_ACTIVE(vap, va_gid)) {
5809 /* default group comes from parent object, fallback to current user */
5810 struct vnode_attr dva;
5811 VATTR_INIT(&dva);
5812 VATTR_WANTED(&dva, va_gid);
5813 if ((error = vnode_getattr(dvp, &dva, ctx)) != 0)
5814 goto out;
5815 if (VATTR_IS_SUPPORTED(&dva, va_gid)) {
5816 VATTR_SET(vap, va_gid, dva.va_gid);
5817 } else {
5818 VATTR_SET(vap, va_gid, kauth_cred_getgid(vfs_context_ucred(ctx)));
5819 }
5820 defaulted_group = 1;
5821 }
5822 }
5823
5824 if (!VATTR_IS_ACTIVE(vap, va_flags))
5825 VATTR_SET(vap, va_flags, 0);
5826
5827 /* default mode is everything, masked with current umask */
5828 if (!VATTR_IS_ACTIVE(vap, va_mode)) {
5829 VATTR_SET(vap, va_mode, ACCESSPERMS & ~vfs_context_proc(ctx)->p_fd->fd_cmask);
5830 KAUTH_DEBUG("ATTR - defaulting new file mode to %o from umask %o", vap->va_mode, vfs_context_proc(ctx)->p_fd->fd_cmask);
5831 defaulted_mode = 1;
5832 }
5833 /* set timestamps to now */
5834 if (!VATTR_IS_ACTIVE(vap, va_create_time)) {
5835 nanotime(&vap->va_create_time);
5836 VATTR_SET_ACTIVE(vap, va_create_time);
5837 }
5838
5839 /*
5840 * Check for attempts to set nonsensical fields.
5841 */
5842 if (vap->va_active & ~VNODE_ATTR_NEWOBJ) {
5843 error = EINVAL;
5844 KAUTH_DEBUG("ATTR - ERROR - attempt to set unsupported new-file attributes %llx",
5845 vap->va_active & ~VNODE_ATTR_NEWOBJ);
5846 goto out;
5847 }
5848
5849 /*
5850 * Quickly check for the applicability of any enforcement here.
5851 * Tests below maintain the integrity of the local security model.
5852 */
5853 if (vfs_authopaque(dvp->v_mount))
5854 goto out;
5855
5856 /*
5857 * We need to know if the caller is the superuser, or if the work is
5858 * otherwise already authorised.
5859 */
5860 cred = vfs_context_ucred(ctx);
5861 if (noauth) {
5862 /* doing work for the kernel */
5863 has_priv_suser = 1;
5864 } else {
5865 has_priv_suser = vfs_context_issuser(ctx);
5866 }
5867
5868
5869 if (VATTR_IS_ACTIVE(vap, va_flags)) {
5870 if (has_priv_suser) {
5871 if ((vap->va_flags & (UF_SETTABLE | SF_SETTABLE)) != vap->va_flags) {
5872 error = EPERM;
5873 KAUTH_DEBUG(" DENIED - superuser attempt to set illegal flag(s)");
5874 goto out;
5875 }
5876 } else {
5877 if ((vap->va_flags & UF_SETTABLE) != vap->va_flags) {
5878 error = EPERM;
5879 KAUTH_DEBUG(" DENIED - user attempt to set illegal flag(s)");
5880 goto out;
5881 }
5882 }
5883 }
5884
5885 /* if not superuser, validate legality of new-item attributes */
5886 if (!has_priv_suser) {
5887 if (!defaulted_mode && VATTR_IS_ACTIVE(vap, va_mode)) {
5888 /* setgid? */
5889 if (vap->va_mode & S_ISGID) {
5890 if ((error = kauth_cred_ismember_gid(cred, vap->va_gid, &ismember)) != 0) {
5891 KAUTH_DEBUG("ATTR - ERROR: got %d checking for membership in %d", error, vap->va_gid);
5892 goto out;
5893 }
5894 if (!ismember) {
5895 KAUTH_DEBUG(" DENIED - can't set SGID bit, not a member of %d", vap->va_gid);
5896 error = EPERM;
5897 goto out;
5898 }
5899 }
5900
5901 /* setuid? */
5902 if ((vap->va_mode & S_ISUID) && (vap->va_uid != kauth_cred_getuid(cred))) {
5903 KAUTH_DEBUG("ATTR - ERROR: illegal attempt to set the setuid bit");
5904 error = EPERM;
5905 goto out;
5906 }
5907 }
5908 if (!defaulted_owner && (vap->va_uid != kauth_cred_getuid(cred))) {
5909 KAUTH_DEBUG(" DENIED - cannot create new item owned by %d", vap->va_uid);
5910 error = EPERM;
5911 goto out;
5912 }
5913 if (!defaulted_group) {
5914 if ((error = kauth_cred_ismember_gid(cred, vap->va_gid, &ismember)) != 0) {
5915 KAUTH_DEBUG(" ERROR - got %d checking for membership in %d", error, vap->va_gid);
5916 goto out;
5917 }
5918 if (!ismember) {
5919 KAUTH_DEBUG(" DENIED - cannot create new item with group %d - not a member", vap->va_gid);
5920 error = EPERM;
5921 goto out;
5922 }
5923 }
5924
5925 /* initialising owner/group UUID */
5926 if (VATTR_IS_ACTIVE(vap, va_uuuid)) {
5927 if ((error = kauth_cred_getguid(cred, &changer)) != 0) {
5928 KAUTH_DEBUG(" ERROR - got %d trying to get caller UUID", error);
5929 /* XXX ENOENT here - no GUID - should perhaps become EPERM */
5930 goto out;
5931 }
5932 if (!kauth_guid_equal(&vap->va_uuuid, &changer)) {
5933 KAUTH_DEBUG(" ERROR - cannot create item with supplied owner UUID - not us");
5934 error = EPERM;
5935 goto out;
5936 }
5937 }
5938 if (VATTR_IS_ACTIVE(vap, va_guuid)) {
5939 if ((error = kauth_cred_ismember_guid(cred, &vap->va_guuid, &ismember)) != 0) {
5940 KAUTH_DEBUG(" ERROR - got %d trying to check group membership", error);
5941 goto out;
5942 }
5943 if (!ismember) {
5944 KAUTH_DEBUG(" ERROR - cannot create item with supplied group UUID - not a member");
5945 error = EPERM;
5946 goto out;
5947 }
5948 }
5949 }
5950 out:
5951 return(error);
5952 }
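
/*
 * Illustrative sketch (annotation, not part of the original source):
 * the typical caller pattern, as in vn_create() above - populate a
 * vnode_attr for the new object, then let vnode_authattr_new() default
 * and vet it.  The wrapper name is hypothetical.
 */
#if 0	/* example only */
static int
example_new_file_attrs(vnode_t dvp, vfs_context_t ctx)
{
	struct vnode_attr va;

	VATTR_INIT(&va);
	VATTR_SET(&va, va_type, VREG);
	VATTR_SET(&va, va_mode, S_IRUSR | S_IWUSR);
	/* defaults va_uid, va_gid, va_create_time; vets the rest */
	return (vnode_authattr_new(dvp, &va, 0 /* !noauth */, ctx));
}
#endif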
5953
5954 /*
5955 * Check that the attribute information in vap can be legally written by the
5956 * context.
5957 *
5958 * Call this when you're not sure about the vnode_attr; either its contents
5959 * have come from an unknown source, or when they are variable.
5960 *
5961 * Returns errno, or zero and sets *actionp to the KAUTH_VNODE_* actions that
5962 * must be authorized to be permitted to write the vattr.
5963 */
5964 int
5965 vnode_authattr(vnode_t vp, struct vnode_attr *vap, kauth_action_t *actionp, vfs_context_t ctx)
5966 {
5967 struct vnode_attr ova;
5968 kauth_action_t required_action;
5969 int error, has_priv_suser, ismember, chowner, chgroup, clear_suid, clear_sgid;
5970 guid_t changer;
5971 gid_t group;
5972 uid_t owner;
5973 mode_t newmode;
5974 kauth_cred_t cred;
5975 uint32_t fdelta;
5976
5977 VATTR_INIT(&ova);
5978 required_action = 0;
5979 error = 0;
5980
5981 /*
5982 * Quickly check for enforcement applicability.
5983 */
5984 if (vfs_authopaque(vp->v_mount))
5985 goto out;
5986
5987 /*
5988 * Check for attempts to set nonsensical fields.
5989 */
5990 if (vap->va_active & VNODE_ATTR_RDONLY) {
5991 KAUTH_DEBUG("ATTR - ERROR: attempt to set readonly attribute(s)");
5992 error = EINVAL;
5993 goto out;
5994 }
5995
5996 /*
5997 * We need to know if the caller is the superuser.
5998 */
5999 cred = vfs_context_ucred(ctx);
6000 has_priv_suser = kauth_cred_issuser(cred);
6001
6002 /*
6003 * If any of the following are changing, we need information from the old file:
6004 * va_uid
6005 * va_gid
6006 * va_mode
6007 * va_uuuid
6008 * va_guuid
6009 */
6010 if (VATTR_IS_ACTIVE(vap, va_uid) ||
6011 VATTR_IS_ACTIVE(vap, va_gid) ||
6012 VATTR_IS_ACTIVE(vap, va_mode) ||
6013 VATTR_IS_ACTIVE(vap, va_uuuid) ||
6014 VATTR_IS_ACTIVE(vap, va_guuid)) {
6015 VATTR_WANTED(&ova, va_mode);
6016 VATTR_WANTED(&ova, va_uid);
6017 VATTR_WANTED(&ova, va_gid);
6018 VATTR_WANTED(&ova, va_uuuid);
6019 VATTR_WANTED(&ova, va_guuid);
6020 KAUTH_DEBUG("ATTR - security information changing, fetching existing attributes");
6021 }
6022
6023 /*
6024 * If timestamps are being changed, we need to know who the file is owned
6025 * by.
6026 */
6027 if (VATTR_IS_ACTIVE(vap, va_create_time) ||
6028 VATTR_IS_ACTIVE(vap, va_change_time) ||
6029 VATTR_IS_ACTIVE(vap, va_modify_time) ||
6030 VATTR_IS_ACTIVE(vap, va_access_time) ||
6031 VATTR_IS_ACTIVE(vap, va_backup_time)) {
6032
6033 VATTR_WANTED(&ova, va_uid);
6034 #if 0 /* enable this when we support UUIDs as official owners */
6035 VATTR_WANTED(&ova, va_uuuid);
6036 #endif
6037 KAUTH_DEBUG("ATTR - timestamps changing, fetching uid and GUID");
6038 }
6039
6040 /*
6041 * If flags are being changed, we need the old flags.
6042 */
6043 if (VATTR_IS_ACTIVE(vap, va_flags)) {
6044 KAUTH_DEBUG("ATTR - flags changing, fetching old flags");
6045 VATTR_WANTED(&ova, va_flags);
6046 }
6047
6048 /*
6049 * If the size is being set, make sure it's not a directory.
6050 */
6051 if (VATTR_IS_ACTIVE(vap, va_data_size)) {
6052 /* size is meaningless on a directory, don't permit this */
6053 if (vnode_isdir(vp)) {
6054 KAUTH_DEBUG("ATTR - ERROR: size change requested on a directory");
6055 error = EISDIR;
6056 goto out;
6057 }
6058 }
6059
6060 /*
6061 * Get old data.
6062 */
6063 KAUTH_DEBUG("ATTR - fetching old attributes %016llx", ova.va_active);
6064 if ((error = vnode_getattr(vp, &ova, ctx)) != 0) {
6065 KAUTH_DEBUG(" ERROR - got %d trying to get attributes", error);
6066 goto out;
6067 }
6068
6069 /*
6070 * Size changes require write access to the file data.
6071 */
6072 if (VATTR_IS_ACTIVE(vap, va_data_size)) {
6073 /* if we can't get the size, or it's different, we need write access */
6074 KAUTH_DEBUG("ATTR - size change, requiring WRITE_DATA");
6075 required_action |= KAUTH_VNODE_WRITE_DATA;
6076 }
6077
6078 /*
6079 * Changing timestamps?
6080 *
6081 * Note that we are only called to authorize user-requested time changes;
6082 * side-effect time changes are not authorized. Authorisation is only
6083 * required for existing files.
6084 *
6085 * Non-owners are not permitted to change the time on an existing
6086 * file to anything other than the current time.
6087 */
6088 if (VATTR_IS_ACTIVE(vap, va_create_time) ||
6089 VATTR_IS_ACTIVE(vap, va_change_time) ||
6090 VATTR_IS_ACTIVE(vap, va_modify_time) ||
6091 VATTR_IS_ACTIVE(vap, va_access_time) ||
6092 VATTR_IS_ACTIVE(vap, va_backup_time)) {
6093 /*
6094 * The owner and root may set any timestamps they like,
6095 * provided that the file is not immutable. The owner still needs
6096 * WRITE_ATTRIBUTES (implied by ownership but still deniable).
6097 */
6098 if (has_priv_suser || vauth_node_owner(&ova, cred)) {
6099 KAUTH_DEBUG("ATTR - root or owner changing timestamps");
6100 required_action |= KAUTH_VNODE_CHECKIMMUTABLE | KAUTH_VNODE_WRITE_ATTRIBUTES;
6101 } else {
6102 /* just setting the current time? */
6103 if (vap->va_vaflags & VA_UTIMES_NULL) {
6104 KAUTH_DEBUG("ATTR - non-root/owner changing timestamps, requiring WRITE_ATTRIBUTES");
6105 required_action |= KAUTH_VNODE_WRITE_ATTRIBUTES;
6106 } else {
6107 KAUTH_DEBUG("ATTR - ERROR: illegal timestamp modification attempted");
6108 error = EACCES;
6109 goto out;
6110 }
6111 }
6112 }
6113
6114 /*
6115 * Changing file mode?
6116 */
6117 if (VATTR_IS_ACTIVE(vap, va_mode) && VATTR_IS_SUPPORTED(&ova, va_mode) && (ova.va_mode != vap->va_mode)) {
6118 KAUTH_DEBUG("ATTR - mode change from %06o to %06o", ova.va_mode, vap->va_mode);
6119
6120 /*
6121 * Mode changes always have the same basic auth requirements.
6122 */
6123 if (has_priv_suser) {
6124 KAUTH_DEBUG("ATTR - superuser mode change, requiring immutability check");
6125 required_action |= KAUTH_VNODE_CHECKIMMUTABLE;
6126 } else {
6127 /* need WRITE_SECURITY */
6128 KAUTH_DEBUG("ATTR - non-superuser mode change, requiring WRITE_SECURITY");
6129 required_action |= KAUTH_VNODE_WRITE_SECURITY;
6130 }
6131
6132 /*
6133 * Can't set the setgid bit unless you're root or a member of the group. We must
6134 * have existing group information if the gid isn't being set right now.
6135 */
6136 if (vap->va_mode & S_ISGID) {
6137 required_action |= KAUTH_VNODE_CHECKIMMUTABLE; /* always required */
6138 if (!has_priv_suser) {
6139 if (VATTR_IS_ACTIVE(vap, va_gid)) {
6140 group = vap->va_gid;
6141 } else if (VATTR_IS_SUPPORTED(&ova, va_gid)) {
6142 group = ova.va_gid;
6143 } else {
6144 KAUTH_DEBUG("ATTR - ERROR: setgid but no gid available");
6145 error = EINVAL;
6146 goto out;
6147 }
6148 /*
6149 * This might be too restrictive; WRITE_SECURITY might be implied by
6150 * membership in this case, rather than being an additional requirement.
6151 */
6152 if ((error = kauth_cred_ismember_gid(cred, group, &ismember)) != 0) {
6153 KAUTH_DEBUG("ATTR - ERROR: got %d checking for membership in %d", error, group);
6154 goto out;
6155 }
6156 if (!ismember) {
6157 KAUTH_DEBUG(" DENIED - can't set SGID bit, not a member of %d", group);
6158 error = EPERM;
6159 goto out;
6160 }
6161 }
6162 }
6163
6164 /*
6165 * Can't set the setuid bit unless you're root or the file's owner.
6166 */
6167 if (vap->va_mode & S_ISUID) {
6168 required_action |= KAUTH_VNODE_CHECKIMMUTABLE; /* always required */
6169 if (!has_priv_suser) {
6170 if (VATTR_IS_ACTIVE(vap, va_uid)) {
6171 owner = vap->va_uid;
6172 } else if (VATTR_IS_SUPPORTED(&ova, va_uid)) {
6173 owner = ova.va_uid;
6174 } else {
6175 KAUTH_DEBUG("ATTR - ERROR: setuid but no uid available");
6176 error = EINVAL;
6177 goto out;
6178 }
6179 if (owner != kauth_cred_getuid(cred)) {
6180 /*
6181 * We could allow this if WRITE_SECURITY is permitted, perhaps.
6182 */
6183 KAUTH_DEBUG("ATTR - ERROR: illegal attempt to set the setuid bit");
6184 error = EPERM;
6185 goto out;
6186 }
6187 }
6188 }
6189 }
6190
6191 /*
6192 * Validate/mask flags changes. This checks that only the flags in
6193 * the UF_SETTABLE mask are being set, and preserves the flags in
6194 * the SF_SETTABLE case.
6195 *
6196 * Since flags changes may be made in conjunction with other changes,
6197 * we will ask the auth code to ignore immutability in the case that
6198 * the SF_* flags are not set and we are only manipulating the file flags.
6199 *
6200 */
6201 if (VATTR_IS_ACTIVE(vap, va_flags)) {
6202 /* compute changing flags bits */
6203 if (VATTR_IS_SUPPORTED(&ova, va_flags)) {
6204 fdelta = vap->va_flags ^ ova.va_flags;
6205 } else {
6206 fdelta = vap->va_flags;
6207 }
6208
6209 if (fdelta != 0) {
6210 KAUTH_DEBUG("ATTR - flags changing, requiring WRITE_SECURITY");
6211 required_action |= KAUTH_VNODE_WRITE_SECURITY;
6212
6213 /* check that changing bits are legal */
6214 if (has_priv_suser) {
6215 /*
6216 * The immutability check will prevent us from clearing the SF_*
6217 * flags unless the system securelevel permits it, so just check
6218 * for legal flags here.
6219 */
6220 if (fdelta & ~(UF_SETTABLE | SF_SETTABLE)) {
6221 error = EPERM;
6222 KAUTH_DEBUG(" DENIED - superuser attempt to set illegal flag(s)");
6223 goto out;
6224 }
6225 } else {
6226 if (fdelta & ~UF_SETTABLE) {
6227 error = EPERM;
6228 KAUTH_DEBUG(" DENIED - user attempt to set illegal flag(s)");
6229 goto out;
6230 }
6231 }
6232 /*
6233 * If the caller has the ability to manipulate file flags,
6234 * security is not reduced by ignoring them for this operation.
6235 *
6236 * A more complete test here would consider the 'after' states of the flags
6237 * to determine whether it would permit the operation, but this becomes
6238 * very complex.
6239 *
6240 * Ignoring immutability is conditional on securelevel; this does not bypass
6241 * the SF_* flags if securelevel > 0.
6242 */
6243 required_action |= KAUTH_VNODE_NOIMMUTABLE;
6244 }
6245 }
6246
6247 /*
6248 * Validate ownership information.
6249 */
6250 chowner = 0;
6251 chgroup = 0;
6252 clear_suid = 0;
6253 clear_sgid = 0;
6254
6255 /*
6256 * uid changing
6257 * Note that if the filesystem didn't give us a UID, we expect that it doesn't
6258 * support them in general, and will ignore it if/when we try to set it.
6259 * We might want to clear the uid out of vap completely here.
6260 */
6261 if (VATTR_IS_ACTIVE(vap, va_uid)) {
6262 if (VATTR_IS_SUPPORTED(&ova, va_uid) && (vap->va_uid != ova.va_uid)) {
6263 if (!has_priv_suser && (kauth_cred_getuid(cred) != vap->va_uid)) {
6264 KAUTH_DEBUG(" DENIED - non-superuser cannot change ownership to a third party");
6265 error = EPERM;
6266 goto out;
6267 }
6268 chowner = 1;
6269 }
6270 clear_suid = 1;
6271 }
6272
6273 /*
6274 * gid changing
6275 * Note that if the filesystem didn't give us a GID, we expect that it doesn't
6276 * support them in general, and will ignore it if/when we try to set it.
6277 * We might want to clear the gid out of vap completely here.
6278 */
6279 if (VATTR_IS_ACTIVE(vap, va_gid)) {
6280 if (VATTR_IS_SUPPORTED(&ova, va_gid) && (vap->va_gid != ova.va_gid)) {
6281 if (!has_priv_suser) {
6282 if ((error = kauth_cred_ismember_gid(cred, vap->va_gid, &ismember)) != 0) {
6283 KAUTH_DEBUG(" ERROR - got %d checking for membership in %d", error, vap->va_gid);
6284 goto out;
6285 }
6286 if (!ismember) {
6287 KAUTH_DEBUG(" DENIED - group change from %d to %d but not a member of target group",
6288 ova.va_gid, vap->va_gid);
6289 error = EPERM;
6290 goto out;
6291 }
6292 }
6293 chgroup = 1;
6294 }
6295 clear_sgid = 1;
6296 }
6297
6298 /*
6299 * Owner UUID being set or changed.
6300 */
6301 if (VATTR_IS_ACTIVE(vap, va_uuuid)) {
6302 /* if the owner UUID is not actually changing ... */
6303 if (VATTR_IS_SUPPORTED(&ova, va_uuuid) && kauth_guid_equal(&vap->va_uuuid, &ova.va_uuuid))
6304 goto no_uuuid_change;
6305
6306 /*
6307 * The owner UUID cannot be set by a non-superuser to anything other than
6308 * their own.
6309 */
6310 if (!has_priv_suser) {
6311 if ((error = kauth_cred_getguid(cred, &changer)) != 0) {
6312 KAUTH_DEBUG(" ERROR - got %d trying to get caller UUID", error);
6313 /* XXX ENOENT here - no UUID - should perhaps become EPERM */
6314 goto out;
6315 }
6316 if (!kauth_guid_equal(&vap->va_uuuid, &changer)) {
6317 KAUTH_DEBUG(" ERROR - cannot set supplied owner UUID - not us");
6318 error = EPERM;
6319 goto out;
6320 }
6321 }
6322 chowner = 1;
6323 clear_suid = 1;
6324 }
6325 no_uuuid_change:
6326 /*
6327 * Group UUID being set or changed.
6328 */
6329 if (VATTR_IS_ACTIVE(vap, va_guuid)) {
6330 /* if the group UUID is not actually changing ... */
6331 if (VATTR_IS_SUPPORTED(&ova, va_guuid) && kauth_guid_equal(&vap->va_guuid, &ova.va_guuid))
6332 goto no_guuid_change;
6333
6334 /*
6335 * The group UUID cannot be set by a non-superuser to anything other than
6336 * one of which they are a member.
6337 */
6338 if (!has_priv_suser) {
6339 if ((error = kauth_cred_ismember_guid(cred, &vap->va_guuid, &ismember)) != 0) {
6340 KAUTH_DEBUG(" ERROR - got %d trying to check group membership", error);
6341 goto out;
6342 }
6343 if (!ismember) {
6344 KAUTH_DEBUG(" ERROR - cannot create item with supplied group UUID - not a member");
6345 error = EPERM;
6346 goto out;
6347 }
6348 }
6349 chgroup = 1;
6350 }
6351 no_guuid_change:
6352
6353 /*
6354 * Compute authorisation for group/ownership changes.
6355 */
6356 if (chowner || chgroup || clear_suid || clear_sgid) {
6357 if (has_priv_suser) {
6358 KAUTH_DEBUG("ATTR - superuser changing file owner/group, requiring immutability check");
6359 required_action |= KAUTH_VNODE_CHECKIMMUTABLE;
6360 } else {
6361 if (chowner) {
6362 KAUTH_DEBUG("ATTR - ownership change, requiring TAKE_OWNERSHIP");
6363 required_action |= KAUTH_VNODE_TAKE_OWNERSHIP;
6364 }
6365 if (chgroup && !chowner) {
6366 KAUTH_DEBUG("ATTR - group change, requiring WRITE_SECURITY");
6367 required_action |= KAUTH_VNODE_WRITE_SECURITY;
6368 }
6369
6370 /* clear set-uid and set-gid bits as required by Posix */
6371 if (VATTR_IS_ACTIVE(vap, va_mode)) {
6372 newmode = vap->va_mode;
6373 } else if (VATTR_IS_SUPPORTED(&ova, va_mode)) {
6374 newmode = ova.va_mode;
6375 } else {
6376 KAUTH_DEBUG("CHOWN - trying to change owner but cannot get mode from filesystem to mask setugid bits");
6377 newmode = 0;
6378 }
6379 if (newmode & (S_ISUID | S_ISGID)) {
6380 VATTR_SET(vap, va_mode, newmode & ~(S_ISUID | S_ISGID));
6381 KAUTH_DEBUG("CHOWN - masking setugid bits from mode %o to %o", newmode, vap->va_mode);
6382 }
6383 }
6384 }
6385
6386 /*
6387 * Authorise changes in the ACL.
6388 */
6389 if (VATTR_IS_ACTIVE(vap, va_acl)) {
6390
6391 /* no existing ACL */
6392 if (!VATTR_IS_ACTIVE(&ova, va_acl) || (ova.va_acl == NULL)) {
6393
6394 /* adding an ACL */
6395 if (vap->va_acl != NULL) {
6396 required_action |= KAUTH_VNODE_WRITE_SECURITY;
6397 KAUTH_DEBUG("CHMOD - adding ACL");
6398 }
6399
6400 /* removing an existing ACL */
6401 } else if (vap->va_acl == NULL) {
6402 required_action |= KAUTH_VNODE_WRITE_SECURITY;
6403 KAUTH_DEBUG("CHMOD - removing ACL");
6404
6405 /* updating an existing ACL */
6406 } else {
6407 if (vap->va_acl->acl_entrycount != ova.va_acl->acl_entrycount) {
6408 /* entry count changed, must be different */
6409 required_action |= KAUTH_VNODE_WRITE_SECURITY;
6410 KAUTH_DEBUG("CHMOD - adding/removing ACL entries");
6411 } else if (vap->va_acl->acl_entrycount > 0) {
6412 /* both ACLs have the same ACE count, said count is 1 or more, bitwise compare ACLs */
6413 if (memcmp(&vap->va_acl->acl_ace[0], &ova.va_acl->acl_ace[0],
6414 sizeof(struct kauth_ace) * vap->va_acl->acl_entrycount) != 0) {
6415 required_action |= KAUTH_VNODE_WRITE_SECURITY;
6416 KAUTH_DEBUG("CHMOD - changing ACL entries");
6417 }
6418 }
6419 }
6420 }
6421
6422 /*
6423 * Other attributes that require authorisation.
6424 */
6425 if (VATTR_IS_ACTIVE(vap, va_encoding))
6426 required_action |= KAUTH_VNODE_WRITE_ATTRIBUTES;
6427
6428 out:
6429 if (VATTR_IS_SUPPORTED(&ova, va_acl) && (ova.va_acl != NULL))
6430 kauth_acl_free(ova.va_acl);
6431 if (error == 0)
6432 *actionp = required_action;
6433 return(error);
6434 }
6435
6436
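/*
 * Mark a mount as using the VFS-local advisory locking code, propagating
 * the mark to every vnode already attached to the mount.
 *
 * Illustrative use (a sketch; the entry-point name is hypothetical): a
 * filesystem that relies on the generic advisory-lock path would call this
 * once from its mount handler, before any vnodes are handed out:
 *
 *	example_fs_mount(mount_t mp, ...)
 *	{
 *		...
 *		vfs_setlocklocal(mp);
 *		...
 *	}
 */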
6437 void
6438 vfs_setlocklocal(mount_t mp)
6439 {
6440 vnode_t vp;
6441
6442 mount_lock(mp);
6443 mp->mnt_kern_flag |= MNTK_LOCK_LOCAL;
6444
6445 /*
6446 * We do not expect anyone to be using any vnodes at the
6447 * time this routine is called. So no need for vnode locking
6448 */
6449 TAILQ_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) {
6450 vp->v_flag |= VLOCKLOCAL;
6451 }
6452 TAILQ_FOREACH(vp, &mp->mnt_workerqueue, v_mntvnodes) {
6453 vp->v_flag |= VLOCKLOCAL;
6454 }
6455 TAILQ_FOREACH(vp, &mp->mnt_newvnodes, v_mntvnodes) {
6456 vp->v_flag |= VLOCKLOCAL;
6457 }
6458 mount_unlock(mp);
6459 }
6460
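/*
 * VISUNION wait protocol (descriptive sketch): vn_setunionwait() marks the
 * vnode, vn_checkunionwait() puts any thread that finds the mark to sleep,
 * and vn_clearunionwait() clears the mark and wakes the sleepers, e.g.
 *
 *	vn_setunionwait(vp);		/ * publish: vp is busy * /
 *	...				/ * manipulate union mount state * /
 *	vn_clearunionwait(vp, 0);	/ * wake threads in vn_checkunionwait() * /
 */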
6461 void
6462 vn_setunionwait(vnode_t vp)
6463 {
6464 vnode_lock_spin(vp);
6465 vp->v_flag |= VISUNION;
6466 vnode_unlock(vp);
6467 }
6468
6469
6470 void
6471 vn_checkunionwait(vnode_t vp)
6472 {
6473 vnode_lock(vp);
6474 while ((vp->v_flag & VISUNION) == VISUNION)
6475 msleep((caddr_t)&vp->v_flag, &vp->v_lock, 0, 0, 0);
6476 vnode_unlock(vp);
6477 }
6478
6479 void
6480 vn_clearunionwait(vnode_t vp, int locked)
6481 {
6482 if (!locked)
6483 vnode_lock(vp);
6484 if ((vp->v_flag & VISUNION) == VISUNION) {
6485 vp->v_flag &= ~VISUNION;
6486 wakeup((caddr_t)&vp->v_flag);
6487 }
6488 if (!locked)
6489 vnode_unlock(vp);
6490 }
6491
6492 /*
6493 * XXX - get "don't trigger mounts" flag for thread; used by autofs.
6494 */
6495 extern int thread_notrigger(void);
6496
6497 int
6498 thread_notrigger(void)
6499 {
6500 struct uthread *uth = (struct uthread *)get_bsdthread_info(current_thread());
6501 return (uth->uu_notrigger);
6502 }
6503
6504 /*
6505 * Removes orphaned apple double files during a rmdir
6506 * Works by:
6507 * 1. vnode_suspend().
6508 * 2. Call VNOP_READDIR() until the end of the directory is reached.
6509 * 3. Check that each directory entry returned is a regular file whose name starts with "._". If not, return ENOTEMPTY.
6510 * 4. Repeat (2) and (3) until the end of the directory is reached.
6511 * 5. If every entry in the directory was a "._" file, delete them all.
6512 * 6. vnode_resume()
6513 * 7. If deletion of all files succeeded, call VNOP_RMDIR() again.
6514 */
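/*
 * For example (illustrative names): a directory containing only "._a" and
 * "._b" is emptied and the rmdir retried, while one containing "a" or the
 * AppleDouble-of-an-AppleDouble name "._._a" fails with ENOTEMPTY.
 */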
6515
6516 errno_t rmdir_remove_orphaned_appleDouble(vnode_t vp, vfs_context_t ctx, int *restart_flag)
6517 {
6518
6519 #define UIO_BUFF_SIZE 2048
6520 uio_t auio = NULL;
6521 int eofflag, siz = UIO_BUFF_SIZE, nentries = 0;
6522 int open_flag = 0, full_erase_flag = 0;
6523 char uio_buf[ UIO_SIZEOF(1) ];
6524 char *rbuf = NULL, *cpos, *cend;
6525 struct nameidata nd_temp;
6526 struct dirent *dp;
6527 errno_t error;
6528
6529 error = vnode_suspend(vp);
6530
6531 /*
6532 * restart_flag is set so that the calling rmdir sleeps and resets
6533 */
6534 if (error == EBUSY)
6535 *restart_flag = 1;
6536 if (error != 0)
6537 goto outsc;
6538
6539 /*
6540 * set up UIO
6541 */
6542 MALLOC(rbuf, caddr_t, siz, M_TEMP, M_WAITOK);
6543 if (rbuf)
6544 auio = uio_createwithbuffer(1, 0, UIO_SYSSPACE, UIO_READ,
6545 &uio_buf[0], sizeof(uio_buf));
6546 if (!rbuf || !auio) {
6547 error = ENOMEM;
6548 goto outsc;
6549 }
6550
6551 uio_setoffset(auio, 0);
6552
6553 eofflag = 0;
6554
6555 if ((error = VNOP_OPEN(vp, FREAD, ctx)))
6556 goto outsc;
6557 else
6558 open_flag = 1;
6559
6560 /*
6561 * First pass checks if all files are appleDouble files.
6562 */
6563
6564 do {
6565 siz = UIO_BUFF_SIZE;
6566 uio_reset(auio, uio_offset(auio), UIO_SYSSPACE, UIO_READ);
6567 uio_addiov(auio, CAST_USER_ADDR_T(rbuf), UIO_BUFF_SIZE);
6568
6569 if ((error = VNOP_READDIR(vp, auio, 0, &eofflag, &nentries, ctx)))
6570 goto outsc;
6571
6572 if (uio_resid(auio) != 0)
6573 siz -= uio_resid(auio);
6574
6575 /*
6576 * Iterate through directory
6577 */
6578 cpos = rbuf;
6579 cend = rbuf + siz;
6580 dp = (struct dirent*) cpos;
6581
6582 if (cpos == cend)
6583 eofflag = 1;
6584
6585 while ((cpos < cend)) {
6586 /*
6587 * Check for . and .. as well as directories
6588 */
6589 if (dp->d_ino != 0 &&
6590 !((dp->d_namlen == 1 && dp->d_name[0] == '.') ||
6591 (dp->d_namlen == 2 && dp->d_name[0] == '.' && dp->d_name[1] == '.'))) {
6592 /*
6593 * Check for irregular files and ._ files
6594 * If there is a ._._ file abort the op
6595 */
6596 if (dp->d_namlen < 2 ||
6597 strncmp(dp->d_name, "._", 2) ||
6598 (dp->d_namlen >= 4 && !strncmp(&(dp->d_name[2]), "._", 2))) {
6599 error = ENOTEMPTY;
6600 goto outsc;
6601 }
6602 }
6603 cpos += dp->d_reclen;
6604 dp = (struct dirent*)cpos;
6605 }
6606
6607 /*
6608 * workaround for HFS/NFS setting eofflag before end of file
6609 */
6610 if (vp->v_tag == VT_HFS && nentries > 2)
6611 eofflag = 0;
6612
6613 if (vp->v_tag == VT_NFS) {
6614 if (eofflag && !full_erase_flag) {
6615 full_erase_flag = 1;
6616 eofflag = 0;
6617 uio_reset(auio, 0, UIO_SYSSPACE, UIO_READ);
6618 }
6619 else if (!eofflag && full_erase_flag)
6620 full_erase_flag = 0;
6621 }
6622
6623 } while (!eofflag);
6624 /*
6625 * If we've made it here all the files in the dir are AppleDouble
6626 * We can delete the files even though the node is suspended
6627 * because we are the owner of the file.
6628 */
6629
6630 uio_reset(auio, 0, UIO_SYSSPACE, UIO_READ);
6631 eofflag = 0;
6632 full_erase_flag = 0;
6633
6634 do {
6635 siz = UIO_BUFF_SIZE;
6636 uio_reset(auio, uio_offset(auio), UIO_SYSSPACE, UIO_READ);
6637 uio_addiov(auio, CAST_USER_ADDR_T(rbuf), UIO_BUFF_SIZE);
6638
6639 error = VNOP_READDIR(vp, auio, 0, &eofflag, &nentries, ctx);
6640
6641 if (error != 0)
6642 goto outsc;
6643
6644 if (uio_resid(auio) != 0)
6645 siz -= uio_resid(auio);
6646
6647 /*
6648 * Iterate through directory
6649 */
6650 cpos = rbuf;
6651 cend = rbuf + siz;
6652 dp = (struct dirent*) cpos;
6653
6654 if (cpos == cend)
6655 eofflag = 1;
6656
6657 while ((cpos < cend)) {
6658 /*
6659 * Check for . and .. as well as directories
6660 */
6661 if (dp->d_ino != 0 &&
6662 !((dp->d_namlen == 1 && dp->d_name[0] == '.') ||
6663 (dp->d_namlen == 2 && dp->d_name[0] == '.' && dp->d_name[1] == '.'))
6664 ) {
6665 NDINIT(&nd_temp, DELETE, USEDVP, UIO_SYSSPACE, CAST_USER_ADDR_T(dp->d_name), ctx);
6666 nd_temp.ni_dvp = vp;
6667 error = unlink1(ctx, &nd_temp, 0);
6668 if (error && error != ENOENT)
6669 goto outsc;
6670 }
6671 cpos += dp->d_reclen;
6672 dp = (struct dirent*)cpos;
6673 }
6674
6675 /*
6676 * workaround for HFS/NFS setting eofflag before end of file
6677 */
6678 if (vp->v_tag == VT_HFS && nentries > 2)
6679 eofflag = 0;
6680
6681 if (vp->v_tag == VT_NFS) {
6682 if (eofflag && !full_erase_flag) {
6683 full_erase_flag = 1;
6684 eofflag = 0;
6685 uio_reset(auio, 0, UIO_SYSSPACE, UIO_READ);
6686 }
6687 else if (!eofflag && full_erase_flag)
6688 full_erase_flag = 0;
6689 }
6690
6691 } while (!eofflag);
6692
6693
6694 error = 0;
6695
6696 outsc:
6697 if (open_flag)
6698 VNOP_CLOSE(vp, FREAD, ctx);
6699
6700 uio_free(auio);
6701 FREE(rbuf, M_TEMP);
6702
6703 vnode_resume(vp);
6704
6705
6706 return(error);
6707
6708 }
6709
6710
6711 #ifdef JOE_DEBUG
6712
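/*
 * Debug bookkeeping: accumulate the net iocount delta taken by the current
 * thread and remember up to 32 distinct vnodes it has referenced, so that a
 * leaked iocount can be traced back to the thread that took it (presumably
 * the intent of this JOE_DEBUG-only helper).
 */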
6713 void record_vp(vnode_t vp, int count) {
6714 struct uthread *ut;
6715 int i;
6716
6717 if ((vp->v_flag & VSYSTEM))
6718 return;
6719
6720 ut = get_bsdthread_info(current_thread());
6721 ut->uu_iocount += count;
6722
6723 if (ut->uu_vpindex < 32) {
6724 for (i = 0; i < ut->uu_vpindex; i++) {
6725 if (ut->uu_vps[i] == vp)
6726 return;
6727 }
6728 ut->uu_vps[ut->uu_vpindex] = vp;
6729 ut->uu_vpindex++;
6730 }
6731 }
6732 #endif