apple/xnu xnu-517.3.15: bsd/vfs/vfs_subr.c
1 /*
2 * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved.
7 *
8 * This file contains Original Code and/or Modifications of Original Code
9 * as defined in and that are subject to the Apple Public Source License
10 * Version 2.0 (the 'License'). You may not use this file except in
11 * compliance with the License. Please obtain a copy of the License at
12 * http://www.opensource.apple.com/apsl/ and read it before using this
13 * file.
14 *
15 * The Original Code and all software distributed under the License are
16 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
17 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
18 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
20 * Please see the License for the specific language governing rights and
21 * limitations under the License.
22 *
23 * @APPLE_LICENSE_HEADER_END@
24 */
25 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
26 /*
27 * Copyright (c) 1989, 1993
28 * The Regents of the University of California. All rights reserved.
29 * (c) UNIX System Laboratories, Inc.
30 * All or some portions of this file are derived from material licensed
31 * to the University of California by American Telephone and Telegraph
32 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
33 * the permission of UNIX System Laboratories, Inc.
34 *
35 * Redistribution and use in source and binary forms, with or without
36 * modification, are permitted provided that the following conditions
37 * are met:
38 * 1. Redistributions of source code must retain the above copyright
39 * notice, this list of conditions and the following disclaimer.
40 * 2. Redistributions in binary form must reproduce the above copyright
41 * notice, this list of conditions and the following disclaimer in the
42 * documentation and/or other materials provided with the distribution.
43 * 3. All advertising materials mentioning features or use of this software
44 * must display the following acknowledgement:
45 * This product includes software developed by the University of
46 * California, Berkeley and its contributors.
47 * 4. Neither the name of the University nor the names of its contributors
48 * may be used to endorse or promote products derived from this software
49 * without specific prior written permission.
50 *
51 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
52 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
53 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
54 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
55 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
56 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
57 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
58 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
59 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
60 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
61 * SUCH DAMAGE.
62 *
63 * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95
64 */
65
66 /*
67 * External virtual filesystem routines
68 */
69
70 #undef DIAGNOSTIC
71 #define DIAGNOSTIC 1
72
73 #include <sys/param.h>
74 #include <sys/systm.h>
75 #include <sys/proc.h>
76 #include <sys/mount.h>
77 #include <sys/time.h>
78 #include <sys/vnode.h>
79 #include <sys/stat.h>
80 #include <sys/namei.h>
81 #include <sys/ucred.h>
82 #include <sys/buf.h>
83 #include <sys/errno.h>
84 #include <sys/malloc.h>
85 #include <sys/domain.h>
86 #include <sys/mbuf.h>
87 #include <sys/syslog.h>
88 #include <sys/ubc.h>
89 #include <sys/vm.h>
90 #include <sys/sysctl.h>
91 #include <sys/filedesc.h>
92 #include <sys/event.h>
93
94 #include <string.h>
95 #include <machine/spl.h>
96
97
98 #include <kern/assert.h>
99
100 #include <miscfs/specfs/specdev.h>
101
102 #include <mach/mach_types.h>
103 #include <mach/memory_object_types.h>
104
105
106 enum vtype iftovt_tab[16] = {
107 VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
108 VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
109 };
110 int vttoif_tab[9] = {
111 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
112 S_IFSOCK, S_IFIFO, S_IFMT,
113 };
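/*
 * These tables are indexed via the IFTOVT() and VTTOIF() macros
 * (see <sys/vnode.h>): iftovt_tab[] maps the S_IFMT file-type bits of an
 * inode mode (shifted right by 12) to a vnode type, and vttoif_tab[] maps
 * a vnode type back to its S_IF* bits.  For example, S_IFDIR (0040000)
 * indexes slot 4 and yields VDIR.
 */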
114
115 static void vfree(struct vnode *vp);
116 static void vinactive(struct vnode *vp);
117 static int vnreclaim(int count);
118 extern kern_return_t
119 adjust_vm_object_cache(vm_size_t oval, vm_size_t nval);
120
121 TAILQ_HEAD(freelst, vnode) vnode_free_list; /* vnode free list */
122 TAILQ_HEAD(inactivelst, vnode) vnode_inactive_list; /* vnode inactive list */
123 struct mntlist mountlist; /* mounted filesystem list */
124
125 #if DIAGNOSTIC
126 #define VLISTCHECK(fun, vp, list) \
127 if ((vp)->v_freelist.tqe_prev == (struct vnode **)0xdeadb) \
128 panic("%s: %s vnode not on %slist", (fun), (list), (list));
129
130 #define VINACTIVECHECK(fun, vp, expected) \
131 do { \
132 int __is_inactive = ISSET((vp)->v_flag, VUINACTIVE); \
133 if (__is_inactive ^ expected) \
134 panic("%s: %sinactive vnode, expected %s", (fun), \
135 __is_inactive? "" : "not ", \
136 expected? "inactive": "not inactive"); \
137 } while(0)
138 #else
139 #define VLISTCHECK(fun, vp, list)
140 #define VINACTIVECHECK(fun, vp, expected)
141 #endif /* DIAGNOSTIC */
142
143 #define VLISTNONE(vp) \
144 do { \
145 (vp)->v_freelist.tqe_next = (struct vnode *)0; \
146 (vp)->v_freelist.tqe_prev = (struct vnode **)0xdeadb; \
147 } while(0)
148
149 #define VONLIST(vp) \
150 ((vp)->v_freelist.tqe_prev != (struct vnode **)0xdeadb)
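/*
 * In other words, a vnode that is on neither the free list nor the
 * inactive list has its v_freelist back pointer set to the 0xdeadb
 * sentinel by VLISTNONE(); VONLIST() simply tests for that sentinel.
 */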
151
152 /* remove a vnode from free vnode list */
153 #define VREMFREE(fun, vp) \
154 do { \
155 VLISTCHECK((fun), (vp), "free"); \
156 TAILQ_REMOVE(&vnode_free_list, (vp), v_freelist); \
157 VLISTNONE((vp)); \
158 freevnodes--; \
159 } while(0)
160
161 /* remove a vnode from inactive vnode list */
162 #define VREMINACTIVE(fun, vp) \
163 do { \
164 VLISTCHECK((fun), (vp), "inactive"); \
165 VINACTIVECHECK((fun), (vp), VUINACTIVE); \
166 TAILQ_REMOVE(&vnode_inactive_list, (vp), v_freelist); \
167 CLR((vp)->v_flag, VUINACTIVE); \
168 VLISTNONE((vp)); \
169 inactivevnodes--; \
170 } while(0)
171
172 #define VORECLAIM_ENABLE(vp) \
173 do { \
174 if (ISSET((vp)->v_flag, VORECLAIM)) \
175 panic("vm_object_reclaim already"); \
176 SET((vp)->v_flag, VORECLAIM); \
177 } while(0)
178
179 #define VORECLAIM_DISABLE(vp) \
180 do { \
181 CLR((vp)->v_flag, VORECLAIM); \
182 if (ISSET((vp)->v_flag, VXWANT)) { \
183 CLR((vp)->v_flag, VXWANT); \
184 wakeup((caddr_t)(vp)); \
185 } \
186 } while(0)
187
188 /*
189 * Have to declare the first two locks as actual data even if !MACH_SLOCKS,
190 * since pointers to them get passed around.
191 */
192 simple_lock_data_t mountlist_slock;
193 simple_lock_data_t mntvnode_slock;
194 decl_simple_lock_data(,mntid_slock);
195 decl_simple_lock_data(,vnode_free_list_slock);
196 decl_simple_lock_data(,spechash_slock);
197
198 /*
199 * vnodetarget is the number of vnodes we expect to get back
200 * from the inactive vnode list and VM object cache.
201 * Since vnreclaim() is mainly a CPU-bound operation, this number
202 * could be higher on faster processors.
203 * Setting this number too high introduces longer delays in
204 * the execution of getnewvnode().
205 */
206 unsigned long vnodetarget; /* target for vnreclaim() */
207 #define VNODE_FREE_TARGET 20 /* Default value for vnodetarget */
208
209 /*
210 * We need quite a few vnodes on the free list to sustain the rapid
211 * stat() traffic that a compilation process generates, and to still benefit
212 * from the name cache. Having too few vnodes on the free list causes
213 * serious disk thrashing as we cycle through them.
214 */
215 #define VNODE_FREE_MIN 300 /* freelist should have at least this many */
216
217 /*
218 * We need to get vnodes back from the VM object cache when a certain #
219 * of vnodes are reused from the freelist. This is essential for the
220 * caching to be effective in the namecache and the buffer cache [for the
221 * metadata].
222 */
223 #define VNODE_TOOMANY_REUSED (VNODE_FREE_MIN/4)
224
225 /*
226 * If we have enough vnodes on the freelist we do not want to reclaim
227 * the vnodes from the VM object cache.
228 */
229 #define VNODE_FREE_ENOUGH (VNODE_FREE_MIN + (VNODE_FREE_MIN/2))
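/*
 * With the default VNODE_FREE_MIN of 300, the thresholds above work out
 * to: start reclaiming from the VM object cache once 75 vnodes have been
 * reused (VNODE_TOOMANY_REUSED), and skip that reclaim whenever at least
 * 450 vnodes are already free (VNODE_FREE_ENOUGH).
 */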
230
231 /*
232 * Initialize the vnode management data structures.
233 */
234 __private_extern__ void
235 vntblinit()
236 {
237 extern struct lock__bsd__ exchangelock;
238
239 simple_lock_init(&mountlist_slock);
240 simple_lock_init(&mntvnode_slock);
241 simple_lock_init(&mntid_slock);
242 simple_lock_init(&spechash_slock);
243 TAILQ_INIT(&vnode_free_list);
244 simple_lock_init(&vnode_free_list_slock);
245 TAILQ_INIT(&vnode_inactive_list);
246 CIRCLEQ_INIT(&mountlist);
247 lockinit(&exchangelock, PVFS, "exchange", 0, 0);
248
249 if (!vnodetarget)
250 vnodetarget = VNODE_FREE_TARGET;
251
252 /*
253 * Scale the vm_object_cache to accommodate the vnodes
254 * we want to cache
255 */
256 (void) adjust_vm_object_cache(0, desiredvnodes - VNODE_FREE_MIN);
257 }
258
259 /* Reset the VM Object Cache with the values passed in */
260 __private_extern__ kern_return_t
261 reset_vmobjectcache(unsigned int val1, unsigned int val2)
262 {
263 vm_size_t oval = val1 - VNODE_FREE_MIN;
264 vm_size_t nval;
265
266 if(val2 < VNODE_FREE_MIN)
267 nval = 0;
268 else
269 nval = val2 - VNODE_FREE_MIN;
270
271 return(adjust_vm_object_cache(oval, nval));
272 }
273
274 /*
275 * Mark a mount point as busy. Used to synchronize access and to delay
276 * unmounting. Interlock is not released on failure.
277 */
278 int
279 vfs_busy(mp, flags, interlkp, p)
280 struct mount *mp;
281 int flags;
282 struct slock *interlkp;
283 struct proc *p;
284 {
285 int lkflags;
286
287 if (mp->mnt_kern_flag & MNTK_UNMOUNT) {
288 if (flags & LK_NOWAIT)
289 return (ENOENT);
290 mp->mnt_kern_flag |= MNTK_MWAIT;
291 if (interlkp)
292 simple_unlock(interlkp);
293 /*
294 * Since all busy locks are shared except the exclusive
295 * lock granted when unmounting, the only place that a
296 * wakeup needs to be done is at the release of the
297 * exclusive lock at the end of dounmount.
298 */
299 sleep((caddr_t)mp, PVFS);
300 if (interlkp)
301 simple_lock(interlkp);
302 return (ENOENT);
303 }
304 lkflags = LK_SHARED;
305 if (interlkp)
306 lkflags |= LK_INTERLOCK;
307 if (lockmgr(&mp->mnt_lock, lkflags, interlkp, p))
308 panic("vfs_busy: unexpected lock failure");
309 return (0);
310 }
311
312 /*
313 * Free a busy filesystem.
314 */
315 void
316 vfs_unbusy(mp, p)
317 struct mount *mp;
318 struct proc *p;
319 {
320
321 lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, p);
322 }
323
324 /*
325 * Lookup a filesystem type, and if found allocate and initialize
326 * a mount structure for it.
327 *
328 * Devname is usually updated by mount(8) after booting.
329 */
330 int
331 vfs_rootmountalloc(fstypename, devname, mpp)
332 char *fstypename;
333 char *devname;
334 struct mount **mpp;
335 {
336 struct proc *p = current_proc(); /* XXX */
337 struct vfsconf *vfsp;
338 struct mount *mp;
339
340 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
341 if (!strcmp(vfsp->vfc_name, fstypename))
342 break;
343 if (vfsp == NULL)
344 return (ENODEV);
345 mp = _MALLOC_ZONE((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK);
346 bzero((char *)mp, (u_long)sizeof(struct mount));
347
348 /* Initialize the default IO constraints */
349 mp->mnt_maxreadcnt = mp->mnt_maxwritecnt = MAXPHYS;
350 mp->mnt_segreadcnt = mp->mnt_segwritecnt = 32;
351
352 lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, 0);
353 (void)vfs_busy(mp, LK_NOWAIT, 0, p);
354 LIST_INIT(&mp->mnt_vnodelist);
355 mp->mnt_vfc = vfsp;
356 mp->mnt_op = vfsp->vfc_vfsops;
357 mp->mnt_flag = MNT_RDONLY;
358 mp->mnt_vnodecovered = NULLVP;
359 vfsp->vfc_refcount++;
360 mp->mnt_stat.f_type = vfsp->vfc_typenum;
361 mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
362 strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
363 mp->mnt_stat.f_mntonname[0] = '/';
364 (void) copystr(devname, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, 0);
365 *mpp = mp;
366 return (0);
367 }
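/*
 * Illustrative (hypothetical) usage, typically from an fs-specific
 * mountroot routine; the names used here are examples only:
 *
 *	struct mount *mp;
 *	int error;
 *
 *	if ((error = vfs_rootmountalloc("hfs", rootdevname, &mp)))
 *		return (error);
 *	... have the filesystem mount mp, then link it onto mountlist
 *	    with CIRCLEQ_INSERT_TAIL(&mountlist, mp, mnt_list) ...
 */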
368
369 /*
370 * Find an appropriate filesystem to use for the root. If a filesystem
371 * has not been preselected, walk through the list of known filesystems
372 * trying those that have mountroot routines, and try them until one
373 * works or we have tried them all.
374 */
375 int
376 vfs_mountroot()
377 {
378 struct vfsconf *vfsp;
379 extern int (*mountroot)(void);
380 int error;
381
382 if (mountroot != NULL) {
383 error = (*mountroot)();
384 return (error);
385 }
386
387 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
388 if (vfsp->vfc_mountroot == NULL)
389 continue;
390 if ((error = (*vfsp->vfc_mountroot)()) == 0)
391 return (0);
392 if (error != EINVAL)
393 printf("%s_mountroot failed: %d\n", vfsp->vfc_name, error);
394 }
395 return (ENODEV);
396 }
397
398 /*
399 * Lookup a mount point by filesystem identifier.
400 */
401 struct mount *
402 vfs_getvfs(fsid)
403 fsid_t *fsid;
404 {
405 register struct mount *mp;
406
407 simple_lock(&mountlist_slock);
408 CIRCLEQ_FOREACH(mp, &mountlist, mnt_list) {
409 if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
410 mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
411 simple_unlock(&mountlist_slock);
412 return (mp);
413 }
414 }
415 simple_unlock(&mountlist_slock);
416 return ((struct mount *)0);
417 }
418
419 /*
420 * Get a new unique fsid
421 */
422 void
423 vfs_getnewfsid(mp)
424 struct mount *mp;
425 {
426 static u_short xxxfs_mntid;
427
428 fsid_t tfsid;
429 int mtype;
430
431 simple_lock(&mntid_slock);
432 mtype = mp->mnt_vfc->vfc_typenum;
433 mp->mnt_stat.f_fsid.val[0] = makedev(nblkdev + mtype, 0);
434 mp->mnt_stat.f_fsid.val[1] = mtype;
435 if (xxxfs_mntid == 0)
436 ++xxxfs_mntid;
437 tfsid.val[0] = makedev(nblkdev + mtype, xxxfs_mntid);
438 tfsid.val[1] = mtype;
439 if (!CIRCLEQ_EMPTY(&mountlist)) {
440 while (vfs_getvfs(&tfsid)) {
441 tfsid.val[0]++;
442 xxxfs_mntid++;
443 }
444 }
445 mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
446 simple_unlock(&mntid_slock);
447 }
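/*
 * To summarize the scheme above: val[1] is just the filesystem type
 * number, and val[0] starts out as a pseudo device number built with
 * makedev(nblkdev + mtype, xxxfs_mntid); the loop then increments the
 * candidate until the (val[0], val[1]) pair does not belong to any
 * currently mounted filesystem.
 */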
448
449 /*
450 * Set vnode attributes to VNOVAL
451 */
452 void
453 vattr_null(vap)
454 register struct vattr *vap;
455 {
456
457 vap->va_type = VNON;
458 vap->va_size = vap->va_bytes = VNOVAL;
459 vap->va_mode = vap->va_nlink = vap->va_uid = vap->va_gid =
460 vap->va_fsid = vap->va_fileid =
461 vap->va_blocksize = vap->va_rdev =
462 vap->va_atime.tv_sec = vap->va_atime.tv_nsec =
463 vap->va_mtime.tv_sec = vap->va_mtime.tv_nsec =
464 vap->va_ctime.tv_sec = vap->va_ctime.tv_nsec =
465 vap->va_flags = vap->va_gen = VNOVAL;
466 vap->va_vaflags = 0;
467 }
468
469 /*
470 * Routines having to do with the management of the vnode table.
471 */
472 extern int (**dead_vnodeop_p)(void *);
473 static void vclean __P((struct vnode *vp, int flag, struct proc *p));
474 extern void vgonel __P((struct vnode *vp, struct proc *p));
475 long numvnodes, freevnodes;
476 long inactivevnodes;
477 long vnode_reclaim_tried;
478 long vnode_objects_reclaimed;
479
480
481 extern struct vattr va_null;
482
483 /*
484 * Return the next vnode from the free list.
485 */
486 int
487 getnewvnode(tag, mp, vops, vpp)
488 enum vtagtype tag;
489 struct mount *mp;
490 int (**vops)(void *);
491 struct vnode **vpp;
492 {
493 struct proc *p = current_proc(); /* XXX */
494 struct vnode *vp;
495 int cnt, didretry = 0;
496 static int reused = 0; /* track the reuse rate */
497 int reclaimhits = 0;
498
499 retry:
500 simple_lock(&vnode_free_list_slock);
501 /*
502 * MALLOC a vnode if the number of vnodes has not reached the desired
503 * value and the number on the free list is still reasonable;
504 * otherwise, reuse from the freelist (even though we may evict a name
505 * cache entry) to reduce the number of vnodes that accumulate... vnodes
506 * tie up wired memory and are never garbage collected.
507 */
508 if (numvnodes < desiredvnodes && (freevnodes < (2 * VNODE_FREE_MIN))) {
509 numvnodes++;
510 simple_unlock(&vnode_free_list_slock);
511 MALLOC_ZONE(vp, struct vnode *, sizeof *vp, M_VNODE, M_WAITOK);
512 bzero((char *)vp, sizeof *vp);
513 VLISTNONE(vp); /* avoid double queue removal */
514 simple_lock_init(&vp->v_interlock);
515 goto done;
516 }
517
518 /*
519 * Once the desired number of vnodes are allocated,
520 * we start reusing the vnodes.
521 */
522 if (freevnodes < VNODE_FREE_MIN) {
523 /*
524 * if we are low on vnodes on the freelist attempt to get
525 * some back from the inactive list and VM object cache
526 */
527 simple_unlock(&vnode_free_list_slock);
528 (void)vnreclaim(vnodetarget);
529 simple_lock(&vnode_free_list_slock);
530 }
531 if (numvnodes >= desiredvnodes && reused > VNODE_TOOMANY_REUSED) {
532 reused = 0;
533 if (freevnodes < VNODE_FREE_ENOUGH) {
534 simple_unlock(&vnode_free_list_slock);
535 (void)vnreclaim(vnodetarget);
536 simple_lock(&vnode_free_list_slock);
537 }
538 }
539
540 for (cnt = 0, vp = vnode_free_list.tqh_first;
541 vp != NULLVP; cnt++, vp = vp->v_freelist.tqe_next) {
542 if (simple_lock_try(&vp->v_interlock)) {
543 /* got the interlock */
544 if (ISSET(vp->v_flag, VORECLAIM)) {
545 /* skip over the vnodes that are being reclaimed */
546 simple_unlock(&vp->v_interlock);
547 reclaimhits++;
548 } else
549 break;
550 }
551 }
552
553 /*
554 * Unless this is a bad time of the month, at most
555 * the first NCPUS items on the free list are
556 * locked, so this is close enough to being empty.
557 */
558 if (vp == NULLVP) {
559 simple_unlock(&vnode_free_list_slock);
560 if (!(didretry++) && (vnreclaim(vnodetarget) > 0))
561 goto retry;
562 tablefull("vnode");
563 log(LOG_EMERG, "%d vnodes locked, %d desired, %d numvnodes, "
564 "%d free, %d inactive, %d being reclaimed\n",
565 cnt, desiredvnodes, numvnodes, freevnodes, inactivevnodes,
566 reclaimhits);
567 *vpp = 0;
568 return (ENFILE);
569 }
570
571 if (vp->v_usecount)
572 panic("free vnode isn't: v_type = %d, v_usecount = %d?",
573 vp->v_type, vp->v_usecount);
574
575 VREMFREE("getnewvnode", vp);
576 reused++;
577 simple_unlock(&vnode_free_list_slock);
578 vp->v_lease = NULL;
579 cache_purge(vp);
580 if (vp->v_type != VBAD)
581 vgonel(vp, p); /* clean and reclaim the vnode */
582 else
583 simple_unlock(&vp->v_interlock);
584 #if DIAGNOSTIC
585 if (vp->v_data)
586 panic("cleaned vnode isn't");
587 {
588 int s = splbio();
589 if (vp->v_numoutput)
590 panic("Clean vnode has pending I/O's");
591 splx(s);
592 }
593 #endif
594 if (UBCINFOEXISTS(vp))
595 panic("getnewvnode: ubcinfo not cleaned");
596 else
597 vp->v_ubcinfo = 0;
598
599 if (vp->v_flag & VHASDIRTY)
600 cluster_release(vp);
601
602 // make sure all these fields are cleared out as the
603 // name/parent stuff uses them and assumes they're
604 // cleared to null/0.
605 if (vp->v_scmap != NULL) {
606 panic("getnewvnode: vp @ 0x%x has non-null scmap.\n", vp);
607 }
608 vp->v_un.vu_name = NULL;
609 vp->v_scdirty = 0;
610 vp->v_un1.v_cl.v_pad = 0;
611
612
613 vp->v_lastr = -1;
614 vp->v_ralen = 0;
615 vp->v_maxra = 0;
616 vp->v_ciosiz = 0;
617 vp->v_clen = 0;
618 vp->v_socket = 0;
619
620 /* we may have blocked, re-evaluate state */
621 simple_lock(&vnode_free_list_slock);
622 if (VONLIST(vp)) {
623 if (vp->v_usecount == 0)
624 VREMFREE("getnewvnode", vp);
625 else if (ISSET((vp)->v_flag, VUINACTIVE))
626 VREMINACTIVE("getnewvnode", vp);
627 }
628 simple_unlock(&vnode_free_list_slock);
629
630 done:
631 vp->v_flag = VSTANDARD;
632 vp->v_type = VNON;
633 vp->v_tag = tag;
634 vp->v_op = vops;
635 insmntque(vp, mp);
636 *vpp = vp;
637 vp->v_usecount = 1;
638 vp->v_data = 0;
639 return (0);
640 }
641
642 /*
643 * Move a vnode from one mount queue to another.
644 */
645 void
646 insmntque(vp, mp)
647 struct vnode *vp;
648 struct mount *mp;
649 {
650
651 simple_lock(&mntvnode_slock);
652 /*
653 * Delete from old mount point vnode list, if on one.
654 */
655 if (vp->v_mount != NULL)
656 LIST_REMOVE(vp, v_mntvnodes);
657 /*
658 * Insert into list of vnodes for the new mount point, if available.
659 */
660 if ((vp->v_mount = mp) != NULL)
661 LIST_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes);
662 simple_unlock(&mntvnode_slock);
663 }
664
665 __inline void
666 vpwakeup(struct vnode *vp)
667 {
668 if (vp) {
669 if (--vp->v_numoutput < 0)
670 panic("vpwakeup: neg numoutput");
671 if ((vp->v_flag & VBWAIT || vp->v_flag & VTHROTTLED)
672 && vp->v_numoutput <= 0) {
673 vp->v_flag &= ~(VBWAIT|VTHROTTLED);
674 wakeup((caddr_t)&vp->v_numoutput);
675 }
676 }
677 }
678
679 /*
680 * Update outstanding I/O count and do wakeup if requested.
681 */
682 void
683 vwakeup(bp)
684 register struct buf *bp;
685 {
686 CLR(bp->b_flags, B_WRITEINPROG);
687 vpwakeup(bp->b_vp);
688 }
689
690 /*
691 * Flush out and invalidate all buffers associated with a vnode.
692 * Called with the underlying object locked.
693 */
694 int
695 vinvalbuf(vp, flags, cred, p, slpflag, slptimeo)
696 register struct vnode *vp;
697 int flags;
698 struct ucred *cred;
699 struct proc *p;
700 int slpflag, slptimeo;
701 {
702 register struct buf *bp;
703 struct buf *nbp, *blist;
704 int s, error = 0;
705
706 if (flags & V_SAVE) {
707 if (error = VOP_FSYNC(vp, cred, MNT_WAIT, p)) {
708 return (error);
709 }
710 if (vp->v_dirtyblkhd.lh_first)
711 panic("vinvalbuf: dirty bufs (vp 0x%x, bp 0x%x)", vp, vp->v_dirtyblkhd.lh_first);
712 }
713
714 for (;;) {
715 if ((blist = vp->v_cleanblkhd.lh_first) && (flags & V_SAVEMETA))
716 while (blist && blist->b_lblkno < 0)
717 blist = blist->b_vnbufs.le_next;
718 if (!blist && (blist = vp->v_dirtyblkhd.lh_first) &&
719 (flags & V_SAVEMETA))
720 while (blist && blist->b_lblkno < 0)
721 blist = blist->b_vnbufs.le_next;
722 if (!blist)
723 break;
724
725 for (bp = blist; bp; bp = nbp) {
726 nbp = bp->b_vnbufs.le_next;
727 if ((flags & V_SAVEMETA) && bp->b_lblkno < 0)
728 continue;
729 s = splbio();
730 if (ISSET(bp->b_flags, B_BUSY)) {
731 SET(bp->b_flags, B_WANTED);
732 error = tsleep((caddr_t)bp,
733 slpflag | (PRIBIO + 1), "vinvalbuf",
734 slptimeo);
735 splx(s);
736 if (error) {
737 return (error);
738 }
739 break;
740 }
741 bremfree(bp);
742 SET(bp->b_flags, B_BUSY);
743 splx(s);
744 /*
745 * XXX Since there are no node locks for NFS, I believe
746 * there is a slight chance that a delayed write will
747 * occur while sleeping just above, so check for it.
748 */
749 if (ISSET(bp->b_flags, B_DELWRI) && (flags & V_SAVE)) {
750 (void) VOP_BWRITE(bp);
751 break;
752 }
753
754 if (bp->b_flags & B_LOCKED) {
755 panic("vinvalbuf: bp @ 0x%x is locked!", bp);
756 break;
757 } else {
758 SET(bp->b_flags, B_INVAL);
759 }
760 brelse(bp);
761 }
762 }
763 if (!(flags & V_SAVEMETA) &&
764 (vp->v_dirtyblkhd.lh_first || vp->v_cleanblkhd.lh_first))
765 panic("vinvalbuf: flush failed");
766 return (0);
767 }
768
769 /*
770 * Create a vnode for a block device.
771 * Used for root filesystem, argdev, and swap areas.
772 * Also used for memory file system special devices.
773 */
774 int
775 bdevvp(dev, vpp)
776 dev_t dev;
777 struct vnode **vpp;
778 {
779 register struct vnode *vp;
780 struct vnode *nvp;
781 int error;
782
783 if (dev == NODEV) {
784 *vpp = NULLVP;
785 return (ENODEV);
786 }
787 error = getnewvnode(VT_NON, (struct mount *)0, spec_vnodeop_p, &nvp);
788 if (error) {
789 *vpp = NULLVP;
790 return (error);
791 }
792 vp = nvp;
793 vp->v_type = VBLK;
794 if (nvp = checkalias(vp, dev, (struct mount *)0)) {
795 vput(vp);
796 vp = nvp;
797 }
798 *vpp = vp;
799 return (0);
800 }
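/*
 * For example (illustration only, not part of this file), the root
 * device's vnode is normally obtained early in boot with something like
 * bdevvp(rootdev, &rootvp) before the root filesystem is mounted.
 */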
801
802 /*
803 * Check to see if the new vnode represents a special device
804 * for which we already have a vnode (either because of
805 * bdevvp() or because of a different vnode representing
806 * the same block device). If such an alias exists, deallocate
807 * the existing contents and return the aliased vnode. The
808 * caller is responsible for filling it with its new contents.
809 */
810 struct vnode *
811 checkalias(nvp, nvp_rdev, mp)
812 register struct vnode *nvp;
813 dev_t nvp_rdev;
814 struct mount *mp;
815 {
816 struct proc *p = current_proc(); /* XXX */
817 struct vnode *vp;
818 struct vnode **vpp;
819 struct specinfo *specinfop;
820
821 if (nvp->v_type != VBLK && nvp->v_type != VCHR)
822 return (NULLVP);
823
824 MALLOC_ZONE(specinfop, struct specinfo *, sizeof(struct specinfo),
825 M_SPECINFO, M_WAITOK);
826 vpp = &speclisth[SPECHASH(nvp_rdev)];
827 loop:
828 simple_lock(&spechash_slock);
829 for (vp = *vpp; vp; vp = vp->v_specnext) {
830 if (nvp_rdev != vp->v_rdev || nvp->v_type != vp->v_type)
831 continue;
832 /*
833 * Alias, but not in use, so flush it out.
834 */
835 simple_lock(&vp->v_interlock);
836 if (vp->v_usecount == 0) {
837 simple_unlock(&spechash_slock);
838 vgonel(vp, p);
839 goto loop;
840 }
841 if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, p)) {
842 simple_unlock(&spechash_slock);
843 goto loop;
844 }
845 break;
846 }
847 if (vp == NULL || vp->v_tag != VT_NON) {
848 nvp->v_specinfo = specinfop;
849 specinfop = 0; /* buffer used */
850 bzero(nvp->v_specinfo, sizeof(struct specinfo));
851 nvp->v_rdev = nvp_rdev;
852 nvp->v_hashchain = vpp;
853 nvp->v_specnext = *vpp;
854 nvp->v_specflags = 0;
855 simple_unlock(&spechash_slock);
856 *vpp = nvp;
857 if (vp != NULLVP) {
858 nvp->v_flag |= VALIASED;
859 vp->v_flag |= VALIASED;
860 vput(vp);
861 }
862 /* Since buffer is used just return */
863 return (NULLVP);
864 }
865 simple_unlock(&spechash_slock);
866 VOP_UNLOCK(vp, 0, p);
867 simple_lock(&vp->v_interlock);
868 vclean(vp, 0, p);
869 vp->v_op = nvp->v_op;
870 vp->v_tag = nvp->v_tag;
871 nvp->v_type = VNON;
872 insmntque(vp, mp);
873 if (specinfop)
874 FREE_ZONE((void *)specinfop, sizeof(struct specinfo), M_SPECINFO);
875 return (vp);
876 }
877
878 /*
879 * Get a reference on a particular vnode and lock it if requested.
880 * If the vnode was on the inactive list, remove it from the list.
881 * If the vnode was on the free list, remove it from the list and
882 * move it to inactive list as needed.
883 * The vnode lock bit is set if the vnode is being eliminated in
884 * vgone. The process is awakened when the transition is completed,
885 * and an error returned to indicate that the vnode is no longer
886 * usable (possibly having been changed to a new file system type).
887 */
888 int
889 vget(vp, flags, p)
890 struct vnode *vp;
891 int flags;
892 struct proc *p;
893 {
894 int error = 0;
895 u_long vpid;
896
897 vpid = vp->v_id; // save off the original v_id
898
899 retry:
900
901 /*
902 * If the vnode is in the process of being cleaned out for
903 * another use, we wait for the cleaning to finish and then
904 * return failure. Cleaning is determined by checking that
905 * the VXLOCK flag is set.
906 */
907 if ((flags & LK_INTERLOCK) == 0)
908 simple_lock(&vp->v_interlock);
909 if ((vp->v_flag & VXLOCK) || (vp->v_flag & VORECLAIM)) {
910 vp->v_flag |= VXWANT;
911 simple_unlock(&vp->v_interlock);
912 (void)tsleep((caddr_t)vp, PINOD, "vget", 0);
913 return (ENOENT);
914 }
915
916 /*
917 * vnode is being terminated.
918 * wait for vnode_pager_no_senders() to clear VTERMINATE
919 */
920 if (ISSET(vp->v_flag, VTERMINATE)) {
921 SET(vp->v_flag, VTERMWANT);
922 simple_unlock(&vp->v_interlock);
923 (void)tsleep((caddr_t)&vp->v_ubcinfo, PINOD, "vget1", 0);
924 return (ENOENT);
925 }
926
927 /*
928 * if the vnode is being initialized,
929 * wait for it to finish initialization
930 */
931 if (ISSET(vp->v_flag, VUINIT)) {
932 SET(vp->v_flag, VUWANT);
933 simple_unlock(&vp->v_interlock);
934 (void) tsleep((caddr_t)vp, PINOD, "vget2", 0);
935 goto retry;
936 }
937
938 simple_lock(&vnode_free_list_slock);
939 if (VONLIST(vp)) {
940 if (vp->v_usecount == 0)
941 VREMFREE("vget", vp);
942 else if (ISSET((vp)->v_flag, VUINACTIVE))
943 VREMINACTIVE("vget", vp);
944 }
945 simple_unlock(&vnode_free_list_slock);
946
947 if (++vp->v_usecount <= 0)
948 panic("vget: v_usecount");
949
950 /*
951 * Recover named reference as needed
952 */
953 if (UBCISVALID(vp) && !ubc_issetflags(vp, UI_HASOBJREF)) {
954 simple_unlock(&vp->v_interlock);
955 if (ubc_getobject(vp, UBC_HOLDOBJECT) == MEMORY_OBJECT_CONTROL_NULL) {
956 error = ENOENT;
957 goto errout;
958 }
959 simple_lock(&vp->v_interlock);
960 }
961
962 if (flags & LK_TYPE_MASK) {
963 if (error = vn_lock(vp, flags | LK_INTERLOCK, p))
964 goto errout;
965 if (vpid != vp->v_id) { // make sure it's still the same vnode
966 vput(vp);
967 return ENOENT;
968 }
969 return (0);
970 }
971
972 if ((flags & LK_INTERLOCK) == 0)
973 simple_unlock(&vp->v_interlock);
974
975 if (vpid != vp->v_id) { // make sure it's still the same vnode
976 vrele(vp);
977 return ENOENT;
978 }
979
980 return (0);
981
982 errout:
983 simple_lock(&vp->v_interlock);
984
985 /*
986 * we may have blocked. Re-evaluate the state
987 */
988 simple_lock(&vnode_free_list_slock);
989 if (VONLIST(vp)) {
990 if (vp->v_usecount == 0)
991 VREMFREE("vget", vp);
992 else if (ISSET((vp)->v_flag, VUINACTIVE))
993 VREMINACTIVE("vget", vp);
994 }
995 simple_unlock(&vnode_free_list_slock);
996
997 /*
998 * If the vnode was not active in the first place, we
999 * must not call vrele(), as VOP_INACTIVE() is not
1000 * required.
1001 * So the relevant part of vrele() is inlined here.
1002 */
1003 if (--vp->v_usecount == 1) {
1004 if (UBCINFOEXISTS(vp)) {
1005 vinactive(vp);
1006 simple_unlock(&vp->v_interlock);
1007 return (error);
1008 }
1009 }
1010 if (vp->v_usecount > 0) {
1011 simple_unlock(&vp->v_interlock);
1012 return (error);
1013 }
1014 if (vp->v_usecount < 0)
1015 panic("vget: negative usecount (%d)", vp->v_usecount);
1016 vfree(vp);
1017 simple_unlock(&vp->v_interlock);
1018 return (error);
1019 }
1020
1021 /*
1022 * Get a pager reference on the particular vnode.
1023 *
1024 * This is called from ubc_info_init() and it is assumed that
1025 * the vnode is not on the free list.
1026 * It is also assumed that the vnode is neither being recycled
1027 * by vgonel nor being terminated by vnode_pager_vrele().
1028 *
1029 * The vnode interlock is NOT held by the caller.
1030 */
1031 __private_extern__ int
1032 vnode_pager_vget(vp)
1033 struct vnode *vp;
1034 {
1035 simple_lock(&vp->v_interlock);
1036
1037 UBCINFOCHECK("vnode_pager_vget", vp);
1038
1039 if (ISSET(vp->v_flag, (VXLOCK|VORECLAIM|VTERMINATE)))
1040 panic("%s: dying vnode", "vnode_pager_vget");
1041
1042 simple_lock(&vnode_free_list_slock);
1043 /* The vnode should not be on free list */
1044 if (VONLIST(vp)) {
1045 if (vp->v_usecount == 0)
1046 panic("%s: still on list", "vnode_pager_vget");
1047 else if (ISSET((vp)->v_flag, VUINACTIVE))
1048 VREMINACTIVE("vnode_pager_vget", vp);
1049 }
1050
1051 /* The vnode should not be on the inactive list here */
1052 simple_unlock(&vnode_free_list_slock);
1053
1054 /* After all those checks, now do the real work :-) */
1055 if (++vp->v_usecount <= 0)
1056 panic("vnode_pager_vget: v_usecount");
1057 simple_unlock(&vp->v_interlock);
1058
1059 return (0);
1060 }
1061
1062 /*
1063 * Stubs to use when there is no locking to be done on the underlying object.
1064 * A minimal shared lock is necessary to ensure that the underlying object
1065 * is not revoked while an operation is in progress. So, an active shared
1066 * count is maintained in an auxiliary vnode lock structure.
1067 */
1068 int
1069 vop_nolock(ap)
1070 struct vop_lock_args /* {
1071 struct vnode *a_vp;
1072 int a_flags;
1073 struct proc *a_p;
1074 } */ *ap;
1075 {
1076 #ifdef notyet
1077 /*
1078 * This code cannot be used until all the non-locking filesystems
1079 * (notably NFS) are converted to properly lock and release nodes.
1080 * Also, certain vnode operations change the locking state within
1081 * the operation (create, mknod, remove, link, rename, mkdir, rmdir,
1082 * and symlink). Ideally these operations should not change the
1083 * lock state, but should be changed to let the caller of the
1084 * function unlock them. Otherwise all intermediate vnode layers
1085 * (such as union, umapfs, etc) must catch these functions to do
1086 * the necessary locking at their layer. Note that the inactive
1087 * and lookup operations also change their lock state, but this
1088 * cannot be avoided, so these two operations will always need
1089 * to be handled in intermediate layers.
1090 */
1091 struct vnode *vp = ap->a_vp;
1092 int vnflags, flags = ap->a_flags;
1093
1094 if (vp->v_vnlock == NULL) {
1095 if ((flags & LK_TYPE_MASK) == LK_DRAIN)
1096 return (0);
1097 MALLOC(vp->v_vnlock, struct lock__bsd__ *,
1098 sizeof(struct lock__bsd__), M_TEMP, M_WAITOK);
1099 lockinit(vp->v_vnlock, PVFS, "vnlock", 0, 0);
1100 }
1101 switch (flags & LK_TYPE_MASK) {
1102 case LK_DRAIN:
1103 vnflags = LK_DRAIN;
1104 break;
1105 case LK_EXCLUSIVE:
1106 case LK_SHARED:
1107 vnflags = LK_SHARED;
1108 break;
1109 case LK_UPGRADE:
1110 case LK_EXCLUPGRADE:
1111 case LK_DOWNGRADE:
1112 return (0);
1113 case LK_RELEASE:
1114 default:
1115 panic("vop_nolock: bad operation %d", flags & LK_TYPE_MASK);
1116 }
1117 if (flags & LK_INTERLOCK)
1118 vnflags |= LK_INTERLOCK;
1119 return(lockmgr(vp->v_vnlock, vnflags, &vp->v_interlock, ap->a_p));
1120 #else /* for now */
1121 /*
1122 * Since we are not using the lock manager, we must clear
1123 * the interlock here.
1124 */
1125 if (ap->a_flags & LK_INTERLOCK)
1126 simple_unlock(&ap->a_vp->v_interlock);
1127 return (0);
1128 #endif
1129 }
1130
1131 /*
1132 * Decrement the active use count.
1133 */
1134 int
1135 vop_nounlock(ap)
1136 struct vop_unlock_args /* {
1137 struct vnode *a_vp;
1138 int a_flags;
1139 struct proc *a_p;
1140 } */ *ap;
1141 {
1142 struct vnode *vp = ap->a_vp;
1143
1144 if (vp->v_vnlock == NULL)
1145 return (0);
1146 return (lockmgr(vp->v_vnlock, LK_RELEASE, NULL, ap->a_p));
1147 }
1148
1149 /*
1150 * Return whether or not the node is locked.
1151 */
1152 int
1153 vop_noislocked(ap)
1154 struct vop_islocked_args /* {
1155 struct vnode *a_vp;
1156 } */ *ap;
1157 {
1158 struct vnode *vp = ap->a_vp;
1159
1160 if (vp->v_vnlock == NULL)
1161 return (0);
1162 return (lockstatus(vp->v_vnlock));
1163 }
1164
1165 /*
1166 * Vnode reference.
1167 */
1168 void
1169 vref(vp)
1170 struct vnode *vp;
1171 {
1172
1173 simple_lock(&vp->v_interlock);
1174 if (vp->v_usecount <= 0)
1175 panic("vref used where vget required");
1176
1177 /* If on the inactive list, remove it from there */
1178 simple_lock(&vnode_free_list_slock);
1179 if (ISSET((vp)->v_flag, VUINACTIVE))
1180 VREMINACTIVE("vref", vp);
1181 simple_unlock(&vnode_free_list_slock);
1182
1183 if (++vp->v_usecount <= 0)
1184 panic("vref v_usecount");
1185 simple_unlock(&vp->v_interlock);
1186 }
1187
1188 static void
1189 clean_up_name_parent_ptrs(struct vnode *vp)
1190 {
1191 if (VNAME(vp) || VPARENT(vp)) {
1192 char *tmp1;
1193 struct vnode *tmp2;
1194
1195 // do it this way so we don't block before clearing
1196 // these fields.
1197 tmp1 = VNAME(vp);
1198 tmp2 = VPARENT(vp);
1199 VNAME(vp) = NULL;
1200 VPARENT(vp) = NULL;
1201
1202 if (tmp1) {
1203 remove_name(tmp1);
1204 }
1205
1206 if (tmp2) {
1207 vrele(tmp2);
1208 }
1209 }
1210 }
1211
1212
1213 /*
1214 * put the vnode on the appropriate free list.
1215 * called with v_interlock held.
1216 */
1217 static void
1218 vfree(vp)
1219 struct vnode *vp;
1220 {
1221 funnel_t *curflock;
1222 extern int disable_funnel;
1223
1224 if ((curflock = thread_funnel_get()) != kernel_flock &&
1225 !(disable_funnel && curflock != THR_FUNNEL_NULL))
1226 panic("Entering vfree() without kernel funnel");
1227
1228 /*
1229 * if the vnode is not obtained by calling getnewvnode() we
1230 * are not responsible for the cleanup. Just return.
1231 */
1232 if (!(vp->v_flag & VSTANDARD)) {
1233 return;
1234 }
1235
1236 if (vp->v_usecount != 0)
1237 panic("vfree: v_usecount");
1238
1239 /* insert at tail of LRU list or at head if VAGE is set */
1240 simple_lock(&vnode_free_list_slock);
1241
1242 // make sure the name & parent pointers get cleared out
1243 // clean_up_name_parent_ptrs(vp);
1244
1245 if (VONLIST(vp))
1246 panic("%s: vnode still on list", "vfree");
1247
1248 if (vp->v_flag & VAGE) {
1249 TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
1250 vp->v_flag &= ~VAGE;
1251 } else
1252 TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
1253 freevnodes++;
1254 simple_unlock(&vnode_free_list_slock);
1255 return;
1256 }
1257
1258 /*
1259 * put the vnode on the inactive list.
1260 * called with v_interlock held
1261 */
1262 static void
1263 vinactive(vp)
1264 struct vnode *vp;
1265 {
1266 funnel_t *curflock;
1267 extern int disable_funnel;
1268
1269 if ((curflock = thread_funnel_get()) != kernel_flock &&
1270 !(disable_funnel && curflock != THR_FUNNEL_NULL))
1271 panic("Entering vinactive() without kernel funnel");
1272
1273 if (!UBCINFOEXISTS(vp))
1274 panic("vinactive: not a UBC vnode");
1275
1276 if (vp->v_usecount != 1)
1277 panic("vinactive: v_usecount");
1278
1279 simple_lock(&vnode_free_list_slock);
1280
1281 if (VONLIST(vp))
1282 panic("%s: vnode still on list", "vinactive");
1283 VINACTIVECHECK("vinactive", vp, 0);
1284
1285 TAILQ_INSERT_TAIL(&vnode_inactive_list, vp, v_freelist);
1286 SET(vp->v_flag, VUINACTIVE);
1287 CLR(vp->v_flag, (VNOCACHE_DATA | VRAOFF));
1288
1289 inactivevnodes++;
1290 simple_unlock(&vnode_free_list_slock);
1291 return;
1292 }
1293
1294
1295 /*
1296 * vput(), just unlock and vrele()
1297 */
1298 void
1299 vput(vp)
1300 struct vnode *vp;
1301 {
1302 struct proc *p = current_proc(); /* XXX */
1303
1304 simple_lock(&vp->v_interlock);
1305 if (--vp->v_usecount == 1) {
1306 if (UBCINFOEXISTS(vp)) {
1307 vinactive(vp);
1308 simple_unlock(&vp->v_interlock);
1309 VOP_UNLOCK(vp, 0, p);
1310 return;
1311 }
1312 }
1313 if (vp->v_usecount > 0) {
1314 simple_unlock(&vp->v_interlock);
1315 VOP_UNLOCK(vp, 0, p);
1316 return;
1317 }
1318 #if DIAGNOSTIC
1319 if (vp->v_usecount < 0 || vp->v_writecount != 0) {
1320 vprint("vput: bad ref count", vp);
1321 panic("vput: v_usecount = %d, v_writecount = %d",
1322 vp->v_usecount, vp->v_writecount);
1323 }
1324 #endif
1325 simple_lock(&vnode_free_list_slock);
1326 if (ISSET((vp)->v_flag, VUINACTIVE))
1327 VREMINACTIVE("vput", vp);
1328 simple_unlock(&vnode_free_list_slock);
1329
1330 simple_unlock(&vp->v_interlock);
1331 VOP_INACTIVE(vp, p);
1332 /*
1333 * The interlock is not held and
1334 * VOP_INACTIVE releases the vnode lock.
1335 * We could block and the vnode might get reactivated,
1336 * so we cannot just call vfree() without checking the state.
1337 */
1338 simple_lock(&vp->v_interlock);
1339 if (!VONLIST(vp)) {
1340 if (vp->v_usecount == 0)
1341 vfree(vp);
1342 else if ((vp->v_usecount == 1) && UBCINFOEXISTS(vp))
1343 vinactive(vp);
1344 }
1345 simple_unlock(&vp->v_interlock);
1346 }
1347
1348 /*
1349 * Vnode release.
1350 * If count drops to zero, call inactive routine and return to freelist.
1351 */
1352 void
1353 vrele(vp)
1354 struct vnode *vp;
1355 {
1356 struct proc *p = current_proc(); /* XXX */
1357 funnel_t *curflock;
1358 extern int disable_funnel;
1359
1360 if ((curflock = thread_funnel_get()) != kernel_flock &&
1361 !(disable_funnel && curflock != THR_FUNNEL_NULL))
1362 panic("Entering vrele() without kernel funnel");
1363
1364 simple_lock(&vp->v_interlock);
1365 if (--vp->v_usecount == 1) {
1366 if (UBCINFOEXISTS(vp)) {
1367 if ((vp->v_flag & VXLOCK) == 0)
1368 vinactive(vp);
1369 simple_unlock(&vp->v_interlock);
1370 return;
1371 }
1372 }
1373 if (vp->v_usecount > 0) {
1374 simple_unlock(&vp->v_interlock);
1375 return;
1376 }
1377 #if DIAGNOSTIC
1378 if (vp->v_usecount < 0 || vp->v_writecount != 0) {
1379 vprint("vrele: bad ref count", vp);
1380 panic("vrele: ref cnt");
1381 }
1382 #endif
1383
1384 if ((vp->v_flag & VXLOCK) || (vp->v_flag & VORECLAIM)) {
1385 /* vnode is being cleaned, just return */
1386 vfree(vp);
1387 simple_unlock(&vp->v_interlock);
1388 return;
1389 }
1390
1391 if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, p) == 0) {
1392 VOP_INACTIVE(vp, p);
1393 /*
1394 * vn_lock() releases the interlock and
1395 * VOP_INACTIVE releases the vnode lock.
1396 * We could block and the vnode might get reactivated,
1397 * so we cannot just call vfree() without checking the state.
1398 */
1399 simple_lock(&vp->v_interlock);
1400 if (!VONLIST(vp)) {
1401 if (vp->v_usecount == 0)
1402 vfree(vp);
1403 else if ((vp->v_usecount == 1) && UBCINFOEXISTS(vp))
1404 vinactive(vp);
1405 }
1406 simple_unlock(&vp->v_interlock);
1407 }
1408 #if 0
1409 else {
1410 vfree(vp);
1411 simple_unlock(&vp->v_interlock);
1412 kprintf("vrele: vn_lock() failed for vp = 0x%08x\n", vp);
1413 }
1414 #endif
1415 }
1416
1417 void
1418 vagevp(vp)
1419 struct vnode *vp;
1420 {
1421 simple_lock(&vp->v_interlock);
1422 vp->v_flag |= VAGE;
1423 simple_unlock(&vp->v_interlock);
1424 return;
1425 }
1426
1427 /*
1428 * Page or buffer structure gets a reference.
1429 */
1430 void
1431 vhold(vp)
1432 register struct vnode *vp;
1433 {
1434
1435 simple_lock(&vp->v_interlock);
1436 vp->v_holdcnt++;
1437 simple_unlock(&vp->v_interlock);
1438 }
1439
1440 /*
1441 * Page or buffer structure frees a reference.
1442 */
1443 void
1444 holdrele(vp)
1445 register struct vnode *vp;
1446 {
1447
1448 simple_lock(&vp->v_interlock);
1449 if (vp->v_holdcnt <= 0)
1450 panic("holdrele: holdcnt");
1451 vp->v_holdcnt--;
1452 simple_unlock(&vp->v_interlock);
1453 }
1454
1455 /*
1456 * Remove any vnodes in the vnode table belonging to mount point mp.
1457 *
1458 * If MNT_NOFORCE is specified, there should not be any active ones,
1459 * return error if any are found (nb: this is a user error, not a
1460 * system error). If MNT_FORCE is specified, detach any active vnodes
1461 * that are found.
1462 */
1463 #if DIAGNOSTIC
1464 int busyprt = 0; /* print out busy vnodes */
1465 #if 0
1466 struct ctldebug debug1 = { "busyprt", &busyprt };
1467 #endif /* 0 */
1468 #endif
1469
1470 int
1471 vflush(mp, skipvp, flags)
1472 struct mount *mp;
1473 struct vnode *skipvp;
1474 int flags;
1475 {
1476 struct proc *p = current_proc();
1477 struct vnode *vp, *nvp;
1478 int busy = 0;
1479
1480 simple_lock(&mntvnode_slock);
1481 loop:
1482 for (vp = mp->mnt_vnodelist.lh_first; vp; vp = nvp) {
1483 if (vp->v_mount != mp)
1484 goto loop;
1485 nvp = vp->v_mntvnodes.le_next;
1486 /*
1487 * Skip over a selected vnode.
1488 */
1489 if (vp == skipvp)
1490 continue;
1491
1492 simple_lock(&vp->v_interlock);
1493 /*
1494 * Skip over vnodes marked VSYSTEM or VNOFLUSH.
1495 */
1496 if ((flags & SKIPSYSTEM) && ((vp->v_flag & VSYSTEM) || (vp->v_flag & VNOFLUSH))) {
1497 simple_unlock(&vp->v_interlock);
1498 continue;
1499 }
1500 /*
1501 * Skip over vnodes marked VSWAP.
1502 */
1503 if ((flags & SKIPSWAP) && (vp->v_flag & VSWAP)) {
1504 simple_unlock(&vp->v_interlock);
1505 continue;
1506 }
1507 /*
1508 * If WRITECLOSE is set, only flush out regular file
1509 * vnodes open for writing.
1510 */
1511 if ((flags & WRITECLOSE) &&
1512 (vp->v_writecount == 0 || vp->v_type != VREG)) {
1513 simple_unlock(&vp->v_interlock);
1514 continue;
1515 }
1516 /*
1517 * With v_usecount == 0, all we need to do is clear
1518 * out the vnode data structures and we are done.
1519 */
1520 if (vp->v_usecount == 0) {
1521 simple_unlock(&mntvnode_slock);
1522 vgonel(vp, p);
1523 simple_lock(&mntvnode_slock);
1524 continue;
1525 }
1526 /*
1527 * If FORCECLOSE is set, forcibly close the vnode.
1528 * For block or character devices, revert to an
1529 * anonymous device. For all other files, just kill them.
1530 */
1531 if (flags & FORCECLOSE) {
1532 simple_unlock(&mntvnode_slock);
1533 if (vp->v_type != VBLK && vp->v_type != VCHR) {
1534 vgonel(vp, p);
1535 } else {
1536 vclean(vp, 0, p);
1537 vp->v_op = spec_vnodeop_p;
1538 insmntque(vp, (struct mount *)0);
1539 }
1540 simple_lock(&mntvnode_slock);
1541 continue;
1542 }
1543 #if DIAGNOSTIC
1544 if (busyprt)
1545 vprint("vflush: busy vnode", vp);
1546 #endif
1547 simple_unlock(&vp->v_interlock);
1548 busy++;
1549 }
1550 simple_unlock(&mntvnode_slock);
1551 if (busy && ((flags & FORCECLOSE)==0))
1552 return (EBUSY);
1553 return (0);
1554 }
1555
1556 /*
1557 * Disassociate the underlying file system from a vnode.
1558 * The vnode interlock is held on entry.
1559 */
1560 static void
1561 vclean(vp, flags, p)
1562 struct vnode *vp;
1563 int flags;
1564 struct proc *p;
1565 {
1566 int active;
1567 int didhold;
1568
1569 /*
1570 * if the vnode is not obtained by calling getnewvnode() we
1571 * are not responsible for the cleanup. Just return.
1572 */
1573 if (!(vp->v_flag & VSTANDARD)) {
1574 simple_unlock(&vp->v_interlock);
1575 return;
1576 }
1577
1578 /*
1579 * Check to see if the vnode is in use.
1580 * If so we have to reference it before we clean it out
1581 * so that its count cannot fall to zero and generate a
1582 * race against ourselves to recycle it.
1583 */
1584 if (active = vp->v_usecount) {
1585 /*
1586 * An active vnode can not be on the free list.
1587 * We are about to take an extra reference on this vnode,
1588 * so do the queue management as needed.
1589 * Not doing so can cause a "still on list" or
1590 * "vnreclaim: v_usecount" panic if VOP_LOCK() blocks.
1591 */
1592 simple_lock(&vnode_free_list_slock);
1593 if (ISSET((vp)->v_flag, VUINACTIVE))
1594 VREMINACTIVE("vclean", vp);
1595 simple_unlock(&vnode_free_list_slock);
1596
1597 if (++vp->v_usecount <= 0)
1598 panic("vclean: v_usecount");
1599 }
1600
1601 /*
1602 * Prevent the vnode from being recycled or
1603 * brought into use while we clean it out.
1604 */
1605 if (vp->v_flag & VXLOCK)
1606 panic("vclean: deadlock");
1607 vp->v_flag |= VXLOCK;
1608
1609 /*
1610 * Even if the count is zero, the VOP_INACTIVE routine may still
1611 * have the object locked while it cleans it out. The VOP_LOCK
1612 * ensures that the VOP_INACTIVE routine is done with its work.
1613 * For active vnodes, it ensures that no other activity can
1614 * occur while the underlying object is being cleaned out.
1615 */
1616 VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, p);
1617
1618 /*
1619 * While blocked in VOP_LOCK() someone could have dropped
1620 * reference[s] and we could land on the inactive list.
1621 * if this vnode is on the inactive list
1622 * take it off the list.
1623 */
1624 simple_lock(&vnode_free_list_slock);
1625 if (ISSET((vp)->v_flag, VUINACTIVE))
1626 VREMINACTIVE("vclean", vp);
1627 simple_unlock(&vnode_free_list_slock);
1628
1629 /* If the vnode was in active use, close it. */
1630 if (active && (flags & DOCLOSE))
1631 VOP_CLOSE(vp, IO_NDELAY, NOCRED, p);
1632
1633 /* Clean the pages in VM. */
1634 didhold = ubc_hold(vp);
1635 if ((active) && (didhold))
1636 (void)ubc_clean(vp, 0); /* do not invalidate */
1637
1638 /*
1639 * Clean out any buffers associated with the vnode.
1640 */
1641 if (flags & DOCLOSE) {
1642 if (vp->v_tag == VT_NFS)
1643 nfs_vinvalbuf(vp, V_SAVE, NOCRED, p, 0);
1644 else
1645 vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0);
1646 }
1647
1648 if (active)
1649 VOP_INACTIVE(vp, p);
1650 else
1651 VOP_UNLOCK(vp, 0, p);
1652
1653 /* Destroy ubc named reference */
1654 if (didhold) {
1655 ubc_rele(vp);
1656 ubc_destroy_named(vp);
1657 }
1658 /*
1659 * Make sure vp isn't on the inactive list.
1660 */
1661 simple_lock(&vnode_free_list_slock);
1662 if (ISSET((vp)->v_flag, VUINACTIVE)) {
1663 VREMINACTIVE("vclean", vp);
1664 }
1665 simple_unlock(&vnode_free_list_slock);
1666
1667 /*
1668 * Reclaim the vnode.
1669 */
1670 if (VOP_RECLAIM(vp, p))
1671 panic("vclean: cannot reclaim");
1672
1673 // make sure the name & parent ptrs get cleaned out!
1674 clean_up_name_parent_ptrs(vp);
1675
1676 cache_purge(vp);
1677 if (vp->v_vnlock) {
1678 struct lock__bsd__ *tmp = vp->v_vnlock;
1679 if ((tmp->lk_flags & LK_DRAINED) == 0)
1680 vprint("vclean: lock not drained", vp);
1681 vp->v_vnlock = NULL;
1682 FREE(tmp, M_TEMP);
1683 }
1684
1685 /* It's dead, Jim! */
1686 vp->v_op = dead_vnodeop_p;
1687 vp->v_tag = VT_NON;
1688
1689 insmntque(vp, (struct mount *)0);
1690
1691 /*
1692 * Done with purge, notify sleepers of the grim news.
1693 */
1694 vp->v_flag &= ~VXLOCK;
1695 if (vp->v_flag & VXWANT) {
1696 vp->v_flag &= ~VXWANT;
1697 wakeup((caddr_t)vp);
1698 }
1699
1700 if (active)
1701 vrele(vp);
1702 }
1703
1704 /*
1705 * Eliminate all activity associated with the requested vnode
1706 * and with all vnodes aliased to the requested vnode.
1707 */
1708 int
1709 vop_revoke(ap)
1710 struct vop_revoke_args /* {
1711 struct vnode *a_vp;
1712 int a_flags;
1713 } */ *ap;
1714 {
1715 struct vnode *vp, *vq;
1716 struct proc *p = current_proc();
1717
1718 #if DIAGNOSTIC
1719 if ((ap->a_flags & REVOKEALL) == 0)
1720 panic("vop_revoke");
1721 #endif
1722
1723 vp = ap->a_vp;
1724 simple_lock(&vp->v_interlock);
1725
1726 if (vp->v_flag & VALIASED) {
1727 /*
1728 * If a vgone (or vclean) is already in progress,
1729 * wait until it is done and return.
1730 */
1731 if (vp->v_flag & VXLOCK) {
1732 while (vp->v_flag & VXLOCK) {
1733 vp->v_flag |= VXWANT;
1734 simple_unlock(&vp->v_interlock);
1735 (void)tsleep((caddr_t)vp, PINOD, "vop_revokeall", 0);
1736 }
1737 return (0);
1738 }
1739 /*
1740 * Ensure that vp will not be vgone'd while we
1741 * are eliminating its aliases.
1742 */
1743 vp->v_flag |= VXLOCK;
1744 simple_unlock(&vp->v_interlock);
1745 while (vp->v_flag & VALIASED) {
1746 simple_lock(&spechash_slock);
1747 for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
1748 if (vq->v_rdev != vp->v_rdev ||
1749 vq->v_type != vp->v_type || vp == vq)
1750 continue;
1751 simple_unlock(&spechash_slock);
1752 vgone(vq);
1753 break;
1754 }
1755 if (vq == NULLVP)
1756 simple_unlock(&spechash_slock);
1757 }
1758 /*
1759 * Remove the lock so that vgone below will
1760 * really eliminate the vnode after which time
1761 * vgone will awaken any sleepers.
1762 */
1763 simple_lock(&vp->v_interlock);
1764 vp->v_flag &= ~VXLOCK;
1765 }
1766 vgonel(vp, p);
1767 return (0);
1768 }
1769
1770 /*
1771 * Recycle an unused vnode to the front of the free list.
1772 * Release the passed interlock if the vnode will be recycled.
1773 */
1774 int
1775 vrecycle(vp, inter_lkp, p)
1776 struct vnode *vp;
1777 struct slock *inter_lkp;
1778 struct proc *p;
1779 {
1780
1781 simple_lock(&vp->v_interlock);
1782 if (vp->v_usecount == 0) {
1783 if (inter_lkp)
1784 simple_unlock(inter_lkp);
1785 vgonel(vp, p);
1786 return (1);
1787 }
1788 simple_unlock(&vp->v_interlock);
1789 return (0);
1790 }
1791
1792 /*
1793 * Eliminate all activity associated with a vnode
1794 * in preparation for reuse.
1795 */
1796 void
1797 vgone(vp)
1798 struct vnode *vp;
1799 {
1800 struct proc *p = current_proc();
1801
1802 simple_lock(&vp->v_interlock);
1803 vgonel(vp, p);
1804 }
1805
1806 /*
1807 * vgone, with the vp interlock held.
1808 */
1809 void
1810 vgonel(vp, p)
1811 struct vnode *vp;
1812 struct proc *p;
1813 {
1814 struct vnode *vq;
1815 struct vnode *vx;
1816
1817 /*
1818 * if the vnode is not obtained by calling getnewvnode() we
1819 * are not responsible for the cleanup. Just return.
1820 */
1821 if (!(vp->v_flag & VSTANDARD)) {
1822 simple_unlock(&vp->v_interlock);
1823 return;
1824 }
1825
1826 /*
1827 * If a vgone (or vclean) is already in progress,
1828 * wait until it is done and return.
1829 */
1830 if (vp->v_flag & VXLOCK) {
1831 while (vp->v_flag & VXLOCK) {
1832 vp->v_flag |= VXWANT;
1833 simple_unlock(&vp->v_interlock);
1834 (void)tsleep((caddr_t)vp, PINOD, "vgone", 0);
1835 }
1836 return;
1837 }
1838 /*
1839 * Clean out the filesystem specific data.
1840 */
1841 vclean(vp, DOCLOSE, p);
1842 /*
1843 * Delete from old mount point vnode list, if on one.
1844 */
1845 if (vp->v_mount != NULL)
1846 insmntque(vp, (struct mount *)0);
1847 /*
1848 * If special device, remove it from special device alias list
1849 * if it is on one.
1850 */
1851 if ((vp->v_type == VBLK || vp->v_type == VCHR) && vp->v_specinfo != 0) {
1852 simple_lock(&spechash_slock);
1853 if (*vp->v_hashchain == vp) {
1854 *vp->v_hashchain = vp->v_specnext;
1855 } else {
1856 for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
1857 if (vq->v_specnext != vp)
1858 continue;
1859 vq->v_specnext = vp->v_specnext;
1860 break;
1861 }
1862 if (vq == NULL)
1863 panic("missing bdev");
1864 }
1865 if (vp->v_flag & VALIASED) {
1866 vx = NULL;
1867 for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
1868 if (vq->v_rdev != vp->v_rdev ||
1869 vq->v_type != vp->v_type)
1870 continue;
1871 if (vx)
1872 break;
1873 vx = vq;
1874 }
1875 if (vx == NULL)
1876 panic("missing alias");
1877 if (vq == NULL)
1878 vx->v_flag &= ~VALIASED;
1879 vp->v_flag &= ~VALIASED;
1880 }
1881 simple_unlock(&spechash_slock);
1882 {
1883 struct specinfo *tmp = vp->v_specinfo;
1884 vp->v_specinfo = NULL;
1885 FREE_ZONE((void *)tmp, sizeof(struct specinfo), M_SPECINFO);
1886 }
1887 }
1888 /*
1889 * If it is on the freelist and not already at the head,
1890 * move it to the head of the list. The test of the back
1891 * pointer and the reference count of zero is because
1892 * it will be removed from the free list by getnewvnode,
1893 * but will not have its reference count incremented until
1894 * after calling vgone. If the reference count were
1895 * incremented first, vgone would (incorrectly) try to
1896 * close the previous instance of the underlying object.
1897 * So, the back pointer is explicitly set to `0xdeadb' in
1898 * getnewvnode after removing it from the freelist to ensure
1899 * that we do not try to move it here.
1900 */
1901 if (vp->v_usecount == 0 && (vp->v_flag & VUINACTIVE) == 0) {
1902 simple_lock(&vnode_free_list_slock);
1903 if ((vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb) &&
1904 vnode_free_list.tqh_first != vp) {
1905 TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
1906 TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
1907 }
1908 simple_unlock(&vnode_free_list_slock);
1909 }
1910 vp->v_type = VBAD;
1911 }
1912
1913 /*
1914 * Lookup a vnode by device number.
1915 */
1916 int
1917 vfinddev(dev, type, vpp)
1918 dev_t dev;
1919 enum vtype type;
1920 struct vnode **vpp;
1921 {
1922 struct vnode *vp;
1923 int rc = 0;
1924
1925 simple_lock(&spechash_slock);
1926 for (vp = speclisth[SPECHASH(dev)]; vp; vp = vp->v_specnext) {
1927 if (dev != vp->v_rdev || type != vp->v_type)
1928 continue;
1929 *vpp = vp;
1930 rc = 1;
1931 break;
1932 }
1933 simple_unlock(&spechash_slock);
1934 return (rc);
1935 }
1936
1937 /*
1938 * Calculate the total number of references to a special device.
1939 */
1940 int
1941 vcount(vp)
1942 struct vnode *vp;
1943 {
1944 struct vnode *vq, *vnext;
1945 int count;
1946
1947 loop:
1948 if ((vp->v_flag & VALIASED) == 0)
1949 return (vp->v_usecount);
1950 simple_lock(&spechash_slock);
1951 for (count = 0, vq = *vp->v_hashchain; vq; vq = vnext) {
1952 vnext = vq->v_specnext;
1953 if (vq->v_rdev != vp->v_rdev || vq->v_type != vp->v_type)
1954 continue;
1955 /*
1956 * Alias, but not in use, so flush it out.
1957 */
1958 if (vq->v_usecount == 0 && vq != vp) {
1959 simple_unlock(&spechash_slock);
1960 vgone(vq);
1961 goto loop;
1962 }
1963 count += vq->v_usecount;
1964 }
1965 simple_unlock(&spechash_slock);
1966 return (count);
1967 }
1968
1969 int prtactive = 0; /* 1 => print out reclaim of active vnodes */
1970
1971 /*
1972 * Print out a description of a vnode.
1973 */
1974 static char *typename[] =
1975 { "VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD" };
1976
1977 void
1978 vprint(label, vp)
1979 char *label;
1980 register struct vnode *vp;
1981 {
1982 char buf[64];
1983
1984 if (label != NULL)
1985 printf("%s: ", label);
1986 printf("type %s, usecount %d, writecount %d, refcount %d,",
1987 typename[vp->v_type], vp->v_usecount, vp->v_writecount,
1988 vp->v_holdcnt);
1989 buf[0] = '\0';
1990 if (vp->v_flag & VROOT)
1991 strcat(buf, "|VROOT");
1992 if (vp->v_flag & VTEXT)
1993 strcat(buf, "|VTEXT");
1994 if (vp->v_flag & VSYSTEM)
1995 strcat(buf, "|VSYSTEM");
1996 if (vp->v_flag & VNOFLUSH)
1997 strcat(buf, "|VNOFLUSH");
1998 if (vp->v_flag & VXLOCK)
1999 strcat(buf, "|VXLOCK");
2000 if (vp->v_flag & VXWANT)
2001 strcat(buf, "|VXWANT");
2002 if (vp->v_flag & VBWAIT)
2003 strcat(buf, "|VBWAIT");
2004 if (vp->v_flag & VALIASED)
2005 strcat(buf, "|VALIASED");
2006 if (buf[0] != '\0')
2007 printf(" flags (%s)", &buf[1]);
2008 if (vp->v_data == NULL) {
2009 printf("\n");
2010 } else {
2011 printf("\n\t");
2012 VOP_PRINT(vp);
2013 }
2014 }
2015
2016 #ifdef DEBUG
2017 /*
2018 * List all of the locked vnodes in the system.
2019 * Called when debugging the kernel.
2020 */
2021 void
2022 printlockedvnodes()
2023 {
2024 struct proc *p = current_proc();
2025 struct mount *mp, *nmp;
2026 struct vnode *vp;
2027
2028 printf("Locked vnodes\n");
2029 simple_lock(&mountlist_slock);
2030 for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) {
2031 if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) {
2032 nmp = mp->mnt_list.cqe_next;
2033 continue;
2034 }
2035 for (vp = mp->mnt_vnodelist.lh_first;
2036 vp != NULL;
2037 vp = vp->v_mntvnodes.le_next) {
2038 if (VOP_ISLOCKED(vp))
2039 vprint((char *)0, vp);
2040 }
2041 simple_lock(&mountlist_slock);
2042 nmp = mp->mnt_list.cqe_next;
2043 vfs_unbusy(mp, p);
2044 }
2045 simple_unlock(&mountlist_slock);
2046 }
2047 #endif
2048
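/*
 * Construct the path to a vnode by walking the name cache information
 * (VNAME/VPARENT) back toward the root, copying each component into
 * 'buff' from the tail end forward and switching to the covered vnode
 * whenever a mount point root is crossed.  The number of bytes used is
 * returned through '*outlen'.  Returns EINVAL if a component's name is
 * missing from the cache (but a parent exists), and ENOSPC if the
 * buffer is too small.
 */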
2049 static int
2050 build_path(struct vnode *vp, char *buff, int buflen, int *outlen)
2051 {
2052 char *end, *str;
2053 int i, len, ret=0, counter=0;
2054
2055 end = &buff[buflen-1];
2056 *--end = '\0';
2057
2058 while(vp && VPARENT(vp) != vp) {
2059 // the maximum depth of a file system hierarchy is MAXPATHLEN/2
2060 // (with single-char names separated by slashes). we panic if
2061 // we've ever looped more than that.
2062 if (counter++ > MAXPATHLEN/2) {
2063 panic("build_path: vnode parent chain is too long! vp 0x%x\n", vp);
2064 }
2065 str = VNAME(vp);
2066 if (VNAME(vp) == NULL) {
2067 if (VPARENT(vp) != NULL) {
2068 ret = EINVAL;
2069 }
2070 break;
2071 }
2072
2073 // count how long the string is
2074 for(len=0; *str; str++, len++)
2075 /* nothing */;
2076
2077 // check that there's enough space
2078 if ((end - buff) < len) {
2079 ret = ENOSPC;
2080 break;
2081 }
2082
2083 // copy it backwards
2084 for(; len > 0; len--) {
2085 *--end = *--str;
2086 }
2087
2088 // put in the path separator
2089 *--end = '/';
2090
2091 // walk up the chain.
2092 vp = VPARENT(vp);
2093
2094 // check if we're crossing a mount point and
2095 // switch the vp if we are.
2096 if (vp && (vp->v_flag & VROOT)) {
2097 vp = vp->v_mount->mnt_vnodecovered;
2098 }
2099 }
2100
2101 // slide it down to the beginning of the buffer
2102 memmove(buff, end, &buff[buflen] - end);
2103
2104 *outlen = &buff[buflen] - end;
2105
2106 return ret;
2107 }
2108
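/*
 * vn_getpath: exported wrapper around build_path().  On entry '*len'
 * holds the size of 'pathbuf'; on success it is updated to the number
 * of bytes of path data placed in the buffer.
 *
 * A minimal usage sketch (illustrative only, not a caller taken from
 * this file):
 *
 *	char	path[MAXPATHLEN];
 *	int	pathlen = sizeof(path);
 *
 *	if (vn_getpath(vp, path, &pathlen) == 0)
 *		printf("vnode path: %s\n", path);
 */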
2109 __private_extern__ int
2110 vn_getpath(struct vnode *vp, char *pathbuf, int *len)
2111 {
2112 return build_path(vp, pathbuf, *len, len);
2113 }
2114
2115
2116
2117 /*
2118 * Top level filesystem related information gathering.
2119 */
2120 int
2121 vfs_sysctl(name, namelen, oldp, oldlenp, newp, newlen, p)
2122 int *name;
2123 u_int namelen;
2124 void *oldp;
2125 size_t *oldlenp;
2126 void *newp;
2127 size_t newlen;
2128 struct proc *p;
2129 {
2130 struct vfsconf *vfsp;
2131 int *username;
2132 u_int usernamelen;
2133 int error;
2134
2135 /*
2136 * The VFS_NUMMNTOPS shouldn't be at name[0] since it
2137 * is a VFS generic variable. So now we must check
2138 * namelen so we don't end up covering any UFS
2139 * variables (since UFS vfc_typenum is 1).
2140 *
2141 * It should have been:
2142 * name[0]: VFS_GENERIC
2143 * name[1]: VFS_NUMMNTOPS
2144 */
2145 if (namelen == 1 && name[0] == VFS_NUMMNTOPS) {
2146 extern unsigned int vfs_nummntops;
2147 return (sysctl_rdint(oldp, oldlenp, newp, vfs_nummntops));
2148 }
2149
2150 /* all sysctl names at this level are at least name and field */
2151 if (namelen < 2)
2152 return (EISDIR); /* overloaded */
2153 if (name[0] != VFS_GENERIC) {
2154 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
2155 if (vfsp->vfc_typenum == name[0])
2156 break;
2157 if (vfsp == NULL)
2158 return (EOPNOTSUPP);
2159 return ((*vfsp->vfc_vfsops->vfs_sysctl)(&name[1], namelen - 1,
2160 oldp, oldlenp, newp, newlen, p));
2161 }
2162 switch (name[1]) {
2163 case VFS_MAXTYPENUM:
2164 return (sysctl_rdint(oldp, oldlenp, newp, maxvfsconf));
2165 case VFS_CONF:
2166 if (namelen < 3)
2167 return (ENOTDIR); /* overloaded */
2168 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
2169 if (vfsp->vfc_typenum == name[2])
2170 break;
2171 if (vfsp == NULL)
2172 return (EOPNOTSUPP);
2173 return (sysctl_rdstruct(oldp, oldlenp, newp, vfsp,
2174 sizeof(struct vfsconf)));
2175 }
2176 /*
2177 * We need to get back into the general MIB, so we need to re-prepend
2178 * CTL_VFS to our name and try userland_sysctl().
2179 */
2180 usernamelen = namelen + 1;
2181 MALLOC(username, int *, usernamelen * sizeof(*username),
2182 M_TEMP, M_WAITOK);
2183 bcopy(name, username + 1, namelen * sizeof(*name));
2184 username[0] = CTL_VFS;
2185 error = userland_sysctl(p, username, usernamelen, oldp, oldlenp, 1,
2186 newp, newlen, oldlenp);
2187 FREE(username, M_TEMP);
2188 return (error);
2189 }
2190
2191 int kinfo_vdebug = 1;
2192 #define KINFO_VNODESLOP 10
2193 /*
2194 * Dump vnode list (via sysctl).
2195 * Copyout address of vnode followed by vnode.
2196 */
2197 /* ARGSUSED */
2198 int
2199 sysctl_vnode(where, sizep, p)
2200 char *where;
2201 size_t *sizep;
2202 struct proc *p;
2203 {
2204 struct mount *mp, *nmp;
2205 struct vnode *nvp, *vp;
2206 char *bp = where, *savebp;
2207 char *ewhere;
2208 int error;
2209
2210 #define VPTRSZ sizeof (struct vnode *)
2211 #define VNODESZ sizeof (struct vnode)
2212 if (where == NULL) {
2213 *sizep = (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ);
2214 return (0);
2215 }
2216 ewhere = where + *sizep;
2217
2218 simple_lock(&mountlist_slock);
2219 for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) {
2220 if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) {
2221 nmp = mp->mnt_list.cqe_next;
2222 continue;
2223 }
2224 savebp = bp;
2225 again:
2226 simple_lock(&mntvnode_slock);
2227 for (vp = mp->mnt_vnodelist.lh_first;
2228 vp != NULL;
2229 vp = nvp) {
2230 /*
2231 * Check that the vp is still associated with
2232 * this filesystem. RACE: could have been
2233 * recycled onto the same filesystem.
2234 */
2235 if (vp->v_mount != mp) {
2236 simple_unlock(&mntvnode_slock);
2237 if (kinfo_vdebug)
2238 printf("kinfo: vp changed\n");
2239 bp = savebp;
2240 goto again;
2241 }
2242 nvp = vp->v_mntvnodes.le_next;
2243 if (bp + VPTRSZ + VNODESZ > ewhere) {
2244 simple_unlock(&mntvnode_slock);
2245 vfs_unbusy(mp, p);
2246 *sizep = bp - where;
2247 return (ENOMEM);
2248 }
2249 simple_unlock(&mntvnode_slock);
2250 if ((error = copyout((caddr_t)&vp, bp, VPTRSZ)) ||
2251 (error = copyout((caddr_t)vp, bp + VPTRSZ, VNODESZ))) {
2252 vfs_unbusy(mp, p);
2253 return (error);
2254 }
2255 bp += VPTRSZ + VNODESZ;
2256 simple_lock(&mntvnode_slock);
2257 }
2258 simple_unlock(&mntvnode_slock);
2259 simple_lock(&mountlist_slock);
2260 nmp = mp->mnt_list.cqe_next;
2261 vfs_unbusy(mp, p);
2262 }
2263 simple_unlock(&mountlist_slock);
2264
2265 *sizep = bp - where;
2266 return (0);
2267 }
2268
2269 /*
2270 * Check to see if a filesystem is mounted on a block device.
2271 */
2272 int
2273 vfs_mountedon(vp)
2274 struct vnode *vp;
2275 {
2276 struct vnode *vq;
2277 int error = 0;
2278
2279 if (vp->v_specflags & SI_MOUNTEDON)
2280 return (EBUSY);
2281 if (vp->v_flag & VALIASED) {
2282 simple_lock(&spechash_slock);
2283 for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
2284 if (vq->v_rdev != vp->v_rdev ||
2285 vq->v_type != vp->v_type)
2286 continue;
2287 if (vq->v_specflags & SI_MOUNTEDON) {
2288 error = EBUSY;
2289 break;
2290 }
2291 }
2292 simple_unlock(&spechash_slock);
2293 }
2294 return (error);
2295 }
2296
2297 /*
2298 * Unmount all filesystems. The list is traversed in reverse order
2299 * of mounting to avoid dependencies.
2300 */
2301 __private_extern__ void
2302 vfs_unmountall()
2303 {
2304 struct mount *mp, *nmp;
2305 struct proc *p = current_proc();
2306
2307 /*
2308 * Since this only runs when rebooting, it is not interlocked.
2309 */
2310 for (mp = mountlist.cqh_last; mp != (void *)&mountlist; mp = nmp) {
2311 nmp = mp->mnt_list.cqe_prev;
2312 (void) dounmount(mp, MNT_FORCE, p);
2313 }
2314 }
2315
2316 /*
2317 * Build hash lists of net addresses and hang them off the mount point.
2318 * Called by vfs_export() to set up the lists of export addresses.
2319 */
2320 static int
2321 vfs_hang_addrlist(mp, nep, argp)
2322 struct mount *mp;
2323 struct netexport *nep;
2324 struct export_args *argp;
2325 {
2326 register struct netcred *np;
2327 register struct radix_node_head *rnh;
2328 register int i;
2329 struct radix_node *rn;
2330 struct sockaddr *saddr, *smask = 0;
2331 struct domain *dom;
2332 int error;
2333
2334 if (argp->ex_addrlen == 0) {
2335 if (mp->mnt_flag & MNT_DEFEXPORTED)
2336 return (EPERM);
2337 np = &nep->ne_defexported;
2338 np->netc_exflags = argp->ex_flags;
2339 np->netc_anon = argp->ex_anon;
2340 np->netc_anon.cr_ref = 1;
2341 mp->mnt_flag |= MNT_DEFEXPORTED;
2342 return (0);
2343 }
2344 i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen;
2345 MALLOC(np, struct netcred *, i, M_NETADDR, M_WAITOK);
2346 bzero((caddr_t)np, i);
2347 saddr = (struct sockaddr *)(np + 1);
2348 if (error = copyin(argp->ex_addr, (caddr_t)saddr, argp->ex_addrlen))
2349 goto out;
2350 if (saddr->sa_len > argp->ex_addrlen)
2351 saddr->sa_len = argp->ex_addrlen;
2352 if (argp->ex_masklen) {
2353 smask = (struct sockaddr *)((caddr_t)saddr + argp->ex_addrlen);
2354 error = copyin(argp->ex_mask, (caddr_t)smask, argp->ex_masklen);
2355 if (error)
2356 goto out;
2357 if (smask->sa_len > argp->ex_masklen)
2358 smask->sa_len = argp->ex_masklen;
2359 }
2360 i = saddr->sa_family;
2361 if ((rnh = nep->ne_rtable[i]) == 0) {
2362 /*
2363 * It seems silly to initialize every AF when most are not
2364 * used, so do so on demand here.
2365 */
2366 for (dom = domains; dom; dom = dom->dom_next)
2367 if (dom->dom_family == i && dom->dom_rtattach) {
2368 dom->dom_rtattach((void **)&nep->ne_rtable[i],
2369 dom->dom_rtoffset);
2370 break;
2371 }
2372 if ((rnh = nep->ne_rtable[i]) == 0) {
2373 error = ENOBUFS;
2374 goto out;
2375 }
2376 }
2377 rn = (*rnh->rnh_addaddr)((caddr_t)saddr, (caddr_t)smask, rnh,
2378 np->netc_rnodes);
2379 if (rn == 0) {
2380 /*
2381 * One of the reasons that rnh_addaddr may fail is that
2382 * the entry already exists. To check for this case, we
2383 * look up the entry to see if it is there. If so, we
2384 * do not need to make a new entry but do return success.
2385 */
2386 _FREE(np, M_NETADDR);
2387 rn = (*rnh->rnh_matchaddr)((caddr_t)saddr, rnh);
2388 if (rn != 0 && (rn->rn_flags & RNF_ROOT) == 0 &&
2389 ((struct netcred *)rn)->netc_exflags == argp->ex_flags &&
2390 !bcmp((caddr_t)&((struct netcred *)rn)->netc_anon,
2391 (caddr_t)&argp->ex_anon, sizeof(struct ucred)))
2392 return (0);
2393 return (EPERM);
2394 }
2395 np->netc_exflags = argp->ex_flags;
2396 np->netc_anon = argp->ex_anon;
2397 np->netc_anon.cr_ref = 1;
2398 return (0);
2399 out:
2400 _FREE(np, M_NETADDR);
2401 return (error);
2402 }
2403
2404 /* ARGSUSED */
2405 static int
2406 vfs_free_netcred(rn, w)
2407 struct radix_node *rn;
2408 caddr_t w;
2409 {
2410 register struct radix_node_head *rnh = (struct radix_node_head *)w;
2411
2412 (*rnh->rnh_deladdr)(rn->rn_key, rn->rn_mask, rnh);
2413 _FREE((caddr_t)rn, M_NETADDR);
2414 return (0);
2415 }
2416
2417 /*
2418 * Free the net address hash lists that are hanging off the mount points.
2419 */
2420 static void
2421 vfs_free_addrlist(nep)
2422 struct netexport *nep;
2423 {
2424 register int i;
2425 register struct radix_node_head *rnh;
2426
2427 for (i = 0; i <= AF_MAX; i++)
2428 if (rnh = nep->ne_rtable[i]) {
2429 (*rnh->rnh_walktree)(rnh, vfs_free_netcred,
2430 (caddr_t)rnh);
2431 _FREE((caddr_t)rnh, M_RTABLE);
2432 nep->ne_rtable[i] = 0;
2433 }
2434 }
2435
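/*
 * Update the export information for a mount point.  MNT_DELEXPORT in
 * the argument flags tears down any existing export address lists;
 * MNT_EXPORTED (re)builds them via vfs_hang_addrlist().
 */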
2436 int
2437 vfs_export(mp, nep, argp)
2438 struct mount *mp;
2439 struct netexport *nep;
2440 struct export_args *argp;
2441 {
2442 int error;
2443
2444 if (argp->ex_flags & MNT_DELEXPORT) {
2445 vfs_free_addrlist(nep);
2446 mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED);
2447 }
2448 if (argp->ex_flags & MNT_EXPORTED) {
2449 if (error = vfs_hang_addrlist(mp, nep, argp))
2450 return (error);
2451 mp->mnt_flag |= MNT_EXPORTED;
2452 }
2453 return (0);
2454 }
2455
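/*
 * Look up the export credentials that apply to the client address in
 * 'nam'.  If no specific address matches, fall back to the default
 * export entry (if MNT_DEFEXPORTED is set); returns NULL when the
 * file system is not exported to the caller.
 */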
2456 struct netcred *
2457 vfs_export_lookup(mp, nep, nam)
2458 register struct mount *mp;
2459 struct netexport *nep;
2460 struct mbuf *nam;
2461 {
2462 register struct netcred *np;
2463 register struct radix_node_head *rnh;
2464 struct sockaddr *saddr;
2465
2466 np = NULL;
2467 if (mp->mnt_flag & MNT_EXPORTED) {
2468 /*
2469 * Lookup in the export list first.
2470 */
2471 if (nam != NULL) {
2472 saddr = mtod(nam, struct sockaddr *);
2473 rnh = nep->ne_rtable[saddr->sa_family];
2474 if (rnh != NULL) {
2475 np = (struct netcred *)
2476 (*rnh->rnh_matchaddr)((caddr_t)saddr,
2477 rnh);
2478 if (np && np->netc_rnodes->rn_flags & RNF_ROOT)
2479 np = NULL;
2480 }
2481 }
2482 /*
2483 * If no address match, use the default if it exists.
2484 */
2485 if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED)
2486 np = &nep->ne_defexported;
2487 }
2488 return (np);
2489 }
2490
2491 /*
2492 * try to reclaim vnodes from the memory
2493 * object cache
2494 */
2495 static int
2496 vm_object_cache_reclaim(int count)
2497 {
2498 int cnt;
2499 void vnode_pager_release_from_cache(int *);
2500
2501 /* attempt to reclaim vnodes from VM object cache */
2502 cnt = count;
2503 vnode_pager_release_from_cache(&cnt);
2504 return(cnt);
2505 }
2506
2507 /*
2508 * Release memory object reference held by inactive vnodes
2509 * and then try to reclaim some vnodes from the memory
2510 * object cache
2511 */
2512 static int
2513 vnreclaim(int count)
2514 {
2515 int i, loopcnt;
2516 struct vnode *vp;
2517 int err;
2518 struct proc *p;
2519
2520 i = 0;
2521 loopcnt = 0;
2522
2523 /* Try to release "count" vnodes from the inactive list */
2524 restart:
2525 if (++loopcnt > inactivevnodes) {
2526 /*
2527 * I did my best trying to reclaim the vnodes.
2528 * Do not try any more as that would only lead to
2529 * long latencies. Also in the worst case
2530 * this can get totally CPU bound.
2531 * Just fall through and attempt a reclaim of the VM
2532 * object cache.
2533 */
2534 goto out;
2535 }
2536
2537 simple_lock(&vnode_free_list_slock);
2538 for (vp = TAILQ_FIRST(&vnode_inactive_list);
2539 (vp != NULLVP) && (i < count);
2540 vp = TAILQ_NEXT(vp, v_freelist)) {
2541
2542 if (!simple_lock_try(&vp->v_interlock))
2543 continue;
2544
2545 if (vp->v_usecount != 1)
2546 panic("vnreclaim: v_usecount");
2547
2548 if(!UBCINFOEXISTS(vp)) {
2549 if (vp->v_type == VBAD) {
2550 VREMINACTIVE("vnreclaim", vp);
2551 simple_unlock(&vp->v_interlock);
2552 continue;
2553 } else
2554 panic("non UBC vnode on inactive list");
2555 /* Should not reach here */
2556 }
2557
2558 /* If vnode is already being reclaimed, wait */
2559 if ((vp->v_flag & VXLOCK) || (vp->v_flag & VORECLAIM)) {
2560 vp->v_flag |= VXWANT;
2561 simple_unlock(&vp->v_interlock);
2562 simple_unlock(&vnode_free_list_slock);
2563 (void)tsleep((caddr_t)vp, PINOD, "vocr", 0);
2564 goto restart;
2565 }
2566
2567 /*
2568 * if the vnode is being initialized,
2569 * skip over it
2570 */
2571 if (ISSET(vp->v_flag, VUINIT)) {
2572 SET(vp->v_flag, VUWANT);
2573 simple_unlock(&vp->v_interlock);
2574 continue;
2575 }
2576
2577 VREMINACTIVE("vnreclaim", vp);
2578 simple_unlock(&vnode_free_list_slock);
2579
2580 if (ubc_issetflags(vp, UI_WASMAPPED)) {
2581 /*
2582 * We should not reclaim as it is likely
2583 * to be in use. Let it die a natural death.
2584 * Release the UBC reference if one exists
2585 * and put it back at the tail.
2586 */
2587 simple_unlock(&vp->v_interlock);
2588 if (ubc_release_named(vp)) {
2589 if (UBCINFOEXISTS(vp)) {
2590 simple_lock(&vp->v_interlock);
2591 if (vp->v_usecount == 1 && !VONLIST(vp))
2592 vinactive(vp);
2593 simple_unlock(&vp->v_interlock);
2594 }
2595 } else {
2596 simple_lock(&vp->v_interlock);
2597 vinactive(vp);
2598 simple_unlock(&vp->v_interlock);
2599 }
2600 } else {
2601 int didhold;
2602
2603 VORECLAIM_ENABLE(vp);
2604
2605 /*
2606 * scrub the dirty pages and invalidate the buffers
2607 */
2608 p = current_proc();
2609 err = vn_lock(vp, LK_EXCLUSIVE|LK_INTERLOCK, p);
2610 if (err) {
2611 /* cannot reclaim */
2612 simple_lock(&vp->v_interlock);
2613 vinactive(vp);
2614 VORECLAIM_DISABLE(vp);
2615 i++;
2616 simple_unlock(&vp->v_interlock);
2617 goto restart;
2618 }
2619
2620 /* keep the vnode alive so we can kill it */
2621 simple_lock(&vp->v_interlock);
2622 if(vp->v_usecount != 1)
2623 panic("VOCR: usecount race");
2624 vp->v_usecount++;
2625 simple_unlock(&vp->v_interlock);
2626
2627 /* clean up the state in VM without invalidating */
2628 didhold = ubc_hold(vp);
2629 if (didhold)
2630 (void)ubc_clean(vp, 0);
2631
2632 /* flush and invalidate buffers associated with the vnode */
2633 if (vp->v_tag == VT_NFS)
2634 nfs_vinvalbuf(vp, V_SAVE, NOCRED, p, 0);
2635 else
2636 vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0);
2637
2638 /*
2639 * Note: for the v_usecount == 2 case, VOP_INACTIVE
2640 * has not yet been called. Call it now while vp is
2641 * still locked, it will also release the lock.
2642 */
2643 if (vp->v_usecount == 2)
2644 VOP_INACTIVE(vp, p);
2645 else
2646 VOP_UNLOCK(vp, 0, p);
2647
2648 if (didhold)
2649 ubc_rele(vp);
2650
2651 /*
2652 * destroy the ubc named reference.
2653 * If we can't because it is held for I/Os
2654 * in progress, just put it back on the inactive
2655 * list and move on. Otherwise, the paging reference
2656 * is toast (and so is this vnode?).
2657 */
2658 if (ubc_destroy_named(vp)) {
2659 i++;
2660 }
2661 simple_lock(&vp->v_interlock);
2662 VORECLAIM_DISABLE(vp);
2663 simple_unlock(&vp->v_interlock);
2664 vrele(vp); /* release extra use we added here */
2665 }
2666 /* inactive list lock was released, must restart */
2667 goto restart;
2668 }
2669 simple_unlock(&vnode_free_list_slock);
2670
2671 vnode_reclaim_tried += i;
2672 out:
2673 i = vm_object_cache_reclaim(count);
2674 vnode_objects_reclaimed += i;
2675
2676 return(i);
2677 }
2678
2679 /*
2680 * This routine is called from vnode_pager_no_senders()
2681 * which in turn can be called with the vnode locked by vnode_uncache(),
2682 * but it could also get called as a result of vm_object_cache_trim().
2683 * In that case lock state is unknown.
2684 * AGE the vnode so that it gets recycled quickly.
2685 * Check lock status to decide whether to call vput() or vrele().
2686 */
2687 __private_extern__ void
2688 vnode_pager_vrele(struct vnode *vp)
2689 {
2690
2691 boolean_t funnel_state;
2692 int isvnreclaim = 1;
2693
2694 funnel_state = thread_funnel_set(kernel_flock, TRUE);
2695
2696 /* Mark the vnode to be recycled */
2697 vagevp(vp);
2698
2699 simple_lock(&vp->v_interlock);
2700 /*
2701 * If a vgone (or vclean) is already in progress,
2702 * do not bother with the ubc_info cleanup;
2703 * let the vclean deal with it.
2704 */
2705 if (vp->v_flag & VXLOCK) {
2706 CLR(vp->v_flag, VTERMINATE);
2707 if (ISSET(vp->v_flag, VTERMWANT)) {
2708 CLR(vp->v_flag, VTERMWANT);
2709 wakeup((caddr_t)&vp->v_ubcinfo);
2710 }
2711 simple_unlock(&vp->v_interlock);
2712 vrele(vp);
2713 (void) thread_funnel_set(kernel_flock, funnel_state);
2714 return;
2715 }
2716
2717 /* It's dead, Jim! */
2718 if (!ISSET(vp->v_flag, VORECLAIM)) {
2719 /*
2720 * called as a result of eviction of the memory
2721 * object from the memory object cache
2722 */
2723 isvnreclaim = 0;
2724
2725 /* So serialize vnode operations */
2726 VORECLAIM_ENABLE(vp);
2727 }
2728 if (!ISSET(vp->v_flag, VTERMINATE))
2729 SET(vp->v_flag, VTERMINATE);
2730
2731 cache_purge(vp);
2732
2733 if (UBCINFOEXISTS(vp)) {
2734 struct ubc_info *uip = vp->v_ubcinfo;
2735
2736 if (ubc_issetflags(vp, UI_WASMAPPED))
2737 SET(vp->v_flag, VWASMAPPED);
2738
2739 vp->v_ubcinfo = UBC_NOINFO; /* catch bad accesses */
2740 simple_unlock(&vp->v_interlock);
2741 ubc_info_deallocate(uip);
2742 } else {
2743 if ((vp->v_type == VBAD) && ((vp)->v_ubcinfo != UBC_INFO_NULL)
2744 && ((vp)->v_ubcinfo != UBC_NOINFO)) {
2745 struct ubc_info *uip = vp->v_ubcinfo;
2746
2747 vp->v_ubcinfo = UBC_NOINFO; /* catch bad accesses */
2748 simple_unlock(&vp->v_interlock);
2749 ubc_info_deallocate(uip);
2750 } else {
2751 simple_unlock(&vp->v_interlock);
2752 }
2753 }
2754
2755 CLR(vp->v_flag, VTERMINATE);
2756
2757 if (vp->v_type != VBAD){
2758 vgone(vp); /* revoke the vnode */
2759 vrele(vp); /* and drop the reference */
2760 } else
2761 vrele(vp);
2762
2763 if (ISSET(vp->v_flag, VTERMWANT)) {
2764 CLR(vp->v_flag, VTERMWANT);
2765 wakeup((caddr_t)&vp->v_ubcinfo);
2766 }
2767 if (!isvnreclaim)
2768 VORECLAIM_DISABLE(vp);
2769 (void) thread_funnel_set(kernel_flock, funnel_state);
2770 return;
2771 }
2772
2773
2774 #if DIAGNOSTIC
2775 int walk_vnodes_debug=0;
2776
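/*
 * Debugging aid (DIAGNOSTIC only): walk every per-mount vnode list as
 * well as the free and inactive lists, optionally printing any vnode
 * whose v_usecount has gone negative, and report how many vnodes sit
 * on the free and inactive lists.
 */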
2777 void
2778 walk_allvnodes()
2779 {
2780 struct mount *mp, *nmp;
2781 struct vnode *vp;
2782 int cnt = 0;
2783
2784 for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) {
2785 for (vp = mp->mnt_vnodelist.lh_first;
2786 vp != NULL;
2787 vp = vp->v_mntvnodes.le_next) {
2788 if (vp->v_usecount < 0){
2789 if(walk_vnodes_debug) {
2790 printf("vp is %x\n",vp);
2791 }
2792 }
2793 }
2794 nmp = mp->mnt_list.cqe_next;
2795 }
2796 for (cnt = 0, vp = vnode_free_list.tqh_first;
2797 vp != NULLVP; cnt++, vp = vp->v_freelist.tqe_next) {
2798 if ((vp->v_usecount < 0) && walk_vnodes_debug) {
2799 if(walk_vnodes_debug) {
2800 printf("vp is %x\n",vp);
2801 }
2802 }
2803 }
2804 printf("%d - free\n", cnt);
2805
2806 for (cnt = 0, vp = vnode_inactive_list.tqh_first;
2807 vp != NULLVP; cnt++, vp = vp->v_freelist.tqe_next) {
2808 if ((vp->v_usecount < 0) && walk_vnodes_debug) {
2809 if(walk_vnodes_debug) {
2810 printf("vp is %x\n",vp);
2811 }
2812 }
2813 }
2814 printf("%d - inactive\n", cnt);
2815 }
2816 #endif /* DIAGNOSTIC */
2817
2818
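/*
 * Extended per-mount I/O constraints.  vfs_init_io_attributes() hangs
 * one of these off mp->mnt_xinfo_ptr and sets MNTK_IO_XINFO in
 * mnt_kern_flag so that vfs_io_attributes() and vfs_io_maxsegsize()
 * can find it.
 */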
2819 struct x_constraints {
2820 u_int32_t x_maxreadcnt;
2821 u_int32_t x_maxsegreadsize;
2822 u_int32_t x_maxsegwritesize;
2823 };
2824
2825
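/*
 * Report the maximum I/O transfer size and scatter/gather segment
 * count supported for reads (B_READ) or writes (B_WRITE) on the file
 * system backing 'vp', falling back to MAXPHYS and 32 segments when
 * the mount supplies no constraints.
 */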
2826 void
2827 vfs_io_attributes(vp, flags, iosize, vectors)
2828 struct vnode *vp;
2829 int flags; /* B_READ or B_WRITE */
2830 int *iosize;
2831 int *vectors;
2832 {
2833 struct mount *mp;
2834
2835 /* start with "reasonable" defaults */
2836 *iosize = MAXPHYS;
2837 *vectors = 32;
2838
2839 mp = vp->v_mount;
2840 if (mp != NULL) {
2841 switch (flags) {
2842 case B_READ:
2843 if (mp->mnt_kern_flag & MNTK_IO_XINFO)
2844 *iosize = ((struct x_constraints *)(mp->mnt_xinfo_ptr))->x_maxreadcnt;
2845 else
2846 *iosize = mp->mnt_maxreadcnt;
2847 *vectors = mp->mnt_segreadcnt;
2848 break;
2849 case B_WRITE:
2850 *iosize = mp->mnt_maxwritecnt;
2851 *vectors = mp->mnt_segwritecnt;
2852 break;
2853 default:
2854 break;
2855 }
2856 if (*iosize == 0)
2857 *iosize = MAXPHYS;
2858 if (*vectors == 0)
2859 *vectors = 32;
2860 }
2861 return;
2862 }
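/*
 * Illustrative caller sketch (not taken from this file): I/O clustering
 * code would typically clamp its transfers with something like
 *
 *	int max_iosize, max_vectors;
 *
 *	vfs_io_attributes(vp, B_READ, &max_iosize, &max_vectors);
 *	if (io_size > max_iosize)
 *		io_size = max_iosize;
 */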
2863
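/*
 * Report the maximum size of a single scatter/gather segment for reads
 * or writes on the file system backing 'vp'.  When the extended
 * x_constraints info is absent, the maxread/maxwrite I/O sizes are used
 * (the previous behavior); the final fallback is MAXPHYS.
 */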
2864 __private_extern__
2865 void
2866 vfs_io_maxsegsize(vp, flags, maxsegsize)
2867 struct vnode *vp;
2868 int flags; /* B_READ or B_WRITE */
2869 int *maxsegsize;
2870 {
2871 struct mount *mp;
2872
2873 /* start with "reasonable" default */
2874 *maxsegsize = MAXPHYS;
2875
2876 mp = vp->v_mount;
2877 if (mp != NULL) {
2878 switch (flags) {
2879 case B_READ:
2880 if (mp->mnt_kern_flag & MNTK_IO_XINFO)
2881 *maxsegsize = ((struct x_constraints *)(mp->mnt_xinfo_ptr))->x_maxsegreadsize;
2882 else
2883 /*
2884 * if the extended info doesn't exist
2885 * then use the maxread I/O size as the
2886 * max segment size... this is the previous behavior
2887 */
2888 *maxsegsize = mp->mnt_maxreadcnt;
2889 break;
2890 case B_WRITE:
2891 if (mp->mnt_kern_flag & MNTK_IO_XINFO)
2892 *maxsegsize = ((struct x_constraints *)(mp->mnt_xinfo_ptr))->x_maxsegwritesize;
2893 else
2894 /*
2895 * if the extended info doesn't exist
2896 * then use the maxwrite I/O size as the
2897 * max segment size... this is the previous behavior
2898 */
2899 *maxsegsize = mp->mnt_maxwritecnt;
2900 break;
2901 default:
2902 break;
2903 }
2904 if (*maxsegsize == 0)
2905 *maxsegsize = MAXPHYS;
2906 }
2907 }
2908
2909
2910 #include <sys/disk.h>
2911
2912
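/*
 * Query the device backing a mount (via the DKIOC* ioctls from
 * <sys/disk.h>) for its transfer-size and segment limits and cache the
 * results in the mount structure and its x_constraints extension.
 * Also marks the mount with MNTK_ROOTDEV when it lives on the same
 * unit as the root file system and MNTK_VIRTUALDEV for virtual devices.
 */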
2913 int
2914 vfs_init_io_attributes(devvp, mp)
2915 struct vnode *devvp;
2916 struct mount *mp;
2917 {
2918 int error;
2919 off_t readblockcnt;
2920 off_t writeblockcnt;
2921 off_t readmaxcnt;
2922 off_t writemaxcnt;
2923 off_t readsegcnt;
2924 off_t writesegcnt;
2925 off_t readsegsize;
2926 off_t writesegsize;
2927 u_long blksize;
2928
2929 u_int64_t temp;
2930
2931 struct proc *p = current_proc();
2932 struct ucred *cred = p->p_ucred;
2933
2934 int isvirtual = 0;
2935 /*
2936 * determine if this mount point exists on the same device as the root
2937 * partition... if so, then it comes under the hard throttle control
2938 */
2939 int thisunit = -1;
2940 static int rootunit = -1;
2941 extern struct vnode *rootvp;
2942
2943 if (rootunit == -1) {
2944 if (VOP_IOCTL(rootvp, DKIOCGETBSDUNIT, (caddr_t)&rootunit, 0, cred, p))
2945 rootunit = -1;
2946 else if (rootvp == devvp)
2947 mp->mnt_kern_flag |= MNTK_ROOTDEV;
2948 }
2949 if (devvp != rootvp && rootunit != -1) {
2950 if (VOP_IOCTL(devvp, DKIOCGETBSDUNIT, (caddr_t)&thisunit, 0, cred, p) == 0) {
2951 if (thisunit == rootunit)
2952 mp->mnt_kern_flag |= MNTK_ROOTDEV;
2953 }
2954 }
2955 if (VOP_IOCTL(devvp, DKIOCGETISVIRTUAL, (caddr_t)&isvirtual, 0, cred, p) == 0) {
2956 if (isvirtual)
2957 mp->mnt_kern_flag |= MNTK_VIRTUALDEV;
2958 }
2959
2960 if ((error = VOP_IOCTL(devvp, DKIOCGETMAXBLOCKCOUNTREAD,
2961 (caddr_t)&readblockcnt, 0, cred, p)))
2962 return (error);
2963
2964 if ((error = VOP_IOCTL(devvp, DKIOCGETMAXBLOCKCOUNTWRITE,
2965 (caddr_t)&writeblockcnt, 0, cred, p)))
2966 return (error);
2967
2968 if ((error = VOP_IOCTL(devvp, DKIOCGETMAXBYTECOUNTREAD,
2969 (caddr_t)&readmaxcnt, 0, cred, p)))
2970 return (error);
2971
2972 if ((error = VOP_IOCTL(devvp, DKIOCGETMAXBYTECOUNTWRITE,
2973 (caddr_t)&writemaxcnt, 0, cred, p)))
2974 return (error);
2975
2976 if ((error = VOP_IOCTL(devvp, DKIOCGETMAXSEGMENTCOUNTREAD,
2977 (caddr_t)&readsegcnt, 0, cred, p)))
2978 return (error);
2979
2980 if ((error = VOP_IOCTL(devvp, DKIOCGETMAXSEGMENTCOUNTWRITE,
2981 (caddr_t)&writesegcnt, 0, cred, p)))
2982 return (error);
2983
2984 if ((error = VOP_IOCTL(devvp, DKIOCGETMAXSEGMENTBYTECOUNTREAD,
2985 (caddr_t)&readsegsize, 0, cred, p)))
2986 return (error);
2987
2988 if ((error = VOP_IOCTL(devvp, DKIOCGETMAXSEGMENTBYTECOUNTWRITE,
2989 (caddr_t)&writesegsize, 0, cred, p)))
2990 return (error);
2991
2992 if ((error = VOP_IOCTL(devvp, DKIOCGETBLOCKSIZE,
2993 (caddr_t)&blksize, 0, cred, p)))
2994 return (error);
2995
2996
2997 if ( !(mp->mnt_kern_flag & MNTK_IO_XINFO)) {
2998 MALLOC(mp->mnt_xinfo_ptr, void *, sizeof(struct x_constraints), M_TEMP, M_WAITOK);
2999 mp->mnt_kern_flag |= MNTK_IO_XINFO;
3000 }
3001
3002 if (readmaxcnt)
3003 temp = (readmaxcnt > UINT32_MAX) ? UINT32_MAX : readmaxcnt;
3004 else {
3005 if (readblockcnt) {
3006 temp = readblockcnt * blksize;
3007 temp = (temp > UINT32_MAX) ? UINT32_MAX : temp;
3008 } else
3009 temp = MAXPHYS;
3010 }
3011 ((struct x_constraints *)(mp->mnt_xinfo_ptr))->x_maxreadcnt = (u_int32_t)temp;
3012
3013 if (writemaxcnt)
3014 temp = (writemaxcnt > UINT32_MAX) ? UINT32_MAX : writemaxcnt;
3015 else {
3016 if (writeblockcnt) {
3017 temp = writeblockcnt * blksize;
3018 temp = (temp > UINT32_MAX) ? UINT32_MAX : temp;
3019 } else
3020 temp = MAXPHYS;
3021 }
3022 mp->mnt_maxwritecnt = (u_int32_t)temp;
3023
3024 if (readsegcnt) {
3025 temp = (readsegcnt > UINT16_MAX) ? UINT16_MAX : readsegcnt;
3026 mp->mnt_segreadcnt = (u_int16_t)temp;
3027 }
3028 if (writesegcnt) {
3029 temp = (writesegcnt > UINT16_MAX) ? UINT16_MAX : writesegcnt;
3030 mp->mnt_segwritecnt = (u_int16_t)temp;
3031 }
3032 if (readsegsize)
3033 temp = (readsegsize > UINT32_MAX) ? UINT32_MAX : readsegsize;
3034 else
3035 temp = mp->mnt_maxreadcnt;
3036 ((struct x_constraints *)(mp->mnt_xinfo_ptr))->x_maxsegreadsize = (u_int32_t)temp;
3037
3038 if (writesegsize)
3039 temp = (writesegsize > UINT32_MAX) ? UINT32_MAX : writesegsize;
3040 else
3041 temp = mp->mnt_maxwritecnt;
3042 ((struct x_constraints *)(mp->mnt_xinfo_ptr))->x_maxsegwritesize = (u_int32_t)temp;
3043
3044 return (error);
3045 }
3046
3047 static struct klist fs_klist;
3048
3049 void
3050 vfs_event_init(void)
3051 {
3052
3053 klist_init(&fs_klist);
3054 }
3055
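/*
 * Post a file system event to any kqueue listeners.  The fsid and data
 * arguments are currently unused; only the event bits are delivered to
 * the knotes on fs_klist.
 */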
3056 void
3057 vfs_event_signal(fsid_t *fsid, u_int32_t event, intptr_t data)
3058 {
3059
3060 KNOTE(&fs_klist, event);
3061 }
3062
3063 /*
3064 * return the number of mounted filesystems.
3065 */
3066 static int
3067 sysctl_vfs_getvfscnt(void)
3068 {
3069 struct mount *mp;
3070 int ret = 0;
3071
3072 simple_lock(&mountlist_slock);
3073 CIRCLEQ_FOREACH(mp, &mountlist, mnt_list)
3074 ret++;
3075 simple_unlock(&mountlist_slock);
3076 return (ret);
3077 }
3078
3079 /*
3080 * Fill in the array of fsid_t's up to a max of 'count'; the actual
3081 * number filled in will be set in '*actual'.  If there are more fsid_t's
3082 * than will fit in fsidlst then ENOMEM will be returned and '*actual'
3083 * will have the actual count.
3084 * Having '*actual' filled out even in the error case is depended upon.
3085 */
3086 static int
3087 sysctl_vfs_getvfslist(fsid_t *fsidlst, int count, int *actual)
3088 {
3089 struct mount *mp;
3090
3091 *actual = 0;
3092 simple_lock(&mountlist_slock);
3093 CIRCLEQ_FOREACH(mp, &mountlist, mnt_list) {
3094 (*actual)++;
3095 if (*actual <= count)
3096 fsidlst[(*actual) - 1] = mp->mnt_stat.f_fsid;
3097 }
3098 simple_unlock(&mountlist_slock);
3099 return (*actual <= count ? 0 : ENOMEM);
3100 }
3101
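/*
 * sysctl handler for vfs.generic.vfsidlist: copy out the fsid_t of
 * every mounted file system, retrying if a mount is added while the
 * output buffer is being allocated.
 */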
3102 static int
3103 sysctl_vfs_vfslist SYSCTL_HANDLER_ARGS
3104 {
3105 int actual, error;
3106 size_t space;
3107 fsid_t *fsidlst;
3108
3109 /* This is a readonly node. */
3110 if (req->newptr != NULL)
3111 return (EPERM);
3112
3113 /* they are querying us so just return the space required. */
3114 if (req->oldptr == NULL) {
3115 req->oldidx = sysctl_vfs_getvfscnt() * sizeof(fsid_t);
3116 return 0;
3117 }
3118 again:
3119 /*
3120 * Retrieve an accurate count of the amount of space required to copy
3121 * out all the fsids in the system.
3122 */
3123 space = req->oldlen;
3124 req->oldlen = sysctl_vfs_getvfscnt() * sizeof(fsid_t);
3125
3126 /* they didn't give us enough space. */
3127 if (space < req->oldlen)
3128 return (ENOMEM);
3129
3130 MALLOC(fsidlst, fsid_t *, req->oldlen, M_TEMP, M_WAITOK);
3131 error = sysctl_vfs_getvfslist(fsidlst, req->oldlen / sizeof(fsid_t),
3132 &actual);
3133 /*
3134 * If we get back ENOMEM, then another mount has been added while we
3135 * slept in malloc above. If this is the case then try again.
3136 */
3137 if (error == ENOMEM) {
3138 FREE(fsidlst, M_TEMP);
3139 req->oldlen = space;
3140 goto again;
3141 }
3142 if (error == 0) {
3143 error = SYSCTL_OUT(req, fsidlst, actual * sizeof(fsid_t));
3144 }
3145 FREE(fsidlst, M_TEMP);
3146 return (error);
3147 }
3148
3149 /*
3150 * Do a sysctl by fsid.
3151 */
3152 static int
3153 sysctl_vfs_ctlbyfsid SYSCTL_HANDLER_ARGS
3154 {
3155 struct vfsidctl vc;
3156 struct mount *mp;
3157 struct statfs *sp;
3158 struct proc *p;
3159 int *name;
3160 int error, flags, namelen;
3161
3162 name = arg1;
3163 namelen = arg2;
3164 p = req->p;
3165
3166 error = SYSCTL_IN(req, &vc, sizeof(vc));
3167 if (error)
3168 return (error);
3169 if (vc.vc_vers != VFS_CTL_VERS1)
3170 return (EINVAL);
3171 mp = vfs_getvfs(&vc.vc_fsid);
3172 if (mp == NULL)
3173 return (ENOENT);
3174 /* reset so that the fs specific code can fetch it. */
3175 req->newidx = 0;
3176 /*
3177 * Note if this is a VFS_CTL then we pass the actual sysctl req
3178 * in for "oldp" so that the lower layer can DTRT and use the
3179 * SYSCTL_IN/OUT routines.
3180 */
3181 if (mp->mnt_op->vfs_sysctl != NULL) {
3182 error = mp->mnt_op->vfs_sysctl(name, namelen,
3183 req, NULL, NULL, 0, req->p);
3184 if (error != EOPNOTSUPP)
3185 return (error);
3186 }
3187 switch (name[0]) {
3188 case VFS_CTL_UMOUNT:
3189 VCTLTOREQ(&vc, req);
3190 error = SYSCTL_IN(req, &flags, sizeof(flags));
3191 if (error)
3192 break;
3193 error = safedounmount(mp, flags, p);
3194 break;
3195 case VFS_CTL_STATFS:
3196 VCTLTOREQ(&vc, req);
3197 error = SYSCTL_IN(req, &flags, sizeof(flags));
3198 if (error)
3199 break;
3200 sp = &mp->mnt_stat;
3201 if (((flags & MNT_NOWAIT) == 0 || (flags & MNT_WAIT)) &&
3202 (error = VFS_STATFS(mp, sp, p)))
3203 return (error);
3204 sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
3205 error = SYSCTL_OUT(req, sp, sizeof(*sp));
3206 break;
3207 default:
3208 return (EOPNOTSUPP);
3209 }
3210 return (error);
3211 }
3212
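/*
 * kqueue filter operations for file system events (EVFILT_FS): knotes
 * attach to the global fs_klist and accumulate the event bits posted
 * by vfs_event_signal().
 */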
3213 static int filt_fsattach(struct knote *kn);
3214 static void filt_fsdetach(struct knote *kn);
3215 static int filt_fsevent(struct knote *kn, long hint);
3216
3217 struct filterops fs_filtops =
3218 { 0, filt_fsattach, filt_fsdetach, filt_fsevent };
3219
3220 static int
3221 filt_fsattach(struct knote *kn)
3222 {
3223
3224 kn->kn_flags |= EV_CLEAR;
3225 KNOTE_ATTACH(&fs_klist, kn);
3226 return (0);
3227 }
3228
3229 static void
3230 filt_fsdetach(struct knote *kn)
3231 {
3232
3233 KNOTE_DETACH(&fs_klist, kn);
3234 }
3235
3236 static int
3237 filt_fsevent(struct knote *kn, long hint)
3238 {
3239
3240 kn->kn_fflags |= hint;
3241 return (kn->kn_fflags != 0);
3242 }
3243
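/*
 * sysctl handler for vfs.generic.noremotehang: get or set the
 * P_NOREMOTEHANG flag on the process identified by the supplied pid.
 * A negative pid clears the flag, a positive pid sets it; changing a
 * process other than the caller requires superuser privilege.
 */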
3244 static int
3245 sysctl_vfs_noremotehang SYSCTL_HANDLER_ARGS
3246 {
3247 int out, error;
3248 pid_t pid;
3249 size_t space;
3250 struct proc *p;
3251
3252 /* We need a pid. */
3253 if (req->newptr == NULL)
3254 return (EINVAL);
3255
3256 error = SYSCTL_IN(req, &pid, sizeof(pid));
3257 if (error)
3258 return (error);
3259
3260 p = pfind(pid < 0 ? -pid : pid);
3261 if (p == NULL)
3262 return (ESRCH);
3263
3264 /*
3265 * Fetching the value is ok, but we only fetch if the old
3266 * pointer is given.
3267 */
3268 if (req->oldptr != NULL) {
3269 out = !((p->p_flag & P_NOREMOTEHANG) == 0);
3270 error = SYSCTL_OUT(req, &out, sizeof(out));
3271 return (error);
3272 }
3273
3274 /* the superuser check offers us enough security. */
3275 if (p != req->p && suser(req->p->p_ucred, &req->p->p_acflag) != 0)
3276 return (EPERM);
3277
3278 if (pid < 0)
3279 p->p_flag &= ~P_NOREMOTEHANG;
3280 else
3281 p->p_flag |= P_NOREMOTEHANG;
3282
3283 return (0);
3284 }
3285 /* the vfs.generic. branch. */
3286 SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RW, 0, "vfs generic hinge");
3287 /* retrieve a list of mounted filesystem fsid_t's */
3288 SYSCTL_PROC(_vfs_generic, OID_AUTO, vfsidlist, CTLFLAG_RD,
3289 0, 0, sysctl_vfs_vfslist, "S,fsid", "List of mounted filesystem ids");
3290 /* perform operations on filesystem via fsid_t */
3291 SYSCTL_NODE(_vfs_generic, OID_AUTO, ctlbyfsid, CTLFLAG_RW,
3292 sysctl_vfs_ctlbyfsid, "ctlbyfsid");
3293 SYSCTL_PROC(_vfs_generic, OID_AUTO, noremotehang, CTLFLAG_RW,
3294 0, 0, sysctl_vfs_noremotehang, "I", "noremotehang");
3295