1 /*
2 * Copyright (c) 2000-2001 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * The contents of this file constitute Original Code as defined in and
7 * are subject to the Apple Public Source License Version 1.1 (the
8 * "License"). You may not use this file except in compliance with the
9 * License. Please obtain a copy of the License at
10 * http://www.apple.com/publicsource and read it before using this file.
11 *
12 * This Original Code and all software distributed under the License are
13 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
14 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
15 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
17 * License for the specific language governing rights and limitations
18 * under the License.
19 *
20 * @APPLE_LICENSE_HEADER_END@
21 */
22 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
23 /*
24 * Copyright (c) 1989, 1993
25 * The Regents of the University of California. All rights reserved.
26 * (c) UNIX System Laboratories, Inc.
27 * All or some portions of this file are derived from material licensed
28 * to the University of California by American Telephone and Telegraph
29 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
30 * the permission of UNIX System Laboratories, Inc.
31 *
32 * Redistribution and use in source and binary forms, with or without
33 * modification, are permitted provided that the following conditions
34 * are met:
35 * 1. Redistributions of source code must retain the above copyright
36 * notice, this list of conditions and the following disclaimer.
37 * 2. Redistributions in binary form must reproduce the above copyright
38 * notice, this list of conditions and the following disclaimer in the
39 * documentation and/or other materials provided with the distribution.
40 * 3. All advertising materials mentioning features or use of this software
41 * must display the following acknowledgement:
42 * This product includes software developed by the University of
43 * California, Berkeley and its contributors.
44 * 4. Neither the name of the University nor the names of its contributors
45 * may be used to endorse or promote products derived from this software
46 * without specific prior written permission.
47 *
48 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
49 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
50 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
51 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
52 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
53 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
54 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
55 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
56 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
57 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
58 * SUCH DAMAGE.
59 *
60 * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95
61 */
62
63 /*
64 * External virtual filesystem routines
65 */
66
67 #undef DIAGNOSTIC
68 #define DIAGNOSTIC 1
69
70 #include <sys/param.h>
71 #include <sys/systm.h>
72 #include <sys/proc.h>
73 #include <sys/mount.h>
74 #include <sys/time.h>
75 #include <sys/vnode.h>
76 #include <sys/stat.h>
77 #include <sys/namei.h>
78 #include <sys/ucred.h>
79 #include <sys/buf.h>
80 #include <sys/errno.h>
81 #include <sys/malloc.h>
82 #include <sys/domain.h>
83 #include <sys/mbuf.h>
84 #include <sys/syslog.h>
85 #include <sys/ubc.h>
86 #include <sys/vm.h>
87 #include <sys/sysctl.h>
88
89 #include <kern/assert.h>
90
91 #include <miscfs/specfs/specdev.h>
92
93 #include <mach/mach_types.h>
94 #include <mach/memory_object_types.h>
95
96
97 enum vtype iftovt_tab[16] = {
98 VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
99 VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
100 };
101 int vttoif_tab[9] = {
102 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
103 S_IFSOCK, S_IFIFO, S_IFMT,
104 };
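/*
 * These two tables back the IFTOVT()/VTTOIF() conversion macros in
 * <sys/vnode.h>, which translate between the S_IFMT file-type bits
 * of a mode and the VFS-layer enum vtype.  Illustrative sketch only:
 *
 *	IFTOVT(S_IFDIR) -> iftovt_tab[(S_IFDIR & S_IFMT) >> 12] == VDIR
 *	VTTOIF(VREG)    -> vttoif_tab[VREG] == S_IFREG
 */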
105
106 static void vfree(struct vnode *vp);
107 static void vinactive(struct vnode *vp);
108 static int vnreclaim(int count);
109 extern kern_return_t
110 adjust_vm_object_cache(vm_size_t oval, vm_size_t nval);
111
112 TAILQ_HEAD(freelst, vnode) vnode_free_list; /* vnode free list */
113 TAILQ_HEAD(inactivelst, vnode) vnode_inactive_list; /* vnode inactive list */
114 struct mntlist mountlist; /* mounted filesystem list */
115
116 #if DIAGNOSTIC
117 #define VLISTCHECK(fun, vp, list) \
118 if ((vp)->v_freelist.tqe_prev == (struct vnode **)0xdeadb) \
119 panic("%s: %s vnode not on %slist", (fun), (list), (list));
120
121 #define VINACTIVECHECK(fun, vp, expected) \
122 do { \
123 int __is_inactive = ISSET((vp)->v_flag, VUINACTIVE); \
124 if (__is_inactive ^ expected) \
125 panic("%s: %sinactive vnode, expected %s", (fun), \
126 __is_inactive? "" : "not ", \
127 expected? "inactive": "not inactive"); \
128 } while(0)
129 #else
130 #define VLISTCHECK(fun, vp, list)
131 #define VINACTIVECHECK(fun, vp, expected)
132 #endif /* DIAGNOSTIC */
133
134 #define VLISTNONE(vp) \
135 do { \
136 (vp)->v_freelist.tqe_next = (struct vnode *)0; \
137 (vp)->v_freelist.tqe_prev = (struct vnode **)0xdeadb; \
138 } while(0)
139
140 #define VONLIST(vp) \
141 ((vp)->v_freelist.tqe_prev != (struct vnode **)0xdeadb)
142
143 /* remove a vnode from free vnode list */
144 #define VREMFREE(fun, vp) \
145 do { \
146 VLISTCHECK((fun), (vp), "free"); \
147 TAILQ_REMOVE(&vnode_free_list, (vp), v_freelist); \
148 VLISTNONE((vp)); \
149 freevnodes--; \
150 } while(0)
151
152 /* remove a vnode from inactive vnode list */
153 #define VREMINACTIVE(fun, vp) \
154 do { \
155 VLISTCHECK((fun), (vp), "inactive"); \
156 VINACTIVECHECK((fun), (vp), VUINACTIVE); \
157 TAILQ_REMOVE(&vnode_inactive_list, (vp), v_freelist); \
158 CLR((vp)->v_flag, VUINACTIVE); \
159 VLISTNONE((vp)); \
160 inactivevnodes--; \
161 } while(0)
162
163 #define VORECLAIM_ENABLE(vp) \
164 do { \
165 if (ISSET((vp)->v_flag, VORECLAIM)) \
166 panic("vm object reclaim already"); \
167 SET((vp)->v_flag, VORECLAIM); \
168 } while(0)
169
170 #define VORECLAIM_DISABLE(vp) \
171 do { \
172 CLR((vp)->v_flag, VORECLAIM); \
173 if (ISSET((vp)->v_flag, VXWANT)) { \
174 CLR((vp)->v_flag, VXWANT); \
175 wakeup((caddr_t)(vp)); \
176 } \
177 } while(0)
178
179 /*
180 * Have to declare first two locks as actual data even if !MACH_SLOCKS, since
181 * pointers to them get passed around.
182 */
183 simple_lock_data_t mountlist_slock;
184 simple_lock_data_t mntvnode_slock;
185 decl_simple_lock_data(,mntid_slock);
186 decl_simple_lock_data(,vnode_free_list_slock);
187 decl_simple_lock_data(,spechash_slock);
188
189 /*
190 * vnodetarget is the number of vnodes we expect to get back
191 * from the inactive vnode list and VM object cache.
192 * As vnreclaim() is mainly a CPU-bound operation, this number
193 * could be higher on faster processors.
194 * Having this number too high introduces longer delays in
195 * the execution of getnewvnode().
196 */
197 unsigned long vnodetarget; /* target for vnreclaim() */
198 #define VNODE_FREE_TARGET 20 /* Default value for vnodetarget */
199
200 /*
201 * We need quite a few vnodes on the free list to sustain the
202 * rapid stat() the compilation process does, and still benefit from the name
203 * cache. Having too few vnodes on the free list causes serious disk
204 * thrashing as we cycle through them.
205 */
206 #define VNODE_FREE_MIN 300 /* freelist should have at least this many */
207
208 /*
209 * We need to get vnodes back from the VM object cache when a certain #
210 * of vnodes are reused from the freelist. This is essential for the
211 * caching to be effective in the namecache and the buffer cache [for the
212 * metadata].
213 */
214 #define VNODE_TOOMANY_REUSED (VNODE_FREE_MIN/4)
215
216 /*
217 * If we have enough vnodes on the freelist we do not want to reclaim
218 * the vnodes from the VM object cache.
219 */
220 #define VNODE_FREE_ENOUGH (VNODE_FREE_MIN + (VNODE_FREE_MIN/2))
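/*
 * Rough summary of how the knobs above interact in getnewvnode():
 * a new vnode is MALLOCed only while numvnodes is below desiredvnodes
 * and the free list holds fewer than 2 * VNODE_FREE_MIN entries;
 * otherwise one is reused from the free list.  Before reusing,
 * vnreclaim(vnodetarget) is run whenever the free list has dropped
 * below VNODE_FREE_MIN, and again after every VNODE_TOOMANY_REUSED
 * reuses unless the free list already holds VNODE_FREE_ENOUGH entries.
 */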
221
222 /*
223 * Initialize the vnode management data structures.
224 */
225 __private_extern__ void
226 vntblinit()
227 {
228 extern struct lock__bsd__ exchangelock;
229
230 simple_lock_init(&mountlist_slock);
231 simple_lock_init(&mntvnode_slock);
232 simple_lock_init(&mntid_slock);
233 simple_lock_init(&spechash_slock);
234 TAILQ_INIT(&vnode_free_list);
235 simple_lock_init(&vnode_free_list_slock);
236 TAILQ_INIT(&vnode_inactive_list);
237 CIRCLEQ_INIT(&mountlist);
238 lockinit(&exchangelock, PVFS, "exchange", 0, 0);
239
240 if (!vnodetarget)
241 vnodetarget = VNODE_FREE_TARGET;
242
243 /*
244 * Scale the vm_object_cache to accommodate the vnodes
245 * we want to cache
246 */
247 (void) adjust_vm_object_cache(0, desiredvnodes - VNODE_FREE_MIN);
248 }
249
250 /* Reset the VM Object Cache with the values passed in */
251 __private_extern__ kern_return_t
252 reset_vmobjectcache(unsigned int val1, unsigned int val2)
253 {
254 vm_size_t oval = val1 - VNODE_FREE_MIN;
255 vm_size_t nval;
256
257 if(val2 < VNODE_FREE_MIN)
258 nval = 0;
259 else
260 nval = val2 - VNODE_FREE_MIN;
261
262 return(adjust_vm_object_cache(oval, nval));
263 }
264
265 /*
266 * Mark a mount point as busy. Used to synchronize access and to delay
267 * unmounting. Interlock is not released on failure.
268 */
269 int
270 vfs_busy(mp, flags, interlkp, p)
271 struct mount *mp;
272 int flags;
273 struct slock *interlkp;
274 struct proc *p;
275 {
276 int lkflags;
277
278 if (mp->mnt_kern_flag & MNTK_UNMOUNT) {
279 if (flags & LK_NOWAIT)
280 return (ENOENT);
281 mp->mnt_kern_flag |= MNTK_MWAIT;
282 if (interlkp)
283 simple_unlock(interlkp);
284 /*
285 * Since all busy locks are shared except the exclusive
286 * lock granted when unmounting, the only place that a
287 * wakeup needs to be done is at the release of the
288 * exclusive lock at the end of dounmount.
289 */
290 sleep((caddr_t)mp, PVFS);
291 if (interlkp)
292 simple_lock(interlkp);
293 return (ENOENT);
294 }
295 lkflags = LK_SHARED;
296 if (interlkp)
297 lkflags |= LK_INTERLOCK;
298 if (lockmgr(&mp->mnt_lock, lkflags, interlkp, p))
299 panic("vfs_busy: unexpected lock failure");
300 return (0);
301 }
302
303 /*
304 * Free a busy filesystem.
305 */
306 void
307 vfs_unbusy(mp, p)
308 struct mount *mp;
309 struct proc *p;
310 {
311
312 lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, p);
313 }
314
315 /*
316 * Lookup a filesystem type, and if found allocate and initialize
317 * a mount structure for it.
318 *
319 * Devname is usually updated by mount(8) after booting.
320 */
321 int
322 vfs_rootmountalloc(fstypename, devname, mpp)
323 char *fstypename;
324 char *devname;
325 struct mount **mpp;
326 {
327 struct proc *p = current_proc(); /* XXX */
328 struct vfsconf *vfsp;
329 struct mount *mp;
330
331 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
332 if (!strcmp(vfsp->vfc_name, fstypename))
333 break;
334 if (vfsp == NULL)
335 return (ENODEV);
336 mp = _MALLOC_ZONE((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK);
337 bzero((char *)mp, (u_long)sizeof(struct mount));
338
339 /* Initialize the default IO constraints */
340 mp->mnt_maxreadcnt = mp->mnt_maxwritecnt = MAXPHYS;
341 mp->mnt_segreadcnt = mp->mnt_segwritecnt = 32;
342
343 lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, 0);
344 (void)vfs_busy(mp, LK_NOWAIT, 0, p);
345 LIST_INIT(&mp->mnt_vnodelist);
346 mp->mnt_vfc = vfsp;
347 mp->mnt_op = vfsp->vfc_vfsops;
348 mp->mnt_flag = MNT_RDONLY;
349 mp->mnt_vnodecovered = NULLVP;
350 vfsp->vfc_refcount++;
351 mp->mnt_stat.f_type = vfsp->vfc_typenum;
352 mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
353 strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
354 mp->mnt_stat.f_mntonname[0] = '/';
355 (void) copystr(devname, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, 0);
356 *mpp = mp;
357 return (0);
358 }
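/*
 * Illustrative caller sketch (not part of this file): a filesystem's
 * mountroot routine typically starts out roughly as
 *
 *	struct mount *mp;
 *	if (error = vfs_rootmountalloc("hfs", "root_device", &mp))
 *		return (error);
 *
 * and then fills in the mount before putting it on mountlist.
 * The "hfs" and "root_device" names are examples only.
 */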
359
360 /*
361 * Find an appropriate filesystem to use for the root. If a filesystem
362 * has not been preselected, walk through the list of known filesystems
363 * trying those that have mountroot routines, and try them until one
364 * works or we have tried them all.
365 */
366 int
367 vfs_mountroot()
368 {
369 struct vfsconf *vfsp;
370 extern int (*mountroot)(void);
371 int error;
372
373 if (mountroot != NULL) {
374 error = (*mountroot)();
375 return (error);
376 }
377
378 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
379 if (vfsp->vfc_mountroot == NULL)
380 continue;
381 if ((error = (*vfsp->vfc_mountroot)()) == 0)
382 return (0);
383 if (error != EINVAL)
384 printf("%s_mountroot failed: %d\n", vfsp->vfc_name, error);
385 }
386 return (ENODEV);
387 }
388
389 /*
390 * Lookup a mount point by filesystem identifier.
391 */
392 struct mount *
393 vfs_getvfs(fsid)
394 fsid_t *fsid;
395 {
396 register struct mount *mp;
397
398 simple_lock(&mountlist_slock);
399 for (mp = mountlist.cqh_first; mp != (void *)&mountlist;
400 mp = mp->mnt_list.cqe_next) {
401 if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
402 mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
403 simple_unlock(&mountlist_slock);
404 return (mp);
405 }
406 }
407 simple_unlock(&mountlist_slock);
408 return ((struct mount *)0);
409 }
410
411 /*
412 * Get a new unique fsid
413 */
414 void
415 vfs_getnewfsid(mp)
416 struct mount *mp;
417 {
418 static u_short xxxfs_mntid;
419
420 fsid_t tfsid;
421 int mtype;
422
423 simple_lock(&mntid_slock);
424 mtype = mp->mnt_vfc->vfc_typenum;
425 mp->mnt_stat.f_fsid.val[0] = makedev(nblkdev + mtype, 0);
426 mp->mnt_stat.f_fsid.val[1] = mtype;
427 if (xxxfs_mntid == 0)
428 ++xxxfs_mntid;
429 tfsid.val[0] = makedev(nblkdev + mtype, xxxfs_mntid);
430 tfsid.val[1] = mtype;
431 if (mountlist.cqh_first != (void *)&mountlist) {
432 while (vfs_getvfs(&tfsid)) {
433 tfsid.val[0]++;
434 xxxfs_mntid++;
435 }
436 }
437 mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
438 simple_unlock(&mntid_slock);
439 }
440
441 /*
442 * Set vnode attributes to VNOVAL
443 */
444 void
445 vattr_null(vap)
446 register struct vattr *vap;
447 {
448
449 vap->va_type = VNON;
450 vap->va_size = vap->va_bytes = VNOVAL;
451 vap->va_mode = vap->va_nlink = vap->va_uid = vap->va_gid =
452 vap->va_fsid = vap->va_fileid =
453 vap->va_blocksize = vap->va_rdev =
454 vap->va_atime.tv_sec = vap->va_atime.tv_nsec =
455 vap->va_mtime.tv_sec = vap->va_mtime.tv_nsec =
456 vap->va_ctime.tv_sec = vap->va_ctime.tv_nsec =
457 vap->va_flags = vap->va_gen = VNOVAL;
458 vap->va_vaflags = 0;
459 }
460
461 /*
462 * Routines having to do with the management of the vnode table.
463 */
464 extern int (**dead_vnodeop_p)(void *);
465 static void vclean __P((struct vnode *vp, int flag, struct proc *p));
466 extern void vgonel __P((struct vnode *vp, struct proc *p));
467 long numvnodes, freevnodes;
468 long inactivevnodes;
469 long vnode_reclaim_tried;
470 long vnode_objects_reclaimed;
471
472
473 extern struct vattr va_null;
474
475 /*
476 * Return the next vnode from the free list.
477 */
478 int
479 getnewvnode(tag, mp, vops, vpp)
480 enum vtagtype tag;
481 struct mount *mp;
482 int (**vops)(void *);
483 struct vnode **vpp;
484 {
485 struct proc *p = current_proc(); /* XXX */
486 struct vnode *vp;
487 int cnt, didretry = 0;
488 static int reused = 0; /* track the reuse rate */
489 int reclaimhits = 0;
490
491 retry:
492 simple_lock(&vnode_free_list_slock);
493 /*
494 * MALLOC a vnode if the number of vnodes has not reached the desired
495 * value and the number on the free list is still reasonable;
496 * otherwise reuse one from the freelist, even though we may evict a
497 * name cache entry, to limit the number of vnodes that accumulate
498 * (vnodes tie up wired memory and are never garbage collected).
499 */
500 if (numvnodes < desiredvnodes && (freevnodes < (2 * VNODE_FREE_MIN))) {
501 numvnodes++;
502 simple_unlock(&vnode_free_list_slock);
503 MALLOC_ZONE(vp, struct vnode *, sizeof *vp, M_VNODE, M_WAITOK);
504 bzero((char *)vp, sizeof *vp);
505 VLISTNONE(vp); /* avoid double queue removal */
506 simple_lock_init(&vp->v_interlock);
507 goto done;
508 }
509
510 /*
511 * Once the desired number of vnodes are allocated,
512 * we start reusing the vnodes.
513 */
514 if (freevnodes < VNODE_FREE_MIN) {
515 /*
516 * if we are low on vnodes on the freelist attempt to get
517 * some back from the inactive list and VM object cache
518 */
519 simple_unlock(&vnode_free_list_slock);
520 (void)vnreclaim(vnodetarget);
521 simple_lock(&vnode_free_list_slock);
522 }
523 if (numvnodes >= desiredvnodes && reused > VNODE_TOOMANY_REUSED) {
524 reused = 0;
525 if (freevnodes < VNODE_FREE_ENOUGH) {
526 simple_unlock(&vnode_free_list_slock);
527 (void)vnreclaim(vnodetarget);
528 simple_lock(&vnode_free_list_slock);
529 }
530 }
531
532 for (cnt = 0, vp = vnode_free_list.tqh_first;
533 vp != NULLVP; cnt++, vp = vp->v_freelist.tqe_next) {
534 if (simple_lock_try(&vp->v_interlock)) {
535 /* got the interlock */
536 if (ISSET(vp->v_flag, VORECLAIM)) {
537 /* skip over the vnodes that are being reclaimed */
538 simple_unlock(&vp->v_interlock);
539 reclaimhits++;
540 } else
541 break;
542 }
543 }
544
545 /*
546 * Unless this is a bad time of the month, at most
547 * the first NCPUS items on the free list are
548 * locked, so this is close enough to being empty.
549 */
550 if (vp == NULLVP) {
551 simple_unlock(&vnode_free_list_slock);
552 if (!(didretry++) && (vnreclaim(vnodetarget) > 0))
553 goto retry;
554 tablefull("vnode");
555 log(LOG_EMERG, "%d vnodes locked, %d desired, %d numvnodes, "
556 "%d free, %d inactive, %d being reclaimed\n",
557 cnt, desiredvnodes, numvnodes, freevnodes, inactivevnodes,
558 reclaimhits);
559 *vpp = 0;
560 return (ENFILE);
561 }
562
563 if (vp->v_usecount)
564 panic("free vnode isn't: v_type = %d, v_usecount = %d?",
565 vp->v_type, vp->v_usecount);
566
567 VREMFREE("getnewvnode", vp);
568 reused++;
569 simple_unlock(&vnode_free_list_slock);
570 vp->v_lease = NULL;
571 cache_purge(vp);
572 if (vp->v_type != VBAD)
573 vgonel(vp, p); /* clean and reclaim the vnode */
574 else
575 simple_unlock(&vp->v_interlock);
576 #if DIAGNOSTIC
577 if (vp->v_data)
578 panic("cleaned vnode isn't");
579 {
580 int s = splbio();
581 if (vp->v_numoutput)
582 panic("Clean vnode has pending I/O's");
583 splx(s);
584 }
585 #endif
586 if (UBCINFOEXISTS(vp))
587 panic("getnewvnode: ubcinfo not cleaned");
588 else
589 vp->v_ubcinfo = 0;
590
591 vp->v_lastr = -1;
592 vp->v_ralen = 0;
593 vp->v_maxra = 0;
594 vp->v_lastw = 0;
595 vp->v_ciosiz = 0;
596 vp->v_cstart = 0;
597 vp->v_clen = 0;
598 vp->v_socket = 0;
599
600 done:
601 vp->v_flag = VSTANDARD;
602 vp->v_type = VNON;
603 vp->v_tag = tag;
604 vp->v_op = vops;
605 insmntque(vp, mp);
606 *vpp = vp;
607 vp->v_usecount = 1;
608 vp->v_data = 0;
609 return (0);
610 }
611
612 /*
613 * Move a vnode from one mount queue to another.
614 */
615 void
616 insmntque(vp, mp)
617 struct vnode *vp;
618 struct mount *mp;
619 {
620
621 simple_lock(&mntvnode_slock);
622 /*
623 * Delete from old mount point vnode list, if on one.
624 */
625 if (vp->v_mount != NULL)
626 LIST_REMOVE(vp, v_mntvnodes);
627 /*
628 * Insert into list of vnodes for the new mount point, if available.
629 */
630 if ((vp->v_mount = mp) != NULL)
631 LIST_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes);
632 simple_unlock(&mntvnode_slock);
633 }
634
635 __inline void
636 vpwakeup(struct vnode *vp)
637 {
638 if (vp) {
639 if (--vp->v_numoutput < 0)
640 panic("vpwakeup: neg numoutput");
641 if ((vp->v_flag & VBWAIT || vp->v_flag & VTHROTTLED)
642 && vp->v_numoutput <= 0) {
643 vp->v_flag &= ~(VBWAIT|VTHROTTLED);
644 wakeup((caddr_t)&vp->v_numoutput);
645 }
646 }
647 }
648
649 /*
650 * Update outstanding I/O count and do wakeup if requested.
651 */
652 void
653 vwakeup(bp)
654 register struct buf *bp;
655 {
656 CLR(bp->b_flags, B_WRITEINPROG);
657 vpwakeup(bp->b_vp);
658 }
659
660 /*
661 * Flush out and invalidate all buffers associated with a vnode.
662 * Called with the underlying object locked.
663 */
664 int
665 vinvalbuf(vp, flags, cred, p, slpflag, slptimeo)
666 register struct vnode *vp;
667 int flags;
668 struct ucred *cred;
669 struct proc *p;
670 int slpflag, slptimeo;
671 {
672 register struct buf *bp;
673 struct buf *nbp, *blist;
674 int s, error = 0;
675
676 if (flags & V_SAVE) {
677 if (error = VOP_FSYNC(vp, cred, MNT_WAIT, p)) {
678 return (error);
679 }
680 if (vp->v_dirtyblkhd.lh_first)
681 panic("vinvalbuf: dirty bufs");
682 }
683
684 for (;;) {
685 if ((blist = vp->v_cleanblkhd.lh_first) && flags & V_SAVEMETA)
686 while (blist && blist->b_lblkno < 0)
687 blist = blist->b_vnbufs.le_next;
688 if (!blist && (blist = vp->v_dirtyblkhd.lh_first) &&
689 (flags & V_SAVEMETA))
690 while (blist && blist->b_lblkno < 0)
691 blist = blist->b_vnbufs.le_next;
692 if (!blist)
693 break;
694
695 for (bp = blist; bp; bp = nbp) {
696 nbp = bp->b_vnbufs.le_next;
697 if (flags & V_SAVEMETA && bp->b_lblkno < 0)
698 continue;
699 s = splbio();
700 if (ISSET(bp->b_flags, B_BUSY)) {
701 SET(bp->b_flags, B_WANTED);
702 error = tsleep((caddr_t)bp,
703 slpflag | (PRIBIO + 1), "vinvalbuf",
704 slptimeo);
705 splx(s);
706 if (error) {
707 return (error);
708 }
709 break;
710 }
711 bremfree(bp);
712 SET(bp->b_flags, B_BUSY);
713 splx(s);
714 /*
715 * XXX Since there are no node locks for NFS, I believe
716 * there is a slight chance that a delayed write will
717 * occur while sleeping just above, so check for it.
718 */
719 if (ISSET(bp->b_flags, B_DELWRI) && (flags & V_SAVE)) {
720 (void) VOP_BWRITE(bp);
721 break;
722 }
723 SET(bp->b_flags, B_INVAL);
724 brelse(bp);
725 }
726 }
727 if (!(flags & V_SAVEMETA) &&
728 (vp->v_dirtyblkhd.lh_first || vp->v_cleanblkhd.lh_first))
729 panic("vinvalbuf: flush failed");
730 return (0);
731 }
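/*
 * Typical usage, as vclean() below does when tearing a vnode down:
 *
 *	error = vinvalbuf(vp, V_SAVE, cred, p, 0, 0);
 *
 * V_SAVE flushes dirty data (via VOP_FSYNC) before the buffers are
 * invalidated; V_SAVEMETA instead leaves the meta-data buffers
 * (those with negative logical block numbers) attached to the vnode.
 */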
732
733 /*
734 * Create a vnode for a block device.
735 * Used for root filesystem, argdev, and swap areas.
736 * Also used for memory file system special devices.
737 */
738 int
739 bdevvp(dev, vpp)
740 dev_t dev;
741 struct vnode **vpp;
742 {
743 register struct vnode *vp;
744 struct vnode *nvp;
745 int error;
746
747 if (dev == NODEV) {
748 *vpp = NULLVP;
749 return (ENODEV);
750 }
751 error = getnewvnode(VT_NON, (struct mount *)0, spec_vnodeop_p, &nvp);
752 if (error) {
753 *vpp = NULLVP;
754 return (error);
755 }
756 vp = nvp;
757 vp->v_type = VBLK;
758 if (nvp = checkalias(vp, dev, (struct mount *)0)) {
759 vput(vp);
760 vp = nvp;
761 }
762 *vpp = vp;
763 return (0);
764 }
765
766 /*
767 * Check to see if the new vnode represents a special device
768 * for which we already have a vnode (either because of
769 * bdevvp() or because of a different vnode representing
770 * the same block device). If such an alias exists, deallocate
771 * the existing contents and return the aliased vnode. The
772 * caller is responsible for filling it with its new contents.
773 */
774 struct vnode *
775 checkalias(nvp, nvp_rdev, mp)
776 register struct vnode *nvp;
777 dev_t nvp_rdev;
778 struct mount *mp;
779 {
780 struct proc *p = current_proc(); /* XXX */
781 struct vnode *vp;
782 struct vnode **vpp;
783 struct specinfo * bufhold;
784 int buffree = 1;
785
786 if (nvp->v_type != VBLK && nvp->v_type != VCHR)
787 return (NULLVP);
788
789 bufhold = (struct specinfo *)_MALLOC_ZONE(sizeof(struct specinfo),
790 M_VNODE, M_WAITOK);
791 vpp = &speclisth[SPECHASH(nvp_rdev)];
792 loop:
793 simple_lock(&spechash_slock);
794 for (vp = *vpp; vp; vp = vp->v_specnext) {
795 if (nvp_rdev != vp->v_rdev || nvp->v_type != vp->v_type)
796 continue;
797 /*
798 * Alias, but not in use, so flush it out.
799 */
800 simple_lock(&vp->v_interlock);
801 if (vp->v_usecount == 0) {
802 simple_unlock(&spechash_slock);
803 vgonel(vp, p);
804 goto loop;
805 }
806 if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, p)) {
807 simple_unlock(&spechash_slock);
808 goto loop;
809 }
810 break;
811 }
812 if (vp == NULL || vp->v_tag != VT_NON) {
813 nvp->v_specinfo = bufhold;
814 buffree = 0; /* buffer used */
815 bzero(nvp->v_specinfo, sizeof(struct specinfo));
816 nvp->v_rdev = nvp_rdev;
817 nvp->v_hashchain = vpp;
818 nvp->v_specnext = *vpp;
819 nvp->v_specflags = 0;
820 simple_unlock(&spechash_slock);
821 *vpp = nvp;
822 if (vp != NULLVP) {
823 nvp->v_flag |= VALIASED;
824 vp->v_flag |= VALIASED;
825 vput(vp);
826 }
827 /* Since buffer is used just return */
828 return (NULLVP);
829 }
830 simple_unlock(&spechash_slock);
831 VOP_UNLOCK(vp, 0, p);
832 simple_lock(&vp->v_interlock);
833 vclean(vp, 0, p);
834 vp->v_op = nvp->v_op;
835 vp->v_tag = nvp->v_tag;
836 nvp->v_type = VNON;
837 insmntque(vp, mp);
838 if (buffree)
839 _FREE_ZONE((void *)bufhold, sizeof (struct specinfo), M_VNODE);
840 return (vp);
841 }
842
843 /*
844 * Get a reference on a particular vnode and lock it if requested.
845 * If the vnode was on the inactive list, remove it from the list.
846 * If the vnode was on the free list, remove it from the list and
847 * move it to inactive list as needed.
848 * The vnode lock bit is set if the vnode is being eliminated in
849 * vgone. The process is awakened when the transition is completed,
850 * and an error returned to indicate that the vnode is no longer
851 * usable (possibly having been changed to a new file system type).
852 */
853 int
854 vget(vp, flags, p)
855 struct vnode *vp;
856 int flags;
857 struct proc *p;
858 {
859 int error = 0;
860
861 retry:
862
863 /*
864 * If the vnode is in the process of being cleaned out for
865 * another use, we wait for the cleaning to finish and then
866 * return failure. Cleaning is determined by checking that
867 * the VXLOCK flag is set.
868 */
869 if ((flags & LK_INTERLOCK) == 0)
870 simple_lock(&vp->v_interlock);
871 if ((vp->v_flag & VXLOCK) || (vp->v_flag & VORECLAIM)) {
872 vp->v_flag |= VXWANT;
873 simple_unlock(&vp->v_interlock);
874 (void)tsleep((caddr_t)vp, PINOD, "vget", 0);
875 return (ENOENT);
876 }
877
878 /*
879 * vnode is being terminated.
880 * wait for vnode_pager_no_senders() to clear VTERMINATE
881 */
882 if (ISSET(vp->v_flag, VTERMINATE)) {
883 SET(vp->v_flag, VTERMWANT);
884 simple_unlock(&vp->v_interlock);
885 (void)tsleep((caddr_t)&vp->v_ubcinfo, PINOD, "vclean", 0);
886 return (ENOENT);
887 }
888
889 /*
890 * if the vnode is being initialized,
891 * wait for it to finish initialization
892 */
893 if (ISSET(vp->v_flag, VUINIT)) {
894 if (ISSET(vp->v_flag, VUINIT)) {
895 SET(vp->v_flag, VUWANT);
896 simple_unlock(&vp->v_interlock);
897 (void) tsleep((caddr_t)vp, PINOD, "vget2", 0);
898 goto retry;
899 }
900 }
901
902 simple_lock(&vnode_free_list_slock);
903 if (vp->v_usecount == 0) {
904 /* If on the free list, remove it from there */
905 if (VONLIST(vp))
906 VREMFREE("vget", vp);
907 } else {
908 /* If on the inactive list, remove it from there */
909 if ((vp->v_usecount == 1) && UBCINFOEXISTS(vp)) {
910 if (VONLIST(vp))
911 VREMINACTIVE("vget", vp);
912 }
913 }
914
915 /* The vnode should not be on the inactive list here */
916 VINACTIVECHECK("vget", vp, 0);
917
918 simple_unlock(&vnode_free_list_slock);
919
920 if (++vp->v_usecount <= 0)
921 panic("vget: v_usecount");
922
923 /*
924 * Recover named reference as needed
925 */
926 if (UBCISVALID(vp) && !ubc_issetflags(vp, UI_HASOBJREF)) {
927 simple_unlock(&vp->v_interlock);
928 if (ubc_getobject(vp, UBC_HOLDOBJECT)) {
929 error = ENOENT;
930 goto errout;
931 }
932 simple_lock(&vp->v_interlock);
933 }
934
935 if (flags & LK_TYPE_MASK) {
936 if (error = vn_lock(vp, flags | LK_INTERLOCK, p))
937 goto errout;
938 return (0);
939 }
940
941 if ((flags & LK_INTERLOCK) == 0)
942 simple_unlock(&vp->v_interlock);
943 return (0);
944
945 errout:
946 /*
947 * If the vnode was not active in the first place, we
948 * must not call vrele(), as VOP_INACTIVE() is not
949 * required.
950 * So the relevant part of vrele() is inlined here.
951 */
952 simple_lock(&vp->v_interlock);
953 if (--vp->v_usecount == 1) {
954 if (UBCINFOEXISTS(vp)) {
955 vinactive(vp);
956 simple_unlock(&vp->v_interlock);
957 return (error);
958 }
959 }
960 if (vp->v_usecount > 0) {
961 simple_unlock(&vp->v_interlock);
962 return (error);
963 }
964 if (vp->v_usecount < 0)
965 panic("vget: negative usecount (%d)", vp->v_usecount);
966 vfree(vp);
967 simple_unlock(&vp->v_interlock);
968 return (error);
969 }
970
971 /*
972 * Get a pager reference on the particular vnode.
973 *
974 * This is called from ubc_info_init() and it is assumed that
975 * the vnode is neither on the free list nor on the inactive list.
976 * It is also assumed that the vnode is neither being recycled
977 * by vgonel nor being terminated by vnode_pager_vrele().
978 *
979 * The vnode interlock is NOT held by the caller.
980 */
981 __private_extern__ int
982 vnode_pager_vget(vp)
983 struct vnode *vp;
984 {
985 simple_lock(&vp->v_interlock);
986 if (UBCINFOMISSING(vp))
987 panic("vnode_pager_vget: stolen ubc_info");
988
989 if (!UBCINFOEXISTS(vp))
990 panic("vnode_pager_vget: lost ubc_info");
991
992 if ((vp->v_flag & VXLOCK) || (vp->v_flag & VORECLAIM))
993 panic("vnode_pager_vget: already being reclaimed");
994
995 if (ISSET(vp->v_flag, VTERMINATE))
996 panic("vnode_pager_vget: already being terminated");
997
998 simple_lock(&vnode_free_list_slock);
999 /* The vnode should not be on ANY list */
1000 if (VONLIST(vp))
1001 panic("vnode_pager_vget: still on the list");
1002
1003 /* The vnode should not be on the inactive list here */
1004 VINACTIVECHECK("vnode_pager_vget", vp, 0);
1005 simple_unlock(&vnode_free_list_slock);
1006
1007 /* After all those checks, now do the real work :-) */
1008 if (++vp->v_usecount <= 0)
1009 panic("vnode_pager_vget: v_usecount");
1010 simple_unlock(&vp->v_interlock);
1011
1012 return (0);
1013 }
1014
1015 /*
1016 * Stubs to use when there is no locking to be done on the underlying object.
1017 * A minimal shared lock is necessary to ensure that the underlying object
1018 * is not revoked while an operation is in progress. So, an active shared
1019 * count is maintained in an auxiliary vnode lock structure.
1020 */
1021 int
1022 vop_nolock(ap)
1023 struct vop_lock_args /* {
1024 struct vnode *a_vp;
1025 int a_flags;
1026 struct proc *a_p;
1027 } */ *ap;
1028 {
1029 #ifdef notyet
1030 /*
1031 * This code cannot be used until all the non-locking filesystems
1032 * (notably NFS) are converted to properly lock and release nodes.
1033 * Also, certain vnode operations change the locking state within
1034 * the operation (create, mknod, remove, link, rename, mkdir, rmdir,
1035 * and symlink). Ideally these operations should not change the
1036 * lock state, but should be changed to let the caller of the
1037 * function unlock them. Otherwise all intermediate vnode layers
1038 * (such as union, umapfs, etc) must catch these functions to do
1039 * the necessary locking at their layer. Note that the inactive
1040 * and lookup operations also change their lock state, but this
1041 * cannot be avoided, so these two operations will always need
1042 * to be handled in intermediate layers.
1043 */
1044 struct vnode *vp = ap->a_vp;
1045 int vnflags, flags = ap->a_flags;
1046
1047 if (vp->v_vnlock == NULL) {
1048 if ((flags & LK_TYPE_MASK) == LK_DRAIN)
1049 return (0);
1050 MALLOC_ZONE(vp->v_vnlock, struct lock__bsd__ *,
1051 sizeof(struct lock__bsd__), M_VNODE, M_WAITOK);
1052 lockinit(vp->v_vnlock, PVFS, "vnlock", 0, 0);
1053 }
1054 switch (flags & LK_TYPE_MASK) {
1055 case LK_DRAIN:
1056 vnflags = LK_DRAIN;
1057 break;
1058 case LK_EXCLUSIVE:
1059 case LK_SHARED:
1060 vnflags = LK_SHARED;
1061 break;
1062 case LK_UPGRADE:
1063 case LK_EXCLUPGRADE:
1064 case LK_DOWNGRADE:
1065 return (0);
1066 case LK_RELEASE:
1067 default:
1068 panic("vop_nolock: bad operation %d", flags & LK_TYPE_MASK);
1069 }
1070 if (flags & LK_INTERLOCK)
1071 vnflags |= LK_INTERLOCK;
1072 return(lockmgr(vp->v_vnlock, vnflags, &vp->v_interlock, ap->a_p));
1073 #else /* for now */
1074 /*
1075 * Since we are not using the lock manager, we must clear
1076 * the interlock here.
1077 */
1078 if (ap->a_flags & LK_INTERLOCK)
1079 simple_unlock(&ap->a_vp->v_interlock);
1080 return (0);
1081 #endif
1082 }
1083
1084 /*
1085 * Decrement the active use count.
1086 */
1087 int
1088 vop_nounlock(ap)
1089 struct vop_unlock_args /* {
1090 struct vnode *a_vp;
1091 int a_flags;
1092 struct proc *a_p;
1093 } */ *ap;
1094 {
1095 struct vnode *vp = ap->a_vp;
1096
1097 if (vp->v_vnlock == NULL)
1098 return (0);
1099 return (lockmgr(vp->v_vnlock, LK_RELEASE, NULL, ap->a_p));
1100 }
1101
1102 /*
1103 * Return whether or not the node is in use.
1104 */
1105 int
1106 vop_noislocked(ap)
1107 struct vop_islocked_args /* {
1108 struct vnode *a_vp;
1109 } */ *ap;
1110 {
1111 struct vnode *vp = ap->a_vp;
1112
1113 if (vp->v_vnlock == NULL)
1114 return (0);
1115 return (lockstatus(vp->v_vnlock));
1116 }
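/*
 * Illustrative sketch (assumed table layout, not a quote from this
 * tree): a filesystem with no per-node locking points the lock
 * entries of its vnodeop table at these stubs, roughly
 *
 *	{ &vop_lock_desc,     (VOPFUNC)vop_nolock },
 *	{ &vop_unlock_desc,   (VOPFUNC)vop_nounlock },
 *	{ &vop_islocked_desc, (VOPFUNC)vop_noislocked },
 *
 * so that vn_lock() and VOP_ISLOCKED() still behave sensibly for it.
 */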
1117
1118 /*
1119 * Vnode reference.
1120 */
1121 void
1122 vref(vp)
1123 struct vnode *vp;
1124 {
1125
1126 simple_lock(&vp->v_interlock);
1127 if (vp->v_usecount <= 0)
1128 panic("vref used where vget required");
1129
1130 /* If on the inactive list, remove it from there */
1131 if ((vp->v_usecount == 1) && UBCINFOEXISTS(vp)) {
1132 if (VONLIST(vp)) {
1133 simple_lock(&vnode_free_list_slock);
1134 VREMINACTIVE("vref", vp);
1135 simple_unlock(&vnode_free_list_slock);
1136 }
1137 }
1138 /* The vnode should not be on the inactive list here */
1139 VINACTIVECHECK("vref", vp, 0);
1140
1141 if (++vp->v_usecount <= 0)
1142 panic("vref v_usecount");
1143 simple_unlock(&vp->v_interlock);
1144 }
1145
1146 /*
1147 * Put the vnode on the appropriate free list.
1148 * Called with v_interlock held.
1149 */
1150 static void
1151 vfree(vp)
1152 struct vnode *vp;
1153 {
1154 /*
1155 * if the vnode was not obtained by calling getnewvnode() we
1156 * are not responsible for the cleanup. Just return.
1157 */
1158 if (!(vp->v_flag & VSTANDARD)) {
1159 return;
1160 }
1161
1162 if (vp->v_usecount != 0)
1163 panic("vfree: v_usecount");
1164
1165 /* insert at tail of LRU list or at head if VAGE is set */
1166 simple_lock(&vnode_free_list_slock);
1167
1168 if (VONLIST(vp))
1169 panic("vfree: vnode still on list");
1170
1171 if (vp->v_flag & VAGE) {
1172 TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
1173 vp->v_flag &= ~VAGE;
1174 } else
1175 TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
1176 freevnodes++;
1177 simple_unlock(&vnode_free_list_slock);
1178 return;
1179 }
1180
1181 /*
1182 * Put the vnode on the inactive list.
1183 * Called with v_interlock held.
1184 */
1185 static void
1186 vinactive(vp)
1187 struct vnode *vp;
1188 {
1189 if (!UBCINFOEXISTS(vp))
1190 panic("vinactive: not a UBC vnode");
1191
1192 if (vp->v_usecount != 1)
1193 panic("vinactive: v_usecount");
1194
1195 simple_lock(&vnode_free_list_slock);
1196
1197 if (VONLIST(vp))
1198 panic("vinactive: vnode still on list");
1199 VINACTIVECHECK("vinactive", vp, 0);
1200
1201 TAILQ_INSERT_TAIL(&vnode_inactive_list, vp, v_freelist);
1202 SET(vp->v_flag, VUINACTIVE);
1203 CLR(vp->v_flag, (VNOCACHE_DATA | VRAOFF));
1204
1205 inactivevnodes++;
1206 simple_unlock(&vnode_free_list_slock);
1207 return;
1208 }
1209
1210
1211 /*
1212 * vput(), just unlock and vrele()
1213 */
1214 void
1215 vput(vp)
1216 struct vnode *vp;
1217 {
1218 struct proc *p = current_proc(); /* XXX */
1219
1220 simple_lock(&vp->v_interlock);
1221 if (--vp->v_usecount == 1) {
1222 if (UBCINFOEXISTS(vp)) {
1223 vinactive(vp);
1224 simple_unlock(&vp->v_interlock);
1225 VOP_UNLOCK(vp, 0, p);
1226 return;
1227 }
1228 }
1229 if (vp->v_usecount > 0) {
1230 simple_unlock(&vp->v_interlock);
1231 VOP_UNLOCK(vp, 0, p);
1232 return;
1233 }
1234 #if DIAGNOSTIC
1235 if (vp->v_usecount < 0 || vp->v_writecount != 0) {
1236 vprint("vput: bad ref count", vp);
1237 panic("vput: v_usecount = %d, v_writecount = %d",
1238 vp->v_usecount, vp->v_writecount);
1239 }
1240 #endif
1241 if (ISSET((vp)->v_flag, VUINACTIVE) && VONLIST(vp))
1242 VREMINACTIVE("vrele", vp);
1243
1244 simple_unlock(&vp->v_interlock);
1245 VOP_INACTIVE(vp, p);
1246 /*
1247 * The interlock is not held and
1248 * VOP_INACTIVE releases the vnode lock.
1249 * We could block and the vnode might get reactivated,
1250 * so we cannot just call vfree() without checking the state.
1251 */
1252 simple_lock(&vp->v_interlock);
1253 if (!VONLIST(vp)) {
1254 if (vp->v_usecount == 0)
1255 vfree(vp);
1256 else if ((vp->v_usecount == 1) && UBCINFOEXISTS(vp))
1257 vinactive(vp);
1258 }
1259 simple_unlock(&vp->v_interlock);
1260 }
1261
1262 /*
1263 * Vnode release.
1264 * If count drops to zero, call inactive routine and return to freelist.
1265 */
1266 void
1267 vrele(vp)
1268 struct vnode *vp;
1269 {
1270 struct proc *p = current_proc(); /* XXX */
1271
1272 simple_lock(&vp->v_interlock);
1273 if (--vp->v_usecount == 1) {
1274 if (UBCINFOEXISTS(vp)) {
1275 vinactive(vp);
1276 simple_unlock(&vp->v_interlock);
1277 return;
1278 }
1279 }
1280 if (vp->v_usecount > 0) {
1281 simple_unlock(&vp->v_interlock);
1282 return;
1283 }
1284 #if DIAGNOSTIC
1285 if (vp->v_usecount < 0 || vp->v_writecount != 0) {
1286 vprint("vrele: bad ref count", vp);
1287 panic("vrele: ref cnt");
1288 }
1289 #endif
1290 if (ISSET((vp)->v_flag, VUINACTIVE) && VONLIST(vp))
1291 VREMINACTIVE("vrele", vp);
1292
1293
1294 if ((vp->v_flag & VXLOCK) || (vp->v_flag & VORECLAIM)) {
1295 /* vnode is being cleaned, just return */
1296 vfree(vp);
1297 simple_unlock(&vp->v_interlock);
1298 return;
1299 }
1300
1301 if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, p) == 0) {
1302 VOP_INACTIVE(vp, p);
1303 /*
1304 * vn_lock releases the interlock and
1305 * VOP_INACTIVE releases the vnode lock.
1306 * We could block and the vnode might get reactivated,
1307 * so we cannot just call vfree() without checking the state.
1308 */
1309 simple_lock(&vp->v_interlock);
1310 if (!VONLIST(vp)) {
1311 if (vp->v_usecount == 0)
1312 vfree(vp);
1313 else if ((vp->v_usecount == 1) && UBCINFOEXISTS(vp))
1314 vinactive(vp);
1315 }
1316 simple_unlock(&vp->v_interlock);
1317 }
1318 #if 0
1319 else {
1320 vfree(vp);
1321 simple_unlock(&vp->v_interlock);
1322 kprintf("vrele: vn_lock() failed for vp = 0x%08x\n", vp);
1323 }
1324 #endif
1325 }
1326
1327 void
1328 vagevp(vp)
1329 struct vnode *vp;
1330 {
1331 simple_lock(&vp->v_interlock);
1332 vp->v_flag |= VAGE;
1333 simple_unlock(&vp->v_interlock);
1334 return;
1335 }
1336
1337 /*
1338 * Page or buffer structure gets a reference.
1339 */
1340 void
1341 vhold(vp)
1342 register struct vnode *vp;
1343 {
1344
1345 simple_lock(&vp->v_interlock);
1346 vp->v_holdcnt++;
1347 simple_unlock(&vp->v_interlock);
1348 }
1349
1350 /*
1351 * Page or buffer structure frees a reference.
1352 */
1353 void
1354 holdrele(vp)
1355 register struct vnode *vp;
1356 {
1357
1358 simple_lock(&vp->v_interlock);
1359 if (vp->v_holdcnt <= 0)
1360 panic("holdrele: holdcnt");
1361 vp->v_holdcnt--;
1362 simple_unlock(&vp->v_interlock);
1363 }
1364
1365 /*
1366 * Remove any vnodes in the vnode table belonging to mount point mp.
1367 *
1368 * If MNT_NOFORCE is specified, there should not be any active ones,
1369 * return error if any are found (nb: this is a user error, not a
1370 * system error). If MNT_FORCE is specified, detach any active vnodes
1371 * that are found.
1372 */
1373 #if DIAGNOSTIC
1374 int busyprt = 0; /* print out busy vnodes */
1375 #if 0
1376 struct ctldebug debug1 = { "busyprt", &busyprt };
1377 #endif /* 0 */
1378 #endif
1379
1380 int
1381 vflush(mp, skipvp, flags)
1382 struct mount *mp;
1383 struct vnode *skipvp;
1384 int flags;
1385 {
1386 struct proc *p = current_proc();
1387 struct vnode *vp, *nvp;
1388 int busy = 0;
1389
1390 simple_lock(&mntvnode_slock);
1391 loop:
1392 for (vp = mp->mnt_vnodelist.lh_first; vp; vp = nvp) {
1393 if (vp->v_mount != mp)
1394 goto loop;
1395 nvp = vp->v_mntvnodes.le_next;
1396 /*
1397 * Skip over a selected vnode.
1398 */
1399 if (vp == skipvp)
1400 continue;
1401
1402 simple_lock(&vp->v_interlock);
1403 /*
1404 * Skip over vnodes marked VSYSTEM or VNOFLUSH.
1405 */
1406 if ((flags & SKIPSYSTEM) && ((vp->v_flag & VSYSTEM) || (vp->v_flag & VNOFLUSH))) {
1407 simple_unlock(&vp->v_interlock);
1408 continue;
1409 }
1410 /*
1411 * Skip over vnodes marked VSWAP.
1412 */
1413 if ((flags & SKIPSWAP) && (vp->v_flag & VSWAP)) {
1414 simple_unlock(&vp->v_interlock);
1415 continue;
1416 }
1417 /*
1418 * If WRITECLOSE is set, only flush out regular file
1419 * vnodes open for writing.
1420 */
1421 if ((flags & WRITECLOSE) &&
1422 (vp->v_writecount == 0 || vp->v_type != VREG)) {
1423 simple_unlock(&vp->v_interlock);
1424 continue;
1425 }
1426 /*
1427 * With v_usecount == 0, all we need to do is clear
1428 * out the vnode data structures and we are done.
1429 */
1430 if (vp->v_usecount == 0) {
1431 simple_unlock(&mntvnode_slock);
1432 vgonel(vp, p);
1433 simple_lock(&mntvnode_slock);
1434 continue;
1435 }
1436 /*
1437 * If FORCECLOSE is set, forcibly close the vnode.
1438 * For block or character devices, revert to an
1439 * anonymous device. For all other files, just kill them.
1440 */
1441 if (flags & FORCECLOSE) {
1442 simple_unlock(&mntvnode_slock);
1443 if (vp->v_type != VBLK && vp->v_type != VCHR) {
1444 vgonel(vp, p);
1445 } else {
1446 vclean(vp, 0, p);
1447 vp->v_op = spec_vnodeop_p;
1448 insmntque(vp, (struct mount *)0);
1449 }
1450 simple_lock(&mntvnode_slock);
1451 continue;
1452 }
1453 #if DIAGNOSTIC
1454 if (busyprt)
1455 vprint("vflush: busy vnode", vp);
1456 #endif
1457 simple_unlock(&vp->v_interlock);
1458 busy++;
1459 }
1460 simple_unlock(&mntvnode_slock);
1461 if (busy && ((flags & FORCECLOSE)==0))
1462 return (EBUSY);
1463 return (0);
1464 }
1465
1466 /*
1467 * Disassociate the underlying file system from a vnode.
1468 * The vnode interlock is held on entry.
1469 */
1470 static void
1471 vclean(vp, flags, p)
1472 struct vnode *vp;
1473 int flags;
1474 struct proc *p;
1475 {
1476 int active;
1477 int removed = 0;
1478 int didhold;
1479
1480 /*
1481 * if the vnode was not obtained by calling getnewvnode() we
1482 * are not responsible for the cleanup. Just return.
1483 */
1484 if (!(vp->v_flag & VSTANDARD)) {
1485 simple_unlock(&vp->v_interlock);
1486 return;
1487 }
1488
1489 /*
1490 * Check to see if the vnode is in use.
1491 * If so we have to reference it before we clean it out
1492 * so that its count cannot fall to zero and generate a
1493 * race against ourselves to recycle it.
1494 */
1495 if (active = vp->v_usecount)
1496 if (++vp->v_usecount <= 0)
1497 panic("vclean: v_usecount");
1498 /*
1499 * Prevent the vnode from being recycled or
1500 * brought into use while we clean it out.
1501 */
1502 if (vp->v_flag & VXLOCK)
1503 panic("vclean: deadlock");
1504 vp->v_flag |= VXLOCK;
1505
1506 /*
1507 * Even if the count is zero, the VOP_INACTIVE routine may still
1508 * have the object locked while it cleans it out. The VOP_LOCK
1509 * ensures that the VOP_INACTIVE routine is done with its work.
1510 * For active vnodes, it ensures that no other activity can
1511 * occur while the underlying object is being cleaned out.
1512 */
1513 VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, p);
1514
1515 /*
1516 * if this vnode is on the inactive list
1517 * take it off the list.
1518 */
1519 if ((active == 1) &&
1520 (ISSET((vp)->v_flag, VUINACTIVE) && VONLIST(vp))) {
1521 simple_lock(&vnode_free_list_slock);
1522 VREMINACTIVE("vclean", vp);
1523 simple_unlock(&vnode_free_list_slock);
1524 removed++;
1525 }
1526
1527 /* If the vnode was active, close it. */
1528 if (active && (flags & DOCLOSE))
1529 VOP_CLOSE(vp, IO_NDELAY, NOCRED, p);
1530
1531 /* Clean the pages in VM. */
1532 didhold = ubc_hold(vp);
1533 if ((active) && (didhold))
1534 (void)ubc_clean(vp, 0); /* do not invalidate */
1535
1536 /*
1537 * Clean out any buffers associated with the vnode.
1538 */
1539 if (flags & DOCLOSE) {
1540 if (vp->v_tag == VT_NFS)
1541 nfs_vinvalbuf(vp, V_SAVE, NOCRED, p, 0);
1542 else
1543 vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0);
1544 }
1545
1546 if (active)
1547 VOP_INACTIVE(vp, p);
1548 else
1549 VOP_UNLOCK(vp, 0, p);
1550
1551 /* Destroy ubc named reference */
1552 if (didhold) {
1553 ubc_rele(vp);
1554 ubc_destroy_named(vp);
1555 }
1556
1557 /*
1558 * Reclaim the vnode.
1559 */
1560 if (VOP_RECLAIM(vp, p))
1561 panic("vclean: cannot reclaim");
1562 cache_purge(vp);
1563 if (vp->v_vnlock) {
1564 if ((vp->v_vnlock->lk_flags & LK_DRAINED) == 0)
1565 vprint("vclean: lock not drained", vp);
1566 FREE_ZONE(vp->v_vnlock, sizeof (struct lock__bsd__), M_VNODE);
1567 vp->v_vnlock = NULL;
1568 }
1569
1570 /* It's dead, Jim! */
1571 vp->v_op = dead_vnodeop_p;
1572 vp->v_tag = VT_NON;
1573
1574 /*
1575 * Done with purge, notify sleepers of the grim news.
1576 */
1577 vp->v_flag &= ~VXLOCK;
1578 if (vp->v_flag & VXWANT) {
1579 vp->v_flag &= ~VXWANT;
1580 wakeup((caddr_t)vp);
1581 }
1582
1583 if (active)
1584 vrele(vp);
1585 }
1586
1587 /*
1588 * Eliminate all activity associated with the requested vnode
1589 * and with all vnodes aliased to the requested vnode.
1590 */
1591 int
1592 vop_revoke(ap)
1593 struct vop_revoke_args /* {
1594 struct vnode *a_vp;
1595 int a_flags;
1596 } */ *ap;
1597 {
1598 struct vnode *vp, *vq;
1599 struct proc *p = current_proc();
1600
1601 #if DIAGNOSTIC
1602 if ((ap->a_flags & REVOKEALL) == 0)
1603 panic("vop_revoke");
1604 #endif
1605
1606 vp = ap->a_vp;
1607 simple_lock(&vp->v_interlock);
1608
1609 if (vp->v_flag & VALIASED) {
1610 /*
1611 * If a vgone (or vclean) is already in progress,
1612 * wait until it is done and return.
1613 */
1614 if (vp->v_flag & VXLOCK) {
1615 while (vp->v_flag & VXLOCK) {
1616 vp->v_flag |= VXWANT;
1617 simple_unlock(&vp->v_interlock);
1618 (void)tsleep((caddr_t)vp, PINOD, "vop_revokeall", 0);
1619 }
1620 return (0);
1621 }
1622 /*
1623 * Ensure that vp will not be vgone'd while we
1624 * are eliminating its aliases.
1625 */
1626 vp->v_flag |= VXLOCK;
1627 simple_unlock(&vp->v_interlock);
1628 while (vp->v_flag & VALIASED) {
1629 simple_lock(&spechash_slock);
1630 for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
1631 if (vq->v_rdev != vp->v_rdev ||
1632 vq->v_type != vp->v_type || vp == vq)
1633 continue;
1634 simple_unlock(&spechash_slock);
1635 vgone(vq);
1636 break;
1637 }
1638 if (vq == NULLVP)
1639 simple_unlock(&spechash_slock);
1640 }
1641 /*
1642 * Remove the lock so that vgone below will
1643 * really eliminate the vnode after which time
1644 * vgone will awaken any sleepers.
1645 */
1646 simple_lock(&vp->v_interlock);
1647 vp->v_flag &= ~VXLOCK;
1648 }
1649 vgonel(vp, p);
1650 return (0);
1651 }
1652
1653 /*
1654 * Recycle an unused vnode to the front of the free list.
1655 * Release the passed interlock if the vnode will be recycled.
1656 */
1657 int
1658 vrecycle(vp, inter_lkp, p)
1659 struct vnode *vp;
1660 struct slock *inter_lkp;
1661 struct proc *p;
1662 {
1663
1664 simple_lock(&vp->v_interlock);
1665 if (vp->v_usecount == 0) {
1666 if (inter_lkp)
1667 simple_unlock(inter_lkp);
1668 vgonel(vp, p);
1669 return (1);
1670 }
1671 simple_unlock(&vp->v_interlock);
1672 return (0);
1673 }
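/*
 * Illustrative note: vrecycle() is the hook a filesystem's inactive
 * routine can use to throw away a node it no longer wants (for
 * instance when the link count has dropped to zero), e.g.
 *
 *	vrecycle(vp, (struct slock *)0, p);
 *
 * The call above is a sketch of the usual pattern, not a quote from
 * any particular filesystem.
 */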
1674
1675 /*
1676 * Eliminate all activity associated with a vnode
1677 * in preparation for reuse.
1678 */
1679 void
1680 vgone(vp)
1681 struct vnode *vp;
1682 {
1683 struct proc *p = current_proc();
1684
1685 simple_lock(&vp->v_interlock);
1686 vgonel(vp, p);
1687 }
1688
1689 /*
1690 * vgone, with the vp interlock held.
1691 */
1692 void
1693 vgonel(vp, p)
1694 struct vnode *vp;
1695 struct proc *p;
1696 {
1697 struct vnode *vq;
1698 struct vnode *vx;
1699
1700 /*
1701 * if the vnode was not obtained by calling getnewvnode() we
1702 * are not responsible for the cleanup. Just return.
1703 */
1704 if (!(vp->v_flag & VSTANDARD)) {
1705 simple_unlock(&vp->v_interlock);
1706 return;
1707 }
1708
1709 /*
1710 * If a vgone (or vclean) is already in progress,
1711 * wait until it is done and return.
1712 */
1713 if (vp->v_flag & VXLOCK) {
1714 while (vp->v_flag & VXLOCK) {
1715 vp->v_flag |= VXWANT;
1716 simple_unlock(&vp->v_interlock);
1717 (void)tsleep((caddr_t)vp, PINOD, "vgone", 0);
1718 }
1719 return;
1720 }
1721 /*
1722 * Clean out the filesystem specific data.
1723 */
1724 vclean(vp, DOCLOSE, p);
1725 /*
1726 * Delete from old mount point vnode list, if on one.
1727 */
1728 if (vp->v_mount != NULL)
1729 insmntque(vp, (struct mount *)0);
1730 /*
1731 * If special device, remove it from special device alias list
1732 * if it is on one.
1733 */
1734 if ((vp->v_type == VBLK || vp->v_type == VCHR) && vp->v_specinfo != 0) {
1735 simple_lock(&spechash_slock);
1736 if (*vp->v_hashchain == vp) {
1737 *vp->v_hashchain = vp->v_specnext;
1738 } else {
1739 for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
1740 if (vq->v_specnext != vp)
1741 continue;
1742 vq->v_specnext = vp->v_specnext;
1743 break;
1744 }
1745 if (vq == NULL)
1746 panic("missing bdev");
1747 }
1748 if (vp->v_flag & VALIASED) {
1749 vx = NULL;
1750 for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
1751 if (vq->v_rdev != vp->v_rdev ||
1752 vq->v_type != vp->v_type)
1753 continue;
1754 if (vx)
1755 break;
1756 vx = vq;
1757 }
1758 if (vx == NULL)
1759 panic("missing alias");
1760 if (vq == NULL)
1761 vx->v_flag &= ~VALIASED;
1762 vp->v_flag &= ~VALIASED;
1763 }
1764 simple_unlock(&spechash_slock);
1765 FREE_ZONE(vp->v_specinfo, sizeof (struct specinfo), M_VNODE);
1766 vp->v_specinfo = NULL;
1767 }
1768 /*
1769 * If it is on the freelist and not already at the head,
1770 * move it to the head of the list. The test of the back
1771 * pointer and the reference count of zero is because
1772 * it will be removed from the free list by getnewvnode,
1773 * but will not have its reference count incremented until
1774 * after calling vgone. If the reference count were
1775 * incremented first, vgone would (incorrectly) try to
1776 * close the previous instance of the underlying object.
1777 * So, the back pointer is explicitly set to `0xdeadb' in
1778 * getnewvnode after removing it from the freelist to ensure
1779 * that we do not try to move it here.
1780 */
1781 if (vp->v_usecount == 0) {
1782 simple_lock(&vnode_free_list_slock);
1783 if ((vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb) &&
1784 vnode_free_list.tqh_first != vp) {
1785 TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
1786 TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
1787 }
1788 simple_unlock(&vnode_free_list_slock);
1789 }
1790 vp->v_type = VBAD;
1791 }
1792
1793 /*
1794 * Lookup a vnode by device number.
1795 */
1796 int
1797 vfinddev(dev, type, vpp)
1798 dev_t dev;
1799 enum vtype type;
1800 struct vnode **vpp;
1801 {
1802 struct vnode *vp;
1803 int rc = 0;
1804
1805 simple_lock(&spechash_slock);
1806 for (vp = speclisth[SPECHASH(dev)]; vp; vp = vp->v_specnext) {
1807 if (dev != vp->v_rdev || type != vp->v_type)
1808 continue;
1809 *vpp = vp;
1810 rc = 1;
1811 break;
1812 }
1813 simple_unlock(&spechash_slock);
1814 return (rc);
1815 }
1816
1817 /*
1818 * Calculate the total number of references to a special device.
1819 */
1820 int
1821 vcount(vp)
1822 struct vnode *vp;
1823 {
1824 struct vnode *vq, *vnext;
1825 int count;
1826
1827 loop:
1828 if ((vp->v_flag & VALIASED) == 0)
1829 return (vp->v_usecount);
1830 simple_lock(&spechash_slock);
1831 for (count = 0, vq = *vp->v_hashchain; vq; vq = vnext) {
1832 vnext = vq->v_specnext;
1833 if (vq->v_rdev != vp->v_rdev || vq->v_type != vp->v_type)
1834 continue;
1835 /*
1836 * Alias, but not in use, so flush it out.
1837 */
1838 if (vq->v_usecount == 0 && vq != vp) {
1839 simple_unlock(&spechash_slock);
1840 vgone(vq);
1841 goto loop;
1842 }
1843 count += vq->v_usecount;
1844 }
1845 simple_unlock(&spechash_slock);
1846 return (count);
1847 }
1848
1849 int prtactive = 0; /* 1 => print out reclaim of active vnodes */
1850
1851 /*
1852 * Print out a description of a vnode.
1853 */
1854 static char *typename[] =
1855 { "VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD" };
1856
1857 void
1858 vprint(label, vp)
1859 char *label;
1860 register struct vnode *vp;
1861 {
1862 char buf[64];
1863
1864 if (label != NULL)
1865 printf("%s: ", label);
1866 printf("type %s, usecount %d, writecount %d, refcount %d,",
1867 typename[vp->v_type], vp->v_usecount, vp->v_writecount,
1868 vp->v_holdcnt);
1869 buf[0] = '\0';
1870 if (vp->v_flag & VROOT)
1871 strcat(buf, "|VROOT");
1872 if (vp->v_flag & VTEXT)
1873 strcat(buf, "|VTEXT");
1874 if (vp->v_flag & VSYSTEM)
1875 strcat(buf, "|VSYSTEM");
1876 if (vp->v_flag & VNOFLUSH)
1877 strcat(buf, "|VNOFLUSH");
1878 if (vp->v_flag & VXLOCK)
1879 strcat(buf, "|VXLOCK");
1880 if (vp->v_flag & VXWANT)
1881 strcat(buf, "|VXWANT");
1882 if (vp->v_flag & VBWAIT)
1883 strcat(buf, "|VBWAIT");
1884 if (vp->v_flag & VALIASED)
1885 strcat(buf, "|VALIASED");
1886 if (buf[0] != '\0')
1887 printf(" flags (%s)", &buf[1]);
1888 if (vp->v_data == NULL) {
1889 printf("\n");
1890 } else {
1891 printf("\n\t");
1892 VOP_PRINT(vp);
1893 }
1894 }
1895
1896 #ifdef DEBUG
1897 /*
1898 * List all of the locked vnodes in the system.
1899 * Called when debugging the kernel.
1900 */
1901 void
1902 printlockedvnodes()
1903 {
1904 struct proc *p = current_proc();
1905 struct mount *mp, *nmp;
1906 struct vnode *vp;
1907
1908 printf("Locked vnodes\n");
1909 simple_lock(&mountlist_slock);
1910 for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) {
1911 if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) {
1912 nmp = mp->mnt_list.cqe_next;
1913 continue;
1914 }
1915 for (vp = mp->mnt_vnodelist.lh_first;
1916 vp != NULL;
1917 vp = vp->v_mntvnodes.le_next) {
1918 if (VOP_ISLOCKED(vp))
1919 vprint((char *)0, vp);
1920 }
1921 simple_lock(&mountlist_slock);
1922 nmp = mp->mnt_list.cqe_next;
1923 vfs_unbusy(mp, p);
1924 }
1925 simple_unlock(&mountlist_slock);
1926 }
1927 #endif
1928
1929 /*
1930 * Top level filesystem related information gathering.
1931 */
1932 int
1933 vfs_sysctl(name, namelen, oldp, oldlenp, newp, newlen, p)
1934 int *name;
1935 u_int namelen;
1936 void *oldp;
1937 size_t *oldlenp;
1938 void *newp;
1939 size_t newlen;
1940 struct proc *p;
1941 {
1942 struct vfsconf *vfsp;
1943
1944 /*
1945 * The VFS_NUMMNTOPS shouldn't be at name[0], since it
1946 * is a VFS generic variable. So now we must check
1947 * namelen so we don't end up covering any UFS
1948 * variables (since UFS vfc_typenum is 1).
1949 *
1950 * It should have been:
1951 * name[0]: VFS_GENERIC
1952 * name[1]: VFS_NUMMNTOPS
1953 */
1954 if (namelen == 1 && name[0] == VFS_NUMMNTOPS) {
1955 extern unsigned int vfs_nummntops;
1956 return (sysctl_rdint(oldp, oldlenp, newp, vfs_nummntops));
1957 }
1958
1959 /* all sysctl names at this level are at least name and field */
1960 if (namelen < 2)
1961 return (ENOTDIR); /* overloaded */
1962 if (name[0] != VFS_GENERIC) {
1963 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
1964 if (vfsp->vfc_typenum == name[0])
1965 break;
1966 if (vfsp == NULL)
1967 return (EOPNOTSUPP);
1968 return ((*vfsp->vfc_vfsops->vfs_sysctl)(&name[1], namelen - 1,
1969 oldp, oldlenp, newp, newlen, p));
1970 }
1971 switch (name[1]) {
1972 case VFS_MAXTYPENUM:
1973 return (sysctl_rdint(oldp, oldlenp, newp, maxvfsconf));
1974 case VFS_CONF:
1975 if (namelen < 3)
1976 return (ENOTDIR); /* overloaded */
1977 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
1978 if (vfsp->vfc_typenum == name[2])
1979 break;
1980 if (vfsp == NULL)
1981 return (EOPNOTSUPP);
1982 return (sysctl_rdstruct(oldp, oldlenp, newp, vfsp,
1983 sizeof(struct vfsconf)));
1984 }
1985 return (EOPNOTSUPP);
1986 }
1987
1988 int kinfo_vdebug = 1;
1989 #define KINFO_VNODESLOP 10
1990 /*
1991 * Dump vnode list (via sysctl).
1992 * Copyout address of vnode followed by vnode.
1993 */
1994 /* ARGSUSED */
1995 int
1996 sysctl_vnode(where, sizep, p)
1997 char *where;
1998 size_t *sizep;
1999 struct proc *p;
2000 {
2001 struct mount *mp, *nmp;
2002 struct vnode *nvp, *vp;
2003 char *bp = where, *savebp;
2004 char *ewhere;
2005 int error;
2006
2007 #define VPTRSZ sizeof (struct vnode *)
2008 #define VNODESZ sizeof (struct vnode)
2009 if (where == NULL) {
2010 *sizep = (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ);
2011 return (0);
2012 }
2013 ewhere = where + *sizep;
2014
2015 simple_lock(&mountlist_slock);
2016 for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) {
2017 if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) {
2018 nmp = mp->mnt_list.cqe_next;
2019 continue;
2020 }
2021 savebp = bp;
2022 again:
2023 simple_lock(&mntvnode_slock);
2024 for (vp = mp->mnt_vnodelist.lh_first;
2025 vp != NULL;
2026 vp = nvp) {
2027 /*
2028 * Check that the vp is still associated with
2029 * this filesystem. RACE: could have been
2030 * recycled onto the same filesystem.
2031 */
2032 if (vp->v_mount != mp) {
2033 simple_unlock(&mntvnode_slock);
2034 if (kinfo_vdebug)
2035 printf("kinfo: vp changed\n");
2036 bp = savebp;
2037 goto again;
2038 }
2039 nvp = vp->v_mntvnodes.le_next;
2040 if (bp + VPTRSZ + VNODESZ > ewhere) {
2041 simple_unlock(&mntvnode_slock);
2042 *sizep = bp - where;
2043 return (ENOMEM);
2044 }
2045 simple_unlock(&mntvnode_slock);
2046 if ((error = copyout((caddr_t)&vp, bp, VPTRSZ)) ||
2047 (error = copyout((caddr_t)vp, bp + VPTRSZ, VNODESZ)))
2048 return (error);
2049 bp += VPTRSZ + VNODESZ;
2050 simple_lock(&mntvnode_slock);
2051 }
2052 simple_unlock(&mntvnode_slock);
2053 simple_lock(&mountlist_slock);
2054 nmp = mp->mnt_list.cqe_next;
2055 vfs_unbusy(mp, p);
2056 }
2057 simple_unlock(&mountlist_slock);
2058
2059 *sizep = bp - where;
2060 return (0);
2061 }
2062
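/*
 * Illustrative sketch (not compiled): the two-pass pattern the NULL
 * "where" branch above exists for, as seen from a userland caller of
 * the kern.vnode MIB (CTL_KERN, KERN_VNODE).  The buffer comes back as
 * (struct vnode *, struct vnode) pairs, VPTRSZ + VNODESZ apiece.
 */
#if 0
#include <sys/param.h>
#include <sys/sysctl.h>
#include <stdlib.h>

static char *
fetch_vnode_table(size_t *lenp)
{
	int mib[2] = { CTL_KERN, KERN_VNODE };
	size_t len;
	char *buf;

	/* First pass: no buffer, kernel reports a size with some slop. */
	if (sysctl(mib, 2, NULL, &len, NULL, 0) < 0)
		return (NULL);
	if ((buf = malloc(len)) == NULL)
		return (NULL);
	/* Second pass: copy out the table; ENOMEM means it grew too much. */
	if (sysctl(mib, 2, buf, &len, NULL, 0) < 0) {
		free(buf);
		return (NULL);
	}
	*lenp = len;
	return (buf);
}
#endif /* 0 */
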
2063 /*
2064 * Check to see if a filesystem is mounted on a block device.
2065 */
2066 int
2067 vfs_mountedon(vp)
2068 struct vnode *vp;
2069 {
2070 struct vnode *vq;
2071 int error = 0;
2072
2073 if (vp->v_specflags & SI_MOUNTEDON)
2074 return (EBUSY);
2075 if (vp->v_flag & VALIASED) {
2076 simple_lock(&spechash_slock);
2077 for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
2078 if (vq->v_rdev != vp->v_rdev ||
2079 vq->v_type != vp->v_type)
2080 continue;
2081 if (vq->v_specflags & SI_MOUNTEDON) {
2082 error = EBUSY;
2083 break;
2084 }
2085 }
2086 simple_unlock(&spechash_slock);
2087 }
2088 return (error);
2089 }
2090
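/*
 * Illustrative sketch (not compiled): local filesystems typically call
 * vfs_mountedon() on the device vnode before mounting, so the same
 * block device cannot carry two filesystems at once.  "xxx_checkdev"
 * is a made-up name for the example.
 */
#if 0
static int
xxx_checkdev(struct vnode *devvp)
{
	int error;

	if ((error = vfs_mountedon(devvp)))
		return (error);		/* EBUSY: device already mounted */
	return (0);
}
#endif /* 0 */
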
2091 /*
2092 * Unmount all filesystems. The list is traversed in reverse order
2093 * of mounting to avoid dependencies.
2094 */
2095 __private_extern__ void
2096 vfs_unmountall()
2097 {
2098 struct mount *mp, *nmp;
2099 struct proc *p = current_proc();
2100
2101 /*
2102 * Since this only runs when rebooting, it is not interlocked.
2103 */
2104 for (mp = mountlist.cqh_last; mp != (void *)&mountlist; mp = nmp) {
2105 nmp = mp->mnt_list.cqe_prev;
2106 (void) dounmount(mp, MNT_FORCE, p);
2107 }
2108 }
2109
2110 /*
2111 * Build hash lists of net addresses and hang them off the mount point.
2112 * Called by vfs_export() to set up the lists of export addresses.
2113 */
2114 static int
2115 vfs_hang_addrlist(mp, nep, argp)
2116 struct mount *mp;
2117 struct netexport *nep;
2118 struct export_args *argp;
2119 {
2120 register struct netcred *np;
2121 register struct radix_node_head *rnh;
2122 register int i;
2123 struct radix_node *rn;
2124 struct sockaddr *saddr, *smask = 0;
2125 struct domain *dom;
2126 int error;
2127
2128 if (argp->ex_addrlen == 0) {
2129 if (mp->mnt_flag & MNT_DEFEXPORTED)
2130 return (EPERM);
2131 np = &nep->ne_defexported;
2132 np->netc_exflags = argp->ex_flags;
2133 np->netc_anon = argp->ex_anon;
2134 np->netc_anon.cr_ref = 1;
2135 mp->mnt_flag |= MNT_DEFEXPORTED;
2136 return (0);
2137 }
2138 i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen;
2139 MALLOC(np, struct netcred *, i, M_NETADDR, M_WAITOK);
2140 bzero((caddr_t)np, i);
2141 saddr = (struct sockaddr *)(np + 1);
2142 if (error = copyin(argp->ex_addr, (caddr_t)saddr, argp->ex_addrlen))
2143 goto out;
2144 if (saddr->sa_len > argp->ex_addrlen)
2145 saddr->sa_len = argp->ex_addrlen;
2146 if (argp->ex_masklen) {
2147 smask = (struct sockaddr *)((caddr_t)saddr + argp->ex_addrlen);
2148 error = copyin(argp->ex_addr, (caddr_t)smask, argp->ex_masklen);
2149 if (error)
2150 goto out;
2151 if (smask->sa_len > argp->ex_masklen)
2152 smask->sa_len = argp->ex_masklen;
2153 }
2154 i = saddr->sa_family;
2155 if ((rnh = nep->ne_rtable[i]) == 0) {
2156 /*
2157 * It seems silly to initialize every AF when most are not
2158 * used, so do it on demand here.
2159 */
2160 for (dom = domains; dom; dom = dom->dom_next)
2161 if (dom->dom_family == i && dom->dom_rtattach) {
2162 dom->dom_rtattach((void **)&nep->ne_rtable[i],
2163 dom->dom_rtoffset);
2164 break;
2165 }
2166 if ((rnh = nep->ne_rtable[i]) == 0) {
2167 error = ENOBUFS;
2168 goto out;
2169 }
2170 }
2171 rn = (*rnh->rnh_addaddr)((caddr_t)saddr, (caddr_t)smask, rnh,
2172 np->netc_rnodes);
2173 if (rn == 0) {
2174 /*
2175 * One of the reasons that rnh_addaddr may fail is that
2176 * the entry already exists. To check for this case, we
2177 * look up the entry to see if it is there. If so, we
2178 * do not need to make a new entry but do return success.
2179 */
2180 _FREE(np, M_NETADDR);
2181 rn = (*rnh->rnh_matchaddr)((caddr_t)saddr, rnh);
2182 if (rn != 0 && (rn->rn_flags & RNF_ROOT) == 0 &&
2183 ((struct netcred *)rn)->netc_exflags == argp->ex_flags &&
2184 !bcmp((caddr_t)&((struct netcred *)rn)->netc_anon,
2185 (caddr_t)&argp->ex_anon, sizeof(struct ucred)))
2186 return (0);
2187 return (EPERM);
2188 }
2189 np->netc_exflags = argp->ex_flags;
2190 np->netc_anon = argp->ex_anon;
2191 np->netc_anon.cr_ref = 1;
2192 return (0);
2193 out:
2194 _FREE(np, M_NETADDR);
2195 return (error);
2196 }
2197
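/*
 * rnh_walktree() callback used by vfs_free_addrlist(): remove one
 * export address from the radix tree and free its netcred entry.
 */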
2198 /* ARGSUSED */
2199 static int
2200 vfs_free_netcred(rn, w)
2201 struct radix_node *rn;
2202 caddr_t w;
2203 {
2204 register struct radix_node_head *rnh = (struct radix_node_head *)w;
2205
2206 (*rnh->rnh_deladdr)(rn->rn_key, rn->rn_mask, rnh);
2207 _FREE((caddr_t)rn, M_NETADDR);
2208 return (0);
2209 }
2210
2211 /*
2212 * Free the net address hash lists that are hanging off the mount points.
2213 */
2214 static void
2215 vfs_free_addrlist(nep)
2216 struct netexport *nep;
2217 {
2218 register int i;
2219 register struct radix_node_head *rnh;
2220
2221 for (i = 0; i <= AF_MAX; i++)
2222 if (rnh = nep->ne_rtable[i]) {
2223 (*rnh->rnh_walktree)(rnh, vfs_free_netcred,
2224 (caddr_t)rnh);
2225 _FREE((caddr_t)rnh, M_RTABLE);
2226 nep->ne_rtable[i] = 0;
2227 }
2228 }
2229
2230 int
2231 vfs_export(mp, nep, argp)
2232 struct mount *mp;
2233 struct netexport *nep;
2234 struct export_args *argp;
2235 {
2236 int error;
2237
2238 if (argp->ex_flags & MNT_DELEXPORT) {
2239 vfs_free_addrlist(nep);
2240 mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED);
2241 }
2242 if (argp->ex_flags & MNT_EXPORTED) {
2243 if (error = vfs_hang_addrlist(mp, nep, argp))
2244 return (error);
2245 mp->mnt_flag |= MNT_EXPORTED;
2246 }
2247 return (0);
2248 }
2249
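/*
 * Illustrative sketch (not compiled): the shape of a filesystem's
 * MNT_UPDATE/export handling.  A real caller keeps a struct netexport
 * in its per-mount data and passes along the export_args copied in
 * from mount(2); "xxx_args" and "xxx_mount_data" are made-up names for
 * the example.
 */
#if 0
struct xxx_args {
	struct export_args export;	/* network export information */
};

struct xxx_mount_data {
	struct netexport xm_export;	/* export address radix trees */
};

static int
xxx_update_exports(struct mount *mp, struct xxx_args *args)
{
	struct xxx_mount_data *xmp = (struct xxx_mount_data *)mp->mnt_data;

	return (vfs_export(mp, &xmp->xm_export, &args->export));
}
#endif /* 0 */
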
2250 struct netcred *
2251 vfs_export_lookup(mp, nep, nam)
2252 register struct mount *mp;
2253 struct netexport *nep;
2254 struct mbuf *nam;
2255 {
2256 register struct netcred *np;
2257 register struct radix_node_head *rnh;
2258 struct sockaddr *saddr;
2259
2260 np = NULL;
2261 if (mp->mnt_flag & MNT_EXPORTED) {
2262 /*
2263 * Lookup in the export list first.
2264 */
2265 if (nam != NULL) {
2266 saddr = mtod(nam, struct sockaddr *);
2267 rnh = nep->ne_rtable[saddr->sa_family];
2268 if (rnh != NULL) {
2269 np = (struct netcred *)
2270 (*rnh->rnh_matchaddr)((caddr_t)saddr,
2271 rnh);
2272 if (np && np->netc_rnodes->rn_flags & RNF_ROOT)
2273 np = NULL;
2274 }
2275 }
2276 /*
2277 * If no address match, use the default if it exists.
2278 */
2279 if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED)
2280 np = &nep->ne_defexported;
2281 }
2282 return (np);
2283 }
2284
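/*
 * Illustrative sketch (not compiled): how an NFS-style server path
 * consumes vfs_export_lookup().  "nam" is the client's address as an
 * mbuf-resident sockaddr; a NULL return means the client has no access
 * to this export.  "xxx_check_export" is a made-up name.
 */
#if 0
static int
xxx_check_export(struct mount *mp, struct netexport *nep,
    struct mbuf *nam, struct ucred **credp, int *flagsp)
{
	struct netcred *np;

	if ((np = vfs_export_lookup(mp, nep, nam)) == NULL)
		return (EACCES);
	*credp = &np->netc_anon;	/* anonymous cred for this client */
	*flagsp = np->netc_exflags;	/* e.g. MNT_EXRDONLY */
	return (0);
}
#endif /* 0 */
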
2285 /*
2286 * Try to reclaim vnodes from the memory
2287 * object cache.
2288 */
2289 static int
2290 vm_object_cache_reclaim(int count)
2291 {
2292 int cnt;
2293 void vnode_pager_release_from_cache(int *);
2294
2295 /* attempt to reclaim vnodes from VM object cache */
2296 cnt = count;
2297 vnode_pager_release_from_cache(&cnt);
2298 return(cnt);
2299 }
2300
2301 /*
2302 * Release memory object reference held by inactive vnodes
2303 * and then try to reclaim some vnodes from the memory
2304 * object cache
2305 */
2306 static int
2307 vnreclaim(int count)
2308 {
2309 int i, loopcnt;
2310 struct vnode *vp;
2311 int err;
2312 struct proc *p;
2313
2314 i = 0;
2315 loopcnt = 0;
2316
2317 /* Try to release "count" vnodes from the inactive list */
2318 restart:
2319 if (++loopcnt > inactivevnodes) {
2320 /*
2321 * I did my best trying to reclaim the vnodes.
2322 * Do not try any more as that would only lead to
2323 * long latencies. Also in the worst case
2324 * this can get totally CPU bound.
2325 * Just fall through and attempt a reclaim of the VM
2326 * object cache.
2327 */
2328 goto out;
2329 }
2330
2331 simple_lock(&vnode_free_list_slock);
2332 for (vp = TAILQ_FIRST(&vnode_inactive_list);
2333 (vp != NULLVP) && (i < count);
2334 vp = TAILQ_NEXT(vp, v_freelist)) {
2335
2336 if (!simple_lock_try(&vp->v_interlock))
2337 continue;
2338
2339 if (vp->v_usecount != 1)
2340 panic("vnreclaim: v_usecount");
2341
2342 if(!UBCINFOEXISTS(vp)) {
2343 if (vp->v_type == VBAD) {
2344 VREMINACTIVE("vnreclaim", vp);
2345 simple_unlock(&vp->v_interlock);
2346 continue;
2347 } else
2348 panic("non UBC vnode on inactive list");
2349 /* Should not reach here */
2350 }
2351
2352 /* If vnode is already being reclaimed, wait */
2353 if ((vp->v_flag & VXLOCK) || (vp->v_flag & VORECLAIM)) {
2354 vp->v_flag |= VXWANT;
2355 simple_unlock(&vp->v_interlock);
2356 simple_unlock(&vnode_free_list_slock);
2357 (void)tsleep((caddr_t)vp, PINOD, "vocr", 0);
2358 goto restart;
2359 }
2360
2361 VREMINACTIVE("vnreclaim", vp);
2362 simple_unlock(&vnode_free_list_slock);
2363
2364 if (ubc_issetflags(vp, UI_WASMAPPED)) {
2365 /*
2366 * We should not reclaim as it is likely
2367 * to be in use. Let it die a natural death.
2368 * Release the UBC reference if one exists
2369 * and put it back at the tail.
2370 */
2371 simple_unlock(&vp->v_interlock);
2372 if (ubc_release_named(vp)) {
2373 if (UBCINFOEXISTS(vp)) {
2374 simple_lock(&vp->v_interlock);
2375 if (vp->v_usecount == 1 && !VONLIST(vp))
2376 vinactive(vp);
2377 simple_unlock(&vp->v_interlock);
2378 }
2379 } else {
2380 simple_lock(&vp->v_interlock);
2381 vinactive(vp);
2382 simple_unlock(&vp->v_interlock);
2383 }
2384 } else {
2385 int didhold;
2386
2387 VORECLAIM_ENABLE(vp);
2388
2389 /*
2390 * scrub the dirty pages and invalidate the buffers
2391 */
2392 p = current_proc();
2393 err = vn_lock(vp, LK_EXCLUSIVE|LK_INTERLOCK, p);
2394 if (err) {
2395 /* cannot reclaim */
2396 simple_lock(&vp->v_interlock);
2397 vinactive(vp);
2398 VORECLAIM_DISABLE(vp);
2399 i++;
2400 simple_unlock(&vp->v_interlock);
2401 goto restart;
2402 }
2403
2404 /* keep the vnode alive so we can kill it */
2405 simple_lock(&vp->v_interlock);
2406 if(vp->v_usecount != 1)
2407 panic("VOCR: usecount race");
2408 vp->v_usecount++;
2409 simple_unlock(&vp->v_interlock);
2410
2411 /* clean up the state in VM without invalidating */
2412 didhold = ubc_hold(vp);
2413 if (didhold)
2414 (void)ubc_clean(vp, 0);
2415
2416 /* flush and invalidate buffers associated with the vnode */
2417 if (vp->v_tag == VT_NFS)
2418 nfs_vinvalbuf(vp, V_SAVE, NOCRED, p, 0);
2419 else
2420 vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0);
2421
2422 /*
2423 * Note: for the v_usecount == 2 case, VOP_INACTIVE
2424 * has not yet been called. Call it now while vp is
2425 * still locked; it will also release the lock.
2426 */
2427 if (vp->v_usecount == 2)
2428 VOP_INACTIVE(vp, p);
2429 else
2430 VOP_UNLOCK(vp, 0, p);
2431
2432 if (didhold)
2433 ubc_rele(vp);
2434
2435 /*
2436 * Destroy the ubc named reference.
2437 * If we can't because it is held for I/Os
2438 * in progress, just put it back on the inactive
2439 * list and move on. Otherwise, the paging reference
2440 * is toast (and so is this vnode?).
2441 */
2442 if (ubc_destroy_named(vp)) {
2443 i++;
2444 }
2445 simple_lock(&vp->v_interlock);
2446 VORECLAIM_DISABLE(vp);
2447 simple_unlock(&vp->v_interlock);
2448 vrele(vp); /* release extra use we added here */
2449 }
2450 /* inactive list lock was released, must restart */
2451 goto restart;
2452 }
2453 simple_unlock(&vnode_free_list_slock);
2454
2455 vnode_reclaim_tried += i;
2456 out:
2457 i = vm_object_cache_reclaim(count);
2458 vnode_objects_reclaimed += i;
2459
2460 return(i);
2461 }
2462
2463 /*
2464 * This routine is called from vnode_pager_no_senders()
2465 * which in turn can be called with the vnode locked by vnode_uncache(),
2466 * but it could also get called as a result of vm_object_cache_trim().
2467 * In that case lock state is unknown.
2468 * AGE the vnode so that it gets recycled quickly.
2469 * Check lock status to decide whether to call vput() or vrele().
2470 */
2471 __private_extern__ void
2472 vnode_pager_vrele(struct vnode *vp)
2473 {
2474
2475 boolean_t funnel_state;
2476 int isvnreclaim = 1;
2477
2478 if (vp == (struct vnode *) NULL)
2479 panic("vnode_pager_vrele: null vp");
2480
2481 funnel_state = thread_funnel_set(kernel_flock, TRUE);
2482
2483 /* Mark the vnode to be recycled */
2484 vagevp(vp);
2485
2486 simple_lock(&vp->v_interlock);
2487 /*
2488 * If a vgone (or vclean) is already in progress,
2489 * do not bother with the ubc_info cleanup;
2490 * let vclean deal with it.
2491 */
2492 if (vp->v_flag & VXLOCK) {
2493 CLR(vp->v_flag, VTERMINATE);
2494 if (ISSET(vp->v_flag, VTERMWANT)) {
2495 CLR(vp->v_flag, VTERMWANT);
2496 wakeup((caddr_t)&vp->v_ubcinfo);
2497 }
2498 simple_unlock(&vp->v_interlock);
2499 vrele(vp);
2500 (void) thread_funnel_set(kernel_flock, funnel_state);
2501 return;
2502 }
2503
2504 /* It's dead, Jim! */
2505 if (!ISSET(vp->v_flag, VORECLAIM)) {
2506 /*
2507 * called as a result of eviction of the memory
2508 * object from the memory object cache
2509 */
2510 isvnreclaim = 0;
2511
2512 /* So serialize vnode operations */
2513 VORECLAIM_ENABLE(vp);
2514 }
2515 if (!ISSET(vp->v_flag, VTERMINATE))
2516 SET(vp->v_flag, VTERMINATE);
2517 if (UBCINFOEXISTS(vp)) {
2518 struct ubc_info *uip = vp->v_ubcinfo;
2519
2520 if (ubc_issetflags(vp, UI_WASMAPPED))
2521 SET(vp->v_flag, VWASMAPPED);
2522
2523 vp->v_ubcinfo = UBC_NOINFO; /* catch bad accesses */
2524 simple_unlock(&vp->v_interlock);
2525 ubc_info_deallocate(uip);
2526 } else {
2527 if ((vp->v_type == VBAD) && ((vp)->v_ubcinfo != UBC_INFO_NULL)
2528 && ((vp)->v_ubcinfo != UBC_NOINFO)) {
2529 struct ubc_info *uip = vp->v_ubcinfo;
2530
2531 vp->v_ubcinfo = UBC_NOINFO; /* catch bad accesses */
2532 simple_unlock(&vp->v_interlock);
2533 ubc_info_deallocate(uip);
2534 } else {
2535 simple_unlock(&vp->v_interlock);
2536 }
2537 }
2538
2539 CLR(vp->v_flag, VTERMINATE);
2540
2541 if (vp->v_type != VBAD){
2542 vgone(vp); /* revoke the vnode */
2543 vrele(vp); /* and drop the reference */
2544 } else
2545 vrele(vp);
2546
2547 if (ISSET(vp->v_flag, VTERMWANT)) {
2548 CLR(vp->v_flag, VTERMWANT);
2549 wakeup((caddr_t)&vp->v_ubcinfo);
2550 }
2551 if (!isvnreclaim)
2552 VORECLAIM_DISABLE(vp);
2553 (void) thread_funnel_set(kernel_flock, funnel_state);
2554 return;
2555 }
2556
2557
2558 #if DIAGNOSTIC
2559 int walk_vnodes_debug=0;
2560
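/*
 * Debugging aid: walk the per-mount vnode lists and the free and
 * inactive lists, printing (when walk_vnodes_debug is set) any vnode
 * whose v_usecount has gone negative, then report the free and
 * inactive list lengths.
 */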
2561 void
2562 walk_allvnodes()
2563 {
2564 struct mount *mp, *nmp;
2565 struct vnode *vp;
2566 int cnt = 0;
2567
2568 for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) {
2569 for (vp = mp->mnt_vnodelist.lh_first;
2570 vp != NULL;
2571 vp = vp->v_mntvnodes.le_next) {
2572 if (vp->v_usecount < 0){
2573 if(walk_vnodes_debug) {
2574 printf("vp is %x\n",vp);
2575 }
2576 }
2577 }
2578 nmp = mp->mnt_list.cqe_next;
2579 }
2580 for (cnt = 0, vp = vnode_free_list.tqh_first;
2581 vp != NULLVP; cnt++, vp = vp->v_freelist.tqe_next) {
2582 if ((vp->v_usecount < 0) && walk_vnodes_debug) {
2583 if(walk_vnodes_debug) {
2584 printf("vp is %x\n",vp);
2585 }
2586 }
2587 }
2588 printf("%d - free\n", cnt);
2589
2590 for (cnt = 0, vp = vnode_inactive_list.tqh_first;
2591 vp != NULLVP; cnt++, vp = vp->v_freelist.tqe_next) {
2592 if ((vp->v_usecount < 0) && walk_vnodes_debug) {
2593 if(walk_vnodes_debug) {
2594 printf("vp is %x\n",vp);
2595 }
2596 }
2597 }
2598 printf("%d - inactive\n", cnt);
2599 }
2600 #endif /* DIAGNOSTIC */
2601
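/*
 * Report the maximum I/O size and scatter/gather segment count to use
 * for reads or writes against this vnode's mount point.  Falls back to
 * MAXPHYS and 32 segments when there is no mount or the flag is
 * neither B_READ nor B_WRITE.
 */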
2602 void
2603 vfs_io_attributes(vp, flags, iosize, vectors)
2604 struct vnode *vp;
2605 int flags; /* B_READ or B_WRITE */
2606 int *iosize;
2607 int *vectors;
2608 {
2609 struct mount *mp;
2610
2611 /* start with "reasonable" defaults */
2612 *iosize = MAXPHYS;
2613 *vectors = 32;
2614
2615 mp = vp->v_mount;
2616 if (mp != NULL) {
2617 switch (flags) {
2618 case B_READ:
2619 *iosize = mp->mnt_maxreadcnt;
2620 *vectors = mp->mnt_segreadcnt;
2621 break;
2622 case B_WRITE:
2623 *iosize = mp->mnt_maxwritecnt;
2624 *vectors = mp->mnt_segwritecnt;
2625 break;
2626 default:
2627 break;
2628 }
2629 }
2630
2631 return;
2632 }
2633
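/*
 * Illustrative sketch (not compiled): how an I/O path might clamp a
 * transfer to the limits published by vfs_io_attributes().
 * "xxx_write_pass" is a made-up name for the example.
 */
#if 0
static void
xxx_write_pass(struct vnode *vp, off_t resid)
{
	int max_io, max_vectors;
	off_t chunk;

	vfs_io_attributes(vp, B_WRITE, &max_io, &max_vectors);
	while (resid > 0) {
		chunk = (resid > max_io) ? max_io : resid;
		/*
		 * ... issue a write of "chunk" bytes here; a real caller
		 * would also honor max_vectors when building its
		 * scatter/gather list ...
		 */
		resid -= chunk;
	}
}
#endif /* 0 */
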
2634 #include <dev/disk.h>
2635
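/*
 * Query the block device under a mount point (via the DKIOCGETMAX* and
 * DKIOCGETBLOCKSIZE ioctls) for its transfer-size and segment limits,
 * clamp them to the 32-bit and 16-bit mount fields, and record them
 * for later use by vfs_io_attributes().
 */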
2636 int
2637 vfs_init_io_attributes(devvp, mp)
2638 struct vnode *devvp;
2639 struct mount *mp;
2640 {
2641 int error;
2642 off_t readblockcnt;
2643 off_t writeblockcnt;
2644 off_t readsegcnt;
2645 off_t writesegcnt;
2646 u_long blksize;
2647
2648 u_int64_t temp;
2649
2650 struct proc *p = current_proc();
2651 struct ucred *cred = p->p_ucred;
2652
2653 if ((error = VOP_IOCTL(devvp, DKIOCGETMAXBLOCKCOUNTREAD,
2654 (caddr_t)&readblockcnt, 0, cred, p)))
2655 return (error);
2656
2657 if ((error = VOP_IOCTL(devvp, DKIOCGETMAXBLOCKCOUNTWRITE,
2658 (caddr_t)&writeblockcnt, 0, cred, p)))
2659 return (error);
2660
2661 if ((error = VOP_IOCTL(devvp, DKIOCGETMAXSEGMENTCOUNTREAD,
2662 (caddr_t)&readsegcnt, 0, cred, p)))
2663 return (error);
2664
2665 if ((error = VOP_IOCTL(devvp, DKIOCGETMAXSEGMENTCOUNTWRITE,
2666 (caddr_t)&writesegcnt, 0, cred, p)))
2667 return (error);
2668
2669 if ((error = VOP_IOCTL(devvp, DKIOCGETBLOCKSIZE,
2670 (caddr_t)&blksize, 0, cred, p)))
2671 return (error);
2672
2673 temp = readblockcnt * blksize;
2674 temp = (temp > UINT32_MAX) ? (UINT32_MAX / blksize) * blksize : temp;
2675 mp->mnt_maxreadcnt = (u_int32_t)temp;
2676
2677 temp = writeblockcnt * blksize;
2678 temp = (temp > UINT32_MAX) ? (UINT32_MAX / blksize) * blksize : temp;
2679 mp->mnt_maxwritecnt = (u_int32_t)temp;
2680
2681 temp = (readsegcnt > UINT16_MAX) ? UINT16_MAX : readsegcnt;
2682 mp->mnt_segreadcnt = (u_int16_t)temp;
2683
2684 temp = (writesegcnt > UINT16_MAX) ? UINT16_MAX : writesegcnt;
2685 mp->mnt_segwritecnt = (u_int16_t)temp;
2686
2687 #if 0
2688 printf("--- IO attributes for mount point 0x%08x ---\n", mp);
2689 printf("\tmnt_maxreadcnt = 0x%x", mp->mnt_maxreadcnt);
2690 printf("\tmnt_maxwritecnt = 0x%x\n", mp->mnt_maxwritecnt);
2691 printf("\tmnt_segreadcnt = 0x%x", mp->mnt_segreadcnt);
2692 printf("\tmnt_segwritecnt = 0x%x\n", mp->mnt_segwritecnt);
2693 #endif /* 0 */
2694
2695 return (error);
2696 }
2697
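/*
 * Illustrative sketch (not compiled): a local filesystem's mount
 * routine would call vfs_init_io_attributes() once the device vnode is
 * open, before sizing its own I/O.  "xxx_mountfs" is a made-up name
 * for the example.
 */
#if 0
static int
xxx_mountfs(struct vnode *devvp, struct mount *mp)
{
	int error;

	if ((error = vfs_init_io_attributes(devvp, mp)))
		return (error);
	/* ... read the superblock using mnt_maxreadcnt-sized I/Os ... */
	return (0);
}
#endif /* 0 */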