[apple/xnu.git] / bsd / vfs / vfs_subr.c
1 /*
2 * Copyright (c) 2000-2001 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * The contents of this file constitute Original Code as defined in and
7 * are subject to the Apple Public Source License Version 1.1 (the
8 * "License"). You may not use this file except in compliance with the
9 * License. Please obtain a copy of the License at
10 * http://www.apple.com/publicsource and read it before using this file.
11 *
12 * This Original Code and all software distributed under the License are
13 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
14 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
15 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
17 * License for the specific language governing rights and limitations
18 * under the License.
19 *
20 * @APPLE_LICENSE_HEADER_END@
21 */
22 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
23 /*
24 * Copyright (c) 1989, 1993
25 * The Regents of the University of California. All rights reserved.
26 * (c) UNIX System Laboratories, Inc.
27 * All or some portions of this file are derived from material licensed
28 * to the University of California by American Telephone and Telegraph
29 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
30 * the permission of UNIX System Laboratories, Inc.
31 *
32 * Redistribution and use in source and binary forms, with or without
33 * modification, are permitted provided that the following conditions
34 * are met:
35 * 1. Redistributions of source code must retain the above copyright
36 * notice, this list of conditions and the following disclaimer.
37 * 2. Redistributions in binary form must reproduce the above copyright
38 * notice, this list of conditions and the following disclaimer in the
39 * documentation and/or other materials provided with the distribution.
40 * 3. All advertising materials mentioning features or use of this software
41 * must display the following acknowledgement:
42 * This product includes software developed by the University of
43 * California, Berkeley and its contributors.
44 * 4. Neither the name of the University nor the names of its contributors
45 * may be used to endorse or promote products derived from this software
46 * without specific prior written permission.
47 *
48 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
49 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
50 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
51 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
52 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
53 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
54 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
55 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
56 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
57 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
58 * SUCH DAMAGE.
59 *
60 * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95
61 */
62
63 /*
64 * External virtual filesystem routines
65 */
66
67 #undef DIAGNOSTIC
68 #define DIAGNOSTIC 1
69
70 #include <sys/param.h>
71 #include <sys/systm.h>
72 #include <sys/proc.h>
73 #include <sys/mount.h>
74 #include <sys/time.h>
75 #include <sys/vnode.h>
76 #include <sys/stat.h>
77 #include <sys/namei.h>
78 #include <sys/ucred.h>
79 #include <sys/buf.h>
80 #include <sys/errno.h>
81 #include <sys/malloc.h>
82 #include <sys/domain.h>
83 #include <sys/mbuf.h>
84 #include <sys/syslog.h>
85 #include <sys/ubc.h>
86 #include <sys/vm.h>
87 #include <sys/sysctl.h>
88
89 #include <kern/assert.h>
90
91 #include <miscfs/specfs/specdev.h>
92
93 #include <mach/mach_types.h>
94 #include <mach/memory_object_types.h>
95
96
97 enum vtype iftovt_tab[16] = {
98 VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
99 VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
100 };
101 int vttoif_tab[9] = {
102 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
103 S_IFSOCK, S_IFIFO, S_IFMT,
104 };
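/*
 * These tables map between the S_IFMT file-type bits of a mode and the
 * vnode enum vtype, and back.  Illustrative use (a sketch; the IFTOVT()
 * and VTTOIF() macros in sys/vnode.h are thin wrappers over these lookups):
 *
 *	enum vtype t = iftovt_tab[((mode) & S_IFMT) >> 12];	// mode -> vtype
 *	int	   m = vttoif_tab[(int)vp->v_type];		// vtype -> S_IFMT bits
 */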
105
106 static void vfree(struct vnode *vp);
107 static void vinactive(struct vnode *vp);
108 static int vnreclaim(int count);
109 extern kern_return_t
110 adjust_vm_object_cache(vm_size_t oval, vm_size_t nval);
111
112 TAILQ_HEAD(freelst, vnode) vnode_free_list; /* vnode free list */
113 TAILQ_HEAD(inactivelst, vnode) vnode_inactive_list; /* vnode inactive list */
114 struct mntlist mountlist; /* mounted filesystem list */
115
116 #if DIAGNOSTIC
117 #define VLISTCHECK(fun, vp, list) \
118 if ((vp)->v_freelist.tqe_prev == (struct vnode **)0xdeadb) \
119 panic("%s: %s vnode not on %slist", (fun), (list), (list));
120
121 #define VINACTIVECHECK(fun, vp, expected) \
122 do { \
123 int __is_inactive = ISSET((vp)->v_flag, VUINACTIVE); \
124 if (__is_inactive ^ expected) \
125 panic("%s: %sinactive vnode, expected %s", (fun), \
126 __is_inactive? "" : "not ", \
127 expected? "inactive": "not inactive"); \
128 } while(0)
129 #else
130 #define VLISTCHECK(fun, vp, list)
131 #define VINACTIVECHECK(fun, vp, expected)
132 #endif /* DIAGNOSTIC */
133
134 #define VLISTNONE(vp) \
135 do { \
136 (vp)->v_freelist.tqe_next = (struct vnode *)0; \
137 (vp)->v_freelist.tqe_prev = (struct vnode **)0xdeadb; \
138 } while(0)
139
140 #define VONLIST(vp) \
141 ((vp)->v_freelist.tqe_prev != (struct vnode **)0xdeadb)
142
143 /* remove a vnode from free vnode list */
144 #define VREMFREE(fun, vp) \
145 do { \
146 VLISTCHECK((fun), (vp), "free"); \
147 TAILQ_REMOVE(&vnode_free_list, (vp), v_freelist); \
148 VLISTNONE((vp)); \
149 freevnodes--; \
150 } while(0)
151
152 /* remove a vnode from inactive vnode list */
153 #define VREMINACTIVE(fun, vp) \
154 do { \
155 VLISTCHECK((fun), (vp), "inactive"); \
156 VINACTIVECHECK((fun), (vp), VUINACTIVE); \
157 TAILQ_REMOVE(&vnode_inactive_list, (vp), v_freelist); \
158 CLR((vp)->v_flag, VUINACTIVE); \
159 VLISTNONE((vp)); \
160 inactivevnodes--; \
161 } while(0)
162
163 #define VORECLAIM_ENABLE(vp) \
164 do { \
165 if (ISSET((vp)->v_flag, VORECLAIM)) \
166 panic("vm object raclaim already"); \
167 SET((vp)->v_flag, VORECLAIM); \
168 } while(0)
169
170 #define VORECLAIM_DISABLE(vp) \
171 do { \
172 CLR((vp)->v_flag, VORECLAIM); \
173 if (ISSET((vp)->v_flag, VXWANT)) { \
174 CLR((vp)->v_flag, VXWANT); \
175 wakeup((caddr_t)(vp)); \
176 } \
177 } while(0)
178
179 /*
180 * Have to declare first two locks as actual data even if !MACH_SLOCKS, since
181  * pointers to them get passed around.
182 */
183 simple_lock_data_t mountlist_slock;
184 simple_lock_data_t mntvnode_slock;
185 decl_simple_lock_data(,mntid_slock);
186 decl_simple_lock_data(,vnode_free_list_slock);
187 decl_simple_lock_data(,spechash_slock);
188
189 /*
190  * vnodetarget is the number of vnodes we expect to get back
191  * from the inactive vnode list and VM object cache.
192  * As vnreclaim() is a mainly CPU-bound operation, on faster
193  * processors this number could be higher.
194 * Having this number too high introduces longer delays in
195 * the execution of getnewvnode().
196 */
197 unsigned long vnodetarget; /* target for vnreclaim() */
198 #define VNODE_FREE_TARGET 20 /* Default value for vnodetarget */
199
200 /*
201 * We need quite a few vnodes on the free list to sustain the
202  * rapid stat() calls the compilation process makes, and still benefit from the name
203 * cache. Having too few vnodes on the free list causes serious disk
204 * thrashing as we cycle through them.
205 */
206 #define VNODE_FREE_MIN		300	/* freelist should have at least this many */
207
208 /*
209 * We need to get vnodes back from the VM object cache when a certain #
210 * of vnodes are reused from the freelist. This is essential for the
211 * caching to be effective in the namecache and the buffer cache [for the
212 * metadata].
213 */
214 #define VNODE_TOOMANY_REUSED (VNODE_FREE_MIN/4)
215
216 /*
217 * If we have enough vnodes on the freelist we do not want to reclaim
218 * the vnodes from the VM object cache.
219 */
220 #define VNODE_FREE_ENOUGH (VNODE_FREE_MIN + (VNODE_FREE_MIN/2))
221
222 /*
223 * Initialize the vnode management data structures.
224 */
225 __private_extern__ void
226 vntblinit()
227 {
228 extern struct lock__bsd__ exchangelock;
229
230 simple_lock_init(&mountlist_slock);
231 simple_lock_init(&mntvnode_slock);
232 simple_lock_init(&mntid_slock);
233 simple_lock_init(&spechash_slock);
234 TAILQ_INIT(&vnode_free_list);
235 simple_lock_init(&vnode_free_list_slock);
236 TAILQ_INIT(&vnode_inactive_list);
237 CIRCLEQ_INIT(&mountlist);
238 lockinit(&exchangelock, PVFS, "exchange", 0, 0);
239
240 if (!vnodetarget)
241 vnodetarget = VNODE_FREE_TARGET;
242
243 /*
244 	 * Scale the vm_object_cache to accommodate the vnodes
245 * we want to cache
246 */
247 (void) adjust_vm_object_cache(0, desiredvnodes - VNODE_FREE_MIN);
248 }
249
250 /* Reset the VM Object Cache with the values passed in */
251 __private_extern__ kern_return_t
252 reset_vmobjectcache(unsigned int val1, unsigned int val2)
253 {
254 vm_size_t oval = val1 - VNODE_FREE_MIN;
255 vm_size_t nval;
256
257 if(val2 < VNODE_FREE_MIN)
258 nval = 0;
259 else
260 nval = val2 - VNODE_FREE_MIN;
261
262 return(adjust_vm_object_cache(oval, nval));
263 }
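/*
 * Illustrative (hypothetical) caller: when the administrator changes the
 * desired vnode count (for example through a kern.maxvnodes style sysctl),
 * the old and new values are handed to reset_vmobjectcache() so the VM
 * object cache is rescaled to match:
 *
 *	oldval = desiredvnodes;
 *	desiredvnodes = newval;
 *	(void) reset_vmobjectcache(oldval, desiredvnodes);
 */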
264
265 /*
266 * Mark a mount point as busy. Used to synchronize access and to delay
267 * unmounting. Interlock is not released on failure.
268 */
269 int
270 vfs_busy(mp, flags, interlkp, p)
271 struct mount *mp;
272 int flags;
273 struct slock *interlkp;
274 struct proc *p;
275 {
276 int lkflags;
277
278 if (mp->mnt_kern_flag & MNTK_UNMOUNT) {
279 if (flags & LK_NOWAIT)
280 return (ENOENT);
281 mp->mnt_kern_flag |= MNTK_MWAIT;
282 if (interlkp)
283 simple_unlock(interlkp);
284 /*
285 * Since all busy locks are shared except the exclusive
286 * lock granted when unmounting, the only place that a
287 * wakeup needs to be done is at the release of the
288 * exclusive lock at the end of dounmount.
289 */
290 sleep((caddr_t)mp, PVFS);
291 if (interlkp)
292 simple_lock(interlkp);
293 return (ENOENT);
294 }
295 lkflags = LK_SHARED;
296 if (interlkp)
297 lkflags |= LK_INTERLOCK;
298 if (lockmgr(&mp->mnt_lock, lkflags, interlkp, p))
299 panic("vfs_busy: unexpected lock failure");
300 return (0);
301 }
302
303 /*
304 * Free a busy filesystem.
305 */
306 void
307 vfs_unbusy(mp, p)
308 struct mount *mp;
309 struct proc *p;
310 {
311
312 lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, p);
313 }
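/*
 * Sketch of the usual vfs_busy()/vfs_unbusy() traversal idiom (the same
 * pattern appears later in this file, e.g. in sysctl_vnode()):
 *
 *	simple_lock(&mountlist_slock);
 *	for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) {
 *		if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) {
 *			nmp = mp->mnt_list.cqe_next;	// still hold the interlock
 *			continue;
 *		}
 *		// ... operate on the busied mount point ...
 *		simple_lock(&mountlist_slock);
 *		nmp = mp->mnt_list.cqe_next;
 *		vfs_unbusy(mp, p);
 *	}
 *	simple_unlock(&mountlist_slock);
 */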
314
315 /*
316 * Lookup a filesystem type, and if found allocate and initialize
317 * a mount structure for it.
318 *
319 * Devname is usually updated by mount(8) after booting.
320 */
321 int
322 vfs_rootmountalloc(fstypename, devname, mpp)
323 char *fstypename;
324 char *devname;
325 struct mount **mpp;
326 {
327 struct proc *p = current_proc(); /* XXX */
328 struct vfsconf *vfsp;
329 struct mount *mp;
330
331 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
332 if (!strcmp(vfsp->vfc_name, fstypename))
333 break;
334 if (vfsp == NULL)
335 return (ENODEV);
336 mp = _MALLOC_ZONE((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK);
337 bzero((char *)mp, (u_long)sizeof(struct mount));
338
339 /* Initialize the default IO constraints */
340 mp->mnt_maxreadcnt = mp->mnt_maxwritecnt = MAXPHYS;
341 mp->mnt_segreadcnt = mp->mnt_segwritecnt = 32;
342
343 lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, 0);
344 (void)vfs_busy(mp, LK_NOWAIT, 0, p);
345 LIST_INIT(&mp->mnt_vnodelist);
346 mp->mnt_vfc = vfsp;
347 mp->mnt_op = vfsp->vfc_vfsops;
348 mp->mnt_flag = MNT_RDONLY;
349 mp->mnt_vnodecovered = NULLVP;
350 vfsp->vfc_refcount++;
351 mp->mnt_stat.f_type = vfsp->vfc_typenum;
352 mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
353 strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
354 mp->mnt_stat.f_mntonname[0] = '/';
355 (void) copystr(devname, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, 0);
356 *mpp = mp;
357 return (0);
358 }
359
360 /*
361 * Find an appropriate filesystem to use for the root. If a filesystem
362 * has not been preselected, walk through the list of known filesystems
363 * trying those that have mountroot routines, and try them until one
364 * works or we have tried them all.
365 */
366 int
367 vfs_mountroot()
368 {
369 struct vfsconf *vfsp;
370 extern int (*mountroot)(void);
371 int error;
372
373 if (mountroot != NULL) {
374 error = (*mountroot)();
375 return (error);
376 }
377
378 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
379 if (vfsp->vfc_mountroot == NULL)
380 continue;
381 if ((error = (*vfsp->vfc_mountroot)()) == 0)
382 return (0);
383 if (error != EINVAL)
384 printf("%s_mountroot failed: %d\n", vfsp->vfc_name, error);
385 }
386 return (ENODEV);
387 }
388
389 /*
390 * Lookup a mount point by filesystem identifier.
391 */
392 struct mount *
393 vfs_getvfs(fsid)
394 fsid_t *fsid;
395 {
396 register struct mount *mp;
397
398 simple_lock(&mountlist_slock);
399 for (mp = mountlist.cqh_first; mp != (void *)&mountlist;
400 mp = mp->mnt_list.cqe_next) {
401 if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
402 mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
403 simple_unlock(&mountlist_slock);
404 return (mp);
405 }
406 }
407 simple_unlock(&mountlist_slock);
408 return ((struct mount *)0);
409 }
410
411 /*
412 * Get a new unique fsid
413 */
414 void
415 vfs_getnewfsid(mp)
416 struct mount *mp;
417 {
418 static u_short xxxfs_mntid;
419
420 fsid_t tfsid;
421 int mtype;
422
423 simple_lock(&mntid_slock);
424 mtype = mp->mnt_vfc->vfc_typenum;
425 mp->mnt_stat.f_fsid.val[0] = makedev(nblkdev + mtype, 0);
426 mp->mnt_stat.f_fsid.val[1] = mtype;
427 if (xxxfs_mntid == 0)
428 ++xxxfs_mntid;
429 tfsid.val[0] = makedev(nblkdev + mtype, xxxfs_mntid);
430 tfsid.val[1] = mtype;
431 if (mountlist.cqh_first != (void *)&mountlist) {
432 while (vfs_getvfs(&tfsid)) {
433 tfsid.val[0]++;
434 xxxfs_mntid++;
435 }
436 }
437 mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
438 simple_unlock(&mntid_slock);
439 }
440
441 /*
442 * Set vnode attributes to VNOVAL
443 */
444 void
445 vattr_null(vap)
446 register struct vattr *vap;
447 {
448
449 vap->va_type = VNON;
450 vap->va_size = vap->va_bytes = VNOVAL;
451 vap->va_mode = vap->va_nlink = vap->va_uid = vap->va_gid =
452 vap->va_fsid = vap->va_fileid =
453 vap->va_blocksize = vap->va_rdev =
454 vap->va_atime.tv_sec = vap->va_atime.tv_nsec =
455 vap->va_mtime.tv_sec = vap->va_mtime.tv_nsec =
456 vap->va_ctime.tv_sec = vap->va_ctime.tv_nsec =
457 vap->va_flags = vap->va_gen = VNOVAL;
458 vap->va_vaflags = 0;
459 }
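/*
 * Illustrative use (a sketch): a caller changing a single attribute
 * initializes the whole structure to VNOVAL first, so the filesystem
 * only acts on the fields that were explicitly set (VATTR_NULL() is
 * the usual wrapper around vattr_null()):
 *
 *	struct vattr va;
 *
 *	VATTR_NULL(&va);
 *	va.va_size = length;
 *	error = VOP_SETATTR(vp, &va, cred, p);
 */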
460
461 /*
462 * Routines having to do with the management of the vnode table.
463 */
464 extern int (**dead_vnodeop_p)(void *);
465 static void vclean __P((struct vnode *vp, int flag, struct proc *p));
466 extern void vgonel __P((struct vnode *vp, struct proc *p));
467 long numvnodes, freevnodes;
468 long inactivevnodes;
469 long vnode_reclaim_tried;
470 long vnode_objects_reclaimed;
471
472
473 extern struct vattr va_null;
474
475 /*
476 * Return the next vnode from the free list.
477 */
478 int
479 getnewvnode(tag, mp, vops, vpp)
480 enum vtagtype tag;
481 struct mount *mp;
482 int (**vops)(void *);
483 struct vnode **vpp;
484 {
485 struct proc *p = current_proc(); /* XXX */
486 struct vnode *vp;
487 int cnt, didretry = 0;
488 static int reused = 0; /* track the reuse rate */
489 int reclaimhits = 0;
490
491 retry:
492 simple_lock(&vnode_free_list_slock);
493 /*
494 * MALLOC a vnode if the number of vnodes has not reached the desired
495 * value and the number on the free list is still reasonable...
496 	 * otherwise reuse from the freelist, even though we may evict a name cache
497 	 * entry, to reduce the number of vnodes that accumulate... vnodes tie up
498 	 * wired memory and are never garbage collected.
499 */
500 if (numvnodes < desiredvnodes && (freevnodes < (2 * VNODE_FREE_MIN))) {
501 numvnodes++;
502 simple_unlock(&vnode_free_list_slock);
503 MALLOC_ZONE(vp, struct vnode *, sizeof *vp, M_VNODE, M_WAITOK);
504 bzero((char *)vp, sizeof *vp);
505 VLISTNONE(vp); /* avoid double queue removal */
506 simple_lock_init(&vp->v_interlock);
507 goto done;
508 }
509
510 /*
511 * Once the desired number of vnodes are allocated,
512 * we start reusing the vnodes.
513 */
514 if (freevnodes < VNODE_FREE_MIN) {
515 /*
516 * if we are low on vnodes on the freelist attempt to get
517 * some back from the inactive list and VM object cache
518 */
519 simple_unlock(&vnode_free_list_slock);
520 (void)vnreclaim(vnodetarget);
521 simple_lock(&vnode_free_list_slock);
522 }
523 if (numvnodes >= desiredvnodes && reused > VNODE_TOOMANY_REUSED) {
524 reused = 0;
525 if (freevnodes < VNODE_FREE_ENOUGH) {
526 simple_unlock(&vnode_free_list_slock);
527 (void)vnreclaim(vnodetarget);
528 simple_lock(&vnode_free_list_slock);
529 }
530 }
531
532 for (cnt = 0, vp = vnode_free_list.tqh_first;
533 vp != NULLVP; cnt++, vp = vp->v_freelist.tqe_next) {
534 if (simple_lock_try(&vp->v_interlock)) {
535 /* got the interlock */
536 if (ISSET(vp->v_flag, VORECLAIM)) {
537 /* skip over the vnodes that are being reclaimed */
538 simple_unlock(&vp->v_interlock);
539 reclaimhits++;
540 } else
541 break;
542 }
543 }
544
545 /*
546 * Unless this is a bad time of the month, at most
547 * the first NCPUS items on the free list are
548 * locked, so this is close enough to being empty.
549 */
550 if (vp == NULLVP) {
551 simple_unlock(&vnode_free_list_slock);
552 if (!(didretry++) && (vnreclaim(vnodetarget) > 0))
553 goto retry;
554 tablefull("vnode");
555 log(LOG_EMERG, "%d vnodes locked, %d desired, %d numvnodes, "
556 "%d free, %d inactive, %d being reclaimed\n",
557 cnt, desiredvnodes, numvnodes, freevnodes, inactivevnodes,
558 reclaimhits);
559 *vpp = 0;
560 return (ENFILE);
561 }
562
563 if (vp->v_usecount)
564 panic("free vnode isn't: v_type = %d, v_usecount = %d?",
565 vp->v_type, vp->v_usecount);
566
567 VREMFREE("getnewvnode", vp);
568 reused++;
569 simple_unlock(&vnode_free_list_slock);
570 vp->v_lease = NULL;
571 cache_purge(vp);
572 if (vp->v_type != VBAD)
573 vgonel(vp, p); /* clean and reclaim the vnode */
574 else
575 simple_unlock(&vp->v_interlock);
576 #if DIAGNOSTIC
577 if (vp->v_data)
578 panic("cleaned vnode isn't");
579 {
580 int s = splbio();
581 if (vp->v_numoutput)
582 panic("Clean vnode has pending I/O's");
583 splx(s);
584 }
585 #endif
586 if (UBCINFOEXISTS(vp))
587 panic("getnewvnode: ubcinfo not cleaned");
588 else
589 vp->v_ubcinfo = 0;
590
591 vp->v_lastr = -1;
592 vp->v_ralen = 0;
593 vp->v_maxra = 0;
594 vp->v_lastw = 0;
595 vp->v_ciosiz = 0;
596 vp->v_cstart = 0;
597 vp->v_clen = 0;
598 vp->v_socket = 0;
599
600 done:
601 vp->v_flag = VSTANDARD;
602 vp->v_type = VNON;
603 vp->v_tag = tag;
604 vp->v_op = vops;
605 insmntque(vp, mp);
606 *vpp = vp;
607 vp->v_usecount = 1;
608 vp->v_data = 0;
609 return (0);
610 }
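/*
 * Illustrative caller pattern (a sketch, not taken from this file): a
 * filesystem allocates a vnode for a newly read in-core inode roughly as
 *
 *	error = getnewvnode(VT_UFS, mp, ufs_vnodeop_p, &vp);
 *	if (error)
 *		return (error);
 *	vp->v_data = ip;	// hang fs-private data off the vnode
 *	vp->v_type = ...;	// set from the on-disk mode
 *
 * getnewvnode() hands back the vnode with v_usecount == 1 and v_data == 0.
 */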
611
612 /*
613 * Move a vnode from one mount queue to another.
614 */
615 void
616 insmntque(vp, mp)
617 struct vnode *vp;
618 struct mount *mp;
619 {
620
621 simple_lock(&mntvnode_slock);
622 /*
623 * Delete from old mount point vnode list, if on one.
624 */
625 if (vp->v_mount != NULL)
626 LIST_REMOVE(vp, v_mntvnodes);
627 /*
628 * Insert into list of vnodes for the new mount point, if available.
629 */
630 if ((vp->v_mount = mp) != NULL)
631 LIST_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes);
632 simple_unlock(&mntvnode_slock);
633 }
634
635 __inline void
636 vpwakeup(struct vnode *vp)
637 {
638 if (vp) {
639 if (--vp->v_numoutput < 0)
640 panic("vpwakeup: neg numoutput");
641 if ((vp->v_flag & VBWAIT || vp->v_flag & VTHROTTLED)
642 && vp->v_numoutput <= 0) {
643 vp->v_flag &= ~(VBWAIT|VTHROTTLED);
644 wakeup((caddr_t)&vp->v_numoutput);
645 }
646 }
647 }
648
649 /*
650 * Update outstanding I/O count and do wakeup if requested.
651 */
652 void
653 vwakeup(bp)
654 register struct buf *bp;
655 {
656 CLR(bp->b_flags, B_WRITEINPROG);
657 vpwakeup(bp->b_vp);
658 }
659
660 /*
661 * Flush out and invalidate all buffers associated with a vnode.
662 * Called with the underlying object locked.
663 */
664 int
665 vinvalbuf(vp, flags, cred, p, slpflag, slptimeo)
666 register struct vnode *vp;
667 int flags;
668 struct ucred *cred;
669 struct proc *p;
670 int slpflag, slptimeo;
671 {
672 register struct buf *bp;
673 struct buf *nbp, *blist;
674 int s, error = 0;
675
676 if (flags & V_SAVE) {
677 if (error = VOP_FSYNC(vp, cred, MNT_WAIT, p)) {
678 return (error);
679 }
680
681 // XXXdbg - if there are dirty bufs, wait for 'em if they're busy
682 for (bp=vp->v_dirtyblkhd.lh_first; bp; bp=nbp) {
683 nbp = bp->b_vnbufs.le_next;
684 if (ISSET(bp->b_flags, B_BUSY)) {
685 SET(bp->b_flags, B_WANTED);
686 tsleep((caddr_t)bp, slpflag | (PRIBIO + 1), "vinvalbuf", 0);
687 nbp = vp->v_dirtyblkhd.lh_first;
688 } else {
689 panic("vinvalbuf: dirty buf (vp 0x%x, bp 0x%x)", vp, bp);
690 }
691 }
692 }
693
694 for (;;) {
695 if ((blist = vp->v_cleanblkhd.lh_first) && (flags & V_SAVEMETA))
696 while (blist && blist->b_lblkno < 0)
697 blist = blist->b_vnbufs.le_next;
698 if (!blist && (blist = vp->v_dirtyblkhd.lh_first) &&
699 (flags & V_SAVEMETA))
700 while (blist && blist->b_lblkno < 0)
701 blist = blist->b_vnbufs.le_next;
702 if (!blist)
703 break;
704
705 for (bp = blist; bp; bp = nbp) {
706 nbp = bp->b_vnbufs.le_next;
707 if ((flags & V_SAVEMETA) && bp->b_lblkno < 0)
708 continue;
709 s = splbio();
710 if (ISSET(bp->b_flags, B_BUSY)) {
711 SET(bp->b_flags, B_WANTED);
712 error = tsleep((caddr_t)bp,
713 slpflag | (PRIBIO + 1), "vinvalbuf",
714 slptimeo);
715 splx(s);
716 if (error) {
717 return (error);
718 }
719 break;
720 }
721 bremfree(bp);
722 SET(bp->b_flags, B_BUSY);
723 splx(s);
724 /*
725 * XXX Since there are no node locks for NFS, I believe
726 * there is a slight chance that a delayed write will
727 * occur while sleeping just above, so check for it.
728 */
729 if (ISSET(bp->b_flags, B_DELWRI) && (flags & V_SAVE)) {
730 (void) VOP_BWRITE(bp);
731 break;
732 }
733
734 if (bp->b_flags & B_LOCKED) {
735 panic("vinvalbuf: bp @ 0x%x is locked!\n", bp);
736 break;
737 } else {
738 SET(bp->b_flags, B_INVAL);
739 }
740 brelse(bp);
741 }
742 }
743 if (!(flags & V_SAVEMETA) &&
744 (vp->v_dirtyblkhd.lh_first || vp->v_cleanblkhd.lh_first))
745 panic("vinvalbuf: flush failed");
746 return (0);
747 }
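/*
 * Illustrative use (a sketch): before recycling or truncating a vnode,
 * callers flush its buffers, writing dirty data out first if it must be
 * preserved:
 *
 *	error = vinvalbuf(vp, V_SAVE, cred, p, 0, 0);	// sync, then invalidate
 *	error = vinvalbuf(vp, 0, cred, p, 0, 0);	// just throw them away
 *
 * vclean() below uses the V_SAVE form.
 */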
748
749 /*
750 * Create a vnode for a block device.
751 * Used for root filesystem, argdev, and swap areas.
752 * Also used for memory file system special devices.
753 */
754 int
755 bdevvp(dev, vpp)
756 dev_t dev;
757 struct vnode **vpp;
758 {
759 register struct vnode *vp;
760 struct vnode *nvp;
761 int error;
762
763 if (dev == NODEV) {
764 *vpp = NULLVP;
765 return (ENODEV);
766 }
767 error = getnewvnode(VT_NON, (struct mount *)0, spec_vnodeop_p, &nvp);
768 if (error) {
769 *vpp = NULLVP;
770 return (error);
771 }
772 vp = nvp;
773 vp->v_type = VBLK;
774 if (nvp = checkalias(vp, dev, (struct mount *)0)) {
775 vput(vp);
776 vp = nvp;
777 }
778 *vpp = vp;
779 return (0);
780 }
781
782 /*
783 * Check to see if the new vnode represents a special device
784 * for which we already have a vnode (either because of
785 * bdevvp() or because of a different vnode representing
786 * the same block device). If such an alias exists, deallocate
787 * the existing contents and return the aliased vnode. The
788 * caller is responsible for filling it with its new contents.
789 */
790 struct vnode *
791 checkalias(nvp, nvp_rdev, mp)
792 register struct vnode *nvp;
793 dev_t nvp_rdev;
794 struct mount *mp;
795 {
796 struct proc *p = current_proc(); /* XXX */
797 struct vnode *vp;
798 struct vnode **vpp;
799 struct specinfo * bufhold;
800 int buffree = 1;
801
802 if (nvp->v_type != VBLK && nvp->v_type != VCHR)
803 return (NULLVP);
804
805 bufhold = (struct specinfo *)_MALLOC_ZONE(sizeof(struct specinfo),
806 M_VNODE, M_WAITOK);
807 vpp = &speclisth[SPECHASH(nvp_rdev)];
808 loop:
809 simple_lock(&spechash_slock);
810 for (vp = *vpp; vp; vp = vp->v_specnext) {
811 if (nvp_rdev != vp->v_rdev || nvp->v_type != vp->v_type)
812 continue;
813 /*
814 * Alias, but not in use, so flush it out.
815 */
816 simple_lock(&vp->v_interlock);
817 if (vp->v_usecount == 0) {
818 simple_unlock(&spechash_slock);
819 vgonel(vp, p);
820 goto loop;
821 }
822 if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, p)) {
823 simple_unlock(&spechash_slock);
824 goto loop;
825 }
826 break;
827 }
828 if (vp == NULL || vp->v_tag != VT_NON) {
829 nvp->v_specinfo = bufhold;
830 buffree = 0; /* buffer used */
831 bzero(nvp->v_specinfo, sizeof(struct specinfo));
832 nvp->v_rdev = nvp_rdev;
833 nvp->v_hashchain = vpp;
834 nvp->v_specnext = *vpp;
835 nvp->v_specflags = 0;
836 simple_unlock(&spechash_slock);
837 *vpp = nvp;
838 if (vp != NULLVP) {
839 nvp->v_flag |= VALIASED;
840 vp->v_flag |= VALIASED;
841 vput(vp);
842 }
843 /* Since buffer is used just return */
844 return (NULLVP);
845 }
846 simple_unlock(&spechash_slock);
847 VOP_UNLOCK(vp, 0, p);
848 simple_lock(&vp->v_interlock);
849 vclean(vp, 0, p);
850 vp->v_op = nvp->v_op;
851 vp->v_tag = nvp->v_tag;
852 nvp->v_type = VNON;
853 insmntque(vp, mp);
854 if (buffree)
855 _FREE_ZONE((void *)bufhold, sizeof (struct specinfo), M_VNODE);
856 return (vp);
857 }
858
859 /*
860 * Get a reference on a particular vnode and lock it if requested.
861 * If the vnode was on the inactive list, remove it from the list.
862 * If the vnode was on the free list, remove it from the list and
863 * move it to inactive list as needed.
864 * The vnode lock bit is set if the vnode is being eliminated in
865 * vgone. The process is awakened when the transition is completed,
866 * and an error returned to indicate that the vnode is no longer
867 * usable (possibly having been changed to a new file system type).
868 */
869 int
870 vget(vp, flags, p)
871 struct vnode *vp;
872 int flags;
873 struct proc *p;
874 {
875 int error = 0;
876
877 retry:
878
879 /*
880 * If the vnode is in the process of being cleaned out for
881 * another use, we wait for the cleaning to finish and then
882 * return failure. Cleaning is determined by checking that
883 * the VXLOCK flag is set.
884 */
885 if ((flags & LK_INTERLOCK) == 0)
886 simple_lock(&vp->v_interlock);
887 if ((vp->v_flag & VXLOCK) || (vp->v_flag & VORECLAIM)) {
888 vp->v_flag |= VXWANT;
889 simple_unlock(&vp->v_interlock);
890 (void)tsleep((caddr_t)vp, PINOD, "vget", 0);
891 return (ENOENT);
892 }
893
894 /*
895 * vnode is being terminated.
896 * wait for vnode_pager_no_senders() to clear VTERMINATE
897 */
898 if (ISSET(vp->v_flag, VTERMINATE)) {
899 SET(vp->v_flag, VTERMWANT);
900 simple_unlock(&vp->v_interlock);
901 (void)tsleep((caddr_t)&vp->v_ubcinfo, PINOD, "vclean", 0);
902 return (ENOENT);
903 }
904
905 /*
906 * if the vnode is being initialized,
907 * wait for it to finish initialization
908 */
909 if (ISSET(vp->v_flag, VUINIT)) {
910 if (ISSET(vp->v_flag, VUINIT)) {
911 SET(vp->v_flag, VUWANT);
912 simple_unlock(&vp->v_interlock);
913 (void) tsleep((caddr_t)vp, PINOD, "vget2", 0);
914 goto retry;
915 }
916 }
917
918 simple_lock(&vnode_free_list_slock);
919 if (vp->v_usecount == 0) {
920 /* If on the free list, remove it from there */
921 if (VONLIST(vp))
922 VREMFREE("vget", vp);
923 } else {
924 /* If on the inactive list, remove it from there */
925 if ((vp->v_usecount == 1) && UBCINFOEXISTS(vp)) {
926 if (VONLIST(vp))
927 VREMINACTIVE("vget", vp);
928 }
929 }
930
931 /* The vnode should not be on the inactive list here */
932 VINACTIVECHECK("vget", vp, 0);
933
934 simple_unlock(&vnode_free_list_slock);
935
936 if (++vp->v_usecount <= 0)
937 panic("vget: v_usecount");
938
939 /*
940 * Recover named reference as needed
941 */
942 if (UBCISVALID(vp) && !ubc_issetflags(vp, UI_HASOBJREF)) {
943 simple_unlock(&vp->v_interlock);
944 if (ubc_getobject(vp, UBC_HOLDOBJECT)) {
945 error = ENOENT;
946 goto errout;
947 }
948 simple_lock(&vp->v_interlock);
949 }
950
951 if (flags & LK_TYPE_MASK) {
952 if (error = vn_lock(vp, flags | LK_INTERLOCK, p))
953 goto errout;
954 return (0);
955 }
956
957 if ((flags & LK_INTERLOCK) == 0)
958 simple_unlock(&vp->v_interlock);
959 return (0);
960
961 errout:
962 /*
963 * If the vnode was not active in the first place
964 	 * we must not call vrele() as VOP_INACTIVE() is not
965 	 * required.
966 	 * So the relevant part of vrele() is inlined here.
967 */
968 simple_lock(&vp->v_interlock);
969 if (--vp->v_usecount == 1) {
970 if (UBCINFOEXISTS(vp)) {
971 vinactive(vp);
972 simple_unlock(&vp->v_interlock);
973 return (error);
974 }
975 }
976 if (vp->v_usecount > 0) {
977 simple_unlock(&vp->v_interlock);
978 return (error);
979 }
980 if (vp->v_usecount < 0)
981 panic("vget: negative usecount (%d)", vp->v_usecount);
982 vfree(vp);
983 simple_unlock(&vp->v_interlock);
984 return (error);
985 }
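/*
 * Illustrative vget()/vput() pairing (a sketch): code that finds a vnode
 * in a hash chain and wants it referenced and locked does roughly
 *
 *	if (vget(vp, LK_EXCLUSIVE, p))
 *		goto loop;		// being recycled; look it up again
 *	// ... use the referenced, locked vnode ...
 *	vput(vp);			// unlock and drop the reference
 */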
986
987 /*
988 * Get a pager reference on the particular vnode.
989 *
990  * This is called from ubc_info_init() and it is assumed that
991  * the vnode is neither on the free list nor on the inactive list.
992 * It is also assumed that the vnode is neither being recycled
993 * by vgonel nor being terminated by vnode_pager_vrele().
994 *
995 * The vnode interlock is NOT held by the caller.
996 */
997 __private_extern__ int
998 vnode_pager_vget(vp)
999 struct vnode *vp;
1000 {
1001 simple_lock(&vp->v_interlock);
1002 if (UBCINFOMISSING(vp))
1003 panic("vnode_pager_vget: stolen ubc_info");
1004
1005 if (!UBCINFOEXISTS(vp))
1006 panic("vnode_pager_vget: lost ubc_info");
1007
1008 if ((vp->v_flag & VXLOCK) || (vp->v_flag & VORECLAIM))
1009 panic("vnode_pager_vget: already being reclaimd");
1010
1011 if (ISSET(vp->v_flag, VTERMINATE))
1012 panic("vnode_pager_vget: already being terminated");
1013
1014 simple_lock(&vnode_free_list_slock);
1015 /* The vnode should not be on ANY list */
1016 if (VONLIST(vp))
1017 panic("vnode_pager_vget: still on the list");
1018
1019 /* The vnode should not be on the inactive list here */
1020 VINACTIVECHECK("vnode_pager_vget", vp, 0);
1021 simple_unlock(&vnode_free_list_slock);
1022
1023 /* After all those checks, now do the real work :-) */
1024 if (++vp->v_usecount <= 0)
1025 panic("vnode_pager_vget: v_usecount");
1026 simple_unlock(&vp->v_interlock);
1027
1028 return (0);
1029 }
1030
1031 /*
1032 * Stubs to use when there is no locking to be done on the underlying object.
1033 * A minimal shared lock is necessary to ensure that the underlying object
1034 * is not revoked while an operation is in progress. So, an active shared
1035  * count is maintained in an auxiliary vnode lock structure.
1036 */
1037 int
1038 vop_nolock(ap)
1039 struct vop_lock_args /* {
1040 struct vnode *a_vp;
1041 int a_flags;
1042 struct proc *a_p;
1043 } */ *ap;
1044 {
1045 #ifdef notyet
1046 /*
1047 * This code cannot be used until all the non-locking filesystems
1048 * (notably NFS) are converted to properly lock and release nodes.
1049 * Also, certain vnode operations change the locking state within
1050 * the operation (create, mknod, remove, link, rename, mkdir, rmdir,
1051 * and symlink). Ideally these operations should not change the
1052 * lock state, but should be changed to let the caller of the
1053 * function unlock them. Otherwise all intermediate vnode layers
1054 * (such as union, umapfs, etc) must catch these functions to do
1055 * the necessary locking at their layer. Note that the inactive
1056 * and lookup operations also change their lock state, but this
1057 * cannot be avoided, so these two operations will always need
1058 * to be handled in intermediate layers.
1059 */
1060 struct vnode *vp = ap->a_vp;
1061 int vnflags, flags = ap->a_flags;
1062
1063 if (vp->v_vnlock == NULL) {
1064 if ((flags & LK_TYPE_MASK) == LK_DRAIN)
1065 return (0);
1066 MALLOC_ZONE(vp->v_vnlock, struct lock__bsd__ *,
1067 sizeof(struct lock__bsd__), M_VNODE, M_WAITOK);
1068 lockinit(vp->v_vnlock, PVFS, "vnlock", 0, 0);
1069 }
1070 switch (flags & LK_TYPE_MASK) {
1071 case LK_DRAIN:
1072 vnflags = LK_DRAIN;
1073 break;
1074 case LK_EXCLUSIVE:
1075 case LK_SHARED:
1076 vnflags = LK_SHARED;
1077 break;
1078 case LK_UPGRADE:
1079 case LK_EXCLUPGRADE:
1080 case LK_DOWNGRADE:
1081 return (0);
1082 case LK_RELEASE:
1083 default:
1084 panic("vop_nolock: bad operation %d", flags & LK_TYPE_MASK);
1085 }
1086 if (flags & LK_INTERLOCK)
1087 vnflags |= LK_INTERLOCK;
1088 return(lockmgr(vp->v_vnlock, vnflags, &vp->v_interlock, ap->a_p));
1089 #else /* for now */
1090 /*
1091 * Since we are not using the lock manager, we must clear
1092 * the interlock here.
1093 */
1094 if (ap->a_flags & LK_INTERLOCK)
1095 simple_unlock(&ap->a_vp->v_interlock);
1096 return (0);
1097 #endif
1098 }
1099
1100 /*
1101 * Decrement the active use count.
1102 */
1103 int
1104 vop_nounlock(ap)
1105 struct vop_unlock_args /* {
1106 struct vnode *a_vp;
1107 int a_flags;
1108 struct proc *a_p;
1109 } */ *ap;
1110 {
1111 struct vnode *vp = ap->a_vp;
1112
1113 if (vp->v_vnlock == NULL)
1114 return (0);
1115 return (lockmgr(vp->v_vnlock, LK_RELEASE, NULL, ap->a_p));
1116 }
1117
1118 /*
1119  * Return whether or not the node is locked.
1120 */
1121 int
1122 vop_noislocked(ap)
1123 struct vop_islocked_args /* {
1124 struct vnode *a_vp;
1125 } */ *ap;
1126 {
1127 struct vnode *vp = ap->a_vp;
1128
1129 if (vp->v_vnlock == NULL)
1130 return (0);
1131 return (lockstatus(vp->v_vnlock));
1132 }
1133
1134 /*
1135 * Vnode reference.
1136 */
1137 void
1138 vref(vp)
1139 struct vnode *vp;
1140 {
1141
1142 simple_lock(&vp->v_interlock);
1143 if (vp->v_usecount <= 0)
1144 panic("vref used where vget required");
1145
1146 /* If on the inactive list, remove it from there */
1147 if ((vp->v_usecount == 1) && UBCINFOEXISTS(vp)) {
1148 if (VONLIST(vp)) {
1149 simple_lock(&vnode_free_list_slock);
1150 VREMINACTIVE("vref", vp);
1151 simple_unlock(&vnode_free_list_slock);
1152 }
1153 }
1154 /* The vnode should not be on the inactive list here */
1155 VINACTIVECHECK("vref", vp, 0);
1156
1157 if (++vp->v_usecount <= 0)
1158 panic("vref v_usecount");
1159 simple_unlock(&vp->v_interlock);
1160 }
1161
1162 /*
1163 * put the vnode on appropriate free list.
1164 * called with v_interlock held.
1165 */
1166 static void
1167 vfree(vp)
1168 struct vnode *vp;
1169 {
1170 /*
1171 	 * if the vnode was not obtained by calling getnewvnode() we
1172 * are not responsible for the cleanup. Just return.
1173 */
1174 if (!(vp->v_flag & VSTANDARD)) {
1175 return;
1176 }
1177
1178 if (vp->v_usecount != 0)
1179 panic("vfree: v_usecount");
1180
1181 /* insert at tail of LRU list or at head if VAGE is set */
1182 simple_lock(&vnode_free_list_slock);
1183
1184 if (VONLIST(vp))
1185 panic("vfree: vnode still on list");
1186
1187 if (vp->v_flag & VAGE) {
1188 TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
1189 vp->v_flag &= ~VAGE;
1190 } else
1191 TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
1192 freevnodes++;
1193 simple_unlock(&vnode_free_list_slock);
1194 return;
1195 }
1196
1197 /*
1198 * put the vnode on the inactive list.
1199 * called with v_interlock held
1200 */
1201 static void
1202 vinactive(vp)
1203 struct vnode *vp;
1204 {
1205 if (!UBCINFOEXISTS(vp))
1206 panic("vinactive: not a UBC vnode");
1207
1208 if (vp->v_usecount != 1)
1209 panic("vinactive: v_usecount");
1210
1211 simple_lock(&vnode_free_list_slock);
1212
1213 if (VONLIST(vp))
1214 panic("vinactive: vnode still on list");
1215 VINACTIVECHECK("vinactive", vp, 0);
1216
1217 TAILQ_INSERT_TAIL(&vnode_inactive_list, vp, v_freelist);
1218 SET(vp->v_flag, VUINACTIVE);
1219 CLR(vp->v_flag, (VNOCACHE_DATA | VRAOFF));
1220
1221 inactivevnodes++;
1222 simple_unlock(&vnode_free_list_slock);
1223 return;
1224 }
1225
1226
1227 /*
1228 * vput(), just unlock and vrele()
1229 */
1230 void
1231 vput(vp)
1232 struct vnode *vp;
1233 {
1234 struct proc *p = current_proc(); /* XXX */
1235
1236 simple_lock(&vp->v_interlock);
1237 if (--vp->v_usecount == 1) {
1238 if (UBCINFOEXISTS(vp)) {
1239 vinactive(vp);
1240 simple_unlock(&vp->v_interlock);
1241 VOP_UNLOCK(vp, 0, p);
1242 return;
1243 }
1244 }
1245 if (vp->v_usecount > 0) {
1246 simple_unlock(&vp->v_interlock);
1247 VOP_UNLOCK(vp, 0, p);
1248 return;
1249 }
1250 #if DIAGNOSTIC
1251 if (vp->v_usecount < 0 || vp->v_writecount != 0) {
1252 vprint("vput: bad ref count", vp);
1253 panic("vput: v_usecount = %d, v_writecount = %d",
1254 vp->v_usecount, vp->v_writecount);
1255 }
1256 #endif
1257 if (ISSET((vp)->v_flag, VUINACTIVE) && VONLIST(vp))
1258 VREMINACTIVE("vrele", vp);
1259
1260 simple_unlock(&vp->v_interlock);
1261 VOP_INACTIVE(vp, p);
1262 /*
1263 * The interlock is not held and
1264 	 * VOP_INACTIVE releases the vnode lock.
1265 	 * We could block and the vnode might get reactivated.
1266 	 * Cannot just call vfree without checking the state.
1267 */
1268 simple_lock(&vp->v_interlock);
1269 if (!VONLIST(vp)) {
1270 if (vp->v_usecount == 0)
1271 vfree(vp);
1272 else if ((vp->v_usecount == 1) && UBCINFOEXISTS(vp))
1273 vinactive(vp);
1274 }
1275 simple_unlock(&vp->v_interlock);
1276 }
1277
1278 /*
1279 * Vnode release.
1280 * If count drops to zero, call inactive routine and return to freelist.
1281 */
1282 void
1283 vrele(vp)
1284 struct vnode *vp;
1285 {
1286 struct proc *p = current_proc(); /* XXX */
1287
1288 simple_lock(&vp->v_interlock);
1289 if (--vp->v_usecount == 1) {
1290 if (UBCINFOEXISTS(vp)) {
1291 vinactive(vp);
1292 simple_unlock(&vp->v_interlock);
1293 return;
1294 }
1295 }
1296 if (vp->v_usecount > 0) {
1297 simple_unlock(&vp->v_interlock);
1298 return;
1299 }
1300 #if DIAGNOSTIC
1301 if (vp->v_usecount < 0 || vp->v_writecount != 0) {
1302 vprint("vrele: bad ref count", vp);
1303 panic("vrele: ref cnt");
1304 }
1305 #endif
1306 if (ISSET((vp)->v_flag, VUINACTIVE) && VONLIST(vp))
1307 VREMINACTIVE("vrele", vp);
1308
1309
1310 if ((vp->v_flag & VXLOCK) || (vp->v_flag & VORECLAIM)) {
1311 /* vnode is being cleaned, just return */
1312 vfree(vp);
1313 simple_unlock(&vp->v_interlock);
1314 return;
1315 }
1316
1317 if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, p) == 0) {
1318 VOP_INACTIVE(vp, p);
1319 /*
1320 * vn_lock releases the interlock and
1321 		 * VOP_INACTIVE releases the vnode lock.
1322 		 * We could block and the vnode might get reactivated.
1323 		 * Cannot just call vfree without checking the state.
1324 */
1325 simple_lock(&vp->v_interlock);
1326 if (!VONLIST(vp)) {
1327 if (vp->v_usecount == 0)
1328 vfree(vp);
1329 else if ((vp->v_usecount == 1) && UBCINFOEXISTS(vp))
1330 vinactive(vp);
1331 }
1332 simple_unlock(&vp->v_interlock);
1333 }
1334 #if 0
1335 else {
1336 vfree(vp);
1337 simple_unlock(&vp->v_interlock);
1338 kprintf("vrele: vn_lock() failed for vp = 0x%08x\n", vp);
1339 }
1340 #endif
1341 }
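/*
 * Division of labor between the two releases above: vput() is for callers
 * that still hold the vnode lock (it unlocks and then drops the reference),
 * while vrele() is for callers holding only a reference.  Illustratively:
 *
 *	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
 *	// ... operate on the locked vnode ...
 *	vput(vp);		// rather than VOP_UNLOCK(vp, 0, p); vrele(vp);
 */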
1342
1343 void
1344 vagevp(vp)
1345 struct vnode *vp;
1346 {
1347 simple_lock(&vp->v_interlock);
1348 vp->v_flag |= VAGE;
1349 simple_unlock(&vp->v_interlock);
1350 return;
1351 }
1352
1353 /*
1354 * Page or buffer structure gets a reference.
1355 */
1356 void
1357 vhold(vp)
1358 register struct vnode *vp;
1359 {
1360
1361 simple_lock(&vp->v_interlock);
1362 vp->v_holdcnt++;
1363 simple_unlock(&vp->v_interlock);
1364 }
1365
1366 /*
1367 * Page or buffer structure frees a reference.
1368 */
1369 void
1370 holdrele(vp)
1371 register struct vnode *vp;
1372 {
1373
1374 simple_lock(&vp->v_interlock);
1375 if (vp->v_holdcnt <= 0)
1376 panic("holdrele: holdcnt");
1377 vp->v_holdcnt--;
1378 simple_unlock(&vp->v_interlock);
1379 }
1380
1381 /*
1382 * Remove any vnodes in the vnode table belonging to mount point mp.
1383 *
1384 * If MNT_NOFORCE is specified, there should not be any active ones,
1385 * return error if any are found (nb: this is a user error, not a
1386 * system error). If MNT_FORCE is specified, detach any active vnodes
1387 * that are found.
1388 */
1389 #if DIAGNOSTIC
1390 int busyprt = 0; /* print out busy vnodes */
1391 #if 0
1392 struct ctldebug debug1 = { "busyprt", &busyprt };
1393 #endif /* 0 */
1394 #endif
1395
1396 int
1397 vflush(mp, skipvp, flags)
1398 struct mount *mp;
1399 struct vnode *skipvp;
1400 int flags;
1401 {
1402 struct proc *p = current_proc();
1403 struct vnode *vp, *nvp;
1404 int busy = 0;
1405
1406 simple_lock(&mntvnode_slock);
1407 loop:
1408 for (vp = mp->mnt_vnodelist.lh_first; vp; vp = nvp) {
1409 if (vp->v_mount != mp)
1410 goto loop;
1411 nvp = vp->v_mntvnodes.le_next;
1412 /*
1413 * Skip over a selected vnode.
1414 */
1415 if (vp == skipvp)
1416 continue;
1417
1418 simple_lock(&vp->v_interlock);
1419 /*
1420 		 * Skip over vnodes marked VSYSTEM or VNOFLUSH.
1421 */
1422 if ((flags & SKIPSYSTEM) && ((vp->v_flag & VSYSTEM) || (vp->v_flag & VNOFLUSH))) {
1423 simple_unlock(&vp->v_interlock);
1424 continue;
1425 }
1426 /*
1427 		 * Skip over vnodes marked VSWAP.
1428 */
1429 if ((flags & SKIPSWAP) && (vp->v_flag & VSWAP)) {
1430 simple_unlock(&vp->v_interlock);
1431 continue;
1432 }
1433 /*
1434 * If WRITECLOSE is set, only flush out regular file
1435 * vnodes open for writing.
1436 */
1437 if ((flags & WRITECLOSE) &&
1438 (vp->v_writecount == 0 || vp->v_type != VREG)) {
1439 simple_unlock(&vp->v_interlock);
1440 continue;
1441 }
1442 /*
1443 * With v_usecount == 0, all we need to do is clear
1444 * out the vnode data structures and we are done.
1445 */
1446 if (vp->v_usecount == 0) {
1447 simple_unlock(&mntvnode_slock);
1448 vgonel(vp, p);
1449 simple_lock(&mntvnode_slock);
1450 continue;
1451 }
1452 /*
1453 * If FORCECLOSE is set, forcibly close the vnode.
1454 * For block or character devices, revert to an
1455 * anonymous device. For all other files, just kill them.
1456 */
1457 if (flags & FORCECLOSE) {
1458 simple_unlock(&mntvnode_slock);
1459 if (vp->v_type != VBLK && vp->v_type != VCHR) {
1460 vgonel(vp, p);
1461 } else {
1462 vclean(vp, 0, p);
1463 vp->v_op = spec_vnodeop_p;
1464 insmntque(vp, (struct mount *)0);
1465 }
1466 simple_lock(&mntvnode_slock);
1467 continue;
1468 }
1469 #if DIAGNOSTIC
1470 if (busyprt)
1471 vprint("vflush: busy vnode", vp);
1472 #endif
1473 simple_unlock(&vp->v_interlock);
1474 busy++;
1475 }
1476 simple_unlock(&mntvnode_slock);
1477 if (busy && ((flags & FORCECLOSE)==0))
1478 return (EBUSY);
1479 return (0);
1480 }
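/*
 * Illustrative caller (a sketch): unmount code flushes every vnode on the
 * mount before tearing it down, e.g.
 *
 *	if (error = vflush(mp, NULLVP, forced ? FORCECLOSE : 0))
 *		return (error);
 *
 * With FORCECLOSE the EBUSY return above is suppressed and active vnodes
 * are forcibly disassociated instead.
 */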
1481
1482 /*
1483 * Disassociate the underlying file system from a vnode.
1484 * The vnode interlock is held on entry.
1485 */
1486 static void
1487 vclean(vp, flags, p)
1488 struct vnode *vp;
1489 int flags;
1490 struct proc *p;
1491 {
1492 int active;
1493 int removed = 0;
1494 int didhold;
1495
1496 /*
1497 	 * if the vnode was not obtained by calling getnewvnode() we
1498 * are not responsible for the cleanup. Just return.
1499 */
1500 if (!(vp->v_flag & VSTANDARD)) {
1501 simple_unlock(&vp->v_interlock);
1502 return;
1503 }
1504
1505 /*
1506 * Check to see if the vnode is in use.
1507 * If so we have to reference it before we clean it out
1508 * so that its count cannot fall to zero and generate a
1509 * race against ourselves to recycle it.
1510 */
1511 if (active = vp->v_usecount)
1512 if (++vp->v_usecount <= 0)
1513 panic("vclean: v_usecount");
1514 /*
1515 * Prevent the vnode from being recycled or
1516 * brought into use while we clean it out.
1517 */
1518 if (vp->v_flag & VXLOCK)
1519 panic("vclean: deadlock");
1520 vp->v_flag |= VXLOCK;
1521
1522 /*
1523 * Even if the count is zero, the VOP_INACTIVE routine may still
1524 * have the object locked while it cleans it out. The VOP_LOCK
1525 * ensures that the VOP_INACTIVE routine is done with its work.
1526 * For active vnodes, it ensures that no other activity can
1527 * occur while the underlying object is being cleaned out.
1528 */
1529 VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, p);
1530
1531 /*
1532 * if this vnode is on the inactive list
1533 * take it off the list.
1534 */
1535 if ((active == 1) &&
1536 (ISSET((vp)->v_flag, VUINACTIVE) && VONLIST(vp))) {
1537 simple_lock(&vnode_free_list_slock);
1538 VREMINACTIVE("vclean", vp);
1539 simple_unlock(&vnode_free_list_slock);
1540 removed++;
1541 }
1542
1543 	/* Close the file if this vnode is still in use. */
1544 if (active && (flags & DOCLOSE))
1545 VOP_CLOSE(vp, IO_NDELAY, NOCRED, p);
1546
1547 /* Clean the pages in VM. */
1548 didhold = ubc_hold(vp);
1549 if ((active) && (didhold))
1550 (void)ubc_clean(vp, 0); /* do not invalidate */
1551
1552 /*
1553 * Clean out any buffers associated with the vnode.
1554 */
1555 if (flags & DOCLOSE) {
1556 if (vp->v_tag == VT_NFS)
1557 nfs_vinvalbuf(vp, V_SAVE, NOCRED, p, 0);
1558 else
1559 vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0);
1560 }
1561
1562 if (active)
1563 VOP_INACTIVE(vp, p);
1564 else
1565 VOP_UNLOCK(vp, 0, p);
1566
1567 /* Destroy ubc named reference */
1568 if (didhold) {
1569 ubc_rele(vp);
1570 ubc_destroy_named(vp);
1571 }
1572
1573 /*
1574 * Reclaim the vnode.
1575 */
1576 if (VOP_RECLAIM(vp, p))
1577 panic("vclean: cannot reclaim");
1578 cache_purge(vp);
1579 if (vp->v_vnlock) {
1580 if ((vp->v_vnlock->lk_flags & LK_DRAINED) == 0)
1581 vprint("vclean: lock not drained", vp);
1582 FREE_ZONE(vp->v_vnlock, sizeof (struct lock__bsd__), M_VNODE);
1583 vp->v_vnlock = NULL;
1584 }
1585
1586 /* It's dead, Jim! */
1587 vp->v_op = dead_vnodeop_p;
1588 vp->v_tag = VT_NON;
1589
1590 /*
1591 * Done with purge, notify sleepers of the grim news.
1592 */
1593 vp->v_flag &= ~VXLOCK;
1594 if (vp->v_flag & VXWANT) {
1595 vp->v_flag &= ~VXWANT;
1596 wakeup((caddr_t)vp);
1597 }
1598
1599 if (active)
1600 vrele(vp);
1601 }
1602
1603 /*
1604 * Eliminate all activity associated with the requested vnode
1605 * and with all vnodes aliased to the requested vnode.
1606 */
1607 int
1608 vop_revoke(ap)
1609 struct vop_revoke_args /* {
1610 struct vnode *a_vp;
1611 int a_flags;
1612 } */ *ap;
1613 {
1614 struct vnode *vp, *vq;
1615 struct proc *p = current_proc();
1616
1617 #if DIAGNOSTIC
1618 if ((ap->a_flags & REVOKEALL) == 0)
1619 panic("vop_revoke");
1620 #endif
1621
1622 vp = ap->a_vp;
1623 simple_lock(&vp->v_interlock);
1624
1625 if (vp->v_flag & VALIASED) {
1626 /*
1627 * If a vgone (or vclean) is already in progress,
1628 * wait until it is done and return.
1629 */
1630 if (vp->v_flag & VXLOCK) {
1631 while (vp->v_flag & VXLOCK) {
1632 vp->v_flag |= VXWANT;
1633 simple_unlock(&vp->v_interlock);
1634 (void)tsleep((caddr_t)vp, PINOD, "vop_revokeall", 0);
1635 }
1636 return (0);
1637 }
1638 /*
1639 * Ensure that vp will not be vgone'd while we
1640 * are eliminating its aliases.
1641 */
1642 vp->v_flag |= VXLOCK;
1643 simple_unlock(&vp->v_interlock);
1644 while (vp->v_flag & VALIASED) {
1645 simple_lock(&spechash_slock);
1646 for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
1647 if (vq->v_rdev != vp->v_rdev ||
1648 vq->v_type != vp->v_type || vp == vq)
1649 continue;
1650 simple_unlock(&spechash_slock);
1651 vgone(vq);
1652 break;
1653 }
1654 if (vq == NULLVP)
1655 simple_unlock(&spechash_slock);
1656 }
1657 /*
1658 * Remove the lock so that vgone below will
1659 * really eliminate the vnode after which time
1660 * vgone will awaken any sleepers.
1661 */
1662 simple_lock(&vp->v_interlock);
1663 vp->v_flag &= ~VXLOCK;
1664 }
1665 vgonel(vp, p);
1666 return (0);
1667 }
1668
1669 /*
1670 * Recycle an unused vnode to the front of the free list.
1671 * Release the passed interlock if the vnode will be recycled.
1672 */
1673 int
1674 vrecycle(vp, inter_lkp, p)
1675 struct vnode *vp;
1676 struct slock *inter_lkp;
1677 struct proc *p;
1678 {
1679
1680 simple_lock(&vp->v_interlock);
1681 if (vp->v_usecount == 0) {
1682 if (inter_lkp)
1683 simple_unlock(inter_lkp);
1684 vgonel(vp, p);
1685 return (1);
1686 }
1687 simple_unlock(&vp->v_interlock);
1688 return (0);
1689 }
1690
1691 /*
1692 * Eliminate all activity associated with a vnode
1693 * in preparation for reuse.
1694 */
1695 void
1696 vgone(vp)
1697 struct vnode *vp;
1698 {
1699 struct proc *p = current_proc();
1700
1701 simple_lock(&vp->v_interlock);
1702 vgonel(vp, p);
1703 }
1704
1705 /*
1706 * vgone, with the vp interlock held.
1707 */
1708 void
1709 vgonel(vp, p)
1710 struct vnode *vp;
1711 struct proc *p;
1712 {
1713 struct vnode *vq;
1714 struct vnode *vx;
1715
1716 /*
1717 	 * if the vnode was not obtained by calling getnewvnode() we
1718 * are not responsible for the cleanup. Just return.
1719 */
1720 if (!(vp->v_flag & VSTANDARD)) {
1721 simple_unlock(&vp->v_interlock);
1722 return;
1723 }
1724
1725 /*
1726 * If a vgone (or vclean) is already in progress,
1727 * wait until it is done and return.
1728 */
1729 if (vp->v_flag & VXLOCK) {
1730 while (vp->v_flag & VXLOCK) {
1731 vp->v_flag |= VXWANT;
1732 simple_unlock(&vp->v_interlock);
1733 (void)tsleep((caddr_t)vp, PINOD, "vgone", 0);
1734 }
1735 return;
1736 }
1737 /*
1738 * Clean out the filesystem specific data.
1739 */
1740 vclean(vp, DOCLOSE, p);
1741 /*
1742 * Delete from old mount point vnode list, if on one.
1743 */
1744 if (vp->v_mount != NULL)
1745 insmntque(vp, (struct mount *)0);
1746 /*
1747 * If special device, remove it from special device alias list
1748 * if it is on one.
1749 */
1750 if ((vp->v_type == VBLK || vp->v_type == VCHR) && vp->v_specinfo != 0) {
1751 simple_lock(&spechash_slock);
1752 if (*vp->v_hashchain == vp) {
1753 *vp->v_hashchain = vp->v_specnext;
1754 } else {
1755 for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
1756 if (vq->v_specnext != vp)
1757 continue;
1758 vq->v_specnext = vp->v_specnext;
1759 break;
1760 }
1761 if (vq == NULL)
1762 panic("missing bdev");
1763 }
1764 if (vp->v_flag & VALIASED) {
1765 vx = NULL;
1766 for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
1767 if (vq->v_rdev != vp->v_rdev ||
1768 vq->v_type != vp->v_type)
1769 continue;
1770 if (vx)
1771 break;
1772 vx = vq;
1773 }
1774 if (vx == NULL)
1775 panic("missing alias");
1776 if (vq == NULL)
1777 vx->v_flag &= ~VALIASED;
1778 vp->v_flag &= ~VALIASED;
1779 }
1780 simple_unlock(&spechash_slock);
1781 FREE_ZONE(vp->v_specinfo, sizeof (struct specinfo), M_VNODE);
1782 vp->v_specinfo = NULL;
1783 }
1784 /*
1785 * If it is on the freelist and not already at the head,
1786 * move it to the head of the list. The test of the back
1787 * pointer and the reference count of zero is because
1788 * it will be removed from the free list by getnewvnode,
1789 * but will not have its reference count incremented until
1790 * after calling vgone. If the reference count were
1791 * incremented first, vgone would (incorrectly) try to
1792 * close the previous instance of the underlying object.
1793 * So, the back pointer is explicitly set to `0xdeadb' in
1794 * getnewvnode after removing it from the freelist to ensure
1795 * that we do not try to move it here.
1796 */
1797 if (vp->v_usecount == 0) {
1798 simple_lock(&vnode_free_list_slock);
1799 if ((vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb) &&
1800 vnode_free_list.tqh_first != vp) {
1801 TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
1802 TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
1803 }
1804 simple_unlock(&vnode_free_list_slock);
1805 }
1806 vp->v_type = VBAD;
1807 }
1808
1809 /*
1810 * Lookup a vnode by device number.
1811 */
1812 int
1813 vfinddev(dev, type, vpp)
1814 dev_t dev;
1815 enum vtype type;
1816 struct vnode **vpp;
1817 {
1818 struct vnode *vp;
1819 int rc = 0;
1820
1821 simple_lock(&spechash_slock);
1822 for (vp = speclisth[SPECHASH(dev)]; vp; vp = vp->v_specnext) {
1823 if (dev != vp->v_rdev || type != vp->v_type)
1824 continue;
1825 *vpp = vp;
1826 rc = 1;
1827 break;
1828 }
1829 simple_unlock(&spechash_slock);
1830 return (rc);
1831 }
1832
1833 /*
1834 * Calculate the total number of references to a special device.
1835 */
1836 int
1837 vcount(vp)
1838 struct vnode *vp;
1839 {
1840 struct vnode *vq, *vnext;
1841 int count;
1842
1843 loop:
1844 if ((vp->v_flag & VALIASED) == 0)
1845 return (vp->v_usecount);
1846 simple_lock(&spechash_slock);
1847 for (count = 0, vq = *vp->v_hashchain; vq; vq = vnext) {
1848 vnext = vq->v_specnext;
1849 if (vq->v_rdev != vp->v_rdev || vq->v_type != vp->v_type)
1850 continue;
1851 /*
1852 * Alias, but not in use, so flush it out.
1853 */
1854 if (vq->v_usecount == 0 && vq != vp) {
1855 simple_unlock(&spechash_slock);
1856 vgone(vq);
1857 goto loop;
1858 }
1859 count += vq->v_usecount;
1860 }
1861 simple_unlock(&spechash_slock);
1862 return (count);
1863 }
1864
1865 int prtactive = 0; /* 1 => print out reclaim of active vnodes */
1866
1867 /*
1868 * Print out a description of a vnode.
1869 */
1870 static char *typename[] =
1871 { "VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD" };
1872
1873 void
1874 vprint(label, vp)
1875 char *label;
1876 register struct vnode *vp;
1877 {
1878 char buf[64];
1879
1880 if (label != NULL)
1881 printf("%s: ", label);
1882 printf("type %s, usecount %d, writecount %d, refcount %d,",
1883 typename[vp->v_type], vp->v_usecount, vp->v_writecount,
1884 vp->v_holdcnt);
1885 buf[0] = '\0';
1886 if (vp->v_flag & VROOT)
1887 strcat(buf, "|VROOT");
1888 if (vp->v_flag & VTEXT)
1889 strcat(buf, "|VTEXT");
1890 if (vp->v_flag & VSYSTEM)
1891 strcat(buf, "|VSYSTEM");
1892 if (vp->v_flag & VNOFLUSH)
1893 strcat(buf, "|VNOFLUSH");
1894 if (vp->v_flag & VXLOCK)
1895 strcat(buf, "|VXLOCK");
1896 if (vp->v_flag & VXWANT)
1897 strcat(buf, "|VXWANT");
1898 if (vp->v_flag & VBWAIT)
1899 strcat(buf, "|VBWAIT");
1900 if (vp->v_flag & VALIASED)
1901 strcat(buf, "|VALIASED");
1902 if (buf[0] != '\0')
1903 printf(" flags (%s)", &buf[1]);
1904 if (vp->v_data == NULL) {
1905 printf("\n");
1906 } else {
1907 printf("\n\t");
1908 VOP_PRINT(vp);
1909 }
1910 }
1911
1912 #ifdef DEBUG
1913 /*
1914 * List all of the locked vnodes in the system.
1915 * Called when debugging the kernel.
1916 */
1917 void
1918 printlockedvnodes()
1919 {
1920 struct proc *p = current_proc();
1921 struct mount *mp, *nmp;
1922 struct vnode *vp;
1923
1924 printf("Locked vnodes\n");
1925 simple_lock(&mountlist_slock);
1926 for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) {
1927 if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) {
1928 nmp = mp->mnt_list.cqe_next;
1929 continue;
1930 }
1931 for (vp = mp->mnt_vnodelist.lh_first;
1932 vp != NULL;
1933 vp = vp->v_mntvnodes.le_next) {
1934 if (VOP_ISLOCKED(vp))
1935 vprint((char *)0, vp);
1936 }
1937 simple_lock(&mountlist_slock);
1938 nmp = mp->mnt_list.cqe_next;
1939 vfs_unbusy(mp, p);
1940 }
1941 simple_unlock(&mountlist_slock);
1942 }
1943 #endif
1944
1945 /*
1946 * Top level filesystem related information gathering.
1947 */
1948 int
1949 vfs_sysctl(name, namelen, oldp, oldlenp, newp, newlen, p)
1950 int *name;
1951 u_int namelen;
1952 void *oldp;
1953 size_t *oldlenp;
1954 void *newp;
1955 size_t newlen;
1956 struct proc *p;
1957 {
1958 struct vfsconf *vfsp;
1959
1960 /*
1961 	 * VFS_NUMMNTOPS shouldn't be at name[0] since it
1962 	 * is a VFS-generic variable. So now we must check
1963 	 * namelen so we don't end up colliding with any UFS
1964 	 * variables (since UFS vfc_typenum is 1).
1965 *
1966 * It should have been:
1967 * name[0]: VFS_GENERIC
1968 * name[1]: VFS_NUMMNTOPS
1969 */
1970 if (namelen == 1 && name[0] == VFS_NUMMNTOPS) {
1971 extern unsigned int vfs_nummntops;
1972 return (sysctl_rdint(oldp, oldlenp, newp, vfs_nummntops));
1973 }
1974
1975 /* all sysctl names at this level are at least name and field */
1976 if (namelen < 2)
1977 return (ENOTDIR); /* overloaded */
1978 if (name[0] != VFS_GENERIC) {
1979 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
1980 if (vfsp->vfc_typenum == name[0])
1981 break;
1982 if (vfsp == NULL)
1983 return (EOPNOTSUPP);
1984 return ((*vfsp->vfc_vfsops->vfs_sysctl)(&name[1], namelen - 1,
1985 oldp, oldlenp, newp, newlen, p));
1986 }
1987 switch (name[1]) {
1988 case VFS_MAXTYPENUM:
1989 return (sysctl_rdint(oldp, oldlenp, newp, maxvfsconf));
1990 case VFS_CONF:
1991 if (namelen < 3)
1992 return (ENOTDIR); /* overloaded */
1993 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
1994 if (vfsp->vfc_typenum == name[2])
1995 break;
1996 if (vfsp == NULL)
1997 return (EOPNOTSUPP);
1998 return (sysctl_rdstruct(oldp, oldlenp, newp, vfsp,
1999 sizeof(struct vfsconf)));
2000 }
2001 return (EOPNOTSUPP);
2002 }
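/*
 * Illustrative name vectors handled above (a sketch):
 *
 *	{ <fs vfc_typenum>, <fs selector>, ... }	forwarded to that
 *							filesystem's vfs_sysctl
 *	{ VFS_GENERIC, VFS_MAXTYPENUM }			highest fs type number
 *	{ VFS_GENERIC, VFS_CONF, <fs vfc_typenum> }	that fs's vfsconf record
 */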
2003
2004 int kinfo_vdebug = 1;
2005 #define KINFO_VNODESLOP 10
2006 /*
2007 * Dump vnode list (via sysctl).
2008 * Copyout address of vnode followed by vnode.
2009 */
2010 /* ARGSUSED */
2011 int
2012 sysctl_vnode(where, sizep, p)
2013 char *where;
2014 size_t *sizep;
2015 struct proc *p;
2016 {
2017 struct mount *mp, *nmp;
2018 struct vnode *nvp, *vp;
2019 char *bp = where, *savebp;
2020 char *ewhere;
2021 int error;
2022
2023 #define VPTRSZ sizeof (struct vnode *)
2024 #define VNODESZ sizeof (struct vnode)
2025 if (where == NULL) {
2026 *sizep = (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ);
2027 return (0);
2028 }
2029 ewhere = where + *sizep;
2030
2031 simple_lock(&mountlist_slock);
2032 for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) {
2033 if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) {
2034 nmp = mp->mnt_list.cqe_next;
2035 continue;
2036 }
2037 savebp = bp;
2038 again:
2039 simple_lock(&mntvnode_slock);
2040 for (vp = mp->mnt_vnodelist.lh_first;
2041 vp != NULL;
2042 vp = nvp) {
2043 /*
2044 * Check that the vp is still associated with
2045 * this filesystem. RACE: could have been
2046 * recycled onto the same filesystem.
2047 */
2048 if (vp->v_mount != mp) {
2049 simple_unlock(&mntvnode_slock);
2050 if (kinfo_vdebug)
2051 printf("kinfo: vp changed\n");
2052 bp = savebp;
2053 goto again;
2054 }
2055 nvp = vp->v_mntvnodes.le_next;
2056 if (bp + VPTRSZ + VNODESZ > ewhere) {
2057 simple_unlock(&mntvnode_slock);
2058 *sizep = bp - where;
2059 return (ENOMEM);
2060 }
2061 simple_unlock(&mntvnode_slock);
2062 if ((error = copyout((caddr_t)&vp, bp, VPTRSZ)) ||
2063 (error = copyout((caddr_t)vp, bp + VPTRSZ, VNODESZ)))
2064 return (error);
2065 bp += VPTRSZ + VNODESZ;
2066 simple_lock(&mntvnode_slock);
2067 }
2068 simple_unlock(&mntvnode_slock);
2069 simple_lock(&mountlist_slock);
2070 nmp = mp->mnt_list.cqe_next;
2071 vfs_unbusy(mp, p);
2072 }
2073 simple_unlock(&mountlist_slock);
2074
2075 *sizep = bp - where;
2076 return (0);
2077 }
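
/*
 * The buffer produced above is a packed sequence of (vnode pointer, vnode)
 * pairs.  A minimal sketch of walking it on the consuming side, assuming
 * "buf" and "len" came back from the sysctl; the variable names are
 * illustrative only:
 *
 *	char *cp, *end;
 *	size_t stride = sizeof(struct vnode *) + sizeof(struct vnode);
 *
 *	end = buf + len;
 *	for (cp = buf; cp + stride <= end; cp += stride) {
 *		struct vnode *kaddr = *(struct vnode **)cp;
 *		struct vnode *vcopy = (struct vnode *)(cp + sizeof(struct vnode *));
 *		... kaddr is the kernel address, vcopy the copied contents ...
 *	}
 */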
2078
2079 /*
2080 * Check to see if a filesystem is mounted on a block device.
2081 */
2082 int
2083 vfs_mountedon(vp)
2084 struct vnode *vp;
2085 {
2086 struct vnode *vq;
2087 int error = 0;
2088
2089 if (vp->v_specflags & SI_MOUNTEDON)
2090 return (EBUSY);
2091 if (vp->v_flag & VALIASED) {
2092 simple_lock(&spechash_slock);
2093 for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
2094 if (vq->v_rdev != vp->v_rdev ||
2095 vq->v_type != vp->v_type)
2096 continue;
2097 if (vq->v_specflags & SI_MOUNTEDON) {
2098 error = EBUSY;
2099 break;
2100 }
2101 }
2102 simple_unlock(&spechash_slock);
2103 }
2104 return (error);
2105 }
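
/*
 * A sketch of the typical caller pattern (not a quote of any particular
 * filesystem's mount code): before mounting a filesystem on a block device,
 * the device vnode is checked so the same device cannot be mounted twice.
 *
 *	if ((error = vfs_mountedon(devvp)))
 *		return (error);		EBUSY if already mounted on
 */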
2106
2107 /*
2108 * Unmount all filesystems. The list is traversed in reverse order
2109 * of mounting to avoid dependencies.
2110 */
2111 __private_extern__ void
2112 vfs_unmountall()
2113 {
2114 struct mount *mp, *nmp;
2115 struct proc *p = current_proc();
2116
2117 /*
2118 * Since this only runs when rebooting, it is not interlocked.
2119 */
2120 for (mp = mountlist.cqh_last; mp != (void *)&mountlist; mp = nmp) {
2121 nmp = mp->mnt_list.cqe_prev;
2122 (void) dounmount(mp, MNT_FORCE, p);
2123 }
2124 }
2125
2126 /*
2127 * Build hash lists of net addresses and hang them off the mount point.
2128 * Called by vfs_export() to set up the lists of export addresses.
2129 */
2130 static int
2131 vfs_hang_addrlist(mp, nep, argp)
2132 struct mount *mp;
2133 struct netexport *nep;
2134 struct export_args *argp;
2135 {
2136 register struct netcred *np;
2137 register struct radix_node_head *rnh;
2138 register int i;
2139 struct radix_node *rn;
2140 struct sockaddr *saddr, *smask = 0;
2141 struct domain *dom;
2142 int error;
2143
2144 if (argp->ex_addrlen == 0) {
2145 if (mp->mnt_flag & MNT_DEFEXPORTED)
2146 return (EPERM);
2147 np = &nep->ne_defexported;
2148 np->netc_exflags = argp->ex_flags;
2149 np->netc_anon = argp->ex_anon;
2150 np->netc_anon.cr_ref = 1;
2151 mp->mnt_flag |= MNT_DEFEXPORTED;
2152 return (0);
2153 }
2154 i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen;
2155 MALLOC(np, struct netcred *, i, M_NETADDR, M_WAITOK);
2156 bzero((caddr_t)np, i);
2157 saddr = (struct sockaddr *)(np + 1);
2158 if (error = copyin(argp->ex_addr, (caddr_t)saddr, argp->ex_addrlen))
2159 goto out;
2160 if (saddr->sa_len > argp->ex_addrlen)
2161 saddr->sa_len = argp->ex_addrlen;
2162 if (argp->ex_masklen) {
2163 smask = (struct sockaddr *)((caddr_t)saddr + argp->ex_addrlen);
2164 		error = copyin(argp->ex_mask, (caddr_t)smask, argp->ex_masklen);
2165 if (error)
2166 goto out;
2167 if (smask->sa_len > argp->ex_masklen)
2168 smask->sa_len = argp->ex_masklen;
2169 }
2170 i = saddr->sa_family;
2171 if ((rnh = nep->ne_rtable[i]) == 0) {
2172 /*
2173 		 * It seems silly to initialize every AF when most are not
2174 		 * used, so do it on demand here.
2175 */
2176 for (dom = domains; dom; dom = dom->dom_next)
2177 if (dom->dom_family == i && dom->dom_rtattach) {
2178 dom->dom_rtattach((void **)&nep->ne_rtable[i],
2179 dom->dom_rtoffset);
2180 break;
2181 }
2182 if ((rnh = nep->ne_rtable[i]) == 0) {
2183 error = ENOBUFS;
2184 goto out;
2185 }
2186 }
2187 rn = (*rnh->rnh_addaddr)((caddr_t)saddr, (caddr_t)smask, rnh,
2188 np->netc_rnodes);
2189 if (rn == 0) {
2190 /*
2191 * One of the reasons that rnh_addaddr may fail is that
2192 * the entry already exists. To check for this case, we
2193 * look up the entry to see if it is there. If so, we
2194 * do not need to make a new entry but do return success.
2195 */
2196 _FREE(np, M_NETADDR);
2197 rn = (*rnh->rnh_matchaddr)((caddr_t)saddr, rnh);
2198 if (rn != 0 && (rn->rn_flags & RNF_ROOT) == 0 &&
2199 ((struct netcred *)rn)->netc_exflags == argp->ex_flags &&
2200 !bcmp((caddr_t)&((struct netcred *)rn)->netc_anon,
2201 (caddr_t)&argp->ex_anon, sizeof(struct ucred)))
2202 return (0);
2203 return (EPERM);
2204 }
2205 np->netc_exflags = argp->ex_flags;
2206 np->netc_anon = argp->ex_anon;
2207 np->netc_anon.cr_ref = 1;
2208 return (0);
2209 out:
2210 _FREE(np, M_NETADDR);
2211 return (error);
2212 }
2213
2214 /* ARGSUSED */
2215 static int
2216 vfs_free_netcred(rn, w)
2217 struct radix_node *rn;
2218 caddr_t w;
2219 {
2220 register struct radix_node_head *rnh = (struct radix_node_head *)w;
2221
2222 (*rnh->rnh_deladdr)(rn->rn_key, rn->rn_mask, rnh);
2223 _FREE((caddr_t)rn, M_NETADDR);
2224 return (0);
2225 }
2226
2227 /*
2228 * Free the net address hash lists that are hanging off the mount points.
2229 */
2230 static void
2231 vfs_free_addrlist(nep)
2232 struct netexport *nep;
2233 {
2234 register int i;
2235 register struct radix_node_head *rnh;
2236
2237 for (i = 0; i <= AF_MAX; i++)
2238 if (rnh = nep->ne_rtable[i]) {
2239 (*rnh->rnh_walktree)(rnh, vfs_free_netcred,
2240 (caddr_t)rnh);
2241 _FREE((caddr_t)rnh, M_RTABLE);
2242 nep->ne_rtable[i] = 0;
2243 }
2244 }
2245
2246 int
2247 vfs_export(mp, nep, argp)
2248 struct mount *mp;
2249 struct netexport *nep;
2250 struct export_args *argp;
2251 {
2252 int error;
2253
2254 if (argp->ex_flags & MNT_DELEXPORT) {
2255 vfs_free_addrlist(nep);
2256 mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED);
2257 }
2258 if (argp->ex_flags & MNT_EXPORTED) {
2259 if (error = vfs_hang_addrlist(mp, nep, argp))
2260 return (error);
2261 mp->mnt_flag |= MNT_EXPORTED;
2262 }
2263 return (0);
2264 }
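
/*
 * A minimal sketch of how a filesystem's export path might call vfs_export()
 * to publish a default export (the ex_addrlen == 0 case handled in
 * vfs_hang_addrlist() above).  The struct export_args fields come from
 * <sys/mount.h>; "ump->um_export" and "anoncred" are illustrative names only.
 *
 *	struct export_args ea;
 *
 *	bzero(&ea, sizeof(ea));
 *	ea.ex_flags = MNT_EXPORTED | MNT_EXRDONLY;
 *	ea.ex_anon = anoncred;			credentials for anonymous clients
 *	ea.ex_addrlen = 0;			no address: the default export
 *	error = vfs_export(mp, &ump->um_export, &ea);
 */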
2265
2266 struct netcred *
2267 vfs_export_lookup(mp, nep, nam)
2268 register struct mount *mp;
2269 struct netexport *nep;
2270 struct mbuf *nam;
2271 {
2272 register struct netcred *np;
2273 register struct radix_node_head *rnh;
2274 struct sockaddr *saddr;
2275
2276 np = NULL;
2277 if (mp->mnt_flag & MNT_EXPORTED) {
2278 /*
2279 * Lookup in the export list first.
2280 */
2281 if (nam != NULL) {
2282 saddr = mtod(nam, struct sockaddr *);
2283 rnh = nep->ne_rtable[saddr->sa_family];
2284 if (rnh != NULL) {
2285 np = (struct netcred *)
2286 (*rnh->rnh_matchaddr)((caddr_t)saddr,
2287 rnh);
2288 if (np && np->netc_rnodes->rn_flags & RNF_ROOT)
2289 np = NULL;
2290 }
2291 }
2292 /*
2293 * If no address match, use the default if it exists.
2294 */
2295 if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED)
2296 np = &nep->ne_defexported;
2297 }
2298 return (np);
2299 }
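
/*
 * A sketch of the intended use from an NFS-style server (illustrative, not
 * lifted from the actual nfs code): given the mount point and the client's
 * address in an mbuf, a NULL return means the client has no export access.
 *
 *	np = vfs_export_lookup(mp, &ump->um_export, nam);
 *	if (np == NULL)
 *		return (EACCES);
 *	then honor np->netc_exflags and map credentials via np->netc_anon
 */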
2300
2301 /*
2302 * try to reclaim vnodes from the memory
2303 * object cache
2304 */
2305 static int
2306 vm_object_cache_reclaim(int count)
2307 {
2308 int cnt;
2309 void vnode_pager_release_from_cache(int *);
2310
2311 /* attempt to reclaim vnodes from VM object cache */
2312 cnt = count;
2313 vnode_pager_release_from_cache(&cnt);
2314 return(cnt);
2315 }
2316
2317 /*
2318 * Release memory object reference held by inactive vnodes
2319 * and then try to reclaim some vnodes from the memory
2320 * object cache
2321 */
2322 static int
2323 vnreclaim(int count)
2324 {
2325 int i, loopcnt;
2326 struct vnode *vp;
2327 int err;
2328 struct proc *p;
2329
2330 i = 0;
2331 loopcnt = 0;
2332
2333 /* Try to release "count" vnodes from the inactive list */
2334 restart:
2335 if (++loopcnt > inactivevnodes) {
2336 /*
2337 * I did my best trying to reclaim the vnodes.
2338 * Do not try any more as that would only lead to
2339 * long latencies. Also in the worst case
2340 * this can get totally CPU bound.
2341 		 * Just fall through and attempt a reclaim of the VM
2342 		 * object cache.
2343 */
2344 goto out;
2345 }
2346
2347 simple_lock(&vnode_free_list_slock);
2348 for (vp = TAILQ_FIRST(&vnode_inactive_list);
2349 (vp != NULLVP) && (i < count);
2350 vp = TAILQ_NEXT(vp, v_freelist)) {
2351
2352 if (!simple_lock_try(&vp->v_interlock))
2353 continue;
2354
2355 if (vp->v_usecount != 1)
2356 panic("vnreclaim: v_usecount");
2357
2358 if(!UBCINFOEXISTS(vp)) {
2359 if (vp->v_type == VBAD) {
2360 VREMINACTIVE("vnreclaim", vp);
2361 simple_unlock(&vp->v_interlock);
2362 continue;
2363 } else
2364 panic("non UBC vnode on inactive list");
2365 /* Should not reach here */
2366 }
2367
2368 /* If vnode is already being reclaimed, wait */
2369 if ((vp->v_flag & VXLOCK) || (vp->v_flag & VORECLAIM)) {
2370 vp->v_flag |= VXWANT;
2371 simple_unlock(&vp->v_interlock);
2372 simple_unlock(&vnode_free_list_slock);
2373 (void)tsleep((caddr_t)vp, PINOD, "vocr", 0);
2374 goto restart;
2375 }
2376
2377 VREMINACTIVE("vnreclaim", vp);
2378 simple_unlock(&vnode_free_list_slock);
2379
2380 if (ubc_issetflags(vp, UI_WASMAPPED)) {
2381 /*
2382 * We should not reclaim as it is likely
2383 * to be in use. Let it die a natural death.
2384 * Release the UBC reference if one exists
2385 * and put it back at the tail.
2386 */
2387 simple_unlock(&vp->v_interlock);
2388 if (ubc_release_named(vp)) {
2389 if (UBCINFOEXISTS(vp)) {
2390 simple_lock(&vp->v_interlock);
2391 if (vp->v_usecount == 1 && !VONLIST(vp))
2392 vinactive(vp);
2393 simple_unlock(&vp->v_interlock);
2394 }
2395 } else {
2396 simple_lock(&vp->v_interlock);
2397 vinactive(vp);
2398 simple_unlock(&vp->v_interlock);
2399 }
2400 } else {
2401 int didhold;
2402
2403 VORECLAIM_ENABLE(vp);
2404
2405 /*
2406 * scrub the dirty pages and invalidate the buffers
2407 */
2408 p = current_proc();
2409 err = vn_lock(vp, LK_EXCLUSIVE|LK_INTERLOCK, p);
2410 if (err) {
2411 /* cannot reclaim */
2412 simple_lock(&vp->v_interlock);
2413 vinactive(vp);
2414 VORECLAIM_DISABLE(vp);
2415 i++;
2416 simple_unlock(&vp->v_interlock);
2417 goto restart;
2418 }
2419
2420 /* keep the vnode alive so we can kill it */
2421 simple_lock(&vp->v_interlock);
2422 if(vp->v_usecount != 1)
2423 panic("VOCR: usecount race");
2424 vp->v_usecount++;
2425 simple_unlock(&vp->v_interlock);
2426
2427 /* clean up the state in VM without invalidating */
2428 didhold = ubc_hold(vp);
2429 if (didhold)
2430 (void)ubc_clean(vp, 0);
2431
2432 /* flush and invalidate buffers associated with the vnode */
2433 if (vp->v_tag == VT_NFS)
2434 nfs_vinvalbuf(vp, V_SAVE, NOCRED, p, 0);
2435 else
2436 vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0);
2437
2438 /*
2439 * Note: for the v_usecount == 2 case, VOP_INACTIVE
2440 * has not yet been called. Call it now while vp is
2441 			 * still locked; it will also release the lock.
2442 */
2443 if (vp->v_usecount == 2)
2444 VOP_INACTIVE(vp, p);
2445 else
2446 VOP_UNLOCK(vp, 0, p);
2447
2448 if (didhold)
2449 ubc_rele(vp);
2450
2451 /*
2452 * destroy the ubc named reference.
2453 * If we can't because it is held for I/Os
2454 * in progress, just put it back on the inactive
2455 * list and move on. Otherwise, the paging reference
2456 * is toast (and so is this vnode?).
2457 */
2458 if (ubc_destroy_named(vp)) {
2459 i++;
2460 }
2461 simple_lock(&vp->v_interlock);
2462 VORECLAIM_DISABLE(vp);
2463 simple_unlock(&vp->v_interlock);
2464 vrele(vp); /* release extra use we added here */
2465 }
2466 /* inactive list lock was released, must restart */
2467 goto restart;
2468 }
2469 simple_unlock(&vnode_free_list_slock);
2470
2471 vnode_reclaim_tried += i;
2472 out:
2473 i = vm_object_cache_reclaim(count);
2474 vnode_objects_reclaimed += i;
2475
2476 return(i);
2477 }
2478
2479 /*
2480 * This routine is called from vnode_pager_no_senders()
2481 * which in turn can be called with the vnode locked by vnode_uncache().
2482 * But it could also get called as a result of vm_object_cache_trim().
2483 * In that case lock state is unknown.
2484 * AGE the vnode so that it gets recycled quickly.
2485 * Check lock status to decide whether to call vput() or vrele().
2486 */
2487 __private_extern__ void
2488 vnode_pager_vrele(struct vnode *vp)
2489 {
2490
2491 boolean_t funnel_state;
2492 int isvnreclaim = 1;
2493
2494 if (vp == (struct vnode *) NULL)
2495 panic("vnode_pager_vrele: null vp");
2496
2497 funnel_state = thread_funnel_set(kernel_flock, TRUE);
2498
2499 /* Mark the vnode to be recycled */
2500 vagevp(vp);
2501
2502 simple_lock(&vp->v_interlock);
2503 /*
2504 * If a vgone (or vclean) is already in progress,
2505 	 * do not bother with the ubc_info cleanup.
2506 * Let the vclean deal with it.
2507 */
2508 if (vp->v_flag & VXLOCK) {
2509 CLR(vp->v_flag, VTERMINATE);
2510 if (ISSET(vp->v_flag, VTERMWANT)) {
2511 CLR(vp->v_flag, VTERMWANT);
2512 wakeup((caddr_t)&vp->v_ubcinfo);
2513 }
2514 simple_unlock(&vp->v_interlock);
2515 vrele(vp);
2516 (void) thread_funnel_set(kernel_flock, funnel_state);
2517 return;
2518 }
2519
2520 /* It's dead, Jim! */
2521 if (!ISSET(vp->v_flag, VORECLAIM)) {
2522 /*
2523 * called as a result of eviction of the memory
2524 * object from the memory object cache
2525 */
2526 isvnreclaim = 0;
2527
2528 /* So serialize vnode operations */
2529 VORECLAIM_ENABLE(vp);
2530 }
2531 if (!ISSET(vp->v_flag, VTERMINATE))
2532 SET(vp->v_flag, VTERMINATE);
2533 if (UBCINFOEXISTS(vp)) {
2534 struct ubc_info *uip = vp->v_ubcinfo;
2535
2536 if (ubc_issetflags(vp, UI_WASMAPPED))
2537 SET(vp->v_flag, VWASMAPPED);
2538
2539 vp->v_ubcinfo = UBC_NOINFO; /* catch bad accesses */
2540 simple_unlock(&vp->v_interlock);
2541 ubc_info_deallocate(uip);
2542 } else {
2543 if ((vp->v_type == VBAD) && ((vp)->v_ubcinfo != UBC_INFO_NULL)
2544 && ((vp)->v_ubcinfo != UBC_NOINFO)) {
2545 struct ubc_info *uip = vp->v_ubcinfo;
2546
2547 vp->v_ubcinfo = UBC_NOINFO; /* catch bad accesses */
2548 simple_unlock(&vp->v_interlock);
2549 ubc_info_deallocate(uip);
2550 } else {
2551 simple_unlock(&vp->v_interlock);
2552 }
2553 }
2554
2555 CLR(vp->v_flag, VTERMINATE);
2556
2557 if (vp->v_type != VBAD){
2558 vgone(vp); /* revoke the vnode */
2559 vrele(vp); /* and drop the reference */
2560 } else
2561 vrele(vp);
2562
2563 if (ISSET(vp->v_flag, VTERMWANT)) {
2564 CLR(vp->v_flag, VTERMWANT);
2565 wakeup((caddr_t)&vp->v_ubcinfo);
2566 }
2567 if (!isvnreclaim)
2568 VORECLAIM_DISABLE(vp);
2569 (void) thread_funnel_set(kernel_flock, funnel_state);
2570 return;
2571 }
2572
2573
2574 #if DIAGNOSTIC
2575 int walk_vnodes_debug=0;
2576
2577 void
2578 walk_allvnodes()
2579 {
2580 struct mount *mp, *nmp;
2581 struct vnode *vp;
2582 int cnt = 0;
2583
2584 for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) {
2585 for (vp = mp->mnt_vnodelist.lh_first;
2586 vp != NULL;
2587 vp = vp->v_mntvnodes.le_next) {
2588 if (vp->v_usecount < 0){
2589 if(walk_vnodes_debug) {
2590 printf("vp is %x\n",vp);
2591 }
2592 }
2593 }
2594 nmp = mp->mnt_list.cqe_next;
2595 }
2596 for (cnt = 0, vp = vnode_free_list.tqh_first;
2597 vp != NULLVP; cnt++, vp = vp->v_freelist.tqe_next) {
2598 		if (vp->v_usecount < 0) {
2599 			if (walk_vnodes_debug) {
2600 				printf("vp is %x\n", vp);
2601 			}
2602 		}
2603 }
2604 printf("%d - free\n", cnt);
2605
2606 for (cnt = 0, vp = vnode_inactive_list.tqh_first;
2607 vp != NULLVP; cnt++, vp = vp->v_freelist.tqe_next) {
2608 		if (vp->v_usecount < 0) {
2609 			if (walk_vnodes_debug) {
2610 				printf("vp is %x\n", vp);
2611 			}
2612 		}
2613 }
2614 printf("%d - inactive\n", cnt);
2615 }
2616 #endif /* DIAGNOSTIC */
2617
2618 void
2619 vfs_io_attributes(vp, flags, iosize, vectors)
2620 struct vnode *vp;
2621 int flags; /* B_READ or B_WRITE */
2622 int *iosize;
2623 int *vectors;
2624 {
2625 struct mount *mp;
2626
2627 /* start with "reasonable" defaults */
2628 *iosize = MAXPHYS;
2629 *vectors = 32;
2630
2631 mp = vp->v_mount;
2632 if (mp != NULL) {
2633 switch (flags) {
2634 case B_READ:
2635 *iosize = mp->mnt_maxreadcnt;
2636 *vectors = mp->mnt_segreadcnt;
2637 break;
2638 case B_WRITE:
2639 *iosize = mp->mnt_maxwritecnt;
2640 *vectors = mp->mnt_segwritecnt;
2641 break;
2642 default:
2643 break;
2644 }
2645 }
2646
2647 return;
2648 }
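
/*
 * A sketch of a caller sizing a transfer with the limits reported above:
 * never issue more than *iosize bytes or more than *vectors scatter/gather
 * segments in a single request.  The names below are illustrative; the
 * cluster I/O code and disk drivers are the real consumers.
 *
 *	int maxio, vectors;
 *
 *	vfs_io_attributes(vp, B_READ, &maxio, &vectors);
 *	xfersize = min(resid, maxio);
 */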
2649
2650 #include <dev/disk.h>
2651
2652 int
2653 vfs_init_io_attributes(devvp, mp)
2654 struct vnode *devvp;
2655 struct mount *mp;
2656 {
2657 int error;
2658 off_t readblockcnt;
2659 off_t writeblockcnt;
2660 off_t readsegcnt;
2661 off_t writesegcnt;
2662 u_long blksize;
2663
2664 u_int64_t temp;
2665
2666 struct proc *p = current_proc();
2667 struct ucred *cred = p->p_ucred;
2668
2669 if ((error = VOP_IOCTL(devvp, DKIOCGETMAXBLOCKCOUNTREAD,
2670 (caddr_t)&readblockcnt, 0, cred, p)))
2671 return (error);
2672
2673 if ((error = VOP_IOCTL(devvp, DKIOCGETMAXBLOCKCOUNTWRITE,
2674 (caddr_t)&writeblockcnt, 0, cred, p)))
2675 return (error);
2676
2677 if ((error = VOP_IOCTL(devvp, DKIOCGETMAXSEGMENTCOUNTREAD,
2678 (caddr_t)&readsegcnt, 0, cred, p)))
2679 return (error);
2680
2681 if ((error = VOP_IOCTL(devvp, DKIOCGETMAXSEGMENTCOUNTWRITE,
2682 (caddr_t)&writesegcnt, 0, cred, p)))
2683 return (error);
2684
2685 if ((error = VOP_IOCTL(devvp, DKIOCGETBLOCKSIZE,
2686 (caddr_t)&blksize, 0, cred, p)))
2687 return (error);
2688
2689 temp = readblockcnt * blksize;
2690 temp = (temp > UINT32_MAX) ? (UINT32_MAX / blksize) * blksize : temp;
2691 mp->mnt_maxreadcnt = (u_int32_t)temp;
2692
2693 temp = writeblockcnt * blksize;
2694 temp = (temp > UINT32_MAX) ? (UINT32_MAX / blksize) * blksize : temp;
2695 mp->mnt_maxwritecnt = (u_int32_t)temp;
2696
2697 temp = (readsegcnt > UINT16_MAX) ? UINT16_MAX : readsegcnt;
2698 mp->mnt_segreadcnt = (u_int16_t)temp;
2699
2700 temp = (writesegcnt > UINT16_MAX) ? UINT16_MAX : writesegcnt;
2701 mp->mnt_segwritecnt = (u_int16_t)temp;
2702
2703 #if 0
2704 printf("--- IO attributes for mount point 0x%08x ---\n", mp);
2705 printf("\tmnt_maxreadcnt = 0x%x", mp->mnt_maxreadcnt);
2706 printf("\tmnt_maxwritecnt = 0x%x\n", mp->mnt_maxwritecnt);
2707 printf("\tmnt_segreadcnt = 0x%x", mp->mnt_segreadcnt);
2708 printf("\tmnt_segwritecnt = 0x%x\n", mp->mnt_segwritecnt);
2709 #endif /* 0 */
2710
2711 return (error);
2712 }
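
/*
 * Worked example of the clamping above (hypothetical values): with
 * blksize = 512 and readblockcnt = 2048, temp = 2048 * 512 = 1048576,
 * which fits in 32 bits, so mnt_maxreadcnt becomes 1 MB.  If a driver
 * reported a block count large enough that temp overflowed 32 bits, the
 * limit would instead be rounded down to the largest multiple of blksize
 * that still fits in a u_int32_t.
 */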
2713