1 /*
2 * Copyright (c) 2000-2001 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * The contents of this file constitute Original Code as defined in and
7 * are subject to the Apple Public Source License Version 1.1 (the
8 * "License"). You may not use this file except in compliance with the
9 * License. Please obtain a copy of the License at
10 * http://www.apple.com/publicsource and read it before using this file.
11 *
12 * This Original Code and all software distributed under the License are
13 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
14 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
15 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
17 * License for the specific language governing rights and limitations
18 * under the License.
19 *
20 * @APPLE_LICENSE_HEADER_END@
21 */
22 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
23 /*
24 * Copyright (c) 1989, 1993
25 * The Regents of the University of California. All rights reserved.
26 * (c) UNIX System Laboratories, Inc.
27 * All or some portions of this file are derived from material licensed
28 * to the University of California by American Telephone and Telegraph
29 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
30 * the permission of UNIX System Laboratories, Inc.
31 *
32 * Redistribution and use in source and binary forms, with or without
33 * modification, are permitted provided that the following conditions
34 * are met:
35 * 1. Redistributions of source code must retain the above copyright
36 * notice, this list of conditions and the following disclaimer.
37 * 2. Redistributions in binary form must reproduce the above copyright
38 * notice, this list of conditions and the following disclaimer in the
39 * documentation and/or other materials provided with the distribution.
40 * 3. All advertising materials mentioning features or use of this software
41 * must display the following acknowledgement:
42 * This product includes software developed by the University of
43 * California, Berkeley and its contributors.
44 * 4. Neither the name of the University nor the names of its contributors
45 * may be used to endorse or promote products derived from this software
46 * without specific prior written permission.
47 *
48 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
49 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
50 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
51 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
52 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
53 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
54 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
55 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
56 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
57 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
58 * SUCH DAMAGE.
59 *
60 * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95
61 */
62
63 /*
64 * External virtual filesystem routines
65 */
66
67 #undef DIAGNOSTIC
68 #define DIAGNOSTIC 1
69
70 #include <sys/param.h>
71 #include <sys/systm.h>
72 #include <sys/proc.h>
73 #include <sys/mount.h>
74 #include <sys/time.h>
75 #include <sys/vnode.h>
76 #include <sys/stat.h>
77 #include <sys/namei.h>
78 #include <sys/ucred.h>
79 #include <sys/buf.h>
80 #include <sys/errno.h>
81 #include <sys/malloc.h>
82 #include <sys/domain.h>
83 #include <sys/mbuf.h>
84 #include <sys/syslog.h>
85 #include <sys/ubc.h>
86 #include <sys/vm.h>
87 #include <sys/sysctl.h>
88
89 #include <kern/assert.h>
90
91 #include <miscfs/specfs/specdev.h>
92
93 #include <mach/mach_types.h>
94 #include <mach/memory_object_types.h>
95
96
97 enum vtype iftovt_tab[16] = {
98 VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
99 VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
100 };
101 int vttoif_tab[9] = {
102 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
103 S_IFSOCK, S_IFIFO, S_IFMT,
104 };
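/*
 * These two tables back the IFTOVT()/VTTOIF() conversion macros in
 * <sys/vnode.h>, which translate between the S_IFMT file-type bits
 * of a mode and the VFS-layer enum vtype.  Illustrative sketch only:
 *
 *	IFTOVT(S_IFDIR) -> iftovt_tab[(S_IFDIR & S_IFMT) >> 12] == VDIR
 *	VTTOIF(VREG)    -> vttoif_tab[VREG] == S_IFREG
 */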
105
106 static void vfree(struct vnode *vp);
107 static void vinactive(struct vnode *vp);
108 static int vnreclaim(int count);
109 extern kern_return_t
110 adjust_vm_object_cache(vm_size_t oval, vm_size_t nval);
111
112 TAILQ_HEAD(freelst, vnode) vnode_free_list; /* vnode free list */
113 TAILQ_HEAD(inactivelst, vnode) vnode_inactive_list; /* vnode inactive list */
114 struct mntlist mountlist; /* mounted filesystem list */
115
116 #if DIAGNOSTIC
117 #define VLISTCHECK(fun, vp, list) \
118 if ((vp)->v_freelist.tqe_prev == (struct vnode **)0xdeadb) \
119 panic("%s: %s vnode not on %slist", (fun), (list), (list));
120
121 #define VINACTIVECHECK(fun, vp, expected) \
122 do { \
123 int __is_inactive = ISSET((vp)->v_flag, VUINACTIVE); \
124 if (__is_inactive ^ expected) \
125 panic("%s: %sinactive vnode, expected %s", (fun), \
126 __is_inactive? "" : "not ", \
127 expected? "inactive": "not inactive"); \
128 } while(0)
129 #else
130 #define VLISTCHECK(fun, vp, list)
131 #define VINACTIVECHECK(fun, vp, expected)
132 #endif /* DIAGNOSTIC */
133
134 #define VLISTNONE(vp) \
135 do { \
136 (vp)->v_freelist.tqe_next = (struct vnode *)0; \
137 (vp)->v_freelist.tqe_prev = (struct vnode **)0xdeadb; \
138 } while(0)
139
140 #define VONLIST(vp) \
141 ((vp)->v_freelist.tqe_prev != (struct vnode **)0xdeadb)
142
143 /* remove a vnode from free vnode list */
144 #define VREMFREE(fun, vp) \
145 do { \
146 VLISTCHECK((fun), (vp), "free"); \
147 TAILQ_REMOVE(&vnode_free_list, (vp), v_freelist); \
148 VLISTNONE((vp)); \
149 freevnodes--; \
150 } while(0)
151
152 /* remove a vnode from inactive vnode list */
153 #define VREMINACTIVE(fun, vp) \
154 do { \
155 VLISTCHECK((fun), (vp), "inactive"); \
156 VINACTIVECHECK((fun), (vp), VUINACTIVE); \
157 TAILQ_REMOVE(&vnode_inactive_list, (vp), v_freelist); \
158 CLR((vp)->v_flag, VUINACTIVE); \
159 VLISTNONE((vp)); \
160 inactivevnodes--; \
161 } while(0)
162
163 #define VORECLAIM_ENABLE(vp) \
164 do { \
165 if (ISSET((vp)->v_flag, VORECLAIM)) \
166 panic("vm object reclaim already"); \
167 SET((vp)->v_flag, VORECLAIM); \
168 } while(0)
169
170 #define VORECLAIM_DISABLE(vp) \
171 do { \
172 CLR((vp)->v_flag, VORECLAIM); \
173 if (ISSET((vp)->v_flag, VXWANT)) { \
174 CLR((vp)->v_flag, VXWANT); \
175 wakeup((caddr_t)(vp)); \
176 } \
177 } while(0)
178
179 /*
180 * Have to declare first two locks as actual data even if !MACH_SLOCKS, since
181 * pointers to them get passed around.
182 */
183 simple_lock_data_t mountlist_slock;
184 simple_lock_data_t mntvnode_slock;
185 decl_simple_lock_data(,mntid_slock);
186 decl_simple_lock_data(,vnode_free_list_slock);
187 decl_simple_lock_data(,spechash_slock);
188
189 /*
190 * vnodetarget is the number of vnodes we expect to get back
191 * from the inactive vnode list and VM object cache.
192 * As vnreclaim() is mainly a CPU-bound operation, this number
193 * could be higher on faster processors.
194 * Having this number too high introduces longer delays in
195 * the execution of getnewvnode().
196 */
197 unsigned long vnodetarget; /* target for vnreclaim() */
198 #define VNODE_FREE_TARGET 20 /* Default value for vnodetarget */
199
200 /*
201 * We need quite a few vnodes on the free list to sustain the
202 * rapid stat() the compilation process does, and still benefit from the name
203 * cache. Having too few vnodes on the free list causes serious disk
204 * thrashing as we cycle through them.
205 */
206 #define VNODE_FREE_MIN 300 /* freelist should have at least this many */
207
208 /*
209 * We need to get vnodes back from the VM object cache when a certain #
210 * of vnodes are reused from the freelist. This is essential for the
211 * caching to be effective in the namecache and the buffer cache [for the
212 * metadata].
213 */
214 #define VNODE_TOOMANY_REUSED (VNODE_FREE_MIN/4)
215
216 /*
217 * If we have enough vnodes on the freelist we do not want to reclaim
218 * the vnodes from the VM object cache.
219 */
220 #define VNODE_FREE_ENOUGH (VNODE_FREE_MIN + (VNODE_FREE_MIN/2))
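/*
 * Rough summary of how the knobs above interact in getnewvnode():
 * a new vnode is MALLOCed only while numvnodes is below desiredvnodes
 * and the free list holds fewer than 2 * VNODE_FREE_MIN entries;
 * otherwise one is reused from the free list.  Before reusing,
 * vnreclaim(vnodetarget) is run whenever the free list has dropped
 * below VNODE_FREE_MIN, and again after every VNODE_TOOMANY_REUSED
 * reuses unless the free list already holds VNODE_FREE_ENOUGH entries.
 */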
221
222 /*
223 * Initialize the vnode management data structures.
224 */
225 __private_extern__ void
226 vntblinit()
227 {
228 extern struct lock__bsd__ exchangelock;
229
230 simple_lock_init(&mountlist_slock);
231 simple_lock_init(&mntvnode_slock);
232 simple_lock_init(&mntid_slock);
233 simple_lock_init(&spechash_slock);
234 TAILQ_INIT(&vnode_free_list);
235 simple_lock_init(&vnode_free_list_slock);
236 TAILQ_INIT(&vnode_inactive_list);
237 CIRCLEQ_INIT(&mountlist);
238 lockinit(&exchangelock, PVFS, "exchange", 0, 0);
239
240 if (!vnodetarget)
241 vnodetarget = VNODE_FREE_TARGET;
242
243 /*
244 * Scale the vm_object_cache to accommodate the vnodes
245 * we want to cache
246 */
247 (void) adjust_vm_object_cache(0, desiredvnodes - VNODE_FREE_MIN);
248 }
249
250 /* Reset the VM Object Cache with the values passed in */
251 __private_extern__ kern_return_t
252 reset_vmobjectcache(unsigned int val1, unsigned int val2)
253 {
254 vm_size_t oval = val1 - VNODE_FREE_MIN;
255 vm_size_t nval;
256
257 if(val2 < VNODE_FREE_MIN)
258 nval = 0;
259 else
260 nval = val2 - VNODE_FREE_MIN;
261
262 return(adjust_vm_object_cache(oval, nval));
263 }
264
265 /*
266 * Mark a mount point as busy. Used to synchronize access and to delay
267 * unmounting. Interlock is not released on failure.
268 */
269 int
270 vfs_busy(mp, flags, interlkp, p)
271 struct mount *mp;
272 int flags;
273 struct slock *interlkp;
274 struct proc *p;
275 {
276 int lkflags;
277
278 if (mp->mnt_kern_flag & MNTK_UNMOUNT) {
279 if (flags & LK_NOWAIT)
280 return (ENOENT);
281 mp->mnt_kern_flag |= MNTK_MWAIT;
282 if (interlkp)
283 simple_unlock(interlkp);
284 /*
285 * Since all busy locks are shared except the exclusive
286 * lock granted when unmounting, the only place that a
287 * wakeup needs to be done is at the release of the
288 * exclusive lock at the end of dounmount.
289 */
290 sleep((caddr_t)mp, PVFS);
291 if (interlkp)
292 simple_lock(interlkp);
293 return (ENOENT);
294 }
295 lkflags = LK_SHARED;
296 if (interlkp)
297 lkflags |= LK_INTERLOCK;
298 if (lockmgr(&mp->mnt_lock, lkflags, interlkp, p))
299 panic("vfs_busy: unexpected lock failure");
300 return (0);
301 }
302
303 /*
304 * Free a busy filesystem.
305 */
306 void
307 vfs_unbusy(mp, p)
308 struct mount *mp;
309 struct proc *p;
310 {
311
312 lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, p);
313 }
314
315 /*
316 * Lookup a filesystem type, and if found allocate and initialize
317 * a mount structure for it.
318 *
319 * Devname is usually updated by mount(8) after booting.
320 */
321 int
322 vfs_rootmountalloc(fstypename, devname, mpp)
323 char *fstypename;
324 char *devname;
325 struct mount **mpp;
326 {
327 struct proc *p = current_proc(); /* XXX */
328 struct vfsconf *vfsp;
329 struct mount *mp;
330
331 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
332 if (!strcmp(vfsp->vfc_name, fstypename))
333 break;
334 if (vfsp == NULL)
335 return (ENODEV);
336 mp = _MALLOC_ZONE((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK);
337 bzero((char *)mp, (u_long)sizeof(struct mount));
338
339 /* Initialize the default IO constraints */
340 mp->mnt_maxreadcnt = mp->mnt_maxwritecnt = MAXPHYS;
341 mp->mnt_segreadcnt = mp->mnt_segwritecnt = 32;
342
343 lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, 0);
344 (void)vfs_busy(mp, LK_NOWAIT, 0, p);
345 LIST_INIT(&mp->mnt_vnodelist);
346 mp->mnt_vfc = vfsp;
347 mp->mnt_op = vfsp->vfc_vfsops;
348 mp->mnt_flag = MNT_RDONLY;
349 mp->mnt_vnodecovered = NULLVP;
350 vfsp->vfc_refcount++;
351 mp->mnt_stat.f_type = vfsp->vfc_typenum;
352 mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
353 strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
354 mp->mnt_stat.f_mntonname[0] = '/';
355 (void) copystr(devname, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, 0);
356 *mpp = mp;
357 return (0);
358 }
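/*
 * Illustrative caller sketch (not part of this file): a filesystem's
 * mountroot routine typically starts out roughly as
 *
 *	struct mount *mp;
 *	if (error = vfs_rootmountalloc("hfs", "root_device", &mp))
 *		return (error);
 *
 * and then fills in the mount before putting it on mountlist.
 * The "hfs" and "root_device" names are examples only.
 */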
359
360 /*
361 * Find an appropriate filesystem to use for the root. If a filesystem
362 * has not been preselected, walk through the list of known filesystems
363 * trying those that have mountroot routines, and try them until one
364 * works or we have tried them all.
365 */
366 int
367 vfs_mountroot()
368 {
369 struct vfsconf *vfsp;
370 extern int (*mountroot)(void);
371 int error;
372
373 if (mountroot != NULL) {
374 error = (*mountroot)();
375 return (error);
376 }
377
378 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
379 if (vfsp->vfc_mountroot == NULL)
380 continue;
381 if ((error = (*vfsp->vfc_mountroot)()) == 0)
382 return (0);
383 if (error != EINVAL)
384 printf("%s_mountroot failed: %d\n", vfsp->vfc_name, error);
385 }
386 return (ENODEV);
387 }
388
389 /*
390 * Lookup a mount point by filesystem identifier.
391 */
392 struct mount *
393 vfs_getvfs(fsid)
394 fsid_t *fsid;
395 {
396 register struct mount *mp;
397
398 simple_lock(&mountlist_slock);
399 for (mp = mountlist.cqh_first; mp != (void *)&mountlist;
400 mp = mp->mnt_list.cqe_next) {
401 if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
402 mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
403 simple_unlock(&mountlist_slock);
404 return (mp);
405 }
406 }
407 simple_unlock(&mountlist_slock);
408 return ((struct mount *)0);
409 }
410
411 /*
412 * Get a new unique fsid
413 */
414 void
415 vfs_getnewfsid(mp)
416 struct mount *mp;
417 {
418 static u_short xxxfs_mntid;
419
420 fsid_t tfsid;
421 int mtype;
422
423 simple_lock(&mntid_slock);
424 mtype = mp->mnt_vfc->vfc_typenum;
425 mp->mnt_stat.f_fsid.val[0] = makedev(nblkdev + mtype, 0);
426 mp->mnt_stat.f_fsid.val[1] = mtype;
427 if (xxxfs_mntid == 0)
428 ++xxxfs_mntid;
429 tfsid.val[0] = makedev(nblkdev + mtype, xxxfs_mntid);
430 tfsid.val[1] = mtype;
431 if (mountlist.cqh_first != (void *)&mountlist) {
432 while (vfs_getvfs(&tfsid)) {
433 tfsid.val[0]++;
434 xxxfs_mntid++;
435 }
436 }
437 mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
438 simple_unlock(&mntid_slock);
439 }
440
441 /*
442 * Set vnode attributes to VNOVAL
443 */
444 void
445 vattr_null(vap)
446 register struct vattr *vap;
447 {
448
449 vap->va_type = VNON;
450 vap->va_size = vap->va_bytes = VNOVAL;
451 vap->va_mode = vap->va_nlink = vap->va_uid = vap->va_gid =
452 vap->va_fsid = vap->va_fileid =
453 vap->va_blocksize = vap->va_rdev =
454 vap->va_atime.tv_sec = vap->va_atime.tv_nsec =
455 vap->va_mtime.tv_sec = vap->va_mtime.tv_nsec =
456 vap->va_ctime.tv_sec = vap->va_ctime.tv_nsec =
457 vap->va_flags = vap->va_gen = VNOVAL;
458 vap->va_vaflags = 0;
459 }
460
461 /*
462 * Routines having to do with the management of the vnode table.
463 */
464 extern int (**dead_vnodeop_p)(void *);
465 static void vclean __P((struct vnode *vp, int flag, struct proc *p));
466 extern void vgonel __P((struct vnode *vp, struct proc *p));
467 long numvnodes, freevnodes;
468 long inactivevnodes;
469 long vnode_reclaim_tried;
470 long vnode_objects_reclaimed;
471
472
473 extern struct vattr va_null;
474
475 /*
476 * Return the next vnode from the free list.
477 */
478 int
479 getnewvnode(tag, mp, vops, vpp)
480 enum vtagtype tag;
481 struct mount *mp;
482 int (**vops)(void *);
483 struct vnode **vpp;
484 {
485 struct proc *p = current_proc(); /* XXX */
486 struct vnode *vp;
487 int cnt, didretry = 0;
488 static int reused = 0; /* track the reuse rate */
489 int reclaimhits = 0;
490
491 retry:
492 simple_lock(&vnode_free_list_slock);
493 /*
494 * MALLOC a vnode if the number of vnodes has not reached the desired
495 * value and the number on the free list is still reasonable;
496 * otherwise reuse one from the freelist, even though we may evict a
497 * name cache entry, to limit the number of vnodes that accumulate
498 * (vnodes tie up wired memory and are never garbage collected).
499 */
500 if (numvnodes < desiredvnodes && (freevnodes < (2 * VNODE_FREE_MIN))) {
501 numvnodes++;
502 simple_unlock(&vnode_free_list_slock);
503 MALLOC_ZONE(vp, struct vnode *, sizeof *vp, M_VNODE, M_WAITOK);
504 bzero((char *)vp, sizeof *vp);
505 VLISTNONE(vp); /* avoid double queue removal */
506 simple_lock_init(&vp->v_interlock);
507 goto done;
508 }
509
510 /*
511 * Once the desired number of vnodes are allocated,
512 * we start reusing the vnodes.
513 */
514 if (freevnodes < VNODE_FREE_MIN) {
515 /*
516 * if we are low on vnodes on the freelist attempt to get
517 * some back from the inactive list and VM object cache
518 */
519 simple_unlock(&vnode_free_list_slock);
520 (void)vnreclaim(vnodetarget);
521 simple_lock(&vnode_free_list_slock);
522 }
523 if (numvnodes >= desiredvnodes && reused > VNODE_TOOMANY_REUSED) {
524 reused = 0;
525 if (freevnodes < VNODE_FREE_ENOUGH) {
526 simple_unlock(&vnode_free_list_slock);
527 (void)vnreclaim(vnodetarget);
528 simple_lock(&vnode_free_list_slock);
529 }
530 }
531
532 for (cnt = 0, vp = vnode_free_list.tqh_first;
533 vp != NULLVP; cnt++, vp = vp->v_freelist.tqe_next) {
534 if (simple_lock_try(&vp->v_interlock)) {
535 /* got the interlock */
536 if (ISSET(vp->v_flag, VORECLAIM)) {
537 /* skip over the vnodes that are being reclaimed */
538 simple_unlock(&vp->v_interlock);
539 reclaimhits++;
540 } else
541 break;
542 }
543 }
544
545 /*
546 * Unless this is a bad time of the month, at most
547 * the first NCPUS items on the free list are
548 * locked, so this is close enough to being empty.
549 */
550 if (vp == NULLVP) {
551 simple_unlock(&vnode_free_list_slock);
552 if (!(didretry++) && (vnreclaim(vnodetarget) > 0))
553 goto retry;
554 tablefull("vnode");
555 log(LOG_EMERG, "%d vnodes locked, %d desired, %d numvnodes, "
556 "%d free, %d inactive, %d being reclaimed\n",
557 cnt, desiredvnodes, numvnodes, freevnodes, inactivevnodes,
558 reclaimhits);
559 *vpp = 0;
560 return (ENFILE);
561 }
562
563 if (vp->v_usecount)
564 panic("free vnode isn't: v_type = %d, v_usecount = %d?",
565 vp->v_type, vp->v_usecount);
566
567 VREMFREE("getnewvnode", vp);
568 reused++;
569 simple_unlock(&vnode_free_list_slock);
570 vp->v_lease = NULL;
571 cache_purge(vp);
572 if (vp->v_type != VBAD)
573 vgonel(vp, p); /* clean and reclaim the vnode */
574 else
575 simple_unlock(&vp->v_interlock);
576 #if DIAGNOSTIC
577 if (vp->v_data)
578 panic("cleaned vnode isn't");
579 {
580 int s = splbio();
581 if (vp->v_numoutput)
582 panic("Clean vnode has pending I/O's");
583 splx(s);
584 }
585 #endif
586 if (UBCINFOEXISTS(vp))
587 panic("getnewvnode: ubcinfo not cleaned");
588 else
589 vp->v_ubcinfo = 0;
590
591 vp->v_lastr = -1;
592 vp->v_ralen = 0;
593 vp->v_maxra = 0;
594 vp->v_lastw = 0;
595 vp->v_ciosiz = 0;
596 vp->v_cstart = 0;
597 vp->v_clen = 0;
598 vp->v_socket = 0;
599
600 done:
601 vp->v_flag = VSTANDARD;
602 vp->v_type = VNON;
603 vp->v_tag = tag;
604 vp->v_op = vops;
605 insmntque(vp, mp);
606 *vpp = vp;
607 vp->v_usecount = 1;
608 vp->v_data = 0;
609 return (0);
610 }
611
612 /*
613 * Move a vnode from one mount queue to another.
614 */
615 void
616 insmntque(vp, mp)
617 struct vnode *vp;
618 struct mount *mp;
619 {
620
621 simple_lock(&mntvnode_slock);
622 /*
623 * Delete from old mount point vnode list, if on one.
624 */
625 if (vp->v_mount != NULL)
626 LIST_REMOVE(vp, v_mntvnodes);
627 /*
628 * Insert into list of vnodes for the new mount point, if available.
629 */
630 if ((vp->v_mount = mp) != NULL)
631 LIST_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes);
632 simple_unlock(&mntvnode_slock);
633 }
634
635 __inline void
636 vpwakeup(struct vnode *vp)
637 {
638 if (vp) {
639 if (--vp->v_numoutput < 0)
640 panic("vpwakeup: neg numoutput");
641 if ((vp->v_flag & VBWAIT || vp->v_flag & VTHROTTLED)
642 && vp->v_numoutput <= 0) {
643 vp->v_flag &= ~(VBWAIT|VTHROTTLED);
644 wakeup((caddr_t)&vp->v_numoutput);
645 }
646 }
647 }
648
649 /*
650 * Update outstanding I/O count and do wakeup if requested.
651 */
652 void
653 vwakeup(bp)
654 register struct buf *bp;
655 {
656 CLR(bp->b_flags, B_WRITEINPROG);
657 vpwakeup(bp->b_vp);
658 }
659
660 /*
661 * Flush out and invalidate all buffers associated with a vnode.
662 * Called with the underlying object locked.
663 */
664 int
665 vinvalbuf(vp, flags, cred, p, slpflag, slptimeo)
666 register struct vnode *vp;
667 int flags;
668 struct ucred *cred;
669 struct proc *p;
670 int slpflag, slptimeo;
671 {
672 register struct buf *bp;
673 struct buf *nbp, *blist;
674 int s, error = 0;
675
676 if (flags & V_SAVE) {
677 if (error = VOP_FSYNC(vp, cred, MNT_WAIT, p)) {
678 return (error);
679 }
680 if (vp->v_dirtyblkhd.lh_first)
681 panic("vinvalbuf: dirty bufs");
682 }
683
684 for (;;) {
685 if ((blist = vp->v_cleanblkhd.lh_first) && flags & V_SAVEMETA)
686 while (blist && blist->b_lblkno < 0)
687 blist = blist->b_vnbufs.le_next;
688 if (!blist && (blist = vp->v_dirtyblkhd.lh_first) &&
689 (flags & V_SAVEMETA))
690 while (blist && blist->b_lblkno < 0)
691 blist = blist->b_vnbufs.le_next;
692 if (!blist)
693 break;
694
695 for (bp = blist; bp; bp = nbp) {
696 nbp = bp->b_vnbufs.le_next;
697 if (flags & V_SAVEMETA && bp->b_lblkno < 0)
698 continue;
699 s = splbio();
700 if (ISSET(bp->b_flags, B_BUSY)) {
701 SET(bp->b_flags, B_WANTED);
702 error = tsleep((caddr_t)bp,
703 slpflag | (PRIBIO + 1), "vinvalbuf",
704 slptimeo);
705 splx(s);
706 if (error) {
707 return (error);
708 }
709 break;
710 }
711 bremfree(bp);
712 SET(bp->b_flags, B_BUSY);
713 splx(s);
714 /*
715 * XXX Since there are no node locks for NFS, I believe
716 * there is a slight chance that a delayed write will
717 * occur while sleeping just above, so check for it.
718 */
719 if (ISSET(bp->b_flags, B_DELWRI) && (flags & V_SAVE)) {
720 (void) VOP_BWRITE(bp);
721 break;
722 }
723 SET(bp->b_flags, B_INVAL);
724 brelse(bp);
725 }
726 }
727 if (!(flags & V_SAVEMETA) &&
728 (vp->v_dirtyblkhd.lh_first || vp->v_cleanblkhd.lh_first))
729 panic("vinvalbuf: flush failed");
730 return (0);
731 }
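/*
 * Typical usage, as vclean() below does when tearing a vnode down:
 *
 *	error = vinvalbuf(vp, V_SAVE, cred, p, 0, 0);
 *
 * V_SAVE flushes dirty data (via VOP_FSYNC) before the buffers are
 * invalidated; V_SAVEMETA instead leaves the meta-data buffers
 * (those with negative logical block numbers) attached to the vnode.
 */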
732
733 /*
734 * Create a vnode for a block device.
735 * Used for root filesystem, argdev, and swap areas.
736 * Also used for memory file system special devices.
737 */
738 int
739 bdevvp(dev, vpp)
740 dev_t dev;
741 struct vnode **vpp;
742 {
743 register struct vnode *vp;
744 struct vnode *nvp;
745 int error;
746
747 if (dev == NODEV) {
748 *vpp = NULLVP;
749 return (ENODEV);
750 }
751 error = getnewvnode(VT_NON, (struct mount *)0, spec_vnodeop_p, &nvp);
752 if (error) {
753 *vpp = NULLVP;
754 return (error);
755 }
756 vp = nvp;
757 vp->v_type = VBLK;
758 if (nvp = checkalias(vp, dev, (struct mount *)0)) {
759 vput(vp);
760 vp = nvp;
761 }
762 *vpp = vp;
763 return (0);
764 }
765
766 /*
767 * Check to see if the new vnode represents a special device
768 * for which we already have a vnode (either because of
769 * bdevvp() or because of a different vnode representing
770 * the same block device). If such an alias exists, deallocate
771 * the existing contents and return the aliased vnode. The
772 * caller is responsible for filling it with its new contents.
773 */
774 struct vnode *
775 checkalias(nvp, nvp_rdev, mp)
776 register struct vnode *nvp;
777 dev_t nvp_rdev;
778 struct mount *mp;
779 {
780 struct proc *p = current_proc(); /* XXX */
781 struct vnode *vp;
782 struct vnode **vpp;
783 struct specinfo * bufhold;
784 int buffree = 1;
785
786 if (nvp->v_type != VBLK && nvp->v_type != VCHR)
787 return (NULLVP);
788
789 bufhold = (struct specinfo *)_MALLOC_ZONE(sizeof(struct specinfo),
790 M_VNODE, M_WAITOK);
791 vpp = &speclisth[SPECHASH(nvp_rdev)];
792 loop:
793 simple_lock(&spechash_slock);
794 for (vp = *vpp; vp; vp = vp->v_specnext) {
795 if (nvp_rdev != vp->v_rdev || nvp->v_type != vp->v_type)
796 continue;
797 /*
798 * Alias, but not in use, so flush it out.
799 */
800 simple_lock(&vp->v_interlock);
801 if (vp->v_usecount == 0) {
802 simple_unlock(&spechash_slock);
803 vgonel(vp, p);
804 goto loop;
805 }
806 if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, p)) {
807 simple_unlock(&spechash_slock);
808 goto loop;
809 }
810 break;
811 }
812 if (vp == NULL || vp->v_tag != VT_NON) {
813 nvp->v_specinfo = bufhold;
814 buffree = 0; /* buffer used */
815 bzero(nvp->v_specinfo, sizeof(struct specinfo));
816 nvp->v_rdev = nvp_rdev;
817 nvp->v_hashchain = vpp;
818 nvp->v_specnext = *vpp;
819 nvp->v_specflags = 0;
820 simple_unlock(&spechash_slock);
821 *vpp = nvp;
822 if (vp != NULLVP) {
823 nvp->v_flag |= VALIASED;
824 vp->v_flag |= VALIASED;
825 vput(vp);
826 }
827 /* Since buffer is used just return */
828 return (NULLVP);
829 }
830 simple_unlock(&spechash_slock);
831 VOP_UNLOCK(vp, 0, p);
832 simple_lock(&vp->v_interlock);
833 vclean(vp, 0, p);
834 vp->v_op = nvp->v_op;
835 vp->v_tag = nvp->v_tag;
836 nvp->v_type = VNON;
837 insmntque(vp, mp);
838 if (buffree)
839 _FREE_ZONE((void *)bufhold, sizeof (struct specinfo), M_VNODE);
840 return (vp);
841 }
842
843 /*
844 * Get a reference on a particular vnode and lock it if requested.
845 * If the vnode was on the inactive list, remove it from the list.
846 * If the vnode was on the free list, remove it from the list and
847 * move it to inactive list as needed.
848 * The vnode lock bit is set if the vnode is being eliminated in
849 * vgone. The process is awakened when the transition is completed,
850 * and an error returned to indicate that the vnode is no longer
851 * usable (possibly having been changed to a new file system type).
852 */
853 int
854 vget(vp, flags, p)
855 struct vnode *vp;
856 int flags;
857 struct proc *p;
858 {
859 int error = 0;
860
861 retry:
862
863 /*
864 * If the vnode is in the process of being cleaned out for
865 * another use, we wait for the cleaning to finish and then
866 * return failure. Cleaning is determined by checking that
867 * the VXLOCK flag is set.
868 */
869 if ((flags & LK_INTERLOCK) == 0)
870 simple_lock(&vp->v_interlock);
871 if ((vp->v_flag & VXLOCK) || (vp->v_flag & VORECLAIM)) {
872 vp->v_flag |= VXWANT;
873 simple_unlock(&vp->v_interlock);
874 (void)tsleep((caddr_t)vp, PINOD, "vget", 0);
875 return (ENOENT);
876 }
877
878 /*
879 * vnode is being terminated.
880 * wait for vnode_pager_no_senders() to clear VTERMINATE
881 */
882 if (ISSET(vp->v_flag, VTERMINATE)) {
883 SET(vp->v_flag, VTERMWANT);
884 simple_unlock(&vp->v_interlock);
885 (void)tsleep((caddr_t)&vp->v_ubcinfo, PINOD, "vclean", 0);
886 return (ENOENT);
887 }
888
889 /*
890 * if the vnode is being initialized,
891 * wait for it to finish initialization
892 */
893 if (ISSET(vp->v_flag, VUINIT)) {
894 if (ISSET(vp->v_flag, VUINIT)) {
895 SET(vp->v_flag, VUWANT);
896 simple_unlock(&vp->v_interlock);
897 (void) tsleep((caddr_t)vp, PINOD, "vget2", 0);
898 goto retry;
899 }
900 }
901
902 simple_lock(&vnode_free_list_slock);
903 if (vp->v_usecount == 0) {
904 /* If on the free list, remove it from there */
905 if (VONLIST(vp))
906 VREMFREE("vget", vp);
907 } else {
908 /* If on the inactive list, remove it from there */
909 if ((vp->v_usecount == 1) && UBCINFOEXISTS(vp)) {
910 if (VONLIST(vp))
911 VREMINACTIVE("vget", vp);
912 }
913 }
914
915 /* The vnode should not be on the inactive list here */
916 VINACTIVECHECK("vget", vp, 0);
917
918 simple_unlock(&vnode_free_list_slock);
919
920 if (++vp->v_usecount <= 0)
921 panic("vget: v_usecount");
922
923 /*
924 * Recover named reference as needed
925 */
926 if (UBCISVALID(vp) && !ubc_issetflags(vp, UI_HASOBJREF)) {
927 simple_unlock(&vp->v_interlock);
928 if (ubc_getobject(vp, UBC_HOLDOBJECT)) {
929 error = ENOENT;
930 goto errout;
931 }
932 simple_lock(&vp->v_interlock);
933 }
934
935 if (flags & LK_TYPE_MASK) {
936 if (error = vn_lock(vp, flags | LK_INTERLOCK, p))
937 goto errout;
938 return (0);
939 }
940
941 if ((flags & LK_INTERLOCK) == 0)
942 simple_unlock(&vp->v_interlock);
943 return (0);
944
945 errout:
946 /*
947 * If the vnode was not active in the first place, we
948 * must not call vrele(), as VOP_INACTIVE() is not
949 * required.
950 * So the relevant part of vrele() is inlined here.
951 */
952 simple_lock(&vp->v_interlock);
953 if (--vp->v_usecount == 1) {
954 if (UBCINFOEXISTS(vp)) {
955 vinactive(vp);
956 simple_unlock(&vp->v_interlock);
957 return (error);
958 }
959 }
960 if (vp->v_usecount > 0) {
961 simple_unlock(&vp->v_interlock);
962 return (error);
963 }
964 if (vp->v_usecount < 0)
965 panic("vget: negative usecount (%d)", vp->v_usecount);
966 vfree(vp);
967 simple_unlock(&vp->v_interlock);
968 return (error);
969 }
970
971 /*
972 * Get a pager reference on the particular vnode.
973 *
974 * This is called from ubc_info_init() and it is assumed that
975 * the vnode is neither on the free list nor on the inactive list.
976 * It is also assumed that the vnode is neither being recycled
977 * by vgonel nor being terminated by vnode_pager_vrele().
978 *
979 * The vnode interlock is NOT held by the caller.
980 */
981 __private_extern__ int
982 vnode_pager_vget(vp)
983 struct vnode *vp;
984 {
985 simple_lock(&vp->v_interlock);
986 if (UBCINFOMISSING(vp))
987 panic("vnode_pager_vget: stolen ubc_info");
988
989 if (!UBCINFOEXISTS(vp))
990 panic("vnode_pager_vget: lost ubc_info");
991
992 if ((vp->v_flag & VXLOCK) || (vp->v_flag & VORECLAIM))
993 panic("vnode_pager_vget: already being reclaimed");
994
995 if (ISSET(vp->v_flag, VTERMINATE))
996 panic("vnode_pager_vget: already being terminated");
997
998 simple_lock(&vnode_free_list_slock);
999 /* The vnode should not be on ANY list */
1000 if (VONLIST(vp))
1001 panic("vnode_pager_vget: still on the list");
1002
1003 /* The vnode should not be on the inactive list here */
1004 VINACTIVECHECK("vnode_pager_vget", vp, 0);
1005 simple_unlock(&vnode_free_list_slock);
1006
1007 /* After all those checks, now do the real work :-) */
1008 if (++vp->v_usecount <= 0)
1009 panic("vnode_pager_vget: v_usecount");
1010 simple_unlock(&vp->v_interlock);
1011
1012 return (0);
1013 }
1014
1015 /*
1016 * Stubs to use when there is no locking to be done on the underlying object.
1017 * A minimal shared lock is necessary to ensure that the underlying object
1018 * is not revoked while an operation is in progress. So, an active shared
1019 * count is maintained in an auxiliary vnode lock structure.
1020 */
1021 int
1022 vop_nolock(ap)
1023 struct vop_lock_args /* {
1024 struct vnode *a_vp;
1025 int a_flags;
1026 struct proc *a_p;
1027 } */ *ap;
1028 {
1029 #ifdef notyet
1030 /*
1031 * This code cannot be used until all the non-locking filesystems
1032 * (notably NFS) are converted to properly lock and release nodes.
1033 * Also, certain vnode operations change the locking state within
1034 * the operation (create, mknod, remove, link, rename, mkdir, rmdir,
1035 * and symlink). Ideally these operations should not change the
1036 * lock state, but should be changed to let the caller of the
1037 * function unlock them. Otherwise all intermediate vnode layers
1038 * (such as union, umapfs, etc) must catch these functions to do
1039 * the necessary locking at their layer. Note that the inactive
1040 * and lookup operations also change their lock state, but this
1041 * cannot be avoided, so these two operations will always need
1042 * to be handled in intermediate layers.
1043 */
1044 struct vnode *vp = ap->a_vp;
1045 int vnflags, flags = ap->a_flags;
1046
1047 if (vp->v_vnlock == NULL) {
1048 if ((flags & LK_TYPE_MASK) == LK_DRAIN)
1049 return (0);
1050 MALLOC_ZONE(vp->v_vnlock, struct lock__bsd__ *,
1051 sizeof(struct lock__bsd__), M_VNODE, M_WAITOK);
1052 lockinit(vp->v_vnlock, PVFS, "vnlock", 0, 0);
1053 }
1054 switch (flags & LK_TYPE_MASK) {
1055 case LK_DRAIN:
1056 vnflags = LK_DRAIN;
1057 break;
1058 case LK_EXCLUSIVE:
1059 case LK_SHARED:
1060 vnflags = LK_SHARED;
1061 break;
1062 case LK_UPGRADE:
1063 case LK_EXCLUPGRADE:
1064 case LK_DOWNGRADE:
1065 return (0);
1066 case LK_RELEASE:
1067 default:
1068 panic("vop_nolock: bad operation %d", flags & LK_TYPE_MASK);
1069 }
1070 if (flags & LK_INTERLOCK)
1071 vnflags |= LK_INTERLOCK;
1072 return(lockmgr(vp->v_vnlock, vnflags, &vp->v_interlock, ap->a_p));
1073 #else /* for now */
1074 /*
1075 * Since we are not using the lock manager, we must clear
1076 * the interlock here.
1077 */
1078 if (ap->a_flags & LK_INTERLOCK)
1079 simple_unlock(&ap->a_vp->v_interlock);
1080 return (0);
1081 #endif
1082 }
1083
1084 /*
1085 * Decrement the active use count.
1086 */
1087 int
1088 vop_nounlock(ap)
1089 struct vop_unlock_args /* {
1090 struct vnode *a_vp;
1091 int a_flags;
1092 struct proc *a_p;
1093 } */ *ap;
1094 {
1095 struct vnode *vp = ap->a_vp;
1096
1097 if (vp->v_vnlock == NULL)
1098 return (0);
1099 return (lockmgr(vp->v_vnlock, LK_RELEASE, NULL, ap->a_p));
1100 }
1101
1102 /*
1103 * Return whether or not the node is in use.
1104 */
1105 int
1106 vop_noislocked(ap)
1107 struct vop_islocked_args /* {
1108 struct vnode *a_vp;
1109 } */ *ap;
1110 {
1111 struct vnode *vp = ap->a_vp;
1112
1113 if (vp->v_vnlock == NULL)
1114 return (0);
1115 return (lockstatus(vp->v_vnlock));
1116 }
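/*
 * Illustrative sketch (assumed table layout, not a quote from this
 * tree): a filesystem with no per-node locking points the lock
 * entries of its vnodeop table at these stubs, roughly
 *
 *	{ &vop_lock_desc,     (VOPFUNC)vop_nolock },
 *	{ &vop_unlock_desc,   (VOPFUNC)vop_nounlock },
 *	{ &vop_islocked_desc, (VOPFUNC)vop_noislocked },
 *
 * so that vn_lock() and VOP_ISLOCKED() still behave sensibly for it.
 */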
1117
1118 /*
1119 * Vnode reference.
1120 */
1121 void
1122 vref(vp)
1123 struct vnode *vp;
1124 {
1125
1126 simple_lock(&vp->v_interlock);
1127 if (vp->v_usecount <= 0)
1128 panic("vref used where vget required");
1129
1130 /* If on the inactive list, remove it from there */
1131 if ((vp->v_usecount == 1) && UBCINFOEXISTS(vp)) {
1132 if (VONLIST(vp)) {
1133 simple_lock(&vnode_free_list_slock);
1134 VREMINACTIVE("vref", vp);
1135 simple_unlock(&vnode_free_list_slock);
1136 }
1137 }
1138 /* The vnode should not be on the inactive list here */
1139 VINACTIVECHECK("vref", vp, 0);
1140
1141 if (++vp->v_usecount <= 0)
1142 panic("vref v_usecount");
1143 simple_unlock(&vp->v_interlock);
1144 }
1145
1146 /*
1147 * Put the vnode on the appropriate free list.
1148 * Called with v_interlock held.
1149 */
1150 static void
1151 vfree(vp)
1152 struct vnode *vp;
1153 {
1154 /*
1155 * if the vnode was not obtained by calling getnewvnode() we
1156 * are not responsible for the cleanup. Just return.
1157 */
1158 if (!(vp->v_flag & VSTANDARD)) {
1159 return;
1160 }
1161
1162 if (vp->v_usecount != 0)
1163 panic("vfree: v_usecount");
1164
1165 /* insert at tail of LRU list or at head if VAGE is set */
1166 simple_lock(&vnode_free_list_slock);
1167
1168 if (VONLIST(vp))
1169 panic("vfree: vnode still on list");
1170
1171 if (vp->v_flag & VAGE) {
1172 TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
1173 vp->v_flag &= ~VAGE;
1174 } else
1175 TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
1176 freevnodes++;
1177 simple_unlock(&vnode_free_list_slock);
1178 return;
1179 }
1180
1181 /*
1182 * Put the vnode on the inactive list.
1183 * Called with v_interlock held.
1184 */
1185 static void
1186 vinactive(vp)
1187 struct vnode *vp;
1188 {
1189 if (!UBCINFOEXISTS(vp))
1190 panic("vinactive: not a UBC vnode");
1191
1192 if (vp->v_usecount != 1)
1193 panic("vinactive: v_usecount");
1194
1195 simple_lock(&vnode_free_list_slock);
1196
1197 if (VONLIST(vp))
1198 panic("vinactive: vnode still on list");
1199 VINACTIVECHECK("vinactive", vp, 0);
1200
1201 TAILQ_INSERT_TAIL(&vnode_inactive_list, vp, v_freelist);
1202 SET(vp->v_flag, VUINACTIVE);
1203 CLR(vp->v_flag, (VNOCACHE_DATA | VRAOFF));
1204
1205 inactivevnodes++;
1206 simple_unlock(&vnode_free_list_slock);
1207 return;
1208 }
1209
1210
1211 /*
1212 * vput(), just unlock and vrele()
1213 */
1214 void
1215 vput(vp)
1216 struct vnode *vp;
1217 {
1218 struct proc *p = current_proc(); /* XXX */
1219
1220 simple_lock(&vp->v_interlock);
1221 if (--vp->v_usecount == 1) {
1222 if (UBCINFOEXISTS(vp)) {
1223 vinactive(vp);
1224 simple_unlock(&vp->v_interlock);
1225 VOP_UNLOCK(vp, 0, p);
1226 return;
1227 }
1228 }
1229 if (vp->v_usecount > 0) {
1230 simple_unlock(&vp->v_interlock);
1231 VOP_UNLOCK(vp, 0, p);
1232 return;
1233 }
1234 #if DIAGNOSTIC
1235 if (vp->v_usecount < 0 || vp->v_writecount != 0) {
1236 vprint("vput: bad ref count", vp);
1237 panic("vput: v_usecount = %d, v_writecount = %d",
1238 vp->v_usecount, vp->v_writecount);
1239 }
1240 #endif
1241 if (ISSET((vp)->v_flag, VUINACTIVE) && VONLIST(vp))
1242 VREMINACTIVE("vrele", vp);
1243
1244 simple_unlock(&vp->v_interlock);
1245 VOP_INACTIVE(vp, p);
1246 /*
1247 * The interlock is not held and
1248 * VOP_INACTIVE releases the vnode lock.
1249 * We could block and the vnode might get reactivated,
1250 * so we cannot just call vfree() without checking the state.
1251 */
1252 simple_lock(&vp->v_interlock);
1253 if (!VONLIST(vp)) {
1254 if (vp->v_usecount == 0)
1255 vfree(vp);
1256 else if ((vp->v_usecount == 1) && UBCINFOEXISTS(vp))
1257 vinactive(vp);
1258 }
1259 simple_unlock(&vp->v_interlock);
1260 }
1261
1262 /*
1263 * Vnode release.
1264 * If count drops to zero, call inactive routine and return to freelist.
1265 */
1266 void
1267 vrele(vp)
1268 struct vnode *vp;
1269 {
1270 struct proc *p = current_proc(); /* XXX */
1271
1272 simple_lock(&vp->v_interlock);
1273 if (--vp->v_usecount == 1) {
1274 if (UBCINFOEXISTS(vp)) {
1275 vinactive(vp);
1276 simple_unlock(&vp->v_interlock);
1277 return;
1278 }
1279 }
1280 if (vp->v_usecount > 0) {
1281 simple_unlock(&vp->v_interlock);
1282 return;
1283 }
1284 #if DIAGNOSTIC
1285 if (vp->v_usecount < 0 || vp->v_writecount != 0) {
1286 vprint("vrele: bad ref count", vp);
1287 panic("vrele: ref cnt");
1288 }
1289 #endif
1290 if (ISSET((vp)->v_flag, VUINACTIVE) && VONLIST(vp))
1291 VREMINACTIVE("vrele", vp);
1292
1293
1294 if ((vp->v_flag & VXLOCK) || (vp->v_flag & VORECLAIM)) {
1295 /* vnode is being cleaned, just return */
1296 vfree(vp);
1297 simple_unlock(&vp->v_interlock);
1298 return;
1299 }
1300
1301 if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, p) == 0) {
1302 VOP_INACTIVE(vp, p);
1303 /*
1304 * vn_lock releases the interlock and
1305 * VOP_INACTIVE releases the vnode lock.
1306 * We could block and the vnode might get reactivated,
1307 * so we cannot just call vfree() without checking the state.
1308 */
1309 simple_lock(&vp->v_interlock);
1310 if (!VONLIST(vp)) {
1311 if (vp->v_usecount == 0)
1312 vfree(vp);
1313 else if ((vp->v_usecount == 1) && UBCINFOEXISTS(vp))
1314 vinactive(vp);
1315 }
1316 simple_unlock(&vp->v_interlock);
1317 }
1318 #if 0
1319 else {
1320 vfree(vp);
1321 simple_unlock(&vp->v_interlock);
1322 kprintf("vrele: vn_lock() failed for vp = 0x%08x\n", vp);
1323 }
1324 #endif
1325 }
1326
1327 void
1328 vagevp(vp)
1329 struct vnode *vp;
1330 {
1331 simple_lock(&vp->v_interlock);
1332 vp->v_flag |= VAGE;
1333 simple_unlock(&vp->v_interlock);
1334 return;
1335 }
1336
1337 /*
1338 * Page or buffer structure gets a reference.
1339 */
1340 void
1341 vhold(vp)
1342 register struct vnode *vp;
1343 {
1344
1345 simple_lock(&vp->v_interlock);
1346 vp->v_holdcnt++;
1347 simple_unlock(&vp->v_interlock);
1348 }
1349
1350 /*
1351 * Page or buffer structure frees a reference.
1352 */
1353 void
1354 holdrele(vp)
1355 register struct vnode *vp;
1356 {
1357
1358 simple_lock(&vp->v_interlock);
1359 if (vp->v_holdcnt <= 0)
1360 panic("holdrele: holdcnt");
1361 vp->v_holdcnt--;
1362 simple_unlock(&vp->v_interlock);
1363 }
1364
1365 /*
1366 * Remove any vnodes in the vnode table belonging to mount point mp.
1367 *
1368 * If MNT_NOFORCE is specified, there should not be any active ones,
1369 * return error if any are found (nb: this is a user error, not a
1370 * system error). If MNT_FORCE is specified, detach any active vnodes
1371 * that are found.
1372 */
1373 #if DIAGNOSTIC
1374 int busyprt = 0; /* print out busy vnodes */
1375 #if 0
1376 struct ctldebug debug1 = { "busyprt", &busyprt };
1377 #endif /* 0 */
1378 #endif
1379
1380 int
1381 vflush(mp, skipvp, flags)
1382 struct mount *mp;
1383 struct vnode *skipvp;
1384 int flags;
1385 {
1386 struct proc *p = current_proc();
1387 struct vnode *vp, *nvp;
1388 int busy = 0;
1389
1390 simple_lock(&mntvnode_slock);
1391 loop:
1392 for (vp = mp->mnt_vnodelist.lh_first; vp; vp = nvp) {
1393 if (vp->v_mount != mp)
1394 goto loop;
1395 nvp = vp->v_mntvnodes.le_next;
1396 /*
1397 * Skip over a selected vnode.
1398 */
1399 if (vp == skipvp)
1400 continue;
1401
1402 simple_lock(&vp->v_interlock);
1403 /*
1404 * Skip over vnodes marked VSYSTEM or VNOFLUSH.
1405 */
1406 if ((flags & SKIPSYSTEM) && ((vp->v_flag & VSYSTEM) || (vp->v_flag & VNOFLUSH))) {
1407 simple_unlock(&vp->v_interlock);
1408 continue;
1409 }
1410 /*
1411 * Skip over vnodes marked VSWAP.
1412 */
1413 if ((flags & SKIPSWAP) && (vp->v_flag & VSWAP)) {
1414 simple_unlock(&vp->v_interlock);
1415 continue;
1416 }
1417 /*
1418 * If WRITECLOSE is set, only flush out regular file
1419 * vnodes open for writing.
1420 */
1421 if ((flags & WRITECLOSE) &&
1422 (vp->v_writecount == 0 || vp->v_type != VREG)) {
1423 simple_unlock(&vp->v_interlock);
1424 continue;
1425 }
1426 /*
1427 * With v_usecount == 0, all we need to do is clear
1428 * out the vnode data structures and we are done.
1429 */
1430 if (vp->v_usecount == 0) {
1431 simple_unlock(&mntvnode_slock);
1432 vgonel(vp, p);
1433 simple_lock(&mntvnode_slock);
1434 continue;
1435 }
1436 /*
1437 * If FORCECLOSE is set, forcibly close the vnode.
1438 * For block or character devices, revert to an
1439 * anonymous device. For all other files, just kill them.
1440 */
1441 if (flags & FORCECLOSE) {
1442 simple_unlock(&mntvnode_slock);
1443 if (vp->v_type != VBLK && vp->v_type != VCHR) {
1444 vgonel(vp, p);
1445 } else {
1446 vclean(vp, 0, p);
1447 vp->v_op = spec_vnodeop_p;
1448 insmntque(vp, (struct mount *)0);
1449 }
1450 simple_lock(&mntvnode_slock);
1451 continue;
1452 }
1453 #if DIAGNOSTIC
1454 if (busyprt)
1455 vprint("vflush: busy vnode", vp);
1456 #endif
1457 simple_unlock(&vp->v_interlock);
1458 busy++;
1459 }
1460 simple_unlock(&mntvnode_slock);
1461 if (busy && ((flags & FORCECLOSE)==0))
1462 return (EBUSY);
1463 return (0);
1464 }
1465
1466 /*
1467 * Disassociate the underlying file system from a vnode.
1468 * The vnode interlock is held on entry.
1469 */
1470 static void
1471 vclean(vp, flags, p)
1472 struct vnode *vp;
1473 int flags;
1474 struct proc *p;
1475 {
1476 int active;
1477 int removed = 0;
1478 int didhold;
1479
1480 /*
1481 * if the vnode was not obtained by calling getnewvnode() we
1482 * are not responsible for the cleanup. Just return.
1483 */
1484 if (!(vp->v_flag & VSTANDARD)) {
1485 simple_unlock(&vp->v_interlock);
1486 return;
1487 }
1488
1489 /*
1490 * Check to see if the vnode is in use.
1491 * If so we have to reference it before we clean it out
1492 * so that its count cannot fall to zero and generate a
1493 * race against ourselves to recycle it.
1494 */
1495 if (active = vp->v_usecount)
1496 if (++vp->v_usecount <= 0)
1497 panic("vclean: v_usecount");
1498 /*
1499 * Prevent the vnode from being recycled or
1500 * brought into use while we clean it out.
1501 */
1502 if (vp->v_flag & VXLOCK)
1503 panic("vclean: deadlock");
1504 vp->v_flag |= VXLOCK;
1505
1506 /*
1507 * Even if the count is zero, the VOP_INACTIVE routine may still
1508 * have the object locked while it cleans it out. The VOP_LOCK
1509 * ensures that the VOP_INACTIVE routine is done with its work.
1510 * For active vnodes, it ensures that no other activity can
1511 * occur while the underlying object is being cleaned out.
1512 */
1513 VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, p);
1514
1515 /*
1516 * if this vnode is on the inactive list
1517 * take it off the list.
1518 */
1519 if ((active == 1) &&
1520 (ISSET((vp)->v_flag, VUINACTIVE) && VONLIST(vp))) {
1521 simple_lock(&vnode_free_list_slock);
1522 VREMINACTIVE("vclean", vp);
1523 simple_unlock(&vnode_free_list_slock);
1524 removed++;
1525 }
1526
1527 /* If the vnode was active, close it. */
1528 if (active && (flags & DOCLOSE))
1529 VOP_CLOSE(vp, IO_NDELAY, NOCRED, p);
1530
1531 /* Clean the pages in VM. */
1532 didhold = ubc_hold(vp);
1533 if ((active) && (didhold))
1534 (void)ubc_clean(vp, 0); /* do not invalidate */
1535
1536 /*
1537 * Clean out any buffers associated with the vnode.
1538 */
1539 if (flags & DOCLOSE) {
1540 if (vp->v_tag == VT_NFS)
1541 nfs_vinvalbuf(vp, V_SAVE, NOCRED, p, 0);
1542 else
1543 vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0);
1544 }
1545
1546 if (active)
1547 VOP_INACTIVE(vp, p);
1548 else
1549 VOP_UNLOCK(vp, 0, p);
1550
1551 /* Destroy ubc named reference */
1552 if (didhold) {
1553 ubc_rele(vp);
1554 ubc_destroy_named(vp);
1555 }
1556
1557 /*
1558 * Reclaim the vnode.
1559 */
1560 if (VOP_RECLAIM(vp, p))
1561 panic("vclean: cannot reclaim");
1562 cache_purge(vp);
1563 if (vp->v_vnlock) {
1564 if ((vp->v_vnlock->lk_flags & LK_DRAINED) == 0)
1565 vprint("vclean: lock not drained", vp);
1566 FREE_ZONE(vp->v_vnlock, sizeof (struct lock__bsd__), M_VNODE);
1567 vp->v_vnlock = NULL;
1568 }
1569
1570 /* It's dead, Jim! */
1571 vp->v_op = dead_vnodeop_p;
1572 vp->v_tag = VT_NON;
1573
1574 /*
1575 * Done with purge, notify sleepers of the grim news.
1576 */
1577 vp->v_flag &= ~VXLOCK;
1578 if (vp->v_flag & VXWANT) {
1579 vp->v_flag &= ~VXWANT;
1580 wakeup((caddr_t)vp);
1581 }
1582
1583 if (active)
1584 vrele(vp);
1585 }
1586
1587 /*
1588 * Eliminate all activity associated with the requested vnode
1589 * and with all vnodes aliased to the requested vnode.
1590 */
1591 int
1592 vop_revoke(ap)
1593 struct vop_revoke_args /* {
1594 struct vnode *a_vp;
1595 int a_flags;
1596 } */ *ap;
1597 {
1598 struct vnode *vp, *vq;
1599 struct proc *p = current_proc();
1600
1601 #if DIAGNOSTIC
1602 if ((ap->a_flags & REVOKEALL) == 0)
1603 panic("vop_revoke");
1604 #endif
1605
1606 vp = ap->a_vp;
1607 simple_lock(&vp->v_interlock);
1608
1609 if (vp->v_flag & VALIASED) {
1610 /*
1611 * If a vgone (or vclean) is already in progress,
1612 * wait until it is done and return.
1613 */
1614 if (vp->v_flag & VXLOCK) {
1615 while (vp->v_flag & VXLOCK) {
1616 vp->v_flag |= VXWANT;
1617 simple_unlock(&vp->v_interlock);
1618 (void)tsleep((caddr_t)vp, PINOD, "vop_revokeall", 0);
1619 }
1620 return (0);
1621 }
1622 /*
1623 * Ensure that vp will not be vgone'd while we
1624 * are eliminating its aliases.
1625 */
1626 vp->v_flag |= VXLOCK;
1627 simple_unlock(&vp->v_interlock);
1628 while (vp->v_flag & VALIASED) {
1629 simple_lock(&spechash_slock);
1630 for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
1631 if (vq->v_rdev != vp->v_rdev ||
1632 vq->v_type != vp->v_type || vp == vq)
1633 continue;
1634 simple_unlock(&spechash_slock);
1635 vgone(vq);
1636 break;
1637 }
1638 if (vq == NULLVP)
1639 simple_unlock(&spechash_slock);
1640 }
1641 /*
1642 * Remove the lock so that vgone below will
1643 * really eliminate the vnode after which time
1644 * vgone will awaken any sleepers.
1645 */
1646 simple_lock(&vp->v_interlock);
1647 vp->v_flag &= ~VXLOCK;
1648 }
1649 vgonel(vp, p);
1650 return (0);
1651 }
1652
1653 /*
1654 * Recycle an unused vnode to the front of the free list.
1655 * Release the passed interlock if the vnode will be recycled.
1656 */
1657 int
1658 vrecycle(vp, inter_lkp, p)
1659 struct vnode *vp;
1660 struct slock *inter_lkp;
1661 struct proc *p;
1662 {
1663
1664 simple_lock(&vp->v_interlock);
1665 if (vp->v_usecount == 0) {
1666 if (inter_lkp)
1667 simple_unlock(inter_lkp);
1668 vgonel(vp, p);
1669 return (1);
1670 }
1671 simple_unlock(&vp->v_interlock);
1672 return (0);
1673 }
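/*
 * Illustrative note: vrecycle() is the hook a filesystem's inactive
 * routine can use to throw away a node it no longer wants (for
 * instance when the link count has dropped to zero), e.g.
 *
 *	vrecycle(vp, (struct slock *)0, p);
 *
 * The call above is a sketch of the usual pattern, not a quote from
 * any particular filesystem.
 */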
1674
1675 /*
1676 * Eliminate all activity associated with a vnode
1677 * in preparation for reuse.
1678 */
1679 void
1680 vgone(vp)
1681 struct vnode *vp;
1682 {
1683 struct proc *p = current_proc();
1684
1685 simple_lock(&vp->v_interlock);
1686 vgonel(vp, p);
1687 }
1688
1689 /*
1690 * vgone, with the vp interlock held.
1691 */
1692 void
1693 vgonel(vp, p)
1694 struct vnode *vp;
1695 struct proc *p;
1696 {
1697 struct vnode *vq;
1698 struct vnode *vx;
1699
1700 /*
1701 * if the vnode was not obtained by calling getnewvnode() we
1702 * are not responsible for the cleanup. Just return.
1703 */
1704 if (!(vp->v_flag & VSTANDARD)) {
1705 simple_unlock(&vp->v_interlock);
1706 return;
1707 }
1708
1709 /*
1710 * If a vgone (or vclean) is already in progress,
1711 * wait until it is done and return.
1712 */
1713 if (vp->v_flag & VXLOCK) {
1714 while (vp->v_flag & VXLOCK) {
1715 vp->v_flag |= VXWANT;
1716 simple_unlock(&vp->v_interlock);
1717 (void)tsleep((caddr_t)vp, PINOD, "vgone", 0);
1718 }
1719 return;
1720 }
1721 /*
1722 * Clean out the filesystem specific data.
1723 */
1724 vclean(vp, DOCLOSE, p);
1725 /*
1726 * Delete from old mount point vnode list, if on one.
1727 */
1728 if (vp->v_mount != NULL)
1729 insmntque(vp, (struct mount *)0);
1730 /*
1731 * If special device, remove it from special device alias list
1732 * if it is on one.
1733 */
1734 if ((vp->v_type == VBLK || vp->v_type == VCHR) && vp->v_specinfo != 0) {
1735 simple_lock(&spechash_slock);
1736 if (*vp->v_hashchain == vp) {
1737 *vp->v_hashchain = vp->v_specnext;
1738 } else {
1739 for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
1740 if (vq->v_specnext != vp)
1741 continue;
1742 vq->v_specnext = vp->v_specnext;
1743 break;
1744 }
1745 if (vq == NULL)
1746 panic("missing bdev");
1747 }
1748 if (vp->v_flag & VALIASED) {
1749 vx = NULL;
1750 for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
1751 if (vq->v_rdev != vp->v_rdev ||
1752 vq->v_type != vp->v_type)
1753 continue;
1754 if (vx)
1755 break;
1756 vx = vq;
1757 }
1758 if (vx == NULL)
1759 panic("missing alias");
1760 if (vq == NULL)
1761 vx->v_flag &= ~VALIASED;
1762 vp->v_flag &= ~VALIASED;
1763 }
1764 simple_unlock(&spechash_slock);
1765 FREE_ZONE(vp->v_specinfo, sizeof (struct specinfo), M_VNODE);
1766 vp->v_specinfo = NULL;
1767 }
1768 /*
1769 * If it is on the freelist and not already at the head,
1770 * move it to the head of the list. The test of the back
1771 * pointer and the reference count of zero is because
1772 * it will be removed from the free list by getnewvnode,
1773 * but will not have its reference count incremented until
1774 * after calling vgone. If the reference count were
1775 * incremented first, vgone would (incorrectly) try to
1776 * close the previous instance of the underlying object.
1777 * So, the back pointer is explicitly set to `0xdeadb' in
1778 * getnewvnode after removing it from the freelist to ensure
1779 * that we do not try to move it here.
1780 */
1781 if (vp->v_usecount == 0) {
1782 simple_lock(&vnode_free_list_slock);
1783 if ((vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb) &&
1784 vnode_free_list.tqh_first != vp) {
1785 TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
1786 TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
1787 }
1788 simple_unlock(&vnode_free_list_slock);
1789 }
1790 vp->v_type = VBAD;
1791 }
1792
1793 /*
1794 * Lookup a vnode by device number.
1795 */
1796 int
1797 vfinddev(dev, type, vpp)
1798 dev_t dev;
1799 enum vtype type;
1800 struct vnode **vpp;
1801 {
1802 struct vnode *vp;
1803 int rc = 0;
1804
1805 simple_lock(&spechash_slock);
1806 for (vp = speclisth[SPECHASH(dev)]; vp; vp = vp->v_specnext) {
1807 if (dev != vp->v_rdev || type != vp->v_type)
1808 continue;
1809 *vpp = vp;
1810 rc = 1;
1811 break;
1812 }
1813 simple_unlock(&spechash_slock);
1814 return (rc);
1815 }
1816
1817 /*
1818 * Calculate the total number of references to a special device.
1819 */
1820 int
1821 vcount(vp)
1822 struct vnode *vp;
1823 {
1824 struct vnode *vq, *vnext;
1825 int count;
1826
1827 loop:
1828 if ((vp->v_flag & VALIASED) == 0)
1829 return (vp->v_usecount);
1830 simple_lock(&spechash_slock);
1831 for (count = 0, vq = *vp->v_hashchain; vq; vq = vnext) {
1832 vnext = vq->v_specnext;
1833 if (vq->v_rdev != vp->v_rdev || vq->v_type != vp->v_type)
1834 continue;
1835 /*
1836 * Alias, but not in use, so flush it out.
1837 */
1838 if (vq->v_usecount == 0 && vq != vp) {
1839 simple_unlock(&spechash_slock);
1840 vgone(vq);
1841 goto loop;
1842 }
1843 count += vq->v_usecount;
1844 }
1845 simple_unlock(&spechash_slock);
1846 return (count);
1847 }
1848
1849 int prtactive = 0; /* 1 => print out reclaim of active vnodes */
1850
1851 /*
1852 * Print out a description of a vnode.
1853 */
1854 static char *typename[] =
1855 { "VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD" };
1856
1857 void
1858 vprint(label, vp)
1859 char *label;
1860 register struct vnode *vp;
1861 {
1862 char buf[64];
1863
1864 if (label != NULL)
1865 printf("%s: ", label);
1866 printf("type %s, usecount %d, writecount %d, refcount %d,",
1867 typename[vp->v_type], vp->v_usecount, vp->v_writecount,
1868 vp->v_holdcnt);
1869 buf[0] = '\0';
1870 if (vp->v_flag & VROOT)
1871 strcat(buf, "|VROOT");
1872 if (vp->v_flag & VTEXT)
1873 strcat(buf, "|VTEXT");
1874 if (vp->v_flag & VSYSTEM)
1875 strcat(buf, "|VSYSTEM");
1876 if (vp->v_flag & VNOFLUSH)
1877 strcat(buf, "|VNOFLUSH");
1878 if (vp->v_flag & VXLOCK)
1879 strcat(buf, "|VXLOCK");
1880 if (vp->v_flag & VXWANT)
1881 strcat(buf, "|VXWANT");
1882 if (vp->v_flag & VBWAIT)
1883 strcat(buf, "|VBWAIT");
1884 if (vp->v_flag & VALIASED)
1885 strcat(buf, "|VALIASED");
1886 if (buf[0] != '\0')
1887 printf(" flags (%s)", &buf[1]);
1888 if (vp->v_data == NULL) {
1889 printf("\n");
1890 } else {
1891 printf("\n\t");
1892 VOP_PRINT(vp);
1893 }
1894 }
1895
1896 #ifdef DEBUG
1897 /*
1898 * List all of the locked vnodes in the system.
1899 * Called when debugging the kernel.
1900 */
1901 void
1902 printlockedvnodes()
1903 {
1904 struct proc *p = current_proc();
1905 struct mount *mp, *nmp;
1906 struct vnode *vp;
1907
1908 printf("Locked vnodes\n");
1909 simple_lock(&mountlist_slock);
1910 for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) {
1911 if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) {
1912 nmp = mp->mnt_list.cqe_next;
1913 continue;
1914 }
1915 for (vp = mp->mnt_vnodelist.lh_first;
1916 vp != NULL;
1917 vp = vp->v_mntvnodes.le_next) {
1918 if (VOP_ISLOCKED(vp))
1919 vprint((char *)0, vp);
1920 }
1921 simple_lock(&mountlist_slock);
1922 nmp = mp->mnt_list.cqe_next;
1923 vfs_unbusy(mp, p);
1924 }
1925 simple_unlock(&mountlist_slock);
1926 }
1927 #endif
1928
1929 /*
1930 * Top level filesystem related information gathering.
1931 */
1932 int
1933 vfs_sysctl(name, namelen, oldp, oldlenp, newp, newlen, p)
1934 int *name;
1935 u_int namelen;
1936 void *oldp;
1937 size_t *oldlenp;
1938 void *newp;
1939 size_t newlen;
1940 struct proc *p;
1941 {
1942 struct vfsconf *vfsp;
1943
1944 /*
1945 * The VFS_NUMMNTOPS shouldn't be at name[0], since it
1946 * is a VFS generic variable. So now we must check
1947 * namelen so we don't end up covering any UFS
1948 * variables (since UFS vfc_typenum is 1).
1949 *
1950 * It should have been:
1951 * name[0]: VFS_GENERIC
1952 * name[1]: VFS_NUMMNTOPS
1953 */
1954 if (namelen == 1 && name[0] == VFS_NUMMNTOPS) {
1955 extern unsigned int vfs_nummntops;
1956 return (sysctl_rdint(oldp, oldlenp, newp, vfs_nummntops));
1957 }
1958
1959 /* all sysctl names at this level are at least name and field */
1960 if (namelen < 2)
1961 return (ENOTDIR); /* overloaded */
1962 if (name[0] != VFS_GENERIC) {
1963 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
1964 if (vfsp->vfc_typenum == name[0])
1965 break;
1966 if (vfsp == NULL)
1967 return (EOPNOTSUPP);
1968 return ((*vfsp->vfc_vfsops->vfs_sysctl)(&name[1], namelen - 1,
1969 oldp, oldlenp, newp, newlen, p));
1970 }
1971 switch (name[1]) {
1972 case VFS_MAXTYPENUM:
1973 return (sysctl_rdint(oldp, oldlenp, newp, maxvfsconf));
1974 case VFS_CONF:
1975 if (namelen < 3)
1976 return (ENOTDIR); /* overloaded */
1977 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
1978 if (vfsp->vfc_typenum == name[2])
1979 break;
1980 if (vfsp == NULL)
1981 return (EOPNOTSUPP);
1982 return (sysctl_rdstruct(oldp, oldlenp, newp, vfsp,
1983 sizeof(struct vfsconf)));
1984 }
1985 return (EOPNOTSUPP);
1986 }
1987
1988 int kinfo_vdebug = 1;
1989 #define KINFO_VNODESLOP 10
1990 /*
1991 * Dump vnode list (via sysctl).
1992 * Copyout address of vnode followed by vnode.
1993 */
1994 /* ARGSUSED */
1995 int
1996 sysctl_vnode(where, sizep, p)
1997 char *where;
1998 size_t *sizep;
1999 struct proc *p;
2000 {
2001 struct mount *mp, *nmp;
2002 struct vnode *nvp, *vp;
2003 char *bp = where, *savebp;
2004 char *ewhere;
2005 int error;
2006
2007 #define VPTRSZ sizeof (struct vnode *)
2008 #define VNODESZ sizeof (struct vnode)
2009 if (where == NULL) {
2010 *sizep = (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ);
2011 return (0);
2012 }
2013 ewhere = where + *sizep;
2014
2015 simple_lock(&mountlist_slock);
2016 for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) {
2017 if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) {
2018 nmp = mp->mnt_list.cqe_next;
2019 continue;
2020 }
2021 savebp = bp;
2022 again:
2023 simple_lock(&mntvnode_slock);
2024 for (vp = mp->mnt_vnodelist.lh_first;
2025 vp != NULL;
2026 vp = nvp) {
2027 /*
2028 * Check that the vp is still associated with
2029 * this filesystem. RACE: could have been
2030 * recycled onto the same filesystem.
2031 */
2032 if (vp->v_mount != mp) {
2033 simple_unlock(&mntvnode_slock);
2034 if (kinfo_vdebug)
2035 printf("kinfo: vp changed\n");
2036 bp = savebp;
2037 goto again;
2038 }
2039 nvp = vp->v_mntvnodes.le_next;
2040 if (bp + VPTRSZ + VNODESZ > ewhere) {
2041 simple_unlock(&mntvnode_slock);
2042 *sizep = bp - where;
2043 return (ENOMEM);
2044 }
2045 simple_unlock(&mntvnode_slock);
2046 if ((error = copyout((caddr_t)&vp, bp, VPTRSZ)) ||
2047 (error = copyout((caddr_t)vp, bp + VPTRSZ, VNODESZ)))
2048 return (error);
2049 bp += VPTRSZ + VNODESZ;
2050 simple_lock(&mntvnode_slock);
2051 }
2052 simple_unlock(&mntvnode_slock);
2053 simple_lock(&mountlist_slock);
2054 nmp = mp->mnt_list.cqe_next;
2055 vfs_unbusy(mp, p);
2056 }
2057 simple_unlock(&mountlist_slock);
2058
2059 *sizep = bp - where;
2060 return (0);
2061 }
2062
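/*
 * Illustrative sketch (not compiled): the two-pass pattern the NULL
 * "where" branch above exists for, as seen from a userland caller of
 * the kern.vnode MIB (CTL_KERN, KERN_VNODE).  The buffer comes back as
 * (struct vnode *, struct vnode) pairs, VPTRSZ + VNODESZ apiece.
 */
#if 0
#include <sys/param.h>
#include <sys/sysctl.h>
#include <stdlib.h>

static char *
fetch_vnode_table(size_t *lenp)
{
	int mib[2] = { CTL_KERN, KERN_VNODE };
	size_t len;
	char *buf;

	/* First pass: no buffer, kernel reports a size with some slop. */
	if (sysctl(mib, 2, NULL, &len, NULL, 0) < 0)
		return (NULL);
	if ((buf = malloc(len)) == NULL)
		return (NULL);
	/* Second pass: copy out the table; ENOMEM means it grew too much. */
	if (sysctl(mib, 2, buf, &len, NULL, 0) < 0) {
		free(buf);
		return (NULL);
	}
	*lenp = len;
	return (buf);
}
#endif /* 0 */
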
2063 /*
2064 * Check to see if a filesystem is mounted on a block device.
2065 */
2066 int
2067 vfs_mountedon(vp)
2068 struct vnode *vp;
2069 {
2070 struct vnode *vq;
2071 int error = 0;
2072
2073 if (vp->v_specflags & SI_MOUNTEDON)
2074 return (EBUSY);
2075 if (vp->v_flag & VALIASED) {
2076 simple_lock(&spechash_slock);
2077 for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
2078 if (vq->v_rdev != vp->v_rdev ||
2079 vq->v_type != vp->v_type)
2080 continue;
2081 if (vq->v_specflags & SI_MOUNTEDON) {
2082 error = EBUSY;
2083 break;
2084 }
2085 }
2086 simple_unlock(&spechash_slock);
2087 }
2088 return (error);
2089 }
2090
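/*
 * Illustrative sketch (not compiled): local filesystems typically call
 * vfs_mountedon() on the device vnode before mounting, so the same
 * block device cannot carry two filesystems at once.  "xxx_checkdev"
 * is a made-up name for the example.
 */
#if 0
static int
xxx_checkdev(struct vnode *devvp)
{
	int error;

	if ((error = vfs_mountedon(devvp)))
		return (error);		/* EBUSY: device already mounted */
	return (0);
}
#endif /* 0 */
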
2091 /*
2092 * Unmount all filesystems. The list is traversed in reverse order
2093 * of mounting to avoid dependencies.
2094 */
2095 __private_extern__ void
2096 vfs_unmountall()
2097 {
2098 struct mount *mp, *nmp;
2099 struct proc *p = current_proc();
2100
2101 /*
2102 * Since this only runs when rebooting, it is not interlocked.
2103 */
2104 for (mp = mountlist.cqh_last; mp != (void *)&mountlist; mp = nmp) {
2105 nmp = mp->mnt_list.cqe_prev;
2106 (void) dounmount(mp, MNT_FORCE, p);
2107 }
2108 }
2109
2110 /*
2111 * Build hash lists of net addresses and hang them off the mount point.
2112 * Called by vfs_export() to set up the lists of export addresses.
2113 */
2114 static int
2115 vfs_hang_addrlist(mp, nep, argp)
2116 struct mount *mp;
2117 struct netexport *nep;
2118 struct export_args *argp;
2119 {
2120 register struct netcred *np;
2121 register struct radix_node_head *rnh;
2122 register int i;
2123 struct radix_node *rn;
2124 struct sockaddr *saddr, *smask = 0;
2125 struct domain *dom;
2126 int error;
2127
2128 if (argp->ex_addrlen == 0) {
2129 if (mp->mnt_flag & MNT_DEFEXPORTED)
2130 return (EPERM);
2131 np = &nep->ne_defexported;
2132 np->netc_exflags = argp->ex_flags;
2133 np->netc_anon = argp->ex_anon;
2134 np->netc_anon.cr_ref = 1;
2135 mp->mnt_flag |= MNT_DEFEXPORTED;
2136 return (0);
2137 }
2138 i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen;
2139 MALLOC(np, struct netcred *, i, M_NETADDR, M_WAITOK);
2140 bzero((caddr_t)np, i);
2141 saddr = (struct sockaddr *)(np + 1);
2142 if (error = copyin(argp->ex_addr, (caddr_t)saddr, argp->ex_addrlen))
2143 goto out;
2144 if (saddr->sa_len > argp->ex_addrlen)
2145 saddr->sa_len = argp->ex_addrlen;
2146 if (argp->ex_masklen) {
2147 smask = (struct sockaddr *)((caddr_t)saddr + argp->ex_addrlen);
2148 error = copyin(argp->ex_addr, (caddr_t)smask, argp->ex_masklen);
2149 if (error)
2150 goto out;
2151 if (smask->sa_len > argp->ex_masklen)
2152 smask->sa_len = argp->ex_masklen;
2153 }
2154 i = saddr->sa_family;
2155 if ((rnh = nep->ne_rtable[i]) == 0) {
2156 /*
2157 * It seems silly to initialize every AF when most are not
2158 * used, so do it on demand here.
2159 */
2160 for (dom = domains; dom; dom = dom->dom_next)
2161 if (dom->dom_family == i && dom->dom_rtattach) {
2162 dom->dom_rtattach((void **)&nep->ne_rtable[i],
2163 dom->dom_rtoffset);
2164 break;
2165 }
2166 if ((rnh = nep->ne_rtable[i]) == 0) {
2167 error = ENOBUFS;
2168 goto out;
2169 }
2170 }
2171 rn = (*rnh->rnh_addaddr)((caddr_t)saddr, (caddr_t)smask, rnh,
2172 np->netc_rnodes);
2173 if (rn == 0) {
2174 /*
2175 * One of the reasons that rnh_addaddr may fail is that
2176 * the entry already exists. To check for this case, we
2177 * look up the entry to see if it is there. If so, we
2178 * do not need to make a new entry but do return success.
2179 */
2180 _FREE(np, M_NETADDR);
2181 rn = (*rnh->rnh_matchaddr)((caddr_t)saddr, rnh);
2182 if (rn != 0 && (rn->rn_flags & RNF_ROOT) == 0 &&
2183 ((struct netcred *)rn)->netc_exflags == argp->ex_flags &&
2184 !bcmp((caddr_t)&((struct netcred *)rn)->netc_anon,
2185 (caddr_t)&argp->ex_anon, sizeof(struct ucred)))
2186 return (0);
2187 return (EPERM);
2188 }
2189 np->netc_exflags = argp->ex_flags;
2190 np->netc_anon = argp->ex_anon;
2191 np->netc_anon.cr_ref = 1;
2192 return (0);
2193 out:
2194 _FREE(np, M_NETADDR);
2195 return (error);
2196 }
2197
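/*
 * rnh_walktree() callback used by vfs_free_addrlist(): remove one
 * export address from the radix tree and free its netcred entry.
 */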
2198 /* ARGSUSED */
2199 static int
2200 vfs_free_netcred(rn, w)
2201 struct radix_node *rn;
2202 caddr_t w;
2203 {
2204 register struct radix_node_head *rnh = (struct radix_node_head *)w;
2205
2206 (*rnh->rnh_deladdr)(rn->rn_key, rn->rn_mask, rnh);
2207 _FREE((caddr_t)rn, M_NETADDR);
2208 return (0);
2209 }
2210
2211 /*
2212 * Free the net address hash lists that are hanging off the mount points.
2213 */
2214 static void
2215 vfs_free_addrlist(nep)
2216 struct netexport *nep;
2217 {
2218 register int i;
2219 register struct radix_node_head *rnh;
2220
2221 for (i = 0; i <= AF_MAX; i++)
2222 if (rnh = nep->ne_rtable[i]) {
2223 (*rnh->rnh_walktree)(rnh, vfs_free_netcred,
2224 (caddr_t)rnh);
2225 _FREE((caddr_t)rnh, M_RTABLE);
2226 nep->ne_rtable[i] = 0;
2227 }
2228 }
2229
2230 int
2231 vfs_export(mp, nep, argp)
2232 struct mount *mp;
2233 struct netexport *nep;
2234 struct export_args *argp;
2235 {
2236 int error;
2237
2238 if (argp->ex_flags & MNT_DELEXPORT) {
2239 vfs_free_addrlist(nep);
2240 mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED);
2241 }
2242 if (argp->ex_flags & MNT_EXPORTED) {
2243 if (error = vfs_hang_addrlist(mp, nep, argp))
2244 return (error);
2245 mp->mnt_flag |= MNT_EXPORTED;
2246 }
2247 return (0);
2248 }
2249
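/*
 * Illustrative sketch (not compiled): the shape of a filesystem's
 * MNT_UPDATE/export handling.  A real caller keeps a struct netexport
 * in its per-mount data and passes along the export_args copied in
 * from mount(2); "xxx_args" and "xxx_mount_data" are made-up names for
 * the example.
 */
#if 0
struct xxx_args {
	struct export_args export;	/* network export information */
};

struct xxx_mount_data {
	struct netexport xm_export;	/* export address radix trees */
};

static int
xxx_update_exports(struct mount *mp, struct xxx_args *args)
{
	struct xxx_mount_data *xmp = (struct xxx_mount_data *)mp->mnt_data;

	return (vfs_export(mp, &xmp->xm_export, &args->export));
}
#endif /* 0 */
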
2250 struct netcred *
2251 vfs_export_lookup(mp, nep, nam)
2252 register struct mount *mp;
2253 struct netexport *nep;
2254 struct mbuf *nam;
2255 {
2256 register struct netcred *np;
2257 register struct radix_node_head *rnh;
2258 struct sockaddr *saddr;
2259
2260 np = NULL;
2261 if (mp->mnt_flag & MNT_EXPORTED) {
2262 /*
2263 * Lookup in the export list first.
2264 */
2265 if (nam != NULL) {
2266 saddr = mtod(nam, struct sockaddr *);
2267 rnh = nep->ne_rtable[saddr->sa_family];
2268 if (rnh != NULL) {
2269 np = (struct netcred *)
2270 (*rnh->rnh_matchaddr)((caddr_t)saddr,
2271 rnh);
2272 if (np && np->netc_rnodes->rn_flags & RNF_ROOT)
2273 np = NULL;
2274 }
2275 }
2276 /*
2277 * If no address match, use the default if it exists.
2278 */
2279 if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED)
2280 np = &nep->ne_defexported;
2281 }
2282 return (np);
2283 }
2284
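/*
 * Illustrative sketch (not compiled): how an NFS-style server path
 * consumes vfs_export_lookup().  "nam" is the client's address as an
 * mbuf-resident sockaddr; a NULL return means the client has no access
 * to this export.  "xxx_check_export" is a made-up name.
 */
#if 0
static int
xxx_check_export(struct mount *mp, struct netexport *nep,
    struct mbuf *nam, struct ucred **credp, int *flagsp)
{
	struct netcred *np;

	if ((np = vfs_export_lookup(mp, nep, nam)) == NULL)
		return (EACCES);
	*credp = &np->netc_anon;	/* anonymous cred for this client */
	*flagsp = np->netc_exflags;	/* e.g. MNT_EXRDONLY */
	return (0);
}
#endif /* 0 */
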
2285 /*
2286 * Try to reclaim vnodes from the memory
2287 * object cache.
2288 */
2289 static int
2290 vm_object_cache_reclaim(int count)
2291 {
2292 int cnt;
2293 void vnode_pager_release_from_cache(int *);
2294
2295 /* attempt to reclaim vnodes from VM object cache */
2296 cnt = count;
2297 vnode_pager_release_from_cache(&cnt);
2298 return(cnt);
2299 }
2300
2301 /*
2302 * Release memory object reference held by inactive vnodes
2303 * and then try to reclaim some vnodes from the memory
2304 * object cache
2305 */
2306 static int
2307 vnreclaim(int count)
2308 {
2309 int i, loopcnt;
2310 struct vnode *vp;
2311 int err;
2312 struct proc *p;
2313
2314 i = 0;
2315 loopcnt = 0;
2316
2317 /* Try to release "count" vnodes from the inactive list */
2318 restart:
2319 if (++loopcnt > inactivevnodes) {
2320 /*
2321 * I did my best trying to reclaim the vnodes.
2322 * Do not try any more as that would only lead to
2323 * long latencies. Also in the worst case
2324 * this can get totally CPU bound.
2325 * Just fall through and attempt a reclaim of the VM
2326 * object cache.
2327 */
2328 goto out;
2329 }
2330
2331 simple_lock(&vnode_free_list_slock);
2332 for (vp = TAILQ_FIRST(&vnode_inactive_list);
2333 (vp != NULLVP) && (i < count);
2334 vp = TAILQ_NEXT(vp, v_freelist)) {
2335
2336 if (!simple_lock_try(&vp->v_interlock))
2337 continue;
2338
2339 if (vp->v_usecount != 1)
2340 panic("vnreclaim: v_usecount");
2341
2342 if(!UBCINFOEXISTS(vp)) {
2343 if (vp->v_type == VBAD) {
2344 VREMINACTIVE("vnreclaim", vp);
2345 simple_unlock(&vp->v_interlock);
2346 continue;
2347 } else
2348 panic("non UBC vnode on inactive list");
2349 /* Should not reach here */
2350 }
2351
2352 /* If vnode is already being reclaimed, wait */
2353 if ((vp->v_flag & VXLOCK) || (vp->v_flag & VORECLAIM)) {
2354 vp->v_flag |= VXWANT;
2355 simple_unlock(&vp->v_interlock);
2356 simple_unlock(&vnode_free_list_slock);
2357 (void)tsleep((caddr_t)vp, PINOD, "vocr", 0);
2358 goto restart;
2359 }
2360
2361 VREMINACTIVE("vnreclaim", vp);
2362 simple_unlock(&vnode_free_list_slock);
2363
2364 if (ubc_issetflags(vp, UI_WASMAPPED)) {
2365 /*
2366 * We should not reclaim as it is likely
2367 * to be in use. Let it die a natural death.
2368 * Release the UBC reference if one exists
2369 * and put it back at the tail.
2370 */
2371 simple_unlock(&vp->v_interlock);
2372 if (ubc_release_named(vp)) {
2373 if (UBCINFOEXISTS(vp)) {
2374 simple_lock(&vp->v_interlock);
2375 if (vp->v_usecount == 1 && !VONLIST(vp))
2376 vinactive(vp);
2377 simple_unlock(&vp->v_interlock);
2378 }
2379 } else {
2380 simple_lock(&vp->v_interlock);
2381 vinactive(vp);
2382 simple_unlock(&vp->v_interlock);
2383 }
2384 } else {
2385 int didhold;
2386
2387 VORECLAIM_ENABLE(vp);
2388
2389 /*
2390 * scrub the dirty pages and invalidate the buffers
2391 */
2392 p = current_proc();
2393 err = vn_lock(vp, LK_EXCLUSIVE|LK_INTERLOCK, p);
2394 if (err) {
2395 /* cannot reclaim */
2396 simple_lock(&vp->v_interlock);
2397 vinactive(vp);
2398 VORECLAIM_DISABLE(vp);
2399 i++;
2400 simple_unlock(&vp->v_interlock);
2401 goto restart;
2402 }
2403
2404 /* keep the vnode alive so we can kill it */
2405 simple_lock(&vp->v_interlock);
2406 if(vp->v_usecount != 1)
2407 panic("VOCR: usecount race");
2408 vp->v_usecount++;
2409 simple_unlock(&vp->v_interlock);
2410
2411 /* clean up the state in VM without invalidating */
2412 didhold = ubc_hold(vp);
2413 if (didhold)
2414 (void)ubc_clean(vp, 0);
2415
2416 /* flush and invalidate buffers associated with the vnode */
2417 if (vp->v_tag == VT_NFS)
2418 nfs_vinvalbuf(vp, V_SAVE, NOCRED, p, 0);
2419 else
2420 vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0);
2421
2422 /*
2423 * Note: for the v_usecount == 2 case, VOP_INACTIVE
2424 * has not yet been called. Call it now while vp is
2425 * still locked; it will also release the lock.
2426 */
2427 if (vp->v_usecount == 2)
2428 VOP_INACTIVE(vp, p);
2429 else
2430 VOP_UNLOCK(vp, 0, p);
2431
2432 if (didhold)
2433 ubc_rele(vp);
2434
2435 /*
2436 * Destroy the ubc named reference.
2437 * If we can't because it is held for I/Os
2438 * in progress, just put it back on the inactive
2439 * list and move on. Otherwise, the paging reference
2440 * is toast (and so is this vnode?).
2441 */
2442 if (ubc_destroy_named(vp)) {
2443 i++;
2444 }
2445 simple_lock(&vp->v_interlock);
2446 VORECLAIM_DISABLE(vp);
2447 simple_unlock(&vp->v_interlock);
2448 vrele(vp); /* release extra use we added here */
2449 }
2450 /* inactive list lock was released, must restart */
2451 goto restart;
2452 }
2453 simple_unlock(&vnode_free_list_slock);
2454
2455 vnode_reclaim_tried += i;
2456 out:
2457 i = vm_object_cache_reclaim(count);
2458 vnode_objects_reclaimed += i;
2459
2460 return(i);
2461 }
2462
2463 /*
2464 * This routine is called from vnode_pager_no_senders()
2465 * which in turn can be called with the vnode locked by vnode_uncache(),
2466 * but it could also get called as a result of vm_object_cache_trim().
2467 * In that case lock state is unknown.
2468 * AGE the vnode so that it gets recycled quickly.
2469 * Check lock status to decide whether to call vput() or vrele().
2470 */
2471 __private_extern__ void
2472 vnode_pager_vrele(struct vnode *vp)
2473 {
2474
2475 boolean_t funnel_state;
2476 int isvnreclaim = 1;
2477
2478 if (vp == (struct vnode *) NULL)
2479 panic("vnode_pager_vrele: null vp");
2480
2481 funnel_state = thread_funnel_set(kernel_flock, TRUE);
2482
2483 /* Mark the vnode to be recycled */
2484 vagevp(vp);
2485
2486 simple_lock(&vp->v_interlock);
2487 /*
2488 * If a vgone (or vclean) is already in progress,
2489 * do not bother with the ubc_info cleanup;
2490 * let vclean deal with it.
2491 */
2492 if (vp->v_flag & VXLOCK) {
2493 CLR(vp->v_flag, VTERMINATE);
2494 if (ISSET(vp->v_flag, VTERMWANT)) {
2495 CLR(vp->v_flag, VTERMWANT);
2496 wakeup((caddr_t)&vp->v_ubcinfo);
2497 }
2498 simple_unlock(&vp->v_interlock);
2499 vrele(vp);
2500 (void) thread_funnel_set(kernel_flock, funnel_state);
2501 return;
2502 }
2503
2504 /* It's dead, Jim! */
2505 if (!ISSET(vp->v_flag, VORECLAIM)) {
2506 /*
2507 * called as a result of eviction of the memory
2508 * object from the memory object cache
2509 */
2510 isvnreclaim = 0;
2511
2512 /* So serialize vnode operations */
2513 VORECLAIM_ENABLE(vp);
2514 }
2515 if (!ISSET(vp->v_flag, VTERMINATE))
2516 SET(vp->v_flag, VTERMINATE);
2517 if (UBCINFOEXISTS(vp)) {
2518 struct ubc_info *uip = vp->v_ubcinfo;
2519
2520 if (ubc_issetflags(vp, UI_WASMAPPED))
2521 SET(vp->v_flag, VWASMAPPED);
2522
2523 vp->v_ubcinfo = UBC_NOINFO; /* catch bad accesses */
2524 simple_unlock(&vp->v_interlock);
2525 ubc_info_deallocate(uip);
2526 } else {
2527 if ((vp->v_type == VBAD) && ((vp)->v_ubcinfo != UBC_INFO_NULL)
2528 && ((vp)->v_ubcinfo != UBC_NOINFO)) {
2529 struct ubc_info *uip = vp->v_ubcinfo;
2530
2531 vp->v_ubcinfo = UBC_NOINFO; /* catch bad accesses */
2532 simple_unlock(&vp->v_interlock);
2533 ubc_info_deallocate(uip);
2534 } else {
2535 simple_unlock(&vp->v_interlock);
2536 }
2537 }
2538
2539 CLR(vp->v_flag, VTERMINATE);
2540
2541 if (vp->v_type != VBAD){
2542 vgone(vp); /* revoke the vnode */
2543 vrele(vp); /* and drop the reference */
2544 } else
2545 vrele(vp);
2546
2547 if (ISSET(vp->v_flag, VTERMWANT)) {
2548 CLR(vp->v_flag, VTERMWANT);
2549 wakeup((caddr_t)&vp->v_ubcinfo);
2550 }
2551 if (!isvnreclaim)
2552 VORECLAIM_DISABLE(vp);
2553 (void) thread_funnel_set(kernel_flock, funnel_state);
2554 return;
2555 }
2556
2557
2558 #if DIAGNOSTIC
2559 int walk_vnodes_debug=0;
2560
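/*
 * Debugging aid: walk the per-mount vnode lists and the free and
 * inactive lists, printing (when walk_vnodes_debug is set) any vnode
 * whose v_usecount has gone negative, then report the free and
 * inactive list lengths.
 */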
2561 void
2562 walk_allvnodes()
2563 {
2564 struct mount *mp, *nmp;
2565 struct vnode *vp;
2566 int cnt = 0;
2567
2568 for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) {
2569 for (vp = mp->mnt_vnodelist.lh_first;
2570 vp != NULL;
2571 vp = vp->v_mntvnodes.le_next) {
2572 if (vp->v_usecount < 0){
2573 if(walk_vnodes_debug) {
2574 printf("vp is %x\n",vp);
2575 }
2576 }
2577 }
2578 nmp = mp->mnt_list.cqe_next;
2579 }
2580 for (cnt = 0, vp = vnode_free_list.tqh_first;
2581 vp != NULLVP; cnt++, vp = vp->v_freelist.tqe_next) {
2582 if ((vp->v_usecount < 0) && walk_vnodes_debug) {
2583 if(walk_vnodes_debug) {
2584 printf("vp is %x\n",vp);
2585 }
2586 }
2587 }
2588 printf("%d - free\n", cnt);
2589
2590 for (cnt = 0, vp = vnode_inactive_list.tqh_first;
2591 vp != NULLVP; cnt++, vp = vp->v_freelist.tqe_next) {
2592 if ((vp->v_usecount < 0) && walk_vnodes_debug) {
2593 if(walk_vnodes_debug) {
2594 printf("vp is %x\n",vp);
2595 }
2596 }
2597 }
2598 printf("%d - inactive\n", cnt);
2599 }
2600 #endif /* DIAGNOSTIC */
2601
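/*
 * Report the maximum I/O size and scatter/gather segment count to use
 * for reads or writes against this vnode's mount point.  Falls back to
 * MAXPHYS and 32 segments when there is no mount or the flag is
 * neither B_READ nor B_WRITE.
 */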
2602 void
2603 vfs_io_attributes(vp, flags, iosize, vectors)
2604 struct vnode *vp;
2605 int flags; /* B_READ or B_WRITE */
2606 int *iosize;
2607 int *vectors;
2608 {
2609 struct mount *mp;
2610
2611 /* start with "reasonable" defaults */
2612 *iosize = MAXPHYS;
2613 *vectors = 32;
2614
2615 mp = vp->v_mount;
2616 if (mp != NULL) {
2617 switch (flags) {
2618 case B_READ:
2619 *iosize = mp->mnt_maxreadcnt;
2620 *vectors = mp->mnt_segreadcnt;
2621 break;
2622 case B_WRITE:
2623 *iosize = mp->mnt_maxwritecnt;
2624 *vectors = mp->mnt_segwritecnt;
2625 break;
2626 default:
2627 break;
2628 }
2629 }
2630
2631 return;
2632 }
2633
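/*
 * Illustrative sketch (not compiled): how an I/O path might clamp a
 * transfer to the limits published by vfs_io_attributes().
 * "xxx_write_pass" is a made-up name for the example.
 */
#if 0
static void
xxx_write_pass(struct vnode *vp, off_t resid)
{
	int max_io, max_vectors;
	off_t chunk;

	vfs_io_attributes(vp, B_WRITE, &max_io, &max_vectors);
	while (resid > 0) {
		chunk = (resid > max_io) ? max_io : resid;
		/*
		 * ... issue a write of "chunk" bytes here; a real caller
		 * would also honor max_vectors when building its
		 * scatter/gather list ...
		 */
		resid -= chunk;
	}
}
#endif /* 0 */
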
2634 #include <dev/disk.h>
2635
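/*
 * Query the block device under a mount point (via the DKIOCGETMAX* and
 * DKIOCGETBLOCKSIZE ioctls) for its transfer-size and segment limits,
 * clamp them to the 32-bit and 16-bit mount fields, and record them
 * for later use by vfs_io_attributes().
 */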
2636 int
2637 vfs_init_io_attributes(devvp, mp)
2638 struct vnode *devvp;
2639 struct mount *mp;
2640 {
2641 int error;
2642 off_t readblockcnt;
2643 off_t writeblockcnt;
2644 off_t readsegcnt;
2645 off_t writesegcnt;
2646 u_long blksize;
2647
2648 u_int64_t temp;
2649
2650 struct proc *p = current_proc();
2651 struct ucred *cred = p->p_ucred;
2652
2653 if ((error = VOP_IOCTL(devvp, DKIOCGETMAXBLOCKCOUNTREAD,
2654 (caddr_t)&readblockcnt, 0, cred, p)))
2655 return (error);
2656
2657 if ((error = VOP_IOCTL(devvp, DKIOCGETMAXBLOCKCOUNTWRITE,
2658 (caddr_t)&writeblockcnt, 0, cred, p)))
2659 return (error);
2660
2661 if ((error = VOP_IOCTL(devvp, DKIOCGETMAXSEGMENTCOUNTREAD,
2662 (caddr_t)&readsegcnt, 0, cred, p)))
2663 return (error);
2664
2665 if ((error = VOP_IOCTL(devvp, DKIOCGETMAXSEGMENTCOUNTWRITE,
2666 (caddr_t)&writesegcnt, 0, cred, p)))
2667 return (error);
2668
2669 if ((error = VOP_IOCTL(devvp, DKIOCGETBLOCKSIZE,
2670 (caddr_t)&blksize, 0, cred, p)))
2671 return (error);
2672
2673 temp = readblockcnt * blksize;
2674 temp = (temp > UINT32_MAX) ? (UINT32_MAX / blksize) * blksize : temp;
2675 mp->mnt_maxreadcnt = (u_int32_t)temp;
2676
2677 temp = writeblockcnt * blksize;
2678 temp = (temp > UINT32_MAX) ? (UINT32_MAX / blksize) * blksize : temp;
2679 mp->mnt_maxwritecnt = (u_int32_t)temp;
2680
2681 temp = (readsegcnt > UINT16_MAX) ? UINT16_MAX : readsegcnt;
2682 mp->mnt_segreadcnt = (u_int16_t)temp;
2683
2684 temp = (writesegcnt > UINT16_MAX) ? UINT16_MAX : writesegcnt;
2685 mp->mnt_segwritecnt = (u_int16_t)temp;
2686
2687 #if 0
2688 printf("--- IO attributes for mount point 0x%08x ---\n", mp);
2689 printf("\tmnt_maxreadcnt = 0x%x", mp->mnt_maxreadcnt);
2690 printf("\tmnt_maxwritecnt = 0x%x\n", mp->mnt_maxwritecnt);
2691 printf("\tmnt_segreadcnt = 0x%x", mp->mnt_segreadcnt);
2692 printf("\tmnt_segwritecnt = 0x%x\n", mp->mnt_segwritecnt);
2693 #endif /* 0 */
2694
2695 return (error);
2696 }
2697
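/*
 * Illustrative sketch (not compiled): a local filesystem's mount
 * routine would call vfs_init_io_attributes() once the device vnode is
 * open, before sizing its own I/O.  "xxx_mountfs" is a made-up name
 * for the example.
 */
#if 0
static int
xxx_mountfs(struct vnode *devvp, struct mount *mp)
{
	int error;

	if ((error = vfs_init_io_attributes(devvp, mp)))
		return (error);
	/* ... read the superblock using mnt_maxreadcnt-sized I/Os ... */
	return (0);
}
#endif /* 0 */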