]> git.saurik.com Git - apple/xnu.git/blobdiff - bsd/vfs/vfs_subr.c
xnu-517.7.7.tar.gz
[apple/xnu.git] / bsd / vfs / vfs_subr.c
index fd99cca8d7b2bd369a04235d49524ea7330f2601..0801f1a0953c56cb2e004636067a424aba6a367a 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2001 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved.
  *
  * @APPLE_LICENSE_HEADER_START@
  * 
@@ -64,6 +64,7 @@
  * External virtual filesystem routines
  */
 
+#undef DIAGNOSTIC
 #define DIAGNOSTIC 1
 
 #include <sys/param.h>
 #include <sys/ubc.h>
 #include <sys/vm.h>
 #include <sys/sysctl.h>
+#include <sys/filedesc.h>
+#include <sys/event.h>
+
+#include <string.h>
+#include <machine/spl.h>
+
 
 #include <kern/assert.h>
 
@@ -108,15 +115,6 @@ static int vnreclaim(int count);
 extern kern_return_t 
        adjust_vm_object_cache(vm_size_t oval, vm_size_t nval);
 
-/*
- * Insq/Remq for the vnode usage lists.
- */
-#define        bufinsvn(bp, dp)        LIST_INSERT_HEAD(dp, bp, b_vnbufs)
-#define        bufremvn(bp) {                                                  \
-       LIST_REMOVE(bp, b_vnbufs);                                      \
-       (bp)->b_vnbufs.le_next = NOLIST;                                \
-}
-
 TAILQ_HEAD(freelst, vnode) vnode_free_list;    /* vnode free list */
 TAILQ_HEAD(inactivelst, vnode) vnode_inactive_list;    /* vnode inactive list */
 struct mntlist mountlist;                      /* mounted filesystem list */
@@ -171,7 +169,7 @@ struct mntlist mountlist;                   /* mounted filesystem list */
 #define VORECLAIM_ENABLE(vp)   \
        do {    \
                if (ISSET((vp)->v_flag, VORECLAIM))     \
-                       panic("vm object raclaim already");     \
+                       panic("vm_object_reclaim already");     \
                SET((vp)->v_flag, VORECLAIM);   \
        } while(0)
 
@@ -260,7 +258,12 @@ __private_extern__ kern_return_t
 reset_vmobjectcache(unsigned int val1, unsigned int val2)
 {
        vm_size_t oval = val1 - VNODE_FREE_MIN;
-       vm_size_t nval = val2 - VNODE_FREE_MIN;
+       vm_size_t nval;
+       
+       if(val2 < VNODE_FREE_MIN)
+               nval = 0;
+       else
+               nval = val2 - VNODE_FREE_MIN;
 
        return(adjust_vm_object_cache(oval, nval));
 }
@@ -399,8 +402,7 @@ vfs_getvfs(fsid)
        register struct mount *mp;
 
        simple_lock(&mountlist_slock);
-       for (mp = mountlist.cqh_first; mp != (void *)&mountlist;
-            mp = mp->mnt_list.cqe_next) {
+       CIRCLEQ_FOREACH(mp, &mountlist, mnt_list) {
                if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
                    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
                        simple_unlock(&mountlist_slock);
@@ -431,7 +433,7 @@ static u_short xxxfs_mntid;
                ++xxxfs_mntid;
        tfsid.val[0] = makedev(nblkdev + mtype, xxxfs_mntid);
        tfsid.val[1] = mtype;
-       if (mountlist.cqh_first != (void *)&mountlist) {
+       if (!CIRCLEQ_EMPTY(&mountlist)) {
                while (vfs_getvfs(&tfsid)) {
                        tfsid.val[0]++;
                        xxxfs_mntid++;
@@ -541,8 +543,8 @@ retry:
                                simple_unlock(&vp->v_interlock);
                                reclaimhits++;
                        } else
-                       break;
-       }
+                               break;
+               }
        }
 
        /*
@@ -591,15 +593,37 @@ retry:
        else
                vp->v_ubcinfo = 0;
 
+       if (vp->v_flag & VHASDIRTY)
+               cluster_release(vp);
+
+       // make sure all these fields are cleared out as the
+       // name/parent stuff uses them and assumes they're
+       // cleared to null/0.
+       if (vp->v_scmap != NULL) {
+           panic("getnewvnode: vp @ 0x%x has non-null scmap.\n", vp);
+       }
+       vp->v_un.vu_name = NULL;
+       vp->v_scdirty = 0;
+       vp->v_un1.v_cl.v_pad = 0;
+       
+       
        vp->v_lastr = -1;
        vp->v_ralen = 0;
        vp->v_maxra = 0;
-       vp->v_lastw = 0;
        vp->v_ciosiz = 0;
-       vp->v_cstart = 0;
        vp->v_clen = 0;
        vp->v_socket = 0;
 
+       /* we may have blocked, re-evaluate state */
+       simple_lock(&vnode_free_list_slock);
+       if (VONLIST(vp)) {
+               if (vp->v_usecount == 0)
+                       VREMFREE("getnewvnode", vp);
+                else if (ISSET((vp)->v_flag, VUINACTIVE))
+                       VREMINACTIVE("getnewvnode", vp);
+       }
+       simple_unlock(&vnode_free_list_slock);
+
 done:
        vp->v_flag = VSTANDARD;
        vp->v_type = VNON;
@@ -635,6 +659,20 @@ insmntque(vp, mp)
        simple_unlock(&mntvnode_slock);
 }
 
+__inline void
+vpwakeup(struct vnode *vp)
+{
+       if (vp) {
+               if (--vp->v_numoutput < 0)
+                       panic("vpwakeup: neg numoutput");
+               if ((vp->v_flag & VBWAIT || vp->v_flag & VTHROTTLED)
+                   && vp->v_numoutput <= 0) {
+                       vp->v_flag &= ~(VBWAIT|VTHROTTLED);
+                       wakeup((caddr_t)&vp->v_numoutput);
+               }
+       }
+}
+
 /*
  * Update outstanding I/O count and do wakeup if requested.
  */
@@ -642,19 +680,8 @@ void
 vwakeup(bp)
        register struct buf *bp;
 {
-       register struct vnode *vp;
-
        CLR(bp->b_flags, B_WRITEINPROG);
-       if (vp = bp->b_vp) {
-               if (--vp->v_numoutput < 0)
-                       panic("vwakeup: neg numoutput");
-               if ((vp->v_flag & VBWAIT) && vp->v_numoutput <= 0) {
-                       if (vp->v_numoutput < 0)
-                               panic("vwakeup: neg numoutput 2");
-                       vp->v_flag &= ~VBWAIT;
-                       wakeup((caddr_t)&vp->v_numoutput);
-               }
-       }
+       vpwakeup(bp->b_vp);
 }
 
 /*
@@ -677,12 +704,12 @@ vinvalbuf(vp, flags, cred, p, slpflag, slptimeo)
                if (error = VOP_FSYNC(vp, cred, MNT_WAIT, p)) {
                        return (error);
                }
-               if (vp->v_dirtyblkhd.lh_first != NULL || (vp->v_flag & VHASDIRTY))
-                       panic("vinvalbuf: dirty bufs");
+               if (vp->v_dirtyblkhd.lh_first)
+                       panic("vinvalbuf: dirty bufs (vp 0x%x, bp 0x%x)", vp, vp->v_dirtyblkhd.lh_first);
        }
 
        for (;;) {
-               if ((blist = vp->v_cleanblkhd.lh_first) && flags & V_SAVEMETA)
+               if ((blist = vp->v_cleanblkhd.lh_first) && (flags & V_SAVEMETA))
                        while (blist && blist->b_lblkno < 0)
                                blist = blist->b_vnbufs.le_next;
                if (!blist && (blist = vp->v_dirtyblkhd.lh_first) &&
@@ -694,7 +721,7 @@ vinvalbuf(vp, flags, cred, p, slpflag, slptimeo)
 
                for (bp = blist; bp; bp = nbp) {
                        nbp = bp->b_vnbufs.le_next;
-                       if (flags & V_SAVEMETA && bp->b_lblkno < 0)
+                       if ((flags & V_SAVEMETA) && bp->b_lblkno < 0)
                                continue;
                        s = splbio();
                        if (ISSET(bp->b_flags, B_BUSY)) {
@@ -720,7 +747,13 @@ vinvalbuf(vp, flags, cred, p, slpflag, slptimeo)
                                (void) VOP_BWRITE(bp);
                                break;
                        }
-                       SET(bp->b_flags, B_INVAL);
+
+                       if (bp->b_flags & B_LOCKED) {
+                               panic("vinvalbuf: bp @ 0x%x is locked!", bp);
+                               break;
+                       } else {
+                               SET(bp->b_flags, B_INVAL);
+                       }
                        brelse(bp);
                }
        }
@@ -730,82 +763,6 @@ vinvalbuf(vp, flags, cred, p, slpflag, slptimeo)
        return (0);
 }
 
-/*
- * Associate a buffer with a vnode.
- */
-void
-bgetvp(vp, bp)
-       register struct vnode *vp;
-       register struct buf *bp;
-{
-
-       if (bp->b_vp)
-               panic("bgetvp: not free");
-       VHOLD(vp);
-       bp->b_vp = vp;
-       if (vp->v_type == VBLK || vp->v_type == VCHR)
-               bp->b_dev = vp->v_rdev;
-       else
-               bp->b_dev = NODEV;
-       /*
-        * Insert onto list for new vnode.
-        */
-       bufinsvn(bp, &vp->v_cleanblkhd);
-}
-
-/*
- * Disassociate a buffer from a vnode.
- */
-void
-brelvp(bp)
-       register struct buf *bp;
-{
-       struct vnode *vp;
-
-       if (bp->b_vp == (struct vnode *) 0)
-               panic("brelvp: NULL");
-       /*
-        * Delete from old vnode list, if on one.
-        */
-       if (bp->b_vnbufs.le_next != NOLIST)
-               bufremvn(bp);
-       vp = bp->b_vp;
-       bp->b_vp = (struct vnode *) 0;
-       HOLDRELE(vp);
-}
-
-/*
- * Reassign a buffer from one vnode to another.
- * Used to assign file specific control information
- * (indirect blocks) to the vnode to which they belong.
- */
-void
-reassignbuf(bp, newvp)
-       register struct buf *bp;
-       register struct vnode *newvp;
-{
-       register struct buflists *listheadp;
-
-       if (newvp == NULL) {
-               printf("reassignbuf: NULL");
-               return;
-       }
-       /*
-        * Delete from old vnode list, if on one.
-        */
-       if (bp->b_vnbufs.le_next != NOLIST)
-               bufremvn(bp);
-       /*
-        * If dirty, put on list of dirty buffers;
-        * otherwise insert onto list of clean buffers.
-        */
-       if (ISSET(bp->b_flags, B_DELWRI))
-               listheadp = &newvp->v_dirtyblkhd;
-       else
-               listheadp = &newvp->v_cleanblkhd;
-       bufinsvn(bp, listheadp);
-}
-
 /*
  * Create a vnode for a block device.
  * Used for root filesystem, argdev, and swap areas.
@@ -856,14 +813,13 @@ checkalias(nvp, nvp_rdev, mp)
        struct proc *p = current_proc();        /* XXX */
        struct vnode *vp;
        struct vnode **vpp;
-       struct specinfo * bufhold;
-       int buffree = 1;
+       struct specinfo *specinfop;
 
        if (nvp->v_type != VBLK && nvp->v_type != VCHR)
                return (NULLVP);
 
-       bufhold = (struct specinfo *)_MALLOC_ZONE(sizeof(struct specinfo),
-                       M_VNODE, M_WAITOK);
+       MALLOC_ZONE(specinfop, struct specinfo *, sizeof(struct specinfo),
+                       M_SPECINFO, M_WAITOK);
        vpp = &speclisth[SPECHASH(nvp_rdev)];
 loop:
        simple_lock(&spechash_slock);
@@ -886,8 +842,8 @@ loop:
                break;
        }
        if (vp == NULL || vp->v_tag != VT_NON) {
-               nvp->v_specinfo = bufhold;
-               buffree = 0;    /* buffer used */
+               nvp->v_specinfo = specinfop;
+               specinfop = 0;  /* buffer used */
                bzero(nvp->v_specinfo, sizeof(struct specinfo));
                nvp->v_rdev = nvp_rdev;
                nvp->v_hashchain = vpp;
@@ -911,8 +867,8 @@ loop:
        vp->v_tag = nvp->v_tag;
        nvp->v_type = VNON;
        insmntque(vp, mp);
-       if (buffree)
-               _FREE_ZONE((void *)bufhold, sizeof (struct specinfo), M_VNODE);
+       if (specinfop)
+               FREE_ZONE((void *)specinfop, sizeof(struct specinfo), M_SPECINFO);
        return (vp);
 }
 
@@ -933,6 +889,11 @@ vget(vp, flags, p)
        struct proc *p;
 {
        int error = 0;
+       u_long vpid;
+
+       vpid = vp->v_id;    // save off the original v_id
+
+retry:
 
        /*
         * If the vnode is in the process of being cleaned out for
@@ -956,26 +917,28 @@ vget(vp, flags, p)
        if (ISSET(vp->v_flag, VTERMINATE)) {
                SET(vp->v_flag, VTERMWANT);
                simple_unlock(&vp->v_interlock);
-               (void)tsleep((caddr_t)&vp->v_ubcinfo, PINOD, "vclean", 0);
+               (void)tsleep((caddr_t)&vp->v_ubcinfo, PINOD, "vget1", 0);
                return (ENOENT);
        }
 
+       /*
+        * if the vnode is being initialized,
+        * wait for it to finish initialization
+        */
+       if (ISSET(vp->v_flag,  VUINIT)) {
+               SET(vp->v_flag, VUWANT);
+               simple_unlock(&vp->v_interlock);
+               (void) tsleep((caddr_t)vp, PINOD, "vget2", 0);
+               goto retry;
+       }
+
        simple_lock(&vnode_free_list_slock);
-       if (vp->v_usecount == 0) {
-               /* If on the free list, remove it from there */
-               if (VONLIST(vp))
+       if (VONLIST(vp)) {
+               if (vp->v_usecount == 0)
                        VREMFREE("vget", vp);
-       } else {
-               /* If on the inactive list, remove it from there */
-               if ((vp->v_usecount == 1) && UBCINFOEXISTS(vp)) {
-                       if (VONLIST(vp))
-                               VREMINACTIVE("vget", vp);
-               }
+                else if (ISSET((vp)->v_flag, VUINACTIVE))
+                       VREMINACTIVE("vget", vp);
        }
-
-       /* The vnode should not be on the inactive list here */
-       VINACTIVECHECK("vget", vp, 0);
-
        simple_unlock(&vnode_free_list_slock);
 
        if (++vp->v_usecount <= 0)
@@ -986,7 +949,7 @@ vget(vp, flags, p)
         */
        if (UBCISVALID(vp) && !ubc_issetflags(vp, UI_HASOBJREF)) {
                simple_unlock(&vp->v_interlock);
-               if (ubc_getobject(vp, UBC_HOLDOBJECT)) {
+               if (ubc_getobject(vp, UBC_HOLDOBJECT) == MEMORY_OBJECT_CONTROL_NULL) {
                        error = ENOENT;
                        goto errout;
                }
@@ -996,21 +959,44 @@ vget(vp, flags, p)
        if (flags & LK_TYPE_MASK) {
                if (error = vn_lock(vp, flags | LK_INTERLOCK, p))
                        goto errout;
+               if (vpid != vp->v_id) {    // make sure it's still the same vnode
+                   vput(vp);
+                   return ENOENT;
+               }
                return (0);
        }
 
        if ((flags & LK_INTERLOCK) == 0)
                simple_unlock(&vp->v_interlock);
+
+       if (vpid != vp->v_id) {            // make sure it's still the same vnode
+           vrele(vp);
+           return ENOENT;
+       }
+
        return (0);
 
 errout:
+       simple_lock(&vp->v_interlock);
+
+       /*
+        * we may have blocked. Re-evaluate the state
+        */
+       simple_lock(&vnode_free_list_slock);
+       if (VONLIST(vp)) {
+               if (vp->v_usecount == 0)
+                       VREMFREE("vget", vp);
+                else if (ISSET((vp)->v_flag, VUINACTIVE))
+                       VREMINACTIVE("vget", vp);
+       }
+       simple_unlock(&vnode_free_list_slock);
+
        /*
         * If the vnode was not active in the first place
         * must not call vrele() as VOP_INACTIVE() is not
         * required.
         * So inlined part of vrele() here.
         */
-       simple_lock(&vp->v_interlock);
        if (--vp->v_usecount == 1) {
                if (UBCINFOEXISTS(vp)) {
                        vinactive(vp);
@@ -1033,7 +1019,7 @@ errout:
  * Get a pager reference on the particular vnode.
  *
  * This is called from ubc_info_init() and it is asumed that
- * the vnode is neither on the free list on on the inactive list.
+ * the vnode is not on the free list.
  * It is also assumed that the vnode is neither being recycled
  * by vgonel nor being terminated by vnode_pager_vrele().
  *
@@ -1044,25 +1030,22 @@ vnode_pager_vget(vp)
        struct vnode *vp;
 {
        simple_lock(&vp->v_interlock);
-       if (UBCINFOMISSING(vp))
-               panic("vnode_pager_vget: stolen ubc_info");
 
-       if (!UBCINFOEXISTS(vp))
-               panic("vnode_pager_vget: lost ubc_info");
+       UBCINFOCHECK("vnode_pager_vget", vp);
 
-       if ((vp->v_flag & VXLOCK) || (vp->v_flag & VORECLAIM))
-               panic("vnode_pager_vget: already being reclaimd");
-
-       if (ISSET(vp->v_flag, VTERMINATE))
-               panic("vnode_pager_vget: already being terminated");
+       if (ISSET(vp->v_flag, (VXLOCK|VORECLAIM|VTERMINATE)))
+               panic("%s: dying vnode", "vnode_pager_vget");
 
        simple_lock(&vnode_free_list_slock);
-       /* The vnode should not be on ANY list */
-       if (VONLIST(vp))
-               panic("vnode_pager_vget: still on the list");
+       /* The vnode should not be on free list */
+       if (VONLIST(vp)) {     
+               if (vp->v_usecount == 0)
+                       panic("%s: still on list", "vnode_pager_vget");
+               else if (ISSET((vp)->v_flag, VUINACTIVE))
+                       VREMINACTIVE("vnode_pager_vget", vp);
+       }
 
        /* The vnode should not be on the inactive list here */
-       VINACTIVECHECK("vnode_pager_vget", vp, 0);
        simple_unlock(&vnode_free_list_slock);
 
        /* After all those checks, now do the real work :-) */
@@ -1108,8 +1091,8 @@ vop_nolock(ap)
        if (vp->v_vnlock == NULL) {
                if ((flags & LK_TYPE_MASK) == LK_DRAIN)
                        return (0);
-               MALLOC_ZONE(vp->v_vnlock, struct lock__bsd__ *,
-                               sizeof(struct lock__bsd__), M_VNODE, M_WAITOK);
+               MALLOC(vp->v_vnlock, struct lock__bsd__ *,
+                               sizeof(struct lock__bsd__), M_TEMP, M_WAITOK);
                lockinit(vp->v_vnlock, PVFS, "vnlock", 0, 0);
        }
        switch (flags & LK_TYPE_MASK) {
@@ -1189,21 +1172,41 @@ vref(vp)
                panic("vref used where vget required");
 
        /* If on the inactive list, remove it from there */
-       if ((vp->v_usecount == 1) && UBCINFOEXISTS(vp)) {
-               if (VONLIST(vp)) {
-                       simple_lock(&vnode_free_list_slock);
-                       VREMINACTIVE("vref", vp);
-                       simple_unlock(&vnode_free_list_slock);
-               }
-       }
-       /* The vnode should not be on the inactive list here */
-       VINACTIVECHECK("vref", vp, 0);
+       simple_lock(&vnode_free_list_slock);
+       if (ISSET((vp)->v_flag, VUINACTIVE))
+               VREMINACTIVE("vref", vp);
+       simple_unlock(&vnode_free_list_slock);
 
        if (++vp->v_usecount <= 0)
                panic("vref v_usecount");                     
        simple_unlock(&vp->v_interlock);
 }
 
+static void
+clean_up_name_parent_ptrs(struct vnode *vp)
+{
+    if (VNAME(vp) || VPARENT(vp)) {
+       char *tmp1;
+       struct vnode *tmp2;
+
+       // do it this way so we don't block before clearing 
+       // these fields.
+       tmp1 = VNAME(vp);
+       tmp2 = VPARENT(vp);
+       VNAME(vp) = NULL;
+       VPARENT(vp) = NULL;
+           
+       if (tmp1) {
+           remove_name(tmp1);
+       }
+           
+       if (tmp2) {
+           vrele(tmp2);
+       }
+    }
+}
+
+
 /*
  * put the vnode on appropriate free list.
  * called with v_interlock held.
@@ -1212,6 +1215,13 @@ static void
 vfree(vp)
        struct vnode *vp;
 {
+       funnel_t *curflock;
+       extern int disable_funnel;
+
+       if ((curflock = thread_funnel_get()) != kernel_flock &&
+           !(disable_funnel && curflock != THR_FUNNEL_NULL))
+               panic("Entering vfree() without kernel funnel");
+
        /*
         * if the vnode is not obtained by calling getnewvnode() we
         * are not responsible for the cleanup. Just return.
@@ -1226,8 +1236,11 @@ vfree(vp)
        /* insert at tail of LRU list or at head if VAGE is set */
        simple_lock(&vnode_free_list_slock);
 
+       // make sure the name & parent pointers get cleared out
+//     clean_up_name_parent_ptrs(vp);
+
        if (VONLIST(vp))
-                panic("vfree: vnode still on list");
+                panic("%s: vnode still on list", "vfree");
 
        if (vp->v_flag & VAGE) {
                TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
@@ -1247,6 +1260,13 @@ static void
 vinactive(vp)
        struct vnode *vp;
 {
+       funnel_t *curflock;
+       extern int disable_funnel;
+
+       if ((curflock = thread_funnel_get()) != kernel_flock &&
+           !(disable_funnel && curflock != THR_FUNNEL_NULL))
+               panic("Entering vinactive() without kernel funnel");
+
        if (!UBCINFOEXISTS(vp))
                panic("vinactive: not a UBC vnode");
 
@@ -1256,7 +1276,7 @@ vinactive(vp)
        simple_lock(&vnode_free_list_slock);
 
        if (VONLIST(vp))
-                panic("vinactive: vnode still on list");
+                panic("%s: vnode still on list", "vinactive");
        VINACTIVECHECK("vinactive", vp, 0);
 
        TAILQ_INSERT_TAIL(&vnode_inactive_list, vp, v_freelist);
@@ -1299,8 +1319,10 @@ vput(vp)
                        vp->v_usecount, vp->v_writecount);
        }
 #endif
-       if (ISSET((vp)->v_flag, VUINACTIVE) && VONLIST(vp))
-               VREMINACTIVE("vrele", vp);
+       simple_lock(&vnode_free_list_slock);
+       if (ISSET((vp)->v_flag, VUINACTIVE))
+               VREMINACTIVE("vref", vp);
+       simple_unlock(&vnode_free_list_slock);
 
        simple_unlock(&vp->v_interlock);
        VOP_INACTIVE(vp, p);
@@ -1329,11 +1351,18 @@ vrele(vp)
        struct vnode *vp;
 {
        struct proc *p = current_proc();        /* XXX */
+       funnel_t *curflock;
+       extern int disable_funnel;
+
+       if ((curflock = thread_funnel_get()) != kernel_flock &&
+           !(disable_funnel && curflock != THR_FUNNEL_NULL))
+               panic("Entering vrele() without kernel funnel");
 
        simple_lock(&vp->v_interlock);
        if (--vp->v_usecount == 1) {
                if (UBCINFOEXISTS(vp)) {
-                       vinactive(vp);
+                       if ((vp->v_flag & VXLOCK) == 0)
+                               vinactive(vp);
                        simple_unlock(&vp->v_interlock);
                        return;
                }
@@ -1348,9 +1377,6 @@ vrele(vp)
                panic("vrele: ref cnt");
        }
 #endif
-       if (ISSET((vp)->v_flag, VUINACTIVE) && VONLIST(vp))
-               VREMINACTIVE("vrele", vp);
-
 
        if ((vp->v_flag & VXLOCK) || (vp->v_flag & VORECLAIM)) {
                /* vnode is being cleaned, just return */
@@ -1462,9 +1488,9 @@ loop:
 
                simple_lock(&vp->v_interlock);
                /*
-                * Skip over a vnodes marked VSYSTEM.
+                * Skip over a vnodes marked VSYSTEM or VNOFLUSH.
                 */
-               if ((flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM)) {
+               if ((flags & SKIPSYSTEM) && ((vp->v_flag & VSYSTEM) || (vp->v_flag & VNOFLUSH))) {
                        simple_unlock(&vp->v_interlock);
                        continue;
                }
@@ -1519,7 +1545,7 @@ loop:
                busy++;
        }
        simple_unlock(&mntvnode_slock);
-       if (busy)
+       if (busy && ((flags & FORCECLOSE)==0))
                return (EBUSY);
        return (0);
 }
@@ -1535,9 +1561,6 @@ vclean(vp, flags, p)
        struct proc *p;
 {
        int active;
-       void *obj;
-       kern_return_t kret;
-       int removed = 0;
        int didhold;
 
        /*
@@ -1555,9 +1578,23 @@ vclean(vp, flags, p)
         * so that its count cannot fall to zero and generate a
         * race against ourselves to recycle it.
         */
-       if (active = vp->v_usecount)
+       if (active = vp->v_usecount) {
+               /*
+                * active vnode can not be on the free list.
+                * we are about to take an extra reference on this vnode
+                * do the queue management as needed
+                * Not doing so can cause "still on list" or
+                * "vnreclaim: v_usecount" panic if VOP_LOCK() blocks.
+                */
+               simple_lock(&vnode_free_list_slock);
+               if (ISSET((vp)->v_flag, VUINACTIVE))
+                       VREMINACTIVE("vclean", vp);
+               simple_unlock(&vnode_free_list_slock);
+
                if (++vp->v_usecount <= 0)
                        panic("vclean: v_usecount");
+       }
+
        /*
         * Prevent the vnode from being recycled or
         * brought into use while we clean it out.
@@ -1576,16 +1613,15 @@ vclean(vp, flags, p)
        VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, p);
 
        /*
+        * While blocked in VOP_LOCK() someone could have dropped
+        * reference[s] and we could land on the inactive list.
         * if this vnode is on the inactive list 
         * take it off the list.
         */
-       if ((active == 1) && 
-               (ISSET((vp)->v_flag, VUINACTIVE) && VONLIST(vp))) {
-               simple_lock(&vnode_free_list_slock);
+       simple_lock(&vnode_free_list_slock);
+       if (ISSET((vp)->v_flag, VUINACTIVE))
                VREMINACTIVE("vclean", vp);
-               simple_unlock(&vnode_free_list_slock);
-               removed++;
-       }
+       simple_unlock(&vnode_free_list_slock);
 
        /* Clean the pages in VM. */
        if (active && (flags & DOCLOSE))
@@ -1601,10 +1637,10 @@ vclean(vp, flags, p)
         */
        if (flags & DOCLOSE) {
                if (vp->v_tag == VT_NFS)
-            nfs_vinvalbuf(vp, V_SAVE, NOCRED, p, 0);
-        else
-            vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0);
-    }
+                       nfs_vinvalbuf(vp, V_SAVE, NOCRED, p, 0);
+               else
+                       vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0);
+       }
 
        if (active)
                VOP_INACTIVE(vp, p);
@@ -1612,28 +1648,43 @@ vclean(vp, flags, p)
                VOP_UNLOCK(vp, 0, p);
 
        /* Destroy ubc named reference */
-    if (didhold) {
-        ubc_rele(vp);
+       if (didhold) {
+               ubc_rele(vp);
                ubc_destroy_named(vp);
        }
+       /*
+        * Make sure vp isn't on the inactive list.
+        */
+       simple_lock(&vnode_free_list_slock);
+       if (ISSET((vp)->v_flag, VUINACTIVE)) {
+               VREMINACTIVE("vclean", vp);
+       }
+       simple_unlock(&vnode_free_list_slock);
 
        /*
         * Reclaim the vnode.
         */
        if (VOP_RECLAIM(vp, p))
                panic("vclean: cannot reclaim");
+       
+       // make sure the name & parent ptrs get cleaned out!
+       clean_up_name_parent_ptrs(vp);
+
        cache_purge(vp);
        if (vp->v_vnlock) {
-               if ((vp->v_vnlock->lk_flags & LK_DRAINED) == 0)
+               struct lock__bsd__ *tmp = vp->v_vnlock;
+               if ((tmp->lk_flags & LK_DRAINED) == 0)
                        vprint("vclean: lock not drained", vp);
-               FREE_ZONE(vp->v_vnlock, sizeof (struct lock__bsd__), M_VNODE);
                vp->v_vnlock = NULL;
+               FREE(tmp, M_TEMP);
        }
 
        /* It's dead, Jim! */
        vp->v_op = dead_vnodeop_p;
        vp->v_tag = VT_NON;
 
+       insmntque(vp, (struct mount *)0);
+
        /*
         * Done with purge, notify sleepers of the grim news.
         */
@@ -1825,8 +1876,11 @@ vgonel(vp, p)
                        vp->v_flag &= ~VALIASED;
                }
                simple_unlock(&spechash_slock);
-               FREE_ZONE(vp->v_specinfo, sizeof (struct specinfo), M_VNODE);
+               {
+               struct specinfo *tmp = vp->v_specinfo;
                vp->v_specinfo = NULL;
+               FREE_ZONE((void *)tmp, sizeof(struct specinfo), M_SPECINFO);
+               }
        }
        /*
         * If it is on the freelist and not already at the head,
@@ -1841,7 +1895,7 @@ vgonel(vp, p)
         * getnewvnode after removing it from the freelist to ensure
         * that we do not try to move it here.
         */
-       if (vp->v_usecount == 0) {
+       if (vp->v_usecount == 0 && (vp->v_flag & VUINACTIVE) == 0) {
                simple_lock(&vnode_free_list_slock);
                if ((vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb) &&
                    vnode_free_list.tqh_first != vp) {
@@ -1936,6 +1990,8 @@ vprint(label, vp)
                strcat(buf, "|VTEXT");
        if (vp->v_flag & VSYSTEM)
                strcat(buf, "|VSYSTEM");
+       if (vp->v_flag & VNOFLUSH)
+               strcat(buf, "|VNOFLUSH");
        if (vp->v_flag & VXLOCK)
                strcat(buf, "|VXLOCK");
        if (vp->v_flag & VXWANT)
@@ -1987,6 +2043,74 @@ printlockedvnodes()
 }
 #endif
 
+static int
+build_path(struct vnode *vp, char *buff, int buflen, int *outlen)
+{
+    char *end, *str;
+    int   i, len, ret=0, counter=0;
+
+    end = &buff[buflen-1];
+    *--end = '\0';
+
+    while(vp && VPARENT(vp) != vp) {
+       // the maximum depth of a file system hierarchy is MAXPATHLEN/2
+       // (with single-char names separated by slashes).  we panic if
+       // we've ever looped more than that.
+       if (counter++ > MAXPATHLEN/2) {
+           panic("build_path: vnode parent chain is too long! vp 0x%x\n", vp);
+       }
+       str = VNAME(vp);
+       if (VNAME(vp) == NULL) {
+           if (VPARENT(vp) != NULL) {
+               ret = EINVAL;
+           }
+           break;
+       }
+       
+       // count how long the string is
+       for(len=0; *str; str++, len++)
+           /* nothing */;
+
+       // check that there's enough space
+       if ((end - buff) < len) {
+           ret = ENOSPC;
+           break;
+       }
+
+       // copy it backwards
+       for(; len > 0; len--) {
+           *--end = *--str;
+       }
+
+       // put in the path separator
+       *--end = '/';
+
+       // walk up the chain.  
+       vp = VPARENT(vp);
+
+       // check if we're crossing a mount point and
+       // switch the vp if we are.
+       if (vp && (vp->v_flag & VROOT)) {
+           vp = vp->v_mount->mnt_vnodecovered;
+       }
+    }
+
+    // slide it down to the beginning of the buffer
+    memmove(buff, end, &buff[buflen] - end);
+    
+    *outlen = &buff[buflen] - end;
+    return ret;
+}
+
+__private_extern__ int
+vn_getpath(struct vnode *vp, char *pathbuf, int *len)
+{
+    return build_path(vp, pathbuf, *len, len);
+}
+
+
+
 /*
  * Top level filesystem related information gathering.
  */
@@ -2000,17 +2124,29 @@ vfs_sysctl(name, namelen, oldp, oldlenp, newp, newlen, p)
        size_t newlen;
        struct proc *p;
 {
-       struct ctldebug *cdp;
        struct vfsconf *vfsp;
+       int *username;
+       u_int usernamelen;
+       int error;
 
-       if (name[0] == VFS_NUMMNTOPS) {
+       /*
+        * The VFS_NUMMNTOPS shouldn't be at name[0] since
+        * it is a VFS generic variable. So now we must check
+        * namelen so we don't end up covering any UFS
+        * variables (since UFS vfc_typenum is 1).
+        *
+        * It should have been:
+        *    name[0]:  VFS_GENERIC
+        *    name[1]:  VFS_NUMMNTOPS
+        */
+       if (namelen == 1 && name[0] == VFS_NUMMNTOPS) {
                extern unsigned int vfs_nummntops;
                return (sysctl_rdint(oldp, oldlenp, newp, vfs_nummntops));
        }
 
        /* all sysctl names at this level are at least name and field */
        if (namelen < 2)
-               return (ENOTDIR);               /* overloaded */
+               return (EISDIR);                /* overloaded */
        if (name[0] != VFS_GENERIC) {
                for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
                        if (vfsp->vfc_typenum == name[0])
@@ -2034,7 +2170,19 @@ vfs_sysctl(name, namelen, oldp, oldlenp, newp, newlen, p)
                return (sysctl_rdstruct(oldp, oldlenp, newp, vfsp,
                    sizeof(struct vfsconf)));
        }
-       return (EOPNOTSUPP);
+       /*
+        * We need to get back into the general MIB, so we need to re-prepend
+        * CTL_VFS to our name and try userland_sysctl().
+        */
+       usernamelen = namelen + 1;
+       MALLOC(username, int *, usernamelen * sizeof(*username),
+           M_TEMP, M_WAITOK);
+       bcopy(name, username + 1, namelen * sizeof(*name));
+       username[0] = CTL_VFS;
+       error = userland_sysctl(p, username, usernamelen, oldp, oldlenp, 1,
+           newp, newlen, oldlenp);
+       FREE(username, M_TEMP);
+       return (error);
 }
 
 int kinfo_vdebug = 1;
@@ -2091,13 +2239,16 @@ again:
                        nvp = vp->v_mntvnodes.le_next;
                        if (bp + VPTRSZ + VNODESZ > ewhere) {
                                simple_unlock(&mntvnode_slock);
+                               vfs_unbusy(mp, p);
                                *sizep = bp - where;
                                return (ENOMEM);
                        }
                        simple_unlock(&mntvnode_slock);
                        if ((error = copyout((caddr_t)&vp, bp, VPTRSZ)) ||
-                          (error = copyout((caddr_t)vp, bp + VPTRSZ, VNODESZ)))
+                           (error = copyout((caddr_t)vp, bp + VPTRSZ, VNODESZ))) {
+                               vfs_unbusy(mp, p);
                                return (error);
+                       }
                        bp += VPTRSZ + VNODESZ;
                        simple_lock(&mntvnode_slock);
                }
@@ -2358,12 +2509,10 @@ vm_object_cache_reclaim(int count)
 static int
 vnreclaim(int count)
 {
-       int cnt, i, loopcnt;
-       void *obj;
+       int i, loopcnt;
        struct vnode *vp;
        int err;
        struct proc *p;
-       kern_return_t kret;
 
        i = 0;
        loopcnt = 0;
@@ -2412,6 +2561,16 @@ restart:
                        goto restart;
                }
 
+               /*
+                * if the vnode is being initialized,
+                * skip over it
+                */
+               if (ISSET(vp->v_flag,  VUINIT)) {
+                       SET(vp->v_flag, VUWANT);
+                       simple_unlock(&vp->v_interlock);
+                       continue;
+               }
+
                VREMINACTIVE("vnreclaim", vp);
                simple_unlock(&vnode_free_list_slock);
 
@@ -2529,9 +2688,6 @@ vnode_pager_vrele(struct vnode *vp)
        boolean_t       funnel_state;
        int isvnreclaim = 1;
 
-       if (vp == (struct vnode *) NULL) 
-               panic("vnode_pager_vrele: null vp");
-
        funnel_state = thread_funnel_set(kernel_flock, TRUE);
 
        /* Mark the vnode to be recycled */
@@ -2568,6 +2724,9 @@ vnode_pager_vrele(struct vnode *vp)
        }
        if (!ISSET(vp->v_flag, VTERMINATE))
                SET(vp->v_flag, VTERMINATE);
+
+       cache_purge(vp);
+
        if (UBCINFOEXISTS(vp)) {
                struct ubc_info *uip = vp->v_ubcinfo;
 
@@ -2615,7 +2774,6 @@ int walk_vnodes_debug=0;
 void
 walk_allvnodes()
 {
-       struct proc *p = current_proc();
        struct mount *mp, *nmp;
        struct vnode *vp;
        int cnt = 0;
@@ -2654,6 +2812,14 @@ walk_allvnodes()
 }
 #endif /* DIAGNOSTIC */
 
+
+/*
+ * Extended per-mount I/O constraints, hung off mp->mnt_xinfo_ptr when
+ * MNTK_IO_XINFO is set in mnt_kern_flag (allocated in
+ * vfs_init_io_attributes, consumed by vfs_io_attributes and
+ * vfs_io_maxsegsize).
+ */
+struct x_constraints {
+        u_int32_t x_maxreadcnt;
+        u_int32_t x_maxsegreadsize;
+        u_int32_t x_maxsegwritesize;
+};
+
+
 void
 vfs_io_attributes(vp, flags, iosize, vectors)
        struct vnode    *vp;
@@ -2671,7 +2837,10 @@ vfs_io_attributes(vp, flags, iosize, vectors)
        if (mp != NULL) {
                switch (flags) {
                case B_READ:
-                       *iosize = mp->mnt_maxreadcnt;
+                       if (mp->mnt_kern_flag & MNTK_IO_XINFO)
+                               *iosize = ((struct x_constraints *)(mp->mnt_xinfo_ptr))->x_maxreadcnt;
+                       else
+                               *iosize = mp->mnt_maxreadcnt;
                        *vectors = mp->mnt_segreadcnt;
                        break;
                case B_WRITE:
@@ -2681,12 +2850,62 @@ vfs_io_attributes(vp, flags, iosize, vectors)
                default:
                        break;
                }
+               if (*iosize == 0)
+                       *iosize = MAXPHYS;
+               if (*vectors == 0)
+                       *vectors = 32;
        }
-
        return;
 }
 
-#include <dev/disk.h>
+/*
+ * Report the maximum I/O segment size (in bytes) for the mount backing
+ * 'vp', for the direction selected by 'flags' (B_READ or B_WRITE).
+ * Falls back to MAXPHYS when there is no mount or no recorded limit.
+ */
+__private_extern__
+void
+vfs_io_maxsegsize(vp, flags, maxsegsize)
+       struct vnode    *vp;
+       int     flags;  /* B_READ or B_WRITE */
+       int     *maxsegsize;
+{
+       struct mount *mp;
+
+       /* start with "reasonable" default */
+       *maxsegsize = MAXPHYS;
+
+       mp = vp->v_mount;
+       if (mp != NULL) {
+               switch (flags) {
+               case B_READ:
+                       if (mp->mnt_kern_flag & MNTK_IO_XINFO)
+                               *maxsegsize = ((struct x_constraints *)(mp->mnt_xinfo_ptr))->x_maxsegreadsize;
+                       else
+                               /*
+                                * if the extended info doesn't exist
+                                * then use the maxread I/O size as the 
+                                * max segment size... this is the previous behavior
+                                */
+                               *maxsegsize = mp->mnt_maxreadcnt;
+                       break;
+               case B_WRITE:
+                       if (mp->mnt_kern_flag & MNTK_IO_XINFO)
+                               *maxsegsize = ((struct x_constraints *)(mp->mnt_xinfo_ptr))->x_maxsegwritesize;
+                       else
+                               /*
+                                * if the extended info doesn't exist
+                                * then use the maxwrite I/O size as the 
+                                * max segment size... this is the previous behavior
+                                */
+                               *maxsegsize = mp->mnt_maxwritecnt;
+                       break;
+               default:
+                       break;
+               }
+               /* a recorded limit of 0 means "unlimited" -> cap at MAXPHYS */
+               if (*maxsegsize == 0)
+                       *maxsegsize = MAXPHYS;
+       }
+}
+
+
+#include <sys/disk.h>
+
 
 int
 vfs_init_io_attributes(devvp, mp)
@@ -2696,8 +2915,12 @@ vfs_init_io_attributes(devvp, mp)
        int error;
        off_t readblockcnt;
        off_t writeblockcnt;
+       off_t readmaxcnt;
+       off_t writemaxcnt;
        off_t readsegcnt;
        off_t writesegcnt;
+       off_t readsegsize;
+       off_t writesegsize;
        u_long blksize;
 
        u_int64_t temp;
@@ -2705,6 +2928,32 @@ vfs_init_io_attributes(devvp, mp)
        struct proc *p = current_proc();
        struct  ucred *cred = p->p_ucred;
 
+       int isvirtual = 0;
+       /*
+        * determine if this mount point exists on the same device as the root
+        * partition... if so, then it comes under the hard throttle control
+        */
+       int        thisunit = -1;
+       static int rootunit = -1;
+       extern struct vnode *rootvp;
+
+       if (rootunit == -1) {
+               if (VOP_IOCTL(rootvp, DKIOCGETBSDUNIT, (caddr_t)&rootunit, 0, cred, p))
+                       rootunit = -1; 
+               else if (rootvp == devvp)
+                       mp->mnt_kern_flag |= MNTK_ROOTDEV;
+       }
+       if (devvp != rootvp && rootunit != -1) {
+               if (VOP_IOCTL(devvp, DKIOCGETBSDUNIT, (caddr_t)&thisunit, 0, cred, p) == 0) {
+                       if (thisunit == rootunit)
+                               mp->mnt_kern_flag |= MNTK_ROOTDEV;
+               }
+       }
+       if (VOP_IOCTL(devvp, DKIOCGETISVIRTUAL, (caddr_t)&isvirtual, 0, cred, p) == 0) {
+               if (isvirtual)
+                       mp->mnt_kern_flag |= MNTK_VIRTUALDEV;
+       }
+
        if ((error = VOP_IOCTL(devvp, DKIOCGETMAXBLOCKCOUNTREAD,
                                (caddr_t)&readblockcnt, 0, cred, p)))
                return (error);
@@ -2713,6 +2962,14 @@ vfs_init_io_attributes(devvp, mp)
                                (caddr_t)&writeblockcnt, 0, cred, p)))
                return (error);
 
+       if ((error = VOP_IOCTL(devvp, DKIOCGETMAXBYTECOUNTREAD,
+                               (caddr_t)&readmaxcnt, 0, cred, p)))
+               return (error);
+
+       if ((error = VOP_IOCTL(devvp, DKIOCGETMAXBYTECOUNTWRITE,
+                               (caddr_t)&writemaxcnt, 0, cred, p)))
+               return (error);
+
        if ((error = VOP_IOCTL(devvp, DKIOCGETMAXSEGMENTCOUNTREAD,
                                (caddr_t)&readsegcnt, 0, cred, p)))
                return (error);
@@ -2721,32 +2978,315 @@ vfs_init_io_attributes(devvp, mp)
                                (caddr_t)&writesegcnt, 0, cred, p)))
                return (error);
 
+       if ((error = VOP_IOCTL(devvp, DKIOCGETMAXSEGMENTBYTECOUNTREAD,
+                               (caddr_t)&readsegsize, 0, cred, p)))
+               return (error);
+
+       if ((error = VOP_IOCTL(devvp, DKIOCGETMAXSEGMENTBYTECOUNTWRITE,
+                               (caddr_t)&writesegsize, 0, cred, p)))
+               return (error);
+
        if ((error = VOP_IOCTL(devvp, DKIOCGETBLOCKSIZE,
                                (caddr_t)&blksize, 0, cred, p)))
                return (error);
 
-       temp = readblockcnt * blksize;
-       temp = (temp > UINT32_MAX) ? (UINT32_MAX / blksize) * blksize : temp;
-       mp->mnt_maxreadcnt = (u_int32_t)temp;
 
-       temp = writeblockcnt * blksize;
-       temp = (temp > UINT32_MAX) ? (UINT32_MAX / blksize) * blksize : temp;
+        if ( !(mp->mnt_kern_flag & MNTK_IO_XINFO)) {
+               MALLOC(mp->mnt_xinfo_ptr, void *, sizeof(struct x_constraints), M_TEMP, M_WAITOK);
+               mp->mnt_kern_flag |= MNTK_IO_XINFO;
+       }
+
+       if (readmaxcnt)
+               temp = (readmaxcnt > UINT32_MAX) ? UINT32_MAX : readmaxcnt;
+       else {
+               if (readblockcnt) {
+                       temp = readblockcnt * blksize;
+                       temp = (temp > UINT32_MAX) ? UINT32_MAX : temp;
+               } else
+                       temp = MAXPHYS;
+       }
+       ((struct x_constraints *)(mp->mnt_xinfo_ptr))->x_maxreadcnt = (u_int32_t)temp;
+
+       if (writemaxcnt)
+               temp = (writemaxcnt > UINT32_MAX) ? UINT32_MAX : writemaxcnt;
+       else {
+               if (writeblockcnt) {
+                       temp = writeblockcnt * blksize;
+                       temp = (temp > UINT32_MAX) ? UINT32_MAX : temp;
+               } else
+                       temp = MAXPHYS;
+       }
        mp->mnt_maxwritecnt = (u_int32_t)temp;
 
-       temp = (readsegcnt > UINT16_MAX) ? UINT16_MAX : readsegcnt;
-       mp->mnt_segreadcnt = (u_int16_t)temp;
+       if (readsegcnt) {
+               temp = (readsegcnt > UINT16_MAX) ? UINT16_MAX : readsegcnt;
+               mp->mnt_segreadcnt = (u_int16_t)temp;
+       }
+       if (writesegcnt) {
+               temp = (writesegcnt > UINT16_MAX) ? UINT16_MAX : writesegcnt;
+               mp->mnt_segwritecnt = (u_int16_t)temp;
+       }
+       if (readsegsize)
+               temp = (readsegsize > UINT32_MAX) ? UINT32_MAX : readsegsize;
+       else
+               temp = mp->mnt_maxreadcnt;
+       ((struct x_constraints *)(mp->mnt_xinfo_ptr))->x_maxsegreadsize = (u_int32_t)temp;
 
-       temp = (writesegcnt > UINT16_MAX) ? UINT16_MAX : writesegcnt;
-       mp->mnt_segwritecnt = (u_int16_t)temp;
+       if (writesegsize)
+               temp = (writesegsize > UINT32_MAX) ? UINT32_MAX : writesegsize;
+       else
+               temp = mp->mnt_maxwritecnt;
+       ((struct x_constraints *)(mp->mnt_xinfo_ptr))->x_maxsegwritesize = (u_int32_t)temp;
 
-#if 0
-       printf("--- IO attributes for mount point 0x%08x ---\n", mp);
-       printf("\tmnt_maxreadcnt = 0x%x", mp->mnt_maxreadcnt);
-       printf("\tmnt_maxwritecnt = 0x%x\n", mp->mnt_maxwritecnt);
-       printf("\tmnt_segreadcnt = 0x%x", mp->mnt_segreadcnt);
-       printf("\tmnt_segwritecnt = 0x%x\n", mp->mnt_segwritecnt);
-#endif /* 0 */
+       return (error);
+}
+
+static struct klist fs_klist;
+
+/* One-time initialization of the knote list used for fs events. */
+void
+vfs_event_init(void)
+{
+
+       klist_init(&fs_klist);
+}
+
+/*
+ * Post a filesystem event to all kqueue listeners on fs_klist.
+ * NOTE(review): only 'event' is delivered (as the knote hint); the
+ * 'fsid' and 'data' arguments are currently ignored.
+ */
+void
+vfs_event_signal(fsid_t *fsid, u_int32_t event, intptr_t data)
+{
+
+       KNOTE(&fs_klist, event);
+}
+
+/*
+ * return the number of mounted filesystems.
+ */
+static int
+sysctl_vfs_getvfscnt(void)
+{
+       struct mount *mp;
+       int ret = 0;
+
+       /* walk the global mount list under mountlist_slock, counting entries */
+       simple_lock(&mountlist_slock);
+       CIRCLEQ_FOREACH(mp, &mountlist, mnt_list)
+           ret++;
+       simple_unlock(&mountlist_slock);
+       return (ret);
+}
+
+/*
+ * fill in the array of fsid_t's up to a max of 'count', the actual
+ * number filled in will be set in '*actual'.  If there are more fsid_t's
+ * than room in fsidlst then ENOMEM will be returned and '*actual' will
+ * have the actual count.
+ * having *actual filled out even in the error case is depended upon.
+ */
+static int
+sysctl_vfs_getvfslist(fsid_t *fsidlst, int count, int *actual)
+{
+       struct mount *mp;
+
+       *actual = 0;
+       simple_lock(&mountlist_slock);
+       CIRCLEQ_FOREACH(mp, &mountlist, mnt_list) {
+               (*actual)++;
+               /*
+                * keep counting even after running out of room so the
+                * caller learns the true total via *actual (see header
+                * comment: this is depended upon in the ENOMEM case)
+                */
+               if (*actual <= count)
+                       fsidlst[(*actual) - 1] = mp->mnt_stat.f_fsid;
+       }
+       simple_unlock(&mountlist_slock);
+       return (*actual <= count ? 0 : ENOMEM);
+}
+
+/*
+ * sysctl handler: copy out the fsid_t of every mounted filesystem.
+ * Read-only; retries if a new mount appears while we slept in MALLOC.
+ */
+static int
+sysctl_vfs_vfslist SYSCTL_HANDLER_ARGS
+{
+       int actual, error;
+       size_t space;
+       fsid_t *fsidlst;
+
+       /* This is a readonly node. */
+       if (req->newptr != NULL)
+               return (EPERM);
+
+       /* they are querying us so just return the space required. */
+       if (req->oldptr == NULL) {
+               req->oldidx = sysctl_vfs_getvfscnt() * sizeof(fsid_t);
+               return 0;
+       }
+again:
+       /*
+        * Retrieve an accurate count of the amount of space required to copy
+        * out all the fsids in the system.
+        */
+       space = req->oldlen;
+       req->oldlen = sysctl_vfs_getvfscnt() * sizeof(fsid_t);
+
+       /* they didn't give us enough space. */
+       if (space < req->oldlen)
+               return (ENOMEM);
 
+       MALLOC(fsidlst, fsid_t *, req->oldlen, M_TEMP, M_WAITOK);
+       error = sysctl_vfs_getvfslist(fsidlst, req->oldlen / sizeof(fsid_t),
+           &actual);
+       /*
+        * If we get back ENOMEM, then another mount has been added while we
+        * slept in malloc above.  If this is the case then try again.
+        */
+       if (error == ENOMEM) {
+               FREE(fsidlst, M_TEMP);
+               req->oldlen = space;
+               goto again;
+       }
+       if (error == 0) {
+               error = SYSCTL_OUT(req, fsidlst, actual * sizeof(fsid_t));
+       }
+       FREE(fsidlst, M_TEMP);
+       return (error);
+}
+
+/*
+ * Do a sysctl by fsid.
+ */
+static int
+sysctl_vfs_ctlbyfsid SYSCTL_HANDLER_ARGS
+{
+       struct vfsidctl vc;
+       struct mount *mp;
+       struct statfs *sp;
+       struct proc *p;
+       int *name;
+       int error, flags, namelen;
+
+       name = arg1;
+       namelen = arg2;
+       p = req->p;
+
+       /* pull in the vfsidctl structure identifying the target filesystem */
+       error = SYSCTL_IN(req, &vc, sizeof(vc));
+       if (error)
+               return (error);
+       if (vc.vc_vers != VFS_CTL_VERS1)
+               return (EINVAL);
+       mp = vfs_getvfs(&vc.vc_fsid);
+       if (mp == NULL)
+               return (ENOENT);
+       /* reset so that the fs specific code can fetch it. */
+       req->newidx = 0;
+       /*
+        * Note if this is a VFS_CTL then we pass the actual sysctl req
+        * in for "oldp" so that the lower layer can DTRT and use the
+        * SYSCTL_IN/OUT routines.
+        */
+       if (mp->mnt_op->vfs_sysctl != NULL) {
+               error = mp->mnt_op->vfs_sysctl(name, namelen,
+                   req, NULL, NULL, 0, req->p);
+               if (error != EOPNOTSUPP)
+                       return (error);
+       }
+       /* generic fallbacks for operations the fs did not handle itself */
+       switch (name[0]) {
+       case VFS_CTL_UMOUNT:
+               VCTLTOREQ(&vc, req);
+               error = SYSCTL_IN(req, &flags, sizeof(flags));
+               if (error)
+                       break;
+               error = safedounmount(mp, flags, p);
+               break;
+       case VFS_CTL_STATFS:
+               VCTLTOREQ(&vc, req);
+               error = SYSCTL_IN(req, &flags, sizeof(flags));
+               if (error)
+                       break;
+               sp = &mp->mnt_stat;
+               /*
+                * refresh the cached statfs unless the caller asked for
+                * cached-only data via MNT_NOWAIT (MNT_WAIT overrides)
+                */
+               if (((flags & MNT_NOWAIT) == 0 || (flags & MNT_WAIT)) &&
+                   (error = VFS_STATFS(mp, sp, p)))
+                       return (error);
+               sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
+               error = SYSCTL_OUT(req, sp, sizeof(*sp));
+               break;
+       default:
+               return (EOPNOTSUPP);
+       }
+       return (error);
+}
 
+static int     filt_fsattach(struct knote *kn);
+static void    filt_fsdetach(struct knote *kn);
+static int     filt_fsevent(struct knote *kn, long hint);
+
+/* kqueue filter ops for filesystem events (attach/detach/event below) */
+struct filterops fs_filtops =
+       { 0, filt_fsattach, filt_fsdetach, filt_fsevent };
+
+/*
+ * Attach a knote to the global fs event list.  EV_CLEAR is forced so
+ * the accumulated fflags reset after each delivery.
+ */
+static int
+filt_fsattach(struct knote *kn)
+{
+
+       kn->kn_flags |= EV_CLEAR;
+       KNOTE_ATTACH(&fs_klist, kn);
+       return (0);
+}
+
+/* Remove the knote from the global fs event list. */
+static void
+filt_fsdetach(struct knote *kn)
+{
+
+       KNOTE_DETACH(&fs_klist, kn);
+}
+
+/* OR the event hint into fflags; the knote fires while fflags != 0. */
+static int
+filt_fsevent(struct knote *kn, long hint)
+{
+
+       kn->kn_fflags |= hint;
+       return (kn->kn_fflags != 0);
+}
+
+/*
+ * sysctl handler: get or set the P_NOREMOTEHANG flag on a process.
+ * The new value is a pid: a positive pid sets the flag on that
+ * process, a negative pid clears it.  With an old pointer present the
+ * current flag state is returned instead of being modified.
+ */
+static int
+sysctl_vfs_noremotehang SYSCTL_HANDLER_ARGS
+{
+       int out, error;
+       pid_t pid;
+       size_t space;           /* NOTE(review): appears unused */
+       struct proc *p;
+
+       /* We need a pid. */
+       if (req->newptr == NULL)
+               return (EINVAL);
+
+       error = SYSCTL_IN(req, &pid, sizeof(pid));
+       if (error)
+               return (error);
+
+       p = pfind(pid < 0 ? -pid : pid);
+       if (p == NULL)
+               return (ESRCH);
+
+       /*
+        * Fetching the value is ok, but we only fetch if the old
+        * pointer is given.
+        */
+       if (req->oldptr != NULL) {
+               out = !((p->p_flag & P_NOREMOTEHANG) == 0);
+               error = SYSCTL_OUT(req, &out, sizeof(out));
+               return (error);
+       }
+
+       /* cansignal offers us enough security. */
+       if (p != req->p && suser(req->p->p_ucred, &req->p->p_acflag) != 0)
+               return (EPERM);
+
+       if (pid < 0)
+               p->p_flag &= ~P_NOREMOTEHANG;
+       else
+               p->p_flag |= P_NOREMOTEHANG;
+
+       return (0);
+}
+/* the vfs.generic. branch. */
+SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RW, 0, "vfs generic hinge");
+/* retrieve a list of mounted filesystem fsid_t */
+SYSCTL_PROC(_vfs_generic, OID_AUTO, vfsidlist, CTLFLAG_RD,
+    0, 0, sysctl_vfs_vfslist, "S,fsid", "List of mounted filesystem ids");
+/* perform operations on filesystem via fsid_t */
+SYSCTL_NODE(_vfs_generic, OID_AUTO, ctlbyfsid, CTLFLAG_RW,
+    sysctl_vfs_ctlbyfsid, "ctlbyfsid");
+SYSCTL_PROC(_vfs_generic, OID_AUTO, noremotehang, CTLFLAG_RW,
+    0, 0, sysctl_vfs_noremotehang, "I", "noremotehang");
+