X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/1c79356b52d46aa6b508fb032f5ae709b1f2897b..e5568f75972dfc723778653c11cb6b4dc825716a:/bsd/vfs/vfs_subr.c?ds=inline diff --git a/bsd/vfs/vfs_subr.c b/bsd/vfs/vfs_subr.c index 216c1a0e4..0801f1a09 100644 --- a/bsd/vfs/vfs_subr.c +++ b/bsd/vfs/vfs_subr.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -64,6 +64,7 @@ * External virtual filesystem routines */ +#undef DIAGNOSTIC #define DIAGNOSTIC 1 #include @@ -84,11 +85,21 @@ #include #include #include +#include +#include + +#include +#include + #include #include +#include +#include + + enum vtype iftovt_tab[16] = { VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON, VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD, @@ -100,19 +111,10 @@ int vttoif_tab[9] = { static void vfree(struct vnode *vp); static void vinactive(struct vnode *vp); -extern int vnreclaim(int count); +static int vnreclaim(int count); extern kern_return_t adjust_vm_object_cache(vm_size_t oval, vm_size_t nval); -/* - * Insq/Remq for the vnode usage lists. - */ -#define bufinsvn(bp, dp) LIST_INSERT_HEAD(dp, bp, b_vnbufs) -#define bufremvn(bp) { \ - LIST_REMOVE(bp, b_vnbufs); \ - (bp)->b_vnbufs.le_next = NOLIST; \ -} - TAILQ_HEAD(freelst, vnode) vnode_free_list; /* vnode free list */ TAILQ_HEAD(inactivelst, vnode) vnode_inactive_list; /* vnode inactive list */ struct mntlist mountlist; /* mounted filesystem list */ @@ -167,7 +169,7 @@ struct mntlist mountlist; /* mounted filesystem list */ #define VORECLAIM_ENABLE(vp) \ do { \ if (ISSET((vp)->v_flag, VORECLAIM)) \ - panic("vm object raclaim already"); \ + panic("vm_object_reclaim already"); \ SET((vp)->v_flag, VORECLAIM); \ } while(0) @@ -207,7 +209,7 @@ unsigned long vnodetarget; /* target for vnreclaim() */ * cache. Having too few vnodes on the free list causes serious disk * thrashing as we cycle through them. */ -#define VNODE_FREE_MIN 100 /* freelist should have at least these many */ +#define VNODE_FREE_MIN 300 /* freelist should have at least these many */ /* * We need to get vnodes back from the VM object cache when a certain # @@ -226,7 +228,7 @@ unsigned long vnodetarget; /* target for vnreclaim() */ /* * Initialize the vnode management data structures. 
*/ -void +__private_extern__ void vntblinit() { extern struct lock__bsd__ exchangelock; @@ -252,11 +254,16 @@ vntblinit() } /* Reset the VM Object Cache with the values passed in */ -kern_return_t +__private_extern__ kern_return_t reset_vmobjectcache(unsigned int val1, unsigned int val2) { vm_size_t oval = val1 - VNODE_FREE_MIN; - vm_size_t nval = val2 - VNODE_FREE_MIN; + vm_size_t nval; + + if(val2 < VNODE_FREE_MIN) + nval = 0; + else + nval = val2 - VNODE_FREE_MIN; return(adjust_vm_object_cache(oval, nval)); } @@ -334,6 +341,11 @@ vfs_rootmountalloc(fstypename, devname, mpp) return (ENODEV); mp = _MALLOC_ZONE((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK); bzero((char *)mp, (u_long)sizeof(struct mount)); + + /* Initialize the default IO constraints */ + mp->mnt_maxreadcnt = mp->mnt_maxwritecnt = MAXPHYS; + mp->mnt_segreadcnt = mp->mnt_segwritecnt = 32; + lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, 0); (void)vfs_busy(mp, LK_NOWAIT, 0, p); LIST_INIT(&mp->mnt_vnodelist); @@ -390,8 +402,7 @@ vfs_getvfs(fsid) register struct mount *mp; simple_lock(&mountlist_slock); - for (mp = mountlist.cqh_first; mp != (void *)&mountlist; - mp = mp->mnt_list.cqe_next) { + CIRCLEQ_FOREACH(mp, &mountlist, mnt_list) { if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] && mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) { simple_unlock(&mountlist_slock); @@ -422,7 +433,7 @@ static u_short xxxfs_mntid; ++xxxfs_mntid; tfsid.val[0] = makedev(nblkdev + mtype, xxxfs_mntid); tfsid.val[1] = mtype; - if (mountlist.cqh_first != (void *)&mountlist) { + if (!CIRCLEQ_EMPTY(&mountlist)) { while (vfs_getvfs(&tfsid)) { tfsid.val[0]++; xxxfs_mntid++; @@ -532,8 +543,8 @@ retry: simple_unlock(&vp->v_interlock); reclaimhits++; } else - break; - } + break; + } } /* @@ -582,15 +593,37 @@ retry: else vp->v_ubcinfo = 0; + if (vp->v_flag & VHASDIRTY) + cluster_release(vp); + + // make sure all these fields are cleared out as the + // name/parent stuff uses them and assumes they're + // cleared to null/0. + if (vp->v_scmap != NULL) { + panic("getnewvnode: vp @ 0x%x has non-null scmap.\n", vp); + } + vp->v_un.vu_name = NULL; + vp->v_scdirty = 0; + vp->v_un1.v_cl.v_pad = 0; + + vp->v_lastr = -1; vp->v_ralen = 0; vp->v_maxra = 0; - vp->v_lastw = 0; vp->v_ciosiz = 0; - vp->v_cstart = 0; vp->v_clen = 0; vp->v_socket = 0; + /* we may have blocked, re-evaluate state */ + simple_lock(&vnode_free_list_slock); + if (VONLIST(vp)) { + if (vp->v_usecount == 0) + VREMFREE("getnewvnode", vp); + else if (ISSET((vp)->v_flag, VUINACTIVE)) + VREMINACTIVE("getnewvnode", vp); + } + simple_unlock(&vnode_free_list_slock); + done: vp->v_flag = VSTANDARD; vp->v_type = VNON; @@ -626,6 +659,20 @@ insmntque(vp, mp) simple_unlock(&mntvnode_slock); } +__inline void +vpwakeup(struct vnode *vp) +{ + if (vp) { + if (--vp->v_numoutput < 0) + panic("vpwakeup: neg numoutput"); + if ((vp->v_flag & VBWAIT || vp->v_flag & VTHROTTLED) + && vp->v_numoutput <= 0) { + vp->v_flag &= ~(VBWAIT|VTHROTTLED); + wakeup((caddr_t)&vp->v_numoutput); + } + } +} + /* * Update outstanding I/O count and do wakeup if requested. 
*/ @@ -633,19 +680,8 @@ void vwakeup(bp) register struct buf *bp; { - register struct vnode *vp; - CLR(bp->b_flags, B_WRITEINPROG); - if (vp = bp->b_vp) { - if (--vp->v_numoutput < 0) - panic("vwakeup: neg numoutput"); - if ((vp->v_flag & VBWAIT) && vp->v_numoutput <= 0) { - if (vp->v_numoutput < 0) - panic("vwakeup: neg numoutput 2"); - vp->v_flag &= ~VBWAIT; - wakeup((caddr_t)&vp->v_numoutput); - } - } + vpwakeup(bp->b_vp); } /* @@ -668,12 +704,12 @@ vinvalbuf(vp, flags, cred, p, slpflag, slptimeo) if (error = VOP_FSYNC(vp, cred, MNT_WAIT, p)) { return (error); } - if (vp->v_dirtyblkhd.lh_first != NULL || (vp->v_flag & VHASDIRTY)) - panic("vinvalbuf: dirty bufs"); + if (vp->v_dirtyblkhd.lh_first) + panic("vinvalbuf: dirty bufs (vp 0x%x, bp 0x%x)", vp, vp->v_dirtyblkhd.lh_first); } for (;;) { - if ((blist = vp->v_cleanblkhd.lh_first) && flags & V_SAVEMETA) + if ((blist = vp->v_cleanblkhd.lh_first) && (flags & V_SAVEMETA)) while (blist && blist->b_lblkno < 0) blist = blist->b_vnbufs.le_next; if (!blist && (blist = vp->v_dirtyblkhd.lh_first) && @@ -685,7 +721,7 @@ vinvalbuf(vp, flags, cred, p, slpflag, slptimeo) for (bp = blist; bp; bp = nbp) { nbp = bp->b_vnbufs.le_next; - if (flags & V_SAVEMETA && bp->b_lblkno < 0) + if ((flags & V_SAVEMETA) && bp->b_lblkno < 0) continue; s = splbio(); if (ISSET(bp->b_flags, B_BUSY)) { @@ -711,7 +747,13 @@ vinvalbuf(vp, flags, cred, p, slpflag, slptimeo) (void) VOP_BWRITE(bp); break; } - SET(bp->b_flags, B_INVAL); + + if (bp->b_flags & B_LOCKED) { + panic("vinvalbuf: bp @ 0x%x is locked!", bp); + break; + } else { + SET(bp->b_flags, B_INVAL); + } brelse(bp); } } @@ -721,82 +763,6 @@ vinvalbuf(vp, flags, cred, p, slpflag, slptimeo) return (0); } -/* - * Associate a buffer with a vnode. - */ -void -bgetvp(vp, bp) - register struct vnode *vp; - register struct buf *bp; -{ - - if (bp->b_vp) - panic("bgetvp: not free"); - VHOLD(vp); - bp->b_vp = vp; - if (vp->v_type == VBLK || vp->v_type == VCHR) - bp->b_dev = vp->v_rdev; - else - bp->b_dev = NODEV; - /* - * Insert onto list for new vnode. - */ - bufinsvn(bp, &vp->v_cleanblkhd); -} - -/* - * Disassociate a buffer from a vnode. - */ -void -brelvp(bp) - register struct buf *bp; -{ - struct vnode *vp; - - if (bp->b_vp == (struct vnode *) 0) - panic("brelvp: NULL"); - /* - * Delete from old vnode list, if on one. - */ - if (bp->b_vnbufs.le_next != NOLIST) - bufremvn(bp); - vp = bp->b_vp; - bp->b_vp = (struct vnode *) 0; - HOLDRELE(vp); -} - -/* - * Reassign a buffer from one vnode to another. - * Used to assign file specific control information - * (indirect blocks) to the vnode to which they belong. - */ -void -reassignbuf(bp, newvp) - register struct buf *bp; - register struct vnode *newvp; -{ - register struct buflists *listheadp; - - if (newvp == NULL) { - printf("reassignbuf: NULL"); - return; - } - /* - * Delete from old vnode list, if on one. - */ - if (bp->b_vnbufs.le_next != NOLIST) - bufremvn(bp); - /* - * If dirty, put on list of dirty buffers; - * otherwise insert onto list of clean buffers. - */ - if (ISSET(bp->b_flags, B_DELWRI)) - listheadp = &newvp->v_dirtyblkhd; - else - listheadp = &newvp->v_cleanblkhd; - bufinsvn(bp, listheadp); -} - /* * Create a vnode for a block device. * Used for root filesystem, argdev, and swap areas. 
@@ -847,14 +813,13 @@ checkalias(nvp, nvp_rdev, mp) struct proc *p = current_proc(); /* XXX */ struct vnode *vp; struct vnode **vpp; - struct specinfo * bufhold; - int buffree = 1; + struct specinfo *specinfop; if (nvp->v_type != VBLK && nvp->v_type != VCHR) return (NULLVP); - bufhold = (struct specinfo *)_MALLOC_ZONE(sizeof(struct specinfo), - M_VNODE, M_WAITOK); + MALLOC_ZONE(specinfop, struct specinfo *, sizeof(struct specinfo), + M_SPECINFO, M_WAITOK); vpp = &speclisth[SPECHASH(nvp_rdev)]; loop: simple_lock(&spechash_slock); @@ -877,8 +842,8 @@ loop: break; } if (vp == NULL || vp->v_tag != VT_NON) { - nvp->v_specinfo = bufhold; - buffree = 0; /* buffer used */ + nvp->v_specinfo = specinfop; + specinfop = 0; /* buffer used */ bzero(nvp->v_specinfo, sizeof(struct specinfo)); nvp->v_rdev = nvp_rdev; nvp->v_hashchain = vpp; @@ -902,18 +867,20 @@ loop: vp->v_tag = nvp->v_tag; nvp->v_type = VNON; insmntque(vp, mp); - if (buffree) - _FREE_ZONE((void *)bufhold, sizeof (struct specinfo), M_VNODE); + if (specinfop) + FREE_ZONE((void *)specinfop, sizeof(struct specinfo), M_SPECINFO); return (vp); } /* - * Grab a particular vnode from the free list, increment its - * reference count and lock it. The vnode lock bit is set the - * vnode is being eliminated in vgone. The process is awakened - * when the transition is completed, and an error returned to - * indicate that the vnode is no longer usable (possibly having - * been changed to a new file system type). + * Get a reference on a particular vnode and lock it if requested. + * If the vnode was on the inactive list, remove it from the list. + * If the vnode was on the free list, remove it from the list and + * move it to inactive list as needed. + * The vnode lock bit is set if the vnode is being eliminated in + * vgone. The process is awakened when the transition is completed, + * and an error returned to indicate that the vnode is no longer + * usable (possibly having been changed to a new file system type). 
*/ int vget(vp, flags, p) @@ -922,6 +889,11 @@ vget(vp, flags, p) struct proc *p; { int error = 0; + u_long vpid; + + vpid = vp->v_id; // save off the original v_id + +retry: /* * If the vnode is in the process of being cleaned out for @@ -934,7 +906,7 @@ vget(vp, flags, p) if ((vp->v_flag & VXLOCK) || (vp->v_flag & VORECLAIM)) { vp->v_flag |= VXWANT; simple_unlock(&vp->v_interlock); - tsleep((caddr_t)vp, PINOD, "vget", 0); + (void)tsleep((caddr_t)vp, PINOD, "vget", 0); return (ENOENT); } @@ -945,95 +917,142 @@ vget(vp, flags, p) if (ISSET(vp->v_flag, VTERMINATE)) { SET(vp->v_flag, VTERMWANT); simple_unlock(&vp->v_interlock); - tsleep((caddr_t)&vp->v_ubcinfo, PINOD, "vclean", 0); + (void)tsleep((caddr_t)&vp->v_ubcinfo, PINOD, "vget1", 0); return (ENOENT); } + /* + * if the vnode is being initialized, + * wait for it to finish initialization + */ + if (ISSET(vp->v_flag, VUINIT)) { + SET(vp->v_flag, VUWANT); + simple_unlock(&vp->v_interlock); + (void) tsleep((caddr_t)vp, PINOD, "vget2", 0); + goto retry; + } + simple_lock(&vnode_free_list_slock); - /* If on the free list, remove it from there */ - if (vp->v_usecount == 0) { - if (VONLIST(vp)) + if (VONLIST(vp)) { + if (vp->v_usecount == 0) VREMFREE("vget", vp); - } else { - /* If on the inactive list, remove it from there */ - if ((vp->v_usecount == 1) && UBCINFOEXISTS(vp)) { - if (VONLIST(vp)) - VREMINACTIVE("vget", vp); - } + else if (ISSET((vp)->v_flag, VUINACTIVE)) + VREMINACTIVE("vget", vp); } - - /* The vnode should not be on the inactive list here */ - VINACTIVECHECK("vget", vp, 0); - simple_unlock(&vnode_free_list_slock); + if (++vp->v_usecount <= 0) panic("vget: v_usecount"); + /* + * Recover named reference as needed + */ + if (UBCISVALID(vp) && !ubc_issetflags(vp, UI_HASOBJREF)) { + simple_unlock(&vp->v_interlock); + if (ubc_getobject(vp, UBC_HOLDOBJECT) == MEMORY_OBJECT_CONTROL_NULL) { + error = ENOENT; + goto errout; + } + simple_lock(&vp->v_interlock); + } + if (flags & LK_TYPE_MASK) { - if (error = vn_lock(vp, flags | LK_INTERLOCK, p)) { - /* - * If the vnode was not active in the first place - * must not call vrele() as VOP_INACTIVE() is not - * required. - * So inlined part of vrele() here. - */ - simple_lock(&vp->v_interlock); - if (--vp->v_usecount == 1) { - if (UBCINFOEXISTS(vp)) { - vinactive(vp); - simple_unlock(&vp->v_interlock); - return (error); - } - } - if (vp->v_usecount > 0) { - simple_unlock(&vp->v_interlock); - return (error); - } - if (vp->v_usecount < 0) - panic("vget: negative usecount (%d)", vp->v_usecount); - vfree(vp); - simple_unlock(&vp->v_interlock); + if (error = vn_lock(vp, flags | LK_INTERLOCK, p)) + goto errout; + if (vpid != vp->v_id) { // make sure it's still the same vnode + vput(vp); + return ENOENT; } - return (error); + return (0); + } + + if ((flags & LK_INTERLOCK) == 0) + simple_unlock(&vp->v_interlock); + + if (vpid != vp->v_id) { // make sure it's still the same vnode + vrele(vp); + return ENOENT; } + return (0); + +errout: + simple_lock(&vp->v_interlock); + /* - * If this is a valid UBC vnode, if usecount is 1 and if - * this vnode was mapped in the past, it is likely - * that ubc_info freed due to the memory object getting recycled. - * Just re-initialize the ubc_info. + * we may have blocked. Re-evaluate the state */ - if ((vp->v_usecount == 1) && UBCISVALID(vp)) { - if (UBCINFOMISSING(vp)) - panic("vget: lost ubc_info"); - - if (ISSET(vp->v_flag, VTERMINATE)) { - /* - * vnode is being terminated. 
- * wait for vnode_pager_no_senders() to clear - * VTERMINATE - */ - SET(vp->v_flag, VTERMWANT); + simple_lock(&vnode_free_list_slock); + if (VONLIST(vp)) { + if (vp->v_usecount == 0) + VREMFREE("vget", vp); + else if (ISSET((vp)->v_flag, VUINACTIVE)) + VREMINACTIVE("vget", vp); + } + simple_unlock(&vnode_free_list_slock); + + /* + * If the vnode was not active in the first place + * must not call vrele() as VOP_INACTIVE() is not + * required. + * So inlined part of vrele() here. + */ + if (--vp->v_usecount == 1) { + if (UBCINFOEXISTS(vp)) { + vinactive(vp); simple_unlock(&vp->v_interlock); - tsleep((caddr_t)&vp->v_ubcinfo, PINOD, "vclean", 0); - /* return error */ - return (ENOENT); + return (error); } + } + if (vp->v_usecount > 0) { + simple_unlock(&vp->v_interlock); + return (error); + } + if (vp->v_usecount < 0) + panic("vget: negative usecount (%d)", vp->v_usecount); + vfree(vp); + simple_unlock(&vp->v_interlock); + return (error); +} - if ((!UBCINFOEXISTS(vp)) && ISSET(vp->v_flag, VWASMAPPED)) { - simple_unlock(&vp->v_interlock); - ubc_info_init(vp); - simple_lock(&vp->v_interlock); - } else - panic("vget: stolen ubc_info"); +/* + * Get a pager reference on the particular vnode. + * + * This is called from ubc_info_init() and it is asumed that + * the vnode is not on the free list. + * It is also assumed that the vnode is neither being recycled + * by vgonel nor being terminated by vnode_pager_vrele(). + * + * The vnode interlock is NOT held by the caller. + */ +__private_extern__ int +vnode_pager_vget(vp) + struct vnode *vp; +{ + simple_lock(&vp->v_interlock); + + UBCINFOCHECK("vnode_pager_vget", vp); - if (!ubc_issetflags(vp, UI_HASOBJREF)) - if (ubc_getobject(vp, (UBC_NOREACTIVATE|UBC_HOLDOBJECT))) - panic("vget: null object"); + if (ISSET(vp->v_flag, (VXLOCK|VORECLAIM|VTERMINATE))) + panic("%s: dying vnode", "vnode_pager_vget"); + + simple_lock(&vnode_free_list_slock); + /* The vnode should not be on free list */ + if (VONLIST(vp)) { + if (vp->v_usecount == 0) + panic("%s: still on list", "vnode_pager_vget"); + else if (ISSET((vp)->v_flag, VUINACTIVE)) + VREMINACTIVE("vnode_pager_vget", vp); } -out: - if ((flags & LK_INTERLOCK) == 0) - simple_unlock(&vp->v_interlock); + + /* The vnode should not be on the inactive list here */ + simple_unlock(&vnode_free_list_slock); + + /* After all those checks, now do the real work :-) */ + if (++vp->v_usecount <= 0) + panic("vnode_pager_vget: v_usecount"); + simple_unlock(&vp->v_interlock); + return (0); } @@ -1072,8 +1091,8 @@ vop_nolock(ap) if (vp->v_vnlock == NULL) { if ((flags & LK_TYPE_MASK) == LK_DRAIN) return (0); - MALLOC_ZONE(vp->v_vnlock, struct lock__bsd__ *, - sizeof(struct lock__bsd__), M_VNODE, M_WAITOK); + MALLOC(vp->v_vnlock, struct lock__bsd__ *, + sizeof(struct lock__bsd__), M_TEMP, M_WAITOK); lockinit(vp->v_vnlock, PVFS, "vnlock", 0, 0); } switch (flags & LK_TYPE_MASK) { @@ -1153,21 +1172,41 @@ vref(vp) panic("vref used where vget required"); /* If on the inactive list, remove it from there */ - if ((vp->v_usecount == 1) && UBCINFOEXISTS(vp)) { - if (VONLIST(vp)) { - simple_lock(&vnode_free_list_slock); - VREMINACTIVE("vref", vp); - simple_unlock(&vnode_free_list_slock); - } - } - /* The vnode should not be on the inactive list here */ - VINACTIVECHECK("vref", vp, 0); + simple_lock(&vnode_free_list_slock); + if (ISSET((vp)->v_flag, VUINACTIVE)) + VREMINACTIVE("vref", vp); + simple_unlock(&vnode_free_list_slock); if (++vp->v_usecount <= 0) panic("vref v_usecount"); simple_unlock(&vp->v_interlock); } +static void 
+clean_up_name_parent_ptrs(struct vnode *vp) +{ + if (VNAME(vp) || VPARENT(vp)) { + char *tmp1; + struct vnode *tmp2; + + // do it this way so we don't block before clearing + // these fields. + tmp1 = VNAME(vp); + tmp2 = VPARENT(vp); + VNAME(vp) = NULL; + VPARENT(vp) = NULL; + + if (tmp1) { + remove_name(tmp1); + } + + if (tmp2) { + vrele(tmp2); + } + } +} + + /* * put the vnode on appropriate free list. * called with v_interlock held. @@ -1176,6 +1215,13 @@ static void vfree(vp) struct vnode *vp; { + funnel_t *curflock; + extern int disable_funnel; + + if ((curflock = thread_funnel_get()) != kernel_flock && + !(disable_funnel && curflock != THR_FUNNEL_NULL)) + panic("Entering vfree() without kernel funnel"); + /* * if the vnode is not obtained by calling getnewvnode() we * are not responsible for the cleanup. Just return. @@ -1190,8 +1236,11 @@ vfree(vp) /* insert at tail of LRU list or at head if VAGE is set */ simple_lock(&vnode_free_list_slock); + // make sure the name & parent pointers get cleared out +// clean_up_name_parent_ptrs(vp); + if (VONLIST(vp)) - panic("vfree: vnode still on list"); + panic("%s: vnode still on list", "vfree"); if (vp->v_flag & VAGE) { TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist); @@ -1211,6 +1260,13 @@ static void vinactive(vp) struct vnode *vp; { + funnel_t *curflock; + extern int disable_funnel; + + if ((curflock = thread_funnel_get()) != kernel_flock && + !(disable_funnel && curflock != THR_FUNNEL_NULL)) + panic("Entering vinactive() without kernel funnel"); + if (!UBCINFOEXISTS(vp)) panic("vinactive: not a UBC vnode"); @@ -1220,7 +1276,7 @@ vinactive(vp) simple_lock(&vnode_free_list_slock); if (VONLIST(vp)) - panic("vinactive: vnode still on list"); + panic("%s: vnode still on list", "vinactive"); VINACTIVECHECK("vinactive", vp, 0); TAILQ_INSERT_TAIL(&vnode_inactive_list, vp, v_freelist); @@ -1242,10 +1298,6 @@ vput(vp) { struct proc *p = current_proc(); /* XXX */ -#if DIAGNOSTIC - if (vp == NULL) - panic("vput: null vp"); -#endif simple_lock(&vp->v_interlock); if (--vp->v_usecount == 1) { if (UBCINFOEXISTS(vp)) { @@ -1267,8 +1319,10 @@ vput(vp) vp->v_usecount, vp->v_writecount); } #endif - if (ISSET((vp)->v_flag, VUINACTIVE) && VONLIST(vp)) - VREMINACTIVE("vrele", vp); + simple_lock(&vnode_free_list_slock); + if (ISSET((vp)->v_flag, VUINACTIVE)) + VREMINACTIVE("vref", vp); + simple_unlock(&vnode_free_list_slock); simple_unlock(&vp->v_interlock); VOP_INACTIVE(vp, p); @@ -1297,15 +1351,18 @@ vrele(vp) struct vnode *vp; { struct proc *p = current_proc(); /* XXX */ + funnel_t *curflock; + extern int disable_funnel; + + if ((curflock = thread_funnel_get()) != kernel_flock && + !(disable_funnel && curflock != THR_FUNNEL_NULL)) + panic("Entering vrele() without kernel funnel"); -#if DIAGNOSTIC - if (vp == NULL) - panic("vrele: null vp"); -#endif simple_lock(&vp->v_interlock); if (--vp->v_usecount == 1) { if (UBCINFOEXISTS(vp)) { - vinactive(vp); + if ((vp->v_flag & VXLOCK) == 0) + vinactive(vp); simple_unlock(&vp->v_interlock); return; } @@ -1320,9 +1377,6 @@ vrele(vp) panic("vrele: ref cnt"); } #endif - if (ISSET((vp)->v_flag, VUINACTIVE) && VONLIST(vp)) - VREMINACTIVE("vrele", vp); - if ((vp->v_flag & VXLOCK) || (vp->v_flag & VORECLAIM)) { /* vnode is being cleaned, just return */ @@ -1361,7 +1415,6 @@ void vagevp(vp) struct vnode *vp; { - assert(vp); simple_lock(&vp->v_interlock); vp->v_flag |= VAGE; simple_unlock(&vp->v_interlock); @@ -1417,7 +1470,7 @@ vflush(mp, skipvp, flags) struct vnode *skipvp; int flags; { - struct proc *p = current_proc(); 
/* XXX */ + struct proc *p = current_proc(); struct vnode *vp, *nvp; int busy = 0; @@ -1435,9 +1488,9 @@ loop: simple_lock(&vp->v_interlock); /* - * Skip over a vnodes marked VSYSTEM. + * Skip over a vnodes marked VSYSTEM or VNOFLUSH. */ - if ((flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM)) { + if ((flags & SKIPSYSTEM) && ((vp->v_flag & VSYSTEM) || (vp->v_flag & VNOFLUSH))) { simple_unlock(&vp->v_interlock); continue; } @@ -1492,7 +1545,7 @@ loop: busy++; } simple_unlock(&mntvnode_slock); - if (busy) + if (busy && ((flags & FORCECLOSE)==0)) return (EBUSY); return (0); } @@ -1508,8 +1561,7 @@ vclean(vp, flags, p) struct proc *p; { int active; - void *obj; - int removed = 0; + int didhold; /* * if the vnode is not obtained by calling getnewvnode() we @@ -1526,9 +1578,23 @@ vclean(vp, flags, p) * so that its count cannot fall to zero and generate a * race against ourselves to recycle it. */ - if (active = vp->v_usecount) + if (active = vp->v_usecount) { + /* + * active vnode can not be on the free list. + * we are about to take an extra reference on this vnode + * do the queue management as needed + * Not doing so can cause "still on list" or + * "vnreclaim: v_usecount" panic if VOP_LOCK() blocks. + */ + simple_lock(&vnode_free_list_slock); + if (ISSET((vp)->v_flag, VUINACTIVE)) + VREMINACTIVE("vclean", vp); + simple_unlock(&vnode_free_list_slock); + if (++vp->v_usecount <= 0) panic("vclean: v_usecount"); + } + /* * Prevent the vnode from being recycled or * brought into use while we clean it out. @@ -1547,21 +1613,24 @@ vclean(vp, flags, p) VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, p); /* + * While blocked in VOP_LOCK() someone could have dropped + * reference[s] and we could land on the inactive list. * if this vnode is on the inactive list * take it off the list. */ - if ((active == 1) && - (ISSET((vp)->v_flag, VUINACTIVE) && VONLIST(vp))) { - simple_lock(&vnode_free_list_slock); + simple_lock(&vnode_free_list_slock); + if (ISSET((vp)->v_flag, VUINACTIVE)) VREMINACTIVE("vclean", vp); - simple_unlock(&vnode_free_list_slock); - removed++; - } + simple_unlock(&vnode_free_list_slock); + + /* Clean the pages in VM. */ + if (active && (flags & DOCLOSE)) + VOP_CLOSE(vp, IO_NDELAY, NOCRED, p); /* Clean the pages in VM. */ - if ((active) && UBCINFOEXISTS(vp)) { + didhold = ubc_hold(vp); + if ((active) && (didhold)) (void)ubc_clean(vp, 0); /* do not invalidate */ - } /* * Clean out any buffers associated with the vnode. @@ -1572,75 +1641,49 @@ vclean(vp, flags, p) else vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0); } - /* - * If purging an active vnode, it must be closed and - * deactivated before being reclaimed. Note that the - * VOP_INACTIVE will unlock the vnode. - */ - if (active) { - if (flags & DOCLOSE) - VOP_CLOSE(vp, IO_NDELAY, NOCRED, p); + + if (active) VOP_INACTIVE(vp, p); - } else { - /* - * Any other processes trying to obtain this lock must first - * wait for VXLOCK to clear, then call the new lock operation. - */ + else VOP_UNLOCK(vp, 0, p); + + /* Destroy ubc named reference */ + if (didhold) { + ubc_rele(vp); + ubc_destroy_named(vp); + } + /* + * Make sure vp isn't on the inactive list. + */ + simple_lock(&vnode_free_list_slock); + if (ISSET((vp)->v_flag, VUINACTIVE)) { + VREMINACTIVE("vclean", vp); } + simple_unlock(&vnode_free_list_slock); + /* * Reclaim the vnode. */ if (VOP_RECLAIM(vp, p)) panic("vclean: cannot reclaim"); - if (active) - vrele(vp); + + // make sure the name & parent ptrs get cleaned out! 
+ clean_up_name_parent_ptrs(vp); + cache_purge(vp); if (vp->v_vnlock) { - if ((vp->v_vnlock->lk_flags & LK_DRAINED) == 0) + struct lock__bsd__ *tmp = vp->v_vnlock; + if ((tmp->lk_flags & LK_DRAINED) == 0) vprint("vclean: lock not drained", vp); - FREE_ZONE(vp->v_vnlock, sizeof (struct lock__bsd__), M_VNODE); vp->v_vnlock = NULL; + FREE(tmp, M_TEMP); } /* It's dead, Jim! */ vp->v_op = dead_vnodeop_p; vp->v_tag = VT_NON; - /* - * v_data is reclaimed by VOP_RECLAIM, all the vnode - * operation generated by the code below would be directed - * to the deadfs - */ - if (UBCINFOEXISTS(vp)) { - /* vnode is dying, destroy the object */ - if (ubc_issetflags(vp, UI_HASOBJREF)) { - obj = ubc_getobject(vp, UBC_NOREACTIVATE); - if (obj == NULL) - panic("vclean: null object"); - if (ISSET(vp->v_flag, VTERMINATE)) - panic("vclean: already teminating"); - SET(vp->v_flag, VTERMINATE); - - ubc_clearflags(vp, UI_HASOBJREF); - memory_object_destroy(obj, 0); - - /* - * memory_object_destroy() is asynchronous with respect - * to vnode_pager_no_senders(). - * wait for vnode_pager_no_senders() to clear - * VTERMINATE - */ - while (ISSET(vp->v_flag, VTERMINATE)) { - SET(vp->v_flag, VTERMWANT); - tsleep((caddr_t)&vp->v_ubcinfo, PINOD, "vclean", 0); - } - if (UBCINFOEXISTS(vp)) { - ubc_info_free(vp); - vp->v_ubcinfo = UBC_NOINFO; /* catch bad accesses */ - } - } - } + insmntque(vp, (struct mount *)0); /* * Done with purge, notify sleepers of the grim news. @@ -1650,6 +1693,9 @@ vclean(vp, flags, p) vp->v_flag &= ~VXWANT; wakeup((caddr_t)vp); } + + if (active) + vrele(vp); } /* @@ -1664,7 +1710,7 @@ vop_revoke(ap) } */ *ap; { struct vnode *vp, *vq; - struct proc *p = current_proc(); /* XXX */ + struct proc *p = current_proc(); #if DIAGNOSTIC if ((ap->a_flags & REVOKEALL) == 0) @@ -1683,7 +1729,7 @@ vop_revoke(ap) while (vp->v_flag & VXLOCK) { vp->v_flag |= VXWANT; simple_unlock(&vp->v_interlock); - tsleep((caddr_t)vp, PINOD, "vop_revokeall", 0); + (void)tsleep((caddr_t)vp, PINOD, "vop_revokeall", 0); } return (0); } @@ -1748,7 +1794,7 @@ void vgone(vp) struct vnode *vp; { - struct proc *p = current_proc(); /* XXX */ + struct proc *p = current_proc(); simple_lock(&vp->v_interlock); vgonel(vp, p); @@ -1782,7 +1828,7 @@ vgonel(vp, p) while (vp->v_flag & VXLOCK) { vp->v_flag |= VXWANT; simple_unlock(&vp->v_interlock); - tsleep((caddr_t)vp, PINOD, "vgone", 0); + (void)tsleep((caddr_t)vp, PINOD, "vgone", 0); } return; } @@ -1830,8 +1876,11 @@ vgonel(vp, p) vp->v_flag &= ~VALIASED; } simple_unlock(&spechash_slock); - FREE_ZONE(vp->v_specinfo, sizeof (struct specinfo), M_VNODE); + { + struct specinfo *tmp = vp->v_specinfo; vp->v_specinfo = NULL; + FREE_ZONE((void *)tmp, sizeof(struct specinfo), M_SPECINFO); + } } /* * If it is on the freelist and not already at the head, @@ -1846,7 +1895,7 @@ vgonel(vp, p) * getnewvnode after removing it from the freelist to ensure * that we do not try to move it here. 
*/ - if (vp->v_usecount == 0) { + if (vp->v_usecount == 0 && (vp->v_flag & VUINACTIVE) == 0) { simple_lock(&vnode_free_list_slock); if ((vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb) && vnode_free_list.tqh_first != vp) { @@ -1941,6 +1990,8 @@ vprint(label, vp) strcat(buf, "|VTEXT"); if (vp->v_flag & VSYSTEM) strcat(buf, "|VSYSTEM"); + if (vp->v_flag & VNOFLUSH) + strcat(buf, "|VNOFLUSH"); if (vp->v_flag & VXLOCK) strcat(buf, "|VXLOCK"); if (vp->v_flag & VXWANT) @@ -1967,7 +2018,7 @@ vprint(label, vp) void printlockedvnodes() { - struct proc *p = current_proc(); /* XXX */ + struct proc *p = current_proc(); struct mount *mp, *nmp; struct vnode *vp; @@ -1992,30 +2043,110 @@ printlockedvnodes() } #endif -/* - * Top level filesystem related information gathering. - */ -int -vfs_sysctl(name, namelen, oldp, oldlenp, newp, newlen, p) - int *name; - u_int namelen; +static int +build_path(struct vnode *vp, char *buff, int buflen, int *outlen) +{ + char *end, *str; + int i, len, ret=0, counter=0; + + end = &buff[buflen-1]; + *--end = '\0'; + + while(vp && VPARENT(vp) != vp) { + // the maximum depth of a file system hierarchy is MAXPATHLEN/2 + // (with single-char names separated by slashes). we panic if + // we've ever looped more than that. + if (counter++ > MAXPATHLEN/2) { + panic("build_path: vnode parent chain is too long! vp 0x%x\n", vp); + } + str = VNAME(vp); + if (VNAME(vp) == NULL) { + if (VPARENT(vp) != NULL) { + ret = EINVAL; + } + break; + } + + // count how long the string is + for(len=0; *str; str++, len++) + /* nothing */; + + // check that there's enough space + if ((end - buff) < len) { + ret = ENOSPC; + break; + } + + // copy it backwards + for(; len > 0; len--) { + *--end = *--str; + } + + // put in the path separator + *--end = '/'; + + // walk up the chain. + vp = VPARENT(vp); + + // check if we're crossing a mount point and + // switch the vp if we are. + if (vp && (vp->v_flag & VROOT)) { + vp = vp->v_mount->mnt_vnodecovered; + } + } + + // slide it down to the beginning of the buffer + memmove(buff, end, &buff[buflen] - end); + + *outlen = &buff[buflen] - end; + + return ret; +} + +__private_extern__ int +vn_getpath(struct vnode *vp, char *pathbuf, int *len) +{ + return build_path(vp, pathbuf, *len, len); +} + + + +/* + * Top level filesystem related information gathering. + */ +int +vfs_sysctl(name, namelen, oldp, oldlenp, newp, newlen, p) + int *name; + u_int namelen; void *oldp; size_t *oldlenp; void *newp; size_t newlen; struct proc *p; { - struct ctldebug *cdp; struct vfsconf *vfsp; + int *username; + u_int usernamelen; + int error; - if (name[0] == VFS_NUMMNTOPS) { + /* + * The VFS_NUMMNTOPS shouldn't be at name[0] since + * is a VFS generic variable. So now we must check + * namelen so we don't end up covering any UFS + * variables (sinc UFS vfc_typenum is 1). 
+ * + * It should have been: + * name[0]: VFS_GENERIC + * name[1]: VFS_NUMMNTOPS + */ + if (namelen == 1 && name[0] == VFS_NUMMNTOPS) { extern unsigned int vfs_nummntops; return (sysctl_rdint(oldp, oldlenp, newp, vfs_nummntops)); } /* all sysctl names at this level are at least name and field */ if (namelen < 2) - return (ENOTDIR); /* overloaded */ + return (EISDIR); /* overloaded */ if (name[0] != VFS_GENERIC) { for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) if (vfsp->vfc_typenum == name[0]) @@ -2039,7 +2170,19 @@ vfs_sysctl(name, namelen, oldp, oldlenp, newp, newlen, p) return (sysctl_rdstruct(oldp, oldlenp, newp, vfsp, sizeof(struct vfsconf))); } - return (EOPNOTSUPP); + /* + * We need to get back into the general MIB, so we need to re-prepend + * CTL_VFS to our name and try userland_sysctl(). + */ + usernamelen = namelen + 1; + MALLOC(username, int *, usernamelen * sizeof(*username), + M_TEMP, M_WAITOK); + bcopy(name, username + 1, namelen * sizeof(*name)); + username[0] = CTL_VFS; + error = userland_sysctl(p, username, usernamelen, oldp, oldlenp, 1, + newp, newlen, oldlenp); + FREE(username, M_TEMP); + return (error); } int kinfo_vdebug = 1; @@ -2096,13 +2239,16 @@ again: nvp = vp->v_mntvnodes.le_next; if (bp + VPTRSZ + VNODESZ > ewhere) { simple_unlock(&mntvnode_slock); + vfs_unbusy(mp, p); *sizep = bp - where; return (ENOMEM); } simple_unlock(&mntvnode_slock); if ((error = copyout((caddr_t)&vp, bp, VPTRSZ)) || - (error = copyout((caddr_t)vp, bp + VPTRSZ, VNODESZ))) + (error = copyout((caddr_t)vp, bp + VPTRSZ, VNODESZ))) { + vfs_unbusy(mp, p); return (error); + } bp += VPTRSZ + VNODESZ; simple_lock(&mntvnode_slock); } @@ -2149,11 +2295,11 @@ vfs_mountedon(vp) * Unmount all filesystems. The list is traversed in reverse order * of mounting to avoid dependencies. */ -void +__private_extern__ void vfs_unmountall() { struct mount *mp, *nmp; - struct proc *p = current_proc(); /* XXX */ + struct proc *p = current_proc(); /* * Since this only runs when rebooting, it is not interlocked. @@ -2166,7 +2312,7 @@ vfs_unmountall() /* * Build hash lists of net addresses and hang them off the mount point. - * Called by ufs_mount() to set up the lists of export addresses. + * Called by vfs_export() to set up the lists of export addresses. 
*/ static int vfs_hang_addrlist(mp, nep, argp) @@ -2343,7 +2489,7 @@ vfs_export_lookup(mp, nep, nam) * try to reclaim vnodes from the memory * object cache */ -int +static int vm_object_cache_reclaim(int count) { int cnt; @@ -2360,11 +2506,10 @@ vm_object_cache_reclaim(int count) * and then try to reclaim some vnodes from the memory * object cache */ -int +static int vnreclaim(int count) { - int cnt, i, loopcnt; - void *obj; + int i, loopcnt; struct vnode *vp; int err; struct proc *p; @@ -2390,163 +2535,133 @@ restart: for (vp = TAILQ_FIRST(&vnode_inactive_list); (vp != NULLVP) && (i < count); vp = TAILQ_NEXT(vp, v_freelist)) { + + if (!simple_lock_try(&vp->v_interlock)) + continue; - if (simple_lock_try(&vp->v_interlock)) { - if (vp->v_usecount != 1) - panic("vnreclaim: v_usecount"); - - if(!UBCINFOEXISTS(vp)) { - if (vp->v_type == VBAD) { - VREMINACTIVE("vnreclaim", vp); - simple_unlock(&vp->v_interlock); - continue; - } else - panic("non UBC vnode on inactive list"); - /* Should not reach here */ - } + if (vp->v_usecount != 1) + panic("vnreclaim: v_usecount"); - /* If vnode is already being reclaimed, wait */ - if ((vp->v_flag & VXLOCK) || (vp->v_flag & VORECLAIM)) { - vp->v_flag |= VXWANT; + if(!UBCINFOEXISTS(vp)) { + if (vp->v_type == VBAD) { + VREMINACTIVE("vnreclaim", vp); simple_unlock(&vp->v_interlock); - simple_unlock(&vnode_free_list_slock); - (void)tsleep((caddr_t)vp, PINOD, "vocr", 0); - goto restart; - } + continue; + } else + panic("non UBC vnode on inactive list"); + /* Should not reach here */ + } - VREMINACTIVE("vnreclaim", vp); + /* If vnode is already being reclaimed, wait */ + if ((vp->v_flag & VXLOCK) || (vp->v_flag & VORECLAIM)) { + vp->v_flag |= VXWANT; + simple_unlock(&vp->v_interlock); simple_unlock(&vnode_free_list_slock); + (void)tsleep((caddr_t)vp, PINOD, "vocr", 0); + goto restart; + } - /* held vnodes must not be reclaimed */ - if (vp->v_ubcinfo->ui_holdcnt) { /* XXX */ - vinactive(vp); - simple_unlock(&vp->v_interlock); - goto restart; - } + /* + * if the vnode is being initialized, + * skip over it + */ + if (ISSET(vp->v_flag, VUINIT)) { + SET(vp->v_flag, VUWANT); + simple_unlock(&vp->v_interlock); + continue; + } - if (ubc_issetflags(vp, UI_WASMAPPED)) { - /* - * We should not reclaim as it is likely - * to be in use. Let it die a natural death. - * Release the UBC reference if one exists - * and put it back at the tail. - */ - if (ubc_issetflags(vp, UI_HASOBJREF)) { - obj = ubc_getobject(vp, UBC_NOREACTIVATE); - if (obj == NULL) - panic("vnreclaim: null object"); - /* release the reference gained by ubc_info_init() */ - ubc_clearflags(vp, UI_HASOBJREF); - simple_unlock(&vp->v_interlock); - vm_object_deallocate(obj); - /* - * The vnode interlock was release. - * vm_object_deallocate() might have blocked. - * It is possible that the object was terminated. - * It is also possible that the vnode was - * reactivated. Evaluate the state again. - */ - if (UBCINFOEXISTS(vp)) { - simple_lock(&vp->v_interlock); - if ((vp->v_usecount == 1) && !VONLIST(vp)) - vinactive(vp); - simple_unlock(&vp->v_interlock); - } - } else { - vinactive(vp); - simple_unlock(&vp->v_interlock); - } - } else { - VORECLAIM_ENABLE(vp); + VREMINACTIVE("vnreclaim", vp); + simple_unlock(&vnode_free_list_slock); - /* - * scrub the dirty pages and invalidate the buffers - */ - p = current_proc(); - err = vn_lock(vp, LK_EXCLUSIVE|LK_INTERLOCK, p); - if (err) { - /* cannot reclaim */ + if (ubc_issetflags(vp, UI_WASMAPPED)) { + /* + * We should not reclaim as it is likely + * to be in use. 
Let it die a natural death. + * Release the UBC reference if one exists + * and put it back at the tail. + */ + simple_unlock(&vp->v_interlock); + if (ubc_release_named(vp)) { + if (UBCINFOEXISTS(vp)) { simple_lock(&vp->v_interlock); - vinactive(vp); - VORECLAIM_DISABLE(vp); + if (vp->v_usecount == 1 && !VONLIST(vp)) + vinactive(vp); simple_unlock(&vp->v_interlock); - goto restart; } + } else { + simple_lock(&vp->v_interlock); + vinactive(vp); + simple_unlock(&vp->v_interlock); + } + } else { + int didhold; + + VORECLAIM_ENABLE(vp); + + /* + * scrub the dirty pages and invalidate the buffers + */ + p = current_proc(); + err = vn_lock(vp, LK_EXCLUSIVE|LK_INTERLOCK, p); + if (err) { + /* cannot reclaim */ simple_lock(&vp->v_interlock); - if(vp->v_usecount != 1) - panic("VOCR: usecount race"); + vinactive(vp); + VORECLAIM_DISABLE(vp); + i++; simple_unlock(&vp->v_interlock); + goto restart; + } - /* - * If the UBC reference on the memory object - * was already lost, regain it. This will - * keep the memory object alive for rest of the - * reclaim and finally this reference would - * be lost by memory_object_destroy() - */ - obj = ubc_getobject(vp, (UBC_NOREACTIVATE|UBC_HOLDOBJECT)); - if (obj == (void *)NULL) - panic("vnreclaim: null object"); + /* keep the vnode alive so we can kill it */ + simple_lock(&vp->v_interlock); + if(vp->v_usecount != 1) + panic("VOCR: usecount race"); + vp->v_usecount++; + simple_unlock(&vp->v_interlock); - /* clean up the state in VM without invalidating */ + /* clean up the state in VM without invalidating */ + didhold = ubc_hold(vp); + if (didhold) (void)ubc_clean(vp, 0); - /* flush and invalidate buffers associated with the vnode */ - if (vp->v_tag == VT_NFS) - nfs_vinvalbuf(vp, V_SAVE, NOCRED, p, 0); - else - vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0); - - /* - * It is not necessary to call ubc_uncache() - * here because memory_object_destroy() marks - * the memory object non cachable already - * - * Need to release the vnode lock before calling - * vm_object_deallocate() to avoid deadlock - * when the vnode goes through vop_inactive - * - * Note: for the v_usecount == 1 case, VOP_INACTIVE - * has not yet been called. Call it now while vp is - * still locked, it will also release the lock. - */ - if (vp->v_usecount == 1) - VOP_INACTIVE(vp, p); - else - VOP_UNLOCK(vp, 0, p); - - /* - * This vnode is ready to be reclaimed. - * Terminate the memory object. - * memory_object_destroy() will result in - * vnode_pager_no_senders(). - * That will release the pager reference - * and the vnode will move to the free list. - */ - if (ISSET(vp->v_flag, VTERMINATE)) - panic("vnreclaim: already teminating"); - SET(vp->v_flag, VTERMINATE); + /* flush and invalidate buffers associated with the vnode */ + if (vp->v_tag == VT_NFS) + nfs_vinvalbuf(vp, V_SAVE, NOCRED, p, 0); + else + vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0); - memory_object_destroy(obj, 0); + /* + * Note: for the v_usecount == 2 case, VOP_INACTIVE + * has not yet been called. Call it now while vp is + * still locked, it will also release the lock. + */ + if (vp->v_usecount == 2) + VOP_INACTIVE(vp, p); + else + VOP_UNLOCK(vp, 0, p); - /* - * memory_object_destroy() is asynchronous with respect - * to vnode_pager_no_senders(). 
- * wait for vnode_pager_no_senders() to clear - * VTERMINATE - */ - while (ISSET(vp->v_flag, VTERMINATE)) { - SET(vp->v_flag, VTERMWANT); - tsleep((caddr_t)&vp->v_ubcinfo, PINOD, "vnreclaim", 0); - } - simple_lock(&vp->v_interlock); - VORECLAIM_DISABLE(vp); - i++; - simple_unlock(&vp->v_interlock); + if (didhold) + ubc_rele(vp); + + /* + * destroy the ubc named reference. + * If we can't because it is held for I/Os + * in progress, just put it back on the inactive + * list and move on. Otherwise, the paging reference + * is toast (and so is this vnode?). + */ + if (ubc_destroy_named(vp)) { + i++; } - /* inactive list lock was released, must restart */ - goto restart; + simple_lock(&vp->v_interlock); + VORECLAIM_DISABLE(vp); + simple_unlock(&vp->v_interlock); + vrele(vp); /* release extra use we added here */ } + /* inactive list lock was released, must restart */ + goto restart; } simple_unlock(&vnode_free_list_slock); @@ -2566,16 +2681,13 @@ out: * AGE the vnode so that it gets recycled quickly. * Check lock status to decide whether to call vput() or vrele(). */ -void +__private_extern__ void vnode_pager_vrele(struct vnode *vp) { boolean_t funnel_state; int isvnreclaim = 1; - if (vp == (struct vnode *) NULL) - panic("vnode_pager_vrele: null vp"); - funnel_state = thread_funnel_set(kernel_flock, TRUE); /* Mark the vnode to be recycled */ @@ -2612,23 +2724,26 @@ vnode_pager_vrele(struct vnode *vp) } if (!ISSET(vp->v_flag, VTERMINATE)) SET(vp->v_flag, VTERMINATE); + + cache_purge(vp); + if (UBCINFOEXISTS(vp)) { + struct ubc_info *uip = vp->v_ubcinfo; + if (ubc_issetflags(vp, UI_WASMAPPED)) SET(vp->v_flag, VWASMAPPED); - if ((vp->v_ubcinfo->ui_holdcnt) /* XXX */ - && !(vp->v_flag & VXLOCK)) - panic("vnode_pager_vrele: freeing held ubc_info"); - - simple_unlock(&vp->v_interlock); - ubc_info_free(vp); vp->v_ubcinfo = UBC_NOINFO; /* catch bad accesses */ + simple_unlock(&vp->v_interlock); + ubc_info_deallocate(uip); } else { if ((vp->v_type == VBAD) && ((vp)->v_ubcinfo != UBC_INFO_NULL) && ((vp)->v_ubcinfo != UBC_NOINFO)) { - simple_unlock(&vp->v_interlock); - ubc_info_free(vp); + struct ubc_info *uip = vp->v_ubcinfo; + vp->v_ubcinfo = UBC_NOINFO; /* catch bad accesses */ + simple_unlock(&vp->v_interlock); + ubc_info_deallocate(uip); } else { simple_unlock(&vp->v_interlock); } @@ -2659,7 +2774,6 @@ int walk_vnodes_debug=0; void walk_allvnodes() { - struct proc *p = current_proc(); /* XXX */ struct mount *mp, *nmp; struct vnode *vp; int cnt = 0; @@ -2697,3 +2811,482 @@ walk_allvnodes() printf("%d - inactive\n", cnt); } #endif /* DIAGNOSTIC */ + + +struct x_constraints { + u_int32_t x_maxreadcnt; + u_int32_t x_maxsegreadsize; + u_int32_t x_maxsegwritesize; +}; + + +void +vfs_io_attributes(vp, flags, iosize, vectors) + struct vnode *vp; + int flags; /* B_READ or B_WRITE */ + int *iosize; + int *vectors; +{ + struct mount *mp; + + /* start with "reasonable" defaults */ + *iosize = MAXPHYS; + *vectors = 32; + + mp = vp->v_mount; + if (mp != NULL) { + switch (flags) { + case B_READ: + if (mp->mnt_kern_flag & MNTK_IO_XINFO) + *iosize = ((struct x_constraints *)(mp->mnt_xinfo_ptr))->x_maxreadcnt; + else + *iosize = mp->mnt_maxreadcnt; + *vectors = mp->mnt_segreadcnt; + break; + case B_WRITE: + *iosize = mp->mnt_maxwritecnt; + *vectors = mp->mnt_segwritecnt; + break; + default: + break; + } + if (*iosize == 0) + *iosize = MAXPHYS; + if (*vectors == 0) + *vectors = 32; + } + return; +} + +__private_extern__ +void +vfs_io_maxsegsize(vp, flags, maxsegsize) + struct vnode *vp; + int flags; /* B_READ or 
B_WRITE */ + int *maxsegsize; +{ + struct mount *mp; + + /* start with "reasonable" default */ + *maxsegsize = MAXPHYS; + + mp = vp->v_mount; + if (mp != NULL) { + switch (flags) { + case B_READ: + if (mp->mnt_kern_flag & MNTK_IO_XINFO) + *maxsegsize = ((struct x_constraints *)(mp->mnt_xinfo_ptr))->x_maxsegreadsize; + else + /* + * if the extended info doesn't exist + * then use the maxread I/O size as the + * max segment size... this is the previous behavior + */ + *maxsegsize = mp->mnt_maxreadcnt; + break; + case B_WRITE: + if (mp->mnt_kern_flag & MNTK_IO_XINFO) + *maxsegsize = ((struct x_constraints *)(mp->mnt_xinfo_ptr))->x_maxsegwritesize; + else + /* + * if the extended info doesn't exist + * then use the maxwrite I/O size as the + * max segment size... this is the previous behavior + */ + *maxsegsize = mp->mnt_maxwritecnt; + break; + default: + break; + } + if (*maxsegsize == 0) + *maxsegsize = MAXPHYS; + } +} + + +#include + + +int +vfs_init_io_attributes(devvp, mp) + struct vnode *devvp; + struct mount *mp; +{ + int error; + off_t readblockcnt; + off_t writeblockcnt; + off_t readmaxcnt; + off_t writemaxcnt; + off_t readsegcnt; + off_t writesegcnt; + off_t readsegsize; + off_t writesegsize; + u_long blksize; + + u_int64_t temp; + + struct proc *p = current_proc(); + struct ucred *cred = p->p_ucred; + + int isvirtual = 0; + /* + * determine if this mount point exists on the same device as the root + * partition... if so, then it comes under the hard throttle control + */ + int thisunit = -1; + static int rootunit = -1; + extern struct vnode *rootvp; + + if (rootunit == -1) { + if (VOP_IOCTL(rootvp, DKIOCGETBSDUNIT, (caddr_t)&rootunit, 0, cred, p)) + rootunit = -1; + else if (rootvp == devvp) + mp->mnt_kern_flag |= MNTK_ROOTDEV; + } + if (devvp != rootvp && rootunit != -1) { + if (VOP_IOCTL(devvp, DKIOCGETBSDUNIT, (caddr_t)&thisunit, 0, cred, p) == 0) { + if (thisunit == rootunit) + mp->mnt_kern_flag |= MNTK_ROOTDEV; + } + } + if (VOP_IOCTL(devvp, DKIOCGETISVIRTUAL, (caddr_t)&isvirtual, 0, cred, p) == 0) { + if (isvirtual) + mp->mnt_kern_flag |= MNTK_VIRTUALDEV; + } + + if ((error = VOP_IOCTL(devvp, DKIOCGETMAXBLOCKCOUNTREAD, + (caddr_t)&readblockcnt, 0, cred, p))) + return (error); + + if ((error = VOP_IOCTL(devvp, DKIOCGETMAXBLOCKCOUNTWRITE, + (caddr_t)&writeblockcnt, 0, cred, p))) + return (error); + + if ((error = VOP_IOCTL(devvp, DKIOCGETMAXBYTECOUNTREAD, + (caddr_t)&readmaxcnt, 0, cred, p))) + return (error); + + if ((error = VOP_IOCTL(devvp, DKIOCGETMAXBYTECOUNTWRITE, + (caddr_t)&writemaxcnt, 0, cred, p))) + return (error); + + if ((error = VOP_IOCTL(devvp, DKIOCGETMAXSEGMENTCOUNTREAD, + (caddr_t)&readsegcnt, 0, cred, p))) + return (error); + + if ((error = VOP_IOCTL(devvp, DKIOCGETMAXSEGMENTCOUNTWRITE, + (caddr_t)&writesegcnt, 0, cred, p))) + return (error); + + if ((error = VOP_IOCTL(devvp, DKIOCGETMAXSEGMENTBYTECOUNTREAD, + (caddr_t)&readsegsize, 0, cred, p))) + return (error); + + if ((error = VOP_IOCTL(devvp, DKIOCGETMAXSEGMENTBYTECOUNTWRITE, + (caddr_t)&writesegsize, 0, cred, p))) + return (error); + + if ((error = VOP_IOCTL(devvp, DKIOCGETBLOCKSIZE, + (caddr_t)&blksize, 0, cred, p))) + return (error); + + + if ( !(mp->mnt_kern_flag & MNTK_IO_XINFO)) { + MALLOC(mp->mnt_xinfo_ptr, void *, sizeof(struct x_constraints), M_TEMP, M_WAITOK); + mp->mnt_kern_flag |= MNTK_IO_XINFO; + } + + if (readmaxcnt) + temp = (readmaxcnt > UINT32_MAX) ? UINT32_MAX : readmaxcnt; + else { + if (readblockcnt) { + temp = readblockcnt * blksize; + temp = (temp > UINT32_MAX) ? 
UINT32_MAX : temp; + } else + temp = MAXPHYS; + } + ((struct x_constraints *)(mp->mnt_xinfo_ptr))->x_maxreadcnt = (u_int32_t)temp; + + if (writemaxcnt) + temp = (writemaxcnt > UINT32_MAX) ? UINT32_MAX : writemaxcnt; + else { + if (writeblockcnt) { + temp = writeblockcnt * blksize; + temp = (temp > UINT32_MAX) ? UINT32_MAX : temp; + } else + temp = MAXPHYS; + } + mp->mnt_maxwritecnt = (u_int32_t)temp; + + if (readsegcnt) { + temp = (readsegcnt > UINT16_MAX) ? UINT16_MAX : readsegcnt; + mp->mnt_segreadcnt = (u_int16_t)temp; + } + if (writesegcnt) { + temp = (writesegcnt > UINT16_MAX) ? UINT16_MAX : writesegcnt; + mp->mnt_segwritecnt = (u_int16_t)temp; + } + if (readsegsize) + temp = (readsegsize > UINT32_MAX) ? UINT32_MAX : readsegsize; + else + temp = mp->mnt_maxreadcnt; + ((struct x_constraints *)(mp->mnt_xinfo_ptr))->x_maxsegreadsize = (u_int32_t)temp; + + if (writesegsize) + temp = (writesegsize > UINT32_MAX) ? UINT32_MAX : writesegsize; + else + temp = mp->mnt_maxwritecnt; + ((struct x_constraints *)(mp->mnt_xinfo_ptr))->x_maxsegwritesize = (u_int32_t)temp; + + return (error); +} + +static struct klist fs_klist; + +void +vfs_event_init(void) +{ + + klist_init(&fs_klist); +} + +void +vfs_event_signal(fsid_t *fsid, u_int32_t event, intptr_t data) +{ + + KNOTE(&fs_klist, event); +} + +/* + * return the number of mounted filesystems. + */ +static int +sysctl_vfs_getvfscnt(void) +{ + struct mount *mp; + int ret = 0; + + simple_lock(&mountlist_slock); + CIRCLEQ_FOREACH(mp, &mountlist, mnt_list) + ret++; + simple_unlock(&mountlist_slock); + return (ret); +} + +/* + * fill in the array of fsid_t's up to a max of 'count', the actual + * number filled in will be set in '*actual'. If there are more fsid_t's + * than room in fsidlst then ENOMEM will be returned and '*actual' will + * have the actual count. + * having *actual filled out even in the error case is depended upon. + */ +static int +sysctl_vfs_getvfslist(fsid_t *fsidlst, int count, int *actual) +{ + struct mount *mp; + + *actual = 0; + simple_lock(&mountlist_slock); + CIRCLEQ_FOREACH(mp, &mountlist, mnt_list) { + (*actual)++; + if (*actual <= count) + fsidlst[(*actual) - 1] = mp->mnt_stat.f_fsid; + } + simple_unlock(&mountlist_slock); + return (*actual <= count ? 0 : ENOMEM); +} + +static int +sysctl_vfs_vfslist SYSCTL_HANDLER_ARGS +{ + int actual, error; + size_t space; + fsid_t *fsidlst; + + /* This is a readonly node. */ + if (req->newptr != NULL) + return (EPERM); + + /* they are querying us so just return the space required. */ + if (req->oldptr == NULL) { + req->oldidx = sysctl_vfs_getvfscnt() * sizeof(fsid_t); + return 0; + } +again: + /* + * Retrieve an accurate count of the amount of space required to copy + * out all the fsids in the system. + */ + space = req->oldlen; + req->oldlen = sysctl_vfs_getvfscnt() * sizeof(fsid_t); + + /* they didn't give us enough space. */ + if (space < req->oldlen) + return (ENOMEM); + + MALLOC(fsidlst, fsid_t *, req->oldlen, M_TEMP, M_WAITOK); + error = sysctl_vfs_getvfslist(fsidlst, req->oldlen / sizeof(fsid_t), + &actual); + /* + * If we get back ENOMEM, then another mount has been added while we + * slept in malloc above. If this is the case then try again. + */ + if (error == ENOMEM) { + FREE(fsidlst, M_TEMP); + req->oldlen = space; + goto again; + } + if (error == 0) { + error = SYSCTL_OUT(req, fsidlst, actual * sizeof(fsid_t)); + } + FREE(fsidlst, M_TEMP); + return (error); +} + +/* + * Do a sysctl by fsid. 
+ */ +static int +sysctl_vfs_ctlbyfsid SYSCTL_HANDLER_ARGS +{ + struct vfsidctl vc; + struct mount *mp; + struct statfs *sp; + struct proc *p; + int *name; + int error, flags, namelen; + + name = arg1; + namelen = arg2; + p = req->p; + + error = SYSCTL_IN(req, &vc, sizeof(vc)); + if (error) + return (error); + if (vc.vc_vers != VFS_CTL_VERS1) + return (EINVAL); + mp = vfs_getvfs(&vc.vc_fsid); + if (mp == NULL) + return (ENOENT); + /* reset so that the fs specific code can fetch it. */ + req->newidx = 0; + /* + * Note if this is a VFS_CTL then we pass the actual sysctl req + * in for "oldp" so that the lower layer can DTRT and use the + * SYSCTL_IN/OUT routines. + */ + if (mp->mnt_op->vfs_sysctl != NULL) { + error = mp->mnt_op->vfs_sysctl(name, namelen, + req, NULL, NULL, 0, req->p); + if (error != EOPNOTSUPP) + return (error); + } + switch (name[0]) { + case VFS_CTL_UMOUNT: + VCTLTOREQ(&vc, req); + error = SYSCTL_IN(req, &flags, sizeof(flags)); + if (error) + break; + error = safedounmount(mp, flags, p); + break; + case VFS_CTL_STATFS: + VCTLTOREQ(&vc, req); + error = SYSCTL_IN(req, &flags, sizeof(flags)); + if (error) + break; + sp = &mp->mnt_stat; + if (((flags & MNT_NOWAIT) == 0 || (flags & MNT_WAIT)) && + (error = VFS_STATFS(mp, sp, p))) + return (error); + sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; + error = SYSCTL_OUT(req, sp, sizeof(*sp)); + break; + default: + return (EOPNOTSUPP); + } + return (error); +} + +static int filt_fsattach(struct knote *kn); +static void filt_fsdetach(struct knote *kn); +static int filt_fsevent(struct knote *kn, long hint); + +struct filterops fs_filtops = + { 0, filt_fsattach, filt_fsdetach, filt_fsevent }; + +static int +filt_fsattach(struct knote *kn) +{ + + kn->kn_flags |= EV_CLEAR; + KNOTE_ATTACH(&fs_klist, kn); + return (0); +} + +static void +filt_fsdetach(struct knote *kn) +{ + + KNOTE_DETACH(&fs_klist, kn); +} + +static int +filt_fsevent(struct knote *kn, long hint) +{ + + kn->kn_fflags |= hint; + return (kn->kn_fflags != 0); +} + +static int +sysctl_vfs_noremotehang SYSCTL_HANDLER_ARGS +{ + int out, error; + pid_t pid; + size_t space; + struct proc *p; + + /* We need a pid. */ + if (req->newptr == NULL) + return (EINVAL); + + error = SYSCTL_IN(req, &pid, sizeof(pid)); + if (error) + return (error); + + p = pfind(pid < 0 ? -pid : pid); + if (p == NULL) + return (ESRCH); + + /* + * Fetching the value is ok, but we only fetch if the old + * pointer is given. + */ + if (req->oldptr != NULL) { + out = !((p->p_flag & P_NOREMOTEHANG) == 0); + error = SYSCTL_OUT(req, &out, sizeof(out)); + return (error); + } + + /* cansignal offers us enough security. */ + if (p != req->p && suser(req->p->p_ucred, &req->p->p_acflag) != 0) + return (EPERM); + + if (pid < 0) + p->p_flag &= ~P_NOREMOTEHANG; + else + p->p_flag |= P_NOREMOTEHANG; + + return (0); +} +/* the vfs.generic. branch. */ +SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RW, 0, "vfs generic hinge"); +/* retreive a list of mounted filesystem fsid_t */ +SYSCTL_PROC(_vfs_generic, OID_AUTO, vfsidlist, CTLFLAG_RD, + 0, 0, sysctl_vfs_vfslist, "S,fsid", "List of mounted filesystem ids"); +/* perform operations on filesystem via fsid_t */ +SYSCTL_NODE(_vfs_generic, OID_AUTO, ctlbyfsid, CTLFLAG_RW, + sysctl_vfs_ctlbyfsid, "ctlbyfsid"); +SYSCTL_PROC(_vfs_generic, OID_AUTO, noremotehang, CTLFLAG_RW, + 0, 0, sysctl_vfs_noremotehang, "I", "noremotehang"); +
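reset_vmobjectcache() is a good example of the small correctness fixes mixed into this diff: the old body computed val2 - VNODE_FREE_MIN directly, and since both operands are unsigned the result wraps to a huge number whenever val2 drops below VNODE_FREE_MIN, instead of clamping to zero the way the patched code does. A minimal user-space sketch of the failure mode and the fix, with illustrative names only, looks like this:

#include <stdint.h>
#include <stdio.h>

#define VNODE_FREE_MIN 300   /* mirrors the constant this diff raises from 100 to 300 */

/* naive form: unsigned subtraction wraps when val2 < VNODE_FREE_MIN */
static uint32_t cache_target_naive(uint32_t val2)
{
    return val2 - VNODE_FREE_MIN;
}

/* clamped form, as the patched reset_vmobjectcache() now does */
static uint32_t cache_target_clamped(uint32_t val2)
{
    return (val2 < VNODE_FREE_MIN) ? 0 : val2 - VNODE_FREE_MIN;
}

int main(void)
{
    printf("naive(100)   = %u\n", (unsigned)cache_target_naive(100));   /* wraps to ~4.29e9 */
    printf("clamped(100) = %u\n", (unsigned)cache_target_clamped(100)); /* 0 */
    printf("clamped(500) = %u\n", (unsigned)cache_target_clamped(500)); /* 200 */
    return 0;
}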
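build_path(), the helper behind the new vn_getpath(), assembles a vnode's path by walking the VPARENT() chain, copying each VNAME() component backwards from the tail of the caller's buffer, and finally sliding the finished string to the front with memmove(). The same backwards-assembly technique, stripped of the vnode and mount-point crossing details, can be sketched in user space; the node type and names below are invented for illustration:

#include <errno.h>
#include <stdio.h>
#include <string.h>

struct node {              /* stand-in for a vnode with name/parent pointers */
    const char  *name;
    struct node *parent;   /* points at itself at the root, ending the walk */
};

/* Build the path for 'n' backwards into buff[buflen], as build_path() does. */
static int build_path(struct node *n, char *buff, int buflen, int *outlen)
{
    char *end = &buff[buflen - 1];

    *end = '\0';
    while (n != NULL && n->parent != n) {
        int len = (int)strlen(n->name);

        if ((end - buff) < len + 1)    /* room for the component plus '/' */
            return ENOSPC;
        end -= len;
        memcpy(end, n->name, len);     /* copy the component in front of what we have */
        *--end = '/';                  /* then the separator */
        n = n->parent;
    }
    *outlen = (int)(&buff[buflen] - end);
    memmove(buff, end, *outlen);       /* slide the finished path to the front */
    return 0;
}

int main(void)
{
    struct node root = { "", NULL };
    struct node usr  = { "usr", &root };
    struct node bin  = { "bin", &usr };
    char path[64];
    int len;

    root.parent = &root;
    if (build_path(&bin, path, sizeof(path), &len) == 0)
        printf("%s (%d bytes incl. NUL)\n", path, len);   /* prints /usr/bin */
    return 0;
}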
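sysctl_vfs_vfslist() has to copy out a list that can grow while it runs: it sizes the buffer from sysctl_vfs_getvfscnt(), fills it with sysctl_vfs_getvfslist(), and if the fill reports ENOMEM because a mount appeared in between, it restores the caller's length and starts over. The sketch below shows the same snapshot-and-retry idea against a toy registry; every name in it is made up for illustration:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

/* A toy registry whose size can change between the size query and the copy,
 * standing in for the mount list walked by sysctl_vfs_vfslist(). */
static int registry_size = 3;
static int grow_once = 1;

static int registry_count(void) { return registry_size; }

/* Fill up to 'count' entries, report the true number in '*actual', and
 * return ENOMEM if the registry no longer fits, as sysctl_vfs_getvfslist() does. */
static int registry_fill(int *buf, int count, int *actual)
{
    if (grow_once) {            /* simulate an entry appearing mid-snapshot */
        grow_once = 0;
        registry_size += 2;
    }
    *actual = registry_size;
    if (*actual > count)
        return ENOMEM;
    for (int i = 0; i < *actual; i++)
        buf[i] = i;
    return 0;
}

/* Snapshot with retry: size, allocate, fill, and start over if it grew. */
static int *snapshot(int *n)
{
    for (;;) {
        int count = registry_count();
        int *buf = malloc((size_t)count * sizeof(*buf));
        int actual;

        if (buf == NULL)
            return NULL;
        if (registry_fill(buf, count, &actual) == 0) {
            *n = actual;
            return buf;
        }
        free(buf);              /* raced with growth: drop it and retry */
    }
}

int main(void)
{
    int n;
    int *ids = snapshot(&n);

    printf("snapshot of %d entries\n", n);
    free(ids);
    return 0;
}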
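vfs_event_init(), vfs_event_signal(), and fs_filtops give filesystem code a knote list to post events on; in shipped xnu this is what the EVFILT_FS kqueue filter is wired to. Assuming that wiring and a BSD/macOS <sys/event.h> that defines EVFILT_FS, a minimal user-space consumer would look roughly like this (error handling trimmed):

#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
    int kq = kqueue();
    struct kevent kev;

    if (kq < 0)
        return 1;
    /* ident is unused for EVFILT_FS; EV_CLEAR matches what filt_fsattach() forces */
    EV_SET(&kev, 0, EVFILT_FS, EV_ADD | EV_CLEAR, 0, 0, NULL);
    if (kevent(kq, &kev, 1, NULL, 0, NULL) < 0)
        return 1;

    /* block until vfs_event_signal() posts a filesystem event */
    if (kevent(kq, NULL, 0, &kev, 1, NULL) == 1)
        printf("fs event, fflags=0x%x\n", (unsigned)kev.fflags);
    close(kq);
    return 0;
}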