X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/de355530ae67247cbd0da700edb3a2a1dae884c2..eee3565979933af707c711411001ba11fe406a3c:/bsd/nfs/nfs_bio.c diff --git a/bsd/nfs/nfs_bio.c b/bsd/nfs/nfs_bio.c index 4b77d6637..acaf26c24 100644 --- a/bsd/nfs/nfs_bio.c +++ b/bsd/nfs/nfs_bio.c @@ -1,23 +1,29 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2015 Apple Inc. All rights reserved. * - * @APPLE_LICENSE_HEADER_START@ + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * - * The contents of this file constitute Original Code as defined in and - * are subject to the Apple Public Source License Version 1.1 (the - * "License"). You may not use this file except in compliance with the - * License. Please obtain a copy of the License at - * http://www.apple.com/publicsource and read it before using this file. + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. * - * This Original Code and all software distributed under the License are - * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the - * License for the specific language governing rights and limitations - * under the License. + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. * - * @APPLE_LICENSE_HEADER_END@ + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */ /* @@ -62,1321 +68,3837 @@ #include #include #include -#include -#include +#include +#include +#include #include -#include +#include +#include #include -#include -#include +#include +#include +#include #include #include #include #include +#include +#include +#include #include #include #include +#include #include -#include #include +#include +#include + +#define NFS_BIO_DBG(...) 
NFS_DBG(NFS_FAC_BIO, 7, ## __VA_ARGS__) + +kern_return_t thread_terminate(thread_t); /* XXX */ + +#define NFSBUFHASH(np, lbn) \ + (&nfsbufhashtbl[((long)(np) / sizeof(*(np)) + (int)(lbn)) & nfsbufhash]) +LIST_HEAD(nfsbufhashhead, nfsbuf) *nfsbufhashtbl; +struct nfsbuffreehead nfsbuffree, nfsbuffreemeta, nfsbufdelwri; +u_long nfsbufhash; +int nfsbufcnt, nfsbufmin, nfsbufmax, nfsbufmetacnt, nfsbufmetamax; +int nfsbuffreecnt, nfsbuffreemetacnt, nfsbufdelwricnt, nfsneedbuffer; +int nfs_nbdwrite; +int nfs_buf_timer_on = 0; +thread_t nfsbufdelwrithd = NULL; -#include +lck_grp_t *nfs_buf_lck_grp; +lck_mtx_t *nfs_buf_mutex; -#define FSDBG(A, B, C, D, E) \ - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_NONE, \ - (int)(B), (int)(C), (int)(D), (int)(E), 0) -#define FSDBG_TOP(A, B, C, D, E) \ - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_START, \ - (int)(B), (int)(C), (int)(D), (int)(E), 0) -#define FSDBG_BOT(A, B, C, D, E) \ - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_END, \ - (int)(B), (int)(C), (int)(D), (int)(E), 0) +#define NFSBUF_FREE_PERIOD 30 /* seconds */ +#define NFSBUF_LRU_STALE 120 +#define NFSBUF_META_STALE 240 -static struct buf *nfs_getcacheblk __P((struct vnode *vp, daddr_t bn, int size, - struct proc *p, int operation)); +/* number of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffree list */ +#define LRU_TO_FREEUP 6 +/* number of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffreemeta list */ +#define META_TO_FREEUP 3 +/* total number of nfsbufs nfs_buf_freeup() should attempt to free */ +#define TOTAL_TO_FREEUP (LRU_TO_FREEUP+META_TO_FREEUP) +/* fraction of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffree list when called from timer */ +#define LRU_FREEUP_FRAC_ON_TIMER 8 +/* fraction of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffreemeta list when called from timer */ +#define META_FREEUP_FRAC_ON_TIMER 16 +/* fraction of total nfsbufs that nfsbuffreecnt should exceed before bothering to call nfs_buf_freeup() */ +#define LRU_FREEUP_MIN_FRAC 4 +/* fraction of total nfsbufs that nfsbuffreemetacnt should exceed before bothering to call nfs_buf_freeup() */ +#define META_FREEUP_MIN_FRAC 2 -extern int nfs_numasync; -extern struct nfsstats nfsstats; -extern int nbdwrite; +#define NFS_BUF_FREEUP() \ + do { \ + /* only call nfs_buf_freeup() if it has work to do: */ \ + if (((nfsbuffreecnt > nfsbufcnt/LRU_FREEUP_MIN_FRAC) || \ + (nfsbuffreemetacnt > nfsbufcnt/META_FREEUP_MIN_FRAC)) && \ + ((nfsbufcnt - TOTAL_TO_FREEUP) > nfsbufmin)) \ + nfs_buf_freeup(0); \ + } while (0) /* - * Vnode op for read using bio - * Any similarity to readip() is purely coincidental + * Initialize nfsbuf lists */ -int -nfs_bioread(vp, uio, ioflag, cred, getpages) - register struct vnode *vp; - register struct uio *uio; - int ioflag; - struct ucred *cred; - int getpages; +void +nfs_nbinit(void) { - register struct nfsnode *np = VTONFS(vp); - register int biosize, i; - off_t diff; - struct buf *bp = 0, *rabp; - struct vattr vattr; - struct proc *p; - struct nfsmount *nmp = VFSTONFS(vp->v_mount); - daddr_t lbn, rabn; - int bufsize; - int nra, error = 0, n = 0, on = 0, not_readin; - int operation = (getpages? 
BLK_PAGEIN : BLK_READ); - -#if DIAGNOSTIC - if (uio->uio_rw != UIO_READ) - panic("nfs_read mode"); -#endif - if (uio->uio_resid == 0) - return (0); - if (uio->uio_offset < 0) - return (EINVAL); - p = uio->uio_procp; - if ((nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_GOTFSINFO)) == NFSMNT_NFSV3) - (void)nfs_fsinfo(nmp, vp, cred, p); - /*due to getblk/vm interractions, use vm page size or less values */ - biosize = min(vp->v_mount->mnt_stat.f_iosize, PAGE_SIZE); - /* - * For nfs, cache consistency can only be maintained approximately. - * Although RFC1094 does not specify the criteria, the following is - * believed to be compatible with the reference port. - * For nqnfs, full cache consistency is maintained within the loop. - * For nfs: - * If the file's modify time on the server has changed since the - * last read rpc or you have written to the file, - * you may have lost data cache consistency with the - * server, so flush all of the file's data out of the cache. - * Then force a getattr rpc to ensure that you have up to date - * attributes. - * NB: This implies that cache data can be read when up to - * NFS_ATTRTIMEO seconds out of date. If you find that you need current - * attributes this could be forced by setting n_attrstamp to 0 before - * the VOP_GETATTR() call. - */ - if ((nmp->nm_flag & NFSMNT_NQNFS) == 0) { - if (np->n_flag & NMODIFIED) { - if (vp->v_type != VREG) { - if (vp->v_type != VDIR) - panic("nfs: bioread, not dir"); - nfs_invaldir(vp); - error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); - if (error) - return (error); - } - np->n_attrstamp = 0; - error = VOP_GETATTR(vp, &vattr, cred, p); - if (error) - return (error); - np->n_mtime = vattr.va_mtime.tv_sec; - } else { - error = VOP_GETATTR(vp, &vattr, cred, p); - if (error) - return (error); - if (np->n_mtime != vattr.va_mtime.tv_sec) { - if (vp->v_type == VDIR) - nfs_invaldir(vp); - error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); - if (error) - return (error); - np->n_mtime = vattr.va_mtime.tv_sec; - } - } - } - do { + nfs_buf_lck_grp = lck_grp_alloc_init("nfs_buf", LCK_GRP_ATTR_NULL); + nfs_buf_mutex = lck_mtx_alloc_init(nfs_buf_lck_grp, LCK_ATTR_NULL); - /* - * Get a valid lease. If cached data is stale, flush it. 
- */ - if (nmp->nm_flag & NFSMNT_NQNFS) { - if (NQNFS_CKINVALID(vp, np, ND_READ)) { - do { - error = nqnfs_getlease(vp, ND_READ, cred, p); - } while (error == NQNFS_EXPIRED); - if (error) - return (error); - if (np->n_lrev != np->n_brev || - (np->n_flag & NQNFSNONCACHE) || - ((np->n_flag & NMODIFIED) && vp->v_type == VDIR)) { - if (vp->v_type == VDIR) - nfs_invaldir(vp); - error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); - if (error) - return (error); - np->n_brev = np->n_lrev; - } - } else if (vp->v_type == VDIR && (np->n_flag & NMODIFIED)) { - nfs_invaldir(vp); - error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); - if (error) - return (error); - } - } - if (np->n_flag & NQNFSNONCACHE) { - switch (vp->v_type) { - case VREG: - return (nfs_readrpc(vp, uio, cred)); - case VLNK: - return (nfs_readlinkrpc(vp, uio, cred)); - case VDIR: - break; - default: - printf(" NQNFSNONCACHE: type %x unexpected\n", - vp->v_type); - }; - } - switch (vp->v_type) { - case VREG: - nfsstats.biocache_reads++; - lbn = uio->uio_offset / biosize; - on = uio->uio_offset & (biosize - 1); - not_readin = 1; + nfsbufcnt = nfsbufmetacnt = + nfsbuffreecnt = nfsbuffreemetacnt = nfsbufdelwricnt = 0; + nfsbufmin = 128; + /* size nfsbufmax to cover at most half sane_size (w/default buf size) */ + nfsbufmax = (sane_size >> PAGE_SHIFT) / (2 * (NFS_RWSIZE >> PAGE_SHIFT)); + nfsbufmetamax = nfsbufmax / 4; + nfsneedbuffer = 0; + nfs_nbdwrite = 0; - /* - * Start the read ahead(s), as required. - */ - if (nfs_numasync > 0 && nmp->nm_readahead > 0) { - for (nra = 0; nra < nmp->nm_readahead && - (off_t)(lbn + 1 + nra) * biosize < np->n_size; - nra++) { - rabn = lbn + 1 + nra; - if (!incore(vp, rabn)) { - rabp = nfs_getcacheblk(vp, rabn, biosize, p, operation); - if (!rabp) - return (EINTR); - if (!ISSET(rabp->b_flags, (B_CACHE|B_DELWRI))) { - SET(rabp->b_flags, (B_READ | B_ASYNC)); - if (nfs_asyncio(rabp, cred)) { - SET(rabp->b_flags, (B_INVAL|B_ERROR)); - rabp->b_error = EIO; - brelse(rabp); - } - } else - brelse(rabp); - } - } - } + nfsbufhashtbl = hashinit(nfsbufmax/4, M_TEMP, &nfsbufhash); + TAILQ_INIT(&nfsbuffree); + TAILQ_INIT(&nfsbuffreemeta); + TAILQ_INIT(&nfsbufdelwri); - /* - * If the block is in the cache and has the required data - * in a valid region, just copy it out. - * Otherwise, get the block and write back/read in, - * as required. - */ -again: - bufsize = biosize; - if ((off_t)(lbn + 1) * biosize > np->n_size && - (off_t)(lbn + 1) * biosize - np->n_size < biosize) { - bufsize = np->n_size - (off_t)lbn * biosize; - bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); - } - bp = nfs_getcacheblk(vp, lbn, bufsize, p, operation); - if (!bp) - return (EINTR); +} - if (!ISSET(bp->b_flags, B_CACHE)) { - SET(bp->b_flags, B_READ); - CLR(bp->b_flags, (B_DONE | B_ERROR | B_INVAL)); - not_readin = 0; - error = nfs_doio(bp, cred, p); - if (error) { - brelse(bp); - return (error); - } - } - if (bufsize > on) { - n = min((unsigned)(bufsize - on), uio->uio_resid); - } else { - n = 0; - } - diff = np->n_size - uio->uio_offset; - if (diff < n) - n = diff; - if (not_readin && n > 0) { - if (on < bp->b_validoff || (on + n) > bp->b_validend) { - SET(bp->b_flags, (B_NOCACHE|B_INVAFTERWRITE)); - if (bp->b_dirtyend > 0) { - if (!ISSET(bp->b_flags, B_DELWRI)) - panic("nfsbioread"); - if (VOP_BWRITE(bp) == EINTR) - return (EINTR); - } else - brelse(bp); - goto again; - } - } - vp->v_lastr = lbn; - diff = (on >= bp->b_validend) ? 
0 : (bp->b_validend - on); - if (diff < n) - n = diff; - break; - case VLNK: - nfsstats.biocache_readlinks++; - bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, p, operation); - if (!bp) - return (EINTR); - if (!ISSET(bp->b_flags, B_CACHE)) { - SET(bp->b_flags, B_READ); - error = nfs_doio(bp, cred, p); - if (error) { - SET(bp->b_flags, B_ERROR); - brelse(bp); - return (error); - } - } - n = min(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid); - on = 0; - break; - case VDIR: - nfsstats.biocache_readdirs++; - if (np->n_direofoffset - && uio->uio_offset >= np->n_direofoffset) { - return (0); - } - lbn = uio->uio_offset / NFS_DIRBLKSIZ; - on = uio->uio_offset & (NFS_DIRBLKSIZ - 1); - bp = nfs_getcacheblk(vp, lbn, NFS_DIRBLKSIZ, p, operation); - if (!bp) - return (EINTR); - if (!ISSET(bp->b_flags, B_CACHE)) { - SET(bp->b_flags, B_READ); - error = nfs_doio(bp, cred, p); - if (error) { - brelse(bp); - } - while (error == NFSERR_BAD_COOKIE) { - nfs_invaldir(vp); - error = nfs_vinvalbuf(vp, 0, cred, p, 1); - /* - * Yuck! The directory has been modified on the - * server. The only way to get the block is by - * reading from the beginning to get all the - * offset cookies. - */ - for (i = 0; i <= lbn && !error; i++) { - if (np->n_direofoffset - && (i * NFS_DIRBLKSIZ) >= np->n_direofoffset) - return (0); - bp = nfs_getcacheblk(vp, i, NFS_DIRBLKSIZ, p, - operation); - if (!bp) - return (EINTR); - if (!ISSET(bp->b_flags, B_CACHE)) { - SET(bp->b_flags, B_READ); - error = nfs_doio(bp, cred, p); - /* - * no error + B_INVAL == directory EOF, - * use the block. - */ - if (error == 0 && (bp->b_flags & B_INVAL)) - break; - } - /* - * An error will throw away the block and the - * for loop will break out. If no error and this - * is not the block we want, we throw away the - * block and go for the next one via the for loop. - */ - if (error || i < lbn) - brelse(bp); - } - } - /* - * The above while is repeated if we hit another cookie - * error. If we hit an error and it wasn't a cookie error, - * we give up. - */ - if (error) - return (error); - } +/* + * Check periodically for stale/unused nfs bufs + */ +void +nfs_buf_timer(__unused void *param0, __unused void *param1) +{ + nfs_buf_freeup(1); - /* - * If not eof and read aheads are enabled, start one. - * (You need the current block first, so that you have the - * directory offset cookie of the next block.) - */ - if (nfs_numasync > 0 && nmp->nm_readahead > 0 && - (np->n_direofoffset == 0 || - (lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) && - !(np->n_flag & NQNFSNONCACHE) && - !incore(vp, lbn + 1)) { - rabp = nfs_getcacheblk(vp, lbn + 1, NFS_DIRBLKSIZ, p, - operation); - if (rabp) { - if (!ISSET(rabp->b_flags, (B_CACHE|B_DELWRI))) { - SET(rabp->b_flags, (B_READ | B_ASYNC)); - if (nfs_asyncio(rabp, cred)) { - SET(rabp->b_flags, (B_INVAL|B_ERROR)); - rabp->b_error = EIO; - brelse(rabp); - } - } else { - brelse(rabp); - } - } - } - /* - * Make sure we use a signed variant of min() since - * the second term may be negative. - */ - n = lmin(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid - on); - /* - * Unlike VREG files, whos buffer size ( bp->b_bcount ) is - * chopped for the EOF condition, we cannot tell how large - * NFS directories are going to be until we hit EOF. So - * an NFS directory buffer is *not* chopped to its EOF. Now, - * it just so happens that b_resid will effectively chop it - * to EOF. *BUT* this information is lost if the buffer goes - * away and is reconstituted into a B_CACHE state (recovered - * from VM) later. 
So we keep track of the directory eof - * in np->n_direofoffset and chop it off as an extra step - * right here. - */ - if (np->n_direofoffset && - n > np->n_direofoffset - uio->uio_offset) - n = np->n_direofoffset - uio->uio_offset; - break; - default: - printf(" nfs_bioread: type %x unexpected\n",vp->v_type); - break; - }; + lck_mtx_lock(nfs_buf_mutex); + if (nfsbufcnt <= nfsbufmin) { + nfs_buf_timer_on = 0; + lck_mtx_unlock(nfs_buf_mutex); + return; + } + lck_mtx_unlock(nfs_buf_mutex); - if (n > 0) { - error = uiomove(bp->b_data + on, (int)n, uio); - } - switch (vp->v_type) { - case VREG: - break; - case VLNK: - n = 0; - break; - case VDIR: - if (np->n_flag & NQNFSNONCACHE) - SET(bp->b_flags, B_INVAL); - break; - default: - printf(" nfs_bioread: type %x unexpected\n",vp->v_type); - } - brelse(bp); - } while (error == 0 && uio->uio_resid > 0 && n > 0); - return (error); + nfs_interval_timer_start(nfs_buf_timer_call, + NFSBUF_FREE_PERIOD * 1000); } - /* - * Vnode op for write using bio + * try to free up some excess, unused nfsbufs */ -int -nfs_write(ap) - struct vop_write_args /* { - struct vnode *a_vp; - struct uio *a_uio; - int a_ioflag; - struct ucred *a_cred; - } */ *ap; +void +nfs_buf_freeup(int timer) { - register int biosize; - register struct uio *uio = ap->a_uio; - struct proc *p = uio->uio_procp; - register struct vnode *vp = ap->a_vp; - struct nfsnode *np = VTONFS(vp); - register struct ucred *cred = ap->a_cred; - int ioflag = ap->a_ioflag; - struct buf *bp; - struct vattr vattr; - struct nfsmount *nmp = VFSTONFS(vp->v_mount); - daddr_t lbn; - int bufsize; - int n, on, error = 0, iomode, must_commit; - off_t boff; - struct iovec iov; - struct uio auio; - -#if DIAGNOSTIC - if (uio->uio_rw != UIO_WRITE) - panic("nfs_write mode"); - if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != current_proc()) - panic("nfs_write proc"); -#endif - if (vp->v_type != VREG) - return (EIO); - if (np->n_flag & NWRITEERR) { - np->n_flag &= ~NWRITEERR; - return (np->n_error); - } - if ((nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_GOTFSINFO)) == NFSMNT_NFSV3) - (void)nfs_fsinfo(nmp, vp, cred, p); - if (ioflag & (IO_APPEND | IO_SYNC)) { - if (np->n_flag & NMODIFIED) { - np->n_attrstamp = 0; - error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); - if (error) - return (error); - } - if (ioflag & IO_APPEND) { - np->n_attrstamp = 0; - error = VOP_GETATTR(vp, &vattr, cred, p); - if (error) - return (error); - uio->uio_offset = np->n_size; - } - } - if (uio->uio_offset < 0) - return (EINVAL); - if (uio->uio_resid == 0) - return (0); - /* - * Maybe this should be above the vnode op call, but so long as - * file servers have no limits, i don't think it matters - */ - if (p && uio->uio_offset + uio->uio_resid > - p->p_rlimit[RLIMIT_FSIZE].rlim_cur) { - psignal(p, SIGXFSZ); - return (EFBIG); - } - /* - * I use nm_rsize, not nm_wsize so that all buffer cache blocks - * will be the same size within a filesystem. nfs_writerpc will - * still use nm_wsize when sizing the rpc's. - */ - /*due to getblk/vm interractions, use vm page size or less values */ - biosize = min(vp->v_mount->mnt_stat.f_iosize, PAGE_SIZE); + struct nfsbuf *fbp; + struct timeval now; + int count; + struct nfsbuffreehead nfsbuffreeup; - do { - /* - * Check for a valid write lease. 
- */ - if ((nmp->nm_flag & NFSMNT_NQNFS) && - NQNFS_CKINVALID(vp, np, ND_WRITE)) { - do { - error = nqnfs_getlease(vp, ND_WRITE, cred, p); - } while (error == NQNFS_EXPIRED); - if (error) - return (error); - if (np->n_lrev != np->n_brev || - (np->n_flag & NQNFSNONCACHE)) { - error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); - if (error) - return (error); - np->n_brev = np->n_lrev; - } - } - if ((np->n_flag & NQNFSNONCACHE) && uio->uio_iovcnt == 1) { - iomode = NFSV3WRITE_FILESYNC; - error = nfs_writerpc(vp, uio, cred, &iomode, &must_commit); - if (must_commit) - nfs_clearcommit(vp->v_mount); - return (error); - } - nfsstats.biocache_writes++; - lbn = uio->uio_offset / biosize; - on = uio->uio_offset & (biosize-1); - n = min((unsigned)(biosize - on), uio->uio_resid); -again: - bufsize = biosize; -#if 0 -/* (removed for UBC) */ - if ((lbn + 1) * biosize > np->n_size) { - bufsize = np->n_size - lbn * biosize; - bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); - } -#endif - /* - * Get a cache block for writing. The range to be written is - * (off..off+len) within the block. We ensure that the block - * either has no dirty region or that the given range is - * contiguous with the existing dirty region. - */ - bp = nfs_getcacheblk(vp, lbn, bufsize, p, BLK_WRITE); - if (!bp) - return (EINTR); - /* - * Resize nfsnode *after* we busy the buffer to prevent - * readers from reading garbage. - * If there was a partial buf at the old eof, validate - * and zero the new bytes. - */ - if (uio->uio_offset + n > np->n_size) { - struct buf *bp0 = NULL; - daddr_t bn = np->n_size / biosize; - int off = np->n_size & (biosize - 1); - - if (off && bn < lbn && incore(vp, bn)) - bp0 = nfs_getcacheblk(vp, bn, biosize, p, - BLK_WRITE); - np->n_flag |= NMODIFIED; - np->n_size = uio->uio_offset + n; - ubc_setsize(vp, (off_t)np->n_size); /* XXX errors */ - if (bp0) { - bzero((char *)bp0->b_data + off, biosize - off); - bp0->b_validend = biosize; - brelse(bp0); - } - } - /* - * NFS has embedded ucred so crhold() risks zone corruption - */ - if (bp->b_wcred == NOCRED) - bp->b_wcred = crdup(cred); - /* - * If dirtyend exceeds file size, chop it down. This should - * not occur unless there is a race. - */ - if ((off_t)bp->b_blkno * DEV_BSIZE + bp->b_dirtyend > - np->n_size) - bp->b_dirtyend = np->n_size - (off_t)bp->b_blkno * - DEV_BSIZE; - /* - * UBC doesn't (yet) handle partial pages so nfs_biowrite was - * hacked to never bdwrite, to start every little write right - * away. Running IE Avie noticed the performance problem, thus - * this code, which permits those delayed writes by ensuring an - * initial read of the entire page. The read may hit eof - * ("short read") but that we will handle. - * - * We are quite dependant on the correctness of B_CACHE so check - * that first in case of problems. 
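 * The priming read below pulls in one whole page starting at the
 * buffer's base offset; if the read comes up short at EOF, the
 * remainder is zero-filled before the valid range is set, so the
 * page is fully initialized before the delayed write proceeds.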
- */ - if (!ISSET(bp->b_flags, B_CACHE) && n < PAGE_SIZE) { - boff = (off_t)bp->b_blkno * DEV_BSIZE; - auio.uio_iov = &iov; - auio.uio_iovcnt = 1; - auio.uio_offset = boff; - auio.uio_resid = PAGE_SIZE; - auio.uio_segflg = UIO_SYSSPACE; - auio.uio_rw = UIO_READ; - auio.uio_procp = p; - iov.iov_base = bp->b_data; - iov.iov_len = PAGE_SIZE; - error = nfs_readrpc(vp, &auio, cred); - if (error) { - bp->b_error = error; - SET(bp->b_flags, B_ERROR); - printf("nfs_write: readrpc %d", error); - } - if (auio.uio_resid > 0) - bzero(iov.iov_base, auio.uio_resid); - bp->b_validoff = 0; - bp->b_validend = PAGE_SIZE - auio.uio_resid; - if (np->n_size > boff + bp->b_validend) - bp->b_validend = min(np->n_size - boff, - PAGE_SIZE); - bp->b_dirtyoff = 0; - bp->b_dirtyend = 0; - } - - /* - * If the new write will leave a contiguous dirty - * area, just update the b_dirtyoff and b_dirtyend, - * otherwise try to extend the dirty region. - */ - if (bp->b_dirtyend > 0 && - (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) { - off_t start, end; - - boff = (off_t)bp->b_blkno * DEV_BSIZE; - if (on > bp->b_dirtyend) { - start = boff + bp->b_validend; - end = boff + on; - } else { - start = boff + on + n; - end = boff + bp->b_validoff; - } - - /* - * It may be that the valid region in the buffer - * covers the region we want, in which case just - * extend the dirty region. Otherwise we try to - * extend the valid region. - */ - if (end > start) { - auio.uio_iov = &iov; - auio.uio_iovcnt = 1; - auio.uio_offset = start; - auio.uio_resid = end - start; - auio.uio_segflg = UIO_SYSSPACE; - auio.uio_rw = UIO_READ; - auio.uio_procp = p; - iov.iov_base = bp->b_data + (start - boff); - iov.iov_len = end - start; - error = nfs_readrpc(vp, &auio, cred); - /* - * If we couldn't read, do not do a VOP_BWRITE - * as originally coded. That could also error - * and looping back to "again" as it was doing - * could have us stuck trying to write same buf - * again. nfs_write, will get the entire region - * if nfs_readrpc succeeded. If unsuccessful - * we should just error out. Errors like ESTALE - * would keep us looping rather than transient - * errors justifying a retry. We can return here - * instead of altering dirty region later. We - * did not write old dirty region at this point. - */ - if (error) { - bp->b_error = error; - SET(bp->b_flags, B_ERROR); - printf("nfs_write: readrpc2 %d", error); - brelse(bp); - return (error); - } - /* - * The read worked. - * If there was a short read, just zero fill. - */ - if (auio.uio_resid > 0) - bzero(iov.iov_base, auio.uio_resid); - if (on > bp->b_dirtyend) - bp->b_validend = on; - else - bp->b_validoff = on + n; + TAILQ_INIT(&nfsbuffreeup); + + lck_mtx_lock(nfs_buf_mutex); + + microuptime(&now); + + FSDBG(320, nfsbufcnt, nfsbuffreecnt, nfsbuffreemetacnt, 0); + + count = timer ? nfsbuffreecnt/LRU_FREEUP_FRAC_ON_TIMER : LRU_TO_FREEUP; + while ((nfsbufcnt > nfsbufmin) && (count-- > 0)) { + fbp = TAILQ_FIRST(&nfsbuffree); + if (!fbp) + break; + if (fbp->nb_refs) + break; + if (NBUFSTAMPVALID(fbp) && + (fbp->nb_timestamp + (2*NFSBUF_LRU_STALE)) > now.tv_sec) + break; + nfs_buf_remfree(fbp); + /* disassociate buffer from any nfsnode */ + if (fbp->nb_np) { + if (fbp->nb_vnbufs.le_next != NFSNOLIST) { + LIST_REMOVE(fbp, nb_vnbufs); + fbp->nb_vnbufs.le_next = NFSNOLIST; } - /* - * We now have a valid region which extends up to the - * dirty region which we want. 
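 * For example: with an existing dirty region [0, 512) (and valid
 * range [0, 512)), a new write at on = 2048, n = 100 reads in the
 * gap [512, 2048) (zero-filling any short read), sets b_validend to
 * 2048, pulls b_dirtyend up to 2048 below, and after the copy the
 * dirty region is the single contiguous range [0, 2148).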
- */ - if (on > bp->b_dirtyend) - bp->b_dirtyend = on; - else - bp->b_dirtyoff = on + n; - } - if (ISSET(bp->b_flags, B_ERROR)) { - error = bp->b_error; - brelse(bp); - return (error); + fbp->nb_np = NULL; } - /* - * NFS has embedded ucred so crhold() risks zone corruption - */ - if (bp->b_wcred == NOCRED) - bp->b_wcred = crdup(cred); - np->n_flag |= NMODIFIED; + LIST_REMOVE(fbp, nb_hash); + TAILQ_INSERT_TAIL(&nfsbuffreeup, fbp, nb_free); + nfsbufcnt--; + } - /* - * Check for valid write lease and get one as required. - * In case getblk() and/or bwrite() delayed us. - */ - if ((nmp->nm_flag & NFSMNT_NQNFS) && - NQNFS_CKINVALID(vp, np, ND_WRITE)) { - do { - error = nqnfs_getlease(vp, ND_WRITE, cred, p); - } while (error == NQNFS_EXPIRED); - if (error) { - brelse(bp); - return (error); - } - if (np->n_lrev != np->n_brev || - (np->n_flag & NQNFSNONCACHE)) { - brelse(bp); - error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); - if (error) - return (error); - np->n_brev = np->n_lrev; - goto again; + count = timer ? nfsbuffreemetacnt/META_FREEUP_FRAC_ON_TIMER : META_TO_FREEUP; + while ((nfsbufcnt > nfsbufmin) && (count-- > 0)) { + fbp = TAILQ_FIRST(&nfsbuffreemeta); + if (!fbp) + break; + if (fbp->nb_refs) + break; + if (NBUFSTAMPVALID(fbp) && + (fbp->nb_timestamp + (2*NFSBUF_META_STALE)) > now.tv_sec) + break; + nfs_buf_remfree(fbp); + /* disassociate buffer from any nfsnode */ + if (fbp->nb_np) { + if (fbp->nb_vnbufs.le_next != NFSNOLIST) { + LIST_REMOVE(fbp, nb_vnbufs); + fbp->nb_vnbufs.le_next = NFSNOLIST; } + fbp->nb_np = NULL; } - error = uiomove((char *)bp->b_data + on, n, uio); - if (error) { - SET(bp->b_flags, B_ERROR); - brelse(bp); - return (error); - } - if (bp->b_dirtyend > 0) { - bp->b_dirtyoff = min(on, bp->b_dirtyoff); - bp->b_dirtyend = max((on + n), bp->b_dirtyend); - } else { - bp->b_dirtyoff = on; - bp->b_dirtyend = on + n; - } - if (bp->b_validend == 0 || bp->b_validend < bp->b_dirtyoff || - bp->b_validoff > bp->b_dirtyend) { - bp->b_validoff = bp->b_dirtyoff; - bp->b_validend = bp->b_dirtyend; - } else { - bp->b_validoff = min(bp->b_validoff, bp->b_dirtyoff); - bp->b_validend = max(bp->b_validend, bp->b_dirtyend); - } + LIST_REMOVE(fbp, nb_hash); + TAILQ_INSERT_TAIL(&nfsbuffreeup, fbp, nb_free); + nfsbufcnt--; + nfsbufmetacnt--; + } - /* - * Since this block is being modified, it must be written - * again and not just committed. - */ - CLR(bp->b_flags, B_NEEDCOMMIT); + FSDBG(320, nfsbufcnt, nfsbuffreecnt, nfsbuffreemetacnt, 0); + NFSBUFCNTCHK(); - /* - * If the lease is non-cachable or IO_SYNC do bwrite(). 
- */ - if ((np->n_flag & NQNFSNONCACHE) || (ioflag & IO_SYNC)) { - bp->b_proc = p; - error = VOP_BWRITE(bp); - if (error) - return (error); - if (np->n_flag & NQNFSNONCACHE) { - error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); - if (error) - return (error); - } - } else if ((n + on) == biosize && - (nmp->nm_flag & NFSMNT_NQNFS) == 0) { - bp->b_proc = (struct proc *)0; - SET(bp->b_flags, B_ASYNC); - (void)nfs_writebp(bp, 0); - } else - bdwrite(bp); - } while (uio->uio_resid > 0 && n > 0); - return (0); -} + lck_mtx_unlock(nfs_buf_mutex); + + while ((fbp = TAILQ_FIRST(&nfsbuffreeup))) { + TAILQ_REMOVE(&nfsbuffreeup, fbp, nb_free); + /* nuke any creds */ + if (IS_VALID_CRED(fbp->nb_rcred)) + kauth_cred_unref(&fbp->nb_rcred); + if (IS_VALID_CRED(fbp->nb_wcred)) + kauth_cred_unref(&fbp->nb_wcred); + /* if buf was NB_META, dump buffer */ + if (ISSET(fbp->nb_flags, NB_META) && fbp->nb_data) + kfree(fbp->nb_data, fbp->nb_bufsize); + FREE(fbp, M_TEMP); + } +} /* - * Get an nfs cache block. - * Allocate a new one if the block isn't currently in the cache - * and return the block marked busy. If the calling process is - * interrupted by a signal for an interruptible mount point, return - * NULL. + * remove a buffer from the freelist + * (must be called with nfs_buf_mutex held) */ -static struct buf * -nfs_getcacheblk(vp, bn, size, p, operation) - struct vnode *vp; - daddr_t bn; - int size; - struct proc *p; - int operation; /* defined in sys/buf.h */ +void +nfs_buf_remfree(struct nfsbuf *bp) { - register struct buf *bp; - struct nfsmount *nmp = VFSTONFS(vp->v_mount); - /*due to getblk/vm interractions, use vm page size or less values */ - int biosize = min(vp->v_mount->mnt_stat.f_iosize, PAGE_SIZE); - - if (nbdwrite > ((nbuf/4)*3) && operation == BLK_WRITE) { -#define __BUFFERS_RECLAIMED 2 - struct buf *tbp[__BUFFERS_RECLAIMED]; - int i; - - /* too many delayed writes, try to free up some buffers */ - for (i = 0; i < __BUFFERS_RECLAIMED; i++) - tbp[i] = geteblk(512); - - /* Yield to IO thread */ - (void)tsleep((caddr_t)&nbdwrite, PCATCH, "nbdwrite", 1); - - for (i = (__BUFFERS_RECLAIMED - 1); i >= 0; i--) - brelse(tbp[i]); + if (bp->nb_free.tqe_next == NFSNOLIST) + panic("nfsbuf not on free list"); + if (ISSET(bp->nb_flags, NB_DELWRI)) { + nfsbufdelwricnt--; + TAILQ_REMOVE(&nfsbufdelwri, bp, nb_free); + } else if (ISSET(bp->nb_flags, NB_META)) { + nfsbuffreemetacnt--; + TAILQ_REMOVE(&nfsbuffreemeta, bp, nb_free); + } else { + nfsbuffreecnt--; + TAILQ_REMOVE(&nfsbuffree, bp, nb_free); } + bp->nb_free.tqe_next = NFSNOLIST; + NFSBUFCNTCHK(); +} - if (nmp->nm_flag & NFSMNT_INT) { - bp = getblk(vp, bn, size, PCATCH, 0, operation); - while (bp == (struct buf *)0) { - if (nfs_sigintr(nmp, (struct nfsreq *)0, p)) - return ((struct buf *)0); - bp = getblk(vp, bn, size, 0, 2 * hz, operation); - } - } else - bp = getblk(vp, bn, size, 0, 0, operation); - - if( vp->v_type == VREG) - bp->b_blkno = ((off_t)bn * biosize) / DEV_BSIZE; +/* + * check for existence of nfsbuf in cache + */ +boolean_t +nfs_buf_is_incore(nfsnode_t np, daddr64_t blkno) +{ + boolean_t rv; + lck_mtx_lock(nfs_buf_mutex); + if (nfs_buf_incore(np, blkno)) + rv = TRUE; + else + rv = FALSE; + lck_mtx_unlock(nfs_buf_mutex); + return (rv); +} - return (bp); +/* + * return incore buffer (must be called with nfs_buf_mutex held) + */ +struct nfsbuf * +nfs_buf_incore(nfsnode_t np, daddr64_t blkno) +{ + /* Search hash chain */ + struct nfsbuf * bp = NFSBUFHASH(np, blkno)->lh_first; + for (; bp != NULL; bp = bp->nb_hash.le_next) + if ((bp->nb_lblkno == 
blkno) && (bp->nb_np == np)) { + if (!ISSET(bp->nb_flags, NB_INVAL)) { + FSDBG(547, bp, blkno, bp->nb_flags, bp->nb_np); + return (bp); + } + } + return (NULL); } /* - * Flush and invalidate all dirty buffers. If another process is already - * doing the flush, just wait for completion. + * Check if it's OK to drop a page. + * + * Called by vnode_pager() on pageout request of non-dirty page. + * We need to make sure that it's not part of a delayed write. + * If it is, we can't let the VM drop it because we may need it + * later when/if we need to write the data (again). */ int -nfs_vinvalbuf(vp, flags, cred, p, intrflg) - struct vnode *vp; - int flags; - struct ucred *cred; - struct proc *p; - int intrflg; +nfs_buf_page_inval(vnode_t vp, off_t offset) { - register struct nfsnode *np = VTONFS(vp); - struct nfsmount *nmp = VFSTONFS(vp->v_mount); - int error = 0, slpflag, slptimeo; - int didhold = 0; + struct nfsmount *nmp = VTONMP(vp); + struct nfsbuf *bp; + int error = 0; - if ((nmp->nm_flag & NFSMNT_INT) == 0) - intrflg = 0; - if (intrflg) { - slpflag = PCATCH; - slptimeo = 2 * hz; - } else { - slpflag = 0; - slptimeo = 0; + if (nfs_mount_gone(nmp)) + return (ENXIO); + + lck_mtx_lock(nfs_buf_mutex); + bp = nfs_buf_incore(VTONFS(vp), (daddr64_t)(offset / nmp->nm_biosize)); + if (!bp) + goto out; + FSDBG(325, bp, bp->nb_flags, bp->nb_dirtyoff, bp->nb_dirtyend); + if (ISSET(bp->nb_lflags, NBL_BUSY)) { + error = EBUSY; + goto out; } /* - * First wait for any other process doing a flush to complete. + * If there's a dirty range in the buffer, check to + * see if this page intersects with the dirty range. + * If it does, we can't let the pager drop the page. */ - while (np->n_flag & NFLUSHINPROG) { - np->n_flag |= NFLUSHWANT; - error = tsleep((caddr_t)&np->n_flag, PRIBIO + 2, "nfsvinval", - slptimeo); - if (error && intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p)) - return (EINTR); + if (bp->nb_dirtyend > 0) { + int start = offset - NBOFF(bp); + if ((bp->nb_dirtyend > start) && + (bp->nb_dirtyoff < (start + PAGE_SIZE))) { + /* + * Before returning the bad news, move the + * buffer to the start of the delwri list and + * give the list a push to try to flush the + * buffer out. + */ + error = EBUSY; + nfs_buf_remfree(bp); + TAILQ_INSERT_HEAD(&nfsbufdelwri, bp, nb_free); + nfsbufdelwricnt++; + nfs_buf_delwri_push(1); + } } +out: + lck_mtx_unlock(nfs_buf_mutex); + return (error); +} - /* - * Now, flush as required. - */ - np->n_flag |= NFLUSHINPROG; - error = vinvalbuf(vp, flags, cred, p, slpflag, 0); - while (error) { - /* we seem to be stuck in a loop here if the thread got aborted. - * nfs_flush will return EINTR. Not sure if that will cause - * other consequences due to EINTR having other meanings in NFS - * To handle, no dirty pages, it seems safe to just return from - * here. But if we did have dirty pages, how would we get them - * written out if thread was aborted? Some other strategy is - * necessary. -- EKN +/* + * set up the UPL for a buffer + * (must NOT be called with nfs_buf_mutex held) + */ +int +nfs_buf_upl_setup(struct nfsbuf *bp) +{ + kern_return_t kret; + upl_t upl; + int upl_flags; + + if (ISSET(bp->nb_flags, NB_PAGELIST)) + return (0); + + upl_flags = UPL_PRECIOUS; + if (!ISSET(bp->nb_flags, NB_READ)) { + /* + * We're doing a "write", so we intend to modify + * the pages we're gathering. 
*/ - if ((intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p)) || - (error == EINTR && current_thread_aborted())) { - np->n_flag &= ~NFLUSHINPROG; - if (np->n_flag & NFLUSHWANT) { - np->n_flag &= ~NFLUSHWANT; - wakeup((caddr_t)&np->n_flag); - } - return (EINTR); - } - error = vinvalbuf(vp, flags, cred, p, 0, slptimeo); + upl_flags |= UPL_WILL_MODIFY; } - np->n_flag &= ~(NMODIFIED | NFLUSHINPROG); - if (np->n_flag & NFLUSHWANT) { - np->n_flag &= ~NFLUSHWANT; - wakeup((caddr_t)&np->n_flag); + kret = ubc_create_upl(NFSTOV(bp->nb_np), NBOFF(bp), bp->nb_bufsize, + &upl, NULL, upl_flags); + if (kret == KERN_INVALID_ARGUMENT) { + /* vm object probably doesn't exist any more */ + bp->nb_pagelist = NULL; + return (EINVAL); } - didhold = ubc_hold(vp); - if (didhold) { - (void) ubc_clean(vp, 1); /* get the pages out of vm also */ - ubc_rele(vp); + if (kret != KERN_SUCCESS) { + printf("nfs_buf_upl_setup(): failed to get pagelist %d\n", kret); + bp->nb_pagelist = NULL; + return (EIO); } + + FSDBG(538, bp, NBOFF(bp), bp->nb_bufsize, bp->nb_np); + + bp->nb_pagelist = upl; + SET(bp->nb_flags, NB_PAGELIST); return (0); } /* - * Initiate asynchronous I/O. Return an error if no nfsiods are available. - * This is mainly to avoid queueing async I/O requests when the nfsiods - * are all hung on a dead server. + * update buffer's valid/dirty info from UBC + * (must NOT be called with nfs_buf_mutex held) */ -int -nfs_asyncio(bp, cred) - register struct buf *bp; - struct ucred *cred; +void +nfs_buf_upl_check(struct nfsbuf *bp) { - struct nfsmount *nmp; - int i; - int gotiod; - int slpflag = 0; - int slptimeo = 0; - int error; + upl_page_info_t *pl; + off_t filesize, fileoffset; + int i, npages; - if (nfs_numasync == 0) - return (EIO); - - nmp = VFSTONFS(bp->b_vp->v_mount); -again: - if (nmp->nm_flag & NFSMNT_INT) - slpflag = PCATCH; - gotiod = FALSE; + if (!ISSET(bp->nb_flags, NB_PAGELIST)) + return; + + npages = round_page_32(bp->nb_bufsize) / PAGE_SIZE; + filesize = ubc_getsize(NFSTOV(bp->nb_np)); + fileoffset = NBOFF(bp); + if (fileoffset < filesize) + SET(bp->nb_flags, NB_CACHE); + else + CLR(bp->nb_flags, NB_CACHE); + + pl = ubc_upl_pageinfo(bp->nb_pagelist); + bp->nb_valid = bp->nb_dirty = 0; + + for (i=0; i < npages; i++, fileoffset += PAGE_SIZE_64) { + /* anything beyond the end of the file is not valid or dirty */ + if (fileoffset >= filesize) + break; + if (!upl_valid_page(pl, i)) { + CLR(bp->nb_flags, NB_CACHE); + continue; + } + NBPGVALID_SET(bp,i); + if (upl_dirty_page(pl, i)) + NBPGDIRTY_SET(bp, i); + } + fileoffset = NBOFF(bp); + if (ISSET(bp->nb_flags, NB_CACHE)) { + bp->nb_validoff = 0; + bp->nb_validend = bp->nb_bufsize; + if (fileoffset + bp->nb_validend > filesize) + bp->nb_validend = filesize - fileoffset; + } else { + bp->nb_validoff = bp->nb_validend = -1; + } + FSDBG(539, bp, fileoffset, bp->nb_valid, bp->nb_dirty); + FSDBG(539, bp->nb_validoff, bp->nb_validend, bp->nb_dirtyoff, bp->nb_dirtyend); +} + +/* + * make sure that a buffer is mapped + * (must NOT be called with nfs_buf_mutex held) + */ +int +nfs_buf_map(struct nfsbuf *bp) +{ + kern_return_t kret; + + if (bp->nb_data) + return (0); + if (!ISSET(bp->nb_flags, NB_PAGELIST)) + return (EINVAL); + + kret = ubc_upl_map(bp->nb_pagelist, (vm_offset_t *)&(bp->nb_data)); + if (kret != KERN_SUCCESS) + panic("nfs_buf_map: ubc_upl_map() failed with (%d)", kret); + if (bp->nb_data == 0) + panic("ubc_upl_map mapped 0"); + FSDBG(540, bp, bp->nb_flags, NBOFF(bp), bp->nb_data); + return (0); +} + +/* + * normalize an nfsbuf's valid range + * + * the 
read/write code guarantees that we'll always have a valid + * region that is an integral number of pages. If either end + * of the valid range isn't page-aligned, it gets corrected + * here as we extend the valid range through all of the + * contiguous valid pages. + */ +void +nfs_buf_normalize_valid_range(nfsnode_t np, struct nfsbuf *bp) +{ + int pg, npg; + /* pull validoff back to start of contiguous valid page range */ + pg = bp->nb_validoff/PAGE_SIZE; + while (pg >= 0 && NBPGVALID(bp,pg)) + pg--; + bp->nb_validoff = (pg+1) * PAGE_SIZE; + /* push validend forward to end of contiguous valid page range */ + npg = bp->nb_bufsize/PAGE_SIZE; + pg = bp->nb_validend/PAGE_SIZE; + while (pg < npg && NBPGVALID(bp,pg)) + pg++; + bp->nb_validend = pg * PAGE_SIZE; + /* clip to EOF */ + if (NBOFF(bp) + bp->nb_validend > (off_t)np->n_size) + bp->nb_validend = np->n_size % bp->nb_bufsize; +} + +/* + * process some entries on the delayed write queue + * (must be called with nfs_buf_mutex held) + */ +void +nfs_buf_delwri_service(void) +{ + struct nfsbuf *bp; + nfsnode_t np; + int error, i = 0; + + while (i < 8 && (bp = TAILQ_FIRST(&nfsbufdelwri)) != NULL) { + np = bp->nb_np; + nfs_buf_remfree(bp); + nfs_buf_refget(bp); + while ((error = nfs_buf_acquire(bp, 0, 0, 0)) == EAGAIN); + nfs_buf_refrele(bp); + if (error) + break; + if (!bp->nb_np) { + /* buffer is no longer valid */ + nfs_buf_drop(bp); + continue; + } + if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) + nfs_buf_check_write_verifier(np, bp); + if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) { + /* put buffer at end of delwri list */ + TAILQ_INSERT_TAIL(&nfsbufdelwri, bp, nb_free); + nfsbufdelwricnt++; + nfs_buf_drop(bp); + lck_mtx_unlock(nfs_buf_mutex); + nfs_flushcommits(np, 1); + } else { + SET(bp->nb_flags, NB_ASYNC); + lck_mtx_unlock(nfs_buf_mutex); + nfs_buf_write(bp); + } + i++; + lck_mtx_lock(nfs_buf_mutex); + } +} + +/* + * thread to service the delayed write queue when asked + */ +void +nfs_buf_delwri_thread(__unused void *arg, __unused wait_result_t wr) +{ + struct timespec ts = { 30, 0 }; + int error = 0; + + lck_mtx_lock(nfs_buf_mutex); + while (!error) { + nfs_buf_delwri_service(); + error = msleep(&nfsbufdelwrithd, nfs_buf_mutex, 0, "nfsbufdelwri", &ts); + } + nfsbufdelwrithd = NULL; + lck_mtx_unlock(nfs_buf_mutex); + thread_terminate(nfsbufdelwrithd); +} + +/* + * try to push out some delayed/uncommitted writes + * ("locked" indicates whether nfs_buf_mutex is already held) + */ +void +nfs_buf_delwri_push(int locked) +{ + if (TAILQ_EMPTY(&nfsbufdelwri)) + return; + if (!locked) + lck_mtx_lock(nfs_buf_mutex); + /* wake up the delayed write service thread */ + if (nfsbufdelwrithd) + wakeup(&nfsbufdelwrithd); + else if (kernel_thread_start(nfs_buf_delwri_thread, NULL, &nfsbufdelwrithd) == KERN_SUCCESS) + thread_deallocate(nfsbufdelwrithd); + /* otherwise, try to do some of the work ourselves */ + if (!nfsbufdelwrithd) + nfs_buf_delwri_service(); + if (!locked) + lck_mtx_unlock(nfs_buf_mutex); +} + +/* + * Get an nfs buffer. + * + * Returns errno on error, 0 otherwise. + * Any buffer is returned in *bpp. + * + * If NBLK_ONLYVALID is set, only return buffer if found in cache. + * If NBLK_NOWAIT is set, don't wait for the buffer if it's marked BUSY. + * + * Check for existence of buffer in cache. + * Or attempt to reuse a buffer from one of the free lists. + * Or allocate a new buffer if we haven't already hit max allocation. + * Or wait for a free buffer. + * + * If available buffer found, prepare it, and return it. 
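 * A typical cached-read lookup might look like (illustrative sketch only):
 *
 *	error = nfs_buf_get(np, lbn, nmp->nm_biosize, thd, NBLK_READ, &bp);
 *	if (!error && bp && !ISSET(bp->nb_flags, NB_CACHE))
 *		error = nfs_buf_read(bp);
 *
 * where a buffer returned without NB_CACHE set still needs its data
 * read from the server.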
+ * + * If the calling process is interrupted by a signal for + * an interruptible mount point, return EINTR. + */ +int +nfs_buf_get( + nfsnode_t np, + daddr64_t blkno, + uint32_t size, + thread_t thd, + int flags, + struct nfsbuf **bpp) +{ + vnode_t vp = NFSTOV(np); + struct nfsmount *nmp = VTONMP(vp); + struct nfsbuf *bp; + uint32_t bufsize; + int slpflag = PCATCH; + int operation = (flags & NBLK_OPMASK); + int error = 0; + struct timespec ts; + + FSDBG_TOP(541, np, blkno, size, flags); + *bpp = NULL; + + bufsize = size; + if (bufsize > NFS_MAXBSIZE) + panic("nfs_buf_get: buffer larger than NFS_MAXBSIZE requested"); + + if (nfs_mount_gone(nmp)) { + FSDBG_BOT(541, np, blkno, 0, ENXIO); + return (ENXIO); + } + + if (!UBCINFOEXISTS(vp)) { + operation = NBLK_META; + } else if (bufsize < (uint32_t)nmp->nm_biosize) { + /* reg files should always have biosize blocks */ + bufsize = nmp->nm_biosize; + } + + /* if NBLK_WRITE, check for too many delayed/uncommitted writes */ + if ((operation == NBLK_WRITE) && (nfs_nbdwrite > NFS_A_LOT_OF_DELAYED_WRITES)) { + FSDBG_TOP(542, np, blkno, nfs_nbdwrite, NFS_A_LOT_OF_DELAYED_WRITES); + + /* poke the delwri list */ + nfs_buf_delwri_push(0); + + /* sleep to let other threads run... */ + tsleep(&nfs_nbdwrite, PCATCH, "nfs_nbdwrite", 1); + FSDBG_BOT(542, np, blkno, nfs_nbdwrite, NFS_A_LOT_OF_DELAYED_WRITES); + } + +loop: + lck_mtx_lock(nfs_buf_mutex); + + /* wait for any buffer invalidation/flushing to complete */ + while (np->n_bflag & NBINVALINPROG) { + np->n_bflag |= NBINVALWANT; + ts.tv_sec = 2; + ts.tv_nsec = 0; + msleep(&np->n_bflag, nfs_buf_mutex, slpflag, "nfs_buf_get_invalwait", &ts); + if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0))) { + lck_mtx_unlock(nfs_buf_mutex); + FSDBG_BOT(541, np, blkno, 0, error); + return (error); + } + if (np->n_bflag & NBINVALINPROG) + slpflag = 0; + } + + /* check for existence of nfsbuf in cache */ + if ((bp = nfs_buf_incore(np, blkno))) { + /* if busy, set wanted and wait */ + if (ISSET(bp->nb_lflags, NBL_BUSY)) { + if (flags & NBLK_NOWAIT) { + lck_mtx_unlock(nfs_buf_mutex); + FSDBG_BOT(541, np, blkno, bp, 0xbcbcbcbc); + return (0); + } + FSDBG_TOP(543, np, blkno, bp, bp->nb_flags); + SET(bp->nb_lflags, NBL_WANTED); + + ts.tv_sec = 2; + ts.tv_nsec = 0; + msleep(bp, nfs_buf_mutex, slpflag|(PRIBIO+1)|PDROP, + "nfsbufget", (slpflag == PCATCH) ? 
NULL : &ts); + slpflag = 0; + FSDBG_BOT(543, np, blkno, bp, bp->nb_flags); + if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0))) { + FSDBG_BOT(541, np, blkno, 0, error); + return (error); + } + goto loop; + } + if (bp->nb_bufsize != bufsize) + panic("nfsbuf size mismatch"); + SET(bp->nb_lflags, NBL_BUSY); + SET(bp->nb_flags, NB_CACHE); + nfs_buf_remfree(bp); + /* additional paranoia: */ + if (ISSET(bp->nb_flags, NB_PAGELIST)) + panic("pagelist buffer was not busy"); + goto buffer_setup; + } + + if (flags & NBLK_ONLYVALID) { + lck_mtx_unlock(nfs_buf_mutex); + FSDBG_BOT(541, np, blkno, 0, 0x0000cace); + return (0); + } + + /* + * where to get a free buffer: + * - if meta and maxmeta reached, must reuse meta + * - alloc new if we haven't reached min bufs + * - if free lists are NOT empty + * - if free list is stale, use it + * - else if freemeta list is stale, use it + * - else if max bufs allocated, use least-time-to-stale + * - alloc new if we haven't reached max allowed + * - start clearing out delwri list and try again + */ + + if ((operation == NBLK_META) && (nfsbufmetacnt >= nfsbufmetamax)) { + /* if we've hit max meta buffers, must reuse a meta buffer */ + bp = TAILQ_FIRST(&nfsbuffreemeta); + } else if ((nfsbufcnt > nfsbufmin) && + (!TAILQ_EMPTY(&nfsbuffree) || !TAILQ_EMPTY(&nfsbuffreemeta))) { + /* try to pull an nfsbuf off a free list */ + struct nfsbuf *lrubp, *metabp; + struct timeval now; + microuptime(&now); + + /* if the next LRU or META buffer is invalid or stale, use it */ + lrubp = TAILQ_FIRST(&nfsbuffree); + if (lrubp && (!NBUFSTAMPVALID(lrubp) || + ((lrubp->nb_timestamp + NFSBUF_LRU_STALE) < now.tv_sec))) + bp = lrubp; + metabp = TAILQ_FIRST(&nfsbuffreemeta); + if (!bp && metabp && (!NBUFSTAMPVALID(metabp) || + ((metabp->nb_timestamp + NFSBUF_META_STALE) < now.tv_sec))) + bp = metabp; + + if (!bp && (nfsbufcnt >= nfsbufmax)) { + /* we've already allocated all bufs, so */ + /* choose the buffer that'll go stale first */ + if (!metabp) + bp = lrubp; + else if (!lrubp) + bp = metabp; + else { + int32_t lru_stale_time, meta_stale_time; + lru_stale_time = lrubp->nb_timestamp + NFSBUF_LRU_STALE; + meta_stale_time = metabp->nb_timestamp + NFSBUF_META_STALE; + if (lru_stale_time <= meta_stale_time) + bp = lrubp; + else + bp = metabp; + } + } + } + + if (bp) { + /* we have a buffer to reuse */ + FSDBG(544, np, blkno, bp, bp->nb_flags); + nfs_buf_remfree(bp); + if (ISSET(bp->nb_flags, NB_DELWRI)) + panic("nfs_buf_get: delwri"); + SET(bp->nb_lflags, NBL_BUSY); + /* disassociate buffer from previous nfsnode */ + if (bp->nb_np) { + if (bp->nb_vnbufs.le_next != NFSNOLIST) { + LIST_REMOVE(bp, nb_vnbufs); + bp->nb_vnbufs.le_next = NFSNOLIST; + } + bp->nb_np = NULL; + } + LIST_REMOVE(bp, nb_hash); + /* nuke any creds we're holding */ + if (IS_VALID_CRED(bp->nb_rcred)) + kauth_cred_unref(&bp->nb_rcred); + if (IS_VALID_CRED(bp->nb_wcred)) + kauth_cred_unref(&bp->nb_wcred); + /* if buf will no longer be NB_META, dump old buffer */ + if (operation == NBLK_META) { + if (!ISSET(bp->nb_flags, NB_META)) + nfsbufmetacnt++; + } else if (ISSET(bp->nb_flags, NB_META)) { + if (bp->nb_data) { + kfree(bp->nb_data, bp->nb_bufsize); + bp->nb_data = NULL; + } + nfsbufmetacnt--; + } + /* re-init buf fields */ + bp->nb_error = 0; + bp->nb_validoff = bp->nb_validend = -1; + bp->nb_dirtyoff = bp->nb_dirtyend = 0; + bp->nb_valid = 0; + bp->nb_dirty = 0; + bp->nb_verf = 0; + } else { + /* no buffer to reuse */ + if ((nfsbufcnt < nfsbufmax) && + ((operation != NBLK_META) || (nfsbufmetacnt < nfsbufmetamax))) { + /* 
just alloc a new one */ + MALLOC(bp, struct nfsbuf *, sizeof(struct nfsbuf), M_TEMP, M_WAITOK); + if (!bp) { + lck_mtx_unlock(nfs_buf_mutex); + FSDBG_BOT(541, np, blkno, 0, error); + return (ENOMEM); + } + nfsbufcnt++; + + /* + * If any excess bufs, make sure the timer + * is running to free them up later. + */ + if (nfsbufcnt > nfsbufmin && !nfs_buf_timer_on) { + nfs_buf_timer_on = 1; + nfs_interval_timer_start(nfs_buf_timer_call, + NFSBUF_FREE_PERIOD * 1000); + } + + if (operation == NBLK_META) + nfsbufmetacnt++; + NFSBUFCNTCHK(); + /* init nfsbuf */ + bzero(bp, sizeof(*bp)); + bp->nb_free.tqe_next = NFSNOLIST; + bp->nb_validoff = bp->nb_validend = -1; + FSDBG(545, np, blkno, bp, 0); + } else { + /* too many bufs... wait for buffers to free up */ + FSDBG_TOP(546, np, blkno, nfsbufcnt, nfsbufmax); + + /* poke the delwri list */ + nfs_buf_delwri_push(1); + + nfsneedbuffer = 1; + msleep(&nfsneedbuffer, nfs_buf_mutex, PCATCH|PDROP, "nfsbufget", NULL); + FSDBG_BOT(546, np, blkno, nfsbufcnt, nfsbufmax); + if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0))) { + FSDBG_BOT(541, np, blkno, 0, error); + return (error); + } + goto loop; + } + } + + /* set up nfsbuf */ + SET(bp->nb_lflags, NBL_BUSY); + bp->nb_flags = 0; + bp->nb_lblkno = blkno; + /* insert buf in hash */ + LIST_INSERT_HEAD(NFSBUFHASH(np, blkno), bp, nb_hash); + /* associate buffer with new nfsnode */ + bp->nb_np = np; + LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs); + +buffer_setup: + + /* unlock hash */ + lck_mtx_unlock(nfs_buf_mutex); + + switch (operation) { + case NBLK_META: + SET(bp->nb_flags, NB_META); + if ((bp->nb_bufsize != bufsize) && bp->nb_data) { + kfree(bp->nb_data, bp->nb_bufsize); + bp->nb_data = NULL; + bp->nb_validoff = bp->nb_validend = -1; + bp->nb_dirtyoff = bp->nb_dirtyend = 0; + bp->nb_valid = 0; + bp->nb_dirty = 0; + CLR(bp->nb_flags, NB_CACHE); + } + if (!bp->nb_data) + bp->nb_data = kalloc(bufsize); + if (!bp->nb_data) { + /* Ack! couldn't allocate the data buffer! */ + /* clean up buffer and return error */ + lck_mtx_lock(nfs_buf_mutex); + LIST_REMOVE(bp, nb_vnbufs); + bp->nb_vnbufs.le_next = NFSNOLIST; + bp->nb_np = NULL; + /* invalidate usage timestamp to allow immediate freeing */ + NBUFSTAMPINVALIDATE(bp); + if (bp->nb_free.tqe_next != NFSNOLIST) + panic("nfsbuf on freelist"); + TAILQ_INSERT_HEAD(&nfsbuffree, bp, nb_free); + nfsbuffreecnt++; + lck_mtx_unlock(nfs_buf_mutex); + FSDBG_BOT(541, np, blkno, 0xb00, ENOMEM); + return (ENOMEM); + } + bp->nb_bufsize = bufsize; + break; + + case NBLK_READ: + case NBLK_WRITE: + /* + * Set or clear NB_READ now to let the UPL subsystem know + * if we intend to modify the pages or not. 
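 * nfs_buf_upl_setup() keys off NB_READ when it creates the pagelist:
 * if NB_READ is clear it passes UPL_WILL_MODIFY, telling the VM that
 * the gathered pages are about to be dirtied.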
+ */ + if (operation == NBLK_READ) { + SET(bp->nb_flags, NB_READ); + } else { + CLR(bp->nb_flags, NB_READ); + } + if (bufsize < PAGE_SIZE) + bufsize = PAGE_SIZE; + bp->nb_bufsize = bufsize; + bp->nb_validoff = bp->nb_validend = -1; + + if (UBCINFOEXISTS(vp)) { + /* set up upl */ + if (nfs_buf_upl_setup(bp)) { + /* unable to create upl */ + /* vm object must no longer exist */ + /* clean up buffer and return error */ + lck_mtx_lock(nfs_buf_mutex); + LIST_REMOVE(bp, nb_vnbufs); + bp->nb_vnbufs.le_next = NFSNOLIST; + bp->nb_np = NULL; + /* invalidate usage timestamp to allow immediate freeing */ + NBUFSTAMPINVALIDATE(bp); + if (bp->nb_free.tqe_next != NFSNOLIST) + panic("nfsbuf on freelist"); + TAILQ_INSERT_HEAD(&nfsbuffree, bp, nb_free); + nfsbuffreecnt++; + lck_mtx_unlock(nfs_buf_mutex); + FSDBG_BOT(541, np, blkno, 0x2bc, EIO); + return (EIO); + } + nfs_buf_upl_check(bp); + } + break; + + default: + panic("nfs_buf_get: %d unknown operation", operation); + } + + *bpp = bp; + + FSDBG_BOT(541, np, blkno, bp, bp->nb_flags); + + return (0); +} + +void +nfs_buf_release(struct nfsbuf *bp, int freeup) +{ + nfsnode_t np = bp->nb_np; + vnode_t vp; + struct timeval now; + int wakeup_needbuffer, wakeup_buffer, wakeup_nbdwrite; + + FSDBG_TOP(548, bp, NBOFF(bp), bp->nb_flags, bp->nb_data); + FSDBG(548, bp->nb_validoff, bp->nb_validend, bp->nb_dirtyoff, bp->nb_dirtyend); + FSDBG(548, bp->nb_valid, 0, bp->nb_dirty, 0); + + vp = np ? NFSTOV(np) : NULL; + if (vp && UBCINFOEXISTS(vp) && bp->nb_bufsize) { + int upl_flags, rv; + upl_t upl; + uint32_t i; + + if (!ISSET(bp->nb_flags, NB_PAGELIST) && !ISSET(bp->nb_flags, NB_INVAL)) { + rv = nfs_buf_upl_setup(bp); + if (rv) + printf("nfs_buf_release: upl create failed %d\n", rv); + else + nfs_buf_upl_check(bp); + } + upl = bp->nb_pagelist; + if (!upl) + goto pagelist_cleanup_done; + if (bp->nb_data) { + if (ubc_upl_unmap(upl) != KERN_SUCCESS) + panic("ubc_upl_unmap failed"); + bp->nb_data = NULL; + } + /* + * Abort the pages on error or: if this is an invalid or + * non-needcommit nocache buffer AND no pages are dirty. 
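 * A buffer with dirty pages (and no error) never takes this abort
 * path; its dirty pages are committed back below with
 * UPL_COMMIT_SET_DIRTY so the modified data survives for a later write.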
+ */ + if (ISSET(bp->nb_flags, NB_ERROR) || (!bp->nb_dirty && (ISSET(bp->nb_flags, NB_INVAL) || + (ISSET(bp->nb_flags, NB_NOCACHE) && !ISSET(bp->nb_flags, (NB_NEEDCOMMIT | NB_DELWRI)))))) { + if (ISSET(bp->nb_flags, (NB_READ | NB_INVAL | NB_NOCACHE))) + upl_flags = UPL_ABORT_DUMP_PAGES; + else + upl_flags = 0; + ubc_upl_abort(upl, upl_flags); + goto pagelist_cleanup_done; + } + for (i=0; i <= (bp->nb_bufsize - 1)/PAGE_SIZE; i++) { + if (!NBPGVALID(bp,i)) + ubc_upl_abort_range(upl, + i*PAGE_SIZE, PAGE_SIZE, + UPL_ABORT_DUMP_PAGES | + UPL_ABORT_FREE_ON_EMPTY); + else { + if (NBPGDIRTY(bp,i)) + upl_flags = UPL_COMMIT_SET_DIRTY; + else + upl_flags = UPL_COMMIT_CLEAR_DIRTY; + + if (!ISSET(bp->nb_flags, (NB_NEEDCOMMIT | NB_DELWRI))) + upl_flags |= UPL_COMMIT_CLEAR_PRECIOUS; + + ubc_upl_commit_range(upl, + i*PAGE_SIZE, PAGE_SIZE, + upl_flags | + UPL_COMMIT_INACTIVATE | + UPL_COMMIT_FREE_ON_EMPTY); + } + } +pagelist_cleanup_done: + /* invalidate any pages past EOF */ + if (NBOFF(bp) + bp->nb_bufsize > (off_t)(np->n_size)) { + off_t start, end; + start = trunc_page_64(np->n_size) + PAGE_SIZE_64; + end = trunc_page_64(NBOFF(bp) + bp->nb_bufsize); + if (start < NBOFF(bp)) + start = NBOFF(bp); + if (end > start) { + if ((rv = ubc_msync(vp, start, end, NULL, UBC_INVALIDATE))) + printf("nfs_buf_release(): ubc_msync failed!, error %d\n", rv); + } + } + CLR(bp->nb_flags, NB_PAGELIST); + bp->nb_pagelist = NULL; + } + + lck_mtx_lock(nfs_buf_mutex); + + wakeup_needbuffer = wakeup_buffer = wakeup_nbdwrite = 0; + + /* Wake up any processes waiting for any buffer to become free. */ + if (nfsneedbuffer) { + nfsneedbuffer = 0; + wakeup_needbuffer = 1; + } + /* Wake up any processes waiting for _this_ buffer to become free. */ + if (ISSET(bp->nb_lflags, NBL_WANTED)) { + CLR(bp->nb_lflags, NBL_WANTED); + wakeup_buffer = 1; + } + + /* If it's non-needcommit nocache, or an error, mark it invalid. 
*/ + if (ISSET(bp->nb_flags, NB_ERROR) || + (ISSET(bp->nb_flags, NB_NOCACHE) && !ISSET(bp->nb_flags, (NB_NEEDCOMMIT | NB_DELWRI)))) + SET(bp->nb_flags, NB_INVAL); + + if ((bp->nb_bufsize <= 0) || ISSET(bp->nb_flags, NB_INVAL)) { + /* If it's invalid or empty, dissociate it from its nfsnode */ + if (bp->nb_vnbufs.le_next != NFSNOLIST) { + LIST_REMOVE(bp, nb_vnbufs); + bp->nb_vnbufs.le_next = NFSNOLIST; + } + bp->nb_np = NULL; + /* if this was a delayed write, wakeup anyone */ + /* waiting for delayed writes to complete */ + if (ISSET(bp->nb_flags, NB_DELWRI)) { + CLR(bp->nb_flags, NB_DELWRI); + nfs_nbdwrite--; + NFSBUFCNTCHK(); + wakeup_nbdwrite = 1; + } + /* invalidate usage timestamp to allow immediate freeing */ + NBUFSTAMPINVALIDATE(bp); + /* put buffer at head of free list */ + if (bp->nb_free.tqe_next != NFSNOLIST) + panic("nfsbuf on freelist"); + SET(bp->nb_flags, NB_INVAL); + if (ISSET(bp->nb_flags, NB_META)) { + TAILQ_INSERT_HEAD(&nfsbuffreemeta, bp, nb_free); + nfsbuffreemetacnt++; + } else { + TAILQ_INSERT_HEAD(&nfsbuffree, bp, nb_free); + nfsbuffreecnt++; + } + } else if (ISSET(bp->nb_flags, NB_DELWRI)) { + /* put buffer at end of delwri list */ + if (bp->nb_free.tqe_next != NFSNOLIST) + panic("nfsbuf on freelist"); + TAILQ_INSERT_TAIL(&nfsbufdelwri, bp, nb_free); + nfsbufdelwricnt++; + freeup = 0; + } else { + /* update usage timestamp */ + microuptime(&now); + bp->nb_timestamp = now.tv_sec; + /* put buffer at end of free list */ + if (bp->nb_free.tqe_next != NFSNOLIST) + panic("nfsbuf on freelist"); + if (ISSET(bp->nb_flags, NB_META)) { + TAILQ_INSERT_TAIL(&nfsbuffreemeta, bp, nb_free); + nfsbuffreemetacnt++; + } else { + TAILQ_INSERT_TAIL(&nfsbuffree, bp, nb_free); + nfsbuffreecnt++; + } + } + + NFSBUFCNTCHK(); + + /* Unlock the buffer. */ + CLR(bp->nb_flags, (NB_ASYNC | NB_STABLE)); + CLR(bp->nb_lflags, NBL_BUSY); + + FSDBG_BOT(548, bp, NBOFF(bp), bp->nb_flags, bp->nb_data); + + lck_mtx_unlock(nfs_buf_mutex); + + if (wakeup_needbuffer) + wakeup(&nfsneedbuffer); + if (wakeup_buffer) + wakeup(bp); + if (wakeup_nbdwrite) + wakeup(&nfs_nbdwrite); + if (freeup) + NFS_BUF_FREEUP(); +} + +/* + * Wait for operations on the buffer to complete. + * When they do, extract and return the I/O's error value. + */ +int +nfs_buf_iowait(struct nfsbuf *bp) +{ + FSDBG_TOP(549, bp, NBOFF(bp), bp->nb_flags, bp->nb_error); + + lck_mtx_lock(nfs_buf_mutex); + + while (!ISSET(bp->nb_flags, NB_DONE)) + msleep(bp, nfs_buf_mutex, PRIBIO + 1, "nfs_buf_iowait", NULL); + + lck_mtx_unlock(nfs_buf_mutex); + + FSDBG_BOT(549, bp, NBOFF(bp), bp->nb_flags, bp->nb_error); + + /* check for interruption of I/O, then errors. */ + if (ISSET(bp->nb_flags, NB_EINTR)) { + CLR(bp->nb_flags, NB_EINTR); + return (EINTR); + } else if (ISSET(bp->nb_flags, NB_ERROR)) + return (bp->nb_error ? bp->nb_error : EIO); + return (0); +} + +/* + * Mark I/O complete on a buffer. 
+ */ +void +nfs_buf_iodone(struct nfsbuf *bp) +{ + + FSDBG_TOP(550, bp, NBOFF(bp), bp->nb_flags, bp->nb_error); + + if (ISSET(bp->nb_flags, NB_DONE)) + panic("nfs_buf_iodone already"); + + if (!ISSET(bp->nb_flags, NB_READ)) { + CLR(bp->nb_flags, NB_WRITEINPROG); + /* + * vnode_writedone() takes care of waking up + * any throttled write operations + */ + vnode_writedone(NFSTOV(bp->nb_np)); + nfs_node_lock_force(bp->nb_np); + bp->nb_np->n_numoutput--; + nfs_node_unlock(bp->nb_np); + } + if (ISSET(bp->nb_flags, NB_ASYNC)) { /* if async, release it */ + SET(bp->nb_flags, NB_DONE); /* note that it's done */ + nfs_buf_release(bp, 1); + } else { /* or just wakeup the buffer */ + lck_mtx_lock(nfs_buf_mutex); + SET(bp->nb_flags, NB_DONE); /* note that it's done */ + CLR(bp->nb_lflags, NBL_WANTED); + lck_mtx_unlock(nfs_buf_mutex); + wakeup(bp); + } + + FSDBG_BOT(550, bp, NBOFF(bp), bp->nb_flags, bp->nb_error); +} + +void +nfs_buf_write_delayed(struct nfsbuf *bp) +{ + nfsnode_t np = bp->nb_np; + + FSDBG_TOP(551, bp, NBOFF(bp), bp->nb_flags, 0); + FSDBG(551, bp, bp->nb_dirtyoff, bp->nb_dirtyend, bp->nb_dirty); + + /* + * If the block hasn't been seen before: + * (1) Mark it as having been seen, + * (2) Make sure it's on its node's correct block list, + */ + if (!ISSET(bp->nb_flags, NB_DELWRI)) { + SET(bp->nb_flags, NB_DELWRI); + /* move to dirty list */ + lck_mtx_lock(nfs_buf_mutex); + nfs_nbdwrite++; + NFSBUFCNTCHK(); + if (bp->nb_vnbufs.le_next != NFSNOLIST) + LIST_REMOVE(bp, nb_vnbufs); + LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs); + lck_mtx_unlock(nfs_buf_mutex); + } + + /* + * If the vnode has "too many" write operations in progress + * wait for them to finish the IO + */ + vnode_waitforwrites(NFSTOV(np), VNODE_ASYNC_THROTTLE, 0, 0, "nfs_buf_write_delayed"); + + /* the file is in a modified state, so make sure the flag's set */ + nfs_node_lock_force(np); + np->n_flag |= NMODIFIED; + nfs_node_unlock(np); + + /* + * If we have too many delayed write buffers, + * just fall back to doing the async write. + */ + if (nfs_nbdwrite < 0) + panic("nfs_buf_write_delayed: Negative nfs_nbdwrite"); + if (nfs_nbdwrite > NFS_A_LOT_OF_DELAYED_WRITES) { + /* issue async write */ + SET(bp->nb_flags, NB_ASYNC); + nfs_buf_write(bp); + FSDBG_BOT(551, bp, NBOFF(bp), bp->nb_flags, bp->nb_error); + return; + } + + /* Otherwise, the "write" is done, so mark and release the buffer. */ + SET(bp->nb_flags, NB_DONE); + nfs_buf_release(bp, 1); + FSDBG_BOT(551, bp, NBOFF(bp), bp->nb_flags, 0); + return; +} + +/* + * Check that a "needcommit" buffer can still be committed. + * If the write verifier has changed, we need to clear the + * the needcommit flag. 
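 * (The verifier is the cookie the server returns with UNSTABLE
 * WRITE replies. If it changes, typically because the server
 * rebooted and lost its write cache, a COMMIT can no longer
 * guarantee the data reached stable storage, so the buffer has to
 * be written again rather than just committed.)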
+ */ +void +nfs_buf_check_write_verifier(nfsnode_t np, struct nfsbuf *bp) +{ + struct nfsmount *nmp; + + if (!ISSET(bp->nb_flags, NB_NEEDCOMMIT)) + return; + + nmp = NFSTONMP(np); + if (nfs_mount_gone(nmp)) + return; + if (!ISSET(bp->nb_flags, NB_STALEWVERF) && (bp->nb_verf == nmp->nm_verf)) + return; + + /* write verifier changed, clear commit/wverf flags */ + CLR(bp->nb_flags, (NB_NEEDCOMMIT | NB_STALEWVERF)); + bp->nb_verf = 0; + nfs_node_lock_force(np); + np->n_needcommitcnt--; + CHECK_NEEDCOMMITCNT(np); + nfs_node_unlock(np); +} + +/* + * add a reference to a buffer so it doesn't disappear while being used + * (must be called with nfs_buf_mutex held) + */ +void +nfs_buf_refget(struct nfsbuf *bp) +{ + bp->nb_refs++; +} +/* + * release a reference on a buffer + * (must be called with nfs_buf_mutex held) + */ +void +nfs_buf_refrele(struct nfsbuf *bp) +{ + bp->nb_refs--; +} + +/* + * mark a particular buffer as BUSY + * (must be called with nfs_buf_mutex held) + */ +errno_t +nfs_buf_acquire(struct nfsbuf *bp, int flags, int slpflag, int slptimeo) +{ + errno_t error; + struct timespec ts; + + if (ISSET(bp->nb_lflags, NBL_BUSY)) { + /* + * since the lck_mtx_lock may block, the buffer + * may become BUSY, so we need to recheck for + * a NOWAIT request + */ + if (flags & NBAC_NOWAIT) + return (EBUSY); + SET(bp->nb_lflags, NBL_WANTED); + + ts.tv_sec = (slptimeo/100); + /* the hz value is 100; which leads to 10ms */ + ts.tv_nsec = (slptimeo % 100) * 10 * NSEC_PER_USEC * 1000; + + error = msleep(bp, nfs_buf_mutex, slpflag | (PRIBIO + 1), + "nfs_buf_acquire", &ts); + if (error) + return (error); + return (EAGAIN); + } + if (flags & NBAC_REMOVE) + nfs_buf_remfree(bp); + SET(bp->nb_lflags, NBL_BUSY); + + return (0); +} + +/* + * simply drop the BUSY status of a buffer + * (must be called with nfs_buf_mutex held) + */ +void +nfs_buf_drop(struct nfsbuf *bp) +{ + int need_wakeup = 0; + + if (!ISSET(bp->nb_lflags, NBL_BUSY)) + panic("nfs_buf_drop: buffer not busy!"); + if (ISSET(bp->nb_lflags, NBL_WANTED)) { + /* delay the actual wakeup until after we clear NBL_BUSY */ + need_wakeup = 1; + } + /* Unlock the buffer. 
*/ + CLR(bp->nb_lflags, (NBL_BUSY | NBL_WANTED)); + + if (need_wakeup) + wakeup(bp); +} + +/* + * prepare for iterating over an nfsnode's buffer list + * this lock protects the queue manipulation + * (must be called with nfs_buf_mutex held) + */ +int +nfs_buf_iterprepare(nfsnode_t np, struct nfsbuflists *iterheadp, int flags) +{ + struct nfsbuflists *listheadp; + + if (flags & NBI_DIRTY) + listheadp = &np->n_dirtyblkhd; + else + listheadp = &np->n_cleanblkhd; + + if ((flags & NBI_NOWAIT) && (np->n_bufiterflags & NBI_ITER)) { + LIST_INIT(iterheadp); + return(EWOULDBLOCK); + } + + while (np->n_bufiterflags & NBI_ITER) { + np->n_bufiterflags |= NBI_ITERWANT; + msleep(&np->n_bufiterflags, nfs_buf_mutex, 0, "nfs_buf_iterprepare", NULL); + } + if (LIST_EMPTY(listheadp)) { + LIST_INIT(iterheadp); + return(EINVAL); + } + np->n_bufiterflags |= NBI_ITER; + + iterheadp->lh_first = listheadp->lh_first; + listheadp->lh_first->nb_vnbufs.le_prev = &iterheadp->lh_first; + LIST_INIT(listheadp); + + return(0); +} + +/* + * clean up after iterating over an nfsnode's buffer list + * this lock protects the queue manipulation + * (must be called with nfs_buf_mutex held) + */ +void +nfs_buf_itercomplete(nfsnode_t np, struct nfsbuflists *iterheadp, int flags) +{ + struct nfsbuflists * listheadp; + struct nfsbuf *bp; + + if (flags & NBI_DIRTY) + listheadp = &np->n_dirtyblkhd; + else + listheadp = &np->n_cleanblkhd; + + while (!LIST_EMPTY(iterheadp)) { + bp = LIST_FIRST(iterheadp); + LIST_REMOVE(bp, nb_vnbufs); + LIST_INSERT_HEAD(listheadp, bp, nb_vnbufs); + } + + np->n_bufiterflags &= ~NBI_ITER; + if (np->n_bufiterflags & NBI_ITERWANT) { + np->n_bufiterflags &= ~NBI_ITERWANT; + wakeup(&np->n_bufiterflags); + } +} + + +/* + * Read an NFS buffer for a file. + */ +int +nfs_buf_read(struct nfsbuf *bp) +{ + int error = 0; + nfsnode_t np; + thread_t thd; + kauth_cred_t cred; + + np = bp->nb_np; + cred = bp->nb_rcred; + if (IS_VALID_CRED(cred)) + kauth_cred_ref(cred); + thd = ISSET(bp->nb_flags, NB_ASYNC) ? NULL : current_thread(); + + /* sanity checks */ + if (!ISSET(bp->nb_flags, NB_READ)) + panic("nfs_buf_read: !NB_READ"); + if (ISSET(bp->nb_flags, NB_DONE)) + CLR(bp->nb_flags, NB_DONE); + + NFS_BUF_MAP(bp); + + OSAddAtomic64(1, &nfsstats.read_bios); + + error = nfs_buf_read_rpc(bp, thd, cred); + /* + * For async I/O, the callbacks will finish up the + * read. Otherwise, the read has already been finished. + */ + + if (IS_VALID_CRED(cred)) + kauth_cred_unref(&cred); + return (error); +} + +/* + * finish the reading of a buffer + */ +void +nfs_buf_read_finish(struct nfsbuf *bp) +{ + nfsnode_t np = bp->nb_np; + struct nfsmount *nmp; + + if (!ISSET(bp->nb_flags, NB_ERROR)) { + /* update valid range */ + bp->nb_validoff = 0; + bp->nb_validend = bp->nb_endio; + if (bp->nb_endio < (int)bp->nb_bufsize) { + /* + * The read may be short because we have unflushed writes + * that are extending the file size and the reads hit the + * (old) EOF on the server. So, just make sure nb_validend + * correctly tracks EOF. + * Note that the missing data should have already been zeroed + * in nfs_buf_read_rpc_finish(). 
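+ * In outline, nb_validend becomes min(nb_bufsize, n_size - NBOFF(bp)),
+ * clamped at zero if the file now ends before this buffer.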
+ */ + off_t boff = NBOFF(bp); + if ((off_t)np->n_size >= (boff + bp->nb_bufsize)) + bp->nb_validend = bp->nb_bufsize; + else if ((off_t)np->n_size >= boff) + bp->nb_validend = np->n_size - boff; + else + bp->nb_validend = 0; + } + if ((nmp = NFSTONMP(np)) && (nmp->nm_vers == NFS_VER2) && + ((NBOFF(bp) + bp->nb_validend) > 0x100000000LL)) + bp->nb_validend = 0x100000000LL - NBOFF(bp); + bp->nb_valid = (1 << (round_page_32(bp->nb_validend) / PAGE_SIZE)) - 1; + if (bp->nb_validend & PAGE_MASK) { + /* zero-fill remainder of last page */ + bzero(bp->nb_data + bp->nb_validend, PAGE_SIZE - (bp->nb_validend & PAGE_MASK)); + } + } + nfs_buf_iodone(bp); +} + +/* + * initiate the NFS READ RPC(s) for a buffer + */ +int +nfs_buf_read_rpc(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred) +{ + struct nfsmount *nmp; + nfsnode_t np = bp->nb_np; + int error = 0, nfsvers, async; + int offset, nrpcs; + uint32_t nmrsize, length, len; + off_t boff; + struct nfsreq *req; + struct nfsreq_cbinfo cb; + + nmp = NFSTONMP(np); + if (nfs_mount_gone(nmp)) { + bp->nb_error = error = ENXIO; + SET(bp->nb_flags, NB_ERROR); + nfs_buf_iodone(bp); + return (error); + } + nfsvers = nmp->nm_vers; + nmrsize = nmp->nm_rsize; + + boff = NBOFF(bp); + offset = 0; + length = bp->nb_bufsize; + + if (nfsvers == NFS_VER2) { + if (boff > 0xffffffffLL) { + bp->nb_error = error = EFBIG; + SET(bp->nb_flags, NB_ERROR); + nfs_buf_iodone(bp); + return (error); + } + if ((boff + length - 1) > 0xffffffffLL) + length = 0x100000000LL - boff; + } + + /* Note: Can only do async I/O if nfsiods are configured. */ + async = (bp->nb_flags & NB_ASYNC); + cb.rcb_func = async ? nfs_buf_read_rpc_finish : NULL; + cb.rcb_bp = bp; + + bp->nb_offio = bp->nb_endio = 0; + bp->nb_rpcs = nrpcs = (length + nmrsize - 1) / nmrsize; + if (async && (nrpcs > 1)) { + SET(bp->nb_flags, NB_MULTASYNCRPC); + } else { + CLR(bp->nb_flags, NB_MULTASYNCRPC); + } + + while (length > 0) { + if (ISSET(bp->nb_flags, NB_ERROR)) { + error = bp->nb_error; + break; + } + len = (length > nmrsize) ? nmrsize : length; + cb.rcb_args[0] = offset; + cb.rcb_args[1] = len; + if (nmp->nm_vers >= NFS_VER4) + cb.rcb_args[2] = nmp->nm_stategenid; + req = NULL; + error = nmp->nm_funcs->nf_read_rpc_async(np, boff + offset, len, thd, cred, &cb, &req); + if (error) + break; + offset += len; + length -= len; + if (async) + continue; + nfs_buf_read_rpc_finish(req); + if (ISSET(bp->nb_flags, NB_ERROR)) { + error = bp->nb_error; + break; + } + } + + if (length > 0) { + /* + * Something bad happened while trying to send the RPC(s). + * Wait for any outstanding requests to complete. 
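+ *
+ * nb_rpcs is reduced by the number of RPCs that were never sent.  If
+ * none remain outstanding the buffer is completed right here; otherwise
+ * we sleep until the last callback drops the count to zero and wakes us.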
+ */ + bp->nb_error = error; + SET(bp->nb_flags, NB_ERROR); + if (ISSET(bp->nb_flags, NB_MULTASYNCRPC)) { + nrpcs = (length + nmrsize - 1) / nmrsize; + lck_mtx_lock(nfs_buf_mutex); + bp->nb_rpcs -= nrpcs; + if (bp->nb_rpcs == 0) { + /* No RPCs left, so the buffer's done */ + lck_mtx_unlock(nfs_buf_mutex); + nfs_buf_iodone(bp); + } else { + /* wait for the last RPC to mark it done */ + while (bp->nb_rpcs > 0) + msleep(&bp->nb_rpcs, nfs_buf_mutex, 0, + "nfs_buf_read_rpc_cancel", NULL); + lck_mtx_unlock(nfs_buf_mutex); + } + } else { + nfs_buf_iodone(bp); + } + } + + return (error); +} + +/* + * finish up an NFS READ RPC on a buffer + */ +void +nfs_buf_read_rpc_finish(struct nfsreq *req) +{ + struct nfsmount *nmp; + size_t rlen; + struct nfsreq_cbinfo cb; + struct nfsbuf *bp; + int error = 0, nfsvers, offset, length, eof = 0, multasyncrpc, finished; + void *wakeme = NULL; + struct nfsreq *rreq = NULL; + nfsnode_t np; + thread_t thd; + kauth_cred_t cred; + uio_t auio; + char uio_buf [ UIO_SIZEOF(1) ]; + +finish: + np = req->r_np; + thd = req->r_thread; + cred = req->r_cred; + if (IS_VALID_CRED(cred)) + kauth_cred_ref(cred); + cb = req->r_callback; + bp = cb.rcb_bp; + if (cb.rcb_func) /* take an extra reference on the nfsreq in case we want to resend it later due to grace error */ + nfs_request_ref(req, 0); + + nmp = NFSTONMP(np); + if (nfs_mount_gone(nmp)) { + SET(bp->nb_flags, NB_ERROR); + bp->nb_error = error = ENXIO; + } + if (error || ISSET(bp->nb_flags, NB_ERROR)) { + /* just drop it */ + nfs_request_async_cancel(req); + goto out; + } + + nfsvers = nmp->nm_vers; + offset = cb.rcb_args[0]; + rlen = length = cb.rcb_args[1]; + + auio = uio_createwithbuffer(1, NBOFF(bp) + offset, UIO_SYSSPACE, + UIO_READ, &uio_buf, sizeof(uio_buf)); + uio_addiov(auio, CAST_USER_ADDR_T(bp->nb_data + offset), length); + + /* finish the RPC */ + error = nmp->nm_funcs->nf_read_rpc_async_finish(np, req, auio, &rlen, &eof); + if ((error == EINPROGRESS) && cb.rcb_func) { + /* async request restarted */ + if (cb.rcb_func) + nfs_request_rele(req); + if (IS_VALID_CRED(cred)) + kauth_cred_unref(&cred); + return; + } + if ((nmp->nm_vers >= NFS_VER4) && nfs_mount_state_error_should_restart(error) && !ISSET(bp->nb_flags, NB_ERROR)) { + lck_mtx_lock(&nmp->nm_lock); + if ((error != NFSERR_OLD_STATEID) && (error != NFSERR_GRACE) && (cb.rcb_args[2] == nmp->nm_stategenid)) { + NP(np, "nfs_buf_read_rpc_finish: error %d @ 0x%llx, 0x%x 0x%x, initiating recovery", + error, NBOFF(bp)+offset, cb.rcb_args[2], nmp->nm_stategenid); + nfs_need_recover(nmp, error); + } + lck_mtx_unlock(&nmp->nm_lock); + if (np->n_flag & NREVOKE) { + error = EIO; + } else { + if (error == NFSERR_GRACE) { + if (cb.rcb_func) { + /* + * For an async I/O request, handle a grace delay just like + * jukebox errors. Set the resend time and queue it up. 
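+ *
+ * Any reply is freed, the request gets a resend time a couple of
+ * seconds out, a fresh XID and R_RESTART, and it is handed back to the
+ * async resend queue.  Synchronous callers below just sleep for a
+ * couple of seconds and retry.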
+ */ + struct timeval now; + if (req->r_nmrep.nmc_mhead) { + mbuf_freem(req->r_nmrep.nmc_mhead); + req->r_nmrep.nmc_mhead = NULL; + } + req->r_error = 0; + microuptime(&now); + lck_mtx_lock(&req->r_mtx); + req->r_resendtime = now.tv_sec + 2; + req->r_xid = 0; // get a new XID + req->r_flags |= R_RESTART; + req->r_start = 0; + nfs_asyncio_resend(req); + lck_mtx_unlock(&req->r_mtx); + if (IS_VALID_CRED(cred)) + kauth_cred_unref(&cred); + /* Note: nfsreq reference taken will be dropped later when finished */ + return; + } + /* otherwise, just pause a couple seconds and retry */ + tsleep(&nmp->nm_state, (PZERO-1), "nfsgrace", 2*hz); + } + if (!(error = nfs_mount_state_wait_for_recovery(nmp))) { + rlen = 0; + goto readagain; + } + } + } + if (error) { + SET(bp->nb_flags, NB_ERROR); + bp->nb_error = error; + goto out; + } + + if ((rlen > 0) && (bp->nb_endio < (offset + (int)rlen))) + bp->nb_endio = offset + rlen; + + if ((nfsvers == NFS_VER2) || eof || (rlen == 0)) { + /* zero out the remaining data (up to EOF) */ + off_t rpcrem, eofrem, rem; + rpcrem = (length - rlen); + eofrem = np->n_size - (NBOFF(bp) + offset + rlen); + rem = (rpcrem < eofrem) ? rpcrem : eofrem; + if (rem > 0) + bzero(bp->nb_data + offset + rlen, rem); + } else if (((int)rlen < length) && !ISSET(bp->nb_flags, NB_ERROR)) { + /* + * short read + * + * We haven't hit EOF and we didn't get all the data + * requested, so we need to issue another read for the rest. + * (Don't bother if the buffer already hit an error.) + */ +readagain: + offset += rlen; + length -= rlen; + cb.rcb_args[0] = offset; + cb.rcb_args[1] = length; + if (nmp->nm_vers >= NFS_VER4) + cb.rcb_args[2] = nmp->nm_stategenid; + error = nmp->nm_funcs->nf_read_rpc_async(np, NBOFF(bp) + offset, length, thd, cred, &cb, &rreq); + if (!error) { + if (IS_VALID_CRED(cred)) + kauth_cred_unref(&cred); + if (!cb.rcb_func) { + /* if !async we'll need to wait for this RPC to finish */ + req = rreq; + rreq = NULL; + goto finish; + } + nfs_request_rele(req); + /* + * We're done here. + * Outstanding RPC count is unchanged. + * Callback will be called when RPC is done. + */ + return; + } + SET(bp->nb_flags, NB_ERROR); + bp->nb_error = error; + } + +out: + if (cb.rcb_func) + nfs_request_rele(req); + if (IS_VALID_CRED(cred)) + kauth_cred_unref(&cred); + + /* + * Decrement outstanding RPC count on buffer + * and call nfs_buf_read_finish on last RPC. + * + * (Note: when there are multiple async RPCs issued for a + * buffer we need nfs_buffer_mutex to avoid problems when + * aborting a partially-initiated set of RPCs) + */ + + multasyncrpc = ISSET(bp->nb_flags, NB_MULTASYNCRPC); + if (multasyncrpc) + lck_mtx_lock(nfs_buf_mutex); + + bp->nb_rpcs--; + finished = (bp->nb_rpcs == 0); + + if (multasyncrpc) + lck_mtx_unlock(nfs_buf_mutex); + + if (finished) { + if (multasyncrpc) + wakeme = &bp->nb_rpcs; + nfs_buf_read_finish(bp); + if (wakeme) + wakeup(wakeme); + } +} + +/* + * Do buffer readahead. + * Initiate async I/O to read buffers not in cache. + */ +int +nfs_buf_readahead(nfsnode_t np, int ioflag, daddr64_t *rabnp, daddr64_t lastrabn, thread_t thd, kauth_cred_t cred) +{ + struct nfsmount *nmp = NFSTONMP(np); + struct nfsbuf *bp; + int error = 0; + uint32_t nra; + + if (nfs_mount_gone(nmp)) + return (ENXIO); + if (nmp->nm_readahead <= 0) + return (0); + if (*rabnp > lastrabn) + return (0); + + for (nra = 0; (nra < nmp->nm_readahead) && (*rabnp <= lastrabn); nra++, *rabnp = *rabnp + 1) { + /* check if block exists and is valid. 
*/ + if ((*rabnp * nmp->nm_biosize) >= (off_t)np->n_size) { + /* stop reading ahead if we're beyond EOF */ + *rabnp = lastrabn; + break; + } + error = nfs_buf_get(np, *rabnp, nmp->nm_biosize, thd, NBLK_READ|NBLK_NOWAIT, &bp); + if (error) + break; + nfs_node_lock_force(np); + np->n_lastrahead = *rabnp; + nfs_node_unlock(np); + if (!bp) + continue; + if ((ioflag & IO_NOCACHE) && ISSET(bp->nb_flags, NB_CACHE) && + !bp->nb_dirty && !ISSET(bp->nb_flags, (NB_DELWRI|NB_NCRDAHEAD))) { + CLR(bp->nb_flags, NB_CACHE); + bp->nb_valid = 0; + bp->nb_validoff = bp->nb_validend = -1; + } + if ((bp->nb_dirtyend <= 0) && !bp->nb_dirty && + !ISSET(bp->nb_flags, (NB_CACHE|NB_DELWRI))) { + SET(bp->nb_flags, (NB_READ|NB_ASYNC)); + if (ioflag & IO_NOCACHE) + SET(bp->nb_flags, NB_NCRDAHEAD); + if (!IS_VALID_CRED(bp->nb_rcred) && IS_VALID_CRED(cred)) { + kauth_cred_ref(cred); + bp->nb_rcred = cred; + } + if ((error = nfs_buf_read(bp))) + break; + continue; + } + nfs_buf_release(bp, 1); + } + return (error); +} + +/* + * NFS buffer I/O for reading files. + */ +int +nfs_bioread(nfsnode_t np, uio_t uio, int ioflag, vfs_context_t ctx) +{ + vnode_t vp = NFSTOV(np); + struct nfsbuf *bp = NULL; + struct nfsmount *nmp = VTONMP(vp); + daddr64_t lbn, rabn = 0, lastrabn, maxrabn = -1; + off_t diff; + int error = 0, n = 0, on = 0; + int nfsvers, biosize, modified, readaheads = 0; + thread_t thd; + kauth_cred_t cred; + int64_t io_resid; + + FSDBG_TOP(514, np, uio_offset(uio), uio_resid(uio), ioflag); + + nfsvers = nmp->nm_vers; + biosize = nmp->nm_biosize; + thd = vfs_context_thread(ctx); + cred = vfs_context_ucred(ctx); + + if (vnode_vtype(vp) != VREG) { + printf("nfs_bioread: type %x unexpected\n", vnode_vtype(vp)); + FSDBG_BOT(514, np, 0xd1e0016, 0, EINVAL); + return (EINVAL); + } + + /* + * For NFS, cache consistency can only be maintained approximately. + * Although RFC1094 does not specify the criteria, the following is + * believed to be compatible with the reference port. + * + * If the file has changed since the last read RPC or you have + * written to the file, you may have lost data cache consistency + * with the server. So, check for a change, and flush all of the + * file's data out of the cache. + * NB: This implies that cache data can be read when up to + * NFS_MAXATTRTIMO seconds out of date. If you find that you + * need current attributes, nfs_getattr() can be forced to fetch + * new attributes (via NATTRINVALIDATE() or NGA_UNCACHED). + */ + + if (ISSET(np->n_flag, NUPDATESIZE)) + nfs_data_update_size(np, 0); + + if ((error = nfs_node_lock(np))) { + FSDBG_BOT(514, np, 0xd1e0222, 0, error); + return (error); + } + + if (np->n_flag & NNEEDINVALIDATE) { + np->n_flag &= ~NNEEDINVALIDATE; + nfs_node_unlock(np); + error = nfs_vinvalbuf(vp, V_SAVE|V_IGNORE_WRITEERR, ctx, 1); + if (!error) + error = nfs_node_lock(np); + if (error) { + FSDBG_BOT(514, np, 0xd1e0322, 0, error); + return (error); + } + } + + modified = (np->n_flag & NMODIFIED); + nfs_node_unlock(np); + /* nfs_getattr() will check changed and purge caches */ + error = nfs_getattr(np, NULL, ctx, modified ? 
NGA_UNCACHED : NGA_CACHED); + if (error) { + FSDBG_BOT(514, np, 0xd1e0004, 0, error); + return (error); + } + + if (uio_resid(uio) == 0) { + FSDBG_BOT(514, np, 0xd1e0001, 0, 0); + return (0); + } + if (uio_offset(uio) < 0) { + FSDBG_BOT(514, np, 0xd1e0002, 0, EINVAL); + return (EINVAL); + } + + /* + * set up readahead - which may be limited by: + * + current request length (for IO_NOCACHE) + * + readahead setting + * + file size + */ + if (nmp->nm_readahead > 0) { + off_t end = uio_offset(uio) + uio_resid(uio); + if (end > (off_t)np->n_size) + end = np->n_size; + rabn = uio_offset(uio) / biosize; + maxrabn = (end - 1) / biosize; + nfs_node_lock_force(np); + if (!(ioflag & IO_NOCACHE) && + (!rabn || (rabn == np->n_lastread) || (rabn == (np->n_lastread+1)))) { + maxrabn += nmp->nm_readahead; + if ((maxrabn * biosize) >= (off_t)np->n_size) + maxrabn = ((off_t)np->n_size - 1)/biosize; + } + if (maxrabn < np->n_lastrahead) + np->n_lastrahead = -1; + if (rabn < np->n_lastrahead) + rabn = np->n_lastrahead + 1; + nfs_node_unlock(np); + } else { + rabn = maxrabn = 0; + } + + do { + + nfs_data_lock(np, NFS_DATA_LOCK_SHARED); + lbn = uio_offset(uio) / biosize; + + /* + * Copy directly from any cached pages without grabbing the bufs. + * (If we are NOCACHE and we've issued readahead requests, we need + * to grab the NB_NCRDAHEAD bufs to drop them.) + */ + if ((!(ioflag & IO_NOCACHE) || !readaheads) && + ((uio->uio_segflg == UIO_USERSPACE32 || + uio->uio_segflg == UIO_USERSPACE64 || + uio->uio_segflg == UIO_USERSPACE))) { + io_resid = uio_resid(uio); + diff = np->n_size - uio_offset(uio); + if (diff < io_resid) + io_resid = diff; + if (io_resid > 0) { + int count = (io_resid > INT_MAX) ? INT_MAX : io_resid; + error = cluster_copy_ubc_data(vp, uio, &count, 0); + if (error) { + nfs_data_unlock(np); + FSDBG_BOT(514, np, uio_offset(uio), 0xcacefeed, error); + return (error); + } + } + /* count any biocache reads that we just copied directly */ + if (lbn != (uio_offset(uio)/biosize)) { + OSAddAtomic64((uio_offset(uio)/biosize) - lbn, &nfsstats.biocache_reads); + FSDBG(514, np, 0xcacefeed, uio_offset(uio), error); + } + } + + lbn = uio_offset(uio) / biosize; + on = uio_offset(uio) % biosize; + nfs_node_lock_force(np); + np->n_lastread = (uio_offset(uio) - 1) / biosize; + nfs_node_unlock(np); + + if ((uio_resid(uio) <= 0) || (uio_offset(uio) >= (off_t)np->n_size)) { + nfs_data_unlock(np); + FSDBG_BOT(514, np, uio_offset(uio), uio_resid(uio), 0xaaaaaaaa); + return (0); + } + + /* adjust readahead block number, if necessary */ + if (rabn < lbn) + rabn = lbn; + lastrabn = MIN(maxrabn, lbn + nmp->nm_readahead); + if (rabn <= lastrabn) { /* start readaheads */ + error = nfs_buf_readahead(np, ioflag, &rabn, lastrabn, thd, cred); + if (error) { + nfs_data_unlock(np); + FSDBG_BOT(514, np, 0xd1e000b, 1, error); + return (error); + } + readaheads = 1; + } + + OSAddAtomic64(1, &nfsstats.biocache_reads); + + /* + * If the block is in the cache and has the required data + * in a valid region, just copy it out. + * Otherwise, get the block and write back/read in, + * as required. + */ +again: + io_resid = uio_resid(uio); + n = (io_resid > (biosize - on)) ? (biosize - on) : io_resid; + diff = np->n_size - uio_offset(uio); + if (diff < n) + n = diff; + + error = nfs_buf_get(np, lbn, biosize, thd, NBLK_READ, &bp); + if (error) { + nfs_data_unlock(np); + FSDBG_BOT(514, np, 0xd1e000c, 0, error); + return (error); + } + + if ((ioflag & IO_NOCACHE) && ISSET(bp->nb_flags, NB_CACHE)) { + /* + * IO_NOCACHE found a cached buffer. 
+ * Flush the buffer if it's dirty. + * Invalidate the data if it wasn't just read + * in as part of a "nocache readahead". + */ + if (bp->nb_dirty || (bp->nb_dirtyend > 0)) { + /* so write the buffer out and try again */ + SET(bp->nb_flags, NB_NOCACHE); + goto flushbuffer; + } + if (ISSET(bp->nb_flags, NB_NCRDAHEAD)) { + CLR(bp->nb_flags, NB_NCRDAHEAD); + SET(bp->nb_flags, NB_NOCACHE); + } + } + + /* if any pages are valid... */ + if (bp->nb_valid) { + /* ...check for any invalid pages in the read range */ + int pg, firstpg, lastpg, dirtypg; + dirtypg = firstpg = lastpg = -1; + pg = on/PAGE_SIZE; + while (pg <= (on + n - 1)/PAGE_SIZE) { + if (!NBPGVALID(bp,pg)) { + if (firstpg < 0) + firstpg = pg; + lastpg = pg; + } else if (firstpg >= 0 && dirtypg < 0 && NBPGDIRTY(bp,pg)) + dirtypg = pg; + pg++; + } + + /* if there are no invalid pages, we're all set */ + if (firstpg < 0) { + if (bp->nb_validoff < 0) { + /* valid range isn't set up, so */ + /* set it to what we know is valid */ + bp->nb_validoff = trunc_page(on); + bp->nb_validend = round_page(on+n); + nfs_buf_normalize_valid_range(np, bp); + } + goto buffer_ready; + } + + /* there are invalid pages in the read range */ + if (((dirtypg > firstpg) && (dirtypg < lastpg)) || + (((firstpg*PAGE_SIZE) < bp->nb_dirtyend) && (((lastpg+1)*PAGE_SIZE) > bp->nb_dirtyoff))) { + /* there are also dirty page(s) (or range) in the read range, */ + /* so write the buffer out and try again */ +flushbuffer: + CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL)); + SET(bp->nb_flags, NB_ASYNC); + if (!IS_VALID_CRED(bp->nb_wcred)) { + kauth_cred_ref(cred); + bp->nb_wcred = cred; + } + error = nfs_buf_write(bp); + if (error) { + nfs_data_unlock(np); + FSDBG_BOT(514, np, 0xd1e000d, 0, error); + return (error); + } + goto again; + } + if (!bp->nb_dirty && bp->nb_dirtyend <= 0 && + (lastpg - firstpg + 1) > (biosize/PAGE_SIZE)/2) { + /* we need to read in more than half the buffer and the */ + /* buffer's not dirty, so just fetch the whole buffer */ + bp->nb_valid = 0; + } else { + /* read the page range in */ + uio_t auio; + char uio_buf[ UIO_SIZEOF(1) ]; + + NFS_BUF_MAP(bp); + auio = uio_createwithbuffer(1, (NBOFF(bp) + firstpg * PAGE_SIZE_64), + UIO_SYSSPACE, UIO_READ, &uio_buf[0], sizeof(uio_buf)); + if (!auio) { + error = ENOMEM; + } else { + uio_addiov(auio, CAST_USER_ADDR_T(bp->nb_data + (firstpg * PAGE_SIZE)), + ((lastpg - firstpg + 1) * PAGE_SIZE)); + error = nfs_read_rpc(np, auio, ctx); + } + if (error) { + if (ioflag & IO_NOCACHE) + SET(bp->nb_flags, NB_NOCACHE); + nfs_buf_release(bp, 1); + nfs_data_unlock(np); + FSDBG_BOT(514, np, 0xd1e000e, 0, error); + return (error); + } + /* Make sure that the valid range is set to cover this read. 
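+ * The range is page aligned (validoff truncated, validend rounded up),
+ * a short read zero-fills the rest of the range, and the pages read are
+ * then marked valid.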
*/ + bp->nb_validoff = trunc_page_32(on); + bp->nb_validend = round_page_32(on+n); + nfs_buf_normalize_valid_range(np, bp); + if (uio_resid(auio) > 0) { + /* if short read, must have hit EOF, */ + /* so zero the rest of the range */ + bzero(CAST_DOWN(caddr_t, uio_curriovbase(auio)), uio_resid(auio)); + } + /* mark the pages (successfully read) as valid */ + for (pg=firstpg; pg <= lastpg; pg++) + NBPGVALID_SET(bp,pg); + } + } + /* if no pages are valid, read the whole block */ + if (!bp->nb_valid) { + if (!IS_VALID_CRED(bp->nb_rcred) && IS_VALID_CRED(cred)) { + kauth_cred_ref(cred); + bp->nb_rcred = cred; + } + SET(bp->nb_flags, NB_READ); + CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL)); + error = nfs_buf_read(bp); + if (ioflag & IO_NOCACHE) + SET(bp->nb_flags, NB_NOCACHE); + if (error) { + nfs_data_unlock(np); + nfs_buf_release(bp, 1); + FSDBG_BOT(514, np, 0xd1e000f, 0, error); + return (error); + } + } +buffer_ready: + /* validate read range against valid range and clip */ + if (bp->nb_validend > 0) { + diff = (on >= bp->nb_validend) ? 0 : (bp->nb_validend - on); + if (diff < n) + n = diff; + } + if (n > 0) { + NFS_BUF_MAP(bp); + error = uiomove(bp->nb_data + on, n, uio); + } + + nfs_buf_release(bp, 1); + nfs_data_unlock(np); + nfs_node_lock_force(np); + np->n_lastread = (uio_offset(uio) - 1) / biosize; + nfs_node_unlock(np); + } while (error == 0 && uio_resid(uio) > 0 && n > 0); + FSDBG_BOT(514, np, uio_offset(uio), uio_resid(uio), error); + return (error); +} + +/* + * limit the number of outstanding async I/O writes + */ +int +nfs_async_write_start(struct nfsmount *nmp) +{ + int error = 0, slpflag = NMFLAG(nmp, INTR) ? PCATCH : 0; + struct timespec ts = {1, 0}; + + if (nfs_max_async_writes <= 0) + return (0); + lck_mtx_lock(&nmp->nm_lock); + while ((nfs_max_async_writes > 0) && (nmp->nm_asyncwrites >= nfs_max_async_writes)) { + if ((error = nfs_sigintr(nmp, NULL, current_thread(), 1))) + break; + msleep(&nmp->nm_asyncwrites, &nmp->nm_lock, slpflag|(PZERO-1), "nfsasyncwrites", &ts); + slpflag = 0; + } + if (!error) + nmp->nm_asyncwrites++; + lck_mtx_unlock(&nmp->nm_lock); + return (error); +} +void +nfs_async_write_done(struct nfsmount *nmp) +{ + if (nmp->nm_asyncwrites <= 0) + return; + lck_mtx_lock(&nmp->nm_lock); + if (nmp->nm_asyncwrites-- >= nfs_max_async_writes) + wakeup(&nmp->nm_asyncwrites); + lck_mtx_unlock(&nmp->nm_lock); +} + +/* + * write (or commit) the given NFS buffer + * + * Commit the buffer if we can. + * Write out any dirty range. + * If any dirty pages remain, write them out. + * Mark buffer done. + * + * For async requests, all the work beyond sending the initial + * write RPC is handled in the RPC callback(s). 
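+ *
+ * Roughly, the steps below are:
+ *   1. if the buffer only needs a commit (NB_NEEDCOMMIT), try a COMMIT RPC;
+ *   2. clip the dirty range to EOF and grow it out to page boundaries;
+ *   3. choose UNSTABLE vs FILESYNC (stray dirty pages, NB_STABLE, or a
+ *      synchronous caller mean FILESYNC);
+ *   4. issue the WRITE RPC(s) via nfs_buf_write_rpc();
+ *   5. for synchronous callers, wait for the I/O and, if a write error
+ *      was recorded, invalidate the node's buffers after releasing this one.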
+ */ +int +nfs_buf_write(struct nfsbuf *bp) +{ + int error = 0, oldflags, async; + nfsnode_t np; + thread_t thd; + kauth_cred_t cred; + proc_t p = current_proc(); + int iomode, doff, dend, firstpg, lastpg; + uint32_t pagemask; + + FSDBG_TOP(553, bp, NBOFF(bp), bp->nb_flags, 0); + + if (!ISSET(bp->nb_lflags, NBL_BUSY)) + panic("nfs_buf_write: buffer is not busy???"); + + np = bp->nb_np; + async = ISSET(bp->nb_flags, NB_ASYNC); + oldflags = bp->nb_flags; + + CLR(bp->nb_flags, (NB_READ|NB_DONE|NB_ERROR|NB_DELWRI)); + if (ISSET(oldflags, NB_DELWRI)) { + lck_mtx_lock(nfs_buf_mutex); + nfs_nbdwrite--; + NFSBUFCNTCHK(); + lck_mtx_unlock(nfs_buf_mutex); + wakeup(&nfs_nbdwrite); + } + + /* move to clean list */ + if (ISSET(oldflags, (NB_ASYNC|NB_DELWRI))) { + lck_mtx_lock(nfs_buf_mutex); + if (bp->nb_vnbufs.le_next != NFSNOLIST) + LIST_REMOVE(bp, nb_vnbufs); + LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs); + lck_mtx_unlock(nfs_buf_mutex); + } + nfs_node_lock_force(np); + np->n_numoutput++; + nfs_node_unlock(np); + vnode_startwrite(NFSTOV(np)); + + if (p && p->p_stats) + OSIncrementAtomicLong(&p->p_stats->p_ru.ru_oublock); + + cred = bp->nb_wcred; + if (!IS_VALID_CRED(cred) && ISSET(bp->nb_flags, NB_READ)) + cred = bp->nb_rcred; /* shouldn't really happen, but... */ + if (IS_VALID_CRED(cred)) + kauth_cred_ref(cred); + thd = async ? NULL : current_thread(); + + /* We need to make sure the pages are locked before doing I/O. */ + if (!ISSET(bp->nb_flags, NB_META)) { + if (UBCINFOEXISTS(NFSTOV(np))) { + if (!ISSET(bp->nb_flags, NB_PAGELIST)) { + error = nfs_buf_upl_setup(bp); + if (error) { + printf("nfs_buf_write: upl create failed %d\n", error); + SET(bp->nb_flags, NB_ERROR); + bp->nb_error = error = EIO; + nfs_buf_iodone(bp); + goto out; + } + nfs_buf_upl_check(bp); + } + } else { + /* We should never be in nfs_buf_write() with no UBCINFO. */ + printf("nfs_buf_write: ubcinfo already gone\n"); + SET(bp->nb_flags, NB_ERROR); + bp->nb_error = error = EIO; + nfs_buf_iodone(bp); + goto out; + } + } + + /* If NB_NEEDCOMMIT is set, a commit RPC may do the trick. 
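+ * If the commit succeeds, the dirty range and NB_NEEDCOMMIT are cleared
+ * and the node's needcommit count is dropped, leaving nothing to write.
+ * A stale write verifier is deliberately not treated as a buffer error;
+ * any other commit failure marks the buffer NB_ERROR.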
*/ + if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) + nfs_buf_check_write_verifier(np, bp); + if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) { + struct nfsmount *nmp = NFSTONMP(np); + if (nfs_mount_gone(nmp)) { + SET(bp->nb_flags, NB_ERROR); + bp->nb_error = error = EIO; + nfs_buf_iodone(bp); + goto out; + } + SET(bp->nb_flags, NB_WRITEINPROG); + error = nmp->nm_funcs->nf_commit_rpc(np, NBOFF(bp) + bp->nb_dirtyoff, + bp->nb_dirtyend - bp->nb_dirtyoff, bp->nb_wcred, bp->nb_verf); + CLR(bp->nb_flags, NB_WRITEINPROG); + if (error) { + if (error != NFSERR_STALEWRITEVERF) { + SET(bp->nb_flags, NB_ERROR); + bp->nb_error = error; + } + nfs_buf_iodone(bp); + goto out; + } + bp->nb_dirtyoff = bp->nb_dirtyend = 0; + CLR(bp->nb_flags, NB_NEEDCOMMIT); + nfs_node_lock_force(np); + np->n_needcommitcnt--; + CHECK_NEEDCOMMITCNT(np); + nfs_node_unlock(np); + } + if (!error && (bp->nb_dirtyend > 0)) { + /* sanity check the dirty range */ + if (NBOFF(bp) + bp->nb_dirtyend > (off_t) np->n_size) { + bp->nb_dirtyend = np->n_size - NBOFF(bp); + if (bp->nb_dirtyoff >= bp->nb_dirtyend) + bp->nb_dirtyoff = bp->nb_dirtyend = 0; + } + } + if (!error && (bp->nb_dirtyend > 0)) { + /* there's a dirty range that needs to be written out */ + NFS_BUF_MAP(bp); + + doff = bp->nb_dirtyoff; + dend = bp->nb_dirtyend; + + /* if doff page is dirty, move doff to start of page */ + if (NBPGDIRTY(bp, doff / PAGE_SIZE)) + doff -= doff & PAGE_MASK; + /* try to expand write range to include preceding dirty pages */ + if (!(doff & PAGE_MASK)) + while ((doff > 0) && NBPGDIRTY(bp, (doff - 1) / PAGE_SIZE)) + doff -= PAGE_SIZE; + /* if dend page is dirty, move dend to start of next page */ + if ((dend & PAGE_MASK) && NBPGDIRTY(bp, dend / PAGE_SIZE)) + dend = round_page_32(dend); + /* try to expand write range to include trailing dirty pages */ + if (!(dend & PAGE_MASK)) + while ((dend < (int)bp->nb_bufsize) && NBPGDIRTY(bp, dend / PAGE_SIZE)) + dend += PAGE_SIZE; + /* make sure to keep dend clipped to EOF */ + if ((NBOFF(bp) + dend) > (off_t) np->n_size) + dend = np->n_size - NBOFF(bp); + /* calculate range of complete pages being written */ + firstpg = round_page_32(doff) / PAGE_SIZE; + lastpg = (trunc_page_32(dend) - 1) / PAGE_SIZE; + /* calculate mask for that page range */ + pagemask = ((1 << (lastpg + 1)) - 1) & ~((1 << firstpg) - 1); + + /* + * compare page mask to nb_dirty; if there are other dirty pages + * then write FILESYNC; otherwise, write UNSTABLE if async and + * not needcommit/stable; otherwise write FILESYNC + */ + if (bp->nb_dirty & ~pagemask) + iomode = NFS_WRITE_FILESYNC; + else if ((bp->nb_flags & (NB_ASYNC | NB_NEEDCOMMIT | NB_STABLE)) == NB_ASYNC) + iomode = NFS_WRITE_UNSTABLE; + else + iomode = NFS_WRITE_FILESYNC; + + /* write the whole contiguous dirty range */ + bp->nb_offio = doff; + bp->nb_endio = dend; + + OSAddAtomic64(1, &nfsstats.write_bios); + + SET(bp->nb_flags, NB_WRITEINPROG); + error = nfs_buf_write_rpc(bp, iomode, thd, cred); + /* + * For async I/O, the callbacks will finish up the + * write and push out any dirty pages. Otherwise, + * the write has already been finished and any dirty + * pages pushed out. 
+ */ + } else { + if (!error && bp->nb_dirty) /* write out any dirty pages */ + error = nfs_buf_write_dirty_pages(bp, thd, cred); + nfs_buf_iodone(bp); + } + /* note: bp is still valid only for !async case */ +out: + if (!async) { + error = nfs_buf_iowait(bp); + /* move to clean list */ + if (oldflags & NB_DELWRI) { + lck_mtx_lock(nfs_buf_mutex); + if (bp->nb_vnbufs.le_next != NFSNOLIST) + LIST_REMOVE(bp, nb_vnbufs); + LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs); + lck_mtx_unlock(nfs_buf_mutex); + } + FSDBG_BOT(553, bp, NBOFF(bp), bp->nb_flags, error); + nfs_buf_release(bp, 1); + /* check if we need to invalidate (and we can) */ + if ((np->n_flag & NNEEDINVALIDATE) && + !(np->n_bflag & (NBINVALINPROG|NBFLUSHINPROG))) { + int invalidate = 0; + nfs_node_lock_force(np); + if (np->n_flag & NNEEDINVALIDATE) { + invalidate = 1; + np->n_flag &= ~NNEEDINVALIDATE; + } + nfs_node_unlock(np); + if (invalidate) { + /* + * There was a write error and we need to + * invalidate attrs and flush buffers in + * order to sync up with the server. + * (if this write was extending the file, + * we may no longer know the correct size) + * + * But we couldn't call vinvalbuf while holding + * the buffer busy. So we call vinvalbuf() after + * releasing the buffer. + */ + nfs_vinvalbuf2(NFSTOV(np), V_SAVE|V_IGNORE_WRITEERR, thd, cred, 1); + } + } + } + + if (IS_VALID_CRED(cred)) + kauth_cred_unref(&cred); + return (error); +} + +/* + * finish the writing of a buffer + */ +void +nfs_buf_write_finish(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred) +{ + nfsnode_t np = bp->nb_np; + int error = (bp->nb_flags & NB_ERROR) ? bp->nb_error : 0; + int firstpg, lastpg; + uint32_t pagemask; + + if ((error == EINTR) || (error == ERESTART)) { + CLR(bp->nb_flags, NB_ERROR); + SET(bp->nb_flags, NB_EINTR); + } + + if (!error) { + /* calculate range of complete pages being written */ + firstpg = round_page_32(bp->nb_offio) / PAGE_SIZE; + lastpg = (trunc_page_32(bp->nb_endio) - 1) / PAGE_SIZE; + /* calculate mask for that page range written */ + pagemask = ((1 << (lastpg + 1)) - 1) & ~((1 << firstpg) - 1); + /* clear dirty bits for pages we've written */ + bp->nb_dirty &= ~pagemask; + } + + /* manage needcommit state */ + if (!error && (bp->nb_commitlevel == NFS_WRITE_UNSTABLE)) { + if (!ISSET(bp->nb_flags, NB_NEEDCOMMIT)) { + nfs_node_lock_force(np); + np->n_needcommitcnt++; + nfs_node_unlock(np); + SET(bp->nb_flags, NB_NEEDCOMMIT); + } + /* make sure nb_dirtyoff/nb_dirtyend reflect actual range written */ + bp->nb_dirtyoff = bp->nb_offio; + bp->nb_dirtyend = bp->nb_endio; + } else if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) { + nfs_node_lock_force(np); + np->n_needcommitcnt--; + CHECK_NEEDCOMMITCNT(np); + nfs_node_unlock(np); + CLR(bp->nb_flags, NB_NEEDCOMMIT); + } + + CLR(bp->nb_flags, NB_WRITEINPROG); + + /* + * For an unstable write, the buffer is still treated as dirty until + * a commit (or stable (re)write) is performed. Buffers needing only + * a commit are marked with the NB_DELWRI and NB_NEEDCOMMIT flags. + * + * If the write was interrupted we set NB_EINTR. Don't set NB_ERROR + * because that would cause the buffer to be dropped. The buffer is + * still valid and simply needs to be written again. 
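+ *
+ * Below: an interrupted write, or one that now only needs a commit,
+ * stays a delayed write (async buffers are moved back onto the dirty
+ * list).  Otherwise the dirty range is cleared; a real error is also
+ * recorded in n_error with NWRITEERR and NNEEDINVALIDATE set on the
+ * node.  Remaining dirty pages are then written out (if nothing failed)
+ * before nfs_buf_iodone() completes the buffer.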
+ */ + if ((error == EINTR) || (error == ERESTART) || (!error && (bp->nb_flags & NB_NEEDCOMMIT))) { + CLR(bp->nb_flags, NB_INVAL); + if (!ISSET(bp->nb_flags, NB_DELWRI)) { + SET(bp->nb_flags, NB_DELWRI); + lck_mtx_lock(nfs_buf_mutex); + nfs_nbdwrite++; + NFSBUFCNTCHK(); + lck_mtx_unlock(nfs_buf_mutex); + } + /* + * Since for the NB_ASYNC case, we've reassigned the buffer to the + * clean list, we have to reassign it back to the dirty one. Ugh. + */ + if (ISSET(bp->nb_flags, NB_ASYNC)) { + /* move to dirty list */ + lck_mtx_lock(nfs_buf_mutex); + if (bp->nb_vnbufs.le_next != NFSNOLIST) + LIST_REMOVE(bp, nb_vnbufs); + LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs); + lck_mtx_unlock(nfs_buf_mutex); + } + } else { + /* either there's an error or we don't need to commit */ + if (error) { + /* + * There was a write error and we need to invalidate + * attrs and flush buffers in order to sync up with the + * server. (if this write was extending the file, we + * may no longer know the correct size) + * + * But we can't call vinvalbuf while holding this + * buffer busy. Set a flag to do it after releasing + * the buffer. + */ + nfs_node_lock_force(np); + np->n_error = error; + np->n_flag |= (NWRITEERR | NNEEDINVALIDATE); + NATTRINVALIDATE(np); + nfs_node_unlock(np); + } + /* clear the dirty range */ + bp->nb_dirtyoff = bp->nb_dirtyend = 0; + } + + if (!error && bp->nb_dirty) + nfs_buf_write_dirty_pages(bp, thd, cred); + nfs_buf_iodone(bp); +} + +/* + * write out any pages marked dirty in a buffer + * + * We do use unstable writes and follow up with a commit. + * If we catch the write verifier changing we'll restart + * do the writes filesync. + */ +int +nfs_buf_write_dirty_pages(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred) +{ + nfsnode_t np = bp->nb_np; + struct nfsmount *nmp = NFSTONMP(np); + int error = 0, commit, iomode, iomode2, len, pg, count, npages, off; + uint32_t dirty = bp->nb_dirty; + uint64_t wverf; + uio_t auio; + char uio_buf [ UIO_SIZEOF(1) ]; + + if (!bp->nb_dirty) + return (0); + + /* there are pages marked dirty that need to be written out */ + OSAddAtomic64(1, &nfsstats.write_bios); + NFS_BUF_MAP(bp); + SET(bp->nb_flags, NB_WRITEINPROG); + npages = bp->nb_bufsize / PAGE_SIZE; + iomode = NFS_WRITE_UNSTABLE; + + auio = uio_createwithbuffer(1, 0, UIO_SYSSPACE, UIO_WRITE, + &uio_buf, sizeof(uio_buf)); + +again: + dirty = bp->nb_dirty; + wverf = bp->nb_verf; + commit = NFS_WRITE_FILESYNC; + for (pg = 0; pg < npages; pg++) { + if (!NBPGDIRTY(bp, pg)) + continue; + count = 1; + while (((pg + count) < npages) && NBPGDIRTY(bp, pg + count)) + count++; + /* write count pages starting with page pg */ + off = pg * PAGE_SIZE; + len = count * PAGE_SIZE; + /* clip writes to EOF */ + if (NBOFF(bp) + off + len > (off_t) np->n_size) + len -= (NBOFF(bp) + off + len) - np->n_size; + if (len > 0) { + iomode2 = iomode; + uio_reset(auio, NBOFF(bp) + off, UIO_SYSSPACE, UIO_WRITE); + uio_addiov(auio, CAST_USER_ADDR_T(bp->nb_data + off), len); + error = nfs_write_rpc2(np, auio, thd, cred, &iomode2, &bp->nb_verf); + if (error) + break; + if (iomode2 < commit) /* Retain the lowest commitment level returned. 
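+ * ("lowest" here is the weakest guarantee; the levels order as
+ * UNSTABLE < DATASYNC < FILESYNC)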
*/ + commit = iomode2; + if ((commit != NFS_WRITE_FILESYNC) && (wverf != bp->nb_verf)) { + /* verifier changed, redo all the writes filesync */ + iomode = NFS_WRITE_FILESYNC; + goto again; + } + } + /* clear dirty bits */ + while (count--) { + dirty &= ~(1 << pg); + if (count) /* leave pg on last page */ + pg++; + } + } + CLR(bp->nb_flags, NB_WRITEINPROG); + + if (!error && (commit != NFS_WRITE_FILESYNC)) { + error = nmp->nm_funcs->nf_commit_rpc(np, NBOFF(bp), bp->nb_bufsize, cred, wverf); + if (error == NFSERR_STALEWRITEVERF) { + /* verifier changed, so we need to restart all the writes */ + iomode = NFS_WRITE_FILESYNC; + goto again; + } + } + if (!error) { + bp->nb_dirty = dirty; + } else { + SET(bp->nb_flags, NB_ERROR); + bp->nb_error = error; + } + return (error); +} + +/* + * initiate the NFS WRITE RPC(s) for a buffer + */ +int +nfs_buf_write_rpc(struct nfsbuf *bp, int iomode, thread_t thd, kauth_cred_t cred) +{ + struct nfsmount *nmp; + nfsnode_t np = bp->nb_np; + int error = 0, nfsvers, async; + int offset, nrpcs; + uint32_t nmwsize, length, len; + struct nfsreq *req; + struct nfsreq_cbinfo cb; + uio_t auio; + char uio_buf [ UIO_SIZEOF(1) ]; + + nmp = NFSTONMP(np); + if (nfs_mount_gone(nmp)) { + bp->nb_error = error = ENXIO; + SET(bp->nb_flags, NB_ERROR); + nfs_buf_iodone(bp); + return (error); + } + nfsvers = nmp->nm_vers; + nmwsize = nmp->nm_wsize; + + offset = bp->nb_offio; + length = bp->nb_endio - bp->nb_offio; + + /* Note: Can only do async I/O if nfsiods are configured. */ + async = (bp->nb_flags & NB_ASYNC) && (NFSIOD_MAX > 0); + bp->nb_commitlevel = NFS_WRITE_FILESYNC; + cb.rcb_func = async ? nfs_buf_write_rpc_finish : NULL; + cb.rcb_bp = bp; + + if ((nfsvers == NFS_VER2) && ((NBOFF(bp) + bp->nb_endio) > 0xffffffffLL)) { + bp->nb_error = error = EFBIG; + SET(bp->nb_flags, NB_ERROR); + nfs_buf_iodone(bp); + return (error); + } + + auio = uio_createwithbuffer(1, NBOFF(bp) + offset, UIO_SYSSPACE, + UIO_WRITE, &uio_buf, sizeof(uio_buf)); + uio_addiov(auio, CAST_USER_ADDR_T(bp->nb_data + offset), length); + + bp->nb_rpcs = nrpcs = (length + nmwsize - 1) / nmwsize; + if (async && (nrpcs > 1)) { + SET(bp->nb_flags, NB_MULTASYNCRPC); + } else { + CLR(bp->nb_flags, NB_MULTASYNCRPC); + } + + while (length > 0) { + if (ISSET(bp->nb_flags, NB_ERROR)) { + error = bp->nb_error; + break; + } + len = (length > nmwsize) ? nmwsize : length; + cb.rcb_args[0] = offset; + cb.rcb_args[1] = len; + if (nmp->nm_vers >= NFS_VER4) + cb.rcb_args[2] = nmp->nm_stategenid; + if (async && ((error = nfs_async_write_start(nmp)))) + break; + req = NULL; + error = nmp->nm_funcs->nf_write_rpc_async(np, auio, len, thd, cred, + iomode, &cb, &req); + if (error) { + if (async) + nfs_async_write_done(nmp); + break; + } + offset += len; + length -= len; + if (async) + continue; + nfs_buf_write_rpc_finish(req); + } + + if (length > 0) { + /* + * Something bad happened while trying to send the RPCs. + * Wait for any outstanding requests to complete. 
+ */ + bp->nb_error = error; + SET(bp->nb_flags, NB_ERROR); + if (ISSET(bp->nb_flags, NB_MULTASYNCRPC)) { + nrpcs = (length + nmwsize - 1) / nmwsize; + lck_mtx_lock(nfs_buf_mutex); + bp->nb_rpcs -= nrpcs; + if (bp->nb_rpcs == 0) { + /* No RPCs left, so the buffer's done */ + lck_mtx_unlock(nfs_buf_mutex); + nfs_buf_write_finish(bp, thd, cred); + } else { + /* wait for the last RPC to mark it done */ + while (bp->nb_rpcs > 0) + msleep(&bp->nb_rpcs, nfs_buf_mutex, 0, + "nfs_buf_write_rpc_cancel", NULL); + lck_mtx_unlock(nfs_buf_mutex); + } + } else { + nfs_buf_write_finish(bp, thd, cred); + } + /* It may have just been an interrupt... that's OK */ + if (!ISSET(bp->nb_flags, NB_ERROR)) + error = 0; + } + + return (error); +} + +/* + * finish up an NFS WRITE RPC on a buffer + */ +void +nfs_buf_write_rpc_finish(struct nfsreq *req) +{ + int error = 0, nfsvers, offset, length, multasyncrpc, finished; + int committed = NFS_WRITE_FILESYNC; + uint64_t wverf = 0; + size_t rlen; + void *wakeme = NULL; + struct nfsreq_cbinfo cb; + struct nfsreq *wreq = NULL; + struct nfsbuf *bp; + struct nfsmount *nmp; + nfsnode_t np; + thread_t thd; + kauth_cred_t cred; + uio_t auio; + char uio_buf [ UIO_SIZEOF(1) ]; + +finish: + np = req->r_np; + thd = req->r_thread; + cred = req->r_cred; + if (IS_VALID_CRED(cred)) + kauth_cred_ref(cred); + cb = req->r_callback; + bp = cb.rcb_bp; + if (cb.rcb_func) /* take an extra reference on the nfsreq in case we want to resend it later due to grace error */ + nfs_request_ref(req, 0); + + nmp = NFSTONMP(np); + if (nfs_mount_gone(nmp)) { + SET(bp->nb_flags, NB_ERROR); + bp->nb_error = error = ENXIO; + } + if (error || ISSET(bp->nb_flags, NB_ERROR)) { + /* just drop it */ + nfs_request_async_cancel(req); + goto out; + } + nfsvers = nmp->nm_vers; + + offset = cb.rcb_args[0]; + rlen = length = cb.rcb_args[1]; + + /* finish the RPC */ + error = nmp->nm_funcs->nf_write_rpc_async_finish(np, req, &committed, &rlen, &wverf); + if ((error == EINPROGRESS) && cb.rcb_func) { + /* async request restarted */ + if (cb.rcb_func) + nfs_request_rele(req); + if (IS_VALID_CRED(cred)) + kauth_cred_unref(&cred); + return; + } + if ((nmp->nm_vers >= NFS_VER4) && nfs_mount_state_error_should_restart(error) && !ISSET(bp->nb_flags, NB_ERROR)) { + lck_mtx_lock(&nmp->nm_lock); + if ((error != NFSERR_OLD_STATEID) && (error != NFSERR_GRACE) && (cb.rcb_args[2] == nmp->nm_stategenid)) { + NP(np, "nfs_buf_write_rpc_finish: error %d @ 0x%llx, 0x%x 0x%x, initiating recovery", + error, NBOFF(bp)+offset, cb.rcb_args[2], nmp->nm_stategenid); + nfs_need_recover(nmp, error); + } + lck_mtx_unlock(&nmp->nm_lock); + if (np->n_flag & NREVOKE) { + error = EIO; + } else { + if (error == NFSERR_GRACE) { + if (cb.rcb_func) { + /* + * For an async I/O request, handle a grace delay just like + * jukebox errors. Set the resend time and queue it up. 
+ */ + struct timeval now; + if (req->r_nmrep.nmc_mhead) { + mbuf_freem(req->r_nmrep.nmc_mhead); + req->r_nmrep.nmc_mhead = NULL; + } + req->r_error = 0; + microuptime(&now); + lck_mtx_lock(&req->r_mtx); + req->r_resendtime = now.tv_sec + 2; + req->r_xid = 0; // get a new XID + req->r_flags |= R_RESTART; + req->r_start = 0; + nfs_asyncio_resend(req); + lck_mtx_unlock(&req->r_mtx); + if (IS_VALID_CRED(cred)) + kauth_cred_unref(&cred); + /* Note: nfsreq reference taken will be dropped later when finished */ + return; + } + /* otherwise, just pause a couple seconds and retry */ + tsleep(&nmp->nm_state, (PZERO-1), "nfsgrace", 2*hz); + } + if (!(error = nfs_mount_state_wait_for_recovery(nmp))) { + rlen = 0; + goto writeagain; + } + } + } + if (error) { + SET(bp->nb_flags, NB_ERROR); + bp->nb_error = error; + } + if (error || (nfsvers == NFS_VER2)) + goto out; + if (rlen <= 0) { + SET(bp->nb_flags, NB_ERROR); + bp->nb_error = error = EIO; + goto out; + } + + /* save lowest commit level returned */ + if (committed < bp->nb_commitlevel) + bp->nb_commitlevel = committed; + + /* check the write verifier */ + if (!bp->nb_verf) { + bp->nb_verf = wverf; + } else if (bp->nb_verf != wverf) { + /* verifier changed, so buffer will need to be rewritten */ + bp->nb_flags |= NB_STALEWVERF; + bp->nb_commitlevel = NFS_WRITE_UNSTABLE; + bp->nb_verf = wverf; + } + + /* + * check for a short write + * + * If the server didn't write all the data, then we + * need to issue another write for the rest of it. + * (Don't bother if the buffer hit an error or stale wverf.) + */ + if (((int)rlen < length) && !(bp->nb_flags & (NB_STALEWVERF|NB_ERROR))) { +writeagain: + offset += rlen; + length -= rlen; + + auio = uio_createwithbuffer(1, NBOFF(bp) + offset, UIO_SYSSPACE, + UIO_WRITE, &uio_buf, sizeof(uio_buf)); + uio_addiov(auio, CAST_USER_ADDR_T(bp->nb_data + offset), length); + + cb.rcb_args[0] = offset; + cb.rcb_args[1] = length; + if (nmp->nm_vers >= NFS_VER4) + cb.rcb_args[2] = nmp->nm_stategenid; + + // XXX iomode should really match the original request + error = nmp->nm_funcs->nf_write_rpc_async(np, auio, length, thd, cred, + NFS_WRITE_FILESYNC, &cb, &wreq); + if (!error) { + if (IS_VALID_CRED(cred)) + kauth_cred_unref(&cred); + if (!cb.rcb_func) { + /* if !async we'll need to wait for this RPC to finish */ + req = wreq; + wreq = NULL; + goto finish; + } + nfs_request_rele(req); + /* + * We're done here. + * Outstanding RPC count is unchanged. + * Callback will be called when RPC is done. + */ + return; + } + SET(bp->nb_flags, NB_ERROR); + bp->nb_error = error; + } + +out: + if (cb.rcb_func) { + nfs_async_write_done(nmp); + nfs_request_rele(req); + } + /* + * Decrement outstanding RPC count on buffer + * and call nfs_buf_write_finish on last RPC. 
+ * + * (Note: when there are multiple async RPCs issued for a + * buffer we need nfs_buffer_mutex to avoid problems when + * aborting a partially-initiated set of RPCs) + */ + multasyncrpc = ISSET(bp->nb_flags, NB_MULTASYNCRPC); + if (multasyncrpc) + lck_mtx_lock(nfs_buf_mutex); + + bp->nb_rpcs--; + finished = (bp->nb_rpcs == 0); + + if (multasyncrpc) + lck_mtx_unlock(nfs_buf_mutex); + + if (finished) { + if (multasyncrpc) + wakeme = &bp->nb_rpcs; + nfs_buf_write_finish(bp, thd, cred); + if (wakeme) + wakeup(wakeme); + } + + if (IS_VALID_CRED(cred)) + kauth_cred_unref(&cred); +} + +/* + * Send commit(s) for the given node's "needcommit" buffers + */ +int +nfs_flushcommits(nfsnode_t np, int nowait) +{ + struct nfsmount *nmp; + struct nfsbuf *bp, *prevlbp, *lbp; + struct nfsbuflists blist, commitlist; + int error = 0, retv, wcred_set, flags, dirty; + u_quad_t off, endoff, toff; + uint64_t wverf; + u_int32_t count; + kauth_cred_t wcred = NULL; + + FSDBG_TOP(557, np, 0, 0, 0); /* - * Find a free iod to process this request. + * A nb_flags == (NB_DELWRI | NB_NEEDCOMMIT) block has been written to the + * server, but nas not been committed to stable storage on the server + * yet. The byte range is worked out for as many nfsbufs as we can handle + * and the commit rpc is done. */ - for (i = 0; i < NFS_MAXASYNCDAEMON; i++) - if (nfs_iodwant[i]) { + if (!LIST_EMPTY(&np->n_dirtyblkhd)) { + error = nfs_node_lock(np); + if (error) + goto done; + np->n_flag |= NMODIFIED; + nfs_node_unlock(np); + } + + off = (u_quad_t)-1; + endoff = 0; + wcred_set = 0; + LIST_INIT(&commitlist); + + nmp = NFSTONMP(np); + if (nfs_mount_gone(nmp)) { + error = ENXIO; + goto done; + } + if (nmp->nm_vers == NFS_VER2) { + error = EINVAL; + goto done; + } + + flags = NBI_DIRTY; + if (nowait) + flags |= NBI_NOWAIT; + lck_mtx_lock(nfs_buf_mutex); + wverf = nmp->nm_verf; + if (!nfs_buf_iterprepare(np, &blist, flags)) { + while ((bp = LIST_FIRST(&blist))) { + LIST_REMOVE(bp, nb_vnbufs); + LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs); + error = nfs_buf_acquire(bp, NBAC_NOWAIT, 0, 0); + if (error) + continue; + if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) + nfs_buf_check_write_verifier(np, bp); + if (((bp->nb_flags & (NB_DELWRI | NB_NEEDCOMMIT)) != (NB_DELWRI | NB_NEEDCOMMIT)) || + (bp->nb_verf != wverf)) { + nfs_buf_drop(bp); + continue; + } + nfs_buf_remfree(bp); + + /* buffer UPLs will be grabbed *in order* below */ + + FSDBG(557, bp, bp->nb_flags, bp->nb_valid, bp->nb_dirty); + FSDBG(557, bp->nb_validoff, bp->nb_validend, + bp->nb_dirtyoff, bp->nb_dirtyend); + /* - * Found one, so wake it up and tell it which - * mount to process. + * Work out if all buffers are using the same cred + * so we can deal with them all with one commit. + * + * Note: creds in bp's must be obtained by kauth_cred_ref + * on the same original cred in order for them to be equal. */ - NFS_DPF(ASYNCIO, - ("nfs_asyncio: waking iod %d for mount %p\n", - i, nmp)); - nfs_iodwant[i] = (struct proc *)0; - nfs_iodmount[i] = nmp; - nmp->nm_bufqiods++; - wakeup((caddr_t)&nfs_iodwant[i]); - gotiod = TRUE; - break; + if (wcred_set == 0) { + wcred = bp->nb_wcred; + if (!IS_VALID_CRED(wcred)) + panic("nfs: needcommit w/out wcred"); + wcred_set = 1; + } else if ((wcred_set == 1) && wcred != bp->nb_wcred) { + wcred_set = -1; + } + SET(bp->nb_flags, NB_WRITEINPROG); + + /* + * Add this buffer to the list of buffers we are committing. + * Buffers are inserted into the list in ascending order so that + * we can take the UPLs in order after the list is complete. 
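+ *
+ * While building the list we also accumulate [off, endoff), the union
+ * of the buffers' dirty ranges, so that when every buffer shares the
+ * same write cred a single COMMIT RPC below can cover the whole range.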
+ */ + prevlbp = NULL; + LIST_FOREACH(lbp, &commitlist, nb_vnbufs) { + if (bp->nb_lblkno < lbp->nb_lblkno) + break; + prevlbp = lbp; + } + LIST_REMOVE(bp, nb_vnbufs); + if (prevlbp) + LIST_INSERT_AFTER(prevlbp, bp, nb_vnbufs); + else + LIST_INSERT_HEAD(&commitlist, bp, nb_vnbufs); + + /* update commit range start, end */ + toff = NBOFF(bp) + bp->nb_dirtyoff; + if (toff < off) + off = toff; + toff += (u_quad_t)(bp->nb_dirtyend - bp->nb_dirtyoff); + if (toff > endoff) + endoff = toff; } + nfs_buf_itercomplete(np, &blist, NBI_DIRTY); + } + lck_mtx_unlock(nfs_buf_mutex); + + if (LIST_EMPTY(&commitlist)) { + error = ENOBUFS; + goto done; + } /* - * If none are free, we may already have an iod working on this mount - * point. If so, it will process our request. + * We need a UPL to prevent others from accessing the buffers during + * our commit RPC(s). + * + * We used to also check for dirty pages here; if there were any we'd + * abort the commit and force the entire buffer to be written again. + * Instead of doing that, we just go ahead and commit the dirty range, + * and then leave the buffer around with dirty pages that will be + * written out later. */ - if (!gotiod) { - if (nmp->nm_bufqiods > 0) { - NFS_DPF(ASYNCIO, - ("nfs_asyncio: %d iods are already processing mount %p\n", - nmp->nm_bufqiods, nmp)); - gotiod = TRUE; + LIST_FOREACH(bp, &commitlist, nb_vnbufs) { + if (!ISSET(bp->nb_flags, NB_PAGELIST)) { + retv = nfs_buf_upl_setup(bp); + if (retv) { + /* Unable to create the UPL, the VM object probably no longer exists. */ + printf("nfs_flushcommits: upl create failed %d\n", retv); + bp->nb_valid = bp->nb_dirty = 0; + } } + nfs_buf_upl_check(bp); } /* - * If we have an iod which can process the request, then queue - * the buffer. + * Commit data on the server, as required. + * If all bufs are using the same wcred, then use that with + * one call for all of them, otherwise commit each one + * separately. */ - if (gotiod) { + if (wcred_set == 1) { /* - * Ensure that the queue never grows too large. + * Note, it's possible the commit range could be >2^32-1. + * If it is, we'll send one commit that covers the whole file. */ - while (nmp->nm_bufqlen >= 2*nfs_numasync) { - NFS_DPF(ASYNCIO, - ("nfs_asyncio: waiting for mount %p queue to drain\n", nmp)); - nmp->nm_bufqwant = TRUE; - error = tsleep(&nmp->nm_bufq, slpflag | PRIBIO, - "nfsaio", slptimeo); - if (error) { - if (nfs_sigintr(nmp, NULL, bp->b_proc)) - return (EINTR); - if (slpflag == PCATCH) { - slpflag = 0; - slptimeo = 2 * hz; + if ((endoff - off) > 0xffffffff) + count = 0; + else + count = (endoff - off); + retv = nmp->nm_funcs->nf_commit_rpc(np, off, count, wcred, wverf); + } else { + retv = 0; + LIST_FOREACH(bp, &commitlist, nb_vnbufs) { + toff = NBOFF(bp) + bp->nb_dirtyoff; + count = bp->nb_dirtyend - bp->nb_dirtyoff; + retv = nmp->nm_funcs->nf_commit_rpc(np, toff, count, bp->nb_wcred, wverf); + if (retv) + break; + } + } + + /* + * Now, either mark the blocks I/O done or mark the + * blocks dirty, depending on whether the commit + * succeeded. 
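+ *
+ * In both cases NB_NEEDCOMMIT is cleared and the node's needcommit
+ * count is dropped.  On failure the buffer (still NB_DELWRI) goes back
+ * onto the dirty list to be written again later.  On success the
+ * delayed-write accounting is released and the buffer is completed; if
+ * it still has dirty pages it is requeued as a delayed write instead
+ * of being freed.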
+ */ + while ((bp = LIST_FIRST(&commitlist))) { + LIST_REMOVE(bp, nb_vnbufs); + FSDBG(557, bp, retv, bp->nb_flags, bp->nb_dirty); + nfs_node_lock_force(np); + CLR(bp->nb_flags, (NB_NEEDCOMMIT | NB_WRITEINPROG)); + np->n_needcommitcnt--; + CHECK_NEEDCOMMITCNT(np); + nfs_node_unlock(np); + + if (retv) { + /* move back to dirty list */ + lck_mtx_lock(nfs_buf_mutex); + LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs); + lck_mtx_unlock(nfs_buf_mutex); + nfs_buf_release(bp, 1); + continue; + } + + nfs_node_lock_force(np); + np->n_numoutput++; + nfs_node_unlock(np); + vnode_startwrite(NFSTOV(np)); + if (ISSET(bp->nb_flags, NB_DELWRI)) { + lck_mtx_lock(nfs_buf_mutex); + nfs_nbdwrite--; + NFSBUFCNTCHK(); + lck_mtx_unlock(nfs_buf_mutex); + wakeup(&nfs_nbdwrite); + } + CLR(bp->nb_flags, (NB_READ|NB_DONE|NB_ERROR|NB_DELWRI)); + /* if block still has dirty pages, we don't want it to */ + /* be released in nfs_buf_iodone(). So, don't set NB_ASYNC. */ + if (!(dirty = bp->nb_dirty)) + SET(bp->nb_flags, NB_ASYNC); + else + CLR(bp->nb_flags, NB_ASYNC); + + /* move to clean list */ + lck_mtx_lock(nfs_buf_mutex); + LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs); + lck_mtx_unlock(nfs_buf_mutex); + + bp->nb_dirtyoff = bp->nb_dirtyend = 0; + + nfs_buf_iodone(bp); + if (dirty) { + /* throw it back in as a delayed write buffer */ + CLR(bp->nb_flags, NB_DONE); + nfs_buf_write_delayed(bp); + } + } + +done: + FSDBG_BOT(557, np, 0, 0, error); + return (error); +} + +/* + * Flush all the blocks associated with a vnode. + * Walk through the buffer pool and push any dirty pages + * associated with the vnode. + */ +int +nfs_flush(nfsnode_t np, int waitfor, thread_t thd, int ignore_writeerr) +{ + struct nfsbuf *bp; + struct nfsbuflists blist; + struct nfsmount *nmp = NFSTONMP(np); + int error = 0, error2, slptimeo = 0, slpflag = 0; + int nfsvers, flags, passone = 1; + + FSDBG_TOP(517, np, waitfor, ignore_writeerr, 0); + + if (nfs_mount_gone(nmp)) { + error = ENXIO; + goto out; + } + nfsvers = nmp->nm_vers; + if (NMFLAG(nmp, INTR)) + slpflag = PCATCH; + + if (!LIST_EMPTY(&np->n_dirtyblkhd)) { + nfs_node_lock_force(np); + np->n_flag |= NMODIFIED; + nfs_node_unlock(np); + } + + lck_mtx_lock(nfs_buf_mutex); + while (np->n_bflag & NBFLUSHINPROG) { + np->n_bflag |= NBFLUSHWANT; + error = msleep(&np->n_bflag, nfs_buf_mutex, slpflag, "nfs_flush", NULL); + if ((error && (error != EWOULDBLOCK)) || + ((error = nfs_sigintr(NFSTONMP(np), NULL, thd, 0)))) { + lck_mtx_unlock(nfs_buf_mutex); + goto out; + } + } + np->n_bflag |= NBFLUSHINPROG; + + /* + * On the first pass, start async/unstable writes on all + * delayed write buffers. Then wait for all writes to complete + * and call nfs_flushcommits() to commit any uncommitted buffers. + * On all subsequent passes, start STABLE writes on any remaining + * dirty buffers. Then wait for all writes to complete. + */ +again: + FSDBG(518, LIST_FIRST(&np->n_dirtyblkhd), np->n_flag, 0, 0); + if (!NFSTONMP(np)) { + lck_mtx_unlock(nfs_buf_mutex); + error = ENXIO; + goto done; + } + + /* Start/do any write(s) that are required. */ + if (!nfs_buf_iterprepare(np, &blist, NBI_DIRTY)) { + while ((bp = LIST_FIRST(&blist))) { + LIST_REMOVE(bp, nb_vnbufs); + LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs); + flags = (passone || !(waitfor == MNT_WAIT || waitfor == MNT_DWAIT)) ? 
NBAC_NOWAIT : 0; + if (flags != NBAC_NOWAIT) + nfs_buf_refget(bp); + while ((error = nfs_buf_acquire(bp, flags, slpflag, slptimeo))) { + FSDBG(524, bp, flags, bp->nb_lflags, bp->nb_flags); + if (error == EBUSY) + break; + if (error) { + error2 = nfs_sigintr(NFSTONMP(np), NULL, thd, 0); + if (error2) { + if (flags != NBAC_NOWAIT) + nfs_buf_refrele(bp); + nfs_buf_itercomplete(np, &blist, NBI_DIRTY); + lck_mtx_unlock(nfs_buf_mutex); + error = error2; + goto done; + } + if (slpflag == PCATCH) { + slpflag = 0; + slptimeo = 2 * hz; + } } } - /* - * We might have lost our iod while sleeping, - * so check and loop if nescessary. - */ - if (nmp->nm_bufqiods == 0) { - NFS_DPF(ASYNCIO, - ("nfs_asyncio: no iods after mount %p queue was drained, looping\n", nmp)); - goto again; + if (flags != NBAC_NOWAIT) + nfs_buf_refrele(bp); + if (error == EBUSY) + continue; + if (!bp->nb_np) { + /* buffer is no longer valid */ + nfs_buf_drop(bp); + continue; } + if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) + nfs_buf_check_write_verifier(np, bp); + if (!ISSET(bp->nb_flags, NB_DELWRI)) { + /* buffer is no longer dirty */ + nfs_buf_drop(bp); + continue; + } + FSDBG(525, bp, passone, bp->nb_lflags, bp->nb_flags); + if ((passone || !(waitfor == MNT_WAIT || waitfor == MNT_DWAIT)) && + ISSET(bp->nb_flags, NB_NEEDCOMMIT)) { + nfs_buf_drop(bp); + continue; + } + nfs_buf_remfree(bp); + lck_mtx_unlock(nfs_buf_mutex); + if (ISSET(bp->nb_flags, NB_ERROR)) { + nfs_node_lock_force(np); + np->n_error = bp->nb_error ? bp->nb_error : EIO; + np->n_flag |= NWRITEERR; + nfs_node_unlock(np); + nfs_buf_release(bp, 1); + lck_mtx_lock(nfs_buf_mutex); + continue; + } + SET(bp->nb_flags, NB_ASYNC); + if (!passone) { + /* NB_STABLE forces this to be written FILESYNC */ + SET(bp->nb_flags, NB_STABLE); + } + nfs_buf_write(bp); + lck_mtx_lock(nfs_buf_mutex); } + nfs_buf_itercomplete(np, &blist, NBI_DIRTY); + } + lck_mtx_unlock(nfs_buf_mutex); - if (ISSET(bp->b_flags, B_READ)) { - if (bp->b_rcred == NOCRED && cred != NOCRED) { - /* - * NFS has embedded ucred. - * Can not crhold() here as that causes zone corruption - */ - bp->b_rcred = crdup(cred); + if (waitfor == MNT_WAIT || waitfor == MNT_DWAIT) { + while ((error = vnode_waitforwrites(NFSTOV(np), 0, slpflag, slptimeo, "nfsflush"))) { + error2 = nfs_sigintr(NFSTONMP(np), NULL, thd, 0); + if (error2) { + error = error2; + goto done; } - } else { - SET(bp->b_flags, B_WRITEINPROG); - if (bp->b_wcred == NOCRED && cred != NOCRED) { - /* - * NFS has embedded ucred. - * Can not crhold() here as that causes zone corruption - */ - bp->b_wcred = crdup(cred); + if (slpflag == PCATCH) { + slpflag = 0; + slptimeo = 2 * hz; } } + } - TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, b_freelist); - nmp->nm_bufqlen++; - return (0); + if (nfsvers != NFS_VER2) { + /* loop while it looks like there are still buffers to be */ + /* commited and nfs_flushcommits() seems to be handling them. */ + while (np->n_needcommitcnt) + if (nfs_flushcommits(np, 0)) + break; } - /* - * All the iods are busy on other mounts, so return EIO to - * force the caller to process the i/o synchronously. 
- */ - NFS_DPF(ASYNCIO, ("nfs_asyncio: no iods available, i/o is synchronous\n")); - return (EIO); + if (passone) { + passone = 0; + if (!LIST_EMPTY(&np->n_dirtyblkhd)) { + nfs_node_lock_force(np); + np->n_flag |= NMODIFIED; + nfs_node_unlock(np); + } + lck_mtx_lock(nfs_buf_mutex); + goto again; + } + + if (waitfor == MNT_WAIT || waitfor == MNT_DWAIT) { + if (!LIST_EMPTY(&np->n_dirtyblkhd)) { + nfs_node_lock_force(np); + np->n_flag |= NMODIFIED; + nfs_node_unlock(np); + } + lck_mtx_lock(nfs_buf_mutex); + if (!LIST_EMPTY(&np->n_dirtyblkhd)) + goto again; + lck_mtx_unlock(nfs_buf_mutex); + nfs_node_lock_force(np); + /* + * OK, it looks like there are no dirty blocks. If we have no + * writes in flight and no one in the write code, we can clear + * the modified flag. In order to make sure we see the latest + * attributes and size, we also invalidate the attributes and + * advance the attribute cache XID to guarantee that attributes + * newer than our clearing of NMODIFIED will get loaded next. + * (If we don't do this, it's possible for the flush's final + * write/commit (xid1) to be executed in parallel with a subsequent + * getattr request (xid2). The getattr could return attributes + * from *before* the write/commit completed but the stale attributes + * would be preferred because of the xid ordering.) + */ + if (!np->n_wrbusy && !np->n_numoutput) { + np->n_flag &= ~NMODIFIED; + NATTRINVALIDATE(np); + nfs_get_xid(&np->n_xid); + } + } else { + nfs_node_lock_force(np); + } + + FSDBG(526, np->n_flag, np->n_error, 0, 0); + if (!ignore_writeerr && (np->n_flag & NWRITEERR)) { + error = np->n_error; + np->n_flag &= ~NWRITEERR; + } + nfs_node_unlock(np); +done: + lck_mtx_lock(nfs_buf_mutex); + flags = np->n_bflag; + np->n_bflag &= ~(NBFLUSHINPROG|NBFLUSHWANT); + lck_mtx_unlock(nfs_buf_mutex); + if (flags & NBFLUSHWANT) + wakeup(&np->n_bflag); +out: + FSDBG_BOT(517, np, error, ignore_writeerr, 0); + return (error); } /* - * Do an I/O operation to/from a cache block. This may be called - * synchronously or from an nfsiod. + * Flush out and invalidate all buffers associated with a vnode. + * Called with the underlying object locked. */ int -nfs_doio(bp, cr, p) - register struct buf *bp; - struct ucred *cr; - struct proc *p; +nfs_vinvalbuf_internal( + nfsnode_t np, + int flags, + thread_t thd, + kauth_cred_t cred, + int slpflag, + int slptimeo) { - register struct uio *uiop; - register struct vnode *vp; - struct nfsnode *np; - struct nfsmount *nmp; - int error = 0, diff, len, iomode, must_commit = 0; - struct uio uio; - struct iovec io; - - vp = bp->b_vp; - np = VTONFS(vp); - nmp = VFSTONFS(vp->v_mount); - uiop = &uio; - uiop->uio_iov = &io; - uiop->uio_iovcnt = 1; - uiop->uio_segflg = UIO_SYSSPACE; - uiop->uio_procp = p; - - /* - * With UBC, getblk() can return a buf with B_DONE set. - * This indicates that the VM has valid data for that page. - * NFS being stateless, this case poses a problem. - * By definition, the NFS server should always be consulted - * for the data in that page. - * So we choose to clear the B_DONE and to do the IO. - * - * XXX revisit this if there is a performance issue. - * XXX In that case, we could play the attribute cache games ... 
- */ - if (ISSET(bp->b_flags, B_DONE)) { - if (!ISSET(bp->b_flags, B_ASYNC)) - panic("nfs_doio: done and not async"); - CLR(bp->b_flags, B_DONE); - } - FSDBG_TOP(256, np->n_size, bp->b_blkno * DEV_BSIZE, bp->b_bcount, - bp->b_flags); - FSDBG(257, bp->b_validoff, bp->b_validend, bp->b_dirtyoff, - bp->b_dirtyend); + struct nfsbuf *bp; + struct nfsbuflists blist; + int list, error = 0; + + if (flags & V_SAVE) { + if ((error = nfs_flush(np, MNT_WAIT, thd, (flags & V_IGNORE_WRITEERR)))) + return (error); + } + + lck_mtx_lock(nfs_buf_mutex); + for (;;) { + list = NBI_CLEAN; + if (nfs_buf_iterprepare(np, &blist, list)) { + list = NBI_DIRTY; + if (nfs_buf_iterprepare(np, &blist, list)) + break; + } + while ((bp = LIST_FIRST(&blist))) { + LIST_REMOVE(bp, nb_vnbufs); + if (list == NBI_CLEAN) + LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs); + else + LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs); + nfs_buf_refget(bp); + while ((error = nfs_buf_acquire(bp, NBAC_REMOVE, slpflag, slptimeo))) { + FSDBG(556, np, bp, NBOFF(bp), bp->nb_flags); + if (error != EAGAIN) { + FSDBG(554, np, bp, -1, error); + nfs_buf_refrele(bp); + nfs_buf_itercomplete(np, &blist, list); + lck_mtx_unlock(nfs_buf_mutex); + return (error); + } + } + nfs_buf_refrele(bp); + FSDBG(554, np, bp, NBOFF(bp), bp->nb_flags); + lck_mtx_unlock(nfs_buf_mutex); + if ((flags & V_SAVE) && UBCINFOEXISTS(NFSTOV(np)) && bp->nb_np && + (NBOFF(bp) < (off_t)np->n_size)) { + /* extra paranoia: make sure we're not */ + /* somehow leaving any dirty data around */ + int mustwrite = 0; + int end = (NBOFF(bp) + bp->nb_bufsize > (off_t)np->n_size) ? + ((off_t)np->n_size - NBOFF(bp)) : bp->nb_bufsize; + if (!ISSET(bp->nb_flags, NB_PAGELIST)) { + error = nfs_buf_upl_setup(bp); + if (error == EINVAL) { + /* vm object must no longer exist */ + /* hopefully we don't need to do */ + /* anything for this buffer */ + } else if (error) + printf("nfs_vinvalbuf: upl setup failed %d\n", error); + bp->nb_valid = bp->nb_dirty = 0; + } + nfs_buf_upl_check(bp); + /* check for any dirty data before the EOF */ + if ((bp->nb_dirtyend > 0) && (bp->nb_dirtyoff < end)) { + /* clip dirty range to EOF */ + if (bp->nb_dirtyend > end) { + bp->nb_dirtyend = end; + if (bp->nb_dirtyoff >= bp->nb_dirtyend) + bp->nb_dirtyoff = bp->nb_dirtyend = 0; + } + if ((bp->nb_dirtyend > 0) && (bp->nb_dirtyoff < end)) + mustwrite++; + } + bp->nb_dirty &= (1 << (round_page_32(end)/PAGE_SIZE)) - 1; + if (bp->nb_dirty) + mustwrite++; + /* also make sure we'll have a credential to do the write */ + if (mustwrite && !IS_VALID_CRED(bp->nb_wcred) && !IS_VALID_CRED(cred)) { + printf("nfs_vinvalbuf: found dirty buffer with no write creds\n"); + mustwrite = 0; + } + if (mustwrite) { + FSDBG(554, np, bp, 0xd00dee, bp->nb_flags); + if (!ISSET(bp->nb_flags, NB_PAGELIST)) + panic("nfs_vinvalbuf: dirty buffer without upl"); + /* gotta write out dirty data before invalidating */ + /* (NB_STABLE indicates that data writes should be FILESYNC) */ + /* (NB_NOCACHE indicates buffer should be discarded) */ + CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL | NB_ASYNC)); + SET(bp->nb_flags, NB_STABLE | NB_NOCACHE); + if (!IS_VALID_CRED(bp->nb_wcred)) { + kauth_cred_ref(cred); + bp->nb_wcred = cred; + } + error = nfs_buf_write(bp); + // Note: bp has been released + if (error) { + FSDBG(554, bp, 0xd00dee, 0xbad, error); + nfs_node_lock_force(np); + if ((error != EINTR) && (error != ERESTART)) { + np->n_error = error; + np->n_flag |= NWRITEERR; + } + /* + * There was a write error and we need to + * invalidate attrs to 
sync with server. + * (if this write was extending the file, + * we may no longer know the correct size) + */ + NATTRINVALIDATE(np); + nfs_node_unlock(np); + if ((error == EINTR) || (error == ERESTART)) { + /* + * Abort on EINTR. If we don't, we could + * be stuck in this loop forever because + * the buffer will continue to stay dirty. + */ + lck_mtx_lock(nfs_buf_mutex); + nfs_buf_itercomplete(np, &blist, list); + lck_mtx_unlock(nfs_buf_mutex); + return (error); + } + error = 0; + } + lck_mtx_lock(nfs_buf_mutex); + continue; + } + } + SET(bp->nb_flags, NB_INVAL); + // hold off on FREEUPs until we're done here + nfs_buf_release(bp, 0); + lck_mtx_lock(nfs_buf_mutex); + } + nfs_buf_itercomplete(np, &blist, list); + } + if (!LIST_EMPTY(&(np)->n_dirtyblkhd) || !LIST_EMPTY(&(np)->n_cleanblkhd)) + panic("nfs_vinvalbuf: flush/inval failed"); + lck_mtx_unlock(nfs_buf_mutex); + nfs_node_lock_force(np); + if (!(flags & V_SAVE)) + np->n_flag &= ~NMODIFIED; + if (vnode_vtype(NFSTOV(np)) == VREG) + np->n_lastrahead = -1; + nfs_node_unlock(np); + NFS_BUF_FREEUP(); + return (0); +} + + +/* + * Flush and invalidate all dirty buffers. If another process is already + * doing the flush, just wait for completion. + */ +int +nfs_vinvalbuf(vnode_t vp, int flags, vfs_context_t ctx, int intrflg) +{ + return nfs_vinvalbuf2(vp, flags, vfs_context_thread(ctx), vfs_context_ucred(ctx), intrflg); +} + +int +nfs_vinvalbuf2(vnode_t vp, int flags, thread_t thd, kauth_cred_t cred, int intrflg) +{ + nfsnode_t np = VTONFS(vp); + struct nfsmount *nmp = VTONMP(vp); + int error, slpflag, slptimeo, nflags, retry = 0; + int ubcflags = UBC_PUSHALL | UBC_SYNC | UBC_INVALIDATE; + struct timespec ts = { 2, 0 }; + off_t size; + + FSDBG_TOP(554, np, flags, intrflg, 0); + /* - * Historically, paging was done with physio, but no more. + * If the mount is gone no sense to try and write anything. + * and hang trying to do IO. */ - if (ISSET(bp->b_flags, B_PHYS)) { - /* - * ...though reading /dev/drum still gets us here. - */ - io.iov_len = uiop->uio_resid = bp->b_bcount; - /* mapping was done by vmapbuf() */ - io.iov_base = bp->b_data; - uiop->uio_offset = (off_t)bp->b_blkno * DEV_BSIZE; - if (ISSET(bp->b_flags, B_READ)) { - uiop->uio_rw = UIO_READ; - nfsstats.read_physios++; - error = nfs_readrpc(vp, uiop, cr); - } else { - int com; - - iomode = NFSV3WRITE_DATASYNC; - uiop->uio_rw = UIO_WRITE; - nfsstats.write_physios++; - error = nfs_writerpc(vp, uiop, cr, &iomode, &com); - } - if (error) { - SET(bp->b_flags, B_ERROR); - bp->b_error = error; - } - } else if (ISSET(bp->b_flags, B_READ)) { - io.iov_len = uiop->uio_resid = bp->b_bcount; - io.iov_base = bp->b_data; - uiop->uio_rw = UIO_READ; - switch (vp->v_type) { - case VREG: - uiop->uio_offset = (off_t)bp->b_blkno * DEV_BSIZE; - nfsstats.read_bios++; - error = nfs_readrpc(vp, uiop, cr); - FSDBG(262, np->n_size, bp->b_blkno * DEV_BSIZE, - uiop->uio_resid, error); - if (!error) { - bp->b_validoff = 0; - if (uiop->uio_resid) { - /* - * If len > 0, there is a hole in the file and - * no writes after the hole have been pushed to - * the server yet. - * Just zero fill the rest of the valid area. 
- */ - diff = bp->b_bcount - uiop->uio_resid; - len = np->n_size - ((u_quad_t)bp->b_blkno * DEV_BSIZE + - diff); - if (len > 0) { - len = min(len, uiop->uio_resid); - bzero((char *)bp->b_data + diff, len); - bp->b_validend = diff + len; - FSDBG(258, diff, len, 0, 1); - } else - bp->b_validend = diff; - } else - bp->b_validend = bp->b_bcount; - - if (bp->b_validend < bp->b_bufsize) { - /* - * we're about to release a partial buffer after a - * read... the only way we should get here is if - * this buffer contains the EOF before releasing it, - * we'll zero out to the end of the buffer so that - * if a mmap of this page occurs, we'll see zero's - * even if a ftruncate extends the file in the - * meantime - */ - bzero((caddr_t)(bp->b_data + bp->b_validend), - bp->b_bufsize - bp->b_validend); - FSDBG(258, bp->b_validend, - bp->b_bufsize - bp->b_validend, 0, 2); - } - } - if (p && (vp->v_flag & VTEXT) && - (((nmp->nm_flag & NFSMNT_NQNFS) && - NQNFS_CKINVALID(vp, np, ND_READ) && - np->n_lrev != np->n_brev) || - (!(nmp->nm_flag & NFSMNT_NQNFS) && - np->n_mtime != np->n_vattr.va_mtime.tv_sec))) { - uprintf("Process killed due to text file modification\n"); - psignal(p, SIGKILL); - p->p_flag |= P_NOSWAP; - } - break; - case VLNK: - uiop->uio_offset = (off_t)0; - nfsstats.readlink_bios++; - error = nfs_readlinkrpc(vp, uiop, cr); - break; - case VDIR: - nfsstats.readdir_bios++; - uiop->uio_offset = ((u_quad_t)bp->b_lblkno) * NFS_DIRBLKSIZ; - if (!(nmp->nm_flag & NFSMNT_NFSV3)) - nmp->nm_flag &= ~NFSMNT_RDIRPLUS; /* dk@farm.org */ - if (nmp->nm_flag & NFSMNT_RDIRPLUS) { - error = nfs_readdirplusrpc(vp, uiop, cr); - if (error == NFSERR_NOTSUPP) - nmp->nm_flag &= ~NFSMNT_RDIRPLUS; - } - if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0) - error = nfs_readdirrpc(vp, uiop, cr); - break; - default: - printf("nfs_doio: type %x unexpected\n", vp->v_type); - break; - }; - if (error) { - SET(bp->b_flags, B_ERROR); - bp->b_error = error; - } + if (nfs_mount_gone(nmp)) { + flags &= ~V_SAVE; + ubcflags &= ~UBC_PUSHALL; + } + + if (nmp && !NMFLAG(nmp, INTR)) + intrflg = 0; + if (intrflg) { + slpflag = PCATCH; + slptimeo = 2 * hz; } else { - /* - * mapped I/O may have altered any bytes, so we extend - * the dirty zone to the valid zone. For best performance - * a better solution would be to save & restore page dirty bits - * around the uiomove which brings write-data into the buffer. - * Then here we'd check if the page is dirty rather than WASMAPPED - * Also vnode_pager would change - if a page is clean it might - * still need to be written due to DELWRI. 
- */ - if (UBCINFOEXISTS(vp) && ubc_issetflags(vp, UI_WASMAPPED)) { - bp->b_dirtyoff = min(bp->b_dirtyoff, bp->b_validoff); - bp->b_dirtyend = max(bp->b_dirtyend, bp->b_validend); - } - if ((off_t)bp->b_blkno * DEV_BSIZE + bp->b_dirtyend > np->n_size) - bp->b_dirtyend = np->n_size - (off_t)bp->b_blkno * DEV_BSIZE; - - if (bp->b_dirtyend > bp->b_dirtyoff) { - io.iov_len = uiop->uio_resid = bp->b_dirtyend - bp->b_dirtyoff; - uiop->uio_offset = (off_t)bp->b_blkno * DEV_BSIZE + - bp->b_dirtyoff; - io.iov_base = (char *)bp->b_data + bp->b_dirtyoff; - uiop->uio_rw = UIO_WRITE; - - nfsstats.write_bios++; - if ((bp->b_flags & (B_ASYNC | B_NEEDCOMMIT | B_NOCACHE)) == - B_ASYNC) - iomode = NFSV3WRITE_UNSTABLE; - else - iomode = NFSV3WRITE_FILESYNC; - SET(bp->b_flags, B_WRITEINPROG); - error = nfs_writerpc(vp, uiop, cr, &iomode, &must_commit); - if (!error && iomode == NFSV3WRITE_UNSTABLE) - SET(bp->b_flags, B_NEEDCOMMIT); - else - CLR(bp->b_flags, B_NEEDCOMMIT); - CLR(bp->b_flags, B_WRITEINPROG); - /* - * For an interrupted write, the buffer is still valid - * and the write hasn't been pushed to the server yet, - * so we can't set B_ERROR and report the interruption - * by setting B_EINTR. For the B_ASYNC case, B_EINTR - * is not relevant, so the rpc attempt is essentially - * a noop. For the case of a V3 write rpc not being - * committed to stable storage, the block is still - * dirty and requires either a commit rpc or another - * write rpc with iomode == NFSV3WRITE_FILESYNC before - * the block is reused. This is indicated by setting - * the B_DELWRI and B_NEEDCOMMIT flags. - */ - if (error == EINTR || (!error && bp->b_flags & B_NEEDCOMMIT)) { - int s; + slpflag = 0; + slptimeo = 0; + } + + /* First wait for any other process doing a flush to complete. */ + lck_mtx_lock(nfs_buf_mutex); + while (np->n_bflag & NBINVALINPROG) { + np->n_bflag |= NBINVALWANT; + msleep(&np->n_bflag, nfs_buf_mutex, slpflag, "nfs_vinvalbuf", &ts); + if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0))) { + lck_mtx_unlock(nfs_buf_mutex); + return (error); + } + if (np->n_bflag & NBINVALINPROG) + slpflag = 0; + } + np->n_bflag |= NBINVALINPROG; + lck_mtx_unlock(nfs_buf_mutex); + + /* Now, flush as required. */ +again: + error = nfs_vinvalbuf_internal(np, flags, thd, cred, slpflag, 0); + while (error) { + FSDBG(554, np, 0, 0, error); + if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0))) + goto done; + error = nfs_vinvalbuf_internal(np, flags, thd, cred, 0, slptimeo); + } - CLR(bp->b_flags, B_INVAL | B_NOCACHE); - if (!ISSET(bp->b_flags, B_DELWRI)) { - SET(bp->b_flags, B_DELWRI); - nbdwrite++; + /* get the pages out of vm also */ + if (UBCINFOEXISTS(vp) && (size = ubc_getsize(vp))) + if ((error = ubc_msync(vp, 0, size, NULL, ubcflags))) { + if (error == EINVAL) + panic("nfs_vinvalbuf(): ubc_msync failed!, error %d", error); + if (retry++ < 10) { /* retry invalidating a few times */ + if (retry > 1 || error == ENXIO) + ubcflags &= ~UBC_PUSHALL; + goto again; } - FSDBG(261, bp->b_validoff, bp->b_validend, - bp->b_bufsize, bp->b_bcount); - /* - * Since for the B_ASYNC case, nfs_bwrite() has - * reassigned the buffer to the clean list, we have to - * reassign it back to the dirty one. Ugh. 
- */ - if (ISSET(bp->b_flags, B_ASYNC)) { - s = splbio(); - reassignbuf(bp, vp); - splx(s); - } else { - SET(bp->b_flags, B_EINTR); + /* give up */ + printf("nfs_vinvalbuf(): ubc_msync failed!, error %d\n", error); + } +done: + lck_mtx_lock(nfs_buf_mutex); + nflags = np->n_bflag; + np->n_bflag &= ~(NBINVALINPROG|NBINVALWANT); + lck_mtx_unlock(nfs_buf_mutex); + if (nflags & NBINVALWANT) + wakeup(&np->n_bflag); + + FSDBG_BOT(554, np, flags, intrflg, error); + return (error); +} + +/* + * Wait for any busy buffers to complete. + */ +void +nfs_wait_bufs(nfsnode_t np) +{ + struct nfsbuf *bp; + struct nfsbuflists blist; + int error = 0; + + lck_mtx_lock(nfs_buf_mutex); + if (!nfs_buf_iterprepare(np, &blist, NBI_CLEAN)) { + while ((bp = LIST_FIRST(&blist))) { + LIST_REMOVE(bp, nb_vnbufs); + LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs); + nfs_buf_refget(bp); + while ((error = nfs_buf_acquire(bp, 0, 0, 0))) { + if (error != EAGAIN) { + nfs_buf_refrele(bp); + nfs_buf_itercomplete(np, &blist, NBI_CLEAN); + lck_mtx_unlock(nfs_buf_mutex); + return; + } } - } else { - if (error) { - SET(bp->b_flags, B_ERROR); - bp->b_error = np->n_error = error; - np->n_flag |= NWRITEERR; + nfs_buf_refrele(bp); + nfs_buf_drop(bp); + } + nfs_buf_itercomplete(np, &blist, NBI_CLEAN); + } + if (!nfs_buf_iterprepare(np, &blist, NBI_DIRTY)) { + while ((bp = LIST_FIRST(&blist))) { + LIST_REMOVE(bp, nb_vnbufs); + LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs); + nfs_buf_refget(bp); + while ((error = nfs_buf_acquire(bp, 0, 0, 0))) { + if (error != EAGAIN) { + nfs_buf_refrele(bp); + nfs_buf_itercomplete(np, &blist, NBI_DIRTY); + lck_mtx_unlock(nfs_buf_mutex); + return; + } } - bp->b_dirtyoff = bp->b_dirtyend = 0; + nfs_buf_refrele(bp); + nfs_buf_drop(bp); + } + nfs_buf_itercomplete(np, &blist, NBI_DIRTY); + } + lck_mtx_unlock(nfs_buf_mutex); +} + + +/* + * Add an async I/O request to the mount's async I/O queue and make + * sure that an nfsiod will service it. + */ +void +nfs_asyncio_finish(struct nfsreq *req) +{ + struct nfsmount *nmp; + struct nfsiod *niod; + int started = 0; + + FSDBG_TOP(552, nmp, 0, 0, 0); +again: + nmp = req->r_nmp; + + if (nmp == NULL) + return; + lck_mtx_lock(nfsiod_mutex); + niod = nmp->nm_niod; + + /* grab an nfsiod if we don't have one already */ + if (!niod) { + niod = TAILQ_FIRST(&nfsiodfree); + if (niod) { + TAILQ_REMOVE(&nfsiodfree, niod, niod_link); + TAILQ_INSERT_TAIL(&nfsiodwork, niod, niod_link); + niod->niod_nmp = nmp; + } else if (((nfsiod_thread_count < NFSIOD_MAX) || (nfsiod_thread_count <= 0)) && (started < 4)) { /* - * validoff and validend represent the real data present - * in this buffer if validoff is non-zero, than we have - * to invalidate the buffer and kill the page when - * biodone is called... the same is also true when - * validend doesn't extend all the way to the end of the - * buffer and validend doesn't equate to the current - * EOF... eventually we need to deal with this in a more - * humane way (like keeping the partial buffer without - * making it immediately available to the VM page cache) + * Try starting a new thread. + * We may try a couple times if other callers + * get the new threads before we do. 
*/ - if (bp->b_validoff) - SET(bp->b_flags, B_INVAL); - else - if (bp->b_validend < bp->b_bufsize) { - if ((off_t)bp->b_blkno * DEV_BSIZE + - bp->b_validend == np->n_size) { - bzero((caddr_t)(bp->b_data + - bp->b_validend), - bp->b_bufsize - bp->b_validend); - FSDBG(259, bp->b_validend, - bp->b_bufsize - bp->b_validend, 0, - 0); - } else - SET(bp->b_flags, B_INVAL); - } + lck_mtx_unlock(nfsiod_mutex); + started++; + if (!nfsiod_start()) + goto again; + lck_mtx_lock(nfsiod_mutex); } + } - } else { - if (bp->b_validoff || - (bp->b_validend < bp->b_bufsize && - (off_t)bp->b_blkno * DEV_BSIZE + bp->b_validend != - np->n_size)) { - SET(bp->b_flags, B_INVAL); + /* + * If we got here while being on the resendq we need to get off. This + * happens when the timer fires and errors out requests from nfs_sigintr + * or we receive a reply (UDP case) while being on the resend queue so + * we're just finishing up and are not going to be resent. + */ + lck_mtx_lock(&req->r_mtx); + if (req->r_flags & R_RESENDQ) { + lck_mtx_lock(&nmp->nm_lock); + if (req->r_rchain.tqe_next != NFSREQNOLIST) { + NFS_BIO_DBG("Proccessing async request on resendq. Removing"); + TAILQ_REMOVE(&nmp->nm_resendq, req, r_rchain); + req->r_rchain.tqe_next = NFSREQNOLIST; + assert(req->r_refs > 1); + /* Remove resendq reference */ + req->r_refs--; } - if (bp->b_flags & B_INVAL) { - FSDBG(260, bp->b_validoff, bp->b_validend, - bp->b_bufsize, bp->b_bcount); + lck_mtx_unlock(&nmp->nm_lock); + req->r_flags &= ~R_RESENDQ; + } + lck_mtx_unlock(&req->r_mtx); + + if (req->r_achain.tqe_next == NFSREQNOLIST) + TAILQ_INSERT_TAIL(&nmp->nm_iodq, req, r_achain); + + /* If this mount doesn't already have an nfsiod working on it... */ + if (!nmp->nm_niod) { + if (niod) { /* give it the nfsiod we just grabbed */ + nmp->nm_niod = niod; + lck_mtx_unlock(nfsiod_mutex); + wakeup(niod); + } else if (nfsiod_thread_count > 0) { + /* just queue it up on nfsiod mounts queue if needed */ + if (nmp->nm_iodlink.tqe_next == NFSNOLIST) + TAILQ_INSERT_TAIL(&nfsiodmounts, nmp, nm_iodlink); + lck_mtx_unlock(nfsiod_mutex); + } else { + printf("nfs_asyncio(): no nfsiods? %d %d (%d)\n", nfsiod_thread_count, NFSIOD_MAX, started); + lck_mtx_unlock(nfsiod_mutex); + /* we have no other option but to be persistent */ + started = 0; + goto again; } - bp->b_resid = 0; - biodone(bp); - FSDBG_BOT(256, bp->b_validoff, bp->b_validend, bp->b_bufsize, - np->n_size); - return (0); - } + } else { + lck_mtx_unlock(nfsiod_mutex); } - bp->b_resid = uiop->uio_resid; - if (must_commit) - nfs_clearcommit(vp->v_mount); - if (bp->b_flags & B_INVAL) { - FSDBG(260, bp->b_validoff, bp->b_validend, bp->b_bufsize, - bp->b_bcount); + FSDBG_BOT(552, nmp, 0, 0, 0); +} + +/* + * queue up async I/O request for resend + */ +void +nfs_asyncio_resend(struct nfsreq *req) +{ + struct nfsmount *nmp = req->r_nmp; + + if (nfs_mount_gone(nmp)) + return; + + nfs_gss_clnt_rpcdone(req); + lck_mtx_lock(&nmp->nm_lock); + if (!(req->r_flags & R_RESENDQ)) { + TAILQ_INSERT_TAIL(&nmp->nm_resendq, req, r_rchain); + req->r_flags |= R_RESENDQ; + /* + * We take a reference on this request so that it can't be + * destroyed while a resend is queued or in progress. + */ + nfs_request_ref(req, 1); } - FSDBG_BOT(256, bp->b_validoff, bp->b_validend, bp->b_bcount, error); + nfs_mount_sock_thread_wake(nmp); + lck_mtx_unlock(&nmp->nm_lock); +} + +/* + * Read directory data into a buffer. + * + * Buffer will be filled (unless EOF is hit). + * Buffers after this one may also be completely/partially filled. 
+ */ +int +nfs_buf_readdir(struct nfsbuf *bp, vfs_context_t ctx) +{ + nfsnode_t np = bp->nb_np; + struct nfsmount *nmp = NFSTONMP(np); + int error = 0; + + if (nfs_mount_gone(nmp)) + return (ENXIO); - biodone(bp); + if (nmp->nm_vers < NFS_VER4) + error = nfs3_readdir_rpc(np, bp, ctx); + else + error = nfs4_readdir_rpc(np, bp, ctx); + + if (error && (error != NFSERR_DIRBUFDROPPED)) { + SET(bp->nb_flags, NB_ERROR); + bp->nb_error = error; + } return (error); }
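
Note (illustration only, not part of the diff): the two-pass scheme described in the nfs_flush() comments above — a first pass that issues async UNSTABLE writes for delayed-write buffers and then lets nfs_flushcommits() commit them, followed by later passes that force NB_STABLE/FILESYNC writes on whatever is still dirty — can be hard to see through the locking and error handling. Below is a minimal, hypothetical user-space sketch of just that control flow. None of these names exist in xnu, and the waiting, signal checking, and buffer locking of the real function are deliberately omitted.

/*
 * Toy model of nfs_flush()'s two-pass strategy.  Hypothetical names;
 * for illustration only.
 */
#include <stdbool.h>
#include <stdio.h>

struct toybuf {
	bool dirty;        /* delayed-write data not yet sent to the server */
	bool needcommit;   /* written UNSTABLE, still needs a COMMIT RPC */
};

static void write_unstable(struct toybuf *bp) { bp->dirty = false; bp->needcommit = true; }
static void write_filesync(struct toybuf *bp) { bp->dirty = false; bp->needcommit = false; }
static void commit_rpc(struct toybuf *bp)     { bp->needcommit = false; }

static void
toy_flush(struct toybuf *bufs, int nbufs)
{
	int passone = 1;
again:
	for (int i = 0; i < nbufs; i++) {
		if (!bufs[i].dirty)
			continue;
		if (passone)
			write_unstable(&bufs[i]);   /* cheap async write; server may only cache it */
		else
			write_filesync(&bufs[i]);   /* later passes force stable storage (NB_STABLE) */
	}
	/* ...the real code waits for the writes to drain here... */
	for (int i = 0; i < nbufs; i++)
		if (bufs[i].needcommit)
			commit_rpc(&bufs[i]);       /* analogous to nfs_flushcommits() */
	if (passone) {
		passone = 0;
		goto again;                     /* catch buffers that became dirty meanwhile */
	}
}

int
main(void)
{
	struct toybuf bufs[3] = { { true, false }, { false, false }, { true, false } };
	toy_flush(bufs, 3);
	for (int i = 0; i < 3; i++)
		printf("buf %d: dirty=%d needcommit=%d\n", i, bufs[i].dirty, bufs[i].needcommit);
	return 0;
}

After toy_flush() returns, every buffer is clean and committed, which is the state the real nfs_flush() requires before it will clear NMODIFIED (and only then, per the comment above, when no writes are in flight and no one is in the write path).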