X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/43866e378188c25dd1e2208016ab3cbeb086ae6c..55e303ae13a4cf49d70f2294092726f2fffb9ef2:/bsd/nfs/nfs_bio.c diff --git a/bsd/nfs/nfs_bio.c b/bsd/nfs/nfs_bio.c index 7f41efe13..1b6b078c5 100644 --- a/bsd/nfs/nfs_bio.c +++ b/bsd/nfs/nfs_bio.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -66,8 +66,9 @@ #include #include #include -#include +#include #include +#include #include #include #include @@ -98,12 +99,863 @@ KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_END, \ (int)(B), (int)(C), (int)(D), (int)(E), 0) -static struct buf *nfs_getcacheblk __P((struct vnode *vp, daddr_t bn, int size, - struct proc *p, int operation)); - extern int nfs_numasync; +extern int nfs_ioddelwri; extern struct nfsstats nfsstats; -extern int nbdwrite; + +#define NFSBUFHASH(dvp, lbn) \ + (&nfsbufhashtbl[((long)(dvp) / sizeof(*(dvp)) + (int)(lbn)) & nfsbufhash]) +LIST_HEAD(nfsbufhashhead, nfsbuf) *nfsbufhashtbl; +struct nfsbuffreehead nfsbuffree, nfsbufdelwri; +u_long nfsbufhash; +int nfsbufhashlock, nfsbufcnt, nfsbufmin, nfsbufmax; +int nfsbuffreecnt, nfsbufdelwricnt, nfsneedbuffer; +int nfs_nbdwrite; + +#define NFSBUFWRITE_THROTTLE 9 + +/* + * Initialize nfsbuf lists + */ +void +nfs_nbinit(void) +{ + nfsbufhashlock = 0; + nfsbufhashtbl = hashinit(nbuf, M_TEMP, &nfsbufhash); + TAILQ_INIT(&nfsbuffree); + TAILQ_INIT(&nfsbufdelwri); + nfsbufcnt = nfsbuffreecnt = nfsbufdelwricnt = 0; + nfsbufmin = 128; // XXX tune me! + nfsbufmax = 8192; // XXX tune me! + nfsneedbuffer = 0; + nfs_nbdwrite = 0; +} + +/* + * try to free up some excess, unused nfsbufs + */ +static void +nfs_buf_freeup(void) +{ + struct nfsbuf *fbp; + int cnt; + +#define NFS_BUF_FREEUP() \ + do { \ + /* only call nfs_buf_freeup() if it has work to do */ \ + if ((nfsbuffreecnt > nfsbufcnt/4) && \ + (nfsbufcnt-nfsbuffreecnt/8 > nfsbufmin)) \ + nfs_buf_freeup(); \ + } while (0) + + if (nfsbuffreecnt < nfsbufcnt/4) + return; + cnt = nfsbuffreecnt/8; + if (nfsbufcnt-cnt < nfsbufmin) + return; + + FSDBG(320, -1, nfsbufcnt, nfsbuffreecnt, cnt); + while (cnt-- > 0) { + fbp = TAILQ_FIRST(&nfsbuffree); + if (!fbp) + break; + nfs_buf_remfree(fbp); + /* disassociate buffer from any vnode */ + if (fbp->nb_vp) { + struct vnode *oldvp; + if (fbp->nb_vnbufs.le_next != NFSNOLIST) { + LIST_REMOVE(fbp, nb_vnbufs); + fbp->nb_vnbufs.le_next = NFSNOLIST; + } + oldvp = fbp->nb_vp; + fbp->nb_vp = NULL; + HOLDRELE(oldvp); + } + LIST_REMOVE(fbp, nb_hash); + /* nuke any creds */ + if (fbp->nb_rcred != NOCRED) + crfree(fbp->nb_rcred); + if (fbp->nb_wcred != NOCRED) + crfree(fbp->nb_wcred); + /* if buf was NB_META, dump buffer */ + if (ISSET(fbp->nb_flags, NB_META) && fbp->nb_data) { + FREE(fbp->nb_data, M_TEMP); + } + FREE(fbp, M_TEMP); + nfsbufcnt--; + } + FSDBG(320, -1, nfsbufcnt, nfsbuffreecnt, cnt); +} + +void +nfs_buf_remfree(struct nfsbuf *bp) +{ + if (bp->nb_free.tqe_next == NFSNOLIST) + panic("nfsbuf not on free list"); + if (ISSET(bp->nb_flags, NB_DELWRI)) { + nfsbufdelwricnt--; + TAILQ_REMOVE(&nfsbufdelwri, bp, nb_free); + } else { + nfsbuffreecnt--; + TAILQ_REMOVE(&nfsbuffree, bp, nb_free); + } + bp->nb_free.tqe_next = NFSNOLIST; + NFSBUFCNTCHK(); +} + +/* + * check for existence of nfsbuf in cache + */ +struct nfsbuf * +nfs_buf_incore(struct vnode *vp, daddr_t blkno) +{ + /* Search hash chain */ + struct nfsbuf * bp = NFSBUFHASH(vp, blkno)->lh_first; + for (; bp != 
NULL; bp = bp->nb_hash.le_next) + if (bp->nb_lblkno == blkno && bp->nb_vp == vp && + !ISSET(bp->nb_flags, NB_INVAL)) { + FSDBG(547, bp, blkno, bp->nb_flags, bp->nb_vp); + return (bp); + } + return (NULL); +} + +/* + * Check if it's OK to drop a page. + * + * Called by vnode_pager() on pageout request of non-dirty page. + * We need to make sure that it's not part of a delayed write. + * If it is, we can't let the VM drop it because we may need it + * later when/if we need to write the data (again). + */ +int +nfs_buf_page_inval(struct vnode *vp, off_t offset) +{ + struct nfsbuf *bp; + bp = nfs_buf_incore(vp, ubc_offtoblk(vp, offset)); + if (!bp) + return (0); + FSDBG(325, bp, bp->nb_flags, bp->nb_dirtyoff, bp->nb_dirtyend); + if (ISSET(bp->nb_flags, NB_BUSY)) + return (EBUSY); + /* + * If there's a dirty range in the buffer, check to + * see if this page intersects with the dirty range. + * If it does, we can't let the pager drop the page. + */ + if (bp->nb_dirtyend > 0) { + int start = offset - NBOFF(bp); + if (bp->nb_dirtyend <= start || + bp->nb_dirtyoff >= (start + PAGE_SIZE)) + return (0); + return (EBUSY); + } + return (0); +} + +int +nfs_buf_upl_setup(struct nfsbuf *bp) +{ + kern_return_t kret; + upl_t upl; + int s; + + if (ISSET(bp->nb_flags, NB_PAGELIST)) + return (0); + + kret = ubc_create_upl(bp->nb_vp, NBOFF(bp), bp->nb_bufsize, + &upl, NULL, UPL_PRECIOUS); + if (kret == KERN_INVALID_ARGUMENT) { + /* vm object probably doesn't exist any more */ + bp->nb_pagelist = NULL; + return (EINVAL); + } + if (kret != KERN_SUCCESS) { + printf("nfs_buf_upl_setup(): failed to get pagelist %d\n", kret); + bp->nb_pagelist = NULL; + return (EIO); + } + + FSDBG(538, bp, NBOFF(bp), bp->nb_bufsize, bp->nb_vp); + + s = splbio(); + bp->nb_pagelist = upl; + SET(bp->nb_flags, NB_PAGELIST); + splx(s); + return (0); +} + +void +nfs_buf_upl_check(struct nfsbuf *bp) +{ + upl_page_info_t *pl; + off_t filesize, fileoffset; + int i, npages; + + if (!ISSET(bp->nb_flags, NB_PAGELIST)) + return; + + npages = round_page_32(bp->nb_bufsize) / PAGE_SIZE; + filesize = ubc_getsize(bp->nb_vp); + fileoffset = NBOFF(bp); + if (fileoffset < filesize) + SET(bp->nb_flags, NB_CACHE); + else + CLR(bp->nb_flags, NB_CACHE); + + pl = ubc_upl_pageinfo(bp->nb_pagelist); + bp->nb_valid = bp->nb_dirty = 0; + + for (i=0; i < npages; i++, fileoffset += PAGE_SIZE_64) { + /* anything beyond the end of the file is not valid or dirty */ + if (fileoffset >= filesize) + break; + if (!upl_valid_page(pl, i)) { + CLR(bp->nb_flags, NB_CACHE); + continue; + } + NBPGVALID_SET(bp,i); + if (upl_dirty_page(pl, i)) { + NBPGDIRTY_SET(bp, i); + if (!ISSET(bp->nb_flags, NB_WASDIRTY)) + SET(bp->nb_flags, NB_WASDIRTY); + } + } + fileoffset = NBOFF(bp); + if (ISSET(bp->nb_flags, NB_CACHE)) { + bp->nb_validoff = 0; + bp->nb_validend = bp->nb_bufsize; + if (fileoffset + bp->nb_validend > filesize) + bp->nb_validend = filesize - fileoffset; + } else { + bp->nb_validoff = bp->nb_validend = -1; + } + FSDBG(539, bp, fileoffset, bp->nb_valid, bp->nb_dirty); + FSDBG(539, bp->nb_validoff, bp->nb_validend, bp->nb_dirtyoff, bp->nb_dirtyend); +} + +static int +nfs_buf_map(struct nfsbuf *bp) +{ + kern_return_t kret; + + if (bp->nb_data) + return (0); + if (!ISSET(bp->nb_flags, NB_PAGELIST)) + return (EINVAL); + + kret = ubc_upl_map(bp->nb_pagelist, (vm_address_t *)&(bp->nb_data)); + if (kret != KERN_SUCCESS) + panic("nfs_buf_map: ubc_upl_map() failed with (%d)", kret); + if (bp->nb_data == 0) + panic("ubc_upl_map mapped 0"); + FSDBG(540, bp, bp->nb_flags, NBOFF(bp), 
bp->nb_data); + return (0); +} + +/* + * check range of pages in nfsbuf's UPL for validity + */ +static int +nfs_buf_upl_valid_range(struct nfsbuf *bp, int off, int size) +{ + off_t fileoffset, filesize; + int pg, lastpg; + upl_page_info_t *pl; + + if (!ISSET(bp->nb_flags, NB_PAGELIST)) + return (0); + pl = ubc_upl_pageinfo(bp->nb_pagelist); + + size += off & PAGE_MASK; + off &= ~PAGE_MASK; + fileoffset = NBOFF(bp); + filesize = VTONFS(bp->nb_vp)->n_size; + if ((fileoffset + off + size) > filesize) + size = filesize - (fileoffset + off); + + pg = off/PAGE_SIZE; + lastpg = (off + size - 1)/PAGE_SIZE; + while (pg <= lastpg) { + if (!upl_valid_page(pl, pg)) + return (0); + pg++; + } + return (1); +} + +/* + * normalize an nfsbuf's valid range + * + * the read/write code guarantees that we'll always have a valid + * region that is an integral number of pages. If either end + * of the valid range isn't page-aligned, it gets corrected + * here as we extend the valid range through all of the + * contiguous valid pages. + */ +static void +nfs_buf_normalize_valid_range(struct nfsnode *np, struct nfsbuf *bp) +{ + int pg, npg; + /* pull validoff back to start of contiguous valid page range */ + pg = bp->nb_validoff/PAGE_SIZE; + while (pg >= 0 && NBPGVALID(bp,pg)) + pg--; + bp->nb_validoff = (pg+1) * PAGE_SIZE; + /* push validend forward to end of contiguous valid page range */ + npg = bp->nb_bufsize/PAGE_SIZE; + pg = bp->nb_validend/PAGE_SIZE; + while (pg < npg && NBPGVALID(bp,pg)) + pg++; + bp->nb_validend = pg * PAGE_SIZE; + /* clip to EOF */ + if (NBOFF(bp) + bp->nb_validend > np->n_size) + bp->nb_validend = np->n_size % bp->nb_bufsize; +} + +/* + * try to push out some delayed/uncommitted writes + */ +static void +nfs_buf_delwri_push(void) +{ + struct nfsbuf *bp; + int i; + + if (TAILQ_EMPTY(&nfsbufdelwri)) + return; + + /* first try to tell the nfsiods to do it */ + if (nfs_asyncio(NULL, NULL) == 0) + return; + + /* otherwise, try to do some of the work ourselves */ + i = 0; + while (i < 8 && (bp = TAILQ_FIRST(&nfsbufdelwri)) != NULL) { + struct nfsnode *np = VTONFS(bp->nb_vp); + nfs_buf_remfree(bp); + if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) { + /* put buffer at end of delwri list */ + TAILQ_INSERT_TAIL(&nfsbufdelwri, bp, nb_free); + nfsbufdelwricnt++; + nfs_flushcommits(np->n_vnode, (struct proc *)0); + } else { + SET(bp->nb_flags, (NB_BUSY | NB_ASYNC)); + nfs_buf_write(bp); + } + i++; + } +} + +/* + * Get an nfs cache block. + * Allocate a new one if the block isn't currently in the cache + * and return the block marked busy. If the calling process is + * interrupted by a signal for an interruptible mount point, return + * NULL. 
+ */ +struct nfsbuf * +nfs_buf_get( + struct vnode *vp, + daddr_t blkno, + int size, + struct proc *p, + int operation) +{ + struct nfsnode *np = VTONFS(vp); + struct nfsbuf *bp; + int i, biosize, bufsize, rv; + struct ucred *cred; + int slpflag = PCATCH; + + FSDBG_TOP(541, vp, blkno, size, operation); + + bufsize = size; + if (bufsize > MAXBSIZE) + panic("nfs_buf_get: buffer larger than MAXBSIZE requested"); + + biosize = vp->v_mount->mnt_stat.f_iosize; + + if (UBCINVALID(vp) || !UBCINFOEXISTS(vp)) + operation = BLK_META; + else if (bufsize < biosize) + /* reg files should always have biosize blocks */ + bufsize = biosize; + + /* if BLK_WRITE, check for too many delayed/uncommitted writes */ + if ((operation == BLK_WRITE) && (nfs_nbdwrite > ((nfsbufcnt*3)/4))) { + FSDBG_TOP(542, vp, blkno, nfs_nbdwrite, ((nfsbufcnt*3)/4)); + + /* poke the delwri list */ + nfs_buf_delwri_push(); + + /* sleep to let other threads run... */ + tsleep(&nfs_nbdwrite, PCATCH, "nfs_nbdwrite", 1); + FSDBG_BOT(542, vp, blkno, nfs_nbdwrite, ((nfsbufcnt*3)/4)); + } + +loop: + /* + * Obtain a lock to prevent a race condition if the + * MALLOC() below happens to block. + */ + if (nfsbufhashlock) { + while (nfsbufhashlock) { + nfsbufhashlock = -1; + tsleep(&nfsbufhashlock, PCATCH, "nfsbufget", 0); + if (nfs_sigintr(VFSTONFS(vp->v_mount), NULL, p)) + return (NULL); + } + goto loop; + } + nfsbufhashlock = 1; + + /* check for existence of nfsbuf in cache */ + if (bp = nfs_buf_incore(vp, blkno)) { + /* if busy, set wanted and wait */ + if (ISSET(bp->nb_flags, NB_BUSY)) { + FSDBG_TOP(543, vp, blkno, bp, bp->nb_flags); + SET(bp->nb_flags, NB_WANTED); + /* unlock hash */ + if (nfsbufhashlock < 0) { + nfsbufhashlock = 0; + wakeup(&nfsbufhashlock); + } else + nfsbufhashlock = 0; + tsleep(bp, slpflag|(PRIBIO+1), "nfsbufget", (slpflag == PCATCH) ? 
0 : 2*hz); + slpflag = 0; + FSDBG_BOT(543, vp, blkno, bp, bp->nb_flags); + if (nfs_sigintr(VFSTONFS(vp->v_mount), NULL, p)) { + FSDBG_BOT(541, vp, blkno, 0, EINTR); + return (NULL); + } + goto loop; + } + if (bp->nb_bufsize != bufsize) + panic("nfsbuf size mismatch"); + SET(bp->nb_flags, (NB_BUSY | NB_CACHE)); + nfs_buf_remfree(bp); + /* additional paranoia: */ + if (ISSET(bp->nb_flags, NB_PAGELIST)) + panic("pagelist buffer was not busy"); + goto buffer_setup; + } + + /* + * where to get a free buffer: + * - alloc new if we haven't reached min bufs + * - free list + * - alloc new if we haven't reached max allowed + * - start clearing out delwri list and try again + */ + + if ((nfsbufcnt > nfsbufmin) && !TAILQ_EMPTY(&nfsbuffree)) { + /* pull an nfsbuf off the free list */ + bp = TAILQ_FIRST(&nfsbuffree); + FSDBG(544, vp, blkno, bp, bp->nb_flags); + nfs_buf_remfree(bp); + if (ISSET(bp->nb_flags, NB_DELWRI)) + panic("nfs_buf_get: delwri"); + SET(bp->nb_flags, NB_BUSY); + /* disassociate buffer from previous vnode */ + if (bp->nb_vp) { + struct vnode *oldvp; + if (bp->nb_vnbufs.le_next != NFSNOLIST) { + LIST_REMOVE(bp, nb_vnbufs); + bp->nb_vnbufs.le_next = NFSNOLIST; + } + oldvp = bp->nb_vp; + bp->nb_vp = NULL; + HOLDRELE(oldvp); + } + LIST_REMOVE(bp, nb_hash); + /* nuke any creds we're holding */ + cred = bp->nb_rcred; + if (cred != NOCRED) { + bp->nb_rcred = NOCRED; + crfree(cred); + } + cred = bp->nb_wcred; + if (cred != NOCRED) { + bp->nb_wcred = NOCRED; + crfree(cred); + } + /* if buf will no longer be NB_META, dump old buffer */ + if ((operation != BLK_META) && + ISSET(bp->nb_flags, NB_META) && bp->nb_data) { + FREE(bp->nb_data, M_TEMP); + bp->nb_data = NULL; + } + /* re-init buf fields */ + bp->nb_error = 0; + bp->nb_validoff = bp->nb_validend = -1; + bp->nb_dirtyoff = bp->nb_dirtyend = 0; + bp->nb_valid = 0; + bp->nb_dirty = 0; + } else if (nfsbufcnt < nfsbufmax) { + /* just alloc a new one */ + MALLOC(bp, struct nfsbuf *, sizeof(struct nfsbuf), M_TEMP, M_WAITOK); + nfsbufcnt++; + NFSBUFCNTCHK(); + /* init nfsbuf */ + bzero(bp, sizeof(*bp)); + bp->nb_free.tqe_next = NFSNOLIST; + bp->nb_validoff = bp->nb_validend = -1; + FSDBG(545, vp, blkno, bp, 0); + } else { + /* too many bufs... 
wait for buffers to free up */ + FSDBG_TOP(546, vp, blkno, nfsbufcnt, nfsbufmax); + /* unlock hash */ + if (nfsbufhashlock < 0) { + nfsbufhashlock = 0; + wakeup(&nfsbufhashlock); + } else + nfsbufhashlock = 0; + + /* poke the delwri list */ + nfs_buf_delwri_push(); + + nfsneedbuffer = 1; + tsleep(&nfsneedbuffer, PCATCH, "nfsbufget", 0); + FSDBG_BOT(546, vp, blkno, nfsbufcnt, nfsbufmax); + if (nfs_sigintr(VFSTONFS(vp->v_mount), NULL, p)) { + FSDBG_BOT(541, vp, blkno, 0, EINTR); + return (NULL); + } + goto loop; + } + +setup_nfsbuf: + + /* setup nfsbuf */ + bp->nb_flags = NB_BUSY; + bp->nb_lblkno = blkno; + /* insert buf in hash */ + LIST_INSERT_HEAD(NFSBUFHASH(vp, blkno), bp, nb_hash); + /* associate buffer with new vnode */ + VHOLD(vp); + bp->nb_vp = vp; + LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs); + +buffer_setup: + + switch (operation) { + case BLK_META: + SET(bp->nb_flags, NB_META); + if ((bp->nb_bufsize != bufsize) && bp->nb_data) { + FREE(bp->nb_data, M_TEMP); + bp->nb_data = NULL; + bp->nb_validoff = bp->nb_validend = -1; + bp->nb_dirtyoff = bp->nb_dirtyend = 0; + bp->nb_valid = 0; + bp->nb_dirty = 0; + CLR(bp->nb_flags, NB_CACHE); + } + if (!bp->nb_data) + MALLOC(bp->nb_data, caddr_t, bufsize, M_TEMP, M_WAITOK); + if (!bp->nb_data) + panic("nfs_buf_get: null nb_data"); + bp->nb_bufsize = bufsize; + break; + + case BLK_READ: + case BLK_WRITE: + if (bufsize < PAGE_SIZE) + bufsize = PAGE_SIZE; + bp->nb_bufsize = bufsize; + bp->nb_validoff = bp->nb_validend = -1; + + if (UBCISVALID(vp)) { + /* setup upl */ + if (nfs_buf_upl_setup(bp)) { + /* unable to create upl */ + /* vm object must no longer exist */ + /* cleanup buffer and return NULL */ + LIST_REMOVE(bp, nb_vnbufs); + bp->nb_vnbufs.le_next = NFSNOLIST; + bp->nb_vp = NULL; + HOLDRELE(vp); + if (bp->nb_free.tqe_next != NFSNOLIST) + panic("nfsbuf on freelist"); + TAILQ_INSERT_HEAD(&nfsbuffree, bp, nb_free); + nfsbuffreecnt++; + FSDBG_BOT(541, vp, blkno, 0x2bc, EIO); + return (NULL); + } + nfs_buf_upl_check(bp); + } + break; + + default: + panic("nfs_buf_get: %d unknown operation", operation); + } + + /* unlock hash */ + if (nfsbufhashlock < 0) { + nfsbufhashlock = 0; + wakeup(&nfsbufhashlock); + } else + nfsbufhashlock = 0; + + FSDBG_BOT(541, vp, blkno, bp, bp->nb_flags); + + return (bp); +} + +void +nfs_buf_release(struct nfsbuf *bp) +{ + struct vnode *vp = bp->nb_vp; + + FSDBG_TOP(548, bp, NBOFF(bp), bp->nb_flags, bp->nb_data); + FSDBG(548, bp->nb_validoff, bp->nb_validend, bp->nb_dirtyoff, bp->nb_dirtyend); + FSDBG(548, bp->nb_valid, 0, bp->nb_dirty, 0); + + if (UBCINFOEXISTS(vp) && bp->nb_bufsize) { + int upl_flags; + upl_t upl; + int i, rv; + + if (!ISSET(bp->nb_flags, NB_PAGELIST) && !ISSET(bp->nb_flags, NB_INVAL)) { + rv = nfs_buf_upl_setup(bp); + if (rv) + printf("nfs_buf_release: upl create failed %d\n", rv); + else + nfs_buf_upl_check(bp); + } + upl = bp->nb_pagelist; + if (!upl) + goto pagelist_cleanup_done; + if (bp->nb_data) { + if (ubc_upl_unmap(upl) != KERN_SUCCESS) + panic("ubc_upl_unmap failed"); + bp->nb_data = NULL; + } + if (bp->nb_flags & (NB_ERROR | NB_INVAL)) { + if (bp->nb_flags & (NB_READ | NB_INVAL)) + upl_flags = UPL_ABORT_DUMP_PAGES; + else + upl_flags = 0; + ubc_upl_abort(upl, upl_flags); + goto pagelist_cleanup_done; + } + for (i=0; i <= (bp->nb_bufsize - 1)/PAGE_SIZE; i++) { + if (!NBPGVALID(bp,i)) + ubc_upl_abort_range(upl, + i*PAGE_SIZE, PAGE_SIZE, + UPL_ABORT_DUMP_PAGES | + UPL_ABORT_FREE_ON_EMPTY); + else { + if (NBPGDIRTY(bp,i)) + upl_flags = UPL_COMMIT_SET_DIRTY; + else + upl_flags = 
UPL_COMMIT_CLEAR_DIRTY; + ubc_upl_commit_range(upl, + i*PAGE_SIZE, PAGE_SIZE, + upl_flags | + UPL_COMMIT_INACTIVATE | + UPL_COMMIT_FREE_ON_EMPTY); + } + } +pagelist_cleanup_done: + /* was this the last buffer in the file? */ + if (NBOFF(bp) + bp->nb_bufsize > VTONFS(vp)->n_size) { + /* if so, invalidate all pages of last buffer past EOF */ + int biosize = vp->v_mount->mnt_stat.f_iosize; + off_t off, size; + off = trunc_page_64(VTONFS(vp)->n_size) + PAGE_SIZE_64; + size = trunc_page_64(NBOFF(bp) + biosize) - off; + if (size) + ubc_invalidate(vp, off, size); + } + CLR(bp->nb_flags, NB_PAGELIST); + bp->nb_pagelist = NULL; + } + + /* Wake up any processes waiting for any buffer to become free. */ + if (nfsneedbuffer) { + nfsneedbuffer = 0; + wakeup(&nfsneedbuffer); + } + /* Wake up any processes waiting for _this_ buffer to become free. */ + if (ISSET(bp->nb_flags, NB_WANTED)) { + CLR(bp->nb_flags, NB_WANTED); + wakeup(bp); + } + + /* If it's not cacheable, or an error, mark it invalid. */ + if (ISSET(bp->nb_flags, (NB_NOCACHE|NB_ERROR))) + SET(bp->nb_flags, NB_INVAL); + + if ((bp->nb_bufsize <= 0) || ISSET(bp->nb_flags, NB_INVAL)) { + /* If it's invalid or empty, dissociate it from its vnode */ + if (bp->nb_vnbufs.le_next != NFSNOLIST) { + LIST_REMOVE(bp, nb_vnbufs); + bp->nb_vnbufs.le_next = NFSNOLIST; + } + bp->nb_vp = NULL; + HOLDRELE(vp); + /* if this was a delayed write, wakeup anyone */ + /* waiting for delayed writes to complete */ + if (ISSET(bp->nb_flags, NB_DELWRI)) { + CLR(bp->nb_flags, NB_DELWRI); + nfs_nbdwrite--; + NFSBUFCNTCHK(); + wakeup((caddr_t)&nfs_nbdwrite); + } + /* put buffer at head of free list */ + if (bp->nb_free.tqe_next != NFSNOLIST) + panic("nfsbuf on freelist"); + TAILQ_INSERT_HEAD(&nfsbuffree, bp, nb_free); + nfsbuffreecnt++; + NFS_BUF_FREEUP(); + } else if (ISSET(bp->nb_flags, NB_DELWRI)) { + /* put buffer at end of delwri list */ + if (bp->nb_free.tqe_next != NFSNOLIST) + panic("nfsbuf on freelist"); + TAILQ_INSERT_TAIL(&nfsbufdelwri, bp, nb_free); + nfsbufdelwricnt++; + } else { + /* put buffer at end of free list */ + if (bp->nb_free.tqe_next != NFSNOLIST) + panic("nfsbuf on freelist"); + TAILQ_INSERT_TAIL(&nfsbuffree, bp, nb_free); + nfsbuffreecnt++; + NFS_BUF_FREEUP(); + } + + NFSBUFCNTCHK(); + + /* Unlock the buffer. */ + CLR(bp->nb_flags, (NB_ASYNC | NB_BUSY | NB_NOCACHE | NB_STABLE | NB_IOD)); + + FSDBG_BOT(548, bp, NBOFF(bp), bp->nb_flags, bp->nb_data); +} + +/* + * Wait for operations on the buffer to complete. + * When they do, extract and return the I/O's error value. + */ +int +nfs_buf_iowait(struct nfsbuf *bp) +{ + FSDBG_TOP(549, bp, NBOFF(bp), bp->nb_flags, bp->nb_error); + + while (!ISSET(bp->nb_flags, NB_DONE)) + tsleep(bp, PRIBIO + 1, "nfs_buf_iowait", 0); + + FSDBG_BOT(549, bp, NBOFF(bp), bp->nb_flags, bp->nb_error); + + /* check for interruption of I/O, then errors. */ + if (ISSET(bp->nb_flags, NB_EINTR)) { + CLR(bp->nb_flags, NB_EINTR); + return (EINTR); + } else if (ISSET(bp->nb_flags, NB_ERROR)) + return (bp->nb_error ? bp->nb_error : EIO); + return (0); +} + +/* + * Mark I/O complete on a buffer. 
+ */ +void +nfs_buf_iodone(struct nfsbuf *bp) +{ + struct vnode *vp; + + FSDBG_TOP(550, bp, NBOFF(bp), bp->nb_flags, bp->nb_error); + + if (ISSET(bp->nb_flags, NB_DONE)) + panic("nfs_buf_iodone already"); + SET(bp->nb_flags, NB_DONE); /* note that it's done */ + /* + * I/O was done, so don't believe + * the DIRTY state from VM anymore + */ + CLR(bp->nb_flags, NB_WASDIRTY); + + if (!ISSET(bp->nb_flags, NB_READ)) { + CLR(bp->nb_flags, NB_WRITEINPROG); + vpwakeup(bp->nb_vp); + } + + /* Wakeup the throttled write operations as needed */ + vp = bp->nb_vp; + if (vp && (vp->v_flag & VTHROTTLED) + && (vp->v_numoutput <= (NFSBUFWRITE_THROTTLE / 3))) { + vp->v_flag &= ~VTHROTTLED; + wakeup((caddr_t)&vp->v_numoutput); + } + + if (ISSET(bp->nb_flags, NB_ASYNC)) /* if async, release it */ + nfs_buf_release(bp); + else { /* or just wakeup the buffer */ + CLR(bp->nb_flags, NB_WANTED); + wakeup(bp); + } + + FSDBG_BOT(550, bp, NBOFF(bp), bp->nb_flags, bp->nb_error); +} + +void +nfs_buf_write_delayed(struct nfsbuf *bp) +{ + struct proc *p = current_proc(); + struct vnode *vp = bp->nb_vp; + + FSDBG_TOP(551, bp, NBOFF(bp), bp->nb_flags, 0); + FSDBG(551, bp, bp->nb_dirtyoff, bp->nb_dirtyend, bp->nb_dirty); + + /* + * If the block hasn't been seen before: + * (1) Mark it as having been seen, + * (2) Charge for the write. + * (3) Make sure it's on its vnode's correct block list, + */ + if (!ISSET(bp->nb_flags, NB_DELWRI)) { + SET(bp->nb_flags, NB_DELWRI); + if (p && p->p_stats) + p->p_stats->p_ru.ru_oublock++; /* XXX */ + nfs_nbdwrite++; + NFSBUFCNTCHK(); + /* move to dirty list */ + if (bp->nb_vnbufs.le_next != NFSNOLIST) + LIST_REMOVE(bp, nb_vnbufs); + LIST_INSERT_HEAD(&VTONFS(vp)->n_dirtyblkhd, bp, nb_vnbufs); + } + + /* + * If the vnode has "too many" write operations in progress + * wait for them to finish the IO + */ + while (vp->v_numoutput >= NFSBUFWRITE_THROTTLE) { + vp->v_flag |= VTHROTTLED; + tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "nfs_buf_write_delayed", 0); + } + + /* + * If we have too many delayed write buffers, + * more than we can "safely" handle, just fall back to + * doing the async write + */ + if (nfs_nbdwrite < 0) + panic("nfs_buf_write_delayed: Negative nfs_nbdwrite"); + + if (nfs_nbdwrite > ((nfsbufcnt/4)*3)) { + /* issue async write */ + SET(bp->nb_flags, NB_ASYNC); + nfs_buf_write(bp); + FSDBG_BOT(551, bp, NBOFF(bp), bp->nb_flags, bp->nb_error); + return; + } + + /* Otherwise, the "write" is done, so mark and release the buffer. */ + SET(bp->nb_flags, NB_DONE); + nfs_buf_release(bp); + FSDBG_BOT(551, bp, NBOFF(bp), bp->nb_flags, 0); + return; +} + /* * Vnode op for read using bio @@ -115,33 +967,41 @@ nfs_bioread(vp, uio, ioflag, cred, getpages) register struct uio *uio; int ioflag; struct ucred *cred; - int getpages; + int getpages; // XXX unused! { - register struct nfsnode *np = VTONFS(vp); - register int biosize, i; + struct nfsnode *np = VTONFS(vp); + int biosize, i; off_t diff; - struct buf *bp = 0, *rabp; + struct nfsbuf *bp = 0, *rabp; struct vattr vattr; struct proc *p; struct nfsmount *nmp = VFSTONFS(vp->v_mount); - daddr_t lbn, rabn; + daddr_t lbn, rabn, lastrabn = -1; int bufsize; - int nra, error = 0, n = 0, on = 0, not_readin; + int nra, error = 0, n = 0, on = 0; int operation = (getpages? 
BLK_PAGEIN : BLK_READ); + caddr_t dp; + struct dirent *direntp; + + FSDBG_TOP(514, vp, uio->uio_offset, uio->uio_resid, ioflag); #if DIAGNOSTIC if (uio->uio_rw != UIO_READ) panic("nfs_read mode"); #endif - if (uio->uio_resid == 0) + if (uio->uio_resid == 0) { + FSDBG_BOT(514, vp, 0xd1e0001, 0, 0); return (0); - if (uio->uio_offset < 0) + } + if (uio->uio_offset < 0) { + FSDBG_BOT(514, vp, 0xd1e0002, 0, EINVAL); return (EINVAL); + } p = uio->uio_procp; - if ((nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_GOTFSINFO)) == NFSMNT_NFSV3) + if ((nmp->nm_flag & NFSMNT_NFSV3) && + !(nmp->nm_state & NFSSTA_GOTFSINFO)) (void)nfs_fsinfo(nmp, vp, cred, p); - /*due to getblk/vm interractions, use vm page size or less values */ - biosize = min(vp->v_mount->mnt_stat.f_iosize, PAGE_SIZE); + biosize = vp->v_mount->mnt_stat.f_iosize; /* * For nfs, cache consistency can only be maintained approximately. * Although RFC1094 does not specify the criteria, the following is @@ -155,7 +1015,7 @@ nfs_bioread(vp, uio, ioflag, cred, getpages) * Then force a getattr rpc to ensure that you have up to date * attributes. * NB: This implies that cache data can be read when up to - * NFS_ATTRTIMEO seconds out of date. If you find that you need current + * NFS_MAXATTRTIMEO seconds out of date. If you find that you need current * attributes this could be forced by setting n_attrstamp to 0 before * the VOP_GETATTR() call. */ @@ -166,24 +1026,35 @@ nfs_bioread(vp, uio, ioflag, cred, getpages) panic("nfs: bioread, not dir"); nfs_invaldir(vp); error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); - if (error) + if (error) { + FSDBG_BOT(514, vp, 0xd1e0003, 0, error); return (error); + } } np->n_attrstamp = 0; error = VOP_GETATTR(vp, &vattr, cred, p); - if (error) + if (error) { + FSDBG_BOT(514, vp, 0xd1e0004, 0, error); return (error); + } np->n_mtime = vattr.va_mtime.tv_sec; } else { error = VOP_GETATTR(vp, &vattr, cred, p); - if (error) + if (error) { + FSDBG_BOT(514, vp, 0xd1e0005, 0, error); return (error); + } if (np->n_mtime != vattr.va_mtime.tv_sec) { - if (vp->v_type == VDIR) + if (vp->v_type == VDIR) { nfs_invaldir(vp); + /* purge name cache entries */ + cache_purge(vp); + } error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); - if (error) + if (error) { + FSDBG_BOT(514, vp, 0xd1e0006, 0, error); return (error); + } np->n_mtime = vattr.va_mtime.tv_sec; } } @@ -198,70 +1069,126 @@ nfs_bioread(vp, uio, ioflag, cred, getpages) do { error = nqnfs_getlease(vp, ND_READ, cred, p); } while (error == NQNFS_EXPIRED); - if (error) + if (error) { + FSDBG_BOT(514, vp, 0xd1e0007, 0, error); return (error); + } if (np->n_lrev != np->n_brev || (np->n_flag & NQNFSNONCACHE) || ((np->n_flag & NMODIFIED) && vp->v_type == VDIR)) { if (vp->v_type == VDIR) nfs_invaldir(vp); error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); - if (error) + if (error) { + FSDBG_BOT(514, vp, 0xd1e0008, 0, error); return (error); + } np->n_brev = np->n_lrev; } } else if (vp->v_type == VDIR && (np->n_flag & NMODIFIED)) { nfs_invaldir(vp); error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); - if (error) + if (error) { + FSDBG_BOT(514, vp, 0xd1e0009, 0, error); return (error); + } } } - if (np->n_flag & NQNFSNONCACHE) { + if ((np->n_flag & NQNFSNONCACHE) || (vp->v_flag & VNOCACHE_DATA)) { + if ((vp->v_flag & VNOCACHE_DATA) && + (np->n_dirtyblkhd.lh_first || np->n_cleanblkhd.lh_first)) { + error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); + if (error) { + FSDBG_BOT(514, vp, 0xd1e000a, 0, error); + return (error); + } + } switch (vp->v_type) { case VREG: - return (nfs_readrpc(vp, uio, cred)); + error = 
nfs_readrpc(vp, uio, cred); + FSDBG_BOT(514, vp, uio->uio_offset, uio->uio_resid, error); + return (error); case VLNK: - return (nfs_readlinkrpc(vp, uio, cred)); + error = nfs_readlinkrpc(vp, uio, cred); + FSDBG_BOT(514, vp, uio->uio_offset, uio->uio_resid, error); + return (error); case VDIR: break; default: - printf(" NQNFSNONCACHE: type %x unexpected\n", - vp->v_type); + printf(" NQNFSNONCACHE: type %x unexpected\n", vp->v_type); }; } switch (vp->v_type) { case VREG: - nfsstats.biocache_reads++; lbn = uio->uio_offset / biosize; - on = uio->uio_offset & (biosize - 1); - not_readin = 1; + + /* + * Copy directly from any cached pages without grabbing the bufs. + */ + if (uio->uio_segflg == UIO_USERSPACE) { + int io_resid = uio->uio_resid; + diff = np->n_size - uio->uio_offset; + if (diff < io_resid) + io_resid = diff; + if (io_resid > 0) { + error = cluster_copy_ubc_data(vp, uio, &io_resid, 0); + if (error) { + FSDBG_BOT(514, vp, uio->uio_offset, 0xcacefeed, error); + return (error); + } + } + /* count any biocache reads that we just copied directly */ + if (lbn != uio->uio_offset / biosize) { + nfsstats.biocache_reads += (uio->uio_offset / biosize) - lbn; + FSDBG(514, vp, 0xcacefeed, uio->uio_offset, error); + } + } + + lbn = uio->uio_offset / biosize; + on = uio->uio_offset % biosize; /* * Start the read ahead(s), as required. */ if (nfs_numasync > 0 && nmp->nm_readahead > 0) { - for (nra = 0; nra < nmp->nm_readahead && - (off_t)(lbn + 1 + nra) * biosize < np->n_size; - nra++) { + for (nra = 0; nra < nmp->nm_readahead; nra++) { rabn = lbn + 1 + nra; - if (!incore(vp, rabn)) { - rabp = nfs_getcacheblk(vp, rabn, biosize, p, operation); - if (!rabp) - return (EINTR); - if (!ISSET(rabp->b_flags, (B_CACHE|B_DELWRI))) { - SET(rabp->b_flags, (B_READ | B_ASYNC)); - if (nfs_asyncio(rabp, cred)) { - SET(rabp->b_flags, (B_INVAL|B_ERROR)); - rabp->b_error = EIO; - brelse(rabp); - } - } else - brelse(rabp); + if (rabn <= lastrabn) { + /* we've already (tried to) read this block */ + /* no need to try it again... */ + continue; } - } + lastrabn = rabn; + if ((off_t)rabn * biosize >= np->n_size) + break; + /* check if block exists and is valid. */ + rabp = nfs_buf_incore(vp, rabn); + if (rabp && nfs_buf_upl_valid_range(rabp, 0, rabp->nb_bufsize)) + continue; + rabp = nfs_buf_get(vp, rabn, biosize, p, operation); + if (!rabp) { + FSDBG_BOT(514, vp, 0xd1e000b, 0, EINTR); + return (EINTR); + } + if (!ISSET(rabp->nb_flags, (NB_CACHE|NB_DELWRI))) { + SET(rabp->nb_flags, (NB_READ|NB_ASYNC)); + if (nfs_asyncio(rabp, cred)) { + SET(rabp->nb_flags, (NB_INVAL|NB_ERROR)); + rabp->nb_error = EIO; + nfs_buf_release(rabp); + } + } else + nfs_buf_release(rabp); + } } + if ((uio->uio_resid <= 0) || (uio->uio_offset >= np->n_size)) { + FSDBG_BOT(514, vp, uio->uio_offset, uio->uio_resid, 0xaaaaaaaa); + return (0); + } + + nfsstats.biocache_reads++; + /* * If the block is in the cache and has the required data * in a valid region, just copy it out. 
@@ -270,84 +1197,162 @@ nfs_bioread(vp, uio, ioflag, cred, getpages) */ again: bufsize = biosize; - if ((off_t)(lbn + 1) * biosize > np->n_size && - (off_t)(lbn + 1) * biosize - np->n_size < biosize) { - bufsize = np->n_size - (off_t)lbn * biosize; - bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); - } - bp = nfs_getcacheblk(vp, lbn, bufsize, p, operation); - if (!bp) - return (EINTR); - - if (!ISSET(bp->b_flags, B_CACHE)) { - SET(bp->b_flags, B_READ); - CLR(bp->b_flags, (B_DONE | B_ERROR | B_INVAL)); - not_readin = 0; - error = nfs_doio(bp, cred, p); - if (error) { - brelse(bp); - return (error); - } - } - if (bufsize > on) { - n = min((unsigned)(bufsize - on), uio->uio_resid); - } else { - n = 0; - } + n = min((unsigned)(bufsize - on), uio->uio_resid); diff = np->n_size - uio->uio_offset; if (diff < n) n = diff; - if (not_readin && n > 0) { - if (on < bp->b_validoff || (on + n) > bp->b_validend) { - SET(bp->b_flags, (B_NOCACHE|B_INVAFTERWRITE)); - if (bp->b_dirtyend > 0) { - if (!ISSET(bp->b_flags, B_DELWRI)) - panic("nfsbioread"); - if (VOP_BWRITE(bp) == EINTR) - return (EINTR); - } else - brelse(bp); + + bp = nfs_buf_get(vp, lbn, bufsize, p, operation); + if (!bp) { + FSDBG_BOT(514, vp, 0xd1e000c, 0, EINTR); + return (EINTR); + } + + /* if any pages are valid... */ + if (bp->nb_valid) { + /* ...check for any invalid pages in the read range */ + int pg, firstpg, lastpg, dirtypg; + dirtypg = firstpg = lastpg = -1; + pg = on/PAGE_SIZE; + while (pg <= (on + n - 1)/PAGE_SIZE) { + if (!NBPGVALID(bp,pg)) { + if (firstpg < 0) + firstpg = pg; + lastpg = pg; + } else if (firstpg >= 0 && dirtypg < 0 && NBPGDIRTY(bp,pg)) + dirtypg = pg; + pg++; + } + + /* if there are no invalid pages, we're all set */ + if (firstpg < 0) { + if (bp->nb_validoff < 0) { + /* valid range isn't set up, so */ + /* set it to what we know is valid */ + bp->nb_validoff = trunc_page_32(on); + bp->nb_validend = round_page_32(on+n); + nfs_buf_normalize_valid_range(np, bp); + } + goto buffer_ready; + } + + /* there are invalid pages in the read range */ + if ((dirtypg > firstpg) && (dirtypg < lastpg)) { + /* there are also dirty page(s) in the range, */ + /* so write the buffer out and try again */ + CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL)); + SET(bp->nb_flags, NB_ASYNC); + /* + * NFS has embedded ucred so crhold() risks zone corruption + */ + if (bp->nb_wcred == NOCRED) + bp->nb_wcred = crdup(cred); + error = nfs_buf_write(bp); + if (error) { + FSDBG_BOT(514, vp, 0xd1e000d, 0, error); + return (error); + } goto again; } + if (!bp->nb_dirty && bp->nb_dirtyend <= 0 && + (lastpg - firstpg + 1) > (bufsize/PAGE_SIZE)/2) { + /* we need to read in more than half the buffer and the */ + /* buffer's not dirty, so just fetch the whole buffer */ + bp->nb_valid = 0; + } else { + /* read the page range in */ + struct iovec iov; + struct uio auio; + auio.uio_iov = &iov; + auio.uio_iovcnt = 1; + auio.uio_offset = NBOFF(bp) + firstpg * PAGE_SIZE_64; + auio.uio_resid = (lastpg - firstpg + 1) * PAGE_SIZE; + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_rw = UIO_READ; + auio.uio_procp = p; + NFS_BUF_MAP(bp); + iov.iov_base = bp->nb_data + firstpg * PAGE_SIZE; + iov.iov_len = auio.uio_resid; + error = nfs_readrpc(vp, &auio, cred); + if (error) { + nfs_buf_release(bp); + FSDBG_BOT(514, vp, 0xd1e000e, 0, error); + return (error); + } + /* Make sure that the valid range is set to cover this read. 
*/ + bp->nb_validoff = trunc_page_32(on); + bp->nb_validend = round_page_32(on+n); + nfs_buf_normalize_valid_range(np, bp); + if (auio.uio_resid > 0) { + /* if short read, must have hit EOF, */ + /* so zero the rest of the range */ + bzero(iov.iov_base, auio.uio_resid); + } + /* mark the pages (successfully read) as valid */ + for (pg=firstpg; pg <= lastpg; pg++) + NBPGVALID_SET(bp,pg); + } } + /* if no pages are valid, read the whole block */ + if (!bp->nb_valid) { + SET(bp->nb_flags, NB_READ); + CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL)); + error = nfs_doio(bp, cred, p); + if (error) { + nfs_buf_release(bp); + FSDBG_BOT(514, vp, 0xd1e000f, 0, error); + return (error); + } + } +buffer_ready: vp->v_lastr = lbn; - diff = (on >= bp->b_validend) ? 0 : (bp->b_validend - on); - if (diff < n) - n = diff; + /* validate read range against valid range and clip */ + if (bp->nb_validend > 0) { + diff = (on >= bp->nb_validend) ? 0 : (bp->nb_validend - on); + if (diff < n) + n = diff; + } + if (n > 0) + NFS_BUF_MAP(bp); break; case VLNK: nfsstats.biocache_readlinks++; - bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, p, operation); - if (!bp) + bp = nfs_buf_get(vp, (daddr_t)0, NFS_MAXPATHLEN, p, operation); + if (!bp) { + FSDBG_BOT(514, vp, 0xd1e0010, 0, EINTR); return (EINTR); - if (!ISSET(bp->b_flags, B_CACHE)) { - SET(bp->b_flags, B_READ); + } + if (!ISSET(bp->nb_flags, NB_CACHE)) { + SET(bp->nb_flags, NB_READ); error = nfs_doio(bp, cred, p); if (error) { - SET(bp->b_flags, B_ERROR); - brelse(bp); + SET(bp->nb_flags, NB_ERROR); + nfs_buf_release(bp); + FSDBG_BOT(514, vp, 0xd1e0011, 0, error); return (error); } } - n = min(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid); + n = min(uio->uio_resid, bp->nb_validend); on = 0; break; case VDIR: nfsstats.biocache_readdirs++; - if (np->n_direofoffset - && uio->uio_offset >= np->n_direofoffset) { - return (0); + if (np->n_direofoffset && uio->uio_offset >= np->n_direofoffset) { + FSDBG_BOT(514, vp, 0xde0f0001, 0, 0); + return (0); } lbn = uio->uio_offset / NFS_DIRBLKSIZ; on = uio->uio_offset & (NFS_DIRBLKSIZ - 1); - bp = nfs_getcacheblk(vp, lbn, NFS_DIRBLKSIZ, p, operation); - if (!bp) - return (EINTR); - if (!ISSET(bp->b_flags, B_CACHE)) { - SET(bp->b_flags, B_READ); + bp = nfs_buf_get(vp, lbn, NFS_DIRBLKSIZ, p, operation); + if (!bp) { + FSDBG_BOT(514, vp, 0xd1e0012, 0, EINTR); + return (EINTR); + } + if (!ISSET(bp->nb_flags, NB_CACHE)) { + SET(bp->nb_flags, NB_READ); error = nfs_doio(bp, cred, p); if (error) { - brelse(bp); + nfs_buf_release(bp); } while (error == NFSERR_BAD_COOKIE) { nfs_invaldir(vp); @@ -360,20 +1365,23 @@ again: */ for (i = 0; i <= lbn && !error; i++) { if (np->n_direofoffset - && (i * NFS_DIRBLKSIZ) >= np->n_direofoffset) + && (i * NFS_DIRBLKSIZ) >= np->n_direofoffset) { + FSDBG_BOT(514, vp, 0xde0f0002, 0, 0); return (0); - bp = nfs_getcacheblk(vp, i, NFS_DIRBLKSIZ, p, - operation); - if (!bp) + } + bp = nfs_buf_get(vp, i, NFS_DIRBLKSIZ, p, operation); + if (!bp) { + FSDBG_BOT(514, vp, 0xd1e0013, 0, EINTR); return (EINTR); - if (!ISSET(bp->b_flags, B_CACHE)) { - SET(bp->b_flags, B_READ); + } + if (!ISSET(bp->nb_flags, NB_CACHE)) { + SET(bp->nb_flags, NB_READ); error = nfs_doio(bp, cred, p); /* - * no error + B_INVAL == directory EOF, + * no error + NB_INVAL == directory EOF, * use the block. */ - if (error == 0 && (bp->b_flags & B_INVAL)) + if (error == 0 && (bp->nb_flags & NB_INVAL)) break; } /* @@ -383,7 +1391,7 @@ again: * block and go for the next one via the for loop. 
*/ if (error || i < lbn) - brelse(bp); + nfs_buf_release(bp); } } /* @@ -391,8 +1399,10 @@ again: * error. If we hit an error and it wasn't a cookie error, * we give up. */ - if (error) + if (error) { + FSDBG_BOT(514, vp, 0xd1e0014, 0, error); return (error); + } } /* @@ -404,19 +1414,19 @@ again: (np->n_direofoffset == 0 || (lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) && !(np->n_flag & NQNFSNONCACHE) && - !incore(vp, lbn + 1)) { - rabp = nfs_getcacheblk(vp, lbn + 1, NFS_DIRBLKSIZ, p, + !nfs_buf_incore(vp, lbn + 1)) { + rabp = nfs_buf_get(vp, lbn + 1, NFS_DIRBLKSIZ, p, operation); if (rabp) { - if (!ISSET(rabp->b_flags, (B_CACHE|B_DELWRI))) { - SET(rabp->b_flags, (B_READ | B_ASYNC)); + if (!ISSET(rabp->nb_flags, (NB_CACHE))) { + SET(rabp->nb_flags, (NB_READ | NB_ASYNC)); if (nfs_asyncio(rabp, cred)) { - SET(rabp->b_flags, (B_INVAL|B_ERROR)); - rabp->b_error = EIO; - brelse(rabp); + SET(rabp->nb_flags, (NB_INVAL|NB_ERROR)); + rabp->nb_error = EIO; + nfs_buf_release(rabp); } } else { - brelse(rabp); + nfs_buf_release(rabp); } } } @@ -424,30 +1434,41 @@ again: * Make sure we use a signed variant of min() since * the second term may be negative. */ - n = lmin(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid - on); + n = lmin(uio->uio_resid, bp->nb_validend - on); /* - * Unlike VREG files, whos buffer size ( bp->b_bcount ) is - * chopped for the EOF condition, we cannot tell how large - * NFS directories are going to be until we hit EOF. So - * an NFS directory buffer is *not* chopped to its EOF. Now, - * it just so happens that b_resid will effectively chop it - * to EOF. *BUT* this information is lost if the buffer goes - * away and is reconstituted into a B_CACHE state (recovered - * from VM) later. So we keep track of the directory eof - * in np->n_direofoffset and chop it off as an extra step - * right here. + * We keep track of the directory eof in + * np->n_direofoffset and chop it off as an + * extra step right here. */ if (np->n_direofoffset && n > np->n_direofoffset - uio->uio_offset) n = np->n_direofoffset - uio->uio_offset; + /* + * Make sure that we return an integral number of entries so + * that any subsequent calls will start copying from the start + * of the next entry. + * + * If the current value of n has the last entry cut short, + * set n to copy everything up to the last entry instead. 
+ */ + if (n > 0) { + dp = bp->nb_data + on; + while (dp < (bp->nb_data + on + n)) { + direntp = (struct dirent *)dp; + dp += direntp->d_reclen; + } + if (dp > (bp->nb_data + on + n)) + n = (dp - direntp->d_reclen) - (bp->nb_data + on); + } break; default: - printf(" nfs_bioread: type %x unexpected\n",vp->v_type); - break; + printf("nfs_bioread: type %x unexpected\n",vp->v_type); + FSDBG_BOT(514, vp, 0xd1e0015, 0, EINVAL); + return (EINVAL); }; if (n > 0) { - error = uiomove(bp->b_data + on, (int)n, uio); + error = uiomove(bp->nb_data + on, (int)n, uio); } switch (vp->v_type) { case VREG: @@ -457,13 +1478,12 @@ again: break; case VDIR: if (np->n_flag & NQNFSNONCACHE) - SET(bp->b_flags, B_INVAL); + SET(bp->nb_flags, NB_INVAL); break; - default: - printf(" nfs_bioread: type %x unexpected\n",vp->v_type); } - brelse(bp); + nfs_buf_release(bp); } while (error == 0 && uio->uio_resid > 0 && n > 0); + FSDBG_BOT(514, vp, uio->uio_offset, uio->uio_resid, error); return (error); } @@ -480,23 +1500,24 @@ nfs_write(ap) struct ucred *a_cred; } */ *ap; { - register int biosize; - register struct uio *uio = ap->a_uio; + struct uio *uio = ap->a_uio; struct proc *p = uio->uio_procp; - register struct vnode *vp = ap->a_vp; + struct vnode *vp = ap->a_vp; struct nfsnode *np = VTONFS(vp); - register struct ucred *cred = ap->a_cred; + struct ucred *cred = ap->a_cred; int ioflag = ap->a_ioflag; - struct buf *bp; + struct nfsbuf *bp; struct vattr vattr; struct nfsmount *nmp = VFSTONFS(vp->v_mount); daddr_t lbn; - int bufsize; + int biosize, bufsize, writeop; int n, on, error = 0, iomode, must_commit; - off_t boff; + off_t boff, start, end; struct iovec iov; struct uio auio; + FSDBG_TOP(515, vp, uio->uio_offset, uio->uio_resid, ioflag); + #if DIAGNOSTIC if (uio->uio_rw != UIO_WRITE) panic("nfs_write mode"); @@ -507,29 +1528,39 @@ nfs_write(ap) return (EIO); if (np->n_flag & NWRITEERR) { np->n_flag &= ~NWRITEERR; + FSDBG_BOT(515, vp, uio->uio_offset, uio->uio_resid, np->n_error); return (np->n_error); } - if ((nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_GOTFSINFO)) == NFSMNT_NFSV3) + if ((nmp->nm_flag & NFSMNT_NFSV3) && + !(nmp->nm_state & NFSSTA_GOTFSINFO)) (void)nfs_fsinfo(nmp, vp, cred, p); if (ioflag & (IO_APPEND | IO_SYNC)) { if (np->n_flag & NMODIFIED) { np->n_attrstamp = 0; error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); - if (error) + if (error) { + FSDBG_BOT(515, vp, uio->uio_offset, 0x10bad01, error); return (error); + } } if (ioflag & IO_APPEND) { np->n_attrstamp = 0; error = VOP_GETATTR(vp, &vattr, cred, p); - if (error) + if (error) { + FSDBG_BOT(515, vp, uio->uio_offset, 0x10bad02, error); return (error); + } uio->uio_offset = np->n_size; } } - if (uio->uio_offset < 0) + if (uio->uio_offset < 0) { + FSDBG_BOT(515, vp, uio->uio_offset, 0xbad0ff, EINVAL); return (EINVAL); - if (uio->uio_resid == 0) + } + if (uio->uio_resid == 0) { + FSDBG_BOT(515, vp, uio->uio_offset, uio->uio_resid, 0); return (0); + } /* * Maybe this should be above the vnode op call, but so long as * file servers have no limits, i don't think it matters @@ -537,15 +1568,11 @@ nfs_write(ap) if (p && uio->uio_offset + uio->uio_resid > p->p_rlimit[RLIMIT_FSIZE].rlim_cur) { psignal(p, SIGXFSZ); + FSDBG_BOT(515, vp, uio->uio_offset, 0x2b1f, EFBIG); return (EFBIG); } - /* - * I use nm_rsize, not nm_wsize so that all buffer cache blocks - * will be the same size within a filesystem. nfs_writerpc will - * still use nm_wsize when sizing the rpc's. 
- */ - /*due to getblk/vm interractions, use vm page size or less values */ - biosize = min(vp->v_mount->mnt_stat.f_iosize, PAGE_SIZE); + + biosize = vp->v_mount->mnt_stat.f_iosize; do { /* @@ -556,210 +1583,376 @@ nfs_write(ap) do { error = nqnfs_getlease(vp, ND_WRITE, cred, p); } while (error == NQNFS_EXPIRED); - if (error) + if (error) { + FSDBG_BOT(515, vp, uio->uio_offset, 0x11110001, error); return (error); + } if (np->n_lrev != np->n_brev || (np->n_flag & NQNFSNONCACHE)) { error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); - if (error) + if (error) { + FSDBG_BOT(515, vp, uio->uio_offset, 0x11110002, error); return (error); + } np->n_brev = np->n_lrev; } } - if ((np->n_flag & NQNFSNONCACHE) && uio->uio_iovcnt == 1) { + if (ISSET(vp->v_flag, VNOCACHE_DATA) && + (np->n_dirtyblkhd.lh_first || np->n_cleanblkhd.lh_first)) { + error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); + if (error) { + FSDBG_BOT(515, vp, 0, 0, error); + return (error); + } + } + if (((np->n_flag & NQNFSNONCACHE) || + ISSET(vp->v_flag, VNOCACHE_DATA)) && + uio->uio_iovcnt == 1) { iomode = NFSV3WRITE_FILESYNC; error = nfs_writerpc(vp, uio, cred, &iomode, &must_commit); if (must_commit) nfs_clearcommit(vp->v_mount); + FSDBG_BOT(515, vp, uio->uio_offset, uio->uio_resid, error); return (error); } nfsstats.biocache_writes++; lbn = uio->uio_offset / biosize; - on = uio->uio_offset & (biosize-1); + on = uio->uio_offset % biosize; n = min((unsigned)(biosize - on), uio->uio_resid); again: bufsize = biosize; -#if 0 -/* (removed for UBC) */ - if ((lbn + 1) * biosize > np->n_size) { - bufsize = np->n_size - lbn * biosize; - bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); - } -#endif /* * Get a cache block for writing. The range to be written is - * (off..off+len) within the block. We ensure that the block + * (off..off+n) within the block. We ensure that the block * either has no dirty region or that the given range is * contiguous with the existing dirty region. */ - bp = nfs_getcacheblk(vp, lbn, bufsize, p, BLK_WRITE); - if (!bp) + bp = nfs_buf_get(vp, lbn, bufsize, p, BLK_WRITE); + if (!bp) { + FSDBG_BOT(515, vp, uio->uio_offset, uio->uio_resid, EINTR); return (EINTR); + } + /* map the block because we know we're going to write to it */ + NFS_BUF_MAP(bp); + + if (ISSET(vp->v_flag, VNOCACHE_DATA)) + SET(bp->nb_flags, (NB_NOCACHE|NB_INVAL)); + + /* + * NFS has embedded ucred so crhold() risks zone corruption + */ + if (bp->nb_wcred == NOCRED) + bp->nb_wcred = crdup(cred); + + /* + * If there's already a dirty range AND dirty pages in this block we + * need to send a commit AND write the dirty pages before continuing. + * + * If there's already a dirty range OR dirty pages in this block + * and the new write range is not contiguous with the existing range, + * then force the buffer to be written out now. + * (We used to just extend the dirty range to cover the valid, + * but unwritten, data in between also. But writing ranges + * of data that weren't actually written by an application + * risks overwriting some other client's data with stale data + * that's just masquerading as new written data.) 
+ */ + if (bp->nb_dirtyend > 0) { + if (on > bp->nb_dirtyend || (on + n) < bp->nb_dirtyoff || bp->nb_dirty) { + FSDBG(515, vp, uio->uio_offset, bp, 0xd15c001); + /* write/commit buffer "synchronously" */ + /* (NB_STABLE indicates that data writes should be FILESYNC) */ + CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL)); + SET(bp->nb_flags, (NB_ASYNC | NB_STABLE)); + error = nfs_buf_write(bp); + if (error) { + FSDBG_BOT(515, vp, uio->uio_offset, uio->uio_resid, error); + return (error); + } + goto again; + } + } else if (bp->nb_dirty) { + int firstpg, lastpg; + u_int32_t pagemask; + /* calculate write range pagemask */ + firstpg = on/PAGE_SIZE; + lastpg = (on+n-1)/PAGE_SIZE; + pagemask = ((1 << (lastpg+1)) - 1) & ~((1 << firstpg) - 1); + /* check if there are dirty pages outside the write range */ + if (bp->nb_dirty & ~pagemask) { + FSDBG(515, vp, uio->uio_offset, bp, 0xd15c002); + /* write/commit buffer "synchronously" */ + /* (NB_STABLE indicates that data writes should be FILESYNC) */ + CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL)); + SET(bp->nb_flags, (NB_ASYNC | NB_STABLE)); + error = nfs_buf_write(bp); + if (error) { + FSDBG_BOT(515, vp, uio->uio_offset, uio->uio_resid, error); + return (error); + } + goto again; + } + /* if the first or last pages are already dirty */ + /* make sure that the dirty range encompasses those pages */ + if (NBPGDIRTY(bp,firstpg) || NBPGDIRTY(bp,lastpg)) { + FSDBG(515, vp, uio->uio_offset, bp, 0xd15c003); + bp->nb_dirtyoff = min(on, firstpg * PAGE_SIZE); + if (NBPGDIRTY(bp,lastpg)) { + bp->nb_dirtyend = (lastpg+1) * PAGE_SIZE; + /* clip to EOF */ + if (NBOFF(bp) + bp->nb_dirtyend > np->n_size) + bp->nb_dirtyend = np->n_size - NBOFF(bp); + } else + bp->nb_dirtyend = on+n; + } + } + /* - * Resize nfsnode *after* we busy the buffer to prevent - * readers from reading garbage. + * Are we extending the size of the file with this write? + * If so, update file size now that we have the block. * If there was a partial buf at the old eof, validate * and zero the new bytes. */ if (uio->uio_offset + n > np->n_size) { - struct buf *bp0 = NULL; - daddr_t bn = np->n_size / biosize; - int off = np->n_size & (biosize - 1); + struct nfsbuf *eofbp = NULL; + daddr_t eofbn = np->n_size / biosize; + int eofoff = np->n_size % biosize; + int neweofoff = (uio->uio_offset + n) % biosize; + + FSDBG(515, 0xb1ffa000, uio->uio_offset + n, eofoff, neweofoff); - if (off && bn < lbn && incore(vp, bn)) - bp0 = nfs_getcacheblk(vp, bn, biosize, p, - BLK_WRITE); + if (eofoff && eofbn < lbn && nfs_buf_incore(vp, eofbn)) + eofbp = nfs_buf_get(vp, eofbn, biosize, p, BLK_WRITE); + + /* if we're extending within the same last block */ + /* and the block is flagged as being cached... */ + if ((lbn == eofbn) && ISSET(bp->nb_flags, NB_CACHE)) { + /* ...check that all pages in buffer are valid */ + int endpg = ((neweofoff ? 
neweofoff : biosize) - 1)/PAGE_SIZE; + u_int32_t pagemask; + /* pagemask only has to extend to last page being written to */ + pagemask = (1 << (endpg+1)) - 1; + FSDBG(515, 0xb1ffa001, bp->nb_valid, pagemask, 0); + if ((bp->nb_valid & pagemask) != pagemask) { + /* zerofill any hole */ + if (on > bp->nb_validend) { + int i; + for (i=bp->nb_validend/PAGE_SIZE; i <= (on - 1)/PAGE_SIZE; i++) + NBPGVALID_SET(bp, i); + NFS_BUF_MAP(bp); + FSDBG(516, bp, bp->nb_validend, on - bp->nb_validend, 0xf01e); + bzero((char *)bp->nb_data + bp->nb_validend, + on - bp->nb_validend); + } + /* zerofill any trailing data in the last page */ + if (neweofoff) { + NFS_BUF_MAP(bp); + FSDBG(516, bp, neweofoff, PAGE_SIZE - (neweofoff & PAGE_MASK), 0xe0f); + bzero((char *)bp->nb_data + neweofoff, + PAGE_SIZE - (neweofoff & PAGE_MASK)); + } + } + } np->n_flag |= NMODIFIED; np->n_size = uio->uio_offset + n; ubc_setsize(vp, (off_t)np->n_size); /* XXX errors */ - if (bp0) { - bzero((char *)bp0->b_data + off, biosize - off); - bp0->b_validend = biosize; - brelse(bp0); + if (eofbp) { + /* + * We may need to zero any previously invalid data + * after the old EOF in the previous EOF buffer. + * + * For the old last page, don't zero bytes if there + * are invalid bytes in that page (i.e. the page isn't + * currently valid). + * For pages after the old last page, zero them and + * mark them as valid. + */ + char *d; + int i; + if (ISSET(vp->v_flag, VNOCACHE_DATA)) + SET(eofbp->nb_flags, (NB_NOCACHE|NB_INVAL)); + NFS_BUF_MAP(eofbp); + FSDBG(516, eofbp, eofoff, biosize - eofoff, 0xe0fff01e); + d = eofbp->nb_data; + i = eofoff/PAGE_SIZE; + while (eofoff < biosize) { + int poff = eofoff & PAGE_MASK; + if (!poff || NBPGVALID(eofbp,i)) { + bzero(d + eofoff, PAGE_SIZE - poff); + NBPGVALID_SET(eofbp, i); + } + if (bp->nb_validend == eofoff) + bp->nb_validend += PAGE_SIZE - poff; + eofoff += PAGE_SIZE - poff; + i++; + } + nfs_buf_release(eofbp); } } - /* - * NFS has embedded ucred so crhold() risks zone corruption - */ - if (bp->b_wcred == NOCRED) - bp->b_wcred = crdup(cred); /* * If dirtyend exceeds file size, chop it down. This should * not occur unless there is a race. */ - if ((off_t)bp->b_blkno * DEV_BSIZE + bp->b_dirtyend > - np->n_size) - bp->b_dirtyend = np->n_size - (off_t)bp->b_blkno * - DEV_BSIZE; + if (NBOFF(bp) + bp->nb_dirtyend > np->n_size) + bp->nb_dirtyend = np->n_size - NBOFF(bp); /* - * UBC doesn't (yet) handle partial pages so nfs_biowrite was - * hacked to never bdwrite, to start every little write right - * away. Running IE Avie noticed the performance problem, thus - * this code, which permits those delayed writes by ensuring an - * initial read of the entire page. The read may hit eof - * ("short read") but that we will handle. + * UBC doesn't handle partial pages, so we need to make sure + * that any pages left in the page cache are completely valid. + * + * Writes that are smaller than a block are delayed if they + * don't extend to the end of the block. * - * We are quite dependant on the correctness of B_CACHE so check - * that first in case of problems. + * If the block isn't (completely) cached, we may need to read + * in some parts of pages that aren't covered by the write. + * If the write offset (on) isn't page aligned, we'll need to + * read the start of the first page being written to. Likewise, + * if the offset of the end of the write (on+n) isn't page aligned, + * we'll need to read the end of the last page being written to. 
+ * + * Notes: + * We don't want to read anything we're just going to write over. + * We don't want to issue multiple I/Os if we don't have to + * (because they're synchronous rpcs). + * We don't want to read anything we already have modified in the + * page cache. */ - if (!ISSET(bp->b_flags, B_CACHE) && n < PAGE_SIZE) { - boff = (off_t)bp->b_blkno * DEV_BSIZE; - auio.uio_iov = &iov; - auio.uio_iovcnt = 1; - auio.uio_offset = boff; - auio.uio_resid = PAGE_SIZE; - auio.uio_segflg = UIO_SYSSPACE; - auio.uio_rw = UIO_READ; - auio.uio_procp = p; - iov.iov_base = bp->b_data; - iov.iov_len = PAGE_SIZE; - error = nfs_readrpc(vp, &auio, cred); - if (error) { - bp->b_error = error; - SET(bp->b_flags, B_ERROR); - printf("nfs_write: readrpc %d", error); + if (!ISSET(bp->nb_flags, NB_CACHE) && n < biosize) { + int firstpg, lastpg, dirtypg; + int firstpgoff, lastpgoff; + start = end = -1; + firstpg = on/PAGE_SIZE; + firstpgoff = on & PAGE_MASK; + lastpg = (on+n-1)/PAGE_SIZE; + lastpgoff = (on+n) & PAGE_MASK; + if (firstpgoff && !NBPGVALID(bp,firstpg)) { + /* need to read start of first page */ + start = firstpg * PAGE_SIZE; + end = start + firstpgoff; } - if (auio.uio_resid > 0) - bzero(iov.iov_base, auio.uio_resid); - bp->b_validoff = 0; - bp->b_validend = PAGE_SIZE - auio.uio_resid; - if (np->n_size > boff + bp->b_validend) - bp->b_validend = min(np->n_size - boff, - PAGE_SIZE); - bp->b_dirtyoff = 0; - bp->b_dirtyend = 0; - } - - /* - * If the new write will leave a contiguous dirty - * area, just update the b_dirtyoff and b_dirtyend, - * otherwise try to extend the dirty region. - */ - if (bp->b_dirtyend > 0 && - (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) { - off_t start, end; - - boff = (off_t)bp->b_blkno * DEV_BSIZE; - if (on > bp->b_dirtyend) { - start = boff + bp->b_validend; - end = boff + on; - } else { - start = boff + on + n; - end = boff + bp->b_validoff; + if (lastpgoff && !NBPGVALID(bp,lastpg)) { + /* need to read end of last page */ + if (start < 0) + start = (lastpg * PAGE_SIZE) + lastpgoff; + end = (lastpg + 1) * PAGE_SIZE; } - - /* - * It may be that the valid region in the buffer - * covers the region we want, in which case just - * extend the dirty region. Otherwise we try to - * extend the valid region. - */ if (end > start) { + /* need to read the data in range: start...end-1 */ + + /* + * XXX: If we know any of these reads are beyond the + * current EOF (what np->n_size was before we possibly + * just modified it above), we could short-circuit the + * reads and just zero buffer. No need to make a trip + * across the network to read nothing. + */ + + /* first, check for dirty pages in between */ + /* if there are, we'll have to do two reads because */ + /* we don't want to overwrite the dirty pages. */ + for (dirtypg=start/PAGE_SIZE; dirtypg <= (end-1)/PAGE_SIZE; dirtypg++) + if (NBPGDIRTY(bp,dirtypg)) + break; + + /* if start is at beginning of page, try */ + /* to get any preceeding pages as well. 
*/ + if (!(start & PAGE_MASK)) { + /* stop at next dirty/valid page or start of block */ + for (; start > 0; start-=PAGE_SIZE) + if (NBPGVALID(bp,((start-1)/PAGE_SIZE))) + break; + } + + NFS_BUF_MAP(bp); + /* setup uio for read(s) */ + boff = NBOFF(bp); auio.uio_iov = &iov; auio.uio_iovcnt = 1; - auio.uio_offset = start; - auio.uio_resid = end - start; auio.uio_segflg = UIO_SYSSPACE; auio.uio_rw = UIO_READ; auio.uio_procp = p; - iov.iov_base = bp->b_data + (start - boff); - iov.iov_len = end - start; + + if (dirtypg <= (end-1)/PAGE_SIZE) { + /* there's a dirty page in the way, so just do two reads */ + /* we'll read the preceding data here */ + auio.uio_offset = boff + start; + auio.uio_resid = iov.iov_len = on - start; + iov.iov_base = bp->nb_data + start; + error = nfs_readrpc(vp, &auio, cred); + if (error) { + bp->nb_error = error; + SET(bp->nb_flags, NB_ERROR); + printf("nfs_write: readrpc %d", error); + } + if (auio.uio_resid > 0) { + FSDBG(516, bp, iov.iov_base - bp->nb_data, auio.uio_resid, 0xd00dee01); + bzero(iov.iov_base, auio.uio_resid); + } + /* update validoff/validend if necessary */ + if ((bp->nb_validoff < 0) || (bp->nb_validoff > start)) + bp->nb_validoff = start; + if ((bp->nb_validend < 0) || (bp->nb_validend < on)) + bp->nb_validend = on; + if (np->n_size > boff + bp->nb_validend) + bp->nb_validend = min(np->n_size - (boff + start), biosize); + /* validate any pages before the write offset */ + for (; start < on/PAGE_SIZE; start+=PAGE_SIZE) + NBPGVALID_SET(bp, start/PAGE_SIZE); + /* adjust start to read any trailing data */ + start = on+n; + } + + /* if end is at end of page, try to */ + /* get any following pages as well. */ + if (!(end & PAGE_MASK)) { + /* stop at next valid page or end of block */ + for (; end < bufsize; end+=PAGE_SIZE) + if (NBPGVALID(bp,end/PAGE_SIZE)) + break; + } + + /* now we'll read the (rest of the) data */ + auio.uio_offset = boff + start; + auio.uio_resid = iov.iov_len = end - start; + iov.iov_base = bp->nb_data + start; error = nfs_readrpc(vp, &auio, cred); - /* - * If we couldn't read, do not do a VOP_BWRITE - * as originally coded. That could also error - * and looping back to "again" as it was doing - * could have us stuck trying to write same buf - * again. nfs_write, will get the entire region - * if nfs_readrpc succeeded. If unsuccessful - * we should just error out. Errors like ESTALE - * would keep us looping rather than transient - * errors justifying a retry. We can return here - * instead of altering dirty region later. We - * did not write old dirty region at this point. - */ if (error) { - bp->b_error = error; - SET(bp->b_flags, B_ERROR); - printf("nfs_write: readrpc2 %d", error); - brelse(bp); - return (error); + bp->nb_error = error; + SET(bp->nb_flags, NB_ERROR); + printf("nfs_write: readrpc %d", error); } - /* - * The read worked. - * If there was a short read, just zero fill. 
- */ - if (auio.uio_resid > 0) + if (auio.uio_resid > 0) { + FSDBG(516, bp, iov.iov_base - bp->nb_data, auio.uio_resid, 0xd00dee02); bzero(iov.iov_base, auio.uio_resid); - if (on > bp->b_dirtyend) - bp->b_validend = on; - else - bp->b_validoff = on + n; + } + /* update validoff/validend if necessary */ + if ((bp->nb_validoff < 0) || (bp->nb_validoff > start)) + bp->nb_validoff = start; + if ((bp->nb_validend < 0) || (bp->nb_validend < end)) + bp->nb_validend = end; + if (np->n_size > boff + bp->nb_validend) + bp->nb_validend = min(np->n_size - (boff + start), biosize); + /* validate any pages before the write offset's page */ + for (; start < trunc_page_32(on); start+=PAGE_SIZE) + NBPGVALID_SET(bp, start/PAGE_SIZE); + /* validate any pages after the range of pages being written to */ + for (; (end - 1) > round_page_32(on+n-1); end-=PAGE_SIZE) + NBPGVALID_SET(bp, (end-1)/PAGE_SIZE); + /* Note: pages being written to will be validated when written */ } - /* - * We now have a valid region which extends up to the - * dirty region which we want. - */ - if (on > bp->b_dirtyend) - bp->b_dirtyend = on; - else - bp->b_dirtyoff = on + n; } - if (ISSET(bp->b_flags, B_ERROR)) { - error = bp->b_error; - brelse(bp); + + if (ISSET(bp->nb_flags, NB_ERROR)) { + error = bp->nb_error; + nfs_buf_release(bp); + FSDBG_BOT(515, vp, uio->uio_offset, uio->uio_resid, error); return (error); } - /* - * NFS has embedded ucred so crhold() risks zone corruption - */ - if (bp->b_wcred == NOCRED) - bp->b_wcred = crdup(cred); + np->n_flag |= NMODIFIED; /* * Check for valid write lease and get one as required. - * In case getblk() and/or bwrite() delayed us. + * In case nfs_buf_get() and/or nfs_buf_write() delayed us. */ if ((nmp->nm_flag & NFSMNT_NQNFS) && NQNFS_CKINVALID(vp, np, ND_WRITE)) { @@ -767,124 +1960,222 @@ again: error = nqnfs_getlease(vp, ND_WRITE, cred, p); } while (error == NQNFS_EXPIRED); if (error) { - brelse(bp); + nfs_buf_release(bp); + FSDBG_BOT(515, vp, uio->uio_offset, 0x11220001, error); return (error); } if (np->n_lrev != np->n_brev || (np->n_flag & NQNFSNONCACHE)) { - brelse(bp); + nfs_buf_release(bp); error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); - if (error) + if (error) { + FSDBG_BOT(515, vp, uio->uio_offset, 0x11220002, error); return (error); + } np->n_brev = np->n_lrev; goto again; } } - error = uiomove((char *)bp->b_data + on, n, uio); + NFS_BUF_MAP(bp); + error = uiomove((char *)bp->nb_data + on, n, uio); if (error) { - SET(bp->b_flags, B_ERROR); - brelse(bp); + SET(bp->nb_flags, NB_ERROR); + nfs_buf_release(bp); + FSDBG_BOT(515, vp, uio->uio_offset, uio->uio_resid, error); return (error); } - if (bp->b_dirtyend > 0) { - bp->b_dirtyoff = min(on, bp->b_dirtyoff); - bp->b_dirtyend = max((on + n), bp->b_dirtyend); + + /* validate any pages written to */ + start = on & ~PAGE_MASK; + for (; start < on+n; start += PAGE_SIZE) { + NBPGVALID_SET(bp, start/PAGE_SIZE); + /* + * This may seem a little weird, but we don't actually set the + * dirty bits for writes. This is because we keep the dirty range + * in the nb_dirtyoff/nb_dirtyend fields. Also, particularly for + * delayed writes, when we give the pages back to the VM we don't + * want to keep them marked dirty, because when we later write the + * buffer we won't be able to tell which pages were written dirty + * and which pages were mmapped and dirtied. 
+ */ + } + if (bp->nb_dirtyend > 0) { + bp->nb_dirtyoff = min(on, bp->nb_dirtyoff); + bp->nb_dirtyend = max((on + n), bp->nb_dirtyend); } else { - bp->b_dirtyoff = on; - bp->b_dirtyend = on + n; + bp->nb_dirtyoff = on; + bp->nb_dirtyend = on + n; } - if (bp->b_validend == 0 || bp->b_validend < bp->b_dirtyoff || - bp->b_validoff > bp->b_dirtyend) { - bp->b_validoff = bp->b_dirtyoff; - bp->b_validend = bp->b_dirtyend; + if (bp->nb_validend <= 0 || bp->nb_validend < bp->nb_dirtyoff || + bp->nb_validoff > bp->nb_dirtyend) { + bp->nb_validoff = bp->nb_dirtyoff; + bp->nb_validend = bp->nb_dirtyend; } else { - bp->b_validoff = min(bp->b_validoff, bp->b_dirtyoff); - bp->b_validend = max(bp->b_validend, bp->b_dirtyend); + bp->nb_validoff = min(bp->nb_validoff, bp->nb_dirtyoff); + bp->nb_validend = max(bp->nb_validend, bp->nb_dirtyend); } + if (!ISSET(bp->nb_flags, NB_CACHE)) + nfs_buf_normalize_valid_range(np, bp); /* * Since this block is being modified, it must be written * again and not just committed. */ - CLR(bp->b_flags, B_NEEDCOMMIT); + if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) { + np->n_needcommitcnt--; + CHECK_NEEDCOMMITCNT(np); + } + CLR(bp->nb_flags, NB_NEEDCOMMIT); - /* - * If the lease is non-cachable or IO_SYNC do bwrite(). - */ - if ((np->n_flag & NQNFSNONCACHE) || (ioflag & IO_SYNC)) { - bp->b_proc = p; - error = VOP_BWRITE(bp); - if (error) + if ((np->n_flag & NQNFSNONCACHE) || + (ioflag & IO_SYNC) || (vp->v_flag & VNOCACHE_DATA)) { + bp->nb_proc = p; + error = nfs_buf_write(bp); + if (error) { + FSDBG_BOT(515, vp, uio->uio_offset, + uio->uio_resid, error); return (error); + } if (np->n_flag & NQNFSNONCACHE) { error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); - if (error) + if (error) { + FSDBG_BOT(515, vp, uio->uio_offset, + uio->uio_resid, error); return (error); + } } - } else if ((n + on) == biosize && - (nmp->nm_flag & NFSMNT_NQNFS) == 0) { - bp->b_proc = (struct proc *)0; - SET(bp->b_flags, B_ASYNC); - (void)nfs_writebp(bp, 0); + } else if ((n + on) == biosize && (nmp->nm_flag & NFSMNT_NQNFS) == 0) { + bp->nb_proc = (struct proc *)0; + SET(bp->nb_flags, NB_ASYNC); + nfs_buf_write(bp); } else - bdwrite(bp); + nfs_buf_write_delayed(bp); + + if (np->n_needcommitcnt > (nbuf/16)) + nfs_flushcommits(vp, p); + } while (uio->uio_resid > 0 && n > 0); + + FSDBG_BOT(515, vp, uio->uio_offset, uio->uio_resid, 0); return (0); } - /* - * Get an nfs cache block. - * Allocate a new one if the block isn't currently in the cache - * and return the block marked busy. If the calling process is - * interrupted by a signal for an interruptible mount point, return - * NULL. + * Flush out and invalidate all buffers associated with a vnode. + * Called with the underlying object locked. 
*/ -static struct buf * -nfs_getcacheblk(vp, bn, size, p, operation) - struct vnode *vp; - daddr_t bn; - int size; +static int +nfs_vinvalbuf_internal(vp, flags, cred, p, slpflag, slptimeo) + register struct vnode *vp; + int flags; + struct ucred *cred; struct proc *p; - int operation; /* defined in sys/buf.h */ + int slpflag, slptimeo; { - register struct buf *bp; - struct nfsmount *nmp = VFSTONFS(vp->v_mount); - /*due to getblk/vm interractions, use vm page size or less values */ - int biosize = min(vp->v_mount->mnt_stat.f_iosize, PAGE_SIZE); - - if (nbdwrite > ((nbuf/4)*3) && operation == BLK_WRITE) { -#define __BUFFERS_RECLAIMED 2 - struct buf *tbp[__BUFFERS_RECLAIMED]; - int i; - - /* too many delayed writes, try to free up some buffers */ - for (i = 0; i < __BUFFERS_RECLAIMED; i++) - tbp[i] = geteblk(512); - - /* Yield to IO thread */ - (void)tsleep((caddr_t)&nbdwrite, PCATCH, "nbdwrite", 1); + struct nfsbuf *bp; + struct nfsbuf *nbp, *blist; + int s, error = 0; + struct nfsnode *np = VTONFS(vp); - for (i = (__BUFFERS_RECLAIMED - 1); i >= 0; i--) - brelse(tbp[i]); + if (flags & V_SAVE) { + if (error = VOP_FSYNC(vp, cred, MNT_WAIT, p)) + return (error); + if (np->n_dirtyblkhd.lh_first) + panic("nfs_vinvalbuf: dirty bufs (vp 0x%x, bp 0x%x)", + vp, np->n_dirtyblkhd.lh_first); } - if (nmp->nm_flag & NFSMNT_INT) { - bp = getblk(vp, bn, size, PCATCH, 0, operation); - while (bp == (struct buf *)0) { - if (nfs_sigintr(nmp, (struct nfsreq *)0, p)) - return ((struct buf *)0); - bp = getblk(vp, bn, size, 0, 2 * hz, operation); - } - } else - bp = getblk(vp, bn, size, 0, 0, operation); - - if( vp->v_type == VREG) - bp->b_blkno = ((off_t)bn * biosize) / DEV_BSIZE; + for (;;) { + blist = np->n_cleanblkhd.lh_first; + if (!blist) + blist = np->n_dirtyblkhd.lh_first; + if (!blist) + break; - return (bp); + for (bp = blist; bp; bp = nbp) { + nbp = bp->nb_vnbufs.le_next; + s = splbio(); + if (ISSET(bp->nb_flags, NB_BUSY)) { + SET(bp->nb_flags, NB_WANTED); + FSDBG_TOP(556, vp, bp, NBOFF(bp), bp->nb_flags); + error = tsleep((caddr_t)bp, + slpflag | (PRIBIO + 1), "nfs_vinvalbuf", + slptimeo); + FSDBG_BOT(556, vp, bp, NBOFF(bp), bp->nb_flags); + splx(s); + if (error) { + FSDBG(554, vp, bp, -1, error); + return (error); + } + break; + } + FSDBG(554, vp, bp, NBOFF(bp), bp->nb_flags); + nfs_buf_remfree(bp); + SET(bp->nb_flags, NB_BUSY); + splx(s); + if ((flags & V_SAVE) && UBCINFOEXISTS(vp) && (NBOFF(bp) < np->n_size)) { + /* XXX extra paranoia: make sure we're not */ + /* somehow leaving any dirty data around */ + int mustwrite = 0; + int end = (NBOFF(bp) + bp->nb_bufsize >= np->n_size) ? 
+ bp->nb_bufsize : (np->n_size - NBOFF(bp)); + if (!ISSET(bp->nb_flags, NB_PAGELIST)) { + error = nfs_buf_upl_setup(bp); + if (error == EINVAL) { + /* vm object must no longer exist */ + /* hopefully we don't need to do */ + /* anything for this buffer */ + } else if (error) + printf("nfs_vinvalbuf: upl setup failed %d\n", + error); + bp->nb_valid = bp->nb_dirty = 0; + } + nfs_buf_upl_check(bp); + /* check for any dirty data before the EOF */ + if (bp->nb_dirtyend && bp->nb_dirtyoff < end) { + /* clip dirty range to EOF */ + if (bp->nb_dirtyend > end) + bp->nb_dirtyend = end; + mustwrite++; + } + bp->nb_dirty &= (1 << (round_page_32(end)/PAGE_SIZE)) - 1; + if (bp->nb_dirty) + mustwrite++; + if (mustwrite) { + FSDBG(554, vp, bp, 0xd00dee, bp->nb_flags); + if (!ISSET(bp->nb_flags, NB_PAGELIST)) + panic("nfs_vinvalbuf: dirty buffer without upl"); + /* gotta write out dirty data before invalidating */ + /* (NB_STABLE indicates that data writes should be FILESYNC) */ + /* (NB_NOCACHE indicates buffer should be discarded) */ + CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL | NB_ASYNC)); + SET(bp->nb_flags, NB_STABLE | NB_NOCACHE); + /* + * NFS has embedded ucred so crhold() risks zone corruption + */ + if (bp->nb_wcred == NOCRED) + bp->nb_wcred = crdup(cred); + error = nfs_buf_write(bp); + // Note: bp has been released + if (error) { + FSDBG(554, bp, 0xd00dee, 0xbad, error); + np->n_error = error; + np->n_flag |= NWRITEERR; + error = 0; + } + break; + } + } + SET(bp->nb_flags, NB_INVAL); + nfs_buf_release(bp); + } + } + if (np->n_dirtyblkhd.lh_first || np->n_cleanblkhd.lh_first) + panic("nfs_vinvalbuf: flush failed"); + return (0); } + /* * Flush and invalidate all dirty buffers. If another process is already * doing the flush, just wait for completion. @@ -902,7 +2193,9 @@ nfs_vinvalbuf(vp, flags, cred, p, intrflg) int error = 0, slpflag, slptimeo; int didhold = 0; - if ((nmp->nm_flag & NFSMNT_INT) == 0) + FSDBG_TOP(554, vp, flags, intrflg, 0); + + if (nmp && ((nmp->nm_flag & NFSMNT_INT) == 0)) intrflg = 0; if (intrflg) { slpflag = PCATCH; @@ -916,36 +2209,33 @@ nfs_vinvalbuf(vp, flags, cred, p, intrflg) */ while (np->n_flag & NFLUSHINPROG) { np->n_flag |= NFLUSHWANT; - error = tsleep((caddr_t)&np->n_flag, PRIBIO + 2, "nfsvinval", - slptimeo); - if (error && intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p)) - return (EINTR); + FSDBG_TOP(555, vp, flags, intrflg, np->n_flag); + error = tsleep((caddr_t)&np->n_flag, PRIBIO + 2, "nfsvinval", slptimeo); + FSDBG_BOT(555, vp, flags, intrflg, np->n_flag); + if (error && (error = nfs_sigintr(VFSTONFS(vp->v_mount), NULL, p))) { + FSDBG_BOT(554, vp, flags, intrflg, error); + return (error); + } } /* * Now, flush as required. */ np->n_flag |= NFLUSHINPROG; - error = vinvalbuf(vp, flags, cred, p, slpflag, 0); + error = nfs_vinvalbuf_internal(vp, flags, cred, p, slpflag, 0); while (error) { - /* we seem to be stuck in a loop here if the thread got aborted. - * nfs_flush will return EINTR. Not sure if that will cause - * other consequences due to EINTR having other meanings in NFS - * To handle, no dirty pages, it seems safe to just return from - * here. But if we did have dirty pages, how would we get them - * written out if thread was aborted? Some other strategy is - * necessary. 
-- EKN - */ - if ((intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p)) || - (error == EINTR && current_thread_aborted())) { + FSDBG(554, vp, 0, 0, error); + error = nfs_sigintr(VFSTONFS(vp->v_mount), NULL, p); + if (error) { np->n_flag &= ~NFLUSHINPROG; if (np->n_flag & NFLUSHWANT) { np->n_flag &= ~NFLUSHWANT; wakeup((caddr_t)&np->n_flag); } - return (EINTR); + FSDBG_BOT(554, vp, flags, intrflg, error); + return (error); } - error = vinvalbuf(vp, flags, cred, p, 0, slptimeo); + error = nfs_vinvalbuf_internal(vp, flags, cred, p, 0, slptimeo); } np->n_flag &= ~(NMODIFIED | NFLUSHINPROG); if (np->n_flag & NFLUSHWANT) { @@ -954,9 +2244,12 @@ nfs_vinvalbuf(vp, flags, cred, p, intrflg) } didhold = ubc_hold(vp); if (didhold) { - (void) ubc_clean(vp, 1); /* get the pages out of vm also */ + int rv = ubc_clean(vp, 1); /* get the pages out of vm also */ + if (!rv) + panic("nfs_vinvalbuf(): ubc_clean failed!"); ubc_rele(vp); } + FSDBG_BOT(554, vp, flags, intrflg, 0); return (0); } @@ -967,7 +2260,7 @@ nfs_vinvalbuf(vp, flags, cred, p, intrflg) */ int nfs_asyncio(bp, cred) - register struct buf *bp; + struct nfsbuf *bp; struct ucred *cred; { struct nfsmount *nmp; @@ -975,17 +2268,23 @@ nfs_asyncio(bp, cred) int gotiod; int slpflag = 0; int slptimeo = 0; - int error; + int error, error2; if (nfs_numasync == 0) return (EIO); - - nmp = VFSTONFS(bp->b_vp->v_mount); + + FSDBG_TOP(552, bp, bp ? NBOFF(bp) : 0, bp ? bp->nb_flags : 0, 0); + + nmp = ((bp != NULL) ? VFSTONFS(bp->nb_vp->v_mount) : NULL); again: - if (nmp->nm_flag & NFSMNT_INT) + if (nmp && nmp->nm_flag & NFSMNT_INT) slpflag = PCATCH; gotiod = FALSE; + /* no nfsbuf means tell nfsiod to process delwri list */ + if (!bp) + nfs_ioddelwri = 1; + /* * Find a free iod to process this request. */ @@ -1000,12 +2299,17 @@ again: i, nmp)); nfs_iodwant[i] = (struct proc *)0; nfs_iodmount[i] = nmp; - nmp->nm_bufqiods++; + if (nmp) + nmp->nm_bufqiods++; wakeup((caddr_t)&nfs_iodwant[i]); gotiod = TRUE; break; } + /* if we're just poking the delwri list, we're done */ + if (!bp) + return (0); + /* * If none are free, we may already have an iod working on this mount * point. If so, it will process our request. @@ -1023,19 +2327,31 @@ again: * If we have an iod which can process the request, then queue * the buffer. */ + FSDBG(552, bp, gotiod, i, nmp->nm_bufqiods); if (gotiod) { /* * Ensure that the queue never grows too large. */ while (nmp->nm_bufqlen >= 2*nfs_numasync) { + if (ISSET(bp->nb_flags, NB_IOD)) { + /* An nfsiod is attempting this async operation so */ + /* we must not fall asleep on the bufq because we */ + /* could be waiting on ourself. Just return error */ + /* and we'll do this operation syncrhonously. */ + goto out; + } + FSDBG(552, bp, nmp->nm_bufqlen, 2*nfs_numasync, -1); NFS_DPF(ASYNCIO, ("nfs_asyncio: waiting for mount %p queue to drain\n", nmp)); nmp->nm_bufqwant = TRUE; error = tsleep(&nmp->nm_bufq, slpflag | PRIBIO, "nfsaio", slptimeo); if (error) { - if (nfs_sigintr(nmp, NULL, bp->b_proc)) - return (EINTR); + error2 = nfs_sigintr(nmp, NULL, bp->nb_proc); + if (error2) { + FSDBG_BOT(552, bp, NBOFF(bp), bp->nb_flags, error2); + return (error2); + } if (slpflag == PCATCH) { slpflag = 0; slptimeo = 2 * hz; @@ -1052,35 +2368,38 @@ again: } } - if (ISSET(bp->b_flags, B_READ)) { - if (bp->b_rcred == NOCRED && cred != NOCRED) { + if (ISSET(bp->nb_flags, NB_READ)) { + if (bp->nb_rcred == NOCRED && cred != NOCRED) { /* * NFS has embedded ucred. 
* Can not crhold() here as that causes zone corruption */ - bp->b_rcred = crdup(cred); + bp->nb_rcred = crdup(cred); } } else { - SET(bp->b_flags, B_WRITEINPROG); - if (bp->b_wcred == NOCRED && cred != NOCRED) { + SET(bp->nb_flags, NB_WRITEINPROG); + if (bp->nb_wcred == NOCRED && cred != NOCRED) { /* * NFS has embedded ucred. * Can not crhold() here as that causes zone corruption */ - bp->b_wcred = crdup(cred); + bp->nb_wcred = crdup(cred); } } - TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, b_freelist); + TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, nb_free); nmp->nm_bufqlen++; + FSDBG_BOT(552, bp, NBOFF(bp), bp->nb_flags, 0); return (0); } +out: /* * All the iods are busy on other mounts, so return EIO to * force the caller to process the i/o synchronously. */ NFS_DPF(ASYNCIO, ("nfs_asyncio: no iods available, i/o is synchronous\n")); + FSDBG_BOT(552, bp, NBOFF(bp), bp->nb_flags, EIO); return (EIO); } @@ -1090,7 +2409,7 @@ again: */ int nfs_doio(bp, cr, p) - register struct buf *bp; + struct nfsbuf *bp; struct ucred *cr; struct proc *p; { @@ -1102,7 +2421,7 @@ nfs_doio(bp, cr, p) struct uio uio; struct iovec io; - vp = bp->b_vp; + vp = bp->nb_vp; np = VTONFS(vp); nmp = VFSTONFS(vp->v_mount); uiop = &uio; @@ -1111,66 +2430,34 @@ nfs_doio(bp, cr, p) uiop->uio_segflg = UIO_SYSSPACE; uiop->uio_procp = p; - /* - * With UBC, getblk() can return a buf with B_DONE set. - * This indicates that the VM has valid data for that page. - * NFS being stateless, this case poses a problem. - * By definition, the NFS server should always be consulted - * for the data in that page. - * So we choose to clear the B_DONE and to do the IO. - * - * XXX revisit this if there is a performance issue. - * XXX In that case, we could play the attribute cache games ... + /* + * we've decided to perform I/O for this block, + * so we couldn't possibly NB_DONE. So, clear it. */ - if (ISSET(bp->b_flags, B_DONE)) { - if (!ISSET(bp->b_flags, B_ASYNC)) + if (ISSET(bp->nb_flags, NB_DONE)) { + if (!ISSET(bp->nb_flags, NB_ASYNC)) panic("nfs_doio: done and not async"); - CLR(bp->b_flags, B_DONE); + CLR(bp->nb_flags, NB_DONE); } - FSDBG_TOP(256, np->n_size, bp->b_blkno * DEV_BSIZE, bp->b_bcount, - bp->b_flags); - FSDBG(257, bp->b_validoff, bp->b_validend, bp->b_dirtyoff, - bp->b_dirtyend); - /* - * Historically, paging was done with physio, but no more. - */ - if (ISSET(bp->b_flags, B_PHYS)) { - /* - * ...though reading /dev/drum still gets us here. 
- */ - io.iov_len = uiop->uio_resid = bp->b_bcount; - /* mapping was done by vmapbuf() */ - io.iov_base = bp->b_data; - uiop->uio_offset = (off_t)bp->b_blkno * DEV_BSIZE; - if (ISSET(bp->b_flags, B_READ)) { - uiop->uio_rw = UIO_READ; - nfsstats.read_physios++; - error = nfs_readrpc(vp, uiop, cr); - } else { - int com; - - iomode = NFSV3WRITE_DATASYNC; - uiop->uio_rw = UIO_WRITE; - nfsstats.write_physios++; - error = nfs_writerpc(vp, uiop, cr, &iomode, &com); - } - if (error) { - SET(bp->b_flags, B_ERROR); - bp->b_error = error; - } - } else if (ISSET(bp->b_flags, B_READ)) { - io.iov_len = uiop->uio_resid = bp->b_bcount; - io.iov_base = bp->b_data; + FSDBG_TOP(256, np->n_size, NBOFF(bp), bp->nb_bufsize, bp->nb_flags); + FSDBG(257, bp->nb_validoff, bp->nb_validend, bp->nb_dirtyoff, + bp->nb_dirtyend); + + if (ISSET(bp->nb_flags, NB_READ)) { + if (vp->v_type == VREG) + NFS_BUF_MAP(bp); + io.iov_len = uiop->uio_resid = bp->nb_bufsize; + io.iov_base = bp->nb_data; uiop->uio_rw = UIO_READ; switch (vp->v_type) { case VREG: - uiop->uio_offset = (off_t)bp->b_blkno * DEV_BSIZE; + uiop->uio_offset = NBOFF(bp); nfsstats.read_bios++; error = nfs_readrpc(vp, uiop, cr); - FSDBG(262, np->n_size, bp->b_blkno * DEV_BSIZE, - uiop->uio_resid, error); + FSDBG(262, np->n_size, NBOFF(bp), uiop->uio_resid, error); if (!error) { - bp->b_validoff = 0; + /* update valid range */ + bp->nb_validoff = 0; if (uiop->uio_resid) { /* * If len > 0, there is a hole in the file and @@ -1178,33 +2465,26 @@ nfs_doio(bp, cr, p) * the server yet. * Just zero fill the rest of the valid area. */ - diff = bp->b_bcount - uiop->uio_resid; - len = np->n_size - ((u_quad_t)bp->b_blkno * DEV_BSIZE + - diff); + diff = bp->nb_bufsize - uiop->uio_resid; + len = np->n_size - (NBOFF(bp) + diff); if (len > 0) { len = min(len, uiop->uio_resid); - bzero((char *)bp->b_data + diff, len); - bp->b_validend = diff + len; + bzero((char *)bp->nb_data + diff, len); + bp->nb_validend = diff + len; FSDBG(258, diff, len, 0, 1); } else - bp->b_validend = diff; + bp->nb_validend = diff; } else - bp->b_validend = bp->b_bcount; - - if (bp->b_validend < bp->b_bufsize) { - /* - * we're about to release a partial buffer after a - * read... 
the only way we should get here is if - * this buffer contains the EOF before releasing it, - * we'll zero out to the end of the buffer so that - * if a mmap of this page occurs, we'll see zero's - * even if a ftruncate extends the file in the - * meantime - */ - bzero((caddr_t)(bp->b_data + bp->b_validend), - bp->b_bufsize - bp->b_validend); - FSDBG(258, bp->b_validend, - bp->b_bufsize - bp->b_validend, 0, 2); + bp->nb_validend = bp->nb_bufsize; + bp->nb_valid = (1 << (round_page_32(bp->nb_validend)/PAGE_SIZE)) - 1; + if (bp->nb_validend & PAGE_MASK) { + /* valid range ends in the middle of a page so we */ + /* need to zero-fill any invalid data at the end */ + /* of the last page */ + bzero((caddr_t)(bp->nb_data + bp->nb_validend), + bp->nb_bufsize - bp->nb_validend); + FSDBG(258, bp->nb_validend, + bp->nb_bufsize - bp->nb_validend, 0, 2); } } if (p && (vp->v_flag & VTEXT) && @@ -1222,10 +2502,14 @@ nfs_doio(bp, cr, p) uiop->uio_offset = (off_t)0; nfsstats.readlink_bios++; error = nfs_readlinkrpc(vp, uiop, cr); + if (!error) { + bp->nb_validoff = 0; + bp->nb_validend = uiop->uio_offset; + } break; case VDIR: nfsstats.readdir_bios++; - uiop->uio_offset = ((u_quad_t)bp->b_lblkno) * NFS_DIRBLKSIZ; + uiop->uio_offset = NBOFF(bp); if (!(nmp->nm_flag & NFSMNT_NFSV3)) nmp->nm_flag &= ~NFSMNT_RDIRPLUS; /* dk@farm.org */ if (nmp->nm_flag & NFSMNT_RDIRPLUS) { @@ -1235,151 +2519,276 @@ nfs_doio(bp, cr, p) } if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0) error = nfs_readdirrpc(vp, uiop, cr); + if (!error) { + bp->nb_validoff = 0; + bp->nb_validend = uiop->uio_offset - NBOFF(bp); + bp->nb_valid = (1 << (round_page_32(bp->nb_validend)/PAGE_SIZE)) - 1; + } break; default: printf("nfs_doio: type %x unexpected\n", vp->v_type); break; }; if (error) { - SET(bp->b_flags, B_ERROR); - bp->b_error = error; + SET(bp->nb_flags, NB_ERROR); + bp->nb_error = error; } + } else { + /* we're doing a write */ + int doff, dend = 0; + + /* We need to make sure the pages are locked before doing I/O. */ + if (!ISSET(bp->nb_flags, NB_META) && UBCISVALID(vp)) { + if (!ISSET(bp->nb_flags, NB_PAGELIST)) { + error = nfs_buf_upl_setup(bp); + if (error) { + printf("nfs_doio: upl create failed %d\n", error); + SET(bp->nb_flags, NB_ERROR); + bp->nb_error = EIO; + return (EIO); + } + nfs_buf_upl_check(bp); + } + } + + if (ISSET(bp->nb_flags, NB_WASDIRTY)) { + FSDBG(256, bp, NBOFF(bp), bp->nb_dirty, 0xd00dee); + /* + * There are pages marked dirty that need to be written out. + * + * We don't want to just combine the write range with the + * range of pages that are dirty because that could cause us + * to write data that wasn't actually written to. + * We also don't want to write data more than once. + * + * If the dirty range just needs to be committed, we do that. + * Otherwise, we write the dirty range and clear the dirty bits + * for any COMPLETE pages covered by that range. + * If there are dirty pages left after that, we write out the + * parts that we haven't written yet. + */ + } + /* - * mapped I/O may have altered any bytes, so we extend - * the dirty zone to the valid zone. For best performance - * a better solution would be to save & restore page dirty bits - * around the uiomove which brings write-data into the buffer. - * Then here we'd check if the page is dirty rather than WASMAPPED - * Also vnode_pager would change - if a page is clean it might - * still need to be written due to DELWRI. + * If NB_NEEDCOMMIT is set, a commit rpc may do the trick. If not + * an actual write will have to be done. 
+ * If NB_WRITEINPROG is already set, then push it with a write anyhow. */ - if (UBCINFOEXISTS(vp) && ubc_issetflags(vp, UI_WASMAPPED)) { - bp->b_dirtyoff = min(bp->b_dirtyoff, bp->b_validoff); - bp->b_dirtyend = max(bp->b_dirtyend, bp->b_validend); + if ((bp->nb_flags & (NB_NEEDCOMMIT | NB_WRITEINPROG)) == NB_NEEDCOMMIT) { + doff = NBOFF(bp) + bp->nb_dirtyoff; + SET(bp->nb_flags, NB_WRITEINPROG); + error = nfs_commit(vp, doff, bp->nb_dirtyend - bp->nb_dirtyoff, + bp->nb_wcred, bp->nb_proc); + CLR(bp->nb_flags, NB_WRITEINPROG); + if (!error) { + bp->nb_dirtyoff = bp->nb_dirtyend = 0; + CLR(bp->nb_flags, NB_NEEDCOMMIT); + np->n_needcommitcnt--; + CHECK_NEEDCOMMITCNT(np); + } else if (error == NFSERR_STALEWRITEVERF) + nfs_clearcommit(vp->v_mount); } - if ((off_t)bp->b_blkno * DEV_BSIZE + bp->b_dirtyend > np->n_size) - bp->b_dirtyend = np->n_size - (off_t)bp->b_blkno * DEV_BSIZE; - - if (bp->b_dirtyend > bp->b_dirtyoff) { - io.iov_len = uiop->uio_resid = bp->b_dirtyend - bp->b_dirtyoff; - uiop->uio_offset = (off_t)bp->b_blkno * DEV_BSIZE + - bp->b_dirtyoff; - io.iov_base = (char *)bp->b_data + bp->b_dirtyoff; - uiop->uio_rw = UIO_WRITE; - nfsstats.write_bios++; - if ((bp->b_flags & (B_ASYNC | B_NEEDCOMMIT | B_NOCACHE)) == - B_ASYNC) + if (!error && bp->nb_dirtyend > 0) { + /* there's a dirty range that needs to be written out */ + u_int32_t pagemask; + int firstpg, lastpg; + + if (NBOFF(bp) + bp->nb_dirtyend > np->n_size) + bp->nb_dirtyend = np->n_size - NBOFF(bp); + + NFS_BUF_MAP(bp); + + doff = bp->nb_dirtyoff; + dend = bp->nb_dirtyend; + + /* if doff page is dirty, move doff to start of page */ + if (NBPGDIRTY(bp,doff/PAGE_SIZE)) + doff -= doff & PAGE_MASK; + /* try to expand write range to include preceding dirty pages */ + if (!(doff & PAGE_MASK)) + while (doff > 0 && NBPGDIRTY(bp,(doff-1)/PAGE_SIZE)) + doff -= PAGE_SIZE; + /* if dend page is dirty, move dend to start of next page */ + if ((dend & PAGE_MASK) && NBPGDIRTY(bp,dend/PAGE_SIZE)) + dend = round_page_32(dend); + /* try to expand write range to include trailing dirty pages */ + if (!(dend & PAGE_MASK)) + while (dend < bp->nb_bufsize && NBPGDIRTY(bp,dend/PAGE_SIZE)) + dend += PAGE_SIZE; + /* make sure to keep dend clipped to EOF */ + if (NBOFF(bp) + dend > np->n_size) + dend = np->n_size - NBOFF(bp); + /* calculate range of complete pages being written */ + firstpg = round_page_32(doff) / PAGE_SIZE; + lastpg = (trunc_page_32(dend) - 1)/ PAGE_SIZE; + /* calculate mask for that page range */ + pagemask = ((1 << (lastpg+1)) - 1) & ~((1 << firstpg) - 1); + + /* compare page mask to nb_dirty; if there are other dirty pages */ + /* then write FILESYNC; otherwise, write UNSTABLE if async and */ + /* not needcommit/nocache/call; otherwise write FILESYNC */ + if (bp->nb_dirty & ~pagemask) + iomode = NFSV3WRITE_FILESYNC; + else if ((bp->nb_flags & (NB_ASYNC | NB_NEEDCOMMIT | NB_NOCACHE | NB_STABLE)) == NB_ASYNC) iomode = NFSV3WRITE_UNSTABLE; else iomode = NFSV3WRITE_FILESYNC; - SET(bp->b_flags, B_WRITEINPROG); + + /* write the dirty range */ + io.iov_len = uiop->uio_resid = dend - doff; + uiop->uio_offset = NBOFF(bp) + doff; + io.iov_base = (char *)bp->nb_data + doff; + uiop->uio_rw = UIO_WRITE; + + nfsstats.write_bios++; + + SET(bp->nb_flags, NB_WRITEINPROG); error = nfs_writerpc(vp, uiop, cr, &iomode, &must_commit); - if (!error && iomode == NFSV3WRITE_UNSTABLE) - SET(bp->b_flags, B_NEEDCOMMIT); - else - CLR(bp->b_flags, B_NEEDCOMMIT); - CLR(bp->b_flags, B_WRITEINPROG); + if (must_commit) + nfs_clearcommit(vp->v_mount); + /* clear dirty 
bits for pages we've written */ + if (!error) + bp->nb_dirty &= ~pagemask; + /* set/clear needcommit flag */ + if (!error && iomode == NFSV3WRITE_UNSTABLE) { + if (!ISSET(bp->nb_flags, NB_NEEDCOMMIT)) + np->n_needcommitcnt++; + SET(bp->nb_flags, NB_NEEDCOMMIT); + /* make sure nb_dirtyoff/nb_dirtyend reflect actual range written */ + bp->nb_dirtyoff = doff; + bp->nb_dirtyend = dend; + } else { + if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) { + np->n_needcommitcnt--; + CHECK_NEEDCOMMITCNT(np); + } + CLR(bp->nb_flags, NB_NEEDCOMMIT); + } + CLR(bp->nb_flags, NB_WRITEINPROG); /* - * For an interrupted write, the buffer is still valid - * and the write hasn't been pushed to the server yet, - * so we can't set B_ERROR and report the interruption - * by setting B_EINTR. For the B_ASYNC case, B_EINTR - * is not relevant, so the rpc attempt is essentially - * a noop. For the case of a V3 write rpc not being - * committed to stable storage, the block is still - * dirty and requires either a commit rpc or another - * write rpc with iomode == NFSV3WRITE_FILESYNC before - * the block is reused. This is indicated by setting - * the B_DELWRI and B_NEEDCOMMIT flags. + * For an interrupted write, the buffer is still valid and the write + * hasn't been pushed to the server yet, so we can't set NB_ERROR and + * report the interruption by setting NB_EINTR. For the NB_ASYNC case, + * NB_EINTR is not relevant. + * + * For the case of a V3 write rpc not being committed to stable + * storage, the block is still dirty and requires either a commit rpc + * or another write rpc with iomode == NFSV3WRITE_FILESYNC before the + * block is reused. This is indicated by setting the NB_DELWRI and + * NB_NEEDCOMMIT flags. */ - if (error == EINTR || (!error && bp->b_flags & B_NEEDCOMMIT)) { - int s; - - CLR(bp->b_flags, B_INVAL | B_NOCACHE); - if (!ISSET(bp->b_flags, B_DELWRI)) { - SET(bp->b_flags, B_DELWRI); - nbdwrite++; - } - FSDBG(261, bp->b_validoff, bp->b_validend, - bp->b_bufsize, bp->b_bcount); - /* - * Since for the B_ASYNC case, nfs_bwrite() has - * reassigned the buffer to the clean list, we have to - * reassign it back to the dirty one. Ugh. - */ - if (ISSET(bp->b_flags, B_ASYNC)) { - s = splbio(); - reassignbuf(bp, vp); - splx(s); - } else { - SET(bp->b_flags, B_EINTR); - } + if (error == EINTR || (!error && bp->nb_flags & NB_NEEDCOMMIT)) { + CLR(bp->nb_flags, NB_INVAL | NB_NOCACHE); + if (!ISSET(bp->nb_flags, NB_DELWRI)) { + SET(bp->nb_flags, NB_DELWRI); + nfs_nbdwrite++; + NFSBUFCNTCHK(); + } + FSDBG(261, bp->nb_validoff, bp->nb_validend, + bp->nb_bufsize, 0); + /* + * Since for the NB_ASYNC case, nfs_bwrite() has + * reassigned the buffer to the clean list, we have to + * reassign it back to the dirty one. Ugh. + */ + if (ISSET(bp->nb_flags, NB_ASYNC)) { + /* move to dirty list */ + int s = splbio(); + if (bp->nb_vnbufs.le_next != NFSNOLIST) + LIST_REMOVE(bp, nb_vnbufs); + LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs); + splx(s); + } else { + SET(bp->nb_flags, NB_EINTR); + } } else { + /* either there's an error or we don't need to commit */ if (error) { - SET(bp->b_flags, B_ERROR); - bp->b_error = np->n_error = error; - np->n_flag |= NWRITEERR; - } - bp->b_dirtyoff = bp->b_dirtyend = 0; - - /* - * validoff and validend represent the real data present - * in this buffer if validoff is non-zero, than we have - * to invalidate the buffer and kill the page when - * biodone is called... 
the same is also true when - * validend doesn't extend all the way to the end of the - * buffer and validend doesn't equate to the current - * EOF... eventually we need to deal with this in a more - * humane way (like keeping the partial buffer without - * making it immediately available to the VM page cache) - */ - if (bp->b_validoff) - SET(bp->b_flags, B_INVAL); - else - if (bp->b_validend < bp->b_bufsize) { - if ((off_t)bp->b_blkno * DEV_BSIZE + - bp->b_validend == np->n_size) { - bzero((caddr_t)(bp->b_data + - bp->b_validend), - bp->b_bufsize - bp->b_validend); - FSDBG(259, bp->b_validend, - bp->b_bufsize - bp->b_validend, 0, - 0); - } else - SET(bp->b_flags, B_INVAL); + SET(bp->nb_flags, NB_ERROR); + bp->nb_error = np->n_error = error; + np->n_flag |= NWRITEERR; } + /* clear the dirty range */ + bp->nb_dirtyoff = bp->nb_dirtyend = 0; } + } + + if (!error && bp->nb_dirty) { + /* there are pages marked dirty that need to be written out */ + int pg, cnt, npages, off, len; + + nfsstats.write_bios++; - } else { - if (bp->b_validoff || - (bp->b_validend < bp->b_bufsize && - (off_t)bp->b_blkno * DEV_BSIZE + bp->b_validend != - np->n_size)) { - SET(bp->b_flags, B_INVAL); + NFS_BUF_MAP(bp); + + /* + * we do these writes synchronously because we can't really + * support the unstable/needommit method. We could write + * them unstable, clear the dirty bits, and then commit the + * whole block later, but if we need to rewrite the data, we + * won't have any idea which pages were written because that + * info can't be stored in the nb_dirtyoff/nb_dirtyend. We + * also can't leave the dirty bits set because then we wouldn't + * be able to tell if the pages were re-dirtied between the end + * of the write and the commit. + */ + iomode = NFSV3WRITE_FILESYNC; + uiop->uio_rw = UIO_WRITE; + + SET(bp->nb_flags, NB_WRITEINPROG); + npages = bp->nb_bufsize/PAGE_SIZE; + for (pg=0; pg < npages; pg++) { + if (!NBPGDIRTY(bp,pg)) + continue; + cnt = 1; + while (((pg+cnt) < npages) && NBPGDIRTY(bp,pg+cnt)) + cnt++; + /* write cnt pages starting with page pg */ + off = pg * PAGE_SIZE; + len = cnt * PAGE_SIZE; + + /* clip writes to EOF */ + if (NBOFF(bp) + off + len > np->n_size) + len -= (NBOFF(bp) + off + len) - np->n_size; + if (len > 0) { + io.iov_len = uiop->uio_resid = len; + uiop->uio_offset = NBOFF(bp) + off; + io.iov_base = (char *)bp->nb_data + off; + error = nfs_writerpc(vp, uiop, cr, &iomode, &must_commit); + if (must_commit) + nfs_clearcommit(vp->v_mount); + if (error) + break; + } + /* clear dirty bits */ + while (cnt--) { + bp->nb_dirty &= ~(1 << pg); + /* leave pg on last page */ + if (cnt) pg++; + } } - if (bp->b_flags & B_INVAL) { - FSDBG(260, bp->b_validoff, bp->b_validend, - bp->b_bufsize, bp->b_bcount); + if (!error) { + if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) { + np->n_needcommitcnt--; + CHECK_NEEDCOMMITCNT(np); + } + CLR(bp->nb_flags, NB_NEEDCOMMIT); } - bp->b_resid = 0; - biodone(bp); - FSDBG_BOT(256, bp->b_validoff, bp->b_validend, bp->b_bufsize, + CLR(bp->nb_flags, NB_WRITEINPROG); + FSDBG_BOT(256, bp->nb_validoff, bp->nb_validend, bp->nb_bufsize, np->n_size); - return (0); } - } - bp->b_resid = uiop->uio_resid; - if (must_commit) - nfs_clearcommit(vp->v_mount); - if (bp->b_flags & B_INVAL) { - FSDBG(260, bp->b_validoff, bp->b_validend, bp->b_bufsize, - bp->b_bcount); + if (error) { + SET(bp->nb_flags, NB_ERROR); + bp->nb_error = error; + } } - FSDBG_BOT(256, bp->b_validoff, bp->b_validend, bp->b_bcount, error); - biodone(bp); + FSDBG_BOT(256, bp->nb_validoff, bp->nb_validend, 
bp->nb_bufsize, error);
+
+	nfs_buf_iodone(bp);
+	return (error);
+ }
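
The write path in nfs_doio() above keeps a contiguous dirty byte range in nb_dirtyoff/nb_dirtyend and, separately, a per-page dirty bitmap in nb_dirty, and it only lets an async write go out UNSTABLE when the byte range covers every dirty page. The standalone sketch below is not part of the diff: the dirty_pagemask() helper, the fixed 4 KB EX_PAGE_SIZE, and the sample values are illustrative assumptions. It simply mirrors the firstpg/lastpg/pagemask arithmetic from the diff to show how that FILESYNC-vs-UNSTABLE decision falls out of the bit masks.

#include <stdint.h>
#include <stdio.h>

#define EX_PAGE_SIZE	4096			/* illustrative page size, not the kernel's PAGE_SIZE */
#define EX_PAGE_MASK	(EX_PAGE_SIZE - 1)
#define ex_trunc_page(x)	((x) & ~EX_PAGE_MASK)
#define ex_round_page(x)	(((x) + EX_PAGE_MASK) & ~EX_PAGE_MASK)

/*
 * Given a dirty byte range [doff, dend) within a buffer, build a bitmask of
 * the pages that the range covers completely.  This mirrors the
 * firstpg/lastpg/pagemask computation in nfs_doio(): partially covered pages
 * at either edge are excluded, since writing them does not make the whole
 * page clean.
 */
static uint32_t
dirty_pagemask(int doff, int dend)
{
	int firstpg = ex_round_page(doff) / EX_PAGE_SIZE;	/* first fully covered page */
	int lastpg  = (ex_trunc_page(dend) - 1) / EX_PAGE_SIZE;	/* last fully covered page */

	if (lastpg < firstpg)
		return (0);	/* guard for ranges shorter than a page (sketch only) */
	return (((1u << (lastpg + 1)) - 1) & ~((1u << firstpg) - 1));
}

int
main(void)
{
	uint32_t nb_dirty = 0x0000003c;		/* assume pages 2-5 carry mmap-dirtied data */
	int doff = 2 * EX_PAGE_SIZE;		/* dirty byte range: pages 2-4 plus part of page 5 */
	int dend = 5 * EX_PAGE_SIZE + 100;

	uint32_t pagemask = dirty_pagemask(doff, dend);

	/*
	 * If any dirty page lies outside the range being written, the write
	 * must be FILESYNC; otherwise an async write may go out UNSTABLE and
	 * be committed later.
	 */
	if (nb_dirty & ~pagemask)
		printf("dirty pages outside write range -> FILESYNC\n");
	else
		printf("write covers all dirty pages -> UNSTABLE is safe\n");
	return (0);
}

As the comments in nfs_doio() explain, any dirty page the write range does not fully cover forces NFSV3WRITE_FILESYNC: an unstable write would leave no way to tell afterwards which pages still need a commit and which were re-dirtied through the mapping before the commit went out.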