X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/43866e378188c25dd1e2208016ab3cbeb086ae6c..55e303ae13a4cf49d70f2294092726f2fffb9ef2:/bsd/nfs/nfs_bio.c diff --git a/bsd/nfs/nfs_bio.c b/bsd/nfs/nfs_bio.c index 7f41efe13..1b6b078c5 100644 --- a/bsd/nfs/nfs_bio.c +++ b/bsd/nfs/nfs_bio.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -66,8 +66,9 @@ #include #include #include -#include +#include #include +#include #include #include #include @@ -98,12 +99,863 @@ KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_END, \ (int)(B), (int)(C), (int)(D), (int)(E), 0) -static struct buf *nfs_getcacheblk __P((struct vnode *vp, daddr_t bn, int size, - struct proc *p, int operation)); - extern int nfs_numasync; +extern int nfs_ioddelwri; extern struct nfsstats nfsstats; -extern int nbdwrite; + +#define NFSBUFHASH(dvp, lbn) \ + (&nfsbufhashtbl[((long)(dvp) / sizeof(*(dvp)) + (int)(lbn)) & nfsbufhash]) +LIST_HEAD(nfsbufhashhead, nfsbuf) *nfsbufhashtbl; +struct nfsbuffreehead nfsbuffree, nfsbufdelwri; +u_long nfsbufhash; +int nfsbufhashlock, nfsbufcnt, nfsbufmin, nfsbufmax; +int nfsbuffreecnt, nfsbufdelwricnt, nfsneedbuffer; +int nfs_nbdwrite; + +#define NFSBUFWRITE_THROTTLE 9 + +/* + * Initialize nfsbuf lists + */ +void +nfs_nbinit(void) +{ + nfsbufhashlock = 0; + nfsbufhashtbl = hashinit(nbuf, M_TEMP, &nfsbufhash); + TAILQ_INIT(&nfsbuffree); + TAILQ_INIT(&nfsbufdelwri); + nfsbufcnt = nfsbuffreecnt = nfsbufdelwricnt = 0; + nfsbufmin = 128; // XXX tune me! + nfsbufmax = 8192; // XXX tune me! + nfsneedbuffer = 0; + nfs_nbdwrite = 0; +} + +/* + * try to free up some excess, unused nfsbufs + */ +static void +nfs_buf_freeup(void) +{ + struct nfsbuf *fbp; + int cnt; + +#define NFS_BUF_FREEUP() \ + do { \ + /* only call nfs_buf_freeup() if it has work to do */ \ + if ((nfsbuffreecnt > nfsbufcnt/4) && \ + (nfsbufcnt-nfsbuffreecnt/8 > nfsbufmin)) \ + nfs_buf_freeup(); \ + } while (0) + + if (nfsbuffreecnt < nfsbufcnt/4) + return; + cnt = nfsbuffreecnt/8; + if (nfsbufcnt-cnt < nfsbufmin) + return; + + FSDBG(320, -1, nfsbufcnt, nfsbuffreecnt, cnt); + while (cnt-- > 0) { + fbp = TAILQ_FIRST(&nfsbuffree); + if (!fbp) + break; + nfs_buf_remfree(fbp); + /* disassociate buffer from any vnode */ + if (fbp->nb_vp) { + struct vnode *oldvp; + if (fbp->nb_vnbufs.le_next != NFSNOLIST) { + LIST_REMOVE(fbp, nb_vnbufs); + fbp->nb_vnbufs.le_next = NFSNOLIST; + } + oldvp = fbp->nb_vp; + fbp->nb_vp = NULL; + HOLDRELE(oldvp); + } + LIST_REMOVE(fbp, nb_hash); + /* nuke any creds */ + if (fbp->nb_rcred != NOCRED) + crfree(fbp->nb_rcred); + if (fbp->nb_wcred != NOCRED) + crfree(fbp->nb_wcred); + /* if buf was NB_META, dump buffer */ + if (ISSET(fbp->nb_flags, NB_META) && fbp->nb_data) { + FREE(fbp->nb_data, M_TEMP); + } + FREE(fbp, M_TEMP); + nfsbufcnt--; + } + FSDBG(320, -1, nfsbufcnt, nfsbuffreecnt, cnt); +} + +void +nfs_buf_remfree(struct nfsbuf *bp) +{ + if (bp->nb_free.tqe_next == NFSNOLIST) + panic("nfsbuf not on free list"); + if (ISSET(bp->nb_flags, NB_DELWRI)) { + nfsbufdelwricnt--; + TAILQ_REMOVE(&nfsbufdelwri, bp, nb_free); + } else { + nfsbuffreecnt--; + TAILQ_REMOVE(&nfsbuffree, bp, nb_free); + } + bp->nb_free.tqe_next = NFSNOLIST; + NFSBUFCNTCHK(); +} + +/* + * check for existence of nfsbuf in cache + */ +struct nfsbuf * +nfs_buf_incore(struct vnode *vp, daddr_t blkno) +{ + /* Search hash chain */ + struct nfsbuf * bp = NFSBUFHASH(vp, blkno)->lh_first; + for (; bp != 
NULL; bp = bp->nb_hash.le_next) + if (bp->nb_lblkno == blkno && bp->nb_vp == vp && + !ISSET(bp->nb_flags, NB_INVAL)) { + FSDBG(547, bp, blkno, bp->nb_flags, bp->nb_vp); + return (bp); + } + return (NULL); +} + +/* + * Check if it's OK to drop a page. + * + * Called by vnode_pager() on pageout request of non-dirty page. + * We need to make sure that it's not part of a delayed write. + * If it is, we can't let the VM drop it because we may need it + * later when/if we need to write the data (again). + */ +int +nfs_buf_page_inval(struct vnode *vp, off_t offset) +{ + struct nfsbuf *bp; + bp = nfs_buf_incore(vp, ubc_offtoblk(vp, offset)); + if (!bp) + return (0); + FSDBG(325, bp, bp->nb_flags, bp->nb_dirtyoff, bp->nb_dirtyend); + if (ISSET(bp->nb_flags, NB_BUSY)) + return (EBUSY); + /* + * If there's a dirty range in the buffer, check to + * see if this page intersects with the dirty range. + * If it does, we can't let the pager drop the page. + */ + if (bp->nb_dirtyend > 0) { + int start = offset - NBOFF(bp); + if (bp->nb_dirtyend <= start || + bp->nb_dirtyoff >= (start + PAGE_SIZE)) + return (0); + return (EBUSY); + } + return (0); +} + +int +nfs_buf_upl_setup(struct nfsbuf *bp) +{ + kern_return_t kret; + upl_t upl; + int s; + + if (ISSET(bp->nb_flags, NB_PAGELIST)) + return (0); + + kret = ubc_create_upl(bp->nb_vp, NBOFF(bp), bp->nb_bufsize, + &upl, NULL, UPL_PRECIOUS); + if (kret == KERN_INVALID_ARGUMENT) { + /* vm object probably doesn't exist any more */ + bp->nb_pagelist = NULL; + return (EINVAL); + } + if (kret != KERN_SUCCESS) { + printf("nfs_buf_upl_setup(): failed to get pagelist %d\n", kret); + bp->nb_pagelist = NULL; + return (EIO); + } + + FSDBG(538, bp, NBOFF(bp), bp->nb_bufsize, bp->nb_vp); + + s = splbio(); + bp->nb_pagelist = upl; + SET(bp->nb_flags, NB_PAGELIST); + splx(s); + return (0); +} + +void +nfs_buf_upl_check(struct nfsbuf *bp) +{ + upl_page_info_t *pl; + off_t filesize, fileoffset; + int i, npages; + + if (!ISSET(bp->nb_flags, NB_PAGELIST)) + return; + + npages = round_page_32(bp->nb_bufsize) / PAGE_SIZE; + filesize = ubc_getsize(bp->nb_vp); + fileoffset = NBOFF(bp); + if (fileoffset < filesize) + SET(bp->nb_flags, NB_CACHE); + else + CLR(bp->nb_flags, NB_CACHE); + + pl = ubc_upl_pageinfo(bp->nb_pagelist); + bp->nb_valid = bp->nb_dirty = 0; + + for (i=0; i < npages; i++, fileoffset += PAGE_SIZE_64) { + /* anything beyond the end of the file is not valid or dirty */ + if (fileoffset >= filesize) + break; + if (!upl_valid_page(pl, i)) { + CLR(bp->nb_flags, NB_CACHE); + continue; + } + NBPGVALID_SET(bp,i); + if (upl_dirty_page(pl, i)) { + NBPGDIRTY_SET(bp, i); + if (!ISSET(bp->nb_flags, NB_WASDIRTY)) + SET(bp->nb_flags, NB_WASDIRTY); + } + } + fileoffset = NBOFF(bp); + if (ISSET(bp->nb_flags, NB_CACHE)) { + bp->nb_validoff = 0; + bp->nb_validend = bp->nb_bufsize; + if (fileoffset + bp->nb_validend > filesize) + bp->nb_validend = filesize - fileoffset; + } else { + bp->nb_validoff = bp->nb_validend = -1; + } + FSDBG(539, bp, fileoffset, bp->nb_valid, bp->nb_dirty); + FSDBG(539, bp->nb_validoff, bp->nb_validend, bp->nb_dirtyoff, bp->nb_dirtyend); +} + +static int +nfs_buf_map(struct nfsbuf *bp) +{ + kern_return_t kret; + + if (bp->nb_data) + return (0); + if (!ISSET(bp->nb_flags, NB_PAGELIST)) + return (EINVAL); + + kret = ubc_upl_map(bp->nb_pagelist, (vm_address_t *)&(bp->nb_data)); + if (kret != KERN_SUCCESS) + panic("nfs_buf_map: ubc_upl_map() failed with (%d)", kret); + if (bp->nb_data == 0) + panic("ubc_upl_map mapped 0"); + FSDBG(540, bp, bp->nb_flags, NBOFF(bp), 
bp->nb_data); + return (0); +} + +/* + * check range of pages in nfsbuf's UPL for validity + */ +static int +nfs_buf_upl_valid_range(struct nfsbuf *bp, int off, int size) +{ + off_t fileoffset, filesize; + int pg, lastpg; + upl_page_info_t *pl; + + if (!ISSET(bp->nb_flags, NB_PAGELIST)) + return (0); + pl = ubc_upl_pageinfo(bp->nb_pagelist); + + size += off & PAGE_MASK; + off &= ~PAGE_MASK; + fileoffset = NBOFF(bp); + filesize = VTONFS(bp->nb_vp)->n_size; + if ((fileoffset + off + size) > filesize) + size = filesize - (fileoffset + off); + + pg = off/PAGE_SIZE; + lastpg = (off + size - 1)/PAGE_SIZE; + while (pg <= lastpg) { + if (!upl_valid_page(pl, pg)) + return (0); + pg++; + } + return (1); +} + +/* + * normalize an nfsbuf's valid range + * + * the read/write code guarantees that we'll always have a valid + * region that is an integral number of pages. If either end + * of the valid range isn't page-aligned, it gets corrected + * here as we extend the valid range through all of the + * contiguous valid pages. + */ +static void +nfs_buf_normalize_valid_range(struct nfsnode *np, struct nfsbuf *bp) +{ + int pg, npg; + /* pull validoff back to start of contiguous valid page range */ + pg = bp->nb_validoff/PAGE_SIZE; + while (pg >= 0 && NBPGVALID(bp,pg)) + pg--; + bp->nb_validoff = (pg+1) * PAGE_SIZE; + /* push validend forward to end of contiguous valid page range */ + npg = bp->nb_bufsize/PAGE_SIZE; + pg = bp->nb_validend/PAGE_SIZE; + while (pg < npg && NBPGVALID(bp,pg)) + pg++; + bp->nb_validend = pg * PAGE_SIZE; + /* clip to EOF */ + if (NBOFF(bp) + bp->nb_validend > np->n_size) + bp->nb_validend = np->n_size % bp->nb_bufsize; +} + +/* + * try to push out some delayed/uncommitted writes + */ +static void +nfs_buf_delwri_push(void) +{ + struct nfsbuf *bp; + int i; + + if (TAILQ_EMPTY(&nfsbufdelwri)) + return; + + /* first try to tell the nfsiods to do it */ + if (nfs_asyncio(NULL, NULL) == 0) + return; + + /* otherwise, try to do some of the work ourselves */ + i = 0; + while (i < 8 && (bp = TAILQ_FIRST(&nfsbufdelwri)) != NULL) { + struct nfsnode *np = VTONFS(bp->nb_vp); + nfs_buf_remfree(bp); + if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) { + /* put buffer at end of delwri list */ + TAILQ_INSERT_TAIL(&nfsbufdelwri, bp, nb_free); + nfsbufdelwricnt++; + nfs_flushcommits(np->n_vnode, (struct proc *)0); + } else { + SET(bp->nb_flags, (NB_BUSY | NB_ASYNC)); + nfs_buf_write(bp); + } + i++; + } +} + +/* + * Get an nfs cache block. + * Allocate a new one if the block isn't currently in the cache + * and return the block marked busy. If the calling process is + * interrupted by a signal for an interruptible mount point, return + * NULL. 
+ */ +struct nfsbuf * +nfs_buf_get( + struct vnode *vp, + daddr_t blkno, + int size, + struct proc *p, + int operation) +{ + struct nfsnode *np = VTONFS(vp); + struct nfsbuf *bp; + int i, biosize, bufsize, rv; + struct ucred *cred; + int slpflag = PCATCH; + + FSDBG_TOP(541, vp, blkno, size, operation); + + bufsize = size; + if (bufsize > MAXBSIZE) + panic("nfs_buf_get: buffer larger than MAXBSIZE requested"); + + biosize = vp->v_mount->mnt_stat.f_iosize; + + if (UBCINVALID(vp) || !UBCINFOEXISTS(vp)) + operation = BLK_META; + else if (bufsize < biosize) + /* reg files should always have biosize blocks */ + bufsize = biosize; + + /* if BLK_WRITE, check for too many delayed/uncommitted writes */ + if ((operation == BLK_WRITE) && (nfs_nbdwrite > ((nfsbufcnt*3)/4))) { + FSDBG_TOP(542, vp, blkno, nfs_nbdwrite, ((nfsbufcnt*3)/4)); + + /* poke the delwri list */ + nfs_buf_delwri_push(); + + /* sleep to let other threads run... */ + tsleep(&nfs_nbdwrite, PCATCH, "nfs_nbdwrite", 1); + FSDBG_BOT(542, vp, blkno, nfs_nbdwrite, ((nfsbufcnt*3)/4)); + } + +loop: + /* + * Obtain a lock to prevent a race condition if the + * MALLOC() below happens to block. + */ + if (nfsbufhashlock) { + while (nfsbufhashlock) { + nfsbufhashlock = -1; + tsleep(&nfsbufhashlock, PCATCH, "nfsbufget", 0); + if (nfs_sigintr(VFSTONFS(vp->v_mount), NULL, p)) + return (NULL); + } + goto loop; + } + nfsbufhashlock = 1; + + /* check for existence of nfsbuf in cache */ + if (bp = nfs_buf_incore(vp, blkno)) { + /* if busy, set wanted and wait */ + if (ISSET(bp->nb_flags, NB_BUSY)) { + FSDBG_TOP(543, vp, blkno, bp, bp->nb_flags); + SET(bp->nb_flags, NB_WANTED); + /* unlock hash */ + if (nfsbufhashlock < 0) { + nfsbufhashlock = 0; + wakeup(&nfsbufhashlock); + } else + nfsbufhashlock = 0; + tsleep(bp, slpflag|(PRIBIO+1), "nfsbufget", (slpflag == PCATCH) ? 
0 : 2*hz); + slpflag = 0; + FSDBG_BOT(543, vp, blkno, bp, bp->nb_flags); + if (nfs_sigintr(VFSTONFS(vp->v_mount), NULL, p)) { + FSDBG_BOT(541, vp, blkno, 0, EINTR); + return (NULL); + } + goto loop; + } + if (bp->nb_bufsize != bufsize) + panic("nfsbuf size mismatch"); + SET(bp->nb_flags, (NB_BUSY | NB_CACHE)); + nfs_buf_remfree(bp); + /* additional paranoia: */ + if (ISSET(bp->nb_flags, NB_PAGELIST)) + panic("pagelist buffer was not busy"); + goto buffer_setup; + } + + /* + * where to get a free buffer: + * - alloc new if we haven't reached min bufs + * - free list + * - alloc new if we haven't reached max allowed + * - start clearing out delwri list and try again + */ + + if ((nfsbufcnt > nfsbufmin) && !TAILQ_EMPTY(&nfsbuffree)) { + /* pull an nfsbuf off the free list */ + bp = TAILQ_FIRST(&nfsbuffree); + FSDBG(544, vp, blkno, bp, bp->nb_flags); + nfs_buf_remfree(bp); + if (ISSET(bp->nb_flags, NB_DELWRI)) + panic("nfs_buf_get: delwri"); + SET(bp->nb_flags, NB_BUSY); + /* disassociate buffer from previous vnode */ + if (bp->nb_vp) { + struct vnode *oldvp; + if (bp->nb_vnbufs.le_next != NFSNOLIST) { + LIST_REMOVE(bp, nb_vnbufs); + bp->nb_vnbufs.le_next = NFSNOLIST; + } + oldvp = bp->nb_vp; + bp->nb_vp = NULL; + HOLDRELE(oldvp); + } + LIST_REMOVE(bp, nb_hash); + /* nuke any creds we're holding */ + cred = bp->nb_rcred; + if (cred != NOCRED) { + bp->nb_rcred = NOCRED; + crfree(cred); + } + cred = bp->nb_wcred; + if (cred != NOCRED) { + bp->nb_wcred = NOCRED; + crfree(cred); + } + /* if buf will no longer be NB_META, dump old buffer */ + if ((operation != BLK_META) && + ISSET(bp->nb_flags, NB_META) && bp->nb_data) { + FREE(bp->nb_data, M_TEMP); + bp->nb_data = NULL; + } + /* re-init buf fields */ + bp->nb_error = 0; + bp->nb_validoff = bp->nb_validend = -1; + bp->nb_dirtyoff = bp->nb_dirtyend = 0; + bp->nb_valid = 0; + bp->nb_dirty = 0; + } else if (nfsbufcnt < nfsbufmax) { + /* just alloc a new one */ + MALLOC(bp, struct nfsbuf *, sizeof(struct nfsbuf), M_TEMP, M_WAITOK); + nfsbufcnt++; + NFSBUFCNTCHK(); + /* init nfsbuf */ + bzero(bp, sizeof(*bp)); + bp->nb_free.tqe_next = NFSNOLIST; + bp->nb_validoff = bp->nb_validend = -1; + FSDBG(545, vp, blkno, bp, 0); + } else { + /* too many bufs... 
wait for buffers to free up */ + FSDBG_TOP(546, vp, blkno, nfsbufcnt, nfsbufmax); + /* unlock hash */ + if (nfsbufhashlock < 0) { + nfsbufhashlock = 0; + wakeup(&nfsbufhashlock); + } else + nfsbufhashlock = 0; + + /* poke the delwri list */ + nfs_buf_delwri_push(); + + nfsneedbuffer = 1; + tsleep(&nfsneedbuffer, PCATCH, "nfsbufget", 0); + FSDBG_BOT(546, vp, blkno, nfsbufcnt, nfsbufmax); + if (nfs_sigintr(VFSTONFS(vp->v_mount), NULL, p)) { + FSDBG_BOT(541, vp, blkno, 0, EINTR); + return (NULL); + } + goto loop; + } + +setup_nfsbuf: + + /* setup nfsbuf */ + bp->nb_flags = NB_BUSY; + bp->nb_lblkno = blkno; + /* insert buf in hash */ + LIST_INSERT_HEAD(NFSBUFHASH(vp, blkno), bp, nb_hash); + /* associate buffer with new vnode */ + VHOLD(vp); + bp->nb_vp = vp; + LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs); + +buffer_setup: + + switch (operation) { + case BLK_META: + SET(bp->nb_flags, NB_META); + if ((bp->nb_bufsize != bufsize) && bp->nb_data) { + FREE(bp->nb_data, M_TEMP); + bp->nb_data = NULL; + bp->nb_validoff = bp->nb_validend = -1; + bp->nb_dirtyoff = bp->nb_dirtyend = 0; + bp->nb_valid = 0; + bp->nb_dirty = 0; + CLR(bp->nb_flags, NB_CACHE); + } + if (!bp->nb_data) + MALLOC(bp->nb_data, caddr_t, bufsize, M_TEMP, M_WAITOK); + if (!bp->nb_data) + panic("nfs_buf_get: null nb_data"); + bp->nb_bufsize = bufsize; + break; + + case BLK_READ: + case BLK_WRITE: + if (bufsize < PAGE_SIZE) + bufsize = PAGE_SIZE; + bp->nb_bufsize = bufsize; + bp->nb_validoff = bp->nb_validend = -1; + + if (UBCISVALID(vp)) { + /* setup upl */ + if (nfs_buf_upl_setup(bp)) { + /* unable to create upl */ + /* vm object must no longer exist */ + /* cleanup buffer and return NULL */ + LIST_REMOVE(bp, nb_vnbufs); + bp->nb_vnbufs.le_next = NFSNOLIST; + bp->nb_vp = NULL; + HOLDRELE(vp); + if (bp->nb_free.tqe_next != NFSNOLIST) + panic("nfsbuf on freelist"); + TAILQ_INSERT_HEAD(&nfsbuffree, bp, nb_free); + nfsbuffreecnt++; + FSDBG_BOT(541, vp, blkno, 0x2bc, EIO); + return (NULL); + } + nfs_buf_upl_check(bp); + } + break; + + default: + panic("nfs_buf_get: %d unknown operation", operation); + } + + /* unlock hash */ + if (nfsbufhashlock < 0) { + nfsbufhashlock = 0; + wakeup(&nfsbufhashlock); + } else + nfsbufhashlock = 0; + + FSDBG_BOT(541, vp, blkno, bp, bp->nb_flags); + + return (bp); +} + +void +nfs_buf_release(struct nfsbuf *bp) +{ + struct vnode *vp = bp->nb_vp; + + FSDBG_TOP(548, bp, NBOFF(bp), bp->nb_flags, bp->nb_data); + FSDBG(548, bp->nb_validoff, bp->nb_validend, bp->nb_dirtyoff, bp->nb_dirtyend); + FSDBG(548, bp->nb_valid, 0, bp->nb_dirty, 0); + + if (UBCINFOEXISTS(vp) && bp->nb_bufsize) { + int upl_flags; + upl_t upl; + int i, rv; + + if (!ISSET(bp->nb_flags, NB_PAGELIST) && !ISSET(bp->nb_flags, NB_INVAL)) { + rv = nfs_buf_upl_setup(bp); + if (rv) + printf("nfs_buf_release: upl create failed %d\n", rv); + else + nfs_buf_upl_check(bp); + } + upl = bp->nb_pagelist; + if (!upl) + goto pagelist_cleanup_done; + if (bp->nb_data) { + if (ubc_upl_unmap(upl) != KERN_SUCCESS) + panic("ubc_upl_unmap failed"); + bp->nb_data = NULL; + } + if (bp->nb_flags & (NB_ERROR | NB_INVAL)) { + if (bp->nb_flags & (NB_READ | NB_INVAL)) + upl_flags = UPL_ABORT_DUMP_PAGES; + else + upl_flags = 0; + ubc_upl_abort(upl, upl_flags); + goto pagelist_cleanup_done; + } + for (i=0; i <= (bp->nb_bufsize - 1)/PAGE_SIZE; i++) { + if (!NBPGVALID(bp,i)) + ubc_upl_abort_range(upl, + i*PAGE_SIZE, PAGE_SIZE, + UPL_ABORT_DUMP_PAGES | + UPL_ABORT_FREE_ON_EMPTY); + else { + if (NBPGDIRTY(bp,i)) + upl_flags = UPL_COMMIT_SET_DIRTY; + else + upl_flags = 
UPL_COMMIT_CLEAR_DIRTY; + ubc_upl_commit_range(upl, + i*PAGE_SIZE, PAGE_SIZE, + upl_flags | + UPL_COMMIT_INACTIVATE | + UPL_COMMIT_FREE_ON_EMPTY); + } + } +pagelist_cleanup_done: + /* was this the last buffer in the file? */ + if (NBOFF(bp) + bp->nb_bufsize > VTONFS(vp)->n_size) { + /* if so, invalidate all pages of last buffer past EOF */ + int biosize = vp->v_mount->mnt_stat.f_iosize; + off_t off, size; + off = trunc_page_64(VTONFS(vp)->n_size) + PAGE_SIZE_64; + size = trunc_page_64(NBOFF(bp) + biosize) - off; + if (size) + ubc_invalidate(vp, off, size); + } + CLR(bp->nb_flags, NB_PAGELIST); + bp->nb_pagelist = NULL; + } + + /* Wake up any processes waiting for any buffer to become free. */ + if (nfsneedbuffer) { + nfsneedbuffer = 0; + wakeup(&nfsneedbuffer); + } + /* Wake up any processes waiting for _this_ buffer to become free. */ + if (ISSET(bp->nb_flags, NB_WANTED)) { + CLR(bp->nb_flags, NB_WANTED); + wakeup(bp); + } + + /* If it's not cacheable, or an error, mark it invalid. */ + if (ISSET(bp->nb_flags, (NB_NOCACHE|NB_ERROR))) + SET(bp->nb_flags, NB_INVAL); + + if ((bp->nb_bufsize <= 0) || ISSET(bp->nb_flags, NB_INVAL)) { + /* If it's invalid or empty, dissociate it from its vnode */ + if (bp->nb_vnbufs.le_next != NFSNOLIST) { + LIST_REMOVE(bp, nb_vnbufs); + bp->nb_vnbufs.le_next = NFSNOLIST; + } + bp->nb_vp = NULL; + HOLDRELE(vp); + /* if this was a delayed write, wakeup anyone */ + /* waiting for delayed writes to complete */ + if (ISSET(bp->nb_flags, NB_DELWRI)) { + CLR(bp->nb_flags, NB_DELWRI); + nfs_nbdwrite--; + NFSBUFCNTCHK(); + wakeup((caddr_t)&nfs_nbdwrite); + } + /* put buffer at head of free list */ + if (bp->nb_free.tqe_next != NFSNOLIST) + panic("nfsbuf on freelist"); + TAILQ_INSERT_HEAD(&nfsbuffree, bp, nb_free); + nfsbuffreecnt++; + NFS_BUF_FREEUP(); + } else if (ISSET(bp->nb_flags, NB_DELWRI)) { + /* put buffer at end of delwri list */ + if (bp->nb_free.tqe_next != NFSNOLIST) + panic("nfsbuf on freelist"); + TAILQ_INSERT_TAIL(&nfsbufdelwri, bp, nb_free); + nfsbufdelwricnt++; + } else { + /* put buffer at end of free list */ + if (bp->nb_free.tqe_next != NFSNOLIST) + panic("nfsbuf on freelist"); + TAILQ_INSERT_TAIL(&nfsbuffree, bp, nb_free); + nfsbuffreecnt++; + NFS_BUF_FREEUP(); + } + + NFSBUFCNTCHK(); + + /* Unlock the buffer. */ + CLR(bp->nb_flags, (NB_ASYNC | NB_BUSY | NB_NOCACHE | NB_STABLE | NB_IOD)); + + FSDBG_BOT(548, bp, NBOFF(bp), bp->nb_flags, bp->nb_data); +} + +/* + * Wait for operations on the buffer to complete. + * When they do, extract and return the I/O's error value. + */ +int +nfs_buf_iowait(struct nfsbuf *bp) +{ + FSDBG_TOP(549, bp, NBOFF(bp), bp->nb_flags, bp->nb_error); + + while (!ISSET(bp->nb_flags, NB_DONE)) + tsleep(bp, PRIBIO + 1, "nfs_buf_iowait", 0); + + FSDBG_BOT(549, bp, NBOFF(bp), bp->nb_flags, bp->nb_error); + + /* check for interruption of I/O, then errors. */ + if (ISSET(bp->nb_flags, NB_EINTR)) { + CLR(bp->nb_flags, NB_EINTR); + return (EINTR); + } else if (ISSET(bp->nb_flags, NB_ERROR)) + return (bp->nb_error ? bp->nb_error : EIO); + return (0); +} + +/* + * Mark I/O complete on a buffer. 
+ */ +void +nfs_buf_iodone(struct nfsbuf *bp) +{ + struct vnode *vp; + + FSDBG_TOP(550, bp, NBOFF(bp), bp->nb_flags, bp->nb_error); + + if (ISSET(bp->nb_flags, NB_DONE)) + panic("nfs_buf_iodone already"); + SET(bp->nb_flags, NB_DONE); /* note that it's done */ + /* + * I/O was done, so don't believe + * the DIRTY state from VM anymore + */ + CLR(bp->nb_flags, NB_WASDIRTY); + + if (!ISSET(bp->nb_flags, NB_READ)) { + CLR(bp->nb_flags, NB_WRITEINPROG); + vpwakeup(bp->nb_vp); + } + + /* Wakeup the throttled write operations as needed */ + vp = bp->nb_vp; + if (vp && (vp->v_flag & VTHROTTLED) + && (vp->v_numoutput <= (NFSBUFWRITE_THROTTLE / 3))) { + vp->v_flag &= ~VTHROTTLED; + wakeup((caddr_t)&vp->v_numoutput); + } + + if (ISSET(bp->nb_flags, NB_ASYNC)) /* if async, release it */ + nfs_buf_release(bp); + else { /* or just wakeup the buffer */ + CLR(bp->nb_flags, NB_WANTED); + wakeup(bp); + } + + FSDBG_BOT(550, bp, NBOFF(bp), bp->nb_flags, bp->nb_error); +} + +void +nfs_buf_write_delayed(struct nfsbuf *bp) +{ + struct proc *p = current_proc(); + struct vnode *vp = bp->nb_vp; + + FSDBG_TOP(551, bp, NBOFF(bp), bp->nb_flags, 0); + FSDBG(551, bp, bp->nb_dirtyoff, bp->nb_dirtyend, bp->nb_dirty); + + /* + * If the block hasn't been seen before: + * (1) Mark it as having been seen, + * (2) Charge for the write. + * (3) Make sure it's on its vnode's correct block list, + */ + if (!ISSET(bp->nb_flags, NB_DELWRI)) { + SET(bp->nb_flags, NB_DELWRI); + if (p && p->p_stats) + p->p_stats->p_ru.ru_oublock++; /* XXX */ + nfs_nbdwrite++; + NFSBUFCNTCHK(); + /* move to dirty list */ + if (bp->nb_vnbufs.le_next != NFSNOLIST) + LIST_REMOVE(bp, nb_vnbufs); + LIST_INSERT_HEAD(&VTONFS(vp)->n_dirtyblkhd, bp, nb_vnbufs); + } + + /* + * If the vnode has "too many" write operations in progress + * wait for them to finish the IO + */ + while (vp->v_numoutput >= NFSBUFWRITE_THROTTLE) { + vp->v_flag |= VTHROTTLED; + tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "nfs_buf_write_delayed", 0); + } + + /* + * If we have too many delayed write buffers, + * more than we can "safely" handle, just fall back to + * doing the async write + */ + if (nfs_nbdwrite < 0) + panic("nfs_buf_write_delayed: Negative nfs_nbdwrite"); + + if (nfs_nbdwrite > ((nfsbufcnt/4)*3)) { + /* issue async write */ + SET(bp->nb_flags, NB_ASYNC); + nfs_buf_write(bp); + FSDBG_BOT(551, bp, NBOFF(bp), bp->nb_flags, bp->nb_error); + return; + } + + /* Otherwise, the "write" is done, so mark and release the buffer. */ + SET(bp->nb_flags, NB_DONE); + nfs_buf_release(bp); + FSDBG_BOT(551, bp, NBOFF(bp), bp->nb_flags, 0); + return; +} + /* * Vnode op for read using bio @@ -115,33 +967,41 @@ nfs_bioread(vp, uio, ioflag, cred, getpages) register struct uio *uio; int ioflag; struct ucred *cred; - int getpages; + int getpages; // XXX unused! { - register struct nfsnode *np = VTONFS(vp); - register int biosize, i; + struct nfsnode *np = VTONFS(vp); + int biosize, i; off_t diff; - struct buf *bp = 0, *rabp; + struct nfsbuf *bp = 0, *rabp; struct vattr vattr; struct proc *p; struct nfsmount *nmp = VFSTONFS(vp->v_mount); - daddr_t lbn, rabn; + daddr_t lbn, rabn, lastrabn = -1; int bufsize; - int nra, error = 0, n = 0, on = 0, not_readin; + int nra, error = 0, n = 0, on = 0; int operation = (getpages? 
BLK_PAGEIN : BLK_READ); + caddr_t dp; + struct dirent *direntp; + + FSDBG_TOP(514, vp, uio->uio_offset, uio->uio_resid, ioflag); #if DIAGNOSTIC if (uio->uio_rw != UIO_READ) panic("nfs_read mode"); #endif - if (uio->uio_resid == 0) + if (uio->uio_resid == 0) { + FSDBG_BOT(514, vp, 0xd1e0001, 0, 0); return (0); - if (uio->uio_offset < 0) + } + if (uio->uio_offset < 0) { + FSDBG_BOT(514, vp, 0xd1e0002, 0, EINVAL); return (EINVAL); + } p = uio->uio_procp; - if ((nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_GOTFSINFO)) == NFSMNT_NFSV3) + if ((nmp->nm_flag & NFSMNT_NFSV3) && + !(nmp->nm_state & NFSSTA_GOTFSINFO)) (void)nfs_fsinfo(nmp, vp, cred, p); - /*due to getblk/vm interractions, use vm page size or less values */ - biosize = min(vp->v_mount->mnt_stat.f_iosize, PAGE_SIZE); + biosize = vp->v_mount->mnt_stat.f_iosize; /* * For nfs, cache consistency can only be maintained approximately. * Although RFC1094 does not specify the criteria, the following is @@ -155,7 +1015,7 @@ nfs_bioread(vp, uio, ioflag, cred, getpages) * Then force a getattr rpc to ensure that you have up to date * attributes. * NB: This implies that cache data can be read when up to - * NFS_ATTRTIMEO seconds out of date. If you find that you need current + * NFS_MAXATTRTIMEO seconds out of date. If you find that you need current * attributes this could be forced by setting n_attrstamp to 0 before * the VOP_GETATTR() call. */ @@ -166,24 +1026,35 @@ nfs_bioread(vp, uio, ioflag, cred, getpages) panic("nfs: bioread, not dir"); nfs_invaldir(vp); error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); - if (error) + if (error) { + FSDBG_BOT(514, vp, 0xd1e0003, 0, error); return (error); + } } np->n_attrstamp = 0; error = VOP_GETATTR(vp, &vattr, cred, p); - if (error) + if (error) { + FSDBG_BOT(514, vp, 0xd1e0004, 0, error); return (error); + } np->n_mtime = vattr.va_mtime.tv_sec; } else { error = VOP_GETATTR(vp, &vattr, cred, p); - if (error) + if (error) { + FSDBG_BOT(514, vp, 0xd1e0005, 0, error); return (error); + } if (np->n_mtime != vattr.va_mtime.tv_sec) { - if (vp->v_type == VDIR) + if (vp->v_type == VDIR) { nfs_invaldir(vp); + /* purge name cache entries */ + cache_purge(vp); + } error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); - if (error) + if (error) { + FSDBG_BOT(514, vp, 0xd1e0006, 0, error); return (error); + } np->n_mtime = vattr.va_mtime.tv_sec; } } @@ -198,70 +1069,126 @@ nfs_bioread(vp, uio, ioflag, cred, getpages) do { error = nqnfs_getlease(vp, ND_READ, cred, p); } while (error == NQNFS_EXPIRED); - if (error) + if (error) { + FSDBG_BOT(514, vp, 0xd1e0007, 0, error); return (error); + } if (np->n_lrev != np->n_brev || (np->n_flag & NQNFSNONCACHE) || ((np->n_flag & NMODIFIED) && vp->v_type == VDIR)) { if (vp->v_type == VDIR) nfs_invaldir(vp); error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); - if (error) + if (error) { + FSDBG_BOT(514, vp, 0xd1e0008, 0, error); return (error); + } np->n_brev = np->n_lrev; } } else if (vp->v_type == VDIR && (np->n_flag & NMODIFIED)) { nfs_invaldir(vp); error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); - if (error) + if (error) { + FSDBG_BOT(514, vp, 0xd1e0009, 0, error); return (error); + } } } - if (np->n_flag & NQNFSNONCACHE) { + if ((np->n_flag & NQNFSNONCACHE) || (vp->v_flag & VNOCACHE_DATA)) { + if ((vp->v_flag & VNOCACHE_DATA) && + (np->n_dirtyblkhd.lh_first || np->n_cleanblkhd.lh_first)) { + error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); + if (error) { + FSDBG_BOT(514, vp, 0xd1e000a, 0, error); + return (error); + } + } switch (vp->v_type) { case VREG: - return (nfs_readrpc(vp, uio, cred)); + error = 
nfs_readrpc(vp, uio, cred); + FSDBG_BOT(514, vp, uio->uio_offset, uio->uio_resid, error); + return (error); case VLNK: - return (nfs_readlinkrpc(vp, uio, cred)); + error = nfs_readlinkrpc(vp, uio, cred); + FSDBG_BOT(514, vp, uio->uio_offset, uio->uio_resid, error); + return (error); case VDIR: break; default: - printf(" NQNFSNONCACHE: type %x unexpected\n", - vp->v_type); + printf(" NQNFSNONCACHE: type %x unexpected\n", vp->v_type); }; } switch (vp->v_type) { case VREG: - nfsstats.biocache_reads++; lbn = uio->uio_offset / biosize; - on = uio->uio_offset & (biosize - 1); - not_readin = 1; + + /* + * Copy directly from any cached pages without grabbing the bufs. + */ + if (uio->uio_segflg == UIO_USERSPACE) { + int io_resid = uio->uio_resid; + diff = np->n_size - uio->uio_offset; + if (diff < io_resid) + io_resid = diff; + if (io_resid > 0) { + error = cluster_copy_ubc_data(vp, uio, &io_resid, 0); + if (error) { + FSDBG_BOT(514, vp, uio->uio_offset, 0xcacefeed, error); + return (error); + } + } + /* count any biocache reads that we just copied directly */ + if (lbn != uio->uio_offset / biosize) { + nfsstats.biocache_reads += (uio->uio_offset / biosize) - lbn; + FSDBG(514, vp, 0xcacefeed, uio->uio_offset, error); + } + } + + lbn = uio->uio_offset / biosize; + on = uio->uio_offset % biosize; /* * Start the read ahead(s), as required. */ if (nfs_numasync > 0 && nmp->nm_readahead > 0) { - for (nra = 0; nra < nmp->nm_readahead && - (off_t)(lbn + 1 + nra) * biosize < np->n_size; - nra++) { + for (nra = 0; nra < nmp->nm_readahead; nra++) { rabn = lbn + 1 + nra; - if (!incore(vp, rabn)) { - rabp = nfs_getcacheblk(vp, rabn, biosize, p, operation); - if (!rabp) - return (EINTR); - if (!ISSET(rabp->b_flags, (B_CACHE|B_DELWRI))) { - SET(rabp->b_flags, (B_READ | B_ASYNC)); - if (nfs_asyncio(rabp, cred)) { - SET(rabp->b_flags, (B_INVAL|B_ERROR)); - rabp->b_error = EIO; - brelse(rabp); - } - } else - brelse(rabp); + if (rabn <= lastrabn) { + /* we've already (tried to) read this block */ + /* no need to try it again... */ + continue; } - } + lastrabn = rabn; + if ((off_t)rabn * biosize >= np->n_size) + break; + /* check if block exists and is valid. */ + rabp = nfs_buf_incore(vp, rabn); + if (rabp && nfs_buf_upl_valid_range(rabp, 0, rabp->nb_bufsize)) + continue; + rabp = nfs_buf_get(vp, rabn, biosize, p, operation); + if (!rabp) { + FSDBG_BOT(514, vp, 0xd1e000b, 0, EINTR); + return (EINTR); + } + if (!ISSET(rabp->nb_flags, (NB_CACHE|NB_DELWRI))) { + SET(rabp->nb_flags, (NB_READ|NB_ASYNC)); + if (nfs_asyncio(rabp, cred)) { + SET(rabp->nb_flags, (NB_INVAL|NB_ERROR)); + rabp->nb_error = EIO; + nfs_buf_release(rabp); + } + } else + nfs_buf_release(rabp); + } } + if ((uio->uio_resid <= 0) || (uio->uio_offset >= np->n_size)) { + FSDBG_BOT(514, vp, uio->uio_offset, uio->uio_resid, 0xaaaaaaaa); + return (0); + } + + nfsstats.biocache_reads++; + /* * If the block is in the cache and has the required data * in a valid region, just copy it out. 
@@ -270,84 +1197,162 @@ nfs_bioread(vp, uio, ioflag, cred, getpages) */ again: bufsize = biosize; - if ((off_t)(lbn + 1) * biosize > np->n_size && - (off_t)(lbn + 1) * biosize - np->n_size < biosize) { - bufsize = np->n_size - (off_t)lbn * biosize; - bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); - } - bp = nfs_getcacheblk(vp, lbn, bufsize, p, operation); - if (!bp) - return (EINTR); - - if (!ISSET(bp->b_flags, B_CACHE)) { - SET(bp->b_flags, B_READ); - CLR(bp->b_flags, (B_DONE | B_ERROR | B_INVAL)); - not_readin = 0; - error = nfs_doio(bp, cred, p); - if (error) { - brelse(bp); - return (error); - } - } - if (bufsize > on) { - n = min((unsigned)(bufsize - on), uio->uio_resid); - } else { - n = 0; - } + n = min((unsigned)(bufsize - on), uio->uio_resid); diff = np->n_size - uio->uio_offset; if (diff < n) n = diff; - if (not_readin && n > 0) { - if (on < bp->b_validoff || (on + n) > bp->b_validend) { - SET(bp->b_flags, (B_NOCACHE|B_INVAFTERWRITE)); - if (bp->b_dirtyend > 0) { - if (!ISSET(bp->b_flags, B_DELWRI)) - panic("nfsbioread"); - if (VOP_BWRITE(bp) == EINTR) - return (EINTR); - } else - brelse(bp); + + bp = nfs_buf_get(vp, lbn, bufsize, p, operation); + if (!bp) { + FSDBG_BOT(514, vp, 0xd1e000c, 0, EINTR); + return (EINTR); + } + + /* if any pages are valid... */ + if (bp->nb_valid) { + /* ...check for any invalid pages in the read range */ + int pg, firstpg, lastpg, dirtypg; + dirtypg = firstpg = lastpg = -1; + pg = on/PAGE_SIZE; + while (pg <= (on + n - 1)/PAGE_SIZE) { + if (!NBPGVALID(bp,pg)) { + if (firstpg < 0) + firstpg = pg; + lastpg = pg; + } else if (firstpg >= 0 && dirtypg < 0 && NBPGDIRTY(bp,pg)) + dirtypg = pg; + pg++; + } + + /* if there are no invalid pages, we're all set */ + if (firstpg < 0) { + if (bp->nb_validoff < 0) { + /* valid range isn't set up, so */ + /* set it to what we know is valid */ + bp->nb_validoff = trunc_page_32(on); + bp->nb_validend = round_page_32(on+n); + nfs_buf_normalize_valid_range(np, bp); + } + goto buffer_ready; + } + + /* there are invalid pages in the read range */ + if ((dirtypg > firstpg) && (dirtypg < lastpg)) { + /* there are also dirty page(s) in the range, */ + /* so write the buffer out and try again */ + CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL)); + SET(bp->nb_flags, NB_ASYNC); + /* + * NFS has embedded ucred so crhold() risks zone corruption + */ + if (bp->nb_wcred == NOCRED) + bp->nb_wcred = crdup(cred); + error = nfs_buf_write(bp); + if (error) { + FSDBG_BOT(514, vp, 0xd1e000d, 0, error); + return (error); + } goto again; } + if (!bp->nb_dirty && bp->nb_dirtyend <= 0 && + (lastpg - firstpg + 1) > (bufsize/PAGE_SIZE)/2) { + /* we need to read in more than half the buffer and the */ + /* buffer's not dirty, so just fetch the whole buffer */ + bp->nb_valid = 0; + } else { + /* read the page range in */ + struct iovec iov; + struct uio auio; + auio.uio_iov = &iov; + auio.uio_iovcnt = 1; + auio.uio_offset = NBOFF(bp) + firstpg * PAGE_SIZE_64; + auio.uio_resid = (lastpg - firstpg + 1) * PAGE_SIZE; + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_rw = UIO_READ; + auio.uio_procp = p; + NFS_BUF_MAP(bp); + iov.iov_base = bp->nb_data + firstpg * PAGE_SIZE; + iov.iov_len = auio.uio_resid; + error = nfs_readrpc(vp, &auio, cred); + if (error) { + nfs_buf_release(bp); + FSDBG_BOT(514, vp, 0xd1e000e, 0, error); + return (error); + } + /* Make sure that the valid range is set to cover this read. 
*/ + bp->nb_validoff = trunc_page_32(on); + bp->nb_validend = round_page_32(on+n); + nfs_buf_normalize_valid_range(np, bp); + if (auio.uio_resid > 0) { + /* if short read, must have hit EOF, */ + /* so zero the rest of the range */ + bzero(iov.iov_base, auio.uio_resid); + } + /* mark the pages (successfully read) as valid */ + for (pg=firstpg; pg <= lastpg; pg++) + NBPGVALID_SET(bp,pg); + } } + /* if no pages are valid, read the whole block */ + if (!bp->nb_valid) { + SET(bp->nb_flags, NB_READ); + CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL)); + error = nfs_doio(bp, cred, p); + if (error) { + nfs_buf_release(bp); + FSDBG_BOT(514, vp, 0xd1e000f, 0, error); + return (error); + } + } +buffer_ready: vp->v_lastr = lbn; - diff = (on >= bp->b_validend) ? 0 : (bp->b_validend - on); - if (diff < n) - n = diff; + /* validate read range against valid range and clip */ + if (bp->nb_validend > 0) { + diff = (on >= bp->nb_validend) ? 0 : (bp->nb_validend - on); + if (diff < n) + n = diff; + } + if (n > 0) + NFS_BUF_MAP(bp); break; case VLNK: nfsstats.biocache_readlinks++; - bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, p, operation); - if (!bp) + bp = nfs_buf_get(vp, (daddr_t)0, NFS_MAXPATHLEN, p, operation); + if (!bp) { + FSDBG_BOT(514, vp, 0xd1e0010, 0, EINTR); return (EINTR); - if (!ISSET(bp->b_flags, B_CACHE)) { - SET(bp->b_flags, B_READ); + } + if (!ISSET(bp->nb_flags, NB_CACHE)) { + SET(bp->nb_flags, NB_READ); error = nfs_doio(bp, cred, p); if (error) { - SET(bp->b_flags, B_ERROR); - brelse(bp); + SET(bp->nb_flags, NB_ERROR); + nfs_buf_release(bp); + FSDBG_BOT(514, vp, 0xd1e0011, 0, error); return (error); } } - n = min(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid); + n = min(uio->uio_resid, bp->nb_validend); on = 0; break; case VDIR: nfsstats.biocache_readdirs++; - if (np->n_direofoffset - && uio->uio_offset >= np->n_direofoffset) { - return (0); + if (np->n_direofoffset && uio->uio_offset >= np->n_direofoffset) { + FSDBG_BOT(514, vp, 0xde0f0001, 0, 0); + return (0); } lbn = uio->uio_offset / NFS_DIRBLKSIZ; on = uio->uio_offset & (NFS_DIRBLKSIZ - 1); - bp = nfs_getcacheblk(vp, lbn, NFS_DIRBLKSIZ, p, operation); - if (!bp) - return (EINTR); - if (!ISSET(bp->b_flags, B_CACHE)) { - SET(bp->b_flags, B_READ); + bp = nfs_buf_get(vp, lbn, NFS_DIRBLKSIZ, p, operation); + if (!bp) { + FSDBG_BOT(514, vp, 0xd1e0012, 0, EINTR); + return (EINTR); + } + if (!ISSET(bp->nb_flags, NB_CACHE)) { + SET(bp->nb_flags, NB_READ); error = nfs_doio(bp, cred, p); if (error) { - brelse(bp); + nfs_buf_release(bp); } while (error == NFSERR_BAD_COOKIE) { nfs_invaldir(vp); @@ -360,20 +1365,23 @@ again: */ for (i = 0; i <= lbn && !error; i++) { if (np->n_direofoffset - && (i * NFS_DIRBLKSIZ) >= np->n_direofoffset) + && (i * NFS_DIRBLKSIZ) >= np->n_direofoffset) { + FSDBG_BOT(514, vp, 0xde0f0002, 0, 0); return (0); - bp = nfs_getcacheblk(vp, i, NFS_DIRBLKSIZ, p, - operation); - if (!bp) + } + bp = nfs_buf_get(vp, i, NFS_DIRBLKSIZ, p, operation); + if (!bp) { + FSDBG_BOT(514, vp, 0xd1e0013, 0, EINTR); return (EINTR); - if (!ISSET(bp->b_flags, B_CACHE)) { - SET(bp->b_flags, B_READ); + } + if (!ISSET(bp->nb_flags, NB_CACHE)) { + SET(bp->nb_flags, NB_READ); error = nfs_doio(bp, cred, p); /* - * no error + B_INVAL == directory EOF, + * no error + NB_INVAL == directory EOF, * use the block. */ - if (error == 0 && (bp->b_flags & B_INVAL)) + if (error == 0 && (bp->nb_flags & NB_INVAL)) break; } /* @@ -383,7 +1391,7 @@ again: * block and go for the next one via the for loop. 
*/ if (error || i < lbn) - brelse(bp); + nfs_buf_release(bp); } } /* @@ -391,8 +1399,10 @@ again: * error. If we hit an error and it wasn't a cookie error, * we give up. */ - if (error) + if (error) { + FSDBG_BOT(514, vp, 0xd1e0014, 0, error); return (error); + } } /* @@ -404,19 +1414,19 @@ again: (np->n_direofoffset == 0 || (lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) && !(np->n_flag & NQNFSNONCACHE) && - !incore(vp, lbn + 1)) { - rabp = nfs_getcacheblk(vp, lbn + 1, NFS_DIRBLKSIZ, p, + !nfs_buf_incore(vp, lbn + 1)) { + rabp = nfs_buf_get(vp, lbn + 1, NFS_DIRBLKSIZ, p, operation); if (rabp) { - if (!ISSET(rabp->b_flags, (B_CACHE|B_DELWRI))) { - SET(rabp->b_flags, (B_READ | B_ASYNC)); + if (!ISSET(rabp->nb_flags, (NB_CACHE))) { + SET(rabp->nb_flags, (NB_READ | NB_ASYNC)); if (nfs_asyncio(rabp, cred)) { - SET(rabp->b_flags, (B_INVAL|B_ERROR)); - rabp->b_error = EIO; - brelse(rabp); + SET(rabp->nb_flags, (NB_INVAL|NB_ERROR)); + rabp->nb_error = EIO; + nfs_buf_release(rabp); } } else { - brelse(rabp); + nfs_buf_release(rabp); } } } @@ -424,30 +1434,41 @@ again: * Make sure we use a signed variant of min() since * the second term may be negative. */ - n = lmin(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid - on); + n = lmin(uio->uio_resid, bp->nb_validend - on); /* - * Unlike VREG files, whos buffer size ( bp->b_bcount ) is - * chopped for the EOF condition, we cannot tell how large - * NFS directories are going to be until we hit EOF. So - * an NFS directory buffer is *not* chopped to its EOF. Now, - * it just so happens that b_resid will effectively chop it - * to EOF. *BUT* this information is lost if the buffer goes - * away and is reconstituted into a B_CACHE state (recovered - * from VM) later. So we keep track of the directory eof - * in np->n_direofoffset and chop it off as an extra step - * right here. + * We keep track of the directory eof in + * np->n_direofoffset and chop it off as an + * extra step right here. */ if (np->n_direofoffset && n > np->n_direofoffset - uio->uio_offset) n = np->n_direofoffset - uio->uio_offset; + /* + * Make sure that we return an integral number of entries so + * that any subsequent calls will start copying from the start + * of the next entry. + * + * If the current value of n has the last entry cut short, + * set n to copy everything up to the last entry instead. 
+ */ + if (n > 0) { + dp = bp->nb_data + on; + while (dp < (bp->nb_data + on + n)) { + direntp = (struct dirent *)dp; + dp += direntp->d_reclen; + } + if (dp > (bp->nb_data + on + n)) + n = (dp - direntp->d_reclen) - (bp->nb_data + on); + } break; default: - printf(" nfs_bioread: type %x unexpected\n",vp->v_type); - break; + printf("nfs_bioread: type %x unexpected\n",vp->v_type); + FSDBG_BOT(514, vp, 0xd1e0015, 0, EINVAL); + return (EINVAL); }; if (n > 0) { - error = uiomove(bp->b_data + on, (int)n, uio); + error = uiomove(bp->nb_data + on, (int)n, uio); } switch (vp->v_type) { case VREG: @@ -457,13 +1478,12 @@ again: break; case VDIR: if (np->n_flag & NQNFSNONCACHE) - SET(bp->b_flags, B_INVAL); + SET(bp->nb_flags, NB_INVAL); break; - default: - printf(" nfs_bioread: type %x unexpected\n",vp->v_type); } - brelse(bp); + nfs_buf_release(bp); } while (error == 0 && uio->uio_resid > 0 && n > 0); + FSDBG_BOT(514, vp, uio->uio_offset, uio->uio_resid, error); return (error); } @@ -480,23 +1500,24 @@ nfs_write(ap) struct ucred *a_cred; } */ *ap; { - register int biosize; - register struct uio *uio = ap->a_uio; + struct uio *uio = ap->a_uio; struct proc *p = uio->uio_procp; - register struct vnode *vp = ap->a_vp; + struct vnode *vp = ap->a_vp; struct nfsnode *np = VTONFS(vp); - register struct ucred *cred = ap->a_cred; + struct ucred *cred = ap->a_cred; int ioflag = ap->a_ioflag; - struct buf *bp; + struct nfsbuf *bp; struct vattr vattr; struct nfsmount *nmp = VFSTONFS(vp->v_mount); daddr_t lbn; - int bufsize; + int biosize, bufsize, writeop; int n, on, error = 0, iomode, must_commit; - off_t boff; + off_t boff, start, end; struct iovec iov; struct uio auio; + FSDBG_TOP(515, vp, uio->uio_offset, uio->uio_resid, ioflag); + #if DIAGNOSTIC if (uio->uio_rw != UIO_WRITE) panic("nfs_write mode"); @@ -507,29 +1528,39 @@ nfs_write(ap) return (EIO); if (np->n_flag & NWRITEERR) { np->n_flag &= ~NWRITEERR; + FSDBG_BOT(515, vp, uio->uio_offset, uio->uio_resid, np->n_error); return (np->n_error); } - if ((nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_GOTFSINFO)) == NFSMNT_NFSV3) + if ((nmp->nm_flag & NFSMNT_NFSV3) && + !(nmp->nm_state & NFSSTA_GOTFSINFO)) (void)nfs_fsinfo(nmp, vp, cred, p); if (ioflag & (IO_APPEND | IO_SYNC)) { if (np->n_flag & NMODIFIED) { np->n_attrstamp = 0; error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); - if (error) + if (error) { + FSDBG_BOT(515, vp, uio->uio_offset, 0x10bad01, error); return (error); + } } if (ioflag & IO_APPEND) { np->n_attrstamp = 0; error = VOP_GETATTR(vp, &vattr, cred, p); - if (error) + if (error) { + FSDBG_BOT(515, vp, uio->uio_offset, 0x10bad02, error); return (error); + } uio->uio_offset = np->n_size; } } - if (uio->uio_offset < 0) + if (uio->uio_offset < 0) { + FSDBG_BOT(515, vp, uio->uio_offset, 0xbad0ff, EINVAL); return (EINVAL); - if (uio->uio_resid == 0) + } + if (uio->uio_resid == 0) { + FSDBG_BOT(515, vp, uio->uio_offset, uio->uio_resid, 0); return (0); + } /* * Maybe this should be above the vnode op call, but so long as * file servers have no limits, i don't think it matters @@ -537,15 +1568,11 @@ nfs_write(ap) if (p && uio->uio_offset + uio->uio_resid > p->p_rlimit[RLIMIT_FSIZE].rlim_cur) { psignal(p, SIGXFSZ); + FSDBG_BOT(515, vp, uio->uio_offset, 0x2b1f, EFBIG); return (EFBIG); } - /* - * I use nm_rsize, not nm_wsize so that all buffer cache blocks - * will be the same size within a filesystem. nfs_writerpc will - * still use nm_wsize when sizing the rpc's. 
- */ - /*due to getblk/vm interractions, use vm page size or less values */ - biosize = min(vp->v_mount->mnt_stat.f_iosize, PAGE_SIZE); + + biosize = vp->v_mount->mnt_stat.f_iosize; do { /* @@ -556,210 +1583,376 @@ nfs_write(ap) do { error = nqnfs_getlease(vp, ND_WRITE, cred, p); } while (error == NQNFS_EXPIRED); - if (error) + if (error) { + FSDBG_BOT(515, vp, uio->uio_offset, 0x11110001, error); return (error); + } if (np->n_lrev != np->n_brev || (np->n_flag & NQNFSNONCACHE)) { error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); - if (error) + if (error) { + FSDBG_BOT(515, vp, uio->uio_offset, 0x11110002, error); return (error); + } np->n_brev = np->n_lrev; } } - if ((np->n_flag & NQNFSNONCACHE) && uio->uio_iovcnt == 1) { + if (ISSET(vp->v_flag, VNOCACHE_DATA) && + (np->n_dirtyblkhd.lh_first || np->n_cleanblkhd.lh_first)) { + error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); + if (error) { + FSDBG_BOT(515, vp, 0, 0, error); + return (error); + } + } + if (((np->n_flag & NQNFSNONCACHE) || + ISSET(vp->v_flag, VNOCACHE_DATA)) && + uio->uio_iovcnt == 1) { iomode = NFSV3WRITE_FILESYNC; error = nfs_writerpc(vp, uio, cred, &iomode, &must_commit); if (must_commit) nfs_clearcommit(vp->v_mount); + FSDBG_BOT(515, vp, uio->uio_offset, uio->uio_resid, error); return (error); } nfsstats.biocache_writes++; lbn = uio->uio_offset / biosize; - on = uio->uio_offset & (biosize-1); + on = uio->uio_offset % biosize; n = min((unsigned)(biosize - on), uio->uio_resid); again: bufsize = biosize; -#if 0 -/* (removed for UBC) */ - if ((lbn + 1) * biosize > np->n_size) { - bufsize = np->n_size - lbn * biosize; - bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); - } -#endif /* * Get a cache block for writing. The range to be written is - * (off..off+len) within the block. We ensure that the block + * (off..off+n) within the block. We ensure that the block * either has no dirty region or that the given range is * contiguous with the existing dirty region. */ - bp = nfs_getcacheblk(vp, lbn, bufsize, p, BLK_WRITE); - if (!bp) + bp = nfs_buf_get(vp, lbn, bufsize, p, BLK_WRITE); + if (!bp) { + FSDBG_BOT(515, vp, uio->uio_offset, uio->uio_resid, EINTR); return (EINTR); + } + /* map the block because we know we're going to write to it */ + NFS_BUF_MAP(bp); + + if (ISSET(vp->v_flag, VNOCACHE_DATA)) + SET(bp->nb_flags, (NB_NOCACHE|NB_INVAL)); + + /* + * NFS has embedded ucred so crhold() risks zone corruption + */ + if (bp->nb_wcred == NOCRED) + bp->nb_wcred = crdup(cred); + + /* + * If there's already a dirty range AND dirty pages in this block we + * need to send a commit AND write the dirty pages before continuing. + * + * If there's already a dirty range OR dirty pages in this block + * and the new write range is not contiguous with the existing range, + * then force the buffer to be written out now. + * (We used to just extend the dirty range to cover the valid, + * but unwritten, data in between also. But writing ranges + * of data that weren't actually written by an application + * risks overwriting some other client's data with stale data + * that's just masquerading as new written data.) 
+ */ + if (bp->nb_dirtyend > 0) { + if (on > bp->nb_dirtyend || (on + n) < bp->nb_dirtyoff || bp->nb_dirty) { + FSDBG(515, vp, uio->uio_offset, bp, 0xd15c001); + /* write/commit buffer "synchronously" */ + /* (NB_STABLE indicates that data writes should be FILESYNC) */ + CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL)); + SET(bp->nb_flags, (NB_ASYNC | NB_STABLE)); + error = nfs_buf_write(bp); + if (error) { + FSDBG_BOT(515, vp, uio->uio_offset, uio->uio_resid, error); + return (error); + } + goto again; + } + } else if (bp->nb_dirty) { + int firstpg, lastpg; + u_int32_t pagemask; + /* calculate write range pagemask */ + firstpg = on/PAGE_SIZE; + lastpg = (on+n-1)/PAGE_SIZE; + pagemask = ((1 << (lastpg+1)) - 1) & ~((1 << firstpg) - 1); + /* check if there are dirty pages outside the write range */ + if (bp->nb_dirty & ~pagemask) { + FSDBG(515, vp, uio->uio_offset, bp, 0xd15c002); + /* write/commit buffer "synchronously" */ + /* (NB_STABLE indicates that data writes should be FILESYNC) */ + CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL)); + SET(bp->nb_flags, (NB_ASYNC | NB_STABLE)); + error = nfs_buf_write(bp); + if (error) { + FSDBG_BOT(515, vp, uio->uio_offset, uio->uio_resid, error); + return (error); + } + goto again; + } + /* if the first or last pages are already dirty */ + /* make sure that the dirty range encompasses those pages */ + if (NBPGDIRTY(bp,firstpg) || NBPGDIRTY(bp,lastpg)) { + FSDBG(515, vp, uio->uio_offset, bp, 0xd15c003); + bp->nb_dirtyoff = min(on, firstpg * PAGE_SIZE); + if (NBPGDIRTY(bp,lastpg)) { + bp->nb_dirtyend = (lastpg+1) * PAGE_SIZE; + /* clip to EOF */ + if (NBOFF(bp) + bp->nb_dirtyend > np->n_size) + bp->nb_dirtyend = np->n_size - NBOFF(bp); + } else + bp->nb_dirtyend = on+n; + } + } + /* - * Resize nfsnode *after* we busy the buffer to prevent - * readers from reading garbage. + * Are we extending the size of the file with this write? + * If so, update file size now that we have the block. * If there was a partial buf at the old eof, validate * and zero the new bytes. */ if (uio->uio_offset + n > np->n_size) { - struct buf *bp0 = NULL; - daddr_t bn = np->n_size / biosize; - int off = np->n_size & (biosize - 1); + struct nfsbuf *eofbp = NULL; + daddr_t eofbn = np->n_size / biosize; + int eofoff = np->n_size % biosize; + int neweofoff = (uio->uio_offset + n) % biosize; + + FSDBG(515, 0xb1ffa000, uio->uio_offset + n, eofoff, neweofoff); - if (off && bn < lbn && incore(vp, bn)) - bp0 = nfs_getcacheblk(vp, bn, biosize, p, - BLK_WRITE); + if (eofoff && eofbn < lbn && nfs_buf_incore(vp, eofbn)) + eofbp = nfs_buf_get(vp, eofbn, biosize, p, BLK_WRITE); + + /* if we're extending within the same last block */ + /* and the block is flagged as being cached... */ + if ((lbn == eofbn) && ISSET(bp->nb_flags, NB_CACHE)) { + /* ...check that all pages in buffer are valid */ + int endpg = ((neweofoff ? 
neweofoff : biosize) - 1)/PAGE_SIZE; + u_int32_t pagemask; + /* pagemask only has to extend to last page being written to */ + pagemask = (1 << (endpg+1)) - 1; + FSDBG(515, 0xb1ffa001, bp->nb_valid, pagemask, 0); + if ((bp->nb_valid & pagemask) != pagemask) { + /* zerofill any hole */ + if (on > bp->nb_validend) { + int i; + for (i=bp->nb_validend/PAGE_SIZE; i <= (on - 1)/PAGE_SIZE; i++) + NBPGVALID_SET(bp, i); + NFS_BUF_MAP(bp); + FSDBG(516, bp, bp->nb_validend, on - bp->nb_validend, 0xf01e); + bzero((char *)bp->nb_data + bp->nb_validend, + on - bp->nb_validend); + } + /* zerofill any trailing data in the last page */ + if (neweofoff) { + NFS_BUF_MAP(bp); + FSDBG(516, bp, neweofoff, PAGE_SIZE - (neweofoff & PAGE_MASK), 0xe0f); + bzero((char *)bp->nb_data + neweofoff, + PAGE_SIZE - (neweofoff & PAGE_MASK)); + } + } + } np->n_flag |= NMODIFIED; np->n_size = uio->uio_offset + n; ubc_setsize(vp, (off_t)np->n_size); /* XXX errors */ - if (bp0) { - bzero((char *)bp0->b_data + off, biosize - off); - bp0->b_validend = biosize; - brelse(bp0); + if (eofbp) { + /* + * We may need to zero any previously invalid data + * after the old EOF in the previous EOF buffer. + * + * For the old last page, don't zero bytes if there + * are invalid bytes in that page (i.e. the page isn't + * currently valid). + * For pages after the old last page, zero them and + * mark them as valid. + */ + char *d; + int i; + if (ISSET(vp->v_flag, VNOCACHE_DATA)) + SET(eofbp->nb_flags, (NB_NOCACHE|NB_INVAL)); + NFS_BUF_MAP(eofbp); + FSDBG(516, eofbp, eofoff, biosize - eofoff, 0xe0fff01e); + d = eofbp->nb_data; + i = eofoff/PAGE_SIZE; + while (eofoff < biosize) { + int poff = eofoff & PAGE_MASK; + if (!poff || NBPGVALID(eofbp,i)) { + bzero(d + eofoff, PAGE_SIZE - poff); + NBPGVALID_SET(eofbp, i); + } + if (bp->nb_validend == eofoff) + bp->nb_validend += PAGE_SIZE - poff; + eofoff += PAGE_SIZE - poff; + i++; + } + nfs_buf_release(eofbp); } } - /* - * NFS has embedded ucred so crhold() risks zone corruption - */ - if (bp->b_wcred == NOCRED) - bp->b_wcred = crdup(cred); /* * If dirtyend exceeds file size, chop it down. This should * not occur unless there is a race. */ - if ((off_t)bp->b_blkno * DEV_BSIZE + bp->b_dirtyend > - np->n_size) - bp->b_dirtyend = np->n_size - (off_t)bp->b_blkno * - DEV_BSIZE; + if (NBOFF(bp) + bp->nb_dirtyend > np->n_size) + bp->nb_dirtyend = np->n_size - NBOFF(bp); /* - * UBC doesn't (yet) handle partial pages so nfs_biowrite was - * hacked to never bdwrite, to start every little write right - * away. Running IE Avie noticed the performance problem, thus - * this code, which permits those delayed writes by ensuring an - * initial read of the entire page. The read may hit eof - * ("short read") but that we will handle. + * UBC doesn't handle partial pages, so we need to make sure + * that any pages left in the page cache are completely valid. + * + * Writes that are smaller than a block are delayed if they + * don't extend to the end of the block. * - * We are quite dependant on the correctness of B_CACHE so check - * that first in case of problems. + * If the block isn't (completely) cached, we may need to read + * in some parts of pages that aren't covered by the write. + * If the write offset (on) isn't page aligned, we'll need to + * read the start of the first page being written to. Likewise, + * if the offset of the end of the write (on+n) isn't page aligned, + * we'll need to read the end of the last page being written to. 
+ * + * Notes: + * We don't want to read anything we're just going to write over. + * We don't want to issue multiple I/Os if we don't have to + * (because they're synchronous rpcs). + * We don't want to read anything we already have modified in the + * page cache. */ - if (!ISSET(bp->b_flags, B_CACHE) && n < PAGE_SIZE) { - boff = (off_t)bp->b_blkno * DEV_BSIZE; - auio.uio_iov = &iov; - auio.uio_iovcnt = 1; - auio.uio_offset = boff; - auio.uio_resid = PAGE_SIZE; - auio.uio_segflg = UIO_SYSSPACE; - auio.uio_rw = UIO_READ; - auio.uio_procp = p; - iov.iov_base = bp->b_data; - iov.iov_len = PAGE_SIZE; - error = nfs_readrpc(vp, &auio, cred); - if (error) { - bp->b_error = error; - SET(bp->b_flags, B_ERROR); - printf("nfs_write: readrpc %d", error); + if (!ISSET(bp->nb_flags, NB_CACHE) && n < biosize) { + int firstpg, lastpg, dirtypg; + int firstpgoff, lastpgoff; + start = end = -1; + firstpg = on/PAGE_SIZE; + firstpgoff = on & PAGE_MASK; + lastpg = (on+n-1)/PAGE_SIZE; + lastpgoff = (on+n) & PAGE_MASK; + if (firstpgoff && !NBPGVALID(bp,firstpg)) { + /* need to read start of first page */ + start = firstpg * PAGE_SIZE; + end = start + firstpgoff; } - if (auio.uio_resid > 0) - bzero(iov.iov_base, auio.uio_resid); - bp->b_validoff = 0; - bp->b_validend = PAGE_SIZE - auio.uio_resid; - if (np->n_size > boff + bp->b_validend) - bp->b_validend = min(np->n_size - boff, - PAGE_SIZE); - bp->b_dirtyoff = 0; - bp->b_dirtyend = 0; - } - - /* - * If the new write will leave a contiguous dirty - * area, just update the b_dirtyoff and b_dirtyend, - * otherwise try to extend the dirty region. - */ - if (bp->b_dirtyend > 0 && - (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) { - off_t start, end; - - boff = (off_t)bp->b_blkno * DEV_BSIZE; - if (on > bp->b_dirtyend) { - start = boff + bp->b_validend; - end = boff + on; - } else { - start = boff + on + n; - end = boff + bp->b_validoff; + if (lastpgoff && !NBPGVALID(bp,lastpg)) { + /* need to read end of last page */ + if (start < 0) + start = (lastpg * PAGE_SIZE) + lastpgoff; + end = (lastpg + 1) * PAGE_SIZE; } - - /* - * It may be that the valid region in the buffer - * covers the region we want, in which case just - * extend the dirty region. Otherwise we try to - * extend the valid region. - */ if (end > start) { + /* need to read the data in range: start...end-1 */ + + /* + * XXX: If we know any of these reads are beyond the + * current EOF (what np->n_size was before we possibly + * just modified it above), we could short-circuit the + * reads and just zero buffer. No need to make a trip + * across the network to read nothing. + */ + + /* first, check for dirty pages in between */ + /* if there are, we'll have to do two reads because */ + /* we don't want to overwrite the dirty pages. */ + for (dirtypg=start/PAGE_SIZE; dirtypg <= (end-1)/PAGE_SIZE; dirtypg++) + if (NBPGDIRTY(bp,dirtypg)) + break; + + /* if start is at beginning of page, try */ + /* to get any preceeding pages as well. 
*/ + if (!(start & PAGE_MASK)) { + /* stop at next dirty/valid page or start of block */ + for (; start > 0; start-=PAGE_SIZE) + if (NBPGVALID(bp,((start-1)/PAGE_SIZE))) + break; + } + + NFS_BUF_MAP(bp); + /* setup uio for read(s) */ + boff = NBOFF(bp); auio.uio_iov = &iov; auio.uio_iovcnt = 1; - auio.uio_offset = start; - auio.uio_resid = end - start; auio.uio_segflg = UIO_SYSSPACE; auio.uio_rw = UIO_READ; auio.uio_procp = p; - iov.iov_base = bp->b_data + (start - boff); - iov.iov_len = end - start; + + if (dirtypg <= (end-1)/PAGE_SIZE) { + /* there's a dirty page in the way, so just do two reads */ + /* we'll read the preceding data here */ + auio.uio_offset = boff + start; + auio.uio_resid = iov.iov_len = on - start; + iov.iov_base = bp->nb_data + start; + error = nfs_readrpc(vp, &auio, cred); + if (error) { + bp->nb_error = error; + SET(bp->nb_flags, NB_ERROR); + printf("nfs_write: readrpc %d", error); + } + if (auio.uio_resid > 0) { + FSDBG(516, bp, iov.iov_base - bp->nb_data, auio.uio_resid, 0xd00dee01); + bzero(iov.iov_base, auio.uio_resid); + } + /* update validoff/validend if necessary */ + if ((bp->nb_validoff < 0) || (bp->nb_validoff > start)) + bp->nb_validoff = start; + if ((bp->nb_validend < 0) || (bp->nb_validend < on)) + bp->nb_validend = on; + if (np->n_size > boff + bp->nb_validend) + bp->nb_validend = min(np->n_size - (boff + start), biosize); + /* validate any pages before the write offset */ + for (; start < on/PAGE_SIZE; start+=PAGE_SIZE) + NBPGVALID_SET(bp, start/PAGE_SIZE); + /* adjust start to read any trailing data */ + start = on+n; + } + + /* if end is at end of page, try to */ + /* get any following pages as well. */ + if (!(end & PAGE_MASK)) { + /* stop at next valid page or end of block */ + for (; end < bufsize; end+=PAGE_SIZE) + if (NBPGVALID(bp,end/PAGE_SIZE)) + break; + } + + /* now we'll read the (rest of the) data */ + auio.uio_offset = boff + start; + auio.uio_resid = iov.iov_len = end - start; + iov.iov_base = bp->nb_data + start; error = nfs_readrpc(vp, &auio, cred); - /* - * If we couldn't read, do not do a VOP_BWRITE - * as originally coded. That could also error - * and looping back to "again" as it was doing - * could have us stuck trying to write same buf - * again. nfs_write, will get the entire region - * if nfs_readrpc succeeded. If unsuccessful - * we should just error out. Errors like ESTALE - * would keep us looping rather than transient - * errors justifying a retry. We can return here - * instead of altering dirty region later. We - * did not write old dirty region at this point. - */ if (error) { - bp->b_error = error; - SET(bp->b_flags, B_ERROR); - printf("nfs_write: readrpc2 %d", error); - brelse(bp); - return (error); + bp->nb_error = error; + SET(bp->nb_flags, NB_ERROR); + printf("nfs_write: readrpc %d", error); } - /* - * The read worked. - * If there was a short read, just zero fill. 
- */ - if (auio.uio_resid > 0) + if (auio.uio_resid > 0) { + FSDBG(516, bp, iov.iov_base - bp->nb_data, auio.uio_resid, 0xd00dee02); bzero(iov.iov_base, auio.uio_resid); - if (on > bp->b_dirtyend) - bp->b_validend = on; - else - bp->b_validoff = on + n; + } + /* update validoff/validend if necessary */ + if ((bp->nb_validoff < 0) || (bp->nb_validoff > start)) + bp->nb_validoff = start; + if ((bp->nb_validend < 0) || (bp->nb_validend < end)) + bp->nb_validend = end; + if (np->n_size > boff + bp->nb_validend) + bp->nb_validend = min(np->n_size - (boff + start), biosize); + /* validate any pages before the write offset's page */ + for (; start < trunc_page_32(on); start+=PAGE_SIZE) + NBPGVALID_SET(bp, start/PAGE_SIZE); + /* validate any pages after the range of pages being written to */ + for (; (end - 1) > round_page_32(on+n-1); end-=PAGE_SIZE) + NBPGVALID_SET(bp, (end-1)/PAGE_SIZE); + /* Note: pages being written to will be validated when written */ } - /* - * We now have a valid region which extends up to the - * dirty region which we want. - */ - if (on > bp->b_dirtyend) - bp->b_dirtyend = on; - else - bp->b_dirtyoff = on + n; } - if (ISSET(bp->b_flags, B_ERROR)) { - error = bp->b_error; - brelse(bp); + + if (ISSET(bp->nb_flags, NB_ERROR)) { + error = bp->nb_error; + nfs_buf_release(bp); + FSDBG_BOT(515, vp, uio->uio_offset, uio->uio_resid, error); return (error); } - /* - * NFS has embedded ucred so crhold() risks zone corruption - */ - if (bp->b_wcred == NOCRED) - bp->b_wcred = crdup(cred); + np->n_flag |= NMODIFIED; /* * Check for valid write lease and get one as required. - * In case getblk() and/or bwrite() delayed us. + * In case nfs_buf_get() and/or nfs_buf_write() delayed us. */ if ((nmp->nm_flag & NFSMNT_NQNFS) && NQNFS_CKINVALID(vp, np, ND_WRITE)) { @@ -767,124 +1960,222 @@ again: error = nqnfs_getlease(vp, ND_WRITE, cred, p); } while (error == NQNFS_EXPIRED); if (error) { - brelse(bp); + nfs_buf_release(bp); + FSDBG_BOT(515, vp, uio->uio_offset, 0x11220001, error); return (error); } if (np->n_lrev != np->n_brev || (np->n_flag & NQNFSNONCACHE)) { - brelse(bp); + nfs_buf_release(bp); error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); - if (error) + if (error) { + FSDBG_BOT(515, vp, uio->uio_offset, 0x11220002, error); return (error); + } np->n_brev = np->n_lrev; goto again; } } - error = uiomove((char *)bp->b_data + on, n, uio); + NFS_BUF_MAP(bp); + error = uiomove((char *)bp->nb_data + on, n, uio); if (error) { - SET(bp->b_flags, B_ERROR); - brelse(bp); + SET(bp->nb_flags, NB_ERROR); + nfs_buf_release(bp); + FSDBG_BOT(515, vp, uio->uio_offset, uio->uio_resid, error); return (error); } - if (bp->b_dirtyend > 0) { - bp->b_dirtyoff = min(on, bp->b_dirtyoff); - bp->b_dirtyend = max((on + n), bp->b_dirtyend); + + /* validate any pages written to */ + start = on & ~PAGE_MASK; + for (; start < on+n; start += PAGE_SIZE) { + NBPGVALID_SET(bp, start/PAGE_SIZE); + /* + * This may seem a little weird, but we don't actually set the + * dirty bits for writes. This is because we keep the dirty range + * in the nb_dirtyoff/nb_dirtyend fields. Also, particularly for + * delayed writes, when we give the pages back to the VM we don't + * want to keep them marked dirty, because when we later write the + * buffer we won't be able to tell which pages were written dirty + * and which pages were mmapped and dirtied. 
+ */ + } + if (bp->nb_dirtyend > 0) { + bp->nb_dirtyoff = min(on, bp->nb_dirtyoff); + bp->nb_dirtyend = max((on + n), bp->nb_dirtyend); } else { - bp->b_dirtyoff = on; - bp->b_dirtyend = on + n; + bp->nb_dirtyoff = on; + bp->nb_dirtyend = on + n; } - if (bp->b_validend == 0 || bp->b_validend < bp->b_dirtyoff || - bp->b_validoff > bp->b_dirtyend) { - bp->b_validoff = bp->b_dirtyoff; - bp->b_validend = bp->b_dirtyend; + if (bp->nb_validend <= 0 || bp->nb_validend < bp->nb_dirtyoff || + bp->nb_validoff > bp->nb_dirtyend) { + bp->nb_validoff = bp->nb_dirtyoff; + bp->nb_validend = bp->nb_dirtyend; } else { - bp->b_validoff = min(bp->b_validoff, bp->b_dirtyoff); - bp->b_validend = max(bp->b_validend, bp->b_dirtyend); + bp->nb_validoff = min(bp->nb_validoff, bp->nb_dirtyoff); + bp->nb_validend = max(bp->nb_validend, bp->nb_dirtyend); } + if (!ISSET(bp->nb_flags, NB_CACHE)) + nfs_buf_normalize_valid_range(np, bp); /* * Since this block is being modified, it must be written * again and not just committed. */ - CLR(bp->b_flags, B_NEEDCOMMIT); + if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) { + np->n_needcommitcnt--; + CHECK_NEEDCOMMITCNT(np); + } + CLR(bp->nb_flags, NB_NEEDCOMMIT); - /* - * If the lease is non-cachable or IO_SYNC do bwrite(). - */ - if ((np->n_flag & NQNFSNONCACHE) || (ioflag & IO_SYNC)) { - bp->b_proc = p; - error = VOP_BWRITE(bp); - if (error) + if ((np->n_flag & NQNFSNONCACHE) || + (ioflag & IO_SYNC) || (vp->v_flag & VNOCACHE_DATA)) { + bp->nb_proc = p; + error = nfs_buf_write(bp); + if (error) { + FSDBG_BOT(515, vp, uio->uio_offset, + uio->uio_resid, error); return (error); + } if (np->n_flag & NQNFSNONCACHE) { error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); - if (error) + if (error) { + FSDBG_BOT(515, vp, uio->uio_offset, + uio->uio_resid, error); return (error); + } } - } else if ((n + on) == biosize && - (nmp->nm_flag & NFSMNT_NQNFS) == 0) { - bp->b_proc = (struct proc *)0; - SET(bp->b_flags, B_ASYNC); - (void)nfs_writebp(bp, 0); + } else if ((n + on) == biosize && (nmp->nm_flag & NFSMNT_NQNFS) == 0) { + bp->nb_proc = (struct proc *)0; + SET(bp->nb_flags, NB_ASYNC); + nfs_buf_write(bp); } else - bdwrite(bp); + nfs_buf_write_delayed(bp); + + if (np->n_needcommitcnt > (nbuf/16)) + nfs_flushcommits(vp, p); + } while (uio->uio_resid > 0 && n > 0); + + FSDBG_BOT(515, vp, uio->uio_offset, uio->uio_resid, 0); return (0); } - /* - * Get an nfs cache block. - * Allocate a new one if the block isn't currently in the cache - * and return the block marked busy. If the calling process is - * interrupted by a signal for an interruptible mount point, return - * NULL. + * Flush out and invalidate all buffers associated with a vnode. + * Called with the underlying object locked. 
*/ -static struct buf * -nfs_getcacheblk(vp, bn, size, p, operation) - struct vnode *vp; - daddr_t bn; - int size; +static int +nfs_vinvalbuf_internal(vp, flags, cred, p, slpflag, slptimeo) + register struct vnode *vp; + int flags; + struct ucred *cred; struct proc *p; - int operation; /* defined in sys/buf.h */ + int slpflag, slptimeo; { - register struct buf *bp; - struct nfsmount *nmp = VFSTONFS(vp->v_mount); - /*due to getblk/vm interractions, use vm page size or less values */ - int biosize = min(vp->v_mount->mnt_stat.f_iosize, PAGE_SIZE); - - if (nbdwrite > ((nbuf/4)*3) && operation == BLK_WRITE) { -#define __BUFFERS_RECLAIMED 2 - struct buf *tbp[__BUFFERS_RECLAIMED]; - int i; - - /* too many delayed writes, try to free up some buffers */ - for (i = 0; i < __BUFFERS_RECLAIMED; i++) - tbp[i] = geteblk(512); - - /* Yield to IO thread */ - (void)tsleep((caddr_t)&nbdwrite, PCATCH, "nbdwrite", 1); + struct nfsbuf *bp; + struct nfsbuf *nbp, *blist; + int s, error = 0; + struct nfsnode *np = VTONFS(vp); - for (i = (__BUFFERS_RECLAIMED - 1); i >= 0; i--) - brelse(tbp[i]); + if (flags & V_SAVE) { + if (error = VOP_FSYNC(vp, cred, MNT_WAIT, p)) + return (error); + if (np->n_dirtyblkhd.lh_first) + panic("nfs_vinvalbuf: dirty bufs (vp 0x%x, bp 0x%x)", + vp, np->n_dirtyblkhd.lh_first); } - if (nmp->nm_flag & NFSMNT_INT) { - bp = getblk(vp, bn, size, PCATCH, 0, operation); - while (bp == (struct buf *)0) { - if (nfs_sigintr(nmp, (struct nfsreq *)0, p)) - return ((struct buf *)0); - bp = getblk(vp, bn, size, 0, 2 * hz, operation); - } - } else - bp = getblk(vp, bn, size, 0, 0, operation); - - if( vp->v_type == VREG) - bp->b_blkno = ((off_t)bn * biosize) / DEV_BSIZE; + for (;;) { + blist = np->n_cleanblkhd.lh_first; + if (!blist) + blist = np->n_dirtyblkhd.lh_first; + if (!blist) + break; - return (bp); + for (bp = blist; bp; bp = nbp) { + nbp = bp->nb_vnbufs.le_next; + s = splbio(); + if (ISSET(bp->nb_flags, NB_BUSY)) { + SET(bp->nb_flags, NB_WANTED); + FSDBG_TOP(556, vp, bp, NBOFF(bp), bp->nb_flags); + error = tsleep((caddr_t)bp, + slpflag | (PRIBIO + 1), "nfs_vinvalbuf", + slptimeo); + FSDBG_BOT(556, vp, bp, NBOFF(bp), bp->nb_flags); + splx(s); + if (error) { + FSDBG(554, vp, bp, -1, error); + return (error); + } + break; + } + FSDBG(554, vp, bp, NBOFF(bp), bp->nb_flags); + nfs_buf_remfree(bp); + SET(bp->nb_flags, NB_BUSY); + splx(s); + if ((flags & V_SAVE) && UBCINFOEXISTS(vp) && (NBOFF(bp) < np->n_size)) { + /* XXX extra paranoia: make sure we're not */ + /* somehow leaving any dirty data around */ + int mustwrite = 0; + int end = (NBOFF(bp) + bp->nb_bufsize >= np->n_size) ? 
+ bp->nb_bufsize : (np->n_size - NBOFF(bp)); + if (!ISSET(bp->nb_flags, NB_PAGELIST)) { + error = nfs_buf_upl_setup(bp); + if (error == EINVAL) { + /* vm object must no longer exist */ + /* hopefully we don't need to do */ + /* anything for this buffer */ + } else if (error) + printf("nfs_vinvalbuf: upl setup failed %d\n", + error); + bp->nb_valid = bp->nb_dirty = 0; + } + nfs_buf_upl_check(bp); + /* check for any dirty data before the EOF */ + if (bp->nb_dirtyend && bp->nb_dirtyoff < end) { + /* clip dirty range to EOF */ + if (bp->nb_dirtyend > end) + bp->nb_dirtyend = end; + mustwrite++; + } + bp->nb_dirty &= (1 << (round_page_32(end)/PAGE_SIZE)) - 1; + if (bp->nb_dirty) + mustwrite++; + if (mustwrite) { + FSDBG(554, vp, bp, 0xd00dee, bp->nb_flags); + if (!ISSET(bp->nb_flags, NB_PAGELIST)) + panic("nfs_vinvalbuf: dirty buffer without upl"); + /* gotta write out dirty data before invalidating */ + /* (NB_STABLE indicates that data writes should be FILESYNC) */ + /* (NB_NOCACHE indicates buffer should be discarded) */ + CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL | NB_ASYNC)); + SET(bp->nb_flags, NB_STABLE | NB_NOCACHE); + /* + * NFS has embedded ucred so crhold() risks zone corruption + */ + if (bp->nb_wcred == NOCRED) + bp->nb_wcred = crdup(cred); + error = nfs_buf_write(bp); + // Note: bp has been released + if (error) { + FSDBG(554, bp, 0xd00dee, 0xbad, error); + np->n_error = error; + np->n_flag |= NWRITEERR; + error = 0; + } + break; + } + } + SET(bp->nb_flags, NB_INVAL); + nfs_buf_release(bp); + } + } + if (np->n_dirtyblkhd.lh_first || np->n_cleanblkhd.lh_first) + panic("nfs_vinvalbuf: flush failed"); + return (0); } + /* * Flush and invalidate all dirty buffers. If another process is already * doing the flush, just wait for completion. @@ -902,7 +2193,9 @@ nfs_vinvalbuf(vp, flags, cred, p, intrflg) int error = 0, slpflag, slptimeo; int didhold = 0; - if ((nmp->nm_flag & NFSMNT_INT) == 0) + FSDBG_TOP(554, vp, flags, intrflg, 0); + + if (nmp && ((nmp->nm_flag & NFSMNT_INT) == 0)) intrflg = 0; if (intrflg) { slpflag = PCATCH; @@ -916,36 +2209,33 @@ nfs_vinvalbuf(vp, flags, cred, p, intrflg) */ while (np->n_flag & NFLUSHINPROG) { np->n_flag |= NFLUSHWANT; - error = tsleep((caddr_t)&np->n_flag, PRIBIO + 2, "nfsvinval", - slptimeo); - if (error && intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p)) - return (EINTR); + FSDBG_TOP(555, vp, flags, intrflg, np->n_flag); + error = tsleep((caddr_t)&np->n_flag, PRIBIO + 2, "nfsvinval", slptimeo); + FSDBG_BOT(555, vp, flags, intrflg, np->n_flag); + if (error && (error = nfs_sigintr(VFSTONFS(vp->v_mount), NULL, p))) { + FSDBG_BOT(554, vp, flags, intrflg, error); + return (error); + } } /* * Now, flush as required. */ np->n_flag |= NFLUSHINPROG; - error = vinvalbuf(vp, flags, cred, p, slpflag, 0); + error = nfs_vinvalbuf_internal(vp, flags, cred, p, slpflag, 0); while (error) { - /* we seem to be stuck in a loop here if the thread got aborted. - * nfs_flush will return EINTR. Not sure if that will cause - * other consequences due to EINTR having other meanings in NFS - * To handle, no dirty pages, it seems safe to just return from - * here. But if we did have dirty pages, how would we get them - * written out if thread was aborted? Some other strategy is - * necessary. 
-- EKN - */ - if ((intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p)) || - (error == EINTR && current_thread_aborted())) { + FSDBG(554, vp, 0, 0, error); + error = nfs_sigintr(VFSTONFS(vp->v_mount), NULL, p); + if (error) { np->n_flag &= ~NFLUSHINPROG; if (np->n_flag & NFLUSHWANT) { np->n_flag &= ~NFLUSHWANT; wakeup((caddr_t)&np->n_flag); } - return (EINTR); + FSDBG_BOT(554, vp, flags, intrflg, error); + return (error); } - error = vinvalbuf(vp, flags, cred, p, 0, slptimeo); + error = nfs_vinvalbuf_internal(vp, flags, cred, p, 0, slptimeo); } np->n_flag &= ~(NMODIFIED | NFLUSHINPROG); if (np->n_flag & NFLUSHWANT) { @@ -954,9 +2244,12 @@ nfs_vinvalbuf(vp, flags, cred, p, intrflg) } didhold = ubc_hold(vp); if (didhold) { - (void) ubc_clean(vp, 1); /* get the pages out of vm also */ + int rv = ubc_clean(vp, 1); /* get the pages out of vm also */ + if (!rv) + panic("nfs_vinvalbuf(): ubc_clean failed!"); ubc_rele(vp); } + FSDBG_BOT(554, vp, flags, intrflg, 0); return (0); } @@ -967,7 +2260,7 @@ nfs_vinvalbuf(vp, flags, cred, p, intrflg) */ int nfs_asyncio(bp, cred) - register struct buf *bp; + struct nfsbuf *bp; struct ucred *cred; { struct nfsmount *nmp; @@ -975,17 +2268,23 @@ nfs_asyncio(bp, cred) int gotiod; int slpflag = 0; int slptimeo = 0; - int error; + int error, error2; if (nfs_numasync == 0) return (EIO); - - nmp = VFSTONFS(bp->b_vp->v_mount); + + FSDBG_TOP(552, bp, bp ? NBOFF(bp) : 0, bp ? bp->nb_flags : 0, 0); + + nmp = ((bp != NULL) ? VFSTONFS(bp->nb_vp->v_mount) : NULL); again: - if (nmp->nm_flag & NFSMNT_INT) + if (nmp && nmp->nm_flag & NFSMNT_INT) slpflag = PCATCH; gotiod = FALSE; + /* no nfsbuf means tell nfsiod to process delwri list */ + if (!bp) + nfs_ioddelwri = 1; + /* * Find a free iod to process this request. */ @@ -1000,12 +2299,17 @@ again: i, nmp)); nfs_iodwant[i] = (struct proc *)0; nfs_iodmount[i] = nmp; - nmp->nm_bufqiods++; + if (nmp) + nmp->nm_bufqiods++; wakeup((caddr_t)&nfs_iodwant[i]); gotiod = TRUE; break; } + /* if we're just poking the delwri list, we're done */ + if (!bp) + return (0); + /* * If none are free, we may already have an iod working on this mount * point. If so, it will process our request. @@ -1023,19 +2327,31 @@ again: * If we have an iod which can process the request, then queue * the buffer. */ + FSDBG(552, bp, gotiod, i, nmp->nm_bufqiods); if (gotiod) { /* * Ensure that the queue never grows too large. */ while (nmp->nm_bufqlen >= 2*nfs_numasync) { + if (ISSET(bp->nb_flags, NB_IOD)) { + /* An nfsiod is attempting this async operation so */ + /* we must not fall asleep on the bufq because we */ + /* could be waiting on ourself. Just return error */ + /* and we'll do this operation syncrhonously. */ + goto out; + } + FSDBG(552, bp, nmp->nm_bufqlen, 2*nfs_numasync, -1); NFS_DPF(ASYNCIO, ("nfs_asyncio: waiting for mount %p queue to drain\n", nmp)); nmp->nm_bufqwant = TRUE; error = tsleep(&nmp->nm_bufq, slpflag | PRIBIO, "nfsaio", slptimeo); if (error) { - if (nfs_sigintr(nmp, NULL, bp->b_proc)) - return (EINTR); + error2 = nfs_sigintr(nmp, NULL, bp->nb_proc); + if (error2) { + FSDBG_BOT(552, bp, NBOFF(bp), bp->nb_flags, error2); + return (error2); + } if (slpflag == PCATCH) { slpflag = 0; slptimeo = 2 * hz; @@ -1052,35 +2368,38 @@ again: } } - if (ISSET(bp->b_flags, B_READ)) { - if (bp->b_rcred == NOCRED && cred != NOCRED) { + if (ISSET(bp->nb_flags, NB_READ)) { + if (bp->nb_rcred == NOCRED && cred != NOCRED) { /* * NFS has embedded ucred. 
* Can not crhold() here as that causes zone corruption */ - bp->b_rcred = crdup(cred); + bp->nb_rcred = crdup(cred); } } else { - SET(bp->b_flags, B_WRITEINPROG); - if (bp->b_wcred == NOCRED && cred != NOCRED) { + SET(bp->nb_flags, NB_WRITEINPROG); + if (bp->nb_wcred == NOCRED && cred != NOCRED) { /* * NFS has embedded ucred. * Can not crhold() here as that causes zone corruption */ - bp->b_wcred = crdup(cred); + bp->nb_wcred = crdup(cred); } } - TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, b_freelist); + TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, nb_free); nmp->nm_bufqlen++; + FSDBG_BOT(552, bp, NBOFF(bp), bp->nb_flags, 0); return (0); } +out: /* * All the iods are busy on other mounts, so return EIO to * force the caller to process the i/o synchronously. */ NFS_DPF(ASYNCIO, ("nfs_asyncio: no iods available, i/o is synchronous\n")); + FSDBG_BOT(552, bp, NBOFF(bp), bp->nb_flags, EIO); return (EIO); } @@ -1090,7 +2409,7 @@ again: */ int nfs_doio(bp, cr, p) - register struct buf *bp; + struct nfsbuf *bp; struct ucred *cr; struct proc *p; { @@ -1102,7 +2421,7 @@ nfs_doio(bp, cr, p) struct uio uio; struct iovec io; - vp = bp->b_vp; + vp = bp->nb_vp; np = VTONFS(vp); nmp = VFSTONFS(vp->v_mount); uiop = &uio; @@ -1111,66 +2430,34 @@ nfs_doio(bp, cr, p) uiop->uio_segflg = UIO_SYSSPACE; uiop->uio_procp = p; - /* - * With UBC, getblk() can return a buf with B_DONE set. - * This indicates that the VM has valid data for that page. - * NFS being stateless, this case poses a problem. - * By definition, the NFS server should always be consulted - * for the data in that page. - * So we choose to clear the B_DONE and to do the IO. - * - * XXX revisit this if there is a performance issue. - * XXX In that case, we could play the attribute cache games ... + /* + * we've decided to perform I/O for this block, + * so we couldn't possibly NB_DONE. So, clear it. */ - if (ISSET(bp->b_flags, B_DONE)) { - if (!ISSET(bp->b_flags, B_ASYNC)) + if (ISSET(bp->nb_flags, NB_DONE)) { + if (!ISSET(bp->nb_flags, NB_ASYNC)) panic("nfs_doio: done and not async"); - CLR(bp->b_flags, B_DONE); + CLR(bp->nb_flags, NB_DONE); } - FSDBG_TOP(256, np->n_size, bp->b_blkno * DEV_BSIZE, bp->b_bcount, - bp->b_flags); - FSDBG(257, bp->b_validoff, bp->b_validend, bp->b_dirtyoff, - bp->b_dirtyend); - /* - * Historically, paging was done with physio, but no more. - */ - if (ISSET(bp->b_flags, B_PHYS)) { - /* - * ...though reading /dev/drum still gets us here. 
- */ - io.iov_len = uiop->uio_resid = bp->b_bcount; - /* mapping was done by vmapbuf() */ - io.iov_base = bp->b_data; - uiop->uio_offset = (off_t)bp->b_blkno * DEV_BSIZE; - if (ISSET(bp->b_flags, B_READ)) { - uiop->uio_rw = UIO_READ; - nfsstats.read_physios++; - error = nfs_readrpc(vp, uiop, cr); - } else { - int com; - - iomode = NFSV3WRITE_DATASYNC; - uiop->uio_rw = UIO_WRITE; - nfsstats.write_physios++; - error = nfs_writerpc(vp, uiop, cr, &iomode, &com); - } - if (error) { - SET(bp->b_flags, B_ERROR); - bp->b_error = error; - } - } else if (ISSET(bp->b_flags, B_READ)) { - io.iov_len = uiop->uio_resid = bp->b_bcount; - io.iov_base = bp->b_data; + FSDBG_TOP(256, np->n_size, NBOFF(bp), bp->nb_bufsize, bp->nb_flags); + FSDBG(257, bp->nb_validoff, bp->nb_validend, bp->nb_dirtyoff, + bp->nb_dirtyend); + + if (ISSET(bp->nb_flags, NB_READ)) { + if (vp->v_type == VREG) + NFS_BUF_MAP(bp); + io.iov_len = uiop->uio_resid = bp->nb_bufsize; + io.iov_base = bp->nb_data; uiop->uio_rw = UIO_READ; switch (vp->v_type) { case VREG: - uiop->uio_offset = (off_t)bp->b_blkno * DEV_BSIZE; + uiop->uio_offset = NBOFF(bp); nfsstats.read_bios++; error = nfs_readrpc(vp, uiop, cr); - FSDBG(262, np->n_size, bp->b_blkno * DEV_BSIZE, - uiop->uio_resid, error); + FSDBG(262, np->n_size, NBOFF(bp), uiop->uio_resid, error); if (!error) { - bp->b_validoff = 0; + /* update valid range */ + bp->nb_validoff = 0; if (uiop->uio_resid) { /* * If len > 0, there is a hole in the file and @@ -1178,33 +2465,26 @@ nfs_doio(bp, cr, p) * the server yet. * Just zero fill the rest of the valid area. */ - diff = bp->b_bcount - uiop->uio_resid; - len = np->n_size - ((u_quad_t)bp->b_blkno * DEV_BSIZE + - diff); + diff = bp->nb_bufsize - uiop->uio_resid; + len = np->n_size - (NBOFF(bp) + diff); if (len > 0) { len = min(len, uiop->uio_resid); - bzero((char *)bp->b_data + diff, len); - bp->b_validend = diff + len; + bzero((char *)bp->nb_data + diff, len); + bp->nb_validend = diff + len; FSDBG(258, diff, len, 0, 1); } else - bp->b_validend = diff; + bp->nb_validend = diff; } else - bp->b_validend = bp->b_bcount; - - if (bp->b_validend < bp->b_bufsize) { - /* - * we're about to release a partial buffer after a - * read... 
the only way we should get here is if - * this buffer contains the EOF before releasing it, - * we'll zero out to the end of the buffer so that - * if a mmap of this page occurs, we'll see zero's - * even if a ftruncate extends the file in the - * meantime - */ - bzero((caddr_t)(bp->b_data + bp->b_validend), - bp->b_bufsize - bp->b_validend); - FSDBG(258, bp->b_validend, - bp->b_bufsize - bp->b_validend, 0, 2); + bp->nb_validend = bp->nb_bufsize; + bp->nb_valid = (1 << (round_page_32(bp->nb_validend)/PAGE_SIZE)) - 1; + if (bp->nb_validend & PAGE_MASK) { + /* valid range ends in the middle of a page so we */ + /* need to zero-fill any invalid data at the end */ + /* of the last page */ + bzero((caddr_t)(bp->nb_data + bp->nb_validend), + bp->nb_bufsize - bp->nb_validend); + FSDBG(258, bp->nb_validend, + bp->nb_bufsize - bp->nb_validend, 0, 2); } } if (p && (vp->v_flag & VTEXT) && @@ -1222,10 +2502,14 @@ nfs_doio(bp, cr, p) uiop->uio_offset = (off_t)0; nfsstats.readlink_bios++; error = nfs_readlinkrpc(vp, uiop, cr); + if (!error) { + bp->nb_validoff = 0; + bp->nb_validend = uiop->uio_offset; + } break; case VDIR: nfsstats.readdir_bios++; - uiop->uio_offset = ((u_quad_t)bp->b_lblkno) * NFS_DIRBLKSIZ; + uiop->uio_offset = NBOFF(bp); if (!(nmp->nm_flag & NFSMNT_NFSV3)) nmp->nm_flag &= ~NFSMNT_RDIRPLUS; /* dk@farm.org */ if (nmp->nm_flag & NFSMNT_RDIRPLUS) { @@ -1235,151 +2519,276 @@ nfs_doio(bp, cr, p) } if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0) error = nfs_readdirrpc(vp, uiop, cr); + if (!error) { + bp->nb_validoff = 0; + bp->nb_validend = uiop->uio_offset - NBOFF(bp); + bp->nb_valid = (1 << (round_page_32(bp->nb_validend)/PAGE_SIZE)) - 1; + } break; default: printf("nfs_doio: type %x unexpected\n", vp->v_type); break; }; if (error) { - SET(bp->b_flags, B_ERROR); - bp->b_error = error; + SET(bp->nb_flags, NB_ERROR); + bp->nb_error = error; } + } else { + /* we're doing a write */ + int doff, dend = 0; + + /* We need to make sure the pages are locked before doing I/O. */ + if (!ISSET(bp->nb_flags, NB_META) && UBCISVALID(vp)) { + if (!ISSET(bp->nb_flags, NB_PAGELIST)) { + error = nfs_buf_upl_setup(bp); + if (error) { + printf("nfs_doio: upl create failed %d\n", error); + SET(bp->nb_flags, NB_ERROR); + bp->nb_error = EIO; + return (EIO); + } + nfs_buf_upl_check(bp); + } + } + + if (ISSET(bp->nb_flags, NB_WASDIRTY)) { + FSDBG(256, bp, NBOFF(bp), bp->nb_dirty, 0xd00dee); + /* + * There are pages marked dirty that need to be written out. + * + * We don't want to just combine the write range with the + * range of pages that are dirty because that could cause us + * to write data that wasn't actually written to. + * We also don't want to write data more than once. + * + * If the dirty range just needs to be committed, we do that. + * Otherwise, we write the dirty range and clear the dirty bits + * for any COMPLETE pages covered by that range. + * If there are dirty pages left after that, we write out the + * parts that we haven't written yet. + */ + } + /* - * mapped I/O may have altered any bytes, so we extend - * the dirty zone to the valid zone. For best performance - * a better solution would be to save & restore page dirty bits - * around the uiomove which brings write-data into the buffer. - * Then here we'd check if the page is dirty rather than WASMAPPED - * Also vnode_pager would change - if a page is clean it might - * still need to be written due to DELWRI. + * If NB_NEEDCOMMIT is set, a commit rpc may do the trick. If not + * an actual write will have to be done. 
+ * If NB_WRITEINPROG is already set, then push it with a write anyhow. */ - if (UBCINFOEXISTS(vp) && ubc_issetflags(vp, UI_WASMAPPED)) { - bp->b_dirtyoff = min(bp->b_dirtyoff, bp->b_validoff); - bp->b_dirtyend = max(bp->b_dirtyend, bp->b_validend); + if ((bp->nb_flags & (NB_NEEDCOMMIT | NB_WRITEINPROG)) == NB_NEEDCOMMIT) { + doff = NBOFF(bp) + bp->nb_dirtyoff; + SET(bp->nb_flags, NB_WRITEINPROG); + error = nfs_commit(vp, doff, bp->nb_dirtyend - bp->nb_dirtyoff, + bp->nb_wcred, bp->nb_proc); + CLR(bp->nb_flags, NB_WRITEINPROG); + if (!error) { + bp->nb_dirtyoff = bp->nb_dirtyend = 0; + CLR(bp->nb_flags, NB_NEEDCOMMIT); + np->n_needcommitcnt--; + CHECK_NEEDCOMMITCNT(np); + } else if (error == NFSERR_STALEWRITEVERF) + nfs_clearcommit(vp->v_mount); } - if ((off_t)bp->b_blkno * DEV_BSIZE + bp->b_dirtyend > np->n_size) - bp->b_dirtyend = np->n_size - (off_t)bp->b_blkno * DEV_BSIZE; - - if (bp->b_dirtyend > bp->b_dirtyoff) { - io.iov_len = uiop->uio_resid = bp->b_dirtyend - bp->b_dirtyoff; - uiop->uio_offset = (off_t)bp->b_blkno * DEV_BSIZE + - bp->b_dirtyoff; - io.iov_base = (char *)bp->b_data + bp->b_dirtyoff; - uiop->uio_rw = UIO_WRITE; - nfsstats.write_bios++; - if ((bp->b_flags & (B_ASYNC | B_NEEDCOMMIT | B_NOCACHE)) == - B_ASYNC) + if (!error && bp->nb_dirtyend > 0) { + /* there's a dirty range that needs to be written out */ + u_int32_t pagemask; + int firstpg, lastpg; + + if (NBOFF(bp) + bp->nb_dirtyend > np->n_size) + bp->nb_dirtyend = np->n_size - NBOFF(bp); + + NFS_BUF_MAP(bp); + + doff = bp->nb_dirtyoff; + dend = bp->nb_dirtyend; + + /* if doff page is dirty, move doff to start of page */ + if (NBPGDIRTY(bp,doff/PAGE_SIZE)) + doff -= doff & PAGE_MASK; + /* try to expand write range to include preceding dirty pages */ + if (!(doff & PAGE_MASK)) + while (doff > 0 && NBPGDIRTY(bp,(doff-1)/PAGE_SIZE)) + doff -= PAGE_SIZE; + /* if dend page is dirty, move dend to start of next page */ + if ((dend & PAGE_MASK) && NBPGDIRTY(bp,dend/PAGE_SIZE)) + dend = round_page_32(dend); + /* try to expand write range to include trailing dirty pages */ + if (!(dend & PAGE_MASK)) + while (dend < bp->nb_bufsize && NBPGDIRTY(bp,dend/PAGE_SIZE)) + dend += PAGE_SIZE; + /* make sure to keep dend clipped to EOF */ + if (NBOFF(bp) + dend > np->n_size) + dend = np->n_size - NBOFF(bp); + /* calculate range of complete pages being written */ + firstpg = round_page_32(doff) / PAGE_SIZE; + lastpg = (trunc_page_32(dend) - 1)/ PAGE_SIZE; + /* calculate mask for that page range */ + pagemask = ((1 << (lastpg+1)) - 1) & ~((1 << firstpg) - 1); + + /* compare page mask to nb_dirty; if there are other dirty pages */ + /* then write FILESYNC; otherwise, write UNSTABLE if async and */ + /* not needcommit/nocache/call; otherwise write FILESYNC */ + if (bp->nb_dirty & ~pagemask) + iomode = NFSV3WRITE_FILESYNC; + else if ((bp->nb_flags & (NB_ASYNC | NB_NEEDCOMMIT | NB_NOCACHE | NB_STABLE)) == NB_ASYNC) iomode = NFSV3WRITE_UNSTABLE; else iomode = NFSV3WRITE_FILESYNC; - SET(bp->b_flags, B_WRITEINPROG); + + /* write the dirty range */ + io.iov_len = uiop->uio_resid = dend - doff; + uiop->uio_offset = NBOFF(bp) + doff; + io.iov_base = (char *)bp->nb_data + doff; + uiop->uio_rw = UIO_WRITE; + + nfsstats.write_bios++; + + SET(bp->nb_flags, NB_WRITEINPROG); error = nfs_writerpc(vp, uiop, cr, &iomode, &must_commit); - if (!error && iomode == NFSV3WRITE_UNSTABLE) - SET(bp->b_flags, B_NEEDCOMMIT); - else - CLR(bp->b_flags, B_NEEDCOMMIT); - CLR(bp->b_flags, B_WRITEINPROG); + if (must_commit) + nfs_clearcommit(vp->v_mount); + /* clear dirty 
bits for pages we've written */ + if (!error) + bp->nb_dirty &= ~pagemask; + /* set/clear needcommit flag */ + if (!error && iomode == NFSV3WRITE_UNSTABLE) { + if (!ISSET(bp->nb_flags, NB_NEEDCOMMIT)) + np->n_needcommitcnt++; + SET(bp->nb_flags, NB_NEEDCOMMIT); + /* make sure nb_dirtyoff/nb_dirtyend reflect actual range written */ + bp->nb_dirtyoff = doff; + bp->nb_dirtyend = dend; + } else { + if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) { + np->n_needcommitcnt--; + CHECK_NEEDCOMMITCNT(np); + } + CLR(bp->nb_flags, NB_NEEDCOMMIT); + } + CLR(bp->nb_flags, NB_WRITEINPROG); /* - * For an interrupted write, the buffer is still valid - * and the write hasn't been pushed to the server yet, - * so we can't set B_ERROR and report the interruption - * by setting B_EINTR. For the B_ASYNC case, B_EINTR - * is not relevant, so the rpc attempt is essentially - * a noop. For the case of a V3 write rpc not being - * committed to stable storage, the block is still - * dirty and requires either a commit rpc or another - * write rpc with iomode == NFSV3WRITE_FILESYNC before - * the block is reused. This is indicated by setting - * the B_DELWRI and B_NEEDCOMMIT flags. + * For an interrupted write, the buffer is still valid and the write + * hasn't been pushed to the server yet, so we can't set NB_ERROR and + * report the interruption by setting NB_EINTR. For the NB_ASYNC case, + * NB_EINTR is not relevant. + * + * For the case of a V3 write rpc not being committed to stable + * storage, the block is still dirty and requires either a commit rpc + * or another write rpc with iomode == NFSV3WRITE_FILESYNC before the + * block is reused. This is indicated by setting the NB_DELWRI and + * NB_NEEDCOMMIT flags. */ - if (error == EINTR || (!error && bp->b_flags & B_NEEDCOMMIT)) { - int s; - - CLR(bp->b_flags, B_INVAL | B_NOCACHE); - if (!ISSET(bp->b_flags, B_DELWRI)) { - SET(bp->b_flags, B_DELWRI); - nbdwrite++; - } - FSDBG(261, bp->b_validoff, bp->b_validend, - bp->b_bufsize, bp->b_bcount); - /* - * Since for the B_ASYNC case, nfs_bwrite() has - * reassigned the buffer to the clean list, we have to - * reassign it back to the dirty one. Ugh. - */ - if (ISSET(bp->b_flags, B_ASYNC)) { - s = splbio(); - reassignbuf(bp, vp); - splx(s); - } else { - SET(bp->b_flags, B_EINTR); - } + if (error == EINTR || (!error && bp->nb_flags & NB_NEEDCOMMIT)) { + CLR(bp->nb_flags, NB_INVAL | NB_NOCACHE); + if (!ISSET(bp->nb_flags, NB_DELWRI)) { + SET(bp->nb_flags, NB_DELWRI); + nfs_nbdwrite++; + NFSBUFCNTCHK(); + } + FSDBG(261, bp->nb_validoff, bp->nb_validend, + bp->nb_bufsize, 0); + /* + * Since for the NB_ASYNC case, nfs_bwrite() has + * reassigned the buffer to the clean list, we have to + * reassign it back to the dirty one. Ugh. + */ + if (ISSET(bp->nb_flags, NB_ASYNC)) { + /* move to dirty list */ + int s = splbio(); + if (bp->nb_vnbufs.le_next != NFSNOLIST) + LIST_REMOVE(bp, nb_vnbufs); + LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs); + splx(s); + } else { + SET(bp->nb_flags, NB_EINTR); + } } else { + /* either there's an error or we don't need to commit */ if (error) { - SET(bp->b_flags, B_ERROR); - bp->b_error = np->n_error = error; - np->n_flag |= NWRITEERR; - } - bp->b_dirtyoff = bp->b_dirtyend = 0; - - /* - * validoff and validend represent the real data present - * in this buffer if validoff is non-zero, than we have - * to invalidate the buffer and kill the page when - * biodone is called... 
the same is also true when - * validend doesn't extend all the way to the end of the - * buffer and validend doesn't equate to the current - * EOF... eventually we need to deal with this in a more - * humane way (like keeping the partial buffer without - * making it immediately available to the VM page cache) - */ - if (bp->b_validoff) - SET(bp->b_flags, B_INVAL); - else - if (bp->b_validend < bp->b_bufsize) { - if ((off_t)bp->b_blkno * DEV_BSIZE + - bp->b_validend == np->n_size) { - bzero((caddr_t)(bp->b_data + - bp->b_validend), - bp->b_bufsize - bp->b_validend); - FSDBG(259, bp->b_validend, - bp->b_bufsize - bp->b_validend, 0, - 0); - } else - SET(bp->b_flags, B_INVAL); + SET(bp->nb_flags, NB_ERROR); + bp->nb_error = np->n_error = error; + np->n_flag |= NWRITEERR; } + /* clear the dirty range */ + bp->nb_dirtyoff = bp->nb_dirtyend = 0; } + } + + if (!error && bp->nb_dirty) { + /* there are pages marked dirty that need to be written out */ + int pg, cnt, npages, off, len; + + nfsstats.write_bios++; - } else { - if (bp->b_validoff || - (bp->b_validend < bp->b_bufsize && - (off_t)bp->b_blkno * DEV_BSIZE + bp->b_validend != - np->n_size)) { - SET(bp->b_flags, B_INVAL); + NFS_BUF_MAP(bp); + + /* + * we do these writes synchronously because we can't really + * support the unstable/needommit method. We could write + * them unstable, clear the dirty bits, and then commit the + * whole block later, but if we need to rewrite the data, we + * won't have any idea which pages were written because that + * info can't be stored in the nb_dirtyoff/nb_dirtyend. We + * also can't leave the dirty bits set because then we wouldn't + * be able to tell if the pages were re-dirtied between the end + * of the write and the commit. + */ + iomode = NFSV3WRITE_FILESYNC; + uiop->uio_rw = UIO_WRITE; + + SET(bp->nb_flags, NB_WRITEINPROG); + npages = bp->nb_bufsize/PAGE_SIZE; + for (pg=0; pg < npages; pg++) { + if (!NBPGDIRTY(bp,pg)) + continue; + cnt = 1; + while (((pg+cnt) < npages) && NBPGDIRTY(bp,pg+cnt)) + cnt++; + /* write cnt pages starting with page pg */ + off = pg * PAGE_SIZE; + len = cnt * PAGE_SIZE; + + /* clip writes to EOF */ + if (NBOFF(bp) + off + len > np->n_size) + len -= (NBOFF(bp) + off + len) - np->n_size; + if (len > 0) { + io.iov_len = uiop->uio_resid = len; + uiop->uio_offset = NBOFF(bp) + off; + io.iov_base = (char *)bp->nb_data + off; + error = nfs_writerpc(vp, uiop, cr, &iomode, &must_commit); + if (must_commit) + nfs_clearcommit(vp->v_mount); + if (error) + break; + } + /* clear dirty bits */ + while (cnt--) { + bp->nb_dirty &= ~(1 << pg); + /* leave pg on last page */ + if (cnt) pg++; + } } - if (bp->b_flags & B_INVAL) { - FSDBG(260, bp->b_validoff, bp->b_validend, - bp->b_bufsize, bp->b_bcount); + if (!error) { + if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) { + np->n_needcommitcnt--; + CHECK_NEEDCOMMITCNT(np); + } + CLR(bp->nb_flags, NB_NEEDCOMMIT); } - bp->b_resid = 0; - biodone(bp); - FSDBG_BOT(256, bp->b_validoff, bp->b_validend, bp->b_bufsize, + CLR(bp->nb_flags, NB_WRITEINPROG); + FSDBG_BOT(256, bp->nb_validoff, bp->nb_validend, bp->nb_bufsize, np->n_size); - return (0); } - } - bp->b_resid = uiop->uio_resid; - if (must_commit) - nfs_clearcommit(vp->v_mount); - if (bp->b_flags & B_INVAL) { - FSDBG(260, bp->b_validoff, bp->b_validend, bp->b_bufsize, - bp->b_bcount); + if (error) { + SET(bp->nb_flags, NB_ERROR); + bp->nb_error = error; + } } - FSDBG_BOT(256, bp->b_validoff, bp->b_validend, bp->b_bcount, error); - biodone(bp); + FSDBG_BOT(256, bp->nb_validoff, bp->nb_validend, 
bp->nb_bufsize, error);
+
+	nfs_buf_iodone(bp);
+	return (error);
+ }
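
The write path in nfs_doio() above keeps a contiguous dirty byte range in nb_dirtyoff/nb_dirtyend and, separately, a per-page dirty bitmap in nb_dirty, and it only lets an async write go out UNSTABLE when the byte range covers every dirty page. The standalone sketch below is not part of the diff: the dirty_pagemask() helper, the fixed 4 KB EX_PAGE_SIZE, and the sample values are illustrative assumptions. It simply mirrors the firstpg/lastpg/pagemask arithmetic from the diff to show how that FILESYNC-vs-UNSTABLE decision falls out of the bit masks.

#include <stdint.h>
#include <stdio.h>

#define EX_PAGE_SIZE	4096			/* illustrative page size, not the kernel's PAGE_SIZE */
#define EX_PAGE_MASK	(EX_PAGE_SIZE - 1)
#define ex_trunc_page(x)	((x) & ~EX_PAGE_MASK)
#define ex_round_page(x)	(((x) + EX_PAGE_MASK) & ~EX_PAGE_MASK)

/*
 * Given a dirty byte range [doff, dend) within a buffer, build a bitmask of
 * the pages that the range covers completely.  This mirrors the
 * firstpg/lastpg/pagemask computation in nfs_doio(): partially covered pages
 * at either edge are excluded, since writing them does not make the whole
 * page clean.
 */
static uint32_t
dirty_pagemask(int doff, int dend)
{
	int firstpg = ex_round_page(doff) / EX_PAGE_SIZE;	/* first fully covered page */
	int lastpg  = (ex_trunc_page(dend) - 1) / EX_PAGE_SIZE;	/* last fully covered page */

	if (lastpg < firstpg)
		return (0);	/* guard for ranges shorter than a page (sketch only) */
	return (((1u << (lastpg + 1)) - 1) & ~((1u << firstpg) - 1));
}

int
main(void)
{
	uint32_t nb_dirty = 0x0000003c;		/* assume pages 2-5 carry mmap-dirtied data */
	int doff = 2 * EX_PAGE_SIZE;		/* dirty byte range: pages 2-4 plus part of page 5 */
	int dend = 5 * EX_PAGE_SIZE + 100;

	uint32_t pagemask = dirty_pagemask(doff, dend);

	/*
	 * If any dirty page lies outside the range being written, the write
	 * must be FILESYNC; otherwise an async write may go out UNSTABLE and
	 * be committed later.
	 */
	if (nb_dirty & ~pagemask)
		printf("dirty pages outside write range -> FILESYNC\n");
	else
		printf("write covers all dirty pages -> UNSTABLE is safe\n");
	return (0);
}

As the comments in nfs_doio() explain, any dirty page the write range does not fully cover forces NFSV3WRITE_FILESYNC: an unstable write would leave no way to tell afterwards which pages still need a commit and which were re-dirtied through the mapping before the commit went out.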