X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/0b4e3aa066abc0728aacb4bbeb86f53f9737156e..d7e50217d7adf6e52786a38bcaa4cd698cb9a79e:/bsd/nfs/nfs_bio.c

diff --git a/bsd/nfs/nfs_bio.c b/bsd/nfs/nfs_bio.c
index 040678d37..7f41efe13 100644
--- a/bsd/nfs/nfs_bio.c
+++ b/bsd/nfs/nfs_bio.c
@@ -3,19 +3,22 @@
  *
  * @APPLE_LICENSE_HEADER_START@
  * 
- * The contents of this file constitute Original Code as defined in and
- * are subject to the Apple Public Source License Version 1.1 (the
- * "License").  You may not use this file except in compliance with the
- * License.  Please obtain a copy of the License at
- * http://www.apple.com/publicsource and read it before using this file.
+ * Copyright (c) 1999-2003 Apple Computer, Inc.  All Rights Reserved.
  * 
- * This Original Code and all software distributed under the License are
- * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this
+ * file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
- * License for the specific language governing rights and limitations
- * under the License.
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
  *
  * @APPLE_LICENSE_HEADER_END@
  */
@@ -58,7 +61,6 @@
  *	@(#)nfs_bio.c	8.9 (Berkeley) 3/30/95
  * FreeBSD-Id: nfs_bio.c,v 1.44 1997/09/10 19:52:25 phk Exp $
  */
-
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/resourcevar.h>
@@ -86,14 +88,22 @@
 #include <sys/kdebug.h>
 
+#define FSDBG(A, B, C, D, E) \
+	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_NONE, \
+		(int)(B), (int)(C), (int)(D), (int)(E), 0)
+#define FSDBG_TOP(A, B, C, D, E) \
+	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_START, \
+		(int)(B), (int)(C), (int)(D), (int)(E), 0)
+#define FSDBG_BOT(A, B, C, D, E) \
+	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_END, \
+		(int)(B), (int)(C), (int)(D), (int)(E), 0)
+
 static struct buf *nfs_getcacheblk __P((struct vnode *vp, daddr_t bn, int size,
					struct proc *p, int operation));
-static struct buf *nfs_getwriteblk __P((struct vnode *vp, daddr_t bn,
-					int size, struct proc *p,
-					struct ucred *cred, int off, int len));
 
 extern int nfs_numasync;
 extern struct nfsstats nfsstats;
+extern int nbdwrite;
 
 /*
  * Vnode op for read using bio
@@ -108,7 +118,8 @@ nfs_bioread(vp, uio, ioflag, cred, getpages)
	int getpages;
 {
	register struct nfsnode *np = VTONFS(vp);
-	register int biosize, diff, i;
+	register int biosize, i;
+	off_t diff;
	struct buf *bp = 0, *rabp;
	struct vattr vattr;
	struct proc *p;
@@ -129,7 +140,7 @@ nfs_bioread(vp, uio, ioflag, cred, getpages)
	p = uio->uio_procp;
	if ((nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_GOTFSINFO)) == NFSMNT_NFSV3)
		(void)nfs_fsinfo(nmp, vp, cred, p);
-	/*due to getblk/vm interractions, use vm page size or less values */
+	/*due to getblk/vm interractions, use vm page size or less values */
	biosize = min(vp->v_mount->mnt_stat.f_iosize, PAGE_SIZE);
	/*
	 * For nfs, cache consistency can only be maintained approximately.
@@ -231,7 +242,8 @@ nfs_bioread(vp, uio, ioflag, cred, getpages)
		 */
		if (nfs_numasync > 0 && nmp->nm_readahead > 0) {
		    for (nra = 0; nra < nmp->nm_readahead &&
-			(off_t)(lbn + 1 + nra) * biosize < np->n_size; nra++) {
+			(off_t)(lbn + 1 + nra) * biosize < np->n_size;
+			nra++) {
			rabn = lbn + 1 + nra;
			if (!incore(vp, rabn)) {
			    rabp = nfs_getcacheblk(vp, rabn, biosize, p, operation);
@@ -260,7 +272,7 @@ again:
		bufsize = biosize;
		if ((off_t)(lbn + 1) * biosize > np->n_size && 
		    (off_t)(lbn + 1) * biosize - np->n_size < biosize) {
-			bufsize = np->n_size - lbn * biosize;
+			bufsize = np->n_size - (off_t)lbn * biosize;
			bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
		}
		bp = nfs_getcacheblk(vp, lbn, bufsize, p, operation);
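A note on the two hunks above: the recurring change throughout this diff is promoting 32-bit block arithmetic to off_t before multiplying. daddr_t and int are 32 bits here, so a plain lbn * biosize wraps once the byte offset passes 2^31 and the comparison against np->n_size goes wrong. A minimal standalone sketch of the failure mode (illustrative only; the function name and values are hypothetical, not from the diff):

#include <stdint.h>

/* Byte offset of logical block lbn with a biosize-sized block.
 * A 32-bit multiply wraps for offsets >= 2 GB; promoting one operand
 * to a 64-bit type first, as the hunks above now do, avoids that. */
int64_t
block_byte_offset(int32_t lbn, int32_t biosize)
{
	int64_t wrong = lbn * biosize;           /* multiplies in 32 bits */
	int64_t right = (int64_t)lbn * biosize;  /* promotes, then multiplies */

	(void)wrong;	/* e.g. lbn = 600000, biosize = 4096 wraps above */
	return right;
}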
@@ -335,36 +347,52 @@ again:
		SET(bp->b_flags, B_READ);
		error = nfs_doio(bp, cred, p);
		if (error) {
-		    brelse(bp);
-		    while (error == NFSERR_BAD_COOKIE) {
-			nfs_invaldir(vp);
-			error = nfs_vinvalbuf(vp, 0, cred, p, 1);
-			/*
-			 * Yuck! The directory has been modified on the
-			 * server. The only way to get the block is by
-			 * reading from the beginning to get all the
-			 * offset cookies.
-			 */
-			for (i = 0; i <= lbn && !error; i++) {
-			    if (np->n_direofoffset
-				&& (i * NFS_DIRBLKSIZ) >= np->n_direofoffset)
-				    return (0);
-			    bp = nfs_getcacheblk(vp, i, NFS_DIRBLKSIZ, p, operation);
-			    if (!bp)
-				return (EINTR);
-			    if (!ISSET(bp->b_flags, B_DONE)) {
-				SET(bp->b_flags, B_READ);
-				error = nfs_doio(bp, cred, p);
-				if (error) {
-				    brelse(bp);
-				} else if (i < lbn)
-				    brelse(bp);
-			    }
-			}
-		    }
-		    if (error)
-			return (error);
+		    brelse(bp);
		}
+		while (error == NFSERR_BAD_COOKIE) {
+		    nfs_invaldir(vp);
+		    error = nfs_vinvalbuf(vp, 0, cred, p, 1);
+		    /*
+		     * Yuck! The directory has been modified on the
+		     * server. The only way to get the block is by
+		     * reading from the beginning to get all the
+		     * offset cookies.
+		     */
+		    for (i = 0; i <= lbn && !error; i++) {
+			if (np->n_direofoffset
+			    && (i * NFS_DIRBLKSIZ) >= np->n_direofoffset)
+				return (0);
+			bp = nfs_getcacheblk(vp, i, NFS_DIRBLKSIZ, p,
+					     operation);
+			if (!bp)
+			    return (EINTR);
+			if (!ISSET(bp->b_flags, B_CACHE)) {
+			    SET(bp->b_flags, B_READ);
+			    error = nfs_doio(bp, cred, p);
+			    /*
+			     * no error + B_INVAL == directory EOF,
+			     * use the block.
+			     */
+			    if (error == 0 && (bp->b_flags & B_INVAL))
+				    break;
+			}
+			/*
+			 * An error will throw away the block and the
+			 * for loop will break out.  If no error and this
+			 * is not the block we want, we throw away the
+			 * block and go for the next one via the for loop.
+			 */
+			if (error || i < lbn)
+				brelse(bp);
+		    }
+		}
+		/*
+		 * The above while is repeated if we hit another cookie
+		 * error.  If we hit an error and it wasn't a cookie error,
+		 * we give up.
+		 */
+		if (error)
+		    return (error);
	    }

	    /*
@@ -377,17 +405,18 @@ again:
		    (lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) &&
		    !(np->n_flag & NQNFSNONCACHE) &&
		    !incore(vp, lbn + 1)) {
-			rabp = nfs_getcacheblk(vp, lbn + 1, NFS_DIRBLKSIZ, p, operation);
+			rabp = nfs_getcacheblk(vp, lbn + 1, NFS_DIRBLKSIZ, p,
+					       operation);
			if (rabp) {
			    if (!ISSET(rabp->b_flags, (B_CACHE|B_DELWRI))) {
-				SET(rabp->b_flags, (B_READ | B_ASYNC));
-				if (nfs_asyncio(rabp, cred)) {
-				    SET(rabp->b_flags, (B_INVAL|B_ERROR));
-				    rabp->b_error = EIO;
-				    brelse(rabp);
-				}
+				    SET(rabp->b_flags, (B_READ | B_ASYNC));
+				    if (nfs_asyncio(rabp, cred)) {
+					SET(rabp->b_flags, (B_INVAL|B_ERROR));
+					rabp->b_error = EIO;
+					brelse(rabp);
+				    }
			    } else {
-				brelse(rabp);
+				    brelse(rabp);
			    }
			}
		}
@@ -396,6 +425,21 @@ again:
		 * the second term may be negative.
		 */
		n = lmin(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid - on);
+		/*
+		 * Unlike VREG files, whos buffer size ( bp->b_bcount ) is
+		 * chopped for the EOF condition, we cannot tell how large
+		 * NFS directories are going to be until we hit EOF.  So
+		 * an NFS directory buffer is *not* chopped to its EOF.  Now,
+		 * it just so happens that b_resid will effectively chop it
+		 * to EOF.  *BUT* this information is lost if the buffer goes
+		 * away and is reconstituted into a B_CACHE state (recovered
+		 * from VM) later.  So we keep track of the directory eof
+		 * in np->n_direofoffset and chop it off as an extra step
+		 * right here.
+		 */
+		if (np->n_direofoffset &&
+		    n > np->n_direofoffset - uio->uio_offset)
+			n = np->n_direofoffset - uio->uio_offset;
		break;
	    default:
		printf(" nfs_bioread: type %x unexpected\n",vp->v_type);
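The n_direofoffset clamp added above deserves a gloss: regular-file buffers are trimmed to EOF when created, but a directory's size is unknown until the server reports EOF, so a cached directory block can extend past the real end and the transfer count n must be chopped against the remembered directory EOF. A hedged sketch of just that clamp (standalone, simplified types; not code from the diff):

/* Sketch of the clamp in the hunk above; self-contained and simplified.
 * n is the byte count about to be copied out of the directory buffer. */
static long long
clamp_to_dir_eof(long long n, long long direofoffset, long long uio_offset)
{
	/* direofoffset == 0 means EOF has not been seen yet */
	if (direofoffset && n > direofoffset - uio_offset)
		n = direofoffset - uio_offset;
	return n;
}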
@@ -423,6 +467,7 @@ again:
	return (error);
 }
 
+
 /*
  * Vnode op for write using bio
  */
@@ -448,6 +493,9 @@ nfs_write(ap)
	daddr_t lbn;
	int bufsize;
	int n, on, error = 0, iomode, must_commit;
+	off_t boff;
+	struct iovec iov;
+	struct uio auio;
 
 #if DIAGNOSTIC
	if (uio->uio_rw != UIO_WRITE)
@@ -496,8 +544,8 @@ nfs_write(ap)
	 * will be the same size within a filesystem. nfs_writerpc will
	 * still use nm_wsize when sizing the rpc's.
	 */
-	/*due to getblk/vm interractions, use vm page size or less values */
-	biosize = min(vp->v_mount->mnt_stat.f_iosize, PAGE_SIZE);
+	/*due to getblk/vm interractions, use vm page size or less values */
+	biosize = min(vp->v_mount->mnt_stat.f_iosize, PAGE_SIZE);

	do {
		/*
@@ -530,12 +578,6 @@
		on = uio->uio_offset & (biosize-1);
		n = min((unsigned)(biosize - on), uio->uio_resid);
again:
-		if (uio->uio_offset + n > np->n_size) {
-			np->n_size = uio->uio_offset + n;
-			np->n_flag |= NMODIFIED;
-			if (UBCISVALID(vp))
-				ubc_setsize(vp, (off_t)np->n_size); /* XXX check error */
-		}
		bufsize = biosize;
#if 0 /* (removed for UBC) */
@@ -544,21 +586,175 @@ again:
			bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
		}
#endif
-		bp = nfs_getwriteblk(vp, lbn, bufsize, p, cred, on, n);
+		/*
+		 * Get a cache block for writing.  The range to be written is
+		 * (off..off+len) within the block.  We ensure that the block
+		 * either has no dirty region or that the given range is
+		 * contiguous with the existing dirty region.
+		 */
+		bp = nfs_getcacheblk(vp, lbn, bufsize, p, BLK_WRITE);
		if (!bp)
			return (EINTR);
+		/*
+		 * Resize nfsnode *after* we busy the buffer to prevent
+		 * readers from reading garbage.
+		 * If there was a partial buf at the old eof, validate
+		 * and zero the new bytes. 
+		 */
+		if (uio->uio_offset + n > np->n_size) {
+			struct buf *bp0 = NULL;
+			daddr_t bn = np->n_size / biosize;
+			int off = np->n_size & (biosize - 1);
+
+			if (off && bn < lbn && incore(vp, bn))
+				bp0 = nfs_getcacheblk(vp, bn, biosize, p,
+						      BLK_WRITE);
+			np->n_flag |= NMODIFIED;
+			np->n_size = uio->uio_offset + n;
+			ubc_setsize(vp, (off_t)np->n_size); /* XXX errors */
+			if (bp0) {
+				bzero((char *)bp0->b_data + off, biosize - off);
+				bp0->b_validend = biosize;
+				brelse(bp0);
+			}
+		}
+		/*
+		 * NFS has embedded ucred so crhold() risks zone corruption
+		 */
+		if (bp->b_wcred == NOCRED)
+			bp->b_wcred = crdup(cred);
+		/*
+		 * If dirtyend exceeds file size, chop it down.  This should
+		 * not occur unless there is a race.
+		 */
+		if ((off_t)bp->b_blkno * DEV_BSIZE + bp->b_dirtyend >
+		    np->n_size)
+			bp->b_dirtyend = np->n_size - (off_t)bp->b_blkno *
+				DEV_BSIZE;
+		/*
+		 * UBC doesn't (yet) handle partial pages so nfs_biowrite was
+		 * hacked to never bdwrite, to start every little write right
+		 * away.  Running IE Avie noticed the performance problem, thus
+		 * this code, which permits those delayed writes by ensuring an
+		 * initial read of the entire page.  The read may hit eof
+		 * ("short read") but that we will handle.
+		 *
+		 * We are quite dependant on the correctness of B_CACHE so check
+		 * that first in case of problems.
+		 */
+		if (!ISSET(bp->b_flags, B_CACHE) && n < PAGE_SIZE) {
+			boff = (off_t)bp->b_blkno * DEV_BSIZE;
+			auio.uio_iov = &iov;
+			auio.uio_iovcnt = 1;
+			auio.uio_offset = boff;
+			auio.uio_resid = PAGE_SIZE;
+			auio.uio_segflg = UIO_SYSSPACE;
+			auio.uio_rw = UIO_READ;
+			auio.uio_procp = p;
+			iov.iov_base = bp->b_data;
+			iov.iov_len = PAGE_SIZE;
+			error = nfs_readrpc(vp, &auio, cred);
+			if (error) {
+				bp->b_error = error;
+				SET(bp->b_flags, B_ERROR);
+				printf("nfs_write: readrpc %d", error);
+			}
+			if (auio.uio_resid > 0)
+				bzero(iov.iov_base, auio.uio_resid);
+			bp->b_validoff = 0;
+			bp->b_validend = PAGE_SIZE - auio.uio_resid;
+			if (np->n_size > boff + bp->b_validend)
+				bp->b_validend = min(np->n_size - boff,
+						     PAGE_SIZE);
+			bp->b_dirtyoff = 0;
+			bp->b_dirtyend = 0;
+		}
+
+		/*
+		 * If the new write will leave a contiguous dirty
+		 * area, just update the b_dirtyoff and b_dirtyend,
+		 * otherwise try to extend the dirty region.
+		 */
+		if (bp->b_dirtyend > 0 &&
+		    (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) {
+			off_t start, end;
+
+			boff = (off_t)bp->b_blkno * DEV_BSIZE;
+			if (on > bp->b_dirtyend) {
+				start = boff + bp->b_validend;
+				end = boff + on;
+			} else {
+				start = boff + on + n;
+				end = boff + bp->b_validoff;
+			}
+
+			/*
+			 * It may be that the valid region in the buffer
+			 * covers the region we want, in which case just
+			 * extend the dirty region.  Otherwise we try to
+			 * extend the valid region.
+			 */
+			if (end > start) {
+				auio.uio_iov = &iov;
+				auio.uio_iovcnt = 1;
+				auio.uio_offset = start;
+				auio.uio_resid = end - start;
+				auio.uio_segflg = UIO_SYSSPACE;
+				auio.uio_rw = UIO_READ;
+				auio.uio_procp = p;
+				iov.iov_base = bp->b_data + (start - boff);
+				iov.iov_len = end - start;
+				error = nfs_readrpc(vp, &auio, cred);
+				/*
+				 * If we couldn't read, do not do a VOP_BWRITE
+				 * as originally coded. That could also error
+				 * and looping back to "again" as it was doing
+				 * could have us stuck trying to write same buf
+				 * again. nfs_write, will get the entire region
+				 * if nfs_readrpc succeeded. If unsuccessful
+				 * we should just error out. Errors like ESTALE
+				 * would keep us looping rather than transient
+				 * errors justifying a retry. We can return here
+				 * instead of altering dirty region later.  We
+				 * did not write old dirty region at this point.
+				 */
+				if (error) {
+					bp->b_error = error;
+					SET(bp->b_flags, B_ERROR);
+					printf("nfs_write: readrpc2 %d", error);
+					brelse(bp);
+					return (error);
+				}
+				/*
+				 * The read worked.
+				 * If there was a short read, just zero fill.
+				 */
+				if (auio.uio_resid > 0)
+					bzero(iov.iov_base, auio.uio_resid);
+				if (on > bp->b_dirtyend)
+					bp->b_validend = on;
+				else
+					bp->b_validoff = on + n;
+			}
+			/*
+			 * We now have a valid region which extends up to the
+			 * dirty region which we want.
+			 */
+			if (on > bp->b_dirtyend)
+				bp->b_dirtyend = on;
+			else
+				bp->b_dirtyoff = on + n;
+		}
		if (ISSET(bp->b_flags, B_ERROR)) {
			error = bp->b_error;
			brelse(bp);
			return (error);
		}
-		if (bp->b_wcred == NOCRED) {
-			/*
-			 * NFS has embedded ucred.
-			 * Can not crhold() here as that causes zone corruption
-			 */
+		/*
+		 * NFS has embedded ucred so crhold() risks zone corruption
+		 */
+		if (bp->b_wcred == NOCRED)
			bp->b_wcred = crdup(cred);
-		}
		np->n_flag |= NMODIFIED;

		/*
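For readers following the logic just inlined from nfs_getwriteblk: each buffer carries a valid byte range [b_validoff, b_validend) and a dirty byte range [b_dirtyoff, b_dirtyend), and a delayed write is only legal if the bytes about to be dirtied stay contiguous with the existing dirty range; otherwise the code first reads the gap from the server to extend the valid range. A small standalone sketch of the contiguity test, mirroring the condition above with plain ints:

/* Would writing [on, on + n) leave a hole next to the current dirty
 * region?  Mirrors the test in the hunk above; sketch only. */
static int
write_would_split_dirty_region(int dirtyoff, int dirtyend, int on, int n)
{
	return (dirtyend > 0 &&
	    (on > dirtyend ||		/* new range starts past dirty end  */
	     on + n < dirtyoff));	/* new range ends before dirty start */
}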
@@ -636,168 +832,6 @@ again:
	return (0);
 }
 
-/*
- * Get a cache block for writing.  The range to be written is
- * (off..off+len) within the block.  This routine ensures that the
- * block is either has no dirty region or that the given range is
- * contiguous with the existing dirty region.
- */
-static struct buf *
-nfs_getwriteblk(vp, bn, size, p, cred, off, len)
-	struct vnode *vp;
-	daddr_t bn;
-	int size;
-	struct proc *p;
-	struct ucred *cred;
-	int off, len;
-{
-	struct nfsnode *np = VTONFS(vp);
-	struct buf *bp;
-	int error;
-	struct iovec iov;
-	struct uio uio;
-	off_t boff;
-
- again:
-	bp = nfs_getcacheblk(vp, bn, size, p, BLK_WRITE);
-	if (!bp)
-		return (NULL);
-	if (bp->b_wcred == NOCRED) {
-		/*
-		 * NFS has embedded ucred.
-		 * Can not crhold() here as that causes zone corruption
-		 */
-		bp->b_wcred = crdup(cred);
-	}
-
-	if ((bp->b_blkno * DEV_BSIZE) + bp->b_dirtyend > np->n_size) {
-		bp->b_dirtyend = np->n_size - (bp->b_blkno * DEV_BSIZE);
-	}
-
-	/*
-	 * UBC doesn't (yet) handle partial pages so nfs_biowrite was
-	 * hacked to never bdwrite, to start every little write right away.
-	 * Running IE Avie noticed the performance problem, thus this code,
-	 * which permits those delayed writes by ensuring an initial read
-	 * of the entire page.  The read may hit eof ("short read") but
-	 * that we will handle.
-	 *
-	 * We are quite dependant on the correctness of B_CACHE so check
-	 * that first in case of problems.
-	 */
-	if (!ISSET(bp->b_flags, B_CACHE) && len < PAGE_SIZE) {
-		struct nfsnode *np = VTONFS(vp);
-
-		boff = (off_t)bp->b_blkno * DEV_BSIZE;
-		uio.uio_iov = &iov;
-		uio.uio_iovcnt = 1;
-		uio.uio_offset = boff;
-		uio.uio_resid = PAGE_SIZE;
-		uio.uio_segflg = UIO_SYSSPACE;
-		uio.uio_rw = UIO_READ;
-		uio.uio_procp = p;
-		iov.iov_base = bp->b_data;
-		iov.iov_len = PAGE_SIZE;
-		error = nfs_readrpc(vp, &uio, cred);
-		if (error) {
-			bp->b_error = error;
-			SET(bp->b_flags, B_ERROR);
-			printf("nfs_getwriteblk: readrpc returned %d", error);
-		}
-		if (uio.uio_resid > 0)
-			bzero(iov.iov_base, uio.uio_resid);
-		bp->b_validoff = 0;
-		bp->b_validend = PAGE_SIZE - uio.uio_resid;
-		if (np->n_size > boff + bp->b_validend)
-			bp->b_validend = min(np->n_size - boff, PAGE_SIZE);
-		bp->b_dirtyoff = 0;
-		bp->b_dirtyend = 0;
-	}
-
-	/*
-	 * If the new write will leave a contiguous dirty
-	 * area, just update the b_dirtyoff and b_dirtyend,
-	 * otherwise try to extend the dirty region.
-	 */
-	if (bp->b_dirtyend > 0 &&
-	    (off > bp->b_dirtyend || (off + len) < bp->b_dirtyoff)) {
-		off_t start, end;
-
-		boff = (off_t)bp->b_blkno * DEV_BSIZE;
-		if (off > bp->b_dirtyend) {
-			start = boff + bp->b_validend;
-			end = boff + off;
-		} else {
-			start = boff + off + len;
-			end = boff + bp->b_validoff;
-		}
-
-		/*
-		 * It may be that the valid region in the buffer
-		 * covers the region we want, in which case just
-		 * extend the dirty region.  Otherwise we try to
-		 * extend the valid region.
-		 */
-		if (end > start) {
-			uio.uio_iov = &iov;
-			uio.uio_iovcnt = 1;
-			uio.uio_offset = start;
-			uio.uio_resid = end - start;
-			uio.uio_segflg = UIO_SYSSPACE;
-			uio.uio_rw = UIO_READ;
-			uio.uio_procp = p;
-			iov.iov_base = bp->b_data + (start - boff);
-			iov.iov_len = end - start;
-			error = nfs_readrpc(vp, &uio, cred);
-			if (error) {
-				/*
-				 * If we couldn't read, do not do a VOP_BWRITE
-				 * as originally coded. That, could also error
-				 * and looping back to "again" as it was doing
-				 * could have us stuck trying to write same buffer
-				 * again. nfs_write, will get the entire region
-				 * if nfs_readrpc was successful. If not successful
-				 * we should just error out. Errors like ESTALE
-				 * would keep us in this loop rather than transient
-				 * errors justifying a retry. We can return from here
-				 * instead of altering dirty region later in routine.
-				 * We did not write out old dirty region at this point.
-				 */
-				bp->b_error = error;
-				SET(bp->b_flags, B_ERROR);
-				printf("nfs_getwriteblk: readrpc (2) returned %d", error);
-				return bp;
-			} else {
-				/*
-				 * The read worked.
-				 */
-				if (uio.uio_resid > 0) {
-					/*
-					 * If there was a short read,
-					 * just zero fill.
-					 */
-					bzero(iov.iov_base,
-					      uio.uio_resid);
-				}
-				if (off > bp->b_dirtyend)
-					bp->b_validend = off;
-				else
-					bp->b_validoff = off + len;
-			}
-		}
-
-		/*
-		 * We now have a valid region which extends up to the
-		 * dirty region which we want.
-		 */
-		if (off > bp->b_dirtyend)
-			bp->b_dirtyend = off;
-		else
-			bp->b_dirtyoff = off + len;
-	}
-
-	return bp;
-}
 
 /*
  * Get an nfs cache block.
@@ -816,9 +850,25 @@ nfs_getcacheblk(vp, bn, size, p, operation)
 {
	register struct buf *bp;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
-	/*due to getblk/vm interractions, use vm page size or less values */
+	/*due to getblk/vm interractions, use vm page size or less values */
	int biosize = min(vp->v_mount->mnt_stat.f_iosize, PAGE_SIZE);
 
+	if (nbdwrite > ((nbuf/4)*3) && operation == BLK_WRITE) {
+#define __BUFFERS_RECLAIMED 2
+		struct buf *tbp[__BUFFERS_RECLAIMED];
+		int i;
+
+		/* too many delayed writes, try to free up some buffers */
+		for (i = 0; i < __BUFFERS_RECLAIMED; i++)
+			tbp[i] = geteblk(512);
+
+		/* Yield to IO thread */
+		(void)tsleep((caddr_t)&nbdwrite, PCATCH, "nbdwrite", 1);
+
+		for (i = (__BUFFERS_RECLAIMED - 1); i >= 0; i--)
+			brelse(tbp[i]);
+	}
+
	if (nmp->nm_flag & NFSMNT_INT) {
		bp = getblk(vp, bn, size, PCATCH, 0, operation);
		while (bp == (struct buf *)0) {
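The new throttle above kicks in when delayed writes (the nbdwrite counter) hold more than three quarters of the buffer cache (nbuf): it briefly ties up two empty buffers with geteblk, sleeps a tick so the I/O thread can drain the dirty queue, then releases them. The threshold arithmetic as a standalone sketch (not the kernel code itself):

/* Sketch of the back-pressure threshold used above: throttle writers
 * once delayed writes occupy more than 3/4 of the nbuf buffer pool. */
static int
too_many_delayed_writes(int nbdwrite, int nbuf)
{
	return nbdwrite > (nbuf / 4) * 3;
}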
@@ -830,7 +880,7 @@ nfs_getcacheblk(vp, bn, size, p, operation)
		bp = getblk(vp, bn, size, 0, 0, operation);
 
	if( vp->v_type == VREG)
-		bp->b_blkno = (bn * biosize) / DEV_BSIZE;
+		bp->b_blkno = ((off_t)bn * biosize) / DEV_BSIZE;
 
	return (bp);
 }
@@ -887,7 +937,7 @@ nfs_vinvalbuf(vp, flags, cred, p, intrflg)
	 * necessary.  -- EKN
	 */
	if ((intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p)) || 
-		((error == EINTR) && current_thread_aborted())) {
+	    (error == EINTR && current_thread_aborted())) {
		np->n_flag &= ~NFLUSHINPROG;
		if (np->n_flag & NFLUSHWANT) {
			np->n_flag &= ~NFLUSHWANT;
@@ -904,7 +954,7 @@ nfs_vinvalbuf(vp, flags, cred, p, intrflg)
	}
	didhold = ubc_hold(vp);
	if (didhold) {
-		(void) ubc_clean(vp, 1); /* get the pages out of vm also */
+		(void) ubc_clean(vp, 1); /* get the pages out of vm also */
		ubc_rele(vp);
	}
	return (0);
@@ -1053,7 +1103,6 @@ nfs_doio(bp, cr, p)
	struct iovec io;

	vp = bp->b_vp;
-	NFSTRACE(NFSTRC_DIO, vp);
	np = VTONFS(vp);
	nmp = VFSTONFS(vp->v_mount);
	uiop = &uio;
@@ -1068,7 +1117,7 @@ nfs_doio(bp, cr, p)
	 * NFS being stateless, this case poses a problem.
	 * By definition, the NFS server should always be consulted
	 * for the data in that page.
-	 * So we choose to clear the B_DONE and to the IO.
+	 * So we choose to clear the B_DONE and to do the IO.
	 *
	 * XXX revisit this if there is a performance issue.
	 * XXX In that case, we could play the attribute cache games ...
@@ -1078,13 +1127,10 @@ nfs_doio(bp, cr, p)
			panic("nfs_doio: done and not async");
		CLR(bp->b_flags, B_DONE);
	}
-
-	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 256)) | DBG_FUNC_START,
-		(int)np->n_size, bp->b_blkno * DEV_BSIZE, bp->b_bcount, bp->b_flags, 0);
-
-	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 257)) | DBG_FUNC_NONE,
-		bp->b_validoff, bp->b_validend, bp->b_dirtyoff, bp->b_dirtyend, 0);
-
+	FSDBG_TOP(256, np->n_size, bp->b_blkno * DEV_BSIZE, bp->b_bcount,
+		  bp->b_flags);
+	FSDBG(257, bp->b_validoff, bp->b_validend, bp->b_dirtyoff,
+	      bp->b_dirtyend);
	/*
	 * Historically, paging was done with physio, but no more.
	 */
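The FSDBG/FSDBG_TOP/FSDBG_BOT macros replacing the raw KERNEL_DEBUG calls here are the ones defined near the top of this diff; _TOP and _BOT emit DBG_FUNC_START/DBG_FUNC_END events under the same code so a trace tool can pair them around one nfs_doio call. A usage sketch, assuming those macro definitions (the wrapper function itself is hypothetical):

/* Hypothetical traced helper showing the bracketing convention used in
 * nfs_doio above: code 256 brackets the I/O, 257 logs region state. */
static void
traced_io_example(struct buf *bp, struct nfsnode *np)
{
	FSDBG_TOP(256, np->n_size, (off_t)bp->b_blkno * DEV_BSIZE,
		  bp->b_bcount, bp->b_flags);
	FSDBG(257, bp->b_validoff, bp->b_validend, bp->b_dirtyoff,
	      bp->b_dirtyend);
	/* ... perform the read or write ... */
	FSDBG_BOT(256, bp->b_validoff, bp->b_validend, bp->b_bcount, 0);
}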
@@ -1095,7 +1141,7 @@ nfs_doio(bp, cr, p)
		io.iov_len = uiop->uio_resid = bp->b_bcount;
		/* mapping was done by vmapbuf() */
		io.iov_base = bp->b_data;
-		uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE;
+		uiop->uio_offset = (off_t)bp->b_blkno * DEV_BSIZE;
		if (ISSET(bp->b_flags, B_READ)) {
			uiop->uio_rw = UIO_READ;
			nfsstats.read_physios++;
@@ -1118,14 +1164,11 @@ nfs_doio(bp, cr, p)
		uiop->uio_rw = UIO_READ;
		switch (vp->v_type) {
		case VREG:
-			uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE;
+			uiop->uio_offset = (off_t)bp->b_blkno * DEV_BSIZE;
			nfsstats.read_bios++;
			error = nfs_readrpc(vp, uiop, cr);
-
-			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 262)) | DBG_FUNC_NONE,
-				(int)np->n_size, bp->b_blkno * DEV_BSIZE, uiop->uio_resid, error, 0);
-
-
+			FSDBG(262, np->n_size, bp->b_blkno * DEV_BSIZE,
+			      uiop->uio_resid, error);
			if (!error) {
				bp->b_validoff = 0;
				if (uiop->uio_resid) {
@@ -1136,35 +1179,33 @@ nfs_doio(bp, cr, p)
					 * Just zero fill the rest of the valid area.
					 */
					diff = bp->b_bcount - uiop->uio_resid;
-					len = np->n_size - (((u_quad_t)bp->b_blkno) * DEV_BSIZE
-						+ diff);
-					if (len > 0) {
-						len = min(len, uiop->uio_resid);
-						bzero((char *)bp->b_data + diff, len);
-						bp->b_validend = diff + len;
-
-						KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 258)) | DBG_FUNC_NONE,
-							diff, len, 0, 1, 0);
-
-					} else
-						bp->b_validend = diff;
+					len = np->n_size - ((u_quad_t)bp->b_blkno * DEV_BSIZE
+							    + diff);
+					if (len > 0) {
+						len = min(len, uiop->uio_resid);
+						bzero((char *)bp->b_data + diff, len);
+						bp->b_validend = diff + len;
+						FSDBG(258, diff, len, 0, 1);
+					} else
+						bp->b_validend = diff;
				} else
					bp->b_validend = bp->b_bcount;
-#if 1	/* USV + JOE [ */
+
				if (bp->b_validend < bp->b_bufsize) {
-					/*
-					 * we're about to release a partial buffer after a read... the only
-					 * way we should get here is if this buffer contains the EOF
-					 * before releasing it, we'll zero out to the end of the buffer
-					 * so that if a mmap of this page occurs, we'll see zero's even
-					 * if a ftruncate extends the file in the meantime
+					/*
+					 * we're about to release a partial buffer after a
+					 * read... the only way we should get here is if
+					 * this buffer contains the EOF before releasing it,
+					 * we'll zero out to the end of the buffer so that
+					 * if a mmap of this page occurs, we'll see zero's
+					 * even if a ftruncate extends the file in the
+					 * meantime
					 */
-					bzero((caddr_t)(bp->b_data + bp->b_validend), (bp->b_bufsize - bp->b_validend));
-
-					KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 258)) | DBG_FUNC_NONE,
-						bp->b_validend, (bp->b_bufsize - bp->b_validend), 0, 2, 0);
+					bzero((caddr_t)(bp->b_data + bp->b_validend),
+					      bp->b_bufsize - bp->b_validend);
+					FSDBG(258, bp->b_validend,
+					      bp->b_bufsize - bp->b_validend, 0, 2);
				}
-#endif /* ] USV + JOE */
			}
			if (p && (vp->v_flag & VTEXT) &&
				(((nmp->nm_flag & NFSMNT_NQNFS) &&
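The short-read handling above computes diff (the bytes the server actually returned) and then zero-fills up to whichever comes first, EOF or the end of the request, so b_validend never covers uninitialized memory. The same arithmetic as a standalone sketch with plain integers (memset stands in for the kernel's bzero):

#include <string.h>

/* Returns the new valid-end offset within the buffer after a short read.
 * bcount: requested bytes; resid: bytes NOT returned; fsize/foff: file
 * size and the buffer's file offset.  Standalone sketch only. */
static int
valid_end_after_short_read(char *data, int bcount, int resid,
    long long fsize, long long foff)
{
	int diff = bcount - resid;               /* bytes actually read */
	long long len = fsize - (foff + diff);   /* bytes left before EOF */

	if (len > 0) {
		if (len > resid)
			len = resid;
		memset(data + diff, 0, (size_t)len); /* zero recovered tail */
		return diff + (int)len;
	}
	return diff;                             /* read ended at/after EOF */
}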
@@ -1196,28 +1237,40 @@ nfs_doio(bp, cr, p)
			error = nfs_readdirrpc(vp, uiop, cr);
			break;
		default:
-			printf("nfs_doio: type %x unexpected\n",vp->v_type);
+			printf("nfs_doio: type %x unexpected\n", vp->v_type);
			break;
		};
		if (error) {
-			SET(bp->b_flags, B_ERROR);
-			bp->b_error = error;
+			SET(bp->b_flags, B_ERROR);
+			bp->b_error = error;
		}
	} else {
-		if (((bp->b_blkno * DEV_BSIZE) + bp->b_dirtyend) > np->n_size)
-			bp->b_dirtyend = np->n_size - (bp->b_blkno * DEV_BSIZE);
+		/*
+		 * mapped I/O may have altered any bytes, so we extend
+		 * the dirty zone to the valid zone.  For best performance
+		 * a better solution would be to save & restore page dirty bits
+		 * around the uiomove which brings write-data into the buffer.
+		 * Then here we'd check if the page is dirty rather than WASMAPPED
+		 * Also vnode_pager would change - if a page is clean it might
+		 * still need to be written due to DELWRI.
+		 */
+		if (UBCINFOEXISTS(vp) && ubc_issetflags(vp, UI_WASMAPPED)) {
+			bp->b_dirtyoff = min(bp->b_dirtyoff, bp->b_validoff);
+			bp->b_dirtyend = max(bp->b_dirtyend, bp->b_validend);
+		}
+		if ((off_t)bp->b_blkno * DEV_BSIZE + bp->b_dirtyend > np->n_size)
+			bp->b_dirtyend = np->n_size - (off_t)bp->b_blkno * DEV_BSIZE;

		if (bp->b_dirtyend > bp->b_dirtyoff) {
-
-			io.iov_len = uiop->uio_resid = bp->b_dirtyend
-				- bp->b_dirtyoff;
-			uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE
-				+ bp->b_dirtyoff;
+			io.iov_len = uiop->uio_resid = bp->b_dirtyend - bp->b_dirtyoff;
+			uiop->uio_offset = (off_t)bp->b_blkno * DEV_BSIZE +
+				bp->b_dirtyoff;
			io.iov_base = (char *)bp->b_data + bp->b_dirtyoff;
			uiop->uio_rw = UIO_WRITE;
			nfsstats.write_bios++;
-			if ((bp->b_flags & (B_ASYNC | B_NEEDCOMMIT | B_NOCACHE)) == B_ASYNC)
+			if ((bp->b_flags & (B_ASYNC | B_NEEDCOMMIT | B_NOCACHE)) ==
+			    B_ASYNC)
				iomode = NFSV3WRITE_UNSTABLE;
			else
				iomode = NFSV3WRITE_FILESYNC;
@@ -1228,7 +1281,6 @@ nfs_doio(bp, cr, p)
			else
				CLR(bp->b_flags, B_NEEDCOMMIT);
			CLR(bp->b_flags, B_WRITEINPROG);
-
			/*
			 * For an interrupted write, the buffer is still valid
			 * and the write hasn't been pushed to the server yet,
@@ -1242,20 +1294,20 @@ nfs_doio(bp, cr, p)
			 * the block is reused. This is indicated by setting
			 * the B_DELWRI and B_NEEDCOMMIT flags.
			 */
-			if (error == EINTR
-			    || (!error && (bp->b_flags & B_NEEDCOMMIT))) {
+			if (error == EINTR || (!error && bp->b_flags & B_NEEDCOMMIT)) {
				int s;

-				CLR(bp->b_flags, (B_INVAL|B_NOCACHE));
-				SET(bp->b_flags, B_DELWRI);
-
-				KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 261)) | DBG_FUNC_NONE,
-					bp->b_validoff, bp->b_validend, bp->b_bufsize, bp->b_bcount, 0);
-
+				CLR(bp->b_flags, B_INVAL | B_NOCACHE);
+				if (!ISSET(bp->b_flags, B_DELWRI)) {
+					SET(bp->b_flags, B_DELWRI);
+					nbdwrite++;
+				}
+				FSDBG(261, bp->b_validoff, bp->b_validend,
+				      bp->b_bufsize, bp->b_bcount);
				/*
-				 * Since for the B_ASYNC case, nfs_bwrite() has reassigned the
-				 * buffer to the clean list, we have to reassign it back to the
-				 * dirty one. Ugh.
+				 * Since for the B_ASYNC case, nfs_bwrite() has
+				 * reassigned the buffer to the clean list, we have to
+				 * reassign it back to the dirty one. Ugh.
				 */
				if (ISSET(bp->b_flags, B_ASYNC)) {
					s = splbio();
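One subtlety in the EINTR/B_NEEDCOMMIT path above: the buffer is requeued as a delayed write, and the new nbdwrite counter (consumed by the throttle in nfs_getcacheblk) is bumped only on the clean-to-dirty transition so it cannot drift. A sketch of that accounting rule, assuming the kernel's SET/ISSET flag macros:

/* Count a buffer into nbdwrite only when it first becomes a delayed
 * write, matching the B_DELWRI accounting added above.  Sketch only. */
static void
mark_delayed_write(struct buf *bp, int *nbdwrite_counter)
{
	if (!ISSET(bp->b_flags, B_DELWRI)) {
		SET(bp->b_flags, B_DELWRI);
		(*nbdwrite_counter)++;       /* clean -> dirty transition */
	}
}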
@@ -1272,50 +1324,50 @@ nfs_doio(bp, cr, p)
			}
			bp->b_dirtyoff = bp->b_dirtyend = 0;
-#if 1 /* JOE */
			/*
-			 * validoff and validend represent the real data present in this buffer
-			 * if validoff is non-zero, than we have to invalidate the buffer and kill
-			 * the page when biodone is called... the same is also true when validend
-			 * doesn't extend all the way to the end of the buffer and validend doesn't
-			 * equate to the current EOF... eventually we need to deal with this in a
-			 * more humane way (like keeping the partial buffer without making it immediately
-			 * available to the VM page cache).
+			 * validoff and validend represent the real data present
+			 * in this buffer if validoff is non-zero, than we have
+			 * to invalidate the buffer and kill the page when
+			 * biodone is called... the same is also true when
+			 * validend doesn't extend all the way to the end of the
+			 * buffer and validend doesn't equate to the current
+			 * EOF... eventually we need to deal with this in a more
+			 * humane way (like keeping the partial buffer without
+			 * making it immediately available to the VM page cache)
			 */
			if (bp->b_validoff)
				SET(bp->b_flags, B_INVAL);
			else if (bp->b_validend < bp->b_bufsize) {
-				if ((((off_t)bp->b_blkno * (off_t)DEV_BSIZE) + bp->b_validend) == np->n_size) {
-					bzero((caddr_t)(bp->b_data + bp->b_validend), (bp->b_bufsize - bp->b_validend));
-
-					KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 259)) | DBG_FUNC_NONE,
-						bp->b_validend, (bp->b_bufsize - bp->b_validend), 0, 0, 0);;
-				}
-				else
-					SET(bp->b_flags, B_INVAL);
+				if ((off_t)bp->b_blkno * DEV_BSIZE +
+				    bp->b_validend == np->n_size) {
+					bzero((caddr_t)(bp->b_data +
+					      bp->b_validend),
+					      bp->b_bufsize - bp->b_validend);
+					FSDBG(259, bp->b_validend,
+					      bp->b_bufsize - bp->b_validend, 0,
+					      0);
+				} else
+					SET(bp->b_flags, B_INVAL);
			}
-#endif
		}
	} else {
-
-#if 1 /* JOE */
-		if (bp->b_validoff)
-			SET(bp->b_flags, B_INVAL);
-		else if (bp->b_validend < bp->b_bufsize) {
-			if ((((off_t)bp->b_blkno * (off_t)DEV_BSIZE) + bp->b_validend) != np->n_size)
-				SET(bp->b_flags, B_INVAL);
-		}
-		if (bp->b_flags & B_INVAL) {
-			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 260)) | DBG_FUNC_NONE,
-				bp->b_validoff, bp->b_validend, bp->b_bufsize, bp->b_bcount, 0);
-		}
-#endif
-		bp->b_resid = 0;
-		biodone(bp);
-		NFSTRACE(NFSTRC_DIO_DONE, vp);
-		return (0);
+		if (bp->b_validoff ||
+		    (bp->b_validend < bp->b_bufsize &&
+		     (off_t)bp->b_blkno * DEV_BSIZE + bp->b_validend !=
+		     np->n_size)) {
+			SET(bp->b_flags, B_INVAL);
+		}
+		if (bp->b_flags & B_INVAL) {
+			FSDBG(260, bp->b_validoff, bp->b_validend,
+			      bp->b_bufsize, bp->b_bcount);
+		}
+		bp->b_resid = 0;
+		biodone(bp);
+		FSDBG_BOT(256, bp->b_validoff, bp->b_validend, bp->b_bufsize,
+			  np->n_size);
+		return (0);
	}
 }
	bp->b_resid = uiop->uio_resid;
@@ -1323,13 +1375,11 @@ nfs_doio(bp, cr, p)
		nfs_clearcommit(vp->v_mount);

	if (bp->b_flags & B_INVAL) {
-		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 260)) | DBG_FUNC_NONE,
-			bp->b_validoff, bp->b_validend, bp->b_bufsize, bp->b_bcount, 0);
+		FSDBG(260, bp->b_validoff, bp->b_validend, bp->b_bufsize,
+		      bp->b_bcount);
	}
-	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 256)) | DBG_FUNC_END,
-		bp->b_validoff, bp->b_validend, bp->b_bcount, error, 0);
+	FSDBG_BOT(256, bp->b_validoff, bp->b_validend, bp->b_bcount, error);
	biodone(bp);
-	NFSTRACE(NFSTRC_DIO_DONE, vp);
	return (error);
 }
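Taken together, the final hunks collapse the old "#if 1 /* JOE */" blocks into one rule for partial buffers: a buffer may outlive the I/O only if its valid data starts at offset 0 and either fills the buffer or ends exactly at the file's EOF; anything else is marked B_INVAL so a stale page can never be recovered from VM. A condensed standalone sketch of that keep-or-toss predicate:

/* Condensed form of the invalidation rule in nfs_doio above.
 * buf_foff is the buffer's byte offset in the file; sketch only. */
static int
must_invalidate(int validoff, int validend, int bufsize,
    long long buf_foff, long long n_size)
{
	if (validoff != 0)
		return 1;                    /* hole at front: toss */
	if (validend < bufsize && buf_foff + validend != n_size)
		return 1;                    /* short and not at EOF: toss */
	return 0;                            /* full buffer, or ends at EOF */
}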