X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/36401178fd6817c043cc00b0c00c7f723e58efae..fe8ab488e9161c46dd9885d58fc52996dc0249ff:/bsd/nfs/nfs_bio.c diff --git a/bsd/nfs/nfs_bio.c b/bsd/nfs/nfs_bio.c index b1dccb036..a58e5d866 100644 --- a/bsd/nfs/nfs_bio.c +++ b/bsd/nfs/nfs_bio.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2014 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -77,6 +77,7 @@ #include #include #include +#include #include #include @@ -345,7 +346,7 @@ nfs_buf_page_inval(vnode_t vp, off_t offset) struct nfsbuf *bp; int error = 0; - if (!nmp) + if (nfs_mount_gone(nmp)) return (ENXIO); lck_mtx_lock(nfs_buf_mutex); @@ -364,11 +365,20 @@ nfs_buf_page_inval(vnode_t vp, off_t offset) */ if (bp->nb_dirtyend > 0) { int start = offset - NBOFF(bp); - if (bp->nb_dirtyend <= start || - bp->nb_dirtyoff >= (start + PAGE_SIZE)) - error = 0; - else + if ((bp->nb_dirtyend > start) && + (bp->nb_dirtyoff < (start + PAGE_SIZE))) { + /* + * Before returning the bad news, move the + * buffer to the start of the delwri list and + * give the list a push to try to flush the + * buffer out. + */ error = EBUSY; + nfs_buf_remfree(bp); + TAILQ_INSERT_HEAD(&nfsbufdelwri, bp, nb_free); + nfsbufdelwricnt++; + nfs_buf_delwri_push(1); + } } out: lck_mtx_unlock(nfs_buf_mutex); @@ -481,7 +491,7 @@ nfs_buf_map(struct nfsbuf *bp) if (!ISSET(bp->nb_flags, NB_PAGELIST)) return (EINVAL); - kret = ubc_upl_map(bp->nb_pagelist, (vm_address_t *)&(bp->nb_data)); + kret = ubc_upl_map(bp->nb_pagelist, (vm_offset_t *)&(bp->nb_data)); if (kret != KERN_SUCCESS) panic("nfs_buf_map: ubc_upl_map() failed with (%d)", kret); if (bp->nb_data == 0) @@ -523,7 +533,7 @@ nfs_buf_normalize_valid_range(nfsnode_t np, struct nfsbuf *bp) * process some entries on the delayed write queue * (must be called with nfs_buf_mutex held) */ -static void +void nfs_buf_delwri_service(void) { struct nfsbuf *bp; @@ -565,7 +575,7 @@ nfs_buf_delwri_service(void) /* * thread to service the delayed write queue when asked */ -static void +void nfs_buf_delwri_thread(__unused void *arg, __unused wait_result_t wr) { struct timespec ts = { 30, 0 }; @@ -585,7 +595,7 @@ nfs_buf_delwri_thread(__unused void *arg, __unused wait_result_t wr) * try to push out some delayed/uncommitted writes * ("locked" indicates whether nfs_buf_mutex is already held) */ -static void +void nfs_buf_delwri_push(int locked) { if (TAILQ_EMPTY(&nfsbufdelwri)) @@ -627,7 +637,7 @@ int nfs_buf_get( nfsnode_t np, daddr64_t blkno, - int size, + uint32_t size, thread_t thd, int flags, struct nfsbuf **bpp) @@ -635,7 +645,7 @@ nfs_buf_get( vnode_t vp = NFSTOV(np); struct nfsmount *nmp = VTONMP(vp); struct nfsbuf *bp; - int bufsize; + uint32_t bufsize; int slpflag = PCATCH; int operation = (flags & NBLK_OPMASK); int error = 0; @@ -648,14 +658,14 @@ nfs_buf_get( if (bufsize > NFS_MAXBSIZE) panic("nfs_buf_get: buffer larger than NFS_MAXBSIZE requested"); - if (!nmp) { + if (nfs_mount_gone(nmp)) { FSDBG_BOT(541, np, blkno, 0, ENXIO); return (ENXIO); } if (!UBCINFOEXISTS(vp)) { operation = NBLK_META; - } else if (bufsize < nmp->nm_biosize) { + } else if (bufsize < (uint32_t)nmp->nm_biosize) { /* reg files should always have biosize blocks */ bufsize = nmp->nm_biosize; } @@ -675,6 +685,21 @@ nfs_buf_get( loop: lck_mtx_lock(nfs_buf_mutex); + /* wait for any buffer invalidation/flushing to complete */ + while (np->n_bflag & NBINVALINPROG) { + np->n_bflag |= NBINVALWANT; + ts.tv_sec = 2; + ts.tv_nsec = 
0; + msleep(&np->n_bflag, nfs_buf_mutex, slpflag, "nfs_buf_get_invalwait", &ts); + if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0))) { + lck_mtx_unlock(nfs_buf_mutex); + FSDBG_BOT(541, np, blkno, 0, error); + return (error); + } + if (np->n_bflag & NBINVALINPROG) + slpflag = 0; + } + /* check for existence of nfsbuf in cache */ if ((bp = nfs_buf_incore(np, blkno))) { /* if busy, set wanted and wait */ @@ -855,8 +880,8 @@ loop: } } - /* setup nfsbuf */ - bp->nb_lflags = NBL_BUSY; + /* set up nfsbuf */ + SET(bp->nb_lflags, NBL_BUSY); bp->nb_flags = 0; bp->nb_lblkno = blkno; /* insert buf in hash */ @@ -969,9 +994,9 @@ nfs_buf_release(struct nfsbuf *bp, int freeup) vp = np ? NFSTOV(np) : NULL; if (vp && UBCINFOEXISTS(vp) && bp->nb_bufsize) { - int upl_flags; + int upl_flags, rv; upl_t upl; - int i, rv; + uint32_t i; if (!ISSET(bp->nb_flags, NB_PAGELIST) && !ISSET(bp->nb_flags, NB_INVAL)) { rv = nfs_buf_upl_setup(bp); @@ -1012,6 +1037,10 @@ nfs_buf_release(struct nfsbuf *bp, int freeup) upl_flags = UPL_COMMIT_SET_DIRTY; else upl_flags = UPL_COMMIT_CLEAR_DIRTY; + + if (!ISSET(bp->nb_flags, (NB_NEEDCOMMIT | NB_DELWRI))) + upl_flags |= UPL_COMMIT_CLEAR_PRECIOUS; + ubc_upl_commit_range(upl, i*PAGE_SIZE, PAGE_SIZE, upl_flags | @@ -1020,15 +1049,16 @@ nfs_buf_release(struct nfsbuf *bp, int freeup) } } pagelist_cleanup_done: - /* was this the last buffer in the file? */ + /* invalidate any pages past EOF */ if (NBOFF(bp) + bp->nb_bufsize > (off_t)(np->n_size)) { - /* if so, invalidate all pages of last buffer past EOF */ off_t start, end; start = trunc_page_64(np->n_size) + PAGE_SIZE_64; end = trunc_page_64(NBOFF(bp) + bp->nb_bufsize); + if (start < NBOFF(bp)) + start = NBOFF(bp); if (end > start) { - if (!(rv = ubc_sync_range(vp, start, end, UBC_INVALIDATE))) - printf("nfs_buf_release(): ubc_sync_range failed!\n"); + if ((rv = ubc_msync(vp, start, end, NULL, UBC_INVALIDATE))) + printf("nfs_buf_release(): ubc_msync failed!, error %d\n", rv); } } CLR(bp->nb_flags, NB_PAGELIST); @@ -1172,6 +1202,9 @@ nfs_buf_iodone(struct nfsbuf *bp) * any throttled write operations */ vnode_writedone(NFSTOV(bp->nb_np)); + nfs_node_lock_force(bp->nb_np); + bp->nb_np->n_numoutput--; + nfs_node_unlock(bp->nb_np); } if (ISSET(bp->nb_flags, NB_ASYNC)) { /* if async, release it */ SET(bp->nb_flags, NB_DONE); /* note that it's done */ @@ -1219,9 +1252,9 @@ nfs_buf_write_delayed(struct nfsbuf *bp) vnode_waitforwrites(NFSTOV(np), VNODE_ASYNC_THROTTLE, 0, 0, "nfs_buf_write_delayed"); /* the file is in a modified state, so make sure the flag's set */ - nfs_lock(np, NFS_NODE_LOCK_FORCE); + nfs_node_lock_force(np); np->n_flag |= NMODIFIED; - nfs_unlock(np); + nfs_node_unlock(np); /* * If we have too many delayed write buffers, @@ -1258,7 +1291,7 @@ nfs_buf_check_write_verifier(nfsnode_t np, struct nfsbuf *bp) return; nmp = NFSTONMP(np); - if (!nmp) + if (nfs_mount_gone(nmp)) return; if (!ISSET(bp->nb_flags, NB_STALEWVERF) && (bp->nb_verf == nmp->nm_verf)) return; @@ -1266,10 +1299,10 @@ nfs_buf_check_write_verifier(nfsnode_t np, struct nfsbuf *bp) /* write verifier changed, clear commit/wverf flags */ CLR(bp->nb_flags, (NB_NEEDCOMMIT | NB_STALEWVERF)); bp->nb_verf = 0; - nfs_lock(np, NFS_NODE_LOCK_FORCE); + nfs_node_lock_force(np); np->n_needcommitcnt--; CHECK_NEEDCOMMITCNT(np); - nfs_unlock(np); + nfs_node_unlock(np); } /* @@ -1303,7 +1336,7 @@ nfs_buf_acquire(struct nfsbuf *bp, int flags, int slpflag, int slptimeo) if (ISSET(bp->nb_lflags, NBL_BUSY)) { /* - * since the mutex_lock may block, the buffer + * since the 
lck_mtx_lock may block, the buffer * may become BUSY, so we need to recheck for * a NOWAIT request */ @@ -1442,7 +1475,7 @@ nfs_buf_read(struct nfsbuf *bp) NFS_BUF_MAP(bp); - OSAddAtomic(1, (SInt32 *)&nfsstats.read_bios); + OSAddAtomic64(1, &nfsstats.read_bios); error = nfs_buf_read_rpc(bp, thd, cred); /* @@ -1468,7 +1501,7 @@ nfs_buf_read_finish(struct nfsbuf *bp) /* update valid range */ bp->nb_validoff = 0; bp->nb_validend = bp->nb_endio; - if (bp->nb_endio < bp->nb_bufsize) { + if (bp->nb_endio < (int)bp->nb_bufsize) { /* * The read may be short because we have unflushed writes * that are extending the file size and the reads hit the @@ -1491,7 +1524,7 @@ nfs_buf_read_finish(struct nfsbuf *bp) bp->nb_valid = (1 << (round_page_32(bp->nb_validend) / PAGE_SIZE)) - 1; if (bp->nb_validend & PAGE_MASK) { /* zero-fill remainder of last page */ - bzero(bp->nb_data + bp->nb_validend, bp->nb_bufsize - bp->nb_validend); + bzero(bp->nb_data + bp->nb_validend, PAGE_SIZE - (bp->nb_validend & PAGE_MASK)); } } nfs_buf_iodone(bp); @@ -1506,13 +1539,14 @@ nfs_buf_read_rpc(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred) struct nfsmount *nmp; nfsnode_t np = bp->nb_np; int error = 0, nfsvers, async; - int offset, length, nmrsize, nrpcs, len; + int offset, nrpcs; + uint32_t nmrsize, length, len; off_t boff; struct nfsreq *req; struct nfsreq_cbinfo cb; nmp = NFSTONMP(np); - if (!nmp) { + if (nfs_mount_gone(nmp)) { bp->nb_error = error = ENXIO; SET(bp->nb_flags, NB_ERROR); nfs_buf_iodone(bp); @@ -1557,6 +1591,8 @@ nfs_buf_read_rpc(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred) len = (length > nmrsize) ? nmrsize : length; cb.rcb_args[0] = offset; cb.rcb_args[1] = len; + if (nmp->nm_vers >= NFS_VER4) + cb.rcb_args[2] = nmp->nm_stategenid; req = NULL; error = nmp->nm_funcs->nf_read_rpc_async(np, boff + offset, len, thd, cred, &cb, &req); if (error) @@ -1618,8 +1654,8 @@ nfs_buf_read_rpc_finish(struct nfsreq *req) nfsnode_t np; thread_t thd; kauth_cred_t cred; - struct uio uio; - struct iovec_32 io; + uio_t auio; + char uio_buf [ UIO_SIZEOF(1) ]; finish: np = req->r_np; @@ -1629,9 +1665,11 @@ finish: kauth_cred_ref(cred); cb = req->r_callback; bp = cb.rcb_bp; + if (cb.rcb_func) /* take an extra reference on the nfsreq in case we want to resend it later due to grace error */ + nfs_request_ref(req, 0); nmp = NFSTONMP(np); - if (!nmp) { + if (nfs_mount_gone(nmp)) { SET(bp->nb_flags, NB_ERROR); bp->nb_error = error = ENXIO; } @@ -1645,28 +1683,65 @@ finish: offset = cb.rcb_args[0]; rlen = length = cb.rcb_args[1]; - uio.uio_iovs.iov32p = &io; - uio.uio_iovcnt = 1; - uio.uio_rw = UIO_READ; -#if 1 /* LP64todo - can't use new segment flags until the drivers are ready */ - uio.uio_segflg = UIO_SYSSPACE; -#else - uio.uio_segflg = UIO_SYSSPACE32; -#endif - io.iov_len = length; - uio_uio_resid_set(&uio, io.iov_len); - uio.uio_offset = NBOFF(bp) + offset; - io.iov_base = (uintptr_t) bp->nb_data + offset; + auio = uio_createwithbuffer(1, NBOFF(bp) + offset, UIO_SYSSPACE, + UIO_READ, &uio_buf, sizeof(uio_buf)); + uio_addiov(auio, CAST_USER_ADDR_T(bp->nb_data + offset), length); /* finish the RPC */ - error = nmp->nm_funcs->nf_read_rpc_async_finish(np, req, &uio, &rlen, &eof); + error = nmp->nm_funcs->nf_read_rpc_async_finish(np, req, auio, &rlen, &eof); if ((error == EINPROGRESS) && cb.rcb_func) { /* async request restarted */ + if (cb.rcb_func) + nfs_request_rele(req); if (IS_VALID_CRED(cred)) kauth_cred_unref(&cred); return; } - + if ((nmp->nm_vers >= NFS_VER4) && nfs_mount_state_error_should_restart(error) && 
!ISSET(bp->nb_flags, NB_ERROR)) { + lck_mtx_lock(&nmp->nm_lock); + if ((error != NFSERR_OLD_STATEID) && (error != NFSERR_GRACE) && (cb.rcb_args[2] == nmp->nm_stategenid)) { + NP(np, "nfs_buf_read_rpc_finish: error %d @ 0x%llx, 0x%x 0x%x, initiating recovery", + error, NBOFF(bp)+offset, cb.rcb_args[2], nmp->nm_stategenid); + nfs_need_recover(nmp, error); + } + lck_mtx_unlock(&nmp->nm_lock); + if (np->n_flag & NREVOKE) { + error = EIO; + } else { + if (error == NFSERR_GRACE) { + if (cb.rcb_func) { + /* + * For an async I/O request, handle a grace delay just like + * jukebox errors. Set the resend time and queue it up. + */ + struct timeval now; + if (req->r_nmrep.nmc_mhead) { + mbuf_freem(req->r_nmrep.nmc_mhead); + req->r_nmrep.nmc_mhead = NULL; + } + req->r_error = 0; + microuptime(&now); + lck_mtx_lock(&req->r_mtx); + req->r_resendtime = now.tv_sec + 2; + req->r_xid = 0; // get a new XID + req->r_flags |= R_RESTART; + req->r_start = 0; + nfs_asyncio_resend(req); + lck_mtx_unlock(&req->r_mtx); + if (IS_VALID_CRED(cred)) + kauth_cred_unref(&cred); + /* Note: nfsreq reference taken will be dropped later when finished */ + return; + } + /* otherwise, just pause a couple seconds and retry */ + tsleep(&nmp->nm_state, (PZERO-1), "nfsgrace", 2*hz); + } + if (!(error = nfs_mount_state_wait_for_recovery(nmp))) { + rlen = 0; + goto readagain; + } + } + } if (error) { SET(bp->nb_flags, NB_ERROR); bp->nb_error = error; @@ -1692,19 +1767,24 @@ finish: * requested, so we need to issue another read for the rest. * (Don't bother if the buffer already hit an error.) */ +readagain: offset += rlen; length -= rlen; cb.rcb_args[0] = offset; cb.rcb_args[1] = length; - error = nmp->nm_funcs->nf_read_rpc_async(np, offset, length, thd, cred, &cb, &rreq); + if (nmp->nm_vers >= NFS_VER4) + cb.rcb_args[2] = nmp->nm_stategenid; + error = nmp->nm_funcs->nf_read_rpc_async(np, NBOFF(bp) + offset, length, thd, cred, &cb, &rreq); if (!error) { if (IS_VALID_CRED(cred)) kauth_cred_unref(&cred); if (!cb.rcb_func) { /* if !async we'll need to wait for this RPC to finish */ req = rreq; + rreq = NULL; goto finish; } + nfs_request_rele(req); /* * We're done here. * Outstanding RPC count is unchanged. @@ -1717,6 +1797,8 @@ finish: } out: + if (cb.rcb_func) + nfs_request_rele(req); if (IS_VALID_CRED(cred)) kauth_cred_unref(&cred); @@ -1752,14 +1834,15 @@ out: * Do buffer readahead. * Initiate async I/O to read buffers not in cache. */ -static int +int nfs_buf_readahead(nfsnode_t np, int ioflag, daddr64_t *rabnp, daddr64_t lastrabn, thread_t thd, kauth_cred_t cred) { struct nfsmount *nmp = NFSTONMP(np); struct nfsbuf *bp; - int error = 0, nra; + int error = 0; + uint32_t nra; - if (!nmp) + if (nfs_mount_gone(nmp)) return (ENXIO); if (nmp->nm_readahead <= 0) return (0); @@ -1768,9 +1851,17 @@ nfs_buf_readahead(nfsnode_t np, int ioflag, daddr64_t *rabnp, daddr64_t lastrabn for (nra = 0; (nra < nmp->nm_readahead) && (*rabnp <= lastrabn); nra++, *rabnp = *rabnp + 1) { /* check if block exists and is valid. 
*/ + if ((*rabnp * nmp->nm_biosize) >= (off_t)np->n_size) { + /* stop reading ahead if we're beyond EOF */ + *rabnp = lastrabn; + break; + } error = nfs_buf_get(np, *rabnp, nmp->nm_biosize, thd, NBLK_READ|NBLK_NOWAIT, &bp); if (error) break; + nfs_node_lock_force(np); + np->n_lastrahead = *rabnp; + nfs_node_unlock(np); if (!bp) continue; if ((ioflag & IO_NOCACHE) && ISSET(bp->nb_flags, NB_CACHE) && @@ -1798,209 +1889,161 @@ nfs_buf_readahead(nfsnode_t np, int ioflag, daddr64_t *rabnp, daddr64_t lastrabn } /* - * NFS buffer I/O for reading files/directories. + * NFS buffer I/O for reading files. */ int -nfs_bioread(nfsnode_t np, struct uio *uio, int ioflag, int *eofflag, vfs_context_t ctx) +nfs_bioread(nfsnode_t np, uio_t uio, int ioflag, vfs_context_t ctx) { vnode_t vp = NFSTOV(np); struct nfsbuf *bp = NULL; - struct nfs_vattr nvattr; struct nfsmount *nmp = VTONMP(vp); - daddr64_t lbn, rabn = 0, lastrabn, maxrabn = -1, tlbn; + daddr64_t lbn, rabn = 0, lastrabn, maxrabn = -1; off_t diff; int error = 0, n = 0, on = 0; - int nfsvers, biosize; - caddr_t dp; - struct dirent *direntp = NULL; - enum vtype vtype; + int nfsvers, biosize, modified, readaheads = 0; thread_t thd; kauth_cred_t cred; + int64_t io_resid; - FSDBG_TOP(514, np, uio->uio_offset, uio_uio_resid(uio), ioflag); - - if (uio_uio_resid(uio) == 0) { - FSDBG_BOT(514, np, 0xd1e0001, 0, 0); - return (0); - } - if (uio->uio_offset < 0) { - FSDBG_BOT(514, np, 0xd1e0002, 0, EINVAL); - return (EINVAL); - } + FSDBG_TOP(514, np, uio_offset(uio), uio_resid(uio), ioflag); nfsvers = nmp->nm_vers; biosize = nmp->nm_biosize; thd = vfs_context_thread(ctx); cred = vfs_context_ucred(ctx); - vtype = vnode_vtype(vp); - if ((vtype != VREG) && (vtype != VDIR)) { - printf("nfs_bioread: type %x unexpected\n", vtype); + if (vnode_vtype(vp) != VREG) { + printf("nfs_bioread: type %x unexpected\n", vnode_vtype(vp)); FSDBG_BOT(514, np, 0xd1e0016, 0, EINVAL); return (EINVAL); } /* - * For nfs, cache consistency can only be maintained approximately. + * For NFS, cache consistency can only be maintained approximately. * Although RFC1094 does not specify the criteria, the following is * believed to be compatible with the reference port. - * For nfs: - * If the file's modify time on the server has changed since the - * last read rpc or you have written to the file, - * you may have lost data cache consistency with the - * server, so flush all of the file's data out of the cache. - * Then force a getattr rpc to ensure that you have up to date - * attributes. + * + * If the file has changed since the last read RPC or you have + * written to the file, you may have lost data cache consistency + * with the server. So, check for a change, and flush all of the + * file's data out of the cache. * NB: This implies that cache data can be read when up to - * NFS_MAXATTRTIMEO seconds out of date. If you find that you need - * current attributes this could be forced by calling - * NATTRINVALIDATE() before the nfs_getattr() call. + * NFS_MAXATTRTIMO seconds out of date. If you find that you + * need current attributes, nfs_getattr() can be forced to fetch + * new attributes (via NATTRINVALIDATE() or NGA_UNCACHED). 
*/ if (ISSET(np->n_flag, NUPDATESIZE)) nfs_data_update_size(np, 0); - if ((error = nfs_lock(np, NFS_NODE_LOCK_EXCLUSIVE))) { + if ((error = nfs_node_lock(np))) { FSDBG_BOT(514, np, 0xd1e0222, 0, error); return (error); } if (np->n_flag & NNEEDINVALIDATE) { np->n_flag &= ~NNEEDINVALIDATE; - nfs_unlock(np); - nfs_vinvalbuf(vp, V_SAVE|V_IGNORE_WRITEERR, ctx, 1); - if ((error = nfs_lock(np, NFS_NODE_LOCK_EXCLUSIVE))) { + nfs_node_unlock(np); + error = nfs_vinvalbuf(vp, V_SAVE|V_IGNORE_WRITEERR, ctx, 1); + if (!error) + error = nfs_node_lock(np); + if (error) { FSDBG_BOT(514, np, 0xd1e0322, 0, error); return (error); } } - if (np->n_flag & NMODIFIED) { - if (vtype == VDIR) { - nfs_invaldir(np); - nfs_unlock(np); - error = nfs_vinvalbuf(vp, V_SAVE, ctx, 1); - if (!error) - error = nfs_lock(np, NFS_NODE_LOCK_EXCLUSIVE); - if (error) { - FSDBG_BOT(514, np, 0xd1e0003, 0, error); - return (error); - } - } - NATTRINVALIDATE(np); - error = nfs_getattr(np, &nvattr, ctx, 1); - if (error) { - nfs_unlock(np); - FSDBG_BOT(514, np, 0xd1e0004, 0, error); - return (error); - } - if (vtype == VDIR) { - /* if directory changed, purge any name cache entries */ - if (NFS_CHANGED_NC(nfsvers, np, &nvattr)) - cache_purge(vp); - NFS_CHANGED_UPDATE_NC(nfsvers, np, &nvattr); - } - NFS_CHANGED_UPDATE(nfsvers, np, &nvattr); - } else { - error = nfs_getattr(np, &nvattr, ctx, 1); - if (error) { - nfs_unlock(np); - FSDBG_BOT(514, np, 0xd1e0005, 0, error); - return (error); - } - if (NFS_CHANGED(nfsvers, np, &nvattr)) { - if (vtype == VDIR) { - nfs_invaldir(np); - /* purge name cache entries */ - if (NFS_CHANGED_NC(nfsvers, np, &nvattr)) - cache_purge(vp); - } - nfs_unlock(np); - error = nfs_vinvalbuf(vp, V_SAVE, ctx, 1); - if (!error) - error = nfs_lock(np, NFS_NODE_LOCK_EXCLUSIVE); - if (error) { - FSDBG_BOT(514, np, 0xd1e0006, 0, error); - return (error); - } - if (vtype == VDIR) - NFS_CHANGED_UPDATE_NC(nfsvers, np, &nvattr); - NFS_CHANGED_UPDATE(nfsvers, np, &nvattr); - } + modified = (np->n_flag & NMODIFIED); + nfs_node_unlock(np); + /* nfs_getattr() will check changed and purge caches */ + error = nfs_getattr(np, NULL, ctx, modified ? NGA_UNCACHED : NGA_CACHED); + if (error) { + FSDBG_BOT(514, np, 0xd1e0004, 0, error); + return (error); } - nfs_unlock(np); + if (uio_resid(uio) == 0) { + FSDBG_BOT(514, np, 0xd1e0001, 0, 0); + return (0); + } + if (uio_offset(uio) < 0) { + FSDBG_BOT(514, np, 0xd1e0002, 0, EINVAL); + return (EINVAL); + } - if (vtype == VREG) { - if ((ioflag & IO_NOCACHE) && (uio_uio_resid(uio) < (2*biosize))) { - /* We have only a block or so to read, just do the rpc directly. 
*/ - error = nfs_read_rpc(np, uio, ctx); - FSDBG_BOT(514, np, uio->uio_offset, uio_uio_resid(uio), error); - return (error); - } - /* - * set up readahead - which may be limited by: - * + current request length (for IO_NOCACHE) - * + readahead setting - * + file size - */ - if (nmp->nm_readahead > 0) { - off_t end = uio->uio_offset + uio_uio_resid(uio); - if (end > (off_t)np->n_size) - end = np->n_size; - rabn = uio->uio_offset / biosize; - maxrabn = (end - 1) / biosize; - if (!(ioflag & IO_NOCACHE) && - (!rabn || (rabn == np->n_lastread) || (rabn == (np->n_lastread+1)))) { - maxrabn += nmp->nm_readahead; - if ((maxrabn * biosize) >= (off_t)np->n_size) - maxrabn = ((off_t)np->n_size - 1)/biosize; - } - } else { - rabn = maxrabn = 0; - } + /* + * set up readahead - which may be limited by: + * + current request length (for IO_NOCACHE) + * + readahead setting + * + file size + */ + if (nmp->nm_readahead > 0) { + off_t end = uio_offset(uio) + uio_resid(uio); + if (end > (off_t)np->n_size) + end = np->n_size; + rabn = uio_offset(uio) / biosize; + maxrabn = (end - 1) / biosize; + nfs_node_lock_force(np); + if (!(ioflag & IO_NOCACHE) && + (!rabn || (rabn == np->n_lastread) || (rabn == (np->n_lastread+1)))) { + maxrabn += nmp->nm_readahead; + if ((maxrabn * biosize) >= (off_t)np->n_size) + maxrabn = ((off_t)np->n_size - 1)/biosize; + } + if (maxrabn < np->n_lastrahead) + np->n_lastrahead = -1; + if (rabn < np->n_lastrahead) + rabn = np->n_lastrahead + 1; + nfs_node_unlock(np); + } else { + rabn = maxrabn = 0; } do { - if (vtype == VREG) { - nfs_data_lock(np, NFS_NODE_LOCK_SHARED); - lbn = uio->uio_offset / biosize; + nfs_data_lock(np, NFS_DATA_LOCK_SHARED); + lbn = uio_offset(uio) / biosize; /* * Copy directly from any cached pages without grabbing the bufs. - * - * Note: for "nocache" reads, we don't copy directly from UBC - * because any cached pages will be for readahead buffers that - * need to be invalidated anyway before we finish this request. + * (If we are NOCACHE and we've issued readahead requests, we need + * to grab the NB_NCRDAHEAD bufs to drop them.) */ - if (!(ioflag & IO_NOCACHE) && - (uio->uio_segflg == UIO_USERSPACE32 || - uio->uio_segflg == UIO_USERSPACE64 || - uio->uio_segflg == UIO_USERSPACE)) { - // LP64todo - fix this! - int io_resid = uio_uio_resid(uio); - diff = np->n_size - uio->uio_offset; + if ((!(ioflag & IO_NOCACHE) || !readaheads) && + ((uio->uio_segflg == UIO_USERSPACE32 || + uio->uio_segflg == UIO_USERSPACE64 || + uio->uio_segflg == UIO_USERSPACE))) { + io_resid = uio_resid(uio); + diff = np->n_size - uio_offset(uio); if (diff < io_resid) io_resid = diff; if (io_resid > 0) { - error = cluster_copy_ubc_data(vp, uio, &io_resid, 0); + int count = (io_resid > INT_MAX) ? 
INT_MAX : io_resid; + error = cluster_copy_ubc_data(vp, uio, &count, 0); if (error) { nfs_data_unlock(np); - FSDBG_BOT(514, np, uio->uio_offset, 0xcacefeed, error); + FSDBG_BOT(514, np, uio_offset(uio), 0xcacefeed, error); return (error); } } /* count any biocache reads that we just copied directly */ - if (lbn != (uio->uio_offset / biosize)) { - OSAddAtomic((uio->uio_offset / biosize) - lbn, (SInt32*)&nfsstats.biocache_reads); - FSDBG(514, np, 0xcacefeed, uio->uio_offset, error); + if (lbn != (uio_offset(uio)/biosize)) { + OSAddAtomic64((uio_offset(uio)/biosize) - lbn, &nfsstats.biocache_reads); + FSDBG(514, np, 0xcacefeed, uio_offset(uio), error); } } - lbn = uio->uio_offset / biosize; - on = uio->uio_offset % biosize; - np->n_lastread = (uio->uio_offset - 1) / biosize; + lbn = uio_offset(uio) / biosize; + on = uio_offset(uio) % biosize; + nfs_node_lock_force(np); + np->n_lastread = (uio_offset(uio) - 1) / biosize; + nfs_node_unlock(np); + + if ((uio_resid(uio) <= 0) || (uio_offset(uio) >= (off_t)np->n_size)) { + nfs_data_unlock(np); + FSDBG_BOT(514, np, uio_offset(uio), uio_resid(uio), 0xaaaaaaaa); + return (0); + } /* adjust readahead block number, if necessary */ if (rabn < lbn) @@ -2013,15 +2056,10 @@ nfs_bioread(nfsnode_t np, struct uio *uio, int ioflag, int *eofflag, vfs_context FSDBG_BOT(514, np, 0xd1e000b, 1, error); return (error); } + readaheads = 1; } - if ((uio_uio_resid(uio) <= 0) || (uio->uio_offset >= (off_t)np->n_size)) { - nfs_data_unlock(np); - FSDBG_BOT(514, np, uio->uio_offset, uio_uio_resid(uio), 0xaaaaaaaa); - return (0); - } - - OSAddAtomic(1, (SInt32*)&nfsstats.biocache_reads); + OSAddAtomic64(1, &nfsstats.biocache_reads); /* * If the block is in the cache and has the required data @@ -2030,9 +2068,9 @@ nfs_bioread(nfsnode_t np, struct uio *uio, int ioflag, int *eofflag, vfs_context * as required. */ again: - // LP64todo - fix this! - n = min((unsigned)(biosize - on), uio_uio_resid(uio)); - diff = np->n_size - uio->uio_offset; + io_resid = uio_resid(uio); + n = (io_resid > (biosize - on)) ? 
(biosize - on) : io_resid; + diff = np->n_size - uio_offset(uio); if (diff < n) n = diff; @@ -2055,11 +2093,9 @@ again: SET(bp->nb_flags, NB_NOCACHE); goto flushbuffer; } - if (!ISSET(bp->nb_flags, NB_NCRDAHEAD)) { - CLR(bp->nb_flags, NB_CACHE); - bp->nb_valid = 0; - } else { + if (ISSET(bp->nb_flags, NB_NCRDAHEAD)) { CLR(bp->nb_flags, NB_NCRDAHEAD); + SET(bp->nb_flags, NB_NOCACHE); } } @@ -2127,7 +2163,7 @@ flushbuffer: if (!auio) { error = ENOMEM; } else { - uio_addiov(auio, CAST_USER_ADDR_T((bp->nb_data + firstpg * PAGE_SIZE)), + uio_addiov(auio, CAST_USER_ADDR_T(bp->nb_data + (firstpg * PAGE_SIZE)), ((lastpg - firstpg + 1) * PAGE_SIZE)); error = nfs_read_rpc(np, auio, ctx); } @@ -2162,6 +2198,8 @@ flushbuffer: SET(bp->nb_flags, NB_READ); CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL)); error = nfs_buf_read(bp); + if (ioflag & IO_NOCACHE) + SET(bp->nb_flags, NB_NOCACHE); if (error) { nfs_data_unlock(np); nfs_buf_release(bp, 1); @@ -2176,157 +2214,28 @@ buffer_ready: if (diff < n) n = diff; } - if (n > 0) - NFS_BUF_MAP(bp); - } else if (vtype == VDIR) { - OSAddAtomic(1, (SInt32*)&nfsstats.biocache_readdirs); - error = nfs_lock(np, NFS_NODE_LOCK_SHARED); - if (error || (np->n_direofoffset && (uio->uio_offset >= np->n_direofoffset))) { - if (!error) - nfs_unlock(np); - if (eofflag) - *eofflag = 1; - FSDBG_BOT(514, np, 0xde0f0001, 0, 0); - return (0); - } - nfs_unlock(np); - lbn = uio->uio_offset / NFS_DIRBLKSIZ; - on = uio->uio_offset & (NFS_DIRBLKSIZ - 1); - error = nfs_buf_get(np, lbn, NFS_DIRBLKSIZ, thd, NBLK_READ, &bp); - if (error) { - FSDBG_BOT(514, np, 0xd1e0012, 0, error); - return (error); - } - if (!ISSET(bp->nb_flags, NB_CACHE)) { - SET(bp->nb_flags, NB_READ); - error = nfs_buf_readdir(bp, ctx); - if (error) - nfs_buf_release(bp, 1); - while (error == NFSERR_BAD_COOKIE) { - error = nfs_lock(np, NFS_NODE_LOCK_EXCLUSIVE); - if (!error) { - nfs_invaldir(np); - nfs_unlock(np); - } - error = nfs_vinvalbuf(vp, 0, ctx, 1); - /* - * Yuck! The directory has been modified on the - * server. The only way to get the block is by - * reading from the beginning to get all the - * offset cookies. - */ - for (tlbn = 0; tlbn <= lbn && !error; tlbn++) { - if ((error = nfs_lock(np, NFS_NODE_LOCK_SHARED))) - break; - if (np->n_direofoffset - && (tlbn * NFS_DIRBLKSIZ) >= np->n_direofoffset) { - nfs_unlock(np); - if (eofflag) - *eofflag = 1; - FSDBG_BOT(514, np, 0xde0f0002, 0, 0); - return (0); - } - nfs_unlock(np); - error = nfs_buf_get(np, tlbn, NFS_DIRBLKSIZ, thd, NBLK_READ, &bp); - if (error) { - FSDBG_BOT(514, np, 0xd1e0013, 0, error); - return (error); - } - if (!ISSET(bp->nb_flags, NB_CACHE)) { - SET(bp->nb_flags, NB_READ); - error = nfs_buf_readdir(bp, ctx); - /* - * no error + NB_INVAL == directory EOF, - * use the block. - */ - if (error == 0 && ISSET(bp->nb_flags, NB_INVAL)) { - if (eofflag) - *eofflag = 1; - break; - } - } - /* - * An error will throw away the block and the - * for loop will break out. If no error and this - * is not the block we want, we throw away the - * block and go for the next one via the for loop. - */ - if (error || (tlbn < lbn)) - nfs_buf_release(bp, 1); - } - } - /* - * The above while is repeated if we hit another cookie - * error. If we hit an error and it wasn't a cookie error, - * we give up. - */ - if (error) { - FSDBG_BOT(514, np, 0xd1e0014, 0, error); - return (error); - } - } - /* - * Make sure we use a signed variant of min() since - * the second term may be negative. - */ - // LP64todo - fix this! 
- n = lmin(uio_uio_resid(uio), bp->nb_validend - on); - /* - * We keep track of the directory eof in - * np->n_direofoffset and chop it off as an - * extra step right here. - */ - if ((error = nfs_lock(np, NFS_NODE_LOCK_SHARED))) { - FSDBG_BOT(514, np, 0xd1e0115, 0, error); - return (error); - } - if (np->n_direofoffset && - n > np->n_direofoffset - uio->uio_offset) - n = np->n_direofoffset - uio->uio_offset; - nfs_unlock(np); - /* - * Make sure that we return an integral number of entries so - * that any subsequent calls will start copying from the start - * of the next entry. - * - * If the current value of n has the last entry cut short, - * set n to copy everything up to the last entry instead. - */ if (n > 0) { - dp = bp->nb_data + on; - while (dp < (bp->nb_data + on + n)) { - direntp = (struct dirent *)dp; - dp += direntp->d_reclen; - } - if (dp > (bp->nb_data + on + n)) - n = (dp - direntp->d_reclen) - (bp->nb_data + on); + NFS_BUF_MAP(bp); + error = uiomove(bp->nb_data + on, n, uio); } - } - - if (n > 0) - error = uiomove(bp->nb_data + on, (int)n, uio); - if (vtype == VREG) { - if (ioflag & IO_NOCACHE) - SET(bp->nb_flags, NB_NOCACHE); nfs_buf_release(bp, 1); nfs_data_unlock(np); - np->n_lastread = (uio->uio_offset - 1) / biosize; - } else { - nfs_buf_release(bp, 1); - } - } while (error == 0 && uio_uio_resid(uio) > 0 && n > 0); - FSDBG_BOT(514, np, uio->uio_offset, uio_uio_resid(uio), error); + nfs_node_lock_force(np); + np->n_lastread = (uio_offset(uio) - 1) / biosize; + nfs_node_unlock(np); + } while (error == 0 && uio_resid(uio) > 0 && n > 0); + FSDBG_BOT(514, np, uio_offset(uio), uio_resid(uio), error); return (error); } /* * limit the number of outstanding async I/O writes */ -static int +int nfs_async_write_start(struct nfsmount *nmp) { - int error = 0, slpflag = (nmp->nm_flag & NFSMNT_INT) ? PCATCH : 0; + int error = 0, slpflag = NMFLAG(nmp, INTR) ? PCATCH : 0; struct timespec ts = {1, 0}; if (nfs_max_async_writes <= 0) @@ -2343,7 +2252,7 @@ nfs_async_write_start(struct nfsmount *nmp) lck_mtx_unlock(&nmp->nm_lock); return (error); } -static void +void nfs_async_write_done(struct nfsmount *nmp) { if (nmp->nm_asyncwrites <= 0) @@ -2402,10 +2311,13 @@ nfs_buf_write(struct nfsbuf *bp) LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs); lck_mtx_unlock(nfs_buf_mutex); } + nfs_node_lock_force(np); + np->n_numoutput++; + nfs_node_unlock(np); vnode_startwrite(NFSTOV(np)); if (p && p->p_stats) - OSIncrementAtomic(&p->p_stats->p_ru.ru_oublock); + OSIncrementAtomicLong(&p->p_stats->p_ru.ru_oublock); cred = bp->nb_wcred; if (!IS_VALID_CRED(cred) && ISSET(bp->nb_flags, NB_READ)) @@ -2415,17 +2327,26 @@ nfs_buf_write(struct nfsbuf *bp) thd = async ? NULL : current_thread(); /* We need to make sure the pages are locked before doing I/O. */ - if (!ISSET(bp->nb_flags, NB_META) && UBCINFOEXISTS(NFSTOV(np))) { - if (!ISSET(bp->nb_flags, NB_PAGELIST)) { - error = nfs_buf_upl_setup(bp); - if (error) { - printf("nfs_buf_write: upl create failed %d\n", error); - SET(bp->nb_flags, NB_ERROR); - bp->nb_error = error = EIO; - nfs_buf_iodone(bp); - goto out; + if (!ISSET(bp->nb_flags, NB_META)) { + if (UBCINFOEXISTS(NFSTOV(np))) { + if (!ISSET(bp->nb_flags, NB_PAGELIST)) { + error = nfs_buf_upl_setup(bp); + if (error) { + printf("nfs_buf_write: upl create failed %d\n", error); + SET(bp->nb_flags, NB_ERROR); + bp->nb_error = error = EIO; + nfs_buf_iodone(bp); + goto out; + } + nfs_buf_upl_check(bp); } - nfs_buf_upl_check(bp); + } else { + /* We should never be in nfs_buf_write() with no UBCINFO. 
*/ + printf("nfs_buf_write: ubcinfo already gone\n"); + SET(bp->nb_flags, NB_ERROR); + bp->nb_error = error = EIO; + nfs_buf_iodone(bp); + goto out; } } @@ -2434,7 +2355,7 @@ nfs_buf_write(struct nfsbuf *bp) nfs_buf_check_write_verifier(np, bp); if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) { struct nfsmount *nmp = NFSTONMP(np); - if (!nmp) { + if (nfs_mount_gone(nmp)) { SET(bp->nb_flags, NB_ERROR); bp->nb_error = error = EIO; nfs_buf_iodone(bp); @@ -2442,7 +2363,7 @@ nfs_buf_write(struct nfsbuf *bp) } SET(bp->nb_flags, NB_WRITEINPROG); error = nmp->nm_funcs->nf_commit_rpc(np, NBOFF(bp) + bp->nb_dirtyoff, - bp->nb_dirtyend - bp->nb_dirtyoff, bp->nb_wcred); + bp->nb_dirtyend - bp->nb_dirtyoff, bp->nb_wcred, bp->nb_verf); CLR(bp->nb_flags, NB_WRITEINPROG); if (error) { if (error != NFSERR_STALEWRITEVERF) { @@ -2454,10 +2375,10 @@ nfs_buf_write(struct nfsbuf *bp) } bp->nb_dirtyoff = bp->nb_dirtyend = 0; CLR(bp->nb_flags, NB_NEEDCOMMIT); - nfs_lock(np, NFS_NODE_LOCK_FORCE); + nfs_node_lock_force(np); np->n_needcommitcnt--; CHECK_NEEDCOMMITCNT(np); - nfs_unlock(np); + nfs_node_unlock(np); } if (!error && (bp->nb_dirtyend > 0)) { /* sanity check the dirty range */ @@ -2486,7 +2407,7 @@ nfs_buf_write(struct nfsbuf *bp) dend = round_page_32(dend); /* try to expand write range to include trailing dirty pages */ if (!(dend & PAGE_MASK)) - while ((dend < bp->nb_bufsize) && NBPGDIRTY(bp, dend / PAGE_SIZE)) + while ((dend < (int)bp->nb_bufsize) && NBPGDIRTY(bp, dend / PAGE_SIZE)) dend += PAGE_SIZE; /* make sure to keep dend clipped to EOF */ if ((NBOFF(bp) + dend) > (off_t) np->n_size) @@ -2513,7 +2434,7 @@ nfs_buf_write(struct nfsbuf *bp) bp->nb_offio = doff; bp->nb_endio = dend; - OSAddAtomic(1, (SInt32 *)&nfsstats.write_bios); + OSAddAtomic64(1, &nfsstats.write_bios); SET(bp->nb_flags, NB_WRITEINPROG); error = nfs_buf_write_rpc(bp, iomode, thd, cred); @@ -2546,12 +2467,12 @@ out: if ((np->n_flag & NNEEDINVALIDATE) && !(np->n_bflag & (NBINVALINPROG|NBFLUSHINPROG))) { int invalidate = 0; - nfs_lock(np, NFS_NODE_LOCK_FORCE); + nfs_node_lock_force(np); if (np->n_flag & NNEEDINVALIDATE) { invalidate = 1; np->n_flag &= ~NNEEDINVALIDATE; } - nfs_unlock(np); + nfs_node_unlock(np); if (invalidate) { /* * There was a write error and we need to @@ -2603,19 +2524,19 @@ nfs_buf_write_finish(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred) /* manage needcommit state */ if (!error && (bp->nb_commitlevel == NFS_WRITE_UNSTABLE)) { if (!ISSET(bp->nb_flags, NB_NEEDCOMMIT)) { - nfs_lock(np, NFS_NODE_LOCK_FORCE); + nfs_node_lock_force(np); np->n_needcommitcnt++; - nfs_unlock(np); + nfs_node_unlock(np); SET(bp->nb_flags, NB_NEEDCOMMIT); } /* make sure nb_dirtyoff/nb_dirtyend reflect actual range written */ bp->nb_dirtyoff = bp->nb_offio; bp->nb_dirtyend = bp->nb_endio; } else if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) { - nfs_lock(np, NFS_NODE_LOCK_FORCE); + nfs_node_lock_force(np); np->n_needcommitcnt--; CHECK_NEEDCOMMITCNT(np); - nfs_unlock(np); + nfs_node_unlock(np); CLR(bp->nb_flags, NB_NEEDCOMMIT); } @@ -2664,11 +2585,11 @@ nfs_buf_write_finish(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred) * buffer busy. Set a flag to do it after releasing * the buffer. 
*/ - nfs_lock(np, NFS_NODE_LOCK_FORCE); + nfs_node_lock_force(np); np->n_error = error; np->n_flag |= (NWRITEERR | NNEEDINVALIDATE); NATTRINVALIDATE(np); - nfs_unlock(np); + nfs_node_unlock(np); } /* clear the dirty range */ bp->nb_dirtyoff = bp->nb_dirtyend = 0; @@ -2694,27 +2615,21 @@ nfs_buf_write_dirty_pages(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred) int error = 0, commit, iomode, iomode2, len, pg, count, npages, off; uint32_t dirty = bp->nb_dirty; uint64_t wverf; - struct uio uio; - struct iovec_32 io; + uio_t auio; + char uio_buf [ UIO_SIZEOF(1) ]; if (!bp->nb_dirty) return (0); /* there are pages marked dirty that need to be written out */ - OSAddAtomic(1, (SInt32 *)&nfsstats.write_bios); + OSAddAtomic64(1, &nfsstats.write_bios); NFS_BUF_MAP(bp); SET(bp->nb_flags, NB_WRITEINPROG); npages = bp->nb_bufsize / PAGE_SIZE; iomode = NFS_WRITE_UNSTABLE; - uio.uio_iovs.iov32p = &io; - uio.uio_iovcnt = 1; - uio.uio_rw = UIO_WRITE; -#if 1 /* LP64todo - can't use new segment flags until the drivers are ready */ - uio.uio_segflg = UIO_SYSSPACE; -#else - uio.uio_segflg = UIO_SYSSPACE32; -#endif + auio = uio_createwithbuffer(1, 0, UIO_SYSSPACE, UIO_WRITE, + &uio_buf, sizeof(uio_buf)); again: dirty = bp->nb_dirty; @@ -2734,11 +2649,9 @@ again: len -= (NBOFF(bp) + off + len) - np->n_size; if (len > 0) { iomode2 = iomode; - io.iov_len = len; - uio_uio_resid_set(&uio, io.iov_len); - uio.uio_offset = NBOFF(bp) + off; - io.iov_base = (uintptr_t) bp->nb_data + off; - error = nfs_write_rpc2(np, &uio, thd, cred, &iomode2, &bp->nb_verf); + uio_reset(auio, NBOFF(bp) + off, UIO_SYSSPACE, UIO_WRITE); + uio_addiov(auio, CAST_USER_ADDR_T(bp->nb_data + off), len); + error = nfs_write_rpc2(np, auio, thd, cred, &iomode2, &bp->nb_verf); if (error) break; if (iomode2 < commit) /* Retain the lowest commitment level returned. 
*/ @@ -2759,7 +2672,7 @@ again: CLR(bp->nb_flags, NB_WRITEINPROG); if (!error && (commit != NFS_WRITE_FILESYNC)) { - error = nmp->nm_funcs->nf_commit_rpc(np, NBOFF(bp), bp->nb_bufsize, cred); + error = nmp->nm_funcs->nf_commit_rpc(np, NBOFF(bp), bp->nb_bufsize, cred, wverf); if (error == NFSERR_STALEWRITEVERF) { /* verifier changed, so we need to restart all the writes */ iomode = NFS_WRITE_FILESYNC; @@ -2784,14 +2697,15 @@ nfs_buf_write_rpc(struct nfsbuf *bp, int iomode, thread_t thd, kauth_cred_t cred struct nfsmount *nmp; nfsnode_t np = bp->nb_np; int error = 0, nfsvers, async; - int offset, length, nmwsize, nrpcs, len; + int offset, nrpcs; + uint32_t nmwsize, length, len; struct nfsreq *req; struct nfsreq_cbinfo cb; - struct uio uio; - struct iovec_32 io; + uio_t auio; + char uio_buf [ UIO_SIZEOF(1) ]; nmp = NFSTONMP(np); - if (!nmp) { + if (nfs_mount_gone(nmp)) { bp->nb_error = error = ENXIO; SET(bp->nb_flags, NB_ERROR); nfs_buf_iodone(bp); @@ -2816,18 +2730,9 @@ nfs_buf_write_rpc(struct nfsbuf *bp, int iomode, thread_t thd, kauth_cred_t cred return (error); } - uio.uio_iovs.iov32p = &io; - uio.uio_iovcnt = 1; - uio.uio_rw = UIO_WRITE; -#if 1 /* LP64todo - can't use new segment flags until the drivers are ready */ - uio.uio_segflg = UIO_SYSSPACE; -#else - uio.uio_segflg = UIO_SYSSPACE32; -#endif - io.iov_len = length; - uio_uio_resid_set(&uio, io.iov_len); - uio.uio_offset = NBOFF(bp) + offset; - io.iov_base = (uintptr_t) bp->nb_data + offset; + auio = uio_createwithbuffer(1, NBOFF(bp) + offset, UIO_SYSSPACE, + UIO_WRITE, &uio_buf, sizeof(uio_buf)); + uio_addiov(auio, CAST_USER_ADDR_T(bp->nb_data + offset), length); bp->nb_rpcs = nrpcs = (length + nmwsize - 1) / nmwsize; if (async && (nrpcs > 1)) { @@ -2844,10 +2749,12 @@ nfs_buf_write_rpc(struct nfsbuf *bp, int iomode, thread_t thd, kauth_cred_t cred len = (length > nmwsize) ? nmwsize : length; cb.rcb_args[0] = offset; cb.rcb_args[1] = len; + if (nmp->nm_vers >= NFS_VER4) + cb.rcb_args[2] = nmp->nm_stategenid; if (async && ((error = nfs_async_write_start(nmp)))) break; req = NULL; - error = nmp->nm_funcs->nf_write_rpc_async(np, &uio, len, thd, cred, + error = nmp->nm_funcs->nf_write_rpc_async(np, auio, len, thd, cred, iomode, &cb, &req); if (error) { if (async) @@ -2886,6 +2793,9 @@ nfs_buf_write_rpc(struct nfsbuf *bp, int iomode, thread_t thd, kauth_cred_t cred } else { nfs_buf_write_finish(bp, thd, cred); } + /* It may have just been an interrupt... 
that's OK */ + if (!ISSET(bp->nb_flags, NB_ERROR)) + error = 0; } return (error); @@ -2909,8 +2819,8 @@ nfs_buf_write_rpc_finish(struct nfsreq *req) nfsnode_t np; thread_t thd; kauth_cred_t cred; - struct uio uio; - struct iovec_32 io; + uio_t auio; + char uio_buf [ UIO_SIZEOF(1) ]; finish: np = req->r_np; @@ -2920,9 +2830,11 @@ finish: kauth_cred_ref(cred); cb = req->r_callback; bp = cb.rcb_bp; + if (cb.rcb_func) /* take an extra reference on the nfsreq in case we want to resend it later due to grace error */ + nfs_request_ref(req, 0); nmp = NFSTONMP(np); - if (!nmp) { + if (nfs_mount_gone(nmp)) { SET(bp->nb_flags, NB_ERROR); bp->nb_error = error = ENXIO; } @@ -2940,11 +2852,57 @@ finish: error = nmp->nm_funcs->nf_write_rpc_async_finish(np, req, &committed, &rlen, &wverf); if ((error == EINPROGRESS) && cb.rcb_func) { /* async request restarted */ + if (cb.rcb_func) + nfs_request_rele(req); if (IS_VALID_CRED(cred)) kauth_cred_unref(&cred); return; } - + if ((nmp->nm_vers >= NFS_VER4) && nfs_mount_state_error_should_restart(error) && !ISSET(bp->nb_flags, NB_ERROR)) { + lck_mtx_lock(&nmp->nm_lock); + if ((error != NFSERR_OLD_STATEID) && (error != NFSERR_GRACE) && (cb.rcb_args[2] == nmp->nm_stategenid)) { + NP(np, "nfs_buf_write_rpc_finish: error %d @ 0x%llx, 0x%x 0x%x, initiating recovery", + error, NBOFF(bp)+offset, cb.rcb_args[2], nmp->nm_stategenid); + nfs_need_recover(nmp, error); + } + lck_mtx_unlock(&nmp->nm_lock); + if (np->n_flag & NREVOKE) { + error = EIO; + } else { + if (error == NFSERR_GRACE) { + if (cb.rcb_func) { + /* + * For an async I/O request, handle a grace delay just like + * jukebox errors. Set the resend time and queue it up. + */ + struct timeval now; + if (req->r_nmrep.nmc_mhead) { + mbuf_freem(req->r_nmrep.nmc_mhead); + req->r_nmrep.nmc_mhead = NULL; + } + req->r_error = 0; + microuptime(&now); + lck_mtx_lock(&req->r_mtx); + req->r_resendtime = now.tv_sec + 2; + req->r_xid = 0; // get a new XID + req->r_flags |= R_RESTART; + req->r_start = 0; + nfs_asyncio_resend(req); + lck_mtx_unlock(&req->r_mtx); + if (IS_VALID_CRED(cred)) + kauth_cred_unref(&cred); + /* Note: nfsreq reference taken will be dropped later when finished */ + return; + } + /* otherwise, just pause a couple seconds and retry */ + tsleep(&nmp->nm_state, (PZERO-1), "nfsgrace", 2*hz); + } + if (!(error = nfs_mount_state_wait_for_recovery(nmp))) { + rlen = 0; + goto writeagain; + } + } + } if (error) { SET(bp->nb_flags, NB_ERROR); bp->nb_error = error; @@ -2979,26 +2937,21 @@ finish: * (Don't bother if the buffer hit an error or stale wverf.) 
*/ if (((int)rlen < length) && !(bp->nb_flags & (NB_STALEWVERF|NB_ERROR))) { +writeagain: offset += rlen; length -= rlen; - uio.uio_iovs.iov32p = &io; - uio.uio_iovcnt = 1; - uio.uio_rw = UIO_WRITE; -#if 1 /* LP64todo - can't use new segment flags until the drivers are ready */ - uio.uio_segflg = UIO_SYSSPACE; -#else - uio.uio_segflg = UIO_SYSSPACE32; -#endif - io.iov_len = length; - uio_uio_resid_set(&uio, io.iov_len); - uio.uio_offset = NBOFF(bp) + offset; - io.iov_base = (uintptr_t) bp->nb_data + offset; + auio = uio_createwithbuffer(1, NBOFF(bp) + offset, UIO_SYSSPACE, + UIO_WRITE, &uio_buf, sizeof(uio_buf)); + uio_addiov(auio, CAST_USER_ADDR_T(bp->nb_data + offset), length); cb.rcb_args[0] = offset; cb.rcb_args[1] = length; + if (nmp->nm_vers >= NFS_VER4) + cb.rcb_args[2] = nmp->nm_stategenid; - error = nmp->nm_funcs->nf_write_rpc_async(np, &uio, length, thd, cred, + // XXX iomode should really match the original request + error = nmp->nm_funcs->nf_write_rpc_async(np, auio, length, thd, cred, NFS_WRITE_FILESYNC, &cb, &wreq); if (!error) { if (IS_VALID_CRED(cred)) @@ -3006,8 +2959,10 @@ finish: if (!cb.rcb_func) { /* if !async we'll need to wait for this RPC to finish */ req = wreq; + wreq = NULL; goto finish; } + nfs_request_rele(req); /* * We're done here. * Outstanding RPC count is unchanged. @@ -3020,8 +2975,10 @@ finish: } out: - if (cb.rcb_func) + if (cb.rcb_func) { nfs_async_write_done(nmp); + nfs_request_rele(req); + } /* * Decrement outstanding RPC count on buffer * and call nfs_buf_write_finish on last RPC. @@ -3059,10 +3016,11 @@ int nfs_flushcommits(nfsnode_t np, int nowait) { struct nfsmount *nmp; - struct nfsbuf *bp; + struct nfsbuf *bp, *prevlbp, *lbp; struct nfsbuflists blist, commitlist; int error = 0, retv, wcred_set, flags, dirty; u_quad_t off, endoff, toff; + uint64_t wverf; u_int32_t count; kauth_cred_t wcred = NULL; @@ -3075,11 +3033,11 @@ nfs_flushcommits(nfsnode_t np, int nowait) * and the commit rpc is done. */ if (!LIST_EMPTY(&np->n_dirtyblkhd)) { - error = nfs_lock(np, NFS_NODE_LOCK_EXCLUSIVE); + error = nfs_node_lock(np); if (error) goto done; np->n_flag |= NMODIFIED; - nfs_unlock(np); + nfs_node_unlock(np); } off = (u_quad_t)-1; @@ -3088,7 +3046,7 @@ nfs_flushcommits(nfsnode_t np, int nowait) LIST_INIT(&commitlist); nmp = NFSTONMP(np); - if (!nmp) { + if (nfs_mount_gone(nmp)) { error = ENXIO; goto done; } @@ -3101,6 +3059,7 @@ nfs_flushcommits(nfsnode_t np, int nowait) if (nowait) flags |= NBI_NOWAIT; lck_mtx_lock(nfs_buf_mutex); + wverf = nmp->nm_verf; if (!nfs_buf_iterprepare(np, &blist, flags)) { while ((bp = LIST_FIRST(&blist))) { LIST_REMOVE(bp, nb_vnbufs); @@ -3110,46 +3069,19 @@ nfs_flushcommits(nfsnode_t np, int nowait) continue; if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) nfs_buf_check_write_verifier(np, bp); - if (((bp->nb_flags & (NB_DELWRI | NB_NEEDCOMMIT)) - != (NB_DELWRI | NB_NEEDCOMMIT))) { + if (((bp->nb_flags & (NB_DELWRI | NB_NEEDCOMMIT)) != (NB_DELWRI | NB_NEEDCOMMIT)) || + (bp->nb_verf != wverf)) { nfs_buf_drop(bp); continue; } nfs_buf_remfree(bp); - lck_mtx_unlock(nfs_buf_mutex); - /* - * we need a upl to see if the page has been - * dirtied (think mmap) since the unstable write, and - * also to prevent vm from paging it during our commit rpc - */ - if (!ISSET(bp->nb_flags, NB_PAGELIST)) { - retv = nfs_buf_upl_setup(bp); - if (retv) { - /* unable to create upl */ - /* vm object must no longer exist */ - /* this could be fatal if we need */ - /* to write the data again, we'll see... 
*/ - printf("nfs_flushcommits: upl create failed %d\n", retv); - bp->nb_valid = bp->nb_dirty = 0; - } - } - nfs_buf_upl_check(bp); - lck_mtx_lock(nfs_buf_mutex); + + /* buffer UPLs will be grabbed *in order* below */ FSDBG(557, bp, bp->nb_flags, bp->nb_valid, bp->nb_dirty); FSDBG(557, bp->nb_validoff, bp->nb_validend, bp->nb_dirtyoff, bp->nb_dirtyend); - /* - * We used to check for dirty pages here; if there were any - * we'd abort the commit and force the entire buffer to be - * written again. - * - * Instead of doing that, we now go ahead and commit the dirty - * range, and then leave the buffer around with dirty pages - * that will be written out later. - */ - /* * Work out if all buffers are using the same cred * so we can deal with them all with one commit. @@ -3168,14 +3100,23 @@ nfs_flushcommits(nfsnode_t np, int nowait) SET(bp->nb_flags, NB_WRITEINPROG); /* - * A list of these buffers is kept so that the - * second loop knows which buffers have actually - * been committed. This is necessary, since there - * may be a race between the commit rpc and new - * uncommitted writes on the file. + * Add this buffer to the list of buffers we are committing. + * Buffers are inserted into the list in ascending order so that + * we can take the UPLs in order after the list is complete. */ + prevlbp = NULL; + LIST_FOREACH(lbp, &commitlist, nb_vnbufs) { + if (bp->nb_lblkno < lbp->nb_lblkno) + break; + prevlbp = lbp; + } LIST_REMOVE(bp, nb_vnbufs); - LIST_INSERT_HEAD(&commitlist, bp, nb_vnbufs); + if (prevlbp) + LIST_INSERT_AFTER(prevlbp, bp, nb_vnbufs); + else + LIST_INSERT_HEAD(&commitlist, bp, nb_vnbufs); + + /* update commit range start, end */ toff = NBOFF(bp) + bp->nb_dirtyoff; if (toff < off) off = toff; @@ -3192,6 +3133,28 @@ nfs_flushcommits(nfsnode_t np, int nowait) goto done; } + /* + * We need a UPL to prevent others from accessing the buffers during + * our commit RPC(s). + * + * We used to also check for dirty pages here; if there were any we'd + * abort the commit and force the entire buffer to be written again. + * Instead of doing that, we just go ahead and commit the dirty range, + * and then leave the buffer around with dirty pages that will be + * written out later. + */ + LIST_FOREACH(bp, &commitlist, nb_vnbufs) { + if (!ISSET(bp->nb_flags, NB_PAGELIST)) { + retv = nfs_buf_upl_setup(bp); + if (retv) { + /* Unable to create the UPL, the VM object probably no longer exists. */ + printf("nfs_flushcommits: upl create failed %d\n", retv); + bp->nb_valid = bp->nb_dirty = 0; + } + } + nfs_buf_upl_check(bp); + } + /* * Commit data on the server, as required. 
* If all bufs are using the same wcred, then use that with @@ -3207,13 +3170,13 @@ nfs_flushcommits(nfsnode_t np, int nowait) count = 0; else count = (endoff - off); - retv = nmp->nm_funcs->nf_commit_rpc(np, off, count, wcred); + retv = nmp->nm_funcs->nf_commit_rpc(np, off, count, wcred, wverf); } else { retv = 0; LIST_FOREACH(bp, &commitlist, nb_vnbufs) { toff = NBOFF(bp) + bp->nb_dirtyoff; count = bp->nb_dirtyend - bp->nb_dirtyoff; - retv = nmp->nm_funcs->nf_commit_rpc(np, toff, count, bp->nb_wcred); + retv = nmp->nm_funcs->nf_commit_rpc(np, toff, count, bp->nb_wcred, wverf); if (retv) break; } @@ -3227,11 +3190,11 @@ nfs_flushcommits(nfsnode_t np, int nowait) while ((bp = LIST_FIRST(&commitlist))) { LIST_REMOVE(bp, nb_vnbufs); FSDBG(557, bp, retv, bp->nb_flags, bp->nb_dirty); - nfs_lock(np, NFS_NODE_LOCK_FORCE); + nfs_node_lock_force(np); CLR(bp->nb_flags, (NB_NEEDCOMMIT | NB_WRITEINPROG)); np->n_needcommitcnt--; CHECK_NEEDCOMMITCNT(np); - nfs_unlock(np); + nfs_node_unlock(np); if (retv) { /* move back to dirty list */ @@ -3242,6 +3205,9 @@ nfs_flushcommits(nfsnode_t np, int nowait) continue; } + nfs_node_lock_force(np); + np->n_numoutput++; + nfs_node_unlock(np); vnode_startwrite(NFSTOV(np)); if (ISSET(bp->nb_flags, NB_DELWRI)) { lck_mtx_lock(nfs_buf_mutex); @@ -3294,25 +3260,26 @@ nfs_flush(nfsnode_t np, int waitfor, thread_t thd, int ignore_writeerr) FSDBG_TOP(517, np, waitfor, ignore_writeerr, 0); - if (!nmp) { + if (nfs_mount_gone(nmp)) { error = ENXIO; goto out; } nfsvers = nmp->nm_vers; - if (nmp->nm_flag & NFSMNT_INT) + if (NMFLAG(nmp, INTR)) slpflag = PCATCH; if (!LIST_EMPTY(&np->n_dirtyblkhd)) { - nfs_lock(np, NFS_NODE_LOCK_FORCE); + nfs_node_lock_force(np); np->n_flag |= NMODIFIED; - nfs_unlock(np); + nfs_node_unlock(np); } lck_mtx_lock(nfs_buf_mutex); while (np->n_bflag & NBFLUSHINPROG) { np->n_bflag |= NBFLUSHWANT; error = msleep(&np->n_bflag, nfs_buf_mutex, slpflag, "nfs_flush", NULL); - if (error) { + if ((error && (error != EWOULDBLOCK)) || + ((error = nfs_sigintr(NFSTONMP(np), NULL, thd, 0)))) { lck_mtx_unlock(nfs_buf_mutex); goto out; } @@ -3339,7 +3306,7 @@ again: while ((bp = LIST_FIRST(&blist))) { LIST_REMOVE(bp, nb_vnbufs); LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs); - flags = (passone || (waitfor != MNT_WAIT)) ? NBAC_NOWAIT : 0; + flags = (passone || !(waitfor == MNT_WAIT || waitfor == MNT_DWAIT)) ? NBAC_NOWAIT : 0; if (flags != NBAC_NOWAIT) nfs_buf_refget(bp); while ((error = nfs_buf_acquire(bp, flags, slpflag, slptimeo))) { @@ -3379,7 +3346,7 @@ again: continue; } FSDBG(525, bp, passone, bp->nb_lflags, bp->nb_flags); - if ((passone || (waitfor != MNT_WAIT)) && + if ((passone || !(waitfor == MNT_WAIT || waitfor == MNT_DWAIT)) && ISSET(bp->nb_flags, NB_NEEDCOMMIT)) { nfs_buf_drop(bp); continue; @@ -3387,10 +3354,10 @@ again: nfs_buf_remfree(bp); lck_mtx_unlock(nfs_buf_mutex); if (ISSET(bp->nb_flags, NB_ERROR)) { - nfs_lock(np, NFS_NODE_LOCK_FORCE); + nfs_node_lock_force(np); np->n_error = bp->nb_error ? 
bp->nb_error : EIO; np->n_flag |= NWRITEERR; - nfs_unlock(np); + nfs_node_unlock(np); nfs_buf_release(bp, 1); lck_mtx_lock(nfs_buf_mutex); continue; @@ -3407,7 +3374,7 @@ again: } lck_mtx_unlock(nfs_buf_mutex); - if (waitfor == MNT_WAIT) { + if (waitfor == MNT_WAIT || waitfor == MNT_DWAIT) { while ((error = vnode_waitforwrites(NFSTOV(np), 0, slpflag, slptimeo, "nfsflush"))) { error2 = nfs_sigintr(NFSTONMP(np), NULL, thd, 0); if (error2) { @@ -3432,30 +3399,45 @@ again: if (passone) { passone = 0; if (!LIST_EMPTY(&np->n_dirtyblkhd)) { - nfs_lock(np, NFS_NODE_LOCK_FORCE); + nfs_node_lock_force(np); np->n_flag |= NMODIFIED; - nfs_unlock(np); + nfs_node_unlock(np); } lck_mtx_lock(nfs_buf_mutex); goto again; } - if (waitfor == MNT_WAIT) { + if (waitfor == MNT_WAIT || waitfor == MNT_DWAIT) { if (!LIST_EMPTY(&np->n_dirtyblkhd)) { - nfs_lock(np, NFS_NODE_LOCK_FORCE); + nfs_node_lock_force(np); np->n_flag |= NMODIFIED; - nfs_unlock(np); + nfs_node_unlock(np); } lck_mtx_lock(nfs_buf_mutex); if (!LIST_EMPTY(&np->n_dirtyblkhd)) goto again; lck_mtx_unlock(nfs_buf_mutex); - nfs_lock(np, NFS_NODE_LOCK_FORCE); - /* if we have no dirty blocks, we can clear the modified flag */ - if (!np->n_wrbusy) + nfs_node_lock_force(np); + /* + * OK, it looks like there are no dirty blocks. If we have no + * writes in flight and no one in the write code, we can clear + * the modified flag. In order to make sure we see the latest + * attributes and size, we also invalidate the attributes and + * advance the attribute cache XID to guarantee that attributes + * newer than our clearing of NMODIFIED will get loaded next. + * (If we don't do this, it's possible for the flush's final + * write/commit (xid1) to be executed in parallel with a subsequent + * getattr request (xid2). The getattr could return attributes + * from *before* the write/commit completed but the stale attributes + * would be preferred because of the xid ordering.) + */ + if (!np->n_wrbusy && !np->n_numoutput) { np->n_flag &= ~NMODIFIED; + NATTRINVALIDATE(np); + nfs_get_xid(&np->n_xid); + } } else { - nfs_lock(np, NFS_NODE_LOCK_FORCE); + nfs_node_lock_force(np); } FSDBG(526, np->n_flag, np->n_error, 0, 0); @@ -3463,7 +3445,7 @@ again: error = np->n_error; np->n_flag &= ~NWRITEERR; } - nfs_unlock(np); + nfs_node_unlock(np); done: lck_mtx_lock(nfs_buf_mutex); flags = np->n_bflag; @@ -3480,7 +3462,7 @@ out: * Flush out and invalidate all buffers associated with a vnode. * Called with the underlying object locked. */ -static int +int nfs_vinvalbuf_internal( nfsnode_t np, int flags, @@ -3580,10 +3562,11 @@ nfs_vinvalbuf_internal( // Note: bp has been released if (error) { FSDBG(554, bp, 0xd00dee, 0xbad, error); - nfs_lock(np, NFS_NODE_LOCK_FORCE); - np->n_error = error; - np->n_flag |= NWRITEERR; - nfs_unlock(np); + nfs_node_lock_force(np); + if ((error != EINTR) && (error != ERESTART)) { + np->n_error = error; + np->n_flag |= NWRITEERR; + } /* * There was a write error and we need to * invalidate attrs to sync with server. @@ -3591,6 +3574,18 @@ nfs_vinvalbuf_internal( * we may no longer know the correct size) */ NATTRINVALIDATE(np); + nfs_node_unlock(np); + if ((error == EINTR) || (error == ERESTART)) { + /* + * Abort on EINTR. If we don't, we could + * be stuck in this loop forever because + * the buffer will continue to stay dirty. 
+ */ + lck_mtx_lock(nfs_buf_mutex); + nfs_buf_itercomplete(np, &blist, list); + lck_mtx_unlock(nfs_buf_mutex); + return (error); + } error = 0; } lck_mtx_lock(nfs_buf_mutex); @@ -3607,11 +3602,12 @@ nfs_vinvalbuf_internal( if (!LIST_EMPTY(&(np)->n_dirtyblkhd) || !LIST_EMPTY(&(np)->n_cleanblkhd)) panic("nfs_vinvalbuf: flush/inval failed"); lck_mtx_unlock(nfs_buf_mutex); - if (!(flags & V_SAVE)) { - nfs_lock(np, NFS_NODE_LOCK_FORCE); + nfs_node_lock_force(np); + if (!(flags & V_SAVE)) np->n_flag &= ~NMODIFIED; - nfs_unlock(np); - } + if (vnode_vtype(NFSTOV(np)) == VREG) + np->n_lastrahead = -1; + nfs_node_unlock(np); NFS_BUF_FREEUP(); return (0); } @@ -3632,12 +3628,23 @@ nfs_vinvalbuf2(vnode_t vp, int flags, thread_t thd, kauth_cred_t cred, int intrf { nfsnode_t np = VTONFS(vp); struct nfsmount *nmp = VTONMP(vp); - int error, rv, slpflag, slptimeo, nflags; + int error, slpflag, slptimeo, nflags, retry = 0; + int ubcflags = UBC_PUSHALL | UBC_SYNC | UBC_INVALIDATE; + struct timespec ts = { 2, 0 }; off_t size; FSDBG_TOP(554, np, flags, intrflg, 0); - if (nmp && !(nmp->nm_flag & NFSMNT_INT)) + /* + * If the mount is gone no sense to try and write anything. + * and hang trying to do IO. + */ + if (nfs_mount_gone(nmp)) { + flags &= ~V_SAVE; + ubcflags &= ~UBC_PUSHALL; + } + + if (nmp && !NMFLAG(nmp, INTR)) intrflg = 0; if (intrflg) { slpflag = PCATCH; @@ -3651,16 +3658,19 @@ nfs_vinvalbuf2(vnode_t vp, int flags, thread_t thd, kauth_cred_t cred, int intrf lck_mtx_lock(nfs_buf_mutex); while (np->n_bflag & NBINVALINPROG) { np->n_bflag |= NBINVALWANT; - error = msleep(&np->n_bflag, nfs_buf_mutex, slpflag, "nfs_vinvalbuf", NULL); - if (error) { + msleep(&np->n_bflag, nfs_buf_mutex, slpflag, "nfs_vinvalbuf", &ts); + if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0))) { lck_mtx_unlock(nfs_buf_mutex); return (error); } + if (np->n_bflag & NBINVALINPROG) + slpflag = 0; } np->n_bflag |= NBINVALINPROG; lck_mtx_unlock(nfs_buf_mutex); /* Now, flush as required. */ +again: error = nfs_vinvalbuf_internal(np, flags, thd, cred, slpflag, 0); while (error) { FSDBG(554, np, 0, 0, error); @@ -3671,8 +3681,17 @@ nfs_vinvalbuf2(vnode_t vp, int flags, thread_t thd, kauth_cred_t cred, int intrf /* get the pages out of vm also */ if (UBCINFOEXISTS(vp) && (size = ubc_getsize(vp))) - if (!(rv = ubc_sync_range(vp, 0, size, UBC_PUSHALL | UBC_SYNC | UBC_INVALIDATE))) - panic("nfs_vinvalbuf(): ubc_sync_range failed!"); + if ((error = ubc_msync(vp, 0, size, NULL, ubcflags))) { + if (error == EINVAL) + panic("nfs_vinvalbuf(): ubc_msync failed!, error %d", error); + if (retry++ < 10) { /* retry invalidating a few times */ + if (retry > 1 || error == ENXIO) + ubcflags &= ~UBC_PUSHALL; + goto again; + } + /* give up */ + printf("nfs_vinvalbuf(): ubc_msync failed!, error %d\n", error); + } done: lck_mtx_lock(nfs_buf_mutex); nflags = np->n_bflag; @@ -3685,6 +3704,57 @@ done: return (error); } +/* + * Wait for any busy buffers to complete. 
+ */ +void +nfs_wait_bufs(nfsnode_t np) +{ + struct nfsbuf *bp; + struct nfsbuflists blist; + int error = 0; + + lck_mtx_lock(nfs_buf_mutex); + if (!nfs_buf_iterprepare(np, &blist, NBI_CLEAN)) { + while ((bp = LIST_FIRST(&blist))) { + LIST_REMOVE(bp, nb_vnbufs); + LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs); + nfs_buf_refget(bp); + while ((error = nfs_buf_acquire(bp, 0, 0, 0))) { + if (error != EAGAIN) { + nfs_buf_refrele(bp); + nfs_buf_itercomplete(np, &blist, NBI_CLEAN); + lck_mtx_unlock(nfs_buf_mutex); + return; + } + } + nfs_buf_refrele(bp); + nfs_buf_drop(bp); + } + nfs_buf_itercomplete(np, &blist, NBI_CLEAN); + } + if (!nfs_buf_iterprepare(np, &blist, NBI_DIRTY)) { + while ((bp = LIST_FIRST(&blist))) { + LIST_REMOVE(bp, nb_vnbufs); + LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs); + nfs_buf_refget(bp); + while ((error = nfs_buf_acquire(bp, 0, 0, 0))) { + if (error != EAGAIN) { + nfs_buf_refrele(bp); + nfs_buf_itercomplete(np, &blist, NBI_DIRTY); + lck_mtx_unlock(nfs_buf_mutex); + return; + } + } + nfs_buf_refrele(bp); + nfs_buf_drop(bp); + } + nfs_buf_itercomplete(np, &blist, NBI_DIRTY); + } + lck_mtx_unlock(nfs_buf_mutex); +} + + /* * Add an async I/O request to the mount's async I/O queue and make * sure that an nfsiod will service it. @@ -3698,8 +3768,11 @@ nfs_asyncio_finish(struct nfsreq *req) FSDBG_TOP(552, nmp, 0, 0, 0); again: - if (((nmp = req->r_nmp)) == NULL) + nmp = req->r_nmp; + + if (nmp == NULL) return; + lck_mtx_lock(nfsiod_mutex); niod = nmp->nm_niod; @@ -3734,8 +3807,9 @@ again: lck_mtx_unlock(nfsiod_mutex); wakeup(niod); } else if (nfsiod_thread_count > 0) { - /* just queue it up on nfsiod mounts queue */ - TAILQ_INSERT_TAIL(&nfsiodmounts, nmp, nm_iodlink); + /* just queue it up on nfsiod mounts queue if needed */ + if (nmp->nm_iodlink.tqe_next == NFSNOLIST) + TAILQ_INSERT_TAIL(&nfsiodmounts, nmp, nm_iodlink); lck_mtx_unlock(nfsiod_mutex); } else { printf("nfs_asyncio(): no nfsiods? %d %d (%d)\n", nfsiod_thread_count, NFSIOD_MAX, started); @@ -3759,11 +3833,11 @@ nfs_asyncio_resend(struct nfsreq *req) { struct nfsmount *nmp = req->r_nmp; - if (!nmp) + if (nfs_mount_gone(nmp)) return; nfs_gss_clnt_rpcdone(req); lck_mtx_lock(&nmp->nm_lock); - if (req->r_rchain.tqe_next == NFSREQNOLIST) { + if (!(req->r_flags & R_RESENDQ)) { TAILQ_INSERT_TAIL(&nmp->nm_resendq, req, r_rchain); req->r_flags |= R_RESENDQ; } @@ -3772,64 +3846,29 @@ nfs_asyncio_resend(struct nfsreq *req) } /* - * Read an NFS buffer for a directory. + * Read directory data into a buffer. + * + * Buffer will be filled (unless EOF is hit). + * Buffers after this one may also be completely/partially filled. 
*/ int nfs_buf_readdir(struct nfsbuf *bp, vfs_context_t ctx) { - nfsnode_t np; - vnode_t vp; - struct nfsmount *nmp; - int error = 0, nfsvers; - struct uio uio; - struct iovec_32 io; + nfsnode_t np = bp->nb_np; + struct nfsmount *nmp = NFSTONMP(np); + int error = 0; - np = bp->nb_np; - vp = NFSTOV(np); - nmp = VTONMP(vp); - nfsvers = nmp->nm_vers; - uio.uio_iovs.iov32p = &io; - uio.uio_iovcnt = 1; -#if 1 /* LP64todo - can't use new segment flags until the drivers are ready */ - uio.uio_segflg = UIO_SYSSPACE; -#else - uio.uio_segflg = UIO_SYSSPACE32; -#endif - - /* sanity check */ - if (ISSET(bp->nb_flags, NB_DONE)) - CLR(bp->nb_flags, NB_DONE); + if (nfs_mount_gone(nmp)) + return (ENXIO); - uio.uio_rw = UIO_READ; - io.iov_len = bp->nb_bufsize; - uio_uio_resid_set(&uio, io.iov_len); - io.iov_base = (uintptr_t) bp->nb_data; - uio.uio_offset = NBOFF(bp); - - OSAddAtomic(1, (SInt32*)&nfsstats.readdir_bios); - if (nfsvers < NFS_VER4) { - if (nmp->nm_flag & NFSMNT_RDIRPLUS) { - error = nfs3_readdirplus_rpc(np, &uio, ctx); - if (error == NFSERR_NOTSUPP) { - lck_mtx_lock(&nmp->nm_lock); - nmp->nm_flag &= ~NFSMNT_RDIRPLUS; - lck_mtx_unlock(&nmp->nm_lock); - } - } - if (!(nmp->nm_flag & NFSMNT_RDIRPLUS)) - error = nfs3_readdir_rpc(np, &uio, ctx); - } else { - error = nfs4_readdir_rpc(np, &uio, ctx); - } - if (error) { + if (nmp->nm_vers < NFS_VER4) + error = nfs3_readdir_rpc(np, bp, ctx); + else + error = nfs4_readdir_rpc(np, bp, ctx); + + if (error && (error != NFSERR_DIRBUFDROPPED)) { SET(bp->nb_flags, NB_ERROR); bp->nb_error = error; - } else { - bp->nb_validoff = 0; - bp->nb_validend = uio.uio_offset - NBOFF(bp); - bp->nb_valid = (1 << (round_page_32(bp->nb_validend)/PAGE_SIZE)) - 1; } - - nfs_buf_iodone(bp); return (error); }
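
A note on the recurring uio change in the hunks above: every hand-rolled "struct uio" + "struct iovec_32" setup is replaced with the opaque uio_t API (uio_createwithbuffer / uio_addiov / uio_reset), matching the calls visible in nfs_buf_read_rpc_finish, nfs_buf_write_dirty_pages, and nfs_buf_write_rpc. A minimal sketch of that pattern, assuming a kernel compilation context where <sys/uio_internal.h> is available; the data buffer and offset below are hypothetical and not taken from this file:

	/* backing storage for a single-iovec uio, so no allocation is needed */
	char	uio_buf[UIO_SIZEOF(1)];
	uio_t	auio;
	char	data[4096];		/* hypothetical I/O buffer */
	off_t	offset = 0;		/* hypothetical starting file offset */

	/* create a kernel-space read uio on the stack, backed by uio_buf */
	auio = uio_createwithbuffer(1, offset, UIO_SYSSPACE, UIO_READ,
				    &uio_buf, sizeof(uio_buf));
	/* attach the data buffer as the single iovec */
	uio_addiov(auio, CAST_USER_ADDR_T(data), sizeof(data));
	/* ... hand auio to the read/write RPC; uio_resid(auio) and
	   uio_offset(auio) track the remaining count and current offset ... */

	/* to reuse the same uio for another transfer, reset it in place */
	uio_reset(auio, offset, UIO_SYSSPACE, UIO_WRITE);
	uio_addiov(auio, CAST_USER_ADDR_T(data), sizeof(data));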