X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/2d21ac55c334faf3a56e5634905ed6987fc787d4..2a1bd2d3eef5c7a7bb14f4bb9fdbca9a96ee4752:/bsd/nfs/nfs_bio.c diff --git a/bsd/nfs/nfs_bio.c b/bsd/nfs/nfs_bio.c index 8b2ab3e1f..b9c2b5ac1 100644 --- a/bsd/nfs/nfs_bio.c +++ b/bsd/nfs/nfs_bio.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,7 +22,7 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. - * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */ @@ -64,6 +64,10 @@ * @(#)nfs_bio.c 8.9 (Berkeley) 3/30/95 * FreeBSD-Id: nfs_bio.c,v 1.44 1997/09/10 19:52:25 phk Exp $ */ + +#include +#if CONFIG_NFS_CLIENT + #include #include #include @@ -77,6 +81,7 @@ #include #include #include +#include #include #include @@ -95,12 +100,15 @@ #include #include #include +#include -kern_return_t thread_terminate(thread_t); /* XXX */ +#define NFS_BIO_DBG(...) 
NFS_DBG(NFS_FAC_BIO, 7, ## __VA_ARGS__) -#define NFSBUFHASH(np, lbn) \ +kern_return_t thread_terminate(thread_t); /* XXX */ + +#define NFSBUFHASH(np, lbn) \ (&nfsbufhashtbl[((long)(np) / sizeof(*(np)) + (int)(lbn)) & nfsbufhash]) -LIST_HEAD(nfsbufhashhead, nfsbuf) *nfsbufhashtbl; +LIST_HEAD(nfsbufhashhead, nfsbuf) * nfsbufhashtbl; struct nfsbuffreehead nfsbuffree, nfsbuffreemeta, nfsbufdelwri; u_long nfsbufhash; int nfsbufcnt, nfsbufmin, nfsbufmax, nfsbufmetacnt, nfsbufmetamax; @@ -109,37 +117,98 @@ int nfs_nbdwrite; int nfs_buf_timer_on = 0; thread_t nfsbufdelwrithd = NULL; +ZONE_DECLARE(nfsbuf_zone, "NFS bio", sizeof(struct nfsbuf), ZC_NONE); + lck_grp_t *nfs_buf_lck_grp; lck_mtx_t *nfs_buf_mutex; -#define NFSBUF_FREE_PERIOD 30 /* seconds */ -#define NFSBUF_LRU_STALE 120 -#define NFSBUF_META_STALE 240 +#define NFSBUF_FREE_PERIOD 30 /* seconds */ +#define NFSBUF_LRU_STALE 120 +#define NFSBUF_META_STALE 240 /* number of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffree list */ -#define LRU_TO_FREEUP 6 +#define LRU_TO_FREEUP 6 /* number of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffreemeta list */ -#define META_TO_FREEUP 3 +#define META_TO_FREEUP 3 /* total number of nfsbufs nfs_buf_freeup() should attempt to free */ -#define TOTAL_TO_FREEUP (LRU_TO_FREEUP+META_TO_FREEUP) +#define TOTAL_TO_FREEUP (LRU_TO_FREEUP+META_TO_FREEUP) /* fraction of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffree list when called from timer */ -#define LRU_FREEUP_FRAC_ON_TIMER 8 +#define LRU_FREEUP_FRAC_ON_TIMER 8 /* fraction of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffreemeta list when called from timer */ -#define META_FREEUP_FRAC_ON_TIMER 16 +#define META_FREEUP_FRAC_ON_TIMER 16 /* fraction of total nfsbufs that nfsbuffreecnt should exceed before bothering to call nfs_buf_freeup() */ -#define LRU_FREEUP_MIN_FRAC 4 +#define LRU_FREEUP_MIN_FRAC 4 /* fraction of total nfsbufs that nfsbuffreemetacnt should exceed before bothering to call nfs_buf_freeup() */ -#define META_FREEUP_MIN_FRAC 2 +#define META_FREEUP_MIN_FRAC 2 + +#define NFS_ROUND_BLOCK(p, blksize) ((((uint64_t)(p) + blksize - 1) & ~((uint64_t)blksize - 1)) / blksize) #define NFS_BUF_FREEUP() \ do { \ - /* only call nfs_buf_freeup() if it has work to do: */ \ - if (((nfsbuffreecnt > nfsbufcnt/LRU_FREEUP_MIN_FRAC) || \ - (nfsbuffreemetacnt > nfsbufcnt/META_FREEUP_MIN_FRAC)) && \ - ((nfsbufcnt - TOTAL_TO_FREEUP) > nfsbufmin)) \ - nfs_buf_freeup(0); \ + /* only call nfs_buf_freeup() if it has work to do: */ \ + if (((nfsbuffreecnt > nfsbufcnt/LRU_FREEUP_MIN_FRAC) || \ + (nfsbuffreemetacnt > nfsbufcnt/META_FREEUP_MIN_FRAC)) && \ + ((nfsbufcnt - TOTAL_TO_FREEUP) > nfsbufmin)) \ + nfs_buf_freeup(0); \ } while (0) +void +nfs_buf_pgs_get_page_mask(nfsbufpgs *nfsbp, off_t page) +{ + off_t page_pos = page / NBPGS_ELEMENT_PAGES; + off_t max_page = NBPGS_STRUCT_SIZE * 8; + NBPGS_ERASE(nfsbp); + + if (page >= max_page) { + nfs_buf_pgs_bit_not(nfsbp); + return; + } + + NBPGS_SET(nfsbp, page); + nfsbp->pages[page_pos]--; + for (off_t i = page_pos - 1; i >= 0; i--) { + nfsbp->pages[i] = ~0; + } +} + +void +nfs_buf_pgs_bit_not(nfsbufpgs *nfsbp) +{ + for (uint32_t i = 0; i < NBPGS_ELEMENTS; i++) { + nfsbp->pages[i] = ~nfsbp->pages[i]; + } +} + +void +nfs_buf_pgs_bit_and(nfsbufpgs *nfsbp_src1, nfsbufpgs *nfsbp_src2, nfsbufpgs *nfsbp_dst) +{ + for (uint32_t i = 0; i < NBPGS_ELEMENTS; i++) { + nfsbp_dst->pages[i] = nfsbp_src1->pages[i] & nfsbp_src2->pages[i]; + } +} + +void 
+nfs_buf_pgs_set_pages_between(nfsbufpgs *nfsbp, off_t firstpg, off_t lastpg) +{ + nfsbufpgs pagemaskfirst, pagemasklast; + + nfs_buf_pgs_get_page_mask(&pagemasklast, lastpg); + nfs_buf_pgs_get_page_mask(&pagemaskfirst, firstpg); + nfs_buf_pgs_bit_not(&pagemaskfirst); + nfs_buf_pgs_bit_and(&pagemaskfirst, &pagemasklast, nfsbp); +} + +int +nfs_buf_pgs_is_set(nfsbufpgs *nfsbp) +{ + for (uint32_t i = 0; i < NBPGS_ELEMENTS; i++) { + if (nfsbp->pages[i] != 0) { + return 1; + } + } + return 0; +} + /* * Initialize nfsbuf lists */ @@ -150,19 +219,18 @@ nfs_nbinit(void) nfs_buf_mutex = lck_mtx_alloc_init(nfs_buf_lck_grp, LCK_ATTR_NULL); nfsbufcnt = nfsbufmetacnt = - nfsbuffreecnt = nfsbuffreemetacnt = nfsbufdelwricnt = 0; + nfsbuffreecnt = nfsbuffreemetacnt = nfsbufdelwricnt = 0; nfsbufmin = 128; /* size nfsbufmax to cover at most half sane_size (w/default buf size) */ - nfsbufmax = (sane_size >> PAGE_SHIFT) / (2 * (NFS_RWSIZE >> PAGE_SHIFT)); + nfsbufmax = (int)(sane_size >> PAGE_SHIFT) / (2 * (NFS_RWSIZE >> PAGE_SHIFT)); nfsbufmetamax = nfsbufmax / 4; nfsneedbuffer = 0; nfs_nbdwrite = 0; - nfsbufhashtbl = hashinit(nfsbufmax/4, M_TEMP, &nfsbufhash); + nfsbufhashtbl = hashinit(nfsbufmax / 4, M_NFSBIO, &nfsbufhash); TAILQ_INIT(&nfsbuffree); TAILQ_INIT(&nfsbuffreemeta); TAILQ_INIT(&nfsbufdelwri); - } /* @@ -182,7 +250,7 @@ nfs_buf_timer(__unused void *param0, __unused void *param1) lck_mtx_unlock(nfs_buf_mutex); nfs_interval_timer_start(nfs_buf_timer_call, - NFSBUF_FREE_PERIOD * 1000); + NFSBUF_FREE_PERIOD * 1000); } /* @@ -204,16 +272,19 @@ nfs_buf_freeup(int timer) FSDBG(320, nfsbufcnt, nfsbuffreecnt, nfsbuffreemetacnt, 0); - count = timer ? nfsbuffreecnt/LRU_FREEUP_FRAC_ON_TIMER : LRU_TO_FREEUP; + count = timer ? nfsbuffreecnt / LRU_FREEUP_FRAC_ON_TIMER : LRU_TO_FREEUP; while ((nfsbufcnt > nfsbufmin) && (count-- > 0)) { fbp = TAILQ_FIRST(&nfsbuffree); - if (!fbp) + if (!fbp) { break; - if (fbp->nb_refs) + } + if (os_ref_get_count(&fbp->nb_refs) > 1) { break; + } if (NBUFSTAMPVALID(fbp) && - (fbp->nb_timestamp + (2*NFSBUF_LRU_STALE)) > now.tv_sec) + (fbp->nb_timestamp + (2 * NFSBUF_LRU_STALE)) > now.tv_sec) { break; + } nfs_buf_remfree(fbp); /* disassociate buffer from any nfsnode */ if (fbp->nb_np) { @@ -228,16 +299,19 @@ nfs_buf_freeup(int timer) nfsbufcnt--; } - count = timer ? nfsbuffreemetacnt/META_FREEUP_FRAC_ON_TIMER : META_TO_FREEUP; + count = timer ? 
nfsbuffreemetacnt / META_FREEUP_FRAC_ON_TIMER : META_TO_FREEUP; while ((nfsbufcnt > nfsbufmin) && (count-- > 0)) { fbp = TAILQ_FIRST(&nfsbuffreemeta); - if (!fbp) + if (!fbp) { break; - if (fbp->nb_refs) + } + if (os_ref_get_count(&fbp->nb_refs) > 1) { break; + } if (NBUFSTAMPVALID(fbp) && - (fbp->nb_timestamp + (2*NFSBUF_META_STALE)) > now.tv_sec) + (fbp->nb_timestamp + (2 * NFSBUF_META_STALE)) > now.tv_sec) { break; + } nfs_buf_remfree(fbp); /* disassociate buffer from any nfsnode */ if (fbp->nb_np) { @@ -261,16 +335,18 @@ nfs_buf_freeup(int timer) while ((fbp = TAILQ_FIRST(&nfsbuffreeup))) { TAILQ_REMOVE(&nfsbuffreeup, fbp, nb_free); /* nuke any creds */ - if (IS_VALID_CRED(fbp->nb_rcred)) + if (IS_VALID_CRED(fbp->nb_rcred)) { kauth_cred_unref(&fbp->nb_rcred); - if (IS_VALID_CRED(fbp->nb_wcred)) + } + if (IS_VALID_CRED(fbp->nb_wcred)) { kauth_cred_unref(&fbp->nb_wcred); + } /* if buf was NB_META, dump buffer */ - if (ISSET(fbp->nb_flags, NB_META) && fbp->nb_data) - kfree(fbp->nb_data, fbp->nb_bufsize); - FREE(fbp, M_TEMP); + if (ISSET(fbp->nb_flags, NB_META) && fbp->nb_data) { + kheap_free(KHEAP_DATA_BUFFERS, fbp->nb_data, fbp->nb_bufsize); + } + NFS_ZFREE(nfsbuf_zone, fbp); } - } /* @@ -280,8 +356,9 @@ nfs_buf_freeup(int timer) void nfs_buf_remfree(struct nfsbuf *bp) { - if (bp->nb_free.tqe_next == NFSNOLIST) + if (bp->nb_free.tqe_next == NFSNOLIST) { panic("nfsbuf not on free list"); + } if (ISSET(bp->nb_flags, NB_DELWRI)) { nfsbufdelwricnt--; TAILQ_REMOVE(&nfsbufdelwri, bp, nb_free); @@ -304,12 +381,13 @@ nfs_buf_is_incore(nfsnode_t np, daddr64_t blkno) { boolean_t rv; lck_mtx_lock(nfs_buf_mutex); - if (nfs_buf_incore(np, blkno)) + if (nfs_buf_incore(np, blkno)) { rv = TRUE; - else + } else { rv = FALSE; + } lck_mtx_unlock(nfs_buf_mutex); - return (rv); + return rv; } /* @@ -320,14 +398,15 @@ nfs_buf_incore(nfsnode_t np, daddr64_t blkno) { /* Search hash chain */ struct nfsbuf * bp = NFSBUFHASH(np, blkno)->lh_first; - for (; bp != NULL; bp = bp->nb_hash.le_next) + for (; bp != NULL; bp = bp->nb_hash.le_next) { if ((bp->nb_lblkno == blkno) && (bp->nb_np == np)) { if (!ISSET(bp->nb_flags, NB_INVAL)) { FSDBG(547, bp, blkno, bp->nb_flags, bp->nb_np); - return (bp); + return bp; } } - return (NULL); + } + return NULL; } /* @@ -345,13 +424,15 @@ nfs_buf_page_inval(vnode_t vp, off_t offset) struct nfsbuf *bp; int error = 0; - if (!nmp) - return (ENXIO); + if (nfs_mount_gone(nmp)) { + return ENXIO; + } lck_mtx_lock(nfs_buf_mutex); bp = nfs_buf_incore(VTONFS(vp), (daddr64_t)(offset / nmp->nm_biosize)); - if (!bp) + if (!bp) { goto out; + } FSDBG(325, bp, bp->nb_flags, bp->nb_dirtyoff, bp->nb_dirtyend); if (ISSET(bp->nb_lflags, NBL_BUSY)) { error = EBUSY; @@ -363,16 +444,25 @@ nfs_buf_page_inval(vnode_t vp, off_t offset) * If it does, we can't let the pager drop the page. */ if (bp->nb_dirtyend > 0) { - int start = offset - NBOFF(bp); - if (bp->nb_dirtyend <= start || - bp->nb_dirtyoff >= (start + PAGE_SIZE)) - error = 0; - else + off_t start = offset - NBOFF(bp); + if ((bp->nb_dirtyend > start) && + (bp->nb_dirtyoff < (start + PAGE_SIZE))) { + /* + * Before returning the bad news, move the + * buffer to the start of the delwri list and + * give the list a push to try to flush the + * buffer out. 
+ */ error = EBUSY; + nfs_buf_remfree(bp); + TAILQ_INSERT_HEAD(&nfsbufdelwri, bp, nb_free); + nfsbufdelwricnt++; + nfs_buf_delwri_push(1); + } } out: lck_mtx_unlock(nfs_buf_mutex); - return (error); + return error; } /* @@ -386,8 +476,9 @@ nfs_buf_upl_setup(struct nfsbuf *bp) upl_t upl; int upl_flags; - if (ISSET(bp->nb_flags, NB_PAGELIST)) - return (0); + if (ISSET(bp->nb_flags, NB_PAGELIST)) { + return 0; + } upl_flags = UPL_PRECIOUS; if (!ISSET(bp->nb_flags, NB_READ)) { @@ -397,24 +488,24 @@ nfs_buf_upl_setup(struct nfsbuf *bp) */ upl_flags |= UPL_WILL_MODIFY; } - kret = ubc_create_upl(NFSTOV(bp->nb_np), NBOFF(bp), bp->nb_bufsize, - &upl, NULL, upl_flags); + kret = ubc_create_upl_kernel(NFSTOV(bp->nb_np), NBOFF(bp), bp->nb_bufsize, + &upl, NULL, upl_flags, VM_KERN_MEMORY_FILE); if (kret == KERN_INVALID_ARGUMENT) { /* vm object probably doesn't exist any more */ bp->nb_pagelist = NULL; - return (EINVAL); + return EINVAL; } if (kret != KERN_SUCCESS) { printf("nfs_buf_upl_setup(): failed to get pagelist %d\n", kret); bp->nb_pagelist = NULL; - return (EIO); + return EIO; } FSDBG(538, bp, NBOFF(bp), bp->nb_bufsize, bp->nb_np); bp->nb_pagelist = upl; SET(bp->nb_flags, NB_PAGELIST); - return (0); + return 0; } /* @@ -428,38 +519,44 @@ nfs_buf_upl_check(struct nfsbuf *bp) off_t filesize, fileoffset; int i, npages; - if (!ISSET(bp->nb_flags, NB_PAGELIST)) + if (!ISSET(bp->nb_flags, NB_PAGELIST)) { return; + } npages = round_page_32(bp->nb_bufsize) / PAGE_SIZE; filesize = ubc_getsize(NFSTOV(bp->nb_np)); fileoffset = NBOFF(bp); - if (fileoffset < filesize) + if (fileoffset < filesize) { SET(bp->nb_flags, NB_CACHE); - else + } else { CLR(bp->nb_flags, NB_CACHE); + } pl = ubc_upl_pageinfo(bp->nb_pagelist); - bp->nb_valid = bp->nb_dirty = 0; + NBPGS_ERASE(&bp->nb_valid); + NBPGS_ERASE(&bp->nb_dirty); - for (i=0; i < npages; i++, fileoffset += PAGE_SIZE_64) { + for (i = 0; i < npages; i++, fileoffset += PAGE_SIZE_64) { /* anything beyond the end of the file is not valid or dirty */ - if (fileoffset >= filesize) + if (fileoffset >= filesize) { break; + } if (!upl_valid_page(pl, i)) { CLR(bp->nb_flags, NB_CACHE); continue; } - NBPGVALID_SET(bp,i); - if (upl_dirty_page(pl, i)) + NBPGVALID_SET(bp, i); + if (upl_dirty_page(pl, i)) { NBPGDIRTY_SET(bp, i); + } } fileoffset = NBOFF(bp); if (ISSET(bp->nb_flags, NB_CACHE)) { bp->nb_validoff = 0; bp->nb_validend = bp->nb_bufsize; - if (fileoffset + bp->nb_validend > filesize) + if (fileoffset + bp->nb_validend > filesize) { bp->nb_validend = filesize - fileoffset; + } } else { bp->nb_validoff = bp->nb_validend = -1; } @@ -476,18 +573,22 @@ nfs_buf_map(struct nfsbuf *bp) { kern_return_t kret; - if (bp->nb_data) - return (0); - if (!ISSET(bp->nb_flags, NB_PAGELIST)) - return (EINVAL); + if (bp->nb_data) { + return 0; + } + if (!ISSET(bp->nb_flags, NB_PAGELIST)) { + return EINVAL; + } - kret = ubc_upl_map(bp->nb_pagelist, (vm_address_t *)&(bp->nb_data)); - if (kret != KERN_SUCCESS) + kret = ubc_upl_map(bp->nb_pagelist, (vm_offset_t *)&(bp->nb_data)); + if (kret != KERN_SUCCESS) { panic("nfs_buf_map: ubc_upl_map() failed with (%d)", kret); - if (bp->nb_data == 0) + } + if (bp->nb_data == 0) { panic("ubc_upl_map mapped 0"); + } FSDBG(540, bp, bp->nb_flags, NBOFF(bp), bp->nb_data); - return (0); + return 0; } /* @@ -502,28 +603,31 @@ nfs_buf_map(struct nfsbuf *bp) void nfs_buf_normalize_valid_range(nfsnode_t np, struct nfsbuf *bp) { - int pg, npg; + off_t pg, npg; /* pull validoff back to start of contiguous valid page range */ - pg = bp->nb_validoff/PAGE_SIZE; - 
while (pg >= 0 && NBPGVALID(bp,pg)) + pg = bp->nb_validoff / PAGE_SIZE; + while (pg >= 0 && NBPGVALID(bp, pg)) { pg--; - bp->nb_validoff = (pg+1) * PAGE_SIZE; + } + bp->nb_validoff = (pg + 1) * PAGE_SIZE; /* push validend forward to end of contiguous valid page range */ - npg = bp->nb_bufsize/PAGE_SIZE; - pg = bp->nb_validend/PAGE_SIZE; - while (pg < npg && NBPGVALID(bp,pg)) + npg = bp->nb_bufsize / PAGE_SIZE; + pg = bp->nb_validend / PAGE_SIZE; + while (pg < npg && NBPGVALID(bp, pg)) { pg++; + } bp->nb_validend = pg * PAGE_SIZE; /* clip to EOF */ - if (NBOFF(bp) + bp->nb_validend > (off_t)np->n_size) + if (NBOFF(bp) + bp->nb_validend > (off_t)np->n_size) { bp->nb_validend = np->n_size % bp->nb_bufsize; + } } /* * process some entries on the delayed write queue * (must be called with nfs_buf_mutex held) */ -static void +void nfs_buf_delwri_service(void) { struct nfsbuf *bp; @@ -534,17 +638,21 @@ nfs_buf_delwri_service(void) np = bp->nb_np; nfs_buf_remfree(bp); nfs_buf_refget(bp); - while ((error = nfs_buf_acquire(bp, 0, 0, 0)) == EAGAIN); + while ((error = nfs_buf_acquire(bp, 0, 0, 0)) == EAGAIN) { + ; + } nfs_buf_refrele(bp); - if (error) + if (error) { break; + } if (!bp->nb_np) { /* buffer is no longer valid */ nfs_buf_drop(bp); continue; } - if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) + if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) { nfs_buf_check_write_verifier(np, bp); + } if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) { /* put buffer at end of delwri list */ TAILQ_INSERT_TAIL(&nfsbufdelwri, bp, nb_free); @@ -565,10 +673,10 @@ nfs_buf_delwri_service(void) /* * thread to service the delayed write queue when asked */ -static void +void nfs_buf_delwri_thread(__unused void *arg, __unused wait_result_t wr) { - struct timespec ts = { 30, 0 }; + struct timespec ts = { .tv_sec = 30, .tv_nsec = 0 }; int error = 0; lck_mtx_lock(nfs_buf_mutex); @@ -585,23 +693,28 @@ nfs_buf_delwri_thread(__unused void *arg, __unused wait_result_t wr) * try to push out some delayed/uncommitted writes * ("locked" indicates whether nfs_buf_mutex is already held) */ -static void +void nfs_buf_delwri_push(int locked) { - if (TAILQ_EMPTY(&nfsbufdelwri)) + if (TAILQ_EMPTY(&nfsbufdelwri)) { return; - if (!locked) + } + if (!locked) { lck_mtx_lock(nfs_buf_mutex); + } /* wake up the delayed write service thread */ - if (nfsbufdelwrithd) + if (nfsbufdelwrithd) { wakeup(&nfsbufdelwrithd); - else if (kernel_thread_start(nfs_buf_delwri_thread, NULL, &nfsbufdelwrithd) == KERN_SUCCESS) + } else if (kernel_thread_start(nfs_buf_delwri_thread, NULL, &nfsbufdelwrithd) == KERN_SUCCESS) { thread_deallocate(nfsbufdelwrithd); + } /* otherwise, try to do some of the work ourselves */ - if (!nfsbufdelwrithd) + if (!nfsbufdelwrithd) { nfs_buf_delwri_service(); - if (!locked) + } + if (!locked) { lck_mtx_unlock(nfs_buf_mutex); + } } /* @@ -627,7 +740,7 @@ int nfs_buf_get( nfsnode_t np, daddr64_t blkno, - int size, + uint32_t size, thread_t thd, int flags, struct nfsbuf **bpp) @@ -635,7 +748,7 @@ nfs_buf_get( vnode_t vp = NFSTOV(np); struct nfsmount *nmp = VTONMP(vp); struct nfsbuf *bp; - int bufsize; + uint32_t bufsize; int slpflag = PCATCH; int operation = (flags & NBLK_OPMASK); int error = 0; @@ -645,17 +758,18 @@ nfs_buf_get( *bpp = NULL; bufsize = size; - if (bufsize > NFS_MAXBSIZE) + if (bufsize > NFS_MAXBSIZE) { panic("nfs_buf_get: buffer larger than NFS_MAXBSIZE requested"); + } - if (!nmp) { + if (nfs_mount_gone(nmp)) { FSDBG_BOT(541, np, blkno, 0, ENXIO); - return (ENXIO); + return ENXIO; } if (!UBCINFOEXISTS(vp)) { operation = NBLK_META; - } 
else if (bufsize < nmp->nm_biosize) { + } else if (bufsize < (uint32_t)nmp->nm_biosize) { /* reg files should always have biosize blocks */ bufsize = nmp->nm_biosize; } @@ -675,6 +789,22 @@ nfs_buf_get( loop: lck_mtx_lock(nfs_buf_mutex); + /* wait for any buffer invalidation/flushing to complete */ + while (np->n_bflag & NBINVALINPROG) { + np->n_bflag |= NBINVALWANT; + ts.tv_sec = 2; + ts.tv_nsec = 0; + msleep(&np->n_bflag, nfs_buf_mutex, slpflag, "nfs_buf_get_invalwait", &ts); + if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0))) { + lck_mtx_unlock(nfs_buf_mutex); + FSDBG_BOT(541, np, blkno, 0, error); + return error; + } + if (np->n_bflag & NBINVALINPROG) { + slpflag = 0; + } + } + /* check for existence of nfsbuf in cache */ if ((bp = nfs_buf_incore(np, blkno))) { /* if busy, set wanted and wait */ @@ -682,40 +812,40 @@ loop: if (flags & NBLK_NOWAIT) { lck_mtx_unlock(nfs_buf_mutex); FSDBG_BOT(541, np, blkno, bp, 0xbcbcbcbc); - return (0); + return 0; } FSDBG_TOP(543, np, blkno, bp, bp->nb_flags); SET(bp->nb_lflags, NBL_WANTED); ts.tv_sec = 2; ts.tv_nsec = 0; - error = msleep(bp, nfs_buf_mutex, slpflag|(PRIBIO+1)|PDROP, - "nfsbufget", (slpflag == PCATCH) ? NULL : &ts); - if (error == EWOULDBLOCK) - error = 0; + msleep(bp, nfs_buf_mutex, slpflag | (PRIBIO + 1) | PDROP, + "nfsbufget", (slpflag == PCATCH) ? NULL : &ts); slpflag = 0; FSDBG_BOT(543, np, blkno, bp, bp->nb_flags); - if (error || ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0)))) { + if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0))) { FSDBG_BOT(541, np, blkno, 0, error); - return (error); + return error; } goto loop; } - if (bp->nb_bufsize != bufsize) + if (bp->nb_bufsize != bufsize) { panic("nfsbuf size mismatch"); + } SET(bp->nb_lflags, NBL_BUSY); SET(bp->nb_flags, NB_CACHE); nfs_buf_remfree(bp); /* additional paranoia: */ - if (ISSET(bp->nb_flags, NB_PAGELIST)) + if (ISSET(bp->nb_flags, NB_PAGELIST)) { panic("pagelist buffer was not busy"); + } goto buffer_setup; } if (flags & NBLK_ONLYVALID) { lck_mtx_unlock(nfs_buf_mutex); FSDBG_BOT(541, np, blkno, 0, 0x0000cace); - return (0); + return 0; } /* @@ -743,28 +873,31 @@ loop: /* if the next LRU or META buffer is invalid or stale, use it */ lrubp = TAILQ_FIRST(&nfsbuffree); if (lrubp && (!NBUFSTAMPVALID(lrubp) || - ((lrubp->nb_timestamp + NFSBUF_LRU_STALE) < now.tv_sec))) + ((lrubp->nb_timestamp + NFSBUF_LRU_STALE) < now.tv_sec))) { bp = lrubp; + } metabp = TAILQ_FIRST(&nfsbuffreemeta); if (!bp && metabp && (!NBUFSTAMPVALID(metabp) || - ((metabp->nb_timestamp + NFSBUF_META_STALE) < now.tv_sec))) + ((metabp->nb_timestamp + NFSBUF_META_STALE) < now.tv_sec))) { bp = metabp; + } if (!bp && (nfsbufcnt >= nfsbufmax)) { /* we've already allocated all bufs, so */ /* choose the buffer that'll go stale first */ - if (!metabp) + if (!metabp) { bp = lrubp; - else if (!lrubp) + } else if (!lrubp) { bp = metabp; - else { - int32_t lru_stale_time, meta_stale_time; + } else { + time_t lru_stale_time, meta_stale_time; lru_stale_time = lrubp->nb_timestamp + NFSBUF_LRU_STALE; meta_stale_time = metabp->nb_timestamp + NFSBUF_META_STALE; - if (lru_stale_time <= meta_stale_time) + if (lru_stale_time <= meta_stale_time) { bp = lrubp; - else + } else { bp = metabp; + } } } } @@ -773,8 +906,9 @@ loop: /* we have a buffer to reuse */ FSDBG(544, np, blkno, bp, bp->nb_flags); nfs_buf_remfree(bp); - if (ISSET(bp->nb_flags, NB_DELWRI)) + if (ISSET(bp->nb_flags, NB_DELWRI)) { panic("nfs_buf_get: delwri"); + } SET(bp->nb_lflags, NBL_BUSY); /* disassociate buffer from previous nfsnode */ if (bp->nb_np) { @@ 
-786,17 +920,20 @@ loop: } LIST_REMOVE(bp, nb_hash); /* nuke any creds we're holding */ - if (IS_VALID_CRED(bp->nb_rcred)) + if (IS_VALID_CRED(bp->nb_rcred)) { kauth_cred_unref(&bp->nb_rcred); - if (IS_VALID_CRED(bp->nb_wcred)) + } + if (IS_VALID_CRED(bp->nb_wcred)) { kauth_cred_unref(&bp->nb_wcred); + } /* if buf will no longer be NB_META, dump old buffer */ if (operation == NBLK_META) { - if (!ISSET(bp->nb_flags, NB_META)) + if (!ISSET(bp->nb_flags, NB_META)) { nfsbufmetacnt++; + } } else if (ISSET(bp->nb_flags, NB_META)) { if (bp->nb_data) { - kfree(bp->nb_data, bp->nb_bufsize); + kheap_free(KHEAP_DATA_BUFFERS, bp->nb_data, bp->nb_bufsize); bp->nb_data = NULL; } nfsbufmetacnt--; @@ -805,20 +942,15 @@ loop: bp->nb_error = 0; bp->nb_validoff = bp->nb_validend = -1; bp->nb_dirtyoff = bp->nb_dirtyend = 0; - bp->nb_valid = 0; - bp->nb_dirty = 0; + NBPGS_ERASE(&bp->nb_valid); + NBPGS_ERASE(&bp->nb_dirty); bp->nb_verf = 0; } else { /* no buffer to reuse */ if ((nfsbufcnt < nfsbufmax) && ((operation != NBLK_META) || (nfsbufmetacnt < nfsbufmetamax))) { /* just alloc a new one */ - MALLOC(bp, struct nfsbuf *, sizeof(struct nfsbuf), M_TEMP, M_WAITOK); - if (!bp) { - lck_mtx_unlock(nfs_buf_mutex); - FSDBG_BOT(541, np, blkno, 0, error); - return (ENOMEM); - } + bp = zalloc(nfsbuf_zone); nfsbufcnt++; /* @@ -828,14 +960,17 @@ loop: if (nfsbufcnt > nfsbufmin && !nfs_buf_timer_on) { nfs_buf_timer_on = 1; nfs_interval_timer_start(nfs_buf_timer_call, - NFSBUF_FREE_PERIOD * 1000); + NFSBUF_FREE_PERIOD * 1000); } - if (operation == NBLK_META) + if (operation == NBLK_META) { nfsbufmetacnt++; + } NFSBUFCNTCHK(); /* init nfsbuf */ bzero(bp, sizeof(*bp)); + os_ref_init(&bp->nb_refs, NULL); + bp->nb_free.tqe_next = NFSNOLIST; bp->nb_validoff = bp->nb_validend = -1; FSDBG(545, np, blkno, bp, 0); @@ -847,18 +982,18 @@ loop: nfs_buf_delwri_push(1); nfsneedbuffer = 1; - error = msleep(&nfsneedbuffer, nfs_buf_mutex, PCATCH|PDROP, "nfsbufget", NULL); + msleep(&nfsneedbuffer, nfs_buf_mutex, PCATCH | PDROP, "nfsbufget", NULL); FSDBG_BOT(546, np, blkno, nfsbufcnt, nfsbufmax); - if (error || ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0)))) { + if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0))) { FSDBG_BOT(541, np, blkno, 0, error); - return (error); + return error; } goto loop; } } - /* setup nfsbuf */ - bp->nb_lflags = NBL_BUSY; + /* set up nfsbuf */ + SET(bp->nb_lflags, NBL_BUSY); bp->nb_flags = 0; bp->nb_lblkno = blkno; /* insert buf in hash */ @@ -876,16 +1011,18 @@ buffer_setup: case NBLK_META: SET(bp->nb_flags, NB_META); if ((bp->nb_bufsize != bufsize) && bp->nb_data) { - kfree(bp->nb_data, bp->nb_bufsize); + kheap_free(KHEAP_DATA_BUFFERS, bp->nb_data, bp->nb_bufsize); bp->nb_data = NULL; bp->nb_validoff = bp->nb_validend = -1; bp->nb_dirtyoff = bp->nb_dirtyend = 0; - bp->nb_valid = 0; - bp->nb_dirty = 0; + NBPGS_ERASE(&bp->nb_valid); + NBPGS_ERASE(&bp->nb_dirty); CLR(bp->nb_flags, NB_CACHE); } - if (!bp->nb_data) - bp->nb_data = kalloc(bufsize); + if (!bp->nb_data) { + bp->nb_data = kheap_alloc(KHEAP_DATA_BUFFERS, + bufsize, Z_WAITOK); + } if (!bp->nb_data) { /* Ack! couldn't allocate the data buffer! 
*/ /* clean up buffer and return error */ @@ -895,13 +1032,14 @@ buffer_setup: bp->nb_np = NULL; /* invalidate usage timestamp to allow immediate freeing */ NBUFSTAMPINVALIDATE(bp); - if (bp->nb_free.tqe_next != NFSNOLIST) + if (bp->nb_free.tqe_next != NFSNOLIST) { panic("nfsbuf on freelist"); + } TAILQ_INSERT_HEAD(&nfsbuffree, bp, nb_free); nfsbuffreecnt++; lck_mtx_unlock(nfs_buf_mutex); FSDBG_BOT(541, np, blkno, 0xb00, ENOMEM); - return (ENOMEM); + return ENOMEM; } bp->nb_bufsize = bufsize; break; @@ -917,8 +1055,9 @@ buffer_setup: } else { CLR(bp->nb_flags, NB_READ); } - if (bufsize < PAGE_SIZE) + if (bufsize < PAGE_SIZE) { bufsize = PAGE_SIZE; + } bp->nb_bufsize = bufsize; bp->nb_validoff = bp->nb_validend = -1; @@ -934,13 +1073,14 @@ buffer_setup: bp->nb_np = NULL; /* invalidate usage timestamp to allow immediate freeing */ NBUFSTAMPINVALIDATE(bp); - if (bp->nb_free.tqe_next != NFSNOLIST) + if (bp->nb_free.tqe_next != NFSNOLIST) { panic("nfsbuf on freelist"); + } TAILQ_INSERT_HEAD(&nfsbuffree, bp, nb_free); nfsbuffreecnt++; lck_mtx_unlock(nfs_buf_mutex); FSDBG_BOT(541, np, blkno, 0x2bc, EIO); - return (EIO); + return EIO; } nfs_buf_upl_check(bp); } @@ -954,7 +1094,7 @@ buffer_setup: FSDBG_BOT(541, np, blkno, bp, bp->nb_flags); - return (0); + return 0; } void @@ -971,66 +1111,79 @@ nfs_buf_release(struct nfsbuf *bp, int freeup) vp = np ? NFSTOV(np) : NULL; if (vp && UBCINFOEXISTS(vp) && bp->nb_bufsize) { - int upl_flags; + int upl_flags, rv; upl_t upl; - int i, rv; + uint32_t i; if (!ISSET(bp->nb_flags, NB_PAGELIST) && !ISSET(bp->nb_flags, NB_INVAL)) { rv = nfs_buf_upl_setup(bp); - if (rv) + if (rv) { printf("nfs_buf_release: upl create failed %d\n", rv); - else + } else { nfs_buf_upl_check(bp); + } } upl = bp->nb_pagelist; - if (!upl) + if (!upl) { goto pagelist_cleanup_done; + } if (bp->nb_data) { - if (ubc_upl_unmap(upl) != KERN_SUCCESS) + if (ubc_upl_unmap(upl) != KERN_SUCCESS) { panic("ubc_upl_unmap failed"); + } bp->nb_data = NULL; } /* * Abort the pages on error or: if this is an invalid or * non-needcommit nocache buffer AND no pages are dirty. 
*/ - if (ISSET(bp->nb_flags, NB_ERROR) || (!bp->nb_dirty && (ISSET(bp->nb_flags, NB_INVAL) || + if (ISSET(bp->nb_flags, NB_ERROR) || (!nfs_buf_pgs_is_set(&bp->nb_dirty) && (ISSET(bp->nb_flags, NB_INVAL) || (ISSET(bp->nb_flags, NB_NOCACHE) && !ISSET(bp->nb_flags, (NB_NEEDCOMMIT | NB_DELWRI)))))) { - if (ISSET(bp->nb_flags, (NB_READ | NB_INVAL | NB_NOCACHE))) + if (ISSET(bp->nb_flags, (NB_READ | NB_INVAL | NB_NOCACHE))) { upl_flags = UPL_ABORT_DUMP_PAGES; - else + } else { upl_flags = 0; + } ubc_upl_abort(upl, upl_flags); goto pagelist_cleanup_done; } - for (i=0; i <= (bp->nb_bufsize - 1)/PAGE_SIZE; i++) { - if (!NBPGVALID(bp,i)) + for (i = 0; i <= (bp->nb_bufsize - 1) / PAGE_SIZE; i++) { + if (!NBPGVALID(bp, i)) { ubc_upl_abort_range(upl, - i*PAGE_SIZE, PAGE_SIZE, - UPL_ABORT_DUMP_PAGES | - UPL_ABORT_FREE_ON_EMPTY); - else { - if (NBPGDIRTY(bp,i)) + i * PAGE_SIZE, PAGE_SIZE, + UPL_ABORT_DUMP_PAGES | + UPL_ABORT_FREE_ON_EMPTY); + } else { + if (NBPGDIRTY(bp, i)) { upl_flags = UPL_COMMIT_SET_DIRTY; - else + } else { upl_flags = UPL_COMMIT_CLEAR_DIRTY; + } + + if (!ISSET(bp->nb_flags, (NB_NEEDCOMMIT | NB_DELWRI))) { + upl_flags |= UPL_COMMIT_CLEAR_PRECIOUS; + } + ubc_upl_commit_range(upl, - i*PAGE_SIZE, PAGE_SIZE, - upl_flags | - UPL_COMMIT_INACTIVATE | - UPL_COMMIT_FREE_ON_EMPTY); + i * PAGE_SIZE, PAGE_SIZE, + upl_flags | + UPL_COMMIT_INACTIVATE | + UPL_COMMIT_FREE_ON_EMPTY); } } pagelist_cleanup_done: - /* was this the last buffer in the file? */ + /* invalidate any pages past EOF */ if (NBOFF(bp) + bp->nb_bufsize > (off_t)(np->n_size)) { - /* if so, invalidate all pages of last buffer past EOF */ off_t start, end; start = trunc_page_64(np->n_size) + PAGE_SIZE_64; end = trunc_page_64(NBOFF(bp) + bp->nb_bufsize); + if (start < NBOFF(bp)) { + start = NBOFF(bp); + } if (end > start) { - if (!(rv = ubc_sync_range(vp, start, end, UBC_INVALIDATE))) - printf("nfs_buf_release(): ubc_sync_range failed!\n"); + if ((rv = ubc_msync(vp, start, end, NULL, UBC_INVALIDATE))) { + printf("nfs_buf_release(): ubc_msync failed!, error %d\n", rv); + } } } CLR(bp->nb_flags, NB_PAGELIST); @@ -1054,8 +1207,9 @@ pagelist_cleanup_done: /* If it's non-needcommit nocache, or an error, mark it invalid. 
*/ if (ISSET(bp->nb_flags, NB_ERROR) || - (ISSET(bp->nb_flags, NB_NOCACHE) && !ISSET(bp->nb_flags, (NB_NEEDCOMMIT | NB_DELWRI)))) + (ISSET(bp->nb_flags, NB_NOCACHE) && !ISSET(bp->nb_flags, (NB_NEEDCOMMIT | NB_DELWRI)))) { SET(bp->nb_flags, NB_INVAL); + } if ((bp->nb_bufsize <= 0) || ISSET(bp->nb_flags, NB_INVAL)) { /* If it's invalid or empty, dissociate it from its nfsnode */ @@ -1075,8 +1229,9 @@ pagelist_cleanup_done: /* invalidate usage timestamp to allow immediate freeing */ NBUFSTAMPINVALIDATE(bp); /* put buffer at head of free list */ - if (bp->nb_free.tqe_next != NFSNOLIST) + if (bp->nb_free.tqe_next != NFSNOLIST) { panic("nfsbuf on freelist"); + } SET(bp->nb_flags, NB_INVAL); if (ISSET(bp->nb_flags, NB_META)) { TAILQ_INSERT_HEAD(&nfsbuffreemeta, bp, nb_free); @@ -1087,8 +1242,9 @@ pagelist_cleanup_done: } } else if (ISSET(bp->nb_flags, NB_DELWRI)) { /* put buffer at end of delwri list */ - if (bp->nb_free.tqe_next != NFSNOLIST) + if (bp->nb_free.tqe_next != NFSNOLIST) { panic("nfsbuf on freelist"); + } TAILQ_INSERT_TAIL(&nfsbufdelwri, bp, nb_free); nfsbufdelwricnt++; freeup = 0; @@ -1097,8 +1253,9 @@ pagelist_cleanup_done: microuptime(&now); bp->nb_timestamp = now.tv_sec; /* put buffer at end of free list */ - if (bp->nb_free.tqe_next != NFSNOLIST) + if (bp->nb_free.tqe_next != NFSNOLIST) { panic("nfsbuf on freelist"); + } if (ISSET(bp->nb_flags, NB_META)) { TAILQ_INSERT_TAIL(&nfsbuffreemeta, bp, nb_free); nfsbuffreemetacnt++; @@ -1118,14 +1275,18 @@ pagelist_cleanup_done: lck_mtx_unlock(nfs_buf_mutex); - if (wakeup_needbuffer) + if (wakeup_needbuffer) { wakeup(&nfsneedbuffer); - if (wakeup_buffer) + } + if (wakeup_buffer) { wakeup(bp); - if (wakeup_nbdwrite) + } + if (wakeup_nbdwrite) { wakeup(&nfs_nbdwrite); - if (freeup) + } + if (freeup) { NFS_BUF_FREEUP(); + } } /* @@ -1139,8 +1300,9 @@ nfs_buf_iowait(struct nfsbuf *bp) lck_mtx_lock(nfs_buf_mutex); - while (!ISSET(bp->nb_flags, NB_DONE)) + while (!ISSET(bp->nb_flags, NB_DONE)) { msleep(bp, nfs_buf_mutex, PRIBIO + 1, "nfs_buf_iowait", NULL); + } lck_mtx_unlock(nfs_buf_mutex); @@ -1149,10 +1311,11 @@ nfs_buf_iowait(struct nfsbuf *bp) /* check for interruption of I/O, then errors. */ if (ISSET(bp->nb_flags, NB_EINTR)) { CLR(bp->nb_flags, NB_EINTR); - return (EINTR); - } else if (ISSET(bp->nb_flags, NB_ERROR)) - return (bp->nb_error ? bp->nb_error : EIO); - return (0); + return EINTR; + } else if (ISSET(bp->nb_flags, NB_ERROR)) { + return bp->nb_error ? 
bp->nb_error : EIO; + } + return 0; } /* @@ -1161,11 +1324,11 @@ nfs_buf_iowait(struct nfsbuf *bp) void nfs_buf_iodone(struct nfsbuf *bp) { - FSDBG_TOP(550, bp, NBOFF(bp), bp->nb_flags, bp->nb_error); - if (ISSET(bp->nb_flags, NB_DONE)) + if (ISSET(bp->nb_flags, NB_DONE)) { panic("nfs_buf_iodone already"); + } if (!ISSET(bp->nb_flags, NB_READ)) { CLR(bp->nb_flags, NB_WRITEINPROG); @@ -1174,15 +1337,18 @@ nfs_buf_iodone(struct nfsbuf *bp) * any throttled write operations */ vnode_writedone(NFSTOV(bp->nb_np)); + nfs_node_lock_force(bp->nb_np); + bp->nb_np->n_numoutput--; + nfs_node_unlock(bp->nb_np); } - if (ISSET(bp->nb_flags, NB_ASYNC)) { /* if async, release it */ - SET(bp->nb_flags, NB_DONE); /* note that it's done */ + if (ISSET(bp->nb_flags, NB_ASYNC)) { /* if async, release it */ + SET(bp->nb_flags, NB_DONE); /* note that it's done */ nfs_buf_release(bp, 1); - } else { /* or just wakeup the buffer */ - lck_mtx_lock(nfs_buf_mutex); - SET(bp->nb_flags, NB_DONE); /* note that it's done */ + } else { /* or just wakeup the buffer */ + lck_mtx_lock(nfs_buf_mutex); + SET(bp->nb_flags, NB_DONE); /* note that it's done */ CLR(bp->nb_lflags, NBL_WANTED); - lck_mtx_unlock(nfs_buf_mutex); + lck_mtx_unlock(nfs_buf_mutex); wakeup(bp); } @@ -1208,8 +1374,9 @@ nfs_buf_write_delayed(struct nfsbuf *bp) lck_mtx_lock(nfs_buf_mutex); nfs_nbdwrite++; NFSBUFCNTCHK(); - if (bp->nb_vnbufs.le_next != NFSNOLIST) + if (bp->nb_vnbufs.le_next != NFSNOLIST) { LIST_REMOVE(bp, nb_vnbufs); + } LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs); lck_mtx_unlock(nfs_buf_mutex); } @@ -1221,16 +1388,17 @@ nfs_buf_write_delayed(struct nfsbuf *bp) vnode_waitforwrites(NFSTOV(np), VNODE_ASYNC_THROTTLE, 0, 0, "nfs_buf_write_delayed"); /* the file is in a modified state, so make sure the flag's set */ - nfs_lock(np, NFS_NODE_LOCK_FORCE); + nfs_node_lock_force(np); np->n_flag |= NMODIFIED; - nfs_unlock(np); + nfs_node_unlock(np); /* * If we have too many delayed write buffers, * just fall back to doing the async write. 
*/ - if (nfs_nbdwrite < 0) + if (nfs_nbdwrite < 0) { panic("nfs_buf_write_delayed: Negative nfs_nbdwrite"); + } if (nfs_nbdwrite > NFS_A_LOT_OF_DELAYED_WRITES) { /* issue async write */ SET(bp->nb_flags, NB_ASYNC); @@ -1256,22 +1424,25 @@ nfs_buf_check_write_verifier(nfsnode_t np, struct nfsbuf *bp) { struct nfsmount *nmp; - if (!ISSET(bp->nb_flags, NB_NEEDCOMMIT)) + if (!ISSET(bp->nb_flags, NB_NEEDCOMMIT)) { return; + } nmp = NFSTONMP(np); - if (!nmp) + if (nfs_mount_gone(nmp)) { return; - if (!ISSET(bp->nb_flags, NB_STALEWVERF) && (bp->nb_verf == nmp->nm_verf)) + } + if (!ISSET(bp->nb_flags, NB_STALEWVERF) && (bp->nb_verf == nmp->nm_verf)) { return; + } /* write verifier changed, clear commit/wverf flags */ CLR(bp->nb_flags, (NB_NEEDCOMMIT | NB_STALEWVERF)); bp->nb_verf = 0; - nfs_lock(np, NFS_NODE_LOCK_FORCE); + nfs_node_lock_force(np); np->n_needcommitcnt--; CHECK_NEEDCOMMITCNT(np); - nfs_unlock(np); + nfs_node_unlock(np); } /* @@ -1281,7 +1452,7 @@ nfs_buf_check_write_verifier(nfsnode_t np, struct nfsbuf *bp) void nfs_buf_refget(struct nfsbuf *bp) { - bp->nb_refs++; + os_ref_retain_locked(&bp->nb_refs); } /* * release a reference on a buffer @@ -1290,7 +1461,7 @@ nfs_buf_refget(struct nfsbuf *bp) void nfs_buf_refrele(struct nfsbuf *bp) { - bp->nb_refs--; + (void) os_ref_release_locked(&bp->nb_refs); } /* @@ -1304,30 +1475,33 @@ nfs_buf_acquire(struct nfsbuf *bp, int flags, int slpflag, int slptimeo) struct timespec ts; if (ISSET(bp->nb_lflags, NBL_BUSY)) { - /* - * since the mutex_lock may block, the buffer + /* + * since the lck_mtx_lock may block, the buffer * may become BUSY, so we need to recheck for * a NOWAIT request */ - if (flags & NBAC_NOWAIT) - return (EBUSY); - SET(bp->nb_lflags, NBL_WANTED); + if (flags & NBAC_NOWAIT) { + return EBUSY; + } + SET(bp->nb_lflags, NBL_WANTED); - ts.tv_sec = (slptimeo/100); + ts.tv_sec = (slptimeo / 100); /* the hz value is 100; which leads to 10ms */ ts.tv_nsec = (slptimeo % 100) * 10 * NSEC_PER_USEC * 1000; error = msleep(bp, nfs_buf_mutex, slpflag | (PRIBIO + 1), - "nfs_buf_acquire", &ts); - if (error) - return (error); - return (EAGAIN); + "nfs_buf_acquire", &ts); + if (error) { + return error; + } + return EAGAIN; + } + if (flags & NBAC_REMOVE) { + nfs_buf_remfree(bp); } - if (flags & NBAC_REMOVE) - nfs_buf_remfree(bp); SET(bp->nb_lflags, NBL_BUSY); - return (0); + return 0; } /* @@ -1339,17 +1513,19 @@ nfs_buf_drop(struct nfsbuf *bp) { int need_wakeup = 0; - if (!ISSET(bp->nb_lflags, NBL_BUSY)) + if (!ISSET(bp->nb_lflags, NBL_BUSY)) { panic("nfs_buf_drop: buffer not busy!"); + } if (ISSET(bp->nb_lflags, NBL_WANTED)) { - /* delay the actual wakeup until after we clear NBL_BUSY */ + /* delay the actual wakeup until after we clear NBL_BUSY */ need_wakeup = 1; } /* Unlock the buffer. 
*/ CLR(bp->nb_lflags, (NBL_BUSY | NBL_WANTED)); - if (need_wakeup) - wakeup(bp); + if (need_wakeup) { + wakeup(bp); + } } /* @@ -1362,31 +1538,32 @@ nfs_buf_iterprepare(nfsnode_t np, struct nfsbuflists *iterheadp, int flags) { struct nfsbuflists *listheadp; - if (flags & NBI_DIRTY) + if (flags & NBI_DIRTY) { listheadp = &np->n_dirtyblkhd; - else + } else { listheadp = &np->n_cleanblkhd; + } if ((flags & NBI_NOWAIT) && (np->n_bufiterflags & NBI_ITER)) { - LIST_INIT(iterheadp); - return(EWOULDBLOCK); + LIST_INIT(iterheadp); + return EWOULDBLOCK; } - while (np->n_bufiterflags & NBI_ITER) { - np->n_bufiterflags |= NBI_ITERWANT; + while (np->n_bufiterflags & NBI_ITER) { + np->n_bufiterflags |= NBI_ITERWANT; msleep(&np->n_bufiterflags, nfs_buf_mutex, 0, "nfs_buf_iterprepare", NULL); } if (LIST_EMPTY(listheadp)) { - LIST_INIT(iterheadp); - return(EINVAL); + LIST_INIT(iterheadp); + return EINVAL; } np->n_bufiterflags |= NBI_ITER; iterheadp->lh_first = listheadp->lh_first; - listheadp->lh_first->nb_vnbufs.le_prev = &iterheadp->lh_first; + listheadp->lh_first->nb_vnbufs.le_prev = &iterheadp->lh_first; LIST_INIT(listheadp); - return(0); + return 0; } /* @@ -1400,10 +1577,11 @@ nfs_buf_itercomplete(nfsnode_t np, struct nfsbuflists *iterheadp, int flags) struct nfsbuflists * listheadp; struct nfsbuf *bp; - if (flags & NBI_DIRTY) + if (flags & NBI_DIRTY) { listheadp = &np->n_dirtyblkhd; - else + } else { listheadp = &np->n_cleanblkhd; + } while (!LIST_EMPTY(iterheadp)) { bp = LIST_FIRST(iterheadp); @@ -1432,19 +1610,22 @@ nfs_buf_read(struct nfsbuf *bp) np = bp->nb_np; cred = bp->nb_rcred; - if (IS_VALID_CRED(cred)) + if (IS_VALID_CRED(cred)) { kauth_cred_ref(cred); + } thd = ISSET(bp->nb_flags, NB_ASYNC) ? NULL : current_thread(); /* sanity checks */ - if (!ISSET(bp->nb_flags, NB_READ)) + if (!ISSET(bp->nb_flags, NB_READ)) { panic("nfs_buf_read: !NB_READ"); - if (ISSET(bp->nb_flags, NB_DONE)) + } + if (ISSET(bp->nb_flags, NB_DONE)) { CLR(bp->nb_flags, NB_DONE); + } NFS_BUF_MAP(bp); - OSAddAtomic(1, (SInt32 *)&nfsstats.read_bios); + OSAddAtomic64(1, &nfsstats.read_bios); error = nfs_buf_read_rpc(bp, thd, cred); /* @@ -1452,9 +1633,10 @@ nfs_buf_read(struct nfsbuf *bp) * read. Otherwise, the read has already been finished. */ - if (IS_VALID_CRED(cred)) + if (IS_VALID_CRED(cred)) { kauth_cred_unref(&cred); - return (error); + } + return error; } /* @@ -1470,7 +1652,7 @@ nfs_buf_read_finish(struct nfsbuf *bp) /* update valid range */ bp->nb_validoff = 0; bp->nb_validend = bp->nb_endio; - if (bp->nb_endio < bp->nb_bufsize) { + if (bp->nb_endio < bp->nb_bufsize) { /* * The read may be short because we have unflushed writes * that are extending the file size and the reads hit the @@ -1480,20 +1662,22 @@ nfs_buf_read_finish(struct nfsbuf *bp) * in nfs_buf_read_rpc_finish(). 
*/ off_t boff = NBOFF(bp); - if ((off_t)np->n_size >= (boff + bp->nb_bufsize)) + if ((off_t)np->n_size >= (boff + bp->nb_bufsize)) { bp->nb_validend = bp->nb_bufsize; - else if ((off_t)np->n_size >= boff) + } else if ((off_t)np->n_size >= boff) { bp->nb_validend = np->n_size - boff; - else + } else { bp->nb_validend = 0; + } } if ((nmp = NFSTONMP(np)) && (nmp->nm_vers == NFS_VER2) && - ((NBOFF(bp) + bp->nb_validend) > 0x100000000LL)) + ((NBOFF(bp) + bp->nb_validend) > 0x100000000LL)) { bp->nb_validend = 0x100000000LL - NBOFF(bp); - bp->nb_valid = (1 << (round_page_32(bp->nb_validend) / PAGE_SIZE)) - 1; + } + nfs_buf_pgs_get_page_mask(&bp->nb_valid, round_page_64(bp->nb_validend) / PAGE_SIZE); if (bp->nb_validend & PAGE_MASK) { /* zero-fill remainder of last page */ - bzero(bp->nb_data + bp->nb_validend, bp->nb_bufsize - bp->nb_validend); + bzero(bp->nb_data + bp->nb_validend, PAGE_SIZE - (bp->nb_validend & PAGE_MASK)); } } nfs_buf_iodone(bp); @@ -1508,17 +1692,20 @@ nfs_buf_read_rpc(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred) struct nfsmount *nmp; nfsnode_t np = bp->nb_np; int error = 0, nfsvers, async; - int offset, length, nmrsize, nrpcs, len; + int offset; + uint64_t length, nrpcs; + uint32_t nmrsize; + size_t len; off_t boff; struct nfsreq *req; struct nfsreq_cbinfo cb; nmp = NFSTONMP(np); - if (!nmp) { + if (nfs_mount_gone(nmp)) { bp->nb_error = error = ENXIO; SET(bp->nb_flags, NB_ERROR); nfs_buf_iodone(bp); - return (error); + return error; } nfsvers = nmp->nm_vers; nmrsize = nmp->nm_rsize; @@ -1532,10 +1719,11 @@ nfs_buf_read_rpc(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred) bp->nb_error = error = EFBIG; SET(bp->nb_flags, NB_ERROR); nfs_buf_iodone(bp); - return (error); + return error; } - if ((boff + length - 1) > 0xffffffffLL) + if ((boff + length - 1) > 0xffffffffLL) { length = 0x100000000LL - boff; + } } /* Note: Can only do async I/O if nfsiods are configured. */ @@ -1556,17 +1744,24 @@ nfs_buf_read_rpc(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred) error = bp->nb_error; break; } - len = (length > nmrsize) ? nmrsize : length; - cb.rcb_args[0] = offset; - cb.rcb_args[1] = len; + len = (length > nmrsize) ? 
nmrsize : (uint32_t)length; + cb.rcb_args.offset = offset; + cb.rcb_args.length = len; +#if CONFIG_NFS4 + if (nmp->nm_vers >= NFS_VER4) { + cb.rcb_args.stategenid = nmp->nm_stategenid; + } +#endif req = NULL; error = nmp->nm_funcs->nf_read_rpc_async(np, boff + offset, len, thd, cred, &cb, &req); - if (error) + if (error) { break; + } offset += len; length -= len; - if (async) + if (async) { continue; + } nfs_buf_read_rpc_finish(req); if (ISSET(bp->nb_flags, NB_ERROR)) { error = bp->nb_error; @@ -1591,9 +1786,10 @@ nfs_buf_read_rpc(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred) nfs_buf_iodone(bp); } else { /* wait for the last RPC to mark it done */ - while (bp->nb_rpcs > 0) + while (bp->nb_rpcs > 0) { msleep(&bp->nb_rpcs, nfs_buf_mutex, 0, - "nfs_buf_read_rpc_cancel", NULL); + "nfs_buf_read_rpc_cancel", NULL); + } lck_mtx_unlock(nfs_buf_mutex); } } else { @@ -1601,7 +1797,7 @@ nfs_buf_read_rpc(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred) } } - return (error); + return error; } /* @@ -1611,29 +1807,34 @@ void nfs_buf_read_rpc_finish(struct nfsreq *req) { struct nfsmount *nmp; - size_t rlen; + size_t rlen, length; struct nfsreq_cbinfo cb; struct nfsbuf *bp; - int error = 0, nfsvers, offset, length, eof = 0, multasyncrpc, finished; + int error = 0, nfsvers, eof = 0, multasyncrpc, finished; + off_t offset; void *wakeme = NULL; struct nfsreq *rreq = NULL; nfsnode_t np; thread_t thd; kauth_cred_t cred; - struct uio uio; - struct iovec_32 io; + uio_t auio; + char uio_buf[UIO_SIZEOF(1)]; finish: np = req->r_np; thd = req->r_thread; cred = req->r_cred; - if (IS_VALID_CRED(cred)) + if (IS_VALID_CRED(cred)) { kauth_cred_ref(cred); + } cb = req->r_callback; bp = cb.rcb_bp; + if (cb.rcb_func) { /* take an extra reference on the nfsreq in case we want to resend it later due to grace error */ + nfs_request_ref(req, 0); + } nmp = NFSTONMP(np); - if (!nmp) { + if (nfs_mount_gone(nmp)) { SET(bp->nb_flags, NB_ERROR); bp->nb_error = error = ENXIO; } @@ -1644,39 +1845,82 @@ finish: } nfsvers = nmp->nm_vers; - offset = cb.rcb_args[0]; - rlen = length = cb.rcb_args[1]; - - uio.uio_iovs.iov32p = &io; - uio.uio_iovcnt = 1; - uio.uio_rw = UIO_READ; -#if 1 /* LP64todo - can't use new segment flags until the drivers are ready */ - uio.uio_segflg = UIO_SYSSPACE; -#else - uio.uio_segflg = UIO_SYSSPACE32; -#endif - io.iov_len = length; - uio_uio_resid_set(&uio, io.iov_len); - uio.uio_offset = NBOFF(bp) + offset; - io.iov_base = (uintptr_t) bp->nb_data + offset; + offset = cb.rcb_args.offset; + rlen = length = cb.rcb_args.length; + + auio = uio_createwithbuffer(1, NBOFF(bp) + offset, UIO_SYSSPACE, + UIO_READ, &uio_buf, sizeof(uio_buf)); + uio_addiov(auio, CAST_USER_ADDR_T(bp->nb_data + offset), length); /* finish the RPC */ - error = nmp->nm_funcs->nf_read_rpc_async_finish(np, req, &uio, &rlen, &eof); + error = nmp->nm_funcs->nf_read_rpc_async_finish(np, req, auio, &rlen, &eof); if ((error == EINPROGRESS) && cb.rcb_func) { /* async request restarted */ - if (IS_VALID_CRED(cred)) + if (cb.rcb_func) { + nfs_request_rele(req); + } + if (IS_VALID_CRED(cred)) { kauth_cred_unref(&cred); + } return; } - +#if CONFIG_NFS4 + if ((nmp->nm_vers >= NFS_VER4) && nfs_mount_state_error_should_restart(error) && !ISSET(bp->nb_flags, NB_ERROR)) { + lck_mtx_lock(&nmp->nm_lock); + if ((error != NFSERR_OLD_STATEID) && (error != NFSERR_GRACE) && (cb.rcb_args.stategenid == nmp->nm_stategenid)) { + NP(np, "nfs_buf_read_rpc_finish: error %d @ 0x%llx, 0x%x 0x%x, initiating recovery", + error, NBOFF(bp) + offset, cb.rcb_args.stategenid, 
nmp->nm_stategenid); + nfs_need_recover(nmp, error); + } + lck_mtx_unlock(&nmp->nm_lock); + if (np->n_flag & NREVOKE) { + error = EIO; + } else { + if (error == NFSERR_GRACE) { + if (cb.rcb_func) { + /* + * For an async I/O request, handle a grace delay just like + * jukebox errors. Set the resend time and queue it up. + */ + struct timeval now; + if (req->r_nmrep.nmc_mhead) { + mbuf_freem(req->r_nmrep.nmc_mhead); + req->r_nmrep.nmc_mhead = NULL; + } + req->r_error = 0; + microuptime(&now); + lck_mtx_lock(&req->r_mtx); + req->r_resendtime = now.tv_sec + 2; + req->r_xid = 0; // get a new XID + req->r_flags |= R_RESTART; + req->r_start = 0; + nfs_asyncio_resend(req); + lck_mtx_unlock(&req->r_mtx); + if (IS_VALID_CRED(cred)) { + kauth_cred_unref(&cred); + } + /* Note: nfsreq reference taken will be dropped later when finished */ + return; + } + /* otherwise, just pause a couple seconds and retry */ + tsleep(&nmp->nm_state, (PZERO - 1), "nfsgrace", 2 * hz); + } + if (!(error = nfs_mount_state_wait_for_recovery(nmp))) { + rlen = 0; + goto readagain; + } + } + } +#endif if (error) { SET(bp->nb_flags, NB_ERROR); bp->nb_error = error; goto out; } - if ((rlen > 0) && (bp->nb_endio < (offset + (int)rlen))) + if ((rlen > 0) && (bp->nb_endio < (offset + (int)rlen))) { bp->nb_endio = offset + rlen; + } if ((nfsvers == NFS_VER2) || eof || (rlen == 0)) { /* zero out the remaining data (up to EOF) */ @@ -1684,9 +1928,10 @@ finish: rpcrem = (length - rlen); eofrem = np->n_size - (NBOFF(bp) + offset + rlen); rem = (rpcrem < eofrem) ? rpcrem : eofrem; - if (rem > 0) - bzero(bp->nb_data + offset + rlen, rem); - } else if (((int)rlen < length) && !ISSET(bp->nb_flags, NB_ERROR)) { + if (rem > 0) { + NFS_BZERO(bp->nb_data + offset + rlen, rem); + } + } else if ((rlen < length) && !ISSET(bp->nb_flags, NB_ERROR)) { /* * short read * @@ -1694,19 +1939,30 @@ finish: * requested, so we need to issue another read for the rest. * (Don't bother if the buffer already hit an error.) */ +#if CONFIG_NFS4 +readagain: +#endif offset += rlen; length -= rlen; - cb.rcb_args[0] = offset; - cb.rcb_args[1] = length; - error = nmp->nm_funcs->nf_read_rpc_async(np, offset, length, thd, cred, &cb, &rreq); + cb.rcb_args.offset = offset; + cb.rcb_args.length = length; +#if CONFIG_NFS4 + if (nmp->nm_vers >= NFS_VER4) { + cb.rcb_args.stategenid = nmp->nm_stategenid; + } +#endif + error = nmp->nm_funcs->nf_read_rpc_async(np, NBOFF(bp) + offset, length, thd, cred, &cb, &rreq); if (!error) { - if (IS_VALID_CRED(cred)) + if (IS_VALID_CRED(cred)) { kauth_cred_unref(&cred); + } if (!cb.rcb_func) { /* if !async we'll need to wait for this RPC to finish */ req = rreq; + rreq = NULL; goto finish; } + nfs_request_rele(req); /* * We're done here. * Outstanding RPC count is unchanged. @@ -1719,8 +1975,12 @@ finish: } out: - if (IS_VALID_CRED(cred)) + if (cb.rcb_func) { + nfs_request_rele(req); + } + if (IS_VALID_CRED(cred)) { kauth_cred_unref(&cred); + } /* * Decrement outstanding RPC count on buffer @@ -1732,21 +1992,25 @@ out: */ multasyncrpc = ISSET(bp->nb_flags, NB_MULTASYNCRPC); - if (multasyncrpc) + if (multasyncrpc) { lck_mtx_lock(nfs_buf_mutex); + } bp->nb_rpcs--; finished = (bp->nb_rpcs == 0); - if (multasyncrpc) + if (multasyncrpc) { lck_mtx_unlock(nfs_buf_mutex); + } if (finished) { - if (multasyncrpc) + if (multasyncrpc) { wakeme = &bp->nb_rpcs; + } nfs_buf_read_finish(bp); - if (wakeme) + if (wakeme) { wakeup(wakeme); + } } } @@ -1754,277 +2018,248 @@ out: * Do buffer readahead. * Initiate async I/O to read buffers not in cache. 
*/ -static int +int nfs_buf_readahead(nfsnode_t np, int ioflag, daddr64_t *rabnp, daddr64_t lastrabn, thread_t thd, kauth_cred_t cred) { struct nfsmount *nmp = NFSTONMP(np); struct nfsbuf *bp; - int error = 0, nra; + int error = 0; + uint32_t nra; - if (!nmp) - return (ENXIO); - if (nmp->nm_readahead <= 0) - return (0); - if (*rabnp > lastrabn) - return (0); + if (nfs_mount_gone(nmp)) { + return ENXIO; + } + if (nmp->nm_readahead <= 0) { + return 0; + } + if (*rabnp > lastrabn) { + return 0; + } for (nra = 0; (nra < nmp->nm_readahead) && (*rabnp <= lastrabn); nra++, *rabnp = *rabnp + 1) { /* check if block exists and is valid. */ - error = nfs_buf_get(np, *rabnp, nmp->nm_biosize, thd, NBLK_READ|NBLK_NOWAIT, &bp); - if (error) + if ((*rabnp * nmp->nm_biosize) >= (off_t)np->n_size) { + /* stop reading ahead if we're beyond EOF */ + *rabnp = lastrabn; break; - if (!bp) + } + error = nfs_buf_get(np, *rabnp, nmp->nm_biosize, thd, NBLK_READ | NBLK_NOWAIT, &bp); + if (error) { + break; + } + nfs_node_lock_force(np); + np->n_lastrahead = *rabnp; + nfs_node_unlock(np); + if (!bp) { continue; + } if ((ioflag & IO_NOCACHE) && ISSET(bp->nb_flags, NB_CACHE) && - !bp->nb_dirty && !ISSET(bp->nb_flags, (NB_DELWRI|NB_NCRDAHEAD))) { + !nfs_buf_pgs_is_set(&bp->nb_dirty) && !ISSET(bp->nb_flags, (NB_DELWRI | NB_NCRDAHEAD))) { CLR(bp->nb_flags, NB_CACHE); - bp->nb_valid = 0; + NBPGS_ERASE(&bp->nb_valid); bp->nb_validoff = bp->nb_validend = -1; } - if ((bp->nb_dirtyend <= 0) && !bp->nb_dirty && - !ISSET(bp->nb_flags, (NB_CACHE|NB_DELWRI))) { - SET(bp->nb_flags, (NB_READ|NB_ASYNC)); - if (ioflag & IO_NOCACHE) + if ((bp->nb_dirtyend <= 0) && !nfs_buf_pgs_is_set(&bp->nb_dirty) && + !ISSET(bp->nb_flags, (NB_CACHE | NB_DELWRI))) { + SET(bp->nb_flags, (NB_READ | NB_ASYNC)); + if (ioflag & IO_NOCACHE) { SET(bp->nb_flags, NB_NCRDAHEAD); + } if (!IS_VALID_CRED(bp->nb_rcred) && IS_VALID_CRED(cred)) { kauth_cred_ref(cred); bp->nb_rcred = cred; } - if ((error = nfs_buf_read(bp))) + if ((error = nfs_buf_read(bp))) { break; + } continue; } nfs_buf_release(bp, 1); } - return (error); + return error; } /* - * NFS buffer I/O for reading files/directories. + * NFS buffer I/O for reading files. 
*/ int -nfs_bioread(nfsnode_t np, struct uio *uio, int ioflag, int *eofflag, vfs_context_t ctx) +nfs_bioread(nfsnode_t np, uio_t uio, int ioflag, vfs_context_t ctx) { vnode_t vp = NFSTOV(np); struct nfsbuf *bp = NULL; - struct nfs_vattr nvattr; struct nfsmount *nmp = VTONMP(vp); - daddr64_t lbn, rabn = 0, lastrabn, maxrabn = -1, tlbn; - off_t diff; - int error = 0, n = 0, on = 0; - int nfsvers, biosize; - caddr_t dp; - struct dirent *direntp = NULL; - enum vtype vtype; + daddr64_t lbn, rabn = 0, lastrabn, maxrabn = -1; + off_t diff, on = 0, n = 0; + int error = 0, n32; + int nfsvers, biosize, modified, readaheads = 0; thread_t thd; kauth_cred_t cred; + int64_t io_resid; - FSDBG_TOP(514, np, uio->uio_offset, uio_uio_resid(uio), ioflag); - - if (uio_uio_resid(uio) == 0) { - FSDBG_BOT(514, np, 0xd1e0001, 0, 0); - return (0); - } - if (uio->uio_offset < 0) { - FSDBG_BOT(514, np, 0xd1e0002, 0, EINVAL); - return (EINVAL); - } + FSDBG_TOP(514, np, uio_offset(uio), uio_resid(uio), ioflag); nfsvers = nmp->nm_vers; biosize = nmp->nm_biosize; thd = vfs_context_thread(ctx); cred = vfs_context_ucred(ctx); - vtype = vnode_vtype(vp); - if ((vtype != VREG) && (vtype != VDIR)) { - printf("nfs_bioread: type %x unexpected\n", vtype); + if (vnode_vtype(vp) != VREG) { + printf("nfs_bioread: type %x unexpected\n", vnode_vtype(vp)); FSDBG_BOT(514, np, 0xd1e0016, 0, EINVAL); - return (EINVAL); + return EINVAL; } /* - * For nfs, cache consistency can only be maintained approximately. + * For NFS, cache consistency can only be maintained approximately. * Although RFC1094 does not specify the criteria, the following is * believed to be compatible with the reference port. - * For nfs: - * If the file's modify time on the server has changed since the - * last read rpc or you have written to the file, - * you may have lost data cache consistency with the - * server, so flush all of the file's data out of the cache. - * Then force a getattr rpc to ensure that you have up to date - * attributes. + * + * If the file has changed since the last read RPC or you have + * written to the file, you may have lost data cache consistency + * with the server. So, check for a change, and flush all of the + * file's data out of the cache. * NB: This implies that cache data can be read when up to - * NFS_MAXATTRTIMEO seconds out of date. If you find that you need - * current attributes this could be forced by calling - * NATTRINVALIDATE() before the nfs_getattr() call. + * NFS_MAXATTRTIMO seconds out of date. If you find that you + * need current attributes, nfs_getattr() can be forced to fetch + * new attributes (via NATTRINVALIDATE() or NGA_UNCACHED). 
*/ - if (ISSET(np->n_flag, NUPDATESIZE)) + if (ISSET(np->n_flag, NUPDATESIZE)) { nfs_data_update_size(np, 0); + } - if ((error = nfs_lock(np, NFS_NODE_LOCK_EXCLUSIVE))) { + if ((error = nfs_node_lock(np))) { FSDBG_BOT(514, np, 0xd1e0222, 0, error); - return (error); + return error; } if (np->n_flag & NNEEDINVALIDATE) { np->n_flag &= ~NNEEDINVALIDATE; - nfs_unlock(np); - nfs_vinvalbuf(vp, V_SAVE|V_IGNORE_WRITEERR, ctx, 1); - if ((error = nfs_lock(np, NFS_NODE_LOCK_EXCLUSIVE))) { + nfs_node_unlock(np); + error = nfs_vinvalbuf(vp, V_SAVE | V_IGNORE_WRITEERR, ctx, 1); + if (!error) { + error = nfs_node_lock(np); + } + if (error) { FSDBG_BOT(514, np, 0xd1e0322, 0, error); - return (error); + return error; } } - if (np->n_flag & NMODIFIED) { - if (vtype == VDIR) { - nfs_invaldir(np); - nfs_unlock(np); - error = nfs_vinvalbuf(vp, V_SAVE, ctx, 1); - if (!error) - error = nfs_lock(np, NFS_NODE_LOCK_EXCLUSIVE); - if (error) { - FSDBG_BOT(514, np, 0xd1e0003, 0, error); - return (error); + modified = (np->n_flag & NMODIFIED); + nfs_node_unlock(np); + /* nfs_getattr() will check changed and purge caches */ + error = nfs_getattr(np, NULL, ctx, modified ? NGA_UNCACHED : NGA_CACHED); + if (error) { + FSDBG_BOT(514, np, 0xd1e0004, 0, error); + return error; + } + + if (uio_resid(uio) == 0) { + FSDBG_BOT(514, np, 0xd1e0001, 0, 0); + return 0; + } + if (uio_offset(uio) < 0) { + FSDBG_BOT(514, np, 0xd1e0002, 0, EINVAL); + return EINVAL; + } + + /* + * set up readahead - which may be limited by: + * + current request length (for IO_NOCACHE) + * + readahead setting + * + file size + */ + if (nmp->nm_readahead > 0) { + off_t end = uio_offset(uio) + uio_resid(uio); + if (end > (off_t)np->n_size) { + end = np->n_size; + } + rabn = uio_offset(uio) / biosize; + maxrabn = (end - 1) / biosize; + nfs_node_lock_force(np); + if (!(ioflag & IO_NOCACHE) && + (!rabn || (rabn == np->n_lastread) || (rabn == (np->n_lastread + 1)))) { + maxrabn += nmp->nm_readahead; + if ((maxrabn * biosize) >= (off_t)np->n_size) { + maxrabn = ((off_t)np->n_size - 1) / biosize; } } - NATTRINVALIDATE(np); - error = nfs_getattr(np, &nvattr, ctx, 1); - if (error) { - nfs_unlock(np); - FSDBG_BOT(514, np, 0xd1e0004, 0, error); - return (error); + if (maxrabn < np->n_lastrahead) { + np->n_lastrahead = -1; } - if (vtype == VDIR) { - /* if directory changed, purge any name cache entries */ - if (NFS_CHANGED_NC(nfsvers, np, &nvattr)) - cache_purge(vp); - NFS_CHANGED_UPDATE_NC(nfsvers, np, &nvattr); + if (rabn < np->n_lastrahead) { + rabn = np->n_lastrahead + 1; } - NFS_CHANGED_UPDATE(nfsvers, np, &nvattr); + nfs_node_unlock(np); } else { - error = nfs_getattr(np, &nvattr, ctx, 1); - if (error) { - nfs_unlock(np); - FSDBG_BOT(514, np, 0xd1e0005, 0, error); - return (error); - } - if (NFS_CHANGED(nfsvers, np, &nvattr)) { - if (vtype == VDIR) { - nfs_invaldir(np); - /* purge name cache entries */ - if (NFS_CHANGED_NC(nfsvers, np, &nvattr)) - cache_purge(vp); - } - nfs_unlock(np); - error = nfs_vinvalbuf(vp, V_SAVE, ctx, 1); - if (!error) - error = nfs_lock(np, NFS_NODE_LOCK_EXCLUSIVE); - if (error) { - FSDBG_BOT(514, np, 0xd1e0006, 0, error); - return (error); - } - if (vtype == VDIR) - NFS_CHANGED_UPDATE_NC(nfsvers, np, &nvattr); - NFS_CHANGED_UPDATE(nfsvers, np, &nvattr); - } + rabn = maxrabn = 0; } - nfs_unlock(np); - - if (vtype == VREG) { - if ((ioflag & IO_NOCACHE) && (uio_uio_resid(uio) < (2*biosize))) { - /* We have only a block or so to read, just do the rpc directly. 
*/ - error = nfs_read_rpc(np, uio, ctx); - FSDBG_BOT(514, np, uio->uio_offset, uio_uio_resid(uio), error); - return (error); - } - /* - * set up readahead - which may be limited by: - * + current request length (for IO_NOCACHE) - * + readahead setting - * + file size - */ - if (nmp->nm_readahead > 0) { - off_t end = uio->uio_offset + uio_uio_resid(uio); - if (end > (off_t)np->n_size) - end = np->n_size; - rabn = uio->uio_offset / biosize; - maxrabn = (end - 1) / biosize; - if (!(ioflag & IO_NOCACHE) && - (!rabn || (rabn == np->n_lastread) || (rabn == (np->n_lastread+1)))) { - maxrabn += nmp->nm_readahead; - if ((maxrabn * biosize) >= (off_t)np->n_size) - maxrabn = ((off_t)np->n_size - 1)/biosize; - } - } else { - rabn = maxrabn = 0; - } - } - - do { - - if (vtype == VREG) { - nfs_data_lock(np, NFS_NODE_LOCK_SHARED); - lbn = uio->uio_offset / biosize; + do { + nfs_data_lock(np, NFS_DATA_LOCK_SHARED); + lbn = uio_offset(uio) / biosize; /* * Copy directly from any cached pages without grabbing the bufs. - * - * Note: for "nocache" reads, we don't copy directly from UBC - * because any cached pages will be for readahead buffers that - * need to be invalidated anyway before we finish this request. + * (If we are NOCACHE and we've issued readahead requests, we need + * to grab the NB_NCRDAHEAD bufs to drop them.) */ - if (!(ioflag & IO_NOCACHE) && - (uio->uio_segflg == UIO_USERSPACE32 || - uio->uio_segflg == UIO_USERSPACE64 || - uio->uio_segflg == UIO_USERSPACE)) { - // LP64todo - fix this! - int io_resid = uio_uio_resid(uio); - diff = np->n_size - uio->uio_offset; - if (diff < io_resid) + if ((!(ioflag & IO_NOCACHE) || !readaheads) && + ((uio->uio_segflg == UIO_USERSPACE32 || + uio->uio_segflg == UIO_USERSPACE64 || + uio->uio_segflg == UIO_USERSPACE))) { + io_resid = uio_resid(uio); + diff = np->n_size - uio_offset(uio); + if (diff < io_resid) { io_resid = diff; + } if (io_resid > 0) { - error = cluster_copy_ubc_data(vp, uio, &io_resid, 0); + int count = (io_resid > INT_MAX) ? 
INT_MAX : (int)io_resid; + error = cluster_copy_ubc_data(vp, uio, &count, 0); if (error) { nfs_data_unlock(np); - FSDBG_BOT(514, np, uio->uio_offset, 0xcacefeed, error); - return (error); + FSDBG_BOT(514, np, uio_offset(uio), 0xcacefeed, error); + return error; } } /* count any biocache reads that we just copied directly */ - if (lbn != (uio->uio_offset / biosize)) { - OSAddAtomic((uio->uio_offset / biosize) - lbn, (SInt32*)&nfsstats.biocache_reads); - FSDBG(514, np, 0xcacefeed, uio->uio_offset, error); + if (lbn != (uio_offset(uio) / biosize)) { + OSAddAtomic64(NFS_ROUND_BLOCK(uio_offset(uio), biosize) - lbn, &nfsstats.biocache_reads); + FSDBG(514, np, 0xcacefeed, uio_offset(uio), error); } } - lbn = uio->uio_offset / biosize; - on = uio->uio_offset % biosize; - np->n_lastread = (uio->uio_offset - 1) / biosize; + lbn = uio_offset(uio) / biosize; + on = uio_offset(uio) % biosize; + nfs_node_lock_force(np); + np->n_lastread = (uio_offset(uio) - 1) / biosize; + nfs_node_unlock(np); + + if ((uio_resid(uio) <= 0) || (uio_offset(uio) >= (off_t)np->n_size)) { + nfs_data_unlock(np); + FSDBG_BOT(514, np, uio_offset(uio), uio_resid(uio), 0xaaaaaaaa); + return 0; + } /* adjust readahead block number, if necessary */ - if (rabn < lbn) + if (rabn < lbn) { rabn = lbn; + } lastrabn = MIN(maxrabn, lbn + nmp->nm_readahead); if (rabn <= lastrabn) { /* start readaheads */ error = nfs_buf_readahead(np, ioflag, &rabn, lastrabn, thd, cred); if (error) { nfs_data_unlock(np); FSDBG_BOT(514, np, 0xd1e000b, 1, error); - return (error); + return error; } + readaheads = 1; + OSAddAtomic64(rabn - lbn, &nfsstats.biocache_reads); + } else { + OSAddAtomic64(1, &nfsstats.biocache_reads); } - if ((uio_uio_resid(uio) <= 0) || (uio->uio_offset >= (off_t)np->n_size)) { - nfs_data_unlock(np); - FSDBG_BOT(514, np, uio->uio_offset, uio_uio_resid(uio), 0xaaaaaaaa); - return (0); - } - - OSAddAtomic(1, (SInt32*)&nfsstats.biocache_reads); - /* * If the block is in the cache and has the required data * in a valid region, just copy it out. @@ -2032,17 +2267,18 @@ nfs_bioread(nfsnode_t np, struct uio *uio, int ioflag, int *eofflag, vfs_context * as required. */ again: - // LP64todo - fix this! - n = min((unsigned)(biosize - on), uio_uio_resid(uio)); - diff = np->n_size - uio->uio_offset; - if (diff < n) + io_resid = uio_resid(uio); + n = (io_resid > (biosize - on)) ? (biosize - on) : io_resid; + diff = np->n_size - uio_offset(uio); + if (diff < n) { n = diff; + } error = nfs_buf_get(np, lbn, biosize, thd, NBLK_READ, &bp); if (error) { nfs_data_unlock(np); FSDBG_BOT(514, np, 0xd1e000c, 0, error); - return (error); + return error; } if ((ioflag & IO_NOCACHE) && ISSET(bp->nb_flags, NB_CACHE)) { @@ -2052,32 +2288,32 @@ again: * Invalidate the data if it wasn't just read * in as part of a "nocache readahead". */ - if (bp->nb_dirty || (bp->nb_dirtyend > 0)) { + if (nfs_buf_pgs_is_set(&bp->nb_dirty) || (bp->nb_dirtyend > 0)) { /* so write the buffer out and try again */ SET(bp->nb_flags, NB_NOCACHE); goto flushbuffer; } - if (!ISSET(bp->nb_flags, NB_NCRDAHEAD)) { - CLR(bp->nb_flags, NB_CACHE); - bp->nb_valid = 0; - } else { + if (ISSET(bp->nb_flags, NB_NCRDAHEAD)) { CLR(bp->nb_flags, NB_NCRDAHEAD); + SET(bp->nb_flags, NB_NOCACHE); } } /* if any pages are valid... 
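Each pass of the loop above maps the current uio position to a logical block, an offset within that block, and a byte count clipped to both the block boundary and EOF. A rough stand-alone illustration (simplified types, hypothetical helper; assumes the caller has already returned at EOF):

#include <stdint.h>
#include <sys/types.h>

/* Sketch: split a file position into (block, offset-in-block, count). */
static void
clip_read(off_t uio_off, int64_t io_resid, off_t file_size, int biosize,
    int64_t *lbn, off_t *on, int64_t *n)
{
    *lbn = uio_off / biosize;
    *on = uio_off % biosize;
    /* never cross the block boundary... */
    *n = (io_resid > (biosize - *on)) ? (biosize - *on) : io_resid;
    /* ...and never read past EOF (assumes uio_off < file_size) */
    if (file_size - uio_off < *n)
        *n = file_size - uio_off;
}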
*/ - if (bp->nb_valid) { + if (nfs_buf_pgs_is_set(&bp->nb_valid)) { /* ...check for any invalid pages in the read range */ - int pg, firstpg, lastpg, dirtypg; + off_t pg, firstpg, lastpg, dirtypg; dirtypg = firstpg = lastpg = -1; - pg = on/PAGE_SIZE; - while (pg <= (on + n - 1)/PAGE_SIZE) { - if (!NBPGVALID(bp,pg)) { - if (firstpg < 0) + pg = on / PAGE_SIZE; + while (pg <= (on + n - 1) / PAGE_SIZE) { + if (!NBPGVALID(bp, pg)) { + if (firstpg < 0) { firstpg = pg; + } lastpg = pg; - } else if (firstpg >= 0 && dirtypg < 0 && NBPGDIRTY(bp,pg)) + } else if (firstpg >= 0 && dirtypg < 0 && NBPGDIRTY(bp, pg)) { dirtypg = pg; + } pg++; } @@ -2086,8 +2322,8 @@ again: if (bp->nb_validoff < 0) { /* valid range isn't set up, so */ /* set it to what we know is valid */ - bp->nb_validoff = trunc_page(on); - bp->nb_validend = round_page(on+n); + bp->nb_validoff = trunc_page_64(on); + bp->nb_validend = round_page_64(on + n); nfs_buf_normalize_valid_range(np, bp); } goto buffer_ready; @@ -2095,7 +2331,7 @@ again: /* there are invalid pages in the read range */ if (((dirtypg > firstpg) && (dirtypg < lastpg)) || - (((firstpg*PAGE_SIZE) < bp->nb_dirtyend) && (((lastpg+1)*PAGE_SIZE) > bp->nb_dirtyoff))) { + (((firstpg * PAGE_SIZE) < bp->nb_dirtyend) && (((lastpg + 1) * PAGE_SIZE) > bp->nb_dirtyoff))) { /* there are also dirty page(s) (or range) in the read range, */ /* so write the buffer out and try again */ flushbuffer: @@ -2109,41 +2345,42 @@ flushbuffer: if (error) { nfs_data_unlock(np); FSDBG_BOT(514, np, 0xd1e000d, 0, error); - return (error); + return error; } goto again; } - if (!bp->nb_dirty && bp->nb_dirtyend <= 0 && - (lastpg - firstpg + 1) > (biosize/PAGE_SIZE)/2) { + if (!nfs_buf_pgs_is_set(&bp->nb_dirty) && bp->nb_dirtyend <= 0 && + (lastpg - firstpg + 1) > (biosize / PAGE_SIZE) / 2) { /* we need to read in more than half the buffer and the */ /* buffer's not dirty, so just fetch the whole buffer */ - bp->nb_valid = 0; + NBPGS_ERASE(&bp->nb_valid); } else { /* read the page range in */ uio_t auio; - char uio_buf[ UIO_SIZEOF(1) ]; - + char uio_buf[UIO_SIZEOF(1)]; + NFS_BUF_MAP(bp); auio = uio_createwithbuffer(1, (NBOFF(bp) + firstpg * PAGE_SIZE_64), - UIO_SYSSPACE, UIO_READ, &uio_buf[0], sizeof(uio_buf)); + UIO_SYSSPACE, UIO_READ, &uio_buf[0], sizeof(uio_buf)); if (!auio) { error = ENOMEM; } else { - uio_addiov(auio, CAST_USER_ADDR_T((bp->nb_data + firstpg * PAGE_SIZE)), - ((lastpg - firstpg + 1) * PAGE_SIZE)); + NFS_UIO_ADDIOV(auio, CAST_USER_ADDR_T(bp->nb_data + (firstpg * PAGE_SIZE)), + ((lastpg - firstpg + 1) * PAGE_SIZE)); error = nfs_read_rpc(np, auio, ctx); } if (error) { - if (ioflag & IO_NOCACHE) + if (ioflag & IO_NOCACHE) { SET(bp->nb_flags, NB_NOCACHE); + } nfs_buf_release(bp, 1); nfs_data_unlock(np); FSDBG_BOT(514, np, 0xd1e000e, 0, error); - return (error); + return error; } /* Make sure that the valid range is set to cover this read. 
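The page scan above decides between four outcomes: serve the read from cache, flush and retry, refetch the whole block, or read just the missing pages. A simplified single-word-bitmap version of that decision (assumes at most 64 pages per block; illustrative only, not the kernel's nfsbufpgs code):

#include <stdint.h>

enum read_plan { USE_CACHE, FLUSH_AND_RETRY, READ_WHOLE, READ_RANGE };

/*
 * Sketch: decide how to satisfy a read of [on, on+n) from a cached
 * block.  'valid'/'dirty' are per-page bitmaps (bit i == page i) and
 * dirtyoff/dirtyend describe the dirty byte range, as in the buffer.
 */
static enum read_plan
plan_read(uint64_t valid, uint64_t dirty, int dirtyoff, int dirtyend,
    int on, int n, int pagesize, int biosize, int *firstpg, int *lastpg)
{
    int pg, dirtypg = -1;

    *firstpg = *lastpg = -1;
    for (pg = on / pagesize; pg <= (on + n - 1) / pagesize; pg++) {
        if (!(valid & (1ULL << pg))) {
            if (*firstpg < 0)
                *firstpg = pg;
            *lastpg = pg;
        } else if (*firstpg >= 0 && dirtypg < 0 && (dirty & (1ULL << pg))) {
            dirtypg = pg;
        }
    }
    if (*firstpg < 0)
        return USE_CACHE;        /* every page in the range is valid */
    if ((dirtypg > *firstpg && dirtypg < *lastpg) ||
        ((*firstpg * pagesize) < dirtyend &&
        ((*lastpg + 1) * pagesize) > dirtyoff))
        return FLUSH_AND_RETRY;  /* dirty data overlaps the hole */
    if (dirty == 0 && dirtyend <= 0 &&
        (*lastpg - *firstpg + 1) > (biosize / pagesize) / 2)
        return READ_WHOLE;       /* cheaper to refetch the whole block */
    return READ_RANGE;           /* RPC just for pages firstpg..lastpg */
}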
*/ - bp->nb_validoff = trunc_page_32(on); - bp->nb_validend = round_page_32(on+n); + bp->nb_validoff = trunc_page_64(on); + bp->nb_validend = round_page_64(on + n); nfs_buf_normalize_valid_range(np, bp); if (uio_resid(auio) > 0) { /* if short read, must have hit EOF, */ @@ -2151,12 +2388,13 @@ flushbuffer: bzero(CAST_DOWN(caddr_t, uio_curriovbase(auio)), uio_resid(auio)); } /* mark the pages (successfully read) as valid */ - for (pg=firstpg; pg <= lastpg; pg++) - NBPGVALID_SET(bp,pg); + for (pg = firstpg; pg <= lastpg; pg++) { + NBPGVALID_SET(bp, pg); + } } } /* if no pages are valid, read the whole block */ - if (!bp->nb_valid) { + if (!nfs_buf_pgs_is_set(&bp->nb_valid)) { if (!IS_VALID_CRED(bp->nb_rcred) && IS_VALID_CRED(cred)) { kauth_cred_ref(cred); bp->nb_rcred = cred; @@ -2164,196 +2402,80 @@ flushbuffer: SET(bp->nb_flags, NB_READ); CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL)); error = nfs_buf_read(bp); + if (ioflag & IO_NOCACHE) { + SET(bp->nb_flags, NB_NOCACHE); + } if (error) { nfs_data_unlock(np); nfs_buf_release(bp, 1); FSDBG_BOT(514, np, 0xd1e000f, 0, error); - return (error); + return error; } } buffer_ready: /* validate read range against valid range and clip */ if (bp->nb_validend > 0) { diff = (on >= bp->nb_validend) ? 0 : (bp->nb_validend - on); - if (diff < n) + if (diff < n) { n = diff; - } - if (n > 0) - NFS_BUF_MAP(bp); - } else if (vtype == VDIR) { - OSAddAtomic(1, (SInt32*)&nfsstats.biocache_readdirs); - error = nfs_lock(np, NFS_NODE_LOCK_SHARED); - if (error || (np->n_direofoffset && (uio->uio_offset >= np->n_direofoffset))) { - if (!error) - nfs_unlock(np); - if (eofflag) - *eofflag = 1; - FSDBG_BOT(514, np, 0xde0f0001, 0, 0); - return (0); - } - nfs_unlock(np); - lbn = uio->uio_offset / NFS_DIRBLKSIZ; - on = uio->uio_offset & (NFS_DIRBLKSIZ - 1); - error = nfs_buf_get(np, lbn, NFS_DIRBLKSIZ, thd, NBLK_READ, &bp); - if (error) { - FSDBG_BOT(514, np, 0xd1e0012, 0, error); - return (error); - } - if (!ISSET(bp->nb_flags, NB_CACHE)) { - SET(bp->nb_flags, NB_READ); - error = nfs_buf_readdir(bp, ctx); - if (error) - nfs_buf_release(bp, 1); - while (error == NFSERR_BAD_COOKIE) { - error = nfs_lock(np, NFS_NODE_LOCK_EXCLUSIVE); - if (!error) { - nfs_invaldir(np); - nfs_unlock(np); } - error = nfs_vinvalbuf(vp, 0, ctx, 1); - /* - * Yuck! The directory has been modified on the - * server. The only way to get the block is by - * reading from the beginning to get all the - * offset cookies. - */ - for (tlbn = 0; tlbn <= lbn && !error; tlbn++) { - if ((error = nfs_lock(np, NFS_NODE_LOCK_SHARED))) - break; - if (np->n_direofoffset - && (tlbn * NFS_DIRBLKSIZ) >= np->n_direofoffset) { - nfs_unlock(np); - if (eofflag) - *eofflag = 1; - FSDBG_BOT(514, np, 0xde0f0002, 0, 0); - return (0); - } - nfs_unlock(np); - error = nfs_buf_get(np, tlbn, NFS_DIRBLKSIZ, thd, NBLK_READ, &bp); - if (error) { - FSDBG_BOT(514, np, 0xd1e0013, 0, error); - return (error); - } - if (!ISSET(bp->nb_flags, NB_CACHE)) { - SET(bp->nb_flags, NB_READ); - error = nfs_buf_readdir(bp, ctx); - /* - * no error + NB_INVAL == directory EOF, - * use the block. - */ - if (error == 0 && ISSET(bp->nb_flags, NB_INVAL)) { - if (eofflag) - *eofflag = 1; - break; - } - } - /* - * An error will throw away the block and the - * for loop will break out. If no error and this - * is not the block we want, we throw away the - * block and go for the next one via the for loop. - */ - if (error || (tlbn < lbn)) - nfs_buf_release(bp, 1); - } - } - /* - * The above while is repeated if we hit another cookie - * error. 
If we hit an error and it wasn't a cookie error, - * we give up. - */ - if (error) { - FSDBG_BOT(514, np, 0xd1e0014, 0, error); - return (error); - } } - /* - * Make sure we use a signed variant of min() since - * the second term may be negative. - */ - // LP64todo - fix this! - n = lmin(uio_uio_resid(uio), bp->nb_validend - on); - /* - * We keep track of the directory eof in - * np->n_direofoffset and chop it off as an - * extra step right here. - */ - if ((error = nfs_lock(np, NFS_NODE_LOCK_SHARED))) { - FSDBG_BOT(514, np, 0xd1e0115, 0, error); - return (error); - } - if (np->n_direofoffset && - n > np->n_direofoffset - uio->uio_offset) - n = np->n_direofoffset - uio->uio_offset; - nfs_unlock(np); - /* - * Make sure that we return an integral number of entries so - * that any subsequent calls will start copying from the start - * of the next entry. - * - * If the current value of n has the last entry cut short, - * set n to copy everything up to the last entry instead. - */ if (n > 0) { - dp = bp->nb_data + on; - while (dp < (bp->nb_data + on + n)) { - direntp = (struct dirent *)dp; - dp += direntp->d_reclen; + NFS_BUF_MAP(bp); + n32 = n > INT_MAX ? INT_MAX : (int)n; + error = uiomove(bp->nb_data + on, n32, uio); + if (!error && n > n32) { + error = uiomove(bp->nb_data + on + n32, (int)(n - n32), uio); } - if (dp > (bp->nb_data + on + n)) - n = (dp - direntp->d_reclen) - (bp->nb_data + on); } - } - if (n > 0) - error = uiomove(bp->nb_data + on, (int)n, uio); - if (vtype == VREG) { - if (ioflag & IO_NOCACHE) - SET(bp->nb_flags, NB_NOCACHE); nfs_buf_release(bp, 1); nfs_data_unlock(np); - np->n_lastread = (uio->uio_offset - 1) / biosize; - } else { - nfs_buf_release(bp, 1); - } - } while (error == 0 && uio_uio_resid(uio) > 0 && n > 0); - FSDBG_BOT(514, np, uio->uio_offset, uio_uio_resid(uio), error); - return (error); + nfs_node_lock_force(np); + np->n_lastread = (uio_offset(uio) - 1) / biosize; + nfs_node_unlock(np); + } while (error == 0 && uio_resid(uio) > 0 && n > 0); + FSDBG_BOT(514, np, uio_offset(uio), uio_resid(uio), error); + return error; } /* * limit the number of outstanding async I/O writes */ -static int +int nfs_async_write_start(struct nfsmount *nmp) { - int error = 0, slpflag = (nmp->nm_flag & NFSMNT_INT) ? PCATCH : 0; - struct timespec ts = {1, 0}; + int error = 0, slpflag = NMFLAG(nmp, INTR) ? 
PCATCH : 0; + struct timespec ts = { .tv_sec = 1, .tv_nsec = 0 }; - if (nfs_max_async_writes <= 0) - return (0); + if (nfs_max_async_writes <= 0) { + return 0; + } lck_mtx_lock(&nmp->nm_lock); - while (!error && (nfs_max_async_writes > 0) && (nmp->nm_asyncwrites >= nfs_max_async_writes)) { - if ((error = nfs_sigintr(nmp, NULL, current_thread(), 1))) + while ((nfs_max_async_writes > 0) && (nmp->nm_asyncwrites >= nfs_max_async_writes)) { + if ((error = nfs_sigintr(nmp, NULL, current_thread(), 1))) { break; - error = msleep(&nmp->nm_asyncwrites, &nmp->nm_lock, slpflag|(PZERO-1), "nfsasyncwrites", &ts); - if (error == EWOULDBLOCK) - error = 0; + } + msleep(&nmp->nm_asyncwrites, &nmp->nm_lock, slpflag | (PZERO - 1), "nfsasyncwrites", &ts); + slpflag = 0; } - if (!error) + if (!error) { nmp->nm_asyncwrites++; + } lck_mtx_unlock(&nmp->nm_lock); - return (error); + return error; } -static void +void nfs_async_write_done(struct nfsmount *nmp) { - if (nmp->nm_asyncwrites <= 0) + if (nmp->nm_asyncwrites <= 0) { return; + } lck_mtx_lock(&nmp->nm_lock); - if (nmp->nm_asyncwrites-- >= nfs_max_async_writes) + if (nmp->nm_asyncwrites-- >= nfs_max_async_writes) { wakeup(&nmp->nm_asyncwrites); + } lck_mtx_unlock(&nmp->nm_lock); } @@ -2376,19 +2498,20 @@ nfs_buf_write(struct nfsbuf *bp) thread_t thd; kauth_cred_t cred; proc_t p = current_proc(); - int iomode, doff, dend, firstpg, lastpg; - uint32_t pagemask; + int iomode; + off_t doff, dend, firstpg, lastpg; FSDBG_TOP(553, bp, NBOFF(bp), bp->nb_flags, 0); - if (!ISSET(bp->nb_lflags, NBL_BUSY)) + if (!ISSET(bp->nb_lflags, NBL_BUSY)) { panic("nfs_buf_write: buffer is not busy???"); + } np = bp->nb_np; async = ISSET(bp->nb_flags, NB_ASYNC); oldflags = bp->nb_flags; - CLR(bp->nb_flags, (NB_READ|NB_DONE|NB_ERROR|NB_DELWRI)); + CLR(bp->nb_flags, (NB_READ | NB_DONE | NB_ERROR | NB_DELWRI)); if (ISSET(oldflags, NB_DELWRI)) { lck_mtx_lock(nfs_buf_mutex); nfs_nbdwrite--; @@ -2398,46 +2521,63 @@ nfs_buf_write(struct nfsbuf *bp) } /* move to clean list */ - if (ISSET(oldflags, (NB_ASYNC|NB_DELWRI))) { + if (ISSET(oldflags, (NB_ASYNC | NB_DELWRI))) { lck_mtx_lock(nfs_buf_mutex); - if (bp->nb_vnbufs.le_next != NFSNOLIST) + if (bp->nb_vnbufs.le_next != NFSNOLIST) { LIST_REMOVE(bp, nb_vnbufs); + } LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs); lck_mtx_unlock(nfs_buf_mutex); } + nfs_node_lock_force(np); + np->n_numoutput++; + nfs_node_unlock(np); vnode_startwrite(NFSTOV(np)); - if (p && p->p_stats) - OSIncrementAtomic(&p->p_stats->p_ru.ru_oublock); + if (p && p->p_stats) { + OSIncrementAtomicLong(&p->p_stats->p_ru.ru_oublock); + } cred = bp->nb_wcred; - if (!IS_VALID_CRED(cred) && ISSET(bp->nb_flags, NB_READ)) + if (!IS_VALID_CRED(cred) && ISSET(bp->nb_flags, NB_READ)) { cred = bp->nb_rcred; /* shouldn't really happen, but... */ - if (IS_VALID_CRED(cred)) + } + if (IS_VALID_CRED(cred)) { kauth_cred_ref(cred); + } thd = async ? NULL : current_thread(); /* We need to make sure the pages are locked before doing I/O. 
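The nfs_async_write_start()/nfs_async_write_done() pair above is a counting gate on in-flight asynchronous writes. A rough pthread analogue of the same pattern (illustrative only; the kernel version sleeps a second at a time with msleep and also checks for signals):

#include <pthread.h>

static pthread_mutex_t gate_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  gate_cv   = PTHREAD_COND_INITIALIZER;
static int inflight;
static int max_async = 128;      /* nfs_max_async_writes stand-in */

static void
async_write_start(void)
{
    pthread_mutex_lock(&gate_lock);
    while (max_async > 0 && inflight >= max_async)
        pthread_cond_wait(&gate_cv, &gate_lock);
    inflight++;
    pthread_mutex_unlock(&gate_lock);
}

static void
async_write_done(void)
{
    pthread_mutex_lock(&gate_lock);
    if (inflight > 0 && inflight-- >= max_async)
        pthread_cond_broadcast(&gate_cv);   /* wakeup(&nm_asyncwrites) analogue */
    pthread_mutex_unlock(&gate_lock);
}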
*/ - if (!ISSET(bp->nb_flags, NB_META) && UBCINFOEXISTS(NFSTOV(np))) { - if (!ISSET(bp->nb_flags, NB_PAGELIST)) { - error = nfs_buf_upl_setup(bp); - if (error) { - printf("nfs_buf_write: upl create failed %d\n", error); - SET(bp->nb_flags, NB_ERROR); - bp->nb_error = error = EIO; - nfs_buf_iodone(bp); - goto out; + if (!ISSET(bp->nb_flags, NB_META)) { + if (UBCINFOEXISTS(NFSTOV(np))) { + if (!ISSET(bp->nb_flags, NB_PAGELIST)) { + error = nfs_buf_upl_setup(bp); + if (error) { + printf("nfs_buf_write: upl create failed %d\n", error); + SET(bp->nb_flags, NB_ERROR); + bp->nb_error = error = EIO; + nfs_buf_iodone(bp); + goto out; + } + nfs_buf_upl_check(bp); } - nfs_buf_upl_check(bp); + } else { + /* We should never be in nfs_buf_write() with no UBCINFO. */ + printf("nfs_buf_write: ubcinfo already gone\n"); + SET(bp->nb_flags, NB_ERROR); + bp->nb_error = error = EIO; + nfs_buf_iodone(bp); + goto out; } } /* If NB_NEEDCOMMIT is set, a commit RPC may do the trick. */ - if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) + if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) { nfs_buf_check_write_verifier(np, bp); + } if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) { struct nfsmount *nmp = NFSTONMP(np); - if (!nmp) { + if (nfs_mount_gone(nmp)) { SET(bp->nb_flags, NB_ERROR); bp->nb_error = error = EIO; nfs_buf_iodone(bp); @@ -2445,7 +2585,7 @@ nfs_buf_write(struct nfsbuf *bp) } SET(bp->nb_flags, NB_WRITEINPROG); error = nmp->nm_funcs->nf_commit_rpc(np, NBOFF(bp) + bp->nb_dirtyoff, - bp->nb_dirtyend - bp->nb_dirtyoff, bp->nb_wcred); + bp->nb_dirtyend - bp->nb_dirtyoff, bp->nb_wcred, bp->nb_verf); CLR(bp->nb_flags, NB_WRITEINPROG); if (error) { if (error != NFSERR_STALEWRITEVERF) { @@ -2457,66 +2597,82 @@ nfs_buf_write(struct nfsbuf *bp) } bp->nb_dirtyoff = bp->nb_dirtyend = 0; CLR(bp->nb_flags, NB_NEEDCOMMIT); - nfs_lock(np, NFS_NODE_LOCK_FORCE); + nfs_node_lock_force(np); np->n_needcommitcnt--; CHECK_NEEDCOMMITCNT(np); - nfs_unlock(np); + nfs_node_unlock(np); } if (!error && (bp->nb_dirtyend > 0)) { /* sanity check the dirty range */ if (NBOFF(bp) + bp->nb_dirtyend > (off_t) np->n_size) { bp->nb_dirtyend = np->n_size - NBOFF(bp); - if (bp->nb_dirtyoff >= bp->nb_dirtyend) + if (bp->nb_dirtyoff >= bp->nb_dirtyend) { bp->nb_dirtyoff = bp->nb_dirtyend = 0; + } } } if (!error && (bp->nb_dirtyend > 0)) { /* there's a dirty range that needs to be written out */ + nfsbufpgs pagemask, pagemaskand; NFS_BUF_MAP(bp); doff = bp->nb_dirtyoff; dend = bp->nb_dirtyend; /* if doff page is dirty, move doff to start of page */ - if (NBPGDIRTY(bp, doff / PAGE_SIZE)) + if (NBPGDIRTY(bp, doff / PAGE_SIZE)) { doff -= doff & PAGE_MASK; + } /* try to expand write range to include preceding dirty pages */ - if (!(doff & PAGE_MASK)) - while ((doff > 0) && NBPGDIRTY(bp, (doff - 1) / PAGE_SIZE)) + if (!(doff & PAGE_MASK)) { + while ((doff > 0) && NBPGDIRTY(bp, (doff - 1) / PAGE_SIZE)) { doff -= PAGE_SIZE; + } + } /* if dend page is dirty, move dend to start of next page */ - if ((dend & PAGE_MASK) && NBPGDIRTY(bp, dend / PAGE_SIZE)) - dend = round_page_32(dend); + if ((dend & PAGE_MASK) && NBPGDIRTY(bp, dend / PAGE_SIZE)) { + dend = round_page_64(dend); + } /* try to expand write range to include trailing dirty pages */ - if (!(dend & PAGE_MASK)) - while ((dend < bp->nb_bufsize) && NBPGDIRTY(bp, dend / PAGE_SIZE)) + if (!(dend & PAGE_MASK)) { + while ((dend < (int)bp->nb_bufsize) && NBPGDIRTY(bp, dend / PAGE_SIZE)) { dend += PAGE_SIZE; + } + } /* make sure to keep dend clipped to EOF */ - if ((NBOFF(bp) + dend) > (off_t) np->n_size) + if ((NBOFF(bp) + dend) > 
(off_t) np->n_size) { dend = np->n_size - NBOFF(bp); + } /* calculate range of complete pages being written */ - firstpg = round_page_32(doff) / PAGE_SIZE; - lastpg = (trunc_page_32(dend) - 1) / PAGE_SIZE; - /* calculate mask for that page range */ - pagemask = ((1 << (lastpg + 1)) - 1) & ~((1 << firstpg) - 1); + if (dend > doff) { + firstpg = doff / PAGE_SIZE; + lastpg = (dend - 1) / PAGE_SIZE; + /* calculate mask for that page range */ + nfs_buf_pgs_set_pages_between(&pagemask, firstpg, lastpg + 1); + } else { + NBPGS_ERASE(&pagemask); + } /* * compare page mask to nb_dirty; if there are other dirty pages * then write FILESYNC; otherwise, write UNSTABLE if async and * not needcommit/stable; otherwise write FILESYNC */ - if (bp->nb_dirty & ~pagemask) + nfs_buf_pgs_bit_not(&pagemask); + nfs_buf_pgs_bit_and(&bp->nb_dirty, &pagemask, &pagemaskand); + if (nfs_buf_pgs_is_set(&pagemaskand)) { iomode = NFS_WRITE_FILESYNC; - else if ((bp->nb_flags & (NB_ASYNC | NB_NEEDCOMMIT | NB_STABLE)) == NB_ASYNC) + } else if ((bp->nb_flags & (NB_ASYNC | NB_NEEDCOMMIT | NB_STABLE)) == NB_ASYNC) { iomode = NFS_WRITE_UNSTABLE; - else + } else { iomode = NFS_WRITE_FILESYNC; + } /* write the whole contiguous dirty range */ bp->nb_offio = doff; bp->nb_endio = dend; - OSAddAtomic(1, (SInt32 *)&nfsstats.write_bios); + OSAddAtomic64(1, &nfsstats.write_bios); SET(bp->nb_flags, NB_WRITEINPROG); error = nfs_buf_write_rpc(bp, iomode, thd, cred); @@ -2527,8 +2683,9 @@ nfs_buf_write(struct nfsbuf *bp) * pages pushed out. */ } else { - if (!error && bp->nb_dirty) /* write out any dirty pages */ + if (!error && nfs_buf_pgs_is_set(&bp->nb_dirty)) { /* write out any dirty pages */ error = nfs_buf_write_dirty_pages(bp, thd, cred); + } nfs_buf_iodone(bp); } /* note: bp is still valid only for !async case */ @@ -2538,8 +2695,9 @@ out: /* move to clean list */ if (oldflags & NB_DELWRI) { lck_mtx_lock(nfs_buf_mutex); - if (bp->nb_vnbufs.le_next != NFSNOLIST) + if (bp->nb_vnbufs.le_next != NFSNOLIST) { LIST_REMOVE(bp, nb_vnbufs); + } LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs); lck_mtx_unlock(nfs_buf_mutex); } @@ -2547,14 +2705,14 @@ out: nfs_buf_release(bp, 1); /* check if we need to invalidate (and we can) */ if ((np->n_flag & NNEEDINVALIDATE) && - !(np->n_bflag & (NBINVALINPROG|NBFLUSHINPROG))) { + !(np->n_bflag & (NBINVALINPROG | NBFLUSHINPROG))) { int invalidate = 0; - nfs_lock(np, NFS_NODE_LOCK_FORCE); + nfs_node_lock_force(np); if (np->n_flag & NNEEDINVALIDATE) { invalidate = 1; np->n_flag &= ~NNEEDINVALIDATE; } - nfs_unlock(np); + nfs_node_unlock(np); if (invalidate) { /* * There was a write error and we need to @@ -2567,14 +2725,15 @@ out: * the buffer busy. So we call vinvalbuf() after * releasing the buffer. */ - nfs_vinvalbuf2(NFSTOV(np), V_SAVE|V_IGNORE_WRITEERR, thd, cred, 1); + nfs_vinvalbuf2(NFSTOV(np), V_SAVE | V_IGNORE_WRITEERR, thd, cred, 1); } } } - if (IS_VALID_CRED(cred)) + if (IS_VALID_CRED(cred)) { kauth_cred_unref(&cred); - return (error); + } + return error; } /* @@ -2585,8 +2744,7 @@ nfs_buf_write_finish(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred) { nfsnode_t np = bp->nb_np; int error = (bp->nb_flags & NB_ERROR) ? 
bp->nb_error : 0; - int firstpg, lastpg; - uint32_t pagemask; + off_t firstpg, lastpg; if ((error == EINTR) || (error == ERESTART)) { CLR(bp->nb_flags, NB_ERROR); @@ -2594,31 +2752,37 @@ nfs_buf_write_finish(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred) } if (!error) { + nfsbufpgs pagemask; /* calculate range of complete pages being written */ - firstpg = round_page_32(bp->nb_offio) / PAGE_SIZE; - lastpg = (trunc_page_32(bp->nb_endio) - 1) / PAGE_SIZE; - /* calculate mask for that page range written */ - pagemask = ((1 << (lastpg + 1)) - 1) & ~((1 << firstpg) - 1); + if (bp->nb_endio > bp->nb_offio) { + firstpg = bp->nb_offio / PAGE_SIZE; + lastpg = (bp->nb_endio - 1) / PAGE_SIZE; + /* calculate mask for that page range written */ + nfs_buf_pgs_set_pages_between(&pagemask, firstpg, lastpg + 1); + } else { + NBPGS_ERASE(&pagemask); + } /* clear dirty bits for pages we've written */ - bp->nb_dirty &= ~pagemask; + nfs_buf_pgs_bit_not(&pagemask); + nfs_buf_pgs_bit_and(&bp->nb_dirty, &pagemask, &bp->nb_dirty); } /* manage needcommit state */ if (!error && (bp->nb_commitlevel == NFS_WRITE_UNSTABLE)) { if (!ISSET(bp->nb_flags, NB_NEEDCOMMIT)) { - nfs_lock(np, NFS_NODE_LOCK_FORCE); + nfs_node_lock_force(np); np->n_needcommitcnt++; - nfs_unlock(np); + nfs_node_unlock(np); SET(bp->nb_flags, NB_NEEDCOMMIT); } /* make sure nb_dirtyoff/nb_dirtyend reflect actual range written */ bp->nb_dirtyoff = bp->nb_offio; bp->nb_dirtyend = bp->nb_endio; } else if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) { - nfs_lock(np, NFS_NODE_LOCK_FORCE); + nfs_node_lock_force(np); np->n_needcommitcnt--; CHECK_NEEDCOMMITCNT(np); - nfs_unlock(np); + nfs_node_unlock(np); CLR(bp->nb_flags, NB_NEEDCOMMIT); } @@ -2649,8 +2813,9 @@ nfs_buf_write_finish(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred) if (ISSET(bp->nb_flags, NB_ASYNC)) { /* move to dirty list */ lck_mtx_lock(nfs_buf_mutex); - if (bp->nb_vnbufs.le_next != NFSNOLIST) + if (bp->nb_vnbufs.le_next != NFSNOLIST) { LIST_REMOVE(bp, nb_vnbufs); + } LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs); lck_mtx_unlock(nfs_buf_mutex); } @@ -2667,18 +2832,19 @@ nfs_buf_write_finish(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred) * buffer busy. Set a flag to do it after releasing * the buffer. 
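Earlier in nfs_buf_write_finish() above, the dirty-page bookkeeping amounts to building a mask for the page range that was actually written and clearing those bits from the buffer's dirty bitmap. With a single machine word the same idea looks like this (illustrative; the real code uses the multi-word nfsbufpgs helpers, and this sketch assumes at most 64 pages per buffer):

#include <stdint.h>

/* Sketch: mask covering pages [first, last], bit i == page i. */
static uint64_t
page_mask(int first, int last)
{
    uint64_t hi = (last >= 63) ? ~0ULL : ((1ULL << (last + 1)) - 1);

    return hi & ~((1ULL << first) - 1);
}

/* Clear the dirty bits for a completed write of bytes [offio, endio). */
static uint64_t
clear_written(uint64_t dirty, int offio, int endio, int pagesize)
{
    int firstpg, lastpg;

    if (endio <= offio)
        return dirty;            /* nothing was written */
    firstpg = offio / pagesize;
    lastpg = (endio - 1) / pagesize;
    return dirty & ~page_mask(firstpg, lastpg);
}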
*/ - nfs_lock(np, NFS_NODE_LOCK_FORCE); + nfs_node_lock_force(np); np->n_error = error; np->n_flag |= (NWRITEERR | NNEEDINVALIDATE); NATTRINVALIDATE(np); - nfs_unlock(np); + nfs_node_unlock(np); } /* clear the dirty range */ bp->nb_dirtyoff = bp->nb_dirtyend = 0; } - if (!error && bp->nb_dirty) + if (!error && nfs_buf_pgs_is_set(&bp->nb_dirty)) { nfs_buf_write_dirty_pages(bp, thd, cred); + } nfs_buf_iodone(bp); } @@ -2695,57 +2861,55 @@ nfs_buf_write_dirty_pages(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred) nfsnode_t np = bp->nb_np; struct nfsmount *nmp = NFSTONMP(np); int error = 0, commit, iomode, iomode2, len, pg, count, npages, off; - uint32_t dirty = bp->nb_dirty; + nfsbufpgs dirty; uint64_t wverf; - struct uio uio; - struct iovec_32 io; + uio_t auio; + char uio_buf[UIO_SIZEOF(1)]; - if (!bp->nb_dirty) - return (0); + if (!nfs_buf_pgs_is_set(&bp->nb_dirty)) { + return 0; + } /* there are pages marked dirty that need to be written out */ - OSAddAtomic(1, (SInt32 *)&nfsstats.write_bios); + OSAddAtomic64(1, &nfsstats.write_bios); NFS_BUF_MAP(bp); SET(bp->nb_flags, NB_WRITEINPROG); npages = bp->nb_bufsize / PAGE_SIZE; iomode = NFS_WRITE_UNSTABLE; - uio.uio_iovs.iov32p = &io; - uio.uio_iovcnt = 1; - uio.uio_rw = UIO_WRITE; -#if 1 /* LP64todo - can't use new segment flags until the drivers are ready */ - uio.uio_segflg = UIO_SYSSPACE; -#else - uio.uio_segflg = UIO_SYSSPACE32; -#endif + auio = uio_createwithbuffer(1, 0, UIO_SYSSPACE, UIO_WRITE, + &uio_buf, sizeof(uio_buf)); again: - dirty = bp->nb_dirty; + NBPGS_COPY(&dirty, &bp->nb_dirty); wverf = bp->nb_verf; commit = NFS_WRITE_FILESYNC; for (pg = 0; pg < npages; pg++) { - if (!NBPGDIRTY(bp, pg)) + if (!NBPGDIRTY(bp, pg)) { continue; + } count = 1; - while (((pg + count) < npages) && NBPGDIRTY(bp, pg + count)) + while (((pg + count) < npages) && NBPGDIRTY(bp, pg + count)) { count++; + } /* write count pages starting with page pg */ off = pg * PAGE_SIZE; len = count * PAGE_SIZE; /* clip writes to EOF */ - if (NBOFF(bp) + off + len > (off_t) np->n_size) + if (NBOFF(bp) + off + len > (off_t) np->n_size) { len -= (NBOFF(bp) + off + len) - np->n_size; + } if (len > 0) { iomode2 = iomode; - io.iov_len = len; - uio_uio_resid_set(&uio, io.iov_len); - uio.uio_offset = NBOFF(bp) + off; - io.iov_base = (uintptr_t) bp->nb_data + off; - error = nfs_write_rpc2(np, &uio, thd, cred, &iomode2, &bp->nb_verf); - if (error) + uio_reset(auio, NBOFF(bp) + off, UIO_SYSSPACE, UIO_WRITE); + uio_addiov(auio, CAST_USER_ADDR_T(bp->nb_data + off), len); + error = nfs_write_rpc2(np, auio, thd, cred, &iomode2, &bp->nb_verf); + if (error) { break; - if (iomode2 < commit) /* Retain the lowest commitment level returned. */ + } + if (iomode2 < commit) { /* Retain the lowest commitment level returned. 
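nfs_buf_write_dirty_pages() above walks the buffer's dirty-page bitmap and issues one write per contiguous run of dirty pages, clipping the last run to EOF. A self-contained sketch of that walk (single-word bitmap, assumes at most 64 pages; printf stands in for the write RPC):

#include <stdint.h>
#include <stdio.h>

static void
walk_dirty_runs(uint64_t dirty, int npages, int pagesize,
    long long boff, long long file_size)
{
    for (int pg = 0; pg < npages; pg++) {
        if (!(dirty & (1ULL << pg)))
            continue;
        int count = 1;
        while (pg + count < npages && (dirty & (1ULL << (pg + count))))
            count++;
        long long off = (long long)pg * pagesize;   /* offset within buffer */
        long long len = (long long)count * pagesize;
        if (boff + off + len > file_size)           /* clip the run to EOF */
            len -= (boff + off + len) - file_size;
        if (len > 0)
            printf("write %lld bytes at file offset %lld\n", len, boff + off);
        pg += count - 1;                            /* skip past this run */
    }
}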
*/ commit = iomode2; + } if ((commit != NFS_WRITE_FILESYNC) && (wverf != bp->nb_verf)) { /* verifier changed, redo all the writes filesync */ iomode = NFS_WRITE_FILESYNC; @@ -2754,15 +2918,16 @@ again: } /* clear dirty bits */ while (count--) { - dirty &= ~(1 << pg); - if (count) /* leave pg on last page */ + NBPGS_UNSET(&dirty, pg); + if (count) { /* leave pg on last page */ pg++; + } } } CLR(bp->nb_flags, NB_WRITEINPROG); if (!error && (commit != NFS_WRITE_FILESYNC)) { - error = nmp->nm_funcs->nf_commit_rpc(np, NBOFF(bp), bp->nb_bufsize, cred); + error = nmp->nm_funcs->nf_commit_rpc(np, NBOFF(bp), bp->nb_bufsize, cred, wverf); if (error == NFSERR_STALEWRITEVERF) { /* verifier changed, so we need to restart all the writes */ iomode = NFS_WRITE_FILESYNC; @@ -2770,12 +2935,12 @@ again: } } if (!error) { - bp->nb_dirty = dirty; + NBPGS_COPY(&bp->nb_dirty, &dirty); } else { SET(bp->nb_flags, NB_ERROR); bp->nb_error = error; } - return (error); + return error; } /* @@ -2787,18 +2952,21 @@ nfs_buf_write_rpc(struct nfsbuf *bp, int iomode, thread_t thd, kauth_cred_t cred struct nfsmount *nmp; nfsnode_t np = bp->nb_np; int error = 0, nfsvers, async; - int offset, length, nmwsize, nrpcs, len; + int64_t nrpcs; + size_t len; + uint32_t nmwsize; struct nfsreq *req; struct nfsreq_cbinfo cb; - struct uio uio; - struct iovec_32 io; + uio_t auio; + char uio_buf[UIO_SIZEOF(1)]; + off_t offset, length; nmp = NFSTONMP(np); - if (!nmp) { + if (nfs_mount_gone(nmp)) { bp->nb_error = error = ENXIO; SET(bp->nb_flags, NB_ERROR); nfs_buf_iodone(bp); - return (error); + return error; } nfsvers = nmp->nm_vers; nmwsize = nmp->nm_wsize; @@ -2816,21 +2984,23 @@ nfs_buf_write_rpc(struct nfsbuf *bp, int iomode, thread_t thd, kauth_cred_t cred bp->nb_error = error = EFBIG; SET(bp->nb_flags, NB_ERROR); nfs_buf_iodone(bp); - return (error); + return error; } - uio.uio_iovs.iov32p = &io; - uio.uio_iovcnt = 1; - uio.uio_rw = UIO_WRITE; -#if 1 /* LP64todo - can't use new segment flags until the drivers are ready */ - uio.uio_segflg = UIO_SYSSPACE; + if (length == 0) { + /* We should never get here */ +#if DEVELOPMENT + printf("nfs_buf_write_rpc: Got request with zero length. np %p, bp %p, offset %lld\n", np, bp, offset); #else - uio.uio_segflg = UIO_SYSSPACE32; -#endif - io.iov_len = length; - uio_uio_resid_set(&uio, io.iov_len); - uio.uio_offset = NBOFF(bp) + offset; - io.iov_base = (uintptr_t) bp->nb_data + offset; + printf("nfs_buf_write_rpc: Got request with zero length.\n"); +#endif /* DEVELOPMENT */ + nfs_buf_iodone(bp); + return 0; + } + + auio = uio_createwithbuffer(1, NBOFF(bp) + offset, UIO_SYSSPACE, + UIO_WRITE, &uio_buf, sizeof(uio_buf)); + NFS_UIO_ADDIOV(auio, CAST_USER_ADDR_T(bp->nb_data + offset), length); bp->nb_rpcs = nrpcs = (length + nmwsize - 1) / nmwsize; if (async && (nrpcs > 1)) { @@ -2844,23 +3014,31 @@ nfs_buf_write_rpc(struct nfsbuf *bp, int iomode, thread_t thd, kauth_cred_t cred error = bp->nb_error; break; } - len = (length > nmwsize) ? nmwsize : length; - cb.rcb_args[0] = offset; - cb.rcb_args[1] = len; - if (async && ((error = nfs_async_write_start(nmp)))) + len = (length > nmwsize) ? 
nmwsize : (uint32_t)length; + cb.rcb_args.offset = offset; + cb.rcb_args.length = len; +#if CONFIG_NFS4 + if (nmp->nm_vers >= NFS_VER4) { + cb.rcb_args.stategenid = nmp->nm_stategenid; + } +#endif + if (async && ((error = nfs_async_write_start(nmp)))) { break; + } req = NULL; - error = nmp->nm_funcs->nf_write_rpc_async(np, &uio, len, thd, cred, - iomode, &cb, &req); + error = nmp->nm_funcs->nf_write_rpc_async(np, auio, len, thd, cred, + iomode, &cb, &req); if (error) { - if (async) + if (async) { nfs_async_write_done(nmp); + } break; } offset += len; length -= len; - if (async) + if (async) { continue; + } nfs_buf_write_rpc_finish(req); } @@ -2881,17 +3059,22 @@ nfs_buf_write_rpc(struct nfsbuf *bp, int iomode, thread_t thd, kauth_cred_t cred nfs_buf_write_finish(bp, thd, cred); } else { /* wait for the last RPC to mark it done */ - while (bp->nb_rpcs > 0) + while (bp->nb_rpcs > 0) { msleep(&bp->nb_rpcs, nfs_buf_mutex, 0, - "nfs_buf_write_rpc_cancel", NULL); + "nfs_buf_write_rpc_cancel", NULL); + } lck_mtx_unlock(nfs_buf_mutex); } } else { nfs_buf_write_finish(bp, thd, cred); } + /* It may have just been an interrupt... that's OK */ + if (!ISSET(bp->nb_flags, NB_ERROR)) { + error = 0; + } } - return (error); + return error; } /* @@ -2900,10 +3083,11 @@ nfs_buf_write_rpc(struct nfsbuf *bp, int iomode, thread_t thd, kauth_cred_t cred void nfs_buf_write_rpc_finish(struct nfsreq *req) { - int error = 0, nfsvers, offset, length, multasyncrpc, finished; + int error = 0, nfsvers, multasyncrpc, finished; int committed = NFS_WRITE_FILESYNC; uint64_t wverf = 0; - size_t rlen; + off_t offset; + size_t rlen, length; void *wakeme = NULL; struct nfsreq_cbinfo cb; struct nfsreq *wreq = NULL; @@ -2912,20 +3096,24 @@ nfs_buf_write_rpc_finish(struct nfsreq *req) nfsnode_t np; thread_t thd; kauth_cred_t cred; - struct uio uio; - struct iovec_32 io; + uio_t auio; + char uio_buf[UIO_SIZEOF(1)]; finish: np = req->r_np; thd = req->r_thread; cred = req->r_cred; - if (IS_VALID_CRED(cred)) + if (IS_VALID_CRED(cred)) { kauth_cred_ref(cred); + } cb = req->r_callback; bp = cb.rcb_bp; + if (cb.rcb_func) { /* take an extra reference on the nfsreq in case we want to resend it later due to grace error */ + nfs_request_ref(req, 0); + } nmp = NFSTONMP(np); - if (!nmp) { + if (nfs_mount_gone(nmp)) { SET(bp->nb_flags, NB_ERROR); bp->nb_error = error = ENXIO; } @@ -2936,24 +3124,76 @@ finish: } nfsvers = nmp->nm_vers; - offset = cb.rcb_args[0]; - rlen = length = cb.rcb_args[1]; + offset = cb.rcb_args.offset; + rlen = length = cb.rcb_args.length; /* finish the RPC */ error = nmp->nm_funcs->nf_write_rpc_async_finish(np, req, &committed, &rlen, &wverf); if ((error == EINPROGRESS) && cb.rcb_func) { /* async request restarted */ - if (IS_VALID_CRED(cred)) + if (cb.rcb_func) { + nfs_request_rele(req); + } + if (IS_VALID_CRED(cred)) { kauth_cred_unref(&cred); + } return; } - +#if CONFIG_NFS4 + if ((nmp->nm_vers >= NFS_VER4) && nfs_mount_state_error_should_restart(error) && !ISSET(bp->nb_flags, NB_ERROR)) { + lck_mtx_lock(&nmp->nm_lock); + if ((error != NFSERR_OLD_STATEID) && (error != NFSERR_GRACE) && (cb.rcb_args.stategenid == nmp->nm_stategenid)) { + NP(np, "nfs_buf_write_rpc_finish: error %d @ 0x%llx, 0x%x 0x%x, initiating recovery", + error, NBOFF(bp) + offset, cb.rcb_args.stategenid, nmp->nm_stategenid); + nfs_need_recover(nmp, error); + } + lck_mtx_unlock(&nmp->nm_lock); + if (np->n_flag & NREVOKE) { + error = EIO; + } else { + if (error == NFSERR_GRACE) { + if (cb.rcb_func) { + /* + * For an async I/O request, handle a grace 
delay just like + * jukebox errors. Set the resend time and queue it up. + */ + struct timeval now; + if (req->r_nmrep.nmc_mhead) { + mbuf_freem(req->r_nmrep.nmc_mhead); + req->r_nmrep.nmc_mhead = NULL; + } + req->r_error = 0; + microuptime(&now); + lck_mtx_lock(&req->r_mtx); + req->r_resendtime = now.tv_sec + 2; + req->r_xid = 0; // get a new XID + req->r_flags |= R_RESTART; + req->r_start = 0; + nfs_asyncio_resend(req); + lck_mtx_unlock(&req->r_mtx); + if (IS_VALID_CRED(cred)) { + kauth_cred_unref(&cred); + } + /* Note: nfsreq reference taken will be dropped later when finished */ + return; + } + /* otherwise, just pause a couple seconds and retry */ + tsleep(&nmp->nm_state, (PZERO - 1), "nfsgrace", 2 * hz); + } + if (!(error = nfs_mount_state_wait_for_recovery(nmp))) { + rlen = 0; + goto writeagain; + } + } + } +#endif if (error) { SET(bp->nb_flags, NB_ERROR); bp->nb_error = error; } - if (error || (nfsvers == NFS_VER2)) + if (error || (nfsvers == NFS_VER2)) { goto out; + } if (rlen <= 0) { SET(bp->nb_flags, NB_ERROR); bp->nb_error = error = EIO; @@ -2961,8 +3201,9 @@ finish: } /* save lowest commit level returned */ - if (committed < bp->nb_commitlevel) + if (committed < bp->nb_commitlevel) { bp->nb_commitlevel = committed; + } /* check the write verifier */ if (!bp->nb_verf) { @@ -2974,6 +3215,10 @@ finish: bp->nb_verf = wverf; } + if (!ISSET(bp->nb_flags, NB_STALEWVERF) && rlen > 0 && (bp->nb_offio < (offset + (int)rlen))) { + bp->nb_offio = offset + rlen; + } + /* * check for a short write * @@ -2981,36 +3226,38 @@ finish: * need to issue another write for the rest of it. * (Don't bother if the buffer hit an error or stale wverf.) */ - if (((int)rlen < length) && !(bp->nb_flags & (NB_STALEWVERF|NB_ERROR))) { + if ((rlen < length) && !(bp->nb_flags & (NB_STALEWVERF | NB_ERROR))) { +#if CONFIG_NFS4 +writeagain: +#endif offset += rlen; length -= rlen; - uio.uio_iovs.iov32p = &io; - uio.uio_iovcnt = 1; - uio.uio_rw = UIO_WRITE; -#if 1 /* LP64todo - can't use new segment flags until the drivers are ready */ - uio.uio_segflg = UIO_SYSSPACE; -#else - uio.uio_segflg = UIO_SYSSPACE32; -#endif - io.iov_len = length; - uio_uio_resid_set(&uio, io.iov_len); - uio.uio_offset = NBOFF(bp) + offset; - io.iov_base = (uintptr_t) bp->nb_data + offset; - - cb.rcb_args[0] = offset; - cb.rcb_args[1] = length; + auio = uio_createwithbuffer(1, NBOFF(bp) + offset, UIO_SYSSPACE, + UIO_WRITE, &uio_buf, sizeof(uio_buf)); + uio_addiov(auio, CAST_USER_ADDR_T(bp->nb_data + offset), length); - error = nmp->nm_funcs->nf_write_rpc_async(np, &uio, length, thd, cred, - NFS_WRITE_FILESYNC, &cb, &wreq); + cb.rcb_args.offset = offset; + cb.rcb_args.length = length; +#if CONFIG_NFS4 + if (nmp->nm_vers >= NFS_VER4) { + cb.rcb_args.stategenid = nmp->nm_stategenid; + } +#endif + // XXX iomode should really match the original request + error = nmp->nm_funcs->nf_write_rpc_async(np, auio, length, thd, cred, + NFS_WRITE_FILESYNC, &cb, &wreq); if (!error) { - if (IS_VALID_CRED(cred)) + if (IS_VALID_CRED(cred)) { kauth_cred_unref(&cred); + } if (!cb.rcb_func) { /* if !async we'll need to wait for this RPC to finish */ req = wreq; + wreq = NULL; goto finish; } + nfs_request_rele(req); /* * We're done here. * Outstanding RPC count is unchanged. @@ -3023,8 +3270,10 @@ finish: } out: - if (cb.rcb_func) + if (cb.rcb_func) { nfs_async_write_done(nmp); + nfs_request_rele(req); + } /* * Decrement outstanding RPC count on buffer * and call nfs_buf_write_finish on last RPC. 
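The write RPC path above splits a buffer's dirty range into RPCs no larger than the mount's write size and, when the server returns a short write, reissues the remainder from the new offset. A synchronous user-space sketch of that chunking (hypothetical transport callback; the kernel version is asynchronous and also rechecks the write verifier):

#include <stddef.h>

/* Hypothetical transport: writes up to 'len' bytes, returns bytes accepted. */
typedef size_t (*write_fn)(long long off, const char *data, size_t len);

static int
write_in_chunks(write_fn rpc, const char *data, long long off,
    size_t length, size_t wsize)
{
    while (length > 0) {
        size_t len = (length > wsize) ? wsize : length;
        size_t rlen = rpc(off, data, len);

        if (rlen == 0)
            return -1;      /* treat as an I/O error, as the kernel does */
        /* short write: advance by what was accepted and send the rest */
        off += rlen;
        data += rlen;
        length -= rlen;
    }
    return 0;
}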
@@ -3034,40 +3283,50 @@ out: * aborting a partially-initiated set of RPCs) */ multasyncrpc = ISSET(bp->nb_flags, NB_MULTASYNCRPC); - if (multasyncrpc) + if (multasyncrpc) { lck_mtx_lock(nfs_buf_mutex); + } bp->nb_rpcs--; finished = (bp->nb_rpcs == 0); - if (multasyncrpc) + if (multasyncrpc) { lck_mtx_unlock(nfs_buf_mutex); + } if (finished) { - if (multasyncrpc) + if (multasyncrpc) { wakeme = &bp->nb_rpcs; + } nfs_buf_write_finish(bp, thd, cred); - if (wakeme) + if (wakeme) { wakeup(wakeme); + } } - if (IS_VALID_CRED(cred)) + if (IS_VALID_CRED(cred)) { kauth_cred_unref(&cred); + } + + if (cb.rcb_func && np->n_needcommitcnt >= NFS_A_LOT_OF_NEEDCOMMITS) { + nfs_flushcommits(np, 1); + } } /* - * Send commit(s) for the given node's "needcommit" buffers + * Send commit(s) for the given node's "needcommit" buffers */ int nfs_flushcommits(nfsnode_t np, int nowait) { struct nfsmount *nmp; - struct nfsbuf *bp; + struct nfsbuf *bp, *prevlbp, *lbp; struct nfsbuflists blist, commitlist; - int error = 0, retv, wcred_set, flags, dirty; + int error = 0, retv, wcred_set, flags; u_quad_t off, endoff, toff; - u_int32_t count; + uint64_t wverf, count; kauth_cred_t wcred = NULL; + nfsbufpgs dirty; FSDBG_TOP(557, np, 0, 0, 0); @@ -3078,11 +3337,12 @@ nfs_flushcommits(nfsnode_t np, int nowait) * and the commit rpc is done. */ if (!LIST_EMPTY(&np->n_dirtyblkhd)) { - error = nfs_lock(np, NFS_NODE_LOCK_EXCLUSIVE); - if (error) + error = nfs_node_lock(np); + if (error) { goto done; + } np->n_flag |= NMODIFIED; - nfs_unlock(np); + nfs_node_unlock(np); } off = (u_quad_t)-1; @@ -3091,7 +3351,7 @@ nfs_flushcommits(nfsnode_t np, int nowait) LIST_INIT(&commitlist); nmp = NFSTONMP(np); - if (!nmp) { + if (nfs_mount_gone(nmp)) { error = ENXIO; goto done; } @@ -3101,57 +3361,34 @@ nfs_flushcommits(nfsnode_t np, int nowait) } flags = NBI_DIRTY; - if (nowait) + if (nowait) { flags |= NBI_NOWAIT; + } lck_mtx_lock(nfs_buf_mutex); + wverf = nmp->nm_verf; if (!nfs_buf_iterprepare(np, &blist, flags)) { while ((bp = LIST_FIRST(&blist))) { LIST_REMOVE(bp, nb_vnbufs); LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs); error = nfs_buf_acquire(bp, NBAC_NOWAIT, 0, 0); - if (error) + if (error) { continue; - if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) + } + if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) { nfs_buf_check_write_verifier(np, bp); - if (((bp->nb_flags & (NB_DELWRI | NB_NEEDCOMMIT)) - != (NB_DELWRI | NB_NEEDCOMMIT))) { + } + if (((bp->nb_flags & (NB_DELWRI | NB_NEEDCOMMIT)) != (NB_DELWRI | NB_NEEDCOMMIT)) || + (bp->nb_verf != wverf)) { nfs_buf_drop(bp); continue; } nfs_buf_remfree(bp); - lck_mtx_unlock(nfs_buf_mutex); - /* - * we need a upl to see if the page has been - * dirtied (think mmap) since the unstable write, and - * also to prevent vm from paging it during our commit rpc - */ - if (!ISSET(bp->nb_flags, NB_PAGELIST)) { - retv = nfs_buf_upl_setup(bp); - if (retv) { - /* unable to create upl */ - /* vm object must no longer exist */ - /* this could be fatal if we need */ - /* to write the data again, we'll see... */ - printf("nfs_flushcommits: upl create failed %d\n", retv); - bp->nb_valid = bp->nb_dirty = 0; - } - } - nfs_buf_upl_check(bp); - lck_mtx_lock(nfs_buf_mutex); + + /* buffer UPLs will be grabbed *in order* below */ FSDBG(557, bp, bp->nb_flags, bp->nb_valid, bp->nb_dirty); FSDBG(557, bp->nb_validoff, bp->nb_validend, - bp->nb_dirtyoff, bp->nb_dirtyend); - - /* - * We used to check for dirty pages here; if there were any - * we'd abort the commit and force the entire buffer to be - * written again. 
- * - * Instead of doing that, we now go ahead and commit the dirty - * range, and then leave the buffer around with dirty pages - * that will be written out later. - */ + bp->nb_dirtyoff, bp->nb_dirtyend); /* * Work out if all buffers are using the same cred @@ -3162,8 +3399,9 @@ nfs_flushcommits(nfsnode_t np, int nowait) */ if (wcred_set == 0) { wcred = bp->nb_wcred; - if (!IS_VALID_CRED(wcred)) + if (!IS_VALID_CRED(wcred)) { panic("nfs: needcommit w/out wcred"); + } wcred_set = 1; } else if ((wcred_set == 1) && wcred != bp->nb_wcred) { wcred_set = -1; @@ -3171,20 +3409,33 @@ nfs_flushcommits(nfsnode_t np, int nowait) SET(bp->nb_flags, NB_WRITEINPROG); /* - * A list of these buffers is kept so that the - * second loop knows which buffers have actually - * been committed. This is necessary, since there - * may be a race between the commit rpc and new - * uncommitted writes on the file. + * Add this buffer to the list of buffers we are committing. + * Buffers are inserted into the list in ascending order so that + * we can take the UPLs in order after the list is complete. */ + prevlbp = NULL; + LIST_FOREACH(lbp, &commitlist, nb_vnbufs) { + if (bp->nb_lblkno < lbp->nb_lblkno) { + break; + } + prevlbp = lbp; + } LIST_REMOVE(bp, nb_vnbufs); - LIST_INSERT_HEAD(&commitlist, bp, nb_vnbufs); + if (prevlbp) { + LIST_INSERT_AFTER(prevlbp, bp, nb_vnbufs); + } else { + LIST_INSERT_HEAD(&commitlist, bp, nb_vnbufs); + } + + /* update commit range start, end */ toff = NBOFF(bp) + bp->nb_dirtyoff; - if (toff < off) + if (toff < off) { off = toff; + } toff += (u_quad_t)(bp->nb_dirtyend - bp->nb_dirtyoff); - if (toff > endoff) + if (toff > endoff) { endoff = toff; + } } nfs_buf_itercomplete(np, &blist, NBI_DIRTY); } @@ -3195,6 +3446,29 @@ nfs_flushcommits(nfsnode_t np, int nowait) goto done; } + /* + * We need a UPL to prevent others from accessing the buffers during + * our commit RPC(s). + * + * We used to also check for dirty pages here; if there were any we'd + * abort the commit and force the entire buffer to be written again. + * Instead of doing that, we just go ahead and commit the dirty range, + * and then leave the buffer around with dirty pages that will be + * written out later. + */ + LIST_FOREACH(bp, &commitlist, nb_vnbufs) { + if (!ISSET(bp->nb_flags, NB_PAGELIST)) { + retv = nfs_buf_upl_setup(bp); + if (retv) { + /* Unable to create the UPL, the VM object probably no longer exists. */ + printf("nfs_flushcommits: upl create failed %d\n", retv); + NBPGS_ERASE(&bp->nb_valid); + NBPGS_ERASE(&bp->nb_dirty); + } + } + nfs_buf_upl_check(bp); + } + /* * Commit data on the server, as required. * If all bufs are using the same wcred, then use that with @@ -3206,19 +3480,21 @@ nfs_flushcommits(nfsnode_t np, int nowait) * Note, it's possible the commit range could be >2^32-1. * If it is, we'll send one commit that covers the whole file. 
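The off/endoff bookkeeping above collapses every needcommit buffer's dirty range into one COMMIT window; the code that follows falls back to a count of 0 (commit the whole file) when that window does not fit in 32 bits. A small sketch of the aggregation:

#include <stdint.h>

struct drange {
    uint64_t off;
    uint64_t len;
};

/* Sketch: fold n dirty ranges into a single COMMIT (offset, count). */
static void
commit_range(const struct drange *r, int n, uint64_t *off, uint64_t *count)
{
    uint64_t lo = UINT64_MAX, hi = 0;

    if (n == 0) {
        *off = *count = 0;
        return;
    }
    for (int i = 0; i < n; i++) {
        if (r[i].off < lo)
            lo = r[i].off;
        if (r[i].off + r[i].len > hi)
            hi = r[i].off + r[i].len;
    }
    *off = lo;
    *count = ((hi - lo) > 0xffffffffULL) ? 0 : (hi - lo);
}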
*/ - if ((endoff - off) > 0xffffffff) + if ((endoff - off) > 0xffffffff) { count = 0; - else + } else { count = (endoff - off); - retv = nmp->nm_funcs->nf_commit_rpc(np, off, count, wcred); + } + retv = nmp->nm_funcs->nf_commit_rpc(np, off, count, wcred, wverf); } else { retv = 0; LIST_FOREACH(bp, &commitlist, nb_vnbufs) { toff = NBOFF(bp) + bp->nb_dirtyoff; count = bp->nb_dirtyend - bp->nb_dirtyoff; - retv = nmp->nm_funcs->nf_commit_rpc(np, toff, count, bp->nb_wcred); - if (retv) + retv = nmp->nm_funcs->nf_commit_rpc(np, toff, count, bp->nb_wcred, wverf); + if (retv) { break; + } } } @@ -3230,11 +3506,11 @@ nfs_flushcommits(nfsnode_t np, int nowait) while ((bp = LIST_FIRST(&commitlist))) { LIST_REMOVE(bp, nb_vnbufs); FSDBG(557, bp, retv, bp->nb_flags, bp->nb_dirty); - nfs_lock(np, NFS_NODE_LOCK_FORCE); + nfs_node_lock_force(np); CLR(bp->nb_flags, (NB_NEEDCOMMIT | NB_WRITEINPROG)); np->n_needcommitcnt--; CHECK_NEEDCOMMITCNT(np); - nfs_unlock(np); + nfs_node_unlock(np); if (retv) { /* move back to dirty list */ @@ -3245,6 +3521,9 @@ nfs_flushcommits(nfsnode_t np, int nowait) continue; } + nfs_node_lock_force(np); + np->n_numoutput++; + nfs_node_unlock(np); vnode_startwrite(NFSTOV(np)); if (ISSET(bp->nb_flags, NB_DELWRI)) { lck_mtx_lock(nfs_buf_mutex); @@ -3253,13 +3532,15 @@ nfs_flushcommits(nfsnode_t np, int nowait) lck_mtx_unlock(nfs_buf_mutex); wakeup(&nfs_nbdwrite); } - CLR(bp->nb_flags, (NB_READ|NB_DONE|NB_ERROR|NB_DELWRI)); + CLR(bp->nb_flags, (NB_READ | NB_DONE | NB_ERROR | NB_DELWRI)); /* if block still has dirty pages, we don't want it to */ /* be released in nfs_buf_iodone(). So, don't set NB_ASYNC. */ - if (!(dirty = bp->nb_dirty)) + NBPGS_COPY(&dirty, &bp->nb_dirty); + if (!nfs_buf_pgs_is_set(&dirty)) { SET(bp->nb_flags, NB_ASYNC); - else + } else { CLR(bp->nb_flags, NB_ASYNC); + } /* move to clean list */ lck_mtx_lock(nfs_buf_mutex); @@ -3269,7 +3550,7 @@ nfs_flushcommits(nfsnode_t np, int nowait) bp->nb_dirtyoff = bp->nb_dirtyend = 0; nfs_buf_iodone(bp); - if (dirty) { + if (nfs_buf_pgs_is_set(&dirty)) { /* throw it back in as a delayed write buffer */ CLR(bp->nb_flags, NB_DONE); nfs_buf_write_delayed(bp); @@ -3278,12 +3559,12 @@ nfs_flushcommits(nfsnode_t np, int nowait) done: FSDBG_BOT(557, np, 0, 0, error); - return (error); + return error; } /* * Flush all the blocks associated with a vnode. - * Walk through the buffer pool and push any dirty pages + * Walk through the buffer pool and push any dirty pages * associated with the vnode. */ int @@ -3297,25 +3578,27 @@ nfs_flush(nfsnode_t np, int waitfor, thread_t thd, int ignore_writeerr) FSDBG_TOP(517, np, waitfor, ignore_writeerr, 0); - if (!nmp) { + if (nfs_mount_gone(nmp)) { error = ENXIO; goto out; } nfsvers = nmp->nm_vers; - if (nmp->nm_flag & NFSMNT_INT) + if (NMFLAG(nmp, INTR)) { slpflag = PCATCH; + } if (!LIST_EMPTY(&np->n_dirtyblkhd)) { - nfs_lock(np, NFS_NODE_LOCK_FORCE); + nfs_node_lock_force(np); np->n_flag |= NMODIFIED; - nfs_unlock(np); + nfs_node_unlock(np); } lck_mtx_lock(nfs_buf_mutex); while (np->n_bflag & NBFLUSHINPROG) { np->n_bflag |= NBFLUSHWANT; error = msleep(&np->n_bflag, nfs_buf_mutex, slpflag, "nfs_flush", NULL); - if (error) { + if ((error && (error != EWOULDBLOCK)) || + ((error = nfs_sigintr(NFSTONMP(np), NULL, thd, 0)))) { lck_mtx_unlock(nfs_buf_mutex); goto out; } @@ -3342,18 +3625,21 @@ again: while ((bp = LIST_FIRST(&blist))) { LIST_REMOVE(bp, nb_vnbufs); LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs); - flags = (passone || (waitfor != MNT_WAIT)) ? 
NBAC_NOWAIT : 0; - if (flags != NBAC_NOWAIT) + flags = (passone || !(waitfor == MNT_WAIT || waitfor == MNT_DWAIT)) ? NBAC_NOWAIT : 0; + if (flags != NBAC_NOWAIT) { nfs_buf_refget(bp); + } while ((error = nfs_buf_acquire(bp, flags, slpflag, slptimeo))) { FSDBG(524, bp, flags, bp->nb_lflags, bp->nb_flags); - if (error == EBUSY) + if (error == EBUSY) { break; + } if (error) { error2 = nfs_sigintr(NFSTONMP(np), NULL, thd, 0); if (error2) { - if (flags != NBAC_NOWAIT) + if (flags != NBAC_NOWAIT) { nfs_buf_refrele(bp); + } nfs_buf_itercomplete(np, &blist, NBI_DIRTY); lck_mtx_unlock(nfs_buf_mutex); error = error2; @@ -3365,24 +3651,27 @@ again: } } } - if (flags != NBAC_NOWAIT) + if (flags != NBAC_NOWAIT) { nfs_buf_refrele(bp); - if (error == EBUSY) + } + if (error == EBUSY) { continue; + } if (!bp->nb_np) { /* buffer is no longer valid */ nfs_buf_drop(bp); continue; } - if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) + if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) { nfs_buf_check_write_verifier(np, bp); + } if (!ISSET(bp->nb_flags, NB_DELWRI)) { /* buffer is no longer dirty */ nfs_buf_drop(bp); continue; } FSDBG(525, bp, passone, bp->nb_lflags, bp->nb_flags); - if ((passone || (waitfor != MNT_WAIT)) && + if ((passone || !(waitfor == MNT_WAIT || waitfor == MNT_DWAIT)) && ISSET(bp->nb_flags, NB_NEEDCOMMIT)) { nfs_buf_drop(bp); continue; @@ -3390,10 +3679,10 @@ again: nfs_buf_remfree(bp); lck_mtx_unlock(nfs_buf_mutex); if (ISSET(bp->nb_flags, NB_ERROR)) { - nfs_lock(np, NFS_NODE_LOCK_FORCE); + nfs_node_lock_force(np); np->n_error = bp->nb_error ? bp->nb_error : EIO; np->n_flag |= NWRITEERR; - nfs_unlock(np); + nfs_node_unlock(np); nfs_buf_release(bp, 1); lck_mtx_lock(nfs_buf_mutex); continue; @@ -3410,11 +3699,11 @@ again: } lck_mtx_unlock(nfs_buf_mutex); - if (waitfor == MNT_WAIT) { - while ((error = vnode_waitforwrites(NFSTOV(np), 0, slpflag, slptimeo, "nfsflush"))) { - error2 = nfs_sigintr(NFSTONMP(np), NULL, thd, 0); + if (waitfor == MNT_WAIT || waitfor == MNT_DWAIT) { + while ((error = vnode_waitforwrites(NFSTOV(np), 0, slpflag, slptimeo, "nfsflush"))) { + error2 = nfs_sigintr(NFSTONMP(np), NULL, thd, 0); if (error2) { - error = error2; + error = error2; goto done; } if (slpflag == PCATCH) { @@ -3427,38 +3716,56 @@ again: if (nfsvers != NFS_VER2) { /* loop while it looks like there are still buffers to be */ /* commited and nfs_flushcommits() seems to be handling them. */ - while (np->n_needcommitcnt) - if (nfs_flushcommits(np, 0)) + while (np->n_needcommitcnt) { + if (nfs_flushcommits(np, 0)) { break; + } + } } if (passone) { passone = 0; if (!LIST_EMPTY(&np->n_dirtyblkhd)) { - nfs_lock(np, NFS_NODE_LOCK_FORCE); + nfs_node_lock_force(np); np->n_flag |= NMODIFIED; - nfs_unlock(np); + nfs_node_unlock(np); } lck_mtx_lock(nfs_buf_mutex); goto again; } - if (waitfor == MNT_WAIT) { + if (waitfor == MNT_WAIT || waitfor == MNT_DWAIT) { if (!LIST_EMPTY(&np->n_dirtyblkhd)) { - nfs_lock(np, NFS_NODE_LOCK_FORCE); + nfs_node_lock_force(np); np->n_flag |= NMODIFIED; - nfs_unlock(np); + nfs_node_unlock(np); } lck_mtx_lock(nfs_buf_mutex); - if (!LIST_EMPTY(&np->n_dirtyblkhd)) + if (!LIST_EMPTY(&np->n_dirtyblkhd)) { goto again; + } lck_mtx_unlock(nfs_buf_mutex); - nfs_lock(np, NFS_NODE_LOCK_FORCE); - /* if we have no dirty blocks, we can clear the modified flag */ - if (!np->n_wrbusy) + nfs_node_lock_force(np); + /* + * OK, it looks like there are no dirty blocks. If we have no + * writes in flight and no one in the write code, we can clear + * the modified flag. 
In order to make sure we see the latest + * attributes and size, we also invalidate the attributes and + * advance the attribute cache XID to guarantee that attributes + * newer than our clearing of NMODIFIED will get loaded next. + * (If we don't do this, it's possible for the flush's final + * write/commit (xid1) to be executed in parallel with a subsequent + * getattr request (xid2). The getattr could return attributes + * from *before* the write/commit completed but the stale attributes + * would be preferred because of the xid ordering.) + */ + if (!np->n_wrbusy && !np->n_numoutput) { np->n_flag &= ~NMODIFIED; + NATTRINVALIDATE(np); + nfs_get_xid(&np->n_xid); + } } else { - nfs_lock(np, NFS_NODE_LOCK_FORCE); + nfs_node_lock_force(np); } FSDBG(526, np->n_flag, np->n_error, 0, 0); @@ -3466,24 +3773,25 @@ again: error = np->n_error; np->n_flag &= ~NWRITEERR; } - nfs_unlock(np); + nfs_node_unlock(np); done: lck_mtx_lock(nfs_buf_mutex); flags = np->n_bflag; - np->n_bflag &= ~(NBFLUSHINPROG|NBFLUSHWANT); + np->n_bflag &= ~(NBFLUSHINPROG | NBFLUSHWANT); lck_mtx_unlock(nfs_buf_mutex); - if (flags & NBFLUSHWANT) + if (flags & NBFLUSHWANT) { wakeup(&np->n_bflag); + } out: FSDBG_BOT(517, np, error, ignore_writeerr, 0); - return (error); + return error; } /* * Flush out and invalidate all buffers associated with a vnode. * Called with the underlying object locked. */ -static int +int nfs_vinvalbuf_internal( nfsnode_t np, int flags, @@ -3497,8 +3805,9 @@ nfs_vinvalbuf_internal( int list, error = 0; if (flags & V_SAVE) { - if ((error = nfs_flush(np, MNT_WAIT, thd, (flags & V_IGNORE_WRITEERR)))) - return (error); + if ((error = nfs_flush(np, MNT_WAIT, thd, (flags & V_IGNORE_WRITEERR)))) { + return error; + } } lck_mtx_lock(nfs_buf_mutex); @@ -3506,15 +3815,17 @@ nfs_vinvalbuf_internal( list = NBI_CLEAN; if (nfs_buf_iterprepare(np, &blist, list)) { list = NBI_DIRTY; - if (nfs_buf_iterprepare(np, &blist, list)) + if (nfs_buf_iterprepare(np, &blist, list)) { break; + } } while ((bp = LIST_FIRST(&blist))) { LIST_REMOVE(bp, nb_vnbufs); - if (list == NBI_CLEAN) + if (list == NBI_CLEAN) { LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs); - else + } else { LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs); + } nfs_buf_refget(bp); while ((error = nfs_buf_acquire(bp, NBAC_REMOVE, slpflag, slptimeo))) { FSDBG(556, np, bp, NBOFF(bp), bp->nb_flags); @@ -3523,7 +3834,7 @@ nfs_vinvalbuf_internal( nfs_buf_refrele(bp); nfs_buf_itercomplete(np, &blist, list); lck_mtx_unlock(nfs_buf_mutex); - return (error); + return error; } } nfs_buf_refrele(bp); @@ -3533,18 +3844,21 @@ nfs_vinvalbuf_internal( (NBOFF(bp) < (off_t)np->n_size)) { /* extra paranoia: make sure we're not */ /* somehow leaving any dirty data around */ + nfsbufpgs pagemask; int mustwrite = 0; - int end = (NBOFF(bp) + bp->nb_bufsize > (off_t)np->n_size) ? - ((off_t)np->n_size - NBOFF(bp)) : bp->nb_bufsize; + off_t end = (NBOFF(bp) + bp->nb_bufsize > (off_t)np->n_size) ? 
+ (np->n_size - NBOFF(bp)) : bp->nb_bufsize; if (!ISSET(bp->nb_flags, NB_PAGELIST)) { error = nfs_buf_upl_setup(bp); if (error == EINVAL) { /* vm object must no longer exist */ /* hopefully we don't need to do */ /* anything for this buffer */ - } else if (error) + } else if (error) { printf("nfs_vinvalbuf: upl setup failed %d\n", error); - bp->nb_valid = bp->nb_dirty = 0; + } + NBPGS_ERASE(&bp->nb_valid); + NBPGS_ERASE(&bp->nb_dirty); } nfs_buf_upl_check(bp); /* check for any dirty data before the EOF */ @@ -3552,15 +3866,19 @@ nfs_vinvalbuf_internal( /* clip dirty range to EOF */ if (bp->nb_dirtyend > end) { bp->nb_dirtyend = end; - if (bp->nb_dirtyoff >= bp->nb_dirtyend) + if (bp->nb_dirtyoff >= bp->nb_dirtyend) { bp->nb_dirtyoff = bp->nb_dirtyend = 0; + } } - if ((bp->nb_dirtyend > 0) && (bp->nb_dirtyoff < end)) + if ((bp->nb_dirtyend > 0) && (bp->nb_dirtyoff < end)) { mustwrite++; + } } - bp->nb_dirty &= (1 << (round_page_32(end)/PAGE_SIZE)) - 1; - if (bp->nb_dirty) + nfs_buf_pgs_get_page_mask(&pagemask, round_page_64(end) / PAGE_SIZE); + nfs_buf_pgs_bit_and(&bp->nb_dirty, &pagemask, &bp->nb_dirty); + if (nfs_buf_pgs_is_set(&bp->nb_dirty)) { mustwrite++; + } /* also make sure we'll have a credential to do the write */ if (mustwrite && !IS_VALID_CRED(bp->nb_wcred) && !IS_VALID_CRED(cred)) { printf("nfs_vinvalbuf: found dirty buffer with no write creds\n"); @@ -3568,8 +3886,9 @@ nfs_vinvalbuf_internal( } if (mustwrite) { FSDBG(554, np, bp, 0xd00dee, bp->nb_flags); - if (!ISSET(bp->nb_flags, NB_PAGELIST)) + if (!ISSET(bp->nb_flags, NB_PAGELIST)) { panic("nfs_vinvalbuf: dirty buffer without upl"); + } /* gotta write out dirty data before invalidating */ /* (NB_STABLE indicates that data writes should be FILESYNC) */ /* (NB_NOCACHE indicates buffer should be discarded) */ @@ -3583,10 +3902,11 @@ nfs_vinvalbuf_internal( // Note: bp has been released if (error) { FSDBG(554, bp, 0xd00dee, 0xbad, error); - nfs_lock(np, NFS_NODE_LOCK_FORCE); - np->n_error = error; - np->n_flag |= NWRITEERR; - nfs_unlock(np); + nfs_node_lock_force(np); + if ((error != EINTR) && (error != ERESTART)) { + np->n_error = error; + np->n_flag |= NWRITEERR; + } /* * There was a write error and we need to * invalidate attrs to sync with server. @@ -3594,6 +3914,18 @@ nfs_vinvalbuf_internal( * we may no longer know the correct size) */ NATTRINVALIDATE(np); + nfs_node_unlock(np); + if ((error == EINTR) || (error == ERESTART)) { + /* + * Abort on EINTR. If we don't, we could + * be stuck in this loop forever because + * the buffer will continue to stay dirty. 
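The invalidation path above only has to push dirty data that still lies within the file: the dirty byte range is clipped to EOF and the dirty-page bitmap is masked down to the pages below the (rounded-up) EOF. A single-word-bitmap sketch of that clipping (assumes at most 64 pages per buffer; not the nfsbufpgs code):

#include <stdint.h>

/*
 * Sketch: 'end' is how many bytes of this buffer are still inside the
 * file.  Returns nonzero if dirty data remains that must be written
 * before the buffer can be invalidated.
 */
static int
clip_dirty_to_eof(uint64_t *dirtypages, int *dirtyoff, int *dirtyend,
    long long end, int pagesize)
{
    int mustwrite = 0;
    long long lastpg;
    uint64_t mask;

    if (*dirtyend > end) {
        *dirtyend = (int)end;
        if (*dirtyoff >= *dirtyend)
            *dirtyoff = *dirtyend = 0;
    }
    if (*dirtyend > 0 && *dirtyoff < end)
        mustwrite = 1;
    /* keep only the dirty bits for pages below round_page(end) */
    lastpg = (end + pagesize - 1) / pagesize;
    mask = (lastpg >= 64) ? ~0ULL : ((1ULL << lastpg) - 1);
    *dirtypages &= mask;
    if (*dirtypages != 0)
        mustwrite = 1;
    return mustwrite;
}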
+						 */
+						lck_mtx_lock(nfs_buf_mutex);
+						nfs_buf_itercomplete(np, &blist, list);
+						lck_mtx_unlock(nfs_buf_mutex);
+						return error;
+					}
 					error = 0;
 				}
 				lck_mtx_lock(nfs_buf_mutex);
@@ -3607,16 +3939,20 @@
 			}
 			nfs_buf_itercomplete(np, &blist, list);
 		}
-	if (!LIST_EMPTY(&(np)->n_dirtyblkhd) || !LIST_EMPTY(&(np)->n_cleanblkhd))
+	if (!LIST_EMPTY(&(np)->n_dirtyblkhd) || !LIST_EMPTY(&(np)->n_cleanblkhd)) {
 		panic("nfs_vinvalbuf: flush/inval failed");
+	}
 	lck_mtx_unlock(nfs_buf_mutex);
+	nfs_node_lock_force(np);
 	if (!(flags & V_SAVE)) {
-		nfs_lock(np, NFS_NODE_LOCK_FORCE);
 		np->n_flag &= ~NMODIFIED;
-		nfs_unlock(np);
 	}
+	if (vnode_vtype(NFSTOV(np)) == VREG) {
+		np->n_lastrahead = -1;
+	}
+	nfs_node_unlock(np);
 	NFS_BUF_FREEUP();
-	return (0);
+	return 0;
 }
 
 
@@ -3635,13 +3971,25 @@ nfs_vinvalbuf2(vnode_t vp, int flags, thread_t thd, kauth_cred_t cred, int intrf
 {
 	nfsnode_t np = VTONFS(vp);
 	struct nfsmount *nmp = VTONMP(vp);
-	int error, rv, slpflag, slptimeo, nflags;
+	int error, slpflag, slptimeo, nflags, retry = 0;
+	int ubcflags = UBC_PUSHALL | UBC_SYNC | UBC_INVALIDATE;
+	struct timespec ts = { .tv_sec = 2, .tv_nsec = 0 };
 	off_t size;
 
 	FSDBG_TOP(554, np, flags, intrflg, 0);
 
-	if (nmp && !(nmp->nm_flag & NFSMNT_INT))
+	/*
+	 * If the mount is gone, there is no sense in trying to write anything
+	 * and hanging while attempting to do I/O.
+	 */
+	if (nfs_mount_gone(nmp)) {
+		flags &= ~V_SAVE;
+		ubcflags &= ~UBC_PUSHALL;
+	}
+
+	if (nmp && !NMFLAG(nmp, INTR)) {
 		intrflg = 0;
+	}
 	if (intrflg) {
 		slpflag = PCATCH;
 		slptimeo = 2 * hz;
@@ -3654,40 +4002,109 @@ nfs_vinvalbuf2(vnode_t vp, int flags, thread_t thd, kauth_cred_t cred, int intrf
 	lck_mtx_lock(nfs_buf_mutex);
 	while (np->n_bflag & NBINVALINPROG) {
 		np->n_bflag |= NBINVALWANT;
-		error = msleep(&np->n_bflag, nfs_buf_mutex, slpflag, "nfs_vinvalbuf", NULL);
-		if (error) {
+		msleep(&np->n_bflag, nfs_buf_mutex, slpflag, "nfs_vinvalbuf", &ts);
+		if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0))) {
 			lck_mtx_unlock(nfs_buf_mutex);
-			return (error);
+			return error;
+		}
+		if (np->n_bflag & NBINVALINPROG) {
+			slpflag = 0;
 		}
 	}
 	np->n_bflag |= NBINVALINPROG;
 	lck_mtx_unlock(nfs_buf_mutex);
 
 	/* Now, flush as required. */
+again:
 	error = nfs_vinvalbuf_internal(np, flags, thd, cred, slpflag, 0);
 	while (error) {
 		FSDBG(554, np, 0, 0, error);
-		if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0)))
+		if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0))) {
 			goto done;
+		}
 		error = nfs_vinvalbuf_internal(np, flags, thd, cred, 0, slptimeo);
 	}
 
 	/* get the pages out of vm also */
-	if (UBCINFOEXISTS(vp) && (size = ubc_getsize(vp)))
-		if (!(rv = ubc_sync_range(vp, 0, size, UBC_PUSHALL | UBC_SYNC | UBC_INVALIDATE)))
-			panic("nfs_vinvalbuf(): ubc_sync_range failed!");
+	if (UBCINFOEXISTS(vp) && (size = ubc_getsize(vp))) {
+		if ((error = ubc_msync(vp, 0, size, NULL, ubcflags))) {
+			if (error == EINVAL) {
+				panic("nfs_vinvalbuf(): ubc_msync failed!, error %d", error);
+			}
+			if (retry++ < 10) { /* retry invalidating a few times */
+				if (retry > 1 || error == ENXIO) {
+					ubcflags &= ~UBC_PUSHALL;
+				}
+				goto again;
+			}
+			/* give up */
+			printf("nfs_vinvalbuf(): ubc_msync failed!, error %d\n", error);
+		}
+	}
 done:
 	lck_mtx_lock(nfs_buf_mutex);
 	nflags = np->n_bflag;
-	np->n_bflag &= ~(NBINVALINPROG|NBINVALWANT);
+	np->n_bflag &= ~(NBINVALINPROG | NBINVALWANT);
 	lck_mtx_unlock(nfs_buf_mutex);
-	if (nflags & NBINVALWANT)
+	if (nflags & NBINVALWANT) {
 		wakeup(&np->n_bflag);
+	}
 
 	FSDBG_BOT(554, np, flags, intrflg, error);
-	return (error);
+	return error;
 }
 
+/*
+ * Wait for any busy buffers to complete.
+ */
+void
+nfs_wait_bufs(nfsnode_t np)
+{
+	struct nfsbuf *bp;
+	struct nfsbuflists blist;
+	int error = 0;
+
+	lck_mtx_lock(nfs_buf_mutex);
+	if (!nfs_buf_iterprepare(np, &blist, NBI_CLEAN)) {
+		while ((bp = LIST_FIRST(&blist))) {
+			LIST_REMOVE(bp, nb_vnbufs);
+			LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
+			nfs_buf_refget(bp);
+			while ((error = nfs_buf_acquire(bp, 0, 0, 0))) {
+				if (error != EAGAIN) {
+					nfs_buf_refrele(bp);
+					nfs_buf_itercomplete(np, &blist, NBI_CLEAN);
+					lck_mtx_unlock(nfs_buf_mutex);
+					return;
+				}
+			}
+			nfs_buf_refrele(bp);
+			nfs_buf_drop(bp);
+		}
+		nfs_buf_itercomplete(np, &blist, NBI_CLEAN);
+	}
+	if (!nfs_buf_iterprepare(np, &blist, NBI_DIRTY)) {
+		while ((bp = LIST_FIRST(&blist))) {
+			LIST_REMOVE(bp, nb_vnbufs);
+			LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
+			nfs_buf_refget(bp);
+			while ((error = nfs_buf_acquire(bp, 0, 0, 0))) {
+				if (error != EAGAIN) {
+					nfs_buf_refrele(bp);
+					nfs_buf_itercomplete(np, &blist, NBI_DIRTY);
+					lck_mtx_unlock(nfs_buf_mutex);
+					return;
+				}
+			}
+			nfs_buf_refrele(bp);
+			nfs_buf_drop(bp);
+		}
+		nfs_buf_itercomplete(np, &blist, NBI_DIRTY);
	}
+	lck_mtx_unlock(nfs_buf_mutex);
+}
+
+
 /*
  * Add an async I/O request to the mount's async I/O queue and make
  * sure that an nfsiod will service it.
@@ -3701,8 +4118,12 @@ nfs_asyncio_finish(struct nfsreq *req)
 	FSDBG_TOP(552, nmp, 0, 0, 0);
 
 again:
-	if (((nmp = req->r_nmp)) == NULL)
+	nmp = req->r_nmp;
+
+	if (nmp == NULL) {
 		return;
+	}
+
 	lck_mtx_lock(nfsiod_mutex);
 	niod = nmp->nm_niod;
 
@@ -3721,14 +4142,38 @@
 		 */
 		lck_mtx_unlock(nfsiod_mutex);
 		started++;
-		if (!nfsiod_start())
+		if (!nfsiod_start()) {
 			goto again;
+		}
 		lck_mtx_lock(nfsiod_mutex);
 	}
 
-	if (req->r_achain.tqe_next == NFSREQNOLIST)
+	/*
+	 * If we got here while being on the resendq we need to get off. This
+	 * happens when the timer fires and errors out requests from nfs_sigintr
+	 * or we receive a reply (UDP case) while being on the resend queue so
+	 * we're just finishing up and are not going to be resent.
+	 */
+	lck_mtx_lock(&req->r_mtx);
+	if (req->r_flags & R_RESENDQ) {
+		lck_mtx_lock(&nmp->nm_lock);
+		if ((req->r_flags & R_RESENDQ) && req->r_rchain.tqe_next != NFSREQNOLIST) {
+			NFS_BIO_DBG("Processing async request on resendq. Removing");
+			TAILQ_REMOVE(&nmp->nm_resendq, req, r_rchain);
+			req->r_flags &= ~R_RESENDQ;
+			req->r_rchain.tqe_next = NFSREQNOLIST;
+			assert(req->r_refs > 1);
+			/* Remove resendq reference */
+			req->r_refs--;
+		}
+		lck_mtx_unlock(&nmp->nm_lock);
+	}
+	lck_mtx_unlock(&req->r_mtx);
+
+	if (req->r_achain.tqe_next == NFSREQNOLIST) {
 		TAILQ_INSERT_TAIL(&nmp->nm_iodq, req, r_achain);
+	}
 
 	/* If this mount doesn't already have an nfsiod working on it... */
 	if (!nmp->nm_niod) {
@@ -3737,8 +4182,10 @@
 		lck_mtx_unlock(nfsiod_mutex);
 		wakeup(niod);
 	} else if (nfsiod_thread_count > 0) {
-		/* just queue it up on nfsiod mounts queue */
-		TAILQ_INSERT_TAIL(&nfsiodmounts, nmp, nm_iodlink);
+		/* just queue it up on nfsiod mounts queue if needed */
+		if (nmp->nm_iodlink.tqe_next == NFSNOLIST) {
+			TAILQ_INSERT_TAIL(&nfsiodmounts, nmp, nm_iodlink);
+		}
 		lck_mtx_unlock(nfsiod_mutex);
 	} else {
 		printf("nfs_asyncio(): no nfsiods? %d %d (%d)\n", nfsiod_thread_count, NFSIOD_MAX, started);
@@ -3756,83 +4203,64 @@
 
 /*
  * queue up async I/O request for resend
+ * Must be called with req->r_mtx locked.
 */
 void
 nfs_asyncio_resend(struct nfsreq *req)
 {
 	struct nfsmount *nmp = req->r_nmp;
 
-	if (!nmp)
+	if (nfs_mount_gone(nmp)) {
 		return;
+	}
+
+#if CONFIG_NFS_GSS
 	nfs_gss_clnt_rpcdone(req);
+#endif
 	lck_mtx_lock(&nmp->nm_lock);
-	if (req->r_rchain.tqe_next == NFSREQNOLIST) {
+	if (!(req->r_flags & R_RESENDQ)) {
 		TAILQ_INSERT_TAIL(&nmp->nm_resendq, req, r_rchain);
 		req->r_flags |= R_RESENDQ;
+		/*
+		 * We take a reference on this request so that it can't be
+		 * destroyed while a resend is queued or in progress.
+		 */
+		nfs_request_ref(req, 1);
 	}
 	nfs_mount_sock_thread_wake(nmp);
 	lck_mtx_unlock(&nmp->nm_lock);
 }
 
 /*
- * Read an NFS buffer for a directory.
+ * Read directory data into a buffer.
+ *
+ * Buffer will be filled (unless EOF is hit).
+ * Buffers after this one may also be completely/partially filled.
  */
 int
 nfs_buf_readdir(struct nfsbuf *bp, vfs_context_t ctx)
 {
-	nfsnode_t np;
-	vnode_t vp;
-	struct nfsmount *nmp;
-	int error = 0, nfsvers;
-	struct uio uio;
-	struct iovec_32 io;
-
-	np = bp->nb_np;
-	vp = NFSTOV(np);
-	nmp = VTONMP(vp);
-	nfsvers = nmp->nm_vers;
-	uio.uio_iovs.iov32p = &io;
-	uio.uio_iovcnt = 1;
-#if 1 /* LP64todo - can't use new segment flags until the drivers are ready */
-	uio.uio_segflg = UIO_SYSSPACE;
-#else
-	uio.uio_segflg = UIO_SYSSPACE32;
-#endif
-
-	/* sanity check */
-	if (ISSET(bp->nb_flags, NB_DONE))
-		CLR(bp->nb_flags, NB_DONE);
+	nfsnode_t np = bp->nb_np;
+	struct nfsmount *nmp = NFSTONMP(np);
+	int error = 0;
 
-	uio.uio_rw = UIO_READ;
-	io.iov_len = bp->nb_bufsize;
-	uio_uio_resid_set(&uio, io.iov_len);
-	io.iov_base = (uintptr_t) bp->nb_data;
-	uio.uio_offset = NBOFF(bp);
+	if (nfs_mount_gone(nmp)) {
+		return ENXIO;
+	}
 
-	OSAddAtomic(1, (SInt32*)&nfsstats.readdir_bios);
-	if (nfsvers < NFS_VER4) {
-		if (nmp->nm_flag & NFSMNT_RDIRPLUS) {
-			error = nfs3_readdirplus_rpc(np, &uio, ctx);
-			if (error == NFSERR_NOTSUPP) {
-				lck_mtx_lock(&nmp->nm_lock);
-				nmp->nm_flag &= ~NFSMNT_RDIRPLUS;
-				lck_mtx_unlock(&nmp->nm_lock);
-			}
-		}
-		if (!(nmp->nm_flag & NFSMNT_RDIRPLUS))
-			error = nfs3_readdir_rpc(np, &uio, ctx);
-	} else {
-		error = nfs4_readdir_rpc(np, &uio, ctx);
+	if (nmp->nm_vers < NFS_VER4) {
+		error = nfs3_readdir_rpc(np, bp, ctx);
 	}
-	if (error) {
+#if CONFIG_NFS4
+	else {
+		error = nfs4_readdir_rpc(np, bp, ctx);
+	}
+#endif
+	if (error && (error != NFSERR_DIRBUFDROPPED)) {
 		SET(bp->nb_flags, NB_ERROR);
 		bp->nb_error = error;
-	} else {
-		bp->nb_validoff = 0;
-		bp->nb_validend = uio.uio_offset - NBOFF(bp);
-		bp->nb_valid = (1 << (round_page_32(bp->nb_validend)/PAGE_SIZE)) - 1;
 	}
-
-	nfs_buf_iodone(bp);
-	return (error);
+	return error;
 }
+
+#endif /* CONFIG_NFS_CLIENT */
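
For illustration, the EOF clipping that nfs_vinvalbuf_internal now performs with nfs_buf_pgs_get_page_mask() and nfs_buf_pgs_bit_and() amounts to "build a mask of every page below the EOF page, then AND it into the dirty bitmap". The standalone sketch below models that idea with invented names (demo_pages_t, demo_mask_below, demo_and) and a fixed four-word bitmap; it is only a rough approximation of the multi-word nfsbufpgs handling, not the kernel code.

#include <stdint.h>
#include <stdio.h>

#define DEMO_WORDS 4    /* 4 x 64 bits = up to 256 pages per buffer */

typedef struct {
	uint64_t w[DEMO_WORDS];
} demo_pages_t;

/* Build a mask with bits [0, npages) set: "every page below npages". */
static void
demo_mask_below(demo_pages_t *m, unsigned int npages)
{
	for (unsigned int i = 0; i < DEMO_WORDS; i++) {
		unsigned int base = i * 64;
		if (npages >= base + 64) {
			m->w[i] = ~0ULL;                         /* whole word is below the cutoff */
		} else if (npages > base) {
			m->w[i] = (1ULL << (npages - base)) - 1; /* partial word */
		} else {
			m->w[i] = 0;                             /* entirely at or past the cutoff */
		}
	}
}

/* dst = a & b, word by word. */
static void
demo_and(demo_pages_t *dst, const demo_pages_t *a, const demo_pages_t *b)
{
	for (unsigned int i = 0; i < DEMO_WORDS; i++) {
		dst->w[i] = a->w[i] & b->w[i];
	}
}

int
main(void)
{
	demo_pages_t dirty = {{ ~0ULL, 0x3, 0, 0 }};    /* pretend pages 0..65 are dirty */
	demo_pages_t mask;

	/* Clip at page 64, i.e. the role of round_page(end) / PAGE_SIZE above. */
	demo_mask_below(&mask, 64);
	demo_and(&dirty, &dirty, &mask);

	printf("dirty words after clipping: 0x%llx 0x%llx\n",
	    (unsigned long long)dirty.w[0], (unsigned long long)dirty.w[1]);
	return 0;
}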
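The resend-queue changes above (nfs_asyncio_resend together with the resendq removal in nfs_asyncio_finish) pair queue membership with a request reference, so a request cannot be destroyed while it sits on nm_resendq. A minimal userspace sketch of that pattern, assuming invented names (demo_req, demo_resend_enqueue, demo_resend_dequeue) and a pthread mutex standing in for the mount's nm_lock, with a plain counter standing in for nfs_request_ref()/r_refs:

#include <pthread.h>
#include <stdbool.h>
#include <sys/queue.h>

struct demo_req {
	int                   refs;        /* reference count on the request */
	bool                  on_resendq;  /* stands in for the R_RESENDQ flag */
	TAILQ_ENTRY(demo_req) rchain;
};

static TAILQ_HEAD(, demo_req) demo_resendq = TAILQ_HEAD_INITIALIZER(demo_resendq);
static pthread_mutex_t demo_resendq_lock = PTHREAD_MUTEX_INITIALIZER;

/* Queue a request for resend, taking a reference so it can't be freed while queued. */
void
demo_resend_enqueue(struct demo_req *req)
{
	pthread_mutex_lock(&demo_resendq_lock);
	if (!req->on_resendq) {
		req->refs++;                    /* reference owned by the queue */
		req->on_resendq = true;
		TAILQ_INSERT_TAIL(&demo_resendq, req, rchain);
	}
	pthread_mutex_unlock(&demo_resendq_lock);
}

/* Pull a request off the resend queue (e.g. it completed first), dropping that reference. */
void
demo_resend_dequeue(struct demo_req *req)
{
	pthread_mutex_lock(&demo_resendq_lock);
	if (req->on_resendq) {
		TAILQ_REMOVE(&demo_resendq, req, rchain);
		req->on_resendq = false;
		req->refs--;                    /* queue's reference released */
	}
	pthread_mutex_unlock(&demo_resendq_lock);
}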