apple/xnu.git blobdiff: bsd/nfs/nfs_bio.c (xnu-2782.1.97)

diff --git a/bsd/nfs/nfs_bio.c b/bsd/nfs/nfs_bio.c
index b1dccb036b823f4b54e52ec78ac257d53b4b685a..a58e5d866ab4899add3726b19fa39f0b408e2420 100644
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2014 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -77,6 +77,7 @@
 #include <sys/kernel.h>
 #include <sys/ubc_internal.h>
 #include <sys/uio_internal.h>
+#include <sys/kpi_mbuf.h>
 
 #include <sys/vm.h>
 #include <sys/vmparam.h>
@@ -345,7 +346,7 @@ nfs_buf_page_inval(vnode_t vp, off_t offset)
        struct nfsbuf *bp;
        int error = 0;
 
-       if (!nmp)
+       if (nfs_mount_gone(nmp))
                return (ENXIO);
 
        lck_mtx_lock(nfs_buf_mutex);
@@ -364,11 +365,20 @@ nfs_buf_page_inval(vnode_t vp, off_t offset)
         */
        if (bp->nb_dirtyend > 0) {
                int start = offset - NBOFF(bp);
-               if (bp->nb_dirtyend <= start ||
-                   bp->nb_dirtyoff >= (start + PAGE_SIZE))
-                       error = 0;
-               else
+               if ((bp->nb_dirtyend > start) &&
+                   (bp->nb_dirtyoff < (start + PAGE_SIZE))) {
+                       /*
+                        * Before returning the bad news, move the
+                        * buffer to the start of the delwri list and
+                        * give the list a push to try to flush the
+                        * buffer out.
+                        */
                        error = EBUSY;
+                       nfs_buf_remfree(bp);
+                       TAILQ_INSERT_HEAD(&nfsbufdelwri, bp, nb_free);
+                       nfsbufdelwricnt++;
+                       nfs_buf_delwri_push(1);
+               }
        }
 out:
        lck_mtx_unlock(nfs_buf_mutex);
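Editor's note: the hunk above stops nfs_buf_page_inval() from just reporting EBUSY when the page overlaps a dirty region; the buffer is first moved to the head of the delayed-write (delwri) list and the delwri service is kicked so the conflicting data gets flushed soon. A minimal user-space sketch of that "requeue at head, then wake the flusher" pattern using sys/queue.h; the type and function names below are illustrative, not xnu's:

    #include <sys/queue.h>
    #include <errno.h>

    struct buf {
            TAILQ_ENTRY(buf) b_free;
    };
    TAILQ_HEAD(bufqueue, buf);

    static struct bufqueue delwri_queue = TAILQ_HEAD_INITIALIZER(delwri_queue);
    static int delwri_count;

    static void delwri_push(void)
    {
            /* in the kernel this wakes nfs_buf_delwri_thread() */
    }

    static int page_inval_conflict(struct buf *bp)
    {
            /* put the dirty buffer first in line for the flusher... */
            TAILQ_INSERT_HEAD(&delwri_queue, bp, b_free);
            delwri_count++;
            delwri_push();
            /* ...and still report that the page cannot be invalidated yet */
            return EBUSY;
    }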
@@ -481,7 +491,7 @@ nfs_buf_map(struct nfsbuf *bp)
        if (!ISSET(bp->nb_flags, NB_PAGELIST))
                return (EINVAL);
 
-       kret = ubc_upl_map(bp->nb_pagelist, (vm_address_t *)&(bp->nb_data));
+       kret = ubc_upl_map(bp->nb_pagelist, (vm_offset_t *)&(bp->nb_data));
        if (kret != KERN_SUCCESS)
                panic("nfs_buf_map: ubc_upl_map() failed with (%d)", kret);
        if (bp->nb_data == 0)
@@ -523,7 +533,7 @@ nfs_buf_normalize_valid_range(nfsnode_t np, struct nfsbuf *bp)
  * process some entries on the delayed write queue
  * (must be called with nfs_buf_mutex held)
  */
-static void
+void
 nfs_buf_delwri_service(void)
 {
        struct nfsbuf *bp;
@@ -565,7 +575,7 @@ nfs_buf_delwri_service(void)
 /*
  * thread to service the delayed write queue when asked
  */
-static void
+void
 nfs_buf_delwri_thread(__unused void *arg, __unused wait_result_t wr)
 {
        struct timespec ts = { 30, 0 };
@@ -585,7 +595,7 @@ nfs_buf_delwri_thread(__unused void *arg, __unused wait_result_t wr)
  * try to push out some delayed/uncommitted writes
  * ("locked" indicates whether nfs_buf_mutex is already held)
  */
-static void
+void
 nfs_buf_delwri_push(int locked)
 {
        if (TAILQ_EMPTY(&nfsbufdelwri))
@@ -627,7 +637,7 @@ int
 nfs_buf_get(
        nfsnode_t np,
        daddr64_t blkno,
-       int size,
+       uint32_t size,
        thread_t thd,
        int flags,
        struct nfsbuf **bpp)
@@ -635,7 +645,7 @@ nfs_buf_get(
        vnode_t vp = NFSTOV(np);
        struct nfsmount *nmp = VTONMP(vp);
        struct nfsbuf *bp;
-       int bufsize;
+       uint32_t bufsize;
        int slpflag = PCATCH;
        int operation = (flags & NBLK_OPMASK);
        int error = 0;
@@ -648,14 +658,14 @@ nfs_buf_get(
        if (bufsize > NFS_MAXBSIZE)
                panic("nfs_buf_get: buffer larger than NFS_MAXBSIZE requested");
 
-       if (!nmp) {
+       if (nfs_mount_gone(nmp)) {
                FSDBG_BOT(541, np, blkno, 0, ENXIO);
                return (ENXIO);
        }
 
        if (!UBCINFOEXISTS(vp)) {
                operation = NBLK_META;
-       } else if (bufsize < nmp->nm_biosize) {
+       } else if (bufsize < (uint32_t)nmp->nm_biosize) {
                /* reg files should always have biosize blocks */
                bufsize = nmp->nm_biosize;
        }
@@ -675,6 +685,21 @@ nfs_buf_get(
 loop:
        lck_mtx_lock(nfs_buf_mutex);
 
+       /* wait for any buffer invalidation/flushing to complete */
+       while (np->n_bflag & NBINVALINPROG) {
+               np->n_bflag |= NBINVALWANT;
+               ts.tv_sec = 2;
+               ts.tv_nsec = 0;
+               msleep(&np->n_bflag, nfs_buf_mutex, slpflag, "nfs_buf_get_invalwait", &ts);
+               if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0))) {
+                       lck_mtx_unlock(nfs_buf_mutex);
+                       FSDBG_BOT(541, np, blkno, 0, error);
+                       return (error);
+               }
+               if (np->n_bflag & NBINVALINPROG)
+                       slpflag = 0;
+       }
+
        /* check for existence of nfsbuf in cache */
        if ((bp = nfs_buf_incore(np, blkno))) {
                /* if busy, set wanted and wait */
@@ -855,8 +880,8 @@ loop:
                }
        }
 
-       /* setup nfsbuf */
-       bp->nb_lflags = NBL_BUSY;
+       /* set up nfsbuf */
+       SET(bp->nb_lflags, NBL_BUSY);
        bp->nb_flags = 0;
        bp->nb_lblkno = blkno;
        /* insert buf in hash */
@@ -969,9 +994,9 @@ nfs_buf_release(struct nfsbuf *bp, int freeup)
 
        vp = np ? NFSTOV(np) : NULL;
        if (vp && UBCINFOEXISTS(vp) && bp->nb_bufsize) {
-               int upl_flags;
+               int upl_flags, rv;
                upl_t upl;
-               int i, rv;
+               uint32_t i;
 
                if (!ISSET(bp->nb_flags, NB_PAGELIST) && !ISSET(bp->nb_flags, NB_INVAL)) {
                        rv = nfs_buf_upl_setup(bp);
@@ -1012,6 +1037,10 @@ nfs_buf_release(struct nfsbuf *bp, int freeup)
                                        upl_flags = UPL_COMMIT_SET_DIRTY;
                                else
                                        upl_flags = UPL_COMMIT_CLEAR_DIRTY;
+                               
+                               if (!ISSET(bp->nb_flags, (NB_NEEDCOMMIT | NB_DELWRI)))
+                                       upl_flags |= UPL_COMMIT_CLEAR_PRECIOUS;
+
                                ubc_upl_commit_range(upl,
                                        i*PAGE_SIZE, PAGE_SIZE,
                                        upl_flags |
@@ -1020,15 +1049,16 @@ nfs_buf_release(struct nfsbuf *bp, int freeup)
                        }
                }
 pagelist_cleanup_done:
-               /* was this the last buffer in the file? */
+               /* invalidate any pages past EOF */
                if (NBOFF(bp) + bp->nb_bufsize > (off_t)(np->n_size)) {
-                       /* if so, invalidate all pages of last buffer past EOF */
                        off_t start, end;
                        start = trunc_page_64(np->n_size) + PAGE_SIZE_64;
                        end = trunc_page_64(NBOFF(bp) + bp->nb_bufsize);
+                       if (start < NBOFF(bp))
+                               start = NBOFF(bp);
                        if (end > start) {
-                               if (!(rv = ubc_sync_range(vp, start, end, UBC_INVALIDATE)))
-                                       printf("nfs_buf_release(): ubc_sync_range failed!\n");
+                               if ((rv = ubc_msync(vp, start, end, NULL, UBC_INVALIDATE)))
+                                       printf("nfs_buf_release(): ubc_msync failed!, error %d\n", rv);
                        }
                }
                CLR(bp->nb_flags, NB_PAGELIST);
@@ -1172,6 +1202,9 @@ nfs_buf_iodone(struct nfsbuf *bp)
                 * any throttled write operations
                 */
                vnode_writedone(NFSTOV(bp->nb_np));
+               nfs_node_lock_force(bp->nb_np);
+               bp->nb_np->n_numoutput--;
+               nfs_node_unlock(bp->nb_np);
        }
        if (ISSET(bp->nb_flags, NB_ASYNC)) {    /* if async, release it */
                SET(bp->nb_flags, NB_DONE);             /* note that it's done */
@@ -1219,9 +1252,9 @@ nfs_buf_write_delayed(struct nfsbuf *bp)
        vnode_waitforwrites(NFSTOV(np), VNODE_ASYNC_THROTTLE, 0, 0, "nfs_buf_write_delayed");
 
        /* the file is in a modified state, so make sure the flag's set */
-       nfs_lock(np, NFS_NODE_LOCK_FORCE);
+       nfs_node_lock_force(np);
        np->n_flag |= NMODIFIED;
-       nfs_unlock(np);
+       nfs_node_unlock(np);
 
        /*
         * If we have too many delayed write buffers,
@@ -1258,7 +1291,7 @@ nfs_buf_check_write_verifier(nfsnode_t np, struct nfsbuf *bp)
                return;
 
        nmp = NFSTONMP(np);
-       if (!nmp)
+       if (nfs_mount_gone(nmp))
                return;
        if (!ISSET(bp->nb_flags, NB_STALEWVERF) && (bp->nb_verf == nmp->nm_verf))
                return;
@@ -1266,10 +1299,10 @@ nfs_buf_check_write_verifier(nfsnode_t np, struct nfsbuf *bp)
        /* write verifier changed, clear commit/wverf flags */
        CLR(bp->nb_flags, (NB_NEEDCOMMIT | NB_STALEWVERF));
        bp->nb_verf = 0;
-       nfs_lock(np, NFS_NODE_LOCK_FORCE);
+       nfs_node_lock_force(np);
        np->n_needcommitcnt--;
        CHECK_NEEDCOMMITCNT(np);
-       nfs_unlock(np);
+       nfs_node_unlock(np);
 }
 
 /*
@@ -1303,7 +1336,7 @@ nfs_buf_acquire(struct nfsbuf *bp, int flags, int slpflag, int slptimeo)
 
        if (ISSET(bp->nb_lflags, NBL_BUSY)) {
                /*      
-                * since the mutex_lock may block, the buffer
+                * since the lck_mtx_lock may block, the buffer
                 * may become BUSY, so we need to recheck for
                 * a NOWAIT request
                 */
@@ -1442,7 +1475,7 @@ nfs_buf_read(struct nfsbuf *bp)
 
        NFS_BUF_MAP(bp);
 
-       OSAddAtomic(1, (SInt32 *)&nfsstats.read_bios);
+       OSAddAtomic64(1, &nfsstats.read_bios);
 
        error = nfs_buf_read_rpc(bp, thd, cred);
        /*
@@ -1468,7 +1501,7 @@ nfs_buf_read_finish(struct nfsbuf *bp)
                /* update valid range */
                bp->nb_validoff = 0;
                bp->nb_validend = bp->nb_endio;
-               if (bp->nb_endio < bp->nb_bufsize) { 
+               if (bp->nb_endio < (int)bp->nb_bufsize) { 
                        /*
                         * The read may be short because we have unflushed writes
                         * that are extending the file size and the reads hit the
@@ -1491,7 +1524,7 @@ nfs_buf_read_finish(struct nfsbuf *bp)
                bp->nb_valid = (1 << (round_page_32(bp->nb_validend) / PAGE_SIZE)) - 1;
                if (bp->nb_validend & PAGE_MASK) {
                        /* zero-fill remainder of last page */
-                       bzero(bp->nb_data + bp->nb_validend, bp->nb_bufsize - bp->nb_validend);
+                       bzero(bp->nb_data + bp->nb_validend, PAGE_SIZE - (bp->nb_validend & PAGE_MASK));
                }
        }
        nfs_buf_iodone(bp);
@@ -1506,13 +1539,14 @@ nfs_buf_read_rpc(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred)
        struct nfsmount *nmp;
        nfsnode_t np = bp->nb_np;
        int error = 0, nfsvers, async;
-       int offset, length, nmrsize, nrpcs, len;
+       int offset, nrpcs;
+       uint32_t nmrsize, length, len;
        off_t boff;
        struct nfsreq *req;
        struct nfsreq_cbinfo cb;
 
        nmp = NFSTONMP(np);
-       if (!nmp) {
+       if (nfs_mount_gone(nmp)) {
                bp->nb_error = error = ENXIO;
                SET(bp->nb_flags, NB_ERROR);
                nfs_buf_iodone(bp);
@@ -1557,6 +1591,8 @@ nfs_buf_read_rpc(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred)
                len = (length > nmrsize) ? nmrsize : length;
                cb.rcb_args[0] = offset;
                cb.rcb_args[1] = len;
+               if (nmp->nm_vers >= NFS_VER4)
+                       cb.rcb_args[2] = nmp->nm_stategenid;
                req = NULL;
                error = nmp->nm_funcs->nf_read_rpc_async(np, boff + offset, len, thd, cred, &cb, &req);
                if (error)
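Editor's note: cb.rcb_args[2] = nmp->nm_stategenid snapshots the mount's NFSv4 state generation at the moment the read RPC is issued; the completion handler later compares that snapshot against the current generation before initiating recovery, so only the first failure from a given generation triggers it. A generic sketch of the generation-counter pattern (all names hypothetical):

    #include <stdint.h>

    struct mount_state {
            uint32_t stategenid;        /* bumped each time recovery starts */
    };

    struct request {
            uint32_t issue_genid;       /* snapshot, like cb.rcb_args[2] */
    };

    static void start_recovery(struct mount_state *m)
    {
            m->stategenid++;
    }

    static void issue_request(struct request *req, struct mount_state *m)
    {
            req->issue_genid = m->stategenid;
    }

    static void on_state_error(struct request *req, struct mount_state *m)
    {
            /* Only act if nobody has started recovery since we sent the
             * request; otherwise the error came from a stale generation
             * and we just wait for the in-progress recovery to finish. */
            if (req->issue_genid == m->stategenid)
                    start_recovery(m);
    }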
@@ -1618,8 +1654,8 @@ nfs_buf_read_rpc_finish(struct nfsreq *req)
        nfsnode_t np;
        thread_t thd;
        kauth_cred_t cred;
-       struct uio uio;
-       struct iovec_32 io;
+       uio_t auio;
+       char uio_buf [ UIO_SIZEOF(1) ];
 
 finish:
        np = req->r_np;
@@ -1629,9 +1665,11 @@ finish:
                kauth_cred_ref(cred);
        cb = req->r_callback;
        bp = cb.rcb_bp;
+       if (cb.rcb_func) /* take an extra reference on the nfsreq in case we want to resend it later due to grace error */
+               nfs_request_ref(req, 0);
 
        nmp = NFSTONMP(np);
-       if (!nmp) {
+       if (nfs_mount_gone(nmp)) {
                SET(bp->nb_flags, NB_ERROR);
                bp->nb_error = error = ENXIO;
        }
@@ -1645,28 +1683,65 @@ finish:
        offset = cb.rcb_args[0];
        rlen = length = cb.rcb_args[1];
 
-       uio.uio_iovs.iov32p = &io;
-       uio.uio_iovcnt = 1;
-       uio.uio_rw = UIO_READ;
-#if 1  /* LP64todo - can't use new segment flags until the drivers are ready */
-       uio.uio_segflg = UIO_SYSSPACE;
-#else
-       uio.uio_segflg = UIO_SYSSPACE32;
-#endif
-       io.iov_len = length;
-       uio_uio_resid_set(&uio, io.iov_len);
-       uio.uio_offset = NBOFF(bp) + offset;
-       io.iov_base = (uintptr_t) bp->nb_data + offset;
+       auio = uio_createwithbuffer(1, NBOFF(bp) + offset, UIO_SYSSPACE,
+                                UIO_READ, &uio_buf, sizeof(uio_buf));
+       uio_addiov(auio, CAST_USER_ADDR_T(bp->nb_data + offset), length);
 
        /* finish the RPC */
-       error = nmp->nm_funcs->nf_read_rpc_async_finish(np, req, &uio, &rlen, &eof);
+       error = nmp->nm_funcs->nf_read_rpc_async_finish(np, req, auio, &rlen, &eof);
        if ((error == EINPROGRESS) && cb.rcb_func) {
                /* async request restarted */
+               if (cb.rcb_func)
+                       nfs_request_rele(req);
                if (IS_VALID_CRED(cred))
                        kauth_cred_unref(&cred);
                return;
        }
-
+       if ((nmp->nm_vers >= NFS_VER4) && nfs_mount_state_error_should_restart(error) && !ISSET(bp->nb_flags, NB_ERROR)) {
+               lck_mtx_lock(&nmp->nm_lock);
+               if ((error != NFSERR_OLD_STATEID) && (error != NFSERR_GRACE) && (cb.rcb_args[2] == nmp->nm_stategenid)) {
+                       NP(np, "nfs_buf_read_rpc_finish: error %d @ 0x%llx, 0x%x 0x%x, initiating recovery",
+                               error, NBOFF(bp)+offset, cb.rcb_args[2], nmp->nm_stategenid);
+                       nfs_need_recover(nmp, error);
+               }
+               lck_mtx_unlock(&nmp->nm_lock);
+               if (np->n_flag & NREVOKE) {
+                       error = EIO;
+               } else {
+                       if (error == NFSERR_GRACE) {
+                               if (cb.rcb_func) {
+                                       /*
+                                        * For an async I/O request, handle a grace delay just like
+                                        * jukebox errors.  Set the resend time and queue it up.
+                                        */
+                                       struct timeval now;
+                                       if (req->r_nmrep.nmc_mhead) {
+                                               mbuf_freem(req->r_nmrep.nmc_mhead);
+                                               req->r_nmrep.nmc_mhead = NULL;
+                                       }
+                                       req->r_error = 0;
+                                       microuptime(&now);
+                                       lck_mtx_lock(&req->r_mtx);
+                                       req->r_resendtime = now.tv_sec + 2;
+                                       req->r_xid = 0;                 // get a new XID
+                                       req->r_flags |= R_RESTART;
+                                       req->r_start = 0;
+                                       nfs_asyncio_resend(req);
+                                       lck_mtx_unlock(&req->r_mtx);
+                                       if (IS_VALID_CRED(cred))
+                                               kauth_cred_unref(&cred);
+                                       /* Note: nfsreq reference taken will be dropped later when finished */
+                                       return;
+                               }
+                               /* otherwise, just pause a couple seconds and retry */
+                               tsleep(&nmp->nm_state, (PZERO-1), "nfsgrace", 2*hz);
+                       }
+                       if (!(error = nfs_mount_state_wait_for_recovery(nmp))) {
+                               rlen = 0;
+                               goto readagain;
+                       }
+               }
+       }
        if (error) {
                SET(bp->nb_flags, NB_ERROR);
                bp->nb_error = error;
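Editor's note: the new NFSv4 error handling above treats NFSERR_GRACE on an async buffer read like a jukebox delay: free the stale reply, clear the error, set a resend time about two seconds out, zero the XID so a fresh one is assigned, flag the request for restart, and hand it to the async resend machinery; synchronous callers instead sleep briefly and retry via the readagain path. A compressed sketch of the async branch, with illustrative stand-ins for the nfsreq fields used above:

    #include <time.h>
    #include <stddef.h>

    struct async_req {
            void    *reply;             /* stands in for r_nmrep.nmc_mhead */
            int      error;
            time_t   resendtime;        /* earliest time to resend */
            unsigned xid;               /* 0 forces a fresh XID */
            int      restart;           /* like the R_RESTART flag */
    };

    static void queue_for_resend(struct async_req *req) { (void)req; }

    static void handle_grace_async(struct async_req *req)
    {
            req->reply      = NULL;     /* the real code frees the mbuf chain */
            req->error      = 0;        /* not a failure, just "try later" */
            req->resendtime = time(NULL) + 2;
            req->xid        = 0;
            req->restart    = 1;
            queue_for_resend(req);      /* nfs_asyncio_resend() in the kernel */
    }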
@@ -1692,19 +1767,24 @@ finish:
                 * requested, so we need to issue another read for the rest.
                 * (Don't bother if the buffer already hit an error.)
                 */
+readagain:
                offset += rlen;
                length -= rlen;
                cb.rcb_args[0] = offset;
                cb.rcb_args[1] = length;
-               error = nmp->nm_funcs->nf_read_rpc_async(np, offset, length, thd, cred, &cb, &rreq);
+               if (nmp->nm_vers >= NFS_VER4)
+                       cb.rcb_args[2] = nmp->nm_stategenid;
+               error = nmp->nm_funcs->nf_read_rpc_async(np, NBOFF(bp) + offset, length, thd, cred, &cb, &rreq);
                if (!error) {
                        if (IS_VALID_CRED(cred))
                                kauth_cred_unref(&cred);
                        if (!cb.rcb_func) {
                                /* if !async we'll need to wait for this RPC to finish */
                                req = rreq;
+                               rreq = NULL;
                                goto finish;
                        }
+                       nfs_request_rele(req);
                        /*
                         * We're done here.
                         * Outstanding RPC count is unchanged.
@@ -1717,6 +1797,8 @@ finish:
        }
 
 out:
+       if (cb.rcb_func)
+               nfs_request_rele(req);
        if (IS_VALID_CRED(cred))
                kauth_cred_unref(&cred);
 
@@ -1752,14 +1834,15 @@ out:
  * Do buffer readahead.
  * Initiate async I/O to read buffers not in cache.
  */
-static int
+int
 nfs_buf_readahead(nfsnode_t np, int ioflag, daddr64_t *rabnp, daddr64_t lastrabn, thread_t thd, kauth_cred_t cred)
 {
        struct nfsmount *nmp = NFSTONMP(np);
        struct nfsbuf *bp;
-       int error = 0, nra;
+       int error = 0;
+       uint32_t nra;
 
-       if (!nmp)
+       if (nfs_mount_gone(nmp))
                return (ENXIO);
        if (nmp->nm_readahead <= 0)
                return (0);
@@ -1768,9 +1851,17 @@ nfs_buf_readahead(nfsnode_t np, int ioflag, daddr64_t *rabnp, daddr64_t lastrabn
 
        for (nra = 0; (nra < nmp->nm_readahead) && (*rabnp <= lastrabn); nra++, *rabnp = *rabnp + 1) {
                /* check if block exists and is valid. */
+               if ((*rabnp * nmp->nm_biosize) >= (off_t)np->n_size) {
+                       /* stop reading ahead if we're beyond EOF */
+                       *rabnp = lastrabn;
+                       break;
+               }
                error = nfs_buf_get(np, *rabnp, nmp->nm_biosize, thd, NBLK_READ|NBLK_NOWAIT, &bp);
                if (error)
                        break;
+               nfs_node_lock_force(np);
+               np->n_lastrahead = *rabnp;
+               nfs_node_unlock(np);
                if (!bp)
                        continue;
                if ((ioflag & IO_NOCACHE) && ISSET(bp->nb_flags, NB_CACHE) &&
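Editor's note: two behavioural changes in this readahead hunk: the loop now stops as soon as the next block would start at or beyond EOF, and the last block actually read ahead is remembered in n_lastrahead so a later nfs_bioread() can avoid re-issuing the same readahead. The EOF test is just block number times block size against the file size; a small sketch of the clamped loop (names illustrative, and the real loop also skips blocks that are already cached):

    #include <stdint.h>

    static void readahead(int64_t *next, int64_t last, uint32_t max_ra,
        uint32_t biosize, uint64_t file_size)
    {
            for (uint32_t n = 0; n < max_ra && *next <= last; n++, (*next)++) {
                    if ((uint64_t)*next * biosize >= file_size) {
                            *next = last;       /* past EOF: stop reading ahead */
                            break;
                    }
                    /* ...issue an async read for block *next here... */
            }
    }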
@@ -1798,209 +1889,161 @@ nfs_buf_readahead(nfsnode_t np, int ioflag, daddr64_t *rabnp, daddr64_t lastrabn
 }
 
 /*
- * NFS buffer I/O for reading files/directories.
+ * NFS buffer I/O for reading files.
  */
 int
-nfs_bioread(nfsnode_t np, struct uio *uio, int ioflag, int *eofflag, vfs_context_t ctx)
+nfs_bioread(nfsnode_t np, uio_t uio, int ioflag, vfs_context_t ctx)
 {
        vnode_t vp = NFSTOV(np);
        struct nfsbuf *bp = NULL;
-       struct nfs_vattr nvattr;
        struct nfsmount *nmp = VTONMP(vp);
-       daddr64_t lbn, rabn = 0, lastrabn, maxrabn = -1, tlbn;
+       daddr64_t lbn, rabn = 0, lastrabn, maxrabn = -1;
        off_t diff;
        int error = 0, n = 0, on = 0;
-       int nfsvers, biosize;
-       caddr_t dp;
-       struct dirent *direntp = NULL;
-       enum vtype vtype;
+       int nfsvers, biosize, modified, readaheads = 0;
        thread_t thd;
        kauth_cred_t cred;
+       int64_t io_resid;
 
-       FSDBG_TOP(514, np, uio->uio_offset, uio_uio_resid(uio), ioflag);
-
-       if (uio_uio_resid(uio) == 0) {
-               FSDBG_BOT(514, np, 0xd1e0001, 0, 0);
-               return (0);
-       }
-       if (uio->uio_offset < 0) {
-               FSDBG_BOT(514, np, 0xd1e0002, 0, EINVAL);
-               return (EINVAL);
-       }
+       FSDBG_TOP(514, np, uio_offset(uio), uio_resid(uio), ioflag);
 
        nfsvers = nmp->nm_vers;
        biosize = nmp->nm_biosize;
        thd = vfs_context_thread(ctx);
        cred = vfs_context_ucred(ctx);
 
-       vtype = vnode_vtype(vp);
-       if ((vtype != VREG) && (vtype != VDIR)) {
-               printf("nfs_bioread: type %x unexpected\n", vtype);
+       if (vnode_vtype(vp) != VREG) {
+               printf("nfs_bioread: type %x unexpected\n", vnode_vtype(vp));
                FSDBG_BOT(514, np, 0xd1e0016, 0, EINVAL);
                return (EINVAL);
        }
 
        /*
-        * For nfs, cache consistency can only be maintained approximately.
+        * For NFS, cache consistency can only be maintained approximately.
         * Although RFC1094 does not specify the criteria, the following is
         * believed to be compatible with the reference port.
-        * For nfs:
-        * If the file's modify time on the server has changed since the
-        * last read rpc or you have written to the file,
-        * you may have lost data cache consistency with the
-        * server, so flush all of the file's data out of the cache.
-        * Then force a getattr rpc to ensure that you have up to date
-        * attributes.
+        * 
+        * If the file has changed since the last read RPC or you have
+        * written to the file, you may have lost data cache consistency
+        * with the server.  So, check for a change, and flush all of the
+        * file's data out of the cache.
         * NB: This implies that cache data can be read when up to
-        * NFS_MAXATTRTIMEO seconds out of date. If you find that you need
-        * current attributes this could be forced by calling
-        * NATTRINVALIDATE() before the nfs_getattr() call.
+        * NFS_MAXATTRTIMO seconds out of date. If you find that you
+        * need current attributes, nfs_getattr() can be forced to fetch
+        * new attributes (via NATTRINVALIDATE() or NGA_UNCACHED).
         */
 
        if (ISSET(np->n_flag, NUPDATESIZE))
                nfs_data_update_size(np, 0);
 
-       if ((error = nfs_lock(np, NFS_NODE_LOCK_EXCLUSIVE))) {
+       if ((error = nfs_node_lock(np))) {
                FSDBG_BOT(514, np, 0xd1e0222, 0, error);
                return (error);
        }
 
        if (np->n_flag & NNEEDINVALIDATE) {
                np->n_flag &= ~NNEEDINVALIDATE;
-               nfs_unlock(np);
-               nfs_vinvalbuf(vp, V_SAVE|V_IGNORE_WRITEERR, ctx, 1);
-               if ((error = nfs_lock(np, NFS_NODE_LOCK_EXCLUSIVE))) {
+               nfs_node_unlock(np);
+               error = nfs_vinvalbuf(vp, V_SAVE|V_IGNORE_WRITEERR, ctx, 1);
+               if (!error)
+                       error = nfs_node_lock(np);
+               if (error) {
                        FSDBG_BOT(514, np, 0xd1e0322, 0, error);
                        return (error);
                }
        }
 
-       if (np->n_flag & NMODIFIED) {
-               if (vtype == VDIR) {
-                       nfs_invaldir(np);
-                       nfs_unlock(np);
-                       error = nfs_vinvalbuf(vp, V_SAVE, ctx, 1);
-                       if (!error)
-                               error = nfs_lock(np, NFS_NODE_LOCK_EXCLUSIVE);
-                       if (error) {
-                               FSDBG_BOT(514, np, 0xd1e0003, 0, error);
-                               return (error);
-                       }
-               }
-               NATTRINVALIDATE(np);
-               error = nfs_getattr(np, &nvattr, ctx, 1);
-               if (error) {
-                       nfs_unlock(np);
-                       FSDBG_BOT(514, np, 0xd1e0004, 0, error);
-                       return (error);
-               }
-               if (vtype == VDIR) {
-                       /* if directory changed, purge any name cache entries */
-                       if (NFS_CHANGED_NC(nfsvers, np, &nvattr))
-                               cache_purge(vp);
-                       NFS_CHANGED_UPDATE_NC(nfsvers, np, &nvattr);
-               }
-               NFS_CHANGED_UPDATE(nfsvers, np, &nvattr);
-       } else {
-               error = nfs_getattr(np, &nvattr, ctx, 1);
-               if (error) {
-                       nfs_unlock(np);
-                       FSDBG_BOT(514, np, 0xd1e0005, 0, error);
-                       return (error);
-               }
-               if (NFS_CHANGED(nfsvers, np, &nvattr)) {
-                       if (vtype == VDIR) {
-                               nfs_invaldir(np);
-                               /* purge name cache entries */
-                               if (NFS_CHANGED_NC(nfsvers, np, &nvattr))
-                                       cache_purge(vp);
-                       }
-                       nfs_unlock(np);
-                       error = nfs_vinvalbuf(vp, V_SAVE, ctx, 1);
-                       if (!error)
-                               error = nfs_lock(np, NFS_NODE_LOCK_EXCLUSIVE);
-                       if (error) {
-                               FSDBG_BOT(514, np, 0xd1e0006, 0, error);
-                               return (error);
-                       }
-                       if (vtype == VDIR)
-                               NFS_CHANGED_UPDATE_NC(nfsvers, np, &nvattr);
-                       NFS_CHANGED_UPDATE(nfsvers, np, &nvattr);
-               }
+       modified = (np->n_flag & NMODIFIED);
+       nfs_node_unlock(np);
+       /* nfs_getattr() will check changed and purge caches */
+       error = nfs_getattr(np, NULL, ctx, modified ? NGA_UNCACHED : NGA_CACHED);
+       if (error) {
+               FSDBG_BOT(514, np, 0xd1e0004, 0, error);
+               return (error);
        }
 
-       nfs_unlock(np);
+       if (uio_resid(uio) == 0) {
+               FSDBG_BOT(514, np, 0xd1e0001, 0, 0);
+               return (0);
+       }
+       if (uio_offset(uio) < 0) {
+               FSDBG_BOT(514, np, 0xd1e0002, 0, EINVAL);
+               return (EINVAL);
+       }
 
-       if (vtype == VREG) {
-               if ((ioflag & IO_NOCACHE) && (uio_uio_resid(uio) < (2*biosize))) {
-                       /* We have only a block or so to read, just do the rpc directly. */
-                       error = nfs_read_rpc(np, uio, ctx);
-                       FSDBG_BOT(514, np, uio->uio_offset, uio_uio_resid(uio), error);
-                       return (error);
-               }
-               /*
-                * set up readahead - which may be limited by:
-                * + current request length (for IO_NOCACHE)
-                * + readahead setting
-                * + file size
-                */
-               if (nmp->nm_readahead > 0) {
-                       off_t end = uio->uio_offset + uio_uio_resid(uio);
-                       if (end > (off_t)np->n_size)
-                               end = np->n_size;
-                       rabn = uio->uio_offset / biosize;
-                       maxrabn = (end - 1) / biosize;
-                       if (!(ioflag & IO_NOCACHE) &&
-                           (!rabn || (rabn == np->n_lastread) || (rabn == (np->n_lastread+1)))) {
-                               maxrabn += nmp->nm_readahead;
-                               if ((maxrabn * biosize) >= (off_t)np->n_size)
-                                       maxrabn = ((off_t)np->n_size - 1)/biosize;
-                       }
-               } else {
-                       rabn = maxrabn = 0;
-               }
+       /*
+        * set up readahead - which may be limited by:
+        * + current request length (for IO_NOCACHE)
+        * + readahead setting
+        * + file size
+        */
+       if (nmp->nm_readahead > 0) {
+               off_t end = uio_offset(uio) + uio_resid(uio);
+               if (end > (off_t)np->n_size)
+                       end = np->n_size;
+               rabn = uio_offset(uio) / biosize;
+               maxrabn = (end - 1) / biosize;
+               nfs_node_lock_force(np);
+               if (!(ioflag & IO_NOCACHE) &&
+                   (!rabn || (rabn == np->n_lastread) || (rabn == (np->n_lastread+1)))) {
+                       maxrabn += nmp->nm_readahead;
+                       if ((maxrabn * biosize) >= (off_t)np->n_size)
+                               maxrabn = ((off_t)np->n_size - 1)/biosize;
+               }
+               if (maxrabn < np->n_lastrahead)
+                       np->n_lastrahead = -1;
+               if (rabn < np->n_lastrahead)
+                       rabn = np->n_lastrahead + 1;
+               nfs_node_unlock(np);
+       } else {
+               rabn = maxrabn = 0;
        }
 
        do {
 
-           if (vtype == VREG) {
-               nfs_data_lock(np, NFS_NODE_LOCK_SHARED);
-               lbn = uio->uio_offset / biosize;
+               nfs_data_lock(np, NFS_DATA_LOCK_SHARED);
+               lbn = uio_offset(uio) / biosize;
 
                /*
                 * Copy directly from any cached pages without grabbing the bufs.
-                *
-                * Note: for "nocache" reads, we don't copy directly from UBC
-                * because any cached pages will be for readahead buffers that
-                * need to be invalidated anyway before we finish this request.
+                * (If we are NOCACHE and we've issued readahead requests, we need
+                * to grab the NB_NCRDAHEAD bufs to drop them.)
                 */
-               if (!(ioflag & IO_NOCACHE) &&
-                       (uio->uio_segflg == UIO_USERSPACE32 ||
-                        uio->uio_segflg == UIO_USERSPACE64 ||
-                        uio->uio_segflg == UIO_USERSPACE)) {
-                       // LP64todo - fix this!
-                       int io_resid = uio_uio_resid(uio);
-                       diff = np->n_size - uio->uio_offset;
+               if ((!(ioflag & IO_NOCACHE) || !readaheads) &&
+                   ((uio->uio_segflg == UIO_USERSPACE32 ||
+                     uio->uio_segflg == UIO_USERSPACE64 ||
+                     uio->uio_segflg == UIO_USERSPACE))) {
+                       io_resid = uio_resid(uio);
+                       diff = np->n_size - uio_offset(uio);
                        if (diff < io_resid)
                                io_resid = diff;
                        if (io_resid > 0) {
-                               error = cluster_copy_ubc_data(vp, uio, &io_resid, 0);
+                               int count = (io_resid > INT_MAX) ? INT_MAX : io_resid;
+                               error = cluster_copy_ubc_data(vp, uio, &count, 0);
                                if (error) {
                                        nfs_data_unlock(np);
-                                       FSDBG_BOT(514, np, uio->uio_offset, 0xcacefeed, error);
+                                       FSDBG_BOT(514, np, uio_offset(uio), 0xcacefeed, error);
                                        return (error);
                                }
                        }
                        /* count any biocache reads that we just copied directly */
-                       if (lbn != (uio->uio_offset / biosize)) {
-                               OSAddAtomic((uio->uio_offset / biosize) - lbn, (SInt32*)&nfsstats.biocache_reads);
-                               FSDBG(514, np, 0xcacefeed, uio->uio_offset, error);
+                       if (lbn != (uio_offset(uio)/biosize)) {
+                               OSAddAtomic64((uio_offset(uio)/biosize) - lbn, &nfsstats.biocache_reads);
+                               FSDBG(514, np, 0xcacefeed, uio_offset(uio), error);
                        }
                }
 
-               lbn = uio->uio_offset / biosize;
-               on = uio->uio_offset % biosize;
-               np->n_lastread = (uio->uio_offset - 1) / biosize;
+               lbn = uio_offset(uio) / biosize;
+               on = uio_offset(uio) % biosize;
+               nfs_node_lock_force(np);
+               np->n_lastread = (uio_offset(uio) - 1) / biosize;
+               nfs_node_unlock(np);
+
+               if ((uio_resid(uio) <= 0) || (uio_offset(uio) >= (off_t)np->n_size)) {
+                       nfs_data_unlock(np);
+                       FSDBG_BOT(514, np, uio_offset(uio), uio_resid(uio), 0xaaaaaaaa);
+                       return (0);
+               }
 
                /* adjust readahead block number, if necessary */
                if (rabn < lbn)
@@ -2013,15 +2056,10 @@ nfs_bioread(nfsnode_t np, struct uio *uio, int ioflag, int *eofflag, vfs_context
                                FSDBG_BOT(514, np, 0xd1e000b, 1, error);
                                return (error);
                        }
+                       readaheads = 1;
                }
 
-               if ((uio_uio_resid(uio) <= 0) || (uio->uio_offset >= (off_t)np->n_size)) {
-                       nfs_data_unlock(np);
-                       FSDBG_BOT(514, np, uio->uio_offset, uio_uio_resid(uio), 0xaaaaaaaa);
-                       return (0);
-               }
-
-               OSAddAtomic(1, (SInt32*)&nfsstats.biocache_reads);
+               OSAddAtomic64(1, &nfsstats.biocache_reads);
 
                /*
                 * If the block is in the cache and has the required data
@@ -2030,9 +2068,9 @@ nfs_bioread(nfsnode_t np, struct uio *uio, int ioflag, int *eofflag, vfs_context
                 * as required.
                 */
 again:
-               // LP64todo - fix this!
-               n = min((unsigned)(biosize - on), uio_uio_resid(uio));
-               diff = np->n_size - uio->uio_offset;
+               io_resid = uio_resid(uio);
+               n = (io_resid > (biosize - on)) ? (biosize - on) : io_resid;
+               diff = np->n_size - uio_offset(uio);
                if (diff < n)
                        n = diff;
 
@@ -2055,11 +2093,9 @@ again:
                                SET(bp->nb_flags, NB_NOCACHE);
                                goto flushbuffer;
                        }
-                       if (!ISSET(bp->nb_flags, NB_NCRDAHEAD)) {
-                               CLR(bp->nb_flags, NB_CACHE);
-                               bp->nb_valid = 0;
-                       } else {
+                       if (ISSET(bp->nb_flags, NB_NCRDAHEAD)) {
                                CLR(bp->nb_flags, NB_NCRDAHEAD);
+                               SET(bp->nb_flags, NB_NOCACHE);
                        }
                }
 
@@ -2127,7 +2163,7 @@ flushbuffer:
                                if (!auio) {
                                        error = ENOMEM;
                                } else {
-                                       uio_addiov(auio, CAST_USER_ADDR_T((bp->nb_data + firstpg * PAGE_SIZE)),
+                                       uio_addiov(auio, CAST_USER_ADDR_T(bp->nb_data + (firstpg * PAGE_SIZE)),
                                                        ((lastpg - firstpg + 1) * PAGE_SIZE));
                                        error = nfs_read_rpc(np, auio, ctx);
                                }
@@ -2162,6 +2198,8 @@ flushbuffer:
                        SET(bp->nb_flags, NB_READ);
                        CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL));
                        error = nfs_buf_read(bp);
+                       if (ioflag & IO_NOCACHE)
+                               SET(bp->nb_flags, NB_NOCACHE);
                        if (error) {
                                nfs_data_unlock(np);
                                nfs_buf_release(bp, 1);
@@ -2176,157 +2214,28 @@ buffer_ready:
                        if (diff < n)
                                n = diff;
                }
-               if (n > 0)
-                       NFS_BUF_MAP(bp);
-           } else if (vtype == VDIR) {
-               OSAddAtomic(1, (SInt32*)&nfsstats.biocache_readdirs);
-               error = nfs_lock(np, NFS_NODE_LOCK_SHARED);
-               if (error || (np->n_direofoffset && (uio->uio_offset >= np->n_direofoffset))) {
-                       if (!error)
-                               nfs_unlock(np);
-                       if (eofflag)
-                               *eofflag = 1;
-                       FSDBG_BOT(514, np, 0xde0f0001, 0, 0);
-                       return (0);
-               }
-               nfs_unlock(np);
-               lbn = uio->uio_offset / NFS_DIRBLKSIZ;
-               on = uio->uio_offset & (NFS_DIRBLKSIZ - 1);
-               error = nfs_buf_get(np, lbn, NFS_DIRBLKSIZ, thd, NBLK_READ, &bp);
-               if (error) {
-                       FSDBG_BOT(514, np, 0xd1e0012, 0, error);
-                       return (error);
-               }
-               if (!ISSET(bp->nb_flags, NB_CACHE)) {
-                   SET(bp->nb_flags, NB_READ);
-                   error = nfs_buf_readdir(bp, ctx);
-                   if (error)
-                       nfs_buf_release(bp, 1);
-                   while (error == NFSERR_BAD_COOKIE) {
-                       error = nfs_lock(np, NFS_NODE_LOCK_EXCLUSIVE);
-                       if (!error) {
-                               nfs_invaldir(np);
-                               nfs_unlock(np);
-                       }
-                       error = nfs_vinvalbuf(vp, 0, ctx, 1);
-                       /*
-                        * Yuck! The directory has been modified on the
-                        * server. The only way to get the block is by
-                        * reading from the beginning to get all the
-                        * offset cookies.
-                        */
-                       for (tlbn = 0; tlbn <= lbn && !error; tlbn++) {
-                           if ((error = nfs_lock(np, NFS_NODE_LOCK_SHARED)))
-                                   break;
-                           if (np->n_direofoffset
-                               && (tlbn * NFS_DIRBLKSIZ) >= np->n_direofoffset) {
-                                   nfs_unlock(np);
-                                   if (eofflag)
-                                           *eofflag = 1;
-                                   FSDBG_BOT(514, np, 0xde0f0002, 0, 0);
-                                   return (0);
-                           }
-                           nfs_unlock(np);
-                           error = nfs_buf_get(np, tlbn, NFS_DIRBLKSIZ, thd, NBLK_READ, &bp);
-                           if (error) {
-                                   FSDBG_BOT(514, np, 0xd1e0013, 0, error);
-                                   return (error);
-                           }
-                           if (!ISSET(bp->nb_flags, NB_CACHE)) {
-                                   SET(bp->nb_flags, NB_READ);
-                                   error = nfs_buf_readdir(bp, ctx);
-                                   /*
-                                    * no error + NB_INVAL == directory EOF,
-                                    * use the block.
-                                    */
-                                   if (error == 0 && ISSET(bp->nb_flags, NB_INVAL)) {
-                                           if (eofflag)
-                                                   *eofflag = 1;
-                                           break;
-                                   }
-                           }
-                           /*
-                            * An error will throw away the block and the
-                            * for loop will break out.  If no error and this
-                            * is not the block we want, we throw away the
-                            * block and go for the next one via the for loop.
-                            */
-                           if (error || (tlbn < lbn))
-                                   nfs_buf_release(bp, 1);
-                       }
-                   }
-                   /*
-                    * The above while is repeated if we hit another cookie
-                    * error.  If we hit an error and it wasn't a cookie error,
-                    * we give up.
-                    */
-                   if (error) {
-                       FSDBG_BOT(514, np, 0xd1e0014, 0, error);
-                       return (error);
-                   }
-               }
-               /*
-                * Make sure we use a signed variant of min() since
-                * the second term may be negative.
-                */
-               // LP64todo - fix this!
-               n = lmin(uio_uio_resid(uio), bp->nb_validend - on);
-               /*
-                * We keep track of the directory eof in
-                * np->n_direofoffset and chop it off as an
-                * extra step right here.
-                */
-               if ((error = nfs_lock(np, NFS_NODE_LOCK_SHARED))) {
-                       FSDBG_BOT(514, np, 0xd1e0115, 0, error);
-                       return (error);
-               }
-               if (np->n_direofoffset &&
-                   n > np->n_direofoffset - uio->uio_offset)
-                       n = np->n_direofoffset - uio->uio_offset;
-               nfs_unlock(np);
-               /*
-                * Make sure that we return an integral number of entries so
-                * that any subsequent calls will start copying from the start
-                * of the next entry.
-                *
-                * If the current value of n has the last entry cut short,
-                * set n to copy everything up to the last entry instead.
-                */
                if (n > 0) {
-                       dp = bp->nb_data + on;
-                       while (dp < (bp->nb_data + on + n)) {
-                               direntp = (struct dirent *)dp;
-                               dp += direntp->d_reclen;
-                       }
-                       if (dp > (bp->nb_data + on + n))
-                               n = (dp - direntp->d_reclen) - (bp->nb_data + on);
+                       NFS_BUF_MAP(bp);
+                       error = uiomove(bp->nb_data + on, n, uio);
                }
-           }
-
-           if (n > 0)
-               error = uiomove(bp->nb_data + on, (int)n, uio);
 
-           if (vtype == VREG) {
-               if (ioflag & IO_NOCACHE)
-                       SET(bp->nb_flags, NB_NOCACHE);
                nfs_buf_release(bp, 1);
                nfs_data_unlock(np);
-               np->n_lastread = (uio->uio_offset - 1) / biosize;
-           } else {
-               nfs_buf_release(bp, 1);
-           }
-       } while (error == 0 && uio_uio_resid(uio) > 0 && n > 0);
-       FSDBG_BOT(514, np, uio->uio_offset, uio_uio_resid(uio), error);
+               nfs_node_lock_force(np);
+               np->n_lastread = (uio_offset(uio) - 1) / biosize;
+               nfs_node_unlock(np);
+       } while (error == 0 && uio_resid(uio) > 0 && n > 0);
+       FSDBG_BOT(514, np, uio_offset(uio), uio_resid(uio), error);
        return (error);
 }
 
 /*
  * limit the number of outstanding async I/O writes
  */
-static int
+int
 nfs_async_write_start(struct nfsmount *nmp)
 {
-       int error = 0, slpflag = (nmp->nm_flag & NFSMNT_INT) ? PCATCH : 0;
+       int error = 0, slpflag = NMFLAG(nmp, INTR) ? PCATCH : 0;
        struct timespec ts = {1, 0};
 
        if (nfs_max_async_writes <= 0)
@@ -2343,7 +2252,7 @@ nfs_async_write_start(struct nfsmount *nmp)
        lck_mtx_unlock(&nmp->nm_lock);
        return (error);
 }
-static void
+void
 nfs_async_write_done(struct nfsmount *nmp)
 {
        if (nmp->nm_asyncwrites <= 0)
@@ -2402,10 +2311,13 @@ nfs_buf_write(struct nfsbuf *bp)
                LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
                lck_mtx_unlock(nfs_buf_mutex);
        }
+       nfs_node_lock_force(np);
+       np->n_numoutput++;
+       nfs_node_unlock(np);
        vnode_startwrite(NFSTOV(np));
 
        if (p && p->p_stats)
-               OSIncrementAtomic(&p->p_stats->p_ru.ru_oublock);
+               OSIncrementAtomicLong(&p->p_stats->p_ru.ru_oublock);
 
        cred = bp->nb_wcred;
        if (!IS_VALID_CRED(cred) && ISSET(bp->nb_flags, NB_READ))
@@ -2415,17 +2327,26 @@ nfs_buf_write(struct nfsbuf *bp)
        thd = async ? NULL : current_thread();
 
        /* We need to make sure the pages are locked before doing I/O.  */
-       if (!ISSET(bp->nb_flags, NB_META) && UBCINFOEXISTS(NFSTOV(np))) {
-               if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
-                       error = nfs_buf_upl_setup(bp);
-                       if (error) {
-                               printf("nfs_buf_write: upl create failed %d\n", error);
-                               SET(bp->nb_flags, NB_ERROR);
-                               bp->nb_error = error = EIO;
-                               nfs_buf_iodone(bp);
-                               goto out;
+       if (!ISSET(bp->nb_flags, NB_META)) {
+               if (UBCINFOEXISTS(NFSTOV(np))) {
+                       if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
+                               error = nfs_buf_upl_setup(bp);
+                               if (error) {
+                                       printf("nfs_buf_write: upl create failed %d\n", error);
+                                       SET(bp->nb_flags, NB_ERROR);
+                                       bp->nb_error = error = EIO;
+                                       nfs_buf_iodone(bp);
+                                       goto out;
+                               }
+                               nfs_buf_upl_check(bp);
                        }
-                       nfs_buf_upl_check(bp);
+               } else {
+                       /* We should never be in nfs_buf_write() with no UBCINFO. */
+                       printf("nfs_buf_write: ubcinfo already gone\n");
+                       SET(bp->nb_flags, NB_ERROR);
+                       bp->nb_error = error = EIO;
+                       nfs_buf_iodone(bp);
+                       goto out;
                }
        }
 
@@ -2434,7 +2355,7 @@ nfs_buf_write(struct nfsbuf *bp)
                nfs_buf_check_write_verifier(np, bp);
        if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
                struct nfsmount *nmp = NFSTONMP(np);
-               if (!nmp) {
+               if (nfs_mount_gone(nmp)) {
                        SET(bp->nb_flags, NB_ERROR);
                        bp->nb_error = error = EIO;
                        nfs_buf_iodone(bp);
@@ -2442,7 +2363,7 @@ nfs_buf_write(struct nfsbuf *bp)
                }
                SET(bp->nb_flags, NB_WRITEINPROG);
                error = nmp->nm_funcs->nf_commit_rpc(np, NBOFF(bp) + bp->nb_dirtyoff,
-                               bp->nb_dirtyend - bp->nb_dirtyoff, bp->nb_wcred);
+                               bp->nb_dirtyend - bp->nb_dirtyoff, bp->nb_wcred, bp->nb_verf);
                CLR(bp->nb_flags, NB_WRITEINPROG);
                if (error) {
                        if (error != NFSERR_STALEWRITEVERF) {
@@ -2454,10 +2375,10 @@ nfs_buf_write(struct nfsbuf *bp)
                }
                bp->nb_dirtyoff = bp->nb_dirtyend = 0;
                CLR(bp->nb_flags, NB_NEEDCOMMIT);
-               nfs_lock(np, NFS_NODE_LOCK_FORCE);
+               nfs_node_lock_force(np);
                np->n_needcommitcnt--;
                CHECK_NEEDCOMMITCNT(np);
-               nfs_unlock(np);
+               nfs_node_unlock(np);
        }
        if (!error && (bp->nb_dirtyend > 0)) {
                /* sanity check the dirty range */
@@ -2486,7 +2407,7 @@ nfs_buf_write(struct nfsbuf *bp)
                        dend = round_page_32(dend);
                /* try to expand write range to include trailing dirty pages */
                if (!(dend & PAGE_MASK))
-                       while ((dend < bp->nb_bufsize) && NBPGDIRTY(bp, dend / PAGE_SIZE))
+                       while ((dend < (int)bp->nb_bufsize) && NBPGDIRTY(bp, dend / PAGE_SIZE))
                                dend += PAGE_SIZE;
                /* make sure to keep dend clipped to EOF */
                if ((NBOFF(bp) + dend) > (off_t) np->n_size)
@@ -2513,7 +2434,7 @@ nfs_buf_write(struct nfsbuf *bp)
                bp->nb_offio = doff;
                bp->nb_endio = dend;
 
-               OSAddAtomic(1, (SInt32 *)&nfsstats.write_bios);
+               OSAddAtomic64(1, &nfsstats.write_bios);
 
                SET(bp->nb_flags, NB_WRITEINPROG);
                error = nfs_buf_write_rpc(bp, iomode, thd, cred);
@@ -2546,12 +2467,12 @@ out:
                if ((np->n_flag & NNEEDINVALIDATE) &&
                    !(np->n_bflag & (NBINVALINPROG|NBFLUSHINPROG))) {
                        int invalidate = 0;
-                       nfs_lock(np, NFS_NODE_LOCK_FORCE);
+                       nfs_node_lock_force(np);
                        if (np->n_flag & NNEEDINVALIDATE) {
                                invalidate = 1;
                                np->n_flag &= ~NNEEDINVALIDATE;
                        }
-                       nfs_unlock(np);
+                       nfs_node_unlock(np);
                        if (invalidate) {
                                /*
                                 * There was a write error and we need to
@@ -2603,19 +2524,19 @@ nfs_buf_write_finish(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred)
        /* manage needcommit state */
        if (!error && (bp->nb_commitlevel == NFS_WRITE_UNSTABLE)) {
                if (!ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
-                       nfs_lock(np, NFS_NODE_LOCK_FORCE);
+                       nfs_node_lock_force(np);
                        np->n_needcommitcnt++;
-                       nfs_unlock(np);
+                       nfs_node_unlock(np);
                        SET(bp->nb_flags, NB_NEEDCOMMIT);
                }
                /* make sure nb_dirtyoff/nb_dirtyend reflect actual range written */
                bp->nb_dirtyoff = bp->nb_offio;
                bp->nb_dirtyend = bp->nb_endio;
        } else if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
-               nfs_lock(np, NFS_NODE_LOCK_FORCE);
+               nfs_node_lock_force(np);
                np->n_needcommitcnt--;
                CHECK_NEEDCOMMITCNT(np);
-               nfs_unlock(np);
+               nfs_node_unlock(np);
                CLR(bp->nb_flags, NB_NEEDCOMMIT);
        }
 
@@ -2664,11 +2585,11 @@ nfs_buf_write_finish(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred)
                         * buffer busy.  Set a flag to do it after releasing
                         * the buffer.
                         */
-                       nfs_lock(np, NFS_NODE_LOCK_FORCE);
+                       nfs_node_lock_force(np);
                        np->n_error = error;
                        np->n_flag |= (NWRITEERR | NNEEDINVALIDATE);
                        NATTRINVALIDATE(np);
-                       nfs_unlock(np);
+                       nfs_node_unlock(np);
                }
                /* clear the dirty range */
                bp->nb_dirtyoff = bp->nb_dirtyend = 0;
@@ -2694,27 +2615,21 @@ nfs_buf_write_dirty_pages(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred)
        int error = 0, commit, iomode, iomode2, len, pg, count, npages, off;
        uint32_t dirty = bp->nb_dirty;
        uint64_t wverf;
-       struct uio uio;
-       struct iovec_32 io;
+       uio_t auio;
+       char uio_buf [ UIO_SIZEOF(1) ];
 
        if (!bp->nb_dirty)
                return (0);
 
        /* there are pages marked dirty that need to be written out */
-       OSAddAtomic(1, (SInt32 *)&nfsstats.write_bios);
+       OSAddAtomic64(1, &nfsstats.write_bios);
        NFS_BUF_MAP(bp);
        SET(bp->nb_flags, NB_WRITEINPROG);
        npages = bp->nb_bufsize / PAGE_SIZE;
        iomode = NFS_WRITE_UNSTABLE;
 
-       uio.uio_iovs.iov32p = &io;
-       uio.uio_iovcnt = 1;
-       uio.uio_rw = UIO_WRITE;
-#if 1   /* LP64todo - can't use new segment flags until the drivers are ready */
-       uio.uio_segflg = UIO_SYSSPACE;
-#else
-       uio.uio_segflg = UIO_SYSSPACE32;
-#endif
+       auio = uio_createwithbuffer(1, 0, UIO_SYSSPACE, UIO_WRITE,
+               &uio_buf, sizeof(uio_buf));
 
 again:
        dirty = bp->nb_dirty;
@@ -2734,11 +2649,9 @@ again:
                        len -= (NBOFF(bp) + off + len) - np->n_size;
                if (len > 0) {
                        iomode2 = iomode;
-                       io.iov_len = len;
-                       uio_uio_resid_set(&uio, io.iov_len);
-                       uio.uio_offset = NBOFF(bp) + off;
-                       io.iov_base = (uintptr_t) bp->nb_data + off;
-                       error = nfs_write_rpc2(np, &uio, thd, cred, &iomode2, &bp->nb_verf);
+                       uio_reset(auio, NBOFF(bp) + off, UIO_SYSSPACE, UIO_WRITE);
+                       uio_addiov(auio, CAST_USER_ADDR_T(bp->nb_data + off), len);
+                       error = nfs_write_rpc2(np, auio, thd, cred, &iomode2, &bp->nb_verf);
                        if (error)
                                break;
                        if (iomode2 < commit) /* Retain the lowest commitment level returned. */
@@ -2759,7 +2672,7 @@ again:
        CLR(bp->nb_flags, NB_WRITEINPROG);
 
        if (!error && (commit != NFS_WRITE_FILESYNC)) {
-               error = nmp->nm_funcs->nf_commit_rpc(np, NBOFF(bp), bp->nb_bufsize, cred);
+               error = nmp->nm_funcs->nf_commit_rpc(np, NBOFF(bp), bp->nb_bufsize, cred, wverf);
                if (error == NFSERR_STALEWRITEVERF) {
                        /* verifier changed, so we need to restart all the writes */
                        iomode = NFS_WRITE_FILESYNC;
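Editor's note: nf_commit_rpc() now also takes the write verifier that was recorded when the UNSTABLE writes were issued. If the server rebooted in between, COMMIT reports a verifier mismatch (NFSERR_STALEWRITEVERF), meaning the data may have been dropped from the server's cache, so the whole range is rewritten with FILESYNC, exactly as the goto just below does. A schematic retry loop; the two RPC helpers are hypothetical stand-ins passed in as callbacks:

    #include <stdint.h>

    #define STALE_WRITE_VERF  (-1)      /* stand-in for NFSERR_STALEWRITEVERF */

    static int write_then_commit(
        int (*write_unstable)(uint64_t *verf_out, int filesync),
        int (*commit_rpc)(uint64_t verf_seen))
    {
            int filesync = 0;
            for (;;) {
                    uint64_t verf = 0;
                    int error = write_unstable(&verf, filesync);
                    if (error || filesync)
                            return error;       /* FILESYNC writes need no commit */
                    error = commit_rpc(verf);
                    if (error != STALE_WRITE_VERF)
                            return error;
                    filesync = 1;               /* verifier changed: redo synchronously */
            }
    }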
@@ -2784,14 +2697,15 @@ nfs_buf_write_rpc(struct nfsbuf *bp, int iomode, thread_t thd, kauth_cred_t cred
        struct nfsmount *nmp;
        nfsnode_t np = bp->nb_np;
        int error = 0, nfsvers, async;
-       int offset, length, nmwsize, nrpcs, len;
+       int offset, nrpcs;
+       uint32_t nmwsize, length, len;
        struct nfsreq *req;
        struct nfsreq_cbinfo cb;
-       struct uio uio;
-       struct iovec_32 io;
+       uio_t auio;
+       char uio_buf [ UIO_SIZEOF(1) ];
 
        nmp = NFSTONMP(np);
-       if (!nmp) {
+       if (nfs_mount_gone(nmp)) {
                bp->nb_error = error = ENXIO;
                SET(bp->nb_flags, NB_ERROR);
                nfs_buf_iodone(bp);
@@ -2816,18 +2730,9 @@ nfs_buf_write_rpc(struct nfsbuf *bp, int iomode, thread_t thd, kauth_cred_t cred
                return (error);
        }
 
-       uio.uio_iovs.iov32p = &io;
-       uio.uio_iovcnt = 1;
-       uio.uio_rw = UIO_WRITE;
-#if 1   /* LP64todo - can't use new segment flags until the drivers are ready */
-       uio.uio_segflg = UIO_SYSSPACE;
-#else
-       uio.uio_segflg = UIO_SYSSPACE32;
-#endif
-       io.iov_len = length;
-       uio_uio_resid_set(&uio, io.iov_len);
-       uio.uio_offset = NBOFF(bp) + offset;
-       io.iov_base = (uintptr_t) bp->nb_data + offset;
+       auio = uio_createwithbuffer(1, NBOFF(bp) + offset, UIO_SYSSPACE,
+               UIO_WRITE, &uio_buf, sizeof(uio_buf));
+       uio_addiov(auio, CAST_USER_ADDR_T(bp->nb_data + offset), length);
 
        bp->nb_rpcs = nrpcs = (length + nmwsize - 1) / nmwsize;
        if (async && (nrpcs > 1)) {
@@ -2844,10 +2749,12 @@ nfs_buf_write_rpc(struct nfsbuf *bp, int iomode, thread_t thd, kauth_cred_t cred
                len = (length > nmwsize) ? nmwsize : length;
                cb.rcb_args[0] = offset;
                cb.rcb_args[1] = len;
+               if (nmp->nm_vers >= NFS_VER4)
+                       cb.rcb_args[2] = nmp->nm_stategenid;
                if (async && ((error = nfs_async_write_start(nmp))))
                        break;
                req = NULL;
-               error = nmp->nm_funcs->nf_write_rpc_async(np, &uio, len, thd, cred,
+               error = nmp->nm_funcs->nf_write_rpc_async(np, auio, len, thd, cred,
                                iomode, &cb, &req);
                if (error) {
                        if (async)
@@ -2886,6 +2793,9 @@ nfs_buf_write_rpc(struct nfsbuf *bp, int iomode, thread_t thd, kauth_cred_t cred
                } else {
                        nfs_buf_write_finish(bp, thd, cred);
                }
+               /* It may have just been an interrupt... that's OK */
+               if (!ISSET(bp->nb_flags, NB_ERROR))
+                       error = 0;
        }
 
        return (error);
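
The nb_rpcs/nrpcs value computed in the hunk above is a ceiling division of the buffer's write length by the mount's maximum write size, and the RPC loop then peels off nmwsize-sized chunks. Below is a small userland illustration of that arithmetic with assumed example values; none of it comes from nfs_bio.c.

/* Illustration only: the nrpcs/len chunking used above, with made-up
 * numbers.  Prints the number of RPCs and the offset/length of each. */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint32_t length = 40000;                       /* assumed dirty length */
	uint32_t nmwsize = 16384;                      /* assumed nm_wsize */
	uint32_t offset = 0, len;
	int nrpcs = (length + nmwsize - 1) / nmwsize;  /* ceil(length/nmwsize) = 3 */

	printf("%d RPCs\n", nrpcs);
	while (length > 0) {
		len = (length > nmwsize) ? nmwsize : length;
		printf("  RPC: offset=%u len=%u\n", offset, len);
		offset += len;
		length -= len;
	}
	return (0);
}
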
@@ -2909,8 +2819,8 @@ nfs_buf_write_rpc_finish(struct nfsreq *req)
        nfsnode_t np;
        thread_t thd;
        kauth_cred_t cred;
-       struct uio uio;
-       struct iovec_32 io;
+       uio_t auio;
+       char uio_buf [ UIO_SIZEOF(1) ];
 
 finish:
        np = req->r_np;
@@ -2920,9 +2830,11 @@ finish:
                kauth_cred_ref(cred);
        cb = req->r_callback;
        bp = cb.rcb_bp;
+       if (cb.rcb_func) /* take an extra reference on the nfsreq in case we want to resend it later due to a grace error */

+               nfs_request_ref(req, 0);
 
        nmp = NFSTONMP(np);
-       if (!nmp) {
+       if (nfs_mount_gone(nmp)) {
                SET(bp->nb_flags, NB_ERROR);
                bp->nb_error = error = ENXIO;
        }
@@ -2940,11 +2852,57 @@ finish:
        error = nmp->nm_funcs->nf_write_rpc_async_finish(np, req, &committed, &rlen, &wverf);
        if ((error == EINPROGRESS) && cb.rcb_func) {
                /* async request restarted */
+               if (cb.rcb_func)
+                       nfs_request_rele(req);
                if (IS_VALID_CRED(cred))
                        kauth_cred_unref(&cred);
                return;
        }
-
+       if ((nmp->nm_vers >= NFS_VER4) && nfs_mount_state_error_should_restart(error) && !ISSET(bp->nb_flags, NB_ERROR)) {
+               lck_mtx_lock(&nmp->nm_lock);
+               if ((error != NFSERR_OLD_STATEID) && (error != NFSERR_GRACE) && (cb.rcb_args[2] == nmp->nm_stategenid)) {
+                       NP(np, "nfs_buf_write_rpc_finish: error %d @ 0x%llx, 0x%x 0x%x, initiating recovery",
+                               error, NBOFF(bp)+offset, cb.rcb_args[2], nmp->nm_stategenid);
+                       nfs_need_recover(nmp, error);
+               }
+               lck_mtx_unlock(&nmp->nm_lock);
+               if (np->n_flag & NREVOKE) {
+                       error = EIO;
+               } else {
+                       if (error == NFSERR_GRACE) {
+                               if (cb.rcb_func) {
+                                       /*
+                                        * For an async I/O request, handle a grace delay just like
+                                        * jukebox errors.  Set the resend time and queue it up.
+                                        */
+                                       struct timeval now;
+                                       if (req->r_nmrep.nmc_mhead) {
+                                               mbuf_freem(req->r_nmrep.nmc_mhead);
+                                               req->r_nmrep.nmc_mhead = NULL;
+                                       }
+                                       req->r_error = 0;
+                                       microuptime(&now);
+                                       lck_mtx_lock(&req->r_mtx);
+                                       req->r_resendtime = now.tv_sec + 2;
+                                       req->r_xid = 0;                 // get a new XID
+                                       req->r_flags |= R_RESTART;
+                                       req->r_start = 0;
+                                       nfs_asyncio_resend(req);
+                                       lck_mtx_unlock(&req->r_mtx);
+                                       if (IS_VALID_CRED(cred))
+                                               kauth_cred_unref(&cred);
+                                       /* Note: the nfsreq reference taken above will be dropped later when the request finishes */
+                                       return;
+                               }
+                               /* otherwise, just pause a couple seconds and retry */
+                               tsleep(&nmp->nm_state, (PZERO-1), "nfsgrace", 2*hz);
+                       }
+                       if (!(error = nfs_mount_state_wait_for_recovery(nmp))) {
+                               rlen = 0;
+                               goto writeagain;
+                       }
+               }
+       }
        if (error) {
                SET(bp->nb_flags, NB_ERROR);
                bp->nb_error = error;
@@ -2979,26 +2937,21 @@ finish:
         * (Don't bother if the buffer hit an error or stale wverf.)
         */
        if (((int)rlen < length) && !(bp->nb_flags & (NB_STALEWVERF|NB_ERROR))) {
+writeagain:
                offset += rlen;
                length -= rlen;
 
-               uio.uio_iovs.iov32p = &io;
-               uio.uio_iovcnt = 1;
-               uio.uio_rw = UIO_WRITE;
-#if 1   /* LP64todo - can't use new segment flags until the drivers are ready */
-               uio.uio_segflg = UIO_SYSSPACE;
-#else
-               uio.uio_segflg = UIO_SYSSPACE32;
-#endif
-               io.iov_len = length;
-               uio_uio_resid_set(&uio, io.iov_len);
-               uio.uio_offset = NBOFF(bp) + offset;
-               io.iov_base = (uintptr_t) bp->nb_data + offset;
+               auio = uio_createwithbuffer(1, NBOFF(bp) + offset, UIO_SYSSPACE,
+                       UIO_WRITE, &uio_buf, sizeof(uio_buf));
+               uio_addiov(auio, CAST_USER_ADDR_T(bp->nb_data + offset), length);
 
                cb.rcb_args[0] = offset;
                cb.rcb_args[1] = length;
+               if (nmp->nm_vers >= NFS_VER4)
+                       cb.rcb_args[2] = nmp->nm_stategenid;
 
-               error = nmp->nm_funcs->nf_write_rpc_async(np, &uio, length, thd, cred,
+               // XXX iomode should really match the original request
+               error = nmp->nm_funcs->nf_write_rpc_async(np, auio, length, thd, cred,
                                NFS_WRITE_FILESYNC, &cb, &wreq);
                if (!error) {
                        if (IS_VALID_CRED(cred))
@@ -3006,8 +2959,10 @@ finish:
                        if (!cb.rcb_func) {
                                /* if !async we'll need to wait for this RPC to finish */
                                req = wreq;
+                               wreq = NULL;
                                goto finish;
                        }
+                       nfs_request_rele(req);
                        /*
                         * We're done here.
                         * Outstanding RPC count is unchanged.
@@ -3020,8 +2975,10 @@ finish:
        }
 
 out:
-       if (cb.rcb_func)
+       if (cb.rcb_func) {
                nfs_async_write_done(nmp);
+               nfs_request_rele(req);
+       }
        /*
         * Decrement outstanding RPC count on buffer
         * and call nfs_buf_write_finish on last RPC.
@@ -3059,10 +3016,11 @@ int
 nfs_flushcommits(nfsnode_t np, int nowait)
 {
        struct nfsmount *nmp;
-       struct nfsbuf *bp;
+       struct nfsbuf *bp, *prevlbp, *lbp;
        struct nfsbuflists blist, commitlist;
        int error = 0, retv, wcred_set, flags, dirty;
        u_quad_t off, endoff, toff;
+       uint64_t wverf;
        u_int32_t count;
        kauth_cred_t wcred = NULL;
 
@@ -3075,11 +3033,11 @@ nfs_flushcommits(nfsnode_t np, int nowait)
         * and the commit rpc is done.
         */
        if (!LIST_EMPTY(&np->n_dirtyblkhd)) {
-               error = nfs_lock(np, NFS_NODE_LOCK_EXCLUSIVE);
+               error = nfs_node_lock(np);
                if (error)
                        goto done;
                np->n_flag |= NMODIFIED;
-               nfs_unlock(np);
+               nfs_node_unlock(np);
        }
 
        off = (u_quad_t)-1;
@@ -3088,7 +3046,7 @@ nfs_flushcommits(nfsnode_t np, int nowait)
        LIST_INIT(&commitlist);
 
        nmp = NFSTONMP(np);
-       if (!nmp) {
+       if (nfs_mount_gone(nmp)) {
                error = ENXIO;
                goto done;
        }
@@ -3101,6 +3059,7 @@ nfs_flushcommits(nfsnode_t np, int nowait)
        if (nowait)
                flags |= NBI_NOWAIT;
        lck_mtx_lock(nfs_buf_mutex);
+       wverf = nmp->nm_verf;
        if (!nfs_buf_iterprepare(np, &blist, flags)) {
                while ((bp = LIST_FIRST(&blist))) {
                        LIST_REMOVE(bp, nb_vnbufs);
@@ -3110,46 +3069,19 @@ nfs_flushcommits(nfsnode_t np, int nowait)
                                continue;
                        if (ISSET(bp->nb_flags, NB_NEEDCOMMIT))
                                nfs_buf_check_write_verifier(np, bp);
-                       if (((bp->nb_flags & (NB_DELWRI | NB_NEEDCOMMIT))
-                               != (NB_DELWRI | NB_NEEDCOMMIT))) {
+                       if (((bp->nb_flags & (NB_DELWRI | NB_NEEDCOMMIT)) != (NB_DELWRI | NB_NEEDCOMMIT)) ||
+                           (bp->nb_verf != wverf)) {
                                nfs_buf_drop(bp);
                                continue;
                        }
                        nfs_buf_remfree(bp);
-                       lck_mtx_unlock(nfs_buf_mutex);
-                       /*
-                        * we need a upl to see if the page has been
-                        * dirtied (think mmap) since the unstable write, and
-                        * also to prevent vm from paging it during our commit rpc
-                        */
-                       if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
-                               retv = nfs_buf_upl_setup(bp);
-                               if (retv) {
-                                       /* unable to create upl */
-                                       /* vm object must no longer exist */
-                                       /* this could be fatal if we need */
-                                       /* to write the data again, we'll see...  */
-                                       printf("nfs_flushcommits: upl create failed %d\n", retv);
-                                       bp->nb_valid = bp->nb_dirty = 0;
-                               }
-                       }
-                       nfs_buf_upl_check(bp);
-                       lck_mtx_lock(nfs_buf_mutex);
+
+                       /* buffer UPLs will be grabbed *in order* below */
 
                        FSDBG(557, bp, bp->nb_flags, bp->nb_valid, bp->nb_dirty);
                        FSDBG(557, bp->nb_validoff, bp->nb_validend,
                              bp->nb_dirtyoff, bp->nb_dirtyend);
 
-                       /*
-                        * We used to check for dirty pages here; if there were any
-                        * we'd abort the commit and force the entire buffer to be
-                        * written again.
-                        *
-                        * Instead of doing that, we now go ahead and commit the dirty
-                        * range, and then leave the buffer around with dirty pages
-                        * that will be written out later.
-                        */
-
                        /*
                         * Work out if all buffers are using the same cred
                         * so we can deal with them all with one commit.
@@ -3168,14 +3100,23 @@ nfs_flushcommits(nfsnode_t np, int nowait)
                        SET(bp->nb_flags, NB_WRITEINPROG);
 
                        /*
-                        * A list of these buffers is kept so that the
-                        * second loop knows which buffers have actually
-                        * been committed. This is necessary, since there
-                        * may be a race between the commit rpc and new
-                        * uncommitted writes on the file.
+                        * Add this buffer to the list of buffers we are committing.
+                        * Buffers are inserted into the list in ascending order so that
+                        * we can take the UPLs in order after the list is complete.
                         */
+                       prevlbp = NULL;
+                       LIST_FOREACH(lbp, &commitlist, nb_vnbufs) {
+                               if (bp->nb_lblkno < lbp->nb_lblkno)
+                                       break;
+                               prevlbp = lbp;
+                       }
                        LIST_REMOVE(bp, nb_vnbufs);
-                       LIST_INSERT_HEAD(&commitlist, bp, nb_vnbufs);
+                       if (prevlbp)
+                               LIST_INSERT_AFTER(prevlbp, bp, nb_vnbufs);
+                       else
+                               LIST_INSERT_HEAD(&commitlist, bp, nb_vnbufs);
+
+                       /* update commit range start, end */
                        toff = NBOFF(bp) + bp->nb_dirtyoff;
                        if (toff < off)
                                off = toff;
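
The prevlbp/lbp walk above inserts each buffer into the commit list in ascending nb_lblkno order so that the UPLs can be taken in order once the list is complete. Here is a hedged, self-contained illustration of that sorted-insertion idiom using the same <sys/queue.h> LIST macros; the struct and field names are invented for the demo and are not part of nfs_bio.c.

/* Illustration only: sorted insertion into a LIST, mirroring the
 * prevlbp/lbp walk above.  Types and names are invented for the demo. */
#include <sys/queue.h>
#include <stdio.h>
#include <stdlib.h>

struct blk {
	LIST_ENTRY(blk) link;
	long lblkno;
};
LIST_HEAD(blklist, blk);

static void
insert_sorted(struct blklist *list, struct blk *bp)
{
	struct blk *lbp, *prevlbp = NULL;

	/* find the last entry whose block number is below ours */
	LIST_FOREACH(lbp, list, link) {
		if (bp->lblkno < lbp->lblkno)
			break;
		prevlbp = lbp;
	}
	if (prevlbp)
		LIST_INSERT_AFTER(prevlbp, bp, link);
	else
		LIST_INSERT_HEAD(list, bp, link);
}

int
main(void)
{
	struct blklist list = LIST_HEAD_INITIALIZER(list);
	long blocks[] = { 7, 2, 5, 1 };
	struct blk *bp;

	for (size_t i = 0; i < sizeof(blocks)/sizeof(blocks[0]); i++) {
		bp = calloc(1, sizeof(*bp));
		bp->lblkno = blocks[i];
		insert_sorted(&list, bp);
	}
	LIST_FOREACH(bp, &list, link)
		printf("%ld ", bp->lblkno);    /* prints: 1 2 5 7 */
	printf("\n");

	while ((bp = LIST_FIRST(&list))) {
		LIST_REMOVE(bp, link);
		free(bp);
	}
	return (0);
}
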
@@ -3192,6 +3133,28 @@ nfs_flushcommits(nfsnode_t np, int nowait)
                goto done;
        }
 
+       /*
+        * We need a UPL to prevent others from accessing the buffers during
+        * our commit RPC(s).
+        *
+        * We used to also check for dirty pages here; if there were any we'd
+        * abort the commit and force the entire buffer to be written again.
+        * Instead of doing that, we just go ahead and commit the dirty range,
+        * and then leave the buffer around with dirty pages that will be
+        * written out later.
+        */
+       LIST_FOREACH(bp, &commitlist, nb_vnbufs) {
+               if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
+                       retv = nfs_buf_upl_setup(bp);
+                       if (retv) {
+                               /* Unable to create the UPL; the VM object probably no longer exists. */
+                               printf("nfs_flushcommits: upl create failed %d\n", retv);
+                               bp->nb_valid = bp->nb_dirty = 0;
+                       }
+               }
+               nfs_buf_upl_check(bp);
+       }
+
        /*
         * Commit data on the server, as required.
         * If all bufs are using the same wcred, then use that with
@@ -3207,13 +3170,13 @@ nfs_flushcommits(nfsnode_t np, int nowait)
                        count = 0;
                else
                        count = (endoff - off);
-               retv = nmp->nm_funcs->nf_commit_rpc(np, off, count, wcred);
+               retv = nmp->nm_funcs->nf_commit_rpc(np, off, count, wcred, wverf);
        } else {
                retv = 0;
                LIST_FOREACH(bp, &commitlist, nb_vnbufs) {
                        toff = NBOFF(bp) + bp->nb_dirtyoff;
                        count = bp->nb_dirtyend - bp->nb_dirtyoff;
-                       retv = nmp->nm_funcs->nf_commit_rpc(np, toff, count, bp->nb_wcred);
+                       retv = nmp->nm_funcs->nf_commit_rpc(np, toff, count, bp->nb_wcred, wverf);
                        if (retv)
                                break;
                }
@@ -3227,11 +3190,11 @@ nfs_flushcommits(nfsnode_t np, int nowait)
        while ((bp = LIST_FIRST(&commitlist))) {
                LIST_REMOVE(bp, nb_vnbufs);
                FSDBG(557, bp, retv, bp->nb_flags, bp->nb_dirty);
-               nfs_lock(np, NFS_NODE_LOCK_FORCE);
+               nfs_node_lock_force(np);
                CLR(bp->nb_flags, (NB_NEEDCOMMIT | NB_WRITEINPROG));
                np->n_needcommitcnt--;
                CHECK_NEEDCOMMITCNT(np);
-               nfs_unlock(np);
+               nfs_node_unlock(np);
 
                if (retv) {
                        /* move back to dirty list */
@@ -3242,6 +3205,9 @@ nfs_flushcommits(nfsnode_t np, int nowait)
                        continue;
                }
 
+               nfs_node_lock_force(np);
+               np->n_numoutput++;
+               nfs_node_unlock(np);
                vnode_startwrite(NFSTOV(np));
                if (ISSET(bp->nb_flags, NB_DELWRI)) {
                        lck_mtx_lock(nfs_buf_mutex);
@@ -3294,25 +3260,26 @@ nfs_flush(nfsnode_t np, int waitfor, thread_t thd, int ignore_writeerr)
 
        FSDBG_TOP(517, np, waitfor, ignore_writeerr, 0);
 
-       if (!nmp) {
+       if (nfs_mount_gone(nmp)) {
                error = ENXIO;
                goto out;
        }
        nfsvers = nmp->nm_vers;
-       if (nmp->nm_flag & NFSMNT_INT)
+       if (NMFLAG(nmp, INTR))
                slpflag = PCATCH;
 
        if (!LIST_EMPTY(&np->n_dirtyblkhd)) {
-               nfs_lock(np, NFS_NODE_LOCK_FORCE);
+               nfs_node_lock_force(np);
                np->n_flag |= NMODIFIED;
-               nfs_unlock(np);
+               nfs_node_unlock(np);
        }
 
        lck_mtx_lock(nfs_buf_mutex);
        while (np->n_bflag & NBFLUSHINPROG) {
                np->n_bflag |= NBFLUSHWANT;
                error = msleep(&np->n_bflag, nfs_buf_mutex, slpflag, "nfs_flush", NULL);
-               if (error) {
+               if ((error && (error != EWOULDBLOCK)) ||
+                   ((error = nfs_sigintr(NFSTONMP(np), NULL, thd, 0)))) {
                        lck_mtx_unlock(nfs_buf_mutex);
                        goto out;
                }
@@ -3339,7 +3306,7 @@ again:
                while ((bp = LIST_FIRST(&blist))) {
                        LIST_REMOVE(bp, nb_vnbufs);
                        LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
-                       flags = (passone || (waitfor != MNT_WAIT)) ? NBAC_NOWAIT : 0;
+                       flags = (passone || !(waitfor == MNT_WAIT || waitfor == MNT_DWAIT)) ? NBAC_NOWAIT : 0;
                        if (flags != NBAC_NOWAIT)
                                nfs_buf_refget(bp);
                        while ((error = nfs_buf_acquire(bp, flags, slpflag, slptimeo))) {
@@ -3379,7 +3346,7 @@ again:
                                continue;
                        }
                        FSDBG(525, bp, passone, bp->nb_lflags, bp->nb_flags);
-                       if ((passone || (waitfor != MNT_WAIT)) &&
+                       if ((passone || !(waitfor == MNT_WAIT || waitfor == MNT_DWAIT)) &&
                            ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
                                nfs_buf_drop(bp);
                                continue;
@@ -3387,10 +3354,10 @@ again:
                        nfs_buf_remfree(bp);
                        lck_mtx_unlock(nfs_buf_mutex);
                        if (ISSET(bp->nb_flags, NB_ERROR)) {
-                               nfs_lock(np, NFS_NODE_LOCK_FORCE);
+                               nfs_node_lock_force(np);
                                np->n_error = bp->nb_error ? bp->nb_error : EIO;
                                np->n_flag |= NWRITEERR;
-                               nfs_unlock(np);
+                               nfs_node_unlock(np);
                                nfs_buf_release(bp, 1);
                                lck_mtx_lock(nfs_buf_mutex);
                                continue;
@@ -3407,7 +3374,7 @@ again:
        }
        lck_mtx_unlock(nfs_buf_mutex);
 
-       if (waitfor == MNT_WAIT) {
+       if (waitfor == MNT_WAIT || waitfor == MNT_DWAIT) {
                while ((error = vnode_waitforwrites(NFSTOV(np), 0, slpflag, slptimeo, "nfsflush"))) {
                        error2 = nfs_sigintr(NFSTONMP(np), NULL, thd, 0);
                        if (error2) {
@@ -3432,30 +3399,45 @@ again:
        if (passone) {
                passone = 0;
                if (!LIST_EMPTY(&np->n_dirtyblkhd)) {
-                       nfs_lock(np, NFS_NODE_LOCK_FORCE);
+                       nfs_node_lock_force(np);
                        np->n_flag |= NMODIFIED;
-                       nfs_unlock(np);
+                       nfs_node_unlock(np);
                }
                lck_mtx_lock(nfs_buf_mutex);
                goto again;
        }
 
-       if (waitfor == MNT_WAIT) {
+       if (waitfor == MNT_WAIT || waitfor == MNT_DWAIT) {
                if (!LIST_EMPTY(&np->n_dirtyblkhd)) {
-                       nfs_lock(np, NFS_NODE_LOCK_FORCE);
+                       nfs_node_lock_force(np);
                        np->n_flag |= NMODIFIED;
-                       nfs_unlock(np);
+                       nfs_node_unlock(np);
                }
                lck_mtx_lock(nfs_buf_mutex);
                if (!LIST_EMPTY(&np->n_dirtyblkhd))
                        goto again;
                lck_mtx_unlock(nfs_buf_mutex);
-               nfs_lock(np, NFS_NODE_LOCK_FORCE);
-               /* if we have no dirty blocks, we can clear the modified flag */
-               if (!np->n_wrbusy)
+               nfs_node_lock_force(np);
+               /*
+                * OK, it looks like there are no dirty blocks.  If we have no
+                * writes in flight and no one in the write code, we can clear
+                * the modified flag.  In order to make sure we see the latest
+                * attributes and size, we also invalidate the attributes and
+                * advance the attribute cache XID to guarantee that attributes
+                * newer than our clearing of NMODIFIED will get loaded next.
+                * (If we don't do this, it's possible for the flush's final
+                * write/commit (xid1) to be executed in parallel with a subsequent
+                * getattr request (xid2).  The getattr could return attributes
+                * from *before* the write/commit completed but the stale attributes
+                * would be preferred because of the xid ordering.)
+                */
+               if (!np->n_wrbusy && !np->n_numoutput) {
                        np->n_flag &= ~NMODIFIED;
+                       NATTRINVALIDATE(np);
+                       nfs_get_xid(&np->n_xid);
+               }
        } else {
-               nfs_lock(np, NFS_NODE_LOCK_FORCE);
+               nfs_node_lock_force(np);
        }
 
        FSDBG(526, np->n_flag, np->n_error, 0, 0);
@@ -3463,7 +3445,7 @@ again:
                error = np->n_error;
                np->n_flag &= ~NWRITEERR;
        }
-       nfs_unlock(np);
+       nfs_node_unlock(np);
 done:
        lck_mtx_lock(nfs_buf_mutex);
        flags = np->n_bflag;
@@ -3480,7 +3462,7 @@ out:
  * Flush out and invalidate all buffers associated with a vnode.
  * Called with the underlying object locked.
  */
-static int
+int
 nfs_vinvalbuf_internal(
        nfsnode_t np,
        int flags,
@@ -3580,10 +3562,11 @@ nfs_vinvalbuf_internal(
                                        // Note: bp has been released
                                        if (error) {
                                                FSDBG(554, bp, 0xd00dee, 0xbad, error);
-                                               nfs_lock(np, NFS_NODE_LOCK_FORCE);
-                                               np->n_error = error;
-                                               np->n_flag |= NWRITEERR;
-                                               nfs_unlock(np);
+                                               nfs_node_lock_force(np);
+                                               if ((error != EINTR) && (error != ERESTART)) {
+                                                       np->n_error = error;
+                                                       np->n_flag |= NWRITEERR;
+                                               }
                                                /*
                                                 * There was a write error and we need to
                                                 * invalidate attrs to sync with server.
@@ -3591,6 +3574,18 @@ nfs_vinvalbuf_internal(
                                                 * we may no longer know the correct size)
                                                 */
                                                NATTRINVALIDATE(np);
+                                               nfs_node_unlock(np);
+                                               if ((error == EINTR) || (error == ERESTART)) {
+                                                       /*
+                                                        * Abort on EINTR.  If we don't, we could
+                                                        * be stuck in this loop forever because
+                                                        * the buffer will continue to stay dirty.
+                                                        */
+                                                       lck_mtx_lock(nfs_buf_mutex);
+                                                       nfs_buf_itercomplete(np, &blist, list);
+                                                       lck_mtx_unlock(nfs_buf_mutex);
+                                                       return (error);
+                                               }
                                                error = 0;
                                        }
                                        lck_mtx_lock(nfs_buf_mutex);
@@ -3607,11 +3602,12 @@ nfs_vinvalbuf_internal(
        if (!LIST_EMPTY(&(np)->n_dirtyblkhd) || !LIST_EMPTY(&(np)->n_cleanblkhd))
                panic("nfs_vinvalbuf: flush/inval failed");
        lck_mtx_unlock(nfs_buf_mutex);
-       if (!(flags & V_SAVE)) {
-               nfs_lock(np, NFS_NODE_LOCK_FORCE);
+       nfs_node_lock_force(np);
+       if (!(flags & V_SAVE))
                np->n_flag &= ~NMODIFIED;
-               nfs_unlock(np);
-       }
+       if (vnode_vtype(NFSTOV(np)) == VREG)
+               np->n_lastrahead = -1;
+       nfs_node_unlock(np);
        NFS_BUF_FREEUP();
        return (0);
 }
@@ -3632,12 +3628,23 @@ nfs_vinvalbuf2(vnode_t vp, int flags, thread_t thd, kauth_cred_t cred, int intrf
 {
        nfsnode_t np = VTONFS(vp);
        struct nfsmount *nmp = VTONMP(vp);
-       int error, rv, slpflag, slptimeo, nflags;
+       int error, slpflag, slptimeo, nflags, retry = 0;
+       int ubcflags = UBC_PUSHALL | UBC_SYNC | UBC_INVALIDATE;
+       struct timespec ts = { 2, 0 };
        off_t size;
 
        FSDBG_TOP(554, np, flags, intrflg, 0);
 
-       if (nmp && !(nmp->nm_flag & NFSMNT_INT))
+       /*
+        * If the mount is gone there is no sense in trying to write
+        * anything and hanging while trying to do I/O.
+        */
+       if (nfs_mount_gone(nmp)) {
+               flags &= ~V_SAVE;
+               ubcflags &= ~UBC_PUSHALL;
+       }
+
+       if (nmp && !NMFLAG(nmp, INTR))
                intrflg = 0;
        if (intrflg) {
                slpflag = PCATCH;
@@ -3651,16 +3658,19 @@ nfs_vinvalbuf2(vnode_t vp, int flags, thread_t thd, kauth_cred_t cred, int intrf
        lck_mtx_lock(nfs_buf_mutex);
        while (np->n_bflag & NBINVALINPROG) {
                np->n_bflag |= NBINVALWANT;
-               error = msleep(&np->n_bflag, nfs_buf_mutex, slpflag, "nfs_vinvalbuf", NULL);
-               if (error) {
+               msleep(&np->n_bflag, nfs_buf_mutex, slpflag, "nfs_vinvalbuf", &ts);
+               if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0))) {
                        lck_mtx_unlock(nfs_buf_mutex);
                        return (error);
                }
+               if (np->n_bflag & NBINVALINPROG)
+                       slpflag = 0;
        }
        np->n_bflag |= NBINVALINPROG;
        lck_mtx_unlock(nfs_buf_mutex);
 
        /* Now, flush as required.  */
+again:
        error = nfs_vinvalbuf_internal(np, flags, thd, cred, slpflag, 0);
        while (error) {
                FSDBG(554, np, 0, 0, error);
@@ -3671,8 +3681,17 @@ nfs_vinvalbuf2(vnode_t vp, int flags, thread_t thd, kauth_cred_t cred, int intrf
 
        /* get the pages out of vm also */
        if (UBCINFOEXISTS(vp) && (size = ubc_getsize(vp)))
-               if (!(rv = ubc_sync_range(vp, 0, size, UBC_PUSHALL | UBC_SYNC | UBC_INVALIDATE)))
-                       panic("nfs_vinvalbuf(): ubc_sync_range failed!");
+               if ((error = ubc_msync(vp, 0, size, NULL, ubcflags))) {
+                       if (error == EINVAL)
+                               panic("nfs_vinvalbuf(): ubc_msync failed!, error %d", error);
+                       if (retry++ < 10) { /* retry invalidating a few times */
+                               if (retry > 1 || error == ENXIO)
+                                       ubcflags &= ~UBC_PUSHALL;
+                               goto again;
+                       }
+                       /* give up */
+                       printf("nfs_vinvalbuf(): ubc_msync failed!, error %d\n", error);
+               }
 done:
        lck_mtx_lock(nfs_buf_mutex);
        nflags = np->n_bflag;
@@ -3685,6 +3704,57 @@ done:
        return (error);
 }
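
The ubc_msync() handling added in the function above is a bounded retry: up to ten attempts, downgrading from push-and-invalidate to invalidate-only once a push has failed (or immediately on ENXIO), then giving up with a log message. A hedged stand-alone sketch of that retry shape follows; flush_fn() is an invented stand-in for ubc_msync(), which is kernel-only, and the flag values are made up for the demo.

/* Illustration only: bounded retry with flag downgrade, mirroring the
 * ubc_msync loop above.  flush_fn() is a stand-in for the real call. */
#include <errno.h>
#include <stdio.h>

#define PUSHALL    0x1
#define INVALIDATE 0x2

static int
flush_fn(int flags)
{
	/* pretend pushing dirty pages fails but plain invalidation works */
	return (flags & PUSHALL) ? ENXIO : 0;
}

int
main(void)
{
	int flags = PUSHALL | INVALIDATE;
	int retry = 0, error;

again:
	error = flush_fn(flags);
	if (error) {
		if (retry++ < 10) {                   /* retry a few times */
			if (retry > 1 || error == ENXIO)
				flags &= ~PUSHALL;    /* stop pushing, just invalidate */
			goto again;
		}
		printf("giving up, error %d\n", error);
	}
	return (0);
}
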
 
+/*
+ * Wait for any busy buffers to complete.
+ */
+void
+nfs_wait_bufs(nfsnode_t np)
+{
+       struct nfsbuf *bp;
+       struct nfsbuflists blist;
+       int error = 0;
+
+       lck_mtx_lock(nfs_buf_mutex);
+       if (!nfs_buf_iterprepare(np, &blist, NBI_CLEAN)) {
+               while ((bp = LIST_FIRST(&blist))) {
+                       LIST_REMOVE(bp, nb_vnbufs);
+                       LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
+                       nfs_buf_refget(bp);
+                       while ((error = nfs_buf_acquire(bp, 0, 0, 0))) {
+                               if (error != EAGAIN) {
+                                       nfs_buf_refrele(bp);
+                                       nfs_buf_itercomplete(np, &blist, NBI_CLEAN);
+                                       lck_mtx_unlock(nfs_buf_mutex);
+                                       return;
+                               }
+                       }
+                       nfs_buf_refrele(bp);
+                       nfs_buf_drop(bp);
+               }
+               nfs_buf_itercomplete(np, &blist, NBI_CLEAN);
+       }
+       if (!nfs_buf_iterprepare(np, &blist, NBI_DIRTY)) {
+               while ((bp = LIST_FIRST(&blist))) {
+                       LIST_REMOVE(bp, nb_vnbufs);
+                       LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
+                       nfs_buf_refget(bp);
+                       while ((error = nfs_buf_acquire(bp, 0, 0, 0))) {
+                               if (error != EAGAIN) {
+                                       nfs_buf_refrele(bp);
+                                       nfs_buf_itercomplete(np, &blist, NBI_DIRTY);
+                                       lck_mtx_unlock(nfs_buf_mutex);
+                                       return;
+                               }
+                       }
+                       nfs_buf_refrele(bp);
+                       nfs_buf_drop(bp);
+               }
+               nfs_buf_itercomplete(np, &blist, NBI_DIRTY);
+       }
+       lck_mtx_unlock(nfs_buf_mutex);
+}
+
+
 /*
  * Add an async I/O request to the mount's async I/O queue and make
  * sure that an nfsiod will service it.
@@ -3698,8 +3768,11 @@ nfs_asyncio_finish(struct nfsreq *req)
 
        FSDBG_TOP(552, nmp, 0, 0, 0);
 again:
-       if (((nmp = req->r_nmp)) == NULL)
+       nmp = req->r_nmp;
+
+       if (nmp == NULL)
                return;
+
        lck_mtx_lock(nfsiod_mutex);
        niod = nmp->nm_niod;
 
@@ -3734,8 +3807,9 @@ again:
                        lck_mtx_unlock(nfsiod_mutex);
                        wakeup(niod);
                } else if (nfsiod_thread_count > 0) {
-                       /* just queue it up on nfsiod mounts queue */
-                       TAILQ_INSERT_TAIL(&nfsiodmounts, nmp, nm_iodlink);
+                       /* just queue it up on the nfsiod mounts queue if needed */
+                       if (nmp->nm_iodlink.tqe_next == NFSNOLIST)
+                               TAILQ_INSERT_TAIL(&nfsiodmounts, nmp, nm_iodlink);
                        lck_mtx_unlock(nfsiod_mutex);
                } else {
                        printf("nfs_asyncio(): no nfsiods? %d %d (%d)\n", nfsiod_thread_count, NFSIOD_MAX, started);
@@ -3759,11 +3833,11 @@ nfs_asyncio_resend(struct nfsreq *req)
 {
        struct nfsmount *nmp = req->r_nmp;
 
-       if (!nmp)
+       if (nfs_mount_gone(nmp))
                return;
        nfs_gss_clnt_rpcdone(req);
        lck_mtx_lock(&nmp->nm_lock);
-       if (req->r_rchain.tqe_next == NFSREQNOLIST) {
+       if (!(req->r_flags & R_RESENDQ)) {
                TAILQ_INSERT_TAIL(&nmp->nm_resendq, req, r_rchain);
                req->r_flags |= R_RESENDQ;
        }
@@ -3772,64 +3846,29 @@ nfs_asyncio_resend(struct nfsreq *req)
 }
 
 /*
- * Read an NFS buffer for a directory.
+ * Read directory data into a buffer.
+ *
+ * Buffer will be filled (unless EOF is hit).
+ * Buffers after this one may also be completely/partially filled.
  */
 int
 nfs_buf_readdir(struct nfsbuf *bp, vfs_context_t ctx)
 {
-       nfsnode_t np;
-       vnode_t vp;
-       struct nfsmount *nmp;
-       int error = 0, nfsvers;
-       struct uio uio;
-       struct iovec_32 io;
+       nfsnode_t np = bp->nb_np;
+       struct nfsmount *nmp = NFSTONMP(np);
+       int error = 0;
 
-       np = bp->nb_np;
-       vp = NFSTOV(np);
-       nmp = VTONMP(vp);
-       nfsvers = nmp->nm_vers;
-       uio.uio_iovs.iov32p = &io;
-       uio.uio_iovcnt = 1;
-#if 1   /* LP64todo - can't use new segment flags until the drivers are ready */
-       uio.uio_segflg = UIO_SYSSPACE;
-#else
-       uio.uio_segflg = UIO_SYSSPACE32;
-#endif
-
-       /* sanity check */
-       if (ISSET(bp->nb_flags, NB_DONE))
-               CLR(bp->nb_flags, NB_DONE);
+       if (nfs_mount_gone(nmp))
+               return (ENXIO);
 
-       uio.uio_rw = UIO_READ;
-       io.iov_len = bp->nb_bufsize;
-       uio_uio_resid_set(&uio, io.iov_len);
-       io.iov_base = (uintptr_t) bp->nb_data;
-       uio.uio_offset = NBOFF(bp);
-
-       OSAddAtomic(1, (SInt32*)&nfsstats.readdir_bios);
-       if (nfsvers < NFS_VER4) {
-               if (nmp->nm_flag & NFSMNT_RDIRPLUS) {
-                       error = nfs3_readdirplus_rpc(np, &uio, ctx);
-                       if (error == NFSERR_NOTSUPP) {
-                               lck_mtx_lock(&nmp->nm_lock);
-                               nmp->nm_flag &= ~NFSMNT_RDIRPLUS;
-                               lck_mtx_unlock(&nmp->nm_lock);
-                       }
-               }
-               if (!(nmp->nm_flag & NFSMNT_RDIRPLUS))
-                       error = nfs3_readdir_rpc(np, &uio, ctx);
-       } else {
-               error = nfs4_readdir_rpc(np, &uio, ctx);
-       }
-       if (error) {
+       if (nmp->nm_vers < NFS_VER4)
+               error = nfs3_readdir_rpc(np, bp, ctx);
+       else
+               error = nfs4_readdir_rpc(np, bp, ctx);
+
+       if (error && (error != NFSERR_DIRBUFDROPPED)) {
                SET(bp->nb_flags, NB_ERROR);
                bp->nb_error = error;
-       } else {
-               bp->nb_validoff = 0;
-               bp->nb_validend = uio.uio_offset - NBOFF(bp);
-               bp->nb_valid = (1 << (round_page_32(bp->nb_validend)/PAGE_SIZE)) - 1;
        }
-
-       nfs_buf_iodone(bp);
        return (error);
 }