]> git.saurik.com Git - apple/xnu.git/blobdiff - bsd/nfs/nfs_bio.c
xnu-3789.70.16.tar.gz
[apple/xnu.git] / bsd / nfs / nfs_bio.c
index b1dccb036b823f4b54e52ec78ac257d53b4b685a..acaf26c24a6c222e155640eae44d7ffda2d22fe2 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2015 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -77,6 +77,7 @@
 #include <sys/kernel.h>
 #include <sys/ubc_internal.h>
 #include <sys/uio_internal.h>
+#include <sys/kpi_mbuf.h>
 
 #include <sys/vm.h>
 #include <sys/vmparam.h>
@@ -96,6 +97,8 @@
 #include <sys/buf_internal.h>
 #include <libkern/OSAtomic.h>
 
+#define NFS_BIO_DBG(...) NFS_DBG(NFS_FAC_BIO, 7, ## __VA_ARGS__)
+
 kern_return_t  thread_terminate(thread_t); /* XXX */
 
 #define        NFSBUFHASH(np, lbn)     \
@@ -345,7 +348,7 @@ nfs_buf_page_inval(vnode_t vp, off_t offset)
        struct nfsbuf *bp;
        int error = 0;
 
-       if (!nmp)
+       if (nfs_mount_gone(nmp))
                return (ENXIO);
 
        lck_mtx_lock(nfs_buf_mutex);
@@ -364,11 +367,20 @@ nfs_buf_page_inval(vnode_t vp, off_t offset)
         */
        if (bp->nb_dirtyend > 0) {
                int start = offset - NBOFF(bp);
-               if (bp->nb_dirtyend <= start ||
-                   bp->nb_dirtyoff >= (start + PAGE_SIZE))
-                       error = 0;
-               else
+               if ((bp->nb_dirtyend > start) &&
+                   (bp->nb_dirtyoff < (start + PAGE_SIZE))) {
+                       /*
+                        * Before returning the bad news, move the
+                        * buffer to the start of the delwri list and
+                        * give the list a push to try to flush the
+                        * buffer out.
+                        */
                        error = EBUSY;
+                       nfs_buf_remfree(bp);
+                       TAILQ_INSERT_HEAD(&nfsbufdelwri, bp, nb_free);
+                       nfsbufdelwricnt++;
+                       nfs_buf_delwri_push(1);
+               }
        }
 out:
        lck_mtx_unlock(nfs_buf_mutex);
@@ -481,7 +493,7 @@ nfs_buf_map(struct nfsbuf *bp)
        if (!ISSET(bp->nb_flags, NB_PAGELIST))
                return (EINVAL);
 
-       kret = ubc_upl_map(bp->nb_pagelist, (vm_address_t *)&(bp->nb_data));
+       kret = ubc_upl_map(bp->nb_pagelist, (vm_offset_t *)&(bp->nb_data));
        if (kret != KERN_SUCCESS)
                panic("nfs_buf_map: ubc_upl_map() failed with (%d)", kret);
        if (bp->nb_data == 0)
@@ -523,7 +535,7 @@ nfs_buf_normalize_valid_range(nfsnode_t np, struct nfsbuf *bp)
  * process some entries on the delayed write queue
  * (must be called with nfs_buf_mutex held)
  */
-static void
+void
 nfs_buf_delwri_service(void)
 {
        struct nfsbuf *bp;
@@ -565,7 +577,7 @@ nfs_buf_delwri_service(void)
 /*
  * thread to service the delayed write queue when asked
  */
-static void
+void
 nfs_buf_delwri_thread(__unused void *arg, __unused wait_result_t wr)
 {
        struct timespec ts = { 30, 0 };
@@ -585,7 +597,7 @@ nfs_buf_delwri_thread(__unused void *arg, __unused wait_result_t wr)
  * try to push out some delayed/uncommitted writes
  * ("locked" indicates whether nfs_buf_mutex is already held)
  */
-static void
+void
 nfs_buf_delwri_push(int locked)
 {
        if (TAILQ_EMPTY(&nfsbufdelwri))
@@ -627,7 +639,7 @@ int
 nfs_buf_get(
        nfsnode_t np,
        daddr64_t blkno,
-       int size,
+       uint32_t size,
        thread_t thd,
        int flags,
        struct nfsbuf **bpp)
@@ -635,7 +647,7 @@ nfs_buf_get(
        vnode_t vp = NFSTOV(np);
        struct nfsmount *nmp = VTONMP(vp);
        struct nfsbuf *bp;
-       int bufsize;
+       uint32_t bufsize;
        int slpflag = PCATCH;
        int operation = (flags & NBLK_OPMASK);
        int error = 0;
@@ -648,14 +660,14 @@ nfs_buf_get(
        if (bufsize > NFS_MAXBSIZE)
                panic("nfs_buf_get: buffer larger than NFS_MAXBSIZE requested");
 
-       if (!nmp) {
+       if (nfs_mount_gone(nmp)) {
                FSDBG_BOT(541, np, blkno, 0, ENXIO);
                return (ENXIO);
        }
 
        if (!UBCINFOEXISTS(vp)) {
                operation = NBLK_META;
-       } else if (bufsize < nmp->nm_biosize) {
+       } else if (bufsize < (uint32_t)nmp->nm_biosize) {
                /* reg files should always have biosize blocks */
                bufsize = nmp->nm_biosize;
        }
@@ -675,6 +687,21 @@ nfs_buf_get(
 loop:
        lck_mtx_lock(nfs_buf_mutex);
 
+       /* wait for any buffer invalidation/flushing to complete */
+       while (np->n_bflag & NBINVALINPROG) {
+               np->n_bflag |= NBINVALWANT;
+               ts.tv_sec = 2;
+               ts.tv_nsec = 0;
+               msleep(&np->n_bflag, nfs_buf_mutex, slpflag, "nfs_buf_get_invalwait", &ts);
+               if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0))) {
+                       lck_mtx_unlock(nfs_buf_mutex);
+                       FSDBG_BOT(541, np, blkno, 0, error);
+                       return (error);
+               }
+               if (np->n_bflag & NBINVALINPROG)
+                       slpflag = 0;
+       }
+
        /* check for existence of nfsbuf in cache */
        if ((bp = nfs_buf_incore(np, blkno))) {
                /* if busy, set wanted and wait */
@@ -855,8 +882,8 @@ loop:
                }
        }
 
-       /* setup nfsbuf */
-       bp->nb_lflags = NBL_BUSY;
+       /* set up nfsbuf */
+       SET(bp->nb_lflags, NBL_BUSY);
        bp->nb_flags = 0;
        bp->nb_lblkno = blkno;
        /* insert buf in hash */
@@ -969,9 +996,9 @@ nfs_buf_release(struct nfsbuf *bp, int freeup)
 
        vp = np ? NFSTOV(np) : NULL;
        if (vp && UBCINFOEXISTS(vp) && bp->nb_bufsize) {
-               int upl_flags;
+               int upl_flags, rv;
                upl_t upl;
-               int i, rv;
+               uint32_t i;
 
                if (!ISSET(bp->nb_flags, NB_PAGELIST) && !ISSET(bp->nb_flags, NB_INVAL)) {
                        rv = nfs_buf_upl_setup(bp);
@@ -1012,6 +1039,10 @@ nfs_buf_release(struct nfsbuf *bp, int freeup)
                                        upl_flags = UPL_COMMIT_SET_DIRTY;
                                else
                                        upl_flags = UPL_COMMIT_CLEAR_DIRTY;
+                               
+                               if (!ISSET(bp->nb_flags, (NB_NEEDCOMMIT | NB_DELWRI)))
+                                       upl_flags |= UPL_COMMIT_CLEAR_PRECIOUS;
+
                                ubc_upl_commit_range(upl,
                                        i*PAGE_SIZE, PAGE_SIZE,
                                        upl_flags |
@@ -1020,15 +1051,16 @@ nfs_buf_release(struct nfsbuf *bp, int freeup)
                        }
                }
 pagelist_cleanup_done:
-               /* was this the last buffer in the file? */
+               /* invalidate any pages past EOF */
                if (NBOFF(bp) + bp->nb_bufsize > (off_t)(np->n_size)) {
-                       /* if so, invalidate all pages of last buffer past EOF */
                        off_t start, end;
                        start = trunc_page_64(np->n_size) + PAGE_SIZE_64;
                        end = trunc_page_64(NBOFF(bp) + bp->nb_bufsize);
+                       if (start < NBOFF(bp))
+                               start = NBOFF(bp);
                        if (end > start) {
-                               if (!(rv = ubc_sync_range(vp, start, end, UBC_INVALIDATE)))
-                                       printf("nfs_buf_release(): ubc_sync_range failed!\n");
+                               if ((rv = ubc_msync(vp, start, end, NULL, UBC_INVALIDATE)))
+                                       printf("nfs_buf_release(): ubc_msync failed!, error %d\n", rv);
                        }
                }
                CLR(bp->nb_flags, NB_PAGELIST);
@@ -1172,6 +1204,9 @@ nfs_buf_iodone(struct nfsbuf *bp)
                 * any throttled write operations
                 */
                vnode_writedone(NFSTOV(bp->nb_np));
+               nfs_node_lock_force(bp->nb_np);
+               bp->nb_np->n_numoutput--;
+               nfs_node_unlock(bp->nb_np);
        }
        if (ISSET(bp->nb_flags, NB_ASYNC)) {    /* if async, release it */
                SET(bp->nb_flags, NB_DONE);             /* note that it's done */
@@ -1219,9 +1254,9 @@ nfs_buf_write_delayed(struct nfsbuf *bp)
        vnode_waitforwrites(NFSTOV(np), VNODE_ASYNC_THROTTLE, 0, 0, "nfs_buf_write_delayed");
 
        /* the file is in a modified state, so make sure the flag's set */
-       nfs_lock(np, NFS_NODE_LOCK_FORCE);
+       nfs_node_lock_force(np);
        np->n_flag |= NMODIFIED;
-       nfs_unlock(np);
+       nfs_node_unlock(np);
 
        /*
         * If we have too many delayed write buffers,
@@ -1258,7 +1293,7 @@ nfs_buf_check_write_verifier(nfsnode_t np, struct nfsbuf *bp)
                return;
 
        nmp = NFSTONMP(np);
-       if (!nmp)
+       if (nfs_mount_gone(nmp))
                return;
        if (!ISSET(bp->nb_flags, NB_STALEWVERF) && (bp->nb_verf == nmp->nm_verf))
                return;
@@ -1266,10 +1301,10 @@ nfs_buf_check_write_verifier(nfsnode_t np, struct nfsbuf *bp)
        /* write verifier changed, clear commit/wverf flags */
        CLR(bp->nb_flags, (NB_NEEDCOMMIT | NB_STALEWVERF));
        bp->nb_verf = 0;
-       nfs_lock(np, NFS_NODE_LOCK_FORCE);
+       nfs_node_lock_force(np);
        np->n_needcommitcnt--;
        CHECK_NEEDCOMMITCNT(np);
-       nfs_unlock(np);
+       nfs_node_unlock(np);
 }
 
 /*
@@ -1303,7 +1338,7 @@ nfs_buf_acquire(struct nfsbuf *bp, int flags, int slpflag, int slptimeo)
 
        if (ISSET(bp->nb_lflags, NBL_BUSY)) {
                /*      
-                * since the mutex_lock may block, the buffer
+                * since the lck_mtx_lock may block, the buffer
                 * may become BUSY, so we need to recheck for
                 * a NOWAIT request
                 */
@@ -1442,7 +1477,7 @@ nfs_buf_read(struct nfsbuf *bp)
 
        NFS_BUF_MAP(bp);
 
-       OSAddAtomic(1, (SInt32 *)&nfsstats.read_bios);
+       OSAddAtomic64(1, &nfsstats.read_bios);
 
        error = nfs_buf_read_rpc(bp, thd, cred);
        /*
@@ -1468,7 +1503,7 @@ nfs_buf_read_finish(struct nfsbuf *bp)
                /* update valid range */
                bp->nb_validoff = 0;
                bp->nb_validend = bp->nb_endio;
-               if (bp->nb_endio < bp->nb_bufsize) { 
+               if (bp->nb_endio < (int)bp->nb_bufsize) { 
                        /*
                         * The read may be short because we have unflushed writes
                         * that are extending the file size and the reads hit the
@@ -1491,7 +1526,7 @@ nfs_buf_read_finish(struct nfsbuf *bp)
                bp->nb_valid = (1 << (round_page_32(bp->nb_validend) / PAGE_SIZE)) - 1;
                if (bp->nb_validend & PAGE_MASK) {
                        /* zero-fill remainder of last page */
-                       bzero(bp->nb_data + bp->nb_validend, bp->nb_bufsize - bp->nb_validend);
+                       bzero(bp->nb_data + bp->nb_validend, PAGE_SIZE - (bp->nb_validend & PAGE_MASK));
                }
        }
        nfs_buf_iodone(bp);
@@ -1506,13 +1541,14 @@ nfs_buf_read_rpc(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred)
        struct nfsmount *nmp;
        nfsnode_t np = bp->nb_np;
        int error = 0, nfsvers, async;
-       int offset, length, nmrsize, nrpcs, len;
+       int offset, nrpcs;
+       uint32_t nmrsize, length, len;
        off_t boff;
        struct nfsreq *req;
        struct nfsreq_cbinfo cb;
 
        nmp = NFSTONMP(np);
-       if (!nmp) {
+       if (nfs_mount_gone(nmp)) {
                bp->nb_error = error = ENXIO;
                SET(bp->nb_flags, NB_ERROR);
                nfs_buf_iodone(bp);
@@ -1557,6 +1593,8 @@ nfs_buf_read_rpc(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred)
                len = (length > nmrsize) ? nmrsize : length;
                cb.rcb_args[0] = offset;
                cb.rcb_args[1] = len;
+               if (nmp->nm_vers >= NFS_VER4)
+                       cb.rcb_args[2] = nmp->nm_stategenid;
                req = NULL;
                error = nmp->nm_funcs->nf_read_rpc_async(np, boff + offset, len, thd, cred, &cb, &req);
                if (error)
@@ -1618,8 +1656,8 @@ nfs_buf_read_rpc_finish(struct nfsreq *req)
        nfsnode_t np;
        thread_t thd;
        kauth_cred_t cred;
-       struct uio uio;
-       struct iovec_32 io;
+       uio_t auio;
+       char uio_buf [ UIO_SIZEOF(1) ];
 
 finish:
        np = req->r_np;
@@ -1629,9 +1667,11 @@ finish:
                kauth_cred_ref(cred);
        cb = req->r_callback;
        bp = cb.rcb_bp;
+       if (cb.rcb_func) /* take an extra reference on the nfsreq in case we want to resend it later due to grace error */
+               nfs_request_ref(req, 0);
 
        nmp = NFSTONMP(np);
-       if (!nmp) {
+       if (nfs_mount_gone(nmp)) {
                SET(bp->nb_flags, NB_ERROR);
                bp->nb_error = error = ENXIO;
        }
@@ -1645,28 +1685,65 @@ finish:
        offset = cb.rcb_args[0];
        rlen = length = cb.rcb_args[1];
 
-       uio.uio_iovs.iov32p = &io;
-       uio.uio_iovcnt = 1;
-       uio.uio_rw = UIO_READ;
-#if 1  /* LP64todo - can't use new segment flags until the drivers are ready */
-       uio.uio_segflg = UIO_SYSSPACE;
-#else
-       uio.uio_segflg = UIO_SYSSPACE32;
-#endif
-       io.iov_len = length;
-       uio_uio_resid_set(&uio, io.iov_len);
-       uio.uio_offset = NBOFF(bp) + offset;
-       io.iov_base = (uintptr_t) bp->nb_data + offset;
+       auio = uio_createwithbuffer(1, NBOFF(bp) + offset, UIO_SYSSPACE,
+                                UIO_READ, &uio_buf, sizeof(uio_buf));
+       uio_addiov(auio, CAST_USER_ADDR_T(bp->nb_data + offset), length);
 
        /* finish the RPC */
-       error = nmp->nm_funcs->nf_read_rpc_async_finish(np, req, &uio, &rlen, &eof);
+       error = nmp->nm_funcs->nf_read_rpc_async_finish(np, req, auio, &rlen, &eof);
        if ((error == EINPROGRESS) && cb.rcb_func) {
                /* async request restarted */
+               if (cb.rcb_func)
+                       nfs_request_rele(req);
                if (IS_VALID_CRED(cred))
                        kauth_cred_unref(&cred);
                return;
        }
-
+       if ((nmp->nm_vers >= NFS_VER4) && nfs_mount_state_error_should_restart(error) && !ISSET(bp->nb_flags, NB_ERROR)) {
+               lck_mtx_lock(&nmp->nm_lock);
+               if ((error != NFSERR_OLD_STATEID) && (error != NFSERR_GRACE) && (cb.rcb_args[2] == nmp->nm_stategenid)) {
+                       NP(np, "nfs_buf_read_rpc_finish: error %d @ 0x%llx, 0x%x 0x%x, initiating recovery",
+                               error, NBOFF(bp)+offset, cb.rcb_args[2], nmp->nm_stategenid);
+                       nfs_need_recover(nmp, error);
+               }
+               lck_mtx_unlock(&nmp->nm_lock);
+               if (np->n_flag & NREVOKE) {
+                       error = EIO;
+               } else {
+                       if (error == NFSERR_GRACE) {
+                               if (cb.rcb_func) {
+                                       /*
+                                        * For an async I/O request, handle a grace delay just like
+                                        * jukebox errors.  Set the resend time and queue it up.
+                                        */
+                                       struct timeval now;
+                                       if (req->r_nmrep.nmc_mhead) {
+                                               mbuf_freem(req->r_nmrep.nmc_mhead);
+                                               req->r_nmrep.nmc_mhead = NULL;
+                                       }
+                                       req->r_error = 0;
+                                       microuptime(&now);
+                                       lck_mtx_lock(&req->r_mtx);
+                                       req->r_resendtime = now.tv_sec + 2;
+                                       req->r_xid = 0;                 // get a new XID
+                                       req->r_flags |= R_RESTART;
+                                       req->r_start = 0;
+                                       nfs_asyncio_resend(req);
+                                       lck_mtx_unlock(&req->r_mtx);
+                                       if (IS_VALID_CRED(cred))
+                                               kauth_cred_unref(&cred);
+                                       /* Note: nfsreq reference taken will be dropped later when finished */
+                                       return;
+                               }
+                               /* otherwise, just pause a couple seconds and retry */
+                               tsleep(&nmp->nm_state, (PZERO-1), "nfsgrace", 2*hz);
+                       }
+                       if (!(error = nfs_mount_state_wait_for_recovery(nmp))) {
+                               rlen = 0;
+                               goto readagain;
+                       }
+               }
+       }
        if (error) {
                SET(bp->nb_flags, NB_ERROR);
                bp->nb_error = error;
@@ -1692,19 +1769,24 @@ finish:
                 * requested, so we need to issue another read for the rest.
                 * (Don't bother if the buffer already hit an error.)
                 */
+readagain:
                offset += rlen;
                length -= rlen;
                cb.rcb_args[0] = offset;
                cb.rcb_args[1] = length;
-               error = nmp->nm_funcs->nf_read_rpc_async(np, offset, length, thd, cred, &cb, &rreq);
+               if (nmp->nm_vers >= NFS_VER4)
+                       cb.rcb_args[2] = nmp->nm_stategenid;
+               error = nmp->nm_funcs->nf_read_rpc_async(np, NBOFF(bp) + offset, length, thd, cred, &cb, &rreq);
                if (!error) {
                        if (IS_VALID_CRED(cred))
                                kauth_cred_unref(&cred);
                        if (!cb.rcb_func) {
                                /* if !async we'll need to wait for this RPC to finish */
                                req = rreq;
+                               rreq = NULL;
                                goto finish;
                        }
+                       nfs_request_rele(req);
                        /*
                         * We're done here.
                         * Outstanding RPC count is unchanged.
@@ -1717,6 +1799,8 @@ finish:
        }
 
 out:
+       if (cb.rcb_func)
+               nfs_request_rele(req);
        if (IS_VALID_CRED(cred))
                kauth_cred_unref(&cred);
 
@@ -1752,14 +1836,15 @@ out:
  * Do buffer readahead.
  * Initiate async I/O to read buffers not in cache.
  */
-static int
+int
 nfs_buf_readahead(nfsnode_t np, int ioflag, daddr64_t *rabnp, daddr64_t lastrabn, thread_t thd, kauth_cred_t cred)
 {
        struct nfsmount *nmp = NFSTONMP(np);
        struct nfsbuf *bp;
-       int error = 0, nra;
+       int error = 0;
+       uint32_t nra;
 
-       if (!nmp)
+       if (nfs_mount_gone(nmp))
                return (ENXIO);
        if (nmp->nm_readahead <= 0)
                return (0);
@@ -1768,9 +1853,17 @@ nfs_buf_readahead(nfsnode_t np, int ioflag, daddr64_t *rabnp, daddr64_t lastrabn
 
        for (nra = 0; (nra < nmp->nm_readahead) && (*rabnp <= lastrabn); nra++, *rabnp = *rabnp + 1) {
                /* check if block exists and is valid. */
+               if ((*rabnp * nmp->nm_biosize) >= (off_t)np->n_size) {
+                       /* stop reading ahead if we're beyond EOF */
+                       *rabnp = lastrabn;
+                       break;
+               }
                error = nfs_buf_get(np, *rabnp, nmp->nm_biosize, thd, NBLK_READ|NBLK_NOWAIT, &bp);
                if (error)
                        break;
+               nfs_node_lock_force(np);
+               np->n_lastrahead = *rabnp;
+               nfs_node_unlock(np);
                if (!bp)
                        continue;
                if ((ioflag & IO_NOCACHE) && ISSET(bp->nb_flags, NB_CACHE) &&
@@ -1798,209 +1891,161 @@ nfs_buf_readahead(nfsnode_t np, int ioflag, daddr64_t *rabnp, daddr64_t lastrabn
 }
 
 /*
- * NFS buffer I/O for reading files/directories.
+ * NFS buffer I/O for reading files.
  */
 int
-nfs_bioread(nfsnode_t np, struct uio *uio, int ioflag, int *eofflag, vfs_context_t ctx)
+nfs_bioread(nfsnode_t np, uio_t uio, int ioflag, vfs_context_t ctx)
 {
        vnode_t vp = NFSTOV(np);
        struct nfsbuf *bp = NULL;
-       struct nfs_vattr nvattr;
        struct nfsmount *nmp = VTONMP(vp);
-       daddr64_t lbn, rabn = 0, lastrabn, maxrabn = -1, tlbn;
+       daddr64_t lbn, rabn = 0, lastrabn, maxrabn = -1;
        off_t diff;
        int error = 0, n = 0, on = 0;
-       int nfsvers, biosize;
-       caddr_t dp;
-       struct dirent *direntp = NULL;
-       enum vtype vtype;
+       int nfsvers, biosize, modified, readaheads = 0;
        thread_t thd;
        kauth_cred_t cred;
+       int64_t io_resid;
 
-       FSDBG_TOP(514, np, uio->uio_offset, uio_uio_resid(uio), ioflag);
-
-       if (uio_uio_resid(uio) == 0) {
-               FSDBG_BOT(514, np, 0xd1e0001, 0, 0);
-               return (0);
-       }
-       if (uio->uio_offset < 0) {
-               FSDBG_BOT(514, np, 0xd1e0002, 0, EINVAL);
-               return (EINVAL);
-       }
+       FSDBG_TOP(514, np, uio_offset(uio), uio_resid(uio), ioflag);
 
        nfsvers = nmp->nm_vers;
        biosize = nmp->nm_biosize;
        thd = vfs_context_thread(ctx);
        cred = vfs_context_ucred(ctx);
 
-       vtype = vnode_vtype(vp);
-       if ((vtype != VREG) && (vtype != VDIR)) {
-               printf("nfs_bioread: type %x unexpected\n", vtype);
+       if (vnode_vtype(vp) != VREG) {
+               printf("nfs_bioread: type %x unexpected\n", vnode_vtype(vp));
                FSDBG_BOT(514, np, 0xd1e0016, 0, EINVAL);
                return (EINVAL);
        }
 
        /*
-        * For nfs, cache consistency can only be maintained approximately.
+        * For NFS, cache consistency can only be maintained approximately.
         * Although RFC1094 does not specify the criteria, the following is
         * believed to be compatible with the reference port.
-        * For nfs:
-        * If the file's modify time on the server has changed since the
-        * last read rpc or you have written to the file,
-        * you may have lost data cache consistency with the
-        * server, so flush all of the file's data out of the cache.
-        * Then force a getattr rpc to ensure that you have up to date
-        * attributes.
+        * 
+        * If the file has changed since the last read RPC or you have
+        * written to the file, you may have lost data cache consistency
+        * with the server.  So, check for a change, and flush all of the
+        * file's data out of the cache.
         * NB: This implies that cache data can be read when up to
-        * NFS_MAXATTRTIMEO seconds out of date. If you find that you need
-        * current attributes this could be forced by calling
-        * NATTRINVALIDATE() before the nfs_getattr() call.
+        * NFS_MAXATTRTIMO seconds out of date. If you find that you
+        * need current attributes, nfs_getattr() can be forced to fetch
+        * new attributes (via NATTRINVALIDATE() or NGA_UNCACHED).
         */
 
        if (ISSET(np->n_flag, NUPDATESIZE))
                nfs_data_update_size(np, 0);
 
-       if ((error = nfs_lock(np, NFS_NODE_LOCK_EXCLUSIVE))) {
+       if ((error = nfs_node_lock(np))) {
                FSDBG_BOT(514, np, 0xd1e0222, 0, error);
                return (error);
        }
 
        if (np->n_flag & NNEEDINVALIDATE) {
                np->n_flag &= ~NNEEDINVALIDATE;
-               nfs_unlock(np);
-               nfs_vinvalbuf(vp, V_SAVE|V_IGNORE_WRITEERR, ctx, 1);
-               if ((error = nfs_lock(np, NFS_NODE_LOCK_EXCLUSIVE))) {
+               nfs_node_unlock(np);
+               error = nfs_vinvalbuf(vp, V_SAVE|V_IGNORE_WRITEERR, ctx, 1);
+               if (!error)
+                       error = nfs_node_lock(np);
+               if (error) {
                        FSDBG_BOT(514, np, 0xd1e0322, 0, error);
                        return (error);
                }
        }
 
-       if (np->n_flag & NMODIFIED) {
-               if (vtype == VDIR) {
-                       nfs_invaldir(np);
-                       nfs_unlock(np);
-                       error = nfs_vinvalbuf(vp, V_SAVE, ctx, 1);
-                       if (!error)
-                               error = nfs_lock(np, NFS_NODE_LOCK_EXCLUSIVE);
-                       if (error) {
-                               FSDBG_BOT(514, np, 0xd1e0003, 0, error);
-                               return (error);
-                       }
-               }
-               NATTRINVALIDATE(np);
-               error = nfs_getattr(np, &nvattr, ctx, 1);
-               if (error) {
-                       nfs_unlock(np);
-                       FSDBG_BOT(514, np, 0xd1e0004, 0, error);
-                       return (error);
-               }
-               if (vtype == VDIR) {
-                       /* if directory changed, purge any name cache entries */
-                       if (NFS_CHANGED_NC(nfsvers, np, &nvattr))
-                               cache_purge(vp);
-                       NFS_CHANGED_UPDATE_NC(nfsvers, np, &nvattr);
-               }
-               NFS_CHANGED_UPDATE(nfsvers, np, &nvattr);
-       } else {
-               error = nfs_getattr(np, &nvattr, ctx, 1);
-               if (error) {
-                       nfs_unlock(np);
-                       FSDBG_BOT(514, np, 0xd1e0005, 0, error);
-                       return (error);
-               }
-               if (NFS_CHANGED(nfsvers, np, &nvattr)) {
-                       if (vtype == VDIR) {
-                               nfs_invaldir(np);
-                               /* purge name cache entries */
-                               if (NFS_CHANGED_NC(nfsvers, np, &nvattr))
-                                       cache_purge(vp);
-                       }
-                       nfs_unlock(np);
-                       error = nfs_vinvalbuf(vp, V_SAVE, ctx, 1);
-                       if (!error)
-                               error = nfs_lock(np, NFS_NODE_LOCK_EXCLUSIVE);
-                       if (error) {
-                               FSDBG_BOT(514, np, 0xd1e0006, 0, error);
-                               return (error);
-                       }
-                       if (vtype == VDIR)
-                               NFS_CHANGED_UPDATE_NC(nfsvers, np, &nvattr);
-                       NFS_CHANGED_UPDATE(nfsvers, np, &nvattr);
-               }
+       modified = (np->n_flag & NMODIFIED);
+       nfs_node_unlock(np);
+       /* nfs_getattr() will check changed and purge caches */
+       error = nfs_getattr(np, NULL, ctx, modified ? NGA_UNCACHED : NGA_CACHED);
+       if (error) {
+               FSDBG_BOT(514, np, 0xd1e0004, 0, error);
+               return (error);
        }
 
-       nfs_unlock(np);
+       if (uio_resid(uio) == 0) {
+               FSDBG_BOT(514, np, 0xd1e0001, 0, 0);
+               return (0);
+       }
+       if (uio_offset(uio) < 0) {
+               FSDBG_BOT(514, np, 0xd1e0002, 0, EINVAL);
+               return (EINVAL);
+       }
 
-       if (vtype == VREG) {
-               if ((ioflag & IO_NOCACHE) && (uio_uio_resid(uio) < (2*biosize))) {
-                       /* We have only a block or so to read, just do the rpc directly. */
-                       error = nfs_read_rpc(np, uio, ctx);
-                       FSDBG_BOT(514, np, uio->uio_offset, uio_uio_resid(uio), error);
-                       return (error);
-               }
-               /*
-                * set up readahead - which may be limited by:
-                * + current request length (for IO_NOCACHE)
-                * + readahead setting
-                * + file size
-                */
-               if (nmp->nm_readahead > 0) {
-                       off_t end = uio->uio_offset + uio_uio_resid(uio);
-                       if (end > (off_t)np->n_size)
-                               end = np->n_size;
-                       rabn = uio->uio_offset / biosize;
-                       maxrabn = (end - 1) / biosize;
-                       if (!(ioflag & IO_NOCACHE) &&
-                           (!rabn || (rabn == np->n_lastread) || (rabn == (np->n_lastread+1)))) {
-                               maxrabn += nmp->nm_readahead;
-                               if ((maxrabn * biosize) >= (off_t)np->n_size)
-                                       maxrabn = ((off_t)np->n_size - 1)/biosize;
-                       }
-               } else {
-                       rabn = maxrabn = 0;
-               }
+       /*
+        * set up readahead - which may be limited by:
+        * + current request length (for IO_NOCACHE)
+        * + readahead setting
+        * + file size
+        */
+       if (nmp->nm_readahead > 0) {
+               off_t end = uio_offset(uio) + uio_resid(uio);
+               if (end > (off_t)np->n_size)
+                       end = np->n_size;
+               rabn = uio_offset(uio) / biosize;
+               maxrabn = (end - 1) / biosize;
+               nfs_node_lock_force(np);
+               if (!(ioflag & IO_NOCACHE) &&
+                   (!rabn || (rabn == np->n_lastread) || (rabn == (np->n_lastread+1)))) {
+                       maxrabn += nmp->nm_readahead;
+                       if ((maxrabn * biosize) >= (off_t)np->n_size)
+                               maxrabn = ((off_t)np->n_size - 1)/biosize;
+               }
+               if (maxrabn < np->n_lastrahead)
+                       np->n_lastrahead = -1;
+               if (rabn < np->n_lastrahead)
+                       rabn = np->n_lastrahead + 1;
+               nfs_node_unlock(np);
+       } else {
+               rabn = maxrabn = 0;
        }
 
        do {
 
-           if (vtype == VREG) {
-               nfs_data_lock(np, NFS_NODE_LOCK_SHARED);
-               lbn = uio->uio_offset / biosize;
+               nfs_data_lock(np, NFS_DATA_LOCK_SHARED);
+               lbn = uio_offset(uio) / biosize;
 
                /*
                 * Copy directly from any cached pages without grabbing the bufs.
-                *
-                * Note: for "nocache" reads, we don't copy directly from UBC
-                * because any cached pages will be for readahead buffers that
-                * need to be invalidated anyway before we finish this request.
+                * (If we are NOCACHE and we've issued readahead requests, we need
+                * to grab the NB_NCRDAHEAD bufs to drop them.)
                 */
-               if (!(ioflag & IO_NOCACHE) &&
-                       (uio->uio_segflg == UIO_USERSPACE32 ||
-                        uio->uio_segflg == UIO_USERSPACE64 ||
-                        uio->uio_segflg == UIO_USERSPACE)) {
-                       // LP64todo - fix this!
-                       int io_resid = uio_uio_resid(uio);
-                       diff = np->n_size - uio->uio_offset;
+               if ((!(ioflag & IO_NOCACHE) || !readaheads) &&
+                   ((uio->uio_segflg == UIO_USERSPACE32 ||
+                     uio->uio_segflg == UIO_USERSPACE64 ||
+                     uio->uio_segflg == UIO_USERSPACE))) {
+                       io_resid = uio_resid(uio);
+                       diff = np->n_size - uio_offset(uio);
                        if (diff < io_resid)
                                io_resid = diff;
                        if (io_resid > 0) {
-                               error = cluster_copy_ubc_data(vp, uio, &io_resid, 0);
+                               int count = (io_resid > INT_MAX) ? INT_MAX : io_resid;
+                               error = cluster_copy_ubc_data(vp, uio, &count, 0);
                                if (error) {
                                        nfs_data_unlock(np);
-                                       FSDBG_BOT(514, np, uio->uio_offset, 0xcacefeed, error);
+                                       FSDBG_BOT(514, np, uio_offset(uio), 0xcacefeed, error);
                                        return (error);
                                }
                        }
                        /* count any biocache reads that we just copied directly */
-                       if (lbn != (uio->uio_offset / biosize)) {
-                               OSAddAtomic((uio->uio_offset / biosize) - lbn, (SInt32*)&nfsstats.biocache_reads);
-                               FSDBG(514, np, 0xcacefeed, uio->uio_offset, error);
+                       if (lbn != (uio_offset(uio)/biosize)) {
+                               OSAddAtomic64((uio_offset(uio)/biosize) - lbn, &nfsstats.biocache_reads);
+                               FSDBG(514, np, 0xcacefeed, uio_offset(uio), error);
                        }
                }
 
-               lbn = uio->uio_offset / biosize;
-               on = uio->uio_offset % biosize;
-               np->n_lastread = (uio->uio_offset - 1) / biosize;
+               lbn = uio_offset(uio) / biosize;
+               on = uio_offset(uio) % biosize;
+               nfs_node_lock_force(np);
+               np->n_lastread = (uio_offset(uio) - 1) / biosize;
+               nfs_node_unlock(np);
+
+               if ((uio_resid(uio) <= 0) || (uio_offset(uio) >= (off_t)np->n_size)) {
+                       nfs_data_unlock(np);
+                       FSDBG_BOT(514, np, uio_offset(uio), uio_resid(uio), 0xaaaaaaaa);
+                       return (0);
+               }
 
                /* adjust readahead block number, if necessary */
                if (rabn < lbn)
@@ -2013,15 +2058,10 @@ nfs_bioread(nfsnode_t np, struct uio *uio, int ioflag, int *eofflag, vfs_context
                                FSDBG_BOT(514, np, 0xd1e000b, 1, error);
                                return (error);
                        }
+                       readaheads = 1;
                }
 
-               if ((uio_uio_resid(uio) <= 0) || (uio->uio_offset >= (off_t)np->n_size)) {
-                       nfs_data_unlock(np);
-                       FSDBG_BOT(514, np, uio->uio_offset, uio_uio_resid(uio), 0xaaaaaaaa);
-                       return (0);
-               }
-
-               OSAddAtomic(1, (SInt32*)&nfsstats.biocache_reads);
+               OSAddAtomic64(1, &nfsstats.biocache_reads);
 
                /*
                 * If the block is in the cache and has the required data
@@ -2030,9 +2070,9 @@ nfs_bioread(nfsnode_t np, struct uio *uio, int ioflag, int *eofflag, vfs_context
                 * as required.
                 */
 again:
-               // LP64todo - fix this!
-               n = min((unsigned)(biosize - on), uio_uio_resid(uio));
-               diff = np->n_size - uio->uio_offset;
+               io_resid = uio_resid(uio);
+               n = (io_resid > (biosize - on)) ? (biosize - on) : io_resid;
+               diff = np->n_size - uio_offset(uio);
                if (diff < n)
                        n = diff;
 
@@ -2055,11 +2095,9 @@ again:
                                SET(bp->nb_flags, NB_NOCACHE);
                                goto flushbuffer;
                        }
-                       if (!ISSET(bp->nb_flags, NB_NCRDAHEAD)) {
-                               CLR(bp->nb_flags, NB_CACHE);
-                               bp->nb_valid = 0;
-                       } else {
+                       if (ISSET(bp->nb_flags, NB_NCRDAHEAD)) {
                                CLR(bp->nb_flags, NB_NCRDAHEAD);
+                               SET(bp->nb_flags, NB_NOCACHE);
                        }
                }
 
@@ -2127,7 +2165,7 @@ flushbuffer:
                                if (!auio) {
                                        error = ENOMEM;
                                } else {
-                                       uio_addiov(auio, CAST_USER_ADDR_T((bp->nb_data + firstpg * PAGE_SIZE)),
+                                       uio_addiov(auio, CAST_USER_ADDR_T(bp->nb_data + (firstpg * PAGE_SIZE)),
                                                        ((lastpg - firstpg + 1) * PAGE_SIZE));
                                        error = nfs_read_rpc(np, auio, ctx);
                                }
@@ -2162,6 +2200,8 @@ flushbuffer:
                        SET(bp->nb_flags, NB_READ);
                        CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL));
                        error = nfs_buf_read(bp);
+                       if (ioflag & IO_NOCACHE)
+                               SET(bp->nb_flags, NB_NOCACHE);
                        if (error) {
                                nfs_data_unlock(np);
                                nfs_buf_release(bp, 1);
@@ -2176,157 +2216,28 @@ buffer_ready:
                        if (diff < n)
                                n = diff;
                }
-               if (n > 0)
-                       NFS_BUF_MAP(bp);
-           } else if (vtype == VDIR) {
-               OSAddAtomic(1, (SInt32*)&nfsstats.biocache_readdirs);
-               error = nfs_lock(np, NFS_NODE_LOCK_SHARED);
-               if (error || (np->n_direofoffset && (uio->uio_offset >= np->n_direofoffset))) {
-                       if (!error)
-                               nfs_unlock(np);
-                       if (eofflag)
-                               *eofflag = 1;
-                       FSDBG_BOT(514, np, 0xde0f0001, 0, 0);
-                       return (0);
-               }
-               nfs_unlock(np);
-               lbn = uio->uio_offset / NFS_DIRBLKSIZ;
-               on = uio->uio_offset & (NFS_DIRBLKSIZ - 1);
-               error = nfs_buf_get(np, lbn, NFS_DIRBLKSIZ, thd, NBLK_READ, &bp);
-               if (error) {
-                       FSDBG_BOT(514, np, 0xd1e0012, 0, error);
-                       return (error);
-               }
-               if (!ISSET(bp->nb_flags, NB_CACHE)) {
-                   SET(bp->nb_flags, NB_READ);
-                   error = nfs_buf_readdir(bp, ctx);
-                   if (error)
-                       nfs_buf_release(bp, 1);
-                   while (error == NFSERR_BAD_COOKIE) {
-                       error = nfs_lock(np, NFS_NODE_LOCK_EXCLUSIVE);
-                       if (!error) {
-                               nfs_invaldir(np);
-                               nfs_unlock(np);
-                       }
-                       error = nfs_vinvalbuf(vp, 0, ctx, 1);
-                       /*
-                        * Yuck! The directory has been modified on the
-                        * server. The only way to get the block is by
-                        * reading from the beginning to get all the
-                        * offset cookies.
-                        */
-                       for (tlbn = 0; tlbn <= lbn && !error; tlbn++) {
-                           if ((error = nfs_lock(np, NFS_NODE_LOCK_SHARED)))
-                                   break;
-                           if (np->n_direofoffset
-                               && (tlbn * NFS_DIRBLKSIZ) >= np->n_direofoffset) {
-                                   nfs_unlock(np);
-                                   if (eofflag)
-                                           *eofflag = 1;
-                                   FSDBG_BOT(514, np, 0xde0f0002, 0, 0);
-                                   return (0);
-                           }
-                           nfs_unlock(np);
-                           error = nfs_buf_get(np, tlbn, NFS_DIRBLKSIZ, thd, NBLK_READ, &bp);
-                           if (error) {
-                                   FSDBG_BOT(514, np, 0xd1e0013, 0, error);
-                                   return (error);
-                           }
-                           if (!ISSET(bp->nb_flags, NB_CACHE)) {
-                                   SET(bp->nb_flags, NB_READ);
-                                   error = nfs_buf_readdir(bp, ctx);
-                                   /*
-                                    * no error + NB_INVAL == directory EOF,
-                                    * use the block.
-                                    */
-                                   if (error == 0 && ISSET(bp->nb_flags, NB_INVAL)) {
-                                           if (eofflag)
-                                                   *eofflag = 1;
-                                           break;
-                                   }
-                           }
-                           /*
-                            * An error will throw away the block and the
-                            * for loop will break out.  If no error and this
-                            * is not the block we want, we throw away the
-                            * block and go for the next one via the for loop.
-                            */
-                           if (error || (tlbn < lbn))
-                                   nfs_buf_release(bp, 1);
-                       }
-                   }
-                   /*
-                    * The above while is repeated if we hit another cookie
-                    * error.  If we hit an error and it wasn't a cookie error,
-                    * we give up.
-                    */
-                   if (error) {
-                       FSDBG_BOT(514, np, 0xd1e0014, 0, error);
-                       return (error);
-                   }
-               }
-               /*
-                * Make sure we use a signed variant of min() since
-                * the second term may be negative.
-                */
-               // LP64todo - fix this!
-               n = lmin(uio_uio_resid(uio), bp->nb_validend - on);
-               /*
-                * We keep track of the directory eof in
-                * np->n_direofoffset and chop it off as an
-                * extra step right here.
-                */
-               if ((error = nfs_lock(np, NFS_NODE_LOCK_SHARED))) {
-                       FSDBG_BOT(514, np, 0xd1e0115, 0, error);
-                       return (error);
-               }
-               if (np->n_direofoffset &&
-                   n > np->n_direofoffset - uio->uio_offset)
-                       n = np->n_direofoffset - uio->uio_offset;
-               nfs_unlock(np);
-               /*
-                * Make sure that we return an integral number of entries so
-                * that any subsequent calls will start copying from the start
-                * of the next entry.
-                *
-                * If the current value of n has the last entry cut short,
-                * set n to copy everything up to the last entry instead.
-                */
                if (n > 0) {
-                       dp = bp->nb_data + on;
-                       while (dp < (bp->nb_data + on + n)) {
-                               direntp = (struct dirent *)dp;
-                               dp += direntp->d_reclen;
-                       }
-                       if (dp > (bp->nb_data + on + n))
-                               n = (dp - direntp->d_reclen) - (bp->nb_data + on);
+                       NFS_BUF_MAP(bp);
+                       error = uiomove(bp->nb_data + on, n, uio);
                }
-           }
-
-           if (n > 0)
-               error = uiomove(bp->nb_data + on, (int)n, uio);
 
-           if (vtype == VREG) {
-               if (ioflag & IO_NOCACHE)
-                       SET(bp->nb_flags, NB_NOCACHE);
                nfs_buf_release(bp, 1);
                nfs_data_unlock(np);
-               np->n_lastread = (uio->uio_offset - 1) / biosize;
-           } else {
-               nfs_buf_release(bp, 1);
-           }
-       } while (error == 0 && uio_uio_resid(uio) > 0 && n > 0);
-       FSDBG_BOT(514, np, uio->uio_offset, uio_uio_resid(uio), error);
+               nfs_node_lock_force(np);
+               np->n_lastread = (uio_offset(uio) - 1) / biosize;
+               nfs_node_unlock(np);
+       } while (error == 0 && uio_resid(uio) > 0 && n > 0);
+       FSDBG_BOT(514, np, uio_offset(uio), uio_resid(uio), error);
        return (error);
 }
 
 /*
  * limit the number of outstanding async I/O writes
  */
-static int
+int
 nfs_async_write_start(struct nfsmount *nmp)
 {
-       int error = 0, slpflag = (nmp->nm_flag & NFSMNT_INT) ? PCATCH : 0;
+       int error = 0, slpflag = NMFLAG(nmp, INTR) ? PCATCH : 0;
        struct timespec ts = {1, 0};
 
        if (nfs_max_async_writes <= 0)
@@ -2343,7 +2254,7 @@ nfs_async_write_start(struct nfsmount *nmp)
        lck_mtx_unlock(&nmp->nm_lock);
        return (error);
 }
-static void
+void
 nfs_async_write_done(struct nfsmount *nmp)
 {
        if (nmp->nm_asyncwrites <= 0)
@@ -2402,10 +2313,13 @@ nfs_buf_write(struct nfsbuf *bp)
                LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
                lck_mtx_unlock(nfs_buf_mutex);
        }
+       nfs_node_lock_force(np);
+       np->n_numoutput++;
+       nfs_node_unlock(np);
        vnode_startwrite(NFSTOV(np));
 
        if (p && p->p_stats)
-               OSIncrementAtomic(&p->p_stats->p_ru.ru_oublock);
+               OSIncrementAtomicLong(&p->p_stats->p_ru.ru_oublock);
 
        cred = bp->nb_wcred;
        if (!IS_VALID_CRED(cred) && ISSET(bp->nb_flags, NB_READ))
@@ -2415,17 +2329,26 @@ nfs_buf_write(struct nfsbuf *bp)
        thd = async ? NULL : current_thread();
 
        /* We need to make sure the pages are locked before doing I/O.  */
-       if (!ISSET(bp->nb_flags, NB_META) && UBCINFOEXISTS(NFSTOV(np))) {
-               if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
-                       error = nfs_buf_upl_setup(bp);
-                       if (error) {
-                               printf("nfs_buf_write: upl create failed %d\n", error);
-                               SET(bp->nb_flags, NB_ERROR);
-                               bp->nb_error = error = EIO;
-                               nfs_buf_iodone(bp);
-                               goto out;
+       if (!ISSET(bp->nb_flags, NB_META)) {
+               if (UBCINFOEXISTS(NFSTOV(np))) {
+                       if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
+                               error = nfs_buf_upl_setup(bp);
+                               if (error) {
+                                       printf("nfs_buf_write: upl create failed %d\n", error);
+                                       SET(bp->nb_flags, NB_ERROR);
+                                       bp->nb_error = error = EIO;
+                                       nfs_buf_iodone(bp);
+                                       goto out;
+                               }
+                               nfs_buf_upl_check(bp);
                        }
-                       nfs_buf_upl_check(bp);
+               } else {
+                       /* We should never be in nfs_buf_write() with no UBCINFO. */
+                       printf("nfs_buf_write: ubcinfo already gone\n");
+                       SET(bp->nb_flags, NB_ERROR);
+                       bp->nb_error = error = EIO;
+                       nfs_buf_iodone(bp);
+                       goto out;
                }
        }
 
@@ -2434,7 +2357,7 @@ nfs_buf_write(struct nfsbuf *bp)
                nfs_buf_check_write_verifier(np, bp);
        if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
                struct nfsmount *nmp = NFSTONMP(np);
-               if (!nmp) {
+               if (nfs_mount_gone(nmp)) {
                        SET(bp->nb_flags, NB_ERROR);
                        bp->nb_error = error = EIO;
                        nfs_buf_iodone(bp);
@@ -2442,7 +2365,7 @@ nfs_buf_write(struct nfsbuf *bp)
                }
                SET(bp->nb_flags, NB_WRITEINPROG);
                error = nmp->nm_funcs->nf_commit_rpc(np, NBOFF(bp) + bp->nb_dirtyoff,
-                               bp->nb_dirtyend - bp->nb_dirtyoff, bp->nb_wcred);
+                               bp->nb_dirtyend - bp->nb_dirtyoff, bp->nb_wcred, bp->nb_verf);
                CLR(bp->nb_flags, NB_WRITEINPROG);
                if (error) {
                        if (error != NFSERR_STALEWRITEVERF) {
@@ -2454,10 +2377,10 @@ nfs_buf_write(struct nfsbuf *bp)
                }
                bp->nb_dirtyoff = bp->nb_dirtyend = 0;
                CLR(bp->nb_flags, NB_NEEDCOMMIT);
-               nfs_lock(np, NFS_NODE_LOCK_FORCE);
+               nfs_node_lock_force(np);
                np->n_needcommitcnt--;
                CHECK_NEEDCOMMITCNT(np);
-               nfs_unlock(np);
+               nfs_node_unlock(np);
        }
        if (!error && (bp->nb_dirtyend > 0)) {
                /* sanity check the dirty range */
@@ -2486,7 +2409,7 @@ nfs_buf_write(struct nfsbuf *bp)
                        dend = round_page_32(dend);
                /* try to expand write range to include trailing dirty pages */
                if (!(dend & PAGE_MASK))
-                       while ((dend < bp->nb_bufsize) && NBPGDIRTY(bp, dend / PAGE_SIZE))
+                       while ((dend < (int)bp->nb_bufsize) && NBPGDIRTY(bp, dend / PAGE_SIZE))
                                dend += PAGE_SIZE;
                /* make sure to keep dend clipped to EOF */
                if ((NBOFF(bp) + dend) > (off_t) np->n_size)
@@ -2513,7 +2436,7 @@ nfs_buf_write(struct nfsbuf *bp)
                bp->nb_offio = doff;
                bp->nb_endio = dend;
 
-               OSAddAtomic(1, (SInt32 *)&nfsstats.write_bios);
+               OSAddAtomic64(1, &nfsstats.write_bios);
 
                SET(bp->nb_flags, NB_WRITEINPROG);
                error = nfs_buf_write_rpc(bp, iomode, thd, cred);
@@ -2546,12 +2469,12 @@ out:
                if ((np->n_flag & NNEEDINVALIDATE) &&
                    !(np->n_bflag & (NBINVALINPROG|NBFLUSHINPROG))) {
                        int invalidate = 0;
-                       nfs_lock(np, NFS_NODE_LOCK_FORCE);
+                       nfs_node_lock_force(np);
                        if (np->n_flag & NNEEDINVALIDATE) {
                                invalidate = 1;
                                np->n_flag &= ~NNEEDINVALIDATE;
                        }
-                       nfs_unlock(np);
+                       nfs_node_unlock(np);
                        if (invalidate) {
                                /*
                                 * There was a write error and we need to
@@ -2603,19 +2526,19 @@ nfs_buf_write_finish(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred)
        /* manage needcommit state */
        if (!error && (bp->nb_commitlevel == NFS_WRITE_UNSTABLE)) {
                if (!ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
-                       nfs_lock(np, NFS_NODE_LOCK_FORCE);
+                       nfs_node_lock_force(np);
                        np->n_needcommitcnt++;
-                       nfs_unlock(np);
+                       nfs_node_unlock(np);
                        SET(bp->nb_flags, NB_NEEDCOMMIT);
                }
                /* make sure nb_dirtyoff/nb_dirtyend reflect actual range written */
                bp->nb_dirtyoff = bp->nb_offio;
                bp->nb_dirtyend = bp->nb_endio;
        } else if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
-               nfs_lock(np, NFS_NODE_LOCK_FORCE);
+               nfs_node_lock_force(np);
                np->n_needcommitcnt--;
                CHECK_NEEDCOMMITCNT(np);
-               nfs_unlock(np);
+               nfs_node_unlock(np);
                CLR(bp->nb_flags, NB_NEEDCOMMIT);
        }
 
@@ -2664,11 +2587,11 @@ nfs_buf_write_finish(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred)
                         * buffer busy.  Set a flag to do it after releasing
                         * the buffer.
                         */
-                       nfs_lock(np, NFS_NODE_LOCK_FORCE);
+                       nfs_node_lock_force(np);
                        np->n_error = error;
                        np->n_flag |= (NWRITEERR | NNEEDINVALIDATE);
                        NATTRINVALIDATE(np);
-                       nfs_unlock(np);
+                       nfs_node_unlock(np);
                }
                /* clear the dirty range */
                bp->nb_dirtyoff = bp->nb_dirtyend = 0;
@@ -2694,27 +2617,21 @@ nfs_buf_write_dirty_pages(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred)
        int error = 0, commit, iomode, iomode2, len, pg, count, npages, off;
        uint32_t dirty = bp->nb_dirty;
        uint64_t wverf;
-       struct uio uio;
-       struct iovec_32 io;
+       uio_t auio;
+       char uio_buf [ UIO_SIZEOF(1) ];
 
        if (!bp->nb_dirty)
                return (0);
 
        /* there are pages marked dirty that need to be written out */
-       OSAddAtomic(1, (SInt32 *)&nfsstats.write_bios);
+       OSAddAtomic64(1, &nfsstats.write_bios);
        NFS_BUF_MAP(bp);
        SET(bp->nb_flags, NB_WRITEINPROG);
        npages = bp->nb_bufsize / PAGE_SIZE;
        iomode = NFS_WRITE_UNSTABLE;
 
-       uio.uio_iovs.iov32p = &io;
-       uio.uio_iovcnt = 1;
-       uio.uio_rw = UIO_WRITE;
-#if 1   /* LP64todo - can't use new segment flags until the drivers are ready */
-       uio.uio_segflg = UIO_SYSSPACE;
-#else
-       uio.uio_segflg = UIO_SYSSPACE32;
-#endif
+       auio = uio_createwithbuffer(1, 0, UIO_SYSSPACE, UIO_WRITE,
+               &uio_buf, sizeof(uio_buf));
 
 again:
        dirty = bp->nb_dirty;
@@ -2734,11 +2651,9 @@ again:
                        len -= (NBOFF(bp) + off + len) - np->n_size;
                if (len > 0) {
                        iomode2 = iomode;
-                       io.iov_len = len;
-                       uio_uio_resid_set(&uio, io.iov_len);
-                       uio.uio_offset = NBOFF(bp) + off;
-                       io.iov_base = (uintptr_t) bp->nb_data + off;
-                       error = nfs_write_rpc2(np, &uio, thd, cred, &iomode2, &bp->nb_verf);
+                       uio_reset(auio, NBOFF(bp) + off, UIO_SYSSPACE, UIO_WRITE);
+                       uio_addiov(auio, CAST_USER_ADDR_T(bp->nb_data + off), len);
+                       error = nfs_write_rpc2(np, auio, thd, cred, &iomode2, &bp->nb_verf);
                        if (error)
                                break;
                        if (iomode2 < commit) /* Retain the lowest commitment level returned. */
@@ -2759,7 +2674,7 @@ again:
        CLR(bp->nb_flags, NB_WRITEINPROG);
 
        if (!error && (commit != NFS_WRITE_FILESYNC)) {
-               error = nmp->nm_funcs->nf_commit_rpc(np, NBOFF(bp), bp->nb_bufsize, cred);
+               error = nmp->nm_funcs->nf_commit_rpc(np, NBOFF(bp), bp->nb_bufsize, cred, wverf);
                if (error == NFSERR_STALEWRITEVERF) {
                        /* verifier changed, so we need to restart all the writes */
                        iomode = NFS_WRITE_FILESYNC;
@@ -2784,14 +2699,15 @@ nfs_buf_write_rpc(struct nfsbuf *bp, int iomode, thread_t thd, kauth_cred_t cred
        struct nfsmount *nmp;
        nfsnode_t np = bp->nb_np;
        int error = 0, nfsvers, async;
-       int offset, length, nmwsize, nrpcs, len;
+       int offset, nrpcs;
+       uint32_t nmwsize, length, len;
        struct nfsreq *req;
        struct nfsreq_cbinfo cb;
-       struct uio uio;
-       struct iovec_32 io;
+       uio_t auio;
+       char uio_buf [ UIO_SIZEOF(1) ];
 
        nmp = NFSTONMP(np);
-       if (!nmp) {
+       if (nfs_mount_gone(nmp)) {
                bp->nb_error = error = ENXIO;
                SET(bp->nb_flags, NB_ERROR);
                nfs_buf_iodone(bp);
@@ -2816,18 +2732,9 @@ nfs_buf_write_rpc(struct nfsbuf *bp, int iomode, thread_t thd, kauth_cred_t cred
                return (error);
        }
 
-       uio.uio_iovs.iov32p = &io;
-       uio.uio_iovcnt = 1;
-       uio.uio_rw = UIO_WRITE;
-#if 1   /* LP64todo - can't use new segment flags until the drivers are ready */
-       uio.uio_segflg = UIO_SYSSPACE;
-#else
-       uio.uio_segflg = UIO_SYSSPACE32;
-#endif
-       io.iov_len = length;
-       uio_uio_resid_set(&uio, io.iov_len);
-       uio.uio_offset = NBOFF(bp) + offset;
-       io.iov_base = (uintptr_t) bp->nb_data + offset;
+       auio = uio_createwithbuffer(1, NBOFF(bp) + offset, UIO_SYSSPACE,
+               UIO_WRITE, &uio_buf, sizeof(uio_buf));
+       uio_addiov(auio, CAST_USER_ADDR_T(bp->nb_data + offset), length);
 
        bp->nb_rpcs = nrpcs = (length + nmwsize - 1) / nmwsize;
        if (async && (nrpcs > 1)) {
@@ -2844,10 +2751,12 @@ nfs_buf_write_rpc(struct nfsbuf *bp, int iomode, thread_t thd, kauth_cred_t cred
                len = (length > nmwsize) ? nmwsize : length;
                cb.rcb_args[0] = offset;
                cb.rcb_args[1] = len;
+               if (nmp->nm_vers >= NFS_VER4)
+                       cb.rcb_args[2] = nmp->nm_stategenid;
                if (async && ((error = nfs_async_write_start(nmp))))
                        break;
                req = NULL;
-               error = nmp->nm_funcs->nf_write_rpc_async(np, &uio, len, thd, cred,
+               error = nmp->nm_funcs->nf_write_rpc_async(np, auio, len, thd, cred,
                                iomode, &cb, &req);
                if (error) {
                        if (async)
@@ -2886,6 +2795,9 @@ nfs_buf_write_rpc(struct nfsbuf *bp, int iomode, thread_t thd, kauth_cred_t cred
                } else {
                        nfs_buf_write_finish(bp, thd, cred);
                }
+               /* It may have just been an interrupt... that's OK */
+               if (!ISSET(bp->nb_flags, NB_ERROR))
+                       error = 0;
        }
 
        return (error);
@@ -2909,8 +2821,8 @@ nfs_buf_write_rpc_finish(struct nfsreq *req)
        nfsnode_t np;
        thread_t thd;
        kauth_cred_t cred;
-       struct uio uio;
-       struct iovec_32 io;
+       uio_t auio;
+       char uio_buf [ UIO_SIZEOF(1) ];
 
 finish:
        np = req->r_np;
@@ -2920,9 +2832,11 @@ finish:
                kauth_cred_ref(cred);
        cb = req->r_callback;
        bp = cb.rcb_bp;
+       if (cb.rcb_func) /* take an extra reference on the nfsreq in case we want to resend it later due to grace error */
+               nfs_request_ref(req, 0);
 
        nmp = NFSTONMP(np);
-       if (!nmp) {
+       if (nfs_mount_gone(nmp)) {
                SET(bp->nb_flags, NB_ERROR);
                bp->nb_error = error = ENXIO;
        }
@@ -2940,11 +2854,57 @@ finish:
        error = nmp->nm_funcs->nf_write_rpc_async_finish(np, req, &committed, &rlen, &wverf);
        if ((error == EINPROGRESS) && cb.rcb_func) {
                /* async request restarted */
+               if (cb.rcb_func)
+                       nfs_request_rele(req);
                if (IS_VALID_CRED(cred))
                        kauth_cred_unref(&cred);
                return;
        }
-
+       if ((nmp->nm_vers >= NFS_VER4) && nfs_mount_state_error_should_restart(error) && !ISSET(bp->nb_flags, NB_ERROR)) {
+               lck_mtx_lock(&nmp->nm_lock);
+               if ((error != NFSERR_OLD_STATEID) && (error != NFSERR_GRACE) && (cb.rcb_args[2] == nmp->nm_stategenid)) {
+                       NP(np, "nfs_buf_write_rpc_finish: error %d @ 0x%llx, 0x%x 0x%x, initiating recovery",
+                               error, NBOFF(bp)+offset, cb.rcb_args[2], nmp->nm_stategenid);
+                       nfs_need_recover(nmp, error);
+               }
+               lck_mtx_unlock(&nmp->nm_lock);
+               if (np->n_flag & NREVOKE) {
+                       error = EIO;
+               } else {
+                       if (error == NFSERR_GRACE) {
+                               if (cb.rcb_func) {
+                                       /*
+                                        * For an async I/O request, handle a grace delay just like
+                                        * jukebox errors.  Set the resend time and queue it up.
+                                        */
+                                       struct timeval now;
+                                       if (req->r_nmrep.nmc_mhead) {
+                                               mbuf_freem(req->r_nmrep.nmc_mhead);
+                                               req->r_nmrep.nmc_mhead = NULL;
+                                       }
+                                       req->r_error = 0;
+                                       microuptime(&now);
+                                       lck_mtx_lock(&req->r_mtx);
+                                       req->r_resendtime = now.tv_sec + 2;
+                                       req->r_xid = 0;                 // get a new XID
+                                       req->r_flags |= R_RESTART;
+                                       req->r_start = 0;
+                                       nfs_asyncio_resend(req);
+                                       lck_mtx_unlock(&req->r_mtx);
+                                       if (IS_VALID_CRED(cred))
+                                               kauth_cred_unref(&cred);
+                                       /* Note: nfsreq reference taken will be dropped later when finished */
+                                       return;
+                               }
+                               /* otherwise, just pause a couple seconds and retry */
+                               tsleep(&nmp->nm_state, (PZERO-1), "nfsgrace", 2*hz);
+                       }
+                       if (!(error = nfs_mount_state_wait_for_recovery(nmp))) {
+                               rlen = 0;
+                               goto writeagain;
+                       }
+               }
+       }
        if (error) {
                SET(bp->nb_flags, NB_ERROR);
                bp->nb_error = error;
@@ -2979,26 +2939,21 @@ finish:
         * (Don't bother if the buffer hit an error or stale wverf.)
         */
        if (((int)rlen < length) && !(bp->nb_flags & (NB_STALEWVERF|NB_ERROR))) {
+writeagain:
                offset += rlen;
                length -= rlen;
 
-               uio.uio_iovs.iov32p = &io;
-               uio.uio_iovcnt = 1;
-               uio.uio_rw = UIO_WRITE;
-#if 1   /* LP64todo - can't use new segment flags until the drivers are ready */
-               uio.uio_segflg = UIO_SYSSPACE;
-#else
-               uio.uio_segflg = UIO_SYSSPACE32;
-#endif
-               io.iov_len = length;
-               uio_uio_resid_set(&uio, io.iov_len);
-               uio.uio_offset = NBOFF(bp) + offset;
-               io.iov_base = (uintptr_t) bp->nb_data + offset;
+               auio = uio_createwithbuffer(1, NBOFF(bp) + offset, UIO_SYSSPACE,
+                       UIO_WRITE, &uio_buf, sizeof(uio_buf));
+               uio_addiov(auio, CAST_USER_ADDR_T(bp->nb_data + offset), length);
 
                cb.rcb_args[0] = offset;
                cb.rcb_args[1] = length;
+               if (nmp->nm_vers >= NFS_VER4)
+                       cb.rcb_args[2] = nmp->nm_stategenid;
 
-               error = nmp->nm_funcs->nf_write_rpc_async(np, &uio, length, thd, cred,
+               // XXX iomode should really match the original request
+               error = nmp->nm_funcs->nf_write_rpc_async(np, auio, length, thd, cred,
                                NFS_WRITE_FILESYNC, &cb, &wreq);
                if (!error) {
                        if (IS_VALID_CRED(cred))
@@ -3006,8 +2961,10 @@ finish:
                        if (!cb.rcb_func) {
                                /* if !async we'll need to wait for this RPC to finish */
                                req = wreq;
+                               wreq = NULL;
                                goto finish;
                        }
+                       nfs_request_rele(req);
                        /*
                         * We're done here.
                         * Outstanding RPC count is unchanged.
@@ -3020,8 +2977,10 @@ finish:
        }
 
 out:
-       if (cb.rcb_func)
+       if (cb.rcb_func) {
                nfs_async_write_done(nmp);
+               nfs_request_rele(req);
+       }
        /*
         * Decrement outstanding RPC count on buffer
         * and call nfs_buf_write_finish on last RPC.
@@ -3059,10 +3018,11 @@ int
 nfs_flushcommits(nfsnode_t np, int nowait)
 {
        struct nfsmount *nmp;
-       struct nfsbuf *bp;
+       struct nfsbuf *bp, *prevlbp, *lbp;
        struct nfsbuflists blist, commitlist;
        int error = 0, retv, wcred_set, flags, dirty;
        u_quad_t off, endoff, toff;
+       uint64_t wverf;
        u_int32_t count;
        kauth_cred_t wcred = NULL;
 
@@ -3075,11 +3035,11 @@ nfs_flushcommits(nfsnode_t np, int nowait)
         * and the commit rpc is done.
         */
        if (!LIST_EMPTY(&np->n_dirtyblkhd)) {
-               error = nfs_lock(np, NFS_NODE_LOCK_EXCLUSIVE);
+               error = nfs_node_lock(np);
                if (error)
                        goto done;
                np->n_flag |= NMODIFIED;
-               nfs_unlock(np);
+               nfs_node_unlock(np);
        }
 
        off = (u_quad_t)-1;
@@ -3088,7 +3048,7 @@ nfs_flushcommits(nfsnode_t np, int nowait)
        LIST_INIT(&commitlist);
 
        nmp = NFSTONMP(np);
-       if (!nmp) {
+       if (nfs_mount_gone(nmp)) {
                error = ENXIO;
                goto done;
        }
@@ -3101,6 +3061,7 @@ nfs_flushcommits(nfsnode_t np, int nowait)
        if (nowait)
                flags |= NBI_NOWAIT;
        lck_mtx_lock(nfs_buf_mutex);
+       wverf = nmp->nm_verf;
        if (!nfs_buf_iterprepare(np, &blist, flags)) {
                while ((bp = LIST_FIRST(&blist))) {
                        LIST_REMOVE(bp, nb_vnbufs);
@@ -3110,46 +3071,19 @@ nfs_flushcommits(nfsnode_t np, int nowait)
                                continue;
                        if (ISSET(bp->nb_flags, NB_NEEDCOMMIT))
                                nfs_buf_check_write_verifier(np, bp);
-                       if (((bp->nb_flags & (NB_DELWRI | NB_NEEDCOMMIT))
-                               != (NB_DELWRI | NB_NEEDCOMMIT))) {
+                       if (((bp->nb_flags & (NB_DELWRI | NB_NEEDCOMMIT)) != (NB_DELWRI | NB_NEEDCOMMIT)) ||
+                           (bp->nb_verf != wverf)) {
                                nfs_buf_drop(bp);
                                continue;
                        }
                        nfs_buf_remfree(bp);
-                       lck_mtx_unlock(nfs_buf_mutex);
-                       /*
-                        * we need a upl to see if the page has been
-                        * dirtied (think mmap) since the unstable write, and
-                        * also to prevent vm from paging it during our commit rpc
-                        */
-                       if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
-                               retv = nfs_buf_upl_setup(bp);
-                               if (retv) {
-                                       /* unable to create upl */
-                                       /* vm object must no longer exist */
-                                       /* this could be fatal if we need */
-                                       /* to write the data again, we'll see...  */
-                                       printf("nfs_flushcommits: upl create failed %d\n", retv);
-                                       bp->nb_valid = bp->nb_dirty = 0;
-                               }
-                       }
-                       nfs_buf_upl_check(bp);
-                       lck_mtx_lock(nfs_buf_mutex);
+
+                       /* buffer UPLs will be grabbed *in order* below */
 
                        FSDBG(557, bp, bp->nb_flags, bp->nb_valid, bp->nb_dirty);
                        FSDBG(557, bp->nb_validoff, bp->nb_validend,
                              bp->nb_dirtyoff, bp->nb_dirtyend);
 
-                       /*
-                        * We used to check for dirty pages here; if there were any
-                        * we'd abort the commit and force the entire buffer to be
-                        * written again.
-                        *
-                        * Instead of doing that, we now go ahead and commit the dirty
-                        * range, and then leave the buffer around with dirty pages
-                        * that will be written out later.
-                        */
-
                        /*
                         * Work out if all buffers are using the same cred
                         * so we can deal with them all with one commit.
@@ -3168,14 +3102,23 @@ nfs_flushcommits(nfsnode_t np, int nowait)
                        SET(bp->nb_flags, NB_WRITEINPROG);
 
                        /*
-                        * A list of these buffers is kept so that the
-                        * second loop knows which buffers have actually
-                        * been committed. This is necessary, since there
-                        * may be a race between the commit rpc and new
-                        * uncommitted writes on the file.
+                        * Add this buffer to the list of buffers we are committing.
+                        * Buffers are inserted into the list in ascending order so that
+                        * we can take the UPLs in order after the list is complete.
                         */
+                       prevlbp = NULL;
+                       LIST_FOREACH(lbp, &commitlist, nb_vnbufs) {
+                               if (bp->nb_lblkno < lbp->nb_lblkno)
+                                       break;
+                               prevlbp = lbp;
+                       }
                        LIST_REMOVE(bp, nb_vnbufs);
-                       LIST_INSERT_HEAD(&commitlist, bp, nb_vnbufs);
+                       if (prevlbp)
+                               LIST_INSERT_AFTER(prevlbp, bp, nb_vnbufs);
+                       else
+                               LIST_INSERT_HEAD(&commitlist, bp, nb_vnbufs);
+
+                       /* update commit range start, end */
                        toff = NBOFF(bp) + bp->nb_dirtyoff;
                        if (toff < off)
                                off = toff;
@@ -3192,6 +3135,28 @@ nfs_flushcommits(nfsnode_t np, int nowait)
                goto done;
        }
 
+       /*
+        * We need a UPL to prevent others from accessing the buffers during
+        * our commit RPC(s).
+        *
+        * We used to also check for dirty pages here; if there were any we'd
+        * abort the commit and force the entire buffer to be written again.
+        * Instead of doing that, we just go ahead and commit the dirty range,
+        * and then leave the buffer around with dirty pages that will be
+        * written out later.
+        */
+       LIST_FOREACH(bp, &commitlist, nb_vnbufs) {
+               if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
+                       retv = nfs_buf_upl_setup(bp);
+                       if (retv) {
+                               /* Unable to create the UPL, the VM object probably no longer exists. */
+                               printf("nfs_flushcommits: upl create failed %d\n", retv);
+                               bp->nb_valid = bp->nb_dirty = 0;
+                       }
+               }
+               nfs_buf_upl_check(bp);
+       }
+
        /*
         * Commit data on the server, as required.
         * If all bufs are using the same wcred, then use that with
@@ -3207,13 +3172,13 @@ nfs_flushcommits(nfsnode_t np, int nowait)
                        count = 0;
                else
                        count = (endoff - off);
-               retv = nmp->nm_funcs->nf_commit_rpc(np, off, count, wcred);
+               retv = nmp->nm_funcs->nf_commit_rpc(np, off, count, wcred, wverf);
        } else {
                retv = 0;
                LIST_FOREACH(bp, &commitlist, nb_vnbufs) {
                        toff = NBOFF(bp) + bp->nb_dirtyoff;
                        count = bp->nb_dirtyend - bp->nb_dirtyoff;
-                       retv = nmp->nm_funcs->nf_commit_rpc(np, toff, count, bp->nb_wcred);
+                       retv = nmp->nm_funcs->nf_commit_rpc(np, toff, count, bp->nb_wcred, wverf);
                        if (retv)
                                break;
                }
@@ -3227,11 +3192,11 @@ nfs_flushcommits(nfsnode_t np, int nowait)
        while ((bp = LIST_FIRST(&commitlist))) {
                LIST_REMOVE(bp, nb_vnbufs);
                FSDBG(557, bp, retv, bp->nb_flags, bp->nb_dirty);
-               nfs_lock(np, NFS_NODE_LOCK_FORCE);
+               nfs_node_lock_force(np);
                CLR(bp->nb_flags, (NB_NEEDCOMMIT | NB_WRITEINPROG));
                np->n_needcommitcnt--;
                CHECK_NEEDCOMMITCNT(np);
-               nfs_unlock(np);
+               nfs_node_unlock(np);
 
                if (retv) {
                        /* move back to dirty list */
@@ -3242,6 +3207,9 @@ nfs_flushcommits(nfsnode_t np, int nowait)
                        continue;
                }
 
+               nfs_node_lock_force(np);
+               np->n_numoutput++;
+               nfs_node_unlock(np);
                vnode_startwrite(NFSTOV(np));
                if (ISSET(bp->nb_flags, NB_DELWRI)) {
                        lck_mtx_lock(nfs_buf_mutex);
@@ -3294,25 +3262,26 @@ nfs_flush(nfsnode_t np, int waitfor, thread_t thd, int ignore_writeerr)
 
        FSDBG_TOP(517, np, waitfor, ignore_writeerr, 0);
 
-       if (!nmp) {
+       if (nfs_mount_gone(nmp)) {
                error = ENXIO;
                goto out;
        }
        nfsvers = nmp->nm_vers;
-       if (nmp->nm_flag & NFSMNT_INT)
+       if (NMFLAG(nmp, INTR))
                slpflag = PCATCH;
 
        if (!LIST_EMPTY(&np->n_dirtyblkhd)) {
-               nfs_lock(np, NFS_NODE_LOCK_FORCE);
+               nfs_node_lock_force(np);
                np->n_flag |= NMODIFIED;
-               nfs_unlock(np);
+               nfs_node_unlock(np);
        }
 
        lck_mtx_lock(nfs_buf_mutex);
        while (np->n_bflag & NBFLUSHINPROG) {
                np->n_bflag |= NBFLUSHWANT;
                error = msleep(&np->n_bflag, nfs_buf_mutex, slpflag, "nfs_flush", NULL);
-               if (error) {
+               if ((error && (error != EWOULDBLOCK)) ||
+                   ((error = nfs_sigintr(NFSTONMP(np), NULL, thd, 0)))) {
                        lck_mtx_unlock(nfs_buf_mutex);
                        goto out;
                }
@@ -3339,7 +3308,7 @@ again:
                while ((bp = LIST_FIRST(&blist))) {
                        LIST_REMOVE(bp, nb_vnbufs);
                        LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
-                       flags = (passone || (waitfor != MNT_WAIT)) ? NBAC_NOWAIT : 0;
+                       flags = (passone || !(waitfor == MNT_WAIT || waitfor == MNT_DWAIT)) ? NBAC_NOWAIT : 0;
                        if (flags != NBAC_NOWAIT)
                                nfs_buf_refget(bp);
                        while ((error = nfs_buf_acquire(bp, flags, slpflag, slptimeo))) {
@@ -3379,7 +3348,7 @@ again:
                                continue;
                        }
                        FSDBG(525, bp, passone, bp->nb_lflags, bp->nb_flags);
-                       if ((passone || (waitfor != MNT_WAIT)) &&
+                       if ((passone || !(waitfor == MNT_WAIT || waitfor == MNT_DWAIT)) &&
                            ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
                                nfs_buf_drop(bp);
                                continue;
@@ -3387,10 +3356,10 @@ again:
                        nfs_buf_remfree(bp);
                        lck_mtx_unlock(nfs_buf_mutex);
                        if (ISSET(bp->nb_flags, NB_ERROR)) {
-                               nfs_lock(np, NFS_NODE_LOCK_FORCE);
+                               nfs_node_lock_force(np);
                                np->n_error = bp->nb_error ? bp->nb_error : EIO;
                                np->n_flag |= NWRITEERR;
-                               nfs_unlock(np);
+                               nfs_node_unlock(np);
                                nfs_buf_release(bp, 1);
                                lck_mtx_lock(nfs_buf_mutex);
                                continue;
@@ -3407,7 +3376,7 @@ again:
        }
        lck_mtx_unlock(nfs_buf_mutex);
 
-       if (waitfor == MNT_WAIT) {
+       if (waitfor == MNT_WAIT || waitfor == MNT_DWAIT) {
                while ((error = vnode_waitforwrites(NFSTOV(np), 0, slpflag, slptimeo, "nfsflush"))) {
                        error2 = nfs_sigintr(NFSTONMP(np), NULL, thd, 0);
                        if (error2) {
@@ -3432,30 +3401,45 @@ again:
        if (passone) {
                passone = 0;
                if (!LIST_EMPTY(&np->n_dirtyblkhd)) {
-                       nfs_lock(np, NFS_NODE_LOCK_FORCE);
+                       nfs_node_lock_force(np);
                        np->n_flag |= NMODIFIED;
-                       nfs_unlock(np);
+                       nfs_node_unlock(np);
                }
                lck_mtx_lock(nfs_buf_mutex);
                goto again;
        }
 
-       if (waitfor == MNT_WAIT) {
+       if (waitfor == MNT_WAIT || waitfor == MNT_DWAIT) {
                if (!LIST_EMPTY(&np->n_dirtyblkhd)) {
-                       nfs_lock(np, NFS_NODE_LOCK_FORCE);
+                       nfs_node_lock_force(np);
                        np->n_flag |= NMODIFIED;
-                       nfs_unlock(np);
+                       nfs_node_unlock(np);
                }
                lck_mtx_lock(nfs_buf_mutex);
                if (!LIST_EMPTY(&np->n_dirtyblkhd))
                        goto again;
                lck_mtx_unlock(nfs_buf_mutex);
-               nfs_lock(np, NFS_NODE_LOCK_FORCE);
-               /* if we have no dirty blocks, we can clear the modified flag */
-               if (!np->n_wrbusy)
+               nfs_node_lock_force(np);
+               /*
+                * OK, it looks like there are no dirty blocks.  If we have no
+                * writes in flight and no one in the write code, we can clear
+                * the modified flag.  In order to make sure we see the latest
+                * attributes and size, we also invalidate the attributes and
+                * advance the attribute cache XID to guarantee that attributes
+                * newer than our clearing of NMODIFIED will get loaded next.
+                * (If we don't do this, it's possible for the flush's final
+                * write/commit (xid1) to be executed in parallel with a subsequent
+                * getattr request (xid2).  The getattr could return attributes
+                * from *before* the write/commit completed but the stale attributes
+                * would be preferred because of the xid ordering.)
+                */
+               if (!np->n_wrbusy && !np->n_numoutput) {
                        np->n_flag &= ~NMODIFIED;
+                       NATTRINVALIDATE(np);
+                       nfs_get_xid(&np->n_xid);
+               }
        } else {
-               nfs_lock(np, NFS_NODE_LOCK_FORCE);
+               nfs_node_lock_force(np);
        }
 
        FSDBG(526, np->n_flag, np->n_error, 0, 0);
@@ -3463,7 +3447,7 @@ again:
                error = np->n_error;
                np->n_flag &= ~NWRITEERR;
        }
-       nfs_unlock(np);
+       nfs_node_unlock(np);
 done:
        lck_mtx_lock(nfs_buf_mutex);
        flags = np->n_bflag;
@@ -3480,7 +3464,7 @@ out:
  * Flush out and invalidate all buffers associated with a vnode.
  * Called with the underlying object locked.
  */
-static int
+int
 nfs_vinvalbuf_internal(
        nfsnode_t np,
        int flags,
@@ -3580,10 +3564,11 @@ nfs_vinvalbuf_internal(
                                        // Note: bp has been released
                                        if (error) {
                                                FSDBG(554, bp, 0xd00dee, 0xbad, error);
-                                               nfs_lock(np, NFS_NODE_LOCK_FORCE);
-                                               np->n_error = error;
-                                               np->n_flag |= NWRITEERR;
-                                               nfs_unlock(np);
+                                               nfs_node_lock_force(np);
+                                               if ((error != EINTR) && (error != ERESTART)) {
+                                                       np->n_error = error;
+                                                       np->n_flag |= NWRITEERR;
+                                               }
                                                /*
                                                 * There was a write error and we need to
                                                 * invalidate attrs to sync with server.
@@ -3591,6 +3576,18 @@ nfs_vinvalbuf_internal(
                                                 * we may no longer know the correct size)
                                                 */
                                                NATTRINVALIDATE(np);
+                                               nfs_node_unlock(np);
+                                               if ((error == EINTR) || (error == ERESTART)) {
+                                                       /*
+                                                        * Abort on EINTR.  If we don't, we could
+                                                        * be stuck in this loop forever because
+                                                        * the buffer will continue to stay dirty.
+                                                        */
+                                                       lck_mtx_lock(nfs_buf_mutex);
+                                                       nfs_buf_itercomplete(np, &blist, list);
+                                                       lck_mtx_unlock(nfs_buf_mutex);
+                                                       return (error);
+                                               }
                                                error = 0;
                                        }
                                        lck_mtx_lock(nfs_buf_mutex);
@@ -3607,11 +3604,12 @@ nfs_vinvalbuf_internal(
        if (!LIST_EMPTY(&(np)->n_dirtyblkhd) || !LIST_EMPTY(&(np)->n_cleanblkhd))
                panic("nfs_vinvalbuf: flush/inval failed");
        lck_mtx_unlock(nfs_buf_mutex);
-       if (!(flags & V_SAVE)) {
-               nfs_lock(np, NFS_NODE_LOCK_FORCE);
+       nfs_node_lock_force(np);
+       if (!(flags & V_SAVE))
                np->n_flag &= ~NMODIFIED;
-               nfs_unlock(np);
-       }
+       if (vnode_vtype(NFSTOV(np)) == VREG)
+               np->n_lastrahead = -1;
+       nfs_node_unlock(np);
        NFS_BUF_FREEUP();
        return (0);
 }
@@ -3632,12 +3630,23 @@ nfs_vinvalbuf2(vnode_t vp, int flags, thread_t thd, kauth_cred_t cred, int intrf
 {
        nfsnode_t np = VTONFS(vp);
        struct nfsmount *nmp = VTONMP(vp);
-       int error, rv, slpflag, slptimeo, nflags;
+       int error, slpflag, slptimeo, nflags, retry = 0;
+       int ubcflags = UBC_PUSHALL | UBC_SYNC | UBC_INVALIDATE;
+       struct timespec ts = { 2, 0 };
        off_t size;
 
        FSDBG_TOP(554, np, flags, intrflg, 0);
 
-       if (nmp && !(nmp->nm_flag & NFSMNT_INT))
+       /*
+        * If the mount is gone no sense to try and write anything.
+        * and hang trying to do IO.
+        */
+       if (nfs_mount_gone(nmp)) {
+               flags &= ~V_SAVE;
+               ubcflags &= ~UBC_PUSHALL;
+       }
+       
+       if (nmp && !NMFLAG(nmp, INTR))
                intrflg = 0;
        if (intrflg) {
                slpflag = PCATCH;
@@ -3651,16 +3660,19 @@ nfs_vinvalbuf2(vnode_t vp, int flags, thread_t thd, kauth_cred_t cred, int intrf
        lck_mtx_lock(nfs_buf_mutex);
        while (np->n_bflag & NBINVALINPROG) {
                np->n_bflag |= NBINVALWANT;
-               error = msleep(&np->n_bflag, nfs_buf_mutex, slpflag, "nfs_vinvalbuf", NULL);
-               if (error) {
+               msleep(&np->n_bflag, nfs_buf_mutex, slpflag, "nfs_vinvalbuf", &ts);
+               if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0))) {
                        lck_mtx_unlock(nfs_buf_mutex);
                        return (error);
                }
+               if (np->n_bflag & NBINVALINPROG)
+                       slpflag = 0;
        }
        np->n_bflag |= NBINVALINPROG;
        lck_mtx_unlock(nfs_buf_mutex);
 
        /* Now, flush as required.  */
+again:
        error = nfs_vinvalbuf_internal(np, flags, thd, cred, slpflag, 0);
        while (error) {
                FSDBG(554, np, 0, 0, error);
@@ -3671,8 +3683,17 @@ nfs_vinvalbuf2(vnode_t vp, int flags, thread_t thd, kauth_cred_t cred, int intrf
 
        /* get the pages out of vm also */
        if (UBCINFOEXISTS(vp) && (size = ubc_getsize(vp)))
-               if (!(rv = ubc_sync_range(vp, 0, size, UBC_PUSHALL | UBC_SYNC | UBC_INVALIDATE)))
-                       panic("nfs_vinvalbuf(): ubc_sync_range failed!");
+               if ((error = ubc_msync(vp, 0, size, NULL, ubcflags))) {
+                       if (error == EINVAL)
+                               panic("nfs_vinvalbuf(): ubc_msync failed!, error %d", error);
+                       if (retry++ < 10) { /* retry invalidating a few times */
+                               if (retry > 1 || error == ENXIO)
+                                       ubcflags &= ~UBC_PUSHALL;
+                               goto again;
+                       }
+                       /* give up */
+                       printf("nfs_vinvalbuf(): ubc_msync failed!, error %d\n", error);
+               }
 done:
        lck_mtx_lock(nfs_buf_mutex);
        nflags = np->n_bflag;
@@ -3685,6 +3706,57 @@ done:
        return (error);
 }
 
+/*
+ * Wait for any busy buffers to complete.
+ */
+void
+nfs_wait_bufs(nfsnode_t np)
+{
+       struct nfsbuf *bp;
+       struct nfsbuflists blist;
+       int error = 0;
+
+       lck_mtx_lock(nfs_buf_mutex);
+       if (!nfs_buf_iterprepare(np, &blist, NBI_CLEAN)) {
+               while ((bp = LIST_FIRST(&blist))) {
+                       LIST_REMOVE(bp, nb_vnbufs);
+                       LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
+                       nfs_buf_refget(bp);
+                       while ((error = nfs_buf_acquire(bp, 0, 0, 0))) {
+                               if (error != EAGAIN) {
+                                       nfs_buf_refrele(bp);
+                                       nfs_buf_itercomplete(np, &blist, NBI_CLEAN);
+                                       lck_mtx_unlock(nfs_buf_mutex);
+                                       return;
+                               }
+                       }
+                       nfs_buf_refrele(bp);
+                       nfs_buf_drop(bp);
+               }
+               nfs_buf_itercomplete(np, &blist, NBI_CLEAN);
+       }
+       if (!nfs_buf_iterprepare(np, &blist, NBI_DIRTY)) {
+               while ((bp = LIST_FIRST(&blist))) {
+                       LIST_REMOVE(bp, nb_vnbufs);
+                       LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
+                       nfs_buf_refget(bp);
+                       while ((error = nfs_buf_acquire(bp, 0, 0, 0))) {
+                               if (error != EAGAIN) {
+                                       nfs_buf_refrele(bp);
+                                       nfs_buf_itercomplete(np, &blist, NBI_DIRTY);
+                                       lck_mtx_unlock(nfs_buf_mutex);
+                                       return;
+                               }
+                       }
+                       nfs_buf_refrele(bp);
+                       nfs_buf_drop(bp);
+               }
+               nfs_buf_itercomplete(np, &blist, NBI_DIRTY);
+       }
+       lck_mtx_unlock(nfs_buf_mutex);
+}
+
+
 /*
  * Add an async I/O request to the mount's async I/O queue and make
  * sure that an nfsiod will service it.
@@ -3698,8 +3770,11 @@ nfs_asyncio_finish(struct nfsreq *req)
 
        FSDBG_TOP(552, nmp, 0, 0, 0);
 again:
-       if (((nmp = req->r_nmp)) == NULL)
+       nmp = req->r_nmp;
+
+       if (nmp == NULL)
                return;
+
        lck_mtx_lock(nfsiod_mutex);
        niod = nmp->nm_niod;
 
@@ -3724,6 +3799,28 @@ again:
                }
        }
 
+       /*
+        * If we got here while being on the resendq we need to get off. This
+        * happens when the timer fires and errors out requests from nfs_sigintr
+        * or we receive a reply (UDP case) while being on the resend queue so
+        * we're just finishing up and are not going to be resent.
+        */
+       lck_mtx_lock(&req->r_mtx);
+       if (req->r_flags & R_RESENDQ) {
+               lck_mtx_lock(&nmp->nm_lock);
+               if (req->r_rchain.tqe_next != NFSREQNOLIST) {
+                       NFS_BIO_DBG("Proccessing async request on resendq. Removing");
+                       TAILQ_REMOVE(&nmp->nm_resendq, req, r_rchain);
+                       req->r_rchain.tqe_next = NFSREQNOLIST;
+                       assert(req->r_refs > 1);
+                       /* Remove resendq reference */
+                       req->r_refs--;
+               }
+               lck_mtx_unlock(&nmp->nm_lock);
+               req->r_flags &= ~R_RESENDQ;
+       }
+       lck_mtx_unlock(&req->r_mtx);
+
        if (req->r_achain.tqe_next == NFSREQNOLIST)
                TAILQ_INSERT_TAIL(&nmp->nm_iodq, req, r_achain);
 
@@ -3734,8 +3831,9 @@ again:
                        lck_mtx_unlock(nfsiod_mutex);
                        wakeup(niod);
                } else if (nfsiod_thread_count > 0) {
-                       /* just queue it up on nfsiod mounts queue */
-                       TAILQ_INSERT_TAIL(&nfsiodmounts, nmp, nm_iodlink);
+                       /* just queue it up on nfsiod mounts queue if needed */
+                       if (nmp->nm_iodlink.tqe_next == NFSNOLIST)
+                               TAILQ_INSERT_TAIL(&nfsiodmounts, nmp, nm_iodlink);
                        lck_mtx_unlock(nfsiod_mutex);
                } else {
                        printf("nfs_asyncio(): no nfsiods? %d %d (%d)\n", nfsiod_thread_count, NFSIOD_MAX, started);
@@ -3759,77 +3857,48 @@ nfs_asyncio_resend(struct nfsreq *req)
 {
        struct nfsmount *nmp = req->r_nmp;
 
-       if (!nmp)
+       if (nfs_mount_gone(nmp))
                return;
+
        nfs_gss_clnt_rpcdone(req);
        lck_mtx_lock(&nmp->nm_lock);
-       if (req->r_rchain.tqe_next == NFSREQNOLIST) {
+       if (!(req->r_flags & R_RESENDQ)) {
                TAILQ_INSERT_TAIL(&nmp->nm_resendq, req, r_rchain);
                req->r_flags |= R_RESENDQ;
+               /*
+                * We take a reference on this request so that it can't be
+                * destroyed while a resend is queued or in progress.
+                */
+               nfs_request_ref(req, 1);
        }
        nfs_mount_sock_thread_wake(nmp);
        lck_mtx_unlock(&nmp->nm_lock);
 }
 
 /*
- * Read an NFS buffer for a directory.
+ * Read directory data into a buffer.
+ *
+ * Buffer will be filled (unless EOF is hit).
+ * Buffers after this one may also be completely/partially filled.
  */
 int
 nfs_buf_readdir(struct nfsbuf *bp, vfs_context_t ctx)
 {
-       nfsnode_t np;
-       vnode_t vp;
-       struct nfsmount *nmp;
-       int error = 0, nfsvers;
-       struct uio uio;
-       struct iovec_32 io;
+       nfsnode_t np = bp->nb_np;
+       struct nfsmount *nmp = NFSTONMP(np);
+       int error = 0;
 
-       np = bp->nb_np;
-       vp = NFSTOV(np);
-       nmp = VTONMP(vp);
-       nfsvers = nmp->nm_vers;
-       uio.uio_iovs.iov32p = &io;
-       uio.uio_iovcnt = 1;
-#if 1   /* LP64todo - can't use new segment flags until the drivers are ready */
-       uio.uio_segflg = UIO_SYSSPACE;
-#else
-       uio.uio_segflg = UIO_SYSSPACE32;
-#endif
-
-       /* sanity check */
-       if (ISSET(bp->nb_flags, NB_DONE))
-               CLR(bp->nb_flags, NB_DONE);
+       if (nfs_mount_gone(nmp))
+               return (ENXIO);
 
-       uio.uio_rw = UIO_READ;
-       io.iov_len = bp->nb_bufsize;
-       uio_uio_resid_set(&uio, io.iov_len);
-       io.iov_base = (uintptr_t) bp->nb_data;
-       uio.uio_offset = NBOFF(bp);
-
-       OSAddAtomic(1, (SInt32*)&nfsstats.readdir_bios);
-       if (nfsvers < NFS_VER4) {
-               if (nmp->nm_flag & NFSMNT_RDIRPLUS) {
-                       error = nfs3_readdirplus_rpc(np, &uio, ctx);
-                       if (error == NFSERR_NOTSUPP) {
-                               lck_mtx_lock(&nmp->nm_lock);
-                               nmp->nm_flag &= ~NFSMNT_RDIRPLUS;
-                               lck_mtx_unlock(&nmp->nm_lock);
-                       }
-               }
-               if (!(nmp->nm_flag & NFSMNT_RDIRPLUS))
-                       error = nfs3_readdir_rpc(np, &uio, ctx);
-       } else {
-               error = nfs4_readdir_rpc(np, &uio, ctx);
-       }
-       if (error) {
+       if (nmp->nm_vers < NFS_VER4)
+               error = nfs3_readdir_rpc(np, bp, ctx);
+       else
+               error = nfs4_readdir_rpc(np, bp, ctx);
+
+       if (error && (error != NFSERR_DIRBUFDROPPED)) {
                SET(bp->nb_flags, NB_ERROR);
                bp->nb_error = error;
-       } else {
-               bp->nb_validoff = 0;
-               bp->nb_validend = uio.uio_offset - NBOFF(bp);
-               bp->nb_valid = (1 << (round_page_32(bp->nb_validend)/PAGE_SIZE)) - 1;
        }
-
-       nfs_buf_iodone(bp);
        return (error);
 }