+ }
+
+ /* If NB_NEEDCOMMIT is set, a commit RPC may do the trick. */
+ if (ISSET(bp->nb_flags, NB_NEEDCOMMIT))
+ nfs_buf_check_write_verifier(np, bp);
+ if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
+ struct nfsmount *nmp = NFSTONMP(np);
+ if (!nmp) {
+ SET(bp->nb_flags, NB_ERROR);
+ bp->nb_error = error = EIO;
+ nfs_buf_iodone(bp);
+ goto out;
+ }
+ SET(bp->nb_flags, NB_WRITEINPROG);
+ error = nmp->nm_funcs->nf_commit_rpc(np, NBOFF(bp) + bp->nb_dirtyoff,
+ bp->nb_dirtyend - bp->nb_dirtyoff, bp->nb_wcred, bp->nb_verf);
+ CLR(bp->nb_flags, NB_WRITEINPROG);
+ if (error) {
+ if (error != NFSERR_STALEWRITEVERF) {
+ SET(bp->nb_flags, NB_ERROR);
+ bp->nb_error = error;
+ }
+ nfs_buf_iodone(bp);
+ goto out;
+ }
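+ /* the commit succeeded, so the dirty range is now stable on the server */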
+ bp->nb_dirtyoff = bp->nb_dirtyend = 0;
+ CLR(bp->nb_flags, NB_NEEDCOMMIT);
+ nfs_node_lock_force(np);
+ np->n_needcommitcnt--;
+ CHECK_NEEDCOMMITCNT(np);
+ nfs_node_unlock(np);
+ }
+ if (!error && (bp->nb_dirtyend > 0)) {
+ /* sanity check the dirty range */
+ if (NBOFF(bp) + bp->nb_dirtyend > (off_t) np->n_size) {
+ bp->nb_dirtyend = np->n_size - NBOFF(bp);
+ if (bp->nb_dirtyoff >= bp->nb_dirtyend)
+ bp->nb_dirtyoff = bp->nb_dirtyend = 0;
+ }
+ }
+ if (!error && (bp->nb_dirtyend > 0)) {
+ /* there's a dirty range that needs to be written out */
+ NFS_BUF_MAP(bp);
+
+ doff = bp->nb_dirtyoff;
+ dend = bp->nb_dirtyend;
+
+ /* if doff page is dirty, move doff to start of page */
+ if (NBPGDIRTY(bp, doff / PAGE_SIZE))
+ doff -= doff & PAGE_MASK;
+ /* try to expand write range to include preceding dirty pages */
+ if (!(doff & PAGE_MASK))
+ while ((doff > 0) && NBPGDIRTY(bp, (doff - 1) / PAGE_SIZE))
+ doff -= PAGE_SIZE;
+ /* if dend page is dirty, move dend to start of next page */
+ if ((dend & PAGE_MASK) && NBPGDIRTY(bp, dend / PAGE_SIZE))
+ dend = round_page_32(dend);
+ /* try to expand write range to include trailing dirty pages */
+ if (!(dend & PAGE_MASK))
+ while ((dend < (int)bp->nb_bufsize) && NBPGDIRTY(bp, dend / PAGE_SIZE))
+ dend += PAGE_SIZE;
+ /* make sure to keep dend clipped to EOF */
+ if ((NBOFF(bp) + dend) > (off_t) np->n_size)
+ dend = np->n_size - NBOFF(bp);
+ /* calculate range of complete pages being written */
+ firstpg = round_page_32(doff) / PAGE_SIZE;
+ lastpg = (trunc_page_32(dend) - 1) / PAGE_SIZE;
+ /* calculate mask for that page range */
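+ /* (e.g. firstpg 1, lastpg 3 -> pagemask 0x0000000e, bits 1-3 set) */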
+ pagemask = ((1 << (lastpg + 1)) - 1) & ~((1 << firstpg) - 1);
+
+ /*
+ * compare page mask to nb_dirty; if there are other dirty pages
+ * then write FILESYNC; otherwise, write UNSTABLE if async and
+ * not needcommit/stable; otherwise write FILESYNC
+ */
+ if (bp->nb_dirty & ~pagemask)
+ iomode = NFS_WRITE_FILESYNC;
+ else if ((bp->nb_flags & (NB_ASYNC | NB_NEEDCOMMIT | NB_STABLE)) == NB_ASYNC)
+ iomode = NFS_WRITE_UNSTABLE;
+ else
+ iomode = NFS_WRITE_FILESYNC;
+
+ /* write the whole contiguous dirty range */
+ bp->nb_offio = doff;
+ bp->nb_endio = dend;
+
+ OSAddAtomic64(1, &nfsstats.write_bios);
+
+ SET(bp->nb_flags, NB_WRITEINPROG);
+ error = nfs_buf_write_rpc(bp, iomode, thd, cred);
+ /*
+ * For async I/O, the callbacks will finish up the
+ * write and push out any dirty pages. Otherwise,
+ * the write has already been finished and any dirty
+ * pages pushed out.
+ */
+ } else {
+ if (!error && bp->nb_dirty) /* write out any dirty pages */
+ error = nfs_buf_write_dirty_pages(bp, thd, cred);
+ nfs_buf_iodone(bp);
+ }
+ /* note: bp is still valid only for !async case */
+out:
+ if (!async) {
+ error = nfs_buf_iowait(bp);
+ /* move to clean list */
+ if (oldflags & NB_DELWRI) {
+ lck_mtx_lock(nfs_buf_mutex);
+ if (bp->nb_vnbufs.le_next != NFSNOLIST)
+ LIST_REMOVE(bp, nb_vnbufs);
+ LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
+ lck_mtx_unlock(nfs_buf_mutex);
+ }
+ FSDBG_BOT(553, bp, NBOFF(bp), bp->nb_flags, error);
+ nfs_buf_release(bp, 1);
+ /* check if we need to invalidate (and we can) */
+ if ((np->n_flag & NNEEDINVALIDATE) &&
+ !(np->n_bflag & (NBINVALINPROG|NBFLUSHINPROG))) {
+ int invalidate = 0;
+ nfs_node_lock_force(np);
+ if (np->n_flag & NNEEDINVALIDATE) {
+ invalidate = 1;
+ np->n_flag &= ~NNEEDINVALIDATE;
+ }
+ nfs_node_unlock(np);
+ if (invalidate) {
+ /*
+ * There was a write error and we need to
+ * invalidate attrs and flush buffers in
+ * order to sync up with the server.
+ * (if this write was extending the file,
+ * we may no longer know the correct size)
+ *
+ * But we couldn't call vinvalbuf while holding
+ * the buffer busy. So we call vinvalbuf() after
+ * releasing the buffer.
+ */
+ nfs_vinvalbuf2(NFSTOV(np), V_SAVE|V_IGNORE_WRITEERR, thd, cred, 1);
+ }
+ }
+ }
+
+ if (IS_VALID_CRED(cred))
+ kauth_cred_unref(&cred);
+ return (error);
+}
+
+/*
+ * finish the writing of a buffer
+ */
+void
+nfs_buf_write_finish(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred)
+{
+ nfsnode_t np = bp->nb_np;
+ int error = (bp->nb_flags & NB_ERROR) ? bp->nb_error : 0;
+ int firstpg, lastpg;
+ uint32_t pagemask;
+
+ if ((error == EINTR) || (error == ERESTART)) {
+ CLR(bp->nb_flags, NB_ERROR);
+ SET(bp->nb_flags, NB_EINTR);
+ }
+
+ if (!error) {
+ /* calculate range of complete pages being written */
+ firstpg = round_page_32(bp->nb_offio) / PAGE_SIZE;
+ lastpg = (trunc_page_32(bp->nb_endio) - 1) / PAGE_SIZE;
+ /* calculate mask for the page range written */
+ pagemask = ((1 << (lastpg + 1)) - 1) & ~((1 << firstpg) - 1);
+ /* clear dirty bits for pages we've written */
+ bp->nb_dirty &= ~pagemask;
+ }
+
+ /* manage needcommit state */
+ if (!error && (bp->nb_commitlevel == NFS_WRITE_UNSTABLE)) {
+ if (!ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
+ nfs_node_lock_force(np);
+ np->n_needcommitcnt++;
+ nfs_node_unlock(np);
+ SET(bp->nb_flags, NB_NEEDCOMMIT);
+ }
+ /* make sure nb_dirtyoff/nb_dirtyend reflect actual range written */
+ bp->nb_dirtyoff = bp->nb_offio;
+ bp->nb_dirtyend = bp->nb_endio;
+ } else if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
+ nfs_node_lock_force(np);
+ np->n_needcommitcnt--;
+ CHECK_NEEDCOMMITCNT(np);
+ nfs_node_unlock(np);
+ CLR(bp->nb_flags, NB_NEEDCOMMIT);
+ }
+
+ CLR(bp->nb_flags, NB_WRITEINPROG);
+
+ /*
+ * For an unstable write, the buffer is still treated as dirty until
+ * a commit (or stable (re)write) is performed. Buffers needing only
+ * a commit are marked with the NB_DELWRI and NB_NEEDCOMMIT flags.
+ *
+ * If the write was interrupted we set NB_EINTR. Don't set NB_ERROR
+ * because that would cause the buffer to be dropped. The buffer is
+ * still valid and simply needs to be written again.
+ */
+ if ((error == EINTR) || (error == ERESTART) || (!error && (bp->nb_flags & NB_NEEDCOMMIT))) {
+ CLR(bp->nb_flags, NB_INVAL);
+ if (!ISSET(bp->nb_flags, NB_DELWRI)) {
+ SET(bp->nb_flags, NB_DELWRI);
+ lck_mtx_lock(nfs_buf_mutex);
+ nfs_nbdwrite++;
+ NFSBUFCNTCHK();
+ lck_mtx_unlock(nfs_buf_mutex);
+ }
+ /*
+ * Since for the NB_ASYNC case, we've reassigned the buffer to the
+ * clean list, we have to reassign it back to the dirty one. Ugh.
+ */
+ if (ISSET(bp->nb_flags, NB_ASYNC)) {
+ /* move to dirty list */
+ lck_mtx_lock(nfs_buf_mutex);
+ if (bp->nb_vnbufs.le_next != NFSNOLIST)
+ LIST_REMOVE(bp, nb_vnbufs);
+ LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
+ lck_mtx_unlock(nfs_buf_mutex);
+ }
+ } else {
+ /* either there's an error or we don't need to commit */
+ if (error) {
+ /*
+ * There was a write error and we need to invalidate
+ * attrs and flush buffers in order to sync up with the
+ * server. (if this write was extending the file, we
+ * may no longer know the correct size)
+ *
+ * But we can't call vinvalbuf while holding this
+ * buffer busy. Set a flag to do it after releasing
+ * the buffer.
+ */
+ nfs_node_lock_force(np);
+ np->n_error = error;
+ np->n_flag |= (NWRITEERR | NNEEDINVALIDATE);
+ NATTRINVALIDATE(np);
+ nfs_node_unlock(np);
+ }
+ /* clear the dirty range */
+ bp->nb_dirtyoff = bp->nb_dirtyend = 0;
+ }
+
+ if (!error && bp->nb_dirty)
+ nfs_buf_write_dirty_pages(bp, thd, cred);
+ nfs_buf_iodone(bp);
+}
+
+/*
+ * write out any pages marked dirty in a buffer
+ *
+ * We use unstable writes and follow up with a commit.
+ * If we catch the write verifier changing, we restart
+ * and redo the writes FILESYNC.
+ */
+int
+nfs_buf_write_dirty_pages(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred)
+{
+ nfsnode_t np = bp->nb_np;
+ struct nfsmount *nmp = NFSTONMP(np);
+ int error = 0, commit, iomode, iomode2, len, pg, count, npages, off;
+ uint32_t dirty = bp->nb_dirty;
+ uint64_t wverf;
+ uio_t auio;
+ char uio_buf [ UIO_SIZEOF(1) ];
+
+ if (!bp->nb_dirty)
+ return (0);
+
+ /* there are pages marked dirty that need to be written out */
+ OSAddAtomic64(1, &nfsstats.write_bios);
+ NFS_BUF_MAP(bp);
+ SET(bp->nb_flags, NB_WRITEINPROG);
+ npages = bp->nb_bufsize / PAGE_SIZE;
+ iomode = NFS_WRITE_UNSTABLE;
+
+ auio = uio_createwithbuffer(1, 0, UIO_SYSSPACE, UIO_WRITE,
+ &uio_buf, sizeof(uio_buf));
+
+again:
+ dirty = bp->nb_dirty;
+ wverf = bp->nb_verf;
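+ /* start at the strongest commitment level; any weaker level reported by a write reply lowers it */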
+ commit = NFS_WRITE_FILESYNC;
+ for (pg = 0; pg < npages; pg++) {
+ if (!NBPGDIRTY(bp, pg))
+ continue;
+ count = 1;
+ while (((pg + count) < npages) && NBPGDIRTY(bp, pg + count))
+ count++;
+ /* write count pages starting with page pg */
+ off = pg * PAGE_SIZE;
+ len = count * PAGE_SIZE;
+ /* clip writes to EOF */
+ if (NBOFF(bp) + off + len > (off_t) np->n_size)
+ len -= (NBOFF(bp) + off + len) - np->n_size;
+ if (len > 0) {
+ iomode2 = iomode;
+ uio_reset(auio, NBOFF(bp) + off, UIO_SYSSPACE, UIO_WRITE);
+ uio_addiov(auio, CAST_USER_ADDR_T(bp->nb_data + off), len);
+ error = nfs_write_rpc2(np, auio, thd, cred, &iomode2, &bp->nb_verf);
+ if (error)
+ break;
+ if (iomode2 < commit) /* Retain the lowest commitment level returned. */
+ commit = iomode2;
+ if ((commit != NFS_WRITE_FILESYNC) && (wverf != bp->nb_verf)) {
+ /* verifier changed, redo all the writes filesync */
+ iomode = NFS_WRITE_FILESYNC;
+ goto again;
+ }
+ }
+ /* clear these pages' bits in the local dirty map (bp->nb_dirty is updated only if all writes succeed) */
+ while (count--) {
+ dirty &= ~(1 << pg);
+ if (count) /* leave pg on last page */
+ pg++;
+ }
+ }
+ CLR(bp->nb_flags, NB_WRITEINPROG);
+
+ if (!error && (commit != NFS_WRITE_FILESYNC)) {
+ error = nmp->nm_funcs->nf_commit_rpc(np, NBOFF(bp), bp->nb_bufsize, cred, wverf);
+ if (error == NFSERR_STALEWRITEVERF) {
+ /* verifier changed, so we need to restart all the writes */
+ iomode = NFS_WRITE_FILESYNC;
+ goto again;
+ }
+ }
+ if (!error) {
+ bp->nb_dirty = dirty;
+ } else {
+ SET(bp->nb_flags, NB_ERROR);
+ bp->nb_error = error;
+ }
+ return (error);
+}
+
+/*
+ * initiate the NFS WRITE RPC(s) for a buffer
+ */
+int
+nfs_buf_write_rpc(struct nfsbuf *bp, int iomode, thread_t thd, kauth_cred_t cred)
+{
+ struct nfsmount *nmp;
+ nfsnode_t np = bp->nb_np;
+ int error = 0, nfsvers, async;
+ int offset, nrpcs;
+ uint32_t nmwsize, length, len;
+ struct nfsreq *req;
+ struct nfsreq_cbinfo cb;
+ uio_t auio;
+ char uio_buf [ UIO_SIZEOF(1) ];
+
+ nmp = NFSTONMP(np);
+ if (!nmp) {
+ bp->nb_error = error = ENXIO;
+ SET(bp->nb_flags, NB_ERROR);
+ nfs_buf_iodone(bp);
+ return (error);
+ }
+ nfsvers = nmp->nm_vers;
+ nmwsize = nmp->nm_wsize;
+
+ offset = bp->nb_offio;
+ length = bp->nb_endio - bp->nb_offio;
+
+ /* Note: Can only do async I/O if nfsiods are configured. */
+ async = (bp->nb_flags & NB_ASYNC) && (NFSIOD_MAX > 0);
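+ /* start with the strongest commitment level; write replies may downgrade nb_commitlevel to UNSTABLE */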
+ bp->nb_commitlevel = NFS_WRITE_FILESYNC;
+ cb.rcb_func = async ? nfs_buf_write_rpc_finish : NULL;
+ cb.rcb_bp = bp;
+
+ if ((nfsvers == NFS_VER2) && ((NBOFF(bp) + bp->nb_endio) > 0xffffffffLL)) {
+ bp->nb_error = error = EFBIG;
+ SET(bp->nb_flags, NB_ERROR);
+ nfs_buf_iodone(bp);
+ return (error);
+ }
+
+ auio = uio_createwithbuffer(1, NBOFF(bp) + offset, UIO_SYSSPACE,
+ UIO_WRITE, &uio_buf, sizeof(uio_buf));
+ uio_addiov(auio, CAST_USER_ADDR_T(bp->nb_data + offset), length);
+
+ bp->nb_rpcs = nrpcs = (length + nmwsize - 1) / nmwsize;
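+ /* flag whether the write is split across multiple async RPCs so completion can tell when the last one finishes */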
+ if (async && (nrpcs > 1)) {
+ SET(bp->nb_flags, NB_MULTASYNCRPC);
+ } else {
+ CLR(bp->nb_flags, NB_MULTASYNCRPC);
+ }
+
+ while (length > 0) {
+ if (ISSET(bp->nb_flags, NB_ERROR)) {
+ error = bp->nb_error;
+ break;