+ nfs_unlock(np);
+ /*
+ * Make sure that we return an integral number of entries so
+ * that any subsequent calls will start copying from the start
+ * of the next entry.
+ *
+ * If the current value of n has the last entry cut short,
+ * set n to copy everything up to the last entry instead.
+ */
+ if (n > 0) {
+ dp = bp->nb_data + on;
+ while (dp < (bp->nb_data + on + n)) {
+ direntp = (struct dirent *)dp;
+ dp += direntp->d_reclen;
+ }
+ if (dp > (bp->nb_data + on + n))
+ n = (dp - direntp->d_reclen) - (bp->nb_data + on);
+ }
+ }
+
+ if (n > 0)
+ error = uiomove(bp->nb_data + on, (int)n, uio);
+
+ if (vtype == VREG) {
+ if (ioflag & IO_NOCACHE)
+ SET(bp->nb_flags, NB_NOCACHE);
+ nfs_buf_release(bp, 1);
+ nfs_data_unlock(np);
+ np->n_lastread = (uio->uio_offset - 1) / biosize;
+ } else {
+ nfs_buf_release(bp, 1);
+ }
+ } while (error == 0 && uio_uio_resid(uio) > 0 && n > 0);
+ FSDBG_BOT(514, np, uio->uio_offset, uio_uio_resid(uio), error);
+ return (error);
+}
+
+/*
+ * limit the number of outstanding async I/O writes
+ */
+static int
+nfs_async_write_start(struct nfsmount *nmp)
+{
+ int error = 0, slpflag = (nmp->nm_flag & NFSMNT_INT) ? PCATCH : 0;
+ struct timespec ts = {1, 0};
+
+ if (nfs_max_async_writes <= 0)
+ return (0);
+ lck_mtx_lock(&nmp->nm_lock);
+ while ((nfs_max_async_writes > 0) && (nmp->nm_asyncwrites >= nfs_max_async_writes)) {
+ if ((error = nfs_sigintr(nmp, NULL, current_thread(), 1)))
+ break;
+ msleep(&nmp->nm_asyncwrites, &nmp->nm_lock, slpflag|(PZERO-1), "nfsasyncwrites", &ts);
+ slpflag = 0;
+ }
+ if (!error)
+ nmp->nm_asyncwrites++;
+ lck_mtx_unlock(&nmp->nm_lock);
+ return (error);
+}
+ /*
+  * Give back an async write slot taken by nfs_async_write_start().
+  *
+  * Does nothing if the mount's count is not positive.  Otherwise the count
+  * is decremented under nm_lock, and anyone sleeping on nm_asyncwrites is
+  * woken if the count had reached nfs_max_async_writes before the decrement.
+  */
+ static void
+ nfs_async_write_done(struct nfsmount *nmp)
+ {
+ 	int at_limit;
+
+ 	/* unlocked peek: nothing outstanding, so nothing to release */
+ 	if (nmp->nm_asyncwrites <= 0)
+ 		return;
+
+ 	lck_mtx_lock(&nmp->nm_lock);
+ 	/* compare against the limit using the value before the decrement */
+ 	at_limit = (nmp->nm_asyncwrites >= nfs_max_async_writes);
+ 	nmp->nm_asyncwrites--;
+ 	if (at_limit)
+ 		wakeup(&nmp->nm_asyncwrites);
+ 	lck_mtx_unlock(&nmp->nm_lock);
+ }
+
+/*
+ * write (or commit) the given NFS buffer
+ *
+ * Commit the buffer if we can.
+ * Write out any dirty range.
+ * If any dirty pages remain, write them out.
+ * Mark buffer done.
+ *
+ * For async requests, all the work beyond sending the initial
+ * write RPC is handled in the RPC callback(s).
+ */
+int
+nfs_buf_write(struct nfsbuf *bp)
+{
+ int error = 0, oldflags, async;
+ nfsnode_t np;
+ thread_t thd;
+ kauth_cred_t cred;
+ proc_t p = current_proc();
+ int iomode, doff, dend, firstpg, lastpg;
+ uint32_t pagemask;
+
+ FSDBG_TOP(553, bp, NBOFF(bp), bp->nb_flags, 0);
+
+ if (!ISSET(bp->nb_lflags, NBL_BUSY))
+ panic("nfs_buf_write: buffer is not busy???");
+
+ np = bp->nb_np;
+ async = ISSET(bp->nb_flags, NB_ASYNC);
+ oldflags = bp->nb_flags;
+
+ CLR(bp->nb_flags, (NB_READ|NB_DONE|NB_ERROR|NB_DELWRI));
+ if (ISSET(oldflags, NB_DELWRI)) {
+ lck_mtx_lock(nfs_buf_mutex);
+ nfs_nbdwrite--;
+ NFSBUFCNTCHK();
+ lck_mtx_unlock(nfs_buf_mutex);
+ wakeup(&nfs_nbdwrite);
+ }
+
+ /* move to clean list */
+ if (ISSET(oldflags, (NB_ASYNC|NB_DELWRI))) {
+ lck_mtx_lock(nfs_buf_mutex);
+ if (bp->nb_vnbufs.le_next != NFSNOLIST)
+ LIST_REMOVE(bp, nb_vnbufs);
+ LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
+ lck_mtx_unlock(nfs_buf_mutex);
+ }
+ vnode_startwrite(NFSTOV(np));
+
+ if (p && p->p_stats)
+ OSIncrementAtomic(&p->p_stats->p_ru.ru_oublock);
+
+ cred = bp->nb_wcred;
+ if (!IS_VALID_CRED(cred) && ISSET(bp->nb_flags, NB_READ))
+ cred = bp->nb_rcred; /* shouldn't really happen, but... */
+ if (IS_VALID_CRED(cred))
+ kauth_cred_ref(cred);
+ thd = async ? NULL : current_thread();
+
+ /* We need to make sure the pages are locked before doing I/O. */
+ if (!ISSET(bp->nb_flags, NB_META) && UBCINFOEXISTS(NFSTOV(np))) {
+ if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
+ error = nfs_buf_upl_setup(bp);
+ if (error) {
+ printf("nfs_buf_write: upl create failed %d\n", error);
+ SET(bp->nb_flags, NB_ERROR);
+ bp->nb_error = error = EIO;
+ nfs_buf_iodone(bp);
+ goto out;
+ }
+ nfs_buf_upl_check(bp);
+ }
+ }
+
+ /* If NB_NEEDCOMMIT is set, a commit RPC may do the trick. */
+ if (ISSET(bp->nb_flags, NB_NEEDCOMMIT))
+ nfs_buf_check_write_verifier(np, bp);
+ if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
+ struct nfsmount *nmp = NFSTONMP(np);
+ if (!nmp) {
+ SET(bp->nb_flags, NB_ERROR);
+ bp->nb_error = error = EIO;
+ nfs_buf_iodone(bp);
+ goto out;
+ }
+ SET(bp->nb_flags, NB_WRITEINPROG);
+ error = nmp->nm_funcs->nf_commit_rpc(np, NBOFF(bp) + bp->nb_dirtyoff,
+ bp->nb_dirtyend - bp->nb_dirtyoff, bp->nb_wcred);
+ CLR(bp->nb_flags, NB_WRITEINPROG);
+ if (error) {
+ if (error != NFSERR_STALEWRITEVERF) {
+ SET(bp->nb_flags, NB_ERROR);
+ bp->nb_error = error;
+ }
+ nfs_buf_iodone(bp);
+ goto out;
+ }
+ bp->nb_dirtyoff = bp->nb_dirtyend = 0;
+ CLR(bp->nb_flags, NB_NEEDCOMMIT);
+ nfs_lock(np, NFS_NODE_LOCK_FORCE);
+ np->n_needcommitcnt--;
+ CHECK_NEEDCOMMITCNT(np);
+ nfs_unlock(np);
+ }
+ if (!error && (bp->nb_dirtyend > 0)) {
+ /* sanity check the dirty range */
+ if (NBOFF(bp) + bp->nb_dirtyend > (off_t) np->n_size) {
+ bp->nb_dirtyend = np->n_size - NBOFF(bp);
+ if (bp->nb_dirtyoff >= bp->nb_dirtyend)
+ bp->nb_dirtyoff = bp->nb_dirtyend = 0;
+ }
+ }
+ if (!error && (bp->nb_dirtyend > 0)) {
+ /* there's a dirty range that needs to be written out */
+ NFS_BUF_MAP(bp);
+
+ doff = bp->nb_dirtyoff;
+ dend = bp->nb_dirtyend;
+
+ /* if doff page is dirty, move doff to start of page */
+ if (NBPGDIRTY(bp, doff / PAGE_SIZE))
+ doff -= doff & PAGE_MASK;
+ /* try to expand write range to include preceding dirty pages */
+ if (!(doff & PAGE_MASK))
+ while ((doff > 0) && NBPGDIRTY(bp, (doff - 1) / PAGE_SIZE))
+ doff -= PAGE_SIZE;
+ /* if dend page is dirty, move dend to start of next page */
+ if ((dend & PAGE_MASK) && NBPGDIRTY(bp, dend / PAGE_SIZE))
+ dend = round_page_32(dend);
+ /* try to expand write range to include trailing dirty pages */
+ if (!(dend & PAGE_MASK))
+ while ((dend < bp->nb_bufsize) && NBPGDIRTY(bp, dend / PAGE_SIZE))
+ dend += PAGE_SIZE;
+ /* make sure to keep dend clipped to EOF */
+ if ((NBOFF(bp) + dend) > (off_t) np->n_size)
+ dend = np->n_size - NBOFF(bp);
+ /* calculate range of complete pages being written */
+ firstpg = round_page_32(doff) / PAGE_SIZE;
+ lastpg = (trunc_page_32(dend) - 1) / PAGE_SIZE;
+ /* calculate mask for that page range */
+ pagemask = ((1 << (lastpg + 1)) - 1) & ~((1 << firstpg) - 1);
+
+ /*
+ * compare page mask to nb_dirty; if there are other dirty pages
+ * then write FILESYNC; otherwise, write UNSTABLE if async and
+ * not needcommit/stable; otherwise write FILESYNC
+ */
+ if (bp->nb_dirty & ~pagemask)
+ iomode = NFS_WRITE_FILESYNC;
+ else if ((bp->nb_flags & (NB_ASYNC | NB_NEEDCOMMIT | NB_STABLE)) == NB_ASYNC)
+ iomode = NFS_WRITE_UNSTABLE;
+ else
+ iomode = NFS_WRITE_FILESYNC;
+
+ /* write the whole contiguous dirty range */
+ bp->nb_offio = doff;
+ bp->nb_endio = dend;
+
+ OSAddAtomic(1, (SInt32 *)&nfsstats.write_bios);
+
+ SET(bp->nb_flags, NB_WRITEINPROG);
+ error = nfs_buf_write_rpc(bp, iomode, thd, cred);
+ /*
+ * For async I/O, the callbacks will finish up the
+ * write and push out any dirty pages. Otherwise,
+ * the write has already been finished and any dirty
+ * pages pushed out.
+ */
+ } else {
+ if (!error && bp->nb_dirty) /* write out any dirty pages */
+ error = nfs_buf_write_dirty_pages(bp, thd, cred);
+ nfs_buf_iodone(bp);
+ }
+ /* note: bp is still valid only for !async case */
+out:
+ if (!async) {
+ error = nfs_buf_iowait(bp);
+ /* move to clean list */
+ if (oldflags & NB_DELWRI) {
+ lck_mtx_lock(nfs_buf_mutex);
+ if (bp->nb_vnbufs.le_next != NFSNOLIST)
+ LIST_REMOVE(bp, nb_vnbufs);
+ LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
+ lck_mtx_unlock(nfs_buf_mutex);
+ }
+ FSDBG_BOT(553, bp, NBOFF(bp), bp->nb_flags, error);
+ nfs_buf_release(bp, 1);
+ /* check if we need to invalidate (and we can) */
+ if ((np->n_flag & NNEEDINVALIDATE) &&
+ !(np->n_bflag & (NBINVALINPROG|NBFLUSHINPROG))) {
+ int invalidate = 0;
+ nfs_lock(np, NFS_NODE_LOCK_FORCE);
+ if (np->n_flag & NNEEDINVALIDATE) {
+ invalidate = 1;
+ np->n_flag &= ~NNEEDINVALIDATE;
+ }
+ nfs_unlock(np);
+ if (invalidate) {
+ /*
+ * There was a write error and we need to
+ * invalidate attrs and flush buffers in
+ * order to sync up with the server.
+ * (if this write was extending the file,
+ * we may no longer know the correct size)
+ *
+ * But we couldn't call vinvalbuf while holding
+ * the buffer busy. So we call vinvalbuf() after
+ * releasing the buffer.
+ */
+ nfs_vinvalbuf2(NFSTOV(np), V_SAVE|V_IGNORE_WRITEERR, thd, cred, 1);
+ }
+ }
+ }
+
+ if (IS_VALID_CRED(cred))
+ kauth_cred_unref(&cred);
+ return (error);
+}
+
+/*
+ * finish the writing of a buffer
+ */
+void
+nfs_buf_write_finish(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred)
+{
+ nfsnode_t np = bp->nb_np;
+ int error = (bp->nb_flags & NB_ERROR) ? bp->nb_error : 0;
+ int firstpg, lastpg;
+ uint32_t pagemask;
+
+ if ((error == EINTR) || (error == ERESTART)) {
+ CLR(bp->nb_flags, NB_ERROR);
+ SET(bp->nb_flags, NB_EINTR);
+ }
+
+ if (!error) {
+ /* calculate range of complete pages being written */
+ firstpg = round_page_32(bp->nb_offio) / PAGE_SIZE;
+ lastpg = (trunc_page_32(bp->nb_endio) - 1) / PAGE_SIZE;
+ /* calculate mask for that page range written */
+ pagemask = ((1 << (lastpg + 1)) - 1) & ~((1 << firstpg) - 1);
+ /* clear dirty bits for pages we've written */
+ bp->nb_dirty &= ~pagemask;
+ }
+
+ /* manage needcommit state */
+ if (!error && (bp->nb_commitlevel == NFS_WRITE_UNSTABLE)) {
+ if (!ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
+ nfs_lock(np, NFS_NODE_LOCK_FORCE);
+ np->n_needcommitcnt++;
+ nfs_unlock(np);
+ SET(bp->nb_flags, NB_NEEDCOMMIT);
+ }
+ /* make sure nb_dirtyoff/nb_dirtyend reflect actual range written */
+ bp->nb_dirtyoff = bp->nb_offio;
+ bp->nb_dirtyend = bp->nb_endio;
+ } else if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
+ nfs_lock(np, NFS_NODE_LOCK_FORCE);
+ np->n_needcommitcnt--;
+ CHECK_NEEDCOMMITCNT(np);
+ nfs_unlock(np);
+ CLR(bp->nb_flags, NB_NEEDCOMMIT);
+ }
+
+ CLR(bp->nb_flags, NB_WRITEINPROG);
+
+ /*
+ * For an unstable write, the buffer is still treated as dirty until
+ * a commit (or stable (re)write) is performed. Buffers needing only
+ * a commit are marked with the NB_DELWRI and NB_NEEDCOMMIT flags.
+ *
+ * If the write was interrupted we set NB_EINTR. Don't set NB_ERROR
+ * because that would cause the buffer to be dropped. The buffer is
+ * still valid and simply needs to be written again.
+ */
+ if ((error == EINTR) || (error == ERESTART) || (!error && (bp->nb_flags & NB_NEEDCOMMIT))) {
+ CLR(bp->nb_flags, NB_INVAL);
+ if (!ISSET(bp->nb_flags, NB_DELWRI)) {
+ SET(bp->nb_flags, NB_DELWRI);
+ lck_mtx_lock(nfs_buf_mutex);
+ nfs_nbdwrite++;
+ NFSBUFCNTCHK();
+ lck_mtx_unlock(nfs_buf_mutex);
+ }
+ /*
+ * Since for the NB_ASYNC case, we've reassigned the buffer to the
+ * clean list, we have to reassign it back to the dirty one. Ugh.
+ */
+ if (ISSET(bp->nb_flags, NB_ASYNC)) {
+ /* move to dirty list */
+ lck_mtx_lock(nfs_buf_mutex);
+ if (bp->nb_vnbufs.le_next != NFSNOLIST)
+ LIST_REMOVE(bp, nb_vnbufs);
+ LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
+ lck_mtx_unlock(nfs_buf_mutex);
+ }
+ } else {
+ /* either there's an error or we don't need to commit */
+ if (error) {
+ /*
+ * There was a write error and we need to invalidate
+ * attrs and flush buffers in order to sync up with the
+ * server. (if this write was extending the file, we
+ * may no longer know the correct size)
+ *
+ * But we can't call vinvalbuf while holding this
+ * buffer busy. Set a flag to do it after releasing
+ * the buffer.
+ */
+ nfs_lock(np, NFS_NODE_LOCK_FORCE);
+ np->n_error = error;
+ np->n_flag |= (NWRITEERR | NNEEDINVALIDATE);
+ NATTRINVALIDATE(np);
+ nfs_unlock(np);
+ }
+ /* clear the dirty range */
+ bp->nb_dirtyoff = bp->nb_dirtyend = 0;
+ }
+
+ if (!error && bp->nb_dirty)
+ nfs_buf_write_dirty_pages(bp, thd, cred);
+ nfs_buf_iodone(bp);
+}
+
+/*
+ * write out any pages marked dirty in a buffer
+ *
+ * We do use unstable writes and follow up with a commit.
+ * If we catch the write verifier changing we'll restart
+ * do the writes filesync.
+ */
+int
+nfs_buf_write_dirty_pages(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred)
+{
+ nfsnode_t np = bp->nb_np;
+ struct nfsmount *nmp = NFSTONMP(np);
+ int error = 0, commit, iomode, iomode2, len, pg, count, npages, off;
+ uint32_t dirty = bp->nb_dirty;
+ uint64_t wverf;
+ struct uio uio;
+ struct iovec_32 io;
+
+ if (!bp->nb_dirty)
+ return (0);
+
+ /* there are pages marked dirty that need to be written out */
+ OSAddAtomic(1, (SInt32 *)&nfsstats.write_bios);
+ NFS_BUF_MAP(bp);
+ SET(bp->nb_flags, NB_WRITEINPROG);
+ npages = bp->nb_bufsize / PAGE_SIZE;
+ iomode = NFS_WRITE_UNSTABLE;
+
+ uio.uio_iovs.iov32p = &io;
+ uio.uio_iovcnt = 1;
+ uio.uio_rw = UIO_WRITE;
+#if 1 /* LP64todo - can't use new segment flags until the drivers are ready */
+ uio.uio_segflg = UIO_SYSSPACE;
+#else
+ uio.uio_segflg = UIO_SYSSPACE32;
+#endif
+
+again:
+ dirty = bp->nb_dirty;
+ wverf = bp->nb_verf;
+ commit = NFS_WRITE_FILESYNC;
+ for (pg = 0; pg < npages; pg++) {
+ if (!NBPGDIRTY(bp, pg))
+ continue;
+ count = 1;
+ while (((pg + count) < npages) && NBPGDIRTY(bp, pg + count))
+ count++;
+ /* write count pages starting with page pg */
+ off = pg * PAGE_SIZE;
+ len = count * PAGE_SIZE;
+ /* clip writes to EOF */
+ if (NBOFF(bp) + off + len > (off_t) np->n_size)
+ len -= (NBOFF(bp) + off + len) - np->n_size;
+ if (len > 0) {
+ iomode2 = iomode;
+ io.iov_len = len;
+ uio_uio_resid_set(&uio, io.iov_len);
+ uio.uio_offset = NBOFF(bp) + off;
+ io.iov_base = (uintptr_t) bp->nb_data + off;
+ error = nfs_write_rpc2(np, &uio, thd, cred, &iomode2, &bp->nb_verf);
+ if (error)
+ break;
+ if (iomode2 < commit) /* Retain the lowest commitment level returned. */
+ commit = iomode2;
+ if ((commit != NFS_WRITE_FILESYNC) && (wverf != bp->nb_verf)) {
+ /* verifier changed, redo all the writes filesync */
+ iomode = NFS_WRITE_FILESYNC;
+ goto again;
+ }
+ }
+ /* clear dirty bits */
+ while (count--) {
+ dirty &= ~(1 << pg);
+ if (count) /* leave pg on last page */
+ pg++;
+ }
+ }
+ CLR(bp->nb_flags, NB_WRITEINPROG);
+
+ if (!error && (commit != NFS_WRITE_FILESYNC)) {
+ error = nmp->nm_funcs->nf_commit_rpc(np, NBOFF(bp), bp->nb_bufsize, cred);
+ if (error == NFSERR_STALEWRITEVERF) {
+ /* verifier changed, so we need to restart all the writes */
+ iomode = NFS_WRITE_FILESYNC;
+ goto again;
+ }
+ }
+ if (!error) {
+ bp->nb_dirty = dirty;
+ } else {
+ SET(bp->nb_flags, NB_ERROR);
+ bp->nb_error = error;
+ }
+ return (error);
+}
+
+/*
+ * initiate the NFS WRITE RPC(s) for a buffer
+ */
+int
+nfs_buf_write_rpc(struct nfsbuf *bp, int iomode, thread_t thd, kauth_cred_t cred)
+{
+ struct nfsmount *nmp;
+ nfsnode_t np = bp->nb_np;
+ int error = 0, nfsvers, async;
+ int offset, length, nmwsize, nrpcs, len;
+ struct nfsreq *req;
+ struct nfsreq_cbinfo cb;
+ struct uio uio;
+ struct iovec_32 io;
+
+ nmp = NFSTONMP(np);
+ if (!nmp) {
+ bp->nb_error = error = ENXIO;
+ SET(bp->nb_flags, NB_ERROR);
+ nfs_buf_iodone(bp);
+ return (error);
+ }
+ nfsvers = nmp->nm_vers;
+ nmwsize = nmp->nm_wsize;
+
+ offset = bp->nb_offio;
+ length = bp->nb_endio - bp->nb_offio;
+
+ /* Note: Can only do async I/O if nfsiods are configured. */
+ async = (bp->nb_flags & NB_ASYNC) && (NFSIOD_MAX > 0);
+ bp->nb_commitlevel = NFS_WRITE_FILESYNC;
+ cb.rcb_func = async ? nfs_buf_write_rpc_finish : NULL;
+ cb.rcb_bp = bp;
+
+ if ((nfsvers == NFS_VER2) && ((NBOFF(bp) + bp->nb_endio) > 0xffffffffLL)) {
+ bp->nb_error = error = EFBIG;
+ SET(bp->nb_flags, NB_ERROR);
+ nfs_buf_iodone(bp);
+ return (error);
+ }
+
+ uio.uio_iovs.iov32p = &io;
+ uio.uio_iovcnt = 1;
+ uio.uio_rw = UIO_WRITE;
+#if 1 /* LP64todo - can't use new segment flags until the drivers are ready */
+ uio.uio_segflg = UIO_SYSSPACE;
+#else
+ uio.uio_segflg = UIO_SYSSPACE32;
+#endif
+ io.iov_len = length;
+ uio_uio_resid_set(&uio, io.iov_len);
+ uio.uio_offset = NBOFF(bp) + offset;
+ io.iov_base = (uintptr_t) bp->nb_data + offset;
+
+ bp->nb_rpcs = nrpcs = (length + nmwsize - 1) / nmwsize;
+ if (async && (nrpcs > 1)) {
+ SET(bp->nb_flags, NB_MULTASYNCRPC);
+ } else {
+ CLR(bp->nb_flags, NB_MULTASYNCRPC);
+ }
+
+ while (length > 0) {
+ if (ISSET(bp->nb_flags, NB_ERROR)) {
+ error = bp->nb_error;
+ break;
+ }
+ len = (length > nmwsize) ? nmwsize : length;
+ cb.rcb_args[0] = offset;
+ cb.rcb_args[1] = len;
+ if (async && ((error = nfs_async_write_start(nmp))))
+ break;
+ req = NULL;
+ error = nmp->nm_funcs->nf_write_rpc_async(np, &uio, len, thd, cred,
+ iomode, &cb, &req);
+ if (error) {
+ if (async)
+ nfs_async_write_done(nmp);
+ break;
+ }
+ offset += len;
+ length -= len;
+ if (async)
+ continue;
+ nfs_buf_write_rpc_finish(req);
+ }
+
+ if (length > 0) {