X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/483a1d1004b64bbaaef2c64c17c6b999009a54d2..13f56ec4e58bf8687e2a68032c093c0213dd519b:/bsd/nfs/nfs_bio.c diff --git a/bsd/nfs/nfs_bio.c b/bsd/nfs/nfs_bio.c index c756be676..4bd1bff61 100644 --- a/bsd/nfs/nfs_bio.c +++ b/bsd/nfs/nfs_bio.c @@ -1,23 +1,29 @@ /* - * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * - * @APPLE_LICENSE_HEADER_START@ + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * - * The contents of this file constitute Original Code as defined in and - * are subject to the Apple Public Source License Version 1.1 (the - * "License"). You may not use this file except in compliance with the - * License. Please obtain a copy of the License at - * http://www.apple.com/publicsource and read it before using this file. + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. * - * This Original Code and all software distributed under the License are - * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the - * License for the specific language governing rights and limitations - * under the License. + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. * - * @APPLE_LICENSE_HEADER_END@ + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */ /* @@ -62,55 +68,52 @@ #include #include #include -#include +#include +#include #include #include #include -#include +#include #include -#include -#include +#include +#include +#include #include #include #include #include +#include +#include +#include #include #include #include +#include #include -#include #include +#include +#include -#include - -#define FSDBG(A, B, C, D, E) \ - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_NONE, \ - (int)(B), (int)(C), (int)(D), (int)(E), 0) -#define FSDBG_TOP(A, B, C, D, E) \ - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_START, \ - (int)(B), (int)(C), (int)(D), (int)(E), 0) -#define FSDBG_BOT(A, B, C, D, E) \ - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_END, \ - (int)(B), (int)(C), (int)(D), (int)(E), 0) +kern_return_t thread_terminate(thread_t); /* XXX */ -extern int nfs_numasync; -extern int nfs_ioddelwri; -extern struct nfsstats nfsstats; - -#define NFSBUFHASH(dvp, lbn) \ - (&nfsbufhashtbl[((long)(dvp) / sizeof(*(dvp)) + (int)(lbn)) & nfsbufhash]) +#define NFSBUFHASH(np, lbn) \ + (&nfsbufhashtbl[((long)(np) / sizeof(*(np)) + (int)(lbn)) & nfsbufhash]) LIST_HEAD(nfsbufhashhead, nfsbuf) *nfsbufhashtbl; struct nfsbuffreehead nfsbuffree, nfsbuffreemeta, nfsbufdelwri; u_long nfsbufhash; -int nfsbufhashlock, nfsbufcnt, nfsbufmin, nfsbufmax; +int nfsbufcnt, nfsbufmin, nfsbufmax, nfsbufmetacnt, nfsbufmetamax; int nfsbuffreecnt, nfsbuffreemetacnt, nfsbufdelwricnt, nfsneedbuffer; int nfs_nbdwrite; -time_t nfsbuffreeuptimestamp; +int nfs_buf_timer_on = 0; +thread_t nfsbufdelwrithd = NULL; + +lck_grp_t *nfs_buf_lck_grp; +lck_mtx_t *nfs_buf_mutex; -#define NFSBUFWRITE_THROTTLE 9 +#define NFSBUF_FREE_PERIOD 30 /* seconds */ #define NFSBUF_LRU_STALE 120 #define NFSBUF_META_STALE 240 @@ -120,9 +123,9 @@ time_t nfsbuffreeuptimestamp; #define META_TO_FREEUP 3 /* total number of nfsbufs nfs_buf_freeup() should attempt to free */ #define TOTAL_TO_FREEUP (LRU_TO_FREEUP+META_TO_FREEUP) -/* fraction of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffree list when called from nfs_timer() */ +/* fraction of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffree list when called from timer */ #define LRU_FREEUP_FRAC_ON_TIMER 8 -/* fraction of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffreemeta list when called from nfs_timer() */ +/* fraction of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffreemeta list when called from timer */ #define META_FREEUP_FRAC_ON_TIMER 16 /* fraction of total nfsbufs that nfsbuffreecnt should exceed before bothering to call nfs_buf_freeup() */ #define LRU_FREEUP_MIN_FRAC 4 @@ -130,7 +133,7 @@ time_t nfsbuffreeuptimestamp; #define META_FREEUP_MIN_FRAC 2 #define NFS_BUF_FREEUP() \ - do { \ + do { \ /* only call nfs_buf_freeup() if it has work to do: */ \ if (((nfsbuffreecnt > nfsbufcnt/LRU_FREEUP_MIN_FRAC) || \ (nfsbuffreemetacnt > nfsbufcnt/META_FREEUP_MIN_FRAC)) && \ @@ -144,17 +147,43 @@ time_t nfsbuffreeuptimestamp; void nfs_nbinit(void) { - nfsbufhashlock = 0; - nfsbufhashtbl = hashinit(nbuf, M_TEMP, &nfsbufhash); + nfs_buf_lck_grp = lck_grp_alloc_init("nfs_buf", LCK_GRP_ATTR_NULL); + nfs_buf_mutex = lck_mtx_alloc_init(nfs_buf_lck_grp, LCK_ATTR_NULL); + + nfsbufcnt = nfsbufmetacnt = + nfsbuffreecnt = nfsbuffreemetacnt = nfsbufdelwricnt = 0; + nfsbufmin = 128; + /* size nfsbufmax to cover at most half sane_size (w/default buf size) */ + nfsbufmax = (sane_size >> PAGE_SHIFT) / (2 * (NFS_RWSIZE >> PAGE_SHIFT)); + nfsbufmetamax = nfsbufmax / 4; + nfsneedbuffer = 0; + nfs_nbdwrite = 0; + + nfsbufhashtbl = hashinit(nfsbufmax/4, M_TEMP, &nfsbufhash); TAILQ_INIT(&nfsbuffree); TAILQ_INIT(&nfsbuffreemeta); TAILQ_INIT(&nfsbufdelwri); - nfsbufcnt = nfsbuffreecnt = nfsbuffreemetacnt = nfsbufdelwricnt = 0; - nfsbufmin = 128; // XXX tune me! - nfsbufmax = 8192; // XXX tune me! - nfsneedbuffer = 0; - nfs_nbdwrite = 0; - nfsbuffreeuptimestamp = 0; + +} + +/* + * Check periodically for stale/unused nfs bufs + */ +void +nfs_buf_timer(__unused void *param0, __unused void *param1) +{ + nfs_buf_freeup(1); + + lck_mtx_lock(nfs_buf_mutex); + if (nfsbufcnt <= nfsbufmin) { + nfs_buf_timer_on = 0; + lck_mtx_unlock(nfs_buf_mutex); + return; + } + lck_mtx_unlock(nfs_buf_mutex); + + nfs_interval_timer_start(nfs_buf_timer_call, + NFSBUF_FREE_PERIOD * 1000); } /* @@ -166,41 +195,37 @@ nfs_buf_freeup(int timer) struct nfsbuf *fbp; struct timeval now; int count; + struct nfsbuffreehead nfsbuffreeup; + + TAILQ_INIT(&nfsbuffreeup); + + lck_mtx_lock(nfs_buf_mutex); microuptime(&now); - nfsbuffreeuptimestamp = now.tv_sec; - FSDBG(320, nfsbufcnt, nfsbuffreecnt, nfsbuffreemetacnt, count); + FSDBG(320, nfsbufcnt, nfsbuffreecnt, nfsbuffreemetacnt, 0); + count = timer ? nfsbuffreecnt/LRU_FREEUP_FRAC_ON_TIMER : LRU_TO_FREEUP; while ((nfsbufcnt > nfsbufmin) && (count-- > 0)) { fbp = TAILQ_FIRST(&nfsbuffree); if (!fbp) break; - if ((fbp->nb_timestamp + (2*NFSBUF_LRU_STALE)) > now.tv_sec) + if (fbp->nb_refs) + break; + if (NBUFSTAMPVALID(fbp) && + (fbp->nb_timestamp + (2*NFSBUF_LRU_STALE)) > now.tv_sec) break; nfs_buf_remfree(fbp); - /* disassociate buffer from any vnode */ - if (fbp->nb_vp) { - struct vnode *oldvp; + /* disassociate buffer from any nfsnode */ + if (fbp->nb_np) { if (fbp->nb_vnbufs.le_next != NFSNOLIST) { LIST_REMOVE(fbp, nb_vnbufs); fbp->nb_vnbufs.le_next = NFSNOLIST; } - oldvp = fbp->nb_vp; - fbp->nb_vp = NULL; - HOLDRELE(oldvp); + fbp->nb_np = NULL; } LIST_REMOVE(fbp, nb_hash); - /* nuke any creds */ - if (fbp->nb_rcred != NOCRED) - crfree(fbp->nb_rcred); - if (fbp->nb_wcred != NOCRED) - crfree(fbp->nb_wcred); - /* if buf was NB_META, dump buffer */ - if (ISSET(fbp->nb_flags, NB_META) && fbp->nb_data) { - FREE(fbp->nb_data, M_TEMP); - } - FREE(fbp, M_TEMP); + TAILQ_INSERT_TAIL(&nfsbuffreeup, fbp, nb_free); nfsbufcnt--; } @@ -209,36 +234,50 @@ nfs_buf_freeup(int timer) fbp = TAILQ_FIRST(&nfsbuffreemeta); if (!fbp) break; - if ((fbp->nb_timestamp + (2*NFSBUF_META_STALE)) > now.tv_sec) + if (fbp->nb_refs) + break; + if (NBUFSTAMPVALID(fbp) && + (fbp->nb_timestamp + (2*NFSBUF_META_STALE)) > now.tv_sec) break; nfs_buf_remfree(fbp); - /* disassociate buffer from any vnode */ - if (fbp->nb_vp) { - struct vnode *oldvp; + /* disassociate buffer from any nfsnode */ + if (fbp->nb_np) { if (fbp->nb_vnbufs.le_next != NFSNOLIST) { LIST_REMOVE(fbp, nb_vnbufs); fbp->nb_vnbufs.le_next = NFSNOLIST; } - oldvp = fbp->nb_vp; - fbp->nb_vp = NULL; - HOLDRELE(oldvp); + fbp->nb_np = NULL; } LIST_REMOVE(fbp, nb_hash); + TAILQ_INSERT_TAIL(&nfsbuffreeup, fbp, nb_free); + nfsbufcnt--; + nfsbufmetacnt--; + } + + FSDBG(320, nfsbufcnt, nfsbuffreecnt, nfsbuffreemetacnt, 0); + NFSBUFCNTCHK(); + + lck_mtx_unlock(nfs_buf_mutex); + + while ((fbp = TAILQ_FIRST(&nfsbuffreeup))) { + TAILQ_REMOVE(&nfsbuffreeup, fbp, nb_free); /* nuke any creds */ - if (fbp->nb_rcred != NOCRED) - crfree(fbp->nb_rcred); - if (fbp->nb_wcred != NOCRED) - crfree(fbp->nb_wcred); + if (IS_VALID_CRED(fbp->nb_rcred)) + kauth_cred_unref(&fbp->nb_rcred); + if (IS_VALID_CRED(fbp->nb_wcred)) + kauth_cred_unref(&fbp->nb_wcred); /* if buf was NB_META, dump buffer */ - if (ISSET(fbp->nb_flags, NB_META) && fbp->nb_data) { - FREE(fbp->nb_data, M_TEMP); - } + if (ISSET(fbp->nb_flags, NB_META) && fbp->nb_data) + kfree(fbp->nb_data, fbp->nb_bufsize); FREE(fbp, M_TEMP); - nfsbufcnt--; } - FSDBG(320, nfsbufcnt, nfsbuffreecnt, nfsbuffreemetacnt, count); + } +/* + * remove a buffer from the freelist + * (must be called with nfs_buf_mutex held) + */ void nfs_buf_remfree(struct nfsbuf *bp) { @@ -247,7 +286,7 @@ nfs_buf_remfree(struct nfsbuf *bp) if (ISSET(bp->nb_flags, NB_DELWRI)) { nfsbufdelwricnt--; TAILQ_REMOVE(&nfsbufdelwri, bp, nb_free); - } else if (ISSET(bp->nb_flags, NB_META) && !ISSET(bp->nb_flags, NB_INVAL)) { + } else if (ISSET(bp->nb_flags, NB_META)) { nfsbuffreemetacnt--; TAILQ_REMOVE(&nfsbuffreemeta, bp, nb_free); } else { @@ -261,15 +300,31 @@ nfs_buf_remfree(struct nfsbuf *bp) /* * check for existence of nfsbuf in cache */ +boolean_t +nfs_buf_is_incore(nfsnode_t np, daddr64_t blkno) +{ + boolean_t rv; + lck_mtx_lock(nfs_buf_mutex); + if (nfs_buf_incore(np, blkno)) + rv = TRUE; + else + rv = FALSE; + lck_mtx_unlock(nfs_buf_mutex); + return (rv); +} + +/* + * return incore buffer (must be called with nfs_buf_mutex held) + */ struct nfsbuf * -nfs_buf_incore(struct vnode *vp, daddr_t blkno) +nfs_buf_incore(nfsnode_t np, daddr64_t blkno) { /* Search hash chain */ - struct nfsbuf * bp = NFSBUFHASH(vp, blkno)->lh_first; + struct nfsbuf * bp = NFSBUFHASH(np, blkno)->lh_first; for (; bp != NULL; bp = bp->nb_hash.le_next) - if (bp->nb_lblkno == blkno && bp->nb_vp == vp) { + if ((bp->nb_lblkno == blkno) && (bp->nb_np == np)) { if (!ISSET(bp->nb_flags, NB_INVAL)) { - FSDBG(547, bp, blkno, bp->nb_flags, bp->nb_vp); + FSDBG(547, bp, blkno, bp->nb_flags, bp->nb_np); return (bp); } } @@ -285,15 +340,24 @@ nfs_buf_incore(struct vnode *vp, daddr_t blkno) * later when/if we need to write the data (again). */ int -nfs_buf_page_inval(struct vnode *vp, off_t offset) +nfs_buf_page_inval(vnode_t vp, off_t offset) { + struct nfsmount *nmp = VTONMP(vp); struct nfsbuf *bp; - bp = nfs_buf_incore(vp, ubc_offtoblk(vp, offset)); + int error = 0; + + if (!nmp) + return (ENXIO); + + lck_mtx_lock(nfs_buf_mutex); + bp = nfs_buf_incore(VTONFS(vp), (daddr64_t)(offset / nmp->nm_biosize)); if (!bp) - return (0); + goto out; FSDBG(325, bp, bp->nb_flags, bp->nb_dirtyoff, bp->nb_dirtyend); - if (ISSET(bp->nb_flags, NB_BUSY)) - return (EBUSY); + if (ISSET(bp->nb_lflags, NBL_BUSY)) { + error = EBUSY; + goto out; + } /* * If there's a dirty range in the buffer, check to * see if this page intersects with the dirty range. @@ -301,26 +365,50 @@ nfs_buf_page_inval(struct vnode *vp, off_t offset) */ if (bp->nb_dirtyend > 0) { int start = offset - NBOFF(bp); - if (bp->nb_dirtyend <= start || - bp->nb_dirtyoff >= (start + PAGE_SIZE)) - return (0); - return (EBUSY); + if ((bp->nb_dirtyend > start) && + (bp->nb_dirtyoff < (start + PAGE_SIZE))) { + /* + * Before returning the bad news, move the + * buffer to the start of the delwri list and + * give the list a push to try to flush the + * buffer out. + */ + error = EBUSY; + nfs_buf_remfree(bp); + TAILQ_INSERT_HEAD(&nfsbufdelwri, bp, nb_free); + nfsbufdelwricnt++; + nfs_buf_delwri_push(1); + } } - return (0); +out: + lck_mtx_unlock(nfs_buf_mutex); + return (error); } +/* + * set up the UPL for a buffer + * (must NOT be called with nfs_buf_mutex held) + */ int nfs_buf_upl_setup(struct nfsbuf *bp) { kern_return_t kret; upl_t upl; - int s; + int upl_flags; if (ISSET(bp->nb_flags, NB_PAGELIST)) return (0); - kret = ubc_create_upl(bp->nb_vp, NBOFF(bp), bp->nb_bufsize, - &upl, NULL, UPL_PRECIOUS); + upl_flags = UPL_PRECIOUS; + if (!ISSET(bp->nb_flags, NB_READ)) { + /* + * We're doing a "write", so we intend to modify + * the pages we're gathering. + */ + upl_flags |= UPL_WILL_MODIFY; + } + kret = ubc_create_upl(NFSTOV(bp->nb_np), NBOFF(bp), bp->nb_bufsize, + &upl, NULL, upl_flags); if (kret == KERN_INVALID_ARGUMENT) { /* vm object probably doesn't exist any more */ bp->nb_pagelist = NULL; @@ -332,15 +420,17 @@ nfs_buf_upl_setup(struct nfsbuf *bp) return (EIO); } - FSDBG(538, bp, NBOFF(bp), bp->nb_bufsize, bp->nb_vp); + FSDBG(538, bp, NBOFF(bp), bp->nb_bufsize, bp->nb_np); - s = splbio(); bp->nb_pagelist = upl; SET(bp->nb_flags, NB_PAGELIST); - splx(s); return (0); } +/* + * update buffer's valid/dirty info from UBC + * (must NOT be called with nfs_buf_mutex held) + */ void nfs_buf_upl_check(struct nfsbuf *bp) { @@ -352,7 +442,7 @@ nfs_buf_upl_check(struct nfsbuf *bp) return; npages = round_page_32(bp->nb_bufsize) / PAGE_SIZE; - filesize = ubc_getsize(bp->nb_vp); + filesize = ubc_getsize(NFSTOV(bp->nb_np)); fileoffset = NBOFF(bp); if (fileoffset < filesize) SET(bp->nb_flags, NB_CACHE); @@ -371,11 +461,8 @@ nfs_buf_upl_check(struct nfsbuf *bp) continue; } NBPGVALID_SET(bp,i); - if (upl_dirty_page(pl, i)) { + if (upl_dirty_page(pl, i)) NBPGDIRTY_SET(bp, i); - if (!ISSET(bp->nb_flags, NB_WASDIRTY)) - SET(bp->nb_flags, NB_WASDIRTY); - } } fileoffset = NBOFF(bp); if (ISSET(bp->nb_flags, NB_CACHE)) { @@ -390,7 +477,11 @@ nfs_buf_upl_check(struct nfsbuf *bp) FSDBG(539, bp->nb_validoff, bp->nb_validend, bp->nb_dirtyoff, bp->nb_dirtyend); } -static int +/* + * make sure that a buffer is mapped + * (must NOT be called with nfs_buf_mutex held) + */ +int nfs_buf_map(struct nfsbuf *bp) { kern_return_t kret; @@ -400,7 +491,7 @@ nfs_buf_map(struct nfsbuf *bp) if (!ISSET(bp->nb_flags, NB_PAGELIST)) return (EINVAL); - kret = ubc_upl_map(bp->nb_pagelist, (vm_address_t *)&(bp->nb_data)); + kret = ubc_upl_map(bp->nb_pagelist, (vm_offset_t *)&(bp->nb_data)); if (kret != KERN_SUCCESS) panic("nfs_buf_map: ubc_upl_map() failed with (%d)", kret); if (bp->nb_data == 0) @@ -409,37 +500,6 @@ nfs_buf_map(struct nfsbuf *bp) return (0); } -/* - * check range of pages in nfsbuf's UPL for validity - */ -static int -nfs_buf_upl_valid_range(struct nfsbuf *bp, int off, int size) -{ - off_t fileoffset, filesize; - int pg, lastpg; - upl_page_info_t *pl; - - if (!ISSET(bp->nb_flags, NB_PAGELIST)) - return (0); - pl = ubc_upl_pageinfo(bp->nb_pagelist); - - size += off & PAGE_MASK; - off &= ~PAGE_MASK; - fileoffset = NBOFF(bp); - filesize = VTONFS(bp->nb_vp)->n_size; - if ((fileoffset + off + size) > filesize) - size = filesize - (fileoffset + off); - - pg = off/PAGE_SIZE; - lastpg = (off + size - 1)/PAGE_SIZE; - while (pg <= lastpg) { - if (!upl_valid_page(pl, pg)) - return (0); - pg++; - } - return (1); -} - /* * normalize an nfsbuf's valid range * @@ -449,8 +509,8 @@ nfs_buf_upl_valid_range(struct nfsbuf *bp, int off, int size) * here as we extend the valid range through all of the * contiguous valid pages. */ -static void -nfs_buf_normalize_valid_range(struct nfsnode *np, struct nfsbuf *bp) +void +nfs_buf_normalize_valid_range(nfsnode_t np, struct nfsbuf *bp) { int pg, npg; /* pull validoff back to start of contiguous valid page range */ @@ -465,131 +525,209 @@ nfs_buf_normalize_valid_range(struct nfsnode *np, struct nfsbuf *bp) pg++; bp->nb_validend = pg * PAGE_SIZE; /* clip to EOF */ - if (NBOFF(bp) + bp->nb_validend > np->n_size) + if (NBOFF(bp) + bp->nb_validend > (off_t)np->n_size) bp->nb_validend = np->n_size % bp->nb_bufsize; } /* - * try to push out some delayed/uncommitted writes + * process some entries on the delayed write queue + * (must be called with nfs_buf_mutex held) */ -static void -nfs_buf_delwri_push(void) +void +nfs_buf_delwri_service(void) { struct nfsbuf *bp; - int i; - - if (TAILQ_EMPTY(&nfsbufdelwri)) - return; + nfsnode_t np; + int error, i = 0; - /* first try to tell the nfsiods to do it */ - if (nfs_asyncio(NULL, NULL) == 0) - return; - - /* otherwise, try to do some of the work ourselves */ - i = 0; while (i < 8 && (bp = TAILQ_FIRST(&nfsbufdelwri)) != NULL) { - struct nfsnode *np = VTONFS(bp->nb_vp); + np = bp->nb_np; nfs_buf_remfree(bp); + nfs_buf_refget(bp); + while ((error = nfs_buf_acquire(bp, 0, 0, 0)) == EAGAIN); + nfs_buf_refrele(bp); + if (error) + break; + if (!bp->nb_np) { + /* buffer is no longer valid */ + nfs_buf_drop(bp); + continue; + } + if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) + nfs_buf_check_write_verifier(np, bp); if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) { /* put buffer at end of delwri list */ TAILQ_INSERT_TAIL(&nfsbufdelwri, bp, nb_free); nfsbufdelwricnt++; - nfs_flushcommits(np->n_vnode, (struct proc *)0); + nfs_buf_drop(bp); + lck_mtx_unlock(nfs_buf_mutex); + nfs_flushcommits(np, 1); } else { - SET(bp->nb_flags, (NB_BUSY | NB_ASYNC)); + SET(bp->nb_flags, NB_ASYNC); + lck_mtx_unlock(nfs_buf_mutex); nfs_buf_write(bp); } i++; + lck_mtx_lock(nfs_buf_mutex); } } /* - * Get an nfs cache block. - * Allocate a new one if the block isn't currently in the cache - * and return the block marked busy. If the calling process is - * interrupted by a signal for an interruptible mount point, return - * NULL. + * thread to service the delayed write queue when asked */ -struct nfsbuf * +void +nfs_buf_delwri_thread(__unused void *arg, __unused wait_result_t wr) +{ + struct timespec ts = { 30, 0 }; + int error = 0; + + lck_mtx_lock(nfs_buf_mutex); + while (!error) { + nfs_buf_delwri_service(); + error = msleep(&nfsbufdelwrithd, nfs_buf_mutex, 0, "nfsbufdelwri", &ts); + } + nfsbufdelwrithd = NULL; + lck_mtx_unlock(nfs_buf_mutex); + thread_terminate(nfsbufdelwrithd); +} + +/* + * try to push out some delayed/uncommitted writes + * ("locked" indicates whether nfs_buf_mutex is already held) + */ +void +nfs_buf_delwri_push(int locked) +{ + if (TAILQ_EMPTY(&nfsbufdelwri)) + return; + if (!locked) + lck_mtx_lock(nfs_buf_mutex); + /* wake up the delayed write service thread */ + if (nfsbufdelwrithd) + wakeup(&nfsbufdelwrithd); + else if (kernel_thread_start(nfs_buf_delwri_thread, NULL, &nfsbufdelwrithd) == KERN_SUCCESS) + thread_deallocate(nfsbufdelwrithd); + /* otherwise, try to do some of the work ourselves */ + if (!nfsbufdelwrithd) + nfs_buf_delwri_service(); + if (!locked) + lck_mtx_unlock(nfs_buf_mutex); +} + +/* + * Get an nfs buffer. + * + * Returns errno on error, 0 otherwise. + * Any buffer is returned in *bpp. + * + * If NBLK_ONLYVALID is set, only return buffer if found in cache. + * If NBLK_NOWAIT is set, don't wait for the buffer if it's marked BUSY. + * + * Check for existence of buffer in cache. + * Or attempt to reuse a buffer from one of the free lists. + * Or allocate a new buffer if we haven't already hit max allocation. + * Or wait for a free buffer. + * + * If available buffer found, prepare it, and return it. + * + * If the calling process is interrupted by a signal for + * an interruptible mount point, return EINTR. + */ +int nfs_buf_get( - struct vnode *vp, - daddr_t blkno, - int size, - struct proc *p, - int operation) + nfsnode_t np, + daddr64_t blkno, + uint32_t size, + thread_t thd, + int flags, + struct nfsbuf **bpp) { - struct nfsnode *np = VTONFS(vp); + vnode_t vp = NFSTOV(np); + struct nfsmount *nmp = VTONMP(vp); struct nfsbuf *bp; - int i, biosize, bufsize, rv; - struct ucred *cred; + uint32_t bufsize; int slpflag = PCATCH; + int operation = (flags & NBLK_OPMASK); + int error = 0; + struct timespec ts; - FSDBG_TOP(541, vp, blkno, size, operation); + FSDBG_TOP(541, np, blkno, size, flags); + *bpp = NULL; bufsize = size; - if (bufsize > MAXBSIZE) - panic("nfs_buf_get: buffer larger than MAXBSIZE requested"); + if (bufsize > NFS_MAXBSIZE) + panic("nfs_buf_get: buffer larger than NFS_MAXBSIZE requested"); - biosize = vp->v_mount->mnt_stat.f_iosize; + if (!nmp) { + FSDBG_BOT(541, np, blkno, 0, ENXIO); + return (ENXIO); + } - if (UBCINVALID(vp) || !UBCINFOEXISTS(vp)) - operation = BLK_META; - else if (bufsize < biosize) + if (!UBCINFOEXISTS(vp)) { + operation = NBLK_META; + } else if (bufsize < (uint32_t)nmp->nm_biosize) { /* reg files should always have biosize blocks */ - bufsize = biosize; + bufsize = nmp->nm_biosize; + } - /* if BLK_WRITE, check for too many delayed/uncommitted writes */ - if ((operation == BLK_WRITE) && (nfs_nbdwrite > ((nfsbufcnt*3)/4))) { - FSDBG_TOP(542, vp, blkno, nfs_nbdwrite, ((nfsbufcnt*3)/4)); + /* if NBLK_WRITE, check for too many delayed/uncommitted writes */ + if ((operation == NBLK_WRITE) && (nfs_nbdwrite > NFS_A_LOT_OF_DELAYED_WRITES)) { + FSDBG_TOP(542, np, blkno, nfs_nbdwrite, NFS_A_LOT_OF_DELAYED_WRITES); /* poke the delwri list */ - nfs_buf_delwri_push(); + nfs_buf_delwri_push(0); /* sleep to let other threads run... */ tsleep(&nfs_nbdwrite, PCATCH, "nfs_nbdwrite", 1); - FSDBG_BOT(542, vp, blkno, nfs_nbdwrite, ((nfsbufcnt*3)/4)); + FSDBG_BOT(542, np, blkno, nfs_nbdwrite, NFS_A_LOT_OF_DELAYED_WRITES); } loop: - /* - * Obtain a lock to prevent a race condition if the - * MALLOC() below happens to block. - */ - if (nfsbufhashlock) { - while (nfsbufhashlock) { - nfsbufhashlock = -1; - tsleep(&nfsbufhashlock, PCATCH, "nfsbufget", 0); - if (nfs_sigintr(VFSTONFS(vp->v_mount), NULL, p)) - return (NULL); + lck_mtx_lock(nfs_buf_mutex); + + /* wait for any buffer invalidation/flushing to complete */ + while (np->n_bflag & NBINVALINPROG) { + np->n_bflag |= NBINVALWANT; + ts.tv_sec = 2; + ts.tv_nsec = 0; + msleep(&np->n_bflag, nfs_buf_mutex, slpflag, "nfs_buf_get_invalwait", &ts); + if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0))) { + lck_mtx_unlock(nfs_buf_mutex); + FSDBG_BOT(541, np, blkno, 0, error); + return (error); } - goto loop; + if (np->n_bflag & NBINVALINPROG) + slpflag = 0; } - nfsbufhashlock = 1; /* check for existence of nfsbuf in cache */ - if (bp = nfs_buf_incore(vp, blkno)) { + if ((bp = nfs_buf_incore(np, blkno))) { /* if busy, set wanted and wait */ - if (ISSET(bp->nb_flags, NB_BUSY)) { - FSDBG_TOP(543, vp, blkno, bp, bp->nb_flags); - SET(bp->nb_flags, NB_WANTED); - /* unlock hash */ - if (nfsbufhashlock < 0) { - nfsbufhashlock = 0; - wakeup(&nfsbufhashlock); - } else - nfsbufhashlock = 0; - tsleep(bp, slpflag|(PRIBIO+1), "nfsbufget", (slpflag == PCATCH) ? 0 : 2*hz); + if (ISSET(bp->nb_lflags, NBL_BUSY)) { + if (flags & NBLK_NOWAIT) { + lck_mtx_unlock(nfs_buf_mutex); + FSDBG_BOT(541, np, blkno, bp, 0xbcbcbcbc); + return (0); + } + FSDBG_TOP(543, np, blkno, bp, bp->nb_flags); + SET(bp->nb_lflags, NBL_WANTED); + + ts.tv_sec = 2; + ts.tv_nsec = 0; + msleep(bp, nfs_buf_mutex, slpflag|(PRIBIO+1)|PDROP, + "nfsbufget", (slpflag == PCATCH) ? NULL : &ts); slpflag = 0; - FSDBG_BOT(543, vp, blkno, bp, bp->nb_flags); - if (nfs_sigintr(VFSTONFS(vp->v_mount), NULL, p)) { - FSDBG_BOT(541, vp, blkno, 0, EINTR); - return (NULL); + FSDBG_BOT(543, np, blkno, bp, bp->nb_flags); + if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0))) { + FSDBG_BOT(541, np, blkno, 0, error); + return (error); } goto loop; } if (bp->nb_bufsize != bufsize) panic("nfsbuf size mismatch"); - SET(bp->nb_flags, (NB_BUSY | NB_CACHE)); + SET(bp->nb_lflags, NBL_BUSY); + SET(bp->nb_flags, NB_CACHE); nfs_buf_remfree(bp); /* additional paranoia: */ if (ISSET(bp->nb_flags, NB_PAGELIST)) @@ -597,8 +735,15 @@ loop: goto buffer_setup; } + if (flags & NBLK_ONLYVALID) { + lck_mtx_unlock(nfs_buf_mutex); + FSDBG_BOT(541, np, blkno, 0, 0x0000cace); + return (0); + } + /* * where to get a free buffer: + * - if meta and maxmeta reached, must reuse meta * - alloc new if we haven't reached min bufs * - if free lists are NOT empty * - if free list is stale, use it @@ -608,19 +753,24 @@ loop: * - start clearing out delwri list and try again */ - if ((nfsbufcnt > nfsbufmin) && + if ((operation == NBLK_META) && (nfsbufmetacnt >= nfsbufmetamax)) { + /* if we've hit max meta buffers, must reuse a meta buffer */ + bp = TAILQ_FIRST(&nfsbuffreemeta); + } else if ((nfsbufcnt > nfsbufmin) && (!TAILQ_EMPTY(&nfsbuffree) || !TAILQ_EMPTY(&nfsbuffreemeta))) { /* try to pull an nfsbuf off a free list */ struct nfsbuf *lrubp, *metabp; struct timeval now; microuptime(&now); - /* if the next LRU or META buffer is stale, use it */ + /* if the next LRU or META buffer is invalid or stale, use it */ lrubp = TAILQ_FIRST(&nfsbuffree); - if (lrubp && ((lrubp->nb_timestamp + NFSBUF_LRU_STALE) < now.tv_sec)) + if (lrubp && (!NBUFSTAMPVALID(lrubp) || + ((lrubp->nb_timestamp + NFSBUF_LRU_STALE) < now.tv_sec))) bp = lrubp; metabp = TAILQ_FIRST(&nfsbuffreemeta); - if (!bp && metabp && ((metabp->nb_timestamp + NFSBUF_META_STALE) < now.tv_sec)) + if (!bp && metabp && (!NBUFSTAMPVALID(metabp) || + ((metabp->nb_timestamp + NFSBUF_META_STALE) < now.tv_sec))) bp = metabp; if (!bp && (nfsbufcnt >= nfsbufmax)) { @@ -640,106 +790,116 @@ loop: bp = metabp; } } + } - if (bp) { - /* we have a buffer to reuse */ - FSDBG(544, vp, blkno, bp, bp->nb_flags); - nfs_buf_remfree(bp); - if (ISSET(bp->nb_flags, NB_DELWRI)) - panic("nfs_buf_get: delwri"); - SET(bp->nb_flags, NB_BUSY); - /* disassociate buffer from previous vnode */ - if (bp->nb_vp) { - struct vnode *oldvp; - if (bp->nb_vnbufs.le_next != NFSNOLIST) { - LIST_REMOVE(bp, nb_vnbufs); - bp->nb_vnbufs.le_next = NFSNOLIST; - } - oldvp = bp->nb_vp; - bp->nb_vp = NULL; - HOLDRELE(oldvp); - } - LIST_REMOVE(bp, nb_hash); - /* nuke any creds we're holding */ - cred = bp->nb_rcred; - if (cred != NOCRED) { - bp->nb_rcred = NOCRED; - crfree(cred); - } - cred = bp->nb_wcred; - if (cred != NOCRED) { - bp->nb_wcred = NOCRED; - crfree(cred); + if (bp) { + /* we have a buffer to reuse */ + FSDBG(544, np, blkno, bp, bp->nb_flags); + nfs_buf_remfree(bp); + if (ISSET(bp->nb_flags, NB_DELWRI)) + panic("nfs_buf_get: delwri"); + SET(bp->nb_lflags, NBL_BUSY); + /* disassociate buffer from previous nfsnode */ + if (bp->nb_np) { + if (bp->nb_vnbufs.le_next != NFSNOLIST) { + LIST_REMOVE(bp, nb_vnbufs); + bp->nb_vnbufs.le_next = NFSNOLIST; } - /* if buf will no longer be NB_META, dump old buffer */ - if ((operation != BLK_META) && - ISSET(bp->nb_flags, NB_META) && bp->nb_data) { - FREE(bp->nb_data, M_TEMP); + bp->nb_np = NULL; + } + LIST_REMOVE(bp, nb_hash); + /* nuke any creds we're holding */ + if (IS_VALID_CRED(bp->nb_rcred)) + kauth_cred_unref(&bp->nb_rcred); + if (IS_VALID_CRED(bp->nb_wcred)) + kauth_cred_unref(&bp->nb_wcred); + /* if buf will no longer be NB_META, dump old buffer */ + if (operation == NBLK_META) { + if (!ISSET(bp->nb_flags, NB_META)) + nfsbufmetacnt++; + } else if (ISSET(bp->nb_flags, NB_META)) { + if (bp->nb_data) { + kfree(bp->nb_data, bp->nb_bufsize); bp->nb_data = NULL; } - /* re-init buf fields */ - bp->nb_error = 0; - bp->nb_validoff = bp->nb_validend = -1; - bp->nb_dirtyoff = bp->nb_dirtyend = 0; - bp->nb_valid = 0; - bp->nb_dirty = 0; + nfsbufmetacnt--; } - } - - if (!bp) { - if (nfsbufcnt < nfsbufmax) { + /* re-init buf fields */ + bp->nb_error = 0; + bp->nb_validoff = bp->nb_validend = -1; + bp->nb_dirtyoff = bp->nb_dirtyend = 0; + bp->nb_valid = 0; + bp->nb_dirty = 0; + bp->nb_verf = 0; + } else { + /* no buffer to reuse */ + if ((nfsbufcnt < nfsbufmax) && + ((operation != NBLK_META) || (nfsbufmetacnt < nfsbufmetamax))) { /* just alloc a new one */ MALLOC(bp, struct nfsbuf *, sizeof(struct nfsbuf), M_TEMP, M_WAITOK); + if (!bp) { + lck_mtx_unlock(nfs_buf_mutex); + FSDBG_BOT(541, np, blkno, 0, error); + return (ENOMEM); + } nfsbufcnt++; + + /* + * If any excess bufs, make sure the timer + * is running to free them up later. + */ + if (nfsbufcnt > nfsbufmin && !nfs_buf_timer_on) { + nfs_buf_timer_on = 1; + nfs_interval_timer_start(nfs_buf_timer_call, + NFSBUF_FREE_PERIOD * 1000); + } + + if (operation == NBLK_META) + nfsbufmetacnt++; NFSBUFCNTCHK(); /* init nfsbuf */ bzero(bp, sizeof(*bp)); bp->nb_free.tqe_next = NFSNOLIST; bp->nb_validoff = bp->nb_validend = -1; - FSDBG(545, vp, blkno, bp, 0); + FSDBG(545, np, blkno, bp, 0); } else { /* too many bufs... wait for buffers to free up */ - FSDBG_TOP(546, vp, blkno, nfsbufcnt, nfsbufmax); - /* unlock hash */ - if (nfsbufhashlock < 0) { - nfsbufhashlock = 0; - wakeup(&nfsbufhashlock); - } else - nfsbufhashlock = 0; + FSDBG_TOP(546, np, blkno, nfsbufcnt, nfsbufmax); /* poke the delwri list */ - nfs_buf_delwri_push(); + nfs_buf_delwri_push(1); nfsneedbuffer = 1; - tsleep(&nfsneedbuffer, PCATCH, "nfsbufget", 0); - FSDBG_BOT(546, vp, blkno, nfsbufcnt, nfsbufmax); - if (nfs_sigintr(VFSTONFS(vp->v_mount), NULL, p)) { - FSDBG_BOT(541, vp, blkno, 0, EINTR); - return (NULL); + msleep(&nfsneedbuffer, nfs_buf_mutex, PCATCH|PDROP, "nfsbufget", NULL); + FSDBG_BOT(546, np, blkno, nfsbufcnt, nfsbufmax); + if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0))) { + FSDBG_BOT(541, np, blkno, 0, error); + return (error); } goto loop; } } -setup_nfsbuf: - - /* setup nfsbuf */ - bp->nb_flags = NB_BUSY; + /* set up nfsbuf */ + SET(bp->nb_lflags, NBL_BUSY); + bp->nb_flags = 0; bp->nb_lblkno = blkno; /* insert buf in hash */ - LIST_INSERT_HEAD(NFSBUFHASH(vp, blkno), bp, nb_hash); - /* associate buffer with new vnode */ - VHOLD(vp); - bp->nb_vp = vp; + LIST_INSERT_HEAD(NFSBUFHASH(np, blkno), bp, nb_hash); + /* associate buffer with new nfsnode */ + bp->nb_np = np; LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs); buffer_setup: + /* unlock hash */ + lck_mtx_unlock(nfs_buf_mutex); + switch (operation) { - case BLK_META: + case NBLK_META: SET(bp->nb_flags, NB_META); if ((bp->nb_bufsize != bufsize) && bp->nb_data) { - FREE(bp->nb_data, M_TEMP); + kfree(bp->nb_data, bp->nb_bufsize); bp->nb_data = NULL; bp->nb_validoff = bp->nb_validend = -1; bp->nb_dirtyoff = bp->nb_dirtyend = 0; @@ -748,37 +908,62 @@ buffer_setup: CLR(bp->nb_flags, NB_CACHE); } if (!bp->nb_data) - MALLOC(bp->nb_data, caddr_t, bufsize, M_TEMP, M_WAITOK); - if (!bp->nb_data) - panic("nfs_buf_get: null nb_data"); + bp->nb_data = kalloc(bufsize); + if (!bp->nb_data) { + /* Ack! couldn't allocate the data buffer! */ + /* clean up buffer and return error */ + lck_mtx_lock(nfs_buf_mutex); + LIST_REMOVE(bp, nb_vnbufs); + bp->nb_vnbufs.le_next = NFSNOLIST; + bp->nb_np = NULL; + /* invalidate usage timestamp to allow immediate freeing */ + NBUFSTAMPINVALIDATE(bp); + if (bp->nb_free.tqe_next != NFSNOLIST) + panic("nfsbuf on freelist"); + TAILQ_INSERT_HEAD(&nfsbuffree, bp, nb_free); + nfsbuffreecnt++; + lck_mtx_unlock(nfs_buf_mutex); + FSDBG_BOT(541, np, blkno, 0xb00, ENOMEM); + return (ENOMEM); + } bp->nb_bufsize = bufsize; break; - case BLK_READ: - case BLK_WRITE: + case NBLK_READ: + case NBLK_WRITE: + /* + * Set or clear NB_READ now to let the UPL subsystem know + * if we intend to modify the pages or not. + */ + if (operation == NBLK_READ) { + SET(bp->nb_flags, NB_READ); + } else { + CLR(bp->nb_flags, NB_READ); + } if (bufsize < PAGE_SIZE) bufsize = PAGE_SIZE; bp->nb_bufsize = bufsize; bp->nb_validoff = bp->nb_validend = -1; - if (UBCISVALID(vp)) { - /* setup upl */ + if (UBCINFOEXISTS(vp)) { + /* set up upl */ if (nfs_buf_upl_setup(bp)) { /* unable to create upl */ /* vm object must no longer exist */ - /* cleanup buffer and return NULL */ + /* clean up buffer and return error */ + lck_mtx_lock(nfs_buf_mutex); LIST_REMOVE(bp, nb_vnbufs); bp->nb_vnbufs.le_next = NFSNOLIST; - bp->nb_vp = NULL; - /* clear usage timestamp to allow immediate freeing */ - bp->nb_timestamp = 0; - HOLDRELE(vp); + bp->nb_np = NULL; + /* invalidate usage timestamp to allow immediate freeing */ + NBUFSTAMPINVALIDATE(bp); if (bp->nb_free.tqe_next != NFSNOLIST) panic("nfsbuf on freelist"); TAILQ_INSERT_HEAD(&nfsbuffree, bp, nb_free); nfsbuffreecnt++; - FSDBG_BOT(541, vp, blkno, 0x2bc, EIO); - return (NULL); + lck_mtx_unlock(nfs_buf_mutex); + FSDBG_BOT(541, np, blkno, 0x2bc, EIO); + return (EIO); } nfs_buf_upl_check(bp); } @@ -788,32 +973,30 @@ buffer_setup: panic("nfs_buf_get: %d unknown operation", operation); } - /* unlock hash */ - if (nfsbufhashlock < 0) { - nfsbufhashlock = 0; - wakeup(&nfsbufhashlock); - } else - nfsbufhashlock = 0; + *bpp = bp; - FSDBG_BOT(541, vp, blkno, bp, bp->nb_flags); + FSDBG_BOT(541, np, blkno, bp, bp->nb_flags); - return (bp); + return (0); } void nfs_buf_release(struct nfsbuf *bp, int freeup) { - struct vnode *vp = bp->nb_vp; + nfsnode_t np = bp->nb_np; + vnode_t vp; struct timeval now; + int wakeup_needbuffer, wakeup_buffer, wakeup_nbdwrite; FSDBG_TOP(548, bp, NBOFF(bp), bp->nb_flags, bp->nb_data); FSDBG(548, bp->nb_validoff, bp->nb_validend, bp->nb_dirtyoff, bp->nb_dirtyend); FSDBG(548, bp->nb_valid, 0, bp->nb_dirty, 0); - if (UBCINFOEXISTS(vp) && bp->nb_bufsize) { - int upl_flags; + vp = np ? NFSTOV(np) : NULL; + if (vp && UBCINFOEXISTS(vp) && bp->nb_bufsize) { + int upl_flags, rv; upl_t upl; - int i, rv; + uint32_t i; if (!ISSET(bp->nb_flags, NB_PAGELIST) && !ISSET(bp->nb_flags, NB_INVAL)) { rv = nfs_buf_upl_setup(bp); @@ -830,8 +1013,13 @@ nfs_buf_release(struct nfsbuf *bp, int freeup) panic("ubc_upl_unmap failed"); bp->nb_data = NULL; } - if (bp->nb_flags & (NB_ERROR | NB_INVAL)) { - if (bp->nb_flags & (NB_READ | NB_INVAL)) + /* + * Abort the pages on error or: if this is an invalid or + * non-needcommit nocache buffer AND no pages are dirty. + */ + if (ISSET(bp->nb_flags, NB_ERROR) || (!bp->nb_dirty && (ISSET(bp->nb_flags, NB_INVAL) || + (ISSET(bp->nb_flags, NB_NOCACHE) && !ISSET(bp->nb_flags, (NB_NEEDCOMMIT | NB_DELWRI)))))) { + if (ISSET(bp->nb_flags, (NB_READ | NB_INVAL | NB_NOCACHE))) upl_flags = UPL_ABORT_DUMP_PAGES; else upl_flags = 0; @@ -849,6 +1037,10 @@ nfs_buf_release(struct nfsbuf *bp, int freeup) upl_flags = UPL_COMMIT_SET_DIRTY; else upl_flags = UPL_COMMIT_CLEAR_DIRTY; + + if (!ISSET(bp->nb_flags, (NB_NEEDCOMMIT | NB_DELWRI))) + upl_flags |= UPL_COMMIT_CLEAR_PRECIOUS; + ubc_upl_commit_range(upl, i*PAGE_SIZE, PAGE_SIZE, upl_flags | @@ -857,67 +1049,77 @@ nfs_buf_release(struct nfsbuf *bp, int freeup) } } pagelist_cleanup_done: - /* was this the last buffer in the file? */ - if (NBOFF(bp) + bp->nb_bufsize > VTONFS(vp)->n_size) { - /* if so, invalidate all pages of last buffer past EOF */ - int biosize = vp->v_mount->mnt_stat.f_iosize; - off_t off, size; - off = trunc_page_64(VTONFS(vp)->n_size) + PAGE_SIZE_64; - size = trunc_page_64(NBOFF(bp) + biosize) - off; - if (size) - ubc_invalidate(vp, off, size); + /* invalidate any pages past EOF */ + if (NBOFF(bp) + bp->nb_bufsize > (off_t)(np->n_size)) { + off_t start, end; + start = trunc_page_64(np->n_size) + PAGE_SIZE_64; + end = trunc_page_64(NBOFF(bp) + bp->nb_bufsize); + if (start < NBOFF(bp)) + start = NBOFF(bp); + if (end > start) { + if ((rv = ubc_msync(vp, start, end, NULL, UBC_INVALIDATE))) + printf("nfs_buf_release(): ubc_msync failed!, error %d\n", rv); + } } CLR(bp->nb_flags, NB_PAGELIST); bp->nb_pagelist = NULL; } + lck_mtx_lock(nfs_buf_mutex); + + wakeup_needbuffer = wakeup_buffer = wakeup_nbdwrite = 0; + /* Wake up any processes waiting for any buffer to become free. */ if (nfsneedbuffer) { nfsneedbuffer = 0; - wakeup(&nfsneedbuffer); + wakeup_needbuffer = 1; } /* Wake up any processes waiting for _this_ buffer to become free. */ - if (ISSET(bp->nb_flags, NB_WANTED)) { - CLR(bp->nb_flags, NB_WANTED); - wakeup(bp); + if (ISSET(bp->nb_lflags, NBL_WANTED)) { + CLR(bp->nb_lflags, NBL_WANTED); + wakeup_buffer = 1; } - /* If it's not cacheable, or an error, mark it invalid. */ - if (ISSET(bp->nb_flags, (NB_NOCACHE|NB_ERROR))) + /* If it's non-needcommit nocache, or an error, mark it invalid. */ + if (ISSET(bp->nb_flags, NB_ERROR) || + (ISSET(bp->nb_flags, NB_NOCACHE) && !ISSET(bp->nb_flags, (NB_NEEDCOMMIT | NB_DELWRI)))) SET(bp->nb_flags, NB_INVAL); if ((bp->nb_bufsize <= 0) || ISSET(bp->nb_flags, NB_INVAL)) { - /* If it's invalid or empty, dissociate it from its vnode */ + /* If it's invalid or empty, dissociate it from its nfsnode */ if (bp->nb_vnbufs.le_next != NFSNOLIST) { LIST_REMOVE(bp, nb_vnbufs); bp->nb_vnbufs.le_next = NFSNOLIST; } - bp->nb_vp = NULL; - HOLDRELE(vp); + bp->nb_np = NULL; /* if this was a delayed write, wakeup anyone */ /* waiting for delayed writes to complete */ if (ISSET(bp->nb_flags, NB_DELWRI)) { CLR(bp->nb_flags, NB_DELWRI); nfs_nbdwrite--; NFSBUFCNTCHK(); - wakeup((caddr_t)&nfs_nbdwrite); + wakeup_nbdwrite = 1; } - /* clear usage timestamp to allow immediate freeing */ - bp->nb_timestamp = 0; + /* invalidate usage timestamp to allow immediate freeing */ + NBUFSTAMPINVALIDATE(bp); /* put buffer at head of free list */ if (bp->nb_free.tqe_next != NFSNOLIST) panic("nfsbuf on freelist"); SET(bp->nb_flags, NB_INVAL); - TAILQ_INSERT_HEAD(&nfsbuffree, bp, nb_free); - nfsbuffreecnt++; - if (freeup) - NFS_BUF_FREEUP(); + if (ISSET(bp->nb_flags, NB_META)) { + TAILQ_INSERT_HEAD(&nfsbuffreemeta, bp, nb_free); + nfsbuffreemetacnt++; + } else { + TAILQ_INSERT_HEAD(&nfsbuffree, bp, nb_free); + nfsbuffreecnt++; + } } else if (ISSET(bp->nb_flags, NB_DELWRI)) { /* put buffer at end of delwri list */ if (bp->nb_free.tqe_next != NFSNOLIST) panic("nfsbuf on freelist"); TAILQ_INSERT_TAIL(&nfsbufdelwri, bp, nb_free); nfsbufdelwricnt++; + freeup = 0; } else { /* update usage timestamp */ microuptime(&now); @@ -932,16 +1134,26 @@ pagelist_cleanup_done: TAILQ_INSERT_TAIL(&nfsbuffree, bp, nb_free); nfsbuffreecnt++; } - if (freeup) - NFS_BUF_FREEUP(); } NFSBUFCNTCHK(); /* Unlock the buffer. */ - CLR(bp->nb_flags, (NB_ASYNC | NB_BUSY | NB_NOCACHE | NB_STABLE | NB_IOD)); + CLR(bp->nb_flags, (NB_ASYNC | NB_STABLE)); + CLR(bp->nb_lflags, NBL_BUSY); FSDBG_BOT(548, bp, NBOFF(bp), bp->nb_flags, bp->nb_data); + + lck_mtx_unlock(nfs_buf_mutex); + + if (wakeup_needbuffer) + wakeup(&nfsneedbuffer); + if (wakeup_buffer) + wakeup(bp); + if (wakeup_nbdwrite) + wakeup(&nfs_nbdwrite); + if (freeup) + NFS_BUF_FREEUP(); } /* @@ -953,8 +1165,12 @@ nfs_buf_iowait(struct nfsbuf *bp) { FSDBG_TOP(549, bp, NBOFF(bp), bp->nb_flags, bp->nb_error); + lck_mtx_lock(nfs_buf_mutex); + while (!ISSET(bp->nb_flags, NB_DONE)) - tsleep(bp, PRIBIO + 1, "nfs_buf_iowait", 0); + msleep(bp, nfs_buf_mutex, PRIBIO + 1, "nfs_buf_iowait", NULL); + + lck_mtx_unlock(nfs_buf_mutex); FSDBG_BOT(549, bp, NBOFF(bp), bp->nb_flags, bp->nb_error); @@ -973,36 +1189,31 @@ nfs_buf_iowait(struct nfsbuf *bp) void nfs_buf_iodone(struct nfsbuf *bp) { - struct vnode *vp; FSDBG_TOP(550, bp, NBOFF(bp), bp->nb_flags, bp->nb_error); if (ISSET(bp->nb_flags, NB_DONE)) panic("nfs_buf_iodone already"); - SET(bp->nb_flags, NB_DONE); /* note that it's done */ - /* - * I/O was done, so don't believe - * the DIRTY state from VM anymore - */ - CLR(bp->nb_flags, NB_WASDIRTY); if (!ISSET(bp->nb_flags, NB_READ)) { CLR(bp->nb_flags, NB_WRITEINPROG); - vpwakeup(bp->nb_vp); - } - - /* Wakeup the throttled write operations as needed */ - vp = bp->nb_vp; - if (vp && (vp->v_flag & VTHROTTLED) - && (vp->v_numoutput <= (NFSBUFWRITE_THROTTLE / 3))) { - vp->v_flag &= ~VTHROTTLED; - wakeup((caddr_t)&vp->v_numoutput); + /* + * vnode_writedone() takes care of waking up + * any throttled write operations + */ + vnode_writedone(NFSTOV(bp->nb_np)); + nfs_node_lock_force(bp->nb_np); + bp->nb_np->n_numoutput--; + nfs_node_unlock(bp->nb_np); } - - if (ISSET(bp->nb_flags, NB_ASYNC)) /* if async, release it */ + if (ISSET(bp->nb_flags, NB_ASYNC)) { /* if async, release it */ + SET(bp->nb_flags, NB_DONE); /* note that it's done */ nfs_buf_release(bp, 1); - else { /* or just wakeup the buffer */ - CLR(bp->nb_flags, NB_WANTED); + } else { /* or just wakeup the buffer */ + lck_mtx_lock(nfs_buf_mutex); + SET(bp->nb_flags, NB_DONE); /* note that it's done */ + CLR(bp->nb_lflags, NBL_WANTED); + lck_mtx_unlock(nfs_buf_mutex); wakeup(bp); } @@ -1012,8 +1223,7 @@ nfs_buf_iodone(struct nfsbuf *bp) void nfs_buf_write_delayed(struct nfsbuf *bp) { - struct proc *p = current_proc(); - struct vnode *vp = bp->nb_vp; + nfsnode_t np = bp->nb_np; FSDBG_TOP(551, bp, NBOFF(bp), bp->nb_flags, 0); FSDBG(551, bp, bp->nb_dirtyoff, bp->nb_dirtyend, bp->nb_dirty); @@ -1021,46 +1231,45 @@ nfs_buf_write_delayed(struct nfsbuf *bp) /* * If the block hasn't been seen before: * (1) Mark it as having been seen, - * (2) Charge for the write. - * (3) Make sure it's on its vnode's correct block list, + * (2) Make sure it's on its node's correct block list, */ if (!ISSET(bp->nb_flags, NB_DELWRI)) { SET(bp->nb_flags, NB_DELWRI); - if (p && p->p_stats) - p->p_stats->p_ru.ru_oublock++; /* XXX */ + /* move to dirty list */ + lck_mtx_lock(nfs_buf_mutex); nfs_nbdwrite++; NFSBUFCNTCHK(); - /* move to dirty list */ if (bp->nb_vnbufs.le_next != NFSNOLIST) LIST_REMOVE(bp, nb_vnbufs); - LIST_INSERT_HEAD(&VTONFS(vp)->n_dirtyblkhd, bp, nb_vnbufs); + LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs); + lck_mtx_unlock(nfs_buf_mutex); } /* * If the vnode has "too many" write operations in progress * wait for them to finish the IO */ - while (vp->v_numoutput >= NFSBUFWRITE_THROTTLE) { - vp->v_flag |= VTHROTTLED; - tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "nfs_buf_write_delayed", 0); - } + vnode_waitforwrites(NFSTOV(np), VNODE_ASYNC_THROTTLE, 0, 0, "nfs_buf_write_delayed"); + + /* the file is in a modified state, so make sure the flag's set */ + nfs_node_lock_force(np); + np->n_flag |= NMODIFIED; + nfs_node_unlock(np); /* - * If we have too many delayed write buffers, - * more than we can "safely" handle, just fall back to - * doing the async write + * If we have too many delayed write buffers, + * just fall back to doing the async write. */ if (nfs_nbdwrite < 0) panic("nfs_buf_write_delayed: Negative nfs_nbdwrite"); - - if (nfs_nbdwrite > ((nfsbufcnt/4)*3)) { + if (nfs_nbdwrite > NFS_A_LOT_OF_DELAYED_WRITES) { /* issue async write */ SET(bp->nb_flags, NB_ASYNC); nfs_buf_write(bp); FSDBG_BOT(551, bp, NBOFF(bp), bp->nb_flags, bp->nb_error); return; } - + /* Otherwise, the "write" is done, so mark and release the buffer. */ SET(bp->nb_flags, NB_DONE); nfs_buf_release(bp, 1); @@ -1068,247 +1277,789 @@ nfs_buf_write_delayed(struct nfsbuf *bp) return; } +/* + * Check that a "needcommit" buffer can still be committed. + * If the write verifier has changed, we need to clear the + * the needcommit flag. + */ +void +nfs_buf_check_write_verifier(nfsnode_t np, struct nfsbuf *bp) +{ + struct nfsmount *nmp; + + if (!ISSET(bp->nb_flags, NB_NEEDCOMMIT)) + return; + + nmp = NFSTONMP(np); + if (!nmp) + return; + if (!ISSET(bp->nb_flags, NB_STALEWVERF) && (bp->nb_verf == nmp->nm_verf)) + return; + + /* write verifier changed, clear commit/wverf flags */ + CLR(bp->nb_flags, (NB_NEEDCOMMIT | NB_STALEWVERF)); + bp->nb_verf = 0; + nfs_node_lock_force(np); + np->n_needcommitcnt--; + CHECK_NEEDCOMMITCNT(np); + nfs_node_unlock(np); +} /* - * Vnode op for read using bio - * Any similarity to readip() is purely coincidental + * add a reference to a buffer so it doesn't disappear while being used + * (must be called with nfs_buf_mutex held) */ -int -nfs_bioread(vp, uio, ioflag, cred, getpages) - register struct vnode *vp; - register struct uio *uio; - int ioflag; - struct ucred *cred; - int getpages; // XXX unused! +void +nfs_buf_refget(struct nfsbuf *bp) { - struct nfsnode *np = VTONFS(vp); - int biosize, i; - off_t diff; - struct nfsbuf *bp = 0, *rabp; - struct vattr vattr; - struct proc *p; - struct nfsmount *nmp = VFSTONFS(vp->v_mount); - daddr_t lbn, rabn, lastrabn = -1; - int bufsize; - int nra, error = 0, n = 0, on = 0; - int operation = (getpages? BLK_PAGEIN : BLK_READ); - caddr_t dp; - struct dirent *direntp; - - FSDBG_TOP(514, vp, uio->uio_offset, uio->uio_resid, ioflag); - -#if DIAGNOSTIC - if (uio->uio_rw != UIO_READ) - panic("nfs_read mode"); -#endif - if (uio->uio_resid == 0) { - FSDBG_BOT(514, vp, 0xd1e0001, 0, 0); - return (0); + bp->nb_refs++; +} +/* + * release a reference on a buffer + * (must be called with nfs_buf_mutex held) + */ +void +nfs_buf_refrele(struct nfsbuf *bp) +{ + bp->nb_refs--; +} + +/* + * mark a particular buffer as BUSY + * (must be called with nfs_buf_mutex held) + */ +errno_t +nfs_buf_acquire(struct nfsbuf *bp, int flags, int slpflag, int slptimeo) +{ + errno_t error; + struct timespec ts; + + if (ISSET(bp->nb_lflags, NBL_BUSY)) { + /* + * since the lck_mtx_lock may block, the buffer + * may become BUSY, so we need to recheck for + * a NOWAIT request + */ + if (flags & NBAC_NOWAIT) + return (EBUSY); + SET(bp->nb_lflags, NBL_WANTED); + + ts.tv_sec = (slptimeo/100); + /* the hz value is 100; which leads to 10ms */ + ts.tv_nsec = (slptimeo % 100) * 10 * NSEC_PER_USEC * 1000; + + error = msleep(bp, nfs_buf_mutex, slpflag | (PRIBIO + 1), + "nfs_buf_acquire", &ts); + if (error) + return (error); + return (EAGAIN); } - if (uio->uio_offset < 0) { - FSDBG_BOT(514, vp, 0xd1e0002, 0, EINVAL); - return (EINVAL); + if (flags & NBAC_REMOVE) + nfs_buf_remfree(bp); + SET(bp->nb_lflags, NBL_BUSY); + + return (0); +} + +/* + * simply drop the BUSY status of a buffer + * (must be called with nfs_buf_mutex held) + */ +void +nfs_buf_drop(struct nfsbuf *bp) +{ + int need_wakeup = 0; + + if (!ISSET(bp->nb_lflags, NBL_BUSY)) + panic("nfs_buf_drop: buffer not busy!"); + if (ISSET(bp->nb_lflags, NBL_WANTED)) { + /* delay the actual wakeup until after we clear NBL_BUSY */ + need_wakeup = 1; } - p = uio->uio_procp; - if ((nmp->nm_flag & NFSMNT_NFSV3) && - !(nmp->nm_state & NFSSTA_GOTFSINFO)) - (void)nfs_fsinfo(nmp, vp, cred, p); - biosize = vp->v_mount->mnt_stat.f_iosize; - /* - * For nfs, cache consistency can only be maintained approximately. - * Although RFC1094 does not specify the criteria, the following is - * believed to be compatible with the reference port. - * For nqnfs, full cache consistency is maintained within the loop. - * For nfs: - * If the file's modify time on the server has changed since the - * last read rpc or you have written to the file, - * you may have lost data cache consistency with the - * server, so flush all of the file's data out of the cache. - * Then force a getattr rpc to ensure that you have up to date - * attributes. - * NB: This implies that cache data can be read when up to - * NFS_MAXATTRTIMEO seconds out of date. If you find that you need - * current attributes this could be forced by setting n_xid to 0 - * before the VOP_GETATTR() call. - */ - if ((nmp->nm_flag & NFSMNT_NQNFS) == 0) { - if (np->n_flag & NMODIFIED) { - if (vp->v_type != VREG) { - if (vp->v_type != VDIR) - panic("nfs: bioread, not dir"); - nfs_invaldir(vp); - error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); - if (error) { - FSDBG_BOT(514, vp, 0xd1e0003, 0, error); - return (error); - } - } - np->n_xid = 0; - error = VOP_GETATTR(vp, &vattr, cred, p); - if (error) { - FSDBG_BOT(514, vp, 0xd1e0004, 0, error); - return (error); - } - if (vp->v_type == VDIR) { - /* if directory changed, purge any name cache entries */ - if (np->n_ncmtime != vattr.va_mtime.tv_sec) - cache_purge(vp); - np->n_ncmtime = vattr.va_mtime.tv_sec; + /* Unlock the buffer. */ + CLR(bp->nb_lflags, (NBL_BUSY | NBL_WANTED)); + + if (need_wakeup) + wakeup(bp); +} + +/* + * prepare for iterating over an nfsnode's buffer list + * this lock protects the queue manipulation + * (must be called with nfs_buf_mutex held) + */ +int +nfs_buf_iterprepare(nfsnode_t np, struct nfsbuflists *iterheadp, int flags) +{ + struct nfsbuflists *listheadp; + + if (flags & NBI_DIRTY) + listheadp = &np->n_dirtyblkhd; + else + listheadp = &np->n_cleanblkhd; + + if ((flags & NBI_NOWAIT) && (np->n_bufiterflags & NBI_ITER)) { + LIST_INIT(iterheadp); + return(EWOULDBLOCK); + } + + while (np->n_bufiterflags & NBI_ITER) { + np->n_bufiterflags |= NBI_ITERWANT; + msleep(&np->n_bufiterflags, nfs_buf_mutex, 0, "nfs_buf_iterprepare", NULL); + } + if (LIST_EMPTY(listheadp)) { + LIST_INIT(iterheadp); + return(EINVAL); + } + np->n_bufiterflags |= NBI_ITER; + + iterheadp->lh_first = listheadp->lh_first; + listheadp->lh_first->nb_vnbufs.le_prev = &iterheadp->lh_first; + LIST_INIT(listheadp); + + return(0); +} + +/* + * clean up after iterating over an nfsnode's buffer list + * this lock protects the queue manipulation + * (must be called with nfs_buf_mutex held) + */ +void +nfs_buf_itercomplete(nfsnode_t np, struct nfsbuflists *iterheadp, int flags) +{ + struct nfsbuflists * listheadp; + struct nfsbuf *bp; + + if (flags & NBI_DIRTY) + listheadp = &np->n_dirtyblkhd; + else + listheadp = &np->n_cleanblkhd; + + while (!LIST_EMPTY(iterheadp)) { + bp = LIST_FIRST(iterheadp); + LIST_REMOVE(bp, nb_vnbufs); + LIST_INSERT_HEAD(listheadp, bp, nb_vnbufs); + } + + np->n_bufiterflags &= ~NBI_ITER; + if (np->n_bufiterflags & NBI_ITERWANT) { + np->n_bufiterflags &= ~NBI_ITERWANT; + wakeup(&np->n_bufiterflags); + } +} + + +/* + * Read an NFS buffer for a file. + */ +int +nfs_buf_read(struct nfsbuf *bp) +{ + int error = 0; + nfsnode_t np; + thread_t thd; + kauth_cred_t cred; + + np = bp->nb_np; + cred = bp->nb_rcred; + if (IS_VALID_CRED(cred)) + kauth_cred_ref(cred); + thd = ISSET(bp->nb_flags, NB_ASYNC) ? NULL : current_thread(); + + /* sanity checks */ + if (!ISSET(bp->nb_flags, NB_READ)) + panic("nfs_buf_read: !NB_READ"); + if (ISSET(bp->nb_flags, NB_DONE)) + CLR(bp->nb_flags, NB_DONE); + + NFS_BUF_MAP(bp); + + OSAddAtomic(1, &nfsstats.read_bios); + + error = nfs_buf_read_rpc(bp, thd, cred); + /* + * For async I/O, the callbacks will finish up the + * read. Otherwise, the read has already been finished. + */ + + if (IS_VALID_CRED(cred)) + kauth_cred_unref(&cred); + return (error); +} + +/* + * finish the reading of a buffer + */ +void +nfs_buf_read_finish(struct nfsbuf *bp) +{ + nfsnode_t np = bp->nb_np; + struct nfsmount *nmp; + + if (!ISSET(bp->nb_flags, NB_ERROR)) { + /* update valid range */ + bp->nb_validoff = 0; + bp->nb_validend = bp->nb_endio; + if (bp->nb_endio < (int)bp->nb_bufsize) { + /* + * The read may be short because we have unflushed writes + * that are extending the file size and the reads hit the + * (old) EOF on the server. So, just make sure nb_validend + * correctly tracks EOF. + * Note that the missing data should have already been zeroed + * in nfs_buf_read_rpc_finish(). + */ + off_t boff = NBOFF(bp); + if ((off_t)np->n_size >= (boff + bp->nb_bufsize)) + bp->nb_validend = bp->nb_bufsize; + else if ((off_t)np->n_size >= boff) + bp->nb_validend = np->n_size - boff; + else + bp->nb_validend = 0; + } + if ((nmp = NFSTONMP(np)) && (nmp->nm_vers == NFS_VER2) && + ((NBOFF(bp) + bp->nb_validend) > 0x100000000LL)) + bp->nb_validend = 0x100000000LL - NBOFF(bp); + bp->nb_valid = (1 << (round_page_32(bp->nb_validend) / PAGE_SIZE)) - 1; + if (bp->nb_validend & PAGE_MASK) { + /* zero-fill remainder of last page */ + bzero(bp->nb_data + bp->nb_validend, PAGE_SIZE - (bp->nb_validend & PAGE_MASK)); + } + } + nfs_buf_iodone(bp); +} + +/* + * initiate the NFS READ RPC(s) for a buffer + */ +int +nfs_buf_read_rpc(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred) +{ + struct nfsmount *nmp; + nfsnode_t np = bp->nb_np; + int error = 0, nfsvers, async; + int offset, nrpcs; + uint32_t nmrsize, length, len; + off_t boff; + struct nfsreq *req; + struct nfsreq_cbinfo cb; + + nmp = NFSTONMP(np); + if (!nmp) { + bp->nb_error = error = ENXIO; + SET(bp->nb_flags, NB_ERROR); + nfs_buf_iodone(bp); + return (error); + } + nfsvers = nmp->nm_vers; + nmrsize = nmp->nm_rsize; + + boff = NBOFF(bp); + offset = 0; + length = bp->nb_bufsize; + + if (nfsvers == NFS_VER2) { + if (boff > 0xffffffffLL) { + bp->nb_error = error = EFBIG; + SET(bp->nb_flags, NB_ERROR); + nfs_buf_iodone(bp); + return (error); + } + if ((boff + length - 1) > 0xffffffffLL) + length = 0x100000000LL - boff; + } + + /* Note: Can only do async I/O if nfsiods are configured. */ + async = (bp->nb_flags & NB_ASYNC); + cb.rcb_func = async ? nfs_buf_read_rpc_finish : NULL; + cb.rcb_bp = bp; + + bp->nb_offio = bp->nb_endio = 0; + bp->nb_rpcs = nrpcs = (length + nmrsize - 1) / nmrsize; + if (async && (nrpcs > 1)) { + SET(bp->nb_flags, NB_MULTASYNCRPC); + } else { + CLR(bp->nb_flags, NB_MULTASYNCRPC); + } + + while (length > 0) { + if (ISSET(bp->nb_flags, NB_ERROR)) { + error = bp->nb_error; + break; + } + len = (length > nmrsize) ? nmrsize : length; + cb.rcb_args[0] = offset; + cb.rcb_args[1] = len; + if (nmp->nm_vers >= NFS_VER4) + cb.rcb_args[2] = nmp->nm_stategenid; + req = NULL; + error = nmp->nm_funcs->nf_read_rpc_async(np, boff + offset, len, thd, cred, &cb, &req); + if (error) + break; + offset += len; + length -= len; + if (async) + continue; + nfs_buf_read_rpc_finish(req); + if (ISSET(bp->nb_flags, NB_ERROR)) { + error = bp->nb_error; + break; + } + } + + if (length > 0) { + /* + * Something bad happened while trying to send the RPC(s). + * Wait for any outstanding requests to complete. + */ + bp->nb_error = error; + SET(bp->nb_flags, NB_ERROR); + if (ISSET(bp->nb_flags, NB_MULTASYNCRPC)) { + nrpcs = (length + nmrsize - 1) / nmrsize; + lck_mtx_lock(nfs_buf_mutex); + bp->nb_rpcs -= nrpcs; + if (bp->nb_rpcs == 0) { + /* No RPCs left, so the buffer's done */ + lck_mtx_unlock(nfs_buf_mutex); + nfs_buf_iodone(bp); + } else { + /* wait for the last RPC to mark it done */ + while (bp->nb_rpcs > 0) + msleep(&bp->nb_rpcs, nfs_buf_mutex, 0, + "nfs_buf_read_rpc_cancel", NULL); + lck_mtx_unlock(nfs_buf_mutex); } - np->n_mtime = vattr.va_mtime.tv_sec; } else { - error = VOP_GETATTR(vp, &vattr, cred, p); - if (error) { - FSDBG_BOT(514, vp, 0xd1e0005, 0, error); - return (error); - } - if (np->n_mtime != vattr.va_mtime.tv_sec) { - if (vp->v_type == VDIR) { - nfs_invaldir(vp); - /* purge name cache entries */ - if (np->n_ncmtime != vattr.va_mtime.tv_sec) - cache_purge(vp); - } - error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); - if (error) { - FSDBG_BOT(514, vp, 0xd1e0006, 0, error); - return (error); + nfs_buf_iodone(bp); + } + } + + return (error); +} + +/* + * finish up an NFS READ RPC on a buffer + */ +void +nfs_buf_read_rpc_finish(struct nfsreq *req) +{ + struct nfsmount *nmp; + size_t rlen; + struct nfsreq_cbinfo cb; + struct nfsbuf *bp; + int error = 0, nfsvers, offset, length, eof = 0, multasyncrpc, finished; + void *wakeme = NULL; + struct nfsreq *rreq = NULL; + nfsnode_t np; + thread_t thd; + kauth_cred_t cred; + uio_t auio; + char uio_buf [ UIO_SIZEOF(1) ]; + +finish: + np = req->r_np; + thd = req->r_thread; + cred = req->r_cred; + if (IS_VALID_CRED(cred)) + kauth_cred_ref(cred); + cb = req->r_callback; + bp = cb.rcb_bp; + if (cb.rcb_func) /* take an extra reference on the nfsreq in case we want to resend it later due to grace error */ + nfs_request_ref(req, 0); + + nmp = NFSTONMP(np); + if (!nmp) { + SET(bp->nb_flags, NB_ERROR); + bp->nb_error = error = ENXIO; + } + if (error || ISSET(bp->nb_flags, NB_ERROR)) { + /* just drop it */ + nfs_request_async_cancel(req); + goto out; + } + + nfsvers = nmp->nm_vers; + offset = cb.rcb_args[0]; + rlen = length = cb.rcb_args[1]; + + auio = uio_createwithbuffer(1, NBOFF(bp) + offset, UIO_SYSSPACE, + UIO_READ, &uio_buf, sizeof(uio_buf)); + uio_addiov(auio, CAST_USER_ADDR_T(bp->nb_data + offset), length); + + /* finish the RPC */ + error = nmp->nm_funcs->nf_read_rpc_async_finish(np, req, auio, &rlen, &eof); + if ((error == EINPROGRESS) && cb.rcb_func) { + /* async request restarted */ + if (cb.rcb_func) + nfs_request_rele(req); + if (IS_VALID_CRED(cred)) + kauth_cred_unref(&cred); + return; + } + if ((nmp->nm_vers >= NFS_VER4) && nfs_mount_state_error_should_restart(error) && !ISSET(bp->nb_flags, NB_ERROR)) { + lck_mtx_lock(&nmp->nm_lock); + if ((error != NFSERR_OLD_STATEID) && (error != NFSERR_GRACE) && (cb.rcb_args[2] == nmp->nm_stategenid)) { + NP(np, "nfs_buf_read_rpc_finish: error %d @ 0x%llx, 0x%x 0x%x, initiating recovery", + error, NBOFF(bp)+offset, cb.rcb_args[2], nmp->nm_stategenid); + nfs_need_recover(nmp, error); + } + lck_mtx_unlock(&nmp->nm_lock); + if (np->n_flag & NREVOKE) { + error = EIO; + } else { + if (error == NFSERR_GRACE) { + if (cb.rcb_func) { + /* + * For an async I/O request, handle a grace delay just like + * jukebox errors. Set the resend time and queue it up. + */ + struct timeval now; + if (req->r_nmrep.nmc_mhead) { + mbuf_freem(req->r_nmrep.nmc_mhead); + req->r_nmrep.nmc_mhead = NULL; + } + req->r_error = 0; + microuptime(&now); + lck_mtx_lock(&req->r_mtx); + req->r_resendtime = now.tv_sec + 2; + req->r_xid = 0; // get a new XID + req->r_flags |= R_RESTART; + req->r_start = 0; + nfs_asyncio_resend(req); + lck_mtx_unlock(&req->r_mtx); + if (IS_VALID_CRED(cred)) + kauth_cred_unref(&cred); + /* Note: nfsreq reference taken will be dropped later when finished */ + return; } - if (vp->v_type == VDIR) - np->n_ncmtime = vattr.va_mtime.tv_sec; - np->n_mtime = vattr.va_mtime.tv_sec; + /* otherwise, just pause a couple seconds and retry */ + tsleep(&nmp->nm_state, (PZERO-1), "nfsgrace", 2*hz); + } + if (!(error = nfs_mount_state_wait_for_recovery(nmp))) { + rlen = 0; + goto readagain; } } } - do { + if (error) { + SET(bp->nb_flags, NB_ERROR); + bp->nb_error = error; + goto out; + } - /* - * Get a valid lease. If cached data is stale, flush it. - */ - if (nmp->nm_flag & NFSMNT_NQNFS) { - if (NQNFS_CKINVALID(vp, np, ND_READ)) { - do { - error = nqnfs_getlease(vp, ND_READ, cred, p); - } while (error == NQNFS_EXPIRED); - if (error) { - FSDBG_BOT(514, vp, 0xd1e0007, 0, error); - return (error); - } - if (np->n_lrev != np->n_brev || - (np->n_flag & NQNFSNONCACHE) || - ((np->n_flag & NMODIFIED) && vp->v_type == VDIR)) { - if (vp->v_type == VDIR) - nfs_invaldir(vp); - error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); - if (error) { - FSDBG_BOT(514, vp, 0xd1e0008, 0, error); - return (error); + if ((rlen > 0) && (bp->nb_endio < (offset + (int)rlen))) + bp->nb_endio = offset + rlen; + + if ((nfsvers == NFS_VER2) || eof || (rlen == 0)) { + /* zero out the remaining data (up to EOF) */ + off_t rpcrem, eofrem, rem; + rpcrem = (length - rlen); + eofrem = np->n_size - (NBOFF(bp) + offset + rlen); + rem = (rpcrem < eofrem) ? rpcrem : eofrem; + if (rem > 0) + bzero(bp->nb_data + offset + rlen, rem); + } else if (((int)rlen < length) && !ISSET(bp->nb_flags, NB_ERROR)) { + /* + * short read + * + * We haven't hit EOF and we didn't get all the data + * requested, so we need to issue another read for the rest. + * (Don't bother if the buffer already hit an error.) + */ +readagain: + offset += rlen; + length -= rlen; + cb.rcb_args[0] = offset; + cb.rcb_args[1] = length; + if (nmp->nm_vers >= NFS_VER4) + cb.rcb_args[2] = nmp->nm_stategenid; + error = nmp->nm_funcs->nf_read_rpc_async(np, NBOFF(bp) + offset, length, thd, cred, &cb, &rreq); + if (!error) { + if (IS_VALID_CRED(cred)) + kauth_cred_unref(&cred); + if (!cb.rcb_func) { + /* if !async we'll need to wait for this RPC to finish */ + req = rreq; + rreq = NULL; + goto finish; } - np->n_brev = np->n_lrev; - } - } else if (vp->v_type == VDIR && (np->n_flag & NMODIFIED)) { - nfs_invaldir(vp); - error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); - if (error) { - FSDBG_BOT(514, vp, 0xd1e0009, 0, error); - return (error); - } + nfs_request_rele(req); + /* + * We're done here. + * Outstanding RPC count is unchanged. + * Callback will be called when RPC is done. + */ + return; } - } - if ((np->n_flag & NQNFSNONCACHE) || (vp->v_flag & VNOCACHE_DATA)) { - if ((vp->v_flag & VNOCACHE_DATA) && - (np->n_dirtyblkhd.lh_first || np->n_cleanblkhd.lh_first)) { - error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); - if (error) { - FSDBG_BOT(514, vp, 0xd1e000a, 0, error); - return (error); + SET(bp->nb_flags, NB_ERROR); + bp->nb_error = error; + } + +out: + if (cb.rcb_func) + nfs_request_rele(req); + if (IS_VALID_CRED(cred)) + kauth_cred_unref(&cred); + + /* + * Decrement outstanding RPC count on buffer + * and call nfs_buf_read_finish on last RPC. + * + * (Note: when there are multiple async RPCs issued for a + * buffer we need nfs_buffer_mutex to avoid problems when + * aborting a partially-initiated set of RPCs) + */ + + multasyncrpc = ISSET(bp->nb_flags, NB_MULTASYNCRPC); + if (multasyncrpc) + lck_mtx_lock(nfs_buf_mutex); + + bp->nb_rpcs--; + finished = (bp->nb_rpcs == 0); + + if (multasyncrpc) + lck_mtx_unlock(nfs_buf_mutex); + + if (finished) { + if (multasyncrpc) + wakeme = &bp->nb_rpcs; + nfs_buf_read_finish(bp); + if (wakeme) + wakeup(wakeme); + } +} + +/* + * Do buffer readahead. + * Initiate async I/O to read buffers not in cache. + */ +int +nfs_buf_readahead(nfsnode_t np, int ioflag, daddr64_t *rabnp, daddr64_t lastrabn, thread_t thd, kauth_cred_t cred) +{ + struct nfsmount *nmp = NFSTONMP(np); + struct nfsbuf *bp; + int error = 0; + uint32_t nra; + + if (!nmp) + return (ENXIO); + if (nmp->nm_readahead <= 0) + return (0); + if (*rabnp > lastrabn) + return (0); + + for (nra = 0; (nra < nmp->nm_readahead) && (*rabnp <= lastrabn); nra++, *rabnp = *rabnp + 1) { + /* check if block exists and is valid. */ + if ((*rabnp * nmp->nm_biosize) >= (off_t)np->n_size) { + /* stop reading ahead if we're beyond EOF */ + *rabnp = lastrabn; + break; + } + error = nfs_buf_get(np, *rabnp, nmp->nm_biosize, thd, NBLK_READ|NBLK_NOWAIT, &bp); + if (error) + break; + nfs_node_lock_force(np); + np->n_lastrahead = *rabnp; + nfs_node_unlock(np); + if (!bp) + continue; + if ((ioflag & IO_NOCACHE) && ISSET(bp->nb_flags, NB_CACHE) && + !bp->nb_dirty && !ISSET(bp->nb_flags, (NB_DELWRI|NB_NCRDAHEAD))) { + CLR(bp->nb_flags, NB_CACHE); + bp->nb_valid = 0; + bp->nb_validoff = bp->nb_validend = -1; + } + if ((bp->nb_dirtyend <= 0) && !bp->nb_dirty && + !ISSET(bp->nb_flags, (NB_CACHE|NB_DELWRI))) { + SET(bp->nb_flags, (NB_READ|NB_ASYNC)); + if (ioflag & IO_NOCACHE) + SET(bp->nb_flags, NB_NCRDAHEAD); + if (!IS_VALID_CRED(bp->nb_rcred) && IS_VALID_CRED(cred)) { + kauth_cred_ref(cred); + bp->nb_rcred = cred; } + if ((error = nfs_buf_read(bp))) + break; + continue; } - switch (vp->v_type) { - case VREG: - error = nfs_readrpc(vp, uio, cred); - FSDBG_BOT(514, vp, uio->uio_offset, uio->uio_resid, error); - return (error); - case VLNK: - error = nfs_readlinkrpc(vp, uio, cred); - FSDBG_BOT(514, vp, uio->uio_offset, uio->uio_resid, error); + nfs_buf_release(bp, 1); + } + return (error); +} + +/* + * NFS buffer I/O for reading files. + */ +int +nfs_bioread(nfsnode_t np, uio_t uio, int ioflag, vfs_context_t ctx) +{ + vnode_t vp = NFSTOV(np); + struct nfsbuf *bp = NULL; + struct nfsmount *nmp = VTONMP(vp); + daddr64_t lbn, rabn = 0, lastrabn, maxrabn = -1; + off_t diff; + int error = 0, n = 0, on = 0; + int nfsvers, biosize, modified, readaheads = 0; + thread_t thd; + kauth_cred_t cred; + int64_t io_resid; + + FSDBG_TOP(514, np, uio_offset(uio), uio_resid(uio), ioflag); + + nfsvers = nmp->nm_vers; + biosize = nmp->nm_biosize; + thd = vfs_context_thread(ctx); + cred = vfs_context_ucred(ctx); + + if (vnode_vtype(vp) != VREG) { + printf("nfs_bioread: type %x unexpected\n", vnode_vtype(vp)); + FSDBG_BOT(514, np, 0xd1e0016, 0, EINVAL); + return (EINVAL); + } + + /* + * For NFS, cache consistency can only be maintained approximately. + * Although RFC1094 does not specify the criteria, the following is + * believed to be compatible with the reference port. + * + * If the file has changed since the last read RPC or you have + * written to the file, you may have lost data cache consistency + * with the server. So, check for a change, and flush all of the + * file's data out of the cache. + * NB: This implies that cache data can be read when up to + * NFS_MAXATTRTIMO seconds out of date. If you find that you + * need current attributes, nfs_getattr() can be forced to fetch + * new attributes (via NATTRINVALIDATE() or NGA_UNCACHED). + */ + + if (ISSET(np->n_flag, NUPDATESIZE)) + nfs_data_update_size(np, 0); + + if ((error = nfs_node_lock(np))) { + FSDBG_BOT(514, np, 0xd1e0222, 0, error); + return (error); + } + + if (np->n_flag & NNEEDINVALIDATE) { + np->n_flag &= ~NNEEDINVALIDATE; + nfs_node_unlock(np); + error = nfs_vinvalbuf(vp, V_SAVE|V_IGNORE_WRITEERR, ctx, 1); + if (!error) + error = nfs_node_lock(np); + if (error) { + FSDBG_BOT(514, np, 0xd1e0322, 0, error); return (error); - case VDIR: - break; - default: - printf(" NQNFSNONCACHE: type %x unexpected\n", vp->v_type); - }; - } - switch (vp->v_type) { - case VREG: - lbn = uio->uio_offset / biosize; + } + } + + modified = (np->n_flag & NMODIFIED); + nfs_node_unlock(np); + /* nfs_getattr() will check changed and purge caches */ + error = nfs_getattr(np, NULL, ctx, modified ? NGA_UNCACHED : NGA_CACHED); + if (error) { + FSDBG_BOT(514, np, 0xd1e0004, 0, error); + return (error); + } + + if (uio_resid(uio) == 0) { + FSDBG_BOT(514, np, 0xd1e0001, 0, 0); + return (0); + } + if (uio_offset(uio) < 0) { + FSDBG_BOT(514, np, 0xd1e0002, 0, EINVAL); + return (EINVAL); + } + + /* + * set up readahead - which may be limited by: + * + current request length (for IO_NOCACHE) + * + readahead setting + * + file size + */ + if (nmp->nm_readahead > 0) { + off_t end = uio_offset(uio) + uio_resid(uio); + if (end > (off_t)np->n_size) + end = np->n_size; + rabn = uio_offset(uio) / biosize; + maxrabn = (end - 1) / biosize; + nfs_node_lock_force(np); + if (!(ioflag & IO_NOCACHE) && + (!rabn || (rabn == np->n_lastread) || (rabn == (np->n_lastread+1)))) { + maxrabn += nmp->nm_readahead; + if ((maxrabn * biosize) >= (off_t)np->n_size) + maxrabn = ((off_t)np->n_size - 1)/biosize; + } + if (maxrabn < np->n_lastrahead) + np->n_lastrahead = -1; + if (rabn < np->n_lastrahead) + rabn = np->n_lastrahead + 1; + nfs_node_unlock(np); + } else { + rabn = maxrabn = 0; + } + + do { + + nfs_data_lock(np, NFS_DATA_LOCK_SHARED); + lbn = uio_offset(uio) / biosize; /* * Copy directly from any cached pages without grabbing the bufs. + * (If we are NOCACHE and we've issued readahead requests, we need + * to grab the NB_NCRDAHEAD bufs to drop them.) */ - if (uio->uio_segflg == UIO_USERSPACE) { - int io_resid = uio->uio_resid; - diff = np->n_size - uio->uio_offset; + if ((!(ioflag & IO_NOCACHE) || !readaheads) && + ((uio->uio_segflg == UIO_USERSPACE32 || + uio->uio_segflg == UIO_USERSPACE64 || + uio->uio_segflg == UIO_USERSPACE))) { + io_resid = uio_resid(uio); + diff = np->n_size - uio_offset(uio); if (diff < io_resid) io_resid = diff; if (io_resid > 0) { - error = cluster_copy_ubc_data(vp, uio, &io_resid, 0); + int count = (io_resid > INT_MAX) ? INT_MAX : io_resid; + error = cluster_copy_ubc_data(vp, uio, &count, 0); if (error) { - FSDBG_BOT(514, vp, uio->uio_offset, 0xcacefeed, error); + nfs_data_unlock(np); + FSDBG_BOT(514, np, uio_offset(uio), 0xcacefeed, error); return (error); } } /* count any biocache reads that we just copied directly */ - if (lbn != uio->uio_offset / biosize) { - nfsstats.biocache_reads += (uio->uio_offset / biosize) - lbn; - FSDBG(514, vp, 0xcacefeed, uio->uio_offset, error); + if (lbn != (uio_offset(uio)/biosize)) { + OSAddAtomic((uio_offset(uio)/biosize) - lbn, &nfsstats.biocache_reads); + FSDBG(514, np, 0xcacefeed, uio_offset(uio), error); } } - lbn = uio->uio_offset / biosize; - on = uio->uio_offset % biosize; + lbn = uio_offset(uio) / biosize; + on = uio_offset(uio) % biosize; + nfs_node_lock_force(np); + np->n_lastread = (uio_offset(uio) - 1) / biosize; + nfs_node_unlock(np); - /* - * Start the read ahead(s), as required. - */ - if (nfs_numasync > 0 && nmp->nm_readahead > 0) { - for (nra = 0; nra < nmp->nm_readahead; nra++) { - rabn = lbn + 1 + nra; - if (rabn <= lastrabn) { - /* we've already (tried to) read this block */ - /* no need to try it again... */ - continue; - } - lastrabn = rabn; - if ((off_t)rabn * biosize >= np->n_size) - break; - /* check if block exists and is valid. */ - rabp = nfs_buf_incore(vp, rabn); - if (rabp && nfs_buf_upl_valid_range(rabp, 0, rabp->nb_bufsize)) - continue; - rabp = nfs_buf_get(vp, rabn, biosize, p, operation); - if (!rabp) { - FSDBG_BOT(514, vp, 0xd1e000b, 0, EINTR); - return (EINTR); - } - if (!ISSET(rabp->nb_flags, (NB_CACHE|NB_DELWRI))) { - SET(rabp->nb_flags, (NB_READ|NB_ASYNC)); - if (nfs_asyncio(rabp, cred)) { - SET(rabp->nb_flags, (NB_INVAL|NB_ERROR)); - rabp->nb_error = EIO; - nfs_buf_release(rabp, 1); - } - } else - nfs_buf_release(rabp, 1); - } + if ((uio_resid(uio) <= 0) || (uio_offset(uio) >= (off_t)np->n_size)) { + nfs_data_unlock(np); + FSDBG_BOT(514, np, uio_offset(uio), uio_resid(uio), 0xaaaaaaaa); + return (0); } - if ((uio->uio_resid <= 0) || (uio->uio_offset >= np->n_size)) { - FSDBG_BOT(514, vp, uio->uio_offset, uio->uio_resid, 0xaaaaaaaa); - return (0); + /* adjust readahead block number, if necessary */ + if (rabn < lbn) + rabn = lbn; + lastrabn = MIN(maxrabn, lbn + nmp->nm_readahead); + if (rabn <= lastrabn) { /* start readaheads */ + error = nfs_buf_readahead(np, ioflag, &rabn, lastrabn, thd, cred); + if (error) { + nfs_data_unlock(np); + FSDBG_BOT(514, np, 0xd1e000b, 1, error); + return (error); + } + readaheads = 1; } - nfsstats.biocache_reads++; + OSAddAtomic(1, &nfsstats.biocache_reads); /* * If the block is in the cache and has the required data @@ -1317,16 +2068,35 @@ nfs_bioread(vp, uio, ioflag, cred, getpages) * as required. */ again: - bufsize = biosize; - n = min((unsigned)(bufsize - on), uio->uio_resid); - diff = np->n_size - uio->uio_offset; + io_resid = uio_resid(uio); + n = (io_resid > (biosize - on)) ? (biosize - on) : io_resid; + diff = np->n_size - uio_offset(uio); if (diff < n) n = diff; - bp = nfs_buf_get(vp, lbn, bufsize, p, operation); - if (!bp) { - FSDBG_BOT(514, vp, 0xd1e000c, 0, EINTR); - return (EINTR); + error = nfs_buf_get(np, lbn, biosize, thd, NBLK_READ, &bp); + if (error) { + nfs_data_unlock(np); + FSDBG_BOT(514, np, 0xd1e000c, 0, error); + return (error); + } + + if ((ioflag & IO_NOCACHE) && ISSET(bp->nb_flags, NB_CACHE)) { + /* + * IO_NOCACHE found a cached buffer. + * Flush the buffer if it's dirty. + * Invalidate the data if it wasn't just read + * in as part of a "nocache readahead". + */ + if (bp->nb_dirty || (bp->nb_dirtyend > 0)) { + /* so write the buffer out and try again */ + SET(bp->nb_flags, NB_NOCACHE); + goto flushbuffer; + } + if (ISSET(bp->nb_flags, NB_NCRDAHEAD)) { + CLR(bp->nb_flags, NB_NCRDAHEAD); + SET(bp->nb_flags, NB_NOCACHE); + } } /* if any pages are valid... */ @@ -1350,64 +2120,69 @@ again: if (bp->nb_validoff < 0) { /* valid range isn't set up, so */ /* set it to what we know is valid */ - bp->nb_validoff = trunc_page_32(on); - bp->nb_validend = round_page_32(on+n); + bp->nb_validoff = trunc_page(on); + bp->nb_validend = round_page(on+n); nfs_buf_normalize_valid_range(np, bp); } goto buffer_ready; } /* there are invalid pages in the read range */ - if ((dirtypg > firstpg) && (dirtypg < lastpg)) { - /* there are also dirty page(s) in the range, */ + if (((dirtypg > firstpg) && (dirtypg < lastpg)) || + (((firstpg*PAGE_SIZE) < bp->nb_dirtyend) && (((lastpg+1)*PAGE_SIZE) > bp->nb_dirtyoff))) { + /* there are also dirty page(s) (or range) in the read range, */ /* so write the buffer out and try again */ +flushbuffer: CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL)); SET(bp->nb_flags, NB_ASYNC); - /* - * NFS has embedded ucred so crhold() risks zone corruption - */ - if (bp->nb_wcred == NOCRED) - bp->nb_wcred = crdup(cred); + if (!IS_VALID_CRED(bp->nb_wcred)) { + kauth_cred_ref(cred); + bp->nb_wcred = cred; + } error = nfs_buf_write(bp); if (error) { - FSDBG_BOT(514, vp, 0xd1e000d, 0, error); + nfs_data_unlock(np); + FSDBG_BOT(514, np, 0xd1e000d, 0, error); return (error); } goto again; } if (!bp->nb_dirty && bp->nb_dirtyend <= 0 && - (lastpg - firstpg + 1) > (bufsize/PAGE_SIZE)/2) { + (lastpg - firstpg + 1) > (biosize/PAGE_SIZE)/2) { /* we need to read in more than half the buffer and the */ /* buffer's not dirty, so just fetch the whole buffer */ bp->nb_valid = 0; } else { /* read the page range in */ - struct iovec iov; - struct uio auio; - auio.uio_iov = &iov; - auio.uio_iovcnt = 1; - auio.uio_offset = NBOFF(bp) + firstpg * PAGE_SIZE_64; - auio.uio_resid = (lastpg - firstpg + 1) * PAGE_SIZE; - auio.uio_segflg = UIO_SYSSPACE; - auio.uio_rw = UIO_READ; - auio.uio_procp = p; + uio_t auio; + char uio_buf[ UIO_SIZEOF(1) ]; + NFS_BUF_MAP(bp); - iov.iov_base = bp->nb_data + firstpg * PAGE_SIZE; - iov.iov_len = auio.uio_resid; - error = nfs_readrpc(vp, &auio, cred); + auio = uio_createwithbuffer(1, (NBOFF(bp) + firstpg * PAGE_SIZE_64), + UIO_SYSSPACE, UIO_READ, &uio_buf[0], sizeof(uio_buf)); + if (!auio) { + error = ENOMEM; + } else { + uio_addiov(auio, CAST_USER_ADDR_T(bp->nb_data + (firstpg * PAGE_SIZE)), + ((lastpg - firstpg + 1) * PAGE_SIZE)); + error = nfs_read_rpc(np, auio, ctx); + } if (error) { + if (ioflag & IO_NOCACHE) + SET(bp->nb_flags, NB_NOCACHE); nfs_buf_release(bp, 1); - FSDBG_BOT(514, vp, 0xd1e000e, 0, error); + nfs_data_unlock(np); + FSDBG_BOT(514, np, 0xd1e000e, 0, error); return (error); } /* Make sure that the valid range is set to cover this read. */ bp->nb_validoff = trunc_page_32(on); bp->nb_validend = round_page_32(on+n); nfs_buf_normalize_valid_range(np, bp); - if (auio.uio_resid > 0) { + if (uio_resid(auio) > 0) { /* if short read, must have hit EOF, */ /* so zero the rest of the range */ - bzero(iov.iov_base, auio.uio_resid); + bzero(CAST_DOWN(caddr_t, uio_curriovbase(auio)), uio_resid(auio)); } /* mark the pages (successfully read) as valid */ for (pg=firstpg; pg <= lastpg; pg++) @@ -1416,835 +2191,1321 @@ again: } /* if no pages are valid, read the whole block */ if (!bp->nb_valid) { + if (!IS_VALID_CRED(bp->nb_rcred) && IS_VALID_CRED(cred)) { + kauth_cred_ref(cred); + bp->nb_rcred = cred; + } SET(bp->nb_flags, NB_READ); CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL)); - error = nfs_doio(bp, cred, p); + error = nfs_buf_read(bp); + if (ioflag & IO_NOCACHE) + SET(bp->nb_flags, NB_NOCACHE); if (error) { + nfs_data_unlock(np); nfs_buf_release(bp, 1); - FSDBG_BOT(514, vp, 0xd1e000f, 0, error); + FSDBG_BOT(514, np, 0xd1e000f, 0, error); return (error); } } buffer_ready: - vp->v_lastr = lbn; /* validate read range against valid range and clip */ if (bp->nb_validend > 0) { diff = (on >= bp->nb_validend) ? 0 : (bp->nb_validend - on); if (diff < n) n = diff; } - if (n > 0) + if (n > 0) { NFS_BUF_MAP(bp); - break; - case VLNK: - nfsstats.biocache_readlinks++; - bp = nfs_buf_get(vp, (daddr_t)0, NFS_MAXPATHLEN, p, operation); - if (!bp) { - FSDBG_BOT(514, vp, 0xd1e0010, 0, EINTR); - return (EINTR); - } - if (!ISSET(bp->nb_flags, NB_CACHE)) { - SET(bp->nb_flags, NB_READ); - error = nfs_doio(bp, cred, p); + error = uiomove(bp->nb_data + on, n, uio); + } + + nfs_buf_release(bp, 1); + nfs_data_unlock(np); + nfs_node_lock_force(np); + np->n_lastread = (uio_offset(uio) - 1) / biosize; + nfs_node_unlock(np); + } while (error == 0 && uio_resid(uio) > 0 && n > 0); + FSDBG_BOT(514, np, uio_offset(uio), uio_resid(uio), error); + return (error); +} + +/* + * limit the number of outstanding async I/O writes + */ +int +nfs_async_write_start(struct nfsmount *nmp) +{ + int error = 0, slpflag = NMFLAG(nmp, INTR) ? PCATCH : 0; + struct timespec ts = {1, 0}; + + if (nfs_max_async_writes <= 0) + return (0); + lck_mtx_lock(&nmp->nm_lock); + while ((nfs_max_async_writes > 0) && (nmp->nm_asyncwrites >= nfs_max_async_writes)) { + if ((error = nfs_sigintr(nmp, NULL, current_thread(), 1))) + break; + msleep(&nmp->nm_asyncwrites, &nmp->nm_lock, slpflag|(PZERO-1), "nfsasyncwrites", &ts); + slpflag = 0; + } + if (!error) + nmp->nm_asyncwrites++; + lck_mtx_unlock(&nmp->nm_lock); + return (error); +} +void +nfs_async_write_done(struct nfsmount *nmp) +{ + if (nmp->nm_asyncwrites <= 0) + return; + lck_mtx_lock(&nmp->nm_lock); + if (nmp->nm_asyncwrites-- >= nfs_max_async_writes) + wakeup(&nmp->nm_asyncwrites); + lck_mtx_unlock(&nmp->nm_lock); +} + +/* + * write (or commit) the given NFS buffer + * + * Commit the buffer if we can. + * Write out any dirty range. + * If any dirty pages remain, write them out. + * Mark buffer done. + * + * For async requests, all the work beyond sending the initial + * write RPC is handled in the RPC callback(s). + */ +int +nfs_buf_write(struct nfsbuf *bp) +{ + int error = 0, oldflags, async; + nfsnode_t np; + thread_t thd; + kauth_cred_t cred; + proc_t p = current_proc(); + int iomode, doff, dend, firstpg, lastpg; + uint32_t pagemask; + + FSDBG_TOP(553, bp, NBOFF(bp), bp->nb_flags, 0); + + if (!ISSET(bp->nb_lflags, NBL_BUSY)) + panic("nfs_buf_write: buffer is not busy???"); + + np = bp->nb_np; + async = ISSET(bp->nb_flags, NB_ASYNC); + oldflags = bp->nb_flags; + + CLR(bp->nb_flags, (NB_READ|NB_DONE|NB_ERROR|NB_DELWRI)); + if (ISSET(oldflags, NB_DELWRI)) { + lck_mtx_lock(nfs_buf_mutex); + nfs_nbdwrite--; + NFSBUFCNTCHK(); + lck_mtx_unlock(nfs_buf_mutex); + wakeup(&nfs_nbdwrite); + } + + /* move to clean list */ + if (ISSET(oldflags, (NB_ASYNC|NB_DELWRI))) { + lck_mtx_lock(nfs_buf_mutex); + if (bp->nb_vnbufs.le_next != NFSNOLIST) + LIST_REMOVE(bp, nb_vnbufs); + LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs); + lck_mtx_unlock(nfs_buf_mutex); + } + nfs_node_lock_force(np); + np->n_numoutput++; + nfs_node_unlock(np); + vnode_startwrite(NFSTOV(np)); + + if (p && p->p_stats) + OSIncrementAtomicLong(&p->p_stats->p_ru.ru_oublock); + + cred = bp->nb_wcred; + if (!IS_VALID_CRED(cred) && ISSET(bp->nb_flags, NB_READ)) + cred = bp->nb_rcred; /* shouldn't really happen, but... */ + if (IS_VALID_CRED(cred)) + kauth_cred_ref(cred); + thd = async ? NULL : current_thread(); + + /* We need to make sure the pages are locked before doing I/O. */ + if (!ISSET(bp->nb_flags, NB_META) && UBCINFOEXISTS(NFSTOV(np))) { + if (!ISSET(bp->nb_flags, NB_PAGELIST)) { + error = nfs_buf_upl_setup(bp); if (error) { + printf("nfs_buf_write: upl create failed %d\n", error); SET(bp->nb_flags, NB_ERROR); - nfs_buf_release(bp, 1); - FSDBG_BOT(514, vp, 0xd1e0011, 0, error); - return (error); + bp->nb_error = error = EIO; + nfs_buf_iodone(bp); + goto out; } + nfs_buf_upl_check(bp); } - n = min(uio->uio_resid, bp->nb_validend); - on = 0; - break; - case VDIR: - nfsstats.biocache_readdirs++; - if (np->n_direofoffset && uio->uio_offset >= np->n_direofoffset) { - FSDBG_BOT(514, vp, 0xde0f0001, 0, 0); - return (0); + } + + /* If NB_NEEDCOMMIT is set, a commit RPC may do the trick. */ + if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) + nfs_buf_check_write_verifier(np, bp); + if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) { + struct nfsmount *nmp = NFSTONMP(np); + if (!nmp) { + SET(bp->nb_flags, NB_ERROR); + bp->nb_error = error = EIO; + nfs_buf_iodone(bp); + goto out; } - lbn = uio->uio_offset / NFS_DIRBLKSIZ; - on = uio->uio_offset & (NFS_DIRBLKSIZ - 1); - bp = nfs_buf_get(vp, lbn, NFS_DIRBLKSIZ, p, operation); - if (!bp) { - FSDBG_BOT(514, vp, 0xd1e0012, 0, EINTR); - return (EINTR); - } - if (!ISSET(bp->nb_flags, NB_CACHE)) { - SET(bp->nb_flags, NB_READ); - error = nfs_doio(bp, cred, p); - if (error) { - nfs_buf_release(bp, 1); - } - while (error == NFSERR_BAD_COOKIE) { - nfs_invaldir(vp); - error = nfs_vinvalbuf(vp, 0, cred, p, 1); - /* - * Yuck! The directory has been modified on the - * server. The only way to get the block is by - * reading from the beginning to get all the - * offset cookies. - */ - for (i = 0; i <= lbn && !error; i++) { - if (np->n_direofoffset - && (i * NFS_DIRBLKSIZ) >= np->n_direofoffset) { - FSDBG_BOT(514, vp, 0xde0f0002, 0, 0); - return (0); - } - bp = nfs_buf_get(vp, i, NFS_DIRBLKSIZ, p, operation); - if (!bp) { - FSDBG_BOT(514, vp, 0xd1e0013, 0, EINTR); - return (EINTR); - } - if (!ISSET(bp->nb_flags, NB_CACHE)) { - SET(bp->nb_flags, NB_READ); - error = nfs_doio(bp, cred, p); - /* - * no error + NB_INVAL == directory EOF, - * use the block. - */ - if (error == 0 && (bp->nb_flags & NB_INVAL)) - break; - } - /* - * An error will throw away the block and the - * for loop will break out. If no error and this - * is not the block we want, we throw away the - * block and go for the next one via the for loop. - */ - if (error || i < lbn) - nfs_buf_release(bp, 1); + SET(bp->nb_flags, NB_WRITEINPROG); + error = nmp->nm_funcs->nf_commit_rpc(np, NBOFF(bp) + bp->nb_dirtyoff, + bp->nb_dirtyend - bp->nb_dirtyoff, bp->nb_wcred, bp->nb_verf); + CLR(bp->nb_flags, NB_WRITEINPROG); + if (error) { + if (error != NFSERR_STALEWRITEVERF) { + SET(bp->nb_flags, NB_ERROR); + bp->nb_error = error; } - } - /* - * The above while is repeated if we hit another cookie - * error. If we hit an error and it wasn't a cookie error, - * we give up. - */ - if (error) { - FSDBG_BOT(514, vp, 0xd1e0014, 0, error); - return (error); - } + nfs_buf_iodone(bp); + goto out; } - - /* - * If not eof and read aheads are enabled, start one. - * (You need the current block first, so that you have the - * directory offset cookie of the next block.) - */ - if (nfs_numasync > 0 && nmp->nm_readahead > 0 && - (np->n_direofoffset == 0 || - (lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) && - !(np->n_flag & NQNFSNONCACHE) && - !nfs_buf_incore(vp, lbn + 1)) { - rabp = nfs_buf_get(vp, lbn + 1, NFS_DIRBLKSIZ, p, - operation); - if (rabp) { - if (!ISSET(rabp->nb_flags, (NB_CACHE))) { - SET(rabp->nb_flags, (NB_READ | NB_ASYNC)); - if (nfs_asyncio(rabp, cred)) { - SET(rabp->nb_flags, (NB_INVAL|NB_ERROR)); - rabp->nb_error = EIO; - nfs_buf_release(rabp, 1); - } - } else { - nfs_buf_release(rabp, 1); - } - } + bp->nb_dirtyoff = bp->nb_dirtyend = 0; + CLR(bp->nb_flags, NB_NEEDCOMMIT); + nfs_node_lock_force(np); + np->n_needcommitcnt--; + CHECK_NEEDCOMMITCNT(np); + nfs_node_unlock(np); + } + if (!error && (bp->nb_dirtyend > 0)) { + /* sanity check the dirty range */ + if (NBOFF(bp) + bp->nb_dirtyend > (off_t) np->n_size) { + bp->nb_dirtyend = np->n_size - NBOFF(bp); + if (bp->nb_dirtyoff >= bp->nb_dirtyend) + bp->nb_dirtyoff = bp->nb_dirtyend = 0; } + } + if (!error && (bp->nb_dirtyend > 0)) { + /* there's a dirty range that needs to be written out */ + NFS_BUF_MAP(bp); + + doff = bp->nb_dirtyoff; + dend = bp->nb_dirtyend; + + /* if doff page is dirty, move doff to start of page */ + if (NBPGDIRTY(bp, doff / PAGE_SIZE)) + doff -= doff & PAGE_MASK; + /* try to expand write range to include preceding dirty pages */ + if (!(doff & PAGE_MASK)) + while ((doff > 0) && NBPGDIRTY(bp, (doff - 1) / PAGE_SIZE)) + doff -= PAGE_SIZE; + /* if dend page is dirty, move dend to start of next page */ + if ((dend & PAGE_MASK) && NBPGDIRTY(bp, dend / PAGE_SIZE)) + dend = round_page_32(dend); + /* try to expand write range to include trailing dirty pages */ + if (!(dend & PAGE_MASK)) + while ((dend < (int)bp->nb_bufsize) && NBPGDIRTY(bp, dend / PAGE_SIZE)) + dend += PAGE_SIZE; + /* make sure to keep dend clipped to EOF */ + if ((NBOFF(bp) + dend) > (off_t) np->n_size) + dend = np->n_size - NBOFF(bp); + /* calculate range of complete pages being written */ + firstpg = round_page_32(doff) / PAGE_SIZE; + lastpg = (trunc_page_32(dend) - 1) / PAGE_SIZE; + /* calculate mask for that page range */ + pagemask = ((1 << (lastpg + 1)) - 1) & ~((1 << firstpg) - 1); + /* - * Make sure we use a signed variant of min() since - * the second term may be negative. - */ - n = lmin(uio->uio_resid, bp->nb_validend - on); - /* - * We keep track of the directory eof in - * np->n_direofoffset and chop it off as an - * extra step right here. + * compare page mask to nb_dirty; if there are other dirty pages + * then write FILESYNC; otherwise, write UNSTABLE if async and + * not needcommit/stable; otherwise write FILESYNC */ - if (np->n_direofoffset && - n > np->n_direofoffset - uio->uio_offset) - n = np->n_direofoffset - uio->uio_offset; + if (bp->nb_dirty & ~pagemask) + iomode = NFS_WRITE_FILESYNC; + else if ((bp->nb_flags & (NB_ASYNC | NB_NEEDCOMMIT | NB_STABLE)) == NB_ASYNC) + iomode = NFS_WRITE_UNSTABLE; + else + iomode = NFS_WRITE_FILESYNC; + + /* write the whole contiguous dirty range */ + bp->nb_offio = doff; + bp->nb_endio = dend; + + OSAddAtomic(1, &nfsstats.write_bios); + + SET(bp->nb_flags, NB_WRITEINPROG); + error = nfs_buf_write_rpc(bp, iomode, thd, cred); /* - * Make sure that we return an integral number of entries so - * that any subsequent calls will start copying from the start - * of the next entry. - * - * If the current value of n has the last entry cut short, - * set n to copy everything up to the last entry instead. + * For async I/O, the callbacks will finish up the + * write and push out any dirty pages. Otherwise, + * the write has already been finished and any dirty + * pages pushed out. */ - if (n > 0) { - dp = bp->nb_data + on; - while (dp < (bp->nb_data + on + n)) { - direntp = (struct dirent *)dp; - dp += direntp->d_reclen; + } else { + if (!error && bp->nb_dirty) /* write out any dirty pages */ + error = nfs_buf_write_dirty_pages(bp, thd, cred); + nfs_buf_iodone(bp); + } + /* note: bp is still valid only for !async case */ +out: + if (!async) { + error = nfs_buf_iowait(bp); + /* move to clean list */ + if (oldflags & NB_DELWRI) { + lck_mtx_lock(nfs_buf_mutex); + if (bp->nb_vnbufs.le_next != NFSNOLIST) + LIST_REMOVE(bp, nb_vnbufs); + LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs); + lck_mtx_unlock(nfs_buf_mutex); + } + FSDBG_BOT(553, bp, NBOFF(bp), bp->nb_flags, error); + nfs_buf_release(bp, 1); + /* check if we need to invalidate (and we can) */ + if ((np->n_flag & NNEEDINVALIDATE) && + !(np->n_bflag & (NBINVALINPROG|NBFLUSHINPROG))) { + int invalidate = 0; + nfs_node_lock_force(np); + if (np->n_flag & NNEEDINVALIDATE) { + invalidate = 1; + np->n_flag &= ~NNEEDINVALIDATE; + } + nfs_node_unlock(np); + if (invalidate) { + /* + * There was a write error and we need to + * invalidate attrs and flush buffers in + * order to sync up with the server. + * (if this write was extending the file, + * we may no longer know the correct size) + * + * But we couldn't call vinvalbuf while holding + * the buffer busy. So we call vinvalbuf() after + * releasing the buffer. + */ + nfs_vinvalbuf2(NFSTOV(np), V_SAVE|V_IGNORE_WRITEERR, thd, cred, 1); } - if (dp > (bp->nb_data + on + n)) - n = (dp - direntp->d_reclen) - (bp->nb_data + on); } - break; - default: - printf("nfs_bioread: type %x unexpected\n",vp->v_type); - FSDBG_BOT(514, vp, 0xd1e0015, 0, EINVAL); - return (EINVAL); - }; + } - if (n > 0) { - error = uiomove(bp->nb_data + on, (int)n, uio); - } - switch (vp->v_type) { - case VREG: - break; - case VLNK: - n = 0; - break; - case VDIR: - if (np->n_flag & NQNFSNONCACHE) - SET(bp->nb_flags, NB_INVAL); - break; - } - nfs_buf_release(bp, 1); - } while (error == 0 && uio->uio_resid > 0 && n > 0); - FSDBG_BOT(514, vp, uio->uio_offset, uio->uio_resid, error); + if (IS_VALID_CRED(cred)) + kauth_cred_unref(&cred); return (error); } - /* - * Vnode op for write using bio + * finish the writing of a buffer */ -int -nfs_write(ap) - struct vop_write_args /* { - struct vnode *a_vp; - struct uio *a_uio; - int a_ioflag; - struct ucred *a_cred; - } */ *ap; +void +nfs_buf_write_finish(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred) { - struct uio *uio = ap->a_uio; - struct proc *p = uio->uio_procp; - struct vnode *vp = ap->a_vp; - struct nfsnode *np = VTONFS(vp); - struct ucred *cred = ap->a_cred; - int ioflag = ap->a_ioflag; - struct nfsbuf *bp; - struct vattr vattr; - struct nfsmount *nmp = VFSTONFS(vp->v_mount); - daddr_t lbn; - int biosize, bufsize, writeop; - int n, on, error = 0, iomode, must_commit; - off_t boff, start, end, cureof; - struct iovec iov; - struct uio auio; - - FSDBG_TOP(515, vp, uio->uio_offset, uio->uio_resid, ioflag); - -#if DIAGNOSTIC - if (uio->uio_rw != UIO_WRITE) - panic("nfs_write mode"); - if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != current_proc()) - panic("nfs_write proc"); -#endif - if (vp->v_type != VREG) - return (EIO); - if (np->n_flag & NWRITEERR) { - np->n_flag &= ~NWRITEERR; - FSDBG_BOT(515, vp, uio->uio_offset, uio->uio_resid, np->n_error); - return (np->n_error); - } - if ((nmp->nm_flag & NFSMNT_NFSV3) && - !(nmp->nm_state & NFSSTA_GOTFSINFO)) - (void)nfs_fsinfo(nmp, vp, cred, p); - if (ioflag & (IO_APPEND | IO_SYNC)) { - if (np->n_flag & NMODIFIED) { - np->n_xid = 0; - error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); - if (error) { - FSDBG_BOT(515, vp, uio->uio_offset, 0x10bad01, error); - return (error); - } - } - if (ioflag & IO_APPEND) { - np->n_xid = 0; - error = VOP_GETATTR(vp, &vattr, cred, p); - if (error) { - FSDBG_BOT(515, vp, uio->uio_offset, 0x10bad02, error); - return (error); - } - uio->uio_offset = np->n_size; - } + nfsnode_t np = bp->nb_np; + int error = (bp->nb_flags & NB_ERROR) ? bp->nb_error : 0; + int firstpg, lastpg; + uint32_t pagemask; + + if ((error == EINTR) || (error == ERESTART)) { + CLR(bp->nb_flags, NB_ERROR); + SET(bp->nb_flags, NB_EINTR); } - if (uio->uio_offset < 0) { - FSDBG_BOT(515, vp, uio->uio_offset, 0xbad0ff, EINVAL); - return (EINVAL); + + if (!error) { + /* calculate range of complete pages being written */ + firstpg = round_page_32(bp->nb_offio) / PAGE_SIZE; + lastpg = (trunc_page_32(bp->nb_endio) - 1) / PAGE_SIZE; + /* calculate mask for that page range written */ + pagemask = ((1 << (lastpg + 1)) - 1) & ~((1 << firstpg) - 1); + /* clear dirty bits for pages we've written */ + bp->nb_dirty &= ~pagemask; } - if (uio->uio_resid == 0) { - FSDBG_BOT(515, vp, uio->uio_offset, uio->uio_resid, 0); - return (0); + + /* manage needcommit state */ + if (!error && (bp->nb_commitlevel == NFS_WRITE_UNSTABLE)) { + if (!ISSET(bp->nb_flags, NB_NEEDCOMMIT)) { + nfs_node_lock_force(np); + np->n_needcommitcnt++; + nfs_node_unlock(np); + SET(bp->nb_flags, NB_NEEDCOMMIT); + } + /* make sure nb_dirtyoff/nb_dirtyend reflect actual range written */ + bp->nb_dirtyoff = bp->nb_offio; + bp->nb_dirtyend = bp->nb_endio; + } else if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) { + nfs_node_lock_force(np); + np->n_needcommitcnt--; + CHECK_NEEDCOMMITCNT(np); + nfs_node_unlock(np); + CLR(bp->nb_flags, NB_NEEDCOMMIT); } + + CLR(bp->nb_flags, NB_WRITEINPROG); + /* - * Maybe this should be above the vnode op call, but so long as - * file servers have no limits, i don't think it matters + * For an unstable write, the buffer is still treated as dirty until + * a commit (or stable (re)write) is performed. Buffers needing only + * a commit are marked with the NB_DELWRI and NB_NEEDCOMMIT flags. + * + * If the write was interrupted we set NB_EINTR. Don't set NB_ERROR + * because that would cause the buffer to be dropped. The buffer is + * still valid and simply needs to be written again. */ - if (p && uio->uio_offset + uio->uio_resid > - p->p_rlimit[RLIMIT_FSIZE].rlim_cur) { - psignal(p, SIGXFSZ); - FSDBG_BOT(515, vp, uio->uio_offset, 0x2b1f, EFBIG); - return (EFBIG); + if ((error == EINTR) || (error == ERESTART) || (!error && (bp->nb_flags & NB_NEEDCOMMIT))) { + CLR(bp->nb_flags, NB_INVAL); + if (!ISSET(bp->nb_flags, NB_DELWRI)) { + SET(bp->nb_flags, NB_DELWRI); + lck_mtx_lock(nfs_buf_mutex); + nfs_nbdwrite++; + NFSBUFCNTCHK(); + lck_mtx_unlock(nfs_buf_mutex); + } + /* + * Since for the NB_ASYNC case, we've reassigned the buffer to the + * clean list, we have to reassign it back to the dirty one. Ugh. + */ + if (ISSET(bp->nb_flags, NB_ASYNC)) { + /* move to dirty list */ + lck_mtx_lock(nfs_buf_mutex); + if (bp->nb_vnbufs.le_next != NFSNOLIST) + LIST_REMOVE(bp, nb_vnbufs); + LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs); + lck_mtx_unlock(nfs_buf_mutex); + } + } else { + /* either there's an error or we don't need to commit */ + if (error) { + /* + * There was a write error and we need to invalidate + * attrs and flush buffers in order to sync up with the + * server. (if this write was extending the file, we + * may no longer know the correct size) + * + * But we can't call vinvalbuf while holding this + * buffer busy. Set a flag to do it after releasing + * the buffer. + */ + nfs_node_lock_force(np); + np->n_error = error; + np->n_flag |= (NWRITEERR | NNEEDINVALIDATE); + NATTRINVALIDATE(np); + nfs_node_unlock(np); + } + /* clear the dirty range */ + bp->nb_dirtyoff = bp->nb_dirtyend = 0; } - biosize = vp->v_mount->mnt_stat.f_iosize; + if (!error && bp->nb_dirty) + nfs_buf_write_dirty_pages(bp, thd, cred); + nfs_buf_iodone(bp); +} - do { - /* - * Check for a valid write lease. - */ - if ((nmp->nm_flag & NFSMNT_NQNFS) && - NQNFS_CKINVALID(vp, np, ND_WRITE)) { - do { - error = nqnfs_getlease(vp, ND_WRITE, cred, p); - } while (error == NQNFS_EXPIRED); - if (error) { - FSDBG_BOT(515, vp, uio->uio_offset, 0x11110001, error); - return (error); - } - if (np->n_lrev != np->n_brev || - (np->n_flag & NQNFSNONCACHE)) { - error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); - if (error) { - FSDBG_BOT(515, vp, uio->uio_offset, 0x11110002, error); - return (error); - } - np->n_brev = np->n_lrev; +/* + * write out any pages marked dirty in a buffer + * + * We do use unstable writes and follow up with a commit. + * If we catch the write verifier changing we'll restart + * do the writes filesync. + */ +int +nfs_buf_write_dirty_pages(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred) +{ + nfsnode_t np = bp->nb_np; + struct nfsmount *nmp = NFSTONMP(np); + int error = 0, commit, iomode, iomode2, len, pg, count, npages, off; + uint32_t dirty = bp->nb_dirty; + uint64_t wverf; + uio_t auio; + char uio_buf [ UIO_SIZEOF(1) ]; + + if (!bp->nb_dirty) + return (0); + + /* there are pages marked dirty that need to be written out */ + OSAddAtomic(1, &nfsstats.write_bios); + NFS_BUF_MAP(bp); + SET(bp->nb_flags, NB_WRITEINPROG); + npages = bp->nb_bufsize / PAGE_SIZE; + iomode = NFS_WRITE_UNSTABLE; + + auio = uio_createwithbuffer(1, 0, UIO_SYSSPACE, UIO_WRITE, + &uio_buf, sizeof(uio_buf)); + +again: + dirty = bp->nb_dirty; + wverf = bp->nb_verf; + commit = NFS_WRITE_FILESYNC; + for (pg = 0; pg < npages; pg++) { + if (!NBPGDIRTY(bp, pg)) + continue; + count = 1; + while (((pg + count) < npages) && NBPGDIRTY(bp, pg + count)) + count++; + /* write count pages starting with page pg */ + off = pg * PAGE_SIZE; + len = count * PAGE_SIZE; + /* clip writes to EOF */ + if (NBOFF(bp) + off + len > (off_t) np->n_size) + len -= (NBOFF(bp) + off + len) - np->n_size; + if (len > 0) { + iomode2 = iomode; + uio_reset(auio, NBOFF(bp) + off, UIO_SYSSPACE, UIO_WRITE); + uio_addiov(auio, CAST_USER_ADDR_T(bp->nb_data + off), len); + error = nfs_write_rpc2(np, auio, thd, cred, &iomode2, &bp->nb_verf); + if (error) + break; + if (iomode2 < commit) /* Retain the lowest commitment level returned. */ + commit = iomode2; + if ((commit != NFS_WRITE_FILESYNC) && (wverf != bp->nb_verf)) { + /* verifier changed, redo all the writes filesync */ + iomode = NFS_WRITE_FILESYNC; + goto again; } } - if (ISSET(vp->v_flag, VNOCACHE_DATA) && - (np->n_dirtyblkhd.lh_first || np->n_cleanblkhd.lh_first)) { - error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); - if (error) { - FSDBG_BOT(515, vp, 0, 0, error); - return (error); - } + /* clear dirty bits */ + while (count--) { + dirty &= ~(1 << pg); + if (count) /* leave pg on last page */ + pg++; } - if (((np->n_flag & NQNFSNONCACHE) || - ISSET(vp->v_flag, VNOCACHE_DATA)) && - uio->uio_iovcnt == 1) { - iomode = NFSV3WRITE_FILESYNC; - error = nfs_writerpc(vp, uio, cred, &iomode, &must_commit); - if (must_commit) - nfs_clearcommit(vp->v_mount); - FSDBG_BOT(515, vp, uio->uio_offset, uio->uio_resid, error); - return (error); - } - nfsstats.biocache_writes++; - lbn = uio->uio_offset / biosize; - on = uio->uio_offset % biosize; - n = min((unsigned)(biosize - on), uio->uio_resid); -again: - bufsize = biosize; - /* - * Get a cache block for writing. The range to be written is - * (off..off+n) within the block. We ensure that the block - * either has no dirty region or that the given range is - * contiguous with the existing dirty region. - */ - bp = nfs_buf_get(vp, lbn, bufsize, p, BLK_WRITE); - if (!bp) { - FSDBG_BOT(515, vp, uio->uio_offset, uio->uio_resid, EINTR); - return (EINTR); + } + CLR(bp->nb_flags, NB_WRITEINPROG); + + if (!error && (commit != NFS_WRITE_FILESYNC)) { + error = nmp->nm_funcs->nf_commit_rpc(np, NBOFF(bp), bp->nb_bufsize, cred, wverf); + if (error == NFSERR_STALEWRITEVERF) { + /* verifier changed, so we need to restart all the writes */ + iomode = NFS_WRITE_FILESYNC; + goto again; } - /* map the block because we know we're going to write to it */ - NFS_BUF_MAP(bp); + } + if (!error) { + bp->nb_dirty = dirty; + } else { + SET(bp->nb_flags, NB_ERROR); + bp->nb_error = error; + } + return (error); +} - if (ISSET(vp->v_flag, VNOCACHE_DATA)) - SET(bp->nb_flags, (NB_NOCACHE|NB_INVAL)); +/* + * initiate the NFS WRITE RPC(s) for a buffer + */ +int +nfs_buf_write_rpc(struct nfsbuf *bp, int iomode, thread_t thd, kauth_cred_t cred) +{ + struct nfsmount *nmp; + nfsnode_t np = bp->nb_np; + int error = 0, nfsvers, async; + int offset, nrpcs; + uint32_t nmwsize, length, len; + struct nfsreq *req; + struct nfsreq_cbinfo cb; + uio_t auio; + char uio_buf [ UIO_SIZEOF(1) ]; + + nmp = NFSTONMP(np); + if (!nmp) { + bp->nb_error = error = ENXIO; + SET(bp->nb_flags, NB_ERROR); + nfs_buf_iodone(bp); + return (error); + } + nfsvers = nmp->nm_vers; + nmwsize = nmp->nm_wsize; - /* - * NFS has embedded ucred so crhold() risks zone corruption - */ - if (bp->nb_wcred == NOCRED) - bp->nb_wcred = crdup(cred); + offset = bp->nb_offio; + length = bp->nb_endio - bp->nb_offio; - /* - * If there's already a dirty range AND dirty pages in this block we - * need to send a commit AND write the dirty pages before continuing. - * - * If there's already a dirty range OR dirty pages in this block - * and the new write range is not contiguous with the existing range, - * then force the buffer to be written out now. - * (We used to just extend the dirty range to cover the valid, - * but unwritten, data in between also. But writing ranges - * of data that weren't actually written by an application - * risks overwriting some other client's data with stale data - * that's just masquerading as new written data.) - */ - if (bp->nb_dirtyend > 0) { - if (on > bp->nb_dirtyend || (on + n) < bp->nb_dirtyoff || bp->nb_dirty) { - FSDBG(515, vp, uio->uio_offset, bp, 0xd15c001); - /* write/commit buffer "synchronously" */ - /* (NB_STABLE indicates that data writes should be FILESYNC) */ - CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL)); - SET(bp->nb_flags, (NB_ASYNC | NB_STABLE)); - error = nfs_buf_write(bp); - if (error) { - FSDBG_BOT(515, vp, uio->uio_offset, uio->uio_resid, error); - return (error); - } - goto again; - } - } else if (bp->nb_dirty) { - int firstpg, lastpg; - u_int32_t pagemask; - /* calculate write range pagemask */ - firstpg = on/PAGE_SIZE; - lastpg = (on+n-1)/PAGE_SIZE; - pagemask = ((1 << (lastpg+1)) - 1) & ~((1 << firstpg) - 1); - /* check if there are dirty pages outside the write range */ - if (bp->nb_dirty & ~pagemask) { - FSDBG(515, vp, uio->uio_offset, bp, 0xd15c002); - /* write/commit buffer "synchronously" */ - /* (NB_STABLE indicates that data writes should be FILESYNC) */ - CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL)); - SET(bp->nb_flags, (NB_ASYNC | NB_STABLE)); - error = nfs_buf_write(bp); - if (error) { - FSDBG_BOT(515, vp, uio->uio_offset, uio->uio_resid, error); - return (error); - } - goto again; - } - /* if the first or last pages are already dirty */ - /* make sure that the dirty range encompasses those pages */ - if (NBPGDIRTY(bp,firstpg) || NBPGDIRTY(bp,lastpg)) { - FSDBG(515, vp, uio->uio_offset, bp, 0xd15c003); - bp->nb_dirtyoff = min(on, firstpg * PAGE_SIZE); - if (NBPGDIRTY(bp,lastpg)) { - bp->nb_dirtyend = (lastpg+1) * PAGE_SIZE; - /* clip to EOF */ - if (NBOFF(bp) + bp->nb_dirtyend > np->n_size) - bp->nb_dirtyend = np->n_size - NBOFF(bp); - } else - bp->nb_dirtyend = on+n; - } + /* Note: Can only do async I/O if nfsiods are configured. */ + async = (bp->nb_flags & NB_ASYNC) && (NFSIOD_MAX > 0); + bp->nb_commitlevel = NFS_WRITE_FILESYNC; + cb.rcb_func = async ? nfs_buf_write_rpc_finish : NULL; + cb.rcb_bp = bp; + + if ((nfsvers == NFS_VER2) && ((NBOFF(bp) + bp->nb_endio) > 0xffffffffLL)) { + bp->nb_error = error = EFBIG; + SET(bp->nb_flags, NB_ERROR); + nfs_buf_iodone(bp); + return (error); + } + + auio = uio_createwithbuffer(1, NBOFF(bp) + offset, UIO_SYSSPACE, + UIO_WRITE, &uio_buf, sizeof(uio_buf)); + uio_addiov(auio, CAST_USER_ADDR_T(bp->nb_data + offset), length); + + bp->nb_rpcs = nrpcs = (length + nmwsize - 1) / nmwsize; + if (async && (nrpcs > 1)) { + SET(bp->nb_flags, NB_MULTASYNCRPC); + } else { + CLR(bp->nb_flags, NB_MULTASYNCRPC); + } + + while (length > 0) { + if (ISSET(bp->nb_flags, NB_ERROR)) { + error = bp->nb_error; + break; } + len = (length > nmwsize) ? nmwsize : length; + cb.rcb_args[0] = offset; + cb.rcb_args[1] = len; + if (nmp->nm_vers >= NFS_VER4) + cb.rcb_args[2] = nmp->nm_stategenid; + if (async && ((error = nfs_async_write_start(nmp)))) + break; + req = NULL; + error = nmp->nm_funcs->nf_write_rpc_async(np, auio, len, thd, cred, + iomode, &cb, &req); + if (error) { + if (async) + nfs_async_write_done(nmp); + break; + } + offset += len; + length -= len; + if (async) + continue; + nfs_buf_write_rpc_finish(req); + } + if (length > 0) { /* - * Are we extending the size of the file with this write? - * If so, update file size now that we have the block. - * If there was a partial buf at the old eof, validate - * and zero the new bytes. + * Something bad happened while trying to send the RPCs. + * Wait for any outstanding requests to complete. */ - cureof = (off_t)np->n_size; - if (uio->uio_offset + n > np->n_size) { - struct nfsbuf *eofbp = NULL; - daddr_t eofbn = np->n_size / biosize; - int eofoff = np->n_size % biosize; - int neweofoff = (uio->uio_offset + n) % biosize; - - FSDBG(515, 0xb1ffa000, uio->uio_offset + n, eofoff, neweofoff); - - if (eofoff && eofbn < lbn && nfs_buf_incore(vp, eofbn)) - eofbp = nfs_buf_get(vp, eofbn, biosize, p, BLK_WRITE); - - /* if we're extending within the same last block */ - /* and the block is flagged as being cached... */ - if ((lbn == eofbn) && ISSET(bp->nb_flags, NB_CACHE)) { - /* ...check that all pages in buffer are valid */ - int endpg = ((neweofoff ? neweofoff : biosize) - 1)/PAGE_SIZE; - u_int32_t pagemask; - /* pagemask only has to extend to last page being written to */ - pagemask = (1 << (endpg+1)) - 1; - FSDBG(515, 0xb1ffa001, bp->nb_valid, pagemask, 0); - if ((bp->nb_valid & pagemask) != pagemask) { - /* zerofill any hole */ - if (on > bp->nb_validend) { - int i; - for (i=bp->nb_validend/PAGE_SIZE; i <= (on - 1)/PAGE_SIZE; i++) - NBPGVALID_SET(bp, i); - NFS_BUF_MAP(bp); - FSDBG(516, bp, bp->nb_validend, on - bp->nb_validend, 0xf01e); - bzero((char *)bp->nb_data + bp->nb_validend, - on - bp->nb_validend); - } - /* zerofill any trailing data in the last page */ - if (neweofoff) { - NFS_BUF_MAP(bp); - FSDBG(516, bp, neweofoff, PAGE_SIZE - (neweofoff & PAGE_MASK), 0xe0f); - bzero((char *)bp->nb_data + neweofoff, - PAGE_SIZE - (neweofoff & PAGE_MASK)); - } - } - } - np->n_flag |= NMODIFIED; - np->n_size = uio->uio_offset + n; - ubc_setsize(vp, (off_t)np->n_size); /* XXX errors */ - if (eofbp) { - /* - * We may need to zero any previously invalid data - * after the old EOF in the previous EOF buffer. - * - * For the old last page, don't zero bytes if there - * are invalid bytes in that page (i.e. the page isn't - * currently valid). - * For pages after the old last page, zero them and - * mark them as valid. - */ - char *d; - int i; - if (ISSET(vp->v_flag, VNOCACHE_DATA)) - SET(eofbp->nb_flags, (NB_NOCACHE|NB_INVAL)); - NFS_BUF_MAP(eofbp); - FSDBG(516, eofbp, eofoff, biosize - eofoff, 0xe0fff01e); - d = eofbp->nb_data; - i = eofoff/PAGE_SIZE; - while (eofoff < biosize) { - int poff = eofoff & PAGE_MASK; - if (!poff || NBPGVALID(eofbp,i)) { - bzero(d + eofoff, PAGE_SIZE - poff); - NBPGVALID_SET(eofbp, i); - } - if (bp->nb_validend == eofoff) - bp->nb_validend += PAGE_SIZE - poff; - eofoff += PAGE_SIZE - poff; - i++; - } - nfs_buf_release(eofbp, 1); + bp->nb_error = error; + SET(bp->nb_flags, NB_ERROR); + if (ISSET(bp->nb_flags, NB_MULTASYNCRPC)) { + nrpcs = (length + nmwsize - 1) / nmwsize; + lck_mtx_lock(nfs_buf_mutex); + bp->nb_rpcs -= nrpcs; + if (bp->nb_rpcs == 0) { + /* No RPCs left, so the buffer's done */ + lck_mtx_unlock(nfs_buf_mutex); + nfs_buf_write_finish(bp, thd, cred); + } else { + /* wait for the last RPC to mark it done */ + while (bp->nb_rpcs > 0) + msleep(&bp->nb_rpcs, nfs_buf_mutex, 0, + "nfs_buf_write_rpc_cancel", NULL); + lck_mtx_unlock(nfs_buf_mutex); } + } else { + nfs_buf_write_finish(bp, thd, cred); } - /* - * If dirtyend exceeds file size, chop it down. This should - * not occur unless there is a race. - */ - if (NBOFF(bp) + bp->nb_dirtyend > np->n_size) - bp->nb_dirtyend = np->n_size - NBOFF(bp); - /* - * UBC doesn't handle partial pages, so we need to make sure - * that any pages left in the page cache are completely valid. - * - * Writes that are smaller than a block are delayed if they - * don't extend to the end of the block. - * - * If the block isn't (completely) cached, we may need to read - * in some parts of pages that aren't covered by the write. - * If the write offset (on) isn't page aligned, we'll need to - * read the start of the first page being written to. Likewise, - * if the offset of the end of the write (on+n) isn't page aligned, - * we'll need to read the end of the last page being written to. - * - * Notes: - * We don't want to read anything we're just going to write over. - * We don't want to issue multiple I/Os if we don't have to - * (because they're synchronous rpcs). - * We don't want to read anything we already have modified in the - * page cache. - */ - if (!ISSET(bp->nb_flags, NB_CACHE) && n < biosize) { - int firstpg, lastpg, dirtypg; - int firstpgoff, lastpgoff; - start = end = -1; - firstpg = on/PAGE_SIZE; - firstpgoff = on & PAGE_MASK; - lastpg = (on+n-1)/PAGE_SIZE; - lastpgoff = (on+n) & PAGE_MASK; - if (firstpgoff && !NBPGVALID(bp,firstpg)) { - /* need to read start of first page */ - start = firstpg * PAGE_SIZE; - end = start + firstpgoff; - } - if (lastpgoff && !NBPGVALID(bp,lastpg)) { - /* need to read end of last page */ - if (start < 0) - start = (lastpg * PAGE_SIZE) + lastpgoff; - end = (lastpg + 1) * PAGE_SIZE; - } - if (end > start) { - /* need to read the data in range: start...end-1 */ - - /* first, check for dirty pages in between */ - /* if there are, we'll have to do two reads because */ - /* we don't want to overwrite the dirty pages. */ - for (dirtypg=start/PAGE_SIZE; dirtypg <= (end-1)/PAGE_SIZE; dirtypg++) - if (NBPGDIRTY(bp,dirtypg)) - break; - - /* if start is at beginning of page, try */ - /* to get any preceeding pages as well. */ - if (!(start & PAGE_MASK)) { - /* stop at next dirty/valid page or start of block */ - for (; start > 0; start-=PAGE_SIZE) - if (NBPGVALID(bp,((start-1)/PAGE_SIZE))) - break; - } - - NFS_BUF_MAP(bp); - /* setup uio for read(s) */ - boff = NBOFF(bp); - auio.uio_iov = &iov; - auio.uio_iovcnt = 1; - auio.uio_segflg = UIO_SYSSPACE; - auio.uio_rw = UIO_READ; - auio.uio_procp = p; - - if (dirtypg <= (end-1)/PAGE_SIZE) { - /* there's a dirty page in the way, so just do two reads */ - /* we'll read the preceding data here */ - auio.uio_offset = boff + start; - auio.uio_resid = iov.iov_len = on - start; - iov.iov_base = bp->nb_data + start; - error = nfs_readrpc(vp, &auio, cred); - if (error) { - bp->nb_error = error; - SET(bp->nb_flags, NB_ERROR); - printf("nfs_write: readrpc %d", error); - } - if (auio.uio_resid > 0) { - FSDBG(516, bp, iov.iov_base - bp->nb_data, auio.uio_resid, 0xd00dee01); - bzero(iov.iov_base, auio.uio_resid); - } - /* update validoff/validend if necessary */ - if ((bp->nb_validoff < 0) || (bp->nb_validoff > start)) - bp->nb_validoff = start; - if ((bp->nb_validend < 0) || (bp->nb_validend < on)) - bp->nb_validend = on; - if (np->n_size > boff + bp->nb_validend) - bp->nb_validend = min(np->n_size - (boff + start), biosize); - /* validate any pages before the write offset */ - for (; start < on/PAGE_SIZE; start+=PAGE_SIZE) - NBPGVALID_SET(bp, start/PAGE_SIZE); - /* adjust start to read any trailing data */ - start = on+n; - } + /* It may have just been an interrupt... that's OK */ + if (!ISSET(bp->nb_flags, NB_ERROR)) + error = 0; + } - /* if end is at end of page, try to */ - /* get any following pages as well. */ - if (!(end & PAGE_MASK)) { - /* stop at next valid page or end of block */ - for (; end < bufsize; end+=PAGE_SIZE) - if (NBPGVALID(bp,end/PAGE_SIZE)) - break; - } + return (error); +} - if (((boff+start) >= cureof) || ((start >= on) && ((boff + on + n) >= cureof))) { +/* + * finish up an NFS WRITE RPC on a buffer + */ +void +nfs_buf_write_rpc_finish(struct nfsreq *req) +{ + int error = 0, nfsvers, offset, length, multasyncrpc, finished; + int committed = NFS_WRITE_FILESYNC; + uint64_t wverf = 0; + size_t rlen; + void *wakeme = NULL; + struct nfsreq_cbinfo cb; + struct nfsreq *wreq = NULL; + struct nfsbuf *bp; + struct nfsmount *nmp; + nfsnode_t np; + thread_t thd; + kauth_cred_t cred; + uio_t auio; + char uio_buf [ UIO_SIZEOF(1) ]; + +finish: + np = req->r_np; + thd = req->r_thread; + cred = req->r_cred; + if (IS_VALID_CRED(cred)) + kauth_cred_ref(cred); + cb = req->r_callback; + bp = cb.rcb_bp; + if (cb.rcb_func) /* take an extra reference on the nfsreq in case we want to resend it later due to grace error */ + nfs_request_ref(req, 0); + + nmp = NFSTONMP(np); + if (!nmp) { + SET(bp->nb_flags, NB_ERROR); + bp->nb_error = error = ENXIO; + } + if (error || ISSET(bp->nb_flags, NB_ERROR)) { + /* just drop it */ + nfs_request_async_cancel(req); + goto out; + } + nfsvers = nmp->nm_vers; + + offset = cb.rcb_args[0]; + rlen = length = cb.rcb_args[1]; + + /* finish the RPC */ + error = nmp->nm_funcs->nf_write_rpc_async_finish(np, req, &committed, &rlen, &wverf); + if ((error == EINPROGRESS) && cb.rcb_func) { + /* async request restarted */ + if (cb.rcb_func) + nfs_request_rele(req); + if (IS_VALID_CRED(cred)) + kauth_cred_unref(&cred); + return; + } + if ((nmp->nm_vers >= NFS_VER4) && nfs_mount_state_error_should_restart(error) && !ISSET(bp->nb_flags, NB_ERROR)) { + lck_mtx_lock(&nmp->nm_lock); + if ((error != NFSERR_OLD_STATEID) && (error != NFSERR_GRACE) && (cb.rcb_args[2] == nmp->nm_stategenid)) { + NP(np, "nfs_buf_write_rpc_finish: error %d @ 0x%llx, 0x%x 0x%x, initiating recovery", + error, NBOFF(bp)+offset, cb.rcb_args[2], nmp->nm_stategenid); + nfs_need_recover(nmp, error); + } + lck_mtx_unlock(&nmp->nm_lock); + if (np->n_flag & NREVOKE) { + error = EIO; + } else { + if (error == NFSERR_GRACE) { + if (cb.rcb_func) { /* - * Either this entire read is beyond the current EOF - * or the range that we won't be modifying (on+n...end) - * is all beyond the current EOF. - * No need to make a trip across the network to - * read nothing. So, just zero the buffer instead. + * For an async I/O request, handle a grace delay just like + * jukebox errors. Set the resend time and queue it up. */ - FSDBG(516, bp, start, end - start, 0xd00dee00); - bzero(bp->nb_data + start, end - start); - } else { - /* now we'll read the (rest of the) data */ - auio.uio_offset = boff + start; - auio.uio_resid = iov.iov_len = end - start; - iov.iov_base = bp->nb_data + start; - error = nfs_readrpc(vp, &auio, cred); - if (error) { - bp->nb_error = error; - SET(bp->nb_flags, NB_ERROR); - printf("nfs_write: readrpc %d", error); - } - if (auio.uio_resid > 0) { - FSDBG(516, bp, iov.iov_base - bp->nb_data, auio.uio_resid, 0xd00dee02); - bzero(iov.iov_base, auio.uio_resid); + struct timeval now; + if (req->r_nmrep.nmc_mhead) { + mbuf_freem(req->r_nmrep.nmc_mhead); + req->r_nmrep.nmc_mhead = NULL; } + req->r_error = 0; + microuptime(&now); + lck_mtx_lock(&req->r_mtx); + req->r_resendtime = now.tv_sec + 2; + req->r_xid = 0; // get a new XID + req->r_flags |= R_RESTART; + req->r_start = 0; + nfs_asyncio_resend(req); + lck_mtx_unlock(&req->r_mtx); + if (IS_VALID_CRED(cred)) + kauth_cred_unref(&cred); + /* Note: nfsreq reference taken will be dropped later when finished */ + return; } - /* update validoff/validend if necessary */ - if ((bp->nb_validoff < 0) || (bp->nb_validoff > start)) - bp->nb_validoff = start; - if ((bp->nb_validend < 0) || (bp->nb_validend < end)) - bp->nb_validend = end; - if (np->n_size > boff + bp->nb_validend) - bp->nb_validend = min(np->n_size - (boff + start), biosize); - /* validate any pages before the write offset's page */ - for (; start < trunc_page_32(on); start+=PAGE_SIZE) - NBPGVALID_SET(bp, start/PAGE_SIZE); - /* validate any pages after the range of pages being written to */ - for (; (end - 1) > round_page_32(on+n-1); end-=PAGE_SIZE) - NBPGVALID_SET(bp, (end-1)/PAGE_SIZE); - /* Note: pages being written to will be validated when written */ + /* otherwise, just pause a couple seconds and retry */ + tsleep(&nmp->nm_state, (PZERO-1), "nfsgrace", 2*hz); + } + if (!(error = nfs_mount_state_wait_for_recovery(nmp))) { + rlen = 0; + goto writeagain; } } + } + if (error) { + SET(bp->nb_flags, NB_ERROR); + bp->nb_error = error; + } + if (error || (nfsvers == NFS_VER2)) + goto out; + if (rlen <= 0) { + SET(bp->nb_flags, NB_ERROR); + bp->nb_error = error = EIO; + goto out; + } - if (ISSET(bp->nb_flags, NB_ERROR)) { - error = bp->nb_error; - nfs_buf_release(bp, 1); - FSDBG_BOT(515, vp, uio->uio_offset, uio->uio_resid, error); - return (error); + /* save lowest commit level returned */ + if (committed < bp->nb_commitlevel) + bp->nb_commitlevel = committed; + + /* check the write verifier */ + if (!bp->nb_verf) { + bp->nb_verf = wverf; + } else if (bp->nb_verf != wverf) { + /* verifier changed, so buffer will need to be rewritten */ + bp->nb_flags |= NB_STALEWVERF; + bp->nb_commitlevel = NFS_WRITE_UNSTABLE; + bp->nb_verf = wverf; + } + + /* + * check for a short write + * + * If the server didn't write all the data, then we + * need to issue another write for the rest of it. + * (Don't bother if the buffer hit an error or stale wverf.) + */ + if (((int)rlen < length) && !(bp->nb_flags & (NB_STALEWVERF|NB_ERROR))) { +writeagain: + offset += rlen; + length -= rlen; + + auio = uio_createwithbuffer(1, NBOFF(bp) + offset, UIO_SYSSPACE, + UIO_WRITE, &uio_buf, sizeof(uio_buf)); + uio_addiov(auio, CAST_USER_ADDR_T(bp->nb_data + offset), length); + + cb.rcb_args[0] = offset; + cb.rcb_args[1] = length; + if (nmp->nm_vers >= NFS_VER4) + cb.rcb_args[2] = nmp->nm_stategenid; + + // XXX iomode should really match the original request + error = nmp->nm_funcs->nf_write_rpc_async(np, auio, length, thd, cred, + NFS_WRITE_FILESYNC, &cb, &wreq); + if (!error) { + if (IS_VALID_CRED(cred)) + kauth_cred_unref(&cred); + if (!cb.rcb_func) { + /* if !async we'll need to wait for this RPC to finish */ + req = wreq; + wreq = NULL; + goto finish; + } + nfs_request_rele(req); + /* + * We're done here. + * Outstanding RPC count is unchanged. + * Callback will be called when RPC is done. + */ + return; } + SET(bp->nb_flags, NB_ERROR); + bp->nb_error = error; + } + +out: + if (cb.rcb_func) { + nfs_async_write_done(nmp); + nfs_request_rele(req); + } + /* + * Decrement outstanding RPC count on buffer + * and call nfs_buf_write_finish on last RPC. + * + * (Note: when there are multiple async RPCs issued for a + * buffer we need nfs_buffer_mutex to avoid problems when + * aborting a partially-initiated set of RPCs) + */ + multasyncrpc = ISSET(bp->nb_flags, NB_MULTASYNCRPC); + if (multasyncrpc) + lck_mtx_lock(nfs_buf_mutex); + + bp->nb_rpcs--; + finished = (bp->nb_rpcs == 0); + + if (multasyncrpc) + lck_mtx_unlock(nfs_buf_mutex); + + if (finished) { + if (multasyncrpc) + wakeme = &bp->nb_rpcs; + nfs_buf_write_finish(bp, thd, cred); + if (wakeme) + wakeup(wakeme); + } + if (IS_VALID_CRED(cred)) + kauth_cred_unref(&cred); +} + +/* + * Send commit(s) for the given node's "needcommit" buffers + */ +int +nfs_flushcommits(nfsnode_t np, int nowait) +{ + struct nfsmount *nmp; + struct nfsbuf *bp, *prevlbp, *lbp; + struct nfsbuflists blist, commitlist; + int error = 0, retv, wcred_set, flags, dirty; + u_quad_t off, endoff, toff; + uint64_t wverf; + u_int32_t count; + kauth_cred_t wcred = NULL; + + FSDBG_TOP(557, np, 0, 0, 0); + + /* + * A nb_flags == (NB_DELWRI | NB_NEEDCOMMIT) block has been written to the + * server, but nas not been committed to stable storage on the server + * yet. The byte range is worked out for as many nfsbufs as we can handle + * and the commit rpc is done. + */ + if (!LIST_EMPTY(&np->n_dirtyblkhd)) { + error = nfs_node_lock(np); + if (error) + goto done; np->n_flag |= NMODIFIED; + nfs_node_unlock(np); + } - /* - * Check for valid write lease and get one as required. - * In case nfs_buf_get() and/or nfs_buf_write() delayed us. - */ - if ((nmp->nm_flag & NFSMNT_NQNFS) && - NQNFS_CKINVALID(vp, np, ND_WRITE)) { - do { - error = nqnfs_getlease(vp, ND_WRITE, cred, p); - } while (error == NQNFS_EXPIRED); - if (error) { - nfs_buf_release(bp, 1); - FSDBG_BOT(515, vp, uio->uio_offset, 0x11220001, error); - return (error); + off = (u_quad_t)-1; + endoff = 0; + wcred_set = 0; + LIST_INIT(&commitlist); + + nmp = NFSTONMP(np); + if (!nmp) { + error = ENXIO; + goto done; + } + if (nmp->nm_vers == NFS_VER2) { + error = EINVAL; + goto done; + } + + flags = NBI_DIRTY; + if (nowait) + flags |= NBI_NOWAIT; + lck_mtx_lock(nfs_buf_mutex); + wverf = nmp->nm_verf; + if (!nfs_buf_iterprepare(np, &blist, flags)) { + while ((bp = LIST_FIRST(&blist))) { + LIST_REMOVE(bp, nb_vnbufs); + LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs); + error = nfs_buf_acquire(bp, NBAC_NOWAIT, 0, 0); + if (error) + continue; + if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) + nfs_buf_check_write_verifier(np, bp); + if (((bp->nb_flags & (NB_DELWRI | NB_NEEDCOMMIT)) != (NB_DELWRI | NB_NEEDCOMMIT)) || + (bp->nb_verf != wverf)) { + nfs_buf_drop(bp); + continue; } - if (np->n_lrev != np->n_brev || - (np->n_flag & NQNFSNONCACHE)) { - nfs_buf_release(bp, 1); - error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); - if (error) { - FSDBG_BOT(515, vp, uio->uio_offset, 0x11220002, error); - return (error); - } - np->n_brev = np->n_lrev; - goto again; + nfs_buf_remfree(bp); + + /* buffer UPLs will be grabbed *in order* below */ + + FSDBG(557, bp, bp->nb_flags, bp->nb_valid, bp->nb_dirty); + FSDBG(557, bp->nb_validoff, bp->nb_validend, + bp->nb_dirtyoff, bp->nb_dirtyend); + + /* + * Work out if all buffers are using the same cred + * so we can deal with them all with one commit. + * + * Note: creds in bp's must be obtained by kauth_cred_ref + * on the same original cred in order for them to be equal. + */ + if (wcred_set == 0) { + wcred = bp->nb_wcred; + if (!IS_VALID_CRED(wcred)) + panic("nfs: needcommit w/out wcred"); + wcred_set = 1; + } else if ((wcred_set == 1) && wcred != bp->nb_wcred) { + wcred_set = -1; } - } - NFS_BUF_MAP(bp); - error = uiomove((char *)bp->nb_data + on, n, uio); - if (error) { - SET(bp->nb_flags, NB_ERROR); - nfs_buf_release(bp, 1); - FSDBG_BOT(515, vp, uio->uio_offset, uio->uio_resid, error); - return (error); - } + SET(bp->nb_flags, NB_WRITEINPROG); - /* validate any pages written to */ - start = on & ~PAGE_MASK; - for (; start < on+n; start += PAGE_SIZE) { - NBPGVALID_SET(bp, start/PAGE_SIZE); /* - * This may seem a little weird, but we don't actually set the - * dirty bits for writes. This is because we keep the dirty range - * in the nb_dirtyoff/nb_dirtyend fields. Also, particularly for - * delayed writes, when we give the pages back to the VM we don't - * want to keep them marked dirty, because when we later write the - * buffer we won't be able to tell which pages were written dirty - * and which pages were mmapped and dirtied. + * Add this buffer to the list of buffers we are committing. + * Buffers are inserted into the list in ascending order so that + * we can take the UPLs in order after the list is complete. */ + prevlbp = NULL; + LIST_FOREACH(lbp, &commitlist, nb_vnbufs) { + if (bp->nb_lblkno < lbp->nb_lblkno) + break; + prevlbp = lbp; + } + LIST_REMOVE(bp, nb_vnbufs); + if (prevlbp) + LIST_INSERT_AFTER(prevlbp, bp, nb_vnbufs); + else + LIST_INSERT_HEAD(&commitlist, bp, nb_vnbufs); + + /* update commit range start, end */ + toff = NBOFF(bp) + bp->nb_dirtyoff; + if (toff < off) + off = toff; + toff += (u_quad_t)(bp->nb_dirtyend - bp->nb_dirtyoff); + if (toff > endoff) + endoff = toff; } - if (bp->nb_dirtyend > 0) { - bp->nb_dirtyoff = min(on, bp->nb_dirtyoff); - bp->nb_dirtyend = max((on + n), bp->nb_dirtyend); - } else { - bp->nb_dirtyoff = on; - bp->nb_dirtyend = on + n; - } - if (bp->nb_validend <= 0 || bp->nb_validend < bp->nb_dirtyoff || - bp->nb_validoff > bp->nb_dirtyend) { - bp->nb_validoff = bp->nb_dirtyoff; - bp->nb_validend = bp->nb_dirtyend; - } else { - bp->nb_validoff = min(bp->nb_validoff, bp->nb_dirtyoff); - bp->nb_validend = max(bp->nb_validend, bp->nb_dirtyend); + nfs_buf_itercomplete(np, &blist, NBI_DIRTY); + } + lck_mtx_unlock(nfs_buf_mutex); + + if (LIST_EMPTY(&commitlist)) { + error = ENOBUFS; + goto done; + } + + /* + * We need a UPL to prevent others from accessing the buffers during + * our commit RPC(s). + * + * We used to also check for dirty pages here; if there were any we'd + * abort the commit and force the entire buffer to be written again. + * Instead of doing that, we just go ahead and commit the dirty range, + * and then leave the buffer around with dirty pages that will be + * written out later. + */ + LIST_FOREACH(bp, &commitlist, nb_vnbufs) { + if (!ISSET(bp->nb_flags, NB_PAGELIST)) { + retv = nfs_buf_upl_setup(bp); + if (retv) { + /* Unable to create the UPL, the VM object probably no longer exists. */ + printf("nfs_flushcommits: upl create failed %d\n", retv); + bp->nb_valid = bp->nb_dirty = 0; + } } - if (!ISSET(bp->nb_flags, NB_CACHE)) - nfs_buf_normalize_valid_range(np, bp); + nfs_buf_upl_check(bp); + } + /* + * Commit data on the server, as required. + * If all bufs are using the same wcred, then use that with + * one call for all of them, otherwise commit each one + * separately. + */ + if (wcred_set == 1) { /* - * Since this block is being modified, it must be written - * again and not just committed. + * Note, it's possible the commit range could be >2^32-1. + * If it is, we'll send one commit that covers the whole file. */ - if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) { - np->n_needcommitcnt--; - CHECK_NEEDCOMMITCNT(np); + if ((endoff - off) > 0xffffffff) + count = 0; + else + count = (endoff - off); + retv = nmp->nm_funcs->nf_commit_rpc(np, off, count, wcred, wverf); + } else { + retv = 0; + LIST_FOREACH(bp, &commitlist, nb_vnbufs) { + toff = NBOFF(bp) + bp->nb_dirtyoff; + count = bp->nb_dirtyend - bp->nb_dirtyoff; + retv = nmp->nm_funcs->nf_commit_rpc(np, toff, count, bp->nb_wcred, wverf); + if (retv) + break; } - CLR(bp->nb_flags, NB_NEEDCOMMIT); + } + + /* + * Now, either mark the blocks I/O done or mark the + * blocks dirty, depending on whether the commit + * succeeded. + */ + while ((bp = LIST_FIRST(&commitlist))) { + LIST_REMOVE(bp, nb_vnbufs); + FSDBG(557, bp, retv, bp->nb_flags, bp->nb_dirty); + nfs_node_lock_force(np); + CLR(bp->nb_flags, (NB_NEEDCOMMIT | NB_WRITEINPROG)); + np->n_needcommitcnt--; + CHECK_NEEDCOMMITCNT(np); + nfs_node_unlock(np); + + if (retv) { + /* move back to dirty list */ + lck_mtx_lock(nfs_buf_mutex); + LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs); + lck_mtx_unlock(nfs_buf_mutex); + nfs_buf_release(bp, 1); + continue; + } + + nfs_node_lock_force(np); + np->n_numoutput++; + nfs_node_unlock(np); + vnode_startwrite(NFSTOV(np)); + if (ISSET(bp->nb_flags, NB_DELWRI)) { + lck_mtx_lock(nfs_buf_mutex); + nfs_nbdwrite--; + NFSBUFCNTCHK(); + lck_mtx_unlock(nfs_buf_mutex); + wakeup(&nfs_nbdwrite); + } + CLR(bp->nb_flags, (NB_READ|NB_DONE|NB_ERROR|NB_DELWRI)); + /* if block still has dirty pages, we don't want it to */ + /* be released in nfs_buf_iodone(). So, don't set NB_ASYNC. */ + if (!(dirty = bp->nb_dirty)) + SET(bp->nb_flags, NB_ASYNC); + else + CLR(bp->nb_flags, NB_ASYNC); + + /* move to clean list */ + lck_mtx_lock(nfs_buf_mutex); + LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs); + lck_mtx_unlock(nfs_buf_mutex); + + bp->nb_dirtyoff = bp->nb_dirtyend = 0; + + nfs_buf_iodone(bp); + if (dirty) { + /* throw it back in as a delayed write buffer */ + CLR(bp->nb_flags, NB_DONE); + nfs_buf_write_delayed(bp); + } + } + +done: + FSDBG_BOT(557, np, 0, 0, error); + return (error); +} + +/* + * Flush all the blocks associated with a vnode. + * Walk through the buffer pool and push any dirty pages + * associated with the vnode. + */ +int +nfs_flush(nfsnode_t np, int waitfor, thread_t thd, int ignore_writeerr) +{ + struct nfsbuf *bp; + struct nfsbuflists blist; + struct nfsmount *nmp = NFSTONMP(np); + int error = 0, error2, slptimeo = 0, slpflag = 0; + int nfsvers, flags, passone = 1; + + FSDBG_TOP(517, np, waitfor, ignore_writeerr, 0); + + if (!nmp) { + error = ENXIO; + goto out; + } + nfsvers = nmp->nm_vers; + if (NMFLAG(nmp, INTR)) + slpflag = PCATCH; + + if (!LIST_EMPTY(&np->n_dirtyblkhd)) { + nfs_node_lock_force(np); + np->n_flag |= NMODIFIED; + nfs_node_unlock(np); + } + + lck_mtx_lock(nfs_buf_mutex); + while (np->n_bflag & NBFLUSHINPROG) { + np->n_bflag |= NBFLUSHWANT; + error = msleep(&np->n_bflag, nfs_buf_mutex, slpflag, "nfs_flush", NULL); + if ((error && (error != EWOULDBLOCK)) || + ((error = nfs_sigintr(NFSTONMP(np), NULL, thd, 0)))) { + lck_mtx_unlock(nfs_buf_mutex); + goto out; + } + } + np->n_bflag |= NBFLUSHINPROG; + + /* + * On the first pass, start async/unstable writes on all + * delayed write buffers. Then wait for all writes to complete + * and call nfs_flushcommits() to commit any uncommitted buffers. + * On all subsequent passes, start STABLE writes on any remaining + * dirty buffers. Then wait for all writes to complete. + */ +again: + FSDBG(518, LIST_FIRST(&np->n_dirtyblkhd), np->n_flag, 0, 0); + if (!NFSTONMP(np)) { + lck_mtx_unlock(nfs_buf_mutex); + error = ENXIO; + goto done; + } - if ((np->n_flag & NQNFSNONCACHE) || - (ioflag & IO_SYNC) || (vp->v_flag & VNOCACHE_DATA)) { - bp->nb_proc = p; - error = nfs_buf_write(bp); - if (error) { - FSDBG_BOT(515, vp, uio->uio_offset, - uio->uio_resid, error); - return (error); - } - if (np->n_flag & NQNFSNONCACHE) { - error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); + /* Start/do any write(s) that are required. */ + if (!nfs_buf_iterprepare(np, &blist, NBI_DIRTY)) { + while ((bp = LIST_FIRST(&blist))) { + LIST_REMOVE(bp, nb_vnbufs); + LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs); + flags = (passone || !(waitfor == MNT_WAIT || waitfor == MNT_DWAIT)) ? NBAC_NOWAIT : 0; + if (flags != NBAC_NOWAIT) + nfs_buf_refget(bp); + while ((error = nfs_buf_acquire(bp, flags, slpflag, slptimeo))) { + FSDBG(524, bp, flags, bp->nb_lflags, bp->nb_flags); + if (error == EBUSY) + break; if (error) { - FSDBG_BOT(515, vp, uio->uio_offset, - uio->uio_resid, error); - return (error); + error2 = nfs_sigintr(NFSTONMP(np), NULL, thd, 0); + if (error2) { + if (flags != NBAC_NOWAIT) + nfs_buf_refrele(bp); + nfs_buf_itercomplete(np, &blist, NBI_DIRTY); + lck_mtx_unlock(nfs_buf_mutex); + error = error2; + goto done; + } + if (slpflag == PCATCH) { + slpflag = 0; + slptimeo = 2 * hz; + } } } - } else if ((n + on) == biosize && (nmp->nm_flag & NFSMNT_NQNFS) == 0) { - bp->nb_proc = (struct proc *)0; + if (flags != NBAC_NOWAIT) + nfs_buf_refrele(bp); + if (error == EBUSY) + continue; + if (!bp->nb_np) { + /* buffer is no longer valid */ + nfs_buf_drop(bp); + continue; + } + if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) + nfs_buf_check_write_verifier(np, bp); + if (!ISSET(bp->nb_flags, NB_DELWRI)) { + /* buffer is no longer dirty */ + nfs_buf_drop(bp); + continue; + } + FSDBG(525, bp, passone, bp->nb_lflags, bp->nb_flags); + if ((passone || !(waitfor == MNT_WAIT || waitfor == MNT_DWAIT)) && + ISSET(bp->nb_flags, NB_NEEDCOMMIT)) { + nfs_buf_drop(bp); + continue; + } + nfs_buf_remfree(bp); + lck_mtx_unlock(nfs_buf_mutex); + if (ISSET(bp->nb_flags, NB_ERROR)) { + nfs_node_lock_force(np); + np->n_error = bp->nb_error ? bp->nb_error : EIO; + np->n_flag |= NWRITEERR; + nfs_node_unlock(np); + nfs_buf_release(bp, 1); + lck_mtx_lock(nfs_buf_mutex); + continue; + } SET(bp->nb_flags, NB_ASYNC); + if (!passone) { + /* NB_STABLE forces this to be written FILESYNC */ + SET(bp->nb_flags, NB_STABLE); + } nfs_buf_write(bp); - } else - nfs_buf_write_delayed(bp); + lck_mtx_lock(nfs_buf_mutex); + } + nfs_buf_itercomplete(np, &blist, NBI_DIRTY); + } + lck_mtx_unlock(nfs_buf_mutex); + + if (waitfor == MNT_WAIT || waitfor == MNT_DWAIT) { + while ((error = vnode_waitforwrites(NFSTOV(np), 0, slpflag, slptimeo, "nfsflush"))) { + error2 = nfs_sigintr(NFSTONMP(np), NULL, thd, 0); + if (error2) { + error = error2; + goto done; + } + if (slpflag == PCATCH) { + slpflag = 0; + slptimeo = 2 * hz; + } + } + } - if (np->n_needcommitcnt > (nbuf/16)) - nfs_flushcommits(vp, p); + if (nfsvers != NFS_VER2) { + /* loop while it looks like there are still buffers to be */ + /* commited and nfs_flushcommits() seems to be handling them. */ + while (np->n_needcommitcnt) + if (nfs_flushcommits(np, 0)) + break; + } - } while (uio->uio_resid > 0 && n > 0); + if (passone) { + passone = 0; + if (!LIST_EMPTY(&np->n_dirtyblkhd)) { + nfs_node_lock_force(np); + np->n_flag |= NMODIFIED; + nfs_node_unlock(np); + } + lck_mtx_lock(nfs_buf_mutex); + goto again; + } - FSDBG_BOT(515, vp, uio->uio_offset, uio->uio_resid, 0); - return (0); + if (waitfor == MNT_WAIT || waitfor == MNT_DWAIT) { + if (!LIST_EMPTY(&np->n_dirtyblkhd)) { + nfs_node_lock_force(np); + np->n_flag |= NMODIFIED; + nfs_node_unlock(np); + } + lck_mtx_lock(nfs_buf_mutex); + if (!LIST_EMPTY(&np->n_dirtyblkhd)) + goto again; + lck_mtx_unlock(nfs_buf_mutex); + nfs_node_lock_force(np); + /* + * OK, it looks like there are no dirty blocks. If we have no + * writes in flight and no one in the write code, we can clear + * the modified flag. In order to make sure we see the latest + * attributes and size, we also invalidate the attributes and + * advance the attribute cache XID to guarantee that attributes + * newer than our clearing of NMODIFIED will get loaded next. + * (If we don't do this, it's possible for the flush's final + * write/commit (xid1) to be executed in parallel with a subsequent + * getattr request (xid2). The getattr could return attributes + * from *before* the write/commit completed but the stale attributes + * would be preferred because of the xid ordering.) + */ + if (!np->n_wrbusy && !np->n_numoutput) { + np->n_flag &= ~NMODIFIED; + NATTRINVALIDATE(np); + nfs_get_xid(&np->n_xid); + } + } else { + nfs_node_lock_force(np); + } + + FSDBG(526, np->n_flag, np->n_error, 0, 0); + if (!ignore_writeerr && (np->n_flag & NWRITEERR)) { + error = np->n_error; + np->n_flag &= ~NWRITEERR; + } + nfs_node_unlock(np); +done: + lck_mtx_lock(nfs_buf_mutex); + flags = np->n_bflag; + np->n_bflag &= ~(NBFLUSHINPROG|NBFLUSHWANT); + lck_mtx_unlock(nfs_buf_mutex); + if (flags & NBFLUSHWANT) + wakeup(&np->n_bflag); +out: + FSDBG_BOT(517, np, error, ignore_writeerr, 0); + return (error); } /* * Flush out and invalidate all buffers associated with a vnode. * Called with the underlying object locked. */ -static int -nfs_vinvalbuf_internal(vp, flags, cred, p, slpflag, slptimeo) - register struct vnode *vp; - int flags; - struct ucred *cred; - struct proc *p; - int slpflag, slptimeo; +int +nfs_vinvalbuf_internal( + nfsnode_t np, + int flags, + thread_t thd, + kauth_cred_t cred, + int slpflag, + int slptimeo) { struct nfsbuf *bp; - struct nfsbuf *nbp, *blist; - int s, error = 0; - struct nfsnode *np = VTONFS(vp); + struct nfsbuflists blist; + int list, error = 0; if (flags & V_SAVE) { - if (error = VOP_FSYNC(vp, cred, MNT_WAIT, p)) + if ((error = nfs_flush(np, MNT_WAIT, thd, (flags & V_IGNORE_WRITEERR)))) return (error); - if (np->n_dirtyblkhd.lh_first) - panic("nfs_vinvalbuf: dirty bufs (vp 0x%x, bp 0x%x)", - vp, np->n_dirtyblkhd.lh_first); } + lck_mtx_lock(nfs_buf_mutex); for (;;) { - blist = np->n_cleanblkhd.lh_first; - if (!blist) - blist = np->n_dirtyblkhd.lh_first; - if (!blist) - break; - - for (bp = blist; bp; bp = nbp) { - nbp = bp->nb_vnbufs.le_next; - s = splbio(); - if (ISSET(bp->nb_flags, NB_BUSY)) { - SET(bp->nb_flags, NB_WANTED); - FSDBG_TOP(556, vp, bp, NBOFF(bp), bp->nb_flags); - error = tsleep((caddr_t)bp, - slpflag | (PRIBIO + 1), "nfs_vinvalbuf", - slptimeo); - FSDBG_BOT(556, vp, bp, NBOFF(bp), bp->nb_flags); - splx(s); - if (error) { - FSDBG(554, vp, bp, -1, error); + list = NBI_CLEAN; + if (nfs_buf_iterprepare(np, &blist, list)) { + list = NBI_DIRTY; + if (nfs_buf_iterprepare(np, &blist, list)) + break; + } + while ((bp = LIST_FIRST(&blist))) { + LIST_REMOVE(bp, nb_vnbufs); + if (list == NBI_CLEAN) + LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs); + else + LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs); + nfs_buf_refget(bp); + while ((error = nfs_buf_acquire(bp, NBAC_REMOVE, slpflag, slptimeo))) { + FSDBG(556, np, bp, NBOFF(bp), bp->nb_flags); + if (error != EAGAIN) { + FSDBG(554, np, bp, -1, error); + nfs_buf_refrele(bp); + nfs_buf_itercomplete(np, &blist, list); + lck_mtx_unlock(nfs_buf_mutex); return (error); } - break; } - FSDBG(554, vp, bp, NBOFF(bp), bp->nb_flags); - nfs_buf_remfree(bp); - SET(bp->nb_flags, NB_BUSY); - splx(s); - if ((flags & V_SAVE) && UBCINFOEXISTS(vp) && (NBOFF(bp) < np->n_size)) { - /* XXX extra paranoia: make sure we're not */ + nfs_buf_refrele(bp); + FSDBG(554, np, bp, NBOFF(bp), bp->nb_flags); + lck_mtx_unlock(nfs_buf_mutex); + if ((flags & V_SAVE) && UBCINFOEXISTS(NFSTOV(np)) && bp->nb_np && + (NBOFF(bp) < (off_t)np->n_size)) { + /* extra paranoia: make sure we're not */ /* somehow leaving any dirty data around */ int mustwrite = 0; - int end = (NBOFF(bp) + bp->nb_bufsize >= np->n_size) ? - bp->nb_bufsize : (np->n_size - NBOFF(bp)); + int end = (NBOFF(bp) + bp->nb_bufsize > (off_t)np->n_size) ? + ((off_t)np->n_size - NBOFF(bp)) : bp->nb_bufsize; if (!ISSET(bp->nb_flags, NB_PAGELIST)) { error = nfs_buf_upl_setup(bp); if (error == EINVAL) { @@ -2252,23 +3513,31 @@ nfs_vinvalbuf_internal(vp, flags, cred, p, slpflag, slptimeo) /* hopefully we don't need to do */ /* anything for this buffer */ } else if (error) - printf("nfs_vinvalbuf: upl setup failed %d\n", - error); + printf("nfs_vinvalbuf: upl setup failed %d\n", error); bp->nb_valid = bp->nb_dirty = 0; } nfs_buf_upl_check(bp); /* check for any dirty data before the EOF */ - if (bp->nb_dirtyend && bp->nb_dirtyoff < end) { + if ((bp->nb_dirtyend > 0) && (bp->nb_dirtyoff < end)) { /* clip dirty range to EOF */ - if (bp->nb_dirtyend > end) + if (bp->nb_dirtyend > end) { bp->nb_dirtyend = end; - mustwrite++; + if (bp->nb_dirtyoff >= bp->nb_dirtyend) + bp->nb_dirtyoff = bp->nb_dirtyend = 0; + } + if ((bp->nb_dirtyend > 0) && (bp->nb_dirtyoff < end)) + mustwrite++; } bp->nb_dirty &= (1 << (round_page_32(end)/PAGE_SIZE)) - 1; if (bp->nb_dirty) mustwrite++; + /* also make sure we'll have a credential to do the write */ + if (mustwrite && !IS_VALID_CRED(bp->nb_wcred) && !IS_VALID_CRED(cred)) { + printf("nfs_vinvalbuf: found dirty buffer with no write creds\n"); + mustwrite = 0; + } if (mustwrite) { - FSDBG(554, vp, bp, 0xd00dee, bp->nb_flags); + FSDBG(554, np, bp, 0xd00dee, bp->nb_flags); if (!ISSET(bp->nb_flags, NB_PAGELIST)) panic("nfs_vinvalbuf: dirty buffer without upl"); /* gotta write out dirty data before invalidating */ @@ -2276,31 +3545,61 @@ nfs_vinvalbuf_internal(vp, flags, cred, p, slpflag, slptimeo) /* (NB_NOCACHE indicates buffer should be discarded) */ CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL | NB_ASYNC)); SET(bp->nb_flags, NB_STABLE | NB_NOCACHE); - /* - * NFS has embedded ucred so crhold() risks zone corruption - */ - if (bp->nb_wcred == NOCRED) - bp->nb_wcred = crdup(cred); + if (!IS_VALID_CRED(bp->nb_wcred)) { + kauth_cred_ref(cred); + bp->nb_wcred = cred; + } error = nfs_buf_write(bp); // Note: bp has been released if (error) { FSDBG(554, bp, 0xd00dee, 0xbad, error); - np->n_error = error; - np->n_flag |= NWRITEERR; + nfs_node_lock_force(np); + if ((error != EINTR) && (error != ERESTART)) { + np->n_error = error; + np->n_flag |= NWRITEERR; + } + /* + * There was a write error and we need to + * invalidate attrs to sync with server. + * (if this write was extending the file, + * we may no longer know the correct size) + */ + NATTRINVALIDATE(np); + nfs_node_unlock(np); + if ((error == EINTR) || (error == ERESTART)) { + /* + * Abort on EINTR. If we don't, we could + * be stuck in this loop forever because + * the buffer will continue to stay dirty. + */ + lck_mtx_lock(nfs_buf_mutex); + nfs_buf_itercomplete(np, &blist, list); + lck_mtx_unlock(nfs_buf_mutex); + return (error); + } error = 0; } - break; + lck_mtx_lock(nfs_buf_mutex); + continue; } } SET(bp->nb_flags, NB_INVAL); - // Note: We don't want to do FREEUPs here because - // that may modify the buffer chain we're iterating! + // hold off on FREEUPs until we're done here nfs_buf_release(bp, 0); + lck_mtx_lock(nfs_buf_mutex); } + nfs_buf_itercomplete(np, &blist, list); } + if (!LIST_EMPTY(&(np)->n_dirtyblkhd) || !LIST_EMPTY(&(np)->n_cleanblkhd)) + panic("nfs_vinvalbuf: flush/inval failed"); + lck_mtx_unlock(nfs_buf_mutex); + nfs_node_lock_force(np); + if (!(flags & V_SAVE)) + np->n_flag &= ~NMODIFIED; + if (vnode_vtype(NFSTOV(np)) == VREG) + np->n_lastrahead = -1; + nfs_node_unlock(np); NFS_BUF_FREEUP(); - if (np->n_dirtyblkhd.lh_first || np->n_cleanblkhd.lh_first) - panic("nfs_vinvalbuf: flush failed"); return (0); } @@ -2310,21 +3609,23 @@ nfs_vinvalbuf_internal(vp, flags, cred, p, slpflag, slptimeo) * doing the flush, just wait for completion. */ int -nfs_vinvalbuf(vp, flags, cred, p, intrflg) - struct vnode *vp; - int flags; - struct ucred *cred; - struct proc *p; - int intrflg; +nfs_vinvalbuf(vnode_t vp, int flags, vfs_context_t ctx, int intrflg) +{ + return nfs_vinvalbuf2(vp, flags, vfs_context_thread(ctx), vfs_context_ucred(ctx), intrflg); +} + +int +nfs_vinvalbuf2(vnode_t vp, int flags, thread_t thd, kauth_cred_t cred, int intrflg) { - register struct nfsnode *np = VTONFS(vp); - struct nfsmount *nmp = VFSTONFS(vp->v_mount); - int error = 0, slpflag, slptimeo; - int didhold = 0; + nfsnode_t np = VTONFS(vp); + struct nfsmount *nmp = VTONMP(vp); + int error, slpflag, slptimeo, nflags, retry = 0; + struct timespec ts = { 2, 0 }; + off_t size; - FSDBG_TOP(554, vp, flags, intrflg, 0); + FSDBG_TOP(554, np, flags, intrflg, 0); - if (nmp && ((nmp->nm_flag & NFSMNT_INT) == 0)) + if (nmp && !NMFLAG(nmp, INTR)) intrflg = 0; if (intrflg) { slpflag = PCATCH; @@ -2333,591 +3634,216 @@ nfs_vinvalbuf(vp, flags, cred, p, intrflg) slpflag = 0; slptimeo = 0; } - /* - * First wait for any other process doing a flush to complete. - */ - while (np->n_flag & NFLUSHINPROG) { - np->n_flag |= NFLUSHWANT; - FSDBG_TOP(555, vp, flags, intrflg, np->n_flag); - error = tsleep((caddr_t)&np->n_flag, PRIBIO + 2, "nfsvinval", slptimeo); - FSDBG_BOT(555, vp, flags, intrflg, np->n_flag); - if (error && (error = nfs_sigintr(VFSTONFS(vp->v_mount), NULL, p))) { - FSDBG_BOT(554, vp, flags, intrflg, error); - return (error); - } - } - /* - * Now, flush as required. - */ - np->n_flag |= NFLUSHINPROG; - error = nfs_vinvalbuf_internal(vp, flags, cred, p, slpflag, 0); - while (error) { - FSDBG(554, vp, 0, 0, error); - error = nfs_sigintr(VFSTONFS(vp->v_mount), NULL, p); - if (error) { - np->n_flag &= ~NFLUSHINPROG; - if (np->n_flag & NFLUSHWANT) { - np->n_flag &= ~NFLUSHWANT; - wakeup((caddr_t)&np->n_flag); - } - FSDBG_BOT(554, vp, flags, intrflg, error); + /* First wait for any other process doing a flush to complete. */ + lck_mtx_lock(nfs_buf_mutex); + while (np->n_bflag & NBINVALINPROG) { + np->n_bflag |= NBINVALWANT; + msleep(&np->n_bflag, nfs_buf_mutex, slpflag, "nfs_vinvalbuf", &ts); + if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0))) { + lck_mtx_unlock(nfs_buf_mutex); return (error); } - error = nfs_vinvalbuf_internal(vp, flags, cred, p, 0, slptimeo); - } - np->n_flag &= ~(NMODIFIED | NFLUSHINPROG); - if (np->n_flag & NFLUSHWANT) { - np->n_flag &= ~NFLUSHWANT; - wakeup((caddr_t)&np->n_flag); - } - didhold = ubc_hold(vp); - if (didhold) { - int rv = ubc_clean(vp, 1); /* get the pages out of vm also */ - if (!rv) - panic("nfs_vinvalbuf(): ubc_clean failed!"); - ubc_rele(vp); + if (np->n_bflag & NBINVALINPROG) + slpflag = 0; } - FSDBG_BOT(554, vp, flags, intrflg, 0); - return (0); -} - -/* - * Initiate asynchronous I/O. Return an error if no nfsiods are available. - * This is mainly to avoid queueing async I/O requests when the nfsiods - * are all hung on a dead server. - */ -int -nfs_asyncio(bp, cred) - struct nfsbuf *bp; - struct ucred *cred; -{ - struct nfsmount *nmp; - int i; - int gotiod; - int slpflag = 0; - int slptimeo = 0; - int error, error2; - - if (nfs_numasync == 0) - return (EIO); - - FSDBG_TOP(552, bp, bp ? NBOFF(bp) : 0, bp ? bp->nb_flags : 0, 0); + np->n_bflag |= NBINVALINPROG; + lck_mtx_unlock(nfs_buf_mutex); - nmp = ((bp != NULL) ? VFSTONFS(bp->nb_vp->v_mount) : NULL); + /* Now, flush as required. */ again: - if (nmp && nmp->nm_flag & NFSMNT_INT) - slpflag = PCATCH; - gotiod = FALSE; + error = nfs_vinvalbuf_internal(np, flags, thd, cred, slpflag, 0); + while (error) { + FSDBG(554, np, 0, 0, error); + if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0))) + goto done; + error = nfs_vinvalbuf_internal(np, flags, thd, cred, 0, slptimeo); + } - /* no nfsbuf means tell nfsiod to process delwri list */ - if (!bp) - nfs_ioddelwri = 1; + /* get the pages out of vm also */ + if (UBCINFOEXISTS(vp) && (size = ubc_getsize(vp))) + if ((error = ubc_msync(vp, 0, size, NULL, UBC_PUSHALL | UBC_SYNC | UBC_INVALIDATE))) { + if (error == EINVAL) + panic("nfs_vinvalbuf(): ubc_msync failed!, error %d", error); + if (retry++ < 10) /* retry invalidating a few times */ + goto again; + /* give up */ + printf("nfs_vinvalbuf(): ubc_msync failed!, error %d", error); - /* - * Find a free iod to process this request. - */ - for (i = 0; i < NFS_MAXASYNCDAEMON; i++) - if (nfs_iodwant[i]) { - /* - * Found one, so wake it up and tell it which - * mount to process. - */ - NFS_DPF(ASYNCIO, - ("nfs_asyncio: waking iod %d for mount %p\n", - i, nmp)); - nfs_iodwant[i] = (struct proc *)0; - nfs_iodmount[i] = nmp; - if (nmp) - nmp->nm_bufqiods++; - wakeup((caddr_t)&nfs_iodwant[i]); - gotiod = TRUE; - break; } +done: + lck_mtx_lock(nfs_buf_mutex); + nflags = np->n_bflag; + np->n_bflag &= ~(NBINVALINPROG|NBINVALWANT); + lck_mtx_unlock(nfs_buf_mutex); + if (nflags & NBINVALWANT) + wakeup(&np->n_bflag); - /* if we're just poking the delwri list, we're done */ - if (!bp) - return (0); + FSDBG_BOT(554, np, flags, intrflg, error); + return (error); +} - /* - * If none are free, we may already have an iod working on this mount - * point. If so, it will process our request. - */ - if (!gotiod) { - if (nmp->nm_bufqiods > 0) { - NFS_DPF(ASYNCIO, - ("nfs_asyncio: %d iods are already processing mount %p\n", - nmp->nm_bufqiods, nmp)); - gotiod = TRUE; - } - } +/* + * Wait for any busy buffers to complete. + */ +void +nfs_wait_bufs(nfsnode_t np) +{ + struct nfsbuf *bp; + struct nfsbuflists blist; + int error = 0; - /* - * If we have an iod which can process the request, then queue - * the buffer. - */ - FSDBG(552, bp, gotiod, i, nmp->nm_bufqiods); - if (gotiod) { - /* - * Ensure that the queue never grows too large. - */ - while (nmp->nm_bufqlen >= 2*nfs_numasync) { - if (ISSET(bp->nb_flags, NB_IOD)) { - /* An nfsiod is attempting this async operation so */ - /* we must not fall asleep on the bufq because we */ - /* could be waiting on ourself. Just return error */ - /* and we'll do this operation syncrhonously. */ - goto out; - } - FSDBG(552, bp, nmp->nm_bufqlen, 2*nfs_numasync, -1); - NFS_DPF(ASYNCIO, - ("nfs_asyncio: waiting for mount %p queue to drain\n", nmp)); - nmp->nm_bufqwant = TRUE; - error = tsleep(&nmp->nm_bufq, slpflag | PRIBIO, - "nfsaio", slptimeo); - if (error) { - error2 = nfs_sigintr(nmp, NULL, bp->nb_proc); - if (error2) { - FSDBG_BOT(552, bp, NBOFF(bp), bp->nb_flags, error2); - return (error2); - } - if (slpflag == PCATCH) { - slpflag = 0; - slptimeo = 2 * hz; + lck_mtx_lock(nfs_buf_mutex); + if (!nfs_buf_iterprepare(np, &blist, NBI_CLEAN)) { + while ((bp = LIST_FIRST(&blist))) { + LIST_REMOVE(bp, nb_vnbufs); + LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs); + nfs_buf_refget(bp); + while ((error = nfs_buf_acquire(bp, 0, 0, 0))) { + if (error != EAGAIN) { + nfs_buf_refrele(bp); + nfs_buf_itercomplete(np, &blist, NBI_CLEAN); + lck_mtx_unlock(nfs_buf_mutex); + return; } } - /* - * We might have lost our iod while sleeping, - * so check and loop if nescessary. - */ - if (nmp->nm_bufqiods == 0) { - NFS_DPF(ASYNCIO, - ("nfs_asyncio: no iods after mount %p queue was drained, looping\n", nmp)); - goto again; - } + nfs_buf_refrele(bp); + nfs_buf_drop(bp); } - - if (ISSET(bp->nb_flags, NB_READ)) { - if (bp->nb_rcred == NOCRED && cred != NOCRED) { - /* - * NFS has embedded ucred. - * Can not crhold() here as that causes zone corruption - */ - bp->nb_rcred = crdup(cred); - } - } else { - SET(bp->nb_flags, NB_WRITEINPROG); - if (bp->nb_wcred == NOCRED && cred != NOCRED) { - /* - * NFS has embedded ucred. - * Can not crhold() here as that causes zone corruption - */ - bp->nb_wcred = crdup(cred); + nfs_buf_itercomplete(np, &blist, NBI_CLEAN); + } + if (!nfs_buf_iterprepare(np, &blist, NBI_DIRTY)) { + while ((bp = LIST_FIRST(&blist))) { + LIST_REMOVE(bp, nb_vnbufs); + LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs); + nfs_buf_refget(bp); + while ((error = nfs_buf_acquire(bp, 0, 0, 0))) { + if (error != EAGAIN) { + nfs_buf_refrele(bp); + nfs_buf_itercomplete(np, &blist, NBI_DIRTY); + lck_mtx_unlock(nfs_buf_mutex); + return; + } } + nfs_buf_refrele(bp); + nfs_buf_drop(bp); } - - TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, nb_free); - nmp->nm_bufqlen++; - FSDBG_BOT(552, bp, NBOFF(bp), bp->nb_flags, 0); - return (0); + nfs_buf_itercomplete(np, &blist, NBI_DIRTY); } - -out: - /* - * All the iods are busy on other mounts, so return EIO to - * force the caller to process the i/o synchronously. - */ - NFS_DPF(ASYNCIO, ("nfs_asyncio: no iods available, i/o is synchronous\n")); - FSDBG_BOT(552, bp, NBOFF(bp), bp->nb_flags, EIO); - return (EIO); + lck_mtx_unlock(nfs_buf_mutex); } + /* - * Do an I/O operation to/from a cache block. This may be called - * synchronously or from an nfsiod. + * Add an async I/O request to the mount's async I/O queue and make + * sure that an nfsiod will service it. */ -int -nfs_doio(bp, cr, p) - struct nfsbuf *bp; - struct ucred *cr; - struct proc *p; +void +nfs_asyncio_finish(struct nfsreq *req) { - register struct uio *uiop; - register struct vnode *vp; - struct nfsnode *np; struct nfsmount *nmp; - int error = 0, diff, len, iomode, must_commit = 0; - struct uio uio; - struct iovec io; - - vp = bp->nb_vp; - np = VTONFS(vp); - nmp = VFSTONFS(vp->v_mount); - uiop = &uio; - uiop->uio_iov = &io; - uiop->uio_iovcnt = 1; - uiop->uio_segflg = UIO_SYSSPACE; - uiop->uio_procp = p; + struct nfsiod *niod; + int started = 0; - /* - * we've decided to perform I/O for this block, - * so we couldn't possibly NB_DONE. So, clear it. - */ - if (ISSET(bp->nb_flags, NB_DONE)) { - if (!ISSET(bp->nb_flags, NB_ASYNC)) - panic("nfs_doio: done and not async"); - CLR(bp->nb_flags, NB_DONE); - } - FSDBG_TOP(256, np->n_size, NBOFF(bp), bp->nb_bufsize, bp->nb_flags); - FSDBG(257, bp->nb_validoff, bp->nb_validend, bp->nb_dirtyoff, - bp->nb_dirtyend); - - if (ISSET(bp->nb_flags, NB_READ)) { - if (vp->v_type == VREG) - NFS_BUF_MAP(bp); - io.iov_len = uiop->uio_resid = bp->nb_bufsize; - io.iov_base = bp->nb_data; - uiop->uio_rw = UIO_READ; - switch (vp->v_type) { - case VREG: - uiop->uio_offset = NBOFF(bp); - nfsstats.read_bios++; - error = nfs_readrpc(vp, uiop, cr); - FSDBG(262, np->n_size, NBOFF(bp), uiop->uio_resid, error); - if (!error) { - /* update valid range */ - bp->nb_validoff = 0; - if (uiop->uio_resid) { + FSDBG_TOP(552, nmp, 0, 0, 0); +again: + if (((nmp = req->r_nmp)) == NULL) + return; + lck_mtx_lock(nfsiod_mutex); + niod = nmp->nm_niod; + + /* grab an nfsiod if we don't have one already */ + if (!niod) { + niod = TAILQ_FIRST(&nfsiodfree); + if (niod) { + TAILQ_REMOVE(&nfsiodfree, niod, niod_link); + TAILQ_INSERT_TAIL(&nfsiodwork, niod, niod_link); + niod->niod_nmp = nmp; + } else if (((nfsiod_thread_count < NFSIOD_MAX) || (nfsiod_thread_count <= 0)) && (started < 4)) { /* - * If len > 0, there is a hole in the file and - * no writes after the hole have been pushed to - * the server yet. - * Just zero fill the rest of the valid area. + * Try starting a new thread. + * We may try a couple times if other callers + * get the new threads before we do. */ - diff = bp->nb_bufsize - uiop->uio_resid; - len = np->n_size - (NBOFF(bp) + diff); - if (len > 0) { - len = min(len, uiop->uio_resid); - bzero((char *)bp->nb_data + diff, len); - bp->nb_validend = diff + len; - FSDBG(258, diff, len, 0, 1); - } else - bp->nb_validend = diff; - } else - bp->nb_validend = bp->nb_bufsize; - bp->nb_valid = (1 << (round_page_32(bp->nb_validend)/PAGE_SIZE)) - 1; - if (bp->nb_validend & PAGE_MASK) { - /* valid range ends in the middle of a page so we */ - /* need to zero-fill any invalid data at the end */ - /* of the last page */ - bzero((caddr_t)(bp->nb_data + bp->nb_validend), - bp->nb_bufsize - bp->nb_validend); - FSDBG(258, bp->nb_validend, - bp->nb_bufsize - bp->nb_validend, 0, 2); - } - } - if (p && (vp->v_flag & VTEXT) && - (((nmp->nm_flag & NFSMNT_NQNFS) && - NQNFS_CKINVALID(vp, np, ND_READ) && - np->n_lrev != np->n_brev) || - (!(nmp->nm_flag & NFSMNT_NQNFS) && - np->n_mtime != np->n_vattr.va_mtime.tv_sec))) { - uprintf("Process killed due to text file modification\n"); - psignal(p, SIGKILL); - p->p_flag |= P_NOSWAP; - } - break; - case VLNK: - uiop->uio_offset = (off_t)0; - nfsstats.readlink_bios++; - error = nfs_readlinkrpc(vp, uiop, cr); - if (!error) { - bp->nb_validoff = 0; - bp->nb_validend = uiop->uio_offset; - } - break; - case VDIR: - nfsstats.readdir_bios++; - uiop->uio_offset = NBOFF(bp); - if (!(nmp->nm_flag & NFSMNT_NFSV3)) - nmp->nm_flag &= ~NFSMNT_RDIRPLUS; /* dk@farm.org */ - if (nmp->nm_flag & NFSMNT_RDIRPLUS) { - error = nfs_readdirplusrpc(vp, uiop, cr); - if (error == NFSERR_NOTSUPP) - nmp->nm_flag &= ~NFSMNT_RDIRPLUS; - } - if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0) - error = nfs_readdirrpc(vp, uiop, cr); - if (!error) { - bp->nb_validoff = 0; - bp->nb_validend = uiop->uio_offset - NBOFF(bp); - bp->nb_valid = (1 << (round_page_32(bp->nb_validend)/PAGE_SIZE)) - 1; - } - break; - default: - printf("nfs_doio: type %x unexpected\n", vp->v_type); - break; - }; - if (error) { - SET(bp->nb_flags, NB_ERROR); - bp->nb_error = error; - } - - } else { - /* we're doing a write */ - int doff, dend = 0; - - /* We need to make sure the pages are locked before doing I/O. */ - if (!ISSET(bp->nb_flags, NB_META) && UBCISVALID(vp)) { - if (!ISSET(bp->nb_flags, NB_PAGELIST)) { - error = nfs_buf_upl_setup(bp); - if (error) { - printf("nfs_doio: upl create failed %d\n", error); - SET(bp->nb_flags, NB_ERROR); - bp->nb_error = EIO; - return (EIO); - } - nfs_buf_upl_check(bp); + lck_mtx_unlock(nfsiod_mutex); + started++; + if (!nfsiod_start()) + goto again; + lck_mtx_lock(nfsiod_mutex); } - } - - if (ISSET(bp->nb_flags, NB_WASDIRTY)) { - FSDBG(256, bp, NBOFF(bp), bp->nb_dirty, 0xd00dee); - /* - * There are pages marked dirty that need to be written out. - * - * We don't want to just combine the write range with the - * range of pages that are dirty because that could cause us - * to write data that wasn't actually written to. - * We also don't want to write data more than once. - * - * If the dirty range just needs to be committed, we do that. - * Otherwise, we write the dirty range and clear the dirty bits - * for any COMPLETE pages covered by that range. - * If there are dirty pages left after that, we write out the - * parts that we haven't written yet. - */ - } - - /* - * If NB_NEEDCOMMIT is set, a commit rpc may do the trick. If not - * an actual write will have to be done. - * If NB_WRITEINPROG is already set, then push it with a write anyhow. - */ - if ((bp->nb_flags & (NB_NEEDCOMMIT | NB_WRITEINPROG)) == NB_NEEDCOMMIT) { - doff = NBOFF(bp) + bp->nb_dirtyoff; - SET(bp->nb_flags, NB_WRITEINPROG); - error = nfs_commit(vp, doff, bp->nb_dirtyend - bp->nb_dirtyoff, - bp->nb_wcred, bp->nb_proc); - CLR(bp->nb_flags, NB_WRITEINPROG); - if (!error) { - bp->nb_dirtyoff = bp->nb_dirtyend = 0; - CLR(bp->nb_flags, NB_NEEDCOMMIT); - np->n_needcommitcnt--; - CHECK_NEEDCOMMITCNT(np); - } else if (error == NFSERR_STALEWRITEVERF) - nfs_clearcommit(vp->v_mount); - } - - if (!error && bp->nb_dirtyend > 0) { - /* there's a dirty range that needs to be written out */ - u_int32_t pagemask; - int firstpg, lastpg; - - if (NBOFF(bp) + bp->nb_dirtyend > np->n_size) - bp->nb_dirtyend = np->n_size - NBOFF(bp); - - NFS_BUF_MAP(bp); - - doff = bp->nb_dirtyoff; - dend = bp->nb_dirtyend; - - /* if doff page is dirty, move doff to start of page */ - if (NBPGDIRTY(bp,doff/PAGE_SIZE)) - doff -= doff & PAGE_MASK; - /* try to expand write range to include preceding dirty pages */ - if (!(doff & PAGE_MASK)) - while (doff > 0 && NBPGDIRTY(bp,(doff-1)/PAGE_SIZE)) - doff -= PAGE_SIZE; - /* if dend page is dirty, move dend to start of next page */ - if ((dend & PAGE_MASK) && NBPGDIRTY(bp,dend/PAGE_SIZE)) - dend = round_page_32(dend); - /* try to expand write range to include trailing dirty pages */ - if (!(dend & PAGE_MASK)) - while (dend < bp->nb_bufsize && NBPGDIRTY(bp,dend/PAGE_SIZE)) - dend += PAGE_SIZE; - /* make sure to keep dend clipped to EOF */ - if (NBOFF(bp) + dend > np->n_size) - dend = np->n_size - NBOFF(bp); - /* calculate range of complete pages being written */ - firstpg = round_page_32(doff) / PAGE_SIZE; - lastpg = (trunc_page_32(dend) - 1)/ PAGE_SIZE; - /* calculate mask for that page range */ - pagemask = ((1 << (lastpg+1)) - 1) & ~((1 << firstpg) - 1); - - /* compare page mask to nb_dirty; if there are other dirty pages */ - /* then write FILESYNC; otherwise, write UNSTABLE if async and */ - /* not needcommit/nocache/call; otherwise write FILESYNC */ - if (bp->nb_dirty & ~pagemask) - iomode = NFSV3WRITE_FILESYNC; - else if ((bp->nb_flags & (NB_ASYNC | NB_NEEDCOMMIT | NB_NOCACHE | NB_STABLE)) == NB_ASYNC) - iomode = NFSV3WRITE_UNSTABLE; - else - iomode = NFSV3WRITE_FILESYNC; - - /* write the dirty range */ - io.iov_len = uiop->uio_resid = dend - doff; - uiop->uio_offset = NBOFF(bp) + doff; - io.iov_base = (char *)bp->nb_data + doff; - uiop->uio_rw = UIO_WRITE; - - nfsstats.write_bios++; + } - SET(bp->nb_flags, NB_WRITEINPROG); - error = nfs_writerpc(vp, uiop, cr, &iomode, &must_commit); - if (must_commit) - nfs_clearcommit(vp->v_mount); - /* clear dirty bits for pages we've written */ - if (!error) - bp->nb_dirty &= ~pagemask; - /* set/clear needcommit flag */ - if (!error && iomode == NFSV3WRITE_UNSTABLE) { - if (!ISSET(bp->nb_flags, NB_NEEDCOMMIT)) - np->n_needcommitcnt++; - SET(bp->nb_flags, NB_NEEDCOMMIT); - /* make sure nb_dirtyoff/nb_dirtyend reflect actual range written */ - bp->nb_dirtyoff = doff; - bp->nb_dirtyend = dend; - } else { - if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) { - np->n_needcommitcnt--; - CHECK_NEEDCOMMITCNT(np); - } - CLR(bp->nb_flags, NB_NEEDCOMMIT); - } - CLR(bp->nb_flags, NB_WRITEINPROG); - /* - * For an interrupted write, the buffer is still valid and the write - * hasn't been pushed to the server yet, so we can't set NB_ERROR and - * report the interruption by setting NB_EINTR. For the NB_ASYNC case, - * NB_EINTR is not relevant. - * - * For the case of a V3 write rpc not being committed to stable - * storage, the block is still dirty and requires either a commit rpc - * or another write rpc with iomode == NFSV3WRITE_FILESYNC before the - * block is reused. This is indicated by setting the NB_DELWRI and - * NB_NEEDCOMMIT flags. - */ - if (error == EINTR || (!error && bp->nb_flags & NB_NEEDCOMMIT)) { - CLR(bp->nb_flags, NB_INVAL | NB_NOCACHE); - if (!ISSET(bp->nb_flags, NB_DELWRI)) { - SET(bp->nb_flags, NB_DELWRI); - nfs_nbdwrite++; - NFSBUFCNTCHK(); - } - FSDBG(261, bp->nb_validoff, bp->nb_validend, - bp->nb_bufsize, 0); - /* - * Since for the NB_ASYNC case, nfs_bwrite() has - * reassigned the buffer to the clean list, we have to - * reassign it back to the dirty one. Ugh. - */ - if (ISSET(bp->nb_flags, NB_ASYNC)) { - /* move to dirty list */ - int s = splbio(); - if (bp->nb_vnbufs.le_next != NFSNOLIST) - LIST_REMOVE(bp, nb_vnbufs); - LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs); - splx(s); - } else { - SET(bp->nb_flags, NB_EINTR); - } + if (req->r_achain.tqe_next == NFSREQNOLIST) + TAILQ_INSERT_TAIL(&nmp->nm_iodq, req, r_achain); + + /* If this mount doesn't already have an nfsiod working on it... */ + if (!nmp->nm_niod) { + if (niod) { /* give it the nfsiod we just grabbed */ + nmp->nm_niod = niod; + lck_mtx_unlock(nfsiod_mutex); + wakeup(niod); + } else if (nfsiod_thread_count > 0) { + /* just queue it up on nfsiod mounts queue */ + TAILQ_INSERT_TAIL(&nfsiodmounts, nmp, nm_iodlink); + lck_mtx_unlock(nfsiod_mutex); } else { - /* either there's an error or we don't need to commit */ - if (error) { - SET(bp->nb_flags, NB_ERROR); - bp->nb_error = np->n_error = error; - np->n_flag |= NWRITEERR; - } - /* clear the dirty range */ - bp->nb_dirtyoff = bp->nb_dirtyend = 0; + printf("nfs_asyncio(): no nfsiods? %d %d (%d)\n", nfsiod_thread_count, NFSIOD_MAX, started); + lck_mtx_unlock(nfsiod_mutex); + /* we have no other option but to be persistent */ + started = 0; + goto again; } - } + } else { + lck_mtx_unlock(nfsiod_mutex); + } - if (!error && bp->nb_dirty) { - /* there are pages marked dirty that need to be written out */ - int pg, cnt, npages, off, len; + FSDBG_BOT(552, nmp, 0, 0, 0); +} - nfsstats.write_bios++; +/* + * queue up async I/O request for resend + */ +void +nfs_asyncio_resend(struct nfsreq *req) +{ + struct nfsmount *nmp = req->r_nmp; - NFS_BUF_MAP(bp); + if (!nmp) + return; + nfs_gss_clnt_rpcdone(req); + lck_mtx_lock(&nmp->nm_lock); + if (!(req->r_flags & R_RESENDQ)) { + TAILQ_INSERT_TAIL(&nmp->nm_resendq, req, r_rchain); + req->r_flags |= R_RESENDQ; + } + nfs_mount_sock_thread_wake(nmp); + lck_mtx_unlock(&nmp->nm_lock); +} - /* - * we do these writes synchronously because we can't really - * support the unstable/needommit method. We could write - * them unstable, clear the dirty bits, and then commit the - * whole block later, but if we need to rewrite the data, we - * won't have any idea which pages were written because that - * info can't be stored in the nb_dirtyoff/nb_dirtyend. We - * also can't leave the dirty bits set because then we wouldn't - * be able to tell if the pages were re-dirtied between the end - * of the write and the commit. - */ - iomode = NFSV3WRITE_FILESYNC; - uiop->uio_rw = UIO_WRITE; +/* + * Read directory data into a buffer. + * + * Buffer will be filled (unless EOF is hit). + * Buffers after this one may also be completely/partially filled. + */ +int +nfs_buf_readdir(struct nfsbuf *bp, vfs_context_t ctx) +{ + nfsnode_t np = bp->nb_np; + struct nfsmount *nmp = NFSTONMP(np); + int error = 0; - SET(bp->nb_flags, NB_WRITEINPROG); - npages = bp->nb_bufsize/PAGE_SIZE; - for (pg=0; pg < npages; pg++) { - if (!NBPGDIRTY(bp,pg)) - continue; - cnt = 1; - while (((pg+cnt) < npages) && NBPGDIRTY(bp,pg+cnt)) - cnt++; - /* write cnt pages starting with page pg */ - off = pg * PAGE_SIZE; - len = cnt * PAGE_SIZE; - - /* clip writes to EOF */ - if (NBOFF(bp) + off + len > np->n_size) - len -= (NBOFF(bp) + off + len) - np->n_size; - if (len > 0) { - io.iov_len = uiop->uio_resid = len; - uiop->uio_offset = NBOFF(bp) + off; - io.iov_base = (char *)bp->nb_data + off; - error = nfs_writerpc(vp, uiop, cr, &iomode, &must_commit); - if (must_commit) - nfs_clearcommit(vp->v_mount); - if (error) - break; - } - /* clear dirty bits */ - while (cnt--) { - bp->nb_dirty &= ~(1 << pg); - /* leave pg on last page */ - if (cnt) pg++; - } - } - if (!error) { - if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) { - np->n_needcommitcnt--; - CHECK_NEEDCOMMITCNT(np); - } - CLR(bp->nb_flags, NB_NEEDCOMMIT); - } - CLR(bp->nb_flags, NB_WRITEINPROG); - FSDBG_BOT(256, bp->nb_validoff, bp->nb_validend, bp->nb_bufsize, - np->n_size); - } + if (!nmp) + return (ENXIO); + + if (nmp->nm_vers < NFS_VER4) + error = nfs3_readdir_rpc(np, bp, ctx); + else + error = nfs4_readdir_rpc(np, bp, ctx); - if (error) { + if (error && (error != NFSERR_DIRBUFDROPPED)) { SET(bp->nb_flags, NB_ERROR); bp->nb_error = error; - } } - - FSDBG_BOT(256, bp->nb_validoff, bp->nb_validend, bp->nb_bufsize, error); - - nfs_buf_iodone(bp); return (error); }