X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/378393581903b274cb7a4d18e0d978071a6b592d..15129b1c8dbb3650c63b70adb1cad9af601c6c17:/bsd/nfs/nfs_bio.c diff --git a/bsd/nfs/nfs_bio.c b/bsd/nfs/nfs_bio.c index b4a0836a2..b9d22e786 100644 --- a/bsd/nfs/nfs_bio.c +++ b/bsd/nfs/nfs_bio.c @@ -1,23 +1,29 @@ /* - * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * - * @APPLE_LICENSE_HEADER_START@ + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * - * The contents of this file constitute Original Code as defined in and - * are subject to the Apple Public Source License Version 1.1 (the - * "License"). You may not use this file except in compliance with the - * License. Please obtain a copy of the License at - * http://www.apple.com/publicsource and read it before using this file. + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. * - * This Original Code and all software distributed under the License are - * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the - * License for the specific language governing rights and limitations - * under the License. + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. * - * @APPLE_LICENSE_HEADER_END@ + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */ /* @@ -69,9 +75,9 @@ #include #include #include -#include #include #include +#include #include #include @@ -80,29 +86,18 @@ #include #include #include +#include #include #include #include +#include #include #include #include +#include -#include - -#define FSDBG(A, B, C, D, E) \ - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_NONE, \ - (int)(B), (int)(C), (int)(D), (int)(E), 0) -#define FSDBG_TOP(A, B, C, D, E) \ - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_START, \ - (int)(B), (int)(C), (int)(D), (int)(E), 0) -#define FSDBG_BOT(A, B, C, D, E) \ - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_END, \ - (int)(B), (int)(C), (int)(D), (int)(E), 0) - -extern int nfs_numasync; -extern int nfs_ioddelwri; -extern struct nfsstats nfsstats; +kern_return_t thread_terminate(thread_t); /* XXX */ #define NFSBUFHASH(np, lbn) \ (&nfsbufhashtbl[((long)(np) / sizeof(*(np)) + (int)(lbn)) & nfsbufhash]) @@ -112,14 +107,13 @@ u_long nfsbufhash; int nfsbufcnt, nfsbufmin, nfsbufmax, nfsbufmetacnt, nfsbufmetamax; int nfsbuffreecnt, nfsbuffreemetacnt, nfsbufdelwricnt, nfsneedbuffer; int nfs_nbdwrite; -time_t nfsbuffreeuptimestamp; +int nfs_buf_timer_on = 0; +thread_t nfsbufdelwrithd = NULL; lck_grp_t *nfs_buf_lck_grp; -lck_grp_attr_t *nfs_buf_lck_grp_attr; -lck_attr_t *nfs_buf_lck_attr; lck_mtx_t *nfs_buf_mutex; -#define NFSBUFWRITE_THROTTLE 9 +#define NFSBUF_FREE_PERIOD 30 /* seconds */ #define NFSBUF_LRU_STALE 120 #define NFSBUF_META_STALE 240 @@ -129,9 +123,9 @@ lck_mtx_t *nfs_buf_mutex; #define META_TO_FREEUP 3 /* total number of nfsbufs nfs_buf_freeup() should attempt to free */ #define TOTAL_TO_FREEUP (LRU_TO_FREEUP+META_TO_FREEUP) -/* fraction of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffree list when called from nfs_timer() */ +/* fraction of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffree list when called from timer */ #define LRU_FREEUP_FRAC_ON_TIMER 8 -/* fraction of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffreemeta list when called from nfs_timer() */ +/* fraction of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffreemeta list when called from timer */ #define META_FREEUP_FRAC_ON_TIMER 16 /* fraction of total nfsbufs that nfsbuffreecnt should exceed before bothering to call nfs_buf_freeup() */ #define LRU_FREEUP_MIN_FRAC 4 @@ -153,22 +147,17 @@ lck_mtx_t *nfs_buf_mutex; void nfs_nbinit(void) { - nfs_buf_lck_grp_attr = lck_grp_attr_alloc_init(); - lck_grp_attr_setstat(nfs_buf_lck_grp_attr); - nfs_buf_lck_grp = lck_grp_alloc_init("nfs_buf", nfs_buf_lck_grp_attr); - - nfs_buf_lck_attr = lck_attr_alloc_init(); - - nfs_buf_mutex = lck_mtx_alloc_init(nfs_buf_lck_grp, nfs_buf_lck_attr); + nfs_buf_lck_grp = lck_grp_alloc_init("nfs_buf", LCK_GRP_ATTR_NULL); + nfs_buf_mutex = lck_mtx_alloc_init(nfs_buf_lck_grp, LCK_ATTR_NULL); nfsbufcnt = nfsbufmetacnt = nfsbuffreecnt = nfsbuffreemetacnt = nfsbufdelwricnt = 0; nfsbufmin = 128; - nfsbufmax = (sane_size >> PAGE_SHIFT) / 4; - nfsbufmetamax = (sane_size >> PAGE_SHIFT) / 16; + /* size nfsbufmax to cover at most half sane_size (w/default buf size) */ + nfsbufmax = (sane_size >> PAGE_SHIFT) / (2 * (NFS_RWSIZE >> PAGE_SHIFT)); + nfsbufmetamax = nfsbufmax / 4; nfsneedbuffer = 0; nfs_nbdwrite = 0; - nfsbuffreeuptimestamp = 0; nfsbufhashtbl = hashinit(nfsbufmax/4, M_TEMP, &nfsbufhash); TAILQ_INIT(&nfsbuffree); @@ -177,6 +166,26 @@ nfs_nbinit(void) } +/* + * Check periodically for stale/unused nfs bufs + */ +void +nfs_buf_timer(__unused void *param0, __unused void *param1) +{ + nfs_buf_freeup(1); + + lck_mtx_lock(nfs_buf_mutex); + if (nfsbufcnt <= nfsbufmin) { + nfs_buf_timer_on = 0; + lck_mtx_unlock(nfs_buf_mutex); + return; + } + lck_mtx_unlock(nfs_buf_mutex); + + nfs_interval_timer_start(nfs_buf_timer_call, + NFSBUF_FREE_PERIOD * 1000); +} + /* * try to free up some excess, unused nfsbufs */ @@ -193,7 +202,6 @@ nfs_buf_freeup(int timer) lck_mtx_lock(nfs_buf_mutex); microuptime(&now); - nfsbuffreeuptimestamp = now.tv_sec; FSDBG(320, nfsbufcnt, nfsbuffreecnt, nfsbuffreemetacnt, 0); @@ -208,13 +216,13 @@ nfs_buf_freeup(int timer) (fbp->nb_timestamp + (2*NFSBUF_LRU_STALE)) > now.tv_sec) break; nfs_buf_remfree(fbp); - /* disassociate buffer from any vnode */ - if (fbp->nb_vp) { + /* disassociate buffer from any nfsnode */ + if (fbp->nb_np) { if (fbp->nb_vnbufs.le_next != NFSNOLIST) { LIST_REMOVE(fbp, nb_vnbufs); fbp->nb_vnbufs.le_next = NFSNOLIST; } - fbp->nb_vp = NULL; + fbp->nb_np = NULL; } LIST_REMOVE(fbp, nb_hash); TAILQ_INSERT_TAIL(&nfsbuffreeup, fbp, nb_free); @@ -232,13 +240,13 @@ nfs_buf_freeup(int timer) (fbp->nb_timestamp + (2*NFSBUF_META_STALE)) > now.tv_sec) break; nfs_buf_remfree(fbp); - /* disassociate buffer from any vnode */ - if (fbp->nb_vp) { + /* disassociate buffer from any nfsnode */ + if (fbp->nb_np) { if (fbp->nb_vnbufs.le_next != NFSNOLIST) { LIST_REMOVE(fbp, nb_vnbufs); fbp->nb_vnbufs.le_next = NFSNOLIST; } - fbp->nb_vp = NULL; + fbp->nb_np = NULL; } LIST_REMOVE(fbp, nb_hash); TAILQ_INSERT_TAIL(&nfsbuffreeup, fbp, nb_free); @@ -247,21 +255,17 @@ nfs_buf_freeup(int timer) } FSDBG(320, nfsbufcnt, nfsbuffreecnt, nfsbuffreemetacnt, 0); - NFSBUFCNTCHK(1); + NFSBUFCNTCHK(); lck_mtx_unlock(nfs_buf_mutex); while ((fbp = TAILQ_FIRST(&nfsbuffreeup))) { TAILQ_REMOVE(&nfsbuffreeup, fbp, nb_free); /* nuke any creds */ - if (fbp->nb_rcred != NOCRED) { - kauth_cred_rele(fbp->nb_rcred); - fbp->nb_rcred = NOCRED; - } - if (fbp->nb_wcred != NOCRED) { - kauth_cred_rele(fbp->nb_wcred); - fbp->nb_wcred = NOCRED; - } + if (IS_VALID_CRED(fbp->nb_rcred)) + kauth_cred_unref(&fbp->nb_rcred); + if (IS_VALID_CRED(fbp->nb_wcred)) + kauth_cred_unref(&fbp->nb_wcred); /* if buf was NB_META, dump buffer */ if (ISSET(fbp->nb_flags, NB_META) && fbp->nb_data) kfree(fbp->nb_data, fbp->nb_bufsize); @@ -290,18 +294,18 @@ nfs_buf_remfree(struct nfsbuf *bp) TAILQ_REMOVE(&nfsbuffree, bp, nb_free); } bp->nb_free.tqe_next = NFSNOLIST; - NFSBUFCNTCHK(1); + NFSBUFCNTCHK(); } /* * check for existence of nfsbuf in cache */ boolean_t -nfs_buf_is_incore(vnode_t vp, daddr64_t blkno) +nfs_buf_is_incore(nfsnode_t np, daddr64_t blkno) { boolean_t rv; lck_mtx_lock(nfs_buf_mutex); - if (nfs_buf_incore(vp, blkno)) + if (nfs_buf_incore(np, blkno)) rv = TRUE; else rv = FALSE; @@ -313,14 +317,14 @@ nfs_buf_is_incore(vnode_t vp, daddr64_t blkno) * return incore buffer (must be called with nfs_buf_mutex held) */ struct nfsbuf * -nfs_buf_incore(vnode_t vp, daddr64_t blkno) +nfs_buf_incore(nfsnode_t np, daddr64_t blkno) { /* Search hash chain */ - struct nfsbuf * bp = NFSBUFHASH(VTONFS(vp), blkno)->lh_first; + struct nfsbuf * bp = NFSBUFHASH(np, blkno)->lh_first; for (; bp != NULL; bp = bp->nb_hash.le_next) - if (bp->nb_lblkno == blkno && bp->nb_vp == vp) { + if ((bp->nb_lblkno == blkno) && (bp->nb_np == np)) { if (!ISSET(bp->nb_flags, NB_INVAL)) { - FSDBG(547, bp, blkno, bp->nb_flags, bp->nb_vp); + FSDBG(547, bp, blkno, bp->nb_flags, bp->nb_np); return (bp); } } @@ -338,11 +342,15 @@ nfs_buf_incore(vnode_t vp, daddr64_t blkno) int nfs_buf_page_inval(vnode_t vp, off_t offset) { + struct nfsmount *nmp = VTONMP(vp); struct nfsbuf *bp; int error = 0; + if (!nmp) + return (ENXIO); + lck_mtx_lock(nfs_buf_mutex); - bp = nfs_buf_incore(vp, ubc_offtoblk(vp, offset)); + bp = nfs_buf_incore(VTONFS(vp), (daddr64_t)(offset / nmp->nm_biosize)); if (!bp) goto out; FSDBG(325, bp, bp->nb_flags, bp->nb_dirtyoff, bp->nb_dirtyend); @@ -357,11 +365,20 @@ nfs_buf_page_inval(vnode_t vp, off_t offset) */ if (bp->nb_dirtyend > 0) { int start = offset - NBOFF(bp); - if (bp->nb_dirtyend <= start || - bp->nb_dirtyoff >= (start + PAGE_SIZE)) - error = 0; - else + if ((bp->nb_dirtyend > start) && + (bp->nb_dirtyoff < (start + PAGE_SIZE))) { + /* + * Before returning the bad news, move the + * buffer to the start of the delwri list and + * give the list a push to try to flush the + * buffer out. + */ error = EBUSY; + nfs_buf_remfree(bp); + TAILQ_INSERT_HEAD(&nfsbufdelwri, bp, nb_free); + nfsbufdelwricnt++; + nfs_buf_delwri_push(1); + } } out: lck_mtx_unlock(nfs_buf_mutex); @@ -383,14 +400,14 @@ nfs_buf_upl_setup(struct nfsbuf *bp) return (0); upl_flags = UPL_PRECIOUS; - if (! ISSET(bp->nb_flags, NB_READ)) { + if (!ISSET(bp->nb_flags, NB_READ)) { /* * We're doing a "write", so we intend to modify * the pages we're gathering. */ upl_flags |= UPL_WILL_MODIFY; } - kret = ubc_create_upl(bp->nb_vp, NBOFF(bp), bp->nb_bufsize, + kret = ubc_create_upl(NFSTOV(bp->nb_np), NBOFF(bp), bp->nb_bufsize, &upl, NULL, upl_flags); if (kret == KERN_INVALID_ARGUMENT) { /* vm object probably doesn't exist any more */ @@ -403,7 +420,7 @@ nfs_buf_upl_setup(struct nfsbuf *bp) return (EIO); } - FSDBG(538, bp, NBOFF(bp), bp->nb_bufsize, bp->nb_vp); + FSDBG(538, bp, NBOFF(bp), bp->nb_bufsize, bp->nb_np); bp->nb_pagelist = upl; SET(bp->nb_flags, NB_PAGELIST); @@ -425,7 +442,7 @@ nfs_buf_upl_check(struct nfsbuf *bp) return; npages = round_page_32(bp->nb_bufsize) / PAGE_SIZE; - filesize = ubc_getsize(bp->nb_vp); + filesize = ubc_getsize(NFSTOV(bp->nb_np)); fileoffset = NBOFF(bp); if (fileoffset < filesize) SET(bp->nb_flags, NB_CACHE); @@ -444,11 +461,8 @@ nfs_buf_upl_check(struct nfsbuf *bp) continue; } NBPGVALID_SET(bp,i); - if (upl_dirty_page(pl, i)) { + if (upl_dirty_page(pl, i)) NBPGDIRTY_SET(bp, i); - if (!ISSET(bp->nb_flags, NB_WASDIRTY)) - SET(bp->nb_flags, NB_WASDIRTY); - } } fileoffset = NBOFF(bp); if (ISSET(bp->nb_flags, NB_CACHE)) { @@ -467,7 +481,7 @@ nfs_buf_upl_check(struct nfsbuf *bp) * make sure that a buffer is mapped * (must NOT be called with nfs_buf_mutex held) */ -static int +int nfs_buf_map(struct nfsbuf *bp) { kern_return_t kret; @@ -477,7 +491,7 @@ nfs_buf_map(struct nfsbuf *bp) if (!ISSET(bp->nb_flags, NB_PAGELIST)) return (EINVAL); - kret = ubc_upl_map(bp->nb_pagelist, (vm_address_t *)&(bp->nb_data)); + kret = ubc_upl_map(bp->nb_pagelist, (vm_offset_t *)&(bp->nb_data)); if (kret != KERN_SUCCESS) panic("nfs_buf_map: ubc_upl_map() failed with (%d)", kret); if (bp->nb_data == 0) @@ -486,37 +500,6 @@ nfs_buf_map(struct nfsbuf *bp) return (0); } -/* - * check range of pages in nfsbuf's UPL for validity - */ -static int -nfs_buf_upl_valid_range(struct nfsbuf *bp, int off, int size) -{ - off_t fileoffset, filesize; - int pg, lastpg; - upl_page_info_t *pl; - - if (!ISSET(bp->nb_flags, NB_PAGELIST)) - return (0); - pl = ubc_upl_pageinfo(bp->nb_pagelist); - - size += off & PAGE_MASK; - off &= ~PAGE_MASK; - fileoffset = NBOFF(bp); - filesize = VTONFS(bp->nb_vp)->n_size; - if ((fileoffset + off + size) > filesize) - size = filesize - (fileoffset + off); - - pg = off/PAGE_SIZE; - lastpg = (off + size - 1)/PAGE_SIZE; - while (pg <= lastpg) { - if (!upl_valid_page(pl, pg)) - return (0); - pg++; - } - return (1); -} - /* * normalize an nfsbuf's valid range * @@ -526,8 +509,8 @@ nfs_buf_upl_valid_range(struct nfsbuf *bp, int off, int size) * here as we extend the valid range through all of the * contiguous valid pages. */ -static void -nfs_buf_normalize_valid_range(struct nfsnode *np, struct nfsbuf *bp) +void +nfs_buf_normalize_valid_range(nfsnode_t np, struct nfsbuf *bp) { int pg, npg; /* pull validoff back to start of contiguous valid page range */ @@ -547,46 +530,38 @@ nfs_buf_normalize_valid_range(struct nfsnode *np, struct nfsbuf *bp) } /* - * try to push out some delayed/uncommitted writes - * ("locked" indicates whether nfs_buf_mutex is already held) + * process some entries on the delayed write queue + * (must be called with nfs_buf_mutex held) */ -static void -nfs_buf_delwri_push(int locked) +void +nfs_buf_delwri_service(void) { struct nfsbuf *bp; - int i, error; - - if (TAILQ_EMPTY(&nfsbufdelwri)) - return; + nfsnode_t np; + int error, i = 0; - /* first try to tell the nfsiods to do it */ - if (nfs_asyncio(NULL, NULL) == 0) - return; - - /* otherwise, try to do some of the work ourselves */ - i = 0; - if (!locked) - lck_mtx_lock(nfs_buf_mutex); while (i < 8 && (bp = TAILQ_FIRST(&nfsbufdelwri)) != NULL) { - struct nfsnode *np = VTONFS(bp->nb_vp); + np = bp->nb_np; nfs_buf_remfree(bp); nfs_buf_refget(bp); while ((error = nfs_buf_acquire(bp, 0, 0, 0)) == EAGAIN); nfs_buf_refrele(bp); if (error) break; - if (!bp->nb_vp) { + if (!bp->nb_np) { /* buffer is no longer valid */ nfs_buf_drop(bp); continue; } + if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) + nfs_buf_check_write_verifier(np, bp); if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) { /* put buffer at end of delwri list */ TAILQ_INSERT_TAIL(&nfsbufdelwri, bp, nb_free); nfsbufdelwricnt++; nfs_buf_drop(bp); lck_mtx_unlock(nfs_buf_mutex); - nfs_flushcommits(np->n_vnode, NULL, 1); + nfs_flushcommits(np, 1); } else { SET(bp->nb_flags, NB_ASYNC); lck_mtx_unlock(nfs_buf_mutex); @@ -595,6 +570,46 @@ nfs_buf_delwri_push(int locked) i++; lck_mtx_lock(nfs_buf_mutex); } +} + +/* + * thread to service the delayed write queue when asked + */ +void +nfs_buf_delwri_thread(__unused void *arg, __unused wait_result_t wr) +{ + struct timespec ts = { 30, 0 }; + int error = 0; + + lck_mtx_lock(nfs_buf_mutex); + while (!error) { + nfs_buf_delwri_service(); + error = msleep(&nfsbufdelwrithd, nfs_buf_mutex, 0, "nfsbufdelwri", &ts); + } + nfsbufdelwrithd = NULL; + lck_mtx_unlock(nfs_buf_mutex); + thread_terminate(nfsbufdelwrithd); +} + +/* + * try to push out some delayed/uncommitted writes + * ("locked" indicates whether nfs_buf_mutex is already held) + */ +void +nfs_buf_delwri_push(int locked) +{ + if (TAILQ_EMPTY(&nfsbufdelwri)) + return; + if (!locked) + lck_mtx_lock(nfs_buf_mutex); + /* wake up the delayed write service thread */ + if (nfsbufdelwrithd) + wakeup(&nfsbufdelwrithd); + else if (kernel_thread_start(nfs_buf_delwri_thread, NULL, &nfsbufdelwrithd) == KERN_SUCCESS) + thread_deallocate(nfsbufdelwrithd); + /* otherwise, try to do some of the work ourselves */ + if (!nfsbufdelwrithd) + nfs_buf_delwri_service(); if (!locked) lck_mtx_unlock(nfs_buf_mutex); } @@ -620,73 +635,91 @@ nfs_buf_delwri_push(int locked) */ int nfs_buf_get( - vnode_t vp, + nfsnode_t np, daddr64_t blkno, - int size, - proc_t p, + uint32_t size, + thread_t thd, int flags, struct nfsbuf **bpp) { - struct nfsnode *np = VTONFS(vp); + vnode_t vp = NFSTOV(np); + struct nfsmount *nmp = VTONMP(vp); struct nfsbuf *bp; - int biosize, bufsize; - kauth_cred_t cred; + uint32_t bufsize; int slpflag = PCATCH; int operation = (flags & NBLK_OPMASK); int error = 0; struct timespec ts; - FSDBG_TOP(541, vp, blkno, size, flags); + FSDBG_TOP(541, np, blkno, size, flags); *bpp = NULL; bufsize = size; - if (bufsize > MAXBSIZE) - panic("nfs_buf_get: buffer larger than MAXBSIZE requested"); + if (bufsize > NFS_MAXBSIZE) + panic("nfs_buf_get: buffer larger than NFS_MAXBSIZE requested"); - biosize = vfs_statfs(vnode_mount(vp))->f_iosize; + if (!nmp) { + FSDBG_BOT(541, np, blkno, 0, ENXIO); + return (ENXIO); + } - if (UBCINVALID(vp) || !UBCINFOEXISTS(vp)) { + if (!UBCINFOEXISTS(vp)) { operation = NBLK_META; - } else if (bufsize < biosize) { + } else if (bufsize < (uint32_t)nmp->nm_biosize) { /* reg files should always have biosize blocks */ - bufsize = biosize; + bufsize = nmp->nm_biosize; } /* if NBLK_WRITE, check for too many delayed/uncommitted writes */ - if ((operation == NBLK_WRITE) && (nfs_nbdwrite > ((nfsbufcnt*3)/4))) { - FSDBG_TOP(542, vp, blkno, nfs_nbdwrite, ((nfsbufcnt*3)/4)); + if ((operation == NBLK_WRITE) && (nfs_nbdwrite > NFS_A_LOT_OF_DELAYED_WRITES)) { + FSDBG_TOP(542, np, blkno, nfs_nbdwrite, NFS_A_LOT_OF_DELAYED_WRITES); /* poke the delwri list */ nfs_buf_delwri_push(0); /* sleep to let other threads run... */ tsleep(&nfs_nbdwrite, PCATCH, "nfs_nbdwrite", 1); - FSDBG_BOT(542, vp, blkno, nfs_nbdwrite, ((nfsbufcnt*3)/4)); + FSDBG_BOT(542, np, blkno, nfs_nbdwrite, NFS_A_LOT_OF_DELAYED_WRITES); } loop: lck_mtx_lock(nfs_buf_mutex); + /* wait for any buffer invalidation/flushing to complete */ + while (np->n_bflag & NBINVALINPROG) { + np->n_bflag |= NBINVALWANT; + ts.tv_sec = 2; + ts.tv_nsec = 0; + msleep(&np->n_bflag, nfs_buf_mutex, slpflag, "nfs_buf_get_invalwait", &ts); + if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0))) { + lck_mtx_unlock(nfs_buf_mutex); + FSDBG_BOT(541, np, blkno, 0, error); + return (error); + } + if (np->n_bflag & NBINVALINPROG) + slpflag = 0; + } + /* check for existence of nfsbuf in cache */ - if ((bp = nfs_buf_incore(vp, blkno))) { + if ((bp = nfs_buf_incore(np, blkno))) { /* if busy, set wanted and wait */ if (ISSET(bp->nb_lflags, NBL_BUSY)) { if (flags & NBLK_NOWAIT) { lck_mtx_unlock(nfs_buf_mutex); - FSDBG_BOT(541, vp, blkno, bp, 0xbcbcbcbc); + FSDBG_BOT(541, np, blkno, bp, 0xbcbcbcbc); return (0); } - FSDBG_TOP(543, vp, blkno, bp, bp->nb_flags); + FSDBG_TOP(543, np, blkno, bp, bp->nb_flags); SET(bp->nb_lflags, NBL_WANTED); ts.tv_sec = 2; ts.tv_nsec = 0; msleep(bp, nfs_buf_mutex, slpflag|(PRIBIO+1)|PDROP, - "nfsbufget", (slpflag == PCATCH) ? 0 : &ts); + "nfsbufget", (slpflag == PCATCH) ? NULL : &ts); slpflag = 0; - FSDBG_BOT(543, vp, blkno, bp, bp->nb_flags); - if ((error = nfs_sigintr(VFSTONFS(vnode_mount(vp)), NULL, p))) { - FSDBG_BOT(541, vp, blkno, 0, error); + FSDBG_BOT(543, np, blkno, bp, bp->nb_flags); + if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0))) { + FSDBG_BOT(541, np, blkno, 0, error); return (error); } goto loop; @@ -704,7 +737,7 @@ loop: if (flags & NBLK_ONLYVALID) { lck_mtx_unlock(nfs_buf_mutex); - FSDBG_BOT(541, vp, blkno, 0, 0x0000cace); + FSDBG_BOT(541, np, blkno, 0, 0x0000cace); return (0); } @@ -761,31 +794,25 @@ loop: if (bp) { /* we have a buffer to reuse */ - FSDBG(544, vp, blkno, bp, bp->nb_flags); + FSDBG(544, np, blkno, bp, bp->nb_flags); nfs_buf_remfree(bp); if (ISSET(bp->nb_flags, NB_DELWRI)) panic("nfs_buf_get: delwri"); SET(bp->nb_lflags, NBL_BUSY); - /* disassociate buffer from previous vnode */ - if (bp->nb_vp) { + /* disassociate buffer from previous nfsnode */ + if (bp->nb_np) { if (bp->nb_vnbufs.le_next != NFSNOLIST) { LIST_REMOVE(bp, nb_vnbufs); bp->nb_vnbufs.le_next = NFSNOLIST; } - bp->nb_vp = NULL; + bp->nb_np = NULL; } LIST_REMOVE(bp, nb_hash); /* nuke any creds we're holding */ - cred = bp->nb_rcred; - if (cred != NOCRED) { - bp->nb_rcred = NOCRED; - kauth_cred_rele(cred); - } - cred = bp->nb_wcred; - if (cred != NOCRED) { - bp->nb_wcred = NOCRED; - kauth_cred_rele(cred); - } + if (IS_VALID_CRED(bp->nb_rcred)) + kauth_cred_unref(&bp->nb_rcred); + if (IS_VALID_CRED(bp->nb_wcred)) + kauth_cred_unref(&bp->nb_wcred); /* if buf will no longer be NB_META, dump old buffer */ if (operation == NBLK_META) { if (!ISSET(bp->nb_flags, NB_META)) @@ -803,6 +830,7 @@ loop: bp->nb_dirtyoff = bp->nb_dirtyend = 0; bp->nb_valid = 0; bp->nb_dirty = 0; + bp->nb_verf = 0; } else { /* no buffer to reuse */ if ((nfsbufcnt < nfsbufmax) && @@ -811,45 +839,55 @@ loop: MALLOC(bp, struct nfsbuf *, sizeof(struct nfsbuf), M_TEMP, M_WAITOK); if (!bp) { lck_mtx_unlock(nfs_buf_mutex); - FSDBG_BOT(541, vp, blkno, 0, error); + FSDBG_BOT(541, np, blkno, 0, error); return (ENOMEM); } nfsbufcnt++; + + /* + * If any excess bufs, make sure the timer + * is running to free them up later. + */ + if (nfsbufcnt > nfsbufmin && !nfs_buf_timer_on) { + nfs_buf_timer_on = 1; + nfs_interval_timer_start(nfs_buf_timer_call, + NFSBUF_FREE_PERIOD * 1000); + } + if (operation == NBLK_META) nfsbufmetacnt++; - NFSBUFCNTCHK(1); + NFSBUFCNTCHK(); /* init nfsbuf */ bzero(bp, sizeof(*bp)); bp->nb_free.tqe_next = NFSNOLIST; bp->nb_validoff = bp->nb_validend = -1; - FSDBG(545, vp, blkno, bp, 0); + FSDBG(545, np, blkno, bp, 0); } else { /* too many bufs... wait for buffers to free up */ - FSDBG_TOP(546, vp, blkno, nfsbufcnt, nfsbufmax); + FSDBG_TOP(546, np, blkno, nfsbufcnt, nfsbufmax); /* poke the delwri list */ nfs_buf_delwri_push(1); nfsneedbuffer = 1; - msleep(&nfsneedbuffer, nfs_buf_mutex, PCATCH|PDROP, - "nfsbufget", 0); - FSDBG_BOT(546, vp, blkno, nfsbufcnt, nfsbufmax); - if ((error = nfs_sigintr(VFSTONFS(vnode_mount(vp)), NULL, p))) { - FSDBG_BOT(541, vp, blkno, 0, error); + msleep(&nfsneedbuffer, nfs_buf_mutex, PCATCH|PDROP, "nfsbufget", NULL); + FSDBG_BOT(546, np, blkno, nfsbufcnt, nfsbufmax); + if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0))) { + FSDBG_BOT(541, np, blkno, 0, error); return (error); } goto loop; } } - /* setup nfsbuf */ - bp->nb_lflags = NBL_BUSY; + /* set up nfsbuf */ + SET(bp->nb_lflags, NBL_BUSY); bp->nb_flags = 0; bp->nb_lblkno = blkno; /* insert buf in hash */ LIST_INSERT_HEAD(NFSBUFHASH(np, blkno), bp, nb_hash); - /* associate buffer with new vnode */ - bp->nb_vp = vp; + /* associate buffer with new nfsnode */ + bp->nb_np = np; LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs); buffer_setup: @@ -873,11 +911,11 @@ buffer_setup: bp->nb_data = kalloc(bufsize); if (!bp->nb_data) { /* Ack! couldn't allocate the data buffer! */ - /* cleanup buffer and return error */ + /* clean up buffer and return error */ lck_mtx_lock(nfs_buf_mutex); LIST_REMOVE(bp, nb_vnbufs); bp->nb_vnbufs.le_next = NFSNOLIST; - bp->nb_vp = NULL; + bp->nb_np = NULL; /* invalidate usage timestamp to allow immediate freeing */ NBUFSTAMPINVALIDATE(bp); if (bp->nb_free.tqe_next != NFSNOLIST) @@ -885,7 +923,7 @@ buffer_setup: TAILQ_INSERT_HEAD(&nfsbuffree, bp, nb_free); nfsbuffreecnt++; lck_mtx_unlock(nfs_buf_mutex); - FSDBG_BOT(541, vp, blkno, 0xb00, ENOMEM); + FSDBG_BOT(541, np, blkno, 0xb00, ENOMEM); return (ENOMEM); } bp->nb_bufsize = bufsize; @@ -908,15 +946,15 @@ buffer_setup: bp->nb_validoff = bp->nb_validend = -1; if (UBCINFOEXISTS(vp)) { - /* setup upl */ + /* set up upl */ if (nfs_buf_upl_setup(bp)) { /* unable to create upl */ /* vm object must no longer exist */ - /* cleanup buffer and return error */ + /* clean up buffer and return error */ lck_mtx_lock(nfs_buf_mutex); LIST_REMOVE(bp, nb_vnbufs); bp->nb_vnbufs.le_next = NFSNOLIST; - bp->nb_vp = NULL; + bp->nb_np = NULL; /* invalidate usage timestamp to allow immediate freeing */ NBUFSTAMPINVALIDATE(bp); if (bp->nb_free.tqe_next != NFSNOLIST) @@ -924,7 +962,7 @@ buffer_setup: TAILQ_INSERT_HEAD(&nfsbuffree, bp, nb_free); nfsbuffreecnt++; lck_mtx_unlock(nfs_buf_mutex); - FSDBG_BOT(541, vp, blkno, 0x2bc, EIO); + FSDBG_BOT(541, np, blkno, 0x2bc, EIO); return (EIO); } nfs_buf_upl_check(bp); @@ -937,7 +975,7 @@ buffer_setup: *bpp = bp; - FSDBG_BOT(541, vp, blkno, bp, bp->nb_flags); + FSDBG_BOT(541, np, blkno, bp, bp->nb_flags); return (0); } @@ -945,7 +983,8 @@ buffer_setup: void nfs_buf_release(struct nfsbuf *bp, int freeup) { - vnode_t vp = bp->nb_vp; + nfsnode_t np = bp->nb_np; + vnode_t vp; struct timeval now; int wakeup_needbuffer, wakeup_buffer, wakeup_nbdwrite; @@ -953,10 +992,11 @@ nfs_buf_release(struct nfsbuf *bp, int freeup) FSDBG(548, bp->nb_validoff, bp->nb_validend, bp->nb_dirtyoff, bp->nb_dirtyend); FSDBG(548, bp->nb_valid, 0, bp->nb_dirty, 0); - if (UBCINFOEXISTS(vp) && bp->nb_bufsize) { - int upl_flags; + vp = np ? NFSTOV(np) : NULL; + if (vp && UBCINFOEXISTS(vp) && bp->nb_bufsize) { + int upl_flags, rv; upl_t upl; - int i, rv; + uint32_t i; if (!ISSET(bp->nb_flags, NB_PAGELIST) && !ISSET(bp->nb_flags, NB_INVAL)) { rv = nfs_buf_upl_setup(bp); @@ -973,8 +1013,13 @@ nfs_buf_release(struct nfsbuf *bp, int freeup) panic("ubc_upl_unmap failed"); bp->nb_data = NULL; } - if (bp->nb_flags & (NB_ERROR | NB_INVAL | NB_NOCACHE)) { - if (bp->nb_flags & (NB_READ | NB_INVAL | NB_NOCACHE)) + /* + * Abort the pages on error or: if this is an invalid or + * non-needcommit nocache buffer AND no pages are dirty. + */ + if (ISSET(bp->nb_flags, NB_ERROR) || (!bp->nb_dirty && (ISSET(bp->nb_flags, NB_INVAL) || + (ISSET(bp->nb_flags, NB_NOCACHE) && !ISSET(bp->nb_flags, (NB_NEEDCOMMIT | NB_DELWRI)))))) { + if (ISSET(bp->nb_flags, (NB_READ | NB_INVAL | NB_NOCACHE))) upl_flags = UPL_ABORT_DUMP_PAGES; else upl_flags = 0; @@ -992,6 +1037,10 @@ nfs_buf_release(struct nfsbuf *bp, int freeup) upl_flags = UPL_COMMIT_SET_DIRTY; else upl_flags = UPL_COMMIT_CLEAR_DIRTY; + + if (!ISSET(bp->nb_flags, (NB_NEEDCOMMIT | NB_DELWRI))) + upl_flags |= UPL_COMMIT_CLEAR_PRECIOUS; + ubc_upl_commit_range(upl, i*PAGE_SIZE, PAGE_SIZE, upl_flags | @@ -1000,16 +1049,16 @@ nfs_buf_release(struct nfsbuf *bp, int freeup) } } pagelist_cleanup_done: - /* was this the last buffer in the file? */ - if (NBOFF(bp) + bp->nb_bufsize > (off_t)(VTONFS(vp)->n_size)) { - /* if so, invalidate all pages of last buffer past EOF */ - int biosize = vfs_statfs(vnode_mount(vp))->f_iosize; + /* invalidate any pages past EOF */ + if (NBOFF(bp) + bp->nb_bufsize > (off_t)(np->n_size)) { off_t start, end; - start = trunc_page_64(VTONFS(vp)->n_size) + PAGE_SIZE_64; - end = trunc_page_64(NBOFF(bp) + biosize); + start = trunc_page_64(np->n_size) + PAGE_SIZE_64; + end = trunc_page_64(NBOFF(bp) + bp->nb_bufsize); + if (start < NBOFF(bp)) + start = NBOFF(bp); if (end > start) { - if (!(rv = ubc_sync_range(vp, start, end, UBC_INVALIDATE))) - printf("nfs_buf_release(): ubc_sync_range failed!\n"); + if ((rv = ubc_msync(vp, start, end, NULL, UBC_INVALIDATE))) + printf("nfs_buf_release(): ubc_msync failed!, error %d\n", rv); } } CLR(bp->nb_flags, NB_PAGELIST); @@ -1031,23 +1080,24 @@ pagelist_cleanup_done: wakeup_buffer = 1; } - /* If it's not cacheable, or an error, mark it invalid. */ - if (ISSET(bp->nb_flags, (NB_NOCACHE|NB_ERROR))) + /* If it's non-needcommit nocache, or an error, mark it invalid. */ + if (ISSET(bp->nb_flags, NB_ERROR) || + (ISSET(bp->nb_flags, NB_NOCACHE) && !ISSET(bp->nb_flags, (NB_NEEDCOMMIT | NB_DELWRI)))) SET(bp->nb_flags, NB_INVAL); if ((bp->nb_bufsize <= 0) || ISSET(bp->nb_flags, NB_INVAL)) { - /* If it's invalid or empty, dissociate it from its vnode */ + /* If it's invalid or empty, dissociate it from its nfsnode */ if (bp->nb_vnbufs.le_next != NFSNOLIST) { LIST_REMOVE(bp, nb_vnbufs); bp->nb_vnbufs.le_next = NFSNOLIST; } - bp->nb_vp = NULL; + bp->nb_np = NULL; /* if this was a delayed write, wakeup anyone */ /* waiting for delayed writes to complete */ if (ISSET(bp->nb_flags, NB_DELWRI)) { CLR(bp->nb_flags, NB_DELWRI); - OSAddAtomic(-1, (SInt32*)&nfs_nbdwrite); - NFSBUFCNTCHK(1); + nfs_nbdwrite--; + NFSBUFCNTCHK(); wakeup_nbdwrite = 1; } /* invalidate usage timestamp to allow immediate freeing */ @@ -1086,10 +1136,10 @@ pagelist_cleanup_done: } } - NFSBUFCNTCHK(1); + NFSBUFCNTCHK(); /* Unlock the buffer. */ - CLR(bp->nb_flags, (NB_ASYNC | NB_NOCACHE | NB_STABLE | NB_IOD)); + CLR(bp->nb_flags, (NB_ASYNC | NB_STABLE)); CLR(bp->nb_lflags, NBL_BUSY); FSDBG_BOT(548, bp, NBOFF(bp), bp->nb_flags, bp->nb_data); @@ -1118,7 +1168,7 @@ nfs_buf_iowait(struct nfsbuf *bp) lck_mtx_lock(nfs_buf_mutex); while (!ISSET(bp->nb_flags, NB_DONE)) - msleep(bp, nfs_buf_mutex, PRIBIO + 1, "nfs_buf_iowait", 0); + msleep(bp, nfs_buf_mutex, PRIBIO + 1, "nfs_buf_iowait", NULL); lck_mtx_unlock(nfs_buf_mutex); @@ -1144,11 +1194,6 @@ nfs_buf_iodone(struct nfsbuf *bp) if (ISSET(bp->nb_flags, NB_DONE)) panic("nfs_buf_iodone already"); - /* - * I/O was done, so don't believe - * the DIRTY state from VM anymore - */ - CLR(bp->nb_flags, NB_WASDIRTY); if (!ISSET(bp->nb_flags, NB_READ)) { CLR(bp->nb_flags, NB_WRITEINPROG); @@ -1156,7 +1201,10 @@ nfs_buf_iodone(struct nfsbuf *bp) * vnode_writedone() takes care of waking up * any throttled write operations */ - vnode_writedone(bp->nb_vp); + vnode_writedone(NFSTOV(bp->nb_np)); + nfs_node_lock_force(bp->nb_np); + bp->nb_np->n_numoutput--; + nfs_node_unlock(bp->nb_np); } if (ISSET(bp->nb_flags, NB_ASYNC)) { /* if async, release it */ SET(bp->nb_flags, NB_DONE); /* note that it's done */ @@ -1173,9 +1221,9 @@ nfs_buf_iodone(struct nfsbuf *bp) } void -nfs_buf_write_delayed(struct nfsbuf *bp, proc_t p) +nfs_buf_write_delayed(struct nfsbuf *bp) { - vnode_t vp = bp->nb_vp; + nfsnode_t np = bp->nb_np; FSDBG_TOP(551, bp, NBOFF(bp), bp->nb_flags, 0); FSDBG(551, bp, bp->nb_dirtyoff, bp->nb_dirtyend, bp->nb_dirty); @@ -1183,20 +1231,17 @@ nfs_buf_write_delayed(struct nfsbuf *bp, proc_t p) /* * If the block hasn't been seen before: * (1) Mark it as having been seen, - * (2) Charge for the write. - * (3) Make sure it's on its vnode's correct block list, + * (2) Make sure it's on its node's correct block list, */ if (!ISSET(bp->nb_flags, NB_DELWRI)) { SET(bp->nb_flags, NB_DELWRI); - if (p && p->p_stats) - p->p_stats->p_ru.ru_oublock++; /* XXX */ - OSAddAtomic(1, (SInt32*)&nfs_nbdwrite); - NFSBUFCNTCHK(0); /* move to dirty list */ lck_mtx_lock(nfs_buf_mutex); + nfs_nbdwrite++; + NFSBUFCNTCHK(); if (bp->nb_vnbufs.le_next != NFSNOLIST) LIST_REMOVE(bp, nb_vnbufs); - LIST_INSERT_HEAD(&VTONFS(vp)->n_dirtyblkhd, bp, nb_vnbufs); + LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs); lck_mtx_unlock(nfs_buf_mutex); } @@ -1204,24 +1249,27 @@ nfs_buf_write_delayed(struct nfsbuf *bp, proc_t p) * If the vnode has "too many" write operations in progress * wait for them to finish the IO */ - (void)vnode_waitforwrites(vp, VNODE_ASYNC_THROTTLE, 0, 0, "nfs_buf_write_delayed"); + vnode_waitforwrites(NFSTOV(np), VNODE_ASYNC_THROTTLE, 0, 0, "nfs_buf_write_delayed"); + + /* the file is in a modified state, so make sure the flag's set */ + nfs_node_lock_force(np); + np->n_flag |= NMODIFIED; + nfs_node_unlock(np); /* - * If we have too many delayed write buffers, - * more than we can "safely" handle, just fall back to - * doing the async write + * If we have too many delayed write buffers, + * just fall back to doing the async write. */ if (nfs_nbdwrite < 0) panic("nfs_buf_write_delayed: Negative nfs_nbdwrite"); - - if (nfs_nbdwrite > ((nfsbufcnt/4)*3)) { + if (nfs_nbdwrite > NFS_A_LOT_OF_DELAYED_WRITES) { /* issue async write */ SET(bp->nb_flags, NB_ASYNC); nfs_buf_write(bp); FSDBG_BOT(551, bp, NBOFF(bp), bp->nb_flags, bp->nb_error); return; } - + /* Otherwise, the "write" is done, so mark and release the buffer. */ SET(bp->nb_flags, NB_DONE); nfs_buf_release(bp, 1); @@ -1229,6 +1277,34 @@ nfs_buf_write_delayed(struct nfsbuf *bp, proc_t p) return; } +/* + * Check that a "needcommit" buffer can still be committed. + * If the write verifier has changed, we need to clear the + * the needcommit flag. + */ +void +nfs_buf_check_write_verifier(nfsnode_t np, struct nfsbuf *bp) +{ + struct nfsmount *nmp; + + if (!ISSET(bp->nb_flags, NB_NEEDCOMMIT)) + return; + + nmp = NFSTONMP(np); + if (!nmp) + return; + if (!ISSET(bp->nb_flags, NB_STALEWVERF) && (bp->nb_verf == nmp->nm_verf)) + return; + + /* write verifier changed, clear commit/wverf flags */ + CLR(bp->nb_flags, (NB_NEEDCOMMIT | NB_STALEWVERF)); + bp->nb_verf = 0; + nfs_node_lock_force(np); + np->n_needcommitcnt--; + CHECK_NEEDCOMMITCNT(np); + nfs_node_unlock(np); +} + /* * add a reference to a buffer so it doesn't disappear while being used * (must be called with nfs_buf_mutex held) @@ -1260,7 +1336,7 @@ nfs_buf_acquire(struct nfsbuf *bp, int flags, int slpflag, int slptimeo) if (ISSET(bp->nb_lflags, NBL_BUSY)) { /* - * since the mutex_lock may block, the buffer + * since the lck_mtx_lock may block, the buffer * may become BUSY, so we need to recheck for * a NOWAIT request */ @@ -1268,15 +1344,15 @@ nfs_buf_acquire(struct nfsbuf *bp, int flags, int slpflag, int slptimeo) return (EBUSY); SET(bp->nb_lflags, NBL_WANTED); - ts.tv_sec = (slptimeo/100); - /* the hz value is 100; which leads to 10ms */ - ts.tv_nsec = (slptimeo % 100) * 10 * NSEC_PER_USEC * 1000; + ts.tv_sec = (slptimeo/100); + /* the hz value is 100; which leads to 10ms */ + ts.tv_nsec = (slptimeo % 100) * 10 * NSEC_PER_USEC * 1000; - error = msleep(bp, nfs_buf_mutex, slpflag | (PRIBIO + 1), - "nfs_buf_acquire", &ts); - if (error) - return (error); - return (EAGAIN); + error = msleep(bp, nfs_buf_mutex, slpflag | (PRIBIO + 1), + "nfs_buf_acquire", &ts); + if (error) + return (error); + return (EAGAIN); } if (flags & NBAC_REMOVE) nfs_buf_remfree(bp); @@ -1297,10 +1373,7 @@ nfs_buf_drop(struct nfsbuf *bp) if (!ISSET(bp->nb_lflags, NBL_BUSY)) panic("nfs_buf_drop: buffer not busy!"); if (ISSET(bp->nb_lflags, NBL_WANTED)) { - /* - * delay the actual wakeup until after we - * clear NBL_BUSY and we've dropped nfs_buf_mutex - */ + /* delay the actual wakeup until after we clear NBL_BUSY */ need_wakeup = 1; } /* Unlock the buffer. */ @@ -1316,7 +1389,7 @@ nfs_buf_drop(struct nfsbuf *bp) * (must be called with nfs_buf_mutex held) */ int -nfs_buf_iterprepare(struct nfsnode *np, struct nfsbuflists *iterheadp, int flags) +nfs_buf_iterprepare(nfsnode_t np, struct nfsbuflists *iterheadp, int flags) { struct nfsbuflists *listheadp; @@ -1332,7 +1405,7 @@ nfs_buf_iterprepare(struct nfsnode *np, struct nfsbuflists *iterheadp, int flags while (np->n_bufiterflags & NBI_ITER) { np->n_bufiterflags |= NBI_ITERWANT; - msleep(&np->n_bufiterflags, nfs_buf_mutex, 0, "nfs_buf_iterprepare", 0); + msleep(&np->n_bufiterflags, nfs_buf_mutex, 0, "nfs_buf_iterprepare", NULL); } if (LIST_EMPTY(listheadp)) { LIST_INIT(iterheadp); @@ -1348,12 +1421,12 @@ nfs_buf_iterprepare(struct nfsnode *np, struct nfsbuflists *iterheadp, int flags } /* - * cleanup after iterating over an nfsnode's buffer list + * clean up after iterating over an nfsnode's buffer list * this lock protects the queue manipulation * (must be called with nfs_buf_mutex held) */ void -nfs_buf_itercomplete(struct nfsnode *np, struct nfsbuflists *iterheadp, int flags) +nfs_buf_itercomplete(nfsnode_t np, struct nfsbuflists *iterheadp, int flags) { struct nfsbuflists * listheadp; struct nfsbuf *bp; @@ -1378,265 +1451,652 @@ nfs_buf_itercomplete(struct nfsnode *np, struct nfsbuflists *iterheadp, int flag /* - * Vnode op for read using bio - * Any similarity to readip() is purely coincidental + * Read an NFS buffer for a file. */ int -nfs_bioread( - vnode_t vp, - struct uio *uio, - __unused int ioflag, - kauth_cred_t cred, - proc_t p) +nfs_buf_read(struct nfsbuf *bp) { - struct nfsnode *np = VTONFS(vp); - int biosize; - off_t diff; - struct nfsbuf *bp = NULL, *rabp; - struct nfs_vattr nvattr; - struct nfsmount *nmp = VFSTONFS(vnode_mount(vp)); - daddr64_t lbn, rabn, lastrabn = -1, tlbn; - int bufsize; - int nra, error = 0, n = 0, on = 0; - caddr_t dp; - struct dirent *direntp = NULL; - enum vtype vtype; - int nocachereadahead = 0; - - FSDBG_TOP(514, vp, uio->uio_offset, uio_uio_resid(uio), ioflag); - -#if DIAGNOSTIC - if (uio->uio_rw != UIO_READ) - panic("nfs_read mode"); -#endif - if (uio_uio_resid(uio) == 0) { - FSDBG_BOT(514, vp, 0xd1e0001, 0, 0); - return (0); - } - if (uio->uio_offset < 0) { - FSDBG_BOT(514, vp, 0xd1e0002, 0, EINVAL); - return (EINVAL); - } + int error = 0; + nfsnode_t np; + thread_t thd; + kauth_cred_t cred; + + np = bp->nb_np; + cred = bp->nb_rcred; + if (IS_VALID_CRED(cred)) + kauth_cred_ref(cred); + thd = ISSET(bp->nb_flags, NB_ASYNC) ? NULL : current_thread(); + + /* sanity checks */ + if (!ISSET(bp->nb_flags, NB_READ)) + panic("nfs_buf_read: !NB_READ"); + if (ISSET(bp->nb_flags, NB_DONE)) + CLR(bp->nb_flags, NB_DONE); + + NFS_BUF_MAP(bp); + + OSAddAtomic64(1, &nfsstats.read_bios); - if ((nmp->nm_flag & NFSMNT_NFSV3) && !(nmp->nm_state & NFSSTA_GOTFSINFO)) - nfs_fsinfo(nmp, vp, cred, p); - biosize = vfs_statfs(vnode_mount(vp))->f_iosize; - vtype = vnode_vtype(vp); + error = nfs_buf_read_rpc(bp, thd, cred); /* - * For nfs, cache consistency can only be maintained approximately. - * Although RFC1094 does not specify the criteria, the following is - * believed to be compatible with the reference port. - * For nfs: - * If the file's modify time on the server has changed since the - * last read rpc or you have written to the file, - * you may have lost data cache consistency with the - * server, so flush all of the file's data out of the cache. - * Then force a getattr rpc to ensure that you have up to date - * attributes. - * NB: This implies that cache data can be read when up to - * NFS_MAXATTRTIMEO seconds out of date. If you find that you need - * current attributes this could be forced by setting calling - * NATTRINVALIDATE() before the nfs_getattr() call. + * For async I/O, the callbacks will finish up the + * read. Otherwise, the read has already been finished. */ - if (np->n_flag & NNEEDINVALIDATE) { - np->n_flag &= ~NNEEDINVALIDATE; - nfs_vinvalbuf(vp, V_SAVE|V_IGNORE_WRITEERR, cred, p, 1); - } - if (np->n_flag & NMODIFIED) { - if (vtype != VREG) { - if (vtype != VDIR) - panic("nfs: bioread, not dir"); - nfs_invaldir(vp); - error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); - if (error) { - FSDBG_BOT(514, vp, 0xd1e0003, 0, error); - return (error); - } - } - NATTRINVALIDATE(np); - error = nfs_getattr(vp, &nvattr, cred, p); - if (error) { - FSDBG_BOT(514, vp, 0xd1e0004, 0, error); - return (error); + + if (IS_VALID_CRED(cred)) + kauth_cred_unref(&cred); + return (error); +} + +/* + * finish the reading of a buffer + */ +void +nfs_buf_read_finish(struct nfsbuf *bp) +{ + nfsnode_t np = bp->nb_np; + struct nfsmount *nmp; + + if (!ISSET(bp->nb_flags, NB_ERROR)) { + /* update valid range */ + bp->nb_validoff = 0; + bp->nb_validend = bp->nb_endio; + if (bp->nb_endio < (int)bp->nb_bufsize) { + /* + * The read may be short because we have unflushed writes + * that are extending the file size and the reads hit the + * (old) EOF on the server. So, just make sure nb_validend + * correctly tracks EOF. + * Note that the missing data should have already been zeroed + * in nfs_buf_read_rpc_finish(). + */ + off_t boff = NBOFF(bp); + if ((off_t)np->n_size >= (boff + bp->nb_bufsize)) + bp->nb_validend = bp->nb_bufsize; + else if ((off_t)np->n_size >= boff) + bp->nb_validend = np->n_size - boff; + else + bp->nb_validend = 0; } - if (vtype == VDIR) { - /* if directory changed, purge any name cache entries */ - if (nfstimespeccmp(&np->n_ncmtime, &nvattr.nva_mtime, !=)) - cache_purge(vp); - np->n_ncmtime = nvattr.nva_mtime; + if ((nmp = NFSTONMP(np)) && (nmp->nm_vers == NFS_VER2) && + ((NBOFF(bp) + bp->nb_validend) > 0x100000000LL)) + bp->nb_validend = 0x100000000LL - NBOFF(bp); + bp->nb_valid = (1 << (round_page_32(bp->nb_validend) / PAGE_SIZE)) - 1; + if (bp->nb_validend & PAGE_MASK) { + /* zero-fill remainder of last page */ + bzero(bp->nb_data + bp->nb_validend, PAGE_SIZE - (bp->nb_validend & PAGE_MASK)); } - np->n_mtime = nvattr.nva_mtime; - } else { - error = nfs_getattr(vp, &nvattr, cred, p); - if (error) { - FSDBG_BOT(514, vp, 0xd1e0005, 0, error); + } + nfs_buf_iodone(bp); +} + +/* + * initiate the NFS READ RPC(s) for a buffer + */ +int +nfs_buf_read_rpc(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred) +{ + struct nfsmount *nmp; + nfsnode_t np = bp->nb_np; + int error = 0, nfsvers, async; + int offset, nrpcs; + uint32_t nmrsize, length, len; + off_t boff; + struct nfsreq *req; + struct nfsreq_cbinfo cb; + + nmp = NFSTONMP(np); + if (!nmp) { + bp->nb_error = error = ENXIO; + SET(bp->nb_flags, NB_ERROR); + nfs_buf_iodone(bp); + return (error); + } + nfsvers = nmp->nm_vers; + nmrsize = nmp->nm_rsize; + + boff = NBOFF(bp); + offset = 0; + length = bp->nb_bufsize; + + if (nfsvers == NFS_VER2) { + if (boff > 0xffffffffLL) { + bp->nb_error = error = EFBIG; + SET(bp->nb_flags, NB_ERROR); + nfs_buf_iodone(bp); return (error); } - if (nfstimespeccmp(&np->n_mtime, &nvattr.nva_mtime, !=)) { - if (vtype == VDIR) { - nfs_invaldir(vp); - /* purge name cache entries */ - if (nfstimespeccmp(&np->n_ncmtime, &nvattr.nva_mtime, !=)) - cache_purge(vp); - } - error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); - if (error) { - FSDBG_BOT(514, vp, 0xd1e0006, 0, error); - return (error); - } - if (vtype == VDIR) - np->n_ncmtime = nvattr.nva_mtime; - np->n_mtime = nvattr.nva_mtime; - } + if ((boff + length - 1) > 0xffffffffLL) + length = 0x100000000LL - boff; } - if (vnode_isnocache(vp)) { - if (!(np->n_flag & NNOCACHE)) { - if (NVALIDBUFS(np)) { - error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); - if (error) { - FSDBG_BOT(514, vp, 0xd1e000a, 0, error); - return (error); - } - } - np->n_flag |= NNOCACHE; - } - } else if (np->n_flag & NNOCACHE) { - np->n_flag &= ~NNOCACHE; + /* Note: Can only do async I/O if nfsiods are configured. */ + async = (bp->nb_flags & NB_ASYNC); + cb.rcb_func = async ? nfs_buf_read_rpc_finish : NULL; + cb.rcb_bp = bp; + + bp->nb_offio = bp->nb_endio = 0; + bp->nb_rpcs = nrpcs = (length + nmrsize - 1) / nmrsize; + if (async && (nrpcs > 1)) { + SET(bp->nb_flags, NB_MULTASYNCRPC); + } else { + CLR(bp->nb_flags, NB_MULTASYNCRPC); } - do { - if (np->n_flag & NNOCACHE) { - switch (vtype) { - case VREG: - /* - * If we have only a block or so to read, - * just do the rpc directly. - * If we have a couple blocks or more to read, - * then we'll take advantage of readahead within - * this loop to try to fetch all the data in parallel - */ - if (!nocachereadahead && (uio_uio_resid(uio) < 2*biosize)) { - error = nfs_readrpc(vp, uio, cred, p); - FSDBG_BOT(514, vp, uio->uio_offset, uio_uio_resid(uio), error); - return (error); - } - nocachereadahead = 1; + while (length > 0) { + if (ISSET(bp->nb_flags, NB_ERROR)) { + error = bp->nb_error; break; - case VLNK: - error = nfs_readlinkrpc(vp, uio, cred, p); - FSDBG_BOT(514, vp, uio->uio_offset, uio_uio_resid(uio), error); - return (error); - case VDIR: + } + len = (length > nmrsize) ? nmrsize : length; + cb.rcb_args[0] = offset; + cb.rcb_args[1] = len; + if (nmp->nm_vers >= NFS_VER4) + cb.rcb_args[2] = nmp->nm_stategenid; + req = NULL; + error = nmp->nm_funcs->nf_read_rpc_async(np, boff + offset, len, thd, cred, &cb, &req); + if (error) + break; + offset += len; + length -= len; + if (async) + continue; + nfs_buf_read_rpc_finish(req); + if (ISSET(bp->nb_flags, NB_ERROR)) { + error = bp->nb_error; break; - default: - printf(" NFSNOCACHE: type %x unexpected\n", vtype); - }; - } - switch (vtype) { - case VREG: - lbn = uio->uio_offset / biosize; - - /* - * Copy directly from any cached pages without grabbing the bufs. - * - * Note: for "nocache" reads, we don't copy directly from UBC - * because any cached pages will be for readahead buffers that - * need to be invalidated anyway before we finish this request. - */ - if (!(np->n_flag & NNOCACHE) && - (uio->uio_segflg == UIO_USERSPACE32 || - uio->uio_segflg == UIO_USERSPACE64 || - uio->uio_segflg == UIO_USERSPACE)) { - // LP64todo - fix this! - int io_resid = uio_uio_resid(uio); - diff = np->n_size - uio->uio_offset; - if (diff < io_resid) - io_resid = diff; - if (io_resid > 0) { - error = cluster_copy_ubc_data(vp, uio, &io_resid, 0); - if (error) { - FSDBG_BOT(514, vp, uio->uio_offset, 0xcacefeed, error); - return (error); - } - } - /* count any biocache reads that we just copied directly */ - if (lbn != uio->uio_offset / biosize) { - OSAddAtomic((uio->uio_offset / biosize) - lbn, (SInt32*)&nfsstats.biocache_reads); - FSDBG(514, vp, 0xcacefeed, uio->uio_offset, error); - } } + } - lbn = uio->uio_offset / biosize; - on = uio->uio_offset % biosize; - + if (length > 0) { /* - * Start the read ahead(s), as required. + * Something bad happened while trying to send the RPC(s). + * Wait for any outstanding requests to complete. */ - if (nfs_numasync > 0 && nmp->nm_readahead > 0) { - for (nra = 0; nra < nmp->nm_readahead; nra++) { - rabn = lbn + 1 + nra; - if (rabn <= lastrabn) { - /* we've already (tried to) read this block */ - /* no need to try it again... */ - continue; - } - lastrabn = rabn; - if ((off_t)rabn * biosize >= (off_t)np->n_size) - break; - if ((np->n_flag & NNOCACHE) && - (((off_t)rabn * biosize) >= (uio->uio_offset + uio_uio_resid(uio)))) - /* for uncached readahead, don't go beyond end of request */ - break; - /* check if block exists and is valid. */ - error = nfs_buf_get(vp, rabn, biosize, p, NBLK_READ|NBLK_NOWAIT, &rabp); - if (error) { - FSDBG_BOT(514, vp, 0xd1e000b, 1, error); - return (error); - } - if (!rabp) - continue; - if (nfs_buf_upl_valid_range(rabp, 0, rabp->nb_bufsize)) { - nfs_buf_release(rabp, 1); - continue; - } - if (!ISSET(rabp->nb_flags, (NB_CACHE|NB_DELWRI))) { - SET(rabp->nb_flags, (NB_READ|NB_ASYNC)); - if (nfs_asyncio(rabp, cred)) { - SET(rabp->nb_flags, (NB_INVAL|NB_ERROR)); - rabp->nb_error = EIO; - nfs_buf_release(rabp, 1); - } - } else - nfs_buf_release(rabp, 1); + bp->nb_error = error; + SET(bp->nb_flags, NB_ERROR); + if (ISSET(bp->nb_flags, NB_MULTASYNCRPC)) { + nrpcs = (length + nmrsize - 1) / nmrsize; + lck_mtx_lock(nfs_buf_mutex); + bp->nb_rpcs -= nrpcs; + if (bp->nb_rpcs == 0) { + /* No RPCs left, so the buffer's done */ + lck_mtx_unlock(nfs_buf_mutex); + nfs_buf_iodone(bp); + } else { + /* wait for the last RPC to mark it done */ + while (bp->nb_rpcs > 0) + msleep(&bp->nb_rpcs, nfs_buf_mutex, 0, + "nfs_buf_read_rpc_cancel", NULL); + lck_mtx_unlock(nfs_buf_mutex); } + } else { + nfs_buf_iodone(bp); } + } - if ((uio_uio_resid(uio) <= 0) || (uio->uio_offset >= (off_t)np->n_size)) { - FSDBG_BOT(514, vp, uio->uio_offset, uio_uio_resid(uio), 0xaaaaaaaa); - return (0); - } - - OSAddAtomic(1, (SInt32*)&nfsstats.biocache_reads); + return (error); +} - /* - * If the block is in the cache and has the required data - * in a valid region, just copy it out. - * Otherwise, get the block and write back/read in, +/* + * finish up an NFS READ RPC on a buffer + */ +void +nfs_buf_read_rpc_finish(struct nfsreq *req) +{ + struct nfsmount *nmp; + size_t rlen; + struct nfsreq_cbinfo cb; + struct nfsbuf *bp; + int error = 0, nfsvers, offset, length, eof = 0, multasyncrpc, finished; + void *wakeme = NULL; + struct nfsreq *rreq = NULL; + nfsnode_t np; + thread_t thd; + kauth_cred_t cred; + uio_t auio; + char uio_buf [ UIO_SIZEOF(1) ]; + +finish: + np = req->r_np; + thd = req->r_thread; + cred = req->r_cred; + if (IS_VALID_CRED(cred)) + kauth_cred_ref(cred); + cb = req->r_callback; + bp = cb.rcb_bp; + if (cb.rcb_func) /* take an extra reference on the nfsreq in case we want to resend it later due to grace error */ + nfs_request_ref(req, 0); + + nmp = NFSTONMP(np); + if (!nmp) { + SET(bp->nb_flags, NB_ERROR); + bp->nb_error = error = ENXIO; + } + if (error || ISSET(bp->nb_flags, NB_ERROR)) { + /* just drop it */ + nfs_request_async_cancel(req); + goto out; + } + + nfsvers = nmp->nm_vers; + offset = cb.rcb_args[0]; + rlen = length = cb.rcb_args[1]; + + auio = uio_createwithbuffer(1, NBOFF(bp) + offset, UIO_SYSSPACE, + UIO_READ, &uio_buf, sizeof(uio_buf)); + uio_addiov(auio, CAST_USER_ADDR_T(bp->nb_data + offset), length); + + /* finish the RPC */ + error = nmp->nm_funcs->nf_read_rpc_async_finish(np, req, auio, &rlen, &eof); + if ((error == EINPROGRESS) && cb.rcb_func) { + /* async request restarted */ + if (cb.rcb_func) + nfs_request_rele(req); + if (IS_VALID_CRED(cred)) + kauth_cred_unref(&cred); + return; + } + if ((nmp->nm_vers >= NFS_VER4) && nfs_mount_state_error_should_restart(error) && !ISSET(bp->nb_flags, NB_ERROR)) { + lck_mtx_lock(&nmp->nm_lock); + if ((error != NFSERR_OLD_STATEID) && (error != NFSERR_GRACE) && (cb.rcb_args[2] == nmp->nm_stategenid)) { + NP(np, "nfs_buf_read_rpc_finish: error %d @ 0x%llx, 0x%x 0x%x, initiating recovery", + error, NBOFF(bp)+offset, cb.rcb_args[2], nmp->nm_stategenid); + nfs_need_recover(nmp, error); + } + lck_mtx_unlock(&nmp->nm_lock); + if (np->n_flag & NREVOKE) { + error = EIO; + } else { + if (error == NFSERR_GRACE) { + if (cb.rcb_func) { + /* + * For an async I/O request, handle a grace delay just like + * jukebox errors. Set the resend time and queue it up. + */ + struct timeval now; + if (req->r_nmrep.nmc_mhead) { + mbuf_freem(req->r_nmrep.nmc_mhead); + req->r_nmrep.nmc_mhead = NULL; + } + req->r_error = 0; + microuptime(&now); + lck_mtx_lock(&req->r_mtx); + req->r_resendtime = now.tv_sec + 2; + req->r_xid = 0; // get a new XID + req->r_flags |= R_RESTART; + req->r_start = 0; + nfs_asyncio_resend(req); + lck_mtx_unlock(&req->r_mtx); + if (IS_VALID_CRED(cred)) + kauth_cred_unref(&cred); + /* Note: nfsreq reference taken will be dropped later when finished */ + return; + } + /* otherwise, just pause a couple seconds and retry */ + tsleep(&nmp->nm_state, (PZERO-1), "nfsgrace", 2*hz); + } + if (!(error = nfs_mount_state_wait_for_recovery(nmp))) { + rlen = 0; + goto readagain; + } + } + } + if (error) { + SET(bp->nb_flags, NB_ERROR); + bp->nb_error = error; + goto out; + } + + if ((rlen > 0) && (bp->nb_endio < (offset + (int)rlen))) + bp->nb_endio = offset + rlen; + + if ((nfsvers == NFS_VER2) || eof || (rlen == 0)) { + /* zero out the remaining data (up to EOF) */ + off_t rpcrem, eofrem, rem; + rpcrem = (length - rlen); + eofrem = np->n_size - (NBOFF(bp) + offset + rlen); + rem = (rpcrem < eofrem) ? rpcrem : eofrem; + if (rem > 0) + bzero(bp->nb_data + offset + rlen, rem); + } else if (((int)rlen < length) && !ISSET(bp->nb_flags, NB_ERROR)) { + /* + * short read + * + * We haven't hit EOF and we didn't get all the data + * requested, so we need to issue another read for the rest. + * (Don't bother if the buffer already hit an error.) + */ +readagain: + offset += rlen; + length -= rlen; + cb.rcb_args[0] = offset; + cb.rcb_args[1] = length; + if (nmp->nm_vers >= NFS_VER4) + cb.rcb_args[2] = nmp->nm_stategenid; + error = nmp->nm_funcs->nf_read_rpc_async(np, NBOFF(bp) + offset, length, thd, cred, &cb, &rreq); + if (!error) { + if (IS_VALID_CRED(cred)) + kauth_cred_unref(&cred); + if (!cb.rcb_func) { + /* if !async we'll need to wait for this RPC to finish */ + req = rreq; + rreq = NULL; + goto finish; + } + nfs_request_rele(req); + /* + * We're done here. + * Outstanding RPC count is unchanged. + * Callback will be called when RPC is done. + */ + return; + } + SET(bp->nb_flags, NB_ERROR); + bp->nb_error = error; + } + +out: + if (cb.rcb_func) + nfs_request_rele(req); + if (IS_VALID_CRED(cred)) + kauth_cred_unref(&cred); + + /* + * Decrement outstanding RPC count on buffer + * and call nfs_buf_read_finish on last RPC. + * + * (Note: when there are multiple async RPCs issued for a + * buffer we need nfs_buffer_mutex to avoid problems when + * aborting a partially-initiated set of RPCs) + */ + + multasyncrpc = ISSET(bp->nb_flags, NB_MULTASYNCRPC); + if (multasyncrpc) + lck_mtx_lock(nfs_buf_mutex); + + bp->nb_rpcs--; + finished = (bp->nb_rpcs == 0); + + if (multasyncrpc) + lck_mtx_unlock(nfs_buf_mutex); + + if (finished) { + if (multasyncrpc) + wakeme = &bp->nb_rpcs; + nfs_buf_read_finish(bp); + if (wakeme) + wakeup(wakeme); + } +} + +/* + * Do buffer readahead. + * Initiate async I/O to read buffers not in cache. + */ +int +nfs_buf_readahead(nfsnode_t np, int ioflag, daddr64_t *rabnp, daddr64_t lastrabn, thread_t thd, kauth_cred_t cred) +{ + struct nfsmount *nmp = NFSTONMP(np); + struct nfsbuf *bp; + int error = 0; + uint32_t nra; + + if (!nmp) + return (ENXIO); + if (nmp->nm_readahead <= 0) + return (0); + if (*rabnp > lastrabn) + return (0); + + for (nra = 0; (nra < nmp->nm_readahead) && (*rabnp <= lastrabn); nra++, *rabnp = *rabnp + 1) { + /* check if block exists and is valid. */ + if ((*rabnp * nmp->nm_biosize) >= (off_t)np->n_size) { + /* stop reading ahead if we're beyond EOF */ + *rabnp = lastrabn; + break; + } + error = nfs_buf_get(np, *rabnp, nmp->nm_biosize, thd, NBLK_READ|NBLK_NOWAIT, &bp); + if (error) + break; + nfs_node_lock_force(np); + np->n_lastrahead = *rabnp; + nfs_node_unlock(np); + if (!bp) + continue; + if ((ioflag & IO_NOCACHE) && ISSET(bp->nb_flags, NB_CACHE) && + !bp->nb_dirty && !ISSET(bp->nb_flags, (NB_DELWRI|NB_NCRDAHEAD))) { + CLR(bp->nb_flags, NB_CACHE); + bp->nb_valid = 0; + bp->nb_validoff = bp->nb_validend = -1; + } + if ((bp->nb_dirtyend <= 0) && !bp->nb_dirty && + !ISSET(bp->nb_flags, (NB_CACHE|NB_DELWRI))) { + SET(bp->nb_flags, (NB_READ|NB_ASYNC)); + if (ioflag & IO_NOCACHE) + SET(bp->nb_flags, NB_NCRDAHEAD); + if (!IS_VALID_CRED(bp->nb_rcred) && IS_VALID_CRED(cred)) { + kauth_cred_ref(cred); + bp->nb_rcred = cred; + } + if ((error = nfs_buf_read(bp))) + break; + continue; + } + nfs_buf_release(bp, 1); + } + return (error); +} + +/* + * NFS buffer I/O for reading files. + */ +int +nfs_bioread(nfsnode_t np, uio_t uio, int ioflag, vfs_context_t ctx) +{ + vnode_t vp = NFSTOV(np); + struct nfsbuf *bp = NULL; + struct nfsmount *nmp = VTONMP(vp); + daddr64_t lbn, rabn = 0, lastrabn, maxrabn = -1; + off_t diff; + int error = 0, n = 0, on = 0; + int nfsvers, biosize, modified, readaheads = 0; + thread_t thd; + kauth_cred_t cred; + int64_t io_resid; + + FSDBG_TOP(514, np, uio_offset(uio), uio_resid(uio), ioflag); + + nfsvers = nmp->nm_vers; + biosize = nmp->nm_biosize; + thd = vfs_context_thread(ctx); + cred = vfs_context_ucred(ctx); + + if (vnode_vtype(vp) != VREG) { + printf("nfs_bioread: type %x unexpected\n", vnode_vtype(vp)); + FSDBG_BOT(514, np, 0xd1e0016, 0, EINVAL); + return (EINVAL); + } + + /* + * For NFS, cache consistency can only be maintained approximately. + * Although RFC1094 does not specify the criteria, the following is + * believed to be compatible with the reference port. + * + * If the file has changed since the last read RPC or you have + * written to the file, you may have lost data cache consistency + * with the server. So, check for a change, and flush all of the + * file's data out of the cache. + * NB: This implies that cache data can be read when up to + * NFS_MAXATTRTIMO seconds out of date. If you find that you + * need current attributes, nfs_getattr() can be forced to fetch + * new attributes (via NATTRINVALIDATE() or NGA_UNCACHED). + */ + + if (ISSET(np->n_flag, NUPDATESIZE)) + nfs_data_update_size(np, 0); + + if ((error = nfs_node_lock(np))) { + FSDBG_BOT(514, np, 0xd1e0222, 0, error); + return (error); + } + + if (np->n_flag & NNEEDINVALIDATE) { + np->n_flag &= ~NNEEDINVALIDATE; + nfs_node_unlock(np); + error = nfs_vinvalbuf(vp, V_SAVE|V_IGNORE_WRITEERR, ctx, 1); + if (!error) + error = nfs_node_lock(np); + if (error) { + FSDBG_BOT(514, np, 0xd1e0322, 0, error); + return (error); + } + } + + modified = (np->n_flag & NMODIFIED); + nfs_node_unlock(np); + /* nfs_getattr() will check changed and purge caches */ + error = nfs_getattr(np, NULL, ctx, modified ? NGA_UNCACHED : NGA_CACHED); + if (error) { + FSDBG_BOT(514, np, 0xd1e0004, 0, error); + return (error); + } + + if (uio_resid(uio) == 0) { + FSDBG_BOT(514, np, 0xd1e0001, 0, 0); + return (0); + } + if (uio_offset(uio) < 0) { + FSDBG_BOT(514, np, 0xd1e0002, 0, EINVAL); + return (EINVAL); + } + + /* + * set up readahead - which may be limited by: + * + current request length (for IO_NOCACHE) + * + readahead setting + * + file size + */ + if (nmp->nm_readahead > 0) { + off_t end = uio_offset(uio) + uio_resid(uio); + if (end > (off_t)np->n_size) + end = np->n_size; + rabn = uio_offset(uio) / biosize; + maxrabn = (end - 1) / biosize; + nfs_node_lock_force(np); + if (!(ioflag & IO_NOCACHE) && + (!rabn || (rabn == np->n_lastread) || (rabn == (np->n_lastread+1)))) { + maxrabn += nmp->nm_readahead; + if ((maxrabn * biosize) >= (off_t)np->n_size) + maxrabn = ((off_t)np->n_size - 1)/biosize; + } + if (maxrabn < np->n_lastrahead) + np->n_lastrahead = -1; + if (rabn < np->n_lastrahead) + rabn = np->n_lastrahead + 1; + nfs_node_unlock(np); + } else { + rabn = maxrabn = 0; + } + + do { + + nfs_data_lock(np, NFS_DATA_LOCK_SHARED); + lbn = uio_offset(uio) / biosize; + + /* + * Copy directly from any cached pages without grabbing the bufs. + * (If we are NOCACHE and we've issued readahead requests, we need + * to grab the NB_NCRDAHEAD bufs to drop them.) + */ + if ((!(ioflag & IO_NOCACHE) || !readaheads) && + ((uio->uio_segflg == UIO_USERSPACE32 || + uio->uio_segflg == UIO_USERSPACE64 || + uio->uio_segflg == UIO_USERSPACE))) { + io_resid = uio_resid(uio); + diff = np->n_size - uio_offset(uio); + if (diff < io_resid) + io_resid = diff; + if (io_resid > 0) { + int count = (io_resid > INT_MAX) ? INT_MAX : io_resid; + error = cluster_copy_ubc_data(vp, uio, &count, 0); + if (error) { + nfs_data_unlock(np); + FSDBG_BOT(514, np, uio_offset(uio), 0xcacefeed, error); + return (error); + } + } + /* count any biocache reads that we just copied directly */ + if (lbn != (uio_offset(uio)/biosize)) { + OSAddAtomic64((uio_offset(uio)/biosize) - lbn, &nfsstats.biocache_reads); + FSDBG(514, np, 0xcacefeed, uio_offset(uio), error); + } + } + + lbn = uio_offset(uio) / biosize; + on = uio_offset(uio) % biosize; + nfs_node_lock_force(np); + np->n_lastread = (uio_offset(uio) - 1) / biosize; + nfs_node_unlock(np); + + if ((uio_resid(uio) <= 0) || (uio_offset(uio) >= (off_t)np->n_size)) { + nfs_data_unlock(np); + FSDBG_BOT(514, np, uio_offset(uio), uio_resid(uio), 0xaaaaaaaa); + return (0); + } + + /* adjust readahead block number, if necessary */ + if (rabn < lbn) + rabn = lbn; + lastrabn = MIN(maxrabn, lbn + nmp->nm_readahead); + if (rabn <= lastrabn) { /* start readaheads */ + error = nfs_buf_readahead(np, ioflag, &rabn, lastrabn, thd, cred); + if (error) { + nfs_data_unlock(np); + FSDBG_BOT(514, np, 0xd1e000b, 1, error); + return (error); + } + readaheads = 1; + } + + OSAddAtomic64(1, &nfsstats.biocache_reads); + + /* + * If the block is in the cache and has the required data + * in a valid region, just copy it out. + * Otherwise, get the block and write back/read in, * as required. */ again: - bufsize = biosize; - // LP64todo - fix this! - n = min((unsigned)(bufsize - on), uio_uio_resid(uio)); - diff = np->n_size - uio->uio_offset; + io_resid = uio_resid(uio); + n = (io_resid > (biosize - on)) ? (biosize - on) : io_resid; + diff = np->n_size - uio_offset(uio); if (diff < n) n = diff; - error = nfs_buf_get(vp, lbn, bufsize, p, NBLK_READ, &bp); + error = nfs_buf_get(np, lbn, biosize, thd, NBLK_READ, &bp); if (error) { - FSDBG_BOT(514, vp, 0xd1e000c, 0, EINTR); - return (EINTR); + nfs_data_unlock(np); + FSDBG_BOT(514, np, 0xd1e000c, 0, error); + return (error); + } + + if ((ioflag & IO_NOCACHE) && ISSET(bp->nb_flags, NB_CACHE)) { + /* + * IO_NOCACHE found a cached buffer. + * Flush the buffer if it's dirty. + * Invalidate the data if it wasn't just read + * in as part of a "nocache readahead". + */ + if (bp->nb_dirty || (bp->nb_dirtyend > 0)) { + /* so write the buffer out and try again */ + SET(bp->nb_flags, NB_NOCACHE); + goto flushbuffer; + } + if (ISSET(bp->nb_flags, NB_NCRDAHEAD)) { + CLR(bp->nb_flags, NB_NCRDAHEAD); + SET(bp->nb_flags, NB_NOCACHE); + } } /* if any pages are valid... */ @@ -1668,24 +2128,27 @@ again: } /* there are invalid pages in the read range */ - if ((dirtypg > firstpg) && (dirtypg < lastpg)) { - /* there are also dirty page(s) in the range, */ + if (((dirtypg > firstpg) && (dirtypg < lastpg)) || + (((firstpg*PAGE_SIZE) < bp->nb_dirtyend) && (((lastpg+1)*PAGE_SIZE) > bp->nb_dirtyoff))) { + /* there are also dirty page(s) (or range) in the read range, */ /* so write the buffer out and try again */ +flushbuffer: CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL)); SET(bp->nb_flags, NB_ASYNC); - if (bp->nb_wcred == NOCRED) { + if (!IS_VALID_CRED(bp->nb_wcred)) { kauth_cred_ref(cred); bp->nb_wcred = cred; } error = nfs_buf_write(bp); if (error) { - FSDBG_BOT(514, vp, 0xd1e000d, 0, error); + nfs_data_unlock(np); + FSDBG_BOT(514, np, 0xd1e000d, 0, error); return (error); } goto again; } if (!bp->nb_dirty && bp->nb_dirtyend <= 0 && - (lastpg - firstpg + 1) > (bufsize/PAGE_SIZE)/2) { + (lastpg - firstpg + 1) > (biosize/PAGE_SIZE)/2) { /* we need to read in more than half the buffer and the */ /* buffer's not dirty, so just fetch the whole buffer */ bp->nb_valid = 0; @@ -1695,20 +2158,21 @@ again: char uio_buf[ UIO_SIZEOF(1) ]; NFS_BUF_MAP(bp); - auio = uio_createwithbuffer(1, (NBOFF(bp) + firstpg * PAGE_SIZE_64), + auio = uio_createwithbuffer(1, (NBOFF(bp) + firstpg * PAGE_SIZE_64), UIO_SYSSPACE, UIO_READ, &uio_buf[0], sizeof(uio_buf)); if (!auio) { error = ENOMEM; } else { - uio_addiov(auio, CAST_USER_ADDR_T((bp->nb_data + firstpg * PAGE_SIZE)), + uio_addiov(auio, CAST_USER_ADDR_T(bp->nb_data + (firstpg * PAGE_SIZE)), ((lastpg - firstpg + 1) * PAGE_SIZE)); - error = nfs_readrpc(vp, auio, cred, p); + error = nfs_read_rpc(np, auio, ctx); } if (error) { - if (np->n_flag & NNOCACHE) + if (ioflag & IO_NOCACHE) SET(bp->nb_flags, NB_NOCACHE); nfs_buf_release(bp, 1); - FSDBG_BOT(514, vp, 0xd1e000e, 0, error); + nfs_data_unlock(np); + FSDBG_BOT(514, np, 0xd1e000e, 0, error); return (error); } /* Make sure that the valid range is set to cover this read. */ @@ -1727,14 +2191,19 @@ again: } /* if no pages are valid, read the whole block */ if (!bp->nb_valid) { + if (!IS_VALID_CRED(bp->nb_rcred) && IS_VALID_CRED(cred)) { + kauth_cred_ref(cred); + bp->nb_rcred = cred; + } SET(bp->nb_flags, NB_READ); CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL)); - error = nfs_doio(bp, cred, p); + error = nfs_buf_read(bp); + if (ioflag & IO_NOCACHE) + SET(bp->nb_flags, NB_NOCACHE); if (error) { - if (np->n_flag & NNOCACHE) - SET(bp->nb_flags, NB_NOCACHE); + nfs_data_unlock(np); nfs_buf_release(bp, 1); - FSDBG_BOT(514, vp, 0xd1e000f, 0, error); + FSDBG_BOT(514, np, 0xd1e000f, 0, error); return (error); } } @@ -1745,757 +2214,1261 @@ buffer_ready: if (diff < n) n = diff; } - if (n > 0) + if (n > 0) { NFS_BUF_MAP(bp); - break; - case VLNK: - OSAddAtomic(1, (SInt32*)&nfsstats.biocache_readlinks); - error = nfs_buf_get(vp, 0, NFS_MAXPATHLEN, p, NBLK_READ, &bp); - if (error) { - FSDBG_BOT(514, vp, 0xd1e0010, 0, error); - return (error); - } - if (!ISSET(bp->nb_flags, NB_CACHE)) { - SET(bp->nb_flags, NB_READ); - error = nfs_doio(bp, cred, p); - if (error) { - SET(bp->nb_flags, NB_ERROR); - nfs_buf_release(bp, 1); - FSDBG_BOT(514, vp, 0xd1e0011, 0, error); - return (error); - } - } - // LP64todo - fix this! - n = min(uio_uio_resid(uio), bp->nb_validend); - on = 0; - break; - case VDIR: - OSAddAtomic(1, (SInt32*)&nfsstats.biocache_readdirs); - if (np->n_direofoffset && uio->uio_offset >= np->n_direofoffset) { - FSDBG_BOT(514, vp, 0xde0f0001, 0, 0); - return (0); - } - lbn = uio->uio_offset / NFS_DIRBLKSIZ; - on = uio->uio_offset & (NFS_DIRBLKSIZ - 1); - error = nfs_buf_get(vp, lbn, NFS_DIRBLKSIZ, p, NBLK_READ, &bp); - if (error) { - FSDBG_BOT(514, vp, 0xd1e0012, 0, error); - return (error); - } - if (!ISSET(bp->nb_flags, NB_CACHE)) { - SET(bp->nb_flags, NB_READ); - error = nfs_doio(bp, cred, p); - if (error) { - nfs_buf_release(bp, 1); - } - while (error == NFSERR_BAD_COOKIE) { - nfs_invaldir(vp); - error = nfs_vinvalbuf(vp, 0, cred, p, 1); - /* - * Yuck! The directory has been modified on the - * server. The only way to get the block is by - * reading from the beginning to get all the - * offset cookies. - */ - for (tlbn = 0; tlbn <= lbn && !error; tlbn++) { - if (np->n_direofoffset - && (tlbn * NFS_DIRBLKSIZ) >= np->n_direofoffset) { - FSDBG_BOT(514, vp, 0xde0f0002, 0, 0); - return (0); - } - error = nfs_buf_get(vp, tlbn, NFS_DIRBLKSIZ, p, NBLK_READ, &bp); - if (error) { - FSDBG_BOT(514, vp, 0xd1e0013, 0, error); - return (error); - } - if (!ISSET(bp->nb_flags, NB_CACHE)) { - SET(bp->nb_flags, NB_READ); - error = nfs_doio(bp, cred, p); - /* - * no error + NB_INVAL == directory EOF, - * use the block. - */ - if (error == 0 && (bp->nb_flags & NB_INVAL)) - break; - } - /* - * An error will throw away the block and the - * for loop will break out. If no error and this - * is not the block we want, we throw away the - * block and go for the next one via the for loop. - */ - if (error || tlbn < lbn) - nfs_buf_release(bp, 1); - } - } - /* - * The above while is repeated if we hit another cookie - * error. If we hit an error and it wasn't a cookie error, - * we give up. - */ - if (error) { - FSDBG_BOT(514, vp, 0xd1e0014, 0, error); - return (error); - } + error = uiomove(bp->nb_data + on, n, uio); } - /* - * If not eof and read aheads are enabled, start one. - * (You need the current block first, so that you have the - * directory offset cookie of the next block.) - */ - if (nfs_numasync > 0 && nmp->nm_readahead > 0 && - (np->n_direofoffset == 0 || - (lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) && - !nfs_buf_is_incore(vp, lbn + 1)) { - error = nfs_buf_get(vp, lbn + 1, NFS_DIRBLKSIZ, p, NBLK_READ|NBLK_NOWAIT, &rabp); - if (error) { - FSDBG_BOT(514, vp, 0xd1e0015, 0, error); - return (error); - } - if (rabp) { - if (!ISSET(rabp->nb_flags, (NB_CACHE))) { - SET(rabp->nb_flags, (NB_READ | NB_ASYNC)); - if (nfs_asyncio(rabp, cred)) { - SET(rabp->nb_flags, (NB_INVAL|NB_ERROR)); - rabp->nb_error = EIO; - nfs_buf_release(rabp, 1); - } - } else { - nfs_buf_release(rabp, 1); - } - } - } - /* - * Make sure we use a signed variant of min() since - * the second term may be negative. - */ - // LP64todo - fix this! - n = lmin(uio_uio_resid(uio), bp->nb_validend - on); - /* - * We keep track of the directory eof in - * np->n_direofoffset and chop it off as an - * extra step right here. - */ - if (np->n_direofoffset && - n > np->n_direofoffset - uio->uio_offset) - n = np->n_direofoffset - uio->uio_offset; - /* - * Make sure that we return an integral number of entries so - * that any subsequent calls will start copying from the start - * of the next entry. - * - * If the current value of n has the last entry cut short, - * set n to copy everything up to the last entry instead. - */ - if (n > 0) { - dp = bp->nb_data + on; - while (dp < (bp->nb_data + on + n)) { - direntp = (struct dirent *)dp; - dp += direntp->d_reclen; - } - if (dp > (bp->nb_data + on + n)) - n = (dp - direntp->d_reclen) - (bp->nb_data + on); - } - break; - default: - printf("nfs_bioread: type %x unexpected\n", vtype); - FSDBG_BOT(514, vp, 0xd1e0016, 0, EINVAL); - return (EINVAL); - }; - - if (n > 0) { - error = uiomove(bp->nb_data + on, (int)n, uio); - } - switch (vtype) { - case VREG: - if (np->n_flag & NNOCACHE) - SET(bp->nb_flags, NB_NOCACHE); - break; - case VLNK: - n = 0; - break; - case VDIR: - break; - default: - break; - } - nfs_buf_release(bp, 1); - } while (error == 0 && uio_uio_resid(uio) > 0 && n > 0); - FSDBG_BOT(514, vp, uio->uio_offset, uio_uio_resid(uio), error); + nfs_buf_release(bp, 1); + nfs_data_unlock(np); + nfs_node_lock_force(np); + np->n_lastread = (uio_offset(uio) - 1) / biosize; + nfs_node_unlock(np); + } while (error == 0 && uio_resid(uio) > 0 && n > 0); + FSDBG_BOT(514, np, uio_offset(uio), uio_resid(uio), error); return (error); } +/* + * limit the number of outstanding async I/O writes + */ +int +nfs_async_write_start(struct nfsmount *nmp) +{ + int error = 0, slpflag = NMFLAG(nmp, INTR) ? PCATCH : 0; + struct timespec ts = {1, 0}; + + if (nfs_max_async_writes <= 0) + return (0); + lck_mtx_lock(&nmp->nm_lock); + while ((nfs_max_async_writes > 0) && (nmp->nm_asyncwrites >= nfs_max_async_writes)) { + if ((error = nfs_sigintr(nmp, NULL, current_thread(), 1))) + break; + msleep(&nmp->nm_asyncwrites, &nmp->nm_lock, slpflag|(PZERO-1), "nfsasyncwrites", &ts); + slpflag = 0; + } + if (!error) + nmp->nm_asyncwrites++; + lck_mtx_unlock(&nmp->nm_lock); + return (error); +} +void +nfs_async_write_done(struct nfsmount *nmp) +{ + if (nmp->nm_asyncwrites <= 0) + return; + lck_mtx_lock(&nmp->nm_lock); + if (nmp->nm_asyncwrites-- >= nfs_max_async_writes) + wakeup(&nmp->nm_asyncwrites); + lck_mtx_unlock(&nmp->nm_lock); +} /* - * Vnode op for write using bio + * write (or commit) the given NFS buffer + * + * Commit the buffer if we can. + * Write out any dirty range. + * If any dirty pages remain, write them out. + * Mark buffer done. + * + * For async requests, all the work beyond sending the initial + * write RPC is handled in the RPC callback(s). */ int -nfs_write(ap) - struct vnop_write_args /* { - struct vnodeop_desc *a_desc; - vnode_t a_vp; - struct uio *a_uio; - int a_ioflag; - vfs_context_t a_context; - } */ *ap; +nfs_buf_write(struct nfsbuf *bp) { - struct uio *uio = ap->a_uio; - vnode_t vp = ap->a_vp; - struct nfsnode *np = VTONFS(vp); - proc_t p; + int error = 0, oldflags, async; + nfsnode_t np; + thread_t thd; kauth_cred_t cred; - int ioflag = ap->a_ioflag; - struct nfsbuf *bp; - struct nfs_vattr nvattr; - struct nfsmount *nmp = VFSTONFS(vnode_mount(vp)); - daddr64_t lbn; - int biosize, bufsize; - int n, on, error = 0; - off_t boff, start, end, cureof; - struct iovec_32 iov; - struct uio auio; - - FSDBG_TOP(515, vp, uio->uio_offset, uio_uio_resid(uio), ioflag); - -#if DIAGNOSTIC - if (uio->uio_rw != UIO_WRITE) - panic("nfs_write mode"); - if (UIO_SEG_IS_USER_SPACE(uio->uio_segflg)) - panic("nfs_write proc"); -#endif - - p = vfs_context_proc(ap->a_context); - cred = vfs_context_ucred(ap->a_context); - - if (vnode_vtype(vp) != VREG) - return (EIO); + proc_t p = current_proc(); + int iomode, doff, dend, firstpg, lastpg; + uint32_t pagemask; - np->n_flag |= NWRBUSY; + FSDBG_TOP(553, bp, NBOFF(bp), bp->nb_flags, 0); - if (np->n_flag & NNEEDINVALIDATE) { - np->n_flag &= ~NNEEDINVALIDATE; - nfs_vinvalbuf(vp, V_SAVE|V_IGNORE_WRITEERR, cred, p, 1); - } - if (np->n_flag & NWRITEERR) { - np->n_flag &= ~(NWRITEERR | NWRBUSY); - FSDBG_BOT(515, vp, uio->uio_offset, uio_uio_resid(uio), np->n_error); - return (np->n_error); - } - if ((nmp->nm_flag & NFSMNT_NFSV3) && - !(nmp->nm_state & NFSSTA_GOTFSINFO)) - (void)nfs_fsinfo(nmp, vp, cred, p); - if (ioflag & (IO_APPEND | IO_SYNC)) { - if (np->n_flag & NMODIFIED) { - NATTRINVALIDATE(np); - error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); + if (!ISSET(bp->nb_lflags, NBL_BUSY)) + panic("nfs_buf_write: buffer is not busy???"); + + np = bp->nb_np; + async = ISSET(bp->nb_flags, NB_ASYNC); + oldflags = bp->nb_flags; + + CLR(bp->nb_flags, (NB_READ|NB_DONE|NB_ERROR|NB_DELWRI)); + if (ISSET(oldflags, NB_DELWRI)) { + lck_mtx_lock(nfs_buf_mutex); + nfs_nbdwrite--; + NFSBUFCNTCHK(); + lck_mtx_unlock(nfs_buf_mutex); + wakeup(&nfs_nbdwrite); + } + + /* move to clean list */ + if (ISSET(oldflags, (NB_ASYNC|NB_DELWRI))) { + lck_mtx_lock(nfs_buf_mutex); + if (bp->nb_vnbufs.le_next != NFSNOLIST) + LIST_REMOVE(bp, nb_vnbufs); + LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs); + lck_mtx_unlock(nfs_buf_mutex); + } + nfs_node_lock_force(np); + np->n_numoutput++; + nfs_node_unlock(np); + vnode_startwrite(NFSTOV(np)); + + if (p && p->p_stats) + OSIncrementAtomicLong(&p->p_stats->p_ru.ru_oublock); + + cred = bp->nb_wcred; + if (!IS_VALID_CRED(cred) && ISSET(bp->nb_flags, NB_READ)) + cred = bp->nb_rcred; /* shouldn't really happen, but... */ + if (IS_VALID_CRED(cred)) + kauth_cred_ref(cred); + thd = async ? NULL : current_thread(); + + /* We need to make sure the pages are locked before doing I/O. */ + if (!ISSET(bp->nb_flags, NB_META) && UBCINFOEXISTS(NFSTOV(np))) { + if (!ISSET(bp->nb_flags, NB_PAGELIST)) { + error = nfs_buf_upl_setup(bp); if (error) { - np->n_flag &= ~NWRBUSY; - FSDBG_BOT(515, vp, uio->uio_offset, 0x10bad01, error); - return (error); + printf("nfs_buf_write: upl create failed %d\n", error); + SET(bp->nb_flags, NB_ERROR); + bp->nb_error = error = EIO; + nfs_buf_iodone(bp); + goto out; } + nfs_buf_upl_check(bp); } - if (ioflag & IO_APPEND) { - NATTRINVALIDATE(np); - error = nfs_getattr(vp, &nvattr, cred, p); - if (error) { - np->n_flag &= ~NWRBUSY; - FSDBG_BOT(515, vp, uio->uio_offset, 0x10bad02, error); - return (error); + } + + /* If NB_NEEDCOMMIT is set, a commit RPC may do the trick. */ + if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) + nfs_buf_check_write_verifier(np, bp); + if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) { + struct nfsmount *nmp = NFSTONMP(np); + if (!nmp) { + SET(bp->nb_flags, NB_ERROR); + bp->nb_error = error = EIO; + nfs_buf_iodone(bp); + goto out; + } + SET(bp->nb_flags, NB_WRITEINPROG); + error = nmp->nm_funcs->nf_commit_rpc(np, NBOFF(bp) + bp->nb_dirtyoff, + bp->nb_dirtyend - bp->nb_dirtyoff, bp->nb_wcred, bp->nb_verf); + CLR(bp->nb_flags, NB_WRITEINPROG); + if (error) { + if (error != NFSERR_STALEWRITEVERF) { + SET(bp->nb_flags, NB_ERROR); + bp->nb_error = error; } - uio->uio_offset = np->n_size; + nfs_buf_iodone(bp); + goto out; } + bp->nb_dirtyoff = bp->nb_dirtyend = 0; + CLR(bp->nb_flags, NB_NEEDCOMMIT); + nfs_node_lock_force(np); + np->n_needcommitcnt--; + CHECK_NEEDCOMMITCNT(np); + nfs_node_unlock(np); } - if (uio->uio_offset < 0) { - np->n_flag &= ~NWRBUSY; - FSDBG_BOT(515, vp, uio->uio_offset, 0xbad0ff, EINVAL); - return (EINVAL); - } - if (uio_uio_resid(uio) == 0) { - np->n_flag &= ~NWRBUSY; - FSDBG_BOT(515, vp, uio->uio_offset, uio_uio_resid(uio), 0); - return (0); + if (!error && (bp->nb_dirtyend > 0)) { + /* sanity check the dirty range */ + if (NBOFF(bp) + bp->nb_dirtyend > (off_t) np->n_size) { + bp->nb_dirtyend = np->n_size - NBOFF(bp); + if (bp->nb_dirtyoff >= bp->nb_dirtyend) + bp->nb_dirtyoff = bp->nb_dirtyend = 0; + } } + if (!error && (bp->nb_dirtyend > 0)) { + /* there's a dirty range that needs to be written out */ + NFS_BUF_MAP(bp); - biosize = vfs_statfs(vnode_mount(vp))->f_iosize; + doff = bp->nb_dirtyoff; + dend = bp->nb_dirtyend; - if (vnode_isnocache(vp)) { - if (!(np->n_flag & NNOCACHE)) { - if (NVALIDBUFS(np)) { - error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); - if (error) { - np->n_flag &= ~NWRBUSY; - FSDBG_BOT(515, vp, 0, 0, error); - return (error); - } + /* if doff page is dirty, move doff to start of page */ + if (NBPGDIRTY(bp, doff / PAGE_SIZE)) + doff -= doff & PAGE_MASK; + /* try to expand write range to include preceding dirty pages */ + if (!(doff & PAGE_MASK)) + while ((doff > 0) && NBPGDIRTY(bp, (doff - 1) / PAGE_SIZE)) + doff -= PAGE_SIZE; + /* if dend page is dirty, move dend to start of next page */ + if ((dend & PAGE_MASK) && NBPGDIRTY(bp, dend / PAGE_SIZE)) + dend = round_page_32(dend); + /* try to expand write range to include trailing dirty pages */ + if (!(dend & PAGE_MASK)) + while ((dend < (int)bp->nb_bufsize) && NBPGDIRTY(bp, dend / PAGE_SIZE)) + dend += PAGE_SIZE; + /* make sure to keep dend clipped to EOF */ + if ((NBOFF(bp) + dend) > (off_t) np->n_size) + dend = np->n_size - NBOFF(bp); + /* calculate range of complete pages being written */ + firstpg = round_page_32(doff) / PAGE_SIZE; + lastpg = (trunc_page_32(dend) - 1) / PAGE_SIZE; + /* calculate mask for that page range */ + pagemask = ((1 << (lastpg + 1)) - 1) & ~((1 << firstpg) - 1); + + /* + * compare page mask to nb_dirty; if there are other dirty pages + * then write FILESYNC; otherwise, write UNSTABLE if async and + * not needcommit/stable; otherwise write FILESYNC + */ + if (bp->nb_dirty & ~pagemask) + iomode = NFS_WRITE_FILESYNC; + else if ((bp->nb_flags & (NB_ASYNC | NB_NEEDCOMMIT | NB_STABLE)) == NB_ASYNC) + iomode = NFS_WRITE_UNSTABLE; + else + iomode = NFS_WRITE_FILESYNC; + + /* write the whole contiguous dirty range */ + bp->nb_offio = doff; + bp->nb_endio = dend; + + OSAddAtomic64(1, &nfsstats.write_bios); + + SET(bp->nb_flags, NB_WRITEINPROG); + error = nfs_buf_write_rpc(bp, iomode, thd, cred); + /* + * For async I/O, the callbacks will finish up the + * write and push out any dirty pages. Otherwise, + * the write has already been finished and any dirty + * pages pushed out. + */ + } else { + if (!error && bp->nb_dirty) /* write out any dirty pages */ + error = nfs_buf_write_dirty_pages(bp, thd, cred); + nfs_buf_iodone(bp); + } + /* note: bp is still valid only for !async case */ +out: + if (!async) { + error = nfs_buf_iowait(bp); + /* move to clean list */ + if (oldflags & NB_DELWRI) { + lck_mtx_lock(nfs_buf_mutex); + if (bp->nb_vnbufs.le_next != NFSNOLIST) + LIST_REMOVE(bp, nb_vnbufs); + LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs); + lck_mtx_unlock(nfs_buf_mutex); + } + FSDBG_BOT(553, bp, NBOFF(bp), bp->nb_flags, error); + nfs_buf_release(bp, 1); + /* check if we need to invalidate (and we can) */ + if ((np->n_flag & NNEEDINVALIDATE) && + !(np->n_bflag & (NBINVALINPROG|NBFLUSHINPROG))) { + int invalidate = 0; + nfs_node_lock_force(np); + if (np->n_flag & NNEEDINVALIDATE) { + invalidate = 1; + np->n_flag &= ~NNEEDINVALIDATE; + } + nfs_node_unlock(np); + if (invalidate) { + /* + * There was a write error and we need to + * invalidate attrs and flush buffers in + * order to sync up with the server. + * (if this write was extending the file, + * we may no longer know the correct size) + * + * But we couldn't call vinvalbuf while holding + * the buffer busy. So we call vinvalbuf() after + * releasing the buffer. + */ + nfs_vinvalbuf2(NFSTOV(np), V_SAVE|V_IGNORE_WRITEERR, thd, cred, 1); } - np->n_flag |= NNOCACHE; } - } else if (np->n_flag & NNOCACHE) { - np->n_flag &= ~NNOCACHE; } - do { - OSAddAtomic(1, (SInt32*)&nfsstats.biocache_writes); - lbn = uio->uio_offset / biosize; - on = uio->uio_offset % biosize; - // LP64todo - fix this - n = min((unsigned)(biosize - on), uio_uio_resid(uio)); -again: - bufsize = biosize; + if (IS_VALID_CRED(cred)) + kauth_cred_unref(&cred); + return (error); +} + +/* + * finish the writing of a buffer + */ +void +nfs_buf_write_finish(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred) +{ + nfsnode_t np = bp->nb_np; + int error = (bp->nb_flags & NB_ERROR) ? bp->nb_error : 0; + int firstpg, lastpg; + uint32_t pagemask; + + if ((error == EINTR) || (error == ERESTART)) { + CLR(bp->nb_flags, NB_ERROR); + SET(bp->nb_flags, NB_EINTR); + } + + if (!error) { + /* calculate range of complete pages being written */ + firstpg = round_page_32(bp->nb_offio) / PAGE_SIZE; + lastpg = (trunc_page_32(bp->nb_endio) - 1) / PAGE_SIZE; + /* calculate mask for that page range written */ + pagemask = ((1 << (lastpg + 1)) - 1) & ~((1 << firstpg) - 1); + /* clear dirty bits for pages we've written */ + bp->nb_dirty &= ~pagemask; + } + + /* manage needcommit state */ + if (!error && (bp->nb_commitlevel == NFS_WRITE_UNSTABLE)) { + if (!ISSET(bp->nb_flags, NB_NEEDCOMMIT)) { + nfs_node_lock_force(np); + np->n_needcommitcnt++; + nfs_node_unlock(np); + SET(bp->nb_flags, NB_NEEDCOMMIT); + } + /* make sure nb_dirtyoff/nb_dirtyend reflect actual range written */ + bp->nb_dirtyoff = bp->nb_offio; + bp->nb_dirtyend = bp->nb_endio; + } else if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) { + nfs_node_lock_force(np); + np->n_needcommitcnt--; + CHECK_NEEDCOMMITCNT(np); + nfs_node_unlock(np); + CLR(bp->nb_flags, NB_NEEDCOMMIT); + } + + CLR(bp->nb_flags, NB_WRITEINPROG); + + /* + * For an unstable write, the buffer is still treated as dirty until + * a commit (or stable (re)write) is performed. Buffers needing only + * a commit are marked with the NB_DELWRI and NB_NEEDCOMMIT flags. + * + * If the write was interrupted we set NB_EINTR. Don't set NB_ERROR + * because that would cause the buffer to be dropped. The buffer is + * still valid and simply needs to be written again. + */ + if ((error == EINTR) || (error == ERESTART) || (!error && (bp->nb_flags & NB_NEEDCOMMIT))) { + CLR(bp->nb_flags, NB_INVAL); + if (!ISSET(bp->nb_flags, NB_DELWRI)) { + SET(bp->nb_flags, NB_DELWRI); + lck_mtx_lock(nfs_buf_mutex); + nfs_nbdwrite++; + NFSBUFCNTCHK(); + lck_mtx_unlock(nfs_buf_mutex); + } /* - * Get a cache block for writing. The range to be written is - * (off..off+n) within the block. We ensure that the block - * either has no dirty region or that the given range is - * contiguous with the existing dirty region. + * Since for the NB_ASYNC case, we've reassigned the buffer to the + * clean list, we have to reassign it back to the dirty one. Ugh. */ - error = nfs_buf_get(vp, lbn, bufsize, p, NBLK_WRITE, &bp); + if (ISSET(bp->nb_flags, NB_ASYNC)) { + /* move to dirty list */ + lck_mtx_lock(nfs_buf_mutex); + if (bp->nb_vnbufs.le_next != NFSNOLIST) + LIST_REMOVE(bp, nb_vnbufs); + LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs); + lck_mtx_unlock(nfs_buf_mutex); + } + } else { + /* either there's an error or we don't need to commit */ if (error) { - np->n_flag &= ~NWRBUSY; - FSDBG_BOT(515, vp, uio->uio_offset, uio_uio_resid(uio), error); - return (error); + /* + * There was a write error and we need to invalidate + * attrs and flush buffers in order to sync up with the + * server. (if this write was extending the file, we + * may no longer know the correct size) + * + * But we can't call vinvalbuf while holding this + * buffer busy. Set a flag to do it after releasing + * the buffer. + */ + nfs_node_lock_force(np); + np->n_error = error; + np->n_flag |= (NWRITEERR | NNEEDINVALIDATE); + NATTRINVALIDATE(np); + nfs_node_unlock(np); } - /* map the block because we know we're going to write to it */ - NFS_BUF_MAP(bp); + /* clear the dirty range */ + bp->nb_dirtyoff = bp->nb_dirtyend = 0; + } - if (np->n_flag & NNOCACHE) - SET(bp->nb_flags, (NB_NOCACHE|NB_STABLE)); + if (!error && bp->nb_dirty) + nfs_buf_write_dirty_pages(bp, thd, cred); + nfs_buf_iodone(bp); +} - if (bp->nb_wcred == NOCRED) { - kauth_cred_ref(cred); - bp->nb_wcred = cred; - } +/* + * write out any pages marked dirty in a buffer + * + * We do use unstable writes and follow up with a commit. + * If we catch the write verifier changing we'll restart + * do the writes filesync. + */ +int +nfs_buf_write_dirty_pages(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred) +{ + nfsnode_t np = bp->nb_np; + struct nfsmount *nmp = NFSTONMP(np); + int error = 0, commit, iomode, iomode2, len, pg, count, npages, off; + uint32_t dirty = bp->nb_dirty; + uint64_t wverf; + uio_t auio; + char uio_buf [ UIO_SIZEOF(1) ]; + + if (!bp->nb_dirty) + return (0); - /* - * If there's already a dirty range AND dirty pages in this block we - * need to send a commit AND write the dirty pages before continuing. - * - * If there's already a dirty range OR dirty pages in this block - * and the new write range is not contiguous with the existing range, - * then force the buffer to be written out now. - * (We used to just extend the dirty range to cover the valid, - * but unwritten, data in between also. But writing ranges - * of data that weren't actually written by an application - * risks overwriting some other client's data with stale data - * that's just masquerading as new written data.) - */ - if (bp->nb_dirtyend > 0) { - if (on > bp->nb_dirtyend || (on + n) < bp->nb_dirtyoff || bp->nb_dirty) { - FSDBG(515, vp, uio->uio_offset, bp, 0xd15c001); - /* write/commit buffer "synchronously" */ - /* (NB_STABLE indicates that data writes should be FILESYNC) */ - CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL)); - SET(bp->nb_flags, (NB_ASYNC | NB_STABLE)); - error = nfs_buf_write(bp); - if (error) { - np->n_flag &= ~NWRBUSY; - FSDBG_BOT(515, vp, uio->uio_offset, uio_uio_resid(uio), error); - return (error); - } - goto again; - } - } else if (bp->nb_dirty) { - int firstpg, lastpg; - u_int32_t pagemask; - /* calculate write range pagemask */ - firstpg = on/PAGE_SIZE; - lastpg = (on+n-1)/PAGE_SIZE; - pagemask = ((1 << (lastpg+1)) - 1) & ~((1 << firstpg) - 1); - /* check if there are dirty pages outside the write range */ - if (bp->nb_dirty & ~pagemask) { - FSDBG(515, vp, uio->uio_offset, bp, 0xd15c002); - /* write/commit buffer "synchronously" */ - /* (NB_STABLE indicates that data writes should be FILESYNC) */ - CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL)); - SET(bp->nb_flags, (NB_ASYNC | NB_STABLE)); - error = nfs_buf_write(bp); - if (error) { - np->n_flag &= ~NWRBUSY; - FSDBG_BOT(515, vp, uio->uio_offset, uio_uio_resid(uio), error); - return (error); + /* there are pages marked dirty that need to be written out */ + OSAddAtomic64(1, &nfsstats.write_bios); + NFS_BUF_MAP(bp); + SET(bp->nb_flags, NB_WRITEINPROG); + npages = bp->nb_bufsize / PAGE_SIZE; + iomode = NFS_WRITE_UNSTABLE; + + auio = uio_createwithbuffer(1, 0, UIO_SYSSPACE, UIO_WRITE, + &uio_buf, sizeof(uio_buf)); + +again: + dirty = bp->nb_dirty; + wverf = bp->nb_verf; + commit = NFS_WRITE_FILESYNC; + for (pg = 0; pg < npages; pg++) { + if (!NBPGDIRTY(bp, pg)) + continue; + count = 1; + while (((pg + count) < npages) && NBPGDIRTY(bp, pg + count)) + count++; + /* write count pages starting with page pg */ + off = pg * PAGE_SIZE; + len = count * PAGE_SIZE; + /* clip writes to EOF */ + if (NBOFF(bp) + off + len > (off_t) np->n_size) + len -= (NBOFF(bp) + off + len) - np->n_size; + if (len > 0) { + iomode2 = iomode; + uio_reset(auio, NBOFF(bp) + off, UIO_SYSSPACE, UIO_WRITE); + uio_addiov(auio, CAST_USER_ADDR_T(bp->nb_data + off), len); + error = nfs_write_rpc2(np, auio, thd, cred, &iomode2, &bp->nb_verf); + if (error) + break; + if (iomode2 < commit) /* Retain the lowest commitment level returned. */ + commit = iomode2; + if ((commit != NFS_WRITE_FILESYNC) && (wverf != bp->nb_verf)) { + /* verifier changed, redo all the writes filesync */ + iomode = NFS_WRITE_FILESYNC; + goto again; } + } + /* clear dirty bits */ + while (count--) { + dirty &= ~(1 << pg); + if (count) /* leave pg on last page */ + pg++; + } + } + CLR(bp->nb_flags, NB_WRITEINPROG); + + if (!error && (commit != NFS_WRITE_FILESYNC)) { + error = nmp->nm_funcs->nf_commit_rpc(np, NBOFF(bp), bp->nb_bufsize, cred, wverf); + if (error == NFSERR_STALEWRITEVERF) { + /* verifier changed, so we need to restart all the writes */ + iomode = NFS_WRITE_FILESYNC; goto again; - } - /* if the first or last pages are already dirty */ - /* make sure that the dirty range encompasses those pages */ - if (NBPGDIRTY(bp,firstpg) || NBPGDIRTY(bp,lastpg)) { - FSDBG(515, vp, uio->uio_offset, bp, 0xd15c003); - bp->nb_dirtyoff = min(on, firstpg * PAGE_SIZE); - if (NBPGDIRTY(bp,lastpg)) { - bp->nb_dirtyend = (lastpg+1) * PAGE_SIZE; - /* clip to EOF */ - if (NBOFF(bp) + bp->nb_dirtyend > (off_t)np->n_size) - bp->nb_dirtyend = np->n_size - NBOFF(bp); - } else - bp->nb_dirtyend = on+n; - } } + } + if (!error) { + bp->nb_dirty = dirty; + } else { + SET(bp->nb_flags, NB_ERROR); + bp->nb_error = error; + } + return (error); +} - /* - * Are we extending the size of the file with this write? - * If so, update file size now that we have the block. - * If there was a partial buf at the old eof, validate - * and zero the new bytes. - */ - cureof = (off_t)np->n_size; - if (uio->uio_offset + n > (off_t)np->n_size) { - struct nfsbuf *eofbp = NULL; - daddr64_t eofbn = np->n_size / biosize; - int eofoff = np->n_size % biosize; - int neweofoff = (uio->uio_offset + n) % biosize; +/* + * initiate the NFS WRITE RPC(s) for a buffer + */ +int +nfs_buf_write_rpc(struct nfsbuf *bp, int iomode, thread_t thd, kauth_cred_t cred) +{ + struct nfsmount *nmp; + nfsnode_t np = bp->nb_np; + int error = 0, nfsvers, async; + int offset, nrpcs; + uint32_t nmwsize, length, len; + struct nfsreq *req; + struct nfsreq_cbinfo cb; + uio_t auio; + char uio_buf [ UIO_SIZEOF(1) ]; + + nmp = NFSTONMP(np); + if (!nmp) { + bp->nb_error = error = ENXIO; + SET(bp->nb_flags, NB_ERROR); + nfs_buf_iodone(bp); + return (error); + } + nfsvers = nmp->nm_vers; + nmwsize = nmp->nm_wsize; - FSDBG(515, 0xb1ffa000, uio->uio_offset + n, eofoff, neweofoff); + offset = bp->nb_offio; + length = bp->nb_endio - bp->nb_offio; - if (eofoff && (eofbn < lbn)) { - error = nfs_buf_get(vp, eofbn, biosize, p, NBLK_WRITE|NBLK_ONLYVALID, &eofbp); - if (error) { - np->n_flag &= ~NWRBUSY; - FSDBG_BOT(515, vp, uio->uio_offset, uio_uio_resid(uio), error); - return (error); - } + /* Note: Can only do async I/O if nfsiods are configured. */ + async = (bp->nb_flags & NB_ASYNC) && (NFSIOD_MAX > 0); + bp->nb_commitlevel = NFS_WRITE_FILESYNC; + cb.rcb_func = async ? nfs_buf_write_rpc_finish : NULL; + cb.rcb_bp = bp; + + if ((nfsvers == NFS_VER2) && ((NBOFF(bp) + bp->nb_endio) > 0xffffffffLL)) { + bp->nb_error = error = EFBIG; + SET(bp->nb_flags, NB_ERROR); + nfs_buf_iodone(bp); + return (error); + } + + auio = uio_createwithbuffer(1, NBOFF(bp) + offset, UIO_SYSSPACE, + UIO_WRITE, &uio_buf, sizeof(uio_buf)); + uio_addiov(auio, CAST_USER_ADDR_T(bp->nb_data + offset), length); + + bp->nb_rpcs = nrpcs = (length + nmwsize - 1) / nmwsize; + if (async && (nrpcs > 1)) { + SET(bp->nb_flags, NB_MULTASYNCRPC); + } else { + CLR(bp->nb_flags, NB_MULTASYNCRPC); + } + + while (length > 0) { + if (ISSET(bp->nb_flags, NB_ERROR)) { + error = bp->nb_error; + break; + } + len = (length > nmwsize) ? nmwsize : length; + cb.rcb_args[0] = offset; + cb.rcb_args[1] = len; + if (nmp->nm_vers >= NFS_VER4) + cb.rcb_args[2] = nmp->nm_stategenid; + if (async && ((error = nfs_async_write_start(nmp)))) + break; + req = NULL; + error = nmp->nm_funcs->nf_write_rpc_async(np, auio, len, thd, cred, + iomode, &cb, &req); + if (error) { + if (async) + nfs_async_write_done(nmp); + break; + } + offset += len; + length -= len; + if (async) + continue; + nfs_buf_write_rpc_finish(req); + } + + if (length > 0) { + /* + * Something bad happened while trying to send the RPCs. + * Wait for any outstanding requests to complete. + */ + bp->nb_error = error; + SET(bp->nb_flags, NB_ERROR); + if (ISSET(bp->nb_flags, NB_MULTASYNCRPC)) { + nrpcs = (length + nmwsize - 1) / nmwsize; + lck_mtx_lock(nfs_buf_mutex); + bp->nb_rpcs -= nrpcs; + if (bp->nb_rpcs == 0) { + /* No RPCs left, so the buffer's done */ + lck_mtx_unlock(nfs_buf_mutex); + nfs_buf_write_finish(bp, thd, cred); + } else { + /* wait for the last RPC to mark it done */ + while (bp->nb_rpcs > 0) + msleep(&bp->nb_rpcs, nfs_buf_mutex, 0, + "nfs_buf_write_rpc_cancel", NULL); + lck_mtx_unlock(nfs_buf_mutex); } + } else { + nfs_buf_write_finish(bp, thd, cred); + } + /* It may have just been an interrupt... that's OK */ + if (!ISSET(bp->nb_flags, NB_ERROR)) + error = 0; + } - /* if we're extending within the same last block */ - /* and the block is flagged as being cached... */ - if ((lbn == eofbn) && ISSET(bp->nb_flags, NB_CACHE)) { - /* ...check that all pages in buffer are valid */ - int endpg = ((neweofoff ? neweofoff : biosize) - 1)/PAGE_SIZE; - u_int32_t pagemask; - /* pagemask only has to extend to last page being written to */ - pagemask = (1 << (endpg+1)) - 1; - FSDBG(515, 0xb1ffa001, bp->nb_valid, pagemask, 0); - if ((bp->nb_valid & pagemask) != pagemask) { - /* zerofill any hole */ - if (on > bp->nb_validend) { - int i; - for (i=bp->nb_validend/PAGE_SIZE; i <= (on - 1)/PAGE_SIZE; i++) - NBPGVALID_SET(bp, i); - NFS_BUF_MAP(bp); - FSDBG(516, bp, bp->nb_validend, on - bp->nb_validend, 0xf01e); - bzero((char *)bp->nb_data + bp->nb_validend, - on - bp->nb_validend); - } - /* zerofill any trailing data in the last page */ - if (neweofoff) { - NFS_BUF_MAP(bp); - FSDBG(516, bp, neweofoff, PAGE_SIZE - (neweofoff & PAGE_MASK), 0xe0f); - bzero((char *)bp->nb_data + neweofoff, - PAGE_SIZE - (neweofoff & PAGE_MASK)); + return (error); +} + +/* + * finish up an NFS WRITE RPC on a buffer + */ +void +nfs_buf_write_rpc_finish(struct nfsreq *req) +{ + int error = 0, nfsvers, offset, length, multasyncrpc, finished; + int committed = NFS_WRITE_FILESYNC; + uint64_t wverf = 0; + size_t rlen; + void *wakeme = NULL; + struct nfsreq_cbinfo cb; + struct nfsreq *wreq = NULL; + struct nfsbuf *bp; + struct nfsmount *nmp; + nfsnode_t np; + thread_t thd; + kauth_cred_t cred; + uio_t auio; + char uio_buf [ UIO_SIZEOF(1) ]; + +finish: + np = req->r_np; + thd = req->r_thread; + cred = req->r_cred; + if (IS_VALID_CRED(cred)) + kauth_cred_ref(cred); + cb = req->r_callback; + bp = cb.rcb_bp; + if (cb.rcb_func) /* take an extra reference on the nfsreq in case we want to resend it later due to grace error */ + nfs_request_ref(req, 0); + + nmp = NFSTONMP(np); + if (!nmp) { + SET(bp->nb_flags, NB_ERROR); + bp->nb_error = error = ENXIO; + } + if (error || ISSET(bp->nb_flags, NB_ERROR)) { + /* just drop it */ + nfs_request_async_cancel(req); + goto out; + } + nfsvers = nmp->nm_vers; + + offset = cb.rcb_args[0]; + rlen = length = cb.rcb_args[1]; + + /* finish the RPC */ + error = nmp->nm_funcs->nf_write_rpc_async_finish(np, req, &committed, &rlen, &wverf); + if ((error == EINPROGRESS) && cb.rcb_func) { + /* async request restarted */ + if (cb.rcb_func) + nfs_request_rele(req); + if (IS_VALID_CRED(cred)) + kauth_cred_unref(&cred); + return; + } + if ((nmp->nm_vers >= NFS_VER4) && nfs_mount_state_error_should_restart(error) && !ISSET(bp->nb_flags, NB_ERROR)) { + lck_mtx_lock(&nmp->nm_lock); + if ((error != NFSERR_OLD_STATEID) && (error != NFSERR_GRACE) && (cb.rcb_args[2] == nmp->nm_stategenid)) { + NP(np, "nfs_buf_write_rpc_finish: error %d @ 0x%llx, 0x%x 0x%x, initiating recovery", + error, NBOFF(bp)+offset, cb.rcb_args[2], nmp->nm_stategenid); + nfs_need_recover(nmp, error); + } + lck_mtx_unlock(&nmp->nm_lock); + if (np->n_flag & NREVOKE) { + error = EIO; + } else { + if (error == NFSERR_GRACE) { + if (cb.rcb_func) { + /* + * For an async I/O request, handle a grace delay just like + * jukebox errors. Set the resend time and queue it up. + */ + struct timeval now; + if (req->r_nmrep.nmc_mhead) { + mbuf_freem(req->r_nmrep.nmc_mhead); + req->r_nmrep.nmc_mhead = NULL; } + req->r_error = 0; + microuptime(&now); + lck_mtx_lock(&req->r_mtx); + req->r_resendtime = now.tv_sec + 2; + req->r_xid = 0; // get a new XID + req->r_flags |= R_RESTART; + req->r_start = 0; + nfs_asyncio_resend(req); + lck_mtx_unlock(&req->r_mtx); + if (IS_VALID_CRED(cred)) + kauth_cred_unref(&cred); + /* Note: nfsreq reference taken will be dropped later when finished */ + return; } + /* otherwise, just pause a couple seconds and retry */ + tsleep(&nmp->nm_state, (PZERO-1), "nfsgrace", 2*hz); } - np->n_flag |= NMODIFIED; - np->n_size = uio->uio_offset + n; - ubc_setsize(vp, (off_t)np->n_size); /* XXX errors */ - if (eofbp) { - /* - * We may need to zero any previously invalid data - * after the old EOF in the previous EOF buffer. - * - * For the old last page, don't zero bytes if there - * are invalid bytes in that page (i.e. the page isn't - * currently valid). - * For pages after the old last page, zero them and - * mark them as valid. - */ - char *d; - int i; - if (np->n_flag & NNOCACHE) - SET(eofbp->nb_flags, (NB_NOCACHE|NB_STABLE)); - NFS_BUF_MAP(eofbp); - FSDBG(516, eofbp, eofoff, biosize - eofoff, 0xe0fff01e); - d = eofbp->nb_data; - i = eofoff/PAGE_SIZE; - while (eofoff < biosize) { - int poff = eofoff & PAGE_MASK; - if (!poff || NBPGVALID(eofbp,i)) { - bzero(d + eofoff, PAGE_SIZE - poff); - NBPGVALID_SET(eofbp, i); - } - if (bp->nb_validend == eofoff) - bp->nb_validend += PAGE_SIZE - poff; - eofoff += PAGE_SIZE - poff; - i++; - } - nfs_buf_release(eofbp, 1); + if (!(error = nfs_mount_state_wait_for_recovery(nmp))) { + rlen = 0; + goto writeagain; } } - /* - * If dirtyend exceeds file size, chop it down. This should - * not occur unless there is a race. - */ - if (NBOFF(bp) + bp->nb_dirtyend > (off_t)np->n_size) - bp->nb_dirtyend = np->n_size - NBOFF(bp); - /* - * UBC doesn't handle partial pages, so we need to make sure - * that any pages left in the page cache are completely valid. - * - * Writes that are smaller than a block are delayed if they - * don't extend to the end of the block. - * - * If the block isn't (completely) cached, we may need to read - * in some parts of pages that aren't covered by the write. - * If the write offset (on) isn't page aligned, we'll need to - * read the start of the first page being written to. Likewise, - * if the offset of the end of the write (on+n) isn't page aligned, - * we'll need to read the end of the last page being written to. - * - * Notes: - * We don't want to read anything we're just going to write over. - * We don't want to issue multiple I/Os if we don't have to - * (because they're synchronous rpcs). - * We don't want to read anything we already have modified in the - * page cache. - */ - if (!ISSET(bp->nb_flags, NB_CACHE) && n < biosize) { - int firstpg, lastpg, dirtypg; - int firstpgoff, lastpgoff; - start = end = -1; - firstpg = on/PAGE_SIZE; - firstpgoff = on & PAGE_MASK; - lastpg = (on+n-1)/PAGE_SIZE; - lastpgoff = (on+n) & PAGE_MASK; - if (firstpgoff && !NBPGVALID(bp,firstpg)) { - /* need to read start of first page */ - start = firstpg * PAGE_SIZE; - end = start + firstpgoff; + } + if (error) { + SET(bp->nb_flags, NB_ERROR); + bp->nb_error = error; + } + if (error || (nfsvers == NFS_VER2)) + goto out; + if (rlen <= 0) { + SET(bp->nb_flags, NB_ERROR); + bp->nb_error = error = EIO; + goto out; + } + + /* save lowest commit level returned */ + if (committed < bp->nb_commitlevel) + bp->nb_commitlevel = committed; + + /* check the write verifier */ + if (!bp->nb_verf) { + bp->nb_verf = wverf; + } else if (bp->nb_verf != wverf) { + /* verifier changed, so buffer will need to be rewritten */ + bp->nb_flags |= NB_STALEWVERF; + bp->nb_commitlevel = NFS_WRITE_UNSTABLE; + bp->nb_verf = wverf; + } + + /* + * check for a short write + * + * If the server didn't write all the data, then we + * need to issue another write for the rest of it. + * (Don't bother if the buffer hit an error or stale wverf.) + */ + if (((int)rlen < length) && !(bp->nb_flags & (NB_STALEWVERF|NB_ERROR))) { +writeagain: + offset += rlen; + length -= rlen; + + auio = uio_createwithbuffer(1, NBOFF(bp) + offset, UIO_SYSSPACE, + UIO_WRITE, &uio_buf, sizeof(uio_buf)); + uio_addiov(auio, CAST_USER_ADDR_T(bp->nb_data + offset), length); + + cb.rcb_args[0] = offset; + cb.rcb_args[1] = length; + if (nmp->nm_vers >= NFS_VER4) + cb.rcb_args[2] = nmp->nm_stategenid; + + // XXX iomode should really match the original request + error = nmp->nm_funcs->nf_write_rpc_async(np, auio, length, thd, cred, + NFS_WRITE_FILESYNC, &cb, &wreq); + if (!error) { + if (IS_VALID_CRED(cred)) + kauth_cred_unref(&cred); + if (!cb.rcb_func) { + /* if !async we'll need to wait for this RPC to finish */ + req = wreq; + wreq = NULL; + goto finish; } - if (lastpgoff && !NBPGVALID(bp,lastpg)) { - /* need to read end of last page */ - if (start < 0) - start = (lastpg * PAGE_SIZE) + lastpgoff; - end = (lastpg + 1) * PAGE_SIZE; + nfs_request_rele(req); + /* + * We're done here. + * Outstanding RPC count is unchanged. + * Callback will be called when RPC is done. + */ + return; + } + SET(bp->nb_flags, NB_ERROR); + bp->nb_error = error; + } + +out: + if (cb.rcb_func) { + nfs_async_write_done(nmp); + nfs_request_rele(req); + } + /* + * Decrement outstanding RPC count on buffer + * and call nfs_buf_write_finish on last RPC. + * + * (Note: when there are multiple async RPCs issued for a + * buffer we need nfs_buffer_mutex to avoid problems when + * aborting a partially-initiated set of RPCs) + */ + multasyncrpc = ISSET(bp->nb_flags, NB_MULTASYNCRPC); + if (multasyncrpc) + lck_mtx_lock(nfs_buf_mutex); + + bp->nb_rpcs--; + finished = (bp->nb_rpcs == 0); + + if (multasyncrpc) + lck_mtx_unlock(nfs_buf_mutex); + + if (finished) { + if (multasyncrpc) + wakeme = &bp->nb_rpcs; + nfs_buf_write_finish(bp, thd, cred); + if (wakeme) + wakeup(wakeme); + } + + if (IS_VALID_CRED(cred)) + kauth_cred_unref(&cred); +} + +/* + * Send commit(s) for the given node's "needcommit" buffers + */ +int +nfs_flushcommits(nfsnode_t np, int nowait) +{ + struct nfsmount *nmp; + struct nfsbuf *bp, *prevlbp, *lbp; + struct nfsbuflists blist, commitlist; + int error = 0, retv, wcred_set, flags, dirty; + u_quad_t off, endoff, toff; + uint64_t wverf; + u_int32_t count; + kauth_cred_t wcred = NULL; + + FSDBG_TOP(557, np, 0, 0, 0); + + /* + * A nb_flags == (NB_DELWRI | NB_NEEDCOMMIT) block has been written to the + * server, but nas not been committed to stable storage on the server + * yet. The byte range is worked out for as many nfsbufs as we can handle + * and the commit rpc is done. + */ + if (!LIST_EMPTY(&np->n_dirtyblkhd)) { + error = nfs_node_lock(np); + if (error) + goto done; + np->n_flag |= NMODIFIED; + nfs_node_unlock(np); + } + + off = (u_quad_t)-1; + endoff = 0; + wcred_set = 0; + LIST_INIT(&commitlist); + + nmp = NFSTONMP(np); + if (!nmp) { + error = ENXIO; + goto done; + } + if (nmp->nm_vers == NFS_VER2) { + error = EINVAL; + goto done; + } + + flags = NBI_DIRTY; + if (nowait) + flags |= NBI_NOWAIT; + lck_mtx_lock(nfs_buf_mutex); + wverf = nmp->nm_verf; + if (!nfs_buf_iterprepare(np, &blist, flags)) { + while ((bp = LIST_FIRST(&blist))) { + LIST_REMOVE(bp, nb_vnbufs); + LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs); + error = nfs_buf_acquire(bp, NBAC_NOWAIT, 0, 0); + if (error) + continue; + if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) + nfs_buf_check_write_verifier(np, bp); + if (((bp->nb_flags & (NB_DELWRI | NB_NEEDCOMMIT)) != (NB_DELWRI | NB_NEEDCOMMIT)) || + (bp->nb_verf != wverf)) { + nfs_buf_drop(bp); + continue; } - if (end > start) { - /* need to read the data in range: start...end-1 */ - - /* first, check for dirty pages in between */ - /* if there are, we'll have to do two reads because */ - /* we don't want to overwrite the dirty pages. */ - for (dirtypg=start/PAGE_SIZE; dirtypg <= (end-1)/PAGE_SIZE; dirtypg++) - if (NBPGDIRTY(bp,dirtypg)) - break; - - /* if start is at beginning of page, try */ - /* to get any preceeding pages as well. */ - if (!(start & PAGE_MASK)) { - /* stop at next dirty/valid page or start of block */ - for (; start > 0; start-=PAGE_SIZE) - if (NBPGVALID(bp,((start-1)/PAGE_SIZE))) - break; - } + nfs_buf_remfree(bp); - NFS_BUF_MAP(bp); - /* setup uio for read(s) */ - boff = NBOFF(bp); - auio.uio_iovs.iov32p = &iov; - auio.uio_iovcnt = 1; -#if 1 /* LP64todo - can't use new segment flags until the drivers are ready */ - auio.uio_segflg = UIO_SYSSPACE; -#else - auio.uio_segflg = UIO_SYSSPACE32; -#endif - auio.uio_rw = UIO_READ; - - if (dirtypg <= (end-1)/PAGE_SIZE) { - /* there's a dirty page in the way, so just do two reads */ - /* we'll read the preceding data here */ - auio.uio_offset = boff + start; - iov.iov_len = on - start; - uio_uio_resid_set(&auio, iov.iov_len); - iov.iov_base = (uintptr_t) bp->nb_data + start; - error = nfs_readrpc(vp, &auio, cred, p); - if (error) { - bp->nb_error = error; - SET(bp->nb_flags, NB_ERROR); - printf("nfs_write: readrpc %d", error); - } - if (uio_uio_resid(&auio) > 0) { - FSDBG(516, bp, iov.iov_base - bp->nb_data, uio_uio_resid(&auio), 0xd00dee01); - // LP64todo - fix this - bzero((caddr_t)iov.iov_base, uio_uio_resid(&auio)); - } - /* update validoff/validend if necessary */ - if ((bp->nb_validoff < 0) || (bp->nb_validoff > start)) - bp->nb_validoff = start; - if ((bp->nb_validend < 0) || (bp->nb_validend < on)) - bp->nb_validend = on; - if ((off_t)np->n_size > boff + bp->nb_validend) - bp->nb_validend = min(np->n_size - (boff + start), biosize); - /* validate any pages before the write offset */ - for (; start < on/PAGE_SIZE; start+=PAGE_SIZE) - NBPGVALID_SET(bp, start/PAGE_SIZE); - /* adjust start to read any trailing data */ - start = on+n; - } + /* buffer UPLs will be grabbed *in order* below */ - /* if end is at end of page, try to */ - /* get any following pages as well. */ - if (!(end & PAGE_MASK)) { - /* stop at next valid page or end of block */ - for (; end < bufsize; end+=PAGE_SIZE) - if (NBPGVALID(bp,end/PAGE_SIZE)) - break; - } + FSDBG(557, bp, bp->nb_flags, bp->nb_valid, bp->nb_dirty); + FSDBG(557, bp->nb_validoff, bp->nb_validend, + bp->nb_dirtyoff, bp->nb_dirtyend); - if (((boff+start) >= cureof) || ((start >= on) && ((boff + on + n) >= cureof))) { - /* - * Either this entire read is beyond the current EOF - * or the range that we won't be modifying (on+n...end) - * is all beyond the current EOF. - * No need to make a trip across the network to - * read nothing. So, just zero the buffer instead. - */ - FSDBG(516, bp, start, end - start, 0xd00dee00); - bzero(bp->nb_data + start, end - start); - } else { - /* now we'll read the (rest of the) data */ - auio.uio_offset = boff + start; - iov.iov_len = end - start; - uio_uio_resid_set(&auio, iov.iov_len); - iov.iov_base = (uintptr_t) (bp->nb_data + start); - error = nfs_readrpc(vp, &auio, cred, p); - if (error) { - bp->nb_error = error; - SET(bp->nb_flags, NB_ERROR); - printf("nfs_write: readrpc %d", error); - } - if (uio_uio_resid(&auio) > 0) { - FSDBG(516, bp, iov.iov_base - bp->nb_data, uio_uio_resid(&auio), 0xd00dee02); - // LP64todo - fix this - bzero((caddr_t)iov.iov_base, uio_uio_resid(&auio)); - } - } - /* update validoff/validend if necessary */ - if ((bp->nb_validoff < 0) || (bp->nb_validoff > start)) - bp->nb_validoff = start; - if ((bp->nb_validend < 0) || (bp->nb_validend < end)) - bp->nb_validend = end; - if ((off_t)np->n_size > boff + bp->nb_validend) - bp->nb_validend = min(np->n_size - (boff + start), biosize); - /* validate any pages before the write offset's page */ - for (; start < trunc_page_32(on); start+=PAGE_SIZE) - NBPGVALID_SET(bp, start/PAGE_SIZE); - /* validate any pages after the range of pages being written to */ - for (; (end - 1) > round_page_32(on+n-1); end-=PAGE_SIZE) - NBPGVALID_SET(bp, (end-1)/PAGE_SIZE); - /* Note: pages being written to will be validated when written */ + /* + * Work out if all buffers are using the same cred + * so we can deal with them all with one commit. + * + * Note: creds in bp's must be obtained by kauth_cred_ref + * on the same original cred in order for them to be equal. + */ + if (wcred_set == 0) { + wcred = bp->nb_wcred; + if (!IS_VALID_CRED(wcred)) + panic("nfs: needcommit w/out wcred"); + wcred_set = 1; + } else if ((wcred_set == 1) && wcred != bp->nb_wcred) { + wcred_set = -1; + } + SET(bp->nb_flags, NB_WRITEINPROG); + + /* + * Add this buffer to the list of buffers we are committing. + * Buffers are inserted into the list in ascending order so that + * we can take the UPLs in order after the list is complete. + */ + prevlbp = NULL; + LIST_FOREACH(lbp, &commitlist, nb_vnbufs) { + if (bp->nb_lblkno < lbp->nb_lblkno) + break; + prevlbp = lbp; + } + LIST_REMOVE(bp, nb_vnbufs); + if (prevlbp) + LIST_INSERT_AFTER(prevlbp, bp, nb_vnbufs); + else + LIST_INSERT_HEAD(&commitlist, bp, nb_vnbufs); + + /* update commit range start, end */ + toff = NBOFF(bp) + bp->nb_dirtyoff; + if (toff < off) + off = toff; + toff += (u_quad_t)(bp->nb_dirtyend - bp->nb_dirtyoff); + if (toff > endoff) + endoff = toff; + } + nfs_buf_itercomplete(np, &blist, NBI_DIRTY); + } + lck_mtx_unlock(nfs_buf_mutex); + + if (LIST_EMPTY(&commitlist)) { + error = ENOBUFS; + goto done; + } + + /* + * We need a UPL to prevent others from accessing the buffers during + * our commit RPC(s). + * + * We used to also check for dirty pages here; if there were any we'd + * abort the commit and force the entire buffer to be written again. + * Instead of doing that, we just go ahead and commit the dirty range, + * and then leave the buffer around with dirty pages that will be + * written out later. + */ + LIST_FOREACH(bp, &commitlist, nb_vnbufs) { + if (!ISSET(bp->nb_flags, NB_PAGELIST)) { + retv = nfs_buf_upl_setup(bp); + if (retv) { + /* Unable to create the UPL, the VM object probably no longer exists. */ + printf("nfs_flushcommits: upl create failed %d\n", retv); + bp->nb_valid = bp->nb_dirty = 0; } } + nfs_buf_upl_check(bp); + } - if (ISSET(bp->nb_flags, NB_ERROR)) { - error = bp->nb_error; + /* + * Commit data on the server, as required. + * If all bufs are using the same wcred, then use that with + * one call for all of them, otherwise commit each one + * separately. + */ + if (wcred_set == 1) { + /* + * Note, it's possible the commit range could be >2^32-1. + * If it is, we'll send one commit that covers the whole file. + */ + if ((endoff - off) > 0xffffffff) + count = 0; + else + count = (endoff - off); + retv = nmp->nm_funcs->nf_commit_rpc(np, off, count, wcred, wverf); + } else { + retv = 0; + LIST_FOREACH(bp, &commitlist, nb_vnbufs) { + toff = NBOFF(bp) + bp->nb_dirtyoff; + count = bp->nb_dirtyend - bp->nb_dirtyoff; + retv = nmp->nm_funcs->nf_commit_rpc(np, toff, count, bp->nb_wcred, wverf); + if (retv) + break; + } + } + + /* + * Now, either mark the blocks I/O done or mark the + * blocks dirty, depending on whether the commit + * succeeded. + */ + while ((bp = LIST_FIRST(&commitlist))) { + LIST_REMOVE(bp, nb_vnbufs); + FSDBG(557, bp, retv, bp->nb_flags, bp->nb_dirty); + nfs_node_lock_force(np); + CLR(bp->nb_flags, (NB_NEEDCOMMIT | NB_WRITEINPROG)); + np->n_needcommitcnt--; + CHECK_NEEDCOMMITCNT(np); + nfs_node_unlock(np); + + if (retv) { + /* move back to dirty list */ + lck_mtx_lock(nfs_buf_mutex); + LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs); + lck_mtx_unlock(nfs_buf_mutex); nfs_buf_release(bp, 1); - np->n_flag &= ~NWRBUSY; - FSDBG_BOT(515, vp, uio->uio_offset, uio_uio_resid(uio), error); - return (error); + continue; + } + + nfs_node_lock_force(np); + np->n_numoutput++; + nfs_node_unlock(np); + vnode_startwrite(NFSTOV(np)); + if (ISSET(bp->nb_flags, NB_DELWRI)) { + lck_mtx_lock(nfs_buf_mutex); + nfs_nbdwrite--; + NFSBUFCNTCHK(); + lck_mtx_unlock(nfs_buf_mutex); + wakeup(&nfs_nbdwrite); + } + CLR(bp->nb_flags, (NB_READ|NB_DONE|NB_ERROR|NB_DELWRI)); + /* if block still has dirty pages, we don't want it to */ + /* be released in nfs_buf_iodone(). So, don't set NB_ASYNC. */ + if (!(dirty = bp->nb_dirty)) + SET(bp->nb_flags, NB_ASYNC); + else + CLR(bp->nb_flags, NB_ASYNC); + + /* move to clean list */ + lck_mtx_lock(nfs_buf_mutex); + LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs); + lck_mtx_unlock(nfs_buf_mutex); + + bp->nb_dirtyoff = bp->nb_dirtyend = 0; + + nfs_buf_iodone(bp); + if (dirty) { + /* throw it back in as a delayed write buffer */ + CLR(bp->nb_flags, NB_DONE); + nfs_buf_write_delayed(bp); } + } + +done: + FSDBG_BOT(557, np, 0, 0, error); + return (error); +} + +/* + * Flush all the blocks associated with a vnode. + * Walk through the buffer pool and push any dirty pages + * associated with the vnode. + */ +int +nfs_flush(nfsnode_t np, int waitfor, thread_t thd, int ignore_writeerr) +{ + struct nfsbuf *bp; + struct nfsbuflists blist; + struct nfsmount *nmp = NFSTONMP(np); + int error = 0, error2, slptimeo = 0, slpflag = 0; + int nfsvers, flags, passone = 1; + + FSDBG_TOP(517, np, waitfor, ignore_writeerr, 0); + + if (!nmp) { + error = ENXIO; + goto out; + } + nfsvers = nmp->nm_vers; + if (NMFLAG(nmp, INTR)) + slpflag = PCATCH; + if (!LIST_EMPTY(&np->n_dirtyblkhd)) { + nfs_node_lock_force(np); np->n_flag |= NMODIFIED; + nfs_node_unlock(np); + } - NFS_BUF_MAP(bp); - error = uiomove((char *)bp->nb_data + on, n, uio); - if (error) { - SET(bp->nb_flags, NB_ERROR); - nfs_buf_release(bp, 1); - np->n_flag &= ~NWRBUSY; - FSDBG_BOT(515, vp, uio->uio_offset, uio_uio_resid(uio), error); - return (error); + lck_mtx_lock(nfs_buf_mutex); + while (np->n_bflag & NBFLUSHINPROG) { + np->n_bflag |= NBFLUSHWANT; + error = msleep(&np->n_bflag, nfs_buf_mutex, slpflag, "nfs_flush", NULL); + if ((error && (error != EWOULDBLOCK)) || + ((error = nfs_sigintr(NFSTONMP(np), NULL, thd, 0)))) { + lck_mtx_unlock(nfs_buf_mutex); + goto out; } + } + np->n_bflag |= NBFLUSHINPROG; - /* validate any pages written to */ - start = on & ~PAGE_MASK; - for (; start < on+n; start += PAGE_SIZE) { - NBPGVALID_SET(bp, start/PAGE_SIZE); - /* - * This may seem a little weird, but we don't actually set the - * dirty bits for writes. This is because we keep the dirty range - * in the nb_dirtyoff/nb_dirtyend fields. Also, particularly for - * delayed writes, when we give the pages back to the VM we don't - * want to keep them marked dirty, because when we later write the - * buffer we won't be able to tell which pages were written dirty - * and which pages were mmapped and dirtied. - */ + /* + * On the first pass, start async/unstable writes on all + * delayed write buffers. Then wait for all writes to complete + * and call nfs_flushcommits() to commit any uncommitted buffers. + * On all subsequent passes, start STABLE writes on any remaining + * dirty buffers. Then wait for all writes to complete. + */ +again: + FSDBG(518, LIST_FIRST(&np->n_dirtyblkhd), np->n_flag, 0, 0); + if (!NFSTONMP(np)) { + lck_mtx_unlock(nfs_buf_mutex); + error = ENXIO; + goto done; + } + + /* Start/do any write(s) that are required. */ + if (!nfs_buf_iterprepare(np, &blist, NBI_DIRTY)) { + while ((bp = LIST_FIRST(&blist))) { + LIST_REMOVE(bp, nb_vnbufs); + LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs); + flags = (passone || !(waitfor == MNT_WAIT || waitfor == MNT_DWAIT)) ? NBAC_NOWAIT : 0; + if (flags != NBAC_NOWAIT) + nfs_buf_refget(bp); + while ((error = nfs_buf_acquire(bp, flags, slpflag, slptimeo))) { + FSDBG(524, bp, flags, bp->nb_lflags, bp->nb_flags); + if (error == EBUSY) + break; + if (error) { + error2 = nfs_sigintr(NFSTONMP(np), NULL, thd, 0); + if (error2) { + if (flags != NBAC_NOWAIT) + nfs_buf_refrele(bp); + nfs_buf_itercomplete(np, &blist, NBI_DIRTY); + lck_mtx_unlock(nfs_buf_mutex); + error = error2; + goto done; + } + if (slpflag == PCATCH) { + slpflag = 0; + slptimeo = 2 * hz; + } + } + } + if (flags != NBAC_NOWAIT) + nfs_buf_refrele(bp); + if (error == EBUSY) + continue; + if (!bp->nb_np) { + /* buffer is no longer valid */ + nfs_buf_drop(bp); + continue; + } + if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) + nfs_buf_check_write_verifier(np, bp); + if (!ISSET(bp->nb_flags, NB_DELWRI)) { + /* buffer is no longer dirty */ + nfs_buf_drop(bp); + continue; + } + FSDBG(525, bp, passone, bp->nb_lflags, bp->nb_flags); + if ((passone || !(waitfor == MNT_WAIT || waitfor == MNT_DWAIT)) && + ISSET(bp->nb_flags, NB_NEEDCOMMIT)) { + nfs_buf_drop(bp); + continue; + } + nfs_buf_remfree(bp); + lck_mtx_unlock(nfs_buf_mutex); + if (ISSET(bp->nb_flags, NB_ERROR)) { + nfs_node_lock_force(np); + np->n_error = bp->nb_error ? bp->nb_error : EIO; + np->n_flag |= NWRITEERR; + nfs_node_unlock(np); + nfs_buf_release(bp, 1); + lck_mtx_lock(nfs_buf_mutex); + continue; + } + SET(bp->nb_flags, NB_ASYNC); + if (!passone) { + /* NB_STABLE forces this to be written FILESYNC */ + SET(bp->nb_flags, NB_STABLE); + } + nfs_buf_write(bp); + lck_mtx_lock(nfs_buf_mutex); } - if (bp->nb_dirtyend > 0) { - bp->nb_dirtyoff = min(on, bp->nb_dirtyoff); - bp->nb_dirtyend = max((on + n), bp->nb_dirtyend); - } else { - bp->nb_dirtyoff = on; - bp->nb_dirtyend = on + n; + nfs_buf_itercomplete(np, &blist, NBI_DIRTY); + } + lck_mtx_unlock(nfs_buf_mutex); + + if (waitfor == MNT_WAIT || waitfor == MNT_DWAIT) { + while ((error = vnode_waitforwrites(NFSTOV(np), 0, slpflag, slptimeo, "nfsflush"))) { + error2 = nfs_sigintr(NFSTONMP(np), NULL, thd, 0); + if (error2) { + error = error2; + goto done; + } + if (slpflag == PCATCH) { + slpflag = 0; + slptimeo = 2 * hz; + } } - if (bp->nb_validend <= 0 || bp->nb_validend < bp->nb_dirtyoff || - bp->nb_validoff > bp->nb_dirtyend) { - bp->nb_validoff = bp->nb_dirtyoff; - bp->nb_validend = bp->nb_dirtyend; - } else { - bp->nb_validoff = min(bp->nb_validoff, bp->nb_dirtyoff); - bp->nb_validend = max(bp->nb_validend, bp->nb_dirtyend); + } + + if (nfsvers != NFS_VER2) { + /* loop while it looks like there are still buffers to be */ + /* commited and nfs_flushcommits() seems to be handling them. */ + while (np->n_needcommitcnt) + if (nfs_flushcommits(np, 0)) + break; + } + + if (passone) { + passone = 0; + if (!LIST_EMPTY(&np->n_dirtyblkhd)) { + nfs_node_lock_force(np); + np->n_flag |= NMODIFIED; + nfs_node_unlock(np); } - if (!ISSET(bp->nb_flags, NB_CACHE)) - nfs_buf_normalize_valid_range(np, bp); + lck_mtx_lock(nfs_buf_mutex); + goto again; + } + if (waitfor == MNT_WAIT || waitfor == MNT_DWAIT) { + if (!LIST_EMPTY(&np->n_dirtyblkhd)) { + nfs_node_lock_force(np); + np->n_flag |= NMODIFIED; + nfs_node_unlock(np); + } + lck_mtx_lock(nfs_buf_mutex); + if (!LIST_EMPTY(&np->n_dirtyblkhd)) + goto again; + lck_mtx_unlock(nfs_buf_mutex); + nfs_node_lock_force(np); /* - * Since this block is being modified, it must be written - * again and not just committed. + * OK, it looks like there are no dirty blocks. If we have no + * writes in flight and no one in the write code, we can clear + * the modified flag. In order to make sure we see the latest + * attributes and size, we also invalidate the attributes and + * advance the attribute cache XID to guarantee that attributes + * newer than our clearing of NMODIFIED will get loaded next. + * (If we don't do this, it's possible for the flush's final + * write/commit (xid1) to be executed in parallel with a subsequent + * getattr request (xid2). The getattr could return attributes + * from *before* the write/commit completed but the stale attributes + * would be preferred because of the xid ordering.) */ - if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) { - np->n_needcommitcnt--; - CHECK_NEEDCOMMITCNT(np); + if (!np->n_wrbusy && !np->n_numoutput) { + np->n_flag &= ~NMODIFIED; + NATTRINVALIDATE(np); + nfs_get_xid(&np->n_xid); } - CLR(bp->nb_flags, NB_NEEDCOMMIT); - - if (ioflag & IO_SYNC) { - bp->nb_proc = p; - error = nfs_buf_write(bp); - if (error) { - np->n_flag &= ~NWRBUSY; - FSDBG_BOT(515, vp, uio->uio_offset, - uio_uio_resid(uio), error); - return (error); - } - } else if (((n + on) == biosize) || (np->n_flag & NNOCACHE)) { - bp->nb_proc = NULL; - SET(bp->nb_flags, NB_ASYNC); - nfs_buf_write(bp); - } else - nfs_buf_write_delayed(bp, p); - - if (np->n_needcommitcnt > (nfsbufcnt/16)) - nfs_flushcommits(vp, p, 1); - - } while (uio_uio_resid(uio) > 0 && n > 0); + } else { + nfs_node_lock_force(np); + } - np->n_flag &= ~NWRBUSY; - FSDBG_BOT(515, vp, uio->uio_offset, uio_uio_resid(uio), 0); - return (0); + FSDBG(526, np->n_flag, np->n_error, 0, 0); + if (!ignore_writeerr && (np->n_flag & NWRITEERR)) { + error = np->n_error; + np->n_flag &= ~NWRITEERR; + } + nfs_node_unlock(np); +done: + lck_mtx_lock(nfs_buf_mutex); + flags = np->n_bflag; + np->n_bflag &= ~(NBFLUSHINPROG|NBFLUSHWANT); + lck_mtx_unlock(nfs_buf_mutex); + if (flags & NBFLUSHWANT) + wakeup(&np->n_bflag); +out: + FSDBG_BOT(517, np, error, ignore_writeerr, 0); + return (error); } /* * Flush out and invalidate all buffers associated with a vnode. * Called with the underlying object locked. */ -static int +int nfs_vinvalbuf_internal( - vnode_t vp, + nfsnode_t np, int flags, + thread_t thd, kauth_cred_t cred, - proc_t p, int slpflag, int slptimeo) { struct nfsbuf *bp; struct nfsbuflists blist; int list, error = 0; - struct nfsnode *np = VTONFS(vp); if (flags & V_SAVE) { - if ((error = nfs_flush(vp, MNT_WAIT, cred, p, - (flags & V_IGNORE_WRITEERR)))) + if ((error = nfs_flush(np, MNT_WAIT, thd, (flags & V_IGNORE_WRITEERR)))) return (error); - if (!LIST_EMPTY(&np->n_dirtyblkhd)) - panic("nfs_vinvalbuf: dirty bufs (vp 0x%x, bp 0x%x)", - vp, LIST_FIRST(&np->n_dirtyblkhd)); } lck_mtx_lock(nfs_buf_mutex); @@ -2514,9 +3487,9 @@ nfs_vinvalbuf_internal( LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs); nfs_buf_refget(bp); while ((error = nfs_buf_acquire(bp, NBAC_REMOVE, slpflag, slptimeo))) { - FSDBG(556, vp, bp, NBOFF(bp), bp->nb_flags); + FSDBG(556, np, bp, NBOFF(bp), bp->nb_flags); if (error != EAGAIN) { - FSDBG(554, vp, bp, -1, error); + FSDBG(554, np, bp, -1, error); nfs_buf_refrele(bp); nfs_buf_itercomplete(np, &blist, list); lck_mtx_unlock(nfs_buf_mutex); @@ -2524,11 +3497,11 @@ nfs_vinvalbuf_internal( } } nfs_buf_refrele(bp); - FSDBG(554, vp, bp, NBOFF(bp), bp->nb_flags); + FSDBG(554, np, bp, NBOFF(bp), bp->nb_flags); lck_mtx_unlock(nfs_buf_mutex); - if ((flags & V_SAVE) && UBCINFOEXISTS(vp) && bp->nb_vp && + if ((flags & V_SAVE) && UBCINFOEXISTS(NFSTOV(np)) && bp->nb_np && (NBOFF(bp) < (off_t)np->n_size)) { - /* XXX extra paranoia: make sure we're not */ + /* extra paranoia: make sure we're not */ /* somehow leaving any dirty data around */ int mustwrite = 0; int end = (NBOFF(bp) + bp->nb_bufsize > (off_t)np->n_size) ? @@ -2545,20 +3518,26 @@ nfs_vinvalbuf_internal( } nfs_buf_upl_check(bp); /* check for any dirty data before the EOF */ - if (bp->nb_dirtyend && bp->nb_dirtyoff < end) { + if ((bp->nb_dirtyend > 0) && (bp->nb_dirtyoff < end)) { /* clip dirty range to EOF */ - if (bp->nb_dirtyend > end) + if (bp->nb_dirtyend > end) { bp->nb_dirtyend = end; - mustwrite++; + if (bp->nb_dirtyoff >= bp->nb_dirtyend) + bp->nb_dirtyoff = bp->nb_dirtyend = 0; + } + if ((bp->nb_dirtyend > 0) && (bp->nb_dirtyoff < end)) + mustwrite++; } bp->nb_dirty &= (1 << (round_page_32(end)/PAGE_SIZE)) - 1; + if (bp->nb_dirty) + mustwrite++; /* also make sure we'll have a credential to do the write */ - if (mustwrite && (bp->nb_wcred == NOCRED) && (cred == NOCRED)) { + if (mustwrite && !IS_VALID_CRED(bp->nb_wcred) && !IS_VALID_CRED(cred)) { printf("nfs_vinvalbuf: found dirty buffer with no write creds\n"); mustwrite = 0; } if (mustwrite) { - FSDBG(554, vp, bp, 0xd00dee, bp->nb_flags); + FSDBG(554, np, bp, 0xd00dee, bp->nb_flags); if (!ISSET(bp->nb_flags, NB_PAGELIST)) panic("nfs_vinvalbuf: dirty buffer without upl"); /* gotta write out dirty data before invalidating */ @@ -2566,7 +3545,7 @@ nfs_vinvalbuf_internal( /* (NB_NOCACHE indicates buffer should be discarded) */ CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL | NB_ASYNC)); SET(bp->nb_flags, NB_STABLE | NB_NOCACHE); - if (bp->nb_wcred == NOCRED) { + if (!IS_VALID_CRED(bp->nb_wcred)) { kauth_cred_ref(cred); bp->nb_wcred = cred; } @@ -2574,8 +3553,11 @@ nfs_vinvalbuf_internal( // Note: bp has been released if (error) { FSDBG(554, bp, 0xd00dee, 0xbad, error); - np->n_error = error; - np->n_flag |= NWRITEERR; + nfs_node_lock_force(np); + if ((error != EINTR) && (error != ERESTART)) { + np->n_error = error; + np->n_flag |= NWRITEERR; + } /* * There was a write error and we need to * invalidate attrs to sync with server. @@ -2583,6 +3565,18 @@ nfs_vinvalbuf_internal( * we may no longer know the correct size) */ NATTRINVALIDATE(np); + nfs_node_unlock(np); + if ((error == EINTR) || (error == ERESTART)) { + /* + * Abort on EINTR. If we don't, we could + * be stuck in this loop forever because + * the buffer will continue to stay dirty. + */ + lck_mtx_lock(nfs_buf_mutex); + nfs_buf_itercomplete(np, &blist, list); + lck_mtx_unlock(nfs_buf_mutex); + return (error); + } error = 0; } lck_mtx_lock(nfs_buf_mutex); @@ -2596,10 +3590,16 @@ nfs_vinvalbuf_internal( } nfs_buf_itercomplete(np, &blist, list); } + if (!LIST_EMPTY(&(np)->n_dirtyblkhd) || !LIST_EMPTY(&(np)->n_cleanblkhd)) + panic("nfs_vinvalbuf: flush/inval failed"); lck_mtx_unlock(nfs_buf_mutex); + nfs_node_lock_force(np); + if (!(flags & V_SAVE)) + np->n_flag &= ~NMODIFIED; + if (vnode_vtype(NFSTOV(np)) == VREG) + np->n_lastrahead = -1; + nfs_node_unlock(np); NFS_BUF_FREEUP(); - if (NVALIDBUFS(np)) - panic("nfs_vinvalbuf: flush failed"); return (0); } @@ -2609,21 +3609,23 @@ nfs_vinvalbuf_internal( * doing the flush, just wait for completion. */ int -nfs_vinvalbuf( - vnode_t vp, - int flags, - kauth_cred_t cred, - proc_t p, - int intrflg) +nfs_vinvalbuf(vnode_t vp, int flags, vfs_context_t ctx, int intrflg) +{ + return nfs_vinvalbuf2(vp, flags, vfs_context_thread(ctx), vfs_context_ucred(ctx), intrflg); +} + +int +nfs_vinvalbuf2(vnode_t vp, int flags, thread_t thd, kauth_cred_t cred, int intrflg) { - struct nfsnode *np = VTONFS(vp); - struct nfsmount *nmp = VFSTONFS(vnode_mount(vp)); - int error = 0, slpflag, slptimeo; + nfsnode_t np = VTONFS(vp); + struct nfsmount *nmp = VTONMP(vp); + int error, slpflag, slptimeo, nflags, retry = 0; + struct timespec ts = { 2, 0 }; off_t size; - FSDBG_TOP(554, vp, flags, intrflg, 0); + FSDBG_TOP(554, np, flags, intrflg, 0); - if (nmp && ((nmp->nm_flag & NFSMNT_INT) == 0)) + if (nmp && !NMFLAG(nmp, INTR)) intrflg = 0; if (intrflg) { slpflag = PCATCH; @@ -2632,639 +3634,216 @@ nfs_vinvalbuf( slpflag = 0; slptimeo = 0; } - /* - * First wait for any other process doing a flush to complete. - */ - while (np->n_flag & NFLUSHINPROG) { - np->n_flag |= NFLUSHWANT; - FSDBG_TOP(555, vp, flags, intrflg, np->n_flag); - error = tsleep((caddr_t)&np->n_flag, PRIBIO + 2, "nfsvinval", slptimeo); - FSDBG_BOT(555, vp, flags, intrflg, np->n_flag); - if (error && (error = nfs_sigintr(VFSTONFS(vnode_mount(vp)), NULL, p))) { - FSDBG_BOT(554, vp, flags, intrflg, error); + + /* First wait for any other process doing a flush to complete. */ + lck_mtx_lock(nfs_buf_mutex); + while (np->n_bflag & NBINVALINPROG) { + np->n_bflag |= NBINVALWANT; + msleep(&np->n_bflag, nfs_buf_mutex, slpflag, "nfs_vinvalbuf", &ts); + if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0))) { + lck_mtx_unlock(nfs_buf_mutex); return (error); } + if (np->n_bflag & NBINVALINPROG) + slpflag = 0; } + np->n_bflag |= NBINVALINPROG; + lck_mtx_unlock(nfs_buf_mutex); - /* - * Now, flush as required. - */ - np->n_flag |= NFLUSHINPROG; - error = nfs_vinvalbuf_internal(vp, flags, cred, p, slpflag, 0); + /* Now, flush as required. */ +again: + error = nfs_vinvalbuf_internal(np, flags, thd, cred, slpflag, 0); while (error) { - FSDBG(554, vp, 0, 0, error); - error = nfs_sigintr(VFSTONFS(vnode_mount(vp)), NULL, p); - if (error) { - np->n_flag &= ~NFLUSHINPROG; - if (np->n_flag & NFLUSHWANT) { - np->n_flag &= ~NFLUSHWANT; - wakeup((caddr_t)&np->n_flag); - } - FSDBG_BOT(554, vp, flags, intrflg, error); - return (error); - } - error = nfs_vinvalbuf_internal(vp, flags, cred, p, 0, slptimeo); - } - np->n_flag &= ~(NMODIFIED | NFLUSHINPROG); - if (np->n_flag & NFLUSHWANT) { - np->n_flag &= ~NFLUSHWANT; - wakeup((caddr_t)&np->n_flag); - } - /* - * get the pages out of vm also - */ - if (UBCINFOEXISTS(vp) && (size = ubc_getsize(vp))) { - int rv = ubc_sync_range(vp, 0, size, UBC_PUSHALL | UBC_INVALIDATE); - if (!rv) - panic("nfs_vinvalbuf(): ubc_sync_range failed!"); + FSDBG(554, np, 0, 0, error); + if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0))) + goto done; + error = nfs_vinvalbuf_internal(np, flags, thd, cred, 0, slptimeo); } - FSDBG_BOT(554, vp, flags, intrflg, 0); - return (0); + /* get the pages out of vm also */ + if (UBCINFOEXISTS(vp) && (size = ubc_getsize(vp))) + if ((error = ubc_msync(vp, 0, size, NULL, UBC_PUSHALL | UBC_SYNC | UBC_INVALIDATE))) { + if (error == EINVAL) + panic("nfs_vinvalbuf(): ubc_msync failed!, error %d", error); + if (retry++ < 10) /* retry invalidating a few times */ + goto again; + /* give up */ + printf("nfs_vinvalbuf(): ubc_msync failed!, error %d", error); + + } +done: + lck_mtx_lock(nfs_buf_mutex); + nflags = np->n_bflag; + np->n_bflag &= ~(NBINVALINPROG|NBINVALWANT); + lck_mtx_unlock(nfs_buf_mutex); + if (nflags & NBINVALWANT) + wakeup(&np->n_bflag); + + FSDBG_BOT(554, np, flags, intrflg, error); + return (error); } /* - * Initiate asynchronous I/O. Return an error if no nfsiods are available. - * This is mainly to avoid queueing async I/O requests when the nfsiods - * are all hung on a dead server. + * Wait for any busy buffers to complete. */ -int -nfs_asyncio(bp, cred) - struct nfsbuf *bp; - kauth_cred_t cred; +void +nfs_wait_bufs(nfsnode_t np) { - struct nfsmount *nmp; - int i; - int gotiod; - int slpflag = 0; - int slptimeo = 0; - int error, error2; - void *wakeme = NULL; - struct timespec ts; - - if (nfs_numasync == 0) - return (EIO); - - FSDBG_TOP(552, bp, bp ? NBOFF(bp) : 0, bp ? bp->nb_flags : 0, 0); - - nmp = ((bp != NULL) ? VFSTONFS(vnode_mount(bp->nb_vp)) : NULL); -again: - if (nmp && nmp->nm_flag & NFSMNT_INT) - slpflag = PCATCH; - gotiod = FALSE; - - lck_mtx_lock(nfs_iod_mutex); - - /* no nfsbuf means tell nfsiod to process delwri list */ - if (!bp) - nfs_ioddelwri = 1; + struct nfsbuf *bp; + struct nfsbuflists blist; + int error = 0; - /* - * Find a free iod to process this request. - */ - for (i = 0; i < NFS_MAXASYNCDAEMON; i++) - if (nfs_iodwant[i]) { - /* - * Found one, so wake it up and tell it which - * mount to process. - */ - nfs_iodwant[i] = NULL; - nfs_iodmount[i] = nmp; - if (nmp) - nmp->nm_bufqiods++; - wakeme = &nfs_iodwant[i]; - gotiod = TRUE; - break; + lck_mtx_lock(nfs_buf_mutex); + if (!nfs_buf_iterprepare(np, &blist, NBI_CLEAN)) { + while ((bp = LIST_FIRST(&blist))) { + LIST_REMOVE(bp, nb_vnbufs); + LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs); + nfs_buf_refget(bp); + while ((error = nfs_buf_acquire(bp, 0, 0, 0))) { + if (error != EAGAIN) { + nfs_buf_refrele(bp); + nfs_buf_itercomplete(np, &blist, NBI_CLEAN); + lck_mtx_unlock(nfs_buf_mutex); + return; + } + } + nfs_buf_refrele(bp); + nfs_buf_drop(bp); } - - /* if we're just poking the delwri list, we're done */ - if (!bp) { - lck_mtx_unlock(nfs_iod_mutex); - if (wakeme) - wakeup(wakeme); - FSDBG_BOT(552, bp, 0x10101010, wakeme, 0); - return (0); + nfs_buf_itercomplete(np, &blist, NBI_CLEAN); } - - /* - * If none are free, we may already have an iod working on this mount - * point. If so, it will process our request. - */ - if (!gotiod) { - if (nmp->nm_bufqiods > 0) { - gotiod = TRUE; + if (!nfs_buf_iterprepare(np, &blist, NBI_DIRTY)) { + while ((bp = LIST_FIRST(&blist))) { + LIST_REMOVE(bp, nb_vnbufs); + LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs); + nfs_buf_refget(bp); + while ((error = nfs_buf_acquire(bp, 0, 0, 0))) { + if (error != EAGAIN) { + nfs_buf_refrele(bp); + nfs_buf_itercomplete(np, &blist, NBI_DIRTY); + lck_mtx_unlock(nfs_buf_mutex); + return; + } + } + nfs_buf_refrele(bp); + nfs_buf_drop(bp); } + nfs_buf_itercomplete(np, &blist, NBI_DIRTY); } + lck_mtx_unlock(nfs_buf_mutex); +} - /* - * If we have an iod which can process the request, then queue - * the buffer. - */ - FSDBG(552, bp, gotiod, i, nmp->nm_bufqiods); - if (gotiod) { - /* - * Ensure that the queue never grows too large. - */ - while (nmp->nm_bufqlen >= 2*nfs_numasync) { - if (ISSET(bp->nb_flags, NB_IOD)) { - /* An nfsiod is attempting this async operation so */ - /* we must not fall asleep on the bufq because we */ - /* could be waiting on ourself. Just return error */ - /* and we'll do this operation syncrhonously. */ - goto out; - } - FSDBG(552, bp, nmp->nm_bufqlen, 2*nfs_numasync, -1); - nmp->nm_bufqwant = TRUE; - ts.tv_sec = (slptimeo/100); - /* the hz value is 100; which leads to 10ms */ - ts.tv_nsec = (slptimeo % 100) * 10 * NSEC_PER_USEC * 1000; +/* + * Add an async I/O request to the mount's async I/O queue and make + * sure that an nfsiod will service it. + */ +void +nfs_asyncio_finish(struct nfsreq *req) +{ + struct nfsmount *nmp; + struct nfsiod *niod; + int started = 0; - error = msleep(&nmp->nm_bufq, nfs_iod_mutex, slpflag | PRIBIO, - "nfsaio", &ts); - if (error) { - error2 = nfs_sigintr(nmp, NULL, bp->nb_proc); - if (error2) { - lck_mtx_unlock(nfs_iod_mutex); - FSDBG_BOT(552, bp, NBOFF(bp), bp->nb_flags, error2); - return (error2); - } - if (slpflag == PCATCH) { - slpflag = 0; - slptimeo = 2 * hz; - } - } + FSDBG_TOP(552, nmp, 0, 0, 0); +again: + if (((nmp = req->r_nmp)) == NULL) + return; + lck_mtx_lock(nfsiod_mutex); + niod = nmp->nm_niod; + + /* grab an nfsiod if we don't have one already */ + if (!niod) { + niod = TAILQ_FIRST(&nfsiodfree); + if (niod) { + TAILQ_REMOVE(&nfsiodfree, niod, niod_link); + TAILQ_INSERT_TAIL(&nfsiodwork, niod, niod_link); + niod->niod_nmp = nmp; + } else if (((nfsiod_thread_count < NFSIOD_MAX) || (nfsiod_thread_count <= 0)) && (started < 4)) { /* - * We might have lost our iod while sleeping, - * so check and loop if nescessary. + * Try starting a new thread. + * We may try a couple times if other callers + * get the new threads before we do. */ - if (nmp->nm_bufqiods == 0) { - lck_mtx_unlock(nfs_iod_mutex); + lck_mtx_unlock(nfsiod_mutex); + started++; + if (!nfsiod_start()) goto again; - } + lck_mtx_lock(nfsiod_mutex); } + } - if (ISSET(bp->nb_flags, NB_READ)) { - if (bp->nb_rcred == NOCRED && cred != NOCRED) { - kauth_cred_ref(cred); - bp->nb_rcred = cred; - } + if (req->r_achain.tqe_next == NFSREQNOLIST) + TAILQ_INSERT_TAIL(&nmp->nm_iodq, req, r_achain); + + /* If this mount doesn't already have an nfsiod working on it... */ + if (!nmp->nm_niod) { + if (niod) { /* give it the nfsiod we just grabbed */ + nmp->nm_niod = niod; + lck_mtx_unlock(nfsiod_mutex); + wakeup(niod); + } else if (nfsiod_thread_count > 0) { + /* just queue it up on nfsiod mounts queue */ + TAILQ_INSERT_TAIL(&nfsiodmounts, nmp, nm_iodlink); + lck_mtx_unlock(nfsiod_mutex); } else { - SET(bp->nb_flags, NB_WRITEINPROG); - if (bp->nb_wcred == NOCRED && cred != NOCRED) { - kauth_cred_ref(cred); - bp->nb_wcred = cred; - } + printf("nfs_asyncio(): no nfsiods? %d %d (%d)\n", nfsiod_thread_count, NFSIOD_MAX, started); + lck_mtx_unlock(nfsiod_mutex); + /* we have no other option but to be persistent */ + started = 0; + goto again; } - - TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, nb_free); - nmp->nm_bufqlen++; - lck_mtx_unlock(nfs_iod_mutex); - if (wakeme) - wakeup(wakeme); - FSDBG_BOT(552, bp, NBOFF(bp), bp->nb_flags, 0); - return (0); + } else { + lck_mtx_unlock(nfsiod_mutex); } -out: - lck_mtx_unlock(nfs_iod_mutex); - /* - * All the iods are busy on other mounts, so return EIO to - * force the caller to process the i/o synchronously. - */ - FSDBG_BOT(552, bp, NBOFF(bp), bp->nb_flags, EIO); - return (EIO); + FSDBG_BOT(552, nmp, 0, 0, 0); } /* - * Do an I/O operation to/from a cache block. This may be called - * synchronously or from an nfsiod. + * queue up async I/O request for resend */ -int -nfs_doio(struct nfsbuf *bp, kauth_cred_t cr, proc_t p) +void +nfs_asyncio_resend(struct nfsreq *req) { - struct uio *uiop; - vnode_t vp; - struct nfsnode *np; - struct nfsmount *nmp; - int error = 0, diff, len, iomode, must_commit = 0, invalidate = 0; - struct uio uio; - struct iovec_32 io; - enum vtype vtype; - - vp = bp->nb_vp; - vtype = vnode_vtype(vp); - np = VTONFS(vp); - nmp = VFSTONFS(vnode_mount(vp)); - uiop = &uio; - uiop->uio_iovs.iov32p = &io; - uiop->uio_iovcnt = 1; -#if 1 /* LP64todo - can't use new segment flags until the drivers are ready */ - uiop->uio_segflg = UIO_SYSSPACE; -#else - uiop->uio_segflg = UIO_SYSSPACE32; -#endif + struct nfsmount *nmp = req->r_nmp; - /* - * we've decided to perform I/O for this block, - * so we couldn't possibly NB_DONE. So, clear it. - */ - if (ISSET(bp->nb_flags, NB_DONE)) { - if (!ISSET(bp->nb_flags, NB_ASYNC)) - panic("nfs_doio: done and not async"); - CLR(bp->nb_flags, NB_DONE); + if (!nmp) + return; + nfs_gss_clnt_rpcdone(req); + lck_mtx_lock(&nmp->nm_lock); + if (!(req->r_flags & R_RESENDQ)) { + TAILQ_INSERT_TAIL(&nmp->nm_resendq, req, r_rchain); + req->r_flags |= R_RESENDQ; } - FSDBG_TOP(256, np->n_size, NBOFF(bp), bp->nb_bufsize, bp->nb_flags); - FSDBG(257, bp->nb_validoff, bp->nb_validend, bp->nb_dirtyoff, - bp->nb_dirtyend); - - if (ISSET(bp->nb_flags, NB_READ)) { - if (vtype == VREG) - NFS_BUF_MAP(bp); - io.iov_len = bp->nb_bufsize; - uio_uio_resid_set(uiop, io.iov_len); - io.iov_base = (uintptr_t) bp->nb_data; - uiop->uio_rw = UIO_READ; - switch (vtype) { - case VREG: - uiop->uio_offset = NBOFF(bp); - OSAddAtomic(1, (SInt32*)&nfsstats.read_bios); - error = nfs_readrpc(vp, uiop, cr, p); - FSDBG(262, np->n_size, NBOFF(bp), uio_uio_resid(uiop), error); - if (!error) { - /* update valid range */ - bp->nb_validoff = 0; - if (uio_uio_resid(uiop) != 0) { - /* - * If len > 0, there is a hole in the file and - * no writes after the hole have been pushed to - * the server yet. - * Just zero fill the rest of the valid area. - */ - // LP64todo - fix this - diff = bp->nb_bufsize - uio_uio_resid(uiop); - len = np->n_size - (NBOFF(bp) + diff); - if (len > 0) { - // LP64todo - fix this - len = min(len, uio_uio_resid(uiop)); - bzero((char *)bp->nb_data + diff, len); - bp->nb_validend = diff + len; - FSDBG(258, diff, len, 0, 1); - } else - bp->nb_validend = diff; - } else - bp->nb_validend = bp->nb_bufsize; - bp->nb_valid = (1 << (round_page_32(bp->nb_validend)/PAGE_SIZE)) - 1; - if (bp->nb_validend & PAGE_MASK) { - /* valid range ends in the middle of a page so we */ - /* need to zero-fill any invalid data at the end */ - /* of the last page */ - bzero((caddr_t)(bp->nb_data + bp->nb_validend), - bp->nb_bufsize - bp->nb_validend); - FSDBG(258, bp->nb_validend, - bp->nb_bufsize - bp->nb_validend, 0, 2); - } - } - break; - case VLNK: - uiop->uio_offset = (off_t)0; - OSAddAtomic(1, (SInt32*)&nfsstats.readlink_bios); - error = nfs_readlinkrpc(vp, uiop, cr, p); - if (!error) { - bp->nb_validoff = 0; - bp->nb_validend = uiop->uio_offset; - } - break; - case VDIR: - OSAddAtomic(1, (SInt32*)&nfsstats.readdir_bios); - uiop->uio_offset = NBOFF(bp); - if (!(nmp->nm_flag & NFSMNT_NFSV3)) - nmp->nm_flag &= ~NFSMNT_RDIRPLUS; /* dk@farm.org */ - if (nmp->nm_flag & NFSMNT_RDIRPLUS) { - error = nfs_readdirplusrpc(vp, uiop, cr, p); - if (error == NFSERR_NOTSUPP) - nmp->nm_flag &= ~NFSMNT_RDIRPLUS; - } - if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0) - error = nfs_readdirrpc(vp, uiop, cr, p); - if (!error) { - bp->nb_validoff = 0; - bp->nb_validend = uiop->uio_offset - NBOFF(bp); - bp->nb_valid = (1 << (round_page_32(bp->nb_validend)/PAGE_SIZE)) - 1; - } - break; - default: - printf("nfs_doio: type %x unexpected\n", vtype); - break; - }; - if (error) { - SET(bp->nb_flags, NB_ERROR); - bp->nb_error = error; - } - - } else { - /* we're doing a write */ - int doff, dend = 0; - - /* We need to make sure the pages are locked before doing I/O. */ - if (!ISSET(bp->nb_flags, NB_META) && UBCINFOEXISTS(vp)) { - if (!ISSET(bp->nb_flags, NB_PAGELIST)) { - error = nfs_buf_upl_setup(bp); - if (error) { - printf("nfs_doio: upl create failed %d\n", error); - SET(bp->nb_flags, NB_ERROR); - bp->nb_error = EIO; - return (EIO); - } - nfs_buf_upl_check(bp); - } - } - - if (ISSET(bp->nb_flags, NB_WASDIRTY)) { - FSDBG(256, bp, NBOFF(bp), bp->nb_dirty, 0xd00dee); - /* - * There are pages marked dirty that need to be written out. - * - * We don't want to just combine the write range with the - * range of pages that are dirty because that could cause us - * to write data that wasn't actually written to. - * We also don't want to write data more than once. - * - * If the dirty range just needs to be committed, we do that. - * Otherwise, we write the dirty range and clear the dirty bits - * for any COMPLETE pages covered by that range. - * If there are dirty pages left after that, we write out the - * parts that we haven't written yet. - */ - } - - /* - * If NB_NEEDCOMMIT is set, a commit rpc may do the trick. If not - * an actual write will have to be done. - * If NB_WRITEINPROG is already set, then push it with a write anyhow. - */ - if ((bp->nb_flags & (NB_NEEDCOMMIT | NB_WRITEINPROG)) == NB_NEEDCOMMIT) { - doff = NBOFF(bp) + bp->nb_dirtyoff; - SET(bp->nb_flags, NB_WRITEINPROG); - error = nfs_commit(vp, doff, bp->nb_dirtyend - bp->nb_dirtyoff, - bp->nb_wcred, bp->nb_proc); - CLR(bp->nb_flags, NB_WRITEINPROG); - if (!error) { - bp->nb_dirtyoff = bp->nb_dirtyend = 0; - CLR(bp->nb_flags, NB_NEEDCOMMIT); - np->n_needcommitcnt--; - CHECK_NEEDCOMMITCNT(np); - } else if (error == NFSERR_STALEWRITEVERF) - nfs_clearcommit(vnode_mount(vp)); - } - - if (!error && bp->nb_dirtyend > 0) { - /* there's a dirty range that needs to be written out */ - u_int32_t pagemask; - int firstpg, lastpg; - - if (NBOFF(bp) + bp->nb_dirtyend > (off_t)np->n_size) - bp->nb_dirtyend = np->n_size - NBOFF(bp); - - NFS_BUF_MAP(bp); - - doff = bp->nb_dirtyoff; - dend = bp->nb_dirtyend; - - /* if doff page is dirty, move doff to start of page */ - if (NBPGDIRTY(bp,doff/PAGE_SIZE)) - doff -= doff & PAGE_MASK; - /* try to expand write range to include preceding dirty pages */ - if (!(doff & PAGE_MASK)) - while (doff > 0 && NBPGDIRTY(bp,(doff-1)/PAGE_SIZE)) - doff -= PAGE_SIZE; - /* if dend page is dirty, move dend to start of next page */ - if ((dend & PAGE_MASK) && NBPGDIRTY(bp,dend/PAGE_SIZE)) - dend = round_page_32(dend); - /* try to expand write range to include trailing dirty pages */ - if (!(dend & PAGE_MASK)) - while (dend < bp->nb_bufsize && NBPGDIRTY(bp,dend/PAGE_SIZE)) - dend += PAGE_SIZE; - /* make sure to keep dend clipped to EOF */ - if (NBOFF(bp) + dend > (off_t)np->n_size) - dend = np->n_size - NBOFF(bp); - /* calculate range of complete pages being written */ - firstpg = round_page_32(doff) / PAGE_SIZE; - lastpg = (trunc_page_32(dend) - 1)/ PAGE_SIZE; - /* calculate mask for that page range */ - pagemask = ((1 << (lastpg+1)) - 1) & ~((1 << firstpg) - 1); - - /* compare page mask to nb_dirty; if there are other dirty pages */ - /* then write FILESYNC; otherwise, write UNSTABLE if async and */ - /* not needcommit/nocache/call; otherwise write FILESYNC */ - if (bp->nb_dirty & ~pagemask) - iomode = NFSV3WRITE_FILESYNC; - else if ((bp->nb_flags & (NB_ASYNC | NB_NEEDCOMMIT | NB_NOCACHE | NB_STABLE)) == NB_ASYNC) - iomode = NFSV3WRITE_UNSTABLE; - else - iomode = NFSV3WRITE_FILESYNC; - - /* write the dirty range */ - io.iov_len = dend - doff; - uio_uio_resid_set(uiop, io.iov_len); - uiop->uio_offset = NBOFF(bp) + doff; - io.iov_base = (uintptr_t) bp->nb_data + doff; - uiop->uio_rw = UIO_WRITE; - - OSAddAtomic(1, (SInt32*)&nfsstats.write_bios); - - SET(bp->nb_flags, NB_WRITEINPROG); - error = nfs_writerpc(vp, uiop, cr, p, &iomode, &must_commit); - if (must_commit) - nfs_clearcommit(vnode_mount(vp)); - /* clear dirty bits for pages we've written */ - if (!error) - bp->nb_dirty &= ~pagemask; - /* set/clear needcommit flag */ - if (!error && iomode == NFSV3WRITE_UNSTABLE) { - if (!ISSET(bp->nb_flags, NB_NEEDCOMMIT)) - np->n_needcommitcnt++; - SET(bp->nb_flags, NB_NEEDCOMMIT); - /* make sure nb_dirtyoff/nb_dirtyend reflect actual range written */ - bp->nb_dirtyoff = doff; - bp->nb_dirtyend = dend; - } else { - if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) { - np->n_needcommitcnt--; - CHECK_NEEDCOMMITCNT(np); - } - CLR(bp->nb_flags, NB_NEEDCOMMIT); - } - CLR(bp->nb_flags, NB_WRITEINPROG); - /* - * For an interrupted write, the buffer is still valid and the write - * hasn't been pushed to the server yet, so we can't set NB_ERROR and - * report the interruption by setting NB_EINTR. For the NB_ASYNC case, - * NB_EINTR is not relevant. - * - * For the case of a V3 write rpc not being committed to stable - * storage, the block is still dirty and requires either a commit rpc - * or another write rpc with iomode == NFSV3WRITE_FILESYNC before the - * block is reused. This is indicated by setting the NB_DELWRI and - * NB_NEEDCOMMIT flags. - */ - if (error == EINTR || (!error && bp->nb_flags & NB_NEEDCOMMIT)) { - CLR(bp->nb_flags, NB_INVAL | NB_NOCACHE); - if (!ISSET(bp->nb_flags, NB_DELWRI)) { - SET(bp->nb_flags, NB_DELWRI); - OSAddAtomic(1, (SInt32*)&nfs_nbdwrite); - NFSBUFCNTCHK(0); - } - FSDBG(261, bp->nb_validoff, bp->nb_validend, - bp->nb_bufsize, 0); - /* - * Since for the NB_ASYNC case, nfs_bwrite() has - * reassigned the buffer to the clean list, we have to - * reassign it back to the dirty one. Ugh. - */ - if (ISSET(bp->nb_flags, NB_ASYNC)) { - /* move to dirty list */ - lck_mtx_lock(nfs_buf_mutex); - if (bp->nb_vnbufs.le_next != NFSNOLIST) - LIST_REMOVE(bp, nb_vnbufs); - LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs); - lck_mtx_unlock(nfs_buf_mutex); - } else { - SET(bp->nb_flags, NB_EINTR); - } - } else { - /* either there's an error or we don't need to commit */ - if (error) { - SET(bp->nb_flags, NB_ERROR); - bp->nb_error = np->n_error = error; - np->n_flag |= NWRITEERR; - /* - * There was a write error and we need to - * invalidate attrs and flush buffers in - * order to sync up with the server. - * (if this write was extending the file, - * we may no longer know the correct size) - * - * But we can't call vinvalbuf while holding - * this buffer busy. Set a flag to do it after - * releasing the buffer. - * - * Note we can only invalidate in this function - * if this is an async write and so the iodone - * below will release the buffer. Also, we - * shouldn't call vinvalbuf from nfsiod because - * that may deadlock waiting for the completion - * of writes that are queued up behind this one. - */ - if (ISSET(bp->nb_flags, NB_ASYNC) && - !ISSET(bp->nb_flags, NB_IOD)) { - invalidate = 1; - } else { - /* invalidate later */ - np->n_flag |= NNEEDINVALIDATE; - } - NATTRINVALIDATE(np); - } - /* clear the dirty range */ - bp->nb_dirtyoff = bp->nb_dirtyend = 0; - } - } - - if (!error && bp->nb_dirty) { - /* there are pages marked dirty that need to be written out */ - int pg, count, npages, off; - - OSAddAtomic(1, (SInt32*)&nfsstats.write_bios); + nfs_mount_sock_thread_wake(nmp); + lck_mtx_unlock(&nmp->nm_lock); +} - NFS_BUF_MAP(bp); +/* + * Read directory data into a buffer. + * + * Buffer will be filled (unless EOF is hit). + * Buffers after this one may also be completely/partially filled. + */ +int +nfs_buf_readdir(struct nfsbuf *bp, vfs_context_t ctx) +{ + nfsnode_t np = bp->nb_np; + struct nfsmount *nmp = NFSTONMP(np); + int error = 0; - /* - * we do these writes synchronously because we can't really - * support the unstable/needommit method. We could write - * them unstable, clear the dirty bits, and then commit the - * whole block later, but if we need to rewrite the data, we - * won't have any idea which pages were written because that - * info can't be stored in the nb_dirtyoff/nb_dirtyend. We - * also can't leave the dirty bits set because then we wouldn't - * be able to tell if the pages were re-dirtied between the end - * of the write and the commit. - */ - iomode = NFSV3WRITE_FILESYNC; - uiop->uio_rw = UIO_WRITE; + if (!nmp) + return (ENXIO); - SET(bp->nb_flags, NB_WRITEINPROG); - npages = bp->nb_bufsize/PAGE_SIZE; - for (pg=0; pg < npages; pg++) { - if (!NBPGDIRTY(bp,pg)) - continue; - count = 1; - while (((pg+count) < npages) && NBPGDIRTY(bp,pg+count)) - count++; - /* write count pages starting with page pg */ - off = pg * PAGE_SIZE; - len = count * PAGE_SIZE; - - /* clip writes to EOF */ - if (NBOFF(bp) + off + len > (off_t)np->n_size) - len -= (NBOFF(bp) + off + len) - np->n_size; - if (len > 0) { - io.iov_len = len; - uio_uio_resid_set(uiop, io.iov_len); - uiop->uio_offset = NBOFF(bp) + off; - io.iov_base = (uintptr_t) bp->nb_data + off; - error = nfs_writerpc(vp, uiop, cr, p, &iomode, &must_commit); - if (must_commit) - nfs_clearcommit(vnode_mount(vp)); - if (error) - break; - } - /* clear dirty bits */ - while (count--) { - bp->nb_dirty &= ~(1 << pg); - /* leave pg on last page */ - if (count) pg++; - } - } - if (!error) { - if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) { - np->n_needcommitcnt--; - CHECK_NEEDCOMMITCNT(np); - } - CLR(bp->nb_flags, NB_NEEDCOMMIT); - } - CLR(bp->nb_flags, NB_WRITEINPROG); - FSDBG_BOT(256, bp->nb_validoff, bp->nb_validend, bp->nb_bufsize, - np->n_size); - } + if (nmp->nm_vers < NFS_VER4) + error = nfs3_readdir_rpc(np, bp, ctx); + else + error = nfs4_readdir_rpc(np, bp, ctx); - if (error) { + if (error && (error != NFSERR_DIRBUFDROPPED)) { SET(bp->nb_flags, NB_ERROR); bp->nb_error = error; - } - } - - FSDBG_BOT(256, bp->nb_validoff, bp->nb_validend, bp->nb_bufsize, error); - - nfs_buf_iodone(bp); - - if (invalidate) { - /* - * There was a write error and we need to - * invalidate attrs and flush buffers in - * order to sync up with the server. - * (if this write was extending the file, - * we may no longer know the correct size) - * - * But we couldn't call vinvalbuf while holding - * the buffer busy. So we call vinvalbuf() after - * releasing the buffer. - * - * Note: we don't bother calling nfs_vinvalbuf() if - * there's already a flush in progress. - */ - if (!(np->n_flag & NFLUSHINPROG)) - nfs_vinvalbuf(vp, V_SAVE|V_IGNORE_WRITEERR, cr, p, 1); } - return (error); }