X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/2d21ac55c334faf3a56e5634905ed6987fc787d4..2a1bd2d3eef5c7a7bb14f4bb9fdbca9a96ee4752:/bsd/nfs/nfs_bio.c diff --git a/bsd/nfs/nfs_bio.c b/bsd/nfs/nfs_bio.c index 8b2ab3e1f..b9c2b5ac1 100644 --- a/bsd/nfs/nfs_bio.c +++ b/bsd/nfs/nfs_bio.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,7 +22,7 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. - * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */ @@ -64,6 +64,10 @@ * @(#)nfs_bio.c 8.9 (Berkeley) 3/30/95 * FreeBSD-Id: nfs_bio.c,v 1.44 1997/09/10 19:52:25 phk Exp $ */ + +#include +#if CONFIG_NFS_CLIENT + #include #include #include @@ -77,6 +81,7 @@ #include #include #include +#include #include #include @@ -95,12 +100,15 @@ #include #include #include +#include -kern_return_t thread_terminate(thread_t); /* XXX */ +#define NFS_BIO_DBG(...) 
NFS_DBG(NFS_FAC_BIO, 7, ## __VA_ARGS__) -#define NFSBUFHASH(np, lbn) \ +kern_return_t thread_terminate(thread_t); /* XXX */ + +#define NFSBUFHASH(np, lbn) \ (&nfsbufhashtbl[((long)(np) / sizeof(*(np)) + (int)(lbn)) & nfsbufhash]) -LIST_HEAD(nfsbufhashhead, nfsbuf) *nfsbufhashtbl; +LIST_HEAD(nfsbufhashhead, nfsbuf) * nfsbufhashtbl; struct nfsbuffreehead nfsbuffree, nfsbuffreemeta, nfsbufdelwri; u_long nfsbufhash; int nfsbufcnt, nfsbufmin, nfsbufmax, nfsbufmetacnt, nfsbufmetamax; @@ -109,37 +117,98 @@ int nfs_nbdwrite; int nfs_buf_timer_on = 0; thread_t nfsbufdelwrithd = NULL; +ZONE_DECLARE(nfsbuf_zone, "NFS bio", sizeof(struct nfsbuf), ZC_NONE); + lck_grp_t *nfs_buf_lck_grp; lck_mtx_t *nfs_buf_mutex; -#define NFSBUF_FREE_PERIOD 30 /* seconds */ -#define NFSBUF_LRU_STALE 120 -#define NFSBUF_META_STALE 240 +#define NFSBUF_FREE_PERIOD 30 /* seconds */ +#define NFSBUF_LRU_STALE 120 +#define NFSBUF_META_STALE 240 /* number of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffree list */ -#define LRU_TO_FREEUP 6 +#define LRU_TO_FREEUP 6 /* number of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffreemeta list */ -#define META_TO_FREEUP 3 +#define META_TO_FREEUP 3 /* total number of nfsbufs nfs_buf_freeup() should attempt to free */ -#define TOTAL_TO_FREEUP (LRU_TO_FREEUP+META_TO_FREEUP) +#define TOTAL_TO_FREEUP (LRU_TO_FREEUP+META_TO_FREEUP) /* fraction of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffree list when called from timer */ -#define LRU_FREEUP_FRAC_ON_TIMER 8 +#define LRU_FREEUP_FRAC_ON_TIMER 8 /* fraction of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffreemeta list when called from timer */ -#define META_FREEUP_FRAC_ON_TIMER 16 +#define META_FREEUP_FRAC_ON_TIMER 16 /* fraction of total nfsbufs that nfsbuffreecnt should exceed before bothering to call nfs_buf_freeup() */ -#define LRU_FREEUP_MIN_FRAC 4 +#define LRU_FREEUP_MIN_FRAC 4 /* fraction of total nfsbufs that nfsbuffreemetacnt should exceed before bothering to call nfs_buf_freeup() */ -#define META_FREEUP_MIN_FRAC 2 +#define META_FREEUP_MIN_FRAC 2 + +#define NFS_ROUND_BLOCK(p, blksize) ((((uint64_t)(p) + blksize - 1) & ~((uint64_t)blksize - 1)) / blksize) #define NFS_BUF_FREEUP() \ do { \ - /* only call nfs_buf_freeup() if it has work to do: */ \ - if (((nfsbuffreecnt > nfsbufcnt/LRU_FREEUP_MIN_FRAC) || \ - (nfsbuffreemetacnt > nfsbufcnt/META_FREEUP_MIN_FRAC)) && \ - ((nfsbufcnt - TOTAL_TO_FREEUP) > nfsbufmin)) \ - nfs_buf_freeup(0); \ + /* only call nfs_buf_freeup() if it has work to do: */ \ + if (((nfsbuffreecnt > nfsbufcnt/LRU_FREEUP_MIN_FRAC) || \ + (nfsbuffreemetacnt > nfsbufcnt/META_FREEUP_MIN_FRAC)) && \ + ((nfsbufcnt - TOTAL_TO_FREEUP) > nfsbufmin)) \ + nfs_buf_freeup(0); \ } while (0) +void +nfs_buf_pgs_get_page_mask(nfsbufpgs *nfsbp, off_t page) +{ + off_t page_pos = page / NBPGS_ELEMENT_PAGES; + off_t max_page = NBPGS_STRUCT_SIZE * 8; + NBPGS_ERASE(nfsbp); + + if (page >= max_page) { + nfs_buf_pgs_bit_not(nfsbp); + return; + } + + NBPGS_SET(nfsbp, page); + nfsbp->pages[page_pos]--; + for (off_t i = page_pos - 1; i >= 0; i--) { + nfsbp->pages[i] = ~0; + } +} + +void +nfs_buf_pgs_bit_not(nfsbufpgs *nfsbp) +{ + for (uint32_t i = 0; i < NBPGS_ELEMENTS; i++) { + nfsbp->pages[i] = ~nfsbp->pages[i]; + } +} + +void +nfs_buf_pgs_bit_and(nfsbufpgs *nfsbp_src1, nfsbufpgs *nfsbp_src2, nfsbufpgs *nfsbp_dst) +{ + for (uint32_t i = 0; i < NBPGS_ELEMENTS; i++) { + nfsbp_dst->pages[i] = nfsbp_src1->pages[i] & nfsbp_src2->pages[i]; + } +} + +void 
+nfs_buf_pgs_set_pages_between(nfsbufpgs *nfsbp, off_t firstpg, off_t lastpg) +{ + nfsbufpgs pagemaskfirst, pagemasklast; + + nfs_buf_pgs_get_page_mask(&pagemasklast, lastpg); + nfs_buf_pgs_get_page_mask(&pagemaskfirst, firstpg); + nfs_buf_pgs_bit_not(&pagemaskfirst); + nfs_buf_pgs_bit_and(&pagemaskfirst, &pagemasklast, nfsbp); +} + +int +nfs_buf_pgs_is_set(nfsbufpgs *nfsbp) +{ + for (uint32_t i = 0; i < NBPGS_ELEMENTS; i++) { + if (nfsbp->pages[i] != 0) { + return 1; + } + } + return 0; +} + /* * Initialize nfsbuf lists */ @@ -150,19 +219,18 @@ nfs_nbinit(void) nfs_buf_mutex = lck_mtx_alloc_init(nfs_buf_lck_grp, LCK_ATTR_NULL); nfsbufcnt = nfsbufmetacnt = - nfsbuffreecnt = nfsbuffreemetacnt = nfsbufdelwricnt = 0; + nfsbuffreecnt = nfsbuffreemetacnt = nfsbufdelwricnt = 0; nfsbufmin = 128; /* size nfsbufmax to cover at most half sane_size (w/default buf size) */ - nfsbufmax = (sane_size >> PAGE_SHIFT) / (2 * (NFS_RWSIZE >> PAGE_SHIFT)); + nfsbufmax = (int)(sane_size >> PAGE_SHIFT) / (2 * (NFS_RWSIZE >> PAGE_SHIFT)); nfsbufmetamax = nfsbufmax / 4; nfsneedbuffer = 0; nfs_nbdwrite = 0; - nfsbufhashtbl = hashinit(nfsbufmax/4, M_TEMP, &nfsbufhash); + nfsbufhashtbl = hashinit(nfsbufmax / 4, M_NFSBIO, &nfsbufhash); TAILQ_INIT(&nfsbuffree); TAILQ_INIT(&nfsbuffreemeta); TAILQ_INIT(&nfsbufdelwri); - } /* @@ -182,7 +250,7 @@ nfs_buf_timer(__unused void *param0, __unused void *param1) lck_mtx_unlock(nfs_buf_mutex); nfs_interval_timer_start(nfs_buf_timer_call, - NFSBUF_FREE_PERIOD * 1000); + NFSBUF_FREE_PERIOD * 1000); } /* @@ -204,16 +272,19 @@ nfs_buf_freeup(int timer) FSDBG(320, nfsbufcnt, nfsbuffreecnt, nfsbuffreemetacnt, 0); - count = timer ? nfsbuffreecnt/LRU_FREEUP_FRAC_ON_TIMER : LRU_TO_FREEUP; + count = timer ? nfsbuffreecnt / LRU_FREEUP_FRAC_ON_TIMER : LRU_TO_FREEUP; while ((nfsbufcnt > nfsbufmin) && (count-- > 0)) { fbp = TAILQ_FIRST(&nfsbuffree); - if (!fbp) + if (!fbp) { break; - if (fbp->nb_refs) + } + if (os_ref_get_count(&fbp->nb_refs) > 1) { break; + } if (NBUFSTAMPVALID(fbp) && - (fbp->nb_timestamp + (2*NFSBUF_LRU_STALE)) > now.tv_sec) + (fbp->nb_timestamp + (2 * NFSBUF_LRU_STALE)) > now.tv_sec) { break; + } nfs_buf_remfree(fbp); /* disassociate buffer from any nfsnode */ if (fbp->nb_np) { @@ -228,16 +299,19 @@ nfs_buf_freeup(int timer) nfsbufcnt--; } - count = timer ? nfsbuffreemetacnt/META_FREEUP_FRAC_ON_TIMER : META_TO_FREEUP; + count = timer ? 
nfsbuffreemetacnt / META_FREEUP_FRAC_ON_TIMER : META_TO_FREEUP; while ((nfsbufcnt > nfsbufmin) && (count-- > 0)) { fbp = TAILQ_FIRST(&nfsbuffreemeta); - if (!fbp) + if (!fbp) { break; - if (fbp->nb_refs) + } + if (os_ref_get_count(&fbp->nb_refs) > 1) { break; + } if (NBUFSTAMPVALID(fbp) && - (fbp->nb_timestamp + (2*NFSBUF_META_STALE)) > now.tv_sec) + (fbp->nb_timestamp + (2 * NFSBUF_META_STALE)) > now.tv_sec) { break; + } nfs_buf_remfree(fbp); /* disassociate buffer from any nfsnode */ if (fbp->nb_np) { @@ -261,16 +335,18 @@ nfs_buf_freeup(int timer) while ((fbp = TAILQ_FIRST(&nfsbuffreeup))) { TAILQ_REMOVE(&nfsbuffreeup, fbp, nb_free); /* nuke any creds */ - if (IS_VALID_CRED(fbp->nb_rcred)) + if (IS_VALID_CRED(fbp->nb_rcred)) { kauth_cred_unref(&fbp->nb_rcred); - if (IS_VALID_CRED(fbp->nb_wcred)) + } + if (IS_VALID_CRED(fbp->nb_wcred)) { kauth_cred_unref(&fbp->nb_wcred); + } /* if buf was NB_META, dump buffer */ - if (ISSET(fbp->nb_flags, NB_META) && fbp->nb_data) - kfree(fbp->nb_data, fbp->nb_bufsize); - FREE(fbp, M_TEMP); + if (ISSET(fbp->nb_flags, NB_META) && fbp->nb_data) { + kheap_free(KHEAP_DATA_BUFFERS, fbp->nb_data, fbp->nb_bufsize); + } + NFS_ZFREE(nfsbuf_zone, fbp); } - } /* @@ -280,8 +356,9 @@ nfs_buf_freeup(int timer) void nfs_buf_remfree(struct nfsbuf *bp) { - if (bp->nb_free.tqe_next == NFSNOLIST) + if (bp->nb_free.tqe_next == NFSNOLIST) { panic("nfsbuf not on free list"); + } if (ISSET(bp->nb_flags, NB_DELWRI)) { nfsbufdelwricnt--; TAILQ_REMOVE(&nfsbufdelwri, bp, nb_free); @@ -304,12 +381,13 @@ nfs_buf_is_incore(nfsnode_t np, daddr64_t blkno) { boolean_t rv; lck_mtx_lock(nfs_buf_mutex); - if (nfs_buf_incore(np, blkno)) + if (nfs_buf_incore(np, blkno)) { rv = TRUE; - else + } else { rv = FALSE; + } lck_mtx_unlock(nfs_buf_mutex); - return (rv); + return rv; } /* @@ -320,14 +398,15 @@ nfs_buf_incore(nfsnode_t np, daddr64_t blkno) { /* Search hash chain */ struct nfsbuf * bp = NFSBUFHASH(np, blkno)->lh_first; - for (; bp != NULL; bp = bp->nb_hash.le_next) + for (; bp != NULL; bp = bp->nb_hash.le_next) { if ((bp->nb_lblkno == blkno) && (bp->nb_np == np)) { if (!ISSET(bp->nb_flags, NB_INVAL)) { FSDBG(547, bp, blkno, bp->nb_flags, bp->nb_np); - return (bp); + return bp; } } - return (NULL); + } + return NULL; } /* @@ -345,13 +424,15 @@ nfs_buf_page_inval(vnode_t vp, off_t offset) struct nfsbuf *bp; int error = 0; - if (!nmp) - return (ENXIO); + if (nfs_mount_gone(nmp)) { + return ENXIO; + } lck_mtx_lock(nfs_buf_mutex); bp = nfs_buf_incore(VTONFS(vp), (daddr64_t)(offset / nmp->nm_biosize)); - if (!bp) + if (!bp) { goto out; + } FSDBG(325, bp, bp->nb_flags, bp->nb_dirtyoff, bp->nb_dirtyend); if (ISSET(bp->nb_lflags, NBL_BUSY)) { error = EBUSY; @@ -363,16 +444,25 @@ nfs_buf_page_inval(vnode_t vp, off_t offset) * If it does, we can't let the pager drop the page. */ if (bp->nb_dirtyend > 0) { - int start = offset - NBOFF(bp); - if (bp->nb_dirtyend <= start || - bp->nb_dirtyoff >= (start + PAGE_SIZE)) - error = 0; - else + off_t start = offset - NBOFF(bp); + if ((bp->nb_dirtyend > start) && + (bp->nb_dirtyoff < (start + PAGE_SIZE))) { + /* + * Before returning the bad news, move the + * buffer to the start of the delwri list and + * give the list a push to try to flush the + * buffer out. 
+ */ error = EBUSY; + nfs_buf_remfree(bp); + TAILQ_INSERT_HEAD(&nfsbufdelwri, bp, nb_free); + nfsbufdelwricnt++; + nfs_buf_delwri_push(1); + } } out: lck_mtx_unlock(nfs_buf_mutex); - return (error); + return error; } /* @@ -386,8 +476,9 @@ nfs_buf_upl_setup(struct nfsbuf *bp) upl_t upl; int upl_flags; - if (ISSET(bp->nb_flags, NB_PAGELIST)) - return (0); + if (ISSET(bp->nb_flags, NB_PAGELIST)) { + return 0; + } upl_flags = UPL_PRECIOUS; if (!ISSET(bp->nb_flags, NB_READ)) { @@ -397,24 +488,24 @@ nfs_buf_upl_setup(struct nfsbuf *bp) */ upl_flags |= UPL_WILL_MODIFY; } - kret = ubc_create_upl(NFSTOV(bp->nb_np), NBOFF(bp), bp->nb_bufsize, - &upl, NULL, upl_flags); + kret = ubc_create_upl_kernel(NFSTOV(bp->nb_np), NBOFF(bp), bp->nb_bufsize, + &upl, NULL, upl_flags, VM_KERN_MEMORY_FILE); if (kret == KERN_INVALID_ARGUMENT) { /* vm object probably doesn't exist any more */ bp->nb_pagelist = NULL; - return (EINVAL); + return EINVAL; } if (kret != KERN_SUCCESS) { printf("nfs_buf_upl_setup(): failed to get pagelist %d\n", kret); bp->nb_pagelist = NULL; - return (EIO); + return EIO; } FSDBG(538, bp, NBOFF(bp), bp->nb_bufsize, bp->nb_np); bp->nb_pagelist = upl; SET(bp->nb_flags, NB_PAGELIST); - return (0); + return 0; } /* @@ -428,38 +519,44 @@ nfs_buf_upl_check(struct nfsbuf *bp) off_t filesize, fileoffset; int i, npages; - if (!ISSET(bp->nb_flags, NB_PAGELIST)) + if (!ISSET(bp->nb_flags, NB_PAGELIST)) { return; + } npages = round_page_32(bp->nb_bufsize) / PAGE_SIZE; filesize = ubc_getsize(NFSTOV(bp->nb_np)); fileoffset = NBOFF(bp); - if (fileoffset < filesize) + if (fileoffset < filesize) { SET(bp->nb_flags, NB_CACHE); - else + } else { CLR(bp->nb_flags, NB_CACHE); + } pl = ubc_upl_pageinfo(bp->nb_pagelist); - bp->nb_valid = bp->nb_dirty = 0; + NBPGS_ERASE(&bp->nb_valid); + NBPGS_ERASE(&bp->nb_dirty); - for (i=0; i < npages; i++, fileoffset += PAGE_SIZE_64) { + for (i = 0; i < npages; i++, fileoffset += PAGE_SIZE_64) { /* anything beyond the end of the file is not valid or dirty */ - if (fileoffset >= filesize) + if (fileoffset >= filesize) { break; + } if (!upl_valid_page(pl, i)) { CLR(bp->nb_flags, NB_CACHE); continue; } - NBPGVALID_SET(bp,i); - if (upl_dirty_page(pl, i)) + NBPGVALID_SET(bp, i); + if (upl_dirty_page(pl, i)) { NBPGDIRTY_SET(bp, i); + } } fileoffset = NBOFF(bp); if (ISSET(bp->nb_flags, NB_CACHE)) { bp->nb_validoff = 0; bp->nb_validend = bp->nb_bufsize; - if (fileoffset + bp->nb_validend > filesize) + if (fileoffset + bp->nb_validend > filesize) { bp->nb_validend = filesize - fileoffset; + } } else { bp->nb_validoff = bp->nb_validend = -1; } @@ -476,18 +573,22 @@ nfs_buf_map(struct nfsbuf *bp) { kern_return_t kret; - if (bp->nb_data) - return (0); - if (!ISSET(bp->nb_flags, NB_PAGELIST)) - return (EINVAL); + if (bp->nb_data) { + return 0; + } + if (!ISSET(bp->nb_flags, NB_PAGELIST)) { + return EINVAL; + } - kret = ubc_upl_map(bp->nb_pagelist, (vm_address_t *)&(bp->nb_data)); - if (kret != KERN_SUCCESS) + kret = ubc_upl_map(bp->nb_pagelist, (vm_offset_t *)&(bp->nb_data)); + if (kret != KERN_SUCCESS) { panic("nfs_buf_map: ubc_upl_map() failed with (%d)", kret); - if (bp->nb_data == 0) + } + if (bp->nb_data == 0) { panic("ubc_upl_map mapped 0"); + } FSDBG(540, bp, bp->nb_flags, NBOFF(bp), bp->nb_data); - return (0); + return 0; } /* @@ -502,28 +603,31 @@ nfs_buf_map(struct nfsbuf *bp) void nfs_buf_normalize_valid_range(nfsnode_t np, struct nfsbuf *bp) { - int pg, npg; + off_t pg, npg; /* pull validoff back to start of contiguous valid page range */ - pg = bp->nb_validoff/PAGE_SIZE; - 
while (pg >= 0 && NBPGVALID(bp,pg)) + pg = bp->nb_validoff / PAGE_SIZE; + while (pg >= 0 && NBPGVALID(bp, pg)) { pg--; - bp->nb_validoff = (pg+1) * PAGE_SIZE; + } + bp->nb_validoff = (pg + 1) * PAGE_SIZE; /* push validend forward to end of contiguous valid page range */ - npg = bp->nb_bufsize/PAGE_SIZE; - pg = bp->nb_validend/PAGE_SIZE; - while (pg < npg && NBPGVALID(bp,pg)) + npg = bp->nb_bufsize / PAGE_SIZE; + pg = bp->nb_validend / PAGE_SIZE; + while (pg < npg && NBPGVALID(bp, pg)) { pg++; + } bp->nb_validend = pg * PAGE_SIZE; /* clip to EOF */ - if (NBOFF(bp) + bp->nb_validend > (off_t)np->n_size) + if (NBOFF(bp) + bp->nb_validend > (off_t)np->n_size) { bp->nb_validend = np->n_size % bp->nb_bufsize; + } } /* * process some entries on the delayed write queue * (must be called with nfs_buf_mutex held) */ -static void +void nfs_buf_delwri_service(void) { struct nfsbuf *bp; @@ -534,17 +638,21 @@ nfs_buf_delwri_service(void) np = bp->nb_np; nfs_buf_remfree(bp); nfs_buf_refget(bp); - while ((error = nfs_buf_acquire(bp, 0, 0, 0)) == EAGAIN); + while ((error = nfs_buf_acquire(bp, 0, 0, 0)) == EAGAIN) { + ; + } nfs_buf_refrele(bp); - if (error) + if (error) { break; + } if (!bp->nb_np) { /* buffer is no longer valid */ nfs_buf_drop(bp); continue; } - if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) + if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) { nfs_buf_check_write_verifier(np, bp); + } if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) { /* put buffer at end of delwri list */ TAILQ_INSERT_TAIL(&nfsbufdelwri, bp, nb_free); @@ -565,10 +673,10 @@ nfs_buf_delwri_service(void) /* * thread to service the delayed write queue when asked */ -static void +void nfs_buf_delwri_thread(__unused void *arg, __unused wait_result_t wr) { - struct timespec ts = { 30, 0 }; + struct timespec ts = { .tv_sec = 30, .tv_nsec = 0 }; int error = 0; lck_mtx_lock(nfs_buf_mutex); @@ -585,23 +693,28 @@ nfs_buf_delwri_thread(__unused void *arg, __unused wait_result_t wr) * try to push out some delayed/uncommitted writes * ("locked" indicates whether nfs_buf_mutex is already held) */ -static void +void nfs_buf_delwri_push(int locked) { - if (TAILQ_EMPTY(&nfsbufdelwri)) + if (TAILQ_EMPTY(&nfsbufdelwri)) { return; - if (!locked) + } + if (!locked) { lck_mtx_lock(nfs_buf_mutex); + } /* wake up the delayed write service thread */ - if (nfsbufdelwrithd) + if (nfsbufdelwrithd) { wakeup(&nfsbufdelwrithd); - else if (kernel_thread_start(nfs_buf_delwri_thread, NULL, &nfsbufdelwrithd) == KERN_SUCCESS) + } else if (kernel_thread_start(nfs_buf_delwri_thread, NULL, &nfsbufdelwrithd) == KERN_SUCCESS) { thread_deallocate(nfsbufdelwrithd); + } /* otherwise, try to do some of the work ourselves */ - if (!nfsbufdelwrithd) + if (!nfsbufdelwrithd) { nfs_buf_delwri_service(); - if (!locked) + } + if (!locked) { lck_mtx_unlock(nfs_buf_mutex); + } } /* @@ -627,7 +740,7 @@ int nfs_buf_get( nfsnode_t np, daddr64_t blkno, - int size, + uint32_t size, thread_t thd, int flags, struct nfsbuf **bpp) @@ -635,7 +748,7 @@ nfs_buf_get( vnode_t vp = NFSTOV(np); struct nfsmount *nmp = VTONMP(vp); struct nfsbuf *bp; - int bufsize; + uint32_t bufsize; int slpflag = PCATCH; int operation = (flags & NBLK_OPMASK); int error = 0; @@ -645,17 +758,18 @@ nfs_buf_get( *bpp = NULL; bufsize = size; - if (bufsize > NFS_MAXBSIZE) + if (bufsize > NFS_MAXBSIZE) { panic("nfs_buf_get: buffer larger than NFS_MAXBSIZE requested"); + } - if (!nmp) { + if (nfs_mount_gone(nmp)) { FSDBG_BOT(541, np, blkno, 0, ENXIO); - return (ENXIO); + return ENXIO; } if (!UBCINFOEXISTS(vp)) { operation = NBLK_META; - } 
else if (bufsize < nmp->nm_biosize) { + } else if (bufsize < (uint32_t)nmp->nm_biosize) { /* reg files should always have biosize blocks */ bufsize = nmp->nm_biosize; } @@ -675,6 +789,22 @@ nfs_buf_get( loop: lck_mtx_lock(nfs_buf_mutex); + /* wait for any buffer invalidation/flushing to complete */ + while (np->n_bflag & NBINVALINPROG) { + np->n_bflag |= NBINVALWANT; + ts.tv_sec = 2; + ts.tv_nsec = 0; + msleep(&np->n_bflag, nfs_buf_mutex, slpflag, "nfs_buf_get_invalwait", &ts); + if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0))) { + lck_mtx_unlock(nfs_buf_mutex); + FSDBG_BOT(541, np, blkno, 0, error); + return error; + } + if (np->n_bflag & NBINVALINPROG) { + slpflag = 0; + } + } + /* check for existence of nfsbuf in cache */ if ((bp = nfs_buf_incore(np, blkno))) { /* if busy, set wanted and wait */ @@ -682,40 +812,40 @@ loop: if (flags & NBLK_NOWAIT) { lck_mtx_unlock(nfs_buf_mutex); FSDBG_BOT(541, np, blkno, bp, 0xbcbcbcbc); - return (0); + return 0; } FSDBG_TOP(543, np, blkno, bp, bp->nb_flags); SET(bp->nb_lflags, NBL_WANTED); ts.tv_sec = 2; ts.tv_nsec = 0; - error = msleep(bp, nfs_buf_mutex, slpflag|(PRIBIO+1)|PDROP, - "nfsbufget", (slpflag == PCATCH) ? NULL : &ts); - if (error == EWOULDBLOCK) - error = 0; + msleep(bp, nfs_buf_mutex, slpflag | (PRIBIO + 1) | PDROP, + "nfsbufget", (slpflag == PCATCH) ? NULL : &ts); slpflag = 0; FSDBG_BOT(543, np, blkno, bp, bp->nb_flags); - if (error || ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0)))) { + if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0))) { FSDBG_BOT(541, np, blkno, 0, error); - return (error); + return error; } goto loop; } - if (bp->nb_bufsize != bufsize) + if (bp->nb_bufsize != bufsize) { panic("nfsbuf size mismatch"); + } SET(bp->nb_lflags, NBL_BUSY); SET(bp->nb_flags, NB_CACHE); nfs_buf_remfree(bp); /* additional paranoia: */ - if (ISSET(bp->nb_flags, NB_PAGELIST)) + if (ISSET(bp->nb_flags, NB_PAGELIST)) { panic("pagelist buffer was not busy"); + } goto buffer_setup; } if (flags & NBLK_ONLYVALID) { lck_mtx_unlock(nfs_buf_mutex); FSDBG_BOT(541, np, blkno, 0, 0x0000cace); - return (0); + return 0; } /* @@ -743,28 +873,31 @@ loop: /* if the next LRU or META buffer is invalid or stale, use it */ lrubp = TAILQ_FIRST(&nfsbuffree); if (lrubp && (!NBUFSTAMPVALID(lrubp) || - ((lrubp->nb_timestamp + NFSBUF_LRU_STALE) < now.tv_sec))) + ((lrubp->nb_timestamp + NFSBUF_LRU_STALE) < now.tv_sec))) { bp = lrubp; + } metabp = TAILQ_FIRST(&nfsbuffreemeta); if (!bp && metabp && (!NBUFSTAMPVALID(metabp) || - ((metabp->nb_timestamp + NFSBUF_META_STALE) < now.tv_sec))) + ((metabp->nb_timestamp + NFSBUF_META_STALE) < now.tv_sec))) { bp = metabp; + } if (!bp && (nfsbufcnt >= nfsbufmax)) { /* we've already allocated all bufs, so */ /* choose the buffer that'll go stale first */ - if (!metabp) + if (!metabp) { bp = lrubp; - else if (!lrubp) + } else if (!lrubp) { bp = metabp; - else { - int32_t lru_stale_time, meta_stale_time; + } else { + time_t lru_stale_time, meta_stale_time; lru_stale_time = lrubp->nb_timestamp + NFSBUF_LRU_STALE; meta_stale_time = metabp->nb_timestamp + NFSBUF_META_STALE; - if (lru_stale_time <= meta_stale_time) + if (lru_stale_time <= meta_stale_time) { bp = lrubp; - else + } else { bp = metabp; + } } } } @@ -773,8 +906,9 @@ loop: /* we have a buffer to reuse */ FSDBG(544, np, blkno, bp, bp->nb_flags); nfs_buf_remfree(bp); - if (ISSET(bp->nb_flags, NB_DELWRI)) + if (ISSET(bp->nb_flags, NB_DELWRI)) { panic("nfs_buf_get: delwri"); + } SET(bp->nb_lflags, NBL_BUSY); /* disassociate buffer from previous nfsnode */ if (bp->nb_np) { @@ 
-786,17 +920,20 @@ loop: } LIST_REMOVE(bp, nb_hash); /* nuke any creds we're holding */ - if (IS_VALID_CRED(bp->nb_rcred)) + if (IS_VALID_CRED(bp->nb_rcred)) { kauth_cred_unref(&bp->nb_rcred); - if (IS_VALID_CRED(bp->nb_wcred)) + } + if (IS_VALID_CRED(bp->nb_wcred)) { kauth_cred_unref(&bp->nb_wcred); + } /* if buf will no longer be NB_META, dump old buffer */ if (operation == NBLK_META) { - if (!ISSET(bp->nb_flags, NB_META)) + if (!ISSET(bp->nb_flags, NB_META)) { nfsbufmetacnt++; + } } else if (ISSET(bp->nb_flags, NB_META)) { if (bp->nb_data) { - kfree(bp->nb_data, bp->nb_bufsize); + kheap_free(KHEAP_DATA_BUFFERS, bp->nb_data, bp->nb_bufsize); bp->nb_data = NULL; } nfsbufmetacnt--; @@ -805,20 +942,15 @@ loop: bp->nb_error = 0; bp->nb_validoff = bp->nb_validend = -1; bp->nb_dirtyoff = bp->nb_dirtyend = 0; - bp->nb_valid = 0; - bp->nb_dirty = 0; + NBPGS_ERASE(&bp->nb_valid); + NBPGS_ERASE(&bp->nb_dirty); bp->nb_verf = 0; } else { /* no buffer to reuse */ if ((nfsbufcnt < nfsbufmax) && ((operation != NBLK_META) || (nfsbufmetacnt < nfsbufmetamax))) { /* just alloc a new one */ - MALLOC(bp, struct nfsbuf *, sizeof(struct nfsbuf), M_TEMP, M_WAITOK); - if (!bp) { - lck_mtx_unlock(nfs_buf_mutex); - FSDBG_BOT(541, np, blkno, 0, error); - return (ENOMEM); - } + bp = zalloc(nfsbuf_zone); nfsbufcnt++; /* @@ -828,14 +960,17 @@ loop: if (nfsbufcnt > nfsbufmin && !nfs_buf_timer_on) { nfs_buf_timer_on = 1; nfs_interval_timer_start(nfs_buf_timer_call, - NFSBUF_FREE_PERIOD * 1000); + NFSBUF_FREE_PERIOD * 1000); } - if (operation == NBLK_META) + if (operation == NBLK_META) { nfsbufmetacnt++; + } NFSBUFCNTCHK(); /* init nfsbuf */ bzero(bp, sizeof(*bp)); + os_ref_init(&bp->nb_refs, NULL); + bp->nb_free.tqe_next = NFSNOLIST; bp->nb_validoff = bp->nb_validend = -1; FSDBG(545, np, blkno, bp, 0); @@ -847,18 +982,18 @@ loop: nfs_buf_delwri_push(1); nfsneedbuffer = 1; - error = msleep(&nfsneedbuffer, nfs_buf_mutex, PCATCH|PDROP, "nfsbufget", NULL); + msleep(&nfsneedbuffer, nfs_buf_mutex, PCATCH | PDROP, "nfsbufget", NULL); FSDBG_BOT(546, np, blkno, nfsbufcnt, nfsbufmax); - if (error || ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0)))) { + if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0))) { FSDBG_BOT(541, np, blkno, 0, error); - return (error); + return error; } goto loop; } } - /* setup nfsbuf */ - bp->nb_lflags = NBL_BUSY; + /* set up nfsbuf */ + SET(bp->nb_lflags, NBL_BUSY); bp->nb_flags = 0; bp->nb_lblkno = blkno; /* insert buf in hash */ @@ -876,16 +1011,18 @@ buffer_setup: case NBLK_META: SET(bp->nb_flags, NB_META); if ((bp->nb_bufsize != bufsize) && bp->nb_data) { - kfree(bp->nb_data, bp->nb_bufsize); + kheap_free(KHEAP_DATA_BUFFERS, bp->nb_data, bp->nb_bufsize); bp->nb_data = NULL; bp->nb_validoff = bp->nb_validend = -1; bp->nb_dirtyoff = bp->nb_dirtyend = 0; - bp->nb_valid = 0; - bp->nb_dirty = 0; + NBPGS_ERASE(&bp->nb_valid); + NBPGS_ERASE(&bp->nb_dirty); CLR(bp->nb_flags, NB_CACHE); } - if (!bp->nb_data) - bp->nb_data = kalloc(bufsize); + if (!bp->nb_data) { + bp->nb_data = kheap_alloc(KHEAP_DATA_BUFFERS, + bufsize, Z_WAITOK); + } if (!bp->nb_data) { /* Ack! couldn't allocate the data buffer! 
*/ /* clean up buffer and return error */ @@ -895,13 +1032,14 @@ buffer_setup: bp->nb_np = NULL; /* invalidate usage timestamp to allow immediate freeing */ NBUFSTAMPINVALIDATE(bp); - if (bp->nb_free.tqe_next != NFSNOLIST) + if (bp->nb_free.tqe_next != NFSNOLIST) { panic("nfsbuf on freelist"); + } TAILQ_INSERT_HEAD(&nfsbuffree, bp, nb_free); nfsbuffreecnt++; lck_mtx_unlock(nfs_buf_mutex); FSDBG_BOT(541, np, blkno, 0xb00, ENOMEM); - return (ENOMEM); + return ENOMEM; } bp->nb_bufsize = bufsize; break; @@ -917,8 +1055,9 @@ buffer_setup: } else { CLR(bp->nb_flags, NB_READ); } - if (bufsize < PAGE_SIZE) + if (bufsize < PAGE_SIZE) { bufsize = PAGE_SIZE; + } bp->nb_bufsize = bufsize; bp->nb_validoff = bp->nb_validend = -1; @@ -934,13 +1073,14 @@ buffer_setup: bp->nb_np = NULL; /* invalidate usage timestamp to allow immediate freeing */ NBUFSTAMPINVALIDATE(bp); - if (bp->nb_free.tqe_next != NFSNOLIST) + if (bp->nb_free.tqe_next != NFSNOLIST) { panic("nfsbuf on freelist"); + } TAILQ_INSERT_HEAD(&nfsbuffree, bp, nb_free); nfsbuffreecnt++; lck_mtx_unlock(nfs_buf_mutex); FSDBG_BOT(541, np, blkno, 0x2bc, EIO); - return (EIO); + return EIO; } nfs_buf_upl_check(bp); } @@ -954,7 +1094,7 @@ buffer_setup: FSDBG_BOT(541, np, blkno, bp, bp->nb_flags); - return (0); + return 0; } void @@ -971,66 +1111,79 @@ nfs_buf_release(struct nfsbuf *bp, int freeup) vp = np ? NFSTOV(np) : NULL; if (vp && UBCINFOEXISTS(vp) && bp->nb_bufsize) { - int upl_flags; + int upl_flags, rv; upl_t upl; - int i, rv; + uint32_t i; if (!ISSET(bp->nb_flags, NB_PAGELIST) && !ISSET(bp->nb_flags, NB_INVAL)) { rv = nfs_buf_upl_setup(bp); - if (rv) + if (rv) { printf("nfs_buf_release: upl create failed %d\n", rv); - else + } else { nfs_buf_upl_check(bp); + } } upl = bp->nb_pagelist; - if (!upl) + if (!upl) { goto pagelist_cleanup_done; + } if (bp->nb_data) { - if (ubc_upl_unmap(upl) != KERN_SUCCESS) + if (ubc_upl_unmap(upl) != KERN_SUCCESS) { panic("ubc_upl_unmap failed"); + } bp->nb_data = NULL; } /* * Abort the pages on error or: if this is an invalid or * non-needcommit nocache buffer AND no pages are dirty. 
*/ - if (ISSET(bp->nb_flags, NB_ERROR) || (!bp->nb_dirty && (ISSET(bp->nb_flags, NB_INVAL) || + if (ISSET(bp->nb_flags, NB_ERROR) || (!nfs_buf_pgs_is_set(&bp->nb_dirty) && (ISSET(bp->nb_flags, NB_INVAL) || (ISSET(bp->nb_flags, NB_NOCACHE) && !ISSET(bp->nb_flags, (NB_NEEDCOMMIT | NB_DELWRI)))))) { - if (ISSET(bp->nb_flags, (NB_READ | NB_INVAL | NB_NOCACHE))) + if (ISSET(bp->nb_flags, (NB_READ | NB_INVAL | NB_NOCACHE))) { upl_flags = UPL_ABORT_DUMP_PAGES; - else + } else { upl_flags = 0; + } ubc_upl_abort(upl, upl_flags); goto pagelist_cleanup_done; } - for (i=0; i <= (bp->nb_bufsize - 1)/PAGE_SIZE; i++) { - if (!NBPGVALID(bp,i)) + for (i = 0; i <= (bp->nb_bufsize - 1) / PAGE_SIZE; i++) { + if (!NBPGVALID(bp, i)) { ubc_upl_abort_range(upl, - i*PAGE_SIZE, PAGE_SIZE, - UPL_ABORT_DUMP_PAGES | - UPL_ABORT_FREE_ON_EMPTY); - else { - if (NBPGDIRTY(bp,i)) + i * PAGE_SIZE, PAGE_SIZE, + UPL_ABORT_DUMP_PAGES | + UPL_ABORT_FREE_ON_EMPTY); + } else { + if (NBPGDIRTY(bp, i)) { upl_flags = UPL_COMMIT_SET_DIRTY; - else + } else { upl_flags = UPL_COMMIT_CLEAR_DIRTY; + } + + if (!ISSET(bp->nb_flags, (NB_NEEDCOMMIT | NB_DELWRI))) { + upl_flags |= UPL_COMMIT_CLEAR_PRECIOUS; + } + ubc_upl_commit_range(upl, - i*PAGE_SIZE, PAGE_SIZE, - upl_flags | - UPL_COMMIT_INACTIVATE | - UPL_COMMIT_FREE_ON_EMPTY); + i * PAGE_SIZE, PAGE_SIZE, + upl_flags | + UPL_COMMIT_INACTIVATE | + UPL_COMMIT_FREE_ON_EMPTY); } } pagelist_cleanup_done: - /* was this the last buffer in the file? */ + /* invalidate any pages past EOF */ if (NBOFF(bp) + bp->nb_bufsize > (off_t)(np->n_size)) { - /* if so, invalidate all pages of last buffer past EOF */ off_t start, end; start = trunc_page_64(np->n_size) + PAGE_SIZE_64; end = trunc_page_64(NBOFF(bp) + bp->nb_bufsize); + if (start < NBOFF(bp)) { + start = NBOFF(bp); + } if (end > start) { - if (!(rv = ubc_sync_range(vp, start, end, UBC_INVALIDATE))) - printf("nfs_buf_release(): ubc_sync_range failed!\n"); + if ((rv = ubc_msync(vp, start, end, NULL, UBC_INVALIDATE))) { + printf("nfs_buf_release(): ubc_msync failed!, error %d\n", rv); + } } } CLR(bp->nb_flags, NB_PAGELIST); @@ -1054,8 +1207,9 @@ pagelist_cleanup_done: /* If it's non-needcommit nocache, or an error, mark it invalid. 
*/ if (ISSET(bp->nb_flags, NB_ERROR) || - (ISSET(bp->nb_flags, NB_NOCACHE) && !ISSET(bp->nb_flags, (NB_NEEDCOMMIT | NB_DELWRI)))) + (ISSET(bp->nb_flags, NB_NOCACHE) && !ISSET(bp->nb_flags, (NB_NEEDCOMMIT | NB_DELWRI)))) { SET(bp->nb_flags, NB_INVAL); + } if ((bp->nb_bufsize <= 0) || ISSET(bp->nb_flags, NB_INVAL)) { /* If it's invalid or empty, dissociate it from its nfsnode */ @@ -1075,8 +1229,9 @@ pagelist_cleanup_done: /* invalidate usage timestamp to allow immediate freeing */ NBUFSTAMPINVALIDATE(bp); /* put buffer at head of free list */ - if (bp->nb_free.tqe_next != NFSNOLIST) + if (bp->nb_free.tqe_next != NFSNOLIST) { panic("nfsbuf on freelist"); + } SET(bp->nb_flags, NB_INVAL); if (ISSET(bp->nb_flags, NB_META)) { TAILQ_INSERT_HEAD(&nfsbuffreemeta, bp, nb_free); @@ -1087,8 +1242,9 @@ pagelist_cleanup_done: } } else if (ISSET(bp->nb_flags, NB_DELWRI)) { /* put buffer at end of delwri list */ - if (bp->nb_free.tqe_next != NFSNOLIST) + if (bp->nb_free.tqe_next != NFSNOLIST) { panic("nfsbuf on freelist"); + } TAILQ_INSERT_TAIL(&nfsbufdelwri, bp, nb_free); nfsbufdelwricnt++; freeup = 0; @@ -1097,8 +1253,9 @@ pagelist_cleanup_done: microuptime(&now); bp->nb_timestamp = now.tv_sec; /* put buffer at end of free list */ - if (bp->nb_free.tqe_next != NFSNOLIST) + if (bp->nb_free.tqe_next != NFSNOLIST) { panic("nfsbuf on freelist"); + } if (ISSET(bp->nb_flags, NB_META)) { TAILQ_INSERT_TAIL(&nfsbuffreemeta, bp, nb_free); nfsbuffreemetacnt++; @@ -1118,14 +1275,18 @@ pagelist_cleanup_done: lck_mtx_unlock(nfs_buf_mutex); - if (wakeup_needbuffer) + if (wakeup_needbuffer) { wakeup(&nfsneedbuffer); - if (wakeup_buffer) + } + if (wakeup_buffer) { wakeup(bp); - if (wakeup_nbdwrite) + } + if (wakeup_nbdwrite) { wakeup(&nfs_nbdwrite); - if (freeup) + } + if (freeup) { NFS_BUF_FREEUP(); + } } /* @@ -1139,8 +1300,9 @@ nfs_buf_iowait(struct nfsbuf *bp) lck_mtx_lock(nfs_buf_mutex); - while (!ISSET(bp->nb_flags, NB_DONE)) + while (!ISSET(bp->nb_flags, NB_DONE)) { msleep(bp, nfs_buf_mutex, PRIBIO + 1, "nfs_buf_iowait", NULL); + } lck_mtx_unlock(nfs_buf_mutex); @@ -1149,10 +1311,11 @@ nfs_buf_iowait(struct nfsbuf *bp) /* check for interruption of I/O, then errors. */ if (ISSET(bp->nb_flags, NB_EINTR)) { CLR(bp->nb_flags, NB_EINTR); - return (EINTR); - } else if (ISSET(bp->nb_flags, NB_ERROR)) - return (bp->nb_error ? bp->nb_error : EIO); - return (0); + return EINTR; + } else if (ISSET(bp->nb_flags, NB_ERROR)) { + return bp->nb_error ? 
bp->nb_error : EIO; + } + return 0; } /* @@ -1161,11 +1324,11 @@ nfs_buf_iowait(struct nfsbuf *bp) void nfs_buf_iodone(struct nfsbuf *bp) { - FSDBG_TOP(550, bp, NBOFF(bp), bp->nb_flags, bp->nb_error); - if (ISSET(bp->nb_flags, NB_DONE)) + if (ISSET(bp->nb_flags, NB_DONE)) { panic("nfs_buf_iodone already"); + } if (!ISSET(bp->nb_flags, NB_READ)) { CLR(bp->nb_flags, NB_WRITEINPROG); @@ -1174,15 +1337,18 @@ nfs_buf_iodone(struct nfsbuf *bp) * any throttled write operations */ vnode_writedone(NFSTOV(bp->nb_np)); + nfs_node_lock_force(bp->nb_np); + bp->nb_np->n_numoutput--; + nfs_node_unlock(bp->nb_np); } - if (ISSET(bp->nb_flags, NB_ASYNC)) { /* if async, release it */ - SET(bp->nb_flags, NB_DONE); /* note that it's done */ + if (ISSET(bp->nb_flags, NB_ASYNC)) { /* if async, release it */ + SET(bp->nb_flags, NB_DONE); /* note that it's done */ nfs_buf_release(bp, 1); - } else { /* or just wakeup the buffer */ - lck_mtx_lock(nfs_buf_mutex); - SET(bp->nb_flags, NB_DONE); /* note that it's done */ + } else { /* or just wakeup the buffer */ + lck_mtx_lock(nfs_buf_mutex); + SET(bp->nb_flags, NB_DONE); /* note that it's done */ CLR(bp->nb_lflags, NBL_WANTED); - lck_mtx_unlock(nfs_buf_mutex); + lck_mtx_unlock(nfs_buf_mutex); wakeup(bp); } @@ -1208,8 +1374,9 @@ nfs_buf_write_delayed(struct nfsbuf *bp) lck_mtx_lock(nfs_buf_mutex); nfs_nbdwrite++; NFSBUFCNTCHK(); - if (bp->nb_vnbufs.le_next != NFSNOLIST) + if (bp->nb_vnbufs.le_next != NFSNOLIST) { LIST_REMOVE(bp, nb_vnbufs); + } LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs); lck_mtx_unlock(nfs_buf_mutex); } @@ -1221,16 +1388,17 @@ nfs_buf_write_delayed(struct nfsbuf *bp) vnode_waitforwrites(NFSTOV(np), VNODE_ASYNC_THROTTLE, 0, 0, "nfs_buf_write_delayed"); /* the file is in a modified state, so make sure the flag's set */ - nfs_lock(np, NFS_NODE_LOCK_FORCE); + nfs_node_lock_force(np); np->n_flag |= NMODIFIED; - nfs_unlock(np); + nfs_node_unlock(np); /* * If we have too many delayed write buffers, * just fall back to doing the async write. 
*/ - if (nfs_nbdwrite < 0) + if (nfs_nbdwrite < 0) { panic("nfs_buf_write_delayed: Negative nfs_nbdwrite"); + } if (nfs_nbdwrite > NFS_A_LOT_OF_DELAYED_WRITES) { /* issue async write */ SET(bp->nb_flags, NB_ASYNC); @@ -1256,22 +1424,25 @@ nfs_buf_check_write_verifier(nfsnode_t np, struct nfsbuf *bp) { struct nfsmount *nmp; - if (!ISSET(bp->nb_flags, NB_NEEDCOMMIT)) + if (!ISSET(bp->nb_flags, NB_NEEDCOMMIT)) { return; + } nmp = NFSTONMP(np); - if (!nmp) + if (nfs_mount_gone(nmp)) { return; - if (!ISSET(bp->nb_flags, NB_STALEWVERF) && (bp->nb_verf == nmp->nm_verf)) + } + if (!ISSET(bp->nb_flags, NB_STALEWVERF) && (bp->nb_verf == nmp->nm_verf)) { return; + } /* write verifier changed, clear commit/wverf flags */ CLR(bp->nb_flags, (NB_NEEDCOMMIT | NB_STALEWVERF)); bp->nb_verf = 0; - nfs_lock(np, NFS_NODE_LOCK_FORCE); + nfs_node_lock_force(np); np->n_needcommitcnt--; CHECK_NEEDCOMMITCNT(np); - nfs_unlock(np); + nfs_node_unlock(np); } /* @@ -1281,7 +1452,7 @@ nfs_buf_check_write_verifier(nfsnode_t np, struct nfsbuf *bp) void nfs_buf_refget(struct nfsbuf *bp) { - bp->nb_refs++; + os_ref_retain_locked(&bp->nb_refs); } /* * release a reference on a buffer @@ -1290,7 +1461,7 @@ nfs_buf_refget(struct nfsbuf *bp) void nfs_buf_refrele(struct nfsbuf *bp) { - bp->nb_refs--; + (void) os_ref_release_locked(&bp->nb_refs); } /* @@ -1304,30 +1475,33 @@ nfs_buf_acquire(struct nfsbuf *bp, int flags, int slpflag, int slptimeo) struct timespec ts; if (ISSET(bp->nb_lflags, NBL_BUSY)) { - /* - * since the mutex_lock may block, the buffer + /* + * since the lck_mtx_lock may block, the buffer * may become BUSY, so we need to recheck for * a NOWAIT request */ - if (flags & NBAC_NOWAIT) - return (EBUSY); - SET(bp->nb_lflags, NBL_WANTED); + if (flags & NBAC_NOWAIT) { + return EBUSY; + } + SET(bp->nb_lflags, NBL_WANTED); - ts.tv_sec = (slptimeo/100); + ts.tv_sec = (slptimeo / 100); /* the hz value is 100; which leads to 10ms */ ts.tv_nsec = (slptimeo % 100) * 10 * NSEC_PER_USEC * 1000; error = msleep(bp, nfs_buf_mutex, slpflag | (PRIBIO + 1), - "nfs_buf_acquire", &ts); - if (error) - return (error); - return (EAGAIN); + "nfs_buf_acquire", &ts); + if (error) { + return error; + } + return EAGAIN; + } + if (flags & NBAC_REMOVE) { + nfs_buf_remfree(bp); } - if (flags & NBAC_REMOVE) - nfs_buf_remfree(bp); SET(bp->nb_lflags, NBL_BUSY); - return (0); + return 0; } /* @@ -1339,17 +1513,19 @@ nfs_buf_drop(struct nfsbuf *bp) { int need_wakeup = 0; - if (!ISSET(bp->nb_lflags, NBL_BUSY)) + if (!ISSET(bp->nb_lflags, NBL_BUSY)) { panic("nfs_buf_drop: buffer not busy!"); + } if (ISSET(bp->nb_lflags, NBL_WANTED)) { - /* delay the actual wakeup until after we clear NBL_BUSY */ + /* delay the actual wakeup until after we clear NBL_BUSY */ need_wakeup = 1; } /* Unlock the buffer. 
*/ CLR(bp->nb_lflags, (NBL_BUSY | NBL_WANTED)); - if (need_wakeup) - wakeup(bp); + if (need_wakeup) { + wakeup(bp); + } } /* @@ -1362,31 +1538,32 @@ nfs_buf_iterprepare(nfsnode_t np, struct nfsbuflists *iterheadp, int flags) { struct nfsbuflists *listheadp; - if (flags & NBI_DIRTY) + if (flags & NBI_DIRTY) { listheadp = &np->n_dirtyblkhd; - else + } else { listheadp = &np->n_cleanblkhd; + } if ((flags & NBI_NOWAIT) && (np->n_bufiterflags & NBI_ITER)) { - LIST_INIT(iterheadp); - return(EWOULDBLOCK); + LIST_INIT(iterheadp); + return EWOULDBLOCK; } - while (np->n_bufiterflags & NBI_ITER) { - np->n_bufiterflags |= NBI_ITERWANT; + while (np->n_bufiterflags & NBI_ITER) { + np->n_bufiterflags |= NBI_ITERWANT; msleep(&np->n_bufiterflags, nfs_buf_mutex, 0, "nfs_buf_iterprepare", NULL); } if (LIST_EMPTY(listheadp)) { - LIST_INIT(iterheadp); - return(EINVAL); + LIST_INIT(iterheadp); + return EINVAL; } np->n_bufiterflags |= NBI_ITER; iterheadp->lh_first = listheadp->lh_first; - listheadp->lh_first->nb_vnbufs.le_prev = &iterheadp->lh_first; + listheadp->lh_first->nb_vnbufs.le_prev = &iterheadp->lh_first; LIST_INIT(listheadp); - return(0); + return 0; } /* @@ -1400,10 +1577,11 @@ nfs_buf_itercomplete(nfsnode_t np, struct nfsbuflists *iterheadp, int flags) struct nfsbuflists * listheadp; struct nfsbuf *bp; - if (flags & NBI_DIRTY) + if (flags & NBI_DIRTY) { listheadp = &np->n_dirtyblkhd; - else + } else { listheadp = &np->n_cleanblkhd; + } while (!LIST_EMPTY(iterheadp)) { bp = LIST_FIRST(iterheadp); @@ -1432,19 +1610,22 @@ nfs_buf_read(struct nfsbuf *bp) np = bp->nb_np; cred = bp->nb_rcred; - if (IS_VALID_CRED(cred)) + if (IS_VALID_CRED(cred)) { kauth_cred_ref(cred); + } thd = ISSET(bp->nb_flags, NB_ASYNC) ? NULL : current_thread(); /* sanity checks */ - if (!ISSET(bp->nb_flags, NB_READ)) + if (!ISSET(bp->nb_flags, NB_READ)) { panic("nfs_buf_read: !NB_READ"); - if (ISSET(bp->nb_flags, NB_DONE)) + } + if (ISSET(bp->nb_flags, NB_DONE)) { CLR(bp->nb_flags, NB_DONE); + } NFS_BUF_MAP(bp); - OSAddAtomic(1, (SInt32 *)&nfsstats.read_bios); + OSAddAtomic64(1, &nfsstats.read_bios); error = nfs_buf_read_rpc(bp, thd, cred); /* @@ -1452,9 +1633,10 @@ nfs_buf_read(struct nfsbuf *bp) * read. Otherwise, the read has already been finished. */ - if (IS_VALID_CRED(cred)) + if (IS_VALID_CRED(cred)) { kauth_cred_unref(&cred); - return (error); + } + return error; } /* @@ -1470,7 +1652,7 @@ nfs_buf_read_finish(struct nfsbuf *bp) /* update valid range */ bp->nb_validoff = 0; bp->nb_validend = bp->nb_endio; - if (bp->nb_endio < bp->nb_bufsize) { + if (bp->nb_endio < bp->nb_bufsize) { /* * The read may be short because we have unflushed writes * that are extending the file size and the reads hit the @@ -1480,20 +1662,22 @@ nfs_buf_read_finish(struct nfsbuf *bp) * in nfs_buf_read_rpc_finish(). 
*/ off_t boff = NBOFF(bp); - if ((off_t)np->n_size >= (boff + bp->nb_bufsize)) + if ((off_t)np->n_size >= (boff + bp->nb_bufsize)) { bp->nb_validend = bp->nb_bufsize; - else if ((off_t)np->n_size >= boff) + } else if ((off_t)np->n_size >= boff) { bp->nb_validend = np->n_size - boff; - else + } else { bp->nb_validend = 0; + } } if ((nmp = NFSTONMP(np)) && (nmp->nm_vers == NFS_VER2) && - ((NBOFF(bp) + bp->nb_validend) > 0x100000000LL)) + ((NBOFF(bp) + bp->nb_validend) > 0x100000000LL)) { bp->nb_validend = 0x100000000LL - NBOFF(bp); - bp->nb_valid = (1 << (round_page_32(bp->nb_validend) / PAGE_SIZE)) - 1; + } + nfs_buf_pgs_get_page_mask(&bp->nb_valid, round_page_64(bp->nb_validend) / PAGE_SIZE); if (bp->nb_validend & PAGE_MASK) { /* zero-fill remainder of last page */ - bzero(bp->nb_data + bp->nb_validend, bp->nb_bufsize - bp->nb_validend); + bzero(bp->nb_data + bp->nb_validend, PAGE_SIZE - (bp->nb_validend & PAGE_MASK)); } } nfs_buf_iodone(bp); @@ -1508,17 +1692,20 @@ nfs_buf_read_rpc(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred) struct nfsmount *nmp; nfsnode_t np = bp->nb_np; int error = 0, nfsvers, async; - int offset, length, nmrsize, nrpcs, len; + int offset; + uint64_t length, nrpcs; + uint32_t nmrsize; + size_t len; off_t boff; struct nfsreq *req; struct nfsreq_cbinfo cb; nmp = NFSTONMP(np); - if (!nmp) { + if (nfs_mount_gone(nmp)) { bp->nb_error = error = ENXIO; SET(bp->nb_flags, NB_ERROR); nfs_buf_iodone(bp); - return (error); + return error; } nfsvers = nmp->nm_vers; nmrsize = nmp->nm_rsize; @@ -1532,10 +1719,11 @@ nfs_buf_read_rpc(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred) bp->nb_error = error = EFBIG; SET(bp->nb_flags, NB_ERROR); nfs_buf_iodone(bp); - return (error); + return error; } - if ((boff + length - 1) > 0xffffffffLL) + if ((boff + length - 1) > 0xffffffffLL) { length = 0x100000000LL - boff; + } } /* Note: Can only do async I/O if nfsiods are configured. */ @@ -1556,17 +1744,24 @@ nfs_buf_read_rpc(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred) error = bp->nb_error; break; } - len = (length > nmrsize) ? nmrsize : length; - cb.rcb_args[0] = offset; - cb.rcb_args[1] = len; + len = (length > nmrsize) ? 
nmrsize : (uint32_t)length; + cb.rcb_args.offset = offset; + cb.rcb_args.length = len; +#if CONFIG_NFS4 + if (nmp->nm_vers >= NFS_VER4) { + cb.rcb_args.stategenid = nmp->nm_stategenid; + } +#endif req = NULL; error = nmp->nm_funcs->nf_read_rpc_async(np, boff + offset, len, thd, cred, &cb, &req); - if (error) + if (error) { break; + } offset += len; length -= len; - if (async) + if (async) { continue; + } nfs_buf_read_rpc_finish(req); if (ISSET(bp->nb_flags, NB_ERROR)) { error = bp->nb_error; @@ -1591,9 +1786,10 @@ nfs_buf_read_rpc(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred) nfs_buf_iodone(bp); } else { /* wait for the last RPC to mark it done */ - while (bp->nb_rpcs > 0) + while (bp->nb_rpcs > 0) { msleep(&bp->nb_rpcs, nfs_buf_mutex, 0, - "nfs_buf_read_rpc_cancel", NULL); + "nfs_buf_read_rpc_cancel", NULL); + } lck_mtx_unlock(nfs_buf_mutex); } } else { @@ -1601,7 +1797,7 @@ nfs_buf_read_rpc(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred) } } - return (error); + return error; } /* @@ -1611,29 +1807,34 @@ void nfs_buf_read_rpc_finish(struct nfsreq *req) { struct nfsmount *nmp; - size_t rlen; + size_t rlen, length; struct nfsreq_cbinfo cb; struct nfsbuf *bp; - int error = 0, nfsvers, offset, length, eof = 0, multasyncrpc, finished; + int error = 0, nfsvers, eof = 0, multasyncrpc, finished; + off_t offset; void *wakeme = NULL; struct nfsreq *rreq = NULL; nfsnode_t np; thread_t thd; kauth_cred_t cred; - struct uio uio; - struct iovec_32 io; + uio_t auio; + char uio_buf[UIO_SIZEOF(1)]; finish: np = req->r_np; thd = req->r_thread; cred = req->r_cred; - if (IS_VALID_CRED(cred)) + if (IS_VALID_CRED(cred)) { kauth_cred_ref(cred); + } cb = req->r_callback; bp = cb.rcb_bp; + if (cb.rcb_func) { /* take an extra reference on the nfsreq in case we want to resend it later due to grace error */ + nfs_request_ref(req, 0); + } nmp = NFSTONMP(np); - if (!nmp) { + if (nfs_mount_gone(nmp)) { SET(bp->nb_flags, NB_ERROR); bp->nb_error = error = ENXIO; } @@ -1644,39 +1845,82 @@ finish: } nfsvers = nmp->nm_vers; - offset = cb.rcb_args[0]; - rlen = length = cb.rcb_args[1]; - - uio.uio_iovs.iov32p = &io; - uio.uio_iovcnt = 1; - uio.uio_rw = UIO_READ; -#if 1 /* LP64todo - can't use new segment flags until the drivers are ready */ - uio.uio_segflg = UIO_SYSSPACE; -#else - uio.uio_segflg = UIO_SYSSPACE32; -#endif - io.iov_len = length; - uio_uio_resid_set(&uio, io.iov_len); - uio.uio_offset = NBOFF(bp) + offset; - io.iov_base = (uintptr_t) bp->nb_data + offset; + offset = cb.rcb_args.offset; + rlen = length = cb.rcb_args.length; + + auio = uio_createwithbuffer(1, NBOFF(bp) + offset, UIO_SYSSPACE, + UIO_READ, &uio_buf, sizeof(uio_buf)); + uio_addiov(auio, CAST_USER_ADDR_T(bp->nb_data + offset), length); /* finish the RPC */ - error = nmp->nm_funcs->nf_read_rpc_async_finish(np, req, &uio, &rlen, &eof); + error = nmp->nm_funcs->nf_read_rpc_async_finish(np, req, auio, &rlen, &eof); if ((error == EINPROGRESS) && cb.rcb_func) { /* async request restarted */ - if (IS_VALID_CRED(cred)) + if (cb.rcb_func) { + nfs_request_rele(req); + } + if (IS_VALID_CRED(cred)) { kauth_cred_unref(&cred); + } return; } - +#if CONFIG_NFS4 + if ((nmp->nm_vers >= NFS_VER4) && nfs_mount_state_error_should_restart(error) && !ISSET(bp->nb_flags, NB_ERROR)) { + lck_mtx_lock(&nmp->nm_lock); + if ((error != NFSERR_OLD_STATEID) && (error != NFSERR_GRACE) && (cb.rcb_args.stategenid == nmp->nm_stategenid)) { + NP(np, "nfs_buf_read_rpc_finish: error %d @ 0x%llx, 0x%x 0x%x, initiating recovery", + error, NBOFF(bp) + offset, cb.rcb_args.stategenid, 
nmp->nm_stategenid); + nfs_need_recover(nmp, error); + } + lck_mtx_unlock(&nmp->nm_lock); + if (np->n_flag & NREVOKE) { + error = EIO; + } else { + if (error == NFSERR_GRACE) { + if (cb.rcb_func) { + /* + * For an async I/O request, handle a grace delay just like + * jukebox errors. Set the resend time and queue it up. + */ + struct timeval now; + if (req->r_nmrep.nmc_mhead) { + mbuf_freem(req->r_nmrep.nmc_mhead); + req->r_nmrep.nmc_mhead = NULL; + } + req->r_error = 0; + microuptime(&now); + lck_mtx_lock(&req->r_mtx); + req->r_resendtime = now.tv_sec + 2; + req->r_xid = 0; // get a new XID + req->r_flags |= R_RESTART; + req->r_start = 0; + nfs_asyncio_resend(req); + lck_mtx_unlock(&req->r_mtx); + if (IS_VALID_CRED(cred)) { + kauth_cred_unref(&cred); + } + /* Note: nfsreq reference taken will be dropped later when finished */ + return; + } + /* otherwise, just pause a couple seconds and retry */ + tsleep(&nmp->nm_state, (PZERO - 1), "nfsgrace", 2 * hz); + } + if (!(error = nfs_mount_state_wait_for_recovery(nmp))) { + rlen = 0; + goto readagain; + } + } + } +#endif if (error) { SET(bp->nb_flags, NB_ERROR); bp->nb_error = error; goto out; } - if ((rlen > 0) && (bp->nb_endio < (offset + (int)rlen))) + if ((rlen > 0) && (bp->nb_endio < (offset + (int)rlen))) { bp->nb_endio = offset + rlen; + } if ((nfsvers == NFS_VER2) || eof || (rlen == 0)) { /* zero out the remaining data (up to EOF) */ @@ -1684,9 +1928,10 @@ finish: rpcrem = (length - rlen); eofrem = np->n_size - (NBOFF(bp) + offset + rlen); rem = (rpcrem < eofrem) ? rpcrem : eofrem; - if (rem > 0) - bzero(bp->nb_data + offset + rlen, rem); - } else if (((int)rlen < length) && !ISSET(bp->nb_flags, NB_ERROR)) { + if (rem > 0) { + NFS_BZERO(bp->nb_data + offset + rlen, rem); + } + } else if ((rlen < length) && !ISSET(bp->nb_flags, NB_ERROR)) { /* * short read * @@ -1694,19 +1939,30 @@ finish: * requested, so we need to issue another read for the rest. * (Don't bother if the buffer already hit an error.) */ +#if CONFIG_NFS4 +readagain: +#endif offset += rlen; length -= rlen; - cb.rcb_args[0] = offset; - cb.rcb_args[1] = length; - error = nmp->nm_funcs->nf_read_rpc_async(np, offset, length, thd, cred, &cb, &rreq); + cb.rcb_args.offset = offset; + cb.rcb_args.length = length; +#if CONFIG_NFS4 + if (nmp->nm_vers >= NFS_VER4) { + cb.rcb_args.stategenid = nmp->nm_stategenid; + } +#endif + error = nmp->nm_funcs->nf_read_rpc_async(np, NBOFF(bp) + offset, length, thd, cred, &cb, &rreq); if (!error) { - if (IS_VALID_CRED(cred)) + if (IS_VALID_CRED(cred)) { kauth_cred_unref(&cred); + } if (!cb.rcb_func) { /* if !async we'll need to wait for this RPC to finish */ req = rreq; + rreq = NULL; goto finish; } + nfs_request_rele(req); /* * We're done here. * Outstanding RPC count is unchanged. @@ -1719,8 +1975,12 @@ finish: } out: - if (IS_VALID_CRED(cred)) + if (cb.rcb_func) { + nfs_request_rele(req); + } + if (IS_VALID_CRED(cred)) { kauth_cred_unref(&cred); + } /* * Decrement outstanding RPC count on buffer @@ -1732,21 +1992,25 @@ out: */ multasyncrpc = ISSET(bp->nb_flags, NB_MULTASYNCRPC); - if (multasyncrpc) + if (multasyncrpc) { lck_mtx_lock(nfs_buf_mutex); + } bp->nb_rpcs--; finished = (bp->nb_rpcs == 0); - if (multasyncrpc) + if (multasyncrpc) { lck_mtx_unlock(nfs_buf_mutex); + } if (finished) { - if (multasyncrpc) + if (multasyncrpc) { wakeme = &bp->nb_rpcs; + } nfs_buf_read_finish(bp); - if (wakeme) + if (wakeme) { wakeup(wakeme); + } } } @@ -1754,277 +2018,248 @@ out: * Do buffer readahead. * Initiate async I/O to read buffers not in cache. 
*/ -static int +int nfs_buf_readahead(nfsnode_t np, int ioflag, daddr64_t *rabnp, daddr64_t lastrabn, thread_t thd, kauth_cred_t cred) { struct nfsmount *nmp = NFSTONMP(np); struct nfsbuf *bp; - int error = 0, nra; + int error = 0; + uint32_t nra; - if (!nmp) - return (ENXIO); - if (nmp->nm_readahead <= 0) - return (0); - if (*rabnp > lastrabn) - return (0); + if (nfs_mount_gone(nmp)) { + return ENXIO; + } + if (nmp->nm_readahead <= 0) { + return 0; + } + if (*rabnp > lastrabn) { + return 0; + } for (nra = 0; (nra < nmp->nm_readahead) && (*rabnp <= lastrabn); nra++, *rabnp = *rabnp + 1) { /* check if block exists and is valid. */ - error = nfs_buf_get(np, *rabnp, nmp->nm_biosize, thd, NBLK_READ|NBLK_NOWAIT, &bp); - if (error) + if ((*rabnp * nmp->nm_biosize) >= (off_t)np->n_size) { + /* stop reading ahead if we're beyond EOF */ + *rabnp = lastrabn; break; - if (!bp) + } + error = nfs_buf_get(np, *rabnp, nmp->nm_biosize, thd, NBLK_READ | NBLK_NOWAIT, &bp); + if (error) { + break; + } + nfs_node_lock_force(np); + np->n_lastrahead = *rabnp; + nfs_node_unlock(np); + if (!bp) { continue; + } if ((ioflag & IO_NOCACHE) && ISSET(bp->nb_flags, NB_CACHE) && - !bp->nb_dirty && !ISSET(bp->nb_flags, (NB_DELWRI|NB_NCRDAHEAD))) { + !nfs_buf_pgs_is_set(&bp->nb_dirty) && !ISSET(bp->nb_flags, (NB_DELWRI | NB_NCRDAHEAD))) { CLR(bp->nb_flags, NB_CACHE); - bp->nb_valid = 0; + NBPGS_ERASE(&bp->nb_valid); bp->nb_validoff = bp->nb_validend = -1; } - if ((bp->nb_dirtyend <= 0) && !bp->nb_dirty && - !ISSET(bp->nb_flags, (NB_CACHE|NB_DELWRI))) { - SET(bp->nb_flags, (NB_READ|NB_ASYNC)); - if (ioflag & IO_NOCACHE) + if ((bp->nb_dirtyend <= 0) && !nfs_buf_pgs_is_set(&bp->nb_dirty) && + !ISSET(bp->nb_flags, (NB_CACHE | NB_DELWRI))) { + SET(bp->nb_flags, (NB_READ | NB_ASYNC)); + if (ioflag & IO_NOCACHE) { SET(bp->nb_flags, NB_NCRDAHEAD); + } if (!IS_VALID_CRED(bp->nb_rcred) && IS_VALID_CRED(cred)) { kauth_cred_ref(cred); bp->nb_rcred = cred; } - if ((error = nfs_buf_read(bp))) + if ((error = nfs_buf_read(bp))) { break; + } continue; } nfs_buf_release(bp, 1); } - return (error); + return error; } /* - * NFS buffer I/O for reading files/directories. + * NFS buffer I/O for reading files. 
*/ int -nfs_bioread(nfsnode_t np, struct uio *uio, int ioflag, int *eofflag, vfs_context_t ctx) +nfs_bioread(nfsnode_t np, uio_t uio, int ioflag, vfs_context_t ctx) { vnode_t vp = NFSTOV(np); struct nfsbuf *bp = NULL; - struct nfs_vattr nvattr; struct nfsmount *nmp = VTONMP(vp); - daddr64_t lbn, rabn = 0, lastrabn, maxrabn = -1, tlbn; - off_t diff; - int error = 0, n = 0, on = 0; - int nfsvers, biosize; - caddr_t dp; - struct dirent *direntp = NULL; - enum vtype vtype; + daddr64_t lbn, rabn = 0, lastrabn, maxrabn = -1; + off_t diff, on = 0, n = 0; + int error = 0, n32; + int nfsvers, biosize, modified, readaheads = 0; thread_t thd; kauth_cred_t cred; + int64_t io_resid; - FSDBG_TOP(514, np, uio->uio_offset, uio_uio_resid(uio), ioflag); - - if (uio_uio_resid(uio) == 0) { - FSDBG_BOT(514, np, 0xd1e0001, 0, 0); - return (0); - } - if (uio->uio_offset < 0) { - FSDBG_BOT(514, np, 0xd1e0002, 0, EINVAL); - return (EINVAL); - } + FSDBG_TOP(514, np, uio_offset(uio), uio_resid(uio), ioflag); nfsvers = nmp->nm_vers; biosize = nmp->nm_biosize; thd = vfs_context_thread(ctx); cred = vfs_context_ucred(ctx); - vtype = vnode_vtype(vp); - if ((vtype != VREG) && (vtype != VDIR)) { - printf("nfs_bioread: type %x unexpected\n", vtype); + if (vnode_vtype(vp) != VREG) { + printf("nfs_bioread: type %x unexpected\n", vnode_vtype(vp)); FSDBG_BOT(514, np, 0xd1e0016, 0, EINVAL); - return (EINVAL); + return EINVAL; } /* - * For nfs, cache consistency can only be maintained approximately. + * For NFS, cache consistency can only be maintained approximately. * Although RFC1094 does not specify the criteria, the following is * believed to be compatible with the reference port. - * For nfs: - * If the file's modify time on the server has changed since the - * last read rpc or you have written to the file, - * you may have lost data cache consistency with the - * server, so flush all of the file's data out of the cache. - * Then force a getattr rpc to ensure that you have up to date - * attributes. + * + * If the file has changed since the last read RPC or you have + * written to the file, you may have lost data cache consistency + * with the server. So, check for a change, and flush all of the + * file's data out of the cache. * NB: This implies that cache data can be read when up to - * NFS_MAXATTRTIMEO seconds out of date. If you find that you need - * current attributes this could be forced by calling - * NATTRINVALIDATE() before the nfs_getattr() call. + * NFS_MAXATTRTIMO seconds out of date. If you find that you + * need current attributes, nfs_getattr() can be forced to fetch + * new attributes (via NATTRINVALIDATE() or NGA_UNCACHED). 
*/ - if (ISSET(np->n_flag, NUPDATESIZE)) + if (ISSET(np->n_flag, NUPDATESIZE)) { nfs_data_update_size(np, 0); + } - if ((error = nfs_lock(np, NFS_NODE_LOCK_EXCLUSIVE))) { + if ((error = nfs_node_lock(np))) { FSDBG_BOT(514, np, 0xd1e0222, 0, error); - return (error); + return error; } if (np->n_flag & NNEEDINVALIDATE) { np->n_flag &= ~NNEEDINVALIDATE; - nfs_unlock(np); - nfs_vinvalbuf(vp, V_SAVE|V_IGNORE_WRITEERR, ctx, 1); - if ((error = nfs_lock(np, NFS_NODE_LOCK_EXCLUSIVE))) { + nfs_node_unlock(np); + error = nfs_vinvalbuf(vp, V_SAVE | V_IGNORE_WRITEERR, ctx, 1); + if (!error) { + error = nfs_node_lock(np); + } + if (error) { FSDBG_BOT(514, np, 0xd1e0322, 0, error); - return (error); + return error; } } - if (np->n_flag & NMODIFIED) { - if (vtype == VDIR) { - nfs_invaldir(np); - nfs_unlock(np); - error = nfs_vinvalbuf(vp, V_SAVE, ctx, 1); - if (!error) - error = nfs_lock(np, NFS_NODE_LOCK_EXCLUSIVE); - if (error) { - FSDBG_BOT(514, np, 0xd1e0003, 0, error); - return (error); + modified = (np->n_flag & NMODIFIED); + nfs_node_unlock(np); + /* nfs_getattr() will check changed and purge caches */ + error = nfs_getattr(np, NULL, ctx, modified ? NGA_UNCACHED : NGA_CACHED); + if (error) { + FSDBG_BOT(514, np, 0xd1e0004, 0, error); + return error; + } + + if (uio_resid(uio) == 0) { + FSDBG_BOT(514, np, 0xd1e0001, 0, 0); + return 0; + } + if (uio_offset(uio) < 0) { + FSDBG_BOT(514, np, 0xd1e0002, 0, EINVAL); + return EINVAL; + } + + /* + * set up readahead - which may be limited by: + * + current request length (for IO_NOCACHE) + * + readahead setting + * + file size + */ + if (nmp->nm_readahead > 0) { + off_t end = uio_offset(uio) + uio_resid(uio); + if (end > (off_t)np->n_size) { + end = np->n_size; + } + rabn = uio_offset(uio) / biosize; + maxrabn = (end - 1) / biosize; + nfs_node_lock_force(np); + if (!(ioflag & IO_NOCACHE) && + (!rabn || (rabn == np->n_lastread) || (rabn == (np->n_lastread + 1)))) { + maxrabn += nmp->nm_readahead; + if ((maxrabn * biosize) >= (off_t)np->n_size) { + maxrabn = ((off_t)np->n_size - 1) / biosize; } } - NATTRINVALIDATE(np); - error = nfs_getattr(np, &nvattr, ctx, 1); - if (error) { - nfs_unlock(np); - FSDBG_BOT(514, np, 0xd1e0004, 0, error); - return (error); + if (maxrabn < np->n_lastrahead) { + np->n_lastrahead = -1; } - if (vtype == VDIR) { - /* if directory changed, purge any name cache entries */ - if (NFS_CHANGED_NC(nfsvers, np, &nvattr)) - cache_purge(vp); - NFS_CHANGED_UPDATE_NC(nfsvers, np, &nvattr); + if (rabn < np->n_lastrahead) { + rabn = np->n_lastrahead + 1; } - NFS_CHANGED_UPDATE(nfsvers, np, &nvattr); + nfs_node_unlock(np); } else { - error = nfs_getattr(np, &nvattr, ctx, 1); - if (error) { - nfs_unlock(np); - FSDBG_BOT(514, np, 0xd1e0005, 0, error); - return (error); - } - if (NFS_CHANGED(nfsvers, np, &nvattr)) { - if (vtype == VDIR) { - nfs_invaldir(np); - /* purge name cache entries */ - if (NFS_CHANGED_NC(nfsvers, np, &nvattr)) - cache_purge(vp); - } - nfs_unlock(np); - error = nfs_vinvalbuf(vp, V_SAVE, ctx, 1); - if (!error) - error = nfs_lock(np, NFS_NODE_LOCK_EXCLUSIVE); - if (error) { - FSDBG_BOT(514, np, 0xd1e0006, 0, error); - return (error); - } - if (vtype == VDIR) - NFS_CHANGED_UPDATE_NC(nfsvers, np, &nvattr); - NFS_CHANGED_UPDATE(nfsvers, np, &nvattr); - } + rabn = maxrabn = 0; } - nfs_unlock(np); - - if (vtype == VREG) { - if ((ioflag & IO_NOCACHE) && (uio_uio_resid(uio) < (2*biosize))) { - /* We have only a block or so to read, just do the rpc directly. 
*/ - error = nfs_read_rpc(np, uio, ctx); - FSDBG_BOT(514, np, uio->uio_offset, uio_uio_resid(uio), error); - return (error); - } - /* - * set up readahead - which may be limited by: - * + current request length (for IO_NOCACHE) - * + readahead setting - * + file size - */ - if (nmp->nm_readahead > 0) { - off_t end = uio->uio_offset + uio_uio_resid(uio); - if (end > (off_t)np->n_size) - end = np->n_size; - rabn = uio->uio_offset / biosize; - maxrabn = (end - 1) / biosize; - if (!(ioflag & IO_NOCACHE) && - (!rabn || (rabn == np->n_lastread) || (rabn == (np->n_lastread+1)))) { - maxrabn += nmp->nm_readahead; - if ((maxrabn * biosize) >= (off_t)np->n_size) - maxrabn = ((off_t)np->n_size - 1)/biosize; - } - } else { - rabn = maxrabn = 0; - } - } - - do { - - if (vtype == VREG) { - nfs_data_lock(np, NFS_NODE_LOCK_SHARED); - lbn = uio->uio_offset / biosize; + do { + nfs_data_lock(np, NFS_DATA_LOCK_SHARED); + lbn = uio_offset(uio) / biosize; /* * Copy directly from any cached pages without grabbing the bufs. - * - * Note: for "nocache" reads, we don't copy directly from UBC - * because any cached pages will be for readahead buffers that - * need to be invalidated anyway before we finish this request. + * (If we are NOCACHE and we've issued readahead requests, we need + * to grab the NB_NCRDAHEAD bufs to drop them.) */ - if (!(ioflag & IO_NOCACHE) && - (uio->uio_segflg == UIO_USERSPACE32 || - uio->uio_segflg == UIO_USERSPACE64 || - uio->uio_segflg == UIO_USERSPACE)) { - // LP64todo - fix this! - int io_resid = uio_uio_resid(uio); - diff = np->n_size - uio->uio_offset; - if (diff < io_resid) + if ((!(ioflag & IO_NOCACHE) || !readaheads) && + ((uio->uio_segflg == UIO_USERSPACE32 || + uio->uio_segflg == UIO_USERSPACE64 || + uio->uio_segflg == UIO_USERSPACE))) { + io_resid = uio_resid(uio); + diff = np->n_size - uio_offset(uio); + if (diff < io_resid) { io_resid = diff; + } if (io_resid > 0) { - error = cluster_copy_ubc_data(vp, uio, &io_resid, 0); + int count = (io_resid > INT_MAX) ? 
INT_MAX : (int)io_resid; + error = cluster_copy_ubc_data(vp, uio, &count, 0); if (error) { nfs_data_unlock(np); - FSDBG_BOT(514, np, uio->uio_offset, 0xcacefeed, error); - return (error); + FSDBG_BOT(514, np, uio_offset(uio), 0xcacefeed, error); + return error; } } /* count any biocache reads that we just copied directly */ - if (lbn != (uio->uio_offset / biosize)) { - OSAddAtomic((uio->uio_offset / biosize) - lbn, (SInt32*)&nfsstats.biocache_reads); - FSDBG(514, np, 0xcacefeed, uio->uio_offset, error); + if (lbn != (uio_offset(uio) / biosize)) { + OSAddAtomic64(NFS_ROUND_BLOCK(uio_offset(uio), biosize) - lbn, &nfsstats.biocache_reads); + FSDBG(514, np, 0xcacefeed, uio_offset(uio), error); } } - lbn = uio->uio_offset / biosize; - on = uio->uio_offset % biosize; - np->n_lastread = (uio->uio_offset - 1) / biosize; + lbn = uio_offset(uio) / biosize; + on = uio_offset(uio) % biosize; + nfs_node_lock_force(np); + np->n_lastread = (uio_offset(uio) - 1) / biosize; + nfs_node_unlock(np); + + if ((uio_resid(uio) <= 0) || (uio_offset(uio) >= (off_t)np->n_size)) { + nfs_data_unlock(np); + FSDBG_BOT(514, np, uio_offset(uio), uio_resid(uio), 0xaaaaaaaa); + return 0; + } /* adjust readahead block number, if necessary */ - if (rabn < lbn) + if (rabn < lbn) { rabn = lbn; + } lastrabn = MIN(maxrabn, lbn + nmp->nm_readahead); if (rabn <= lastrabn) { /* start readaheads */ error = nfs_buf_readahead(np, ioflag, &rabn, lastrabn, thd, cred); if (error) { nfs_data_unlock(np); FSDBG_BOT(514, np, 0xd1e000b, 1, error); - return (error); + return error; } + readaheads = 1; + OSAddAtomic64(rabn - lbn, &nfsstats.biocache_reads); + } else { + OSAddAtomic64(1, &nfsstats.biocache_reads); } - if ((uio_uio_resid(uio) <= 0) || (uio->uio_offset >= (off_t)np->n_size)) { - nfs_data_unlock(np); - FSDBG_BOT(514, np, uio->uio_offset, uio_uio_resid(uio), 0xaaaaaaaa); - return (0); - } - - OSAddAtomic(1, (SInt32*)&nfsstats.biocache_reads); - /* * If the block is in the cache and has the required data * in a valid region, just copy it out. @@ -2032,17 +2267,18 @@ nfs_bioread(nfsnode_t np, struct uio *uio, int ioflag, int *eofflag, vfs_context * as required. */ again: - // LP64todo - fix this! - n = min((unsigned)(biosize - on), uio_uio_resid(uio)); - diff = np->n_size - uio->uio_offset; - if (diff < n) + io_resid = uio_resid(uio); + n = (io_resid > (biosize - on)) ? (biosize - on) : io_resid; + diff = np->n_size - uio_offset(uio); + if (diff < n) { n = diff; + } error = nfs_buf_get(np, lbn, biosize, thd, NBLK_READ, &bp); if (error) { nfs_data_unlock(np); FSDBG_BOT(514, np, 0xd1e000c, 0, error); - return (error); + return error; } if ((ioflag & IO_NOCACHE) && ISSET(bp->nb_flags, NB_CACHE)) { @@ -2052,32 +2288,32 @@ again: * Invalidate the data if it wasn't just read * in as part of a "nocache readahead". */ - if (bp->nb_dirty || (bp->nb_dirtyend > 0)) { + if (nfs_buf_pgs_is_set(&bp->nb_dirty) || (bp->nb_dirtyend > 0)) { /* so write the buffer out and try again */ SET(bp->nb_flags, NB_NOCACHE); goto flushbuffer; } - if (!ISSET(bp->nb_flags, NB_NCRDAHEAD)) { - CLR(bp->nb_flags, NB_CACHE); - bp->nb_valid = 0; - } else { + if (ISSET(bp->nb_flags, NB_NCRDAHEAD)) { CLR(bp->nb_flags, NB_NCRDAHEAD); + SET(bp->nb_flags, NB_NOCACHE); } } /* if any pages are valid... 
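Each pass of the loop above maps the current uio position to a logical block, an offset within that block, and a byte count clipped to both the block boundary and EOF. A rough stand-alone illustration (simplified types, hypothetical helper; assumes the caller has already returned at EOF):

#include <stdint.h>
#include <sys/types.h>

/* Sketch: split a file position into (block, offset-in-block, count). */
static void
clip_read(off_t uio_off, int64_t io_resid, off_t file_size, int biosize,
    int64_t *lbn, off_t *on, int64_t *n)
{
    *lbn = uio_off / biosize;
    *on = uio_off % biosize;
    /* never cross the block boundary... */
    *n = (io_resid > (biosize - *on)) ? (biosize - *on) : io_resid;
    /* ...and never read past EOF (assumes uio_off < file_size) */
    if (file_size - uio_off < *n)
        *n = file_size - uio_off;
}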
*/ - if (bp->nb_valid) { + if (nfs_buf_pgs_is_set(&bp->nb_valid)) { /* ...check for any invalid pages in the read range */ - int pg, firstpg, lastpg, dirtypg; + off_t pg, firstpg, lastpg, dirtypg; dirtypg = firstpg = lastpg = -1; - pg = on/PAGE_SIZE; - while (pg <= (on + n - 1)/PAGE_SIZE) { - if (!NBPGVALID(bp,pg)) { - if (firstpg < 0) + pg = on / PAGE_SIZE; + while (pg <= (on + n - 1) / PAGE_SIZE) { + if (!NBPGVALID(bp, pg)) { + if (firstpg < 0) { firstpg = pg; + } lastpg = pg; - } else if (firstpg >= 0 && dirtypg < 0 && NBPGDIRTY(bp,pg)) + } else if (firstpg >= 0 && dirtypg < 0 && NBPGDIRTY(bp, pg)) { dirtypg = pg; + } pg++; } @@ -2086,8 +2322,8 @@ again: if (bp->nb_validoff < 0) { /* valid range isn't set up, so */ /* set it to what we know is valid */ - bp->nb_validoff = trunc_page(on); - bp->nb_validend = round_page(on+n); + bp->nb_validoff = trunc_page_64(on); + bp->nb_validend = round_page_64(on + n); nfs_buf_normalize_valid_range(np, bp); } goto buffer_ready; @@ -2095,7 +2331,7 @@ again: /* there are invalid pages in the read range */ if (((dirtypg > firstpg) && (dirtypg < lastpg)) || - (((firstpg*PAGE_SIZE) < bp->nb_dirtyend) && (((lastpg+1)*PAGE_SIZE) > bp->nb_dirtyoff))) { + (((firstpg * PAGE_SIZE) < bp->nb_dirtyend) && (((lastpg + 1) * PAGE_SIZE) > bp->nb_dirtyoff))) { /* there are also dirty page(s) (or range) in the read range, */ /* so write the buffer out and try again */ flushbuffer: @@ -2109,41 +2345,42 @@ flushbuffer: if (error) { nfs_data_unlock(np); FSDBG_BOT(514, np, 0xd1e000d, 0, error); - return (error); + return error; } goto again; } - if (!bp->nb_dirty && bp->nb_dirtyend <= 0 && - (lastpg - firstpg + 1) > (biosize/PAGE_SIZE)/2) { + if (!nfs_buf_pgs_is_set(&bp->nb_dirty) && bp->nb_dirtyend <= 0 && + (lastpg - firstpg + 1) > (biosize / PAGE_SIZE) / 2) { /* we need to read in more than half the buffer and the */ /* buffer's not dirty, so just fetch the whole buffer */ - bp->nb_valid = 0; + NBPGS_ERASE(&bp->nb_valid); } else { /* read the page range in */ uio_t auio; - char uio_buf[ UIO_SIZEOF(1) ]; - + char uio_buf[UIO_SIZEOF(1)]; + NFS_BUF_MAP(bp); auio = uio_createwithbuffer(1, (NBOFF(bp) + firstpg * PAGE_SIZE_64), - UIO_SYSSPACE, UIO_READ, &uio_buf[0], sizeof(uio_buf)); + UIO_SYSSPACE, UIO_READ, &uio_buf[0], sizeof(uio_buf)); if (!auio) { error = ENOMEM; } else { - uio_addiov(auio, CAST_USER_ADDR_T((bp->nb_data + firstpg * PAGE_SIZE)), - ((lastpg - firstpg + 1) * PAGE_SIZE)); + NFS_UIO_ADDIOV(auio, CAST_USER_ADDR_T(bp->nb_data + (firstpg * PAGE_SIZE)), + ((lastpg - firstpg + 1) * PAGE_SIZE)); error = nfs_read_rpc(np, auio, ctx); } if (error) { - if (ioflag & IO_NOCACHE) + if (ioflag & IO_NOCACHE) { SET(bp->nb_flags, NB_NOCACHE); + } nfs_buf_release(bp, 1); nfs_data_unlock(np); FSDBG_BOT(514, np, 0xd1e000e, 0, error); - return (error); + return error; } /* Make sure that the valid range is set to cover this read. 
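The page scan above decides between four outcomes: serve the read from cache, flush and retry, refetch the whole block, or read just the missing pages. A simplified single-word-bitmap version of that decision (assumes at most 64 pages per block; illustrative only, not the kernel's nfsbufpgs code):

#include <stdint.h>

enum read_plan { USE_CACHE, FLUSH_AND_RETRY, READ_WHOLE, READ_RANGE };

/*
 * Sketch: decide how to satisfy a read of [on, on+n) from a cached
 * block.  'valid'/'dirty' are per-page bitmaps (bit i == page i) and
 * dirtyoff/dirtyend describe the dirty byte range, as in the buffer.
 */
static enum read_plan
plan_read(uint64_t valid, uint64_t dirty, int dirtyoff, int dirtyend,
    int on, int n, int pagesize, int biosize, int *firstpg, int *lastpg)
{
    int pg, dirtypg = -1;

    *firstpg = *lastpg = -1;
    for (pg = on / pagesize; pg <= (on + n - 1) / pagesize; pg++) {
        if (!(valid & (1ULL << pg))) {
            if (*firstpg < 0)
                *firstpg = pg;
            *lastpg = pg;
        } else if (*firstpg >= 0 && dirtypg < 0 && (dirty & (1ULL << pg))) {
            dirtypg = pg;
        }
    }
    if (*firstpg < 0)
        return USE_CACHE;        /* every page in the range is valid */
    if ((dirtypg > *firstpg && dirtypg < *lastpg) ||
        ((*firstpg * pagesize) < dirtyend &&
        ((*lastpg + 1) * pagesize) > dirtyoff))
        return FLUSH_AND_RETRY;  /* dirty data overlaps the hole */
    if (dirty == 0 && dirtyend <= 0 &&
        (*lastpg - *firstpg + 1) > (biosize / pagesize) / 2)
        return READ_WHOLE;       /* cheaper to refetch the whole block */
    return READ_RANGE;           /* RPC just for pages firstpg..lastpg */
}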
*/ - bp->nb_validoff = trunc_page_32(on); - bp->nb_validend = round_page_32(on+n); + bp->nb_validoff = trunc_page_64(on); + bp->nb_validend = round_page_64(on + n); nfs_buf_normalize_valid_range(np, bp); if (uio_resid(auio) > 0) { /* if short read, must have hit EOF, */ @@ -2151,12 +2388,13 @@ flushbuffer: bzero(CAST_DOWN(caddr_t, uio_curriovbase(auio)), uio_resid(auio)); } /* mark the pages (successfully read) as valid */ - for (pg=firstpg; pg <= lastpg; pg++) - NBPGVALID_SET(bp,pg); + for (pg = firstpg; pg <= lastpg; pg++) { + NBPGVALID_SET(bp, pg); + } } } /* if no pages are valid, read the whole block */ - if (!bp->nb_valid) { + if (!nfs_buf_pgs_is_set(&bp->nb_valid)) { if (!IS_VALID_CRED(bp->nb_rcred) && IS_VALID_CRED(cred)) { kauth_cred_ref(cred); bp->nb_rcred = cred; @@ -2164,196 +2402,80 @@ flushbuffer: SET(bp->nb_flags, NB_READ); CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL)); error = nfs_buf_read(bp); + if (ioflag & IO_NOCACHE) { + SET(bp->nb_flags, NB_NOCACHE); + } if (error) { nfs_data_unlock(np); nfs_buf_release(bp, 1); FSDBG_BOT(514, np, 0xd1e000f, 0, error); - return (error); + return error; } } buffer_ready: /* validate read range against valid range and clip */ if (bp->nb_validend > 0) { diff = (on >= bp->nb_validend) ? 0 : (bp->nb_validend - on); - if (diff < n) + if (diff < n) { n = diff; - } - if (n > 0) - NFS_BUF_MAP(bp); - } else if (vtype == VDIR) { - OSAddAtomic(1, (SInt32*)&nfsstats.biocache_readdirs); - error = nfs_lock(np, NFS_NODE_LOCK_SHARED); - if (error || (np->n_direofoffset && (uio->uio_offset >= np->n_direofoffset))) { - if (!error) - nfs_unlock(np); - if (eofflag) - *eofflag = 1; - FSDBG_BOT(514, np, 0xde0f0001, 0, 0); - return (0); - } - nfs_unlock(np); - lbn = uio->uio_offset / NFS_DIRBLKSIZ; - on = uio->uio_offset & (NFS_DIRBLKSIZ - 1); - error = nfs_buf_get(np, lbn, NFS_DIRBLKSIZ, thd, NBLK_READ, &bp); - if (error) { - FSDBG_BOT(514, np, 0xd1e0012, 0, error); - return (error); - } - if (!ISSET(bp->nb_flags, NB_CACHE)) { - SET(bp->nb_flags, NB_READ); - error = nfs_buf_readdir(bp, ctx); - if (error) - nfs_buf_release(bp, 1); - while (error == NFSERR_BAD_COOKIE) { - error = nfs_lock(np, NFS_NODE_LOCK_EXCLUSIVE); - if (!error) { - nfs_invaldir(np); - nfs_unlock(np); } - error = nfs_vinvalbuf(vp, 0, ctx, 1); - /* - * Yuck! The directory has been modified on the - * server. The only way to get the block is by - * reading from the beginning to get all the - * offset cookies. - */ - for (tlbn = 0; tlbn <= lbn && !error; tlbn++) { - if ((error = nfs_lock(np, NFS_NODE_LOCK_SHARED))) - break; - if (np->n_direofoffset - && (tlbn * NFS_DIRBLKSIZ) >= np->n_direofoffset) { - nfs_unlock(np); - if (eofflag) - *eofflag = 1; - FSDBG_BOT(514, np, 0xde0f0002, 0, 0); - return (0); - } - nfs_unlock(np); - error = nfs_buf_get(np, tlbn, NFS_DIRBLKSIZ, thd, NBLK_READ, &bp); - if (error) { - FSDBG_BOT(514, np, 0xd1e0013, 0, error); - return (error); - } - if (!ISSET(bp->nb_flags, NB_CACHE)) { - SET(bp->nb_flags, NB_READ); - error = nfs_buf_readdir(bp, ctx); - /* - * no error + NB_INVAL == directory EOF, - * use the block. - */ - if (error == 0 && ISSET(bp->nb_flags, NB_INVAL)) { - if (eofflag) - *eofflag = 1; - break; - } - } - /* - * An error will throw away the block and the - * for loop will break out. If no error and this - * is not the block we want, we throw away the - * block and go for the next one via the for loop. - */ - if (error || (tlbn < lbn)) - nfs_buf_release(bp, 1); - } - } - /* - * The above while is repeated if we hit another cookie - * error. 
If we hit an error and it wasn't a cookie error, - * we give up. - */ - if (error) { - FSDBG_BOT(514, np, 0xd1e0014, 0, error); - return (error); - } } - /* - * Make sure we use a signed variant of min() since - * the second term may be negative. - */ - // LP64todo - fix this! - n = lmin(uio_uio_resid(uio), bp->nb_validend - on); - /* - * We keep track of the directory eof in - * np->n_direofoffset and chop it off as an - * extra step right here. - */ - if ((error = nfs_lock(np, NFS_NODE_LOCK_SHARED))) { - FSDBG_BOT(514, np, 0xd1e0115, 0, error); - return (error); - } - if (np->n_direofoffset && - n > np->n_direofoffset - uio->uio_offset) - n = np->n_direofoffset - uio->uio_offset; - nfs_unlock(np); - /* - * Make sure that we return an integral number of entries so - * that any subsequent calls will start copying from the start - * of the next entry. - * - * If the current value of n has the last entry cut short, - * set n to copy everything up to the last entry instead. - */ if (n > 0) { - dp = bp->nb_data + on; - while (dp < (bp->nb_data + on + n)) { - direntp = (struct dirent *)dp; - dp += direntp->d_reclen; + NFS_BUF_MAP(bp); + n32 = n > INT_MAX ? INT_MAX : (int)n; + error = uiomove(bp->nb_data + on, n32, uio); + if (!error && n > n32) { + error = uiomove(bp->nb_data + on + n32, (int)(n - n32), uio); } - if (dp > (bp->nb_data + on + n)) - n = (dp - direntp->d_reclen) - (bp->nb_data + on); } - } - if (n > 0) - error = uiomove(bp->nb_data + on, (int)n, uio); - if (vtype == VREG) { - if (ioflag & IO_NOCACHE) - SET(bp->nb_flags, NB_NOCACHE); nfs_buf_release(bp, 1); nfs_data_unlock(np); - np->n_lastread = (uio->uio_offset - 1) / biosize; - } else { - nfs_buf_release(bp, 1); - } - } while (error == 0 && uio_uio_resid(uio) > 0 && n > 0); - FSDBG_BOT(514, np, uio->uio_offset, uio_uio_resid(uio), error); - return (error); + nfs_node_lock_force(np); + np->n_lastread = (uio_offset(uio) - 1) / biosize; + nfs_node_unlock(np); + } while (error == 0 && uio_resid(uio) > 0 && n > 0); + FSDBG_BOT(514, np, uio_offset(uio), uio_resid(uio), error); + return error; } /* * limit the number of outstanding async I/O writes */ -static int +int nfs_async_write_start(struct nfsmount *nmp) { - int error = 0, slpflag = (nmp->nm_flag & NFSMNT_INT) ? PCATCH : 0; - struct timespec ts = {1, 0}; + int error = 0, slpflag = NMFLAG(nmp, INTR) ? 
PCATCH : 0; + struct timespec ts = { .tv_sec = 1, .tv_nsec = 0 }; - if (nfs_max_async_writes <= 0) - return (0); + if (nfs_max_async_writes <= 0) { + return 0; + } lck_mtx_lock(&nmp->nm_lock); - while (!error && (nfs_max_async_writes > 0) && (nmp->nm_asyncwrites >= nfs_max_async_writes)) { - if ((error = nfs_sigintr(nmp, NULL, current_thread(), 1))) + while ((nfs_max_async_writes > 0) && (nmp->nm_asyncwrites >= nfs_max_async_writes)) { + if ((error = nfs_sigintr(nmp, NULL, current_thread(), 1))) { break; - error = msleep(&nmp->nm_asyncwrites, &nmp->nm_lock, slpflag|(PZERO-1), "nfsasyncwrites", &ts); - if (error == EWOULDBLOCK) - error = 0; + } + msleep(&nmp->nm_asyncwrites, &nmp->nm_lock, slpflag | (PZERO - 1), "nfsasyncwrites", &ts); + slpflag = 0; } - if (!error) + if (!error) { nmp->nm_asyncwrites++; + } lck_mtx_unlock(&nmp->nm_lock); - return (error); + return error; } -static void +void nfs_async_write_done(struct nfsmount *nmp) { - if (nmp->nm_asyncwrites <= 0) + if (nmp->nm_asyncwrites <= 0) { return; + } lck_mtx_lock(&nmp->nm_lock); - if (nmp->nm_asyncwrites-- >= nfs_max_async_writes) + if (nmp->nm_asyncwrites-- >= nfs_max_async_writes) { wakeup(&nmp->nm_asyncwrites); + } lck_mtx_unlock(&nmp->nm_lock); } @@ -2376,19 +2498,20 @@ nfs_buf_write(struct nfsbuf *bp) thread_t thd; kauth_cred_t cred; proc_t p = current_proc(); - int iomode, doff, dend, firstpg, lastpg; - uint32_t pagemask; + int iomode; + off_t doff, dend, firstpg, lastpg; FSDBG_TOP(553, bp, NBOFF(bp), bp->nb_flags, 0); - if (!ISSET(bp->nb_lflags, NBL_BUSY)) + if (!ISSET(bp->nb_lflags, NBL_BUSY)) { panic("nfs_buf_write: buffer is not busy???"); + } np = bp->nb_np; async = ISSET(bp->nb_flags, NB_ASYNC); oldflags = bp->nb_flags; - CLR(bp->nb_flags, (NB_READ|NB_DONE|NB_ERROR|NB_DELWRI)); + CLR(bp->nb_flags, (NB_READ | NB_DONE | NB_ERROR | NB_DELWRI)); if (ISSET(oldflags, NB_DELWRI)) { lck_mtx_lock(nfs_buf_mutex); nfs_nbdwrite--; @@ -2398,46 +2521,63 @@ nfs_buf_write(struct nfsbuf *bp) } /* move to clean list */ - if (ISSET(oldflags, (NB_ASYNC|NB_DELWRI))) { + if (ISSET(oldflags, (NB_ASYNC | NB_DELWRI))) { lck_mtx_lock(nfs_buf_mutex); - if (bp->nb_vnbufs.le_next != NFSNOLIST) + if (bp->nb_vnbufs.le_next != NFSNOLIST) { LIST_REMOVE(bp, nb_vnbufs); + } LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs); lck_mtx_unlock(nfs_buf_mutex); } + nfs_node_lock_force(np); + np->n_numoutput++; + nfs_node_unlock(np); vnode_startwrite(NFSTOV(np)); - if (p && p->p_stats) - OSIncrementAtomic(&p->p_stats->p_ru.ru_oublock); + if (p && p->p_stats) { + OSIncrementAtomicLong(&p->p_stats->p_ru.ru_oublock); + } cred = bp->nb_wcred; - if (!IS_VALID_CRED(cred) && ISSET(bp->nb_flags, NB_READ)) + if (!IS_VALID_CRED(cred) && ISSET(bp->nb_flags, NB_READ)) { cred = bp->nb_rcred; /* shouldn't really happen, but... */ - if (IS_VALID_CRED(cred)) + } + if (IS_VALID_CRED(cred)) { kauth_cred_ref(cred); + } thd = async ? NULL : current_thread(); /* We need to make sure the pages are locked before doing I/O. 
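The nfs_async_write_start()/nfs_async_write_done() pair above is a counting gate on in-flight asynchronous writes. A rough pthread analogue of the same pattern (illustrative only; the kernel version sleeps a second at a time with msleep and also checks for signals):

#include <pthread.h>

static pthread_mutex_t gate_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  gate_cv   = PTHREAD_COND_INITIALIZER;
static int inflight;
static int max_async = 128;      /* nfs_max_async_writes stand-in */

static void
async_write_start(void)
{
    pthread_mutex_lock(&gate_lock);
    while (max_async > 0 && inflight >= max_async)
        pthread_cond_wait(&gate_cv, &gate_lock);
    inflight++;
    pthread_mutex_unlock(&gate_lock);
}

static void
async_write_done(void)
{
    pthread_mutex_lock(&gate_lock);
    if (inflight > 0 && inflight-- >= max_async)
        pthread_cond_broadcast(&gate_cv);   /* wakeup(&nm_asyncwrites) analogue */
    pthread_mutex_unlock(&gate_lock);
}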
*/ - if (!ISSET(bp->nb_flags, NB_META) && UBCINFOEXISTS(NFSTOV(np))) { - if (!ISSET(bp->nb_flags, NB_PAGELIST)) { - error = nfs_buf_upl_setup(bp); - if (error) { - printf("nfs_buf_write: upl create failed %d\n", error); - SET(bp->nb_flags, NB_ERROR); - bp->nb_error = error = EIO; - nfs_buf_iodone(bp); - goto out; + if (!ISSET(bp->nb_flags, NB_META)) { + if (UBCINFOEXISTS(NFSTOV(np))) { + if (!ISSET(bp->nb_flags, NB_PAGELIST)) { + error = nfs_buf_upl_setup(bp); + if (error) { + printf("nfs_buf_write: upl create failed %d\n", error); + SET(bp->nb_flags, NB_ERROR); + bp->nb_error = error = EIO; + nfs_buf_iodone(bp); + goto out; + } + nfs_buf_upl_check(bp); } - nfs_buf_upl_check(bp); + } else { + /* We should never be in nfs_buf_write() with no UBCINFO. */ + printf("nfs_buf_write: ubcinfo already gone\n"); + SET(bp->nb_flags, NB_ERROR); + bp->nb_error = error = EIO; + nfs_buf_iodone(bp); + goto out; } } /* If NB_NEEDCOMMIT is set, a commit RPC may do the trick. */ - if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) + if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) { nfs_buf_check_write_verifier(np, bp); + } if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) { struct nfsmount *nmp = NFSTONMP(np); - if (!nmp) { + if (nfs_mount_gone(nmp)) { SET(bp->nb_flags, NB_ERROR); bp->nb_error = error = EIO; nfs_buf_iodone(bp); @@ -2445,7 +2585,7 @@ nfs_buf_write(struct nfsbuf *bp) } SET(bp->nb_flags, NB_WRITEINPROG); error = nmp->nm_funcs->nf_commit_rpc(np, NBOFF(bp) + bp->nb_dirtyoff, - bp->nb_dirtyend - bp->nb_dirtyoff, bp->nb_wcred); + bp->nb_dirtyend - bp->nb_dirtyoff, bp->nb_wcred, bp->nb_verf); CLR(bp->nb_flags, NB_WRITEINPROG); if (error) { if (error != NFSERR_STALEWRITEVERF) { @@ -2457,66 +2597,82 @@ nfs_buf_write(struct nfsbuf *bp) } bp->nb_dirtyoff = bp->nb_dirtyend = 0; CLR(bp->nb_flags, NB_NEEDCOMMIT); - nfs_lock(np, NFS_NODE_LOCK_FORCE); + nfs_node_lock_force(np); np->n_needcommitcnt--; CHECK_NEEDCOMMITCNT(np); - nfs_unlock(np); + nfs_node_unlock(np); } if (!error && (bp->nb_dirtyend > 0)) { /* sanity check the dirty range */ if (NBOFF(bp) + bp->nb_dirtyend > (off_t) np->n_size) { bp->nb_dirtyend = np->n_size - NBOFF(bp); - if (bp->nb_dirtyoff >= bp->nb_dirtyend) + if (bp->nb_dirtyoff >= bp->nb_dirtyend) { bp->nb_dirtyoff = bp->nb_dirtyend = 0; + } } } if (!error && (bp->nb_dirtyend > 0)) { /* there's a dirty range that needs to be written out */ + nfsbufpgs pagemask, pagemaskand; NFS_BUF_MAP(bp); doff = bp->nb_dirtyoff; dend = bp->nb_dirtyend; /* if doff page is dirty, move doff to start of page */ - if (NBPGDIRTY(bp, doff / PAGE_SIZE)) + if (NBPGDIRTY(bp, doff / PAGE_SIZE)) { doff -= doff & PAGE_MASK; + } /* try to expand write range to include preceding dirty pages */ - if (!(doff & PAGE_MASK)) - while ((doff > 0) && NBPGDIRTY(bp, (doff - 1) / PAGE_SIZE)) + if (!(doff & PAGE_MASK)) { + while ((doff > 0) && NBPGDIRTY(bp, (doff - 1) / PAGE_SIZE)) { doff -= PAGE_SIZE; + } + } /* if dend page is dirty, move dend to start of next page */ - if ((dend & PAGE_MASK) && NBPGDIRTY(bp, dend / PAGE_SIZE)) - dend = round_page_32(dend); + if ((dend & PAGE_MASK) && NBPGDIRTY(bp, dend / PAGE_SIZE)) { + dend = round_page_64(dend); + } /* try to expand write range to include trailing dirty pages */ - if (!(dend & PAGE_MASK)) - while ((dend < bp->nb_bufsize) && NBPGDIRTY(bp, dend / PAGE_SIZE)) + if (!(dend & PAGE_MASK)) { + while ((dend < (int)bp->nb_bufsize) && NBPGDIRTY(bp, dend / PAGE_SIZE)) { dend += PAGE_SIZE; + } + } /* make sure to keep dend clipped to EOF */ - if ((NBOFF(bp) + dend) > (off_t) np->n_size) + if ((NBOFF(bp) + dend) > 
(off_t) np->n_size) { dend = np->n_size - NBOFF(bp); + } /* calculate range of complete pages being written */ - firstpg = round_page_32(doff) / PAGE_SIZE; - lastpg = (trunc_page_32(dend) - 1) / PAGE_SIZE; - /* calculate mask for that page range */ - pagemask = ((1 << (lastpg + 1)) - 1) & ~((1 << firstpg) - 1); + if (dend > doff) { + firstpg = doff / PAGE_SIZE; + lastpg = (dend - 1) / PAGE_SIZE; + /* calculate mask for that page range */ + nfs_buf_pgs_set_pages_between(&pagemask, firstpg, lastpg + 1); + } else { + NBPGS_ERASE(&pagemask); + } /* * compare page mask to nb_dirty; if there are other dirty pages * then write FILESYNC; otherwise, write UNSTABLE if async and * not needcommit/stable; otherwise write FILESYNC */ - if (bp->nb_dirty & ~pagemask) + nfs_buf_pgs_bit_not(&pagemask); + nfs_buf_pgs_bit_and(&bp->nb_dirty, &pagemask, &pagemaskand); + if (nfs_buf_pgs_is_set(&pagemaskand)) { iomode = NFS_WRITE_FILESYNC; - else if ((bp->nb_flags & (NB_ASYNC | NB_NEEDCOMMIT | NB_STABLE)) == NB_ASYNC) + } else if ((bp->nb_flags & (NB_ASYNC | NB_NEEDCOMMIT | NB_STABLE)) == NB_ASYNC) { iomode = NFS_WRITE_UNSTABLE; - else + } else { iomode = NFS_WRITE_FILESYNC; + } /* write the whole contiguous dirty range */ bp->nb_offio = doff; bp->nb_endio = dend; - OSAddAtomic(1, (SInt32 *)&nfsstats.write_bios); + OSAddAtomic64(1, &nfsstats.write_bios); SET(bp->nb_flags, NB_WRITEINPROG); error = nfs_buf_write_rpc(bp, iomode, thd, cred); @@ -2527,8 +2683,9 @@ nfs_buf_write(struct nfsbuf *bp) * pages pushed out. */ } else { - if (!error && bp->nb_dirty) /* write out any dirty pages */ + if (!error && nfs_buf_pgs_is_set(&bp->nb_dirty)) { /* write out any dirty pages */ error = nfs_buf_write_dirty_pages(bp, thd, cred); + } nfs_buf_iodone(bp); } /* note: bp is still valid only for !async case */ @@ -2538,8 +2695,9 @@ out: /* move to clean list */ if (oldflags & NB_DELWRI) { lck_mtx_lock(nfs_buf_mutex); - if (bp->nb_vnbufs.le_next != NFSNOLIST) + if (bp->nb_vnbufs.le_next != NFSNOLIST) { LIST_REMOVE(bp, nb_vnbufs); + } LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs); lck_mtx_unlock(nfs_buf_mutex); } @@ -2547,14 +2705,14 @@ out: nfs_buf_release(bp, 1); /* check if we need to invalidate (and we can) */ if ((np->n_flag & NNEEDINVALIDATE) && - !(np->n_bflag & (NBINVALINPROG|NBFLUSHINPROG))) { + !(np->n_bflag & (NBINVALINPROG | NBFLUSHINPROG))) { int invalidate = 0; - nfs_lock(np, NFS_NODE_LOCK_FORCE); + nfs_node_lock_force(np); if (np->n_flag & NNEEDINVALIDATE) { invalidate = 1; np->n_flag &= ~NNEEDINVALIDATE; } - nfs_unlock(np); + nfs_node_unlock(np); if (invalidate) { /* * There was a write error and we need to @@ -2567,14 +2725,15 @@ out: * the buffer busy. So we call vinvalbuf() after * releasing the buffer. */ - nfs_vinvalbuf2(NFSTOV(np), V_SAVE|V_IGNORE_WRITEERR, thd, cred, 1); + nfs_vinvalbuf2(NFSTOV(np), V_SAVE | V_IGNORE_WRITEERR, thd, cred, 1); } } } - if (IS_VALID_CRED(cred)) + if (IS_VALID_CRED(cred)) { kauth_cred_unref(&cred); - return (error); + } + return error; } /* @@ -2585,8 +2744,7 @@ nfs_buf_write_finish(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred) { nfsnode_t np = bp->nb_np; int error = (bp->nb_flags & NB_ERROR) ? 
bp->nb_error : 0; - int firstpg, lastpg; - uint32_t pagemask; + off_t firstpg, lastpg; if ((error == EINTR) || (error == ERESTART)) { CLR(bp->nb_flags, NB_ERROR); @@ -2594,31 +2752,37 @@ nfs_buf_write_finish(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred) } if (!error) { + nfsbufpgs pagemask; /* calculate range of complete pages being written */ - firstpg = round_page_32(bp->nb_offio) / PAGE_SIZE; - lastpg = (trunc_page_32(bp->nb_endio) - 1) / PAGE_SIZE; - /* calculate mask for that page range written */ - pagemask = ((1 << (lastpg + 1)) - 1) & ~((1 << firstpg) - 1); + if (bp->nb_endio > bp->nb_offio) { + firstpg = bp->nb_offio / PAGE_SIZE; + lastpg = (bp->nb_endio - 1) / PAGE_SIZE; + /* calculate mask for that page range written */ + nfs_buf_pgs_set_pages_between(&pagemask, firstpg, lastpg + 1); + } else { + NBPGS_ERASE(&pagemask); + } /* clear dirty bits for pages we've written */ - bp->nb_dirty &= ~pagemask; + nfs_buf_pgs_bit_not(&pagemask); + nfs_buf_pgs_bit_and(&bp->nb_dirty, &pagemask, &bp->nb_dirty); } /* manage needcommit state */ if (!error && (bp->nb_commitlevel == NFS_WRITE_UNSTABLE)) { if (!ISSET(bp->nb_flags, NB_NEEDCOMMIT)) { - nfs_lock(np, NFS_NODE_LOCK_FORCE); + nfs_node_lock_force(np); np->n_needcommitcnt++; - nfs_unlock(np); + nfs_node_unlock(np); SET(bp->nb_flags, NB_NEEDCOMMIT); } /* make sure nb_dirtyoff/nb_dirtyend reflect actual range written */ bp->nb_dirtyoff = bp->nb_offio; bp->nb_dirtyend = bp->nb_endio; } else if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) { - nfs_lock(np, NFS_NODE_LOCK_FORCE); + nfs_node_lock_force(np); np->n_needcommitcnt--; CHECK_NEEDCOMMITCNT(np); - nfs_unlock(np); + nfs_node_unlock(np); CLR(bp->nb_flags, NB_NEEDCOMMIT); } @@ -2649,8 +2813,9 @@ nfs_buf_write_finish(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred) if (ISSET(bp->nb_flags, NB_ASYNC)) { /* move to dirty list */ lck_mtx_lock(nfs_buf_mutex); - if (bp->nb_vnbufs.le_next != NFSNOLIST) + if (bp->nb_vnbufs.le_next != NFSNOLIST) { LIST_REMOVE(bp, nb_vnbufs); + } LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs); lck_mtx_unlock(nfs_buf_mutex); } @@ -2667,18 +2832,19 @@ nfs_buf_write_finish(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred) * buffer busy. Set a flag to do it after releasing * the buffer. 
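Earlier in nfs_buf_write_finish() above, the dirty-page bookkeeping amounts to building a mask for the page range that was actually written and clearing those bits from the buffer's dirty bitmap. With a single machine word the same idea looks like this (illustrative; the real code uses the multi-word nfsbufpgs helpers, and this sketch assumes at most 64 pages per buffer):

#include <stdint.h>

/* Sketch: mask covering pages [first, last], bit i == page i. */
static uint64_t
page_mask(int first, int last)
{
    uint64_t hi = (last >= 63) ? ~0ULL : ((1ULL << (last + 1)) - 1);

    return hi & ~((1ULL << first) - 1);
}

/* Clear the dirty bits for a completed write of bytes [offio, endio). */
static uint64_t
clear_written(uint64_t dirty, int offio, int endio, int pagesize)
{
    int firstpg, lastpg;

    if (endio <= offio)
        return dirty;            /* nothing was written */
    firstpg = offio / pagesize;
    lastpg = (endio - 1) / pagesize;
    return dirty & ~page_mask(firstpg, lastpg);
}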
*/ - nfs_lock(np, NFS_NODE_LOCK_FORCE); + nfs_node_lock_force(np); np->n_error = error; np->n_flag |= (NWRITEERR | NNEEDINVALIDATE); NATTRINVALIDATE(np); - nfs_unlock(np); + nfs_node_unlock(np); } /* clear the dirty range */ bp->nb_dirtyoff = bp->nb_dirtyend = 0; } - if (!error && bp->nb_dirty) + if (!error && nfs_buf_pgs_is_set(&bp->nb_dirty)) { nfs_buf_write_dirty_pages(bp, thd, cred); + } nfs_buf_iodone(bp); } @@ -2695,57 +2861,55 @@ nfs_buf_write_dirty_pages(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred) nfsnode_t np = bp->nb_np; struct nfsmount *nmp = NFSTONMP(np); int error = 0, commit, iomode, iomode2, len, pg, count, npages, off; - uint32_t dirty = bp->nb_dirty; + nfsbufpgs dirty; uint64_t wverf; - struct uio uio; - struct iovec_32 io; + uio_t auio; + char uio_buf[UIO_SIZEOF(1)]; - if (!bp->nb_dirty) - return (0); + if (!nfs_buf_pgs_is_set(&bp->nb_dirty)) { + return 0; + } /* there are pages marked dirty that need to be written out */ - OSAddAtomic(1, (SInt32 *)&nfsstats.write_bios); + OSAddAtomic64(1, &nfsstats.write_bios); NFS_BUF_MAP(bp); SET(bp->nb_flags, NB_WRITEINPROG); npages = bp->nb_bufsize / PAGE_SIZE; iomode = NFS_WRITE_UNSTABLE; - uio.uio_iovs.iov32p = &io; - uio.uio_iovcnt = 1; - uio.uio_rw = UIO_WRITE; -#if 1 /* LP64todo - can't use new segment flags until the drivers are ready */ - uio.uio_segflg = UIO_SYSSPACE; -#else - uio.uio_segflg = UIO_SYSSPACE32; -#endif + auio = uio_createwithbuffer(1, 0, UIO_SYSSPACE, UIO_WRITE, + &uio_buf, sizeof(uio_buf)); again: - dirty = bp->nb_dirty; + NBPGS_COPY(&dirty, &bp->nb_dirty); wverf = bp->nb_verf; commit = NFS_WRITE_FILESYNC; for (pg = 0; pg < npages; pg++) { - if (!NBPGDIRTY(bp, pg)) + if (!NBPGDIRTY(bp, pg)) { continue; + } count = 1; - while (((pg + count) < npages) && NBPGDIRTY(bp, pg + count)) + while (((pg + count) < npages) && NBPGDIRTY(bp, pg + count)) { count++; + } /* write count pages starting with page pg */ off = pg * PAGE_SIZE; len = count * PAGE_SIZE; /* clip writes to EOF */ - if (NBOFF(bp) + off + len > (off_t) np->n_size) + if (NBOFF(bp) + off + len > (off_t) np->n_size) { len -= (NBOFF(bp) + off + len) - np->n_size; + } if (len > 0) { iomode2 = iomode; - io.iov_len = len; - uio_uio_resid_set(&uio, io.iov_len); - uio.uio_offset = NBOFF(bp) + off; - io.iov_base = (uintptr_t) bp->nb_data + off; - error = nfs_write_rpc2(np, &uio, thd, cred, &iomode2, &bp->nb_verf); - if (error) + uio_reset(auio, NBOFF(bp) + off, UIO_SYSSPACE, UIO_WRITE); + uio_addiov(auio, CAST_USER_ADDR_T(bp->nb_data + off), len); + error = nfs_write_rpc2(np, auio, thd, cred, &iomode2, &bp->nb_verf); + if (error) { break; - if (iomode2 < commit) /* Retain the lowest commitment level returned. */ + } + if (iomode2 < commit) { /* Retain the lowest commitment level returned. 
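nfs_buf_write_dirty_pages() above walks the buffer's dirty-page bitmap and issues one write per contiguous run of dirty pages, clipping the last run to EOF. A self-contained sketch of that walk (single-word bitmap, assumes at most 64 pages; printf stands in for the write RPC):

#include <stdint.h>
#include <stdio.h>

static void
walk_dirty_runs(uint64_t dirty, int npages, int pagesize,
    long long boff, long long file_size)
{
    for (int pg = 0; pg < npages; pg++) {
        if (!(dirty & (1ULL << pg)))
            continue;
        int count = 1;
        while (pg + count < npages && (dirty & (1ULL << (pg + count))))
            count++;
        long long off = (long long)pg * pagesize;   /* offset within buffer */
        long long len = (long long)count * pagesize;
        if (boff + off + len > file_size)           /* clip the run to EOF */
            len -= (boff + off + len) - file_size;
        if (len > 0)
            printf("write %lld bytes at file offset %lld\n", len, boff + off);
        pg += count - 1;                            /* skip past this run */
    }
}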
*/ commit = iomode2; + } if ((commit != NFS_WRITE_FILESYNC) && (wverf != bp->nb_verf)) { /* verifier changed, redo all the writes filesync */ iomode = NFS_WRITE_FILESYNC; @@ -2754,15 +2918,16 @@ again: } /* clear dirty bits */ while (count--) { - dirty &= ~(1 << pg); - if (count) /* leave pg on last page */ + NBPGS_UNSET(&dirty, pg); + if (count) { /* leave pg on last page */ pg++; + } } } CLR(bp->nb_flags, NB_WRITEINPROG); if (!error && (commit != NFS_WRITE_FILESYNC)) { - error = nmp->nm_funcs->nf_commit_rpc(np, NBOFF(bp), bp->nb_bufsize, cred); + error = nmp->nm_funcs->nf_commit_rpc(np, NBOFF(bp), bp->nb_bufsize, cred, wverf); if (error == NFSERR_STALEWRITEVERF) { /* verifier changed, so we need to restart all the writes */ iomode = NFS_WRITE_FILESYNC; @@ -2770,12 +2935,12 @@ again: } } if (!error) { - bp->nb_dirty = dirty; + NBPGS_COPY(&bp->nb_dirty, &dirty); } else { SET(bp->nb_flags, NB_ERROR); bp->nb_error = error; } - return (error); + return error; } /* @@ -2787,18 +2952,21 @@ nfs_buf_write_rpc(struct nfsbuf *bp, int iomode, thread_t thd, kauth_cred_t cred struct nfsmount *nmp; nfsnode_t np = bp->nb_np; int error = 0, nfsvers, async; - int offset, length, nmwsize, nrpcs, len; + int64_t nrpcs; + size_t len; + uint32_t nmwsize; struct nfsreq *req; struct nfsreq_cbinfo cb; - struct uio uio; - struct iovec_32 io; + uio_t auio; + char uio_buf[UIO_SIZEOF(1)]; + off_t offset, length; nmp = NFSTONMP(np); - if (!nmp) { + if (nfs_mount_gone(nmp)) { bp->nb_error = error = ENXIO; SET(bp->nb_flags, NB_ERROR); nfs_buf_iodone(bp); - return (error); + return error; } nfsvers = nmp->nm_vers; nmwsize = nmp->nm_wsize; @@ -2816,21 +2984,23 @@ nfs_buf_write_rpc(struct nfsbuf *bp, int iomode, thread_t thd, kauth_cred_t cred bp->nb_error = error = EFBIG; SET(bp->nb_flags, NB_ERROR); nfs_buf_iodone(bp); - return (error); + return error; } - uio.uio_iovs.iov32p = &io; - uio.uio_iovcnt = 1; - uio.uio_rw = UIO_WRITE; -#if 1 /* LP64todo - can't use new segment flags until the drivers are ready */ - uio.uio_segflg = UIO_SYSSPACE; + if (length == 0) { + /* We should never get here */ +#if DEVELOPMENT + printf("nfs_buf_write_rpc: Got request with zero length. np %p, bp %p, offset %lld\n", np, bp, offset); #else - uio.uio_segflg = UIO_SYSSPACE32; -#endif - io.iov_len = length; - uio_uio_resid_set(&uio, io.iov_len); - uio.uio_offset = NBOFF(bp) + offset; - io.iov_base = (uintptr_t) bp->nb_data + offset; + printf("nfs_buf_write_rpc: Got request with zero length.\n"); +#endif /* DEVELOPMENT */ + nfs_buf_iodone(bp); + return 0; + } + + auio = uio_createwithbuffer(1, NBOFF(bp) + offset, UIO_SYSSPACE, + UIO_WRITE, &uio_buf, sizeof(uio_buf)); + NFS_UIO_ADDIOV(auio, CAST_USER_ADDR_T(bp->nb_data + offset), length); bp->nb_rpcs = nrpcs = (length + nmwsize - 1) / nmwsize; if (async && (nrpcs > 1)) { @@ -2844,23 +3014,31 @@ nfs_buf_write_rpc(struct nfsbuf *bp, int iomode, thread_t thd, kauth_cred_t cred error = bp->nb_error; break; } - len = (length > nmwsize) ? nmwsize : length; - cb.rcb_args[0] = offset; - cb.rcb_args[1] = len; - if (async && ((error = nfs_async_write_start(nmp)))) + len = (length > nmwsize) ? 
nmwsize : (uint32_t)length; + cb.rcb_args.offset = offset; + cb.rcb_args.length = len; +#if CONFIG_NFS4 + if (nmp->nm_vers >= NFS_VER4) { + cb.rcb_args.stategenid = nmp->nm_stategenid; + } +#endif + if (async && ((error = nfs_async_write_start(nmp)))) { break; + } req = NULL; - error = nmp->nm_funcs->nf_write_rpc_async(np, &uio, len, thd, cred, - iomode, &cb, &req); + error = nmp->nm_funcs->nf_write_rpc_async(np, auio, len, thd, cred, + iomode, &cb, &req); if (error) { - if (async) + if (async) { nfs_async_write_done(nmp); + } break; } offset += len; length -= len; - if (async) + if (async) { continue; + } nfs_buf_write_rpc_finish(req); } @@ -2881,17 +3059,22 @@ nfs_buf_write_rpc(struct nfsbuf *bp, int iomode, thread_t thd, kauth_cred_t cred nfs_buf_write_finish(bp, thd, cred); } else { /* wait for the last RPC to mark it done */ - while (bp->nb_rpcs > 0) + while (bp->nb_rpcs > 0) { msleep(&bp->nb_rpcs, nfs_buf_mutex, 0, - "nfs_buf_write_rpc_cancel", NULL); + "nfs_buf_write_rpc_cancel", NULL); + } lck_mtx_unlock(nfs_buf_mutex); } } else { nfs_buf_write_finish(bp, thd, cred); } + /* It may have just been an interrupt... that's OK */ + if (!ISSET(bp->nb_flags, NB_ERROR)) { + error = 0; + } } - return (error); + return error; } /* @@ -2900,10 +3083,11 @@ nfs_buf_write_rpc(struct nfsbuf *bp, int iomode, thread_t thd, kauth_cred_t cred void nfs_buf_write_rpc_finish(struct nfsreq *req) { - int error = 0, nfsvers, offset, length, multasyncrpc, finished; + int error = 0, nfsvers, multasyncrpc, finished; int committed = NFS_WRITE_FILESYNC; uint64_t wverf = 0; - size_t rlen; + off_t offset; + size_t rlen, length; void *wakeme = NULL; struct nfsreq_cbinfo cb; struct nfsreq *wreq = NULL; @@ -2912,20 +3096,24 @@ nfs_buf_write_rpc_finish(struct nfsreq *req) nfsnode_t np; thread_t thd; kauth_cred_t cred; - struct uio uio; - struct iovec_32 io; + uio_t auio; + char uio_buf[UIO_SIZEOF(1)]; finish: np = req->r_np; thd = req->r_thread; cred = req->r_cred; - if (IS_VALID_CRED(cred)) + if (IS_VALID_CRED(cred)) { kauth_cred_ref(cred); + } cb = req->r_callback; bp = cb.rcb_bp; + if (cb.rcb_func) { /* take an extra reference on the nfsreq in case we want to resend it later due to grace error */ + nfs_request_ref(req, 0); + } nmp = NFSTONMP(np); - if (!nmp) { + if (nfs_mount_gone(nmp)) { SET(bp->nb_flags, NB_ERROR); bp->nb_error = error = ENXIO; } @@ -2936,24 +3124,76 @@ finish: } nfsvers = nmp->nm_vers; - offset = cb.rcb_args[0]; - rlen = length = cb.rcb_args[1]; + offset = cb.rcb_args.offset; + rlen = length = cb.rcb_args.length; /* finish the RPC */ error = nmp->nm_funcs->nf_write_rpc_async_finish(np, req, &committed, &rlen, &wverf); if ((error == EINPROGRESS) && cb.rcb_func) { /* async request restarted */ - if (IS_VALID_CRED(cred)) + if (cb.rcb_func) { + nfs_request_rele(req); + } + if (IS_VALID_CRED(cred)) { kauth_cred_unref(&cred); + } return; } - +#if CONFIG_NFS4 + if ((nmp->nm_vers >= NFS_VER4) && nfs_mount_state_error_should_restart(error) && !ISSET(bp->nb_flags, NB_ERROR)) { + lck_mtx_lock(&nmp->nm_lock); + if ((error != NFSERR_OLD_STATEID) && (error != NFSERR_GRACE) && (cb.rcb_args.stategenid == nmp->nm_stategenid)) { + NP(np, "nfs_buf_write_rpc_finish: error %d @ 0x%llx, 0x%x 0x%x, initiating recovery", + error, NBOFF(bp) + offset, cb.rcb_args.stategenid, nmp->nm_stategenid); + nfs_need_recover(nmp, error); + } + lck_mtx_unlock(&nmp->nm_lock); + if (np->n_flag & NREVOKE) { + error = EIO; + } else { + if (error == NFSERR_GRACE) { + if (cb.rcb_func) { + /* + * For an async I/O request, handle a grace 
delay just like + * jukebox errors. Set the resend time and queue it up. + */ + struct timeval now; + if (req->r_nmrep.nmc_mhead) { + mbuf_freem(req->r_nmrep.nmc_mhead); + req->r_nmrep.nmc_mhead = NULL; + } + req->r_error = 0; + microuptime(&now); + lck_mtx_lock(&req->r_mtx); + req->r_resendtime = now.tv_sec + 2; + req->r_xid = 0; // get a new XID + req->r_flags |= R_RESTART; + req->r_start = 0; + nfs_asyncio_resend(req); + lck_mtx_unlock(&req->r_mtx); + if (IS_VALID_CRED(cred)) { + kauth_cred_unref(&cred); + } + /* Note: nfsreq reference taken will be dropped later when finished */ + return; + } + /* otherwise, just pause a couple seconds and retry */ + tsleep(&nmp->nm_state, (PZERO - 1), "nfsgrace", 2 * hz); + } + if (!(error = nfs_mount_state_wait_for_recovery(nmp))) { + rlen = 0; + goto writeagain; + } + } + } +#endif if (error) { SET(bp->nb_flags, NB_ERROR); bp->nb_error = error; } - if (error || (nfsvers == NFS_VER2)) + if (error || (nfsvers == NFS_VER2)) { goto out; + } if (rlen <= 0) { SET(bp->nb_flags, NB_ERROR); bp->nb_error = error = EIO; @@ -2961,8 +3201,9 @@ finish: } /* save lowest commit level returned */ - if (committed < bp->nb_commitlevel) + if (committed < bp->nb_commitlevel) { bp->nb_commitlevel = committed; + } /* check the write verifier */ if (!bp->nb_verf) { @@ -2974,6 +3215,10 @@ finish: bp->nb_verf = wverf; } + if (!ISSET(bp->nb_flags, NB_STALEWVERF) && rlen > 0 && (bp->nb_offio < (offset + (int)rlen))) { + bp->nb_offio = offset + rlen; + } + /* * check for a short write * @@ -2981,36 +3226,38 @@ finish: * need to issue another write for the rest of it. * (Don't bother if the buffer hit an error or stale wverf.) */ - if (((int)rlen < length) && !(bp->nb_flags & (NB_STALEWVERF|NB_ERROR))) { + if ((rlen < length) && !(bp->nb_flags & (NB_STALEWVERF | NB_ERROR))) { +#if CONFIG_NFS4 +writeagain: +#endif offset += rlen; length -= rlen; - uio.uio_iovs.iov32p = &io; - uio.uio_iovcnt = 1; - uio.uio_rw = UIO_WRITE; -#if 1 /* LP64todo - can't use new segment flags until the drivers are ready */ - uio.uio_segflg = UIO_SYSSPACE; -#else - uio.uio_segflg = UIO_SYSSPACE32; -#endif - io.iov_len = length; - uio_uio_resid_set(&uio, io.iov_len); - uio.uio_offset = NBOFF(bp) + offset; - io.iov_base = (uintptr_t) bp->nb_data + offset; - - cb.rcb_args[0] = offset; - cb.rcb_args[1] = length; + auio = uio_createwithbuffer(1, NBOFF(bp) + offset, UIO_SYSSPACE, + UIO_WRITE, &uio_buf, sizeof(uio_buf)); + uio_addiov(auio, CAST_USER_ADDR_T(bp->nb_data + offset), length); - error = nmp->nm_funcs->nf_write_rpc_async(np, &uio, length, thd, cred, - NFS_WRITE_FILESYNC, &cb, &wreq); + cb.rcb_args.offset = offset; + cb.rcb_args.length = length; +#if CONFIG_NFS4 + if (nmp->nm_vers >= NFS_VER4) { + cb.rcb_args.stategenid = nmp->nm_stategenid; + } +#endif + // XXX iomode should really match the original request + error = nmp->nm_funcs->nf_write_rpc_async(np, auio, length, thd, cred, + NFS_WRITE_FILESYNC, &cb, &wreq); if (!error) { - if (IS_VALID_CRED(cred)) + if (IS_VALID_CRED(cred)) { kauth_cred_unref(&cred); + } if (!cb.rcb_func) { /* if !async we'll need to wait for this RPC to finish */ req = wreq; + wreq = NULL; goto finish; } + nfs_request_rele(req); /* * We're done here. * Outstanding RPC count is unchanged. @@ -3023,8 +3270,10 @@ finish: } out: - if (cb.rcb_func) + if (cb.rcb_func) { nfs_async_write_done(nmp); + nfs_request_rele(req); + } /* * Decrement outstanding RPC count on buffer * and call nfs_buf_write_finish on last RPC. 
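The write RPC path above splits a buffer's dirty range into RPCs no larger than the mount's write size and, when the server returns a short write, reissues the remainder from the new offset. A synchronous user-space sketch of that chunking (hypothetical transport callback; the kernel version is asynchronous and also rechecks the write verifier):

#include <stddef.h>

/* Hypothetical transport: writes up to 'len' bytes, returns bytes accepted. */
typedef size_t (*write_fn)(long long off, const char *data, size_t len);

static int
write_in_chunks(write_fn rpc, const char *data, long long off,
    size_t length, size_t wsize)
{
    while (length > 0) {
        size_t len = (length > wsize) ? wsize : length;
        size_t rlen = rpc(off, data, len);

        if (rlen == 0)
            return -1;      /* treat as an I/O error, as the kernel does */
        /* short write: advance by what was accepted and send the rest */
        off += rlen;
        data += rlen;
        length -= rlen;
    }
    return 0;
}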
@@ -3034,40 +3283,50 @@ out: * aborting a partially-initiated set of RPCs) */ multasyncrpc = ISSET(bp->nb_flags, NB_MULTASYNCRPC); - if (multasyncrpc) + if (multasyncrpc) { lck_mtx_lock(nfs_buf_mutex); + } bp->nb_rpcs--; finished = (bp->nb_rpcs == 0); - if (multasyncrpc) + if (multasyncrpc) { lck_mtx_unlock(nfs_buf_mutex); + } if (finished) { - if (multasyncrpc) + if (multasyncrpc) { wakeme = &bp->nb_rpcs; + } nfs_buf_write_finish(bp, thd, cred); - if (wakeme) + if (wakeme) { wakeup(wakeme); + } } - if (IS_VALID_CRED(cred)) + if (IS_VALID_CRED(cred)) { kauth_cred_unref(&cred); + } + + if (cb.rcb_func && np->n_needcommitcnt >= NFS_A_LOT_OF_NEEDCOMMITS) { + nfs_flushcommits(np, 1); + } } /* - * Send commit(s) for the given node's "needcommit" buffers + * Send commit(s) for the given node's "needcommit" buffers */ int nfs_flushcommits(nfsnode_t np, int nowait) { struct nfsmount *nmp; - struct nfsbuf *bp; + struct nfsbuf *bp, *prevlbp, *lbp; struct nfsbuflists blist, commitlist; - int error = 0, retv, wcred_set, flags, dirty; + int error = 0, retv, wcred_set, flags; u_quad_t off, endoff, toff; - u_int32_t count; + uint64_t wverf, count; kauth_cred_t wcred = NULL; + nfsbufpgs dirty; FSDBG_TOP(557, np, 0, 0, 0); @@ -3078,11 +3337,12 @@ nfs_flushcommits(nfsnode_t np, int nowait) * and the commit rpc is done. */ if (!LIST_EMPTY(&np->n_dirtyblkhd)) { - error = nfs_lock(np, NFS_NODE_LOCK_EXCLUSIVE); - if (error) + error = nfs_node_lock(np); + if (error) { goto done; + } np->n_flag |= NMODIFIED; - nfs_unlock(np); + nfs_node_unlock(np); } off = (u_quad_t)-1; @@ -3091,7 +3351,7 @@ nfs_flushcommits(nfsnode_t np, int nowait) LIST_INIT(&commitlist); nmp = NFSTONMP(np); - if (!nmp) { + if (nfs_mount_gone(nmp)) { error = ENXIO; goto done; } @@ -3101,57 +3361,34 @@ nfs_flushcommits(nfsnode_t np, int nowait) } flags = NBI_DIRTY; - if (nowait) + if (nowait) { flags |= NBI_NOWAIT; + } lck_mtx_lock(nfs_buf_mutex); + wverf = nmp->nm_verf; if (!nfs_buf_iterprepare(np, &blist, flags)) { while ((bp = LIST_FIRST(&blist))) { LIST_REMOVE(bp, nb_vnbufs); LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs); error = nfs_buf_acquire(bp, NBAC_NOWAIT, 0, 0); - if (error) + if (error) { continue; - if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) + } + if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) { nfs_buf_check_write_verifier(np, bp); - if (((bp->nb_flags & (NB_DELWRI | NB_NEEDCOMMIT)) - != (NB_DELWRI | NB_NEEDCOMMIT))) { + } + if (((bp->nb_flags & (NB_DELWRI | NB_NEEDCOMMIT)) != (NB_DELWRI | NB_NEEDCOMMIT)) || + (bp->nb_verf != wverf)) { nfs_buf_drop(bp); continue; } nfs_buf_remfree(bp); - lck_mtx_unlock(nfs_buf_mutex); - /* - * we need a upl to see if the page has been - * dirtied (think mmap) since the unstable write, and - * also to prevent vm from paging it during our commit rpc - */ - if (!ISSET(bp->nb_flags, NB_PAGELIST)) { - retv = nfs_buf_upl_setup(bp); - if (retv) { - /* unable to create upl */ - /* vm object must no longer exist */ - /* this could be fatal if we need */ - /* to write the data again, we'll see... */ - printf("nfs_flushcommits: upl create failed %d\n", retv); - bp->nb_valid = bp->nb_dirty = 0; - } - } - nfs_buf_upl_check(bp); - lck_mtx_lock(nfs_buf_mutex); + + /* buffer UPLs will be grabbed *in order* below */ FSDBG(557, bp, bp->nb_flags, bp->nb_valid, bp->nb_dirty); FSDBG(557, bp->nb_validoff, bp->nb_validend, - bp->nb_dirtyoff, bp->nb_dirtyend); - - /* - * We used to check for dirty pages here; if there were any - * we'd abort the commit and force the entire buffer to be - * written again. 
- * - * Instead of doing that, we now go ahead and commit the dirty - * range, and then leave the buffer around with dirty pages - * that will be written out later. - */ + bp->nb_dirtyoff, bp->nb_dirtyend); /* * Work out if all buffers are using the same cred @@ -3162,8 +3399,9 @@ nfs_flushcommits(nfsnode_t np, int nowait) */ if (wcred_set == 0) { wcred = bp->nb_wcred; - if (!IS_VALID_CRED(wcred)) + if (!IS_VALID_CRED(wcred)) { panic("nfs: needcommit w/out wcred"); + } wcred_set = 1; } else if ((wcred_set == 1) && wcred != bp->nb_wcred) { wcred_set = -1; @@ -3171,20 +3409,33 @@ nfs_flushcommits(nfsnode_t np, int nowait) SET(bp->nb_flags, NB_WRITEINPROG); /* - * A list of these buffers is kept so that the - * second loop knows which buffers have actually - * been committed. This is necessary, since there - * may be a race between the commit rpc and new - * uncommitted writes on the file. + * Add this buffer to the list of buffers we are committing. + * Buffers are inserted into the list in ascending order so that + * we can take the UPLs in order after the list is complete. */ + prevlbp = NULL; + LIST_FOREACH(lbp, &commitlist, nb_vnbufs) { + if (bp->nb_lblkno < lbp->nb_lblkno) { + break; + } + prevlbp = lbp; + } LIST_REMOVE(bp, nb_vnbufs); - LIST_INSERT_HEAD(&commitlist, bp, nb_vnbufs); + if (prevlbp) { + LIST_INSERT_AFTER(prevlbp, bp, nb_vnbufs); + } else { + LIST_INSERT_HEAD(&commitlist, bp, nb_vnbufs); + } + + /* update commit range start, end */ toff = NBOFF(bp) + bp->nb_dirtyoff; - if (toff < off) + if (toff < off) { off = toff; + } toff += (u_quad_t)(bp->nb_dirtyend - bp->nb_dirtyoff); - if (toff > endoff) + if (toff > endoff) { endoff = toff; + } } nfs_buf_itercomplete(np, &blist, NBI_DIRTY); } @@ -3195,6 +3446,29 @@ nfs_flushcommits(nfsnode_t np, int nowait) goto done; } + /* + * We need a UPL to prevent others from accessing the buffers during + * our commit RPC(s). + * + * We used to also check for dirty pages here; if there were any we'd + * abort the commit and force the entire buffer to be written again. + * Instead of doing that, we just go ahead and commit the dirty range, + * and then leave the buffer around with dirty pages that will be + * written out later. + */ + LIST_FOREACH(bp, &commitlist, nb_vnbufs) { + if (!ISSET(bp->nb_flags, NB_PAGELIST)) { + retv = nfs_buf_upl_setup(bp); + if (retv) { + /* Unable to create the UPL, the VM object probably no longer exists. */ + printf("nfs_flushcommits: upl create failed %d\n", retv); + NBPGS_ERASE(&bp->nb_valid); + NBPGS_ERASE(&bp->nb_dirty); + } + } + nfs_buf_upl_check(bp); + } + /* * Commit data on the server, as required. * If all bufs are using the same wcred, then use that with @@ -3206,19 +3480,21 @@ nfs_flushcommits(nfsnode_t np, int nowait) * Note, it's possible the commit range could be >2^32-1. * If it is, we'll send one commit that covers the whole file. 
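The off/endoff bookkeeping above collapses every needcommit buffer's dirty range into one COMMIT window; the code that follows falls back to a count of 0 (commit the whole file) when that window does not fit in 32 bits. A small sketch of the aggregation:

#include <stdint.h>

struct drange {
    uint64_t off;
    uint64_t len;
};

/* Sketch: fold n dirty ranges into a single COMMIT (offset, count). */
static void
commit_range(const struct drange *r, int n, uint64_t *off, uint64_t *count)
{
    uint64_t lo = UINT64_MAX, hi = 0;

    if (n == 0) {
        *off = *count = 0;
        return;
    }
    for (int i = 0; i < n; i++) {
        if (r[i].off < lo)
            lo = r[i].off;
        if (r[i].off + r[i].len > hi)
            hi = r[i].off + r[i].len;
    }
    *off = lo;
    *count = ((hi - lo) > 0xffffffffULL) ? 0 : (hi - lo);
}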
*/ - if ((endoff - off) > 0xffffffff) + if ((endoff - off) > 0xffffffff) { count = 0; - else + } else { count = (endoff - off); - retv = nmp->nm_funcs->nf_commit_rpc(np, off, count, wcred); + } + retv = nmp->nm_funcs->nf_commit_rpc(np, off, count, wcred, wverf); } else { retv = 0; LIST_FOREACH(bp, &commitlist, nb_vnbufs) { toff = NBOFF(bp) + bp->nb_dirtyoff; count = bp->nb_dirtyend - bp->nb_dirtyoff; - retv = nmp->nm_funcs->nf_commit_rpc(np, toff, count, bp->nb_wcred); - if (retv) + retv = nmp->nm_funcs->nf_commit_rpc(np, toff, count, bp->nb_wcred, wverf); + if (retv) { break; + } } } @@ -3230,11 +3506,11 @@ nfs_flushcommits(nfsnode_t np, int nowait) while ((bp = LIST_FIRST(&commitlist))) { LIST_REMOVE(bp, nb_vnbufs); FSDBG(557, bp, retv, bp->nb_flags, bp->nb_dirty); - nfs_lock(np, NFS_NODE_LOCK_FORCE); + nfs_node_lock_force(np); CLR(bp->nb_flags, (NB_NEEDCOMMIT | NB_WRITEINPROG)); np->n_needcommitcnt--; CHECK_NEEDCOMMITCNT(np); - nfs_unlock(np); + nfs_node_unlock(np); if (retv) { /* move back to dirty list */ @@ -3245,6 +3521,9 @@ nfs_flushcommits(nfsnode_t np, int nowait) continue; } + nfs_node_lock_force(np); + np->n_numoutput++; + nfs_node_unlock(np); vnode_startwrite(NFSTOV(np)); if (ISSET(bp->nb_flags, NB_DELWRI)) { lck_mtx_lock(nfs_buf_mutex); @@ -3253,13 +3532,15 @@ nfs_flushcommits(nfsnode_t np, int nowait) lck_mtx_unlock(nfs_buf_mutex); wakeup(&nfs_nbdwrite); } - CLR(bp->nb_flags, (NB_READ|NB_DONE|NB_ERROR|NB_DELWRI)); + CLR(bp->nb_flags, (NB_READ | NB_DONE | NB_ERROR | NB_DELWRI)); /* if block still has dirty pages, we don't want it to */ /* be released in nfs_buf_iodone(). So, don't set NB_ASYNC. */ - if (!(dirty = bp->nb_dirty)) + NBPGS_COPY(&dirty, &bp->nb_dirty); + if (!nfs_buf_pgs_is_set(&dirty)) { SET(bp->nb_flags, NB_ASYNC); - else + } else { CLR(bp->nb_flags, NB_ASYNC); + } /* move to clean list */ lck_mtx_lock(nfs_buf_mutex); @@ -3269,7 +3550,7 @@ nfs_flushcommits(nfsnode_t np, int nowait) bp->nb_dirtyoff = bp->nb_dirtyend = 0; nfs_buf_iodone(bp); - if (dirty) { + if (nfs_buf_pgs_is_set(&dirty)) { /* throw it back in as a delayed write buffer */ CLR(bp->nb_flags, NB_DONE); nfs_buf_write_delayed(bp); @@ -3278,12 +3559,12 @@ nfs_flushcommits(nfsnode_t np, int nowait) done: FSDBG_BOT(557, np, 0, 0, error); - return (error); + return error; } /* * Flush all the blocks associated with a vnode. - * Walk through the buffer pool and push any dirty pages + * Walk through the buffer pool and push any dirty pages * associated with the vnode. */ int @@ -3297,25 +3578,27 @@ nfs_flush(nfsnode_t np, int waitfor, thread_t thd, int ignore_writeerr) FSDBG_TOP(517, np, waitfor, ignore_writeerr, 0); - if (!nmp) { + if (nfs_mount_gone(nmp)) { error = ENXIO; goto out; } nfsvers = nmp->nm_vers; - if (nmp->nm_flag & NFSMNT_INT) + if (NMFLAG(nmp, INTR)) { slpflag = PCATCH; + } if (!LIST_EMPTY(&np->n_dirtyblkhd)) { - nfs_lock(np, NFS_NODE_LOCK_FORCE); + nfs_node_lock_force(np); np->n_flag |= NMODIFIED; - nfs_unlock(np); + nfs_node_unlock(np); } lck_mtx_lock(nfs_buf_mutex); while (np->n_bflag & NBFLUSHINPROG) { np->n_bflag |= NBFLUSHWANT; error = msleep(&np->n_bflag, nfs_buf_mutex, slpflag, "nfs_flush", NULL); - if (error) { + if ((error && (error != EWOULDBLOCK)) || + ((error = nfs_sigintr(NFSTONMP(np), NULL, thd, 0)))) { lck_mtx_unlock(nfs_buf_mutex); goto out; } @@ -3342,18 +3625,21 @@ again: while ((bp = LIST_FIRST(&blist))) { LIST_REMOVE(bp, nb_vnbufs); LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs); - flags = (passone || (waitfor != MNT_WAIT)) ? 
NBAC_NOWAIT : 0; - if (flags != NBAC_NOWAIT) + flags = (passone || !(waitfor == MNT_WAIT || waitfor == MNT_DWAIT)) ? NBAC_NOWAIT : 0; + if (flags != NBAC_NOWAIT) { nfs_buf_refget(bp); + } while ((error = nfs_buf_acquire(bp, flags, slpflag, slptimeo))) { FSDBG(524, bp, flags, bp->nb_lflags, bp->nb_flags); - if (error == EBUSY) + if (error == EBUSY) { break; + } if (error) { error2 = nfs_sigintr(NFSTONMP(np), NULL, thd, 0); if (error2) { - if (flags != NBAC_NOWAIT) + if (flags != NBAC_NOWAIT) { nfs_buf_refrele(bp); + } nfs_buf_itercomplete(np, &blist, NBI_DIRTY); lck_mtx_unlock(nfs_buf_mutex); error = error2; @@ -3365,24 +3651,27 @@ again: } } } - if (flags != NBAC_NOWAIT) + if (flags != NBAC_NOWAIT) { nfs_buf_refrele(bp); - if (error == EBUSY) + } + if (error == EBUSY) { continue; + } if (!bp->nb_np) { /* buffer is no longer valid */ nfs_buf_drop(bp); continue; } - if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) + if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) { nfs_buf_check_write_verifier(np, bp); + } if (!ISSET(bp->nb_flags, NB_DELWRI)) { /* buffer is no longer dirty */ nfs_buf_drop(bp); continue; } FSDBG(525, bp, passone, bp->nb_lflags, bp->nb_flags); - if ((passone || (waitfor != MNT_WAIT)) && + if ((passone || !(waitfor == MNT_WAIT || waitfor == MNT_DWAIT)) && ISSET(bp->nb_flags, NB_NEEDCOMMIT)) { nfs_buf_drop(bp); continue; @@ -3390,10 +3679,10 @@ again: nfs_buf_remfree(bp); lck_mtx_unlock(nfs_buf_mutex); if (ISSET(bp->nb_flags, NB_ERROR)) { - nfs_lock(np, NFS_NODE_LOCK_FORCE); + nfs_node_lock_force(np); np->n_error = bp->nb_error ? bp->nb_error : EIO; np->n_flag |= NWRITEERR; - nfs_unlock(np); + nfs_node_unlock(np); nfs_buf_release(bp, 1); lck_mtx_lock(nfs_buf_mutex); continue; @@ -3410,11 +3699,11 @@ again: } lck_mtx_unlock(nfs_buf_mutex); - if (waitfor == MNT_WAIT) { - while ((error = vnode_waitforwrites(NFSTOV(np), 0, slpflag, slptimeo, "nfsflush"))) { - error2 = nfs_sigintr(NFSTONMP(np), NULL, thd, 0); + if (waitfor == MNT_WAIT || waitfor == MNT_DWAIT) { + while ((error = vnode_waitforwrites(NFSTOV(np), 0, slpflag, slptimeo, "nfsflush"))) { + error2 = nfs_sigintr(NFSTONMP(np), NULL, thd, 0); if (error2) { - error = error2; + error = error2; goto done; } if (slpflag == PCATCH) { @@ -3427,38 +3716,56 @@ again: if (nfsvers != NFS_VER2) { /* loop while it looks like there are still buffers to be */ /* commited and nfs_flushcommits() seems to be handling them. */ - while (np->n_needcommitcnt) - if (nfs_flushcommits(np, 0)) + while (np->n_needcommitcnt) { + if (nfs_flushcommits(np, 0)) { break; + } + } } if (passone) { passone = 0; if (!LIST_EMPTY(&np->n_dirtyblkhd)) { - nfs_lock(np, NFS_NODE_LOCK_FORCE); + nfs_node_lock_force(np); np->n_flag |= NMODIFIED; - nfs_unlock(np); + nfs_node_unlock(np); } lck_mtx_lock(nfs_buf_mutex); goto again; } - if (waitfor == MNT_WAIT) { + if (waitfor == MNT_WAIT || waitfor == MNT_DWAIT) { if (!LIST_EMPTY(&np->n_dirtyblkhd)) { - nfs_lock(np, NFS_NODE_LOCK_FORCE); + nfs_node_lock_force(np); np->n_flag |= NMODIFIED; - nfs_unlock(np); + nfs_node_unlock(np); } lck_mtx_lock(nfs_buf_mutex); - if (!LIST_EMPTY(&np->n_dirtyblkhd)) + if (!LIST_EMPTY(&np->n_dirtyblkhd)) { goto again; + } lck_mtx_unlock(nfs_buf_mutex); - nfs_lock(np, NFS_NODE_LOCK_FORCE); - /* if we have no dirty blocks, we can clear the modified flag */ - if (!np->n_wrbusy) + nfs_node_lock_force(np); + /* + * OK, it looks like there are no dirty blocks. If we have no + * writes in flight and no one in the write code, we can clear + * the modified flag. 
In order to make sure we see the latest + * attributes and size, we also invalidate the attributes and + * advance the attribute cache XID to guarantee that attributes + * newer than our clearing of NMODIFIED will get loaded next. + * (If we don't do this, it's possible for the flush's final + * write/commit (xid1) to be executed in parallel with a subsequent + * getattr request (xid2). The getattr could return attributes + * from *before* the write/commit completed but the stale attributes + * would be preferred because of the xid ordering.) + */ + if (!np->n_wrbusy && !np->n_numoutput) { np->n_flag &= ~NMODIFIED; + NATTRINVALIDATE(np); + nfs_get_xid(&np->n_xid); + } } else { - nfs_lock(np, NFS_NODE_LOCK_FORCE); + nfs_node_lock_force(np); } FSDBG(526, np->n_flag, np->n_error, 0, 0); @@ -3466,24 +3773,25 @@ again: error = np->n_error; np->n_flag &= ~NWRITEERR; } - nfs_unlock(np); + nfs_node_unlock(np); done: lck_mtx_lock(nfs_buf_mutex); flags = np->n_bflag; - np->n_bflag &= ~(NBFLUSHINPROG|NBFLUSHWANT); + np->n_bflag &= ~(NBFLUSHINPROG | NBFLUSHWANT); lck_mtx_unlock(nfs_buf_mutex); - if (flags & NBFLUSHWANT) + if (flags & NBFLUSHWANT) { wakeup(&np->n_bflag); + } out: FSDBG_BOT(517, np, error, ignore_writeerr, 0); - return (error); + return error; } /* * Flush out and invalidate all buffers associated with a vnode. * Called with the underlying object locked. */ -static int +int nfs_vinvalbuf_internal( nfsnode_t np, int flags, @@ -3497,8 +3805,9 @@ nfs_vinvalbuf_internal( int list, error = 0; if (flags & V_SAVE) { - if ((error = nfs_flush(np, MNT_WAIT, thd, (flags & V_IGNORE_WRITEERR)))) - return (error); + if ((error = nfs_flush(np, MNT_WAIT, thd, (flags & V_IGNORE_WRITEERR)))) { + return error; + } } lck_mtx_lock(nfs_buf_mutex); @@ -3506,15 +3815,17 @@ nfs_vinvalbuf_internal( list = NBI_CLEAN; if (nfs_buf_iterprepare(np, &blist, list)) { list = NBI_DIRTY; - if (nfs_buf_iterprepare(np, &blist, list)) + if (nfs_buf_iterprepare(np, &blist, list)) { break; + } } while ((bp = LIST_FIRST(&blist))) { LIST_REMOVE(bp, nb_vnbufs); - if (list == NBI_CLEAN) + if (list == NBI_CLEAN) { LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs); - else + } else { LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs); + } nfs_buf_refget(bp); while ((error = nfs_buf_acquire(bp, NBAC_REMOVE, slpflag, slptimeo))) { FSDBG(556, np, bp, NBOFF(bp), bp->nb_flags); @@ -3523,7 +3834,7 @@ nfs_vinvalbuf_internal( nfs_buf_refrele(bp); nfs_buf_itercomplete(np, &blist, list); lck_mtx_unlock(nfs_buf_mutex); - return (error); + return error; } } nfs_buf_refrele(bp); @@ -3533,18 +3844,21 @@ nfs_vinvalbuf_internal( (NBOFF(bp) < (off_t)np->n_size)) { /* extra paranoia: make sure we're not */ /* somehow leaving any dirty data around */ + nfsbufpgs pagemask; int mustwrite = 0; - int end = (NBOFF(bp) + bp->nb_bufsize > (off_t)np->n_size) ? - ((off_t)np->n_size - NBOFF(bp)) : bp->nb_bufsize; + off_t end = (NBOFF(bp) + bp->nb_bufsize > (off_t)np->n_size) ? 
+ (np->n_size - NBOFF(bp)) : bp->nb_bufsize; if (!ISSET(bp->nb_flags, NB_PAGELIST)) { error = nfs_buf_upl_setup(bp); if (error == EINVAL) { /* vm object must no longer exist */ /* hopefully we don't need to do */ /* anything for this buffer */ - } else if (error) + } else if (error) { printf("nfs_vinvalbuf: upl setup failed %d\n", error); - bp->nb_valid = bp->nb_dirty = 0; + } + NBPGS_ERASE(&bp->nb_valid); + NBPGS_ERASE(&bp->nb_dirty); } nfs_buf_upl_check(bp); /* check for any dirty data before the EOF */ @@ -3552,15 +3866,19 @@ nfs_vinvalbuf_internal( /* clip dirty range to EOF */ if (bp->nb_dirtyend > end) { bp->nb_dirtyend = end; - if (bp->nb_dirtyoff >= bp->nb_dirtyend) + if (bp->nb_dirtyoff >= bp->nb_dirtyend) { bp->nb_dirtyoff = bp->nb_dirtyend = 0; + } } - if ((bp->nb_dirtyend > 0) && (bp->nb_dirtyoff < end)) + if ((bp->nb_dirtyend > 0) && (bp->nb_dirtyoff < end)) { mustwrite++; + } } - bp->nb_dirty &= (1 << (round_page_32(end)/PAGE_SIZE)) - 1; - if (bp->nb_dirty) + nfs_buf_pgs_get_page_mask(&pagemask, round_page_64(end) / PAGE_SIZE); + nfs_buf_pgs_bit_and(&bp->nb_dirty, &pagemask, &bp->nb_dirty); + if (nfs_buf_pgs_is_set(&bp->nb_dirty)) { mustwrite++; + } /* also make sure we'll have a credential to do the write */ if (mustwrite && !IS_VALID_CRED(bp->nb_wcred) && !IS_VALID_CRED(cred)) { printf("nfs_vinvalbuf: found dirty buffer with no write creds\n"); @@ -3568,8 +3886,9 @@ nfs_vinvalbuf_internal( } if (mustwrite) { FSDBG(554, np, bp, 0xd00dee, bp->nb_flags); - if (!ISSET(bp->nb_flags, NB_PAGELIST)) + if (!ISSET(bp->nb_flags, NB_PAGELIST)) { panic("nfs_vinvalbuf: dirty buffer without upl"); + } /* gotta write out dirty data before invalidating */ /* (NB_STABLE indicates that data writes should be FILESYNC) */ /* (NB_NOCACHE indicates buffer should be discarded) */ @@ -3583,10 +3902,11 @@ nfs_vinvalbuf_internal( // Note: bp has been released if (error) { FSDBG(554, bp, 0xd00dee, 0xbad, error); - nfs_lock(np, NFS_NODE_LOCK_FORCE); - np->n_error = error; - np->n_flag |= NWRITEERR; - nfs_unlock(np); + nfs_node_lock_force(np); + if ((error != EINTR) && (error != ERESTART)) { + np->n_error = error; + np->n_flag |= NWRITEERR; + } /* * There was a write error and we need to * invalidate attrs to sync with server. @@ -3594,6 +3914,18 @@ nfs_vinvalbuf_internal( * we may no longer know the correct size) */ NATTRINVALIDATE(np); + nfs_node_unlock(np); + if ((error == EINTR) || (error == ERESTART)) { + /* + * Abort on EINTR. If we don't, we could + * be stuck in this loop forever because + * the buffer will continue to stay dirty. 
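The invalidation path above only has to push dirty data that still lies within the file: the dirty byte range is clipped to EOF and the dirty-page bitmap is masked down to the pages below the (rounded-up) EOF. A single-word-bitmap sketch of that clipping (assumes at most 64 pages per buffer; not the nfsbufpgs code):

#include <stdint.h>

/*
 * Sketch: 'end' is how many bytes of this buffer are still inside the
 * file.  Returns nonzero if dirty data remains that must be written
 * before the buffer can be invalidated.
 */
static int
clip_dirty_to_eof(uint64_t *dirtypages, int *dirtyoff, int *dirtyend,
    long long end, int pagesize)
{
    int mustwrite = 0;
    long long lastpg;
    uint64_t mask;

    if (*dirtyend > end) {
        *dirtyend = (int)end;
        if (*dirtyoff >= *dirtyend)
            *dirtyoff = *dirtyend = 0;
    }
    if (*dirtyend > 0 && *dirtyoff < end)
        mustwrite = 1;
    /* keep only the dirty bits for pages below round_page(end) */
    lastpg = (end + pagesize - 1) / pagesize;
    mask = (lastpg >= 64) ? ~0ULL : ((1ULL << lastpg) - 1);
    *dirtypages &= mask;
    if (*dirtypages != 0)
        mustwrite = 1;
    return mustwrite;
}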
+						 */
+						lck_mtx_lock(nfs_buf_mutex);
+						nfs_buf_itercomplete(np, &blist, list);
+						lck_mtx_unlock(nfs_buf_mutex);
+						return error;
+					}
 					error = 0;
 				}
 				lck_mtx_lock(nfs_buf_mutex);
@@ -3607,16 +3939,20 @@
 			}
 			nfs_buf_itercomplete(np, &blist, list);
 		}
-	if (!LIST_EMPTY(&(np)->n_dirtyblkhd) || !LIST_EMPTY(&(np)->n_cleanblkhd))
+	if (!LIST_EMPTY(&(np)->n_dirtyblkhd) || !LIST_EMPTY(&(np)->n_cleanblkhd)) {
 		panic("nfs_vinvalbuf: flush/inval failed");
+	}
 	lck_mtx_unlock(nfs_buf_mutex);
+	nfs_node_lock_force(np);
 	if (!(flags & V_SAVE)) {
-		nfs_lock(np, NFS_NODE_LOCK_FORCE);
 		np->n_flag &= ~NMODIFIED;
-		nfs_unlock(np);
 	}
+	if (vnode_vtype(NFSTOV(np)) == VREG) {
+		np->n_lastrahead = -1;
+	}
+	nfs_node_unlock(np);
 	NFS_BUF_FREEUP();
-	return (0);
+	return 0;
 }
 
 
@@ -3635,13 +3971,25 @@ nfs_vinvalbuf2(vnode_t vp, int flags, thread_t thd, kauth_cred_t cred, int intrf
 {
 	nfsnode_t np = VTONFS(vp);
 	struct nfsmount *nmp = VTONMP(vp);
-	int error, rv, slpflag, slptimeo, nflags;
+	int error, slpflag, slptimeo, nflags, retry = 0;
+	int ubcflags = UBC_PUSHALL | UBC_SYNC | UBC_INVALIDATE;
+	struct timespec ts = { .tv_sec = 2, .tv_nsec = 0 };
 	off_t size;
 
 	FSDBG_TOP(554, np, flags, intrflg, 0);
 
-	if (nmp && !(nmp->nm_flag & NFSMNT_INT))
+	/*
+	 * If the mount is gone, there is no sense in trying to write anything
+	 * and hanging while attempting to do I/O.
+	 */
+	if (nfs_mount_gone(nmp)) {
+		flags &= ~V_SAVE;
+		ubcflags &= ~UBC_PUSHALL;
+	}
+
+	if (nmp && !NMFLAG(nmp, INTR)) {
 		intrflg = 0;
+	}
 	if (intrflg) {
 		slpflag = PCATCH;
 		slptimeo = 2 * hz;
@@ -3654,40 +4002,109 @@ nfs_vinvalbuf2(vnode_t vp, int flags, thread_t thd, kauth_cred_t cred, int intrf
 	lck_mtx_lock(nfs_buf_mutex);
 	while (np->n_bflag & NBINVALINPROG) {
 		np->n_bflag |= NBINVALWANT;
-		error = msleep(&np->n_bflag, nfs_buf_mutex, slpflag, "nfs_vinvalbuf", NULL);
-		if (error) {
+		msleep(&np->n_bflag, nfs_buf_mutex, slpflag, "nfs_vinvalbuf", &ts);
+		if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0))) {
 			lck_mtx_unlock(nfs_buf_mutex);
-			return (error);
+			return error;
+		}
+		if (np->n_bflag & NBINVALINPROG) {
+			slpflag = 0;
 		}
 	}
 	np->n_bflag |= NBINVALINPROG;
 	lck_mtx_unlock(nfs_buf_mutex);
 
 	/* Now, flush as required. */
+again:
 	error = nfs_vinvalbuf_internal(np, flags, thd, cred, slpflag, 0);
 	while (error) {
 		FSDBG(554, np, 0, 0, error);
-		if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0)))
+		if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0))) {
 			goto done;
+		}
 		error = nfs_vinvalbuf_internal(np, flags, thd, cred, 0, slptimeo);
 	}
 
 	/* get the pages out of vm also */
-	if (UBCINFOEXISTS(vp) && (size = ubc_getsize(vp)))
-		if (!(rv = ubc_sync_range(vp, 0, size, UBC_PUSHALL | UBC_SYNC | UBC_INVALIDATE)))
-			panic("nfs_vinvalbuf(): ubc_sync_range failed!");
+	if (UBCINFOEXISTS(vp) && (size = ubc_getsize(vp))) {
+		if ((error = ubc_msync(vp, 0, size, NULL, ubcflags))) {
+			if (error == EINVAL) {
+				panic("nfs_vinvalbuf(): ubc_msync failed!, error %d", error);
+			}
+			if (retry++ < 10) { /* retry invalidating a few times */
+				if (retry > 1 || error == ENXIO) {
+					ubcflags &= ~UBC_PUSHALL;
+				}
+				goto again;
+			}
+			/* give up */
+			printf("nfs_vinvalbuf(): ubc_msync failed!, error %d\n", error);
+		}
+	}
 done:
 	lck_mtx_lock(nfs_buf_mutex);
 	nflags = np->n_bflag;
-	np->n_bflag &= ~(NBINVALINPROG|NBINVALWANT);
+	np->n_bflag &= ~(NBINVALINPROG | NBINVALWANT);
 	lck_mtx_unlock(nfs_buf_mutex);
-	if (nflags & NBINVALWANT)
+	if (nflags & NBINVALWANT) {
 		wakeup(&np->n_bflag);
+	}
 
 	FSDBG_BOT(554, np, flags, intrflg, error);
-	return (error);
+	return error;
 }
 
+/*
+ * Wait for any busy buffers to complete.
+ */
+void
+nfs_wait_bufs(nfsnode_t np)
+{
+	struct nfsbuf *bp;
+	struct nfsbuflists blist;
+	int error = 0;
+
+	lck_mtx_lock(nfs_buf_mutex);
+	if (!nfs_buf_iterprepare(np, &blist, NBI_CLEAN)) {
+		while ((bp = LIST_FIRST(&blist))) {
+			LIST_REMOVE(bp, nb_vnbufs);
+			LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
+			nfs_buf_refget(bp);
+			while ((error = nfs_buf_acquire(bp, 0, 0, 0))) {
+				if (error != EAGAIN) {
+					nfs_buf_refrele(bp);
+					nfs_buf_itercomplete(np, &blist, NBI_CLEAN);
+					lck_mtx_unlock(nfs_buf_mutex);
+					return;
+				}
+			}
+			nfs_buf_refrele(bp);
+			nfs_buf_drop(bp);
+		}
+		nfs_buf_itercomplete(np, &blist, NBI_CLEAN);
+	}
+	if (!nfs_buf_iterprepare(np, &blist, NBI_DIRTY)) {
+		while ((bp = LIST_FIRST(&blist))) {
+			LIST_REMOVE(bp, nb_vnbufs);
+			LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
+			nfs_buf_refget(bp);
+			while ((error = nfs_buf_acquire(bp, 0, 0, 0))) {
+				if (error != EAGAIN) {
+					nfs_buf_refrele(bp);
+					nfs_buf_itercomplete(np, &blist, NBI_DIRTY);
+					lck_mtx_unlock(nfs_buf_mutex);
+					return;
+				}
+			}
+			nfs_buf_refrele(bp);
+			nfs_buf_drop(bp);
+		}
+		nfs_buf_itercomplete(np, &blist, NBI_DIRTY);
	}
+	lck_mtx_unlock(nfs_buf_mutex);
+}
+
+
 /*
  * Add an async I/O request to the mount's async I/O queue and make
  * sure that an nfsiod will service it.
@@ -3701,8 +4118,12 @@ nfs_asyncio_finish(struct nfsreq *req)
 	FSDBG_TOP(552, nmp, 0, 0, 0);
 
 again:
-	if (((nmp = req->r_nmp)) == NULL)
+	nmp = req->r_nmp;
+
+	if (nmp == NULL) {
 		return;
+	}
+
 	lck_mtx_lock(nfsiod_mutex);
 	niod = nmp->nm_niod;
 
@@ -3721,14 +4142,38 @@
 		 */
 		lck_mtx_unlock(nfsiod_mutex);
 		started++;
-		if (!nfsiod_start())
+		if (!nfsiod_start()) {
 			goto again;
+		}
 		lck_mtx_lock(nfsiod_mutex);
 	}
 
-	if (req->r_achain.tqe_next == NFSREQNOLIST)
+	/*
+	 * If we got here while being on the resendq we need to get off. This
+	 * happens when the timer fires and errors out requests from nfs_sigintr
+	 * or we receive a reply (UDP case) while being on the resend queue so
+	 * we're just finishing up and are not going to be resent.
+	 */
+	lck_mtx_lock(&req->r_mtx);
+	if (req->r_flags & R_RESENDQ) {
+		lck_mtx_lock(&nmp->nm_lock);
+		if ((req->r_flags & R_RESENDQ) && req->r_rchain.tqe_next != NFSREQNOLIST) {
+			NFS_BIO_DBG("Processing async request on resendq. Removing");
+			TAILQ_REMOVE(&nmp->nm_resendq, req, r_rchain);
+			req->r_flags &= ~R_RESENDQ;
+			req->r_rchain.tqe_next = NFSREQNOLIST;
+			assert(req->r_refs > 1);
+			/* Remove resendq reference */
+			req->r_refs--;
+		}
+		lck_mtx_unlock(&nmp->nm_lock);
+	}
+	lck_mtx_unlock(&req->r_mtx);
+
+	if (req->r_achain.tqe_next == NFSREQNOLIST) {
 		TAILQ_INSERT_TAIL(&nmp->nm_iodq, req, r_achain);
+	}
 
 	/* If this mount doesn't already have an nfsiod working on it... */
 	if (!nmp->nm_niod) {
@@ -3737,8 +4182,10 @@
 		lck_mtx_unlock(nfsiod_mutex);
 		wakeup(niod);
 	} else if (nfsiod_thread_count > 0) {
-		/* just queue it up on nfsiod mounts queue */
-		TAILQ_INSERT_TAIL(&nfsiodmounts, nmp, nm_iodlink);
+		/* just queue it up on nfsiod mounts queue if needed */
+		if (nmp->nm_iodlink.tqe_next == NFSNOLIST) {
+			TAILQ_INSERT_TAIL(&nfsiodmounts, nmp, nm_iodlink);
+		}
 		lck_mtx_unlock(nfsiod_mutex);
 	} else {
 		printf("nfs_asyncio(): no nfsiods? %d %d (%d)\n", nfsiod_thread_count, NFSIOD_MAX, started);
@@ -3756,83 +4203,64 @@
 
 /*
  * queue up async I/O request for resend
+ * Must be called with req->r_mtx locked.
 */
 void
 nfs_asyncio_resend(struct nfsreq *req)
 {
 	struct nfsmount *nmp = req->r_nmp;
 
-	if (!nmp)
+	if (nfs_mount_gone(nmp)) {
 		return;
+	}
+
+#if CONFIG_NFS_GSS
 	nfs_gss_clnt_rpcdone(req);
+#endif
 	lck_mtx_lock(&nmp->nm_lock);
-	if (req->r_rchain.tqe_next == NFSREQNOLIST) {
+	if (!(req->r_flags & R_RESENDQ)) {
 		TAILQ_INSERT_TAIL(&nmp->nm_resendq, req, r_rchain);
 		req->r_flags |= R_RESENDQ;
+		/*
+		 * We take a reference on this request so that it can't be
+		 * destroyed while a resend is queued or in progress.
+		 */
+		nfs_request_ref(req, 1);
 	}
 	nfs_mount_sock_thread_wake(nmp);
 	lck_mtx_unlock(&nmp->nm_lock);
 }
 
 /*
- * Read an NFS buffer for a directory.
+ * Read directory data into a buffer.
+ *
+ * Buffer will be filled (unless EOF is hit).
+ * Buffers after this one may also be completely/partially filled.
  */
 int
 nfs_buf_readdir(struct nfsbuf *bp, vfs_context_t ctx)
 {
-	nfsnode_t np;
-	vnode_t vp;
-	struct nfsmount *nmp;
-	int error = 0, nfsvers;
-	struct uio uio;
-	struct iovec_32 io;
-
-	np = bp->nb_np;
-	vp = NFSTOV(np);
-	nmp = VTONMP(vp);
-	nfsvers = nmp->nm_vers;
-	uio.uio_iovs.iov32p = &io;
-	uio.uio_iovcnt = 1;
-#if 1 /* LP64todo - can't use new segment flags until the drivers are ready */
-	uio.uio_segflg = UIO_SYSSPACE;
-#else
-	uio.uio_segflg = UIO_SYSSPACE32;
-#endif
-
-	/* sanity check */
-	if (ISSET(bp->nb_flags, NB_DONE))
-		CLR(bp->nb_flags, NB_DONE);
+	nfsnode_t np = bp->nb_np;
+	struct nfsmount *nmp = NFSTONMP(np);
+	int error = 0;
 
-	uio.uio_rw = UIO_READ;
-	io.iov_len = bp->nb_bufsize;
-	uio_uio_resid_set(&uio, io.iov_len);
-	io.iov_base = (uintptr_t) bp->nb_data;
-	uio.uio_offset = NBOFF(bp);
+	if (nfs_mount_gone(nmp)) {
+		return ENXIO;
+	}
 
-	OSAddAtomic(1, (SInt32*)&nfsstats.readdir_bios);
-	if (nfsvers < NFS_VER4) {
-		if (nmp->nm_flag & NFSMNT_RDIRPLUS) {
-			error = nfs3_readdirplus_rpc(np, &uio, ctx);
-			if (error == NFSERR_NOTSUPP) {
-				lck_mtx_lock(&nmp->nm_lock);
-				nmp->nm_flag &= ~NFSMNT_RDIRPLUS;
-				lck_mtx_unlock(&nmp->nm_lock);
-			}
-		}
-		if (!(nmp->nm_flag & NFSMNT_RDIRPLUS))
-			error = nfs3_readdir_rpc(np, &uio, ctx);
-	} else {
-		error = nfs4_readdir_rpc(np, &uio, ctx);
+	if (nmp->nm_vers < NFS_VER4) {
+		error = nfs3_readdir_rpc(np, bp, ctx);
 	}
-	if (error) {
+#if CONFIG_NFS4
+	else {
+		error = nfs4_readdir_rpc(np, bp, ctx);
+	}
+#endif
+	if (error && (error != NFSERR_DIRBUFDROPPED)) {
 		SET(bp->nb_flags, NB_ERROR);
 		bp->nb_error = error;
-	} else {
-		bp->nb_validoff = 0;
-		bp->nb_validend = uio.uio_offset - NBOFF(bp);
-		bp->nb_valid = (1 << (round_page_32(bp->nb_validend)/PAGE_SIZE)) - 1;
 	}
-
-	nfs_buf_iodone(bp);
-	return (error);
+	return error;
 }
+
+#endif /* CONFIG_NFS_CLIENT */
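
For illustration, the EOF clipping that nfs_vinvalbuf_internal now performs with nfs_buf_pgs_get_page_mask() and nfs_buf_pgs_bit_and() amounts to "build a mask of every page below the EOF page, then AND it into the dirty bitmap". The standalone sketch below models that idea with invented names (demo_pages_t, demo_mask_below, demo_and) and a fixed four-word bitmap; it is only a rough approximation of the multi-word nfsbufpgs handling, not the kernel code.

#include <stdint.h>
#include <stdio.h>

#define DEMO_WORDS 4    /* 4 x 64 bits = up to 256 pages per buffer */

typedef struct {
	uint64_t w[DEMO_WORDS];
} demo_pages_t;

/* Build a mask with bits [0, npages) set: "every page below npages". */
static void
demo_mask_below(demo_pages_t *m, unsigned int npages)
{
	for (unsigned int i = 0; i < DEMO_WORDS; i++) {
		unsigned int base = i * 64;
		if (npages >= base + 64) {
			m->w[i] = ~0ULL;                         /* whole word is below the cutoff */
		} else if (npages > base) {
			m->w[i] = (1ULL << (npages - base)) - 1; /* partial word */
		} else {
			m->w[i] = 0;                             /* entirely at or past the cutoff */
		}
	}
}

/* dst = a & b, word by word. */
static void
demo_and(demo_pages_t *dst, const demo_pages_t *a, const demo_pages_t *b)
{
	for (unsigned int i = 0; i < DEMO_WORDS; i++) {
		dst->w[i] = a->w[i] & b->w[i];
	}
}

int
main(void)
{
	demo_pages_t dirty = {{ ~0ULL, 0x3, 0, 0 }};    /* pretend pages 0..65 are dirty */
	demo_pages_t mask;

	/* Clip at page 64, i.e. the role of round_page(end) / PAGE_SIZE above. */
	demo_mask_below(&mask, 64);
	demo_and(&dirty, &dirty, &mask);

	printf("dirty words after clipping: 0x%llx 0x%llx\n",
	    (unsigned long long)dirty.w[0], (unsigned long long)dirty.w[1]);
	return 0;
}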
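The resend-queue changes above (nfs_asyncio_resend together with the resendq removal in nfs_asyncio_finish) pair queue membership with a request reference, so a request cannot be destroyed while it sits on nm_resendq. A minimal userspace sketch of that pattern, assuming invented names (demo_req, demo_resend_enqueue, demo_resend_dequeue) and a pthread mutex standing in for the mount's nm_lock, with a plain counter standing in for nfs_request_ref()/r_refs:

#include <pthread.h>
#include <stdbool.h>
#include <sys/queue.h>

struct demo_req {
	int                   refs;        /* reference count on the request */
	bool                  on_resendq;  /* stands in for the R_RESENDQ flag */
	TAILQ_ENTRY(demo_req) rchain;
};

static TAILQ_HEAD(, demo_req) demo_resendq = TAILQ_HEAD_INITIALIZER(demo_resendq);
static pthread_mutex_t demo_resendq_lock = PTHREAD_MUTEX_INITIALIZER;

/* Queue a request for resend, taking a reference so it can't be freed while queued. */
void
demo_resend_enqueue(struct demo_req *req)
{
	pthread_mutex_lock(&demo_resendq_lock);
	if (!req->on_resendq) {
		req->refs++;                    /* reference owned by the queue */
		req->on_resendq = true;
		TAILQ_INSERT_TAIL(&demo_resendq, req, rchain);
	}
	pthread_mutex_unlock(&demo_resendq_lock);
}

/* Pull a request off the resend queue (e.g. it completed first), dropping that reference. */
void
demo_resend_dequeue(struct demo_req *req)
{
	pthread_mutex_lock(&demo_resendq_lock);
	if (req->on_resendq) {
		TAILQ_REMOVE(&demo_resendq, req, rchain);
		req->on_resendq = false;
		req->refs--;                    /* queue's reference released */
	}
	pthread_mutex_unlock(&demo_resendq_lock);
}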