/*
 * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)nfs_bio.c	8.9 (Berkeley) 3/30/95
 * FreeBSD-Id: nfs_bio.c,v 1.44 1997/09/10 19:52:25 phk Exp $
 */

#include <nfs/nfs_conf.h>
#if CONFIG_NFS_CLIENT

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/proc_internal.h>
#include <sys/kauth.h>
#include <sys/malloc.h>
#include <sys/vnode.h>
#include <sys/dirent.h>
#include <sys/mount_internal.h>
#include <sys/kernel.h>
#include <sys/ubc_internal.h>
#include <sys/uio_internal.h>
#include <sys/kpi_mbuf.h>

#include <sys/vm.h>
#include <sys/vmparam.h>

#include <sys/time.h>
#include <kern/clock.h>
#include <libkern/OSAtomic.h>
#include <kern/kalloc.h>
#include <kern/thread_call.h>

#include <nfs/rpcv2.h>
#include <nfs/nfsproto.h>
#include <nfs/nfs.h>
#include <nfs/nfs_gss.h>
#include <nfs/nfsmount.h>
#include <nfs/nfsnode.h>
#include <sys/buf_internal.h>
#include <os/refcnt.h>

#define NFS_BIO_DBG(...) NFS_DBG(NFS_FAC_BIO, 7, ## __VA_ARGS__)

kern_return_t thread_terminate(thread_t); /* XXX */

#define NFSBUFHASH(np, lbn)	\
	(&nfsbufhashtbl[((long)(np) / sizeof(*(np)) + (int)(lbn)) & nfsbufhash])
LIST_HEAD(nfsbufhashhead, nfsbuf) * nfsbufhashtbl;
struct nfsbuffreehead nfsbuffree, nfsbuffreemeta, nfsbufdelwri;
u_long nfsbufhash;
int nfsbufcnt, nfsbufmin, nfsbufmax, nfsbufmetacnt, nfsbufmetamax;
int nfsbuffreecnt, nfsbuffreemetacnt, nfsbufdelwricnt, nfsneedbuffer;
int nfs_nbdwrite;
int nfs_buf_timer_on = 0;
thread_t nfsbufdelwrithd = NULL;

lck_grp_t *nfs_buf_lck_grp;
lck_mtx_t *nfs_buf_mutex;

#define NFSBUF_FREE_PERIOD	30	/* seconds */
#define NFSBUF_LRU_STALE	120
#define NFSBUF_META_STALE	240

/* number of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffree list */
#define LRU_TO_FREEUP	6
/* number of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffreemeta list */
#define META_TO_FREEUP	3
/* total number of nfsbufs nfs_buf_freeup() should attempt to free */
#define TOTAL_TO_FREEUP	(LRU_TO_FREEUP+META_TO_FREEUP)
/* fraction of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffree list when called from timer */
#define LRU_FREEUP_FRAC_ON_TIMER	8
/* fraction of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffreemeta list when called from timer */
#define META_FREEUP_FRAC_ON_TIMER	16
/* fraction of total nfsbufs that nfsbuffreecnt should exceed before bothering to call nfs_buf_freeup() */
#define LRU_FREEUP_MIN_FRAC	4
/* fraction of total nfsbufs that nfsbuffreemetacnt should exceed before bothering to call nfs_buf_freeup() */
#define META_FREEUP_MIN_FRAC	2

#define NFS_BUF_FREEUP() \
	do { \
	/* only call nfs_buf_freeup() if it has work to do: */ \
		if (((nfsbuffreecnt > nfsbufcnt/LRU_FREEUP_MIN_FRAC) || \
		    (nfsbuffreemetacnt > nfsbufcnt/META_FREEUP_MIN_FRAC)) && \
		    ((nfsbufcnt - TOTAL_TO_FREEUP) > nfsbufmin)) \
			nfs_buf_freeup(0); \
	} while (0)
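
/*
 * Illustrative arithmetic for the NFS_BUF_FREEUP() trigger (example
 * numbers, not from the original source): with nfsbufcnt = 1024 and
 * LRU_FREEUP_MIN_FRAC = 4, nfs_buf_freeup() is only worth calling once
 * more than 1024/4 = 256 buffers sit on the regular free list (or more
 * than 1024/2 = 512 on the meta free list), and only if freeing
 * TOTAL_TO_FREEUP (6+3 = 9) buffers would still leave more than
 * nfsbufmin (128) allocated.
 */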

/*
 * Initialize nfsbuf lists
 */
void
nfs_nbinit(void)
{
	nfs_buf_lck_grp = lck_grp_alloc_init("nfs_buf", LCK_GRP_ATTR_NULL);
	nfs_buf_mutex = lck_mtx_alloc_init(nfs_buf_lck_grp, LCK_ATTR_NULL);

	nfsbufcnt = nfsbufmetacnt =
	    nfsbuffreecnt = nfsbuffreemetacnt = nfsbufdelwricnt = 0;
	nfsbufmin = 128;
	/* size nfsbufmax to cover at most half sane_size (w/default buf size) */
	nfsbufmax = (sane_size >> PAGE_SHIFT) / (2 * (NFS_RWSIZE >> PAGE_SHIFT));
	nfsbufmetamax = nfsbufmax / 4;
	nfsneedbuffer = 0;
	nfs_nbdwrite = 0;

	nfsbufhashtbl = hashinit(nfsbufmax / 4, M_TEMP, &nfsbufhash);
	TAILQ_INIT(&nfsbuffree);
	TAILQ_INIT(&nfsbuffreemeta);
	TAILQ_INIT(&nfsbufdelwri);
}

/*
 * Check periodically for stale/unused nfs bufs
 */
void
nfs_buf_timer(__unused void *param0, __unused void *param1)
{
	nfs_buf_freeup(1);

	lck_mtx_lock(nfs_buf_mutex);
	if (nfsbufcnt <= nfsbufmin) {
		nfs_buf_timer_on = 0;
		lck_mtx_unlock(nfs_buf_mutex);
		return;
	}
	lck_mtx_unlock(nfs_buf_mutex);

	nfs_interval_timer_start(nfs_buf_timer_call,
	    NFSBUF_FREE_PERIOD * 1000);
}

/*
 * try to free up some excess, unused nfsbufs
 */
void
nfs_buf_freeup(int timer)
{
	struct nfsbuf *fbp;
	struct timeval now;
	int count;
	struct nfsbuffreehead nfsbuffreeup;

	TAILQ_INIT(&nfsbuffreeup);

	lck_mtx_lock(nfs_buf_mutex);

	microuptime(&now);

	FSDBG(320, nfsbufcnt, nfsbuffreecnt, nfsbuffreemetacnt, 0);

	count = timer ? nfsbuffreecnt / LRU_FREEUP_FRAC_ON_TIMER : LRU_TO_FREEUP;
	while ((nfsbufcnt > nfsbufmin) && (count-- > 0)) {
		fbp = TAILQ_FIRST(&nfsbuffree);
		if (!fbp) {
			break;
		}
		if (os_ref_get_count(&fbp->nb_refs) > 1) {
			break;
		}
		if (NBUFSTAMPVALID(fbp) &&
		    (fbp->nb_timestamp + (2 * NFSBUF_LRU_STALE)) > now.tv_sec) {
			break;
		}
		nfs_buf_remfree(fbp);
		/* disassociate buffer from any nfsnode */
		if (fbp->nb_np) {
			if (fbp->nb_vnbufs.le_next != NFSNOLIST) {
				LIST_REMOVE(fbp, nb_vnbufs);
				fbp->nb_vnbufs.le_next = NFSNOLIST;
			}
			fbp->nb_np = NULL;
		}
		LIST_REMOVE(fbp, nb_hash);
		TAILQ_INSERT_TAIL(&nfsbuffreeup, fbp, nb_free);
		nfsbufcnt--;
	}

	count = timer ? nfsbuffreemetacnt / META_FREEUP_FRAC_ON_TIMER : META_TO_FREEUP;
	while ((nfsbufcnt > nfsbufmin) && (count-- > 0)) {
		fbp = TAILQ_FIRST(&nfsbuffreemeta);
		if (!fbp) {
			break;
		}
		if (os_ref_get_count(&fbp->nb_refs) > 1) {
			break;
		}
		if (NBUFSTAMPVALID(fbp) &&
		    (fbp->nb_timestamp + (2 * NFSBUF_META_STALE)) > now.tv_sec) {
			break;
		}
		nfs_buf_remfree(fbp);
		/* disassociate buffer from any nfsnode */
		if (fbp->nb_np) {
			if (fbp->nb_vnbufs.le_next != NFSNOLIST) {
				LIST_REMOVE(fbp, nb_vnbufs);
				fbp->nb_vnbufs.le_next = NFSNOLIST;
			}
			fbp->nb_np = NULL;
		}
		LIST_REMOVE(fbp, nb_hash);
		TAILQ_INSERT_TAIL(&nfsbuffreeup, fbp, nb_free);
		nfsbufcnt--;
		nfsbufmetacnt--;
	}

	FSDBG(320, nfsbufcnt, nfsbuffreecnt, nfsbuffreemetacnt, 0);
	NFSBUFCNTCHK();

	lck_mtx_unlock(nfs_buf_mutex);

	while ((fbp = TAILQ_FIRST(&nfsbuffreeup))) {
		TAILQ_REMOVE(&nfsbuffreeup, fbp, nb_free);
		/* nuke any creds */
		if (IS_VALID_CRED(fbp->nb_rcred)) {
			kauth_cred_unref(&fbp->nb_rcred);
		}
		if (IS_VALID_CRED(fbp->nb_wcred)) {
			kauth_cred_unref(&fbp->nb_wcred);
		}
		/* if buf was NB_META, dump buffer */
		if (ISSET(fbp->nb_flags, NB_META) && fbp->nb_data) {
			kfree(fbp->nb_data, fbp->nb_bufsize);
		}
		FREE(fbp, M_TEMP);
	}
}

/*
 * remove a buffer from the freelist
 * (must be called with nfs_buf_mutex held)
 */
void
nfs_buf_remfree(struct nfsbuf *bp)
{
	if (bp->nb_free.tqe_next == NFSNOLIST) {
		panic("nfsbuf not on free list");
	}
	if (ISSET(bp->nb_flags, NB_DELWRI)) {
		nfsbufdelwricnt--;
		TAILQ_REMOVE(&nfsbufdelwri, bp, nb_free);
	} else if (ISSET(bp->nb_flags, NB_META)) {
		nfsbuffreemetacnt--;
		TAILQ_REMOVE(&nfsbuffreemeta, bp, nb_free);
	} else {
		nfsbuffreecnt--;
		TAILQ_REMOVE(&nfsbuffree, bp, nb_free);
	}
	bp->nb_free.tqe_next = NFSNOLIST;
	NFSBUFCNTCHK();
}

/*
 * check for existence of nfsbuf in cache
 */
boolean_t
nfs_buf_is_incore(nfsnode_t np, daddr64_t blkno)
{
	boolean_t rv;
	lck_mtx_lock(nfs_buf_mutex);
	if (nfs_buf_incore(np, blkno)) {
		rv = TRUE;
	} else {
		rv = FALSE;
	}
	lck_mtx_unlock(nfs_buf_mutex);
	return rv;
}

/*
 * return incore buffer (must be called with nfs_buf_mutex held)
 */
struct nfsbuf *
nfs_buf_incore(nfsnode_t np, daddr64_t blkno)
{
	/* Search hash chain */
	struct nfsbuf * bp = NFSBUFHASH(np, blkno)->lh_first;
	for (; bp != NULL; bp = bp->nb_hash.le_next) {
		if ((bp->nb_lblkno == blkno) && (bp->nb_np == np)) {
			if (!ISSET(bp->nb_flags, NB_INVAL)) {
				FSDBG(547, bp, blkno, bp->nb_flags, bp->nb_np);
				return bp;
			}
		}
	}
	return NULL;
}

/*
 * Check if it's OK to drop a page.
 *
 * Called by vnode_pager() on pageout request of non-dirty page.
 * We need to make sure that it's not part of a delayed write.
 * If it is, we can't let the VM drop it because we may need it
 * later when/if we need to write the data (again).
 */
int
nfs_buf_page_inval(vnode_t vp, off_t offset)
{
	struct nfsmount *nmp = VTONMP(vp);
	struct nfsbuf *bp;
	int error = 0;

	if (nfs_mount_gone(nmp)) {
		return ENXIO;
	}

	lck_mtx_lock(nfs_buf_mutex);
	bp = nfs_buf_incore(VTONFS(vp), (daddr64_t)(offset / nmp->nm_biosize));
	if (!bp) {
		goto out;
	}
	FSDBG(325, bp, bp->nb_flags, bp->nb_dirtyoff, bp->nb_dirtyend);
	if (ISSET(bp->nb_lflags, NBL_BUSY)) {
		error = EBUSY;
		goto out;
	}
	/*
	 * If there's a dirty range in the buffer, check to
	 * see if this page intersects with the dirty range.
	 * If it does, we can't let the pager drop the page.
	 */
	if (bp->nb_dirtyend > 0) {
		int start = offset - NBOFF(bp);
		if ((bp->nb_dirtyend > start) &&
		    (bp->nb_dirtyoff < (start + PAGE_SIZE))) {
			/*
			 * Before returning the bad news, move the
			 * buffer to the start of the delwri list and
			 * give the list a push to try to flush the
			 * buffer out.
			 */
			error = EBUSY;
			nfs_buf_remfree(bp);
			TAILQ_INSERT_HEAD(&nfsbufdelwri, bp, nb_free);
			nfsbufdelwricnt++;
			nfs_buf_delwri_push(1);
		}
	}
out:
	lck_mtx_unlock(nfs_buf_mutex);
	return error;
}
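
/*
 * Worked example of the intersection test above (illustrative numbers,
 * not from the original source): for a buffer at file offset 0 with
 * nb_dirtyoff = 100 and nb_dirtyend = 5000, a pageout of the 4K page at
 * offset 4096 gives start = 4096; since 5000 > 4096 and 100 < 8192, the
 * page overlaps the dirty range and the pager gets EBUSY.
 */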

/*
 * set up the UPL for a buffer
 * (must NOT be called with nfs_buf_mutex held)
 */
int
nfs_buf_upl_setup(struct nfsbuf *bp)
{
	kern_return_t kret;
	upl_t upl;
	int upl_flags;

	if (ISSET(bp->nb_flags, NB_PAGELIST)) {
		return 0;
	}

	upl_flags = UPL_PRECIOUS;
	if (!ISSET(bp->nb_flags, NB_READ)) {
		/*
		 * We're doing a "write", so we intend to modify
		 * the pages we're gathering.
		 */
		upl_flags |= UPL_WILL_MODIFY;
	}
	kret = ubc_create_upl_kernel(NFSTOV(bp->nb_np), NBOFF(bp), bp->nb_bufsize,
	    &upl, NULL, upl_flags, VM_KERN_MEMORY_FILE);
	if (kret == KERN_INVALID_ARGUMENT) {
		/* vm object probably doesn't exist any more */
		bp->nb_pagelist = NULL;
		return EINVAL;
	}
	if (kret != KERN_SUCCESS) {
		printf("nfs_buf_upl_setup(): failed to get pagelist %d\n", kret);
		bp->nb_pagelist = NULL;
		return EIO;
	}

	FSDBG(538, bp, NBOFF(bp), bp->nb_bufsize, bp->nb_np);

	bp->nb_pagelist = upl;
	SET(bp->nb_flags, NB_PAGELIST);
	return 0;
}

/*
 * update buffer's valid/dirty info from UBC
 * (must NOT be called with nfs_buf_mutex held)
 */
void
nfs_buf_upl_check(struct nfsbuf *bp)
{
	upl_page_info_t *pl;
	off_t filesize, fileoffset;
	int i, npages;

	if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
		return;
	}

	npages = round_page_32(bp->nb_bufsize) / PAGE_SIZE;
	filesize = ubc_getsize(NFSTOV(bp->nb_np));
	fileoffset = NBOFF(bp);
	if (fileoffset < filesize) {
		SET(bp->nb_flags, NB_CACHE);
	} else {
		CLR(bp->nb_flags, NB_CACHE);
	}

	pl = ubc_upl_pageinfo(bp->nb_pagelist);
	bp->nb_valid = bp->nb_dirty = 0;

	for (i = 0; i < npages; i++, fileoffset += PAGE_SIZE_64) {
		/* anything beyond the end of the file is not valid or dirty */
		if (fileoffset >= filesize) {
			break;
		}
		if (!upl_valid_page(pl, i)) {
			CLR(bp->nb_flags, NB_CACHE);
			continue;
		}
		NBPGVALID_SET(bp, i);
		if (upl_dirty_page(pl, i)) {
			NBPGDIRTY_SET(bp, i);
		}
	}
	fileoffset = NBOFF(bp);
	if (ISSET(bp->nb_flags, NB_CACHE)) {
		bp->nb_validoff = 0;
		bp->nb_validend = bp->nb_bufsize;
		if (fileoffset + bp->nb_validend > filesize) {
			bp->nb_validend = filesize - fileoffset;
		}
	} else {
		bp->nb_validoff = bp->nb_validend = -1;
	}
	FSDBG(539, bp, fileoffset, bp->nb_valid, bp->nb_dirty);
	FSDBG(539, bp->nb_validoff, bp->nb_validend, bp->nb_dirtyoff, bp->nb_dirtyend);
}

/*
 * make sure that a buffer is mapped
 * (must NOT be called with nfs_buf_mutex held)
 */
int
nfs_buf_map(struct nfsbuf *bp)
{
	kern_return_t kret;

	if (bp->nb_data) {
		return 0;
	}
	if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
		return EINVAL;
	}

	kret = ubc_upl_map(bp->nb_pagelist, (vm_offset_t *)&(bp->nb_data));
	if (kret != KERN_SUCCESS) {
		panic("nfs_buf_map: ubc_upl_map() failed with (%d)", kret);
	}
	if (bp->nb_data == 0) {
		panic("ubc_upl_map mapped 0");
	}
	FSDBG(540, bp, bp->nb_flags, NBOFF(bp), bp->nb_data);
	return 0;
}

/*
 * normalize an nfsbuf's valid range
 *
 * the read/write code guarantees that we'll always have a valid
 * region that is an integral number of pages.  If either end
 * of the valid range isn't page-aligned, it gets corrected
 * here as we extend the valid range through all of the
 * contiguous valid pages.
 */
void
nfs_buf_normalize_valid_range(nfsnode_t np, struct nfsbuf *bp)
{
	int pg, npg;
	/* pull validoff back to start of contiguous valid page range */
	pg = bp->nb_validoff / PAGE_SIZE;
	while (pg >= 0 && NBPGVALID(bp, pg)) {
		pg--;
	}
	bp->nb_validoff = (pg + 1) * PAGE_SIZE;
	/* push validend forward to end of contiguous valid page range */
	npg = bp->nb_bufsize / PAGE_SIZE;
	pg = bp->nb_validend / PAGE_SIZE;
	while (pg < npg && NBPGVALID(bp, pg)) {
		pg++;
	}
	bp->nb_validend = pg * PAGE_SIZE;
	/* clip to EOF */
	if (NBOFF(bp) + bp->nb_validend > (off_t)np->n_size) {
		bp->nb_validend = np->n_size % bp->nb_bufsize;
	}
}
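
/*
 * Worked example (illustrative, not from the original source): with 4K
 * pages, an 8-page buffer whose pages 1 and 2 are valid, and an unaligned
 * valid range of [5000, 6000): validoff starts in page 1, backs up until
 * page 0 (invalid) stops the scan, and becomes 1*4096 = 4096; validend
 * starts in page 1, advances past valid pages 1 and 2, and becomes
 * 3*4096 = 12288.  The normalized range [4096, 12288) covers exactly the
 * contiguous run of valid pages.
 */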

/*
 * process some entries on the delayed write queue
 * (must be called with nfs_buf_mutex held)
 */
void
nfs_buf_delwri_service(void)
{
	struct nfsbuf *bp;
	nfsnode_t np;
	int error, i = 0;

	while (i < 8 && (bp = TAILQ_FIRST(&nfsbufdelwri)) != NULL) {
		np = bp->nb_np;
		nfs_buf_remfree(bp);
		nfs_buf_refget(bp);
		while ((error = nfs_buf_acquire(bp, 0, 0, 0)) == EAGAIN) {
			;
		}
		nfs_buf_refrele(bp);
		if (error) {
			break;
		}
		if (!bp->nb_np) {
			/* buffer is no longer valid */
			nfs_buf_drop(bp);
			continue;
		}
		if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
			nfs_buf_check_write_verifier(np, bp);
		}
		if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
			/* put buffer at end of delwri list */
			TAILQ_INSERT_TAIL(&nfsbufdelwri, bp, nb_free);
			nfsbufdelwricnt++;
			nfs_buf_drop(bp);
			lck_mtx_unlock(nfs_buf_mutex);
			nfs_flushcommits(np, 1);
		} else {
			SET(bp->nb_flags, NB_ASYNC);
			lck_mtx_unlock(nfs_buf_mutex);
			nfs_buf_write(bp);
		}
		i++;
		lck_mtx_lock(nfs_buf_mutex);
	}
}

/*
 * thread to service the delayed write queue when asked
 */
void
nfs_buf_delwri_thread(__unused void *arg, __unused wait_result_t wr)
{
	struct timespec ts = { .tv_sec = 30, .tv_nsec = 0 };
	int error = 0;

	lck_mtx_lock(nfs_buf_mutex);
	while (!error) {
		nfs_buf_delwri_service();
		error = msleep(&nfsbufdelwrithd, nfs_buf_mutex, 0, "nfsbufdelwri", &ts);
	}
	nfsbufdelwrithd = NULL;
	lck_mtx_unlock(nfs_buf_mutex);
	/* nfsbufdelwrithd was just cleared above, so terminate ourselves */
	thread_terminate(current_thread());
}

/*
 * try to push out some delayed/uncommitted writes
 * ("locked" indicates whether nfs_buf_mutex is already held)
 */
void
nfs_buf_delwri_push(int locked)
{
	if (TAILQ_EMPTY(&nfsbufdelwri)) {
		return;
	}
	if (!locked) {
		lck_mtx_lock(nfs_buf_mutex);
	}
	/* wake up the delayed write service thread */
	if (nfsbufdelwrithd) {
		wakeup(&nfsbufdelwrithd);
	} else if (kernel_thread_start(nfs_buf_delwri_thread, NULL, &nfsbufdelwrithd) == KERN_SUCCESS) {
		thread_deallocate(nfsbufdelwrithd);
	}
	/* otherwise, try to do some of the work ourselves */
	if (!nfsbufdelwrithd) {
		nfs_buf_delwri_service();
	}
	if (!locked) {
		lck_mtx_unlock(nfs_buf_mutex);
	}
}

/*
 * Get an nfs buffer.
 *
 * Returns errno on error, 0 otherwise.
 * Any buffer is returned in *bpp.
 *
 * If NBLK_ONLYVALID is set, only return buffer if found in cache.
 * If NBLK_NOWAIT is set, don't wait for the buffer if it's marked BUSY.
 *
 * Check for existence of buffer in cache.
 * Or attempt to reuse a buffer from one of the free lists.
 * Or allocate a new buffer if we haven't already hit max allocation.
 * Or wait for a free buffer.
 *
 * If available buffer found, prepare it, and return it.
 *
 * If the calling process is interrupted by a signal for
 * an interruptible mount point, return EINTR.
 */
int
nfs_buf_get(
	nfsnode_t np,
	daddr64_t blkno,
	uint32_t size,
	thread_t thd,
	int flags,
	struct nfsbuf **bpp)
{
	vnode_t vp = NFSTOV(np);
	struct nfsmount *nmp = VTONMP(vp);
	struct nfsbuf *bp;
	uint32_t bufsize;
	int slpflag = PCATCH;
	int operation = (flags & NBLK_OPMASK);
	int error = 0;
	struct timespec ts;

	FSDBG_TOP(541, np, blkno, size, flags);
	*bpp = NULL;

	bufsize = size;
	if (bufsize > NFS_MAXBSIZE) {
		panic("nfs_buf_get: buffer larger than NFS_MAXBSIZE requested");
	}

	if (nfs_mount_gone(nmp)) {
		FSDBG_BOT(541, np, blkno, 0, ENXIO);
		return ENXIO;
	}

	if (!UBCINFOEXISTS(vp)) {
		operation = NBLK_META;
	} else if (bufsize < (uint32_t)nmp->nm_biosize) {
		/* reg files should always have biosize blocks */
		bufsize = nmp->nm_biosize;
	}

	/* if NBLK_WRITE, check for too many delayed/uncommitted writes */
	if ((operation == NBLK_WRITE) && (nfs_nbdwrite > NFS_A_LOT_OF_DELAYED_WRITES)) {
		FSDBG_TOP(542, np, blkno, nfs_nbdwrite, NFS_A_LOT_OF_DELAYED_WRITES);

		/* poke the delwri list */
		nfs_buf_delwri_push(0);

		/* sleep to let other threads run... */
		tsleep(&nfs_nbdwrite, PCATCH, "nfs_nbdwrite", 1);
		FSDBG_BOT(542, np, blkno, nfs_nbdwrite, NFS_A_LOT_OF_DELAYED_WRITES);
	}

loop:
	lck_mtx_lock(nfs_buf_mutex);

	/* wait for any buffer invalidation/flushing to complete */
	while (np->n_bflag & NBINVALINPROG) {
		np->n_bflag |= NBINVALWANT;
		ts.tv_sec = 2;
		ts.tv_nsec = 0;
		msleep(&np->n_bflag, nfs_buf_mutex, slpflag, "nfs_buf_get_invalwait", &ts);
		if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0))) {
			lck_mtx_unlock(nfs_buf_mutex);
			FSDBG_BOT(541, np, blkno, 0, error);
			return error;
		}
		if (np->n_bflag & NBINVALINPROG) {
			slpflag = 0;
		}
	}

	/* check for existence of nfsbuf in cache */
	if ((bp = nfs_buf_incore(np, blkno))) {
		/* if busy, set wanted and wait */
		if (ISSET(bp->nb_lflags, NBL_BUSY)) {
			if (flags & NBLK_NOWAIT) {
				lck_mtx_unlock(nfs_buf_mutex);
				FSDBG_BOT(541, np, blkno, bp, 0xbcbcbcbc);
				return 0;
			}
			FSDBG_TOP(543, np, blkno, bp, bp->nb_flags);
			SET(bp->nb_lflags, NBL_WANTED);

			ts.tv_sec = 2;
			ts.tv_nsec = 0;
			msleep(bp, nfs_buf_mutex, slpflag | (PRIBIO + 1) | PDROP,
			    "nfsbufget", (slpflag == PCATCH) ? NULL : &ts);
			slpflag = 0;
			FSDBG_BOT(543, np, blkno, bp, bp->nb_flags);
			if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0))) {
				FSDBG_BOT(541, np, blkno, 0, error);
				return error;
			}
			goto loop;
		}
		if (bp->nb_bufsize != bufsize) {
			panic("nfsbuf size mismatch");
		}
		SET(bp->nb_lflags, NBL_BUSY);
		SET(bp->nb_flags, NB_CACHE);
		nfs_buf_remfree(bp);
		/* additional paranoia: */
		if (ISSET(bp->nb_flags, NB_PAGELIST)) {
			panic("pagelist buffer was not busy");
		}
		goto buffer_setup;
	}

	if (flags & NBLK_ONLYVALID) {
		lck_mtx_unlock(nfs_buf_mutex);
		FSDBG_BOT(541, np, blkno, 0, 0x0000cace);
		return 0;
	}

	/*
	 * where to get a free buffer:
	 * - if meta and maxmeta reached, must reuse meta
	 * - alloc new if we haven't reached min bufs
	 * - if free lists are NOT empty
	 *   - if free list is stale, use it
	 *   - else if freemeta list is stale, use it
	 *   - else if max bufs allocated, use least-time-to-stale
	 * - alloc new if we haven't reached max allowed
	 * - start clearing out delwri list and try again
	 */

	if ((operation == NBLK_META) && (nfsbufmetacnt >= nfsbufmetamax)) {
		/* if we've hit max meta buffers, must reuse a meta buffer */
		bp = TAILQ_FIRST(&nfsbuffreemeta);
	} else if ((nfsbufcnt > nfsbufmin) &&
	    (!TAILQ_EMPTY(&nfsbuffree) || !TAILQ_EMPTY(&nfsbuffreemeta))) {
		/* try to pull an nfsbuf off a free list */
		struct nfsbuf *lrubp, *metabp;
		struct timeval now;
		microuptime(&now);

		/* if the next LRU or META buffer is invalid or stale, use it */
		lrubp = TAILQ_FIRST(&nfsbuffree);
		if (lrubp && (!NBUFSTAMPVALID(lrubp) ||
		    ((lrubp->nb_timestamp + NFSBUF_LRU_STALE) < now.tv_sec))) {
			bp = lrubp;
		}
		metabp = TAILQ_FIRST(&nfsbuffreemeta);
		if (!bp && metabp && (!NBUFSTAMPVALID(metabp) ||
		    ((metabp->nb_timestamp + NFSBUF_META_STALE) < now.tv_sec))) {
			bp = metabp;
		}

		if (!bp && (nfsbufcnt >= nfsbufmax)) {
			/* we've already allocated all bufs, so */
			/* choose the buffer that'll go stale first */
			if (!metabp) {
				bp = lrubp;
			} else if (!lrubp) {
				bp = metabp;
			} else {
				int32_t lru_stale_time, meta_stale_time;
				lru_stale_time = lrubp->nb_timestamp + NFSBUF_LRU_STALE;
				meta_stale_time = metabp->nb_timestamp + NFSBUF_META_STALE;
				if (lru_stale_time <= meta_stale_time) {
					bp = lrubp;
				} else {
					bp = metabp;
				}
			}
		}
	}

	if (bp) {
		/* we have a buffer to reuse */
		FSDBG(544, np, blkno, bp, bp->nb_flags);
		nfs_buf_remfree(bp);
		if (ISSET(bp->nb_flags, NB_DELWRI)) {
			panic("nfs_buf_get: delwri");
		}
		SET(bp->nb_lflags, NBL_BUSY);
		/* disassociate buffer from previous nfsnode */
		if (bp->nb_np) {
			if (bp->nb_vnbufs.le_next != NFSNOLIST) {
				LIST_REMOVE(bp, nb_vnbufs);
				bp->nb_vnbufs.le_next = NFSNOLIST;
			}
			bp->nb_np = NULL;
		}
		LIST_REMOVE(bp, nb_hash);
		/* nuke any creds we're holding */
		if (IS_VALID_CRED(bp->nb_rcred)) {
			kauth_cred_unref(&bp->nb_rcred);
		}
		if (IS_VALID_CRED(bp->nb_wcred)) {
			kauth_cred_unref(&bp->nb_wcred);
		}
		/* if buf will no longer be NB_META, dump old buffer */
		if (operation == NBLK_META) {
			if (!ISSET(bp->nb_flags, NB_META)) {
				nfsbufmetacnt++;
			}
		} else if (ISSET(bp->nb_flags, NB_META)) {
			if (bp->nb_data) {
				kfree(bp->nb_data, bp->nb_bufsize);
				bp->nb_data = NULL;
			}
			nfsbufmetacnt--;
		}
		/* re-init buf fields */
		bp->nb_error = 0;
		bp->nb_validoff = bp->nb_validend = -1;
		bp->nb_dirtyoff = bp->nb_dirtyend = 0;
		bp->nb_valid = 0;
		bp->nb_dirty = 0;
		bp->nb_verf = 0;
	} else {
		/* no buffer to reuse */
		if ((nfsbufcnt < nfsbufmax) &&
		    ((operation != NBLK_META) || (nfsbufmetacnt < nfsbufmetamax))) {
			/* just alloc a new one */
			MALLOC(bp, struct nfsbuf *, sizeof(struct nfsbuf), M_TEMP, M_WAITOK);
			if (!bp) {
				lck_mtx_unlock(nfs_buf_mutex);
				FSDBG_BOT(541, np, blkno, 0, error);
				return ENOMEM;
			}
			nfsbufcnt++;

			/*
			 * If any excess bufs, make sure the timer
			 * is running to free them up later.
			 */
			if (nfsbufcnt > nfsbufmin && !nfs_buf_timer_on) {
				nfs_buf_timer_on = 1;
				nfs_interval_timer_start(nfs_buf_timer_call,
				    NFSBUF_FREE_PERIOD * 1000);
			}

			if (operation == NBLK_META) {
				nfsbufmetacnt++;
			}
			NFSBUFCNTCHK();
			/* init nfsbuf */
			bzero(bp, sizeof(*bp));
			os_ref_init(&bp->nb_refs, NULL);

			bp->nb_free.tqe_next = NFSNOLIST;
			bp->nb_validoff = bp->nb_validend = -1;
			FSDBG(545, np, blkno, bp, 0);
		} else {
			/* too many bufs... wait for buffers to free up */
			FSDBG_TOP(546, np, blkno, nfsbufcnt, nfsbufmax);

			/* poke the delwri list */
			nfs_buf_delwri_push(1);

			nfsneedbuffer = 1;
			msleep(&nfsneedbuffer, nfs_buf_mutex, PCATCH | PDROP, "nfsbufget", NULL);
			FSDBG_BOT(546, np, blkno, nfsbufcnt, nfsbufmax);
			if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0))) {
				FSDBG_BOT(541, np, blkno, 0, error);
				return error;
			}
			goto loop;
		}
	}

	/* set up nfsbuf */
	SET(bp->nb_lflags, NBL_BUSY);
	bp->nb_flags = 0;
	bp->nb_lblkno = blkno;
	/* insert buf in hash */
	LIST_INSERT_HEAD(NFSBUFHASH(np, blkno), bp, nb_hash);
	/* associate buffer with new nfsnode */
	bp->nb_np = np;
	LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);

buffer_setup:

	/* unlock hash */
	lck_mtx_unlock(nfs_buf_mutex);

	switch (operation) {
	case NBLK_META:
		SET(bp->nb_flags, NB_META);
		if ((bp->nb_bufsize != bufsize) && bp->nb_data) {
			kfree(bp->nb_data, bp->nb_bufsize);
			bp->nb_data = NULL;
			bp->nb_validoff = bp->nb_validend = -1;
			bp->nb_dirtyoff = bp->nb_dirtyend = 0;
			bp->nb_valid = 0;
			bp->nb_dirty = 0;
			CLR(bp->nb_flags, NB_CACHE);
		}
		if (!bp->nb_data) {
			bp->nb_data = kalloc(bufsize);
		}
		if (!bp->nb_data) {
			/* Ack! couldn't allocate the data buffer! */
			/* clean up buffer and return error */
			lck_mtx_lock(nfs_buf_mutex);
			LIST_REMOVE(bp, nb_vnbufs);
			bp->nb_vnbufs.le_next = NFSNOLIST;
			bp->nb_np = NULL;
			/* invalidate usage timestamp to allow immediate freeing */
			NBUFSTAMPINVALIDATE(bp);
			if (bp->nb_free.tqe_next != NFSNOLIST) {
				panic("nfsbuf on freelist");
			}
			TAILQ_INSERT_HEAD(&nfsbuffree, bp, nb_free);
			nfsbuffreecnt++;
			lck_mtx_unlock(nfs_buf_mutex);
			FSDBG_BOT(541, np, blkno, 0xb00, ENOMEM);
			return ENOMEM;
		}
		bp->nb_bufsize = bufsize;
		break;

	case NBLK_READ:
	case NBLK_WRITE:
		/*
		 * Set or clear NB_READ now to let the UPL subsystem know
		 * if we intend to modify the pages or not.
		 */
		if (operation == NBLK_READ) {
			SET(bp->nb_flags, NB_READ);
		} else {
			CLR(bp->nb_flags, NB_READ);
		}
		if (bufsize < PAGE_SIZE) {
			bufsize = PAGE_SIZE;
		}
		bp->nb_bufsize = bufsize;
		bp->nb_validoff = bp->nb_validend = -1;

		if (UBCINFOEXISTS(vp)) {
			/* set up upl */
			if (nfs_buf_upl_setup(bp)) {
				/* unable to create upl */
				/* vm object must no longer exist */
				/* clean up buffer and return error */
				lck_mtx_lock(nfs_buf_mutex);
				LIST_REMOVE(bp, nb_vnbufs);
				bp->nb_vnbufs.le_next = NFSNOLIST;
				bp->nb_np = NULL;
				/* invalidate usage timestamp to allow immediate freeing */
				NBUFSTAMPINVALIDATE(bp);
				if (bp->nb_free.tqe_next != NFSNOLIST) {
					panic("nfsbuf on freelist");
				}
				TAILQ_INSERT_HEAD(&nfsbuffree, bp, nb_free);
				nfsbuffreecnt++;
				lck_mtx_unlock(nfs_buf_mutex);
				FSDBG_BOT(541, np, blkno, 0x2bc, EIO);
				return EIO;
			}
			nfs_buf_upl_check(bp);
		}
		break;

	default:
		panic("nfs_buf_get: %d unknown operation", operation);
	}

	*bpp = bp;

	FSDBG_BOT(541, np, blkno, bp, bp->nb_flags);

	return 0;
}
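
/*
 * A minimal usage sketch for nfs_buf_get()/nfs_buf_release() (illustrative
 * only, not part of the original source; compiled out).  The helper name
 * is hypothetical and error handling is abbreviated:
 */
#if 0
static int
example_touch_block(nfsnode_t np, daddr64_t lbn, thread_t thd)
{
	struct nfsbuf *bp = NULL;
	int error;

	/* look up (or create) the block's buffer, waiting if it's busy */
	error = nfs_buf_get(np, lbn, NFSTONMP(np)->nm_biosize, thd, NBLK_READ, &bp);
	if (error) {
		return error;
	}
	/* bp is returned NBL_BUSY; map it before touching bp->nb_data */
	NFS_BUF_MAP(bp);
	/* ... examine or fill bp->nb_data here ... */
	nfs_buf_release(bp, 1);
	return 0;
}
#endif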

void
nfs_buf_release(struct nfsbuf *bp, int freeup)
{
	nfsnode_t np = bp->nb_np;
	vnode_t vp;
	struct timeval now;
	int wakeup_needbuffer, wakeup_buffer, wakeup_nbdwrite;

	FSDBG_TOP(548, bp, NBOFF(bp), bp->nb_flags, bp->nb_data);
	FSDBG(548, bp->nb_validoff, bp->nb_validend, bp->nb_dirtyoff, bp->nb_dirtyend);
	FSDBG(548, bp->nb_valid, 0, bp->nb_dirty, 0);

	vp = np ? NFSTOV(np) : NULL;
	if (vp && UBCINFOEXISTS(vp) && bp->nb_bufsize) {
		int upl_flags, rv;
		upl_t upl;
		uint32_t i;

		if (!ISSET(bp->nb_flags, NB_PAGELIST) && !ISSET(bp->nb_flags, NB_INVAL)) {
			rv = nfs_buf_upl_setup(bp);
			if (rv) {
				printf("nfs_buf_release: upl create failed %d\n", rv);
			} else {
				nfs_buf_upl_check(bp);
			}
		}
		upl = bp->nb_pagelist;
		if (!upl) {
			goto pagelist_cleanup_done;
		}
		if (bp->nb_data) {
			if (ubc_upl_unmap(upl) != KERN_SUCCESS) {
				panic("ubc_upl_unmap failed");
			}
			bp->nb_data = NULL;
		}
		/*
		 * Abort the pages on error or: if this is an invalid or
		 * non-needcommit nocache buffer AND no pages are dirty.
		 */
		if (ISSET(bp->nb_flags, NB_ERROR) || (!bp->nb_dirty && (ISSET(bp->nb_flags, NB_INVAL) ||
		    (ISSET(bp->nb_flags, NB_NOCACHE) && !ISSET(bp->nb_flags, (NB_NEEDCOMMIT | NB_DELWRI)))))) {
			if (ISSET(bp->nb_flags, (NB_READ | NB_INVAL | NB_NOCACHE))) {
				upl_flags = UPL_ABORT_DUMP_PAGES;
			} else {
				upl_flags = 0;
			}
			ubc_upl_abort(upl, upl_flags);
			goto pagelist_cleanup_done;
		}
		for (i = 0; i <= (bp->nb_bufsize - 1) / PAGE_SIZE; i++) {
			if (!NBPGVALID(bp, i)) {
				ubc_upl_abort_range(upl,
				    i * PAGE_SIZE, PAGE_SIZE,
				    UPL_ABORT_DUMP_PAGES |
				    UPL_ABORT_FREE_ON_EMPTY);
			} else {
				if (NBPGDIRTY(bp, i)) {
					upl_flags = UPL_COMMIT_SET_DIRTY;
				} else {
					upl_flags = UPL_COMMIT_CLEAR_DIRTY;
				}

				if (!ISSET(bp->nb_flags, (NB_NEEDCOMMIT | NB_DELWRI))) {
					upl_flags |= UPL_COMMIT_CLEAR_PRECIOUS;
				}

				ubc_upl_commit_range(upl,
				    i * PAGE_SIZE, PAGE_SIZE,
				    upl_flags |
				    UPL_COMMIT_INACTIVATE |
				    UPL_COMMIT_FREE_ON_EMPTY);
			}
		}
pagelist_cleanup_done:
		/* invalidate any pages past EOF */
		if (NBOFF(bp) + bp->nb_bufsize > (off_t)(np->n_size)) {
			off_t start, end;
			start = trunc_page_64(np->n_size) + PAGE_SIZE_64;
			end = trunc_page_64(NBOFF(bp) + bp->nb_bufsize);
			if (start < NBOFF(bp)) {
				start = NBOFF(bp);
			}
			if (end > start) {
				if ((rv = ubc_msync(vp, start, end, NULL, UBC_INVALIDATE))) {
					printf("nfs_buf_release(): ubc_msync failed!, error %d\n", rv);
				}
			}
		}
		CLR(bp->nb_flags, NB_PAGELIST);
		bp->nb_pagelist = NULL;
	}

	lck_mtx_lock(nfs_buf_mutex);

	wakeup_needbuffer = wakeup_buffer = wakeup_nbdwrite = 0;

	/* Wake up any processes waiting for any buffer to become free. */
	if (nfsneedbuffer) {
		nfsneedbuffer = 0;
		wakeup_needbuffer = 1;
	}
	/* Wake up any processes waiting for _this_ buffer to become free. */
	if (ISSET(bp->nb_lflags, NBL_WANTED)) {
		CLR(bp->nb_lflags, NBL_WANTED);
		wakeup_buffer = 1;
	}

	/* If it's non-needcommit nocache, or an error, mark it invalid. */
	if (ISSET(bp->nb_flags, NB_ERROR) ||
	    (ISSET(bp->nb_flags, NB_NOCACHE) && !ISSET(bp->nb_flags, (NB_NEEDCOMMIT | NB_DELWRI)))) {
		SET(bp->nb_flags, NB_INVAL);
	}

	if ((bp->nb_bufsize <= 0) || ISSET(bp->nb_flags, NB_INVAL)) {
		/* If it's invalid or empty, dissociate it from its nfsnode */
		if (bp->nb_vnbufs.le_next != NFSNOLIST) {
			LIST_REMOVE(bp, nb_vnbufs);
			bp->nb_vnbufs.le_next = NFSNOLIST;
		}
		bp->nb_np = NULL;
		/* if this was a delayed write, wakeup anyone */
		/* waiting for delayed writes to complete */
		if (ISSET(bp->nb_flags, NB_DELWRI)) {
			CLR(bp->nb_flags, NB_DELWRI);
			nfs_nbdwrite--;
			NFSBUFCNTCHK();
			wakeup_nbdwrite = 1;
		}
		/* invalidate usage timestamp to allow immediate freeing */
		NBUFSTAMPINVALIDATE(bp);
		/* put buffer at head of free list */
		if (bp->nb_free.tqe_next != NFSNOLIST) {
			panic("nfsbuf on freelist");
		}
		SET(bp->nb_flags, NB_INVAL);
		if (ISSET(bp->nb_flags, NB_META)) {
			TAILQ_INSERT_HEAD(&nfsbuffreemeta, bp, nb_free);
			nfsbuffreemetacnt++;
		} else {
			TAILQ_INSERT_HEAD(&nfsbuffree, bp, nb_free);
			nfsbuffreecnt++;
		}
	} else if (ISSET(bp->nb_flags, NB_DELWRI)) {
		/* put buffer at end of delwri list */
		if (bp->nb_free.tqe_next != NFSNOLIST) {
			panic("nfsbuf on freelist");
		}
		TAILQ_INSERT_TAIL(&nfsbufdelwri, bp, nb_free);
		nfsbufdelwricnt++;
		freeup = 0;
	} else {
		/* update usage timestamp */
		microuptime(&now);
		bp->nb_timestamp = now.tv_sec;
		/* put buffer at end of free list */
		if (bp->nb_free.tqe_next != NFSNOLIST) {
			panic("nfsbuf on freelist");
		}
		if (ISSET(bp->nb_flags, NB_META)) {
			TAILQ_INSERT_TAIL(&nfsbuffreemeta, bp, nb_free);
			nfsbuffreemetacnt++;
		} else {
			TAILQ_INSERT_TAIL(&nfsbuffree, bp, nb_free);
			nfsbuffreecnt++;
		}
	}

	NFSBUFCNTCHK();

	/* Unlock the buffer. */
	CLR(bp->nb_flags, (NB_ASYNC | NB_STABLE));
	CLR(bp->nb_lflags, NBL_BUSY);

	FSDBG_BOT(548, bp, NBOFF(bp), bp->nb_flags, bp->nb_data);

	lck_mtx_unlock(nfs_buf_mutex);

	if (wakeup_needbuffer) {
		wakeup(&nfsneedbuffer);
	}
	if (wakeup_buffer) {
		wakeup(bp);
	}
	if (wakeup_nbdwrite) {
		wakeup(&nfs_nbdwrite);
	}
	if (freeup) {
		NFS_BUF_FREEUP();
	}
}

/*
 * Wait for operations on the buffer to complete.
 * When they do, extract and return the I/O's error value.
 */
int
nfs_buf_iowait(struct nfsbuf *bp)
{
	FSDBG_TOP(549, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);

	lck_mtx_lock(nfs_buf_mutex);

	while (!ISSET(bp->nb_flags, NB_DONE)) {
		msleep(bp, nfs_buf_mutex, PRIBIO + 1, "nfs_buf_iowait", NULL);
	}

	lck_mtx_unlock(nfs_buf_mutex);

	FSDBG_BOT(549, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);

	/* check for interruption of I/O, then errors. */
	if (ISSET(bp->nb_flags, NB_EINTR)) {
		CLR(bp->nb_flags, NB_EINTR);
		return EINTR;
	} else if (ISSET(bp->nb_flags, NB_ERROR)) {
		return bp->nb_error ? bp->nb_error : EIO;
	}
	return 0;
}
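
/*
 * Typical shape of a synchronous submission around nfs_buf_iowait()
 * (an illustrative sketch, not part of the original source; compiled
 * out).  A caller that submits a buffer without NB_ASYNC sleeps here
 * until the I/O path calls nfs_buf_iodone(), then collects the error:
 */
#if 0
	CLR(bp->nb_flags, NB_ASYNC);    /* we'll wait for completion ourselves */
	error = nfs_buf_read(bp);       /* issue the READ RPC(s) */
	if (!error) {
		error = nfs_buf_iowait(bp);     /* returns once NB_DONE is set */
	}
#endif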

/*
 * Mark I/O complete on a buffer.
 */
void
nfs_buf_iodone(struct nfsbuf *bp)
{
	FSDBG_TOP(550, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);

	if (ISSET(bp->nb_flags, NB_DONE)) {
		panic("nfs_buf_iodone already");
	}

	if (!ISSET(bp->nb_flags, NB_READ)) {
		CLR(bp->nb_flags, NB_WRITEINPROG);
		/*
		 * vnode_writedone() takes care of waking up
		 * any throttled write operations
		 */
		vnode_writedone(NFSTOV(bp->nb_np));
		nfs_node_lock_force(bp->nb_np);
		bp->nb_np->n_numoutput--;
		nfs_node_unlock(bp->nb_np);
	}
	if (ISSET(bp->nb_flags, NB_ASYNC)) {    /* if async, release it */
		SET(bp->nb_flags, NB_DONE);     /* note that it's done */
		nfs_buf_release(bp, 1);
	} else {                                /* or just wakeup the buffer */
		lck_mtx_lock(nfs_buf_mutex);
		SET(bp->nb_flags, NB_DONE);     /* note that it's done */
		CLR(bp->nb_lflags, NBL_WANTED);
		lck_mtx_unlock(nfs_buf_mutex);
		wakeup(bp);
	}

	FSDBG_BOT(550, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);
}

void
nfs_buf_write_delayed(struct nfsbuf *bp)
{
	nfsnode_t np = bp->nb_np;

	FSDBG_TOP(551, bp, NBOFF(bp), bp->nb_flags, 0);
	FSDBG(551, bp, bp->nb_dirtyoff, bp->nb_dirtyend, bp->nb_dirty);

	/*
	 * If the block hasn't been seen before:
	 *	(1) Mark it as having been seen,
	 *	(2) Make sure it's on its node's correct block list,
	 */
	if (!ISSET(bp->nb_flags, NB_DELWRI)) {
		SET(bp->nb_flags, NB_DELWRI);
		/* move to dirty list */
		lck_mtx_lock(nfs_buf_mutex);
		nfs_nbdwrite++;
		NFSBUFCNTCHK();
		if (bp->nb_vnbufs.le_next != NFSNOLIST) {
			LIST_REMOVE(bp, nb_vnbufs);
		}
		LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
		lck_mtx_unlock(nfs_buf_mutex);
	}

	/*
	 * If the vnode has "too many" write operations in progress
	 * wait for them to finish the IO
	 */
	vnode_waitforwrites(NFSTOV(np), VNODE_ASYNC_THROTTLE, 0, 0, "nfs_buf_write_delayed");

	/* the file is in a modified state, so make sure the flag's set */
	nfs_node_lock_force(np);
	np->n_flag |= NMODIFIED;
	nfs_node_unlock(np);

	/*
	 * If we have too many delayed write buffers,
	 * just fall back to doing the async write.
	 */
	if (nfs_nbdwrite < 0) {
		panic("nfs_buf_write_delayed: Negative nfs_nbdwrite");
	}
	if (nfs_nbdwrite > NFS_A_LOT_OF_DELAYED_WRITES) {
		/* issue async write */
		SET(bp->nb_flags, NB_ASYNC);
		nfs_buf_write(bp);
		FSDBG_BOT(551, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);
		return;
	}

	/* Otherwise, the "write" is done, so mark and release the buffer. */
	SET(bp->nb_flags, NB_DONE);
	nfs_buf_release(bp, 1);
	FSDBG_BOT(551, bp, NBOFF(bp), bp->nb_flags, 0);
	return;
}

/*
 * Check that a "needcommit" buffer can still be committed.
 * If the write verifier has changed, we need to clear the
 * needcommit flag.
 */
void
nfs_buf_check_write_verifier(nfsnode_t np, struct nfsbuf *bp)
{
	struct nfsmount *nmp;

	if (!ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
		return;
	}

	nmp = NFSTONMP(np);
	if (nfs_mount_gone(nmp)) {
		return;
	}
	if (!ISSET(bp->nb_flags, NB_STALEWVERF) && (bp->nb_verf == nmp->nm_verf)) {
		return;
	}

	/* write verifier changed, clear commit/wverf flags */
	CLR(bp->nb_flags, (NB_NEEDCOMMIT | NB_STALEWVERF));
	bp->nb_verf = 0;
	nfs_node_lock_force(np);
	np->n_needcommitcnt--;
	CHECK_NEEDCOMMITCNT(np);
	nfs_node_unlock(np);
}

/*
 * add a reference to a buffer so it doesn't disappear while being used
 * (must be called with nfs_buf_mutex held)
 */
void
nfs_buf_refget(struct nfsbuf *bp)
{
	os_ref_retain_locked(&bp->nb_refs);
}
/*
 * release a reference on a buffer
 * (must be called with nfs_buf_mutex held)
 */
void
nfs_buf_refrele(struct nfsbuf *bp)
{
	(void) os_ref_release_locked(&bp->nb_refs);
}

/*
 * mark a particular buffer as BUSY
 * (must be called with nfs_buf_mutex held)
 */
errno_t
nfs_buf_acquire(struct nfsbuf *bp, int flags, int slpflag, int slptimeo)
{
	errno_t error;
	struct timespec ts;

	if (ISSET(bp->nb_lflags, NBL_BUSY)) {
		/*
		 * since the lck_mtx_lock may block, the buffer
		 * may become BUSY, so we need to recheck for
		 * a NOWAIT request
		 */
		if (flags & NBAC_NOWAIT) {
			return EBUSY;
		}
		SET(bp->nb_lflags, NBL_WANTED);

		ts.tv_sec = (slptimeo / 100);
		/* hz is 100, so each tick is 10ms: e.g. slptimeo = 250 -> 2s + 500ms */
		ts.tv_nsec = (slptimeo % 100) * 10 * NSEC_PER_USEC * 1000;

		error = msleep(bp, nfs_buf_mutex, slpflag | (PRIBIO + 1),
		    "nfs_buf_acquire", &ts);
		if (error) {
			return error;
		}
		return EAGAIN;
	}
	if (flags & NBAC_REMOVE) {
		nfs_buf_remfree(bp);
	}
	SET(bp->nb_lflags, NBL_BUSY);

	return 0;
}

/*
 * simply drop the BUSY status of a buffer
 * (must be called with nfs_buf_mutex held)
 */
void
nfs_buf_drop(struct nfsbuf *bp)
{
	int need_wakeup = 0;

	if (!ISSET(bp->nb_lflags, NBL_BUSY)) {
		panic("nfs_buf_drop: buffer not busy!");
	}
	if (ISSET(bp->nb_lflags, NBL_WANTED)) {
		/* delay the actual wakeup until after we clear NBL_BUSY */
		need_wakeup = 1;
	}
	/* Unlock the buffer. */
	CLR(bp->nb_lflags, (NBL_BUSY | NBL_WANTED));

	if (need_wakeup) {
		wakeup(bp);
	}
}
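
/*
 * The canonical busy/ref pattern around nfs_buf_acquire()/nfs_buf_drop()
 * (illustrative sketch, not part of the original source; compiled out).
 * The extra reference keeps the nfsbuf from being freed while we sleep
 * waiting to acquire it:
 */
#if 0
	lck_mtx_lock(nfs_buf_mutex);
	nfs_buf_refget(bp);
	while ((error = nfs_buf_acquire(bp, 0, 0, 0)) == EAGAIN) {
		;       /* lost a race after the sleep; retry until busy or error */
	}
	nfs_buf_refrele(bp);
	if (!error) {
		/* ... bp is now NBL_BUSY; use it, then ... */
		nfs_buf_drop(bp);
	}
	lck_mtx_unlock(nfs_buf_mutex);
#endif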

/*
 * prepare for iterating over an nfsnode's buffer list
 * this lock protects the queue manipulation
 * (must be called with nfs_buf_mutex held)
 */
int
nfs_buf_iterprepare(nfsnode_t np, struct nfsbuflists *iterheadp, int flags)
{
	struct nfsbuflists *listheadp;

	if (flags & NBI_DIRTY) {
		listheadp = &np->n_dirtyblkhd;
	} else {
		listheadp = &np->n_cleanblkhd;
	}

	if ((flags & NBI_NOWAIT) && (np->n_bufiterflags & NBI_ITER)) {
		LIST_INIT(iterheadp);
		return EWOULDBLOCK;
	}

	while (np->n_bufiterflags & NBI_ITER) {
		np->n_bufiterflags |= NBI_ITERWANT;
		msleep(&np->n_bufiterflags, nfs_buf_mutex, 0, "nfs_buf_iterprepare", NULL);
	}
	if (LIST_EMPTY(listheadp)) {
		LIST_INIT(iterheadp);
		return EINVAL;
	}
	np->n_bufiterflags |= NBI_ITER;

	iterheadp->lh_first = listheadp->lh_first;
	listheadp->lh_first->nb_vnbufs.le_prev = &iterheadp->lh_first;
	LIST_INIT(listheadp);

	return 0;
}

/*
 * clean up after iterating over an nfsnode's buffer list
 * this lock protects the queue manipulation
 * (must be called with nfs_buf_mutex held)
 */
void
nfs_buf_itercomplete(nfsnode_t np, struct nfsbuflists *iterheadp, int flags)
{
	struct nfsbuflists * listheadp;
	struct nfsbuf *bp;

	if (flags & NBI_DIRTY) {
		listheadp = &np->n_dirtyblkhd;
	} else {
		listheadp = &np->n_cleanblkhd;
	}

	while (!LIST_EMPTY(iterheadp)) {
		bp = LIST_FIRST(iterheadp);
		LIST_REMOVE(bp, nb_vnbufs);
		LIST_INSERT_HEAD(listheadp, bp, nb_vnbufs);
	}

	np->n_bufiterflags &= ~NBI_ITER;
	if (np->n_bufiterflags & NBI_ITERWANT) {
		np->n_bufiterflags &= ~NBI_ITERWANT;
		wakeup(&np->n_bufiterflags);
	}
}
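
/*
 * Sketch of the iterate pattern these two functions bracket (illustrative,
 * not part of the original source; compiled out).  The buffers are moved
 * onto a private iteration list so other threads can't re-queue them
 * underneath us mid-walk:
 */
#if 0
	struct nfsbuflists blist;
	struct nfsbuf *bp;

	lck_mtx_lock(nfs_buf_mutex);
	if (!nfs_buf_iterprepare(np, &blist, NBI_DIRTY)) {
		while ((bp = LIST_FIRST(&blist))) {
			LIST_REMOVE(bp, nb_vnbufs);
			LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
			/* ... examine or flush bp ... */
		}
		nfs_buf_itercomplete(np, &blist, NBI_DIRTY);
	}
	lck_mtx_unlock(nfs_buf_mutex);
#endif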


/*
 * Read an NFS buffer for a file.
 */
int
nfs_buf_read(struct nfsbuf *bp)
{
	int error = 0;
	nfsnode_t np;
	thread_t thd;
	kauth_cred_t cred;

	np = bp->nb_np;
	cred = bp->nb_rcred;
	if (IS_VALID_CRED(cred)) {
		kauth_cred_ref(cred);
	}
	thd = ISSET(bp->nb_flags, NB_ASYNC) ? NULL : current_thread();

	/* sanity checks */
	if (!ISSET(bp->nb_flags, NB_READ)) {
		panic("nfs_buf_read: !NB_READ");
	}
	if (ISSET(bp->nb_flags, NB_DONE)) {
		CLR(bp->nb_flags, NB_DONE);
	}

	NFS_BUF_MAP(bp);

	OSAddAtomic64(1, &nfsstats.read_bios);

	error = nfs_buf_read_rpc(bp, thd, cred);
	/*
	 * For async I/O, the callbacks will finish up the
	 * read.  Otherwise, the read has already been finished.
	 */

	if (IS_VALID_CRED(cred)) {
		kauth_cred_unref(&cred);
	}
	return error;
}

/*
 * finish the reading of a buffer
 */
void
nfs_buf_read_finish(struct nfsbuf *bp)
{
	nfsnode_t np = bp->nb_np;
	struct nfsmount *nmp;

	if (!ISSET(bp->nb_flags, NB_ERROR)) {
		/* update valid range */
		bp->nb_validoff = 0;
		bp->nb_validend = bp->nb_endio;
		if (bp->nb_endio < (int)bp->nb_bufsize) {
			/*
			 * The read may be short because we have unflushed writes
			 * that are extending the file size and the reads hit the
			 * (old) EOF on the server.  So, just make sure nb_validend
			 * correctly tracks EOF.
			 * Note that the missing data should have already been zeroed
			 * in nfs_buf_read_rpc_finish().
			 */
			off_t boff = NBOFF(bp);
			if ((off_t)np->n_size >= (boff + bp->nb_bufsize)) {
				bp->nb_validend = bp->nb_bufsize;
			} else if ((off_t)np->n_size >= boff) {
				bp->nb_validend = np->n_size - boff;
			} else {
				bp->nb_validend = 0;
			}
		}
		if ((nmp = NFSTONMP(np)) && (nmp->nm_vers == NFS_VER2) &&
		    ((NBOFF(bp) + bp->nb_validend) > 0x100000000LL)) {
			bp->nb_validend = 0x100000000LL - NBOFF(bp);
		}
		bp->nb_valid = (uint32_t)(1LLU << (round_page_32(bp->nb_validend) / PAGE_SIZE)) - 1;
		if (bp->nb_validend & PAGE_MASK) {
			/* zero-fill remainder of last page */
			bzero(bp->nb_data + bp->nb_validend, PAGE_SIZE - (bp->nb_validend & PAGE_MASK));
		}
	}
	nfs_buf_iodone(bp);
}
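
/*
 * Worked example of the nb_valid computation above (illustrative numbers,
 * not from the original source): with 4K pages and nb_validend = 6000,
 * round_page_32(6000) = 8192, so 8192/4096 = 2 pages are valid and
 * nb_valid = (1 << 2) - 1 = 0b11 (pages 0 and 1).  Since 6000 & PAGE_MASK
 * is nonzero, bytes 6000..8191 of the last valid page are zero-filled.
 */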

/*
 * initiate the NFS READ RPC(s) for a buffer
 */
int
nfs_buf_read_rpc(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred)
{
	struct nfsmount *nmp;
	nfsnode_t np = bp->nb_np;
	int error = 0, nfsvers, async;
	int offset, nrpcs;
	uint32_t nmrsize, length, len;
	off_t boff;
	struct nfsreq *req;
	struct nfsreq_cbinfo cb;

	nmp = NFSTONMP(np);
	if (nfs_mount_gone(nmp)) {
		bp->nb_error = error = ENXIO;
		SET(bp->nb_flags, NB_ERROR);
		nfs_buf_iodone(bp);
		return error;
	}
	nfsvers = nmp->nm_vers;
	nmrsize = nmp->nm_rsize;

	boff = NBOFF(bp);
	offset = 0;
	length = bp->nb_bufsize;

	if (nfsvers == NFS_VER2) {
		if (boff > 0xffffffffLL) {
			bp->nb_error = error = EFBIG;
			SET(bp->nb_flags, NB_ERROR);
			nfs_buf_iodone(bp);
			return error;
		}
		if ((boff + length - 1) > 0xffffffffLL) {
			length = 0x100000000LL - boff;
		}
	}

	/* Note: Can only do async I/O if nfsiods are configured. */
	async = (bp->nb_flags & NB_ASYNC);
	cb.rcb_func = async ? nfs_buf_read_rpc_finish : NULL;
	cb.rcb_bp = bp;

	bp->nb_offio = bp->nb_endio = 0;
	bp->nb_rpcs = nrpcs = (length + nmrsize - 1) / nmrsize;
	if (async && (nrpcs > 1)) {
		SET(bp->nb_flags, NB_MULTASYNCRPC);
	} else {
		CLR(bp->nb_flags, NB_MULTASYNCRPC);
	}

	while (length > 0) {
		if (ISSET(bp->nb_flags, NB_ERROR)) {
			error = bp->nb_error;
			break;
		}
		len = (length > nmrsize) ? nmrsize : length;
		cb.rcb_args[0] = offset;
		cb.rcb_args[1] = len;
#if CONFIG_NFS4
		if (nmp->nm_vers >= NFS_VER4) {
			cb.rcb_args[2] = nmp->nm_stategenid;
		}
#endif
		req = NULL;
		error = nmp->nm_funcs->nf_read_rpc_async(np, boff + offset, len, thd, cred, &cb, &req);
		if (error) {
			break;
		}
		offset += len;
		length -= len;
		if (async) {
			continue;
		}
		nfs_buf_read_rpc_finish(req);
		if (ISSET(bp->nb_flags, NB_ERROR)) {
			error = bp->nb_error;
			break;
		}
	}

	if (length > 0) {
		/*
		 * Something bad happened while trying to send the RPC(s).
		 * Wait for any outstanding requests to complete.
		 */
		bp->nb_error = error;
		SET(bp->nb_flags, NB_ERROR);
		if (ISSET(bp->nb_flags, NB_MULTASYNCRPC)) {
			nrpcs = (length + nmrsize - 1) / nmrsize;
			lck_mtx_lock(nfs_buf_mutex);
			bp->nb_rpcs -= nrpcs;
			if (bp->nb_rpcs == 0) {
				/* No RPCs left, so the buffer's done */
				lck_mtx_unlock(nfs_buf_mutex);
				nfs_buf_iodone(bp);
			} else {
				/* wait for the last RPC to mark it done */
				while (bp->nb_rpcs > 0) {
					msleep(&bp->nb_rpcs, nfs_buf_mutex, 0,
					    "nfs_buf_read_rpc_cancel", NULL);
				}
				lck_mtx_unlock(nfs_buf_mutex);
			}
		} else {
			nfs_buf_iodone(bp);
		}
	}

	return error;
}
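
/*
 * Illustrative split arithmetic for the RPC loop above (example numbers,
 * not from the original source): a 32K buffer against a mount with
 * nm_rsize = 8K yields nrpcs = (32768 + 8191) / 8192 = 4 READ RPCs at
 * buffer offsets 0, 8192, 16384 and 24576, each for len = 8192 bytes.
 */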
1742
1743 /*
1744 * finish up an NFS READ RPC on a buffer
1745 */
1746 void
1747 nfs_buf_read_rpc_finish(struct nfsreq *req)
1748 {
1749 struct nfsmount *nmp;
1750 size_t rlen;
1751 struct nfsreq_cbinfo cb;
1752 struct nfsbuf *bp;
1753 int error = 0, nfsvers, offset, length, eof = 0, multasyncrpc, finished;
1754 void *wakeme = NULL;
1755 struct nfsreq *rreq = NULL;
1756 nfsnode_t np;
1757 thread_t thd;
1758 kauth_cred_t cred;
1759 uio_t auio;
1760 char uio_buf[UIO_SIZEOF(1)];
1761
1762 finish:
1763 np = req->r_np;
1764 thd = req->r_thread;
1765 cred = req->r_cred;
1766 if (IS_VALID_CRED(cred)) {
1767 kauth_cred_ref(cred);
1768 }
1769 cb = req->r_callback;
1770 bp = cb.rcb_bp;
1771 if (cb.rcb_func) { /* take an extra reference on the nfsreq in case we want to resend it later due to grace error */
1772 nfs_request_ref(req, 0);
1773 }
1774
1775 nmp = NFSTONMP(np);
1776 if (nfs_mount_gone(nmp)) {
1777 SET(bp->nb_flags, NB_ERROR);
1778 bp->nb_error = error = ENXIO;
1779 }
1780 if (error || ISSET(bp->nb_flags, NB_ERROR)) {
1781 /* just drop it */
1782 nfs_request_async_cancel(req);
1783 goto out;
1784 }
1785
1786 nfsvers = nmp->nm_vers;
1787 offset = cb.rcb_args[0];
1788 rlen = length = cb.rcb_args[1];
1789
1790 auio = uio_createwithbuffer(1, NBOFF(bp) + offset, UIO_SYSSPACE,
1791 UIO_READ, &uio_buf, sizeof(uio_buf));
1792 uio_addiov(auio, CAST_USER_ADDR_T(bp->nb_data + offset), length);
1793
1794 /* finish the RPC */
1795 error = nmp->nm_funcs->nf_read_rpc_async_finish(np, req, auio, &rlen, &eof);
1796 if ((error == EINPROGRESS) && cb.rcb_func) {
1797 /* async request restarted */
1798 if (cb.rcb_func) {
1799 nfs_request_rele(req);
1800 }
1801 if (IS_VALID_CRED(cred)) {
1802 kauth_cred_unref(&cred);
1803 }
1804 return;
1805 }
1806 #if CONFIG_NFS4
1807 if ((nmp->nm_vers >= NFS_VER4) && nfs_mount_state_error_should_restart(error) && !ISSET(bp->nb_flags, NB_ERROR)) {
1808 lck_mtx_lock(&nmp->nm_lock);
1809 if ((error != NFSERR_OLD_STATEID) && (error != NFSERR_GRACE) && (cb.rcb_args[2] == nmp->nm_stategenid)) {
1810 NP(np, "nfs_buf_read_rpc_finish: error %d @ 0x%llx, 0x%x 0x%x, initiating recovery",
1811 error, NBOFF(bp) + offset, cb.rcb_args[2], nmp->nm_stategenid);
1812 nfs_need_recover(nmp, error);
1813 }
1814 lck_mtx_unlock(&nmp->nm_lock);
1815 if (np->n_flag & NREVOKE) {
1816 error = EIO;
1817 } else {
1818 if (error == NFSERR_GRACE) {
1819 if (cb.rcb_func) {
1820 /*
1821 * For an async I/O request, handle a grace delay just like
1822 * jukebox errors. Set the resend time and queue it up.
1823 */
1824 struct timeval now;
1825 if (req->r_nmrep.nmc_mhead) {
1826 mbuf_freem(req->r_nmrep.nmc_mhead);
1827 req->r_nmrep.nmc_mhead = NULL;
1828 }
1829 req->r_error = 0;
1830 microuptime(&now);
1831 lck_mtx_lock(&req->r_mtx);
1832 req->r_resendtime = now.tv_sec + 2;
1833 req->r_xid = 0; // get a new XID
1834 req->r_flags |= R_RESTART;
1835 req->r_start = 0;
1836 nfs_asyncio_resend(req);
1837 lck_mtx_unlock(&req->r_mtx);
1838 if (IS_VALID_CRED(cred)) {
1839 kauth_cred_unref(&cred);
1840 }
1841 /* Note: nfsreq reference taken will be dropped later when finished */
1842 return;
1843 }
1844 /* otherwise, just pause a couple seconds and retry */
1845 tsleep(&nmp->nm_state, (PZERO - 1), "nfsgrace", 2 * hz);
1846 }
1847 if (!(error = nfs_mount_state_wait_for_recovery(nmp))) {
1848 rlen = 0;
1849 goto readagain;
1850 }
1851 }
1852 }
1853 #endif
1854 if (error) {
1855 SET(bp->nb_flags, NB_ERROR);
1856 bp->nb_error = error;
1857 goto out;
1858 }
1859
1860 if ((rlen > 0) && (bp->nb_endio < (offset + (int)rlen))) {
1861 bp->nb_endio = offset + rlen;
1862 }
1863
1864 if ((nfsvers == NFS_VER2) || eof || (rlen == 0)) {
1865 /* zero out the remaining data (up to EOF) */
1866 off_t rpcrem, eofrem, rem;
1867 rpcrem = (length - rlen);
1868 eofrem = np->n_size - (NBOFF(bp) + offset + rlen);
1869 rem = (rpcrem < eofrem) ? rpcrem : eofrem;
1870 if (rem > 0) {
1871 bzero(bp->nb_data + offset + rlen, rem);
1872 }
1873 } else if (((int)rlen < length) && !ISSET(bp->nb_flags, NB_ERROR)) {
1874 /*
1875 * short read
1876 *
1877 * We haven't hit EOF and we didn't get all the data
1878 * requested, so we need to issue another read for the rest.
1879 * (Don't bother if the buffer already hit an error.)
1880 */
1881 #if CONFIG_NFS4
1882 readagain:
1883 #endif
1884 offset += rlen;
1885 length -= rlen;
1886 cb.rcb_args[0] = offset;
1887 cb.rcb_args[1] = length;
1888 #if CONFIG_NFS4
1889 if (nmp->nm_vers >= NFS_VER4) {
1890 cb.rcb_args[2] = nmp->nm_stategenid;
1891 }
1892 #endif
1893 error = nmp->nm_funcs->nf_read_rpc_async(np, NBOFF(bp) + offset, length, thd, cred, &cb, &rreq);
1894 if (!error) {
1895 if (IS_VALID_CRED(cred)) {
1896 kauth_cred_unref(&cred);
1897 }
1898 if (!cb.rcb_func) {
1899 /* if !async we'll need to wait for this RPC to finish */
1900 req = rreq;
1901 rreq = NULL;
1902 goto finish;
1903 }
1904 nfs_request_rele(req);
1905 /*
1906 * We're done here.
1907 * Outstanding RPC count is unchanged.
1908 * Callback will be called when RPC is done.
1909 */
1910 return;
1911 }
1912 SET(bp->nb_flags, NB_ERROR);
1913 bp->nb_error = error;
1914 }
1915
1916 out:
1917 if (cb.rcb_func) {
1918 nfs_request_rele(req);
1919 }
1920 if (IS_VALID_CRED(cred)) {
1921 kauth_cred_unref(&cred);
1922 }
1923
1924 /*
1925 * Decrement outstanding RPC count on buffer
1926 * and call nfs_buf_read_finish on last RPC.
1927 *
1928 * (Note: when there are multiple async RPCs issued for a
1929 * buffer we need nfs_buffer_mutex to avoid problems when
1930 * aborting a partially-initiated set of RPCs)
1931 */
1932
1933 multasyncrpc = ISSET(bp->nb_flags, NB_MULTASYNCRPC);
1934 if (multasyncrpc) {
1935 lck_mtx_lock(nfs_buf_mutex);
1936 }
1937
1938 bp->nb_rpcs--;
1939 finished = (bp->nb_rpcs == 0);
1940
1941 if (multasyncrpc) {
1942 lck_mtx_unlock(nfs_buf_mutex);
1943 }
1944
1945 if (finished) {
1946 if (multasyncrpc) {
1947 wakeme = &bp->nb_rpcs;
1948 }
1949 nfs_buf_read_finish(bp);
1950 if (wakeme) {
1951 wakeup(wakeme);
1952 }
1953 }
1954 }
1955
1956 /*
1957 * Do buffer readahead.
1958 * Initiate async I/O to read buffers not in cache.
1959 */
1960 int
1961 nfs_buf_readahead(nfsnode_t np, int ioflag, daddr64_t *rabnp, daddr64_t lastrabn, thread_t thd, kauth_cred_t cred)
1962 {
1963 struct nfsmount *nmp = NFSTONMP(np);
1964 struct nfsbuf *bp;
1965 int error = 0;
1966 uint32_t nra;
1967
1968 if (nfs_mount_gone(nmp)) {
1969 return ENXIO;
1970 }
1971 if (nmp->nm_readahead <= 0) {
1972 return 0;
1973 }
1974 if (*rabnp > lastrabn) {
1975 return 0;
1976 }
1977
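/*
 * Illustrative sizing (assumed values, not necessarily the defaults):
 * with nm_biosize = 32KB and nm_readahead = 4, a sequential reader at
 * block N kicks off async reads for blocks N..N+3 in the loop below,
 * stopping early once a block would start at or beyond EOF.
 */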
1978 for (nra = 0; (nra < nmp->nm_readahead) && (*rabnp <= lastrabn); nra++, *rabnp = *rabnp + 1) {
1979 /* check if block exists and is valid. */
1980 if ((*rabnp * nmp->nm_biosize) >= (off_t)np->n_size) {
1981 /* stop reading ahead if we're beyond EOF */
1982 *rabnp = lastrabn;
1983 break;
1984 }
1985 error = nfs_buf_get(np, *rabnp, nmp->nm_biosize, thd, NBLK_READ | NBLK_NOWAIT, &bp);
1986 if (error) {
1987 break;
1988 }
1989 nfs_node_lock_force(np);
1990 np->n_lastrahead = *rabnp;
1991 nfs_node_unlock(np);
1992 if (!bp) {
1993 continue;
1994 }
1995 if ((ioflag & IO_NOCACHE) && ISSET(bp->nb_flags, NB_CACHE) &&
1996 !bp->nb_dirty && !ISSET(bp->nb_flags, (NB_DELWRI | NB_NCRDAHEAD))) {
1997 CLR(bp->nb_flags, NB_CACHE);
1998 bp->nb_valid = 0;
1999 bp->nb_validoff = bp->nb_validend = -1;
2000 }
2001 if ((bp->nb_dirtyend <= 0) && !bp->nb_dirty &&
2002 !ISSET(bp->nb_flags, (NB_CACHE | NB_DELWRI))) {
2003 SET(bp->nb_flags, (NB_READ | NB_ASYNC));
2004 if (ioflag & IO_NOCACHE) {
2005 SET(bp->nb_flags, NB_NCRDAHEAD);
2006 }
2007 if (!IS_VALID_CRED(bp->nb_rcred) && IS_VALID_CRED(cred)) {
2008 kauth_cred_ref(cred);
2009 bp->nb_rcred = cred;
2010 }
2011 if ((error = nfs_buf_read(bp))) {
2012 break;
2013 }
2014 continue;
2015 }
2016 nfs_buf_release(bp, 1);
2017 }
2018 return error;
2019 }
2020
2021 /*
2022 * NFS buffer I/O for reading files.
2023 */
2024 int
2025 nfs_bioread(nfsnode_t np, uio_t uio, int ioflag, vfs_context_t ctx)
2026 {
2027 vnode_t vp = NFSTOV(np);
2028 struct nfsbuf *bp = NULL;
2029 struct nfsmount *nmp = VTONMP(vp);
2030 daddr64_t lbn, rabn = 0, lastrabn, maxrabn = -1;
2031 off_t diff;
2032 int error = 0, n = 0, on = 0;
2033 int nfsvers, biosize, modified, readaheads = 0;
2034 thread_t thd;
2035 kauth_cred_t cred;
2036 int64_t io_resid;
2037
2038 FSDBG_TOP(514, np, uio_offset(uio), uio_resid(uio), ioflag);
2039
2040 nfsvers = nmp->nm_vers;
2041 biosize = nmp->nm_biosize;
2042 thd = vfs_context_thread(ctx);
2043 cred = vfs_context_ucred(ctx);
2044
2045 if (vnode_vtype(vp) != VREG) {
2046 printf("nfs_bioread: type %x unexpected\n", vnode_vtype(vp));
2047 FSDBG_BOT(514, np, 0xd1e0016, 0, EINVAL);
2048 return EINVAL;
2049 }
2050
2051 /*
2052 * For NFS, cache consistency can only be maintained approximately.
2053 * Although RFC1094 does not specify the criteria, the following is
2054 * believed to be compatible with the reference port.
2055 *
2056 * If the file has changed since the last read RPC or you have
2057 * written to the file, you may have lost data cache consistency
2058 * with the server. So, check for a change, and flush all of the
2059 * file's data out of the cache.
2060 * NB: This implies that cache data can be read when up to
2061 * NFS_MAXATTRTIMO seconds out of date. If you find that you
2062 * need current attributes, nfs_getattr() can be forced to fetch
2063 * new attributes (via NATTRINVALIDATE() or NGA_UNCACHED).
2064 */
2065
2066 if (ISSET(np->n_flag, NUPDATESIZE)) {
2067 nfs_data_update_size(np, 0);
2068 }
2069
2070 if ((error = nfs_node_lock(np))) {
2071 FSDBG_BOT(514, np, 0xd1e0222, 0, error);
2072 return error;
2073 }
2074
2075 if (np->n_flag & NNEEDINVALIDATE) {
2076 np->n_flag &= ~NNEEDINVALIDATE;
2077 nfs_node_unlock(np);
2078 error = nfs_vinvalbuf(vp, V_SAVE | V_IGNORE_WRITEERR, ctx, 1);
2079 if (!error) {
2080 error = nfs_node_lock(np);
2081 }
2082 if (error) {
2083 FSDBG_BOT(514, np, 0xd1e0322, 0, error);
2084 return error;
2085 }
2086 }
2087
2088 modified = (np->n_flag & NMODIFIED);
2089 nfs_node_unlock(np);
2090 /* nfs_getattr() will check changed and purge caches */
2091 error = nfs_getattr(np, NULL, ctx, modified ? NGA_UNCACHED : NGA_CACHED);
2092 if (error) {
2093 FSDBG_BOT(514, np, 0xd1e0004, 0, error);
2094 return error;
2095 }
2096
2097 if (uio_resid(uio) == 0) {
2098 FSDBG_BOT(514, np, 0xd1e0001, 0, 0);
2099 return 0;
2100 }
2101 if (uio_offset(uio) < 0) {
2102 FSDBG_BOT(514, np, 0xd1e0002, 0, EINVAL);
2103 return EINVAL;
2104 }
2105
2106 /*
2107 * set up readahead - which may be limited by:
2108 * + current request length (for IO_NOCACHE)
2109 * + readahead setting
2110 * + file size
2111 */
2112 if (nmp->nm_readahead > 0) {
2113 off_t end = uio_offset(uio) + uio_resid(uio);
2114 if (end > (off_t)np->n_size) {
2115 end = np->n_size;
2116 }
2117 rabn = uio_offset(uio) / biosize;
2118 maxrabn = (end - 1) / biosize;
2119 nfs_node_lock_force(np);
2120 if (!(ioflag & IO_NOCACHE) &&
2121 (!rabn || (rabn == np->n_lastread) || (rabn == (np->n_lastread + 1)))) {
2122 maxrabn += nmp->nm_readahead;
2123 if ((maxrabn * biosize) >= (off_t)np->n_size) {
2124 maxrabn = ((off_t)np->n_size - 1) / biosize;
2125 }
2126 }
2127 if (maxrabn < np->n_lastrahead) {
2128 np->n_lastrahead = -1;
2129 }
2130 if (rabn < np->n_lastrahead) {
2131 rabn = np->n_lastrahead + 1;
2132 }
2133 nfs_node_unlock(np);
2134 } else {
2135 rabn = maxrabn = 0;
2136 }
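/*
 * Worked example (illustrative values): with biosize = 32KB, a read of
 * 64KB at offset 0 gives rabn = 0 and maxrabn = 1; if the access looks
 * sequential (and isn't IO_NOCACHE), maxrabn is extended by
 * nm_readahead and then clipped so it never runs past the file's last
 * block.
 */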
2137
2138 do {
2139 nfs_data_lock(np, NFS_DATA_LOCK_SHARED);
2140 lbn = uio_offset(uio) / biosize;
2141
2142 /*
2143 * Copy directly from any cached pages without grabbing the bufs.
2144 * (If we are NOCACHE and we've issued readahead requests, we need
2145 * to grab the NB_NCRDAHEAD bufs to drop them.)
2146 */
2147 if ((!(ioflag & IO_NOCACHE) || !readaheads) &&
2148 ((uio->uio_segflg == UIO_USERSPACE32 ||
2149 uio->uio_segflg == UIO_USERSPACE64 ||
2150 uio->uio_segflg == UIO_USERSPACE))) {
2151 io_resid = uio_resid(uio);
2152 diff = np->n_size - uio_offset(uio);
2153 if (diff < io_resid) {
2154 io_resid = diff;
2155 }
2156 if (io_resid > 0) {
2157 int count = (io_resid > INT_MAX) ? INT_MAX : io_resid;
2158 error = cluster_copy_ubc_data(vp, uio, &count, 0);
2159 if (error) {
2160 nfs_data_unlock(np);
2161 FSDBG_BOT(514, np, uio_offset(uio), 0xcacefeed, error);
2162 return error;
2163 }
2164 }
2165 /* count any biocache reads that we just copied directly */
2166 if (lbn != (uio_offset(uio) / biosize)) {
2167 OSAddAtomic64((uio_offset(uio) / biosize) - lbn, &nfsstats.biocache_reads);
2168 FSDBG(514, np, 0xcacefeed, uio_offset(uio), error);
2169 }
2170 }
2171
2172 lbn = uio_offset(uio) / biosize;
2173 on = uio_offset(uio) % biosize;
2174 nfs_node_lock_force(np);
2175 np->n_lastread = (uio_offset(uio) - 1) / biosize;
2176 nfs_node_unlock(np);
2177
2178 if ((uio_resid(uio) <= 0) || (uio_offset(uio) >= (off_t)np->n_size)) {
2179 nfs_data_unlock(np);
2180 FSDBG_BOT(514, np, uio_offset(uio), uio_resid(uio), 0xaaaaaaaa);
2181 return 0;
2182 }
2183
2184 /* adjust readahead block number, if necessary */
2185 if (rabn < lbn) {
2186 rabn = lbn;
2187 }
2188 lastrabn = MIN(maxrabn, lbn + nmp->nm_readahead);
2189 if (rabn <= lastrabn) { /* start readaheads */
2190 error = nfs_buf_readahead(np, ioflag, &rabn, lastrabn, thd, cred);
2191 if (error) {
2192 nfs_data_unlock(np);
2193 FSDBG_BOT(514, np, 0xd1e000b, 1, error);
2194 return error;
2195 }
2196 readaheads = 1;
2197 }
2198
2199 OSAddAtomic64(1, &nfsstats.biocache_reads);
2200
2201 /*
2202 * If the block is in the cache and has the required data
2203 * in a valid region, just copy it out.
2204 * Otherwise, get the block and write back/read in,
2205 * as required.
2206 */
2207 again:
2208 io_resid = uio_resid(uio);
2209 n = (io_resid > (biosize - on)) ? (biosize - on) : io_resid;
2210 diff = np->n_size - uio_offset(uio);
2211 if (diff < n) {
2212 n = diff;
2213 }
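/*
 * Example of the clipping above (illustrative values): with
 * biosize = 32KB and uio_offset = 40KB, on = 8KB, so at most 24KB
 * (the rest of block 1) is copied this pass, further limited to EOF
 * by the diff check.
 */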
2214
2215 error = nfs_buf_get(np, lbn, biosize, thd, NBLK_READ, &bp);
2216 if (error) {
2217 nfs_data_unlock(np);
2218 FSDBG_BOT(514, np, 0xd1e000c, 0, error);
2219 return error;
2220 }
2221
2222 if ((ioflag & IO_NOCACHE) && ISSET(bp->nb_flags, NB_CACHE)) {
2223 /*
2224 * IO_NOCACHE found a cached buffer.
2225 * Flush the buffer if it's dirty.
2226 * Invalidate the data if it wasn't just read
2227 * in as part of a "nocache readahead".
2228 */
2229 if (bp->nb_dirty || (bp->nb_dirtyend > 0)) {
2230 /* so write the buffer out and try again */
2231 SET(bp->nb_flags, NB_NOCACHE);
2232 goto flushbuffer;
2233 }
2234 if (ISSET(bp->nb_flags, NB_NCRDAHEAD)) {
2235 CLR(bp->nb_flags, NB_NCRDAHEAD);
2236 SET(bp->nb_flags, NB_NOCACHE);
2237 }
2238 }
2239
2240 /* if any pages are valid... */
2241 if (bp->nb_valid) {
2242 /* ...check for any invalid pages in the read range */
2243 int pg, firstpg, lastpg, dirtypg;
2244 dirtypg = firstpg = lastpg = -1;
2245 pg = on / PAGE_SIZE;
2246 while (pg <= (on + n - 1) / PAGE_SIZE) {
2247 if (!NBPGVALID(bp, pg)) {
2248 if (firstpg < 0) {
2249 firstpg = pg;
2250 }
2251 lastpg = pg;
2252 } else if (firstpg >= 0 && dirtypg < 0 && NBPGDIRTY(bp, pg)) {
2253 dirtypg = pg;
2254 }
2255 pg++;
2256 }
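/*
 * Illustrative scan result: for a read covering pages 0-3 of a buffer
 * whose pages are (valid, invalid, invalid, valid), the loop above
 * leaves firstpg = 1 and lastpg = 2, the span of invalid pages a
 * follow-up read RPC would have to fill.
 */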
2257
2258 /* if there are no invalid pages, we're all set */
2259 if (firstpg < 0) {
2260 if (bp->nb_validoff < 0) {
2261 /* valid range isn't set up, so */
2262 /* set it to what we know is valid */
2263 bp->nb_validoff = trunc_page(on);
2264 bp->nb_validend = round_page(on + n);
2265 nfs_buf_normalize_valid_range(np, bp);
2266 }
2267 goto buffer_ready;
2268 }
2269
2270 /* there are invalid pages in the read range */
2271 if (((dirtypg > firstpg) && (dirtypg < lastpg)) ||
2272 (((firstpg * PAGE_SIZE) < bp->nb_dirtyend) && (((lastpg + 1) * PAGE_SIZE) > bp->nb_dirtyoff))) {
2273 /* there are also dirty page(s) (or range) in the read range, */
2274 /* so write the buffer out and try again */
2275 flushbuffer:
2276 CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL));
2277 SET(bp->nb_flags, NB_ASYNC);
2278 if (!IS_VALID_CRED(bp->nb_wcred)) {
2279 kauth_cred_ref(cred);
2280 bp->nb_wcred = cred;
2281 }
2282 error = nfs_buf_write(bp);
2283 if (error) {
2284 nfs_data_unlock(np);
2285 FSDBG_BOT(514, np, 0xd1e000d, 0, error);
2286 return error;
2287 }
2288 goto again;
2289 }
2290 if (!bp->nb_dirty && bp->nb_dirtyend <= 0 &&
2291 (lastpg - firstpg + 1) > (biosize / PAGE_SIZE) / 2) {
2292 /* we need to read in more than half the buffer and the */
2293 /* buffer's not dirty, so just fetch the whole buffer */
2294 bp->nb_valid = 0;
2295 } else {
2296 /* read the page range in */
2297 uio_t auio;
2298 char uio_buf[UIO_SIZEOF(1)];
2299
2300 NFS_BUF_MAP(bp);
2301 auio = uio_createwithbuffer(1, (NBOFF(bp) + firstpg * PAGE_SIZE_64),
2302 UIO_SYSSPACE, UIO_READ, &uio_buf[0], sizeof(uio_buf));
2303 if (!auio) {
2304 error = ENOMEM;
2305 } else {
2306 uio_addiov(auio, CAST_USER_ADDR_T(bp->nb_data + (firstpg * PAGE_SIZE)),
2307 ((lastpg - firstpg + 1) * PAGE_SIZE));
2308 error = nfs_read_rpc(np, auio, ctx);
2309 }
2310 if (error) {
2311 if (ioflag & IO_NOCACHE) {
2312 SET(bp->nb_flags, NB_NOCACHE);
2313 }
2314 nfs_buf_release(bp, 1);
2315 nfs_data_unlock(np);
2316 FSDBG_BOT(514, np, 0xd1e000e, 0, error);
2317 return error;
2318 }
2319 /* Make sure that the valid range is set to cover this read. */
2320 bp->nb_validoff = trunc_page_32(on);
2321 bp->nb_validend = round_page_32(on + n);
2322 nfs_buf_normalize_valid_range(np, bp);
2323 if (uio_resid(auio) > 0) {
2324 /* if short read, must have hit EOF, */
2325 /* so zero the rest of the range */
2326 bzero(CAST_DOWN(caddr_t, uio_curriovbase(auio)), uio_resid(auio));
2327 }
2328 /* mark the pages (successfully read) as valid */
2329 for (pg = firstpg; pg <= lastpg; pg++) {
2330 NBPGVALID_SET(bp, pg);
2331 }
2332 }
2333 }
2334 /* if no pages are valid, read the whole block */
2335 if (!bp->nb_valid) {
2336 if (!IS_VALID_CRED(bp->nb_rcred) && IS_VALID_CRED(cred)) {
2337 kauth_cred_ref(cred);
2338 bp->nb_rcred = cred;
2339 }
2340 SET(bp->nb_flags, NB_READ);
2341 CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL));
2342 error = nfs_buf_read(bp);
2343 if (ioflag & IO_NOCACHE) {
2344 SET(bp->nb_flags, NB_NOCACHE);
2345 }
2346 if (error) {
2347 nfs_data_unlock(np);
2348 nfs_buf_release(bp, 1);
2349 FSDBG_BOT(514, np, 0xd1e000f, 0, error);
2350 return error;
2351 }
2352 }
2353 buffer_ready:
2354 /* validate read range against valid range and clip */
2355 if (bp->nb_validend > 0) {
2356 diff = (on >= bp->nb_validend) ? 0 : (bp->nb_validend - on);
2357 if (diff < n) {
2358 n = diff;
2359 }
2360 }
2361 if (n > 0) {
2362 NFS_BUF_MAP(bp);
2363 error = uiomove(bp->nb_data + on, n, uio);
2364 }
2365
2366
2367 nfs_buf_release(bp, 1);
2368 nfs_data_unlock(np);
2369 nfs_node_lock_force(np);
2370 np->n_lastread = (uio_offset(uio) - 1) / biosize;
2371 nfs_node_unlock(np);
2372 } while (error == 0 && uio_resid(uio) > 0 && n > 0);
2373 FSDBG_BOT(514, np, uio_offset(uio), uio_resid(uio), error);
2374 return error;
2375 }
2376
2377 /*
2378 * limit the number of outstanding async I/O writes
2379 */
2380 int
2381 nfs_async_write_start(struct nfsmount *nmp)
2382 {
2383 int error = 0, slpflag = NMFLAG(nmp, INTR) ? PCATCH : 0;
2384 struct timespec ts = { .tv_sec = 1, .tv_nsec = 0 };
2385
2386 if (nfs_max_async_writes <= 0) {
2387 return 0;
2388 }
2389 lck_mtx_lock(&nmp->nm_lock);
2390 while ((nfs_max_async_writes > 0) && (nmp->nm_asyncwrites >= nfs_max_async_writes)) {
2391 if ((error = nfs_sigintr(nmp, NULL, current_thread(), 1))) {
2392 break;
2393 }
2394 msleep(&nmp->nm_asyncwrites, &nmp->nm_lock, slpflag | (PZERO - 1), "nfsasyncwrites", &ts);
2395 slpflag = 0;
2396 }
2397 if (!error) {
2398 nmp->nm_asyncwrites++;
2399 }
2400 lck_mtx_unlock(&nmp->nm_lock);
2401 return error;
2402 }
2403 void
2404 nfs_async_write_done(struct nfsmount *nmp)
2405 {
2406 if (nmp->nm_asyncwrites <= 0) {
2407 return;
2408 }
2409 lck_mtx_lock(&nmp->nm_lock);
2410 if (nmp->nm_asyncwrites-- >= nfs_max_async_writes) {
2411 wakeup(&nmp->nm_asyncwrites);
2412 }
2413 lck_mtx_unlock(&nmp->nm_lock);
2414 }
2415
2416 /*
2417 * write (or commit) the given NFS buffer
2418 *
2419 * Commit the buffer if we can.
2420 * Write out any dirty range.
2421 * If any dirty pages remain, write them out.
2422 * Mark buffer done.
2423 *
2424 * For async requests, all the work beyond sending the initial
2425 * write RPC is handled in the RPC callback(s).
2426 */
2427 int
2428 nfs_buf_write(struct nfsbuf *bp)
2429 {
2430 int error = 0, oldflags, async;
2431 nfsnode_t np;
2432 thread_t thd;
2433 kauth_cred_t cred;
2434 proc_t p = current_proc();
2435 int iomode, doff, dend, firstpg, lastpg;
2436 uint32_t pagemask;
2437
2438 FSDBG_TOP(553, bp, NBOFF(bp), bp->nb_flags, 0);
2439
2440 if (!ISSET(bp->nb_lflags, NBL_BUSY)) {
2441 panic("nfs_buf_write: buffer is not busy???");
2442 }
2443
2444 np = bp->nb_np;
2445 async = ISSET(bp->nb_flags, NB_ASYNC);
2446 oldflags = bp->nb_flags;
2447
2448 CLR(bp->nb_flags, (NB_READ | NB_DONE | NB_ERROR | NB_DELWRI));
2449 if (ISSET(oldflags, NB_DELWRI)) {
2450 lck_mtx_lock(nfs_buf_mutex);
2451 nfs_nbdwrite--;
2452 NFSBUFCNTCHK();
2453 lck_mtx_unlock(nfs_buf_mutex);
2454 wakeup(&nfs_nbdwrite);
2455 }
2456
2457 /* move to clean list */
2458 if (ISSET(oldflags, (NB_ASYNC | NB_DELWRI))) {
2459 lck_mtx_lock(nfs_buf_mutex);
2460 if (bp->nb_vnbufs.le_next != NFSNOLIST) {
2461 LIST_REMOVE(bp, nb_vnbufs);
2462 }
2463 LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
2464 lck_mtx_unlock(nfs_buf_mutex);
2465 }
2466 nfs_node_lock_force(np);
2467 np->n_numoutput++;
2468 nfs_node_unlock(np);
2469 vnode_startwrite(NFSTOV(np));
2470
2471 if (p && p->p_stats) {
2472 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_oublock);
2473 }
2474
2475 cred = bp->nb_wcred;
2476 if (!IS_VALID_CRED(cred) && ISSET(bp->nb_flags, NB_READ)) {
2477 cred = bp->nb_rcred; /* shouldn't really happen, but... */
2478 }
2479 if (IS_VALID_CRED(cred)) {
2480 kauth_cred_ref(cred);
2481 }
2482 thd = async ? NULL : current_thread();
2483
2484 /* We need to make sure the pages are locked before doing I/O. */
2485 if (!ISSET(bp->nb_flags, NB_META)) {
2486 if (UBCINFOEXISTS(NFSTOV(np))) {
2487 if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
2488 error = nfs_buf_upl_setup(bp);
2489 if (error) {
2490 printf("nfs_buf_write: upl create failed %d\n", error);
2491 SET(bp->nb_flags, NB_ERROR);
2492 bp->nb_error = error = EIO;
2493 nfs_buf_iodone(bp);
2494 goto out;
2495 }
2496 nfs_buf_upl_check(bp);
2497 }
2498 } else {
2499 /* We should never be in nfs_buf_write() with no UBCINFO. */
2500 printf("nfs_buf_write: ubcinfo already gone\n");
2501 SET(bp->nb_flags, NB_ERROR);
2502 bp->nb_error = error = EIO;
2503 nfs_buf_iodone(bp);
2504 goto out;
2505 }
2506 }
2507
2508 /* If NB_NEEDCOMMIT is set, a commit RPC may do the trick. */
2509 if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
2510 nfs_buf_check_write_verifier(np, bp);
2511 }
2512 if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
2513 struct nfsmount *nmp = NFSTONMP(np);
2514 if (nfs_mount_gone(nmp)) {
2515 SET(bp->nb_flags, NB_ERROR);
2516 bp->nb_error = error = EIO;
2517 nfs_buf_iodone(bp);
2518 goto out;
2519 }
2520 SET(bp->nb_flags, NB_WRITEINPROG);
2521 error = nmp->nm_funcs->nf_commit_rpc(np, NBOFF(bp) + bp->nb_dirtyoff,
2522 bp->nb_dirtyend - bp->nb_dirtyoff, bp->nb_wcred, bp->nb_verf);
2523 CLR(bp->nb_flags, NB_WRITEINPROG);
2524 if (error) {
2525 if (error != NFSERR_STALEWRITEVERF) {
2526 SET(bp->nb_flags, NB_ERROR);
2527 bp->nb_error = error;
2528 }
2529 nfs_buf_iodone(bp);
2530 goto out;
2531 }
2532 bp->nb_dirtyoff = bp->nb_dirtyend = 0;
2533 CLR(bp->nb_flags, NB_NEEDCOMMIT);
2534 nfs_node_lock_force(np);
2535 np->n_needcommitcnt--;
2536 CHECK_NEEDCOMMITCNT(np);
2537 nfs_node_unlock(np);
2538 }
2539 if (!error && (bp->nb_dirtyend > 0)) {
2540 /* sanity check the dirty range */
2541 if (NBOFF(bp) + bp->nb_dirtyend > (off_t) np->n_size) {
2542 bp->nb_dirtyend = np->n_size - NBOFF(bp);
2543 if (bp->nb_dirtyoff >= bp->nb_dirtyend) {
2544 bp->nb_dirtyoff = bp->nb_dirtyend = 0;
2545 }
2546 }
2547 }
2548 if (!error && (bp->nb_dirtyend > 0)) {
2549 /* there's a dirty range that needs to be written out */
2550 NFS_BUF_MAP(bp);
2551
2552 doff = bp->nb_dirtyoff;
2553 dend = bp->nb_dirtyend;
2554
2555 /* if doff page is dirty, move doff to start of page */
2556 if (NBPGDIRTY(bp, doff / PAGE_SIZE)) {
2557 doff -= doff & PAGE_MASK;
2558 }
2559 /* try to expand write range to include preceding dirty pages */
2560 if (!(doff & PAGE_MASK)) {
2561 while ((doff > 0) && NBPGDIRTY(bp, (doff - 1) / PAGE_SIZE)) {
2562 doff -= PAGE_SIZE;
2563 }
2564 }
2565 /* if dend page is dirty, move dend to start of next page */
2566 if ((dend & PAGE_MASK) && NBPGDIRTY(bp, dend / PAGE_SIZE)) {
2567 dend = round_page_32(dend);
2568 }
2569 /* try to expand write range to include trailing dirty pages */
2570 if (!(dend & PAGE_MASK)) {
2571 while ((dend < (int)bp->nb_bufsize) && NBPGDIRTY(bp, dend / PAGE_SIZE)) {
2572 dend += PAGE_SIZE;
2573 }
2574 }
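/*
 * Illustrative effect of the adjustments above (assuming 4KB pages):
 * a dirty range of [5000, 9000) on a buffer whose first three pages
 * are all marked dirty becomes [0, 12288): doff is pulled back to the
 * start of its dirty page and across preceding dirty pages, and dend
 * is pushed out to the next page boundary and across trailing dirty
 * pages.
 */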
2575 /* make sure to keep dend clipped to EOF */
2576 if ((NBOFF(bp) + dend) > (off_t) np->n_size) {
2577 dend = np->n_size - NBOFF(bp);
2578 }
2579 /* calculate range of complete pages being written */
2580 firstpg = round_page_32(doff) / PAGE_SIZE;
2581 lastpg = (trunc_page_32(dend) - 1) / PAGE_SIZE;
2582 /* calculate mask for that page range */
2583 pagemask = ((1 << (lastpg + 1)) - 1) & ~((1 << firstpg) - 1);
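/*
 * Worked example: firstpg = 1, lastpg = 2 gives
 * ((1 << 3) - 1) & ~((1 << 1) - 1) = 0x7 & ~0x1 = 0x6,
 * i.e. bits set for exactly pages 1 and 2.
 */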
2584
2585 /*
2586 * compare page mask to nb_dirty; if there are other dirty pages
2587 * then write FILESYNC; otherwise, write UNSTABLE if async and
2588 * not needcommit/stable; otherwise write FILESYNC
2589 */
2590 if (bp->nb_dirty & ~pagemask) {
2591 iomode = NFS_WRITE_FILESYNC;
2592 } else if ((bp->nb_flags & (NB_ASYNC | NB_NEEDCOMMIT | NB_STABLE)) == NB_ASYNC) {
2593 iomode = NFS_WRITE_UNSTABLE;
2594 } else {
2595 iomode = NFS_WRITE_FILESYNC;
2596 }
2597
2598 /* write the whole contiguous dirty range */
2599 bp->nb_offio = doff;
2600 bp->nb_endio = dend;
2601
2602 OSAddAtomic64(1, &nfsstats.write_bios);
2603
2604 SET(bp->nb_flags, NB_WRITEINPROG);
2605 error = nfs_buf_write_rpc(bp, iomode, thd, cred);
2606 /*
2607 * For async I/O, the callbacks will finish up the
2608 * write and push out any dirty pages. Otherwise,
2609 * the write has already been finished and any dirty
2610 * pages pushed out.
2611 */
2612 } else {
2613 if (!error && bp->nb_dirty) { /* write out any dirty pages */
2614 error = nfs_buf_write_dirty_pages(bp, thd, cred);
2615 }
2616 nfs_buf_iodone(bp);
2617 }
2618 /* note: bp is still valid only for !async case */
2619 out:
2620 if (!async) {
2621 error = nfs_buf_iowait(bp);
2622 /* move to clean list */
2623 if (oldflags & NB_DELWRI) {
2624 lck_mtx_lock(nfs_buf_mutex);
2625 if (bp->nb_vnbufs.le_next != NFSNOLIST) {
2626 LIST_REMOVE(bp, nb_vnbufs);
2627 }
2628 LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
2629 lck_mtx_unlock(nfs_buf_mutex);
2630 }
2631 FSDBG_BOT(553, bp, NBOFF(bp), bp->nb_flags, error);
2632 nfs_buf_release(bp, 1);
2633 /* check if we need to invalidate (and we can) */
2634 if ((np->n_flag & NNEEDINVALIDATE) &&
2635 !(np->n_bflag & (NBINVALINPROG | NBFLUSHINPROG))) {
2636 int invalidate = 0;
2637 nfs_node_lock_force(np);
2638 if (np->n_flag & NNEEDINVALIDATE) {
2639 invalidate = 1;
2640 np->n_flag &= ~NNEEDINVALIDATE;
2641 }
2642 nfs_node_unlock(np);
2643 if (invalidate) {
2644 /*
2645 * There was a write error and we need to
2646 * invalidate attrs and flush buffers in
2647 * order to sync up with the server.
2648 * (if this write was extending the file,
2649 * we may no longer know the correct size)
2650 *
2651 * But we couldn't call vinvalbuf while holding
2652 * the buffer busy. So we call vinvalbuf() after
2653 * releasing the buffer.
2654 */
2655 nfs_vinvalbuf2(NFSTOV(np), V_SAVE | V_IGNORE_WRITEERR, thd, cred, 1);
2656 }
2657 }
2658 }
2659
2660 if (IS_VALID_CRED(cred)) {
2661 kauth_cred_unref(&cred);
2662 }
2663 return error;
2664 }
2665
2666 /*
2667 * finish the writing of a buffer
2668 */
2669 void
2670 nfs_buf_write_finish(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred)
2671 {
2672 nfsnode_t np = bp->nb_np;
2673 int error = (bp->nb_flags & NB_ERROR) ? bp->nb_error : 0;
2674 int firstpg, lastpg;
2675 uint32_t pagemask;
2676
2677 if ((error == EINTR) || (error == ERESTART)) {
2678 CLR(bp->nb_flags, NB_ERROR);
2679 SET(bp->nb_flags, NB_EINTR);
2680 }
2681
2682 if (!error) {
2683 /* calculate range of complete pages being written */
2684 firstpg = round_page_32(bp->nb_offio) / PAGE_SIZE;
2685 lastpg = (trunc_page_32(bp->nb_endio) - 1) / PAGE_SIZE;
2686 /* calculate mask for that page range written */
2687 pagemask = ((1 << (lastpg + 1)) - 1) & ~((1 << firstpg) - 1);
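/*
 * Same mask construction as in nfs_buf_write(): e.g. nb_offio = 4096
 * and nb_endio = 12288 with 4KB pages give firstpg = 1, lastpg = 2,
 * pagemask = 0x6, so only the fully written pages have their dirty
 * bits cleared here.
 */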
2688 /* clear dirty bits for pages we've written */
2689 bp->nb_dirty &= ~pagemask;
2690 }
2691
2692 /* manage needcommit state */
2693 if (!error && (bp->nb_commitlevel == NFS_WRITE_UNSTABLE)) {
2694 if (!ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
2695 nfs_node_lock_force(np);
2696 np->n_needcommitcnt++;
2697 nfs_node_unlock(np);
2698 SET(bp->nb_flags, NB_NEEDCOMMIT);
2699 }
2700 /* make sure nb_dirtyoff/nb_dirtyend reflect actual range written */
2701 bp->nb_dirtyoff = bp->nb_offio;
2702 bp->nb_dirtyend = bp->nb_endio;
2703 } else if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
2704 nfs_node_lock_force(np);
2705 np->n_needcommitcnt--;
2706 CHECK_NEEDCOMMITCNT(np);
2707 nfs_node_unlock(np);
2708 CLR(bp->nb_flags, NB_NEEDCOMMIT);
2709 }
2710
2711 CLR(bp->nb_flags, NB_WRITEINPROG);
2712
2713 /*
2714 * For an unstable write, the buffer is still treated as dirty until
2715 * a commit (or stable (re)write) is performed. Buffers needing only
2716 * a commit are marked with the NB_DELWRI and NB_NEEDCOMMIT flags.
2717 *
2718 * If the write was interrupted we set NB_EINTR. Don't set NB_ERROR
2719 * because that would cause the buffer to be dropped. The buffer is
2720 * still valid and simply needs to be written again.
2721 */
2722 if ((error == EINTR) || (error == ERESTART) || (!error && (bp->nb_flags & NB_NEEDCOMMIT))) {
2723 CLR(bp->nb_flags, NB_INVAL);
2724 if (!ISSET(bp->nb_flags, NB_DELWRI)) {
2725 SET(bp->nb_flags, NB_DELWRI);
2726 lck_mtx_lock(nfs_buf_mutex);
2727 nfs_nbdwrite++;
2728 NFSBUFCNTCHK();
2729 lck_mtx_unlock(nfs_buf_mutex);
2730 }
2731 /*
2732 * Since for the NB_ASYNC case, we've reassigned the buffer to the
2733 * clean list, we have to reassign it back to the dirty one. Ugh.
2734 */
2735 if (ISSET(bp->nb_flags, NB_ASYNC)) {
2736 /* move to dirty list */
2737 lck_mtx_lock(nfs_buf_mutex);
2738 if (bp->nb_vnbufs.le_next != NFSNOLIST) {
2739 LIST_REMOVE(bp, nb_vnbufs);
2740 }
2741 LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
2742 lck_mtx_unlock(nfs_buf_mutex);
2743 }
2744 } else {
2745 /* either there's an error or we don't need to commit */
2746 if (error) {
2747 /*
2748 * There was a write error and we need to invalidate
2749 * attrs and flush buffers in order to sync up with the
2750 * server. (if this write was extending the file, we
2751 * may no longer know the correct size)
2752 *
2753 * But we can't call vinvalbuf while holding this
2754 * buffer busy. Set a flag to do it after releasing
2755 * the buffer.
2756 */
2757 nfs_node_lock_force(np);
2758 np->n_error = error;
2759 np->n_flag |= (NWRITEERR | NNEEDINVALIDATE);
2760 NATTRINVALIDATE(np);
2761 nfs_node_unlock(np);
2762 }
2763 /* clear the dirty range */
2764 bp->nb_dirtyoff = bp->nb_dirtyend = 0;
2765 }
2766
2767 if (!error && bp->nb_dirty) {
2768 nfs_buf_write_dirty_pages(bp, thd, cred);
2769 }
2770 nfs_buf_iodone(bp);
2771 }
2772
2773 /*
2774 * write out any pages marked dirty in a buffer
2775 *
2776 * We use unstable writes and follow up with a commit.
2777 * If we catch the write verifier changing, we restart and
2778 * redo the writes FILESYNC.
2779 */
2780 int
2781 nfs_buf_write_dirty_pages(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred)
2782 {
2783 nfsnode_t np = bp->nb_np;
2784 struct nfsmount *nmp = NFSTONMP(np);
2785 int error = 0, commit, iomode, iomode2, len, pg, count, npages, off;
2786 uint32_t dirty = bp->nb_dirty;
2787 uint64_t wverf;
2788 uio_t auio;
2789 char uio_buf[UIO_SIZEOF(1)];
2790
2791 if (!bp->nb_dirty) {
2792 return 0;
2793 }
2794
2795 /* there are pages marked dirty that need to be written out */
2796 OSAddAtomic64(1, &nfsstats.write_bios);
2797 NFS_BUF_MAP(bp);
2798 SET(bp->nb_flags, NB_WRITEINPROG);
2799 npages = bp->nb_bufsize / PAGE_SIZE;
2800 iomode = NFS_WRITE_UNSTABLE;
2801
2802 auio = uio_createwithbuffer(1, 0, UIO_SYSSPACE, UIO_WRITE,
2803 &uio_buf, sizeof(uio_buf));
2804
2805 again:
2806 dirty = bp->nb_dirty;
2807 wverf = bp->nb_verf;
2808 commit = NFS_WRITE_FILESYNC;
2809 for (pg = 0; pg < npages; pg++) {
2810 if (!NBPGDIRTY(bp, pg)) {
2811 continue;
2812 }
2813 count = 1;
2814 while (((pg + count) < npages) && NBPGDIRTY(bp, pg + count)) {
2815 count++;
2816 }
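/*
 * Example: with nb_dirty = 0b0110 this scan finds pg = 1 and
 * count = 2, so pages 1 and 2 go out in a single write RPC rather
 * than one RPC per page.
 */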
2817 /* write count pages starting with page pg */
2818 off = pg * PAGE_SIZE;
2819 len = count * PAGE_SIZE;
2820 /* clip writes to EOF */
2821 if (NBOFF(bp) + off + len > (off_t) np->n_size) {
2822 len -= (NBOFF(bp) + off + len) - np->n_size;
2823 }
2824 if (len > 0) {
2825 iomode2 = iomode;
2826 uio_reset(auio, NBOFF(bp) + off, UIO_SYSSPACE, UIO_WRITE);
2827 uio_addiov(auio, CAST_USER_ADDR_T(bp->nb_data + off), len);
2828 error = nfs_write_rpc2(np, auio, thd, cred, &iomode2, &bp->nb_verf);
2829 if (error) {
2830 break;
2831 }
2832 if (iomode2 < commit) { /* Retain the lowest commitment level returned. */
2833 commit = iomode2;
2834 }
2835 if ((commit != NFS_WRITE_FILESYNC) && (wverf != bp->nb_verf)) {
2836 /* verifier changed, redo all the writes filesync */
2837 iomode = NFS_WRITE_FILESYNC;
2838 goto again;
2839 }
2840 }
2841 /* clear dirty bits */
2842 while (count--) {
2843 dirty &= ~(1 << pg);
2844 if (count) { /* leave pg on last page */
2845 pg++;
2846 }
2847 }
2848 }
2849 CLR(bp->nb_flags, NB_WRITEINPROG);
2850
2851 if (!error && (commit != NFS_WRITE_FILESYNC)) {
2852 error = nmp->nm_funcs->nf_commit_rpc(np, NBOFF(bp), bp->nb_bufsize, cred, wverf);
2853 if (error == NFSERR_STALEWRITEVERF) {
2854 /* verifier changed, so we need to restart all the writes */
2855 iomode = NFS_WRITE_FILESYNC;
2856 goto again;
2857 }
2858 }
2859 if (!error) {
2860 bp->nb_dirty = dirty;
2861 } else {
2862 SET(bp->nb_flags, NB_ERROR);
2863 bp->nb_error = error;
2864 }
2865 return error;
2866 }
2867
2868 /*
2869 * initiate the NFS WRITE RPC(s) for a buffer
2870 */
2871 int
2872 nfs_buf_write_rpc(struct nfsbuf *bp, int iomode, thread_t thd, kauth_cred_t cred)
2873 {
2874 struct nfsmount *nmp;
2875 nfsnode_t np = bp->nb_np;
2876 int error = 0, nfsvers, async;
2877 int offset, nrpcs;
2878 uint32_t nmwsize, length, len;
2879 struct nfsreq *req;
2880 struct nfsreq_cbinfo cb;
2881 uio_t auio;
2882 char uio_buf[UIO_SIZEOF(1)];
2883
2884 nmp = NFSTONMP(np);
2885 if (nfs_mount_gone(nmp)) {
2886 bp->nb_error = error = ENXIO;
2887 SET(bp->nb_flags, NB_ERROR);
2888 nfs_buf_iodone(bp);
2889 return error;
2890 }
2891 nfsvers = nmp->nm_vers;
2892 nmwsize = nmp->nm_wsize;
2893
2894 offset = bp->nb_offio;
2895 length = bp->nb_endio - bp->nb_offio;
2896
2897 /* Note: Can only do async I/O if nfsiods are configured. */
2898 async = (bp->nb_flags & NB_ASYNC) && (NFSIOD_MAX > 0);
2899 bp->nb_commitlevel = NFS_WRITE_FILESYNC;
2900 cb.rcb_func = async ? nfs_buf_write_rpc_finish : NULL;
2901 cb.rcb_bp = bp;
2902
2903 if ((nfsvers == NFS_VER2) && ((NBOFF(bp) + bp->nb_endio) > 0xffffffffLL)) {
2904 bp->nb_error = error = EFBIG;
2905 SET(bp->nb_flags, NB_ERROR);
2906 nfs_buf_iodone(bp);
2907 return error;
2908 }
2909
2910 auio = uio_createwithbuffer(1, NBOFF(bp) + offset, UIO_SYSSPACE,
2911 UIO_WRITE, &uio_buf, sizeof(uio_buf));
2912 uio_addiov(auio, CAST_USER_ADDR_T(bp->nb_data + offset), length);
2913
2914 bp->nb_rpcs = nrpcs = (length + nmwsize - 1) / nmwsize;
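/*
 * nrpcs is a ceiling division: e.g. a 64KB dirty range with a 32KB
 * write size yields 2 RPCs, as would a 33KB range.
 */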
2915 if (async && (nrpcs > 1)) {
2916 SET(bp->nb_flags, NB_MULTASYNCRPC);
2917 } else {
2918 CLR(bp->nb_flags, NB_MULTASYNCRPC);
2919 }
2920
2921 while (length > 0) {
2922 if (ISSET(bp->nb_flags, NB_ERROR)) {
2923 error = bp->nb_error;
2924 break;
2925 }
2926 len = (length > nmwsize) ? nmwsize : length;
2927 cb.rcb_args[0] = offset;
2928 cb.rcb_args[1] = len;
2929 #if CONFIG_NFS4
2930 if (nmp->nm_vers >= NFS_VER4) {
2931 cb.rcb_args[2] = nmp->nm_stategenid;
2932 }
2933 #endif
2934 if (async && ((error = nfs_async_write_start(nmp)))) {
2935 break;
2936 }
2937 req = NULL;
2938 error = nmp->nm_funcs->nf_write_rpc_async(np, auio, len, thd, cred,
2939 iomode, &cb, &req);
2940 if (error) {
2941 if (async) {
2942 nfs_async_write_done(nmp);
2943 }
2944 break;
2945 }
2946 offset += len;
2947 length -= len;
2948 if (async) {
2949 continue;
2950 }
2951 nfs_buf_write_rpc_finish(req);
2952 }
2953
2954 if (length > 0) {
2955 /*
2956 * Something bad happened while trying to send the RPCs.
2957 * Wait for any outstanding requests to complete.
2958 */
2959 bp->nb_error = error;
2960 SET(bp->nb_flags, NB_ERROR);
2961 if (ISSET(bp->nb_flags, NB_MULTASYNCRPC)) {
2962 nrpcs = (length + nmwsize - 1) / nmwsize;
2963 lck_mtx_lock(nfs_buf_mutex);
2964 bp->nb_rpcs -= nrpcs;
2965 if (bp->nb_rpcs == 0) {
2966 /* No RPCs left, so the buffer's done */
2967 lck_mtx_unlock(nfs_buf_mutex);
2968 nfs_buf_write_finish(bp, thd, cred);
2969 } else {
2970 /* wait for the last RPC to mark it done */
2971 while (bp->nb_rpcs > 0) {
2972 msleep(&bp->nb_rpcs, nfs_buf_mutex, 0,
2973 "nfs_buf_write_rpc_cancel", NULL);
2974 }
2975 lck_mtx_unlock(nfs_buf_mutex);
2976 }
2977 } else {
2978 nfs_buf_write_finish(bp, thd, cred);
2979 }
2980 /* It may have just been an interrupt... that's OK */
2981 if (!ISSET(bp->nb_flags, NB_ERROR)) {
2982 error = 0;
2983 }
2984 }
2985
2986 return error;
2987 }
2988
2989 /*
2990 * finish up an NFS WRITE RPC on a buffer
2991 */
2992 void
2993 nfs_buf_write_rpc_finish(struct nfsreq *req)
2994 {
2995 int error = 0, nfsvers, offset, length, multasyncrpc, finished;
2996 int committed = NFS_WRITE_FILESYNC;
2997 uint64_t wverf = 0;
2998 size_t rlen;
2999 void *wakeme = NULL;
3000 struct nfsreq_cbinfo cb;
3001 struct nfsreq *wreq = NULL;
3002 struct nfsbuf *bp;
3003 struct nfsmount *nmp;
3004 nfsnode_t np;
3005 thread_t thd;
3006 kauth_cred_t cred;
3007 uio_t auio;
3008 char uio_buf[UIO_SIZEOF(1)];
3009
3010 finish:
3011 np = req->r_np;
3012 thd = req->r_thread;
3013 cred = req->r_cred;
3014 if (IS_VALID_CRED(cred)) {
3015 kauth_cred_ref(cred);
3016 }
3017 cb = req->r_callback;
3018 bp = cb.rcb_bp;
3019 if (cb.rcb_func) { /* take an extra reference on the nfsreq in case we want to resend it later due to grace error */
3020 nfs_request_ref(req, 0);
3021 }
3022
3023 nmp = NFSTONMP(np);
3024 if (nfs_mount_gone(nmp)) {
3025 SET(bp->nb_flags, NB_ERROR);
3026 bp->nb_error = error = ENXIO;
3027 }
3028 if (error || ISSET(bp->nb_flags, NB_ERROR)) {
3029 /* just drop it */
3030 nfs_request_async_cancel(req);
3031 goto out;
3032 }
3033 nfsvers = nmp->nm_vers;
3034
3035 offset = cb.rcb_args[0];
3036 rlen = length = cb.rcb_args[1];
3037
3038 /* finish the RPC */
3039 error = nmp->nm_funcs->nf_write_rpc_async_finish(np, req, &committed, &rlen, &wverf);
3040 if ((error == EINPROGRESS) && cb.rcb_func) {
3041 /* async request restarted */
3042 if (cb.rcb_func) {
3043 nfs_request_rele(req);
3044 }
3045 if (IS_VALID_CRED(cred)) {
3046 kauth_cred_unref(&cred);
3047 }
3048 return;
3049 }
3050 #if CONFIG_NFS4
3051 if ((nmp->nm_vers >= NFS_VER4) && nfs_mount_state_error_should_restart(error) && !ISSET(bp->nb_flags, NB_ERROR)) {
3052 lck_mtx_lock(&nmp->nm_lock);
3053 if ((error != NFSERR_OLD_STATEID) && (error != NFSERR_GRACE) && (cb.rcb_args[2] == nmp->nm_stategenid)) {
3054 NP(np, "nfs_buf_write_rpc_finish: error %d @ 0x%llx, 0x%x 0x%x, initiating recovery",
3055 error, NBOFF(bp) + offset, cb.rcb_args[2], nmp->nm_stategenid);
3056 nfs_need_recover(nmp, error);
3057 }
3058 lck_mtx_unlock(&nmp->nm_lock);
3059 if (np->n_flag & NREVOKE) {
3060 error = EIO;
3061 } else {
3062 if (error == NFSERR_GRACE) {
3063 if (cb.rcb_func) {
3064 /*
3065 * For an async I/O request, handle a grace delay just like
3066 * jukebox errors. Set the resend time and queue it up.
3067 */
3068 struct timeval now;
3069 if (req->r_nmrep.nmc_mhead) {
3070 mbuf_freem(req->r_nmrep.nmc_mhead);
3071 req->r_nmrep.nmc_mhead = NULL;
3072 }
3073 req->r_error = 0;
3074 microuptime(&now);
3075 lck_mtx_lock(&req->r_mtx);
3076 req->r_resendtime = now.tv_sec + 2;
3077 req->r_xid = 0; // get a new XID
3078 req->r_flags |= R_RESTART;
3079 req->r_start = 0;
3080 nfs_asyncio_resend(req);
3081 lck_mtx_unlock(&req->r_mtx);
3082 if (IS_VALID_CRED(cred)) {
3083 kauth_cred_unref(&cred);
3084 }
3085 /* Note: the nfsreq reference taken above will be dropped when the resend finishes */
3086 return;
3087 }
3088 /* otherwise, just pause a couple seconds and retry */
3089 tsleep(&nmp->nm_state, (PZERO - 1), "nfsgrace", 2 * hz);
3090 }
3091 if (!(error = nfs_mount_state_wait_for_recovery(nmp))) {
3092 rlen = 0;
3093 goto writeagain;
3094 }
3095 }
3096 }
3097 #endif
3098 if (error) {
3099 SET(bp->nb_flags, NB_ERROR);
3100 bp->nb_error = error;
3101 }
3102 if (error || (nfsvers == NFS_VER2)) {
3103 goto out;
3104 }
3105 if (rlen <= 0) {
3106 SET(bp->nb_flags, NB_ERROR);
3107 bp->nb_error = error = EIO;
3108 goto out;
3109 }
3110
3111 /* save lowest commit level returned */
3112 if (committed < bp->nb_commitlevel) {
3113 bp->nb_commitlevel = committed;
3114 }
3115
3116 /* check the write verifier */
3117 if (!bp->nb_verf) {
3118 bp->nb_verf = wverf;
3119 } else if (bp->nb_verf != wverf) {
3120 /* verifier changed, so buffer will need to be rewritten */
3121 bp->nb_flags |= NB_STALEWVERF;
3122 bp->nb_commitlevel = NFS_WRITE_UNSTABLE;
3123 bp->nb_verf = wverf;
3124 }
3125
3126 /*
3127 * check for a short write
3128 *
3129 * If the server didn't write all the data, then we
3130 * need to issue another write for the rest of it.
3131 * (Don't bother if the buffer hit an error or stale wverf.)
3132 */
3133 if (((int)rlen < length) && !(bp->nb_flags & (NB_STALEWVERF | NB_ERROR))) {
3134 #if CONFIG_NFS4
3135 writeagain:
3136 #endif
3137 offset += rlen;
3138 length -= rlen;
3139
3140 auio = uio_createwithbuffer(1, NBOFF(bp) + offset, UIO_SYSSPACE,
3141 UIO_WRITE, &uio_buf, sizeof(uio_buf));
3142 uio_addiov(auio, CAST_USER_ADDR_T(bp->nb_data + offset), length);
3143
3144 cb.rcb_args[0] = offset;
3145 cb.rcb_args[1] = length;
3146 #if CONFIG_NFS4
3147 if (nmp->nm_vers >= NFS_VER4) {
3148 cb.rcb_args[2] = nmp->nm_stategenid;
3149 }
3150 #endif
3151 // XXX iomode should really match the original request
3152 error = nmp->nm_funcs->nf_write_rpc_async(np, auio, length, thd, cred,
3153 NFS_WRITE_FILESYNC, &cb, &wreq);
3154 if (!error) {
3155 if (IS_VALID_CRED(cred)) {
3156 kauth_cred_unref(&cred);
3157 }
3158 if (!cb.rcb_func) {
3159 /* if !async we'll need to wait for this RPC to finish */
3160 req = wreq;
3161 wreq = NULL;
3162 goto finish;
3163 }
3164 nfs_request_rele(req);
3165 /*
3166 * We're done here.
3167 * Outstanding RPC count is unchanged.
3168 * Callback will be called when RPC is done.
3169 */
3170 return;
3171 }
3172 SET(bp->nb_flags, NB_ERROR);
3173 bp->nb_error = error;
3174 }
3175
3176 out:
3177 if (cb.rcb_func) {
3178 nfs_async_write_done(nmp);
3179 nfs_request_rele(req);
3180 }
3181 /*
3182 * Decrement outstanding RPC count on buffer
3183 * and call nfs_buf_write_finish on last RPC.
3184 *
3185 * (Note: when there are multiple async RPCs issued for a
3186 * buffer we need nfs_buf_mutex to avoid problems when
3187 * aborting a partially-initiated set of RPCs)
3188 */
3189 multasyncrpc = ISSET(bp->nb_flags, NB_MULTASYNCRPC);
3190 if (multasyncrpc) {
3191 lck_mtx_lock(nfs_buf_mutex);
3192 }
3193
3194 bp->nb_rpcs--;
3195 finished = (bp->nb_rpcs == 0);
3196
3197 if (multasyncrpc) {
3198 lck_mtx_unlock(nfs_buf_mutex);
3199 }
3200
3201 if (finished) {
3202 if (multasyncrpc) {
3203 wakeme = &bp->nb_rpcs;
3204 }
3205 nfs_buf_write_finish(bp, thd, cred);
3206 if (wakeme) {
3207 wakeup(wakeme);
3208 }
3209 }
3210
3211 if (IS_VALID_CRED(cred)) {
3212 kauth_cred_unref(&cred);
3213 }
3214 }
3215
3216 /*
3217 * Send commit(s) for the given node's "needcommit" buffers
3218 */
3219 int
3220 nfs_flushcommits(nfsnode_t np, int nowait)
3221 {
3222 struct nfsmount *nmp;
3223 struct nfsbuf *bp, *prevlbp, *lbp;
3224 struct nfsbuflists blist, commitlist;
3225 int error = 0, retv, wcred_set, flags, dirty;
3226 u_quad_t off, endoff, toff;
3227 uint64_t wverf;
3228 u_int32_t count;
3229 kauth_cred_t wcred = NULL;
3230
3231 FSDBG_TOP(557, np, 0, 0, 0);
3232
3233 /*
3234 * A nb_flags == (NB_DELWRI | NB_NEEDCOMMIT) block has been written to the
3235 * server, but has not been committed to stable storage on the server
3236 * yet. The byte range is worked out for as many nfsbufs as we can handle
3237 * and the commit RPC is done.
3238 */
3239 if (!LIST_EMPTY(&np->n_dirtyblkhd)) {
3240 error = nfs_node_lock(np);
3241 if (error) {
3242 goto done;
3243 }
3244 np->n_flag |= NMODIFIED;
3245 nfs_node_unlock(np);
3246 }
3247
3248 off = (u_quad_t)-1;
3249 endoff = 0;
3250 wcred_set = 0;
3251 LIST_INIT(&commitlist);
3252
3253 nmp = NFSTONMP(np);
3254 if (nfs_mount_gone(nmp)) {
3255 error = ENXIO;
3256 goto done;
3257 }
3258 if (nmp->nm_vers == NFS_VER2) {
3259 error = EINVAL;
3260 goto done;
3261 }
3262
3263 flags = NBI_DIRTY;
3264 if (nowait) {
3265 flags |= NBI_NOWAIT;
3266 }
3267 lck_mtx_lock(nfs_buf_mutex);
3268 wverf = nmp->nm_verf;
3269 if (!nfs_buf_iterprepare(np, &blist, flags)) {
3270 while ((bp = LIST_FIRST(&blist))) {
3271 LIST_REMOVE(bp, nb_vnbufs);
3272 LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
3273 error = nfs_buf_acquire(bp, NBAC_NOWAIT, 0, 0);
3274 if (error) {
3275 continue;
3276 }
3277 if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
3278 nfs_buf_check_write_verifier(np, bp);
3279 }
3280 if (((bp->nb_flags & (NB_DELWRI | NB_NEEDCOMMIT)) != (NB_DELWRI | NB_NEEDCOMMIT)) ||
3281 (bp->nb_verf != wverf)) {
3282 nfs_buf_drop(bp);
3283 continue;
3284 }
3285 nfs_buf_remfree(bp);
3286
3287 /* buffer UPLs will be grabbed *in order* below */
3288
3289 FSDBG(557, bp, bp->nb_flags, bp->nb_valid, bp->nb_dirty);
3290 FSDBG(557, bp->nb_validoff, bp->nb_validend,
3291 bp->nb_dirtyoff, bp->nb_dirtyend);
3292
3293 /*
3294 * Work out if all buffers are using the same cred
3295 * so we can deal with them all with one commit.
3296 *
3297 * Note: creds in bp's must be obtained by kauth_cred_ref
3298 * on the same original cred in order for them to be equal.
3299 */
3300 if (wcred_set == 0) {
3301 wcred = bp->nb_wcred;
3302 if (!IS_VALID_CRED(wcred)) {
3303 panic("nfs: needcommit w/out wcred");
3304 }
3305 wcred_set = 1;
3306 } else if ((wcred_set == 1) && wcred != bp->nb_wcred) {
3307 wcred_set = -1;
3308 }
3309 SET(bp->nb_flags, NB_WRITEINPROG);
3310
3311 /*
3312 * Add this buffer to the list of buffers we are committing.
3313 * Buffers are inserted into the list in ascending order so that
3314 * we can take the UPLs in order after the list is complete.
3315 */
3316 prevlbp = NULL;
3317 LIST_FOREACH(lbp, &commitlist, nb_vnbufs) {
3318 if (bp->nb_lblkno < lbp->nb_lblkno) {
3319 break;
3320 }
3321 prevlbp = lbp;
3322 }
3323 LIST_REMOVE(bp, nb_vnbufs);
3324 if (prevlbp) {
3325 LIST_INSERT_AFTER(prevlbp, bp, nb_vnbufs);
3326 } else {
3327 LIST_INSERT_HEAD(&commitlist, bp, nb_vnbufs);
3328 }
3329
3330 /* update commit range start, end */
3331 toff = NBOFF(bp) + bp->nb_dirtyoff;
3332 if (toff < off) {
3333 off = toff;
3334 }
3335 toff += (u_quad_t)(bp->nb_dirtyend - bp->nb_dirtyoff);
3336 if (toff > endoff) {
3337 endoff = toff;
3338 }
3339 }
3340 nfs_buf_itercomplete(np, &blist, NBI_DIRTY);
3341 }
3342 lck_mtx_unlock(nfs_buf_mutex);
3343
3344 if (LIST_EMPTY(&commitlist)) {
3345 error = ENOBUFS;
3346 goto done;
3347 }
3348
3349 /*
3350 * We need a UPL to prevent others from accessing the buffers during
3351 * our commit RPC(s).
3352 *
3353 * We used to also check for dirty pages here; if there were any we'd
3354 * abort the commit and force the entire buffer to be written again.
3355 * Instead of doing that, we just go ahead and commit the dirty range,
3356 * and then leave the buffer around with dirty pages that will be
3357 * written out later.
3358 */
3359 LIST_FOREACH(bp, &commitlist, nb_vnbufs) {
3360 if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
3361 retv = nfs_buf_upl_setup(bp);
3362 if (retv) {
3363 /* Unable to create the UPL, the VM object probably no longer exists. */
3364 printf("nfs_flushcommits: upl create failed %d\n", retv);
3365 bp->nb_valid = bp->nb_dirty = 0;
3366 }
3367 }
3368 nfs_buf_upl_check(bp);
3369 }
3370
3371 /*
3372 * Commit data on the server, as required.
3373 * If all bufs are using the same wcred, then use that with
3374 * one call for all of them, otherwise commit each one
3375 * separately.
3376 */
3377 if (wcred_set == 1) {
3378 /*
3379 * Note, it's possible the commit range could be >2^32-1.
3380 * If it is, we'll send one commit that covers the whole file.
3381 */
3382 if ((endoff - off) > 0xffffffff) {
3383 count = 0;
3384 } else {
3385 count = (endoff - off);
3386 }
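/*
 * In the COMMIT RPC, a count of zero means "from offset to end of
 * file" (RFC 1813), so a range wider than 32 bits is covered by a
 * single commit from 'off' onward.
 */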
3387 retv = nmp->nm_funcs->nf_commit_rpc(np, off, count, wcred, wverf);
3388 } else {
3389 retv = 0;
3390 LIST_FOREACH(bp, &commitlist, nb_vnbufs) {
3391 toff = NBOFF(bp) + bp->nb_dirtyoff;
3392 count = bp->nb_dirtyend - bp->nb_dirtyoff;
3393 retv = nmp->nm_funcs->nf_commit_rpc(np, toff, count, bp->nb_wcred, wverf);
3394 if (retv) {
3395 break;
3396 }
3397 }
3398 }
3399
3400 /*
3401 * Now, either mark the blocks I/O done or mark the
3402 * blocks dirty, depending on whether the commit
3403 * succeeded.
3404 */
3405 while ((bp = LIST_FIRST(&commitlist))) {
3406 LIST_REMOVE(bp, nb_vnbufs);
3407 FSDBG(557, bp, retv, bp->nb_flags, bp->nb_dirty);
3408 nfs_node_lock_force(np);
3409 CLR(bp->nb_flags, (NB_NEEDCOMMIT | NB_WRITEINPROG));
3410 np->n_needcommitcnt--;
3411 CHECK_NEEDCOMMITCNT(np);
3412 nfs_node_unlock(np);
3413
3414 if (retv) {
3415 /* move back to dirty list */
3416 lck_mtx_lock(nfs_buf_mutex);
3417 LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
3418 lck_mtx_unlock(nfs_buf_mutex);
3419 nfs_buf_release(bp, 1);
3420 continue;
3421 }
3422
3423 nfs_node_lock_force(np);
3424 np->n_numoutput++;
3425 nfs_node_unlock(np);
3426 vnode_startwrite(NFSTOV(np));
3427 if (ISSET(bp->nb_flags, NB_DELWRI)) {
3428 lck_mtx_lock(nfs_buf_mutex);
3429 nfs_nbdwrite--;
3430 NFSBUFCNTCHK();
3431 lck_mtx_unlock(nfs_buf_mutex);
3432 wakeup(&nfs_nbdwrite);
3433 }
3434 CLR(bp->nb_flags, (NB_READ | NB_DONE | NB_ERROR | NB_DELWRI));
3435 /* if block still has dirty pages, we don't want it to */
3436 /* be released in nfs_buf_iodone(). So, don't set NB_ASYNC. */
3437 if (!(dirty = bp->nb_dirty)) {
3438 SET(bp->nb_flags, NB_ASYNC);
3439 } else {
3440 CLR(bp->nb_flags, NB_ASYNC);
3441 }
3442
3443 /* move to clean list */
3444 lck_mtx_lock(nfs_buf_mutex);
3445 LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
3446 lck_mtx_unlock(nfs_buf_mutex);
3447
3448 bp->nb_dirtyoff = bp->nb_dirtyend = 0;
3449
3450 nfs_buf_iodone(bp);
3451 if (dirty) {
3452 /* throw it back in as a delayed write buffer */
3453 CLR(bp->nb_flags, NB_DONE);
3454 nfs_buf_write_delayed(bp);
3455 }
3456 }
3457
3458 done:
3459 FSDBG_BOT(557, np, 0, 0, error);
3460 return error;
3461 }
3462
3463 /*
3464 * Flush all the blocks associated with a vnode.
3465 * Walk through the buffer pool and push any dirty pages
3466 * associated with the vnode.
3467 */
3468 int
3469 nfs_flush(nfsnode_t np, int waitfor, thread_t thd, int ignore_writeerr)
3470 {
3471 struct nfsbuf *bp;
3472 struct nfsbuflists blist;
3473 struct nfsmount *nmp = NFSTONMP(np);
3474 int error = 0, error2, slptimeo = 0, slpflag = 0;
3475 int nfsvers, flags, passone = 1;
3476
3477 FSDBG_TOP(517, np, waitfor, ignore_writeerr, 0);
3478
3479 if (nfs_mount_gone(nmp)) {
3480 error = ENXIO;
3481 goto out;
3482 }
3483 nfsvers = nmp->nm_vers;
3484 if (NMFLAG(nmp, INTR)) {
3485 slpflag = PCATCH;
3486 }
3487
3488 if (!LIST_EMPTY(&np->n_dirtyblkhd)) {
3489 nfs_node_lock_force(np);
3490 np->n_flag |= NMODIFIED;
3491 nfs_node_unlock(np);
3492 }
3493
3494 lck_mtx_lock(nfs_buf_mutex);
3495 while (np->n_bflag & NBFLUSHINPROG) {
3496 np->n_bflag |= NBFLUSHWANT;
3497 error = msleep(&np->n_bflag, nfs_buf_mutex, slpflag, "nfs_flush", NULL);
3498 if ((error && (error != EWOULDBLOCK)) ||
3499 ((error = nfs_sigintr(NFSTONMP(np), NULL, thd, 0)))) {
3500 lck_mtx_unlock(nfs_buf_mutex);
3501 goto out;
3502 }
3503 }
3504 np->n_bflag |= NBFLUSHINPROG;
3505
3506 /*
3507 * On the first pass, start async/unstable writes on all
3508 * delayed write buffers. Then wait for all writes to complete
3509 * and call nfs_flushcommits() to commit any uncommitted buffers.
3510 * On all subsequent passes, start STABLE writes on any remaining
3511 * dirty buffers. Then wait for all writes to complete.
3512 */
3513 again:
3514 FSDBG(518, LIST_FIRST(&np->n_dirtyblkhd), np->n_flag, 0, 0);
3515 if (!NFSTONMP(np)) {
3516 lck_mtx_unlock(nfs_buf_mutex);
3517 error = ENXIO;
3518 goto done;
3519 }
3520
3521 /* Start/do any write(s) that are required. */
3522 if (!nfs_buf_iterprepare(np, &blist, NBI_DIRTY)) {
3523 while ((bp = LIST_FIRST(&blist))) {
3524 LIST_REMOVE(bp, nb_vnbufs);
3525 LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
3526 flags = (passone || !(waitfor == MNT_WAIT || waitfor == MNT_DWAIT)) ? NBAC_NOWAIT : 0;
3527 if (flags != NBAC_NOWAIT) {
3528 nfs_buf_refget(bp);
3529 }
3530 while ((error = nfs_buf_acquire(bp, flags, slpflag, slptimeo))) {
3531 FSDBG(524, bp, flags, bp->nb_lflags, bp->nb_flags);
3532 if (error == EBUSY) {
3533 break;
3534 }
3535 if (error) {
3536 error2 = nfs_sigintr(NFSTONMP(np), NULL, thd, 0);
3537 if (error2) {
3538 if (flags != NBAC_NOWAIT) {
3539 nfs_buf_refrele(bp);
3540 }
3541 nfs_buf_itercomplete(np, &blist, NBI_DIRTY);
3542 lck_mtx_unlock(nfs_buf_mutex);
3543 error = error2;
3544 goto done;
3545 }
3546 if (slpflag == PCATCH) {
3547 slpflag = 0;
3548 slptimeo = 2 * hz;
3549 }
3550 }
3551 }
3552 if (flags != NBAC_NOWAIT) {
3553 nfs_buf_refrele(bp);
3554 }
3555 if (error == EBUSY) {
3556 continue;
3557 }
3558 if (!bp->nb_np) {
3559 /* buffer is no longer valid */
3560 nfs_buf_drop(bp);
3561 continue;
3562 }
3563 if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
3564 nfs_buf_check_write_verifier(np, bp);
3565 }
3566 if (!ISSET(bp->nb_flags, NB_DELWRI)) {
3567 /* buffer is no longer dirty */
3568 nfs_buf_drop(bp);
3569 continue;
3570 }
3571 FSDBG(525, bp, passone, bp->nb_lflags, bp->nb_flags);
3572 if ((passone || !(waitfor == MNT_WAIT || waitfor == MNT_DWAIT)) &&
3573 ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
3574 nfs_buf_drop(bp);
3575 continue;
3576 }
3577 nfs_buf_remfree(bp);
3578 lck_mtx_unlock(nfs_buf_mutex);
3579 if (ISSET(bp->nb_flags, NB_ERROR)) {
3580 nfs_node_lock_force(np);
3581 np->n_error = bp->nb_error ? bp->nb_error : EIO;
3582 np->n_flag |= NWRITEERR;
3583 nfs_node_unlock(np);
3584 nfs_buf_release(bp, 1);
3585 lck_mtx_lock(nfs_buf_mutex);
3586 continue;
3587 }
3588 SET(bp->nb_flags, NB_ASYNC);
3589 if (!passone) {
3590 /* NB_STABLE forces this to be written FILESYNC */
3591 SET(bp->nb_flags, NB_STABLE);
3592 }
3593 nfs_buf_write(bp);
3594 lck_mtx_lock(nfs_buf_mutex);
3595 }
3596 nfs_buf_itercomplete(np, &blist, NBI_DIRTY);
3597 }
3598 lck_mtx_unlock(nfs_buf_mutex);
3599
3600 if (waitfor == MNT_WAIT || waitfor == MNT_DWAIT) {
3601 while ((error = vnode_waitforwrites(NFSTOV(np), 0, slpflag, slptimeo, "nfsflush"))) {
3602 error2 = nfs_sigintr(NFSTONMP(np), NULL, thd, 0);
3603 if (error2) {
3604 error = error2;
3605 goto done;
3606 }
3607 if (slpflag == PCATCH) {
3608 slpflag = 0;
3609 slptimeo = 2 * hz;
3610 }
3611 }
3612 }
3613
3614 if (nfsvers != NFS_VER2) {
3615 /* loop while it looks like there are still buffers to be */
3616 /* committed and nfs_flushcommits() seems to be handling them. */
3617 while (np->n_needcommitcnt) {
3618 if (nfs_flushcommits(np, 0)) {
3619 break;
3620 }
3621 }
3622 }
3623
3624 if (passone) {
3625 passone = 0;
3626 if (!LIST_EMPTY(&np->n_dirtyblkhd)) {
3627 nfs_node_lock_force(np);
3628 np->n_flag |= NMODIFIED;
3629 nfs_node_unlock(np);
3630 }
3631 lck_mtx_lock(nfs_buf_mutex);
3632 goto again;
3633 }
3634
3635 if (waitfor == MNT_WAIT || waitfor == MNT_DWAIT) {
3636 if (!LIST_EMPTY(&np->n_dirtyblkhd)) {
3637 nfs_node_lock_force(np);
3638 np->n_flag |= NMODIFIED;
3639 nfs_node_unlock(np);
3640 }
3641 lck_mtx_lock(nfs_buf_mutex);
3642 if (!LIST_EMPTY(&np->n_dirtyblkhd)) {
3643 goto again;
3644 }
3645 lck_mtx_unlock(nfs_buf_mutex);
3646 nfs_node_lock_force(np);
3647 /*
3648 * OK, it looks like there are no dirty blocks. If we have no
3649 * writes in flight and no one in the write code, we can clear
3650 * the modified flag. In order to make sure we see the latest
3651 * attributes and size, we also invalidate the attributes and
3652 * advance the attribute cache XID to guarantee that attributes
3653 * newer than our clearing of NMODIFIED will get loaded next.
3654 * (If we don't do this, it's possible for the flush's final
3655 * write/commit (xid1) to be executed in parallel with a subsequent
3656 * getattr request (xid2). The getattr could return attributes
3657 * from *before* the write/commit completed but the stale attributes
3658 * would be preferred because of the xid ordering.)
3659 */
3660 if (!np->n_wrbusy && !np->n_numoutput) {
3661 np->n_flag &= ~NMODIFIED;
3662 NATTRINVALIDATE(np);
3663 nfs_get_xid(&np->n_xid);
3664 }
3665 } else {
3666 nfs_node_lock_force(np);
3667 }
3668
3669 FSDBG(526, np->n_flag, np->n_error, 0, 0);
3670 if (!ignore_writeerr && (np->n_flag & NWRITEERR)) {
3671 error = np->n_error;
3672 np->n_flag &= ~NWRITEERR;
3673 }
3674 nfs_node_unlock(np);
3675 done:
3676 lck_mtx_lock(nfs_buf_mutex);
3677 flags = np->n_bflag;
3678 np->n_bflag &= ~(NBFLUSHINPROG | NBFLUSHWANT);
3679 lck_mtx_unlock(nfs_buf_mutex);
3680 if (flags & NBFLUSHWANT) {
3681 wakeup(&np->n_bflag);
3682 }
3683 out:
3684 FSDBG_BOT(517, np, error, ignore_writeerr, 0);
3685 return error;
3686 }
3687
3688 /*
3689 * Flush out and invalidate all buffers associated with a vnode.
3690 * Called with the underlying object locked.
3691 */
3692 int
3693 nfs_vinvalbuf_internal(
3694 nfsnode_t np,
3695 int flags,
3696 thread_t thd,
3697 kauth_cred_t cred,
3698 int slpflag,
3699 int slptimeo)
3700 {
3701 struct nfsbuf *bp;
3702 struct nfsbuflists blist;
3703 int list, error = 0;
3704
3705 if (flags & V_SAVE) {
3706 if ((error = nfs_flush(np, MNT_WAIT, thd, (flags & V_IGNORE_WRITEERR)))) {
3707 return error;
3708 }
3709 }
3710
3711 lck_mtx_lock(nfs_buf_mutex);
3712 for (;;) {
3713 list = NBI_CLEAN;
3714 if (nfs_buf_iterprepare(np, &blist, list)) {
3715 list = NBI_DIRTY;
3716 if (nfs_buf_iterprepare(np, &blist, list)) {
3717 break;
3718 }
3719 }
3720 while ((bp = LIST_FIRST(&blist))) {
3721 LIST_REMOVE(bp, nb_vnbufs);
3722 if (list == NBI_CLEAN) {
3723 LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
3724 } else {
3725 LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
3726 }
3727 nfs_buf_refget(bp);
3728 while ((error = nfs_buf_acquire(bp, NBAC_REMOVE, slpflag, slptimeo))) {
3729 FSDBG(556, np, bp, NBOFF(bp), bp->nb_flags);
3730 if (error != EAGAIN) {
3731 FSDBG(554, np, bp, -1, error);
3732 nfs_buf_refrele(bp);
3733 nfs_buf_itercomplete(np, &blist, list);
3734 lck_mtx_unlock(nfs_buf_mutex);
3735 return error;
3736 }
3737 }
3738 nfs_buf_refrele(bp);
3739 FSDBG(554, np, bp, NBOFF(bp), bp->nb_flags);
3740 lck_mtx_unlock(nfs_buf_mutex);
3741 if ((flags & V_SAVE) && UBCINFOEXISTS(NFSTOV(np)) && bp->nb_np &&
3742 (NBOFF(bp) < (off_t)np->n_size)) {
3743 /* extra paranoia: make sure we're not */
3744 /* somehow leaving any dirty data around */
3745 int mustwrite = 0;
3746 int end = (NBOFF(bp) + bp->nb_bufsize > (off_t)np->n_size) ?
3747 ((off_t)np->n_size - NBOFF(bp)) : bp->nb_bufsize;
3748 if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
3749 error = nfs_buf_upl_setup(bp);
3750 if (error == EINVAL) {
3751 /* vm object must no longer exist */
3752 /* hopefully we don't need to do */
3753 /* anything for this buffer */
3754 } else if (error) {
3755 printf("nfs_vinvalbuf: upl setup failed %d\n", error);
3756 }
3757 bp->nb_valid = bp->nb_dirty = 0;
3758 }
3759 nfs_buf_upl_check(bp);
3760 /* check for any dirty data before the EOF */
3761 if ((bp->nb_dirtyend > 0) && (bp->nb_dirtyoff < end)) {
3762 /* clip dirty range to EOF */
3763 if (bp->nb_dirtyend > end) {
3764 bp->nb_dirtyend = end;
3765 if (bp->nb_dirtyoff >= bp->nb_dirtyend) {
3766 bp->nb_dirtyoff = bp->nb_dirtyend = 0;
3767 }
3768 }
3769 if ((bp->nb_dirtyend > 0) && (bp->nb_dirtyoff < end)) {
3770 mustwrite++;
3771 }
3772 }
3773 bp->nb_dirty &= (1 << (round_page_32(end) / PAGE_SIZE)) - 1;
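/*
 * Example of the mask above (assuming 4KB pages): end = 6KB gives
 * round_page_32(end) / PAGE_SIZE = 2, so (1 << 2) - 1 = 0x3 keeps the
 * dirty bits only for pages 0 and 1, the pages overlapping [0, end).
 */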
3774 if (bp->nb_dirty) {
3775 mustwrite++;
3776 }
3777 /* also make sure we'll have a credential to do the write */
3778 if (mustwrite && !IS_VALID_CRED(bp->nb_wcred) && !IS_VALID_CRED(cred)) {
3779 printf("nfs_vinvalbuf: found dirty buffer with no write creds\n");
3780 mustwrite = 0;
3781 }
3782 if (mustwrite) {
3783 FSDBG(554, np, bp, 0xd00dee, bp->nb_flags);
3784 if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
3785 panic("nfs_vinvalbuf: dirty buffer without upl");
3786 }
3787 /* gotta write out dirty data before invalidating */
3788 /* (NB_STABLE indicates that data writes should be FILESYNC) */
3789 /* (NB_NOCACHE indicates buffer should be discarded) */
3790 CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL | NB_ASYNC));
3791 SET(bp->nb_flags, NB_STABLE | NB_NOCACHE);
3792 if (!IS_VALID_CRED(bp->nb_wcred)) {
3793 kauth_cred_ref(cred);
3794 bp->nb_wcred = cred;
3795 }
3796 error = nfs_buf_write(bp);
3797 // Note: bp has been released
3798 if (error) {
3799 FSDBG(554, bp, 0xd00dee, 0xbad, error);
3800 nfs_node_lock_force(np);
3801 if ((error != EINTR) && (error != ERESTART)) {
3802 np->n_error = error;
3803 np->n_flag |= NWRITEERR;
3804 }
3805 /*
3806 * There was a write error and we need to
3807 * invalidate attrs to sync with server.
3808 * (if this write was extending the file,
3809 * we may no longer know the correct size)
3810 */
3811 NATTRINVALIDATE(np);
3812 nfs_node_unlock(np);
3813 if ((error == EINTR) || (error == ERESTART)) {
3814 /*
3815 * Abort on EINTR. If we don't, we could
3816 * be stuck in this loop forever because
3817 * the buffer will continue to stay dirty.
3818 */
3819 lck_mtx_lock(nfs_buf_mutex);
3820 nfs_buf_itercomplete(np, &blist, list);
3821 lck_mtx_unlock(nfs_buf_mutex);
3822 return error;
3823 }
3824 error = 0;
3825 }
3826 lck_mtx_lock(nfs_buf_mutex);
3827 continue;
3828 }
3829 }
3830 SET(bp->nb_flags, NB_INVAL);
3831 // hold off on FREEUPs until we're done here
3832 nfs_buf_release(bp, 0);
3833 lck_mtx_lock(nfs_buf_mutex);
3834 }
3835 nfs_buf_itercomplete(np, &blist, list);
3836 }
3837 if (!LIST_EMPTY(&(np)->n_dirtyblkhd) || !LIST_EMPTY(&(np)->n_cleanblkhd)) {
3838 panic("nfs_vinvalbuf: flush/inval failed");
3839 }
3840 lck_mtx_unlock(nfs_buf_mutex);
3841 nfs_node_lock_force(np);
3842 if (!(flags & V_SAVE)) {
3843 np->n_flag &= ~NMODIFIED;
3844 }
3845 if (vnode_vtype(NFSTOV(np)) == VREG) {
3846 np->n_lastrahead = -1;
3847 }
3848 nfs_node_unlock(np);
3849 NFS_BUF_FREEUP();
3850 return 0;
3851 }
3852
3853
3854 /*
3855 * Flush and invalidate all dirty buffers. If another process is already
3856 * doing the flush, just wait for completion.
3857 */
3858 int
3859 nfs_vinvalbuf(vnode_t vp, int flags, vfs_context_t ctx, int intrflg)
3860 {
3861 return nfs_vinvalbuf2(vp, flags, vfs_context_thread(ctx), vfs_context_ucred(ctx), intrflg);
3862 }
3863
3864 int
3865 nfs_vinvalbuf2(vnode_t vp, int flags, thread_t thd, kauth_cred_t cred, int intrflg)
3866 {
3867 nfsnode_t np = VTONFS(vp);
3868 struct nfsmount *nmp = VTONMP(vp);
3869 int error, slpflag, slptimeo, nflags, retry = 0;
3870 int ubcflags = UBC_PUSHALL | UBC_SYNC | UBC_INVALIDATE;
3871 struct timespec ts = { .tv_sec = 2, .tv_nsec = 0 };
3872 off_t size;
3873
3874 FSDBG_TOP(554, np, flags, intrflg, 0);
3875
3876 /*
3877 * If the mount is gone, there's no sense trying to write anything
3878 * and hanging while trying to do I/O.
3879 */
3880 if (nfs_mount_gone(nmp)) {
3881 flags &= ~V_SAVE;
3882 ubcflags &= ~UBC_PUSHALL;
3883 }
3884
3885 if (nmp && !NMFLAG(nmp, INTR)) {
3886 intrflg = 0;
3887 }
3888 if (intrflg) {
3889 slpflag = PCATCH;
3890 slptimeo = 2 * hz;
3891 } else {
3892 slpflag = 0;
3893 slptimeo = 0;
3894 }
3895
3896 /* First wait for any other process doing a flush to complete. */
3897 lck_mtx_lock(nfs_buf_mutex);
3898 while (np->n_bflag & NBINVALINPROG) {
3899 np->n_bflag |= NBINVALWANT;
3900 msleep(&np->n_bflag, nfs_buf_mutex, slpflag, "nfs_vinvalbuf", &ts);
3901 if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0))) {
3902 lck_mtx_unlock(nfs_buf_mutex);
3903 return error;
3904 }
3905 if (np->n_bflag & NBINVALINPROG) {
3906 slpflag = 0;
3907 }
3908 }
3909 np->n_bflag |= NBINVALINPROG;
3910 lck_mtx_unlock(nfs_buf_mutex);
3911
3912 /* Now, flush as required. */
3913 again:
3914 error = nfs_vinvalbuf_internal(np, flags, thd, cred, slpflag, 0);
3915 while (error) {
3916 FSDBG(554, np, 0, 0, error);
3917 if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0))) {
3918 goto done;
3919 }
3920 error = nfs_vinvalbuf_internal(np, flags, thd, cred, 0, slptimeo);
3921 }
3922
3923 /* get the pages out of vm also */
3924 if (UBCINFOEXISTS(vp) && (size = ubc_getsize(vp))) {
3925 if ((error = ubc_msync(vp, 0, size, NULL, ubcflags))) {
3926 if (error == EINVAL) {
3927 panic("nfs_vinvalbuf(): ubc_msync failed! error %d", error);
3928 }
3929 if (retry++ < 10) { /* retry invalidating a few times */
3930 if (retry > 1 || error == ENXIO) {
3931 ubcflags &= ~UBC_PUSHALL;
3932 }
3933 goto again;
3934 }
3935 /* give up */
3936 printf("nfs_vinvalbuf(): ubc_msync failed! error %d\n", error);
3937 }
3938 }
3939 done:
3940 lck_mtx_lock(nfs_buf_mutex);
3941 nflags = np->n_bflag;
3942 np->n_bflag &= ~(NBINVALINPROG | NBINVALWANT);
3943 lck_mtx_unlock(nfs_buf_mutex);
3944 if (nflags & NBINVALWANT) {
3945 wakeup(&np->n_bflag);
3946 }
3947
3948 FSDBG_BOT(554, np, flags, intrflg, error);
3949 return error;
3950 }
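/*
 * The NBINVALINPROG / NBINVALWANT pair above forms a single-owner gate.
 * A minimal sketch of the same pattern in isolation (fragment only; the
 * signal and timeout handling of the real code is omitted):
 */
#if 0 /* illustrative only */
	lck_mtx_lock(nfs_buf_mutex);
	while (np->n_bflag & NBINVALINPROG) {   /* another thread owns the flush */
		np->n_bflag |= NBINVALWANT;     /* ask to be woken on release */
		msleep(&np->n_bflag, nfs_buf_mutex, PZERO, "nfsgate", NULL);
	}
	np->n_bflag |= NBINVALINPROG;           /* this thread owns the flush now */
	lck_mtx_unlock(nfs_buf_mutex);
	/* ... flush ... then clear NBINVALINPROG and wakeup(&np->n_bflag) */
#endif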
3951
3952 /*
3953 * Wait for any busy buffers to complete.
3954 */
3955 void
3956 nfs_wait_bufs(nfsnode_t np)
3957 {
3958 struct nfsbuf *bp;
3959 struct nfsbuflists blist;
3960 int error = 0;
3961
3962 lck_mtx_lock(nfs_buf_mutex);
3963 if (!nfs_buf_iterprepare(np, &blist, NBI_CLEAN)) {
3964 while ((bp = LIST_FIRST(&blist))) {
3965 LIST_REMOVE(bp, nb_vnbufs);
3966 LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
3967 nfs_buf_refget(bp);
3968 while ((error = nfs_buf_acquire(bp, 0, 0, 0))) {
3969 if (error != EAGAIN) {
3970 nfs_buf_refrele(bp);
3971 nfs_buf_itercomplete(np, &blist, NBI_CLEAN);
3972 lck_mtx_unlock(nfs_buf_mutex);
3973 return;
3974 }
3975 }
3976 nfs_buf_refrele(bp);
3977 nfs_buf_drop(bp);
3978 }
3979 nfs_buf_itercomplete(np, &blist, NBI_CLEAN);
3980 }
3981 if (!nfs_buf_iterprepare(np, &blist, NBI_DIRTY)) {
3982 while ((bp = LIST_FIRST(&blist))) {
3983 LIST_REMOVE(bp, nb_vnbufs);
3984 LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
3985 nfs_buf_refget(bp);
3986 while ((error = nfs_buf_acquire(bp, 0, 0, 0))) {
3987 if (error != EAGAIN) {
3988 nfs_buf_refrele(bp);
3989 nfs_buf_itercomplete(np, &blist, NBI_DIRTY);
3990 lck_mtx_unlock(nfs_buf_mutex);
3991 return;
3992 }
3993 }
3994 nfs_buf_refrele(bp);
3995 nfs_buf_drop(bp);
3996 }
3997 nfs_buf_itercomplete(np, &blist, NBI_DIRTY);
3998 }
3999 lck_mtx_unlock(nfs_buf_mutex);
4000 }
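/*
 * A note on the wait pattern above (inferred from these loops alone):
 * nfs_buf_acquire() returns EAGAIN while a buffer is busy, so acquiring
 * and then immediately nfs_buf_drop()ing each buffer is what waits for
 * it to go idle; any other error aborts the scan.
 */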
4001
4002
4003 /*
4004 * Add an async I/O request to the mount's async I/O queue and make
4005 * sure that an nfsiod will service it.
4006 */
4007 void
4008 nfs_asyncio_finish(struct nfsreq *req)
4009 {
4010 	struct nfsmount *nmp = req->r_nmp; /* initialize here: FSDBG_TOP() below reads nmp before the again: re-read */
4011 struct nfsiod *niod;
4012 int started = 0;
4013
4014 FSDBG_TOP(552, nmp, 0, 0, 0);
4015 again:
4016 nmp = req->r_nmp;
4017
4018 if (nmp == NULL) {
4019 return;
4020 }
4021
4022 lck_mtx_lock(nfsiod_mutex);
4023 niod = nmp->nm_niod;
4024
4025 /* grab an nfsiod if we don't have one already */
4026 if (!niod) {
4027 niod = TAILQ_FIRST(&nfsiodfree);
4028 if (niod) {
4029 TAILQ_REMOVE(&nfsiodfree, niod, niod_link);
4030 TAILQ_INSERT_TAIL(&nfsiodwork, niod, niod_link);
4031 niod->niod_nmp = nmp;
4032 } else if (((nfsiod_thread_count < NFSIOD_MAX) || (nfsiod_thread_count <= 0)) && (started < 4)) {
4033 /*
4034 * Try starting a new thread.
4035 * We may try a couple times if other callers
4036 * get the new threads before we do.
4037 */
4038 lck_mtx_unlock(nfsiod_mutex);
4039 started++;
4040 if (!nfsiod_start()) {
4041 goto again;
4042 }
4043 lck_mtx_lock(nfsiod_mutex);
4044 }
4045 }
4046
4047 /*
4048 * If we got here while on the resendq, we need to get off it. This
4049 * happens when the timer fires and nfs_sigintr errors out queued
4050 * requests, or when we receive a reply (UDP case) while still on the
4051 * resend queue; either way we're finishing up and will not be resent.
4052 */
4053 lck_mtx_lock(&req->r_mtx);
4054 if (req->r_flags & R_RESENDQ) {
4055 lck_mtx_lock(&nmp->nm_lock);
4056 if (req->r_rchain.tqe_next != NFSREQNOLIST) {
4057 NFS_BIO_DBG("Processing async request on resendq. Removing");
4058 TAILQ_REMOVE(&nmp->nm_resendq, req, r_rchain);
4059 req->r_rchain.tqe_next = NFSREQNOLIST;
4060 assert(req->r_refs > 1);
4061 /* Remove resendq reference */
4062 req->r_refs--;
4063 }
4064 lck_mtx_unlock(&nmp->nm_lock);
4065 req->r_flags &= ~R_RESENDQ;
4066 }
4067 lck_mtx_unlock(&req->r_mtx);
4068
4069 if (req->r_achain.tqe_next == NFSREQNOLIST) {
4070 TAILQ_INSERT_TAIL(&nmp->nm_iodq, req, r_achain);
4071 }
4072
4073 /* If this mount doesn't already have an nfsiod working on it... */
4074 if (!nmp->nm_niod) {
4075 if (niod) { /* give it the nfsiod we just grabbed */
4076 nmp->nm_niod = niod;
4077 lck_mtx_unlock(nfsiod_mutex);
4078 wakeup(niod);
4079 } else if (nfsiod_thread_count > 0) {
4080 /* just queue it up on nfsiod mounts queue if needed */
4081 if (nmp->nm_iodlink.tqe_next == NFSNOLIST) {
4082 TAILQ_INSERT_TAIL(&nfsiodmounts, nmp, nm_iodlink);
4083 }
4084 lck_mtx_unlock(nfsiod_mutex);
4085 } else {
4086 printf("nfs_asyncio(): no nfsiods? %d %d (%d)\n", nfsiod_thread_count, NFSIOD_MAX, started);
4087 lck_mtx_unlock(nfsiod_mutex);
4088 /* we have no other option but to be persistent */
4089 started = 0;
4090 goto again;
4091 }
4092 } else {
4093 lck_mtx_unlock(nfsiod_mutex);
4094 }
4095
4096 FSDBG_BOT(552, nmp, 0, 0, 0);
4097 }
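/*
 * The handoff above, in outline (drawn only from the code in this
 * function):
 *   1. grab a free nfsiod, or try to start one (up to NFSIOD_MAX
 *      threads, with at most four start attempts per call);
 *   2. if the request is still on nm_resendq, remove it and drop the
 *      resendq reference;
 *   3. append the request to the mount's nm_iodq;
 *   4. wake the mount's nfsiod, park the mount on nfsiodmounts for the
 *      next idle nfsiod, or retry from the top if no nfsiod exists.
 */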
4098
4099 /*
4100 * queue up async I/O request for resend
4101 */
4102 void
4103 nfs_asyncio_resend(struct nfsreq *req)
4104 {
4105 struct nfsmount *nmp = req->r_nmp;
4106
4107 if (nfs_mount_gone(nmp)) {
4108 return;
4109 }
4110
4111 #if CONFIG_NFS_GSS
4112 nfs_gss_clnt_rpcdone(req);
4113 #endif
4114 lck_mtx_lock(&nmp->nm_lock);
4115 if (!(req->r_flags & R_RESENDQ)) {
4116 TAILQ_INSERT_TAIL(&nmp->nm_resendq, req, r_rchain);
4117 req->r_flags |= R_RESENDQ;
4118 /*
4119 * We take a reference on this request so that it can't be
4120 * destroyed while a resend is queued or in progress.
4121 */
4122 nfs_request_ref(req, 1);
4123 }
4124 nfs_mount_sock_thread_wake(nmp);
4125 lck_mtx_unlock(&nmp->nm_lock);
4126 }
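/*
 * A minimal sketch of the matching release (fragment only; this mirrors
 * the removal nfs_asyncio_finish() performs above when a queued request
 * is being finished rather than resent):
 */
#if 0 /* illustrative only */
	lck_mtx_lock(&nmp->nm_lock);
	if (req->r_rchain.tqe_next != NFSREQNOLIST) {
		TAILQ_REMOVE(&nmp->nm_resendq, req, r_rchain);
		req->r_rchain.tqe_next = NFSREQNOLIST;
		req->r_refs--;                  /* drop the resendq reference */
	}
	lck_mtx_unlock(&nmp->nm_lock);
	req->r_flags &= ~R_RESENDQ;
#endif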
4127
4128 /*
4129 * Read directory data into a buffer.
4130 *
4131 * Buffer will be filled (unless EOF is hit).
4132 * Buffers after this one may also be completely/partially filled.
4133 */
4134 int
4135 nfs_buf_readdir(struct nfsbuf *bp, vfs_context_t ctx)
4136 {
4137 nfsnode_t np = bp->nb_np;
4138 struct nfsmount *nmp = NFSTONMP(np);
4139 int error = 0;
4140
4141 if (nfs_mount_gone(nmp)) {
4142 return ENXIO;
4143 }
4144
4145 if (nmp->nm_vers < NFS_VER4) {
4146 error = nfs3_readdir_rpc(np, bp, ctx);
4147 }
4148 #if CONFIG_NFS4
4149 else {
4150 error = nfs4_readdir_rpc(np, bp, ctx);
4151 }
4152 #endif
4153 if (error && (error != NFSERR_DIRBUFDROPPED)) {
4154 SET(bp->nb_flags, NB_ERROR);
4155 bp->nb_error = error;
4156 }
4157 return error;
4158 }
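/*
 * A minimal caller sketch (fragment; variable setup omitted). The
 * nfs_buf_get() call follows the conventions used elsewhere in this
 * file, but this exact sequence is an assumption, not a quoted caller.
 */
#if 0 /* illustrative only */
	struct nfsbuf *bp = NULL;
	if (!(error = nfs_buf_get(np, lbn, nmp->nm_biosize, thd, NBLK_READ, &bp))) {
		error = nfs_buf_readdir(bp, ctx);       /* fill bp with directory data */
		nfs_buf_release(bp, 1);                 /* NB_ERROR is already set on failure */
	}
#endif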
4159
4160 #endif /* CONFIG_NFS_CLIENT */