bsd/nfs/nfs_bio.c

   1 /*
   2  * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
  29 /*
  30  * Copyright (c) 1989, 1993
  31  *      The Regents of the University of California.  All rights reserved.
  32  *
  33  * This code is derived from software contributed to Berkeley by
  34  * Rick Macklem at The University of Guelph.
  35  *
  36  * Redistribution and use in source and binary forms, with or without
  37  * modification, are permitted provided that the following conditions
  38  * are met:
  39  * 1. Redistributions of source code must retain the above copyright
  40  *    notice, this list of conditions and the following disclaimer.
  41  * 2. Redistributions in binary form must reproduce the above copyright
  42  *    notice, this list of conditions and the following disclaimer in the
  43  *    documentation and/or other materials provided with the distribution.
  44  * 3. All advertising materials mentioning features or use of this software
  45  *    must display the following acknowledgement:
  46  *      This product includes software developed by the University of
  47  *      California, Berkeley and its contributors.
  48  * 4. Neither the name of the University nor the names of its contributors
  49  *    may be used to endorse or promote products derived from this software
  50  *    without specific prior written permission.
  51  *
  52  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  53  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  54  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  55  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  56  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  57  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  58  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  59  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  60  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  61  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  62  * SUCH DAMAGE.
  63  *
  64  *      @(#)nfs_bio.c   8.9 (Berkeley) 3/30/95
  65  * FreeBSD-Id: nfs_bio.c,v 1.44 1997/09/10 19:52:25 phk Exp $
  66  */
  67 #include <sys/param.h>
  68 #include <sys/systm.h>
  69 #include <sys/resourcevar.h>
  70 #include <sys/signalvar.h>
  71 #include <sys/proc_internal.h>
  72 #include <sys/kauth.h>
  73 #include <sys/malloc.h>
  74 #include <sys/vnode.h>
  75 #include <sys/dirent.h>
  76 #include <sys/mount_internal.h>
  77 #include <sys/kernel.h>
  78 #include <sys/ubc_internal.h>
  79 #include <sys/uio_internal.h>
  80 #include <sys/kpi_mbuf.h>
  81
  82 #include <sys/vm.h>
  83 #include <sys/vmparam.h>
  84
  85 #include <sys/time.h>
  86 #include <kern/clock.h>
  87 #include <libkern/OSAtomic.h>
  88 #include <kern/kalloc.h>
  89 #include <kern/thread_call.h>
  90
  91 #include <nfs/rpcv2.h>
  92 #include <nfs/nfsproto.h>
  93 #include <nfs/nfs.h>
  94 #include <nfs/nfs_gss.h>
  95 #include <nfs/nfsmount.h>
  96 #include <nfs/nfsnode.h>
  97 #include <sys/buf_internal.h>
  98 #include <libkern/OSAtomic.h>
  99 #include <os/refcnt.h>
 100
 101 #define NFS_BIO_DBG(...) NFS_DBG(NFS_FAC_BIO, 7, ## __VA_ARGS__)
 102
 103 kern_return_t   thread_terminate(thread_t); /* XXX */
 104
 105 #define NFSBUFHASH(np, lbn)     \
 106         (&nfsbufhashtbl[((long)(np) / sizeof(*(np)) + (int)(lbn)) & nfsbufhash])
 107 LIST_HEAD(nfsbufhashhead, nfsbuf) * nfsbufhashtbl;
 108 struct nfsbuffreehead nfsbuffree, nfsbuffreemeta, nfsbufdelwri;
 109 u_long nfsbufhash;
 110 int nfsbufcnt, nfsbufmin, nfsbufmax, nfsbufmetacnt, nfsbufmetamax;
 111 int nfsbuffreecnt, nfsbuffreemetacnt, nfsbufdelwricnt, nfsneedbuffer;
 112 int nfs_nbdwrite;
 113 int nfs_buf_timer_on = 0;
 114 thread_t nfsbufdelwrithd = NULL;
 115
 116 lck_grp_t *nfs_buf_lck_grp;
 117 lck_mtx_t *nfs_buf_mutex;
 118
 119 #define NFSBUF_FREE_PERIOD      30      /* seconds */
 120 #define NFSBUF_LRU_STALE        120
 121 #define NFSBUF_META_STALE       240
 122
 123 /* number of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffree list */
 124 #define LRU_TO_FREEUP                   6
 125 /* number of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffreemeta list */
 126 #define META_TO_FREEUP                  3
 127 /* total number of nfsbufs nfs_buf_freeup() should attempt to free */
 128 #define TOTAL_TO_FREEUP                 (LRU_TO_FREEUP+META_TO_FREEUP)
 129 /* fraction of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffree list when called from timer */
 130 #define LRU_FREEUP_FRAC_ON_TIMER        8
 131 /* fraction of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffreemeta list when called from timer */
 132 #define META_FREEUP_FRAC_ON_TIMER       16
 133 /* fraction of total nfsbufs that nfsbuffreecnt should exceed before bothering to call nfs_buf_freeup() */
 134 #define LRU_FREEUP_MIN_FRAC             4
 135 /* fraction of total nfsbufs that nfsbuffreemetacnt should exceed before bothering to call nfs_buf_freeup() */
 136 #define META_FREEUP_MIN_FRAC            2
 137
 138 #define NFS_BUF_FREEUP() \
 139         do { \
 140         /* only call nfs_buf_freeup() if it has work to do: */ \
 141                 if (((nfsbuffreecnt > nfsbufcnt/LRU_FREEUP_MIN_FRAC) || \
 142                      (nfsbuffreemetacnt > nfsbufcnt/META_FREEUP_MIN_FRAC)) && \
 143                     ((nfsbufcnt - TOTAL_TO_FREEUP) > nfsbufmin)) \
 144                         nfs_buf_freeup(0); \
 145         } while (0)
 146
 147 /*
 148  * Initialize nfsbuf lists
 149  */
 150 void
 151 nfs_nbinit(void)
 152 {
 153         nfs_buf_lck_grp = lck_grp_alloc_init("nfs_buf", LCK_GRP_ATTR_NULL);
 154         nfs_buf_mutex = lck_mtx_alloc_init(nfs_buf_lck_grp, LCK_ATTR_NULL);
 155
 156         nfsbufcnt = nfsbufmetacnt =
 157             nfsbuffreecnt = nfsbuffreemetacnt = nfsbufdelwricnt = 0;
 158         nfsbufmin = 128;
 159         /* size nfsbufmax to cover at most half sane_size (w/default buf size) */
 160         nfsbufmax = (sane_size >> PAGE_SHIFT) / (2 * (NFS_RWSIZE >> PAGE_SHIFT));
 161         nfsbufmetamax = nfsbufmax / 4;
 162         nfsneedbuffer = 0;
 163         nfs_nbdwrite = 0;
 164
 165         nfsbufhashtbl = hashinit(nfsbufmax / 4, M_TEMP, &nfsbufhash);
 166         TAILQ_INIT(&nfsbuffree);
 167         TAILQ_INIT(&nfsbuffreemeta);
 168         TAILQ_INIT(&nfsbufdelwri);
 169 }
 170
 171 /*
 172  * Check periodically for stale/unused nfs bufs
 173  */
 174 void
 175 nfs_buf_timer(__unused void *param0, __unused void *param1)
 176 {
 177         nfs_buf_freeup(1);
 178
 179         lck_mtx_lock(nfs_buf_mutex);
 180         if (nfsbufcnt <= nfsbufmin) {
 181                 nfs_buf_timer_on = 0;
 182                 lck_mtx_unlock(nfs_buf_mutex);
 183                 return;
 184         }
 185         lck_mtx_unlock(nfs_buf_mutex);
 186
 187         nfs_interval_timer_start(nfs_buf_timer_call,
 188             NFSBUF_FREE_PERIOD * 1000);
 189 }
 190
 191 /*
 192  * try to free up some excess, unused nfsbufs
 193  */
 194 void
 195 nfs_buf_freeup(int timer)
 196 {
 197         struct nfsbuf *fbp;
 198         struct timeval now;
 199         int count;
 200         struct nfsbuffreehead nfsbuffreeup;
 201
 202         TAILQ_INIT(&nfsbuffreeup);
 203
 204         lck_mtx_lock(nfs_buf_mutex);
 205
 206         microuptime(&now);
 207
 208         FSDBG(320, nfsbufcnt, nfsbuffreecnt, nfsbuffreemetacnt, 0);
 209
 210         count = timer ? nfsbuffreecnt / LRU_FREEUP_FRAC_ON_TIMER : LRU_TO_FREEUP;
 211         while ((nfsbufcnt > nfsbufmin) && (count-- > 0)) {
 212                 fbp = TAILQ_FIRST(&nfsbuffree);
 213                 if (!fbp) {
 214                         break;
 215                 }
 216                 if (os_ref_get_count(&fbp->nb_refs) > 1) {
 217                         break;
 218                 }
 219                 if (NBUFSTAMPVALID(fbp) &&
 220                     (fbp->nb_timestamp + (2 * NFSBUF_LRU_STALE)) > now.tv_sec) {
 221                         break;
 222                 }
 223                 nfs_buf_remfree(fbp);
 224                 /* disassociate buffer from any nfsnode */
 225                 if (fbp->nb_np) {
 226                         if (fbp->nb_vnbufs.le_next != NFSNOLIST) {
 227                                 LIST_REMOVE(fbp, nb_vnbufs);
 228                                 fbp->nb_vnbufs.le_next = NFSNOLIST;
 229                         }
 230                         fbp->nb_np = NULL;
 231                 }
 232                 LIST_REMOVE(fbp, nb_hash);
 233                 TAILQ_INSERT_TAIL(&nfsbuffreeup, fbp, nb_free);
 234                 nfsbufcnt--;
 235         }
 236
 237         count = timer ? nfsbuffreemetacnt / META_FREEUP_FRAC_ON_TIMER : META_TO_FREEUP;
 238         while ((nfsbufcnt > nfsbufmin) && (count-- > 0)) {
 239                 fbp = TAILQ_FIRST(&nfsbuffreemeta);
 240                 if (!fbp) {
 241                         break;
 242                 }
 243                 if (os_ref_get_count(&fbp->nb_refs) > 1) {
 244                         break;
 245                 }
 246                 if (NBUFSTAMPVALID(fbp) &&
 247                     (fbp->nb_timestamp + (2 * NFSBUF_META_STALE)) > now.tv_sec) {
 248                         break;
 249                 }
 250                 nfs_buf_remfree(fbp);
 251                 /* disassociate buffer from any nfsnode */
 252                 if (fbp->nb_np) {
 253                         if (fbp->nb_vnbufs.le_next != NFSNOLIST) {
 254                                 LIST_REMOVE(fbp, nb_vnbufs);
 255                                 fbp->nb_vnbufs.le_next = NFSNOLIST;
 256                         }
 257                         fbp->nb_np = NULL;
 258                 }
 259                 LIST_REMOVE(fbp, nb_hash);
 260                 TAILQ_INSERT_TAIL(&nfsbuffreeup, fbp, nb_free);
 261                 nfsbufcnt--;
 262                 nfsbufmetacnt--;
 263         }
 264
 265         FSDBG(320, nfsbufcnt, nfsbuffreecnt, nfsbuffreemetacnt, 0);
 266         NFSBUFCNTCHK();
 267
 268         lck_mtx_unlock(nfs_buf_mutex);
 269
 270         while ((fbp = TAILQ_FIRST(&nfsbuffreeup))) {
 271                 TAILQ_REMOVE(&nfsbuffreeup, fbp, nb_free);
 272                 /* nuke any creds */
 273                 if (IS_VALID_CRED(fbp->nb_rcred)) {
 274                         kauth_cred_unref(&fbp->nb_rcred);
 275                 }
 276                 if (IS_VALID_CRED(fbp->nb_wcred)) {
 277                         kauth_cred_unref(&fbp->nb_wcred);
 278                 }
 279                 /* if buf was NB_META, dump buffer */
 280                 if (ISSET(fbp->nb_flags, NB_META) && fbp->nb_data) {
 281                         kfree(fbp->nb_data, fbp->nb_bufsize);
 282                 }
 283                 FREE(fbp, M_TEMP);
 284         }
 285 }
 286
 287 /*
 288  * remove a buffer from the freelist
 289  * (must be called with nfs_buf_mutex held)
 290  */
 291 void
 292 nfs_buf_remfree(struct nfsbuf *bp)
 293 {
 294         if (bp->nb_free.tqe_next == NFSNOLIST) {
 295                 panic("nfsbuf not on free list");
 296         }
 297         if (ISSET(bp->nb_flags, NB_DELWRI)) {
 298                 nfsbufdelwricnt--;
 299                 TAILQ_REMOVE(&nfsbufdelwri, bp, nb_free);
 300         } else if (ISSET(bp->nb_flags, NB_META)) {
 301                 nfsbuffreemetacnt--;
 302                 TAILQ_REMOVE(&nfsbuffreemeta, bp, nb_free);
 303         } else {
 304                 nfsbuffreecnt--;
 305                 TAILQ_REMOVE(&nfsbuffree, bp, nb_free);
 306         }
 307         bp->nb_free.tqe_next = NFSNOLIST;
 308         NFSBUFCNTCHK();
 309 }
 310
 311 /*
 312  * check for existence of nfsbuf in cache
 313  */
 314 boolean_t
 315 nfs_buf_is_incore(nfsnode_t np, daddr64_t blkno)
 316 {
 317         boolean_t rv;
 318         lck_mtx_lock(nfs_buf_mutex);
 319         if (nfs_buf_incore(np, blkno)) {
 320                 rv = TRUE;
 321         } else {
 322                 rv = FALSE;
 323         }
 324         lck_mtx_unlock(nfs_buf_mutex);
 325         return rv;
 326 }
 327
 328 /*
 329  * return incore buffer (must be called with nfs_buf_mutex held)
 330  */
 331 struct nfsbuf *
 332 nfs_buf_incore(nfsnode_t np, daddr64_t blkno)
 333 {
 334         /* Search hash chain */
 335         struct nfsbuf * bp = NFSBUFHASH(np, blkno)->lh_first;
 336         for (; bp != NULL; bp = bp->nb_hash.le_next) {
 337                 if ((bp->nb_lblkno == blkno) && (bp->nb_np == np)) {
 338                         if (!ISSET(bp->nb_flags, NB_INVAL)) {
 339                                 FSDBG(547, bp, blkno, bp->nb_flags, bp->nb_np);
 340                                 return bp;
 341                         }
 342                 }
 343         }
 344         return NULL;
 345 }
 346
 347 /*
 348  * Check if it's OK to drop a page.
 349  *
 350  * Called by vnode_pager() on pageout request of non-dirty page.
 351  * We need to make sure that it's not part of a delayed write.
 352  * If it is, we can't let the VM drop it because we may need it
 353  * later when/if we need to write the data (again).
 354  */
 355 int
 356 nfs_buf_page_inval(vnode_t vp, off_t offset)
 357 {
 358         struct nfsmount *nmp = VTONMP(vp);
 359         struct nfsbuf *bp;
 360         int error = 0;
 361
 362         if (nfs_mount_gone(nmp)) {
 363                 return ENXIO;
 364         }
 365
 366         lck_mtx_lock(nfs_buf_mutex);
 367         bp = nfs_buf_incore(VTONFS(vp), (daddr64_t)(offset / nmp->nm_biosize));
 368         if (!bp) {
 369                 goto out;
 370         }
 371         FSDBG(325, bp, bp->nb_flags, bp->nb_dirtyoff, bp->nb_dirtyend);
 372         if (ISSET(bp->nb_lflags, NBL_BUSY)) {
 373                 error = EBUSY;
 374                 goto out;
 375         }
 376         /*
 377          * If there's a dirty range in the buffer, check to
 378          * see if this page intersects with the dirty range.
 379          * If it does, we can't let the pager drop the page.
 380          */
 381         if (bp->nb_dirtyend > 0) {
 382                 int start = offset - NBOFF(bp);
 383                 if ((bp->nb_dirtyend > start) &&
 384                     (bp->nb_dirtyoff < (start + PAGE_SIZE))) {
 385                         /*
 386                          * Before returning the bad news, move the
 387                          * buffer to the start of the delwri list and
 388                          * give the list a push to try to flush the
 389                          * buffer out.
 390                          */
 391                         error = EBUSY;
 392                         nfs_buf_remfree(bp);
 393                         TAILQ_INSERT_HEAD(&nfsbufdelwri, bp, nb_free);
 394                         nfsbufdelwricnt++;
 395                         nfs_buf_delwri_push(1);
 396                 }
 397         }
 398 out:
 399         lck_mtx_unlock(nfs_buf_mutex);
 400         return error;
 401 }
 402
 403 /*
 404  * set up the UPL for a buffer
 405  * (must NOT be called with nfs_buf_mutex held)
 406  */
 407 int
 408 nfs_buf_upl_setup(struct nfsbuf *bp)
 409 {
 410         kern_return_t kret;
 411         upl_t upl;
 412         int upl_flags;
 413
 414         if (ISSET(bp->nb_flags, NB_PAGELIST)) {
 415                 return 0;
 416         }
 417
 418         upl_flags = UPL_PRECIOUS;
 419         if (!ISSET(bp->nb_flags, NB_READ)) {
 420                 /*
 421                  * We're doing a "write", so we intend to modify
 422                  * the pages we're gathering.
 423                  */
 424                 upl_flags |= UPL_WILL_MODIFY;
 425         }
 426         kret = ubc_create_upl_kernel(NFSTOV(bp->nb_np), NBOFF(bp), bp->nb_bufsize,
 427             &upl, NULL, upl_flags, VM_KERN_MEMORY_FILE);
 428         if (kret == KERN_INVALID_ARGUMENT) {
 429                 /* vm object probably doesn't exist any more */
 430                 bp->nb_pagelist = NULL;
 431                 return EINVAL;
 432         }
 433         if (kret != KERN_SUCCESS) {
 434                 printf("nfs_buf_upl_setup(): failed to get pagelist %d\n", kret);
 435                 bp->nb_pagelist = NULL;
 436                 return EIO;
 437         }
 438
 439         FSDBG(538, bp, NBOFF(bp), bp->nb_bufsize, bp->nb_np);
 440
 441         bp->nb_pagelist = upl;
 442         SET(bp->nb_flags, NB_PAGELIST);
 443         return 0;
 444 }
 445
 446 /*
 447  * update buffer's valid/dirty info from UBC
 448  * (must NOT be called with nfs_buf_mutex held)
 449  */
 450 void
 451 nfs_buf_upl_check(struct nfsbuf *bp)
 452 {
 453         upl_page_info_t *pl;
 454         off_t filesize, fileoffset;
 455         int i, npages;
 456
 457         if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
 458                 return;
 459         }
 460
 461         npages = round_page_32(bp->nb_bufsize) / PAGE_SIZE;
 462         filesize = ubc_getsize(NFSTOV(bp->nb_np));
 463         fileoffset = NBOFF(bp);
 464         if (fileoffset < filesize) {
 465                 SET(bp->nb_flags, NB_CACHE);
 466         } else {
 467                 CLR(bp->nb_flags, NB_CACHE);
 468         }
 469
 470         pl = ubc_upl_pageinfo(bp->nb_pagelist);
 471         bp->nb_valid = bp->nb_dirty = 0;
 472
 473         for (i = 0; i < npages; i++, fileoffset += PAGE_SIZE_64) {
 474                 /* anything beyond the end of the file is not valid or dirty */
 475                 if (fileoffset >= filesize) {
 476                         break;
 477                 }
 478                 if (!upl_valid_page(pl, i)) {
 479                         CLR(bp->nb_flags, NB_CACHE);
 480                         continue;
 481                 }
 482                 NBPGVALID_SET(bp, i);
 483                 if (upl_dirty_page(pl, i)) {
 484                         NBPGDIRTY_SET(bp, i);
 485                 }
 486         }
 487         fileoffset = NBOFF(bp);
 488         if (ISSET(bp->nb_flags, NB_CACHE)) {
 489                 bp->nb_validoff = 0;
 490                 bp->nb_validend = bp->nb_bufsize;
 491                 if (fileoffset + bp->nb_validend > filesize) {
 492                         bp->nb_validend = filesize - fileoffset;
 493                 }
 494         } else {
 495                 bp->nb_validoff = bp->nb_validend = -1;
 496         }
 497         FSDBG(539, bp, fileoffset, bp->nb_valid, bp->nb_dirty);
 498         FSDBG(539, bp->nb_validoff, bp->nb_validend, bp->nb_dirtyoff, bp->nb_dirtyend);
 499 }
 500
 501 /*
 502  * make sure that a buffer is mapped
 503  * (must NOT be called with nfs_buf_mutex held)
 504  */
 505 int
 506 nfs_buf_map(struct nfsbuf *bp)
 507 {
 508         kern_return_t kret;
 509
 510         if (bp->nb_data) {
 511                 return 0;
 512         }
 513         if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
 514                 return EINVAL;
 515         }
 516
 517         kret = ubc_upl_map(bp->nb_pagelist, (vm_offset_t *)&(bp->nb_data));
 518         if (kret != KERN_SUCCESS) {
 519                 panic("nfs_buf_map: ubc_upl_map() failed with (%d)", kret);
 520         }
 521         if (bp->nb_data == 0) {
 522                 panic("ubc_upl_map mapped 0");
 523         }
 524         FSDBG(540, bp, bp->nb_flags, NBOFF(bp), bp->nb_data);
 525         return 0;
 526 }
 527
 528 /*
 529  * normalize an nfsbuf's valid range
 530  *
 531  * the read/write code guarantees that we'll always have a valid
 532  * region that is an integral number of pages.  If either end
 533  * of the valid range isn't page-aligned, it gets corrected
 534  * here as we extend the valid range through all of the
 535  * contiguous valid pages.
 536  */
 537 void
 538 nfs_buf_normalize_valid_range(nfsnode_t np, struct nfsbuf *bp)
 539 {
 540         int pg, npg;
 541         /* pull validoff back to start of contiguous valid page range */
 542         pg = bp->nb_validoff / PAGE_SIZE;
 543         while (pg >= 0 && NBPGVALID(bp, pg)) {
 544                 pg--;
 545         }
 546         bp->nb_validoff = (pg + 1) * PAGE_SIZE;
 547         /* push validend forward to end of contiguous valid page range */
 548         npg = bp->nb_bufsize / PAGE_SIZE;
 549         pg = bp->nb_validend / PAGE_SIZE;
 550         while (pg < npg && NBPGVALID(bp, pg)) {
 551                 pg++;
 552         }
 553         bp->nb_validend = pg * PAGE_SIZE;
 554         /* clip to EOF */
 555         if (NBOFF(bp) + bp->nb_validend > (off_t)np->n_size) {
 556                 bp->nb_validend = np->n_size % bp->nb_bufsize;
 557         }
 558 }
 559
 560 /*
 561  * process some entries on the delayed write queue
 562  * (must be called with nfs_buf_mutex held)
 563  */
 564 void
 565 nfs_buf_delwri_service(void)
 566 {
 567         struct nfsbuf *bp;
 568         nfsnode_t np;
 569         int error, i = 0;
 570
 571         while (i < 8 && (bp = TAILQ_FIRST(&nfsbufdelwri)) != NULL) {
 572                 np = bp->nb_np;
 573                 nfs_buf_remfree(bp);
 574                 nfs_buf_refget(bp);
 575                 while ((error = nfs_buf_acquire(bp, 0, 0, 0)) == EAGAIN) {
 576                         ;
 577                 }
 578                 nfs_buf_refrele(bp);
 579                 if (error) {
 580                         break;
 581                 }
 582                 if (!bp->nb_np) {
 583                         /* buffer is no longer valid */
 584                         nfs_buf_drop(bp);
 585                         continue;
 586                 }
 587                 if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
 588                         nfs_buf_check_write_verifier(np, bp);
 589                 }
 590                 if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
 591                         /* put buffer at end of delwri list */
 592                         TAILQ_INSERT_TAIL(&nfsbufdelwri, bp, nb_free);
 593                         nfsbufdelwricnt++;
 594                         nfs_buf_drop(bp);
 595                         lck_mtx_unlock(nfs_buf_mutex);
 596                         nfs_flushcommits(np, 1);
 597                 } else {
 598                         SET(bp->nb_flags, NB_ASYNC);
 599                         lck_mtx_unlock(nfs_buf_mutex);
 600                         nfs_buf_write(bp);
 601                 }
 602                 i++;
 603                 lck_mtx_lock(nfs_buf_mutex);
 604         }
 605 }
 606
 607 /*
 608  * thread to service the delayed write queue when asked
 609  */
 610 void
 611 nfs_buf_delwri_thread(__unused void *arg, __unused wait_result_t wr)
 612 {
 613         struct timespec ts = { .tv_sec = 30, .tv_nsec = 0 };
 614         int error = 0;
 615
 616         lck_mtx_lock(nfs_buf_mutex);
 617         while (!error) {
 618                 nfs_buf_delwri_service();
 619                 error = msleep(&nfsbufdelwrithd, nfs_buf_mutex, 0, "nfsbufdelwri", &ts);
 620         }
 621         nfsbufdelwrithd = NULL;
 622         lck_mtx_unlock(nfs_buf_mutex);
 623         thread_terminate(nfsbufdelwrithd);
 624 }
 625
 626 /*
 627  * try to push out some delayed/uncommitted writes
 628  * ("locked" indicates whether nfs_buf_mutex is already held)
 629  */
 630 void
 631 nfs_buf_delwri_push(int locked)
 632 {
 633         if (TAILQ_EMPTY(&nfsbufdelwri)) {
 634                 return;
 635         }
 636         if (!locked) {
 637                 lck_mtx_lock(nfs_buf_mutex);
 638         }
 639         /* wake up the delayed write service thread */
 640         if (nfsbufdelwrithd) {
 641                 wakeup(&nfsbufdelwrithd);
 642         } else if (kernel_thread_start(nfs_buf_delwri_thread, NULL, &nfsbufdelwrithd) == KERN_SUCCESS) {
 643                 thread_deallocate(nfsbufdelwrithd);
 644         }
 645         /* otherwise, try to do some of the work ourselves */
 646         if (!nfsbufdelwrithd) {
 647                 nfs_buf_delwri_service();
 648         }
 649         if (!locked) {
 650                 lck_mtx_unlock(nfs_buf_mutex);
 651         }
 652 }
 653
 654 /*
 655  * Get an nfs buffer.
 656  *
 657  * Returns errno on error, 0 otherwise.
 658  * Any buffer is returned in *bpp.
 659  *
 660  * If NBLK_ONLYVALID is set, only return buffer if found in cache.
 661  * If NBLK_NOWAIT is set, don't wait for the buffer if it's marked BUSY.
 662  *
 663  * Check for existence of buffer in cache.
 664  * Or attempt to reuse a buffer from one of the free lists.
 665  * Or allocate a new buffer if we haven't already hit max allocation.
 666  * Or wait for a free buffer.
 667  *
 668  * If available buffer found, prepare it, and return it.
 669  *
 670  * If the calling process is interrupted by a signal for
 671  * an interruptible mount point, return EINTR.
 672  */
 673 int
 674 nfs_buf_get(
 675         nfsnode_t np,
 676         daddr64_t blkno,
 677         uint32_t size,
 678         thread_t thd,
 679         int flags,
 680         struct nfsbuf **bpp)
 681 {
 682         vnode_t vp = NFSTOV(np);
 683         struct nfsmount *nmp = VTONMP(vp);
 684         struct nfsbuf *bp;
 685         uint32_t bufsize;
 686         int slpflag = PCATCH;
 687         int operation = (flags & NBLK_OPMASK);
 688         int error = 0;
 689         struct timespec ts;
 690
 691         FSDBG_TOP(541, np, blkno, size, flags);
 692         *bpp = NULL;
 693
 694         bufsize = size;
 695         if (bufsize > NFS_MAXBSIZE) {
 696                 panic("nfs_buf_get: buffer larger than NFS_MAXBSIZE requested");
 697         }
 698
 699         if (nfs_mount_gone(nmp)) {
 700                 FSDBG_BOT(541, np, blkno, 0, ENXIO);
 701                 return ENXIO;
 702         }
 703
 704         if (!UBCINFOEXISTS(vp)) {
 705                 operation = NBLK_META;
 706         } else if (bufsize < (uint32_t)nmp->nm_biosize) {
 707                 /* reg files should always have biosize blocks */
 708                 bufsize = nmp->nm_biosize;
 709         }
 710
 711         /* if NBLK_WRITE, check for too many delayed/uncommitted writes */
 712         if ((operation == NBLK_WRITE) && (nfs_nbdwrite > NFS_A_LOT_OF_DELAYED_WRITES)) {
 713                 FSDBG_TOP(542, np, blkno, nfs_nbdwrite, NFS_A_LOT_OF_DELAYED_WRITES);
 714
 715                 /* poke the delwri list */
 716                 nfs_buf_delwri_push(0);
 717
 718                 /* sleep to let other threads run... */
 719                 tsleep(&nfs_nbdwrite, PCATCH, "nfs_nbdwrite", 1);
 720                 FSDBG_BOT(542, np, blkno, nfs_nbdwrite, NFS_A_LOT_OF_DELAYED_WRITES);
 721         }
 722
 723 loop:
 724         lck_mtx_lock(nfs_buf_mutex);
 725
 726         /* wait for any buffer invalidation/flushing to complete */
 727         while (np->n_bflag & NBINVALINPROG) {
 728                 np->n_bflag |= NBINVALWANT;
 729                 ts.tv_sec = 2;
 730                 ts.tv_nsec = 0;
 731                 msleep(&np->n_bflag, nfs_buf_mutex, slpflag, "nfs_buf_get_invalwait", &ts);
 732                 if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0))) {
 733                         lck_mtx_unlock(nfs_buf_mutex);
 734                         FSDBG_BOT(541, np, blkno, 0, error);
 735                         return error;
 736                 }
 737                 if (np->n_bflag & NBINVALINPROG) {
 738                         slpflag = 0;
 739                 }
 740         }
 741
 742         /* check for existence of nfsbuf in cache */
 743         if ((bp = nfs_buf_incore(np, blkno))) {
 744                 /* if busy, set wanted and wait */
 745                 if (ISSET(bp->nb_lflags, NBL_BUSY)) {
 746                         if (flags & NBLK_NOWAIT) {
 747                                 lck_mtx_unlock(nfs_buf_mutex);
 748                                 FSDBG_BOT(541, np, blkno, bp, 0xbcbcbcbc);
 749                                 return 0;
 750                         }
 751                         FSDBG_TOP(543, np, blkno, bp, bp->nb_flags);
 752                         SET(bp->nb_lflags, NBL_WANTED);
 753
 754                         ts.tv_sec = 2;
 755                         ts.tv_nsec = 0;
 756                         msleep(bp, nfs_buf_mutex, slpflag | (PRIBIO + 1) | PDROP,
 757                             "nfsbufget", (slpflag == PCATCH) ? NULL : &ts);
 758                         slpflag = 0;
 759                         FSDBG_BOT(543, np, blkno, bp, bp->nb_flags);
 760                         if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0))) {
 761                                 FSDBG_BOT(541, np, blkno, 0, error);
 762                                 return error;
 763                         }
 764                         goto loop;
 765                 }
 766                 if (bp->nb_bufsize != bufsize) {
 767                         panic("nfsbuf size mismatch");
 768                 }
 769                 SET(bp->nb_lflags, NBL_BUSY);
 770                 SET(bp->nb_flags, NB_CACHE);
 771                 nfs_buf_remfree(bp);
 772                 /* additional paranoia: */
 773                 if (ISSET(bp->nb_flags, NB_PAGELIST)) {
 774                         panic("pagelist buffer was not busy");
 775                 }
 776                 goto buffer_setup;
 777         }
 778
 779         if (flags & NBLK_ONLYVALID) {
 780                 lck_mtx_unlock(nfs_buf_mutex);
 781                 FSDBG_BOT(541, np, blkno, 0, 0x0000cace);
 782                 return 0;
 783         }
 784
 785         /*
 786          * where to get a free buffer:
 787          * - if meta and maxmeta reached, must reuse meta
 788          * - alloc new if we haven't reached min bufs
 789          * - if free lists are NOT empty
 790          *   - if free list is stale, use it
 791          *   - else if freemeta list is stale, use it
 792          *   - else if max bufs allocated, use least-time-to-stale
 793          * - alloc new if we haven't reached max allowed
 794          * - start clearing out delwri list and try again
 795          */
 796
 797         if ((operation == NBLK_META) && (nfsbufmetacnt >= nfsbufmetamax)) {
 798                 /* if we've hit max meta buffers, must reuse a meta buffer */
 799                 bp = TAILQ_FIRST(&nfsbuffreemeta);
 800         } else if ((nfsbufcnt > nfsbufmin) &&
 801             (!TAILQ_EMPTY(&nfsbuffree) || !TAILQ_EMPTY(&nfsbuffreemeta))) {
 802                 /* try to pull an nfsbuf off a free list */
 803                 struct nfsbuf *lrubp, *metabp;
 804                 struct timeval now;
 805                 microuptime(&now);
 806
 807                 /* if the next LRU or META buffer is invalid or stale, use it */
 808                 lrubp = TAILQ_FIRST(&nfsbuffree);
 809                 if (lrubp && (!NBUFSTAMPVALID(lrubp) ||
 810                     ((lrubp->nb_timestamp + NFSBUF_LRU_STALE) < now.tv_sec))) {
 811                         bp = lrubp;
 812                 }
 813                 metabp = TAILQ_FIRST(&nfsbuffreemeta);
 814                 if (!bp && metabp && (!NBUFSTAMPVALID(metabp) ||
 815                     ((metabp->nb_timestamp + NFSBUF_META_STALE) < now.tv_sec))) {
 816                         bp = metabp;
 817                 }
 818
 819                 if (!bp && (nfsbufcnt >= nfsbufmax)) {
 820                         /* we've already allocated all bufs, so */
 821                         /* choose the buffer that'll go stale first */
 822                         if (!metabp) {
 823                                 bp = lrubp;
 824                         } else if (!lrubp) {
 825                                 bp = metabp;
 826                         } else {
 827                                 int32_t lru_stale_time, meta_stale_time;
 828                                 lru_stale_time = lrubp->nb_timestamp + NFSBUF_LRU_STALE;
 829                                 meta_stale_time = metabp->nb_timestamp + NFSBUF_META_STALE;
 830                                 if (lru_stale_time <= meta_stale_time) {
 831                                         bp = lrubp;
 832                                 } else {
 833                                         bp = metabp;
 834                                 }
 835                         }
 836                 }
 837         }
 838
 839         if (bp) {
 840                 /* we have a buffer to reuse */
 841                 FSDBG(544, np, blkno, bp, bp->nb_flags);
 842                 nfs_buf_remfree(bp);
 843                 if (ISSET(bp->nb_flags, NB_DELWRI)) {
 844                         panic("nfs_buf_get: delwri");
 845                 }
 846                 SET(bp->nb_lflags, NBL_BUSY);
 847                 /* disassociate buffer from previous nfsnode */
 848                 if (bp->nb_np) {
 849                         if (bp->nb_vnbufs.le_next != NFSNOLIST) {
 850                                 LIST_REMOVE(bp, nb_vnbufs);
 851                                 bp->nb_vnbufs.le_next = NFSNOLIST;
 852                         }
 853                         bp->nb_np = NULL;
 854                 }
 855                 LIST_REMOVE(bp, nb_hash);
 856                 /* nuke any creds we're holding */
 857                 if (IS_VALID_CRED(bp->nb_rcred)) {
 858                         kauth_cred_unref(&bp->nb_rcred);
 859                 }
 860                 if (IS_VALID_CRED(bp->nb_wcred)) {
 861                         kauth_cred_unref(&bp->nb_wcred);
 862                 }
 863                 /* if buf will no longer be NB_META, dump old buffer */
 864                 if (operation == NBLK_META) {
 865                         if (!ISSET(bp->nb_flags, NB_META)) {
 866                                 nfsbufmetacnt++;
 867                         }
 868                 } else if (ISSET(bp->nb_flags, NB_META)) {
 869                         if (bp->nb_data) {
 870                                 kfree(bp->nb_data, bp->nb_bufsize);
 871                                 bp->nb_data = NULL;
 872                         }
 873                         nfsbufmetacnt--;
 874                 }
 875                 /* re-init buf fields */
 876                 bp->nb_error = 0;
 877                 bp->nb_validoff = bp->nb_validend = -1;
 878                 bp->nb_dirtyoff = bp->nb_dirtyend = 0;
 879                 bp->nb_valid = 0;
 880                 bp->nb_dirty = 0;
 881                 bp->nb_verf = 0;
 882         } else {
 883                 /* no buffer to reuse */
 884                 if ((nfsbufcnt < nfsbufmax) &&
 885                     ((operation != NBLK_META) || (nfsbufmetacnt < nfsbufmetamax))) {
 886                         /* just alloc a new one */
 887                         MALLOC(bp, struct nfsbuf *, sizeof(struct nfsbuf), M_TEMP, M_WAITOK);
 888                         if (!bp) {
 889                                 lck_mtx_unlock(nfs_buf_mutex);
 890                                 FSDBG_BOT(541, np, blkno, 0, error);
 891                                 return ENOMEM;
 892                         }
 893                         nfsbufcnt++;
 894
 895                         /*
 896                          * If any excess bufs, make sure the timer
 897                          * is running to free them up later.
 898                          */
 899                         if (nfsbufcnt > nfsbufmin && !nfs_buf_timer_on) {
 900                                 nfs_buf_timer_on = 1;
 901                                 nfs_interval_timer_start(nfs_buf_timer_call,
 902                                     NFSBUF_FREE_PERIOD * 1000);
 903                         }
 904
 905                         if (operation == NBLK_META) {
 906                                 nfsbufmetacnt++;
 907                         }
 908                         NFSBUFCNTCHK();
 909                         /* init nfsbuf */
 910                         bzero(bp, sizeof(*bp));
 911                         os_ref_init(&bp->nb_refs, NULL);
 912
 913                         bp->nb_free.tqe_next = NFSNOLIST;
 914                         bp->nb_validoff = bp->nb_validend = -1;
 915                         FSDBG(545, np, blkno, bp, 0);
 916                 } else {
 917                         /* too many bufs... wait for buffers to free up */
 918                         FSDBG_TOP(546, np, blkno, nfsbufcnt, nfsbufmax);
 919
 920                         /* poke the delwri list */
 921                         nfs_buf_delwri_push(1);
 922
 923                         nfsneedbuffer = 1;
 924                         msleep(&nfsneedbuffer, nfs_buf_mutex, PCATCH | PDROP, "nfsbufget", NULL);
 925                         FSDBG_BOT(546, np, blkno, nfsbufcnt, nfsbufmax);
 926                         if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0))) {
 927                                 FSDBG_BOT(541, np, blkno, 0, error);
 928                                 return error;
 929                         }
 930                         goto loop;
 931                 }
 932         }
 933
 934         /* set up nfsbuf */
 935         SET(bp->nb_lflags, NBL_BUSY);
 936         bp->nb_flags = 0;
 937         bp->nb_lblkno = blkno;
 938         /* insert buf in hash */
 939         LIST_INSERT_HEAD(NFSBUFHASH(np, blkno), bp, nb_hash);
 940         /* associate buffer with new nfsnode */
 941         bp->nb_np = np;
 942         LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
 943
 944 buffer_setup:
 945
 946         /* unlock hash */
 947         lck_mtx_unlock(nfs_buf_mutex);
 948
 949         switch (operation) {
 950         case NBLK_META:
 951                 SET(bp->nb_flags, NB_META);
 952                 if ((bp->nb_bufsize != bufsize) && bp->nb_data) {
 953                         kfree(bp->nb_data, bp->nb_bufsize);
 954                         bp->nb_data = NULL;
 955                         bp->nb_validoff = bp->nb_validend = -1;
 956                         bp->nb_dirtyoff = bp->nb_dirtyend = 0;
 957                         bp->nb_valid = 0;
 958                         bp->nb_dirty = 0;
 959                         CLR(bp->nb_flags, NB_CACHE);
 960                 }
 961                 if (!bp->nb_data) {
 962                         bp->nb_data = kalloc(bufsize);
 963                 }
 964                 if (!bp->nb_data) {
 965                         /* Ack! couldn't allocate the data buffer! */
 966                         /* clean up buffer and return error */
 967                         lck_mtx_lock(nfs_buf_mutex);
 968                         LIST_REMOVE(bp, nb_vnbufs);
 969                         bp->nb_vnbufs.le_next = NFSNOLIST;
 970                         bp->nb_np = NULL;
 971                         /* invalidate usage timestamp to allow immediate freeing */
 972                         NBUFSTAMPINVALIDATE(bp);
 973                         if (bp->nb_free.tqe_next != NFSNOLIST) {
 974                                 panic("nfsbuf on freelist");
 975                         }
 976                         TAILQ_INSERT_HEAD(&nfsbuffree, bp, nb_free);
 977                         nfsbuffreecnt++;
 978                         lck_mtx_unlock(nfs_buf_mutex);
 979                         FSDBG_BOT(541, np, blkno, 0xb00, ENOMEM);
 980                         return ENOMEM;
 981                 }
 982                 bp->nb_bufsize = bufsize;
 983                 break;
 984
 985         case NBLK_READ:
 986         case NBLK_WRITE:
 987                 /*
 988                  * Set or clear NB_READ now to let the UPL subsystem know
 989                  * if we intend to modify the pages or not.
 990                  */
 991                 if (operation == NBLK_READ) {
 992                         SET(bp->nb_flags, NB_READ);
 993                 } else {
 994                         CLR(bp->nb_flags, NB_READ);
 995                 }
 996                 if (bufsize < PAGE_SIZE) {
 997                         bufsize = PAGE_SIZE;
 998                 }
 999                 bp->nb_bufsize = bufsize;
1000                 bp->nb_validoff = bp->nb_validend = -1;
1001
1002                 if (UBCINFOEXISTS(vp)) {
1003                         /* set up upl */
1004                         if (nfs_buf_upl_setup(bp)) {
1005                                 /* unable to create upl */
1006                                 /* vm object must no longer exist */
1007                                 /* clean up buffer and return error */
1008                                 lck_mtx_lock(nfs_buf_mutex);
1009                                 LIST_REMOVE(bp, nb_vnbufs);
1010                                 bp->nb_vnbufs.le_next = NFSNOLIST;
1011                                 bp->nb_np = NULL;
1012                                 /* invalidate usage timestamp to allow immediate freeing */
1013                                 NBUFSTAMPINVALIDATE(bp);
1014                                 if (bp->nb_free.tqe_next != NFSNOLIST) {
1015                                         panic("nfsbuf on freelist");
1016                                 }
1017                                 TAILQ_INSERT_HEAD(&nfsbuffree, bp, nb_free);
1018                                 nfsbuffreecnt++;
1019                                 lck_mtx_unlock(nfs_buf_mutex);
1020                                 FSDBG_BOT(541, np, blkno, 0x2bc, EIO);
1021                                 return EIO;
1022                         }
1023                         nfs_buf_upl_check(bp);
1024                 }
1025                 break;
1026
1027         default:
1028                 panic("nfs_buf_get: %d unknown operation", operation);
1029         }
1030
1031         *bpp = bp;
1032
1033         FSDBG_BOT(541, np, blkno, bp, bp->nb_flags);
1034
1035         return 0;
1036 }
1037
1038 void
1039 nfs_buf_release(struct nfsbuf *bp, int freeup)
1040 {
1041         nfsnode_t np = bp->nb_np;
1042         vnode_t vp;
1043         struct timeval now;
1044         int wakeup_needbuffer, wakeup_buffer, wakeup_nbdwrite;
1045
1046         FSDBG_TOP(548, bp, NBOFF(bp), bp->nb_flags, bp->nb_data);
1047         FSDBG(548, bp->nb_validoff, bp->nb_validend, bp->nb_dirtyoff, bp->nb_dirtyend);
1048         FSDBG(548, bp->nb_valid, 0, bp->nb_dirty, 0);
1049
1050         vp = np ? NFSTOV(np) : NULL;
1051         if (vp && UBCINFOEXISTS(vp) && bp->nb_bufsize) {
1052                 int upl_flags, rv;
1053                 upl_t upl;
1054                 uint32_t i;
1055
1056                 if (!ISSET(bp->nb_flags, NB_PAGELIST) && !ISSET(bp->nb_flags, NB_INVAL)) {
1057                         rv = nfs_buf_upl_setup(bp);
1058                         if (rv) {
1059                                 printf("nfs_buf_release: upl create failed %d\n", rv);
1060                         } else {
1061                                 nfs_buf_upl_check(bp);
1062                         }
1063                 }
1064                 upl = bp->nb_pagelist;
1065                 if (!upl) {
1066                         goto pagelist_cleanup_done;
1067                 }
1068                 if (bp->nb_data) {
1069                         if (ubc_upl_unmap(upl) != KERN_SUCCESS) {
1070                                 panic("ubc_upl_unmap failed");
1071                         }
1072                         bp->nb_data = NULL;
1073                 }
1074                 /*
1075                  * Abort the pages on error or: if this is an invalid or
1076                  * non-needcommit nocache buffer AND no pages are dirty.
1077                  */
1078                 if (ISSET(bp->nb_flags, NB_ERROR) || (!bp->nb_dirty && (ISSET(bp->nb_flags, NB_INVAL) ||
1079                     (ISSET(bp->nb_flags, NB_NOCACHE) && !ISSET(bp->nb_flags, (NB_NEEDCOMMIT | NB_DELWRI)))))) {
1080                         if (ISSET(bp->nb_flags, (NB_READ | NB_INVAL | NB_NOCACHE))) {
1081                                 upl_flags = UPL_ABORT_DUMP_PAGES;
1082                         } else {
1083                                 upl_flags = 0;
1084                         }
1085                         ubc_upl_abort(upl, upl_flags);
1086                         goto pagelist_cleanup_done;
1087                 }
1088                 for (i = 0; i <= (bp->nb_bufsize - 1) / PAGE_SIZE; i++) {
1089                         if (!NBPGVALID(bp, i)) {
1090                                 ubc_upl_abort_range(upl,
1091                                     i * PAGE_SIZE, PAGE_SIZE,
1092                                     UPL_ABORT_DUMP_PAGES |
1093                                     UPL_ABORT_FREE_ON_EMPTY);
1094                         } else {
1095                                 if (NBPGDIRTY(bp, i)) {
1096                                         upl_flags = UPL_COMMIT_SET_DIRTY;
1097                                 } else {
1098                                         upl_flags = UPL_COMMIT_CLEAR_DIRTY;
1099                                 }
1100
1101                                 if (!ISSET(bp->nb_flags, (NB_NEEDCOMMIT | NB_DELWRI))) {
1102                                         upl_flags |= UPL_COMMIT_CLEAR_PRECIOUS;
1103                                 }
1104
1105                                 ubc_upl_commit_range(upl,
1106                                     i * PAGE_SIZE, PAGE_SIZE,
1107                                     upl_flags |
1108                                     UPL_COMMIT_INACTIVATE |
1109                                     UPL_COMMIT_FREE_ON_EMPTY);
1110                         }
1111                 }
1112 pagelist_cleanup_done:
1113                 /* invalidate any pages past EOF */
1114                 if (NBOFF(bp) + bp->nb_bufsize > (off_t)(np->n_size)) {
1115                         off_t start, end;
1116                         start = trunc_page_64(np->n_size) + PAGE_SIZE_64;
1117                         end = trunc_page_64(NBOFF(bp) + bp->nb_bufsize);
1118                         if (start < NBOFF(bp)) {
1119                                 start = NBOFF(bp);
1120                         }
1121                         if (end > start) {
1122                                 if ((rv = ubc_msync(vp, start, end, NULL, UBC_INVALIDATE))) {
1123                                         printf("nfs_buf_release(): ubc_msync failed!, error %d\n", rv);
1124                                 }
1125                         }
1126                 }
1127                 CLR(bp->nb_flags, NB_PAGELIST);
1128                 bp->nb_pagelist = NULL;
1129         }
1130
1131         lck_mtx_lock(nfs_buf_mutex);
1132
1133         wakeup_needbuffer = wakeup_buffer = wakeup_nbdwrite = 0;
1134
1135         /* Wake up any processes waiting for any buffer to become free. */
1136         if (nfsneedbuffer) {
1137                 nfsneedbuffer = 0;
1138                 wakeup_needbuffer = 1;
1139         }
1140         /* Wake up any processes waiting for _this_ buffer to become free. */
1141         if (ISSET(bp->nb_lflags, NBL_WANTED)) {
1142                 CLR(bp->nb_lflags, NBL_WANTED);
1143                 wakeup_buffer = 1;
1144         }
1145
1146         /* If it's non-needcommit nocache, or an error, mark it invalid. */
1147         if (ISSET(bp->nb_flags, NB_ERROR) ||
1148             (ISSET(bp->nb_flags, NB_NOCACHE) && !ISSET(bp->nb_flags, (NB_NEEDCOMMIT | NB_DELWRI)))) {
1149                 SET(bp->nb_flags, NB_INVAL);
1150         }
1151
1152         if ((bp->nb_bufsize <= 0) || ISSET(bp->nb_flags, NB_INVAL)) {
1153                 /* If it's invalid or empty, dissociate it from its nfsnode */
1154                 if (bp->nb_vnbufs.le_next != NFSNOLIST) {
1155                         LIST_REMOVE(bp, nb_vnbufs);
1156                         bp->nb_vnbufs.le_next = NFSNOLIST;
1157                 }
1158                 bp->nb_np = NULL;
1159                 /* if this was a delayed write, wakeup anyone */
1160                 /* waiting for delayed writes to complete */
1161                 if (ISSET(bp->nb_flags, NB_DELWRI)) {
1162                         CLR(bp->nb_flags, NB_DELWRI);
1163                         nfs_nbdwrite--;
1164                         NFSBUFCNTCHK();
1165                         wakeup_nbdwrite = 1;
1166                 }
1167                 /* invalidate usage timestamp to allow immediate freeing */
1168                 NBUFSTAMPINVALIDATE(bp);
1169                 /* put buffer at head of free list */
1170                 if (bp->nb_free.tqe_next != NFSNOLIST) {
1171                         panic("nfsbuf on freelist");
1172                 }
1173                 SET(bp->nb_flags, NB_INVAL);
1174                 if (ISSET(bp->nb_flags, NB_META)) {
1175                         TAILQ_INSERT_HEAD(&nfsbuffreemeta, bp, nb_free);
1176                         nfsbuffreemetacnt++;
1177                 } else {
1178                         TAILQ_INSERT_HEAD(&nfsbuffree, bp, nb_free);
1179                         nfsbuffreecnt++;
1180                 }
1181         } else if (ISSET(bp->nb_flags, NB_DELWRI)) {
1182                 /* put buffer at end of delwri list */
1183                 if (bp->nb_free.tqe_next != NFSNOLIST) {
1184                         panic("nfsbuf on freelist");
1185                 }
1186                 TAILQ_INSERT_TAIL(&nfsbufdelwri, bp, nb_free);
1187                 nfsbufdelwricnt++;
1188                 freeup = 0;
1189         } else {
1190                 /* update usage timestamp */
1191                 microuptime(&now);
1192                 bp->nb_timestamp = now.tv_sec;
1193                 /* put buffer at end of free list */
1194                 if (bp->nb_free.tqe_next != NFSNOLIST) {
1195                         panic("nfsbuf on freelist");
1196                 }
1197                 if (ISSET(bp->nb_flags, NB_META)) {
1198                         TAILQ_INSERT_TAIL(&nfsbuffreemeta, bp, nb_free);
1199                         nfsbuffreemetacnt++;
1200                 } else {
1201                         TAILQ_INSERT_TAIL(&nfsbuffree, bp, nb_free);
1202                         nfsbuffreecnt++;
1203                 }
1204         }
1205
1206         NFSBUFCNTCHK();
1207
1208         /* Unlock the buffer. */
1209         CLR(bp->nb_flags, (NB_ASYNC | NB_STABLE));
1210         CLR(bp->nb_lflags, NBL_BUSY);
1211
1212         FSDBG_BOT(548, bp, NBOFF(bp), bp->nb_flags, bp->nb_data);
1213
1214         lck_mtx_unlock(nfs_buf_mutex);
1215
1216         if (wakeup_needbuffer) {
1217                 wakeup(&nfsneedbuffer);
1218         }
1219         if (wakeup_buffer) {
1220                 wakeup(bp);
1221         }
1222         if (wakeup_nbdwrite) {
1223                 wakeup(&nfs_nbdwrite);
1224         }
1225         if (freeup) {
1226                 NFS_BUF_FREEUP();
1227         }
1228 }
1229
1230 /*
1231  * Wait for operations on the buffer to complete.
1232  * When they do, extract and return the I/O's error value.
1233  */
1234 int
1235 nfs_buf_iowait(struct nfsbuf *bp)
1236 {
1237         FSDBG_TOP(549, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);
1238
1239         lck_mtx_lock(nfs_buf_mutex);
1240
1241         while (!ISSET(bp->nb_flags, NB_DONE)) {
1242                 msleep(bp, nfs_buf_mutex, PRIBIO + 1, "nfs_buf_iowait", NULL);
1243         }
1244
1245         lck_mtx_unlock(nfs_buf_mutex);
1246
1247         FSDBG_BOT(549, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);
1248
1249         /* check for interruption of I/O, then errors. */
1250         if (ISSET(bp->nb_flags, NB_EINTR)) {
1251                 CLR(bp->nb_flags, NB_EINTR);
1252                 return EINTR;
1253         } else if (ISSET(bp->nb_flags, NB_ERROR)) {
1254                 return bp->nb_error ? bp->nb_error : EIO;
1255         }
1256         return 0;
1257 }
1258
1259 /*
1260  * Mark I/O complete on a buffer.
1261  */
1262 void
1263 nfs_buf_iodone(struct nfsbuf *bp)
1264 {
1265         FSDBG_TOP(550, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);
1266
1267         if (ISSET(bp->nb_flags, NB_DONE)) {
1268                 panic("nfs_buf_iodone already");
1269         }
1270
1271         if (!ISSET(bp->nb_flags, NB_READ)) {
1272                 CLR(bp->nb_flags, NB_WRITEINPROG);
1273                 /*
1274                  * vnode_writedone() takes care of waking up
1275                  * any throttled write operations
1276                  */
1277                 vnode_writedone(NFSTOV(bp->nb_np));
1278                 nfs_node_lock_force(bp->nb_np);
1279                 bp->nb_np->n_numoutput--;
1280                 nfs_node_unlock(bp->nb_np);
1281         }
1282         if (ISSET(bp->nb_flags, NB_ASYNC)) {    /* if async, release it */
1283                 SET(bp->nb_flags, NB_DONE);             /* note that it's done */
1284                 nfs_buf_release(bp, 1);
1285         } else {                                        /* or just wakeup the buffer */
1286                 lck_mtx_lock(nfs_buf_mutex);
1287                 SET(bp->nb_flags, NB_DONE);             /* note that it's done */
1288                 CLR(bp->nb_lflags, NBL_WANTED);
1289                 lck_mtx_unlock(nfs_buf_mutex);
1290                 wakeup(bp);
1291         }
1292
1293         FSDBG_BOT(550, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);
1294 }
1295
1296 void
1297 nfs_buf_write_delayed(struct nfsbuf *bp)
1298 {
1299         nfsnode_t np = bp->nb_np;
1300
1301         FSDBG_TOP(551, bp, NBOFF(bp), bp->nb_flags, 0);
1302         FSDBG(551, bp, bp->nb_dirtyoff, bp->nb_dirtyend, bp->nb_dirty);
1303
1304         /*
1305          * If the block hasn't been seen before:
1306          *      (1) Mark it as having been seen,
1307          *      (2) Make sure it's on its node's correct block list,
1308          */
1309         if (!ISSET(bp->nb_flags, NB_DELWRI)) {
1310                 SET(bp->nb_flags, NB_DELWRI);
1311                 /* move to dirty list */
1312                 lck_mtx_lock(nfs_buf_mutex);
1313                 nfs_nbdwrite++;
1314                 NFSBUFCNTCHK();
1315                 if (bp->nb_vnbufs.le_next != NFSNOLIST) {
1316                         LIST_REMOVE(bp, nb_vnbufs);
1317                 }
1318                 LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
1319                 lck_mtx_unlock(nfs_buf_mutex);
1320         }
1321
1322         /*
1323          * If the vnode has "too many" write operations in progress
1324          * wait for them to finish the IO
1325          */
1326         vnode_waitforwrites(NFSTOV(np), VNODE_ASYNC_THROTTLE, 0, 0, "nfs_buf_write_delayed");
1327
1328         /* the file is in a modified state, so make sure the flag's set */
1329         nfs_node_lock_force(np);
1330         np->n_flag |= NMODIFIED;
1331         nfs_node_unlock(np);
1332
1333         /*
1334          * If we have too many delayed write buffers,
1335          * just fall back to doing the async write.
1336          */
1337         if (nfs_nbdwrite < 0) {
1338                 panic("nfs_buf_write_delayed: Negative nfs_nbdwrite");
1339         }
1340         if (nfs_nbdwrite > NFS_A_LOT_OF_DELAYED_WRITES) {
1341                 /* issue async write */
1342                 SET(bp->nb_flags, NB_ASYNC);
1343                 nfs_buf_write(bp);
1344                 FSDBG_BOT(551, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);
1345                 return;
1346         }
1347
1348         /* Otherwise, the "write" is done, so mark and release the buffer. */
1349         SET(bp->nb_flags, NB_DONE);
1350         nfs_buf_release(bp, 1);
1351         FSDBG_BOT(551, bp, NBOFF(bp), bp->nb_flags, 0);
1352         return;
1353 }
1354
1355 /*
1356  * Check that a "needcommit" buffer can still be committed.
1357  * If the write verifier has changed, we need to clear the
1358  * the needcommit flag.
1359  */
1360 void
1361 nfs_buf_check_write_verifier(nfsnode_t np, struct nfsbuf *bp)
1362 {
1363         struct nfsmount *nmp;
1364
1365         if (!ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
1366                 return;
1367         }
1368
1369         nmp = NFSTONMP(np);
1370         if (nfs_mount_gone(nmp)) {
1371                 return;
1372         }
1373         if (!ISSET(bp->nb_flags, NB_STALEWVERF) && (bp->nb_verf == nmp->nm_verf)) {
1374                 return;
1375         }
1376
1377         /* write verifier changed, clear commit/wverf flags */
1378         CLR(bp->nb_flags, (NB_NEEDCOMMIT | NB_STALEWVERF));
1379         bp->nb_verf = 0;
1380         nfs_node_lock_force(np);
1381         np->n_needcommitcnt--;
1382         CHECK_NEEDCOMMITCNT(np);
1383         nfs_node_unlock(np);
1384 }
1385
1386 /*
1387  * add a reference to a buffer so it doesn't disappear while being used
1388  * (must be called with nfs_buf_mutex held)
1389  */
1390 void
1391 nfs_buf_refget(struct nfsbuf *bp)
1392 {
1393         os_ref_retain_locked(&bp->nb_refs);
1394 }
1395 /*
1396  * release a reference on a buffer
1397  * (must be called with nfs_buf_mutex held)
1398  */
1399 void
1400 nfs_buf_refrele(struct nfsbuf *bp)
1401 {
1402         (void) os_ref_release_locked(&bp->nb_refs);
1403 }
1404
1405 /*
1406  * mark a particular buffer as BUSY
1407  * (must be called with nfs_buf_mutex held)
1408  */
1409 errno_t
1410 nfs_buf_acquire(struct nfsbuf *bp, int flags, int slpflag, int slptimeo)
1411 {
1412         errno_t error;
1413         struct timespec ts;
1414
1415         if (ISSET(bp->nb_lflags, NBL_BUSY)) {
1416                 /*
1417                  * since the lck_mtx_lock may block, the buffer
1418                  * may become BUSY, so we need to recheck for
1419                  * a NOWAIT request
1420                  */
1421                 if (flags & NBAC_NOWAIT) {
1422                         return EBUSY;
1423                 }
1424                 SET(bp->nb_lflags, NBL_WANTED);
1425
1426                 ts.tv_sec = (slptimeo / 100);
1427                 /* the hz value is 100; which leads to 10ms */
1428                 ts.tv_nsec = (slptimeo % 100) * 10  * NSEC_PER_USEC * 1000;
1429
1430                 error = msleep(bp, nfs_buf_mutex, slpflag | (PRIBIO + 1),
1431                     "nfs_buf_acquire", &ts);
1432                 if (error) {
1433                         return error;
1434                 }
1435                 return EAGAIN;
1436         }
1437         if (flags & NBAC_REMOVE) {
1438                 nfs_buf_remfree(bp);
1439         }
1440         SET(bp->nb_lflags, NBL_BUSY);
1441
1442         return 0;
1443 }
1444
1445 /*
1446  * simply drop the BUSY status of a buffer
1447  * (must be called with nfs_buf_mutex held)
1448  */
1449 void
1450 nfs_buf_drop(struct nfsbuf *bp)
1451 {
1452         int need_wakeup = 0;
1453
1454         if (!ISSET(bp->nb_lflags, NBL_BUSY)) {
1455                 panic("nfs_buf_drop: buffer not busy!");
1456         }
1457         if (ISSET(bp->nb_lflags, NBL_WANTED)) {
1458                 /* delay the actual wakeup until after we clear NBL_BUSY */
1459                 need_wakeup = 1;
1460         }
1461         /* Unlock the buffer. */
1462         CLR(bp->nb_lflags, (NBL_BUSY | NBL_WANTED));
1463
1464         if (need_wakeup) {
1465                 wakeup(bp);
1466         }
1467 }
1468
1469 /*
1470  * prepare for iterating over an nfsnode's buffer list
1471  * this lock protects the queue manipulation
1472  * (must be called with nfs_buf_mutex held)
1473  */
1474 int
1475 nfs_buf_iterprepare(nfsnode_t np, struct nfsbuflists *iterheadp, int flags)
1476 {
1477         struct nfsbuflists *listheadp;
1478
1479         if (flags & NBI_DIRTY) {
1480                 listheadp = &np->n_dirtyblkhd;
1481         } else {
1482                 listheadp = &np->n_cleanblkhd;
1483         }
1484
1485         if ((flags & NBI_NOWAIT) && (np->n_bufiterflags & NBI_ITER)) {
1486                 LIST_INIT(iterheadp);
1487                 return EWOULDBLOCK;
1488         }
1489
1490         while (np->n_bufiterflags & NBI_ITER) {
1491                 np->n_bufiterflags |= NBI_ITERWANT;
1492                 msleep(&np->n_bufiterflags, nfs_buf_mutex, 0, "nfs_buf_iterprepare", NULL);
1493         }
1494         if (LIST_EMPTY(listheadp)) {
1495                 LIST_INIT(iterheadp);
1496                 return EINVAL;
1497         }
1498         np->n_bufiterflags |= NBI_ITER;
1499
1500         iterheadp->lh_first = listheadp->lh_first;
1501         listheadp->lh_first->nb_vnbufs.le_prev = &iterheadp->lh_first;
1502         LIST_INIT(listheadp);
1503
1504         return 0;
1505 }
1506
1507 /*
1508  * clean up after iterating over an nfsnode's buffer list
1509  * this lock protects the queue manipulation
1510  * (must be called with nfs_buf_mutex held)
1511  */
1512 void
1513 nfs_buf_itercomplete(nfsnode_t np, struct nfsbuflists *iterheadp, int flags)
1514 {
1515         struct nfsbuflists * listheadp;
1516         struct nfsbuf *bp;
1517
1518         if (flags & NBI_DIRTY) {
1519                 listheadp = &np->n_dirtyblkhd;
1520         } else {
1521                 listheadp = &np->n_cleanblkhd;
1522         }
1523
1524         while (!LIST_EMPTY(iterheadp)) {
1525                 bp = LIST_FIRST(iterheadp);
1526                 LIST_REMOVE(bp, nb_vnbufs);
1527                 LIST_INSERT_HEAD(listheadp, bp, nb_vnbufs);
1528         }
1529
1530         np->n_bufiterflags &= ~NBI_ITER;
1531         if (np->n_bufiterflags & NBI_ITERWANT) {
1532                 np->n_bufiterflags &= ~NBI_ITERWANT;
1533                 wakeup(&np->n_bufiterflags);
1534         }
1535 }
1536
1537
1538 /*
1539  * Read an NFS buffer for a file.
1540  */
1541 int
1542 nfs_buf_read(struct nfsbuf *bp)
1543 {
1544         int error = 0;
1545         nfsnode_t np;
1546         thread_t thd;
1547         kauth_cred_t cred;
1548
1549         np = bp->nb_np;
1550         cred = bp->nb_rcred;
1551         if (IS_VALID_CRED(cred)) {
1552                 kauth_cred_ref(cred);
1553         }
1554         thd = ISSET(bp->nb_flags, NB_ASYNC) ? NULL : current_thread();
1555
1556         /* sanity checks */
1557         if (!ISSET(bp->nb_flags, NB_READ)) {
1558                 panic("nfs_buf_read: !NB_READ");
1559         }
1560         if (ISSET(bp->nb_flags, NB_DONE)) {
1561                 CLR(bp->nb_flags, NB_DONE);
1562         }
1563
1564         NFS_BUF_MAP(bp);
1565
1566         OSAddAtomic64(1, &nfsstats.read_bios);
1567
1568         error = nfs_buf_read_rpc(bp, thd, cred);
1569         /*
1570          * For async I/O, the callbacks will finish up the
1571          * read.  Otherwise, the read has already been finished.
1572          */
1573
1574         if (IS_VALID_CRED(cred)) {
1575                 kauth_cred_unref(&cred);
1576         }
1577         return error;
1578 }
1579
1580 /*
1581  * finish the reading of a buffer
1582  */
1583 void
1584 nfs_buf_read_finish(struct nfsbuf *bp)
1585 {
1586         nfsnode_t np = bp->nb_np;
1587         struct nfsmount *nmp;
1588
1589         if (!ISSET(bp->nb_flags, NB_ERROR)) {
1590                 /* update valid range */
1591                 bp->nb_validoff = 0;
1592                 bp->nb_validend = bp->nb_endio;
1593                 if (bp->nb_endio < (int)bp->nb_bufsize) {
1594                         /*
1595                          * The read may be short because we have unflushed writes
1596                          * that are extending the file size and the reads hit the
1597                          * (old) EOF on the server.  So, just make sure nb_validend
1598                          * correctly tracks EOF.
1599                          * Note that the missing data should have already been zeroed
1600                          * in nfs_buf_read_rpc_finish().
1601                          */
1602                         off_t boff = NBOFF(bp);
1603                         if ((off_t)np->n_size >= (boff + bp->nb_bufsize)) {
1604                                 bp->nb_validend = bp->nb_bufsize;
1605                         } else if ((off_t)np->n_size >= boff) {
1606                                 bp->nb_validend = np->n_size - boff;
1607                         } else {
1608                                 bp->nb_validend = 0;
1609                         }
1610                 }
1611                 if ((nmp = NFSTONMP(np)) && (nmp->nm_vers == NFS_VER2) &&
1612                     ((NBOFF(bp) + bp->nb_validend) > 0x100000000LL)) {
1613                         bp->nb_validend = 0x100000000LL - NBOFF(bp);
1614                 }
1615                 bp->nb_valid = (uint32_t)(1LLU << (round_page_32(bp->nb_validend) / PAGE_SIZE)) - 1;
1616                 if (bp->nb_validend & PAGE_MASK) {
1617                         /* zero-fill remainder of last page */
1618                         bzero(bp->nb_data + bp->nb_validend, PAGE_SIZE - (bp->nb_validend & PAGE_MASK));
1619                 }
1620         }
1621         nfs_buf_iodone(bp);
1622 }
1623
1624 /*
1625  * initiate the NFS READ RPC(s) for a buffer
1626  */
1627 int
1628 nfs_buf_read_rpc(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred)
1629 {
1630         struct nfsmount *nmp;
1631         nfsnode_t np = bp->nb_np;
1632         int error = 0, nfsvers, async;
1633         int offset, nrpcs;
1634         uint32_t nmrsize, length, len;
1635         off_t boff;
1636         struct nfsreq *req;
1637         struct nfsreq_cbinfo cb;
1638
1639         nmp = NFSTONMP(np);
1640         if (nfs_mount_gone(nmp)) {
1641                 bp->nb_error = error = ENXIO;
1642                 SET(bp->nb_flags, NB_ERROR);
1643                 nfs_buf_iodone(bp);
1644                 return error;
1645         }
1646         nfsvers = nmp->nm_vers;
1647         nmrsize = nmp->nm_rsize;
1648
1649         boff = NBOFF(bp);
1650         offset = 0;
1651         length = bp->nb_bufsize;
1652
1653         if (nfsvers == NFS_VER2) {
1654                 if (boff > 0xffffffffLL) {
1655                         bp->nb_error = error = EFBIG;
1656                         SET(bp->nb_flags, NB_ERROR);
1657                         nfs_buf_iodone(bp);
1658                         return error;
1659                 }
1660                 if ((boff + length - 1) > 0xffffffffLL) {
1661                         length = 0x100000000LL - boff;
1662                 }
1663         }
1664
1665         /* Note: Can only do async I/O if nfsiods are configured. */
1666         async = (bp->nb_flags & NB_ASYNC);
1667         cb.rcb_func = async ? nfs_buf_read_rpc_finish : NULL;
1668         cb.rcb_bp = bp;
1669
1670         bp->nb_offio = bp->nb_endio = 0;
1671         bp->nb_rpcs = nrpcs = (length + nmrsize - 1) / nmrsize;
1672         if (async && (nrpcs > 1)) {
1673                 SET(bp->nb_flags, NB_MULTASYNCRPC);
1674         } else {
1675                 CLR(bp->nb_flags, NB_MULTASYNCRPC);
1676         }
1677
1678         while (length > 0) {
1679                 if (ISSET(bp->nb_flags, NB_ERROR)) {
1680                         error = bp->nb_error;
1681                         break;
1682                 }
1683                 len = (length > nmrsize) ? nmrsize : length;
1684                 cb.rcb_args[0] = offset;
1685                 cb.rcb_args[1] = len;
1686 #if CONFIG_NFS4
1687                 if (nmp->nm_vers >= NFS_VER4) {
1688                         cb.rcb_args[2] = nmp->nm_stategenid;
1689                 }
1690 #endif
1691                 req = NULL;
1692                 error = nmp->nm_funcs->nf_read_rpc_async(np, boff + offset, len, thd, cred, &cb, &req);
1693                 if (error) {
1694                         break;
1695                 }
1696                 offset += len;
1697                 length -= len;
1698                 if (async) {
1699                         continue;
1700                 }
1701                 nfs_buf_read_rpc_finish(req);
1702                 if (ISSET(bp->nb_flags, NB_ERROR)) {
1703                         error = bp->nb_error;
1704                         break;
1705                 }
1706         }
1707
1708         if (length > 0) {
1709                 /*
1710                  * Something bad happened while trying to send the RPC(s).
1711                  * Wait for any outstanding requests to complete.
1712                  */
1713                 bp->nb_error = error;
1714                 SET(bp->nb_flags, NB_ERROR);
1715                 if (ISSET(bp->nb_flags, NB_MULTASYNCRPC)) {
1716                         nrpcs = (length + nmrsize - 1) / nmrsize;
1717                         lck_mtx_lock(nfs_buf_mutex);
1718                         bp->nb_rpcs -= nrpcs;
1719                         if (bp->nb_rpcs == 0) {
1720                                 /* No RPCs left, so the buffer's done */
1721                                 lck_mtx_unlock(nfs_buf_mutex);
1722                                 nfs_buf_iodone(bp);
1723                         } else {
1724                                 /* wait for the last RPC to mark it done */
1725                                 while (bp->nb_rpcs > 0) {
1726                                         msleep(&bp->nb_rpcs, nfs_buf_mutex, 0,
1727                                             "nfs_buf_read_rpc_cancel", NULL);
1728                                 }
1729                                 lck_mtx_unlock(nfs_buf_mutex);
1730                         }
1731                 } else {
1732                         nfs_buf_iodone(bp);
1733                 }
1734         }
1735
1736         return error;
1737 }
1738
1739 /*
1740  * finish up an NFS READ RPC on a buffer
1741  */
1742 void
1743 nfs_buf_read_rpc_finish(struct nfsreq *req)
1744 {
1745         struct nfsmount *nmp;
1746         size_t rlen;
1747         struct nfsreq_cbinfo cb;
1748         struct nfsbuf *bp;
1749         int error = 0, nfsvers, offset, length, eof = 0, multasyncrpc, finished;
1750         void *wakeme = NULL;
1751         struct nfsreq *rreq = NULL;
1752         nfsnode_t np;
1753         thread_t thd;
1754         kauth_cred_t cred;
1755         uio_t auio;
1756         char uio_buf[UIO_SIZEOF(1)];
1757
1758 finish:
1759         np = req->r_np;
1760         thd = req->r_thread;
1761         cred = req->r_cred;
1762         if (IS_VALID_CRED(cred)) {
1763                 kauth_cred_ref(cred);
1764         }
1765         cb = req->r_callback;
1766         bp = cb.rcb_bp;
1767         if (cb.rcb_func) { /* take an extra reference on the nfsreq in case we want to resend it later due to grace error */
1768                 nfs_request_ref(req, 0);
1769         }
1770
1771         nmp = NFSTONMP(np);
1772         if (nfs_mount_gone(nmp)) {
1773                 SET(bp->nb_flags, NB_ERROR);
1774                 bp->nb_error = error = ENXIO;
1775         }
1776         if (error || ISSET(bp->nb_flags, NB_ERROR)) {
1777                 /* just drop it */
1778                 nfs_request_async_cancel(req);
1779                 goto out;
1780         }
1781
1782         nfsvers = nmp->nm_vers;
1783         offset = cb.rcb_args[0];
1784         rlen = length = cb.rcb_args[1];
1785
1786         auio = uio_createwithbuffer(1, NBOFF(bp) + offset, UIO_SYSSPACE,
1787             UIO_READ, &uio_buf, sizeof(uio_buf));
1788         uio_addiov(auio, CAST_USER_ADDR_T(bp->nb_data + offset), length);
1789
1790         /* finish the RPC */
1791         error = nmp->nm_funcs->nf_read_rpc_async_finish(np, req, auio, &rlen, &eof);
1792         if ((error == EINPROGRESS) && cb.rcb_func) {
1793                 /* async request restarted */
1794                 if (cb.rcb_func) {
1795                         nfs_request_rele(req);
1796                 }
1797                 if (IS_VALID_CRED(cred)) {
1798                         kauth_cred_unref(&cred);
1799                 }
1800                 return;
1801         }
1802 #if CONFIG_NFS4
1803         if ((nmp->nm_vers >= NFS_VER4) && nfs_mount_state_error_should_restart(error) && !ISSET(bp->nb_flags, NB_ERROR)) {
1804                 lck_mtx_lock(&nmp->nm_lock);
1805                 if ((error != NFSERR_OLD_STATEID) && (error != NFSERR_GRACE) && (cb.rcb_args[2] == nmp->nm_stategenid)) {
1806                         NP(np, "nfs_buf_read_rpc_finish: error %d @ 0x%llx, 0x%x 0x%x, initiating recovery",
1807                             error, NBOFF(bp) + offset, cb.rcb_args[2], nmp->nm_stategenid);
1808                         nfs_need_recover(nmp, error);
1809                 }
1810                 lck_mtx_unlock(&nmp->nm_lock);
1811                 if (np->n_flag & NREVOKE) {
1812                         error = EIO;
1813                 } else {
1814                         if (error == NFSERR_GRACE) {
1815                                 if (cb.rcb_func) {
1816                                         /*
1817                                          * For an async I/O request, handle a grace delay just like
1818                                          * jukebox errors.  Set the resend time and queue it up.
1819                                          */
1820                                         struct timeval now;
1821                                         if (req->r_nmrep.nmc_mhead) {
1822                                                 mbuf_freem(req->r_nmrep.nmc_mhead);
1823                                                 req->r_nmrep.nmc_mhead = NULL;
1824                                         }
1825                                         req->r_error = 0;
1826                                         microuptime(&now);
1827                                         lck_mtx_lock(&req->r_mtx);
1828                                         req->r_resendtime = now.tv_sec + 2;
1829                                         req->r_xid = 0;                 // get a new XID
1830                                         req->r_flags |= R_RESTART;
1831                                         req->r_start = 0;
1832                                         nfs_asyncio_resend(req);
1833                                         lck_mtx_unlock(&req->r_mtx);
1834                                         if (IS_VALID_CRED(cred)) {
1835                                                 kauth_cred_unref(&cred);
1836                                         }
1837                                         /* Note: nfsreq reference taken will be dropped later when finished */
1838                                         return;
1839                                 }
1840                                 /* otherwise, just pause a couple seconds and retry */
1841                                 tsleep(&nmp->nm_state, (PZERO - 1), "nfsgrace", 2 * hz);
1842                         }
1843                         if (!(error = nfs_mount_state_wait_for_recovery(nmp))) {
1844                                 rlen = 0;
1845                                 goto readagain;
1846                         }
1847                 }
1848         }
1849 #endif
1850         if (error) {
1851                 SET(bp->nb_flags, NB_ERROR);
1852                 bp->nb_error = error;
1853                 goto out;
1854         }
1855
1856         if ((rlen > 0) && (bp->nb_endio < (offset + (int)rlen))) {
1857                 bp->nb_endio = offset + rlen;
1858         }
1859
1860         if ((nfsvers == NFS_VER2) || eof || (rlen == 0)) {
1861                 /* zero out the remaining data (up to EOF) */
1862                 off_t rpcrem, eofrem, rem;
1863                 rpcrem = (length - rlen);
1864                 eofrem = np->n_size - (NBOFF(bp) + offset + rlen);
1865                 rem = (rpcrem < eofrem) ? rpcrem : eofrem;
1866                 if (rem > 0) {
1867                         bzero(bp->nb_data + offset + rlen, rem);
1868                 }
1869         } else if (((int)rlen < length) && !ISSET(bp->nb_flags, NB_ERROR)) {
1870                 /*
1871                  * short read
1872                  *
1873                  * We haven't hit EOF and we didn't get all the data
1874                  * requested, so we need to issue another read for the rest.
1875                  * (Don't bother if the buffer already hit an error.)
1876                  */
1877 #if CONFIG_NFS4
1878 readagain:
1879 #endif
1880                 offset += rlen;
1881                 length -= rlen;
1882                 cb.rcb_args[0] = offset;
1883                 cb.rcb_args[1] = length;
1884 #if CONFIG_NFS4
1885                 if (nmp->nm_vers >= NFS_VER4) {
1886                         cb.rcb_args[2] = nmp->nm_stategenid;
1887                 }
1888 #endif
1889                 error = nmp->nm_funcs->nf_read_rpc_async(np, NBOFF(bp) + offset, length, thd, cred, &cb, &rreq);
1890                 if (!error) {
1891                         if (IS_VALID_CRED(cred)) {
1892                                 kauth_cred_unref(&cred);
1893                         }
1894                         if (!cb.rcb_func) {
1895                                 /* if !async we'll need to wait for this RPC to finish */
1896                                 req = rreq;
1897                                 rreq = NULL;
1898                                 goto finish;
1899                         }
1900                         nfs_request_rele(req);
1901                         /*
1902                          * We're done here.
1903                          * Outstanding RPC count is unchanged.
1904                          * Callback will be called when RPC is done.
1905                          */
1906                         return;
1907                 }
1908                 SET(bp->nb_flags, NB_ERROR);
1909                 bp->nb_error = error;
1910         }
1911
1912 out:
1913         if (cb.rcb_func) {
1914                 nfs_request_rele(req);
1915         }
1916         if (IS_VALID_CRED(cred)) {
1917                 kauth_cred_unref(&cred);
1918         }
1919
1920         /*
1921          * Decrement outstanding RPC count on buffer
1922          * and call nfs_buf_read_finish on last RPC.
1923          *
1924          * (Note: when there are multiple async RPCs issued for a
1925          * buffer we need nfs_buffer_mutex to avoid problems when
1926          * aborting a partially-initiated set of RPCs)
1927          */
1928
1929         multasyncrpc = ISSET(bp->nb_flags, NB_MULTASYNCRPC);
1930         if (multasyncrpc) {
1931                 lck_mtx_lock(nfs_buf_mutex);
1932         }
1933
1934         bp->nb_rpcs--;
1935         finished = (bp->nb_rpcs == 0);
1936
1937         if (multasyncrpc) {
1938                 lck_mtx_unlock(nfs_buf_mutex);
1939         }
1940
1941         if (finished) {
1942                 if (multasyncrpc) {
1943                         wakeme = &bp->nb_rpcs;
1944                 }
1945                 nfs_buf_read_finish(bp);
1946                 if (wakeme) {
1947                         wakeup(wakeme);
1948                 }
1949         }
1950 }
1951
1952 /*
1953  * Do buffer readahead.
1954  * Initiate async I/O to read buffers not in cache.
1955  */
1956 int
1957 nfs_buf_readahead(nfsnode_t np, int ioflag, daddr64_t *rabnp, daddr64_t lastrabn, thread_t thd, kauth_cred_t cred)
1958 {
1959         struct nfsmount *nmp = NFSTONMP(np);
1960         struct nfsbuf *bp;
1961         int error = 0;
1962         uint32_t nra;
1963
1964         if (nfs_mount_gone(nmp)) {
1965                 return ENXIO;
1966         }
1967         if (nmp->nm_readahead <= 0) {
1968                 return 0;
1969         }
1970         if (*rabnp > lastrabn) {
1971                 return 0;
1972         }
1973
1974         for (nra = 0; (nra < nmp->nm_readahead) && (*rabnp <= lastrabn); nra++, *rabnp = *rabnp + 1) {
1975                 /* check if block exists and is valid. */
1976                 if ((*rabnp * nmp->nm_biosize) >= (off_t)np->n_size) {
1977                         /* stop reading ahead if we're beyond EOF */
1978                         *rabnp = lastrabn;
1979                         break;
1980                 }
1981                 error = nfs_buf_get(np, *rabnp, nmp->nm_biosize, thd, NBLK_READ | NBLK_NOWAIT, &bp);
1982                 if (error) {
1983                         break;
1984                 }
1985                 nfs_node_lock_force(np);
1986                 np->n_lastrahead = *rabnp;
1987                 nfs_node_unlock(np);
1988                 if (!bp) {
1989                         continue;
1990                 }
1991                 if ((ioflag & IO_NOCACHE) && ISSET(bp->nb_flags, NB_CACHE) &&
1992                     !bp->nb_dirty && !ISSET(bp->nb_flags, (NB_DELWRI | NB_NCRDAHEAD))) {
1993                         CLR(bp->nb_flags, NB_CACHE);
1994                         bp->nb_valid = 0;
1995                         bp->nb_validoff = bp->nb_validend = -1;
1996                 }
1997                 if ((bp->nb_dirtyend <= 0) && !bp->nb_dirty &&
1998                     !ISSET(bp->nb_flags, (NB_CACHE | NB_DELWRI))) {
1999                         SET(bp->nb_flags, (NB_READ | NB_ASYNC));
2000                         if (ioflag & IO_NOCACHE) {
2001                                 SET(bp->nb_flags, NB_NCRDAHEAD);
2002                         }
2003                         if (!IS_VALID_CRED(bp->nb_rcred) && IS_VALID_CRED(cred)) {
2004                                 kauth_cred_ref(cred);
2005                                 bp->nb_rcred = cred;
2006                         }
2007                         if ((error = nfs_buf_read(bp))) {
2008                                 break;
2009                         }
2010                         continue;
2011                 }
2012                 nfs_buf_release(bp, 1);
2013         }
2014         return error;
2015 }
2016
2017 /*
2018  * NFS buffer I/O for reading files.
2019  */
2020 int
2021 nfs_bioread(nfsnode_t np, uio_t uio, int ioflag, vfs_context_t ctx)
2022 {
2023         vnode_t vp = NFSTOV(np);
2024         struct nfsbuf *bp = NULL;
2025         struct nfsmount *nmp = VTONMP(vp);
2026         daddr64_t lbn, rabn = 0, lastrabn, maxrabn = -1;
2027         off_t diff;
2028         int error = 0, n = 0, on = 0;
2029         int nfsvers, biosize, modified, readaheads = 0;
2030         thread_t thd;
2031         kauth_cred_t cred;
2032         int64_t io_resid;
2033
2034         FSDBG_TOP(514, np, uio_offset(uio), uio_resid(uio), ioflag);
2035
2036         nfsvers = nmp->nm_vers;
2037         biosize = nmp->nm_biosize;
2038         thd = vfs_context_thread(ctx);
2039         cred = vfs_context_ucred(ctx);
2040
2041         if (vnode_vtype(vp) != VREG) {
2042                 printf("nfs_bioread: type %x unexpected\n", vnode_vtype(vp));
2043                 FSDBG_BOT(514, np, 0xd1e0016, 0, EINVAL);
2044                 return EINVAL;
2045         }
2046
2047         /*
2048          * For NFS, cache consistency can only be maintained approximately.
2049          * Although RFC1094 does not specify the criteria, the following is
2050          * believed to be compatible with the reference port.
2051          *
2052          * If the file has changed since the last read RPC or you have
2053          * written to the file, you may have lost data cache consistency
2054          * with the server.  So, check for a change, and flush all of the
2055          * file's data out of the cache.
2056          * NB: This implies that cache data can be read when up to
2057          * NFS_MAXATTRTIMO seconds out of date. If you find that you
2058          * need current attributes, nfs_getattr() can be forced to fetch
2059          * new attributes (via NATTRINVALIDATE() or NGA_UNCACHED).
2060          */
2061
2062         if (ISSET(np->n_flag, NUPDATESIZE)) {
2063                 nfs_data_update_size(np, 0);
2064         }
2065
2066         if ((error = nfs_node_lock(np))) {
2067                 FSDBG_BOT(514, np, 0xd1e0222, 0, error);
2068                 return error;
2069         }
2070
2071         if (np->n_flag & NNEEDINVALIDATE) {
2072                 np->n_flag &= ~NNEEDINVALIDATE;
2073                 nfs_node_unlock(np);
2074                 error = nfs_vinvalbuf(vp, V_SAVE | V_IGNORE_WRITEERR, ctx, 1);
2075                 if (!error) {
2076                         error = nfs_node_lock(np);
2077                 }
2078                 if (error) {
2079                         FSDBG_BOT(514, np, 0xd1e0322, 0, error);
2080                         return error;
2081                 }
2082         }
2083
2084         modified = (np->n_flag & NMODIFIED);
2085         nfs_node_unlock(np);
2086         /* nfs_getattr() will check changed and purge caches */
2087         error = nfs_getattr(np, NULL, ctx, modified ? NGA_UNCACHED : NGA_CACHED);
2088         if (error) {
2089                 FSDBG_BOT(514, np, 0xd1e0004, 0, error);
2090                 return error;
2091         }
2092
2093         if (uio_resid(uio) == 0) {
2094                 FSDBG_BOT(514, np, 0xd1e0001, 0, 0);
2095                 return 0;
2096         }
2097         if (uio_offset(uio) < 0) {
2098                 FSDBG_BOT(514, np, 0xd1e0002, 0, EINVAL);
2099                 return EINVAL;
2100         }
2101
2102         /*
2103          * set up readahead - which may be limited by:
2104          * + current request length (for IO_NOCACHE)
2105          * + readahead setting
2106          * + file size
2107          */
2108         if (nmp->nm_readahead > 0) {
2109                 off_t end = uio_offset(uio) + uio_resid(uio);
2110                 if (end > (off_t)np->n_size) {
2111                         end = np->n_size;
2112                 }
2113                 rabn = uio_offset(uio) / biosize;
2114                 maxrabn = (end - 1) / biosize;
2115                 nfs_node_lock_force(np);
2116                 if (!(ioflag & IO_NOCACHE) &&
2117                     (!rabn || (rabn == np->n_lastread) || (rabn == (np->n_lastread + 1)))) {
2118                         maxrabn += nmp->nm_readahead;
2119                         if ((maxrabn * biosize) >= (off_t)np->n_size) {
2120                                 maxrabn = ((off_t)np->n_size - 1) / biosize;
2121                         }
2122                 }
2123                 if (maxrabn < np->n_lastrahead) {
2124                         np->n_lastrahead = -1;
2125                 }
2126                 if (rabn < np->n_lastrahead) {
2127                         rabn = np->n_lastrahead + 1;
2128                 }
2129                 nfs_node_unlock(np);
2130         } else {
2131                 rabn = maxrabn = 0;
2132         }
2133
2134         do {
2135                 nfs_data_lock(np, NFS_DATA_LOCK_SHARED);
2136                 lbn = uio_offset(uio) / biosize;
2137
2138                 /*
2139                  * Copy directly from any cached pages without grabbing the bufs.
2140                  * (If we are NOCACHE and we've issued readahead requests, we need
2141                  * to grab the NB_NCRDAHEAD bufs to drop them.)
2142                  */
2143                 if ((!(ioflag & IO_NOCACHE) || !readaheads) &&
2144                     ((uio->uio_segflg == UIO_USERSPACE32 ||
2145                     uio->uio_segflg == UIO_USERSPACE64 ||
2146                     uio->uio_segflg == UIO_USERSPACE))) {
2147                         io_resid = uio_resid(uio);
2148                         diff = np->n_size - uio_offset(uio);
2149                         if (diff < io_resid) {
2150                                 io_resid = diff;
2151                         }
2152                         if (io_resid > 0) {
2153                                 int count = (io_resid > INT_MAX) ? INT_MAX : io_resid;
2154                                 error = cluster_copy_ubc_data(vp, uio, &count, 0);
2155                                 if (error) {
2156                                         nfs_data_unlock(np);
2157                                         FSDBG_BOT(514, np, uio_offset(uio), 0xcacefeed, error);
2158                                         return error;
2159                                 }
2160                         }
2161                         /* count any biocache reads that we just copied directly */
2162                         if (lbn != (uio_offset(uio) / biosize)) {
2163                                 OSAddAtomic64((uio_offset(uio) / biosize) - lbn, &nfsstats.biocache_reads);
2164                                 FSDBG(514, np, 0xcacefeed, uio_offset(uio), error);
2165                         }
2166                 }
2167
2168                 lbn = uio_offset(uio) / biosize;
2169                 on = uio_offset(uio) % biosize;
2170                 nfs_node_lock_force(np);
2171                 np->n_lastread = (uio_offset(uio) - 1) / biosize;
2172                 nfs_node_unlock(np);
2173
2174                 if ((uio_resid(uio) <= 0) || (uio_offset(uio) >= (off_t)np->n_size)) {
2175                         nfs_data_unlock(np);
2176                         FSDBG_BOT(514, np, uio_offset(uio), uio_resid(uio), 0xaaaaaaaa);
2177                         return 0;
2178                 }
2179
2180                 /* adjust readahead block number, if necessary */
2181                 if (rabn < lbn) {
2182                         rabn = lbn;
2183                 }
2184                 lastrabn = MIN(maxrabn, lbn + nmp->nm_readahead);
2185                 if (rabn <= lastrabn) { /* start readaheads */
2186                         error = nfs_buf_readahead(np, ioflag, &rabn, lastrabn, thd, cred);
2187                         if (error) {
2188                                 nfs_data_unlock(np);
2189                                 FSDBG_BOT(514, np, 0xd1e000b, 1, error);
2190                                 return error;
2191                         }
2192                         readaheads = 1;
2193                 }
2194
2195                 OSAddAtomic64(1, &nfsstats.biocache_reads);
2196
2197                 /*
2198                  * If the block is in the cache and has the required data
2199                  * in a valid region, just copy it out.
2200                  * Otherwise, get the block and write back/read in,
2201                  * as required.
2202                  */
2203 again:
2204                 io_resid = uio_resid(uio);
2205                 n = (io_resid > (biosize - on)) ? (biosize - on) : io_resid;
2206                 diff = np->n_size - uio_offset(uio);
2207                 if (diff < n) {
2208                         n = diff;
2209                 }
2210
2211                 error = nfs_buf_get(np, lbn, biosize, thd, NBLK_READ, &bp);
2212                 if (error) {
2213                         nfs_data_unlock(np);
2214                         FSDBG_BOT(514, np, 0xd1e000c, 0, error);
2215                         return error;
2216                 }
2217
2218                 if ((ioflag & IO_NOCACHE) && ISSET(bp->nb_flags, NB_CACHE)) {
2219                         /*
2220                          * IO_NOCACHE found a cached buffer.
2221                          * Flush the buffer if it's dirty.
2222                          * Invalidate the data if it wasn't just read
2223                          * in as part of a "nocache readahead".
2224                          */
2225                         if (bp->nb_dirty || (bp->nb_dirtyend > 0)) {
2226                                 /* so write the buffer out and try again */
2227                                 SET(bp->nb_flags, NB_NOCACHE);
2228                                 goto flushbuffer;
2229                         }
2230                         if (ISSET(bp->nb_flags, NB_NCRDAHEAD)) {
2231                                 CLR(bp->nb_flags, NB_NCRDAHEAD);
2232                                 SET(bp->nb_flags, NB_NOCACHE);
2233                         }
2234                 }
2235
2236                 /* if any pages are valid... */
2237                 if (bp->nb_valid) {
2238                         /* ...check for any invalid pages in the read range */
2239                         int pg, firstpg, lastpg, dirtypg;
2240                         dirtypg = firstpg = lastpg = -1;
2241                         pg = on / PAGE_SIZE;
2242                         while (pg <= (on + n - 1) / PAGE_SIZE) {
2243                                 if (!NBPGVALID(bp, pg)) {
2244                                         if (firstpg < 0) {
2245                                                 firstpg = pg;
2246                                         }
2247                                         lastpg = pg;
2248                                 } else if (firstpg >= 0 && dirtypg < 0 && NBPGDIRTY(bp, pg)) {
2249                                         dirtypg = pg;
2250                                 }
2251                                 pg++;
2252                         }
2253
2254                         /* if there are no invalid pages, we're all set */
2255                         if (firstpg < 0) {
2256                                 if (bp->nb_validoff < 0) {
2257                                         /* valid range isn't set up, so */
2258                                         /* set it to what we know is valid */
2259                                         bp->nb_validoff = trunc_page(on);
2260                                         bp->nb_validend = round_page(on + n);
2261                                         nfs_buf_normalize_valid_range(np, bp);
2262                                 }
2263                                 goto buffer_ready;
2264                         }
2265
2266                         /* there are invalid pages in the read range */
2267                         if (((dirtypg > firstpg) && (dirtypg < lastpg)) ||
2268                             (((firstpg * PAGE_SIZE) < bp->nb_dirtyend) && (((lastpg + 1) * PAGE_SIZE) > bp->nb_dirtyoff))) {
2269                                 /* there are also dirty page(s) (or range) in the read range, */
2270                                 /* so write the buffer out and try again */
2271 flushbuffer:
2272                                 CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL));
2273                                 SET(bp->nb_flags, NB_ASYNC);
2274                                 if (!IS_VALID_CRED(bp->nb_wcred)) {
2275                                         kauth_cred_ref(cred);
2276                                         bp->nb_wcred = cred;
2277                                 }
2278                                 error = nfs_buf_write(bp);
2279                                 if (error) {
2280                                         nfs_data_unlock(np);
2281                                         FSDBG_BOT(514, np, 0xd1e000d, 0, error);
2282                                         return error;
2283                                 }
2284                                 goto again;
2285                         }
2286                         if (!bp->nb_dirty && bp->nb_dirtyend <= 0 &&
2287                             (lastpg - firstpg + 1) > (biosize / PAGE_SIZE) / 2) {
2288                                 /* we need to read in more than half the buffer and the */
2289                                 /* buffer's not dirty, so just fetch the whole buffer */
2290                                 bp->nb_valid = 0;
2291                         } else {
2292                                 /* read the page range in */
2293                                 uio_t auio;
2294                                 char uio_buf[UIO_SIZEOF(1)];
2295
2296                                 NFS_BUF_MAP(bp);
2297                                 auio = uio_createwithbuffer(1, (NBOFF(bp) + firstpg * PAGE_SIZE_64),
2298                                     UIO_SYSSPACE, UIO_READ, &uio_buf[0], sizeof(uio_buf));
2299                                 if (!auio) {
2300                                         error = ENOMEM;
2301                                 } else {
2302                                         uio_addiov(auio, CAST_USER_ADDR_T(bp->nb_data + (firstpg * PAGE_SIZE)),
2303                                             ((lastpg - firstpg + 1) * PAGE_SIZE));
2304                                         error = nfs_read_rpc(np, auio, ctx);
2305                                 }
2306                                 if (error) {
2307                                         if (ioflag & IO_NOCACHE) {
2308                                                 SET(bp->nb_flags, NB_NOCACHE);
2309                                         }
2310                                         nfs_buf_release(bp, 1);
2311                                         nfs_data_unlock(np);
2312                                         FSDBG_BOT(514, np, 0xd1e000e, 0, error);
2313                                         return error;
2314                                 }
2315                                 /* Make sure that the valid range is set to cover this read. */
2316                                 bp->nb_validoff = trunc_page_32(on);
2317                                 bp->nb_validend = round_page_32(on + n);
2318                                 nfs_buf_normalize_valid_range(np, bp);
2319                                 if (uio_resid(auio) > 0) {
2320                                         /* if short read, must have hit EOF, */
2321                                         /* so zero the rest of the range */
2322                                         bzero(CAST_DOWN(caddr_t, uio_curriovbase(auio)), uio_resid(auio));
2323                                 }
2324                                 /* mark the pages (successfully read) as valid */
2325                                 for (pg = firstpg; pg <= lastpg; pg++) {
2326                                         NBPGVALID_SET(bp, pg);
2327                                 }
2328                         }
2329                 }
2330                 /* if no pages are valid, read the whole block */
2331                 if (!bp->nb_valid) {
2332                         if (!IS_VALID_CRED(bp->nb_rcred) && IS_VALID_CRED(cred)) {
2333                                 kauth_cred_ref(cred);
2334                                 bp->nb_rcred = cred;
2335                         }
2336                         SET(bp->nb_flags, NB_READ);
2337                         CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL));
2338                         error = nfs_buf_read(bp);
2339                         if (ioflag & IO_NOCACHE) {
2340                                 SET(bp->nb_flags, NB_NOCACHE);
2341                         }
2342                         if (error) {
2343                                 nfs_data_unlock(np);
2344                                 nfs_buf_release(bp, 1);
2345                                 FSDBG_BOT(514, np, 0xd1e000f, 0, error);
2346                                 return error;
2347                         }
2348                 }
2349 buffer_ready:
2350                 /* validate read range against valid range and clip */
2351                 if (bp->nb_validend > 0) {
2352                         diff = (on >= bp->nb_validend) ? 0 : (bp->nb_validend - on);
2353                         if (diff < n) {
2354                                 n = diff;
2355                         }
2356                 }
2357                 if (n > 0) {
2358                         NFS_BUF_MAP(bp);
2359                         error = uiomove(bp->nb_data + on, n, uio);
2360                 }
2361
2362
2363                 nfs_buf_release(bp, 1);
2364                 nfs_data_unlock(np);
2365                 nfs_node_lock_force(np);
2366                 np->n_lastread = (uio_offset(uio) - 1) / biosize;
2367                 nfs_node_unlock(np);
2368         } while (error == 0 && uio_resid(uio) > 0 && n > 0);
2369         FSDBG_BOT(514, np, uio_offset(uio), uio_resid(uio), error);
2370         return error;
2371 }
2372
2373 /*
2374  * limit the number of outstanding async I/O writes
2375  */
2376 int
2377 nfs_async_write_start(struct nfsmount *nmp)
2378 {
2379         int error = 0, slpflag = NMFLAG(nmp, INTR) ? PCATCH : 0;
2380         struct timespec ts = { .tv_sec = 1, .tv_nsec = 0 };
2381
2382         if (nfs_max_async_writes <= 0) {
2383                 return 0;
2384         }
2385         lck_mtx_lock(&nmp->nm_lock);
2386         while ((nfs_max_async_writes > 0) && (nmp->nm_asyncwrites >= nfs_max_async_writes)) {
2387                 if ((error = nfs_sigintr(nmp, NULL, current_thread(), 1))) {
2388                         break;
2389                 }
2390                 msleep(&nmp->nm_asyncwrites, &nmp->nm_lock, slpflag | (PZERO - 1), "nfsasyncwrites", &ts);
2391                 slpflag = 0;
2392         }
2393         if (!error) {
2394                 nmp->nm_asyncwrites++;
2395         }
2396         lck_mtx_unlock(&nmp->nm_lock);
2397         return error;
2398 }
2399 void
2400 nfs_async_write_done(struct nfsmount *nmp)
2401 {
2402         if (nmp->nm_asyncwrites <= 0) {
2403                 return;
2404         }
2405         lck_mtx_lock(&nmp->nm_lock);
2406         if (nmp->nm_asyncwrites-- >= nfs_max_async_writes) {
2407                 wakeup(&nmp->nm_asyncwrites);
2408         }
2409         lck_mtx_unlock(&nmp->nm_lock);
2410 }
2411
2412 /*
2413  * write (or commit) the given NFS buffer
2414  *
2415  * Commit the buffer if we can.
2416  * Write out any dirty range.
2417  * If any dirty pages remain, write them out.
2418  * Mark buffer done.
2419  *
2420  * For async requests, all the work beyond sending the initial
2421  * write RPC is handled in the RPC callback(s).
      *
      * The buffer must already be busy (NBL_BUSY set in nb_lflags);
      * otherwise this function panics.
      *
      * Returns: for a synchronous write (NB_ASYNC clear on entry) the
      * result of nfs_buf_iowait(), which replaces any earlier error value;
      * the buffer is released before returning.  For an async write, the
      * error from whichever step failed here (UPL setup, commit RPC,
      * nfs_buf_write_rpc() or nfs_buf_write_dirty_pages()), else 0.
2422  */
2423 int
2424 nfs_buf_write(struct nfsbuf *bp)
2425 {
2426         int error = 0, oldflags, async;
2427         nfsnode_t np;
2428         thread_t thd;
2429         kauth_cred_t cred;
2430         proc_t p = current_proc();
2431         int iomode, doff, dend, firstpg, lastpg;
2432         uint32_t pagemask;
2433
2434         FSDBG_TOP(553, bp, NBOFF(bp), bp->nb_flags, 0);
2435
2436         if (!ISSET(bp->nb_lflags, NBL_BUSY)) {
2437                 panic("nfs_buf_write: buffer is not busy???");
2438         }
2439
             /* snapshot the async-ness and the flags before state bits are cleared */
2440         np = bp->nb_np;
2441         async = ISSET(bp->nb_flags, NB_ASYNC);
2442         oldflags = bp->nb_flags;
2443
2444         CLR(bp->nb_flags, (NB_READ | NB_DONE | NB_ERROR | NB_DELWRI));
             /* it was a delayed write: drop the global count and wake waiters on it */
2445         if (ISSET(oldflags, NB_DELWRI)) {
2446                 lck_mtx_lock(nfs_buf_mutex);
2447                 nfs_nbdwrite--;
2448                 NFSBUFCNTCHK();
2449                 lck_mtx_unlock(nfs_buf_mutex);
2450                 wakeup(&nfs_nbdwrite);
2451         }
2452
2453         /* move to clean list */
2454         if (ISSET(oldflags, (NB_ASYNC | NB_DELWRI))) {
2455                 lck_mtx_lock(nfs_buf_mutex);
2456                 if (bp->nb_vnbufs.le_next != NFSNOLIST) {
2457                         LIST_REMOVE(bp, nb_vnbufs);
2458                 }
2459                 LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
2460                 lck_mtx_unlock(nfs_buf_mutex);
2461         }
             /* count this write as output in progress on the node and its vnode */
2462         nfs_node_lock_force(np);
2463         np->n_numoutput++;
2464         nfs_node_unlock(np);
2465         vnode_startwrite(NFSTOV(np));
2466
2467         if (p && p->p_stats) {
2468                 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_oublock);
2469         }
2470
2471         cred = bp->nb_wcred;
             /*
              * NOTE(review): NB_READ was cleared from nb_flags by the CLR() above,
              * so this fallback to nb_rcred looks unreachable as written -- confirm
              * whether testing oldflags was intended.
              */
2472         if (!IS_VALID_CRED(cred) && ISSET(bp->nb_flags, NB_READ)) {
2473                 cred = bp->nb_rcred;  /* shouldn't really happen, but... */
2474         }
             /* hold our own reference on the credential; dropped just before returning */
2475         if (IS_VALID_CRED(cred)) {
2476                 kauth_cred_ref(cred);
2477         }
             /* async writes are not tied to the calling thread */
2478         thd = async ? NULL : current_thread();
2479
2480         /* We need to make sure the pages are locked before doing I/O.  */
2481         if (!ISSET(bp->nb_flags, NB_META)) {
2482                 if (UBCINFOEXISTS(NFSTOV(np))) {
2483                         if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
2484                                 error = nfs_buf_upl_setup(bp);
2485                                 if (error) {
                                             /* whatever the UPL error was, the buffer fails with EIO */
2486                                         printf("nfs_buf_write: upl create failed %d\n", error);
2487                                         SET(bp->nb_flags, NB_ERROR);
2488                                         bp->nb_error = error = EIO;
2489                                         nfs_buf_iodone(bp);
2490                                         goto out;
2491                                 }
2492                                 nfs_buf_upl_check(bp);
2493                         }
2494                 } else {
2495                         /* We should never be in nfs_buf_write() with no UBCINFO. */
2496                         printf("nfs_buf_write: ubcinfo already gone\n");
2497                         SET(bp->nb_flags, NB_ERROR);
2498                         bp->nb_error = error = EIO;
2499                         nfs_buf_iodone(bp);
2500                         goto out;
2501                 }
2502         }
2503
2504         /* If NB_NEEDCOMMIT is set, a commit RPC may do the trick. */
2505         if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
2506                 nfs_buf_check_write_verifier(np, bp);
2507         }
             /* re-test: the verifier check above is given the chance to change the flag */
2508         if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
2509                 struct nfsmount *nmp = NFSTONMP(np);
2510                 if (nfs_mount_gone(nmp)) {
2511                         SET(bp->nb_flags, NB_ERROR);
2512                         bp->nb_error = error = EIO;
2513                         nfs_buf_iodone(bp);
2514                         goto out;
2515                 }
                     /* commit just the buffer's dirty range, using the write credential */
2516                 SET(bp->nb_flags, NB_WRITEINPROG);
2517                 error = nmp->nm_funcs->nf_commit_rpc(np, NBOFF(bp) + bp->nb_dirtyoff,
2518                     bp->nb_dirtyend - bp->nb_dirtyoff, bp->nb_wcred, bp->nb_verf);
2519                 CLR(bp->nb_flags, NB_WRITEINPROG);
2520                 if (error) {
                             /*
                              * A stale write verifier is not marked as an error on the
                              * buffer; the nonzero error is still carried to "out".
                              */
2521                         if (error != NFSERR_STALEWRITEVERF) {
2522                                 SET(bp->nb_flags, NB_ERROR);
2523                                 bp->nb_error = error;
2524                         }
2525                         nfs_buf_iodone(bp);
2526                         goto out;
2527                 }
                     /* commit succeeded: clear the dirty range and the needcommit accounting */
2528                 bp->nb_dirtyoff = bp->nb_dirtyend = 0;
2529                 CLR(bp->nb_flags, NB_NEEDCOMMIT);
2530                 nfs_node_lock_force(np);
2531                 np->n_needcommitcnt--;
2532                 CHECK_NEEDCOMMITCNT(np);
2533                 nfs_node_unlock(np);
2534         }
2535         if (!error && (bp->nb_dirtyend > 0)) {
2536                 /* sanity check the dirty range */
2537                 if (NBOFF(bp) + bp->nb_dirtyend > (off_t) np->n_size) {
2538                         bp->nb_dirtyend = np->n_size - NBOFF(bp);
2539                         if (bp->nb_dirtyoff >= bp->nb_dirtyend) {
2540                                 bp->nb_dirtyoff = bp->nb_dirtyend = 0;
2541                         }
2542                 }
2543         }
2544         if (!error && (bp->nb_dirtyend > 0)) {
2545                 /* there's a dirty range that needs to be written out */
2546                 NFS_BUF_MAP(bp);
2547
2548                 doff = bp->nb_dirtyoff;
2549                 dend = bp->nb_dirtyend;
2550
2551                 /* if doff page is dirty, move doff to start of page */
2552                 if (NBPGDIRTY(bp, doff / PAGE_SIZE)) {
2553                         doff -= doff & PAGE_MASK;
2554                 }
2555                 /* try to expand write range to include preceding dirty pages */
2556                 if (!(doff & PAGE_MASK)) {
2557                         while ((doff > 0) && NBPGDIRTY(bp, (doff - 1) / PAGE_SIZE)) {
2558                                 doff -= PAGE_SIZE;
2559                         }
2560                 }
2561                 /* if dend page is dirty, move dend to start of next page */
2562                 if ((dend & PAGE_MASK) && NBPGDIRTY(bp, dend / PAGE_SIZE)) {
2563                         dend = round_page_32(dend);
2564                 }
2565                 /* try to expand write range to include trailing dirty pages */
2566                 if (!(dend & PAGE_MASK)) {
2567                         while ((dend < (int)bp->nb_bufsize) && NBPGDIRTY(bp, dend / PAGE_SIZE)) {
2568                                 dend += PAGE_SIZE;
2569                         }
2570                 }
2571                 /* make sure to keep dend clipped to EOF */
2572                 if ((NBOFF(bp) + dend) > (off_t) np->n_size) {
2573                         dend = np->n_size - NBOFF(bp);
2574                 }
2575                 /* calculate range of complete pages being written */
2576                 firstpg = round_page_32(doff) / PAGE_SIZE;
2577                 lastpg = (trunc_page_32(dend) - 1) / PAGE_SIZE;
2578                 /* calculate mask for that page range */
                     /*
                      * NOTE(review): the shifts below are done on a plain int 1, and when
                      * dend is smaller than one page trunc_page_32(dend) is 0, so the
                      * value of lastpg depends on that macro's result type -- verify
                      * that the shift counts always stay in range.
                      */
2579                 pagemask = ((1 << (lastpg + 1)) - 1) & ~((1 << firstpg) - 1);
2580
2581                 /*
2582                  * compare page mask to nb_dirty; if there are other dirty pages
2583                  * then write FILESYNC; otherwise, write UNSTABLE if async and
2584                  * not needcommit/stable; otherwise write FILESYNC
2585                  */
2586                 if (bp->nb_dirty & ~pagemask) {
2587                         iomode = NFS_WRITE_FILESYNC;
2588                 } else if ((bp->nb_flags & (NB_ASYNC | NB_NEEDCOMMIT | NB_STABLE)) == NB_ASYNC) {
2589                         iomode = NFS_WRITE_UNSTABLE;
2590                 } else {
2591                         iomode = NFS_WRITE_FILESYNC;
2592                 }
2593
2594                 /* write the whole contiguous dirty range */
2595                 bp->nb_offio = doff;
2596                 bp->nb_endio = dend;
2597
2598                 OSAddAtomic64(1, &nfsstats.write_bios);
2599
2600                 SET(bp->nb_flags, NB_WRITEINPROG);
2601                 error = nfs_buf_write_rpc(bp, iomode, thd, cred);
2602                 /*
2603                  * For async I/O, the callbacks will finish up the
2604                  * write and push out any dirty pages.  Otherwise,
2605                  * the write has already been finished and any dirty
2606                  * pages pushed out.
2607                  */
2608         } else {
                     /* no dirty byte range (or an earlier error): only dirty pages, if any, remain */
2609                 if (!error && bp->nb_dirty) { /* write out any dirty pages */
2610                         error = nfs_buf_write_dirty_pages(bp, thd, cred);
2611                 }
2612                 nfs_buf_iodone(bp);
2613         }
2614         /* note: bp is still valid only for !async case */
2615 out:
2616         if (!async) {
                     /* synchronous: wait for completion; this result replaces any earlier error */
2617                 error = nfs_buf_iowait(bp);
2618                 /* move to clean list */
2619                 if (oldflags & NB_DELWRI) {
2620                         lck_mtx_lock(nfs_buf_mutex);
2621                         if (bp->nb_vnbufs.le_next != NFSNOLIST) {
2622                                 LIST_REMOVE(bp, nb_vnbufs);
2623                         }
2624                         LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
2625                         lck_mtx_unlock(nfs_buf_mutex);
2626                 }
2627                 FSDBG_BOT(553, bp, NBOFF(bp), bp->nb_flags, error);
2628                 nfs_buf_release(bp, 1);
2629                 /* check if we need to invalidate (and we can) */
                     /* first test is unlocked; the flag is re-tested under the node lock below */
2630                 if ((np->n_flag & NNEEDINVALIDATE) &&
2631                     !(np->n_bflag & (NBINVALINPROG | NBFLUSHINPROG))) {
2632                         int invalidate = 0;
2633                         nfs_node_lock_force(np);
2634                         if (np->n_flag & NNEEDINVALIDATE) {
2635                                 invalidate = 1;
2636                                 np->n_flag &= ~NNEEDINVALIDATE;
2637                         }
2638                         nfs_node_unlock(np);
2639                         if (invalidate) {
2640                                 /*
2641                                  * There was a write error and we need to
2642                                  * invalidate attrs and flush buffers in
2643                                  * order to sync up with the server.
2644                                  * (if this write was extending the file,
2645                                  * we may no longer know the correct size)
2646                                  *
2647                                  * But we couldn't call vinvalbuf while holding
2648                                  * the buffer busy.  So we call vinvalbuf() after
2649                                  * releasing the buffer.
2650                                  */
2651                                 nfs_vinvalbuf2(NFSTOV(np), V_SAVE | V_IGNORE_WRITEERR, thd, cred, 1);
2652                         }
2653                 }
2654         }
2655
             /* drop the credential reference taken near the top of this function */
2656         if (IS_VALID_CRED(cred)) {
2657                 kauth_cred_unref(&cred);
2658         }
2659         return error;
2660 }
2661
2662 /*
2663  * finish the writing of a buffer
      *
      * Using the outcome recorded in nb_flags/nb_error, this clears the
      * dirty bits of the complete pages covered by [nb_offio, nb_endio),
      * updates the NB_NEEDCOMMIT and delayed-write (NB_DELWRI) state,
      * records a write error on the node, writes out any pages that are
      * still marked dirty, and finally calls nfs_buf_iodone().
      *
      * thd and cred are only passed on to nfs_buf_write_dirty_pages().
2664  */
2665 void
2666 nfs_buf_write_finish(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred)
2667 {
2668         nfsnode_t np = bp->nb_np;
2669         int error = (bp->nb_flags & NB_ERROR) ? bp->nb_error : 0;
2670         int firstpg, lastpg;
2671         uint32_t pagemask;
2672
             /*
              * An interrupted write is flagged NB_EINTR instead of NB_ERROR;
              * the local "error" keeps the EINTR/ERESTART value for the
              * decisions below.
              */
2673         if ((error == EINTR) || (error == ERESTART)) {
2674                 CLR(bp->nb_flags, NB_ERROR);
2675                 SET(bp->nb_flags, NB_EINTR);
2676         }
2677
2678         if (!error) {
2679                 /* calculate range of complete pages being written */
2680                 firstpg = round_page_32(bp->nb_offio) / PAGE_SIZE;
2681                 lastpg = (trunc_page_32(bp->nb_endio) - 1) / PAGE_SIZE;
2682                 /* calculate mask for that page range written */
2683                 pagemask = ((1 << (lastpg + 1)) - 1) & ~((1 << firstpg) - 1);
2684                 /* clear dirty bits for pages we've written */
2685                 bp->nb_dirty &= ~pagemask;
2686         }
2687
2688         /* manage needcommit state */
2689         if (!error && (bp->nb_commitlevel == NFS_WRITE_UNSTABLE)) {
                     /* an unstable write succeeded: the buffer needs a commit (counted once per buffer) */
2690                 if (!ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
2691                         nfs_node_lock_force(np);
2692                         np->n_needcommitcnt++;
2693                         nfs_node_unlock(np);
2694                         SET(bp->nb_flags, NB_NEEDCOMMIT);
2695                 }
2696                 /* make sure nb_dirtyoff/nb_dirtyend reflect actual range written */
2697                 bp->nb_dirtyoff = bp->nb_offio;
2698                 bp->nb_dirtyend = bp->nb_endio;
2699         } else if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
                     /* error, or the write was not unstable: drop the pending commit requirement */
2700                 nfs_node_lock_force(np);
2701                 np->n_needcommitcnt--;
2702                 CHECK_NEEDCOMMITCNT(np);
2703                 nfs_node_unlock(np);
2704                 CLR(bp->nb_flags, NB_NEEDCOMMIT);
2705         }
2706
2707         CLR(bp->nb_flags, NB_WRITEINPROG);
2708
2709         /*
2710          * For an unstable write, the buffer is still treated as dirty until
2711          * a commit (or stable (re)write) is performed.  Buffers needing only
2712          * a commit are marked with the NB_DELWRI and NB_NEEDCOMMIT flags.
2713          *
2714          * If the write was interrupted we set NB_EINTR.  Don't set NB_ERROR
2715          * because that would cause the buffer to be dropped.  The buffer is
2716          * still valid and simply needs to be written again.
2717          */
2718         if ((error == EINTR) || (error == ERESTART) || (!error && (bp->nb_flags & NB_NEEDCOMMIT))) {
2719                 CLR(bp->nb_flags, NB_INVAL);
                     /* (re)mark as a delayed write and keep the global count in step */
2720                 if (!ISSET(bp->nb_flags, NB_DELWRI)) {
2721                         SET(bp->nb_flags, NB_DELWRI);
2722                         lck_mtx_lock(nfs_buf_mutex);
2723                         nfs_nbdwrite++;
2724                         NFSBUFCNTCHK();
2725                         lck_mtx_unlock(nfs_buf_mutex);
2726                 }
2727                 /*
2728                  * Since for the NB_ASYNC case, we've reassigned the buffer to the
2729                  * clean list, we have to reassign it back to the dirty one. Ugh.
2730                  */
2731                 if (ISSET(bp->nb_flags, NB_ASYNC)) {
2732                         /* move to dirty list */
2733                         lck_mtx_lock(nfs_buf_mutex);
2734                         if (bp->nb_vnbufs.le_next != NFSNOLIST) {
2735                                 LIST_REMOVE(bp, nb_vnbufs);
2736                         }
2737                         LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
2738                         lck_mtx_unlock(nfs_buf_mutex);
2739                 }
2740         } else {
2741                 /* either there's an error or we don't need to commit */
2742                 if (error) {
2743                         /*
2744                          * There was a write error and we need to invalidate
2745                          * attrs and flush buffers in order to sync up with the
2746                          * server.  (if this write was extending the file, we
2747                          * may no longer know the correct size)
2748                          *
2749                          * But we can't call vinvalbuf while holding this
2750                          * buffer busy.  Set a flag to do it after releasing
2751                          * the buffer.
2752                          */
2753                         nfs_node_lock_force(np);
2754                         np->n_error = error;
2755                         np->n_flag |= (NWRITEERR | NNEEDINVALIDATE);
2756                         NATTRINVALIDATE(np);
2757                         nfs_node_unlock(np);
2758                 }
2759                 /* clear the dirty range */
2760                 bp->nb_dirtyoff = bp->nb_dirtyend = 0;
2761         }
2762
             /*
              * The return value is not used here; on failure
              * nfs_buf_write_dirty_pages() records the error in the buffer
              * (NB_ERROR / nb_error) itself.
              */
2763         if (!error && bp->nb_dirty) {
2764                 nfs_buf_write_dirty_pages(bp, thd, cred);
2765         }
2766         nfs_buf_iodone(bp);
2767 }
2768
2769 /*
2770  * write out any pages marked dirty in a buffer
2771  *
2772  * We use unstable writes and follow up with a commit.
2773  * If we catch the write verifier changing, we restart
2774  * and redo all of the writes filesync.
      *
      * Each run of contiguous dirty pages is written with one call to
      * nfs_write_rpc2(), clipped to the node's n_size.  Dirty bits are
      * cleared in a local copy that is stored back into nb_dirty only
      * when everything succeeded.
      *
      * Returns 0 on success (or when no pages are dirty).  On failure the
      * error is also recorded in the buffer (NB_ERROR / nb_error).
2775  */
2776 int
2777 nfs_buf_write_dirty_pages(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred)
2778 {
2779         nfsnode_t np = bp->nb_np;
2780         struct nfsmount *nmp = NFSTONMP(np);
2781         int error = 0, commit, iomode, iomode2, len, pg, count, npages, off;
2782         uint32_t dirty = bp->nb_dirty;
2783         uint64_t wverf;
2784         uio_t auio;
2785         char uio_buf[UIO_SIZEOF(1)];
2786
2787         if (!bp->nb_dirty) {
2788                 return 0;
2789         }
2790
2791         /* there are pages marked dirty that need to be written out */
2792         OSAddAtomic64(1, &nfsstats.write_bios);
2793         NFS_BUF_MAP(bp);
2794         SET(bp->nb_flags, NB_WRITEINPROG);
2795         npages = bp->nb_bufsize / PAGE_SIZE;
2796         iomode = NFS_WRITE_UNSTABLE;
2797
             /*
              * One uio, reset and reused for every run of pages below.
              * NOTE(review): the result is used without a NULL check -- confirm
              * uio_createwithbuffer() cannot fail for this buffer size.
              */
2798         auio = uio_createwithbuffer(1, 0, UIO_SYSSPACE, UIO_WRITE,
2799             &uio_buf, sizeof(uio_buf));
2800
2801 again:
             /* (re)start from the buffer's current dirty bits and write verifier */
2802         dirty = bp->nb_dirty;
2803         wverf = bp->nb_verf;
2804         commit = NFS_WRITE_FILESYNC;
2805         for (pg = 0; pg < npages; pg++) {
2806                 if (!NBPGDIRTY(bp, pg)) {
2807                         continue;
2808                 }
                     /* count how many consecutive pages starting at pg are dirty */
2809                 count = 1;
2810                 while (((pg + count) < npages) && NBPGDIRTY(bp, pg + count)) {
2811                         count++;
2812                 }
2813                 /* write count pages starting with page pg */
2814                 off = pg * PAGE_SIZE;
2815                 len = count * PAGE_SIZE;
2816                 /* clip writes to EOF */
2817                 if (NBOFF(bp) + off + len > (off_t) np->n_size) {
2818                         len -= (NBOFF(bp) + off + len) - np->n_size;
2819                 }
                     /* a run lying entirely past EOF is not written, but its bits are still cleared below */
2820                 if (len > 0) {
                             /* iomode2 goes in as the requested mode and is read back afterwards */
2821                         iomode2 = iomode;
2822                         uio_reset(auio, NBOFF(bp) + off, UIO_SYSSPACE, UIO_WRITE);
2823                         uio_addiov(auio, CAST_USER_ADDR_T(bp->nb_data + off), len);
2824                         error = nfs_write_rpc2(np, auio, thd, cred, &iomode2, &bp->nb_verf);
2825                         if (error) {
2826                                 break;
2827                         }
2828                         if (iomode2 < commit) { /* Retain the lowest commitment level returned. */
2829                                 commit = iomode2;
2830                         }
2831                         if ((commit != NFS_WRITE_FILESYNC) && (wverf != bp->nb_verf)) {
2832                                 /* verifier changed, redo all the writes filesync */
2833                                 iomode = NFS_WRITE_FILESYNC;
2834                                 goto again;
2835                         }
2836                 }
2837                 /* clear dirty bits */
2838                 while (count--) {
2839                         dirty &= ~(1 << pg);
2840                         if (count) { /* leave pg on last page */
2841                                 pg++;
2842                         }
2843                 }
2844         }
2845         CLR(bp->nb_flags, NB_WRITEINPROG);
2846
             /* something was written less than filesync: commit the whole buffer range */
2847         if (!error && (commit != NFS_WRITE_FILESYNC)) {
2848                 error = nmp->nm_funcs->nf_commit_rpc(np, NBOFF(bp), bp->nb_bufsize, cred, wverf);
2849                 if (error == NFSERR_STALEWRITEVERF) {
2850                         /* verifier changed, so we need to restart all the writes */
                             /*
                              * NOTE(review): NB_WRITEINPROG was cleared above and is not
                              * set again on this retry path -- confirm that is intended.
                              */
2851                         iomode = NFS_WRITE_FILESYNC;
2852                         goto again;
2853                 }
2854         }
2855         if (!error) {
                     /* success: publish the cleared dirty bits */
2856                 bp->nb_dirty = dirty;
2857         } else {
2858                 SET(bp->nb_flags, NB_ERROR);
2859                 bp->nb_error = error;
2860         }
2861         return error;
2862 }
2863
2864 /*
2865  * initiate the NFS WRITE RPC(s) for a buffer
      *
      * Writes the buffer's byte range [nb_offio, nb_endio), split into
      * requests of at most nm_wsize bytes each, with the given iomode.
      *
      * When the buffer is NB_ASYNC (and NFSIOD_MAX > 0) each request is
      * sent with nfs_buf_write_rpc_finish as its callback and is throttled
      * through nfs_async_write_start(); otherwise this function calls
      * nfs_buf_write_rpc_finish() itself after sending each request.
      *
      * Returns 0 or an error.  ENXIO (mount gone) and EFBIG (NFSv2 with an
      * end offset above 0xffffffff) are recorded in the buffer and the
      * buffer is marked done before returning.
2866  */
2867 int
2868 nfs_buf_write_rpc(struct nfsbuf *bp, int iomode, thread_t thd, kauth_cred_t cred)
2869 {
2870         struct nfsmount *nmp;
2871         nfsnode_t np = bp->nb_np;
2872         int error = 0, nfsvers, async;
2873         int offset, nrpcs;
2874         uint32_t nmwsize, length, len;
2875         struct nfsreq *req;
2876         struct nfsreq_cbinfo cb;
2877         uio_t auio;
2878         char uio_buf[UIO_SIZEOF(1)];
2879
2880         nmp = NFSTONMP(np);
2881         if (nfs_mount_gone(nmp)) {
2882                 bp->nb_error = error = ENXIO;
2883                 SET(bp->nb_flags, NB_ERROR);
2884                 nfs_buf_iodone(bp);
2885                 return error;
2886         }
2887         nfsvers = nmp->nm_vers;
2888         nmwsize = nmp->nm_wsize;
2889
             /* offset is relative to the start of the buffer; length is what remains to be sent */
2890         offset = bp->nb_offio;
2891         length = bp->nb_endio - bp->nb_offio;
2892
2893         /* Note: Can only do async I/O if nfsiods are configured. */
2894         async = (bp->nb_flags & NB_ASYNC) && (NFSIOD_MAX > 0);
2895         bp->nb_commitlevel = NFS_WRITE_FILESYNC;
2896         cb.rcb_func = async ? nfs_buf_write_rpc_finish : NULL;
2897         cb.rcb_bp = bp;
2898
             /* NFSv2 is refused (EFBIG) when the write would end above 0xffffffff */
2899         if ((nfsvers == NFS_VER2) && ((NBOFF(bp) + bp->nb_endio) > 0xffffffffLL)) {
2900                 bp->nb_error = error = EFBIG;
2901                 SET(bp->nb_flags, NB_ERROR);
2902                 nfs_buf_iodone(bp);
2903                 return error;
2904         }
2905
             /* NOTE(review): the uio is used without a NULL check -- confirm creation cannot fail here */
2906         auio = uio_createwithbuffer(1, NBOFF(bp) + offset, UIO_SYSSPACE,
2907             UIO_WRITE, &uio_buf, sizeof(uio_buf));
2908         uio_addiov(auio, CAST_USER_ADDR_T(bp->nb_data + offset), length);
2909
             /* one RPC per nm_wsize-sized chunk, rounded up */
2910         bp->nb_rpcs = nrpcs = (length + nmwsize - 1) / nmwsize;
2911         if (async && (nrpcs > 1)) {
2912                 SET(bp->nb_flags, NB_MULTASYNCRPC);
2913         } else {
2914                 CLR(bp->nb_flags, NB_MULTASYNCRPC);
2915         }
2916
2917         while (length > 0) {
                     /* stop sending as soon as the buffer has been flagged with an error */
2918                 if (ISSET(bp->nb_flags, NB_ERROR)) {
2919                         error = bp->nb_error;
2920                         break;
2921                 }
2922                 len = (length > nmwsize) ? nmwsize : length;
                     /* the callback info carries this chunk's buffer offset and length */
2923                 cb.rcb_args[0] = offset;
2924                 cb.rcb_args[1] = len;
2925 #if CONFIG_NFS4
2926                 if (nmp->nm_vers >= NFS_VER4) {
2927                         cb.rcb_args[2] = nmp->nm_stategenid;
2928                 }
2929 #endif
2930                 if (async && ((error = nfs_async_write_start(nmp)))) {
2931                         break;
2932                 }
2933                 req = NULL;
2934                 error = nmp->nm_funcs->nf_write_rpc_async(np, auio, len, thd, cred,
2935                     iomode, &cb, &req);
2936                 if (error) {
                             /* the request was not sent: give back the async write slot taken above */
2937                         if (async) {
2938                                 nfs_async_write_done(nmp);
2939                         }
2940                         break;
2941                 }
2942                 offset += len;
2943                 length -= len;
2944                 if (async) {
2945                         continue;
2946                 }
                     /* not async: finish this request here before sending the next one */
2947                 nfs_buf_write_rpc_finish(req);
2948         }
2949
2950         if (length > 0) {
2951                 /*
2952                  * Something bad happened while trying to send the RPCs.
2953                  * Wait for any outstanding requests to complete.
2954                  */
2955                 bp->nb_error = error;
2956                 SET(bp->nb_flags, NB_ERROR);
2957                 if (ISSET(bp->nb_flags, NB_MULTASYNCRPC)) {
                             /* nrpcs is now the number of RPCs that were never sent */
2958                         nrpcs = (length + nmwsize - 1) / nmwsize;
2959                         lck_mtx_lock(nfs_buf_mutex);
2960                         bp->nb_rpcs -= nrpcs;
2961                         if (bp->nb_rpcs == 0) {
2962                                 /* No RPCs left, so the buffer's done */
2963                                 lck_mtx_unlock(nfs_buf_mutex);
2964                                 nfs_buf_write_finish(bp, thd, cred);
2965                         } else {
2966                                 /* wait for the last RPC to mark it done */
2967                                 while (bp->nb_rpcs > 0) {
2968                                         msleep(&bp->nb_rpcs, nfs_buf_mutex, 0,
2969                                             "nfs_buf_write_rpc_cancel", NULL);
2970                                 }
2971                                 lck_mtx_unlock(nfs_buf_mutex);
2972                         }
2973                 } else {
2974                         nfs_buf_write_finish(bp, thd, cred);
2975                 }
2976                 /* It may have just been an interrupt... that's OK */
2977                 if (!ISSET(bp->nb_flags, NB_ERROR)) {
2978                         error = 0;
2979                 }
2980         }
2981
2982         return error;
2983 }
2984
2985 /*
2986  * finish up an NFS WRITE RPC on a buffer
      *
      * Completes one WRITE request that was issued for a buffer; the buffer
      * is taken from the request's callback info (req->r_callback.rcb_bp).
      * Depending on the result this either:
      *   - records an error on the buffer,
      *   - issues another WRITE for the unwritten remainder (short write, or
      *     an NFSv4 state error that allows a retry), or
      *   - queues the request for a delayed resend (NFSv4 grace error on an
      *     async request).
      * When nothing further is pending for this request, the buffer's
      * outstanding RPC count (nb_rpcs) is decremented and
      * nfs_buf_write_finish() is called once it reaches zero.
      *
      * A non-NULL cb.rcb_func is treated as "this is an async request"
      * throughout the function.
2987  */
2988 void
2989 nfs_buf_write_rpc_finish(struct nfsreq *req)
2990 {
2991         int error = 0, nfsvers, offset, length, multasyncrpc, finished;
2992         int committed = NFS_WRITE_FILESYNC;
2993         uint64_t wverf = 0;
2994         size_t rlen;
2995         void *wakeme = NULL;
2996         struct nfsreq_cbinfo cb;
2997         struct nfsreq *wreq = NULL;
2998         struct nfsbuf *bp;
2999         struct nfsmount *nmp;
3000         nfsnode_t np;
3001         thread_t thd;
3002         kauth_cred_t cred;
3003         uio_t auio;
3004         char uio_buf[UIO_SIZEOF(1)];
3005
     /*
      * Re-entered from the re-write path below when a synchronous follow-up
      * WRITE has been issued (req then points at that new request).
      */
3006 finish:
3007         np = req->r_np;
3008         thd = req->r_thread;
3009         cred = req->r_cred;
             /* hold our own cred reference; every exit path below drops it */
3010         if (IS_VALID_CRED(cred)) {
3011                 kauth_cred_ref(cred);
3012         }
             /* work on a local copy of the callback info; rcb_bp is the buffer being written */
3013         cb = req->r_callback;
3014         bp = cb.rcb_bp;
3015         if (cb.rcb_func) { /* take an extra reference on the nfsreq in case we want to resend it later due to grace error */
3016                 nfs_request_ref(req, 0);
3017         }
3018
3019         nmp = NFSTONMP(np);
3020         if (nfs_mount_gone(nmp)) {
3021                 SET(bp->nb_flags, NB_ERROR);
3022                 bp->nb_error = error = ENXIO;
3023         }
             /* the buffer may already be in error (e.g. set while finishing another RPC) */
3024         if (error || ISSET(bp->nb_flags, NB_ERROR)) {
3025                 /* just drop it */
3026                 nfs_request_async_cancel(req);
3027                 goto out;
3028         }
3029         nfsvers = nmp->nm_vers;
3030
             /* rcb_args[0] and [1] hold this RPC's offset within the buffer and its length */
3031         offset = cb.rcb_args[0];
3032         rlen = length = cb.rcb_args[1];
3033
3034         /* finish the RPC */
3035         error = nmp->nm_funcs->nf_write_rpc_async_finish(np, req, &committed, &rlen, &wverf);
3036         if ((error == EINPROGRESS) && cb.rcb_func) {
3037                 /* async request restarted */
                     /* drop the extra request reference taken above (the inner rcb_func test repeats the outer condition) */
3038                 if (cb.rcb_func) {
3039                         nfs_request_rele(req);
3040                 }
3041                 if (IS_VALID_CRED(cred)) {
3042                         kauth_cred_unref(&cred);
3043                 }
3044                 return;
3045         }
3046 #if CONFIG_NFS4
             /*
              * NFSv4: for errors accepted by nfs_mount_state_error_should_restart()
              * the write is retried instead of failing the buffer, unless the node
              * has NREVOKE set (then the error becomes EIO).
              */
3047         if ((nmp->nm_vers >= NFS_VER4) && nfs_mount_state_error_should_restart(error) && !ISSET(bp->nb_flags, NB_ERROR)) {
3048                 lck_mtx_lock(&nmp->nm_lock);
                     /*
                      * Only request recovery if the mount's state generation is still the
                      * one recorded in rcb_args[2] when this RPC was set up.
                      */
3049                 if ((error != NFSERR_OLD_STATEID) && (error != NFSERR_GRACE) && (cb.rcb_args[2] == nmp->nm_stategenid)) {
3050                         NP(np, "nfs_buf_write_rpc_finish: error %d @ 0x%llx, 0x%x 0x%x, initiating recovery",
3051                             error, NBOFF(bp) + offset, cb.rcb_args[2], nmp->nm_stategenid);
3052                         nfs_need_recover(nmp, error);
3053                 }
3054                 lck_mtx_unlock(&nmp->nm_lock);
3055                 if (np->n_flag & NREVOKE) {
3056                         error = EIO;
3057                 } else {
3058                         if (error == NFSERR_GRACE) {
3059                                 if (cb.rcb_func) {
3060                                         /*
3061                                          * For an async I/O request, handle a grace delay just like
3062                                          * jukebox errors.  Set the resend time and queue it up.
3063                                          */
3064                                         struct timeval now;
                                             /* discard any reply data attached to the request before it is reused */
3065                                         if (req->r_nmrep.nmc_mhead) {
3066                                                 mbuf_freem(req->r_nmrep.nmc_mhead);
3067                                                 req->r_nmrep.nmc_mhead = NULL;
3068                                         }
3069                                         req->r_error = 0;
3070                                         microuptime(&now);
3071                                         lck_mtx_lock(&req->r_mtx);
3072                                         req->r_resendtime = now.tv_sec + 2;
3073                                         req->r_xid = 0;                 // get a new XID
3074                                         req->r_flags |= R_RESTART;
3075                                         req->r_start = 0;
3076                                         nfs_asyncio_resend(req);
3077                                         lck_mtx_unlock(&req->r_mtx);
3078                                         if (IS_VALID_CRED(cred)) {
3079                                                 kauth_cred_unref(&cred);
3080                                         }
3081                                         /* Note: nfsreq reference taken will be dropped later when finished */
3082                                         return;
3083                                 }
3084                                 /* otherwise, just pause a couple seconds and retry */
3085                                 tsleep(&nmp->nm_state, (PZERO - 1), "nfsgrace", 2 * hz);
3086                         }
3087                         if (!(error = nfs_mount_state_wait_for_recovery(nmp))) {
                                     /* treat it as nothing written so the whole range is sent again */
3088                                 rlen = 0;
3089                                 goto writeagain;
3090                         }
3091                 }
3092         }
3093 #endif
3094         if (error) {
3095                 SET(bp->nb_flags, NB_ERROR);
3096                 bp->nb_error = error;
3097         }
             /* NFSv2 skips the commit level, verifier, and short-write handling below */
3098         if (error || (nfsvers == NFS_VER2)) {
3099                 goto out;
3100         }
             /* rlen is a size_t, so this test can only catch rlen == 0 */
3101         if (rlen <= 0) {
3102                 SET(bp->nb_flags, NB_ERROR);
3103                 bp->nb_error = error = EIO;
3104                 goto out;
3105         }
3106
3107         /* save lowest commit level returned */
3108         if (committed < bp->nb_commitlevel) {
3109                 bp->nb_commitlevel = committed;
3110         }
3111
3112         /* check the write verifier */
3113         if (!bp->nb_verf) {
3114                 bp->nb_verf = wverf;
3115         } else if (bp->nb_verf != wverf) {
3116                 /* verifier changed, so buffer will need to be rewritten */
3117                 bp->nb_flags |= NB_STALEWVERF;
3118                 bp->nb_commitlevel = NFS_WRITE_UNSTABLE;
3119                 bp->nb_verf = wverf;
3120         }
3121
3122         /*
3123          * check for a short write
3124          *
3125          * If the server didn't write all the data, then we
3126          * need to issue another write for the rest of it.
3127          * (Don't bother if the buffer hit an error or stale wverf.)
3128          */
3129         if (((int)rlen < length) && !(bp->nb_flags & (NB_STALEWVERF | NB_ERROR))) {
3130 #if CONFIG_NFS4
             /* also entered by goto from the NFSv4 retry path above, bypassing the short-write test */
3131 writeagain:
3132 #endif
                     /* advance past what was written and describe the remainder */
3133                 offset += rlen;
3134                 length -= rlen;
3135
3136                 auio = uio_createwithbuffer(1, NBOFF(bp) + offset, UIO_SYSSPACE,
3137                     UIO_WRITE, &uio_buf, sizeof(uio_buf));
3138                 uio_addiov(auio, CAST_USER_ADDR_T(bp->nb_data + offset), length);
3139
3140                 cb.rcb_args[0] = offset;
3141                 cb.rcb_args[1] = length;
3142 #if CONFIG_NFS4
3143                 if (nmp->nm_vers >= NFS_VER4) {
3144                         cb.rcb_args[2] = nmp->nm_stategenid;
3145                 }
3146 #endif
3147                 // XXX iomode should really match the original request
3148                 error = nmp->nm_funcs->nf_write_rpc_async(np, auio, length, thd, cred,
3149                     NFS_WRITE_FILESYNC, &cb, &wreq);
3150                 if (!error) {
                             /* the follow-up write was issued; drop the cred reference taken at "finish" */
3151                         if (IS_VALID_CRED(cred)) {
3152                                 kauth_cred_unref(&cred);
3153                         }
3154                         if (!cb.rcb_func) {
3155                                 /* if !async we'll need to wait for this RPC to finish */
3156                                 req = wreq;
3157                                 wreq = NULL;
3158                                 goto finish;
3159                         }
                             /* async: drop the extra reference taken on the old request at "finish" */
3160                         nfs_request_rele(req);
3161                         /*
3162                          * We're done here.
3163                          * Outstanding RPC count is unchanged.
3164                          * Callback will be called when RPC is done.
3165                          */
3166                         return;
3167                 }
                     /* couldn't issue the follow-up write: fail the buffer */
3168                 SET(bp->nb_flags, NB_ERROR);
3169                 bp->nb_error = error;
3170         }
3171
3172 out:
             /*
              * Async only: release the request reference taken at "finish".
              * NOTE(review): nfs_async_write_done() presumably balances the
              * nfs_async_write_start() done when the RPC was issued - confirm.
              * NOTE(review): this is also reached when nfs_mount_gone(nmp) was
              * true above - confirm nfs_async_write_done() tolerates that nmp.
              */
3173         if (cb.rcb_func) {
3174                 nfs_async_write_done(nmp);
3175                 nfs_request_rele(req);
3176         }
3177         /*
3178          * Decrement outstanding RPC count on buffer
3179          * and call nfs_buf_write_finish on last RPC.
3180          *
3181          * (Note: when there are multiple async RPCs issued for a
3182          * buffer we need nfs_buffer_mutex to avoid problems when
3183          * aborting a partially-initiated set of RPCs)
3184          */
3185         multasyncrpc = ISSET(bp->nb_flags, NB_MULTASYNCRPC);
3186         if (multasyncrpc) {
3187                 lck_mtx_lock(nfs_buf_mutex);
3188         }
3189
3190         bp->nb_rpcs--;
3191         finished = (bp->nb_rpcs == 0);
3192
3193         if (multasyncrpc) {
3194                 lck_mtx_unlock(nfs_buf_mutex);
3195         }
3196
3197         if (finished) {
                     /* remember the wakeup channel before finishing the buffer; only used in the multi-async-RPC case */
3198                 if (multasyncrpc) {
3199                         wakeme = &bp->nb_rpcs;
3200                 }
3201                 nfs_buf_write_finish(bp, thd, cred);
3202                 if (wakeme) {
3203                         wakeup(wakeme);
3204                 }
3205         }
3206
3207         if (IS_VALID_CRED(cred)) {
3208                 kauth_cred_unref(&cred);
3209         }
3210 }
3211
3212 /*
3213  * Send commit(s) for the given node's "needcommit" buffers
      *
      * np     - node whose dirty buffers are scanned for commit candidates
      * nowait - if nonzero, NBI_NOWAIT is added to the flags passed to
      *          nfs_buf_iterprepare()
      *
      * Returns 0 or an errno value: a nfs_node_lock() failure, ENXIO if the
      * mount is gone, EINVAL for an NFSv2 mount, or ENOBUFS if no buffer
      * qualified for commit.
      *
      * NOTE(review): the commit RPC status (retv) only decides whether the
      * buffers are put back on the dirty list; it is never copied into the
      * return value.  Also, "error" is overwritten by every
      * nfs_buf_acquire() call in the scan loop, so it can still hold the
      * last acquire failure when other buffers go on to be committed.
      * Confirm callers expect both.
3214  */
3215 int
3216 nfs_flushcommits(nfsnode_t np, int nowait)
3217 {
3218         struct nfsmount *nmp;
3219         struct nfsbuf *bp, *prevlbp, *lbp;
3220         struct nfsbuflists blist, commitlist;
3221         int error = 0, retv, wcred_set, flags, dirty;
3222         u_quad_t off, endoff, toff;
3223         uint64_t wverf;
3224         u_int32_t count;
3225         kauth_cred_t wcred = NULL;
3226
3227         FSDBG_TOP(557, np, 0, 0, 0);
3228
3229         /*
3230          * A nb_flags == (NB_DELWRI | NB_NEEDCOMMIT) block has been written to the
3231          * server, but has not been committed to stable storage on the server
3232          * yet. The byte range is worked out for as many nfsbufs as we can handle
3233          * and the commit rpc is done.
3234          */
3235         if (!LIST_EMPTY(&np->n_dirtyblkhd)) {
3236                 error = nfs_node_lock(np);
3237                 if (error) {
3238                         goto done;
3239                 }
3240                 np->n_flag |= NMODIFIED;
3241                 nfs_node_unlock(np);
3242         }
3243
             /* start with an empty commit range: off at the maximum value, endoff at 0 */
3244         off = (u_quad_t)-1;
3245         endoff = 0;
3246         wcred_set = 0;
3247         LIST_INIT(&commitlist);
3248
3249         nmp = NFSTONMP(np);
3250         if (nfs_mount_gone(nmp)) {
3251                 error = ENXIO;
3252                 goto done;
3253         }
3254         if (nmp->nm_vers == NFS_VER2) {
3255                 error = EINVAL;
3256                 goto done;
3257         }
3258
3259         flags = NBI_DIRTY;
3260         if (nowait) {
3261                 flags |= NBI_NOWAIT;
3262         }
3263         lck_mtx_lock(nfs_buf_mutex);
             /* snapshot the mount's write verifier; only buffers whose nb_verf matches it are committed */
3264         wverf = nmp->nm_verf;
3265         if (!nfs_buf_iterprepare(np, &blist, flags)) {
3266                 while ((bp = LIST_FIRST(&blist))) {
                             /* move the buffer from the iteration list back onto the node's dirty list */
3267                         LIST_REMOVE(bp, nb_vnbufs);
3268                         LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
                             /* acquire with NBAC_NOWAIT; skip the buffer on any failure */
3269                         error = nfs_buf_acquire(bp, NBAC_NOWAIT, 0, 0);
3270                         if (error) {
3271                                 continue;
3272                         }
3273                         if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
3274                                 nfs_buf_check_write_verifier(np, bp);
3275                         }
                             /* only delayed-write buffers that need commit and carry the current verifier qualify */
3276                         if (((bp->nb_flags & (NB_DELWRI | NB_NEEDCOMMIT)) != (NB_DELWRI | NB_NEEDCOMMIT)) ||
3277                             (bp->nb_verf != wverf)) {
3278                                 nfs_buf_drop(bp);
3279                                 continue;
3280                         }
3281                         nfs_buf_remfree(bp);
3282
3283                         /* buffer UPLs will be grabbed *in order* below */
3284
3285                         FSDBG(557, bp, bp->nb_flags, bp->nb_valid, bp->nb_dirty);
3286                         FSDBG(557, bp->nb_validoff, bp->nb_validend,
3287                             bp->nb_dirtyoff, bp->nb_dirtyend);
3288
3289                         /*
3290                          * Work out if all buffers are using the same cred
3291                          * so we can deal with them all with one commit.
3292                          *
3293                          * Note: creds in bp's must be obtained by kauth_cred_ref
3294                          * on the same original cred in order for them to be equal.
3295                          */
                             /* wcred_set: 0 = no cred seen yet, 1 = all buffers so far share wcred, -1 = creds differ */
3296                         if (wcred_set == 0) {
3297                                 wcred = bp->nb_wcred;
3298                                 if (!IS_VALID_CRED(wcred)) {
3299                                         panic("nfs: needcommit w/out wcred");
3300                                 }
3301                                 wcred_set = 1;
3302                         } else if ((wcred_set == 1) && wcred != bp->nb_wcred) {
3303                                 wcred_set = -1;
3304                         }
3305                         SET(bp->nb_flags, NB_WRITEINPROG);
3306
3307                         /*
3308                          * Add this buffer to the list of buffers we are committing.
3309                          * Buffers are inserted into the list in ascending order so that
3310                          * we can take the UPLs in order after the list is complete.
3311                          */
                             /* find the last list entry whose block number is not greater than bp's */
3312                         prevlbp = NULL;
3313                         LIST_FOREACH(lbp, &commitlist, nb_vnbufs) {
3314                                 if (bp->nb_lblkno < lbp->nb_lblkno) {
3315                                         break;
3316                                 }
3317                                 prevlbp = lbp;
3318                         }
3319                         LIST_REMOVE(bp, nb_vnbufs);
3320                         if (prevlbp) {
3321                                 LIST_INSERT_AFTER(prevlbp, bp, nb_vnbufs);
3322                         } else {
3323                                 LIST_INSERT_HEAD(&commitlist, bp, nb_vnbufs);
3324                         }
3325
3326                         /* update commit range start, end */
3327                         toff = NBOFF(bp) + bp->nb_dirtyoff;
3328                         if (toff < off) {
3329                                 off = toff;
3330                         }
3331                         toff += (u_quad_t)(bp->nb_dirtyend - bp->nb_dirtyoff);
3332                         if (toff > endoff) {
3333                                 endoff = toff;
3334                         }
3335                 }
3336                 nfs_buf_itercomplete(np, &blist, NBI_DIRTY);
3337         }
3338         lck_mtx_unlock(nfs_buf_mutex);
3339
3340         if (LIST_EMPTY(&commitlist)) {
3341                 error = ENOBUFS;
3342                 goto done;
3343         }
3344
3345         /*
3346          * We need a UPL to prevent others from accessing the buffers during
3347          * our commit RPC(s).
3348          *
3349          * We used to also check for dirty pages here; if there were any we'd
3350          * abort the commit and force the entire buffer to be written again.
3351          * Instead of doing that, we just go ahead and commit the dirty range,
3352          * and then leave the buffer around with dirty pages that will be
3353          * written out later.
3354          */
3355         LIST_FOREACH(bp, &commitlist, nb_vnbufs) {
3356                 if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
3357                         retv = nfs_buf_upl_setup(bp);
3358                         if (retv) {
3359                                 /* Unable to create the UPL, the VM object probably no longer exists. */
3360                                 printf("nfs_flushcommits: upl create failed %d\n", retv);
3361                                 bp->nb_valid = bp->nb_dirty = 0;
3362                         }
3363                 }
3364                 nfs_buf_upl_check(bp);
3365         }
3366
3367         /*
3368          * Commit data on the server, as required.
3369          * If all bufs are using the same wcred, then use that with
3370          * one call for all of them, otherwise commit each one
3371          * separately.
3372          */
3373         if (wcred_set == 1) {
3374                 /*
3375                  * Note, it's possible the commit range could be >2^32-1.
3376                  * If it is, we'll send one commit that covers the whole file.
3377                  */
3378                 if ((endoff - off) > 0xffffffff) {
3379                         count = 0;
3380                 } else {
3381                         count = (endoff - off);
3382                 }
3383                 retv = nmp->nm_funcs->nf_commit_rpc(np, off, count, wcred, wverf);
3384         } else {
                     /* one commit per buffer, each with its own cred; stop at the first failure */
3385                 retv = 0;
3386                 LIST_FOREACH(bp, &commitlist, nb_vnbufs) {
3387                         toff = NBOFF(bp) + bp->nb_dirtyoff;
3388                         count = bp->nb_dirtyend - bp->nb_dirtyoff;
3389                         retv = nmp->nm_funcs->nf_commit_rpc(np, toff, count, bp->nb_wcred, wverf);
3390                         if (retv) {
3391                                 break;
3392                         }
3393                 }
3394         }
3395
3396         /*
3397          * Now, either mark the blocks I/O done or mark the
3398          * blocks dirty, depending on whether the commit
3399          * succeeded.
3400          */
3401         while ((bp = LIST_FIRST(&commitlist))) {
3402                 LIST_REMOVE(bp, nb_vnbufs);
3403                 FSDBG(557, bp, retv, bp->nb_flags, bp->nb_dirty);
                     /* the needcommit state is cleared and the count dropped whether or not the commit succeeded */
3404                 nfs_node_lock_force(np);
3405                 CLR(bp->nb_flags, (NB_NEEDCOMMIT | NB_WRITEINPROG));
3406                 np->n_needcommitcnt--;
3407                 CHECK_NEEDCOMMITCNT(np);
3408                 nfs_node_unlock(np);
3409
3410                 if (retv) {
3411                         /* move back to dirty list */
3412                         lck_mtx_lock(nfs_buf_mutex);
3413                         LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
3414                         lck_mtx_unlock(nfs_buf_mutex);
3415                         nfs_buf_release(bp, 1);
3416                         continue;
3417                 }
3418
                     /* commit succeeded: account for the buffer as an output in progress, then complete it */
3419                 nfs_node_lock_force(np);
3420                 np->n_numoutput++;
3421                 nfs_node_unlock(np);
3422                 vnode_startwrite(NFSTOV(np));
3423                 if (ISSET(bp->nb_flags, NB_DELWRI)) {
3424                         lck_mtx_lock(nfs_buf_mutex);
3425                         nfs_nbdwrite--;
3426                         NFSBUFCNTCHK();
3427                         lck_mtx_unlock(nfs_buf_mutex);
3428                         wakeup(&nfs_nbdwrite);
3429                 }
3430                 CLR(bp->nb_flags, (NB_READ | NB_DONE | NB_ERROR | NB_DELWRI));
3431                 /* if block still has dirty pages, we don't want it to */
3432                 /* be released in nfs_buf_iodone().  So, don't set NB_ASYNC. */
3433                 if (!(dirty = bp->nb_dirty)) {
3434                         SET(bp->nb_flags, NB_ASYNC);
3435                 } else {
3436                         CLR(bp->nb_flags, NB_ASYNC);
3437                 }
3438
3439                 /* move to clean list */
3440                 lck_mtx_lock(nfs_buf_mutex);
3441                 LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
3442                 lck_mtx_unlock(nfs_buf_mutex);
3443
3444                 bp->nb_dirtyoff = bp->nb_dirtyend = 0;
3445
                     /* "dirty" was sampled before nfs_buf_iodone(); it decides the requeue below */
3446                 nfs_buf_iodone(bp);
3447                 if (dirty) {
3448                         /* throw it back in as a delayed write buffer */
3449                         CLR(bp->nb_flags, NB_DONE);
3450                         nfs_buf_write_delayed(bp);
3451                 }
3452         }
3453
3454 done:
3455         FSDBG_BOT(557, np, 0, 0, error);
3456         return error;
3457 }
3458
3459 /*
3460  * Flush all the blocks associated with a vnode.
3461  *      Walk through the buffer pool and push any dirty pages
3462  *      associated with the vnode.
      *
      * np              - node to flush
      * waitfor         - MNT_WAIT or MNT_DWAIT make this wait for started
      *                   writes to complete and repeat until the dirty list
      *                   is empty; any other value only starts writes
      * thd             - thread handed to nfs_sigintr() for interrupt checks
      * ignore_writeerr - if nonzero, a pending NWRITEERR on the node is
      *                   neither returned nor cleared
      *
      * Returns 0 or an errno value (ENXIO if the mount is gone, an
      * interrupt/sleep error, or the node's saved write error).
      *
      * Only one flush runs per node at a time: NBFLUSHINPROG in n_bflag is
      * claimed under nfs_buf_mutex and released at "done".
3463  */
3464 int
3465 nfs_flush(nfsnode_t np, int waitfor, thread_t thd, int ignore_writeerr)
3466 {
3467         struct nfsbuf *bp;
3468         struct nfsbuflists blist;
3469         struct nfsmount *nmp = NFSTONMP(np);
3470         int error = 0, error2, slptimeo = 0, slpflag = 0;
3471         int nfsvers, flags, passone = 1;
3472
3473         FSDBG_TOP(517, np, waitfor, ignore_writeerr, 0);
3474
3475         if (nfs_mount_gone(nmp)) {
3476                 error = ENXIO;
3477                 goto out;
3478         }
3479         nfsvers = nmp->nm_vers;
             /* mounts with the INTR flag sleep with PCATCH */
3480         if (NMFLAG(nmp, INTR)) {
3481                 slpflag = PCATCH;
3482         }
3483
3484         if (!LIST_EMPTY(&np->n_dirtyblkhd)) {
3485                 nfs_node_lock_force(np);
3486                 np->n_flag |= NMODIFIED;
3487                 nfs_node_unlock(np);
3488         }
3489
             /* wait for any flush already in progress on this node, then claim the flag */
3490         lck_mtx_lock(nfs_buf_mutex);
3491         while (np->n_bflag & NBFLUSHINPROG) {
3492                 np->n_bflag |= NBFLUSHWANT;
3493                 error = msleep(&np->n_bflag, nfs_buf_mutex, slpflag, "nfs_flush", NULL);
3494                 if ((error && (error != EWOULDBLOCK)) ||
3495                     ((error = nfs_sigintr(NFSTONMP(np), NULL, thd, 0)))) {
                             /* NBFLUSHINPROG isn't ours here, so bypass the cleanup at "done" */
3496                         lck_mtx_unlock(nfs_buf_mutex);
3497                         goto out;
3498                 }
3499         }
3500         np->n_bflag |= NBFLUSHINPROG;
3501
3502         /*
3503          * On the first pass, start async/unstable writes on all
3504          * delayed write buffers.  Then wait for all writes to complete
3505          * and call nfs_flushcommits() to commit any uncommitted buffers.
3506          * On all subsequent passes, start STABLE writes on any remaining
3507          * dirty buffers.  Then wait for all writes to complete.
3508          */
     /* entered with nfs_buf_mutex held */
3509 again:
3510         FSDBG(518, LIST_FIRST(&np->n_dirtyblkhd), np->n_flag, 0, 0);
3511         if (!NFSTONMP(np)) {
3512                 lck_mtx_unlock(nfs_buf_mutex);
3513                 error = ENXIO;
3514                 goto done;
3515         }
3516
3517         /* Start/do any write(s) that are required. */
3518         if (!nfs_buf_iterprepare(np, &blist, NBI_DIRTY)) {
3519                 while ((bp = LIST_FIRST(&blist))) {
                             /* move the buffer from the iteration list back onto the node's dirty list */
3520                         LIST_REMOVE(bp, nb_vnbufs);
3521                         LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
                             /* first pass, or a caller that isn't waiting: acquire with NBAC_NOWAIT */
3522                         flags = (passone || !(waitfor == MNT_WAIT || waitfor == MNT_DWAIT)) ? NBAC_NOWAIT : 0;
                             /* NOTE(review): presumably the reference keeps bp around while nfs_buf_acquire() may sleep - confirm */
3523                         if (flags != NBAC_NOWAIT) {
3524                                 nfs_buf_refget(bp);
3525                         }
3526                         while ((error = nfs_buf_acquire(bp, flags, slpflag, slptimeo))) {
3527                                 FSDBG(524, bp, flags, bp->nb_lflags, bp->nb_flags);
3528                                 if (error == EBUSY) {
3529                                         break;
3530                                 }
3531                                 if (error) {
3532                                         error2 = nfs_sigintr(NFSTONMP(np), NULL, thd, 0);
3533                                         if (error2) {
3534                                                 if (flags != NBAC_NOWAIT) {
3535                                                         nfs_buf_refrele(bp);
3536                                                 }
3537                                                 nfs_buf_itercomplete(np, &blist, NBI_DIRTY);
3538                                                 lck_mtx_unlock(nfs_buf_mutex);
3539                                                 error = error2;
3540                                                 goto done;
3541                                         }
                                             /* nfs_sigintr() reported nothing: retry with timed sleeps and without PCATCH */
3542                                         if (slpflag == PCATCH) {
3543                                                 slpflag = 0;
3544                                                 slptimeo = 2 * hz;
3545                                         }
3546                                 }
3547                         }
3548                         if (flags != NBAC_NOWAIT) {
3549                                 nfs_buf_refrele(bp);
3550                         }
                             /* busy buffer: leave it on the dirty list for a later pass */
3551                         if (error == EBUSY) {
3552                                 continue;
3553                         }
3554                         if (!bp->nb_np) {
3555                                 /* buffer is no longer valid */
3556                                 nfs_buf_drop(bp);
3557                                 continue;
3558                         }
3559                         if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
3560                                 nfs_buf_check_write_verifier(np, bp);
3561                         }
3562                         if (!ISSET(bp->nb_flags, NB_DELWRI)) {
3563                                 /* buffer is no longer dirty */
3564                                 nfs_buf_drop(bp);
3565                                 continue;
3566                         }
3567                         FSDBG(525, bp, passone, bp->nb_lflags, bp->nb_flags);
                             /* buffers that only need a commit are not rewritten on the first pass or when not waiting */
3568                         if ((passone || !(waitfor == MNT_WAIT || waitfor == MNT_DWAIT)) &&
3569                             ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
3570                                 nfs_buf_drop(bp);
3571                                 continue;
3572                         }
3573                         nfs_buf_remfree(bp);
                             /* nfs_buf_mutex is dropped around the release/write and retaken before the scan continues */
3574                         lck_mtx_unlock(nfs_buf_mutex);
3575                         if (ISSET(bp->nb_flags, NB_ERROR)) {
                                     /* buffer already in error: record it on the node instead of writing it again */
3576                                 nfs_node_lock_force(np);
3577                                 np->n_error = bp->nb_error ? bp->nb_error : EIO;
3578                                 np->n_flag |= NWRITEERR;
3579                                 nfs_node_unlock(np);
3580                                 nfs_buf_release(bp, 1);
3581                                 lck_mtx_lock(nfs_buf_mutex);
3582                                 continue;
3583                         }
3584                         SET(bp->nb_flags, NB_ASYNC);
3585                         if (!passone) {
3586                                 /* NB_STABLE forces this to be written FILESYNC */
3587                                 SET(bp->nb_flags, NB_STABLE);
3588                         }
3589                         nfs_buf_write(bp);
3590                         lck_mtx_lock(nfs_buf_mutex);
3591                 }
3592                 nfs_buf_itercomplete(np, &blist, NBI_DIRTY);
3593         }
3594         lck_mtx_unlock(nfs_buf_mutex);
3595
             /* waiting callers block until the vnode's outstanding writes finish */
3596         if (waitfor == MNT_WAIT || waitfor == MNT_DWAIT) {
3597                 while ((error = vnode_waitforwrites(NFSTOV(np), 0, slpflag, slptimeo, "nfsflush"))) {
3598                         error2 = nfs_sigintr(NFSTONMP(np), NULL, thd, 0);
3599                         if (error2) {
3600                                 error = error2;
3601                                 goto done;
3602                         }
3603                         if (slpflag == PCATCH) {
3604                                 slpflag = 0;
3605                                 slptimeo = 2 * hz;
3606                         }
3607                 }
3608         }
3609
3610         if (nfsvers != NFS_VER2) {
3611                 /* loop while it looks like there are still buffers to be */
3612                 /* committed and nfs_flushcommits() seems to be handling them. */
3613                 while (np->n_needcommitcnt) {
3614                         if (nfs_flushcommits(np, 0)) {
3615                                 break;
3616                         }
3617                 }
3618         }
3619
             /* a second pass is always made after the first */
3620         if (passone) {
3621                 passone = 0;
3622                 if (!LIST_EMPTY(&np->n_dirtyblkhd)) {
3623                         nfs_node_lock_force(np);
3624                         np->n_flag |= NMODIFIED;
3625                         nfs_node_unlock(np);
3626                 }
3627                 lck_mtx_lock(nfs_buf_mutex);
3628                 goto again;
3629         }
3630
             /* waiting callers keep making passes until the dirty list is empty */
3631         if (waitfor == MNT_WAIT || waitfor == MNT_DWAIT) {
3632                 if (!LIST_EMPTY(&np->n_dirtyblkhd)) {
3633                         nfs_node_lock_force(np);
3634                         np->n_flag |= NMODIFIED;
3635                         nfs_node_unlock(np);
3636                 }
3637                 lck_mtx_lock(nfs_buf_mutex);
3638                 if (!LIST_EMPTY(&np->n_dirtyblkhd)) {
3639                         goto again;
3640                 }
3641                 lck_mtx_unlock(nfs_buf_mutex);
3642                 nfs_node_lock_force(np);
3643                 /*
3644                  * OK, it looks like there are no dirty blocks.  If we have no
3645                  * writes in flight and no one in the write code, we can clear
3646                  * the modified flag.  In order to make sure we see the latest
3647                  * attributes and size, we also invalidate the attributes and
3648                  * advance the attribute cache XID to guarantee that attributes
3649                  * newer than our clearing of NMODIFIED will get loaded next.
3650                  * (If we don't do this, it's possible for the flush's final
3651                  * write/commit (xid1) to be executed in parallel with a subsequent
3652                  * getattr request (xid2).  The getattr could return attributes
3653                  * from *before* the write/commit completed but the stale attributes
3654                  * would be preferred because of the xid ordering.)
3655                  */
3656                 if (!np->n_wrbusy && !np->n_numoutput) {
3657                         np->n_flag &= ~NMODIFIED;
3658                         NATTRINVALIDATE(np);
3659                         nfs_get_xid(&np->n_xid);
3660                 }
3661         } else {
3662                 nfs_node_lock_force(np);
3663         }
3664
             /* node lock is held on both branches above; report and clear a saved write error unless told to ignore it */
3665         FSDBG(526, np->n_flag, np->n_error, 0, 0);
3666         if (!ignore_writeerr && (np->n_flag & NWRITEERR)) {
3667                 error = np->n_error;
3668                 np->n_flag &= ~NWRITEERR;
3669         }
3670         nfs_node_unlock(np);
3671 done:
             /* release the flush-in-progress claim and wake any thread that set NBFLUSHWANT */
3672         lck_mtx_lock(nfs_buf_mutex);
3673         flags = np->n_bflag;
3674         np->n_bflag &= ~(NBFLUSHINPROG | NBFLUSHWANT);
3675         lck_mtx_unlock(nfs_buf_mutex);
3676         if (flags & NBFLUSHWANT) {
3677                 wakeup(&np->n_bflag);
3678         }
3679 out:
3680         FSDBG_BOT(517, np, error, ignore_writeerr, 0);
3681         return error;
3682 }
3683
3684 /*
3685  * Flush out and invalidate all buffers associated with a vnode.
3686  * Called with the underlying object locked.
3687  */
3688 int
3689 nfs_vinvalbuf_internal(
3690         nfsnode_t np,
3691         int flags,
3692         thread_t thd,
3693         kauth_cred_t cred,
3694         int slpflag,
3695         int slptimeo)
3696 {
3697         struct nfsbuf *bp;
3698         struct nfsbuflists blist;
3699         int list, error = 0;
3700
3701         if (flags & V_SAVE) {
3702                 if ((error = nfs_flush(np, MNT_WAIT, thd, (flags & V_IGNORE_WRITEERR)))) {
3703                         return error;
3704                 }
3705         }
3706
3707         lck_mtx_lock(nfs_buf_mutex);
3708         for (;;) {
3709                 list = NBI_CLEAN;
3710                 if (nfs_buf_iterprepare(np, &blist, list)) {
3711                         list = NBI_DIRTY;
3712                         if (nfs_buf_iterprepare(np, &blist, list)) {
3713                                 break;
3714                         }
3715                 }
3716                 while ((bp = LIST_FIRST(&blist))) {
3717                         LIST_REMOVE(bp, nb_vnbufs);
3718                         if (list == NBI_CLEAN) {
3719                                 LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
3720                         } else {
3721                                 LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
3722                         }
3723                         nfs_buf_refget(bp);
3724                         while ((error = nfs_buf_acquire(bp, NBAC_REMOVE, slpflag, slptimeo))) {
3725                                 FSDBG(556, np, bp, NBOFF(bp), bp->nb_flags);
3726                                 if (error != EAGAIN) {
3727                                         FSDBG(554, np, bp, -1, error);
3728                                         nfs_buf_refrele(bp);
3729                                         nfs_buf_itercomplete(np, &blist, list);
3730                                         lck_mtx_unlock(nfs_buf_mutex);
3731                                         return error;
3732                                 }
3733                         }
3734                         nfs_buf_refrele(bp);
3735                         FSDBG(554, np, bp, NBOFF(bp), bp->nb_flags);
3736                         lck_mtx_unlock(nfs_buf_mutex);
3737                         if ((flags & V_SAVE) && UBCINFOEXISTS(NFSTOV(np)) && bp->nb_np &&
3738                             (NBOFF(bp) < (off_t)np->n_size)) {
3739                                 /* extra paranoia: make sure we're not */
3740                                 /* somehow leaving any dirty data around */
3741                                 int mustwrite = 0;
3742                                 int end = (NBOFF(bp) + bp->nb_bufsize > (off_t)np->n_size) ?
3743                                     ((off_t)np->n_size - NBOFF(bp)) : bp->nb_bufsize;
3744                                 if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
3745                                         error = nfs_buf_upl_setup(bp);
3746                                         if (error == EINVAL) {
3747                                                 /* vm object must no longer exist */
3748                                                 /* hopefully we don't need to do */
3749                                                 /* anything for this buffer */
3750                                         } else if (error) {
3751                                                 printf("nfs_vinvalbuf: upl setup failed %d\n", error);
3752                                         }
3753                                         bp->nb_valid = bp->nb_dirty = 0;
3754                                 }
3755                                 nfs_buf_upl_check(bp);
3756                                 /* check for any dirty data before the EOF */
3757                                 if ((bp->nb_dirtyend > 0) && (bp->nb_dirtyoff < end)) {
3758                                         /* clip dirty range to EOF */
3759                                         if (bp->nb_dirtyend > end) {
3760                                                 bp->nb_dirtyend = end;
3761                                                 if (bp->nb_dirtyoff >= bp->nb_dirtyend) {
3762                                                         bp->nb_dirtyoff = bp->nb_dirtyend = 0;
3763                                                 }
3764                                         }
3765                                         if ((bp->nb_dirtyend > 0) && (bp->nb_dirtyoff < end)) {
3766                                                 mustwrite++;
3767                                         }
3768                                 }
3769                                 bp->nb_dirty &= (1 << (round_page_32(end) / PAGE_SIZE)) - 1;
3770                                 if (bp->nb_dirty) {
3771                                         mustwrite++;
3772                                 }
3773                                 /* also make sure we'll have a credential to do the write */
3774                                 if (mustwrite && !IS_VALID_CRED(bp->nb_wcred) && !IS_VALID_CRED(cred)) {
3775                                         printf("nfs_vinvalbuf: found dirty buffer with no write creds\n");
3776                                         mustwrite = 0;
3777                                 }
3778                                 if (mustwrite) {
3779                                         FSDBG(554, np, bp, 0xd00dee, bp->nb_flags);
3780                                         if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
3781                                                 panic("nfs_vinvalbuf: dirty buffer without upl");
3782                                         }
3783                                         /* gotta write out dirty data before invalidating */
3784                                         /* (NB_STABLE indicates that data writes should be FILESYNC) */
3785                                         /* (NB_NOCACHE indicates buffer should be discarded) */
3786                                         CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL | NB_ASYNC));
3787                                         SET(bp->nb_flags, NB_STABLE | NB_NOCACHE);
3788                                         if (!IS_VALID_CRED(bp->nb_wcred)) {
3789                                                 kauth_cred_ref(cred);
3790                                                 bp->nb_wcred = cred;
3791                                         }
3792                                         error = nfs_buf_write(bp);
3793                                         // Note: bp has been released
3794                                         if (error) {
3795                                                 FSDBG(554, bp, 0xd00dee, 0xbad, error);
3796                                                 nfs_node_lock_force(np);
3797                                                 if ((error != EINTR) && (error != ERESTART)) {
3798                                                         np->n_error = error;
3799                                                         np->n_flag |= NWRITEERR;
3800                                                 }
3801                                                 /*
3802                                                  * There was a write error and we need to
3803                                                  * invalidate attrs to sync with server.
3804                                                  * (if this write was extending the file,
3805                                                  * we may no longer know the correct size)
3806                                                  */
3807                                                 NATTRINVALIDATE(np);
3808                                                 nfs_node_unlock(np);
3809                                                 if ((error == EINTR) || (error == ERESTART)) {
3810                                                         /*
3811                                                          * Abort on EINTR.  If we don't, we could
3812                                                          * be stuck in this loop forever because
3813                                                          * the buffer will continue to stay dirty.
3814                                                          */
3815                                                         lck_mtx_lock(nfs_buf_mutex);
3816                                                         nfs_buf_itercomplete(np, &blist, list);
3817                                                         lck_mtx_unlock(nfs_buf_mutex);
3818                                                         return error;
3819                                                 }
3820                                                 error = 0;
3821                                         }
3822                                         lck_mtx_lock(nfs_buf_mutex);
3823                                         continue;
3824                                 }
3825                         }
3826                         SET(bp->nb_flags, NB_INVAL);
3827                         // hold off on FREEUPs until we're done here
3828                         nfs_buf_release(bp, 0);
3829                         lck_mtx_lock(nfs_buf_mutex);
3830                 }
3831                 nfs_buf_itercomplete(np, &blist, list);
3832         }
3833         if (!LIST_EMPTY(&(np)->n_dirtyblkhd) || !LIST_EMPTY(&(np)->n_cleanblkhd)) {
3834                 panic("nfs_vinvalbuf: flush/inval failed");
3835         }
3836         lck_mtx_unlock(nfs_buf_mutex);
3837         nfs_node_lock_force(np);
3838         if (!(flags & V_SAVE)) {
3839                 np->n_flag &= ~NMODIFIED;
3840         }
3841         if (vnode_vtype(NFSTOV(np)) == VREG) {
3842                 np->n_lastrahead = -1;
3843         }
3844         nfs_node_unlock(np);
3845         NFS_BUF_FREEUP();
3846         return 0;
3847 }
3848
3849
3850 /*
3851  * Flush and invalidate all dirty buffers. If another process is already
3852  * doing the flush, just wait for completion.
3853  */
3854 int
3855 nfs_vinvalbuf(vnode_t vp, int flags, vfs_context_t ctx, int intrflg)
3856 {
3857         return nfs_vinvalbuf2(vp, flags, vfs_context_thread(ctx), vfs_context_ucred(ctx), intrflg);
3858 }
3859
3860 int
3861 nfs_vinvalbuf2(vnode_t vp, int flags, thread_t thd, kauth_cred_t cred, int intrflg)
3862 {
3863         nfsnode_t np = VTONFS(vp);
3864         struct nfsmount *nmp = VTONMP(vp);
3865         int error, slpflag, slptimeo, nflags, retry = 0;
3866         int ubcflags = UBC_PUSHALL | UBC_SYNC | UBC_INVALIDATE;
3867         struct timespec ts = { .tv_sec = 2, .tv_nsec = 0 };
3868         off_t size;
3869
3870         FSDBG_TOP(554, np, flags, intrflg, 0);
3871
3872         /*
3873          * If the mount is gone no sense to try and write anything.
3874          * and hang trying to do IO.
3875          */
3876         if (nfs_mount_gone(nmp)) {
3877                 flags &= ~V_SAVE;
3878                 ubcflags &= ~UBC_PUSHALL;
3879         }
3880
3881         if (nmp && !NMFLAG(nmp, INTR)) {
3882                 intrflg = 0;
3883         }
3884         if (intrflg) {
3885                 slpflag = PCATCH;
3886                 slptimeo = 2 * hz;
3887         } else {
3888                 slpflag = 0;
3889                 slptimeo = 0;
3890         }
3891
3892         /* First wait for any other process doing a flush to complete.  */
3893         lck_mtx_lock(nfs_buf_mutex);
3894         while (np->n_bflag & NBINVALINPROG) {
3895                 np->n_bflag |= NBINVALWANT;
3896                 msleep(&np->n_bflag, nfs_buf_mutex, slpflag, "nfs_vinvalbuf", &ts);
3897                 if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0))) {
3898                         lck_mtx_unlock(nfs_buf_mutex);
3899                         return error;
3900                 }
3901                 if (np->n_bflag & NBINVALINPROG) {
3902                         slpflag = 0;
3903                 }
3904         }
3905         np->n_bflag |= NBINVALINPROG;
3906         lck_mtx_unlock(nfs_buf_mutex);
3907
3908         /* Now, flush as required.  */
3909 again:
3910         error = nfs_vinvalbuf_internal(np, flags, thd, cred, slpflag, 0);
3911         while (error) {
3912                 FSDBG(554, np, 0, 0, error);
3913                 if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0))) {
3914                         goto done;
3915                 }
3916                 error = nfs_vinvalbuf_internal(np, flags, thd, cred, 0, slptimeo);
3917         }
3918
3919         /* get the pages out of vm also */
3920         if (UBCINFOEXISTS(vp) && (size = ubc_getsize(vp))) {
3921                 if ((error = ubc_msync(vp, 0, size, NULL, ubcflags))) {
3922                         if (error == EINVAL) {
3923                                 panic("nfs_vinvalbuf(): ubc_msync failed!, error %d", error);
3924                         }
3925                         if (retry++ < 10) { /* retry invalidating a few times */
3926                                 if (retry > 1 || error == ENXIO) {
3927                                         ubcflags &= ~UBC_PUSHALL;
3928                                 }
3929                                 goto again;
3930                         }
3931                         /* give up */
3932                         printf("nfs_vinvalbuf(): ubc_msync failed!, error %d\n", error);
3933                 }
3934         }
3935 done:
3936         lck_mtx_lock(nfs_buf_mutex);
3937         nflags = np->n_bflag;
3938         np->n_bflag &= ~(NBINVALINPROG | NBINVALWANT);
3939         lck_mtx_unlock(nfs_buf_mutex);
3940         if (nflags & NBINVALWANT) {
3941                 wakeup(&np->n_bflag);
3942         }
3943
3944         FSDBG_BOT(554, np, flags, intrflg, error);
3945         return error;
3946 }
3947
3948 /*
3949  * Wait for any busy buffers to complete.
3950  */
3951 void
3952 nfs_wait_bufs(nfsnode_t np)
3953 {
3954         struct nfsbuf *bp;
3955         struct nfsbuflists blist;
3956         int error = 0;
3957
3958         lck_mtx_lock(nfs_buf_mutex);
3959         if (!nfs_buf_iterprepare(np, &blist, NBI_CLEAN)) {
3960                 while ((bp = LIST_FIRST(&blist))) {
3961                         LIST_REMOVE(bp, nb_vnbufs);
3962                         LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
3963                         nfs_buf_refget(bp);
3964                         while ((error = nfs_buf_acquire(bp, 0, 0, 0))) {
3965                                 if (error != EAGAIN) {
3966                                         nfs_buf_refrele(bp);
3967                                         nfs_buf_itercomplete(np, &blist, NBI_CLEAN);
3968                                         lck_mtx_unlock(nfs_buf_mutex);
3969                                         return;
3970                                 }
3971                         }
3972                         nfs_buf_refrele(bp);
3973                         nfs_buf_drop(bp);
3974                 }
3975                 nfs_buf_itercomplete(np, &blist, NBI_CLEAN);
3976         }
3977         if (!nfs_buf_iterprepare(np, &blist, NBI_DIRTY)) {
3978                 while ((bp = LIST_FIRST(&blist))) {
3979                         LIST_REMOVE(bp, nb_vnbufs);
3980                         LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
3981                         nfs_buf_refget(bp);
3982                         while ((error = nfs_buf_acquire(bp, 0, 0, 0))) {
3983                                 if (error != EAGAIN) {
3984                                         nfs_buf_refrele(bp);
3985                                         nfs_buf_itercomplete(np, &blist, NBI_DIRTY);
3986                                         lck_mtx_unlock(nfs_buf_mutex);
3987                                         return;
3988                                 }
3989                         }
3990                         nfs_buf_refrele(bp);
3991                         nfs_buf_drop(bp);
3992                 }
3993                 nfs_buf_itercomplete(np, &blist, NBI_DIRTY);
3994         }
3995         lck_mtx_unlock(nfs_buf_mutex);
3996 }
3997
3998
3999 /*
4000  * Add an async I/O request to the mount's async I/O queue and make
4001  * sure that an nfsiod will service it.
4002  */
4003 void
4004 nfs_asyncio_finish(struct nfsreq *req)
4005 {
4006         struct nfsmount *nmp;
4007         struct nfsiod *niod;
4008         int started = 0;
4009
4010         FSDBG_TOP(552, nmp, 0, 0, 0);
4011 again:
4012         nmp = req->r_nmp;
4013
4014         if (nmp == NULL) {
4015                 return;
4016         }
4017
4018         lck_mtx_lock(nfsiod_mutex);
4019         niod = nmp->nm_niod;
4020
4021         /* grab an nfsiod if we don't have one already */
4022         if (!niod) {
4023                 niod = TAILQ_FIRST(&nfsiodfree);
4024                 if (niod) {
4025                         TAILQ_REMOVE(&nfsiodfree, niod, niod_link);
4026                         TAILQ_INSERT_TAIL(&nfsiodwork, niod, niod_link);
4027                         niod->niod_nmp = nmp;
4028                 } else if (((nfsiod_thread_count < NFSIOD_MAX) || (nfsiod_thread_count <= 0)) && (started < 4)) {
4029                         /*
4030                          * Try starting a new thread.
4031                          * We may try a couple times if other callers
4032                          * get the new threads before we do.
4033                          */
4034                         lck_mtx_unlock(nfsiod_mutex);
4035                         started++;
4036                         if (!nfsiod_start()) {
4037                                 goto again;
4038                         }
4039                         lck_mtx_lock(nfsiod_mutex);
4040                 }
4041         }
4042
4043         /*
4044          * If we got here while being on the resendq we need to get off. This
4045          * happens when the timer fires and errors out requests from nfs_sigintr
4046          * or we receive a reply (UDP case) while being on the resend queue so
4047          * we're just finishing up and are not going to be resent.
4048          */
4049         lck_mtx_lock(&req->r_mtx);
4050         if (req->r_flags & R_RESENDQ) {
4051                 lck_mtx_lock(&nmp->nm_lock);
4052                 if (req->r_rchain.tqe_next != NFSREQNOLIST) {
4053                         NFS_BIO_DBG("Proccessing async request on resendq. Removing");
4054                         TAILQ_REMOVE(&nmp->nm_resendq, req, r_rchain);
4055                         req->r_rchain.tqe_next = NFSREQNOLIST;
4056                         assert(req->r_refs > 1);
4057                         /* Remove resendq reference */
4058                         req->r_refs--;
4059                 }
4060                 lck_mtx_unlock(&nmp->nm_lock);
4061                 req->r_flags &= ~R_RESENDQ;
4062         }
4063         lck_mtx_unlock(&req->r_mtx);
4064
4065         if (req->r_achain.tqe_next == NFSREQNOLIST) {
4066                 TAILQ_INSERT_TAIL(&nmp->nm_iodq, req, r_achain);
4067         }
4068
4069         /* If this mount doesn't already have an nfsiod working on it... */
4070         if (!nmp->nm_niod) {
4071                 if (niod) { /* give it the nfsiod we just grabbed */
4072                         nmp->nm_niod = niod;
4073                         lck_mtx_unlock(nfsiod_mutex);
4074                         wakeup(niod);
4075                 } else if (nfsiod_thread_count > 0) {
4076                         /* just queue it up on nfsiod mounts queue if needed */
4077                         if (nmp->nm_iodlink.tqe_next == NFSNOLIST) {
4078                                 TAILQ_INSERT_TAIL(&nfsiodmounts, nmp, nm_iodlink);
4079                         }
4080                         lck_mtx_unlock(nfsiod_mutex);
4081                 } else {
4082                         printf("nfs_asyncio(): no nfsiods? %d %d (%d)\n", nfsiod_thread_count, NFSIOD_MAX, started);
4083                         lck_mtx_unlock(nfsiod_mutex);
4084                         /* we have no other option but to be persistent */
4085                         started = 0;
4086                         goto again;
4087                 }
4088         } else {
4089                 lck_mtx_unlock(nfsiod_mutex);
4090         }
4091
4092         FSDBG_BOT(552, nmp, 0, 0, 0);
4093 }
4094
4095 /*
4096  * queue up async I/O request for resend
4097  */
4098 void
4099 nfs_asyncio_resend(struct nfsreq *req)
4100 {
4101         struct nfsmount *nmp = req->r_nmp;
4102
4103         if (nfs_mount_gone(nmp)) {
4104                 return;
4105         }
4106
4107 #if CONFIG_NFS_GSS
4108         nfs_gss_clnt_rpcdone(req);
4109 #endif
4110         lck_mtx_lock(&nmp->nm_lock);
4111         if (!(req->r_flags & R_RESENDQ)) {
4112                 TAILQ_INSERT_TAIL(&nmp->nm_resendq, req, r_rchain);
4113                 req->r_flags |= R_RESENDQ;
4114                 /*
4115                  * We take a reference on this request so that it can't be
4116                  * destroyed while a resend is queued or in progress.
4117                  */
4118                 nfs_request_ref(req, 1);
4119         }
4120         nfs_mount_sock_thread_wake(nmp);
4121         lck_mtx_unlock(&nmp->nm_lock);
4122 }
4123
4124 /*
4125  * Read directory data into a buffer.
4126  *
4127  * Buffer will be filled (unless EOF is hit).
4128  * Buffers after this one may also be completely/partially filled.
4129  */
4130 int
4131 nfs_buf_readdir(struct nfsbuf *bp, vfs_context_t ctx)
4132 {
4133         nfsnode_t np = bp->nb_np;
4134         struct nfsmount *nmp = NFSTONMP(np);
4135         int error = 0;
4136
4137         if (nfs_mount_gone(nmp)) {
4138                 return ENXIO;
4139         }
4140
4141         if (nmp->nm_vers < NFS_VER4) {
4142                 error = nfs3_readdir_rpc(np, bp, ctx);
4143         }
4144 #if CONFIG_NFS4
4145         else {
4146                 error = nfs4_readdir_rpc(np, bp, ctx);
4147         }
4148 #endif
4149         if (error && (error != NFSERR_DIRBUFDROPPED)) {
4150                 SET(bp->nb_flags, NB_ERROR);
4151                 bp->nb_error = error;
4152         }
4153         return error;
4154 }