apple/xnu.git: bsd/nfs/nfs_bio.c (blob cb1f92939b45cba33d036d49f2611754618bce90)
1 /*
2 * Copyright (c) 2000-2015 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29 /*
30 * Copyright (c) 1989, 1993
31 * The Regents of the University of California. All rights reserved.
32 *
33 * This code is derived from software contributed to Berkeley by
34 * Rick Macklem at The University of Guelph.
35 *
36 * Redistribution and use in source and binary forms, with or without
37 * modification, are permitted provided that the following conditions
38 * are met:
39 * 1. Redistributions of source code must retain the above copyright
40 * notice, this list of conditions and the following disclaimer.
41 * 2. Redistributions in binary form must reproduce the above copyright
42 * notice, this list of conditions and the following disclaimer in the
43 * documentation and/or other materials provided with the distribution.
44 * 3. All advertising materials mentioning features or use of this software
45 * must display the following acknowledgement:
46 * This product includes software developed by the University of
47 * California, Berkeley and its contributors.
48 * 4. Neither the name of the University nor the names of its contributors
49 * may be used to endorse or promote products derived from this software
50 * without specific prior written permission.
51 *
52 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
53 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
54 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
55 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
56 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
57 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
58 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
59 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
60 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
61 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
62 * SUCH DAMAGE.
63 *
64 * @(#)nfs_bio.c 8.9 (Berkeley) 3/30/95
65 * FreeBSD-Id: nfs_bio.c,v 1.44 1997/09/10 19:52:25 phk Exp $
66 */
67 #include <sys/param.h>
68 #include <sys/systm.h>
69 #include <sys/resourcevar.h>
70 #include <sys/signalvar.h>
71 #include <sys/proc_internal.h>
72 #include <sys/kauth.h>
73 #include <sys/malloc.h>
74 #include <sys/vnode.h>
75 #include <sys/dirent.h>
76 #include <sys/mount_internal.h>
77 #include <sys/kernel.h>
78 #include <sys/ubc_internal.h>
79 #include <sys/uio_internal.h>
80 #include <sys/kpi_mbuf.h>
81
82 #include <sys/vm.h>
83 #include <sys/vmparam.h>
84
85 #include <sys/time.h>
86 #include <kern/clock.h>
87 #include <libkern/OSAtomic.h>
88 #include <kern/kalloc.h>
89 #include <kern/thread_call.h>
90
91 #include <nfs/rpcv2.h>
92 #include <nfs/nfsproto.h>
93 #include <nfs/nfs.h>
94 #include <nfs/nfs_gss.h>
95 #include <nfs/nfsmount.h>
96 #include <nfs/nfsnode.h>
97 #include <sys/buf_internal.h>
98 #include <libkern/OSAtomic.h>
99
100 #define NFS_BIO_DBG(...) NFS_DBG(NFS_FAC_BIO, 7, ## __VA_ARGS__)
101
102 kern_return_t thread_terminate(thread_t); /* XXX */
103
104 #define NFSBUFHASH(np, lbn) \
105 (&nfsbufhashtbl[((long)(np) / sizeof(*(np)) + (int)(lbn)) & nfsbufhash])
106 LIST_HEAD(nfsbufhashhead, nfsbuf) * nfsbufhashtbl;
107 struct nfsbuffreehead nfsbuffree, nfsbuffreemeta, nfsbufdelwri;
108 u_long nfsbufhash;
109 int nfsbufcnt, nfsbufmin, nfsbufmax, nfsbufmetacnt, nfsbufmetamax;
110 int nfsbuffreecnt, nfsbuffreemetacnt, nfsbufdelwricnt, nfsneedbuffer;
111 int nfs_nbdwrite;
112 int nfs_buf_timer_on = 0;
113 thread_t nfsbufdelwrithd = NULL;
114
115 lck_grp_t *nfs_buf_lck_grp;
116 lck_mtx_t *nfs_buf_mutex;
117
118 #define NFSBUF_FREE_PERIOD 30 /* seconds */
119 #define NFSBUF_LRU_STALE 120
120 #define NFSBUF_META_STALE 240
121
122 /* number of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffree list */
123 #define LRU_TO_FREEUP 6
124 /* number of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffreemeta list */
125 #define META_TO_FREEUP 3
126 /* total number of nfsbufs nfs_buf_freeup() should attempt to free */
127 #define TOTAL_TO_FREEUP (LRU_TO_FREEUP+META_TO_FREEUP)
128 /* fraction of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffree list when called from timer */
129 #define LRU_FREEUP_FRAC_ON_TIMER 8
130 /* fraction of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffreemeta list when called from timer */
131 #define META_FREEUP_FRAC_ON_TIMER 16
132 /* fraction of total nfsbufs that nfsbuffreecnt should exceed before bothering to call nfs_buf_freeup() */
133 #define LRU_FREEUP_MIN_FRAC 4
134 /* fraction of total nfsbufs that nfsbuffreemetacnt should exceed before bothering to call nfs_buf_freeup() */
135 #define META_FREEUP_MIN_FRAC 2
136
137 #define NFS_BUF_FREEUP() \
138 do { \
139 /* only call nfs_buf_freeup() if it has work to do: */ \
140 if (((nfsbuffreecnt > nfsbufcnt/LRU_FREEUP_MIN_FRAC) || \
141 (nfsbuffreemetacnt > nfsbufcnt/META_FREEUP_MIN_FRAC)) && \
142 ((nfsbufcnt - TOTAL_TO_FREEUP) > nfsbufmin)) \
143 nfs_buf_freeup(0); \
144 } while (0)
145
146 /*
147 * Initialize nfsbuf lists
148 */
149 void
150 nfs_nbinit(void)
151 {
152 nfs_buf_lck_grp = lck_grp_alloc_init("nfs_buf", LCK_GRP_ATTR_NULL);
153 nfs_buf_mutex = lck_mtx_alloc_init(nfs_buf_lck_grp, LCK_ATTR_NULL);
154
155 nfsbufcnt = nfsbufmetacnt =
156 nfsbuffreecnt = nfsbuffreemetacnt = nfsbufdelwricnt = 0;
157 nfsbufmin = 128;
158 /* size nfsbufmax to cover at most half sane_size (w/default buf size) */
159 nfsbufmax = (sane_size >> PAGE_SHIFT) / (2 * (NFS_RWSIZE >> PAGE_SHIFT));
160 nfsbufmetamax = nfsbufmax / 4;
161 nfsneedbuffer = 0;
162 nfs_nbdwrite = 0;
163
164 nfsbufhashtbl = hashinit(nfsbufmax / 4, M_TEMP, &nfsbufhash);
165 TAILQ_INIT(&nfsbuffree);
166 TAILQ_INIT(&nfsbuffreemeta);
167 TAILQ_INIT(&nfsbufdelwri);
168 }
169
170 /*
171 * Check periodically for stale/unused nfs bufs
172 */
173 void
174 nfs_buf_timer(__unused void *param0, __unused void *param1)
175 {
176 nfs_buf_freeup(1);
177
178 lck_mtx_lock(nfs_buf_mutex);
179 if (nfsbufcnt <= nfsbufmin) {
180 nfs_buf_timer_on = 0;
181 lck_mtx_unlock(nfs_buf_mutex);
182 return;
183 }
184 lck_mtx_unlock(nfs_buf_mutex);
185
186 nfs_interval_timer_start(nfs_buf_timer_call,
187 NFSBUF_FREE_PERIOD * 1000);
188 }
189
190 /*
191 * try to free up some excess, unused nfsbufs
192 */
193 void
194 nfs_buf_freeup(int timer)
195 {
196 struct nfsbuf *fbp;
197 struct timeval now;
198 int count;
199 struct nfsbuffreehead nfsbuffreeup;
200
201 TAILQ_INIT(&nfsbuffreeup);
202
203 lck_mtx_lock(nfs_buf_mutex);
204
205 microuptime(&now);
206
207 FSDBG(320, nfsbufcnt, nfsbuffreecnt, nfsbuffreemetacnt, 0);
208
209 count = timer ? nfsbuffreecnt / LRU_FREEUP_FRAC_ON_TIMER : LRU_TO_FREEUP;
210 while ((nfsbufcnt > nfsbufmin) && (count-- > 0)) {
211 fbp = TAILQ_FIRST(&nfsbuffree);
212 if (!fbp) {
213 break;
214 }
215 if (fbp->nb_refs) {
216 break;
217 }
218 if (NBUFSTAMPVALID(fbp) &&
219 (fbp->nb_timestamp + (2 * NFSBUF_LRU_STALE)) > now.tv_sec) {
220 break;
221 }
222 nfs_buf_remfree(fbp);
223 /* disassociate buffer from any nfsnode */
224 if (fbp->nb_np) {
225 if (fbp->nb_vnbufs.le_next != NFSNOLIST) {
226 LIST_REMOVE(fbp, nb_vnbufs);
227 fbp->nb_vnbufs.le_next = NFSNOLIST;
228 }
229 fbp->nb_np = NULL;
230 }
231 LIST_REMOVE(fbp, nb_hash);
232 TAILQ_INSERT_TAIL(&nfsbuffreeup, fbp, nb_free);
233 nfsbufcnt--;
234 }
235
236 count = timer ? nfsbuffreemetacnt / META_FREEUP_FRAC_ON_TIMER : META_TO_FREEUP;
237 while ((nfsbufcnt > nfsbufmin) && (count-- > 0)) {
238 fbp = TAILQ_FIRST(&nfsbuffreemeta);
239 if (!fbp) {
240 break;
241 }
242 if (fbp->nb_refs) {
243 break;
244 }
245 if (NBUFSTAMPVALID(fbp) &&
246 (fbp->nb_timestamp + (2 * NFSBUF_META_STALE)) > now.tv_sec) {
247 break;
248 }
249 nfs_buf_remfree(fbp);
250 /* disassociate buffer from any nfsnode */
251 if (fbp->nb_np) {
252 if (fbp->nb_vnbufs.le_next != NFSNOLIST) {
253 LIST_REMOVE(fbp, nb_vnbufs);
254 fbp->nb_vnbufs.le_next = NFSNOLIST;
255 }
256 fbp->nb_np = NULL;
257 }
258 LIST_REMOVE(fbp, nb_hash);
259 TAILQ_INSERT_TAIL(&nfsbuffreeup, fbp, nb_free);
260 nfsbufcnt--;
261 nfsbufmetacnt--;
262 }
263
264 FSDBG(320, nfsbufcnt, nfsbuffreecnt, nfsbuffreemetacnt, 0);
265 NFSBUFCNTCHK();
266
267 lck_mtx_unlock(nfs_buf_mutex);
268
269 while ((fbp = TAILQ_FIRST(&nfsbuffreeup))) {
270 TAILQ_REMOVE(&nfsbuffreeup, fbp, nb_free);
271 /* nuke any creds */
272 if (IS_VALID_CRED(fbp->nb_rcred)) {
273 kauth_cred_unref(&fbp->nb_rcred);
274 }
275 if (IS_VALID_CRED(fbp->nb_wcred)) {
276 kauth_cred_unref(&fbp->nb_wcred);
277 }
278 /* if buf was NB_META, dump buffer */
279 if (ISSET(fbp->nb_flags, NB_META) && fbp->nb_data) {
280 kfree(fbp->nb_data, fbp->nb_bufsize);
281 }
282 FREE(fbp, M_TEMP);
283 }
284 }
285
286 /*
287 * remove a buffer from the freelist
288 * (must be called with nfs_buf_mutex held)
289 */
290 void
291 nfs_buf_remfree(struct nfsbuf *bp)
292 {
293 if (bp->nb_free.tqe_next == NFSNOLIST) {
294 panic("nfsbuf not on free list");
295 }
296 if (ISSET(bp->nb_flags, NB_DELWRI)) {
297 nfsbufdelwricnt--;
298 TAILQ_REMOVE(&nfsbufdelwri, bp, nb_free);
299 } else if (ISSET(bp->nb_flags, NB_META)) {
300 nfsbuffreemetacnt--;
301 TAILQ_REMOVE(&nfsbuffreemeta, bp, nb_free);
302 } else {
303 nfsbuffreecnt--;
304 TAILQ_REMOVE(&nfsbuffree, bp, nb_free);
305 }
306 bp->nb_free.tqe_next = NFSNOLIST;
307 NFSBUFCNTCHK();
308 }
309
310 /*
311 * check for existence of nfsbuf in cache
312 */
313 boolean_t
314 nfs_buf_is_incore(nfsnode_t np, daddr64_t blkno)
315 {
316 boolean_t rv;
317 lck_mtx_lock(nfs_buf_mutex);
318 if (nfs_buf_incore(np, blkno)) {
319 rv = TRUE;
320 } else {
321 rv = FALSE;
322 }
323 lck_mtx_unlock(nfs_buf_mutex);
324 return rv;
325 }
326
327 /*
328 * return incore buffer (must be called with nfs_buf_mutex held)
329 */
330 struct nfsbuf *
331 nfs_buf_incore(nfsnode_t np, daddr64_t blkno)
332 {
333 /* Search hash chain */
334 struct nfsbuf * bp = NFSBUFHASH(np, blkno)->lh_first;
335 for (; bp != NULL; bp = bp->nb_hash.le_next) {
336 if ((bp->nb_lblkno == blkno) && (bp->nb_np == np)) {
337 if (!ISSET(bp->nb_flags, NB_INVAL)) {
338 FSDBG(547, bp, blkno, bp->nb_flags, bp->nb_np);
339 return bp;
340 }
341 }
342 }
343 return NULL;
344 }
345
346 /*
347 * Check if it's OK to drop a page.
348 *
349 * Called by vnode_pager() on pageout request of non-dirty page.
350 * We need to make sure that it's not part of a delayed write.
351 * If it is, we can't let the VM drop it because we may need it
352 * later when/if we need to write the data (again).
353 */
354 int
355 nfs_buf_page_inval(vnode_t vp, off_t offset)
356 {
357 struct nfsmount *nmp = VTONMP(vp);
358 struct nfsbuf *bp;
359 int error = 0;
360
361 if (nfs_mount_gone(nmp)) {
362 return ENXIO;
363 }
364
365 lck_mtx_lock(nfs_buf_mutex);
366 bp = nfs_buf_incore(VTONFS(vp), (daddr64_t)(offset / nmp->nm_biosize));
367 if (!bp) {
368 goto out;
369 }
370 FSDBG(325, bp, bp->nb_flags, bp->nb_dirtyoff, bp->nb_dirtyend);
371 if (ISSET(bp->nb_lflags, NBL_BUSY)) {
372 error = EBUSY;
373 goto out;
374 }
375 /*
376 * If there's a dirty range in the buffer, check to
377 * see if this page intersects with the dirty range.
378 * If it does, we can't let the pager drop the page.
379 */
380 if (bp->nb_dirtyend > 0) {
381 int start = offset - NBOFF(bp);
382 if ((bp->nb_dirtyend > start) &&
383 (bp->nb_dirtyoff < (start + PAGE_SIZE))) {
384 /*
385 * Before returning the bad news, move the
386 * buffer to the start of the delwri list and
387 * give the list a push to try to flush the
388 * buffer out.
389 */
390 error = EBUSY;
391 nfs_buf_remfree(bp);
392 TAILQ_INSERT_HEAD(&nfsbufdelwri, bp, nb_free);
393 nfsbufdelwricnt++;
394 nfs_buf_delwri_push(1);
395 }
396 }
397 out:
398 lck_mtx_unlock(nfs_buf_mutex);
399 return error;
400 }
401
402 /*
403 * set up the UPL for a buffer
404 * (must NOT be called with nfs_buf_mutex held)
405 */
406 int
407 nfs_buf_upl_setup(struct nfsbuf *bp)
408 {
409 kern_return_t kret;
410 upl_t upl;
411 int upl_flags;
412
413 if (ISSET(bp->nb_flags, NB_PAGELIST)) {
414 return 0;
415 }
416
417 upl_flags = UPL_PRECIOUS;
418 if (!ISSET(bp->nb_flags, NB_READ)) {
419 /*
420 * We're doing a "write", so we intend to modify
421 * the pages we're gathering.
422 */
423 upl_flags |= UPL_WILL_MODIFY;
424 }
425 kret = ubc_create_upl_kernel(NFSTOV(bp->nb_np), NBOFF(bp), bp->nb_bufsize,
426 &upl, NULL, upl_flags, VM_KERN_MEMORY_FILE);
427 if (kret == KERN_INVALID_ARGUMENT) {
428 /* vm object probably doesn't exist any more */
429 bp->nb_pagelist = NULL;
430 return EINVAL;
431 }
432 if (kret != KERN_SUCCESS) {
433 printf("nfs_buf_upl_setup(): failed to get pagelist %d\n", kret);
434 bp->nb_pagelist = NULL;
435 return EIO;
436 }
437
438 FSDBG(538, bp, NBOFF(bp), bp->nb_bufsize, bp->nb_np);
439
440 bp->nb_pagelist = upl;
441 SET(bp->nb_flags, NB_PAGELIST);
442 return 0;
443 }
444
445 /*
446 * update buffer's valid/dirty info from UBC
447 * (must NOT be called with nfs_buf_mutex held)
448 */
449 void
450 nfs_buf_upl_check(struct nfsbuf *bp)
451 {
452 upl_page_info_t *pl;
453 off_t filesize, fileoffset;
454 int i, npages;
455
456 if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
457 return;
458 }
459
460 npages = round_page_32(bp->nb_bufsize) / PAGE_SIZE;
461 filesize = ubc_getsize(NFSTOV(bp->nb_np));
462 fileoffset = NBOFF(bp);
463 if (fileoffset < filesize) {
464 SET(bp->nb_flags, NB_CACHE);
465 } else {
466 CLR(bp->nb_flags, NB_CACHE);
467 }
468
469 pl = ubc_upl_pageinfo(bp->nb_pagelist);
470 bp->nb_valid = bp->nb_dirty = 0;
471
472 for (i = 0; i < npages; i++, fileoffset += PAGE_SIZE_64) {
473 /* anything beyond the end of the file is not valid or dirty */
474 if (fileoffset >= filesize) {
475 break;
476 }
477 if (!upl_valid_page(pl, i)) {
478 CLR(bp->nb_flags, NB_CACHE);
479 continue;
480 }
481 NBPGVALID_SET(bp, i);
482 if (upl_dirty_page(pl, i)) {
483 NBPGDIRTY_SET(bp, i);
484 }
485 }
486 fileoffset = NBOFF(bp);
487 if (ISSET(bp->nb_flags, NB_CACHE)) {
488 bp->nb_validoff = 0;
489 bp->nb_validend = bp->nb_bufsize;
490 if (fileoffset + bp->nb_validend > filesize) {
491 bp->nb_validend = filesize - fileoffset;
492 }
493 } else {
494 bp->nb_validoff = bp->nb_validend = -1;
495 }
496 FSDBG(539, bp, fileoffset, bp->nb_valid, bp->nb_dirty);
497 FSDBG(539, bp->nb_validoff, bp->nb_validend, bp->nb_dirtyoff, bp->nb_dirtyend);
498 }
499
500 /*
501 * make sure that a buffer is mapped
502 * (must NOT be called with nfs_buf_mutex held)
503 */
504 int
505 nfs_buf_map(struct nfsbuf *bp)
506 {
507 kern_return_t kret;
508
509 if (bp->nb_data) {
510 return 0;
511 }
512 if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
513 return EINVAL;
514 }
515
516 kret = ubc_upl_map(bp->nb_pagelist, (vm_offset_t *)&(bp->nb_data));
517 if (kret != KERN_SUCCESS) {
518 panic("nfs_buf_map: ubc_upl_map() failed with (%d)", kret);
519 }
520 if (bp->nb_data == 0) {
521 panic("ubc_upl_map mapped 0");
522 }
523 FSDBG(540, bp, bp->nb_flags, NBOFF(bp), bp->nb_data);
524 return 0;
525 }
526
527 /*
528 * normalize an nfsbuf's valid range
529 *
530 * the read/write code guarantees that we'll always have a valid
531 * region that is an integral number of pages. If either end
532 * of the valid range isn't page-aligned, it gets corrected
533 * here as we extend the valid range through all of the
534 * contiguous valid pages.
535 */
536 void
537 nfs_buf_normalize_valid_range(nfsnode_t np, struct nfsbuf *bp)
538 {
539 int pg, npg;
540 /* pull validoff back to start of contiguous valid page range */
541 pg = bp->nb_validoff / PAGE_SIZE;
542 while (pg >= 0 && NBPGVALID(bp, pg)) {
543 pg--;
544 }
545 bp->nb_validoff = (pg + 1) * PAGE_SIZE;
546 /* push validend forward to end of contiguous valid page range */
547 npg = bp->nb_bufsize / PAGE_SIZE;
548 pg = bp->nb_validend / PAGE_SIZE;
549 while (pg < npg && NBPGVALID(bp, pg)) {
550 pg++;
551 }
552 bp->nb_validend = pg * PAGE_SIZE;
553 /* clip to EOF */
554 if (NBOFF(bp) + bp->nb_validend > (off_t)np->n_size) {
555 bp->nb_validend = np->n_size % bp->nb_bufsize;
556 }
557 }
558
559 /*
560 * process some entries on the delayed write queue
561 * (must be called with nfs_buf_mutex held)
562 */
563 void
564 nfs_buf_delwri_service(void)
565 {
566 struct nfsbuf *bp;
567 nfsnode_t np;
568 int error, i = 0;
569
570 while (i < 8 && (bp = TAILQ_FIRST(&nfsbufdelwri)) != NULL) {
571 np = bp->nb_np;
572 nfs_buf_remfree(bp);
573 nfs_buf_refget(bp);
574 while ((error = nfs_buf_acquire(bp, 0, 0, 0)) == EAGAIN) {
575 ;
576 }
577 nfs_buf_refrele(bp);
578 if (error) {
579 break;
580 }
581 if (!bp->nb_np) {
582 /* buffer is no longer valid */
583 nfs_buf_drop(bp);
584 continue;
585 }
586 if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
587 nfs_buf_check_write_verifier(np, bp);
588 }
589 if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
590 /* put buffer at end of delwri list */
591 TAILQ_INSERT_TAIL(&nfsbufdelwri, bp, nb_free);
592 nfsbufdelwricnt++;
593 nfs_buf_drop(bp);
594 lck_mtx_unlock(nfs_buf_mutex);
595 nfs_flushcommits(np, 1);
596 } else {
597 SET(bp->nb_flags, NB_ASYNC);
598 lck_mtx_unlock(nfs_buf_mutex);
599 nfs_buf_write(bp);
600 }
601 i++;
602 lck_mtx_lock(nfs_buf_mutex);
603 }
604 }
605
606 /*
607 * thread to service the delayed write queue when asked
608 */
609 void
610 nfs_buf_delwri_thread(__unused void *arg, __unused wait_result_t wr)
611 {
612 struct timespec ts = { 30, 0 };
613 int error = 0;
614
615 lck_mtx_lock(nfs_buf_mutex);
616 while (!error) {
617 nfs_buf_delwri_service();
618 error = msleep(&nfsbufdelwrithd, nfs_buf_mutex, 0, "nfsbufdelwri", &ts);
619 }
620 nfsbufdelwrithd = NULL;
621 lck_mtx_unlock(nfs_buf_mutex);
622 thread_terminate(nfsbufdelwrithd);
623 }
624
625 /*
626 * try to push out some delayed/uncommitted writes
627 * ("locked" indicates whether nfs_buf_mutex is already held)
628 */
629 void
630 nfs_buf_delwri_push(int locked)
631 {
632 if (TAILQ_EMPTY(&nfsbufdelwri)) {
633 return;
634 }
635 if (!locked) {
636 lck_mtx_lock(nfs_buf_mutex);
637 }
638 /* wake up the delayed write service thread */
639 if (nfsbufdelwrithd) {
640 wakeup(&nfsbufdelwrithd);
641 } else if (kernel_thread_start(nfs_buf_delwri_thread, NULL, &nfsbufdelwrithd) == KERN_SUCCESS) {
642 thread_deallocate(nfsbufdelwrithd);
643 }
644 /* otherwise, try to do some of the work ourselves */
645 if (!nfsbufdelwrithd) {
646 nfs_buf_delwri_service();
647 }
648 if (!locked) {
649 lck_mtx_unlock(nfs_buf_mutex);
650 }
651 }
652
653 /*
654 * Get an nfs buffer.
655 *
656 * Returns errno on error, 0 otherwise.
657 * Any buffer is returned in *bpp.
658 *
659 * If NBLK_ONLYVALID is set, only return buffer if found in cache.
660 * If NBLK_NOWAIT is set, don't wait for the buffer if it's marked BUSY.
661 *
662 * Check for existence of buffer in cache.
663 * Or attempt to reuse a buffer from one of the free lists.
664 * Or allocate a new buffer if we haven't already hit max allocation.
665 * Or wait for a free buffer.
666 *
667 * If available buffer found, prepare it, and return it.
668 *
669 * If the calling process is interrupted by a signal for
670 * an interruptible mount point, return EINTR.
671 */
672 int
673 nfs_buf_get(
674 nfsnode_t np,
675 daddr64_t blkno,
676 uint32_t size,
677 thread_t thd,
678 int flags,
679 struct nfsbuf **bpp)
680 {
681 vnode_t vp = NFSTOV(np);
682 struct nfsmount *nmp = VTONMP(vp);
683 struct nfsbuf *bp;
684 uint32_t bufsize;
685 int slpflag = PCATCH;
686 int operation = (flags & NBLK_OPMASK);
687 int error = 0;
688 struct timespec ts;
689
690 FSDBG_TOP(541, np, blkno, size, flags);
691 *bpp = NULL;
692
693 bufsize = size;
694 if (bufsize > NFS_MAXBSIZE) {
695 panic("nfs_buf_get: buffer larger than NFS_MAXBSIZE requested");
696 }
697
698 if (nfs_mount_gone(nmp)) {
699 FSDBG_BOT(541, np, blkno, 0, ENXIO);
700 return ENXIO;
701 }
702
703 if (!UBCINFOEXISTS(vp)) {
704 operation = NBLK_META;
705 } else if (bufsize < (uint32_t)nmp->nm_biosize) {
706 /* reg files should always have biosize blocks */
707 bufsize = nmp->nm_biosize;
708 }
709
710 /* if NBLK_WRITE, check for too many delayed/uncommitted writes */
711 if ((operation == NBLK_WRITE) && (nfs_nbdwrite > NFS_A_LOT_OF_DELAYED_WRITES)) {
712 FSDBG_TOP(542, np, blkno, nfs_nbdwrite, NFS_A_LOT_OF_DELAYED_WRITES);
713
714 /* poke the delwri list */
715 nfs_buf_delwri_push(0);
716
717 /* sleep to let other threads run... */
718 tsleep(&nfs_nbdwrite, PCATCH, "nfs_nbdwrite", 1);
719 FSDBG_BOT(542, np, blkno, nfs_nbdwrite, NFS_A_LOT_OF_DELAYED_WRITES);
720 }
721
722 loop:
723 lck_mtx_lock(nfs_buf_mutex);
724
725 /* wait for any buffer invalidation/flushing to complete */
726 while (np->n_bflag & NBINVALINPROG) {
727 np->n_bflag |= NBINVALWANT;
728 ts.tv_sec = 2;
729 ts.tv_nsec = 0;
730 msleep(&np->n_bflag, nfs_buf_mutex, slpflag, "nfs_buf_get_invalwait", &ts);
731 if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0))) {
732 lck_mtx_unlock(nfs_buf_mutex);
733 FSDBG_BOT(541, np, blkno, 0, error);
734 return error;
735 }
736 if (np->n_bflag & NBINVALINPROG) {
737 slpflag = 0;
738 }
739 }
740
741 /* check for existence of nfsbuf in cache */
742 if ((bp = nfs_buf_incore(np, blkno))) {
743 /* if busy, set wanted and wait */
744 if (ISSET(bp->nb_lflags, NBL_BUSY)) {
745 if (flags & NBLK_NOWAIT) {
746 lck_mtx_unlock(nfs_buf_mutex);
747 FSDBG_BOT(541, np, blkno, bp, 0xbcbcbcbc);
748 return 0;
749 }
750 FSDBG_TOP(543, np, blkno, bp, bp->nb_flags);
751 SET(bp->nb_lflags, NBL_WANTED);
752
753 ts.tv_sec = 2;
754 ts.tv_nsec = 0;
755 msleep(bp, nfs_buf_mutex, slpflag | (PRIBIO + 1) | PDROP,
756 "nfsbufget", (slpflag == PCATCH) ? NULL : &ts);
757 slpflag = 0;
758 FSDBG_BOT(543, np, blkno, bp, bp->nb_flags);
759 if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0))) {
760 FSDBG_BOT(541, np, blkno, 0, error);
761 return error;
762 }
763 goto loop;
764 }
765 if (bp->nb_bufsize != bufsize) {
766 panic("nfsbuf size mismatch");
767 }
768 SET(bp->nb_lflags, NBL_BUSY);
769 SET(bp->nb_flags, NB_CACHE);
770 nfs_buf_remfree(bp);
771 /* additional paranoia: */
772 if (ISSET(bp->nb_flags, NB_PAGELIST)) {
773 panic("pagelist buffer was not busy");
774 }
775 goto buffer_setup;
776 }
777
778 if (flags & NBLK_ONLYVALID) {
779 lck_mtx_unlock(nfs_buf_mutex);
780 FSDBG_BOT(541, np, blkno, 0, 0x0000cace);
781 return 0;
782 }
783
784 /*
785 * where to get a free buffer:
786 * - if meta and maxmeta reached, must reuse meta
787 * - alloc new if we haven't reached min bufs
788 * - if free lists are NOT empty
789 * - if free list is stale, use it
790 * - else if freemeta list is stale, use it
791 * - else if max bufs allocated, use least-time-to-stale
792 * - alloc new if we haven't reached max allowed
793 * - start clearing out delwri list and try again
794 */
795
796 if ((operation == NBLK_META) && (nfsbufmetacnt >= nfsbufmetamax)) {
797 /* if we've hit max meta buffers, must reuse a meta buffer */
798 bp = TAILQ_FIRST(&nfsbuffreemeta);
799 } else if ((nfsbufcnt > nfsbufmin) &&
800 (!TAILQ_EMPTY(&nfsbuffree) || !TAILQ_EMPTY(&nfsbuffreemeta))) {
801 /* try to pull an nfsbuf off a free list */
802 struct nfsbuf *lrubp, *metabp;
803 struct timeval now;
804 microuptime(&now);
805
806 /* if the next LRU or META buffer is invalid or stale, use it */
807 lrubp = TAILQ_FIRST(&nfsbuffree);
808 if (lrubp && (!NBUFSTAMPVALID(lrubp) ||
809 ((lrubp->nb_timestamp + NFSBUF_LRU_STALE) < now.tv_sec))) {
810 bp = lrubp;
811 }
812 metabp = TAILQ_FIRST(&nfsbuffreemeta);
813 if (!bp && metabp && (!NBUFSTAMPVALID(metabp) ||
814 ((metabp->nb_timestamp + NFSBUF_META_STALE) < now.tv_sec))) {
815 bp = metabp;
816 }
817
818 if (!bp && (nfsbufcnt >= nfsbufmax)) {
819 /* we've already allocated all bufs, so */
820 /* choose the buffer that'll go stale first */
821 if (!metabp) {
822 bp = lrubp;
823 } else if (!lrubp) {
824 bp = metabp;
825 } else {
826 int32_t lru_stale_time, meta_stale_time;
827 lru_stale_time = lrubp->nb_timestamp + NFSBUF_LRU_STALE;
828 meta_stale_time = metabp->nb_timestamp + NFSBUF_META_STALE;
829 if (lru_stale_time <= meta_stale_time) {
830 bp = lrubp;
831 } else {
832 bp = metabp;
833 }
834 }
835 }
836 }
837
838 if (bp) {
839 /* we have a buffer to reuse */
840 FSDBG(544, np, blkno, bp, bp->nb_flags);
841 nfs_buf_remfree(bp);
842 if (ISSET(bp->nb_flags, NB_DELWRI)) {
843 panic("nfs_buf_get: delwri");
844 }
845 SET(bp->nb_lflags, NBL_BUSY);
846 /* disassociate buffer from previous nfsnode */
847 if (bp->nb_np) {
848 if (bp->nb_vnbufs.le_next != NFSNOLIST) {
849 LIST_REMOVE(bp, nb_vnbufs);
850 bp->nb_vnbufs.le_next = NFSNOLIST;
851 }
852 bp->nb_np = NULL;
853 }
854 LIST_REMOVE(bp, nb_hash);
855 /* nuke any creds we're holding */
856 if (IS_VALID_CRED(bp->nb_rcred)) {
857 kauth_cred_unref(&bp->nb_rcred);
858 }
859 if (IS_VALID_CRED(bp->nb_wcred)) {
860 kauth_cred_unref(&bp->nb_wcred);
861 }
862 /* if buf will no longer be NB_META, dump old buffer */
863 if (operation == NBLK_META) {
864 if (!ISSET(bp->nb_flags, NB_META)) {
865 nfsbufmetacnt++;
866 }
867 } else if (ISSET(bp->nb_flags, NB_META)) {
868 if (bp->nb_data) {
869 kfree(bp->nb_data, bp->nb_bufsize);
870 bp->nb_data = NULL;
871 }
872 nfsbufmetacnt--;
873 }
874 /* re-init buf fields */
875 bp->nb_error = 0;
876 bp->nb_validoff = bp->nb_validend = -1;
877 bp->nb_dirtyoff = bp->nb_dirtyend = 0;
878 bp->nb_valid = 0;
879 bp->nb_dirty = 0;
880 bp->nb_verf = 0;
881 } else {
882 /* no buffer to reuse */
883 if ((nfsbufcnt < nfsbufmax) &&
884 ((operation != NBLK_META) || (nfsbufmetacnt < nfsbufmetamax))) {
885 /* just alloc a new one */
886 MALLOC(bp, struct nfsbuf *, sizeof(struct nfsbuf), M_TEMP, M_WAITOK);
887 if (!bp) {
888 lck_mtx_unlock(nfs_buf_mutex);
889 FSDBG_BOT(541, np, blkno, 0, error);
890 return ENOMEM;
891 }
892 nfsbufcnt++;
893
894 /*
895 * If any excess bufs, make sure the timer
896 * is running to free them up later.
897 */
898 if (nfsbufcnt > nfsbufmin && !nfs_buf_timer_on) {
899 nfs_buf_timer_on = 1;
900 nfs_interval_timer_start(nfs_buf_timer_call,
901 NFSBUF_FREE_PERIOD * 1000);
902 }
903
904 if (operation == NBLK_META) {
905 nfsbufmetacnt++;
906 }
907 NFSBUFCNTCHK();
908 /* init nfsbuf */
909 bzero(bp, sizeof(*bp));
910 bp->nb_free.tqe_next = NFSNOLIST;
911 bp->nb_validoff = bp->nb_validend = -1;
912 FSDBG(545, np, blkno, bp, 0);
913 } else {
914 /* too many bufs... wait for buffers to free up */
915 FSDBG_TOP(546, np, blkno, nfsbufcnt, nfsbufmax);
916
917 /* poke the delwri list */
918 nfs_buf_delwri_push(1);
919
920 nfsneedbuffer = 1;
921 msleep(&nfsneedbuffer, nfs_buf_mutex, PCATCH | PDROP, "nfsbufget", NULL);
922 FSDBG_BOT(546, np, blkno, nfsbufcnt, nfsbufmax);
923 if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0))) {
924 FSDBG_BOT(541, np, blkno, 0, error);
925 return error;
926 }
927 goto loop;
928 }
929 }
930
931 /* set up nfsbuf */
932 SET(bp->nb_lflags, NBL_BUSY);
933 bp->nb_flags = 0;
934 bp->nb_lblkno = blkno;
935 /* insert buf in hash */
936 LIST_INSERT_HEAD(NFSBUFHASH(np, blkno), bp, nb_hash);
937 /* associate buffer with new nfsnode */
938 bp->nb_np = np;
939 LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
940
941 buffer_setup:
942
943 /* unlock hash */
944 lck_mtx_unlock(nfs_buf_mutex);
945
946 switch (operation) {
947 case NBLK_META:
948 SET(bp->nb_flags, NB_META);
949 if ((bp->nb_bufsize != bufsize) && bp->nb_data) {
950 kfree(bp->nb_data, bp->nb_bufsize);
951 bp->nb_data = NULL;
952 bp->nb_validoff = bp->nb_validend = -1;
953 bp->nb_dirtyoff = bp->nb_dirtyend = 0;
954 bp->nb_valid = 0;
955 bp->nb_dirty = 0;
956 CLR(bp->nb_flags, NB_CACHE);
957 }
958 if (!bp->nb_data) {
959 bp->nb_data = kalloc(bufsize);
960 }
961 if (!bp->nb_data) {
962 /* Ack! couldn't allocate the data buffer! */
963 /* clean up buffer and return error */
964 lck_mtx_lock(nfs_buf_mutex);
965 LIST_REMOVE(bp, nb_vnbufs);
966 bp->nb_vnbufs.le_next = NFSNOLIST;
967 bp->nb_np = NULL;
968 /* invalidate usage timestamp to allow immediate freeing */
969 NBUFSTAMPINVALIDATE(bp);
970 if (bp->nb_free.tqe_next != NFSNOLIST) {
971 panic("nfsbuf on freelist");
972 }
973 TAILQ_INSERT_HEAD(&nfsbuffree, bp, nb_free);
974 nfsbuffreecnt++;
975 lck_mtx_unlock(nfs_buf_mutex);
976 FSDBG_BOT(541, np, blkno, 0xb00, ENOMEM);
977 return ENOMEM;
978 }
979 bp->nb_bufsize = bufsize;
980 break;
981
982 case NBLK_READ:
983 case NBLK_WRITE:
984 /*
985 * Set or clear NB_READ now to let the UPL subsystem know
986 * if we intend to modify the pages or not.
987 */
988 if (operation == NBLK_READ) {
989 SET(bp->nb_flags, NB_READ);
990 } else {
991 CLR(bp->nb_flags, NB_READ);
992 }
993 if (bufsize < PAGE_SIZE) {
994 bufsize = PAGE_SIZE;
995 }
996 bp->nb_bufsize = bufsize;
997 bp->nb_validoff = bp->nb_validend = -1;
998
999 if (UBCINFOEXISTS(vp)) {
1000 /* set up upl */
1001 if (nfs_buf_upl_setup(bp)) {
1002 /* unable to create upl */
1003 /* vm object must no longer exist */
1004 /* clean up buffer and return error */
1005 lck_mtx_lock(nfs_buf_mutex);
1006 LIST_REMOVE(bp, nb_vnbufs);
1007 bp->nb_vnbufs.le_next = NFSNOLIST;
1008 bp->nb_np = NULL;
1009 /* invalidate usage timestamp to allow immediate freeing */
1010 NBUFSTAMPINVALIDATE(bp);
1011 if (bp->nb_free.tqe_next != NFSNOLIST) {
1012 panic("nfsbuf on freelist");
1013 }
1014 TAILQ_INSERT_HEAD(&nfsbuffree, bp, nb_free);
1015 nfsbuffreecnt++;
1016 lck_mtx_unlock(nfs_buf_mutex);
1017 FSDBG_BOT(541, np, blkno, 0x2bc, EIO);
1018 return EIO;
1019 }
1020 nfs_buf_upl_check(bp);
1021 }
1022 break;
1023
1024 default:
1025 panic("nfs_buf_get: %d unknown operation", operation);
1026 }
1027
1028 *bpp = bp;
1029
1030 FSDBG_BOT(541, np, blkno, bp, bp->nb_flags);
1031
1032 return 0;
1033 }
1034
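/*
 * Release a buffer: commit or abort its UPL pages as appropriate,
 * return it to the free, freemeta, or delwri list, and wake up any
 * waiters.  If "freeup" is set, excess buffers may then be freed
 * via NFS_BUF_FREEUP().
 * (must NOT be called with nfs_buf_mutex held)
 */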
1035 void
1036 nfs_buf_release(struct nfsbuf *bp, int freeup)
1037 {
1038 nfsnode_t np = bp->nb_np;
1039 vnode_t vp;
1040 struct timeval now;
1041 int wakeup_needbuffer, wakeup_buffer, wakeup_nbdwrite;
1042
1043 FSDBG_TOP(548, bp, NBOFF(bp), bp->nb_flags, bp->nb_data);
1044 FSDBG(548, bp->nb_validoff, bp->nb_validend, bp->nb_dirtyoff, bp->nb_dirtyend);
1045 FSDBG(548, bp->nb_valid, 0, bp->nb_dirty, 0);
1046
1047 vp = np ? NFSTOV(np) : NULL;
1048 if (vp && UBCINFOEXISTS(vp) && bp->nb_bufsize) {
1049 int upl_flags, rv;
1050 upl_t upl;
1051 uint32_t i;
1052
1053 if (!ISSET(bp->nb_flags, NB_PAGELIST) && !ISSET(bp->nb_flags, NB_INVAL)) {
1054 rv = nfs_buf_upl_setup(bp);
1055 if (rv) {
1056 printf("nfs_buf_release: upl create failed %d\n", rv);
1057 } else {
1058 nfs_buf_upl_check(bp);
1059 }
1060 }
1061 upl = bp->nb_pagelist;
1062 if (!upl) {
1063 goto pagelist_cleanup_done;
1064 }
1065 if (bp->nb_data) {
1066 if (ubc_upl_unmap(upl) != KERN_SUCCESS) {
1067 panic("ubc_upl_unmap failed");
1068 }
1069 bp->nb_data = NULL;
1070 }
1071 /*
1072 * Abort the pages on error or: if this is an invalid or
1073 * non-needcommit nocache buffer AND no pages are dirty.
1074 */
1075 if (ISSET(bp->nb_flags, NB_ERROR) || (!bp->nb_dirty && (ISSET(bp->nb_flags, NB_INVAL) ||
1076 (ISSET(bp->nb_flags, NB_NOCACHE) && !ISSET(bp->nb_flags, (NB_NEEDCOMMIT | NB_DELWRI)))))) {
1077 if (ISSET(bp->nb_flags, (NB_READ | NB_INVAL | NB_NOCACHE))) {
1078 upl_flags = UPL_ABORT_DUMP_PAGES;
1079 } else {
1080 upl_flags = 0;
1081 }
1082 ubc_upl_abort(upl, upl_flags);
1083 goto pagelist_cleanup_done;
1084 }
1085 for (i = 0; i <= (bp->nb_bufsize - 1) / PAGE_SIZE; i++) {
1086 if (!NBPGVALID(bp, i)) {
1087 ubc_upl_abort_range(upl,
1088 i * PAGE_SIZE, PAGE_SIZE,
1089 UPL_ABORT_DUMP_PAGES |
1090 UPL_ABORT_FREE_ON_EMPTY);
1091 } else {
1092 if (NBPGDIRTY(bp, i)) {
1093 upl_flags = UPL_COMMIT_SET_DIRTY;
1094 } else {
1095 upl_flags = UPL_COMMIT_CLEAR_DIRTY;
1096 }
1097
1098 if (!ISSET(bp->nb_flags, (NB_NEEDCOMMIT | NB_DELWRI))) {
1099 upl_flags |= UPL_COMMIT_CLEAR_PRECIOUS;
1100 }
1101
1102 ubc_upl_commit_range(upl,
1103 i * PAGE_SIZE, PAGE_SIZE,
1104 upl_flags |
1105 UPL_COMMIT_INACTIVATE |
1106 UPL_COMMIT_FREE_ON_EMPTY);
1107 }
1108 }
1109 pagelist_cleanup_done:
1110 /* invalidate any pages past EOF */
1111 if (NBOFF(bp) + bp->nb_bufsize > (off_t)(np->n_size)) {
1112 off_t start, end;
1113 start = trunc_page_64(np->n_size) + PAGE_SIZE_64;
1114 end = trunc_page_64(NBOFF(bp) + bp->nb_bufsize);
1115 if (start < NBOFF(bp)) {
1116 start = NBOFF(bp);
1117 }
1118 if (end > start) {
1119 if ((rv = ubc_msync(vp, start, end, NULL, UBC_INVALIDATE))) {
1120 printf("nfs_buf_release(): ubc_msync failed!, error %d\n", rv);
1121 }
1122 }
1123 }
1124 CLR(bp->nb_flags, NB_PAGELIST);
1125 bp->nb_pagelist = NULL;
1126 }
1127
1128 lck_mtx_lock(nfs_buf_mutex);
1129
1130 wakeup_needbuffer = wakeup_buffer = wakeup_nbdwrite = 0;
1131
1132 /* Wake up any processes waiting for any buffer to become free. */
1133 if (nfsneedbuffer) {
1134 nfsneedbuffer = 0;
1135 wakeup_needbuffer = 1;
1136 }
1137 /* Wake up any processes waiting for _this_ buffer to become free. */
1138 if (ISSET(bp->nb_lflags, NBL_WANTED)) {
1139 CLR(bp->nb_lflags, NBL_WANTED);
1140 wakeup_buffer = 1;
1141 }
1142
1143 /* If it's non-needcommit nocache, or an error, mark it invalid. */
1144 if (ISSET(bp->nb_flags, NB_ERROR) ||
1145 (ISSET(bp->nb_flags, NB_NOCACHE) && !ISSET(bp->nb_flags, (NB_NEEDCOMMIT | NB_DELWRI)))) {
1146 SET(bp->nb_flags, NB_INVAL);
1147 }
1148
1149 if ((bp->nb_bufsize <= 0) || ISSET(bp->nb_flags, NB_INVAL)) {
1150 /* If it's invalid or empty, dissociate it from its nfsnode */
1151 if (bp->nb_vnbufs.le_next != NFSNOLIST) {
1152 LIST_REMOVE(bp, nb_vnbufs);
1153 bp->nb_vnbufs.le_next = NFSNOLIST;
1154 }
1155 bp->nb_np = NULL;
1156 /* if this was a delayed write, wakeup anyone */
1157 /* waiting for delayed writes to complete */
1158 if (ISSET(bp->nb_flags, NB_DELWRI)) {
1159 CLR(bp->nb_flags, NB_DELWRI);
1160 nfs_nbdwrite--;
1161 NFSBUFCNTCHK();
1162 wakeup_nbdwrite = 1;
1163 }
1164 /* invalidate usage timestamp to allow immediate freeing */
1165 NBUFSTAMPINVALIDATE(bp);
1166 /* put buffer at head of free list */
1167 if (bp->nb_free.tqe_next != NFSNOLIST) {
1168 panic("nfsbuf on freelist");
1169 }
1170 SET(bp->nb_flags, NB_INVAL);
1171 if (ISSET(bp->nb_flags, NB_META)) {
1172 TAILQ_INSERT_HEAD(&nfsbuffreemeta, bp, nb_free);
1173 nfsbuffreemetacnt++;
1174 } else {
1175 TAILQ_INSERT_HEAD(&nfsbuffree, bp, nb_free);
1176 nfsbuffreecnt++;
1177 }
1178 } else if (ISSET(bp->nb_flags, NB_DELWRI)) {
1179 /* put buffer at end of delwri list */
1180 if (bp->nb_free.tqe_next != NFSNOLIST) {
1181 panic("nfsbuf on freelist");
1182 }
1183 TAILQ_INSERT_TAIL(&nfsbufdelwri, bp, nb_free);
1184 nfsbufdelwricnt++;
1185 freeup = 0;
1186 } else {
1187 /* update usage timestamp */
1188 microuptime(&now);
1189 bp->nb_timestamp = now.tv_sec;
1190 /* put buffer at end of free list */
1191 if (bp->nb_free.tqe_next != NFSNOLIST) {
1192 panic("nfsbuf on freelist");
1193 }
1194 if (ISSET(bp->nb_flags, NB_META)) {
1195 TAILQ_INSERT_TAIL(&nfsbuffreemeta, bp, nb_free);
1196 nfsbuffreemetacnt++;
1197 } else {
1198 TAILQ_INSERT_TAIL(&nfsbuffree, bp, nb_free);
1199 nfsbuffreecnt++;
1200 }
1201 }
1202
1203 NFSBUFCNTCHK();
1204
1205 /* Unlock the buffer. */
1206 CLR(bp->nb_flags, (NB_ASYNC | NB_STABLE));
1207 CLR(bp->nb_lflags, NBL_BUSY);
1208
1209 FSDBG_BOT(548, bp, NBOFF(bp), bp->nb_flags, bp->nb_data);
1210
1211 lck_mtx_unlock(nfs_buf_mutex);
1212
1213 if (wakeup_needbuffer) {
1214 wakeup(&nfsneedbuffer);
1215 }
1216 if (wakeup_buffer) {
1217 wakeup(bp);
1218 }
1219 if (wakeup_nbdwrite) {
1220 wakeup(&nfs_nbdwrite);
1221 }
1222 if (freeup) {
1223 NFS_BUF_FREEUP();
1224 }
1225 }
1226
1227 /*
1228 * Wait for operations on the buffer to complete.
1229 * When they do, extract and return the I/O's error value.
1230 */
1231 int
1232 nfs_buf_iowait(struct nfsbuf *bp)
1233 {
1234 FSDBG_TOP(549, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);
1235
1236 lck_mtx_lock(nfs_buf_mutex);
1237
1238 while (!ISSET(bp->nb_flags, NB_DONE)) {
1239 msleep(bp, nfs_buf_mutex, PRIBIO + 1, "nfs_buf_iowait", NULL);
1240 }
1241
1242 lck_mtx_unlock(nfs_buf_mutex);
1243
1244 FSDBG_BOT(549, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);
1245
1246 /* check for interruption of I/O, then errors. */
1247 if (ISSET(bp->nb_flags, NB_EINTR)) {
1248 CLR(bp->nb_flags, NB_EINTR);
1249 return EINTR;
1250 } else if (ISSET(bp->nb_flags, NB_ERROR)) {
1251 return bp->nb_error ? bp->nb_error : EIO;
1252 }
1253 return 0;
1254 }
1255
1256 /*
1257 * Mark I/O complete on a buffer.
1258 */
1259 void
1260 nfs_buf_iodone(struct nfsbuf *bp)
1261 {
1262 FSDBG_TOP(550, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);
1263
1264 if (ISSET(bp->nb_flags, NB_DONE)) {
1265 panic("nfs_buf_iodone already");
1266 }
1267
1268 if (!ISSET(bp->nb_flags, NB_READ)) {
1269 CLR(bp->nb_flags, NB_WRITEINPROG);
1270 /*
1271 * vnode_writedone() takes care of waking up
1272 * any throttled write operations
1273 */
1274 vnode_writedone(NFSTOV(bp->nb_np));
1275 nfs_node_lock_force(bp->nb_np);
1276 bp->nb_np->n_numoutput--;
1277 nfs_node_unlock(bp->nb_np);
1278 }
1279 if (ISSET(bp->nb_flags, NB_ASYNC)) { /* if async, release it */
1280 SET(bp->nb_flags, NB_DONE); /* note that it's done */
1281 nfs_buf_release(bp, 1);
1282 } else { /* or just wakeup the buffer */
1283 lck_mtx_lock(nfs_buf_mutex);
1284 SET(bp->nb_flags, NB_DONE); /* note that it's done */
1285 CLR(bp->nb_lflags, NBL_WANTED);
1286 lck_mtx_unlock(nfs_buf_mutex);
1287 wakeup(bp);
1288 }
1289
1290 FSDBG_BOT(550, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);
1291 }
1292
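/*
 * Mark a buffer for delayed writing: move it to its node's dirty list
 * and let the write happen later.  If there are already too many
 * delayed writes pending, fall back to issuing an async write now.
 * (must NOT be called with nfs_buf_mutex held)
 */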
1293 void
1294 nfs_buf_write_delayed(struct nfsbuf *bp)
1295 {
1296 nfsnode_t np = bp->nb_np;
1297
1298 FSDBG_TOP(551, bp, NBOFF(bp), bp->nb_flags, 0);
1299 FSDBG(551, bp, bp->nb_dirtyoff, bp->nb_dirtyend, bp->nb_dirty);
1300
1301 /*
1302 * If the block hasn't been seen before:
1303 * (1) Mark it as having been seen,
1304 * (2) Make sure it's on its node's correct block list,
1305 */
1306 if (!ISSET(bp->nb_flags, NB_DELWRI)) {
1307 SET(bp->nb_flags, NB_DELWRI);
1308 /* move to dirty list */
1309 lck_mtx_lock(nfs_buf_mutex);
1310 nfs_nbdwrite++;
1311 NFSBUFCNTCHK();
1312 if (bp->nb_vnbufs.le_next != NFSNOLIST) {
1313 LIST_REMOVE(bp, nb_vnbufs);
1314 }
1315 LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
1316 lck_mtx_unlock(nfs_buf_mutex);
1317 }
1318
1319 /*
1320  * If the vnode has "too many" write operations in progress,
1321  * wait for them to finish their I/O
1322 */
1323 vnode_waitforwrites(NFSTOV(np), VNODE_ASYNC_THROTTLE, 0, 0, "nfs_buf_write_delayed");
1324
1325 /* the file is in a modified state, so make sure the flag's set */
1326 nfs_node_lock_force(np);
1327 np->n_flag |= NMODIFIED;
1328 nfs_node_unlock(np);
1329
1330 /*
1331 * If we have too many delayed write buffers,
1332 * just fall back to doing the async write.
1333 */
1334 if (nfs_nbdwrite < 0) {
1335 panic("nfs_buf_write_delayed: Negative nfs_nbdwrite");
1336 }
1337 if (nfs_nbdwrite > NFS_A_LOT_OF_DELAYED_WRITES) {
1338 /* issue async write */
1339 SET(bp->nb_flags, NB_ASYNC);
1340 nfs_buf_write(bp);
1341 FSDBG_BOT(551, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);
1342 return;
1343 }
1344
1345 /* Otherwise, the "write" is done, so mark and release the buffer. */
1346 SET(bp->nb_flags, NB_DONE);
1347 nfs_buf_release(bp, 1);
1348 FSDBG_BOT(551, bp, NBOFF(bp), bp->nb_flags, 0);
1349 return;
1350 }
1351
1352 /*
1353 * Check that a "needcommit" buffer can still be committed.
1354  * If the write verifier has changed, we need to clear
1355  * the needcommit flag.
1356 */
1357 void
1358 nfs_buf_check_write_verifier(nfsnode_t np, struct nfsbuf *bp)
1359 {
1360 struct nfsmount *nmp;
1361
1362 if (!ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
1363 return;
1364 }
1365
1366 nmp = NFSTONMP(np);
1367 if (nfs_mount_gone(nmp)) {
1368 return;
1369 }
1370 if (!ISSET(bp->nb_flags, NB_STALEWVERF) && (bp->nb_verf == nmp->nm_verf)) {
1371 return;
1372 }
1373
1374 /* write verifier changed, clear commit/wverf flags */
1375 CLR(bp->nb_flags, (NB_NEEDCOMMIT | NB_STALEWVERF));
1376 bp->nb_verf = 0;
1377 nfs_node_lock_force(np);
1378 np->n_needcommitcnt--;
1379 CHECK_NEEDCOMMITCNT(np);
1380 nfs_node_unlock(np);
1381 }
1382
1383 /*
1384 * add a reference to a buffer so it doesn't disappear while being used
1385 * (must be called with nfs_buf_mutex held)
1386 */
1387 void
1388 nfs_buf_refget(struct nfsbuf *bp)
1389 {
1390 bp->nb_refs++;
1391 }
1392 /*
1393 * release a reference on a buffer
1394 * (must be called with nfs_buf_mutex held)
1395 */
1396 void
1397 nfs_buf_refrele(struct nfsbuf *bp)
1398 {
1399 bp->nb_refs--;
1400 }
1401
1402 /*
1403 * mark a particular buffer as BUSY
1404 * (must be called with nfs_buf_mutex held)
1405 */
1406 errno_t
1407 nfs_buf_acquire(struct nfsbuf *bp, int flags, int slpflag, int slptimeo)
1408 {
1409 errno_t error;
1410 struct timespec ts;
1411
1412 if (ISSET(bp->nb_lflags, NBL_BUSY)) {
1413 /*
1414 * since the lck_mtx_lock may block, the buffer
1415 * may become BUSY, so we need to recheck for
1416 * a NOWAIT request
1417 */
1418 if (flags & NBAC_NOWAIT) {
1419 return EBUSY;
1420 }
1421 SET(bp->nb_lflags, NBL_WANTED);
1422
1423 ts.tv_sec = (slptimeo / 100);
1424                  /* the hz value is 100, so each tick is 10ms; convert remaining ticks to nsec */
1425 ts.tv_nsec = (slptimeo % 100) * 10 * NSEC_PER_USEC * 1000;
1426
1427 error = msleep(bp, nfs_buf_mutex, slpflag | (PRIBIO + 1),
1428 "nfs_buf_acquire", &ts);
1429 if (error) {
1430 return error;
1431 }
1432 return EAGAIN;
1433 }
1434 if (flags & NBAC_REMOVE) {
1435 nfs_buf_remfree(bp);
1436 }
1437 SET(bp->nb_lflags, NBL_BUSY);
1438
1439 return 0;
1440 }
1441
1442 /*
1443 * simply drop the BUSY status of a buffer
1444 * (must be called with nfs_buf_mutex held)
1445 */
1446 void
1447 nfs_buf_drop(struct nfsbuf *bp)
1448 {
1449 int need_wakeup = 0;
1450
1451 if (!ISSET(bp->nb_lflags, NBL_BUSY)) {
1452 panic("nfs_buf_drop: buffer not busy!");
1453 }
1454 if (ISSET(bp->nb_lflags, NBL_WANTED)) {
1455 /* delay the actual wakeup until after we clear NBL_BUSY */
1456 need_wakeup = 1;
1457 }
1458 /* Unlock the buffer. */
1459 CLR(bp->nb_lflags, (NBL_BUSY | NBL_WANTED));
1460
1461 if (need_wakeup) {
1462 wakeup(bp);
1463 }
1464 }
1465
1466 /*
1467 * prepare for iterating over an nfsnode's buffer list
1468 * this lock protects the queue manipulation
1469 * (must be called with nfs_buf_mutex held)
1470 */
1471 int
1472 nfs_buf_iterprepare(nfsnode_t np, struct nfsbuflists *iterheadp, int flags)
1473 {
1474 struct nfsbuflists *listheadp;
1475
1476 if (flags & NBI_DIRTY) {
1477 listheadp = &np->n_dirtyblkhd;
1478 } else {
1479 listheadp = &np->n_cleanblkhd;
1480 }
1481
1482 if ((flags & NBI_NOWAIT) && (np->n_bufiterflags & NBI_ITER)) {
1483 LIST_INIT(iterheadp);
1484 return EWOULDBLOCK;
1485 }
1486
1487 while (np->n_bufiterflags & NBI_ITER) {
1488 np->n_bufiterflags |= NBI_ITERWANT;
1489 msleep(&np->n_bufiterflags, nfs_buf_mutex, 0, "nfs_buf_iterprepare", NULL);
1490 }
1491 if (LIST_EMPTY(listheadp)) {
1492 LIST_INIT(iterheadp);
1493 return EINVAL;
1494 }
1495 np->n_bufiterflags |= NBI_ITER;
1496
1497 iterheadp->lh_first = listheadp->lh_first;
1498 listheadp->lh_first->nb_vnbufs.le_prev = &iterheadp->lh_first;
1499 LIST_INIT(listheadp);
1500
1501 return 0;
1502 }
1503
1504 /*
1505 * clean up after iterating over an nfsnode's buffer list
1506 * this lock protects the queue manipulation
1507 * (must be called with nfs_buf_mutex held)
1508 */
1509 void
1510 nfs_buf_itercomplete(nfsnode_t np, struct nfsbuflists *iterheadp, int flags)
1511 {
1512 struct nfsbuflists * listheadp;
1513 struct nfsbuf *bp;
1514
1515 if (flags & NBI_DIRTY) {
1516 listheadp = &np->n_dirtyblkhd;
1517 } else {
1518 listheadp = &np->n_cleanblkhd;
1519 }
1520
1521 while (!LIST_EMPTY(iterheadp)) {
1522 bp = LIST_FIRST(iterheadp);
1523 LIST_REMOVE(bp, nb_vnbufs);
1524 LIST_INSERT_HEAD(listheadp, bp, nb_vnbufs);
1525 }
1526
1527 np->n_bufiterflags &= ~NBI_ITER;
1528 if (np->n_bufiterflags & NBI_ITERWANT) {
1529 np->n_bufiterflags &= ~NBI_ITERWANT;
1530 wakeup(&np->n_bufiterflags);
1531 }
1532 }
1533
1534
1535 /*
1536 * Read an NFS buffer for a file.
1537 */
1538 int
1539 nfs_buf_read(struct nfsbuf *bp)
1540 {
1541 int error = 0;
1542 nfsnode_t np;
1543 thread_t thd;
1544 kauth_cred_t cred;
1545
1546 np = bp->nb_np;
1547 cred = bp->nb_rcred;
1548 if (IS_VALID_CRED(cred)) {
1549 kauth_cred_ref(cred);
1550 }
1551 thd = ISSET(bp->nb_flags, NB_ASYNC) ? NULL : current_thread();
1552
1553 /* sanity checks */
1554 if (!ISSET(bp->nb_flags, NB_READ)) {
1555 panic("nfs_buf_read: !NB_READ");
1556 }
1557 if (ISSET(bp->nb_flags, NB_DONE)) {
1558 CLR(bp->nb_flags, NB_DONE);
1559 }
1560
1561 NFS_BUF_MAP(bp);
1562
1563 OSAddAtomic64(1, &nfsstats.read_bios);
1564
1565 error = nfs_buf_read_rpc(bp, thd, cred);
1566 /*
1567 * For async I/O, the callbacks will finish up the
1568 * read. Otherwise, the read has already been finished.
1569 */
1570
1571 if (IS_VALID_CRED(cred)) {
1572 kauth_cred_unref(&cred);
1573 }
1574 return error;
1575 }
1576
1577 /*
1578 * finish the reading of a buffer
1579 */
1580 void
1581 nfs_buf_read_finish(struct nfsbuf *bp)
1582 {
1583 nfsnode_t np = bp->nb_np;
1584 struct nfsmount *nmp;
1585
1586 if (!ISSET(bp->nb_flags, NB_ERROR)) {
1587 /* update valid range */
1588 bp->nb_validoff = 0;
1589 bp->nb_validend = bp->nb_endio;
1590 if (bp->nb_endio < (int)bp->nb_bufsize) {
1591 /*
1592 * The read may be short because we have unflushed writes
1593 * that are extending the file size and the reads hit the
1594 * (old) EOF on the server. So, just make sure nb_validend
1595 * correctly tracks EOF.
1596 * Note that the missing data should have already been zeroed
1597 * in nfs_buf_read_rpc_finish().
1598 */
1599 off_t boff = NBOFF(bp);
1600 if ((off_t)np->n_size >= (boff + bp->nb_bufsize)) {
1601 bp->nb_validend = bp->nb_bufsize;
1602 } else if ((off_t)np->n_size >= boff) {
1603 bp->nb_validend = np->n_size - boff;
1604 } else {
1605 bp->nb_validend = 0;
1606 }
1607 }
1608 if ((nmp = NFSTONMP(np)) && (nmp->nm_vers == NFS_VER2) &&
1609 ((NBOFF(bp) + bp->nb_validend) > 0x100000000LL)) {
1610 bp->nb_validend = 0x100000000LL - NBOFF(bp);
1611 }
1612 bp->nb_valid = (1 << (round_page_32(bp->nb_validend) / PAGE_SIZE)) - 1;
1613 if (bp->nb_validend & PAGE_MASK) {
1614 /* zero-fill remainder of last page */
1615 bzero(bp->nb_data + bp->nb_validend, PAGE_SIZE - (bp->nb_validend & PAGE_MASK));
1616 }
1617 }
1618 nfs_buf_iodone(bp);
1619 }
1620
1621 /*
1622 * initiate the NFS READ RPC(s) for a buffer
1623 */
1624 int
1625 nfs_buf_read_rpc(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred)
1626 {
1627 struct nfsmount *nmp;
1628 nfsnode_t np = bp->nb_np;
1629 int error = 0, nfsvers, async;
1630 int offset, nrpcs;
1631 uint32_t nmrsize, length, len;
1632 off_t boff;
1633 struct nfsreq *req;
1634 struct nfsreq_cbinfo cb;
1635
1636 nmp = NFSTONMP(np);
1637 if (nfs_mount_gone(nmp)) {
1638 bp->nb_error = error = ENXIO;
1639 SET(bp->nb_flags, NB_ERROR);
1640 nfs_buf_iodone(bp);
1641 return error;
1642 }
1643 nfsvers = nmp->nm_vers;
1644 nmrsize = nmp->nm_rsize;
1645
1646 boff = NBOFF(bp);
1647 offset = 0;
1648 length = bp->nb_bufsize;
1649
1650 if (nfsvers == NFS_VER2) {
1651 if (boff > 0xffffffffLL) {
1652 bp->nb_error = error = EFBIG;
1653 SET(bp->nb_flags, NB_ERROR);
1654 nfs_buf_iodone(bp);
1655 return error;
1656 }
1657 if ((boff + length - 1) > 0xffffffffLL) {
1658 length = 0x100000000LL - boff;
1659 }
1660 }
1661
1662 /* Note: Can only do async I/O if nfsiods are configured. */
1663 async = (bp->nb_flags & NB_ASYNC);
1664 cb.rcb_func = async ? nfs_buf_read_rpc_finish : NULL;
1665 cb.rcb_bp = bp;
1666
1667 bp->nb_offio = bp->nb_endio = 0;
1668 bp->nb_rpcs = nrpcs = (length + nmrsize - 1) / nmrsize;
1669 if (async && (nrpcs > 1)) {
1670 SET(bp->nb_flags, NB_MULTASYNCRPC);
1671 } else {
1672 CLR(bp->nb_flags, NB_MULTASYNCRPC);
1673 }
1674
1675 while (length > 0) {
1676 if (ISSET(bp->nb_flags, NB_ERROR)) {
1677 error = bp->nb_error;
1678 break;
1679 }
1680 len = (length > nmrsize) ? nmrsize : length;
1681 cb.rcb_args[0] = offset;
1682 cb.rcb_args[1] = len;
1683 if (nmp->nm_vers >= NFS_VER4) {
1684 cb.rcb_args[2] = nmp->nm_stategenid;
1685 }
1686 req = NULL;
1687 error = nmp->nm_funcs->nf_read_rpc_async(np, boff + offset, len, thd, cred, &cb, &req);
1688 if (error) {
1689 break;
1690 }
1691 offset += len;
1692 length -= len;
1693 if (async) {
1694 continue;
1695 }
1696 nfs_buf_read_rpc_finish(req);
1697 if (ISSET(bp->nb_flags, NB_ERROR)) {
1698 error = bp->nb_error;
1699 break;
1700 }
1701 }
1702
1703 if (length > 0) {
1704 /*
1705 * Something bad happened while trying to send the RPC(s).
1706 * Wait for any outstanding requests to complete.
1707 */
1708 bp->nb_error = error;
1709 SET(bp->nb_flags, NB_ERROR);
1710 if (ISSET(bp->nb_flags, NB_MULTASYNCRPC)) {
1711 nrpcs = (length + nmrsize - 1) / nmrsize;
1712 lck_mtx_lock(nfs_buf_mutex);
1713 bp->nb_rpcs -= nrpcs;
1714 if (bp->nb_rpcs == 0) {
1715 /* No RPCs left, so the buffer's done */
1716 lck_mtx_unlock(nfs_buf_mutex);
1717 nfs_buf_iodone(bp);
1718 } else {
1719 /* wait for the last RPC to mark it done */
1720 while (bp->nb_rpcs > 0) {
1721 msleep(&bp->nb_rpcs, nfs_buf_mutex, 0,
1722 "nfs_buf_read_rpc_cancel", NULL);
1723 }
1724 lck_mtx_unlock(nfs_buf_mutex);
1725 }
1726 } else {
1727 nfs_buf_iodone(bp);
1728 }
1729 }
1730
1731 return error;
1732 }
1733
1734 /*
1735 * finish up an NFS READ RPC on a buffer
1736 */
1737 void
1738 nfs_buf_read_rpc_finish(struct nfsreq *req)
1739 {
1740 struct nfsmount *nmp;
1741 size_t rlen;
1742 struct nfsreq_cbinfo cb;
1743 struct nfsbuf *bp;
1744 int error = 0, nfsvers, offset, length, eof = 0, multasyncrpc, finished;
1745 void *wakeme = NULL;
1746 struct nfsreq *rreq = NULL;
1747 nfsnode_t np;
1748 thread_t thd;
1749 kauth_cred_t cred;
1750 uio_t auio;
1751 char uio_buf[UIO_SIZEOF(1)];
1752
1753 finish:
1754 np = req->r_np;
1755 thd = req->r_thread;
1756 cred = req->r_cred;
1757 if (IS_VALID_CRED(cred)) {
1758 kauth_cred_ref(cred);
1759 }
1760 cb = req->r_callback;
1761 bp = cb.rcb_bp;
1762 if (cb.rcb_func) { /* take an extra reference on the nfsreq in case we want to resend it later due to grace error */
1763 nfs_request_ref(req, 0);
1764 }
1765
1766 nmp = NFSTONMP(np);
1767 if (nfs_mount_gone(nmp)) {
1768 SET(bp->nb_flags, NB_ERROR);
1769 bp->nb_error = error = ENXIO;
1770 }
1771 if (error || ISSET(bp->nb_flags, NB_ERROR)) {
1772 /* just drop it */
1773 nfs_request_async_cancel(req);
1774 goto out;
1775 }
1776
1777 nfsvers = nmp->nm_vers;
1778 offset = cb.rcb_args[0];
1779 rlen = length = cb.rcb_args[1];
1780
1781 auio = uio_createwithbuffer(1, NBOFF(bp) + offset, UIO_SYSSPACE,
1782 UIO_READ, &uio_buf, sizeof(uio_buf));
1783 uio_addiov(auio, CAST_USER_ADDR_T(bp->nb_data + offset), length);
1784
1785 /* finish the RPC */
1786 error = nmp->nm_funcs->nf_read_rpc_async_finish(np, req, auio, &rlen, &eof);
1787 if ((error == EINPROGRESS) && cb.rcb_func) {
1788 /* async request restarted */
1789 if (cb.rcb_func) {
1790 nfs_request_rele(req);
1791 }
1792 if (IS_VALID_CRED(cred)) {
1793 kauth_cred_unref(&cred);
1794 }
1795 return;
1796 }
1797 if ((nmp->nm_vers >= NFS_VER4) && nfs_mount_state_error_should_restart(error) && !ISSET(bp->nb_flags, NB_ERROR)) {
1798 lck_mtx_lock(&nmp->nm_lock);
1799 if ((error != NFSERR_OLD_STATEID) && (error != NFSERR_GRACE) && (cb.rcb_args[2] == nmp->nm_stategenid)) {
1800 NP(np, "nfs_buf_read_rpc_finish: error %d @ 0x%llx, 0x%x 0x%x, initiating recovery",
1801 error, NBOFF(bp) + offset, cb.rcb_args[2], nmp->nm_stategenid);
1802 nfs_need_recover(nmp, error);
1803 }
1804 lck_mtx_unlock(&nmp->nm_lock);
1805 if (np->n_flag & NREVOKE) {
1806 error = EIO;
1807 } else {
1808 if (error == NFSERR_GRACE) {
1809 if (cb.rcb_func) {
1810 /*
1811 * For an async I/O request, handle a grace delay just like
1812 * jukebox errors. Set the resend time and queue it up.
1813 */
1814 struct timeval now;
1815 if (req->r_nmrep.nmc_mhead) {
1816 mbuf_freem(req->r_nmrep.nmc_mhead);
1817 req->r_nmrep.nmc_mhead = NULL;
1818 }
1819 req->r_error = 0;
1820 microuptime(&now);
1821 lck_mtx_lock(&req->r_mtx);
1822 req->r_resendtime = now.tv_sec + 2;
1823 req->r_xid = 0; // get a new XID
1824 req->r_flags |= R_RESTART;
1825 req->r_start = 0;
1826 nfs_asyncio_resend(req);
1827 lck_mtx_unlock(&req->r_mtx);
1828 if (IS_VALID_CRED(cred)) {
1829 kauth_cred_unref(&cred);
1830 }
1831 /* Note: nfsreq reference taken will be dropped later when finished */
1832 return;
1833 }
1834 /* otherwise, just pause a couple seconds and retry */
1835 tsleep(&nmp->nm_state, (PZERO - 1), "nfsgrace", 2 * hz);
1836 }
1837 if (!(error = nfs_mount_state_wait_for_recovery(nmp))) {
1838 rlen = 0;
1839 goto readagain;
1840 }
1841 }
1842 }
1843 if (error) {
1844 SET(bp->nb_flags, NB_ERROR);
1845 bp->nb_error = error;
1846 goto out;
1847 }
1848
1849 if ((rlen > 0) && (bp->nb_endio < (offset + (int)rlen))) {
1850 bp->nb_endio = offset + rlen;
1851 }
1852
1853 if ((nfsvers == NFS_VER2) || eof || (rlen == 0)) {
1854 /* zero out the remaining data (up to EOF) */
1855 off_t rpcrem, eofrem, rem;
1856 rpcrem = (length - rlen);
1857 eofrem = np->n_size - (NBOFF(bp) + offset + rlen);
1858 rem = (rpcrem < eofrem) ? rpcrem : eofrem;
1859 if (rem > 0) {
1860 bzero(bp->nb_data + offset + rlen, rem);
1861 }
1862 } else if (((int)rlen < length) && !ISSET(bp->nb_flags, NB_ERROR)) {
1863 /*
1864 * short read
1865 *
1866 * We haven't hit EOF and we didn't get all the data
1867 * requested, so we need to issue another read for the rest.
1868 * (Don't bother if the buffer already hit an error.)
1869 */
1870 readagain:
1871 offset += rlen;
1872 length -= rlen;
1873 cb.rcb_args[0] = offset;
1874 cb.rcb_args[1] = length;
1875 if (nmp->nm_vers >= NFS_VER4) {
1876 cb.rcb_args[2] = nmp->nm_stategenid;
1877 }
1878 error = nmp->nm_funcs->nf_read_rpc_async(np, NBOFF(bp) + offset, length, thd, cred, &cb, &rreq);
1879 if (!error) {
1880 if (IS_VALID_CRED(cred)) {
1881 kauth_cred_unref(&cred);
1882 }
1883 if (!cb.rcb_func) {
1884 /* if !async we'll need to wait for this RPC to finish */
1885 req = rreq;
1886 rreq = NULL;
1887 goto finish;
1888 }
1889 nfs_request_rele(req);
1890 /*
1891 * We're done here.
1892 * Outstanding RPC count is unchanged.
1893 * Callback will be called when RPC is done.
1894 */
1895 return;
1896 }
1897 SET(bp->nb_flags, NB_ERROR);
1898 bp->nb_error = error;
1899 }
1900
1901 out:
1902 if (cb.rcb_func) {
1903 nfs_request_rele(req);
1904 }
1905 if (IS_VALID_CRED(cred)) {
1906 kauth_cred_unref(&cred);
1907 }
1908
1909 /*
1910 * Decrement outstanding RPC count on buffer
1911 * and call nfs_buf_read_finish on last RPC.
1912 *
1913 * (Note: when there are multiple async RPCs issued for a
1914 * buffer we need nfs_buffer_mutex to avoid problems when
1915 * aborting a partially-initiated set of RPCs)
1916 */
1917
1918 multasyncrpc = ISSET(bp->nb_flags, NB_MULTASYNCRPC);
1919 if (multasyncrpc) {
1920 lck_mtx_lock(nfs_buf_mutex);
1921 }
1922
1923 bp->nb_rpcs--;
1924 finished = (bp->nb_rpcs == 0);
1925
1926 if (multasyncrpc) {
1927 lck_mtx_unlock(nfs_buf_mutex);
1928 }
1929
1930 if (finished) {
1931 if (multasyncrpc) {
1932 wakeme = &bp->nb_rpcs;
1933 }
1934 nfs_buf_read_finish(bp);
1935 if (wakeme) {
1936 wakeup(wakeme);
1937 }
1938 }
1939 }
1940
1941 /*
1942 * Do buffer readahead.
1943 * Initiate async I/O to read buffers not in cache.
1944 */
1945 int
1946 nfs_buf_readahead(nfsnode_t np, int ioflag, daddr64_t *rabnp, daddr64_t lastrabn, thread_t thd, kauth_cred_t cred)
1947 {
1948 struct nfsmount *nmp = NFSTONMP(np);
1949 struct nfsbuf *bp;
1950 int error = 0;
1951 uint32_t nra;
1952
1953 if (nfs_mount_gone(nmp)) {
1954 return ENXIO;
1955 }
1956 if (nmp->nm_readahead <= 0) {
1957 return 0;
1958 }
1959 if (*rabnp > lastrabn) {
1960 return 0;
1961 }
1962
1963 for (nra = 0; (nra < nmp->nm_readahead) && (*rabnp <= lastrabn); nra++, *rabnp = *rabnp + 1) {
1964 /* check if block exists and is valid. */
1965 if ((*rabnp * nmp->nm_biosize) >= (off_t)np->n_size) {
1966 /* stop reading ahead if we're beyond EOF */
1967 *rabnp = lastrabn;
1968 break;
1969 }
1970 error = nfs_buf_get(np, *rabnp, nmp->nm_biosize, thd, NBLK_READ | NBLK_NOWAIT, &bp);
1971 if (error) {
1972 break;
1973 }
1974 nfs_node_lock_force(np);
1975 np->n_lastrahead = *rabnp;
1976 nfs_node_unlock(np);
1977 if (!bp) {
1978 continue;
1979 }
1980 if ((ioflag & IO_NOCACHE) && ISSET(bp->nb_flags, NB_CACHE) &&
1981 !bp->nb_dirty && !ISSET(bp->nb_flags, (NB_DELWRI | NB_NCRDAHEAD))) {
1982 CLR(bp->nb_flags, NB_CACHE);
1983 bp->nb_valid = 0;
1984 bp->nb_validoff = bp->nb_validend = -1;
1985 }
1986 if ((bp->nb_dirtyend <= 0) && !bp->nb_dirty &&
1987 !ISSET(bp->nb_flags, (NB_CACHE | NB_DELWRI))) {
1988 SET(bp->nb_flags, (NB_READ | NB_ASYNC));
1989 if (ioflag & IO_NOCACHE) {
1990 SET(bp->nb_flags, NB_NCRDAHEAD);
1991 }
1992 if (!IS_VALID_CRED(bp->nb_rcred) && IS_VALID_CRED(cred)) {
1993 kauth_cred_ref(cred);
1994 bp->nb_rcred = cred;
1995 }
1996 if ((error = nfs_buf_read(bp))) {
1997 break;
1998 }
1999 continue;
2000 }
2001 nfs_buf_release(bp, 1);
2002 }
2003 return error;
2004 }
2005
2006 /*
2007 * NFS buffer I/O for reading files.
2008 */
2009 int
2010 nfs_bioread(nfsnode_t np, uio_t uio, int ioflag, vfs_context_t ctx)
2011 {
2012 vnode_t vp = NFSTOV(np);
2013 struct nfsbuf *bp = NULL;
2014 struct nfsmount *nmp = VTONMP(vp);
2015 daddr64_t lbn, rabn = 0, lastrabn, maxrabn = -1;
2016 off_t diff;
2017 int error = 0, n = 0, on = 0;
2018 int nfsvers, biosize, modified, readaheads = 0;
2019 thread_t thd;
2020 kauth_cred_t cred;
2021 int64_t io_resid;
2022
2023 FSDBG_TOP(514, np, uio_offset(uio), uio_resid(uio), ioflag);
2024
2025 nfsvers = nmp->nm_vers;
2026 biosize = nmp->nm_biosize;
2027 thd = vfs_context_thread(ctx);
2028 cred = vfs_context_ucred(ctx);
2029
2030 if (vnode_vtype(vp) != VREG) {
2031 printf("nfs_bioread: type %x unexpected\n", vnode_vtype(vp));
2032 FSDBG_BOT(514, np, 0xd1e0016, 0, EINVAL);
2033 return EINVAL;
2034 }
2035
2036 /*
2037 * For NFS, cache consistency can only be maintained approximately.
2038 * Although RFC1094 does not specify the criteria, the following is
2039 * believed to be compatible with the reference port.
2040 *
2041 * If the file has changed since the last read RPC or you have
2042 * written to the file, you may have lost data cache consistency
2043 * with the server. So, check for a change, and flush all of the
2044 * file's data out of the cache.
2045 * NB: This implies that cache data can be read when up to
2046 * NFS_MAXATTRTIMO seconds out of date. If you find that you
2047 * need current attributes, nfs_getattr() can be forced to fetch
2048 * new attributes (via NATTRINVALIDATE() or NGA_UNCACHED).
2049 */
2050
2051 if (ISSET(np->n_flag, NUPDATESIZE)) {
2052 nfs_data_update_size(np, 0);
2053 }
2054
2055 if ((error = nfs_node_lock(np))) {
2056 FSDBG_BOT(514, np, 0xd1e0222, 0, error);
2057 return error;
2058 }
2059
2060 if (np->n_flag & NNEEDINVALIDATE) {
2061 np->n_flag &= ~NNEEDINVALIDATE;
2062 nfs_node_unlock(np);
2063 error = nfs_vinvalbuf(vp, V_SAVE | V_IGNORE_WRITEERR, ctx, 1);
2064 if (!error) {
2065 error = nfs_node_lock(np);
2066 }
2067 if (error) {
2068 FSDBG_BOT(514, np, 0xd1e0322, 0, error);
2069 return error;
2070 }
2071 }
2072
2073 modified = (np->n_flag & NMODIFIED);
2074 nfs_node_unlock(np);
2075 /* nfs_getattr() will check changed and purge caches */
2076 error = nfs_getattr(np, NULL, ctx, modified ? NGA_UNCACHED : NGA_CACHED);
2077 if (error) {
2078 FSDBG_BOT(514, np, 0xd1e0004, 0, error);
2079 return error;
2080 }
2081
2082 if (uio_resid(uio) == 0) {
2083 FSDBG_BOT(514, np, 0xd1e0001, 0, 0);
2084 return 0;
2085 }
2086 if (uio_offset(uio) < 0) {
2087 FSDBG_BOT(514, np, 0xd1e0002, 0, EINVAL);
2088 return EINVAL;
2089 }
2090
2091 /*
2092 * set up readahead - which may be limited by:
2093 * + current request length (for IO_NOCACHE)
2094 * + readahead setting
2095 * + file size
2096 */
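/*
 * Illustrative example (assumed values): for a 64KB read at offset 0
 * of a 1MB file with biosize = 32KB and nm_readahead = 16, rabn = 0
 * and maxrabn = 1; since the read looks sequential (rabn == 0),
 * maxrabn is extended to 17, still short of the file's last block (31),
 * so up to 16 blocks may be read ahead.
 */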
2097 if (nmp->nm_readahead > 0) {
2098 off_t end = uio_offset(uio) + uio_resid(uio);
2099 if (end > (off_t)np->n_size) {
2100 end = np->n_size;
2101 }
2102 rabn = uio_offset(uio) / biosize;
2103 maxrabn = (end - 1) / biosize;
2104 nfs_node_lock_force(np);
2105 if (!(ioflag & IO_NOCACHE) &&
2106 (!rabn || (rabn == np->n_lastread) || (rabn == (np->n_lastread + 1)))) {
2107 maxrabn += nmp->nm_readahead;
2108 if ((maxrabn * biosize) >= (off_t)np->n_size) {
2109 maxrabn = ((off_t)np->n_size - 1) / biosize;
2110 }
2111 }
2112 if (maxrabn < np->n_lastrahead) {
2113 np->n_lastrahead = -1;
2114 }
2115 if (rabn < np->n_lastrahead) {
2116 rabn = np->n_lastrahead + 1;
2117 }
2118 nfs_node_unlock(np);
2119 } else {
2120 rabn = maxrabn = 0;
2121 }
2122
2123 do {
2124 nfs_data_lock(np, NFS_DATA_LOCK_SHARED);
2125 lbn = uio_offset(uio) / biosize;
2126
2127 /*
2128 * Copy directly from any cached pages without grabbing the bufs.
2129 * (If we are NOCACHE and we've issued readahead requests, we need
2130 * to grab the NB_NCRDAHEAD bufs to drop them.)
2131 */
2132 if ((!(ioflag & IO_NOCACHE) || !readaheads) &&
2133 ((uio->uio_segflg == UIO_USERSPACE32 ||
2134 uio->uio_segflg == UIO_USERSPACE64 ||
2135 uio->uio_segflg == UIO_USERSPACE))) {
2136 io_resid = uio_resid(uio);
2137 diff = np->n_size - uio_offset(uio);
2138 if (diff < io_resid) {
2139 io_resid = diff;
2140 }
2141 if (io_resid > 0) {
2142 int count = (io_resid > INT_MAX) ? INT_MAX : io_resid;
2143 error = cluster_copy_ubc_data(vp, uio, &count, 0);
2144 if (error) {
2145 nfs_data_unlock(np);
2146 FSDBG_BOT(514, np, uio_offset(uio), 0xcacefeed, error);
2147 return error;
2148 }
2149 }
2150 /* count any biocache reads that we just copied directly */
2151 if (lbn != (uio_offset(uio) / biosize)) {
2152 OSAddAtomic64((uio_offset(uio) / biosize) - lbn, &nfsstats.biocache_reads);
2153 FSDBG(514, np, 0xcacefeed, uio_offset(uio), error);
2154 }
2155 }
2156
2157 lbn = uio_offset(uio) / biosize;
2158 on = uio_offset(uio) % biosize;
2159 nfs_node_lock_force(np);
2160 np->n_lastread = (uio_offset(uio) - 1) / biosize;
2161 nfs_node_unlock(np);
2162
2163 if ((uio_resid(uio) <= 0) || (uio_offset(uio) >= (off_t)np->n_size)) {
2164 nfs_data_unlock(np);
2165 FSDBG_BOT(514, np, uio_offset(uio), uio_resid(uio), 0xaaaaaaaa);
2166 return 0;
2167 }
2168
2169 /* adjust readahead block number, if necessary */
2170 if (rabn < lbn) {
2171 rabn = lbn;
2172 }
2173 lastrabn = MIN(maxrabn, lbn + nmp->nm_readahead);
2174 if (rabn <= lastrabn) { /* start readaheads */
2175 error = nfs_buf_readahead(np, ioflag, &rabn, lastrabn, thd, cred);
2176 if (error) {
2177 nfs_data_unlock(np);
2178 FSDBG_BOT(514, np, 0xd1e000b, 1, error);
2179 return error;
2180 }
2181 readaheads = 1;
2182 }
2183
2184 OSAddAtomic64(1, &nfsstats.biocache_reads);
2185
2186 /*
2187 * If the block is in the cache and has the required data
2188 * in a valid region, just copy it out.
2189 * Otherwise, get the block and write back/read in,
2190 * as required.
2191 */
2192 again:
2193 io_resid = uio_resid(uio);
2194 n = (io_resid > (biosize - on)) ? (biosize - on) : io_resid;
2195 diff = np->n_size - uio_offset(uio);
2196 if (diff < n) {
2197 n = diff;
2198 }
2199
2200 error = nfs_buf_get(np, lbn, biosize, thd, NBLK_READ, &bp);
2201 if (error) {
2202 nfs_data_unlock(np);
2203 FSDBG_BOT(514, np, 0xd1e000c, 0, error);
2204 return error;
2205 }
2206
2207 if ((ioflag & IO_NOCACHE) && ISSET(bp->nb_flags, NB_CACHE)) {
2208 /*
2209 * IO_NOCACHE found a cached buffer.
2210 * Flush the buffer if it's dirty.
2211 * Invalidate the data if it wasn't just read
2212 * in as part of a "nocache readahead".
2213 */
2214 if (bp->nb_dirty || (bp->nb_dirtyend > 0)) {
2215 /* so write the buffer out and try again */
2216 SET(bp->nb_flags, NB_NOCACHE);
2217 goto flushbuffer;
2218 }
2219 if (ISSET(bp->nb_flags, NB_NCRDAHEAD)) {
2220 CLR(bp->nb_flags, NB_NCRDAHEAD);
2221 SET(bp->nb_flags, NB_NOCACHE);
2222 }
2223 }
2224
2225 /* if any pages are valid... */
2226 if (bp->nb_valid) {
2227 /* ...check for any invalid pages in the read range */
2228 int pg, firstpg, lastpg, dirtypg;
2229 dirtypg = firstpg = lastpg = -1;
2230 pg = on / PAGE_SIZE;
2231 while (pg <= (on + n - 1) / PAGE_SIZE) {
2232 if (!NBPGVALID(bp, pg)) {
2233 if (firstpg < 0) {
2234 firstpg = pg;
2235 }
2236 lastpg = pg;
2237 } else if (firstpg >= 0 && dirtypg < 0 && NBPGDIRTY(bp, pg)) {
2238 dirtypg = pg;
2239 }
2240 pg++;
2241 }
2242
2243 /* if there are no invalid pages, we're all set */
2244 if (firstpg < 0) {
2245 if (bp->nb_validoff < 0) {
2246 /* valid range isn't set up, so */
2247 /* set it to what we know is valid */
2248 bp->nb_validoff = trunc_page(on);
2249 bp->nb_validend = round_page(on + n);
2250 nfs_buf_normalize_valid_range(np, bp);
2251 }
2252 goto buffer_ready;
2253 }
2254
2255 /* there are invalid pages in the read range */
2256 if (((dirtypg > firstpg) && (dirtypg < lastpg)) ||
2257 (((firstpg * PAGE_SIZE) < bp->nb_dirtyend) && (((lastpg + 1) * PAGE_SIZE) > bp->nb_dirtyoff))) {
2258 /* there are also dirty page(s) (or range) in the read range, */
2259 /* so write the buffer out and try again */
2260 flushbuffer:
2261 CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL));
2262 SET(bp->nb_flags, NB_ASYNC);
2263 if (!IS_VALID_CRED(bp->nb_wcred)) {
2264 kauth_cred_ref(cred);
2265 bp->nb_wcred = cred;
2266 }
2267 error = nfs_buf_write(bp);
2268 if (error) {
2269 nfs_data_unlock(np);
2270 FSDBG_BOT(514, np, 0xd1e000d, 0, error);
2271 return error;
2272 }
2273 goto again;
2274 }
2275 if (!bp->nb_dirty && bp->nb_dirtyend <= 0 &&
2276 (lastpg - firstpg + 1) > (biosize / PAGE_SIZE) / 2) {
2277 /* we need to read in more than half the buffer and the */
2278 /* buffer's not dirty, so just fetch the whole buffer */
2279 bp->nb_valid = 0;
2280 } else {
2281 /* read the page range in */
2282 uio_t auio;
2283 char uio_buf[UIO_SIZEOF(1)];
2284
2285 NFS_BUF_MAP(bp);
2286 auio = uio_createwithbuffer(1, (NBOFF(bp) + firstpg * PAGE_SIZE_64),
2287 UIO_SYSSPACE, UIO_READ, &uio_buf[0], sizeof(uio_buf));
2288 if (!auio) {
2289 error = ENOMEM;
2290 } else {
2291 uio_addiov(auio, CAST_USER_ADDR_T(bp->nb_data + (firstpg * PAGE_SIZE)),
2292 ((lastpg - firstpg + 1) * PAGE_SIZE));
2293 error = nfs_read_rpc(np, auio, ctx);
2294 }
2295 if (error) {
2296 if (ioflag & IO_NOCACHE) {
2297 SET(bp->nb_flags, NB_NOCACHE);
2298 }
2299 nfs_buf_release(bp, 1);
2300 nfs_data_unlock(np);
2301 FSDBG_BOT(514, np, 0xd1e000e, 0, error);
2302 return error;
2303 }
2304 /* Make sure that the valid range is set to cover this read. */
2305 bp->nb_validoff = trunc_page_32(on);
2306 bp->nb_validend = round_page_32(on + n);
2307 nfs_buf_normalize_valid_range(np, bp);
2308 if (uio_resid(auio) > 0) {
2309 /* if short read, must have hit EOF, */
2310 /* so zero the rest of the range */
2311 bzero(CAST_DOWN(caddr_t, uio_curriovbase(auio)), uio_resid(auio));
2312 }
2313 /* mark the pages (successfully read) as valid */
2314 for (pg = firstpg; pg <= lastpg; pg++) {
2315 NBPGVALID_SET(bp, pg);
2316 }
2317 }
2318 }
2319 /* if no pages are valid, read the whole block */
2320 if (!bp->nb_valid) {
2321 if (!IS_VALID_CRED(bp->nb_rcred) && IS_VALID_CRED(cred)) {
2322 kauth_cred_ref(cred);
2323 bp->nb_rcred = cred;
2324 }
2325 SET(bp->nb_flags, NB_READ);
2326 CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL));
2327 error = nfs_buf_read(bp);
2328 if (ioflag & IO_NOCACHE) {
2329 SET(bp->nb_flags, NB_NOCACHE);
2330 }
2331 if (error) {
2332 nfs_data_unlock(np);
2333 nfs_buf_release(bp, 1);
2334 FSDBG_BOT(514, np, 0xd1e000f, 0, error);
2335 return error;
2336 }
2337 }
2338 buffer_ready:
2339 /* validate read range against valid range and clip */
2340 if (bp->nb_validend > 0) {
2341 diff = (on >= bp->nb_validend) ? 0 : (bp->nb_validend - on);
2342 if (diff < n) {
2343 n = diff;
2344 }
2345 }
2346 if (n > 0) {
2347 NFS_BUF_MAP(bp);
2348 error = uiomove(bp->nb_data + on, n, uio);
2349 }
2350
2351 nfs_buf_release(bp, 1);
2352 nfs_data_unlock(np);
2353 nfs_node_lock_force(np);
2354 np->n_lastread = (uio_offset(uio) - 1) / biosize;
2355 nfs_node_unlock(np);
2356 } while (error == 0 && uio_resid(uio) > 0 && n > 0);
2357 FSDBG_BOT(514, np, uio_offset(uio), uio_resid(uio), error);
2358 return error;
2359 }
2360
2361 /*
2362 * limit the number of outstanding async I/O writes
2363 */
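/*
 * Callers pair nfs_async_write_start() with nfs_async_write_done()
 * once the async write RPC completes (see nfs_buf_write_rpc() and
 * nfs_buf_write_rpc_finish() below).
 */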
2364 int
2365 nfs_async_write_start(struct nfsmount *nmp)
2366 {
2367 int error = 0, slpflag = NMFLAG(nmp, INTR) ? PCATCH : 0;
2368 struct timespec ts = {1, 0};
2369
2370 if (nfs_max_async_writes <= 0) {
2371 return 0;
2372 }
2373 lck_mtx_lock(&nmp->nm_lock);
2374 while ((nfs_max_async_writes > 0) && (nmp->nm_asyncwrites >= nfs_max_async_writes)) {
2375 if ((error = nfs_sigintr(nmp, NULL, current_thread(), 1))) {
2376 break;
2377 }
2378 msleep(&nmp->nm_asyncwrites, &nmp->nm_lock, slpflag | (PZERO - 1), "nfsasyncwrites", &ts);
2379 slpflag = 0;
2380 }
2381 if (!error) {
2382 nmp->nm_asyncwrites++;
2383 }
2384 lck_mtx_unlock(&nmp->nm_lock);
2385 return error;
2386 }
2387 void
2388 nfs_async_write_done(struct nfsmount *nmp)
2389 {
2390 if (nmp->nm_asyncwrites <= 0) {
2391 return;
2392 }
2393 lck_mtx_lock(&nmp->nm_lock);
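/* note: nm_asyncwrites is post-decremented; wake waiters only if it was at or above the limit */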
2394 if (nmp->nm_asyncwrites-- >= nfs_max_async_writes) {
2395 wakeup(&nmp->nm_asyncwrites);
2396 }
2397 lck_mtx_unlock(&nmp->nm_lock);
2398 }
2399
2400 /*
2401 * write (or commit) the given NFS buffer
2402 *
2403 * Commit the buffer if we can.
2404 * Write out any dirty range.
2405 * If any dirty pages remain, write them out.
2406 * Mark buffer done.
2407 *
2408 * For async requests, all the work beyond sending the initial
2409 * write RPC is handled in the RPC callback(s).
2410 */
2411 int
2412 nfs_buf_write(struct nfsbuf *bp)
2413 {
2414 int error = 0, oldflags, async;
2415 nfsnode_t np;
2416 thread_t thd;
2417 kauth_cred_t cred;
2418 proc_t p = current_proc();
2419 int iomode, doff, dend, firstpg, lastpg;
2420 uint32_t pagemask;
2421
2422 FSDBG_TOP(553, bp, NBOFF(bp), bp->nb_flags, 0);
2423
2424 if (!ISSET(bp->nb_lflags, NBL_BUSY)) {
2425 panic("nfs_buf_write: buffer is not busy???");
2426 }
2427
2428 np = bp->nb_np;
2429 async = ISSET(bp->nb_flags, NB_ASYNC);
2430 oldflags = bp->nb_flags;
2431
2432 CLR(bp->nb_flags, (NB_READ | NB_DONE | NB_ERROR | NB_DELWRI));
2433 if (ISSET(oldflags, NB_DELWRI)) {
2434 lck_mtx_lock(nfs_buf_mutex);
2435 nfs_nbdwrite--;
2436 NFSBUFCNTCHK();
2437 lck_mtx_unlock(nfs_buf_mutex);
2438 wakeup(&nfs_nbdwrite);
2439 }
2440
2441 /* move to clean list */
2442 if (ISSET(oldflags, (NB_ASYNC | NB_DELWRI))) {
2443 lck_mtx_lock(nfs_buf_mutex);
2444 if (bp->nb_vnbufs.le_next != NFSNOLIST) {
2445 LIST_REMOVE(bp, nb_vnbufs);
2446 }
2447 LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
2448 lck_mtx_unlock(nfs_buf_mutex);
2449 }
2450 nfs_node_lock_force(np);
2451 np->n_numoutput++;
2452 nfs_node_unlock(np);
2453 vnode_startwrite(NFSTOV(np));
2454
2455 if (p && p->p_stats) {
2456 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_oublock);
2457 }
2458
2459 cred = bp->nb_wcred;
2460 if (!IS_VALID_CRED(cred) && ISSET(bp->nb_flags, NB_READ)) {
2461 cred = bp->nb_rcred; /* shouldn't really happen, but... */
2462 }
2463 if (IS_VALID_CRED(cred)) {
2464 kauth_cred_ref(cred);
2465 }
2466 thd = async ? NULL : current_thread();
2467
2468 /* We need to make sure the pages are locked before doing I/O. */
2469 if (!ISSET(bp->nb_flags, NB_META)) {
2470 if (UBCINFOEXISTS(NFSTOV(np))) {
2471 if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
2472 error = nfs_buf_upl_setup(bp);
2473 if (error) {
2474 printf("nfs_buf_write: upl create failed %d\n", error);
2475 SET(bp->nb_flags, NB_ERROR);
2476 bp->nb_error = error = EIO;
2477 nfs_buf_iodone(bp);
2478 goto out;
2479 }
2480 nfs_buf_upl_check(bp);
2481 }
2482 } else {
2483 /* We should never be in nfs_buf_write() with no UBCINFO. */
2484 printf("nfs_buf_write: ubcinfo already gone\n");
2485 SET(bp->nb_flags, NB_ERROR);
2486 bp->nb_error = error = EIO;
2487 nfs_buf_iodone(bp);
2488 goto out;
2489 }
2490 }
2491
2492 /* If NB_NEEDCOMMIT is set, a commit RPC may do the trick. */
2493 if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
2494 nfs_buf_check_write_verifier(np, bp);
2495 }
2496 if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
2497 struct nfsmount *nmp = NFSTONMP(np);
2498 if (nfs_mount_gone(nmp)) {
2499 SET(bp->nb_flags, NB_ERROR);
2500 bp->nb_error = error = EIO;
2501 nfs_buf_iodone(bp);
2502 goto out;
2503 }
2504 SET(bp->nb_flags, NB_WRITEINPROG);
2505 error = nmp->nm_funcs->nf_commit_rpc(np, NBOFF(bp) + bp->nb_dirtyoff,
2506 bp->nb_dirtyend - bp->nb_dirtyoff, bp->nb_wcred, bp->nb_verf);
2507 CLR(bp->nb_flags, NB_WRITEINPROG);
2508 if (error) {
2509 if (error != NFSERR_STALEWRITEVERF) {
2510 SET(bp->nb_flags, NB_ERROR);
2511 bp->nb_error = error;
2512 }
2513 nfs_buf_iodone(bp);
2514 goto out;
2515 }
2516 bp->nb_dirtyoff = bp->nb_dirtyend = 0;
2517 CLR(bp->nb_flags, NB_NEEDCOMMIT);
2518 nfs_node_lock_force(np);
2519 np->n_needcommitcnt--;
2520 CHECK_NEEDCOMMITCNT(np);
2521 nfs_node_unlock(np);
2522 }
2523 if (!error && (bp->nb_dirtyend > 0)) {
2524 /* sanity check the dirty range */
2525 if (NBOFF(bp) + bp->nb_dirtyend > (off_t) np->n_size) {
2526 bp->nb_dirtyend = np->n_size - NBOFF(bp);
2527 if (bp->nb_dirtyoff >= bp->nb_dirtyend) {
2528 bp->nb_dirtyoff = bp->nb_dirtyend = 0;
2529 }
2530 }
2531 }
2532 if (!error && (bp->nb_dirtyend > 0)) {
2533 /* there's a dirty range that needs to be written out */
2534 NFS_BUF_MAP(bp);
2535
2536 doff = bp->nb_dirtyoff;
2537 dend = bp->nb_dirtyend;
2538
2539 /* if doff page is dirty, move doff to start of page */
2540 if (NBPGDIRTY(bp, doff / PAGE_SIZE)) {
2541 doff -= doff & PAGE_MASK;
2542 }
2543 /* try to expand write range to include preceding dirty pages */
2544 if (!(doff & PAGE_MASK)) {
2545 while ((doff > 0) && NBPGDIRTY(bp, (doff - 1) / PAGE_SIZE)) {
2546 doff -= PAGE_SIZE;
2547 }
2548 }
2549 /* if dend page is dirty, move dend to start of next page */
2550 if ((dend & PAGE_MASK) && NBPGDIRTY(bp, dend / PAGE_SIZE)) {
2551 dend = round_page_32(dend);
2552 }
2553 /* try to expand write range to include trailing dirty pages */
2554 if (!(dend & PAGE_MASK)) {
2555 while ((dend < (int)bp->nb_bufsize) && NBPGDIRTY(bp, dend / PAGE_SIZE)) {
2556 dend += PAGE_SIZE;
2557 }
2558 }
2559 /* make sure to keep dend clipped to EOF */
2560 if ((NBOFF(bp) + dend) > (off_t) np->n_size) {
2561 dend = np->n_size - NBOFF(bp);
2562 }
2563 /* calculate range of complete pages being written */
2564 firstpg = round_page_32(doff) / PAGE_SIZE;
2565 lastpg = (trunc_page_32(dend) - 1) / PAGE_SIZE;
2566 /* calculate mask for that page range */
2567 pagemask = ((1 << (lastpg + 1)) - 1) & ~((1 << firstpg) - 1);
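/*
 * e.g. (illustrative values) firstpg = 1, lastpg = 3:
 * ((1 << 4) - 1) & ~((1 << 1) - 1) = 0x0f & ~0x01 = 0x0e,
 * i.e. the mask covers pages 1 through 3.
 */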
2568
2569 /*
2570 * Compare the page mask to nb_dirty: if any dirty pages fall outside
2571 * the range being written, use FILESYNC; otherwise use UNSTABLE for
2572 * async writes that aren't needcommit/stable, and FILESYNC for the rest.
2573 */
2574 if (bp->nb_dirty & ~pagemask) {
2575 iomode = NFS_WRITE_FILESYNC;
2576 } else if ((bp->nb_flags & (NB_ASYNC | NB_NEEDCOMMIT | NB_STABLE)) == NB_ASYNC) {
2577 iomode = NFS_WRITE_UNSTABLE;
2578 } else {
2579 iomode = NFS_WRITE_FILESYNC;
2580 }
2581
2582 /* write the whole contiguous dirty range */
2583 bp->nb_offio = doff;
2584 bp->nb_endio = dend;
2585
2586 OSAddAtomic64(1, &nfsstats.write_bios);
2587
2588 SET(bp->nb_flags, NB_WRITEINPROG);
2589 error = nfs_buf_write_rpc(bp, iomode, thd, cred);
2590 /*
2591 * For async I/O, the callbacks will finish up the
2592 * write and push out any dirty pages. Otherwise,
2593 * the write has already been finished and any dirty
2594 * pages pushed out.
2595 */
2596 } else {
2597 if (!error && bp->nb_dirty) { /* write out any dirty pages */
2598 error = nfs_buf_write_dirty_pages(bp, thd, cred);
2599 }
2600 nfs_buf_iodone(bp);
2601 }
2602 /* note: bp is still valid only for !async case */
2603 out:
2604 if (!async) {
2605 error = nfs_buf_iowait(bp);
2606 /* move to clean list */
2607 if (oldflags & NB_DELWRI) {
2608 lck_mtx_lock(nfs_buf_mutex);
2609 if (bp->nb_vnbufs.le_next != NFSNOLIST) {
2610 LIST_REMOVE(bp, nb_vnbufs);
2611 }
2612 LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
2613 lck_mtx_unlock(nfs_buf_mutex);
2614 }
2615 FSDBG_BOT(553, bp, NBOFF(bp), bp->nb_flags, error);
2616 nfs_buf_release(bp, 1);
2617 /* check if we need to invalidate (and we can) */
2618 if ((np->n_flag & NNEEDINVALIDATE) &&
2619 !(np->n_bflag & (NBINVALINPROG | NBFLUSHINPROG))) {
2620 int invalidate = 0;
2621 nfs_node_lock_force(np);
2622 if (np->n_flag & NNEEDINVALIDATE) {
2623 invalidate = 1;
2624 np->n_flag &= ~NNEEDINVALIDATE;
2625 }
2626 nfs_node_unlock(np);
2627 if (invalidate) {
2628 /*
2629 * There was a write error and we need to
2630 * invalidate attrs and flush buffers in
2631 * order to sync up with the server.
2632 * (if this write was extending the file,
2633 * we may no longer know the correct size)
2634 *
2635 * But we couldn't call vinvalbuf while holding
2636 * the buffer busy. So we call vinvalbuf() after
2637 * releasing the buffer.
2638 */
2639 nfs_vinvalbuf2(NFSTOV(np), V_SAVE | V_IGNORE_WRITEERR, thd, cred, 1);
2640 }
2641 }
2642 }
2643
2644 if (IS_VALID_CRED(cred)) {
2645 kauth_cred_unref(&cred);
2646 }
2647 return error;
2648 }
2649
2650 /*
2651 * finish the writing of a buffer
2652 */
2653 void
2654 nfs_buf_write_finish(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred)
2655 {
2656 nfsnode_t np = bp->nb_np;
2657 int error = (bp->nb_flags & NB_ERROR) ? bp->nb_error : 0;
2658 int firstpg, lastpg;
2659 uint32_t pagemask;
2660
2661 if ((error == EINTR) || (error == ERESTART)) {
2662 CLR(bp->nb_flags, NB_ERROR);
2663 SET(bp->nb_flags, NB_EINTR);
2664 }
2665
2666 if (!error) {
2667 /* calculate range of complete pages being written */
2668 firstpg = round_page_32(bp->nb_offio) / PAGE_SIZE;
2669 lastpg = (trunc_page_32(bp->nb_endio) - 1) / PAGE_SIZE;
2670 /* calculate mask for that page range written */
2671 pagemask = ((1 << (lastpg + 1)) - 1) & ~((1 << firstpg) - 1);
2672 /* clear dirty bits for pages we've written */
2673 bp->nb_dirty &= ~pagemask;
2674 }
2675
2676 /* manage needcommit state */
2677 if (!error && (bp->nb_commitlevel == NFS_WRITE_UNSTABLE)) {
2678 if (!ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
2679 nfs_node_lock_force(np);
2680 np->n_needcommitcnt++;
2681 nfs_node_unlock(np);
2682 SET(bp->nb_flags, NB_NEEDCOMMIT);
2683 }
2684 /* make sure nb_dirtyoff/nb_dirtyend reflect actual range written */
2685 bp->nb_dirtyoff = bp->nb_offio;
2686 bp->nb_dirtyend = bp->nb_endio;
2687 } else if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
2688 nfs_node_lock_force(np);
2689 np->n_needcommitcnt--;
2690 CHECK_NEEDCOMMITCNT(np);
2691 nfs_node_unlock(np);
2692 CLR(bp->nb_flags, NB_NEEDCOMMIT);
2693 }
2694
2695 CLR(bp->nb_flags, NB_WRITEINPROG);
2696
2697 /*
2698 * For an unstable write, the buffer is still treated as dirty until
2699 * a commit (or stable (re)write) is performed. Buffers needing only
2700 * a commit are marked with the NB_DELWRI and NB_NEEDCOMMIT flags.
2701 *
2702 * If the write was interrupted we set NB_EINTR. Don't set NB_ERROR
2703 * because that would cause the buffer to be dropped. The buffer is
2704 * still valid and simply needs to be written again.
2705 */
2706 if ((error == EINTR) || (error == ERESTART) || (!error && (bp->nb_flags & NB_NEEDCOMMIT))) {
2707 CLR(bp->nb_flags, NB_INVAL);
2708 if (!ISSET(bp->nb_flags, NB_DELWRI)) {
2709 SET(bp->nb_flags, NB_DELWRI);
2710 lck_mtx_lock(nfs_buf_mutex);
2711 nfs_nbdwrite++;
2712 NFSBUFCNTCHK();
2713 lck_mtx_unlock(nfs_buf_mutex);
2714 }
2715 /*
2716 * Since for the NB_ASYNC case, we've reassigned the buffer to the
2717 * clean list, we have to reassign it back to the dirty one. Ugh.
2718 */
2719 if (ISSET(bp->nb_flags, NB_ASYNC)) {
2720 /* move to dirty list */
2721 lck_mtx_lock(nfs_buf_mutex);
2722 if (bp->nb_vnbufs.le_next != NFSNOLIST) {
2723 LIST_REMOVE(bp, nb_vnbufs);
2724 }
2725 LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
2726 lck_mtx_unlock(nfs_buf_mutex);
2727 }
2728 } else {
2729 /* either there's an error or we don't need to commit */
2730 if (error) {
2731 /*
2732 * There was a write error and we need to invalidate
2733 * attrs and flush buffers in order to sync up with the
2734 * server. (if this write was extending the file, we
2735 * may no longer know the correct size)
2736 *
2737 * But we can't call vinvalbuf while holding this
2738 * buffer busy. Set a flag to do it after releasing
2739 * the buffer.
2740 */
2741 nfs_node_lock_force(np);
2742 np->n_error = error;
2743 np->n_flag |= (NWRITEERR | NNEEDINVALIDATE);
2744 NATTRINVALIDATE(np);
2745 nfs_node_unlock(np);
2746 }
2747 /* clear the dirty range */
2748 bp->nb_dirtyoff = bp->nb_dirtyend = 0;
2749 }
2750
2751 if (!error && bp->nb_dirty) {
2752 nfs_buf_write_dirty_pages(bp, thd, cred);
2753 }
2754 nfs_buf_iodone(bp);
2755 }
2756
2757 /*
2758 * write out any pages marked dirty in a buffer
2759 *
2760 * We do use unstable writes and follow up with a commit.
2761 * If we catch the write verifier changing, we'll restart and
2762 * redo the writes FILESYNC.
2763 */
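/*
 * The "again:" loop below restarts with iomode = NFS_WRITE_FILESYNC if
 * the verifier changes mid-pass or the follow-up commit returns
 * NFSERR_STALEWRITEVERF.
 */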
2764 int
2765 nfs_buf_write_dirty_pages(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred)
2766 {
2767 nfsnode_t np = bp->nb_np;
2768 struct nfsmount *nmp = NFSTONMP(np);
2769 int error = 0, commit, iomode, iomode2, len, pg, count, npages, off;
2770 uint32_t dirty = bp->nb_dirty;
2771 uint64_t wverf;
2772 uio_t auio;
2773 char uio_buf[UIO_SIZEOF(1)];
2774
2775 if (!bp->nb_dirty) {
2776 return 0;
2777 }
2778
2779 /* there are pages marked dirty that need to be written out */
2780 OSAddAtomic64(1, &nfsstats.write_bios);
2781 NFS_BUF_MAP(bp);
2782 SET(bp->nb_flags, NB_WRITEINPROG);
2783 npages = bp->nb_bufsize / PAGE_SIZE;
2784 iomode = NFS_WRITE_UNSTABLE;
2785
2786 auio = uio_createwithbuffer(1, 0, UIO_SYSSPACE, UIO_WRITE,
2787 &uio_buf, sizeof(uio_buf));
2788
2789 again:
2790 dirty = bp->nb_dirty;
2791 wverf = bp->nb_verf;
2792 commit = NFS_WRITE_FILESYNC;
2793 for (pg = 0; pg < npages; pg++) {
2794 if (!NBPGDIRTY(bp, pg)) {
2795 continue;
2796 }
2797 count = 1;
2798 while (((pg + count) < npages) && NBPGDIRTY(bp, pg + count)) {
2799 count++;
2800 }
2801 /* write count pages starting with page pg */
2802 off = pg * PAGE_SIZE;
2803 len = count * PAGE_SIZE;
2804 /* clip writes to EOF */
2805 if (NBOFF(bp) + off + len > (off_t) np->n_size) {
2806 len -= (NBOFF(bp) + off + len) - np->n_size;
2807 }
2808 if (len > 0) {
2809 iomode2 = iomode;
2810 uio_reset(auio, NBOFF(bp) + off, UIO_SYSSPACE, UIO_WRITE);
2811 uio_addiov(auio, CAST_USER_ADDR_T(bp->nb_data + off), len);
2812 error = nfs_write_rpc2(np, auio, thd, cred, &iomode2, &bp->nb_verf);
2813 if (error) {
2814 break;
2815 }
2816 if (iomode2 < commit) { /* Retain the lowest commitment level returned. */
2817 commit = iomode2;
2818 }
2819 if ((commit != NFS_WRITE_FILESYNC) && (wverf != bp->nb_verf)) {
2820 /* verifier changed, redo all the writes filesync */
2821 iomode = NFS_WRITE_FILESYNC;
2822 goto again;
2823 }
2824 }
2825 /* clear dirty bits */
2826 while (count--) {
2827 dirty &= ~(1 << pg);
2828 if (count) { /* leave pg on last page */
2829 pg++;
2830 }
2831 }
2832 }
2833 CLR(bp->nb_flags, NB_WRITEINPROG);
2834
2835 if (!error && (commit != NFS_WRITE_FILESYNC)) {
2836 error = nmp->nm_funcs->nf_commit_rpc(np, NBOFF(bp), bp->nb_bufsize, cred, wverf);
2837 if (error == NFSERR_STALEWRITEVERF) {
2838 /* verifier changed, so we need to restart all the writes */
2839 iomode = NFS_WRITE_FILESYNC;
2840 goto again;
2841 }
2842 }
2843 if (!error) {
2844 bp->nb_dirty = dirty;
2845 } else {
2846 SET(bp->nb_flags, NB_ERROR);
2847 bp->nb_error = error;
2848 }
2849 return error;
2850 }
2851
2852 /*
2853 * initiate the NFS WRITE RPC(s) for a buffer
2854 */
2855 int
2856 nfs_buf_write_rpc(struct nfsbuf *bp, int iomode, thread_t thd, kauth_cred_t cred)
2857 {
2858 struct nfsmount *nmp;
2859 nfsnode_t np = bp->nb_np;
2860 int error = 0, nfsvers, async;
2861 int offset, nrpcs;
2862 uint32_t nmwsize, length, len;
2863 struct nfsreq *req;
2864 struct nfsreq_cbinfo cb;
2865 uio_t auio;
2866 char uio_buf[UIO_SIZEOF(1)];
2867
2868 nmp = NFSTONMP(np);
2869 if (nfs_mount_gone(nmp)) {
2870 bp->nb_error = error = ENXIO;
2871 SET(bp->nb_flags, NB_ERROR);
2872 nfs_buf_iodone(bp);
2873 return error;
2874 }
2875 nfsvers = nmp->nm_vers;
2876 nmwsize = nmp->nm_wsize;
2877
2878 offset = bp->nb_offio;
2879 length = bp->nb_endio - bp->nb_offio;
2880
2881 /* Note: Can only do async I/O if nfsiods are configured. */
2882 async = (bp->nb_flags & NB_ASYNC) && (NFSIOD_MAX > 0);
2883 bp->nb_commitlevel = NFS_WRITE_FILESYNC;
2884 cb.rcb_func = async ? nfs_buf_write_rpc_finish : NULL;
2885 cb.rcb_bp = bp;
2886
2887 if ((nfsvers == NFS_VER2) && ((NBOFF(bp) + bp->nb_endio) > 0xffffffffLL)) {
2888 bp->nb_error = error = EFBIG;
2889 SET(bp->nb_flags, NB_ERROR);
2890 nfs_buf_iodone(bp);
2891 return error;
2892 }
2893
2894 auio = uio_createwithbuffer(1, NBOFF(bp) + offset, UIO_SYSSPACE,
2895 UIO_WRITE, &uio_buf, sizeof(uio_buf));
2896 uio_addiov(auio, CAST_USER_ADDR_T(bp->nb_data + offset), length);
2897
2898 bp->nb_rpcs = nrpcs = (length + nmwsize - 1) / nmwsize;
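/* e.g. (illustrative values) a 20KB dirty range with nm_wsize = 8KB splits into 3 write RPCs */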
2899 if (async && (nrpcs > 1)) {
2900 SET(bp->nb_flags, NB_MULTASYNCRPC);
2901 } else {
2902 CLR(bp->nb_flags, NB_MULTASYNCRPC);
2903 }
2904
2905 while (length > 0) {
2906 if (ISSET(bp->nb_flags, NB_ERROR)) {
2907 error = bp->nb_error;
2908 break;
2909 }
2910 len = (length > nmwsize) ? nmwsize : length;
2911 cb.rcb_args[0] = offset;
2912 cb.rcb_args[1] = len;
2913 if (nmp->nm_vers >= NFS_VER4) {
2914 cb.rcb_args[2] = nmp->nm_stategenid;
2915 }
2916 if (async && ((error = nfs_async_write_start(nmp)))) {
2917 break;
2918 }
2919 req = NULL;
2920 error = nmp->nm_funcs->nf_write_rpc_async(np, auio, len, thd, cred,
2921 iomode, &cb, &req);
2922 if (error) {
2923 if (async) {
2924 nfs_async_write_done(nmp);
2925 }
2926 break;
2927 }
2928 offset += len;
2929 length -= len;
2930 if (async) {
2931 continue;
2932 }
2933 nfs_buf_write_rpc_finish(req);
2934 }
2935
2936 if (length > 0) {
2937 /*
2938 * Something bad happened while trying to send the RPCs.
2939 * Wait for any outstanding requests to complete.
2940 */
2941 bp->nb_error = error;
2942 SET(bp->nb_flags, NB_ERROR);
2943 if (ISSET(bp->nb_flags, NB_MULTASYNCRPC)) {
2944 nrpcs = (length + nmwsize - 1) / nmwsize;
2945 lck_mtx_lock(nfs_buf_mutex);
2946 bp->nb_rpcs -= nrpcs;
2947 if (bp->nb_rpcs == 0) {
2948 /* No RPCs left, so the buffer's done */
2949 lck_mtx_unlock(nfs_buf_mutex);
2950 nfs_buf_write_finish(bp, thd, cred);
2951 } else {
2952 /* wait for the last RPC to mark it done */
2953 while (bp->nb_rpcs > 0) {
2954 msleep(&bp->nb_rpcs, nfs_buf_mutex, 0,
2955 "nfs_buf_write_rpc_cancel", NULL);
2956 }
2957 lck_mtx_unlock(nfs_buf_mutex);
2958 }
2959 } else {
2960 nfs_buf_write_finish(bp, thd, cred);
2961 }
2962 /* It may have just been an interrupt... that's OK */
2963 if (!ISSET(bp->nb_flags, NB_ERROR)) {
2964 error = 0;
2965 }
2966 }
2967
2968 return error;
2969 }
2970
2971 /*
2972 * finish up an NFS WRITE RPC on a buffer
2973 */
2974 void
2975 nfs_buf_write_rpc_finish(struct nfsreq *req)
2976 {
2977 int error = 0, nfsvers, offset, length, multasyncrpc, finished;
2978 int committed = NFS_WRITE_FILESYNC;
2979 uint64_t wverf = 0;
2980 size_t rlen;
2981 void *wakeme = NULL;
2982 struct nfsreq_cbinfo cb;
2983 struct nfsreq *wreq = NULL;
2984 struct nfsbuf *bp;
2985 struct nfsmount *nmp;
2986 nfsnode_t np;
2987 thread_t thd;
2988 kauth_cred_t cred;
2989 uio_t auio;
2990 char uio_buf[UIO_SIZEOF(1)];
2991
2992 finish:
2993 np = req->r_np;
2994 thd = req->r_thread;
2995 cred = req->r_cred;
2996 if (IS_VALID_CRED(cred)) {
2997 kauth_cred_ref(cred);
2998 }
2999 cb = req->r_callback;
3000 bp = cb.rcb_bp;
3001 if (cb.rcb_func) { /* take an extra reference on the nfsreq in case we want to resend it later due to grace error */
3002 nfs_request_ref(req, 0);
3003 }
3004
3005 nmp = NFSTONMP(np);
3006 if (nfs_mount_gone(nmp)) {
3007 SET(bp->nb_flags, NB_ERROR);
3008 bp->nb_error = error = ENXIO;
3009 }
3010 if (error || ISSET(bp->nb_flags, NB_ERROR)) {
3011 /* just drop it */
3012 nfs_request_async_cancel(req);
3013 goto out;
3014 }
3015 nfsvers = nmp->nm_vers;
3016
3017 offset = cb.rcb_args[0];
3018 rlen = length = cb.rcb_args[1];
3019
3020 /* finish the RPC */
3021 error = nmp->nm_funcs->nf_write_rpc_async_finish(np, req, &committed, &rlen, &wverf);
3022 if ((error == EINPROGRESS) && cb.rcb_func) {
3023 /* async request restarted */
3024 if (cb.rcb_func) {
3025 nfs_request_rele(req);
3026 }
3027 if (IS_VALID_CRED(cred)) {
3028 kauth_cred_unref(&cred);
3029 }
3030 return;
3031 }
3032 if ((nmp->nm_vers >= NFS_VER4) && nfs_mount_state_error_should_restart(error) && !ISSET(bp->nb_flags, NB_ERROR)) {
3033 lck_mtx_lock(&nmp->nm_lock);
3034 if ((error != NFSERR_OLD_STATEID) && (error != NFSERR_GRACE) && (cb.rcb_args[2] == nmp->nm_stategenid)) {
3035 NP(np, "nfs_buf_write_rpc_finish: error %d @ 0x%llx, 0x%x 0x%x, initiating recovery",
3036 error, NBOFF(bp) + offset, cb.rcb_args[2], nmp->nm_stategenid);
3037 nfs_need_recover(nmp, error);
3038 }
3039 lck_mtx_unlock(&nmp->nm_lock);
3040 if (np->n_flag & NREVOKE) {
3041 error = EIO;
3042 } else {
3043 if (error == NFSERR_GRACE) {
3044 if (cb.rcb_func) {
3045 /*
3046 * For an async I/O request, handle a grace delay just like
3047 * jukebox errors. Set the resend time and queue it up.
3048 */
3049 struct timeval now;
3050 if (req->r_nmrep.nmc_mhead) {
3051 mbuf_freem(req->r_nmrep.nmc_mhead);
3052 req->r_nmrep.nmc_mhead = NULL;
3053 }
3054 req->r_error = 0;
3055 microuptime(&now);
3056 lck_mtx_lock(&req->r_mtx);
3057 req->r_resendtime = now.tv_sec + 2;
3058 req->r_xid = 0; // get a new XID
3059 req->r_flags |= R_RESTART;
3060 req->r_start = 0;
3061 nfs_asyncio_resend(req);
3062 lck_mtx_unlock(&req->r_mtx);
3063 if (IS_VALID_CRED(cred)) {
3064 kauth_cred_unref(&cred);
3065 }
3066 /* Note: nfsreq reference taken will be dropped later when finished */
3067 return;
3068 }
3069 /* otherwise, just pause a couple seconds and retry */
3070 tsleep(&nmp->nm_state, (PZERO - 1), "nfsgrace", 2 * hz);
3071 }
3072 if (!(error = nfs_mount_state_wait_for_recovery(nmp))) {
3073 rlen = 0;
3074 goto writeagain;
3075 }
3076 }
3077 }
3078 if (error) {
3079 SET(bp->nb_flags, NB_ERROR);
3080 bp->nb_error = error;
3081 }
3082 if (error || (nfsvers == NFS_VER2)) {
3083 goto out;
3084 }
3085 if (rlen <= 0) {
3086 SET(bp->nb_flags, NB_ERROR);
3087 bp->nb_error = error = EIO;
3088 goto out;
3089 }
3090
3091 /* save lowest commit level returned */
3092 if (committed < bp->nb_commitlevel) {
3093 bp->nb_commitlevel = committed;
3094 }
3095
3096 /* check the write verifier */
3097 if (!bp->nb_verf) {
3098 bp->nb_verf = wverf;
3099 } else if (bp->nb_verf != wverf) {
3100 /* verifier changed, so buffer will need to be rewritten */
3101 bp->nb_flags |= NB_STALEWVERF;
3102 bp->nb_commitlevel = NFS_WRITE_UNSTABLE;
3103 bp->nb_verf = wverf;
3104 }
3105
3106 /*
3107 * check for a short write
3108 *
3109 * If the server didn't write all the data, then we
3110 * need to issue another write for the rest of it.
3111 * (Don't bother if the buffer hit an error or stale wverf.)
3112 */
3113 if (((int)rlen < length) && !(bp->nb_flags & (NB_STALEWVERF | NB_ERROR))) {
3114 writeagain:
3115 offset += rlen;
3116 length -= rlen;
3117
3118 auio = uio_createwithbuffer(1, NBOFF(bp) + offset, UIO_SYSSPACE,
3119 UIO_WRITE, &uio_buf, sizeof(uio_buf));
3120 uio_addiov(auio, CAST_USER_ADDR_T(bp->nb_data + offset), length);
3121
3122 cb.rcb_args[0] = offset;
3123 cb.rcb_args[1] = length;
3124 if (nmp->nm_vers >= NFS_VER4) {
3125 cb.rcb_args[2] = nmp->nm_stategenid;
3126 }
3127
3128 // XXX iomode should really match the original request
3129 error = nmp->nm_funcs->nf_write_rpc_async(np, auio, length, thd, cred,
3130 NFS_WRITE_FILESYNC, &cb, &wreq);
3131 if (!error) {
3132 if (IS_VALID_CRED(cred)) {
3133 kauth_cred_unref(&cred);
3134 }
3135 if (!cb.rcb_func) {
3136 /* if !async we'll need to wait for this RPC to finish */
3137 req = wreq;
3138 wreq = NULL;
3139 goto finish;
3140 }
3141 nfs_request_rele(req);
3142 /*
3143 * We're done here.
3144 * Outstanding RPC count is unchanged.
3145 * Callback will be called when RPC is done.
3146 */
3147 return;
3148 }
3149 SET(bp->nb_flags, NB_ERROR);
3150 bp->nb_error = error;
3151 }
3152
3153 out:
3154 if (cb.rcb_func) {
3155 nfs_async_write_done(nmp);
3156 nfs_request_rele(req);
3157 }
3158 /*
3159 * Decrement outstanding RPC count on buffer
3160 * and call nfs_buf_write_finish on last RPC.
3161 *
3162 * (Note: when there are multiple async RPCs issued for a
3163 * buffer we need nfs_buffer_mutex to avoid problems when
3164 * aborting a partially-initiated set of RPCs)
3165 */
3166 multasyncrpc = ISSET(bp->nb_flags, NB_MULTASYNCRPC);
3167 if (multasyncrpc) {
3168 lck_mtx_lock(nfs_buf_mutex);
3169 }
3170
3171 bp->nb_rpcs--;
3172 finished = (bp->nb_rpcs == 0);
3173
3174 if (multasyncrpc) {
3175 lck_mtx_unlock(nfs_buf_mutex);
3176 }
3177
3178 if (finished) {
3179 if (multasyncrpc) {
3180 wakeme = &bp->nb_rpcs;
3181 }
3182 nfs_buf_write_finish(bp, thd, cred);
3183 if (wakeme) {
3184 wakeup(wakeme);
3185 }
3186 }
3187
3188 if (IS_VALID_CRED(cred)) {
3189 kauth_cred_unref(&cred);
3190 }
3191 }
3192
3193 /*
3194 * Send commit(s) for the given node's "needcommit" buffers
3195 */
3196 int
3197 nfs_flushcommits(nfsnode_t np, int nowait)
3198 {
3199 struct nfsmount *nmp;
3200 struct nfsbuf *bp, *prevlbp, *lbp;
3201 struct nfsbuflists blist, commitlist;
3202 int error = 0, retv, wcred_set, flags, dirty;
3203 u_quad_t off, endoff, toff;
3204 uint64_t wverf;
3205 u_int32_t count;
3206 kauth_cred_t wcred = NULL;
3207
3208 FSDBG_TOP(557, np, 0, 0, 0);
3209
3210 /*
3211 * A nb_flags == (NB_DELWRI | NB_NEEDCOMMIT) block has been written to the
3212 * server, but has not been committed to stable storage on the server
3213 * yet. The byte range is worked out for as many nfsbufs as we can handle
3214 * and the commit RPC is done.
3215 */
3216 if (!LIST_EMPTY(&np->n_dirtyblkhd)) {
3217 error = nfs_node_lock(np);
3218 if (error) {
3219 goto done;
3220 }
3221 np->n_flag |= NMODIFIED;
3222 nfs_node_unlock(np);
3223 }
3224
3225 off = (u_quad_t)-1;
3226 endoff = 0;
3227 wcred_set = 0;
3228 LIST_INIT(&commitlist);
3229
3230 nmp = NFSTONMP(np);
3231 if (nfs_mount_gone(nmp)) {
3232 error = ENXIO;
3233 goto done;
3234 }
3235 if (nmp->nm_vers == NFS_VER2) {
3236 error = EINVAL;
3237 goto done;
3238 }
3239
3240 flags = NBI_DIRTY;
3241 if (nowait) {
3242 flags |= NBI_NOWAIT;
3243 }
3244 lck_mtx_lock(nfs_buf_mutex);
3245 wverf = nmp->nm_verf;
3246 if (!nfs_buf_iterprepare(np, &blist, flags)) {
3247 while ((bp = LIST_FIRST(&blist))) {
3248 LIST_REMOVE(bp, nb_vnbufs);
3249 LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
3250 error = nfs_buf_acquire(bp, NBAC_NOWAIT, 0, 0);
3251 if (error) {
3252 continue;
3253 }
3254 if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
3255 nfs_buf_check_write_verifier(np, bp);
3256 }
3257 if (((bp->nb_flags & (NB_DELWRI | NB_NEEDCOMMIT)) != (NB_DELWRI | NB_NEEDCOMMIT)) ||
3258 (bp->nb_verf != wverf)) {
3259 nfs_buf_drop(bp);
3260 continue;
3261 }
3262 nfs_buf_remfree(bp);
3263
3264 /* buffer UPLs will be grabbed *in order* below */
3265
3266 FSDBG(557, bp, bp->nb_flags, bp->nb_valid, bp->nb_dirty);
3267 FSDBG(557, bp->nb_validoff, bp->nb_validend,
3268 bp->nb_dirtyoff, bp->nb_dirtyend);
3269
3270 /*
3271 * Work out if all buffers are using the same cred
3272 * so we can deal with them all with one commit.
3273 *
3274 * Note: creds in bp's must be obtained by kauth_cred_ref
3275 * on the same original cred in order for them to be equal.
3276 */
3277 if (wcred_set == 0) {
3278 wcred = bp->nb_wcred;
3279 if (!IS_VALID_CRED(wcred)) {
3280 panic("nfs: needcommit w/out wcred");
3281 }
3282 wcred_set = 1;
3283 } else if ((wcred_set == 1) && wcred != bp->nb_wcred) {
3284 wcred_set = -1;
3285 }
3286 SET(bp->nb_flags, NB_WRITEINPROG);
3287
3288 /*
3289 * Add this buffer to the list of buffers we are committing.
3290 * Buffers are inserted into the list in ascending order so that
3291 * we can take the UPLs in order after the list is complete.
3292 */
3293 prevlbp = NULL;
3294 LIST_FOREACH(lbp, &commitlist, nb_vnbufs) {
3295 if (bp->nb_lblkno < lbp->nb_lblkno) {
3296 break;
3297 }
3298 prevlbp = lbp;
3299 }
3300 LIST_REMOVE(bp, nb_vnbufs);
3301 if (prevlbp) {
3302 LIST_INSERT_AFTER(prevlbp, bp, nb_vnbufs);
3303 } else {
3304 LIST_INSERT_HEAD(&commitlist, bp, nb_vnbufs);
3305 }
3306
3307 /* update commit range start, end */
3308 toff = NBOFF(bp) + bp->nb_dirtyoff;
3309 if (toff < off) {
3310 off = toff;
3311 }
3312 toff += (u_quad_t)(bp->nb_dirtyend - bp->nb_dirtyoff);
3313 if (toff > endoff) {
3314 endoff = toff;
3315 }
3316 }
3317 nfs_buf_itercomplete(np, &blist, NBI_DIRTY);
3318 }
3319 lck_mtx_unlock(nfs_buf_mutex);
3320
3321 if (LIST_EMPTY(&commitlist)) {
3322 error = ENOBUFS;
3323 goto done;
3324 }
3325
3326 /*
3327 * We need a UPL to prevent others from accessing the buffers during
3328 * our commit RPC(s).
3329 *
3330 * We used to also check for dirty pages here; if there were any we'd
3331 * abort the commit and force the entire buffer to be written again.
3332 * Instead of doing that, we just go ahead and commit the dirty range,
3333 * and then leave the buffer around with dirty pages that will be
3334 * written out later.
3335 */
3336 LIST_FOREACH(bp, &commitlist, nb_vnbufs) {
3337 if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
3338 retv = nfs_buf_upl_setup(bp);
3339 if (retv) {
3340 /* Unable to create the UPL, the VM object probably no longer exists. */
3341 printf("nfs_flushcommits: upl create failed %d\n", retv);
3342 bp->nb_valid = bp->nb_dirty = 0;
3343 }
3344 }
3345 nfs_buf_upl_check(bp);
3346 }
3347
3348 /*
3349 * Commit data on the server, as required.
3350 * If all bufs are using the same wcred, then use that with
3351 * one call for all of them, otherwise commit each one
3352 * separately.
3353 */
3354 if (wcred_set == 1) {
3355 /*
3356 * Note, it's possible the commit range could be >2^32-1.
3357 * If it is, we'll send one commit that covers the whole file.
3358 */
3359 if ((endoff - off) > 0xffffffff) {
3360 count = 0;
3361 } else {
3362 count = (endoff - off);
3363 }
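/*
 * A count of 0 in the COMMIT request asks the server to commit from
 * the given offset through the end of the file (per RFC 1813).
 */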
3364 retv = nmp->nm_funcs->nf_commit_rpc(np, off, count, wcred, wverf);
3365 } else {
3366 retv = 0;
3367 LIST_FOREACH(bp, &commitlist, nb_vnbufs) {
3368 toff = NBOFF(bp) + bp->nb_dirtyoff;
3369 count = bp->nb_dirtyend - bp->nb_dirtyoff;
3370 retv = nmp->nm_funcs->nf_commit_rpc(np, toff, count, bp->nb_wcred, wverf);
3371 if (retv) {
3372 break;
3373 }
3374 }
3375 }
3376
3377 /*
3378 * Now, either mark the blocks I/O done or mark the
3379 * blocks dirty, depending on whether the commit
3380 * succeeded.
3381 */
3382 while ((bp = LIST_FIRST(&commitlist))) {
3383 LIST_REMOVE(bp, nb_vnbufs);
3384 FSDBG(557, bp, retv, bp->nb_flags, bp->nb_dirty);
3385 nfs_node_lock_force(np);
3386 CLR(bp->nb_flags, (NB_NEEDCOMMIT | NB_WRITEINPROG));
3387 np->n_needcommitcnt--;
3388 CHECK_NEEDCOMMITCNT(np);
3389 nfs_node_unlock(np);
3390
3391 if (retv) {
3392 /* move back to dirty list */
3393 lck_mtx_lock(nfs_buf_mutex);
3394 LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
3395 lck_mtx_unlock(nfs_buf_mutex);
3396 nfs_buf_release(bp, 1);
3397 continue;
3398 }
3399
3400 nfs_node_lock_force(np);
3401 np->n_numoutput++;
3402 nfs_node_unlock(np);
3403 vnode_startwrite(NFSTOV(np));
3404 if (ISSET(bp->nb_flags, NB_DELWRI)) {
3405 lck_mtx_lock(nfs_buf_mutex);
3406 nfs_nbdwrite--;
3407 NFSBUFCNTCHK();
3408 lck_mtx_unlock(nfs_buf_mutex);
3409 wakeup(&nfs_nbdwrite);
3410 }
3411 CLR(bp->nb_flags, (NB_READ | NB_DONE | NB_ERROR | NB_DELWRI));
3412 /* if block still has dirty pages, we don't want it to */
3413 /* be released in nfs_buf_iodone(). So, don't set NB_ASYNC. */
3414 if (!(dirty = bp->nb_dirty)) {
3415 SET(bp->nb_flags, NB_ASYNC);
3416 } else {
3417 CLR(bp->nb_flags, NB_ASYNC);
3418 }
3419
3420 /* move to clean list */
3421 lck_mtx_lock(nfs_buf_mutex);
3422 LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
3423 lck_mtx_unlock(nfs_buf_mutex);
3424
3425 bp->nb_dirtyoff = bp->nb_dirtyend = 0;
3426
3427 nfs_buf_iodone(bp);
3428 if (dirty) {
3429 /* throw it back in as a delayed write buffer */
3430 CLR(bp->nb_flags, NB_DONE);
3431 nfs_buf_write_delayed(bp);
3432 }
3433 }
3434
3435 done:
3436 FSDBG_BOT(557, np, 0, 0, error);
3437 return error;
3438 }
3439
3440 /*
3441 * Flush all the blocks associated with a vnode.
3442 * Walk through the buffer pool and push any dirty pages
3443 * associated with the vnode.
3444 */
3445 int
3446 nfs_flush(nfsnode_t np, int waitfor, thread_t thd, int ignore_writeerr)
3447 {
3448 struct nfsbuf *bp;
3449 struct nfsbuflists blist;
3450 struct nfsmount *nmp = NFSTONMP(np);
3451 int error = 0, error2, slptimeo = 0, slpflag = 0;
3452 int nfsvers, flags, passone = 1;
3453
3454 FSDBG_TOP(517, np, waitfor, ignore_writeerr, 0);
3455
3456 if (nfs_mount_gone(nmp)) {
3457 error = ENXIO;
3458 goto out;
3459 }
3460 nfsvers = nmp->nm_vers;
3461 if (NMFLAG(nmp, INTR)) {
3462 slpflag = PCATCH;
3463 }
3464
3465 if (!LIST_EMPTY(&np->n_dirtyblkhd)) {
3466 nfs_node_lock_force(np);
3467 np->n_flag |= NMODIFIED;
3468 nfs_node_unlock(np);
3469 }
3470
3471 lck_mtx_lock(nfs_buf_mutex);
3472 while (np->n_bflag & NBFLUSHINPROG) {
3473 np->n_bflag |= NBFLUSHWANT;
3474 error = msleep(&np->n_bflag, nfs_buf_mutex, slpflag, "nfs_flush", NULL);
3475 if ((error && (error != EWOULDBLOCK)) ||
3476 ((error = nfs_sigintr(NFSTONMP(np), NULL, thd, 0)))) {
3477 lck_mtx_unlock(nfs_buf_mutex);
3478 goto out;
3479 }
3480 }
3481 np->n_bflag |= NBFLUSHINPROG;
3482
3483 /*
3484 * On the first pass, start async/unstable writes on all
3485 * delayed write buffers. Then wait for all writes to complete
3486 * and call nfs_flushcommits() to commit any uncommitted buffers.
3487 * On all subsequent passes, start STABLE writes on any remaining
3488 * dirty buffers. Then wait for all writes to complete.
3489 */
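/*
 * (passone is cleared after the first pass; the second pass sets
 * NB_STABLE on the buffers so nfs_buf_write() uses FILESYNC)
 */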
3490 again:
3491 FSDBG(518, LIST_FIRST(&np->n_dirtyblkhd), np->n_flag, 0, 0);
3492 if (!NFSTONMP(np)) {
3493 lck_mtx_unlock(nfs_buf_mutex);
3494 error = ENXIO;
3495 goto done;
3496 }
3497
3498 /* Start/do any write(s) that are required. */
3499 if (!nfs_buf_iterprepare(np, &blist, NBI_DIRTY)) {
3500 while ((bp = LIST_FIRST(&blist))) {
3501 LIST_REMOVE(bp, nb_vnbufs);
3502 LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
3503 flags = (passone || !(waitfor == MNT_WAIT || waitfor == MNT_DWAIT)) ? NBAC_NOWAIT : 0;
3504 if (flags != NBAC_NOWAIT) {
3505 nfs_buf_refget(bp);
3506 }
3507 while ((error = nfs_buf_acquire(bp, flags, slpflag, slptimeo))) {
3508 FSDBG(524, bp, flags, bp->nb_lflags, bp->nb_flags);
3509 if (error == EBUSY) {
3510 break;
3511 }
3512 if (error) {
3513 error2 = nfs_sigintr(NFSTONMP(np), NULL, thd, 0);
3514 if (error2) {
3515 if (flags != NBAC_NOWAIT) {
3516 nfs_buf_refrele(bp);
3517 }
3518 nfs_buf_itercomplete(np, &blist, NBI_DIRTY);
3519 lck_mtx_unlock(nfs_buf_mutex);
3520 error = error2;
3521 goto done;
3522 }
3523 if (slpflag == PCATCH) {
3524 slpflag = 0;
3525 slptimeo = 2 * hz;
3526 }
3527 }
3528 }
3529 if (flags != NBAC_NOWAIT) {
3530 nfs_buf_refrele(bp);
3531 }
3532 if (error == EBUSY) {
3533 continue;
3534 }
3535 if (!bp->nb_np) {
3536 /* buffer is no longer valid */
3537 nfs_buf_drop(bp);
3538 continue;
3539 }
3540 if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
3541 nfs_buf_check_write_verifier(np, bp);
3542 }
3543 if (!ISSET(bp->nb_flags, NB_DELWRI)) {
3544 /* buffer is no longer dirty */
3545 nfs_buf_drop(bp);
3546 continue;
3547 }
3548 FSDBG(525, bp, passone, bp->nb_lflags, bp->nb_flags);
3549 if ((passone || !(waitfor == MNT_WAIT || waitfor == MNT_DWAIT)) &&
3550 ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
3551 nfs_buf_drop(bp);
3552 continue;
3553 }
3554 nfs_buf_remfree(bp);
3555 lck_mtx_unlock(nfs_buf_mutex);
3556 if (ISSET(bp->nb_flags, NB_ERROR)) {
3557 nfs_node_lock_force(np);
3558 np->n_error = bp->nb_error ? bp->nb_error : EIO;
3559 np->n_flag |= NWRITEERR;
3560 nfs_node_unlock(np);
3561 nfs_buf_release(bp, 1);
3562 lck_mtx_lock(nfs_buf_mutex);
3563 continue;
3564 }
3565 SET(bp->nb_flags, NB_ASYNC);
3566 if (!passone) {
3567 /* NB_STABLE forces this to be written FILESYNC */
3568 SET(bp->nb_flags, NB_STABLE);
3569 }
3570 nfs_buf_write(bp);
3571 lck_mtx_lock(nfs_buf_mutex);
3572 }
3573 nfs_buf_itercomplete(np, &blist, NBI_DIRTY);
3574 }
3575 lck_mtx_unlock(nfs_buf_mutex);
3576
3577 if (waitfor == MNT_WAIT || waitfor == MNT_DWAIT) {
3578 while ((error = vnode_waitforwrites(NFSTOV(np), 0, slpflag, slptimeo, "nfsflush"))) {
3579 error2 = nfs_sigintr(NFSTONMP(np), NULL, thd, 0);
3580 if (error2) {
3581 error = error2;
3582 goto done;
3583 }
3584 if (slpflag == PCATCH) {
3585 slpflag = 0;
3586 slptimeo = 2 * hz;
3587 }
3588 }
3589 }
3590
3591 if (nfsvers != NFS_VER2) {
3592 /* loop while it looks like there are still buffers to be */
3593 /* committed and nfs_flushcommits() seems to be handling them. */
3594 while (np->n_needcommitcnt) {
3595 if (nfs_flushcommits(np, 0)) {
3596 break;
3597 }
3598 }
3599 }
3600
3601 if (passone) {
3602 passone = 0;
3603 if (!LIST_EMPTY(&np->n_dirtyblkhd)) {
3604 nfs_node_lock_force(np);
3605 np->n_flag |= NMODIFIED;
3606 nfs_node_unlock(np);
3607 }
3608 lck_mtx_lock(nfs_buf_mutex);
3609 goto again;
3610 }
3611
3612 if (waitfor == MNT_WAIT || waitfor == MNT_DWAIT) {
3613 if (!LIST_EMPTY(&np->n_dirtyblkhd)) {
3614 nfs_node_lock_force(np);
3615 np->n_flag |= NMODIFIED;
3616 nfs_node_unlock(np);
3617 }
3618 lck_mtx_lock(nfs_buf_mutex);
3619 if (!LIST_EMPTY(&np->n_dirtyblkhd)) {
3620 goto again;
3621 }
3622 lck_mtx_unlock(nfs_buf_mutex);
3623 nfs_node_lock_force(np);
3624 /*
3625 * OK, it looks like there are no dirty blocks. If we have no
3626 * writes in flight and no one in the write code, we can clear
3627 * the modified flag. In order to make sure we see the latest
3628 * attributes and size, we also invalidate the attributes and
3629 * advance the attribute cache XID to guarantee that attributes
3630 * newer than our clearing of NMODIFIED will get loaded next.
3631 * (If we don't do this, it's possible for the flush's final
3632 * write/commit (xid1) to be executed in parallel with a subsequent
3633 * getattr request (xid2). The getattr could return attributes
3634 * from *before* the write/commit completed but the stale attributes
3635 * would be preferred because of the xid ordering.)
3636 */
3637 if (!np->n_wrbusy && !np->n_numoutput) {
3638 np->n_flag &= ~NMODIFIED;
3639 NATTRINVALIDATE(np);
3640 nfs_get_xid(&np->n_xid);
3641 }
3642 } else {
3643 nfs_node_lock_force(np);
3644 }
3645
3646 FSDBG(526, np->n_flag, np->n_error, 0, 0);
3647 if (!ignore_writeerr && (np->n_flag & NWRITEERR)) {
3648 error = np->n_error;
3649 np->n_flag &= ~NWRITEERR;
3650 }
3651 nfs_node_unlock(np);
3652 done:
3653 lck_mtx_lock(nfs_buf_mutex);
3654 flags = np->n_bflag;
3655 np->n_bflag &= ~(NBFLUSHINPROG | NBFLUSHWANT);
3656 lck_mtx_unlock(nfs_buf_mutex);
3657 if (flags & NBFLUSHWANT) {
3658 wakeup(&np->n_bflag);
3659 }
3660 out:
3661 FSDBG_BOT(517, np, error, ignore_writeerr, 0);
3662 return error;
3663 }
3664
3665 /*
3666 * Flush out and invalidate all buffers associated with a vnode.
3667 * Called with the underlying object locked.
3668 */
3669 int
3670 nfs_vinvalbuf_internal(
3671 nfsnode_t np,
3672 int flags,
3673 thread_t thd,
3674 kauth_cred_t cred,
3675 int slpflag,
3676 int slptimeo)
3677 {
3678 struct nfsbuf *bp;
3679 struct nfsbuflists blist;
3680 int list, error = 0;
3681
3682 if (flags & V_SAVE) {
3683 if ((error = nfs_flush(np, MNT_WAIT, thd, (flags & V_IGNORE_WRITEERR)))) {
3684 return error;
3685 }
3686 }
3687
3688 lck_mtx_lock(nfs_buf_mutex);
3689 for (;;) {
3690 list = NBI_CLEAN;
3691 if (nfs_buf_iterprepare(np, &blist, list)) {
3692 list = NBI_DIRTY;
3693 if (nfs_buf_iterprepare(np, &blist, list)) {
3694 break;
3695 }
3696 }
3697 while ((bp = LIST_FIRST(&blist))) {
3698 LIST_REMOVE(bp, nb_vnbufs);
3699 if (list == NBI_CLEAN) {
3700 LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
3701 } else {
3702 LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
3703 }
3704 nfs_buf_refget(bp);
3705 while ((error = nfs_buf_acquire(bp, NBAC_REMOVE, slpflag, slptimeo))) {
3706 FSDBG(556, np, bp, NBOFF(bp), bp->nb_flags);
3707 if (error != EAGAIN) {
3708 FSDBG(554, np, bp, -1, error);
3709 nfs_buf_refrele(bp);
3710 nfs_buf_itercomplete(np, &blist, list);
3711 lck_mtx_unlock(nfs_buf_mutex);
3712 return error;
3713 }
3714 }
3715 nfs_buf_refrele(bp);
3716 FSDBG(554, np, bp, NBOFF(bp), bp->nb_flags);
3717 lck_mtx_unlock(nfs_buf_mutex);
3718 if ((flags & V_SAVE) && UBCINFOEXISTS(NFSTOV(np)) && bp->nb_np &&
3719 (NBOFF(bp) < (off_t)np->n_size)) {
3720 /* extra paranoia: make sure we're not */
3721 /* somehow leaving any dirty data around */
3722 int mustwrite = 0;
3723 int end = (NBOFF(bp) + bp->nb_bufsize > (off_t)np->n_size) ?
3724 ((off_t)np->n_size - NBOFF(bp)) : bp->nb_bufsize;
3725 if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
3726 error = nfs_buf_upl_setup(bp);
3727 if (error == EINVAL) {
3728 /* vm object must no longer exist */
3729 /* hopefully we don't need to do */
3730 /* anything for this buffer */
3731 } else if (error) {
3732 printf("nfs_vinvalbuf: upl setup failed %d\n", error);
3733 }
3734 bp->nb_valid = bp->nb_dirty = 0;
3735 }
3736 nfs_buf_upl_check(bp);
3737 /* check for any dirty data before the EOF */
3738 if ((bp->nb_dirtyend > 0) && (bp->nb_dirtyoff < end)) {
3739 /* clip dirty range to EOF */
3740 if (bp->nb_dirtyend > end) {
3741 bp->nb_dirtyend = end;
3742 if (bp->nb_dirtyoff >= bp->nb_dirtyend) {
3743 bp->nb_dirtyoff = bp->nb_dirtyend = 0;
3744 }
3745 }
3746 if ((bp->nb_dirtyend > 0) && (bp->nb_dirtyoff < end)) {
3747 mustwrite++;
3748 }
3749 }
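/*
 * Drop dirty bits for pages entirely beyond EOF; e.g. (illustrative)
 * if "end" falls within the third page, round_page_32(end) / PAGE_SIZE
 * is 3 and the mask 0x7 keeps only pages 0-2.
 */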
3750 bp->nb_dirty &= (1 << (round_page_32(end) / PAGE_SIZE)) - 1;
3751 if (bp->nb_dirty) {
3752 mustwrite++;
3753 }
3754 /* also make sure we'll have a credential to do the write */
3755 if (mustwrite && !IS_VALID_CRED(bp->nb_wcred) && !IS_VALID_CRED(cred)) {
3756 printf("nfs_vinvalbuf: found dirty buffer with no write creds\n");
3757 mustwrite = 0;
3758 }
3759 if (mustwrite) {
3760 FSDBG(554, np, bp, 0xd00dee, bp->nb_flags);
3761 if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
3762 panic("nfs_vinvalbuf: dirty buffer without upl");
3763 }
3764 /* gotta write out dirty data before invalidating */
3765 /* (NB_STABLE indicates that data writes should be FILESYNC) */
3766 /* (NB_NOCACHE indicates buffer should be discarded) */
3767 CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL | NB_ASYNC));
3768 SET(bp->nb_flags, NB_STABLE | NB_NOCACHE);
3769 if (!IS_VALID_CRED(bp->nb_wcred)) {
3770 kauth_cred_ref(cred);
3771 bp->nb_wcred = cred;
3772 }
3773 error = nfs_buf_write(bp);
3774 // Note: bp has been released
3775 if (error) {
3776 FSDBG(554, bp, 0xd00dee, 0xbad, error);
3777 nfs_node_lock_force(np);
3778 if ((error != EINTR) && (error != ERESTART)) {
3779 np->n_error = error;
3780 np->n_flag |= NWRITEERR;
3781 }
3782 /*
3783 * There was a write error and we need to
3784 * invalidate attrs to sync with server.
3785 * (if this write was extending the file,
3786 * we may no longer know the correct size)
3787 */
3788 NATTRINVALIDATE(np);
3789 nfs_node_unlock(np);
3790 if ((error == EINTR) || (error == ERESTART)) {
3791 /*
3792 * Abort on EINTR. If we don't, we could
3793 * be stuck in this loop forever because
3794 * the buffer will continue to stay dirty.
3795 */
3796 lck_mtx_lock(nfs_buf_mutex);
3797 nfs_buf_itercomplete(np, &blist, list);
3798 lck_mtx_unlock(nfs_buf_mutex);
3799 return error;
3800 }
3801 error = 0;
3802 }
3803 lck_mtx_lock(nfs_buf_mutex);
3804 continue;
3805 }
3806 }
3807 SET(bp->nb_flags, NB_INVAL);
3808 // hold off on FREEUPs until we're done here
3809 nfs_buf_release(bp, 0);
3810 lck_mtx_lock(nfs_buf_mutex);
3811 }
3812 nfs_buf_itercomplete(np, &blist, list);
3813 }
3814 if (!LIST_EMPTY(&(np)->n_dirtyblkhd) || !LIST_EMPTY(&(np)->n_cleanblkhd)) {
3815 panic("nfs_vinvalbuf: flush/inval failed");
3816 }
3817 lck_mtx_unlock(nfs_buf_mutex);
3818 nfs_node_lock_force(np);
3819 if (!(flags & V_SAVE)) {
3820 np->n_flag &= ~NMODIFIED;
3821 }
3822 if (vnode_vtype(NFSTOV(np)) == VREG) {
3823 np->n_lastrahead = -1;
3824 }
3825 nfs_node_unlock(np);
3826 NFS_BUF_FREEUP();
3827 return 0;
3828 }
3829
3830
3831 /*
3832 * Flush and invalidate all dirty buffers. If another process is already
3833 * doing the flush, just wait for completion.
3834 */
3835 int
3836 nfs_vinvalbuf(vnode_t vp, int flags, vfs_context_t ctx, int intrflg)
3837 {
3838 return nfs_vinvalbuf2(vp, flags, vfs_context_thread(ctx), vfs_context_ucred(ctx), intrflg);
3839 }
3840
3841 int
3842 nfs_vinvalbuf2(vnode_t vp, int flags, thread_t thd, kauth_cred_t cred, int intrflg)
3843 {
3844 nfsnode_t np = VTONFS(vp);
3845 struct nfsmount *nmp = VTONMP(vp);
3846 int error, slpflag, slptimeo, nflags, retry = 0;
3847 int ubcflags = UBC_PUSHALL | UBC_SYNC | UBC_INVALIDATE;
3848 struct timespec ts = { 2, 0 };
3849 off_t size;
3850
3851 FSDBG_TOP(554, np, flags, intrflg, 0);
3852
3853 /*
3854 * If the mount is gone, there's no sense trying to write anything
3855 * and hanging while trying to do I/O.
3856 */
3857 if (nfs_mount_gone(nmp)) {
3858 flags &= ~V_SAVE;
3859 ubcflags &= ~UBC_PUSHALL;
3860 }
3861
3862 if (nmp && !NMFLAG(nmp, INTR)) {
3863 intrflg = 0;
3864 }
3865 if (intrflg) {
3866 slpflag = PCATCH;
3867 slptimeo = 2 * hz;
3868 } else {
3869 slpflag = 0;
3870 slptimeo = 0;
3871 }
3872
3873 /* First wait for any other process doing a flush to complete. */
3874 lck_mtx_lock(nfs_buf_mutex);
3875 while (np->n_bflag & NBINVALINPROG) {
3876 np->n_bflag |= NBINVALWANT;
3877 msleep(&np->n_bflag, nfs_buf_mutex, slpflag, "nfs_vinvalbuf", &ts);
3878 if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0))) {
3879 lck_mtx_unlock(nfs_buf_mutex);
3880 return error;
3881 }
3882 if (np->n_bflag & NBINVALINPROG) {
3883 slpflag = 0;
3884 }
3885 }
3886 np->n_bflag |= NBINVALINPROG;
3887 lck_mtx_unlock(nfs_buf_mutex);
3888
3889 /* Now, flush as required. */
3890 again:
3891 error = nfs_vinvalbuf_internal(np, flags, thd, cred, slpflag, 0);
3892 while (error) {
3893 FSDBG(554, np, 0, 0, error);
3894 if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0))) {
3895 goto done;
3896 }
3897 error = nfs_vinvalbuf_internal(np, flags, thd, cred, 0, slptimeo);
3898 }
3899
3900 /* get the pages out of vm also */
3901 if (UBCINFOEXISTS(vp) && (size = ubc_getsize(vp))) {
3902 if ((error = ubc_msync(vp, 0, size, NULL, ubcflags))) {
3903 if (error == EINVAL) {
3904 panic("nfs_vinvalbuf(): ubc_msync failed!, error %d", error);
3905 }
3906 if (retry++ < 10) { /* retry invalidating a few times */
3907 if (retry > 1 || error == ENXIO) {
3908 ubcflags &= ~UBC_PUSHALL;
3909 }
3910 goto again;
3911 }
3912 /* give up */
3913 printf("nfs_vinvalbuf(): ubc_msync failed, error %d\n", error);
3914 }
3915 }
3916 done:
3917 lck_mtx_lock(nfs_buf_mutex);
3918 nflags = np->n_bflag;
3919 np->n_bflag &= ~(NBINVALINPROG | NBINVALWANT);
3920 lck_mtx_unlock(nfs_buf_mutex);
3921 if (nflags & NBINVALWANT) {
3922 wakeup(&np->n_bflag);
3923 }
3924
3925 FSDBG_BOT(554, np, flags, intrflg, error);
3926 return error;
3927 }
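/*
 * Editorial sketch, not part of the original source: the NBINVALINPROG /
 * NBINVALWANT handshake above is a flag-based serialization idiom under
 * nfs_buf_mutex.  In outline (interrupt handling omitted):
 *
 *	lck_mtx_lock(nfs_buf_mutex);
 *	while (np->n_bflag & NBINVALINPROG) {		// another flush in progress
 *		np->n_bflag |= NBINVALWANT;		// ask to be woken up
 *		msleep(&np->n_bflag, nfs_buf_mutex, slpflag, "nfs_vinvalbuf", &ts);
 *	}
 *	np->n_bflag |= NBINVALINPROG;			// we own the flush now
 *	lck_mtx_unlock(nfs_buf_mutex);
 *	// ... flush buffers and ubc_msync() the pages ...
 *	// on the way out: clear both flags and wakeup(&np->n_bflag) if wanted
 *
 * The ubc_msync() step is retried a few times, dropping UBC_PUSHALL after
 * the first failed retry or on ENXIO, before giving up.
 */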
3928
3929 /*
3930 * Wait for any busy buffers to complete.
3931 */
3932 void
3933 nfs_wait_bufs(nfsnode_t np)
3934 {
3935 struct nfsbuf *bp;
3936 struct nfsbuflists blist;
3937 int error = 0;
3938
3939 lck_mtx_lock(nfs_buf_mutex);
3940 if (!nfs_buf_iterprepare(np, &blist, NBI_CLEAN)) {
3941 while ((bp = LIST_FIRST(&blist))) {
3942 LIST_REMOVE(bp, nb_vnbufs);
3943 LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
3944 nfs_buf_refget(bp);
3945 while ((error = nfs_buf_acquire(bp, 0, 0, 0))) {
3946 if (error != EAGAIN) {
3947 nfs_buf_refrele(bp);
3948 nfs_buf_itercomplete(np, &blist, NBI_CLEAN);
3949 lck_mtx_unlock(nfs_buf_mutex);
3950 return;
3951 }
3952 }
3953 nfs_buf_refrele(bp);
3954 nfs_buf_drop(bp);
3955 }
3956 nfs_buf_itercomplete(np, &blist, NBI_CLEAN);
3957 }
3958 if (!nfs_buf_iterprepare(np, &blist, NBI_DIRTY)) {
3959 while ((bp = LIST_FIRST(&blist))) {
3960 LIST_REMOVE(bp, nb_vnbufs);
3961 LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
3962 nfs_buf_refget(bp);
3963 while ((error = nfs_buf_acquire(bp, 0, 0, 0))) {
3964 if (error != EAGAIN) {
3965 nfs_buf_refrele(bp);
3966 nfs_buf_itercomplete(np, &blist, NBI_DIRTY);
3967 lck_mtx_unlock(nfs_buf_mutex);
3968 return;
3969 }
3970 }
3971 nfs_buf_refrele(bp);
3972 nfs_buf_drop(bp);
3973 }
3974 nfs_buf_itercomplete(np, &blist, NBI_DIRTY);
3975 }
3976 lck_mtx_unlock(nfs_buf_mutex);
3977 }
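/*
 * Editorial note, not from the original source: nfs_wait_bufs() "waits" by
 * acquiring and immediately dropping every buffer on both lists; the acquire
 * is retried while it returns EAGAIN, so the loops only complete once no
 * buffer on the node is still busy.  Simplified shape of the inner wait
 * (the real code above cleans up and returns on hard errors):
 *
 *	while ((error = nfs_buf_acquire(bp, 0, 0, 0))) {
 *		if (error != EAGAIN)
 *			break;			// hard error, stop waiting
 *	}
 *	nfs_buf_drop(bp);			// we only wanted to wait, not keep it
 */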
3978
3979
3980 /*
3981 * Add an async I/O request to the mount's async I/O queue and make
3982 * sure that an nfsiod will service it.
3983 */
3984 void
3985 nfs_asyncio_finish(struct nfsreq *req)
3986 {
3987 struct nfsmount *nmp;
3988 struct nfsiod *niod;
3989 int started = 0;
3990
3991 FSDBG_TOP(552, req->r_nmp, 0, 0, 0);
3992 again:
3993 nmp = req->r_nmp;
3994
3995 if (nmp == NULL) {
3996 return;
3997 }
3998
3999 lck_mtx_lock(nfsiod_mutex);
4000 niod = nmp->nm_niod;
4001
4002 /* grab an nfsiod if we don't have one already */
4003 if (!niod) {
4004 niod = TAILQ_FIRST(&nfsiodfree);
4005 if (niod) {
4006 TAILQ_REMOVE(&nfsiodfree, niod, niod_link);
4007 TAILQ_INSERT_TAIL(&nfsiodwork, niod, niod_link);
4008 niod->niod_nmp = nmp;
4009 } else if (((nfsiod_thread_count < NFSIOD_MAX) || (nfsiod_thread_count <= 0)) && (started < 4)) {
4010 /*
4011 * Try starting a new thread.
4012 * We may try a couple times if other callers
4013 * get the new threads before we do.
4014 */
4015 lck_mtx_unlock(nfsiod_mutex);
4016 started++;
4017 if (!nfsiod_start()) {
4018 goto again;
4019 }
4020 lck_mtx_lock(nfsiod_mutex);
4021 }
4022 }
4023
4024 /*
4025 * If we got here while the request is still on the resendq, we need to
4026 * get it off. This happens when the timer fires and nfs_sigintr errors
4027 * out requests, or when we receive a reply (UDP case) while on the resend
4028 * queue; in either case we're just finishing up and will not be resent.
4029 */
4030 lck_mtx_lock(&req->r_mtx);
4031 if (req->r_flags & R_RESENDQ) {
4032 lck_mtx_lock(&nmp->nm_lock);
4033 if (req->r_rchain.tqe_next != NFSREQNOLIST) {
4034 NFS_BIO_DBG("Proccessing async request on resendq. Removing");
4035 TAILQ_REMOVE(&nmp->nm_resendq, req, r_rchain);
4036 req->r_rchain.tqe_next = NFSREQNOLIST;
4037 assert(req->r_refs > 1);
4038 /* Remove resendq reference */
4039 req->r_refs--;
4040 }
4041 lck_mtx_unlock(&nmp->nm_lock);
4042 req->r_flags &= ~R_RESENDQ;
4043 }
4044 lck_mtx_unlock(&req->r_mtx);
4045
4046 if (req->r_achain.tqe_next == NFSREQNOLIST) {
4047 TAILQ_INSERT_TAIL(&nmp->nm_iodq, req, r_achain);
4048 }
4049
4050 /* If this mount doesn't already have an nfsiod working on it... */
4051 if (!nmp->nm_niod) {
4052 if (niod) { /* give it the nfsiod we just grabbed */
4053 nmp->nm_niod = niod;
4054 lck_mtx_unlock(nfsiod_mutex);
4055 wakeup(niod);
4056 } else if (nfsiod_thread_count > 0) {
4057 /* just queue it up on the nfsiod mounts queue if needed */
4058 if (nmp->nm_iodlink.tqe_next == NFSNOLIST) {
4059 TAILQ_INSERT_TAIL(&nfsiodmounts, nmp, nm_iodlink);
4060 }
4061 lck_mtx_unlock(nfsiod_mutex);
4062 } else {
4063 printf("nfs_asyncio(): no nfsiods? %d %d (%d)\n", nfsiod_thread_count, NFSIOD_MAX, started);
4064 lck_mtx_unlock(nfsiod_mutex);
4065 /* we have no other option but to be persistent */
4066 started = 0;
4067 goto again;
4068 }
4069 } else {
4070 lck_mtx_unlock(nfsiod_mutex);
4071 }
4072
4073 FSDBG_BOT(552, nmp, 0, 0, 0);
4074 }
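/*
 * Editorial summary, not part of the original source: nfs_asyncio_finish()
 * hands an async request to the nfsiod machinery in three steps: make sure
 * the request is on the mount's nm_iodq, try to dedicate an idle nfsiod to
 * the mount (waking it up), and otherwise either queue the mount on the
 * global nfsiodmounts list or, if no nfsiod threads exist at all, retry
 * persistently until one can be started.
 */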
4075
4076 /*
4077 * Queue up an async I/O request for resend.
4078 */
4079 void
4080 nfs_asyncio_resend(struct nfsreq *req)
4081 {
4082 struct nfsmount *nmp = req->r_nmp;
4083
4084 if (nfs_mount_gone(nmp)) {
4085 return;
4086 }
4087
4088 nfs_gss_clnt_rpcdone(req);
4089 lck_mtx_lock(&nmp->nm_lock);
4090 if (!(req->r_flags & R_RESENDQ)) {
4091 TAILQ_INSERT_TAIL(&nmp->nm_resendq, req, r_rchain);
4092 req->r_flags |= R_RESENDQ;
4093 /*
4094 * We take a reference on this request so that it can't be
4095 * destroyed while a resend is queued or in progress.
4096 */
4097 nfs_request_ref(req, 1);
4098 }
4099 nfs_mount_sock_thread_wake(nmp);
4100 lck_mtx_unlock(&nmp->nm_lock);
4101 }
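/*
 * Editorial note, not from the original source: the reference taken here via
 * nfs_request_ref() pairs with the reference dropped in nfs_asyncio_finish()
 * when a request still marked R_RESENDQ is pulled back off nm_resendq, so
 * the request cannot be destroyed while a resend is queued or in progress.
 */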
4102
4103 /*
4104 * Read directory data into a buffer.
4105 *
4106 * Buffer will be filled (unless EOF is hit).
4107 * Buffers after this one may also be completely/partially filled.
4108 */
4109 int
4110 nfs_buf_readdir(struct nfsbuf *bp, vfs_context_t ctx)
4111 {
4112 nfsnode_t np = bp->nb_np;
4113 struct nfsmount *nmp = NFSTONMP(np);
4114 int error = 0;
4115
4116 if (nfs_mount_gone(nmp)) {
4117 return ENXIO;
4118 }
4119
4120 if (nmp->nm_vers < NFS_VER4) {
4121 error = nfs3_readdir_rpc(np, bp, ctx);
4122 } else {
4123 error = nfs4_readdir_rpc(np, bp, ctx);
4124 }
4125
4126 if (error && (error != NFSERR_DIRBUFDROPPED)) {
4127 SET(bp->nb_flags, NB_ERROR);
4128 bp->nb_error = error;
4129 }
4130 return error;
4131 }
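/*
 * Editorial note, not part of the original source: nfs_buf_readdir() simply
 * dispatches to the NFSv3 or NFSv4 readdir RPC based on the mount version.
 * Any error other than NFSERR_DIRBUFDROPPED is also latched on the buffer
 * via NB_ERROR / nb_error, so callers can check the buffer state as well as
 * the return value.
 */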