1 /*
2 * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * The contents of this file constitute Original Code as defined in and
7 * are subject to the Apple Public Source License Version 1.1 (the
8 * "License"). You may not use this file except in compliance with the
9 * License. Please obtain a copy of the License at
10 * http://www.apple.com/publicsource and read it before using this file.
11 *
12 * This Original Code and all software distributed under the License are
13 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
14 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
15 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
17 * License for the specific language governing rights and limitations
18 * under the License.
19 *
20 * @APPLE_LICENSE_HEADER_END@
21 */
22 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
23 /*
24 * Copyright (c) 1989, 1993
25 * The Regents of the University of California. All rights reserved.
26 *
27 * This code is derived from software contributed to Berkeley by
28 * Rick Macklem at The University of Guelph.
29 *
30 * Redistribution and use in source and binary forms, with or without
31 * modification, are permitted provided that the following conditions
32 * are met:
33 * 1. Redistributions of source code must retain the above copyright
34 * notice, this list of conditions and the following disclaimer.
35 * 2. Redistributions in binary form must reproduce the above copyright
36 * notice, this list of conditions and the following disclaimer in the
37 * documentation and/or other materials provided with the distribution.
38 * 3. All advertising materials mentioning features or use of this software
39 * must display the following acknowledgement:
40 * This product includes software developed by the University of
41 * California, Berkeley and its contributors.
42 * 4. Neither the name of the University nor the names of its contributors
43 * may be used to endorse or promote products derived from this software
44 * without specific prior written permission.
45 *
46 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
47 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
48 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
49 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
50 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
51 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
52 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
53 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
54 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
55 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
56 * SUCH DAMAGE.
57 *
58 * @(#)nfs_bio.c 8.9 (Berkeley) 3/30/95
59 * FreeBSD-Id: nfs_bio.c,v 1.44 1997/09/10 19:52:25 phk Exp $
60 */
61 #include <sys/param.h>
62 #include <sys/systm.h>
63 #include <sys/resourcevar.h>
64 #include <sys/signalvar.h>
65 #include <sys/proc.h>
66 #include <sys/malloc.h>
67 #include <sys/vnode.h>
68 #include <sys/dirent.h>
69 #include <sys/mount.h>
70 #include <sys/kernel.h>
71 #include <sys/sysctl.h>
72 #include <sys/ubc.h>
73
74 #include <sys/vm.h>
75 #include <sys/vmparam.h>
76
77 #include <sys/time.h>
78 #include <kern/clock.h>
79
80 #include <nfs/rpcv2.h>
81 #include <nfs/nfsproto.h>
82 #include <nfs/nfs.h>
83 #include <nfs/nfsmount.h>
84 #include <nfs/nqnfs.h>
85 #include <nfs/nfsnode.h>
86
87 #include <sys/kdebug.h>
88
89 #define FSDBG(A, B, C, D, E) \
90 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_NONE, \
91 (int)(B), (int)(C), (int)(D), (int)(E), 0)
92 #define FSDBG_TOP(A, B, C, D, E) \
93 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_START, \
94 (int)(B), (int)(C), (int)(D), (int)(E), 0)
95 #define FSDBG_BOT(A, B, C, D, E) \
96 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_END, \
97 (int)(B), (int)(C), (int)(D), (int)(E), 0)
98
99 extern int nfs_numasync;
100 extern int nfs_ioddelwri;
101 extern struct nfsstats nfsstats;
102
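/*
 * nfsbuf hash chains are indexed by combining the vnode pointer (scaled
 * down by its size) with the logical block number, masked with nfsbufhash
 * (the table-size mask returned by hashinit()).
 */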
103 #define NFSBUFHASH(dvp, lbn) \
104 (&nfsbufhashtbl[((long)(dvp) / sizeof(*(dvp)) + (int)(lbn)) & nfsbufhash])
105 LIST_HEAD(nfsbufhashhead, nfsbuf) *nfsbufhashtbl;
106 struct nfsbuffreehead nfsbuffree, nfsbuffreemeta, nfsbufdelwri;
107 u_long nfsbufhash;
108 int nfsbufhashlock, nfsbufcnt, nfsbufmin, nfsbufmax;
109 int nfsbuffreecnt, nfsbuffreemetacnt, nfsbufdelwricnt, nfsneedbuffer;
110 int nfs_nbdwrite;
111 time_t nfsbuffreeuptimestamp;
112
113 #define NFSBUFWRITE_THROTTLE 9
114 #define NFSBUF_LRU_STALE 120
115 #define NFSBUF_META_STALE 240
116
117 /* number of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffree list */
118 #define LRU_TO_FREEUP 6
119 /* number of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffreemeta list */
120 #define META_TO_FREEUP 3
121 /* total number of nfsbufs nfs_buf_freeup() should attempt to free */
122 #define TOTAL_TO_FREEUP (LRU_TO_FREEUP+META_TO_FREEUP)
123 /* fraction of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffree list when called from nfs_timer() */
124 #define LRU_FREEUP_FRAC_ON_TIMER 8
125 /* fraction of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffreemeta list when called from nfs_timer() */
126 #define META_FREEUP_FRAC_ON_TIMER 16
127 /* fraction of total nfsbufs that nfsbuffreecnt should exceed before bothering to call nfs_buf_freeup() */
128 #define LRU_FREEUP_MIN_FRAC 4
129 /* fraction of total nfsbufs that nfsbuffreemetacnt should exceed before bothering to call nfs_buf_freeup() */
130 #define META_FREEUP_MIN_FRAC 2
131
132 #define NFS_BUF_FREEUP() \
133 do { \
134 /* only call nfs_buf_freeup() if it has work to do: */ \
135 if (((nfsbuffreecnt > nfsbufcnt/LRU_FREEUP_MIN_FRAC) || \
136 (nfsbuffreemetacnt > nfsbufcnt/META_FREEUP_MIN_FRAC)) && \
137 ((nfsbufcnt - TOTAL_TO_FREEUP) > nfsbufmin)) \
138 nfs_buf_freeup(0); \
139 } while (0)
140
141 /*
142 * Initialize nfsbuf lists
143 */
144 void
145 nfs_nbinit(void)
146 {
147 nfsbufhashlock = 0;
148 nfsbufhashtbl = hashinit(nbuf, M_TEMP, &nfsbufhash);
149 TAILQ_INIT(&nfsbuffree);
150 TAILQ_INIT(&nfsbuffreemeta);
151 TAILQ_INIT(&nfsbufdelwri);
152 nfsbufcnt = nfsbuffreecnt = nfsbuffreemetacnt = nfsbufdelwricnt = 0;
153 nfsbufmin = 128; // XXX tune me!
154 nfsbufmax = 8192; // XXX tune me!
155 nfsneedbuffer = 0;
156 nfs_nbdwrite = 0;
157 nfsbuffreeuptimestamp = 0;
158 }
159
160 /*
161 * try to free up some excess, unused nfsbufs
162 */
163 void
164 nfs_buf_freeup(int timer)
165 {
166 struct nfsbuf *fbp;
167 struct timeval now;
168 int count;
169
170 microuptime(&now);
171 nfsbuffreeuptimestamp = now.tv_sec;
172
    173 	count = timer ? nfsbuffreecnt/LRU_FREEUP_FRAC_ON_TIMER : LRU_TO_FREEUP;
    174 	FSDBG(320, nfsbufcnt, nfsbuffreecnt, nfsbuffreemetacnt, count);
175 while ((nfsbufcnt > nfsbufmin) && (count-- > 0)) {
176 fbp = TAILQ_FIRST(&nfsbuffree);
177 if (!fbp)
178 break;
179 if ((fbp->nb_timestamp + (2*NFSBUF_LRU_STALE)) > now.tv_sec)
180 break;
181 nfs_buf_remfree(fbp);
182 /* disassociate buffer from any vnode */
183 if (fbp->nb_vp) {
184 struct vnode *oldvp;
185 if (fbp->nb_vnbufs.le_next != NFSNOLIST) {
186 LIST_REMOVE(fbp, nb_vnbufs);
187 fbp->nb_vnbufs.le_next = NFSNOLIST;
188 }
189 oldvp = fbp->nb_vp;
190 fbp->nb_vp = NULL;
191 HOLDRELE(oldvp);
192 }
193 LIST_REMOVE(fbp, nb_hash);
194 /* nuke any creds */
195 if (fbp->nb_rcred != NOCRED)
196 crfree(fbp->nb_rcred);
197 if (fbp->nb_wcred != NOCRED)
198 crfree(fbp->nb_wcred);
199 /* if buf was NB_META, dump buffer */
200 if (ISSET(fbp->nb_flags, NB_META) && fbp->nb_data) {
201 FREE(fbp->nb_data, M_TEMP);
202 }
203 FREE(fbp, M_TEMP);
204 nfsbufcnt--;
205 }
206
207 count = timer ? nfsbuffreemetacnt/META_FREEUP_FRAC_ON_TIMER : META_TO_FREEUP;
208 while ((nfsbufcnt > nfsbufmin) && (count-- > 0)) {
209 fbp = TAILQ_FIRST(&nfsbuffreemeta);
210 if (!fbp)
211 break;
212 if ((fbp->nb_timestamp + (2*NFSBUF_META_STALE)) > now.tv_sec)
213 break;
214 nfs_buf_remfree(fbp);
215 /* disassociate buffer from any vnode */
216 if (fbp->nb_vp) {
217 struct vnode *oldvp;
218 if (fbp->nb_vnbufs.le_next != NFSNOLIST) {
219 LIST_REMOVE(fbp, nb_vnbufs);
220 fbp->nb_vnbufs.le_next = NFSNOLIST;
221 }
222 oldvp = fbp->nb_vp;
223 fbp->nb_vp = NULL;
224 HOLDRELE(oldvp);
225 }
226 LIST_REMOVE(fbp, nb_hash);
227 /* nuke any creds */
228 if (fbp->nb_rcred != NOCRED)
229 crfree(fbp->nb_rcred);
230 if (fbp->nb_wcred != NOCRED)
231 crfree(fbp->nb_wcred);
232 /* if buf was NB_META, dump buffer */
233 if (ISSET(fbp->nb_flags, NB_META) && fbp->nb_data) {
234 FREE(fbp->nb_data, M_TEMP);
235 }
236 FREE(fbp, M_TEMP);
237 nfsbufcnt--;
238 }
239 FSDBG(320, nfsbufcnt, nfsbuffreecnt, nfsbuffreemetacnt, count);
240 }
241
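/*
 * Remove an nfsbuf from whichever free list it is currently on
 * (delayed-write, free-meta, or free), keeping the matching count in sync.
 */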
242 void
243 nfs_buf_remfree(struct nfsbuf *bp)
244 {
245 if (bp->nb_free.tqe_next == NFSNOLIST)
246 panic("nfsbuf not on free list");
247 if (ISSET(bp->nb_flags, NB_DELWRI)) {
248 nfsbufdelwricnt--;
249 TAILQ_REMOVE(&nfsbufdelwri, bp, nb_free);
250 } else if (ISSET(bp->nb_flags, NB_META) && !ISSET(bp->nb_flags, NB_INVAL)) {
251 nfsbuffreemetacnt--;
252 TAILQ_REMOVE(&nfsbuffreemeta, bp, nb_free);
253 } else {
254 nfsbuffreecnt--;
255 TAILQ_REMOVE(&nfsbuffree, bp, nb_free);
256 }
257 bp->nb_free.tqe_next = NFSNOLIST;
258 NFSBUFCNTCHK();
259 }
260
261 /*
262 * check for existence of nfsbuf in cache
263 */
264 struct nfsbuf *
265 nfs_buf_incore(struct vnode *vp, daddr_t blkno)
266 {
267 /* Search hash chain */
268 struct nfsbuf * bp = NFSBUFHASH(vp, blkno)->lh_first;
269 for (; bp != NULL; bp = bp->nb_hash.le_next)
270 if (bp->nb_lblkno == blkno && bp->nb_vp == vp) {
271 if (!ISSET(bp->nb_flags, NB_INVAL)) {
272 FSDBG(547, bp, blkno, bp->nb_flags, bp->nb_vp);
273 return (bp);
274 }
275 }
276 return (NULL);
277 }
278
279 /*
280 * Check if it's OK to drop a page.
281 *
282 * Called by vnode_pager() on pageout request of non-dirty page.
283 * We need to make sure that it's not part of a delayed write.
284 * If it is, we can't let the VM drop it because we may need it
285 * later when/if we need to write the data (again).
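 *
 * Returns 0 if the page can safely be dropped, or EBUSY if it must be kept.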
286 */
287 int
288 nfs_buf_page_inval(struct vnode *vp, off_t offset)
289 {
290 struct nfsbuf *bp;
291 bp = nfs_buf_incore(vp, ubc_offtoblk(vp, offset));
292 if (!bp)
293 return (0);
294 FSDBG(325, bp, bp->nb_flags, bp->nb_dirtyoff, bp->nb_dirtyend);
295 if (ISSET(bp->nb_flags, NB_BUSY))
296 return (EBUSY);
297 /*
298 * If there's a dirty range in the buffer, check to
299 * see if this page intersects with the dirty range.
300 * If it does, we can't let the pager drop the page.
301 */
302 if (bp->nb_dirtyend > 0) {
303 int start = offset - NBOFF(bp);
304 if (bp->nb_dirtyend <= start ||
305 bp->nb_dirtyoff >= (start + PAGE_SIZE))
306 return (0);
307 return (EBUSY);
308 }
309 return (0);
310 }
311
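/*
 * Create a UPL (page list) for the buffer's range in the vnode's VM object
 * so its pages can later be mapped, committed, or aborted individually.
 */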
312 int
313 nfs_buf_upl_setup(struct nfsbuf *bp)
314 {
315 kern_return_t kret;
316 upl_t upl;
317 int s;
318
319 if (ISSET(bp->nb_flags, NB_PAGELIST))
320 return (0);
321
322 kret = ubc_create_upl(bp->nb_vp, NBOFF(bp), bp->nb_bufsize,
323 &upl, NULL, UPL_PRECIOUS);
324 if (kret == KERN_INVALID_ARGUMENT) {
325 /* vm object probably doesn't exist any more */
326 bp->nb_pagelist = NULL;
327 return (EINVAL);
328 }
329 if (kret != KERN_SUCCESS) {
330 printf("nfs_buf_upl_setup(): failed to get pagelist %d\n", kret);
331 bp->nb_pagelist = NULL;
332 return (EIO);
333 }
334
335 FSDBG(538, bp, NBOFF(bp), bp->nb_bufsize, bp->nb_vp);
336
337 s = splbio();
338 bp->nb_pagelist = upl;
339 SET(bp->nb_flags, NB_PAGELIST);
340 splx(s);
341 return (0);
342 }
343
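/*
 * Scan a newly created UPL's page info to initialize the buffer's
 * nb_valid/nb_dirty page bitmaps and NB_CACHE/NB_WASDIRTY flags,
 * clipping the valid range at the current end of file.
 */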
344 void
345 nfs_buf_upl_check(struct nfsbuf *bp)
346 {
347 upl_page_info_t *pl;
348 off_t filesize, fileoffset;
349 int i, npages;
350
351 if (!ISSET(bp->nb_flags, NB_PAGELIST))
352 return;
353
354 npages = round_page_32(bp->nb_bufsize) / PAGE_SIZE;
355 filesize = ubc_getsize(bp->nb_vp);
356 fileoffset = NBOFF(bp);
357 if (fileoffset < filesize)
358 SET(bp->nb_flags, NB_CACHE);
359 else
360 CLR(bp->nb_flags, NB_CACHE);
361
362 pl = ubc_upl_pageinfo(bp->nb_pagelist);
363 bp->nb_valid = bp->nb_dirty = 0;
364
365 for (i=0; i < npages; i++, fileoffset += PAGE_SIZE_64) {
366 /* anything beyond the end of the file is not valid or dirty */
367 if (fileoffset >= filesize)
368 break;
369 if (!upl_valid_page(pl, i)) {
370 CLR(bp->nb_flags, NB_CACHE);
371 continue;
372 }
373 NBPGVALID_SET(bp,i);
374 if (upl_dirty_page(pl, i)) {
375 NBPGDIRTY_SET(bp, i);
376 if (!ISSET(bp->nb_flags, NB_WASDIRTY))
377 SET(bp->nb_flags, NB_WASDIRTY);
378 }
379 }
380 fileoffset = NBOFF(bp);
381 if (ISSET(bp->nb_flags, NB_CACHE)) {
382 bp->nb_validoff = 0;
383 bp->nb_validend = bp->nb_bufsize;
384 if (fileoffset + bp->nb_validend > filesize)
385 bp->nb_validend = filesize - fileoffset;
386 } else {
387 bp->nb_validoff = bp->nb_validend = -1;
388 }
389 FSDBG(539, bp, fileoffset, bp->nb_valid, bp->nb_dirty);
390 FSDBG(539, bp->nb_validoff, bp->nb_validend, bp->nb_dirtyoff, bp->nb_dirtyend);
391 }
392
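/*
 * Map the buffer's UPL into kernel virtual memory and record the address
 * in nb_data.  The buffer must already have a page list (NB_PAGELIST).
 */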
393 static int
394 nfs_buf_map(struct nfsbuf *bp)
395 {
396 kern_return_t kret;
397
398 if (bp->nb_data)
399 return (0);
400 if (!ISSET(bp->nb_flags, NB_PAGELIST))
401 return (EINVAL);
402
403 kret = ubc_upl_map(bp->nb_pagelist, (vm_address_t *)&(bp->nb_data));
404 if (kret != KERN_SUCCESS)
405 panic("nfs_buf_map: ubc_upl_map() failed with (%d)", kret);
406 if (bp->nb_data == 0)
407 panic("ubc_upl_map mapped 0");
408 FSDBG(540, bp, bp->nb_flags, NBOFF(bp), bp->nb_data);
409 return (0);
410 }
411
412 /*
413 * check range of pages in nfsbuf's UPL for validity
414 */
415 static int
416 nfs_buf_upl_valid_range(struct nfsbuf *bp, int off, int size)
417 {
418 off_t fileoffset, filesize;
419 int pg, lastpg;
420 upl_page_info_t *pl;
421
422 if (!ISSET(bp->nb_flags, NB_PAGELIST))
423 return (0);
424 pl = ubc_upl_pageinfo(bp->nb_pagelist);
425
426 size += off & PAGE_MASK;
427 off &= ~PAGE_MASK;
428 fileoffset = NBOFF(bp);
429 filesize = VTONFS(bp->nb_vp)->n_size;
430 if ((fileoffset + off + size) > filesize)
431 size = filesize - (fileoffset + off);
432
433 pg = off/PAGE_SIZE;
434 lastpg = (off + size - 1)/PAGE_SIZE;
435 while (pg <= lastpg) {
436 if (!upl_valid_page(pl, pg))
437 return (0);
438 pg++;
439 }
440 return (1);
441 }
442
443 /*
444 * normalize an nfsbuf's valid range
445 *
446 * the read/write code guarantees that we'll always have a valid
447 * region that is an integral number of pages. If either end
448 * of the valid range isn't page-aligned, it gets corrected
449 * here as we extend the valid range through all of the
450 * contiguous valid pages.
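 *
 * For example (4K pages, illustrative): if only pages 1 and 2 are valid
 * and validoff/validend come in as 5000/9000, validoff is pulled back to
 * 4096 and validend is pushed forward to 12288 (then clipped at EOF).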
451 */
452 static void
453 nfs_buf_normalize_valid_range(struct nfsnode *np, struct nfsbuf *bp)
454 {
455 int pg, npg;
456 /* pull validoff back to start of contiguous valid page range */
457 pg = bp->nb_validoff/PAGE_SIZE;
458 while (pg >= 0 && NBPGVALID(bp,pg))
459 pg--;
460 bp->nb_validoff = (pg+1) * PAGE_SIZE;
461 /* push validend forward to end of contiguous valid page range */
462 npg = bp->nb_bufsize/PAGE_SIZE;
463 pg = bp->nb_validend/PAGE_SIZE;
464 while (pg < npg && NBPGVALID(bp,pg))
465 pg++;
466 bp->nb_validend = pg * PAGE_SIZE;
467 /* clip to EOF */
468 if (NBOFF(bp) + bp->nb_validend > np->n_size)
469 bp->nb_validend = np->n_size % bp->nb_bufsize;
470 }
471
472 /*
473 * try to push out some delayed/uncommitted writes
474 */
475 static void
476 nfs_buf_delwri_push(void)
477 {
478 struct nfsbuf *bp;
479 int i;
480
481 if (TAILQ_EMPTY(&nfsbufdelwri))
482 return;
483
484 /* first try to tell the nfsiods to do it */
485 if (nfs_asyncio(NULL, NULL) == 0)
486 return;
487
488 /* otherwise, try to do some of the work ourselves */
489 i = 0;
490 while (i < 8 && (bp = TAILQ_FIRST(&nfsbufdelwri)) != NULL) {
491 struct nfsnode *np = VTONFS(bp->nb_vp);
492 nfs_buf_remfree(bp);
493 if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
494 /* put buffer at end of delwri list */
495 TAILQ_INSERT_TAIL(&nfsbufdelwri, bp, nb_free);
496 nfsbufdelwricnt++;
497 nfs_flushcommits(np->n_vnode, (struct proc *)0);
498 } else {
499 SET(bp->nb_flags, (NB_BUSY | NB_ASYNC));
500 nfs_buf_write(bp);
501 }
502 i++;
503 }
504 }
505
506 /*
507 * Get an nfs cache block.
508 * Allocate a new one if the block isn't currently in the cache
509 * and return the block marked busy. If the calling process is
510 * interrupted by a signal for an interruptible mount point, return
511 * NULL.
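 *
 * The operation argument (BLK_READ, BLK_WRITE, or BLK_META) determines how
 * the buffer's storage is set up: BLK_META buffers use a malloc'd data area,
 * while BLK_READ/BLK_WRITE buffers are backed by a UPL over the vnode's pages.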
512 */
513 struct nfsbuf *
514 nfs_buf_get(
515 struct vnode *vp,
516 daddr_t blkno,
517 int size,
518 struct proc *p,
519 int operation)
520 {
521 struct nfsnode *np = VTONFS(vp);
522 struct nfsbuf *bp;
523 int i, biosize, bufsize, rv;
524 struct ucred *cred;
525 int slpflag = PCATCH;
526
527 FSDBG_TOP(541, vp, blkno, size, operation);
528
529 bufsize = size;
530 if (bufsize > MAXBSIZE)
531 panic("nfs_buf_get: buffer larger than MAXBSIZE requested");
532
533 biosize = vp->v_mount->mnt_stat.f_iosize;
534
535 if (UBCINVALID(vp) || !UBCINFOEXISTS(vp))
536 operation = BLK_META;
537 else if (bufsize < biosize)
538 /* reg files should always have biosize blocks */
539 bufsize = biosize;
540
541 /* if BLK_WRITE, check for too many delayed/uncommitted writes */
542 if ((operation == BLK_WRITE) && (nfs_nbdwrite > ((nfsbufcnt*3)/4))) {
543 FSDBG_TOP(542, vp, blkno, nfs_nbdwrite, ((nfsbufcnt*3)/4));
544
545 /* poke the delwri list */
546 nfs_buf_delwri_push();
547
548 /* sleep to let other threads run... */
549 tsleep(&nfs_nbdwrite, PCATCH, "nfs_nbdwrite", 1);
550 FSDBG_BOT(542, vp, blkno, nfs_nbdwrite, ((nfsbufcnt*3)/4));
551 }
552
553 loop:
554 /*
555 * Obtain a lock to prevent a race condition if the
556 * MALLOC() below happens to block.
557 */
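	/* nfsbufhashlock: 0 = unlocked, 1 = locked, -1 = locked with waiters */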
558 if (nfsbufhashlock) {
559 while (nfsbufhashlock) {
560 nfsbufhashlock = -1;
561 tsleep(&nfsbufhashlock, PCATCH, "nfsbufget", 0);
562 if (nfs_sigintr(VFSTONFS(vp->v_mount), NULL, p))
563 return (NULL);
564 }
565 goto loop;
566 }
567 nfsbufhashlock = 1;
568
569 /* check for existence of nfsbuf in cache */
    570 	if ((bp = nfs_buf_incore(vp, blkno)) != NULL) {
571 /* if busy, set wanted and wait */
572 if (ISSET(bp->nb_flags, NB_BUSY)) {
573 FSDBG_TOP(543, vp, blkno, bp, bp->nb_flags);
574 SET(bp->nb_flags, NB_WANTED);
575 /* unlock hash */
576 if (nfsbufhashlock < 0) {
577 nfsbufhashlock = 0;
578 wakeup(&nfsbufhashlock);
579 } else
580 nfsbufhashlock = 0;
581 tsleep(bp, slpflag|(PRIBIO+1), "nfsbufget", (slpflag == PCATCH) ? 0 : 2*hz);
582 slpflag = 0;
583 FSDBG_BOT(543, vp, blkno, bp, bp->nb_flags);
584 if (nfs_sigintr(VFSTONFS(vp->v_mount), NULL, p)) {
585 FSDBG_BOT(541, vp, blkno, 0, EINTR);
586 return (NULL);
587 }
588 goto loop;
589 }
590 if (bp->nb_bufsize != bufsize)
591 panic("nfsbuf size mismatch");
592 SET(bp->nb_flags, (NB_BUSY | NB_CACHE));
593 nfs_buf_remfree(bp);
594 /* additional paranoia: */
595 if (ISSET(bp->nb_flags, NB_PAGELIST))
596 panic("pagelist buffer was not busy");
597 goto buffer_setup;
598 }
599
600 /*
601 * where to get a free buffer:
602 * - alloc new if we haven't reached min bufs
603 * - if free lists are NOT empty
604 * - if free list is stale, use it
605 * - else if freemeta list is stale, use it
606 * - else if max bufs allocated, use least-time-to-stale
607 * - alloc new if we haven't reached max allowed
608 * - start clearing out delwri list and try again
609 */
610
611 if ((nfsbufcnt > nfsbufmin) &&
612 (!TAILQ_EMPTY(&nfsbuffree) || !TAILQ_EMPTY(&nfsbuffreemeta))) {
613 /* try to pull an nfsbuf off a free list */
614 struct nfsbuf *lrubp, *metabp;
615 struct timeval now;
616 microuptime(&now);
617
618 /* if the next LRU or META buffer is stale, use it */
619 lrubp = TAILQ_FIRST(&nfsbuffree);
620 if (lrubp && ((lrubp->nb_timestamp + NFSBUF_LRU_STALE) < now.tv_sec))
621 bp = lrubp;
622 metabp = TAILQ_FIRST(&nfsbuffreemeta);
623 if (!bp && metabp && ((metabp->nb_timestamp + NFSBUF_META_STALE) < now.tv_sec))
624 bp = metabp;
625
626 if (!bp && (nfsbufcnt >= nfsbufmax)) {
627 /* we've already allocated all bufs, so */
628 /* choose the buffer that'll go stale first */
629 if (!metabp)
630 bp = lrubp;
631 else if (!lrubp)
632 bp = metabp;
633 else {
634 int32_t lru_stale_time, meta_stale_time;
635 lru_stale_time = lrubp->nb_timestamp + NFSBUF_LRU_STALE;
636 meta_stale_time = metabp->nb_timestamp + NFSBUF_META_STALE;
637 if (lru_stale_time <= meta_stale_time)
638 bp = lrubp;
639 else
640 bp = metabp;
641 }
642 }
643
644 if (bp) {
645 /* we have a buffer to reuse */
646 FSDBG(544, vp, blkno, bp, bp->nb_flags);
647 nfs_buf_remfree(bp);
648 if (ISSET(bp->nb_flags, NB_DELWRI))
649 panic("nfs_buf_get: delwri");
650 SET(bp->nb_flags, NB_BUSY);
651 /* disassociate buffer from previous vnode */
652 if (bp->nb_vp) {
653 struct vnode *oldvp;
654 if (bp->nb_vnbufs.le_next != NFSNOLIST) {
655 LIST_REMOVE(bp, nb_vnbufs);
656 bp->nb_vnbufs.le_next = NFSNOLIST;
657 }
658 oldvp = bp->nb_vp;
659 bp->nb_vp = NULL;
660 HOLDRELE(oldvp);
661 }
662 LIST_REMOVE(bp, nb_hash);
663 /* nuke any creds we're holding */
664 cred = bp->nb_rcred;
665 if (cred != NOCRED) {
666 bp->nb_rcred = NOCRED;
667 crfree(cred);
668 }
669 cred = bp->nb_wcred;
670 if (cred != NOCRED) {
671 bp->nb_wcred = NOCRED;
672 crfree(cred);
673 }
674 /* if buf will no longer be NB_META, dump old buffer */
675 if ((operation != BLK_META) &&
676 ISSET(bp->nb_flags, NB_META) && bp->nb_data) {
677 FREE(bp->nb_data, M_TEMP);
678 bp->nb_data = NULL;
679 }
680 /* re-init buf fields */
681 bp->nb_error = 0;
682 bp->nb_validoff = bp->nb_validend = -1;
683 bp->nb_dirtyoff = bp->nb_dirtyend = 0;
684 bp->nb_valid = 0;
685 bp->nb_dirty = 0;
686 }
687 }
688
689 if (!bp) {
690 if (nfsbufcnt < nfsbufmax) {
691 /* just alloc a new one */
692 MALLOC(bp, struct nfsbuf *, sizeof(struct nfsbuf), M_TEMP, M_WAITOK);
693 nfsbufcnt++;
694 NFSBUFCNTCHK();
695 /* init nfsbuf */
696 bzero(bp, sizeof(*bp));
697 bp->nb_free.tqe_next = NFSNOLIST;
698 bp->nb_validoff = bp->nb_validend = -1;
699 FSDBG(545, vp, blkno, bp, 0);
700 } else {
701 /* too many bufs... wait for buffers to free up */
702 FSDBG_TOP(546, vp, blkno, nfsbufcnt, nfsbufmax);
703 /* unlock hash */
704 if (nfsbufhashlock < 0) {
705 nfsbufhashlock = 0;
706 wakeup(&nfsbufhashlock);
707 } else
708 nfsbufhashlock = 0;
709
710 /* poke the delwri list */
711 nfs_buf_delwri_push();
712
713 nfsneedbuffer = 1;
714 tsleep(&nfsneedbuffer, PCATCH, "nfsbufget", 0);
715 FSDBG_BOT(546, vp, blkno, nfsbufcnt, nfsbufmax);
716 if (nfs_sigintr(VFSTONFS(vp->v_mount), NULL, p)) {
717 FSDBG_BOT(541, vp, blkno, 0, EINTR);
718 return (NULL);
719 }
720 goto loop;
721 }
722 }
723
724 setup_nfsbuf:
725
726 /* setup nfsbuf */
727 bp->nb_flags = NB_BUSY;
728 bp->nb_lblkno = blkno;
729 /* insert buf in hash */
730 LIST_INSERT_HEAD(NFSBUFHASH(vp, blkno), bp, nb_hash);
731 /* associate buffer with new vnode */
732 VHOLD(vp);
733 bp->nb_vp = vp;
734 LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
735
736 buffer_setup:
737
738 switch (operation) {
739 case BLK_META:
740 SET(bp->nb_flags, NB_META);
741 if ((bp->nb_bufsize != bufsize) && bp->nb_data) {
742 FREE(bp->nb_data, M_TEMP);
743 bp->nb_data = NULL;
744 bp->nb_validoff = bp->nb_validend = -1;
745 bp->nb_dirtyoff = bp->nb_dirtyend = 0;
746 bp->nb_valid = 0;
747 bp->nb_dirty = 0;
748 CLR(bp->nb_flags, NB_CACHE);
749 }
750 if (!bp->nb_data)
751 MALLOC(bp->nb_data, caddr_t, bufsize, M_TEMP, M_WAITOK);
752 if (!bp->nb_data)
753 panic("nfs_buf_get: null nb_data");
754 bp->nb_bufsize = bufsize;
755 break;
756
757 case BLK_READ:
758 case BLK_WRITE:
759 if (bufsize < PAGE_SIZE)
760 bufsize = PAGE_SIZE;
761 bp->nb_bufsize = bufsize;
762 bp->nb_validoff = bp->nb_validend = -1;
763
764 if (UBCISVALID(vp)) {
765 /* setup upl */
766 if (nfs_buf_upl_setup(bp)) {
767 /* unable to create upl */
768 /* vm object must no longer exist */
769 /* cleanup buffer and return NULL */
770 LIST_REMOVE(bp, nb_vnbufs);
771 bp->nb_vnbufs.le_next = NFSNOLIST;
772 bp->nb_vp = NULL;
773 /* clear usage timestamp to allow immediate freeing */
774 bp->nb_timestamp = 0;
775 HOLDRELE(vp);
776 if (bp->nb_free.tqe_next != NFSNOLIST)
777 panic("nfsbuf on freelist");
778 TAILQ_INSERT_HEAD(&nfsbuffree, bp, nb_free);
779 nfsbuffreecnt++;
780 FSDBG_BOT(541, vp, blkno, 0x2bc, EIO);
781 return (NULL);
782 }
783 nfs_buf_upl_check(bp);
784 }
785 break;
786
787 default:
788 panic("nfs_buf_get: %d unknown operation", operation);
789 }
790
791 /* unlock hash */
792 if (nfsbufhashlock < 0) {
793 nfsbufhashlock = 0;
794 wakeup(&nfsbufhashlock);
795 } else
796 nfsbufhashlock = 0;
797
798 FSDBG_BOT(541, vp, blkno, bp, bp->nb_flags);
799
800 return (bp);
801 }
802
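/*
 * Release a busy nfsbuf: unmap its UPL and commit/abort the pages as
 * appropriate, wake anyone waiting for a buffer, and requeue the buffer
 * on the free, free-meta, or delayed-write list (invalid or empty buffers
 * are dissociated from their vnode first).
 */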
803 void
804 nfs_buf_release(struct nfsbuf *bp, int freeup)
805 {
806 struct vnode *vp = bp->nb_vp;
807 struct timeval now;
808
809 FSDBG_TOP(548, bp, NBOFF(bp), bp->nb_flags, bp->nb_data);
810 FSDBG(548, bp->nb_validoff, bp->nb_validend, bp->nb_dirtyoff, bp->nb_dirtyend);
811 FSDBG(548, bp->nb_valid, 0, bp->nb_dirty, 0);
812
813 if (UBCINFOEXISTS(vp) && bp->nb_bufsize) {
814 int upl_flags;
815 upl_t upl;
816 int i, rv;
817
818 if (!ISSET(bp->nb_flags, NB_PAGELIST) && !ISSET(bp->nb_flags, NB_INVAL)) {
819 rv = nfs_buf_upl_setup(bp);
820 if (rv)
821 printf("nfs_buf_release: upl create failed %d\n", rv);
822 else
823 nfs_buf_upl_check(bp);
824 }
825 upl = bp->nb_pagelist;
826 if (!upl)
827 goto pagelist_cleanup_done;
828 if (bp->nb_data) {
829 if (ubc_upl_unmap(upl) != KERN_SUCCESS)
830 panic("ubc_upl_unmap failed");
831 bp->nb_data = NULL;
832 }
833 if (bp->nb_flags & (NB_ERROR | NB_INVAL)) {
834 if (bp->nb_flags & (NB_READ | NB_INVAL))
835 upl_flags = UPL_ABORT_DUMP_PAGES;
836 else
837 upl_flags = 0;
838 ubc_upl_abort(upl, upl_flags);
839 goto pagelist_cleanup_done;
840 }
841 for (i=0; i <= (bp->nb_bufsize - 1)/PAGE_SIZE; i++) {
842 if (!NBPGVALID(bp,i))
843 ubc_upl_abort_range(upl,
844 i*PAGE_SIZE, PAGE_SIZE,
845 UPL_ABORT_DUMP_PAGES |
846 UPL_ABORT_FREE_ON_EMPTY);
847 else {
848 if (NBPGDIRTY(bp,i))
849 upl_flags = UPL_COMMIT_SET_DIRTY;
850 else
851 upl_flags = UPL_COMMIT_CLEAR_DIRTY;
852 ubc_upl_commit_range(upl,
853 i*PAGE_SIZE, PAGE_SIZE,
854 upl_flags |
855 UPL_COMMIT_INACTIVATE |
856 UPL_COMMIT_FREE_ON_EMPTY);
857 }
858 }
859 pagelist_cleanup_done:
860 /* was this the last buffer in the file? */
861 if (NBOFF(bp) + bp->nb_bufsize > VTONFS(vp)->n_size) {
862 /* if so, invalidate all pages of last buffer past EOF */
863 int biosize = vp->v_mount->mnt_stat.f_iosize;
864 off_t off, size;
865 off = trunc_page_64(VTONFS(vp)->n_size) + PAGE_SIZE_64;
866 size = trunc_page_64(NBOFF(bp) + biosize) - off;
867 if (size)
868 ubc_invalidate(vp, off, size);
869 }
870 CLR(bp->nb_flags, NB_PAGELIST);
871 bp->nb_pagelist = NULL;
872 }
873
874 /* Wake up any processes waiting for any buffer to become free. */
875 if (nfsneedbuffer) {
876 nfsneedbuffer = 0;
877 wakeup(&nfsneedbuffer);
878 }
879 /* Wake up any processes waiting for _this_ buffer to become free. */
880 if (ISSET(bp->nb_flags, NB_WANTED)) {
881 CLR(bp->nb_flags, NB_WANTED);
882 wakeup(bp);
883 }
884
885 /* If it's not cacheable, or an error, mark it invalid. */
886 if (ISSET(bp->nb_flags, (NB_NOCACHE|NB_ERROR)))
887 SET(bp->nb_flags, NB_INVAL);
888
889 if ((bp->nb_bufsize <= 0) || ISSET(bp->nb_flags, NB_INVAL)) {
890 /* If it's invalid or empty, dissociate it from its vnode */
891 if (bp->nb_vnbufs.le_next != NFSNOLIST) {
892 LIST_REMOVE(bp, nb_vnbufs);
893 bp->nb_vnbufs.le_next = NFSNOLIST;
894 }
895 bp->nb_vp = NULL;
896 HOLDRELE(vp);
897 /* if this was a delayed write, wakeup anyone */
898 /* waiting for delayed writes to complete */
899 if (ISSET(bp->nb_flags, NB_DELWRI)) {
900 CLR(bp->nb_flags, NB_DELWRI);
901 nfs_nbdwrite--;
902 NFSBUFCNTCHK();
903 wakeup((caddr_t)&nfs_nbdwrite);
904 }
905 /* clear usage timestamp to allow immediate freeing */
906 bp->nb_timestamp = 0;
907 /* put buffer at head of free list */
908 if (bp->nb_free.tqe_next != NFSNOLIST)
909 panic("nfsbuf on freelist");
910 SET(bp->nb_flags, NB_INVAL);
911 TAILQ_INSERT_HEAD(&nfsbuffree, bp, nb_free);
912 nfsbuffreecnt++;
913 if (freeup)
914 NFS_BUF_FREEUP();
915 } else if (ISSET(bp->nb_flags, NB_DELWRI)) {
916 /* put buffer at end of delwri list */
917 if (bp->nb_free.tqe_next != NFSNOLIST)
918 panic("nfsbuf on freelist");
919 TAILQ_INSERT_TAIL(&nfsbufdelwri, bp, nb_free);
920 nfsbufdelwricnt++;
921 } else {
922 /* update usage timestamp */
923 microuptime(&now);
924 bp->nb_timestamp = now.tv_sec;
925 /* put buffer at end of free list */
926 if (bp->nb_free.tqe_next != NFSNOLIST)
927 panic("nfsbuf on freelist");
928 if (ISSET(bp->nb_flags, NB_META)) {
929 TAILQ_INSERT_TAIL(&nfsbuffreemeta, bp, nb_free);
930 nfsbuffreemetacnt++;
931 } else {
932 TAILQ_INSERT_TAIL(&nfsbuffree, bp, nb_free);
933 nfsbuffreecnt++;
934 }
935 if (freeup)
936 NFS_BUF_FREEUP();
937 }
938
939 NFSBUFCNTCHK();
940
941 /* Unlock the buffer. */
942 CLR(bp->nb_flags, (NB_ASYNC | NB_BUSY | NB_NOCACHE | NB_STABLE | NB_IOD));
943
944 FSDBG_BOT(548, bp, NBOFF(bp), bp->nb_flags, bp->nb_data);
945 }
946
947 /*
948 * Wait for operations on the buffer to complete.
949 * When they do, extract and return the I/O's error value.
950 */
951 int
952 nfs_buf_iowait(struct nfsbuf *bp)
953 {
954 FSDBG_TOP(549, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);
955
956 while (!ISSET(bp->nb_flags, NB_DONE))
957 tsleep(bp, PRIBIO + 1, "nfs_buf_iowait", 0);
958
959 FSDBG_BOT(549, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);
960
961 /* check for interruption of I/O, then errors. */
962 if (ISSET(bp->nb_flags, NB_EINTR)) {
963 CLR(bp->nb_flags, NB_EINTR);
964 return (EINTR);
965 } else if (ISSET(bp->nb_flags, NB_ERROR))
966 return (bp->nb_error ? bp->nb_error : EIO);
967 return (0);
968 }
969
970 /*
971 * Mark I/O complete on a buffer.
972 */
973 void
974 nfs_buf_iodone(struct nfsbuf *bp)
975 {
976 struct vnode *vp;
977
978 FSDBG_TOP(550, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);
979
980 if (ISSET(bp->nb_flags, NB_DONE))
981 panic("nfs_buf_iodone already");
982 SET(bp->nb_flags, NB_DONE); /* note that it's done */
983 /*
984 * I/O was done, so don't believe
985 * the DIRTY state from VM anymore
986 */
987 CLR(bp->nb_flags, NB_WASDIRTY);
988
989 if (!ISSET(bp->nb_flags, NB_READ)) {
990 CLR(bp->nb_flags, NB_WRITEINPROG);
991 vpwakeup(bp->nb_vp);
992 }
993
994 /* Wakeup the throttled write operations as needed */
995 vp = bp->nb_vp;
996 if (vp && (vp->v_flag & VTHROTTLED)
997 && (vp->v_numoutput <= (NFSBUFWRITE_THROTTLE / 3))) {
998 vp->v_flag &= ~VTHROTTLED;
999 wakeup((caddr_t)&vp->v_numoutput);
1000 }
1001
1002 if (ISSET(bp->nb_flags, NB_ASYNC)) /* if async, release it */
1003 nfs_buf_release(bp, 1);
1004 else { /* or just wakeup the buffer */
1005 CLR(bp->nb_flags, NB_WANTED);
1006 wakeup(bp);
1007 }
1008
1009 FSDBG_BOT(550, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);
1010 }
1011
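/*
 * Delayed write: mark the buffer NB_DELWRI and move it to the vnode's
 * dirty block list.  Throttle the caller if the vnode has too many writes
 * in progress, and fall back to an asynchronous write if too many delayed
 * writes are already outstanding.
 */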
1012 void
1013 nfs_buf_write_delayed(struct nfsbuf *bp)
1014 {
1015 struct proc *p = current_proc();
1016 struct vnode *vp = bp->nb_vp;
1017
1018 FSDBG_TOP(551, bp, NBOFF(bp), bp->nb_flags, 0);
1019 FSDBG(551, bp, bp->nb_dirtyoff, bp->nb_dirtyend, bp->nb_dirty);
1020
1021 /*
1022 * If the block hasn't been seen before:
1023 * (1) Mark it as having been seen,
   1024 	 *	(2) Charge for the write, and
   1025 	 *	(3) Make sure it's on its vnode's correct block list.
1026 */
1027 if (!ISSET(bp->nb_flags, NB_DELWRI)) {
1028 SET(bp->nb_flags, NB_DELWRI);
1029 if (p && p->p_stats)
1030 p->p_stats->p_ru.ru_oublock++; /* XXX */
1031 nfs_nbdwrite++;
1032 NFSBUFCNTCHK();
1033 /* move to dirty list */
1034 if (bp->nb_vnbufs.le_next != NFSNOLIST)
1035 LIST_REMOVE(bp, nb_vnbufs);
1036 LIST_INSERT_HEAD(&VTONFS(vp)->n_dirtyblkhd, bp, nb_vnbufs);
1037 }
1038
1039 /*
   1040 	 * If the vnode has "too many" write operations in progress,
   1041 	 * wait for them to finish the I/O.
1042 */
1043 while (vp->v_numoutput >= NFSBUFWRITE_THROTTLE) {
1044 vp->v_flag |= VTHROTTLED;
1045 tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "nfs_buf_write_delayed", 0);
1046 }
1047
1048 /*
1049 * If we have too many delayed write buffers,
1050 * more than we can "safely" handle, just fall back to
1051 * doing the async write
1052 */
1053 if (nfs_nbdwrite < 0)
1054 panic("nfs_buf_write_delayed: Negative nfs_nbdwrite");
1055
1056 if (nfs_nbdwrite > ((nfsbufcnt/4)*3)) {
1057 /* issue async write */
1058 SET(bp->nb_flags, NB_ASYNC);
1059 nfs_buf_write(bp);
1060 FSDBG_BOT(551, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);
1061 return;
1062 }
1063
1064 /* Otherwise, the "write" is done, so mark and release the buffer. */
1065 SET(bp->nb_flags, NB_DONE);
1066 nfs_buf_release(bp, 1);
1067 FSDBG_BOT(551, bp, NBOFF(bp), bp->nb_flags, 0);
1068 return;
1069 }
1070
1071
1072 /*
1073 * Vnode op for read using bio
1074 * Any similarity to readip() is purely coincidental
1075 */
1076 int
1077 nfs_bioread(vp, uio, ioflag, cred, getpages)
1078 register struct vnode *vp;
1079 register struct uio *uio;
1080 int ioflag;
1081 struct ucred *cred;
1082 int getpages; // XXX unused!
1083 {
1084 struct nfsnode *np = VTONFS(vp);
1085 int biosize, i;
1086 off_t diff;
1087 struct nfsbuf *bp = 0, *rabp;
1088 struct vattr vattr;
1089 struct proc *p;
1090 struct nfsmount *nmp = VFSTONFS(vp->v_mount);
1091 daddr_t lbn, rabn, lastrabn = -1;
1092 int bufsize;
1093 int nra, error = 0, n = 0, on = 0;
1094 int operation = (getpages? BLK_PAGEIN : BLK_READ);
1095 caddr_t dp;
1096 struct dirent *direntp;
1097
1098 FSDBG_TOP(514, vp, uio->uio_offset, uio->uio_resid, ioflag);
1099
1100 #if DIAGNOSTIC
1101 if (uio->uio_rw != UIO_READ)
1102 panic("nfs_read mode");
1103 #endif
1104 if (uio->uio_resid == 0) {
1105 FSDBG_BOT(514, vp, 0xd1e0001, 0, 0);
1106 return (0);
1107 }
1108 if (uio->uio_offset < 0) {
1109 FSDBG_BOT(514, vp, 0xd1e0002, 0, EINVAL);
1110 return (EINVAL);
1111 }
1112 p = uio->uio_procp;
1113 if ((nmp->nm_flag & NFSMNT_NFSV3) &&
1114 !(nmp->nm_state & NFSSTA_GOTFSINFO))
1115 (void)nfs_fsinfo(nmp, vp, cred, p);
1116 biosize = vp->v_mount->mnt_stat.f_iosize;
1117 /*
1118 * For nfs, cache consistency can only be maintained approximately.
1119 * Although RFC1094 does not specify the criteria, the following is
1120 * believed to be compatible with the reference port.
1121 * For nqnfs, full cache consistency is maintained within the loop.
1122 * For nfs:
1123 * If the file's modify time on the server has changed since the
1124 * last read rpc or you have written to the file,
1125 * you may have lost data cache consistency with the
1126 * server, so flush all of the file's data out of the cache.
1127 * Then force a getattr rpc to ensure that you have up to date
1128 * attributes.
1129 * NB: This implies that cache data can be read when up to
1130 * NFS_MAXATTRTIMEO seconds out of date. If you find that you need
   1131 	 * current attributes, this could be forced by setting n_xid to 0
1132 * before the VOP_GETATTR() call.
1133 */
1134 if ((nmp->nm_flag & NFSMNT_NQNFS) == 0) {
1135 if (np->n_flag & NMODIFIED) {
1136 if (vp->v_type != VREG) {
1137 if (vp->v_type != VDIR)
1138 panic("nfs: bioread, not dir");
1139 nfs_invaldir(vp);
1140 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
1141 if (error) {
1142 FSDBG_BOT(514, vp, 0xd1e0003, 0, error);
1143 return (error);
1144 }
1145 }
1146 np->n_xid = 0;
1147 error = VOP_GETATTR(vp, &vattr, cred, p);
1148 if (error) {
1149 FSDBG_BOT(514, vp, 0xd1e0004, 0, error);
1150 return (error);
1151 }
1152 if (vp->v_type == VDIR) {
1153 /* if directory changed, purge any name cache entries */
1154 if (np->n_ncmtime != vattr.va_mtime.tv_sec)
1155 cache_purge(vp);
1156 np->n_ncmtime = vattr.va_mtime.tv_sec;
1157 }
1158 np->n_mtime = vattr.va_mtime.tv_sec;
1159 } else {
1160 error = VOP_GETATTR(vp, &vattr, cred, p);
1161 if (error) {
1162 FSDBG_BOT(514, vp, 0xd1e0005, 0, error);
1163 return (error);
1164 }
1165 if (np->n_mtime != vattr.va_mtime.tv_sec) {
1166 if (vp->v_type == VDIR) {
1167 nfs_invaldir(vp);
1168 /* purge name cache entries */
1169 if (np->n_ncmtime != vattr.va_mtime.tv_sec)
1170 cache_purge(vp);
1171 }
1172 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
1173 if (error) {
1174 FSDBG_BOT(514, vp, 0xd1e0006, 0, error);
1175 return (error);
1176 }
1177 if (vp->v_type == VDIR)
1178 np->n_ncmtime = vattr.va_mtime.tv_sec;
1179 np->n_mtime = vattr.va_mtime.tv_sec;
1180 }
1181 }
1182 }
1183 do {
1184
1185 /*
1186 * Get a valid lease. If cached data is stale, flush it.
1187 */
1188 if (nmp->nm_flag & NFSMNT_NQNFS) {
1189 if (NQNFS_CKINVALID(vp, np, ND_READ)) {
1190 do {
1191 error = nqnfs_getlease(vp, ND_READ, cred, p);
1192 } while (error == NQNFS_EXPIRED);
1193 if (error) {
1194 FSDBG_BOT(514, vp, 0xd1e0007, 0, error);
1195 return (error);
1196 }
1197 if (np->n_lrev != np->n_brev ||
1198 (np->n_flag & NQNFSNONCACHE) ||
1199 ((np->n_flag & NMODIFIED) && vp->v_type == VDIR)) {
1200 if (vp->v_type == VDIR)
1201 nfs_invaldir(vp);
1202 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
1203 if (error) {
1204 FSDBG_BOT(514, vp, 0xd1e0008, 0, error);
1205 return (error);
1206 }
1207 np->n_brev = np->n_lrev;
1208 }
1209 } else if (vp->v_type == VDIR && (np->n_flag & NMODIFIED)) {
1210 nfs_invaldir(vp);
1211 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
1212 if (error) {
1213 FSDBG_BOT(514, vp, 0xd1e0009, 0, error);
1214 return (error);
1215 }
1216 }
1217 }
1218 if ((np->n_flag & NQNFSNONCACHE) || (vp->v_flag & VNOCACHE_DATA)) {
1219 if ((vp->v_flag & VNOCACHE_DATA) &&
1220 (np->n_dirtyblkhd.lh_first || np->n_cleanblkhd.lh_first)) {
1221 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
1222 if (error) {
1223 FSDBG_BOT(514, vp, 0xd1e000a, 0, error);
1224 return (error);
1225 }
1226 }
1227 switch (vp->v_type) {
1228 case VREG:
1229 error = nfs_readrpc(vp, uio, cred);
1230 FSDBG_BOT(514, vp, uio->uio_offset, uio->uio_resid, error);
1231 return (error);
1232 case VLNK:
1233 error = nfs_readlinkrpc(vp, uio, cred);
1234 FSDBG_BOT(514, vp, uio->uio_offset, uio->uio_resid, error);
1235 return (error);
1236 case VDIR:
1237 break;
1238 default:
1239 printf(" NQNFSNONCACHE: type %x unexpected\n", vp->v_type);
1240 };
1241 }
1242 switch (vp->v_type) {
1243 case VREG:
1244 lbn = uio->uio_offset / biosize;
1245
1246 /*
1247 * Copy directly from any cached pages without grabbing the bufs.
1248 */
1249 if (uio->uio_segflg == UIO_USERSPACE) {
1250 int io_resid = uio->uio_resid;
1251 diff = np->n_size - uio->uio_offset;
1252 if (diff < io_resid)
1253 io_resid = diff;
1254 if (io_resid > 0) {
1255 error = cluster_copy_ubc_data(vp, uio, &io_resid, 0);
1256 if (error) {
1257 FSDBG_BOT(514, vp, uio->uio_offset, 0xcacefeed, error);
1258 return (error);
1259 }
1260 }
1261 /* count any biocache reads that we just copied directly */
1262 if (lbn != uio->uio_offset / biosize) {
1263 nfsstats.biocache_reads += (uio->uio_offset / biosize) - lbn;
1264 FSDBG(514, vp, 0xcacefeed, uio->uio_offset, error);
1265 }
1266 }
1267
1268 lbn = uio->uio_offset / biosize;
1269 on = uio->uio_offset % biosize;
1270
1271 /*
1272 * Start the read ahead(s), as required.
1273 */
1274 if (nfs_numasync > 0 && nmp->nm_readahead > 0) {
1275 for (nra = 0; nra < nmp->nm_readahead; nra++) {
1276 rabn = lbn + 1 + nra;
1277 if (rabn <= lastrabn) {
1278 /* we've already (tried to) read this block */
1279 /* no need to try it again... */
1280 continue;
1281 }
1282 lastrabn = rabn;
1283 if ((off_t)rabn * biosize >= np->n_size)
1284 break;
1285 /* check if block exists and is valid. */
1286 rabp = nfs_buf_incore(vp, rabn);
1287 if (rabp && nfs_buf_upl_valid_range(rabp, 0, rabp->nb_bufsize))
1288 continue;
1289 rabp = nfs_buf_get(vp, rabn, biosize, p, operation);
1290 if (!rabp) {
1291 FSDBG_BOT(514, vp, 0xd1e000b, 0, EINTR);
1292 return (EINTR);
1293 }
1294 if (!ISSET(rabp->nb_flags, (NB_CACHE|NB_DELWRI))) {
1295 SET(rabp->nb_flags, (NB_READ|NB_ASYNC));
1296 if (nfs_asyncio(rabp, cred)) {
1297 SET(rabp->nb_flags, (NB_INVAL|NB_ERROR));
1298 rabp->nb_error = EIO;
1299 nfs_buf_release(rabp, 1);
1300 }
1301 } else
1302 nfs_buf_release(rabp, 1);
1303 }
1304 }
1305
1306 if ((uio->uio_resid <= 0) || (uio->uio_offset >= np->n_size)) {
1307 FSDBG_BOT(514, vp, uio->uio_offset, uio->uio_resid, 0xaaaaaaaa);
1308 return (0);
1309 }
1310
1311 nfsstats.biocache_reads++;
1312
1313 /*
1314 * If the block is in the cache and has the required data
1315 * in a valid region, just copy it out.
1316 * Otherwise, get the block and write back/read in,
1317 * as required.
1318 */
1319 again:
1320 bufsize = biosize;
1321 n = min((unsigned)(bufsize - on), uio->uio_resid);
1322 diff = np->n_size - uio->uio_offset;
1323 if (diff < n)
1324 n = diff;
1325
1326 bp = nfs_buf_get(vp, lbn, bufsize, p, operation);
1327 if (!bp) {
1328 FSDBG_BOT(514, vp, 0xd1e000c, 0, EINTR);
1329 return (EINTR);
1330 }
1331
1332 /* if any pages are valid... */
1333 if (bp->nb_valid) {
1334 /* ...check for any invalid pages in the read range */
1335 int pg, firstpg, lastpg, dirtypg;
1336 dirtypg = firstpg = lastpg = -1;
1337 pg = on/PAGE_SIZE;
1338 while (pg <= (on + n - 1)/PAGE_SIZE) {
1339 if (!NBPGVALID(bp,pg)) {
1340 if (firstpg < 0)
1341 firstpg = pg;
1342 lastpg = pg;
1343 } else if (firstpg >= 0 && dirtypg < 0 && NBPGDIRTY(bp,pg))
1344 dirtypg = pg;
1345 pg++;
1346 }
1347
1348 /* if there are no invalid pages, we're all set */
1349 if (firstpg < 0) {
1350 if (bp->nb_validoff < 0) {
1351 /* valid range isn't set up, so */
1352 /* set it to what we know is valid */
1353 bp->nb_validoff = trunc_page_32(on);
1354 bp->nb_validend = round_page_32(on+n);
1355 nfs_buf_normalize_valid_range(np, bp);
1356 }
1357 goto buffer_ready;
1358 }
1359
1360 /* there are invalid pages in the read range */
1361 if ((dirtypg > firstpg) && (dirtypg < lastpg)) {
1362 /* there are also dirty page(s) in the range, */
1363 /* so write the buffer out and try again */
1364 CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL));
1365 SET(bp->nb_flags, NB_ASYNC);
1366 /*
1367 * NFS has embedded ucred so crhold() risks zone corruption
1368 */
1369 if (bp->nb_wcred == NOCRED)
1370 bp->nb_wcred = crdup(cred);
1371 error = nfs_buf_write(bp);
1372 if (error) {
1373 FSDBG_BOT(514, vp, 0xd1e000d, 0, error);
1374 return (error);
1375 }
1376 goto again;
1377 }
1378 if (!bp->nb_dirty && bp->nb_dirtyend <= 0 &&
1379 (lastpg - firstpg + 1) > (bufsize/PAGE_SIZE)/2) {
1380 /* we need to read in more than half the buffer and the */
1381 /* buffer's not dirty, so just fetch the whole buffer */
1382 bp->nb_valid = 0;
1383 } else {
1384 /* read the page range in */
1385 struct iovec iov;
1386 struct uio auio;
1387 auio.uio_iov = &iov;
1388 auio.uio_iovcnt = 1;
1389 auio.uio_offset = NBOFF(bp) + firstpg * PAGE_SIZE_64;
1390 auio.uio_resid = (lastpg - firstpg + 1) * PAGE_SIZE;
1391 auio.uio_segflg = UIO_SYSSPACE;
1392 auio.uio_rw = UIO_READ;
1393 auio.uio_procp = p;
1394 NFS_BUF_MAP(bp);
1395 iov.iov_base = bp->nb_data + firstpg * PAGE_SIZE;
1396 iov.iov_len = auio.uio_resid;
1397 error = nfs_readrpc(vp, &auio, cred);
1398 if (error) {
1399 nfs_buf_release(bp, 1);
1400 FSDBG_BOT(514, vp, 0xd1e000e, 0, error);
1401 return (error);
1402 }
1403 /* Make sure that the valid range is set to cover this read. */
1404 bp->nb_validoff = trunc_page_32(on);
1405 bp->nb_validend = round_page_32(on+n);
1406 nfs_buf_normalize_valid_range(np, bp);
1407 if (auio.uio_resid > 0) {
1408 /* if short read, must have hit EOF, */
1409 /* so zero the rest of the range */
1410 bzero(iov.iov_base, auio.uio_resid);
1411 }
1412 /* mark the pages (successfully read) as valid */
1413 for (pg=firstpg; pg <= lastpg; pg++)
1414 NBPGVALID_SET(bp,pg);
1415 }
1416 }
1417 /* if no pages are valid, read the whole block */
1418 if (!bp->nb_valid) {
1419 SET(bp->nb_flags, NB_READ);
1420 CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL));
1421 error = nfs_doio(bp, cred, p);
1422 if (error) {
1423 nfs_buf_release(bp, 1);
1424 FSDBG_BOT(514, vp, 0xd1e000f, 0, error);
1425 return (error);
1426 }
1427 }
1428 buffer_ready:
1429 vp->v_lastr = lbn;
1430 /* validate read range against valid range and clip */
1431 if (bp->nb_validend > 0) {
1432 diff = (on >= bp->nb_validend) ? 0 : (bp->nb_validend - on);
1433 if (diff < n)
1434 n = diff;
1435 }
1436 if (n > 0)
1437 NFS_BUF_MAP(bp);
1438 break;
1439 case VLNK:
1440 nfsstats.biocache_readlinks++;
1441 bp = nfs_buf_get(vp, (daddr_t)0, NFS_MAXPATHLEN, p, operation);
1442 if (!bp) {
1443 FSDBG_BOT(514, vp, 0xd1e0010, 0, EINTR);
1444 return (EINTR);
1445 }
1446 if (!ISSET(bp->nb_flags, NB_CACHE)) {
1447 SET(bp->nb_flags, NB_READ);
1448 error = nfs_doio(bp, cred, p);
1449 if (error) {
1450 SET(bp->nb_flags, NB_ERROR);
1451 nfs_buf_release(bp, 1);
1452 FSDBG_BOT(514, vp, 0xd1e0011, 0, error);
1453 return (error);
1454 }
1455 }
1456 n = min(uio->uio_resid, bp->nb_validend);
1457 on = 0;
1458 break;
1459 case VDIR:
1460 nfsstats.biocache_readdirs++;
1461 if (np->n_direofoffset && uio->uio_offset >= np->n_direofoffset) {
1462 FSDBG_BOT(514, vp, 0xde0f0001, 0, 0);
1463 return (0);
1464 }
1465 lbn = uio->uio_offset / NFS_DIRBLKSIZ;
1466 on = uio->uio_offset & (NFS_DIRBLKSIZ - 1);
1467 bp = nfs_buf_get(vp, lbn, NFS_DIRBLKSIZ, p, operation);
1468 if (!bp) {
1469 FSDBG_BOT(514, vp, 0xd1e0012, 0, EINTR);
1470 return (EINTR);
1471 }
1472 if (!ISSET(bp->nb_flags, NB_CACHE)) {
1473 SET(bp->nb_flags, NB_READ);
1474 error = nfs_doio(bp, cred, p);
1475 if (error) {
1476 nfs_buf_release(bp, 1);
1477 }
1478 while (error == NFSERR_BAD_COOKIE) {
1479 nfs_invaldir(vp);
1480 error = nfs_vinvalbuf(vp, 0, cred, p, 1);
1481 /*
1482 * Yuck! The directory has been modified on the
1483 * server. The only way to get the block is by
1484 * reading from the beginning to get all the
1485 * offset cookies.
1486 */
1487 for (i = 0; i <= lbn && !error; i++) {
1488 if (np->n_direofoffset
1489 && (i * NFS_DIRBLKSIZ) >= np->n_direofoffset) {
1490 FSDBG_BOT(514, vp, 0xde0f0002, 0, 0);
1491 return (0);
1492 }
1493 bp = nfs_buf_get(vp, i, NFS_DIRBLKSIZ, p, operation);
1494 if (!bp) {
1495 FSDBG_BOT(514, vp, 0xd1e0013, 0, EINTR);
1496 return (EINTR);
1497 }
1498 if (!ISSET(bp->nb_flags, NB_CACHE)) {
1499 SET(bp->nb_flags, NB_READ);
1500 error = nfs_doio(bp, cred, p);
1501 /*
1502 * no error + NB_INVAL == directory EOF,
1503 * use the block.
1504 */
1505 if (error == 0 && (bp->nb_flags & NB_INVAL))
1506 break;
1507 }
1508 /*
1509 * An error will throw away the block and the
1510 * for loop will break out. If no error and this
1511 * is not the block we want, we throw away the
1512 * block and go for the next one via the for loop.
1513 */
1514 if (error || i < lbn)
1515 nfs_buf_release(bp, 1);
1516 }
1517 }
1518 /*
1519 * The above while is repeated if we hit another cookie
1520 * error. If we hit an error and it wasn't a cookie error,
1521 * we give up.
1522 */
1523 if (error) {
1524 FSDBG_BOT(514, vp, 0xd1e0014, 0, error);
1525 return (error);
1526 }
1527 }
1528
1529 /*
1530 * If not eof and read aheads are enabled, start one.
1531 * (You need the current block first, so that you have the
1532 * directory offset cookie of the next block.)
1533 */
1534 if (nfs_numasync > 0 && nmp->nm_readahead > 0 &&
1535 (np->n_direofoffset == 0 ||
1536 (lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) &&
1537 !(np->n_flag & NQNFSNONCACHE) &&
1538 !nfs_buf_incore(vp, lbn + 1)) {
1539 rabp = nfs_buf_get(vp, lbn + 1, NFS_DIRBLKSIZ, p,
1540 operation);
1541 if (rabp) {
1542 if (!ISSET(rabp->nb_flags, (NB_CACHE))) {
1543 SET(rabp->nb_flags, (NB_READ | NB_ASYNC));
1544 if (nfs_asyncio(rabp, cred)) {
1545 SET(rabp->nb_flags, (NB_INVAL|NB_ERROR));
1546 rabp->nb_error = EIO;
1547 nfs_buf_release(rabp, 1);
1548 }
1549 } else {
1550 nfs_buf_release(rabp, 1);
1551 }
1552 }
1553 }
1554 /*
1555 * Make sure we use a signed variant of min() since
1556 * the second term may be negative.
1557 */
1558 n = lmin(uio->uio_resid, bp->nb_validend - on);
1559 /*
1560 * We keep track of the directory eof in
1561 * np->n_direofoffset and chop it off as an
1562 * extra step right here.
1563 */
1564 if (np->n_direofoffset &&
1565 n > np->n_direofoffset - uio->uio_offset)
1566 n = np->n_direofoffset - uio->uio_offset;
1567 /*
1568 * Make sure that we return an integral number of entries so
1569 * that any subsequent calls will start copying from the start
1570 * of the next entry.
1571 *
1572 * If the current value of n has the last entry cut short,
1573 * set n to copy everything up to the last entry instead.
1574 */
1575 if (n > 0) {
1576 dp = bp->nb_data + on;
1577 while (dp < (bp->nb_data + on + n)) {
1578 direntp = (struct dirent *)dp;
1579 dp += direntp->d_reclen;
1580 }
1581 if (dp > (bp->nb_data + on + n))
1582 n = (dp - direntp->d_reclen) - (bp->nb_data + on);
1583 }
1584 break;
1585 default:
1586 printf("nfs_bioread: type %x unexpected\n",vp->v_type);
1587 FSDBG_BOT(514, vp, 0xd1e0015, 0, EINVAL);
1588 return (EINVAL);
1589 };
1590
1591 if (n > 0) {
1592 error = uiomove(bp->nb_data + on, (int)n, uio);
1593 }
1594 switch (vp->v_type) {
1595 case VREG:
1596 break;
1597 case VLNK:
1598 n = 0;
1599 break;
1600 case VDIR:
1601 if (np->n_flag & NQNFSNONCACHE)
1602 SET(bp->nb_flags, NB_INVAL);
1603 break;
1604 }
1605 nfs_buf_release(bp, 1);
1606 } while (error == 0 && uio->uio_resid > 0 && n > 0);
1607 FSDBG_BOT(514, vp, uio->uio_offset, uio->uio_resid, error);
1608 return (error);
1609 }
1610
1611
1612 /*
1613 * Vnode op for write using bio
1614 */
1615 int
1616 nfs_write(ap)
1617 struct vop_write_args /* {
1618 struct vnode *a_vp;
1619 struct uio *a_uio;
1620 int a_ioflag;
1621 struct ucred *a_cred;
1622 } */ *ap;
1623 {
1624 struct uio *uio = ap->a_uio;
1625 struct proc *p = uio->uio_procp;
1626 struct vnode *vp = ap->a_vp;
1627 struct nfsnode *np = VTONFS(vp);
1628 struct ucred *cred = ap->a_cred;
1629 int ioflag = ap->a_ioflag;
1630 struct nfsbuf *bp;
1631 struct vattr vattr;
1632 struct nfsmount *nmp = VFSTONFS(vp->v_mount);
1633 daddr_t lbn;
1634 int biosize, bufsize, writeop;
1635 int n, on, error = 0, iomode, must_commit;
1636 off_t boff, start, end, cureof;
1637 struct iovec iov;
1638 struct uio auio;
1639
1640 FSDBG_TOP(515, vp, uio->uio_offset, uio->uio_resid, ioflag);
1641
1642 #if DIAGNOSTIC
1643 if (uio->uio_rw != UIO_WRITE)
1644 panic("nfs_write mode");
1645 if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != current_proc())
1646 panic("nfs_write proc");
1647 #endif
1648 if (vp->v_type != VREG)
1649 return (EIO);
1650 if (np->n_flag & NWRITEERR) {
1651 np->n_flag &= ~NWRITEERR;
1652 FSDBG_BOT(515, vp, uio->uio_offset, uio->uio_resid, np->n_error);
1653 return (np->n_error);
1654 }
1655 if ((nmp->nm_flag & NFSMNT_NFSV3) &&
1656 !(nmp->nm_state & NFSSTA_GOTFSINFO))
1657 (void)nfs_fsinfo(nmp, vp, cred, p);
1658 if (ioflag & (IO_APPEND | IO_SYNC)) {
1659 if (np->n_flag & NMODIFIED) {
1660 np->n_xid = 0;
1661 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
1662 if (error) {
1663 FSDBG_BOT(515, vp, uio->uio_offset, 0x10bad01, error);
1664 return (error);
1665 }
1666 }
1667 if (ioflag & IO_APPEND) {
1668 np->n_xid = 0;
1669 error = VOP_GETATTR(vp, &vattr, cred, p);
1670 if (error) {
1671 FSDBG_BOT(515, vp, uio->uio_offset, 0x10bad02, error);
1672 return (error);
1673 }
1674 uio->uio_offset = np->n_size;
1675 }
1676 }
1677 if (uio->uio_offset < 0) {
1678 FSDBG_BOT(515, vp, uio->uio_offset, 0xbad0ff, EINVAL);
1679 return (EINVAL);
1680 }
1681 if (uio->uio_resid == 0) {
1682 FSDBG_BOT(515, vp, uio->uio_offset, uio->uio_resid, 0);
1683 return (0);
1684 }
1685 /*
1686 * Maybe this should be above the vnode op call, but so long as
   1687 	 * file servers have no limits, I don't think it matters
1688 */
1689 if (p && uio->uio_offset + uio->uio_resid >
1690 p->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
1691 psignal(p, SIGXFSZ);
1692 FSDBG_BOT(515, vp, uio->uio_offset, 0x2b1f, EFBIG);
1693 return (EFBIG);
1694 }
1695
1696 biosize = vp->v_mount->mnt_stat.f_iosize;
1697
1698 do {
1699 /*
1700 * Check for a valid write lease.
1701 */
1702 if ((nmp->nm_flag & NFSMNT_NQNFS) &&
1703 NQNFS_CKINVALID(vp, np, ND_WRITE)) {
1704 do {
1705 error = nqnfs_getlease(vp, ND_WRITE, cred, p);
1706 } while (error == NQNFS_EXPIRED);
1707 if (error) {
1708 FSDBG_BOT(515, vp, uio->uio_offset, 0x11110001, error);
1709 return (error);
1710 }
1711 if (np->n_lrev != np->n_brev ||
1712 (np->n_flag & NQNFSNONCACHE)) {
1713 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
1714 if (error) {
1715 FSDBG_BOT(515, vp, uio->uio_offset, 0x11110002, error);
1716 return (error);
1717 }
1718 np->n_brev = np->n_lrev;
1719 }
1720 }
1721 if (ISSET(vp->v_flag, VNOCACHE_DATA) &&
1722 (np->n_dirtyblkhd.lh_first || np->n_cleanblkhd.lh_first)) {
1723 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
1724 if (error) {
1725 FSDBG_BOT(515, vp, 0, 0, error);
1726 return (error);
1727 }
1728 }
1729 if (((np->n_flag & NQNFSNONCACHE) ||
1730 ISSET(vp->v_flag, VNOCACHE_DATA)) &&
1731 uio->uio_iovcnt == 1) {
1732 iomode = NFSV3WRITE_FILESYNC;
1733 error = nfs_writerpc(vp, uio, cred, &iomode, &must_commit);
1734 if (must_commit)
1735 nfs_clearcommit(vp->v_mount);
1736 FSDBG_BOT(515, vp, uio->uio_offset, uio->uio_resid, error);
1737 return (error);
1738 }
1739 nfsstats.biocache_writes++;
1740 lbn = uio->uio_offset / biosize;
1741 on = uio->uio_offset % biosize;
1742 n = min((unsigned)(biosize - on), uio->uio_resid);
1743 again:
1744 bufsize = biosize;
1745 /*
1746 * Get a cache block for writing. The range to be written is
1747 * (off..off+n) within the block. We ensure that the block
1748 * either has no dirty region or that the given range is
1749 * contiguous with the existing dirty region.
1750 */
1751 bp = nfs_buf_get(vp, lbn, bufsize, p, BLK_WRITE);
1752 if (!bp) {
1753 FSDBG_BOT(515, vp, uio->uio_offset, uio->uio_resid, EINTR);
1754 return (EINTR);
1755 }
1756 /* map the block because we know we're going to write to it */
1757 NFS_BUF_MAP(bp);
1758
1759 if (ISSET(vp->v_flag, VNOCACHE_DATA))
1760 SET(bp->nb_flags, (NB_NOCACHE|NB_INVAL));
1761
1762 /*
1763 * NFS has embedded ucred so crhold() risks zone corruption
1764 */
1765 if (bp->nb_wcred == NOCRED)
1766 bp->nb_wcred = crdup(cred);
1767
1768 /*
1769 * If there's already a dirty range AND dirty pages in this block we
1770 * need to send a commit AND write the dirty pages before continuing.
1771 *
1772 * If there's already a dirty range OR dirty pages in this block
1773 * and the new write range is not contiguous with the existing range,
1774 * then force the buffer to be written out now.
1775 * (We used to just extend the dirty range to cover the valid,
1776 * but unwritten, data in between also. But writing ranges
1777 * of data that weren't actually written by an application
1778 * risks overwriting some other client's data with stale data
1779 * that's just masquerading as new written data.)
1780 */
1781 if (bp->nb_dirtyend > 0) {
1782 if (on > bp->nb_dirtyend || (on + n) < bp->nb_dirtyoff || bp->nb_dirty) {
1783 FSDBG(515, vp, uio->uio_offset, bp, 0xd15c001);
1784 /* write/commit buffer "synchronously" */
1785 /* (NB_STABLE indicates that data writes should be FILESYNC) */
1786 CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL));
1787 SET(bp->nb_flags, (NB_ASYNC | NB_STABLE));
1788 error = nfs_buf_write(bp);
1789 if (error) {
1790 FSDBG_BOT(515, vp, uio->uio_offset, uio->uio_resid, error);
1791 return (error);
1792 }
1793 goto again;
1794 }
1795 } else if (bp->nb_dirty) {
1796 int firstpg, lastpg;
1797 u_int32_t pagemask;
1798 /* calculate write range pagemask */
1799 firstpg = on/PAGE_SIZE;
1800 lastpg = (on+n-1)/PAGE_SIZE;
1801 pagemask = ((1 << (lastpg+1)) - 1) & ~((1 << firstpg) - 1);
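			/* e.g. with 4K pages: on = 4096, n = 8192 gives firstpg = 1,
			 * lastpg = 2, pagemask = 0x06 (pages 1 and 2) */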
1802 /* check if there are dirty pages outside the write range */
1803 if (bp->nb_dirty & ~pagemask) {
1804 FSDBG(515, vp, uio->uio_offset, bp, 0xd15c002);
1805 /* write/commit buffer "synchronously" */
1806 /* (NB_STABLE indicates that data writes should be FILESYNC) */
1807 CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL));
1808 SET(bp->nb_flags, (NB_ASYNC | NB_STABLE));
1809 error = nfs_buf_write(bp);
1810 if (error) {
1811 FSDBG_BOT(515, vp, uio->uio_offset, uio->uio_resid, error);
1812 return (error);
1813 }
1814 goto again;
1815 }
1816 /* if the first or last pages are already dirty */
1817 /* make sure that the dirty range encompasses those pages */
1818 if (NBPGDIRTY(bp,firstpg) || NBPGDIRTY(bp,lastpg)) {
1819 FSDBG(515, vp, uio->uio_offset, bp, 0xd15c003);
1820 bp->nb_dirtyoff = min(on, firstpg * PAGE_SIZE);
1821 if (NBPGDIRTY(bp,lastpg)) {
1822 bp->nb_dirtyend = (lastpg+1) * PAGE_SIZE;
1823 /* clip to EOF */
1824 if (NBOFF(bp) + bp->nb_dirtyend > np->n_size)
1825 bp->nb_dirtyend = np->n_size - NBOFF(bp);
1826 } else
1827 bp->nb_dirtyend = on+n;
1828 }
1829 }
1830
1831 /*
1832 * Are we extending the size of the file with this write?
1833 * If so, update file size now that we have the block.
1834 * If there was a partial buf at the old eof, validate
1835 * and zero the new bytes.
1836 */
1837 cureof = (off_t)np->n_size;
1838 if (uio->uio_offset + n > np->n_size) {
1839 struct nfsbuf *eofbp = NULL;
1840 daddr_t eofbn = np->n_size / biosize;
1841 int eofoff = np->n_size % biosize;
1842 int neweofoff = (uio->uio_offset + n) % biosize;
1843
1844 FSDBG(515, 0xb1ffa000, uio->uio_offset + n, eofoff, neweofoff);
1845
1846 if (eofoff && eofbn < lbn && nfs_buf_incore(vp, eofbn))
1847 eofbp = nfs_buf_get(vp, eofbn, biosize, p, BLK_WRITE);
1848
1849 /* if we're extending within the same last block */
1850 /* and the block is flagged as being cached... */
1851 if ((lbn == eofbn) && ISSET(bp->nb_flags, NB_CACHE)) {
1852 /* ...check that all pages in buffer are valid */
1853 int endpg = ((neweofoff ? neweofoff : biosize) - 1)/PAGE_SIZE;
1854 u_int32_t pagemask;
1855 /* pagemask only has to extend to last page being written to */
1856 pagemask = (1 << (endpg+1)) - 1;
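/*
 * For illustration (example values, assuming 4 KB pages): if the new EOF
 * lands at neweofoff = 10000 within this block, endpg = 9999/4096 = 2 and
 * pagemask = (1 << 3) - 1 = 0x7, so only pages 0 through 2 (up to the last
 * page being written) need to be valid here.
 */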
1857 FSDBG(515, 0xb1ffa001, bp->nb_valid, pagemask, 0);
1858 if ((bp->nb_valid & pagemask) != pagemask) {
1859 /* zerofill any hole */
1860 if (on > bp->nb_validend) {
1861 int i;
1862 for (i=bp->nb_validend/PAGE_SIZE; i <= (on - 1)/PAGE_SIZE; i++)
1863 NBPGVALID_SET(bp, i);
1864 NFS_BUF_MAP(bp);
1865 FSDBG(516, bp, bp->nb_validend, on - bp->nb_validend, 0xf01e);
1866 bzero((char *)bp->nb_data + bp->nb_validend,
1867 on - bp->nb_validend);
1868 }
1869 /* zerofill any trailing data in the last page */
1870 if (neweofoff) {
1871 NFS_BUF_MAP(bp);
1872 FSDBG(516, bp, neweofoff, PAGE_SIZE - (neweofoff & PAGE_MASK), 0xe0f);
1873 bzero((char *)bp->nb_data + neweofoff,
1874 PAGE_SIZE - (neweofoff & PAGE_MASK));
1875 }
1876 }
1877 }
1878 np->n_flag |= NMODIFIED;
1879 np->n_size = uio->uio_offset + n;
1880 ubc_setsize(vp, (off_t)np->n_size); /* XXX errors */
1881 if (eofbp) {
1882 /*
1883 * We may need to zero any previously invalid data
1884 * after the old EOF in the previous EOF buffer.
1885 *
1886 * For the old last page, don't zero bytes if there
1887 * are invalid bytes in that page (i.e. the page isn't
1888 * currently valid).
1889 * For pages after the old last page, zero them and
1890 * mark them as valid.
1891 */
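/*
 * For illustration (example values, assuming 4 KB pages): if the old EOF
 * fell at eofoff = 5000 in the old last block, the loop below starts at
 * page i = 1 with poff = 904, so bytes 5000..8191 are zeroed only if that
 * page is already valid; pages 2 and up are whole pages (poff = 0), so
 * they are zeroed and marked valid unconditionally.
 */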
1892 char *d;
1893 int i;
1894 if (ISSET(vp->v_flag, VNOCACHE_DATA))
1895 SET(eofbp->nb_flags, (NB_NOCACHE|NB_INVAL));
1896 NFS_BUF_MAP(eofbp);
1897 FSDBG(516, eofbp, eofoff, biosize - eofoff, 0xe0fff01e);
1898 d = eofbp->nb_data;
1899 i = eofoff/PAGE_SIZE;
1900 while (eofoff < biosize) {
1901 int poff = eofoff & PAGE_MASK;
1902 if (!poff || NBPGVALID(eofbp,i)) {
1903 bzero(d + eofoff, PAGE_SIZE - poff);
1904 NBPGVALID_SET(eofbp, i);
1905 }
1906 if (bp->nb_validend == eofoff)
1907 bp->nb_validend += PAGE_SIZE - poff;
1908 eofoff += PAGE_SIZE - poff;
1909 i++;
1910 }
1911 nfs_buf_release(eofbp, 1);
1912 }
1913 }
1914 /*
1915 * If dirtyend exceeds file size, chop it down. This should
1916 * not occur unless there is a race.
1917 */
1918 if (NBOFF(bp) + bp->nb_dirtyend > np->n_size)
1919 bp->nb_dirtyend = np->n_size - NBOFF(bp);
1920 /*
1921 * UBC doesn't handle partial pages, so we need to make sure
1922 * that any pages left in the page cache are completely valid.
1923 *
1924 * Writes that are smaller than a block are delayed if they
1925 * don't extend to the end of the block.
1926 *
1927 * If the block isn't (completely) cached, we may need to read
1928 * in some parts of pages that aren't covered by the write.
1929 * If the write offset (on) isn't page aligned, we'll need to
1930 * read the start of the first page being written to. Likewise,
1931 * if the offset of the end of the write (on+n) isn't page aligned,
1932 * we'll need to read the end of the last page being written to.
1933 *
1934 * Notes:
1935 * We don't want to read anything we're just going to write over.
1936 * We don't want to issue multiple I/Os if we don't have to
1937 * (because they're synchronous rpcs).
1938 * We don't want to read anything we already have modified in the
1939 * page cache.
1940 */
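/*
 * For illustration (example values, assuming 4 KB pages): for an uncached
 * block and a write with on = 6000, n = 4000 (on+n = 10000), firstpg = 1
 * and firstpgoff = 1904 give start = 4096, end = 6000; lastpg = 2 and
 * lastpgoff = 1808 then push end out to 12288. The logic below may split
 * that range into two reads around dirty pages, or widen it to pull in
 * adjacent invalid pages.
 */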
1941 if (!ISSET(bp->nb_flags, NB_CACHE) && n < biosize) {
1942 int firstpg, lastpg, dirtypg;
1943 int firstpgoff, lastpgoff;
1944 start = end = -1;
1945 firstpg = on/PAGE_SIZE;
1946 firstpgoff = on & PAGE_MASK;
1947 lastpg = (on+n-1)/PAGE_SIZE;
1948 lastpgoff = (on+n) & PAGE_MASK;
1949 if (firstpgoff && !NBPGVALID(bp,firstpg)) {
1950 /* need to read start of first page */
1951 start = firstpg * PAGE_SIZE;
1952 end = start + firstpgoff;
1953 }
1954 if (lastpgoff && !NBPGVALID(bp,lastpg)) {
1955 /* need to read end of last page */
1956 if (start < 0)
1957 start = (lastpg * PAGE_SIZE) + lastpgoff;
1958 end = (lastpg + 1) * PAGE_SIZE;
1959 }
1960 if (end > start) {
1961 /* need to read the data in range: start...end-1 */
1962
1963 /* first, check for dirty pages in between */
1964 /* if there are, we'll have to do two reads because */
1965 /* we don't want to overwrite the dirty pages. */
1966 for (dirtypg=start/PAGE_SIZE; dirtypg <= (end-1)/PAGE_SIZE; dirtypg++)
1967 if (NBPGDIRTY(bp,dirtypg))
1968 break;
1969
1970 /* if start is at beginning of page, try */
1971 /* to get any preceding pages as well. */
1972 if (!(start & PAGE_MASK)) {
1973 /* stop at next dirty/valid page or start of block */
1974 for (; start > 0; start-=PAGE_SIZE)
1975 if (NBPGVALID(bp,((start-1)/PAGE_SIZE)))
1976 break;
1977 }
1978
1979 NFS_BUF_MAP(bp);
1980 /* setup uio for read(s) */
1981 boff = NBOFF(bp);
1982 auio.uio_iov = &iov;
1983 auio.uio_iovcnt = 1;
1984 auio.uio_segflg = UIO_SYSSPACE;
1985 auio.uio_rw = UIO_READ;
1986 auio.uio_procp = p;
1987
1988 if (dirtypg <= (end-1)/PAGE_SIZE) {
1989 /* there's a dirty page in the way, so just do two reads */
1990 /* we'll read the preceding data here */
1991 auio.uio_offset = boff + start;
1992 auio.uio_resid = iov.iov_len = on - start;
1993 iov.iov_base = bp->nb_data + start;
1994 error = nfs_readrpc(vp, &auio, cred);
1995 if (error) {
1996 bp->nb_error = error;
1997 SET(bp->nb_flags, NB_ERROR);
1998 printf("nfs_write: readrpc %d\n", error);
1999 }
2000 if (auio.uio_resid > 0) {
2001 FSDBG(516, bp, iov.iov_base - bp->nb_data, auio.uio_resid, 0xd00dee01);
2002 bzero(iov.iov_base, auio.uio_resid);
2003 }
2004 /* update validoff/validend if necessary */
2005 if ((bp->nb_validoff < 0) || (bp->nb_validoff > start))
2006 bp->nb_validoff = start;
2007 if ((bp->nb_validend < 0) || (bp->nb_validend < on))
2008 bp->nb_validend = on;
2009 if (np->n_size > boff + bp->nb_validend)
2010 bp->nb_validend = min(np->n_size - (boff + start), biosize);
2011 /* validate any pages before the write offset */
2012 for (; start < trunc_page_32(on); start+=PAGE_SIZE)
2013 NBPGVALID_SET(bp, start/PAGE_SIZE);
2014 /* adjust start to read any trailing data */
2015 start = on+n;
2016 }
2017
2018 /* if end is at end of page, try to */
2019 /* get any following pages as well. */
2020 if (!(end & PAGE_MASK)) {
2021 /* stop at next valid page or end of block */
2022 for (; end < bufsize; end+=PAGE_SIZE)
2023 if (NBPGVALID(bp,end/PAGE_SIZE))
2024 break;
2025 }
2026
2027 if (((boff+start) >= cureof) || ((start >= on) && ((boff + on + n) >= cureof))) {
2028 /*
2029 * Either this entire read is beyond the current EOF
2030 * or the range that we won't be modifying (on+n...end)
2031 * is all beyond the current EOF.
2032 * No need to make a trip across the network to
2033 * read nothing. So, just zero the buffer instead.
2034 */
2035 FSDBG(516, bp, start, end - start, 0xd00dee00);
2036 bzero(bp->nb_data + start, end - start);
2037 } else {
2038 /* now we'll read the (rest of the) data */
2039 auio.uio_offset = boff + start;
2040 auio.uio_resid = iov.iov_len = end - start;
2041 iov.iov_base = bp->nb_data + start;
2042 error = nfs_readrpc(vp, &auio, cred);
2043 if (error) {
2044 bp->nb_error = error;
2045 SET(bp->nb_flags, NB_ERROR);
2046 printf("nfs_write: readrpc %d\n", error);
2047 }
2048 if (auio.uio_resid > 0) {
2049 FSDBG(516, bp, iov.iov_base - bp->nb_data, auio.uio_resid, 0xd00dee02);
2050 bzero(iov.iov_base, auio.uio_resid);
2051 }
2052 }
2053 /* update validoff/validend if necessary */
2054 if ((bp->nb_validoff < 0) || (bp->nb_validoff > start))
2055 bp->nb_validoff = start;
2056 if ((bp->nb_validend < 0) || (bp->nb_validend < end))
2057 bp->nb_validend = end;
2058 if (np->n_size > boff + bp->nb_validend)
2059 bp->nb_validend = min(np->n_size - (boff + start), biosize);
2060 /* validate any pages before the write offset's page */
2061 for (; start < trunc_page_32(on); start+=PAGE_SIZE)
2062 NBPGVALID_SET(bp, start/PAGE_SIZE);
2063 /* validate any pages after the range of pages being written to */
2064 for (; (end - 1) > round_page_32(on+n-1); end-=PAGE_SIZE)
2065 NBPGVALID_SET(bp, (end-1)/PAGE_SIZE);
2066 /* Note: pages being written to will be validated when written */
2067 }
2068 }
2069
2070 if (ISSET(bp->nb_flags, NB_ERROR)) {
2071 error = bp->nb_error;
2072 nfs_buf_release(bp, 1);
2073 FSDBG_BOT(515, vp, uio->uio_offset, uio->uio_resid, error);
2074 return (error);
2075 }
2076
2077 np->n_flag |= NMODIFIED;
2078
2079 /*
2080 * Check for valid write lease and get one as required.
2081 * In case nfs_buf_get() and/or nfs_buf_write() delayed us.
2082 */
2083 if ((nmp->nm_flag & NFSMNT_NQNFS) &&
2084 NQNFS_CKINVALID(vp, np, ND_WRITE)) {
2085 do {
2086 error = nqnfs_getlease(vp, ND_WRITE, cred, p);
2087 } while (error == NQNFS_EXPIRED);
2088 if (error) {
2089 nfs_buf_release(bp, 1);
2090 FSDBG_BOT(515, vp, uio->uio_offset, 0x11220001, error);
2091 return (error);
2092 }
2093 if (np->n_lrev != np->n_brev ||
2094 (np->n_flag & NQNFSNONCACHE)) {
2095 nfs_buf_release(bp, 1);
2096 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
2097 if (error) {
2098 FSDBG_BOT(515, vp, uio->uio_offset, 0x11220002, error);
2099 return (error);
2100 }
2101 np->n_brev = np->n_lrev;
2102 goto again;
2103 }
2104 }
2105 NFS_BUF_MAP(bp);
2106 error = uiomove((char *)bp->nb_data + on, n, uio);
2107 if (error) {
2108 SET(bp->nb_flags, NB_ERROR);
2109 nfs_buf_release(bp, 1);
2110 FSDBG_BOT(515, vp, uio->uio_offset, uio->uio_resid, error);
2111 return (error);
2112 }
2113
2114 /* validate any pages written to */
2115 start = on & ~PAGE_MASK;
2116 for (; start < on+n; start += PAGE_SIZE) {
2117 NBPGVALID_SET(bp, start/PAGE_SIZE);
2118 /*
2119 * This may seem a little weird, but we don't actually set the
2120 * dirty bits for writes. This is because we keep the dirty range
2121 * in the nb_dirtyoff/nb_dirtyend fields. Also, particularly for
2122 * delayed writes, when we give the pages back to the VM we don't
2123 * want to keep them marked dirty, because when we later write the
2124 * buffer we won't be able to tell which pages were written dirty
2125 * and which pages were mmapped and dirtied.
2126 */
2127 }
2128 if (bp->nb_dirtyend > 0) {
2129 bp->nb_dirtyoff = min(on, bp->nb_dirtyoff);
2130 bp->nb_dirtyend = max((on + n), bp->nb_dirtyend);
2131 } else {
2132 bp->nb_dirtyoff = on;
2133 bp->nb_dirtyend = on + n;
2134 }
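/*
 * For illustration (example values): a first write of bytes 6000..9999
 * into a clean block sets nb_dirtyoff = 6000, nb_dirtyend = 10000; a later
 * write of bytes 10000..11999 is contiguous, so it only extends
 * nb_dirtyend to 12000. (Non-contiguous writes were already forced out
 * above via the "goto again" path.)
 */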
2135 if (bp->nb_validend <= 0 || bp->nb_validend < bp->nb_dirtyoff ||
2136 bp->nb_validoff > bp->nb_dirtyend) {
2137 bp->nb_validoff = bp->nb_dirtyoff;
2138 bp->nb_validend = bp->nb_dirtyend;
2139 } else {
2140 bp->nb_validoff = min(bp->nb_validoff, bp->nb_dirtyoff);
2141 bp->nb_validend = max(bp->nb_validend, bp->nb_dirtyend);
2142 }
2143 if (!ISSET(bp->nb_flags, NB_CACHE))
2144 nfs_buf_normalize_valid_range(np, bp);
2145
2146 /*
2147 * Since this block is being modified, it must be written
2148 * again and not just committed.
2149 */
2150 if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
2151 np->n_needcommitcnt--;
2152 CHECK_NEEDCOMMITCNT(np);
2153 }
2154 CLR(bp->nb_flags, NB_NEEDCOMMIT);
2155
2156 if ((np->n_flag & NQNFSNONCACHE) ||
2157 (ioflag & IO_SYNC) || (vp->v_flag & VNOCACHE_DATA)) {
2158 bp->nb_proc = p;
2159 error = nfs_buf_write(bp);
2160 if (error) {
2161 FSDBG_BOT(515, vp, uio->uio_offset,
2162 uio->uio_resid, error);
2163 return (error);
2164 }
2165 if (np->n_flag & NQNFSNONCACHE) {
2166 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
2167 if (error) {
2168 FSDBG_BOT(515, vp, uio->uio_offset,
2169 uio->uio_resid, error);
2170 return (error);
2171 }
2172 }
2173 } else if ((n + on) == biosize && (nmp->nm_flag & NFSMNT_NQNFS) == 0) {
2174 bp->nb_proc = (struct proc *)0;
2175 SET(bp->nb_flags, NB_ASYNC);
2176 nfs_buf_write(bp);
2177 } else
2178 nfs_buf_write_delayed(bp);
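/*
 * In short, the policy above is: a synchronous write for IO_SYNC,
 * non-cached (VNOCACHE_DATA), or NQNFS non-caching I/O; an immediate
 * asynchronous write when the write reaches the end of the block on a
 * non-NQNFS mount; and a delayed write otherwise.
 */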
2179
2180 if (np->n_needcommitcnt > (nbuf/16))
2181 nfs_flushcommits(vp, p);
2182
2183 } while (uio->uio_resid > 0 && n > 0);
2184
2185 FSDBG_BOT(515, vp, uio->uio_offset, uio->uio_resid, 0);
2186 return (0);
2187 }
2188
2189 /*
2190 * Flush out and invalidate all buffers associated with a vnode.
2191 * Called with the underlying object locked.
2192 */
2193 static int
2194 nfs_vinvalbuf_internal(vp, flags, cred, p, slpflag, slptimeo)
2195 register struct vnode *vp;
2196 int flags;
2197 struct ucred *cred;
2198 struct proc *p;
2199 int slpflag, slptimeo;
2200 {
2201 struct nfsbuf *bp;
2202 struct nfsbuf *nbp, *blist;
2203 int s, error = 0;
2204 struct nfsnode *np = VTONFS(vp);
2205
2206 if (flags & V_SAVE) {
2207 if (error = VOP_FSYNC(vp, cred, MNT_WAIT, p))
2208 return (error);
2209 if (np->n_dirtyblkhd.lh_first)
2210 panic("nfs_vinvalbuf: dirty bufs (vp 0x%x, bp 0x%x)",
2211 vp, np->n_dirtyblkhd.lh_first);
2212 }
2213
2214 for (;;) {
2215 blist = np->n_cleanblkhd.lh_first;
2216 if (!blist)
2217 blist = np->n_dirtyblkhd.lh_first;
2218 if (!blist)
2219 break;
2220
2221 for (bp = blist; bp; bp = nbp) {
2222 nbp = bp->nb_vnbufs.le_next;
2223 s = splbio();
2224 if (ISSET(bp->nb_flags, NB_BUSY)) {
2225 SET(bp->nb_flags, NB_WANTED);
2226 FSDBG_TOP(556, vp, bp, NBOFF(bp), bp->nb_flags);
2227 error = tsleep((caddr_t)bp,
2228 slpflag | (PRIBIO + 1), "nfs_vinvalbuf",
2229 slptimeo);
2230 FSDBG_BOT(556, vp, bp, NBOFF(bp), bp->nb_flags);
2231 splx(s);
2232 if (error) {
2233 FSDBG(554, vp, bp, -1, error);
2234 return (error);
2235 }
2236 break;
2237 }
2238 FSDBG(554, vp, bp, NBOFF(bp), bp->nb_flags);
2239 nfs_buf_remfree(bp);
2240 SET(bp->nb_flags, NB_BUSY);
2241 splx(s);
2242 if ((flags & V_SAVE) && UBCINFOEXISTS(vp) && (NBOFF(bp) < np->n_size)) {
2243 /* XXX extra paranoia: make sure we're not */
2244 /* somehow leaving any dirty data around */
2245 int mustwrite = 0;
2246 int end = (NBOFF(bp) + bp->nb_bufsize >= np->n_size) ?
2247 bp->nb_bufsize : (np->n_size - NBOFF(bp));
2248 if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
2249 error = nfs_buf_upl_setup(bp);
2250 if (error == EINVAL) {
2251 /* vm object must no longer exist */
2252 /* hopefully we don't need to do */
2253 /* anything for this buffer */
2254 } else if (error)
2255 printf("nfs_vinvalbuf: upl setup failed %d\n",
2256 error);
2257 bp->nb_valid = bp->nb_dirty = 0;
2258 }
2259 nfs_buf_upl_check(bp);
2260 /* check for any dirty data before the EOF */
2261 if (bp->nb_dirtyend && bp->nb_dirtyoff < end) {
2262 /* clip dirty range to EOF */
2263 if (bp->nb_dirtyend > end)
2264 bp->nb_dirtyend = end;
2265 mustwrite++;
2266 }
2267 bp->nb_dirty &= (1 << (round_page_32(end)/PAGE_SIZE)) - 1;
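/*
 * The mask above keeps dirty bits only for pages that overlap the part
 * of the buffer before EOF. For example, assuming 4 KB pages, end = 6000
 * gives round_page_32(end) = 8192 and a mask of (1 << 2) - 1 = 0x3, so
 * only pages 0 and 1 can remain dirty.
 */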
2268 if (bp->nb_dirty)
2269 mustwrite++;
2270 if (mustwrite) {
2271 FSDBG(554, vp, bp, 0xd00dee, bp->nb_flags);
2272 if (!ISSET(bp->nb_flags, NB_PAGELIST))
2273 panic("nfs_vinvalbuf: dirty buffer without upl");
2274 /* gotta write out dirty data before invalidating */
2275 /* (NB_STABLE indicates that data writes should be FILESYNC) */
2276 /* (NB_NOCACHE indicates buffer should be discarded) */
2277 CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL | NB_ASYNC));
2278 SET(bp->nb_flags, NB_STABLE | NB_NOCACHE);
2279 /*
2280 * NFS has embedded ucred so crhold() risks zone corruption
2281 */
2282 if (bp->nb_wcred == NOCRED)
2283 bp->nb_wcred = crdup(cred);
2284 error = nfs_buf_write(bp);
2285 // Note: bp has been released
2286 if (error) {
2287 FSDBG(554, bp, 0xd00dee, 0xbad, error);
2288 np->n_error = error;
2289 np->n_flag |= NWRITEERR;
2290 error = 0;
2291 }
2292 break;
2293 }
2294 }
2295 SET(bp->nb_flags, NB_INVAL);
2296 // Note: We don't want to do FREEUPs here because
2297 // that may modify the buffer chain we're iterating!
2298 nfs_buf_release(bp, 0);
2299 }
2300 }
2301 NFS_BUF_FREEUP();
2302 if (np->n_dirtyblkhd.lh_first || np->n_cleanblkhd.lh_first)
2303 panic("nfs_vinvalbuf: flush failed");
2304 return (0);
2305 }
2306
2307
2308 /*
2309 * Flush and invalidate all dirty buffers. If another process is already
2310 * doing the flush, just wait for completion.
2311 */
2312 int
2313 nfs_vinvalbuf(vp, flags, cred, p, intrflg)
2314 struct vnode *vp;
2315 int flags;
2316 struct ucred *cred;
2317 struct proc *p;
2318 int intrflg;
2319 {
2320 register struct nfsnode *np = VTONFS(vp);
2321 struct nfsmount *nmp = VFSTONFS(vp->v_mount);
2322 int error = 0, slpflag, slptimeo;
2323 int didhold = 0;
2324
2325 FSDBG_TOP(554, vp, flags, intrflg, 0);
2326
2327 if (nmp && ((nmp->nm_flag & NFSMNT_INT) == 0))
2328 intrflg = 0;
2329 if (intrflg) {
2330 slpflag = PCATCH;
2331 slptimeo = 2 * hz;
2332 } else {
2333 slpflag = 0;
2334 slptimeo = 0;
2335 }
2336 /*
2337 * First wait for any other process doing a flush to complete.
2338 */
2339 while (np->n_flag & NFLUSHINPROG) {
2340 np->n_flag |= NFLUSHWANT;
2341 FSDBG_TOP(555, vp, flags, intrflg, np->n_flag);
2342 error = tsleep((caddr_t)&np->n_flag, PRIBIO + 2, "nfsvinval", slptimeo);
2343 FSDBG_BOT(555, vp, flags, intrflg, np->n_flag);
2344 if (error && (error = nfs_sigintr(VFSTONFS(vp->v_mount), NULL, p))) {
2345 FSDBG_BOT(554, vp, flags, intrflg, error);
2346 return (error);
2347 }
2348 }
2349
2350 /*
2351 * Now, flush as required.
2352 */
2353 np->n_flag |= NFLUSHINPROG;
2354 error = nfs_vinvalbuf_internal(vp, flags, cred, p, slpflag, 0);
2355 while (error) {
2356 FSDBG(554, vp, 0, 0, error);
2357 error = nfs_sigintr(VFSTONFS(vp->v_mount), NULL, p);
2358 if (error) {
2359 np->n_flag &= ~NFLUSHINPROG;
2360 if (np->n_flag & NFLUSHWANT) {
2361 np->n_flag &= ~NFLUSHWANT;
2362 wakeup((caddr_t)&np->n_flag);
2363 }
2364 FSDBG_BOT(554, vp, flags, intrflg, error);
2365 return (error);
2366 }
2367 error = nfs_vinvalbuf_internal(vp, flags, cred, p, 0, slptimeo);
2368 }
2369 np->n_flag &= ~(NMODIFIED | NFLUSHINPROG);
2370 if (np->n_flag & NFLUSHWANT) {
2371 np->n_flag &= ~NFLUSHWANT;
2372 wakeup((caddr_t)&np->n_flag);
2373 }
2374 didhold = ubc_hold(vp);
2375 if (didhold) {
2376 int rv = ubc_clean(vp, 1); /* get the pages out of vm also */
2377 if (!rv)
2378 panic("nfs_vinvalbuf(): ubc_clean failed!");
2379 ubc_rele(vp);
2380 }
2381 FSDBG_BOT(554, vp, flags, intrflg, 0);
2382 return (0);
2383 }
2384
2385 /*
2386 * Initiate asynchronous I/O. Return an error if no nfsiods are available.
2387 * This is mainly to avoid queueing async I/O requests when the nfsiods
2388 * are all hung on a dead server.
2389 */
2390 int
2391 nfs_asyncio(bp, cred)
2392 struct nfsbuf *bp;
2393 struct ucred *cred;
2394 {
2395 struct nfsmount *nmp;
2396 int i;
2397 int gotiod;
2398 int slpflag = 0;
2399 int slptimeo = 0;
2400 int error, error2;
2401
2402 if (nfs_numasync == 0)
2403 return (EIO);
2404
2405 FSDBG_TOP(552, bp, bp ? NBOFF(bp) : 0, bp ? bp->nb_flags : 0, 0);
2406
2407 nmp = ((bp != NULL) ? VFSTONFS(bp->nb_vp->v_mount) : NULL);
2408 again:
2409 if (nmp && nmp->nm_flag & NFSMNT_INT)
2410 slpflag = PCATCH;
2411 gotiod = FALSE;
2412
2413 /* no nfsbuf means tell nfsiod to process delwri list */
2414 if (!bp)
2415 nfs_ioddelwri = 1;
2416
2417 /*
2418 * Find a free iod to process this request.
2419 */
2420 for (i = 0; i < NFS_MAXASYNCDAEMON; i++)
2421 if (nfs_iodwant[i]) {
2422 /*
2423 * Found one, so wake it up and tell it which
2424 * mount to process.
2425 */
2426 NFS_DPF(ASYNCIO,
2427 ("nfs_asyncio: waking iod %d for mount %p\n",
2428 i, nmp));
2429 nfs_iodwant[i] = (struct proc *)0;
2430 nfs_iodmount[i] = nmp;
2431 if (nmp)
2432 nmp->nm_bufqiods++;
2433 wakeup((caddr_t)&nfs_iodwant[i]);
2434 gotiod = TRUE;
2435 break;
2436 }
2437
2438 /* if we're just poking the delwri list, we're done */
2439 if (!bp)
2440 return (0);
2441
2442 /*
2443 * If none are free, we may already have an iod working on this mount
2444 * point. If so, it will process our request.
2445 */
2446 if (!gotiod) {
2447 if (nmp->nm_bufqiods > 0) {
2448 NFS_DPF(ASYNCIO,
2449 ("nfs_asyncio: %d iods are already processing mount %p\n",
2450 nmp->nm_bufqiods, nmp));
2451 gotiod = TRUE;
2452 }
2453 }
2454
2455 /*
2456 * If we have an iod which can process the request, then queue
2457 * the buffer.
2458 */
2459 FSDBG(552, bp, gotiod, i, nmp->nm_bufqiods);
2460 if (gotiod) {
2461 /*
2462 * Ensure that the queue never grows too large.
2463 */
2464 while (nmp->nm_bufqlen >= 2*nfs_numasync) {
2465 if (ISSET(bp->nb_flags, NB_IOD)) {
2466 /* An nfsiod is attempting this async operation so */
2467 /* we must not fall asleep on the bufq because we */
2468 /* could be waiting on ourself. Just return error */
2469 /* and we'll do this operation synchronously. */
2470 goto out;
2471 }
2472 FSDBG(552, bp, nmp->nm_bufqlen, 2*nfs_numasync, -1);
2473 NFS_DPF(ASYNCIO,
2474 ("nfs_asyncio: waiting for mount %p queue to drain\n", nmp));
2475 nmp->nm_bufqwant = TRUE;
2476 error = tsleep(&nmp->nm_bufq, slpflag | PRIBIO,
2477 "nfsaio", slptimeo);
2478 if (error) {
2479 error2 = nfs_sigintr(nmp, NULL, bp->nb_proc);
2480 if (error2) {
2481 FSDBG_BOT(552, bp, NBOFF(bp), bp->nb_flags, error2);
2482 return (error2);
2483 }
2484 if (slpflag == PCATCH) {
2485 slpflag = 0;
2486 slptimeo = 2 * hz;
2487 }
2488 }
2489 /*
2490 * We might have lost our iod while sleeping,
2491 * so check and loop if necessary.
2492 */
2493 if (nmp->nm_bufqiods == 0) {
2494 NFS_DPF(ASYNCIO,
2495 ("nfs_asyncio: no iods after mount %p queue was drained, looping\n", nmp));
2496 goto again;
2497 }
2498 }
2499
2500 if (ISSET(bp->nb_flags, NB_READ)) {
2501 if (bp->nb_rcred == NOCRED && cred != NOCRED) {
2502 /*
2503 * NFS has embedded ucred.
2504 * Can not crhold() here as that causes zone corruption
2505 */
2506 bp->nb_rcred = crdup(cred);
2507 }
2508 } else {
2509 SET(bp->nb_flags, NB_WRITEINPROG);
2510 if (bp->nb_wcred == NOCRED && cred != NOCRED) {
2511 /*
2512 * NFS has embedded ucred.
2513 * Can not crhold() here as that causes zone corruption
2514 */
2515 bp->nb_wcred = crdup(cred);
2516 }
2517 }
2518
2519 TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, nb_free);
2520 nmp->nm_bufqlen++;
2521 FSDBG_BOT(552, bp, NBOFF(bp), bp->nb_flags, 0);
2522 return (0);
2523 }
2524
2525 out:
2526 /*
2527 * All the iods are busy on other mounts, so return EIO to
2528 * force the caller to process the i/o synchronously.
2529 */
2530 NFS_DPF(ASYNCIO, ("nfs_asyncio: no iods available, i/o is synchronous\n"));
2531 FSDBG_BOT(552, bp, NBOFF(bp), bp->nb_flags, EIO);
2532 return (EIO);
2533 }
2534
2535 /*
2536 * Do an I/O operation to/from a cache block. This may be called
2537 * synchronously or from an nfsiod.
2538 */
2539 int
2540 nfs_doio(bp, cr, p)
2541 struct nfsbuf *bp;
2542 struct ucred *cr;
2543 struct proc *p;
2544 {
2545 register struct uio *uiop;
2546 register struct vnode *vp;
2547 struct nfsnode *np;
2548 struct nfsmount *nmp;
2549 int error = 0, diff, len, iomode, must_commit = 0;
2550 struct uio uio;
2551 struct iovec io;
2552
2553 vp = bp->nb_vp;
2554 np = VTONFS(vp);
2555 nmp = VFSTONFS(vp->v_mount);
2556 uiop = &uio;
2557 uiop->uio_iov = &io;
2558 uiop->uio_iovcnt = 1;
2559 uiop->uio_segflg = UIO_SYSSPACE;
2560 uiop->uio_procp = p;
2561
2562 /*
2563 * we've decided to perform I/O for this block,
2564 * so it can't already be NB_DONE. So, clear it.
2565 */
2566 if (ISSET(bp->nb_flags, NB_DONE)) {
2567 if (!ISSET(bp->nb_flags, NB_ASYNC))
2568 panic("nfs_doio: done and not async");
2569 CLR(bp->nb_flags, NB_DONE);
2570 }
2571 FSDBG_TOP(256, np->n_size, NBOFF(bp), bp->nb_bufsize, bp->nb_flags);
2572 FSDBG(257, bp->nb_validoff, bp->nb_validend, bp->nb_dirtyoff,
2573 bp->nb_dirtyend);
2574
2575 if (ISSET(bp->nb_flags, NB_READ)) {
2576 if (vp->v_type == VREG)
2577 NFS_BUF_MAP(bp);
2578 io.iov_len = uiop->uio_resid = bp->nb_bufsize;
2579 io.iov_base = bp->nb_data;
2580 uiop->uio_rw = UIO_READ;
2581 switch (vp->v_type) {
2582 case VREG:
2583 uiop->uio_offset = NBOFF(bp);
2584 nfsstats.read_bios++;
2585 error = nfs_readrpc(vp, uiop, cr);
2586 FSDBG(262, np->n_size, NBOFF(bp), uiop->uio_resid, error);
2587 if (!error) {
2588 /* update valid range */
2589 bp->nb_validoff = 0;
2590 if (uiop->uio_resid) {
2591 /*
2592 * If len > 0, there is a hole in the file and
2593 * no writes after the hole have been pushed to
2594 * the server yet.
2595 * Just zero fill the rest of the valid area.
2596 */
2597 diff = bp->nb_bufsize - uiop->uio_resid;
2598 len = np->n_size - (NBOFF(bp) + diff);
2599 if (len > 0) {
2600 len = min(len, uiop->uio_resid);
2601 bzero((char *)bp->nb_data + diff, len);
2602 bp->nb_validend = diff + len;
2603 FSDBG(258, diff, len, 0, 1);
2604 } else
2605 bp->nb_validend = diff;
2606 } else
2607 bp->nb_validend = bp->nb_bufsize;
2608 bp->nb_valid = (1 << (round_page_32(bp->nb_validend)/PAGE_SIZE)) - 1;
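/*
 * For illustration (example values, assuming 4 KB pages): a 32 KB buffer
 * at offset 0 of a 5000 byte file reads back 5000 bytes, so
 * nb_validend = 5000, round_page_32(5000) = 8192, and
 * nb_valid = (1 << 2) - 1 = 0x3 (pages 0 and 1 valid); the rest of the
 * buffer past nb_validend is zeroed just below.
 */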
2609 if (bp->nb_validend & PAGE_MASK) {
2610 /* valid range ends in the middle of a page so we */
2611 /* need to zero-fill any invalid data at the end */
2612 /* of the last page */
2613 bzero((caddr_t)(bp->nb_data + bp->nb_validend),
2614 bp->nb_bufsize - bp->nb_validend);
2615 FSDBG(258, bp->nb_validend,
2616 bp->nb_bufsize - bp->nb_validend, 0, 2);
2617 }
2618 }
2619 if (p && (vp->v_flag & VTEXT) &&
2620 (((nmp->nm_flag & NFSMNT_NQNFS) &&
2621 NQNFS_CKINVALID(vp, np, ND_READ) &&
2622 np->n_lrev != np->n_brev) ||
2623 (!(nmp->nm_flag & NFSMNT_NQNFS) &&
2624 np->n_mtime != np->n_vattr.va_mtime.tv_sec))) {
2625 uprintf("Process killed due to text file modification\n");
2626 psignal(p, SIGKILL);
2627 p->p_flag |= P_NOSWAP;
2628 }
2629 break;
2630 case VLNK:
2631 uiop->uio_offset = (off_t)0;
2632 nfsstats.readlink_bios++;
2633 error = nfs_readlinkrpc(vp, uiop, cr);
2634 if (!error) {
2635 bp->nb_validoff = 0;
2636 bp->nb_validend = uiop->uio_offset;
2637 }
2638 break;
2639 case VDIR:
2640 nfsstats.readdir_bios++;
2641 uiop->uio_offset = NBOFF(bp);
2642 if (!(nmp->nm_flag & NFSMNT_NFSV3))
2643 nmp->nm_flag &= ~NFSMNT_RDIRPLUS; /* dk@farm.org */
2644 if (nmp->nm_flag & NFSMNT_RDIRPLUS) {
2645 error = nfs_readdirplusrpc(vp, uiop, cr);
2646 if (error == NFSERR_NOTSUPP)
2647 nmp->nm_flag &= ~NFSMNT_RDIRPLUS;
2648 }
2649 if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0)
2650 error = nfs_readdirrpc(vp, uiop, cr);
2651 if (!error) {
2652 bp->nb_validoff = 0;
2653 bp->nb_validend = uiop->uio_offset - NBOFF(bp);
2654 bp->nb_valid = (1 << (round_page_32(bp->nb_validend)/PAGE_SIZE)) - 1;
2655 }
2656 break;
2657 default:
2658 printf("nfs_doio: type %x unexpected\n", vp->v_type);
2659 break;
2660 }
2661 if (error) {
2662 SET(bp->nb_flags, NB_ERROR);
2663 bp->nb_error = error;
2664 }
2665
2666 } else {
2667 /* we're doing a write */
2668 int doff, dend = 0;
2669
2670 /* We need to make sure the pages are locked before doing I/O. */
2671 if (!ISSET(bp->nb_flags, NB_META) && UBCISVALID(vp)) {
2672 if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
2673 error = nfs_buf_upl_setup(bp);
2674 if (error) {
2675 printf("nfs_doio: upl create failed %d\n", error);
2676 SET(bp->nb_flags, NB_ERROR);
2677 bp->nb_error = EIO;
2678 return (EIO);
2679 }
2680 nfs_buf_upl_check(bp);
2681 }
2682 }
2683
2684 if (ISSET(bp->nb_flags, NB_WASDIRTY)) {
2685 FSDBG(256, bp, NBOFF(bp), bp->nb_dirty, 0xd00dee);
2686 /*
2687 * There are pages marked dirty that need to be written out.
2688 *
2689 * We don't want to just combine the write range with the
2690 * range of pages that are dirty because that could cause us
2691 * to write out data that was never actually written by the application.
2692 * We also don't want to write data more than once.
2693 *
2694 * If the dirty range just needs to be committed, we do that.
2695 * Otherwise, we write the dirty range and clear the dirty bits
2696 * for any COMPLETE pages covered by that range.
2697 * If there are dirty pages left after that, we write out the
2698 * parts that we haven't written yet.
2699 */
2700 }
2701
2702 /*
2703 * If NB_NEEDCOMMIT is set, a commit rpc may do the trick. If not
2704 * an actual write will have to be done.
2705 * If NB_WRITEINPROG is already set, then push it with a write anyhow.
2706 */
2707 if ((bp->nb_flags & (NB_NEEDCOMMIT | NB_WRITEINPROG)) == NB_NEEDCOMMIT) {
2708 doff = NBOFF(bp) + bp->nb_dirtyoff;
2709 SET(bp->nb_flags, NB_WRITEINPROG);
2710 error = nfs_commit(vp, doff, bp->nb_dirtyend - bp->nb_dirtyoff,
2711 bp->nb_wcred, bp->nb_proc);
2712 CLR(bp->nb_flags, NB_WRITEINPROG);
2713 if (!error) {
2714 bp->nb_dirtyoff = bp->nb_dirtyend = 0;
2715 CLR(bp->nb_flags, NB_NEEDCOMMIT);
2716 np->n_needcommitcnt--;
2717 CHECK_NEEDCOMMITCNT(np);
2718 } else if (error == NFSERR_STALEWRITEVERF)
2719 nfs_clearcommit(vp->v_mount);
2720 }
2721
2722 if (!error && bp->nb_dirtyend > 0) {
2723 /* there's a dirty range that needs to be written out */
2724 u_int32_t pagemask;
2725 int firstpg, lastpg;
2726
2727 if (NBOFF(bp) + bp->nb_dirtyend > np->n_size)
2728 bp->nb_dirtyend = np->n_size - NBOFF(bp);
2729
2730 NFS_BUF_MAP(bp);
2731
2732 doff = bp->nb_dirtyoff;
2733 dend = bp->nb_dirtyend;
2734
2735 /* if doff page is dirty, move doff to start of page */
2736 if (NBPGDIRTY(bp,doff/PAGE_SIZE))
2737 doff -= doff & PAGE_MASK;
2738 /* try to expand write range to include preceding dirty pages */
2739 if (!(doff & PAGE_MASK))
2740 while (doff > 0 && NBPGDIRTY(bp,(doff-1)/PAGE_SIZE))
2741 doff -= PAGE_SIZE;
2742 /* if dend page is dirty, move dend to start of next page */
2743 if ((dend & PAGE_MASK) && NBPGDIRTY(bp,dend/PAGE_SIZE))
2744 dend = round_page_32(dend);
2745 /* try to expand write range to include trailing dirty pages */
2746 if (!(dend & PAGE_MASK))
2747 while (dend < bp->nb_bufsize && NBPGDIRTY(bp,dend/PAGE_SIZE))
2748 dend += PAGE_SIZE;
2749 /* make sure to keep dend clipped to EOF */
2750 if (NBOFF(bp) + dend > np->n_size)
2751 dend = np->n_size - NBOFF(bp);
2752 /* calculate range of complete pages being written */
2753 firstpg = round_page_32(doff) / PAGE_SIZE;
2754 lastpg = (trunc_page_32(dend) - 1)/ PAGE_SIZE;
2755 /* calculate mask for that page range */
2756 pagemask = ((1 << (lastpg+1)) - 1) & ~((1 << firstpg) - 1);
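/*
 * For illustration (example values, assuming 4 KB pages): if the expanded
 * dirty range is doff = 4096, dend = 12288, then firstpg = 1, lastpg = 2,
 * and pagemask = 0x6, i.e. exactly the pages completely covered by this
 * write.
 */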
2757
2758 /* compare page mask to nb_dirty; if there are other dirty pages */
2759 /* then write FILESYNC; otherwise, write UNSTABLE if async and */
2760 /* not needcommit/nocache/stable; otherwise write FILESYNC */
2761 if (bp->nb_dirty & ~pagemask)
2762 iomode = NFSV3WRITE_FILESYNC;
2763 else if ((bp->nb_flags & (NB_ASYNC | NB_NEEDCOMMIT | NB_NOCACHE | NB_STABLE)) == NB_ASYNC)
2764 iomode = NFSV3WRITE_UNSTABLE;
2765 else
2766 iomode = NFSV3WRITE_FILESYNC;
2767
2768 /* write the dirty range */
2769 io.iov_len = uiop->uio_resid = dend - doff;
2770 uiop->uio_offset = NBOFF(bp) + doff;
2771 io.iov_base = (char *)bp->nb_data + doff;
2772 uiop->uio_rw = UIO_WRITE;
2773
2774 nfsstats.write_bios++;
2775
2776 SET(bp->nb_flags, NB_WRITEINPROG);
2777 error = nfs_writerpc(vp, uiop, cr, &iomode, &must_commit);
2778 if (must_commit)
2779 nfs_clearcommit(vp->v_mount);
2780 /* clear dirty bits for pages we've written */
2781 if (!error)
2782 bp->nb_dirty &= ~pagemask;
2783 /* set/clear needcommit flag */
2784 if (!error && iomode == NFSV3WRITE_UNSTABLE) {
2785 if (!ISSET(bp->nb_flags, NB_NEEDCOMMIT))
2786 np->n_needcommitcnt++;
2787 SET(bp->nb_flags, NB_NEEDCOMMIT);
2788 /* make sure nb_dirtyoff/nb_dirtyend reflect actual range written */
2789 bp->nb_dirtyoff = doff;
2790 bp->nb_dirtyend = dend;
2791 } else {
2792 if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
2793 np->n_needcommitcnt--;
2794 CHECK_NEEDCOMMITCNT(np);
2795 }
2796 CLR(bp->nb_flags, NB_NEEDCOMMIT);
2797 }
2798 CLR(bp->nb_flags, NB_WRITEINPROG);
2799 /*
2800 * For an interrupted write, the buffer is still valid and the write
2801 * hasn't been pushed to the server yet, so we can't set NB_ERROR;
2802 * instead we report the interruption by setting NB_EINTR. For the NB_ASYNC case,
2803 * NB_EINTR is not relevant.
2804 *
2805 * For the case of a V3 write rpc not being committed to stable
2806 * storage, the block is still dirty and requires either a commit rpc
2807 * or another write rpc with iomode == NFSV3WRITE_FILESYNC before the
2808 * block is reused. This is indicated by setting the NB_DELWRI and
2809 * NB_NEEDCOMMIT flags.
2810 */
2811 if (error == EINTR || (!error && bp->nb_flags & NB_NEEDCOMMIT)) {
2812 CLR(bp->nb_flags, NB_INVAL | NB_NOCACHE);
2813 if (!ISSET(bp->nb_flags, NB_DELWRI)) {
2814 SET(bp->nb_flags, NB_DELWRI);
2815 nfs_nbdwrite++;
2816 NFSBUFCNTCHK();
2817 }
2818 FSDBG(261, bp->nb_validoff, bp->nb_validend,
2819 bp->nb_bufsize, 0);
2820 /*
2821 * Since for the NB_ASYNC case, nfs_bwrite() has
2822 * reassigned the buffer to the clean list, we have to
2823 * reassign it back to the dirty one. Ugh.
2824 */
2825 if (ISSET(bp->nb_flags, NB_ASYNC)) {
2826 /* move to dirty list */
2827 int s = splbio();
2828 if (bp->nb_vnbufs.le_next != NFSNOLIST)
2829 LIST_REMOVE(bp, nb_vnbufs);
2830 LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
2831 splx(s);
2832 } else {
2833 SET(bp->nb_flags, NB_EINTR);
2834 }
2835 } else {
2836 /* either there's an error or we don't need to commit */
2837 if (error) {
2838 SET(bp->nb_flags, NB_ERROR);
2839 bp->nb_error = np->n_error = error;
2840 np->n_flag |= NWRITEERR;
2841 }
2842 /* clear the dirty range */
2843 bp->nb_dirtyoff = bp->nb_dirtyend = 0;
2844 }
2845 }
2846
2847 if (!error && bp->nb_dirty) {
2848 /* there are pages marked dirty that need to be written out */
2849 int pg, cnt, npages, off, len;
2850
2851 nfsstats.write_bios++;
2852
2853 NFS_BUF_MAP(bp);
2854
2855 /*
2856 * we do these writes synchronously because we can't really
2857 * support the unstable/needcommit method. We could write
2858 * them unstable, clear the dirty bits, and then commit the
2859 * whole block later, but if we need to rewrite the data, we
2860 * won't have any idea which pages were written because that
2861 * info can't be stored in the nb_dirtyoff/nb_dirtyend. We
2862 * also can't leave the dirty bits set because then we wouldn't
2863 * be able to tell if the pages were re-dirtied between the end
2864 * of the write and the commit.
2865 */
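/*
 * For illustration (example values, assuming 4 KB pages): with
 * nb_dirty = 0x6 (pages 1 and 2 dirty), the loop below finds the run at
 * pg = 1 with cnt = 2 and issues a single FILESYNC write of 8192 bytes at
 * buffer offset 4096 (clipped to EOF if necessary), then clears those two
 * dirty bits.
 */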
2866 iomode = NFSV3WRITE_FILESYNC;
2867 uiop->uio_rw = UIO_WRITE;
2868
2869 SET(bp->nb_flags, NB_WRITEINPROG);
2870 npages = bp->nb_bufsize/PAGE_SIZE;
2871 for (pg=0; pg < npages; pg++) {
2872 if (!NBPGDIRTY(bp,pg))
2873 continue;
2874 cnt = 1;
2875 while (((pg+cnt) < npages) && NBPGDIRTY(bp,pg+cnt))
2876 cnt++;
2877 /* write cnt pages starting with page pg */
2878 off = pg * PAGE_SIZE;
2879 len = cnt * PAGE_SIZE;
2880
2881 /* clip writes to EOF */
2882 if (NBOFF(bp) + off + len > np->n_size)
2883 len -= (NBOFF(bp) + off + len) - np->n_size;
2884 if (len > 0) {
2885 io.iov_len = uiop->uio_resid = len;
2886 uiop->uio_offset = NBOFF(bp) + off;
2887 io.iov_base = (char *)bp->nb_data + off;
2888 error = nfs_writerpc(vp, uiop, cr, &iomode, &must_commit);
2889 if (must_commit)
2890 nfs_clearcommit(vp->v_mount);
2891 if (error)
2892 break;
2893 }
2894 /* clear dirty bits */
2895 while (cnt--) {
2896 bp->nb_dirty &= ~(1 << pg);
2897 /* leave pg on last page */
2898 if (cnt) pg++;
2899 }
2900 }
2901 if (!error) {
2902 if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
2903 np->n_needcommitcnt--;
2904 CHECK_NEEDCOMMITCNT(np);
2905 }
2906 CLR(bp->nb_flags, NB_NEEDCOMMIT);
2907 }
2908 CLR(bp->nb_flags, NB_WRITEINPROG);
2909 FSDBG_BOT(256, bp->nb_validoff, bp->nb_validend, bp->nb_bufsize,
2910 np->n_size);
2911 }
2912
2913 if (error) {
2914 SET(bp->nb_flags, NB_ERROR);
2915 bp->nb_error = error;
2916 }
2917 }
2918
2919 FSDBG_BOT(256, bp->nb_validoff, bp->nb_validend, bp->nb_bufsize, error);
2920
2921 nfs_buf_iodone(bp);
2922 return (error);
2923 }