/*
 * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License").  You may not use this file except in compliance with the
 * License.  Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)nfs_bio.c	8.9 (Berkeley) 3/30/95
 * FreeBSD-Id: nfs_bio.c,v 1.44 1997/09/10 19:52:25 phk Exp $
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/proc.h>		/* uio_procp, p_rlimit users below */
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/ubc.h>		/* ubc_setsize, ubc_hold, ubc_clean */
#include <sys/vmparam.h>

#include <kern/clock.h>

#include <nfs/rpcv2.h>
#include <nfs/nfsproto.h>
#include <nfs/nfs.h>
#include <nfs/nfsmount.h>
#include <nfs/nqnfs.h>
#include <nfs/nfsnode.h>

#include <sys/kdebug.h>
#define FSDBG(A, B, C, D, E) \
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_NONE, \
		     (int)(B), (int)(C), (int)(D), (int)(E), 0)
#define FSDBG_TOP(A, B, C, D, E) \
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_START, \
		     (int)(B), (int)(C), (int)(D), (int)(E), 0)
#define FSDBG_BOT(A, B, C, D, E) \
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_END, \
		     (int)(B), (int)(C), (int)(D), (int)(E), 0)
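/*
 * Usage sketch (added for illustration): nfs_doio() below brackets an
 * I/O with FSDBG_TOP(256, ...) and FSDBG_BOT(256, ...).  These expand
 * to KERNEL_DEBUG tracepoints in the DBG_FSRW class tagged
 * DBG_FUNC_START/DBG_FUNC_END, so a kdebug trace shows each buffer
 * I/O as a matched start/end pair carrying four words of context.
 */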
static struct buf *nfs_getcacheblk __P((struct vnode *vp, daddr_t bn, int size,
					struct proc *p, int operation));

extern int nfs_numasync;
extern struct nfsstats nfsstats;
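/*
 * Note (added for clarity): nfs_numasync is the number of nfsiod
 * threads currently available.  The read-ahead logic in nfs_bioread()
 * and the queueing logic in nfs_asyncio() below both test it, so
 * asynchronous I/O is attempted only when at least one iod exists.
 */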
/*
 * Vnode op for read using bio
 * Any similarity to readip() is purely coincidental
 */
int
nfs_bioread(vp, uio, ioflag, cred, getpages)
	register struct vnode *vp;
	register struct uio *uio;
	int ioflag;
	struct ucred *cred;
	int getpages;
{
	register struct nfsnode *np = VTONFS(vp);
	register int biosize, i;
	off_t diff;
	struct buf *bp = 0, *rabp;
	struct vattr vattr;
	struct proc *p;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	daddr_t lbn, rabn;
	int bufsize;
	int nra, error = 0, n = 0, on = 0, not_readin;
	int operation = (getpages ? BLK_PAGEIN : BLK_READ);

	if (uio->uio_rw != UIO_READ)
		panic("nfs_read mode");
	if (uio->uio_resid == 0)
		return (0);
	if (uio->uio_offset < 0)
		return (EINVAL);
	p = uio->uio_procp;
	if ((nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_GOTFSINFO)) == NFSMNT_NFSV3)
		(void)nfs_fsinfo(nmp, vp, cred, p);
	/* due to getblk/vm interactions, use vm page size or less values */
	biosize = min(vp->v_mount->mnt_stat.f_iosize, PAGE_SIZE);
	/*
	 * For nfs, cache consistency can only be maintained approximately.
	 * Although RFC1094 does not specify the criteria, the following is
	 * believed to be compatible with the reference port.
	 * For nqnfs, full cache consistency is maintained within the loop.
	 * For nfs:
	 * If the file's modify time on the server has changed since the
	 * last read rpc or you have written to the file,
	 * you may have lost data cache consistency with the
	 * server, so flush all of the file's data out of the cache.
	 * Then force a getattr rpc to ensure that you have up to date
	 * attributes.
	 * NB: This implies that cache data can be read when up to
	 * NFS_ATTRTIMEO seconds out of date. If you find that you need current
	 * attributes this could be forced by setting n_attrstamp to 0 before
	 * the VOP_GETATTR() call.
	 */
	if ((nmp->nm_flag & NFSMNT_NQNFS) == 0) {
		if (np->n_flag & NMODIFIED) {
			if (vp->v_type != VREG) {
				if (vp->v_type != VDIR)
					panic("nfs: bioread, not dir");
				nfs_invaldir(vp);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
			}
			np->n_attrstamp = 0;
			error = VOP_GETATTR(vp, &vattr, cred, p);
			if (error)
				return (error);
			np->n_mtime = vattr.va_mtime.tv_sec;
		} else {
			error = VOP_GETATTR(vp, &vattr, cred, p);
			if (error)
				return (error);
			if (np->n_mtime != vattr.va_mtime.tv_sec) {
				if (vp->v_type == VDIR)
					nfs_invaldir(vp);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
				np->n_mtime = vattr.va_mtime.tv_sec;
			}
		}
	}
	/*
	 * Get a valid lease. If cached data is stale, flush it.
	 */
	if (nmp->nm_flag & NFSMNT_NQNFS) {
		if (NQNFS_CKINVALID(vp, np, ND_READ)) {
			do {
				error = nqnfs_getlease(vp, ND_READ, cred, p);
			} while (error == NQNFS_EXPIRED);
			if (error)
				return (error);
			if (np->n_lrev != np->n_brev ||
			    (np->n_flag & NQNFSNONCACHE) ||
			    ((np->n_flag & NMODIFIED) && vp->v_type == VDIR)) {
				if (vp->v_type == VDIR)
					nfs_invaldir(vp);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
				np->n_brev = np->n_lrev;
			}
		} else if (vp->v_type == VDIR && (np->n_flag & NMODIFIED)) {
			nfs_invaldir(vp);
			error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
			if (error)
				return (error);
		}
	}
	do {
	    if (np->n_flag & NQNFSNONCACHE) {
		switch (vp->v_type) {
		case VREG:
			return (nfs_readrpc(vp, uio, cred));
		case VLNK:
			return (nfs_readlinkrpc(vp, uio, cred));
		case VDIR:
			break;
		default:
			printf(" NQNFSNONCACHE: type %x unexpected\n",
			       vp->v_type);
		};
	    }
	    switch (vp->v_type) {
	    case VREG:
		nfsstats.biocache_reads++;
		lbn = uio->uio_offset / biosize;
		on = uio->uio_offset & (biosize - 1);
		not_readin = 1;
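		/*
		 * Illustrative arithmetic (not from the original source):
		 * with biosize = 4096 and uio_offset = 6000, lbn = 1 and
		 * on = 6000 & 4095 = 1904, i.e. the transfer begins 1904
		 * bytes into logical block 1.  Since biosize is a power of
		 * two, the mask is equivalent to uio_offset % biosize.
		 */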
		/*
		 * Start the read ahead(s), as required.
		 */
		if (nfs_numasync > 0 && nmp->nm_readahead > 0) {
			for (nra = 0; nra < nmp->nm_readahead &&
			     (off_t)(lbn + 1 + nra) * biosize < np->n_size;
			     nra++) {
				rabn = lbn + 1 + nra;
				if (!incore(vp, rabn)) {
					rabp = nfs_getcacheblk(vp, rabn, biosize, p,
							       operation);
					if (!rabp)
						return (EINTR);
					if (!ISSET(rabp->b_flags, (B_CACHE|B_DELWRI))) {
						SET(rabp->b_flags, (B_READ | B_ASYNC));
						if (nfs_asyncio(rabp, cred)) {
							SET(rabp->b_flags, (B_INVAL|B_ERROR));
							rabp->b_error = EIO;
							brelse(rabp);
						}
					} else
						brelse(rabp);
				}
			}
		}
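		/*
		 * Note (added for clarity): read-ahead is opportunistic.
		 * Blocks already incore are skipped, and if nfs_asyncio()
		 * cannot hand the buffer to an nfsiod the buffer is simply
		 * invalidated; the data will be fetched synchronously when
		 * the reader actually reaches that block.
		 */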
		/*
		 * If the block is in the cache and has the required data
		 * in a valid region, just copy it out.
		 * Otherwise, get the block and write back/read in,
		 * as required.
		 */
again:
		bufsize = biosize;
		if ((off_t)(lbn + 1) * biosize > np->n_size &&
		    (off_t)(lbn + 1) * biosize - np->n_size < biosize) {
			bufsize = np->n_size - (off_t)lbn * biosize;
			bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
		}
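		/*
		 * Illustrative arithmetic (not from the original source):
		 * with np->n_size = 9000, biosize = 4096 and lbn = 2, the
		 * EOF block needs 9000 - 8192 = 808 bytes; (808 + 511) & ~511
		 * rounds that up to 1024, the smallest multiple of
		 * DEV_BSIZE (512) covering the tail of the file.
		 */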
		bp = nfs_getcacheblk(vp, lbn, bufsize, p, operation);
		if (!bp)
			return (EINTR);

		if (!ISSET(bp->b_flags, B_CACHE)) {
			SET(bp->b_flags, B_READ);
			CLR(bp->b_flags, (B_DONE | B_ERROR | B_INVAL));
			not_readin = 0;
			error = nfs_doio(bp, cred, p);
			if (error) {
				brelse(bp);
				return (error);
			}
		}
		n = min((unsigned)(bufsize - on), uio->uio_resid);
		diff = np->n_size - uio->uio_offset;
		if (diff < n)
			n = diff;
		if (not_readin && n > 0) {
			if (on < bp->b_validoff || (on + n) > bp->b_validend) {
				SET(bp->b_flags, (B_NOCACHE|B_INVAFTERWRITE));
				if (bp->b_dirtyend > 0) {
					if (!ISSET(bp->b_flags, B_DELWRI))
						panic("nfsbioread");
					if (VOP_BWRITE(bp) == EINTR)
						return (EINTR);
				} else
					brelse(bp);
				goto again;
			}
		}
		diff = (on >= bp->b_validend) ? 0 : (bp->b_validend - on);
		if (diff < n)
			n = diff;
		break;
	    case VLNK:
		nfsstats.biocache_readlinks++;
		bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, p, operation);
		if (!bp)
			return (EINTR);
		if (!ISSET(bp->b_flags, B_CACHE)) {
			SET(bp->b_flags, B_READ);
			error = nfs_doio(bp, cred, p);
			if (error) {
				SET(bp->b_flags, B_ERROR);
				brelse(bp);
				return (error);
			}
		}
		n = min(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid);
		on = 0;
		break;
	    case VDIR:
		nfsstats.biocache_readdirs++;
		if (np->n_direofoffset
		    && uio->uio_offset >= np->n_direofoffset) {
			return (0);
		}
		lbn = uio->uio_offset / NFS_DIRBLKSIZ;
		on = uio->uio_offset & (NFS_DIRBLKSIZ - 1);
		bp = nfs_getcacheblk(vp, lbn, NFS_DIRBLKSIZ, p, operation);
		if (!bp)
			return (EINTR);
		if (!ISSET(bp->b_flags, B_CACHE)) {
			SET(bp->b_flags, B_READ);
			error = nfs_doio(bp, cred, p);
			if (error)
				brelse(bp);
			while (error == NFSERR_BAD_COOKIE) {
				nfs_invaldir(vp);
				error = nfs_vinvalbuf(vp, 0, cred, p, 1);
				/*
				 * Yuck! The directory has been modified on the
				 * server. The only way to get the block is by
				 * reading from the beginning to get all the
				 * offset cookies.
				 */
				for (i = 0; i <= lbn && !error; i++) {
					if (np->n_direofoffset
					    && (i * NFS_DIRBLKSIZ) >= np->n_direofoffset)
						return (0);
					bp = nfs_getcacheblk(vp, i, NFS_DIRBLKSIZ, p,
							     operation);
					if (!bp)
						return (EINTR);
					if (!ISSET(bp->b_flags, B_CACHE)) {
						SET(bp->b_flags, B_READ);
						error = nfs_doio(bp, cred, p);
						/*
						 * no error + B_INVAL == directory EOF,
						 * use the block.
						 */
						if (error == 0 && (bp->b_flags & B_INVAL))
							break;
					}
					/*
					 * An error will throw away the block and the
					 * for loop will break out.  If no error and this
					 * is not the block we want, we throw away the
					 * block and go for the next one via the for loop.
					 */
					if (error || i < lbn)
						brelse(bp);
				}
			}
			/*
			 * The above while is repeated if we hit another cookie
			 * error.  If we hit an error and it wasn't a cookie error,
			 * we give up.
			 */
			if (error)
				return (error);
		}
		/*
		 * If not eof and read aheads are enabled, start one.
		 * (You need the current block first, so that you have the
		 *  directory offset cookie of the next block.)
		 */
		if (nfs_numasync > 0 && nmp->nm_readahead > 0 &&
		    (np->n_direofoffset == 0 ||
		     (lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) &&
		    !(np->n_flag & NQNFSNONCACHE) &&
		    !incore(vp, lbn + 1)) {
			rabp = nfs_getcacheblk(vp, lbn + 1, NFS_DIRBLKSIZ, p,
					       operation);
			if (rabp) {
				if (!ISSET(rabp->b_flags, (B_CACHE|B_DELWRI))) {
					SET(rabp->b_flags, (B_READ | B_ASYNC));
					if (nfs_asyncio(rabp, cred)) {
						SET(rabp->b_flags, (B_INVAL|B_ERROR));
						rabp->b_error = EIO;
						brelse(rabp);
					}
				} else
					brelse(rabp);
			}
		}
		/*
		 * Make sure we use a signed variant of min() since
		 * the second term may be negative.
		 */
		n = lmin(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid - on);
		/*
		 * Unlike VREG files, whose buffer size ( bp->b_bcount ) is
		 * chopped for the EOF condition, we cannot tell how large
		 * NFS directories are going to be until we hit EOF.  So
		 * an NFS directory buffer is *not* chopped to its EOF.  Now,
		 * it just so happens that b_resid will effectively chop it
		 * to EOF.  *BUT* this information is lost if the buffer goes
		 * away and is reconstituted into a B_CACHE state (recovered
		 * from VM) later.  So we keep track of the directory eof
		 * in np->n_direofoffset and chop it off as an extra step
		 * right here.
		 */
		if (np->n_direofoffset &&
		    n > np->n_direofoffset - uio->uio_offset)
			n = np->n_direofoffset - uio->uio_offset;
		break;
	    default:
		printf(" nfs_bioread: type %x unexpected\n", vp->v_type);
		break;
	    };

	    if (n > 0) {
		error = uiomove(bp->b_data + on, (int)n, uio);
	    }
	    switch (vp->v_type) {
	    case VREG:
		break;
	    case VLNK:
		n = 0;
		break;
	    case VDIR:
		if (np->n_flag & NQNFSNONCACHE)
			SET(bp->b_flags, B_INVAL);
		break;
	    default:
		printf(" nfs_bioread: type %x unexpected\n", vp->v_type);
	    }
	    brelse(bp);
	} while (error == 0 && uio->uio_resid > 0 && n > 0);
	return (error);
}
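/*
 * Caller sketch (illustrative, not part of this file): the VREG read
 * vnode op is expected to be a thin wrapper, roughly
 *	nfs_read(ap) => nfs_bioread(ap->a_vp, ap->a_uio, ap->a_ioflag,
 *				    ap->a_cred, 0);
 * with a nonzero getpages only on the pagein path, which selects
 * BLK_PAGEIN rather than BLK_READ when grabbing cache blocks above.
 */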
/*
 * Vnode op for write using bio
 */
int
nfs_write(ap)
	struct vop_write_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		struct ucred *a_cred;
	} */ *ap;
{
	register int biosize;
	register struct uio *uio = ap->a_uio;
	struct proc *p = uio->uio_procp;
	register struct vnode *vp = ap->a_vp;
	struct nfsnode *np = VTONFS(vp);
	register struct ucred *cred = ap->a_cred;
	int ioflag = ap->a_ioflag;
	struct buf *bp;
	struct vattr vattr;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	daddr_t lbn;
	int bufsize;
	int n, on, error = 0, iomode, must_commit;
	off_t boff;
	struct iovec iov;
	struct uio auio;
	if (uio->uio_rw != UIO_WRITE)
		panic("nfs_write mode");
	if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != current_proc())
		panic("nfs_write proc");
	if (vp->v_type != VREG)
		return (EIO);
	if (np->n_flag & NWRITEERR) {
		np->n_flag &= ~NWRITEERR;
		return (np->n_error);
	}
	if ((nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_GOTFSINFO)) == NFSMNT_NFSV3)
		(void)nfs_fsinfo(nmp, vp, cred, p);
	if (ioflag & (IO_APPEND | IO_SYNC)) {
		if (np->n_flag & NMODIFIED) {
			np->n_attrstamp = 0;
			error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
			if (error)
				return (error);
		}
		if (ioflag & IO_APPEND) {
			np->n_attrstamp = 0;
			error = VOP_GETATTR(vp, &vattr, cred, p);
			if (error)
				return (error);
			uio->uio_offset = np->n_size;
		}
	}
	if (uio->uio_offset < 0)
		return (EINVAL);
	if (uio->uio_resid == 0)
		return (0);
	/*
	 * Maybe this should be above the vnode op call, but so long as
	 * file servers have no limits, i don't think it matters
	 */
	if (p && uio->uio_offset + uio->uio_resid >
	    p->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
		psignal(p, SIGXFSZ);
		return (EFBIG);
	}
	/*
	 * I use nm_rsize, not nm_wsize so that all buffer cache blocks
	 * will be the same size within a filesystem. nfs_writerpc will
	 * still use nm_wsize when sizing the rpc's.
	 */
	/* due to getblk/vm interactions, use vm page size or less values */
	biosize = min(vp->v_mount->mnt_stat.f_iosize, PAGE_SIZE);
	do {
		/*
		 * Check for a valid write lease.
		 */
		if ((nmp->nm_flag & NFSMNT_NQNFS) &&
		    NQNFS_CKINVALID(vp, np, ND_WRITE)) {
			do {
				error = nqnfs_getlease(vp, ND_WRITE, cred, p);
			} while (error == NQNFS_EXPIRED);
			if (error)
				return (error);
			if (np->n_lrev != np->n_brev ||
			    (np->n_flag & NQNFSNONCACHE)) {
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
				np->n_brev = np->n_lrev;
			}
		}
		if ((np->n_flag & NQNFSNONCACHE) && uio->uio_iovcnt == 1) {
			iomode = NFSV3WRITE_FILESYNC;
			error = nfs_writerpc(vp, uio, cred, &iomode, &must_commit);
			if (must_commit)
				nfs_clearcommit(vp->v_mount);
			return (error);
		}
		nfsstats.biocache_writes++;
		lbn = uio->uio_offset / biosize;
		on = uio->uio_offset & (biosize - 1);
		n = min((unsigned)(biosize - on), uio->uio_resid);
again:
		/* (removed for UBC) */
		bufsize = biosize;
		if ((lbn + 1) * biosize > np->n_size) {
			bufsize = np->n_size - lbn * biosize;
			bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
		}
		/*
		 * Get a cache block for writing.  The range to be written is
		 * (off..off+len) within the block.  We ensure that the block
		 * either has no dirty region or that the given range is
		 * contiguous with the existing dirty region.
		 */
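		/*
		 * Illustrative example (not from the original source): with
		 * on = 512 and n = 1024 the write covers bytes [512,1536) of
		 * the block.  If the existing dirty region is [0,512) the
		 * result is one contiguous span [0,1536); if it were
		 * disjoint, the code below first reads or flushes so that a
		 * single (b_dirtyoff, b_dirtyend) pair can describe it.
		 */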
		bp = nfs_getcacheblk(vp, lbn, bufsize, p, BLK_WRITE);
		if (!bp)
			return (EINTR);

		/*
		 * Resize nfsnode *after* we busy the buffer to prevent
		 * readers from reading garbage.
		 * If there was a partial buf at the old eof, validate
		 * and zero the new bytes.
		 */
		if (uio->uio_offset + n > np->n_size) {
			struct buf *bp0 = NULL;
			daddr_t bn = np->n_size / biosize;
			int off = np->n_size & (biosize - 1);

			if (off && bn < lbn && incore(vp, bn))
				bp0 = nfs_getcacheblk(vp, bn, biosize, p,
						      BLK_WRITE);
			np->n_flag |= NMODIFIED;
			np->n_size = uio->uio_offset + n;
			ubc_setsize(vp, (off_t)np->n_size); /* XXX errors */
			if (bp0) {
				bzero((char *)bp0->b_data + off, biosize - off);
				bp0->b_validend = biosize;
				brelse(bp0);
			}
		}
		/*
		 * NFS has embedded ucred so crhold() risks zone corruption
		 */
		if (bp->b_wcred == NOCRED)
			bp->b_wcred = crdup(cred);
		/*
		 * If dirtyend exceeds file size, chop it down.  This should
		 * not occur unless there is a race.
		 */
		if ((off_t)bp->b_blkno * DEV_BSIZE + bp->b_dirtyend >
		    np->n_size)
			bp->b_dirtyend = np->n_size - (off_t)bp->b_blkno *
			    DEV_BSIZE;
		/*
		 * UBC doesn't (yet) handle partial pages so nfs_biowrite was
		 * hacked to never bdwrite, to start every little write right
		 * away.  Running IE Avie noticed the performance problem, thus
		 * this code, which permits those delayed writes by ensuring an
		 * initial read of the entire page.  The read may hit eof
		 * ("short read") but that we will handle.
		 *
		 * We are quite dependent on the correctness of B_CACHE so check
		 * that first in case of problems.
		 */
		if (!ISSET(bp->b_flags, B_CACHE) && n < PAGE_SIZE) {
			boff = (off_t)bp->b_blkno * DEV_BSIZE;
			auio.uio_iov = &iov;
			auio.uio_iovcnt = 1;
			auio.uio_offset = boff;
			auio.uio_resid = PAGE_SIZE;
			auio.uio_segflg = UIO_SYSSPACE;
			auio.uio_rw = UIO_READ;
			auio.uio_procp = p;
			iov.iov_base = bp->b_data;
			iov.iov_len = PAGE_SIZE;
			error = nfs_readrpc(vp, &auio, cred);
			if (error) {
				bp->b_error = error;
				SET(bp->b_flags, B_ERROR);
				printf("nfs_write: readrpc %d", error);
			}
			if (auio.uio_resid > 0)
				bzero(iov.iov_base, auio.uio_resid);
			bp->b_validoff = 0;
			bp->b_validend = PAGE_SIZE - auio.uio_resid;
			if (np->n_size > boff + bp->b_validend)
				bp->b_validend = min(np->n_size - boff,
						     PAGE_SIZE);
		}
		/*
		 * If the new write will leave a contiguous dirty
		 * area, just update the b_dirtyoff and b_dirtyend,
		 * otherwise try to extend the dirty region.
		 */
		if (bp->b_dirtyend > 0 &&
		    (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) {
			off_t start, end;

			boff = (off_t)bp->b_blkno * DEV_BSIZE;
			if (on > bp->b_dirtyend) {
				start = boff + bp->b_validend;
				end = boff + on;
			} else {
				start = boff + on + n;
				end = boff + bp->b_validoff;
			}

			/*
			 * It may be that the valid region in the buffer
			 * covers the region we want, in which case just
			 * extend the dirty region.  Otherwise we try to
			 * extend the valid region.
			 */
			if (end > start) {
				auio.uio_iov = &iov;
				auio.uio_iovcnt = 1;
				auio.uio_offset = start;
				auio.uio_resid = end - start;
				auio.uio_segflg = UIO_SYSSPACE;
				auio.uio_rw = UIO_READ;
				auio.uio_procp = p;
				iov.iov_base = bp->b_data + (start - boff);
				iov.iov_len = end - start;
				error = nfs_readrpc(vp, &auio, cred);
				/*
				 * If we couldn't read, do not do a VOP_BWRITE
				 * as originally coded. That could also error
				 * and looping back to "again" as it was doing
				 * could have us stuck trying to write the same
				 * buf again. nfs_write will get the entire
				 * region if nfs_readrpc succeeds. If it is
				 * unsuccessful we should just error out.
				 * Errors like ESTALE would keep us looping
				 * rather than transient errors justifying a
				 * retry. We can return here instead of
				 * altering the dirty region later; we have not
				 * written the old dirty region at this point.
				 */
				if (error) {
					bp->b_error = error;
					SET(bp->b_flags, B_ERROR);
					printf("nfs_write: readrpc2 %d", error);
					brelse(bp);
					return (error);
				}
				/*
				 * If there was a short read, just zero fill.
				 */
				if (auio.uio_resid > 0)
					bzero(iov.iov_base, auio.uio_resid);
				if (on > bp->b_dirtyend)
					bp->b_validend = on;
				else
					bp->b_validoff = on + n;
			}
			/*
			 * We now have a valid region which extends up to the
			 * dirty region which we want.
			 */
			if (on > bp->b_dirtyend)
				bp->b_dirtyend = on;
			else
				bp->b_dirtyoff = on + n;
		}
		if (ISSET(bp->b_flags, B_ERROR)) {
			error = bp->b_error;
			brelse(bp);
			return (error);
		}
		/*
		 * NFS has embedded ucred so crhold() risks zone corruption
		 */
		if (bp->b_wcred == NOCRED)
			bp->b_wcred = crdup(cred);
		np->n_flag |= NMODIFIED;
		/*
		 * Check for valid write lease and get one as required.
		 * In case getblk() and/or bwrite() delayed us.
		 */
		if ((nmp->nm_flag & NFSMNT_NQNFS) &&
		    NQNFS_CKINVALID(vp, np, ND_WRITE)) {
			do {
				error = nqnfs_getlease(vp, ND_WRITE, cred, p);
			} while (error == NQNFS_EXPIRED);
			if (error) {
				brelse(bp);
				return (error);
			}
			if (np->n_lrev != np->n_brev ||
			    (np->n_flag & NQNFSNONCACHE)) {
				brelse(bp);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
				np->n_brev = np->n_lrev;
				goto again;
			}
		}
		error = uiomove((char *)bp->b_data + on, n, uio);
		if (error) {
			SET(bp->b_flags, B_ERROR);
			brelse(bp);
			return (error);
		}
		if (bp->b_dirtyend > 0) {
			bp->b_dirtyoff = min(on, bp->b_dirtyoff);
			bp->b_dirtyend = max((on + n), bp->b_dirtyend);
		} else {
			bp->b_dirtyoff = on;
			bp->b_dirtyend = on + n;
		}
		if (bp->b_validend == 0 || bp->b_validend < bp->b_dirtyoff ||
		    bp->b_validoff > bp->b_dirtyend) {
			bp->b_validoff = bp->b_dirtyoff;
			bp->b_validend = bp->b_dirtyend;
		} else {
			bp->b_validoff = min(bp->b_validoff, bp->b_dirtyoff);
			bp->b_validend = max(bp->b_validend, bp->b_dirtyend);
		}
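		/*
		 * Worked example (illustrative): valid [0,2048) and dirty
		 * [1024,3072) overlap, so the else branch above extends the
		 * valid region to [0,3072).  Had valid been [0,512) and thus
		 * disjoint from the dirty span, the first branch snaps the
		 * valid region to exactly the dirty one.
		 */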
		/*
		 * Since this block is being modified, it must be written
		 * again and not just committed.
		 */
		CLR(bp->b_flags, B_NEEDCOMMIT);

		/*
		 * If the lease is non-cachable or IO_SYNC do bwrite().
		 */
		if ((np->n_flag & NQNFSNONCACHE) || (ioflag & IO_SYNC)) {
			bp->b_proc = p;
			error = VOP_BWRITE(bp);
			if (error)
				return (error);
			if (np->n_flag & NQNFSNONCACHE) {
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
			}
		} else if ((n + on) == biosize &&
			   (nmp->nm_flag & NFSMNT_NQNFS) == 0) {
			bp->b_proc = (struct proc *)0;
			SET(bp->b_flags, B_ASYNC);
			(void)nfs_writebp(bp, 0);
		} else
			bdwrite(bp);
	} while (uio->uio_resid > 0 && n > 0);
	return (0);
}
/*
 * Get an nfs cache block.
 * Allocate a new one if the block isn't currently in the cache
 * and return the block marked busy. If the calling process is
 * interrupted by a signal for an interruptible mount point, return
 * NULL.
 */
static struct buf *
nfs_getcacheblk(vp, bn, size, p, operation)
	struct vnode *vp;
	daddr_t bn;
	int size;
	struct proc *p;
	int operation;	/* defined in sys/buf.h */
{
	register struct buf *bp;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	/* due to getblk/vm interactions, use vm page size or less values */
	int biosize = min(vp->v_mount->mnt_stat.f_iosize, PAGE_SIZE);
	if (nbdwrite > ((nbuf/4)*3) && operation == BLK_WRITE) {
#define __BUFFERS_RECLAIMED 2
		struct buf *tbp[__BUFFERS_RECLAIMED];
		int i;

		/* too many delayed writes, try to free up some buffers */
		for (i = 0; i < __BUFFERS_RECLAIMED; i++)
			tbp[i] = geteblk(512);

		/* Yield to IO thread */
		(void)tsleep((caddr_t)&nbdwrite, PCATCH, "nbdwrite", 1);

		for (i = (__BUFFERS_RECLAIMED - 1); i >= 0; i--)
			brelse(tbp[i]);
	}
	if (nmp->nm_flag & NFSMNT_INT) {
		bp = getblk(vp, bn, size, PCATCH, 0, operation);
		while (bp == (struct buf *)0) {
			if (nfs_sigintr(nmp, (struct nfsreq *)0, p))
				return ((struct buf *)0);
			bp = getblk(vp, bn, size, 0, 2 * hz, operation);
		}
	} else
		bp = getblk(vp, bn, size, 0, 0, operation);

	if (vp->v_type == VREG)
		bp->b_blkno = ((off_t)bn * biosize) / DEV_BSIZE;

	return (bp);
}
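/*
 * Mapping sketch (added for illustration): with biosize = 4096 and
 * DEV_BSIZE = 512, logical block bn = 3 yields b_blkno = 3 * 4096 / 512
 * = 24.  Logical NFS blocks are thus expressed in 512-byte device
 * units, which is why code elsewhere in this file recovers the file
 * byte offset of a buffer with (off_t)bp->b_blkno * DEV_BSIZE.
 */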
/*
 * Flush and invalidate all dirty buffers. If another process is already
 * doing the flush, just wait for completion.
 */
int
nfs_vinvalbuf(vp, flags, cred, p, intrflg)
	struct vnode *vp;
	int flags;
	struct ucred *cred;
	struct proc *p;
	int intrflg;
{
	register struct nfsnode *np = VTONFS(vp);
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	int error = 0, slpflag, slptimeo;
	int didhold = 0;

	if ((nmp->nm_flag & NFSMNT_INT) == 0)
		intrflg = 0;
	if (intrflg) {
		slpflag = PCATCH;
		slptimeo = 2 * hz;
	} else {
		slpflag = 0;
		slptimeo = 0;
	}
	/*
	 * First wait for any other process doing a flush to complete.
	 */
	while (np->n_flag & NFLUSHINPROG) {
		np->n_flag |= NFLUSHWANT;
		error = tsleep((caddr_t)&np->n_flag, PRIBIO + 2, "nfsvinval",
			       slptimeo);
		if (error && intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p))
			return (EINTR);
	}
	/*
	 * Now, flush as required.
	 */
	np->n_flag |= NFLUSHINPROG;
	error = vinvalbuf(vp, flags, cred, p, slpflag, 0);
	while (error) {
		/*
		 * We seem to be stuck in a loop here if the thread got
		 * aborted: nfs_flush will return EINTR.  Not sure if that
		 * will cause other consequences due to EINTR having other
		 * meanings in NFS.  With no dirty pages it seems safe to
		 * just return from here.  But if we did have dirty pages,
		 * how would we get them written out if the thread was
		 * aborted?  Some other strategy is necessary.
		 */
		if ((intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p)) ||
		    (error == EINTR && current_thread_aborted())) {
			np->n_flag &= ~NFLUSHINPROG;
			if (np->n_flag & NFLUSHWANT) {
				np->n_flag &= ~NFLUSHWANT;
				wakeup((caddr_t)&np->n_flag);
			}
			return (EINTR);
		}
		error = vinvalbuf(vp, flags, cred, p, 0, slptimeo);
	}
	np->n_flag &= ~(NMODIFIED | NFLUSHINPROG);
	if (np->n_flag & NFLUSHWANT) {
		np->n_flag &= ~NFLUSHWANT;
		wakeup((caddr_t)&np->n_flag);
	}
	didhold = ubc_hold(vp);
	if (didhold) {
		(void) ubc_clean(vp, 1); /* get the pages out of vm also */
		ubc_rele(vp);
	}
	return (0);
}
/*
 * Initiate asynchronous I/O. Return an error if no nfsiods are available.
 * This is mainly to avoid queueing async I/O requests when the nfsiods
 * are all hung on a dead server.
 */
int
nfs_asyncio(bp, cred)
	register struct buf *bp;
	struct ucred *cred;
{
	struct nfsmount *nmp;
	int i;
	int gotiod;
	int slpflag = 0;
	int slptimeo = 0;
	int error;

	if (nfs_numasync == 0)
		return (EIO);
	nmp = VFSTONFS(bp->b_vp->v_mount);
again:
	if (nmp->nm_flag & NFSMNT_INT)
		slpflag = PCATCH;
	gotiod = FALSE;

	/*
	 * Find a free iod to process this request.
	 */
	for (i = 0; i < NFS_MAXASYNCDAEMON; i++)
		if (nfs_iodwant[i]) {
			/*
			 * Found one, so wake it up and tell it which
			 * mount to process.
			 */
			NFS_DPF(ASYNCIO,
				("nfs_asyncio: waking iod %d for mount %p\n",
				 i, nmp));
			nfs_iodwant[i] = (struct proc *)0;
			nfs_iodmount[i] = nmp;
			nmp->nm_bufqiods++;
			wakeup((caddr_t)&nfs_iodwant[i]);
			gotiod = TRUE;
			break;
		}
	/*
	 * If none are free, we may already have an iod working on this mount
	 * point.  If so, it will process our request.
	 */
	if (!gotiod) {
		if (nmp->nm_bufqiods > 0) {
			NFS_DPF(ASYNCIO,
				("nfs_asyncio: %d iods are already processing mount %p\n",
				 nmp->nm_bufqiods, nmp));
			gotiod = TRUE;
		}
	}
	/*
	 * If we have an iod which can process the request, then queue
	 * the buffer.
	 */
	if (gotiod) {
		/*
		 * Ensure that the queue never grows too large.
		 */
		while (nmp->nm_bufqlen >= 2*nfs_numasync) {
			NFS_DPF(ASYNCIO,
				("nfs_asyncio: waiting for mount %p queue to drain\n", nmp));
			nmp->nm_bufqwant = TRUE;
			error = tsleep(&nmp->nm_bufq, slpflag | PRIBIO,
				       "nfsaio", slptimeo);
			if (error) {
				if (nfs_sigintr(nmp, NULL, bp->b_proc))
					return (EINTR);
				if (slpflag == PCATCH) {
					slpflag = 0;
					slptimeo = 2 * hz;
				}
			}
			/*
			 * We might have lost our iod while sleeping,
			 * so check and loop if necessary.
			 */
			if (nmp->nm_bufqiods == 0) {
				NFS_DPF(ASYNCIO,
					("nfs_asyncio: no iods after mount %p queue was drained, looping\n", nmp));
				goto again;
			}
		}
		if (ISSET(bp->b_flags, B_READ)) {
			if (bp->b_rcred == NOCRED && cred != NOCRED) {
				/*
				 * NFS has embedded ucred.
				 * Can not crhold() here as that causes zone corruption
				 */
				bp->b_rcred = crdup(cred);
			}
		} else {
			SET(bp->b_flags, B_WRITEINPROG);
			if (bp->b_wcred == NOCRED && cred != NOCRED) {
				/*
				 * NFS has embedded ucred.
				 * Can not crhold() here as that causes zone corruption
				 */
				bp->b_wcred = crdup(cred);
			}
		}

		TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, b_freelist);
		nmp->nm_bufqlen++;
		return (0);
	}
	/*
	 * All the iods are busy on other mounts, so return EIO to
	 * force the caller to process the i/o synchronously.
	 */
	NFS_DPF(ASYNCIO, ("nfs_asyncio: no iods available, i/o is synchronous\n"));
	return (EIO);
}
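/*
 * Usage note (added for clarity): callers treat a nonzero return as
 * "do it yourself". The read-ahead paths above just invalidate the
 * buffer, while write paths issue the RPC synchronously, so running
 * out of iods costs performance, not correctness.
 */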
/*
 * Do an I/O operation to/from a cache block. This may be called
 * synchronously or from an nfsiod.
 */
int
nfs_doio(bp, cr, p)
	register struct buf *bp;
	struct ucred *cr;
	struct proc *p;
{
	register struct uio *uiop;
	register struct vnode *vp;
	struct nfsnode *np;
	struct nfsmount *nmp;
	int error = 0, diff, len, iomode, must_commit = 0;
	struct uio uio;
	struct iovec io;

	vp = bp->b_vp;
	np = VTONFS(vp);
	nmp = VFSTONFS(vp->v_mount);
	uiop = &uio;
	uiop->uio_iov = &io;
	uiop->uio_iovcnt = 1;
	uiop->uio_segflg = UIO_SYSSPACE;
	uiop->uio_procp = p;
	/*
	 * With UBC, getblk() can return a buf with B_DONE set.
	 * This indicates that the VM has valid data for that page.
	 * NFS being stateless, this case poses a problem.
	 * By definition, the NFS server should always be consulted
	 * for the data in that page.
	 * So we choose to clear the B_DONE and to do the IO.
	 *
	 * XXX revisit this if there is a performance issue.
	 * XXX In that case, we could play the attribute cache games ...
	 */
	if (ISSET(bp->b_flags, B_DONE)) {
		if (!ISSET(bp->b_flags, B_ASYNC))
			panic("nfs_doio: done and not async");
		CLR(bp->b_flags, B_DONE);
	}
	FSDBG_TOP(256, np->n_size, bp->b_blkno * DEV_BSIZE, bp->b_bcount,
		  bp->b_flags);
	FSDBG(257, bp->b_validoff, bp->b_validend, bp->b_dirtyoff,
	      bp->b_dirtyend);
	/*
	 * Historically, paging was done with physio, but no more.
	 */
	if (ISSET(bp->b_flags, B_PHYS)) {
		/*
		 * ...though reading /dev/drum still gets us here.
		 */
		io.iov_len = uiop->uio_resid = bp->b_bcount;
		/* mapping was done by vmapbuf() */
		io.iov_base = bp->b_data;
		uiop->uio_offset = (off_t)bp->b_blkno * DEV_BSIZE;
		if (ISSET(bp->b_flags, B_READ)) {
			uiop->uio_rw = UIO_READ;
			nfsstats.read_physios++;
			error = nfs_readrpc(vp, uiop, cr);
		} else {
			int com;

			iomode = NFSV3WRITE_DATASYNC;
			uiop->uio_rw = UIO_WRITE;
			nfsstats.write_physios++;
			error = nfs_writerpc(vp, uiop, cr, &iomode, &com);
		}
		if (error) {
			SET(bp->b_flags, B_ERROR);
			bp->b_error = error;
		}
	} else if (ISSET(bp->b_flags, B_READ)) {
		io.iov_len = uiop->uio_resid = bp->b_bcount;
		io.iov_base = bp->b_data;
		uiop->uio_rw = UIO_READ;
		switch (vp->v_type) {
		case VREG:
			uiop->uio_offset = (off_t)bp->b_blkno * DEV_BSIZE;
			nfsstats.read_bios++;
			error = nfs_readrpc(vp, uiop, cr);
			FSDBG(262, np->n_size, bp->b_blkno * DEV_BSIZE,
			      uiop->uio_resid, error);
			if (!error) {
				bp->b_validoff = 0;
				if (uiop->uio_resid) {
					/*
					 * If len > 0, there is a hole in the file and
					 * no writes after the hole have been pushed to
					 * the server yet.
					 * Just zero fill the rest of the valid area.
					 */
					diff = bp->b_bcount - uiop->uio_resid;
					len = np->n_size - ((u_quad_t)bp->b_blkno * DEV_BSIZE +
							    diff);
					if (len > 0) {
						len = min(len, uiop->uio_resid);
						bzero((char *)bp->b_data + diff, len);
						bp->b_validend = diff + len;
						FSDBG(258, diff, len, 0, 1);
					} else
						bp->b_validend = diff;
				} else
					bp->b_validend = bp->b_bcount;

				if (bp->b_validend < bp->b_bufsize) {
					/*
					 * we're about to release a partial buffer after a
					 * read... the only way we should get here is if
					 * this buffer contains the EOF.
					 * before releasing it, we'll zero out to the end
					 * of the buffer so that if a mmap of this page
					 * occurs, we'll see zero's even if a ftruncate
					 * extends the file in the meantime
					 */
					bzero((caddr_t)(bp->b_data + bp->b_validend),
					      bp->b_bufsize - bp->b_validend);
					FSDBG(258, bp->b_validend,
					      bp->b_bufsize - bp->b_validend, 0, 2);
				}
			}
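			/*
			 * Worked example (illustrative): b_bcount = 4096 but
			 * the server returns only 1024 bytes, leaving
			 * uio_resid = 3072 and diff = 1024.  If n_size says
			 * 2048 more valid bytes exist past the short read,
			 * that span is a hole: it is zeroed and b_validend
			 * becomes 1024 + 2048 = 3072, not the full buffer.
			 */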
			if (p && (vp->v_flag & VTEXT) &&
			    (((nmp->nm_flag & NFSMNT_NQNFS) &&
			      NQNFS_CKINVALID(vp, np, ND_READ) &&
			      np->n_lrev != np->n_brev) ||
			     (!(nmp->nm_flag & NFSMNT_NQNFS) &&
			      np->n_mtime != np->n_vattr.va_mtime.tv_sec))) {
				uprintf("Process killed due to text file modification\n");
				psignal(p, SIGKILL);
				p->p_flag |= P_NOSWAP;
			}
			break;
		case VLNK:
			uiop->uio_offset = (off_t)0;
			nfsstats.readlink_bios++;
			error = nfs_readlinkrpc(vp, uiop, cr);
			break;
		case VDIR:
			nfsstats.readdir_bios++;
			uiop->uio_offset = ((u_quad_t)bp->b_lblkno) * NFS_DIRBLKSIZ;
			if (!(nmp->nm_flag & NFSMNT_NFSV3))
				nmp->nm_flag &= ~NFSMNT_RDIRPLUS; /* dk@farm.org */
			if (nmp->nm_flag & NFSMNT_RDIRPLUS) {
				error = nfs_readdirplusrpc(vp, uiop, cr);
				if (error == NFSERR_NOTSUPP)
					nmp->nm_flag &= ~NFSMNT_RDIRPLUS;
			}
			if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0)
				error = nfs_readdirrpc(vp, uiop, cr);
			break;
		default:
			printf("nfs_doio: type %x unexpected\n", vp->v_type);
			break;
		};
		if (error) {
			SET(bp->b_flags, B_ERROR);
			bp->b_error = error;
		}
	} else {
		/*
		 * mapped I/O may have altered any bytes, so we extend
		 * the dirty zone to the valid zone.  For best performance
		 * a better solution would be to save & restore page dirty bits
		 * around the uiomove which brings write-data into the buffer.
		 * Then here we'd check if the page is dirty rather than WASMAPPED
		 * Also vnode_pager would change - if a page is clean it might
		 * still need to be written due to DELWRI.
		 */
		if (UBCINFOEXISTS(vp) && ubc_issetflags(vp, UI_WASMAPPED)) {
			bp->b_dirtyoff = min(bp->b_dirtyoff, bp->b_validoff);
			bp->b_dirtyend = max(bp->b_dirtyend, bp->b_validend);
		}
		if ((off_t)bp->b_blkno * DEV_BSIZE + bp->b_dirtyend > np->n_size)
			bp->b_dirtyend = np->n_size - (off_t)bp->b_blkno * DEV_BSIZE;
		if (bp->b_dirtyend > bp->b_dirtyoff) {
			io.iov_len = uiop->uio_resid = bp->b_dirtyend - bp->b_dirtyoff;
			uiop->uio_offset = (off_t)bp->b_blkno * DEV_BSIZE +
					   bp->b_dirtyoff;
			io.iov_base = (char *)bp->b_data + bp->b_dirtyoff;
			uiop->uio_rw = UIO_WRITE;

			nfsstats.write_bios++;
			if ((bp->b_flags & (B_ASYNC | B_NEEDCOMMIT | B_NOCACHE)) ==
			    B_ASYNC)
				iomode = NFSV3WRITE_UNSTABLE;
			else
				iomode = NFSV3WRITE_FILESYNC;
			SET(bp->b_flags, B_WRITEINPROG);
			error = nfs_writerpc(vp, uiop, cr, &iomode, &must_commit);
			if (!error && iomode == NFSV3WRITE_UNSTABLE)
				SET(bp->b_flags, B_NEEDCOMMIT);
			else
				CLR(bp->b_flags, B_NEEDCOMMIT);
			CLR(bp->b_flags, B_WRITEINPROG);
			/*
			 * For an interrupted write, the buffer is still valid
			 * and the write hasn't been pushed to the server yet,
			 * so we can't set B_ERROR and report the interruption
			 * by setting B_EINTR.  For the B_ASYNC case, B_EINTR
			 * is not relevant, so the rpc attempt is essentially
			 * a noop.  For the case of a V3 write rpc not being
			 * committed to stable storage, the block is still
			 * dirty and requires either a commit rpc or another
			 * write rpc with iomode == NFSV3WRITE_FILESYNC before
			 * the block is reused.  This is indicated by setting
			 * the B_DELWRI and B_NEEDCOMMIT flags.
			 */
			if (error == EINTR || (!error && bp->b_flags & B_NEEDCOMMIT)) {
				int s;

				CLR(bp->b_flags, B_INVAL | B_NOCACHE);
				if (!ISSET(bp->b_flags, B_DELWRI)) {
					SET(bp->b_flags, B_DELWRI);
					nbdwrite++;
				}
				FSDBG(261, bp->b_validoff, bp->b_validend,
				      bp->b_bufsize, bp->b_bcount);
				/*
				 * Since for the B_ASYNC case, nfs_bwrite() has
				 * reassigned the buffer to the clean list, we have to
				 * reassign it back to the dirty one. Ugh.
				 */
				if (ISSET(bp->b_flags, B_ASYNC)) {
					s = splbio();
					reassignbuf(bp, vp);
					splx(s);
				} else {
					SET(bp->b_flags, B_EINTR);
				}
			} else {
				if (error) {
					SET(bp->b_flags, B_ERROR);
					bp->b_error = np->n_error = error;
					np->n_flag |= NWRITEERR;
				}
				bp->b_dirtyoff = bp->b_dirtyend = 0;
				/*
				 * validoff and validend represent the real data
				 * present in this buffer.
				 * if validoff is non-zero, then we have to
				 * invalidate the buffer and kill the page when
				 * biodone is called... the same is also true when
				 * validend doesn't extend all the way to the end
				 * of the buffer and validend doesn't equate to
				 * the current EOF... eventually we need to deal
				 * with this in a more humane way (like keeping
				 * the partial buffer without making it immediately
				 * available to the VM page cache).
				 */
				if (bp->b_validoff)
					SET(bp->b_flags, B_INVAL);

				if (bp->b_validend < bp->b_bufsize) {
					if ((off_t)bp->b_blkno * DEV_BSIZE +
					    bp->b_validend == np->n_size) {
						bzero((caddr_t)(bp->b_data +
						      bp->b_validend),
						      bp->b_bufsize - bp->b_validend);
						FSDBG(259, bp->b_validend,
						      bp->b_bufsize - bp->b_validend, 0,
						      0);
					} else
						SET(bp->b_flags, B_INVAL);
				}
			}
		} else {
			if (bp->b_validoff ||
			    (bp->b_validend < bp->b_bufsize &&
			     (off_t)bp->b_blkno * DEV_BSIZE + bp->b_validend !=
			     np->n_size))
				SET(bp->b_flags, B_INVAL);
			if (bp->b_flags & B_INVAL) {
				FSDBG(260, bp->b_validoff, bp->b_validend,
				      bp->b_bufsize, bp->b_bcount);
			}
			bp->b_resid = 0;
			FSDBG_BOT(256, bp->b_validoff, bp->b_validend, bp->b_bufsize,
				  np->n_size);
			biodone(bp);
			return (0);
		}
	}
	bp->b_resid = uiop->uio_resid;
	if (must_commit)
		nfs_clearcommit(vp->v_mount);

	if (bp->b_flags & B_INVAL) {
		FSDBG(260, bp->b_validoff, bp->b_validend, bp->b_bufsize,
		      bp->b_bcount);
	}
	FSDBG_BOT(256, bp->b_validoff, bp->b_validend, bp->b_bcount, error);

	biodone(bp);
	return (error);
}