bsd/nfs/nfs_bio.c

   1 /*
   2  * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
   3  *
   4  * @APPLE_LICENSE_HEADER_START@
   5  *
   6  * The contents of this file constitute Original Code as defined in and
   7  * are subject to the Apple Public Source License Version 1.1 (the
   8  * "License").  You may not use this file except in compliance with the
   9  * License.  Please obtain a copy of the License at
  10  * http://www.apple.com/publicsource and read it before using this file.
  11  *
  12  * This Original Code and all software distributed under the License are
  13  * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  14  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  15  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  16  * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
  17  * License for the specific language governing rights and limitations
  18  * under the License.
  19  *
  20  * @APPLE_LICENSE_HEADER_END@
  21  */
  22 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
  23 /*
  24  * Copyright (c) 1989, 1993
  25  *      The Regents of the University of California.  All rights reserved.
  26  *
  27  * This code is derived from software contributed to Berkeley by
  28  * Rick Macklem at The University of Guelph.
  29  *
  30  * Redistribution and use in source and binary forms, with or without
  31  * modification, are permitted provided that the following conditions
  32  * are met:
  33  * 1. Redistributions of source code must retain the above copyright
  34  *    notice, this list of conditions and the following disclaimer.
  35  * 2. Redistributions in binary form must reproduce the above copyright
  36  *    notice, this list of conditions and the following disclaimer in the
  37  *    documentation and/or other materials provided with the distribution.
  38  * 3. All advertising materials mentioning features or use of this software
  39  *    must display the following acknowledgement:
  40  *      This product includes software developed by the University of
  41  *      California, Berkeley and its contributors.
  42  * 4. Neither the name of the University nor the names of its contributors
  43  *    may be used to endorse or promote products derived from this software
  44  *    without specific prior written permission.
  45  *
  46  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  47  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  48  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  49  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  50  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  51  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  52  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  53  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  54  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  55  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  56  * SUCH DAMAGE.
  57  *
  58  *      @(#)nfs_bio.c   8.9 (Berkeley) 3/30/95
  59  * FreeBSD-Id: nfs_bio.c,v 1.44 1997/09/10 19:52:25 phk Exp $
  60  */
  61 #include <sys/param.h>
  62 #include <sys/systm.h>
  63 #include <sys/resourcevar.h>
  64 #include <sys/signalvar.h>
  65 #include <sys/proc.h>
  66 #include <sys/buf.h>
  67 #include <sys/vnode.h>
  68 #include <sys/mount.h>
  69 #include <sys/kernel.h>
  70 #include <sys/sysctl.h>
  71 #include <sys/ubc.h>
  72
  73 #include <sys/vm.h>
  74 #include <sys/vmparam.h>
  75
  76 #include <sys/time.h>
  77 #include <kern/clock.h>
  78
  79 #include <nfs/rpcv2.h>
  80 #include <nfs/nfsproto.h>
  81 #include <nfs/nfs.h>
  82 #include <nfs/nfsmount.h>
  83 #include <nfs/nqnfs.h>
  84 #include <nfs/nfsnode.h>
  85
  86 #include <sys/kdebug.h>
  87
  88 #define FSDBG(A, B, C, D, E) \
  89         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_NONE, \
  90                 (int)(B), (int)(C), (int)(D), (int)(E), 0)
  91 #define FSDBG_TOP(A, B, C, D, E) \
  92         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_START, \
  93                 (int)(B), (int)(C), (int)(D), (int)(E), 0)
  94 #define FSDBG_BOT(A, B, C, D, E) \
  95         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_END, \
  96                 (int)(B), (int)(C), (int)(D), (int)(E), 0)
  97
  98 static struct buf *nfs_getcacheblk __P((struct vnode *vp, daddr_t bn, int size,
  99                                         struct proc *p, int operation));
 100
 101 extern int nfs_numasync;
 102 extern struct nfsstats nfsstats;
 103
 104 /*
 105  * Vnode op for read using bio
 106  * Any similarity to readip() is purely coincidental
 107  */
 108 int
 109 nfs_bioread(vp, uio, ioflag, cred, getpages)
 110         register struct vnode *vp;
 111         register struct uio *uio;
 112         int ioflag;
 113         struct ucred *cred;
 114         int getpages;
 115 {
 116         register struct nfsnode *np = VTONFS(vp);
 117         register int biosize, diff, i;
 118         struct buf *bp = 0, *rabp;
 119         struct vattr vattr;
 120         struct proc *p;
 121         struct nfsmount *nmp = VFSTONFS(vp->v_mount);
 122         daddr_t lbn, rabn;
 123         int bufsize;
 124         int nra, error = 0, n = 0, on = 0, not_readin;
 125         int operation = (getpages? BLK_PAGEIN : BLK_READ);
 126
 127 #if DIAGNOSTIC
 128         if (uio->uio_rw != UIO_READ)
 129                 panic("nfs_read mode");
 130 #endif
 131         if (uio->uio_resid == 0)
 132                 return (0);
 133         if (uio->uio_offset < 0)
 134                 return (EINVAL);
 135         p = uio->uio_procp;
 136         if ((nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_GOTFSINFO)) == NFSMNT_NFSV3)
 137                 (void)nfs_fsinfo(nmp, vp, cred, p);
 138         /*due to getblk/vm interractions, use vm page size or less values */
 139         biosize = min(vp->v_mount->mnt_stat.f_iosize, PAGE_SIZE);
 140         /*
 141          * For nfs, cache consistency can only be maintained approximately.
 142          * Although RFC1094 does not specify the criteria, the following is
 143          * believed to be compatible with the reference port.
 144          * For nqnfs, full cache consistency is maintained within the loop.
 145          * For nfs:
 146          * If the file's modify time on the server has changed since the
 147          * last read rpc or you have written to the file,
 148          * you may have lost data cache consistency with the
 149          * server, so flush all of the file's data out of the cache.
 150          * Then force a getattr rpc to ensure that you have up to date
 151          * attributes.
 152          * NB: This implies that cache data can be read when up to
 153          * NFS_ATTRTIMEO seconds out of date. If you find that you need current
 154          * attributes this could be forced by setting n_attrstamp to 0 before
 155          * the VOP_GETATTR() call.
 156          */
 157         if ((nmp->nm_flag & NFSMNT_NQNFS) == 0) {
 158                 if (np->n_flag & NMODIFIED) {
 159                         if (vp->v_type != VREG) {
 160                                 if (vp->v_type != VDIR)
 161                                         panic("nfs: bioread, not dir");
 162                                 nfs_invaldir(vp);
 163                                 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
 164                                 if (error)
 165                                         return (error);
 166                         }
 167                         np->n_attrstamp = 0;
 168                         error = VOP_GETATTR(vp, &vattr, cred, p);
 169                         if (error)
 170                                 return (error);
 171                         np->n_mtime = vattr.va_mtime.tv_sec;
 172                 } else {
 173                         error = VOP_GETATTR(vp, &vattr, cred, p);
 174                         if (error)
 175                                 return (error);
 176                         if (np->n_mtime != vattr.va_mtime.tv_sec) {
 177                                 if (vp->v_type == VDIR)
 178                                         nfs_invaldir(vp);
 179                                 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
 180                                 if (error)
 181                                         return (error);
 182                                 np->n_mtime = vattr.va_mtime.tv_sec;
 183                         }
 184                 }
 185         }
 186         do {
 187
 188             /*
 189              * Get a valid lease. If cached data is stale, flush it.
 190              */
 191             if (nmp->nm_flag & NFSMNT_NQNFS) {
 192                 if (NQNFS_CKINVALID(vp, np, ND_READ)) {
 193                     do {
 194                         error = nqnfs_getlease(vp, ND_READ, cred, p);
 195                     } while (error == NQNFS_EXPIRED);
 196                     if (error)
 197                         return (error);
 198                     if (np->n_lrev != np->n_brev ||
 199                         (np->n_flag & NQNFSNONCACHE) ||
 200                         ((np->n_flag & NMODIFIED) && vp->v_type == VDIR)) {
 201                         if (vp->v_type == VDIR)
 202                             nfs_invaldir(vp);
 203                         error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
 204                         if (error)
 205                             return (error);
 206                         np->n_brev = np->n_lrev;
 207                     }
 208                 } else if (vp->v_type == VDIR && (np->n_flag & NMODIFIED)) {
 209                     nfs_invaldir(vp);
 210                     error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
 211                     if (error)
 212                         return (error);
 213                 }
 214             }
 215             if (np->n_flag & NQNFSNONCACHE) {
 216                 switch (vp->v_type) {
 217                 case VREG:
 218                         return (nfs_readrpc(vp, uio, cred));
 219                 case VLNK:
 220                         return (nfs_readlinkrpc(vp, uio, cred));
 221                 case VDIR:
 222                         break;
 223                 default:
 224                         printf(" NQNFSNONCACHE: type %x unexpected\n",
 225                                 vp->v_type);
 226                 };
 227             }
 228             switch (vp->v_type) {
 229             case VREG:
 230                 nfsstats.biocache_reads++;
 231                 lbn = uio->uio_offset / biosize;
 232                 on = uio->uio_offset & (biosize - 1);
 233                 not_readin = 1;
 234
 235                 /*
 236                  * Start the read ahead(s), as required.
 237                  */
 238                 if (nfs_numasync > 0 && nmp->nm_readahead > 0) {
 239                     for (nra = 0; nra < nmp->nm_readahead &&
 240                                   (off_t)(lbn + 1 + nra) * biosize < np->n_size;
 241                          nra++) {
 242                                 rabn = lbn + 1 + nra;
 243                                 if (!incore(vp, rabn)) {
 244                                         rabp = nfs_getcacheblk(vp, rabn, biosize, p, operation);
 245                                         if (!rabp)
 246                                                 return (EINTR);
 247                                         if (!ISSET(rabp->b_flags, (B_CACHE|B_DELWRI))) {
 248                                                 SET(rabp->b_flags, (B_READ | B_ASYNC));
 249                                                 if (nfs_asyncio(rabp, cred)) {
 250                                                         SET(rabp->b_flags, (B_INVAL|B_ERROR));
 251                                                         rabp->b_error = EIO;
 252                                                         brelse(rabp);
 253                                                 }
 254                                         } else
 255                                                 brelse(rabp);
 256                                 }
 257                     }
 258                 }
 259
 260                 /*
 261                  * If the block is in the cache and has the required data
 262                  * in a valid region, just copy it out.
 263                  * Otherwise, get the block and write back/read in,
 264                  * as required.
 265                  */
 266 again:
 267                 bufsize = biosize;
 268                 if ((off_t)(lbn + 1) * biosize > np->n_size &&
 269                     (off_t)(lbn + 1) * biosize - np->n_size < biosize) {
 270                         bufsize = np->n_size - lbn * biosize;
 271                         bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
 272                 }
 273                 bp = nfs_getcacheblk(vp, lbn, bufsize, p, operation);
 274                 if (!bp)
 275                         return (EINTR);
 276
 277                 if (!ISSET(bp->b_flags, B_CACHE)) {
 278                         SET(bp->b_flags, B_READ);
 279                         CLR(bp->b_flags, (B_DONE | B_ERROR | B_INVAL));
 280                         not_readin = 0;
 281                         error = nfs_doio(bp, cred, p);
 282                         if (error) {
 283                             brelse(bp);
 284                             return (error);
 285                         }
 286                 }
 287                 if (bufsize > on) {
 288                         n = min((unsigned)(bufsize - on), uio->uio_resid);
 289                 } else {
 290                         n = 0;
 291                 }
 292                 diff = np->n_size - uio->uio_offset;
 293                 if (diff < n)
 294                         n = diff;
 295                 if (not_readin && n > 0) {
 296                         if (on < bp->b_validoff || (on + n) > bp->b_validend) {
 297                                 SET(bp->b_flags, (B_NOCACHE|B_INVAFTERWRITE));
 298                                 if (bp->b_dirtyend > 0) {
 299                                         if (!ISSET(bp->b_flags, B_DELWRI))
 300                                                 panic("nfsbioread");
 301                                         if (VOP_BWRITE(bp) == EINTR)
 302                                                 return (EINTR);
 303                                 } else
 304                                         brelse(bp);
 305                                 goto again;
 306                         }
 307                 }
 308                 vp->v_lastr = lbn;
 309                 diff = (on >= bp->b_validend) ? 0 : (bp->b_validend - on);
 310                 if (diff < n)
 311                         n = diff;
 312                 break;
 313             case VLNK:
 314                 nfsstats.biocache_readlinks++;
 315                 bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, p, operation);
 316                 if (!bp)
 317                         return (EINTR);
 318                 if (!ISSET(bp->b_flags, B_CACHE)) {
 319                         SET(bp->b_flags, B_READ);
 320                         error = nfs_doio(bp, cred, p);
 321                         if (error) {
 322                                 SET(bp->b_flags, B_ERROR);
 323                                 brelse(bp);
 324                                 return (error);
 325                         }
 326                 }
 327                 n = min(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid);
 328                 on = 0;
 329                 break;
 330             case VDIR:
 331                 nfsstats.biocache_readdirs++;
 332                 if (np->n_direofoffset
 333                     && uio->uio_offset >= np->n_direofoffset) {
 334                     return (0);
 335                 }
 336                 lbn = uio->uio_offset / NFS_DIRBLKSIZ;
 337                 on = uio->uio_offset & (NFS_DIRBLKSIZ - 1);
 338                 bp = nfs_getcacheblk(vp, lbn, NFS_DIRBLKSIZ, p, operation);
 339                 if (!bp)
 340                     return (EINTR);
 341                 if (!ISSET(bp->b_flags, B_CACHE)) {
 342                     SET(bp->b_flags, B_READ);
 343                     error = nfs_doio(bp, cred, p);
 344                     if (error) {
 345                         brelse(bp);
 346                     }
 347                     while (error == NFSERR_BAD_COOKIE) {
 348                         nfs_invaldir(vp);
 349                         error = nfs_vinvalbuf(vp, 0, cred, p, 1);
 350                         /*
 351                          * Yuck! The directory has been modified on the
 352                          * server. The only way to get the block is by
 353                          * reading from the beginning to get all the
 354                          * offset cookies.
 355                          */
 356                         for (i = 0; i <= lbn && !error; i++) {
 357                             if (np->n_direofoffset
 358                                 && (i * NFS_DIRBLKSIZ) >= np->n_direofoffset)
 359                                     return (0);
 360                             bp = nfs_getcacheblk(vp, i, NFS_DIRBLKSIZ, p,
 361                                                  operation);
 362                             if (!bp)
 363                                     return (EINTR);
 364                             if (!ISSET(bp->b_flags, B_CACHE)) {
 365                                     SET(bp->b_flags, B_READ);
 366                                     error = nfs_doio(bp, cred, p);
 367                                     /*
 368                                      * no error + B_INVAL == directory EOF,
 369                                      * use the block.
 370                                      */
 371                                     if (error == 0 && (bp->b_flags & B_INVAL))
 372                                             break;
 373                             }
 374                             /*
 375                              * An error will throw away the block and the
 376                              * for loop will break out.  If no error and this
 377                              * is not the block we want, we throw away the
 378                              * block and go for the next one via the for loop.
 379                              */
 380                             if (error || i < lbn)
 381                                     brelse(bp);
 382                         }
 383                     }
 384                     /*
 385                      * The above while is repeated if we hit another cookie
 386                      * error.  If we hit an error and it wasn't a cookie error,
 387                      * we give up.
 388                      */
 389                     if (error)
 390                         return (error);
 391                 }
 392
 393                 /*
 394                  * If not eof and read aheads are enabled, start one.
 395                  * (You need the current block first, so that you have the
 396                  *  directory offset cookie of the next block.)
 397                  */
 398                 if (nfs_numasync > 0 && nmp->nm_readahead > 0 &&
 399                     (np->n_direofoffset == 0 ||
 400                     (lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) &&
 401                     !(np->n_flag & NQNFSNONCACHE) &&
 402                     !incore(vp, lbn + 1)) {
 403                         rabp = nfs_getcacheblk(vp, lbn + 1, NFS_DIRBLKSIZ, p,
 404                                                operation);
 405                         if (rabp) {
 406                             if (!ISSET(rabp->b_flags, (B_CACHE|B_DELWRI))) {
 407                                 SET(rabp->b_flags, (B_READ | B_ASYNC));
 408                                 if (nfs_asyncio(rabp, cred)) {
 409                                     SET(rabp->b_flags, (B_INVAL|B_ERROR));
 410                                     rabp->b_error = EIO;
 411                                     brelse(rabp);
 412                                 }
 413                             } else {
 414                                 brelse(rabp);
 415                             }
 416                         }
 417                 }
 418                 /*
 419                  * Make sure we use a signed variant of min() since
 420                  * the second term may be negative.
 421                  */
 422                 n = lmin(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid - on);
 423                 /*
 424                  * Unlike VREG files, whos buffer size ( bp->b_bcount ) is
 425                  * chopped for the EOF condition, we cannot tell how large
 426                  * NFS directories are going to be until we hit EOF.  So
 427                  * an NFS directory buffer is *not* chopped to its EOF.  Now,
 428                  * it just so happens that b_resid will effectively chop it
 429                  * to EOF.  *BUT* this information is lost if the buffer goes
 430                  * away and is reconstituted into a B_CACHE state (recovered
 431                  * from VM) later.  So we keep track of the directory eof
 432                  * in np->n_direofoffset and chop it off as an extra step
 433                  * right here.
 434                  */
 435                 if (np->n_direofoffset &&
 436                     n > np->n_direofoffset - uio->uio_offset)
 437                         n = np->n_direofoffset - uio->uio_offset;
 438                 break;
 439             default:
 440                 printf(" nfs_bioread: type %x unexpected\n",vp->v_type);
 441                 break;
 442             };
 443
 444             if (n > 0) {
 445                 error = uiomove(bp->b_data + on, (int)n, uio);
 446             }
 447             switch (vp->v_type) {
 448             case VREG:
 449                 break;
 450             case VLNK:
 451                 n = 0;
 452                 break;
 453             case VDIR:
 454                 if (np->n_flag & NQNFSNONCACHE)
 455                         SET(bp->b_flags, B_INVAL);
 456                 break;
 457             default:
 458                 printf(" nfs_bioread: type %x unexpected\n",vp->v_type);
 459             }
 460             brelse(bp);
 461         } while (error == 0 && uio->uio_resid > 0 && n > 0);
 462         return (error);
 463 }
 464
 465
 466 /*
 467  * Vnode op for write using bio
 468  */
 469 int
 470 nfs_write(ap)
 471         struct vop_write_args /* {
 472                 struct vnode *a_vp;
 473                 struct uio *a_uio;
 474                 int  a_ioflag;
 475                 struct ucred *a_cred;
 476         } */ *ap;
 477 {
 478         register int biosize;
 479         register struct uio *uio = ap->a_uio;
 480         struct proc *p = uio->uio_procp;
 481         register struct vnode *vp = ap->a_vp;
 482         struct nfsnode *np = VTONFS(vp);
 483         register struct ucred *cred = ap->a_cred;
 484         int ioflag = ap->a_ioflag;
 485         struct buf *bp;
 486         struct vattr vattr;
 487         struct nfsmount *nmp = VFSTONFS(vp->v_mount);
 488         daddr_t lbn;
 489         int bufsize;
 490         int n, on, error = 0, iomode, must_commit;
 491         off_t boff;
 492         struct iovec iov;
 493         struct uio auio;
 494
 495 #if DIAGNOSTIC
 496         if (uio->uio_rw != UIO_WRITE)
 497                 panic("nfs_write mode");
 498         if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != current_proc())
 499                 panic("nfs_write proc");
 500 #endif
 501         if (vp->v_type != VREG)
 502                 return (EIO);
 503         if (np->n_flag & NWRITEERR) {
 504                 np->n_flag &= ~NWRITEERR;
 505                 return (np->n_error);
 506         }
 507         if ((nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_GOTFSINFO)) == NFSMNT_NFSV3)
 508                 (void)nfs_fsinfo(nmp, vp, cred, p);
 509         if (ioflag & (IO_APPEND | IO_SYNC)) {
 510                 if (np->n_flag & NMODIFIED) {
 511                         np->n_attrstamp = 0;
 512                         error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
 513                         if (error)
 514                                 return (error);
 515                 }
 516                 if (ioflag & IO_APPEND) {
 517                         np->n_attrstamp = 0;
 518                         error = VOP_GETATTR(vp, &vattr, cred, p);
 519                         if (error)
 520                                 return (error);
 521                         uio->uio_offset = np->n_size;
 522                 }
 523         }
 524         if (uio->uio_offset < 0)
 525                 return (EINVAL);
 526         if (uio->uio_resid == 0)
 527                 return (0);
 528         /*
 529          * Maybe this should be above the vnode op call, but so long as
 530          * file servers have no limits, i don't think it matters
 531          */
 532         if (p && uio->uio_offset + uio->uio_resid >
 533               p->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
 534                 psignal(p, SIGXFSZ);
 535                 return (EFBIG);
 536         }
 537         /*
 538          * I use nm_rsize, not nm_wsize so that all buffer cache blocks
 539          * will be the same size within a filesystem. nfs_writerpc will
 540          * still use nm_wsize when sizing the rpc's.
 541          */
 542         /*due to getblk/vm interractions, use vm page size or less values */
 543         biosize = min(vp->v_mount->mnt_stat.f_iosize, PAGE_SIZE);
 544
 545         do {
 546                 /*
 547                  * Check for a valid write lease.
 548                  */
 549                 if ((nmp->nm_flag & NFSMNT_NQNFS) &&
 550                     NQNFS_CKINVALID(vp, np, ND_WRITE)) {
 551                         do {
 552                                 error = nqnfs_getlease(vp, ND_WRITE, cred, p);
 553                         } while (error == NQNFS_EXPIRED);
 554                         if (error)
 555                                 return (error);
 556                         if (np->n_lrev != np->n_brev ||
 557                             (np->n_flag & NQNFSNONCACHE)) {
 558                                 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
 559                                 if (error)
 560                                         return (error);
 561                                 np->n_brev = np->n_lrev;
 562                         }
 563                 }
 564                 if ((np->n_flag & NQNFSNONCACHE) && uio->uio_iovcnt == 1) {
 565                     iomode = NFSV3WRITE_FILESYNC;
 566                     error = nfs_writerpc(vp, uio, cred, &iomode, &must_commit);
 567                     if (must_commit)
 568                         nfs_clearcommit(vp->v_mount);
 569                     return (error);
 570                 }
 571                 nfsstats.biocache_writes++;
 572                 lbn = uio->uio_offset / biosize;
 573                 on = uio->uio_offset & (biosize-1);
 574                 n = min((unsigned)(biosize - on), uio->uio_resid);
 575 again:
 576                 bufsize = biosize;
 577 #if 0
 578 /* (removed for UBC) */
 579                 if ((lbn + 1) * biosize > np->n_size) {
 580                         bufsize = np->n_size - lbn * biosize;
 581                         bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
 582                 }
 583 #endif
 584                 /*
 585                  * Get a cache block for writing.  The range to be written is
 586                  * (off..off+len) within the block.  We ensure that the block
 587                  * either has no dirty region or that the given range is
 588                  * contiguous with the existing dirty region.
 589                  */
 590                 bp = nfs_getcacheblk(vp, lbn, bufsize, p, BLK_WRITE);
 591                 if (!bp)
 592                         return (EINTR);
 593                 /*
 594                  * Resize nfsnode *after* we busy the buffer to prevent
 595                  * readers from reading garbage.
 596                  * If there was a partial buf at the old eof, validate
 597                  * and zero the new bytes.
 598                  */
 599                 if (uio->uio_offset + n > np->n_size) {
 600                         struct buf *bp0 = NULL;
 601                         daddr_t bn = np->n_size / biosize;
 602                         int off = np->n_size & (biosize - 1);
 603
 604                         if (off && bn < lbn && incore(vp, bn))
 605                                 bp0 = nfs_getcacheblk(vp, bn, biosize, p,
 606                                                       BLK_WRITE);
 607                         np->n_flag |= NMODIFIED;
 608                         np->n_size = uio->uio_offset + n;
 609                         ubc_setsize(vp, (off_t)np->n_size); /* XXX errors */
 610                         if (bp0) {
 611                                 bzero((char *)bp0->b_data + off, biosize - off);
 612                                 bp0->b_validend = biosize;
 613                                 brelse(bp0);
 614                         }
 615                 }
 616                 /*
 617                  * NFS has embedded ucred so crhold() risks zone corruption
 618                  */
 619                 if (bp->b_wcred == NOCRED)
 620                         bp->b_wcred = crdup(cred);
 621                 /*
 622                  * If dirtyend exceeds file size, chop it down.  This should
 623                  * not occur unless there is a race.
 624                  */
 625                 if ((off_t)bp->b_blkno * DEV_BSIZE + bp->b_dirtyend >
 626                     np->n_size)
 627                         bp->b_dirtyend = np->n_size - (off_t)bp->b_blkno *
 628                                                       DEV_BSIZE;
 629                 /*
 630                  * UBC doesn't (yet) handle partial pages so nfs_biowrite was
 631                  * hacked to never bdwrite, to start every little write right
 632                  * away.  Running IE Avie noticed the performance problem, thus
 633                  * this code, which permits those delayed writes by ensuring an
 634                  * initial read of the entire page.  The read may hit eof
 635                  * ("short read") but that we will handle.
 636                  *
 637                  * We are quite dependant on the correctness of B_CACHE so check
 638                  * that first in case of problems.
 639                  */
 640                 if (!ISSET(bp->b_flags, B_CACHE) && n < PAGE_SIZE) {
 641                         boff = (off_t)bp->b_blkno * DEV_BSIZE;
 642                         auio.uio_iov = &iov;
 643                         auio.uio_iovcnt = 1;
 644                         auio.uio_offset = boff;
 645                         auio.uio_resid = PAGE_SIZE;
 646                         auio.uio_segflg = UIO_SYSSPACE;
 647                         auio.uio_rw = UIO_READ;
 648                         auio.uio_procp = p;
 649                         iov.iov_base = bp->b_data;
 650                         iov.iov_len = PAGE_SIZE;
 651                         error = nfs_readrpc(vp, &auio, cred);
 652                         if (error) {
 653                                 bp->b_error = error;
 654                                 SET(bp->b_flags, B_ERROR);
 655                                 printf("nfs_write: readrpc %d", error);
 656                         }
 657                         if (auio.uio_resid > 0)
 658                                 bzero(iov.iov_base, auio.uio_resid);
 659                         bp->b_validoff = 0;
 660                         bp->b_validend = PAGE_SIZE - auio.uio_resid;
 661                         if (np->n_size > boff + bp->b_validend)
 662                                 bp->b_validend = min(np->n_size - boff,
 663                                                      PAGE_SIZE);
 664                         bp->b_dirtyoff = 0;
 665                         bp->b_dirtyend = 0;
 666                 }
 667
 668                 /*
 669                  * If the new write will leave a contiguous dirty
 670                  * area, just update the b_dirtyoff and b_dirtyend,
 671                  * otherwise try to extend the dirty region.
 672                  */
 673                 if (bp->b_dirtyend > 0 &&
 674                     (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) {
 675                         off_t start, end;
 676
 677                         boff = (off_t)bp->b_blkno * DEV_BSIZE;
 678                         if (on > bp->b_dirtyend) {
 679                                 start = boff + bp->b_validend;
 680                                 end = boff + on;
 681                         } else {
 682                                 start = boff + on + n;
 683                                 end = boff + bp->b_validoff;
 684                         }
 685
 686                         /*
 687                          * It may be that the valid region in the buffer
 688                          * covers the region we want, in which case just
 689                          * extend the dirty region.  Otherwise we try to
 690                          * extend the valid region.
 691                          */
 692                         if (end > start) {
 693                                 auio.uio_iov = &iov;
 694                                 auio.uio_iovcnt = 1;
 695                                 auio.uio_offset = start;
 696                                 auio.uio_resid = end - start;
 697                                 auio.uio_segflg = UIO_SYSSPACE;
 698                                 auio.uio_rw = UIO_READ;
 699                                 auio.uio_procp = p;
 700                                 iov.iov_base = bp->b_data + (start - boff);
 701                                 iov.iov_len = end - start;
 702                                 error = nfs_readrpc(vp, &auio, cred);
 703                                 /*
 704                                  * If we couldn't read, do not do a VOP_BWRITE
 705                                  * as originally coded. That could also error
 706                                  * and looping back to "again" as it was doing
 707                                  * could have us stuck trying to write same buf
 708                                  * again. nfs_write, will get the entire region
 709                                  * if nfs_readrpc succeeded. If unsuccessful
 710                                  * we should just error out. Errors like ESTALE
 711                                  * would keep us looping rather than transient
 712                                  * errors justifying a retry. We can return here
 713                                  * instead of altering dirty region later.  We
 714                                  * did not write old dirty region at this point.
 715                                  */
 716                                 if (error) {
 717                                         bp->b_error = error;
 718                                         SET(bp->b_flags, B_ERROR);
 719                                         printf("nfs_write: readrpc2 %d", error);
 720                                         brelse(bp);
 721                                         return (error);
 722                                 }
 723                                 /*
 724                                  * The read worked.
 725                                  * If there was a short read, just zero fill.
 726                                  */
 727                                 if (auio.uio_resid > 0)
 728                                         bzero(iov.iov_base, auio.uio_resid);
 729                                 if (on > bp->b_dirtyend)
 730                                         bp->b_validend = on;
 731                                 else
 732                                         bp->b_validoff = on + n;
 733                         }
 734                         /*
 735                          * We now have a valid region which extends up to the
 736                          * dirty region which we want.
 737                          */
 738                         if (on > bp->b_dirtyend)
 739                                 bp->b_dirtyend = on;
 740                         else
 741                                 bp->b_dirtyoff = on + n;
 742                 }
 743                 if (ISSET(bp->b_flags, B_ERROR)) {
 744                         error = bp->b_error;
 745                         brelse(bp);
 746                         return (error);
 747                 }
 748                 /*
 749                  * NFS has embedded ucred so crhold() risks zone corruption
 750                  */
 751                 if (bp->b_wcred == NOCRED)
 752                         bp->b_wcred = crdup(cred);
 753                 np->n_flag |= NMODIFIED;
 754
 755                 /*
 756                  * Check for valid write lease and get one as required.
 757                  * In case getblk() and/or bwrite() delayed us.
 758                  */
 759                 if ((nmp->nm_flag & NFSMNT_NQNFS) &&
 760                     NQNFS_CKINVALID(vp, np, ND_WRITE)) {
 761                         do {
 762                                 error = nqnfs_getlease(vp, ND_WRITE, cred, p);
 763                         } while (error == NQNFS_EXPIRED);
 764                         if (error) {
 765                                 brelse(bp);
 766                                 return (error);
 767                         }
 768                         if (np->n_lrev != np->n_brev ||
 769                             (np->n_flag & NQNFSNONCACHE)) {
 770                                 brelse(bp);
 771                                 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
 772                                 if (error)
 773                                         return (error);
 774                                 np->n_brev = np->n_lrev;
 775                                 goto again;
 776                         }
 777                 }
 778                 error = uiomove((char *)bp->b_data + on, n, uio);
 779                 if (error) {
 780                         SET(bp->b_flags, B_ERROR);
 781                         brelse(bp);
 782                         return (error);
 783                 }
 784                 if (bp->b_dirtyend > 0) {
 785                         bp->b_dirtyoff = min(on, bp->b_dirtyoff);
 786                         bp->b_dirtyend = max((on + n), bp->b_dirtyend);
 787                 } else {
 788                         bp->b_dirtyoff = on;
 789                         bp->b_dirtyend = on + n;
 790                 }
 791                 if (bp->b_validend == 0 || bp->b_validend < bp->b_dirtyoff ||
 792                     bp->b_validoff > bp->b_dirtyend) {
 793                         bp->b_validoff = bp->b_dirtyoff;
 794                         bp->b_validend = bp->b_dirtyend;
 795                 } else {
 796                         bp->b_validoff = min(bp->b_validoff, bp->b_dirtyoff);
 797                         bp->b_validend = max(bp->b_validend, bp->b_dirtyend);
 798                 }
 799
 800                 /*
 801                  * Since this block is being modified, it must be written
 802                  * again and not just committed.
 803                  */
 804                 CLR(bp->b_flags, B_NEEDCOMMIT);
 805
 806                 /*
 807                  * If the lease is non-cachable or IO_SYNC do bwrite().
 808                  */
 809                 if ((np->n_flag & NQNFSNONCACHE) || (ioflag & IO_SYNC)) {
 810                         bp->b_proc = p;
 811                         error = VOP_BWRITE(bp);
 812                         if (error)
 813                                 return (error);
 814                         if (np->n_flag & NQNFSNONCACHE) {
 815                                 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
 816                                 if (error)
 817                                         return (error);
 818                         }
 819                 } else if ((n + on) == biosize &&
 820                         (nmp->nm_flag & NFSMNT_NQNFS) == 0) {
 821                         bp->b_proc = (struct proc *)0;
 822                         SET(bp->b_flags, B_ASYNC);
 823                         (void)nfs_writebp(bp, 0);
 824                 } else
 825                         bdwrite(bp);
 826         } while (uio->uio_resid > 0 && n > 0);
 827         return (0);
 828 }
 829
 830
 831 /*
 832  * Get an nfs cache block.
 833  * Allocate a new one if the block isn't currently in the cache
 834  * and return the block marked busy. If the calling process is
 835  * interrupted by a signal for an interruptible mount point, return
 836  * NULL.
 837  */
 838 static struct buf *
 839 nfs_getcacheblk(vp, bn, size, p, operation)
 840         struct vnode *vp;
 841         daddr_t bn;
 842         int size;
 843         struct proc *p;
 844         int operation;  /* defined in sys/buf.h */
 845 {
 846         register struct buf *bp;
 847         struct nfsmount *nmp = VFSTONFS(vp->v_mount);
 848         /*due to getblk/vm interractions, use vm page size or less values */
 849         int biosize = min(vp->v_mount->mnt_stat.f_iosize, PAGE_SIZE);
 850
 851         if (nmp->nm_flag & NFSMNT_INT) {
 852                 bp = getblk(vp, bn, size, PCATCH, 0, operation);
 853                 while (bp == (struct buf *)0) {
 854                         if (nfs_sigintr(nmp, (struct nfsreq *)0, p))
 855                                 return ((struct buf *)0);
 856                         bp = getblk(vp, bn, size, 0, 2 * hz, operation);
 857                 }
 858         } else
 859                 bp = getblk(vp, bn, size, 0, 0, operation);
 860
 861         if( vp->v_type == VREG)
 862                 bp->b_blkno = (bn * biosize) / DEV_BSIZE;
 863
 864         return (bp);
 865 }
 866
 867 /*
 868  * Flush and invalidate all dirty buffers. If another process is already
 869  * doing the flush, just wait for completion.
 870  */
 871 int
 872 nfs_vinvalbuf(vp, flags, cred, p, intrflg)
 873         struct vnode *vp;
 874         int flags;
 875         struct ucred *cred;
 876         struct proc *p;
 877         int intrflg;
 878 {
 879         register struct nfsnode *np = VTONFS(vp);
 880         struct nfsmount *nmp = VFSTONFS(vp->v_mount);
 881         int error = 0, slpflag, slptimeo;
 882         int didhold = 0;
 883
 884         if ((nmp->nm_flag & NFSMNT_INT) == 0)
 885                 intrflg = 0;
 886         if (intrflg) {
 887                 slpflag = PCATCH;
 888                 slptimeo = 2 * hz;
 889         } else {
 890                 slpflag = 0;
 891                 slptimeo = 0;
 892         }
 893         /*
 894          * First wait for any other process doing a flush to complete.
 895          */
 896         while (np->n_flag & NFLUSHINPROG) {
 897                 np->n_flag |= NFLUSHWANT;
 898                 error = tsleep((caddr_t)&np->n_flag, PRIBIO + 2, "nfsvinval",
 899                         slptimeo);
 900                 if (error && intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p))
 901                         return (EINTR);
 902         }
 903
 904         /*
 905          * Now, flush as required.
 906          */
 907         np->n_flag |= NFLUSHINPROG;
 908         error = vinvalbuf(vp, flags, cred, p, slpflag, 0);
 909         while (error) {
 910                 /* we seem to be stuck in a loop here if the thread got aborted.
 911                  * nfs_flush will return EINTR. Not sure if that will cause
 912                  * other consequences due to EINTR having other meanings in NFS
 913                  * To handle, no dirty pages, it seems safe to just return from
 914                  * here. But if we did have dirty pages, how would we get them
 915                  * written out if thread was aborted? Some other strategy is
 916                  * necessary. -- EKN
 917                  */
 918                 if ((intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p)) ||
 919                     (error == EINTR && current_thread_aborted())) {
 920                         np->n_flag &= ~NFLUSHINPROG;
 921                         if (np->n_flag & NFLUSHWANT) {
 922                                 np->n_flag &= ~NFLUSHWANT;
 923                                 wakeup((caddr_t)&np->n_flag);
 924                         }
 925                         return (EINTR);
 926                 }
 927                 error = vinvalbuf(vp, flags, cred, p, 0, slptimeo);
 928         }
 929         np->n_flag &= ~(NMODIFIED | NFLUSHINPROG);
 930         if (np->n_flag & NFLUSHWANT) {
 931                 np->n_flag &= ~NFLUSHWANT;
 932                 wakeup((caddr_t)&np->n_flag);
 933         }
 934         didhold = ubc_hold(vp);
 935         if (didhold) {
 936                 (void) ubc_clean(vp, 1); /* get the pages out of vm also */
 937                 ubc_rele(vp);
 938         }
 939         return (0);
 940 }
 941
 942 /*
 943  * Initiate asynchronous I/O. Return an error if no nfsiods are available.
 944  * This is mainly to avoid queueing async I/O requests when the nfsiods
 945  * are all hung on a dead server.
 946  */
 947 int
 948 nfs_asyncio(bp, cred)
 949         register struct buf *bp;
 950         struct ucred *cred;
 951 {
 952         struct nfsmount *nmp;
 953         int i;
 954         int gotiod;
 955         int slpflag = 0;
 956         int slptimeo = 0;
 957         int error;
 958
 959         if (nfs_numasync == 0)
 960                 return (EIO);
 961
 962         nmp = VFSTONFS(bp->b_vp->v_mount);
 963 again:
 964         if (nmp->nm_flag & NFSMNT_INT)
 965                 slpflag = PCATCH;
 966         gotiod = FALSE;
 967
 968         /*
 969          * Find a free iod to process this request.
 970          */
 971         for (i = 0; i < NFS_MAXASYNCDAEMON; i++)
 972                 if (nfs_iodwant[i]) {
 973                         /*
 974                          * Found one, so wake it up and tell it which
 975                          * mount to process.
 976                          */
 977                         NFS_DPF(ASYNCIO,
 978                                 ("nfs_asyncio: waking iod %d for mount %p\n",
 979                                  i, nmp));
 980                         nfs_iodwant[i] = (struct proc *)0;
 981                         nfs_iodmount[i] = nmp;
 982                         nmp->nm_bufqiods++;
 983                         wakeup((caddr_t)&nfs_iodwant[i]);
 984                         gotiod = TRUE;
 985                         break;
 986                 }
 987
 988         /*
 989          * If none are free, we may already have an iod working on this mount
 990          * point.  If so, it will process our request.
 991          */
 992         if (!gotiod) {
 993                 if (nmp->nm_bufqiods > 0) {
 994                         NFS_DPF(ASYNCIO,
 995                                 ("nfs_asyncio: %d iods are already processing mount %p\n",
 996                                  nmp->nm_bufqiods, nmp));
 997                         gotiod = TRUE;
 998                 }
 999         }
1000
1001         /*
1002          * If we have an iod which can process the request, then queue
1003          * the buffer.
1004          */
1005         if (gotiod) {
1006                 /*
1007                  * Ensure that the queue never grows too large.
1008                  */
1009                 while (nmp->nm_bufqlen >= 2*nfs_numasync) {
1010                         NFS_DPF(ASYNCIO,
1011                                 ("nfs_asyncio: waiting for mount %p queue to drain\n", nmp));
1012                         nmp->nm_bufqwant = TRUE;
1013                         error = tsleep(&nmp->nm_bufq, slpflag | PRIBIO,
1014                                        "nfsaio", slptimeo);
1015                         if (error) {
1016                                 if (nfs_sigintr(nmp, NULL, bp->b_proc))
1017                                         return (EINTR);
1018                                 if (slpflag == PCATCH) {
1019                                         slpflag = 0;
1020                                         slptimeo = 2 * hz;
1021                                 }
1022                         }
1023                         /*
1024                          * We might have lost our iod while sleeping,
1025                          * so check and loop if nescessary.
1026                          */
1027                         if (nmp->nm_bufqiods == 0) {
1028                                 NFS_DPF(ASYNCIO,
1029                                         ("nfs_asyncio: no iods after mount %p queue was drained, looping\n", nmp));
1030                                 goto again;
1031                         }
1032                 }
1033
1034                 if (ISSET(bp->b_flags, B_READ)) {
1035                         if (bp->b_rcred == NOCRED && cred != NOCRED) {
1036                                 /*
1037                                  * NFS has embedded ucred.
1038                                  * Can not crhold() here as that causes zone corruption
1039                                  */
1040                                 bp->b_rcred = crdup(cred);
1041                         }
1042                 } else {
1043                         SET(bp->b_flags, B_WRITEINPROG);
1044                         if (bp->b_wcred == NOCRED && cred != NOCRED) {
1045                                 /*
1046                                  * NFS has embedded ucred.
1047                                  * Can not crhold() here as that causes zone corruption
1048                                  */
1049                                 bp->b_wcred = crdup(cred);
1050                         }
1051                 }
1052
1053                 TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, b_freelist);
1054                 nmp->nm_bufqlen++;
1055                 return (0);
1056         }
1057
1058         /*
1059          * All the iods are busy on other mounts, so return EIO to
1060          * force the caller to process the i/o synchronously.
1061          */
1062         NFS_DPF(ASYNCIO, ("nfs_asyncio: no iods available, i/o is synchronous\n"));
1063         return (EIO);
1064 }
1065
1066 /*
1067  * Do an I/O operation to/from a cache block. This may be called
1068  * synchronously or from an nfsiod.
1069  */
1070 int
1071 nfs_doio(bp, cr, p)
1072         register struct buf *bp;
1073         struct ucred *cr;
1074         struct proc *p;
1075 {
1076         register struct uio *uiop;
1077         register struct vnode *vp;
1078         struct nfsnode *np;
1079         struct nfsmount *nmp;
1080         int error = 0, diff, len, iomode, must_commit = 0;
1081         struct uio uio;
1082         struct iovec io;
1083
1084         vp = bp->b_vp;
1085         np = VTONFS(vp);
1086         nmp = VFSTONFS(vp->v_mount);
1087         uiop = &uio;
1088         uiop->uio_iov = &io;
1089         uiop->uio_iovcnt = 1;
1090         uiop->uio_segflg = UIO_SYSSPACE;
1091         uiop->uio_procp = p;
1092
1093         /*
1094          * With UBC, getblk() can return a buf with B_DONE set.
1095          * This indicates that the VM has valid data for that page.
1096          * NFS being stateless, this case poses a problem.
1097          * By definition, the NFS server should always be consulted
1098          * for the data in that page.
1099          * So we choose to clear the B_DONE and to do the IO.
1100          *
1101          * XXX revisit this if there is a performance issue.
1102          * XXX In that case, we could play the attribute cache games ...
1103          */
1104          if (ISSET(bp->b_flags, B_DONE)) {
1105                 if (!ISSET(bp->b_flags, B_ASYNC))
1106                         panic("nfs_doio: done and not async");
1107                 CLR(bp->b_flags, B_DONE);
1108         }
1109         FSDBG_TOP(256, np->n_size, bp->b_blkno * DEV_BSIZE, bp->b_bcount,
1110                   bp->b_flags);
1111         FSDBG(257, bp->b_validoff, bp->b_validend, bp->b_dirtyoff,
1112               bp->b_dirtyend);
1113         /*
1114          * Historically, paging was done with physio, but no more.
1115          */
1116         if (ISSET(bp->b_flags, B_PHYS)) {
1117             /*
1118              * ...though reading /dev/drum still gets us here.
1119              */
1120             io.iov_len = uiop->uio_resid = bp->b_bcount;
1121             /* mapping was done by vmapbuf() */
1122             io.iov_base = bp->b_data;
1123             uiop->uio_offset = (off_t)bp->b_blkno * DEV_BSIZE;
1124             if (ISSET(bp->b_flags, B_READ)) {
1125                         uiop->uio_rw = UIO_READ;
1126                         nfsstats.read_physios++;
1127                         error = nfs_readrpc(vp, uiop, cr);
1128             } else {
1129                         int com;
1130
1131                         iomode = NFSV3WRITE_DATASYNC;
1132                         uiop->uio_rw = UIO_WRITE;
1133                         nfsstats.write_physios++;
1134                         error = nfs_writerpc(vp, uiop, cr, &iomode, &com);
1135             }
1136             if (error) {
1137                         SET(bp->b_flags, B_ERROR);
1138                         bp->b_error = error;
1139             }
1140         } else if (ISSET(bp->b_flags, B_READ)) {
1141             io.iov_len = uiop->uio_resid = bp->b_bcount;
1142             io.iov_base = bp->b_data;
1143             uiop->uio_rw = UIO_READ;
1144             switch (vp->v_type) {
1145             case VREG:
1146                 uiop->uio_offset = (off_t)bp->b_blkno * DEV_BSIZE;
1147                 nfsstats.read_bios++;
1148                 error = nfs_readrpc(vp, uiop, cr);
1149                 FSDBG(262, np->n_size, bp->b_blkno * DEV_BSIZE,
1150                       uiop->uio_resid, error);
1151                 if (!error) {
1152                     bp->b_validoff = 0;
1153                     if (uiop->uio_resid) {
1154                         /*
1155                          * If len > 0, there is a hole in the file and
1156                          * no writes after the hole have been pushed to
1157                          * the server yet.
1158                          * Just zero fill the rest of the valid area.
1159                          */
1160                         diff = bp->b_bcount - uiop->uio_resid;
1161                         len = np->n_size - ((u_quad_t)bp->b_blkno * DEV_BSIZE +
1162                                             diff);
1163                         if (len > 0) {
1164                                 len = min(len, uiop->uio_resid);
1165                                 bzero((char *)bp->b_data + diff, len);
1166                                 bp->b_validend = diff + len;
1167                                 FSDBG(258, diff, len, 0, 1);
1168                         } else
1169                                 bp->b_validend = diff;
1170                     } else
1171                                 bp->b_validend = bp->b_bcount;
1172 #if 1 /* USV + JOE [ */
1173                     if (bp->b_validend < bp->b_bufsize) {
1174                             /*
1175                              * we're about to release a partial buffer after a
1176                              * read... the only way we should get here is if
1177                              * this buffer contains the EOF before releasing it,
1178                              * we'll zero out to the end of the buffer so that
1179                              * if a mmap of this page occurs, we'll see zero's
1180                              * even if a ftruncate extends the file in the
1181                              * meantime
1182                              */
1183                             bzero((caddr_t)(bp->b_data + bp->b_validend),
1184                                   bp->b_bufsize - bp->b_validend);
1185                             FSDBG(258, bp->b_validend,
1186                                   bp->b_bufsize - bp->b_validend, 0, 2);
1187                     }
1188 #endif /* ] USV + JOE */
1189                 }
1190                 if (p && (vp->v_flag & VTEXT) &&
1191                         (((nmp->nm_flag & NFSMNT_NQNFS) &&
1192                           NQNFS_CKINVALID(vp, np, ND_READ) &&
1193                           np->n_lrev != np->n_brev) ||
1194                          (!(nmp->nm_flag & NFSMNT_NQNFS) &&
1195                           np->n_mtime != np->n_vattr.va_mtime.tv_sec))) {
1196                         uprintf("Process killed due to text file modification\n");
1197                         psignal(p, SIGKILL);
1198                         p->p_flag |= P_NOSWAP;
1199                 }
1200                 break;
1201             case VLNK:
1202                 uiop->uio_offset = (off_t)0;
1203                 nfsstats.readlink_bios++;
1204                 error = nfs_readlinkrpc(vp, uiop, cr);
1205                 break;
1206             case VDIR:
1207                 nfsstats.readdir_bios++;
1208                 uiop->uio_offset = ((u_quad_t)bp->b_lblkno) * NFS_DIRBLKSIZ;
1209                 if (!(nmp->nm_flag & NFSMNT_NFSV3))
1210                         nmp->nm_flag &= ~NFSMNT_RDIRPLUS; /* dk@farm.org */
1211                 if (nmp->nm_flag & NFSMNT_RDIRPLUS) {
1212                         error = nfs_readdirplusrpc(vp, uiop, cr);
1213                         if (error == NFSERR_NOTSUPP)
1214                                 nmp->nm_flag &= ~NFSMNT_RDIRPLUS;
1215                 }
1216                 if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0)
1217                         error = nfs_readdirrpc(vp, uiop, cr);
1218                 break;
1219             default:
1220                 printf("nfs_doio: type %x unexpected\n", vp->v_type);
1221                 break;
1222             };
1223             if (error) {
1224                 SET(bp->b_flags, B_ERROR);
1225                 bp->b_error = error;
1226             }
1227         } else {
1228             /*
1229              * mapped I/O may have altered any bytes, so we extend
1230              * the dirty zone to the valid zone.  For best performance
1231              * a better solution would be to save & restore page dirty bits
1232              * around the uiomove which brings write-data into the buffer.
1233              * Then here we'd check if the page is dirty rather than WASMAPPED
1234              * Also vnode_pager would change - if a page is clean it might
1235              * still need to be written due to DELWRI.
1236              */
1237             if (UBCINFOEXISTS(vp) && ubc_issetflags(vp, UI_WASMAPPED)) {
1238                 bp->b_dirtyoff = min(bp->b_dirtyoff, bp->b_validoff);
1239                 bp->b_dirtyend = max(bp->b_dirtyend, bp->b_validend);
1240             }
1241             if ((off_t)bp->b_blkno * DEV_BSIZE + bp->b_dirtyend > np->n_size)
1242                 bp->b_dirtyend = np->n_size - (off_t)bp->b_blkno * DEV_BSIZE;
1243
1244             if (bp->b_dirtyend > bp->b_dirtyoff) {
1245                 io.iov_len = uiop->uio_resid = bp->b_dirtyend - bp->b_dirtyoff;
1246                 uiop->uio_offset = (off_t)bp->b_blkno * DEV_BSIZE +
1247                                    bp->b_dirtyoff;
1248                 io.iov_base = (char *)bp->b_data + bp->b_dirtyoff;
1249                 uiop->uio_rw = UIO_WRITE;
1250
1251                 nfsstats.write_bios++;
1252                 if ((bp->b_flags & (B_ASYNC | B_NEEDCOMMIT | B_NOCACHE)) ==
1253                     B_ASYNC)
1254                     iomode = NFSV3WRITE_UNSTABLE;
1255                 else
1256                     iomode = NFSV3WRITE_FILESYNC;
1257                 SET(bp->b_flags, B_WRITEINPROG);
1258                 error = nfs_writerpc(vp, uiop, cr, &iomode, &must_commit);
1259                 if (!error && iomode == NFSV3WRITE_UNSTABLE)
1260                     SET(bp->b_flags, B_NEEDCOMMIT);
1261                 else
1262                     CLR(bp->b_flags, B_NEEDCOMMIT);
1263                 CLR(bp->b_flags, B_WRITEINPROG);
1264                 /*
1265                  * For an interrupted write, the buffer is still valid
1266                  * and the write hasn't been pushed to the server yet,
1267                  * so we can't set B_ERROR and report the interruption
1268                  * by setting B_EINTR. For the B_ASYNC case, B_EINTR
1269                  * is not relevant, so the rpc attempt is essentially
1270                  * a noop.  For the case of a V3 write rpc not being
1271                  * committed to stable storage, the block is still
1272                  * dirty and requires either a commit rpc or another
1273                  * write rpc with iomode == NFSV3WRITE_FILESYNC before
1274                  * the block is reused. This is indicated by setting
1275                  * the B_DELWRI and B_NEEDCOMMIT flags.
1276                  */
1277                 if (error == EINTR || (!error && bp->b_flags & B_NEEDCOMMIT)) {
1278                         int s;
1279
1280                         CLR(bp->b_flags, B_INVAL | B_NOCACHE);
1281                         SET(bp->b_flags, B_DELWRI);
1282                         FSDBG(261, bp->b_validoff, bp->b_validend,
1283                               bp->b_bufsize, bp->b_bcount);
1284                         /*
1285                          * Since for the B_ASYNC case, nfs_bwrite() has
1286                          * reassigned the buffer to the clean list, we have to
1287                          * reassign it back to the dirty one. Ugh.
1288                          */
1289                         if (ISSET(bp->b_flags, B_ASYNC)) {
1290                                 s = splbio();
1291                                 reassignbuf(bp, vp);
1292                                 splx(s);
1293                         } else {
1294                                 SET(bp->b_flags, B_EINTR);
1295                         }
1296                 } else {
1297                         if (error) {
1298                                 SET(bp->b_flags, B_ERROR);
1299                                 bp->b_error = np->n_error = error;
1300                                 np->n_flag |= NWRITEERR;
1301                         }
1302                         bp->b_dirtyoff = bp->b_dirtyend = 0;
1303 #if 1  /* JOE */
1304                         /*
1305                          * validoff and validend represent the real data present
1306                          * in this buffer if validoff is non-zero, than we have
1307                          * to invalidate the buffer and kill the page when
1308                          * biodone is called... the same is also true when
1309                          * validend doesn't extend all the way to the end of the
1310                          * buffer and validend doesn't equate to the current
1311                          * EOF... eventually we need to deal with this in a more
1312                          * humane way (like keeping the partial buffer without
1313                          * making it immediately available to the VM page cache)
1314                          */
1315                         if (bp->b_validoff)
1316                                 SET(bp->b_flags, B_INVAL);
1317                         else
1318                         if (bp->b_validend < bp->b_bufsize) {
1319                                 if ((off_t)bp->b_blkno * DEV_BSIZE +
1320                                     bp->b_validend == np->n_size) {
1321                                         bzero((caddr_t)(bp->b_data +
1322                                                         bp->b_validend),
1323                                               bp->b_bufsize - bp->b_validend);
1324                                         FSDBG(259, bp->b_validend,
1325                                               bp->b_bufsize - bp->b_validend, 0,
1326                                               0);
1327                                 } else
1328                                         SET(bp->b_flags, B_INVAL);
1329                         }
1330 #endif
1331                 }
1332
1333             } else {
1334 #if 1  /* JOE */
1335                 if (bp->b_validoff ||
1336                     (bp->b_validend < bp->b_bufsize &&
1337                      (off_t)bp->b_blkno * DEV_BSIZE + bp->b_validend !=
1338                      np->n_size)) {
1339                         SET(bp->b_flags, B_INVAL);
1340                 }
1341                 if (bp->b_flags & B_INVAL) {
1342                         FSDBG(260, bp->b_validoff, bp->b_validend,
1343                               bp->b_bufsize, bp->b_bcount);
1344                 }
1345 #endif
1346                 bp->b_resid = 0;
1347                 biodone(bp);
1348                 FSDBG_BOT(256, bp->b_validoff, bp->b_validend, bp->b_bufsize,
1349                           np->n_size);
1350                 return (0);
1351             }
1352         }
1353         bp->b_resid = uiop->uio_resid;
1354         if (must_commit)
1355                 nfs_clearcommit(vp->v_mount);
1356
1357         if (bp->b_flags & B_INVAL) {
1358                 FSDBG(260, bp->b_validoff, bp->b_validend, bp->b_bufsize,
1359                       bp->b_bcount);
1360         }
1361         FSDBG_BOT(256, bp->b_validoff, bp->b_validend, bp->b_bcount, error);
1362
1363         biodone(bp);
1364         return (error);
1365 }