bsd/nfs/nfs_bio.c

   1 /*
   2  * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
   3  *
   4  * @APPLE_LICENSE_HEADER_START@
   5  *
   6  * The contents of this file constitute Original Code as defined in and
   7  * are subject to the Apple Public Source License Version 1.1 (the
   8  * "License").  You may not use this file except in compliance with the
   9  * License.  Please obtain a copy of the License at
  10  * http://www.apple.com/publicsource and read it before using this file.
  11  *
  12  * This Original Code and all software distributed under the License are
  13  * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  14  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  15  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  16  * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
  17  * License for the specific language governing rights and limitations
  18  * under the License.
  19  *
  20  * @APPLE_LICENSE_HEADER_END@
  21  */
  22 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
  23 /*
  24  * Copyright (c) 1989, 1993
  25  *      The Regents of the University of California.  All rights reserved.
  26  *
  27  * This code is derived from software contributed to Berkeley by
  28  * Rick Macklem at The University of Guelph.
  29  *
  30  * Redistribution and use in source and binary forms, with or without
  31  * modification, are permitted provided that the following conditions
  32  * are met:
  33  * 1. Redistributions of source code must retain the above copyright
  34  *    notice, this list of conditions and the following disclaimer.
  35  * 2. Redistributions in binary form must reproduce the above copyright
  36  *    notice, this list of conditions and the following disclaimer in the
  37  *    documentation and/or other materials provided with the distribution.
  38  * 3. All advertising materials mentioning features or use of this software
  39  *    must display the following acknowledgement:
  40  *      This product includes software developed by the University of
  41  *      California, Berkeley and its contributors.
  42  * 4. Neither the name of the University nor the names of its contributors
  43  *    may be used to endorse or promote products derived from this software
  44  *    without specific prior written permission.
  45  *
  46  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  47  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  48  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  49  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  50  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  51  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  52  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  53  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  54  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  55  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  56  * SUCH DAMAGE.
  57  *
  58  *      @(#)nfs_bio.c   8.9 (Berkeley) 3/30/95
  59  * FreeBSD-Id: nfs_bio.c,v 1.44 1997/09/10 19:52:25 phk Exp $
  60  */
  61 #include <sys/param.h>
  62 #include <sys/systm.h>
  63 #include <sys/resourcevar.h>
  64 #include <sys/signalvar.h>
  65 #include <sys/proc.h>
  66 #include <sys/buf.h>
  67 #include <sys/vnode.h>
  68 #include <sys/mount.h>
  69 #include <sys/kernel.h>
  70 #include <sys/sysctl.h>
  71 #include <sys/ubc.h>
  72
  73 #include <sys/vm.h>
  74 #include <sys/vmparam.h>
  75
  76 #include <sys/time.h>
  77 #include <kern/clock.h>
  78
  79 #include <nfs/rpcv2.h>
  80 #include <nfs/nfsproto.h>
  81 #include <nfs/nfs.h>
  82 #include <nfs/nfsmount.h>
  83 #include <nfs/nqnfs.h>
  84 #include <nfs/nfsnode.h>
  85
  86 #include <sys/kdebug.h>
  87
  88 #define FSDBG(A, B, C, D, E) \
  89         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_NONE, \
  90                 (int)(B), (int)(C), (int)(D), (int)(E), 0)
  91 #define FSDBG_TOP(A, B, C, D, E) \
  92         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_START, \
  93                 (int)(B), (int)(C), (int)(D), (int)(E), 0)
  94 #define FSDBG_BOT(A, B, C, D, E) \
  95         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_END, \
  96                 (int)(B), (int)(C), (int)(D), (int)(E), 0)
  97
  98 static struct buf *nfs_getcacheblk __P((struct vnode *vp, daddr_t bn, int size,
  99                                         struct proc *p, int operation));
 100
 101 extern int nfs_numasync;
 102 extern struct nfsstats nfsstats;
 103 extern int nbdwrite;
 104
 105 /*
 106  * Vnode op for read using bio
 107  * Any similarity to readip() is purely coincidental
 108  */
 109 int
 110 nfs_bioread(vp, uio, ioflag, cred, getpages)
 111         register struct vnode *vp;
 112         register struct uio *uio;
 113         int ioflag;
 114         struct ucred *cred;
 115         int getpages;
 116 {
 117         register struct nfsnode *np = VTONFS(vp);
 118         register int biosize, diff, i;
 119         struct buf *bp = 0, *rabp;
 120         struct vattr vattr;
 121         struct proc *p;
 122         struct nfsmount *nmp = VFSTONFS(vp->v_mount);
 123         daddr_t lbn, rabn;
 124         int bufsize;
 125         int nra, error = 0, n = 0, on = 0, not_readin;
 126         int operation = (getpages? BLK_PAGEIN : BLK_READ);
 127
 128 #if DIAGNOSTIC
 129         if (uio->uio_rw != UIO_READ)
 130                 panic("nfs_read mode");
 131 #endif
 132         if (uio->uio_resid == 0)
 133                 return (0);
 134         if (uio->uio_offset < 0)
 135                 return (EINVAL);
 136         p = uio->uio_procp;
 137         if ((nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_GOTFSINFO)) == NFSMNT_NFSV3)
 138                 (void)nfs_fsinfo(nmp, vp, cred, p);
 139         /*due to getblk/vm interractions, use vm page size or less values */
 140         biosize = min(vp->v_mount->mnt_stat.f_iosize, PAGE_SIZE);
 141         /*
 142          * For nfs, cache consistency can only be maintained approximately.
 143          * Although RFC1094 does not specify the criteria, the following is
 144          * believed to be compatible with the reference port.
 145          * For nqnfs, full cache consistency is maintained within the loop.
 146          * For nfs:
 147          * If the file's modify time on the server has changed since the
 148          * last read rpc or you have written to the file,
 149          * you may have lost data cache consistency with the
 150          * server, so flush all of the file's data out of the cache.
 151          * Then force a getattr rpc to ensure that you have up to date
 152          * attributes.
 153          * NB: This implies that cache data can be read when up to
 154          * NFS_ATTRTIMEO seconds out of date. If you find that you need current
 155          * attributes this could be forced by setting n_attrstamp to 0 before
 156          * the VOP_GETATTR() call.
 157          */
 158         if ((nmp->nm_flag & NFSMNT_NQNFS) == 0) {
 159                 if (np->n_flag & NMODIFIED) {
 160                         if (vp->v_type != VREG) {
 161                                 if (vp->v_type != VDIR)
 162                                         panic("nfs: bioread, not dir");
 163                                 nfs_invaldir(vp);
 164                                 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
 165                                 if (error)
 166                                         return (error);
 167                         }
 168                         np->n_attrstamp = 0;
 169                         error = VOP_GETATTR(vp, &vattr, cred, p);
 170                         if (error)
 171                                 return (error);
 172                         np->n_mtime = vattr.va_mtime.tv_sec;
 173                 } else {
 174                         error = VOP_GETATTR(vp, &vattr, cred, p);
 175                         if (error)
 176                                 return (error);
 177                         if (np->n_mtime != vattr.va_mtime.tv_sec) {
 178                                 if (vp->v_type == VDIR)
 179                                         nfs_invaldir(vp);
 180                                 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
 181                                 if (error)
 182                                         return (error);
 183                                 np->n_mtime = vattr.va_mtime.tv_sec;
 184                         }
 185                 }
 186         }
 187         do {
 188
 189             /*
 190              * Get a valid lease. If cached data is stale, flush it.
 191              */
 192             if (nmp->nm_flag & NFSMNT_NQNFS) {
 193                 if (NQNFS_CKINVALID(vp, np, ND_READ)) {
 194                     do {
 195                         error = nqnfs_getlease(vp, ND_READ, cred, p);
 196                     } while (error == NQNFS_EXPIRED);
 197                     if (error)
 198                         return (error);
 199                     if (np->n_lrev != np->n_brev ||
 200                         (np->n_flag & NQNFSNONCACHE) ||
 201                         ((np->n_flag & NMODIFIED) && vp->v_type == VDIR)) {
 202                         if (vp->v_type == VDIR)
 203                             nfs_invaldir(vp);
 204                         error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
 205                         if (error)
 206                             return (error);
 207                         np->n_brev = np->n_lrev;
 208                     }
 209                 } else if (vp->v_type == VDIR && (np->n_flag & NMODIFIED)) {
 210                     nfs_invaldir(vp);
 211                     error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
 212                     if (error)
 213                         return (error);
 214                 }
 215             }
 216             if (np->n_flag & NQNFSNONCACHE) {
 217                 switch (vp->v_type) {
 218                 case VREG:
 219                         return (nfs_readrpc(vp, uio, cred));
 220                 case VLNK:
 221                         return (nfs_readlinkrpc(vp, uio, cred));
 222                 case VDIR:
 223                         break;
 224                 default:
 225                         printf(" NQNFSNONCACHE: type %x unexpected\n",
 226                                 vp->v_type);
 227                 };
 228             }
 229             switch (vp->v_type) {
 230             case VREG:
 231                 nfsstats.biocache_reads++;
 232                 lbn = uio->uio_offset / biosize;
 233                 on = uio->uio_offset & (biosize - 1);
 234                 not_readin = 1;
 235
 236                 /*
 237                  * Start the read ahead(s), as required.
 238                  */
 239                 if (nfs_numasync > 0 && nmp->nm_readahead > 0) {
 240                     for (nra = 0; nra < nmp->nm_readahead &&
 241                                   (off_t)(lbn + 1 + nra) * biosize < np->n_size;
 242                          nra++) {
 243                                 rabn = lbn + 1 + nra;
 244                                 if (!incore(vp, rabn)) {
 245                                         rabp = nfs_getcacheblk(vp, rabn, biosize, p, operation);
 246                                         if (!rabp)
 247                                                 return (EINTR);
 248                                         if (!ISSET(rabp->b_flags, (B_CACHE|B_DELWRI))) {
 249                                                 SET(rabp->b_flags, (B_READ | B_ASYNC));
 250                                                 if (nfs_asyncio(rabp, cred)) {
 251                                                         SET(rabp->b_flags, (B_INVAL|B_ERROR));
 252                                                         rabp->b_error = EIO;
 253                                                         brelse(rabp);
 254                                                 }
 255                                         } else
 256                                                 brelse(rabp);
 257                                 }
 258                     }
 259                 }
 260
 261                 /*
 262                  * If the block is in the cache and has the required data
 263                  * in a valid region, just copy it out.
 264                  * Otherwise, get the block and write back/read in,
 265                  * as required.
 266                  */
 267 again:
 268                 bufsize = biosize;
 269                 if ((off_t)(lbn + 1) * biosize > np->n_size &&
 270                     (off_t)(lbn + 1) * biosize - np->n_size < biosize) {
 271                         bufsize = np->n_size - lbn * biosize;
 272                         bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
 273                 }
 274                 bp = nfs_getcacheblk(vp, lbn, bufsize, p, operation);
 275                 if (!bp)
 276                         return (EINTR);
 277
 278                 if (!ISSET(bp->b_flags, B_CACHE)) {
 279                         SET(bp->b_flags, B_READ);
 280                         CLR(bp->b_flags, (B_DONE | B_ERROR | B_INVAL));
 281                         not_readin = 0;
 282                         error = nfs_doio(bp, cred, p);
 283                         if (error) {
 284                             brelse(bp);
 285                             return (error);
 286                         }
 287                 }
 288                 if (bufsize > on) {
 289                         n = min((unsigned)(bufsize - on), uio->uio_resid);
 290                 } else {
 291                         n = 0;
 292                 }
 293                 diff = np->n_size - uio->uio_offset;
 294                 if (diff < n)
 295                         n = diff;
 296                 if (not_readin && n > 0) {
 297                         if (on < bp->b_validoff || (on + n) > bp->b_validend) {
 298                                 SET(bp->b_flags, (B_NOCACHE|B_INVAFTERWRITE));
 299                                 if (bp->b_dirtyend > 0) {
 300                                         if (!ISSET(bp->b_flags, B_DELWRI))
 301                                                 panic("nfsbioread");
 302                                         if (VOP_BWRITE(bp) == EINTR)
 303                                                 return (EINTR);
 304                                 } else
 305                                         brelse(bp);
 306                                 goto again;
 307                         }
 308                 }
 309                 vp->v_lastr = lbn;
 310                 diff = (on >= bp->b_validend) ? 0 : (bp->b_validend - on);
 311                 if (diff < n)
 312                         n = diff;
 313                 break;
 314             case VLNK:
 315                 nfsstats.biocache_readlinks++;
 316                 bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, p, operation);
 317                 if (!bp)
 318                         return (EINTR);
 319                 if (!ISSET(bp->b_flags, B_CACHE)) {
 320                         SET(bp->b_flags, B_READ);
 321                         error = nfs_doio(bp, cred, p);
 322                         if (error) {
 323                                 SET(bp->b_flags, B_ERROR);
 324                                 brelse(bp);
 325                                 return (error);
 326                         }
 327                 }
 328                 n = min(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid);
 329                 on = 0;
 330                 break;
 331             case VDIR:
 332                 nfsstats.biocache_readdirs++;
 333                 if (np->n_direofoffset
 334                     && uio->uio_offset >= np->n_direofoffset) {
 335                     return (0);
 336                 }
 337                 lbn = uio->uio_offset / NFS_DIRBLKSIZ;
 338                 on = uio->uio_offset & (NFS_DIRBLKSIZ - 1);
 339                 bp = nfs_getcacheblk(vp, lbn, NFS_DIRBLKSIZ, p, operation);
 340                 if (!bp)
 341                     return (EINTR);
 342                 if (!ISSET(bp->b_flags, B_CACHE)) {
 343                     SET(bp->b_flags, B_READ);
 344                     error = nfs_doio(bp, cred, p);
 345                     if (error) {
 346                         brelse(bp);
 347                     }
 348                     while (error == NFSERR_BAD_COOKIE) {
 349                         nfs_invaldir(vp);
 350                         error = nfs_vinvalbuf(vp, 0, cred, p, 1);
 351                         /*
 352                          * Yuck! The directory has been modified on the
 353                          * server. The only way to get the block is by
 354                          * reading from the beginning to get all the
 355                          * offset cookies.
 356                          */
 357                         for (i = 0; i <= lbn && !error; i++) {
 358                             if (np->n_direofoffset
 359                                 && (i * NFS_DIRBLKSIZ) >= np->n_direofoffset)
 360                                     return (0);
 361                             bp = nfs_getcacheblk(vp, i, NFS_DIRBLKSIZ, p,
 362                                                  operation);
 363                             if (!bp)
 364                                     return (EINTR);
 365                             if (!ISSET(bp->b_flags, B_CACHE)) {
 366                                     SET(bp->b_flags, B_READ);
 367                                     error = nfs_doio(bp, cred, p);
 368                                     /*
 369                                      * no error + B_INVAL == directory EOF,
 370                                      * use the block.
 371                                      */
 372                                     if (error == 0 && (bp->b_flags & B_INVAL))
 373                                             break;
 374                             }
 375                             /*
 376                              * An error will throw away the block and the
 377                              * for loop will break out.  If no error and this
 378                              * is not the block we want, we throw away the
 379                              * block and go for the next one via the for loop.
 380                              */
 381                             if (error || i < lbn)
 382                                     brelse(bp);
 383                         }
 384                     }
 385                     /*
 386                      * The above while is repeated if we hit another cookie
 387                      * error.  If we hit an error and it wasn't a cookie error,
 388                      * we give up.
 389                      */
 390                     if (error)
 391                         return (error);
 392                 }
 393
 394                 /*
 395                  * If not eof and read aheads are enabled, start one.
 396                  * (You need the current block first, so that you have the
 397                  *  directory offset cookie of the next block.)
 398                  */
 399                 if (nfs_numasync > 0 && nmp->nm_readahead > 0 &&
 400                     (np->n_direofoffset == 0 ||
 401                     (lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) &&
 402                     !(np->n_flag & NQNFSNONCACHE) &&
 403                     !incore(vp, lbn + 1)) {
 404                         rabp = nfs_getcacheblk(vp, lbn + 1, NFS_DIRBLKSIZ, p,
 405                                                operation);
 406                         if (rabp) {
 407                             if (!ISSET(rabp->b_flags, (B_CACHE|B_DELWRI))) {
 408                                 SET(rabp->b_flags, (B_READ | B_ASYNC));
 409                                 if (nfs_asyncio(rabp, cred)) {
 410                                     SET(rabp->b_flags, (B_INVAL|B_ERROR));
 411                                     rabp->b_error = EIO;
 412                                     brelse(rabp);
 413                                 }
 414                             } else {
 415                                 brelse(rabp);
 416                             }
 417                         }
 418                 }
 419                 /*
 420                  * Make sure we use a signed variant of min() since
 421                  * the second term may be negative.
 422                  */
 423                 n = lmin(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid - on);
 424                 /*
 425                  * Unlike VREG files, whos buffer size ( bp->b_bcount ) is
 426                  * chopped for the EOF condition, we cannot tell how large
 427                  * NFS directories are going to be until we hit EOF.  So
 428                  * an NFS directory buffer is *not* chopped to its EOF.  Now,
 429                  * it just so happens that b_resid will effectively chop it
 430                  * to EOF.  *BUT* this information is lost if the buffer goes
 431                  * away and is reconstituted into a B_CACHE state (recovered
 432                  * from VM) later.  So we keep track of the directory eof
 433                  * in np->n_direofoffset and chop it off as an extra step
 434                  * right here.
 435                  */
 436                 if (np->n_direofoffset &&
 437                     n > np->n_direofoffset - uio->uio_offset)
 438                         n = np->n_direofoffset - uio->uio_offset;
 439                 break;
 440             default:
 441                 printf(" nfs_bioread: type %x unexpected\n",vp->v_type);
 442                 break;
 443             };
 444
 445             if (n > 0) {
 446                 error = uiomove(bp->b_data + on, (int)n, uio);
 447             }
 448             switch (vp->v_type) {
 449             case VREG:
 450                 break;
 451             case VLNK:
 452                 n = 0;
 453                 break;
 454             case VDIR:
 455                 if (np->n_flag & NQNFSNONCACHE)
 456                         SET(bp->b_flags, B_INVAL);
 457                 break;
 458             default:
 459                 printf(" nfs_bioread: type %x unexpected\n",vp->v_type);
 460             }
 461             brelse(bp);
 462         } while (error == 0 && uio->uio_resid > 0 && n > 0);
 463         return (error);
 464 }
 465
 466
 467 /*
 468  * Vnode op for write using bio
 469  */
 470 int
 471 nfs_write(ap)
 472         struct vop_write_args /* {
 473                 struct vnode *a_vp;
 474                 struct uio *a_uio;
 475                 int  a_ioflag;
 476                 struct ucred *a_cred;
 477         } */ *ap;
 478 {
 479         register int biosize;
 480         register struct uio *uio = ap->a_uio;
 481         struct proc *p = uio->uio_procp;
 482         register struct vnode *vp = ap->a_vp;
 483         struct nfsnode *np = VTONFS(vp);
 484         register struct ucred *cred = ap->a_cred;
 485         int ioflag = ap->a_ioflag;
 486         struct buf *bp;
 487         struct vattr vattr;
 488         struct nfsmount *nmp = VFSTONFS(vp->v_mount);
 489         daddr_t lbn;
 490         int bufsize;
 491         int n, on, error = 0, iomode, must_commit;
 492         off_t boff;
 493         struct iovec iov;
 494         struct uio auio;
 495
 496 #if DIAGNOSTIC
 497         if (uio->uio_rw != UIO_WRITE)
 498                 panic("nfs_write mode");
 499         if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != current_proc())
 500                 panic("nfs_write proc");
 501 #endif
 502         if (vp->v_type != VREG)
 503                 return (EIO);
 504         if (np->n_flag & NWRITEERR) {
 505                 np->n_flag &= ~NWRITEERR;
 506                 return (np->n_error);
 507         }
 508         if ((nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_GOTFSINFO)) == NFSMNT_NFSV3)
 509                 (void)nfs_fsinfo(nmp, vp, cred, p);
 510         if (ioflag & (IO_APPEND | IO_SYNC)) {
 511                 if (np->n_flag & NMODIFIED) {
 512                         np->n_attrstamp = 0;
 513                         error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
 514                         if (error)
 515                                 return (error);
 516                 }
 517                 if (ioflag & IO_APPEND) {
 518                         np->n_attrstamp = 0;
 519                         error = VOP_GETATTR(vp, &vattr, cred, p);
 520                         if (error)
 521                                 return (error);
 522                         uio->uio_offset = np->n_size;
 523                 }
 524         }
 525         if (uio->uio_offset < 0)
 526                 return (EINVAL);
 527         if (uio->uio_resid == 0)
 528                 return (0);
 529         /*
 530          * Maybe this should be above the vnode op call, but so long as
 531          * file servers have no limits, i don't think it matters
 532          */
 533         if (p && uio->uio_offset + uio->uio_resid >
 534               p->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
 535                 psignal(p, SIGXFSZ);
 536                 return (EFBIG);
 537         }
 538         /*
 539          * I use nm_rsize, not nm_wsize so that all buffer cache blocks
 540          * will be the same size within a filesystem. nfs_writerpc will
 541          * still use nm_wsize when sizing the rpc's.
 542          */
 543         /*due to getblk/vm interractions, use vm page size or less values */
 544         biosize = min(vp->v_mount->mnt_stat.f_iosize, PAGE_SIZE);
 545
 546         do {
 547                 /*
 548                  * Check for a valid write lease.
 549                  */
 550                 if ((nmp->nm_flag & NFSMNT_NQNFS) &&
 551                     NQNFS_CKINVALID(vp, np, ND_WRITE)) {
 552                         do {
 553                                 error = nqnfs_getlease(vp, ND_WRITE, cred, p);
 554                         } while (error == NQNFS_EXPIRED);
 555                         if (error)
 556                                 return (error);
 557                         if (np->n_lrev != np->n_brev ||
 558                             (np->n_flag & NQNFSNONCACHE)) {
 559                                 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
 560                                 if (error)
 561                                         return (error);
 562                                 np->n_brev = np->n_lrev;
 563                         }
 564                 }
 565                 if ((np->n_flag & NQNFSNONCACHE) && uio->uio_iovcnt == 1) {
 566                     iomode = NFSV3WRITE_FILESYNC;
 567                     error = nfs_writerpc(vp, uio, cred, &iomode, &must_commit);
 568                     if (must_commit)
 569                         nfs_clearcommit(vp->v_mount);
 570                     return (error);
 571                 }
 572                 nfsstats.biocache_writes++;
 573                 lbn = uio->uio_offset / biosize;
 574                 on = uio->uio_offset & (biosize-1);
 575                 n = min((unsigned)(biosize - on), uio->uio_resid);
 576 again:
 577                 bufsize = biosize;
 578 #if 0
 579 /* (removed for UBC) */
 580                 if ((lbn + 1) * biosize > np->n_size) {
 581                         bufsize = np->n_size - lbn * biosize;
 582                         bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
 583                 }
 584 #endif
 585                 /*
 586                  * Get a cache block for writing.  The range to be written is
 587                  * (off..off+len) within the block.  We ensure that the block
 588                  * either has no dirty region or that the given range is
 589                  * contiguous with the existing dirty region.
 590                  */
 591                 bp = nfs_getcacheblk(vp, lbn, bufsize, p, BLK_WRITE);
 592                 if (!bp)
 593                         return (EINTR);
 594                 /*
 595                  * Resize nfsnode *after* we busy the buffer to prevent
 596                  * readers from reading garbage.
 597                  * If there was a partial buf at the old eof, validate
 598                  * and zero the new bytes.
 599                  */
 600                 if (uio->uio_offset + n > np->n_size) {
 601                         struct buf *bp0 = NULL;
 602                         daddr_t bn = np->n_size / biosize;
 603                         int off = np->n_size & (biosize - 1);
 604
 605                         if (off && bn < lbn && incore(vp, bn))
 606                                 bp0 = nfs_getcacheblk(vp, bn, biosize, p,
 607                                                       BLK_WRITE);
 608                         np->n_flag |= NMODIFIED;
 609                         np->n_size = uio->uio_offset + n;
 610                         ubc_setsize(vp, (off_t)np->n_size); /* XXX errors */
 611                         if (bp0) {
 612                                 bzero((char *)bp0->b_data + off, biosize - off);
 613                                 bp0->b_validend = biosize;
 614                                 brelse(bp0);
 615                         }
 616                 }
 617                 /*
 618                  * NFS has embedded ucred so crhold() risks zone corruption
 619                  */
 620                 if (bp->b_wcred == NOCRED)
 621                         bp->b_wcred = crdup(cred);
 622                 /*
 623                  * If dirtyend exceeds file size, chop it down.  This should
 624                  * not occur unless there is a race.
 625                  */
 626                 if ((off_t)bp->b_blkno * DEV_BSIZE + bp->b_dirtyend >
 627                     np->n_size)
 628                         bp->b_dirtyend = np->n_size - (off_t)bp->b_blkno *
 629                                                       DEV_BSIZE;
 630                 /*
 631                  * UBC doesn't (yet) handle partial pages so nfs_biowrite was
 632                  * hacked to never bdwrite, to start every little write right
 633                  * away.  Running IE Avie noticed the performance problem, thus
 634                  * this code, which permits those delayed writes by ensuring an
 635                  * initial read of the entire page.  The read may hit eof
 636                  * ("short read") but that we will handle.
 637                  *
 638                  * We are quite dependant on the correctness of B_CACHE so check
 639                  * that first in case of problems.
 640                  */
 641                 if (!ISSET(bp->b_flags, B_CACHE) && n < PAGE_SIZE) {
 642                         boff = (off_t)bp->b_blkno * DEV_BSIZE;
 643                         auio.uio_iov = &iov;
 644                         auio.uio_iovcnt = 1;
 645                         auio.uio_offset = boff;
 646                         auio.uio_resid = PAGE_SIZE;
 647                         auio.uio_segflg = UIO_SYSSPACE;
 648                         auio.uio_rw = UIO_READ;
 649                         auio.uio_procp = p;
 650                         iov.iov_base = bp->b_data;
 651                         iov.iov_len = PAGE_SIZE;
 652                         error = nfs_readrpc(vp, &auio, cred);
 653                         if (error) {
 654                                 bp->b_error = error;
 655                                 SET(bp->b_flags, B_ERROR);
 656                                 printf("nfs_write: readrpc %d", error);
 657                         }
 658                         if (auio.uio_resid > 0)
 659                                 bzero(iov.iov_base, auio.uio_resid);
 660                         bp->b_validoff = 0;
 661                         bp->b_validend = PAGE_SIZE - auio.uio_resid;
 662                         if (np->n_size > boff + bp->b_validend)
 663                                 bp->b_validend = min(np->n_size - boff,
 664                                                      PAGE_SIZE);
 665                         bp->b_dirtyoff = 0;
 666                         bp->b_dirtyend = 0;
 667                 }
 668
 669                 /*
 670                  * If the new write will leave a contiguous dirty
 671                  * area, just update the b_dirtyoff and b_dirtyend,
 672                  * otherwise try to extend the dirty region.
 673                  */
 674                 if (bp->b_dirtyend > 0 &&
 675                     (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) {
 676                         off_t start, end;
 677
 678                         boff = (off_t)bp->b_blkno * DEV_BSIZE;
 679                         if (on > bp->b_dirtyend) {
 680                                 start = boff + bp->b_validend;
 681                                 end = boff + on;
 682                         } else {
 683                                 start = boff + on + n;
 684                                 end = boff + bp->b_validoff;
 685                         }
 686
 687                         /*
 688                          * It may be that the valid region in the buffer
 689                          * covers the region we want, in which case just
 690                          * extend the dirty region.  Otherwise we try to
 691                          * extend the valid region.
 692                          */
 693                         if (end > start) {
 694                                 auio.uio_iov = &iov;
 695                                 auio.uio_iovcnt = 1;
 696                                 auio.uio_offset = start;
 697                                 auio.uio_resid = end - start;
 698                                 auio.uio_segflg = UIO_SYSSPACE;
 699                                 auio.uio_rw = UIO_READ;
 700                                 auio.uio_procp = p;
 701                                 iov.iov_base = bp->b_data + (start - boff);
 702                                 iov.iov_len = end - start;
 703                                 error = nfs_readrpc(vp, &auio, cred);
 704                                 /*
 705                                  * If we couldn't read, do not do a VOP_BWRITE
 706                                  * as originally coded. That could also error
 707                                  * and looping back to "again" as it was doing
 708                                  * could have us stuck trying to write same buf
 709                                  * again. nfs_write, will get the entire region
 710                                  * if nfs_readrpc succeeded. If unsuccessful
 711                                  * we should just error out. Errors like ESTALE
 712                                  * would keep us looping rather than transient
 713                                  * errors justifying a retry. We can return here
 714                                  * instead of altering dirty region later.  We
 715                                  * did not write old dirty region at this point.
 716                                  */
 717                                 if (error) {
 718                                         bp->b_error = error;
 719                                         SET(bp->b_flags, B_ERROR);
 720                                         printf("nfs_write: readrpc2 %d", error);
 721                                         brelse(bp);
 722                                         return (error);
 723                                 }
 724                                 /*
 725                                  * The read worked.
 726                                  * If there was a short read, just zero fill.
 727                                  */
 728                                 if (auio.uio_resid > 0)
 729                                         bzero(iov.iov_base, auio.uio_resid);
 730                                 if (on > bp->b_dirtyend)
 731                                         bp->b_validend = on;
 732                                 else
 733                                         bp->b_validoff = on + n;
 734                         }
 735                         /*
 736                          * We now have a valid region which extends up to the
 737                          * dirty region which we want.
 738                          */
 739                         if (on > bp->b_dirtyend)
 740                                 bp->b_dirtyend = on;
 741                         else
 742                                 bp->b_dirtyoff = on + n;
 743                 }
 744                 if (ISSET(bp->b_flags, B_ERROR)) {
 745                         error = bp->b_error;
 746                         brelse(bp);
 747                         return (error);
 748                 }
 749                 /*
 750                  * NFS has embedded ucred so crhold() risks zone corruption
 751                  */
 752                 if (bp->b_wcred == NOCRED)
 753                         bp->b_wcred = crdup(cred);
 754                 np->n_flag |= NMODIFIED;
 755
 756                 /*
 757                  * Check for valid write lease and get one as required.
 758                  * In case getblk() and/or bwrite() delayed us.
 759                  */
 760                 if ((nmp->nm_flag & NFSMNT_NQNFS) &&
 761                     NQNFS_CKINVALID(vp, np, ND_WRITE)) {
 762                         do {
 763                                 error = nqnfs_getlease(vp, ND_WRITE, cred, p);
 764                         } while (error == NQNFS_EXPIRED);
 765                         if (error) {
 766                                 brelse(bp);
 767                                 return (error);
 768                         }
 769                         if (np->n_lrev != np->n_brev ||
 770                             (np->n_flag & NQNFSNONCACHE)) {
 771                                 brelse(bp);
 772                                 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
 773                                 if (error)
 774                                         return (error);
 775                                 np->n_brev = np->n_lrev;
 776                                 goto again;
 777                         }
 778                 }
 779                 error = uiomove((char *)bp->b_data + on, n, uio);
 780                 if (error) {
 781                         SET(bp->b_flags, B_ERROR);
 782                         brelse(bp);
 783                         return (error);
 784                 }
 785                 if (bp->b_dirtyend > 0) {
 786                         bp->b_dirtyoff = min(on, bp->b_dirtyoff);
 787                         bp->b_dirtyend = max((on + n), bp->b_dirtyend);
 788                 } else {
 789                         bp->b_dirtyoff = on;
 790                         bp->b_dirtyend = on + n;
 791                 }
 792                 if (bp->b_validend == 0 || bp->b_validend < bp->b_dirtyoff ||
 793                     bp->b_validoff > bp->b_dirtyend) {
 794                         bp->b_validoff = bp->b_dirtyoff;
 795                         bp->b_validend = bp->b_dirtyend;
 796                 } else {
 797                         bp->b_validoff = min(bp->b_validoff, bp->b_dirtyoff);
 798                         bp->b_validend = max(bp->b_validend, bp->b_dirtyend);
 799                 }
 800
 801                 /*
 802                  * Since this block is being modified, it must be written
 803                  * again and not just committed.
 804                  */
 805                 CLR(bp->b_flags, B_NEEDCOMMIT);
 806
 807                 /*
 808                  * If the lease is non-cachable or IO_SYNC do bwrite().
 809                  */
 810                 if ((np->n_flag & NQNFSNONCACHE) || (ioflag & IO_SYNC)) {
 811                         bp->b_proc = p;
 812                         error = VOP_BWRITE(bp);
 813                         if (error)
 814                                 return (error);
 815                         if (np->n_flag & NQNFSNONCACHE) {
 816                                 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
 817                                 if (error)
 818                                         return (error);
 819                         }
 820                 } else if ((n + on) == biosize &&
 821                         (nmp->nm_flag & NFSMNT_NQNFS) == 0) {
 822                         bp->b_proc = (struct proc *)0;
 823                         SET(bp->b_flags, B_ASYNC);
 824                         (void)nfs_writebp(bp, 0);
 825                 } else
 826                         bdwrite(bp);
 827         } while (uio->uio_resid > 0 && n > 0);
 828         return (0);
 829 }
 830
 831
 832 /*
 833  * Get an nfs cache block.
 834  * Allocate a new one if the block isn't currently in the cache
 835  * and return the block marked busy. If the calling process is
 836  * interrupted by a signal for an interruptible mount point, return
 837  * NULL.
 838  */
 839 static struct buf *
 840 nfs_getcacheblk(vp, bn, size, p, operation)
 841         struct vnode *vp;
 842         daddr_t bn;
 843         int size;
 844         struct proc *p;
 845         int operation;  /* defined in sys/buf.h */
 846 {
 847         register struct buf *bp;
 848         struct nfsmount *nmp = VFSTONFS(vp->v_mount);
 849         /*due to getblk/vm interractions, use vm page size or less values */
 850         int biosize = min(vp->v_mount->mnt_stat.f_iosize, PAGE_SIZE);
 851
 852         if (nbdwrite > ((nbuf/4)*3) && operation == BLK_WRITE) {
 853 #define __BUFFERS_RECLAIMED 2
 854                 struct buf *tbp[__BUFFERS_RECLAIMED];
 855                 int i;
 856
 857                 /* too many delayed writes, try to free up some buffers */
 858                 for (i = 0; i < __BUFFERS_RECLAIMED; i++)
 859                         tbp[i] = geteblk(512);
 860
 861                 /* Yield to IO thread */
 862                 (void)tsleep((caddr_t)&nbdwrite, PCATCH, "nbdwrite", 1);
 863
 864                 for (i = (__BUFFERS_RECLAIMED - 1); i >= 0; i--)
 865                          brelse(tbp[i]);
 866         }
 867
 868         if (nmp->nm_flag & NFSMNT_INT) {
 869                 bp = getblk(vp, bn, size, PCATCH, 0, operation);
 870                 while (bp == (struct buf *)0) {
 871                         if (nfs_sigintr(nmp, (struct nfsreq *)0, p))
 872                                 return ((struct buf *)0);
 873                         bp = getblk(vp, bn, size, 0, 2 * hz, operation);
 874                 }
 875         } else
 876                 bp = getblk(vp, bn, size, 0, 0, operation);
 877
 878         if( vp->v_type == VREG)
 879                 bp->b_blkno = (bn * biosize) / DEV_BSIZE;
 880
 881         return (bp);
 882 }
 883
 884 /*
 885  * Flush and invalidate all dirty buffers. If another process is already
 886  * doing the flush, just wait for completion.
 887  */
 888 int
 889 nfs_vinvalbuf(vp, flags, cred, p, intrflg)
 890         struct vnode *vp;
 891         int flags;
 892         struct ucred *cred;
 893         struct proc *p;
 894         int intrflg;
 895 {
 896         register struct nfsnode *np = VTONFS(vp);
 897         struct nfsmount *nmp = VFSTONFS(vp->v_mount);
 898         int error = 0, slpflag, slptimeo;
 899         int didhold = 0;
 900
 901         if ((nmp->nm_flag & NFSMNT_INT) == 0)
 902                 intrflg = 0;
 903         if (intrflg) {
 904                 slpflag = PCATCH;
 905                 slptimeo = 2 * hz;
 906         } else {
 907                 slpflag = 0;
 908                 slptimeo = 0;
 909         }
 910         /*
 911          * First wait for any other process doing a flush to complete.
 912          */
 913         while (np->n_flag & NFLUSHINPROG) {
 914                 np->n_flag |= NFLUSHWANT;
 915                 error = tsleep((caddr_t)&np->n_flag, PRIBIO + 2, "nfsvinval",
 916                         slptimeo);
 917                 if (error && intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p))
 918                         return (EINTR);
 919         }
 920
 921         /*
 922          * Now, flush as required.
 923          */
 924         np->n_flag |= NFLUSHINPROG;
 925         error = vinvalbuf(vp, flags, cred, p, slpflag, 0);
 926         while (error) {
 927                 /* we seem to be stuck in a loop here if the thread got aborted.
 928                  * nfs_flush will return EINTR. Not sure if that will cause
 929                  * other consequences due to EINTR having other meanings in NFS
 930                  * To handle, no dirty pages, it seems safe to just return from
 931                  * here. But if we did have dirty pages, how would we get them
 932                  * written out if thread was aborted? Some other strategy is
 933                  * necessary. -- EKN
 934                  */
 935                 if ((intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p)) ||
 936                     (error == EINTR && current_thread_aborted())) {
 937                         np->n_flag &= ~NFLUSHINPROG;
 938                         if (np->n_flag & NFLUSHWANT) {
 939                                 np->n_flag &= ~NFLUSHWANT;
 940                                 wakeup((caddr_t)&np->n_flag);
 941                         }
 942                         return (EINTR);
 943                 }
 944                 error = vinvalbuf(vp, flags, cred, p, 0, slptimeo);
 945         }
 946         np->n_flag &= ~(NMODIFIED | NFLUSHINPROG);
 947         if (np->n_flag & NFLUSHWANT) {
 948                 np->n_flag &= ~NFLUSHWANT;
 949                 wakeup((caddr_t)&np->n_flag);
 950         }
 951         didhold = ubc_hold(vp);
 952         if (didhold) {
 953                 (void) ubc_clean(vp, 1); /* get the pages out of vm also */
 954                 ubc_rele(vp);
 955         }
 956         return (0);
 957 }
 958
 959 /*
 960  * Initiate asynchronous I/O. Return an error if no nfsiods are available.
 961  * This is mainly to avoid queueing async I/O requests when the nfsiods
 962  * are all hung on a dead server.
 963  */
 964 int
 965 nfs_asyncio(bp, cred)
 966         register struct buf *bp;
 967         struct ucred *cred;
 968 {
 969         struct nfsmount *nmp;
 970         int i;
 971         int gotiod;
 972         int slpflag = 0;
 973         int slptimeo = 0;
 974         int error;
 975
 976         if (nfs_numasync == 0)
 977                 return (EIO);
 978
 979         nmp = VFSTONFS(bp->b_vp->v_mount);
 980 again:
 981         if (nmp->nm_flag & NFSMNT_INT)
 982                 slpflag = PCATCH;
 983         gotiod = FALSE;
 984
 985         /*
 986          * Find a free iod to process this request.
 987          */
 988         for (i = 0; i < NFS_MAXASYNCDAEMON; i++)
 989                 if (nfs_iodwant[i]) {
 990                         /*
 991                          * Found one, so wake it up and tell it which
 992                          * mount to process.
 993                          */
 994                         NFS_DPF(ASYNCIO,
 995                                 ("nfs_asyncio: waking iod %d for mount %p\n",
 996                                  i, nmp));
 997                         nfs_iodwant[i] = (struct proc *)0;
 998                         nfs_iodmount[i] = nmp;
 999                         nmp->nm_bufqiods++;
1000                         wakeup((caddr_t)&nfs_iodwant[i]);
1001                         gotiod = TRUE;
1002                         break;
1003                 }
1004
1005         /*
1006          * If none are free, we may already have an iod working on this mount
1007          * point.  If so, it will process our request.
1008          */
1009         if (!gotiod) {
1010                 if (nmp->nm_bufqiods > 0) {
1011                         NFS_DPF(ASYNCIO,
1012                                 ("nfs_asyncio: %d iods are already processing mount %p\n",
1013                                  nmp->nm_bufqiods, nmp));
1014                         gotiod = TRUE;
1015                 }
1016         }
1017
1018         /*
1019          * If we have an iod which can process the request, then queue
1020          * the buffer.
1021          */
1022         if (gotiod) {
1023                 /*
1024                  * Ensure that the queue never grows too large.
1025                  */
1026                 while (nmp->nm_bufqlen >= 2*nfs_numasync) {
1027                         NFS_DPF(ASYNCIO,
1028                                 ("nfs_asyncio: waiting for mount %p queue to drain\n", nmp));
1029                         nmp->nm_bufqwant = TRUE;
1030                         error = tsleep(&nmp->nm_bufq, slpflag | PRIBIO,
1031                                        "nfsaio", slptimeo);
1032                         if (error) {
1033                                 if (nfs_sigintr(nmp, NULL, bp->b_proc))
1034                                         return (EINTR);
1035                                 if (slpflag == PCATCH) {
1036                                         slpflag = 0;
1037                                         slptimeo = 2 * hz;
1038                                 }
1039                         }
1040                         /*
1041                          * We might have lost our iod while sleeping,
1042                          * so check and loop if nescessary.
1043                          */
1044                         if (nmp->nm_bufqiods == 0) {
1045                                 NFS_DPF(ASYNCIO,
1046                                         ("nfs_asyncio: no iods after mount %p queue was drained, looping\n", nmp));
1047                                 goto again;
1048                         }
1049                 }
1050
1051                 if (ISSET(bp->b_flags, B_READ)) {
1052                         if (bp->b_rcred == NOCRED && cred != NOCRED) {
1053                                 /*
1054                                  * NFS has embedded ucred.
1055                                  * Can not crhold() here as that causes zone corruption
1056                                  */
1057                                 bp->b_rcred = crdup(cred);
1058                         }
1059                 } else {
1060                         SET(bp->b_flags, B_WRITEINPROG);
1061                         if (bp->b_wcred == NOCRED && cred != NOCRED) {
1062                                 /*
1063                                  * NFS has embedded ucred.
1064                                  * Can not crhold() here as that causes zone corruption
1065                                  */
1066                                 bp->b_wcred = crdup(cred);
1067                         }
1068                 }
1069
1070                 TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, b_freelist);
1071                 nmp->nm_bufqlen++;
1072                 return (0);
1073         }
1074
1075         /*
1076          * All the iods are busy on other mounts, so return EIO to
1077          * force the caller to process the i/o synchronously.
1078          */
1079         NFS_DPF(ASYNCIO, ("nfs_asyncio: no iods available, i/o is synchronous\n"));
1080         return (EIO);
1081 }
1082
1083 /*
1084  * Do an I/O operation to/from a cache block. This may be called
1085  * synchronously or from an nfsiod.
1086  */
1087 int
1088 nfs_doio(bp, cr, p)
1089         register struct buf *bp;
1090         struct ucred *cr;
1091         struct proc *p;
1092 {
1093         register struct uio *uiop;
1094         register struct vnode *vp;
1095         struct nfsnode *np;
1096         struct nfsmount *nmp;
1097         int error = 0, diff, len, iomode, must_commit = 0;
1098         struct uio uio;
1099         struct iovec io;
1100
1101         vp = bp->b_vp;
1102         np = VTONFS(vp);
1103         nmp = VFSTONFS(vp->v_mount);
1104         uiop = &uio;
1105         uiop->uio_iov = &io;
1106         uiop->uio_iovcnt = 1;
1107         uiop->uio_segflg = UIO_SYSSPACE;
1108         uiop->uio_procp = p;
1109
1110         /*
1111          * With UBC, getblk() can return a buf with B_DONE set.
1112          * This indicates that the VM has valid data for that page.
1113          * NFS being stateless, this case poses a problem.
1114          * By definition, the NFS server should always be consulted
1115          * for the data in that page.
1116          * So we choose to clear the B_DONE and to do the IO.
1117          *
1118          * XXX revisit this if there is a performance issue.
1119          * XXX In that case, we could play the attribute cache games ...
1120          */
1121          if (ISSET(bp->b_flags, B_DONE)) {
1122                 if (!ISSET(bp->b_flags, B_ASYNC))
1123                         panic("nfs_doio: done and not async");
1124                 CLR(bp->b_flags, B_DONE);
1125         }
1126         FSDBG_TOP(256, np->n_size, bp->b_blkno * DEV_BSIZE, bp->b_bcount,
1127                   bp->b_flags);
1128         FSDBG(257, bp->b_validoff, bp->b_validend, bp->b_dirtyoff,
1129               bp->b_dirtyend);
1130         /*
1131          * Historically, paging was done with physio, but no more.
1132          */
1133         if (ISSET(bp->b_flags, B_PHYS)) {
1134             /*
1135              * ...though reading /dev/drum still gets us here.
1136              */
1137             io.iov_len = uiop->uio_resid = bp->b_bcount;
1138             /* mapping was done by vmapbuf() */
1139             io.iov_base = bp->b_data;
1140             uiop->uio_offset = (off_t)bp->b_blkno * DEV_BSIZE;
1141             if (ISSET(bp->b_flags, B_READ)) {
1142                         uiop->uio_rw = UIO_READ;
1143                         nfsstats.read_physios++;
1144                         error = nfs_readrpc(vp, uiop, cr);
1145             } else {
1146                         int com;
1147
1148                         iomode = NFSV3WRITE_DATASYNC;
1149                         uiop->uio_rw = UIO_WRITE;
1150                         nfsstats.write_physios++;
1151                         error = nfs_writerpc(vp, uiop, cr, &iomode, &com);
1152             }
1153             if (error) {
1154                         SET(bp->b_flags, B_ERROR);
1155                         bp->b_error = error;
1156             }
1157         } else if (ISSET(bp->b_flags, B_READ)) {
1158             io.iov_len = uiop->uio_resid = bp->b_bcount;
1159             io.iov_base = bp->b_data;
1160             uiop->uio_rw = UIO_READ;
1161             switch (vp->v_type) {
1162             case VREG:
1163                 uiop->uio_offset = (off_t)bp->b_blkno * DEV_BSIZE;
1164                 nfsstats.read_bios++;
1165                 error = nfs_readrpc(vp, uiop, cr);
1166                 FSDBG(262, np->n_size, bp->b_blkno * DEV_BSIZE,
1167                       uiop->uio_resid, error);
1168                 if (!error) {
1169                     bp->b_validoff = 0;
1170                     if (uiop->uio_resid) {
1171                         /*
1172                          * If len > 0, there is a hole in the file and
1173                          * no writes after the hole have been pushed to
1174                          * the server yet.
1175                          * Just zero fill the rest of the valid area.
1176                          */
1177                         diff = bp->b_bcount - uiop->uio_resid;
1178                         len = np->n_size - ((u_quad_t)bp->b_blkno * DEV_BSIZE +
1179                                             diff);
1180                         if (len > 0) {
1181                                 len = min(len, uiop->uio_resid);
1182                                 bzero((char *)bp->b_data + diff, len);
1183                                 bp->b_validend = diff + len;
1184                                 FSDBG(258, diff, len, 0, 1);
1185                         } else
1186                                 bp->b_validend = diff;
1187                     } else
1188                                 bp->b_validend = bp->b_bcount;
1189
1190                     if (bp->b_validend < bp->b_bufsize) {
1191                             /*
1192                              * we're about to release a partial buffer after a
1193                              * read... the only way we should get here is if
1194                              * this buffer contains the EOF before releasing it,
1195                              * we'll zero out to the end of the buffer so that
1196                              * if a mmap of this page occurs, we'll see zero's
1197                              * even if a ftruncate extends the file in the
1198                              * meantime
1199                              */
1200                             bzero((caddr_t)(bp->b_data + bp->b_validend),
1201                                   bp->b_bufsize - bp->b_validend);
1202                             FSDBG(258, bp->b_validend,
1203                                   bp->b_bufsize - bp->b_validend, 0, 2);
1204                     }
1205                 }
1206                 if (p && (vp->v_flag & VTEXT) &&
1207                         (((nmp->nm_flag & NFSMNT_NQNFS) &&
1208                           NQNFS_CKINVALID(vp, np, ND_READ) &&
1209                           np->n_lrev != np->n_brev) ||
1210                          (!(nmp->nm_flag & NFSMNT_NQNFS) &&
1211                           np->n_mtime != np->n_vattr.va_mtime.tv_sec))) {
1212                         uprintf("Process killed due to text file modification\n");
1213                         psignal(p, SIGKILL);
1214                         p->p_flag |= P_NOSWAP;
1215                 }
1216                 break;
1217             case VLNK:
1218                 uiop->uio_offset = (off_t)0;
1219                 nfsstats.readlink_bios++;
1220                 error = nfs_readlinkrpc(vp, uiop, cr);
1221                 break;
1222             case VDIR:
1223                 nfsstats.readdir_bios++;
1224                 uiop->uio_offset = ((u_quad_t)bp->b_lblkno) * NFS_DIRBLKSIZ;
1225                 if (!(nmp->nm_flag & NFSMNT_NFSV3))
1226                         nmp->nm_flag &= ~NFSMNT_RDIRPLUS; /* dk@farm.org */
1227                 if (nmp->nm_flag & NFSMNT_RDIRPLUS) {
1228                         error = nfs_readdirplusrpc(vp, uiop, cr);
1229                         if (error == NFSERR_NOTSUPP)
1230                                 nmp->nm_flag &= ~NFSMNT_RDIRPLUS;
1231                 }
1232                 if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0)
1233                         error = nfs_readdirrpc(vp, uiop, cr);
1234                 break;
1235             default:
1236                 printf("nfs_doio: type %x unexpected\n", vp->v_type);
1237                 break;
1238             };
1239             if (error) {
1240                 SET(bp->b_flags, B_ERROR);
1241                 bp->b_error = error;
1242             }
1243         } else {
1244             /*
1245              * mapped I/O may have altered any bytes, so we extend
1246              * the dirty zone to the valid zone.  For best performance
1247              * a better solution would be to save & restore page dirty bits
1248              * around the uiomove which brings write-data into the buffer.
1249              * Then here we'd check if the page is dirty rather than WASMAPPED
1250              * Also vnode_pager would change - if a page is clean it might
1251              * still need to be written due to DELWRI.
1252              */
1253             if (UBCINFOEXISTS(vp) && ubc_issetflags(vp, UI_WASMAPPED)) {
1254                 bp->b_dirtyoff = min(bp->b_dirtyoff, bp->b_validoff);
1255                 bp->b_dirtyend = max(bp->b_dirtyend, bp->b_validend);
1256             }
1257             if ((off_t)bp->b_blkno * DEV_BSIZE + bp->b_dirtyend > np->n_size)
1258                 bp->b_dirtyend = np->n_size - (off_t)bp->b_blkno * DEV_BSIZE;
1259
1260             if (bp->b_dirtyend > bp->b_dirtyoff) {
1261                 io.iov_len = uiop->uio_resid = bp->b_dirtyend - bp->b_dirtyoff;
1262                 uiop->uio_offset = (off_t)bp->b_blkno * DEV_BSIZE +
1263                                    bp->b_dirtyoff;
1264                 io.iov_base = (char *)bp->b_data + bp->b_dirtyoff;
1265                 uiop->uio_rw = UIO_WRITE;
1266
1267                 nfsstats.write_bios++;
1268                 if ((bp->b_flags & (B_ASYNC | B_NEEDCOMMIT | B_NOCACHE)) ==
1269                     B_ASYNC)
1270                     iomode = NFSV3WRITE_UNSTABLE;
1271                 else
1272                     iomode = NFSV3WRITE_FILESYNC;
1273                 SET(bp->b_flags, B_WRITEINPROG);
1274                 error = nfs_writerpc(vp, uiop, cr, &iomode, &must_commit);
1275                 if (!error && iomode == NFSV3WRITE_UNSTABLE)
1276                     SET(bp->b_flags, B_NEEDCOMMIT);
1277                 else
1278                     CLR(bp->b_flags, B_NEEDCOMMIT);
1279                 CLR(bp->b_flags, B_WRITEINPROG);
1280                 /*
1281                  * For an interrupted write, the buffer is still valid
1282                  * and the write hasn't been pushed to the server yet,
1283                  * so we can't set B_ERROR and report the interruption
1284                  * by setting B_EINTR. For the B_ASYNC case, B_EINTR
1285                  * is not relevant, so the rpc attempt is essentially
1286                  * a noop.  For the case of a V3 write rpc not being
1287                  * committed to stable storage, the block is still
1288                  * dirty and requires either a commit rpc or another
1289                  * write rpc with iomode == NFSV3WRITE_FILESYNC before
1290                  * the block is reused. This is indicated by setting
1291                  * the B_DELWRI and B_NEEDCOMMIT flags.
1292                  */
1293                 if (error == EINTR || (!error && bp->b_flags & B_NEEDCOMMIT)) {
1294                         int s;
1295
1296                         CLR(bp->b_flags, B_INVAL | B_NOCACHE);
1297                         if (!ISSET(bp->b_flags, B_DELWRI)) {
1298                                 SET(bp->b_flags, B_DELWRI);
1299                                 nbdwrite++;
1300                         }
1301                         FSDBG(261, bp->b_validoff, bp->b_validend,
1302                               bp->b_bufsize, bp->b_bcount);
1303                         /*
1304                          * Since for the B_ASYNC case, nfs_bwrite() has
1305                          * reassigned the buffer to the clean list, we have to
1306                          * reassign it back to the dirty one. Ugh.
1307                          */
1308                         if (ISSET(bp->b_flags, B_ASYNC)) {
1309                                 s = splbio();
1310                                 reassignbuf(bp, vp);
1311                                 splx(s);
1312                         } else {
1313                                 SET(bp->b_flags, B_EINTR);
1314                         }
1315                 } else {
1316                         if (error) {
1317                                 SET(bp->b_flags, B_ERROR);
1318                                 bp->b_error = np->n_error = error;
1319                                 np->n_flag |= NWRITEERR;
1320                         }
1321                         bp->b_dirtyoff = bp->b_dirtyend = 0;
1322
1323                         /*
1324                          * validoff and validend represent the real data present
1325                          * in this buffer if validoff is non-zero, than we have
1326                          * to invalidate the buffer and kill the page when
1327                          * biodone is called... the same is also true when
1328                          * validend doesn't extend all the way to the end of the
1329                          * buffer and validend doesn't equate to the current
1330                          * EOF... eventually we need to deal with this in a more
1331                          * humane way (like keeping the partial buffer without
1332                          * making it immediately available to the VM page cache)
1333                          */
1334                         if (bp->b_validoff)
1335                                 SET(bp->b_flags, B_INVAL);
1336                         else
1337                         if (bp->b_validend < bp->b_bufsize) {
1338                                 if ((off_t)bp->b_blkno * DEV_BSIZE +
1339                                     bp->b_validend == np->n_size) {
1340                                         bzero((caddr_t)(bp->b_data +
1341                                                         bp->b_validend),
1342                                               bp->b_bufsize - bp->b_validend);
1343                                         FSDBG(259, bp->b_validend,
1344                                               bp->b_bufsize - bp->b_validend, 0,
1345                                               0);
1346                                 } else
1347                                         SET(bp->b_flags, B_INVAL);
1348                         }
1349                 }
1350
1351             } else {
1352                 if (bp->b_validoff ||
1353                     (bp->b_validend < bp->b_bufsize &&
1354                      (off_t)bp->b_blkno * DEV_BSIZE + bp->b_validend !=
1355                      np->n_size)) {
1356                         SET(bp->b_flags, B_INVAL);
1357                 }
1358                 if (bp->b_flags & B_INVAL) {
1359                         FSDBG(260, bp->b_validoff, bp->b_validend,
1360                               bp->b_bufsize, bp->b_bcount);
1361                 }
1362                 bp->b_resid = 0;
1363                 biodone(bp);
1364                 FSDBG_BOT(256, bp->b_validoff, bp->b_validend, bp->b_bufsize,
1365                           np->n_size);
1366                 return (0);
1367             }
1368         }
1369         bp->b_resid = uiop->uio_resid;
1370         if (must_commit)
1371                 nfs_clearcommit(vp->v_mount);
1372
1373         if (bp->b_flags & B_INVAL) {
1374                 FSDBG(260, bp->b_validoff, bp->b_validend, bp->b_bufsize,
1375                       bp->b_bcount);
1376         }
1377         FSDBG_BOT(256, bp->b_validoff, bp->b_validend, bp->b_bcount, error);
1378
1379         biodone(bp);
1380         return (error);
1381 }