/*
 * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * Copyright (c) 1999-2003 Apple Computer, Inc.  All Rights Reserved.
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)nfs_bio.c	8.9 (Berkeley) 3/30/95
 * FreeBSD-Id: nfs_bio.c,v 1.44 1997/09/10 19:52:25 phk Exp $
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/ubc.h>

#include <sys/vm.h>
#include <sys/vmparam.h>

#include <sys/time.h>
#include <kern/clock.h>

#include <nfs/rpcv2.h>
#include <nfs/nfsproto.h>
#include <nfs/nfs.h>
#include <nfs/nfsmount.h>
#include <nfs/nqnfs.h>
#include <nfs/nfsnode.h>

#include <sys/kdebug.h>
#define FSDBG(A, B, C, D, E) \
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_NONE, \
		     (int)(B), (int)(C), (int)(D), (int)(E), 0)
#define FSDBG_TOP(A, B, C, D, E) \
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_START, \
		     (int)(B), (int)(C), (int)(D), (int)(E), 0)
#define FSDBG_BOT(A, B, C, D, E) \
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_END, \
		     (int)(B), (int)(C), (int)(D), (int)(E), 0)
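/*
 * FSDBG_TOP/FSDBG_BOT bracket an operation in the kdebug trace stream
 * (DBG_FUNC_START/DBG_FUNC_END) while plain FSDBG marks intermediate
 * points; nfs_doio below, for example, logs FSDBG_TOP(256, ...) on entry
 * and FSDBG_BOT(256, ...) on each exit path.
 */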
static struct buf *nfs_getcacheblk __P((struct vnode *vp, daddr_t bn, int size,
		struct proc *p, int operation));

extern int nfs_numasync;
extern struct nfsstats nfsstats;
/*
 * Vnode op for read using bio
 * Any similarity to readip() is purely coincidental
 */
int
nfs_bioread(vp, uio, ioflag, cred, getpages)
	register struct vnode *vp;
	register struct uio *uio;
	int ioflag;
	struct ucred *cred;
	int getpages;
{
	register struct nfsnode *np = VTONFS(vp);
	register int biosize, i;
	off_t diff;
	struct buf *bp = 0, *rabp;
	struct vattr vattr;
	struct proc *p;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	daddr_t lbn, rabn;
	int bufsize;
	int nra, error = 0, n = 0, on = 0, not_readin;
	int operation = (getpages ? BLK_PAGEIN : BLK_READ);

#if DIAGNOSTIC
	if (uio->uio_rw != UIO_READ)
		panic("nfs_read mode");
#endif
	if (uio->uio_resid == 0)
		return (0);
	if (uio->uio_offset < 0)
		return (EINVAL);
	p = uio->uio_procp;
	if ((nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_GOTFSINFO)) == NFSMNT_NFSV3)
		(void)nfs_fsinfo(nmp, vp, cred, p);
	/* due to getblk/vm interactions, use vm page size or less values */
	biosize = min(vp->v_mount->mnt_stat.f_iosize, PAGE_SIZE);
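	/*
	 * Example: with f_iosize = 8192 and a 4K PAGE_SIZE this clamps
	 * biosize to 4096, so each cache block covers exactly one VM page.
	 */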
	/*
	 * For nfs, cache consistency can only be maintained approximately.
	 * Although RFC1094 does not specify the criteria, the following is
	 * believed to be compatible with the reference port.
	 * For nqnfs, full cache consistency is maintained within the loop.
	 * For nfs:
	 * If the file's modify time on the server has changed since the
	 * last read rpc or you have written to the file,
	 * you may have lost data cache consistency with the
	 * server, so flush all of the file's data out of the cache.
	 * Then force a getattr rpc to ensure that you have up to date
	 * attributes.
	 * NB: This implies that cache data can be read when up to
	 * NFS_ATTRTIMEO seconds out of date. If you find that you need
	 * current attributes this could be forced by setting n_attrstamp
	 * to 0 before the VOP_GETATTR() call.
	 */
	if ((nmp->nm_flag & NFSMNT_NQNFS) == 0) {
		if (np->n_flag & NMODIFIED) {
			if (vp->v_type != VREG) {
				if (vp->v_type != VDIR)
					panic("nfs: bioread, not dir");
				nfs_invaldir(vp);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
			}
			np->n_attrstamp = 0;
			error = VOP_GETATTR(vp, &vattr, cred, p);
			if (error)
				return (error);
			np->n_mtime = vattr.va_mtime.tv_sec;
		} else {
			error = VOP_GETATTR(vp, &vattr, cred, p);
			if (error)
				return (error);
			if (np->n_mtime != vattr.va_mtime.tv_sec) {
				if (vp->v_type == VDIR)
					nfs_invaldir(vp);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
				np->n_mtime = vattr.va_mtime.tv_sec;
			}
		}
	}
	do {
	    /*
	     * Get a valid lease. If cached data is stale, flush it.
	     */
	    if (nmp->nm_flag & NFSMNT_NQNFS) {
		if (NQNFS_CKINVALID(vp, np, ND_READ)) {
		    do {
			error = nqnfs_getlease(vp, ND_READ, cred, p);
		    } while (error == NQNFS_EXPIRED);
		    if (error)
			return (error);
		    if (np->n_lrev != np->n_brev ||
			(np->n_flag & NQNFSNONCACHE) ||
			((np->n_flag & NMODIFIED) && vp->v_type == VDIR)) {
			if (vp->v_type == VDIR)
			    nfs_invaldir(vp);
			error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
			if (error)
			    return (error);
			np->n_brev = np->n_lrev;
		    }
		} else if (vp->v_type == VDIR && (np->n_flag & NMODIFIED)) {
		    nfs_invaldir(vp);
		    error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
		    if (error)
			return (error);
		}
	    }
	    if (np->n_flag & NQNFSNONCACHE) {
		switch (vp->v_type) {
		case VREG:
			return (nfs_readrpc(vp, uio, cred));
		case VLNK:
			return (nfs_readlinkrpc(vp, uio, cred));
		case VDIR:
			break;
		default:
			printf(" NQNFSNONCACHE: type %x unexpected\n",
			       vp->v_type);
		};
	    }
	    switch (vp->v_type) {
	    case VREG:
		nfsstats.biocache_reads++;
		lbn = uio->uio_offset / biosize;
		on = uio->uio_offset & (biosize - 1);
		not_readin = 1;
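		/*
		 * Example: with biosize = 4096 and uio_offset = 10000,
		 * lbn = 10000 / 4096 = 2 and on = 10000 & 4095 = 1808,
		 * i.e. the transfer starts 1808 bytes into logical block 2.
		 */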
		/*
		 * Start the read ahead(s), as required.
		 */
		if (nfs_numasync > 0 && nmp->nm_readahead > 0) {
		    for (nra = 0; nra < nmp->nm_readahead &&
			 (off_t)(lbn + 1 + nra) * biosize < np->n_size;
			 nra++) {
			rabn = lbn + 1 + nra;
			if (!incore(vp, rabn)) {
			    rabp = nfs_getcacheblk(vp, rabn, biosize, p,
						   operation);
			    if (!rabp)
				return (EINTR);
			    if (!ISSET(rabp->b_flags, (B_CACHE|B_DELWRI))) {
				SET(rabp->b_flags, (B_READ | B_ASYNC));
				if (nfs_asyncio(rabp, cred)) {
				    SET(rabp->b_flags, (B_INVAL|B_ERROR));
				    rabp->b_error = EIO;
				    brelse(rabp);
				}
			    } else
				brelse(rabp);
			}
		    }
		}
		/*
		 * If the block is in the cache and has the required data
		 * in a valid region, just copy it out.
		 * Otherwise, get the block and write back/read in,
		 * as required.
		 */
again:
		bufsize = biosize;
		if ((off_t)(lbn + 1) * biosize > np->n_size &&
		    (off_t)(lbn + 1) * biosize - np->n_size < biosize) {
			bufsize = np->n_size - (off_t)lbn * biosize;
			bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
		}
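		/*
		 * Example: with np->n_size = 10000 and lbn = 2 (biosize
		 * 4096), the last block holds 10000 - 8192 = 1808 bytes and
		 * the masking above rounds bufsize up to the next DEV_BSIZE
		 * (512) multiple: 2048.
		 */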
		bp = nfs_getcacheblk(vp, lbn, bufsize, p, operation);
		if (!bp)
			return (EINTR);

		if (!ISSET(bp->b_flags, B_CACHE)) {
			SET(bp->b_flags, B_READ);
			CLR(bp->b_flags, (B_DONE | B_ERROR | B_INVAL));
			not_readin = 0;
			error = nfs_doio(bp, cred, p);
			if (error) {
				brelse(bp);
				return (error);
			}
		}
		n = min((unsigned)(bufsize - on), uio->uio_resid);
		diff = np->n_size - uio->uio_offset;
		if (diff < n)
			n = diff;
		if (not_readin && n > 0) {
			if (on < bp->b_validoff || (on + n) > bp->b_validend) {
				SET(bp->b_flags, (B_NOCACHE|B_INVAFTERWRITE));
				if (bp->b_dirtyend > 0) {
					if (!ISSET(bp->b_flags, B_DELWRI))
						panic("nfsbioread");
					if (VOP_BWRITE(bp) == EINTR)
						return (EINTR);
				} else
					brelse(bp);
				goto again;
			}
		}
		diff = (on >= bp->b_validend) ? 0 : (bp->b_validend - on);
		if (diff < n)
			n = diff;
		break;
	    case VLNK:
		nfsstats.biocache_readlinks++;
		bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, p,
				     operation);
		if (!bp)
			return (EINTR);
		if (!ISSET(bp->b_flags, B_CACHE)) {
			SET(bp->b_flags, B_READ);
			error = nfs_doio(bp, cred, p);
			if (error) {
				SET(bp->b_flags, B_ERROR);
				brelse(bp);
				return (error);
			}
		}
		n = min(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid);
		on = 0;
		break;
	    case VDIR:
		nfsstats.biocache_readdirs++;
		if (np->n_direofoffset
		    && uio->uio_offset >= np->n_direofoffset) {
			return (0);
		}
		lbn = uio->uio_offset / NFS_DIRBLKSIZ;
		on = uio->uio_offset & (NFS_DIRBLKSIZ - 1);
		bp = nfs_getcacheblk(vp, lbn, NFS_DIRBLKSIZ, p, operation);
		if (!bp)
		    return (EINTR);
		if (!ISSET(bp->b_flags, B_CACHE)) {
		    SET(bp->b_flags, B_READ);
		    error = nfs_doio(bp, cred, p);
		    if (error)
			brelse(bp);
		    while (error == NFSERR_BAD_COOKIE) {
			nfs_invaldir(vp);
			error = nfs_vinvalbuf(vp, 0, cred, p, 1);
			/*
			 * Yuck! The directory has been modified on the
			 * server. The only way to get the block is by
			 * reading from the beginning to get all the
			 * offset cookies.
			 */
			for (i = 0; i <= lbn && !error; i++) {
			    if (np->n_direofoffset
				&& (i * NFS_DIRBLKSIZ) >= np->n_direofoffset)
				    return (0);
			    bp = nfs_getcacheblk(vp, i, NFS_DIRBLKSIZ, p,
						 operation);
			    if (!bp)
				return (EINTR);
			    if (!ISSET(bp->b_flags, B_CACHE)) {
				SET(bp->b_flags, B_READ);
				error = nfs_doio(bp, cred, p);
				/*
				 * no error + B_INVAL == directory EOF,
				 * use the block.
				 */
				if (error == 0 && (bp->b_flags & B_INVAL))
					break;
			    }
			    /*
			     * An error will throw away the block and the
			     * for loop will break out.  If no error and this
			     * is not the block we want, we throw away the
			     * block and go for the next one via the for loop.
			     */
			    if (error || i < lbn)
				    brelse(bp);
			}
		    }
		    /*
		     * The above while is repeated if we hit another cookie
		     * error.  If we hit an error and it wasn't a cookie error,
		     * we give up.
		     */
		    if (error)
			    return (error);
		}
		/*
		 * If not eof and read aheads are enabled, start one.
		 * (You need the current block first, so that you have the
		 *  directory offset cookie of the next block.)
		 */
		if (nfs_numasync > 0 && nmp->nm_readahead > 0 &&
		    (np->n_direofoffset == 0 ||
		     (lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) &&
		    !(np->n_flag & NQNFSNONCACHE) &&
		    !incore(vp, lbn + 1)) {
			rabp = nfs_getcacheblk(vp, lbn + 1, NFS_DIRBLKSIZ, p,
					       operation);
			if (rabp) {
			    if (!ISSET(rabp->b_flags, (B_CACHE|B_DELWRI))) {
				SET(rabp->b_flags, (B_READ | B_ASYNC));
				if (nfs_asyncio(rabp, cred)) {
				    SET(rabp->b_flags, (B_INVAL|B_ERROR));
				    rabp->b_error = EIO;
				    brelse(rabp);
				}
			    } else {
				brelse(rabp);
			    }
			}
		}
		/*
		 * Make sure we use a signed variant of min() since
		 * the second term may be negative.
		 */
		n = lmin(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid - on);
		/*
		 * Unlike VREG files, whose buffer size ( bp->b_bcount ) is
		 * chopped for the EOF condition, we cannot tell how large
		 * NFS directories are going to be until we hit EOF.  So
		 * an NFS directory buffer is *not* chopped to its EOF.  Now,
		 * it just so happens that b_resid will effectively chop it
		 * to EOF.  *BUT* this information is lost if the buffer goes
		 * away and is reconstituted into a B_CACHE state (recovered
		 * from VM) later.  So we keep track of the directory eof
		 * in np->n_direofoffset and chop it off as an extra step
		 * right here.
		 */
		if (np->n_direofoffset &&
		    n > np->n_direofoffset - uio->uio_offset)
			n = np->n_direofoffset - uio->uio_offset;
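		/*
		 * Example: if the directory EOF was recorded at
		 * n_direofoffset = 5000 and uio_offset = 4096, n is chopped
		 * to 5000 - 4096 = 904 bytes even when the cached directory
		 * block is larger.
		 */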
		break;
	    default:
		printf(" nfs_bioread: type %x unexpected\n", vp->v_type);
		break;
	    };

	    if (n > 0)
		error = uiomove(bp->b_data + on, (int)n, uio);
	    switch (vp->v_type) {
	    case VREG:
		break;
	    case VLNK:
		n = 0;
		break;
	    case VDIR:
		if (np->n_flag & NQNFSNONCACHE)
			SET(bp->b_flags, B_INVAL);
		break;
	    default:
		printf(" nfs_bioread: type %x unexpected\n", vp->v_type);
	    }
	    brelse(bp);
	} while (error == 0 && uio->uio_resid > 0 && n > 0);
	return (error);
}
/*
 * Vnode op for write using bio
 */
int
nfs_write(ap)
	struct vop_write_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		struct ucred *a_cred;
	} */ *ap;
{
	register int biosize;
	register struct uio *uio = ap->a_uio;
	struct proc *p = uio->uio_procp;
	register struct vnode *vp = ap->a_vp;
	struct nfsnode *np = VTONFS(vp);
	register struct ucred *cred = ap->a_cred;
	int ioflag = ap->a_ioflag;
	struct buf *bp;
	struct vattr vattr;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	daddr_t lbn;
	int bufsize;
	int n, on, error = 0, iomode, must_commit;
	off_t boff, start, end;
	struct iovec iov;
	struct uio auio;
#if DIAGNOSTIC
	if (uio->uio_rw != UIO_WRITE)
		panic("nfs_write mode");
	if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != current_proc())
		panic("nfs_write proc");
#endif
	if (vp->v_type != VREG)
		return (EIO);
	if (np->n_flag & NWRITEERR) {
		np->n_flag &= ~NWRITEERR;
		return (np->n_error);
	}
	if ((nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_GOTFSINFO)) == NFSMNT_NFSV3)
		(void)nfs_fsinfo(nmp, vp, cred, p);
	if (ioflag & (IO_APPEND | IO_SYNC)) {
		if (np->n_flag & NMODIFIED) {
			np->n_attrstamp = 0;
			error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
			if (error)
				return (error);
		}
		if (ioflag & IO_APPEND) {
			np->n_attrstamp = 0;
			error = VOP_GETATTR(vp, &vattr, cred, p);
			if (error)
				return (error);
			uio->uio_offset = np->n_size;
		}
	}
	if (uio->uio_offset < 0)
		return (EINVAL);
	if (uio->uio_resid == 0)
		return (0);
	/*
	 * Maybe this should be above the vnode op call, but so long as
	 * file servers have no limits, I don't think it matters.
	 */
	if (p && uio->uio_offset + uio->uio_resid >
	      p->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
		psignal(p, SIGXFSZ);
		return (EFBIG);
	}
	/*
	 * I use nm_rsize, not nm_wsize so that all buffer cache blocks
	 * will be the same size within a filesystem. nfs_writerpc will
	 * still use nm_wsize when sizing the rpc's.
	 */
	/* due to getblk/vm interactions, use vm page size or less values */
	biosize = min(vp->v_mount->mnt_stat.f_iosize, PAGE_SIZE);
	do {
		/*
		 * Check for a valid write lease.
		 */
		if ((nmp->nm_flag & NFSMNT_NQNFS) &&
		    NQNFS_CKINVALID(vp, np, ND_WRITE)) {
			do {
				error = nqnfs_getlease(vp, ND_WRITE, cred, p);
			} while (error == NQNFS_EXPIRED);
			if (error)
				return (error);
			if (np->n_lrev != np->n_brev ||
			    (np->n_flag & NQNFSNONCACHE)) {
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
				np->n_brev = np->n_lrev;
			}
		}
		if ((np->n_flag & NQNFSNONCACHE) && uio->uio_iovcnt == 1) {
			iomode = NFSV3WRITE_FILESYNC;
			error = nfs_writerpc(vp, uio, cred, &iomode, &must_commit);
			if (must_commit)
				nfs_clearcommit(vp->v_mount);
			return (error);
		}
		nfsstats.biocache_writes++;
		lbn = uio->uio_offset / biosize;
		on = uio->uio_offset & (biosize - 1);
		n = min((unsigned)(biosize - on), uio->uio_resid);
again:
		bufsize = biosize;
		/* (removed for UBC) */
		if ((lbn + 1) * biosize > np->n_size) {
			bufsize = np->n_size - lbn * biosize;
			bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
		}
		/*
		 * Get a cache block for writing.  The range to be written is
		 * (off..off+len) within the block.  We ensure that the block
		 * either has no dirty region or that the given range is
		 * contiguous with the existing dirty region.
		 */
		bp = nfs_getcacheblk(vp, lbn, bufsize, p, BLK_WRITE);
		if (!bp)
			return (EINTR);
		/*
		 * Resize nfsnode *after* we busy the buffer to prevent
		 * readers from reading garbage.
		 * If there was a partial buf at the old eof, validate
		 * and zero the new bytes.
		 */
		if (uio->uio_offset + n > np->n_size) {
			struct buf *bp0 = NULL;
			daddr_t bn = np->n_size / biosize;
			int off = np->n_size & (biosize - 1);

			if (off && bn < lbn && incore(vp, bn))
				bp0 = nfs_getcacheblk(vp, bn, biosize, p,
						      BLK_WRITE);
			np->n_flag |= NMODIFIED;
			np->n_size = uio->uio_offset + n;
			ubc_setsize(vp, (off_t)np->n_size); /* XXX errors */
			if (bp0) {
				bzero((char *)bp0->b_data + off, biosize - off);
				bp0->b_validend = biosize;
				brelse(bp0);
			}
		}
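		/*
		 * Example: growing a file whose old n_size was 10000
		 * (biosize 4096) gives bn = 2, off = 1808; the stale tail
		 * of block 2 (bytes 1808..4095) is zeroed and the whole
		 * block marked valid before the new EOF becomes visible.
		 */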
		/*
		 * NFS has embedded ucred so crhold() risks zone corruption
		 */
		if (bp->b_wcred == NOCRED)
			bp->b_wcred = crdup(cred);
		/*
		 * If dirtyend exceeds file size, chop it down.  This should
		 * not occur unless there is a race.
		 */
		if ((off_t)bp->b_blkno * DEV_BSIZE + bp->b_dirtyend >
		    np->n_size)
			bp->b_dirtyend = np->n_size - (off_t)bp->b_blkno *
					 DEV_BSIZE;
		/*
		 * UBC doesn't (yet) handle partial pages so nfs_biowrite was
		 * hacked to never bdwrite, to start every little write right
		 * away.  Running IE Avie noticed the performance problem, thus
		 * this code, which permits those delayed writes by ensuring an
		 * initial read of the entire page.  The read may hit eof
		 * ("short read") but that we will handle.
		 *
		 * We are quite dependent on the correctness of B_CACHE so
		 * check that first in case of problems.
		 */
		if (!ISSET(bp->b_flags, B_CACHE) && n < PAGE_SIZE) {
			boff = (off_t)bp->b_blkno * DEV_BSIZE;
			auio.uio_iov = &iov;
			auio.uio_iovcnt = 1;
			auio.uio_offset = boff;
			auio.uio_resid = PAGE_SIZE;
			auio.uio_segflg = UIO_SYSSPACE;
			auio.uio_rw = UIO_READ;
			auio.uio_procp = p;
			iov.iov_base = bp->b_data;
			iov.iov_len = PAGE_SIZE;
			error = nfs_readrpc(vp, &auio, cred);
			if (error) {
				bp->b_error = error;
				SET(bp->b_flags, B_ERROR);
				printf("nfs_write: readrpc %d", error);
			}
			if (auio.uio_resid > 0)
				bzero(iov.iov_base, auio.uio_resid);
			bp->b_validoff = 0;
			bp->b_validend = PAGE_SIZE - auio.uio_resid;
			if (np->n_size > boff + bp->b_validend)
				bp->b_validend = min(np->n_size - boff,
						     PAGE_SIZE);
		}
		/*
		 * If the new write will leave a contiguous dirty
		 * area, just update the b_dirtyoff and b_dirtyend,
		 * otherwise try to extend the dirty region.
		 */
		if (bp->b_dirtyend > 0 &&
		    (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) {
			boff = (off_t)bp->b_blkno * DEV_BSIZE;
			if (on > bp->b_dirtyend) {
				start = boff + bp->b_validend;
				end = boff + on;
			} else {
				start = boff + on + n;
				end = boff + bp->b_validoff;
			}

			/*
			 * It may be that the valid region in the buffer
			 * covers the region we want, in which case just
			 * extend the dirty region.  Otherwise we try to
			 * extend the valid region.
			 */
			if (end > start) {
				auio.uio_iov = &iov;
				auio.uio_iovcnt = 1;
				auio.uio_offset = start;
				auio.uio_resid = end - start;
				auio.uio_segflg = UIO_SYSSPACE;
				auio.uio_rw = UIO_READ;
				auio.uio_procp = p;
				iov.iov_base = bp->b_data + (start - boff);
				iov.iov_len = end - start;
				error = nfs_readrpc(vp, &auio, cred);
				/*
				 * If we couldn't read, do not do a VOP_BWRITE
				 * as originally coded. That could also error
				 * and looping back to "again" as it was doing
				 * could have us stuck trying to write the same
				 * buf again.  nfs_write will get the entire
				 * region if nfs_readrpc succeeded; if it was
				 * unsuccessful we should just error out.
				 * Errors like ESTALE would keep us looping
				 * rather than transient errors justifying a
				 * retry.  We can return here instead of
				 * altering the dirty region later.  We did not
				 * write the old dirty region at this point.
				 */
				if (error) {
					bp->b_error = error;
					SET(bp->b_flags, B_ERROR);
					printf("nfs_write: readrpc2 %d", error);
					brelse(bp);
					return (error);
				}
				/*
				 * The read worked.
				 * If there was a short read, just zero fill.
				 */
				if (auio.uio_resid > 0)
					bzero(iov.iov_base, auio.uio_resid);
				if (on > bp->b_dirtyend)
					bp->b_validend = on;
				else
					bp->b_validoff = on + n;
			}
			/*
			 * We now have a valid region which extends up to the
			 * dirty region which we want.
			 */
			if (on > bp->b_dirtyend)
				bp->b_dirtyend = on;
			else
				bp->b_dirtyoff = on + n;
		}
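		/*
		 * Example: with an existing dirty region [512, 1024) and a
		 * new write at on = 2048, n = 512, the gap up to the write
		 * is first made valid (read or zero filled) and b_dirtyend
		 * is advanced to on, so the update below leaves one
		 * contiguous dirty extent [512, 2560).
		 */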
		if (ISSET(bp->b_flags, B_ERROR)) {
			error = (bp->b_error ? bp->b_error : EIO);
			brelse(bp);
			return (error);
		}

		/*
		 * NFS has embedded ucred so crhold() risks zone corruption
		 */
		if (bp->b_wcred == NOCRED)
			bp->b_wcred = crdup(cred);
		np->n_flag |= NMODIFIED;
		/*
		 * Check for valid write lease and get one as required.
		 * In case getblk() and/or bwrite() delayed us.
		 */
		if ((nmp->nm_flag & NFSMNT_NQNFS) &&
		    NQNFS_CKINVALID(vp, np, ND_WRITE)) {
			do {
				error = nqnfs_getlease(vp, ND_WRITE, cred, p);
			} while (error == NQNFS_EXPIRED);
			if (error) {
				brelse(bp);
				return (error);
			}
			if (np->n_lrev != np->n_brev ||
			    (np->n_flag & NQNFSNONCACHE)) {
				brelse(bp);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
				np->n_brev = np->n_lrev;
				goto again;
			}
		}
		error = uiomove((char *)bp->b_data + on, n, uio);
		if (error) {
			SET(bp->b_flags, B_ERROR);
			brelse(bp);
			return (error);
		}
		if (bp->b_dirtyend > 0) {
			bp->b_dirtyoff = min(on, bp->b_dirtyoff);
			bp->b_dirtyend = max((on + n), bp->b_dirtyend);
		} else {
			bp->b_dirtyoff = on;
			bp->b_dirtyend = on + n;
		}
		if (bp->b_validend == 0 || bp->b_validend < bp->b_dirtyoff ||
		    bp->b_validoff > bp->b_dirtyend) {
			bp->b_validoff = bp->b_dirtyoff;
			bp->b_validend = bp->b_dirtyend;
		} else {
			bp->b_validoff = min(bp->b_validoff, bp->b_dirtyoff);
			bp->b_validend = max(bp->b_validend, bp->b_dirtyend);
		}
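		/*
		 * Example: a buffer with valid [0, 1024) that just dirtied
		 * [512, 2048) ends up with valid [0, 2048); had the old
		 * valid region been disjoint from the dirty one, the valid
		 * region would simply be reset to the dirty extent.
		 */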
		/*
		 * Since this block is being modified, it must be written
		 * again and not just committed.
		 */
		CLR(bp->b_flags, B_NEEDCOMMIT);

		/*
		 * If the lease is non-cachable or IO_SYNC do bwrite().
		 */
		if ((np->n_flag & NQNFSNONCACHE) || (ioflag & IO_SYNC)) {
			bp->b_proc = p;
			error = VOP_BWRITE(bp);
			if (error)
				return (error);
			if (np->n_flag & NQNFSNONCACHE) {
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
			}
		} else if ((n + on) == biosize &&
			   (nmp->nm_flag & NFSMNT_NQNFS) == 0) {
			bp->b_proc = (struct proc *)0;
			SET(bp->b_flags, B_ASYNC);
			(void)nfs_writebp(bp, 0);
		} else
			bdwrite(bp);
	} while (uio->uio_resid > 0 && n > 0);
	return (0);
}
/*
 * Get an nfs cache block.
 * Allocate a new one if the block isn't currently in the cache
 * and return the block marked busy. If the calling process is
 * interrupted by a signal for an interruptible mount point, return
 * NULL.
 */
static struct buf *
nfs_getcacheblk(vp, bn, size, p, operation)
	struct vnode *vp;
	daddr_t bn;
	int size;
	struct proc *p;
	int operation;	/* defined in sys/buf.h */
{
	register struct buf *bp;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	/* due to getblk/vm interactions, use vm page size or less values */
	int biosize = min(vp->v_mount->mnt_stat.f_iosize, PAGE_SIZE);

	if (nbdwrite > ((nbuf/4)*3) && operation == BLK_WRITE) {
#define __BUFFERS_RECLAIMED 2
		struct buf *tbp[__BUFFERS_RECLAIMED];
		int i;

		/* too many delayed writes, try to free up some buffers */
		for (i = 0; i < __BUFFERS_RECLAIMED; i++)
			tbp[i] = geteblk(512);

		/* Yield to IO thread */
		(void)tsleep((caddr_t)&nbdwrite, PCATCH, "nbdwrite", 1);

		for (i = (__BUFFERS_RECLAIMED - 1); i >= 0; i--)
			brelse(tbp[i]);
	}

	if (nmp->nm_flag & NFSMNT_INT) {
		bp = getblk(vp, bn, size, PCATCH, 0, operation);
		while (bp == (struct buf *)0) {
			if (nfs_sigintr(nmp, (struct nfsreq *)0, p))
				return ((struct buf *)0);
			bp = getblk(vp, bn, size, 0, 2 * hz, operation);
		}
	} else
		bp = getblk(vp, bn, size, 0, 0, operation);

	if (vp->v_type == VREG)
		bp->b_blkno = ((off_t)bn * biosize) / DEV_BSIZE;

	return (bp);
}
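/*
 * Note on the b_blkno mapping above: for a VREG file with biosize = 4096,
 * logical block bn = 2 maps to b_blkno = (2 * 4096) / 512 = 16, since the
 * buffer layer addresses blocks in DEV_BSIZE (512-byte) units.
 */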
/*
 * Flush and invalidate all dirty buffers. If another process is already
 * doing the flush, just wait for completion.
 */
int
nfs_vinvalbuf(vp, flags, cred, p, intrflg)
	struct vnode *vp;
	int flags;
	struct ucred *cred;
	struct proc *p;
	int intrflg;
{
	register struct nfsnode *np = VTONFS(vp);
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	int error = 0, slpflag, slptimeo;
	int didhold = 0;

	if ((nmp->nm_flag & NFSMNT_INT) == 0)
		intrflg = 0;
	if (intrflg) {
		slpflag = PCATCH;
		slptimeo = 2 * hz;
	} else {
		slpflag = 0;
		slptimeo = 0;
	}
	/*
	 * First wait for any other process doing a flush to complete.
	 */
	while (np->n_flag & NFLUSHINPROG) {
		np->n_flag |= NFLUSHWANT;
		error = tsleep((caddr_t)&np->n_flag, PRIBIO + 2, "nfsvinval",
			       slptimeo);
		if (error && intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p))
			return (EINTR);
	}

	/*
	 * Now, flush as required.
	 */
	np->n_flag |= NFLUSHINPROG;
	error = vinvalbuf(vp, flags, cred, p, slpflag, 0);
	while (error) {
		/*
		 * We seem to be stuck in a loop here if the thread got
		 * aborted: nfs_flush will return EINTR.  Not sure if that
		 * will cause other consequences due to EINTR having other
		 * meanings in NFS.  With no dirty pages it seems safe to
		 * just return from here.  But if we did have dirty pages,
		 * how would we get them written out if the thread was
		 * aborted?  Some other strategy is needed.
		 */
		if ((intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p)) ||
		    (error == EINTR && current_thread_aborted())) {
			np->n_flag &= ~NFLUSHINPROG;
			if (np->n_flag & NFLUSHWANT) {
				np->n_flag &= ~NFLUSHWANT;
				wakeup((caddr_t)&np->n_flag);
			}
			return (EINTR);
		}
		error = vinvalbuf(vp, flags, cred, p, 0, slptimeo);
	}
	np->n_flag &= ~(NMODIFIED | NFLUSHINPROG);
	if (np->n_flag & NFLUSHWANT) {
		np->n_flag &= ~NFLUSHWANT;
		wakeup((caddr_t)&np->n_flag);
	}
	didhold = ubc_hold(vp);
	if (didhold) {
		(void) ubc_clean(vp, 1); /* get the pages out of vm also */
		ubc_rele(vp);
	}
	return (0);
}
/*
 * Initiate asynchronous I/O. Return an error if no nfsiods are available.
 * This is mainly to avoid queueing async I/O requests when the nfsiods
 * are all hung on a dead server.
 */
int
nfs_asyncio(bp, cred)
	register struct buf *bp;
	struct ucred *cred;
{
	struct nfsmount *nmp;
	int i;
	int gotiod;
	int slpflag = 0;
	int slptimeo = 0;
	int error;

	if (nfs_numasync == 0)
		return (EIO);

	nmp = VFSTONFS(bp->b_vp->v_mount);
again:
	if (nmp->nm_flag & NFSMNT_INT)
		slpflag = PCATCH;
	gotiod = FALSE;

	/*
	 * Find a free iod to process this request.
	 */
	for (i = 0; i < NFS_MAXASYNCDAEMON; i++)
		if (nfs_iodwant[i]) {
			/*
			 * Found one, so wake it up and tell it which
			 * mount to process.
			 */
			NFS_DPF(ASYNCIO,
				("nfs_asyncio: waking iod %d for mount %p\n",
				 i, nmp));
			nfs_iodwant[i] = (struct proc *)0;
			nfs_iodmount[i] = nmp;
			nmp->nm_bufqiods++;
			wakeup((caddr_t)&nfs_iodwant[i]);
			gotiod = TRUE;
			break;
		}

	/*
	 * If none are free, we may already have an iod working on this mount
	 * point.  If so, it will process our request.
	 */
	if (!gotiod) {
		if (nmp->nm_bufqiods > 0) {
			NFS_DPF(ASYNCIO,
				("nfs_asyncio: %d iods are already processing mount %p\n",
				 nmp->nm_bufqiods, nmp));
			gotiod = TRUE;
		}
	}

	/*
	 * If we have an iod which can process the request, then queue
	 * the buffer.
	 */
	if (gotiod) {
		/*
		 * Ensure that the queue never grows too large.
		 */
		while (nmp->nm_bufqlen >= 2*nfs_numasync) {
			NFS_DPF(ASYNCIO,
				("nfs_asyncio: waiting for mount %p queue to drain\n", nmp));
			nmp->nm_bufqwant = TRUE;
			error = tsleep(&nmp->nm_bufq, slpflag | PRIBIO,
				       "nfsaio", slptimeo);
			if (error) {
				if (nfs_sigintr(nmp, NULL, bp->b_proc))
					return (EINTR);
				if (slpflag == PCATCH) {
					slpflag = 0;
					slptimeo = 2 * hz;
				}
			}
			/*
			 * We might have lost our iod while sleeping,
			 * so check and loop if necessary.
			 */
			if (nmp->nm_bufqiods == 0) {
				NFS_DPF(ASYNCIO,
					("nfs_asyncio: no iods after mount %p queue was drained, looping\n", nmp));
				goto again;
			}
		}

		if (ISSET(bp->b_flags, B_READ)) {
			if (bp->b_rcred == NOCRED && cred != NOCRED) {
				/*
				 * NFS has embedded ucred.
				 * Can not crhold() here as that causes zone corruption
				 */
				bp->b_rcred = crdup(cred);
			}
		} else {
			SET(bp->b_flags, B_WRITEINPROG);
			if (bp->b_wcred == NOCRED && cred != NOCRED) {
				/*
				 * NFS has embedded ucred.
				 * Can not crhold() here as that causes zone corruption
				 */
				bp->b_wcred = crdup(cred);
			}
		}

		TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, b_freelist);
		nmp->nm_bufqlen++;
		return (0);
	}

	/*
	 * All the iods are busy on other mounts, so return EIO to
	 * force the caller to process the i/o synchronously.
	 */
	NFS_DPF(ASYNCIO, ("nfs_asyncio: no iods available, i/o is synchronous\n"));
	return (EIO);
}
/*
 * Do an I/O operation to/from a cache block. This may be called
 * synchronously or from an nfsiod.
 */
int
nfs_doio(bp, cr, p)
	register struct buf *bp;
	struct ucred *cr;
	struct proc *p;
{
	register struct uio *uiop;
	register struct vnode *vp;
	struct nfsnode *np;
	struct nfsmount *nmp;
	int error = 0, diff, len, iomode, must_commit = 0;
	struct uio uio;
	struct iovec io;

	vp = bp->b_vp;
	np = VTONFS(vp);
	nmp = VFSTONFS(vp->v_mount);
	uiop = &uio;
	uiop->uio_iov = &io;
	uiop->uio_iovcnt = 1;
	uiop->uio_segflg = UIO_SYSSPACE;
	uiop->uio_procp = p;
	/*
	 * With UBC, getblk() can return a buf with B_DONE set.
	 * This indicates that the VM has valid data for that page.
	 * NFS being stateless, this case poses a problem.
	 * By definition, the NFS server should always be consulted
	 * for the data in that page.
	 * So we choose to clear the B_DONE and to do the IO.
	 *
	 * XXX revisit this if there is a performance issue.
	 * XXX In that case, we could play the attribute cache games ...
	 */
	if (ISSET(bp->b_flags, B_DONE)) {
		if (!ISSET(bp->b_flags, B_ASYNC))
			panic("nfs_doio: done and not async");
		CLR(bp->b_flags, B_DONE);
	}
	FSDBG_TOP(256, np->n_size, bp->b_blkno * DEV_BSIZE, bp->b_bcount,
		  bp->b_flags);
	FSDBG(257, bp->b_validoff, bp->b_validend, bp->b_dirtyoff,
	      bp->b_dirtyend);
	/*
	 * Historically, paging was done with physio, but no more.
	 */
	if (ISSET(bp->b_flags, B_PHYS)) {
		/*
		 * ...though reading /dev/drum still gets us here.
		 */
		io.iov_len = uiop->uio_resid = bp->b_bcount;
		/* mapping was done by vmapbuf() */
		io.iov_base = bp->b_data;
		uiop->uio_offset = (off_t)bp->b_blkno * DEV_BSIZE;
		if (ISSET(bp->b_flags, B_READ)) {
			uiop->uio_rw = UIO_READ;
			nfsstats.read_physios++;
			error = nfs_readrpc(vp, uiop, cr);
		} else {
			int com;

			iomode = NFSV3WRITE_DATASYNC;
			uiop->uio_rw = UIO_WRITE;
			nfsstats.write_physios++;
			error = nfs_writerpc(vp, uiop, cr, &iomode, &com);
		}
		if (error) {
			SET(bp->b_flags, B_ERROR);
			bp->b_error = error;
		}
	} else if (ISSET(bp->b_flags, B_READ)) {
		io.iov_len = uiop->uio_resid = bp->b_bcount;
		io.iov_base = bp->b_data;
		uiop->uio_rw = UIO_READ;
		switch (vp->v_type) {
		case VREG:
			uiop->uio_offset = (off_t)bp->b_blkno * DEV_BSIZE;
			nfsstats.read_bios++;
			error = nfs_readrpc(vp, uiop, cr);
			FSDBG(262, np->n_size, bp->b_blkno * DEV_BSIZE,
			      uiop->uio_resid, error);
			if (!error) {
			    if (uiop->uio_resid) {
				/*
				 * If len > 0, there is a hole in the file and
				 * no writes after the hole have been pushed to
				 * the server yet.
				 * Just zero fill the rest of the valid area.
				 */
				diff = bp->b_bcount - uiop->uio_resid;
				len = np->n_size - ((u_quad_t)bp->b_blkno *
						    DEV_BSIZE + diff);
				if (len > 0) {
					len = min(len, uiop->uio_resid);
					bzero((char *)bp->b_data + diff, len);
					bp->b_validend = diff + len;
					FSDBG(258, diff, len, 0, 1);
				} else
					bp->b_validend = diff;
			    } else
				bp->b_validend = bp->b_bcount;
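			    /*
			     * Example: a 4096-byte buffer whose read RPC
			     * returned only 2048 bytes over a hole has
			     * diff = 2048; up to uio_resid more bytes are
			     * zeroed from there and b_validend extended,
			     * since unwritten parts of a hole must read
			     * back as zeros.
			     */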
			    if (bp->b_validend < bp->b_bufsize) {
				/*
				 * We're about to release a partial buffer
				 * after a read... the only way we should get
				 * here is if this buffer contains the EOF.
				 * Before releasing it, we'll zero out to the
				 * end of the buffer so that if a mmap of this
				 * page occurs, we'll see zeros even if a
				 * ftruncate extends the file in the meantime.
				 */
				bzero((caddr_t)(bp->b_data + bp->b_validend),
				      bp->b_bufsize - bp->b_validend);
				FSDBG(258, bp->b_validend,
				      bp->b_bufsize - bp->b_validend, 0, 2);
			    }
			}
			if (p && (vp->v_flag & VTEXT) &&
			    (((nmp->nm_flag & NFSMNT_NQNFS) &&
			      NQNFS_CKINVALID(vp, np, ND_READ) &&
			      np->n_lrev != np->n_brev) ||
			     (!(nmp->nm_flag & NFSMNT_NQNFS) &&
			      np->n_mtime != np->n_vattr.va_mtime.tv_sec))) {
				uprintf("Process killed due to text file modification\n");
				psignal(p, SIGKILL);
				p->p_flag |= P_NOSWAP;
			}
			break;
		case VLNK:
			uiop->uio_offset = (off_t)0;
			nfsstats.readlink_bios++;
			error = nfs_readlinkrpc(vp, uiop, cr);
			break;
		case VDIR:
			nfsstats.readdir_bios++;
			uiop->uio_offset = ((u_quad_t)bp->b_lblkno) * NFS_DIRBLKSIZ;
			if (!(nmp->nm_flag & NFSMNT_NFSV3))
				nmp->nm_flag &= ~NFSMNT_RDIRPLUS; /* dk@farm.org */
			if (nmp->nm_flag & NFSMNT_RDIRPLUS) {
				error = nfs_readdirplusrpc(vp, uiop, cr);
				if (error == NFSERR_NOTSUPP)
					nmp->nm_flag &= ~NFSMNT_RDIRPLUS;
			}
			if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0)
				error = nfs_readdirrpc(vp, uiop, cr);
			break;
		default:
			printf("nfs_doio: type %x unexpected\n", vp->v_type);
			break;
		};
		if (error) {
			SET(bp->b_flags, B_ERROR);
			bp->b_error = error;
		}
	} else {
		/*
		 * mapped I/O may have altered any bytes, so we extend
		 * the dirty zone to the valid zone.  For best performance
		 * a better solution would be to save & restore page dirty
		 * bits around the uiomove which brings write-data into the
		 * buffer.  Then here we'd check if the page is dirty rather
		 * than WASMAPPED.  Also vnode_pager would change - if a page
		 * is clean it might still need to be written due to DELWRI.
		 */
		if (UBCINFOEXISTS(vp) && ubc_issetflags(vp, UI_WASMAPPED)) {
			bp->b_dirtyoff = min(bp->b_dirtyoff, bp->b_validoff);
			bp->b_dirtyend = max(bp->b_dirtyend, bp->b_validend);
		}
		if ((off_t)bp->b_blkno * DEV_BSIZE + bp->b_dirtyend > np->n_size)
			bp->b_dirtyend = np->n_size - (off_t)bp->b_blkno *
					 DEV_BSIZE;
		if (bp->b_dirtyend > bp->b_dirtyoff) {
			io.iov_len = uiop->uio_resid = bp->b_dirtyend -
						       bp->b_dirtyoff;
			uiop->uio_offset = (off_t)bp->b_blkno * DEV_BSIZE +
					   bp->b_dirtyoff;
			io.iov_base = (char *)bp->b_data + bp->b_dirtyoff;
			uiop->uio_rw = UIO_WRITE;

			nfsstats.write_bios++;
			if ((bp->b_flags & (B_ASYNC | B_NEEDCOMMIT | B_NOCACHE)) ==
			    B_ASYNC)
				iomode = NFSV3WRITE_UNSTABLE;
			else
				iomode = NFSV3WRITE_FILESYNC;
			SET(bp->b_flags, B_WRITEINPROG);
			error = nfs_writerpc(vp, uiop, cr, &iomode, &must_commit);
			if (!error && iomode == NFSV3WRITE_UNSTABLE)
				SET(bp->b_flags, B_NEEDCOMMIT);
			else
				CLR(bp->b_flags, B_NEEDCOMMIT);
			CLR(bp->b_flags, B_WRITEINPROG);
			/*
			 * For an interrupted write, the buffer is still valid
			 * and the write hasn't been pushed to the server yet,
			 * so we can't set B_ERROR and report the interruption
			 * by setting B_EINTR.  For the B_ASYNC case, B_EINTR
			 * is not relevant, so the rpc attempt is essentially
			 * a noop.  For the case of a V3 write rpc not being
			 * committed to stable storage, the block is still
			 * dirty and requires either a commit rpc or another
			 * write rpc with iomode == NFSV3WRITE_FILESYNC before
			 * the block is reused.  This is indicated by setting
			 * the B_DELWRI and B_NEEDCOMMIT flags.
			 */
			if (error == EINTR ||
			    (!error && bp->b_flags & B_NEEDCOMMIT)) {
				int s;

				CLR(bp->b_flags, B_INVAL | B_NOCACHE);
				if (!ISSET(bp->b_flags, B_DELWRI)) {
					SET(bp->b_flags, B_DELWRI);
					nbdwrite++;
				}
				FSDBG(261, bp->b_validoff, bp->b_validend,
				      bp->b_bufsize, bp->b_bcount);
				/*
				 * Since for the B_ASYNC case, nfs_bwrite() has
				 * reassigned the buffer to the clean list, we
				 * have to reassign it back to the dirty one.
				 * Ugh.
				 */
				if (ISSET(bp->b_flags, B_ASYNC)) {
					s = splbio();
					reassignbuf(bp, vp);
					splx(s);
				} else
					SET(bp->b_flags, B_EINTR);
			} else {
				if (error) {
					SET(bp->b_flags, B_ERROR);
					bp->b_error = np->n_error = error;
					np->n_flag |= NWRITEERR;
				}
				bp->b_dirtyoff = bp->b_dirtyend = 0;
				/*
				 * validoff and validend represent the real
				 * data present in this buffer.  If validoff
				 * is non-zero, then we have to invalidate the
				 * buffer and kill the page when biodone is
				 * called... the same is also true when
				 * validend doesn't extend all the way to the
				 * end of the buffer and validend doesn't
				 * equate to the current EOF... eventually we
				 * need to deal with this in a more humane way
				 * (like keeping the partial buffer without
				 * making it immediately available to the VM
				 * page cache).
				 */
				if (bp->b_validoff)
					SET(bp->b_flags, B_INVAL);
				else if (bp->b_validend < bp->b_bufsize) {
					if ((off_t)bp->b_blkno * DEV_BSIZE +
					    bp->b_validend == np->n_size) {
						bzero((caddr_t)(bp->b_data +
						      bp->b_validend),
						      bp->b_bufsize -
						      bp->b_validend);
						FSDBG(259, bp->b_validend,
						      bp->b_bufsize -
						      bp->b_validend, 0, 0);
					} else
						SET(bp->b_flags, B_INVAL);
				}
			}
		} else {
			if (bp->b_validoff ||
			    (bp->b_validend < bp->b_bufsize &&
			     (off_t)bp->b_blkno * DEV_BSIZE + bp->b_validend !=
			     np->n_size)) {
				SET(bp->b_flags, B_INVAL);
			}
			if (bp->b_flags & B_INVAL) {
				FSDBG(260, bp->b_validoff, bp->b_validend,
				      bp->b_bufsize, bp->b_bcount);
			}
			bp->b_resid = 0;
			biodone(bp);
			FSDBG_BOT(256, bp->b_validoff, bp->b_validend,
				  bp->b_bufsize, np->n_size);
			return (0);
		}
	}
	bp->b_resid = uiop->uio_resid;
	if (must_commit)
		nfs_clearcommit(vp->v_mount);

	if (bp->b_flags & B_INVAL) {
		FSDBG(260, bp->b_validoff, bp->b_validend, bp->b_bufsize,
		      bp->b_bcount);
	}
	FSDBG_BOT(256, bp->b_validoff, bp->b_validend, bp->b_bcount, error);

	biodone(bp);
	return (error);
}