/*
 * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License").  You may not use this file except in compliance with the
 * License.  Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)nfs_bio.c	8.9 (Berkeley) 3/30/95
 * FreeBSD-Id: nfs_bio.c,v 1.44 1997/09/10 19:52:25 phk Exp $
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/ubc.h>
#include <sys/vmparam.h>

#include <kern/clock.h>

#include <nfs/rpcv2.h>
#include <nfs/nfsproto.h>
#include <nfs/nfs.h>
#include <nfs/nfsmount.h>
#include <nfs/nqnfs.h>
#include <nfs/nfsnode.h>

#include <sys/kdebug.h>
static struct buf *nfs_getcacheblk __P((struct vnode *vp, daddr_t bn, int size,
		struct proc *p, int operation));
static struct buf *nfs_getwriteblk __P((struct vnode *vp, daddr_t bn,
		int size, struct proc *p,
		struct ucred *cred, int off, int len));

extern int nfs_numasync;
extern struct nfsstats nfsstats;
/*
 * Vnode op for read using bio
 * Any similarity to readip() is purely coincidental
 */
int
nfs_bioread(vp, uio, ioflag, cred, getpages)
    register struct vnode *vp;
    register struct uio *uio;
    int ioflag;
    struct ucred *cred;
    int getpages;
{
    register struct nfsnode *np = VTONFS(vp);
    register int biosize, diff, i;
    struct buf *bp = 0, *rabp;
    struct vattr vattr;
    struct proc *p;
    struct nfsmount *nmp = VFSTONFS(vp->v_mount);
    daddr_t lbn, rabn;
    int bufsize;
    int nra, error = 0, n = 0, on = 0, not_readin;
    int operation = (getpages ? BLK_PAGEIN : BLK_READ);

    if (uio->uio_rw != UIO_READ)
        panic("nfs_read mode");
    if (uio->uio_resid == 0)
        return (0);
    if (uio->uio_offset < 0)
        return (EINVAL);
    p = uio->uio_procp;
    if ((nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_GOTFSINFO)) == NFSMNT_NFSV3)
        (void)nfs_fsinfo(nmp, vp, cred, p);
    /* due to getblk/vm interactions, use vm page size or less values */
    biosize = min(vp->v_mount->mnt_stat.f_iosize, PAGE_SIZE);
    /*
     * For nfs, cache consistency can only be maintained approximately.
     * Although RFC1094 does not specify the criteria, the following is
     * believed to be compatible with the reference port.
     * For nqnfs, full cache consistency is maintained within the loop.
     * For nfs:
     * If the file's modify time on the server has changed since the
     * last read rpc or you have written to the file,
     * you may have lost data cache consistency with the
     * server, so flush all of the file's data out of the cache.
     * Then force a getattr rpc to ensure that you have up to date
     * attributes.
     * NB: This implies that cache data can be read when up to
     * NFS_ATTRTIMEO seconds out of date. If you find that you need current
     * attributes this could be forced by setting n_attrstamp to 0 before
     * the VOP_GETATTR() call.
     */
    if ((nmp->nm_flag & NFSMNT_NQNFS) == 0) {
        if (np->n_flag & NMODIFIED) {
            if (vp->v_type != VREG) {
                if (vp->v_type != VDIR)
                    panic("nfs: bioread, not dir");
                nfs_invaldir(vp);
                error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
                if (error)
                    return (error);
            }
            np->n_attrstamp = 0;
            error = VOP_GETATTR(vp, &vattr, cred, p);
            if (error)
                return (error);
            np->n_mtime = vattr.va_mtime.tv_sec;
        } else {
            error = VOP_GETATTR(vp, &vattr, cred, p);
            if (error)
                return (error);
            if (np->n_mtime != vattr.va_mtime.tv_sec) {
                if (vp->v_type == VDIR)
                    nfs_invaldir(vp);
                error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
                if (error)
                    return (error);
                np->n_mtime = vattr.va_mtime.tv_sec;
            }
        }
    }
    do {

        /*
         * Get a valid lease. If cached data is stale, flush it.
         */
        if (nmp->nm_flag & NFSMNT_NQNFS) {
            if (NQNFS_CKINVALID(vp, np, ND_READ)) {
                do {
                    error = nqnfs_getlease(vp, ND_READ, cred, p);
                } while (error == NQNFS_EXPIRED);
                if (error)
                    return (error);
                if (np->n_lrev != np->n_brev ||
                    (np->n_flag & NQNFSNONCACHE) ||
                    ((np->n_flag & NMODIFIED) && vp->v_type == VDIR)) {
                    if (vp->v_type == VDIR)
                        nfs_invaldir(vp);
                    error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
                    if (error)
                        return (error);
                    np->n_brev = np->n_lrev;
                }
            } else if (vp->v_type == VDIR && (np->n_flag & NMODIFIED)) {
                nfs_invaldir(vp);
                error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
                if (error)
                    return (error);
            }
        }
        if (np->n_flag & NQNFSNONCACHE) {
            switch (vp->v_type) {
            case VREG:
                return (nfs_readrpc(vp, uio, cred));
            case VLNK:
                return (nfs_readlinkrpc(vp, uio, cred));
            case VDIR:
                break;
            default:
                printf(" NQNFSNONCACHE: type %x unexpected\n",
                    vp->v_type);
            };
        }
        switch (vp->v_type) {
        case VREG:
            nfsstats.biocache_reads++;
            lbn = uio->uio_offset / biosize;
            on = uio->uio_offset & (biosize - 1);
            not_readin = 1;

            /*
             * Start the read ahead(s), as required.
             */
            if (nfs_numasync > 0 && nmp->nm_readahead > 0) {
                for (nra = 0; nra < nmp->nm_readahead &&
                    (off_t)(lbn + 1 + nra) * biosize < np->n_size; nra++) {
                    rabn = lbn + 1 + nra;
                    if (!incore(vp, rabn)) {
                        rabp = nfs_getcacheblk(vp, rabn, biosize, p,
                            operation);
                        if (!rabp)
                            return (EINTR);
                        if (!ISSET(rabp->b_flags, (B_CACHE|B_DELWRI))) {
                            SET(rabp->b_flags, (B_READ | B_ASYNC));
                            if (nfs_asyncio(rabp, cred)) {
                                SET(rabp->b_flags, (B_INVAL|B_ERROR));
                                rabp->b_error = EIO;
                                brelse(rabp);
                            }
                        } else
                            brelse(rabp);
                    }
                }
            }

            /*
             * If the block is in the cache and has the required data
             * in a valid region, just copy it out.
             * Otherwise, get the block and write back/read in,
             * as required.
             */
again:
            bufsize = biosize;
            if ((off_t)(lbn + 1) * biosize > np->n_size &&
                (off_t)(lbn + 1) * biosize - np->n_size < biosize) {
                bufsize = np->n_size - lbn * biosize;
                bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
            }
            bp = nfs_getcacheblk(vp, lbn, bufsize, p, operation);
            if (!bp)
                return (EINTR);

            if (!ISSET(bp->b_flags, B_CACHE)) {
                SET(bp->b_flags, B_READ);
                CLR(bp->b_flags, (B_DONE | B_ERROR | B_INVAL));
                not_readin = 0;
                error = nfs_doio(bp, cred, p);
                if (error) {
                    brelse(bp);
                    return (error);
                }
            }
            n = min((unsigned)(bufsize - on), uio->uio_resid);
            diff = np->n_size - uio->uio_offset;
            if (diff < n)
                n = diff;
            if (not_readin && n > 0) {
                if (on < bp->b_validoff || (on + n) > bp->b_validend) {
                    SET(bp->b_flags, (B_NOCACHE|B_INVAFTERWRITE));
                    if (bp->b_dirtyend > 0) {
                        if (!ISSET(bp->b_flags, B_DELWRI))
                            panic("nfsbioread");
                        if (VOP_BWRITE(bp) == EINTR)
                            return (EINTR);
                    } else
                        brelse(bp);
                    goto again;
                }
            }
            diff = (on >= bp->b_validend) ? 0 : (bp->b_validend - on);
            if (diff < n)
                n = diff;
            break;
        case VLNK:
            nfsstats.biocache_readlinks++;
            bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, p,
                operation);
            if (!bp)
                return (EINTR);
            if (!ISSET(bp->b_flags, B_CACHE)) {
                SET(bp->b_flags, B_READ);
                error = nfs_doio(bp, cred, p);
                if (error) {
                    SET(bp->b_flags, B_ERROR);
                    brelse(bp);
                    return (error);
                }
            }
            n = min(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid);
            on = 0;
            break;
        case VDIR:
            nfsstats.biocache_readdirs++;
            if (np->n_direofoffset
                && uio->uio_offset >= np->n_direofoffset) {
                return (0);
            }
            lbn = uio->uio_offset / NFS_DIRBLKSIZ;
            on = uio->uio_offset & (NFS_DIRBLKSIZ - 1);
            bp = nfs_getcacheblk(vp, lbn, NFS_DIRBLKSIZ, p, operation);
            if (!bp)
                return (EINTR);
            if (!ISSET(bp->b_flags, B_CACHE)) {
                SET(bp->b_flags, B_READ);
                error = nfs_doio(bp, cred, p);
                if (error)
                    brelse(bp);
                while (error == NFSERR_BAD_COOKIE) {
                    nfs_invaldir(vp);
                    error = nfs_vinvalbuf(vp, 0, cred, p, 1);
                    /*
                     * Yuck! The directory has been modified on the
                     * server. The only way to get the block is by
                     * reading from the beginning to get all the
                     * offset cookies.
                     */
                    for (i = 0; i <= lbn && !error; i++) {
                        if (np->n_direofoffset
                            && (i * NFS_DIRBLKSIZ) >= np->n_direofoffset)
                            return (0);
                        bp = nfs_getcacheblk(vp, i, NFS_DIRBLKSIZ, p,
                            operation);
                        if (!bp)
                            return (EINTR);
                        if (!ISSET(bp->b_flags, B_DONE)) {
                            SET(bp->b_flags, B_READ);
                            error = nfs_doio(bp, cred, p);
                            if (error)
                                brelse(bp);
                        }
                        if (!error && i < lbn)
                            brelse(bp);
                    }
                }
                if (error)
                    return (error);
            }

            /*
             * If not eof and read aheads are enabled, start one.
             * (You need the current block first, so that you have the
             *  directory offset cookie of the next block.)
             */
            if (nfs_numasync > 0 && nmp->nm_readahead > 0 &&
                (np->n_direofoffset == 0 ||
                (lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) &&
                !(np->n_flag & NQNFSNONCACHE) &&
                !incore(vp, lbn + 1)) {
                rabp = nfs_getcacheblk(vp, lbn + 1, NFS_DIRBLKSIZ, p,
                    operation);
                if (rabp) {
                    if (!ISSET(rabp->b_flags, (B_CACHE|B_DELWRI))) {
                        SET(rabp->b_flags, (B_READ | B_ASYNC));
                        if (nfs_asyncio(rabp, cred)) {
                            SET(rabp->b_flags, (B_INVAL|B_ERROR));
                            rabp->b_error = EIO;
                            brelse(rabp);
                        }
                    } else
                        brelse(rabp);
                }
            }
            /*
             * Make sure we use a signed variant of min() since
             * the second term may be negative.
             */
            n = lmin(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid - on);
            break;
        default:
            printf(" nfs_bioread: type %x unexpected\n", vp->v_type);
            break;
        };

        if (n > 0)
            error = uiomove(bp->b_data + on, (int)n, uio);
        switch (vp->v_type) {
        case VREG:
            break;
        case VLNK:
            n = 0;
            break;
        case VDIR:
            if (np->n_flag & NQNFSNONCACHE)
                SET(bp->b_flags, B_INVAL);
            break;
        default:
            printf(" nfs_bioread: type %x unexpected\n", vp->v_type);
        }
        brelse(bp);
    } while (error == 0 && uio->uio_resid > 0 && n > 0);
    return (error);
}
/*
 * Vnode op for write using bio
 */
int
nfs_write(ap)
    struct vop_write_args /* {
        struct vnode *a_vp;
        struct uio *a_uio;
        int  a_ioflag;
        struct ucred *a_cred;
    } */ *ap;
{
    register int biosize;
    register struct uio *uio = ap->a_uio;
    struct proc *p = uio->uio_procp;
    register struct vnode *vp = ap->a_vp;
    struct nfsnode *np = VTONFS(vp);
    register struct ucred *cred = ap->a_cred;
    int ioflag = ap->a_ioflag;
    struct buf *bp;
    struct vattr vattr;
    struct nfsmount *nmp = VFSTONFS(vp->v_mount);
    daddr_t lbn;
    int bufsize;
    int n, on, error = 0, iomode, must_commit;

    if (uio->uio_rw != UIO_WRITE)
        panic("nfs_write mode");
    if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != current_proc())
        panic("nfs_write proc");
    if (vp->v_type != VREG)
        return (EIO);
    if (np->n_flag & NWRITEERR) {
        np->n_flag &= ~NWRITEERR;
        return (np->n_error);
    }
    if ((nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_GOTFSINFO)) == NFSMNT_NFSV3)
        (void)nfs_fsinfo(nmp, vp, cred, p);
    if (ioflag & (IO_APPEND | IO_SYNC)) {
        if (np->n_flag & NMODIFIED) {
            np->n_attrstamp = 0;
            error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
            if (error)
                return (error);
        }
        if (ioflag & IO_APPEND) {
            np->n_attrstamp = 0;
            error = VOP_GETATTR(vp, &vattr, cred, p);
            if (error)
                return (error);
            uio->uio_offset = np->n_size;
        }
    }
    if (uio->uio_offset < 0)
        return (EINVAL);
    if (uio->uio_resid == 0)
        return (0);
    /*
     * Maybe this should be above the vnode op call, but so long as
     * file servers have no limits, i don't think it matters
     */
    if (p && uio->uio_offset + uio->uio_resid >
          p->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
        psignal(p, SIGXFSZ);
        return (EFBIG);
    }
    /*
     * I use nm_rsize, not nm_wsize so that all buffer cache blocks
     * will be the same size within a filesystem. nfs_writerpc will
     * still use nm_wsize when sizing the rpc's.
     */
    /* due to getblk/vm interactions, use vm page size or less values */
    biosize = min(vp->v_mount->mnt_stat.f_iosize, PAGE_SIZE);

    do {
        /*
         * Check for a valid write lease.
         */
        if ((nmp->nm_flag & NFSMNT_NQNFS) &&
            NQNFS_CKINVALID(vp, np, ND_WRITE)) {
            do {
                error = nqnfs_getlease(vp, ND_WRITE, cred, p);
            } while (error == NQNFS_EXPIRED);
            if (error)
                return (error);
            if (np->n_lrev != np->n_brev ||
                (np->n_flag & NQNFSNONCACHE)) {
                error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
                if (error)
                    return (error);
                np->n_brev = np->n_lrev;
            }
        }
        if ((np->n_flag & NQNFSNONCACHE) && uio->uio_iovcnt == 1) {
            iomode = NFSV3WRITE_FILESYNC;
            error = nfs_writerpc(vp, uio, cred, &iomode, &must_commit);
            if (must_commit)
                nfs_clearcommit(vp->v_mount);
            return (error);
        }
        nfsstats.biocache_writes++;
        lbn = uio->uio_offset / biosize;
        on = uio->uio_offset & (biosize - 1);
        n = min((unsigned)(biosize - on), uio->uio_resid);
again:
        if (uio->uio_offset + n > np->n_size) {
            np->n_size = uio->uio_offset + n;
            np->n_flag |= NMODIFIED;
            if (UBCISVALID(vp))
                ubc_setsize(vp, (off_t)np->n_size); /* XXX check error */
        }
        bufsize = biosize;
        /* (removed for UBC) */
        if ((lbn + 1) * biosize > np->n_size) {
            bufsize = np->n_size - lbn * biosize;
            bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
        }
        bp = nfs_getwriteblk(vp, lbn, bufsize, p, cred, on, n);
        if (!bp)
            return (EINTR);
        if (ISSET(bp->b_flags, B_ERROR)) {
            error = bp->b_error;
            brelse(bp);
            return (error);
        }
        if (bp->b_wcred == NOCRED) {
            /*
             * NFS has embedded ucred.
             * Can not crhold() here as that causes zone corruption
             */
            bp->b_wcred = crdup(cred);
        }
        np->n_flag |= NMODIFIED;

        /*
         * Check for valid write lease and get one as required.
         * In case getblk() and/or bwrite() delayed us.
         */
        if ((nmp->nm_flag & NFSMNT_NQNFS) &&
            NQNFS_CKINVALID(vp, np, ND_WRITE)) {
            do {
                error = nqnfs_getlease(vp, ND_WRITE, cred, p);
            } while (error == NQNFS_EXPIRED);
            if (error) {
                brelse(bp);
                return (error);
            }
            if (np->n_lrev != np->n_brev ||
                (np->n_flag & NQNFSNONCACHE)) {
                brelse(bp);
                error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
                if (error)
                    return (error);
                np->n_brev = np->n_lrev;
                goto again;
            }
        }
        error = uiomove((char *)bp->b_data + on, n, uio);
        if (error) {
            SET(bp->b_flags, B_ERROR);
            brelse(bp);
            return (error);
        }
        if (bp->b_dirtyend > 0) {
            bp->b_dirtyoff = min(on, bp->b_dirtyoff);
            bp->b_dirtyend = max((on + n), bp->b_dirtyend);
        } else {
            bp->b_dirtyoff = on;
            bp->b_dirtyend = on + n;
        }
        if (bp->b_validend == 0 || bp->b_validend < bp->b_dirtyoff ||
            bp->b_validoff > bp->b_dirtyend) {
            bp->b_validoff = bp->b_dirtyoff;
            bp->b_validend = bp->b_dirtyend;
        } else {
            bp->b_validoff = min(bp->b_validoff, bp->b_dirtyoff);
            bp->b_validend = max(bp->b_validend, bp->b_dirtyend);
        }

        /*
         * Since this block is being modified, it must be written
         * again and not just committed.
         */
        CLR(bp->b_flags, B_NEEDCOMMIT);

        /*
         * If the lease is non-cachable or IO_SYNC do bwrite().
         */
        if ((np->n_flag & NQNFSNONCACHE) || (ioflag & IO_SYNC)) {
            bp->b_proc = p;
            error = VOP_BWRITE(bp);
            if (error)
                return (error);
            if (np->n_flag & NQNFSNONCACHE) {
                error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
                if (error)
                    return (error);
            }
        } else if ((n + on) == biosize &&
            (nmp->nm_flag & NFSMNT_NQNFS) == 0) {
            bp->b_proc = (struct proc *)0;
            SET(bp->b_flags, B_ASYNC);
            (void)nfs_writebp(bp, 0);
        } else
            bdwrite(bp);
    } while (uio->uio_resid > 0 && n > 0);
    return (0);
}
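/*
 * Editor's note (not part of the original file): the dirty-region
 * bookkeeping above widens (b_dirtyoff, b_dirtyend) to cover each new
 * write and then grows the valid region to contain the dirty one.  Below
 * is a minimal user-space sketch of those update rules; the struct and
 * function names are hypothetical stand-ins for struct buf fields.
 */
#if 0 /* illustrative sketch */
#include <stdio.h>

struct region { int dirtyoff, dirtyend, validoff, validend; };

static void
record_write(struct region *b, int on, int n)
{
    /* Widen the dirty region to include [on, on+n). */
    if (b->dirtyend > 0) {
        b->dirtyoff = b->dirtyoff < on ? b->dirtyoff : on;
        b->dirtyend = b->dirtyend > on + n ? b->dirtyend : on + n;
    } else {
        b->dirtyoff = on;
        b->dirtyend = on + n;
    }
    /* The valid region must always cover the dirty region. */
    if (b->validend == 0 || b->validend < b->dirtyoff ||
        b->validoff > b->dirtyend) {
        b->validoff = b->dirtyoff;
        b->validend = b->dirtyend;
    } else {
        b->validoff = b->validoff < b->dirtyoff ? b->validoff : b->dirtyoff;
        b->validend = b->validend > b->dirtyend ? b->validend : b->dirtyend;
    }
}

int
main(void)
{
    struct region b = { 0, 0, 0, 0 };

    record_write(&b, 100, 50);      /* dirty becomes [100,150) */
    record_write(&b, 120, 200);     /* widens to [100,320) */
    printf("dirty [%d,%d) valid [%d,%d)\n",
        b.dirtyoff, b.dirtyend, b.validoff, b.validend);
    return 0;
}
#endif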
/*
 * Get a cache block for writing.  The range to be written is
 * (off..off+len) within the block.  This routine ensures that the
 * block either has no dirty region or that the given range is
 * contiguous with the existing dirty region.
 */
static struct buf *
nfs_getwriteblk(vp, bn, size, p, cred, off, len)
    struct vnode *vp;
    daddr_t bn;
    int size;
    struct proc *p;
    struct ucred *cred;
    int off, len;
{
    struct nfsnode *np = VTONFS(vp);
    struct buf *bp;
    int error;
    struct iovec iov;
    struct uio uio;
    off_t boff, start, end;

    bp = nfs_getcacheblk(vp, bn, size, p, BLK_WRITE);
    if (!bp)
        return (NULL);
    if (bp->b_wcred == NOCRED) {
        /*
         * NFS has embedded ucred.
         * Can not crhold() here as that causes zone corruption
         */
        bp->b_wcred = crdup(cred);
    }

    if ((bp->b_blkno * DEV_BSIZE) + bp->b_dirtyend > np->n_size) {
        bp->b_dirtyend = np->n_size - (bp->b_blkno * DEV_BSIZE);
    }

    /*
     * UBC doesn't (yet) handle partial pages so nfs_biowrite was
     * hacked to never bdwrite, to start every little write right away.
     * Running IE Avie noticed the performance problem, thus this code,
     * which permits those delayed writes by ensuring an initial read
     * of the entire page.  The read may hit eof ("short read") but
     * that we will handle.
     *
     * We are quite dependent on the correctness of B_CACHE so check
     * that first in case of problems.
     */
    if (!ISSET(bp->b_flags, B_CACHE) && len < PAGE_SIZE) {
        struct nfsnode *np = VTONFS(vp);

        boff = (off_t)bp->b_blkno * DEV_BSIZE;
        uio.uio_iov = &iov;
        uio.uio_iovcnt = 1;
        uio.uio_offset = boff;
        uio.uio_resid = PAGE_SIZE;
        uio.uio_segflg = UIO_SYSSPACE;
        uio.uio_rw = UIO_READ;
        uio.uio_procp = p;
        iov.iov_base = bp->b_data;
        iov.iov_len = PAGE_SIZE;
        error = nfs_readrpc(vp, &uio, cred);
        if (error) {
            bp->b_error = error;
            SET(bp->b_flags, B_ERROR);
            printf("nfs_getwriteblk: readrpc returned %d", error);
        }
        if (uio.uio_resid > 0)
            bzero(iov.iov_base, uio.uio_resid);
        bp->b_validoff = 0;
        bp->b_validend = PAGE_SIZE - uio.uio_resid;
        if (np->n_size > boff + bp->b_validend)
            bp->b_validend = min(np->n_size - boff, PAGE_SIZE);
        bp->b_dirtyoff = 0;
        bp->b_dirtyend = 0;
    }

    /*
     * If the new write will leave a contiguous dirty
     * area, just update the b_dirtyoff and b_dirtyend,
     * otherwise try to extend the dirty region.
     */
    if (bp->b_dirtyend > 0 &&
        (off > bp->b_dirtyend || (off + len) < bp->b_dirtyoff)) {
        boff = (off_t)bp->b_blkno * DEV_BSIZE;
        if (off > bp->b_dirtyend) {
            start = boff + bp->b_validend;
            end = boff + off;
        } else {
            start = boff + off + len;
            end = boff + bp->b_validoff;
        }

        /*
         * It may be that the valid region in the buffer
         * covers the region we want, in which case just
         * extend the dirty region.  Otherwise we try to
         * extend the valid region.
         */
        if (start < end) {
            uio.uio_iov = &iov;
            uio.uio_iovcnt = 1;
            uio.uio_offset = start;
            uio.uio_resid = end - start;
            uio.uio_segflg = UIO_SYSSPACE;
            uio.uio_rw = UIO_READ;
            uio.uio_procp = p;
            iov.iov_base = bp->b_data + (start - boff);
            iov.iov_len = end - start;
            error = nfs_readrpc(vp, &uio, cred);
            if (error) {
                /*
                 * If we couldn't read, do not do a VOP_BWRITE
                 * as originally coded. That could also error
                 * and, looping back to "again" as it was doing,
                 * could have us stuck trying to write the same
                 * buffer again. nfs_write will get the entire region
                 * if nfs_readrpc was successful. If not successful
                 * we should just error out. Errors like ESTALE
                 * would keep us in this loop rather than transient
                 * errors justifying a retry. We can return from here
                 * instead of altering the dirty region later in the
                 * routine. We did not write out the old dirty region
                 * at this point.
                 */
                bp->b_error = error;
                SET(bp->b_flags, B_ERROR);
                printf("nfs_getwriteblk: readrpc (2) returned %d", error);
                return (bp);
            } else {
                if (uio.uio_resid > 0) {
                    /*
                     * If there was a short read,
                     * just zero fill.
                     */
                    bzero(iov.iov_base, uio.uio_resid);
                }
                if (off > bp->b_dirtyend)
                    bp->b_validend = off;
                else
                    bp->b_validoff = off + len;
            }
        }

        /*
         * We now have a valid region which extends up to the
         * dirty region which we want.
         */
        if (off > bp->b_dirtyend)
            bp->b_dirtyend = off;
        else
            bp->b_dirtyoff = off + len;
    }

    return (bp);
}
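/*
 * Editor's note (not part of the original file): when a new write is not
 * contiguous with the existing dirty region, nfs_getwriteblk reads the gap
 * between the valid region and the write so the dirty region stays one
 * contiguous run.  The sketch below reproduces that start/end computation
 * in block-relative offsets (the kernel code works in absolute offsets by
 * adding boff); the function name is hypothetical.
 */
#if 0 /* illustrative sketch */
#include <stdio.h>

/* Given current valid/dirty bounds and a new write [off, off+len),
 * report the range that must be read to bridge any gap. */
static void
bridge_range(int validoff, int validend, int dirtyoff, int dirtyend,
    int off, int len)
{
    int start, end;

    if (dirtyend <= 0 || (off <= dirtyend && off + len >= dirtyoff)) {
        printf("write is contiguous with the dirty region; no read\n");
        return;
    }
    if (off > dirtyend) {       /* writing beyond the dirty region */
        start = validend;
        end = off;
    } else {                    /* writing before the dirty region */
        start = off + len;
        end = validoff;
    }
    if (start < end)
        printf("must read [%d,%d) from the server\n", start, end);
    else
        printf("valid region already covers the gap\n");
}

int
main(void)
{
    /* dirty [256,512), valid [0,512), new write at [1024,1124) */
    bridge_range(0, 512, 256, 512, 1024, 100); /* -> read [512,1024) */
    return 0;
}
#endif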
/*
 * Get an nfs cache block.
 * Allocate a new one if the block isn't currently in the cache
 * and return the block marked busy. If the calling process is
 * interrupted by a signal for an interruptible mount point, return
 * NULL.
 */
static struct buf *
nfs_getcacheblk(vp, bn, size, p, operation)
    struct vnode *vp;
    daddr_t bn;
    int size;
    struct proc *p;
    int operation;	/* defined in sys/buf.h */
{
    register struct buf *bp;
    struct nfsmount *nmp = VFSTONFS(vp->v_mount);
    /* due to getblk/vm interactions, use vm page size or less values */
    int biosize = min(vp->v_mount->mnt_stat.f_iosize, PAGE_SIZE);

    if (nmp->nm_flag & NFSMNT_INT) {
        bp = getblk(vp, bn, size, PCATCH, 0, operation);
        while (bp == (struct buf *)0) {
            if (nfs_sigintr(nmp, (struct nfsreq *)0, p))
                return ((struct buf *)0);
            bp = getblk(vp, bn, size, 0, 2 * hz, operation);
        }
    } else
        bp = getblk(vp, bn, size, 0, 0, operation);

    if (vp->v_type == VREG)
        bp->b_blkno = (bn * biosize) / DEV_BSIZE;

    return (bp);
}
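/*
 * Editor's note (not part of the original file): on an interruptible
 * (NFSMNT_INT) mount, the first getblk() uses PCATCH and may come back
 * empty; the routine then polls with a timeout, bailing out if a signal
 * arrives.  The standalone sketch below shows the shape of that retry
 * loop; try_getblk() and signal_pending() are invented stand-ins, not
 * kernel APIs.
 */
#if 0 /* illustrative sketch */
#include <stdio.h>

static int attempts;
static void *try_getblk(void) { return ++attempts < 3 ? NULL : &attempts; }
static int signal_pending(void) { return 0; }

int
main(void)
{
    void *bp = try_getblk();            /* first try (kernel: PCATCH) */

    while (bp == NULL) {
        if (signal_pending())
            return 1;                   /* caller sees NULL -> EINTR */
        bp = try_getblk();              /* retry (kernel waits 2*hz here) */
    }
    printf("got buffer after %d attempts\n", attempts);
    return 0;
}
#endif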
/*
 * Flush and invalidate all dirty buffers. If another process is already
 * doing the flush, just wait for completion.
 */
int
nfs_vinvalbuf(vp, flags, cred, p, intrflg)
    struct vnode *vp;
    int flags;
    struct ucred *cred;
    struct proc *p;
    int intrflg;
{
    register struct nfsnode *np = VTONFS(vp);
    struct nfsmount *nmp = VFSTONFS(vp->v_mount);
    int error = 0, slpflag, slptimeo;
    int didhold = 0;

    if ((nmp->nm_flag & NFSMNT_INT) == 0)
        intrflg = 0;
    if (intrflg) {
        slpflag = PCATCH;
        slptimeo = 2 * hz;
    } else {
        slpflag = 0;
        slptimeo = 0;
    }
    /*
     * First wait for any other process doing a flush to complete.
     */
    while (np->n_flag & NFLUSHINPROG) {
        np->n_flag |= NFLUSHWANT;
        error = tsleep((caddr_t)&np->n_flag, PRIBIO + 2, "nfsvinval",
            slptimeo);
        if (error && intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p))
            return (EINTR);
    }

    /*
     * Now, flush as required.
     */
    np->n_flag |= NFLUSHINPROG;
    error = vinvalbuf(vp, flags, cred, p, slpflag, 0);
    while (error) {
        /* we seem to be stuck in a loop here if the thread got aborted.
         * nfs_flush will return EINTR. Not sure if that will cause
         * other consequences due to EINTR having other meanings in NFS.
         * To handle no dirty pages, it seems safe to just return from
         * here. But if we did have dirty pages, how would we get them
         * written out if the thread was aborted? Some other strategy is
         * necessary.
         */
        if ((intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p)) ||
            ((error == EINTR) && current_thread_aborted())) {
            np->n_flag &= ~NFLUSHINPROG;
            if (np->n_flag & NFLUSHWANT) {
                np->n_flag &= ~NFLUSHWANT;
                wakeup((caddr_t)&np->n_flag);
            }
            return (EINTR);
        }
        error = vinvalbuf(vp, flags, cred, p, 0, slptimeo);
    }
    np->n_flag &= ~(NMODIFIED | NFLUSHINPROG);
    if (np->n_flag & NFLUSHWANT) {
        np->n_flag &= ~NFLUSHWANT;
        wakeup((caddr_t)&np->n_flag);
    }
    didhold = ubc_hold(vp);
    if (didhold) {
        (void) ubc_clean(vp, 1); /* get the pages out of vm also */
        ubc_rele(vp);
    }
    return (0);
}
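/*
 * Editor's note (not part of the original file): the NFLUSHINPROG /
 * NFLUSHWANT flags serialize flushers: one thread marks the flush in
 * progress, later arrivals set the "want" flag and sleep, and the flusher
 * wakes them when done.  A minimal user-space analogue of that handshake,
 * using a mutex and condition variable in place of tsleep()/wakeup()
 * (all names below are illustrative, not kernel APIs):
 */
#if 0 /* illustrative sketch; build with -lpthread */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lk = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cv = PTHREAD_COND_INITIALIZER;
static int flush_in_prog;               /* NFLUSHINPROG analogue */

static void *
flusher(void *arg)
{
    pthread_mutex_lock(&lk);
    while (flush_in_prog)               /* wait for the current flusher */
        pthread_cond_wait(&cv, &lk);    /* tsleep(&np->n_flag, ...) analogue */
    flush_in_prog = 1;
    pthread_mutex_unlock(&lk);

    printf("thread %ld flushing\n", (long)arg);  /* vinvalbuf() would go here */

    pthread_mutex_lock(&lk);
    flush_in_prog = 0;
    pthread_cond_broadcast(&cv);        /* wakeup(&np->n_flag) analogue */
    pthread_mutex_unlock(&lk);
    return NULL;
}

int
main(void)
{
    pthread_t t1, t2;

    pthread_create(&t1, NULL, flusher, (void *)1L);
    pthread_create(&t2, NULL, flusher, (void *)2L);
    pthread_join(t1, NULL);
    pthread_join(t2, NULL);
    return 0;
}
#endif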
/*
 * Initiate asynchronous I/O. Return an error if no nfsiods are available.
 * This is mainly to avoid queueing async I/O requests when the nfsiods
 * are all hung on a dead server.
 */
int
nfs_asyncio(bp, cred)
    register struct buf *bp;
    struct ucred *cred;
{
    struct nfsmount *nmp;
    int i;
    int gotiod;
    int slpflag = 0;
    int slptimeo = 0;
    int error;

    if (nfs_numasync == 0)
        return (EIO);

    nmp = VFSTONFS(bp->b_vp->v_mount);
again:
    if (nmp->nm_flag & NFSMNT_INT)
        slpflag = PCATCH;
    gotiod = FALSE;

    /*
     * Find a free iod to process this request.
     */
    for (i = 0; i < NFS_MAXASYNCDAEMON; i++)
        if (nfs_iodwant[i]) {
            /*
             * Found one, so wake it up and tell it which
             * mount to process.
             */
            NFS_DPF(ASYNCIO,
                ("nfs_asyncio: waking iod %d for mount %p\n",
                 i, nmp));
            nfs_iodwant[i] = (struct proc *)0;
            nfs_iodmount[i] = nmp;
            nmp->nm_bufqiods++;
            wakeup((caddr_t)&nfs_iodwant[i]);
            gotiod = TRUE;
            break;
        }

    /*
     * If none are free, we may already have an iod working on this mount
     * point. If so, it will process our request.
     */
    if (!gotiod) {
        if (nmp->nm_bufqiods > 0) {
            NFS_DPF(ASYNCIO,
                ("nfs_asyncio: %d iods are already processing mount %p\n",
                 nmp->nm_bufqiods, nmp));
            gotiod = TRUE;
        }
    }

    /*
     * If we have an iod which can process the request, then queue
     * the buffer.
     */
    if (gotiod) {
        /*
         * Ensure that the queue never grows too large.
         */
        while (nmp->nm_bufqlen >= 2 * nfs_numasync) {
            NFS_DPF(ASYNCIO,
                ("nfs_asyncio: waiting for mount %p queue to drain\n", nmp));
            nmp->nm_bufqwant = TRUE;
            error = tsleep(&nmp->nm_bufq, slpflag | PRIBIO,
                "nfsaio", slptimeo);
            if (error) {
                if (nfs_sigintr(nmp, NULL, bp->b_proc))
                    return (EINTR);
                if (slpflag == PCATCH) {
                    slpflag = 0;
                    slptimeo = 2 * hz;
                }
            }
            /*
             * We might have lost our iod while sleeping,
             * so check and loop if necessary.
             */
            if (nmp->nm_bufqiods == 0) {
                NFS_DPF(ASYNCIO,
                    ("nfs_asyncio: no iods after mount %p queue was drained, looping\n", nmp));
                goto again;
            }
        }

        if (ISSET(bp->b_flags, B_READ)) {
            if (bp->b_rcred == NOCRED && cred != NOCRED) {
                /*
                 * NFS has embedded ucred.
                 * Can not crhold() here as that causes zone corruption
                 */
                bp->b_rcred = crdup(cred);
            }
        } else {
            SET(bp->b_flags, B_WRITEINPROG);
            if (bp->b_wcred == NOCRED && cred != NOCRED) {
                /*
                 * NFS has embedded ucred.
                 * Can not crhold() here as that causes zone corruption
                 */
                bp->b_wcred = crdup(cred);
            }
        }

        TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, b_freelist);
        nmp->nm_bufqlen++;
        return (0);
    }

    /*
     * All the iods are busy on other mounts, so return EIO to
     * force the caller to process the i/o synchronously.
     */
    NFS_DPF(ASYNCIO, ("nfs_asyncio: no iods available, i/o is synchronous\n"));
    return (EIO);
}
/*
 * Do an I/O operation to/from a cache block. This may be called
 * synchronously or from an nfsiod.
 */
int
nfs_doio(bp, cr, p)
    register struct buf *bp;
    struct ucred *cr;
    struct proc *p;
{
    register struct uio *uiop;
    register struct vnode *vp;
    struct nfsnode *np;
    struct nfsmount *nmp;
    int error = 0, diff, len, iomode, must_commit = 0;
    struct uio uio;
    struct iovec io;

    vp = bp->b_vp;
    NFSTRACE(NFSTRC_DIO, vp);
    np = VTONFS(vp);
    nmp = VFSTONFS(vp->v_mount);
    uiop = &uio;
    uiop->uio_iov = &io;
    uiop->uio_iovcnt = 1;
    uiop->uio_segflg = UIO_SYSSPACE;
    uiop->uio_procp = p;

    /*
     * With UBC, getblk() can return a buf with B_DONE set.
     * This indicates that the VM has valid data for that page.
     * NFS being stateless, this case poses a problem.
     * By definition, the NFS server should always be consulted
     * for the data in that page.
     * So we choose to clear the B_DONE and to do the IO.
     *
     * XXX revisit this if there is a performance issue.
     * XXX In that case, we could play the attribute cache games ...
     */
    if (ISSET(bp->b_flags, B_DONE)) {
        if (!ISSET(bp->b_flags, B_ASYNC))
            panic("nfs_doio: done and not async");
        CLR(bp->b_flags, B_DONE);
    }

    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 256)) | DBG_FUNC_START,
        (int)np->n_size, bp->b_blkno * DEV_BSIZE, bp->b_bcount, bp->b_flags, 0);
    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 257)) | DBG_FUNC_NONE,
        bp->b_validoff, bp->b_validend, bp->b_dirtyoff, bp->b_dirtyend, 0);

    /*
     * Historically, paging was done with physio, but no more.
     */
    if (ISSET(bp->b_flags, B_PHYS)) {
        /*
         * ...though reading /dev/drum still gets us here.
         */
        io.iov_len = uiop->uio_resid = bp->b_bcount;
        /* mapping was done by vmapbuf() */
        io.iov_base = bp->b_data;
        uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE;
        if (ISSET(bp->b_flags, B_READ)) {
            uiop->uio_rw = UIO_READ;
            nfsstats.read_physios++;
            error = nfs_readrpc(vp, uiop, cr);
        } else {
            int com;

            iomode = NFSV3WRITE_DATASYNC;
            uiop->uio_rw = UIO_WRITE;
            nfsstats.write_physios++;
            error = nfs_writerpc(vp, uiop, cr, &iomode, &com);
        }
        if (error) {
            SET(bp->b_flags, B_ERROR);
            bp->b_error = error;
        }
    } else if (ISSET(bp->b_flags, B_READ)) {
        io.iov_len = uiop->uio_resid = bp->b_bcount;
        io.iov_base = bp->b_data;
        uiop->uio_rw = UIO_READ;
        switch (vp->v_type) {
        case VREG:
            uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE;
            nfsstats.read_bios++;
            error = nfs_readrpc(vp, uiop, cr);

            KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 262)) | DBG_FUNC_NONE,
                (int)np->n_size, bp->b_blkno * DEV_BSIZE, uiop->uio_resid,
                error, 0);

            if (!error) {
                bp->b_validoff = 0;
                if (uiop->uio_resid) {
                    /*
                     * If len > 0, there is a hole in the file and
                     * no writes after the hole have been pushed to
                     * the server yet.
                     * Just zero fill the rest of the valid area.
                     */
                    diff = bp->b_bcount - uiop->uio_resid;
                    len = np->n_size - (((u_quad_t)bp->b_blkno) * DEV_BSIZE
                        + diff);
                    if (len > 0) {
                        len = min(len, uiop->uio_resid);
                        bzero((char *)bp->b_data + diff, len);
                        bp->b_validend = diff + len;

                        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 258)) | DBG_FUNC_NONE,
                            diff, len, 0, 1, 0);
                    } else
                        bp->b_validend = diff;
                } else
                    bp->b_validend = bp->b_bcount;
#if 1 /* USV + JOE [ */
                if (bp->b_validend < bp->b_bufsize) {
                    /*
                     * we're about to release a partial buffer after a
                     * read... the only way we should get here is if this
                     * buffer contains the EOF. Before releasing it, we'll
                     * zero out to the end of the buffer so that if a mmap
                     * of this page occurs, we'll see zero's even if a
                     * ftruncate extends the file in the meantime
                     */
                    bzero((caddr_t)(bp->b_data + bp->b_validend),
                        (bp->b_bufsize - bp->b_validend));

                    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 258)) | DBG_FUNC_NONE,
                        bp->b_validend, (bp->b_bufsize - bp->b_validend),
                        0, 2, 0);
                }
#endif /* ] USV + JOE */
            }
            if (p && (vp->v_flag & VTEXT) &&
                (((nmp->nm_flag & NFSMNT_NQNFS) &&
                  NQNFS_CKINVALID(vp, np, ND_READ) &&
                  np->n_lrev != np->n_brev) ||
                 (!(nmp->nm_flag & NFSMNT_NQNFS) &&
                  np->n_mtime != np->n_vattr.va_mtime.tv_sec))) {
                uprintf("Process killed due to text file modification\n");
                psignal(p, SIGKILL);
                p->p_flag |= P_NOSWAP;
            }
            break;
        case VLNK:
            uiop->uio_offset = (off_t)0;
            nfsstats.readlink_bios++;
            error = nfs_readlinkrpc(vp, uiop, cr);
            break;
        case VDIR:
            nfsstats.readdir_bios++;
            uiop->uio_offset = ((u_quad_t)bp->b_lblkno) * NFS_DIRBLKSIZ;
            if (!(nmp->nm_flag & NFSMNT_NFSV3))
                nmp->nm_flag &= ~NFSMNT_RDIRPLUS;	/* dk@farm.org */
            if (nmp->nm_flag & NFSMNT_RDIRPLUS) {
                error = nfs_readdirplusrpc(vp, uiop, cr);
                if (error == NFSERR_NOTSUPP)
                    nmp->nm_flag &= ~NFSMNT_RDIRPLUS;
            }
            if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0)
                error = nfs_readdirrpc(vp, uiop, cr);
            break;
        default:
            printf("nfs_doio: type %x unexpected\n", vp->v_type);
            break;
        };
        if (error) {
            SET(bp->b_flags, B_ERROR);
            bp->b_error = error;
        }
    } else {
        if (((bp->b_blkno * DEV_BSIZE) + bp->b_dirtyend) > np->n_size)
            bp->b_dirtyend = np->n_size - (bp->b_blkno * DEV_BSIZE);

        if (bp->b_dirtyend > bp->b_dirtyoff) {

            io.iov_len = uiop->uio_resid = bp->b_dirtyend
                - bp->b_dirtyoff;
            uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE
                + bp->b_dirtyoff;
            io.iov_base = (char *)bp->b_data + bp->b_dirtyoff;
            uiop->uio_rw = UIO_WRITE;

            nfsstats.write_bios++;
            if ((bp->b_flags & (B_ASYNC | B_NEEDCOMMIT | B_NOCACHE)) == B_ASYNC)
                iomode = NFSV3WRITE_UNSTABLE;
            else
                iomode = NFSV3WRITE_FILESYNC;
            SET(bp->b_flags, B_WRITEINPROG);
            error = nfs_writerpc(vp, uiop, cr, &iomode, &must_commit);
            if (!error && iomode == NFSV3WRITE_UNSTABLE)
                SET(bp->b_flags, B_NEEDCOMMIT);
            else
                CLR(bp->b_flags, B_NEEDCOMMIT);
            CLR(bp->b_flags, B_WRITEINPROG);

            /*
             * For an interrupted write, the buffer is still valid
             * and the write hasn't been pushed to the server yet,
             * so we can't set B_ERROR and report the interruption
             * by setting B_EINTR. For the B_ASYNC case, B_EINTR
             * is not relevant, so the rpc attempt is essentially
             * a noop.  For the case of a V3 write rpc not being
             * committed to stable storage, the block is still
             * dirty and requires either a commit rpc or another
             * write rpc with iomode == NFSV3WRITE_FILESYNC before
             * the block is reused. This is indicated by setting
             * the B_DELWRI and B_NEEDCOMMIT flags.
             */
            if (error == EINTR
                || (!error && (bp->b_flags & B_NEEDCOMMIT))) {
                int s;

                CLR(bp->b_flags, (B_INVAL|B_NOCACHE));
                SET(bp->b_flags, B_DELWRI);

                KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 261)) | DBG_FUNC_NONE,
                    bp->b_validoff, bp->b_validend, bp->b_bufsize,
                    bp->b_bcount, 0);
                /*
                 * Since for the B_ASYNC case, nfs_bwrite() has reassigned the
                 * buffer to the clean list, we have to reassign it back to the
                 * dirty one. Ugh.
                 */
                if (ISSET(bp->b_flags, B_ASYNC)) {
                    s = splbio();
                    reassignbuf(bp, vp);
                    splx(s);
                } else {
                    SET(bp->b_flags, B_EINTR);
                }
            } else {
                if (error) {
                    SET(bp->b_flags, B_ERROR);
                    bp->b_error = np->n_error = error;
                    np->n_flag |= NWRITEERR;
                }
                bp->b_dirtyoff = bp->b_dirtyend = 0;

                /*
                 * validoff and validend represent the real data present
                 * in this buffer.
                 * if validoff is non-zero, then we have to invalidate the
                 * buffer and kill the page when biodone is called... the
                 * same is also true when validend doesn't extend all the
                 * way to the end of the buffer and validend doesn't
                 * equate to the current EOF... eventually we need to deal
                 * with this in a more humane way (like keeping the partial
                 * buffer without making it immediately available to the
                 * VM page cache).
                 */
                if (bp->b_validoff)
                    SET(bp->b_flags, B_INVAL);
                else if (bp->b_validend < bp->b_bufsize) {
                    if ((((off_t)bp->b_blkno * (off_t)DEV_BSIZE) +
                        bp->b_validend) == np->n_size) {
                        bzero((caddr_t)(bp->b_data + bp->b_validend),
                            (bp->b_bufsize - bp->b_validend));

                        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 259)) | DBG_FUNC_NONE,
                            bp->b_validend, (bp->b_bufsize - bp->b_validend),
                            0, 0, 0);
                    } else
                        SET(bp->b_flags, B_INVAL);
                }
            }

        } else {
            if (bp->b_validoff)
                SET(bp->b_flags, B_INVAL);
            else if (bp->b_validend < bp->b_bufsize) {
                if ((((off_t)bp->b_blkno * (off_t)DEV_BSIZE) +
                    bp->b_validend) != np->n_size)
                    SET(bp->b_flags, B_INVAL);
            }
            if (bp->b_flags & B_INVAL) {
                KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 260)) | DBG_FUNC_NONE,
                    bp->b_validoff, bp->b_validend, bp->b_bufsize,
                    bp->b_bcount, 0);
            }
            bp->b_resid = 0;
            biodone(bp);
            NFSTRACE(NFSTRC_DIO_DONE, vp);
            return (0);
        }
    }
    bp->b_resid = uiop->uio_resid;
    if (must_commit)
        nfs_clearcommit(vp->v_mount);

    if (bp->b_flags & B_INVAL) {
        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 260)) | DBG_FUNC_NONE,
            bp->b_validoff, bp->b_validend, bp->b_bufsize, bp->b_bcount, 0);
    }
    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 256)) | DBG_FUNC_END,
        bp->b_validoff, bp->b_validend, bp->b_bcount, error, 0);

    biodone(bp);
    NFSTRACE(NFSTRC_DIO_DONE, vp);
    return (error);
}
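/*
 * Editor's note (not part of the original file): after a short read,
 * nfs_doio computes how many bytes actually arrived (diff) and how many
 * valid bytes remain before EOF (len), zero-filling holes and setting
 * b_validend accordingly.  The standalone sketch below walks that
 * computation with made-up sizes; all values are assumptions for
 * illustration.
 */
#if 0 /* illustrative sketch */
#include <stdio.h>

int
main(void)
{
    long long n_size = 6000;    /* assumed file size */
    long long blk_base = 4096;  /* b_blkno * DEV_BSIZE for this buffer */
    int bcount = 4096;          /* buffer size */
    int resid = 2192;           /* bytes the read RPC did not return */
    int diff = bcount - resid;  /* bytes actually read: 1904 */
    long long len = n_size - (blk_base + diff); /* valid bytes left */

    if (resid && len > 0) {
        /* A hole: zero-fill the remainder of the valid area. */
        if (len > resid)
            len = resid;
        printf("zero-fill [%d,%lld), validend=%lld\n",
            diff, diff + len, diff + len);
    } else {
        /* EOF (len <= 0) or full read: valid data ends at diff. */
        printf("validend=%d\n", diff);
    }
    return 0;
}
#endif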