bsd/nfs/nfs_bio.c

   1 /*
   2  * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
   3  *
   4  * @APPLE_LICENSE_HEADER_START@
   5  *
   6  * The contents of this file constitute Original Code as defined in and
   7  * are subject to the Apple Public Source License Version 1.1 (the
   8  * "License").  You may not use this file except in compliance with the
   9  * License.  Please obtain a copy of the License at
  10  * http://www.apple.com/publicsource and read it before using this file.
  11  *
  12  * This Original Code and all software distributed under the License are
  13  * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  14  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  15  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  16  * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
  17  * License for the specific language governing rights and limitations
  18  * under the License.
  19  *
  20  * @APPLE_LICENSE_HEADER_END@
  21  */
  22 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
  23 /*
  24  * Copyright (c) 1989, 1993
  25  *      The Regents of the University of California.  All rights reserved.
  26  *
  27  * This code is derived from software contributed to Berkeley by
  28  * Rick Macklem at The University of Guelph.
  29  *
  30  * Redistribution and use in source and binary forms, with or without
  31  * modification, are permitted provided that the following conditions
  32  * are met:
  33  * 1. Redistributions of source code must retain the above copyright
  34  *    notice, this list of conditions and the following disclaimer.
  35  * 2. Redistributions in binary form must reproduce the above copyright
  36  *    notice, this list of conditions and the following disclaimer in the
  37  *    documentation and/or other materials provided with the distribution.
  38  * 3. All advertising materials mentioning features or use of this software
  39  *    must display the following acknowledgement:
  40  *      This product includes software developed by the University of
  41  *      California, Berkeley and its contributors.
  42  * 4. Neither the name of the University nor the names of its contributors
  43  *    may be used to endorse or promote products derived from this software
  44  *    without specific prior written permission.
  45  *
  46  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  47  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  48  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  49  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  50  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  51  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  52  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  53  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  54  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  55  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  56  * SUCH DAMAGE.
  57  *
  58  *      @(#)nfs_bio.c   8.9 (Berkeley) 3/30/95
  59  * FreeBSD-Id: nfs_bio.c,v 1.44 1997/09/10 19:52:25 phk Exp $
  60  */
  61
  62 #include <sys/param.h>
  63 #include <sys/systm.h>
  64 #include <sys/resourcevar.h>
  65 #include <sys/signalvar.h>
  66 #include <sys/proc.h>
  67 #include <sys/buf.h>
  68 #include <sys/vnode.h>
  69 #include <sys/mount.h>
  70 #include <sys/kernel.h>
  71 #include <sys/sysctl.h>
  72 #include <sys/ubc.h>
  73
  74 #include <sys/vm.h>
  75 #include <sys/vmparam.h>
  76
  77 #include <sys/time.h>
  78 #include <kern/clock.h>
  79
  80 #include <nfs/rpcv2.h>
  81 #include <nfs/nfsproto.h>
  82 #include <nfs/nfs.h>
  83 #include <nfs/nfsmount.h>
  84 #include <nfs/nqnfs.h>
  85 #include <nfs/nfsnode.h>
  86
  87 #include <sys/kdebug.h>
  88
  89 static struct buf *nfs_getcacheblk __P((struct vnode *vp, daddr_t bn, int size,
  90                                         struct proc *p, int operation));
  91 static struct buf *nfs_getwriteblk __P((struct vnode *vp, daddr_t bn,
  92                                         int size, struct proc *p,
  93                                         struct ucred *cred, int off, int len));
  94
  95 extern int nfs_numasync;
  96 extern struct nfsstats nfsstats;
  97
  98 /*
  99  * Vnode op for read using bio
 100  * Any similarity to readip() is purely coincidental
 101  */
 102 int
 103 nfs_bioread(vp, uio, ioflag, cred, getpages)
 104         register struct vnode *vp;
 105         register struct uio *uio;
 106         int ioflag;
 107         struct ucred *cred;
 108         int getpages;
 109 {
 110         register struct nfsnode *np = VTONFS(vp);
 111         register int biosize, diff, i;
 112         struct buf *bp = 0, *rabp;
 113         struct vattr vattr;
 114         struct proc *p;
 115         struct nfsmount *nmp = VFSTONFS(vp->v_mount);
 116         daddr_t lbn, rabn;
 117         int bufsize;
 118         int nra, error = 0, n = 0, on = 0, not_readin;
 119         int operation = (getpages? BLK_PAGEIN : BLK_READ);
 120
 121 #if DIAGNOSTIC
 122         if (uio->uio_rw != UIO_READ)
 123                 panic("nfs_read mode");
 124 #endif
 125         if (uio->uio_resid == 0)
 126                 return (0);
 127         if (uio->uio_offset < 0)
 128                 return (EINVAL);
 129         p = uio->uio_procp;
 130         if ((nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_GOTFSINFO)) == NFSMNT_NFSV3)
 131                 (void)nfs_fsinfo(nmp, vp, cred, p);
 132         /*due to getblk/vm interractions, use vm page size or less values */
 133         biosize = min(vp->v_mount->mnt_stat.f_iosize, PAGE_SIZE);
 134         /*
 135          * For nfs, cache consistency can only be maintained approximately.
 136          * Although RFC1094 does not specify the criteria, the following is
 137          * believed to be compatible with the reference port.
 138          * For nqnfs, full cache consistency is maintained within the loop.
 139          * For nfs:
 140          * If the file's modify time on the server has changed since the
 141          * last read rpc or you have written to the file,
 142          * you may have lost data cache consistency with the
 143          * server, so flush all of the file's data out of the cache.
 144          * Then force a getattr rpc to ensure that you have up to date
 145          * attributes.
 146          * NB: This implies that cache data can be read when up to
 147          * NFS_ATTRTIMEO seconds out of date. If you find that you need current
 148          * attributes this could be forced by setting n_attrstamp to 0 before
 149          * the VOP_GETATTR() call.
 150          */
 151         if ((nmp->nm_flag & NFSMNT_NQNFS) == 0) {
 152                 if (np->n_flag & NMODIFIED) {
 153                         if (vp->v_type != VREG) {
 154                                 if (vp->v_type != VDIR)
 155                                         panic("nfs: bioread, not dir");
 156                                 nfs_invaldir(vp);
 157                                 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
 158                                 if (error)
 159                                         return (error);
 160                         }
 161                         np->n_attrstamp = 0;
 162                         error = VOP_GETATTR(vp, &vattr, cred, p);
 163                         if (error)
 164                                 return (error);
 165                         np->n_mtime = vattr.va_mtime.tv_sec;
 166                 } else {
 167                         error = VOP_GETATTR(vp, &vattr, cred, p);
 168                         if (error)
 169                                 return (error);
 170                         if (np->n_mtime != vattr.va_mtime.tv_sec) {
 171                                 if (vp->v_type == VDIR)
 172                                         nfs_invaldir(vp);
 173                                 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
 174                                 if (error)
 175                                         return (error);
 176                                 np->n_mtime = vattr.va_mtime.tv_sec;
 177                         }
 178                 }
 179         }
 180         do {
 181
 182             /*
 183              * Get a valid lease. If cached data is stale, flush it.
 184              */
 185             if (nmp->nm_flag & NFSMNT_NQNFS) {
 186                 if (NQNFS_CKINVALID(vp, np, ND_READ)) {
 187                     do {
 188                         error = nqnfs_getlease(vp, ND_READ, cred, p);
 189                     } while (error == NQNFS_EXPIRED);
 190                     if (error)
 191                         return (error);
 192                     if (np->n_lrev != np->n_brev ||
 193                         (np->n_flag & NQNFSNONCACHE) ||
 194                         ((np->n_flag & NMODIFIED) && vp->v_type == VDIR)) {
 195                         if (vp->v_type == VDIR)
 196                             nfs_invaldir(vp);
 197                         error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
 198                         if (error)
 199                             return (error);
 200                         np->n_brev = np->n_lrev;
 201                     }
 202                 } else if (vp->v_type == VDIR && (np->n_flag & NMODIFIED)) {
 203                     nfs_invaldir(vp);
 204                     error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
 205                     if (error)
 206                         return (error);
 207                 }
 208             }
 209             if (np->n_flag & NQNFSNONCACHE) {
 210                 switch (vp->v_type) {
 211                 case VREG:
 212                         return (nfs_readrpc(vp, uio, cred));
 213                 case VLNK:
 214                         return (nfs_readlinkrpc(vp, uio, cred));
 215                 case VDIR:
 216                         break;
 217                 default:
 218                         printf(" NQNFSNONCACHE: type %x unexpected\n",
 219                                 vp->v_type);
 220                 };
 221             }
 222             switch (vp->v_type) {
 223             case VREG:
 224                 nfsstats.biocache_reads++;
 225                 lbn = uio->uio_offset / biosize;
 226                 on = uio->uio_offset & (biosize - 1);
 227                 not_readin = 1;
 228
 229                 /*
 230                  * Start the read ahead(s), as required.
 231                  */
 232                 if (nfs_numasync > 0 && nmp->nm_readahead > 0) {
 233                     for (nra = 0; nra < nmp->nm_readahead &&
 234                                 (off_t)(lbn + 1 + nra) * biosize < np->n_size; nra++) {
 235                                 rabn = lbn + 1 + nra;
 236                                 if (!incore(vp, rabn)) {
 237                                         rabp = nfs_getcacheblk(vp, rabn, biosize, p, operation);
 238                                         if (!rabp)
 239                                                 return (EINTR);
 240                                         if (!ISSET(rabp->b_flags, (B_CACHE|B_DELWRI))) {
 241                                                 SET(rabp->b_flags, (B_READ | B_ASYNC));
 242                                                 if (nfs_asyncio(rabp, cred)) {
 243                                                         SET(rabp->b_flags, (B_INVAL|B_ERROR));
 244                                                         rabp->b_error = EIO;
 245                                                         brelse(rabp);
 246                                                 }
 247                                         } else
 248                                                 brelse(rabp);
 249                                 }
 250                     }
 251                 }
 252
 253                 /*
 254                  * If the block is in the cache and has the required data
 255                  * in a valid region, just copy it out.
 256                  * Otherwise, get the block and write back/read in,
 257                  * as required.
 258                  */
 259 again:
 260                 bufsize = biosize;
 261                 if ((off_t)(lbn + 1) * biosize > np->n_size &&
 262                     (off_t)(lbn + 1) * biosize - np->n_size < biosize) {
 263                         bufsize = np->n_size - lbn * biosize;
 264                         bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
 265                 }
 266                 bp = nfs_getcacheblk(vp, lbn, bufsize, p, operation);
 267                 if (!bp)
 268                         return (EINTR);
 269
 270                 if (!ISSET(bp->b_flags, B_CACHE)) {
 271                         SET(bp->b_flags, B_READ);
 272                         CLR(bp->b_flags, (B_DONE | B_ERROR | B_INVAL));
 273                         not_readin = 0;
 274                         error = nfs_doio(bp, cred, p);
 275                         if (error) {
 276                             brelse(bp);
 277                             return (error);
 278                         }
 279                 }
 280                 if (bufsize > on) {
 281                         n = min((unsigned)(bufsize - on), uio->uio_resid);
 282                 } else {
 283                         n = 0;
 284                 }
 285                 diff = np->n_size - uio->uio_offset;
 286                 if (diff < n)
 287                         n = diff;
 288                 if (not_readin && n > 0) {
 289                         if (on < bp->b_validoff || (on + n) > bp->b_validend) {
 290                                 SET(bp->b_flags, (B_NOCACHE|B_INVAFTERWRITE));
 291                                 if (bp->b_dirtyend > 0) {
 292                                         if (!ISSET(bp->b_flags, B_DELWRI))
 293                                                 panic("nfsbioread");
 294                                         if (VOP_BWRITE(bp) == EINTR)
 295                                                 return (EINTR);
 296                                 } else
 297                                         brelse(bp);
 298                                 goto again;
 299                         }
 300                 }
 301                 vp->v_lastr = lbn;
 302                 diff = (on >= bp->b_validend) ? 0 : (bp->b_validend - on);
 303                 if (diff < n)
 304                         n = diff;
 305                 break;
 306             case VLNK:
 307                 nfsstats.biocache_readlinks++;
 308                 bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, p, operation);
 309                 if (!bp)
 310                         return (EINTR);
 311                 if (!ISSET(bp->b_flags, B_CACHE)) {
 312                         SET(bp->b_flags, B_READ);
 313                         error = nfs_doio(bp, cred, p);
 314                         if (error) {
 315                                 SET(bp->b_flags, B_ERROR);
 316                                 brelse(bp);
 317                                 return (error);
 318                         }
 319                 }
 320                 n = min(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid);
 321                 on = 0;
 322                 break;
 323             case VDIR:
 324                 nfsstats.biocache_readdirs++;
 325                 if (np->n_direofoffset
 326                     && uio->uio_offset >= np->n_direofoffset) {
 327                     return (0);
 328                 }
 329                 lbn = uio->uio_offset / NFS_DIRBLKSIZ;
 330                 on = uio->uio_offset & (NFS_DIRBLKSIZ - 1);
 331                 bp = nfs_getcacheblk(vp, lbn, NFS_DIRBLKSIZ, p, operation);
 332                 if (!bp)
 333                     return (EINTR);
 334                 if (!ISSET(bp->b_flags, B_CACHE)) {
 335                     SET(bp->b_flags, B_READ);
 336                     error = nfs_doio(bp, cred, p);
 337                     if (error) {
 338                                 brelse(bp);
 339                                 while (error == NFSERR_BAD_COOKIE) {
 340                                         nfs_invaldir(vp);
 341                                         error = nfs_vinvalbuf(vp, 0, cred, p, 1);
 342                                         /*
 343                                          * Yuck! The directory has been modified on the
 344                                          * server. The only way to get the block is by
 345                                          * reading from the beginning to get all the
 346                                          * offset cookies.
 347                                          */
 348                                         for (i = 0; i <= lbn && !error; i++) {
 349                                         if (np->n_direofoffset
 350                                                 && (i * NFS_DIRBLKSIZ) >= np->n_direofoffset)
 351                                                 return (0);
 352                                         bp = nfs_getcacheblk(vp, i, NFS_DIRBLKSIZ, p, operation);
 353                                         if (!bp)
 354                                                 return (EINTR);
 355                                         if (!ISSET(bp->b_flags, B_DONE)) {
 356                                                 SET(bp->b_flags, B_READ);
 357                                                 error = nfs_doio(bp, cred, p);
 358                                                 if (error) {
 359                                                         brelse(bp);
 360                                                 } else if (i < lbn)
 361                                                         brelse(bp);
 362                                         }
 363                                         }
 364                                 }
 365                                 if (error)
 366                                         return (error);
 367                     }
 368                 }
 369
 370                 /*
 371                  * If not eof and read aheads are enabled, start one.
 372                  * (You need the current block first, so that you have the
 373                  *  directory offset cookie of the next block.)
 374                  */
 375                 if (nfs_numasync > 0 && nmp->nm_readahead > 0 &&
 376                     (np->n_direofoffset == 0 ||
 377                     (lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) &&
 378                     !(np->n_flag & NQNFSNONCACHE) &&
 379                     !incore(vp, lbn + 1)) {
 380                         rabp = nfs_getcacheblk(vp, lbn + 1, NFS_DIRBLKSIZ, p, operation);
 381                         if (rabp) {
 382                             if (!ISSET(rabp->b_flags, (B_CACHE|B_DELWRI))) {
 383                                         SET(rabp->b_flags, (B_READ | B_ASYNC));
 384                                         if (nfs_asyncio(rabp, cred)) {
 385                                                 SET(rabp->b_flags, (B_INVAL|B_ERROR));
 386                                                 rabp->b_error = EIO;
 387                                                 brelse(rabp);
 388                                         }
 389                             } else {
 390                                         brelse(rabp);
 391                             }
 392                         }
 393                 }
 394                 /*
 395                  * Make sure we use a signed variant of min() since
 396                  * the second term may be negative.
 397                  */
 398                 n = lmin(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid - on);
 399                 break;
 400             default:
 401                 printf(" nfs_bioread: type %x unexpected\n",vp->v_type);
 402                 break;
 403             };
 404
 405             if (n > 0) {
 406                 error = uiomove(bp->b_data + on, (int)n, uio);
 407             }
 408             switch (vp->v_type) {
 409             case VREG:
 410                 break;
 411             case VLNK:
 412                 n = 0;
 413                 break;
 414             case VDIR:
 415                 if (np->n_flag & NQNFSNONCACHE)
 416                         SET(bp->b_flags, B_INVAL);
 417                 break;
 418             default:
 419                 printf(" nfs_bioread: type %x unexpected\n",vp->v_type);
 420             }
 421             brelse(bp);
 422         } while (error == 0 && uio->uio_resid > 0 && n > 0);
 423         return (error);
 424 }
 425
 426 /*
 427  * Vnode op for write using bio
 428  */
 429 int
 430 nfs_write(ap)
 431         struct vop_write_args /* {
 432                 struct vnode *a_vp;
 433                 struct uio *a_uio;
 434                 int  a_ioflag;
 435                 struct ucred *a_cred;
 436         } */ *ap;
 437 {
 438         register int biosize;
 439         register struct uio *uio = ap->a_uio;
 440         struct proc *p = uio->uio_procp;
 441         register struct vnode *vp = ap->a_vp;
 442         struct nfsnode *np = VTONFS(vp);
 443         register struct ucred *cred = ap->a_cred;
 444         int ioflag = ap->a_ioflag;
 445         struct buf *bp;
 446         struct vattr vattr;
 447         struct nfsmount *nmp = VFSTONFS(vp->v_mount);
 448         daddr_t lbn;
 449         int bufsize;
 450         int n, on, error = 0, iomode, must_commit;
 451
 452 #if DIAGNOSTIC
 453         if (uio->uio_rw != UIO_WRITE)
 454                 panic("nfs_write mode");
 455         if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != current_proc())
 456                 panic("nfs_write proc");
 457 #endif
 458         if (vp->v_type != VREG)
 459                 return (EIO);
 460         if (np->n_flag & NWRITEERR) {
 461                 np->n_flag &= ~NWRITEERR;
 462                 return (np->n_error);
 463         }
 464         if ((nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_GOTFSINFO)) == NFSMNT_NFSV3)
 465                 (void)nfs_fsinfo(nmp, vp, cred, p);
 466         if (ioflag & (IO_APPEND | IO_SYNC)) {
 467                 if (np->n_flag & NMODIFIED) {
 468                         np->n_attrstamp = 0;
 469                         error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
 470                         if (error)
 471                                 return (error);
 472                 }
 473                 if (ioflag & IO_APPEND) {
 474                         np->n_attrstamp = 0;
 475                         error = VOP_GETATTR(vp, &vattr, cred, p);
 476                         if (error)
 477                                 return (error);
 478                         uio->uio_offset = np->n_size;
 479                 }
 480         }
 481         if (uio->uio_offset < 0)
 482                 return (EINVAL);
 483         if (uio->uio_resid == 0)
 484                 return (0);
 485         /*
 486          * Maybe this should be above the vnode op call, but so long as
 487          * file servers have no limits, i don't think it matters
 488          */
 489         if (p && uio->uio_offset + uio->uio_resid >
 490               p->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
 491                 psignal(p, SIGXFSZ);
 492                 return (EFBIG);
 493         }
 494         /*
 495          * I use nm_rsize, not nm_wsize so that all buffer cache blocks
 496          * will be the same size within a filesystem. nfs_writerpc will
 497          * still use nm_wsize when sizing the rpc's.
 498          */
 499         /*due to getblk/vm interractions, use vm page size or less values */
 500         biosize = min(vp->v_mount->mnt_stat.f_iosize, PAGE_SIZE);
 501
 502         do {
 503                 /*
 504                  * Check for a valid write lease.
 505                  */
 506                 if ((nmp->nm_flag & NFSMNT_NQNFS) &&
 507                     NQNFS_CKINVALID(vp, np, ND_WRITE)) {
 508                         do {
 509                                 error = nqnfs_getlease(vp, ND_WRITE, cred, p);
 510                         } while (error == NQNFS_EXPIRED);
 511                         if (error)
 512                                 return (error);
 513                         if (np->n_lrev != np->n_brev ||
 514                             (np->n_flag & NQNFSNONCACHE)) {
 515                                 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
 516                                 if (error)
 517                                         return (error);
 518                                 np->n_brev = np->n_lrev;
 519                         }
 520                 }
 521                 if ((np->n_flag & NQNFSNONCACHE) && uio->uio_iovcnt == 1) {
 522                     iomode = NFSV3WRITE_FILESYNC;
 523                     error = nfs_writerpc(vp, uio, cred, &iomode, &must_commit);
 524                     if (must_commit)
 525                         nfs_clearcommit(vp->v_mount);
 526                     return (error);
 527                 }
 528                 nfsstats.biocache_writes++;
 529                 lbn = uio->uio_offset / biosize;
 530                 on = uio->uio_offset & (biosize-1);
 531                 n = min((unsigned)(biosize - on), uio->uio_resid);
 532 again:
 533                 if (uio->uio_offset + n > np->n_size) {
 534                         np->n_size = uio->uio_offset + n;
 535                         np->n_flag |= NMODIFIED;
 536                         if (UBCISVALID(vp))
 537                                 ubc_setsize(vp, (off_t)np->n_size); /* XXX check error */
 538                 }
 539                 bufsize = biosize;
 540 #if 0
 541 /* (removed for UBC) */
 542                 if ((lbn + 1) * biosize > np->n_size) {
 543                         bufsize = np->n_size - lbn * biosize;
 544                         bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
 545                 }
 546 #endif
 547                 bp = nfs_getwriteblk(vp, lbn, bufsize, p, cred, on, n);
 548                 if (!bp)
 549                         return (EINTR);
 550                 if (ISSET(bp->b_flags, B_ERROR)) {
 551                         error = bp->b_error;
 552                         brelse(bp);
 553                         return (error);
 554                 }
 555                 if (bp->b_wcred == NOCRED) {
 556                         crhold(cred);
 557                         bp->b_wcred = cred;
 558                 }
 559                 np->n_flag |= NMODIFIED;
 560
 561                 /*
 562                  * Check for valid write lease and get one as required.
 563                  * In case getblk() and/or bwrite() delayed us.
 564                  */
 565                 if ((nmp->nm_flag & NFSMNT_NQNFS) &&
 566                     NQNFS_CKINVALID(vp, np, ND_WRITE)) {
 567                         do {
 568                                 error = nqnfs_getlease(vp, ND_WRITE, cred, p);
 569                         } while (error == NQNFS_EXPIRED);
 570                         if (error) {
 571                                 brelse(bp);
 572                                 return (error);
 573                         }
 574                         if (np->n_lrev != np->n_brev ||
 575                             (np->n_flag & NQNFSNONCACHE)) {
 576                                 brelse(bp);
 577                                 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
 578                                 if (error)
 579                                         return (error);
 580                                 np->n_brev = np->n_lrev;
 581                                 goto again;
 582                         }
 583                 }
 584                 error = uiomove((char *)bp->b_data + on, n, uio);
 585                 if (error) {
 586                         SET(bp->b_flags, B_ERROR);
 587                         brelse(bp);
 588                         return (error);
 589                 }
 590                 if (bp->b_dirtyend > 0) {
 591                         bp->b_dirtyoff = min(on, bp->b_dirtyoff);
 592                         bp->b_dirtyend = max((on + n), bp->b_dirtyend);
 593                 } else {
 594                         bp->b_dirtyoff = on;
 595                         bp->b_dirtyend = on + n;
 596                 }
 597                 if (bp->b_validend == 0 || bp->b_validend < bp->b_dirtyoff ||
 598                     bp->b_validoff > bp->b_dirtyend) {
 599                         bp->b_validoff = bp->b_dirtyoff;
 600                         bp->b_validend = bp->b_dirtyend;
 601                 } else {
 602                         bp->b_validoff = min(bp->b_validoff, bp->b_dirtyoff);
 603                         bp->b_validend = max(bp->b_validend, bp->b_dirtyend);
 604                 }
 605
 606                 /*
 607                  * Since this block is being modified, it must be written
 608                  * again and not just committed.
 609                  */
 610                 CLR(bp->b_flags, B_NEEDCOMMIT);
 611
 612                 /*
 613                  * If the lease is non-cachable or IO_SYNC do bwrite().
 614                  */
 615                 if ((np->n_flag & NQNFSNONCACHE) || (ioflag & IO_SYNC)) {
 616                         bp->b_proc = p;
 617                         error = VOP_BWRITE(bp);
 618                         if (error)
 619                                 return (error);
 620                         if (np->n_flag & NQNFSNONCACHE) {
 621                                 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
 622                                 if (error)
 623                                         return (error);
 624                         }
 625                 } else if ((n + on) == biosize &&
 626                         (nmp->nm_flag & NFSMNT_NQNFS) == 0) {
 627                         bp->b_proc = (struct proc *)0;
 628                         SET(bp->b_flags, B_ASYNC);
 629                         (void)nfs_writebp(bp, 0);
 630                 } else
 631                         bdwrite(bp);
 632         } while (uio->uio_resid > 0 && n > 0);
 633         return (0);
 634 }
 635
 636 /*
 637  * Get a cache block for writing.  The range to be written is
 638  * (off..off+len) within the block.  This routine ensures that the
 639  * block is either has no dirty region or that the given range is
 640  * contiguous with the existing dirty region.
 641  */
 642 static struct buf *
 643 nfs_getwriteblk(vp, bn, size, p, cred, off, len)
 644         struct vnode *vp;
 645         daddr_t bn;
 646         int size;
 647         struct proc *p;
 648         struct ucred *cred;
 649         int off, len;
 650 {
 651         struct nfsnode *np = VTONFS(vp);
 652         struct buf *bp;
 653         int error;
 654         struct iovec iov;
 655         struct uio uio;
 656         off_t boff;
 657
 658  again:
 659         bp = nfs_getcacheblk(vp, bn, size, p, BLK_WRITE);
 660         if (!bp)
 661                 return (NULL);
 662         if (bp->b_wcred == NOCRED) {
 663                 crhold(cred);
 664                 bp->b_wcred = cred;
 665         }
 666
 667         if ((bp->b_blkno * DEV_BSIZE) + bp->b_dirtyend > np->n_size) {
 668                 bp->b_dirtyend = np->n_size - (bp->b_blkno * DEV_BSIZE);
 669         }
 670
 671         /*
 672          * UBC doesn't (yet) handle partial pages so nfs_biowrite was
 673          * hacked to never bdwrite, to start every little write right away.
 674          * Running IE Avie noticed the performance problem, thus this code,
 675          * which permits those delayed writes by ensuring an initial read
 676          * of the entire page.  The read may hit eof ("short read") but
 677          * that we will handle.
 678          *
 679          * We are quite dependant on the correctness of B_CACHE so check
 680          * that first in case of problems.
 681          */
 682         if (!ISSET(bp->b_flags, B_CACHE) && len < PAGE_SIZE) {
 683                 struct nfsnode *np = VTONFS(vp);
 684
 685                 boff = (off_t)bp->b_blkno * DEV_BSIZE;
 686                 uio.uio_iov = &iov;
 687                 uio.uio_iovcnt = 1;
 688                 uio.uio_offset = boff;
 689                 uio.uio_resid = PAGE_SIZE;
 690                 uio.uio_segflg = UIO_SYSSPACE;
 691                 uio.uio_rw = UIO_READ;
 692                 uio.uio_procp = p;
 693                 iov.iov_base = bp->b_data;
 694                 iov.iov_len = PAGE_SIZE;
 695                 error = nfs_readrpc(vp, &uio, cred);
 696                 if (error) {
 697                         bp->b_error = error;
 698                         SET(bp->b_flags, B_ERROR);
 699                         printf("nfs_getwriteblk: readrpc returned %d", error);
 700                 }
 701                 if (uio.uio_resid > 0)
 702                         bzero(iov.iov_base, uio.uio_resid);
 703                 bp->b_validoff = 0;
 704                 bp->b_validend = PAGE_SIZE - uio.uio_resid;
 705                 if (np->n_size > boff + bp->b_validend)
 706                         bp->b_validend = min(np->n_size - boff, PAGE_SIZE);
 707                 bp->b_dirtyoff = 0;
 708                 bp->b_dirtyend = 0;
 709         }
 710
 711         /*
 712          * If the new write will leave a contiguous dirty
 713          * area, just update the b_dirtyoff and b_dirtyend,
 714          * otherwise try to extend the dirty region.
 715          */
 716         if (bp->b_dirtyend > 0 &&
 717             (off > bp->b_dirtyend || (off + len) < bp->b_dirtyoff)) {
 718                 off_t start, end;
 719
 720                 boff = (off_t)bp->b_blkno * DEV_BSIZE;
 721                 if (off > bp->b_dirtyend) {
 722                         start = boff + bp->b_validend;
 723                         end = boff + off;
 724                 } else {
 725                         start = boff + off + len;
 726                         end = boff + bp->b_validoff;
 727                 }
 728
 729                 /*
 730                  * It may be that the valid region in the buffer
 731                  * covers the region we want, in which case just
 732                  * extend the dirty region.  Otherwise we try to
 733                  * extend the valid region.
 734                  */
 735                 if (end > start) {
 736                         uio.uio_iov = &iov;
 737                         uio.uio_iovcnt = 1;
 738                         uio.uio_offset = start;
 739                         uio.uio_resid = end - start;
 740                         uio.uio_segflg = UIO_SYSSPACE;
 741                         uio.uio_rw = UIO_READ;
 742                         uio.uio_procp = p;
 743                         iov.iov_base = bp->b_data + (start - boff);
 744                         iov.iov_len = end - start;
 745                         error = nfs_readrpc(vp, &uio, cred);
 746                         if (error) {
 747                                 /*
 748                                  * If we couldn't read, fall back to writing
 749                                  * out the old dirty region.
 750                                  */
 751                                 bp->b_proc = p;
 752                                 if (VOP_BWRITE(bp) == EINTR)
 753                                         return (NULL);
 754                                 goto again;
 755                         } else {
 756                                 /*
 757                                  * The read worked.
 758                                  */
 759                                 if (uio.uio_resid > 0) {
 760                                         /*
 761                                          * If there was a short read,
 762                                          * just zero fill.
 763                                          */
 764                                         bzero(iov.iov_base,
 765                                               uio.uio_resid);
 766                                 }
 767                                 if (off > bp->b_dirtyend)
 768                                         bp->b_validend = off;
 769                                 else
 770                                         bp->b_validoff = off + len;
 771                         }
 772                 }
 773
 774                 /*
 775                  * We now have a valid region which extends up to the
 776                  * dirty region which we want.
 777                  */
 778                 if (off > bp->b_dirtyend)
 779                         bp->b_dirtyend = off;
 780                 else
 781                         bp->b_dirtyoff = off + len;
 782         }
 783
 784         return bp;
 785 }
 786
 787 /*
 788  * Get an nfs cache block.
 789  * Allocate a new one if the block isn't currently in the cache
 790  * and return the block marked busy. If the calling process is
 791  * interrupted by a signal for an interruptible mount point, return
 792  * NULL.
 793  */
 794 static struct buf *
 795 nfs_getcacheblk(vp, bn, size, p, operation)
 796         struct vnode *vp;
 797         daddr_t bn;
 798         int size;
 799         struct proc *p;
 800         int operation;  /* defined in sys/buf.h */
 801 {
 802         register struct buf *bp;
 803         struct nfsmount *nmp = VFSTONFS(vp->v_mount);
 804         /*due to getblk/vm interractions, use vm page size or less values */
 805         int biosize = min(vp->v_mount->mnt_stat.f_iosize, PAGE_SIZE);
 806
 807         if (nmp->nm_flag & NFSMNT_INT) {
 808                 bp = getblk(vp, bn, size, PCATCH, 0, operation);
 809                 while (bp == (struct buf *)0) {
 810                         if (nfs_sigintr(nmp, (struct nfsreq *)0, p))
 811                                 return ((struct buf *)0);
 812                         bp = getblk(vp, bn, size, 0, 2 * hz, operation);
 813                 }
 814         } else
 815                 bp = getblk(vp, bn, size, 0, 0, operation);
 816
 817         if( vp->v_type == VREG)
 818                 bp->b_blkno = (bn * biosize) / DEV_BSIZE;
 819
 820         return (bp);
 821 }
 822
 823 /*
 824  * Flush and invalidate all dirty buffers. If another process is already
 825  * doing the flush, just wait for completion.
 826  */
 827 int
 828 nfs_vinvalbuf(vp, flags, cred, p, intrflg)
 829         struct vnode *vp;
 830         int flags;
 831         struct ucred *cred;
 832         struct proc *p;
 833         int intrflg;
 834 {
 835         register struct nfsnode *np = VTONFS(vp);
 836         struct nfsmount *nmp = VFSTONFS(vp->v_mount);
 837         int error = 0, slpflag, slptimeo;
 838
 839         if ((nmp->nm_flag & NFSMNT_INT) == 0)
 840                 intrflg = 0;
 841         if (intrflg) {
 842                 slpflag = PCATCH;
 843                 slptimeo = 2 * hz;
 844         } else {
 845                 slpflag = 0;
 846                 slptimeo = 0;
 847         }
 848         /*
 849          * First wait for any other process doing a flush to complete.
 850          */
 851         while (np->n_flag & NFLUSHINPROG) {
 852                 np->n_flag |= NFLUSHWANT;
 853                 error = tsleep((caddr_t)&np->n_flag, PRIBIO + 2, "nfsvinval",
 854                         slptimeo);
 855                 if (error && intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p))
 856                         return (EINTR);
 857         }
 858
 859         /*
 860          * Now, flush as required.
 861          */
 862         np->n_flag |= NFLUSHINPROG;
 863         error = vinvalbuf(vp, flags, cred, p, slpflag, 0);
 864         while (error) {
 865                 if (intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p)) {
 866                         np->n_flag &= ~NFLUSHINPROG;
 867                         if (np->n_flag & NFLUSHWANT) {
 868                                 np->n_flag &= ~NFLUSHWANT;
 869                                 wakeup((caddr_t)&np->n_flag);
 870                         }
 871                         return (EINTR);
 872                 }
 873                 error = vinvalbuf(vp, flags, cred, p, 0, slptimeo);
 874         }
 875         np->n_flag &= ~(NMODIFIED | NFLUSHINPROG);
 876         if (np->n_flag & NFLUSHWANT) {
 877                 np->n_flag &= ~NFLUSHWANT;
 878                 wakeup((caddr_t)&np->n_flag);
 879         }
 880         (void) ubc_clean(vp, 1); /* get the pages out of vm also */
 881         return (0);
 882 }
 883
 884 /*
 885  * Initiate asynchronous I/O. Return an error if no nfsiods are available.
 886  * This is mainly to avoid queueing async I/O requests when the nfsiods
 887  * are all hung on a dead server.
 888  */
 889 int
 890 nfs_asyncio(bp, cred)
 891         register struct buf *bp;
 892         struct ucred *cred;
 893 {
 894         struct nfsmount *nmp;
 895         int i;
 896         int gotiod;
 897         int slpflag = 0;
 898         int slptimeo = 0;
 899         int error;
 900
 901         if (nfs_numasync == 0)
 902                 return (EIO);
 903
 904         nmp = VFSTONFS(bp->b_vp->v_mount);
 905 again:
 906         if (nmp->nm_flag & NFSMNT_INT)
 907                 slpflag = PCATCH;
 908         gotiod = FALSE;
 909
 910         /*
 911          * Find a free iod to process this request.
 912          */
 913         for (i = 0; i < NFS_MAXASYNCDAEMON; i++)
 914                 if (nfs_iodwant[i]) {
 915                         /*
 916                          * Found one, so wake it up and tell it which
 917                          * mount to process.
 918                          */
 919                         NFS_DPF(ASYNCIO,
 920                                 ("nfs_asyncio: waking iod %d for mount %p\n",
 921                                  i, nmp));
 922                         nfs_iodwant[i] = (struct proc *)0;
 923                         nfs_iodmount[i] = nmp;
 924                         nmp->nm_bufqiods++;
 925                         wakeup((caddr_t)&nfs_iodwant[i]);
 926                         gotiod = TRUE;
 927                         break;
 928                 }
 929
 930         /*
 931          * If none are free, we may already have an iod working on this mount
 932          * point.  If so, it will process our request.
 933          */
 934         if (!gotiod) {
 935                 if (nmp->nm_bufqiods > 0) {
 936                         NFS_DPF(ASYNCIO,
 937                                 ("nfs_asyncio: %d iods are already processing mount %p\n",
 938                                  nmp->nm_bufqiods, nmp));
 939                         gotiod = TRUE;
 940                 }
 941         }
 942
 943         /*
 944          * If we have an iod which can process the request, then queue
 945          * the buffer.
 946          */
 947         if (gotiod) {
 948                 /*
 949                  * Ensure that the queue never grows too large.
 950                  */
 951                 while (nmp->nm_bufqlen >= 2*nfs_numasync) {
 952                         NFS_DPF(ASYNCIO,
 953                                 ("nfs_asyncio: waiting for mount %p queue to drain\n", nmp));
 954                         nmp->nm_bufqwant = TRUE;
 955                         error = tsleep(&nmp->nm_bufq, slpflag | PRIBIO,
 956                                        "nfsaio", slptimeo);
 957                         if (error) {
 958                                 if (nfs_sigintr(nmp, NULL, bp->b_proc))
 959                                         return (EINTR);
 960                                 if (slpflag == PCATCH) {
 961                                         slpflag = 0;
 962                                         slptimeo = 2 * hz;
 963                                 }
 964                         }
 965                         /*
 966                          * We might have lost our iod while sleeping,
 967                          * so check and loop if nescessary.
 968                          */
 969                         if (nmp->nm_bufqiods == 0) {
 970                                 NFS_DPF(ASYNCIO,
 971                                         ("nfs_asyncio: no iods after mount %p queue was drained, looping\n", nmp));
 972                                 goto again;
 973                         }
 974                 }
 975
 976                 if (ISSET(bp->b_flags, B_READ)) {
 977                         if (bp->b_rcred == NOCRED && cred != NOCRED) {
 978                                 crhold(cred);
 979                                 bp->b_rcred = cred;
 980                         }
 981                 } else {
 982                         SET(bp->b_flags, B_WRITEINPROG);
 983                         if (bp->b_wcred == NOCRED && cred != NOCRED) {
 984                                 crhold(cred);
 985                                 bp->b_wcred = cred;
 986                         }
 987                 }
 988
 989                 TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, b_freelist);
 990                 nmp->nm_bufqlen++;
 991                 return (0);
 992         }
 993
 994         /*
 995          * All the iods are busy on other mounts, so return EIO to
 996          * force the caller to process the i/o synchronously.
 997          */
 998         NFS_DPF(ASYNCIO, ("nfs_asyncio: no iods available, i/o is synchronous\n"));
 999         return (EIO);
1000 }
1001
1002 /*
1003  * Do an I/O operation to/from a cache block. This may be called
1004  * synchronously or from an nfsiod.
1005  */
1006 int
1007 nfs_doio(bp, cr, p)
1008         register struct buf *bp;
1009         struct ucred *cr;
1010         struct proc *p;
1011 {
1012         register struct uio *uiop;
1013         register struct vnode *vp;
1014         struct nfsnode *np;
1015         struct nfsmount *nmp;
1016         int error = 0, diff, len, iomode, must_commit = 0;
1017         struct uio uio;
1018         struct iovec io;
1019
1020         vp = bp->b_vp;
1021         NFSTRACE(NFSTRC_DIO, vp);
1022         np = VTONFS(vp);
1023         nmp = VFSTONFS(vp->v_mount);
1024         uiop = &uio;
1025         uiop->uio_iov = &io;
1026         uiop->uio_iovcnt = 1;
1027         uiop->uio_segflg = UIO_SYSSPACE;
1028         uiop->uio_procp = p;
1029
1030         /*
1031          * With UBC, getblk() can return a buf with B_DONE set.
1032          * This indicates that the VM has valid data for that page.
1033          * NFS being stateless, this case poses a problem.
1034          * By definition, the NFS server should always be consulted
1035          * for the data in that page.
1036          * So we choose to clear the B_DONE and to the IO.
1037          *
1038          * XXX revisit this if there is a performance issue.
1039          * XXX In that case, we could play the attribute cache games ...
1040          */
1041          if (ISSET(bp->b_flags, B_DONE)) {
1042                 if (!ISSET(bp->b_flags, B_ASYNC))
1043                         panic("nfs_doio: done and not async");
1044                 CLR(bp->b_flags, B_DONE);
1045         }
1046
1047         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 256)) | DBG_FUNC_START,
1048                      (int)np->n_size, bp->b_blkno * DEV_BSIZE, bp->b_bcount, bp->b_flags, 0);
1049
1050         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 257)) | DBG_FUNC_NONE,
1051                      bp->b_validoff, bp->b_validend, bp->b_dirtyoff, bp->b_dirtyend, 0);
1052
1053         /*
1054          * Historically, paging was done with physio, but no more.
1055          */
1056         if (ISSET(bp->b_flags, B_PHYS)) {
1057             /*
1058              * ...though reading /dev/drum still gets us here.
1059              */
1060             io.iov_len = uiop->uio_resid = bp->b_bcount;
1061             /* mapping was done by vmapbuf() */
1062             io.iov_base = bp->b_data;
1063             uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE;
1064             if (ISSET(bp->b_flags, B_READ)) {
1065                         uiop->uio_rw = UIO_READ;
1066                         nfsstats.read_physios++;
1067                         error = nfs_readrpc(vp, uiop, cr);
1068             } else {
1069                         int com;
1070
1071                         iomode = NFSV3WRITE_DATASYNC;
1072                         uiop->uio_rw = UIO_WRITE;
1073                         nfsstats.write_physios++;
1074                         error = nfs_writerpc(vp, uiop, cr, &iomode, &com);
1075             }
1076             if (error) {
1077                         SET(bp->b_flags, B_ERROR);
1078                         bp->b_error = error;
1079             }
1080         } else if (ISSET(bp->b_flags, B_READ)) {
1081             io.iov_len = uiop->uio_resid = bp->b_bcount;
1082             io.iov_base = bp->b_data;
1083             uiop->uio_rw = UIO_READ;
1084             switch (vp->v_type) {
1085             case VREG:
1086                 uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE;
1087                 nfsstats.read_bios++;
1088                 error = nfs_readrpc(vp, uiop, cr);
1089
1090                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 262)) | DBG_FUNC_NONE,
1091                              (int)np->n_size, bp->b_blkno * DEV_BSIZE, uiop->uio_resid, error, 0);
1092
1093
1094                 if (!error) {
1095                     bp->b_validoff = 0;
1096                     if (uiop->uio_resid) {
1097                         /*
1098                          * If len > 0, there is a hole in the file and
1099                          * no writes after the hole have been pushed to
1100                          * the server yet.
1101                          * Just zero fill the rest of the valid area.
1102                          */
1103                         diff = bp->b_bcount - uiop->uio_resid;
1104                         len = np->n_size - (((u_quad_t)bp->b_blkno) * DEV_BSIZE
1105                                 + diff);
1106                                 if (len > 0) {
1107                                         len = min(len, uiop->uio_resid);
1108                                         bzero((char *)bp->b_data + diff, len);
1109                                         bp->b_validend = diff + len;
1110
1111                                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 258)) | DBG_FUNC_NONE,
1112                                                      diff, len, 0, 1, 0);
1113
1114                                 } else
1115                                         bp->b_validend = diff;
1116                     } else
1117                                 bp->b_validend = bp->b_bcount;
1118 #if 1 /* USV + JOE [ */
1119                     if (bp->b_validend < bp->b_bufsize) {
1120                             /*
1121                              * we're about to release a partial buffer after a read... the only
1122                              * way we should get here is if this buffer contains the EOF
1123                              * before releasing it, we'll zero out to the end of the buffer
1124                              * so that if a mmap of this page occurs, we'll see zero's even
1125                              * if a ftruncate extends the file in the meantime
1126                              */
1127                             bzero((caddr_t)(bp->b_data + bp->b_validend), (bp->b_bufsize - bp->b_validend));
1128
1129                             KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 258)) | DBG_FUNC_NONE,
1130                                          bp->b_validend, (bp->b_bufsize - bp->b_validend), 0, 2, 0);
1131                     }
1132 #endif /* ] USV + JOE */
1133                 }
1134                 if (p && (vp->v_flag & VTEXT) &&
1135                         (((nmp->nm_flag & NFSMNT_NQNFS) &&
1136                           NQNFS_CKINVALID(vp, np, ND_READ) &&
1137                           np->n_lrev != np->n_brev) ||
1138                          (!(nmp->nm_flag & NFSMNT_NQNFS) &&
1139                           np->n_mtime != np->n_vattr.va_mtime.tv_sec))) {
1140                         uprintf("Process killed due to text file modification\n");
1141                         psignal(p, SIGKILL);
1142                         p->p_flag |= P_NOSWAP;
1143                 }
1144                 break;
1145             case VLNK:
1146                 uiop->uio_offset = (off_t)0;
1147                 nfsstats.readlink_bios++;
1148                 error = nfs_readlinkrpc(vp, uiop, cr);
1149                 break;
1150             case VDIR:
1151                 nfsstats.readdir_bios++;
1152                 uiop->uio_offset = ((u_quad_t)bp->b_lblkno) * NFS_DIRBLKSIZ;
1153                 if (!(nmp->nm_flag & NFSMNT_NFSV3))
1154                         nmp->nm_flag &= ~NFSMNT_RDIRPLUS; /* dk@farm.org */
1155                 if (nmp->nm_flag & NFSMNT_RDIRPLUS) {
1156                         error = nfs_readdirplusrpc(vp, uiop, cr);
1157                         if (error == NFSERR_NOTSUPP)
1158                                 nmp->nm_flag &= ~NFSMNT_RDIRPLUS;
1159                 }
1160                 if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0)
1161                         error = nfs_readdirrpc(vp, uiop, cr);
1162                 break;
1163             default:
1164                 printf("nfs_doio:  type %x unexpected\n",vp->v_type);
1165                 break;
1166             };
1167             if (error) {
1168                         SET(bp->b_flags, B_ERROR);
1169                         bp->b_error = error;
1170             }
1171         } else {
1172             if (((bp->b_blkno * DEV_BSIZE) + bp->b_dirtyend) > np->n_size)
1173                 bp->b_dirtyend = np->n_size - (bp->b_blkno * DEV_BSIZE);
1174
1175             if (bp->b_dirtyend > bp->b_dirtyoff) {
1176
1177                 io.iov_len = uiop->uio_resid = bp->b_dirtyend
1178                     - bp->b_dirtyoff;
1179                 uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE
1180                     + bp->b_dirtyoff;
1181                 io.iov_base = (char *)bp->b_data + bp->b_dirtyoff;
1182                 uiop->uio_rw = UIO_WRITE;
1183
1184                 nfsstats.write_bios++;
1185                 if ((bp->b_flags & (B_ASYNC | B_NEEDCOMMIT | B_NOCACHE)) == B_ASYNC)
1186                     iomode = NFSV3WRITE_UNSTABLE;
1187                 else
1188                     iomode = NFSV3WRITE_FILESYNC;
1189                 SET(bp->b_flags, B_WRITEINPROG);
1190                 error = nfs_writerpc(vp, uiop, cr, &iomode, &must_commit);
1191                 if (!error && iomode == NFSV3WRITE_UNSTABLE)
1192                     SET(bp->b_flags, B_NEEDCOMMIT);
1193                 else
1194                     CLR(bp->b_flags, B_NEEDCOMMIT);
1195                 CLR(bp->b_flags, B_WRITEINPROG);
1196
1197                 /*
1198                  * For an interrupted write, the buffer is still valid
1199                  * and the write hasn't been pushed to the server yet,
1200                  * so we can't set B_ERROR and report the interruption
1201                  * by setting B_EINTR. For the B_ASYNC case, B_EINTR
1202                  * is not relevant, so the rpc attempt is essentially
1203                  * a noop.  For the case of a V3 write rpc not being
1204                  * committed to stable storage, the block is still
1205                  * dirty and requires either a commit rpc or another
1206                  * write rpc with iomode == NFSV3WRITE_FILESYNC before
1207                  * the block is reused. This is indicated by setting
1208                  * the B_DELWRI and B_NEEDCOMMIT flags.
1209                  */
1210                 if (error == EINTR
1211                         || (!error && (bp->b_flags & B_NEEDCOMMIT))) {
1212                         int s;
1213
1214                         CLR(bp->b_flags, (B_INVAL|B_NOCACHE));
1215                         SET(bp->b_flags, B_DELWRI);
1216
1217                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 261)) | DBG_FUNC_NONE,
1218                                      bp->b_validoff, bp->b_validend, bp->b_bufsize, bp->b_bcount, 0);
1219
1220                         /*
1221                          * Since for the B_ASYNC case, nfs_bwrite() has reassigned the
1222                          * buffer to the clean list, we have to reassign it back to the
1223                          * dirty one. Ugh.
1224                          */
1225                         if (ISSET(bp->b_flags, B_ASYNC)) {
1226                                 s = splbio();
1227                                 reassignbuf(bp, vp);
1228                                 splx(s);
1229                         } else {
1230                                 SET(bp->b_flags, B_EINTR);
1231                         }
1232                 } else {
1233                         if (error) {
1234                                 SET(bp->b_flags, B_ERROR);
1235                                 bp->b_error = np->n_error = error;
1236                                 np->n_flag |= NWRITEERR;
1237                         }
1238                         bp->b_dirtyoff = bp->b_dirtyend = 0;
1239
1240 #if 1  /* JOE */
1241                         /*
1242                          * validoff and validend represent the real data present in this buffer
1243                          * if validoff is non-zero, than we have to invalidate the buffer and kill
1244                          * the page when biodone is called... the same is also true when validend
1245                          * doesn't extend all the way to the end of the buffer and validend doesn't
1246                          * equate to the current EOF... eventually we need to deal with this in a
1247                          * more humane way (like keeping the partial buffer without making it immediately
1248                          * available to the VM page cache).
1249                          */
1250                         if (bp->b_validoff)
1251                                 SET(bp->b_flags, B_INVAL);
1252                         else
1253                         if (bp->b_validend < bp->b_bufsize) {
1254                                 if ((((off_t)bp->b_blkno * (off_t)DEV_BSIZE) + bp->b_validend) == np->n_size) {
1255                                         bzero((caddr_t)(bp->b_data + bp->b_validend), (bp->b_bufsize - bp->b_validend));
1256
1257                                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 259)) | DBG_FUNC_NONE,
1258                                                      bp->b_validend, (bp->b_bufsize - bp->b_validend), 0, 0, 0);;
1259                                 }
1260                                 else
1261                                         SET(bp->b_flags, B_INVAL);
1262                         }
1263 #endif
1264                 }
1265
1266             } else {
1267
1268 #if 1  /* JOE */
1269                         if (bp->b_validoff)
1270                                 SET(bp->b_flags, B_INVAL);
1271                         else if (bp->b_validend < bp->b_bufsize) {
1272                                 if ((((off_t)bp->b_blkno * (off_t)DEV_BSIZE) + bp->b_validend) != np->n_size)
1273                                          SET(bp->b_flags, B_INVAL);
1274                         }
1275                         if (bp->b_flags & B_INVAL) {
1276                                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 260)) | DBG_FUNC_NONE,
1277                                              bp->b_validoff, bp->b_validend, bp->b_bufsize, bp->b_bcount, 0);
1278                         }
1279 #endif
1280                         bp->b_resid = 0;
1281                         biodone(bp);
1282                         NFSTRACE(NFSTRC_DIO_DONE, vp);
1283                         return (0);
1284             }
1285         }
1286         bp->b_resid = uiop->uio_resid;
1287         if (must_commit)
1288                 nfs_clearcommit(vp->v_mount);
1289
1290         if (bp->b_flags & B_INVAL) {
1291                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 260)) | DBG_FUNC_NONE,
1292                              bp->b_validoff, bp->b_validend, bp->b_bufsize, bp->b_bcount, 0);
1293         }
1294         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 256)) | DBG_FUNC_END,
1295                      bp->b_validoff, bp->b_validend, bp->b_bcount, error, 0);
1296
1297         biodone(bp);
1298         NFSTRACE(NFSTRC_DIO_DONE, vp);
1299         return (error);
1300 }