]> git.saurik.com Git - apple/xnu.git/blob - bsd/nfs/nfs_bio.c
xnu-344.34.tar.gz
[apple/xnu.git] / bsd / nfs / nfs_bio.c
1 /*
2 * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * The contents of this file constitute Original Code as defined in and
7 * are subject to the Apple Public Source License Version 1.1 (the
8 * "License"). You may not use this file except in compliance with the
9 * License. Please obtain a copy of the License at
10 * http://www.apple.com/publicsource and read it before using this file.
11 *
12 * This Original Code and all software distributed under the License are
13 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
14 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
15 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
17 * License for the specific language governing rights and limitations
18 * under the License.
19 *
20 * @APPLE_LICENSE_HEADER_END@
21 */
22 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
23 /*
24 * Copyright (c) 1989, 1993
25 * The Regents of the University of California. All rights reserved.
26 *
27 * This code is derived from software contributed to Berkeley by
28 * Rick Macklem at The University of Guelph.
29 *
30 * Redistribution and use in source and binary forms, with or without
31 * modification, are permitted provided that the following conditions
32 * are met:
33 * 1. Redistributions of source code must retain the above copyright
34 * notice, this list of conditions and the following disclaimer.
35 * 2. Redistributions in binary form must reproduce the above copyright
36 * notice, this list of conditions and the following disclaimer in the
37 * documentation and/or other materials provided with the distribution.
38 * 3. All advertising materials mentioning features or use of this software
39 * must display the following acknowledgement:
40 * This product includes software developed by the University of
41 * California, Berkeley and its contributors.
42 * 4. Neither the name of the University nor the names of its contributors
43 * may be used to endorse or promote products derived from this software
44 * without specific prior written permission.
45 *
46 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
47 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
48 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
49 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
50 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
51 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
52 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
53 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
54 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
55 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
56 * SUCH DAMAGE.
57 *
58 * @(#)nfs_bio.c 8.9 (Berkeley) 3/30/95
59 * FreeBSD-Id: nfs_bio.c,v 1.44 1997/09/10 19:52:25 phk Exp $
60 */
61 #include <sys/param.h>
62 #include <sys/systm.h>
63 #include <sys/resourcevar.h>
64 #include <sys/signalvar.h>
65 #include <sys/proc.h>
66 #include <sys/buf.h>
67 #include <sys/vnode.h>
68 #include <sys/mount.h>
69 #include <sys/kernel.h>
70 #include <sys/sysctl.h>
71 #include <sys/ubc.h>
72
73 #include <sys/vm.h>
74 #include <sys/vmparam.h>
75
76 #include <sys/time.h>
77 #include <kern/clock.h>
78
79 #include <nfs/rpcv2.h>
80 #include <nfs/nfsproto.h>
81 #include <nfs/nfs.h>
82 #include <nfs/nfsmount.h>
83 #include <nfs/nqnfs.h>
84 #include <nfs/nfsnode.h>
85
86 #include <sys/kdebug.h>
87
/*
 * Tracing helpers.  Each macro expands to a KERNEL_DEBUG() call whose
 * event code is FSDBG_CODE(DBG_FSRW, A) and whose four payload arguments
 * (B..E) are cast to int; a fifth payload slot is always passed as 0.
 * The variants differ only in the function qualifier OR-ed into the code:
 *   FSDBG      - DBG_FUNC_NONE
 *   FSDBG_TOP  - DBG_FUNC_START
 *   FSDBG_BOT  - DBG_FUNC_END
 */
88 #define FSDBG(A, B, C, D, E) \
89 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_NONE, \
90 (int)(B), (int)(C), (int)(D), (int)(E), 0)
91 #define FSDBG_TOP(A, B, C, D, E) \
92 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_START, \
93 (int)(B), (int)(C), (int)(D), (int)(E), 0)
94 #define FSDBG_BOT(A, B, C, D, E) \
95 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_END, \
96 (int)(B), (int)(C), (int)(D), (int)(E), 0)
97
/*
 * Forward declaration of the file-local cache-block helper, followed by
 * the globals this file declares extern and reads in the routines below.
 */
98 static struct buf *nfs_getcacheblk __P((struct vnode *vp, daddr_t bn, int size,
99 struct proc *p, int operation));
100
101 extern int nfs_numasync;
102 extern struct nfsstats nfsstats;
103 extern int nbdwrite;
104
105 /*
106 * Vnode op for read using bio
107 * Any similarity to readip() is purely coincidental
 *
 * Copies data for vp into the caller's uio through the buffer cache,
 * one cache block per pass of the do/while loop, until the request is
 * satisfied, an error occurs, or a pass moves no data (n <= 0).
 *
 * vp       - vnode to read; VREG, VLNK and VDIR are handled.
 * uio      - describes the transfer (uio_offset, uio_resid, uio_procp).
 * ioflag   - accepted but not referenced in this function.
 * cred     - credentials handed to the attribute, lease and I/O helpers.
 * getpages - nonzero selects BLK_PAGEIN instead of BLK_READ as the
 *            operation passed to nfs_getcacheblk().
 *
 * Returns 0 on success (including a zero-length request or a directory
 * read at or past the recorded directory EOF), EINVAL for a negative
 * offset, EINTR when nfs_getcacheblk() returns no buffer or a flushing
 * VOP_BWRITE() reports EINTR, or the error reported by one of the
 * helpers called here.
108 */
109 int
110 nfs_bioread(vp, uio, ioflag, cred, getpages)
111 register struct vnode *vp;
112 register struct uio *uio;
113 int ioflag;
114 struct ucred *cred;
115 int getpages;
116 {
117 register struct nfsnode *np = VTONFS(vp);
118 register int biosize, i;
119 off_t diff;
120 struct buf *bp = 0, *rabp;
121 struct vattr vattr;
122 struct proc *p;
123 struct nfsmount *nmp = VFSTONFS(vp->v_mount);
124 daddr_t lbn, rabn;
125 int bufsize;
126 int nra, error = 0, n = 0, on = 0, not_readin;
127 int operation = (getpages? BLK_PAGEIN : BLK_READ);
128
129 #if DIAGNOSTIC
130 if (uio->uio_rw != UIO_READ)
131 panic("nfs_read mode");
132 #endif
133 if (uio->uio_resid == 0)
134 return (0);
135 if (uio->uio_offset < 0)
136 return (EINVAL);
137 p = uio->uio_procp;
/* NFSv3 mount that has not fetched fsinfo yet: fetch it now, ignoring the result. */
138 if ((nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_GOTFSINFO)) == NFSMNT_NFSV3)
139 (void)nfs_fsinfo(nmp, vp, cred, p);
140 /*due to getblk/vm interactions, use vm page size or less values */
141 biosize = min(vp->v_mount->mnt_stat.f_iosize, PAGE_SIZE);
142 /*
143 * For nfs, cache consistency can only be maintained approximately.
144 * Although RFC1094 does not specify the criteria, the following is
145 * believed to be compatible with the reference port.
146 * For nqnfs, full cache consistency is maintained within the loop.
147 * For nfs:
148 * If the file's modify time on the server has changed since the
149 * last read rpc or you have written to the file,
150 * you may have lost data cache consistency with the
151 * server, so flush all of the file's data out of the cache.
152 * Then force a getattr rpc to ensure that you have up to date
153 * attributes.
154 * NB: This implies that cache data can be read when up to
155 * NFS_ATTRTIMEO seconds out of date. If you find that you need current
156 * attributes this could be forced by setting n_attrstamp to 0 before
157 * the VOP_GETATTR() call.
158 */
159 if ((nmp->nm_flag & NFSMNT_NQNFS) == 0) {
160 if (np->n_flag & NMODIFIED) {
161 if (vp->v_type != VREG) {
162 if (vp->v_type != VDIR)
163 panic("nfs: bioread, not dir");
164 nfs_invaldir(vp);
165 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
166 if (error)
167 return (error);
168 }
169 np->n_attrstamp = 0;
170 error = VOP_GETATTR(vp, &vattr, cred, p);
171 if (error)
172 return (error);
173 np->n_mtime = vattr.va_mtime.tv_sec;
174 } else {
175 error = VOP_GETATTR(vp, &vattr, cred, p);
176 if (error)
177 return (error);
178 if (np->n_mtime != vattr.va_mtime.tv_sec) {
179 if (vp->v_type == VDIR)
180 nfs_invaldir(vp);
181 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
182 if (error)
183 return (error);
184 np->n_mtime = vattr.va_mtime.tv_sec;
185 }
186 }
187 }
188 do {
189
190 /*
191 * Get a valid lease. If cached data is stale, flush it.
192 */
193 if (nmp->nm_flag & NFSMNT_NQNFS) {
194 if (NQNFS_CKINVALID(vp, np, ND_READ)) {
195 do {
196 error = nqnfs_getlease(vp, ND_READ, cred, p);
197 } while (error == NQNFS_EXPIRED);
198 if (error)
199 return (error);
200 if (np->n_lrev != np->n_brev ||
201 (np->n_flag & NQNFSNONCACHE) ||
202 ((np->n_flag & NMODIFIED) && vp->v_type == VDIR)) {
203 if (vp->v_type == VDIR)
204 nfs_invaldir(vp);
205 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
206 if (error)
207 return (error);
208 np->n_brev = np->n_lrev;
209 }
210 } else if (vp->v_type == VDIR && (np->n_flag & NMODIFIED)) {
211 nfs_invaldir(vp);
212 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
213 if (error)
214 return (error);
215 }
216 }
/*
 * Non-cachable node: regular files and symlinks are read straight
 * from the server; directories fall through to the cached path below.
 */
217 if (np->n_flag & NQNFSNONCACHE) {
218 switch (vp->v_type) {
219 case VREG:
220 return (nfs_readrpc(vp, uio, cred));
221 case VLNK:
222 return (nfs_readlinkrpc(vp, uio, cred));
223 case VDIR:
224 break;
225 default:
226 printf(" NQNFSNONCACHE: type %x unexpected\n",
227 vp->v_type);
228 };
229 }
230 switch (vp->v_type) {
231 case VREG:
232 nfsstats.biocache_reads++;
233 lbn = uio->uio_offset / biosize;
234 on = uio->uio_offset & (biosize - 1);
235 not_readin = 1;
/* not_readin is cleared below if this pass has to fetch the block with nfs_doio() */
236
237 /*
238 * Start the read ahead(s), as required.
239 */
240 if (nfs_numasync > 0 && nmp->nm_readahead > 0) {
241 for (nra = 0; nra < nmp->nm_readahead &&
242 (off_t)(lbn + 1 + nra) * biosize < np->n_size;
243 nra++) {
244 rabn = lbn + 1 + nra;
245 if (!incore(vp, rabn)) {
246 rabp = nfs_getcacheblk(vp, rabn, biosize, p, operation);
247 if (!rabp)
248 return (EINTR);
249 if (!ISSET(rabp->b_flags, (B_CACHE|B_DELWRI))) {
250 SET(rabp->b_flags, (B_READ | B_ASYNC));
251 if (nfs_asyncio(rabp, cred)) {
252 SET(rabp->b_flags, (B_INVAL|B_ERROR));
253 rabp->b_error = EIO;
254 brelse(rabp);
255 }
256 } else
257 brelse(rabp);
258 }
259 }
260 }
261
262 /*
263 * If the block is in the cache and has the required data
264 * in a valid region, just copy it out.
265 * Otherwise, get the block and write back/read in,
266 * as required.
267 */
268 again:
269 bufsize = biosize;
/*
 * For the block that contains end-of-file, ask for a buffer trimmed to
 * the file size, rounded up to a multiple of DEV_BSIZE.
 */
270 if ((off_t)(lbn + 1) * biosize > np->n_size &&
271 (off_t)(lbn + 1) * biosize - np->n_size < biosize) {
272 bufsize = np->n_size - (off_t)lbn * biosize;
273 bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
274 }
275 bp = nfs_getcacheblk(vp, lbn, bufsize, p, operation);
276 if (!bp)
277 return (EINTR);
278
279 if (!ISSET(bp->b_flags, B_CACHE)) {
280 SET(bp->b_flags, B_READ);
281 CLR(bp->b_flags, (B_DONE | B_ERROR | B_INVAL));
282 not_readin = 0;
283 error = nfs_doio(bp, cred, p);
284 if (error) {
285 brelse(bp);
286 return (error);
287 }
288 }
289 if (bufsize > on) {
290 n = min((unsigned)(bufsize - on), uio->uio_resid);
291 } else {
292 n = 0;
293 }
/* clamp the copy length to what remains of the file */
294 diff = np->n_size - uio->uio_offset;
295 if (diff < n)
296 n = diff;
/*
 * The block came from the cache.  If its valid region does not cover
 * the requested range, get rid of it (writing it out first when it has
 * a dirty region) and go back to fetch it again.
 */
297 if (not_readin && n > 0) {
298 if (on < bp->b_validoff || (on + n) > bp->b_validend) {
299 SET(bp->b_flags, (B_NOCACHE|B_INVAFTERWRITE));
300 if (bp->b_dirtyend > 0) {
301 if (!ISSET(bp->b_flags, B_DELWRI))
302 panic("nfsbioread");
303 if (VOP_BWRITE(bp) == EINTR)
304 return (EINTR);
305 } else
306 brelse(bp);
307 goto again;
308 }
309 }
310 vp->v_lastr = lbn;
/* never copy past the end of the buffer's valid region */
311 diff = (on >= bp->b_validend) ? 0 : (bp->b_validend - on);
312 if (diff < n)
313 n = diff;
314 break;
315 case VLNK:
316 nfsstats.biocache_readlinks++;
317 bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, p, operation);
318 if (!bp)
319 return (EINTR);
320 if (!ISSET(bp->b_flags, B_CACHE)) {
321 SET(bp->b_flags, B_READ);
322 error = nfs_doio(bp, cred, p);
323 if (error) {
324 SET(bp->b_flags, B_ERROR);
325 brelse(bp);
326 return (error);
327 }
328 }
329 n = min(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid);
330 on = 0;
331 break;
332 case VDIR:
333 nfsstats.biocache_readdirs++;
334 if (np->n_direofoffset
335 && uio->uio_offset >= np->n_direofoffset) {
336 return (0);
337 }
338 lbn = uio->uio_offset / NFS_DIRBLKSIZ;
339 on = uio->uio_offset & (NFS_DIRBLKSIZ - 1);
340 bp = nfs_getcacheblk(vp, lbn, NFS_DIRBLKSIZ, p, operation);
341 if (!bp)
342 return (EINTR);
343 if (!ISSET(bp->b_flags, B_CACHE)) {
344 SET(bp->b_flags, B_READ);
345 error = nfs_doio(bp, cred, p);
346 if (error) {
347 brelse(bp);
348 }
349 while (error == NFSERR_BAD_COOKIE) {
350 nfs_invaldir(vp);
351 error = nfs_vinvalbuf(vp, 0, cred, p, 1);
352 /*
353 * Yuck! The directory has been modified on the
354 * server. The only way to get the block is by
355 * reading from the beginning to get all the
356 * offset cookies.
357 */
358 for (i = 0; i <= lbn && !error; i++) {
359 if (np->n_direofoffset
360 && (i * NFS_DIRBLKSIZ) >= np->n_direofoffset)
361 return (0);
362 bp = nfs_getcacheblk(vp, i, NFS_DIRBLKSIZ, p,
363 operation);
364 if (!bp)
365 return (EINTR);
366 if (!ISSET(bp->b_flags, B_CACHE)) {
367 SET(bp->b_flags, B_READ);
368 error = nfs_doio(bp, cred, p);
369 /*
370 * no error + B_INVAL == directory EOF,
371 * use the block.
372 */
373 if (error == 0 && (bp->b_flags & B_INVAL))
374 break;
375 }
376 /*
377 * An error will throw away the block and the
378 * for loop will break out. If no error and this
379 * is not the block we want, we throw away the
380 * block and go for the next one via the for loop.
381 */
382 if (error || i < lbn)
383 brelse(bp);
384 }
385 }
386 /*
387 * The above while is repeated if we hit another cookie
388 * error. If we hit an error and it wasn't a cookie error,
389 * we give up.
390 */
391 if (error)
392 return (error);
393 }
394
395 /*
396 * If not eof and read aheads are enabled, start one.
397 * (You need the current block first, so that you have the
398 * directory offset cookie of the next block.)
399 */
400 if (nfs_numasync > 0 && nmp->nm_readahead > 0 &&
401 (np->n_direofoffset == 0 ||
402 (lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) &&
403 !(np->n_flag & NQNFSNONCACHE) &&
404 !incore(vp, lbn + 1)) {
405 rabp = nfs_getcacheblk(vp, lbn + 1, NFS_DIRBLKSIZ, p,
406 operation);
407 if (rabp) {
408 if (!ISSET(rabp->b_flags, (B_CACHE|B_DELWRI))) {
409 SET(rabp->b_flags, (B_READ | B_ASYNC));
410 if (nfs_asyncio(rabp, cred)) {
411 SET(rabp->b_flags, (B_INVAL|B_ERROR));
412 rabp->b_error = EIO;
413 brelse(rabp);
414 }
415 } else {
416 brelse(rabp);
417 }
418 }
419 }
420 /*
421 * Make sure we use a signed variant of min() since
422 * the second term may be negative.
423 */
424 n = lmin(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid - on);
425 /*
426 * Unlike VREG files, whose buffer size ( bp->b_bcount ) is
427 * chopped for the EOF condition, we cannot tell how large
428 * NFS directories are going to be until we hit EOF. So
429 * an NFS directory buffer is *not* chopped to its EOF. Now,
430 * it just so happens that b_resid will effectively chop it
431 * to EOF. *BUT* this information is lost if the buffer goes
432 * away and is reconstituted into a B_CACHE state (recovered
433 * from VM) later. So we keep track of the directory eof
434 * in np->n_direofoffset and chop it off as an extra step
435 * right here.
436 */
437 if (np->n_direofoffset &&
438 n > np->n_direofoffset - uio->uio_offset)
439 n = np->n_direofoffset - uio->uio_offset;
440 break;
441 default:
442 printf(" nfs_bioread: type %x unexpected\n",vp->v_type);
443 break;
444 };
445
/* copy out whatever this pass made available */
446 if (n > 0) {
447 error = uiomove(bp->b_data + on, (int)n, uio);
448 }
449 switch (vp->v_type) {
450 case VREG:
451 break;
452 case VLNK:
/* n = 0 makes the loop condition below fail, so a symlink is read in a single pass */
453 n = 0;
454 break;
455 case VDIR:
456 if (np->n_flag & NQNFSNONCACHE)
457 SET(bp->b_flags, B_INVAL);
458 break;
459 default:
460 printf(" nfs_bioread: type %x unexpected\n",vp->v_type);
461 }
462 brelse(bp);
463 } while (error == 0 && uio->uio_resid > 0 && n > 0);
464 return (error);
465 }
466
467
468 /*
469 * Vnode op for write using bio
 *
 * Arguments arrive packed in *ap:
 *   a_vp     - vnode to write; anything other than VREG is rejected.
 *   a_uio    - describes the transfer (offset, resid, iovecs, process).
 *   a_ioflag - IO_APPEND and IO_SYNC are examined here.
 *   a_cred   - credentials for the RPC helpers and for bp->b_wcred.
 *
 * Data is copied into buffer cache blocks one block per pass of the
 * do/while loop.  Each block is then written synchronously, started
 * asynchronously, or left as a delayed write, as decided at the bottom
 * of the loop.
 *
 * Returns:
 *   EIO      - vp is not a regular file.
 *   n_error  - if NWRITEERR was set on the node (the flag is cleared).
 *   EINVAL   - negative file offset.
 *   EFBIG    - the write would exceed RLIMIT_FSIZE (SIGXFSZ is posted).
 *   EINTR    - nfs_getcacheblk() returned no buffer.
 *   other    - error reported by a helper called here.
 *   0        - zero-length request, or the loop ran to completion.
470 */
471 int
472 nfs_write(ap)
473 struct vop_write_args /* {
474 struct vnode *a_vp;
475 struct uio *a_uio;
476 int a_ioflag;
477 struct ucred *a_cred;
478 } */ *ap;
479 {
480 register int biosize;
481 register struct uio *uio = ap->a_uio;
482 struct proc *p = uio->uio_procp;
483 register struct vnode *vp = ap->a_vp;
484 struct nfsnode *np = VTONFS(vp);
485 register struct ucred *cred = ap->a_cred;
486 int ioflag = ap->a_ioflag;
487 struct buf *bp;
488 struct vattr vattr;
489 struct nfsmount *nmp = VFSTONFS(vp->v_mount);
490 daddr_t lbn;
491 int bufsize;
492 int n, on, error = 0, iomode, must_commit;
493 off_t boff;
494 struct iovec iov;
495 struct uio auio;
496
497 #if DIAGNOSTIC
498 if (uio->uio_rw != UIO_WRITE)
499 panic("nfs_write mode");
500 if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != current_proc())
501 panic("nfs_write proc");
502 #endif
503 if (vp->v_type != VREG)
504 return (EIO);
/* report (once) an error recorded on the node, clearing the flag */
505 if (np->n_flag & NWRITEERR) {
506 np->n_flag &= ~NWRITEERR;
507 return (np->n_error);
508 }
509 if ((nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_GOTFSINFO)) == NFSMNT_NFSV3)
510 (void)nfs_fsinfo(nmp, vp, cred, p);
511 if (ioflag & (IO_APPEND | IO_SYNC)) {
512 if (np->n_flag & NMODIFIED) {
513 np->n_attrstamp = 0;
514 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
515 if (error)
516 return (error);
517 }
/* appending: force a getattr, then position the write at n_size */
518 if (ioflag & IO_APPEND) {
519 np->n_attrstamp = 0;
520 error = VOP_GETATTR(vp, &vattr, cred, p);
521 if (error)
522 return (error);
523 uio->uio_offset = np->n_size;
524 }
525 }
526 if (uio->uio_offset < 0)
527 return (EINVAL);
528 if (uio->uio_resid == 0)
529 return (0);
530 /*
531 * Maybe this should be above the vnode op call, but so long as
532 * file servers have no limits, i don't think it matters
533 */
534 if (p && uio->uio_offset + uio->uio_resid >
535 p->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
536 psignal(p, SIGXFSZ);
537 return (EFBIG);
538 }
539 /*
540 * I use nm_rsize, not nm_wsize so that all buffer cache blocks
541 * will be the same size within a filesystem. nfs_writerpc will
542 * still use nm_wsize when sizing the rpc's.
543 */
544 /*due to getblk/vm interactions, use vm page size or less values */
545 biosize = min(vp->v_mount->mnt_stat.f_iosize, PAGE_SIZE);
546
547 do {
548 /*
549 * Check for a valid write lease.
550 */
551 if ((nmp->nm_flag & NFSMNT_NQNFS) &&
552 NQNFS_CKINVALID(vp, np, ND_WRITE)) {
553 do {
554 error = nqnfs_getlease(vp, ND_WRITE, cred, p);
555 } while (error == NQNFS_EXPIRED);
556 if (error)
557 return (error);
558 if (np->n_lrev != np->n_brev ||
559 (np->n_flag & NQNFSNONCACHE)) {
560 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
561 if (error)
562 return (error);
563 np->n_brev = np->n_lrev;
564 }
565 }
/*
 * Non-cachable node and a single iovec: skip the buffer cache and
 * issue a FILESYNC write RPC directly, then return its result.
 */
566 if ((np->n_flag & NQNFSNONCACHE) && uio->uio_iovcnt == 1) {
567 iomode = NFSV3WRITE_FILESYNC;
568 error = nfs_writerpc(vp, uio, cred, &iomode, &must_commit);
569 if (must_commit)
570 nfs_clearcommit(vp->v_mount);
571 return (error);
572 }
573 nfsstats.biocache_writes++;
/* lbn = block index, on = offset within the block, n = bytes for this pass */
574 lbn = uio->uio_offset / biosize;
575 on = uio->uio_offset & (biosize-1);
576 n = min((unsigned)(biosize - on), uio->uio_resid);
577 again:
578 bufsize = biosize;
579 #if 0
580 /* (removed for UBC) */
581 if ((lbn + 1) * biosize > np->n_size) {
582 bufsize = np->n_size - lbn * biosize;
583 bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
584 }
585 #endif
586 /*
587 * Get a cache block for writing. The range to be written is
588 * (off..off+len) within the block. We ensure that the block
589 * either has no dirty region or that the given range is
590 * contiguous with the existing dirty region.
591 */
592 bp = nfs_getcacheblk(vp, lbn, bufsize, p, BLK_WRITE);
593 if (!bp)
594 return (EINTR);
595 /*
596 * Resize nfsnode *after* we busy the buffer to prevent
597 * readers from reading garbage.
598 * If there was a partial buf at the old eof, validate
599 * and zero the new bytes.
600 */
601 if (uio->uio_offset + n > np->n_size) {
602 struct buf *bp0 = NULL;
603 daddr_t bn = np->n_size / biosize;
604 int off = np->n_size & (biosize - 1);
605
606 if (off && bn < lbn && incore(vp, bn))
607 bp0 = nfs_getcacheblk(vp, bn, biosize, p,
608 BLK_WRITE);
609 np->n_flag |= NMODIFIED;
610 np->n_size = uio->uio_offset + n;
611 ubc_setsize(vp, (off_t)np->n_size); /* XXX errors */
612 if (bp0) {
613 bzero((char *)bp0->b_data + off, biosize - off);
614 bp0->b_validend = biosize;
615 brelse(bp0);
616 }
617 }
618 /*
619 * NFS has embedded ucred so crhold() risks zone corruption
620 */
621 if (bp->b_wcred == NOCRED)
622 bp->b_wcred = crdup(cred);
623 /*
624 * If dirtyend exceeds file size, chop it down. This should
625 * not occur unless there is a race.
626 */
627 if ((off_t)bp->b_blkno * DEV_BSIZE + bp->b_dirtyend >
628 np->n_size)
629 bp->b_dirtyend = np->n_size - (off_t)bp->b_blkno *
630 DEV_BSIZE;
631 /*
632 * UBC doesn't (yet) handle partial pages so nfs_biowrite was
633 * hacked to never bdwrite, to start every little write right
634 * away. Running IE Avie noticed the performance problem, thus
635 * this code, which permits those delayed writes by ensuring an
636 * initial read of the entire page. The read may hit eof
637 * ("short read") but that we will handle.
638 *
639 * We are quite dependent on the correctness of B_CACHE so check
640 * that first in case of problems.
641 */
642 if (!ISSET(bp->b_flags, B_CACHE) && n < PAGE_SIZE) {
643 boff = (off_t)bp->b_blkno * DEV_BSIZE;
644 auio.uio_iov = &iov;
645 auio.uio_iovcnt = 1;
646 auio.uio_offset = boff;
647 auio.uio_resid = PAGE_SIZE;
648 auio.uio_segflg = UIO_SYSSPACE;
649 auio.uio_rw = UIO_READ;
650 auio.uio_procp = p;
651 iov.iov_base = bp->b_data;
652 iov.iov_len = PAGE_SIZE;
653 error = nfs_readrpc(vp, &auio, cred);
/* a failure is only recorded on the buffer here; the B_ERROR check further down returns it */
654 if (error) {
655 bp->b_error = error;
656 SET(bp->b_flags, B_ERROR);
657 printf("nfs_write: readrpc %d", error);
658 }
659 if (auio.uio_resid > 0)
660 bzero(iov.iov_base, auio.uio_resid);
661 bp->b_validoff = 0;
662 bp->b_validend = PAGE_SIZE - auio.uio_resid;
663 if (np->n_size > boff + bp->b_validend)
664 bp->b_validend = min(np->n_size - boff,
665 PAGE_SIZE);
666 bp->b_dirtyoff = 0;
667 bp->b_dirtyend = 0;
668 }
669
670 /*
671 * If the new write will leave a contiguous dirty
672 * area, just update the b_dirtyoff and b_dirtyend,
673 * otherwise try to extend the dirty region.
674 */
675 if (bp->b_dirtyend > 0 &&
676 (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) {
677 off_t start, end;
678
679 boff = (off_t)bp->b_blkno * DEV_BSIZE;
/* [start, end) is the gap between the existing valid region and the new write */
680 if (on > bp->b_dirtyend) {
681 start = boff + bp->b_validend;
682 end = boff + on;
683 } else {
684 start = boff + on + n;
685 end = boff + bp->b_validoff;
686 }
687
688 /*
689 * It may be that the valid region in the buffer
690 * covers the region we want, in which case just
691 * extend the dirty region. Otherwise we try to
692 * extend the valid region.
693 */
694 if (end > start) {
695 auio.uio_iov = &iov;
696 auio.uio_iovcnt = 1;
697 auio.uio_offset = start;
698 auio.uio_resid = end - start;
699 auio.uio_segflg = UIO_SYSSPACE;
700 auio.uio_rw = UIO_READ;
701 auio.uio_procp = p;
702 iov.iov_base = bp->b_data + (start - boff);
703 iov.iov_len = end - start;
704 error = nfs_readrpc(vp, &auio, cred);
705 /*
706 * If we couldn't read, do not do a VOP_BWRITE
707 * as originally coded. That could also error
708 * and looping back to "again" as it was doing
709 * could have us stuck trying to write same buf
710 * again. nfs_write, will get the entire region
711 * if nfs_readrpc succeeded. If unsuccessful
712 * we should just error out. Errors like ESTALE
713 * would keep us looping rather than transient
714 * errors justifying a retry. We can return here
715 * instead of altering dirty region later. We
716 * did not write old dirty region at this point.
717 */
718 if (error) {
719 bp->b_error = error;
720 SET(bp->b_flags, B_ERROR);
721 printf("nfs_write: readrpc2 %d", error);
722 brelse(bp);
723 return (error);
724 }
725 /*
726 * The read worked.
727 * If there was a short read, just zero fill.
728 */
729 if (auio.uio_resid > 0)
730 bzero(iov.iov_base, auio.uio_resid);
731 if (on > bp->b_dirtyend)
732 bp->b_validend = on;
733 else
734 bp->b_validoff = on + n;
735 }
736 /*
737 * We now have a valid region which extends up to the
738 * dirty region which we want.
739 */
740 if (on > bp->b_dirtyend)
741 bp->b_dirtyend = on;
742 else
743 bp->b_dirtyoff = on + n;
744 }
745 if (ISSET(bp->b_flags, B_ERROR)) {
746 error = bp->b_error;
747 brelse(bp);
748 return (error);
749 }
750 /*
751 * NFS has embedded ucred so crhold() risks zone corruption
752 */
753 if (bp->b_wcred == NOCRED)
754 bp->b_wcred = crdup(cred);
755 np->n_flag |= NMODIFIED;
756
757 /*
758 * Check for valid write lease and get one as required.
759 * In case getblk() and/or bwrite() delayed us.
760 */
761 if ((nmp->nm_flag & NFSMNT_NQNFS) &&
762 NQNFS_CKINVALID(vp, np, ND_WRITE)) {
763 do {
764 error = nqnfs_getlease(vp, ND_WRITE, cred, p);
765 } while (error == NQNFS_EXPIRED);
766 if (error) {
767 brelse(bp);
768 return (error);
769 }
770 if (np->n_lrev != np->n_brev ||
771 (np->n_flag & NQNFSNONCACHE)) {
772 brelse(bp);
773 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
774 if (error)
775 return (error);
776 np->n_brev = np->n_lrev;
777 goto again;
778 }
779 }
780 error = uiomove((char *)bp->b_data + on, n, uio);
781 if (error) {
782 SET(bp->b_flags, B_ERROR);
783 brelse(bp);
784 return (error);
785 }
/* grow (or start) the dirty region so that it includes the bytes just copied in */
786 if (bp->b_dirtyend > 0) {
787 bp->b_dirtyoff = min(on, bp->b_dirtyoff);
788 bp->b_dirtyend = max((on + n), bp->b_dirtyend);
789 } else {
790 bp->b_dirtyoff = on;
791 bp->b_dirtyend = on + n;
792 }
/* valid region: replaced by the dirty region if disjoint or empty, otherwise the union of both */
793 if (bp->b_validend == 0 || bp->b_validend < bp->b_dirtyoff ||
794 bp->b_validoff > bp->b_dirtyend) {
795 bp->b_validoff = bp->b_dirtyoff;
796 bp->b_validend = bp->b_dirtyend;
797 } else {
798 bp->b_validoff = min(bp->b_validoff, bp->b_dirtyoff);
799 bp->b_validend = max(bp->b_validend, bp->b_dirtyend);
800 }
801
802 /*
803 * Since this block is being modified, it must be written
804 * again and not just committed.
805 */
806 CLR(bp->b_flags, B_NEEDCOMMIT);
807
808 /*
809 * If the lease is non-cachable or IO_SYNC do bwrite().
810 */
811 if ((np->n_flag & NQNFSNONCACHE) || (ioflag & IO_SYNC)) {
812 bp->b_proc = p;
813 error = VOP_BWRITE(bp);
814 if (error)
815 return (error);
816 if (np->n_flag & NQNFSNONCACHE) {
817 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
818 if (error)
819 return (error);
820 }
/*
 * The write ended exactly at the end of the block on a non-NQNFS
 * mount: start an asynchronous write now.  Otherwise leave the block
 * as a delayed write.
 */
821 } else if ((n + on) == biosize &&
822 (nmp->nm_flag & NFSMNT_NQNFS) == 0) {
823 bp->b_proc = (struct proc *)0;
824 SET(bp->b_flags, B_ASYNC);
825 (void)nfs_writebp(bp, 0);
826 } else
827 bdwrite(bp);
828 } while (uio->uio_resid > 0 && n > 0);
829 return (0);
830 }
831
832
833 /*
834 * Get an nfs cache block.
835 * Allocate a new one if the block isn't currently in the cache
836 * and return the block marked busy. If the calling process is
837 * interrupted by a signal for an interruptible mount point, return
838 * NULL.
839 */
840 static struct buf *
841 nfs_getcacheblk(vp, bn, size, p, operation)
842 struct vnode *vp;
843 daddr_t bn;
844 int size;
845 struct proc *p;
846 int operation; /* defined in sys/buf.h */
847 {
848 register struct buf *bp;
849 struct nfsmount *nmp = VFSTONFS(vp->v_mount);
850 /*due to getblk/vm interractions, use vm page size or less values */
851 int biosize = min(vp->v_mount->mnt_stat.f_iosize, PAGE_SIZE);
852
853 if (nbdwrite > ((nbuf/4)*3) && operation == BLK_WRITE) {
854 #define __BUFFERS_RECLAIMED 2
855 struct buf *tbp[__BUFFERS_RECLAIMED];
856 int i;
857
858 /* too many delayed writes, try to free up some buffers */
859 for (i = 0; i < __BUFFERS_RECLAIMED; i++)
860 tbp[i] = geteblk(512);
861
862 /* Yield to IO thread */
863 (void)tsleep((caddr_t)&nbdwrite, PCATCH, "nbdwrite", 1);
864
865 for (i = (__BUFFERS_RECLAIMED - 1); i >= 0; i--)
866 brelse(tbp[i]);
867 }
868
869 if (nmp->nm_flag & NFSMNT_INT) {
870 bp = getblk(vp, bn, size, PCATCH, 0, operation);
871 while (bp == (struct buf *)0) {
872 if (nfs_sigintr(nmp, (struct nfsreq *)0, p))
873 return ((struct buf *)0);
874 bp = getblk(vp, bn, size, 0, 2 * hz, operation);
875 }
876 } else
877 bp = getblk(vp, bn, size, 0, 0, operation);
878
879 if( vp->v_type == VREG)
880 bp->b_blkno = ((off_t)bn * biosize) / DEV_BSIZE;
881
882 return (bp);
883 }
884
885 /*
886 * Flush and invalidate all dirty buffers. If another process is already
887 * doing the flush, just wait for completion.
888 */
889 int
890 nfs_vinvalbuf(vp, flags, cred, p, intrflg)
891 struct vnode *vp;
892 int flags;
893 struct ucred *cred;
894 struct proc *p;
895 int intrflg;
896 {
897 register struct nfsnode *np = VTONFS(vp);
898 struct nfsmount *nmp = VFSTONFS(vp->v_mount);
899 int error = 0, slpflag, slptimeo;
900 int didhold = 0;
901
902 if ((nmp->nm_flag & NFSMNT_INT) == 0)
903 intrflg = 0;
904 if (intrflg) {
905 slpflag = PCATCH;
906 slptimeo = 2 * hz;
907 } else {
908 slpflag = 0;
909 slptimeo = 0;
910 }
911 /*
912 * First wait for any other process doing a flush to complete.
913 */
914 while (np->n_flag & NFLUSHINPROG) {
915 np->n_flag |= NFLUSHWANT;
916 error = tsleep((caddr_t)&np->n_flag, PRIBIO + 2, "nfsvinval",
917 slptimeo);
918 if (error && intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p))
919 return (EINTR);
920 }
921
922 /*
923 * Now, flush as required.
924 */
925 np->n_flag |= NFLUSHINPROG;
926 error = vinvalbuf(vp, flags, cred, p, slpflag, 0);
927 while (error) {
928 /* we seem to be stuck in a loop here if the thread got aborted.
929 * nfs_flush will return EINTR. Not sure if that will cause
930 * other consequences due to EINTR having other meanings in NFS
931 * To handle, no dirty pages, it seems safe to just return from
932 * here. But if we did have dirty pages, how would we get them
933 * written out if thread was aborted? Some other strategy is
934 * necessary. -- EKN
935 */
936 if ((intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p)) ||
937 (error == EINTR && current_thread_aborted())) {
938 np->n_flag &= ~NFLUSHINPROG;
939 if (np->n_flag & NFLUSHWANT) {
940 np->n_flag &= ~NFLUSHWANT;
941 wakeup((caddr_t)&np->n_flag);
942 }
943 return (EINTR);
944 }
945 error = vinvalbuf(vp, flags, cred, p, 0, slptimeo);
946 }
947 np->n_flag &= ~(NMODIFIED | NFLUSHINPROG);
948 if (np->n_flag & NFLUSHWANT) {
949 np->n_flag &= ~NFLUSHWANT;
950 wakeup((caddr_t)&np->n_flag);
951 }
952 didhold = ubc_hold(vp);
953 if (didhold) {
954 (void) ubc_clean(vp, 1); /* get the pages out of vm also */
955 ubc_rele(vp);
956 }
957 return (0);
958 }
959
960 /*
961 * Initiate asynchronous I/O. Return an error if no nfsiods are available.
962 * This is mainly to avoid queueing async I/O requests when the nfsiods
963 * are all hung on a dead server.
964 */
965 int
966 nfs_asyncio(bp, cred)
967 register struct buf *bp;
968 struct ucred *cred;
969 {
970 struct nfsmount *nmp;
971 int i;
972 int gotiod;
973 int slpflag = 0;
974 int slptimeo = 0;
975 int error;
976
977 if (nfs_numasync == 0)
978 return (EIO);
979
980 nmp = VFSTONFS(bp->b_vp->v_mount);
981 again:
982 if (nmp->nm_flag & NFSMNT_INT)
983 slpflag = PCATCH;
984 gotiod = FALSE;
985
986 /*
987 * Find a free iod to process this request.
988 */
989 for (i = 0; i < NFS_MAXASYNCDAEMON; i++)
990 if (nfs_iodwant[i]) {
991 /*
992 * Found one, so wake it up and tell it which
993 * mount to process.
994 */
995 NFS_DPF(ASYNCIO,
996 ("nfs_asyncio: waking iod %d for mount %p\n",
997 i, nmp));
998 nfs_iodwant[i] = (struct proc *)0;
999 nfs_iodmount[i] = nmp;
1000 nmp->nm_bufqiods++;
1001 wakeup((caddr_t)&nfs_iodwant[i]);
1002 gotiod = TRUE;
1003 break;
1004 }
1005
1006 /*
1007 * If none are free, we may already have an iod working on this mount
1008 * point. If so, it will process our request.
1009 */
1010 if (!gotiod) {
1011 if (nmp->nm_bufqiods > 0) {
1012 NFS_DPF(ASYNCIO,
1013 ("nfs_asyncio: %d iods are already processing mount %p\n",
1014 nmp->nm_bufqiods, nmp));
1015 gotiod = TRUE;
1016 }
1017 }
1018
1019 /*
1020 * If we have an iod which can process the request, then queue
1021 * the buffer.
1022 */
1023 if (gotiod) {
1024 /*
1025 * Ensure that the queue never grows too large.
1026 */
1027 while (nmp->nm_bufqlen >= 2*nfs_numasync) {
1028 NFS_DPF(ASYNCIO,
1029 ("nfs_asyncio: waiting for mount %p queue to drain\n", nmp));
1030 nmp->nm_bufqwant = TRUE;
1031 error = tsleep(&nmp->nm_bufq, slpflag | PRIBIO,
1032 "nfsaio", slptimeo);
1033 if (error) {
1034 if (nfs_sigintr(nmp, NULL, bp->b_proc))
1035 return (EINTR);
1036 if (slpflag == PCATCH) {
1037 slpflag = 0;
1038 slptimeo = 2 * hz;
1039 }
1040 }
1041 /*
1042 * We might have lost our iod while sleeping,
1043 * so check and loop if nescessary.
1044 */
1045 if (nmp->nm_bufqiods == 0) {
1046 NFS_DPF(ASYNCIO,
1047 ("nfs_asyncio: no iods after mount %p queue was drained, looping\n", nmp));
1048 goto again;
1049 }
1050 }
1051
1052 if (ISSET(bp->b_flags, B_READ)) {
1053 if (bp->b_rcred == NOCRED && cred != NOCRED) {
1054 /*
1055 * NFS has embedded ucred.
1056 * Can not crhold() here as that causes zone corruption
1057 */
1058 bp->b_rcred = crdup(cred);
1059 }
1060 } else {
1061 SET(bp->b_flags, B_WRITEINPROG);
1062 if (bp->b_wcred == NOCRED && cred != NOCRED) {
1063 /*
1064 * NFS has embedded ucred.
1065 * Can not crhold() here as that causes zone corruption
1066 */
1067 bp->b_wcred = crdup(cred);
1068 }
1069 }
1070
1071 TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, b_freelist);
1072 nmp->nm_bufqlen++;
1073 return (0);
1074 }
1075
1076 /*
1077 * All the iods are busy on other mounts, so return EIO to
1078 * force the caller to process the i/o synchronously.
1079 */
1080 NFS_DPF(ASYNCIO, ("nfs_asyncio: no iods available, i/o is synchronous\n"));
1081 return (EIO);
1082 }
1083
1084 /*
1085 * Do an I/O operation to/from a cache block. This may be called
1086 * synchronously or from an nfsiod.
 *
 * bp - the buffer to transfer. Its b_flags select one of three paths:
 *      B_PHYS (raw transfer of b_bcount bytes), B_READ (read, by vnode
 *      type), or otherwise a write of the dirty range
 *      b_dirtyoff .. b_dirtyend.
 * cr - credential handed to each RPC routine.
 * p  - process recorded in uio_procp. On the VREG read path it is also
 *      the process that is sent SIGKILL when a VTEXT vnode is found to
 *      have changed. That check tests p for null before using it.
 *
 * Returns the error from the RPC (0 on success) after calling
 * biodone(bp). The one early exit is a write whose dirty range is
 * empty, which calls biodone(bp) and returns 0.
 *
 * On failure B_ERROR is set and b_error holds the error, except for a
 * write that fails with EINTR: that buffer is kept as a delayed write
 * instead (see the comment in the write path).
1087 */
1088 int
1089 nfs_doio(bp, cr, p)
1090 register struct buf *bp;
1091 struct ucred *cr;
1092 struct proc *p;
1093 {
1094 register struct uio *uiop;
1095 register struct vnode *vp;
1096 struct nfsnode *np;
1097 struct nfsmount *nmp;
1098 int error = 0, diff, len, iomode, must_commit = 0;
1099 struct uio uio;
1100 struct iovec io;
1101
/*
 * Describe the transfer with the single local iovec, flagged
 * UIO_SYSSPACE. Length, base, offset and direction are filled in by
 * whichever path below is taken.
 */
1102 vp = bp->b_vp;
1103 np = VTONFS(vp);
1104 nmp = VFSTONFS(vp->v_mount);
1105 uiop = &uio;
1106 uiop->uio_iov = &io;
1107 uiop->uio_iovcnt = 1;
1108 uiop->uio_segflg = UIO_SYSSPACE;
1109 uiop->uio_procp = p;
1110
1111 /*
1112 * With UBC, getblk() can return a buf with B_DONE set.
1113 * This indicates that the VM has valid data for that page.
1114 * NFS being stateless, this case poses a problem.
1115 * By definition, the NFS server should always be consulted
1116 * for the data in that page.
1117 * So we choose to clear the B_DONE and to do the IO.
1118 *
1119 * XXX revisit this if there is a performance issue.
1120 * XXX In that case, we could play the attribute cache games ...
1121 */
1122 if (ISSET(bp->b_flags, B_DONE)) {
/* A B_DONE buffer that is not also B_ASYNC is treated as fatal. */
1123 if (!ISSET(bp->b_flags, B_ASYNC))
1124 panic("nfs_doio: done and not async");
1125 CLR(bp->b_flags, B_DONE);
1126 }
1127 FSDBG_TOP(256, np->n_size, bp->b_blkno * DEV_BSIZE, bp->b_bcount,
1128 bp->b_flags);
1129 FSDBG(257, bp->b_validoff, bp->b_validend, bp->b_dirtyoff,
1130 bp->b_dirtyend);
1131 /*
1132 * Historically, paging was done with physio, but no more.
1133 */
1134 if (ISSET(bp->b_flags, B_PHYS)) {
1135 /*
1136 * ...though reading /dev/drum still gets us here.
1137 */
/*
 * Raw path: move all b_bcount bytes at byte offset
 * b_blkno * DEV_BSIZE. The valid/dirty ranges are not consulted.
 */
1138 io.iov_len = uiop->uio_resid = bp->b_bcount;
1139 /* mapping was done by vmapbuf() */
1140 io.iov_base = bp->b_data;
1141 uiop->uio_offset = (off_t)bp->b_blkno * DEV_BSIZE;
1142 if (ISSET(bp->b_flags, B_READ)) {
1143 uiop->uio_rw = UIO_READ;
1144 nfsstats.read_physios++;
1145 error = nfs_readrpc(vp, uiop, cr);
1146 } else {
/*
 * com occupies the argument slot that the buffered write path
 * below passes &must_commit in; its value is not examined here.
 */
1147 int com;
1148
1149 iomode = NFSV3WRITE_DATASYNC;
1150 uiop->uio_rw = UIO_WRITE;
1151 nfsstats.write_physios++;
1152 error = nfs_writerpc(vp, uiop, cr, &iomode, &com);
1153 }
1154 if (error) {
1155 SET(bp->b_flags, B_ERROR);
1156 bp->b_error = error;
1157 }
1158 } else if (ISSET(bp->b_flags, B_READ)) {
/*
 * Buffered read: always request b_bcount bytes into b_data. The
 * file offset and the RPC used depend on the vnode type.
 */
1159 io.iov_len = uiop->uio_resid = bp->b_bcount;
1160 io.iov_base = bp->b_data;
1161 uiop->uio_rw = UIO_READ;
1162 switch (vp->v_type) {
1163 case VREG:
1164 uiop->uio_offset = (off_t)bp->b_blkno * DEV_BSIZE;
1165 nfsstats.read_bios++;
1166 error = nfs_readrpc(vp, uiop, cr);
1167 FSDBG(262, np->n_size, bp->b_blkno * DEV_BSIZE,
1168 uiop->uio_resid, error);
1169 if (!error) {
/*
 * Successful read: the valid range starts at 0. Its end is
 * worked out below from how much data actually arrived.
 */
1170 bp->b_validoff = 0;
1171 if (uiop->uio_resid) {
1172 /*
1173 * If len > 0, there is a hole in the file and
1174 * no writes after the hole have been pushed to
1175 * the server yet.
1176 * Just zero fill the rest of the valid area.
1177 */
/*
 * diff: bytes actually transferred (b_bcount minus the residual).
 * len: distance from the end of that data to n_size, stored
 * in an int.
 */
1178 diff = bp->b_bcount - uiop->uio_resid;
1179 len = np->n_size - ((u_quad_t)bp->b_blkno * DEV_BSIZE +
1180 diff);
1181 if (len > 0) {
1182 len = min(len, uiop->uio_resid);
1183 bzero((char *)bp->b_data + diff, len);
1184 bp->b_validend = diff + len;
1185 FSDBG(258, diff, len, 0, 1);
1186 } else
1187 bp->b_validend = diff;
1188 } else
1189 bp->b_validend = bp->b_bcount;
1190
1191 if (bp->b_validend < bp->b_bufsize) {
1192 /*
1193 * we're about to release a partial buffer after a
1194 * read... the only way we should get here is if
1195 * this buffer contains the EOF before releasing it,
1196 * we'll zero out to the end of the buffer so that
1197 * if a mmap of this page occurs, we'll see zero's
1198 * even if a ftruncate extends the file in the
1199 * meantime
1200 */
1201 bzero((caddr_t)(bp->b_data + bp->b_validend),
1202 bp->b_bufsize - bp->b_validend);
1203 FSDBG(258, bp->b_validend,
1204 bp->b_bufsize - bp->b_validend, 0, 2);
1205 }
1206 }
/*
 * This check runs whether or not the read succeeded. When p is
 * non-null and the vnode is marked VTEXT, the process is killed if
 *  - on an NQNFS mount, NQNFS_CKINVALID is true and n_lrev differs
 *    from n_brev, or
 *  - on a non-NQNFS mount, n_mtime differs from the cached
 *    va_mtime seconds.
 */
1207 if (p && (vp->v_flag & VTEXT) &&
1208 (((nmp->nm_flag & NFSMNT_NQNFS) &&
1209 NQNFS_CKINVALID(vp, np, ND_READ) &&
1210 np->n_lrev != np->n_brev) ||
1211 (!(nmp->nm_flag & NFSMNT_NQNFS) &&
1212 np->n_mtime != np->n_vattr.va_mtime.tv_sec))) {
1213 uprintf("Process killed due to text file modification\n");
1214 psignal(p, SIGKILL);
1215 p->p_flag |= P_NOSWAP;
1216 }
1217 break;
1218 case VLNK:
1219 uiop->uio_offset = (off_t)0;
1220 nfsstats.readlink_bios++;
1221 error = nfs_readlinkrpc(vp, uiop, cr);
1222 break;
1223 case VDIR:
/*
 * Directory offset is b_lblkno * NFS_DIRBLKSIZ. NFSMNT_RDIRPLUS is
 * cleared on the mount when it is not NFSV3, and again when
 * nfs_readdirplusrpc returns NFSERR_NOTSUPP. Whenever the flag ends
 * up clear, nfs_readdirrpc runs and its result replaces error.
 */
1224 nfsstats.readdir_bios++;
1225 uiop->uio_offset = ((u_quad_t)bp->b_lblkno) * NFS_DIRBLKSIZ;
1226 if (!(nmp->nm_flag & NFSMNT_NFSV3))
1227 nmp->nm_flag &= ~NFSMNT_RDIRPLUS; /* dk@farm.org */
1228 if (nmp->nm_flag & NFSMNT_RDIRPLUS) {
1229 error = nfs_readdirplusrpc(vp, uiop, cr);
1230 if (error == NFSERR_NOTSUPP)
1231 nmp->nm_flag &= ~NFSMNT_RDIRPLUS;
1232 }
1233 if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0)
1234 error = nfs_readdirrpc(vp, uiop, cr);
1235 break;
1236 default:
/*
 * Only logged: no RPC is issued and error is still 0 here, so
 * B_ERROR is not set below.
 */
1237 printf("nfs_doio: type %x unexpected\n", vp->v_type);
1238 break;
1239 };
1240 if (error) {
1241 SET(bp->b_flags, B_ERROR);
1242 bp->b_error = error;
1243 }
1244 } else {
1245 /*
1246 * mapped I/O may have altered any bytes, so we extend
1247 * the dirty zone to the valid zone. For best performance
1248 * a better solution would be to save & restore page dirty bits
1249 * around the uiomove which brings write-data into the buffer.
1250 * Then here we'd check if the page is dirty rather than WASMAPPED
1251 * Also vnode_pager would change - if a page is clean it might
1252 * still need to be written due to DELWRI.
1253 */
1254 if (UBCINFOEXISTS(vp) && ubc_issetflags(vp, UI_WASMAPPED)) {
1255 bp->b_dirtyoff = min(bp->b_dirtyoff, bp->b_validoff);
1256 bp->b_dirtyend = max(bp->b_dirtyend, bp->b_validend);
1257 }
/* Do not write past n_size: pull b_dirtyend back to the file size. */
1258 if ((off_t)bp->b_blkno * DEV_BSIZE + bp->b_dirtyend > np->n_size)
1259 bp->b_dirtyend = np->n_size - (off_t)bp->b_blkno * DEV_BSIZE;
1260
1261 if (bp->b_dirtyend > bp->b_dirtyoff) {
/* Write only the dirty range b_dirtyoff .. b_dirtyend. */
1262 io.iov_len = uiop->uio_resid = bp->b_dirtyend - bp->b_dirtyoff;
1263 uiop->uio_offset = (off_t)bp->b_blkno * DEV_BSIZE +
1264 bp->b_dirtyoff;
1265 io.iov_base = (char *)bp->b_data + bp->b_dirtyoff;
1266 uiop->uio_rw = UIO_WRITE;
1267
/*
 * UNSTABLE is requested only for a B_ASYNC buffer that has neither
 * B_NEEDCOMMIT nor B_NOCACHE set. Everything else is written
 * FILESYNC. nfs_writerpc may change iomode, and B_NEEDCOMMIT is
 * then set from the value it leaves behind.
 */
1268 nfsstats.write_bios++;
1269 if ((bp->b_flags & (B_ASYNC | B_NEEDCOMMIT | B_NOCACHE)) ==
1270 B_ASYNC)
1271 iomode = NFSV3WRITE_UNSTABLE;
1272 else
1273 iomode = NFSV3WRITE_FILESYNC;
1274 SET(bp->b_flags, B_WRITEINPROG);
1275 error = nfs_writerpc(vp, uiop, cr, &iomode, &must_commit);
1276 if (!error && iomode == NFSV3WRITE_UNSTABLE)
1277 SET(bp->b_flags, B_NEEDCOMMIT);
1278 else
1279 CLR(bp->b_flags, B_NEEDCOMMIT);
1280 CLR(bp->b_flags, B_WRITEINPROG);
1281 /*
1282 * For an interrupted write, the buffer is still valid
1283 * and the write hasn't been pushed to the server yet,
1284 * so we can't set B_ERROR and report the interruption
1285 * by setting B_EINTR. For the B_ASYNC case, B_EINTR
1286 * is not relevant, so the rpc attempt is essentially
1287 * a noop. For the case of a V3 write rpc not being
1288 * committed to stable storage, the block is still
1289 * dirty and requires either a commit rpc or another
1290 * write rpc with iomode == NFSV3WRITE_FILESYNC before
1291 * the block is reused. This is indicated by setting
1292 * the B_DELWRI and B_NEEDCOMMIT flags.
1293 */
1294 if (error == EINTR || (!error && bp->b_flags & B_NEEDCOMMIT)) {
1295 int s;
1296
/*
 * Keep the buffer as a delayed write. nbdwrite is bumped only when
 * B_DELWRI was not already set.
 */
1297 CLR(bp->b_flags, B_INVAL | B_NOCACHE);
1298 if (!ISSET(bp->b_flags, B_DELWRI)) {
1299 SET(bp->b_flags, B_DELWRI);
1300 nbdwrite++;
1301 }
1302 FSDBG(261, bp->b_validoff, bp->b_validend,
1303 bp->b_bufsize, bp->b_bcount);
1304 /*
1305 * Since for the B_ASYNC case, nfs_bwrite() has
1306 * reassigned the buffer to the clean list, we have to
1307 * reassign it back to the dirty one. Ugh.
1308 */
1309 if (ISSET(bp->b_flags, B_ASYNC)) {
1310 s = splbio();
1311 reassignbuf(bp, vp);
1312 splx(s);
1313 } else {
1314 SET(bp->b_flags, B_EINTR);
1315 }
1316 } else {
/*
 * Reached for any error other than EINTR, or for a successful write
 * that left B_NEEDCOMMIT clear. An error is recorded on both the
 * buffer and the nfsnode. The dirty range is emptied in either case.
 */
1317 if (error) {
1318 SET(bp->b_flags, B_ERROR);
1319 bp->b_error = np->n_error = error;
1320 np->n_flag |= NWRITEERR;
1321 }
1322 bp->b_dirtyoff = bp->b_dirtyend = 0;
1323
1324 /*
1325 * validoff and validend represent the real data present
1326 * in this buffer if validoff is non-zero, then we have
1327 * to invalidate the buffer and kill the page when
1328 * biodone is called... the same is also true when
1329 * validend doesn't extend all the way to the end of the
1330 * buffer and validend doesn't equate to the current
1331 * EOF... eventually we need to deal with this in a more
1332 * humane way (like keeping the partial buffer without
1333 * making it immediately available to the VM page cache)
1334 */
1335 if (bp->b_validoff)
1336 SET(bp->b_flags, B_INVAL);
1337 else
1338 if (bp->b_validend < bp->b_bufsize) {
1339 if ((off_t)bp->b_blkno * DEV_BSIZE +
1340 bp->b_validend == np->n_size) {
1341 bzero((caddr_t)(bp->b_data +
1342 bp->b_validend),
1343 bp->b_bufsize - bp->b_validend);
1344 FSDBG(259, bp->b_validend,
1345 bp->b_bufsize - bp->b_validend, 0,
1346 0);
1347 } else
1348 SET(bp->b_flags, B_INVAL);
1349 }
1350 }
1351
1352 } else {
/*
 * Nothing dirty to write, so no RPC is issued and must_commit stays
 * 0. The buffer is invalidated when its valid range does not start
 * at 0, or ends short of b_bufsize at a point other than n_size.
 * Unlike the path above, no zero fill is done here. The buffer is
 * completed with a zero residual and the function returns 0.
 */
1353 if (bp->b_validoff ||
1354 (bp->b_validend < bp->b_bufsize &&
1355 (off_t)bp->b_blkno * DEV_BSIZE + bp->b_validend !=
1356 np->n_size)) {
1357 SET(bp->b_flags, B_INVAL);
1358 }
1359 if (bp->b_flags & B_INVAL) {
1360 FSDBG(260, bp->b_validoff, bp->b_validend,
1361 bp->b_bufsize, bp->b_bcount);
1362 }
1363 bp->b_resid = 0;
1364 biodone(bp);
1365 FSDBG_BOT(256, bp->b_validoff, bp->b_validend, bp->b_bufsize,
1366 np->n_size);
1367 return (0);
1368 }
1369 }
/*
 * Common exit for the B_PHYS, read and non-empty write paths: report
 * the untransferred byte count in b_resid. must_commit can be nonzero
 * only after the buffered-write nfs_writerpc call above, since that
 * is the only place its address is passed.
 */
1370 bp->b_resid = uiop->uio_resid;
1371 if (must_commit)
1372 nfs_clearcommit(vp->v_mount);
1373
1374 if (bp->b_flags & B_INVAL) {
1375 FSDBG(260, bp->b_validoff, bp->b_validend, bp->b_bufsize,
1376 bp->b_bcount);
1377 }
1378 FSDBG_BOT(256, bp->b_validoff, bp->b_validend, bp->b_bcount, error);
1379
1380 biodone(bp);
1381 return (error);
1382 }