/*
 * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved.
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California. All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)nfs_bio.c	8.9 (Berkeley) 3/30/95
 * FreeBSD-Id: nfs_bio.c,v 1.44 1997/09/10 19:52:25 phk Exp $
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/ubc.h>

#include <sys/vm.h>
#include <sys/vmparam.h>

#include <sys/time.h>
#include <kern/clock.h>

#include <nfs/rpcv2.h>
#include <nfs/nfsproto.h>
#include <nfs/nfs.h>
#include <nfs/nfsmount.h>
#include <nfs/nqnfs.h>
#include <nfs/nfsnode.h>

#include <sys/kdebug.h>

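/*
 * kdebug tracing hooks: FSDBG logs a single event in the DBG_FSRW class,
 * while FSDBG_TOP and FSDBG_BOT bracket an operation with matching start
 * and end events so a trace can measure its duration.
 */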
#define FSDBG(A, B, C, D, E) \
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_NONE, \
		(int)(B), (int)(C), (int)(D), (int)(E), 0)
#define FSDBG_TOP(A, B, C, D, E) \
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_START, \
		(int)(B), (int)(C), (int)(D), (int)(E), 0)
#define FSDBG_BOT(A, B, C, D, E) \
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_END, \
		(int)(B), (int)(C), (int)(D), (int)(E), 0)

static struct buf *nfs_getcacheblk __P((struct vnode *vp, daddr_t bn, int size,
		struct proc *p, int operation));

extern int nfs_numasync;
extern struct nfsstats nfsstats;
extern int nbdwrite;

/*
 * Vnode op for read using bio
 * Any similarity to readip() is purely coincidental
 */
int
nfs_bioread(vp, uio, ioflag, cred, getpages)
	register struct vnode *vp;
	register struct uio *uio;
	int ioflag;
	struct ucred *cred;
	int getpages;
{
	register struct nfsnode *np = VTONFS(vp);
	register int biosize, i;
	off_t diff;
	struct buf *bp = 0, *rabp;
	struct vattr vattr;
	struct proc *p;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	daddr_t lbn, rabn;
	int bufsize;
	int nra, error = 0, n = 0, on = 0, not_readin;
	int operation = (getpages ? BLK_PAGEIN : BLK_READ);

#if DIAGNOSTIC
	if (uio->uio_rw != UIO_READ)
		panic("nfs_read mode");
#endif
	if (uio->uio_resid == 0)
		return (0);
	if (uio->uio_offset < 0)
		return (EINVAL);
	p = uio->uio_procp;
	if ((nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_GOTFSINFO)) == NFSMNT_NFSV3)
		(void)nfs_fsinfo(nmp, vp, cred, p);
	/* due to getblk/vm interactions, use values of vm page size or less */
	biosize = min(vp->v_mount->mnt_stat.f_iosize, PAGE_SIZE);
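	/*
	 * e.g. on a machine with 4 KB VM pages, a mount advertising an
	 * 8 KB f_iosize still gets 4 KB cache buffers here.
	 */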
	/*
	 * For nfs, cache consistency can only be maintained approximately.
	 * Although RFC1094 does not specify the criteria, the following is
	 * believed to be compatible with the reference port.
	 * For nqnfs, full cache consistency is maintained within the loop.
	 * For nfs:
	 * If the file's modify time on the server has changed since the
	 * last read rpc or you have written to the file,
	 * you may have lost data cache consistency with the
	 * server, so flush all of the file's data out of the cache.
	 * Then force a getattr rpc to ensure that you have up to date
	 * attributes.
	 * NB: This implies that cache data can be read when up to
	 * NFS_ATTRTIMEO seconds out of date. If you find that you need
	 * current attributes, this could be forced by setting n_attrstamp
	 * to 0 before the VOP_GETATTR() call.
	 */
	if ((nmp->nm_flag & NFSMNT_NQNFS) == 0) {
		if (np->n_flag & NMODIFIED) {
			if (vp->v_type != VREG) {
				if (vp->v_type != VDIR)
					panic("nfs: bioread, not dir");
				nfs_invaldir(vp);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
			}
			np->n_attrstamp = 0;
			error = VOP_GETATTR(vp, &vattr, cred, p);
			if (error)
				return (error);
			np->n_mtime = vattr.va_mtime.tv_sec;
		} else {
			error = VOP_GETATTR(vp, &vattr, cred, p);
			if (error)
				return (error);
			if (np->n_mtime != vattr.va_mtime.tv_sec) {
				if (vp->v_type == VDIR)
					nfs_invaldir(vp);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
				np->n_mtime = vattr.va_mtime.tv_sec;
			}
		}
	}
	do {

		/*
		 * Get a valid lease. If cached data is stale, flush it.
		 */
		if (nmp->nm_flag & NFSMNT_NQNFS) {
			if (NQNFS_CKINVALID(vp, np, ND_READ)) {
				do {
					error = nqnfs_getlease(vp, ND_READ, cred, p);
				} while (error == NQNFS_EXPIRED);
				if (error)
					return (error);
				if (np->n_lrev != np->n_brev ||
				    (np->n_flag & NQNFSNONCACHE) ||
				    ((np->n_flag & NMODIFIED) && vp->v_type == VDIR)) {
					if (vp->v_type == VDIR)
						nfs_invaldir(vp);
					error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
					if (error)
						return (error);
					np->n_brev = np->n_lrev;
				}
			} else if (vp->v_type == VDIR && (np->n_flag & NMODIFIED)) {
				nfs_invaldir(vp);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
			}
		}
		if (np->n_flag & NQNFSNONCACHE) {
			switch (vp->v_type) {
			case VREG:
				return (nfs_readrpc(vp, uio, cred));
			case VLNK:
				return (nfs_readlinkrpc(vp, uio, cred));
			case VDIR:
				break;
			default:
				printf(" NQNFSNONCACHE: type %x unexpected\n",
				       vp->v_type);
			}
		}
		switch (vp->v_type) {
		case VREG:
			nfsstats.biocache_reads++;
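			/*
			 * Split the offset into a logical block number and an
			 * offset within that block; the mask arithmetic relies
			 * on biosize being a power of two (e.g. biosize 4096
			 * and offset 10000 give lbn 2, on 1808).
			 */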
			lbn = uio->uio_offset / biosize;
			on = uio->uio_offset & (biosize - 1);
			not_readin = 1;

			/*
			 * Start the read ahead(s), as required.
			 */
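			/*
			 * Up to nm_readahead blocks past lbn are read
			 * asynchronously, skipping any already in core; if no
			 * nfsiod can take a request, the speculative buffer is
			 * invalidated and released rather than read
			 * synchronously here.
			 */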
			if (nfs_numasync > 0 && nmp->nm_readahead > 0) {
				for (nra = 0; nra < nmp->nm_readahead &&
				     (off_t)(lbn + 1 + nra) * biosize < np->n_size;
				     nra++) {
					rabn = lbn + 1 + nra;
					if (!incore(vp, rabn)) {
						rabp = nfs_getcacheblk(vp, rabn, biosize, p, operation);
						if (!rabp)
							return (EINTR);
						if (!ISSET(rabp->b_flags, (B_CACHE|B_DELWRI))) {
							SET(rabp->b_flags, (B_READ | B_ASYNC));
							if (nfs_asyncio(rabp, cred)) {
								SET(rabp->b_flags, (B_INVAL|B_ERROR));
								rabp->b_error = EIO;
								brelse(rabp);
							}
						} else
							brelse(rabp);
					}
				}
			}

			/*
			 * If the block is in the cache and has the required data
			 * in a valid region, just copy it out.
			 * Otherwise, get the block and write back/read in,
			 * as required.
			 */
again:
			bufsize = biosize;
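			/*
			 * If this block is the last one in the file, trim
			 * bufsize back to the EOF, rounded up to a DEV_BSIZE
			 * boundary.
			 */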
			if ((off_t)(lbn + 1) * biosize > np->n_size &&
			    (off_t)(lbn + 1) * biosize - np->n_size < biosize) {
				bufsize = np->n_size - (off_t)lbn * biosize;
				bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
			}
			bp = nfs_getcacheblk(vp, lbn, bufsize, p, operation);
			if (!bp)
				return (EINTR);

			if (!ISSET(bp->b_flags, B_CACHE)) {
				SET(bp->b_flags, B_READ);
				CLR(bp->b_flags, (B_DONE | B_ERROR | B_INVAL));
				not_readin = 0;
				error = nfs_doio(bp, cred, p);
				if (error) {
					brelse(bp);
					return (error);
				}
			}
			if (bufsize > on) {
				n = min((unsigned)(bufsize - on), uio->uio_resid);
			} else {
				n = 0;
			}
			diff = np->n_size - uio->uio_offset;
			if (diff < n)
				n = diff;
			if (not_readin && n > 0) {
				if (on < bp->b_validoff || (on + n) > bp->b_validend) {
					SET(bp->b_flags, (B_NOCACHE|B_INVAFTERWRITE));
					if (bp->b_dirtyend > 0) {
						if (!ISSET(bp->b_flags, B_DELWRI))
							panic("nfsbioread");
						if (VOP_BWRITE(bp) == EINTR)
							return (EINTR);
					} else
						brelse(bp);
					goto again;
				}
			}
			vp->v_lastr = lbn;
			diff = (on >= bp->b_validend) ? 0 : (bp->b_validend - on);
			if (diff < n)
				n = diff;
			break;
		case VLNK:
			nfsstats.biocache_readlinks++;
			bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, p, operation);
			if (!bp)
				return (EINTR);
			if (!ISSET(bp->b_flags, B_CACHE)) {
				SET(bp->b_flags, B_READ);
				error = nfs_doio(bp, cred, p);
				if (error) {
					SET(bp->b_flags, B_ERROR);
					brelse(bp);
					return (error);
				}
			}
			n = min(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid);
			on = 0;
			break;
		case VDIR:
			nfsstats.biocache_readdirs++;
			if (np->n_direofoffset
			    && uio->uio_offset >= np->n_direofoffset) {
				return (0);
			}
			lbn = uio->uio_offset / NFS_DIRBLKSIZ;
			on = uio->uio_offset & (NFS_DIRBLKSIZ - 1);
			bp = nfs_getcacheblk(vp, lbn, NFS_DIRBLKSIZ, p, operation);
			if (!bp)
				return (EINTR);
			if (!ISSET(bp->b_flags, B_CACHE)) {
				SET(bp->b_flags, B_READ);
				error = nfs_doio(bp, cred, p);
				if (error) {
					brelse(bp);
				}
				while (error == NFSERR_BAD_COOKIE) {
					nfs_invaldir(vp);
					error = nfs_vinvalbuf(vp, 0, cred, p, 1);
					/*
					 * Yuck! The directory has been modified on the
					 * server. The only way to get the block is by
					 * reading from the beginning to get all the
					 * offset cookies.
					 */
					for (i = 0; i <= lbn && !error; i++) {
						if (np->n_direofoffset
						    && (i * NFS_DIRBLKSIZ) >= np->n_direofoffset)
							return (0);
						bp = nfs_getcacheblk(vp, i, NFS_DIRBLKSIZ, p,
								     operation);
						if (!bp)
							return (EINTR);
						if (!ISSET(bp->b_flags, B_CACHE)) {
							SET(bp->b_flags, B_READ);
							error = nfs_doio(bp, cred, p);
							/*
							 * no error + B_INVAL == directory EOF,
							 * use the block.
							 */
							if (error == 0 && (bp->b_flags & B_INVAL))
								break;
						}
						/*
						 * An error will throw away the block and the
						 * for loop will break out. If no error and this
						 * is not the block we want, we throw away the
						 * block and go for the next one via the for loop.
						 */
						if (error || i < lbn)
							brelse(bp);
					}
				}
				/*
				 * The above while is repeated if we hit another cookie
				 * error. If we hit an error and it wasn't a cookie error,
				 * we give up.
				 */
				if (error)
					return (error);
			}

			/*
			 * If not eof and read aheads are enabled, start one.
			 * (You need the current block first, so that you have the
			 * directory offset cookie of the next block.)
			 */
			if (nfs_numasync > 0 && nmp->nm_readahead > 0 &&
			    (np->n_direofoffset == 0 ||
			     (lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) &&
			    !(np->n_flag & NQNFSNONCACHE) &&
			    !incore(vp, lbn + 1)) {
				rabp = nfs_getcacheblk(vp, lbn + 1, NFS_DIRBLKSIZ, p,
						       operation);
				if (rabp) {
					if (!ISSET(rabp->b_flags, (B_CACHE|B_DELWRI))) {
						SET(rabp->b_flags, (B_READ | B_ASYNC));
						if (nfs_asyncio(rabp, cred)) {
							SET(rabp->b_flags, (B_INVAL|B_ERROR));
							rabp->b_error = EIO;
							brelse(rabp);
						}
					} else {
						brelse(rabp);
					}
				}
			}
			/*
			 * Make sure we use a signed variant of min() since
			 * the second term may be negative.
			 */
			n = lmin(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid - on);
			/*
			 * Unlike VREG files, whose buffer size (bp->b_bcount) is
			 * chopped for the EOF condition, we cannot tell how large
			 * NFS directories are going to be until we hit EOF. So
			 * an NFS directory buffer is *not* chopped to its EOF. Now,
			 * it just so happens that b_resid will effectively chop it
			 * to EOF. *BUT* this information is lost if the buffer goes
			 * away and is reconstituted into a B_CACHE state (recovered
			 * from VM) later. So we keep track of the directory eof
			 * in np->n_direofoffset and chop it off as an extra step
			 * right here.
			 */
			if (np->n_direofoffset &&
			    n > np->n_direofoffset - uio->uio_offset)
				n = np->n_direofoffset - uio->uio_offset;
			break;
		default:
			printf(" nfs_bioread: type %x unexpected\n", vp->v_type);
			break;
		}

		if (n > 0) {
			error = uiomove(bp->b_data + on, (int)n, uio);
		}
		switch (vp->v_type) {
		case VREG:
			break;
		case VLNK:
			n = 0;
			break;
		case VDIR:
			if (np->n_flag & NQNFSNONCACHE)
				SET(bp->b_flags, B_INVAL);
			break;
		default:
			printf(" nfs_bioread: type %x unexpected\n", vp->v_type);
		}
		brelse(bp);
	} while (error == 0 && uio->uio_resid > 0 && n > 0);
	return (error);
}


/*
 * Vnode op for write using bio
 */
int
nfs_write(ap)
	struct vop_write_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		struct ucred *a_cred;
	} */ *ap;
{
	register int biosize;
	register struct uio *uio = ap->a_uio;
	struct proc *p = uio->uio_procp;
	register struct vnode *vp = ap->a_vp;
	struct nfsnode *np = VTONFS(vp);
	register struct ucred *cred = ap->a_cred;
	int ioflag = ap->a_ioflag;
	struct buf *bp;
	struct vattr vattr;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	daddr_t lbn;
	int bufsize;
	int n, on, error = 0, iomode, must_commit;
	off_t boff;
	struct iovec iov;
	struct uio auio;

#if DIAGNOSTIC
	if (uio->uio_rw != UIO_WRITE)
		panic("nfs_write mode");
	if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != current_proc())
		panic("nfs_write proc");
#endif
	if (vp->v_type != VREG)
		return (EIO);
	if (np->n_flag & NWRITEERR) {
		np->n_flag &= ~NWRITEERR;
		return (np->n_error);
	}
	if ((nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_GOTFSINFO)) == NFSMNT_NFSV3)
		(void)nfs_fsinfo(nmp, vp, cred, p);
	if (ioflag & (IO_APPEND | IO_SYNC)) {
		if (np->n_flag & NMODIFIED) {
			np->n_attrstamp = 0;
			error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
			if (error)
				return (error);
		}
		if (ioflag & IO_APPEND) {
			np->n_attrstamp = 0;
			error = VOP_GETATTR(vp, &vattr, cred, p);
			if (error)
				return (error);
			uio->uio_offset = np->n_size;
		}
	}
	if (uio->uio_offset < 0)
		return (EINVAL);
	if (uio->uio_resid == 0)
		return (0);
	/*
	 * Maybe this should be above the vnode op call, but so long as
	 * file servers have no limits, I don't think it matters.
	 */
	if (p && uio->uio_offset + uio->uio_resid >
	    p->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
		psignal(p, SIGXFSZ);
		return (EFBIG);
	}
	/*
	 * I use nm_rsize, not nm_wsize so that all buffer cache blocks
	 * will be the same size within a filesystem. nfs_writerpc will
	 * still use nm_wsize when sizing the rpc's.
	 */
	/* due to getblk/vm interactions, use values of vm page size or less */
	biosize = min(vp->v_mount->mnt_stat.f_iosize, PAGE_SIZE);

	do {
		/*
		 * Check for a valid write lease.
		 */
		if ((nmp->nm_flag & NFSMNT_NQNFS) &&
		    NQNFS_CKINVALID(vp, np, ND_WRITE)) {
			do {
				error = nqnfs_getlease(vp, ND_WRITE, cred, p);
			} while (error == NQNFS_EXPIRED);
			if (error)
				return (error);
			if (np->n_lrev != np->n_brev ||
			    (np->n_flag & NQNFSNONCACHE)) {
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
				np->n_brev = np->n_lrev;
			}
		}
		if ((np->n_flag & NQNFSNONCACHE) && uio->uio_iovcnt == 1) {
			iomode = NFSV3WRITE_FILESYNC;
			error = nfs_writerpc(vp, uio, cred, &iomode, &must_commit);
			if (must_commit)
				nfs_clearcommit(vp->v_mount);
			return (error);
		}
		nfsstats.biocache_writes++;
		lbn = uio->uio_offset / biosize;
		on = uio->uio_offset & (biosize - 1);
		n = min((unsigned)(biosize - on), uio->uio_resid);
again:
		bufsize = biosize;
#if 0
		/* (removed for UBC) */
		if ((lbn + 1) * biosize > np->n_size) {
			bufsize = np->n_size - lbn * biosize;
			bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
		}
#endif
		/*
		 * Get a cache block for writing. The range to be written is
		 * (off..off+len) within the block. We ensure that the block
		 * either has no dirty region or that the given range is
		 * contiguous with the existing dirty region.
		 */
		bp = nfs_getcacheblk(vp, lbn, bufsize, p, BLK_WRITE);
		if (!bp)
			return (EINTR);
		/*
		 * Resize nfsnode *after* we busy the buffer to prevent
		 * readers from reading garbage.
		 * If there was a partial buf at the old eof, validate
		 * and zero the new bytes.
		 */
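		/*
		 * bp0 below is that partial buf: the bytes between the old
		 * EOF and the end of its block are zeroed and the whole block
		 * marked valid, so extending the file never exposes stale
		 * data.
		 */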
		if (uio->uio_offset + n > np->n_size) {
			struct buf *bp0 = NULL;
			daddr_t bn = np->n_size / biosize;
			int off = np->n_size & (biosize - 1);

			if (off && bn < lbn && incore(vp, bn))
				bp0 = nfs_getcacheblk(vp, bn, biosize, p,
						      BLK_WRITE);
			np->n_flag |= NMODIFIED;
			np->n_size = uio->uio_offset + n;
			ubc_setsize(vp, (off_t)np->n_size); /* XXX errors */
			if (bp0) {
				bzero((char *)bp0->b_data + off, biosize - off);
				bp0->b_validend = biosize;
				brelse(bp0);
			}
		}
		/*
		 * NFS has embedded ucred so crhold() risks zone corruption
		 */
		if (bp->b_wcred == NOCRED)
			bp->b_wcred = crdup(cred);
		/*
		 * If dirtyend exceeds file size, chop it down. This should
		 * not occur unless there is a race.
		 */
		if ((off_t)bp->b_blkno * DEV_BSIZE + bp->b_dirtyend >
		    np->n_size)
			bp->b_dirtyend = np->n_size - (off_t)bp->b_blkno *
					 DEV_BSIZE;
		/*
		 * UBC doesn't (yet) handle partial pages so nfs_biowrite was
		 * hacked to never bdwrite, to start every little write right
		 * away. While running IE, Avie noticed the performance
		 * problem, thus this code, which permits those delayed writes
		 * by ensuring an initial read of the entire page. The read
		 * may hit eof ("short read") but we will handle that.
		 *
		 * We are quite dependent on the correctness of B_CACHE so
		 * check that first in case of problems.
		 */
		if (!ISSET(bp->b_flags, B_CACHE) && n < PAGE_SIZE) {
			boff = (off_t)bp->b_blkno * DEV_BSIZE;
			auio.uio_iov = &iov;
			auio.uio_iovcnt = 1;
			auio.uio_offset = boff;
			auio.uio_resid = PAGE_SIZE;
			auio.uio_segflg = UIO_SYSSPACE;
			auio.uio_rw = UIO_READ;
			auio.uio_procp = p;
			iov.iov_base = bp->b_data;
			iov.iov_len = PAGE_SIZE;
			error = nfs_readrpc(vp, &auio, cred);
			if (error) {
				bp->b_error = error;
				SET(bp->b_flags, B_ERROR);
				printf("nfs_write: readrpc %d", error);
			}
			if (auio.uio_resid > 0)
				bzero(iov.iov_base, auio.uio_resid);
			bp->b_validoff = 0;
			bp->b_validend = PAGE_SIZE - auio.uio_resid;
			if (np->n_size > boff + bp->b_validend)
				bp->b_validend = min(np->n_size - boff,
						     PAGE_SIZE);
			bp->b_dirtyoff = 0;
			bp->b_dirtyend = 0;
		}

		/*
		 * If the new write will leave a contiguous dirty
		 * area, just update the b_dirtyoff and b_dirtyend,
		 * otherwise try to extend the dirty region.
		 */
		if (bp->b_dirtyend > 0 &&
		    (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) {
			off_t start, end;

			boff = (off_t)bp->b_blkno * DEV_BSIZE;
			if (on > bp->b_dirtyend) {
				start = boff + bp->b_validend;
				end = boff + on;
			} else {
				start = boff + on + n;
				end = boff + bp->b_validoff;
			}

			/*
			 * It may be that the valid region in the buffer
			 * covers the region we want, in which case just
			 * extend the dirty region. Otherwise we try to
			 * extend the valid region.
			 */
			if (end > start) {
				auio.uio_iov = &iov;
				auio.uio_iovcnt = 1;
				auio.uio_offset = start;
				auio.uio_resid = end - start;
				auio.uio_segflg = UIO_SYSSPACE;
				auio.uio_rw = UIO_READ;
				auio.uio_procp = p;
				iov.iov_base = bp->b_data + (start - boff);
				iov.iov_len = end - start;
				error = nfs_readrpc(vp, &auio, cred);
				/*
				 * If we couldn't read, do not do a VOP_BWRITE
				 * as originally coded. That could also error
				 * and, looping back to "again" as it was
				 * doing, could have us stuck trying to write
				 * the same buf again. nfs_write will get the
				 * entire region if nfs_readrpc succeeded; if
				 * it was unsuccessful we should just error
				 * out. Errors like ESTALE would keep us
				 * looping rather than being transient errors
				 * justifying a retry. We can return here
				 * instead of altering the dirty region later,
				 * since we have not written the old dirty
				 * region at this point.
				 */
				if (error) {
					bp->b_error = error;
					SET(bp->b_flags, B_ERROR);
					printf("nfs_write: readrpc2 %d", error);
					brelse(bp);
					return (error);
				}
				/*
				 * The read worked.
				 * If there was a short read, just zero fill.
				 */
				if (auio.uio_resid > 0)
					bzero(iov.iov_base, auio.uio_resid);
				if (on > bp->b_dirtyend)
					bp->b_validend = on;
				else
					bp->b_validoff = on + n;
			}
			/*
			 * We now have a valid region which extends up to the
			 * dirty region which we want.
			 */
			if (on > bp->b_dirtyend)
				bp->b_dirtyend = on;
			else
				bp->b_dirtyoff = on + n;
		}
		if (ISSET(bp->b_flags, B_ERROR)) {
			error = bp->b_error;
			brelse(bp);
			return (error);
		}
		/*
		 * NFS has embedded ucred so crhold() risks zone corruption
		 */
		if (bp->b_wcred == NOCRED)
			bp->b_wcred = crdup(cred);
		np->n_flag |= NMODIFIED;

		/*
		 * Check for valid write lease and get one as required.
		 * In case getblk() and/or bwrite() delayed us.
		 */
		if ((nmp->nm_flag & NFSMNT_NQNFS) &&
		    NQNFS_CKINVALID(vp, np, ND_WRITE)) {
			do {
				error = nqnfs_getlease(vp, ND_WRITE, cred, p);
			} while (error == NQNFS_EXPIRED);
			if (error) {
				brelse(bp);
				return (error);
			}
			if (np->n_lrev != np->n_brev ||
			    (np->n_flag & NQNFSNONCACHE)) {
				brelse(bp);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
				np->n_brev = np->n_lrev;
				goto again;
			}
		}
		error = uiomove((char *)bp->b_data + on, n, uio);
		if (error) {
			SET(bp->b_flags, B_ERROR);
			brelse(bp);
			return (error);
		}
		if (bp->b_dirtyend > 0) {
			bp->b_dirtyoff = min(on, bp->b_dirtyoff);
			bp->b_dirtyend = max((on + n), bp->b_dirtyend);
		} else {
			bp->b_dirtyoff = on;
			bp->b_dirtyend = on + n;
		}
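		/*
		 * Fold the bytes just written into the buffer's valid region:
		 * if the old valid region is disjoint from the new dirty
		 * region, reset it to exactly the dirty range so stale bytes
		 * are never claimed valid; otherwise take the union.
		 */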
		if (bp->b_validend == 0 || bp->b_validend < bp->b_dirtyoff ||
		    bp->b_validoff > bp->b_dirtyend) {
			bp->b_validoff = bp->b_dirtyoff;
			bp->b_validend = bp->b_dirtyend;
		} else {
			bp->b_validoff = min(bp->b_validoff, bp->b_dirtyoff);
			bp->b_validend = max(bp->b_validend, bp->b_dirtyend);
		}

		/*
		 * Since this block is being modified, it must be written
		 * again and not just committed.
		 */
		CLR(bp->b_flags, B_NEEDCOMMIT);

		/*
		 * If the lease is non-cacheable or IO_SYNC, do bwrite().
		 */
		if ((np->n_flag & NQNFSNONCACHE) || (ioflag & IO_SYNC)) {
			bp->b_proc = p;
			error = VOP_BWRITE(bp);
			if (error)
				return (error);
			if (np->n_flag & NQNFSNONCACHE) {
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
			}
		} else if ((n + on) == biosize &&
			   (nmp->nm_flag & NFSMNT_NQNFS) == 0) {
			bp->b_proc = (struct proc *)0;
			SET(bp->b_flags, B_ASYNC);
			(void)nfs_writebp(bp, 0);
		} else
			bdwrite(bp);
	} while (uio->uio_resid > 0 && n > 0);
	return (0);
}


/*
 * Get an nfs cache block.
 * Allocate a new one if the block isn't currently in the cache
 * and return the block marked busy. If the calling process is
 * interrupted by a signal for an interruptible mount point, return
 * NULL.
 */
static struct buf *
nfs_getcacheblk(vp, bn, size, p, operation)
	struct vnode *vp;
	daddr_t bn;
	int size;
	struct proc *p;
	int operation;	/* defined in sys/buf.h */
{
	register struct buf *bp;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	/* due to getblk/vm interactions, use values of vm page size or less */
	int biosize = min(vp->v_mount->mnt_stat.f_iosize, PAGE_SIZE);

	if (nbdwrite > ((nbuf/4)*3) && operation == BLK_WRITE) {
#define __BUFFERS_RECLAIMED 2
		struct buf *tbp[__BUFFERS_RECLAIMED];
		int i;

		/* too many delayed writes, try to free up some buffers */
		for (i = 0; i < __BUFFERS_RECLAIMED; i++)
			tbp[i] = geteblk(512);

		/* Yield to IO thread */
		(void)tsleep((caddr_t)&nbdwrite, PCATCH, "nbdwrite", 1);

		for (i = (__BUFFERS_RECLAIMED - 1); i >= 0; i--)
			brelse(tbp[i]);
	}

	if (nmp->nm_flag & NFSMNT_INT) {
		bp = getblk(vp, bn, size, PCATCH, 0, operation);
		while (bp == (struct buf *)0) {
			if (nfs_sigintr(nmp, (struct nfsreq *)0, p))
				return ((struct buf *)0);
			bp = getblk(vp, bn, size, 0, 2 * hz, operation);
		}
	} else
		bp = getblk(vp, bn, size, 0, 0, operation);

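	/*
	 * For regular files, record the buffer's position in DEV_BSIZE
	 * (512-byte) units so later code can recover the byte offset from
	 * b_blkno with a single multiply.
	 */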
	if (vp->v_type == VREG)
		bp->b_blkno = ((off_t)bn * biosize) / DEV_BSIZE;

	return (bp);
}

/*
 * Flush and invalidate all dirty buffers. If another process is already
 * doing the flush, just wait for completion.
 */
int
nfs_vinvalbuf(vp, flags, cred, p, intrflg)
	struct vnode *vp;
	int flags;
	struct ucred *cred;
	struct proc *p;
	int intrflg;
{
	register struct nfsnode *np = VTONFS(vp);
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	int error = 0, slpflag, slptimeo;
	int didhold = 0;

	if ((nmp->nm_flag & NFSMNT_INT) == 0)
		intrflg = 0;
	if (intrflg) {
		slpflag = PCATCH;
		slptimeo = 2 * hz;
	} else {
		slpflag = 0;
		slptimeo = 0;
	}
	/*
	 * First wait for any other process doing a flush to complete.
	 */
	while (np->n_flag & NFLUSHINPROG) {
		np->n_flag |= NFLUSHWANT;
		error = tsleep((caddr_t)&np->n_flag, PRIBIO + 2, "nfsvinval",
			       slptimeo);
		if (error && intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p))
			return (EINTR);
	}

	/*
	 * Now, flush as required.
	 */
	np->n_flag |= NFLUSHINPROG;
	error = vinvalbuf(vp, flags, cred, p, slpflag, 0);
	while (error) {
		/*
		 * We seem to be stuck in a loop here if the thread got
		 * aborted: nfs_flush will return EINTR, and it is not clear
		 * whether that has other consequences, since EINTR has other
		 * meanings in NFS. With no dirty pages it seems safe to just
		 * return from here. But if we did have dirty pages, how
		 * would they get written out if the thread was aborted?
		 * Some other strategy is necessary. -- EKN
		 */
		if ((intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p)) ||
		    (error == EINTR && current_thread_aborted())) {
			np->n_flag &= ~NFLUSHINPROG;
			if (np->n_flag & NFLUSHWANT) {
				np->n_flag &= ~NFLUSHWANT;
				wakeup((caddr_t)&np->n_flag);
			}
			return (EINTR);
		}
		error = vinvalbuf(vp, flags, cred, p, 0, slptimeo);
	}
	np->n_flag &= ~(NMODIFIED | NFLUSHINPROG);
	if (np->n_flag & NFLUSHWANT) {
		np->n_flag &= ~NFLUSHWANT;
		wakeup((caddr_t)&np->n_flag);
	}
	didhold = ubc_hold(vp);
	if (didhold) {
		(void) ubc_clean(vp, 1); /* get the pages out of vm also */
		ubc_rele(vp);
	}
	return (0);
}

/*
 * Initiate asynchronous I/O. Return an error if no nfsiods are available.
 * This is mainly to avoid queueing async I/O requests when the nfsiods
 * are all hung on a dead server.
 */
int
nfs_asyncio(bp, cred)
	register struct buf *bp;
	struct ucred *cred;
{
	struct nfsmount *nmp;
	int i;
	int gotiod;
	int slpflag = 0;
	int slptimeo = 0;
	int error;

	if (nfs_numasync == 0)
		return (EIO);

	nmp = VFSTONFS(bp->b_vp->v_mount);
again:
	if (nmp->nm_flag & NFSMNT_INT)
		slpflag = PCATCH;
	gotiod = FALSE;

	/*
	 * Find a free iod to process this request.
	 */
	for (i = 0; i < NFS_MAXASYNCDAEMON; i++)
		if (nfs_iodwant[i]) {
			/*
			 * Found one, so wake it up and tell it which
			 * mount to process.
			 */
			NFS_DPF(ASYNCIO,
				("nfs_asyncio: waking iod %d for mount %p\n",
				 i, nmp));
			nfs_iodwant[i] = (struct proc *)0;
			nfs_iodmount[i] = nmp;
			nmp->nm_bufqiods++;
			wakeup((caddr_t)&nfs_iodwant[i]);
			gotiod = TRUE;
			break;
		}

	/*
	 * If none are free, we may already have an iod working on this mount
	 * point. If so, it will process our request.
	 */
	if (!gotiod) {
		if (nmp->nm_bufqiods > 0) {
			NFS_DPF(ASYNCIO,
				("nfs_asyncio: %d iods are already processing mount %p\n",
				 nmp->nm_bufqiods, nmp));
			gotiod = TRUE;
		}
	}

	/*
	 * If we have an iod which can process the request, then queue
	 * the buffer.
	 */
	if (gotiod) {
		/*
		 * Ensure that the queue never grows too large.
		 */
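		/*
		 * The queue is capped at twice the number of nfsiods; when it
		 * is full, sleep on nm_bufq until an iod drains it. If every
		 * iod for this mount goes away while we sleep, loop back to
		 * "again" and hunt for a new one.
		 */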
		while (nmp->nm_bufqlen >= 2*nfs_numasync) {
			NFS_DPF(ASYNCIO,
				("nfs_asyncio: waiting for mount %p queue to drain\n", nmp));
			nmp->nm_bufqwant = TRUE;
			error = tsleep(&nmp->nm_bufq, slpflag | PRIBIO,
				       "nfsaio", slptimeo);
			if (error) {
				if (nfs_sigintr(nmp, NULL, bp->b_proc))
					return (EINTR);
				if (slpflag == PCATCH) {
					slpflag = 0;
					slptimeo = 2 * hz;
				}
			}
			/*
			 * We might have lost our iod while sleeping,
			 * so check and loop if necessary.
			 */
			if (nmp->nm_bufqiods == 0) {
				NFS_DPF(ASYNCIO,
					("nfs_asyncio: no iods after mount %p queue was drained, looping\n", nmp));
				goto again;
			}
		}

		if (ISSET(bp->b_flags, B_READ)) {
			if (bp->b_rcred == NOCRED && cred != NOCRED) {
				/*
				 * NFS has embedded ucred.
				 * Cannot crhold() here as that causes zone corruption.
				 */
				bp->b_rcred = crdup(cred);
			}
		} else {
			SET(bp->b_flags, B_WRITEINPROG);
			if (bp->b_wcred == NOCRED && cred != NOCRED) {
				/*
				 * NFS has embedded ucred.
				 * Cannot crhold() here as that causes zone corruption.
				 */
				bp->b_wcred = crdup(cred);
			}
		}

		TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, b_freelist);
		nmp->nm_bufqlen++;
		return (0);
	}

	/*
	 * All the iods are busy on other mounts, so return EIO to
	 * force the caller to process the i/o synchronously.
	 */
	NFS_DPF(ASYNCIO, ("nfs_asyncio: no iods available, i/o is synchronous\n"));
	return (EIO);
}

/*
 * Do an I/O operation to/from a cache block. This may be called
 * synchronously or from an nfsiod.
 */
int
nfs_doio(bp, cr, p)
	register struct buf *bp;
	struct ucred *cr;
	struct proc *p;
{
	register struct uio *uiop;
	register struct vnode *vp;
	struct nfsnode *np;
	struct nfsmount *nmp;
	int error = 0, diff, len, iomode, must_commit = 0;
	struct uio uio;
	struct iovec io;

	vp = bp->b_vp;
	np = VTONFS(vp);
	nmp = VFSTONFS(vp->v_mount);
	uiop = &uio;
	uiop->uio_iov = &io;
	uiop->uio_iovcnt = 1;
	uiop->uio_segflg = UIO_SYSSPACE;
	uiop->uio_procp = p;

	/*
	 * With UBC, getblk() can return a buf with B_DONE set.
	 * This indicates that the VM has valid data for that page.
	 * NFS being stateless, this case poses a problem.
	 * By definition, the NFS server should always be consulted
	 * for the data in that page.
	 * So we choose to clear the B_DONE and to do the IO.
	 *
	 * XXX revisit this if there is a performance issue.
	 * XXX In that case, we could play the attribute cache games ...
	 */
	if (ISSET(bp->b_flags, B_DONE)) {
		if (!ISSET(bp->b_flags, B_ASYNC))
			panic("nfs_doio: done and not async");
		CLR(bp->b_flags, B_DONE);
	}
	FSDBG_TOP(256, np->n_size, bp->b_blkno * DEV_BSIZE, bp->b_bcount,
		  bp->b_flags);
	FSDBG(257, bp->b_validoff, bp->b_validend, bp->b_dirtyoff,
	      bp->b_dirtyend);
	/*
	 * Historically, paging was done with physio, but no more.
	 */
	if (ISSET(bp->b_flags, B_PHYS)) {
		/*
		 * ...though reading /dev/drum still gets us here.
		 */
		io.iov_len = uiop->uio_resid = bp->b_bcount;
		/* mapping was done by vmapbuf() */
		io.iov_base = bp->b_data;
		uiop->uio_offset = (off_t)bp->b_blkno * DEV_BSIZE;
		if (ISSET(bp->b_flags, B_READ)) {
			uiop->uio_rw = UIO_READ;
			nfsstats.read_physios++;
			error = nfs_readrpc(vp, uiop, cr);
		} else {
			int com;

			iomode = NFSV3WRITE_DATASYNC;
			uiop->uio_rw = UIO_WRITE;
			nfsstats.write_physios++;
			error = nfs_writerpc(vp, uiop, cr, &iomode, &com);
		}
		if (error) {
			SET(bp->b_flags, B_ERROR);
			bp->b_error = error;
		}
	} else if (ISSET(bp->b_flags, B_READ)) {
		io.iov_len = uiop->uio_resid = bp->b_bcount;
		io.iov_base = bp->b_data;
		uiop->uio_rw = UIO_READ;
		switch (vp->v_type) {
		case VREG:
			uiop->uio_offset = (off_t)bp->b_blkno * DEV_BSIZE;
			nfsstats.read_bios++;
			error = nfs_readrpc(vp, uiop, cr);
			FSDBG(262, np->n_size, bp->b_blkno * DEV_BSIZE,
			      uiop->uio_resid, error);
			if (!error) {
				bp->b_validoff = 0;
				if (uiop->uio_resid) {
					/*
					 * If len > 0, there is a hole in the file and
					 * no writes after the hole have been pushed to
					 * the server yet.
					 * Just zero fill the rest of the valid area.
					 */
					diff = bp->b_bcount - uiop->uio_resid;
					len = np->n_size - ((u_quad_t)bp->b_blkno * DEV_BSIZE +
							    diff);
					if (len > 0) {
						len = min(len, uiop->uio_resid);
						bzero((char *)bp->b_data + diff, len);
						bp->b_validend = diff + len;
						FSDBG(258, diff, len, 0, 1);
					} else
						bp->b_validend = diff;
				} else
					bp->b_validend = bp->b_bcount;

				if (bp->b_validend < bp->b_bufsize) {
					/*
					 * We're about to release a partial buffer
					 * after a read; the only way we should get
					 * here is if this buffer contains the EOF.
					 * Before releasing it, zero out to the end of
					 * the buffer so that if a mmap of this page
					 * occurs, we see zeroes even if a ftruncate
					 * extends the file in the meantime.
					 */
					bzero((caddr_t)(bp->b_data + bp->b_validend),
					      bp->b_bufsize - bp->b_validend);
					FSDBG(258, bp->b_validend,
					      bp->b_bufsize - bp->b_validend, 0, 2);
				}
			}
			if (p && (vp->v_flag & VTEXT) &&
			    (((nmp->nm_flag & NFSMNT_NQNFS) &&
			      NQNFS_CKINVALID(vp, np, ND_READ) &&
			      np->n_lrev != np->n_brev) ||
			     (!(nmp->nm_flag & NFSMNT_NQNFS) &&
			      np->n_mtime != np->n_vattr.va_mtime.tv_sec))) {
				uprintf("Process killed due to text file modification\n");
				psignal(p, SIGKILL);
				p->p_flag |= P_NOSWAP;
			}
			break;
		case VLNK:
			uiop->uio_offset = (off_t)0;
			nfsstats.readlink_bios++;
			error = nfs_readlinkrpc(vp, uiop, cr);
			break;
		case VDIR:
			nfsstats.readdir_bios++;
			uiop->uio_offset = ((u_quad_t)bp->b_lblkno) * NFS_DIRBLKSIZ;
			if (!(nmp->nm_flag & NFSMNT_NFSV3))
				nmp->nm_flag &= ~NFSMNT_RDIRPLUS;	/* dk@farm.org */
			if (nmp->nm_flag & NFSMNT_RDIRPLUS) {
				error = nfs_readdirplusrpc(vp, uiop, cr);
				if (error == NFSERR_NOTSUPP)
					nmp->nm_flag &= ~NFSMNT_RDIRPLUS;
			}
			if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0)
				error = nfs_readdirrpc(vp, uiop, cr);
			break;
		default:
			printf("nfs_doio: type %x unexpected\n", vp->v_type);
			break;
		}
		if (error) {
			SET(bp->b_flags, B_ERROR);
			bp->b_error = error;
		}
	} else {
		/*
		 * Mapped I/O may have altered any bytes, so we extend
		 * the dirty zone to the valid zone. For best performance
		 * a better solution would be to save & restore page dirty bits
		 * around the uiomove which brings write-data into the buffer.
		 * Then here we'd check if the page is dirty rather than WASMAPPED.
		 * Also vnode_pager would change - if a page is clean it might
		 * still need to be written due to DELWRI.
		 */
		if (UBCINFOEXISTS(vp) && ubc_issetflags(vp, UI_WASMAPPED)) {
			bp->b_dirtyoff = min(bp->b_dirtyoff, bp->b_validoff);
			bp->b_dirtyend = max(bp->b_dirtyend, bp->b_validend);
		}
		if ((off_t)bp->b_blkno * DEV_BSIZE + bp->b_dirtyend > np->n_size)
			bp->b_dirtyend = np->n_size - (off_t)bp->b_blkno * DEV_BSIZE;

		if (bp->b_dirtyend > bp->b_dirtyoff) {
			io.iov_len = uiop->uio_resid = bp->b_dirtyend - bp->b_dirtyoff;
			uiop->uio_offset = (off_t)bp->b_blkno * DEV_BSIZE +
					   bp->b_dirtyoff;
			io.iov_base = (char *)bp->b_data + bp->b_dirtyoff;
			uiop->uio_rw = UIO_WRITE;

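			/*
			 * Pick the NFSv3 commitment level: an async write that
			 * is neither awaiting a commit nor marked no-cache may
			 * go out UNSTABLE, letting the server acknowledge it
			 * before the data reaches stable storage (B_NEEDCOMMIT
			 * is set below so a commit or FILESYNC rewrite follows
			 * later); everything else is sent FILESYNC.
			 */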
			nfsstats.write_bios++;
			if ((bp->b_flags & (B_ASYNC | B_NEEDCOMMIT | B_NOCACHE)) ==
			    B_ASYNC)
				iomode = NFSV3WRITE_UNSTABLE;
			else
				iomode = NFSV3WRITE_FILESYNC;
			SET(bp->b_flags, B_WRITEINPROG);
			error = nfs_writerpc(vp, uiop, cr, &iomode, &must_commit);
			if (!error && iomode == NFSV3WRITE_UNSTABLE)
				SET(bp->b_flags, B_NEEDCOMMIT);
			else
				CLR(bp->b_flags, B_NEEDCOMMIT);
			CLR(bp->b_flags, B_WRITEINPROG);
			/*
			 * For an interrupted write, the buffer is still valid
			 * and the write hasn't been pushed to the server yet,
			 * so we can't set B_ERROR and report the interruption
			 * by setting B_EINTR. For the B_ASYNC case, B_EINTR
			 * is not relevant, so the rpc attempt is essentially
			 * a noop. For the case of a V3 write rpc not being
			 * committed to stable storage, the block is still
			 * dirty and requires either a commit rpc or another
			 * write rpc with iomode == NFSV3WRITE_FILESYNC before
			 * the block is reused. This is indicated by setting
			 * the B_DELWRI and B_NEEDCOMMIT flags.
			 */
			if (error == EINTR || (!error && bp->b_flags & B_NEEDCOMMIT)) {
				int s;

				CLR(bp->b_flags, B_INVAL | B_NOCACHE);
				if (!ISSET(bp->b_flags, B_DELWRI)) {
					SET(bp->b_flags, B_DELWRI);
					nbdwrite++;
				}
				FSDBG(261, bp->b_validoff, bp->b_validend,
				      bp->b_bufsize, bp->b_bcount);
				/*
				 * Since for the B_ASYNC case, nfs_bwrite() has
				 * reassigned the buffer to the clean list, we have to
				 * reassign it back to the dirty one. Ugh.
				 */
				if (ISSET(bp->b_flags, B_ASYNC)) {
					s = splbio();
					reassignbuf(bp, vp);
					splx(s);
				} else {
					SET(bp->b_flags, B_EINTR);
				}
			} else {
				if (error) {
					SET(bp->b_flags, B_ERROR);
					bp->b_error = np->n_error = error;
					np->n_flag |= NWRITEERR;
				}
				bp->b_dirtyoff = bp->b_dirtyend = 0;

				/*
				 * validoff and validend represent the real data
				 * present in this buffer. If validoff is non-zero,
				 * then we have to invalidate the buffer and kill
				 * the page when biodone is called. The same is
				 * also true when validend doesn't extend all the
				 * way to the end of the buffer and validend doesn't
				 * equate to the current EOF. Eventually we need to
				 * deal with this in a more humane way (like keeping
				 * the partial buffer without making it immediately
				 * available to the VM page cache).
				 */
				if (bp->b_validoff)
					SET(bp->b_flags, B_INVAL);
				else if (bp->b_validend < bp->b_bufsize) {
					if ((off_t)bp->b_blkno * DEV_BSIZE +
					    bp->b_validend == np->n_size) {
						bzero((caddr_t)(bp->b_data +
						      bp->b_validend),
						      bp->b_bufsize - bp->b_validend);
						FSDBG(259, bp->b_validend,
						      bp->b_bufsize - bp->b_validend, 0,
						      0);
					} else
						SET(bp->b_flags, B_INVAL);
				}
			}

		} else {
			if (bp->b_validoff ||
			    (bp->b_validend < bp->b_bufsize &&
			     (off_t)bp->b_blkno * DEV_BSIZE + bp->b_validend !=
			     np->n_size)) {
				SET(bp->b_flags, B_INVAL);
			}
			if (bp->b_flags & B_INVAL) {
				FSDBG(260, bp->b_validoff, bp->b_validend,
				      bp->b_bufsize, bp->b_bcount);
			}
			bp->b_resid = 0;
			biodone(bp);
			FSDBG_BOT(256, bp->b_validoff, bp->b_validend, bp->b_bufsize,
				  np->n_size);
			return (0);
		}
	}
	bp->b_resid = uiop->uio_resid;
	if (must_commit)
		nfs_clearcommit(vp->v_mount);

	if (bp->b_flags & B_INVAL) {
		FSDBG(260, bp->b_validoff, bp->b_validend, bp->b_bufsize,
		      bp->b_bcount);
	}
	FSDBG_BOT(256, bp->b_validoff, bp->b_validend, bp->b_bcount, error);

	biodone(bp);
	return (error);
}