/*
 * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License"). You may not use this file except in compliance with the
 * License. Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)nfs_bio.c	8.9 (Berkeley) 3/30/95
 * FreeBSD-Id: nfs_bio.c,v 1.44 1997/09/10 19:52:25 phk Exp $
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/ubc.h>

#include <sys/vm.h>
#include <sys/vmparam.h>

#include <sys/time.h>
#include <kern/clock.h>

#include <nfs/rpcv2.h>
#include <nfs/nfsproto.h>
#include <nfs/nfs.h>
#include <nfs/nfsmount.h>
#include <nfs/nqnfs.h>
#include <nfs/nfsnode.h>

#include <sys/kdebug.h>

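/*
 * The FSDBG* macros emit kdebug tracepoints in the DBG_FSRW class so that
 * buffer-cache activity can be observed with the kernel trace tools; the
 * four payload arguments are truncated to ints for the kdebug record.
 */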
#define FSDBG(A, B, C, D, E) \
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_NONE, \
		     (int)(B), (int)(C), (int)(D), (int)(E), 0)
#define FSDBG_TOP(A, B, C, D, E) \
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_START, \
		     (int)(B), (int)(C), (int)(D), (int)(E), 0)
#define FSDBG_BOT(A, B, C, D, E) \
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_END, \
		     (int)(B), (int)(C), (int)(D), (int)(E), 0)

static struct buf *nfs_getcacheblk __P((struct vnode *vp, daddr_t bn, int size,
					struct proc *p, int operation));

extern int nfs_numasync;
extern struct nfsstats nfsstats;

/*
 * Vnode op for read using bio
 * Any similarity to readip() is purely coincidental
 */
int
nfs_bioread(vp, uio, ioflag, cred, getpages)
	register struct vnode *vp;
	register struct uio *uio;
	int ioflag;
	struct ucred *cred;
	int getpages;
{
	register struct nfsnode *np = VTONFS(vp);
	register int biosize, diff, i;
	struct buf *bp = 0, *rabp;
	struct vattr vattr;
	struct proc *p;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	daddr_t lbn, rabn;
	int bufsize;
	int nra, error = 0, n = 0, on = 0, not_readin;
	int operation = (getpages ? BLK_PAGEIN : BLK_READ);

#if DIAGNOSTIC
	if (uio->uio_rw != UIO_READ)
		panic("nfs_read mode");
#endif
	if (uio->uio_resid == 0)
		return (0);
	if (uio->uio_offset < 0)
		return (EINVAL);
	p = uio->uio_procp;
	if ((nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_GOTFSINFO)) == NFSMNT_NFSV3)
		(void)nfs_fsinfo(nmp, vp, cred, p);
	/* due to getblk/vm interactions, use vm page size or less values */
	biosize = min(vp->v_mount->mnt_stat.f_iosize, PAGE_SIZE);
	/*
	 * For nfs, cache consistency can only be maintained approximately.
	 * Although RFC1094 does not specify the criteria, the following is
	 * believed to be compatible with the reference port.
	 * For nqnfs, full cache consistency is maintained within the loop.
	 * For nfs:
	 * If the file's modify time on the server has changed since the
	 * last read rpc or you have written to the file,
	 * you may have lost data cache consistency with the
	 * server, so flush all of the file's data out of the cache.
	 * Then force a getattr rpc to ensure that you have up to date
	 * attributes.
	 * NB: This implies that cache data can be read when up to
	 * NFS_ATTRTIMEO seconds out of date. If you find that you need current
	 * attributes this could be forced by setting n_attrstamp to 0 before
	 * the VOP_GETATTR() call.
	 */
	if ((nmp->nm_flag & NFSMNT_NQNFS) == 0) {
		if (np->n_flag & NMODIFIED) {
			if (vp->v_type != VREG) {
				if (vp->v_type != VDIR)
					panic("nfs: bioread, not dir");
				nfs_invaldir(vp);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
			}
			np->n_attrstamp = 0;
			error = VOP_GETATTR(vp, &vattr, cred, p);
			if (error)
				return (error);
			np->n_mtime = vattr.va_mtime.tv_sec;
		} else {
			error = VOP_GETATTR(vp, &vattr, cred, p);
			if (error)
				return (error);
			if (np->n_mtime != vattr.va_mtime.tv_sec) {
				if (vp->v_type == VDIR)
					nfs_invaldir(vp);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
				np->n_mtime = vattr.va_mtime.tv_sec;
			}
		}
	}
	do {

		/*
		 * Get a valid lease. If cached data is stale, flush it.
		 */
		if (nmp->nm_flag & NFSMNT_NQNFS) {
			if (NQNFS_CKINVALID(vp, np, ND_READ)) {
				do {
					error = nqnfs_getlease(vp, ND_READ, cred, p);
				} while (error == NQNFS_EXPIRED);
				if (error)
					return (error);
				if (np->n_lrev != np->n_brev ||
				    (np->n_flag & NQNFSNONCACHE) ||
				    ((np->n_flag & NMODIFIED) && vp->v_type == VDIR)) {
					if (vp->v_type == VDIR)
						nfs_invaldir(vp);
					error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
					if (error)
						return (error);
					np->n_brev = np->n_lrev;
				}
			} else if (vp->v_type == VDIR && (np->n_flag & NMODIFIED)) {
				nfs_invaldir(vp);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
			}
		}
		if (np->n_flag & NQNFSNONCACHE) {
			switch (vp->v_type) {
			case VREG:
				return (nfs_readrpc(vp, uio, cred));
			case VLNK:
				return (nfs_readlinkrpc(vp, uio, cred));
			case VDIR:
				break;
			default:
				printf(" NQNFSNONCACHE: type %x unexpected\n",
				       vp->v_type);
			}
		}
		switch (vp->v_type) {
		case VREG:
			nfsstats.biocache_reads++;
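			/*
			 * Split the offset into a logical block number and an
			 * offset within the block; biosize is at most a page
			 * and assumed to be a power of two, so the mask works.
			 */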
			lbn = uio->uio_offset / biosize;
			on = uio->uio_offset & (biosize - 1);
			not_readin = 1;

			/*
			 * Start the read ahead(s), as required.
			 */
			if (nfs_numasync > 0 && nmp->nm_readahead > 0) {
				for (nra = 0; nra < nmp->nm_readahead &&
				     (off_t)(lbn + 1 + nra) * biosize < np->n_size;
				     nra++) {
					rabn = lbn + 1 + nra;
					if (!incore(vp, rabn)) {
						rabp = nfs_getcacheblk(vp, rabn, biosize, p, operation);
						if (!rabp)
							return (EINTR);
						if (!ISSET(rabp->b_flags, (B_CACHE|B_DELWRI))) {
							SET(rabp->b_flags, (B_READ | B_ASYNC));
							if (nfs_asyncio(rabp, cred)) {
								SET(rabp->b_flags, (B_INVAL|B_ERROR));
								rabp->b_error = EIO;
								brelse(rabp);
							}
						} else
							brelse(rabp);
					}
				}
			}

			/*
			 * If the block is in the cache and has the required data
			 * in a valid region, just copy it out.
			 * Otherwise, get the block and write back/read in,
			 * as required.
			 */
again:
			bufsize = biosize;
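			/*
			 * If this is the last block of the file, trim the
			 * buffer to cover just the bytes up to EOF, rounded
			 * up to a DEV_BSIZE boundary.
			 */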
			if ((off_t)(lbn + 1) * biosize > np->n_size &&
			    (off_t)(lbn + 1) * biosize - np->n_size < biosize) {
				bufsize = np->n_size - lbn * biosize;
				bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
			}
			bp = nfs_getcacheblk(vp, lbn, bufsize, p, operation);
			if (!bp)
				return (EINTR);

			if (!ISSET(bp->b_flags, B_CACHE)) {
				SET(bp->b_flags, B_READ);
				CLR(bp->b_flags, (B_DONE | B_ERROR | B_INVAL));
				not_readin = 0;
				error = nfs_doio(bp, cred, p);
				if (error) {
					brelse(bp);
					return (error);
				}
			}
			if (bufsize > on) {
				n = min((unsigned)(bufsize - on), uio->uio_resid);
			} else {
				n = 0;
			}
			diff = np->n_size - uio->uio_offset;
			if (diff < n)
				n = diff;
			if (not_readin && n > 0) {
				if (on < bp->b_validoff || (on + n) > bp->b_validend) {
					SET(bp->b_flags, (B_NOCACHE|B_INVAFTERWRITE));
					if (bp->b_dirtyend > 0) {
						if (!ISSET(bp->b_flags, B_DELWRI))
							panic("nfsbioread");
						if (VOP_BWRITE(bp) == EINTR)
							return (EINTR);
					} else
						brelse(bp);
					goto again;
				}
			}
			vp->v_lastr = lbn;
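			/*
			 * Clamp the copy to the portion of the buffer that
			 * was actually marked valid by the read.
			 */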
			diff = (on >= bp->b_validend) ? 0 : (bp->b_validend - on);
			if (diff < n)
				n = diff;
			break;
		case VLNK:
			nfsstats.biocache_readlinks++;
			bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, p, operation);
			if (!bp)
				return (EINTR);
			if (!ISSET(bp->b_flags, B_CACHE)) {
				SET(bp->b_flags, B_READ);
				error = nfs_doio(bp, cred, p);
				if (error) {
					SET(bp->b_flags, B_ERROR);
					brelse(bp);
					return (error);
				}
			}
			n = min(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid);
			on = 0;
			break;
		case VDIR:
			nfsstats.biocache_readdirs++;
			if (np->n_direofoffset
			    && uio->uio_offset >= np->n_direofoffset) {
				return (0);
			}
			lbn = uio->uio_offset / NFS_DIRBLKSIZ;
			on = uio->uio_offset & (NFS_DIRBLKSIZ - 1);
			bp = nfs_getcacheblk(vp, lbn, NFS_DIRBLKSIZ, p, operation);
			if (!bp)
				return (EINTR);
			if (!ISSET(bp->b_flags, B_CACHE)) {
				SET(bp->b_flags, B_READ);
				error = nfs_doio(bp, cred, p);
				if (error) {
					brelse(bp);
				}
				while (error == NFSERR_BAD_COOKIE) {
					nfs_invaldir(vp);
					error = nfs_vinvalbuf(vp, 0, cred, p, 1);
					/*
					 * Yuck! The directory has been modified on the
					 * server. The only way to get the block is by
					 * reading from the beginning to get all the
					 * offset cookies.
					 */
					for (i = 0; i <= lbn && !error; i++) {
						if (np->n_direofoffset
						    && (i * NFS_DIRBLKSIZ) >= np->n_direofoffset)
							return (0);
						bp = nfs_getcacheblk(vp, i, NFS_DIRBLKSIZ, p,
								     operation);
						if (!bp)
							return (EINTR);
						if (!ISSET(bp->b_flags, B_CACHE)) {
							SET(bp->b_flags, B_READ);
							error = nfs_doio(bp, cred, p);
							/*
							 * no error + B_INVAL == directory EOF,
							 * use the block.
							 */
							if (error == 0 && (bp->b_flags & B_INVAL))
								break;
						}
						/*
						 * An error will throw away the block and the
						 * for loop will break out. If no error and this
						 * is not the block we want, we throw away the
						 * block and go for the next one via the for loop.
						 */
						if (error || i < lbn)
							brelse(bp);
					}
				}
				/*
				 * The above while is repeated if we hit another cookie
				 * error. If we hit an error and it wasn't a cookie error,
				 * we give up.
				 */
				if (error)
					return (error);
			}

			/*
			 * If not eof and read aheads are enabled, start one.
			 * (You need the current block first, so that you have the
			 * directory offset cookie of the next block.)
			 */
			if (nfs_numasync > 0 && nmp->nm_readahead > 0 &&
			    (np->n_direofoffset == 0 ||
			     (lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) &&
			    !(np->n_flag & NQNFSNONCACHE) &&
			    !incore(vp, lbn + 1)) {
				rabp = nfs_getcacheblk(vp, lbn + 1, NFS_DIRBLKSIZ, p,
						       operation);
				if (rabp) {
					if (!ISSET(rabp->b_flags, (B_CACHE|B_DELWRI))) {
						SET(rabp->b_flags, (B_READ | B_ASYNC));
						if (nfs_asyncio(rabp, cred)) {
							SET(rabp->b_flags, (B_INVAL|B_ERROR));
							rabp->b_error = EIO;
							brelse(rabp);
						}
					} else {
						brelse(rabp);
					}
				}
			}
			/*
			 * Make sure we use a signed variant of min() since
			 * the second term may be negative.
			 */
			n = lmin(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid - on);
			/*
			 * Unlike VREG files, whose buffer size (bp->b_bcount) is
			 * chopped for the EOF condition, we cannot tell how large
			 * NFS directories are going to be until we hit EOF. So
			 * an NFS directory buffer is *not* chopped to its EOF. Now,
			 * it just so happens that b_resid will effectively chop it
			 * to EOF. *BUT* this information is lost if the buffer goes
			 * away and is reconstituted into a B_CACHE state (recovered
			 * from VM) later. So we keep track of the directory eof
			 * in np->n_direofoffset and chop it off as an extra step
			 * right here.
			 */
			if (np->n_direofoffset &&
			    n > np->n_direofoffset - uio->uio_offset)
				n = np->n_direofoffset - uio->uio_offset;
			break;
		default:
			printf(" nfs_bioread: type %x unexpected\n", vp->v_type);
			break;
		}

		if (n > 0) {
			error = uiomove(bp->b_data + on, (int)n, uio);
		}
		switch (vp->v_type) {
		case VREG:
			break;
		case VLNK:
			n = 0;
			break;
		case VDIR:
			if (np->n_flag & NQNFSNONCACHE)
				SET(bp->b_flags, B_INVAL);
			break;
		default:
			printf(" nfs_bioread: type %x unexpected\n", vp->v_type);
		}
		brelse(bp);
	} while (error == 0 && uio->uio_resid > 0 && n > 0);
	return (error);
}


/*
 * Vnode op for write using bio
 */
int
nfs_write(ap)
	struct vop_write_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		struct ucred *a_cred;
	} */ *ap;
{
	register int biosize;
	register struct uio *uio = ap->a_uio;
	struct proc *p = uio->uio_procp;
	register struct vnode *vp = ap->a_vp;
	struct nfsnode *np = VTONFS(vp);
	register struct ucred *cred = ap->a_cred;
	int ioflag = ap->a_ioflag;
	struct buf *bp;
	struct vattr vattr;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	daddr_t lbn;
	int bufsize;
	int n, on, error = 0, iomode, must_commit;
	off_t boff;
	struct iovec iov;
	struct uio auio;

#if DIAGNOSTIC
	if (uio->uio_rw != UIO_WRITE)
		panic("nfs_write mode");
	if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != current_proc())
		panic("nfs_write proc");
#endif
	if (vp->v_type != VREG)
		return (EIO);
	if (np->n_flag & NWRITEERR) {
		np->n_flag &= ~NWRITEERR;
		return (np->n_error);
	}
	if ((nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_GOTFSINFO)) == NFSMNT_NFSV3)
		(void)nfs_fsinfo(nmp, vp, cred, p);
	if (ioflag & (IO_APPEND | IO_SYNC)) {
		if (np->n_flag & NMODIFIED) {
			np->n_attrstamp = 0;
			error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
			if (error)
				return (error);
		}
		if (ioflag & IO_APPEND) {
			np->n_attrstamp = 0;
			error = VOP_GETATTR(vp, &vattr, cred, p);
			if (error)
				return (error);
			uio->uio_offset = np->n_size;
		}
	}
	if (uio->uio_offset < 0)
		return (EINVAL);
	if (uio->uio_resid == 0)
		return (0);
	/*
	 * Maybe this should be above the vnode op call, but so long as
	 * file servers have no limits, I don't think it matters
	 */
	if (p && uio->uio_offset + uio->uio_resid >
	    p->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
		psignal(p, SIGXFSZ);
		return (EFBIG);
	}
	/*
	 * I use nm_rsize, not nm_wsize so that all buffer cache blocks
	 * will be the same size within a filesystem. nfs_writerpc will
	 * still use nm_wsize when sizing the rpc's.
	 */
	/* due to getblk/vm interactions, use vm page size or less values */
	biosize = min(vp->v_mount->mnt_stat.f_iosize, PAGE_SIZE);

	do {
		/*
		 * Check for a valid write lease.
		 */
		if ((nmp->nm_flag & NFSMNT_NQNFS) &&
		    NQNFS_CKINVALID(vp, np, ND_WRITE)) {
			do {
				error = nqnfs_getlease(vp, ND_WRITE, cred, p);
			} while (error == NQNFS_EXPIRED);
			if (error)
				return (error);
			if (np->n_lrev != np->n_brev ||
			    (np->n_flag & NQNFSNONCACHE)) {
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
				np->n_brev = np->n_lrev;
			}
		}
		if ((np->n_flag & NQNFSNONCACHE) && uio->uio_iovcnt == 1) {
			iomode = NFSV3WRITE_FILESYNC;
			error = nfs_writerpc(vp, uio, cred, &iomode, &must_commit);
			if (must_commit)
				nfs_clearcommit(vp->v_mount);
			return (error);
		}
		nfsstats.biocache_writes++;
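		/*
		 * Split the offset into a logical block number, an offset
		 * within the block, and the byte count to copy this pass.
		 */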
		lbn = uio->uio_offset / biosize;
		on = uio->uio_offset & (biosize - 1);
		n = min((unsigned)(biosize - on), uio->uio_resid);
again:
		bufsize = biosize;
#if 0
		/* (removed for UBC) */
		if ((lbn + 1) * biosize > np->n_size) {
			bufsize = np->n_size - lbn * biosize;
			bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
		}
#endif
		/*
		 * Get a cache block for writing. The range to be written is
		 * (off..off+len) within the block. We ensure that the block
		 * either has no dirty region or that the given range is
		 * contiguous with the existing dirty region.
		 */
		bp = nfs_getcacheblk(vp, lbn, bufsize, p, BLK_WRITE);
		if (!bp)
			return (EINTR);
		/*
		 * Resize nfsnode *after* we busy the buffer to prevent
		 * readers from reading garbage.
		 * If there was a partial buf at the old eof, validate
		 * and zero the new bytes.
		 */
		if (uio->uio_offset + n > np->n_size) {
			struct buf *bp0 = NULL;
			daddr_t bn = np->n_size / biosize;
			int off = np->n_size & (biosize - 1);

			if (off && bn < lbn && incore(vp, bn))
				bp0 = nfs_getcacheblk(vp, bn, biosize, p,
						      BLK_WRITE);
			np->n_flag |= NMODIFIED;
			np->n_size = uio->uio_offset + n;
			ubc_setsize(vp, (off_t)np->n_size); /* XXX errors */
			if (bp0) {
				bzero((char *)bp0->b_data + off, biosize - off);
				bp0->b_validend = biosize;
				brelse(bp0);
			}
		}
		/*
		 * NFS has embedded ucred so crhold() risks zone corruption
		 */
		if (bp->b_wcred == NOCRED)
			bp->b_wcred = crdup(cred);
		/*
		 * If dirtyend exceeds file size, chop it down. This should
		 * not occur unless there is a race.
		 */
		if ((off_t)bp->b_blkno * DEV_BSIZE + bp->b_dirtyend >
		    np->n_size)
			bp->b_dirtyend = np->n_size - (off_t)bp->b_blkno *
			    DEV_BSIZE;
		/*
		 * UBC doesn't (yet) handle partial pages so nfs_biowrite was
		 * hacked to never bdwrite, to start every little write right
		 * away. Running IE, Avie noticed the performance problem, thus
		 * this code, which permits those delayed writes by ensuring an
		 * initial read of the entire page. The read may hit eof
		 * ("short read") but we will handle that.
		 *
		 * We are quite dependent on the correctness of B_CACHE so check
		 * that first in case of problems.
		 */
		if (!ISSET(bp->b_flags, B_CACHE) && n < PAGE_SIZE) {
			boff = (off_t)bp->b_blkno * DEV_BSIZE;
			auio.uio_iov = &iov;
			auio.uio_iovcnt = 1;
			auio.uio_offset = boff;
			auio.uio_resid = PAGE_SIZE;
			auio.uio_segflg = UIO_SYSSPACE;
			auio.uio_rw = UIO_READ;
			auio.uio_procp = p;
			iov.iov_base = bp->b_data;
			iov.iov_len = PAGE_SIZE;
			error = nfs_readrpc(vp, &auio, cred);
			if (error) {
				bp->b_error = error;
				SET(bp->b_flags, B_ERROR);
				printf("nfs_write: readrpc %d\n", error);
			}
			if (auio.uio_resid > 0)
				bzero(iov.iov_base, auio.uio_resid);
			bp->b_validoff = 0;
			bp->b_validend = PAGE_SIZE - auio.uio_resid;
			if (np->n_size > boff + bp->b_validend)
				bp->b_validend = min(np->n_size - boff,
						     PAGE_SIZE);
			bp->b_dirtyoff = 0;
			bp->b_dirtyend = 0;
		}

		/*
		 * If the new write will leave a contiguous dirty
		 * area, just update the b_dirtyoff and b_dirtyend,
		 * otherwise try to extend the dirty region.
		 */
		if (bp->b_dirtyend > 0 &&
		    (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) {
			off_t start, end;

			boff = (off_t)bp->b_blkno * DEV_BSIZE;
			if (on > bp->b_dirtyend) {
				start = boff + bp->b_validend;
				end = boff + on;
			} else {
				start = boff + on + n;
				end = boff + bp->b_validoff;
			}

			/*
			 * It may be that the valid region in the buffer
			 * covers the region we want, in which case just
			 * extend the dirty region. Otherwise we try to
			 * extend the valid region.
			 */
			if (end > start) {
				auio.uio_iov = &iov;
				auio.uio_iovcnt = 1;
				auio.uio_offset = start;
				auio.uio_resid = end - start;
				auio.uio_segflg = UIO_SYSSPACE;
				auio.uio_rw = UIO_READ;
				auio.uio_procp = p;
				iov.iov_base = bp->b_data + (start - boff);
				iov.iov_len = end - start;
				error = nfs_readrpc(vp, &auio, cred);
				/*
				 * If the read failed, do not do a VOP_BWRITE
				 * as originally coded. That could also fail,
				 * and looping back to "again" could leave us
				 * stuck writing the same buffer over and over.
				 * If nfs_readrpc succeeded we have the entire
				 * region; if it failed we should just error
				 * out, since errors like ESTALE would keep us
				 * looping rather than being transient errors
				 * that justify a retry. So return here instead
				 * of altering the dirty region later; at this
				 * point the old dirty region has not been
				 * written.
				 */
				if (error) {
					bp->b_error = error;
					SET(bp->b_flags, B_ERROR);
					printf("nfs_write: readrpc2 %d\n", error);
					brelse(bp);
					return (error);
				}
				/*
				 * The read worked.
				 * If there was a short read, just zero fill.
				 */
				if (auio.uio_resid > 0)
					bzero(iov.iov_base, auio.uio_resid);
				if (on > bp->b_dirtyend)
					bp->b_validend = on;
				else
					bp->b_validoff = on + n;
			}
			/*
			 * We now have a valid region which extends up to the
			 * dirty region which we want.
			 */
			if (on > bp->b_dirtyend)
				bp->b_dirtyend = on;
			else
				bp->b_dirtyoff = on + n;
		}
		if (ISSET(bp->b_flags, B_ERROR)) {
			error = bp->b_error;
			brelse(bp);
			return (error);
		}
		/*
		 * NFS has embedded ucred so crhold() risks zone corruption
		 */
		if (bp->b_wcred == NOCRED)
			bp->b_wcred = crdup(cred);
		np->n_flag |= NMODIFIED;

		/*
		 * Check for valid write lease and get one as required.
		 * In case getblk() and/or bwrite() delayed us.
		 */
		if ((nmp->nm_flag & NFSMNT_NQNFS) &&
		    NQNFS_CKINVALID(vp, np, ND_WRITE)) {
			do {
				error = nqnfs_getlease(vp, ND_WRITE, cred, p);
			} while (error == NQNFS_EXPIRED);
			if (error) {
				brelse(bp);
				return (error);
			}
			if (np->n_lrev != np->n_brev ||
			    (np->n_flag & NQNFSNONCACHE)) {
				brelse(bp);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
				np->n_brev = np->n_lrev;
				goto again;
			}
		}
		error = uiomove((char *)bp->b_data + on, n, uio);
		if (error) {
			SET(bp->b_flags, B_ERROR);
			brelse(bp);
			return (error);
		}
		if (bp->b_dirtyend > 0) {
			bp->b_dirtyoff = min(on, bp->b_dirtyoff);
			bp->b_dirtyend = max((on + n), bp->b_dirtyend);
		} else {
			bp->b_dirtyoff = on;
			bp->b_dirtyend = on + n;
		}
		if (bp->b_validend == 0 || bp->b_validend < bp->b_dirtyoff ||
		    bp->b_validoff > bp->b_dirtyend) {
			bp->b_validoff = bp->b_dirtyoff;
			bp->b_validend = bp->b_dirtyend;
		} else {
			bp->b_validoff = min(bp->b_validoff, bp->b_dirtyoff);
			bp->b_validend = max(bp->b_validend, bp->b_dirtyend);
		}

		/*
		 * Since this block is being modified, it must be written
		 * again and not just committed.
		 */
		CLR(bp->b_flags, B_NEEDCOMMIT);

		/*
		 * If the lease is non-cacheable or IO_SYNC, do bwrite().
		 */
		if ((np->n_flag & NQNFSNONCACHE) || (ioflag & IO_SYNC)) {
			bp->b_proc = p;
			error = VOP_BWRITE(bp);
			if (error)
				return (error);
			if (np->n_flag & NQNFSNONCACHE) {
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
			}
		} else if ((n + on) == biosize &&
			   (nmp->nm_flag & NFSMNT_NQNFS) == 0) {
			bp->b_proc = (struct proc *)0;
			SET(bp->b_flags, B_ASYNC);
			(void)nfs_writebp(bp, 0);
		} else
			bdwrite(bp);
	} while (uio->uio_resid > 0 && n > 0);
	return (0);
}


/*
 * Get an nfs cache block.
 * Allocate a new one if the block isn't currently in the cache
 * and return the block marked busy. If the calling process is
 * interrupted by a signal for an interruptible mount point, return
 * NULL.
 */
static struct buf *
nfs_getcacheblk(vp, bn, size, p, operation)
	struct vnode *vp;
	daddr_t bn;
	int size;
	struct proc *p;
	int operation;	/* defined in sys/buf.h */
{
	register struct buf *bp;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	/* due to getblk/vm interactions, use vm page size or less values */
	int biosize = min(vp->v_mount->mnt_stat.f_iosize, PAGE_SIZE);

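	/*
	 * On interruptible mounts, wake up periodically so a pending
	 * signal can abort the wait for a busy buffer.
	 */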
	if (nmp->nm_flag & NFSMNT_INT) {
		bp = getblk(vp, bn, size, PCATCH, 0, operation);
		while (bp == (struct buf *)0) {
			if (nfs_sigintr(nmp, (struct nfsreq *)0, p))
				return ((struct buf *)0);
			bp = getblk(vp, bn, size, 0, 2 * hz, operation);
		}
	} else
		bp = getblk(vp, bn, size, 0, 0, operation);

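	/*
	 * For regular files, express the buffer's position in DEV_BSIZE
	 * units; the valid/dirty range arithmetic elsewhere computes file
	 * offsets as b_blkno * DEV_BSIZE.
	 */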
	if (vp->v_type == VREG)
		bp->b_blkno = (bn * biosize) / DEV_BSIZE;

	return (bp);
}

/*
 * Flush and invalidate all dirty buffers. If another process is already
 * doing the flush, just wait for completion.
 */
int
nfs_vinvalbuf(vp, flags, cred, p, intrflg)
	struct vnode *vp;
	int flags;
	struct ucred *cred;
	struct proc *p;
	int intrflg;
{
	register struct nfsnode *np = VTONFS(vp);
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	int error = 0, slpflag, slptimeo;
	int didhold = 0;

	if ((nmp->nm_flag & NFSMNT_INT) == 0)
		intrflg = 0;
	if (intrflg) {
		slpflag = PCATCH;
		slptimeo = 2 * hz;
	} else {
		slpflag = 0;
		slptimeo = 0;
	}
	/*
	 * First wait for any other process doing a flush to complete.
	 */
	while (np->n_flag & NFLUSHINPROG) {
		np->n_flag |= NFLUSHWANT;
		error = tsleep((caddr_t)&np->n_flag, PRIBIO + 2, "nfsvinval",
			       slptimeo);
		if (error && intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p))
			return (EINTR);
	}

	/*
	 * Now, flush as required.
	 */
	np->n_flag |= NFLUSHINPROG;
	error = vinvalbuf(vp, flags, cred, p, slpflag, 0);
	while (error) {
		/*
		 * We can get stuck in a loop here if the thread was
		 * aborted: nfs_flush will keep returning EINTR. (It is
		 * not clear whether that has other consequences, since
		 * EINTR has other meanings in NFS.) With no dirty pages
		 * it seems safe to simply return from here; but if there
		 * were dirty pages, how would they get written out once
		 * the thread was aborted? Some other strategy is
		 * necessary. -- EKN
		 */
		if ((intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p)) ||
		    (error == EINTR && current_thread_aborted())) {
			np->n_flag &= ~NFLUSHINPROG;
			if (np->n_flag & NFLUSHWANT) {
				np->n_flag &= ~NFLUSHWANT;
				wakeup((caddr_t)&np->n_flag);
			}
			return (EINTR);
		}
		error = vinvalbuf(vp, flags, cred, p, 0, slptimeo);
	}
	np->n_flag &= ~(NMODIFIED | NFLUSHINPROG);
	if (np->n_flag & NFLUSHWANT) {
		np->n_flag &= ~NFLUSHWANT;
		wakeup((caddr_t)&np->n_flag);
	}
	didhold = ubc_hold(vp);
	if (didhold) {
		(void) ubc_clean(vp, 1); /* get the pages out of vm also */
		ubc_rele(vp);
	}
	return (0);
}

/*
 * Initiate asynchronous I/O. Return an error if no nfsiods are available.
 * This is mainly to avoid queueing async I/O requests when the nfsiods
 * are all hung on a dead server.
 */
int
nfs_asyncio(bp, cred)
	register struct buf *bp;
	struct ucred *cred;
{
	struct nfsmount *nmp;
	int i;
	int gotiod;
	int slpflag = 0;
	int slptimeo = 0;
	int error;

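	/*
	 * With no nfsiod threads running at all, fail immediately so the
	 * caller falls back to synchronous I/O.
	 */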
	if (nfs_numasync == 0)
		return (EIO);

	nmp = VFSTONFS(bp->b_vp->v_mount);
again:
	if (nmp->nm_flag & NFSMNT_INT)
		slpflag = PCATCH;
	gotiod = FALSE;

	/*
	 * Find a free iod to process this request.
	 */
	for (i = 0; i < NFS_MAXASYNCDAEMON; i++)
		if (nfs_iodwant[i]) {
			/*
			 * Found one, so wake it up and tell it which
			 * mount to process.
			 */
			NFS_DPF(ASYNCIO,
				("nfs_asyncio: waking iod %d for mount %p\n",
				 i, nmp));
			nfs_iodwant[i] = (struct proc *)0;
			nfs_iodmount[i] = nmp;
			nmp->nm_bufqiods++;
			wakeup((caddr_t)&nfs_iodwant[i]);
			gotiod = TRUE;
			break;
		}

	/*
	 * If none are free, we may already have an iod working on this mount
	 * point. If so, it will process our request.
	 */
	if (!gotiod) {
		if (nmp->nm_bufqiods > 0) {
			NFS_DPF(ASYNCIO,
				("nfs_asyncio: %d iods are already processing mount %p\n",
				 nmp->nm_bufqiods, nmp));
			gotiod = TRUE;
		}
	}

	/*
	 * If we have an iod which can process the request, then queue
	 * the buffer.
	 */
	if (gotiod) {
		/*
		 * Ensure that the queue never grows too large.
		 */
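		/*
		 * (The bound of two queued buffers per iod presumably keeps
		 * the iods busy without letting a dead server absorb an
		 * unbounded number of buffers.)
		 */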
		while (nmp->nm_bufqlen >= 2*nfs_numasync) {
			NFS_DPF(ASYNCIO,
				("nfs_asyncio: waiting for mount %p queue to drain\n", nmp));
			nmp->nm_bufqwant = TRUE;
			error = tsleep(&nmp->nm_bufq, slpflag | PRIBIO,
				       "nfsaio", slptimeo);
			if (error) {
				if (nfs_sigintr(nmp, NULL, bp->b_proc))
					return (EINTR);
				if (slpflag == PCATCH) {
					slpflag = 0;
					slptimeo = 2 * hz;
				}
			}
			/*
			 * We might have lost our iod while sleeping,
			 * so check and loop if necessary.
			 */
			if (nmp->nm_bufqiods == 0) {
				NFS_DPF(ASYNCIO,
					("nfs_asyncio: no iods after mount %p queue was drained, looping\n", nmp));
				goto again;
			}
		}

		if (ISSET(bp->b_flags, B_READ)) {
			if (bp->b_rcred == NOCRED && cred != NOCRED) {
				/*
				 * NFS has embedded ucred.
				 * Can not crhold() here as that causes zone corruption
				 */
				bp->b_rcred = crdup(cred);
			}
		} else {
			SET(bp->b_flags, B_WRITEINPROG);
			if (bp->b_wcred == NOCRED && cred != NOCRED) {
				/*
				 * NFS has embedded ucred.
				 * Can not crhold() here as that causes zone corruption
				 */
				bp->b_wcred = crdup(cred);
			}
		}

		TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, b_freelist);
		nmp->nm_bufqlen++;
		return (0);
	}

	/*
	 * All the iods are busy on other mounts, so return EIO to
	 * force the caller to process the i/o synchronously.
	 */
	NFS_DPF(ASYNCIO, ("nfs_asyncio: no iods available, i/o is synchronous\n"));
	return (EIO);
}

/*
 * Do an I/O operation to/from a cache block. This may be called
 * synchronously or from an nfsiod.
 */
int
nfs_doio(bp, cr, p)
	register struct buf *bp;
	struct ucred *cr;
	struct proc *p;
{
	register struct uio *uiop;
	register struct vnode *vp;
	struct nfsnode *np;
	struct nfsmount *nmp;
	int error = 0, diff, len, iomode, must_commit = 0;
	struct uio uio;
	struct iovec io;

	vp = bp->b_vp;
	np = VTONFS(vp);
	nmp = VFSTONFS(vp->v_mount);
	uiop = &uio;
	uiop->uio_iov = &io;
	uiop->uio_iovcnt = 1;
	uiop->uio_segflg = UIO_SYSSPACE;
	uiop->uio_procp = p;

	/*
	 * With UBC, getblk() can return a buf with B_DONE set.
	 * This indicates that the VM has valid data for that page.
	 * NFS being stateless, this case poses a problem.
	 * By definition, the NFS server should always be consulted
	 * for the data in that page.
	 * So we choose to clear the B_DONE and to do the IO.
	 *
	 * XXX revisit this if there is a performance issue.
	 * XXX In that case, we could play the attribute cache games ...
	 */
	if (ISSET(bp->b_flags, B_DONE)) {
		if (!ISSET(bp->b_flags, B_ASYNC))
			panic("nfs_doio: done and not async");
		CLR(bp->b_flags, B_DONE);
	}
	FSDBG_TOP(256, np->n_size, bp->b_blkno * DEV_BSIZE, bp->b_bcount,
		  bp->b_flags);
	FSDBG(257, bp->b_validoff, bp->b_validend, bp->b_dirtyoff,
	      bp->b_dirtyend);
	/*
	 * Historically, paging was done with physio, but no more.
	 */
	if (ISSET(bp->b_flags, B_PHYS)) {
		/*
		 * ...though reading /dev/drum still gets us here.
		 */
		io.iov_len = uiop->uio_resid = bp->b_bcount;
		/* mapping was done by vmapbuf() */
		io.iov_base = bp->b_data;
		uiop->uio_offset = (off_t)bp->b_blkno * DEV_BSIZE;
		if (ISSET(bp->b_flags, B_READ)) {
			uiop->uio_rw = UIO_READ;
			nfsstats.read_physios++;
			error = nfs_readrpc(vp, uiop, cr);
		} else {
			int com;

			iomode = NFSV3WRITE_DATASYNC;
			uiop->uio_rw = UIO_WRITE;
			nfsstats.write_physios++;
			error = nfs_writerpc(vp, uiop, cr, &iomode, &com);
		}
		if (error) {
			SET(bp->b_flags, B_ERROR);
			bp->b_error = error;
		}
	} else if (ISSET(bp->b_flags, B_READ)) {
		io.iov_len = uiop->uio_resid = bp->b_bcount;
		io.iov_base = bp->b_data;
		uiop->uio_rw = UIO_READ;
		switch (vp->v_type) {
		case VREG:
			uiop->uio_offset = (off_t)bp->b_blkno * DEV_BSIZE;
			nfsstats.read_bios++;
			error = nfs_readrpc(vp, uiop, cr);
			FSDBG(262, np->n_size, bp->b_blkno * DEV_BSIZE,
			      uiop->uio_resid, error);
			if (!error) {
				bp->b_validoff = 0;
				if (uiop->uio_resid) {
					/*
					 * If len > 0, there is a hole in the file and
					 * no writes after the hole have been pushed to
					 * the server yet.
					 * Just zero fill the rest of the valid area.
					 */
					diff = bp->b_bcount - uiop->uio_resid;
					len = np->n_size - ((u_quad_t)bp->b_blkno * DEV_BSIZE +
							    diff);
					if (len > 0) {
						len = min(len, uiop->uio_resid);
						bzero((char *)bp->b_data + diff, len);
						bp->b_validend = diff + len;
						FSDBG(258, diff, len, 0, 1);
					} else
						bp->b_validend = diff;
				} else
					bp->b_validend = bp->b_bcount;
#if 1 /* USV + JOE [ */
				if (bp->b_validend < bp->b_bufsize) {
					/*
					 * we're about to release a partial buffer after a
					 * read... the only way we should get here is if
					 * this buffer contains the EOF... before releasing
					 * it, we'll zero out to the end of the buffer so
					 * that if a mmap of this page occurs, we'll see
					 * zeros even if a ftruncate extends the file in
					 * the meantime
					 */
					bzero((caddr_t)(bp->b_data + bp->b_validend),
					      bp->b_bufsize - bp->b_validend);
					FSDBG(258, bp->b_validend,
					      bp->b_bufsize - bp->b_validend, 0, 2);
				}
#endif /* ] USV + JOE */
			}
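			/*
			 * The executable's backing file changed on the server
			 * while some process was running it: kill the process,
			 * since its text pages may no longer match the file.
			 */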
			if (p && (vp->v_flag & VTEXT) &&
			    (((nmp->nm_flag & NFSMNT_NQNFS) &&
			      NQNFS_CKINVALID(vp, np, ND_READ) &&
			      np->n_lrev != np->n_brev) ||
			     (!(nmp->nm_flag & NFSMNT_NQNFS) &&
			      np->n_mtime != np->n_vattr.va_mtime.tv_sec))) {
				uprintf("Process killed due to text file modification\n");
				psignal(p, SIGKILL);
				p->p_flag |= P_NOSWAP;
			}
			break;
		case VLNK:
			uiop->uio_offset = (off_t)0;
			nfsstats.readlink_bios++;
			error = nfs_readlinkrpc(vp, uiop, cr);
			break;
		case VDIR:
			nfsstats.readdir_bios++;
			uiop->uio_offset = ((u_quad_t)bp->b_lblkno) * NFS_DIRBLKSIZ;
			if (!(nmp->nm_flag & NFSMNT_NFSV3))
				nmp->nm_flag &= ~NFSMNT_RDIRPLUS;	/* dk@farm.org */
			if (nmp->nm_flag & NFSMNT_RDIRPLUS) {
				error = nfs_readdirplusrpc(vp, uiop, cr);
				if (error == NFSERR_NOTSUPP)
					nmp->nm_flag &= ~NFSMNT_RDIRPLUS;
			}
			if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0)
				error = nfs_readdirrpc(vp, uiop, cr);
			break;
		default:
			printf("nfs_doio: type %x unexpected\n", vp->v_type);
			break;
		}
		if (error) {
			SET(bp->b_flags, B_ERROR);
			bp->b_error = error;
		}
	} else {
		/*
		 * mapped I/O may have altered any bytes, so we extend
		 * the dirty zone to the valid zone. For best performance
		 * a better solution would be to save & restore page dirty bits
		 * around the uiomove which brings write-data into the buffer.
		 * Then here we'd check if the page is dirty rather than WASMAPPED
		 * Also vnode_pager would change - if a page is clean it might
		 * still need to be written due to DELWRI.
		 */
		if (UBCINFOEXISTS(vp) && ubc_issetflags(vp, UI_WASMAPPED)) {
			bp->b_dirtyoff = min(bp->b_dirtyoff, bp->b_validoff);
			bp->b_dirtyend = max(bp->b_dirtyend, bp->b_validend);
		}
		if ((off_t)bp->b_blkno * DEV_BSIZE + bp->b_dirtyend > np->n_size)
			bp->b_dirtyend = np->n_size - (off_t)bp->b_blkno * DEV_BSIZE;

		if (bp->b_dirtyend > bp->b_dirtyoff) {
			io.iov_len = uiop->uio_resid = bp->b_dirtyend - bp->b_dirtyoff;
			uiop->uio_offset = (off_t)bp->b_blkno * DEV_BSIZE +
			    bp->b_dirtyoff;
			io.iov_base = (char *)bp->b_data + bp->b_dirtyoff;
			uiop->uio_rw = UIO_WRITE;

			nfsstats.write_bios++;
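			/*
			 * Plain async writes go out UNSTABLE, letting the
			 * server reply before committing to stable storage;
			 * such blocks must be committed later, hence
			 * B_NEEDCOMMIT below. Everything else uses FILESYNC.
			 */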
			if ((bp->b_flags & (B_ASYNC | B_NEEDCOMMIT | B_NOCACHE)) ==
			    B_ASYNC)
				iomode = NFSV3WRITE_UNSTABLE;
			else
				iomode = NFSV3WRITE_FILESYNC;
			SET(bp->b_flags, B_WRITEINPROG);
			error = nfs_writerpc(vp, uiop, cr, &iomode, &must_commit);
			if (!error && iomode == NFSV3WRITE_UNSTABLE)
				SET(bp->b_flags, B_NEEDCOMMIT);
			else
				CLR(bp->b_flags, B_NEEDCOMMIT);
			CLR(bp->b_flags, B_WRITEINPROG);
			/*
			 * For an interrupted write, the buffer is still valid
			 * and the write hasn't been pushed to the server yet,
			 * so we can't set B_ERROR and report the interruption
			 * by setting B_EINTR. For the B_ASYNC case, B_EINTR
			 * is not relevant, so the rpc attempt is essentially
			 * a noop. For the case of a V3 write rpc not being
			 * committed to stable storage, the block is still
			 * dirty and requires either a commit rpc or another
			 * write rpc with iomode == NFSV3WRITE_FILESYNC before
			 * the block is reused. This is indicated by setting
			 * the B_DELWRI and B_NEEDCOMMIT flags.
			 */
			if (error == EINTR || (!error && bp->b_flags & B_NEEDCOMMIT)) {
				int s;

				CLR(bp->b_flags, B_INVAL | B_NOCACHE);
				if (!ISSET(bp->b_flags, B_DELWRI)) {
					extern int nbdwrite;
					SET(bp->b_flags, B_DELWRI);
					nbdwrite++;
				}
				FSDBG(261, bp->b_validoff, bp->b_validend,
				      bp->b_bufsize, bp->b_bcount);
				/*
				 * Since for the B_ASYNC case, nfs_bwrite() has
				 * reassigned the buffer to the clean list, we have to
				 * reassign it back to the dirty one. Ugh.
				 */
				if (ISSET(bp->b_flags, B_ASYNC)) {
					s = splbio();
					reassignbuf(bp, vp);
					splx(s);
				} else {
					SET(bp->b_flags, B_EINTR);
				}
			} else {
				if (error) {
					SET(bp->b_flags, B_ERROR);
					bp->b_error = np->n_error = error;
					np->n_flag |= NWRITEERR;
				}
				bp->b_dirtyoff = bp->b_dirtyend = 0;
#if 1 /* JOE */
				/*
				 * validoff and validend represent the real data
				 * present in this buffer. If validoff is non-zero,
				 * then we have to invalidate the buffer and kill
				 * the page when biodone is called... the same is
				 * also true when validend doesn't extend all the
				 * way to the end of the buffer and validend doesn't
				 * equate to the current EOF... eventually we need
				 * to deal with this in a more humane way (like
				 * keeping the partial buffer without making it
				 * immediately available to the VM page cache).
				 */
				if (bp->b_validoff)
					SET(bp->b_flags, B_INVAL);
				else
				if (bp->b_validend < bp->b_bufsize) {
					if ((off_t)bp->b_blkno * DEV_BSIZE +
					    bp->b_validend == np->n_size) {
						bzero((caddr_t)(bp->b_data +
								bp->b_validend),
						      bp->b_bufsize - bp->b_validend);
						FSDBG(259, bp->b_validend,
						      bp->b_bufsize - bp->b_validend, 0,
						      0);
					} else
						SET(bp->b_flags, B_INVAL);
				}
#endif
			}

		} else {
#if 1 /* JOE */
			if (bp->b_validoff ||
			    (bp->b_validend < bp->b_bufsize &&
			     (off_t)bp->b_blkno * DEV_BSIZE + bp->b_validend !=
			     np->n_size)) {
				SET(bp->b_flags, B_INVAL);
			}
			if (bp->b_flags & B_INVAL) {
				FSDBG(260, bp->b_validoff, bp->b_validend,
				      bp->b_bufsize, bp->b_bcount);
			}
#endif
			bp->b_resid = 0;
			biodone(bp);
			FSDBG_BOT(256, bp->b_validoff, bp->b_validend, bp->b_bufsize,
				  np->n_size);
			return (0);
		}
	}
	bp->b_resid = uiop->uio_resid;
	if (must_commit)
		nfs_clearcommit(vp->v_mount);

	if (bp->b_flags & B_INVAL) {
		FSDBG(260, bp->b_validoff, bp->b_validend, bp->b_bufsize,
		      bp->b_bcount);
	}
	FSDBG_BOT(256, bp->b_validoff, bp->b_validend, bp->b_bcount, error);

	biodone(bp);
	return (error);
}