/*
 * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License"). You may not use this file except in compliance with the
 * License. Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California. All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)nfs_bio.c	8.9 (Berkeley) 3/30/95
 * FreeBSD-Id: nfs_bio.c,v 1.44 1997/09/10 19:52:25 phk Exp $
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/ubc.h>

#include <sys/vm.h>
#include <sys/vmparam.h>

#include <sys/time.h>
#include <kern/clock.h>

#include <nfs/rpcv2.h>
#include <nfs/nfsproto.h>
#include <nfs/nfs.h>
#include <nfs/nfsmount.h>
#include <nfs/nqnfs.h>
#include <nfs/nfsnode.h>

#include <sys/kdebug.h>

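/*
 * FSDBG, FSDBG_TOP and FSDBG_BOT (below) wrap KERNEL_DEBUG trace points
 * in the DBG_FSRW class: a plain event, the start of an operation, and
 * its completion, respectively, each carrying four values.
 */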
#define FSDBG(A, B, C, D, E) \
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_NONE, \
		     (int)(B), (int)(C), (int)(D), (int)(E), 0)
#define FSDBG_TOP(A, B, C, D, E) \
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_START, \
		     (int)(B), (int)(C), (int)(D), (int)(E), 0)
#define FSDBG_BOT(A, B, C, D, E) \
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_END, \
		     (int)(B), (int)(C), (int)(D), (int)(E), 0)

static struct buf *nfs_getcacheblk __P((struct vnode *vp, daddr_t bn, int size,
					struct proc *p, int operation));

extern int nfs_numasync;
extern struct nfsstats nfsstats;
extern int nbdwrite;

/*
 * Vnode op for read using bio
 * Any similarity to readip() is purely coincidental
 */
int
nfs_bioread(vp, uio, ioflag, cred, getpages)
	register struct vnode *vp;
	register struct uio *uio;
	int ioflag;
	struct ucred *cred;
	int getpages;
{
	register struct nfsnode *np = VTONFS(vp);
	register int biosize, diff, i;
	struct buf *bp = 0, *rabp;
	struct vattr vattr;
	struct proc *p;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	daddr_t lbn, rabn;
	int bufsize;
	int nra, error = 0, n = 0, on = 0, not_readin;
	int operation = (getpages ? BLK_PAGEIN : BLK_READ);

#if DIAGNOSTIC
	if (uio->uio_rw != UIO_READ)
		panic("nfs_read mode");
#endif
	if (uio->uio_resid == 0)
		return (0);
	if (uio->uio_offset < 0)
		return (EINVAL);
	p = uio->uio_procp;
	if ((nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_GOTFSINFO)) == NFSMNT_NFSV3)
		(void)nfs_fsinfo(nmp, vp, cred, p);
	/* due to getblk/vm interactions, use the VM page size or smaller */
	biosize = min(vp->v_mount->mnt_stat.f_iosize, PAGE_SIZE);
	/*
	 * For nfs, cache consistency can only be maintained approximately.
	 * Although RFC1094 does not specify the criteria, the following is
	 * believed to be compatible with the reference port.
	 * For nqnfs, full cache consistency is maintained within the loop.
	 * For nfs:
	 * If the file's modify time on the server has changed since the
	 * last read rpc or you have written to the file,
	 * you may have lost data cache consistency with the
	 * server, so flush all of the file's data out of the cache.
	 * Then force a getattr rpc to ensure that you have up to date
	 * attributes.
	 * NB: This implies that cache data can be read when up to
	 * NFS_ATTRTIMEO seconds out of date. If you find that you need current
	 * attributes this could be forced by setting n_attrstamp to 0 before
	 * the VOP_GETATTR() call.
	 */
	if ((nmp->nm_flag & NFSMNT_NQNFS) == 0) {
		if (np->n_flag & NMODIFIED) {
			if (vp->v_type != VREG) {
				if (vp->v_type != VDIR)
					panic("nfs: bioread, not dir");
				nfs_invaldir(vp);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
			}
			np->n_attrstamp = 0;
			error = VOP_GETATTR(vp, &vattr, cred, p);
			if (error)
				return (error);
			np->n_mtime = vattr.va_mtime.tv_sec;
		} else {
			error = VOP_GETATTR(vp, &vattr, cred, p);
			if (error)
				return (error);
			if (np->n_mtime != vattr.va_mtime.tv_sec) {
				if (vp->v_type == VDIR)
					nfs_invaldir(vp);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
				np->n_mtime = vattr.va_mtime.tv_sec;
			}
		}
	}
	do {

		/*
		 * Get a valid lease. If cached data is stale, flush it.
		 */
		if (nmp->nm_flag & NFSMNT_NQNFS) {
			if (NQNFS_CKINVALID(vp, np, ND_READ)) {
				do {
					error = nqnfs_getlease(vp, ND_READ, cred, p);
				} while (error == NQNFS_EXPIRED);
				if (error)
					return (error);
				if (np->n_lrev != np->n_brev ||
				    (np->n_flag & NQNFSNONCACHE) ||
				    ((np->n_flag & NMODIFIED) && vp->v_type == VDIR)) {
					if (vp->v_type == VDIR)
						nfs_invaldir(vp);
					error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
					if (error)
						return (error);
					np->n_brev = np->n_lrev;
				}
			} else if (vp->v_type == VDIR && (np->n_flag & NMODIFIED)) {
				nfs_invaldir(vp);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
			}
		}
		if (np->n_flag & NQNFSNONCACHE) {
			switch (vp->v_type) {
			case VREG:
				return (nfs_readrpc(vp, uio, cred));
			case VLNK:
				return (nfs_readlinkrpc(vp, uio, cred));
			case VDIR:
				break;
			default:
				printf(" NQNFSNONCACHE: type %x unexpected\n",
					vp->v_type);
			};
		}
		switch (vp->v_type) {
		case VREG:
			nfsstats.biocache_reads++;
			lbn = uio->uio_offset / biosize;
			on = uio->uio_offset & (biosize - 1);
			not_readin = 1;

			/*
			 * Start the read ahead(s), as required.
			 */
			if (nfs_numasync > 0 && nmp->nm_readahead > 0) {
				for (nra = 0; nra < nmp->nm_readahead &&
				     (off_t)(lbn + 1 + nra) * biosize < np->n_size;
				     nra++) {
					rabn = lbn + 1 + nra;
					if (!incore(vp, rabn)) {
						rabp = nfs_getcacheblk(vp, rabn, biosize, p, operation);
						if (!rabp)
							return (EINTR);
						if (!ISSET(rabp->b_flags, (B_CACHE|B_DELWRI))) {
							SET(rabp->b_flags, (B_READ | B_ASYNC));
							if (nfs_asyncio(rabp, cred)) {
								SET(rabp->b_flags, (B_INVAL|B_ERROR));
								rabp->b_error = EIO;
								brelse(rabp);
							}
						} else
							brelse(rabp);
					}
				}
			}

			/*
			 * If the block is in the cache and has the required data
			 * in a valid region, just copy it out.
			 * Otherwise, get the block and write back/read in,
			 * as required.
			 */
again:
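			/*
			 * For the block containing EOF, shrink the buffer so
			 * it covers only the remainder of the file, rounded
			 * up to a DEV_BSIZE boundary.
			 */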
			bufsize = biosize;
			if ((off_t)(lbn + 1) * biosize > np->n_size &&
			    (off_t)(lbn + 1) * biosize - np->n_size < biosize) {
				bufsize = np->n_size - lbn * biosize;
				bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
			}
			bp = nfs_getcacheblk(vp, lbn, bufsize, p, operation);
			if (!bp)
				return (EINTR);

			if (!ISSET(bp->b_flags, B_CACHE)) {
				SET(bp->b_flags, B_READ);
				CLR(bp->b_flags, (B_DONE | B_ERROR | B_INVAL));
				not_readin = 0;
				error = nfs_doio(bp, cred, p);
				if (error) {
					brelse(bp);
					return (error);
				}
			}
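			/*
			 * Clamp the copy: n is the byte count to move from
			 * offset 'on' in this block, limited by the buffer
			 * size, the caller's residual count, and EOF.
			 */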
			if (bufsize > on) {
				n = min((unsigned)(bufsize - on), uio->uio_resid);
			} else {
				n = 0;
			}
			diff = np->n_size - uio->uio_offset;
			if (diff < n)
				n = diff;
			if (not_readin && n > 0) {
				if (on < bp->b_validoff || (on + n) > bp->b_validend) {
					SET(bp->b_flags, (B_NOCACHE|B_INVAFTERWRITE));
					if (bp->b_dirtyend > 0) {
						if (!ISSET(bp->b_flags, B_DELWRI))
							panic("nfsbioread");
						if (VOP_BWRITE(bp) == EINTR)
							return (EINTR);
					} else
						brelse(bp);
					goto again;
				}
			}
			vp->v_lastr = lbn;
			diff = (on >= bp->b_validend) ? 0 : (bp->b_validend - on);
			if (diff < n)
				n = diff;
			break;
		case VLNK:
			nfsstats.biocache_readlinks++;
			bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, p, operation);
			if (!bp)
				return (EINTR);
			if (!ISSET(bp->b_flags, B_CACHE)) {
				SET(bp->b_flags, B_READ);
				error = nfs_doio(bp, cred, p);
				if (error) {
					SET(bp->b_flags, B_ERROR);
					brelse(bp);
					return (error);
				}
			}
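			/*
			 * b_resid is however much of the link buffer the
			 * readlink RPC left unfilled, so the usable length
			 * is NFS_MAXPATHLEN - b_resid.
			 */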
			n = min(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid);
			on = 0;
			break;
		case VDIR:
			nfsstats.biocache_readdirs++;
			if (np->n_direofoffset
			    && uio->uio_offset >= np->n_direofoffset) {
				return (0);
			}
			lbn = uio->uio_offset / NFS_DIRBLKSIZ;
			on = uio->uio_offset & (NFS_DIRBLKSIZ - 1);
			bp = nfs_getcacheblk(vp, lbn, NFS_DIRBLKSIZ, p, operation);
			if (!bp)
				return (EINTR);
			if (!ISSET(bp->b_flags, B_CACHE)) {
				SET(bp->b_flags, B_READ);
				error = nfs_doio(bp, cred, p);
				if (error) {
					brelse(bp);
				}
				while (error == NFSERR_BAD_COOKIE) {
					nfs_invaldir(vp);
					error = nfs_vinvalbuf(vp, 0, cred, p, 1);
					/*
					 * Yuck! The directory has been modified on the
					 * server. The only way to get the block is by
					 * reading from the beginning to get all the
					 * offset cookies.
					 */
					for (i = 0; i <= lbn && !error; i++) {
						if (np->n_direofoffset
						    && (i * NFS_DIRBLKSIZ) >= np->n_direofoffset)
							return (0);
						bp = nfs_getcacheblk(vp, i, NFS_DIRBLKSIZ, p,
								     operation);
						if (!bp)
							return (EINTR);
						if (!ISSET(bp->b_flags, B_CACHE)) {
							SET(bp->b_flags, B_READ);
							error = nfs_doio(bp, cred, p);
							/*
							 * no error + B_INVAL == directory EOF,
							 * use the block.
							 */
							if (error == 0 && (bp->b_flags & B_INVAL))
								break;
						}
						/*
						 * An error will throw away the block and the
						 * for loop will break out. If no error and this
						 * is not the block we want, we throw away the
						 * block and go for the next one via the for loop.
						 */
						if (error || i < lbn)
							brelse(bp);
					}
				}
				/*
				 * The above while is repeated if we hit another cookie
				 * error. If we hit an error and it wasn't a cookie error,
				 * we give up.
				 */
				if (error)
					return (error);
			}

			/*
			 * If not eof and read aheads are enabled, start one.
			 * (You need the current block first, so that you have the
			 * directory offset cookie of the next block.)
			 */
			if (nfs_numasync > 0 && nmp->nm_readahead > 0 &&
			    (np->n_direofoffset == 0 ||
			     (lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) &&
			    !(np->n_flag & NQNFSNONCACHE) &&
			    !incore(vp, lbn + 1)) {
				rabp = nfs_getcacheblk(vp, lbn + 1, NFS_DIRBLKSIZ, p,
						       operation);
				if (rabp) {
					if (!ISSET(rabp->b_flags, (B_CACHE|B_DELWRI))) {
						SET(rabp->b_flags, (B_READ | B_ASYNC));
						if (nfs_asyncio(rabp, cred)) {
							SET(rabp->b_flags, (B_INVAL|B_ERROR));
							rabp->b_error = EIO;
							brelse(rabp);
						}
					} else {
						brelse(rabp);
					}
				}
			}
			/*
			 * Make sure we use a signed variant of min() since
			 * the second term may be negative.
			 */
			n = lmin(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid - on);
			/*
			 * Unlike VREG files, whose buffer size (bp->b_bcount) is
			 * chopped for the EOF condition, we cannot tell how large
			 * NFS directories are going to be until we hit EOF. So
			 * an NFS directory buffer is *not* chopped to its EOF. Now,
			 * it just so happens that b_resid will effectively chop it
			 * to EOF. *BUT* this information is lost if the buffer goes
			 * away and is reconstituted into a B_CACHE state (recovered
			 * from VM) later. So we keep track of the directory eof
			 * in np->n_direofoffset and chop it off as an extra step
			 * right here.
			 */
			if (np->n_direofoffset &&
			    n > np->n_direofoffset - uio->uio_offset)
				n = np->n_direofoffset - uio->uio_offset;
			break;
		default:
			printf(" nfs_bioread: type %x unexpected\n", vp->v_type);
			break;
		};

		if (n > 0) {
			error = uiomove(bp->b_data + on, (int)n, uio);
		}
		switch (vp->v_type) {
		case VREG:
			break;
		case VLNK:
			n = 0;
			break;
		case VDIR:
			if (np->n_flag & NQNFSNONCACHE)
				SET(bp->b_flags, B_INVAL);
			break;
		default:
			printf(" nfs_bioread: type %x unexpected\n", vp->v_type);
		}
		brelse(bp);
	} while (error == 0 && uio->uio_resid > 0 && n > 0);
	return (error);
}


/*
 * Vnode op for write using bio
 */
int
nfs_write(ap)
	struct vop_write_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		struct ucred *a_cred;
	} */ *ap;
{
	register int biosize;
	register struct uio *uio = ap->a_uio;
	struct proc *p = uio->uio_procp;
	register struct vnode *vp = ap->a_vp;
	struct nfsnode *np = VTONFS(vp);
	register struct ucred *cred = ap->a_cred;
	int ioflag = ap->a_ioflag;
	struct buf *bp;
	struct vattr vattr;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	daddr_t lbn;
	int bufsize;
	int n, on, error = 0, iomode, must_commit;
	off_t boff;
	struct iovec iov;
	struct uio auio;

#if DIAGNOSTIC
	if (uio->uio_rw != UIO_WRITE)
		panic("nfs_write mode");
	if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != current_proc())
		panic("nfs_write proc");
#endif
	if (vp->v_type != VREG)
		return (EIO);
	if (np->n_flag & NWRITEERR) {
		np->n_flag &= ~NWRITEERR;
		return (np->n_error);
	}
	if ((nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_GOTFSINFO)) == NFSMNT_NFSV3)
		(void)nfs_fsinfo(nmp, vp, cred, p);
	if (ioflag & (IO_APPEND | IO_SYNC)) {
		if (np->n_flag & NMODIFIED) {
			np->n_attrstamp = 0;
			error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
			if (error)
				return (error);
		}
		if (ioflag & IO_APPEND) {
			np->n_attrstamp = 0;
			error = VOP_GETATTR(vp, &vattr, cred, p);
			if (error)
				return (error);
			uio->uio_offset = np->n_size;
		}
	}
	if (uio->uio_offset < 0)
		return (EINVAL);
	if (uio->uio_resid == 0)
		return (0);
	/*
	 * Maybe this should be above the vnode op call, but so long as
	 * file servers have no limits, I don't think it matters.
	 */
	if (p && uio->uio_offset + uio->uio_resid >
	    p->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
		psignal(p, SIGXFSZ);
		return (EFBIG);
	}
	/*
	 * I use nm_rsize, not nm_wsize so that all buffer cache blocks
	 * will be the same size within a filesystem. nfs_writerpc will
	 * still use nm_wsize when sizing the rpc's.
	 */
	/* due to getblk/vm interactions, use the VM page size or smaller */
	biosize = min(vp->v_mount->mnt_stat.f_iosize, PAGE_SIZE);

	do {
		/*
		 * Check for a valid write lease.
		 */
		if ((nmp->nm_flag & NFSMNT_NQNFS) &&
		    NQNFS_CKINVALID(vp, np, ND_WRITE)) {
			do {
				error = nqnfs_getlease(vp, ND_WRITE, cred, p);
			} while (error == NQNFS_EXPIRED);
			if (error)
				return (error);
			if (np->n_lrev != np->n_brev ||
			    (np->n_flag & NQNFSNONCACHE)) {
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
				np->n_brev = np->n_lrev;
			}
		}
		if ((np->n_flag & NQNFSNONCACHE) && uio->uio_iovcnt == 1) {
			iomode = NFSV3WRITE_FILESYNC;
			error = nfs_writerpc(vp, uio, cred, &iomode, &must_commit);
			if (must_commit)
				nfs_clearcommit(vp->v_mount);
			return (error);
		}
		nfsstats.biocache_writes++;
		lbn = uio->uio_offset / biosize;
		on = uio->uio_offset & (biosize-1);
		n = min((unsigned)(biosize - on), uio->uio_resid);
again:
		bufsize = biosize;
#if 0
/* (removed for UBC) */
		if ((lbn + 1) * biosize > np->n_size) {
			bufsize = np->n_size - lbn * biosize;
			bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
		}
#endif
		/*
		 * Get a cache block for writing. The range to be written is
		 * (off..off+len) within the block. We ensure that the block
		 * either has no dirty region or that the given range is
		 * contiguous with the existing dirty region.
		 */
		bp = nfs_getcacheblk(vp, lbn, bufsize, p, BLK_WRITE);
		if (!bp)
			return (EINTR);
		/*
		 * Resize nfsnode *after* we busy the buffer to prevent
		 * readers from reading garbage.
		 * If there was a partial buf at the old eof, validate
		 * and zero the new bytes.
		 */
		if (uio->uio_offset + n > np->n_size) {
			struct buf *bp0 = NULL;
			daddr_t bn = np->n_size / biosize;
			int off = np->n_size & (biosize - 1);

			if (off && bn < lbn && incore(vp, bn))
				bp0 = nfs_getcacheblk(vp, bn, biosize, p,
						      BLK_WRITE);
			np->n_flag |= NMODIFIED;
			np->n_size = uio->uio_offset + n;
			ubc_setsize(vp, (off_t)np->n_size); /* XXX errors */
			if (bp0) {
				bzero((char *)bp0->b_data + off, biosize - off);
				bp0->b_validend = biosize;
				brelse(bp0);
			}
		}
		/*
		 * NFS has embedded ucred so crhold() risks zone corruption
		 */
		if (bp->b_wcred == NOCRED)
			bp->b_wcred = crdup(cred);
		/*
		 * If dirtyend exceeds file size, chop it down. This should
		 * not occur unless there is a race.
		 */
		if ((off_t)bp->b_blkno * DEV_BSIZE + bp->b_dirtyend >
		    np->n_size)
			bp->b_dirtyend = np->n_size - (off_t)bp->b_blkno *
			    DEV_BSIZE;
		/*
		 * UBC doesn't (yet) handle partial pages so nfs_biowrite was
		 * hacked to never bdwrite, to start every little write right
		 * away. Running IE, Avie noticed the performance problem, thus
		 * this code, which permits those delayed writes by ensuring an
		 * initial read of the entire page. The read may hit eof
		 * ("short read") but we will handle that.
		 *
		 * We are quite dependent on the correctness of B_CACHE so check
		 * that first in case of problems.
		 */
		if (!ISSET(bp->b_flags, B_CACHE) && n < PAGE_SIZE) {
			boff = (off_t)bp->b_blkno * DEV_BSIZE;
			auio.uio_iov = &iov;
			auio.uio_iovcnt = 1;
			auio.uio_offset = boff;
			auio.uio_resid = PAGE_SIZE;
			auio.uio_segflg = UIO_SYSSPACE;
			auio.uio_rw = UIO_READ;
			auio.uio_procp = p;
			iov.iov_base = bp->b_data;
			iov.iov_len = PAGE_SIZE;
			error = nfs_readrpc(vp, &auio, cred);
			if (error) {
				bp->b_error = error;
				SET(bp->b_flags, B_ERROR);
				printf("nfs_write: readrpc %d", error);
			}
			if (auio.uio_resid > 0)
				bzero(iov.iov_base, auio.uio_resid);
			bp->b_validoff = 0;
			bp->b_validend = PAGE_SIZE - auio.uio_resid;
			if (np->n_size > boff + bp->b_validend)
				bp->b_validend = min(np->n_size - boff,
						     PAGE_SIZE);
			bp->b_dirtyoff = 0;
			bp->b_dirtyend = 0;
		}

		/*
		 * If the new write will leave a contiguous dirty
		 * area, just update the b_dirtyoff and b_dirtyend,
		 * otherwise try to extend the dirty region.
		 */
		if (bp->b_dirtyend > 0 &&
		    (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) {
			off_t start, end;

			boff = (off_t)bp->b_blkno * DEV_BSIZE;
			if (on > bp->b_dirtyend) {
				start = boff + bp->b_validend;
				end = boff + on;
			} else {
				start = boff + on + n;
				end = boff + bp->b_validoff;
			}

			/*
			 * It may be that the valid region in the buffer
			 * covers the region we want, in which case just
			 * extend the dirty region. Otherwise we try to
			 * extend the valid region.
			 */
			if (end > start) {
				auio.uio_iov = &iov;
				auio.uio_iovcnt = 1;
				auio.uio_offset = start;
				auio.uio_resid = end - start;
				auio.uio_segflg = UIO_SYSSPACE;
				auio.uio_rw = UIO_READ;
				auio.uio_procp = p;
				iov.iov_base = bp->b_data + (start - boff);
				iov.iov_len = end - start;
				error = nfs_readrpc(vp, &auio, cred);
				/*
				 * If we couldn't read, do not do a VOP_BWRITE
				 * as originally coded. That could also error
				 * and, by looping back to "again" as it was
				 * doing, could have us stuck trying to write
				 * the same buf again. nfs_write will get the
				 * entire region if nfs_readrpc succeeded. If
				 * unsuccessful we should just error out.
				 * Errors like ESTALE would keep us looping,
				 * rather than transient errors justifying a
				 * retry. We can return here instead of
				 * altering the dirty region later. We have not
				 * written the old dirty region at this point.
				 */
				if (error) {
					bp->b_error = error;
					SET(bp->b_flags, B_ERROR);
					printf("nfs_write: readrpc2 %d", error);
					brelse(bp);
					return (error);
				}
				/*
				 * The read worked.
				 * If there was a short read, just zero fill.
				 */
				if (auio.uio_resid > 0)
					bzero(iov.iov_base, auio.uio_resid);
				if (on > bp->b_dirtyend)
					bp->b_validend = on;
				else
					bp->b_validoff = on + n;
			}
			/*
			 * We now have a valid region which extends up to the
			 * dirty region which we want.
			 */
			if (on > bp->b_dirtyend)
				bp->b_dirtyend = on;
			else
				bp->b_dirtyoff = on + n;
		}
		if (ISSET(bp->b_flags, B_ERROR)) {
			error = bp->b_error;
			brelse(bp);
			return (error);
		}
		/*
		 * NFS has embedded ucred so crhold() risks zone corruption
		 */
		if (bp->b_wcred == NOCRED)
			bp->b_wcred = crdup(cred);
		np->n_flag |= NMODIFIED;

		/*
		 * Check for valid write lease and get one as required.
		 * In case getblk() and/or bwrite() delayed us.
		 */
		if ((nmp->nm_flag & NFSMNT_NQNFS) &&
		    NQNFS_CKINVALID(vp, np, ND_WRITE)) {
			do {
				error = nqnfs_getlease(vp, ND_WRITE, cred, p);
			} while (error == NQNFS_EXPIRED);
			if (error) {
				brelse(bp);
				return (error);
			}
			if (np->n_lrev != np->n_brev ||
			    (np->n_flag & NQNFSNONCACHE)) {
				brelse(bp);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
				np->n_brev = np->n_lrev;
				goto again;
			}
		}
		error = uiomove((char *)bp->b_data + on, n, uio);
		if (error) {
			SET(bp->b_flags, B_ERROR);
			brelse(bp);
			return (error);
		}
		if (bp->b_dirtyend > 0) {
			bp->b_dirtyoff = min(on, bp->b_dirtyoff);
			bp->b_dirtyend = max((on + n), bp->b_dirtyend);
		} else {
			bp->b_dirtyoff = on;
			bp->b_dirtyend = on + n;
		}
		if (bp->b_validend == 0 || bp->b_validend < bp->b_dirtyoff ||
		    bp->b_validoff > bp->b_dirtyend) {
			bp->b_validoff = bp->b_dirtyoff;
			bp->b_validend = bp->b_dirtyend;
		} else {
			bp->b_validoff = min(bp->b_validoff, bp->b_dirtyoff);
			bp->b_validend = max(bp->b_validend, bp->b_dirtyend);
		}

		/*
		 * Since this block is being modified, it must be written
		 * again and not just committed.
		 */
		CLR(bp->b_flags, B_NEEDCOMMIT);

		/*
		 * If the lease is non-cachable or IO_SYNC do bwrite().
		 */
		if ((np->n_flag & NQNFSNONCACHE) || (ioflag & IO_SYNC)) {
			bp->b_proc = p;
			error = VOP_BWRITE(bp);
			if (error)
				return (error);
			if (np->n_flag & NQNFSNONCACHE) {
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
			}
		} else if ((n + on) == biosize &&
		    (nmp->nm_flag & NFSMNT_NQNFS) == 0) {
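			/*
			 * The write ends exactly at a block boundary (the
			 * usual case for sequential full-block writes), so
			 * push the block to the server asynchronously now;
			 * partial blocks fall through to bdwrite() below.
			 */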
			bp->b_proc = (struct proc *)0;
			SET(bp->b_flags, B_ASYNC);
			(void)nfs_writebp(bp, 0);
		} else
			bdwrite(bp);
	} while (uio->uio_resid > 0 && n > 0);
	return (0);
}


/*
 * Get an nfs cache block.
 * Allocate a new one if the block isn't currently in the cache
 * and return the block marked busy. If the calling process is
 * interrupted by a signal for an interruptible mount point, return
 * NULL.
 */
static struct buf *
nfs_getcacheblk(vp, bn, size, p, operation)
	struct vnode *vp;
	daddr_t bn;
	int size;
	struct proc *p;
	int operation;	/* defined in sys/buf.h */
{
	register struct buf *bp;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	/* due to getblk/vm interactions, use the VM page size or smaller */
	int biosize = min(vp->v_mount->mnt_stat.f_iosize, PAGE_SIZE);

	if (nbdwrite > ((nbuf/4)*3) && operation == BLK_WRITE) {
#define __BUFFERS_RECLAIMED 2
		struct buf *tbp[__BUFFERS_RECLAIMED];
		int i;

		/* too many delayed writes, try to free up some buffers */
		for (i = 0; i < __BUFFERS_RECLAIMED; i++)
			tbp[i] = geteblk(512);

		/* Yield to IO thread */
		(void)tsleep((caddr_t)&nbdwrite, PCATCH, "nbdwrite", 1);

		for (i = (__BUFFERS_RECLAIMED - 1); i >= 0; i--)
			brelse(tbp[i]);
	}

	if (nmp->nm_flag & NFSMNT_INT) {
		bp = getblk(vp, bn, size, PCATCH, 0, operation);
		while (bp == (struct buf *)0) {
			if (nfs_sigintr(nmp, (struct nfsreq *)0, p))
				return ((struct buf *)0);
			bp = getblk(vp, bn, size, 0, 2 * hz, operation);
		}
	} else
		bp = getblk(vp, bn, size, 0, 0, operation);

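	/*
	 * For regular files, record the buffer's position in the file as a
	 * DEV_BSIZE-unit block number; the read/write paths and nfs_doio()
	 * derive byte offsets from b_blkno * DEV_BSIZE.
	 */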
	if (vp->v_type == VREG)
		bp->b_blkno = (bn * biosize) / DEV_BSIZE;

	return (bp);
}

/*
 * Flush and invalidate all dirty buffers. If another process is already
 * doing the flush, just wait for completion.
 */
int
nfs_vinvalbuf(vp, flags, cred, p, intrflg)
	struct vnode *vp;
	int flags;
	struct ucred *cred;
	struct proc *p;
	int intrflg;
{
	register struct nfsnode *np = VTONFS(vp);
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	int error = 0, slpflag, slptimeo;
	int didhold = 0;

	if ((nmp->nm_flag & NFSMNT_INT) == 0)
		intrflg = 0;
	if (intrflg) {
		slpflag = PCATCH;
		slptimeo = 2 * hz;
	} else {
		slpflag = 0;
		slptimeo = 0;
	}
	/*
	 * First wait for any other process doing a flush to complete.
	 */
	while (np->n_flag & NFLUSHINPROG) {
		np->n_flag |= NFLUSHWANT;
		error = tsleep((caddr_t)&np->n_flag, PRIBIO + 2, "nfsvinval",
			       slptimeo);
		if (error && intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p))
			return (EINTR);
	}

	/*
	 * Now, flush as required.
	 */
	np->n_flag |= NFLUSHINPROG;
	error = vinvalbuf(vp, flags, cred, p, slpflag, 0);
	while (error) {
		/*
		 * We seem to be stuck in a loop here if the thread got
		 * aborted: nfs_flush will return EINTR. Not sure if that
		 * will cause other consequences due to EINTR having other
		 * meanings in NFS. With no dirty pages, it seems safe to
		 * just return from here. But if we did have dirty pages,
		 * how would we get them written out if the thread was
		 * aborted? Some other strategy is necessary. -- EKN
		 */
		if ((intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p)) ||
		    (error == EINTR && current_thread_aborted())) {
			np->n_flag &= ~NFLUSHINPROG;
			if (np->n_flag & NFLUSHWANT) {
				np->n_flag &= ~NFLUSHWANT;
				wakeup((caddr_t)&np->n_flag);
			}
			return (EINTR);
		}
		error = vinvalbuf(vp, flags, cred, p, 0, slptimeo);
	}
	np->n_flag &= ~(NMODIFIED | NFLUSHINPROG);
	if (np->n_flag & NFLUSHWANT) {
		np->n_flag &= ~NFLUSHWANT;
		wakeup((caddr_t)&np->n_flag);
	}
	didhold = ubc_hold(vp);
	if (didhold) {
		(void) ubc_clean(vp, 1); /* get the pages out of vm also */
		ubc_rele(vp);
	}
	return (0);
}

/*
 * Initiate asynchronous I/O. Return an error if no nfsiods are available.
 * This is mainly to avoid queueing async I/O requests when the nfsiods
 * are all hung on a dead server.
 */
int
nfs_asyncio(bp, cred)
	register struct buf *bp;
	struct ucred *cred;
{
	struct nfsmount *nmp;
	int i;
	int gotiod;
	int slpflag = 0;
	int slptimeo = 0;
	int error;

	if (nfs_numasync == 0)
		return (EIO);

	nmp = VFSTONFS(bp->b_vp->v_mount);
again:
	if (nmp->nm_flag & NFSMNT_INT)
		slpflag = PCATCH;
	gotiod = FALSE;

	/*
	 * Find a free iod to process this request.
	 */
	for (i = 0; i < NFS_MAXASYNCDAEMON; i++)
		if (nfs_iodwant[i]) {
			/*
			 * Found one, so wake it up and tell it which
			 * mount to process.
			 */
			NFS_DPF(ASYNCIO,
				("nfs_asyncio: waking iod %d for mount %p\n",
				 i, nmp));
			nfs_iodwant[i] = (struct proc *)0;
			nfs_iodmount[i] = nmp;
			nmp->nm_bufqiods++;
			wakeup((caddr_t)&nfs_iodwant[i]);
			gotiod = TRUE;
			break;
		}

	/*
	 * If none are free, we may already have an iod working on this mount
	 * point. If so, it will process our request.
	 */
	if (!gotiod) {
		if (nmp->nm_bufqiods > 0) {
			NFS_DPF(ASYNCIO,
				("nfs_asyncio: %d iods are already processing mount %p\n",
				 nmp->nm_bufqiods, nmp));
			gotiod = TRUE;
		}
	}

	/*
	 * If we have an iod which can process the request, then queue
	 * the buffer.
	 */
	if (gotiod) {
		/*
		 * Ensure that the queue never grows too large.
		 */
		while (nmp->nm_bufqlen >= 2*nfs_numasync) {
			NFS_DPF(ASYNCIO,
				("nfs_asyncio: waiting for mount %p queue to drain\n", nmp));
			nmp->nm_bufqwant = TRUE;
			error = tsleep(&nmp->nm_bufq, slpflag | PRIBIO,
				       "nfsaio", slptimeo);
			if (error) {
				if (nfs_sigintr(nmp, NULL, bp->b_proc))
					return (EINTR);
				if (slpflag == PCATCH) {
					slpflag = 0;
					slptimeo = 2 * hz;
				}
			}
			/*
			 * We might have lost our iod while sleeping,
			 * so check and loop if necessary.
			 */
			if (nmp->nm_bufqiods == 0) {
				NFS_DPF(ASYNCIO,
					("nfs_asyncio: no iods after mount %p queue was drained, looping\n", nmp));
				goto again;
			}
		}

		if (ISSET(bp->b_flags, B_READ)) {
			if (bp->b_rcred == NOCRED && cred != NOCRED) {
				/*
				 * NFS has embedded ucred.
				 * Can not crhold() here as that causes zone corruption
				 */
				bp->b_rcred = crdup(cred);
			}
		} else {
			SET(bp->b_flags, B_WRITEINPROG);
			if (bp->b_wcred == NOCRED && cred != NOCRED) {
				/*
				 * NFS has embedded ucred.
				 * Can not crhold() here as that causes zone corruption
				 */
				bp->b_wcred = crdup(cred);
			}
		}

		TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, b_freelist);
		nmp->nm_bufqlen++;
		return (0);
	}

	/*
	 * All the iods are busy on other mounts, so return EIO to
	 * force the caller to process the i/o synchronously.
	 */
	NFS_DPF(ASYNCIO, ("nfs_asyncio: no iods available, i/o is synchronous\n"));
	return (EIO);
}

/*
 * Do an I/O operation to/from a cache block. This may be called
 * synchronously or from an nfsiod.
 */
int
nfs_doio(bp, cr, p)
	register struct buf *bp;
	struct ucred *cr;
	struct proc *p;
{
	register struct uio *uiop;
	register struct vnode *vp;
	struct nfsnode *np;
	struct nfsmount *nmp;
	int error = 0, diff, len, iomode, must_commit = 0;
	struct uio uio;
	struct iovec io;

	vp = bp->b_vp;
	np = VTONFS(vp);
	nmp = VFSTONFS(vp->v_mount);
	uiop = &uio;
	uiop->uio_iov = &io;
	uiop->uio_iovcnt = 1;
	uiop->uio_segflg = UIO_SYSSPACE;
	uiop->uio_procp = p;

	/*
	 * With UBC, getblk() can return a buf with B_DONE set.
	 * This indicates that the VM has valid data for that page.
	 * NFS being stateless, this case poses a problem.
	 * By definition, the NFS server should always be consulted
	 * for the data in that page.
	 * So we choose to clear the B_DONE and to do the IO.
	 *
	 * XXX revisit this if there is a performance issue.
	 * XXX In that case, we could play the attribute cache games ...
	 */
	if (ISSET(bp->b_flags, B_DONE)) {
		if (!ISSET(bp->b_flags, B_ASYNC))
			panic("nfs_doio: done and not async");
		CLR(bp->b_flags, B_DONE);
	}
	FSDBG_TOP(256, np->n_size, bp->b_blkno * DEV_BSIZE, bp->b_bcount,
		  bp->b_flags);
	FSDBG(257, bp->b_validoff, bp->b_validend, bp->b_dirtyoff,
	      bp->b_dirtyend);
	/*
	 * Historically, paging was done with physio, but no more.
	 */
	if (ISSET(bp->b_flags, B_PHYS)) {
		/*
		 * ...though reading /dev/drum still gets us here.
		 */
		io.iov_len = uiop->uio_resid = bp->b_bcount;
		/* mapping was done by vmapbuf() */
		io.iov_base = bp->b_data;
		uiop->uio_offset = (off_t)bp->b_blkno * DEV_BSIZE;
		if (ISSET(bp->b_flags, B_READ)) {
			uiop->uio_rw = UIO_READ;
			nfsstats.read_physios++;
			error = nfs_readrpc(vp, uiop, cr);
		} else {
			int com;

			iomode = NFSV3WRITE_DATASYNC;
			uiop->uio_rw = UIO_WRITE;
			nfsstats.write_physios++;
			error = nfs_writerpc(vp, uiop, cr, &iomode, &com);
		}
		if (error) {
			SET(bp->b_flags, B_ERROR);
			bp->b_error = error;
		}
	} else if (ISSET(bp->b_flags, B_READ)) {
		io.iov_len = uiop->uio_resid = bp->b_bcount;
		io.iov_base = bp->b_data;
		uiop->uio_rw = UIO_READ;
		switch (vp->v_type) {
		case VREG:
			uiop->uio_offset = (off_t)bp->b_blkno * DEV_BSIZE;
			nfsstats.read_bios++;
			error = nfs_readrpc(vp, uiop, cr);
			FSDBG(262, np->n_size, bp->b_blkno * DEV_BSIZE,
			      uiop->uio_resid, error);
			if (!error) {
				bp->b_validoff = 0;
				if (uiop->uio_resid) {
					/*
					 * If len > 0, there is a hole in the file and
					 * no writes after the hole have been pushed to
					 * the server yet.
					 * Just zero fill the rest of the valid area.
					 */
					diff = bp->b_bcount - uiop->uio_resid;
					len = np->n_size - ((u_quad_t)bp->b_blkno * DEV_BSIZE +
							    diff);
					if (len > 0) {
						len = min(len, uiop->uio_resid);
						bzero((char *)bp->b_data + diff, len);
						bp->b_validend = diff + len;
						FSDBG(258, diff, len, 0, 1);
					} else
						bp->b_validend = diff;
				} else
					bp->b_validend = bp->b_bcount;

				if (bp->b_validend < bp->b_bufsize) {
					/*
					 * We're about to release a partial buffer after
					 * a read... the only way we should get here is
					 * if this buffer contains the EOF. Before
					 * releasing it, we'll zero out to the end of
					 * the buffer so that if a mmap of this page
					 * occurs, we'll see zeroes even if a ftruncate
					 * extends the file in the meantime.
					 */
					bzero((caddr_t)(bp->b_data + bp->b_validend),
					      bp->b_bufsize - bp->b_validend);
					FSDBG(258, bp->b_validend,
					      bp->b_bufsize - bp->b_validend, 0, 2);
				}
			}
			if (p && (vp->v_flag & VTEXT) &&
			    (((nmp->nm_flag & NFSMNT_NQNFS) &&
			      NQNFS_CKINVALID(vp, np, ND_READ) &&
			      np->n_lrev != np->n_brev) ||
			     (!(nmp->nm_flag & NFSMNT_NQNFS) &&
			      np->n_mtime != np->n_vattr.va_mtime.tv_sec))) {
				uprintf("Process killed due to text file modification\n");
				psignal(p, SIGKILL);
				p->p_flag |= P_NOSWAP;
			}
			break;
		case VLNK:
			uiop->uio_offset = (off_t)0;
			nfsstats.readlink_bios++;
			error = nfs_readlinkrpc(vp, uiop, cr);
			break;
		case VDIR:
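			/*
			 * Directory buffers are indexed in NFS_DIRBLKSIZ
			 * units (see nfs_bioread above), so the RPC offset
			 * is reconstructed from the logical block number.
			 */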
			nfsstats.readdir_bios++;
			uiop->uio_offset = ((u_quad_t)bp->b_lblkno) * NFS_DIRBLKSIZ;
			if (!(nmp->nm_flag & NFSMNT_NFSV3))
				nmp->nm_flag &= ~NFSMNT_RDIRPLUS;	/* dk@farm.org */
			if (nmp->nm_flag & NFSMNT_RDIRPLUS) {
				error = nfs_readdirplusrpc(vp, uiop, cr);
				if (error == NFSERR_NOTSUPP)
					nmp->nm_flag &= ~NFSMNT_RDIRPLUS;
			}
			if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0)
				error = nfs_readdirrpc(vp, uiop, cr);
			break;
		default:
			printf("nfs_doio: type %x unexpected\n", vp->v_type);
			break;
		};
		if (error) {
			SET(bp->b_flags, B_ERROR);
			bp->b_error = error;
		}
	} else {
		/*
		 * mapped I/O may have altered any bytes, so we extend
		 * the dirty zone to the valid zone. For best performance
		 * a better solution would be to save & restore page dirty bits
		 * around the uiomove which brings write-data into the buffer.
		 * Then here we'd check if the page is dirty rather than WASMAPPED.
		 * Also vnode_pager would change - if a page is clean it might
		 * still need to be written due to DELWRI.
		 */
		if (UBCINFOEXISTS(vp) && ubc_issetflags(vp, UI_WASMAPPED)) {
			bp->b_dirtyoff = min(bp->b_dirtyoff, bp->b_validoff);
			bp->b_dirtyend = max(bp->b_dirtyend, bp->b_validend);
		}
		if ((off_t)bp->b_blkno * DEV_BSIZE + bp->b_dirtyend > np->n_size)
			bp->b_dirtyend = np->n_size - (off_t)bp->b_blkno * DEV_BSIZE;

		if (bp->b_dirtyend > bp->b_dirtyoff) {
			io.iov_len = uiop->uio_resid = bp->b_dirtyend - bp->b_dirtyoff;
			uiop->uio_offset = (off_t)bp->b_blkno * DEV_BSIZE +
					   bp->b_dirtyoff;
			io.iov_base = (char *)bp->b_data + bp->b_dirtyoff;
			uiop->uio_rw = UIO_WRITE;

			nfsstats.write_bios++;
			if ((bp->b_flags & (B_ASYNC | B_NEEDCOMMIT | B_NOCACHE)) ==
			    B_ASYNC)
				iomode = NFSV3WRITE_UNSTABLE;
			else
				iomode = NFSV3WRITE_FILESYNC;
			SET(bp->b_flags, B_WRITEINPROG);
			error = nfs_writerpc(vp, uiop, cr, &iomode, &must_commit);
			if (!error && iomode == NFSV3WRITE_UNSTABLE)
				SET(bp->b_flags, B_NEEDCOMMIT);
			else
				CLR(bp->b_flags, B_NEEDCOMMIT);
			CLR(bp->b_flags, B_WRITEINPROG);
			/*
			 * For an interrupted write, the buffer is still valid
			 * and the write hasn't been pushed to the server yet,
			 * so we can't set B_ERROR and report the interruption
			 * by setting B_EINTR. For the B_ASYNC case, B_EINTR
			 * is not relevant, so the rpc attempt is essentially
			 * a noop. For the case of a V3 write rpc not being
			 * committed to stable storage, the block is still
			 * dirty and requires either a commit rpc or another
			 * write rpc with iomode == NFSV3WRITE_FILESYNC before
			 * the block is reused. This is indicated by setting
			 * the B_DELWRI and B_NEEDCOMMIT flags.
			 */
			if (error == EINTR || (!error && bp->b_flags & B_NEEDCOMMIT)) {
				int s;

				CLR(bp->b_flags, B_INVAL | B_NOCACHE);
				if (!ISSET(bp->b_flags, B_DELWRI)) {
					SET(bp->b_flags, B_DELWRI);
					nbdwrite++;
				}
				FSDBG(261, bp->b_validoff, bp->b_validend,
				      bp->b_bufsize, bp->b_bcount);
				/*
				 * Since for the B_ASYNC case, nfs_bwrite() has
				 * reassigned the buffer to the clean list, we have to
				 * reassign it back to the dirty one. Ugh.
				 */
				if (ISSET(bp->b_flags, B_ASYNC)) {
					s = splbio();
					reassignbuf(bp, vp);
					splx(s);
				} else {
					SET(bp->b_flags, B_EINTR);
				}
			} else {
				if (error) {
					SET(bp->b_flags, B_ERROR);
					bp->b_error = np->n_error = error;
					np->n_flag |= NWRITEERR;
				}
				bp->b_dirtyoff = bp->b_dirtyend = 0;

				/*
				 * validoff and validend represent the real data present
				 * in this buffer. If validoff is non-zero, then we have
				 * to invalidate the buffer and kill the page when
				 * biodone is called... the same is also true when
				 * validend doesn't extend all the way to the end of the
				 * buffer and validend doesn't equate to the current
				 * EOF... eventually we need to deal with this in a more
				 * humane way (like keeping the partial buffer without
				 * making it immediately available to the VM page cache).
				 */
				if (bp->b_validoff)
					SET(bp->b_flags, B_INVAL);
				else if (bp->b_validend < bp->b_bufsize) {
					if ((off_t)bp->b_blkno * DEV_BSIZE +
					    bp->b_validend == np->n_size) {
						bzero((caddr_t)(bp->b_data +
							bp->b_validend),
						      bp->b_bufsize - bp->b_validend);
						FSDBG(259, bp->b_validend,
						      bp->b_bufsize - bp->b_validend, 0,
						      0);
					} else
						SET(bp->b_flags, B_INVAL);
				}
			}

		} else {
			if (bp->b_validoff ||
			    (bp->b_validend < bp->b_bufsize &&
			     (off_t)bp->b_blkno * DEV_BSIZE + bp->b_validend !=
			     np->n_size)) {
				SET(bp->b_flags, B_INVAL);
			}
			if (bp->b_flags & B_INVAL) {
				FSDBG(260, bp->b_validoff, bp->b_validend,
				      bp->b_bufsize, bp->b_bcount);
			}
			bp->b_resid = 0;
			biodone(bp);
			FSDBG_BOT(256, bp->b_validoff, bp->b_validend, bp->b_bufsize,
				  np->n_size);
			return (0);
		}
	}
	bp->b_resid = uiop->uio_resid;
	if (must_commit)
		nfs_clearcommit(vp->v_mount);

	if (bp->b_flags & B_INVAL) {
		FSDBG(260, bp->b_validoff, bp->b_validend, bp->b_bufsize,
		      bp->b_bcount);
	}
	FSDBG_BOT(256, bp->b_validoff, bp->b_validend, bp->b_bcount, error);

	biodone(bp);
	return (error);
}