/* bsd/nfs/nfs_bio.c */
/*
 * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License").  You may not use this file except in compliance with the
 * License.  Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)nfs_bio.c	8.9 (Berkeley) 3/30/95
 *	FreeBSD-Id: nfs_bio.c,v 1.44 1997/09/10 19:52:25 phk Exp $
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/ubc.h>

#include <sys/vmparam.h>

#include <kern/clock.h>

#include <nfs/rpcv2.h>
#include <nfs/nfsproto.h>
#include <nfs/nfs.h>
#include <nfs/nfsmount.h>
#include <nfs/nqnfs.h>
#include <nfs/nfsnode.h>

#include <sys/kdebug.h>
static struct buf *nfs_getcacheblk __P((struct vnode *vp, daddr_t bn, int size,
					struct proc *p, int operation));
static struct buf *nfs_getwriteblk __P((struct vnode *vp, daddr_t bn,
					int size, struct proc *p,
					struct ucred *cred, int off, int len));

extern int nfs_numasync;
extern struct nfsstats nfsstats;
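
/*
 * A note on the block arithmetic used throughout this file (a reading
 * aid, assuming, as the code below does, that biosize is a power of two):
 *
 *	lbn = uio->uio_offset / biosize;	logical block number
 *	on  = uio->uio_offset & (biosize - 1);	offset within that block
 *
 * e.g. biosize = 4096 and uio_offset = 10000 give lbn = 2, on = 1808.
 */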
/*
 * Vnode op for read using bio
 * Any similarity to readip() is purely coincidental
 */
int
nfs_bioread(vp, uio, ioflag, cred, getpages)
	register struct vnode *vp;
	register struct uio *uio;
	int ioflag;
	struct ucred *cred;
	int getpages;
{
	register struct nfsnode *np = VTONFS(vp);
	register int biosize, diff, i;
	struct buf *bp = 0, *rabp;
	struct vattr vattr;
	struct proc *p;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	daddr_t lbn, rabn;
	int bufsize;
	int nra, error = 0, n = 0, on = 0, not_readin;
	int operation = (getpages ? BLK_PAGEIN : BLK_READ);

#if DIAGNOSTIC
	if (uio->uio_rw != UIO_READ)
		panic("nfs_read mode");
#endif
	if (uio->uio_resid == 0)
		return (0);
	if (uio->uio_offset < 0)
		return (EINVAL);
	p = uio->uio_procp;
	if ((nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_GOTFSINFO)) == NFSMNT_NFSV3)
		(void)nfs_fsinfo(nmp, vp, cred, p);
	/* due to getblk/vm interactions, use vm page size or less values */
	biosize = min(vp->v_mount->mnt_stat.f_iosize, PAGE_SIZE);
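	/*
	 * e.g.: with f_iosize = 8192 and PAGE_SIZE = 4096, biosize comes
	 * out at 4096, so buffer cache blocks for this mount are capped
	 * at one VM page.
	 */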
	/*
	 * For nfs, cache consistency can only be maintained approximately.
	 * Although RFC1094 does not specify the criteria, the following is
	 * believed to be compatible with the reference port.
	 * For nqnfs, full cache consistency is maintained within the loop.
	 * For nfs:
	 * If the file's modify time on the server has changed since the
	 * last read rpc or you have written to the file,
	 * you may have lost data cache consistency with the
	 * server, so flush all of the file's data out of the cache.
	 * Then force a getattr rpc to ensure that you have up to date
	 * attributes.
	 * NB: This implies that cache data can be read when up to
	 * NFS_ATTRTIMEO seconds out of date. If you find that you need
	 * current attributes this could be forced by setting n_attrstamp
	 * to 0 before the VOP_GETATTR() call.
	 */
	if ((nmp->nm_flag & NFSMNT_NQNFS) == 0) {
		if (np->n_flag & NMODIFIED) {
			if (vp->v_type != VREG) {
				if (vp->v_type != VDIR)
					panic("nfs: bioread, not dir");
				nfs_invaldir(vp);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
			}
			np->n_attrstamp = 0;
			error = VOP_GETATTR(vp, &vattr, cred, p);
			if (error)
				return (error);
			np->n_mtime = vattr.va_mtime.tv_sec;
		} else {
			error = VOP_GETATTR(vp, &vattr, cred, p);
			if (error)
				return (error);
			if (np->n_mtime != vattr.va_mtime.tv_sec) {
				if (vp->v_type == VDIR)
					nfs_invaldir(vp);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
				np->n_mtime = vattr.va_mtime.tv_sec;
			}
		}
	}
	do {
		/*
		 * Get a valid lease. If cached data is stale, flush it.
		 */
		if (nmp->nm_flag & NFSMNT_NQNFS) {
			if (NQNFS_CKINVALID(vp, np, ND_READ)) {
				do {
					error = nqnfs_getlease(vp, ND_READ, cred, p);
				} while (error == NQNFS_EXPIRED);
				if (error)
					return (error);
				if (np->n_lrev != np->n_brev ||
				    (np->n_flag & NQNFSNONCACHE) ||
				    ((np->n_flag & NMODIFIED) && vp->v_type == VDIR)) {
					if (vp->v_type == VDIR)
						nfs_invaldir(vp);
					error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
					if (error)
						return (error);
					np->n_brev = np->n_lrev;
				}
			} else if (vp->v_type == VDIR && (np->n_flag & NMODIFIED)) {
				nfs_invaldir(vp);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
			}
		}
		if (np->n_flag & NQNFSNONCACHE) {
			switch (vp->v_type) {
			case VREG:
				return (nfs_readrpc(vp, uio, cred));
			case VLNK:
				return (nfs_readlinkrpc(vp, uio, cred));
			case VDIR:
				break;
			default:
				printf(" NQNFSNONCACHE: type %x unexpected\n",
					vp->v_type);
			};
		}
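		/*
		 * With a non-cachable lease, regular file and symlink reads
		 * bypass the buffer cache entirely and go straight to the
		 * RPC layer; directory reads fall through to the cached
		 * path below.
		 */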
		switch (vp->v_type) {
		case VREG:
			nfsstats.biocache_reads++;
			lbn = uio->uio_offset / biosize;
			on = uio->uio_offset & (biosize - 1);
			not_readin = 1;

			/*
			 * Start the read ahead(s), as required.
			 */
			if (nfs_numasync > 0 && nmp->nm_readahead > 0) {
			    for (nra = 0; nra < nmp->nm_readahead &&
				(off_t)(lbn + 1 + nra) * biosize < np->n_size; nra++) {
				rabn = lbn + 1 + nra;
				if (!incore(vp, rabn)) {
				    rabp = nfs_getcacheblk(vp, rabn, biosize, p, operation);
				    if (!rabp)
					return (EINTR);
				    if (!ISSET(rabp->b_flags, (B_CACHE|B_DELWRI))) {
					SET(rabp->b_flags, (B_READ | B_ASYNC));
					if (nfs_asyncio(rabp, cred)) {
					    SET(rabp->b_flags, (B_INVAL|B_ERROR));
					    brelse(rabp);
					}
				    } else
					brelse(rabp);
				}
			    }
			}
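
			/*
			 * Read-ahead blocks that are already valid (B_CACHE)
			 * or dirty (B_DELWRI) are released untouched above:
			 * issuing a fresh read for them could clobber newer
			 * cached data.
			 */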
			/*
			 * If the block is in the cache and has the required data
			 * in a valid region, just copy it out.
			 * Otherwise, get the block and write back/read in,
			 * as required.
			 */
again:
			bufsize = biosize;
			if ((off_t)(lbn + 1) * biosize > np->n_size &&
			    (off_t)(lbn + 1) * biosize - np->n_size < biosize) {
				bufsize = np->n_size - lbn * biosize;
				bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
			}
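			/*
			 * e.g.: with DEV_BSIZE = 512, a 700 byte tail rounds
			 * up as (700 + 511) & ~511 = 1024, so the final short
			 * buffer still covers whole device blocks.
			 */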
			bp = nfs_getcacheblk(vp, lbn, bufsize, p, operation);
			if (!bp)
				return (EINTR);

			if (!ISSET(bp->b_flags, B_CACHE)) {
				SET(bp->b_flags, B_READ);
				CLR(bp->b_flags, (B_DONE | B_ERROR | B_INVAL));
				not_readin = 0;
				error = nfs_doio(bp, cred, p);
				if (error) {
					brelse(bp);
					return (error);
				}
			}

			n = min((unsigned)(bufsize - on), uio->uio_resid);
			diff = np->n_size - uio->uio_offset;
			if (diff < n)
				n = diff;
			if (not_readin && n > 0) {
				if (on < bp->b_validoff || (on + n) > bp->b_validend) {
					SET(bp->b_flags, (B_NOCACHE|B_INVAFTERWRITE));
					if (bp->b_dirtyend > 0) {
						if (!ISSET(bp->b_flags, B_DELWRI))
							panic("nfsbioread");
						if (VOP_BWRITE(bp) == EINTR)
							return (EINTR);
					} else
						brelse(bp);
					goto again;
				}
			}
			vp->v_lastr = lbn;
			diff = (on >= bp->b_validend) ? 0 : (bp->b_validend - on);
			if (diff < n)
				n = diff;
			break;
		case VLNK:
			nfsstats.biocache_readlinks++;
			bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, p, operation);
			if (!bp)
				return (EINTR);
			if (!ISSET(bp->b_flags, B_CACHE)) {
				SET(bp->b_flags, B_READ);
				error = nfs_doio(bp, cred, p);
				if (error) {
					SET(bp->b_flags, B_ERROR);
					brelse(bp);
					return (error);
				}
			}
			n = min(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid);
			on = 0;
			break;
		case VDIR:
			nfsstats.biocache_readdirs++;
			if (np->n_direofoffset
			    && uio->uio_offset >= np->n_direofoffset) {
				return (0);
			}
			lbn = uio->uio_offset / NFS_DIRBLKSIZ;
			on = uio->uio_offset & (NFS_DIRBLKSIZ - 1);
			bp = nfs_getcacheblk(vp, lbn, NFS_DIRBLKSIZ, p, operation);
			if (!bp)
				return (EINTR);
			if (!ISSET(bp->b_flags, B_CACHE)) {
			    SET(bp->b_flags, B_READ);
			    error = nfs_doio(bp, cred, p);
			    if (error)
				brelse(bp);
			    while (error == NFSERR_BAD_COOKIE) {
				nfs_invaldir(vp);
				error = nfs_vinvalbuf(vp, 0, cred, p, 1);
				/*
				 * Yuck! The directory has been modified on the
				 * server. The only way to get the block is by
				 * reading from the beginning to get all the
				 * offset cookies.
				 */
				for (i = 0; i <= lbn && !error; i++) {
				    if (np->n_direofoffset
					&& (i * NFS_DIRBLKSIZ) >= np->n_direofoffset)
					return (0);
				    bp = nfs_getcacheblk(vp, i, NFS_DIRBLKSIZ, p, operation);
				    if (!bp)
					return (EINTR);
				    if (!ISSET(bp->b_flags, B_DONE)) {
					SET(bp->b_flags, B_READ);
					error = nfs_doio(bp, cred, p);
					if (error) {
					    brelse(bp);
					} else if (i < lbn) {
					    brelse(bp);
					}
				    }
				}
			    }
			    if (error)
				return (error);
			}
			/*
			 * If not eof and read aheads are enabled, start one.
			 * (You need the current block first, so that you have the
			 *  directory offset cookie of the next block.)
			 */
			if (nfs_numasync > 0 && nmp->nm_readahead > 0 &&
			    (np->n_direofoffset == 0 ||
			    (lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) &&
			    !(np->n_flag & NQNFSNONCACHE) &&
			    !incore(vp, lbn + 1)) {
				rabp = nfs_getcacheblk(vp, lbn + 1, NFS_DIRBLKSIZ, p, operation);
				if (rabp) {
				    if (!ISSET(rabp->b_flags, (B_CACHE|B_DELWRI))) {
					SET(rabp->b_flags, (B_READ | B_ASYNC));
					if (nfs_asyncio(rabp, cred)) {
					    SET(rabp->b_flags, (B_INVAL|B_ERROR));
					    brelse(rabp);
					}
				    } else {
					brelse(rabp);
				    }
				}
			}
			/*
			 * Make sure we use a signed variant of min() since
			 * the second term may be negative.
			 */
			n = lmin(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid - on);
			break;
		default:
			printf(" nfs_bioread: type %x unexpected\n", vp->v_type);
			break;
		};

		if (n > 0)
			error = uiomove(bp->b_data + on, (int)n, uio);

		switch (vp->v_type) {
		case VREG:
			break;
		case VLNK:
			n = 0;
			break;
		case VDIR:
			if (np->n_flag & NQNFSNONCACHE)
				SET(bp->b_flags, B_INVAL);
			break;
		default:
			printf(" nfs_bioread: type %x unexpected\n", vp->v_type);
		}
		brelse(bp);
	} while (error == 0 && uio->uio_resid > 0 && n > 0);
	return (error);
}
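
/*
 * A minimal sketch of how the path above is reached (assumed from the
 * companion nfs_vnops.c, not part of this file): the read vnode op is a
 * thin wrapper that forwards to nfs_bioread() with getpages == 0.
 *
 *	static int
 *	nfs_read(ap)
 *		struct vop_read_args *ap;
 *	{
 *		return (nfs_bioread(ap->a_vp, ap->a_uio, ap->a_ioflag,
 *			ap->a_cred, 0));
 *	}
 */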
/*
 * Vnode op for write using bio
 */
int
nfs_write(ap)
	struct vop_write_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		struct ucred *a_cred;
	} */ *ap;
{
	register int biosize;
	register struct uio *uio = ap->a_uio;
	struct proc *p = uio->uio_procp;
	register struct vnode *vp = ap->a_vp;
	struct nfsnode *np = VTONFS(vp);
	register struct ucred *cred = ap->a_cred;
	int ioflag = ap->a_ioflag;
	struct buf *bp;
	struct vattr vattr;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	daddr_t lbn;
	int bufsize;
	int n, on, error = 0, iomode, must_commit;

#if DIAGNOSTIC
	if (uio->uio_rw != UIO_WRITE)
		panic("nfs_write mode");
	if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != current_proc())
		panic("nfs_write proc");
#endif
	if (vp->v_type != VREG)
		return (EIO);
	if (np->n_flag & NWRITEERR) {
		np->n_flag &= ~NWRITEERR;
		return (np->n_error);
	}
	if ((nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_GOTFSINFO)) == NFSMNT_NFSV3)
		(void)nfs_fsinfo(nmp, vp, cred, p);
	if (ioflag & (IO_APPEND | IO_SYNC)) {
		if (np->n_flag & NMODIFIED) {
			np->n_attrstamp = 0;
			error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
			if (error)
				return (error);
		}
		if (ioflag & IO_APPEND) {
			np->n_attrstamp = 0;
			error = VOP_GETATTR(vp, &vattr, cred, p);
			if (error)
				return (error);
			uio->uio_offset = np->n_size;
		}
	}
	if (uio->uio_offset < 0)
		return (EINVAL);
	if (uio->uio_resid == 0)
		return (0);
	/*
	 * Maybe this should be above the vnode op call, but so long as
	 * file servers have no limits, i don't think it matters
	 */
	if (p && uio->uio_offset + uio->uio_resid >
	      p->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
		psignal(p, SIGXFSZ);
		return (EFBIG);
	}
	/*
	 * I use nm_rsize, not nm_wsize so that all buffer cache blocks
	 * will be the same size within a filesystem. nfs_writerpc will
	 * still use nm_wsize when sizing the rpc's.
	 */
	/* due to getblk/vm interactions, use vm page size or less values */
	biosize = min(vp->v_mount->mnt_stat.f_iosize, PAGE_SIZE);
	do {
		/*
		 * Check for a valid write lease.
		 */
		if ((nmp->nm_flag & NFSMNT_NQNFS) &&
		    NQNFS_CKINVALID(vp, np, ND_WRITE)) {
			do {
				error = nqnfs_getlease(vp, ND_WRITE, cred, p);
			} while (error == NQNFS_EXPIRED);
			if (error)
				return (error);
			if (np->n_lrev != np->n_brev ||
			    (np->n_flag & NQNFSNONCACHE)) {
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
				np->n_brev = np->n_lrev;
			}
		}
		if ((np->n_flag & NQNFSNONCACHE) && uio->uio_iovcnt == 1) {
			iomode = NFSV3WRITE_FILESYNC;
			error = nfs_writerpc(vp, uio, cred, &iomode, &must_commit);
			if (must_commit)
				nfs_clearcommit(vp->v_mount);
			return (error);
		}
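		/*
		 * must_commit is set by nfs_writerpc when the server's
		 * write verifier changes (e.g. a server reboot); in that
		 * case nfs_clearcommit clears B_NEEDCOMMIT mount-wide so
		 * affected buffers get rewritten rather than merely
		 * committed.
		 */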
		nfsstats.biocache_writes++;
		lbn = uio->uio_offset / biosize;
		on = uio->uio_offset & (biosize - 1);
		n = min((unsigned)(biosize - on), uio->uio_resid);
again:
		if (uio->uio_offset + n > np->n_size) {
			np->n_size = uio->uio_offset + n;
			np->n_flag |= NMODIFIED;
			if (UBCISVALID(vp))
				ubc_setsize(vp, (off_t)np->n_size); /* XXX check error */
		}
		bufsize = biosize;
		/* (removed for UBC) */
		if ((lbn + 1) * biosize > np->n_size) {
			bufsize = np->n_size - lbn * biosize;
			bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
		}
		bp = nfs_getwriteblk(vp, lbn, bufsize, p, cred, on, n);
		if (!bp)
			return (EINTR);
		if (ISSET(bp->b_flags, B_ERROR)) {
			error = bp->b_error;
			brelse(bp);
			return (error);
		}
		if (bp->b_wcred == NOCRED) {
			crhold(cred);
			bp->b_wcred = cred;
		}
		np->n_flag |= NMODIFIED;
		/*
		 * Check for valid write lease and get one as required.
		 * In case getblk() and/or bwrite() delayed us.
		 */
		if ((nmp->nm_flag & NFSMNT_NQNFS) &&
		    NQNFS_CKINVALID(vp, np, ND_WRITE)) {
			do {
				error = nqnfs_getlease(vp, ND_WRITE, cred, p);
			} while (error == NQNFS_EXPIRED);
			if (error) {
				brelse(bp);
				return (error);
			}
			if (np->n_lrev != np->n_brev ||
			    (np->n_flag & NQNFSNONCACHE)) {
				brelse(bp);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
				np->n_brev = np->n_lrev;
				goto again;
			}
		}
		error = uiomove((char *)bp->b_data + on, n, uio);
		if (error) {
			SET(bp->b_flags, B_ERROR);
			brelse(bp);
			return (error);
		}
		if (bp->b_dirtyend > 0) {
			bp->b_dirtyoff = min(on, bp->b_dirtyoff);
			bp->b_dirtyend = max((on + n), bp->b_dirtyend);
		} else {
			bp->b_dirtyoff = on;
			bp->b_dirtyend = on + n;
		}
		if (bp->b_validend == 0 || bp->b_validend < bp->b_dirtyoff ||
		    bp->b_validoff > bp->b_dirtyend) {
			bp->b_validoff = bp->b_dirtyoff;
			bp->b_validend = bp->b_dirtyend;
		} else {
			bp->b_validoff = min(bp->b_validoff, bp->b_dirtyoff);
			bp->b_validend = max(bp->b_validend, bp->b_dirtyend);
		}
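		/*
		 * e.g.: in a buffer with valid [0..1024), a write at
		 * on = 2048, n = 512 leaves dirty [2048..2560) disjoint
		 * from the valid region, so valid is reset to the dirty
		 * range; a write at on = 512 instead keeps valid [0..1024)
		 * and leaves dirty [512..1024).
		 */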
		/*
		 * Since this block is being modified, it must be written
		 * again and not just committed.
		 */
		CLR(bp->b_flags, B_NEEDCOMMIT);

		/*
		 * If the lease is non-cachable or IO_SYNC do bwrite().
		 */
		if ((np->n_flag & NQNFSNONCACHE) || (ioflag & IO_SYNC)) {
			bp->b_proc = p;
			error = VOP_BWRITE(bp);
			if (error)
				return (error);
			if (np->n_flag & NQNFSNONCACHE) {
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
			}
		} else if ((n + on) == biosize &&
			(nmp->nm_flag & NFSMNT_NQNFS) == 0) {
			bp->b_proc = (struct proc *)0;
			SET(bp->b_flags, B_ASYNC);
			(void)nfs_writebp(bp, 0);
		} else
			bdwrite(bp);
	} while (uio->uio_resid > 0 && n > 0);
	return (0);
}
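
/*
 * The three exits of the loop above give nfs_write its strategy: a
 * synchronous bwrite for non-cachable leases and IO_SYNC, an immediate
 * asynchronous write once a full block is filled, and a delayed write
 * (bdwrite) for partial blocks.
 */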
/*
 * Get a cache block for writing.  The range to be written is
 * (off..off+len) within the block.  This routine ensures that the
 * block either has no dirty region or that the given range is
 * contiguous with the existing dirty region.
 */
static struct buf *
nfs_getwriteblk(vp, bn, size, p, cred, off, len)
	struct vnode *vp;
	daddr_t bn;
	int size;
	struct proc *p;
	struct ucred *cred;
	int off, len;
{
	struct nfsnode *np = VTONFS(vp);
	struct buf *bp;
	int error;
	struct iovec iov;
	struct uio uio;

again:
	bp = nfs_getcacheblk(vp, bn, size, p, BLK_WRITE);
	if (!bp)
		return (NULL);
	if (bp->b_wcred == NOCRED) {
		crhold(cred);
		bp->b_wcred = cred;
	}

	if ((bp->b_blkno * DEV_BSIZE) + bp->b_dirtyend > np->n_size) {
		bp->b_dirtyend = np->n_size - (bp->b_blkno * DEV_BSIZE);
	}
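	/*
	 * e.g.: if the file ends at byte 6000 and this buffer maps bytes
	 * 4096..8191, any b_dirtyend past 1904 (6000 - 4096) would
	 * describe bytes beyond EOF, so it is clamped to the file size.
	 */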
	/*
	 * UBC doesn't (yet) handle partial pages so nfs_biowrite was
	 * hacked to never bdwrite, to start every little write right away.
	 * Running IE Avie noticed the performance problem, thus this code,
	 * which permits those delayed writes by ensuring an initial read
	 * of the entire page.  The read may hit eof ("short read") but
	 * that we will handle.
	 *
	 * We are quite dependent on the correctness of B_CACHE so check
	 * that first in case of problems.
	 */
	if (!ISSET(bp->b_flags, B_CACHE) && len < PAGE_SIZE) {
		struct nfsnode *np = VTONFS(vp);
		off_t boff;

		boff = (off_t)bp->b_blkno * DEV_BSIZE;
		uio.uio_iov = &iov;
		uio.uio_iovcnt = 1;
		uio.uio_offset = boff;
		uio.uio_resid = PAGE_SIZE;
		uio.uio_segflg = UIO_SYSSPACE;
		uio.uio_rw = UIO_READ;
		uio.uio_procp = p;
		iov.iov_base = bp->b_data;
		iov.iov_len = PAGE_SIZE;
		error = nfs_readrpc(vp, &uio, cred);
		if (error) {
			SET(bp->b_flags, B_ERROR);
			printf("nfs_getwriteblk: readrpc returned %d", error);
		}
		if (uio.uio_resid > 0)
			bzero(iov.iov_base, uio.uio_resid);
		bp->b_validoff = 0;
		bp->b_validend = PAGE_SIZE - uio.uio_resid;
		if (np->n_size > boff + bp->b_validend)
			bp->b_validend = min(np->n_size - boff, PAGE_SIZE);
		bp->b_dirtyoff = bp->b_dirtyend = 0;
	}
	/*
	 * If the new write will leave a contiguous dirty
	 * area, just update the b_dirtyoff and b_dirtyend,
	 * otherwise try to extend the dirty region.
	 */
	if (bp->b_dirtyend > 0 &&
	    (off > bp->b_dirtyend || (off + len) < bp->b_dirtyoff)) {
		off_t boff, start, end;

		boff = (off_t)bp->b_blkno * DEV_BSIZE;
		if (off > bp->b_dirtyend) {
			start = boff + bp->b_validend;
			end = boff + off;
		} else {
			start = boff + off + len;
			end = boff + bp->b_validoff;
		}

		/*
		 * It may be that the valid region in the buffer
		 * covers the region we want, in which case just
		 * extend the dirty region.  Otherwise we try to
		 * extend the valid region.
		 */
		if (start < end) {
			uio.uio_iov = &iov;
			uio.uio_iovcnt = 1;
			uio.uio_offset = start;
			uio.uio_resid = end - start;
			uio.uio_segflg = UIO_SYSSPACE;
			uio.uio_rw = UIO_READ;
			uio.uio_procp = p;
			iov.iov_base = bp->b_data + (start - boff);
			iov.iov_len = end - start;
			error = nfs_readrpc(vp, &uio, cred);
			if (error) {
				/*
				 * If we couldn't read, fall back to writing
				 * out the old dirty region.
				 */
				bp->b_proc = p;
				if (VOP_BWRITE(bp) == EINTR)
					return (NULL);
				goto again;
			} else {
				if (uio.uio_resid > 0) {
					/*
					 * If there was a short read,
					 * just zero fill.
					 */
					bzero(iov.iov_base, uio.uio_resid);
				}
				if (off > bp->b_dirtyend)
					bp->b_validend = off;
				else
					bp->b_validoff = off + len;
			}
		}

		/*
		 * We now have a valid region which extends up to the
		 * dirty region which we want.
		 */
		if (off > bp->b_dirtyend)
			bp->b_dirtyend = off;
		else
			bp->b_dirtyoff = off + len;
	}

	return (bp);
}
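
/*
 * A picture of what nfs_getwriteblk guarantees (illustrative):
 *
 *	buffer:	|------------- size -------------|
 *	valid:	     [b_validoff......b_validend)
 *	dirty:	          [b_dirtyoff..b_dirtyend)
 *
 * On return, any pre-existing dirty region is contiguous with the range
 * (off..off+len) about to be written, so the two merge into a single
 * dirty extent that one write RPC can push to the server.
 */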
/*
 * Get an nfs cache block.
 * Allocate a new one if the block isn't currently in the cache
 * and return the block marked busy.  If the calling process is
 * interrupted by a signal for an interruptible mount point, return
 * NULL.
 */
static struct buf *
nfs_getcacheblk(vp, bn, size, p, operation)
	struct vnode *vp;
	daddr_t bn;
	int size;
	struct proc *p;
	int operation;	/* defined in sys/buf.h */
{
	register struct buf *bp;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	/* due to getblk/vm interactions, use vm page size or less values */
	int biosize = min(vp->v_mount->mnt_stat.f_iosize, PAGE_SIZE);

	if (nmp->nm_flag & NFSMNT_INT) {
		bp = getblk(vp, bn, size, PCATCH, 0, operation);
		while (bp == (struct buf *)0) {
			if (nfs_sigintr(nmp, (struct nfsreq *)0, p))
				return ((struct buf *)0);
			bp = getblk(vp, bn, size, 0, 2 * hz, operation);
		}
	} else
		bp = getblk(vp, bn, size, 0, 0, operation);

	if (vp->v_type == VREG)
		bp->b_blkno = (bn * biosize) / DEV_BSIZE;

	return (bp);
}
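
/*
 * e.g.: with biosize = 4096 and DEV_BSIZE = 512, logical block bn = 3
 * maps to b_blkno = 24, i.e. the buffer's position expressed in
 * 512-byte device blocks rather than in NFS blocks.
 */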
/*
 * Flush and invalidate all dirty buffers. If another process is already
 * doing the flush, just wait for completion.
 */
int
nfs_vinvalbuf(vp, flags, cred, p, intrflg)
	struct vnode *vp;
	int flags;
	struct ucred *cred;
	struct proc *p;
	int intrflg;
{
	register struct nfsnode *np = VTONFS(vp);
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	int error = 0, slpflag, slptimeo;

	if ((nmp->nm_flag & NFSMNT_INT) == 0)
		intrflg = 0;
	if (intrflg) {
		slpflag = PCATCH;
		slptimeo = 2 * hz;
	} else {
		slpflag = 0;
		slptimeo = 0;
	}
	/*
	 * First wait for any other process doing a flush to complete.
	 */
	while (np->n_flag & NFLUSHINPROG) {
		np->n_flag |= NFLUSHWANT;
		error = tsleep((caddr_t)&np->n_flag, PRIBIO + 2, "nfsvinval",
			slptimeo);
		if (error && intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p))
			return (EINTR);
	}

	/*
	 * Now, flush as required.
	 */
	np->n_flag |= NFLUSHINPROG;
	error = vinvalbuf(vp, flags, cred, p, slpflag, 0);
	while (error) {
		if (intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p)) {
			np->n_flag &= ~NFLUSHINPROG;
			if (np->n_flag & NFLUSHWANT) {
				np->n_flag &= ~NFLUSHWANT;
				wakeup((caddr_t)&np->n_flag);
			}
			return (EINTR);
		}
		error = vinvalbuf(vp, flags, cred, p, 0, slptimeo);
	}
	np->n_flag &= ~(NMODIFIED | NFLUSHINPROG);
	if (np->n_flag & NFLUSHWANT) {
		np->n_flag &= ~NFLUSHWANT;
		wakeup((caddr_t)&np->n_flag);
	}
	(void) ubc_clean(vp, 1); /* get the pages out of vm also */
	return (0);
}
/*
 * Initiate asynchronous I/O. Return an error if no nfsiods are available.
 * This is mainly to avoid queueing async I/O requests when the nfsiods
 * are all hung on a dead server.
 */
int
nfs_asyncio(bp, cred)
	register struct buf *bp;
	struct ucred *cred;
{
	struct nfsmount *nmp;
	int i;
	int gotiod;
	int slpflag = 0;
	int slptimeo = 0;
	int error;

	if (nfs_numasync == 0)
		return (EIO);

	nmp = VFSTONFS(bp->b_vp->v_mount);
again:
	if (nmp->nm_flag & NFSMNT_INT)
		slpflag = PCATCH;
	gotiod = FALSE;

	/*
	 * Find a free iod to process this request.
	 */
	for (i = 0; i < NFS_MAXASYNCDAEMON; i++)
		if (nfs_iodwant[i]) {
			/*
			 * Found one, so wake it up and tell it which
			 * mount to process.
			 */
			NFS_DPF(ASYNCIO,
				("nfs_asyncio: waking iod %d for mount %p\n",
				 i, nmp));
			nfs_iodwant[i] = (struct proc *)0;
			nfs_iodmount[i] = nmp;
			nmp->nm_bufqiods++;
			wakeup((caddr_t)&nfs_iodwant[i]);
			gotiod = TRUE;
			break;
		}

	/*
	 * If none are free, we may already have an iod working on this mount
	 * point.  If so, it will process our request.
	 */
	if (!gotiod) {
		if (nmp->nm_bufqiods > 0) {
			NFS_DPF(ASYNCIO,
				("nfs_asyncio: %d iods are already processing mount %p\n",
				 nmp->nm_bufqiods, nmp));
			gotiod = TRUE;
		}
	}

	/*
	 * If we have an iod which can process the request, then queue
	 * the buffer.
	 */
	if (gotiod) {
		/*
		 * Ensure that the queue never grows too large.
		 */
		while (nmp->nm_bufqlen >= 2 * nfs_numasync) {
			NFS_DPF(ASYNCIO,
				("nfs_asyncio: waiting for mount %p queue to drain\n", nmp));
			nmp->nm_bufqwant = TRUE;
			error = tsleep(&nmp->nm_bufq, slpflag | PRIBIO,
				       "nfsaio", slptimeo);
			if (error) {
				if (nfs_sigintr(nmp, NULL, bp->b_proc))
					return (EINTR);
				if (slpflag == PCATCH) {
					slpflag = 0;
					slptimeo = 2 * hz;
				}
			}
			/*
			 * We might have lost our iod while sleeping,
			 * so check and loop if necessary.
			 */
			if (nmp->nm_bufqiods == 0) {
				NFS_DPF(ASYNCIO,
					("nfs_asyncio: no iods after mount %p queue was drained, looping\n", nmp));
				goto again;
			}
		}

		if (ISSET(bp->b_flags, B_READ)) {
			if (bp->b_rcred == NOCRED && cred != NOCRED) {
				crhold(cred);
				bp->b_rcred = cred;
			}
		} else {
			SET(bp->b_flags, B_WRITEINPROG);
			if (bp->b_wcred == NOCRED && cred != NOCRED) {
				crhold(cred);
				bp->b_wcred = cred;
			}
		}

		TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, b_freelist);
		nmp->nm_bufqlen++;
		return (0);
	}

	/*
	 * All the iods are busy on other mounts, so return EIO to
	 * force the caller to process the i/o synchronously.
	 */
	NFS_DPF(ASYNCIO, ("nfs_asyncio: no iods available, i/o is synchronous\n"));
	return (EIO);
}
/*
 * Do an I/O operation to/from a cache block. This may be called
 * synchronously or from an nfsiod.
 */
int
nfs_doio(bp, cr, p)
	register struct buf *bp;
	struct ucred *cr;
	struct proc *p;
{
	register struct uio *uiop;
	register struct vnode *vp;
	struct nfsnode *np;
	struct nfsmount *nmp;
	int error = 0, diff, len, iomode, must_commit = 0;
	struct uio uio;
	struct iovec io;

	vp = bp->b_vp;
	NFSTRACE(NFSTRC_DIO, vp);
	np = VTONFS(vp);
	nmp = VFSTONFS(vp->v_mount);
	uiop = &uio;
	uiop->uio_iov = &io;
	uiop->uio_iovcnt = 1;
	uiop->uio_segflg = UIO_SYSSPACE;
	uiop->uio_procp = p;
	/*
	 * With UBC, getblk() can return a buf with B_DONE set.
	 * This indicates that the VM has valid data for that page.
	 * NFS being stateless, this case poses a problem.
	 * By definition, the NFS server should always be consulted
	 * for the data in that page.
	 * So we choose to clear the B_DONE and to do the IO.
	 *
	 * XXX revisit this if there is a performance issue.
	 * XXX In that case, we could play the attribute cache games ...
	 */
	if (ISSET(bp->b_flags, B_DONE)) {
		if (!ISSET(bp->b_flags, B_ASYNC))
			panic("nfs_doio: done and not async");
		CLR(bp->b_flags, B_DONE);
	}

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 256)) | DBG_FUNC_START,
		(int)np->n_size, bp->b_blkno * DEV_BSIZE, bp->b_bcount,
		bp->b_flags, 0);
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 257)) | DBG_FUNC_NONE,
		bp->b_validoff, bp->b_validend, bp->b_dirtyoff,
		bp->b_dirtyend, 0);
	/*
	 * Historically, paging was done with physio, but no more.
	 */
	if (ISSET(bp->b_flags, B_PHYS)) {
		/*
		 * ...though reading /dev/drum still gets us here.
		 */
		io.iov_len = uiop->uio_resid = bp->b_bcount;
		/* mapping was done by vmapbuf() */
		io.iov_base = bp->b_data;
		uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE;
		if (ISSET(bp->b_flags, B_READ)) {
			uiop->uio_rw = UIO_READ;
			nfsstats.read_physios++;
			error = nfs_readrpc(vp, uiop, cr);
		} else {
			int com;

			iomode = NFSV3WRITE_DATASYNC;
			uiop->uio_rw = UIO_WRITE;
			nfsstats.write_physios++;
			error = nfs_writerpc(vp, uiop, cr, &iomode, &com);
		}
		if (error) {
			SET(bp->b_flags, B_ERROR);
			bp->b_error = error;
		}
	} else if (ISSET(bp->b_flags, B_READ)) {
		io.iov_len = uiop->uio_resid = bp->b_bcount;
		io.iov_base = bp->b_data;
		uiop->uio_rw = UIO_READ;
		switch (vp->v_type) {
		case VREG:
			uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE;
			nfsstats.read_bios++;
			error = nfs_readrpc(vp, uiop, cr);

			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 262)) | DBG_FUNC_NONE,
				(int)np->n_size, bp->b_blkno * DEV_BSIZE,
				uiop->uio_resid, error, 0);
			if (!error) {
			    bp->b_validoff = 0;
			    if (uiop->uio_resid) {
				/*
				 * If len > 0, there is a hole in the file and
				 * no writes after the hole have been pushed to
				 * the server yet.
				 * Just zero fill the rest of the valid area.
				 */
				diff = bp->b_bcount - uiop->uio_resid;
				len = np->n_size - (((u_quad_t)bp->b_blkno) * DEV_BSIZE
						    + diff);
				if (len > 0) {
				    len = min(len, uiop->uio_resid);
				    bzero((char *)bp->b_data + diff, len);
				    bp->b_validend = diff + len;

				    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 258)) | DBG_FUNC_NONE,
					diff, len, 0, 1, 0);
				} else
				    bp->b_validend = diff;
			    } else
				bp->b_validend = bp->b_bcount;
#if 1 /* USV + JOE [ */
			    if (bp->b_validend < bp->b_bufsize) {
				/*
				 * we're about to release a partial buffer after a
				 * read... the only way we should get here is if
				 * this buffer contains the EOF.  Before releasing
				 * it, we'll zero out to the end of the buffer so
				 * that if a mmap of this page occurs, we'll see
				 * zeros even if a ftruncate extends the file in
				 * the meantime
				 */
				bzero((caddr_t)(bp->b_data + bp->b_validend),
				      (bp->b_bufsize - bp->b_validend));

				KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 258)) | DBG_FUNC_NONE,
					bp->b_validend,
					(bp->b_bufsize - bp->b_validend), 0, 2, 0);
			    }
#endif /* ] USV + JOE */
			}
1132 #endif /* ] USV + JOE */
1134 if (p
&& (vp
->v_flag
& VTEXT
) &&
1135 (((nmp
->nm_flag
& NFSMNT_NQNFS
) &&
1136 NQNFS_CKINVALID(vp
, np
, ND_READ
) &&
1137 np
->n_lrev
!= np
->n_brev
) ||
1138 (!(nmp
->nm_flag
& NFSMNT_NQNFS
) &&
1139 np
->n_mtime
!= np
->n_vattr
.va_mtime
.tv_sec
))) {
1140 uprintf("Process killed due to text file modification\n");
1141 psignal(p
, SIGKILL
);
1142 p
->p_flag
|= P_NOSWAP
;
		case VLNK:
			uiop->uio_offset = (off_t)0;
			nfsstats.readlink_bios++;
			error = nfs_readlinkrpc(vp, uiop, cr);
			break;
		case VDIR:
			nfsstats.readdir_bios++;
			uiop->uio_offset = ((u_quad_t)bp->b_lblkno) * NFS_DIRBLKSIZ;
			if (!(nmp->nm_flag & NFSMNT_NFSV3))
				nmp->nm_flag &= ~NFSMNT_RDIRPLUS;	/* dk@farm.org */
			if (nmp->nm_flag & NFSMNT_RDIRPLUS) {
				error = nfs_readdirplusrpc(vp, uiop, cr);
				if (error == NFSERR_NOTSUPP)
					nmp->nm_flag &= ~NFSMNT_RDIRPLUS;
			}
			if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0)
				error = nfs_readdirrpc(vp, uiop, cr);
			break;
		default:
			printf("nfs_doio: type %x unexpected\n", vp->v_type);
			break;
		};
		if (error) {
			SET(bp->b_flags, B_ERROR);
			bp->b_error = error;
		}
	} else {
		if (((bp->b_blkno * DEV_BSIZE) + bp->b_dirtyend) > np->n_size)
			bp->b_dirtyend = np->n_size - (bp->b_blkno * DEV_BSIZE);
		if (bp->b_dirtyend > bp->b_dirtyoff) {
			io.iov_len = uiop->uio_resid = bp->b_dirtyend
				- bp->b_dirtyoff;
			uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE
				+ bp->b_dirtyoff;
			io.iov_base = (char *)bp->b_data + bp->b_dirtyoff;
			uiop->uio_rw = UIO_WRITE;

			nfsstats.write_bios++;
			if ((bp->b_flags & (B_ASYNC | B_NEEDCOMMIT | B_NOCACHE)) == B_ASYNC)
				iomode = NFSV3WRITE_UNSTABLE;
			else
				iomode = NFSV3WRITE_FILESYNC;
			SET(bp->b_flags, B_WRITEINPROG);
			error = nfs_writerpc(vp, uiop, cr, &iomode, &must_commit);
			if (!error && iomode == NFSV3WRITE_UNSTABLE)
				SET(bp->b_flags, B_NEEDCOMMIT);
			else
				CLR(bp->b_flags, B_NEEDCOMMIT);
			CLR(bp->b_flags, B_WRITEINPROG);
			/*
			 * For an interrupted write, the buffer is still valid
			 * and the write hasn't been pushed to the server yet,
			 * so we can't set B_ERROR and report the interruption
			 * by setting B_EINTR. For the B_ASYNC case, B_EINTR
			 * is not relevant, so the rpc attempt is essentially
			 * a noop.  For the case of a V3 write rpc not being
			 * committed to stable storage, the block is still
			 * dirty and requires either a commit rpc or another
			 * write rpc with iomode == NFSV3WRITE_FILESYNC before
			 * the block is reused. This is indicated by setting
			 * the B_DELWRI and B_NEEDCOMMIT flags.
			 */
			if (error == EINTR
			    || (!error && (bp->b_flags & B_NEEDCOMMIT))) {
				int s;

				CLR(bp->b_flags, (B_INVAL|B_NOCACHE));
				SET(bp->b_flags, B_DELWRI);

				KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 261)) | DBG_FUNC_NONE,
					bp->b_validoff, bp->b_validend,
					bp->b_bufsize, bp->b_bcount, 0);
				/*
				 * Since for the B_ASYNC case, nfs_bwrite() has
				 * reassigned the buffer to the clean list, we have
				 * to reassign it back to the dirty one. Ugh.
				 */
				if (ISSET(bp->b_flags, B_ASYNC)) {
					s = splbio();
					reassignbuf(bp, vp);
					splx(s);
				} else {
					SET(bp->b_flags, B_EINTR);
				}
			} else {
				if (error) {
					SET(bp->b_flags, B_ERROR);
					bp->b_error = np->n_error = error;
					np->n_flag |= NWRITEERR;
				}
				bp->b_dirtyoff = bp->b_dirtyend = 0;
			}
		} else {
			/*
			 * validoff and validend represent the real data present
			 * in this buffer.
			 * if validoff is non-zero, then we have to invalidate the
			 * buffer and kill the page when biodone is called... the
			 * same is also true when validend doesn't extend all the
			 * way to the end of the buffer and validend doesn't
			 * equate to the current EOF... eventually we need to deal
			 * with this in a more humane way (like keeping the partial
			 * buffer without making it immediately available to the
			 * VM page cache).
			 */
#if 1 /* USV + JOE [ */
			if (bp->b_validoff)
				SET(bp->b_flags, B_INVAL);

			if (bp->b_validend < bp->b_bufsize) {
				if ((((off_t)bp->b_blkno * (off_t)DEV_BSIZE) +
				     bp->b_validend) == np->n_size) {
					bzero((caddr_t)(bp->b_data + bp->b_validend),
					      (bp->b_bufsize - bp->b_validend));

					KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 259)) | DBG_FUNC_NONE,
						bp->b_validend,
						(bp->b_bufsize - bp->b_validend), 0, 0, 0);
				} else
					SET(bp->b_flags, B_INVAL);
			}
#else
			if (bp->b_validoff)
				SET(bp->b_flags, B_INVAL);
			else if (bp->b_validend < bp->b_bufsize) {
				if ((((off_t)bp->b_blkno * (off_t)DEV_BSIZE) +
				     bp->b_validend) != np->n_size)
					SET(bp->b_flags, B_INVAL);
			}
#endif /* ] USV + JOE */
			if (bp->b_flags & B_INVAL) {
				KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 260)) | DBG_FUNC_NONE,
					bp->b_validoff, bp->b_validend,
					bp->b_bufsize, bp->b_bcount, 0);
			}
			bp->b_resid = 0;
			biodone(bp);
			NFSTRACE(NFSTRC_DIO_DONE, vp);
			return (0);
		}
	}
	bp->b_resid = uiop->uio_resid;
	if (must_commit)
		nfs_clearcommit(vp->v_mount);

	if (bp->b_flags & B_INVAL) {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 260)) | DBG_FUNC_NONE,
			bp->b_validoff, bp->b_validend, bp->b_bufsize,
			bp->b_bcount, 0);
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 256)) | DBG_FUNC_END,
		bp->b_validoff, bp->b_validend, bp->b_bcount, error, 0);

	biodone(bp);
	NFSTRACE(NFSTRC_DIO_DONE, vp);
	return (error);
}