/*
 * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License").  You may not use this file except in compliance with the
 * License.  Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)nfs_bio.c	8.9 (Berkeley) 3/30/95
 * FreeBSD-Id: nfs_bio.c,v 1.44 1997/09/10 19:52:25 phk Exp $
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/ubc.h>
#include <sys/vmparam.h>

#include <kern/clock.h>

#include <nfs/rpcv2.h>
#include <nfs/nfsproto.h>
#include <nfs/nfs.h>
#include <nfs/nfsmount.h>
#include <nfs/nqnfs.h>
#include <nfs/nfsnode.h>

#include <sys/kdebug.h>
static struct buf *nfs_getcacheblk __P((struct vnode *vp, daddr_t bn, int size,
		struct proc *p, int operation));
static struct buf *nfs_getwriteblk __P((struct vnode *vp, daddr_t bn,
		int size, struct proc *p,
		struct ucred *cred, int off, int len));

extern int nfs_numasync;
extern struct nfsstats nfsstats;
/*
 * Vnode op for read using bio
 * Any similarity to readip() is purely coincidental
 */
int
nfs_bioread(vp, uio, ioflag, cred, getpages)
    register struct vnode *vp;
    register struct uio *uio;
    int ioflag;
    struct ucred *cred;
    int getpages;
{
    register struct nfsnode *np = VTONFS(vp);
    register int biosize, diff, i;
    struct buf *bp = 0, *rabp;
    struct vattr vattr;
    struct proc *p;
    struct nfsmount *nmp = VFSTONFS(vp->v_mount);
    daddr_t lbn, rabn;
    int bufsize;
    int nra, error = 0, n = 0, on = 0, not_readin;
    int operation = (getpages ? BLK_PAGEIN : BLK_READ);

    if (uio->uio_rw != UIO_READ)
        panic("nfs_read mode");
    if (uio->uio_resid == 0)
        return (0);
    if (uio->uio_offset < 0)
        return (EINVAL);
    p = uio->uio_procp;
    if ((nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_GOTFSINFO)) == NFSMNT_NFSV3)
        (void)nfs_fsinfo(nmp, vp, cred, p);
    /* due to getblk/vm interactions, use vm page size or less values */
    biosize = min(vp->v_mount->mnt_stat.f_iosize, PAGE_SIZE);
    /*
     * For nfs, cache consistency can only be maintained approximately.
     * Although RFC1094 does not specify the criteria, the following is
     * believed to be compatible with the reference port.
     * For nqnfs, full cache consistency is maintained within the loop.
     * For nfs:
     * If the file's modify time on the server has changed since the
     * last read rpc or you have written to the file,
     * you may have lost data cache consistency with the
     * server, so flush all of the file's data out of the cache.
     * Then force a getattr rpc to ensure that you have up to date
     * attributes.
     * NB: This implies that cache data can be read when up to
     * NFS_ATTRTIMEO seconds out of date. If you find that you need current
     * attributes this could be forced by setting n_attrstamp to 0 before
     * the VOP_GETATTR() call.
     */
    if ((nmp->nm_flag & NFSMNT_NQNFS) == 0) {
        if (np->n_flag & NMODIFIED) {
            if (vp->v_type != VREG) {
                if (vp->v_type != VDIR)
                    panic("nfs: bioread, not dir");
                nfs_invaldir(vp);
                error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
                if (error)
                    return (error);
            }
            np->n_attrstamp = 0;
            error = VOP_GETATTR(vp, &vattr, cred, p);
            if (error)
                return (error);
            np->n_mtime = vattr.va_mtime.tv_sec;
        } else {
            error = VOP_GETATTR(vp, &vattr, cred, p);
            if (error)
                return (error);
            if (np->n_mtime != vattr.va_mtime.tv_sec) {
                if (vp->v_type == VDIR)
                    nfs_invaldir(vp);
                error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
                if (error)
                    return (error);
                np->n_mtime = vattr.va_mtime.tv_sec;
            }
        }
    }
    do {

        /*
         * Get a valid lease. If cached data is stale, flush it.
         */
        if (nmp->nm_flag & NFSMNT_NQNFS) {
            if (NQNFS_CKINVALID(vp, np, ND_READ)) {
                do {
                    error = nqnfs_getlease(vp, ND_READ, cred, p);
                } while (error == NQNFS_EXPIRED);
                if (error)
                    return (error);
                if (np->n_lrev != np->n_brev ||
                    (np->n_flag & NQNFSNONCACHE) ||
                    ((np->n_flag & NMODIFIED) && vp->v_type == VDIR)) {
                    if (vp->v_type == VDIR)
                        nfs_invaldir(vp);
                    error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
                    if (error)
                        return (error);
                    np->n_brev = np->n_lrev;
                }
            } else if (vp->v_type == VDIR && (np->n_flag & NMODIFIED)) {
                nfs_invaldir(vp);
                error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
                if (error)
                    return (error);
            }
        }
        if (np->n_flag & NQNFSNONCACHE) {
            switch (vp->v_type) {
            case VREG:
                return (nfs_readrpc(vp, uio, cred));
            case VLNK:
                return (nfs_readlinkrpc(vp, uio, cred));
            case VDIR:
                break;
            default:
                printf(" NQNFSNONCACHE: type %x unexpected\n",
                    vp->v_type);
            };
        }
        switch (vp->v_type) {
        case VREG:
            nfsstats.biocache_reads++;
            lbn = uio->uio_offset / biosize;
            on = uio->uio_offset & (biosize - 1);
            not_readin = 1;

            /*
             * Start the read ahead(s), as required.
             */
            if (nfs_numasync > 0 && nmp->nm_readahead > 0) {
                for (nra = 0; nra < nmp->nm_readahead &&
                    (off_t)(lbn + 1 + nra) * biosize < np->n_size; nra++) {
                    rabn = lbn + 1 + nra;
                    if (!incore(vp, rabn)) {
                        rabp = nfs_getcacheblk(vp, rabn, biosize, p,
                            operation);
                        if (!rabp)
                            return (EINTR);
                        if (!ISSET(rabp->b_flags, (B_CACHE|B_DELWRI))) {
                            SET(rabp->b_flags, (B_READ | B_ASYNC));
                            if (nfs_asyncio(rabp, cred)) {
                                SET(rabp->b_flags, (B_INVAL|B_ERROR));
                                rabp->b_error = EIO;
                                brelse(rabp);
                            }
                        } else
                            brelse(rabp);
                    }
                }
            }

            /*
             * If the block is in the cache and has the required data
             * in a valid region, just copy it out.
             * Otherwise, get the block and write back/read in,
             * as required.
             */
again:
            bufsize = biosize;
            if ((off_t)(lbn + 1) * biosize > np->n_size &&
                (off_t)(lbn + 1) * biosize - np->n_size < biosize) {
                bufsize = np->n_size - lbn * biosize;
                bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
            }
            bp = nfs_getcacheblk(vp, lbn, bufsize, p, operation);
            if (!bp)
                return (EINTR);

            if (!ISSET(bp->b_flags, B_CACHE)) {
                SET(bp->b_flags, B_READ);
                CLR(bp->b_flags, (B_DONE | B_ERROR | B_INVAL));
                not_readin = 0;
                error = nfs_doio(bp, cred, p);
                if (error) {
                    brelse(bp);
                    return (error);
                }
            }
            n = min((unsigned)(bufsize - on), uio->uio_resid);
            diff = np->n_size - uio->uio_offset;
            if (diff < n)
                n = diff;
            if (not_readin && n > 0) {
                if (on < bp->b_validoff || (on + n) > bp->b_validend) {
                    SET(bp->b_flags, (B_NOCACHE|B_INVAFTERWRITE));
                    if (bp->b_dirtyend > 0) {
                        if (!ISSET(bp->b_flags, B_DELWRI))
                            panic("nfsbioread");
                        if (VOP_BWRITE(bp) == EINTR)
                            return (EINTR);
                    } else
                        brelse(bp);
                    goto again;
                }
            }
            diff = (on >= bp->b_validend) ? 0 : (bp->b_validend - on);
            if (diff < n)
                n = diff;
            break;
        case VLNK:
            nfsstats.biocache_readlinks++;
            bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, p,
                operation);
            if (!bp)
                return (EINTR);
            if (!ISSET(bp->b_flags, B_CACHE)) {
                SET(bp->b_flags, B_READ);
                error = nfs_doio(bp, cred, p);
                if (error) {
                    SET(bp->b_flags, B_ERROR);
                    brelse(bp);
                    return (error);
                }
            }
            n = min(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid);
            on = 0;
            break;
        case VDIR:
            nfsstats.biocache_readdirs++;
            if (np->n_direofoffset
                && uio->uio_offset >= np->n_direofoffset) {
                return (0);
            }
            lbn = uio->uio_offset / NFS_DIRBLKSIZ;
            on = uio->uio_offset & (NFS_DIRBLKSIZ - 1);
            bp = nfs_getcacheblk(vp, lbn, NFS_DIRBLKSIZ, p, operation);
            if (!bp)
                return (EINTR);
            if (!ISSET(bp->b_flags, B_CACHE)) {
                SET(bp->b_flags, B_READ);
                error = nfs_doio(bp, cred, p);
                if (error)
                    brelse(bp);
                while (error == NFSERR_BAD_COOKIE) {
                    nfs_invaldir(vp);
                    error = nfs_vinvalbuf(vp, 0, cred, p, 1);
                    /*
                     * Yuck! The directory has been modified on the
                     * server. The only way to get the block is by
                     * reading from the beginning to get all the
                     * offset cookies.
                     */
                    for (i = 0; i <= lbn && !error; i++) {
                        if (np->n_direofoffset
                            && (i * NFS_DIRBLKSIZ) >= np->n_direofoffset)
                            return (0);
                        bp = nfs_getcacheblk(vp, i, NFS_DIRBLKSIZ, p,
                            operation);
                        if (!bp)
                            return (EINTR);
                        if (!ISSET(bp->b_flags, B_DONE)) {
                            SET(bp->b_flags, B_READ);
                            error = nfs_doio(bp, cred, p);
                            if (error)
                                brelse(bp);
                        }
                        if (!error && i < lbn)
                            brelse(bp);
                    }
                }
                if (error)
                    return (error);
            }

            /*
             * If not eof and read aheads are enabled, start one.
             * (You need the current block first, so that you have the
             *  directory offset cookie of the next block.)
             */
            if (nfs_numasync > 0 && nmp->nm_readahead > 0 &&
                (np->n_direofoffset == 0 ||
                (lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) &&
                !(np->n_flag & NQNFSNONCACHE) &&
                !incore(vp, lbn + 1)) {
                rabp = nfs_getcacheblk(vp, lbn + 1, NFS_DIRBLKSIZ, p,
                    operation);
                if (rabp) {
                    if (!ISSET(rabp->b_flags, (B_CACHE|B_DELWRI))) {
                        SET(rabp->b_flags, (B_READ | B_ASYNC));
                        if (nfs_asyncio(rabp, cred)) {
                            SET(rabp->b_flags, (B_INVAL|B_ERROR));
                            rabp->b_error = EIO;
                            brelse(rabp);
                        }
                    } else
                        brelse(rabp);
                }
            }
            /*
             * Make sure we use a signed variant of min() since
             * the second term may be negative.
             */
            n = lmin(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid - on);
            break;
        default:
            printf(" nfs_bioread: type %x unexpected\n", vp->v_type);
            break;
        };

        if (n > 0)
            error = uiomove(bp->b_data + on, (int)n, uio);
        switch (vp->v_type) {
        case VREG:
            break;
        case VLNK:
            n = 0;
            break;
        case VDIR:
            if (np->n_flag & NQNFSNONCACHE)
                SET(bp->b_flags, B_INVAL);
            break;
        default:
            printf(" nfs_bioread: type %x unexpected\n", vp->v_type);
        }
        brelse(bp);
    } while (error == 0 && uio->uio_resid > 0 && n > 0);
    return (error);
}
/*
 * Vnode op for write using bio
 */
int
nfs_write(ap)
    struct vop_write_args /* {
        struct vnode *a_vp;
        struct uio *a_uio;
        int  a_ioflag;
        struct ucred *a_cred;
    } */ *ap;
{
    register int biosize;
    register struct uio *uio = ap->a_uio;
    struct proc *p = uio->uio_procp;
    register struct vnode *vp = ap->a_vp;
    struct nfsnode *np = VTONFS(vp);
    register struct ucred *cred = ap->a_cred;
    int ioflag = ap->a_ioflag;
    struct buf *bp;
    struct vattr vattr;
    struct nfsmount *nmp = VFSTONFS(vp->v_mount);
    daddr_t lbn;
    int bufsize;
    int n, on, error = 0, iomode, must_commit;

    if (uio->uio_rw != UIO_WRITE)
        panic("nfs_write mode");
    if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != current_proc())
        panic("nfs_write proc");
    if (vp->v_type != VREG)
        return (EIO);
    if (np->n_flag & NWRITEERR) {
        np->n_flag &= ~NWRITEERR;
        return (np->n_error);
    }
    if ((nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_GOTFSINFO)) == NFSMNT_NFSV3)
        (void)nfs_fsinfo(nmp, vp, cred, p);
    if (ioflag & (IO_APPEND | IO_SYNC)) {
        if (np->n_flag & NMODIFIED) {
            np->n_attrstamp = 0;
            error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
            if (error)
                return (error);
        }
        if (ioflag & IO_APPEND) {
            np->n_attrstamp = 0;
            error = VOP_GETATTR(vp, &vattr, cred, p);
            if (error)
                return (error);
            uio->uio_offset = np->n_size;
        }
    }
    if (uio->uio_offset < 0)
        return (EINVAL);
    if (uio->uio_resid == 0)
        return (0);
    /*
     * Maybe this should be above the vnode op call, but so long as
     * file servers have no limits, i don't think it matters
     */
    if (p && uio->uio_offset + uio->uio_resid >
          p->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
        psignal(p, SIGXFSZ);
        return (EFBIG);
    }
    /*
     * I use nm_rsize, not nm_wsize so that all buffer cache blocks
     * will be the same size within a filesystem. nfs_writerpc will
     * still use nm_wsize when sizing the rpc's.
     */
    /* due to getblk/vm interactions, use vm page size or less values */
    biosize = min(vp->v_mount->mnt_stat.f_iosize, PAGE_SIZE);

    do {
        /*
         * Check for a valid write lease.
         */
        if ((nmp->nm_flag & NFSMNT_NQNFS) &&
            NQNFS_CKINVALID(vp, np, ND_WRITE)) {
            do {
                error = nqnfs_getlease(vp, ND_WRITE, cred, p);
            } while (error == NQNFS_EXPIRED);
            if (error)
                return (error);
            if (np->n_lrev != np->n_brev ||
                (np->n_flag & NQNFSNONCACHE)) {
                error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
                if (error)
                    return (error);
                np->n_brev = np->n_lrev;
            }
        }
        if ((np->n_flag & NQNFSNONCACHE) && uio->uio_iovcnt == 1) {
            iomode = NFSV3WRITE_FILESYNC;
            error = nfs_writerpc(vp, uio, cred, &iomode, &must_commit);
            if (must_commit)
                nfs_clearcommit(vp->v_mount);
            return (error);
        }
        nfsstats.biocache_writes++;
        lbn = uio->uio_offset / biosize;
        on = uio->uio_offset & (biosize - 1);
        n = min((unsigned)(biosize - on), uio->uio_resid);
again:
        if (uio->uio_offset + n > np->n_size) {
            np->n_size = uio->uio_offset + n;
            np->n_flag |= NMODIFIED;
            if (UBCISVALID(vp))
                ubc_setsize(vp, (off_t)np->n_size); /* XXX check error */
        }
        bufsize = biosize;
        /* (removed for UBC) */
        if ((lbn + 1) * biosize > np->n_size) {
            bufsize = np->n_size - lbn * biosize;
            bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
        }
        bp = nfs_getwriteblk(vp, lbn, bufsize, p, cred, on, n);
        if (!bp)
            return (EINTR);
        if (ISSET(bp->b_flags, B_ERROR)) {
            error = bp->b_error;
            brelse(bp);
            return (error);
        }
        if (bp->b_wcred == NOCRED) {
            /*
             * NFS has embedded ucred.
             * Can not crhold() here as that causes zone corruption
             */
            bp->b_wcred = crdup(cred);
        }
        np->n_flag |= NMODIFIED;

        /*
         * Check for valid write lease and get one as required.
         * In case getblk() and/or bwrite() delayed us.
         */
        if ((nmp->nm_flag & NFSMNT_NQNFS) &&
            NQNFS_CKINVALID(vp, np, ND_WRITE)) {
            do {
                error = nqnfs_getlease(vp, ND_WRITE, cred, p);
            } while (error == NQNFS_EXPIRED);
            if (error) {
                brelse(bp);
                return (error);
            }
            if (np->n_lrev != np->n_brev ||
                (np->n_flag & NQNFSNONCACHE)) {
                brelse(bp);
                error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
                if (error)
                    return (error);
                np->n_brev = np->n_lrev;
                goto again;
            }
        }
        error = uiomove((char *)bp->b_data + on, n, uio);
        if (error) {
            SET(bp->b_flags, B_ERROR);
            brelse(bp);
            return (error);
        }
        if (bp->b_dirtyend > 0) {
            bp->b_dirtyoff = min(on, bp->b_dirtyoff);
            bp->b_dirtyend = max((on + n), bp->b_dirtyend);
        } else {
            bp->b_dirtyoff = on;
            bp->b_dirtyend = on + n;
        }
        if (bp->b_validend == 0 || bp->b_validend < bp->b_dirtyoff ||
            bp->b_validoff > bp->b_dirtyend) {
            bp->b_validoff = bp->b_dirtyoff;
            bp->b_validend = bp->b_dirtyend;
        } else {
            bp->b_validoff = min(bp->b_validoff, bp->b_dirtyoff);
            bp->b_validend = max(bp->b_validend, bp->b_dirtyend);
        }

        /*
         * Since this block is being modified, it must be written
         * again and not just committed.
         */
        CLR(bp->b_flags, B_NEEDCOMMIT);

        /*
         * If the lease is non-cachable or IO_SYNC do bwrite().
         */
        if ((np->n_flag & NQNFSNONCACHE) || (ioflag & IO_SYNC)) {
            bp->b_proc = p;
            error = VOP_BWRITE(bp);
            if (error)
                return (error);
            if (np->n_flag & NQNFSNONCACHE) {
                error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
                if (error)
                    return (error);
            }
        } else if ((n + on) == biosize &&
            (nmp->nm_flag & NFSMNT_NQNFS) == 0) {
            bp->b_proc = (struct proc *)0;
            SET(bp->b_flags, B_ASYNC);
            (void)nfs_writebp(bp, 0);
        } else
            bdwrite(bp);
    } while (uio->uio_resid > 0 && n > 0);
    return (0);
}
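/*
 * Editor's note (not part of the original file): the dirty-region
 * bookkeeping above widens (b_dirtyoff, b_dirtyend) to cover each new
 * write and then grows the valid region to contain the dirty one.  Below
 * is a minimal user-space sketch of those update rules; the struct and
 * function names are hypothetical stand-ins for struct buf fields.
 */
#if 0 /* illustrative sketch */
#include <stdio.h>

struct region { int dirtyoff, dirtyend, validoff, validend; };

static void
record_write(struct region *b, int on, int n)
{
    /* Widen the dirty region to include [on, on+n). */
    if (b->dirtyend > 0) {
        b->dirtyoff = b->dirtyoff < on ? b->dirtyoff : on;
        b->dirtyend = b->dirtyend > on + n ? b->dirtyend : on + n;
    } else {
        b->dirtyoff = on;
        b->dirtyend = on + n;
    }
    /* The valid region must always cover the dirty region. */
    if (b->validend == 0 || b->validend < b->dirtyoff ||
        b->validoff > b->dirtyend) {
        b->validoff = b->dirtyoff;
        b->validend = b->dirtyend;
    } else {
        b->validoff = b->validoff < b->dirtyoff ? b->validoff : b->dirtyoff;
        b->validend = b->validend > b->dirtyend ? b->validend : b->dirtyend;
    }
}

int
main(void)
{
    struct region b = { 0, 0, 0, 0 };

    record_write(&b, 100, 50);      /* dirty becomes [100,150) */
    record_write(&b, 120, 200);     /* widens to [100,320) */
    printf("dirty [%d,%d) valid [%d,%d)\n",
        b.dirtyoff, b.dirtyend, b.validoff, b.validend);
    return 0;
}
#endif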
/*
 * Get a cache block for writing.  The range to be written is
 * (off..off+len) within the block.  This routine ensures that the
 * block either has no dirty region or that the given range is
 * contiguous with the existing dirty region.
 */
static struct buf *
nfs_getwriteblk(vp, bn, size, p, cred, off, len)
    struct vnode *vp;
    daddr_t bn;
    int size;
    struct proc *p;
    struct ucred *cred;
    int off, len;
{
    struct nfsnode *np = VTONFS(vp);
    struct buf *bp;
    int error;
    struct iovec iov;
    struct uio uio;
    off_t boff, start, end;

    bp = nfs_getcacheblk(vp, bn, size, p, BLK_WRITE);
    if (!bp)
        return (NULL);
    if (bp->b_wcred == NOCRED) {
        /*
         * NFS has embedded ucred.
         * Can not crhold() here as that causes zone corruption
         */
        bp->b_wcred = crdup(cred);
    }

    if ((bp->b_blkno * DEV_BSIZE) + bp->b_dirtyend > np->n_size) {
        bp->b_dirtyend = np->n_size - (bp->b_blkno * DEV_BSIZE);
    }

    /*
     * UBC doesn't (yet) handle partial pages so nfs_biowrite was
     * hacked to never bdwrite, to start every little write right away.
     * Running IE Avie noticed the performance problem, thus this code,
     * which permits those delayed writes by ensuring an initial read
     * of the entire page.  The read may hit eof ("short read") but
     * that we will handle.
     *
     * We are quite dependent on the correctness of B_CACHE so check
     * that first in case of problems.
     */
    if (!ISSET(bp->b_flags, B_CACHE) && len < PAGE_SIZE) {
        struct nfsnode *np = VTONFS(vp);

        boff = (off_t)bp->b_blkno * DEV_BSIZE;
        uio.uio_iov = &iov;
        uio.uio_iovcnt = 1;
        uio.uio_offset = boff;
        uio.uio_resid = PAGE_SIZE;
        uio.uio_segflg = UIO_SYSSPACE;
        uio.uio_rw = UIO_READ;
        uio.uio_procp = p;
        iov.iov_base = bp->b_data;
        iov.iov_len = PAGE_SIZE;
        error = nfs_readrpc(vp, &uio, cred);
        if (error) {
            bp->b_error = error;
            SET(bp->b_flags, B_ERROR);
            printf("nfs_getwriteblk: readrpc returned %d", error);
        }
        if (uio.uio_resid > 0)
            bzero(iov.iov_base, uio.uio_resid);
        bp->b_validoff = 0;
        bp->b_validend = PAGE_SIZE - uio.uio_resid;
        if (np->n_size > boff + bp->b_validend)
            bp->b_validend = min(np->n_size - boff, PAGE_SIZE);
        bp->b_dirtyoff = 0;
        bp->b_dirtyend = 0;
    }

    /*
     * If the new write will leave a contiguous dirty
     * area, just update the b_dirtyoff and b_dirtyend,
     * otherwise try to extend the dirty region.
     */
    if (bp->b_dirtyend > 0 &&
        (off > bp->b_dirtyend || (off + len) < bp->b_dirtyoff)) {
        boff = (off_t)bp->b_blkno * DEV_BSIZE;
        if (off > bp->b_dirtyend) {
            start = boff + bp->b_validend;
            end = boff + off;
        } else {
            start = boff + off + len;
            end = boff + bp->b_validoff;
        }

        /*
         * It may be that the valid region in the buffer
         * covers the region we want, in which case just
         * extend the dirty region.  Otherwise we try to
         * extend the valid region.
         */
        if (start < end) {
            uio.uio_iov = &iov;
            uio.uio_iovcnt = 1;
            uio.uio_offset = start;
            uio.uio_resid = end - start;
            uio.uio_segflg = UIO_SYSSPACE;
            uio.uio_rw = UIO_READ;
            uio.uio_procp = p;
            iov.iov_base = bp->b_data + (start - boff);
            iov.iov_len = end - start;
            error = nfs_readrpc(vp, &uio, cred);
            if (error) {
                /*
                 * If we couldn't read, do not do a VOP_BWRITE
                 * as originally coded. That could also error
                 * and, looping back to "again" as it was doing,
                 * could have us stuck trying to write the same
                 * buffer again. nfs_write will get the entire region
                 * if nfs_readrpc was successful. If not successful
                 * we should just error out. Errors like ESTALE
                 * would keep us in this loop rather than transient
                 * errors justifying a retry. We can return from here
                 * instead of altering the dirty region later in the
                 * routine. We did not write out the old dirty region
                 * at this point.
                 */
                bp->b_error = error;
                SET(bp->b_flags, B_ERROR);
                printf("nfs_getwriteblk: readrpc (2) returned %d", error);
                return (bp);
            } else {
                if (uio.uio_resid > 0) {
                    /*
                     * If there was a short read,
                     * just zero fill.
                     */
                    bzero(iov.iov_base, uio.uio_resid);
                }
                if (off > bp->b_dirtyend)
                    bp->b_validend = off;
                else
                    bp->b_validoff = off + len;
            }
        }

        /*
         * We now have a valid region which extends up to the
         * dirty region which we want.
         */
        if (off > bp->b_dirtyend)
            bp->b_dirtyend = off;
        else
            bp->b_dirtyoff = off + len;
    }

    return (bp);
}
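/*
 * Editor's note (not part of the original file): when a new write is not
 * contiguous with the existing dirty region, nfs_getwriteblk reads the gap
 * between the valid region and the write so the dirty region stays one
 * contiguous run.  The sketch below reproduces that start/end computation
 * in block-relative offsets (the kernel code works in absolute offsets by
 * adding boff); the function name is hypothetical.
 */
#if 0 /* illustrative sketch */
#include <stdio.h>

/* Given current valid/dirty bounds and a new write [off, off+len),
 * report the range that must be read to bridge any gap. */
static void
bridge_range(int validoff, int validend, int dirtyoff, int dirtyend,
    int off, int len)
{
    int start, end;

    if (dirtyend <= 0 || (off <= dirtyend && off + len >= dirtyoff)) {
        printf("write is contiguous with the dirty region; no read\n");
        return;
    }
    if (off > dirtyend) {       /* writing beyond the dirty region */
        start = validend;
        end = off;
    } else {                    /* writing before the dirty region */
        start = off + len;
        end = validoff;
    }
    if (start < end)
        printf("must read [%d,%d) from the server\n", start, end);
    else
        printf("valid region already covers the gap\n");
}

int
main(void)
{
    /* dirty [256,512), valid [0,512), new write at [1024,1124) */
    bridge_range(0, 512, 256, 512, 1024, 100); /* -> read [512,1024) */
    return 0;
}
#endif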
/*
 * Get an nfs cache block.
 * Allocate a new one if the block isn't currently in the cache
 * and return the block marked busy. If the calling process is
 * interrupted by a signal for an interruptible mount point, return
 * NULL.
 */
static struct buf *
nfs_getcacheblk(vp, bn, size, p, operation)
    struct vnode *vp;
    daddr_t bn;
    int size;
    struct proc *p;
    int operation;	/* defined in sys/buf.h */
{
    register struct buf *bp;
    struct nfsmount *nmp = VFSTONFS(vp->v_mount);
    /* due to getblk/vm interactions, use vm page size or less values */
    int biosize = min(vp->v_mount->mnt_stat.f_iosize, PAGE_SIZE);

    if (nmp->nm_flag & NFSMNT_INT) {
        bp = getblk(vp, bn, size, PCATCH, 0, operation);
        while (bp == (struct buf *)0) {
            if (nfs_sigintr(nmp, (struct nfsreq *)0, p))
                return ((struct buf *)0);
            bp = getblk(vp, bn, size, 0, 2 * hz, operation);
        }
    } else
        bp = getblk(vp, bn, size, 0, 0, operation);

    if (vp->v_type == VREG)
        bp->b_blkno = (bn * biosize) / DEV_BSIZE;

    return (bp);
}
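/*
 * Editor's note (not part of the original file): on an interruptible
 * (NFSMNT_INT) mount, the first getblk() uses PCATCH and may come back
 * empty; the routine then polls with a timeout, bailing out if a signal
 * arrives.  The standalone sketch below shows the shape of that retry
 * loop; try_getblk() and signal_pending() are invented stand-ins, not
 * kernel APIs.
 */
#if 0 /* illustrative sketch */
#include <stdio.h>

static int attempts;
static void *try_getblk(void) { return ++attempts < 3 ? NULL : &attempts; }
static int signal_pending(void) { return 0; }

int
main(void)
{
    void *bp = try_getblk();            /* first try (kernel: PCATCH) */

    while (bp == NULL) {
        if (signal_pending())
            return 1;                   /* caller sees NULL -> EINTR */
        bp = try_getblk();              /* retry (kernel waits 2*hz here) */
    }
    printf("got buffer after %d attempts\n", attempts);
    return 0;
}
#endif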
/*
 * Flush and invalidate all dirty buffers. If another process is already
 * doing the flush, just wait for completion.
 */
int
nfs_vinvalbuf(vp, flags, cred, p, intrflg)
    struct vnode *vp;
    int flags;
    struct ucred *cred;
    struct proc *p;
    int intrflg;
{
    register struct nfsnode *np = VTONFS(vp);
    struct nfsmount *nmp = VFSTONFS(vp->v_mount);
    int error = 0, slpflag, slptimeo;
    int didhold = 0;

    if ((nmp->nm_flag & NFSMNT_INT) == 0)
        intrflg = 0;
    if (intrflg) {
        slpflag = PCATCH;
        slptimeo = 2 * hz;
    } else {
        slpflag = 0;
        slptimeo = 0;
    }
    /*
     * First wait for any other process doing a flush to complete.
     */
    while (np->n_flag & NFLUSHINPROG) {
        np->n_flag |= NFLUSHWANT;
        error = tsleep((caddr_t)&np->n_flag, PRIBIO + 2, "nfsvinval",
            slptimeo);
        if (error && intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p))
            return (EINTR);
    }

    /*
     * Now, flush as required.
     */
    np->n_flag |= NFLUSHINPROG;
    error = vinvalbuf(vp, flags, cred, p, slpflag, 0);
    while (error) {
        /* we seem to be stuck in a loop here if the thread got aborted.
         * nfs_flush will return EINTR. Not sure if that will cause
         * other consequences due to EINTR having other meanings in NFS.
         * To handle no dirty pages, it seems safe to just return from
         * here. But if we did have dirty pages, how would we get them
         * written out if the thread was aborted? Some other strategy is
         * necessary.
         */
        if ((intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p)) ||
            ((error == EINTR) && current_thread_aborted())) {
            np->n_flag &= ~NFLUSHINPROG;
            if (np->n_flag & NFLUSHWANT) {
                np->n_flag &= ~NFLUSHWANT;
                wakeup((caddr_t)&np->n_flag);
            }
            return (EINTR);
        }
        error = vinvalbuf(vp, flags, cred, p, 0, slptimeo);
    }
    np->n_flag &= ~(NMODIFIED | NFLUSHINPROG);
    if (np->n_flag & NFLUSHWANT) {
        np->n_flag &= ~NFLUSHWANT;
        wakeup((caddr_t)&np->n_flag);
    }
    didhold = ubc_hold(vp);
    if (didhold) {
        (void) ubc_clean(vp, 1); /* get the pages out of vm also */
        ubc_rele(vp);
    }
    return (0);
}
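/*
 * Editor's note (not part of the original file): the NFLUSHINPROG /
 * NFLUSHWANT flags serialize flushers: one thread marks the flush in
 * progress, later arrivals set the "want" flag and sleep, and the flusher
 * wakes them when done.  A minimal user-space analogue of that handshake,
 * using a mutex and condition variable in place of tsleep()/wakeup()
 * (all names below are illustrative, not kernel APIs):
 */
#if 0 /* illustrative sketch; build with -lpthread */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lk = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cv = PTHREAD_COND_INITIALIZER;
static int flush_in_prog;               /* NFLUSHINPROG analogue */

static void *
flusher(void *arg)
{
    pthread_mutex_lock(&lk);
    while (flush_in_prog)               /* wait for the current flusher */
        pthread_cond_wait(&cv, &lk);    /* tsleep(&np->n_flag, ...) analogue */
    flush_in_prog = 1;
    pthread_mutex_unlock(&lk);

    printf("thread %ld flushing\n", (long)arg);  /* vinvalbuf() would go here */

    pthread_mutex_lock(&lk);
    flush_in_prog = 0;
    pthread_cond_broadcast(&cv);        /* wakeup(&np->n_flag) analogue */
    pthread_mutex_unlock(&lk);
    return NULL;
}

int
main(void)
{
    pthread_t t1, t2;

    pthread_create(&t1, NULL, flusher, (void *)1L);
    pthread_create(&t2, NULL, flusher, (void *)2L);
    pthread_join(t1, NULL);
    pthread_join(t2, NULL);
    return 0;
}
#endif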
/*
 * Initiate asynchronous I/O. Return an error if no nfsiods are available.
 * This is mainly to avoid queueing async I/O requests when the nfsiods
 * are all hung on a dead server.
 */
int
nfs_asyncio(bp, cred)
    register struct buf *bp;
    struct ucred *cred;
{
    struct nfsmount *nmp;
    int i;
    int gotiod;
    int slpflag = 0;
    int slptimeo = 0;
    int error;

    if (nfs_numasync == 0)
        return (EIO);

    nmp = VFSTONFS(bp->b_vp->v_mount);
again:
    if (nmp->nm_flag & NFSMNT_INT)
        slpflag = PCATCH;
    gotiod = FALSE;

    /*
     * Find a free iod to process this request.
     */
    for (i = 0; i < NFS_MAXASYNCDAEMON; i++)
        if (nfs_iodwant[i]) {
            /*
             * Found one, so wake it up and tell it which
             * mount to process.
             */
            NFS_DPF(ASYNCIO,
                ("nfs_asyncio: waking iod %d for mount %p\n",
                 i, nmp));
            nfs_iodwant[i] = (struct proc *)0;
            nfs_iodmount[i] = nmp;
            nmp->nm_bufqiods++;
            wakeup((caddr_t)&nfs_iodwant[i]);
            gotiod = TRUE;
            break;
        }

    /*
     * If none are free, we may already have an iod working on this mount
     * point. If so, it will process our request.
     */
    if (!gotiod) {
        if (nmp->nm_bufqiods > 0) {
            NFS_DPF(ASYNCIO,
                ("nfs_asyncio: %d iods are already processing mount %p\n",
                 nmp->nm_bufqiods, nmp));
            gotiod = TRUE;
        }
    }

    /*
     * If we have an iod which can process the request, then queue
     * the buffer.
     */
    if (gotiod) {
        /*
         * Ensure that the queue never grows too large.
         */
        while (nmp->nm_bufqlen >= 2 * nfs_numasync) {
            NFS_DPF(ASYNCIO,
                ("nfs_asyncio: waiting for mount %p queue to drain\n", nmp));
            nmp->nm_bufqwant = TRUE;
            error = tsleep(&nmp->nm_bufq, slpflag | PRIBIO,
                "nfsaio", slptimeo);
            if (error) {
                if (nfs_sigintr(nmp, NULL, bp->b_proc))
                    return (EINTR);
                if (slpflag == PCATCH) {
                    slpflag = 0;
                    slptimeo = 2 * hz;
                }
            }
            /*
             * We might have lost our iod while sleeping,
             * so check and loop if necessary.
             */
            if (nmp->nm_bufqiods == 0) {
                NFS_DPF(ASYNCIO,
                    ("nfs_asyncio: no iods after mount %p queue was drained, looping\n", nmp));
                goto again;
            }
        }

        if (ISSET(bp->b_flags, B_READ)) {
            if (bp->b_rcred == NOCRED && cred != NOCRED) {
                /*
                 * NFS has embedded ucred.
                 * Can not crhold() here as that causes zone corruption
                 */
                bp->b_rcred = crdup(cred);
            }
        } else {
            SET(bp->b_flags, B_WRITEINPROG);
            if (bp->b_wcred == NOCRED && cred != NOCRED) {
                /*
                 * NFS has embedded ucred.
                 * Can not crhold() here as that causes zone corruption
                 */
                bp->b_wcred = crdup(cred);
            }
        }

        TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, b_freelist);
        nmp->nm_bufqlen++;
        return (0);
    }

    /*
     * All the iods are busy on other mounts, so return EIO to
     * force the caller to process the i/o synchronously.
     */
    NFS_DPF(ASYNCIO, ("nfs_asyncio: no iods available, i/o is synchronous\n"));
    return (EIO);
}
/*
 * Do an I/O operation to/from a cache block. This may be called
 * synchronously or from an nfsiod.
 */
int
nfs_doio(bp, cr, p)
    register struct buf *bp;
    struct ucred *cr;
    struct proc *p;
{
    register struct uio *uiop;
    register struct vnode *vp;
    struct nfsnode *np;
    struct nfsmount *nmp;
    int error = 0, diff, len, iomode, must_commit = 0;
    struct uio uio;
    struct iovec io;

    vp = bp->b_vp;
    NFSTRACE(NFSTRC_DIO, vp);
    np = VTONFS(vp);
    nmp = VFSTONFS(vp->v_mount);
    uiop = &uio;
    uiop->uio_iov = &io;
    uiop->uio_iovcnt = 1;
    uiop->uio_segflg = UIO_SYSSPACE;
    uiop->uio_procp = p;

    /*
     * With UBC, getblk() can return a buf with B_DONE set.
     * This indicates that the VM has valid data for that page.
     * NFS being stateless, this case poses a problem.
     * By definition, the NFS server should always be consulted
     * for the data in that page.
     * So we choose to clear the B_DONE and to do the IO.
     *
     * XXX revisit this if there is a performance issue.
     * XXX In that case, we could play the attribute cache games ...
     */
    if (ISSET(bp->b_flags, B_DONE)) {
        if (!ISSET(bp->b_flags, B_ASYNC))
            panic("nfs_doio: done and not async");
        CLR(bp->b_flags, B_DONE);
    }

    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 256)) | DBG_FUNC_START,
        (int)np->n_size, bp->b_blkno * DEV_BSIZE, bp->b_bcount, bp->b_flags, 0);
    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 257)) | DBG_FUNC_NONE,
        bp->b_validoff, bp->b_validend, bp->b_dirtyoff, bp->b_dirtyend, 0);

    /*
     * Historically, paging was done with physio, but no more.
     */
    if (ISSET(bp->b_flags, B_PHYS)) {
        /*
         * ...though reading /dev/drum still gets us here.
         */
        io.iov_len = uiop->uio_resid = bp->b_bcount;
        /* mapping was done by vmapbuf() */
        io.iov_base = bp->b_data;
        uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE;
        if (ISSET(bp->b_flags, B_READ)) {
            uiop->uio_rw = UIO_READ;
            nfsstats.read_physios++;
            error = nfs_readrpc(vp, uiop, cr);
        } else {
            int com;

            iomode = NFSV3WRITE_DATASYNC;
            uiop->uio_rw = UIO_WRITE;
            nfsstats.write_physios++;
            error = nfs_writerpc(vp, uiop, cr, &iomode, &com);
        }
        if (error) {
            SET(bp->b_flags, B_ERROR);
            bp->b_error = error;
        }
    } else if (ISSET(bp->b_flags, B_READ)) {
        io.iov_len = uiop->uio_resid = bp->b_bcount;
        io.iov_base = bp->b_data;
        uiop->uio_rw = UIO_READ;
        switch (vp->v_type) {
        case VREG:
            uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE;
            nfsstats.read_bios++;
            error = nfs_readrpc(vp, uiop, cr);

            KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 262)) | DBG_FUNC_NONE,
                (int)np->n_size, bp->b_blkno * DEV_BSIZE, uiop->uio_resid,
                error, 0);

            if (!error) {
                bp->b_validoff = 0;
                if (uiop->uio_resid) {
                    /*
                     * If len > 0, there is a hole in the file and
                     * no writes after the hole have been pushed to
                     * the server yet.
                     * Just zero fill the rest of the valid area.
                     */
                    diff = bp->b_bcount - uiop->uio_resid;
                    len = np->n_size - (((u_quad_t)bp->b_blkno) * DEV_BSIZE
                        + diff);
                    if (len > 0) {
                        len = min(len, uiop->uio_resid);
                        bzero((char *)bp->b_data + diff, len);
                        bp->b_validend = diff + len;

                        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 258)) | DBG_FUNC_NONE,
                            diff, len, 0, 1, 0);
                    } else
                        bp->b_validend = diff;
                } else
                    bp->b_validend = bp->b_bcount;
#if 1 /* USV + JOE [ */
                if (bp->b_validend < bp->b_bufsize) {
                    /*
                     * we're about to release a partial buffer after a
                     * read... the only way we should get here is if this
                     * buffer contains the EOF. Before releasing it, we'll
                     * zero out to the end of the buffer so that if a mmap
                     * of this page occurs, we'll see zero's even if a
                     * ftruncate extends the file in the meantime
                     */
                    bzero((caddr_t)(bp->b_data + bp->b_validend),
                        (bp->b_bufsize - bp->b_validend));

                    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 258)) | DBG_FUNC_NONE,
                        bp->b_validend, (bp->b_bufsize - bp->b_validend),
                        0, 2, 0);
                }
#endif /* ] USV + JOE */
            }
            if (p && (vp->v_flag & VTEXT) &&
                (((nmp->nm_flag & NFSMNT_NQNFS) &&
                  NQNFS_CKINVALID(vp, np, ND_READ) &&
                  np->n_lrev != np->n_brev) ||
                 (!(nmp->nm_flag & NFSMNT_NQNFS) &&
                  np->n_mtime != np->n_vattr.va_mtime.tv_sec))) {
                uprintf("Process killed due to text file modification\n");
                psignal(p, SIGKILL);
                p->p_flag |= P_NOSWAP;
            }
            break;
        case VLNK:
            uiop->uio_offset = (off_t)0;
            nfsstats.readlink_bios++;
            error = nfs_readlinkrpc(vp, uiop, cr);
            break;
        case VDIR:
            nfsstats.readdir_bios++;
            uiop->uio_offset = ((u_quad_t)bp->b_lblkno) * NFS_DIRBLKSIZ;
            if (!(nmp->nm_flag & NFSMNT_NFSV3))
                nmp->nm_flag &= ~NFSMNT_RDIRPLUS;	/* dk@farm.org */
            if (nmp->nm_flag & NFSMNT_RDIRPLUS) {
                error = nfs_readdirplusrpc(vp, uiop, cr);
                if (error == NFSERR_NOTSUPP)
                    nmp->nm_flag &= ~NFSMNT_RDIRPLUS;
            }
            if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0)
                error = nfs_readdirrpc(vp, uiop, cr);
            break;
        default:
            printf("nfs_doio: type %x unexpected\n", vp->v_type);
            break;
        };
        if (error) {
            SET(bp->b_flags, B_ERROR);
            bp->b_error = error;
        }
    } else {
        if (((bp->b_blkno * DEV_BSIZE) + bp->b_dirtyend) > np->n_size)
            bp->b_dirtyend = np->n_size - (bp->b_blkno * DEV_BSIZE);

        if (bp->b_dirtyend > bp->b_dirtyoff) {

            io.iov_len = uiop->uio_resid = bp->b_dirtyend
                - bp->b_dirtyoff;
            uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE
                + bp->b_dirtyoff;
            io.iov_base = (char *)bp->b_data + bp->b_dirtyoff;
            uiop->uio_rw = UIO_WRITE;

            nfsstats.write_bios++;
            if ((bp->b_flags & (B_ASYNC | B_NEEDCOMMIT | B_NOCACHE)) == B_ASYNC)
                iomode = NFSV3WRITE_UNSTABLE;
            else
                iomode = NFSV3WRITE_FILESYNC;
            SET(bp->b_flags, B_WRITEINPROG);
            error = nfs_writerpc(vp, uiop, cr, &iomode, &must_commit);
            if (!error && iomode == NFSV3WRITE_UNSTABLE)
                SET(bp->b_flags, B_NEEDCOMMIT);
            else
                CLR(bp->b_flags, B_NEEDCOMMIT);
            CLR(bp->b_flags, B_WRITEINPROG);

            /*
             * For an interrupted write, the buffer is still valid
             * and the write hasn't been pushed to the server yet,
             * so we can't set B_ERROR and report the interruption
             * by setting B_EINTR. For the B_ASYNC case, B_EINTR
             * is not relevant, so the rpc attempt is essentially
             * a noop.  For the case of a V3 write rpc not being
             * committed to stable storage, the block is still
             * dirty and requires either a commit rpc or another
             * write rpc with iomode == NFSV3WRITE_FILESYNC before
             * the block is reused. This is indicated by setting
             * the B_DELWRI and B_NEEDCOMMIT flags.
             */
            if (error == EINTR
                || (!error && (bp->b_flags & B_NEEDCOMMIT))) {
                int s;

                CLR(bp->b_flags, (B_INVAL|B_NOCACHE));
                SET(bp->b_flags, B_DELWRI);

                KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 261)) | DBG_FUNC_NONE,
                    bp->b_validoff, bp->b_validend, bp->b_bufsize,
                    bp->b_bcount, 0);
                /*
                 * Since for the B_ASYNC case, nfs_bwrite() has reassigned the
                 * buffer to the clean list, we have to reassign it back to the
                 * dirty one. Ugh.
                 */
                if (ISSET(bp->b_flags, B_ASYNC)) {
                    s = splbio();
                    reassignbuf(bp, vp);
                    splx(s);
                } else {
                    SET(bp->b_flags, B_EINTR);
                }
            } else {
                if (error) {
                    SET(bp->b_flags, B_ERROR);
                    bp->b_error = np->n_error = error;
                    np->n_flag |= NWRITEERR;
                }
                bp->b_dirtyoff = bp->b_dirtyend = 0;

                /*
                 * validoff and validend represent the real data present
                 * in this buffer.
                 * if validoff is non-zero, then we have to invalidate the
                 * buffer and kill the page when biodone is called... the
                 * same is also true when validend doesn't extend all the
                 * way to the end of the buffer and validend doesn't
                 * equate to the current EOF... eventually we need to deal
                 * with this in a more humane way (like keeping the partial
                 * buffer without making it immediately available to the
                 * VM page cache).
                 */
                if (bp->b_validoff)
                    SET(bp->b_flags, B_INVAL);
                else if (bp->b_validend < bp->b_bufsize) {
                    if ((((off_t)bp->b_blkno * (off_t)DEV_BSIZE) +
                        bp->b_validend) == np->n_size) {
                        bzero((caddr_t)(bp->b_data + bp->b_validend),
                            (bp->b_bufsize - bp->b_validend));

                        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 259)) | DBG_FUNC_NONE,
                            bp->b_validend, (bp->b_bufsize - bp->b_validend),
                            0, 0, 0);
                    } else
                        SET(bp->b_flags, B_INVAL);
                }
            }

        } else {
            if (bp->b_validoff)
                SET(bp->b_flags, B_INVAL);
            else if (bp->b_validend < bp->b_bufsize) {
                if ((((off_t)bp->b_blkno * (off_t)DEV_BSIZE) +
                    bp->b_validend) != np->n_size)
                    SET(bp->b_flags, B_INVAL);
            }
            if (bp->b_flags & B_INVAL) {
                KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 260)) | DBG_FUNC_NONE,
                    bp->b_validoff, bp->b_validend, bp->b_bufsize,
                    bp->b_bcount, 0);
            }
            bp->b_resid = 0;
            biodone(bp);
            NFSTRACE(NFSTRC_DIO_DONE, vp);
            return (0);
        }
    }
    bp->b_resid = uiop->uio_resid;
    if (must_commit)
        nfs_clearcommit(vp->v_mount);

    if (bp->b_flags & B_INVAL) {
        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 260)) | DBG_FUNC_NONE,
            bp->b_validoff, bp->b_validend, bp->b_bufsize, bp->b_bcount, 0);
    }
    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 256)) | DBG_FUNC_END,
        bp->b_validoff, bp->b_validend, bp->b_bcount, error, 0);

    biodone(bp);
    NFSTRACE(NFSTRC_DIO_DONE, vp);
    return (error);
}
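/*
 * Editor's note (not part of the original file): after a short read,
 * nfs_doio computes how many bytes actually arrived (diff) and how many
 * valid bytes remain before EOF (len), zero-filling holes and setting
 * b_validend accordingly.  The standalone sketch below walks that
 * computation with made-up sizes; all values are assumptions for
 * illustration.
 */
#if 0 /* illustrative sketch */
#include <stdio.h>

int
main(void)
{
    long long n_size = 6000;    /* assumed file size */
    long long blk_base = 4096;  /* b_blkno * DEV_BSIZE for this buffer */
    int bcount = 4096;          /* buffer size */
    int resid = 2192;           /* bytes the read RPC did not return */
    int diff = bcount - resid;  /* bytes actually read: 1904 */
    long long len = n_size - (blk_base + diff); /* valid bytes left */

    if (resid && len > 0) {
        /* A hole: zero-fill the remainder of the valid area. */
        if (len > resid)
            len = resid;
        printf("zero-fill [%d,%lld), validend=%lld\n",
            diff, diff + len, diff + len);
    } else {
        /* EOF (len <= 0) or full read: valid data ends at diff. */
        printf("validend=%d\n", diff);
    }
    return 0;
}
#endif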