/*
 * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * Copyright (c) 1999-2003 Apple Computer, Inc.  All Rights Reserved.
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)nfs_bio.c	8.9 (Berkeley) 3/30/95
 * FreeBSD-Id: nfs_bio.c,v 1.44 1997/09/10 19:52:25 phk Exp $
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/ubc.h>

#include <sys/vm.h>
#include <sys/vmparam.h>

#include <sys/time.h>
#include <kern/clock.h>

#include <nfs/rpcv2.h>
#include <nfs/nfsproto.h>
#include <nfs/nfs.h>
#include <nfs/nfsmount.h>
#include <nfs/nqnfs.h>
#include <nfs/nfsnode.h>

#include <sys/kdebug.h>
#define FSDBG(A, B, C, D, E) \
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_NONE, \
		     (int)(B), (int)(C), (int)(D), (int)(E), 0)
#define FSDBG_TOP(A, B, C, D, E) \
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_START, \
		     (int)(B), (int)(C), (int)(D), (int)(E), 0)
#define FSDBG_BOT(A, B, C, D, E) \
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_END, \
		     (int)(B), (int)(C), (int)(D), (int)(E), 0)
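/*
 * FSDBG_TOP/FSDBG_BOT bracket an operation in the kdebug trace stream
 * (DBG_FUNC_START/DBG_FUNC_END) while plain FSDBG marks intermediate
 * points; nfs_doio below, for example, logs FSDBG_TOP(256, ...) on entry
 * and FSDBG_BOT(256, ...) on each exit path.
 */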
static struct buf *nfs_getcacheblk __P((struct vnode *vp, daddr_t bn, int size,
		struct proc *p, int operation));

extern int nfs_numasync;
extern struct nfsstats nfsstats;
/*
 * Vnode op for read using bio
 * Any similarity to readip() is purely coincidental
 */
int
nfs_bioread(vp, uio, ioflag, cred, getpages)
	register struct vnode *vp;
	register struct uio *uio;
	int ioflag;
	struct ucred *cred;
	int getpages;
{
	register struct nfsnode *np = VTONFS(vp);
	register int biosize, i;
	off_t diff;
	struct buf *bp = 0, *rabp;
	struct vattr vattr;
	struct proc *p;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	daddr_t lbn, rabn;
	int bufsize;
	int nra, error = 0, n = 0, on = 0, not_readin;
	int operation = (getpages ? BLK_PAGEIN : BLK_READ);

#if DIAGNOSTIC
	if (uio->uio_rw != UIO_READ)
		panic("nfs_read mode");
#endif
	if (uio->uio_resid == 0)
		return (0);
	if (uio->uio_offset < 0)
		return (EINVAL);
	p = uio->uio_procp;
	if ((nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_GOTFSINFO)) == NFSMNT_NFSV3)
		(void)nfs_fsinfo(nmp, vp, cred, p);
	/* due to getblk/vm interactions, use vm page size or less values */
	biosize = min(vp->v_mount->mnt_stat.f_iosize, PAGE_SIZE);
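	/*
	 * Example: with f_iosize = 8192 and a 4K PAGE_SIZE this clamps
	 * biosize to 4096, so each cache block covers exactly one VM page.
	 */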
	/*
	 * For nfs, cache consistency can only be maintained approximately.
	 * Although RFC1094 does not specify the criteria, the following is
	 * believed to be compatible with the reference port.
	 * For nqnfs, full cache consistency is maintained within the loop.
	 * For nfs:
	 * If the file's modify time on the server has changed since the
	 * last read rpc or you have written to the file,
	 * you may have lost data cache consistency with the
	 * server, so flush all of the file's data out of the cache.
	 * Then force a getattr rpc to ensure that you have up to date
	 * attributes.
	 * NB: This implies that cache data can be read when up to
	 * NFS_ATTRTIMEO seconds out of date. If you find that you need
	 * current attributes this could be forced by setting n_attrstamp
	 * to 0 before the VOP_GETATTR() call.
	 */
	if ((nmp->nm_flag & NFSMNT_NQNFS) == 0) {
		if (np->n_flag & NMODIFIED) {
			if (vp->v_type != VREG) {
				if (vp->v_type != VDIR)
					panic("nfs: bioread, not dir");
				nfs_invaldir(vp);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
			}
			np->n_attrstamp = 0;
			error = VOP_GETATTR(vp, &vattr, cred, p);
			if (error)
				return (error);
			np->n_mtime = vattr.va_mtime.tv_sec;
		} else {
			error = VOP_GETATTR(vp, &vattr, cred, p);
			if (error)
				return (error);
			if (np->n_mtime != vattr.va_mtime.tv_sec) {
				if (vp->v_type == VDIR)
					nfs_invaldir(vp);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
				np->n_mtime = vattr.va_mtime.tv_sec;
			}
		}
	}
	do {
	    /*
	     * Get a valid lease. If cached data is stale, flush it.
	     */
	    if (nmp->nm_flag & NFSMNT_NQNFS) {
		if (NQNFS_CKINVALID(vp, np, ND_READ)) {
		    do {
			error = nqnfs_getlease(vp, ND_READ, cred, p);
		    } while (error == NQNFS_EXPIRED);
		    if (error)
			return (error);
		    if (np->n_lrev != np->n_brev ||
			(np->n_flag & NQNFSNONCACHE) ||
			((np->n_flag & NMODIFIED) && vp->v_type == VDIR)) {
			if (vp->v_type == VDIR)
			    nfs_invaldir(vp);
			error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
			if (error)
			    return (error);
			np->n_brev = np->n_lrev;
		    }
		} else if (vp->v_type == VDIR && (np->n_flag & NMODIFIED)) {
		    nfs_invaldir(vp);
		    error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
		    if (error)
			return (error);
		}
	    }
	    if (np->n_flag & NQNFSNONCACHE) {
		switch (vp->v_type) {
		case VREG:
			return (nfs_readrpc(vp, uio, cred));
		case VLNK:
			return (nfs_readlinkrpc(vp, uio, cred));
		case VDIR:
			break;
		default:
			printf(" NQNFSNONCACHE: type %x unexpected\n",
			       vp->v_type);
		};
	    }
	    switch (vp->v_type) {
	    case VREG:
		nfsstats.biocache_reads++;
		lbn = uio->uio_offset / biosize;
		on = uio->uio_offset & (biosize - 1);
		not_readin = 1;
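		/*
		 * Example: with biosize = 4096 and uio_offset = 10000,
		 * lbn = 10000 / 4096 = 2 and on = 10000 & 4095 = 1808,
		 * i.e. the transfer starts 1808 bytes into logical block 2.
		 */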
		/*
		 * Start the read ahead(s), as required.
		 */
		if (nfs_numasync > 0 && nmp->nm_readahead > 0) {
		    for (nra = 0; nra < nmp->nm_readahead &&
			 (off_t)(lbn + 1 + nra) * biosize < np->n_size;
			 nra++) {
			rabn = lbn + 1 + nra;
			if (!incore(vp, rabn)) {
			    rabp = nfs_getcacheblk(vp, rabn, biosize, p,
						   operation);
			    if (!rabp)
				return (EINTR);
			    if (!ISSET(rabp->b_flags, (B_CACHE|B_DELWRI))) {
				SET(rabp->b_flags, (B_READ | B_ASYNC));
				if (nfs_asyncio(rabp, cred)) {
				    SET(rabp->b_flags, (B_INVAL|B_ERROR));
				    rabp->b_error = EIO;
				    brelse(rabp);
				}
			    } else
				brelse(rabp);
			}
		    }
		}
		/*
		 * If the block is in the cache and has the required data
		 * in a valid region, just copy it out.
		 * Otherwise, get the block and write back/read in,
		 * as required.
		 */
again:
		bufsize = biosize;
		if ((off_t)(lbn + 1) * biosize > np->n_size &&
		    (off_t)(lbn + 1) * biosize - np->n_size < biosize) {
			bufsize = np->n_size - (off_t)lbn * biosize;
			bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
		}
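		/*
		 * Example: with np->n_size = 10000 and lbn = 2 (biosize
		 * 4096), the last block holds 10000 - 8192 = 1808 bytes and
		 * the masking above rounds bufsize up to the next DEV_BSIZE
		 * (512) multiple: 2048.
		 */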
		bp = nfs_getcacheblk(vp, lbn, bufsize, p, operation);
		if (!bp)
			return (EINTR);

		if (!ISSET(bp->b_flags, B_CACHE)) {
			SET(bp->b_flags, B_READ);
			CLR(bp->b_flags, (B_DONE | B_ERROR | B_INVAL));
			not_readin = 0;
			error = nfs_doio(bp, cred, p);
			if (error) {
				brelse(bp);
				return (error);
			}
		}
		n = min((unsigned)(bufsize - on), uio->uio_resid);
		diff = np->n_size - uio->uio_offset;
		if (diff < n)
			n = diff;
		if (not_readin && n > 0) {
			if (on < bp->b_validoff || (on + n) > bp->b_validend) {
				SET(bp->b_flags, (B_NOCACHE|B_INVAFTERWRITE));
				if (bp->b_dirtyend > 0) {
					if (!ISSET(bp->b_flags, B_DELWRI))
						panic("nfsbioread");
					if (VOP_BWRITE(bp) == EINTR)
						return (EINTR);
				} else
					brelse(bp);
				goto again;
			}
		}
		diff = (on >= bp->b_validend) ? 0 : (bp->b_validend - on);
		if (diff < n)
			n = diff;
		break;
	    case VLNK:
		nfsstats.biocache_readlinks++;
		bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, p,
				     operation);
		if (!bp)
			return (EINTR);
		if (!ISSET(bp->b_flags, B_CACHE)) {
			SET(bp->b_flags, B_READ);
			error = nfs_doio(bp, cred, p);
			if (error) {
				SET(bp->b_flags, B_ERROR);
				brelse(bp);
				return (error);
			}
		}
		n = min(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid);
		on = 0;
		break;
	    case VDIR:
		nfsstats.biocache_readdirs++;
		if (np->n_direofoffset
		    && uio->uio_offset >= np->n_direofoffset) {
			return (0);
		}
		lbn = uio->uio_offset / NFS_DIRBLKSIZ;
		on = uio->uio_offset & (NFS_DIRBLKSIZ - 1);
		bp = nfs_getcacheblk(vp, lbn, NFS_DIRBLKSIZ, p, operation);
		if (!bp)
		    return (EINTR);
		if (!ISSET(bp->b_flags, B_CACHE)) {
		    SET(bp->b_flags, B_READ);
		    error = nfs_doio(bp, cred, p);
		    if (error)
			brelse(bp);
		    while (error == NFSERR_BAD_COOKIE) {
			nfs_invaldir(vp);
			error = nfs_vinvalbuf(vp, 0, cred, p, 1);
			/*
			 * Yuck! The directory has been modified on the
			 * server. The only way to get the block is by
			 * reading from the beginning to get all the
			 * offset cookies.
			 */
			for (i = 0; i <= lbn && !error; i++) {
			    if (np->n_direofoffset
				&& (i * NFS_DIRBLKSIZ) >= np->n_direofoffset)
				    return (0);
			    bp = nfs_getcacheblk(vp, i, NFS_DIRBLKSIZ, p,
						 operation);
			    if (!bp)
				return (EINTR);
			    if (!ISSET(bp->b_flags, B_CACHE)) {
				SET(bp->b_flags, B_READ);
				error = nfs_doio(bp, cred, p);
				/*
				 * no error + B_INVAL == directory EOF,
				 * use the block.
				 */
				if (error == 0 && (bp->b_flags & B_INVAL))
					break;
			    }
			    /*
			     * An error will throw away the block and the
			     * for loop will break out.  If no error and this
			     * is not the block we want, we throw away the
			     * block and go for the next one via the for loop.
			     */
			    if (error || i < lbn)
				    brelse(bp);
			}
		    }
		    /*
		     * The above while is repeated if we hit another cookie
		     * error.  If we hit an error and it wasn't a cookie error,
		     * we give up.
		     */
		    if (error)
			    return (error);
		}
		/*
		 * If not eof and read aheads are enabled, start one.
		 * (You need the current block first, so that you have the
		 *  directory offset cookie of the next block.)
		 */
		if (nfs_numasync > 0 && nmp->nm_readahead > 0 &&
		    (np->n_direofoffset == 0 ||
		     (lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) &&
		    !(np->n_flag & NQNFSNONCACHE) &&
		    !incore(vp, lbn + 1)) {
			rabp = nfs_getcacheblk(vp, lbn + 1, NFS_DIRBLKSIZ, p,
					       operation);
			if (rabp) {
			    if (!ISSET(rabp->b_flags, (B_CACHE|B_DELWRI))) {
				SET(rabp->b_flags, (B_READ | B_ASYNC));
				if (nfs_asyncio(rabp, cred)) {
				    SET(rabp->b_flags, (B_INVAL|B_ERROR));
				    rabp->b_error = EIO;
				    brelse(rabp);
				}
			    } else {
				brelse(rabp);
			    }
			}
		}
		/*
		 * Make sure we use a signed variant of min() since
		 * the second term may be negative.
		 */
		n = lmin(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid - on);
		/*
		 * Unlike VREG files, whose buffer size ( bp->b_bcount ) is
		 * chopped for the EOF condition, we cannot tell how large
		 * NFS directories are going to be until we hit EOF.  So
		 * an NFS directory buffer is *not* chopped to its EOF.  Now,
		 * it just so happens that b_resid will effectively chop it
		 * to EOF.  *BUT* this information is lost if the buffer goes
		 * away and is reconstituted into a B_CACHE state (recovered
		 * from VM) later.  So we keep track of the directory eof
		 * in np->n_direofoffset and chop it off as an extra step
		 * right here.
		 */
		if (np->n_direofoffset &&
		    n > np->n_direofoffset - uio->uio_offset)
			n = np->n_direofoffset - uio->uio_offset;
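		/*
		 * Example: if the directory EOF was recorded at
		 * n_direofoffset = 5000 and uio_offset = 4096, n is chopped
		 * to 5000 - 4096 = 904 bytes even when the cached directory
		 * block is larger.
		 */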
		break;
	    default:
		printf(" nfs_bioread: type %x unexpected\n", vp->v_type);
		break;
	    };

	    if (n > 0)
		error = uiomove(bp->b_data + on, (int)n, uio);
	    switch (vp->v_type) {
	    case VREG:
		break;
	    case VLNK:
		n = 0;
		break;
	    case VDIR:
		if (np->n_flag & NQNFSNONCACHE)
			SET(bp->b_flags, B_INVAL);
		break;
	    default:
		printf(" nfs_bioread: type %x unexpected\n", vp->v_type);
	    }
	    brelse(bp);
	} while (error == 0 && uio->uio_resid > 0 && n > 0);
	return (error);
}
/*
 * Vnode op for write using bio
 */
int
nfs_write(ap)
	struct vop_write_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		struct ucred *a_cred;
	} */ *ap;
{
	register int biosize;
	register struct uio *uio = ap->a_uio;
	struct proc *p = uio->uio_procp;
	register struct vnode *vp = ap->a_vp;
	struct nfsnode *np = VTONFS(vp);
	register struct ucred *cred = ap->a_cred;
	int ioflag = ap->a_ioflag;
	struct buf *bp;
	struct vattr vattr;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	daddr_t lbn;
	int bufsize;
	int n, on, error = 0, iomode, must_commit;
	off_t boff, start, end;
	struct iovec iov;
	struct uio auio;
#if DIAGNOSTIC
	if (uio->uio_rw != UIO_WRITE)
		panic("nfs_write mode");
	if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != current_proc())
		panic("nfs_write proc");
#endif
	if (vp->v_type != VREG)
		return (EIO);
	if (np->n_flag & NWRITEERR) {
		np->n_flag &= ~NWRITEERR;
		return (np->n_error);
	}
	if ((nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_GOTFSINFO)) == NFSMNT_NFSV3)
		(void)nfs_fsinfo(nmp, vp, cred, p);
	if (ioflag & (IO_APPEND | IO_SYNC)) {
		if (np->n_flag & NMODIFIED) {
			np->n_attrstamp = 0;
			error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
			if (error)
				return (error);
		}
		if (ioflag & IO_APPEND) {
			np->n_attrstamp = 0;
			error = VOP_GETATTR(vp, &vattr, cred, p);
			if (error)
				return (error);
			uio->uio_offset = np->n_size;
		}
	}
	if (uio->uio_offset < 0)
		return (EINVAL);
	if (uio->uio_resid == 0)
		return (0);
	/*
	 * Maybe this should be above the vnode op call, but so long as
	 * file servers have no limits, I don't think it matters.
	 */
	if (p && uio->uio_offset + uio->uio_resid >
	      p->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
		psignal(p, SIGXFSZ);
		return (EFBIG);
	}
	/*
	 * I use nm_rsize, not nm_wsize so that all buffer cache blocks
	 * will be the same size within a filesystem. nfs_writerpc will
	 * still use nm_wsize when sizing the rpc's.
	 */
	/* due to getblk/vm interactions, use vm page size or less values */
	biosize = min(vp->v_mount->mnt_stat.f_iosize, PAGE_SIZE);
	do {
		/*
		 * Check for a valid write lease.
		 */
		if ((nmp->nm_flag & NFSMNT_NQNFS) &&
		    NQNFS_CKINVALID(vp, np, ND_WRITE)) {
			do {
				error = nqnfs_getlease(vp, ND_WRITE, cred, p);
			} while (error == NQNFS_EXPIRED);
			if (error)
				return (error);
			if (np->n_lrev != np->n_brev ||
			    (np->n_flag & NQNFSNONCACHE)) {
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
				np->n_brev = np->n_lrev;
			}
		}
		if ((np->n_flag & NQNFSNONCACHE) && uio->uio_iovcnt == 1) {
			iomode = NFSV3WRITE_FILESYNC;
			error = nfs_writerpc(vp, uio, cred, &iomode, &must_commit);
			if (must_commit)
				nfs_clearcommit(vp->v_mount);
			return (error);
		}
		nfsstats.biocache_writes++;
		lbn = uio->uio_offset / biosize;
		on = uio->uio_offset & (biosize - 1);
		n = min((unsigned)(biosize - on), uio->uio_resid);
again:
		bufsize = biosize;
		/* (removed for UBC) */
		if ((lbn + 1) * biosize > np->n_size) {
			bufsize = np->n_size - lbn * biosize;
			bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
		}
		/*
		 * Get a cache block for writing.  The range to be written is
		 * (off..off+len) within the block.  We ensure that the block
		 * either has no dirty region or that the given range is
		 * contiguous with the existing dirty region.
		 */
		bp = nfs_getcacheblk(vp, lbn, bufsize, p, BLK_WRITE);
		if (!bp)
			return (EINTR);
		/*
		 * Resize nfsnode *after* we busy the buffer to prevent
		 * readers from reading garbage.
		 * If there was a partial buf at the old eof, validate
		 * and zero the new bytes.
		 */
		if (uio->uio_offset + n > np->n_size) {
			struct buf *bp0 = NULL;
			daddr_t bn = np->n_size / biosize;
			int off = np->n_size & (biosize - 1);

			if (off && bn < lbn && incore(vp, bn))
				bp0 = nfs_getcacheblk(vp, bn, biosize, p,
						      BLK_WRITE);
			np->n_flag |= NMODIFIED;
			np->n_size = uio->uio_offset + n;
			ubc_setsize(vp, (off_t)np->n_size); /* XXX errors */
			if (bp0) {
				bzero((char *)bp0->b_data + off, biosize - off);
				bp0->b_validend = biosize;
				brelse(bp0);
			}
		}
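		/*
		 * Example: growing a file whose old n_size was 10000
		 * (biosize 4096) gives bn = 2, off = 1808; the stale tail
		 * of block 2 (bytes 1808..4095) is zeroed and the whole
		 * block marked valid before the new EOF becomes visible.
		 */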
		/*
		 * NFS has embedded ucred so crhold() risks zone corruption
		 */
		if (bp->b_wcred == NOCRED)
			bp->b_wcred = crdup(cred);
		/*
		 * If dirtyend exceeds file size, chop it down.  This should
		 * not occur unless there is a race.
		 */
		if ((off_t)bp->b_blkno * DEV_BSIZE + bp->b_dirtyend >
		    np->n_size)
			bp->b_dirtyend = np->n_size - (off_t)bp->b_blkno *
					 DEV_BSIZE;
		/*
		 * UBC doesn't (yet) handle partial pages so nfs_biowrite was
		 * hacked to never bdwrite, to start every little write right
		 * away.  Running IE Avie noticed the performance problem, thus
		 * this code, which permits those delayed writes by ensuring an
		 * initial read of the entire page.  The read may hit eof
		 * ("short read") but that we will handle.
		 *
		 * We are quite dependent on the correctness of B_CACHE so
		 * check that first in case of problems.
		 */
		if (!ISSET(bp->b_flags, B_CACHE) && n < PAGE_SIZE) {
			boff = (off_t)bp->b_blkno * DEV_BSIZE;
			auio.uio_iov = &iov;
			auio.uio_iovcnt = 1;
			auio.uio_offset = boff;
			auio.uio_resid = PAGE_SIZE;
			auio.uio_segflg = UIO_SYSSPACE;
			auio.uio_rw = UIO_READ;
			auio.uio_procp = p;
			iov.iov_base = bp->b_data;
			iov.iov_len = PAGE_SIZE;
			error = nfs_readrpc(vp, &auio, cred);
			if (error) {
				bp->b_error = error;
				SET(bp->b_flags, B_ERROR);
				printf("nfs_write: readrpc %d", error);
			}
			if (auio.uio_resid > 0)
				bzero(iov.iov_base, auio.uio_resid);
			bp->b_validoff = 0;
			bp->b_validend = PAGE_SIZE - auio.uio_resid;
			if (np->n_size > boff + bp->b_validend)
				bp->b_validend = min(np->n_size - boff,
						     PAGE_SIZE);
		}
		/*
		 * If the new write will leave a contiguous dirty
		 * area, just update the b_dirtyoff and b_dirtyend,
		 * otherwise try to extend the dirty region.
		 */
		if (bp->b_dirtyend > 0 &&
		    (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) {
			boff = (off_t)bp->b_blkno * DEV_BSIZE;
			if (on > bp->b_dirtyend) {
				start = boff + bp->b_validend;
				end = boff + on;
			} else {
				start = boff + on + n;
				end = boff + bp->b_validoff;
			}

			/*
			 * It may be that the valid region in the buffer
			 * covers the region we want, in which case just
			 * extend the dirty region.  Otherwise we try to
			 * extend the valid region.
			 */
			if (end > start) {
				auio.uio_iov = &iov;
				auio.uio_iovcnt = 1;
				auio.uio_offset = start;
				auio.uio_resid = end - start;
				auio.uio_segflg = UIO_SYSSPACE;
				auio.uio_rw = UIO_READ;
				auio.uio_procp = p;
				iov.iov_base = bp->b_data + (start - boff);
				iov.iov_len = end - start;
				error = nfs_readrpc(vp, &auio, cred);
				/*
				 * If we couldn't read, do not do a VOP_BWRITE
				 * as originally coded. That could also error
				 * and looping back to "again" as it was doing
				 * could have us stuck trying to write the same
				 * buf again.  nfs_write will get the entire
				 * region if nfs_readrpc succeeded; if it was
				 * unsuccessful we should just error out.
				 * Errors like ESTALE would keep us looping
				 * rather than transient errors justifying a
				 * retry.  We can return here instead of
				 * altering the dirty region later.  We did not
				 * write the old dirty region at this point.
				 */
				if (error) {
					bp->b_error = error;
					SET(bp->b_flags, B_ERROR);
					printf("nfs_write: readrpc2 %d", error);
					brelse(bp);
					return (error);
				}
				/*
				 * The read worked.
				 * If there was a short read, just zero fill.
				 */
				if (auio.uio_resid > 0)
					bzero(iov.iov_base, auio.uio_resid);
				if (on > bp->b_dirtyend)
					bp->b_validend = on;
				else
					bp->b_validoff = on + n;
			}
			/*
			 * We now have a valid region which extends up to the
			 * dirty region which we want.
			 */
			if (on > bp->b_dirtyend)
				bp->b_dirtyend = on;
			else
				bp->b_dirtyoff = on + n;
		}
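		/*
		 * Example: with an existing dirty region [512, 1024) and a
		 * new write at on = 2048, n = 512, the gap up to the write
		 * is first made valid (read or zero filled) and b_dirtyend
		 * is advanced to on, so the update below leaves one
		 * contiguous dirty extent [512, 2560).
		 */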
		if (ISSET(bp->b_flags, B_ERROR)) {
			error = (bp->b_error ? bp->b_error : EIO);
			brelse(bp);
			return (error);
		}

		/*
		 * NFS has embedded ucred so crhold() risks zone corruption
		 */
		if (bp->b_wcred == NOCRED)
			bp->b_wcred = crdup(cred);
		np->n_flag |= NMODIFIED;
		/*
		 * Check for valid write lease and get one as required.
		 * In case getblk() and/or bwrite() delayed us.
		 */
		if ((nmp->nm_flag & NFSMNT_NQNFS) &&
		    NQNFS_CKINVALID(vp, np, ND_WRITE)) {
			do {
				error = nqnfs_getlease(vp, ND_WRITE, cred, p);
			} while (error == NQNFS_EXPIRED);
			if (error) {
				brelse(bp);
				return (error);
			}
			if (np->n_lrev != np->n_brev ||
			    (np->n_flag & NQNFSNONCACHE)) {
				brelse(bp);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
				np->n_brev = np->n_lrev;
				goto again;
			}
		}
		error = uiomove((char *)bp->b_data + on, n, uio);
		if (error) {
			SET(bp->b_flags, B_ERROR);
			brelse(bp);
			return (error);
		}
		if (bp->b_dirtyend > 0) {
			bp->b_dirtyoff = min(on, bp->b_dirtyoff);
			bp->b_dirtyend = max((on + n), bp->b_dirtyend);
		} else {
			bp->b_dirtyoff = on;
			bp->b_dirtyend = on + n;
		}
		if (bp->b_validend == 0 || bp->b_validend < bp->b_dirtyoff ||
		    bp->b_validoff > bp->b_dirtyend) {
			bp->b_validoff = bp->b_dirtyoff;
			bp->b_validend = bp->b_dirtyend;
		} else {
			bp->b_validoff = min(bp->b_validoff, bp->b_dirtyoff);
			bp->b_validend = max(bp->b_validend, bp->b_dirtyend);
		}
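		/*
		 * Example: a buffer with valid [0, 1024) that just dirtied
		 * [512, 2048) ends up with valid [0, 2048); had the old
		 * valid region been disjoint from the dirty one, the valid
		 * region would simply be reset to the dirty extent.
		 */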
		/*
		 * Since this block is being modified, it must be written
		 * again and not just committed.
		 */
		CLR(bp->b_flags, B_NEEDCOMMIT);

		/*
		 * If the lease is non-cachable or IO_SYNC do bwrite().
		 */
		if ((np->n_flag & NQNFSNONCACHE) || (ioflag & IO_SYNC)) {
			bp->b_proc = p;
			error = VOP_BWRITE(bp);
			if (error)
				return (error);
			if (np->n_flag & NQNFSNONCACHE) {
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
			}
		} else if ((n + on) == biosize &&
			   (nmp->nm_flag & NFSMNT_NQNFS) == 0) {
			bp->b_proc = (struct proc *)0;
			SET(bp->b_flags, B_ASYNC);
			(void)nfs_writebp(bp, 0);
		} else
			bdwrite(bp);
	} while (uio->uio_resid > 0 && n > 0);
	return (0);
}
/*
 * Get an nfs cache block.
 * Allocate a new one if the block isn't currently in the cache
 * and return the block marked busy. If the calling process is
 * interrupted by a signal for an interruptible mount point, return
 * NULL.
 */
static struct buf *
nfs_getcacheblk(vp, bn, size, p, operation)
	struct vnode *vp;
	daddr_t bn;
	int size;
	struct proc *p;
	int operation;	/* defined in sys/buf.h */
{
	register struct buf *bp;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	/* due to getblk/vm interactions, use vm page size or less values */
	int biosize = min(vp->v_mount->mnt_stat.f_iosize, PAGE_SIZE);

	if (nbdwrite > ((nbuf/4)*3) && operation == BLK_WRITE) {
#define __BUFFERS_RECLAIMED 2
		struct buf *tbp[__BUFFERS_RECLAIMED];
		int i;

		/* too many delayed writes, try to free up some buffers */
		for (i = 0; i < __BUFFERS_RECLAIMED; i++)
			tbp[i] = geteblk(512);

		/* Yield to IO thread */
		(void)tsleep((caddr_t)&nbdwrite, PCATCH, "nbdwrite", 1);

		for (i = (__BUFFERS_RECLAIMED - 1); i >= 0; i--)
			brelse(tbp[i]);
	}

	if (nmp->nm_flag & NFSMNT_INT) {
		bp = getblk(vp, bn, size, PCATCH, 0, operation);
		while (bp == (struct buf *)0) {
			if (nfs_sigintr(nmp, (struct nfsreq *)0, p))
				return ((struct buf *)0);
			bp = getblk(vp, bn, size, 0, 2 * hz, operation);
		}
	} else
		bp = getblk(vp, bn, size, 0, 0, operation);

	if (vp->v_type == VREG)
		bp->b_blkno = ((off_t)bn * biosize) / DEV_BSIZE;

	return (bp);
}
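/*
 * Note on the b_blkno mapping above: for a VREG file with biosize = 4096,
 * logical block bn = 2 maps to b_blkno = (2 * 4096) / 512 = 16, since the
 * buffer layer addresses blocks in DEV_BSIZE (512-byte) units.
 */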
/*
 * Flush and invalidate all dirty buffers. If another process is already
 * doing the flush, just wait for completion.
 */
int
nfs_vinvalbuf(vp, flags, cred, p, intrflg)
	struct vnode *vp;
	int flags;
	struct ucred *cred;
	struct proc *p;
	int intrflg;
{
	register struct nfsnode *np = VTONFS(vp);
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	int error = 0, slpflag, slptimeo;
	int didhold = 0;

	if ((nmp->nm_flag & NFSMNT_INT) == 0)
		intrflg = 0;
	if (intrflg) {
		slpflag = PCATCH;
		slptimeo = 2 * hz;
	} else {
		slpflag = 0;
		slptimeo = 0;
	}
	/*
	 * First wait for any other process doing a flush to complete.
	 */
	while (np->n_flag & NFLUSHINPROG) {
		np->n_flag |= NFLUSHWANT;
		error = tsleep((caddr_t)&np->n_flag, PRIBIO + 2, "nfsvinval",
			       slptimeo);
		if (error && intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p))
			return (EINTR);
	}

	/*
	 * Now, flush as required.
	 */
	np->n_flag |= NFLUSHINPROG;
	error = vinvalbuf(vp, flags, cred, p, slpflag, 0);
	while (error) {
		/*
		 * We seem to be stuck in a loop here if the thread got
		 * aborted: nfs_flush will return EINTR.  Not sure if that
		 * will cause other consequences due to EINTR having other
		 * meanings in NFS.  With no dirty pages it seems safe to
		 * just return from here.  But if we did have dirty pages,
		 * how would we get them written out if the thread was
		 * aborted?  Some other strategy is needed.
		 */
		if ((intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p)) ||
		    (error == EINTR && current_thread_aborted())) {
			np->n_flag &= ~NFLUSHINPROG;
			if (np->n_flag & NFLUSHWANT) {
				np->n_flag &= ~NFLUSHWANT;
				wakeup((caddr_t)&np->n_flag);
			}
			return (EINTR);
		}
		error = vinvalbuf(vp, flags, cred, p, 0, slptimeo);
	}
	np->n_flag &= ~(NMODIFIED | NFLUSHINPROG);
	if (np->n_flag & NFLUSHWANT) {
		np->n_flag &= ~NFLUSHWANT;
		wakeup((caddr_t)&np->n_flag);
	}
	didhold = ubc_hold(vp);
	if (didhold) {
		(void) ubc_clean(vp, 1); /* get the pages out of vm also */
		ubc_rele(vp);
	}
	return (0);
}
/*
 * Initiate asynchronous I/O. Return an error if no nfsiods are available.
 * This is mainly to avoid queueing async I/O requests when the nfsiods
 * are all hung on a dead server.
 */
int
nfs_asyncio(bp, cred)
	register struct buf *bp;
	struct ucred *cred;
{
	struct nfsmount *nmp;
	int i;
	int gotiod;
	int slpflag = 0;
	int slptimeo = 0;
	int error;

	if (nfs_numasync == 0)
		return (EIO);

	nmp = VFSTONFS(bp->b_vp->v_mount);
again:
	if (nmp->nm_flag & NFSMNT_INT)
		slpflag = PCATCH;
	gotiod = FALSE;

	/*
	 * Find a free iod to process this request.
	 */
	for (i = 0; i < NFS_MAXASYNCDAEMON; i++)
		if (nfs_iodwant[i]) {
			/*
			 * Found one, so wake it up and tell it which
			 * mount to process.
			 */
			NFS_DPF(ASYNCIO,
				("nfs_asyncio: waking iod %d for mount %p\n",
				 i, nmp));
			nfs_iodwant[i] = (struct proc *)0;
			nfs_iodmount[i] = nmp;
			nmp->nm_bufqiods++;
			wakeup((caddr_t)&nfs_iodwant[i]);
			gotiod = TRUE;
			break;
		}

	/*
	 * If none are free, we may already have an iod working on this mount
	 * point.  If so, it will process our request.
	 */
	if (!gotiod) {
		if (nmp->nm_bufqiods > 0) {
			NFS_DPF(ASYNCIO,
				("nfs_asyncio: %d iods are already processing mount %p\n",
				 nmp->nm_bufqiods, nmp));
			gotiod = TRUE;
		}
	}

	/*
	 * If we have an iod which can process the request, then queue
	 * the buffer.
	 */
	if (gotiod) {
		/*
		 * Ensure that the queue never grows too large.
		 */
		while (nmp->nm_bufqlen >= 2*nfs_numasync) {
			NFS_DPF(ASYNCIO,
				("nfs_asyncio: waiting for mount %p queue to drain\n", nmp));
			nmp->nm_bufqwant = TRUE;
			error = tsleep(&nmp->nm_bufq, slpflag | PRIBIO,
				       "nfsaio", slptimeo);
			if (error) {
				if (nfs_sigintr(nmp, NULL, bp->b_proc))
					return (EINTR);
				if (slpflag == PCATCH) {
					slpflag = 0;
					slptimeo = 2 * hz;
				}
			}
			/*
			 * We might have lost our iod while sleeping,
			 * so check and loop if necessary.
			 */
			if (nmp->nm_bufqiods == 0) {
				NFS_DPF(ASYNCIO,
					("nfs_asyncio: no iods after mount %p queue was drained, looping\n", nmp));
				goto again;
			}
		}

		if (ISSET(bp->b_flags, B_READ)) {
			if (bp->b_rcred == NOCRED && cred != NOCRED) {
				/*
				 * NFS has embedded ucred.
				 * Can not crhold() here as that causes zone corruption
				 */
				bp->b_rcred = crdup(cred);
			}
		} else {
			SET(bp->b_flags, B_WRITEINPROG);
			if (bp->b_wcred == NOCRED && cred != NOCRED) {
				/*
				 * NFS has embedded ucred.
				 * Can not crhold() here as that causes zone corruption
				 */
				bp->b_wcred = crdup(cred);
			}
		}

		TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, b_freelist);
		nmp->nm_bufqlen++;
		return (0);
	}

	/*
	 * All the iods are busy on other mounts, so return EIO to
	 * force the caller to process the i/o synchronously.
	 */
	NFS_DPF(ASYNCIO, ("nfs_asyncio: no iods available, i/o is synchronous\n"));
	return (EIO);
}
/*
 * Do an I/O operation to/from a cache block. This may be called
 * synchronously or from an nfsiod.
 */
int
nfs_doio(bp, cr, p)
	register struct buf *bp;
	struct ucred *cr;
	struct proc *p;
{
	register struct uio *uiop;
	register struct vnode *vp;
	struct nfsnode *np;
	struct nfsmount *nmp;
	int error = 0, diff, len, iomode, must_commit = 0;
	struct uio uio;
	struct iovec io;

	vp = bp->b_vp;
	np = VTONFS(vp);
	nmp = VFSTONFS(vp->v_mount);
	uiop = &uio;
	uiop->uio_iov = &io;
	uiop->uio_iovcnt = 1;
	uiop->uio_segflg = UIO_SYSSPACE;
	uiop->uio_procp = p;
	/*
	 * With UBC, getblk() can return a buf with B_DONE set.
	 * This indicates that the VM has valid data for that page.
	 * NFS being stateless, this case poses a problem.
	 * By definition, the NFS server should always be consulted
	 * for the data in that page.
	 * So we choose to clear the B_DONE and to do the IO.
	 *
	 * XXX revisit this if there is a performance issue.
	 * XXX In that case, we could play the attribute cache games ...
	 */
	if (ISSET(bp->b_flags, B_DONE)) {
		if (!ISSET(bp->b_flags, B_ASYNC))
			panic("nfs_doio: done and not async");
		CLR(bp->b_flags, B_DONE);
	}
	FSDBG_TOP(256, np->n_size, bp->b_blkno * DEV_BSIZE, bp->b_bcount,
		  bp->b_flags);
	FSDBG(257, bp->b_validoff, bp->b_validend, bp->b_dirtyoff,
	      bp->b_dirtyend);
	/*
	 * Historically, paging was done with physio, but no more.
	 */
	if (ISSET(bp->b_flags, B_PHYS)) {
		/*
		 * ...though reading /dev/drum still gets us here.
		 */
		io.iov_len = uiop->uio_resid = bp->b_bcount;
		/* mapping was done by vmapbuf() */
		io.iov_base = bp->b_data;
		uiop->uio_offset = (off_t)bp->b_blkno * DEV_BSIZE;
		if (ISSET(bp->b_flags, B_READ)) {
			uiop->uio_rw = UIO_READ;
			nfsstats.read_physios++;
			error = nfs_readrpc(vp, uiop, cr);
		} else {
			int com;

			iomode = NFSV3WRITE_DATASYNC;
			uiop->uio_rw = UIO_WRITE;
			nfsstats.write_physios++;
			error = nfs_writerpc(vp, uiop, cr, &iomode, &com);
		}
		if (error) {
			SET(bp->b_flags, B_ERROR);
			bp->b_error = error;
		}
	} else if (ISSET(bp->b_flags, B_READ)) {
		io.iov_len = uiop->uio_resid = bp->b_bcount;
		io.iov_base = bp->b_data;
		uiop->uio_rw = UIO_READ;
		switch (vp->v_type) {
		case VREG:
			uiop->uio_offset = (off_t)bp->b_blkno * DEV_BSIZE;
			nfsstats.read_bios++;
			error = nfs_readrpc(vp, uiop, cr);
			FSDBG(262, np->n_size, bp->b_blkno * DEV_BSIZE,
			      uiop->uio_resid, error);
			if (!error) {
			    if (uiop->uio_resid) {
				/*
				 * If len > 0, there is a hole in the file and
				 * no writes after the hole have been pushed to
				 * the server yet.
				 * Just zero fill the rest of the valid area.
				 */
				diff = bp->b_bcount - uiop->uio_resid;
				len = np->n_size - ((u_quad_t)bp->b_blkno *
						    DEV_BSIZE + diff);
				if (len > 0) {
					len = min(len, uiop->uio_resid);
					bzero((char *)bp->b_data + diff, len);
					bp->b_validend = diff + len;
					FSDBG(258, diff, len, 0, 1);
				} else
					bp->b_validend = diff;
			    } else
				bp->b_validend = bp->b_bcount;
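			    /*
			     * Example: a 4096-byte buffer whose read RPC
			     * returned only 2048 bytes over a hole has
			     * diff = 2048; up to uio_resid more bytes are
			     * zeroed from there and b_validend extended,
			     * since unwritten parts of a hole must read
			     * back as zeros.
			     */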
			    if (bp->b_validend < bp->b_bufsize) {
				/*
				 * We're about to release a partial buffer
				 * after a read... the only way we should get
				 * here is if this buffer contains the EOF.
				 * Before releasing it, we'll zero out to the
				 * end of the buffer so that if a mmap of this
				 * page occurs, we'll see zeros even if a
				 * ftruncate extends the file in the meantime.
				 */
				bzero((caddr_t)(bp->b_data + bp->b_validend),
				      bp->b_bufsize - bp->b_validend);
				FSDBG(258, bp->b_validend,
				      bp->b_bufsize - bp->b_validend, 0, 2);
			    }
			}
			if (p && (vp->v_flag & VTEXT) &&
			    (((nmp->nm_flag & NFSMNT_NQNFS) &&
			      NQNFS_CKINVALID(vp, np, ND_READ) &&
			      np->n_lrev != np->n_brev) ||
			     (!(nmp->nm_flag & NFSMNT_NQNFS) &&
			      np->n_mtime != np->n_vattr.va_mtime.tv_sec))) {
				uprintf("Process killed due to text file modification\n");
				psignal(p, SIGKILL);
				p->p_flag |= P_NOSWAP;
			}
			break;
		case VLNK:
			uiop->uio_offset = (off_t)0;
			nfsstats.readlink_bios++;
			error = nfs_readlinkrpc(vp, uiop, cr);
			break;
		case VDIR:
			nfsstats.readdir_bios++;
			uiop->uio_offset = ((u_quad_t)bp->b_lblkno) * NFS_DIRBLKSIZ;
			if (!(nmp->nm_flag & NFSMNT_NFSV3))
				nmp->nm_flag &= ~NFSMNT_RDIRPLUS; /* dk@farm.org */
			if (nmp->nm_flag & NFSMNT_RDIRPLUS) {
				error = nfs_readdirplusrpc(vp, uiop, cr);
				if (error == NFSERR_NOTSUPP)
					nmp->nm_flag &= ~NFSMNT_RDIRPLUS;
			}
			if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0)
				error = nfs_readdirrpc(vp, uiop, cr);
			break;
		default:
			printf("nfs_doio: type %x unexpected\n", vp->v_type);
			break;
		};
		if (error) {
			SET(bp->b_flags, B_ERROR);
			bp->b_error = error;
		}
	} else {
		/*
		 * mapped I/O may have altered any bytes, so we extend
		 * the dirty zone to the valid zone.  For best performance
		 * a better solution would be to save & restore page dirty
		 * bits around the uiomove which brings write-data into the
		 * buffer.  Then here we'd check if the page is dirty rather
		 * than WASMAPPED.  Also vnode_pager would change - if a page
		 * is clean it might still need to be written due to DELWRI.
		 */
		if (UBCINFOEXISTS(vp) && ubc_issetflags(vp, UI_WASMAPPED)) {
			bp->b_dirtyoff = min(bp->b_dirtyoff, bp->b_validoff);
			bp->b_dirtyend = max(bp->b_dirtyend, bp->b_validend);
		}
		if ((off_t)bp->b_blkno * DEV_BSIZE + bp->b_dirtyend > np->n_size)
			bp->b_dirtyend = np->n_size - (off_t)bp->b_blkno *
					 DEV_BSIZE;
		if (bp->b_dirtyend > bp->b_dirtyoff) {
			io.iov_len = uiop->uio_resid = bp->b_dirtyend -
						       bp->b_dirtyoff;
			uiop->uio_offset = (off_t)bp->b_blkno * DEV_BSIZE +
					   bp->b_dirtyoff;
			io.iov_base = (char *)bp->b_data + bp->b_dirtyoff;
			uiop->uio_rw = UIO_WRITE;

			nfsstats.write_bios++;
			if ((bp->b_flags & (B_ASYNC | B_NEEDCOMMIT | B_NOCACHE)) ==
			    B_ASYNC)
				iomode = NFSV3WRITE_UNSTABLE;
			else
				iomode = NFSV3WRITE_FILESYNC;
			SET(bp->b_flags, B_WRITEINPROG);
			error = nfs_writerpc(vp, uiop, cr, &iomode, &must_commit);
			if (!error && iomode == NFSV3WRITE_UNSTABLE)
				SET(bp->b_flags, B_NEEDCOMMIT);
			else
				CLR(bp->b_flags, B_NEEDCOMMIT);
			CLR(bp->b_flags, B_WRITEINPROG);
			/*
			 * For an interrupted write, the buffer is still valid
			 * and the write hasn't been pushed to the server yet,
			 * so we can't set B_ERROR and report the interruption
			 * by setting B_EINTR.  For the B_ASYNC case, B_EINTR
			 * is not relevant, so the rpc attempt is essentially
			 * a noop.  For the case of a V3 write rpc not being
			 * committed to stable storage, the block is still
			 * dirty and requires either a commit rpc or another
			 * write rpc with iomode == NFSV3WRITE_FILESYNC before
			 * the block is reused.  This is indicated by setting
			 * the B_DELWRI and B_NEEDCOMMIT flags.
			 */
			if (error == EINTR ||
			    (!error && bp->b_flags & B_NEEDCOMMIT)) {
				int s;

				CLR(bp->b_flags, B_INVAL | B_NOCACHE);
				if (!ISSET(bp->b_flags, B_DELWRI)) {
					SET(bp->b_flags, B_DELWRI);
					nbdwrite++;
				}
				FSDBG(261, bp->b_validoff, bp->b_validend,
				      bp->b_bufsize, bp->b_bcount);
				/*
				 * Since for the B_ASYNC case, nfs_bwrite() has
				 * reassigned the buffer to the clean list, we
				 * have to reassign it back to the dirty one.
				 * Ugh.
				 */
				if (ISSET(bp->b_flags, B_ASYNC)) {
					s = splbio();
					reassignbuf(bp, vp);
					splx(s);
				} else
					SET(bp->b_flags, B_EINTR);
			} else {
				if (error) {
					SET(bp->b_flags, B_ERROR);
					bp->b_error = np->n_error = error;
					np->n_flag |= NWRITEERR;
				}
				bp->b_dirtyoff = bp->b_dirtyend = 0;
				/*
				 * validoff and validend represent the real
				 * data present in this buffer.  If validoff
				 * is non-zero, then we have to invalidate the
				 * buffer and kill the page when biodone is
				 * called... the same is also true when
				 * validend doesn't extend all the way to the
				 * end of the buffer and validend doesn't
				 * equate to the current EOF... eventually we
				 * need to deal with this in a more humane way
				 * (like keeping the partial buffer without
				 * making it immediately available to the VM
				 * page cache).
				 */
				if (bp->b_validoff)
					SET(bp->b_flags, B_INVAL);
				else if (bp->b_validend < bp->b_bufsize) {
					if ((off_t)bp->b_blkno * DEV_BSIZE +
					    bp->b_validend == np->n_size) {
						bzero((caddr_t)(bp->b_data +
						      bp->b_validend),
						      bp->b_bufsize -
						      bp->b_validend);
						FSDBG(259, bp->b_validend,
						      bp->b_bufsize -
						      bp->b_validend, 0, 0);
					} else
						SET(bp->b_flags, B_INVAL);
				}
			}
		} else {
			if (bp->b_validoff ||
			    (bp->b_validend < bp->b_bufsize &&
			     (off_t)bp->b_blkno * DEV_BSIZE + bp->b_validend !=
			     np->n_size)) {
				SET(bp->b_flags, B_INVAL);
			}
			if (bp->b_flags & B_INVAL) {
				FSDBG(260, bp->b_validoff, bp->b_validend,
				      bp->b_bufsize, bp->b_bcount);
			}
			bp->b_resid = 0;
			biodone(bp);
			FSDBG_BOT(256, bp->b_validoff, bp->b_validend,
				  bp->b_bufsize, np->n_size);
			return (0);
		}
	}
	bp->b_resid = uiop->uio_resid;
	if (must_commit)
		nfs_clearcommit(vp->v_mount);

	if (bp->b_flags & B_INVAL) {
		FSDBG(260, bp->b_validoff, bp->b_validend, bp->b_bufsize,
		      bp->b_bcount);
	}
	FSDBG_BOT(256, bp->b_validoff, bp->b_validend, bp->b_bcount, error);

	biodone(bp);
	return (error);
}