/*
 * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License").  You may not use this file except in compliance with the
 * License.  Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)nfs_bio.c	8.9 (Berkeley) 3/30/95
 * FreeBSD-Id: nfs_bio.c,v 1.44 1997/09/10 19:52:25 phk Exp $
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/proc.h>		/* uio_procp, p_rlimit users below */
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/ubc.h>		/* ubc_setsize, ubc_hold, ubc_clean */
#include <sys/vmparam.h>

#include <kern/clock.h>

#include <nfs/rpcv2.h>
#include <nfs/nfsproto.h>
#include <nfs/nfs.h>
#include <nfs/nfsmount.h>
#include <nfs/nqnfs.h>
#include <nfs/nfsnode.h>

#include <sys/kdebug.h>
#define FSDBG(A, B, C, D, E) \
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_NONE, \
		     (int)(B), (int)(C), (int)(D), (int)(E), 0)
#define FSDBG_TOP(A, B, C, D, E) \
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_START, \
		     (int)(B), (int)(C), (int)(D), (int)(E), 0)
#define FSDBG_BOT(A, B, C, D, E) \
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_END, \
		     (int)(B), (int)(C), (int)(D), (int)(E), 0)
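/*
 * Usage sketch (added for illustration): nfs_doio() below brackets an
 * I/O with FSDBG_TOP(256, ...) and FSDBG_BOT(256, ...).  These expand
 * to KERNEL_DEBUG tracepoints in the DBG_FSRW class tagged
 * DBG_FUNC_START/DBG_FUNC_END, so a kdebug trace shows each buffer
 * I/O as a matched start/end pair carrying four words of context.
 */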
static struct buf *nfs_getcacheblk __P((struct vnode *vp, daddr_t bn, int size,
					struct proc *p, int operation));

extern int nfs_numasync;
extern struct nfsstats nfsstats;
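/*
 * Note (added for clarity): nfs_numasync is the number of nfsiod
 * threads currently available.  The read-ahead logic in nfs_bioread()
 * and the queueing logic in nfs_asyncio() below both test it, so
 * asynchronous I/O is attempted only when at least one iod exists.
 */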
/*
 * Vnode op for read using bio
 * Any similarity to readip() is purely coincidental
 */
int
nfs_bioread(vp, uio, ioflag, cred, getpages)
	register struct vnode *vp;
	register struct uio *uio;
	int ioflag;
	struct ucred *cred;
	int getpages;
{
	register struct nfsnode *np = VTONFS(vp);
	register int biosize, i;
	off_t diff;
	struct buf *bp = 0, *rabp;
	struct vattr vattr;
	struct proc *p;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	daddr_t lbn, rabn;
	int bufsize;
	int nra, error = 0, n = 0, on = 0, not_readin;
	int operation = (getpages ? BLK_PAGEIN : BLK_READ);

	if (uio->uio_rw != UIO_READ)
		panic("nfs_read mode");
	if (uio->uio_resid == 0)
		return (0);
	if (uio->uio_offset < 0)
		return (EINVAL);
	p = uio->uio_procp;
	if ((nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_GOTFSINFO)) == NFSMNT_NFSV3)
		(void)nfs_fsinfo(nmp, vp, cred, p);
	/* due to getblk/vm interactions, use vm page size or less values */
	biosize = min(vp->v_mount->mnt_stat.f_iosize, PAGE_SIZE);
	/*
	 * For nfs, cache consistency can only be maintained approximately.
	 * Although RFC1094 does not specify the criteria, the following is
	 * believed to be compatible with the reference port.
	 * For nqnfs, full cache consistency is maintained within the loop.
	 * For nfs:
	 * If the file's modify time on the server has changed since the
	 * last read rpc or you have written to the file,
	 * you may have lost data cache consistency with the
	 * server, so flush all of the file's data out of the cache.
	 * Then force a getattr rpc to ensure that you have up to date
	 * attributes.
	 * NB: This implies that cache data can be read when up to
	 * NFS_ATTRTIMEO seconds out of date. If you find that you need current
	 * attributes this could be forced by setting n_attrstamp to 0 before
	 * the VOP_GETATTR() call.
	 */
	if ((nmp->nm_flag & NFSMNT_NQNFS) == 0) {
		if (np->n_flag & NMODIFIED) {
			if (vp->v_type != VREG) {
				if (vp->v_type != VDIR)
					panic("nfs: bioread, not dir");
				nfs_invaldir(vp);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
			}
			np->n_attrstamp = 0;
			error = VOP_GETATTR(vp, &vattr, cred, p);
			if (error)
				return (error);
			np->n_mtime = vattr.va_mtime.tv_sec;
		} else {
			error = VOP_GETATTR(vp, &vattr, cred, p);
			if (error)
				return (error);
			if (np->n_mtime != vattr.va_mtime.tv_sec) {
				if (vp->v_type == VDIR)
					nfs_invaldir(vp);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
				np->n_mtime = vattr.va_mtime.tv_sec;
			}
		}
	}
	/*
	 * Get a valid lease. If cached data is stale, flush it.
	 */
	if (nmp->nm_flag & NFSMNT_NQNFS) {
		if (NQNFS_CKINVALID(vp, np, ND_READ)) {
			do {
				error = nqnfs_getlease(vp, ND_READ, cred, p);
			} while (error == NQNFS_EXPIRED);
			if (error)
				return (error);
			if (np->n_lrev != np->n_brev ||
			    (np->n_flag & NQNFSNONCACHE) ||
			    ((np->n_flag & NMODIFIED) && vp->v_type == VDIR)) {
				if (vp->v_type == VDIR)
					nfs_invaldir(vp);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
				np->n_brev = np->n_lrev;
			}
		} else if (vp->v_type == VDIR && (np->n_flag & NMODIFIED)) {
			nfs_invaldir(vp);
			error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
			if (error)
				return (error);
		}
	}
	do {
	    if (np->n_flag & NQNFSNONCACHE) {
		switch (vp->v_type) {
		case VREG:
			return (nfs_readrpc(vp, uio, cred));
		case VLNK:
			return (nfs_readlinkrpc(vp, uio, cred));
		case VDIR:
			break;
		default:
			printf(" NQNFSNONCACHE: type %x unexpected\n",
			       vp->v_type);
		};
	    }
	    switch (vp->v_type) {
	    case VREG:
		nfsstats.biocache_reads++;
		lbn = uio->uio_offset / biosize;
		on = uio->uio_offset & (biosize - 1);
		not_readin = 1;
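		/*
		 * Illustrative arithmetic (not from the original source):
		 * with biosize = 4096 and uio_offset = 6000, lbn = 1 and
		 * on = 6000 & 4095 = 1904, i.e. the transfer begins 1904
		 * bytes into logical block 1.  Since biosize is a power of
		 * two, the mask is equivalent to uio_offset % biosize.
		 */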
		/*
		 * Start the read ahead(s), as required.
		 */
		if (nfs_numasync > 0 && nmp->nm_readahead > 0) {
			for (nra = 0; nra < nmp->nm_readahead &&
			     (off_t)(lbn + 1 + nra) * biosize < np->n_size;
			     nra++) {
				rabn = lbn + 1 + nra;
				if (!incore(vp, rabn)) {
					rabp = nfs_getcacheblk(vp, rabn, biosize, p,
							       operation);
					if (!rabp)
						return (EINTR);
					if (!ISSET(rabp->b_flags, (B_CACHE|B_DELWRI))) {
						SET(rabp->b_flags, (B_READ | B_ASYNC));
						if (nfs_asyncio(rabp, cred)) {
							SET(rabp->b_flags, (B_INVAL|B_ERROR));
							rabp->b_error = EIO;
							brelse(rabp);
						}
					} else
						brelse(rabp);
				}
			}
		}
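		/*
		 * Note (added for clarity): read-ahead is opportunistic.
		 * Blocks already incore are skipped, and if nfs_asyncio()
		 * cannot hand the buffer to an nfsiod the buffer is simply
		 * invalidated; the data will be fetched synchronously when
		 * the reader actually reaches that block.
		 */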
		/*
		 * If the block is in the cache and has the required data
		 * in a valid region, just copy it out.
		 * Otherwise, get the block and write back/read in,
		 * as required.
		 */
again:
		bufsize = biosize;
		if ((off_t)(lbn + 1) * biosize > np->n_size &&
		    (off_t)(lbn + 1) * biosize - np->n_size < biosize) {
			bufsize = np->n_size - (off_t)lbn * biosize;
			bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
		}
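		/*
		 * Illustrative arithmetic (not from the original source):
		 * with np->n_size = 9000, biosize = 4096 and lbn = 2, the
		 * EOF block needs 9000 - 8192 = 808 bytes; (808 + 511) & ~511
		 * rounds that up to 1024, the smallest multiple of
		 * DEV_BSIZE (512) covering the tail of the file.
		 */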
		bp = nfs_getcacheblk(vp, lbn, bufsize, p, operation);
		if (!bp)
			return (EINTR);

		if (!ISSET(bp->b_flags, B_CACHE)) {
			SET(bp->b_flags, B_READ);
			CLR(bp->b_flags, (B_DONE | B_ERROR | B_INVAL));
			not_readin = 0;
			error = nfs_doio(bp, cred, p);
			if (error) {
				brelse(bp);
				return (error);
			}
		}
		n = min((unsigned)(bufsize - on), uio->uio_resid);
		diff = np->n_size - uio->uio_offset;
		if (diff < n)
			n = diff;
		if (not_readin && n > 0) {
			if (on < bp->b_validoff || (on + n) > bp->b_validend) {
				SET(bp->b_flags, (B_NOCACHE|B_INVAFTERWRITE));
				if (bp->b_dirtyend > 0) {
					if (!ISSET(bp->b_flags, B_DELWRI))
						panic("nfsbioread");
					if (VOP_BWRITE(bp) == EINTR)
						return (EINTR);
				} else
					brelse(bp);
				goto again;
			}
		}
		diff = (on >= bp->b_validend) ? 0 : (bp->b_validend - on);
		if (diff < n)
			n = diff;
		break;
	    case VLNK:
		nfsstats.biocache_readlinks++;
		bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, p, operation);
		if (!bp)
			return (EINTR);
		if (!ISSET(bp->b_flags, B_CACHE)) {
			SET(bp->b_flags, B_READ);
			error = nfs_doio(bp, cred, p);
			if (error) {
				SET(bp->b_flags, B_ERROR);
				brelse(bp);
				return (error);
			}
		}
		n = min(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid);
		on = 0;
		break;
	    case VDIR:
		nfsstats.biocache_readdirs++;
		if (np->n_direofoffset
		    && uio->uio_offset >= np->n_direofoffset) {
			return (0);
		}
		lbn = uio->uio_offset / NFS_DIRBLKSIZ;
		on = uio->uio_offset & (NFS_DIRBLKSIZ - 1);
		bp = nfs_getcacheblk(vp, lbn, NFS_DIRBLKSIZ, p, operation);
		if (!bp)
			return (EINTR);
		if (!ISSET(bp->b_flags, B_CACHE)) {
			SET(bp->b_flags, B_READ);
			error = nfs_doio(bp, cred, p);
			if (error)
				brelse(bp);
			while (error == NFSERR_BAD_COOKIE) {
				nfs_invaldir(vp);
				error = nfs_vinvalbuf(vp, 0, cred, p, 1);
				/*
				 * Yuck! The directory has been modified on the
				 * server. The only way to get the block is by
				 * reading from the beginning to get all the
				 * offset cookies.
				 */
				for (i = 0; i <= lbn && !error; i++) {
					if (np->n_direofoffset
					    && (i * NFS_DIRBLKSIZ) >= np->n_direofoffset)
						return (0);
					bp = nfs_getcacheblk(vp, i, NFS_DIRBLKSIZ, p,
							     operation);
					if (!bp)
						return (EINTR);
					if (!ISSET(bp->b_flags, B_CACHE)) {
						SET(bp->b_flags, B_READ);
						error = nfs_doio(bp, cred, p);
						/*
						 * no error + B_INVAL == directory EOF,
						 * use the block.
						 */
						if (error == 0 && (bp->b_flags & B_INVAL))
							break;
					}
					/*
					 * An error will throw away the block and the
					 * for loop will break out.  If no error and this
					 * is not the block we want, we throw away the
					 * block and go for the next one via the for loop.
					 */
					if (error || i < lbn)
						brelse(bp);
				}
			}
			/*
			 * The above while is repeated if we hit another cookie
			 * error.  If we hit an error and it wasn't a cookie error,
			 * we give up.
			 */
			if (error)
				return (error);
		}
		/*
		 * If not eof and read aheads are enabled, start one.
		 * (You need the current block first, so that you have the
		 *  directory offset cookie of the next block.)
		 */
		if (nfs_numasync > 0 && nmp->nm_readahead > 0 &&
		    (np->n_direofoffset == 0 ||
		     (lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) &&
		    !(np->n_flag & NQNFSNONCACHE) &&
		    !incore(vp, lbn + 1)) {
			rabp = nfs_getcacheblk(vp, lbn + 1, NFS_DIRBLKSIZ, p,
					       operation);
			if (rabp) {
				if (!ISSET(rabp->b_flags, (B_CACHE|B_DELWRI))) {
					SET(rabp->b_flags, (B_READ | B_ASYNC));
					if (nfs_asyncio(rabp, cred)) {
						SET(rabp->b_flags, (B_INVAL|B_ERROR));
						rabp->b_error = EIO;
						brelse(rabp);
					}
				} else
					brelse(rabp);
			}
		}
		/*
		 * Make sure we use a signed variant of min() since
		 * the second term may be negative.
		 */
		n = lmin(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid - on);
		/*
		 * Unlike VREG files, whose buffer size ( bp->b_bcount ) is
		 * chopped for the EOF condition, we cannot tell how large
		 * NFS directories are going to be until we hit EOF.  So
		 * an NFS directory buffer is *not* chopped to its EOF.  Now,
		 * it just so happens that b_resid will effectively chop it
		 * to EOF.  *BUT* this information is lost if the buffer goes
		 * away and is reconstituted into a B_CACHE state (recovered
		 * from VM) later.  So we keep track of the directory eof
		 * in np->n_direofoffset and chop it off as an extra step
		 * right here.
		 */
		if (np->n_direofoffset &&
		    n > np->n_direofoffset - uio->uio_offset)
			n = np->n_direofoffset - uio->uio_offset;
		break;
	    default:
		printf(" nfs_bioread: type %x unexpected\n", vp->v_type);
		break;
	    };

	    if (n > 0) {
		error = uiomove(bp->b_data + on, (int)n, uio);
	    }
	    switch (vp->v_type) {
	    case VREG:
		break;
	    case VLNK:
		n = 0;
		break;
	    case VDIR:
		if (np->n_flag & NQNFSNONCACHE)
			SET(bp->b_flags, B_INVAL);
		break;
	    default:
		printf(" nfs_bioread: type %x unexpected\n", vp->v_type);
	    }
	    brelse(bp);
	} while (error == 0 && uio->uio_resid > 0 && n > 0);
	return (error);
}
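/*
 * Caller sketch (illustrative, not part of this file): the VREG read
 * vnode op is expected to be a thin wrapper, roughly
 *	nfs_read(ap) => nfs_bioread(ap->a_vp, ap->a_uio, ap->a_ioflag,
 *				    ap->a_cred, 0);
 * with a nonzero getpages only on the pagein path, which selects
 * BLK_PAGEIN rather than BLK_READ when grabbing cache blocks above.
 */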
/*
 * Vnode op for write using bio
 */
int
nfs_write(ap)
	struct vop_write_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		struct ucred *a_cred;
	} */ *ap;
{
	register int biosize;
	register struct uio *uio = ap->a_uio;
	struct proc *p = uio->uio_procp;
	register struct vnode *vp = ap->a_vp;
	struct nfsnode *np = VTONFS(vp);
	register struct ucred *cred = ap->a_cred;
	int ioflag = ap->a_ioflag;
	struct buf *bp;
	struct vattr vattr;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	daddr_t lbn;
	int bufsize;
	int n, on, error = 0, iomode, must_commit;
	off_t boff;
	struct iovec iov;
	struct uio auio;
	if (uio->uio_rw != UIO_WRITE)
		panic("nfs_write mode");
	if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != current_proc())
		panic("nfs_write proc");
	if (vp->v_type != VREG)
		return (EIO);
	if (np->n_flag & NWRITEERR) {
		np->n_flag &= ~NWRITEERR;
		return (np->n_error);
	}
	if ((nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_GOTFSINFO)) == NFSMNT_NFSV3)
		(void)nfs_fsinfo(nmp, vp, cred, p);
	if (ioflag & (IO_APPEND | IO_SYNC)) {
		if (np->n_flag & NMODIFIED) {
			np->n_attrstamp = 0;
			error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
			if (error)
				return (error);
		}
		if (ioflag & IO_APPEND) {
			np->n_attrstamp = 0;
			error = VOP_GETATTR(vp, &vattr, cred, p);
			if (error)
				return (error);
			uio->uio_offset = np->n_size;
		}
	}
	if (uio->uio_offset < 0)
		return (EINVAL);
	if (uio->uio_resid == 0)
		return (0);
	/*
	 * Maybe this should be above the vnode op call, but so long as
	 * file servers have no limits, i don't think it matters
	 */
	if (p && uio->uio_offset + uio->uio_resid >
	    p->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
		psignal(p, SIGXFSZ);
		return (EFBIG);
	}
	/*
	 * I use nm_rsize, not nm_wsize so that all buffer cache blocks
	 * will be the same size within a filesystem. nfs_writerpc will
	 * still use nm_wsize when sizing the rpc's.
	 */
	/* due to getblk/vm interactions, use vm page size or less values */
	biosize = min(vp->v_mount->mnt_stat.f_iosize, PAGE_SIZE);
	do {
		/*
		 * Check for a valid write lease.
		 */
		if ((nmp->nm_flag & NFSMNT_NQNFS) &&
		    NQNFS_CKINVALID(vp, np, ND_WRITE)) {
			do {
				error = nqnfs_getlease(vp, ND_WRITE, cred, p);
			} while (error == NQNFS_EXPIRED);
			if (error)
				return (error);
			if (np->n_lrev != np->n_brev ||
			    (np->n_flag & NQNFSNONCACHE)) {
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
				np->n_brev = np->n_lrev;
			}
		}
		if ((np->n_flag & NQNFSNONCACHE) && uio->uio_iovcnt == 1) {
			iomode = NFSV3WRITE_FILESYNC;
			error = nfs_writerpc(vp, uio, cred, &iomode, &must_commit);
			if (must_commit)
				nfs_clearcommit(vp->v_mount);
			return (error);
		}
		nfsstats.biocache_writes++;
		lbn = uio->uio_offset / biosize;
		on = uio->uio_offset & (biosize - 1);
		n = min((unsigned)(biosize - on), uio->uio_resid);
again:
		/* (removed for UBC) */
		bufsize = biosize;
		if ((lbn + 1) * biosize > np->n_size) {
			bufsize = np->n_size - lbn * biosize;
			bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
		}
		/*
		 * Get a cache block for writing.  The range to be written is
		 * (off..off+len) within the block.  We ensure that the block
		 * either has no dirty region or that the given range is
		 * contiguous with the existing dirty region.
		 */
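		/*
		 * Illustrative example (not from the original source): with
		 * on = 512 and n = 1024 the write covers bytes [512,1536) of
		 * the block.  If the existing dirty region is [0,512) the
		 * result is one contiguous span [0,1536); if it were
		 * disjoint, the code below first reads or flushes so that a
		 * single (b_dirtyoff, b_dirtyend) pair can describe it.
		 */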
		bp = nfs_getcacheblk(vp, lbn, bufsize, p, BLK_WRITE);
		if (!bp)
			return (EINTR);

		/*
		 * Resize nfsnode *after* we busy the buffer to prevent
		 * readers from reading garbage.
		 * If there was a partial buf at the old eof, validate
		 * and zero the new bytes.
		 */
		if (uio->uio_offset + n > np->n_size) {
			struct buf *bp0 = NULL;
			daddr_t bn = np->n_size / biosize;
			int off = np->n_size & (biosize - 1);

			if (off && bn < lbn && incore(vp, bn))
				bp0 = nfs_getcacheblk(vp, bn, biosize, p,
						      BLK_WRITE);
			np->n_flag |= NMODIFIED;
			np->n_size = uio->uio_offset + n;
			ubc_setsize(vp, (off_t)np->n_size); /* XXX errors */
			if (bp0) {
				bzero((char *)bp0->b_data + off, biosize - off);
				bp0->b_validend = biosize;
				brelse(bp0);
			}
		}
		/*
		 * NFS has embedded ucred so crhold() risks zone corruption
		 */
		if (bp->b_wcred == NOCRED)
			bp->b_wcred = crdup(cred);
		/*
		 * If dirtyend exceeds file size, chop it down.  This should
		 * not occur unless there is a race.
		 */
		if ((off_t)bp->b_blkno * DEV_BSIZE + bp->b_dirtyend >
		    np->n_size)
			bp->b_dirtyend = np->n_size - (off_t)bp->b_blkno *
			    DEV_BSIZE;
		/*
		 * UBC doesn't (yet) handle partial pages so nfs_biowrite was
		 * hacked to never bdwrite, to start every little write right
		 * away.  Running IE Avie noticed the performance problem, thus
		 * this code, which permits those delayed writes by ensuring an
		 * initial read of the entire page.  The read may hit eof
		 * ("short read") but that we will handle.
		 *
		 * We are quite dependent on the correctness of B_CACHE so check
		 * that first in case of problems.
		 */
		if (!ISSET(bp->b_flags, B_CACHE) && n < PAGE_SIZE) {
			boff = (off_t)bp->b_blkno * DEV_BSIZE;
			auio.uio_iov = &iov;
			auio.uio_iovcnt = 1;
			auio.uio_offset = boff;
			auio.uio_resid = PAGE_SIZE;
			auio.uio_segflg = UIO_SYSSPACE;
			auio.uio_rw = UIO_READ;
			auio.uio_procp = p;
			iov.iov_base = bp->b_data;
			iov.iov_len = PAGE_SIZE;
			error = nfs_readrpc(vp, &auio, cred);
			if (error) {
				bp->b_error = error;
				SET(bp->b_flags, B_ERROR);
				printf("nfs_write: readrpc %d", error);
			}
			if (auio.uio_resid > 0)
				bzero(iov.iov_base, auio.uio_resid);
			bp->b_validoff = 0;
			bp->b_validend = PAGE_SIZE - auio.uio_resid;
			if (np->n_size > boff + bp->b_validend)
				bp->b_validend = min(np->n_size - boff,
						     PAGE_SIZE);
		}
		/*
		 * If the new write will leave a contiguous dirty
		 * area, just update the b_dirtyoff and b_dirtyend,
		 * otherwise try to extend the dirty region.
		 */
		if (bp->b_dirtyend > 0 &&
		    (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) {
			off_t start, end;

			boff = (off_t)bp->b_blkno * DEV_BSIZE;
			if (on > bp->b_dirtyend) {
				start = boff + bp->b_validend;
				end = boff + on;
			} else {
				start = boff + on + n;
				end = boff + bp->b_validoff;
			}

			/*
			 * It may be that the valid region in the buffer
			 * covers the region we want, in which case just
			 * extend the dirty region.  Otherwise we try to
			 * extend the valid region.
			 */
			if (end > start) {
				auio.uio_iov = &iov;
				auio.uio_iovcnt = 1;
				auio.uio_offset = start;
				auio.uio_resid = end - start;
				auio.uio_segflg = UIO_SYSSPACE;
				auio.uio_rw = UIO_READ;
				auio.uio_procp = p;
				iov.iov_base = bp->b_data + (start - boff);
				iov.iov_len = end - start;
				error = nfs_readrpc(vp, &auio, cred);
				/*
				 * If we couldn't read, do not do a VOP_BWRITE
				 * as originally coded. That could also error
				 * and looping back to "again" as it was doing
				 * could have us stuck trying to write the same
				 * buf again. nfs_write will get the entire
				 * region if nfs_readrpc succeeds. If it is
				 * unsuccessful we should just error out.
				 * Errors like ESTALE would keep us looping
				 * rather than transient errors justifying a
				 * retry. We can return here instead of
				 * altering the dirty region later; we have not
				 * written the old dirty region at this point.
				 */
				if (error) {
					bp->b_error = error;
					SET(bp->b_flags, B_ERROR);
					printf("nfs_write: readrpc2 %d", error);
					brelse(bp);
					return (error);
				}
				/*
				 * If there was a short read, just zero fill.
				 */
				if (auio.uio_resid > 0)
					bzero(iov.iov_base, auio.uio_resid);
				if (on > bp->b_dirtyend)
					bp->b_validend = on;
				else
					bp->b_validoff = on + n;
			}
			/*
			 * We now have a valid region which extends up to the
			 * dirty region which we want.
			 */
			if (on > bp->b_dirtyend)
				bp->b_dirtyend = on;
			else
				bp->b_dirtyoff = on + n;
		}
		if (ISSET(bp->b_flags, B_ERROR)) {
			error = bp->b_error;
			brelse(bp);
			return (error);
		}
		/*
		 * NFS has embedded ucred so crhold() risks zone corruption
		 */
		if (bp->b_wcred == NOCRED)
			bp->b_wcred = crdup(cred);
		np->n_flag |= NMODIFIED;
		/*
		 * Check for valid write lease and get one as required.
		 * In case getblk() and/or bwrite() delayed us.
		 */
		if ((nmp->nm_flag & NFSMNT_NQNFS) &&
		    NQNFS_CKINVALID(vp, np, ND_WRITE)) {
			do {
				error = nqnfs_getlease(vp, ND_WRITE, cred, p);
			} while (error == NQNFS_EXPIRED);
			if (error) {
				brelse(bp);
				return (error);
			}
			if (np->n_lrev != np->n_brev ||
			    (np->n_flag & NQNFSNONCACHE)) {
				brelse(bp);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
				np->n_brev = np->n_lrev;
				goto again;
			}
		}
		error = uiomove((char *)bp->b_data + on, n, uio);
		if (error) {
			SET(bp->b_flags, B_ERROR);
			brelse(bp);
			return (error);
		}
		if (bp->b_dirtyend > 0) {
			bp->b_dirtyoff = min(on, bp->b_dirtyoff);
			bp->b_dirtyend = max((on + n), bp->b_dirtyend);
		} else {
			bp->b_dirtyoff = on;
			bp->b_dirtyend = on + n;
		}
		if (bp->b_validend == 0 || bp->b_validend < bp->b_dirtyoff ||
		    bp->b_validoff > bp->b_dirtyend) {
			bp->b_validoff = bp->b_dirtyoff;
			bp->b_validend = bp->b_dirtyend;
		} else {
			bp->b_validoff = min(bp->b_validoff, bp->b_dirtyoff);
			bp->b_validend = max(bp->b_validend, bp->b_dirtyend);
		}
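		/*
		 * Worked example (illustrative): valid [0,2048) and dirty
		 * [1024,3072) overlap, so the else branch above extends the
		 * valid region to [0,3072).  Had valid been [0,512) and thus
		 * disjoint from the dirty span, the first branch snaps the
		 * valid region to exactly the dirty one.
		 */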
		/*
		 * Since this block is being modified, it must be written
		 * again and not just committed.
		 */
		CLR(bp->b_flags, B_NEEDCOMMIT);

		/*
		 * If the lease is non-cachable or IO_SYNC do bwrite().
		 */
		if ((np->n_flag & NQNFSNONCACHE) || (ioflag & IO_SYNC)) {
			bp->b_proc = p;
			error = VOP_BWRITE(bp);
			if (error)
				return (error);
			if (np->n_flag & NQNFSNONCACHE) {
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
			}
		} else if ((n + on) == biosize &&
			   (nmp->nm_flag & NFSMNT_NQNFS) == 0) {
			bp->b_proc = (struct proc *)0;
			SET(bp->b_flags, B_ASYNC);
			(void)nfs_writebp(bp, 0);
		} else
			bdwrite(bp);
	} while (uio->uio_resid > 0 && n > 0);
	return (0);
}
/*
 * Get an nfs cache block.
 * Allocate a new one if the block isn't currently in the cache
 * and return the block marked busy. If the calling process is
 * interrupted by a signal for an interruptible mount point, return
 * NULL.
 */
static struct buf *
nfs_getcacheblk(vp, bn, size, p, operation)
	struct vnode *vp;
	daddr_t bn;
	int size;
	struct proc *p;
	int operation;	/* defined in sys/buf.h */
{
	register struct buf *bp;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	/* due to getblk/vm interactions, use vm page size or less values */
	int biosize = min(vp->v_mount->mnt_stat.f_iosize, PAGE_SIZE);
	if (nbdwrite > ((nbuf/4)*3) && operation == BLK_WRITE) {
#define __BUFFERS_RECLAIMED 2
		struct buf *tbp[__BUFFERS_RECLAIMED];
		int i;

		/* too many delayed writes, try to free up some buffers */
		for (i = 0; i < __BUFFERS_RECLAIMED; i++)
			tbp[i] = geteblk(512);

		/* Yield to IO thread */
		(void)tsleep((caddr_t)&nbdwrite, PCATCH, "nbdwrite", 1);

		for (i = (__BUFFERS_RECLAIMED - 1); i >= 0; i--)
			brelse(tbp[i]);
	}
	if (nmp->nm_flag & NFSMNT_INT) {
		bp = getblk(vp, bn, size, PCATCH, 0, operation);
		while (bp == (struct buf *)0) {
			if (nfs_sigintr(nmp, (struct nfsreq *)0, p))
				return ((struct buf *)0);
			bp = getblk(vp, bn, size, 0, 2 * hz, operation);
		}
	} else
		bp = getblk(vp, bn, size, 0, 0, operation);

	if (vp->v_type == VREG)
		bp->b_blkno = ((off_t)bn * biosize) / DEV_BSIZE;

	return (bp);
}
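/*
 * Mapping sketch (added for illustration): with biosize = 4096 and
 * DEV_BSIZE = 512, logical block bn = 3 yields b_blkno = 3 * 4096 / 512
 * = 24.  Logical NFS blocks are thus expressed in 512-byte device
 * units, which is why code elsewhere in this file recovers the file
 * byte offset of a buffer with (off_t)bp->b_blkno * DEV_BSIZE.
 */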
/*
 * Flush and invalidate all dirty buffers. If another process is already
 * doing the flush, just wait for completion.
 */
int
nfs_vinvalbuf(vp, flags, cred, p, intrflg)
	struct vnode *vp;
	int flags;
	struct ucred *cred;
	struct proc *p;
	int intrflg;
{
	register struct nfsnode *np = VTONFS(vp);
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	int error = 0, slpflag, slptimeo;
	int didhold = 0;

	if ((nmp->nm_flag & NFSMNT_INT) == 0)
		intrflg = 0;
	if (intrflg) {
		slpflag = PCATCH;
		slptimeo = 2 * hz;
	} else {
		slpflag = 0;
		slptimeo = 0;
	}
	/*
	 * First wait for any other process doing a flush to complete.
	 */
	while (np->n_flag & NFLUSHINPROG) {
		np->n_flag |= NFLUSHWANT;
		error = tsleep((caddr_t)&np->n_flag, PRIBIO + 2, "nfsvinval",
			       slptimeo);
		if (error && intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p))
			return (EINTR);
	}
	/*
	 * Now, flush as required.
	 */
	np->n_flag |= NFLUSHINPROG;
	error = vinvalbuf(vp, flags, cred, p, slpflag, 0);
	while (error) {
		/*
		 * We seem to be stuck in a loop here if the thread got
		 * aborted: nfs_flush will return EINTR.  Not sure if that
		 * will cause other consequences due to EINTR having other
		 * meanings in NFS.  With no dirty pages it seems safe to
		 * just return from here.  But if we did have dirty pages,
		 * how would we get them written out if the thread was
		 * aborted?  Some other strategy is necessary.
		 */
		if ((intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p)) ||
		    (error == EINTR && current_thread_aborted())) {
			np->n_flag &= ~NFLUSHINPROG;
			if (np->n_flag & NFLUSHWANT) {
				np->n_flag &= ~NFLUSHWANT;
				wakeup((caddr_t)&np->n_flag);
			}
			return (EINTR);
		}
		error = vinvalbuf(vp, flags, cred, p, 0, slptimeo);
	}
	np->n_flag &= ~(NMODIFIED | NFLUSHINPROG);
	if (np->n_flag & NFLUSHWANT) {
		np->n_flag &= ~NFLUSHWANT;
		wakeup((caddr_t)&np->n_flag);
	}
	didhold = ubc_hold(vp);
	if (didhold) {
		(void) ubc_clean(vp, 1); /* get the pages out of vm also */
		ubc_rele(vp);
	}
	return (0);
}
/*
 * Initiate asynchronous I/O. Return an error if no nfsiods are available.
 * This is mainly to avoid queueing async I/O requests when the nfsiods
 * are all hung on a dead server.
 */
int
nfs_asyncio(bp, cred)
	register struct buf *bp;
	struct ucred *cred;
{
	struct nfsmount *nmp;
	int i;
	int gotiod;
	int slpflag = 0;
	int slptimeo = 0;
	int error;

	if (nfs_numasync == 0)
		return (EIO);
	nmp = VFSTONFS(bp->b_vp->v_mount);
again:
	if (nmp->nm_flag & NFSMNT_INT)
		slpflag = PCATCH;
	gotiod = FALSE;

	/*
	 * Find a free iod to process this request.
	 */
	for (i = 0; i < NFS_MAXASYNCDAEMON; i++)
		if (nfs_iodwant[i]) {
			/*
			 * Found one, so wake it up and tell it which
			 * mount to process.
			 */
			NFS_DPF(ASYNCIO,
				("nfs_asyncio: waking iod %d for mount %p\n",
				 i, nmp));
			nfs_iodwant[i] = (struct proc *)0;
			nfs_iodmount[i] = nmp;
			nmp->nm_bufqiods++;
			wakeup((caddr_t)&nfs_iodwant[i]);
			gotiod = TRUE;
			break;
		}
	/*
	 * If none are free, we may already have an iod working on this mount
	 * point.  If so, it will process our request.
	 */
	if (!gotiod) {
		if (nmp->nm_bufqiods > 0) {
			NFS_DPF(ASYNCIO,
				("nfs_asyncio: %d iods are already processing mount %p\n",
				 nmp->nm_bufqiods, nmp));
			gotiod = TRUE;
		}
	}
	/*
	 * If we have an iod which can process the request, then queue
	 * the buffer.
	 */
	if (gotiod) {
		/*
		 * Ensure that the queue never grows too large.
		 */
		while (nmp->nm_bufqlen >= 2*nfs_numasync) {
			NFS_DPF(ASYNCIO,
				("nfs_asyncio: waiting for mount %p queue to drain\n", nmp));
			nmp->nm_bufqwant = TRUE;
			error = tsleep(&nmp->nm_bufq, slpflag | PRIBIO,
				       "nfsaio", slptimeo);
			if (error) {
				if (nfs_sigintr(nmp, NULL, bp->b_proc))
					return (EINTR);
				if (slpflag == PCATCH) {
					slpflag = 0;
					slptimeo = 2 * hz;
				}
			}
			/*
			 * We might have lost our iod while sleeping,
			 * so check and loop if necessary.
			 */
			if (nmp->nm_bufqiods == 0) {
				NFS_DPF(ASYNCIO,
					("nfs_asyncio: no iods after mount %p queue was drained, looping\n", nmp));
				goto again;
			}
		}
		if (ISSET(bp->b_flags, B_READ)) {
			if (bp->b_rcred == NOCRED && cred != NOCRED) {
				/*
				 * NFS has embedded ucred.
				 * Can not crhold() here as that causes zone corruption
				 */
				bp->b_rcred = crdup(cred);
			}
		} else {
			SET(bp->b_flags, B_WRITEINPROG);
			if (bp->b_wcred == NOCRED && cred != NOCRED) {
				/*
				 * NFS has embedded ucred.
				 * Can not crhold() here as that causes zone corruption
				 */
				bp->b_wcred = crdup(cred);
			}
		}

		TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, b_freelist);
		nmp->nm_bufqlen++;
		return (0);
	}
	/*
	 * All the iods are busy on other mounts, so return EIO to
	 * force the caller to process the i/o synchronously.
	 */
	NFS_DPF(ASYNCIO, ("nfs_asyncio: no iods available, i/o is synchronous\n"));
	return (EIO);
}
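/*
 * Usage note (added for clarity): callers treat a nonzero return as
 * "do it yourself". The read-ahead paths above just invalidate the
 * buffer, while write paths issue the RPC synchronously, so running
 * out of iods costs performance, not correctness.
 */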
/*
 * Do an I/O operation to/from a cache block. This may be called
 * synchronously or from an nfsiod.
 */
int
nfs_doio(bp, cr, p)
	register struct buf *bp;
	struct ucred *cr;
	struct proc *p;
{
	register struct uio *uiop;
	register struct vnode *vp;
	struct nfsnode *np;
	struct nfsmount *nmp;
	int error = 0, diff, len, iomode, must_commit = 0;
	struct uio uio;
	struct iovec io;

	vp = bp->b_vp;
	np = VTONFS(vp);
	nmp = VFSTONFS(vp->v_mount);
	uiop = &uio;
	uiop->uio_iov = &io;
	uiop->uio_iovcnt = 1;
	uiop->uio_segflg = UIO_SYSSPACE;
	uiop->uio_procp = p;
	/*
	 * With UBC, getblk() can return a buf with B_DONE set.
	 * This indicates that the VM has valid data for that page.
	 * NFS being stateless, this case poses a problem.
	 * By definition, the NFS server should always be consulted
	 * for the data in that page.
	 * So we choose to clear the B_DONE and to do the IO.
	 *
	 * XXX revisit this if there is a performance issue.
	 * XXX In that case, we could play the attribute cache games ...
	 */
	if (ISSET(bp->b_flags, B_DONE)) {
		if (!ISSET(bp->b_flags, B_ASYNC))
			panic("nfs_doio: done and not async");
		CLR(bp->b_flags, B_DONE);
	}
	FSDBG_TOP(256, np->n_size, bp->b_blkno * DEV_BSIZE, bp->b_bcount,
		  bp->b_flags);
	FSDBG(257, bp->b_validoff, bp->b_validend, bp->b_dirtyoff,
	      bp->b_dirtyend);
	/*
	 * Historically, paging was done with physio, but no more.
	 */
	if (ISSET(bp->b_flags, B_PHYS)) {
		/*
		 * ...though reading /dev/drum still gets us here.
		 */
		io.iov_len = uiop->uio_resid = bp->b_bcount;
		/* mapping was done by vmapbuf() */
		io.iov_base = bp->b_data;
		uiop->uio_offset = (off_t)bp->b_blkno * DEV_BSIZE;
		if (ISSET(bp->b_flags, B_READ)) {
			uiop->uio_rw = UIO_READ;
			nfsstats.read_physios++;
			error = nfs_readrpc(vp, uiop, cr);
		} else {
			int com;

			iomode = NFSV3WRITE_DATASYNC;
			uiop->uio_rw = UIO_WRITE;
			nfsstats.write_physios++;
			error = nfs_writerpc(vp, uiop, cr, &iomode, &com);
		}
		if (error) {
			SET(bp->b_flags, B_ERROR);
			bp->b_error = error;
		}
	} else if (ISSET(bp->b_flags, B_READ)) {
		io.iov_len = uiop->uio_resid = bp->b_bcount;
		io.iov_base = bp->b_data;
		uiop->uio_rw = UIO_READ;
		switch (vp->v_type) {
		case VREG:
			uiop->uio_offset = (off_t)bp->b_blkno * DEV_BSIZE;
			nfsstats.read_bios++;
			error = nfs_readrpc(vp, uiop, cr);
			FSDBG(262, np->n_size, bp->b_blkno * DEV_BSIZE,
			      uiop->uio_resid, error);
			if (!error) {
				bp->b_validoff = 0;
				if (uiop->uio_resid) {
					/*
					 * If len > 0, there is a hole in the file and
					 * no writes after the hole have been pushed to
					 * the server yet.
					 * Just zero fill the rest of the valid area.
					 */
					diff = bp->b_bcount - uiop->uio_resid;
					len = np->n_size - ((u_quad_t)bp->b_blkno * DEV_BSIZE +
							    diff);
					if (len > 0) {
						len = min(len, uiop->uio_resid);
						bzero((char *)bp->b_data + diff, len);
						bp->b_validend = diff + len;
						FSDBG(258, diff, len, 0, 1);
					} else
						bp->b_validend = diff;
				} else
					bp->b_validend = bp->b_bcount;

				if (bp->b_validend < bp->b_bufsize) {
					/*
					 * we're about to release a partial buffer after a
					 * read... the only way we should get here is if
					 * this buffer contains the EOF.
					 * before releasing it, we'll zero out to the end
					 * of the buffer so that if a mmap of this page
					 * occurs, we'll see zero's even if a ftruncate
					 * extends the file in the meantime
					 */
					bzero((caddr_t)(bp->b_data + bp->b_validend),
					      bp->b_bufsize - bp->b_validend);
					FSDBG(258, bp->b_validend,
					      bp->b_bufsize - bp->b_validend, 0, 2);
				}
			}
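			/*
			 * Worked example (illustrative): b_bcount = 4096 but
			 * the server returns only 1024 bytes, leaving
			 * uio_resid = 3072 and diff = 1024.  If n_size says
			 * 2048 more valid bytes exist past the short read,
			 * that span is a hole: it is zeroed and b_validend
			 * becomes 1024 + 2048 = 3072, not the full buffer.
			 */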
			if (p && (vp->v_flag & VTEXT) &&
			    (((nmp->nm_flag & NFSMNT_NQNFS) &&
			      NQNFS_CKINVALID(vp, np, ND_READ) &&
			      np->n_lrev != np->n_brev) ||
			     (!(nmp->nm_flag & NFSMNT_NQNFS) &&
			      np->n_mtime != np->n_vattr.va_mtime.tv_sec))) {
				uprintf("Process killed due to text file modification\n");
				psignal(p, SIGKILL);
				p->p_flag |= P_NOSWAP;
			}
			break;
		case VLNK:
			uiop->uio_offset = (off_t)0;
			nfsstats.readlink_bios++;
			error = nfs_readlinkrpc(vp, uiop, cr);
			break;
		case VDIR:
			nfsstats.readdir_bios++;
			uiop->uio_offset = ((u_quad_t)bp->b_lblkno) * NFS_DIRBLKSIZ;
			if (!(nmp->nm_flag & NFSMNT_NFSV3))
				nmp->nm_flag &= ~NFSMNT_RDIRPLUS; /* dk@farm.org */
			if (nmp->nm_flag & NFSMNT_RDIRPLUS) {
				error = nfs_readdirplusrpc(vp, uiop, cr);
				if (error == NFSERR_NOTSUPP)
					nmp->nm_flag &= ~NFSMNT_RDIRPLUS;
			}
			if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0)
				error = nfs_readdirrpc(vp, uiop, cr);
			break;
		default:
			printf("nfs_doio: type %x unexpected\n", vp->v_type);
			break;
		};
		if (error) {
			SET(bp->b_flags, B_ERROR);
			bp->b_error = error;
		}
	} else {
		/*
		 * mapped I/O may have altered any bytes, so we extend
		 * the dirty zone to the valid zone.  For best performance
		 * a better solution would be to save & restore page dirty bits
		 * around the uiomove which brings write-data into the buffer.
		 * Then here we'd check if the page is dirty rather than WASMAPPED
		 * Also vnode_pager would change - if a page is clean it might
		 * still need to be written due to DELWRI.
		 */
		if (UBCINFOEXISTS(vp) && ubc_issetflags(vp, UI_WASMAPPED)) {
			bp->b_dirtyoff = min(bp->b_dirtyoff, bp->b_validoff);
			bp->b_dirtyend = max(bp->b_dirtyend, bp->b_validend);
		}
		if ((off_t)bp->b_blkno * DEV_BSIZE + bp->b_dirtyend > np->n_size)
			bp->b_dirtyend = np->n_size - (off_t)bp->b_blkno * DEV_BSIZE;
		if (bp->b_dirtyend > bp->b_dirtyoff) {
			io.iov_len = uiop->uio_resid = bp->b_dirtyend - bp->b_dirtyoff;
			uiop->uio_offset = (off_t)bp->b_blkno * DEV_BSIZE +
					   bp->b_dirtyoff;
			io.iov_base = (char *)bp->b_data + bp->b_dirtyoff;
			uiop->uio_rw = UIO_WRITE;

			nfsstats.write_bios++;
			if ((bp->b_flags & (B_ASYNC | B_NEEDCOMMIT | B_NOCACHE)) ==
			    B_ASYNC)
				iomode = NFSV3WRITE_UNSTABLE;
			else
				iomode = NFSV3WRITE_FILESYNC;
			SET(bp->b_flags, B_WRITEINPROG);
			error = nfs_writerpc(vp, uiop, cr, &iomode, &must_commit);
			if (!error && iomode == NFSV3WRITE_UNSTABLE)
				SET(bp->b_flags, B_NEEDCOMMIT);
			else
				CLR(bp->b_flags, B_NEEDCOMMIT);
			CLR(bp->b_flags, B_WRITEINPROG);
			/*
			 * For an interrupted write, the buffer is still valid
			 * and the write hasn't been pushed to the server yet,
			 * so we can't set B_ERROR and report the interruption
			 * by setting B_EINTR.  For the B_ASYNC case, B_EINTR
			 * is not relevant, so the rpc attempt is essentially
			 * a noop.  For the case of a V3 write rpc not being
			 * committed to stable storage, the block is still
			 * dirty and requires either a commit rpc or another
			 * write rpc with iomode == NFSV3WRITE_FILESYNC before
			 * the block is reused.  This is indicated by setting
			 * the B_DELWRI and B_NEEDCOMMIT flags.
			 */
			if (error == EINTR || (!error && bp->b_flags & B_NEEDCOMMIT)) {
				int s;

				CLR(bp->b_flags, B_INVAL | B_NOCACHE);
				if (!ISSET(bp->b_flags, B_DELWRI)) {
					SET(bp->b_flags, B_DELWRI);
					nbdwrite++;
				}
				FSDBG(261, bp->b_validoff, bp->b_validend,
				      bp->b_bufsize, bp->b_bcount);
				/*
				 * Since for the B_ASYNC case, nfs_bwrite() has
				 * reassigned the buffer to the clean list, we have to
				 * reassign it back to the dirty one. Ugh.
				 */
				if (ISSET(bp->b_flags, B_ASYNC)) {
					s = splbio();
					reassignbuf(bp, vp);
					splx(s);
				} else {
					SET(bp->b_flags, B_EINTR);
				}
			} else {
				if (error) {
					SET(bp->b_flags, B_ERROR);
					bp->b_error = np->n_error = error;
					np->n_flag |= NWRITEERR;
				}
				bp->b_dirtyoff = bp->b_dirtyend = 0;
				/*
				 * validoff and validend represent the real data
				 * present in this buffer.
				 * if validoff is non-zero, then we have to
				 * invalidate the buffer and kill the page when
				 * biodone is called... the same is also true when
				 * validend doesn't extend all the way to the end
				 * of the buffer and validend doesn't equate to
				 * the current EOF... eventually we need to deal
				 * with this in a more humane way (like keeping
				 * the partial buffer without making it immediately
				 * available to the VM page cache).
				 */
				if (bp->b_validoff)
					SET(bp->b_flags, B_INVAL);

				if (bp->b_validend < bp->b_bufsize) {
					if ((off_t)bp->b_blkno * DEV_BSIZE +
					    bp->b_validend == np->n_size) {
						bzero((caddr_t)(bp->b_data +
						      bp->b_validend),
						      bp->b_bufsize - bp->b_validend);
						FSDBG(259, bp->b_validend,
						      bp->b_bufsize - bp->b_validend, 0,
						      0);
					} else
						SET(bp->b_flags, B_INVAL);
				}
			}
		} else {
			if (bp->b_validoff ||
			    (bp->b_validend < bp->b_bufsize &&
			     (off_t)bp->b_blkno * DEV_BSIZE + bp->b_validend !=
			     np->n_size))
				SET(bp->b_flags, B_INVAL);
			if (bp->b_flags & B_INVAL) {
				FSDBG(260, bp->b_validoff, bp->b_validend,
				      bp->b_bufsize, bp->b_bcount);
			}
			bp->b_resid = 0;
			FSDBG_BOT(256, bp->b_validoff, bp->b_validend, bp->b_bufsize,
				  np->n_size);
			biodone(bp);
			return (0);
		}
	}
	bp->b_resid = uiop->uio_resid;
	if (must_commit)
		nfs_clearcommit(vp->v_mount);

	if (bp->b_flags & B_INVAL) {
		FSDBG(260, bp->b_validoff, bp->b_validend, bp->b_bufsize,
		      bp->b_bcount);
	}
	FSDBG_BOT(256, bp->b_validoff, bp->b_validend, bp->b_bcount, error);

	biodone(bp);
	return (error);
}