/*
 * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License").  You may not use this file except in compliance with the
 * License.  Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)nfs_bio.c	8.9 (Berkeley) 3/30/95
 * FreeBSD-Id: nfs_bio.c,v 1.44 1997/09/10 19:52:25 phk Exp $
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/ubc.h>

#include <sys/vm.h>
#include <sys/vmparam.h>

#include <sys/time.h>
#include <kern/clock.h>

#include <nfs/rpcv2.h>
#include <nfs/nfsproto.h>
#include <nfs/nfs.h>
#include <nfs/nfsmount.h>
#include <nfs/nqnfs.h>
#include <nfs/nfsnode.h>

#include <sys/kdebug.h>

#define FSDBG(A, B, C, D, E) \
    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_NONE, \
                 (int)(B), (int)(C), (int)(D), (int)(E), 0)
#define FSDBG_TOP(A, B, C, D, E) \
    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_START, \
                 (int)(B), (int)(C), (int)(D), (int)(E), 0)
#define FSDBG_BOT(A, B, C, D, E) \
    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_END, \
                 (int)(B), (int)(C), (int)(D), (int)(E), 0)
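
/*
 * Illustrative note (not from the original source): each FSDBG macro
 * expands to a KERNEL_DEBUG() trace point whose code is
 * FSDBG_CODE(DBG_FSRW, A), with the four values truncated to int.
 * For example, nfs_doio() below brackets each buffer I/O with
 * FSDBG_TOP(256, ...) and FSDBG_BOT(256, ...), so a kdebug trace shows
 * matching start/end events for every I/O.
 */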

static struct buf *nfs_getcacheblk __P((struct vnode *vp, daddr_t bn, int size,
                                        struct proc *p, int operation));

extern int nfs_numasync;
extern struct nfsstats nfsstats;

/*
 * Vnode op for read using bio
 * Any similarity to readip() is purely coincidental
 */
int
nfs_bioread(vp, uio, ioflag, cred, getpages)
    register struct vnode *vp;
    register struct uio *uio;
    int ioflag;
    struct ucred *cred;
    int getpages;
{
    register struct nfsnode *np = VTONFS(vp);
    register int biosize, diff, i;
    struct buf *bp = 0, *rabp;
    struct vattr vattr;
    struct proc *p;
    struct nfsmount *nmp = VFSTONFS(vp->v_mount);
    daddr_t lbn, rabn;
    int bufsize;
    int nra, error = 0, n = 0, on = 0, not_readin;
    int operation = (getpages ? BLK_PAGEIN : BLK_READ);

#if DIAGNOSTIC
    if (uio->uio_rw != UIO_READ)
        panic("nfs_read mode");
#endif
    if (uio->uio_resid == 0)
        return (0);
    if (uio->uio_offset < 0)
        return (EINVAL);
    p = uio->uio_procp;
    if ((nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_GOTFSINFO)) == NFSMNT_NFSV3)
        (void)nfs_fsinfo(nmp, vp, cred, p);
    /* due to getblk/vm interactions, use vm page size or less values */
    biosize = min(vp->v_mount->mnt_stat.f_iosize, PAGE_SIZE);
    /*
     * For nfs, cache consistency can only be maintained approximately.
     * Although RFC1094 does not specify the criteria, the following is
     * believed to be compatible with the reference port.
     * For nqnfs, full cache consistency is maintained within the loop.
     * For nfs:
     * If the file's modify time on the server has changed since the
     * last read rpc or you have written to the file,
     * you may have lost data cache consistency with the
     * server, so flush all of the file's data out of the cache.
     * Then force a getattr rpc to ensure that you have up to date
     * attributes.
     * NB: This implies that cache data can be read when up to
     * NFS_ATTRTIMEO seconds out of date.  If you find that you need
     * current attributes this could be forced by setting n_attrstamp
     * to 0 before the VOP_GETATTR() call.
     */
    if ((nmp->nm_flag & NFSMNT_NQNFS) == 0) {
        if (np->n_flag & NMODIFIED) {
            if (vp->v_type != VREG) {
                if (vp->v_type != VDIR)
                    panic("nfs: bioread, not dir");
                nfs_invaldir(vp);
                error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
                if (error)
                    return (error);
            }
            np->n_attrstamp = 0;
            error = VOP_GETATTR(vp, &vattr, cred, p);
            if (error)
                return (error);
            np->n_mtime = vattr.va_mtime.tv_sec;
        } else {
            error = VOP_GETATTR(vp, &vattr, cred, p);
            if (error)
                return (error);
            if (np->n_mtime != vattr.va_mtime.tv_sec) {
                if (vp->v_type == VDIR)
                    nfs_invaldir(vp);
                error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
                if (error)
                    return (error);
                np->n_mtime = vattr.va_mtime.tv_sec;
            }
        }
    }
    do {

        /*
         * Get a valid lease.  If cached data is stale, flush it.
         */
        if (nmp->nm_flag & NFSMNT_NQNFS) {
            if (NQNFS_CKINVALID(vp, np, ND_READ)) {
                do {
                    error = nqnfs_getlease(vp, ND_READ, cred, p);
                } while (error == NQNFS_EXPIRED);
                if (error)
                    return (error);
                if (np->n_lrev != np->n_brev ||
                    (np->n_flag & NQNFSNONCACHE) ||
                    ((np->n_flag & NMODIFIED) && vp->v_type == VDIR)) {
                    if (vp->v_type == VDIR)
                        nfs_invaldir(vp);
                    error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
                    if (error)
                        return (error);
                    np->n_brev = np->n_lrev;
                }
            } else if (vp->v_type == VDIR && (np->n_flag & NMODIFIED)) {
                nfs_invaldir(vp);
                error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
                if (error)
                    return (error);
            }
        }
        if (np->n_flag & NQNFSNONCACHE) {
            switch (vp->v_type) {
            case VREG:
                return (nfs_readrpc(vp, uio, cred));
            case VLNK:
                return (nfs_readlinkrpc(vp, uio, cred));
            case VDIR:
                break;
            default:
                printf(" NQNFSNONCACHE: type %x unexpected\n",
                       vp->v_type);
            };
        }
        switch (vp->v_type) {
        case VREG:
            nfsstats.biocache_reads++;
            lbn = uio->uio_offset / biosize;
            on = uio->uio_offset & (biosize - 1);
            not_readin = 1;
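            /*
             * Worked example (illustrative, not from the original source):
             * with biosize 4096 and uio_offset 6000, lbn = 6000 / 4096 = 1
             * and on = 6000 & 4095 = 1904, so the copy starts 1904 bytes
             * into logical block 1.  The mask form of the modulo is only
             * valid because biosize is a power of two.
             */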

            /*
             * Start the read ahead(s), as required.
             */
            if (nfs_numasync > 0 && nmp->nm_readahead > 0) {
                for (nra = 0; nra < nmp->nm_readahead &&
                     (off_t)(lbn + 1 + nra) * biosize < np->n_size;
                     nra++) {
                    rabn = lbn + 1 + nra;
                    if (!incore(vp, rabn)) {
                        rabp = nfs_getcacheblk(vp, rabn, biosize, p,
                                               operation);
                        if (!rabp)
                            return (EINTR);
                        if (!ISSET(rabp->b_flags, (B_CACHE|B_DELWRI))) {
                            SET(rabp->b_flags, (B_READ | B_ASYNC));
                            if (nfs_asyncio(rabp, cred)) {
                                SET(rabp->b_flags, (B_INVAL|B_ERROR));
                                rabp->b_error = EIO;
                                brelse(rabp);
                            }
                        } else
                            brelse(rabp);
                    }
                }
            }

            /*
             * If the block is in the cache and has the required data
             * in a valid region, just copy it out.
             * Otherwise, get the block and write back/read in,
             * as required.
             */
again:
            bufsize = biosize;
            if ((off_t)(lbn + 1) * biosize > np->n_size &&
                (off_t)(lbn + 1) * biosize - np->n_size < biosize) {
                bufsize = np->n_size - lbn * biosize;
                bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
            }
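            /*
             * Illustrative: DEV_BSIZE is 512, so the expression above
             * rounds the buffer size up to the next 512-byte boundary;
             * a 700-byte tail, for instance, becomes
             * (700 + 511) & ~511 = 1024.
             */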
            bp = nfs_getcacheblk(vp, lbn, bufsize, p, operation);
            if (!bp)
                return (EINTR);

            if (!ISSET(bp->b_flags, B_CACHE)) {
                SET(bp->b_flags, B_READ);
                CLR(bp->b_flags, (B_DONE | B_ERROR | B_INVAL));
                not_readin = 0;
                error = nfs_doio(bp, cred, p);
                if (error) {
                    brelse(bp);
                    return (error);
                }
            }
            if (bufsize > on) {
                n = min((unsigned)(bufsize - on), uio->uio_resid);
            } else {
                n = 0;
            }
            diff = np->n_size - uio->uio_offset;
            if (diff < n)
                n = diff;
            if (not_readin && n > 0) {
                if (on < bp->b_validoff || (on + n) > bp->b_validend) {
                    SET(bp->b_flags, (B_NOCACHE|B_INVAFTERWRITE));
                    if (bp->b_dirtyend > 0) {
                        if (!ISSET(bp->b_flags, B_DELWRI))
                            panic("nfsbioread");
                        if (VOP_BWRITE(bp) == EINTR)
                            return (EINTR);
                    } else
                        brelse(bp);
                    goto again;
                }
            }
            diff = (on >= bp->b_validend) ? 0 : (bp->b_validend - on);
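            /*
             * Note: b_validoff..b_validend bounds the bytes of this block
             * actually backed by server data, so n is clipped again here
             * to keep the uiomove() below from exposing bytes that were
             * neither read from the server nor written locally.
             */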
            if (diff < n)
                n = diff;
            break;
        case VLNK:
            nfsstats.biocache_readlinks++;
            bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, p,
                                 operation);
            if (!bp)
                return (EINTR);
            if (!ISSET(bp->b_flags, B_CACHE)) {
                SET(bp->b_flags, B_READ);
                error = nfs_doio(bp, cred, p);
                if (error) {
                    SET(bp->b_flags, B_ERROR);
                    brelse(bp);
                    return (error);
                }
            }
            n = min(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid);
            on = 0;
            break;
        case VDIR:
            nfsstats.biocache_readdirs++;
            if (np->n_direofoffset
                && uio->uio_offset >= np->n_direofoffset) {
                return (0);
            }
            lbn = uio->uio_offset / NFS_DIRBLKSIZ;
            on = uio->uio_offset & (NFS_DIRBLKSIZ - 1);
            bp = nfs_getcacheblk(vp, lbn, NFS_DIRBLKSIZ, p, operation);
            if (!bp)
                return (EINTR);
            if (!ISSET(bp->b_flags, B_CACHE)) {
                SET(bp->b_flags, B_READ);
                error = nfs_doio(bp, cred, p);
                if (error) {
                    brelse(bp);
                }
                while (error == NFSERR_BAD_COOKIE) {
                    nfs_invaldir(vp);
                    error = nfs_vinvalbuf(vp, 0, cred, p, 1);
                    /*
                     * Yuck! The directory has been modified on the
                     * server.  The only way to get the block is by
                     * reading from the beginning to get all the
                     * offset cookies.
                     */
                    for (i = 0; i <= lbn && !error; i++) {
                        if (np->n_direofoffset
                            && (i * NFS_DIRBLKSIZ) >= np->n_direofoffset)
                            return (0);
                        bp = nfs_getcacheblk(vp, i, NFS_DIRBLKSIZ, p,
                                             operation);
                        if (!bp)
                            return (EINTR);
                        if (!ISSET(bp->b_flags, B_CACHE)) {
                            SET(bp->b_flags, B_READ);
                            error = nfs_doio(bp, cred, p);
                            /*
                             * no error + B_INVAL == directory EOF,
                             * use the block.
                             */
                            if (error == 0 && (bp->b_flags & B_INVAL))
                                break;
                        }
                        /*
                         * An error will throw away the block and the
                         * for loop will break out.  If no error and this
                         * is not the block we want, we throw away the
                         * block and go for the next one via the for loop.
                         */
                        if (error || i < lbn)
                            brelse(bp);
                    }
                }
                /*
                 * The above while is repeated if we hit another cookie
                 * error.  If we hit an error and it wasn't a cookie
                 * error, we give up.
                 */
                if (error)
                    return (error);
            }

            /*
             * If not eof and read aheads are enabled, start one.
             * (You need the current block first, so that you have the
             *  directory offset cookie of the next block.)
             */
            if (nfs_numasync > 0 && nmp->nm_readahead > 0 &&
                (np->n_direofoffset == 0 ||
                 (lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) &&
                !(np->n_flag & NQNFSNONCACHE) &&
                !incore(vp, lbn + 1)) {
                rabp = nfs_getcacheblk(vp, lbn + 1, NFS_DIRBLKSIZ, p,
                                       operation);
                if (rabp) {
                    if (!ISSET(rabp->b_flags, (B_CACHE|B_DELWRI))) {
                        SET(rabp->b_flags, (B_READ | B_ASYNC));
                        if (nfs_asyncio(rabp, cred)) {
                            SET(rabp->b_flags, (B_INVAL|B_ERROR));
                            rabp->b_error = EIO;
                            brelse(rabp);
                        }
                    } else {
                        brelse(rabp);
                    }
                }
            }
            /*
             * Make sure we use a signed variant of min() since
             * the second term may be negative.
             */
            n = lmin(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid - on);
            /*
             * Unlike VREG files, whose buffer size ( bp->b_bcount ) is
             * chopped for the EOF condition, we cannot tell how large
             * NFS directories are going to be until we hit EOF.  So
             * an NFS directory buffer is *not* chopped to its EOF.  Now,
             * it just so happens that b_resid will effectively chop it
             * to EOF.  *BUT* this information is lost if the buffer goes
             * away and is reconstituted into a B_CACHE state (recovered
             * from VM) later.  So we keep track of the directory eof
             * in np->n_direofoffset and chop it off as an extra step
             * right here.
             */
            if (np->n_direofoffset &&
                n > np->n_direofoffset - uio->uio_offset)
                n = np->n_direofoffset - uio->uio_offset;
            break;
440 printf(" nfs_bioread: type %x unexpected\n",vp
->v_type
);
445 error
= uiomove(bp
->b_data
+ on
, (int)n
, uio
);
447 switch (vp
->v_type
) {
454 if (np
->n_flag
& NQNFSNONCACHE
)
455 SET(bp
->b_flags
, B_INVAL
);
458 printf(" nfs_bioread: type %x unexpected\n",vp
->v_type
);
461 } while (error
== 0 && uio
->uio_resid
> 0 && n
> 0);

/*
 * Vnode op for write using bio
 */
int
nfs_write(ap)
    struct vop_write_args /* {
        struct vnode *a_vp;
        struct uio *a_uio;
        int a_ioflag;
        struct ucred *a_cred;
    } */ *ap;
{
    register int biosize;
    register struct uio *uio = ap->a_uio;
    struct proc *p = uio->uio_procp;
    register struct vnode *vp = ap->a_vp;
    struct nfsnode *np = VTONFS(vp);
    register struct ucred *cred = ap->a_cred;
    int ioflag = ap->a_ioflag;
    struct buf *bp;
    struct vattr vattr;
    struct nfsmount *nmp = VFSTONFS(vp->v_mount);
    daddr_t lbn;
    int bufsize;
    int n, on, error = 0, iomode, must_commit;
    off_t boff;
    struct iovec iov;
    struct uio auio;

#if DIAGNOSTIC
    if (uio->uio_rw != UIO_WRITE)
        panic("nfs_write mode");
    if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != current_proc())
        panic("nfs_write proc");
#endif
    if (vp->v_type != VREG)
        return (EIO);
    if (np->n_flag & NWRITEERR) {
        np->n_flag &= ~NWRITEERR;
        return (np->n_error);
    }
    if ((nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_GOTFSINFO)) == NFSMNT_NFSV3)
        (void)nfs_fsinfo(nmp, vp, cred, p);
    if (ioflag & (IO_APPEND | IO_SYNC)) {
        if (np->n_flag & NMODIFIED) {
            np->n_attrstamp = 0;
            error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
            if (error)
                return (error);
        }
        if (ioflag & IO_APPEND) {
            np->n_attrstamp = 0;
            error = VOP_GETATTR(vp, &vattr, cred, p);
            if (error)
                return (error);
            uio->uio_offset = np->n_size;
        }
    }
    if (uio->uio_offset < 0)
        return (EINVAL);
    if (uio->uio_resid == 0)
        return (0);
    /*
     * Maybe this should be above the vnode op call, but so long as
     * file servers have no limits, i don't think it matters
     */
    if (p && uio->uio_offset + uio->uio_resid >
        p->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
        psignal(p, SIGXFSZ);
        return (EFBIG);
    }

    /*
     * I use nm_rsize, not nm_wsize so that all buffer cache blocks
     * will be the same size within a filesystem.  nfs_writerpc will
     * still use nm_wsize when sizing the rpc's.
     */
    /* due to getblk/vm interactions, use vm page size or less values */
    biosize = min(vp->v_mount->mnt_stat.f_iosize, PAGE_SIZE);

    do {
        /*
         * Check for a valid write lease.
         */
        if ((nmp->nm_flag & NFSMNT_NQNFS) &&
            NQNFS_CKINVALID(vp, np, ND_WRITE)) {
            do {
                error = nqnfs_getlease(vp, ND_WRITE, cred, p);
            } while (error == NQNFS_EXPIRED);
            if (error)
                return (error);
            if (np->n_lrev != np->n_brev ||
                (np->n_flag & NQNFSNONCACHE)) {
                error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
                if (error)
                    return (error);
                np->n_brev = np->n_lrev;
            }
        }
        if ((np->n_flag & NQNFSNONCACHE) && uio->uio_iovcnt == 1) {
            iomode = NFSV3WRITE_FILESYNC;
            error = nfs_writerpc(vp, uio, cred, &iomode, &must_commit);
            if (must_commit)
                nfs_clearcommit(vp->v_mount);
            return (error);
        }
        nfsstats.biocache_writes++;
        lbn = uio->uio_offset / biosize;
        on = uio->uio_offset & (biosize - 1);
        n = min((unsigned)(biosize - on), uio->uio_resid);
again:
        bufsize = biosize;
        /* (removed for UBC) */
        if ((lbn + 1) * biosize > np->n_size) {
            bufsize = np->n_size - lbn * biosize;
            bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
        }

        /*
         * Get a cache block for writing.  The range to be written is
         * (off..off+len) within the block.  We ensure that the block
         * either has no dirty region or that the given range is
         * contiguous with the existing dirty region.
         */
        bp = nfs_getcacheblk(vp, lbn, bufsize, p, BLK_WRITE);
        if (!bp)
            return (EINTR);

        /*
         * Resize nfsnode *after* we busy the buffer to prevent
         * readers from reading garbage.
         * If there was a partial buf at the old eof, validate
         * and zero the new bytes.
         */
        if (uio->uio_offset + n > np->n_size) {
            struct buf *bp0 = NULL;
            daddr_t bn = np->n_size / biosize;
            int off = np->n_size & (biosize - 1);

            if (off && bn < lbn && incore(vp, bn))
                bp0 = nfs_getcacheblk(vp, bn, biosize, p,
                                      BLK_WRITE);
            np->n_flag |= NMODIFIED;
            np->n_size = uio->uio_offset + n;
            ubc_setsize(vp, (off_t)np->n_size); /* XXX errors */
            if (bp0) {
                bzero((char *)bp0->b_data + off, biosize - off);
                bp0->b_validend = biosize;
                brelse(bp0);
            }
        }
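        /*
         * Note (illustrative): the bzero() above matters for mmap.  The
         * bytes between the old and new EOF were never written by anyone,
         * so the old partial buffer must be zeroed and its valid region
         * extended before the VM system can legitimately hand out that
         * page.
         */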
        /*
         * NFS has embedded ucred so crhold() risks zone corruption
         */
        if (bp->b_wcred == NOCRED)
            bp->b_wcred = crdup(cred);

        /*
         * If dirtyend exceeds file size, chop it down.  This should
         * not occur unless there is a race.
         */
        if ((off_t)bp->b_blkno * DEV_BSIZE + bp->b_dirtyend >
            np->n_size)
            bp->b_dirtyend = np->n_size - (off_t)bp->b_blkno *
                DEV_BSIZE;

        /*
         * UBC doesn't (yet) handle partial pages so nfs_biowrite was
         * hacked to never bdwrite, to start every little write right
         * away.  Running IE Avie noticed the performance problem, thus
         * this code, which permits those delayed writes by ensuring an
         * initial read of the entire page.  The read may hit eof
         * ("short read") but that we will handle.
         *
         * We are quite dependent on the correctness of B_CACHE so check
         * that first in case of problems.
         */
        if (!ISSET(bp->b_flags, B_CACHE) && n < PAGE_SIZE) {
            boff = (off_t)bp->b_blkno * DEV_BSIZE;
            auio.uio_iov = &iov;
            auio.uio_iovcnt = 1;
            auio.uio_offset = boff;
            auio.uio_resid = PAGE_SIZE;
            auio.uio_segflg = UIO_SYSSPACE;
            auio.uio_rw = UIO_READ;
            auio.uio_procp = p;
            iov.iov_base = bp->b_data;
            iov.iov_len = PAGE_SIZE;
            error = nfs_readrpc(vp, &auio, cred);
            if (error) {
                bp->b_error = error;
                SET(bp->b_flags, B_ERROR);
                printf("nfs_write: readrpc %d", error);
            }
            if (auio.uio_resid > 0)
                bzero(iov.iov_base, auio.uio_resid);
            bp->b_validoff = 0;
            bp->b_validend = PAGE_SIZE - auio.uio_resid;
            if (np->n_size > boff + bp->b_validend)
                bp->b_validend = min(np->n_size - boff,
                                     PAGE_SIZE);
            bp->b_dirtyoff = 0;
            bp->b_dirtyend = 0;
        }

        /*
         * If the new write will leave a contiguous dirty
         * area, just update the b_dirtyoff and b_dirtyend,
         * otherwise try to extend the dirty region.
         */
        if (bp->b_dirtyend > 0 &&
            (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) {
            off_t start, end;

            boff = (off_t)bp->b_blkno * DEV_BSIZE;
            if (on > bp->b_dirtyend) {
                start = boff + bp->b_validend;
                end = boff + on;
            } else {
                start = boff + on + n;
                end = boff + bp->b_validoff;
            }

            /*
             * It may be that the valid region in the buffer
             * covers the region we want, in which case just
             * extend the dirty region.  Otherwise we try to
             * extend the valid region.
             */
            if (end > start) {
                auio.uio_iov = &iov;
                auio.uio_iovcnt = 1;
                auio.uio_offset = start;
                auio.uio_resid = end - start;
                auio.uio_segflg = UIO_SYSSPACE;
                auio.uio_rw = UIO_READ;
                auio.uio_procp = p;
                iov.iov_base = bp->b_data + (start - boff);
                iov.iov_len = end - start;
                error = nfs_readrpc(vp, &auio, cred);
                /*
                 * If we couldn't read, do not do a VOP_BWRITE
                 * as originally coded.  That could also error
                 * and looping back to "again" as it was doing
                 * could have us stuck trying to write the same
                 * buf again.  nfs_write will get the entire region
                 * if nfs_readrpc succeeded.  If unsuccessful
                 * we should just error out.  Errors like ESTALE
                 * would keep us looping rather than transient
                 * errors justifying a retry.  We can return here
                 * instead of altering the dirty region later.  We
                 * did not write the old dirty region at this point.
                 */
                if (error) {
                    bp->b_error = error;
                    SET(bp->b_flags, B_ERROR);
                    printf("nfs_write: readrpc2 %d", error);
                    brelse(bp);
                    return (error);
                }
                /*
                 * If there was a short read, just zero fill.
                 */
                if (auio.uio_resid > 0)
                    bzero(iov.iov_base, auio.uio_resid);
                if (on > bp->b_dirtyend)
                    bp->b_validend = on;
                else
                    bp->b_validoff = on + n;
            }
            /*
             * We now have a valid region which extends up to the
             * dirty region which we want.
             */
            if (on > bp->b_dirtyend)
                bp->b_dirtyend = on;
            else
                bp->b_dirtyoff = on + n;
        }
        if (ISSET(bp->b_flags, B_ERROR)) {
            error = bp->b_error;
            brelse(bp);
            return (error);
        }

        /*
         * NFS has embedded ucred so crhold() risks zone corruption
         */
        if (bp->b_wcred == NOCRED)
            bp->b_wcred = crdup(cred);
        np->n_flag |= NMODIFIED;

        /*
         * Check for valid write lease and get one as required.
         * In case getblk() and/or bwrite() delayed us.
         */
        if ((nmp->nm_flag & NFSMNT_NQNFS) &&
            NQNFS_CKINVALID(vp, np, ND_WRITE)) {
            do {
                error = nqnfs_getlease(vp, ND_WRITE, cred, p);
            } while (error == NQNFS_EXPIRED);
            if (error) {
                brelse(bp);
                return (error);
            }
            if (np->n_lrev != np->n_brev ||
                (np->n_flag & NQNFSNONCACHE)) {
                brelse(bp);
                error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
                if (error)
                    return (error);
                np->n_brev = np->n_lrev;
                goto again;
            }
        }

        error = uiomove((char *)bp->b_data + on, n, uio);
        if (error) {
            SET(bp->b_flags, B_ERROR);
            brelse(bp);
            return (error);
        }

        if (bp->b_dirtyend > 0) {
            bp->b_dirtyoff = min(on, bp->b_dirtyoff);
            bp->b_dirtyend = max((on + n), bp->b_dirtyend);
        } else {
            bp->b_dirtyoff = on;
            bp->b_dirtyend = on + n;
        }
        if (bp->b_validend == 0 || bp->b_validend < bp->b_dirtyoff ||
            bp->b_validoff > bp->b_dirtyend) {
            bp->b_validoff = bp->b_dirtyoff;
            bp->b_validend = bp->b_dirtyend;
        } else {
            bp->b_validoff = min(bp->b_validoff, bp->b_dirtyoff);
            bp->b_validend = max(bp->b_validend, bp->b_dirtyend);
        }
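        /*
         * Illustrative: if the buffer was valid over [0,1024) and this
         * write dirtied [512,2048), the merge above leaves the dirty
         * range [512,2048) and grows the valid range to [0,2048).  The
         * valid region must always contain the dirty region, or
         * nfs_doio() could push uninitialized bytes to the server.
         */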

        /*
         * Since this block is being modified, it must be written
         * again and not just committed.
         */
        CLR(bp->b_flags, B_NEEDCOMMIT);

        /*
         * If the lease is non-cachable or IO_SYNC do bwrite().
         */
        if ((np->n_flag & NQNFSNONCACHE) || (ioflag & IO_SYNC)) {
            bp->b_proc = p;
            error = VOP_BWRITE(bp);
            if (error)
                return (error);
            if (np->n_flag & NQNFSNONCACHE) {
                error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
                if (error)
                    return (error);
            }
        } else if ((n + on) == biosize &&
                   (nmp->nm_flag & NFSMNT_NQNFS) == 0) {
            bp->b_proc = (struct proc *)0;
            SET(bp->b_flags, B_ASYNC);
            (void)nfs_writebp(bp, 0);
        } else
            bdwrite(bp);
    } while (uio->uio_resid > 0 && n > 0);
    return (0);
}

/*
 * Get an nfs cache block.
 * Allocate a new one if the block isn't currently in the cache
 * and return the block marked busy.  If the calling process is
 * interrupted by a signal for an interruptible mount point, return
 * NULL.
 */
static struct buf *
nfs_getcacheblk(vp, bn, size, p, operation)
    struct vnode *vp;
    daddr_t bn;
    int size;
    struct proc *p;
    int operation; /* defined in sys/buf.h */
{
    register struct buf *bp;
    struct nfsmount *nmp = VFSTONFS(vp->v_mount);
    /* due to getblk/vm interactions, use vm page size or less values */
    int biosize = min(vp->v_mount->mnt_stat.f_iosize, PAGE_SIZE);

    if (nmp->nm_flag & NFSMNT_INT) {
        bp = getblk(vp, bn, size, PCATCH, 0, operation);
        while (bp == (struct buf *)0) {
            if (nfs_sigintr(nmp, (struct nfsreq *)0, p))
                return ((struct buf *)0);
            bp = getblk(vp, bn, size, 0, 2 * hz, operation);
        }
    } else
        bp = getblk(vp, bn, size, 0, 0, operation);

    if (vp->v_type == VREG)
        bp->b_blkno = (bn * biosize) / DEV_BSIZE;
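    /*
     * Illustrative: for a VREG file with biosize 4096, logical block 3
     * maps to b_blkno = (3 * 4096) / 512 = 24, so nfs_doio() can recover
     * the byte offset as b_blkno * DEV_BSIZE without knowing biosize.
     */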

    return (bp);
}

/*
 * Flush and invalidate all dirty buffers.  If another process is already
 * doing the flush, just wait for completion.
 */
int
nfs_vinvalbuf(vp, flags, cred, p, intrflg)
    struct vnode *vp;
    int flags;
    struct ucred *cred;
    struct proc *p;
    int intrflg;
{
    register struct nfsnode *np = VTONFS(vp);
    struct nfsmount *nmp = VFSTONFS(vp->v_mount);
    int error = 0, slpflag, slptimeo;
    int didhold = 0;

    if ((nmp->nm_flag & NFSMNT_INT) == 0)
        intrflg = 0;
    if (intrflg) {
        slpflag = PCATCH;
        slptimeo = 2 * hz;
    } else {
        slpflag = 0;
        slptimeo = 0;
    }

    /*
     * First wait for any other process doing a flush to complete.
     */
    while (np->n_flag & NFLUSHINPROG) {
        np->n_flag |= NFLUSHWANT;
        error = tsleep((caddr_t)&np->n_flag, PRIBIO + 2, "nfsvinval",
                       slptimeo);
        if (error && intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p))
            return (EINTR);
    }

    /*
     * Now, flush as required.
     */
    np->n_flag |= NFLUSHINPROG;
    error = vinvalbuf(vp, flags, cred, p, slpflag, 0);
    while (error) {
        /* we seem to be stuck in a loop here if the thread got aborted.
         * nfs_flush will return EINTR.  Not sure if that will cause
         * other consequences due to EINTR having other meanings in NFS.
         * To handle the case of no dirty pages, it seems safe to just
         * return from here.  But if we did have dirty pages, how would
         * we get them written out if the thread was aborted?  Some other
         * strategy is necessary. -- EKN
         */
        if ((intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p)) ||
            (error == EINTR && current_thread_aborted())) {
            np->n_flag &= ~NFLUSHINPROG;
            if (np->n_flag & NFLUSHWANT) {
                np->n_flag &= ~NFLUSHWANT;
                wakeup((caddr_t)&np->n_flag);
            }
            return (EINTR);
        }
        error = vinvalbuf(vp, flags, cred, p, 0, slptimeo);
    }
    np->n_flag &= ~(NMODIFIED | NFLUSHINPROG);
    if (np->n_flag & NFLUSHWANT) {
        np->n_flag &= ~NFLUSHWANT;
        wakeup((caddr_t)&np->n_flag);
    }
    didhold = ubc_hold(vp);
    if (didhold) {
        (void) ubc_clean(vp, 1); /* get the pages out of vm also */
        ubc_rele(vp);
    }
    return (0);
}

/*
 * Initiate asynchronous I/O.  Return an error if no nfsiods are available.
 * This is mainly to avoid queueing async I/O requests when the nfsiods
 * are all hung on a dead server.
 */
int
nfs_asyncio(bp, cred)
    register struct buf *bp;
    struct ucred *cred;
{
    struct nfsmount *nmp;
    register int i;
    int gotiod;
    int slpflag = 0;
    int slptimeo = 0;
    int error;

    if (nfs_numasync == 0)
        return (EIO);

    nmp = VFSTONFS(bp->b_vp->v_mount);
again:
    if (nmp->nm_flag & NFSMNT_INT)
        slpflag = PCATCH;
    gotiod = FALSE;

    /*
     * Find a free iod to process this request.
     */
    for (i = 0; i < NFS_MAXASYNCDAEMON; i++)
        if (nfs_iodwant[i]) {
            /*
             * Found one, so wake it up and tell it which
             * mount to process.
             */
            NFS_DPF(ASYNCIO,
                ("nfs_asyncio: waking iod %d for mount %p\n",
                 i, nmp));
            nfs_iodwant[i] = (struct proc *)0;
            nfs_iodmount[i] = nmp;
            nmp->nm_bufqiods++;
            wakeup((caddr_t)&nfs_iodwant[i]);
            gotiod = TRUE;
            break;
        }

    /*
     * If none are free, we may already have an iod working on this mount
     * point.  If so, it will process our request.
     */
    if (!gotiod) {
        if (nmp->nm_bufqiods > 0) {
            NFS_DPF(ASYNCIO,
                ("nfs_asyncio: %d iods are already processing mount %p\n",
                 nmp->nm_bufqiods, nmp));
            gotiod = TRUE;
        }
    }

    /*
     * If we have an iod which can process the request, then queue
     * the buffer.
     */
    if (gotiod) {
        /*
         * Ensure that the queue never grows too large.
         */
        while (nmp->nm_bufqlen >= 2*nfs_numasync) {
            NFS_DPF(ASYNCIO,
                ("nfs_asyncio: waiting for mount %p queue to drain\n", nmp));
            nmp->nm_bufqwant = TRUE;
            error = tsleep(&nmp->nm_bufq, slpflag | PRIBIO,
                           "nfsaio", slptimeo);
            if (error) {
                if (nfs_sigintr(nmp, NULL, bp->b_proc))
                    return (EINTR);
                if (slpflag == PCATCH) {
                    slpflag = 0;
                    slptimeo = 2 * hz;
                }
            }
            /*
             * We might have lost our iod while sleeping,
             * so check and loop if necessary.
             */
            if (nmp->nm_bufqiods == 0) {
                NFS_DPF(ASYNCIO,
                    ("nfs_asyncio: no iods after mount %p queue was drained, looping\n", nmp));
                goto again;
            }
        }
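        /*
         * Design note (illustrative): the 2 * nfs_numasync bound above
         * keeps a slow server from queueing unbounded buffers on one
         * mount; callers sleep in tsleep() until an nfsiod drains the
         * queue and clears nm_bufqwant.
         */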
        if (ISSET(bp->b_flags, B_READ)) {
            if (bp->b_rcred == NOCRED && cred != NOCRED) {
                /*
                 * NFS has embedded ucred.
                 * Can not crhold() here as that causes zone corruption
                 */
                bp->b_rcred = crdup(cred);
            }
        } else {
            SET(bp->b_flags, B_WRITEINPROG);
            if (bp->b_wcred == NOCRED && cred != NOCRED) {
                /*
                 * NFS has embedded ucred.
                 * Can not crhold() here as that causes zone corruption
                 */
                bp->b_wcred = crdup(cred);
            }
        }

        TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, b_freelist);
        nmp->nm_bufqlen++;
        return (0);
    }

    /*
     * All the iods are busy on other mounts, so return EIO to
     * force the caller to process the i/o synchronously.
     */
    NFS_DPF(ASYNCIO, ("nfs_asyncio: no iods available, i/o is synchronous\n"));
    return (EIO);
}

/*
 * Do an I/O operation to/from a cache block.  This may be called
 * synchronously or from an nfsiod.
 */
int
nfs_doio(bp, cr, p)
    register struct buf *bp;
    struct ucred *cr;
    struct proc *p;
{
    register struct uio *uiop;
    register struct vnode *vp;
    struct nfsnode *np;
    struct nfsmount *nmp;
    int error = 0, diff, len, iomode, must_commit = 0;
    struct uio uio;
    struct iovec io;

    vp = bp->b_vp;
    np = VTONFS(vp);
    nmp = VFSTONFS(vp->v_mount);
    uiop = &uio;
    uiop->uio_iov = &io;
    uiop->uio_iovcnt = 1;
    uiop->uio_segflg = UIO_SYSSPACE;
    uiop->uio_procp = p;

    /*
     * With UBC, getblk() can return a buf with B_DONE set.
     * This indicates that the VM has valid data for that page.
     * NFS being stateless, this case poses a problem.
     * By definition, the NFS server should always be consulted
     * for the data in that page.
     * So we choose to clear the B_DONE and to do the IO.
     *
     * XXX revisit this if there is a performance issue.
     * XXX In that case, we could play the attribute cache games ...
     */
    if (ISSET(bp->b_flags, B_DONE)) {
        if (!ISSET(bp->b_flags, B_ASYNC))
            panic("nfs_doio: done and not async");
        CLR(bp->b_flags, B_DONE);
    }
    FSDBG_TOP(256, np->n_size, bp->b_blkno * DEV_BSIZE, bp->b_bcount,
              bp->b_flags);
    FSDBG(257, bp->b_validoff, bp->b_validend, bp->b_dirtyoff,
          bp->b_dirtyend);

    /*
     * Historically, paging was done with physio, but no more.
     */
    if (ISSET(bp->b_flags, B_PHYS)) {
        /*
         * ...though reading /dev/drum still gets us here.
         */
        io.iov_len = uiop->uio_resid = bp->b_bcount;
        /* mapping was done by vmapbuf() */
        io.iov_base = bp->b_data;
        uiop->uio_offset = (off_t)bp->b_blkno * DEV_BSIZE;
        if (ISSET(bp->b_flags, B_READ)) {
            uiop->uio_rw = UIO_READ;
            nfsstats.read_physios++;
            error = nfs_readrpc(vp, uiop, cr);
        } else {
            int com;

            iomode = NFSV3WRITE_DATASYNC;
            uiop->uio_rw = UIO_WRITE;
            nfsstats.write_physios++;
            error = nfs_writerpc(vp, uiop, cr, &iomode, &com);
        }
        if (error) {
            SET(bp->b_flags, B_ERROR);
            bp->b_error = error;
        }
    } else if (ISSET(bp->b_flags, B_READ)) {
        io.iov_len = uiop->uio_resid = bp->b_bcount;
        io.iov_base = bp->b_data;
        uiop->uio_rw = UIO_READ;
        switch (vp->v_type) {
        case VREG:
            uiop->uio_offset = (off_t)bp->b_blkno * DEV_BSIZE;
            nfsstats.read_bios++;
            error = nfs_readrpc(vp, uiop, cr);
            FSDBG(262, np->n_size, bp->b_blkno * DEV_BSIZE,
                  uiop->uio_resid, error);
            if (!error) {
                bp->b_validoff = 0;
                if (uiop->uio_resid) {
                    /*
                     * If len > 0, there is a hole in the file and
                     * no writes after the hole have been pushed to
                     * the server yet.
                     * Just zero fill the rest of the valid area.
                     */
                    diff = bp->b_bcount - uiop->uio_resid;
                    len = np->n_size - ((u_quad_t)bp->b_blkno * DEV_BSIZE +
                                        diff);
                    if (len > 0) {
                        len = min(len, uiop->uio_resid);
                        bzero((char *)bp->b_data + diff, len);
                        bp->b_validend = diff + len;
                        FSDBG(258, diff, len, 0, 1);
                    } else
                        bp->b_validend = diff;
                } else
                    bp->b_validend = bp->b_bcount;
#if 1 /* USV + JOE [ */
                if (bp->b_validend < bp->b_bufsize) {
                    /*
                     * we're about to release a partial buffer after a
                     * read... the only way we should get here is if
                     * this buffer contains the EOF.  Before releasing
                     * it, we'll zero out to the end of the buffer so
                     * that if a mmap of this page occurs, we'll see
                     * zero's even if a ftruncate extends the file in
                     * the meantime.
                     */
                    bzero((caddr_t)(bp->b_data + bp->b_validend),
                          bp->b_bufsize - bp->b_validend);
                    FSDBG(258, bp->b_validend,
                          bp->b_bufsize - bp->b_validend, 0, 2);
                }
#endif /* ] USV + JOE */
            }
            if (p && (vp->v_flag & VTEXT) &&
                (((nmp->nm_flag & NFSMNT_NQNFS) &&
                  NQNFS_CKINVALID(vp, np, ND_READ) &&
                  np->n_lrev != np->n_brev) ||
                 (!(nmp->nm_flag & NFSMNT_NQNFS) &&
                  np->n_mtime != np->n_vattr.va_mtime.tv_sec))) {
                uprintf("Process killed due to text file modification\n");
                psignal(p, SIGKILL);
                p->p_flag |= P_NOSWAP;
            }
            break;
        case VLNK:
            uiop->uio_offset = (off_t)0;
            nfsstats.readlink_bios++;
            error = nfs_readlinkrpc(vp, uiop, cr);
            break;
        case VDIR:
            nfsstats.readdir_bios++;
            uiop->uio_offset = ((u_quad_t)bp->b_lblkno) * NFS_DIRBLKSIZ;
            if (!(nmp->nm_flag & NFSMNT_NFSV3))
                nmp->nm_flag &= ~NFSMNT_RDIRPLUS; /* dk@farm.org */
            if (nmp->nm_flag & NFSMNT_RDIRPLUS) {
                error = nfs_readdirplusrpc(vp, uiop, cr);
                if (error == NFSERR_NOTSUPP)
                    nmp->nm_flag &= ~NFSMNT_RDIRPLUS;
            }
            if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0)
                error = nfs_readdirrpc(vp, uiop, cr);
            break;
        default:
            printf("nfs_doio: type %x unexpected\n", vp->v_type);
            break;
        };
        if (error) {
            SET(bp->b_flags, B_ERROR);
            bp->b_error = error;
        }
    } else {
        /*
         * mapped I/O may have altered any bytes, so we extend
         * the dirty zone to the valid zone.  For best performance
         * a better solution would be to save & restore page dirty bits
         * around the uiomove which brings write-data into the buffer.
         * Then here we'd check if the page is dirty rather than WASMAPPED.
         * Also vnode_pager would change - if a page is clean it might
         * still need to be written due to DELWRI.
         */
        if (UBCINFOEXISTS(vp) && ubc_issetflags(vp, UI_WASMAPPED)) {
            bp->b_dirtyoff = min(bp->b_dirtyoff, bp->b_validoff);
            bp->b_dirtyend = max(bp->b_dirtyend, bp->b_validend);
        }
        if ((off_t)bp->b_blkno * DEV_BSIZE + bp->b_dirtyend > np->n_size)
            bp->b_dirtyend = np->n_size - (off_t)bp->b_blkno * DEV_BSIZE;

        if (bp->b_dirtyend > bp->b_dirtyoff) {
            io.iov_len = uiop->uio_resid = bp->b_dirtyend - bp->b_dirtyoff;
            uiop->uio_offset = (off_t)bp->b_blkno * DEV_BSIZE +
                bp->b_dirtyoff;
            io.iov_base = (char *)bp->b_data + bp->b_dirtyoff;
            uiop->uio_rw = UIO_WRITE;

            nfsstats.write_bios++;
            if ((bp->b_flags & (B_ASYNC | B_NEEDCOMMIT | B_NOCACHE)) ==
                B_ASYNC)
                iomode = NFSV3WRITE_UNSTABLE;
            else
                iomode = NFSV3WRITE_FILESYNC;
            SET(bp->b_flags, B_WRITEINPROG);
            error = nfs_writerpc(vp, uiop, cr, &iomode, &must_commit);
            if (!error && iomode == NFSV3WRITE_UNSTABLE)
                SET(bp->b_flags, B_NEEDCOMMIT);
            else
                CLR(bp->b_flags, B_NEEDCOMMIT);
            CLR(bp->b_flags, B_WRITEINPROG);

            /*
             * For an interrupted write, the buffer is still valid
             * and the write hasn't been pushed to the server yet,
             * so we can't set B_ERROR and report the interruption
             * by setting B_EINTR.  For the B_ASYNC case, B_EINTR
             * is not relevant, so the rpc attempt is essentially
             * a noop.  For the case of a V3 write rpc not being
             * committed to stable storage, the block is still
             * dirty and requires either a commit rpc or another
             * write rpc with iomode == NFSV3WRITE_FILESYNC before
             * the block is reused.  This is indicated by setting
             * the B_DELWRI and B_NEEDCOMMIT flags.
             */
            if (error == EINTR || (!error && bp->b_flags & B_NEEDCOMMIT)) {
                int s;

                CLR(bp->b_flags, B_INVAL | B_NOCACHE);
                if (!ISSET(bp->b_flags, B_DELWRI)) {
                    extern int nbdwrite;
                    SET(bp->b_flags, B_DELWRI);
                    nbdwrite++;
                }
                FSDBG(261, bp->b_validoff, bp->b_validend,
                      bp->b_bufsize, bp->b_bcount);
                /*
                 * Since for the B_ASYNC case, nfs_bwrite() has
                 * reassigned the buffer to the clean list, we have to
                 * reassign it back to the dirty one. Ugh.
                 */
                if (ISSET(bp->b_flags, B_ASYNC)) {
                    s = splbio();
                    reassignbuf(bp, vp);
                    splx(s);
                } else {
                    SET(bp->b_flags, B_EINTR);
                }
            } else {
                if (error) {
                    SET(bp->b_flags, B_ERROR);
                    bp->b_error = np->n_error = error;
                    np->n_flag |= NWRITEERR;
                }
                bp->b_dirtyoff = bp->b_dirtyend = 0;

                /*
                 * validoff and validend represent the real data present
                 * in this buffer.  If validoff is non-zero, then we have
                 * to invalidate the buffer and kill the page when
                 * biodone is called... the same is also true when
                 * validend doesn't extend all the way to the end of the
                 * buffer and validend doesn't equate to the current
                 * EOF... eventually we need to deal with this in a more
                 * humane way (like keeping the partial buffer without
                 * making it immediately available to the VM page cache)
                 */
                if (bp->b_validoff)
                    SET(bp->b_flags, B_INVAL);
                else if (bp->b_validend < bp->b_bufsize) {
                    if ((off_t)bp->b_blkno * DEV_BSIZE +
                        bp->b_validend == np->n_size) {
                        bzero((caddr_t)(bp->b_data + bp->b_validend),
                              bp->b_bufsize - bp->b_validend);
                        FSDBG(259, bp->b_validend,
                              bp->b_bufsize - bp->b_validend, 0, 0);
                    } else
                        SET(bp->b_flags, B_INVAL);
                }
            }
        } else {
            if (bp->b_validoff ||
                (bp->b_validend < bp->b_bufsize &&
                 (off_t)bp->b_blkno * DEV_BSIZE + bp->b_validend !=
                 np->n_size)) {
                SET(bp->b_flags, B_INVAL);
            }
            if (bp->b_flags & B_INVAL) {
                FSDBG(260, bp->b_validoff, bp->b_validend,
                      bp->b_bufsize, bp->b_bcount);
            }
            bp->b_resid = 0;
            FSDBG_BOT(256, bp->b_validoff, bp->b_validend, bp->b_bufsize,
                      np->n_size);
            biodone(bp);
            return (0);
        }
    }
    bp->b_resid = uiop->uio_resid;
    if (must_commit)
        nfs_clearcommit(vp->v_mount);
    if (bp->b_flags & B_INVAL) {
        FSDBG(260, bp->b_validoff, bp->b_validend, bp->b_bufsize,
              bp->b_bcount);
    }
    FSDBG_BOT(256, bp->b_validoff, bp->b_validend, bp->b_bcount, error);

    biodone(bp);
    return (error);
}