/*
 * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License"). You may not use this file except in compliance with the
 * License. Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)nfs_bio.c	8.9 (Berkeley) 3/30/95
 * FreeBSD-Id: nfs_bio.c,v 1.44 1997/09/10 19:52:25 phk Exp $
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/vmparam.h>

#include <kern/clock.h>

#include <nfs/rpcv2.h>
#include <nfs/nfsproto.h>
#include <nfs/nfsmount.h>
#include <nfs/nqnfs.h>
#include <nfs/nfsnode.h>

#include <sys/kdebug.h>
#define FSDBG(A, B, C, D, E) \
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_NONE, \
		(int)(B), (int)(C), (int)(D), (int)(E), 0)
#define FSDBG_TOP(A, B, C, D, E) \
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_START, \
		(int)(B), (int)(C), (int)(D), (int)(E), 0)
#define FSDBG_BOT(A, B, C, D, E) \
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_END, \
		(int)(B), (int)(C), (int)(D), (int)(E), 0)
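/*
 * Usage sketch: these macros emit kdebug trace points in the DBG_FSRW
 * class.  FSDBG_TOP and FSDBG_BOT bracket an operation with matched
 * start/end events so a trace tool can pair them; nfs_doio() below, for
 * example, brackets each buffer I/O:
 *
 *	FSDBG_TOP(256, np->n_size, bp->b_blkno * DEV_BSIZE, bp->b_bcount, ...);
 *	... perform the I/O ...
 *	FSDBG_BOT(256, bp->b_validoff, bp->b_validend, ..., error);
 *
 * The first argument (e.g. 256) selects the trace code; the remaining four
 * are free-form values recorded with the event.
 */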
static struct buf *nfs_getcacheblk __P((struct vnode *vp, daddr_t bn, int size,
					struct proc *p, int operation));

extern int nfs_numasync;
extern struct nfsstats nfsstats;
/*
 * Vnode op for read using bio
 * Any similarity to readip() is purely coincidental
 */
int
nfs_bioread(vp, uio, ioflag, cred, getpages)
	register struct vnode *vp;
	register struct uio *uio;
	int ioflag;
	struct ucred *cred;
	int getpages;
{
	register struct nfsnode *np = VTONFS(vp);
	register int biosize, diff, i;
	struct buf *bp = 0, *rabp;
	struct vattr vattr;
	struct proc *p;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	daddr_t lbn, rabn;
	int bufsize;
	int nra, error = 0, n = 0, on = 0, not_readin;
	int operation = (getpages ? BLK_PAGEIN : BLK_READ);

	p = uio->uio_procp;
	if (uio->uio_rw != UIO_READ)
		panic("nfs_read mode");
	if (uio->uio_resid == 0)
		return (0);
	if (uio->uio_offset < 0)
		return (EINVAL);

	if ((nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_GOTFSINFO)) == NFSMNT_NFSV3)
		(void)nfs_fsinfo(nmp, vp, cred, p);
	/* due to getblk/vm interactions, use vm page size or less values */
	biosize = min(vp->v_mount->mnt_stat.f_iosize, PAGE_SIZE);
	/*
	 * For nfs, cache consistency can only be maintained approximately.
	 * Although RFC1094 does not specify the criteria, the following is
	 * believed to be compatible with the reference port.
	 * For nqnfs, full cache consistency is maintained within the loop.
	 * For nfs:
	 * If the file's modify time on the server has changed since the
	 * last read rpc or you have written to the file,
	 * you may have lost data cache consistency with the
	 * server, so flush all of the file's data out of the cache.
	 * Then force a getattr rpc to ensure that you have up to date
	 * cached attributes.
	 * NB: This implies that cache data can be read when up to
	 * NFS_ATTRTIMEO seconds out of date. If you find that you need current
	 * attributes this could be forced by setting n_attrstamp to 0 before
	 * the VOP_GETATTR() call.
	 */
	if ((nmp->nm_flag & NFSMNT_NQNFS) == 0) {
		if (np->n_flag & NMODIFIED) {
			if (vp->v_type != VREG) {
				if (vp->v_type != VDIR)
					panic("nfs: bioread, not dir");
			}
			error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
			if (error)
				return (error);
			np->n_attrstamp = 0;
			error = VOP_GETATTR(vp, &vattr, cred, p);
			if (error)
				return (error);
			np->n_mtime = vattr.va_mtime.tv_sec;
		} else {
			error = VOP_GETATTR(vp, &vattr, cred, p);
			if (error)
				return (error);
			if (np->n_mtime != vattr.va_mtime.tv_sec) {
				if (vp->v_type == VDIR)
					nfs_invaldir(vp);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
				np->n_mtime = vattr.va_mtime.tv_sec;
			}
		}
	}
	do {
		/*
		 * Get a valid lease. If cached data is stale, flush it.
		 */
		if (nmp->nm_flag & NFSMNT_NQNFS) {
			if (NQNFS_CKINVALID(vp, np, ND_READ)) {
				do {
					error = nqnfs_getlease(vp, ND_READ, cred, p);
				} while (error == NQNFS_EXPIRED);
				if (error)
					return (error);
				if (np->n_lrev != np->n_brev ||
				    (np->n_flag & NQNFSNONCACHE) ||
				    ((np->n_flag & NMODIFIED) && vp->v_type == VDIR)) {
					if (vp->v_type == VDIR)
						nfs_invaldir(vp);
					error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
					if (error)
						return (error);
					np->n_brev = np->n_lrev;
				}
			} else if (vp->v_type == VDIR && (np->n_flag & NMODIFIED)) {
				nfs_invaldir(vp);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
			}
		}
		if (np->n_flag & NQNFSNONCACHE) {
			switch (vp->v_type) {
			case VREG:
				return (nfs_readrpc(vp, uio, cred));
			case VLNK:
				return (nfs_readlinkrpc(vp, uio, cred));
			case VDIR:
				break;
			default:
				printf(" NQNFSNONCACHE: type %x unexpected\n",
				       vp->v_type);
			};
		}
		switch (vp->v_type) {
		case VREG:
			nfsstats.biocache_reads++;
			lbn = uio->uio_offset / biosize;
			on = uio->uio_offset & (biosize - 1);
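			/*
			 * Illustrative arithmetic (assuming biosize ==
			 * PAGE_SIZE == 4096): a read at uio_offset 10000 maps
			 * to logical block lbn = 10000 / 4096 = 2 and block
			 * offset on = 10000 & 4095 = 1808.  The mask form
			 * assumes biosize is a power of two.
			 */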
			not_readin = 1;

			/*
			 * Start the read ahead(s), as required.
			 */
			if (nfs_numasync > 0 && nmp->nm_readahead > 0) {
				for (nra = 0; nra < nmp->nm_readahead &&
				     (off_t)(lbn + 1 + nra) * biosize < np->n_size;
				     nra++) {
					rabn = lbn + 1 + nra;
					if (!incore(vp, rabn)) {
						rabp = nfs_getcacheblk(vp, rabn, biosize, p, operation);
						if (!rabp)
							return (EINTR);
						if (!ISSET(rabp->b_flags, (B_CACHE|B_DELWRI))) {
							SET(rabp->b_flags, (B_READ | B_ASYNC));
							if (nfs_asyncio(rabp, cred)) {
								SET(rabp->b_flags, (B_INVAL|B_ERROR));
								rabp->b_error = EIO;
								brelse(rabp);
							}
						} else
							brelse(rabp);
					}
				}
			}
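			/*
			 * Example (illustrative): with nm_readahead == 2 and
			 * lbn == 2, the loop above issues asynchronous reads
			 * for blocks 3 and 4, provided each lies below the
			 * file size and is not already incore.  If
			 * nfs_asyncio() fails (no nfsiod available), the
			 * speculative buffer is simply invalidated and
			 * released.
			 */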
			/*
			 * If the block is in the cache and has the required data
			 * in a valid region, just copy it out.
			 * Otherwise, get the block and write back/read in,
			 * as required.
			 */
again:
			bufsize = biosize;
			if ((off_t)(lbn + 1) * biosize > np->n_size &&
			    (off_t)(lbn + 1) * biosize - np->n_size < biosize) {
				bufsize = np->n_size - lbn * biosize;
				bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
			}
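			/*
			 * Worked example (illustrative, biosize == 4096,
			 * DEV_BSIZE == 512): for the last block of a
			 * 10000-byte file, lbn == 2 and bufsize == 10000 -
			 * 8192 == 1808, which the rounding step raises to the
			 * next DEV_BSIZE multiple, 2048.  The EOF block's
			 * buffer is thus trimmed to the file size, rounded up
			 * to a device block.
			 */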
			bp = nfs_getcacheblk(vp, lbn, bufsize, p, operation);
			if (!bp)
				return (EINTR);
			if (!ISSET(bp->b_flags, B_CACHE)) {
				SET(bp->b_flags, B_READ);
				CLR(bp->b_flags, (B_DONE | B_ERROR | B_INVAL));
				not_readin = 0;
				error = nfs_doio(bp, cred, p);
				if (error) {
					brelse(bp);
					return (error);
				}
			}
			n = min((unsigned)(bufsize - on), uio->uio_resid);
			diff = np->n_size - uio->uio_offset;
			if (diff < n)
				n = diff;
			if (not_readin && n > 0) {
				if (on < bp->b_validoff || (on + n) > bp->b_validend) {
					SET(bp->b_flags, (B_NOCACHE|B_INVAFTERWRITE));
					if (bp->b_dirtyend > 0) {
						if (!ISSET(bp->b_flags, B_DELWRI))
							panic("nfsbioread");
						if (VOP_BWRITE(bp) == EINTR)
							return (EINTR);
					} else
						brelse(bp);
					goto again;
				}
			}
			diff = (on >= bp->b_validend) ? 0 : (bp->b_validend - on);
			if (diff < n)
				n = diff;
			break;
		case VLNK:
			nfsstats.biocache_readlinks++;
			bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, p, operation);
			if (!bp)
				return (EINTR);
			if (!ISSET(bp->b_flags, B_CACHE)) {
				SET(bp->b_flags, B_READ);
				error = nfs_doio(bp, cred, p);
				if (error) {
					SET(bp->b_flags, B_ERROR);
					brelse(bp);
					return (error);
				}
			}
			n = min(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid);
			on = 0;
			break;
		case VDIR:
			nfsstats.biocache_readdirs++;
			if (np->n_direofoffset
			    && uio->uio_offset >= np->n_direofoffset) {
				return (0);
			}
			lbn = uio->uio_offset / NFS_DIRBLKSIZ;
			on = uio->uio_offset & (NFS_DIRBLKSIZ - 1);
			bp = nfs_getcacheblk(vp, lbn, NFS_DIRBLKSIZ, p, operation);
			if (!bp)
				return (EINTR);
			if (!ISSET(bp->b_flags, B_CACHE)) {
				SET(bp->b_flags, B_READ);
				error = nfs_doio(bp, cred, p);
				if (error)
					brelse(bp);
				while (error == NFSERR_BAD_COOKIE) {
					nfs_invaldir(vp);
					error = nfs_vinvalbuf(vp, 0, cred, p, 1);
					/*
					 * Yuck! The directory has been modified on the
					 * server. The only way to get the block is by
					 * reading from the beginning to get all the
					 * offset cookies.
					 */
					for (i = 0; i <= lbn && !error; i++) {
						if (np->n_direofoffset
						    && (i * NFS_DIRBLKSIZ) >= np->n_direofoffset)
							return (0);
						bp = nfs_getcacheblk(vp, i, NFS_DIRBLKSIZ, p,
								     operation);
						if (!bp)
							return (EINTR);
						if (!ISSET(bp->b_flags, B_CACHE)) {
							SET(bp->b_flags, B_READ);
							error = nfs_doio(bp, cred, p);
							/*
							 * no error + B_INVAL == directory EOF,
							 * use the block.
							 */
							if (error == 0 && (bp->b_flags & B_INVAL))
								break;
						}
						/*
						 * An error will throw away the block and the
						 * for loop will break out. If no error and this
						 * is not the block we want, we throw away the
						 * block and go for the next one via the for loop.
						 */
						if (error || i < lbn)
							brelse(bp);
					}
				}
				/*
				 * The above while is repeated if we hit another cookie
				 * error. If we hit an error and it wasn't a cookie error,
				 * we give up.
				 */
				if (error)
					return (error);
			}
			/*
			 * If not eof and read aheads are enabled, start one.
			 * (You need the current block first, so that you have the
			 *  directory offset cookie of the next block.)
			 */
			if (nfs_numasync > 0 && nmp->nm_readahead > 0 &&
			    (np->n_direofoffset == 0 ||
			     (lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) &&
			    !(np->n_flag & NQNFSNONCACHE) &&
			    !incore(vp, lbn + 1)) {
				rabp = nfs_getcacheblk(vp, lbn + 1, NFS_DIRBLKSIZ, p,
						       operation);
				if (rabp) {
					if (!ISSET(rabp->b_flags, (B_CACHE|B_DELWRI))) {
						SET(rabp->b_flags, (B_READ | B_ASYNC));
						if (nfs_asyncio(rabp, cred)) {
							SET(rabp->b_flags, (B_INVAL|B_ERROR));
							rabp->b_error = EIO;
							brelse(rabp);
						}
					} else
						brelse(rabp);
				}
			}
			/*
			 * Make sure we use a signed variant of min() since
			 * the second term may be negative.
			 */
			n = lmin(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid - on);
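			/*
			 * Why lmin(): bp->b_resid reflects how much of the
			 * directory block the server actually filled, so
			 * NFS_DIRBLKSIZ - bp->b_resid - on can go negative
			 * once the caller's offset passes the valid data.  An
			 * unsigned min() would convert that negative count
			 * into a huge positive one; lmin() keeps the
			 * comparison signed so n simply becomes <= 0 and the
			 * read loop terminates.
			 */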
			/*
			 * Unlike VREG files, whose buffer size ( bp->b_bcount ) is
			 * chopped for the EOF condition, we cannot tell how large
			 * NFS directories are going to be until we hit EOF. So
			 * an NFS directory buffer is *not* chopped to its EOF. Now,
			 * it just so happens that b_resid will effectively chop it
			 * to EOF. *BUT* this information is lost if the buffer goes
			 * away and is reconstituted into a B_CACHE state (recovered
			 * from VM) later. So we keep track of the directory eof
			 * in np->n_direofoffset and chop it off as an extra step
			 * right here.
			 */
			if (np->n_direofoffset &&
			    n > np->n_direofoffset - uio->uio_offset)
				n = np->n_direofoffset - uio->uio_offset;
			break;
		default:
			printf(" nfs_bioread: type %x unexpected\n", vp->v_type);
			break;
		};

		if (n > 0) {
			error = uiomove(bp->b_data + on, (int)n, uio);
		}
		switch (vp->v_type) {
		case VREG:
			break;
		case VLNK:
			n = 0;
			break;
		case VDIR:
			if (np->n_flag & NQNFSNONCACHE)
				SET(bp->b_flags, B_INVAL);
			break;
		default:
			printf(" nfs_bioread: type %x unexpected\n", vp->v_type);
		}
		brelse(bp);
	} while (error == 0 && uio->uio_resid > 0 && n > 0);
	return (error);
}
/*
 * Vnode op for write using bio
 */
int
nfs_write(ap)
	struct vop_write_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		struct ucred *a_cred;
	} */ *ap;
{
	register int biosize;
	register struct uio *uio = ap->a_uio;
	struct proc *p = uio->uio_procp;
	register struct vnode *vp = ap->a_vp;
	struct nfsnode *np = VTONFS(vp);
	register struct ucred *cred = ap->a_cred;
	int ioflag = ap->a_ioflag;
	struct buf *bp;
	struct vattr vattr;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	daddr_t lbn;
	int bufsize;
	int n, on, error = 0, iomode, must_commit;
	off_t boff;
	struct iovec iov;
	struct uio auio;

	if (uio->uio_rw != UIO_WRITE)
		panic("nfs_write mode");
	if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != current_proc())
		panic("nfs_write proc");
	if (vp->v_type != VREG)
		return (EIO);
	if (np->n_flag & NWRITEERR) {
		np->n_flag &= ~NWRITEERR;
		return (np->n_error);
	}
	if ((nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_GOTFSINFO)) == NFSMNT_NFSV3)
		(void)nfs_fsinfo(nmp, vp, cred, p);
	if (ioflag & (IO_APPEND | IO_SYNC)) {
		if (np->n_flag & NMODIFIED) {
			np->n_attrstamp = 0;
			error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
			if (error)
				return (error);
		}
		if (ioflag & IO_APPEND) {
			np->n_attrstamp = 0;
			error = VOP_GETATTR(vp, &vattr, cred, p);
			if (error)
				return (error);
			uio->uio_offset = np->n_size;
		}
	}
	if (uio->uio_offset < 0)
		return (EINVAL);
	if (uio->uio_resid == 0)
		return (0);
	/*
	 * Maybe this should be above the vnode op call, but so long as
	 * file servers have no limits, i don't think it matters
	 */
	if (p && uio->uio_offset + uio->uio_resid >
	    p->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
		psignal(p, SIGXFSZ);
		return (EFBIG);
	}
	/*
	 * I use nm_rsize, not nm_wsize so that all buffer cache blocks
	 * will be the same size within a filesystem. nfs_writerpc will
	 * still use nm_wsize when sizing the rpc's.
	 */
	/* due to getblk/vm interactions, use vm page size or less values */
	biosize = min(vp->v_mount->mnt_stat.f_iosize, PAGE_SIZE);
	do {
		/*
		 * Check for a valid write lease.
		 */
		if ((nmp->nm_flag & NFSMNT_NQNFS) &&
		    NQNFS_CKINVALID(vp, np, ND_WRITE)) {
			do {
				error = nqnfs_getlease(vp, ND_WRITE, cred, p);
			} while (error == NQNFS_EXPIRED);
			if (error)
				return (error);
			if (np->n_lrev != np->n_brev ||
			    (np->n_flag & NQNFSNONCACHE)) {
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
				np->n_brev = np->n_lrev;
			}
		}
		if ((np->n_flag & NQNFSNONCACHE) && uio->uio_iovcnt == 1) {
			iomode = NFSV3WRITE_FILESYNC;
			error = nfs_writerpc(vp, uio, cred, &iomode, &must_commit);
			if (must_commit)
				nfs_clearcommit(vp->v_mount);
			return (error);
		}
		nfsstats.biocache_writes++;
		lbn = uio->uio_offset / biosize;
		on = uio->uio_offset & (biosize-1);
		n = min((unsigned)(biosize - on), uio->uio_resid);
again:
		bufsize = biosize;
		/* (removed for UBC) */
		if ((lbn + 1) * biosize > np->n_size) {
			bufsize = np->n_size - lbn * biosize;
			bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
		}
		/*
		 * Get a cache block for writing. The range to be written is
		 * (off..off+len) within the block. We ensure that the block
		 * either has no dirty region or that the given range is
		 * contiguous with the existing dirty region.
		 */
		bp = nfs_getcacheblk(vp, lbn, bufsize, p, BLK_WRITE);
		if (!bp)
			return (EINTR);
		/*
		 * Resize nfsnode *after* we busy the buffer to prevent
		 * readers from reading garbage.
		 * If there was a partial buf at the old eof, validate
		 * and zero the new bytes.
		 */
		if (uio->uio_offset + n > np->n_size) {
			struct buf *bp0 = NULL;
			daddr_t bn = np->n_size / biosize;
			int off = np->n_size & (biosize - 1);

			if (off && bn < lbn && incore(vp, bn))
				bp0 = nfs_getcacheblk(vp, bn, biosize, p,
						      BLK_WRITE);
			np->n_flag |= NMODIFIED;
			np->n_size = uio->uio_offset + n;
			ubc_setsize(vp, (off_t)np->n_size); /* XXX errors */
			if (bp0) {
				bzero((char *)bp0->b_data + off, biosize - off);
				bp0->b_validend = biosize;
				brelse(bp0);
			}
		}
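		/*
		 * Illustrative numbers (assuming biosize == 4096): extending
		 * a 10000-byte file puts the old EOF in block bn == 2 at
		 * offset off == 1808 within that block.  If that partial
		 * block is incore and precedes the block being written
		 * (bn < lbn), bytes 1808..4095 are zeroed and the whole block
		 * is marked valid, so a later reader of the gap sees zeroes
		 * rather than stale garbage.
		 */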
		/*
		 * NFS has embedded ucred so crhold() risks zone corruption
		 */
		if (bp->b_wcred == NOCRED)
			bp->b_wcred = crdup(cred);
		/*
		 * If dirtyend exceeds file size, chop it down. This should
		 * not occur unless there is a race.
		 */
		if ((off_t)bp->b_blkno * DEV_BSIZE + bp->b_dirtyend >
		    np->n_size)
			bp->b_dirtyend = np->n_size - (off_t)bp->b_blkno *
				DEV_BSIZE;
		/*
		 * UBC doesn't (yet) handle partial pages so nfs_biowrite was
		 * hacked to never bdwrite, to start every little write right
		 * away. Running IE Avie noticed the performance problem, thus
		 * this code, which permits those delayed writes by ensuring an
		 * initial read of the entire page. The read may hit eof
		 * ("short read") but that we will handle.
		 *
		 * We are quite dependent on the correctness of B_CACHE so check
		 * that first in case of problems.
		 */
		if (!ISSET(bp->b_flags, B_CACHE) && n < PAGE_SIZE) {
			boff = (off_t)bp->b_blkno * DEV_BSIZE;
			auio.uio_iov = &iov;
			auio.uio_iovcnt = 1;
			auio.uio_offset = boff;
			auio.uio_resid = PAGE_SIZE;
			auio.uio_segflg = UIO_SYSSPACE;
			auio.uio_rw = UIO_READ;
			auio.uio_procp = p;
			iov.iov_base = bp->b_data;
			iov.iov_len = PAGE_SIZE;
			error = nfs_readrpc(vp, &auio, cred);
			if (error) {
				bp->b_error = error;
				SET(bp->b_flags, B_ERROR);
				printf("nfs_write: readrpc %d", error);
			}
			if (auio.uio_resid > 0)
				bzero(iov.iov_base, auio.uio_resid);
			bp->b_validoff = 0;
			bp->b_validend = PAGE_SIZE - auio.uio_resid;
			if (np->n_size > boff + bp->b_validend)
				bp->b_validend = min(np->n_size - boff,
						     PAGE_SIZE);
			bp->b_dirtyoff = 0;
			bp->b_dirtyend = 0;
		}
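		/*
		 * Net effect (illustrative): a small write into a page that
		 * is not B_CACHE first reads the full page from the server; a
		 * short read (EOF inside the page) is zero-filled, and
		 * b_validend is trimmed to the file size within the page.
		 * This lets the write be delayed later instead of being
		 * pushed synchronously.
		 */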
		/*
		 * If the new write will leave a contiguous dirty
		 * area, just update the b_dirtyoff and b_dirtyend,
		 * otherwise try to extend the dirty region.
		 */
		if (bp->b_dirtyend > 0 &&
		    (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) {
			off_t start, end;

			boff = (off_t)bp->b_blkno * DEV_BSIZE;
			if (on > bp->b_dirtyend) {
				start = boff + bp->b_validend;
				end = boff + on;
			} else {
				start = boff + on + n;
				end = boff + bp->b_validoff;
			}

			/*
			 * It may be that the valid region in the buffer
			 * covers the region we want, in which case just
			 * extend the dirty region. Otherwise we try to
			 * extend the valid region.
			 */
			if (start < end) {
				auio.uio_iov = &iov;
				auio.uio_iovcnt = 1;
				auio.uio_offset = start;
				auio.uio_resid = end - start;
				auio.uio_segflg = UIO_SYSSPACE;
				auio.uio_rw = UIO_READ;
				auio.uio_procp = p;
				iov.iov_base = bp->b_data + (start - boff);
				iov.iov_len = end - start;
				error = nfs_readrpc(vp, &auio, cred);
				/*
				 * If we couldn't read, do not do a VOP_BWRITE
				 * as originally coded. That could also error
				 * and looping back to "again" as it was doing
				 * could have us stuck trying to write same buf
				 * again. nfs_write will get the entire region
				 * if nfs_readrpc succeeded. If unsuccessful
				 * we should just error out. Errors like ESTALE
				 * would keep us looping rather than transient
				 * errors justifying a retry. We can return here
				 * instead of altering dirty region later. We
				 * did not write old dirty region at this point.
				 */
				if (error) {
					bp->b_error = error;
					SET(bp->b_flags, B_ERROR);
					printf("nfs_write: readrpc2 %d", error);
					brelse(bp);
					return (error);
				}
				/*
				 * If there was a short read, just zero fill.
				 */
				if (auio.uio_resid > 0)
					bzero(iov.iov_base, auio.uio_resid);
				if (on > bp->b_dirtyend)
					bp->b_validend = on;
				else
					bp->b_validoff = on + n;
			}
			/*
			 * We now have a valid region which extends up to the
			 * dirty region which we want.
			 */
			if (on > bp->b_dirtyend)
				bp->b_dirtyend = on;
			else
				bp->b_dirtyoff = on + n;
		}
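		/*
		 * Example (illustrative): suppose bytes [0..512) of the block
		 * are already dirty and the new write covers [1024..1536).
		 * The gap [512..1024) is neither dirty nor guaranteed valid,
		 * so the code above reads it from the server (zero-filling
		 * any short read), after which dirtyoff/dirtyend can later be
		 * merged into one contiguous range without losing data in the
		 * gap.
		 */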
		if (ISSET(bp->b_flags, B_ERROR)) {
			error = bp->b_error;
			brelse(bp);
			return (error);
		}
		/*
		 * NFS has embedded ucred so crhold() risks zone corruption
		 */
		if (bp->b_wcred == NOCRED)
			bp->b_wcred = crdup(cred);
		np->n_flag |= NMODIFIED;

		/*
		 * Check for valid write lease and get one as required.
		 * In case getblk() and/or bwrite() delayed us.
		 */
		if ((nmp->nm_flag & NFSMNT_NQNFS) &&
		    NQNFS_CKINVALID(vp, np, ND_WRITE)) {
			do {
				error = nqnfs_getlease(vp, ND_WRITE, cred, p);
			} while (error == NQNFS_EXPIRED);
			if (error) {
				brelse(bp);
				return (error);
			}
			if (np->n_lrev != np->n_brev ||
			    (np->n_flag & NQNFSNONCACHE)) {
				brelse(bp);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
				np->n_brev = np->n_lrev;
				goto again;
			}
		}
		error = uiomove((char *)bp->b_data + on, n, uio);
		if (error) {
			SET(bp->b_flags, B_ERROR);
			brelse(bp);
			return (error);
		}
		if (bp->b_dirtyend > 0) {
			bp->b_dirtyoff = min(on, bp->b_dirtyoff);
			bp->b_dirtyend = max((on + n), bp->b_dirtyend);
		} else {
			bp->b_dirtyoff = on;
			bp->b_dirtyend = on + n;
		}
		if (bp->b_validend == 0 || bp->b_validend < bp->b_dirtyoff ||
		    bp->b_validoff > bp->b_dirtyend) {
			bp->b_validoff = bp->b_dirtyoff;
			bp->b_validend = bp->b_dirtyend;
		} else {
			bp->b_validoff = min(bp->b_validoff, bp->b_dirtyoff);
			bp->b_validend = max(bp->b_validend, bp->b_dirtyend);
		}
		/*
		 * Since this block is being modified, it must be written
		 * again and not just committed.
		 */
		CLR(bp->b_flags, B_NEEDCOMMIT);

		/*
		 * If the lease is non-cachable or IO_SYNC do bwrite().
		 */
		if ((np->n_flag & NQNFSNONCACHE) || (ioflag & IO_SYNC)) {
			bp->b_proc = p;
			error = VOP_BWRITE(bp);
			if (error)
				return (error);
			if (np->n_flag & NQNFSNONCACHE) {
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
			}
		} else if ((n + on) == biosize &&
			   (nmp->nm_flag & NFSMNT_NQNFS) == 0) {
			bp->b_proc = (struct proc *)0;
			SET(bp->b_flags, B_ASYNC);
			(void)nfs_writebp(bp, 0);
		} else
			bdwrite(bp);
	} while (uio->uio_resid > 0 && n > 0);
	return (0);
}
/*
 * Get an nfs cache block.
 * Allocate a new one if the block isn't currently in the cache
 * and return the block marked busy. If the calling process is
 * interrupted by a signal for an interruptible mount point, return
 * NULL.
 */
static struct buf *
nfs_getcacheblk(vp, bn, size, p, operation)
	struct vnode *vp;
	daddr_t bn;
	int size;
	struct proc *p;
	int operation;	/* defined in sys/buf.h */
{
	register struct buf *bp;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	/* due to getblk/vm interactions, use vm page size or less values */
	int biosize = min(vp->v_mount->mnt_stat.f_iosize, PAGE_SIZE);

	if (nbdwrite > ((nbuf/4)*3) && operation == BLK_WRITE) {
#define __BUFFERS_RECLAIMED 2
		struct buf *tbp[__BUFFERS_RECLAIMED];
		int i;

		/* too many delayed writes, try to free up some buffers */
		for (i = 0; i < __BUFFERS_RECLAIMED; i++)
			tbp[i] = geteblk(512);

		/* Yield to IO thread */
		(void)tsleep((caddr_t)&nbdwrite, PCATCH, "nbdwrite", 1);

		for (i = (__BUFFERS_RECLAIMED - 1); i >= 0; i--)
			brelse(tbp[i]);
	}

	if (nmp->nm_flag & NFSMNT_INT) {
		bp = getblk(vp, bn, size, PCATCH, 0, operation);
		while (bp == (struct buf *)0) {
			if (nfs_sigintr(nmp, (struct nfsreq *)0, p))
				return ((struct buf *)0);
			bp = getblk(vp, bn, size, 0, 2 * hz, operation);
		}
	} else
		bp = getblk(vp, bn, size, 0, 0, operation);
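
	/*
	 * The mapping below converts the filesystem-relative logical block
	 * number into DEV_BSIZE (512-byte) units, which is what the rest of
	 * the buffer code expects in b_blkno.  Illustrative: with biosize ==
	 * 4096, logical block bn == 2 maps to b_blkno == (2 * 4096) / 512 ==
	 * 16.
	 */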
	if (vp->v_type == VREG)
		bp->b_blkno = (bn * biosize) / DEV_BSIZE;

	return (bp);
}
/*
 * Flush and invalidate all dirty buffers. If another process is already
 * doing the flush, just wait for completion.
 */
int
nfs_vinvalbuf(vp, flags, cred, p, intrflg)
	struct vnode *vp;
	int flags;
	struct ucred *cred;
	struct proc *p;
	int intrflg;
{
	register struct nfsnode *np = VTONFS(vp);
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	int error = 0, slpflag, slptimeo;
	int didhold = 0;

	if ((nmp->nm_flag & NFSMNT_INT) == 0)
		intrflg = 0;
	if (intrflg) {
		slpflag = PCATCH;
		slptimeo = 2 * hz;
	} else {
		slpflag = 0;
		slptimeo = 0;
	}
	/*
	 * First wait for any other process doing a flush to complete.
	 */
	while (np->n_flag & NFLUSHINPROG) {
		np->n_flag |= NFLUSHWANT;
		error = tsleep((caddr_t)&np->n_flag, PRIBIO + 2, "nfsvinval",
			       slptimeo);
		if (error && intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p))
			return (EINTR);
	}

	/*
	 * Now, flush as required.
	 */
	np->n_flag |= NFLUSHINPROG;
	error = vinvalbuf(vp, flags, cred, p, slpflag, 0);
	while (error) {
		/*
		 * We seem to be stuck in a loop here if the thread got
		 * aborted; nfs_flush will return EINTR. Not sure if that
		 * will cause other consequences due to EINTR having other
		 * meanings in NFS. For the no-dirty-pages case it seems safe
		 * to just return from here. But if we did have dirty pages,
		 * how would we get them written out if the thread was
		 * aborted? Some other strategy is necessary.
		 */
		if ((intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p)) ||
		    (error == EINTR && current_thread_aborted())) {
			np->n_flag &= ~NFLUSHINPROG;
			if (np->n_flag & NFLUSHWANT) {
				np->n_flag &= ~NFLUSHWANT;
				wakeup((caddr_t)&np->n_flag);
			}
			return (EINTR);
		}
		error = vinvalbuf(vp, flags, cred, p, 0, slptimeo);
	}
	np->n_flag &= ~(NMODIFIED | NFLUSHINPROG);
	if (np->n_flag & NFLUSHWANT) {
		np->n_flag &= ~NFLUSHWANT;
		wakeup((caddr_t)&np->n_flag);
	}
	didhold = ubc_hold(vp);
	if (didhold) {
		(void) ubc_clean(vp, 1); /* get the pages out of vm also */
		ubc_rele(vp);
	}
	return (0);
}
/*
 * Initiate asynchronous I/O. Return an error if no nfsiods are available.
 * This is mainly to avoid queueing async I/O requests when the nfsiods
 * are all hung on a dead server.
 */
int
nfs_asyncio(bp, cred)
	register struct buf *bp;
	struct ucred *cred;
{
	struct nfsmount *nmp;
	int i;
	int gotiod;
	int slpflag = 0;
	int slptimeo = 0;
	int error;

	if (nfs_numasync == 0)
		return (EIO);

	nmp = VFSTONFS(bp->b_vp->v_mount);
again:
	if (nmp->nm_flag & NFSMNT_INT)
		slpflag = PCATCH;
	gotiod = FALSE;

	/*
	 * Find a free iod to process this request.
	 */
	for (i = 0; i < NFS_MAXASYNCDAEMON; i++)
		if (nfs_iodwant[i]) {
			/*
			 * Found one, so wake it up and tell it which
			 * mount to process.
			 */
			NFS_DPF(ASYNCIO,
				("nfs_asyncio: waking iod %d for mount %p\n",
				 i, nmp));
			nfs_iodwant[i] = (struct proc *)0;
			nfs_iodmount[i] = nmp;
			nmp->nm_bufqiods++;
			wakeup((caddr_t)&nfs_iodwant[i]);
			gotiod = TRUE;
			break;
		}

	/*
	 * If none are free, we may already have an iod working on this mount
	 * point. If so, it will process our request.
	 */
	if (!gotiod) {
		if (nmp->nm_bufqiods > 0) {
			NFS_DPF(ASYNCIO,
				("nfs_asyncio: %d iods are already processing mount %p\n",
				 nmp->nm_bufqiods, nmp));
			gotiod = TRUE;
		}
	}

	/*
	 * If we have an iod which can process the request, then queue
	 * the buffer.
	 */
	if (gotiod) {
		/*
		 * Ensure that the queue never grows too large.
		 */
		while (nmp->nm_bufqlen >= 2*nfs_numasync) {
			NFS_DPF(ASYNCIO,
				("nfs_asyncio: waiting for mount %p queue to drain\n", nmp));
			nmp->nm_bufqwant = TRUE;
			error = tsleep(&nmp->nm_bufq, slpflag | PRIBIO,
				       "nfsaio", slptimeo);
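			/*
			 * Sizing note (illustrative): the per-mount queue is
			 * bounded at twice the number of configured nfsiods,
			 * so with nfs_numasync == 4 at most 8 buffers may be
			 * queued for a mount before producers sleep here
			 * waiting for the iods to drain the queue.
			 */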
			if (error) {
				if (nfs_sigintr(nmp, NULL, bp->b_proc))
					return (EINTR);
				if (slpflag == PCATCH) {
					slpflag = 0;
					slptimeo = 2 * hz;
				}
			}
			/*
			 * We might have lost our iod while sleeping,
			 * so check and loop if necessary.
			 */
			if (nmp->nm_bufqiods == 0) {
				NFS_DPF(ASYNCIO,
					("nfs_asyncio: no iods after mount %p queue was drained, looping\n", nmp));
				goto again;
			}
		}

		if (ISSET(bp->b_flags, B_READ)) {
			if (bp->b_rcred == NOCRED && cred != NOCRED) {
				/*
				 * NFS has embedded ucred.
				 * Can not crhold() here as that causes zone corruption
				 */
				bp->b_rcred = crdup(cred);
			}
		} else {
			SET(bp->b_flags, B_WRITEINPROG);
			if (bp->b_wcred == NOCRED && cred != NOCRED) {
				/*
				 * NFS has embedded ucred.
				 * Can not crhold() here as that causes zone corruption
				 */
				bp->b_wcred = crdup(cred);
			}
		}

		TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, b_freelist);
		nmp->nm_bufqlen++;
		return (0);
	}

	/*
	 * All the iods are busy on other mounts, so return EIO to
	 * force the caller to process the i/o synchronously.
	 */
	NFS_DPF(ASYNCIO, ("nfs_asyncio: no iods available, i/o is synchronous\n"));
	return (EIO);
}
/*
 * Do an I/O operation to/from a cache block. This may be called
 * synchronously or from an nfsiod.
 */
int
nfs_doio(bp, cr, p)
	register struct buf *bp;
	struct ucred *cr;
	struct proc *p;
{
	register struct uio *uiop;
	register struct vnode *vp;
	struct nfsnode *np;
	struct nfsmount *nmp;
	int error = 0, diff, len, iomode, must_commit = 0;
	struct uio uio;
	struct iovec io;

	vp = bp->b_vp;
	np = VTONFS(vp);
	nmp = VFSTONFS(vp->v_mount);
	uiop = &uio;
	uiop->uio_iov = &io;
	uiop->uio_iovcnt = 1;
	uiop->uio_segflg = UIO_SYSSPACE;
	uiop->uio_procp = p;

	/*
	 * With UBC, getblk() can return a buf with B_DONE set.
	 * This indicates that the VM has valid data for that page.
	 * NFS being stateless, this case poses a problem.
	 * By definition, the NFS server should always be consulted
	 * for the data in that page.
	 * So we choose to clear the B_DONE and to do the IO.
	 *
	 * XXX revisit this if there is a performance issue.
	 * XXX In that case, we could play the attribute cache games ...
	 */
	if (ISSET(bp->b_flags, B_DONE)) {
		if (!ISSET(bp->b_flags, B_ASYNC))
			panic("nfs_doio: done and not async");
		CLR(bp->b_flags, B_DONE);
	}
	FSDBG_TOP(256, np->n_size, bp->b_blkno * DEV_BSIZE, bp->b_bcount,
		  bp->b_flags);
	FSDBG(257, bp->b_validoff, bp->b_validend, bp->b_dirtyoff,
	      bp->b_dirtyend);
	/*
	 * Historically, paging was done with physio, but no more.
	 */
	if (ISSET(bp->b_flags, B_PHYS)) {
		/*
		 * ...though reading /dev/drum still gets us here.
		 */
		io.iov_len = uiop->uio_resid = bp->b_bcount;
		/* mapping was done by vmapbuf() */
		io.iov_base = bp->b_data;
		uiop->uio_offset = (off_t)bp->b_blkno * DEV_BSIZE;
		if (ISSET(bp->b_flags, B_READ)) {
			uiop->uio_rw = UIO_READ;
			nfsstats.read_physios++;
			error = nfs_readrpc(vp, uiop, cr);
		} else {
			int com;

			iomode = NFSV3WRITE_DATASYNC;
			uiop->uio_rw = UIO_WRITE;
			nfsstats.write_physios++;
			error = nfs_writerpc(vp, uiop, cr, &iomode, &com);
		}
		if (error) {
			SET(bp->b_flags, B_ERROR);
			bp->b_error = error;
		}
	} else if (ISSET(bp->b_flags, B_READ)) {
		io.iov_len = uiop->uio_resid = bp->b_bcount;
		io.iov_base = bp->b_data;
		uiop->uio_rw = UIO_READ;
		switch (vp->v_type) {
		case VREG:
			uiop->uio_offset = (off_t)bp->b_blkno * DEV_BSIZE;
			nfsstats.read_bios++;
			error = nfs_readrpc(vp, uiop, cr);
			FSDBG(262, np->n_size, bp->b_blkno * DEV_BSIZE,
			      uiop->uio_resid, error);
			if (!error) {
				if (uiop->uio_resid) {
					/*
					 * If len > 0, there is a hole in the file and
					 * no writes after the hole have been pushed to
					 * the server yet.
					 * Just zero fill the rest of the valid area.
					 */
					diff = bp->b_bcount - uiop->uio_resid;
					len = np->n_size - ((u_quad_t)bp->b_blkno * DEV_BSIZE +
							    diff);
					if (len > 0) {
						len = min(len, uiop->uio_resid);
						bzero((char *)bp->b_data + diff, len);
						bp->b_validend = diff + len;
						FSDBG(258, diff, len, 0, 1);
					} else
						bp->b_validend = diff;
				} else
					bp->b_validend = bp->b_bcount;
				if (bp->b_validend < bp->b_bufsize) {
					/*
					 * we're about to release a partial buffer after a
					 * read... the only way we should get here is if
					 * this buffer contains the EOF; before releasing
					 * it, we'll zero out to the end of the buffer so
					 * that if a mmap of this page occurs, we'll see
					 * zero's even if a ftruncate extends the file in
					 * the meantime
					 */
					bzero((caddr_t)(bp->b_data + bp->b_validend),
					      bp->b_bufsize - bp->b_validend);
					FSDBG(258, bp->b_validend,
					      bp->b_bufsize - bp->b_validend, 0, 2);
				}
			}
			if (p && (vp->v_flag & VTEXT) &&
			    (((nmp->nm_flag & NFSMNT_NQNFS) &&
			      NQNFS_CKINVALID(vp, np, ND_READ) &&
			      np->n_lrev != np->n_brev) ||
			     (!(nmp->nm_flag & NFSMNT_NQNFS) &&
			      np->n_mtime != np->n_vattr.va_mtime.tv_sec))) {
				uprintf("Process killed due to text file modification\n");
				psignal(p, SIGKILL);
				p->p_flag |= P_NOSWAP;
			}
			break;
		case VLNK:
			uiop->uio_offset = (off_t)0;
			nfsstats.readlink_bios++;
			error = nfs_readlinkrpc(vp, uiop, cr);
			break;
		case VDIR:
			nfsstats.readdir_bios++;
			uiop->uio_offset = ((u_quad_t)bp->b_lblkno) * NFS_DIRBLKSIZ;
			if (!(nmp->nm_flag & NFSMNT_NFSV3))
				nmp->nm_flag &= ~NFSMNT_RDIRPLUS;	/* dk@farm.org */
			if (nmp->nm_flag & NFSMNT_RDIRPLUS) {
				error = nfs_readdirplusrpc(vp, uiop, cr);
				if (error == NFSERR_NOTSUPP)
					nmp->nm_flag &= ~NFSMNT_RDIRPLUS;
			}
			if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0)
				error = nfs_readdirrpc(vp, uiop, cr);
			break;
		default:
			printf("nfs_doio: type %x unexpected\n", vp->v_type);
			break;
		};
		if (error) {
			SET(bp->b_flags, B_ERROR);
			bp->b_error = error;
		}
	} else {
		/*
		 * mapped I/O may have altered any bytes, so we extend
		 * the dirty zone to the valid zone. For best performance
		 * a better solution would be to save & restore page dirty bits
		 * around the uiomove which brings write-data into the buffer.
		 * Then here we'd check if the page is dirty rather than WASMAPPED
		 * Also vnode_pager would change - if a page is clean it might
		 * still need to be written due to DELWRI.
		 */
		if (UBCINFOEXISTS(vp) && ubc_issetflags(vp, UI_WASMAPPED)) {
			bp->b_dirtyoff = min(bp->b_dirtyoff, bp->b_validoff);
			bp->b_dirtyend = max(bp->b_dirtyend, bp->b_validend);
		}
		if ((off_t)bp->b_blkno * DEV_BSIZE + bp->b_dirtyend > np->n_size)
			bp->b_dirtyend = np->n_size - (off_t)bp->b_blkno * DEV_BSIZE;

		if (bp->b_dirtyend > bp->b_dirtyoff) {
			io.iov_len = uiop->uio_resid = bp->b_dirtyend - bp->b_dirtyoff;
			uiop->uio_offset = (off_t)bp->b_blkno * DEV_BSIZE +
					   bp->b_dirtyoff;
			io.iov_base = (char *)bp->b_data + bp->b_dirtyoff;
			uiop->uio_rw = UIO_WRITE;

			nfsstats.write_bios++;
			if ((bp->b_flags & (B_ASYNC | B_NEEDCOMMIT | B_NOCACHE)) ==
			    B_ASYNC)
				iomode = NFSV3WRITE_UNSTABLE;
			else
				iomode = NFSV3WRITE_FILESYNC;
			SET(bp->b_flags, B_WRITEINPROG);
			error = nfs_writerpc(vp, uiop, cr, &iomode, &must_commit);
			if (!error && iomode == NFSV3WRITE_UNSTABLE)
				SET(bp->b_flags, B_NEEDCOMMIT);
			else
				CLR(bp->b_flags, B_NEEDCOMMIT);
			CLR(bp->b_flags, B_WRITEINPROG);
			/*
			 * For an interrupted write, the buffer is still valid
			 * and the write hasn't been pushed to the server yet,
			 * so we can't set B_ERROR and report the interruption
			 * by setting B_EINTR. For the B_ASYNC case, B_EINTR
			 * is not relevant, so the rpc attempt is essentially
			 * a noop. For the case of a V3 write rpc not being
			 * committed to stable storage, the block is still
			 * dirty and requires either a commit rpc or another
			 * write rpc with iomode == NFSV3WRITE_FILESYNC before
			 * the block is reused. This is indicated by setting
			 * the B_DELWRI and B_NEEDCOMMIT flags.
			 */
			if (error == EINTR || (!error && bp->b_flags & B_NEEDCOMMIT)) {
				int s;

				CLR(bp->b_flags, B_INVAL | B_NOCACHE);
				if (!ISSET(bp->b_flags, B_DELWRI)) {
					SET(bp->b_flags, B_DELWRI);
					nbdwrite++;
				}
				FSDBG(261, bp->b_validoff, bp->b_validend,
				      bp->b_bufsize, bp->b_bcount);
				/*
				 * Since for the B_ASYNC case, nfs_bwrite() has
				 * reassigned the buffer to the clean list, we have to
				 * reassign it back to the dirty one. Ugh.
				 */
				if (ISSET(bp->b_flags, B_ASYNC)) {
					s = splbio();
					reassignbuf(bp, vp);
					splx(s);
				} else
					SET(bp->b_flags, B_EINTR);
			} else {
				if (error) {
					SET(bp->b_flags, B_ERROR);
					bp->b_error = np->n_error = error;
					np->n_flag |= NWRITEERR;
				}
				bp->b_dirtyoff = bp->b_dirtyend = 0;
				/*
				 * validoff and validend represent the real data present
				 * in this buffer if validoff is non-zero, than we have
				 * to invalidate the buffer and kill the page when
				 * biodone is called... the same is also true when
				 * validend doesn't extend all the way to the end of the
				 * buffer and validend doesn't equate to the current
				 * EOF... eventually we need to deal with this in a more
				 * humane way (like keeping the partial buffer without
				 * making it immediately available to the VM page cache)
				 */
				if (bp->b_validoff)
					SET(bp->b_flags, B_INVAL);
				else if (bp->b_validend < bp->b_bufsize) {
					if ((off_t)bp->b_blkno * DEV_BSIZE +
					    bp->b_validend == np->n_size) {
						bzero((caddr_t)(bp->b_data +
								bp->b_validend),
						      bp->b_bufsize - bp->b_validend);
						FSDBG(259, bp->b_validend,
						      bp->b_bufsize - bp->b_validend, 0,
						      3);
					} else
						SET(bp->b_flags, B_INVAL);
				}
			}
		} else {
			if (bp->b_validoff ||
			    (bp->b_validend < bp->b_bufsize &&
			     (off_t)bp->b_blkno * DEV_BSIZE + bp->b_validend !=
			     np->n_size))
				SET(bp->b_flags, B_INVAL);
			if (bp->b_flags & B_INVAL) {
				FSDBG(260, bp->b_validoff, bp->b_validend,
				      bp->b_bufsize, bp->b_bcount);
			}
			bp->b_resid = 0;
			biodone(bp);
			FSDBG_BOT(256, bp->b_validoff, bp->b_validend, bp->b_bufsize,
				  np->n_size);
			return (0);
		}
	}
	bp->b_resid = uiop->uio_resid;
	if (must_commit)
		nfs_clearcommit(vp->v_mount);
	if (bp->b_flags & B_INVAL) {
		FSDBG(260, bp->b_validoff, bp->b_validend, bp->b_bufsize,
		      bp->b_bcount);
	}
	FSDBG_BOT(256, bp->b_validoff, bp->b_validend, bp->b_bcount, error);
	biodone(bp);
	return (error);
}