/*
 * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License").  You may not use this file except in compliance with the
 * License.  Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)nfs_bio.c	8.9 (Berkeley) 3/30/95
 * FreeBSD-Id: nfs_bio.c,v 1.44 1997/09/10 19:52:25 phk Exp $
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/malloc.h>
#include <sys/vnode.h>
#include <sys/dirent.h>
#include <sys/mount.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/vmparam.h>

#include <kern/clock.h>

#include <nfs/rpcv2.h>
#include <nfs/nfsproto.h>
#include <nfs/nfsmount.h>
#include <nfs/nqnfs.h>
#include <nfs/nfsnode.h>

#include <sys/kdebug.h>
#define FSDBG(A, B, C, D, E) \
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_NONE, \
		(int)(B), (int)(C), (int)(D), (int)(E), 0)
#define FSDBG_TOP(A, B, C, D, E) \
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_START, \
		(int)(B), (int)(C), (int)(D), (int)(E), 0)
#define FSDBG_BOT(A, B, C, D, E) \
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_END, \
		(int)(B), (int)(C), (int)(D), (int)(E), 0)
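/*
 * Note: these macros wrap the kdebug filesystem tracepoints used throughout
 * this file; for example, nfs_buf_get() below brackets its work with
 * FSDBG_TOP(541, vp, blkno, size, operation) and a matching FSDBG_BOT(541, ...).
 */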
extern int nfs_numasync;
extern int nfs_ioddelwri;
extern struct nfsstats nfsstats;

#define NFSBUFHASH(dvp, lbn) \
	(&nfsbufhashtbl[((long)(dvp) / sizeof(*(dvp)) + (int)(lbn)) & nfsbufhash])
LIST_HEAD(nfsbufhashhead, nfsbuf) *nfsbufhashtbl;
struct nfsbuffreehead nfsbuffree, nfsbuffreemeta, nfsbufdelwri;
u_long nfsbufhash;
int nfs_nbdwrite;
int nfsbufhashlock, nfsbufcnt, nfsbufmin, nfsbufmax;
int nfsbuffreecnt, nfsbuffreemetacnt, nfsbufdelwricnt, nfsneedbuffer;
time_t nfsbuffreeuptimestamp;
#define NFSBUFWRITE_THROTTLE	9
#define NFSBUF_LRU_STALE	120
#define NFSBUF_META_STALE	240

/* number of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffree list */
#define LRU_TO_FREEUP	6
/* number of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffreemeta list */
#define META_TO_FREEUP	3
/* total number of nfsbufs nfs_buf_freeup() should attempt to free */
#define TOTAL_TO_FREEUP	(LRU_TO_FREEUP+META_TO_FREEUP)
/* fraction of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffree list when called from nfs_timer() */
#define LRU_FREEUP_FRAC_ON_TIMER	8
/* fraction of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffreemeta list when called from nfs_timer() */
#define META_FREEUP_FRAC_ON_TIMER	16
/* fraction of total nfsbufs that nfsbuffreecnt should exceed before bothering to call nfs_buf_freeup() */
#define LRU_FREEUP_MIN_FRAC		4
/* fraction of total nfsbufs that nfsbuffreemetacnt should exceed before bothering to call nfs_buf_freeup() */
#define META_FREEUP_MIN_FRAC		2
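/*
 * Rough illustration of how the fractions above combine (example numbers
 * only, not from the original source): with nfsbufcnt at 1024, the
 * NFS_BUF_FREEUP() macro below only bothers calling nfs_buf_freeup() once
 * more than 1024/LRU_FREEUP_MIN_FRAC = 256 buffers sit on the LRU free list,
 * or more than 1024/META_FREEUP_MIN_FRAC = 512 sit on the meta free list,
 * and only while freeing TOTAL_TO_FREEUP buffers would still leave more than
 * nfsbufmin allocated.
 */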
#define NFS_BUF_FREEUP() \
	do { \
		/* only call nfs_buf_freeup() if it has work to do: */ \
		if (((nfsbuffreecnt > nfsbufcnt/LRU_FREEUP_MIN_FRAC) || \
		     (nfsbuffreemetacnt > nfsbufcnt/META_FREEUP_MIN_FRAC)) && \
		    ((nfsbufcnt - TOTAL_TO_FREEUP) > nfsbufmin)) \
			nfs_buf_freeup(0); \
	} while (0)
/*
 * Initialize nfsbuf lists
 */
void
nfs_nbinit(void)
{
	nfsbufhashtbl = hashinit(nbuf, M_TEMP, &nfsbufhash);
	TAILQ_INIT(&nfsbuffree);
	TAILQ_INIT(&nfsbuffreemeta);
	TAILQ_INIT(&nfsbufdelwri);
	nfsbufcnt = nfsbuffreecnt = nfsbuffreemetacnt = nfsbufdelwricnt = 0;
	nfsbufmin = 128;	// XXX tune me!
	nfsbufmax = 8192;	// XXX tune me!
	nfsbuffreeuptimestamp = 0;
}
/*
 * try to free up some excess, unused nfsbufs
 */
void
nfs_buf_freeup(int timer)
{
	struct nfsbuf *fbp;
	struct timeval now;
	int count;

	microuptime(&now);
	nfsbuffreeuptimestamp = now.tv_sec;

	FSDBG(320, nfsbufcnt, nfsbuffreecnt, nfsbuffreemetacnt, count);

	count = timer ? nfsbuffreecnt/LRU_FREEUP_FRAC_ON_TIMER : LRU_TO_FREEUP;
	while ((nfsbufcnt > nfsbufmin) && (count-- > 0)) {
		fbp = TAILQ_FIRST(&nfsbuffree);
		if (!fbp)
			break;
		/* if the head of the LRU list isn't stale yet, stop scanning */
		if ((fbp->nb_timestamp + (2*NFSBUF_LRU_STALE)) > now.tv_sec)
			break;
		nfs_buf_remfree(fbp);
		/* disassociate buffer from any vnode */
		if (fbp->nb_vnbufs.le_next != NFSNOLIST) {
			LIST_REMOVE(fbp, nb_vnbufs);
			fbp->nb_vnbufs.le_next = NFSNOLIST;
		}
		LIST_REMOVE(fbp, nb_hash);
		if (fbp->nb_rcred != NOCRED)
			crfree(fbp->nb_rcred);
		if (fbp->nb_wcred != NOCRED)
			crfree(fbp->nb_wcred);
		/* if buf was NB_META, dump buffer */
		if (ISSET(fbp->nb_flags, NB_META) && fbp->nb_data) {
			FREE(fbp->nb_data, M_TEMP);
		}
		FREE(fbp, M_TEMP);
		nfsbufcnt--;
	}

	count = timer ? nfsbuffreemetacnt/META_FREEUP_FRAC_ON_TIMER : META_TO_FREEUP;
	while ((nfsbufcnt > nfsbufmin) && (count-- > 0)) {
		fbp = TAILQ_FIRST(&nfsbuffreemeta);
		if (!fbp)
			break;
		/* if the head of the meta list isn't stale yet, stop scanning */
		if ((fbp->nb_timestamp + (2*NFSBUF_META_STALE)) > now.tv_sec)
			break;
		nfs_buf_remfree(fbp);
		/* disassociate buffer from any vnode */
		if (fbp->nb_vnbufs.le_next != NFSNOLIST) {
			LIST_REMOVE(fbp, nb_vnbufs);
			fbp->nb_vnbufs.le_next = NFSNOLIST;
		}
		LIST_REMOVE(fbp, nb_hash);
		if (fbp->nb_rcred != NOCRED)
			crfree(fbp->nb_rcred);
		if (fbp->nb_wcred != NOCRED)
			crfree(fbp->nb_wcred);
		/* if buf was NB_META, dump buffer */
		if (ISSET(fbp->nb_flags, NB_META) && fbp->nb_data) {
			FREE(fbp->nb_data, M_TEMP);
		}
		FREE(fbp, M_TEMP);
		nfsbufcnt--;
	}

	FSDBG(320, nfsbufcnt, nfsbuffreecnt, nfsbuffreemetacnt, count);
}
void
nfs_buf_remfree(struct nfsbuf *bp)
{
	if (bp->nb_free.tqe_next == NFSNOLIST)
		panic("nfsbuf not on free list");
	if (ISSET(bp->nb_flags, NB_DELWRI)) {
		nfsbufdelwricnt--;
		TAILQ_REMOVE(&nfsbufdelwri, bp, nb_free);
	} else if (ISSET(bp->nb_flags, NB_META) && !ISSET(bp->nb_flags, NB_INVAL)) {
		nfsbuffreemetacnt--;
		TAILQ_REMOVE(&nfsbuffreemeta, bp, nb_free);
	} else {
		nfsbuffreecnt--;
		TAILQ_REMOVE(&nfsbuffree, bp, nb_free);
	}
	bp->nb_free.tqe_next = NFSNOLIST;
}
/*
 * check for existence of nfsbuf in cache
 */
struct nfsbuf *
nfs_buf_incore(struct vnode *vp, daddr_t blkno)
{
	/* Search hash chain */
	struct nfsbuf * bp = NFSBUFHASH(vp, blkno)->lh_first;
	for (; bp != NULL; bp = bp->nb_hash.le_next)
		if (bp->nb_lblkno == blkno && bp->nb_vp == vp) {
			if (!ISSET(bp->nb_flags, NB_INVAL)) {
				FSDBG(547, bp, blkno, bp->nb_flags, bp->nb_vp);
				return (bp);
			}
		}
	return (NULL);
}
/*
 * Check if it's OK to drop a page.
 *
 * Called by vnode_pager() on pageout request of non-dirty page.
 * We need to make sure that it's not part of a delayed write.
 * If it is, we can't let the VM drop it because we may need it
 * later when/if we need to write the data (again).
 */
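/*
 * Illustrative note (not from the original comments): the check below treats
 * the page as the byte range [offset, offset + PAGE_SIZE) within the buffer
 * and compares it against the buffer's dirty byte range
 * [nb_dirtyoff, nb_dirtyend); any overlap means the page must be kept.
 */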
int
nfs_buf_page_inval(struct vnode *vp, off_t offset)
{
	struct nfsbuf *bp;

	bp = nfs_buf_incore(vp, ubc_offtoblk(vp, offset));
	if (!bp)
		return (0);
	FSDBG(325, bp, bp->nb_flags, bp->nb_dirtyoff, bp->nb_dirtyend);
	if (ISSET(bp->nb_flags, NB_BUSY))
		return (EBUSY);
	/*
	 * If there's a dirty range in the buffer, check to
	 * see if this page intersects with the dirty range.
	 * If it does, we can't let the pager drop the page.
	 */
	if (bp->nb_dirtyend > 0) {
		int start = offset - NBOFF(bp);
		if (bp->nb_dirtyend <= start ||
		    bp->nb_dirtyoff >= (start + PAGE_SIZE))
			return (0);
		return (EBUSY);
	}
	return (0);
}
int
nfs_buf_upl_setup(struct nfsbuf *bp)
{
	kern_return_t kret;
	upl_t upl;

	if (ISSET(bp->nb_flags, NB_PAGELIST))
		return (0);

	kret = ubc_create_upl(bp->nb_vp, NBOFF(bp), bp->nb_bufsize,
		&upl, NULL, UPL_PRECIOUS);
	if (kret == KERN_INVALID_ARGUMENT) {
		/* vm object probably doesn't exist any more */
		bp->nb_pagelist = NULL;
		return (EINVAL);
	}
	if (kret != KERN_SUCCESS) {
		printf("nfs_buf_upl_setup(): failed to get pagelist %d\n", kret);
		bp->nb_pagelist = NULL;
		return (EIO);
	}

	FSDBG(538, bp, NBOFF(bp), bp->nb_bufsize, bp->nb_vp);

	bp->nb_pagelist = upl;
	SET(bp->nb_flags, NB_PAGELIST);
	return (0);
}
void
nfs_buf_upl_check(struct nfsbuf *bp)
{
	upl_page_info_t *pl;
	off_t filesize, fileoffset;
	int i, npages;

	if (!ISSET(bp->nb_flags, NB_PAGELIST))
		return;

	npages = round_page_32(bp->nb_bufsize) / PAGE_SIZE;
	filesize = ubc_getsize(bp->nb_vp);
	fileoffset = NBOFF(bp);
	if (fileoffset < filesize)
		SET(bp->nb_flags, NB_CACHE);
	else
		CLR(bp->nb_flags, NB_CACHE);

	pl = ubc_upl_pageinfo(bp->nb_pagelist);
	bp->nb_valid = bp->nb_dirty = 0;

	for (i=0; i < npages; i++, fileoffset += PAGE_SIZE_64) {
		/* anything beyond the end of the file is not valid or dirty */
		if (fileoffset >= filesize)
			break;
		if (!upl_valid_page(pl, i)) {
			CLR(bp->nb_flags, NB_CACHE);
			continue;
		}
		NBPGVALID_SET(bp, i);
		if (upl_dirty_page(pl, i)) {
			NBPGDIRTY_SET(bp, i);
			if (!ISSET(bp->nb_flags, NB_WASDIRTY))
				SET(bp->nb_flags, NB_WASDIRTY);
		}
	}
	fileoffset = NBOFF(bp);
	if (ISSET(bp->nb_flags, NB_CACHE)) {
		bp->nb_validoff = 0;
		bp->nb_validend = bp->nb_bufsize;
		if (fileoffset + bp->nb_validend > filesize)
			bp->nb_validend = filesize - fileoffset;
	} else {
		bp->nb_validoff = bp->nb_validend = -1;
	}
	FSDBG(539, bp, fileoffset, bp->nb_valid, bp->nb_dirty);
	FSDBG(539, bp->nb_validoff, bp->nb_validend, bp->nb_dirtyoff, bp->nb_dirtyend);
}
void
nfs_buf_map(struct nfsbuf *bp)
{
	kern_return_t kret;

	if (!ISSET(bp->nb_flags, NB_PAGELIST))
		return;

	kret = ubc_upl_map(bp->nb_pagelist, (vm_address_t *)&(bp->nb_data));
	if (kret != KERN_SUCCESS)
		panic("nfs_buf_map: ubc_upl_map() failed with (%d)", kret);
	if (bp->nb_data == 0)
		panic("ubc_upl_map mapped 0");
	FSDBG(540, bp, bp->nb_flags, NBOFF(bp), bp->nb_data);
}
/*
 * check range of pages in nfsbuf's UPL for validity
 */
int
nfs_buf_upl_valid_range(struct nfsbuf *bp, int off, int size)
{
	off_t fileoffset, filesize;
	int pg, lastpg;
	upl_page_info_t *pl;

	if (!ISSET(bp->nb_flags, NB_PAGELIST))
		return (0);
	pl = ubc_upl_pageinfo(bp->nb_pagelist);

	size += off & PAGE_MASK;
	off &= ~PAGE_MASK;
	fileoffset = NBOFF(bp);
	filesize = VTONFS(bp->nb_vp)->n_size;
	if ((fileoffset + off + size) > filesize)
		size = filesize - (fileoffset + off);

	pg = off/PAGE_SIZE;
	lastpg = (off + size - 1)/PAGE_SIZE;
	while (pg <= lastpg) {
		if (!upl_valid_page(pl, pg))
			return (0);
		pg++;
	}
	return (1);
}
/*
 * normalize an nfsbuf's valid range
 *
 * the read/write code guarantees that we'll always have a valid
 * region that is an integral number of pages.  If either end
 * of the valid range isn't page-aligned, it gets corrected
 * here as we extend the valid range through all of the
 * contiguous valid pages.
 */
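/*
 * Worked example (illustrative only, assuming 4K pages): if nb_validoff is
 * 5000 but pages 0 and 1 of the buffer are marked valid, the first loop below
 * pulls nb_validoff back to 0; likewise an nb_validend of 5000 is pushed
 * forward to 8192 as long as page 1 (and any following pages) remain valid.
 */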
void
nfs_buf_normalize_valid_range(struct nfsnode *np, struct nfsbuf *bp)
{
	int pg, npg;

	/* pull validoff back to start of contiguous valid page range */
	pg = bp->nb_validoff/PAGE_SIZE;
	while (pg >= 0 && NBPGVALID(bp,pg))
		pg--;
	bp->nb_validoff = (pg+1) * PAGE_SIZE;
	/* push validend forward to end of contiguous valid page range */
	npg = bp->nb_bufsize/PAGE_SIZE;
	pg = bp->nb_validend/PAGE_SIZE;
	while (pg < npg && NBPGVALID(bp,pg))
		pg++;
	bp->nb_validend = pg * PAGE_SIZE;
	/* clip valid range to EOF */
	if (NBOFF(bp) + bp->nb_validend > np->n_size)
		bp->nb_validend = np->n_size % bp->nb_bufsize;
}
/*
 * try to push out some delayed/uncommitted writes
 */
void
nfs_buf_delwri_push(void)
{
	struct nfsbuf *bp;
	int i = 0;

	if (TAILQ_EMPTY(&nfsbufdelwri))
		return;

	/* first try to tell the nfsiods to do it */
	if (nfs_asyncio(NULL, NULL) == 0)
		return;

	/* otherwise, try to do some of the work ourselves */
	while (i < 8 && (bp = TAILQ_FIRST(&nfsbufdelwri)) != NULL) {
		struct nfsnode *np = VTONFS(bp->nb_vp);
		nfs_buf_remfree(bp);
		if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
			/* put buffer at end of delwri list */
			nfsbufdelwricnt++;
			TAILQ_INSERT_TAIL(&nfsbufdelwri, bp, nb_free);
			nfs_flushcommits(np->n_vnode, (struct proc *)0);
		} else {
			SET(bp->nb_flags, (NB_BUSY | NB_ASYNC));
			nfs_buf_write(bp);
		}
		i++;
	}
}
/*
 * Get an nfs cache block.
 * Allocate a new one if the block isn't currently in the cache
 * and return the block marked busy. If the calling process is
 * interrupted by a signal for an interruptible mount point, return
 * NULL.
 */
struct nfsbuf *
nfs_buf_get(
	struct vnode *vp,
	daddr_t blkno,
	int size,
	struct proc *p,
	int operation)
{
	struct nfsnode *np = VTONFS(vp);
	struct nfsbuf *bp;
	struct ucred *cred;
	struct timeval now;
	int i, biosize, bufsize, rv;
	int slpflag = PCATCH;

	FSDBG_TOP(541, vp, blkno, size, operation);

	bufsize = size;
	if (bufsize > MAXBSIZE)
		panic("nfs_buf_get: buffer larger than MAXBSIZE requested");

	biosize = vp->v_mount->mnt_stat.f_iosize;

	if (UBCINVALID(vp) || !UBCINFOEXISTS(vp))
		operation = BLK_META;
	else if (bufsize < biosize)
		/* reg files should always have biosize blocks */
		bufsize = biosize;

	/* if BLK_WRITE, check for too many delayed/uncommitted writes */
	if ((operation == BLK_WRITE) && (nfs_nbdwrite > ((nfsbufcnt*3)/4))) {
		FSDBG_TOP(542, vp, blkno, nfs_nbdwrite, ((nfsbufcnt*3)/4));

		/* poke the delwri list */
		nfs_buf_delwri_push();

		/* sleep to let other threads run... */
		tsleep(&nfs_nbdwrite, PCATCH, "nfs_nbdwrite", 1);
		FSDBG_BOT(542, vp, blkno, nfs_nbdwrite, ((nfsbufcnt*3)/4));
	}

loop:
	/*
	 * Obtain a lock to prevent a race condition if the
	 * MALLOC() below happens to block.
	 */
	if (nfsbufhashlock) {
		while (nfsbufhashlock) {
			/* ... note that we're waiting on the lock ... */
			tsleep(&nfsbufhashlock, PCATCH, "nfsbufget", 0);
			if (nfs_sigintr(VFSTONFS(vp->v_mount), NULL, p))
				return (NULL);
		}
		goto loop;
	}
	/* ... take the hash lock ... */

	/* check for existence of nfsbuf in cache */
	if (bp = nfs_buf_incore(vp, blkno)) {
		/* if busy, set wanted and wait */
		if (ISSET(bp->nb_flags, NB_BUSY)) {
			FSDBG_TOP(543, vp, blkno, bp, bp->nb_flags);
			SET(bp->nb_flags, NB_WANTED);
			/* release the hash lock before sleeping */
			if (nfsbufhashlock < 0) {
				nfsbufhashlock = 0;
				wakeup(&nfsbufhashlock);
			} else
				nfsbufhashlock = 0;
			tsleep(bp, slpflag|(PRIBIO+1), "nfsbufget",
				(slpflag == PCATCH) ? 0 : 2*hz);
			FSDBG_BOT(543, vp, blkno, bp, bp->nb_flags);
			if (nfs_sigintr(VFSTONFS(vp->v_mount), NULL, p)) {
				FSDBG_BOT(541, vp, blkno, 0, EINTR);
				return (NULL);
			}
			goto loop;
		}
		if (bp->nb_bufsize != bufsize)
			panic("nfsbuf size mismatch");
		SET(bp->nb_flags, (NB_BUSY | NB_CACHE));
		nfs_buf_remfree(bp);
		/* additional paranoia: */
		if (ISSET(bp->nb_flags, NB_PAGELIST))
			panic("pagelist buffer was not busy");
		goto buffer_setup;
	}

	/*
	 * where to get a free buffer:
	 * - alloc new if we haven't reached min bufs
	 * - if free lists are NOT empty
	 *   - if free list is stale, use it
	 *   - else if freemeta list is stale, use it
	 *   - else if max bufs allocated, use least-time-to-stale
	 * - alloc new if we haven't reached max allowed
	 * - start clearing out delwri list and try again
	 */
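	/*
	 * (Illustrative note, not from the original comments: data buffers go
	 * stale after NFSBUF_LRU_STALE seconds while meta buffers are kept for
	 * NFSBUF_META_STALE seconds, so the LRU free list is consulted first
	 * below and the least-time-to-stale comparison naturally favors
	 * reclaiming plain data buffers before metadata buffers.)
	 */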
	if ((nfsbufcnt > nfsbufmin) &&
	    (!TAILQ_EMPTY(&nfsbuffree) || !TAILQ_EMPTY(&nfsbuffreemeta))) {
		/* try to pull an nfsbuf off a free list */
		struct nfsbuf *lrubp, *metabp;

		microuptime(&now);

		/* if the next LRU or META buffer is stale, use it */
		lrubp = TAILQ_FIRST(&nfsbuffree);
		if (lrubp && ((lrubp->nb_timestamp + NFSBUF_LRU_STALE) < now.tv_sec))
			bp = lrubp;
		metabp = TAILQ_FIRST(&nfsbuffreemeta);
		if (!bp && metabp && ((metabp->nb_timestamp + NFSBUF_META_STALE) < now.tv_sec))
			bp = metabp;

		if (!bp && (nfsbufcnt >= nfsbufmax)) {
			/* we've already allocated all bufs, so */
			/* choose the buffer that'll go stale first */
			if (!lrubp)
				bp = metabp;
			else if (!metabp)
				bp = lrubp;
			else {
				int32_t lru_stale_time, meta_stale_time;
				lru_stale_time = lrubp->nb_timestamp + NFSBUF_LRU_STALE;
				meta_stale_time = metabp->nb_timestamp + NFSBUF_META_STALE;
				if (lru_stale_time <= meta_stale_time)
					bp = lrubp;
				else
					bp = metabp;
			}
		}
	}

	if (bp) {
		/* we have a buffer to reuse */
		FSDBG(544, vp, blkno, bp, bp->nb_flags);
		nfs_buf_remfree(bp);
		if (ISSET(bp->nb_flags, NB_DELWRI))
			panic("nfs_buf_get: delwri");
		SET(bp->nb_flags, NB_BUSY);
		/* disassociate buffer from previous vnode */
		if (bp->nb_vnbufs.le_next != NFSNOLIST) {
			LIST_REMOVE(bp, nb_vnbufs);
			bp->nb_vnbufs.le_next = NFSNOLIST;
		}
		bp->nb_vp = NULL;
		LIST_REMOVE(bp, nb_hash);
		/* nuke any creds we're holding */
		cred = bp->nb_rcred;
		if (cred != NOCRED) {
			bp->nb_rcred = NOCRED;
			crfree(cred);
		}
		cred = bp->nb_wcred;
		if (cred != NOCRED) {
			bp->nb_wcred = NOCRED;
			crfree(cred);
		}
		/* if buf will no longer be NB_META, dump old buffer */
		if ((operation != BLK_META) &&
		    ISSET(bp->nb_flags, NB_META) && bp->nb_data) {
			FREE(bp->nb_data, M_TEMP);
			bp->nb_data = NULL;
		}
		/* re-init buf fields */
		bp->nb_error = 0;
		bp->nb_validoff = bp->nb_validend = -1;
		bp->nb_dirtyoff = bp->nb_dirtyend = 0;
		bp->nb_valid = 0;
		bp->nb_dirty = 0;
	} else if (nfsbufcnt < nfsbufmax) {
		/* just alloc a new one */
		MALLOC(bp, struct nfsbuf *, sizeof(struct nfsbuf), M_TEMP, M_WAITOK);
		nfsbufcnt++;
		bzero(bp, sizeof(*bp));
		bp->nb_free.tqe_next = NFSNOLIST;
		bp->nb_validoff = bp->nb_validend = -1;
		FSDBG(545, vp, blkno, bp, 0);
	} else {
		/* too many bufs... wait for buffers to free up */
		FSDBG_TOP(546, vp, blkno, nfsbufcnt, nfsbufmax);
		/* release the hash lock */
		if (nfsbufhashlock < 0) {
			nfsbufhashlock = 0;
			wakeup(&nfsbufhashlock);
		} else
			nfsbufhashlock = 0;

		/* poke the delwri list */
		nfs_buf_delwri_push();

		nfsneedbuffer = 1;
		tsleep(&nfsneedbuffer, PCATCH, "nfsbufget", 0);
		FSDBG_BOT(546, vp, blkno, nfsbufcnt, nfsbufmax);
		if (nfs_sigintr(VFSTONFS(vp->v_mount), NULL, p)) {
			FSDBG_BOT(541, vp, blkno, 0, EINTR);
			return (NULL);
		}
		goto loop;
	}

	/* setup nfsbuf */
	bp->nb_flags = NB_BUSY;
	bp->nb_lblkno = blkno;
	/* insert buf in hash */
	LIST_INSERT_HEAD(NFSBUFHASH(vp, blkno), bp, nb_hash);
	/* associate buffer with new vnode */
	bp->nb_vp = vp;
	LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);

buffer_setup:

	switch (operation) {
	case BLK_META:
		SET(bp->nb_flags, NB_META);
		if ((bp->nb_bufsize != bufsize) && bp->nb_data) {
			FREE(bp->nb_data, M_TEMP);
			bp->nb_data = NULL;
			bp->nb_validoff = bp->nb_validend = -1;
			bp->nb_dirtyoff = bp->nb_dirtyend = 0;
			bp->nb_valid = 0;
			bp->nb_dirty = 0;
			CLR(bp->nb_flags, NB_CACHE);
		}
		if (!bp->nb_data)
			MALLOC(bp->nb_data, caddr_t, bufsize, M_TEMP, M_WAITOK);
		if (!bp->nb_data)
			panic("nfs_buf_get: null nb_data");
		bp->nb_bufsize = bufsize;
		break;

	case BLK_READ:
	case BLK_WRITE:
	case BLK_PAGEIN:
		if (bufsize < PAGE_SIZE)
			bufsize = PAGE_SIZE;
		bp->nb_bufsize = bufsize;
		bp->nb_validoff = bp->nb_validend = -1;

		if (UBCISVALID(vp)) {
			/* setup upl */
			if (nfs_buf_upl_setup(bp)) {
				/* unable to create upl */
				/* vm object must no longer exist */
				/* cleanup buffer and return NULL */
				LIST_REMOVE(bp, nb_vnbufs);
				bp->nb_vnbufs.le_next = NFSNOLIST;
				bp->nb_vp = NULL;
				/* clear usage timestamp to allow immediate freeing */
				bp->nb_timestamp = 0;
				if (bp->nb_free.tqe_next != NFSNOLIST)
					panic("nfsbuf on freelist");
				TAILQ_INSERT_HEAD(&nfsbuffree, bp, nb_free);
				nfsbuffreecnt++;
				FSDBG_BOT(541, vp, blkno, 0x2bc, EIO);
				return (NULL);
			}
			nfs_buf_upl_check(bp);
		}
		break;

	default:
		panic("nfs_buf_get: %d unknown operation", operation);
	}

	/* release the hash lock */
	if (nfsbufhashlock < 0) {
		nfsbufhashlock = 0;
		wakeup(&nfsbufhashlock);
	} else
		nfsbufhashlock = 0;

	FSDBG_BOT(541, vp, blkno, bp, bp->nb_flags);

	return (bp);
}
void
nfs_buf_release(struct nfsbuf *bp, int freeup)
{
	struct vnode *vp = bp->nb_vp;
	struct timeval now;
	upl_t upl;
	int upl_flags;
	int i, rv;

	FSDBG_TOP(548, bp, NBOFF(bp), bp->nb_flags, bp->nb_data);
	FSDBG(548, bp->nb_validoff, bp->nb_validend, bp->nb_dirtyoff, bp->nb_dirtyend);
	FSDBG(548, bp->nb_valid, 0, bp->nb_dirty, 0);

	if (UBCINFOEXISTS(vp) && bp->nb_bufsize) {
		if (!ISSET(bp->nb_flags, NB_PAGELIST) && !ISSET(bp->nb_flags, NB_INVAL)) {
			rv = nfs_buf_upl_setup(bp);
			if (rv)
				printf("nfs_buf_release: upl create failed %d\n", rv);
			else
				nfs_buf_upl_check(bp);
		}
		upl = bp->nb_pagelist;
		if (!upl)
			goto pagelist_cleanup_done;
		if (bp->nb_data) {
			if (ubc_upl_unmap(upl) != KERN_SUCCESS)
				panic("ubc_upl_unmap failed");
			bp->nb_data = NULL;
		}
		if (bp->nb_flags & (NB_ERROR | NB_INVAL)) {
			if (bp->nb_flags & (NB_READ | NB_INVAL))
				upl_flags = UPL_ABORT_DUMP_PAGES;
			else
				upl_flags = 0;
			ubc_upl_abort(upl, upl_flags);
			goto pagelist_cleanup_done;
		}
		for (i=0; i <= (bp->nb_bufsize - 1)/PAGE_SIZE; i++) {
			if (!NBPGVALID(bp,i))
				ubc_upl_abort_range(upl,
					i*PAGE_SIZE, PAGE_SIZE,
					UPL_ABORT_DUMP_PAGES |
					UPL_ABORT_FREE_ON_EMPTY);
			else {
				if (NBPGDIRTY(bp,i))
					upl_flags = UPL_COMMIT_SET_DIRTY;
				else
					upl_flags = UPL_COMMIT_CLEAR_DIRTY;
				ubc_upl_commit_range(upl,
					i*PAGE_SIZE, PAGE_SIZE,
					upl_flags |
					UPL_COMMIT_INACTIVATE |
					UPL_COMMIT_FREE_ON_EMPTY);
			}
		}
pagelist_cleanup_done:
		/* was this the last buffer in the file? */
		if (NBOFF(bp) + bp->nb_bufsize > VTONFS(vp)->n_size) {
			/* if so, invalidate all pages of last buffer past EOF */
			int biosize = vp->v_mount->mnt_stat.f_iosize;
			off_t off, size;
			off = trunc_page_64(VTONFS(vp)->n_size) + PAGE_SIZE_64;
			size = trunc_page_64(NBOFF(bp) + biosize) - off;
			if (size)
				ubc_invalidate(vp, off, size);
		}
		CLR(bp->nb_flags, NB_PAGELIST);
		bp->nb_pagelist = NULL;
	}

	/* Wake up any processes waiting for any buffer to become free. */
	if (nfsneedbuffer) {
		nfsneedbuffer = 0;
		wakeup(&nfsneedbuffer);
	}
	/* Wake up any processes waiting for _this_ buffer to become free. */
	if (ISSET(bp->nb_flags, NB_WANTED)) {
		CLR(bp->nb_flags, NB_WANTED);
		wakeup(bp);
	}

	/* If it's not cacheable, or an error, mark it invalid. */
	if (ISSET(bp->nb_flags, (NB_NOCACHE|NB_ERROR)))
		SET(bp->nb_flags, NB_INVAL);

	if ((bp->nb_bufsize <= 0) || ISSET(bp->nb_flags, NB_INVAL)) {
		/* If it's invalid or empty, dissociate it from its vnode */
		if (bp->nb_vnbufs.le_next != NFSNOLIST) {
			LIST_REMOVE(bp, nb_vnbufs);
			bp->nb_vnbufs.le_next = NFSNOLIST;
		}
		bp->nb_vp = NULL;
		/* if this was a delayed write, wakeup anyone */
		/* waiting for delayed writes to complete */
		if (ISSET(bp->nb_flags, NB_DELWRI)) {
			CLR(bp->nb_flags, NB_DELWRI);
			nfs_nbdwrite--;
			wakeup((caddr_t)&nfs_nbdwrite);
		}
		/* clear usage timestamp to allow immediate freeing */
		bp->nb_timestamp = 0;
		/* put buffer at head of free list */
		if (bp->nb_free.tqe_next != NFSNOLIST)
			panic("nfsbuf on freelist");
		SET(bp->nb_flags, NB_INVAL);
		TAILQ_INSERT_HEAD(&nfsbuffree, bp, nb_free);
		nfsbuffreecnt++;
		if (freeup)
			NFS_BUF_FREEUP();
	} else if (ISSET(bp->nb_flags, NB_DELWRI)) {
		/* put buffer at end of delwri list */
		if (bp->nb_free.tqe_next != NFSNOLIST)
			panic("nfsbuf on freelist");
		TAILQ_INSERT_TAIL(&nfsbufdelwri, bp, nb_free);
		nfsbufdelwricnt++;
	} else {
		/* update usage timestamp */
		microuptime(&now);
		bp->nb_timestamp = now.tv_sec;
		/* put buffer at end of free list */
		if (bp->nb_free.tqe_next != NFSNOLIST)
			panic("nfsbuf on freelist");
		if (ISSET(bp->nb_flags, NB_META)) {
			TAILQ_INSERT_TAIL(&nfsbuffreemeta, bp, nb_free);
			nfsbuffreemetacnt++;
		} else {
			TAILQ_INSERT_TAIL(&nfsbuffree, bp, nb_free);
			nfsbuffreecnt++;
		}
		if (freeup)
			NFS_BUF_FREEUP();
	}

	/* Unlock the buffer. */
	CLR(bp->nb_flags, (NB_ASYNC | NB_BUSY | NB_NOCACHE | NB_STABLE | NB_IOD));

	FSDBG_BOT(548, bp, NBOFF(bp), bp->nb_flags, bp->nb_data);
}
/*
 * Wait for operations on the buffer to complete.
 * When they do, extract and return the I/O's error value.
 */
int
nfs_buf_iowait(struct nfsbuf *bp)
{
	FSDBG_TOP(549, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);

	while (!ISSET(bp->nb_flags, NB_DONE))
		tsleep(bp, PRIBIO + 1, "nfs_buf_iowait", 0);

	FSDBG_BOT(549, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);

	/* check for interruption of I/O, then errors. */
	if (ISSET(bp->nb_flags, NB_EINTR)) {
		CLR(bp->nb_flags, NB_EINTR);
		return (EINTR);
	} else if (ISSET(bp->nb_flags, NB_ERROR))
		return (bp->nb_error ? bp->nb_error : EIO);
	return (0);
}
/*
 * Mark I/O complete on a buffer.
 */
void
nfs_buf_iodone(struct nfsbuf *bp)
{
	struct vnode *vp;

	FSDBG_TOP(550, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);

	if (ISSET(bp->nb_flags, NB_DONE))
		panic("nfs_buf_iodone already");
	SET(bp->nb_flags, NB_DONE);		/* note that it's done */
	/*
	 * I/O was done, so don't believe
	 * the DIRTY state from VM anymore
	 */
	CLR(bp->nb_flags, NB_WASDIRTY);

	if (!ISSET(bp->nb_flags, NB_READ)) {
		CLR(bp->nb_flags, NB_WRITEINPROG);
		vp = bp->nb_vp;
		/* Wakeup the throttled write operations as needed */
		if (vp && (vp->v_flag & VTHROTTLED)
			&& (vp->v_numoutput <= (NFSBUFWRITE_THROTTLE / 3))) {
			vp->v_flag &= ~VTHROTTLED;
			wakeup((caddr_t)&vp->v_numoutput);
		}
	}

	if (ISSET(bp->nb_flags, NB_ASYNC))	/* if async, release it */
		nfs_buf_release(bp, 1);
	else {					/* or just wakeup the buffer */
		CLR(bp->nb_flags, NB_WANTED);
		wakeup(bp);
	}

	FSDBG_BOT(550, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);
}
void
nfs_buf_write_delayed(struct nfsbuf *bp)
{
	struct proc *p = current_proc();
	struct vnode *vp = bp->nb_vp;

	FSDBG_TOP(551, bp, NBOFF(bp), bp->nb_flags, 0);
	FSDBG(551, bp, bp->nb_dirtyoff, bp->nb_dirtyend, bp->nb_dirty);

	/*
	 * If the block hasn't been seen before:
	 *	(1) Mark it as having been seen,
	 *	(2) Charge for the write.
	 *	(3) Make sure it's on its vnode's correct block list,
	 */
	if (!ISSET(bp->nb_flags, NB_DELWRI)) {
		SET(bp->nb_flags, NB_DELWRI);
		if (p && p->p_stats)
			p->p_stats->p_ru.ru_oublock++;	/* XXX */
		nfs_nbdwrite++;
		/* move to dirty list */
		if (bp->nb_vnbufs.le_next != NFSNOLIST)
			LIST_REMOVE(bp, nb_vnbufs);
		LIST_INSERT_HEAD(&VTONFS(vp)->n_dirtyblkhd, bp, nb_vnbufs);
	}

	/*
	 * If the vnode has "too many" write operations in progress
	 * wait for them to finish the IO
	 */
	while (vp->v_numoutput >= NFSBUFWRITE_THROTTLE) {
		vp->v_flag |= VTHROTTLED;
		tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "nfs_buf_write_delayed", 0);
	}

	/*
	 * If we have too many delayed write buffers,
	 * more than we can "safely" handle, just fall back to
	 * doing the async write
	 */
	if (nfs_nbdwrite < 0)
		panic("nfs_buf_write_delayed: Negative nfs_nbdwrite");

	if (nfs_nbdwrite > ((nfsbufcnt/4)*3)) {
		/* issue async write */
		SET(bp->nb_flags, NB_ASYNC);
		nfs_buf_write(bp);
		FSDBG_BOT(551, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);
		return;
	}

	/* Otherwise, the "write" is done, so mark and release the buffer. */
	SET(bp->nb_flags, NB_DONE);
	nfs_buf_release(bp, 1);
	FSDBG_BOT(551, bp, NBOFF(bp), bp->nb_flags, 0);
	return;
}
/*
 * Vnode op for read using bio
 * Any similarity to readip() is purely coincidental
 */
int
nfs_bioread(vp, uio, ioflag, cred, getpages)
	register struct vnode *vp;
	register struct uio *uio;
	int ioflag;
	struct ucred *cred;
	int getpages;	// XXX unused!
{
	struct nfsnode *np = VTONFS(vp);
	struct nfsbuf *bp = 0, *rabp;
	struct vattr vattr;
	struct proc *p;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	daddr_t lbn, rabn, lastrabn = -1;
	off_t diff;
	int i, biosize, bufsize;
	int nra, error = 0, n = 0, on = 0;
	int operation = (getpages? BLK_PAGEIN : BLK_READ);
	caddr_t dp;
	struct dirent *direntp;

	FSDBG_TOP(514, vp, uio->uio_offset, uio->uio_resid, ioflag);

	p = uio->uio_procp;
	if (uio->uio_rw != UIO_READ)
		panic("nfs_read mode");
	if (uio->uio_resid == 0) {
		FSDBG_BOT(514, vp, 0xd1e0001, 0, 0);
		return (0);
	}
	if (uio->uio_offset < 0) {
		FSDBG_BOT(514, vp, 0xd1e0002, 0, EINVAL);
		return (EINVAL);
	}

	if ((nmp->nm_flag & NFSMNT_NFSV3) &&
	    !(nmp->nm_state & NFSSTA_GOTFSINFO))
		(void)nfs_fsinfo(nmp, vp, cred, p);
	biosize = vp->v_mount->mnt_stat.f_iosize;
	/*
	 * For nfs, cache consistency can only be maintained approximately.
	 * Although RFC1094 does not specify the criteria, the following is
	 * believed to be compatible with the reference port.
	 * For nqnfs, full cache consistency is maintained within the loop.
	 * For nfs:
	 * If the file's modify time on the server has changed since the
	 * last read rpc or you have written to the file,
	 * you may have lost data cache consistency with the
	 * server, so flush all of the file's data out of the cache.
	 * Then force a getattr rpc to ensure that you have up to date
	 * attributes.
	 * NB: This implies that cache data can be read when up to
	 * NFS_MAXATTRTIMEO seconds out of date. If you find that you need
	 * current attributes this could be forced by setting n_xid to 0
	 * before the VOP_GETATTR() call.
	 */
	if ((nmp->nm_flag & NFSMNT_NQNFS) == 0) {
		if (np->n_flag & NMODIFIED) {
			if (vp->v_type != VREG) {
				if (vp->v_type != VDIR)
					panic("nfs: bioread, not dir");
				nfs_invaldir(vp);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error) {
					FSDBG_BOT(514, vp, 0xd1e0003, 0, error);
					return (error);
				}
			}
			np->n_xid = 0;
			error = VOP_GETATTR(vp, &vattr, cred, p);
			if (error) {
				FSDBG_BOT(514, vp, 0xd1e0004, 0, error);
				return (error);
			}
			if (vp->v_type == VDIR) {
				/* if directory changed, purge any name cache entries */
				if (np->n_ncmtime != vattr.va_mtime.tv_sec)
					cache_purge(vp);
				np->n_ncmtime = vattr.va_mtime.tv_sec;
			}
			np->n_mtime = vattr.va_mtime.tv_sec;
		} else {
			error = VOP_GETATTR(vp, &vattr, cred, p);
			if (error) {
				FSDBG_BOT(514, vp, 0xd1e0005, 0, error);
				return (error);
			}
			if (np->n_mtime != vattr.va_mtime.tv_sec) {
				if (vp->v_type == VDIR) {
					nfs_invaldir(vp);
					/* purge name cache entries */
					if (np->n_ncmtime != vattr.va_mtime.tv_sec)
						cache_purge(vp);
				}
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error) {
					FSDBG_BOT(514, vp, 0xd1e0006, 0, error);
					return (error);
				}
				if (vp->v_type == VDIR)
					np->n_ncmtime = vattr.va_mtime.tv_sec;
				np->n_mtime = vattr.va_mtime.tv_sec;
			}
		}
	}

	do {

		/*
		 * Get a valid lease. If cached data is stale, flush it.
		 */
		if (nmp->nm_flag & NFSMNT_NQNFS) {
			if (NQNFS_CKINVALID(vp, np, ND_READ)) {
				do {
					error = nqnfs_getlease(vp, ND_READ, cred, p);
				} while (error == NQNFS_EXPIRED);
				if (error) {
					FSDBG_BOT(514, vp, 0xd1e0007, 0, error);
					return (error);
				}
				if (np->n_lrev != np->n_brev ||
				    (np->n_flag & NQNFSNONCACHE) ||
				    ((np->n_flag & NMODIFIED) && vp->v_type == VDIR)) {
					if (vp->v_type == VDIR)
						nfs_invaldir(vp);
					error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
					if (error) {
						FSDBG_BOT(514, vp, 0xd1e0008, 0, error);
						return (error);
					}
					np->n_brev = np->n_lrev;
				}
			} else if (vp->v_type == VDIR && (np->n_flag & NMODIFIED)) {
				nfs_invaldir(vp);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error) {
					FSDBG_BOT(514, vp, 0xd1e0009, 0, error);
					return (error);
				}
			}
		}
		if ((np->n_flag & NQNFSNONCACHE) || (vp->v_flag & VNOCACHE_DATA)) {
			if ((vp->v_flag & VNOCACHE_DATA) &&
			    (np->n_dirtyblkhd.lh_first || np->n_cleanblkhd.lh_first)) {
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error) {
					FSDBG_BOT(514, vp, 0xd1e000a, 0, error);
					return (error);
				}
			}
			switch (vp->v_type) {
			case VREG:
				error = nfs_readrpc(vp, uio, cred);
				FSDBG_BOT(514, vp, uio->uio_offset, uio->uio_resid, error);
				return (error);
			case VLNK:
				error = nfs_readlinkrpc(vp, uio, cred);
				FSDBG_BOT(514, vp, uio->uio_offset, uio->uio_resid, error);
				return (error);
			case VDIR:
				break;
			default:
				printf(" NQNFSNONCACHE: type %x unexpected\n", vp->v_type);
			};
		}
		switch (vp->v_type) {
		case VREG:
			lbn = uio->uio_offset / biosize;

			/*
			 * Copy directly from any cached pages without grabbing the bufs.
			 */
			if (uio->uio_segflg == UIO_USERSPACE) {
				int io_resid = uio->uio_resid;
				diff = np->n_size - uio->uio_offset;
				if (diff < io_resid)
					io_resid = diff;
				if (io_resid > 0) {
					error = cluster_copy_ubc_data(vp, uio, &io_resid, 0);
					if (error) {
						FSDBG_BOT(514, vp, uio->uio_offset, 0xcacefeed, error);
						return (error);
					}
				}
				/* count any biocache reads that we just copied directly */
				if (lbn != uio->uio_offset / biosize) {
					nfsstats.biocache_reads += (uio->uio_offset / biosize) - lbn;
					FSDBG(514, vp, 0xcacefeed, uio->uio_offset, error);
				}
			}

			lbn = uio->uio_offset / biosize;
			on = uio->uio_offset % biosize;

			/*
			 * Start the read ahead(s), as required.
			 */
			if (nfs_numasync > 0 && nmp->nm_readahead > 0) {
				for (nra = 0; nra < nmp->nm_readahead; nra++) {
					rabn = lbn + 1 + nra;
					if (rabn <= lastrabn) {
						/* we've already (tried to) read this block */
						/* no need to try it again... */
						continue;
					}
					lastrabn = rabn;
					if ((off_t)rabn * biosize >= np->n_size)
						break;
					/* check if block exists and is valid. */
					rabp = nfs_buf_incore(vp, rabn);
					if (rabp && nfs_buf_upl_valid_range(rabp, 0, rabp->nb_bufsize))
						continue;
					rabp = nfs_buf_get(vp, rabn, biosize, p, operation);
					if (!rabp) {
						FSDBG_BOT(514, vp, 0xd1e000b, 0, EINTR);
						return (EINTR);
					}
					if (!ISSET(rabp->nb_flags, (NB_CACHE|NB_DELWRI))) {
						SET(rabp->nb_flags, (NB_READ|NB_ASYNC));
						if (nfs_asyncio(rabp, cred)) {
							SET(rabp->nb_flags, (NB_INVAL|NB_ERROR));
							rabp->nb_error = EIO;
							nfs_buf_release(rabp, 1);
						}
					} else
						nfs_buf_release(rabp, 1);
				}
			}

			if ((uio->uio_resid <= 0) || (uio->uio_offset >= np->n_size)) {
				FSDBG_BOT(514, vp, uio->uio_offset, uio->uio_resid, 0xaaaaaaaa);
				return (0);
			}

			nfsstats.biocache_reads++;

			/*
			 * If the block is in the cache and has the required data
			 * in a valid region, just copy it out.
			 * Otherwise, get the block and write back/read in,
			 * as required.
			 */
again:
			bufsize = biosize;
			n = min((unsigned)(bufsize - on), uio->uio_resid);
			diff = np->n_size - uio->uio_offset;
			if (diff < n)
				n = diff;

			bp = nfs_buf_get(vp, lbn, bufsize, p, operation);
			if (!bp) {
				FSDBG_BOT(514, vp, 0xd1e000c, 0, EINTR);
				return (EINTR);
			}

			/* if any pages are valid... */
			if (bp->nb_valid) {
				/* ...check for any invalid pages in the read range */
				int pg, firstpg, lastpg, dirtypg;
				dirtypg = firstpg = lastpg = -1;
				pg = on/PAGE_SIZE;
				while (pg <= (on + n - 1)/PAGE_SIZE) {
					if (!NBPGVALID(bp,pg)) {
						if (firstpg < 0)
							firstpg = pg;
						lastpg = pg;
					} else if (firstpg >= 0 && dirtypg < 0 && NBPGDIRTY(bp,pg))
						dirtypg = pg;
					pg++;
				}

				/* if there are no invalid pages, we're all set */
				if (firstpg < 0) {
					if (bp->nb_validoff < 0) {
						/* valid range isn't set up, so */
						/* set it to what we know is valid */
						bp->nb_validoff = trunc_page_32(on);
						bp->nb_validend = round_page_32(on+n);
						nfs_buf_normalize_valid_range(np, bp);
					}
					goto buffer_ready;
				}

				/* there are invalid pages in the read range */
				if ((dirtypg > firstpg) && (dirtypg < lastpg)) {
					/* there are also dirty page(s) in the range, */
					/* so write the buffer out and try again */
					CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL));
					SET(bp->nb_flags, NB_ASYNC);
					/*
					 * NFS has embedded ucred so crhold() risks zone corruption
					 */
					if (bp->nb_wcred == NOCRED)
						bp->nb_wcred = crdup(cred);
					error = nfs_buf_write(bp);
					if (error) {
						FSDBG_BOT(514, vp, 0xd1e000d, 0, error);
						return (error);
					}
					goto again;
				}
				if (!bp->nb_dirty && bp->nb_dirtyend <= 0 &&
				    (lastpg - firstpg + 1) > (bufsize/PAGE_SIZE)/2) {
					/* we need to read in more than half the buffer and the */
					/* buffer's not dirty, so just fetch the whole buffer */
					bp->nb_valid = 0;
				} else {
					/* read the page range in */
					struct iovec iov;
					struct uio auio;
					auio.uio_iov = &iov;
					auio.uio_iovcnt = 1;
					auio.uio_offset = NBOFF(bp) + firstpg * PAGE_SIZE_64;
					auio.uio_resid = (lastpg - firstpg + 1) * PAGE_SIZE;
					auio.uio_segflg = UIO_SYSSPACE;
					auio.uio_rw = UIO_READ;
					auio.uio_procp = p;
					iov.iov_base = bp->nb_data + firstpg * PAGE_SIZE;
					iov.iov_len = auio.uio_resid;
					error = nfs_readrpc(vp, &auio, cred);
					if (error) {
						nfs_buf_release(bp, 1);
						FSDBG_BOT(514, vp, 0xd1e000e, 0, error);
						return (error);
					}
					/* Make sure that the valid range is set to cover this read. */
					bp->nb_validoff = trunc_page_32(on);
					bp->nb_validend = round_page_32(on+n);
					nfs_buf_normalize_valid_range(np, bp);
					if (auio.uio_resid > 0) {
						/* if short read, must have hit EOF, */
						/* so zero the rest of the range */
						bzero(iov.iov_base, auio.uio_resid);
					}
					/* mark the pages (successfully read) as valid */
					for (pg=firstpg; pg <= lastpg; pg++)
						NBPGVALID_SET(bp,pg);
				}
			}
			/* if no pages are valid, read the whole block */
			if (!bp->nb_valid) {
				SET(bp->nb_flags, NB_READ);
				CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL));
				error = nfs_doio(bp, cred, p);
				if (error) {
					nfs_buf_release(bp, 1);
					FSDBG_BOT(514, vp, 0xd1e000f, 0, error);
					return (error);
				}
			}
buffer_ready:
			/* validate read range against valid range and clip */
			if (bp->nb_validend > 0) {
				diff = (on >= bp->nb_validend) ? 0 : (bp->nb_validend - on);
				if (diff < n)
					n = diff;
			}
			break;
		case VLNK:
			nfsstats.biocache_readlinks++;
			bp = nfs_buf_get(vp, (daddr_t)0, NFS_MAXPATHLEN, p, operation);
			if (!bp) {
				FSDBG_BOT(514, vp, 0xd1e0010, 0, EINTR);
				return (EINTR);
			}
			if (!ISSET(bp->nb_flags, NB_CACHE)) {
				SET(bp->nb_flags, NB_READ);
				error = nfs_doio(bp, cred, p);
				if (error) {
					SET(bp->nb_flags, NB_ERROR);
					nfs_buf_release(bp, 1);
					FSDBG_BOT(514, vp, 0xd1e0011, 0, error);
					return (error);
				}
			}
			n = min(uio->uio_resid, bp->nb_validend);
			break;
		case VDIR:
			nfsstats.biocache_readdirs++;
			if (np->n_direofoffset && uio->uio_offset >= np->n_direofoffset) {
				FSDBG_BOT(514, vp, 0xde0f0001, 0, 0);
				return (0);
			}
			lbn = uio->uio_offset / NFS_DIRBLKSIZ;
			on = uio->uio_offset & (NFS_DIRBLKSIZ - 1);
			bp = nfs_buf_get(vp, lbn, NFS_DIRBLKSIZ, p, operation);
			if (!bp) {
				FSDBG_BOT(514, vp, 0xd1e0012, 0, EINTR);
				return (EINTR);
			}
			if (!ISSET(bp->nb_flags, NB_CACHE)) {
				SET(bp->nb_flags, NB_READ);
				error = nfs_doio(bp, cred, p);
				if (error)
					nfs_buf_release(bp, 1);
				while (error == NFSERR_BAD_COOKIE) {
					nfs_invaldir(vp);
					error = nfs_vinvalbuf(vp, 0, cred, p, 1);
					/*
					 * Yuck! The directory has been modified on the
					 * server. The only way to get the block is by
					 * reading from the beginning to get all the
					 * offset cookies.
					 */
					for (i = 0; i <= lbn && !error; i++) {
						if (np->n_direofoffset
						    && (i * NFS_DIRBLKSIZ) >= np->n_direofoffset) {
							FSDBG_BOT(514, vp, 0xde0f0002, 0, 0);
							return (0);
						}
						bp = nfs_buf_get(vp, i, NFS_DIRBLKSIZ, p, operation);
						if (!bp) {
							FSDBG_BOT(514, vp, 0xd1e0013, 0, EINTR);
							return (EINTR);
						}
						if (!ISSET(bp->nb_flags, NB_CACHE)) {
							SET(bp->nb_flags, NB_READ);
							error = nfs_doio(bp, cred, p);
							/*
							 * no error + NB_INVAL == directory EOF,
							 * use the block.
							 */
							if (error == 0 && (bp->nb_flags & NB_INVAL))
								break;
						}
						/*
						 * An error will throw away the block and the
						 * for loop will break out.  If no error and this
						 * is not the block we want, we throw away the
						 * block and go for the next one via the for loop.
						 */
						if (error || i < lbn)
							nfs_buf_release(bp, 1);
					}
				}
				/*
				 * The above while is repeated if we hit another cookie
				 * error.  If we hit an error and it wasn't a cookie error,
				 * we give up.
				 */
				if (error) {
					FSDBG_BOT(514, vp, 0xd1e0014, 0, error);
					return (error);
				}
			}

			/*
			 * If not eof and read aheads are enabled, start one.
			 * (You need the current block first, so that you have the
			 *  directory offset cookie of the next block.)
			 */
			if (nfs_numasync > 0 && nmp->nm_readahead > 0 &&
			    (np->n_direofoffset == 0 ||
			    (lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) &&
			    !(np->n_flag & NQNFSNONCACHE) &&
			    !nfs_buf_incore(vp, lbn + 1)) {
				rabp = nfs_buf_get(vp, lbn + 1, NFS_DIRBLKSIZ, p,
						operation);
				if (rabp) {
					if (!ISSET(rabp->nb_flags, (NB_CACHE))) {
						SET(rabp->nb_flags, (NB_READ | NB_ASYNC));
						if (nfs_asyncio(rabp, cred)) {
							SET(rabp->nb_flags, (NB_INVAL|NB_ERROR));
							rabp->nb_error = EIO;
							nfs_buf_release(rabp, 1);
						}
					} else {
						nfs_buf_release(rabp, 1);
					}
				}
			}
			/*
			 * Make sure we use a signed variant of min() since
			 * the second term may be negative.
			 */
			n = lmin(uio->uio_resid, bp->nb_validend - on);
			/*
			 * We keep track of the directory eof in
			 * np->n_direofoffset and chop it off as an
			 * extra step right here.
			 */
			if (np->n_direofoffset &&
			    n > np->n_direofoffset - uio->uio_offset)
				n = np->n_direofoffset - uio->uio_offset;
			/*
			 * Make sure that we return an integral number of entries so
			 * that any subsequent calls will start copying from the start
			 * of the next entry.
			 *
			 * If the current value of n has the last entry cut short,
			 * set n to copy everything up to the last entry instead.
			 */
			if (n > 0) {
				dp = bp->nb_data + on;
				while (dp < (bp->nb_data + on + n)) {
					direntp = (struct dirent *)dp;
					dp += direntp->d_reclen;
				}
				if (dp > (bp->nb_data + on + n))
					n = (dp - direntp->d_reclen) - (bp->nb_data + on);
			}
			break;
		default:
			printf("nfs_bioread: type %x unexpected\n",vp->v_type);
			FSDBG_BOT(514, vp, 0xd1e0015, 0, EINVAL);
			return (EINVAL);
		};

		if (n > 0)
			error = uiomove(bp->nb_data + on, (int)n, uio);

		switch (vp->v_type) {
		case VREG:
			break;
		case VLNK:
			n = 0;
			break;
		case VDIR:
			if (np->n_flag & NQNFSNONCACHE)
				SET(bp->nb_flags, NB_INVAL);
			break;
		default:
			break;
		}
		nfs_buf_release(bp, 1);
	} while (error == 0 && uio->uio_resid > 0 && n > 0);
	FSDBG_BOT(514, vp, uio->uio_offset, uio->uio_resid, error);
	return (error);
}
/*
 * Vnode op for write using bio
 */
int
nfs_write(ap)
	struct vop_write_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		struct ucred *a_cred;
	} */ *ap;
{
	struct uio *uio = ap->a_uio;
	struct proc *p = uio->uio_procp;
	struct vnode *vp = ap->a_vp;
	struct nfsnode *np = VTONFS(vp);
	struct ucred *cred = ap->a_cred;
	int ioflag = ap->a_ioflag;
	struct nfsbuf *bp;
	struct vattr vattr;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	daddr_t lbn;
	int biosize, bufsize, writeop;
	int n, on, error = 0, iomode, must_commit;
	off_t boff, start, end, cureof;
	struct iovec iov;
	struct uio auio;

	FSDBG_TOP(515, vp, uio->uio_offset, uio->uio_resid, ioflag);

	if (uio->uio_rw != UIO_WRITE)
		panic("nfs_write mode");
	if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != current_proc())
		panic("nfs_write proc");

	if (vp->v_type != VREG)
		return (EIO);
	if (np->n_flag & NWRITEERR) {
		np->n_flag &= ~NWRITEERR;
		FSDBG_BOT(515, vp, uio->uio_offset, uio->uio_resid, np->n_error);
		return (np->n_error);
	}
	if ((nmp->nm_flag & NFSMNT_NFSV3) &&
	    !(nmp->nm_state & NFSSTA_GOTFSINFO))
		(void)nfs_fsinfo(nmp, vp, cred, p);
	if (ioflag & (IO_APPEND | IO_SYNC)) {
		if (np->n_flag & NMODIFIED) {
			error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
			if (error) {
				FSDBG_BOT(515, vp, uio->uio_offset, 0x10bad01, error);
				return (error);
			}
		}
		if (ioflag & IO_APPEND) {
			error = VOP_GETATTR(vp, &vattr, cred, p);
			if (error) {
				FSDBG_BOT(515, vp, uio->uio_offset, 0x10bad02, error);
				return (error);
			}
			uio->uio_offset = np->n_size;
		}
	}
	if (uio->uio_offset < 0) {
		FSDBG_BOT(515, vp, uio->uio_offset, 0xbad0ff, EINVAL);
		return (EINVAL);
	}
	if (uio->uio_resid == 0) {
		FSDBG_BOT(515, vp, uio->uio_offset, uio->uio_resid, 0);
		return (0);
	}
	/*
	 * Maybe this should be above the vnode op call, but so long as
	 * file servers have no limits, i don't think it matters
	 */
	if (p && uio->uio_offset + uio->uio_resid >
	      p->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
		psignal(p, SIGXFSZ);
		FSDBG_BOT(515, vp, uio->uio_offset, 0x2b1f, EFBIG);
		return (EFBIG);
	}

	biosize = vp->v_mount->mnt_stat.f_iosize;

	do {
		/*
		 * Check for a valid write lease.
		 */
		if ((nmp->nm_flag & NFSMNT_NQNFS) &&
		    NQNFS_CKINVALID(vp, np, ND_WRITE)) {
			do {
				error = nqnfs_getlease(vp, ND_WRITE, cred, p);
			} while (error == NQNFS_EXPIRED);
			if (error) {
				FSDBG_BOT(515, vp, uio->uio_offset, 0x11110001, error);
				return (error);
			}
			if (np->n_lrev != np->n_brev ||
			    (np->n_flag & NQNFSNONCACHE)) {
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error) {
					FSDBG_BOT(515, vp, uio->uio_offset, 0x11110002, error);
					return (error);
				}
				np->n_brev = np->n_lrev;
			}
		}
		if (ISSET(vp->v_flag, VNOCACHE_DATA) &&
		    (np->n_dirtyblkhd.lh_first || np->n_cleanblkhd.lh_first)) {
			error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
			if (error) {
				FSDBG_BOT(515, vp, 0, 0, error);
				return (error);
			}
		}
		if (((np->n_flag & NQNFSNONCACHE) ||
		     ISSET(vp->v_flag, VNOCACHE_DATA)) &&
		    uio->uio_iovcnt == 1) {
			iomode = NFSV3WRITE_FILESYNC;
			error = nfs_writerpc(vp, uio, cred, &iomode, &must_commit);
			if (must_commit)
				nfs_clearcommit(vp->v_mount);
			FSDBG_BOT(515, vp, uio->uio_offset, uio->uio_resid, error);
			return (error);
		}
		nfsstats.biocache_writes++;
		lbn = uio->uio_offset / biosize;
		on = uio->uio_offset % biosize;
		n = min((unsigned)(biosize - on), uio->uio_resid);
again:
		bufsize = biosize;
		/*
		 * Get a cache block for writing.  The range to be written is
		 * (off..off+n) within the block.  We ensure that the block
		 * either has no dirty region or that the given range is
		 * contiguous with the existing dirty region.
		 */
		bp = nfs_buf_get(vp, lbn, bufsize, p, BLK_WRITE);
		if (!bp) {
			FSDBG_BOT(515, vp, uio->uio_offset, uio->uio_resid, EINTR);
			return (EINTR);
		}
		/* map the block because we know we're going to write to it */
		nfs_buf_map(bp);

		if (ISSET(vp->v_flag, VNOCACHE_DATA))
			SET(bp->nb_flags, (NB_NOCACHE|NB_INVAL));

		/*
		 * NFS has embedded ucred so crhold() risks zone corruption
		 */
		if (bp->nb_wcred == NOCRED)
			bp->nb_wcred = crdup(cred);

		/*
		 * If there's already a dirty range AND dirty pages in this block we
		 * need to send a commit AND write the dirty pages before continuing.
		 *
		 * If there's already a dirty range OR dirty pages in this block
		 * and the new write range is not contiguous with the existing range,
		 * then force the buffer to be written out now.
		 * (We used to just extend the dirty range to cover the valid,
		 * but unwritten, data in between also.  But writing ranges
		 * of data that weren't actually written by an application
		 * risks overwriting some other client's data with stale data
		 * that's just masquerading as new written data.)
		 */
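		/*
		 * Illustrative example (not from the original comments): if the
		 * buffer already has a dirty range of [0..4096) and the new write
		 * starts at on = 8192, the two ranges are not contiguous, so rather
		 * than marking [4096..8192) dirty with data the application never
		 * wrote, the buffer is pushed out with the NB_STABLE write just
		 * below and the write is retried against the refreshed buffer.
		 */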
		if (bp->nb_dirtyend > 0) {
			if (on > bp->nb_dirtyend || (on + n) < bp->nb_dirtyoff || bp->nb_dirty) {
				FSDBG(515, vp, uio->uio_offset, bp, 0xd15c001);
				/* write/commit buffer "synchronously" */
				/* (NB_STABLE indicates that data writes should be FILESYNC) */
				CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL));
				SET(bp->nb_flags, (NB_ASYNC | NB_STABLE));
				error = nfs_buf_write(bp);
				if (error) {
					FSDBG_BOT(515, vp, uio->uio_offset, uio->uio_resid, error);
					return (error);
				}
				goto again;
			}
		} else if (bp->nb_dirty) {
			int firstpg, lastpg;
			int pagemask;
			/* calculate write range pagemask */
			firstpg = on/PAGE_SIZE;
			lastpg = (on+n-1)/PAGE_SIZE;
			pagemask = ((1 << (lastpg+1)) - 1) & ~((1 << firstpg) - 1);
			/* check if there are dirty pages outside the write range */
			if (bp->nb_dirty & ~pagemask) {
				FSDBG(515, vp, uio->uio_offset, bp, 0xd15c002);
				/* write/commit buffer "synchronously" */
				/* (NB_STABLE indicates that data writes should be FILESYNC) */
				CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL));
				SET(bp->nb_flags, (NB_ASYNC | NB_STABLE));
				error = nfs_buf_write(bp);
				if (error) {
					FSDBG_BOT(515, vp, uio->uio_offset, uio->uio_resid, error);
					return (error);
				}
				goto again;
			}
			/* if the first or last pages are already dirty */
			/* make sure that the dirty range encompasses those pages */
			if (NBPGDIRTY(bp,firstpg) || NBPGDIRTY(bp,lastpg)) {
				FSDBG(515, vp, uio->uio_offset, bp, 0xd15c003);
				bp->nb_dirtyoff = min(on, firstpg * PAGE_SIZE);
				if (NBPGDIRTY(bp,lastpg)) {
					bp->nb_dirtyend = (lastpg+1) * PAGE_SIZE;
					/* clip to EOF */
					if (NBOFF(bp) + bp->nb_dirtyend > np->n_size)
						bp->nb_dirtyend = np->n_size - NBOFF(bp);
				} else
					bp->nb_dirtyend = on+n;
			}
		}

		/*
		 * Are we extending the size of the file with this write?
		 * If so, update file size now that we have the block.
		 * If there was a partial buf at the old eof, validate
		 * and zero the new bytes.
		 */
		cureof = (off_t)np->n_size;
		if (uio->uio_offset + n > np->n_size) {
			struct nfsbuf *eofbp = NULL;
			daddr_t eofbn = np->n_size / biosize;
			int eofoff = np->n_size % biosize;
			int neweofoff = (uio->uio_offset + n) % biosize;

			FSDBG(515, 0xb1ffa000, uio->uio_offset + n, eofoff, neweofoff);

			if (eofoff && eofbn < lbn && nfs_buf_incore(vp, eofbn))
				eofbp = nfs_buf_get(vp, eofbn, biosize, p, BLK_WRITE);

			/* if we're extending within the same last block */
			/* and the block is flagged as being cached... */
			if ((lbn == eofbn) && ISSET(bp->nb_flags, NB_CACHE)) {
				/* ...check that all pages in buffer are valid */
				int endpg = ((neweofoff ? neweofoff : biosize) - 1)/PAGE_SIZE;
				int pagemask;
				/* pagemask only has to extend to last page being written to */
				pagemask = (1 << (endpg+1)) - 1;
				FSDBG(515, 0xb1ffa001, bp->nb_valid, pagemask, 0);
				if ((bp->nb_valid & pagemask) != pagemask) {
					/* zerofill any hole */
					if (on > bp->nb_validend) {
						int i;
						for (i=bp->nb_validend/PAGE_SIZE; i <= (on - 1)/PAGE_SIZE; i++)
							NBPGVALID_SET(bp, i);
						FSDBG(516, bp, bp->nb_validend, on - bp->nb_validend, 0xf01e);
						bzero((char *)bp->nb_data + bp->nb_validend,
							on - bp->nb_validend);
					}
					/* zerofill any trailing data in the last page */
					if (neweofoff) {
						FSDBG(516, bp, neweofoff, PAGE_SIZE - (neweofoff & PAGE_MASK), 0xe0f);
						bzero((char *)bp->nb_data + neweofoff,
							PAGE_SIZE - (neweofoff & PAGE_MASK));
					}
				}
			}
			np->n_flag |= NMODIFIED;
			np->n_size = uio->uio_offset + n;
			ubc_setsize(vp, (off_t)np->n_size); /* XXX errors */
			if (eofbp) {
				/*
				 * We may need to zero any previously invalid data
				 * after the old EOF in the previous EOF buffer.
				 *
				 * For the old last page, don't zero bytes if there
				 * are invalid bytes in that page (i.e. the page isn't
				 * currently valid).
				 * For pages after the old last page, zero them and
				 * mark them as valid.
				 */
				char *d = eofbp->nb_data;
				int i;
				if (ISSET(vp->v_flag, VNOCACHE_DATA))
					SET(eofbp->nb_flags, (NB_NOCACHE|NB_INVAL));
				FSDBG(516, eofbp, eofoff, biosize - eofoff, 0xe0fff01e);
				i = eofoff/PAGE_SIZE;
				while (eofoff < biosize) {
					int poff = eofoff & PAGE_MASK;
					if (!poff || NBPGVALID(eofbp,i)) {
						bzero(d + eofoff, PAGE_SIZE - poff);
						NBPGVALID_SET(eofbp, i);
					}
					if (bp->nb_validend == eofoff)
						bp->nb_validend += PAGE_SIZE - poff;
					eofoff += PAGE_SIZE - poff;
					i++;
				}
				nfs_buf_release(eofbp, 1);
			}
		}
		/*
		 * If dirtyend exceeds file size, chop it down.  This should
		 * not occur unless there is a race.
		 */
		if (NBOFF(bp) + bp->nb_dirtyend > np->n_size)
			bp->nb_dirtyend = np->n_size - NBOFF(bp);
		/*
		 * UBC doesn't handle partial pages, so we need to make sure
		 * that any pages left in the page cache are completely valid.
		 *
		 * Writes that are smaller than a block are delayed if they
		 * don't extend to the end of the block.
		 *
		 * If the block isn't (completely) cached, we may need to read
		 * in some parts of pages that aren't covered by the write.
		 * If the write offset (on) isn't page aligned, we'll need to
		 * read the start of the first page being written to.  Likewise,
		 * if the offset of the end of the write (on+n) isn't page aligned,
		 * we'll need to read the end of the last page being written to.
		 *
		 * Notes:
		 * We don't want to read anything we're just going to write over.
		 * We don't want to issue multiple I/Os if we don't have to
		 *   (because they're synchronous rpcs).
		 * We don't want to read anything we already have modified in the
		 *   buffer.
		 */
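		/*
		 * Worked example (illustrative only, assuming 4K pages): a write of
		 * bytes 100..5000 of an uncached block has firstpgoff = 100 and
		 * lastpgoff = 905, so the code below may need to read bytes 0..99 of
		 * the first page and bytes 5001..8191 of the last page, unless those
		 * pages are already valid or the range turns out to lie beyond the
		 * current EOF (in which case it is simply zeroed instead).
		 */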
1941 if (!ISSET(bp
->nb_flags
, NB_CACHE
) && n
< biosize
) {
1942 int firstpg
, lastpg
, dirtypg
;
1943 int firstpgoff
, lastpgoff
;
1945 firstpg
= on
/PAGE_SIZE
;
1946 firstpgoff
= on
& PAGE_MASK
;
1947 lastpg
= (on
+n
-1)/PAGE_SIZE
;
1948 lastpgoff
= (on
+n
) & PAGE_MASK
;
1949 if (firstpgoff
&& !NBPGVALID(bp
,firstpg
)) {
1950 /* need to read start of first page */
1951 start
= firstpg
* PAGE_SIZE
;
1952 end
= start
+ firstpgoff
;
1954 if (lastpgoff
&& !NBPGVALID(bp
,lastpg
)) {
1955 /* need to read end of last page */
1957 start
= (lastpg
* PAGE_SIZE
) + lastpgoff
;
1958 end
= (lastpg
+ 1) * PAGE_SIZE
;
1961 /* need to read the data in range: start...end-1 */
1963 /* first, check for dirty pages in between */
1964 /* if there are, we'll have to do two reads because */
1965 /* we don't want to overwrite the dirty pages. */
1966 for (dirtypg
=start
/PAGE_SIZE
; dirtypg
<= (end
-1)/PAGE_SIZE
; dirtypg
++)
1967 if (NBPGDIRTY(bp
,dirtypg
))
1970 /* if start is at beginning of page, try */
1971 /* to get any preceeding pages as well. */
1972 if (!(start
& PAGE_MASK
)) {
1973 /* stop at next dirty/valid page or start of block */
1974 for (; start
> 0; start
-=PAGE_SIZE
)
1975 if (NBPGVALID(bp
,((start
-1)/PAGE_SIZE
)))
1980 /* setup uio for read(s) */
1982 auio
.uio_iov
= &iov
;
1983 auio
.uio_iovcnt
= 1;
1984 auio
.uio_segflg
= UIO_SYSSPACE
;
1985 auio
.uio_rw
= UIO_READ
;
1988 if (dirtypg
<= (end
-1)/PAGE_SIZE
) {
1989 /* there's a dirty page in the way, so just do two reads */
1990 /* we'll read the preceding data here */
1991 auio
.uio_offset
= boff
+ start
;
1992 auio
.uio_resid
= iov
.iov_len
= on
- start
;
1993 iov
.iov_base
= bp
->nb_data
+ start
;
1994 error
= nfs_readrpc(vp
, &auio
, cred
);
1996 bp
->nb_error
= error
;
1997 SET(bp
->nb_flags
, NB_ERROR
);
1998 printf("nfs_write: readrpc %d", error
);
2000 if (auio
.uio_resid
> 0) {
2001 FSDBG(516, bp
, iov
.iov_base
- bp
->nb_data
, auio
.uio_resid
, 0xd00dee01);
2002 bzero(iov
.iov_base
, auio
.uio_resid
);
2004 /* update validoff/validend if necessary */
2005 if ((bp
->nb_validoff
< 0) || (bp
->nb_validoff
> start
))
2006 bp
->nb_validoff
= start
;
2007 if ((bp
->nb_validend
< 0) || (bp
->nb_validend
< on
))
2008 bp
->nb_validend
= on
;
2009 if (np
->n_size
> boff
+ bp
->nb_validend
)
2010 bp
->nb_validend
= min(np
->n_size
- (boff
+ start
), biosize
);
2011 /* validate any pages before the write offset */
2012 for (; start
< on
/PAGE_SIZE
; start
+=PAGE_SIZE
)
2013 NBPGVALID_SET(bp
, start
/PAGE_SIZE
);
2014 /* adjust start to read any trailing data */
2018 /* if end is at end of page, try to */
2019 /* get any following pages as well. */
2020 if (!(end
& PAGE_MASK
)) {
2021 /* stop at next valid page or end of block */
2022 for (; end
< bufsize
; end
+=PAGE_SIZE
)
2023 if (NBPGVALID(bp
,end
/PAGE_SIZE
))
2027 if (((boff
+start
) >= cureof
) || ((start
>= on
) && ((boff
+ on
+ n
) >= cureof
))) {
2029 * Either this entire read is beyond the current EOF
2030 * or the range that we won't be modifying (on+n...end)
2031 * is all beyond the current EOF.
2032 * No need to make a trip across the network to
2033 * read nothing. So, just zero the buffer instead.
2035 FSDBG(516, bp
, start
, end
- start
, 0xd00dee00);
2036 bzero(bp
->nb_data
+ start
, end
- start
);
2038 /* now we'll read the (rest of the) data */
2039 auio
.uio_offset
= boff
+ start
;
2040 auio
.uio_resid
= iov
.iov_len
= end
- start
;
2041 iov
.iov_base
= bp
->nb_data
+ start
;
2042 error
= nfs_readrpc(vp
, &auio
, cred
);
2044 bp
->nb_error
= error
;
2045 SET(bp
->nb_flags
, NB_ERROR
);
2046 printf("nfs_write: readrpc %d", error
);
2048 if (auio
.uio_resid
> 0) {
2049 FSDBG(516, bp
, iov
.iov_base
- bp
->nb_data
, auio
.uio_resid
, 0xd00dee02);
2050 bzero(iov
.iov_base
, auio
.uio_resid
);
2053 /* update validoff/validend if necessary */
2054 if ((bp
->nb_validoff
< 0) || (bp
->nb_validoff
> start
))
2055 bp
->nb_validoff
= start
;
2056 if ((bp
->nb_validend
< 0) || (bp
->nb_validend
< end
))
2057 bp
->nb_validend
= end
;
2058 if (np
->n_size
> boff
+ bp
->nb_validend
)
2059 bp
->nb_validend
= min(np
->n_size
- (boff
+ start
), biosize
);
2060 /* validate any pages before the write offset's page */
2061 for (; start
< trunc_page_32(on
); start
+=PAGE_SIZE
)
2062 NBPGVALID_SET(bp
, start
/PAGE_SIZE
);
2063 /* validate any pages after the range of pages being written to */
2064 for (; (end
- 1) > round_page_32(on
+n
-1); end
-=PAGE_SIZE
)
2065 NBPGVALID_SET(bp
, (end
-1)/PAGE_SIZE
);
2066 /* Note: pages being written to will be validated when written */
2070 if (ISSET(bp
->nb_flags
, NB_ERROR
)) {
2071 error
= bp
->nb_error
;
2072 nfs_buf_release(bp
, 1);
2073 FSDBG_BOT(515, vp
, uio
->uio_offset
, uio
->uio_resid
, error
);
2077 np
->n_flag
|= NMODIFIED
;
2080 * Check for valid write lease and get one as required.
2081 * In case nfs_buf_get() and/or nfs_buf_write() delayed us.
2083 if ((nmp
->nm_flag
& NFSMNT_NQNFS
) &&
2084 NQNFS_CKINVALID(vp
, np
, ND_WRITE
)) {
2086 error
= nqnfs_getlease(vp
, ND_WRITE
, cred
, p
);
2087 } while (error
== NQNFS_EXPIRED
);
2089 nfs_buf_release(bp
, 1);
2090 FSDBG_BOT(515, vp
, uio
->uio_offset
, 0x11220001, error
);
2093 if (np
->n_lrev
!= np
->n_brev
||
2094 (np
->n_flag
& NQNFSNONCACHE
)) {
2095 nfs_buf_release(bp
, 1);
2096 error
= nfs_vinvalbuf(vp
, V_SAVE
, cred
, p
, 1);
2098 FSDBG_BOT(515, vp
, uio
->uio_offset
, 0x11220002, error
);
2101 np
->n_brev
= np
->n_lrev
;
        error = uiomove((char *)bp->nb_data + on, n, uio);
        if (error) {
            SET(bp->nb_flags, NB_ERROR);
            nfs_buf_release(bp, 1);
            FSDBG_BOT(515, vp, uio->uio_offset, uio->uio_resid, error);
            return (error);
        }
        /* validate any pages written to */
        start = on & ~PAGE_MASK;
        for (; start < on + n; start += PAGE_SIZE) {
            NBPGVALID_SET(bp, start/PAGE_SIZE);
            /*
             * This may seem a little weird, but we don't actually set the
             * dirty bits for writes.  This is because we keep the dirty range
             * in the nb_dirtyoff/nb_dirtyend fields.  Also, particularly for
             * delayed writes, when we give the pages back to the VM we don't
             * want to keep them marked dirty, because when we later write the
             * buffer we won't be able to tell which pages were written dirty
             * and which pages were mmapped and dirtied.
             */
        }
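        /*
         * In short: mmap'ed modifications are tracked per-page in nb_dirty
         * (filled in from the UPL), while write(2) modifications are kept
         * as a single byte range in nb_dirtyoff/nb_dirtyend.  E.g. a 10
         * byte write at offset 100 of an otherwise clean block leaves
         * nb_dirtyoff == 100 and nb_dirtyend == 110 but sets no nb_dirty bits.
         */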
        if (bp->nb_dirtyend > 0) {
            bp->nb_dirtyoff = min(on, bp->nb_dirtyoff);
            bp->nb_dirtyend = max((on + n), bp->nb_dirtyend);
        } else {
            bp->nb_dirtyoff = on;
            bp->nb_dirtyend = on + n;
        }
        if (bp->nb_validend <= 0 || bp->nb_validend < bp->nb_dirtyoff ||
            bp->nb_validoff > bp->nb_dirtyend) {
            bp->nb_validoff = bp->nb_dirtyoff;
            bp->nb_validend = bp->nb_dirtyend;
        } else {
            bp->nb_validoff = min(bp->nb_validoff, bp->nb_dirtyoff);
            bp->nb_validend = max(bp->nb_validend, bp->nb_dirtyend);
        }
        if (!ISSET(bp->nb_flags, NB_CACHE))
            nfs_buf_normalize_valid_range(np, bp);
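        /*
         * The adjustments above maintain the invariant that the dirty byte
         * range is always contained within the valid byte range, extending
         * (or replacing) the valid range as needed so a later write rpc
         * never pushes bytes that were never initialized.
         */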
        /*
         * Since this block is being modified, it must be written
         * again and not just committed.
         */
        if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
            np->n_needcommitcnt--;
            CHECK_NEEDCOMMITCNT(np);
        }
        CLR(bp->nb_flags, NB_NEEDCOMMIT);
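        /*
         * The block can now go out one of three ways, as the branches below
         * show: written synchronously (uncached or IO_SYNC I/O), started as
         * an async write immediately when a full block was written, or left
         * as a delayed write to be pushed later.
         */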
        if ((np->n_flag & NQNFSNONCACHE) ||
            (ioflag & IO_SYNC) || (vp->v_flag & VNOCACHE_DATA)) {
            bp->nb_proc = p;
            error = nfs_buf_write(bp);
            if (error) {
                FSDBG_BOT(515, vp, uio->uio_offset,
                    uio->uio_resid, error);
                return (error);
            }
            if (np->n_flag & NQNFSNONCACHE) {
                error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
                if (error) {
                    FSDBG_BOT(515, vp, uio->uio_offset,
                        uio->uio_resid, error);
                    return (error);
                }
            }
        } else if ((n + on) == biosize && (nmp->nm_flag & NFSMNT_NQNFS) == 0) {
            bp->nb_proc = (struct proc *)0;
            SET(bp->nb_flags, NB_ASYNC);
            nfs_buf_write(bp);
        } else
            nfs_buf_write_delayed(bp);

        if (np->n_needcommitcnt > (nbuf/16))
            nfs_flushcommits(vp, p);

    } while (uio->uio_resid > 0 && n > 0);

    FSDBG_BOT(515, vp, uio->uio_offset, uio->uio_resid, 0);
    return (0);
}
/*
 * Flush out and invalidate all buffers associated with a vnode.
 * Called with the underlying object locked.
 */
int
nfs_vinvalbuf_internal(vp, flags, cred, p, slpflag, slptimeo)
    register struct vnode *vp;
    int flags;
    struct ucred *cred;
    struct proc *p;
    int slpflag, slptimeo;
{
    struct nfsbuf *bp;
    struct nfsbuf *nbp, *blist;
    int s, error = 0;
    struct nfsnode *np = VTONFS(vp);

    if (flags & V_SAVE) {
        if ((error = VOP_FSYNC(vp, cred, MNT_WAIT, p)))
            return (error);
        if (np->n_dirtyblkhd.lh_first)
            panic("nfs_vinvalbuf: dirty bufs (vp 0x%x, bp 0x%x)",
                vp, np->n_dirtyblkhd.lh_first);
    }

    for (;;) {
        blist = np->n_cleanblkhd.lh_first;
        if (!blist)
            blist = np->n_dirtyblkhd.lh_first;
        if (!blist)
            break;

        for (bp = blist; bp; bp = nbp) {
            nbp = bp->nb_vnbufs.le_next;
            s = splbio();
            if (ISSET(bp->nb_flags, NB_BUSY)) {
                SET(bp->nb_flags, NB_WANTED);
                FSDBG_TOP(556, vp, bp, NBOFF(bp), bp->nb_flags);
                error = tsleep((caddr_t)bp,
                    slpflag | (PRIBIO + 1), "nfs_vinvalbuf",
                    slptimeo);
                FSDBG_BOT(556, vp, bp, NBOFF(bp), bp->nb_flags);
                splx(s);
                if (error) {
                    FSDBG(554, vp, bp, -1, error);
                    return (error);
                }
                break;
            }
            FSDBG(554, vp, bp, NBOFF(bp), bp->nb_flags);
            nfs_buf_remfree(bp);
            SET(bp->nb_flags, NB_BUSY);
            splx(s);
            if ((flags & V_SAVE) && UBCINFOEXISTS(vp) && (NBOFF(bp) < np->n_size)) {
                /* XXX extra paranoia: make sure we're not */
                /* somehow leaving any dirty data around */
                int mustwrite = 0;
                int end = (NBOFF(bp) + bp->nb_bufsize >= np->n_size) ?
                    bp->nb_bufsize : (np->n_size - NBOFF(bp));
                if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
                    error = nfs_buf_upl_setup(bp);
                    if (error == EINVAL) {
                        /* vm object must no longer exist */
                        /* hopefully we don't need to do */
                        /* anything for this buffer */
                    } else if (error)
                        printf("nfs_vinvalbuf: upl setup failed %d\n",
                            error);
                    bp->nb_valid = bp->nb_dirty = 0;
                }
                nfs_buf_upl_check(bp);
                /* check for any dirty data before the EOF */
                if (bp->nb_dirtyend && bp->nb_dirtyoff < end) {
                    /* clip dirty range to EOF */
                    if (bp->nb_dirtyend > end)
                        bp->nb_dirtyend = end;
                    mustwrite++;
                }
                bp->nb_dirty &= (1 << (round_page_32(end)/PAGE_SIZE)) - 1;
                if (bp->nb_dirty)
                    mustwrite++;
                if (mustwrite) {
                    FSDBG(554, vp, bp, 0xd00dee, bp->nb_flags);
                    if (!ISSET(bp->nb_flags, NB_PAGELIST))
                        panic("nfs_vinvalbuf: dirty buffer without upl");
                    /* gotta write out dirty data before invalidating */
                    /* (NB_STABLE indicates that data writes should be FILESYNC) */
                    /* (NB_NOCACHE indicates buffer should be discarded) */
                    CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL | NB_ASYNC));
                    SET(bp->nb_flags, NB_STABLE | NB_NOCACHE);
                    /*
                     * NFS has embedded ucred so crhold() risks zone corruption
                     */
                    if (bp->nb_wcred == NOCRED)
                        bp->nb_wcred = crdup(cred);
                    error = nfs_buf_write(bp);
                    // Note: bp has been released
                    if (error) {
                        FSDBG(554, bp, 0xd00dee, 0xbad, error);
                        np->n_error = error;
                        np->n_flag |= NWRITEERR;
                    }
                    break;
                }
            }
            SET(bp->nb_flags, NB_INVAL);
            // Note: We don't want to do FREEUPs here because
            // that may modify the buffer chain we're iterating!
            nfs_buf_release(bp, 0);
        }
    }
    if (np->n_dirtyblkhd.lh_first || np->n_cleanblkhd.lh_first)
        panic("nfs_vinvalbuf: flush failed");
    return (0);
}
/*
 * Flush and invalidate all dirty buffers. If another process is already
 * doing the flush, just wait for completion.
 */
int
nfs_vinvalbuf(vp, flags, cred, p, intrflg)
    struct vnode *vp;
    int flags;
    struct ucred *cred;
    struct proc *p;
    int intrflg;
{
    register struct nfsnode *np = VTONFS(vp);
    struct nfsmount *nmp = VFSTONFS(vp->v_mount);
    int error = 0, slpflag, slptimeo;
    int didhold = 0;

    FSDBG_TOP(554, vp, flags, intrflg, 0);

    if (nmp && ((nmp->nm_flag & NFSMNT_INT) == 0))
        intrflg = 0;
    if (intrflg) {
        slpflag = PCATCH;
        slptimeo = 2 * hz;
    } else {
        slpflag = 0;
        slptimeo = 0;
    }
    /*
     * First wait for any other process doing a flush to complete.
     */
    while (np->n_flag & NFLUSHINPROG) {
        np->n_flag |= NFLUSHWANT;
        FSDBG_TOP(555, vp, flags, intrflg, np->n_flag);
        error = tsleep((caddr_t)&np->n_flag, PRIBIO + 2, "nfsvinval", slptimeo);
        FSDBG_BOT(555, vp, flags, intrflg, np->n_flag);
        if (error && (error = nfs_sigintr(VFSTONFS(vp->v_mount), NULL, p))) {
            FSDBG_BOT(554, vp, flags, intrflg, error);
            return (error);
        }
    }

    /*
     * Now, flush as required.
     */
    np->n_flag |= NFLUSHINPROG;
    error = nfs_vinvalbuf_internal(vp, flags, cred, p, slpflag, 0);
    while (error) {
        FSDBG(554, vp, 0, 0, error);
        error = nfs_sigintr(VFSTONFS(vp->v_mount), NULL, p);
        if (error) {
            np->n_flag &= ~NFLUSHINPROG;
            if (np->n_flag & NFLUSHWANT) {
                np->n_flag &= ~NFLUSHWANT;
                wakeup((caddr_t)&np->n_flag);
            }
            FSDBG_BOT(554, vp, flags, intrflg, error);
            return (error);
        }
        error = nfs_vinvalbuf_internal(vp, flags, cred, p, 0, slptimeo);
    }
    np->n_flag &= ~(NMODIFIED | NFLUSHINPROG);
    if (np->n_flag & NFLUSHWANT) {
        np->n_flag &= ~NFLUSHWANT;
        wakeup((caddr_t)&np->n_flag);
    }
    didhold = ubc_hold(vp);
    if (didhold) {
        int rv = ubc_clean(vp, 1); /* get the pages out of vm also */
        if (!rv)
            panic("nfs_vinvalbuf(): ubc_clean failed!");
        ubc_rele(vp);
    }
    FSDBG_BOT(554, vp, flags, intrflg, 0);
    return (0);
}
/*
 * Initiate asynchronous I/O. Return an error if no nfsiods are available.
 * This is mainly to avoid queueing async I/O requests when the nfsiods
 * are all hung on a dead server.
 */
int
nfs_asyncio(bp, cred)
    register struct nfsbuf *bp;
    struct ucred *cred;
{
    struct nfsmount *nmp;
    int i;
    int gotiod;
    int slpflag = 0;
    int slptimeo = 0;
    int error, error2;

    if (nfs_numasync == 0)
        return (EIO);

    FSDBG_TOP(552, bp, bp ? NBOFF(bp) : 0, bp ? bp->nb_flags : 0, 0);

    nmp = ((bp != NULL) ? VFSTONFS(bp->nb_vp->v_mount) : NULL);
again:
    if (nmp && nmp->nm_flag & NFSMNT_INT)
        slpflag = PCATCH;
    gotiod = FALSE;

    /* no nfsbuf means tell nfsiod to process delwri list */
    if (!bp)
        nfs_ioddelwri = 1;

    /*
     * Find a free iod to process this request.
     */
    for (i = 0; i < NFS_MAXASYNCDAEMON; i++)
        if (nfs_iodwant[i]) {
            /*
             * Found one, so wake it up and tell it which
             * mount to work on.
             */
            NFS_DPF(ASYNCIO,
                ("nfs_asyncio: waking iod %d for mount %p\n",
                 i, nmp));
            nfs_iodwant[i] = (struct proc *)0;
            nfs_iodmount[i] = nmp;
            if (nmp)
                nmp->nm_bufqiods++;
            wakeup((caddr_t)&nfs_iodwant[i]);
            gotiod = TRUE;
            break;
        }

    /* if we're just poking the delwri list, we're done */
    if (!bp)
        return (0);

    /*
     * If none are free, we may already have an iod working on this mount
     * point. If so, it will process our request.
     */
    if (!gotiod) {
        if (nmp->nm_bufqiods > 0) {
            NFS_DPF(ASYNCIO,
                ("nfs_asyncio: %d iods are already processing mount %p\n",
                 nmp->nm_bufqiods, nmp));
            gotiod = TRUE;
        }
    }

    /*
     * If we have an iod which can process the request, then queue
     * the buffer.
     */
    FSDBG(552, bp, gotiod, i, nmp->nm_bufqiods);
    if (gotiod) {
        /*
         * Ensure that the queue never grows too large.
         */
        while (nmp->nm_bufqlen >= 2*nfs_numasync) {
            if (ISSET(bp->nb_flags, NB_IOD)) {
                /* An nfsiod is attempting this async operation so */
                /* we must not fall asleep on the bufq because we */
                /* could be waiting on ourself.  Just return error */
                /* and we'll do this operation synchronously. */
                goto out;
            }
            FSDBG(552, bp, nmp->nm_bufqlen, 2*nfs_numasync, -1);
            NFS_DPF(ASYNCIO,
                ("nfs_asyncio: waiting for mount %p queue to drain\n", nmp));
            nmp->nm_bufqwant = TRUE;
            error = tsleep(&nmp->nm_bufq, slpflag | PRIBIO,
                "nfsaio", slptimeo);
            if (error) {
                error2 = nfs_sigintr(nmp, NULL, bp->nb_proc);
                if (error2) {
                    FSDBG_BOT(552, bp, NBOFF(bp), bp->nb_flags, error2);
                    return (error2);
                }
                if (slpflag == PCATCH) {
                    slpflag = 0;
                    slptimeo = 2 * hz;
                }
            }
            /*
             * We might have lost our iod while sleeping,
             * so check and loop if necessary.
             */
            if (nmp->nm_bufqiods == 0) {
                NFS_DPF(ASYNCIO,
                    ("nfs_asyncio: no iods after mount %p queue was drained, looping\n", nmp));
                goto again;
            }
        }

        if (ISSET(bp->nb_flags, NB_READ)) {
            if (bp->nb_rcred == NOCRED && cred != NOCRED) {
                /*
                 * NFS has embedded ucred.
                 * Can not crhold() here as that causes zone corruption
                 */
                bp->nb_rcred = crdup(cred);
            }
        } else {
            SET(bp->nb_flags, NB_WRITEINPROG);
            if (bp->nb_wcred == NOCRED && cred != NOCRED) {
                /*
                 * NFS has embedded ucred.
                 * Can not crhold() here as that causes zone corruption
                 */
                bp->nb_wcred = crdup(cred);
            }
        }

        TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, nb_free);
        nmp->nm_bufqlen++;
        FSDBG_BOT(552, bp, NBOFF(bp), bp->nb_flags, 0);
        return (0);
    }

out:
    /*
     * All the iods are busy on other mounts, so return EIO to
     * force the caller to process the i/o synchronously.
     */
    NFS_DPF(ASYNCIO, ("nfs_asyncio: no iods available, i/o is synchronous\n"));
    FSDBG_BOT(552, bp, NBOFF(bp), bp->nb_flags, EIO);
    return (EIO);
}
/*
 * Do an I/O operation to/from a cache block. This may be called
 * synchronously or from an nfsiod.
 */
int
nfs_doio(bp, cr, p)
    struct nfsbuf *bp;
    struct ucred *cr;
    struct proc *p;
{
    register struct uio *uiop;
    register struct vnode *vp;
    struct nfsnode *np;
    struct nfsmount *nmp;
    int error = 0, diff, len, iomode, must_commit = 0;
    struct uio uio;
    struct iovec io;

    vp = bp->nb_vp;
    np = VTONFS(vp);
    nmp = VFSTONFS(vp->v_mount);
    uiop = &uio;
    uiop->uio_iov = &io;
    uiop->uio_iovcnt = 1;
    uiop->uio_segflg = UIO_SYSSPACE;
    uiop->uio_procp = p;

    /*
     * we've decided to perform I/O for this block,
     * so we couldn't possibly NB_DONE.  So, clear it.
     */
    if (ISSET(bp->nb_flags, NB_DONE)) {
        if (!ISSET(bp->nb_flags, NB_ASYNC))
            panic("nfs_doio: done and not async");
        CLR(bp->nb_flags, NB_DONE);
    }
    FSDBG_TOP(256, np->n_size, NBOFF(bp), bp->nb_bufsize, bp->nb_flags);
    FSDBG(257, bp->nb_validoff, bp->nb_validend, bp->nb_dirtyoff,
        bp->nb_dirtyend);
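    /*
     * The uio/iovec set up above is reused for each rpc below: it simply
     * describes a window into the buffer's data.  For NB_READ buffers a
     * read (or readlink/readdir) rpc fills the buffer; otherwise the dirty
     * portions of the block are pushed with write and/or commit rpcs.
     */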
    if (ISSET(bp->nb_flags, NB_READ)) {
        if (vp->v_type == VREG)
            NFS_BUF_MAP(bp);
        io.iov_len = uiop->uio_resid = bp->nb_bufsize;
        io.iov_base = bp->nb_data;
        uiop->uio_rw = UIO_READ;
        switch (vp->v_type) {
        case VREG:
            uiop->uio_offset = NBOFF(bp);
            nfsstats.read_bios++;
            error = nfs_readrpc(vp, uiop, cr);
            FSDBG(262, np->n_size, NBOFF(bp), uiop->uio_resid, error);
            if (!error) {
                /* update valid range */
                bp->nb_validoff = 0;
                if (uiop->uio_resid) {
                    /*
                     * If len > 0, there is a hole in the file and
                     * no writes after the hole have been pushed to
                     * the server yet.
                     * Just zero fill the rest of the valid area.
                     */
                    diff = bp->nb_bufsize - uiop->uio_resid;
                    len = np->n_size - (NBOFF(bp) + diff);
                    if (len > 0) {
                        len = min(len, uiop->uio_resid);
                        bzero((char *)bp->nb_data + diff, len);
                        bp->nb_validend = diff + len;
                        FSDBG(258, diff, len, 0, 1);
                    } else
                        bp->nb_validend = diff;
                } else
                    bp->nb_validend = bp->nb_bufsize;
                bp->nb_valid = (1 << (round_page_32(bp->nb_validend)/PAGE_SIZE)) - 1;
                if (bp->nb_validend & PAGE_MASK) {
                    /* valid range ends in the middle of a page so we */
                    /* need to zero-fill any invalid data at the end */
                    /* of the last page */
                    bzero((caddr_t)(bp->nb_data + bp->nb_validend),
                        bp->nb_bufsize - bp->nb_validend);
                    FSDBG(258, bp->nb_validend,
                        bp->nb_bufsize - bp->nb_validend, 0, 2);
                }
            }
            if (p && (vp->v_flag & VTEXT) &&
                (((nmp->nm_flag & NFSMNT_NQNFS) &&
                  NQNFS_CKINVALID(vp, np, ND_READ) &&
                  np->n_lrev != np->n_brev) ||
                 (!(nmp->nm_flag & NFSMNT_NQNFS) &&
                  np->n_mtime != np->n_vattr.va_mtime.tv_sec))) {
                uprintf("Process killed due to text file modification\n");
                psignal(p, SIGKILL);
                p->p_flag |= P_NOSWAP;
            }
            break;
        case VLNK:
            uiop->uio_offset = (off_t)0;
            nfsstats.readlink_bios++;
            error = nfs_readlinkrpc(vp, uiop, cr);
            if (!error) {
                bp->nb_validoff = 0;
                bp->nb_validend = uiop->uio_offset;
            }
            break;
        case VDIR:
            nfsstats.readdir_bios++;
            uiop->uio_offset = NBOFF(bp);
            if (!(nmp->nm_flag & NFSMNT_NFSV3))
                nmp->nm_flag &= ~NFSMNT_RDIRPLUS; /* dk@farm.org */
            if (nmp->nm_flag & NFSMNT_RDIRPLUS) {
                error = nfs_readdirplusrpc(vp, uiop, cr);
                if (error == NFSERR_NOTSUPP)
                    nmp->nm_flag &= ~NFSMNT_RDIRPLUS;
            }
            if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0)
                error = nfs_readdirrpc(vp, uiop, cr);
            if (!error) {
                bp->nb_validoff = 0;
                bp->nb_validend = uiop->uio_offset - NBOFF(bp);
                bp->nb_valid = (1 << (round_page_32(bp->nb_validend)/PAGE_SIZE)) - 1;
            }
            break;
        default:
            printf("nfs_doio: type %x unexpected\n", vp->v_type);
            break;
        }
        if (error) {
            SET(bp->nb_flags, NB_ERROR);
            bp->nb_error = error;
        }

    } else {
        /* we're doing a write */
        int doff, dend = 0;

        /* We need to make sure the pages are locked before doing I/O. */
        if (!ISSET(bp->nb_flags, NB_META) && UBCISVALID(vp)) {
            if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
                error = nfs_buf_upl_setup(bp);
                if (error) {
                    printf("nfs_doio: upl create failed %d\n", error);
                    SET(bp->nb_flags, NB_ERROR);
                    bp->nb_error = EIO;
                } else {
                    nfs_buf_upl_check(bp);
                }
            }
        }

        if (ISSET(bp->nb_flags, NB_WASDIRTY)) {
            FSDBG(256, bp, NBOFF(bp), bp->nb_dirty, 0xd00dee);
            /*
             * There are pages marked dirty that need to be written out.
             *
             * We don't want to just combine the write range with the
             * range of pages that are dirty because that could cause us
             * to write data that wasn't actually written to.
             * We also don't want to write data more than once.
             *
             * If the dirty range just needs to be committed, we do that.
             * Otherwise, we write the dirty range and clear the dirty bits
             * for any COMPLETE pages covered by that range.
             * If there are dirty pages left after that, we write out the
             * parts that we haven't written yet.
             */
        }
        /*
         * If NB_NEEDCOMMIT is set, a commit rpc may do the trick. If not
         * an actual write will have to be done.
         * If NB_WRITEINPROG is already set, then push it with a write anyhow.
         */
        if ((bp->nb_flags & (NB_NEEDCOMMIT | NB_WRITEINPROG)) == NB_NEEDCOMMIT) {
            doff = NBOFF(bp) + bp->nb_dirtyoff;
            SET(bp->nb_flags, NB_WRITEINPROG);
            error = nfs_commit(vp, doff, bp->nb_dirtyend - bp->nb_dirtyoff,
                bp->nb_wcred, bp->nb_proc);
            CLR(bp->nb_flags, NB_WRITEINPROG);
            if (!error) {
                bp->nb_dirtyoff = bp->nb_dirtyend = 0;
                CLR(bp->nb_flags, NB_NEEDCOMMIT);
                np->n_needcommitcnt--;
                CHECK_NEEDCOMMITCNT(np);
            } else if (error == NFSERR_STALEWRITEVERF)
                nfs_clearcommit(vp->v_mount);
        }
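        /*
         * NFSERR_STALEWRITEVERF means the server's write verifier changed
         * (typically a server reboot), so previously unstable writes may
         * have been lost; nfs_clearcommit() drops the needcommit state for
         * the whole mount so those blocks get rewritten instead of merely
         * committed.
         */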
        if (!error && bp->nb_dirtyend > 0) {
            /* there's a dirty range that needs to be written out */
            u_int32_t pagemask;
            int firstpg, lastpg;

            if (NBOFF(bp) + bp->nb_dirtyend > np->n_size)
                bp->nb_dirtyend = np->n_size - NBOFF(bp);

            doff = bp->nb_dirtyoff;
            dend = bp->nb_dirtyend;

            /* if doff page is dirty, move doff to start of page */
            if (NBPGDIRTY(bp, doff/PAGE_SIZE))
                doff -= doff & PAGE_MASK;
            /* try to expand write range to include preceding dirty pages */
            if (!(doff & PAGE_MASK))
                while (doff > 0 && NBPGDIRTY(bp, (doff - 1)/PAGE_SIZE))
                    doff -= PAGE_SIZE;
            /* if dend page is dirty, move dend to start of next page */
            if ((dend & PAGE_MASK) && NBPGDIRTY(bp, dend/PAGE_SIZE))
                dend = round_page_32(dend);
            /* try to expand write range to include trailing dirty pages */
            if (!(dend & PAGE_MASK))
                while (dend < bp->nb_bufsize && NBPGDIRTY(bp, dend/PAGE_SIZE))
                    dend += PAGE_SIZE;
            /* make sure to keep dend clipped to EOF */
            if (NBOFF(bp) + dend > np->n_size)
                dend = np->n_size - NBOFF(bp);
            /* calculate range of complete pages being written */
            firstpg = round_page_32(doff) / PAGE_SIZE;
            lastpg = (trunc_page_32(dend) - 1) / PAGE_SIZE;
            /* calculate mask for that page range */
            pagemask = ((1 << (lastpg + 1)) - 1) & ~((1 << firstpg) - 1);
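            /*
             * pagemask now has a bit set for every page lying entirely
             * within [doff, dend).  E.g. assuming 4K pages, doff == 0xc00
             * and dend == 0x2000 give firstpg = 1 and lastpg = 1, so
             * pagemask == 0x2: only page 1 (bytes 0x1000-0x1fff) is
             * completely covered by this write.
             */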
            /* compare page mask to nb_dirty; if there are other dirty pages */
            /* then write FILESYNC; otherwise, write UNSTABLE if async and */
            /* not needcommit/nocache/call; otherwise write FILESYNC */
            if (bp->nb_dirty & ~pagemask)
                iomode = NFSV3WRITE_FILESYNC;
            else if ((bp->nb_flags & (NB_ASYNC | NB_NEEDCOMMIT | NB_NOCACHE | NB_STABLE)) == NB_ASYNC)
                iomode = NFSV3WRITE_UNSTABLE;
            else
                iomode = NFSV3WRITE_FILESYNC;
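            /*
             * NFSV3WRITE_UNSTABLE lets the server reply before the data is
             * on stable storage, on the promise that a commit rpc follows
             * later; NFSV3WRITE_FILESYNC makes the write durable by itself.
             * UNSTABLE is only chosen when this is a plain async write with
             * no other dirty pages forcing a FILESYNC.
             */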
            /* write the dirty range */
            io.iov_len = uiop->uio_resid = dend - doff;
            uiop->uio_offset = NBOFF(bp) + doff;
            io.iov_base = (char *)bp->nb_data + doff;
            uiop->uio_rw = UIO_WRITE;

            nfsstats.write_bios++;

            SET(bp->nb_flags, NB_WRITEINPROG);
            error = nfs_writerpc(vp, uiop, cr, &iomode, &must_commit);
            if (must_commit)
                nfs_clearcommit(vp->v_mount);
            /* clear dirty bits for pages we've written */
            if (!error)
                bp->nb_dirty &= ~pagemask;
            /* set/clear needcommit flag */
            if (!error && iomode == NFSV3WRITE_UNSTABLE) {
                if (!ISSET(bp->nb_flags, NB_NEEDCOMMIT))
                    np->n_needcommitcnt++;
                SET(bp->nb_flags, NB_NEEDCOMMIT);
                /* make sure nb_dirtyoff/nb_dirtyend reflect actual range written */
                bp->nb_dirtyoff = doff;
                bp->nb_dirtyend = dend;
            } else {
                if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
                    np->n_needcommitcnt--;
                    CHECK_NEEDCOMMITCNT(np);
                }
                CLR(bp->nb_flags, NB_NEEDCOMMIT);
            }
            CLR(bp->nb_flags, NB_WRITEINPROG);
            /*
             * For an interrupted write, the buffer is still valid and the write
             * hasn't been pushed to the server yet, so we can't set NB_ERROR and
             * report the interruption by setting NB_EINTR.  For the NB_ASYNC case,
             * NB_EINTR is not relevant.
             *
             * For the case of a V3 write rpc not being committed to stable
             * storage, the block is still dirty and requires either a commit rpc
             * or another write rpc with iomode == NFSV3WRITE_FILESYNC before the
             * block is reused. This is indicated by setting the NB_DELWRI and
             * NB_NEEDCOMMIT flags.
             */
            if (error == EINTR || (!error && bp->nb_flags & NB_NEEDCOMMIT)) {
                CLR(bp->nb_flags, NB_INVAL | NB_NOCACHE);
                if (!ISSET(bp->nb_flags, NB_DELWRI)) {
                    SET(bp->nb_flags, NB_DELWRI);
                    nfs_nbdwrite++;
                    NFSBUFCNTCHK();
                }
                FSDBG(261, bp->nb_validoff, bp->nb_validend,
                    bp->nb_bufsize, 0);
                /*
                 * Since for the NB_ASYNC case, nfs_bwrite() has
                 * reassigned the buffer to the clean list, we have to
                 * reassign it back to the dirty one. Ugh.
                 */
                if (ISSET(bp->nb_flags, NB_ASYNC)) {
                    /* move to dirty list */
                    int s = splbio();
                    if (bp->nb_vnbufs.le_next != NFSNOLIST)
                        LIST_REMOVE(bp, nb_vnbufs);
                    LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
                    splx(s);
                } else {
                    SET(bp->nb_flags, NB_EINTR);
                }
            } else {
                /* either there's an error or we don't need to commit */
                if (error) {
                    SET(bp->nb_flags, NB_ERROR);
                    bp->nb_error = np->n_error = error;
                    np->n_flag |= NWRITEERR;
                }
                /* clear the dirty range */
                bp->nb_dirtyoff = bp->nb_dirtyend = 0;
            }
        }
        if (!error && bp->nb_dirty) {
            /* there are pages marked dirty that need to be written out */
            int pg, cnt, npages, off, len;

            nfsstats.write_bios++;

            /*
             * we do these writes synchronously because we can't really
             * support the unstable/needcommit method.  We could write
             * them unstable, clear the dirty bits, and then commit the
             * whole block later, but if we need to rewrite the data, we
             * won't have any idea which pages were written because that
             * info can't be stored in the nb_dirtyoff/nb_dirtyend.  We
             * also can't leave the dirty bits set because then we wouldn't
             * be able to tell if the pages were re-dirtied between the end
             * of the write and the commit.
             */
            iomode = NFSV3WRITE_FILESYNC;
            uiop->uio_rw = UIO_WRITE;

            SET(bp->nb_flags, NB_WRITEINPROG);
            npages = bp->nb_bufsize/PAGE_SIZE;
            for (pg = 0; pg < npages; pg++) {
                if (!NBPGDIRTY(bp, pg))
                    continue;
                cnt = 1;
                while (((pg + cnt) < npages) && NBPGDIRTY(bp, pg + cnt))
                    cnt++;
                /* write cnt pages starting with page pg */
                off = pg * PAGE_SIZE;
                len = cnt * PAGE_SIZE;

                /* clip writes to EOF */
                if (NBOFF(bp) + off + len > np->n_size)
                    len -= (NBOFF(bp) + off + len) - np->n_size;
                if (len > 0) {
                    io.iov_len = uiop->uio_resid = len;
                    uiop->uio_offset = NBOFF(bp) + off;
                    io.iov_base = (char *)bp->nb_data + off;
                    error = nfs_writerpc(vp, uiop, cr, &iomode, &must_commit);
                    if (must_commit)
                        nfs_clearcommit(vp->v_mount);
                    if (error)
                        break;
                }
                /* clear dirty bits */
                while (cnt--) {
                    bp->nb_dirty &= ~(1 << pg);
                    /* leave pg on last page */
                    if (cnt)
                        pg++;
                }
            }
            if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
                np->n_needcommitcnt--;
                CHECK_NEEDCOMMITCNT(np);
            }
            CLR(bp->nb_flags, NB_NEEDCOMMIT);

            CLR(bp->nb_flags, NB_WRITEINPROG);
            FSDBG_BOT(256, bp->nb_validoff, bp->nb_validend, bp->nb_bufsize,
                np->n_size);
        }

        if (error) {
            SET(bp->nb_flags, NB_ERROR);
            bp->nb_error = error;
        }
    }

    FSDBG_BOT(256, bp->nb_validoff, bp->nb_validend, bp->nb_bufsize, error);