/*
 * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)nfs_bio.c	8.9 (Berkeley) 3/30/95
 * FreeBSD-Id: nfs_bio.c,v 1.44 1997/09/10 19:52:25 phk Exp $
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/proc_internal.h>
#include <sys/kauth.h>
#include <sys/malloc.h>
#include <sys/vnode.h>
#include <sys/dirent.h>
#include <sys/mount_internal.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/ubc_internal.h>
#include <sys/uio_internal.h>

#include <sys/vmparam.h>

#include <sys/time.h>
#include <kern/clock.h>
#include <libkern/OSAtomic.h>
#include <kern/kalloc.h>

#include <nfs/rpcv2.h>
#include <nfs/nfsproto.h>
#include <nfs/nfs.h>
#include <nfs/nfsmount.h>
#include <nfs/nfsnode.h>
#include <sys/buf_internal.h>

#include <sys/kdebug.h>
#define FSDBG(A, B, C, D, E) \
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_NONE, \
		(int)(B), (int)(C), (int)(D), (int)(E), 0)
#define FSDBG_TOP(A, B, C, D, E) \
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_START, \
		(int)(B), (int)(C), (int)(D), (int)(E), 0)
#define FSDBG_BOT(A, B, C, D, E) \
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_END, \
		(int)(B), (int)(C), (int)(D), (int)(E), 0)
extern int nfs_numasync;
extern int nfs_ioddelwri;
extern struct nfsstats nfsstats;

#define NFSBUFHASH(np, lbn)	\
	(&nfsbufhashtbl[((long)(np) / sizeof(*(np)) + (int)(lbn)) & nfsbufhash])
LIST_HEAD(nfsbufhashhead, nfsbuf) *nfsbufhashtbl;
struct nfsbuffreehead nfsbuffree, nfsbuffreemeta, nfsbufdelwri;
u_long nfsbufhash;
int nfsbufcnt, nfsbufmin, nfsbufmax, nfsbufmetacnt, nfsbufmetamax;
int nfsbuffreecnt, nfsbuffreemetacnt, nfsbufdelwricnt, nfsneedbuffer;
int nfs_nbdwrite;
time_t nfsbuffreeuptimestamp;

lck_grp_t *nfs_buf_lck_grp;
lck_grp_attr_t *nfs_buf_lck_grp_attr;
lck_attr_t *nfs_buf_lck_attr;
lck_mtx_t *nfs_buf_mutex;

#define NFSBUFWRITE_THROTTLE	9
#define NFSBUF_LRU_STALE	120
#define NFSBUF_META_STALE	240

/* number of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffree list */
#define LRU_TO_FREEUP	6
/* number of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffreemeta list */
#define META_TO_FREEUP	3
/* total number of nfsbufs nfs_buf_freeup() should attempt to free */
#define TOTAL_TO_FREEUP	(LRU_TO_FREEUP+META_TO_FREEUP)
/* fraction of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffree list when called from nfs_timer() */
#define LRU_FREEUP_FRAC_ON_TIMER	8
/* fraction of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffreemeta list when called from nfs_timer() */
#define META_FREEUP_FRAC_ON_TIMER	16
/* fraction of total nfsbufs that nfsbuffreecnt should exceed before bothering to call nfs_buf_freeup() */
#define LRU_FREEUP_MIN_FRAC	4
/* fraction of total nfsbufs that nfsbuffreemetacnt should exceed before bothering to call nfs_buf_freeup() */
#define META_FREEUP_MIN_FRAC	2

#define NFS_BUF_FREEUP() \
	do { \
		/* only call nfs_buf_freeup() if it has work to do: */ \
		if (((nfsbuffreecnt > nfsbufcnt/LRU_FREEUP_MIN_FRAC) || \
		     (nfsbuffreemetacnt > nfsbufcnt/META_FREEUP_MIN_FRAC)) && \
		    ((nfsbufcnt - TOTAL_TO_FREEUP) > nfsbufmin)) \
			nfs_buf_freeup(0); \
	} while (0)
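
/*
 * Note: nfsbufs are kept on a hash table keyed by (nfsnode, logical block
 * number) and, when not in use, on one of three free lists: nfsbuffree
 * (regular data buffers, LRU order), nfsbuffreemeta (metadata buffers),
 * and nfsbufdelwri (delayed writes waiting to be pushed out).  The
 * *_STALE and *_FREEUP constants above tune how aggressively
 * nfs_buf_freeup() trims the free lists.
 */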
/*
 * Initialize nfsbuf lists
 */
void
nfs_nbinit(void)
{
	nfs_buf_lck_grp_attr = lck_grp_attr_alloc_init();
	nfs_buf_lck_grp = lck_grp_alloc_init("nfs_buf", nfs_buf_lck_grp_attr);

	nfs_buf_lck_attr = lck_attr_alloc_init();

	nfs_buf_mutex = lck_mtx_alloc_init(nfs_buf_lck_grp, nfs_buf_lck_attr);

	nfsbufcnt = nfsbufmetacnt =
	nfsbuffreecnt = nfsbuffreemetacnt = nfsbufdelwricnt = 0;
	nfsbufmax = (sane_size >> PAGE_SHIFT) / 4;
	nfsbufmetamax = (sane_size >> PAGE_SHIFT) / 16;
	nfsbuffreeuptimestamp = 0;

	nfsbufhashtbl = hashinit(nfsbufmax/4, M_TEMP, &nfsbufhash);
	TAILQ_INIT(&nfsbuffree);
	TAILQ_INIT(&nfsbuffreemeta);
	TAILQ_INIT(&nfsbufdelwri);
}
/*
 * try to free up some excess, unused nfsbufs
 */
void
nfs_buf_freeup(int timer)
{
	struct nfsbuf *fbp;
	struct timeval now;
	int count;
	struct nfsbuffreehead nfsbuffreeup;

	TAILQ_INIT(&nfsbuffreeup);

	lck_mtx_lock(nfs_buf_mutex);

	microuptime(&now);
	nfsbuffreeuptimestamp = now.tv_sec;

	FSDBG(320, nfsbufcnt, nfsbuffreecnt, nfsbuffreemetacnt, 0);

	count = timer ? nfsbuffreecnt/LRU_FREEUP_FRAC_ON_TIMER : LRU_TO_FREEUP;
	while ((nfsbufcnt > nfsbufmin) && (count-- > 0)) {
		fbp = TAILQ_FIRST(&nfsbuffree);
		if (!fbp)
			break;
		if (NBUFSTAMPVALID(fbp) &&
		    (fbp->nb_timestamp + (2*NFSBUF_LRU_STALE)) > now.tv_sec)
			break;
		nfs_buf_remfree(fbp);
		/* disassociate buffer from any vnode */
		if (fbp->nb_vnbufs.le_next != NFSNOLIST) {
			LIST_REMOVE(fbp, nb_vnbufs);
			fbp->nb_vnbufs.le_next = NFSNOLIST;
		}
		LIST_REMOVE(fbp, nb_hash);
		TAILQ_INSERT_TAIL(&nfsbuffreeup, fbp, nb_free);
		nfsbufcnt--;
	}

	count = timer ? nfsbuffreemetacnt/META_FREEUP_FRAC_ON_TIMER : META_TO_FREEUP;
	while ((nfsbufcnt > nfsbufmin) && (count-- > 0)) {
		fbp = TAILQ_FIRST(&nfsbuffreemeta);
		if (!fbp)
			break;
		if (NBUFSTAMPVALID(fbp) &&
		    (fbp->nb_timestamp + (2*NFSBUF_META_STALE)) > now.tv_sec)
			break;
		nfs_buf_remfree(fbp);
		/* disassociate buffer from any vnode */
		if (fbp->nb_vnbufs.le_next != NFSNOLIST) {
			LIST_REMOVE(fbp, nb_vnbufs);
			fbp->nb_vnbufs.le_next = NFSNOLIST;
		}
		LIST_REMOVE(fbp, nb_hash);
		TAILQ_INSERT_TAIL(&nfsbuffreeup, fbp, nb_free);
		nfsbufcnt--;
		nfsbufmetacnt--;
	}

	FSDBG(320, nfsbufcnt, nfsbuffreecnt, nfsbuffreemetacnt, 0);

	lck_mtx_unlock(nfs_buf_mutex);

	while ((fbp = TAILQ_FIRST(&nfsbuffreeup))) {
		TAILQ_REMOVE(&nfsbuffreeup, fbp, nb_free);
		/* nuke any creds */
		if (IS_VALID_CRED(fbp->nb_rcred)) {
			kauth_cred_unref(&fbp->nb_rcred);
		}
		if (IS_VALID_CRED(fbp->nb_wcred)) {
			kauth_cred_unref(&fbp->nb_wcred);
		}
		/* if buf was NB_META, dump buffer */
		if (ISSET(fbp->nb_flags, NB_META) && fbp->nb_data)
			kfree(fbp->nb_data, fbp->nb_bufsize);
		FREE(fbp, M_TEMP);
	}
}
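
/*
 * Note: nfs_buf_freeup() collects its victims on a local list while holding
 * nfs_buf_mutex and only releases their credentials and data after the
 * mutex has been dropped, so the freeing work doesn't block other users
 * of the buffer lists.
 */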
/*
 * remove a buffer from the freelist
 * (must be called with nfs_buf_mutex held)
 */
void
nfs_buf_remfree(struct nfsbuf *bp)
{
	if (bp->nb_free.tqe_next == NFSNOLIST)
		panic("nfsbuf not on free list");
	if (ISSET(bp->nb_flags, NB_DELWRI)) {
		nfsbufdelwricnt--;
		TAILQ_REMOVE(&nfsbufdelwri, bp, nb_free);
	} else if (ISSET(bp->nb_flags, NB_META)) {
		nfsbuffreemetacnt--;
		TAILQ_REMOVE(&nfsbuffreemeta, bp, nb_free);
	} else {
		nfsbuffreecnt--;
		TAILQ_REMOVE(&nfsbuffree, bp, nb_free);
	}
	bp->nb_free.tqe_next = NFSNOLIST;
}
/*
 * check for existence of nfsbuf in cache
 */
boolean_t
nfs_buf_is_incore(vnode_t vp, daddr64_t blkno)
{
	boolean_t rv;

	lck_mtx_lock(nfs_buf_mutex);
	if (nfs_buf_incore(vp, blkno))
		rv = TRUE;
	else
		rv = FALSE;
	lck_mtx_unlock(nfs_buf_mutex);
	return (rv);
}

/*
 * return incore buffer (must be called with nfs_buf_mutex held)
 */
struct nfsbuf *
nfs_buf_incore(vnode_t vp, daddr64_t blkno)
{
	/* Search hash chain */
	struct nfsbuf * bp = NFSBUFHASH(VTONFS(vp), blkno)->lh_first;
	for (; bp != NULL; bp = bp->nb_hash.le_next)
		if (bp->nb_lblkno == blkno && bp->nb_vp == vp) {
			if (!ISSET(bp->nb_flags, NB_INVAL)) {
				FSDBG(547, bp, blkno, bp->nb_flags, bp->nb_vp);
				return (bp);
			}
		}
	return (NULL);
}
/*
 * Check if it's OK to drop a page.
 *
 * Called by vnode_pager() on pageout request of non-dirty page.
 * We need to make sure that it's not part of a delayed write.
 * If it is, we can't let the VM drop it because we may need it
 * later when/if we need to write the data (again).
 */
int
nfs_buf_page_inval(vnode_t vp, off_t offset)
{
	struct nfsbuf *bp;
	int error = 0;

	lck_mtx_lock(nfs_buf_mutex);
	bp = nfs_buf_incore(vp, ubc_offtoblk(vp, offset));
	if (!bp)
		goto out;
	FSDBG(325, bp, bp->nb_flags, bp->nb_dirtyoff, bp->nb_dirtyend);
	if (ISSET(bp->nb_lflags, NBL_BUSY)) {
		error = EBUSY;
		goto out;
	}
	/*
	 * If there's a dirty range in the buffer, check to
	 * see if this page intersects with the dirty range.
	 * If it does, we can't let the pager drop the page.
	 */
	if (bp->nb_dirtyend > 0) {
		int start = offset - NBOFF(bp);
		if (bp->nb_dirtyend <= start ||
		    bp->nb_dirtyoff >= (start + PAGE_SIZE))
			error = 0;
		else
			error = EBUSY;
	}
out:
	lck_mtx_unlock(nfs_buf_mutex);
	return (error);
}
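
/*
 * Note: a page that overlaps a buffer's delayed-write (dirty) region is
 * reported as busy above, since the data it holds hasn't been written to
 * the server yet and may still be needed to service that write.
 */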
/*
 * set up the UPL for a buffer
 * (must NOT be called with nfs_buf_mutex held)
 */
int
nfs_buf_upl_setup(struct nfsbuf *bp)
{
	kern_return_t kret;
	upl_t upl;
	int upl_flags;

	if (ISSET(bp->nb_flags, NB_PAGELIST))
		return (0);

	upl_flags = UPL_PRECIOUS;
	if (! ISSET(bp->nb_flags, NB_READ)) {
		/*
		 * We're doing a "write", so we intend to modify
		 * the pages we're gathering.
		 */
		upl_flags |= UPL_WILL_MODIFY;
	}
	kret = ubc_create_upl(bp->nb_vp, NBOFF(bp), bp->nb_bufsize,
				&upl, NULL, upl_flags);
	if (kret == KERN_INVALID_ARGUMENT) {
		/* vm object probably doesn't exist any more */
		bp->nb_pagelist = NULL;
		return (EINVAL);
	}
	if (kret != KERN_SUCCESS) {
		printf("nfs_buf_upl_setup(): failed to get pagelist %d\n", kret);
		bp->nb_pagelist = NULL;
		return (EIO);
	}

	FSDBG(538, bp, NBOFF(bp), bp->nb_bufsize, bp->nb_vp);

	bp->nb_pagelist = upl;
	SET(bp->nb_flags, NB_PAGELIST);
	return (0);
}
/*
 * update buffer's valid/dirty info from UBC
 * (must NOT be called with nfs_buf_mutex held)
 */
void
nfs_buf_upl_check(struct nfsbuf *bp)
{
	upl_page_info_t *pl;
	off_t filesize, fileoffset;
	int i, npages;

	if (!ISSET(bp->nb_flags, NB_PAGELIST))
		return;

	npages = round_page_32(bp->nb_bufsize) / PAGE_SIZE;
	filesize = ubc_getsize(bp->nb_vp);
	fileoffset = NBOFF(bp);
	if (fileoffset < filesize)
		SET(bp->nb_flags, NB_CACHE);
	else
		CLR(bp->nb_flags, NB_CACHE);

	pl = ubc_upl_pageinfo(bp->nb_pagelist);
	bp->nb_valid = bp->nb_dirty = 0;

	for (i=0; i < npages; i++, fileoffset += PAGE_SIZE_64) {
		/* anything beyond the end of the file is not valid or dirty */
		if (fileoffset >= filesize)
			break;
		if (!upl_valid_page(pl, i)) {
			CLR(bp->nb_flags, NB_CACHE);
			continue;
		}
		NBPGVALID_SET(bp,i);
		if (upl_dirty_page(pl, i)) {
			NBPGDIRTY_SET(bp, i);
			if (!ISSET(bp->nb_flags, NB_WASDIRTY))
				SET(bp->nb_flags, NB_WASDIRTY);
		}
	}
	fileoffset = NBOFF(bp);
	if (ISSET(bp->nb_flags, NB_CACHE)) {
		bp->nb_validoff = 0;
		bp->nb_validend = bp->nb_bufsize;
		if (fileoffset + bp->nb_validend > filesize)
			bp->nb_validend = filesize - fileoffset;
	} else {
		bp->nb_validoff = bp->nb_validend = -1;
	}
	FSDBG(539, bp, fileoffset, bp->nb_valid, bp->nb_dirty);
	FSDBG(539, bp->nb_validoff, bp->nb_validend, bp->nb_dirtyoff, bp->nb_dirtyend);
}
/*
 * make sure that a buffer is mapped
 * (must NOT be called with nfs_buf_mutex held)
 */
int
nfs_buf_map(struct nfsbuf *bp)
{
	kern_return_t kret;

	if (bp->nb_data)
		return (0);
	if (!ISSET(bp->nb_flags, NB_PAGELIST))
		return (EINVAL);

	kret = ubc_upl_map(bp->nb_pagelist, (vm_address_t *)&(bp->nb_data));
	if (kret != KERN_SUCCESS)
		panic("nfs_buf_map: ubc_upl_map() failed with (%d)", kret);
	if (bp->nb_data == 0)
		panic("ubc_upl_map mapped 0");
	FSDBG(540, bp, bp->nb_flags, NBOFF(bp), bp->nb_data);
	return (0);
}
/*
 * check range of pages in nfsbuf's UPL for validity
 */
int
nfs_buf_upl_valid_range(struct nfsbuf *bp, int off, int size)
{
	off_t fileoffset, filesize;
	int pg, lastpg;
	upl_page_info_t *pl;

	if (!ISSET(bp->nb_flags, NB_PAGELIST))
		return (0);
	pl = ubc_upl_pageinfo(bp->nb_pagelist);

	size += off & PAGE_MASK;
	off &= ~PAGE_MASK;
	fileoffset = NBOFF(bp);
	filesize = VTONFS(bp->nb_vp)->n_size;
	if ((fileoffset + off + size) > filesize)
		size = filesize - (fileoffset + off);

	pg = off/PAGE_SIZE;
	lastpg = (off + size - 1)/PAGE_SIZE;
	while (pg <= lastpg) {
		if (!upl_valid_page(pl, pg))
			return (0);
		pg++;
	}
	return (1);
}
/*
 * normalize an nfsbuf's valid range
 *
 * the read/write code guarantees that we'll always have a valid
 * region that is an integral number of pages.  If either end
 * of the valid range isn't page-aligned, it gets corrected
 * here as we extend the valid range through all of the
 * contiguous valid pages.
 */
void
nfs_buf_normalize_valid_range(struct nfsnode *np, struct nfsbuf *bp)
{
	int pg, npg;

	/* pull validoff back to start of contiguous valid page range */
	pg = bp->nb_validoff/PAGE_SIZE;
	while (pg >= 0 && NBPGVALID(bp,pg))
		pg--;
	bp->nb_validoff = (pg+1) * PAGE_SIZE;
	/* push validend forward to end of contiguous valid page range */
	npg = bp->nb_bufsize/PAGE_SIZE;
	pg = bp->nb_validend/PAGE_SIZE;
	while (pg < npg && NBPGVALID(bp,pg))
		pg++;
	bp->nb_validend = pg * PAGE_SIZE;

	if (NBOFF(bp) + bp->nb_validend > (off_t)np->n_size)
		bp->nb_validend = np->n_size % bp->nb_bufsize;
}
/*
 * try to push out some delayed/uncommitted writes
 * ("locked" indicates whether nfs_buf_mutex is already held)
 */
void
nfs_buf_delwri_push(int locked)
{
	struct nfsbuf *bp;
	int i, error;

	if (TAILQ_EMPTY(&nfsbufdelwri))
		return;

	/* first try to tell the nfsiods to do it */
	if (nfs_asyncio(NULL, NULL) == 0)
		return;

	/* otherwise, try to do some of the work ourselves */
	i = 0;
	if (!locked)
		lck_mtx_lock(nfs_buf_mutex);
	while (i < 8 && (bp = TAILQ_FIRST(&nfsbufdelwri)) != NULL) {
		struct nfsnode *np = VTONFS(bp->nb_vp);
		nfs_buf_remfree(bp);
		while ((error = nfs_buf_acquire(bp, 0, 0, 0)) == EAGAIN);
		if (error)
			break;
		if (!bp->nb_vp) {
			/* buffer is no longer valid */
			nfs_buf_drop(bp);
			continue;
		}
		if (ISSET(bp->nb_flags, NB_NEEDCOMMIT))
			nfs_buf_check_write_verifier(np, bp);
		if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
			/* put buffer at end of delwri list */
			TAILQ_INSERT_TAIL(&nfsbufdelwri, bp, nb_free);
			nfsbufdelwricnt++;
			nfs_buf_drop(bp);
			lck_mtx_unlock(nfs_buf_mutex);
			nfs_flushcommits(np->n_vnode, NULL, 1);
		} else {
			SET(bp->nb_flags, NB_ASYNC);
			lck_mtx_unlock(nfs_buf_mutex);
			nfs_buf_write(bp);
		}
		i++;
		lck_mtx_lock(nfs_buf_mutex);
	}
	if (!locked)
		lck_mtx_unlock(nfs_buf_mutex);
}
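
/*
 * Note: delayed writes are normally handed to the nfsiod threads via
 * nfs_asyncio(); the loop above only writes (or commits) a few buffers
 * directly when no async helpers are available.
 */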
/*
 * Returns errno on error, 0 otherwise.
 * Any buffer is returned in *bpp.
 *
 * If NBLK_ONLYVALID is set, only return buffer if found in cache.
 * If NBLK_NOWAIT is set, don't wait for the buffer if it's marked BUSY.
 *
 * Check for existence of buffer in cache.
 * Or attempt to reuse a buffer from one of the free lists.
 * Or allocate a new buffer if we haven't already hit max allocation.
 * Or wait for a free buffer.
 *
 * If available buffer found, prepare it, and return it.
 *
 * If the calling process is interrupted by a signal for
 * an interruptible mount point, return EINTR.
 */
int
nfs_buf_get(
	vnode_t vp,
	daddr64_t blkno,
	int size,
	proc_t p,
	int flags,
	struct nfsbuf **bpp)
{
	struct nfsnode *np = VTONFS(vp);
	struct nfsmount *nmp = VFSTONFS(vnode_mount(vp));
	struct nfsbuf *bp;
	int biosize, bufsize;
	int slpflag = PCATCH;
	int operation = (flags & NBLK_OPMASK);
	int error = 0;
	struct timespec ts;

	FSDBG_TOP(541, vp, blkno, size, flags);
	*bpp = NULL;

	bufsize = size;
	if (bufsize > NFS_MAXBSIZE)
		panic("nfs_buf_get: buffer larger than NFS_MAXBSIZE requested");

	if (!nmp) {
		FSDBG_BOT(541, vp, blkno, 0, ENXIO);
		return (ENXIO);
	}
	biosize = nmp->nm_biosize;

	if (UBCINVALID(vp) || !UBCINFOEXISTS(vp)) {
		operation = NBLK_META;
	} else if (bufsize < biosize) {
		/* reg files should always have biosize blocks */
		bufsize = biosize;
	}

	/* if NBLK_WRITE, check for too many delayed/uncommitted writes */
	if ((operation == NBLK_WRITE) && (nfs_nbdwrite > ((nfsbufcnt*3)/4))) {
		FSDBG_TOP(542, vp, blkno, nfs_nbdwrite, ((nfsbufcnt*3)/4));

		/* poke the delwri list */
		nfs_buf_delwri_push(0);

		/* sleep to let other threads run... */
		tsleep(&nfs_nbdwrite, PCATCH, "nfs_nbdwrite", 1);
		FSDBG_BOT(542, vp, blkno, nfs_nbdwrite, ((nfsbufcnt*3)/4));
	}

loop:
	lck_mtx_lock(nfs_buf_mutex);

	/* check for existence of nfsbuf in cache */
	if ((bp = nfs_buf_incore(vp, blkno))) {
		/* if busy, set wanted and wait */
		if (ISSET(bp->nb_lflags, NBL_BUSY)) {
			if (flags & NBLK_NOWAIT) {
				lck_mtx_unlock(nfs_buf_mutex);
				FSDBG_BOT(541, vp, blkno, bp, 0xbcbcbcbc);
				return (0);
			}
			FSDBG_TOP(543, vp, blkno, bp, bp->nb_flags);
			SET(bp->nb_lflags, NBL_WANTED);

			ts.tv_sec = 2;
			ts.tv_nsec = 0;
			msleep(bp, nfs_buf_mutex, slpflag|(PRIBIO+1)|PDROP,
				"nfsbufget", (slpflag == PCATCH) ? 0 : &ts);
			slpflag = 0;
			FSDBG_BOT(543, vp, blkno, bp, bp->nb_flags);
			if ((error = nfs_sigintr(VFSTONFS(vnode_mount(vp)), NULL, p))) {
				FSDBG_BOT(541, vp, blkno, 0, error);
				return (error);
			}
			goto loop;
		}
		if (bp->nb_bufsize != bufsize)
			panic("nfsbuf size mismatch");
		SET(bp->nb_lflags, NBL_BUSY);
		SET(bp->nb_flags, NB_CACHE);
		nfs_buf_remfree(bp);
		/* additional paranoia: */
		if (ISSET(bp->nb_flags, NB_PAGELIST))
			panic("pagelist buffer was not busy");
		goto buffer_setup;
	}

	if (flags & NBLK_ONLYVALID) {
		lck_mtx_unlock(nfs_buf_mutex);
		FSDBG_BOT(541, vp, blkno, 0, 0x0000cace);
		return (0);
	}

	/*
	 * where to get a free buffer:
	 * - if meta and maxmeta reached, must reuse meta
	 * - alloc new if we haven't reached min bufs
	 * - if free lists are NOT empty
	 *   - if free list is stale, use it
	 *   - else if freemeta list is stale, use it
	 *   - else if max bufs allocated, use least-time-to-stale
	 * - alloc new if we haven't reached max allowed
	 * - start clearing out delwri list and try again
	 */

	if ((operation == NBLK_META) && (nfsbufmetacnt >= nfsbufmetamax)) {
		/* if we've hit max meta buffers, must reuse a meta buffer */
		bp = TAILQ_FIRST(&nfsbuffreemeta);
	} else if ((nfsbufcnt > nfsbufmin) &&
	    (!TAILQ_EMPTY(&nfsbuffree) || !TAILQ_EMPTY(&nfsbuffreemeta))) {
		/* try to pull an nfsbuf off a free list */
		struct nfsbuf *lrubp, *metabp;
		struct timeval now;
		microuptime(&now);

		/* if the next LRU or META buffer is invalid or stale, use it */
		lrubp = TAILQ_FIRST(&nfsbuffree);
		if (lrubp && (!NBUFSTAMPVALID(lrubp) ||
		    ((lrubp->nb_timestamp + NFSBUF_LRU_STALE) < now.tv_sec)))
			bp = lrubp;
		metabp = TAILQ_FIRST(&nfsbuffreemeta);
		if (!bp && metabp && (!NBUFSTAMPVALID(metabp) ||
		    ((metabp->nb_timestamp + NFSBUF_META_STALE) < now.tv_sec)))
			bp = metabp;

		if (!bp && (nfsbufcnt >= nfsbufmax)) {
			/* we've already allocated all bufs, so */
			/* choose the buffer that'll go stale first */
			if (!lrubp)
				bp = metabp;
			else if (!metabp)
				bp = lrubp;
			else {
				int32_t lru_stale_time, meta_stale_time;
				lru_stale_time = lrubp->nb_timestamp + NFSBUF_LRU_STALE;
				meta_stale_time = metabp->nb_timestamp + NFSBUF_META_STALE;
				if (lru_stale_time <= meta_stale_time)
					bp = lrubp;
				else
					bp = metabp;
			}
		}
	}

	if (bp) {
		/* we have a buffer to reuse */
		FSDBG(544, vp, blkno, bp, bp->nb_flags);
		nfs_buf_remfree(bp);
		if (ISSET(bp->nb_flags, NB_DELWRI))
			panic("nfs_buf_get: delwri");
		SET(bp->nb_lflags, NBL_BUSY);
		/* disassociate buffer from previous vnode */
		if (bp->nb_vnbufs.le_next != NFSNOLIST) {
			LIST_REMOVE(bp, nb_vnbufs);
			bp->nb_vnbufs.le_next = NFSNOLIST;
		}
		LIST_REMOVE(bp, nb_hash);
		/* nuke any creds we're holding */
		if (IS_VALID_CRED(bp->nb_rcred)) {
			kauth_cred_unref(&bp->nb_rcred);
		}
		if (IS_VALID_CRED(bp->nb_wcred)) {
			kauth_cred_unref(&bp->nb_wcred);
		}
		/* if buf will no longer be NB_META, dump old buffer */
		if (operation == NBLK_META) {
			if (!ISSET(bp->nb_flags, NB_META))
				nfsbufmetacnt++;
		} else if (ISSET(bp->nb_flags, NB_META)) {
			if (bp->nb_data)
				kfree(bp->nb_data, bp->nb_bufsize);
			bp->nb_data = NULL;
			nfsbufmetacnt--;
		}
		/* re-init buf fields */
		bp->nb_validoff = bp->nb_validend = -1;
		bp->nb_dirtyoff = bp->nb_dirtyend = 0;
	} else {
		/* no buffer to reuse */
		if ((nfsbufcnt < nfsbufmax) &&
		    ((operation != NBLK_META) || (nfsbufmetacnt < nfsbufmetamax))) {
			/* just alloc a new one */
			MALLOC(bp, struct nfsbuf *, sizeof(struct nfsbuf), M_TEMP, M_WAITOK);
			if (!bp) {
				lck_mtx_unlock(nfs_buf_mutex);
				FSDBG_BOT(541, vp, blkno, 0, error);
				return (ENOMEM);
			}
			nfsbufcnt++;
			if (operation == NBLK_META)
				nfsbufmetacnt++;
			/* init nfsbuf */
			bzero(bp, sizeof(*bp));
			bp->nb_free.tqe_next = NFSNOLIST;
			bp->nb_validoff = bp->nb_validend = -1;
			FSDBG(545, vp, blkno, bp, 0);
		} else {
			/* too many bufs... wait for buffers to free up */
			FSDBG_TOP(546, vp, blkno, nfsbufcnt, nfsbufmax);

			/* poke the delwri list */
			nfs_buf_delwri_push(1);

			nfsneedbuffer = 1;
			msleep(&nfsneedbuffer, nfs_buf_mutex, PCATCH|PDROP,
				"nfsbufget", 0);
			FSDBG_BOT(546, vp, blkno, nfsbufcnt, nfsbufmax);
			if ((error = nfs_sigintr(VFSTONFS(vnode_mount(vp)), NULL, p))) {
				FSDBG_BOT(541, vp, blkno, 0, error);
				return (error);
			}
			goto loop;
		}
	}

	/* setup nfsbuf */
	bp->nb_lflags = NBL_BUSY;
	bp->nb_flags = 0;
	bp->nb_lblkno = blkno;
	/* insert buf in hash */
	LIST_INSERT_HEAD(NFSBUFHASH(np, blkno), bp, nb_hash);
	/* associate buffer with new vnode */
	bp->nb_vp = vp;
	LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);

buffer_setup:

	lck_mtx_unlock(nfs_buf_mutex);

	switch (operation) {
	case NBLK_META:
		SET(bp->nb_flags, NB_META);
		if ((bp->nb_bufsize != bufsize) && bp->nb_data) {
			kfree(bp->nb_data, bp->nb_bufsize);
			bp->nb_data = NULL;
			bp->nb_validoff = bp->nb_validend = -1;
			bp->nb_dirtyoff = bp->nb_dirtyend = 0;
			CLR(bp->nb_flags, NB_CACHE);
		}
		if (!bp->nb_data)
			bp->nb_data = kalloc(bufsize);
		if (!bp->nb_data) {
			/* Ack! couldn't allocate the data buffer! */
			/* cleanup buffer and return error */
			lck_mtx_lock(nfs_buf_mutex);
			LIST_REMOVE(bp, nb_vnbufs);
			bp->nb_vnbufs.le_next = NFSNOLIST;
			bp->nb_vp = NULL;
			/* invalidate usage timestamp to allow immediate freeing */
			NBUFSTAMPINVALIDATE(bp);
			if (bp->nb_free.tqe_next != NFSNOLIST)
				panic("nfsbuf on freelist");
			TAILQ_INSERT_HEAD(&nfsbuffree, bp, nb_free);
			nfsbuffreecnt++;
			lck_mtx_unlock(nfs_buf_mutex);
			FSDBG_BOT(541, vp, blkno, 0xb00, ENOMEM);
			return (ENOMEM);
		}
		bp->nb_bufsize = bufsize;
		break;

	case NBLK_READ:
	case NBLK_WRITE:
		/*
		 * Set or clear NB_READ now to let the UPL subsystem know
		 * if we intend to modify the pages or not.
		 */
		if (operation == NBLK_READ) {
			SET(bp->nb_flags, NB_READ);
		} else {
			CLR(bp->nb_flags, NB_READ);
		}
		if (bufsize < PAGE_SIZE)
			bufsize = PAGE_SIZE;
		bp->nb_bufsize = bufsize;
		bp->nb_validoff = bp->nb_validend = -1;

		if (UBCINFOEXISTS(vp)) {
			/* setup upl */
			if (nfs_buf_upl_setup(bp)) {
				/* unable to create upl */
				/* vm object must no longer exist */
				/* cleanup buffer and return error */
				lck_mtx_lock(nfs_buf_mutex);
				LIST_REMOVE(bp, nb_vnbufs);
				bp->nb_vnbufs.le_next = NFSNOLIST;
				bp->nb_vp = NULL;
				/* invalidate usage timestamp to allow immediate freeing */
				NBUFSTAMPINVALIDATE(bp);
				if (bp->nb_free.tqe_next != NFSNOLIST)
					panic("nfsbuf on freelist");
				TAILQ_INSERT_HEAD(&nfsbuffree, bp, nb_free);
				nfsbuffreecnt++;
				lck_mtx_unlock(nfs_buf_mutex);
				FSDBG_BOT(541, vp, blkno, 0x2bc, EIO);
				return (EIO);
			}
			nfs_buf_upl_check(bp);
		}
		break;

	default:
		panic("nfs_buf_get: %d unknown operation", operation);
	}

	*bpp = bp;

	FSDBG_BOT(541, vp, blkno, bp, bp->nb_flags);

	return (0);
}
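
/*
 * Note: a buffer returned by nfs_buf_get() is marked NBL_BUSY and must
 * eventually be released with nfs_buf_release() (or completed via
 * nfs_buf_iodone() for async I/O) so that waiters and the free lists
 * are updated.
 */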
void
nfs_buf_release(struct nfsbuf *bp, int freeup)
{
	vnode_t vp = bp->nb_vp;
	struct timeval now;
	int wakeup_needbuffer, wakeup_buffer, wakeup_nbdwrite;

	FSDBG_TOP(548, bp, NBOFF(bp), bp->nb_flags, bp->nb_data);
	FSDBG(548, bp->nb_validoff, bp->nb_validend, bp->nb_dirtyoff, bp->nb_dirtyend);
	FSDBG(548, bp->nb_valid, 0, bp->nb_dirty, 0);

	if (UBCINFOEXISTS(vp) && bp->nb_bufsize) {
		int upl_flags;
		upl_t upl;
		int i, rv;

		if (!ISSET(bp->nb_flags, NB_PAGELIST) && !ISSET(bp->nb_flags, NB_INVAL)) {
			rv = nfs_buf_upl_setup(bp);
			if (rv)
				printf("nfs_buf_release: upl create failed %d\n", rv);
			else
				nfs_buf_upl_check(bp);
		}
		upl = bp->nb_pagelist;
		if (!upl)
			goto pagelist_cleanup_done;
		if (bp->nb_data) {
			if (ubc_upl_unmap(upl) != KERN_SUCCESS)
				panic("ubc_upl_unmap failed");
			bp->nb_data = NULL;
		}
		/* abort pages if error, invalid, or non-needcommit nocache */
		if ((bp->nb_flags & (NB_ERROR | NB_INVAL)) ||
		    ((bp->nb_flags & NB_NOCACHE) && !(bp->nb_flags & (NB_NEEDCOMMIT | NB_DELWRI)))) {
			if (bp->nb_flags & (NB_READ | NB_INVAL | NB_NOCACHE))
				upl_flags = UPL_ABORT_DUMP_PAGES;
			else
				upl_flags = 0;
			ubc_upl_abort(upl, upl_flags);
			goto pagelist_cleanup_done;
		}
		for (i=0; i <= (bp->nb_bufsize - 1)/PAGE_SIZE; i++) {
			if (!NBPGVALID(bp,i))
				ubc_upl_abort_range(upl,
					i*PAGE_SIZE, PAGE_SIZE,
					UPL_ABORT_DUMP_PAGES |
					UPL_ABORT_FREE_ON_EMPTY);
			else {
				if (NBPGDIRTY(bp,i))
					upl_flags = UPL_COMMIT_SET_DIRTY;
				else
					upl_flags = UPL_COMMIT_CLEAR_DIRTY;
				ubc_upl_commit_range(upl,
					i*PAGE_SIZE, PAGE_SIZE,
					upl_flags |
					UPL_COMMIT_INACTIVATE |
					UPL_COMMIT_FREE_ON_EMPTY);
			}
		}
pagelist_cleanup_done:
		/* was this the last buffer in the file? */
		if (NBOFF(bp) + bp->nb_bufsize > (off_t)(VTONFS(vp)->n_size)) {
			/* if so, invalidate all pages of last buffer past EOF */
			off_t start, end;
			start = trunc_page_64(VTONFS(vp)->n_size) + PAGE_SIZE_64;
			end = trunc_page_64(NBOFF(bp) + bp->nb_bufsize);
			if (end > start) {
				if (!(rv = ubc_sync_range(vp, start, end, UBC_INVALIDATE)))
					printf("nfs_buf_release(): ubc_sync_range failed!\n");
			}
		}
		CLR(bp->nb_flags, NB_PAGELIST);
		bp->nb_pagelist = NULL;
	}

	lck_mtx_lock(nfs_buf_mutex);

	wakeup_needbuffer = wakeup_buffer = wakeup_nbdwrite = 0;

	/* Wake up any processes waiting for any buffer to become free. */
	if (nfsneedbuffer) {
		nfsneedbuffer = 0;
		wakeup_needbuffer = 1;
	}
	/* Wake up any processes waiting for _this_ buffer to become free. */
	if (ISSET(bp->nb_lflags, NBL_WANTED)) {
		CLR(bp->nb_lflags, NBL_WANTED);
		wakeup_buffer = 1;
	}

	/* If it's non-needcommit nocache, or an error, mark it invalid. */
	if (ISSET(bp->nb_flags, NB_ERROR) ||
	    (ISSET(bp->nb_flags, NB_NOCACHE) && !ISSET(bp->nb_flags, (NB_NEEDCOMMIT | NB_DELWRI))))
		SET(bp->nb_flags, NB_INVAL);

	if ((bp->nb_bufsize <= 0) || ISSET(bp->nb_flags, NB_INVAL)) {
		/* If it's invalid or empty, dissociate it from its vnode */
		if (bp->nb_vnbufs.le_next != NFSNOLIST) {
			LIST_REMOVE(bp, nb_vnbufs);
			bp->nb_vnbufs.le_next = NFSNOLIST;
		}
		bp->nb_vp = NULL;
		/* if this was a delayed write, wakeup anyone */
		/* waiting for delayed writes to complete */
		if (ISSET(bp->nb_flags, NB_DELWRI)) {
			CLR(bp->nb_flags, NB_DELWRI);
			OSAddAtomic(-1, (SInt32*)&nfs_nbdwrite);
			wakeup_nbdwrite = 1;
		}
		/* invalidate usage timestamp to allow immediate freeing */
		NBUFSTAMPINVALIDATE(bp);
		/* put buffer at head of free list */
		if (bp->nb_free.tqe_next != NFSNOLIST)
			panic("nfsbuf on freelist");
		SET(bp->nb_flags, NB_INVAL);
		if (ISSET(bp->nb_flags, NB_META)) {
			TAILQ_INSERT_HEAD(&nfsbuffreemeta, bp, nb_free);
			nfsbuffreemetacnt++;
		} else {
			TAILQ_INSERT_HEAD(&nfsbuffree, bp, nb_free);
			nfsbuffreecnt++;
		}
	} else if (ISSET(bp->nb_flags, NB_DELWRI)) {
		/* put buffer at end of delwri list */
		if (bp->nb_free.tqe_next != NFSNOLIST)
			panic("nfsbuf on freelist");
		TAILQ_INSERT_TAIL(&nfsbufdelwri, bp, nb_free);
		nfsbufdelwricnt++;
	} else {
		/* update usage timestamp */
		microuptime(&now);
		bp->nb_timestamp = now.tv_sec;
		/* put buffer at end of free list */
		if (bp->nb_free.tqe_next != NFSNOLIST)
			panic("nfsbuf on freelist");
		if (ISSET(bp->nb_flags, NB_META)) {
			TAILQ_INSERT_TAIL(&nfsbuffreemeta, bp, nb_free);
			nfsbuffreemetacnt++;
		} else {
			TAILQ_INSERT_TAIL(&nfsbuffree, bp, nb_free);
			nfsbuffreecnt++;
		}
	}

	/* Unlock the buffer. */
	CLR(bp->nb_flags, (NB_ASYNC | NB_STABLE | NB_IOD));
	CLR(bp->nb_lflags, NBL_BUSY);

	FSDBG_BOT(548, bp, NBOFF(bp), bp->nb_flags, bp->nb_data);

	lck_mtx_unlock(nfs_buf_mutex);

	if (wakeup_needbuffer)
		wakeup(&nfsneedbuffer);
	if (wakeup_buffer)
		wakeup(bp);
	if (wakeup_nbdwrite)
		wakeup(&nfs_nbdwrite);
	if (freeup)
		NFS_BUF_FREEUP();
}
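
/*
 * Note: the wakeups for nfsneedbuffer, the buffer itself, and nfs_nbdwrite
 * waiters are deferred until after nfs_buf_mutex has been dropped, so the
 * woken threads don't immediately block on the mutex.
 */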
/*
 * Wait for operations on the buffer to complete.
 * When they do, extract and return the I/O's error value.
 */
int
nfs_buf_iowait(struct nfsbuf *bp)
{
	FSDBG_TOP(549, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);

	lck_mtx_lock(nfs_buf_mutex);

	while (!ISSET(bp->nb_flags, NB_DONE))
		msleep(bp, nfs_buf_mutex, PRIBIO + 1, "nfs_buf_iowait", 0);

	lck_mtx_unlock(nfs_buf_mutex);

	FSDBG_BOT(549, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);

	/* check for interruption of I/O, then errors. */
	if (ISSET(bp->nb_flags, NB_EINTR)) {
		CLR(bp->nb_flags, NB_EINTR);
		return (EINTR);
	} else if (ISSET(bp->nb_flags, NB_ERROR))
		return (bp->nb_error ? bp->nb_error : EIO);
	return (0);
}
/*
 * Mark I/O complete on a buffer.
 */
void
nfs_buf_iodone(struct nfsbuf *bp)
{
	FSDBG_TOP(550, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);

	if (ISSET(bp->nb_flags, NB_DONE))
		panic("nfs_buf_iodone already");
	/*
	 * I/O was done, so don't believe
	 * the DIRTY state from VM anymore
	 */
	CLR(bp->nb_flags, NB_WASDIRTY);

	if (!ISSET(bp->nb_flags, NB_READ)) {
		CLR(bp->nb_flags, NB_WRITEINPROG);
		/*
		 * vnode_writedone() takes care of waking up
		 * any throttled write operations
		 */
		vnode_writedone(bp->nb_vp);
	}

	if (ISSET(bp->nb_flags, NB_ASYNC)) {	/* if async, release it */
		SET(bp->nb_flags, NB_DONE);	/* note that it's done */
		nfs_buf_release(bp, 1);
	} else {				/* or just wakeup the buffer */
		lck_mtx_lock(nfs_buf_mutex);
		SET(bp->nb_flags, NB_DONE);	/* note that it's done */
		CLR(bp->nb_lflags, NBL_WANTED);
		lck_mtx_unlock(nfs_buf_mutex);
		wakeup(bp);
	}

	FSDBG_BOT(550, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);
}
void
nfs_buf_write_delayed(struct nfsbuf *bp, proc_t p)
{
	vnode_t vp = bp->nb_vp;

	FSDBG_TOP(551, bp, NBOFF(bp), bp->nb_flags, 0);
	FSDBG(551, bp, bp->nb_dirtyoff, bp->nb_dirtyend, bp->nb_dirty);

	/*
	 * If the block hasn't been seen before:
	 *	(1) Mark it as having been seen,
	 *	(2) Charge for the write.
	 *	(3) Make sure it's on its vnode's correct block list,
	 */
	if (!ISSET(bp->nb_flags, NB_DELWRI)) {
		SET(bp->nb_flags, NB_DELWRI);
		if (p && p->p_stats)
			p->p_stats->p_ru.ru_oublock++;		/* XXX */
		OSAddAtomic(1, (SInt32*)&nfs_nbdwrite);
		/* move to dirty list */
		lck_mtx_lock(nfs_buf_mutex);
		if (bp->nb_vnbufs.le_next != NFSNOLIST)
			LIST_REMOVE(bp, nb_vnbufs);
		LIST_INSERT_HEAD(&VTONFS(vp)->n_dirtyblkhd, bp, nb_vnbufs);
		lck_mtx_unlock(nfs_buf_mutex);
	}

	/*
	 * If the vnode has "too many" write operations in progress
	 * wait for them to finish the IO
	 */
	(void)vnode_waitforwrites(vp, VNODE_ASYNC_THROTTLE, 0, 0, "nfs_buf_write_delayed");

	/*
	 * If we have too many delayed write buffers,
	 * more than we can "safely" handle, just fall back to
	 * doing the async write
	 */
	if (nfs_nbdwrite < 0)
		panic("nfs_buf_write_delayed: Negative nfs_nbdwrite");

	if (nfs_nbdwrite > ((nfsbufcnt/4)*3)) {
		/* issue async write */
		SET(bp->nb_flags, NB_ASYNC);
		nfs_buf_write(bp);
		FSDBG_BOT(551, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);
		return;
	}

	/* Otherwise, the "write" is done, so mark and release the buffer. */
	SET(bp->nb_flags, NB_DONE);
	nfs_buf_release(bp, 1);
	FSDBG_BOT(551, bp, NBOFF(bp), bp->nb_flags, 0);
	return;
}
/*
 * Check that a "needcommit" buffer can still be committed.
 * If the write verifier has changed, we need to clear the
 * needcommit flag.
 */
void
nfs_buf_check_write_verifier(struct nfsnode *np, struct nfsbuf *bp)
{
	struct nfsmount *nmp;

	if (!ISSET(bp->nb_flags, NB_NEEDCOMMIT))
		return;

	nmp = VFSTONFS(vnode_mount(NFSTOV(np)));
	if (!nmp || (bp->nb_verf == nmp->nm_verf))
		return;

	/* write verifier changed, clear commit flag */
	bp->nb_flags &= ~NB_NEEDCOMMIT;
	np->n_needcommitcnt--;
	CHECK_NEEDCOMMITCNT(np);
}
/*
 * add a reference to a buffer so it doesn't disappear while being used
 * (must be called with nfs_buf_mutex held)
 */
void
nfs_buf_refget(struct nfsbuf *bp)
{
	bp->nb_refs++;
}
/*
 * release a reference on a buffer
 * (must be called with nfs_buf_mutex held)
 */
void
nfs_buf_refrele(struct nfsbuf *bp)
{
	bp->nb_refs--;
}

/*
 * mark a particular buffer as BUSY
 * (must be called with nfs_buf_mutex held)
 */
errno_t
nfs_buf_acquire(struct nfsbuf *bp, int flags, int slpflag, int slptimeo)
{
	errno_t error;
	struct timespec ts;

	if (ISSET(bp->nb_lflags, NBL_BUSY)) {
		/*
		 * since the mutex_lock may block, the buffer
		 * may become BUSY, so we need to recheck for
		 * a NOWAIT request
		 */
		if (flags & NBAC_NOWAIT)
			return (EBUSY);
		SET(bp->nb_lflags, NBL_WANTED);

		ts.tv_sec = (slptimeo/100);
		/* the hz value is 100, which leads to 10ms */
		ts.tv_nsec = (slptimeo % 100) * 10 * NSEC_PER_USEC * 1000;

		error = msleep(bp, nfs_buf_mutex, slpflag | (PRIBIO + 1),
			"nfs_buf_acquire", &ts);
		if (error)
			return (error);
		return (EAGAIN);
	}
	if (flags & NBAC_REMOVE)
		nfs_buf_remfree(bp);
	SET(bp->nb_lflags, NBL_BUSY);

	return (0);
}
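
/*
 * Note: when the buffer is busy and the caller is willing to wait,
 * nfs_buf_acquire() returns EAGAIN after sleeping rather than looping
 * internally; callers such as nfs_buf_delwri_push() retry the call
 * until it succeeds.
 */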
/*
 * simply drop the BUSY status of a buffer
 * (must be called with nfs_buf_mutex held)
 */
void
nfs_buf_drop(struct nfsbuf *bp)
{
	int need_wakeup = 0;

	if (!ISSET(bp->nb_lflags, NBL_BUSY))
		panic("nfs_buf_drop: buffer not busy!");
	if (ISSET(bp->nb_lflags, NBL_WANTED)) {
		/*
		 * delay the actual wakeup until after we
		 * clear NBL_BUSY and we've dropped nfs_buf_mutex
		 */
		need_wakeup = 1;
	}
	/* Unlock the buffer. */
	CLR(bp->nb_lflags, (NBL_BUSY | NBL_WANTED));

	if (need_wakeup)
		wakeup(bp);
}
/*
 * prepare for iterating over an nfsnode's buffer list
 * this lock protects the queue manipulation
 * (must be called with nfs_buf_mutex held)
 */
int
nfs_buf_iterprepare(struct nfsnode *np, struct nfsbuflists *iterheadp, int flags)
{
	struct nfsbuflists *listheadp;

	if (flags & NBI_DIRTY)
		listheadp = &np->n_dirtyblkhd;
	else
		listheadp = &np->n_cleanblkhd;

	if ((flags & NBI_NOWAIT) && (np->n_bufiterflags & NBI_ITER)) {
		LIST_INIT(iterheadp);
		return(EWOULDBLOCK);
	}

	while (np->n_bufiterflags & NBI_ITER) {
		np->n_bufiterflags |= NBI_ITERWANT;
		msleep(&np->n_bufiterflags, nfs_buf_mutex, 0, "nfs_buf_iterprepare", 0);
	}
	if (LIST_EMPTY(listheadp)) {
		LIST_INIT(iterheadp);
		return(EWOULDBLOCK);
	}
	np->n_bufiterflags |= NBI_ITER;

	iterheadp->lh_first = listheadp->lh_first;
	listheadp->lh_first->nb_vnbufs.le_prev = &iterheadp->lh_first;
	LIST_INIT(listheadp);

	return(0);
}
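
/*
 * Note: iteration works by moving the entire clean or dirty list onto the
 * caller's private list head; nfs_buf_itercomplete() below moves any
 * unprocessed buffers back, so the nfsnode's lists are never walked while
 * they're being modified.
 */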
/*
 * cleanup after iterating over an nfsnode's buffer list
 * this lock protects the queue manipulation
 * (must be called with nfs_buf_mutex held)
 */
void
nfs_buf_itercomplete(struct nfsnode *np, struct nfsbuflists *iterheadp, int flags)
{
	struct nfsbuflists * listheadp;
	struct nfsbuf *bp;

	if (flags & NBI_DIRTY)
		listheadp = &np->n_dirtyblkhd;
	else
		listheadp = &np->n_cleanblkhd;

	while (!LIST_EMPTY(iterheadp)) {
		bp = LIST_FIRST(iterheadp);
		LIST_REMOVE(bp, nb_vnbufs);
		LIST_INSERT_HEAD(listheadp, bp, nb_vnbufs);
	}

	np->n_bufiterflags &= ~NBI_ITER;
	if (np->n_bufiterflags & NBI_ITERWANT) {
		np->n_bufiterflags &= ~NBI_ITERWANT;
		wakeup(&np->n_bufiterflags);
	}
}
/*
 * Vnode op for read using bio
 * Any similarity to readip() is purely coincidental
 */
int
nfs_bioread(
	vnode_t vp,
	struct uio *uio,
	__unused int ioflag,
	kauth_cred_t cred,
	proc_t p)
{
	struct nfsnode *np = VTONFS(vp);
	int biosize, bufsize;
	off_t diff;
	struct nfsbuf *bp = NULL, *rabp;
	struct nfs_vattr nvattr;
	struct nfsmount *nmp = VFSTONFS(vnode_mount(vp));
	daddr64_t lbn, rabn, lastrabn = -1, tlbn;
	int nra, error = 0, n = 0, on = 0;
	caddr_t dp;
	struct dirent *direntp = NULL;
	enum vtype vtype;
	int nocachereadahead = 0;

	FSDBG_TOP(514, vp, uio->uio_offset, uio_uio_resid(uio), ioflag);

	if (uio->uio_rw != UIO_READ)
		panic("nfs_read mode");

	if (uio_uio_resid(uio) == 0) {
		FSDBG_BOT(514, vp, 0xd1e0001, 0, 0);
		return (0);
	}
	if (uio->uio_offset < 0) {
		FSDBG_BOT(514, vp, 0xd1e0002, 0, EINVAL);
		return (EINVAL);
	}

	biosize = nmp->nm_biosize;
	if ((nmp->nm_flag & NFSMNT_NFSV3) && !(nmp->nm_state & NFSSTA_GOTFSINFO))
		nfs_fsinfo(nmp, vp, cred, p);

	vtype = vnode_vtype(vp);
	/*
	 * For nfs, cache consistency can only be maintained approximately.
	 * Although RFC1094 does not specify the criteria, the following is
	 * believed to be compatible with the reference port.
	 *
	 * If the file's modify time on the server has changed since the
	 * last read rpc or you have written to the file,
	 * you may have lost data cache consistency with the
	 * server, so flush all of the file's data out of the cache.
	 * Then force a getattr rpc to ensure that you have up to date
	 * attributes.
	 * NB: This implies that cache data can be read when up to
	 * NFS_MAXATTRTIMEO seconds out of date. If you find that you need
	 * current attributes this could be forced by setting calling
	 * NATTRINVALIDATE() before the nfs_getattr() call.
	 */
	if (np->n_flag & NNEEDINVALIDATE) {
		np->n_flag &= ~NNEEDINVALIDATE;
		nfs_vinvalbuf(vp, V_SAVE|V_IGNORE_WRITEERR, cred, p, 1);
	}
	if (np->n_flag & NMODIFIED) {
		if (vtype != VREG) {
			if (vtype != VDIR)
				panic("nfs: bioread, not dir");
			error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
			if (error) {
				FSDBG_BOT(514, vp, 0xd1e0003, 0, error);
				return (error);
			}
		}
		NATTRINVALIDATE(np);
		error = nfs_getattr(vp, &nvattr, cred, p);
		if (error) {
			FSDBG_BOT(514, vp, 0xd1e0004, 0, error);
			return (error);
		}
		if (vtype == VDIR) {
			/* if directory changed, purge any name cache entries */
			if (nfstimespeccmp(&np->n_ncmtime, &nvattr.nva_mtime, !=))
				cache_purge(vp);
			np->n_ncmtime = nvattr.nva_mtime;
		}
		np->n_mtime = nvattr.nva_mtime;
	} else {
		error = nfs_getattr(vp, &nvattr, cred, p);
		if (error) {
			FSDBG_BOT(514, vp, 0xd1e0005, 0, error);
			return (error);
		}
		if (nfstimespeccmp(&np->n_mtime, &nvattr.nva_mtime, !=)) {
			if (vtype == VDIR) {
				/* purge name cache entries */
				if (nfstimespeccmp(&np->n_ncmtime, &nvattr.nva_mtime, !=))
					cache_purge(vp);
			}
			error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
			if (error) {
				FSDBG_BOT(514, vp, 0xd1e0006, 0, error);
				return (error);
			}
			if (vtype == VDIR)
				np->n_ncmtime = nvattr.nva_mtime;
			np->n_mtime = nvattr.nva_mtime;
		}
	}

	if (vnode_isnocache(vp)) {
		if (!(np->n_flag & NNOCACHE)) {
			if (NVALIDBUFS(np)) {
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error) {
					FSDBG_BOT(514, vp, 0xd1e000a, 0, error);
					return (error);
				}
			}
			np->n_flag |= NNOCACHE;
		}
	} else if (np->n_flag & NNOCACHE) {
		np->n_flag &= ~NNOCACHE;
	}

	do {
	if (np->n_flag & NNOCACHE) {
		switch (vtype) {
		case VREG:
			/*
			 * If we have only a block or so to read,
			 *     just do the rpc directly.
			 * If we have a couple blocks or more to read,
			 *     then we'll take advantage of readahead within
			 *     this loop to try to fetch all the data in parallel
			 */
			if (!nocachereadahead && (uio_uio_resid(uio) < 2*biosize)) {
				error = nfs_readrpc(vp, uio, cred, p);
				FSDBG_BOT(514, vp, uio->uio_offset, uio_uio_resid(uio), error);
				return (error);
			}
			nocachereadahead = 1;
			break;
		case VLNK:
			error = nfs_readlinkrpc(vp, uio, cred, p);
			FSDBG_BOT(514, vp, uio->uio_offset, uio_uio_resid(uio), error);
			return (error);
		case VDIR:
			break;
		default:
			printf(" NFSNOCACHE: type %x unexpected\n", vtype);
		}
	}
	switch (vtype) {
	case VREG:
		lbn = uio->uio_offset / biosize;

		/*
		 * Copy directly from any cached pages without grabbing the bufs.
		 *
		 * Note: for "nocache" reads, we don't copy directly from UBC
		 * because any cached pages will be for readahead buffers that
		 * need to be invalidated anyway before we finish this request.
		 */
		if (!(np->n_flag & NNOCACHE) &&
		    (uio->uio_segflg == UIO_USERSPACE32 ||
		     uio->uio_segflg == UIO_USERSPACE64 ||
		     uio->uio_segflg == UIO_USERSPACE)) {
			// LP64todo - fix this!
			int io_resid = uio_uio_resid(uio);
			diff = np->n_size - uio->uio_offset;
			if (diff < io_resid)
				io_resid = diff;
			if (io_resid > 0) {
				error = cluster_copy_ubc_data(vp, uio, &io_resid, 0);
				if (error) {
					FSDBG_BOT(514, vp, uio->uio_offset, 0xcacefeed, error);
					return (error);
				}
			}
			/* count any biocache reads that we just copied directly */
			if (lbn != uio->uio_offset / biosize) {
				OSAddAtomic((uio->uio_offset / biosize) - lbn, (SInt32*)&nfsstats.biocache_reads);
				FSDBG(514, vp, 0xcacefeed, uio->uio_offset, error);
			}
		}

		lbn = uio->uio_offset / biosize;
		on = uio->uio_offset % biosize;

		/*
		 * Start the read ahead(s), as required.
		 */
		if (nfs_numasync > 0 && nmp->nm_readahead > 0) {
			for (nra = 0; nra < nmp->nm_readahead; nra++) {
				rabn = lbn + 1 + nra;
				if (rabn <= lastrabn) {
					/* we've already (tried to) read this block */
					/* no need to try it again... */
					continue;
				}
				lastrabn = rabn;
				if ((off_t)rabn * biosize >= (off_t)np->n_size)
					break;
				if ((np->n_flag & NNOCACHE) &&
				    (((off_t)rabn * biosize) >= (uio->uio_offset + uio_uio_resid(uio))))
					/* for uncached readahead, don't go beyond end of request */
					break;
				/* check if block exists and is valid. */
				error = nfs_buf_get(vp, rabn, biosize, p, NBLK_READ|NBLK_NOWAIT, &rabp);
				if (error) {
					FSDBG_BOT(514, vp, 0xd1e000b, 1, error);
					return (error);
				}
				if (!rabp)
					continue;
				if (nfs_buf_upl_valid_range(rabp, 0, rabp->nb_bufsize)) {
					nfs_buf_release(rabp, 1);
					continue;
				}
				if (!ISSET(rabp->nb_flags, (NB_CACHE|NB_DELWRI))) {
					SET(rabp->nb_flags, (NB_READ|NB_ASYNC));
					if (nfs_asyncio(rabp, cred)) {
						SET(rabp->nb_flags, (NB_INVAL|NB_ERROR));
						rabp->nb_error = EIO;
						nfs_buf_release(rabp, 1);
					}
				} else
					nfs_buf_release(rabp, 1);
			}
		}

		if ((uio_uio_resid(uio) <= 0) || (uio->uio_offset >= (off_t)np->n_size)) {
			FSDBG_BOT(514, vp, uio->uio_offset, uio_uio_resid(uio), 0xaaaaaaaa);
			return (0);
		}

		OSAddAtomic(1, (SInt32*)&nfsstats.biocache_reads);

		/*
		 * If the block is in the cache and has the required data
		 * in a valid region, just copy it out.
		 * Otherwise, get the block and write back/read in,
		 * as required.
		 */
again:
		bufsize = biosize;
		// LP64todo - fix this!
		n = min((unsigned)(bufsize - on), uio_uio_resid(uio));
		diff = np->n_size - uio->uio_offset;
		if (diff < n)
			n = diff;

		error = nfs_buf_get(vp, lbn, bufsize, p, NBLK_READ, &bp);
		if (error) {
			FSDBG_BOT(514, vp, 0xd1e000c, 0, EINTR);
			return (EINTR);
		}

		/* if any pages are valid... */
		if (bp->nb_valid) {
			/* ...check for any invalid pages in the read range */
			int pg, firstpg, lastpg, dirtypg;
			dirtypg = firstpg = lastpg = -1;
			pg = on/PAGE_SIZE;
			while (pg <= (on + n - 1)/PAGE_SIZE) {
				if (!NBPGVALID(bp,pg)) {
					if (firstpg < 0)
						firstpg = pg;
					lastpg = pg;
				} else if (firstpg >= 0 && dirtypg < 0 && NBPGDIRTY(bp,pg))
					dirtypg = pg;
				pg++;
			}

			/* if there are no invalid pages, we're all set */
			if (firstpg < 0) {
				if (bp->nb_validoff < 0) {
					/* valid range isn't set up, so */
					/* set it to what we know is valid */
					bp->nb_validoff = trunc_page(on);
					bp->nb_validend = round_page(on+n);
					nfs_buf_normalize_valid_range(np, bp);
				}
				goto buffer_ready;
			}

			/* there are invalid pages in the read range */
			if ((dirtypg > firstpg) && (dirtypg < lastpg)) {
				/* there are also dirty page(s) in the range, */
				/* so write the buffer out and try again */
				CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL));
				SET(bp->nb_flags, NB_ASYNC);
				if (!IS_VALID_CRED(bp->nb_wcred)) {
					kauth_cred_ref(cred);
					bp->nb_wcred = cred;
				}
				error = nfs_buf_write(bp);
				if (error) {
					FSDBG_BOT(514, vp, 0xd1e000d, 0, error);
					return (error);
				}
				goto again;
			}
			if (!bp->nb_dirty && bp->nb_dirtyend <= 0 &&
			    (lastpg - firstpg + 1) > (bufsize/PAGE_SIZE)/2) {
				/* we need to read in more than half the buffer and the */
				/* buffer's not dirty, so just fetch the whole buffer */
				bp->nb_valid = 0;
			} else {
				/* read the page range in */
				uio_t auio;
				char uio_buf[ UIO_SIZEOF(1) ];

				NFS_BUF_MAP(bp);
				auio = uio_createwithbuffer(1, (NBOFF(bp) + firstpg * PAGE_SIZE_64),
						UIO_SYSSPACE, UIO_READ, &uio_buf[0], sizeof(uio_buf));
				uio_addiov(auio, CAST_USER_ADDR_T((bp->nb_data + firstpg * PAGE_SIZE)),
						((lastpg - firstpg + 1) * PAGE_SIZE));
				error = nfs_readrpc(vp, auio, cred, p);
				if (error) {
					if (np->n_flag & NNOCACHE)
						SET(bp->nb_flags, NB_NOCACHE);
					nfs_buf_release(bp, 1);
					FSDBG_BOT(514, vp, 0xd1e000e, 0, error);
					return (error);
				}
				/* Make sure that the valid range is set to cover this read. */
				bp->nb_validoff = trunc_page_32(on);
				bp->nb_validend = round_page_32(on+n);
				nfs_buf_normalize_valid_range(np, bp);
				if (uio_resid(auio) > 0) {
					/* if short read, must have hit EOF, */
					/* so zero the rest of the range */
					bzero(CAST_DOWN(caddr_t, uio_curriovbase(auio)), uio_resid(auio));
				}
				/* mark the pages (successfully read) as valid */
				for (pg=firstpg; pg <= lastpg; pg++)
					NBPGVALID_SET(bp,pg);
			}
		}
		/* if no pages are valid, read the whole block */
		if (!bp->nb_valid) {
			SET(bp->nb_flags, NB_READ);
			CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL));
			error = nfs_doio(bp, cred, p);
			if (error) {
				if (np->n_flag & NNOCACHE)
					SET(bp->nb_flags, NB_NOCACHE);
				nfs_buf_release(bp, 1);
				FSDBG_BOT(514, vp, 0xd1e000f, 0, error);
				return (error);
			}
		}
buffer_ready:
		/* validate read range against valid range and clip */
		if (bp->nb_validend > 0) {
			diff = (on >= bp->nb_validend) ? 0 : (bp->nb_validend - on);
			if (diff < n)
				n = diff;
		}
		if (n > 0)
			NFS_BUF_MAP(bp);
		break;
	case VLNK:
		OSAddAtomic(1, (SInt32*)&nfsstats.biocache_readlinks);
		error = nfs_buf_get(vp, 0, NFS_MAXPATHLEN, p, NBLK_READ, &bp);
		if (error) {
			FSDBG_BOT(514, vp, 0xd1e0010, 0, error);
			return (error);
		}
		if (!ISSET(bp->nb_flags, NB_CACHE)) {
			SET(bp->nb_flags, NB_READ);
			error = nfs_doio(bp, cred, p);
			if (error) {
				SET(bp->nb_flags, NB_ERROR);
				nfs_buf_release(bp, 1);
				FSDBG_BOT(514, vp, 0xd1e0011, 0, error);
				return (error);
			}
		}
		// LP64todo - fix this!
		n = min(uio_uio_resid(uio), bp->nb_validend);
		on = 0;
		break;
	case VDIR:
		OSAddAtomic(1, (SInt32*)&nfsstats.biocache_readdirs);
		if (np->n_direofoffset && uio->uio_offset >= np->n_direofoffset) {
			FSDBG_BOT(514, vp, 0xde0f0001, 0, 0);
			return (0);
		}
		lbn = uio->uio_offset / NFS_DIRBLKSIZ;
		on = uio->uio_offset & (NFS_DIRBLKSIZ - 1);
		error = nfs_buf_get(vp, lbn, NFS_DIRBLKSIZ, p, NBLK_READ, &bp);
		if (error) {
			FSDBG_BOT(514, vp, 0xd1e0012, 0, error);
			return (error);
		}
		if (!ISSET(bp->nb_flags, NB_CACHE)) {
			SET(bp->nb_flags, NB_READ);
			error = nfs_doio(bp, cred, p);
			if (error)
				nfs_buf_release(bp, 1);
			while (error == NFSERR_BAD_COOKIE) {
				error = nfs_vinvalbuf(vp, 0, cred, p, 1);
				/*
				 * Yuck! The directory has been modified on the
				 * server. The only way to get the block is by
				 * reading from the beginning to get all the
				 * offset cookies.
				 */
				for (tlbn = 0; tlbn <= lbn && !error; tlbn++) {
					if (np->n_direofoffset
					    && (tlbn * NFS_DIRBLKSIZ) >= np->n_direofoffset) {
						FSDBG_BOT(514, vp, 0xde0f0002, 0, 0);
						return (0);
					}
					error = nfs_buf_get(vp, tlbn, NFS_DIRBLKSIZ, p, NBLK_READ, &bp);
					if (error) {
						FSDBG_BOT(514, vp, 0xd1e0013, 0, error);
						return (error);
					}
					if (!ISSET(bp->nb_flags, NB_CACHE)) {
						SET(bp->nb_flags, NB_READ);
						error = nfs_doio(bp, cred, p);
						/*
						 * no error + NB_INVAL == directory EOF,
						 * use the block.
						 */
						if (error == 0 && (bp->nb_flags & NB_INVAL))
							break;
					}
					/*
					 * An error will throw away the block and the
					 * for loop will break out.  If no error and this
					 * is not the block we want, we throw away the
					 * block and go for the next one via the for loop.
					 */
					if (error || tlbn < lbn)
						nfs_buf_release(bp, 1);
				}
			}
			/*
			 * The above while is repeated if we hit another cookie
			 * error.  If we hit an error and it wasn't a cookie error,
			 * we give up.
			 */
			if (error) {
				FSDBG_BOT(514, vp, 0xd1e0014, 0, error);
				return (error);
			}
		}

		/*
		 * If not eof and read aheads are enabled, start one.
		 * (You need the current block first, so that you have the
		 *  directory offset cookie of the next block.)
		 */
		if (nfs_numasync > 0 && nmp->nm_readahead > 0 &&
		    (np->n_direofoffset == 0 ||
		     (lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) &&
		    !nfs_buf_is_incore(vp, lbn + 1)) {
			error = nfs_buf_get(vp, lbn + 1, NFS_DIRBLKSIZ, p, NBLK_READ|NBLK_NOWAIT, &rabp);
			if (error) {
				FSDBG_BOT(514, vp, 0xd1e0015, 0, error);
				return (error);
			}
			if (rabp) {
				if (!ISSET(rabp->nb_flags, (NB_CACHE))) {
					SET(rabp->nb_flags, (NB_READ | NB_ASYNC));
					if (nfs_asyncio(rabp, cred)) {
						SET(rabp->nb_flags, (NB_INVAL|NB_ERROR));
						rabp->nb_error = EIO;
						nfs_buf_release(rabp, 1);
					}
				} else {
					nfs_buf_release(rabp, 1);
				}
			}
		}
		/*
		 * Make sure we use a signed variant of min() since
		 * the second term may be negative.
		 */
		// LP64todo - fix this!
		n = lmin(uio_uio_resid(uio), bp->nb_validend - on);
		/*
		 * We keep track of the directory eof in
		 * np->n_direofoffset and chop it off as an
		 * extra step right here.
		 */
		if (np->n_direofoffset &&
		    n > np->n_direofoffset - uio->uio_offset)
			n = np->n_direofoffset - uio->uio_offset;
		/*
		 * Make sure that we return an integral number of entries so
		 * that any subsequent calls will start copying from the start
		 * of the next entry.
		 *
		 * If the current value of n has the last entry cut short,
		 * set n to copy everything up to the last entry instead.
		 */
		if (n > 0) {
			dp = bp->nb_data + on;
			while (dp < (bp->nb_data + on + n)) {
				direntp = (struct dirent *)dp;
				dp += direntp->d_reclen;
			}
			if (dp > (bp->nb_data + on + n))
				n = (dp - direntp->d_reclen) - (bp->nb_data + on);
		}
		break;
	default:
		printf("nfs_bioread: type %x unexpected\n", vtype);
		FSDBG_BOT(514, vp, 0xd1e0016, 0, EINVAL);
		return (EINVAL);
	}

	if (n > 0)
		error = uiomove(bp->nb_data + on, (int)n, uio);

	if (np->n_flag & NNOCACHE)
		SET(bp->nb_flags, NB_NOCACHE);

	nfs_buf_release(bp, 1);
	} while (error == 0 && uio_uio_resid(uio) > 0 && n > 0);
	FSDBG_BOT(514, vp, uio->uio_offset, uio_uio_resid(uio), error);
	return (error);
}
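
/*
 * Note: nfs_bioread() above serves data from per-block nfsbufs of size
 * nm_biosize, issues readahead through the nfsiod threads (nfs_asyncio),
 * and falls back to direct read RPCs for "nocache" vnodes.
 */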
1963 * Vnode op for write using bio
1967 struct vnop_write_args
/* {
1968 struct vnodeop_desc *a_desc;
1972 vfs_context_t a_context;
1975 struct uio
*uio
= ap
->a_uio
;
1976 vnode_t vp
= ap
->a_vp
;
1977 struct nfsnode
*np
= VTONFS(vp
);
1980 int ioflag
= ap
->a_ioflag
;
1982 struct nfs_vattr nvattr
;
1983 struct nfsmount
*nmp
= VFSTONFS(vnode_mount(vp
));
1985 int biosize
, bufsize
;
1986 int n
, on
, error
= 0;
1987 off_t boff
, start
, end
, cureof
;
1988 struct iovec_32 iov
;
1991 FSDBG_TOP(515, vp
, uio
->uio_offset
, uio_uio_resid(uio
), ioflag
);
1994 if (uio
->uio_rw
!= UIO_WRITE
)
1995 panic("nfs_write mode");
1996 if (UIO_SEG_IS_USER_SPACE(uio
->uio_segflg
))
1997 panic("nfs_write proc");
2000 p
= vfs_context_proc(ap
->a_context
);
2001 cred
= vfs_context_ucred(ap
->a_context
);
2003 if (vnode_vtype(vp
) != VREG
)
2006 np
->n_flag
|= NWRBUSY
;
2008 if (np
->n_flag
& NNEEDINVALIDATE
) {
2009 np
->n_flag
&= ~NNEEDINVALIDATE
;
2010 nfs_vinvalbuf(vp
, V_SAVE
|V_IGNORE_WRITEERR
, cred
, p
, 1);
2012 if (np
->n_flag
& NWRITEERR
) {
2013 np
->n_flag
&= ~(NWRITEERR
| NWRBUSY
);
2014 FSDBG_BOT(515, vp
, uio
->uio_offset
, uio_uio_resid(uio
), np
->n_error
);
2015 return (np
->n_error
);
2018 biosize
= nmp
->nm_biosize
;
2019 if ((nmp
->nm_flag
& NFSMNT_NFSV3
) && !(nmp
->nm_state
& NFSSTA_GOTFSINFO
))
2020 nfs_fsinfo(nmp
, vp
, cred
, p
);
2022 if (ioflag
& (IO_APPEND
| IO_SYNC
)) {
2023 if (np
->n_flag
& NMODIFIED
) {
2024 NATTRINVALIDATE(np
);
2025 error
= nfs_vinvalbuf(vp
, V_SAVE
, cred
, p
, 1);
2027 np
->n_flag
&= ~NWRBUSY
;
2028 FSDBG_BOT(515, vp
, uio
->uio_offset
, 0x10bad01, error
);
2032 if (ioflag
& IO_APPEND
) {
2033 NATTRINVALIDATE(np
);
2034 error
= nfs_getattr(vp
, &nvattr
, cred
, p
);
2036 np
->n_flag
&= ~NWRBUSY
;
2037 FSDBG_BOT(515, vp
, uio
->uio_offset
, 0x10bad02, error
);
2040 uio
->uio_offset
= np
->n_size
;
2043 if (uio
->uio_offset
< 0) {
2044 np
->n_flag
&= ~NWRBUSY
;
2045 FSDBG_BOT(515, vp
, uio
->uio_offset
, 0xbad0ff, EINVAL
);
2048 if (uio_uio_resid(uio
) == 0) {
2049 np
->n_flag
&= ~NWRBUSY
;
2050 FSDBG_BOT(515, vp
, uio
->uio_offset
, uio_uio_resid(uio
), 0);
2054 if (vnode_isnocache(vp
)) {
2055 if (!(np
->n_flag
& NNOCACHE
)) {
2056 if (NVALIDBUFS(np
)) {
2057 error
= nfs_vinvalbuf(vp
, V_SAVE
, cred
, p
, 1);
2059 np
->n_flag
&= ~NWRBUSY
;
2060 FSDBG_BOT(515, vp
, 0, 0, error
);
2064 np
->n_flag
|= NNOCACHE
;
2066 } else if (np
->n_flag
& NNOCACHE
) {
2067 np
->n_flag
&= ~NNOCACHE
;
2071 OSAddAtomic(1, (SInt32
*)&nfsstats
.biocache_writes
);
2072 lbn
= uio
->uio_offset
/ biosize
;
2073 on
= uio
->uio_offset
% biosize
;
2074 // LP64todo - fix this
2075 n
= min((unsigned)(biosize
- on
), uio_uio_resid(uio
));
2079 * Get a cache block for writing. The range to be written is
2080 * (off..off+n) within the block. We ensure that the block
2081 * either has no dirty region or that the given range is
2082 * contiguous with the existing dirty region.
2084 error
= nfs_buf_get(vp
, lbn
, bufsize
, p
, NBLK_WRITE
, &bp
);
2086 np
->n_flag
&= ~NWRBUSY
;
2087 FSDBG_BOT(515, vp
, uio
->uio_offset
, uio_uio_resid(uio
), error
);
2090 /* map the block because we know we're going to write to it */
2093 if (np
->n_flag
& NNOCACHE
)
2094 SET(bp
->nb_flags
, NB_NOCACHE
);
2096 if (!IS_VALID_CRED(bp
->nb_wcred
)) {
2097 kauth_cred_ref(cred
);
2098 bp
->nb_wcred
= cred
;
2102 * If there's already a dirty range AND dirty pages in this block we
2103 * need to send a commit AND write the dirty pages before continuing.
2105 * If there's already a dirty range OR dirty pages in this block
2106 * and the new write range is not contiguous with the existing range,
2107 * then force the buffer to be written out now.
2108 * (We used to just extend the dirty range to cover the valid,
2109 * but unwritten, data in between also. But writing ranges
2110 * of data that weren't actually written by an application
2111 * risks overwriting some other client's data with stale data
2112 * that's just masquerading as new written data.)
2114 if (bp
->nb_dirtyend
> 0) {
2115 if (on
> bp
->nb_dirtyend
|| (on
+ n
) < bp
->nb_dirtyoff
|| bp
->nb_dirty
) {
2116 FSDBG(515, vp
, uio
->uio_offset
, bp
, 0xd15c001);
2117 /* write/commit buffer "synchronously" */
2118 /* (NB_STABLE indicates that data writes should be FILESYNC) */
2119 CLR(bp
->nb_flags
, (NB_DONE
| NB_ERROR
| NB_INVAL
));
2120 SET(bp
->nb_flags
, (NB_ASYNC
| NB_STABLE
));
2121 error
= nfs_buf_write(bp
);
2123 np
->n_flag
&= ~NWRBUSY
;
2124 FSDBG_BOT(515, vp
, uio
->uio_offset
, uio_uio_resid(uio
), error
);
2129 } else if (bp
->nb_dirty
) {
2130 int firstpg
, lastpg
;
2132 /* calculate write range pagemask */
2133 firstpg
= on
/PAGE_SIZE
;
2134 lastpg
= (on
+n
-1)/PAGE_SIZE
;
2135 pagemask
= ((1 << (lastpg
+1)) - 1) & ~((1 << firstpg
) - 1);
2136 /* check if there are dirty pages outside the write range */
2137 if (bp
->nb_dirty
& ~pagemask
) {
2138 FSDBG(515, vp
, uio
->uio_offset
, bp
, 0xd15c002);
2139 /* write/commit buffer "synchronously" */
2140 /* (NB_STABLE indicates that data writes should be FILESYNC) */
2141 CLR(bp
->nb_flags
, (NB_DONE
| NB_ERROR
| NB_INVAL
));
2142 SET(bp
->nb_flags
, (NB_ASYNC
| NB_STABLE
));
2143 error
= nfs_buf_write(bp
);
2145 np
->n_flag
&= ~NWRBUSY
;
2146 FSDBG_BOT(515, vp
, uio
->uio_offset
, uio_uio_resid(uio
), error
);
2151 /* if the first or last pages are already dirty */
2152 /* make sure that the dirty range encompasses those pages */
2153 if (NBPGDIRTY(bp
,firstpg
) || NBPGDIRTY(bp
,lastpg
)) {
2154 FSDBG(515, vp
, uio
->uio_offset
, bp
, 0xd15c003);
2155 bp
->nb_dirtyoff
= min(on
, firstpg
* PAGE_SIZE
);
2156 if (NBPGDIRTY(bp
,lastpg
)) {
2157 bp
->nb_dirtyend
= (lastpg
+1) * PAGE_SIZE
;
2159 if (NBOFF(bp
) + bp
->nb_dirtyend
> (off_t
)np
->n_size
)
2160 bp
->nb_dirtyend
= np
->n_size
- NBOFF(bp
);
2162 bp
->nb_dirtyend
= on
+n
;
2167 * Are we extending the size of the file with this write?
2168 * If so, update file size now that we have the block.
2169 * If there was a partial buf at the old eof, validate
2170 * and zero the new bytes.
2172 cureof = (off_t)np->n_size;
2173 if (uio->uio_offset + n > (off_t)np->n_size) {
2174 struct nfsbuf *eofbp = NULL;
2175 daddr64_t eofbn = np->n_size / biosize;
2176 int eofoff = np->n_size % biosize;
2177 int neweofoff = (uio->uio_offset + n) % biosize;
2179 FSDBG(515, 0xb1ffa000, uio->uio_offset + n, eofoff, neweofoff);
2181 if (eofoff && (eofbn < lbn)) {
2182 error = nfs_buf_get(vp, eofbn, biosize, p, NBLK_WRITE|NBLK_ONLYVALID, &eofbp);
2184 np->n_flag &= ~NWRBUSY;
2185 FSDBG_BOT(515, vp, uio->uio_offset, uio_uio_resid(uio), error);
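/*
 * Example, assuming a 32KB biosize: with an old file size of 0x9000 the
 * old EOF falls in block eofbn=1 at offset eofoff=0x1000 within that
 * block; if the write pushes EOF to 0xA000, neweofoff=0x2000. When the
 * old EOF block precedes the block being written (eofbn < lbn), it is
 * looked up here so its stale tail can be zeroed below.
 */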
2190 /* if we're extending within the same last block */
2191 /* and the block is flagged as being cached... */
2192 if ((lbn == eofbn) && ISSET(bp->nb_flags, NB_CACHE)) {
2193 /* ...check that all pages in buffer are valid */
2194 int endpg = ((neweofoff ? neweofoff : biosize) - 1)/PAGE_SIZE;
2196 /* pagemask only has to extend to last page being written to */
2197 pagemask = (1 << (endpg+1)) - 1;
2198 FSDBG(515, 0xb1ffa001, bp->nb_valid, pagemask, 0);
2199 if ((bp->nb_valid & pagemask) != pagemask) {
2200 /* zerofill any hole */
2201 if (on > bp->nb_validend) {
2203 for (i=bp->nb_validend/PAGE_SIZE; i <= (on - 1)/PAGE_SIZE; i++)
2204 NBPGVALID_SET(bp, i);
2206 FSDBG(516, bp, bp->nb_validend, on - bp->nb_validend, 0xf01e);
2207 bzero((char *)bp->nb_data + bp->nb_validend,
2208 on - bp->nb_validend);
2210 /* zerofill any trailing data in the last page */
2213 FSDBG(516, bp, neweofoff, PAGE_SIZE - (neweofoff & PAGE_MASK), 0xe0f);
2214 bzero((char *)bp->nb_data + neweofoff,
2215 PAGE_SIZE - (neweofoff & PAGE_MASK));
2219 np->n_flag |= NMODIFIED;
2220 np->n_size = uio->uio_offset + n;
2221 ubc_setsize(vp, (off_t)np->n_size); /* XXX errors */
2224 * We may need to zero any previously invalid data
2225 * after the old EOF in the previous EOF buffer.
2227 * For the old last page, don't zero bytes if there
2228 * are invalid bytes in that page (i.e. the page isn't currently valid).
2230 * For pages after the old last page, zero them and
2231 * mark them as valid.
2235 if (np->n_flag & NNOCACHE)
2236 SET(eofbp->nb_flags, NB_NOCACHE);
2238 FSDBG(516, eofbp, eofoff, biosize - eofoff, 0xe0fff01e);
2240 i = eofoff/PAGE_SIZE;
2241 while (eofoff < biosize) {
2242 int poff = eofoff & PAGE_MASK;
2243 if (!poff || NBPGVALID(eofbp,i)) {
2244 bzero(d + eofoff, PAGE_SIZE - poff);
2245 NBPGVALID_SET(eofbp, i);
2247 if (bp->nb_validend == eofoff)
2248 bp->nb_validend += PAGE_SIZE - poff;
2249 eofoff += PAGE_SIZE - poff;
2252 nfs_buf_release(eofbp, 1);
2256 * If dirtyend exceeds file size, chop it down. This should
2257 * not occur unless there is a race.
2259 if (NBOFF(bp) + bp->nb_dirtyend > (off_t)np->n_size)
2260 bp->nb_dirtyend = np->n_size - NBOFF(bp);
2262 * UBC doesn't handle partial pages, so we need to make sure
2263 * that any pages left in the page cache are completely valid.
2265 * Writes that are smaller than a block are delayed if they
2266 * don't extend to the end of the block.
2268 * If the block isn't (completely) cached, we may need to read
2269 * in some parts of pages that aren't covered by the write.
2270 * If the write offset (on) isn't page aligned, we'll need to
2271 * read the start of the first page being written to. Likewise,
2272 * if the offset of the end of the write (on+n) isn't page aligned,
2273 * we'll need to read the end of the last page being written to.
2276 * We don't want to read anything we're just going to write over.
2277 * We don't want to issue multiple I/Os if we don't have to
2278 * (because they're synchronous rpcs).
2279 * We don't want to read anything we already have modified in the buffer.
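/*
 * Example, assuming 4KB pages: writing n=4086 bytes at on=10 into an
 * uncached block only requires reading bytes 0..9 of the first page
 * (firstpgoff=10); since on+n lands on a page boundary there is no
 * trailing partial page to read. A fully page-aligned write needs no
 * read at all.
 */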
2282 if (!ISSET(bp->nb_flags, NB_CACHE) && n < biosize) {
2283 int firstpg, lastpg, dirtypg;
2284 int firstpgoff, lastpgoff;
2286 firstpg = on/PAGE_SIZE;
2287 firstpgoff = on & PAGE_MASK;
2288 lastpg = (on+n-1)/PAGE_SIZE;
2289 lastpgoff = (on+n) & PAGE_MASK;
2290 if (firstpgoff && !NBPGVALID(bp,firstpg)) {
2291 /* need to read start of first page */
2292 start = firstpg * PAGE_SIZE;
2293 end = start + firstpgoff;
2295 if (lastpgoff && !NBPGVALID(bp,lastpg)) {
2296 /* need to read end of last page */
2298 start = (lastpg * PAGE_SIZE) + lastpgoff;
2299 end = (lastpg + 1) * PAGE_SIZE;
2302 /* need to read the data in range: start...end-1 */
2304 /* first, check for dirty pages in between */
2305 /* if there are, we'll have to do two reads because */
2306 /* we don't want to overwrite the dirty pages. */
2307 for (dirtypg=start/PAGE_SIZE; dirtypg <= (end-1)/PAGE_SIZE; dirtypg++)
2308 if (NBPGDIRTY(bp,dirtypg))
2311 /* if start is at beginning of page, try */
2312 /* to get any preceding pages as well. */
2313 if (!(start & PAGE_MASK)) {
2314 /* stop at next dirty/valid page or start of block */
2315 for (; start > 0; start-=PAGE_SIZE)
2316 if (NBPGVALID(bp,((start-1)/PAGE_SIZE)))
2321 /* setup uio for read(s) */
2323 auio.uio_iovs.iov32p = &iov;
2324 auio.uio_iovcnt = 1;
2325 #if 1 /* LP64todo - can't use new segment flags until the drivers are ready */
2326 auio.uio_segflg = UIO_SYSSPACE;
2328 auio.uio_segflg = UIO_SYSSPACE32;
2330 auio.uio_rw = UIO_READ;
2332 if (dirtypg <= (end-1)/PAGE_SIZE) {
2333 /* there's a dirty page in the way, so just do two reads */
2334 /* we'll read the preceding data here */
2335 auio.uio_offset = boff + start;
2336 iov.iov_len = on - start;
2337 uio_uio_resid_set(&auio, iov.iov_len);
2338 iov.iov_base = (uintptr_t) bp->nb_data + start;
2339 error = nfs_readrpc(vp, &auio, cred, p);
2341 bp->nb_error = error;
2342 SET(bp->nb_flags, NB_ERROR);
2343 printf("nfs_write: readrpc %d", error);
2345 if (uio_uio_resid(&auio) > 0) {
2346 FSDBG(516, bp, iov.iov_base - bp->nb_data, uio_uio_resid(&auio), 0xd00dee01);
2347 // LP64todo - fix this
2348 bzero((caddr_t)iov.iov_base, uio_uio_resid(&auio));
2350 /* update validoff/validend if necessary */
2351 if ((bp->nb_validoff < 0) || (bp->nb_validoff > start))
2352 bp->nb_validoff = start;
2353 if ((bp->nb_validend < 0) || (bp->nb_validend < on))
2354 bp->nb_validend = on;
2355 if ((off_t)np->n_size > boff + bp->nb_validend)
2356 bp->nb_validend = min(np->n_size - (boff + start), biosize);
2357 /* validate any pages before the write offset */
2358 for (; start < on/PAGE_SIZE; start+=PAGE_SIZE)
2359 NBPGVALID_SET(bp, start/PAGE_SIZE);
2360 /* adjust start to read any trailing data */
2364 /* if end is at end of page, try to */
2365 /* get any following pages as well. */
2366 if (!(end & PAGE_MASK)) {
2367 /* stop at next valid page or end of block */
2368 for (; end < bufsize; end+=PAGE_SIZE)
2369 if (NBPGVALID(bp,end/PAGE_SIZE))
2373 if (((boff+start) >= cureof) || ((start >= on) && ((boff + on + n) >= cureof))) {
2375 * Either this entire read is beyond the current EOF
2376 * or the range that we won't be modifying (on+n...end)
2377 * is all beyond the current EOF.
2378 * No need to make a trip across the network to
2379 * read nothing. So, just zero the buffer instead.
2381 FSDBG(516, bp, start, end - start, 0xd00dee00);
2382 bzero(bp->nb_data + start, end - start);
2384 /* now we'll read the (rest of the) data */
2385 auio.uio_offset = boff + start;
2386 iov.iov_len = end - start;
2387 uio_uio_resid_set(&auio, iov.iov_len);
2388 iov.iov_base = (uintptr_t) (bp->nb_data + start);
2389 error = nfs_readrpc(vp, &auio, cred, p);
2391 bp->nb_error = error;
2392 SET(bp->nb_flags, NB_ERROR);
2393 printf("nfs_write: readrpc %d", error);
2395 if (uio_uio_resid(&auio) > 0) {
2396 FSDBG(516, bp, iov.iov_base - bp->nb_data, uio_uio_resid(&auio), 0xd00dee02);
2397 // LP64todo - fix this
2398 bzero((caddr_t)iov.iov_base, uio_uio_resid(&auio));
2401 /* update validoff/validend if necessary */
2402 if ((bp->nb_validoff < 0) || (bp->nb_validoff > start))
2403 bp->nb_validoff = start;
2404 if ((bp->nb_validend < 0) || (bp->nb_validend < end))
2405 bp->nb_validend = end;
2406 if ((off_t)np->n_size > boff + bp->nb_validend)
2407 bp->nb_validend = min(np->n_size - (boff + start), biosize);
2408 /* validate any pages before the write offset's page */
2409 for (; start < trunc_page_32(on); start+=PAGE_SIZE)
2410 NBPGVALID_SET(bp, start/PAGE_SIZE);
2411 /* validate any pages after the range of pages being written to */
2412 for (; (end - 1) > round_page_32(on+n-1); end-=PAGE_SIZE)
2413 NBPGVALID_SET(bp, (end-1)/PAGE_SIZE);
2414 /* Note: pages being written to will be validated when written */
2418 if (ISSET(bp->nb_flags, NB_ERROR)) {
2419 error = bp->nb_error;
2420 nfs_buf_release(bp, 1);
2421 np->n_flag &= ~NWRBUSY;
2422 FSDBG_BOT(515, vp, uio->uio_offset, uio_uio_resid(uio), error);
2426 np->n_flag |= NMODIFIED;
2429 error = uiomove((char *)bp->nb_data + on, n, uio);
2431 SET(bp->nb_flags, NB_ERROR);
2432 nfs_buf_release(bp, 1);
2433 np->n_flag &= ~NWRBUSY;
2434 FSDBG_BOT(515, vp, uio->uio_offset, uio_uio_resid(uio), error);
2438 /* validate any pages written to */
2439 start = on & ~PAGE_MASK;
2440 for (; start < on+n; start += PAGE_SIZE) {
2441 NBPGVALID_SET(bp, start/PAGE_SIZE);
2443 * This may seem a little weird, but we don't actually set the
2444 * dirty bits for writes. This is because we keep the dirty range
2445 * in the nb_dirtyoff/nb_dirtyend fields. Also, particularly for
2446 * delayed writes, when we give the pages back to the VM we don't
2447 * want to keep them marked dirty, because when we later write the
2448 * buffer we won't be able to tell which pages were written dirty
2449 * and which pages were mmapped and dirtied.
2452 if (bp->nb_dirtyend > 0) {
2453 bp->nb_dirtyoff = min(on, bp->nb_dirtyoff);
2454 bp->nb_dirtyend = max((on + n), bp->nb_dirtyend);
2456 bp->nb_dirtyoff = on;
2457 bp->nb_dirtyend = on + n;
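/*
 * Example: a first write of [512..1024) simply records that range; a
 * following write of [1024..2048) is contiguous, so the dirty range
 * becomes [512..2048) via the min/max above.
 */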
2459 if (bp->nb_validend <= 0 || bp->nb_validend < bp->nb_dirtyoff ||
2460 bp->nb_validoff > bp->nb_dirtyend) {
2461 bp->nb_validoff = bp->nb_dirtyoff;
2462 bp->nb_validend = bp->nb_dirtyend;
2464 bp->nb_validoff = min(bp->nb_validoff, bp->nb_dirtyoff);
2465 bp->nb_validend = max(bp->nb_validend, bp->nb_dirtyend);
2467 if (!ISSET(bp->nb_flags, NB_CACHE))
2468 nfs_buf_normalize_valid_range(np, bp);
2471 * Since this block is being modified, it must be written
2472 * again and not just committed.
2474 if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
2475 np->n_needcommitcnt--;
2476 CHECK_NEEDCOMMITCNT(np);
2478 CLR(bp->nb_flags, NB_NEEDCOMMIT);
2480 if (ioflag & IO_SYNC) {
2482 error = nfs_buf_write(bp);
2484 np->n_flag &= ~NWRBUSY;
2485 FSDBG_BOT(515, vp, uio->uio_offset,
2486 uio_uio_resid(uio), error);
2489 } else if (((n + on) == biosize) || (np->n_flag & NNOCACHE)) {
2491 SET(bp->nb_flags, NB_ASYNC);
2494 nfs_buf_write_delayed(bp, p);
2496 if (np->n_needcommitcnt > (nfsbufcnt/16))
2497 nfs_flushcommits(vp, p, 1);
2499 } while (uio_uio_resid(uio) > 0 && n > 0);
2501 if (np->n_flag & NNOCACHE) {
2502 /* make sure all the buffers are flushed out */
2503 error = nfs_flush(vp, MNT_WAIT, cred, p, 0);
2506 np->n_flag &= ~NWRBUSY;
2507 FSDBG_BOT(515, vp, uio->uio_offset, uio_uio_resid(uio), error);
2512 * Flush out and invalidate all buffers associated with a vnode.
2513 * Called with the underlying object locked.
2516 nfs_vinvalbuf_internal(
2525 struct nfsbuflists blist;
2526 int list, error = 0;
2527 struct nfsnode *np = VTONFS(vp);
2529 if (flags & V_SAVE) {
2530 if ((error = nfs_flush(vp, MNT_WAIT, cred, p,
2531 (flags & V_IGNORE_WRITEERR))))
2533 if (!LIST_EMPTY(&np->n_dirtyblkhd))
2534 panic("nfs_vinvalbuf: dirty bufs (vp 0x%x, bp 0x%x)",
2535 vp, LIST_FIRST(&np->n_dirtyblkhd));
2538 lck_mtx_lock(nfs_buf_mutex);
2541 if (nfs_buf_iterprepare(np, &blist, list)) {
2543 if (nfs_buf_iterprepare(np, &blist, list))
2546 while ((bp = LIST_FIRST(&blist))) {
2547 LIST_REMOVE(bp, nb_vnbufs);
2548 if (list == NBI_CLEAN)
2549 LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
2551 LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
2553 while ((error = nfs_buf_acquire(bp, NBAC_REMOVE, slpflag, slptimeo))) {
2554 FSDBG(556, vp, bp, NBOFF(bp), bp->nb_flags);
2555 if (error != EAGAIN) {
2556 FSDBG(554, vp, bp, -1, error);
2557 nfs_buf_refrele(bp);
2558 nfs_buf_itercomplete(np, &blist, list);
2559 lck_mtx_unlock(nfs_buf_mutex);
2563 nfs_buf_refrele(bp);
2564 FSDBG(554, vp, bp, NBOFF(bp), bp->nb_flags);
2565 lck_mtx_unlock(nfs_buf_mutex);
2566 if ((flags & V_SAVE) && UBCINFOEXISTS(vp) && bp->nb_vp &&
2567 (NBOFF(bp) < (off_t)np->n_size)) {
2568 /* XXX extra paranoia: make sure we're not */
2569 /* somehow leaving any dirty data around */
2571 int end = (NBOFF(bp) + bp->nb_bufsize > (off_t)np->n_size) ?
2572 ((off_t)np->n_size - NBOFF(bp)) : bp->nb_bufsize;
2573 if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
2574 error = nfs_buf_upl_setup(bp);
2575 if (error == EINVAL) {
2576 /* vm object must no longer exist */
2577 /* hopefully we don't need to do */
2578 /* anything for this buffer */
2580 printf("nfs_vinvalbuf: upl setup failed %d\n", error);
2581 bp->nb_valid = bp->nb_dirty = 0;
2583 nfs_buf_upl_check(bp);
2584 /* check for any dirty data before the EOF */
2585 if (bp->nb_dirtyend && bp->nb_dirtyoff < end) {
2586 /* clip dirty range to EOF */
2587 if (bp->nb_dirtyend > end)
2588 bp->nb_dirtyend = end;
2591 bp->nb_dirty &= (1 << (round_page_32(end)/PAGE_SIZE)) - 1;
2592 /* also make sure we'll have a credential to do the write */
2593 if (mustwrite && !IS_VALID_CRED(bp->nb_wcred) && !IS_VALID_CRED(cred)) {
2594 printf("nfs_vinvalbuf: found dirty buffer with no write creds\n");
2598 FSDBG(554, vp, bp, 0xd00dee, bp->nb_flags);
2599 if (!ISSET(bp->nb_flags, NB_PAGELIST))
2600 panic("nfs_vinvalbuf: dirty buffer without upl");
2601 /* gotta write out dirty data before invalidating */
2602 /* (NB_STABLE indicates that data writes should be FILESYNC) */
2603 /* (NB_NOCACHE indicates buffer should be discarded) */
2604 CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL | NB_ASYNC));
2605 SET(bp->nb_flags, NB_STABLE | NB_NOCACHE);
2606 if (!IS_VALID_CRED(bp->nb_wcred)) {
2607 kauth_cred_ref(cred);
2608 bp->nb_wcred = cred;
2610 error = nfs_buf_write(bp);
2611 // Note: bp has been released
2613 FSDBG(554, bp, 0xd00dee, 0xbad, error);
2614 np->n_error = error;
2615 np->n_flag |= NWRITEERR;
2617 * There was a write error and we need to
2618 * invalidate attrs to sync with server.
2619 * (if this write was extending the file,
2620 * we may no longer know the correct size)
2622 NATTRINVALIDATE(np);
2625 lck_mtx_lock(nfs_buf_mutex);
2629 SET(bp->nb_flags, NB_INVAL);
2630 // hold off on FREEUPs until we're done here
2631 nfs_buf_release(bp, 0);
2632 lck_mtx_lock(nfs_buf_mutex);
2634 nfs_buf_itercomplete(np, &blist, list);
2636 lck_mtx_unlock(nfs_buf_mutex);
2639 panic("nfs_vinvalbuf: flush failed");
2645 * Flush and invalidate all dirty buffers. If another process is already
2646 * doing the flush, just wait for completion.
2656 struct nfsnode *np = VTONFS(vp);
2657 struct nfsmount *nmp = VFSTONFS(vnode_mount(vp));
2658 int error = 0, slpflag, slptimeo;
2661 FSDBG_TOP(554, vp, flags, intrflg, 0);
2663 if (nmp && ((nmp->nm_flag & NFSMNT_INT) == 0))
2673 * First wait for any other process doing a flush to complete.
2675 while (np->n_flag & NFLUSHINPROG) {
2676 np->n_flag |= NFLUSHWANT;
2677 FSDBG_TOP(555, vp, flags, intrflg, np->n_flag);
2678 error = tsleep((caddr_t)&np->n_flag, PRIBIO + 2, "nfsvinval", slptimeo);
2679 FSDBG_BOT(555, vp, flags, intrflg, np->n_flag);
2680 if (error && (error = nfs_sigintr(VFSTONFS(vnode_mount(vp)), NULL, p))) {
2681 FSDBG_BOT(554, vp, flags, intrflg, error);
2687 * Now, flush as required.
2689 np->n_flag |= NFLUSHINPROG;
2690 error = nfs_vinvalbuf_internal(vp, flags, cred, p, slpflag, 0);
2692 FSDBG(554, vp, 0, 0, error);
2693 error = nfs_sigintr(VFSTONFS(vnode_mount(vp)), NULL, p);
2695 np->n_flag &= ~NFLUSHINPROG;
2696 if (np->n_flag & NFLUSHWANT) {
2697 np->n_flag &= ~NFLUSHWANT;
2698 wakeup((caddr_t)&np->n_flag);
2700 FSDBG_BOT(554, vp, flags, intrflg, error);
2703 error = nfs_vinvalbuf_internal(vp, flags, cred, p, 0, slptimeo);
2705 np->n_flag &= ~(NMODIFIED | NFLUSHINPROG);
2706 if (np->n_flag & NFLUSHWANT) {
2707 np->n_flag &= ~NFLUSHWANT;
2708 wakeup((caddr_t)&np->n_flag);
2711 * get the pages out of vm also
2713 if (UBCINFOEXISTS(vp) && (size = ubc_getsize(vp))) {
2714 int rv = ubc_sync_range(vp, 0, size, UBC_PUSHALL | UBC_INVALIDATE);
2716 panic("nfs_vinvalbuf(): ubc_sync_range failed!");
2719 FSDBG_BOT(554, vp, flags, intrflg, 0);
2724 * Initiate asynchronous I/O. Return an error if no nfsiods are available.
2725 * This is mainly to avoid queueing async I/O requests when the nfsiods
2726 * are all hung on a dead server.
2729 nfs_asyncio(bp, cred)
2733 struct nfsmount *nmp;
2739 void *wakeme = NULL;
2742 if (nfs_numasync == 0)
2745 FSDBG_TOP(552, bp, bp ? NBOFF(bp) : 0, bp ? bp->nb_flags : 0, 0);
2747 nmp = ((bp != NULL) ? VFSTONFS(vnode_mount(bp->nb_vp)) : NULL);
2749 if (nmp && nmp->nm_flag & NFSMNT_INT)
2753 lck_mtx_lock(nfs_iod_mutex);
2755 /* no nfsbuf means tell nfsiod to process delwri list */
2760 * Find a free iod to process this request.
2762 for (i = 0; i < NFS_MAXASYNCDAEMON; i++)
2763 if (nfs_iodwant[i]) {
2765 * Found one, so wake it up and tell it which mount to work on.
2768 nfs_iodwant[i] = NULL;
2769 nfs_iodmount[i] = nmp;
2772 wakeme = &nfs_iodwant[i];
2777 /* if we're just poking the delwri list, we're done */
2779 lck_mtx_unlock(nfs_iod_mutex);
2782 FSDBG_BOT(552, bp, 0x10101010, wakeme, 0);
2787 * If none are free, we may already have an iod working on this mount
2788 * point. If so, it will process our request.
2791 if (nmp->nm_bufqiods > 0) {
2797 * If we have an iod which can process the request, then queue the buffer.
2800 FSDBG(552, bp, gotiod, i, nmp->nm_bufqiods);
2803 * Ensure that the queue never grows too large.
2805 while (nmp->nm_bufqlen >= 2*nfs_numasync) {
2806 if (ISSET(bp->nb_flags, NB_IOD)) {
2807 /* An nfsiod is attempting this async operation so */
2808 /* we must not fall asleep on the bufq because we */
2809 /* could be waiting on ourself. Just return error */
2810 /* and we'll do this operation synchronously. */
2813 FSDBG(552, bp, nmp->nm_bufqlen, 2*nfs_numasync, -1);
2814 nmp->nm_bufqwant = TRUE;
2816 ts.tv_sec = (slptimeo/100);
2817 /* hz is 100, so each remaining tick is 10ms */
2818 ts.tv_nsec = (slptimeo % 100) * 10 * NSEC_PER_USEC * 1000;
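/*
 * slptimeo is in clock ticks; with hz=100 each tick is 10ms, so e.g.
 * slptimeo=250 becomes ts = { .tv_sec = 2, .tv_nsec = 500000000 }.
 */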
2820 error = msleep(&nmp->nm_bufq, nfs_iod_mutex, slpflag | PRIBIO,
2823 error2 = nfs_sigintr(nmp, NULL, bp->nb_proc);
2825 lck_mtx_unlock(nfs_iod_mutex);
2826 FSDBG_BOT(552, bp, NBOFF(bp), bp->nb_flags, error2);
2829 if (slpflag == PCATCH) {
2835 * We might have lost our iod while sleeping,
2836 * so check and loop if necessary.
2838 if (nmp->nm_bufqiods == 0) {
2839 lck_mtx_unlock(nfs_iod_mutex);
2844 if (ISSET(bp->nb_flags, NB_READ)) {
2845 if (!IS_VALID_CRED(bp->nb_rcred) && IS_VALID_CRED(cred)) {
2846 kauth_cred_ref(cred);
2847 bp->nb_rcred = cred;
2850 SET(bp->nb_flags, NB_WRITEINPROG);
2851 if (!IS_VALID_CRED(bp->nb_wcred) && IS_VALID_CRED(cred)) {
2852 kauth_cred_ref(cred);
2853 bp->nb_wcred = cred;
2857 TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, nb_free);
2859 lck_mtx_unlock(nfs_iod_mutex);
2862 FSDBG_BOT(552, bp, NBOFF(bp), bp->nb_flags, 0);
2867 lck_mtx_unlock(nfs_iod_mutex);
2869 * All the iods are busy on other mounts, so return EIO to
2870 * force the caller to process the i/o synchronously.
2872 FSDBG_BOT(552, bp, NBOFF(bp), bp->nb_flags, EIO);
2877 * Do an I/O operation to/from a cache block. This may be called
2878 * synchronously or from an nfsiod.
2881 nfs_doio(struct nfsbuf *bp, kauth_cred_t cr, proc_t p)
2886 struct nfsmount *nmp;
2887 int error = 0, diff, len, iomode, invalidate = 0;
2893 vtype = vnode_vtype(vp);
2895 nmp = VFSTONFS(vnode_mount(vp));
2897 uiop->uio_iovs.iov32p = &io;
2898 uiop->uio_iovcnt = 1;
2899 #if 1 /* LP64todo - can't use new segment flags until the drivers are ready */
2900 uiop->uio_segflg = UIO_SYSSPACE;
2902 uiop->uio_segflg = UIO_SYSSPACE32;
2906 * we've decided to perform I/O for this block,
2907 * so we couldn't possibly NB_DONE. So, clear it.
2909 if (ISSET(bp->nb_flags, NB_DONE)) {
2910 if (!ISSET(bp->nb_flags, NB_ASYNC))
2911 panic("nfs_doio: done and not async");
2912 CLR(bp->nb_flags, NB_DONE);
2914 FSDBG_TOP(256, np->n_size, NBOFF(bp), bp->nb_bufsize, bp->nb_flags);
2915 FSDBG(257, bp->nb_validoff, bp->nb_validend, bp->nb_dirtyoff,
2918 if (ISSET(bp->nb_flags, NB_READ)) {
2921 io.iov_len = bp->nb_bufsize;
2922 uio_uio_resid_set(uiop, io.iov_len);
2923 io.iov_base = (uintptr_t) bp->nb_data;
2924 uiop->uio_rw = UIO_READ;
2927 uiop->uio_offset = NBOFF(bp);
2928 OSAddAtomic(1, (SInt32 *)&nfsstats.read_bios);
2929 error = nfs_readrpc(vp, uiop, cr, p);
2930 FSDBG(262, np->n_size, NBOFF(bp), uio_uio_resid(uiop), error);
2932 /* update valid range */
2933 bp->nb_validoff = 0;
2934 if (uio_uio_resid(uiop) != 0) {
2936 * If len > 0, there is a hole in the file and
2937 * no writes after the hole have been pushed to the server yet.
2939 * Just zero fill the rest of the valid area.
2941 // LP64todo - fix this
2942 diff = bp->nb_bufsize - uio_uio_resid(uiop);
2943 len = np->n_size - (NBOFF(bp) + diff);
2945 // LP64todo - fix this
2946 len = min(len, uio_uio_resid(uiop));
2947 bzero((char *)bp->nb_data + diff, len);
2948 bp->nb_validend = diff + len;
2949 FSDBG(258, diff, len, 0, 1);
2951 bp->nb_validend = diff;
2953 bp->nb_validend = bp->nb_bufsize;
2954 bp->nb_valid = (1 << (round_page_32(bp->nb_validend)/PAGE_SIZE)) - 1;
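/*
 * Example, assuming 4KB pages: if nb_validend is 6000, round_page_32()
 * gives 8192, so nb_valid = (1<<2)-1 = 0x3 (pages 0 and 1 marked valid);
 * since 6000 is not page aligned, the tail of page 1 is zero-filled below.
 */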
2955 if (bp->nb_validend & PAGE_MASK) {
2956 /* valid range ends in the middle of a page so we */
2957 /* need to zero-fill any invalid data at the end */
2958 /* of the last page */
2959 bzero((caddr_t)(bp->nb_data + bp->nb_validend),
2960 bp->nb_bufsize - bp->nb_validend);
2961 FSDBG(258, bp->nb_validend,
2962 bp->nb_bufsize - bp->nb_validend, 0, 2);
2967 uiop->uio_offset = (off_t)0;
2968 OSAddAtomic(1, (SInt32 *)&nfsstats.readlink_bios);
2969 error = nfs_readlinkrpc(vp, uiop, cr, p);
2971 bp->nb_validoff = 0;
2972 bp->nb_validend = uiop->uio_offset;
2976 OSAddAtomic(1, (SInt32 *)&nfsstats.readdir_bios);
2977 uiop->uio_offset = NBOFF(bp);
2978 if (!(nmp->nm_flag & NFSMNT_NFSV3))
2979 nmp->nm_flag &= ~NFSMNT_RDIRPLUS; /* dk@farm.org */
2980 if (nmp->nm_flag & NFSMNT_RDIRPLUS) {
2981 error = nfs_readdirplusrpc(vp, uiop, cr, p);
2982 if (error == NFSERR_NOTSUPP)
2983 nmp->nm_flag &= ~NFSMNT_RDIRPLUS;
2985 if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0)
2986 error = nfs_readdirrpc(vp, uiop, cr, p);
2988 bp->nb_validoff = 0;
2989 bp->nb_validend = uiop->uio_offset - NBOFF(bp);
2990 bp->nb_valid = (1 << (round_page_32(bp->nb_validend)/PAGE_SIZE)) - 1;
2994 printf("nfs_doio: type %x unexpected\n", vtype);
2998 SET(bp->nb_flags, NB_ERROR);
2999 bp->nb_error = error;
3003 /* we're doing a write */
3006 /* We need to make sure the pages are locked before doing I/O. */
3007 if (!ISSET(bp->nb_flags, NB_META) && UBCINFOEXISTS(vp)) {
3008 if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
3009 error = nfs_buf_upl_setup(bp);
3011 printf("nfs_doio: upl create failed %d\n", error);
3012 SET(bp->nb_flags, NB_ERROR);
3016 nfs_buf_upl_check(bp);
3020 if (ISSET(bp->nb_flags, NB_WASDIRTY)) {
3021 FSDBG(256, bp, NBOFF(bp), bp->nb_dirty, 0xd00dee);
3023 * There are pages marked dirty that need to be written out.
3025 * We don't want to just combine the write range with the
3026 * range of pages that are dirty because that could cause us
3027 * to write data that wasn't actually written to.
3028 * We also don't want to write data more than once.
3030 * If the dirty range just needs to be committed, we do that.
3031 * Otherwise, we write the dirty range and clear the dirty bits
3032 * for any COMPLETE pages covered by that range.
3033 * If there are dirty pages left after that, we write out the
3034 * parts that we haven't written yet.
3039 * If NB_NEEDCOMMIT is set, a commit rpc may do the trick. If not,
3040 * an actual write will have to be done.
3041 * If NB_WRITEINPROG is already set, then push it with a write anyhow.
3043 if (ISSET(bp->nb_flags, NB_NEEDCOMMIT))
3044 nfs_buf_check_write_verifier(np, bp);
3045 if ((bp->nb_flags & (NB_NEEDCOMMIT | NB_WRITEINPROG)) == NB_NEEDCOMMIT) {
3046 doff = NBOFF(bp) + bp->nb_dirtyoff;
3047 SET(bp->nb_flags, NB_WRITEINPROG);
3048 error = nfs_commit(vp, doff, bp->nb_dirtyend - bp->nb_dirtyoff,
3049 bp->nb_wcred, bp->nb_proc);
3050 CLR(bp->nb_flags, NB_WRITEINPROG);
3052 bp->nb_dirtyoff = bp->nb_dirtyend = 0;
3053 CLR(bp->nb_flags, NB_NEEDCOMMIT);
3054 np->n_needcommitcnt--;
3055 CHECK_NEEDCOMMITCNT(np);
3059 if (!error && bp->nb_dirtyend > 0) {
3060 /* there's a dirty range that needs to be written out */
3062 int firstpg, lastpg;
3064 if (NBOFF(bp) + bp->nb_dirtyend > (off_t)np->n_size)
3065 bp->nb_dirtyend = np->n_size - NBOFF(bp);
3069 doff = bp->nb_dirtyoff;
3070 dend = bp->nb_dirtyend;
3072 /* if doff page is dirty, move doff to start of page */
3073 if (NBPGDIRTY(bp,doff/PAGE_SIZE))
3074 doff -= doff & PAGE_MASK;
3075 /* try to expand write range to include preceding dirty pages */
3076 if (!(doff & PAGE_MASK))
3077 while (doff > 0 && NBPGDIRTY(bp,(doff-1)/PAGE_SIZE))
3079 /* if dend page is dirty, move dend to start of next page */
3080 if ((dend & PAGE_MASK) && NBPGDIRTY(bp,dend/PAGE_SIZE))
3081 dend = round_page_32(dend);
3082 /* try to expand write range to include trailing dirty pages */
3083 if (!(dend & PAGE_MASK))
3084 while (dend < bp->nb_bufsize && NBPGDIRTY(bp,dend/PAGE_SIZE))
3086 /* make sure to keep dend clipped to EOF */
3087 if (NBOFF(bp) + dend > (off_t)np->n_size)
3088 dend = np->n_size - NBOFF(bp);
3089 /* calculate range of complete pages being written */
3090 firstpg = round_page_32(doff) / PAGE_SIZE;
3091 lastpg = (trunc_page_32(dend) - 1)/ PAGE_SIZE;
3092 /* calculate mask for that page range */
3093 pagemask = ((1 << (lastpg+1)) - 1) & ~((1 << firstpg) - 1);
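/*
 * Example, assuming 4KB pages: with doff=0 and dend=8192 after the
 * expansion above, firstpg = round_page_32(0)/4096 = 0 and
 * lastpg = (trunc_page_32(8192)-1)/4096 = 1, so pagemask = 0x3,
 * i.e. pages 0 and 1 are completely covered by this write.
 */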
3095 /* compare page mask to nb_dirty; if there are other dirty pages */
3096 /* then write FILESYNC; otherwise, write UNSTABLE if async and */
3097 /* not needcommit/stable; otherwise write FILESYNC */
3098 if (bp->nb_dirty & ~pagemask)
3099 iomode = NFSV3WRITE_FILESYNC;
3100 else if ((bp->nb_flags & (NB_ASYNC | NB_NEEDCOMMIT | NB_STABLE)) == NB_ASYNC)
3101 iomode = NFSV3WRITE_UNSTABLE;
3103 iomode = NFSV3WRITE_FILESYNC;
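/*
 * An UNSTABLE write lets the server reply before the data reaches stable
 * storage, so the buffer must then be marked NB_NEEDCOMMIT and committed
 * (or rewritten FILESYNC) later, as handled below. Dirty pages outside
 * the range being written force FILESYNC here instead.
 */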
3105 /* write the dirty range */
3106 io.iov_len = dend - doff;
3107 uio_uio_resid_set(uiop, io.iov_len);
3108 uiop->uio_offset = NBOFF(bp) + doff;
3109 io.iov_base = (uintptr_t) bp->nb_data + doff;
3110 uiop->uio_rw = UIO_WRITE;
3112 OSAddAtomic(1, (SInt32 *)&nfsstats.write_bios);
3114 SET(bp->nb_flags, NB_WRITEINPROG);
3115 error = nfs_writerpc(vp, uiop, cr, p, &iomode, &bp->nb_verf);
3116 /* clear dirty bits for pages we've written */
3118 bp->nb_dirty &= ~pagemask;
3119 /* set/clear needcommit flag */
3120 if (!error && iomode == NFSV3WRITE_UNSTABLE) {
3121 if (!ISSET(bp->nb_flags, NB_NEEDCOMMIT))
3122 np->n_needcommitcnt++;
3123 SET(bp->nb_flags, NB_NEEDCOMMIT);
3124 /* make sure nb_dirtyoff/nb_dirtyend reflect actual range written */
3125 bp->nb_dirtyoff = doff;
3126 bp->nb_dirtyend = dend;
3128 if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
3129 np->n_needcommitcnt--;
3130 CHECK_NEEDCOMMITCNT(np);
3132 CLR(bp->nb_flags, NB_NEEDCOMMIT);
3134 CLR(bp->nb_flags, NB_WRITEINPROG);
3136 * For an interrupted write, the buffer is still valid and the write
3137 * hasn't been pushed to the server yet, so we can't set NB_ERROR and
3138 * report the interruption by setting NB_EINTR. For the NB_ASYNC case,
3139 * NB_EINTR is not relevant.
3141 * For the case of a V3 write rpc not being committed to stable
3142 * storage, the block is still dirty and requires either a commit rpc
3143 * or another write rpc with iomode == NFSV3WRITE_FILESYNC before the
3144 * block is reused. This is indicated by setting the NB_DELWRI and
3145 * NB_NEEDCOMMIT flags.
3147 if (error == EINTR || (!error && bp->nb_flags & NB_NEEDCOMMIT)) {
3148 CLR(bp->nb_flags, NB_INVAL);
3149 if (!ISSET(bp->nb_flags, NB_DELWRI)) {
3150 SET(bp->nb_flags, NB_DELWRI);
3151 OSAddAtomic(1, (SInt32 *)&nfs_nbdwrite);
3154 FSDBG(261, bp->nb_validoff, bp->nb_validend,
3157 * Since for the NB_ASYNC case, nfs_bwrite() has
3158 * reassigned the buffer to the clean list, we have to
3159 * reassign it back to the dirty one. Ugh.
3161 if (ISSET(bp->nb_flags, NB_ASYNC)) {
3162 /* move to dirty list */
3163 lck_mtx_lock(nfs_buf_mutex);
3164 if (bp->nb_vnbufs.le_next != NFSNOLIST)
3165 LIST_REMOVE(bp, nb_vnbufs);
3166 LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
3167 lck_mtx_unlock(nfs_buf_mutex);
3169 SET(bp->nb_flags, NB_EINTR);
3172 /* either there's an error or we don't need to commit */
3174 SET(bp->nb_flags, NB_ERROR);
3175 bp->nb_error = np->n_error = error;
3176 np->n_flag |= NWRITEERR;
3178 * There was a write error and we need to
3179 * invalidate attrs and flush buffers in
3180 * order to sync up with the server.
3181 * (if this write was extending the file,
3182 * we may no longer know the correct size)
3184 * But we can't call vinvalbuf while holding
3185 * this buffer busy. Set a flag to do it after
3186 * releasing the buffer.
3188 * Note we can only invalidate in this function
3189 * if this is an async write and so the iodone
3190 * below will release the buffer. Also, we
3191 * shouldn't call vinvalbuf from nfsiod because
3192 * that may deadlock waiting for the completion
3193 * of writes that are queued up behind this one.
3195 if (ISSET(bp->nb_flags, NB_ASYNC) &&
3196 !ISSET(bp->nb_flags, NB_IOD)) {
3199 /* invalidate later */
3200 np->n_flag |= NNEEDINVALIDATE;
3202 NATTRINVALIDATE(np);
3204 /* clear the dirty range */
3205 bp->nb_dirtyoff = bp->nb_dirtyend = 0;
3209 if (!error && bp->nb_dirty) {
3210 /* there are pages marked dirty that need to be written out */
3211 int pg, count, npages, off;
3213 OSAddAtomic(1, (SInt32 *)&nfsstats.write_bios);
3218 * we do these writes synchronously because we can't really
3219 * support the unstable/needcommit method. We could write
3220 * them unstable, clear the dirty bits, and then commit the
3221 * whole block later, but if we need to rewrite the data, we
3222 * won't have any idea which pages were written because that
3223 * info can't be stored in the nb_dirtyoff/nb_dirtyend. We
3224 * also can't leave the dirty bits set because then we wouldn't
3225 * be able to tell if the pages were re-dirtied between the end
3226 * of the write and the commit.
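/*
 * For example, if pages 1, 2 and 5 of the block are dirty, the loop below
 * issues one FILESYNC write for pages 1-2 and another for page 5,
 * clearing each page's dirty bit as it is written (clipped to EOF).
 */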
3228 iomode = NFSV3WRITE_FILESYNC;
3229 uiop->uio_rw = UIO_WRITE;
3231 SET(bp->nb_flags, NB_WRITEINPROG);
3232 npages = bp->nb_bufsize/PAGE_SIZE;
3233 for (pg=0; pg < npages; pg++) {
3234 if (!NBPGDIRTY(bp,pg))
3237 while (((pg+count) < npages) && NBPGDIRTY(bp,pg+count))
3239 /* write count pages starting with page pg */
3240 off = pg * PAGE_SIZE;
3241 len = count * PAGE_SIZE;
3243 /* clip writes to EOF */
3244 if (NBOFF(bp) + off + len > (off_t)np->n_size)
3245 len -= (NBOFF(bp) + off + len) - np->n_size;
3248 uio_uio_resid_set(uiop, io.iov_len);
3249 uiop->uio_offset = NBOFF(bp) + off;
3250 io.iov_base = (uintptr_t) bp->nb_data + off;
3251 error = nfs_writerpc(vp, uiop, cr, p, &iomode, &bp->nb_verf);
3255 /* clear dirty bits */
3257 bp->nb_dirty &= ~(1 << pg);
3258 /* leave pg on last page */
3263 if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
3264 np->n_needcommitcnt--;
3265 CHECK_NEEDCOMMITCNT(np);
3267 CLR(bp->nb_flags, NB_NEEDCOMMIT);
3269 CLR(bp->nb_flags, NB_WRITEINPROG);
3270 FSDBG_BOT(256, bp->nb_validoff, bp->nb_validend, bp->nb_bufsize,
3275 SET(bp->nb_flags, NB_ERROR);
3276 bp->nb_error = error;
3280 FSDBG_BOT(256, bp->nb_validoff, bp->nb_validend, bp->nb_bufsize, error);
3286 * There was a write error and we need to
3287 * invalidate attrs and flush buffers in
3288 * order to sync up with the server.
3289 * (if this write was extending the file,
3290 * we may no longer know the correct size)
3292 * But we couldn't call vinvalbuf while holding
3293 * the buffer busy. So we call vinvalbuf() after
3294 * releasing the buffer.
3296 * Note: we don't bother calling nfs_vinvalbuf() if
3297 * there's already a flush in progress.
3299 if (!(np->n_flag & NFLUSHINPROG))
3300 nfs_vinvalbuf(vp, V_SAVE|V_IGNORE_WRITEERR, cr, p, 1);