/*
 * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)nfs_bio.c	8.9 (Berkeley) 3/30/95
 * FreeBSD-Id: nfs_bio.c,v 1.44 1997/09/10 19:52:25 phk Exp $
 */

#include <nfs/nfs_conf.h>

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/proc_internal.h>
#include <sys/kauth.h>
#include <sys/malloc.h>
#include <sys/vnode.h>
#include <sys/dirent.h>
#include <sys/mount_internal.h>
#include <sys/kernel.h>
#include <sys/ubc_internal.h>
#include <sys/uio_internal.h>
#include <sys/kpi_mbuf.h>

#include <sys/vmparam.h>

#include <kern/clock.h>
#include <libkern/OSAtomic.h>
#include <kern/kalloc.h>
#include <kern/thread_call.h>

#include <nfs/rpcv2.h>
#include <nfs/nfsproto.h>
#include <nfs/nfs_gss.h>
#include <nfs/nfsmount.h>
#include <nfs/nfsnode.h>
#include <sys/buf_internal.h>
#include <libkern/OSAtomic.h>
#include <os/refcnt.h>

#define NFS_BIO_DBG(...) NFS_DBG(NFS_FAC_BIO, 7, ## __VA_ARGS__)

kern_return_t thread_terminate(thread_t); /* XXX */

#define NFSBUFHASH(np, lbn)	\
	(&nfsbufhashtbl[((long)(np) / sizeof(*(np)) + (int)(lbn)) & nfsbufhash])
LIST_HEAD(nfsbufhashhead, nfsbuf) * nfsbufhashtbl;
struct nfsbuffreehead nfsbuffree, nfsbuffreemeta, nfsbufdelwri;
u_long nfsbufhash;
int nfsbufcnt, nfsbufmin, nfsbufmax, nfsbufmetacnt, nfsbufmetamax;
int nfsbuffreecnt, nfsbuffreemetacnt, nfsbufdelwricnt, nfsneedbuffer;

int nfs_buf_timer_on = 0;
thread_t nfsbufdelwrithd = NULL;

ZONE_DECLARE(nfsbuf_zone, "NFS bio", sizeof(struct nfsbuf), ZC_NONE);

lck_grp_t *nfs_buf_lck_grp;
lck_mtx_t *nfs_buf_mutex;

#define NFSBUF_FREE_PERIOD	30	/* seconds */
#define NFSBUF_LRU_STALE	120
#define NFSBUF_META_STALE	240

/* number of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffree list */
#define LRU_TO_FREEUP			6
/* number of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffreemeta list */
#define META_TO_FREEUP			3
/* total number of nfsbufs nfs_buf_freeup() should attempt to free */
#define TOTAL_TO_FREEUP			(LRU_TO_FREEUP+META_TO_FREEUP)
/* fraction of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffree list when called from timer */
#define LRU_FREEUP_FRAC_ON_TIMER	8
/* fraction of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffreemeta list when called from timer */
#define META_FREEUP_FRAC_ON_TIMER	16
/* fraction of total nfsbufs that nfsbuffreecnt should exceed before bothering to call nfs_buf_freeup() */
#define LRU_FREEUP_MIN_FRAC		4
/* fraction of total nfsbufs that nfsbuffreemetacnt should exceed before bothering to call nfs_buf_freeup() */
#define META_FREEUP_MIN_FRAC		2

#define NFS_ROUND_BLOCK(p, blksize) ((((uint64_t)(p) + blksize - 1) & ~((uint64_t)blksize - 1)) / blksize)

#define NFS_BUF_FREEUP() \
	do { \
	/* only call nfs_buf_freeup() if it has work to do: */ \
		if (((nfsbuffreecnt > nfsbufcnt/LRU_FREEUP_MIN_FRAC) || \
		    (nfsbuffreemetacnt > nfsbufcnt/META_FREEUP_MIN_FRAC)) && \
		    ((nfsbufcnt - TOTAL_TO_FREEUP) > nfsbufmin)) \
			nfs_buf_freeup(0); \
	} while (0)
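
/*
 * Illustration of the thresholds above (the counts here are hypothetical,
 * not defaults taken from this file): with nfsbufcnt == 1024 and
 * nfsbufmin == 128, NFS_BUF_FREEUP() only calls nfs_buf_freeup() once more
 * than 1024/4 == 256 buffers sit on nfsbuffree (or more than
 * 1024/2 == 512 on nfsbuffreemeta), and freeing TOTAL_TO_FREEUP == 9
 * buffers would still leave more than nfsbufmin.  When the periodic timer
 * fires instead, nfs_buf_freeup(1) attempts to free up to nfsbuffreecnt/8
 * LRU buffers and nfsbuffreemetacnt/16 meta buffers.
 */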

void
nfs_buf_pgs_get_page_mask(nfsbufpgs *nfsbp, off_t page)
{
	off_t page_pos = page / NBPGS_ELEMENT_PAGES;
	off_t max_page = NBPGS_STRUCT_SIZE * 8;
	NBPGS_ERASE(nfsbp);

	if (page >= max_page) {
		nfs_buf_pgs_bit_not(nfsbp);
		return;
	}

	NBPGS_SET(nfsbp, page);
	nfsbp->pages[page_pos]--;
	for (off_t i = page_pos - 1; i >= 0; i--) {
		nfsbp->pages[i] = ~0;
	}
}

void
nfs_buf_pgs_bit_not(nfsbufpgs *nfsbp)
{
	for (uint32_t i = 0; i < NBPGS_ELEMENTS; i++) {
		nfsbp->pages[i] = ~nfsbp->pages[i];
	}
}

void
nfs_buf_pgs_bit_and(nfsbufpgs *nfsbp_src1, nfsbufpgs *nfsbp_src2, nfsbufpgs *nfsbp_dst)
{
	for (uint32_t i = 0; i < NBPGS_ELEMENTS; i++) {
		nfsbp_dst->pages[i] = nfsbp_src1->pages[i] & nfsbp_src2->pages[i];
	}
}

void
nfs_buf_pgs_set_pages_between(nfsbufpgs *nfsbp, off_t firstpg, off_t lastpg)
{
	nfsbufpgs pagemaskfirst, pagemasklast;

	nfs_buf_pgs_get_page_mask(&pagemasklast, lastpg);
	nfs_buf_pgs_get_page_mask(&pagemaskfirst, firstpg);
	nfs_buf_pgs_bit_not(&pagemaskfirst);
	nfs_buf_pgs_bit_and(&pagemaskfirst, &pagemasklast, nfsbp);
}

int
nfs_buf_pgs_is_set(nfsbufpgs *nfsbp)
{
	for (uint32_t i = 0; i < NBPGS_ELEMENTS; i++) {
		if (nfsbp->pages[i] != 0) {
			return 1;
		}
	}
	return 0;
}
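
/*
 * Semantics of the page-bitmap helpers above (illustration only, assuming
 * NBPGS_ELEMENT_PAGES is the number of page bits stored per array element):
 *   nfs_buf_pgs_get_page_mask(&m, n)        -> m has bits [0, n) set
 *   nfs_buf_pgs_set_pages_between(&m, f, l) -> m has bits [f, l) set
 *     (it ANDs the complement of mask(f) with mask(l))
 *   nfs_buf_pgs_is_set(&m)                  -> nonzero if any bit is set
 */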

/*
 * Initialize nfsbuf lists
 */
void
nfs_nbinit(void)
{
	nfs_buf_lck_grp = lck_grp_alloc_init("nfs_buf", LCK_GRP_ATTR_NULL);
	nfs_buf_mutex = lck_mtx_alloc_init(nfs_buf_lck_grp, LCK_ATTR_NULL);

	nfsbufcnt = nfsbufmetacnt =
	    nfsbuffreecnt = nfsbuffreemetacnt = nfsbufdelwricnt = 0;
	/* size nfsbufmax to cover at most half sane_size (w/default buf size) */
	nfsbufmax = (int)(sane_size >> PAGE_SHIFT) / (2 * (NFS_RWSIZE >> PAGE_SHIFT));
	nfsbufmetamax = nfsbufmax / 4;

	nfsbufhashtbl = hashinit(nfsbufmax / 4, M_NFSBIO, &nfsbufhash);
	TAILQ_INIT(&nfsbuffree);
	TAILQ_INIT(&nfsbuffreemeta);
	TAILQ_INIT(&nfsbufdelwri);
}

/*
 * Check periodically for stale/unused nfs bufs
 */
void
nfs_buf_timer(__unused void *param0, __unused void *param1)
{
	nfs_buf_freeup(1);

	lck_mtx_lock(nfs_buf_mutex);
	if (nfsbufcnt <= nfsbufmin) {
		nfs_buf_timer_on = 0;
		lck_mtx_unlock(nfs_buf_mutex);
		return;
	}
	lck_mtx_unlock(nfs_buf_mutex);

	nfs_interval_timer_start(nfs_buf_timer_call,
	    NFSBUF_FREE_PERIOD * 1000);
}

/*
 * try to free up some excess, unused nfsbufs
 */
void
nfs_buf_freeup(int timer)
{
	struct nfsbuf *fbp;
	struct timeval now;
	int count;
	struct nfsbuffreehead nfsbuffreeup;

	TAILQ_INIT(&nfsbuffreeup);

	lck_mtx_lock(nfs_buf_mutex);

	microuptime(&now);

	FSDBG(320, nfsbufcnt, nfsbuffreecnt, nfsbuffreemetacnt, 0);

	count = timer ? nfsbuffreecnt / LRU_FREEUP_FRAC_ON_TIMER : LRU_TO_FREEUP;
	while ((nfsbufcnt > nfsbufmin) && (count-- > 0)) {
		fbp = TAILQ_FIRST(&nfsbuffree);
		if (!fbp) {
			break;
		}
		if (os_ref_get_count(&fbp->nb_refs) > 1) {
			break;
		}
		if (NBUFSTAMPVALID(fbp) &&
		    (fbp->nb_timestamp + (2 * NFSBUF_LRU_STALE)) > now.tv_sec) {
			break;
		}
		nfs_buf_remfree(fbp);
		/* disassociate buffer from any nfsnode */
		if (fbp->nb_vnbufs.le_next != NFSNOLIST) {
			LIST_REMOVE(fbp, nb_vnbufs);
			fbp->nb_vnbufs.le_next = NFSNOLIST;
		}
		LIST_REMOVE(fbp, nb_hash);
		TAILQ_INSERT_TAIL(&nfsbuffreeup, fbp, nb_free);
		nfsbufcnt--;
	}

	count = timer ? nfsbuffreemetacnt / META_FREEUP_FRAC_ON_TIMER : META_TO_FREEUP;
	while ((nfsbufcnt > nfsbufmin) && (count-- > 0)) {
		fbp = TAILQ_FIRST(&nfsbuffreemeta);
		if (!fbp) {
			break;
		}
		if (os_ref_get_count(&fbp->nb_refs) > 1) {
			break;
		}
		if (NBUFSTAMPVALID(fbp) &&
		    (fbp->nb_timestamp + (2 * NFSBUF_META_STALE)) > now.tv_sec) {
			break;
		}
		nfs_buf_remfree(fbp);
		/* disassociate buffer from any nfsnode */
		if (fbp->nb_vnbufs.le_next != NFSNOLIST) {
			LIST_REMOVE(fbp, nb_vnbufs);
			fbp->nb_vnbufs.le_next = NFSNOLIST;
		}
		LIST_REMOVE(fbp, nb_hash);
		TAILQ_INSERT_TAIL(&nfsbuffreeup, fbp, nb_free);
		nfsbufcnt--;
		nfsbufmetacnt--;
	}

	FSDBG(320, nfsbufcnt, nfsbuffreecnt, nfsbuffreemetacnt, 0);

	lck_mtx_unlock(nfs_buf_mutex);

	while ((fbp = TAILQ_FIRST(&nfsbuffreeup))) {
		TAILQ_REMOVE(&nfsbuffreeup, fbp, nb_free);
		if (IS_VALID_CRED(fbp->nb_rcred)) {
			kauth_cred_unref(&fbp->nb_rcred);
		}
		if (IS_VALID_CRED(fbp->nb_wcred)) {
			kauth_cred_unref(&fbp->nb_wcred);
		}
		/* if buf was NB_META, dump buffer */
		if (ISSET(fbp->nb_flags, NB_META) && fbp->nb_data) {
			kheap_free(KHEAP_DATA_BUFFERS, fbp->nb_data, fbp->nb_bufsize);
		}
		NFS_ZFREE(nfsbuf_zone, fbp);
	}
}
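
/*
 * Note the two-phase structure above: buffers are unhooked from the hash
 * chain and free lists while nfs_buf_mutex is held and collected on the
 * local nfsbuffreeup list; only after the mutex is dropped are the
 * credential releases, kheap_free() and NFS_ZFREE() performed, so none of
 * those potentially blocking operations happen under the global buffer lock.
 */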

/*
 * remove a buffer from the freelist
 * (must be called with nfs_buf_mutex held)
 */
void
nfs_buf_remfree(struct nfsbuf *bp)
{
	if (bp->nb_free.tqe_next == NFSNOLIST) {
		panic("nfsbuf not on free list");
	}
	if (ISSET(bp->nb_flags, NB_DELWRI)) {
		nfsbufdelwricnt--;
		TAILQ_REMOVE(&nfsbufdelwri, bp, nb_free);
	} else if (ISSET(bp->nb_flags, NB_META)) {
		nfsbuffreemetacnt--;
		TAILQ_REMOVE(&nfsbuffreemeta, bp, nb_free);
	} else {
		nfsbuffreecnt--;
		TAILQ_REMOVE(&nfsbuffree, bp, nb_free);
	}
	bp->nb_free.tqe_next = NFSNOLIST;
}

/*
 * check for existence of nfsbuf in cache
 */
boolean_t
nfs_buf_is_incore(nfsnode_t np, daddr64_t blkno)
{
	boolean_t rv;
	lck_mtx_lock(nfs_buf_mutex);
	if (nfs_buf_incore(np, blkno)) {
		rv = TRUE;
	} else {
		rv = FALSE;
	}
	lck_mtx_unlock(nfs_buf_mutex);
	return rv;
}

/*
 * return incore buffer (must be called with nfs_buf_mutex held)
 */
struct nfsbuf *
nfs_buf_incore(nfsnode_t np, daddr64_t blkno)
{
	/* Search hash chain */
	struct nfsbuf * bp = NFSBUFHASH(np, blkno)->lh_first;
	for (; bp != NULL; bp = bp->nb_hash.le_next) {
		if ((bp->nb_lblkno == blkno) && (bp->nb_np == np)) {
			if (!ISSET(bp->nb_flags, NB_INVAL)) {
				FSDBG(547, bp, blkno, bp->nb_flags, bp->nb_np);
				return bp;
			}
		}
	}
	return NULL;
}

/*
 * Check if it's OK to drop a page.
 *
 * Called by vnode_pager() on pageout request of non-dirty page.
 * We need to make sure that it's not part of a delayed write.
 * If it is, we can't let the VM drop it because we may need it
 * later when/if we need to write the data (again).
 */
int
nfs_buf_page_inval(vnode_t vp, off_t offset)
{
	struct nfsmount *nmp = VTONMP(vp);
	struct nfsbuf *bp;
	int error = 0;

	if (nfs_mount_gone(nmp)) {
		return ENXIO;
	}

	lck_mtx_lock(nfs_buf_mutex);
	bp = nfs_buf_incore(VTONFS(vp), (daddr64_t)(offset / nmp->nm_biosize));
	if (!bp) {
		goto out;
	}
	FSDBG(325, bp, bp->nb_flags, bp->nb_dirtyoff, bp->nb_dirtyend);
	if (ISSET(bp->nb_lflags, NBL_BUSY)) {
		error = EBUSY;
		goto out;
	}
	/*
	 * If there's a dirty range in the buffer, check to
	 * see if this page intersects with the dirty range.
	 * If it does, we can't let the pager drop the page.
	 */
	if (bp->nb_dirtyend > 0) {
		off_t start = offset - NBOFF(bp);
		if ((bp->nb_dirtyend > start) &&
		    (bp->nb_dirtyoff < (start + PAGE_SIZE))) {
			/*
			 * Before returning the bad news, move the
			 * buffer to the start of the delwri list and
			 * give the list a push to try to flush the
			 * buffer out.
			 */
			error = EBUSY;
			nfs_buf_remfree(bp);
			TAILQ_INSERT_HEAD(&nfsbufdelwri, bp, nb_free);
			nfsbufdelwricnt++;
			nfs_buf_delwri_push(1);
		}
	}
out:
	lck_mtx_unlock(nfs_buf_mutex);
	return error;
}
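
/*
 * Example (hypothetical sizes, not values read from this file): with
 * nm_biosize == 32KB, a pageout request for file offset 0x12000 maps to
 * logical block 0x12000 / 0x8000 == 2, and only the PAGE_SIZE window
 * starting at (offset - NBOFF(bp)) is tested against the buffer's
 * [nb_dirtyoff, nb_dirtyend) range.
 */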

/*
 * set up the UPL for a buffer
 * (must NOT be called with nfs_buf_mutex held)
 */
int
nfs_buf_upl_setup(struct nfsbuf *bp)
{
	kern_return_t kret;
	upl_t upl;
	int upl_flags;

	if (ISSET(bp->nb_flags, NB_PAGELIST)) {
		return 0;
	}

	upl_flags = UPL_PRECIOUS;
	if (!ISSET(bp->nb_flags, NB_READ)) {
		/*
		 * We're doing a "write", so we intend to modify
		 * the pages we're gathering.
		 */
		upl_flags |= UPL_WILL_MODIFY;
	}
	kret = ubc_create_upl_kernel(NFSTOV(bp->nb_np), NBOFF(bp), bp->nb_bufsize,
	    &upl, NULL, upl_flags, VM_KERN_MEMORY_FILE);
	if (kret == KERN_INVALID_ARGUMENT) {
		/* vm object probably doesn't exist any more */
		bp->nb_pagelist = NULL;
		return EINVAL;
	}
	if (kret != KERN_SUCCESS) {
		printf("nfs_buf_upl_setup(): failed to get pagelist %d\n", kret);
		bp->nb_pagelist = NULL;
		return EIO;
	}

	FSDBG(538, bp, NBOFF(bp), bp->nb_bufsize, bp->nb_np);

	bp->nb_pagelist = upl;
	SET(bp->nb_flags, NB_PAGELIST);
	return 0;
}

/*
 * update buffer's valid/dirty info from UBC
 * (must NOT be called with nfs_buf_mutex held)
 */
void
nfs_buf_upl_check(struct nfsbuf *bp)
{
	upl_page_info_t *pl;
	off_t filesize, fileoffset;
	int i, npages;

	if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
		return;
	}

	npages = round_page_32(bp->nb_bufsize) / PAGE_SIZE;
	filesize = ubc_getsize(NFSTOV(bp->nb_np));
	fileoffset = NBOFF(bp);
	if (fileoffset < filesize) {
		SET(bp->nb_flags, NB_CACHE);
	} else {
		CLR(bp->nb_flags, NB_CACHE);
	}

	pl = ubc_upl_pageinfo(bp->nb_pagelist);
	NBPGS_ERASE(&bp->nb_valid);
	NBPGS_ERASE(&bp->nb_dirty);

	for (i = 0; i < npages; i++, fileoffset += PAGE_SIZE_64) {
		/* anything beyond the end of the file is not valid or dirty */
		if (fileoffset >= filesize) {
			break;
		}
		if (!upl_valid_page(pl, i)) {
			CLR(bp->nb_flags, NB_CACHE);
			continue;
		}
		NBPGVALID_SET(bp, i);
		if (upl_dirty_page(pl, i)) {
			NBPGDIRTY_SET(bp, i);
		}
	}
	fileoffset = NBOFF(bp);
	if (ISSET(bp->nb_flags, NB_CACHE)) {
		bp->nb_validoff = 0;
		bp->nb_validend = bp->nb_bufsize;
		if (fileoffset + bp->nb_validend > filesize) {
			bp->nb_validend = filesize - fileoffset;
		}
	} else {
		bp->nb_validoff = bp->nb_validend = -1;
	}
	FSDBG(539, bp, fileoffset, bp->nb_valid, bp->nb_dirty);
	FSDBG(539, bp->nb_validoff, bp->nb_validend, bp->nb_dirtyoff, bp->nb_dirtyend);
}

/*
 * make sure that a buffer is mapped
 * (must NOT be called with nfs_buf_mutex held)
 */
int
nfs_buf_map(struct nfsbuf *bp)
{
	kern_return_t kret;

	if (bp->nb_data) {
		return 0;
	}
	if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
		return EINVAL;
	}

	kret = ubc_upl_map(bp->nb_pagelist, (vm_offset_t *)&(bp->nb_data));
	if (kret != KERN_SUCCESS) {
		panic("nfs_buf_map: ubc_upl_map() failed with (%d)", kret);
	}
	if (bp->nb_data == 0) {
		panic("ubc_upl_map mapped 0");
	}
	FSDBG(540, bp, bp->nb_flags, NBOFF(bp), bp->nb_data);
	return 0;
}

/*
 * normalize an nfsbuf's valid range
 *
 * the read/write code guarantees that we'll always have a valid
 * region that is an integral number of pages.  If either end
 * of the valid range isn't page-aligned, it gets corrected
 * here as we extend the valid range through all of the
 * contiguous valid pages.
 */
void
nfs_buf_normalize_valid_range(nfsnode_t np, struct nfsbuf *bp)
{
	off_t pg, npg;
	/* pull validoff back to start of contiguous valid page range */
	pg = bp->nb_validoff / PAGE_SIZE;
	while (pg >= 0 && NBPGVALID(bp, pg)) {
		pg--;
	}
	bp->nb_validoff = (pg + 1) * PAGE_SIZE;
	/* push validend forward to end of contiguous valid page range */
	npg = bp->nb_bufsize / PAGE_SIZE;
	pg = bp->nb_validend / PAGE_SIZE;
	while (pg < npg && NBPGVALID(bp, pg)) {
		pg++;
	}
	bp->nb_validend = pg * PAGE_SIZE;
	if (NBOFF(bp) + bp->nb_validend > (off_t)np->n_size) {
		bp->nb_validend = np->n_size % bp->nb_bufsize;
	}
}
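
/*
 * Worked example (hypothetical values, 4KB pages): if nb_validoff == 0x1200
 * and nb_validend == 0x2e00 while pages 0 through 3 of the buffer are valid,
 * the loops above pull nb_validoff back to 0x0 and push nb_validend forward
 * to 0x4000; nb_validend is then clipped to np->n_size % nb_bufsize if the
 * buffer extends past EOF.
 */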

/*
 * process some entries on the delayed write queue
 * (must be called with nfs_buf_mutex held)
 */
void
nfs_buf_delwri_service(void)
{
	struct nfsbuf *bp;
	nfsnode_t np;
	int error, i = 0;

	while (i < 8 && (bp = TAILQ_FIRST(&nfsbufdelwri)) != NULL) {
		np = bp->nb_np;
		nfs_buf_remfree(bp);
		nfs_buf_refget(bp);
		while ((error = nfs_buf_acquire(bp, 0, 0, 0)) == EAGAIN) {
			;
		}
		nfs_buf_refrele(bp);
		if (error) {
			break;
		}
		if (!bp->nb_np) {
			/* buffer is no longer valid */
			nfs_buf_drop(bp);
			continue;
		}
		if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
			nfs_buf_check_write_verifier(np, bp);
		}
		if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
			/* put buffer at end of delwri list */
			TAILQ_INSERT_TAIL(&nfsbufdelwri, bp, nb_free);
			nfsbufdelwricnt++;
			nfs_buf_drop(bp);
			lck_mtx_unlock(nfs_buf_mutex);
			nfs_flushcommits(np, 1);
		} else {
			SET(bp->nb_flags, NB_ASYNC);
			lck_mtx_unlock(nfs_buf_mutex);
			nfs_buf_write(bp);
		}
		i++;
		lck_mtx_lock(nfs_buf_mutex);
	}
}

/*
 * thread to service the delayed write queue when asked
 */
void
nfs_buf_delwri_thread(__unused void *arg, __unused wait_result_t wr)
{
	struct timespec ts = { .tv_sec = 30, .tv_nsec = 0 };
	int error = 0;

	lck_mtx_lock(nfs_buf_mutex);
	while (!error) {
		nfs_buf_delwri_service();
		error = msleep(&nfsbufdelwrithd, nfs_buf_mutex, 0, "nfsbufdelwri", &ts);
	}
	nfsbufdelwrithd = NULL;
	lck_mtx_unlock(nfs_buf_mutex);
	thread_terminate(nfsbufdelwrithd);
}

/*
 * try to push out some delayed/uncommitted writes
 * ("locked" indicates whether nfs_buf_mutex is already held)
 */
void
nfs_buf_delwri_push(int locked)
{
	if (TAILQ_EMPTY(&nfsbufdelwri)) {
		return;
	}
	if (!locked) {
		lck_mtx_lock(nfs_buf_mutex);
	}
	/* wake up the delayed write service thread */
	if (nfsbufdelwrithd) {
		wakeup(&nfsbufdelwrithd);
	} else if (kernel_thread_start(nfs_buf_delwri_thread, NULL, &nfsbufdelwrithd) == KERN_SUCCESS) {
		thread_deallocate(nfsbufdelwrithd);
	}
	/* otherwise, try to do some of the work ourselves */
	if (!nfsbufdelwrithd) {
		nfs_buf_delwri_service();
	}
	if (!locked) {
		lck_mtx_unlock(nfs_buf_mutex);
	}
}
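
/*
 * How the pieces above fit together: writers that pile up more than
 * NFS_A_LOT_OF_DELAYED_WRITES delayed writes, and nfs_buf_page_inval() when
 * the pager wants to drop a page overlapping a dirty range, call
 * nfs_buf_delwri_push(); that wakes (or spawns) nfs_buf_delwri_thread(),
 * which loops in nfs_buf_delwri_service() turning queued delayed writes
 * into commits (nfs_flushcommits) or async writes.
 */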
723 * Returns errno on error, 0 otherwise.
724 * Any buffer is returned in *bpp.
726 * If NBLK_ONLYVALID is set, only return buffer if found in cache.
727 * If NBLK_NOWAIT is set, don't wait for the buffer if it's marked BUSY.
729 * Check for existence of buffer in cache.
730 * Or attempt to reuse a buffer from one of the free lists.
731 * Or allocate a new buffer if we haven't already hit max allocation.
732 * Or wait for a free buffer.
734 * If available buffer found, prepare it, and return it.
736 * If the calling process is interrupted by a signal for
737 * an interruptible mount point, return EINTR.
748 vnode_t vp
= NFSTOV(np
);
749 struct nfsmount
*nmp
= VTONMP(vp
);
752 int slpflag
= PCATCH
;
753 int operation
= (flags
& NBLK_OPMASK
);
757 FSDBG_TOP(541, np
, blkno
, size
, flags
);
761 if (bufsize
> NFS_MAXBSIZE
) {
762 panic("nfs_buf_get: buffer larger than NFS_MAXBSIZE requested");
765 if (nfs_mount_gone(nmp
)) {
766 FSDBG_BOT(541, np
, blkno
, 0, ENXIO
);
770 if (!UBCINFOEXISTS(vp
)) {
771 operation
= NBLK_META
;
772 } else if (bufsize
< (uint32_t)nmp
->nm_biosize
) {
773 /* reg files should always have biosize blocks */
774 bufsize
= nmp
->nm_biosize
;
777 /* if NBLK_WRITE, check for too many delayed/uncommitted writes */
778 if ((operation
== NBLK_WRITE
) && (nfs_nbdwrite
> NFS_A_LOT_OF_DELAYED_WRITES
)) {
779 FSDBG_TOP(542, np
, blkno
, nfs_nbdwrite
, NFS_A_LOT_OF_DELAYED_WRITES
);
781 /* poke the delwri list */
782 nfs_buf_delwri_push(0);
784 /* sleep to let other threads run... */
785 tsleep(&nfs_nbdwrite
, PCATCH
, "nfs_nbdwrite", 1);
786 FSDBG_BOT(542, np
, blkno
, nfs_nbdwrite
, NFS_A_LOT_OF_DELAYED_WRITES
);
790 lck_mtx_lock(nfs_buf_mutex
);
792 /* wait for any buffer invalidation/flushing to complete */
793 while (np
->n_bflag
& NBINVALINPROG
) {
794 np
->n_bflag
|= NBINVALWANT
;
797 msleep(&np
->n_bflag
, nfs_buf_mutex
, slpflag
, "nfs_buf_get_invalwait", &ts
);
798 if ((error
= nfs_sigintr(VTONMP(vp
), NULL
, thd
, 0))) {
799 lck_mtx_unlock(nfs_buf_mutex
);
800 FSDBG_BOT(541, np
, blkno
, 0, error
);
803 if (np
->n_bflag
& NBINVALINPROG
) {
808 /* check for existence of nfsbuf in cache */
809 if ((bp
= nfs_buf_incore(np
, blkno
))) {
810 /* if busy, set wanted and wait */
811 if (ISSET(bp
->nb_lflags
, NBL_BUSY
)) {
812 if (flags
& NBLK_NOWAIT
) {
813 lck_mtx_unlock(nfs_buf_mutex
);
814 FSDBG_BOT(541, np
, blkno
, bp
, 0xbcbcbcbc);
817 FSDBG_TOP(543, np
, blkno
, bp
, bp
->nb_flags
);
818 SET(bp
->nb_lflags
, NBL_WANTED
);
822 msleep(bp
, nfs_buf_mutex
, slpflag
| (PRIBIO
+ 1) | PDROP
,
823 "nfsbufget", (slpflag
== PCATCH
) ? NULL
: &ts
);
825 FSDBG_BOT(543, np
, blkno
, bp
, bp
->nb_flags
);
826 if ((error
= nfs_sigintr(VTONMP(vp
), NULL
, thd
, 0))) {
827 FSDBG_BOT(541, np
, blkno
, 0, error
);
832 if (bp
->nb_bufsize
!= bufsize
) {
833 panic("nfsbuf size mismatch");
835 SET(bp
->nb_lflags
, NBL_BUSY
);
836 SET(bp
->nb_flags
, NB_CACHE
);
838 /* additional paranoia: */
839 if (ISSET(bp
->nb_flags
, NB_PAGELIST
)) {
840 panic("pagelist buffer was not busy");
845 if (flags
& NBLK_ONLYVALID
) {
846 lck_mtx_unlock(nfs_buf_mutex
);
847 FSDBG_BOT(541, np
, blkno
, 0, 0x0000cace);
852 * where to get a free buffer:
853 * - if meta and maxmeta reached, must reuse meta
854 * - alloc new if we haven't reached min bufs
855 * - if free lists are NOT empty
856 * - if free list is stale, use it
857 * - else if freemeta list is stale, use it
858 * - else if max bufs allocated, use least-time-to-stale
859 * - alloc new if we haven't reached max allowed
860 * - start clearing out delwri list and try again
863 if ((operation
== NBLK_META
) && (nfsbufmetacnt
>= nfsbufmetamax
)) {
864 /* if we've hit max meta buffers, must reuse a meta buffer */
865 bp
= TAILQ_FIRST(&nfsbuffreemeta
);
866 } else if ((nfsbufcnt
> nfsbufmin
) &&
867 (!TAILQ_EMPTY(&nfsbuffree
) || !TAILQ_EMPTY(&nfsbuffreemeta
))) {
868 /* try to pull an nfsbuf off a free list */
869 struct nfsbuf
*lrubp
, *metabp
;
873 /* if the next LRU or META buffer is invalid or stale, use it */
874 lrubp
= TAILQ_FIRST(&nfsbuffree
);
875 if (lrubp
&& (!NBUFSTAMPVALID(lrubp
) ||
876 ((lrubp
->nb_timestamp
+ NFSBUF_LRU_STALE
) < now
.tv_sec
))) {
879 metabp
= TAILQ_FIRST(&nfsbuffreemeta
);
880 if (!bp
&& metabp
&& (!NBUFSTAMPVALID(metabp
) ||
881 ((metabp
->nb_timestamp
+ NFSBUF_META_STALE
) < now
.tv_sec
))) {
885 if (!bp
&& (nfsbufcnt
>= nfsbufmax
)) {
886 /* we've already allocated all bufs, so */
887 /* choose the buffer that'll go stale first */
893 time_t lru_stale_time
, meta_stale_time
;
894 lru_stale_time
= lrubp
->nb_timestamp
+ NFSBUF_LRU_STALE
;
895 meta_stale_time
= metabp
->nb_timestamp
+ NFSBUF_META_STALE
;
896 if (lru_stale_time
<= meta_stale_time
) {
906 /* we have a buffer to reuse */
907 FSDBG(544, np
, blkno
, bp
, bp
->nb_flags
);
909 if (ISSET(bp
->nb_flags
, NB_DELWRI
)) {
910 panic("nfs_buf_get: delwri");
912 SET(bp
->nb_lflags
, NBL_BUSY
);
913 /* disassociate buffer from previous nfsnode */
915 if (bp
->nb_vnbufs
.le_next
!= NFSNOLIST
) {
916 LIST_REMOVE(bp
, nb_vnbufs
);
917 bp
->nb_vnbufs
.le_next
= NFSNOLIST
;
921 LIST_REMOVE(bp
, nb_hash
);
922 /* nuke any creds we're holding */
923 if (IS_VALID_CRED(bp
->nb_rcred
)) {
924 kauth_cred_unref(&bp
->nb_rcred
);
926 if (IS_VALID_CRED(bp
->nb_wcred
)) {
927 kauth_cred_unref(&bp
->nb_wcred
);
929 /* if buf will no longer be NB_META, dump old buffer */
930 if (operation
== NBLK_META
) {
931 if (!ISSET(bp
->nb_flags
, NB_META
)) {
934 } else if (ISSET(bp
->nb_flags
, NB_META
)) {
936 kheap_free(KHEAP_DATA_BUFFERS
, bp
->nb_data
, bp
->nb_bufsize
);
941 /* re-init buf fields */
943 bp
->nb_validoff
= bp
->nb_validend
= -1;
944 bp
->nb_dirtyoff
= bp
->nb_dirtyend
= 0;
945 NBPGS_ERASE(&bp
->nb_valid
);
946 NBPGS_ERASE(&bp
->nb_dirty
);
949 /* no buffer to reuse */
950 if ((nfsbufcnt
< nfsbufmax
) &&
951 ((operation
!= NBLK_META
) || (nfsbufmetacnt
< nfsbufmetamax
))) {
952 /* just alloc a new one */
953 bp
= zalloc(nfsbuf_zone
);
957 * If any excess bufs, make sure the timer
958 * is running to free them up later.
960 if (nfsbufcnt
> nfsbufmin
&& !nfs_buf_timer_on
) {
961 nfs_buf_timer_on
= 1;
962 nfs_interval_timer_start(nfs_buf_timer_call
,
963 NFSBUF_FREE_PERIOD
* 1000);
966 if (operation
== NBLK_META
) {
971 bzero(bp
, sizeof(*bp
));
972 os_ref_init(&bp
->nb_refs
, NULL
);
974 bp
->nb_free
.tqe_next
= NFSNOLIST
;
975 bp
->nb_validoff
= bp
->nb_validend
= -1;
976 FSDBG(545, np
, blkno
, bp
, 0);
978 /* too many bufs... wait for buffers to free up */
979 FSDBG_TOP(546, np
, blkno
, nfsbufcnt
, nfsbufmax
);
981 /* poke the delwri list */
982 nfs_buf_delwri_push(1);
985 msleep(&nfsneedbuffer
, nfs_buf_mutex
, PCATCH
| PDROP
, "nfsbufget", NULL
);
986 FSDBG_BOT(546, np
, blkno
, nfsbufcnt
, nfsbufmax
);
987 if ((error
= nfs_sigintr(VTONMP(vp
), NULL
, thd
, 0))) {
988 FSDBG_BOT(541, np
, blkno
, 0, error
);
996 SET(bp
->nb_lflags
, NBL_BUSY
);
998 bp
->nb_lblkno
= blkno
;
999 /* insert buf in hash */
1000 LIST_INSERT_HEAD(NFSBUFHASH(np
, blkno
), bp
, nb_hash
);
1001 /* associate buffer with new nfsnode */
1003 LIST_INSERT_HEAD(&np
->n_cleanblkhd
, bp
, nb_vnbufs
);
1008 lck_mtx_unlock(nfs_buf_mutex
);
1010 switch (operation
) {
1012 SET(bp
->nb_flags
, NB_META
);
1013 if ((bp
->nb_bufsize
!= bufsize
) && bp
->nb_data
) {
1014 kheap_free(KHEAP_DATA_BUFFERS
, bp
->nb_data
, bp
->nb_bufsize
);
1016 bp
->nb_validoff
= bp
->nb_validend
= -1;
1017 bp
->nb_dirtyoff
= bp
->nb_dirtyend
= 0;
1018 NBPGS_ERASE(&bp
->nb_valid
);
1019 NBPGS_ERASE(&bp
->nb_dirty
);
1020 CLR(bp
->nb_flags
, NB_CACHE
);
1023 bp
->nb_data
= kheap_alloc(KHEAP_DATA_BUFFERS
,
1027 /* Ack! couldn't allocate the data buffer! */
1028 /* clean up buffer and return error */
1029 lck_mtx_lock(nfs_buf_mutex
);
1030 LIST_REMOVE(bp
, nb_vnbufs
);
1031 bp
->nb_vnbufs
.le_next
= NFSNOLIST
;
1033 /* invalidate usage timestamp to allow immediate freeing */
1034 NBUFSTAMPINVALIDATE(bp
);
1035 if (bp
->nb_free
.tqe_next
!= NFSNOLIST
) {
1036 panic("nfsbuf on freelist");
1038 TAILQ_INSERT_HEAD(&nfsbuffree
, bp
, nb_free
);
1040 lck_mtx_unlock(nfs_buf_mutex
);
1041 FSDBG_BOT(541, np
, blkno
, 0xb00, ENOMEM
);
1044 bp
->nb_bufsize
= bufsize
;
1050 * Set or clear NB_READ now to let the UPL subsystem know
1051 * if we intend to modify the pages or not.
1053 if (operation
== NBLK_READ
) {
1054 SET(bp
->nb_flags
, NB_READ
);
1056 CLR(bp
->nb_flags
, NB_READ
);
1058 if (bufsize
< PAGE_SIZE
) {
1059 bufsize
= PAGE_SIZE
;
1061 bp
->nb_bufsize
= bufsize
;
1062 bp
->nb_validoff
= bp
->nb_validend
= -1;
1064 if (UBCINFOEXISTS(vp
)) {
1066 if (nfs_buf_upl_setup(bp
)) {
1067 /* unable to create upl */
1068 /* vm object must no longer exist */
1069 /* clean up buffer and return error */
1070 lck_mtx_lock(nfs_buf_mutex
);
1071 LIST_REMOVE(bp
, nb_vnbufs
);
1072 bp
->nb_vnbufs
.le_next
= NFSNOLIST
;
1074 /* invalidate usage timestamp to allow immediate freeing */
1075 NBUFSTAMPINVALIDATE(bp
);
1076 if (bp
->nb_free
.tqe_next
!= NFSNOLIST
) {
1077 panic("nfsbuf on freelist");
1079 TAILQ_INSERT_HEAD(&nfsbuffree
, bp
, nb_free
);
1081 lck_mtx_unlock(nfs_buf_mutex
);
1082 FSDBG_BOT(541, np
, blkno
, 0x2bc, EIO
);
1085 nfs_buf_upl_check(bp
);
1090 panic("nfs_buf_get: %d unknown operation", operation
);
1095 FSDBG_BOT(541, np
, blkno
, bp
, bp
->nb_flags
);
1101 nfs_buf_release(struct nfsbuf
*bp
, int freeup
)
1103 nfsnode_t np
= bp
->nb_np
;
1106 int wakeup_needbuffer
, wakeup_buffer
, wakeup_nbdwrite
;
1108 FSDBG_TOP(548, bp
, NBOFF(bp
), bp
->nb_flags
, bp
->nb_data
);
1109 FSDBG(548, bp
->nb_validoff
, bp
->nb_validend
, bp
->nb_dirtyoff
, bp
->nb_dirtyend
);
1110 FSDBG(548, bp
->nb_valid
, 0, bp
->nb_dirty
, 0);
1112 vp
= np
? NFSTOV(np
) : NULL
;
1113 if (vp
&& UBCINFOEXISTS(vp
) && bp
->nb_bufsize
) {
1118 if (!ISSET(bp
->nb_flags
, NB_PAGELIST
) && !ISSET(bp
->nb_flags
, NB_INVAL
)) {
1119 rv
= nfs_buf_upl_setup(bp
);
1121 printf("nfs_buf_release: upl create failed %d\n", rv
);
1123 nfs_buf_upl_check(bp
);
1126 upl
= bp
->nb_pagelist
;
1128 goto pagelist_cleanup_done
;
1131 if (ubc_upl_unmap(upl
) != KERN_SUCCESS
) {
1132 panic("ubc_upl_unmap failed");
1137 * Abort the pages on error or: if this is an invalid or
1138 * non-needcommit nocache buffer AND no pages are dirty.
1140 if (ISSET(bp
->nb_flags
, NB_ERROR
) || (!nfs_buf_pgs_is_set(&bp
->nb_dirty
) && (ISSET(bp
->nb_flags
, NB_INVAL
) ||
1141 (ISSET(bp
->nb_flags
, NB_NOCACHE
) && !ISSET(bp
->nb_flags
, (NB_NEEDCOMMIT
| NB_DELWRI
)))))) {
1142 if (ISSET(bp
->nb_flags
, (NB_READ
| NB_INVAL
| NB_NOCACHE
))) {
1143 upl_flags
= UPL_ABORT_DUMP_PAGES
;
1147 ubc_upl_abort(upl
, upl_flags
);
1148 goto pagelist_cleanup_done
;
1150 for (i
= 0; i
<= (bp
->nb_bufsize
- 1) / PAGE_SIZE
; i
++) {
1151 if (!NBPGVALID(bp
, i
)) {
1152 ubc_upl_abort_range(upl
,
1153 i
* PAGE_SIZE
, PAGE_SIZE
,
1154 UPL_ABORT_DUMP_PAGES
|
1155 UPL_ABORT_FREE_ON_EMPTY
);
1157 if (NBPGDIRTY(bp
, i
)) {
1158 upl_flags
= UPL_COMMIT_SET_DIRTY
;
1160 upl_flags
= UPL_COMMIT_CLEAR_DIRTY
;
1163 if (!ISSET(bp
->nb_flags
, (NB_NEEDCOMMIT
| NB_DELWRI
))) {
1164 upl_flags
|= UPL_COMMIT_CLEAR_PRECIOUS
;
1167 ubc_upl_commit_range(upl
,
1168 i
* PAGE_SIZE
, PAGE_SIZE
,
1170 UPL_COMMIT_INACTIVATE
|
1171 UPL_COMMIT_FREE_ON_EMPTY
);
1174 pagelist_cleanup_done
:
1175 /* invalidate any pages past EOF */
1176 if (NBOFF(bp
) + bp
->nb_bufsize
> (off_t
)(np
->n_size
)) {
1178 start
= trunc_page_64(np
->n_size
) + PAGE_SIZE_64
;
1179 end
= trunc_page_64(NBOFF(bp
) + bp
->nb_bufsize
);
1180 if (start
< NBOFF(bp
)) {
1184 if ((rv
= ubc_msync(vp
, start
, end
, NULL
, UBC_INVALIDATE
))) {
1185 printf("nfs_buf_release(): ubc_msync failed!, error %d\n", rv
);
1189 CLR(bp
->nb_flags
, NB_PAGELIST
);
1190 bp
->nb_pagelist
= NULL
;
1193 lck_mtx_lock(nfs_buf_mutex
);
1195 wakeup_needbuffer
= wakeup_buffer
= wakeup_nbdwrite
= 0;
1197 /* Wake up any processes waiting for any buffer to become free. */
1198 if (nfsneedbuffer
) {
1200 wakeup_needbuffer
= 1;
1202 /* Wake up any processes waiting for _this_ buffer to become free. */
1203 if (ISSET(bp
->nb_lflags
, NBL_WANTED
)) {
1204 CLR(bp
->nb_lflags
, NBL_WANTED
);
1208 /* If it's non-needcommit nocache, or an error, mark it invalid. */
1209 if (ISSET(bp
->nb_flags
, NB_ERROR
) ||
1210 (ISSET(bp
->nb_flags
, NB_NOCACHE
) && !ISSET(bp
->nb_flags
, (NB_NEEDCOMMIT
| NB_DELWRI
)))) {
1211 SET(bp
->nb_flags
, NB_INVAL
);
1214 if ((bp
->nb_bufsize
<= 0) || ISSET(bp
->nb_flags
, NB_INVAL
)) {
1215 /* If it's invalid or empty, dissociate it from its nfsnode */
1216 if (bp
->nb_vnbufs
.le_next
!= NFSNOLIST
) {
1217 LIST_REMOVE(bp
, nb_vnbufs
);
1218 bp
->nb_vnbufs
.le_next
= NFSNOLIST
;
1221 /* if this was a delayed write, wakeup anyone */
1222 /* waiting for delayed writes to complete */
1223 if (ISSET(bp
->nb_flags
, NB_DELWRI
)) {
1224 CLR(bp
->nb_flags
, NB_DELWRI
);
1227 wakeup_nbdwrite
= 1;
1229 /* invalidate usage timestamp to allow immediate freeing */
1230 NBUFSTAMPINVALIDATE(bp
);
1231 /* put buffer at head of free list */
1232 if (bp
->nb_free
.tqe_next
!= NFSNOLIST
) {
1233 panic("nfsbuf on freelist");
1235 SET(bp
->nb_flags
, NB_INVAL
);
1236 if (ISSET(bp
->nb_flags
, NB_META
)) {
1237 TAILQ_INSERT_HEAD(&nfsbuffreemeta
, bp
, nb_free
);
1238 nfsbuffreemetacnt
++;
1240 TAILQ_INSERT_HEAD(&nfsbuffree
, bp
, nb_free
);
1243 } else if (ISSET(bp
->nb_flags
, NB_DELWRI
)) {
1244 /* put buffer at end of delwri list */
1245 if (bp
->nb_free
.tqe_next
!= NFSNOLIST
) {
1246 panic("nfsbuf on freelist");
1248 TAILQ_INSERT_TAIL(&nfsbufdelwri
, bp
, nb_free
);
1252 /* update usage timestamp */
1254 bp
->nb_timestamp
= now
.tv_sec
;
1255 /* put buffer at end of free list */
1256 if (bp
->nb_free
.tqe_next
!= NFSNOLIST
) {
1257 panic("nfsbuf on freelist");
1259 if (ISSET(bp
->nb_flags
, NB_META
)) {
1260 TAILQ_INSERT_TAIL(&nfsbuffreemeta
, bp
, nb_free
);
1261 nfsbuffreemetacnt
++;
1263 TAILQ_INSERT_TAIL(&nfsbuffree
, bp
, nb_free
);
1270 /* Unlock the buffer. */
1271 CLR(bp
->nb_flags
, (NB_ASYNC
| NB_STABLE
));
1272 CLR(bp
->nb_lflags
, NBL_BUSY
);
1274 FSDBG_BOT(548, bp
, NBOFF(bp
), bp
->nb_flags
, bp
->nb_data
);
1276 lck_mtx_unlock(nfs_buf_mutex
);
1278 if (wakeup_needbuffer
) {
1279 wakeup(&nfsneedbuffer
);
1281 if (wakeup_buffer
) {
1284 if (wakeup_nbdwrite
) {
1285 wakeup(&nfs_nbdwrite
);

/*
 * Wait for operations on the buffer to complete.
 * When they do, extract and return the I/O's error value.
 */
int
nfs_buf_iowait(struct nfsbuf *bp)
{
	FSDBG_TOP(549, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);

	lck_mtx_lock(nfs_buf_mutex);

	while (!ISSET(bp->nb_flags, NB_DONE)) {
		msleep(bp, nfs_buf_mutex, PRIBIO + 1, "nfs_buf_iowait", NULL);
	}

	lck_mtx_unlock(nfs_buf_mutex);

	FSDBG_BOT(549, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);

	/* check for interruption of I/O, then errors. */
	if (ISSET(bp->nb_flags, NB_EINTR)) {
		CLR(bp->nb_flags, NB_EINTR);
		return EINTR;
	} else if (ISSET(bp->nb_flags, NB_ERROR)) {
		return bp->nb_error ? bp->nb_error : EIO;
	}
	return 0;
}

/*
 * Mark I/O complete on a buffer.
 */
void
nfs_buf_iodone(struct nfsbuf *bp)
{
	FSDBG_TOP(550, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);

	if (ISSET(bp->nb_flags, NB_DONE)) {
		panic("nfs_buf_iodone already");
	}

	if (!ISSET(bp->nb_flags, NB_READ)) {
		CLR(bp->nb_flags, NB_WRITEINPROG);
		/*
		 * vnode_writedone() takes care of waking up
		 * any throttled write operations
		 */
		vnode_writedone(NFSTOV(bp->nb_np));
		nfs_node_lock_force(bp->nb_np);
		bp->nb_np->n_numoutput--;
		nfs_node_unlock(bp->nb_np);
	}
	if (ISSET(bp->nb_flags, NB_ASYNC)) {    /* if async, release it */
		SET(bp->nb_flags, NB_DONE);     /* note that it's done */
		nfs_buf_release(bp, 1);
	} else {                                /* or just wakeup the buffer */
		lck_mtx_lock(nfs_buf_mutex);
		SET(bp->nb_flags, NB_DONE);     /* note that it's done */
		CLR(bp->nb_lflags, NBL_WANTED);
		lck_mtx_unlock(nfs_buf_mutex);
		wakeup(bp);
	}

	FSDBG_BOT(550, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);
}

void
nfs_buf_write_delayed(struct nfsbuf *bp)
{
	nfsnode_t np = bp->nb_np;

	FSDBG_TOP(551, bp, NBOFF(bp), bp->nb_flags, 0);
	FSDBG(551, bp, bp->nb_dirtyoff, bp->nb_dirtyend, bp->nb_dirty);

	/*
	 * If the block hasn't been seen before:
	 *	(1) Mark it as having been seen,
	 *	(2) Make sure it's on its node's correct block list,
	 */
	if (!ISSET(bp->nb_flags, NB_DELWRI)) {
		SET(bp->nb_flags, NB_DELWRI);
		/* move to dirty list */
		lck_mtx_lock(nfs_buf_mutex);
		nfs_nbdwrite++;
		if (bp->nb_vnbufs.le_next != NFSNOLIST) {
			LIST_REMOVE(bp, nb_vnbufs);
		}
		LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
		lck_mtx_unlock(nfs_buf_mutex);
	}

	/*
	 * If the vnode has "too many" write operations in progress
	 * wait for them to finish the IO
	 */
	vnode_waitforwrites(NFSTOV(np), VNODE_ASYNC_THROTTLE, 0, 0, "nfs_buf_write_delayed");

	/* the file is in a modified state, so make sure the flag's set */
	nfs_node_lock_force(np);
	np->n_flag |= NMODIFIED;
	nfs_node_unlock(np);

	/*
	 * If we have too many delayed write buffers,
	 * just fall back to doing the async write.
	 */
	if (nfs_nbdwrite < 0) {
		panic("nfs_buf_write_delayed: Negative nfs_nbdwrite");
	}
	if (nfs_nbdwrite > NFS_A_LOT_OF_DELAYED_WRITES) {
		/* issue async write */
		SET(bp->nb_flags, NB_ASYNC);
		nfs_buf_write(bp);
		FSDBG_BOT(551, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);
		return;
	}

	/* Otherwise, the "write" is done, so mark and release the buffer. */
	SET(bp->nb_flags, NB_DONE);
	nfs_buf_release(bp, 1);
	FSDBG_BOT(551, bp, NBOFF(bp), bp->nb_flags, 0);
}
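
/*
 * Rule of thumb encoded above: a delayed write stays cached (NB_DELWRI on
 * the node's dirty list) until something later flushes it, unless the
 * global count of delayed writes (nfs_nbdwrite) exceeds
 * NFS_A_LOT_OF_DELAYED_WRITES, in which case the buffer is demoted to an
 * ordinary async write on the spot.
 */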

/*
 * Check that a "needcommit" buffer can still be committed.
 * If the write verifier has changed, we need to clear the
 * needcommit flag.
 */
void
nfs_buf_check_write_verifier(nfsnode_t np, struct nfsbuf *bp)
{
	struct nfsmount *nmp;

	if (!ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
		return;
	}

	nmp = NFSTONMP(np);
	if (nfs_mount_gone(nmp)) {
		return;
	}
	if (!ISSET(bp->nb_flags, NB_STALEWVERF) && (bp->nb_verf == nmp->nm_verf)) {
		return;
	}

	/* write verifier changed, clear commit/wverf flags */
	CLR(bp->nb_flags, (NB_NEEDCOMMIT | NB_STALEWVERF));
	bp->nb_verf = 0;
	nfs_node_lock_force(np);
	np->n_needcommitcnt--;
	CHECK_NEEDCOMMITCNT(np);
	nfs_node_unlock(np);
}

/*
 * add a reference to a buffer so it doesn't disappear while being used
 * (must be called with nfs_buf_mutex held)
 */
void
nfs_buf_refget(struct nfsbuf *bp)
{
	os_ref_retain_locked(&bp->nb_refs);
}

/*
 * release a reference on a buffer
 * (must be called with nfs_buf_mutex held)
 */
void
nfs_buf_refrele(struct nfsbuf *bp)
{
	(void) os_ref_release_locked(&bp->nb_refs);
}

/*
 * mark a particular buffer as BUSY
 * (must be called with nfs_buf_mutex held)
 */
errno_t
nfs_buf_acquire(struct nfsbuf *bp, int flags, int slpflag, int slptimeo)
{
	errno_t error;
	struct timespec ts;

	if (ISSET(bp->nb_lflags, NBL_BUSY)) {
		/*
		 * since the lck_mtx_lock may block, the buffer
		 * may become BUSY, so we need to recheck for
		 * a NOWAIT request
		 */
		if (flags & NBAC_NOWAIT) {
			return EBUSY;
		}
		SET(bp->nb_lflags, NBL_WANTED);

		ts.tv_sec = (slptimeo / 100);
		/* the hz value is 100; which leads to 10ms */
		ts.tv_nsec = (slptimeo % 100) * 10 * NSEC_PER_USEC * 1000;

		error = msleep(bp, nfs_buf_mutex, slpflag | (PRIBIO + 1),
		    "nfs_buf_acquire", &ts);
		if (error) {
			return error;
		}
		return EAGAIN;
	}
	if (flags & NBAC_REMOVE) {
		nfs_buf_remfree(bp);
	}
	SET(bp->nb_lflags, NBL_BUSY);

	return 0;
}

/*
 * simply drop the BUSY status of a buffer
 * (must be called with nfs_buf_mutex held)
 */
void
nfs_buf_drop(struct nfsbuf *bp)
{
	int need_wakeup = 0;

	if (!ISSET(bp->nb_lflags, NBL_BUSY)) {
		panic("nfs_buf_drop: buffer not busy!");
	}
	if (ISSET(bp->nb_lflags, NBL_WANTED)) {
		/* delay the actual wakeup until after we clear NBL_BUSY */
		need_wakeup = 1;
	}
	/* Unlock the buffer. */
	CLR(bp->nb_lflags, (NBL_BUSY | NBL_WANTED));

	if (need_wakeup) {
		wakeup(bp);
	}
}
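
#if 0
/*
 * Illustrative-only usage sketch of the lock/ref helpers above; the helper
 * name is hypothetical and this is not a verbatim caller from this file.
 * The pattern: hold nfs_buf_mutex, pin the buffer with a reference so it
 * cannot be freed, loop on nfs_buf_acquire() until it stops returning
 * EAGAIN, then pair nfs_buf_drop() with nfs_buf_refrele() when done.
 */
static void
nfs_buf_busy_example(struct nfsbuf *bp)
{
	int error;

	lck_mtx_lock(nfs_buf_mutex);
	nfs_buf_refget(bp);
	while ((error = nfs_buf_acquire(bp, 0, 0, 0)) == EAGAIN) {
		;
	}
	nfs_buf_refrele(bp);
	if (!error) {
		/* ... operate on the now-NBL_BUSY buffer ... */
		nfs_buf_drop(bp);
	}
	lck_mtx_unlock(nfs_buf_mutex);
}
#endif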

/*
 * prepare for iterating over an nfsnode's buffer list
 * this lock protects the queue manipulation
 * (must be called with nfs_buf_mutex held)
 */
int
nfs_buf_iterprepare(nfsnode_t np, struct nfsbuflists *iterheadp, int flags)
{
	struct nfsbuflists *listheadp;

	if (flags & NBI_DIRTY) {
		listheadp = &np->n_dirtyblkhd;
	} else {
		listheadp = &np->n_cleanblkhd;
	}

	if ((flags & NBI_NOWAIT) && (np->n_bufiterflags & NBI_ITER)) {
		LIST_INIT(iterheadp);
		return EWOULDBLOCK;
	}

	while (np->n_bufiterflags & NBI_ITER) {
		np->n_bufiterflags |= NBI_ITERWANT;
		msleep(&np->n_bufiterflags, nfs_buf_mutex, 0, "nfs_buf_iterprepare", NULL);
	}
	if (LIST_EMPTY(listheadp)) {
		LIST_INIT(iterheadp);
		return EINVAL;
	}
	np->n_bufiterflags |= NBI_ITER;

	iterheadp->lh_first = listheadp->lh_first;
	listheadp->lh_first->nb_vnbufs.le_prev = &iterheadp->lh_first;
	LIST_INIT(listheadp);

	return 0;
}

/*
 * clean up after iterating over an nfsnode's buffer list
 * this lock protects the queue manipulation
 * (must be called with nfs_buf_mutex held)
 */
void
nfs_buf_itercomplete(nfsnode_t np, struct nfsbuflists *iterheadp, int flags)
{
	struct nfsbuflists * listheadp;
	struct nfsbuf *bp;

	if (flags & NBI_DIRTY) {
		listheadp = &np->n_dirtyblkhd;
	} else {
		listheadp = &np->n_cleanblkhd;
	}

	while (!LIST_EMPTY(iterheadp)) {
		bp = LIST_FIRST(iterheadp);
		LIST_REMOVE(bp, nb_vnbufs);
		LIST_INSERT_HEAD(listheadp, bp, nb_vnbufs);
	}

	np->n_bufiterflags &= ~NBI_ITER;
	if (np->n_bufiterflags & NBI_ITERWANT) {
		np->n_bufiterflags &= ~NBI_ITERWANT;
		wakeup(&np->n_bufiterflags);
	}
}
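
/*
 * Illustrative iteration pattern (not a verbatim caller): with
 * nfs_buf_mutex held, nfs_buf_iterprepare(np, &blist, NBI_DIRTY) moves the
 * node's dirty list onto the private "blist" head; the caller then walks
 * blist, re-inserting each buffer it handles onto the node's list, and
 * finishes with nfs_buf_itercomplete(np, &blist, NBI_DIRTY) to splice any
 * leftovers back and wake NBI_ITERWANT waiters.
 */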
1601 * Read an NFS buffer for a file.
1604 nfs_buf_read(struct nfsbuf
*bp
)
1612 cred
= bp
->nb_rcred
;
1613 if (IS_VALID_CRED(cred
)) {
1614 kauth_cred_ref(cred
);
1616 thd
= ISSET(bp
->nb_flags
, NB_ASYNC
) ? NULL
: current_thread();
1619 if (!ISSET(bp
->nb_flags
, NB_READ
)) {
1620 panic("nfs_buf_read: !NB_READ");
1622 if (ISSET(bp
->nb_flags
, NB_DONE
)) {
1623 CLR(bp
->nb_flags
, NB_DONE
);
1628 OSAddAtomic64(1, &nfsstats
.read_bios
);
1630 error
= nfs_buf_read_rpc(bp
, thd
, cred
);
1632 * For async I/O, the callbacks will finish up the
1633 * read. Otherwise, the read has already been finished.
1636 if (IS_VALID_CRED(cred
)) {
1637 kauth_cred_unref(&cred
);
1643 * finish the reading of a buffer
1646 nfs_buf_read_finish(struct nfsbuf
*bp
)
1648 nfsnode_t np
= bp
->nb_np
;
1649 struct nfsmount
*nmp
;
1651 if (!ISSET(bp
->nb_flags
, NB_ERROR
)) {
1652 /* update valid range */
1653 bp
->nb_validoff
= 0;
1654 bp
->nb_validend
= bp
->nb_endio
;
1655 if (bp
->nb_endio
< bp
->nb_bufsize
) {
1657 * The read may be short because we have unflushed writes
1658 * that are extending the file size and the reads hit the
1659 * (old) EOF on the server. So, just make sure nb_validend
1660 * correctly tracks EOF.
1661 * Note that the missing data should have already been zeroed
1662 * in nfs_buf_read_rpc_finish().
1664 off_t boff
= NBOFF(bp
);
1665 if ((off_t
)np
->n_size
>= (boff
+ bp
->nb_bufsize
)) {
1666 bp
->nb_validend
= bp
->nb_bufsize
;
1667 } else if ((off_t
)np
->n_size
>= boff
) {
1668 bp
->nb_validend
= np
->n_size
- boff
;
1670 bp
->nb_validend
= 0;
1673 if ((nmp
= NFSTONMP(np
)) && (nmp
->nm_vers
== NFS_VER2
) &&
1674 ((NBOFF(bp
) + bp
->nb_validend
) > 0x100000000LL
)) {
1675 bp
->nb_validend
= 0x100000000LL
- NBOFF(bp
);
1677 nfs_buf_pgs_get_page_mask(&bp
->nb_valid
, round_page_64(bp
->nb_validend
) / PAGE_SIZE
);
1678 if (bp
->nb_validend
& PAGE_MASK
) {
1679 /* zero-fill remainder of last page */
1680 bzero(bp
->nb_data
+ bp
->nb_validend
, PAGE_SIZE
- (bp
->nb_validend
& PAGE_MASK
));
1687 * initiate the NFS READ RPC(s) for a buffer
1690 nfs_buf_read_rpc(struct nfsbuf
*bp
, thread_t thd
, kauth_cred_t cred
)
1692 struct nfsmount
*nmp
;
1693 nfsnode_t np
= bp
->nb_np
;
1694 int error
= 0, nfsvers
, async
;
1696 uint64_t length
, nrpcs
;
1701 struct nfsreq_cbinfo cb
;
1704 if (nfs_mount_gone(nmp
)) {
1705 bp
->nb_error
= error
= ENXIO
;
1706 SET(bp
->nb_flags
, NB_ERROR
);
1710 nfsvers
= nmp
->nm_vers
;
1711 nmrsize
= nmp
->nm_rsize
;
1715 length
= bp
->nb_bufsize
;
1717 if (nfsvers
== NFS_VER2
) {
1718 if (boff
> 0xffffffffLL
) {
1719 bp
->nb_error
= error
= EFBIG
;
1720 SET(bp
->nb_flags
, NB_ERROR
);
1724 if ((boff
+ length
- 1) > 0xffffffffLL
) {
1725 length
= 0x100000000LL
- boff
;
1729 /* Note: Can only do async I/O if nfsiods are configured. */
1730 async
= (bp
->nb_flags
& NB_ASYNC
);
1731 cb
.rcb_func
= async
? nfs_buf_read_rpc_finish
: NULL
;
1734 bp
->nb_offio
= bp
->nb_endio
= 0;
1735 bp
->nb_rpcs
= nrpcs
= (length
+ nmrsize
- 1) / nmrsize
;
1736 if (async
&& (nrpcs
> 1)) {
1737 SET(bp
->nb_flags
, NB_MULTASYNCRPC
);
1739 CLR(bp
->nb_flags
, NB_MULTASYNCRPC
);
1742 while (length
> 0) {
1743 if (ISSET(bp
->nb_flags
, NB_ERROR
)) {
1744 error
= bp
->nb_error
;
1747 len
= (length
> nmrsize
) ? nmrsize
: (uint32_t)length
;
1748 cb
.rcb_args
.offset
= offset
;
1749 cb
.rcb_args
.length
= len
;
1751 if (nmp
->nm_vers
>= NFS_VER4
) {
1752 cb
.rcb_args
.stategenid
= nmp
->nm_stategenid
;
1756 error
= nmp
->nm_funcs
->nf_read_rpc_async(np
, boff
+ offset
, len
, thd
, cred
, &cb
, &req
);
1765 nfs_buf_read_rpc_finish(req
);
1766 if (ISSET(bp
->nb_flags
, NB_ERROR
)) {
1767 error
= bp
->nb_error
;
1774 * Something bad happened while trying to send the RPC(s).
1775 * Wait for any outstanding requests to complete.
1777 bp
->nb_error
= error
;
1778 SET(bp
->nb_flags
, NB_ERROR
);
1779 if (ISSET(bp
->nb_flags
, NB_MULTASYNCRPC
)) {
1780 nrpcs
= (length
+ nmrsize
- 1) / nmrsize
;
1781 lck_mtx_lock(nfs_buf_mutex
);
1782 bp
->nb_rpcs
-= nrpcs
;
1783 if (bp
->nb_rpcs
== 0) {
1784 /* No RPCs left, so the buffer's done */
1785 lck_mtx_unlock(nfs_buf_mutex
);
1788 /* wait for the last RPC to mark it done */
1789 while (bp
->nb_rpcs
> 0) {
1790 msleep(&bp
->nb_rpcs
, nfs_buf_mutex
, 0,
1791 "nfs_buf_read_rpc_cancel", NULL
);
1793 lck_mtx_unlock(nfs_buf_mutex
);
1804 * finish up an NFS READ RPC on a buffer
1807 nfs_buf_read_rpc_finish(struct nfsreq
*req
)
1809 struct nfsmount
*nmp
;
1810 size_t rlen
, length
;
1811 struct nfsreq_cbinfo cb
;
1813 int error
= 0, nfsvers
, eof
= 0, multasyncrpc
, finished
;
1815 void *wakeme
= NULL
;
1816 struct nfsreq
*rreq
= NULL
;
1821 char uio_buf
[UIO_SIZEOF(1)];
1825 thd
= req
->r_thread
;
1827 if (IS_VALID_CRED(cred
)) {
1828 kauth_cred_ref(cred
);
1830 cb
= req
->r_callback
;
1832 if (cb
.rcb_func
) { /* take an extra reference on the nfsreq in case we want to resend it later due to grace error */
1833 nfs_request_ref(req
, 0);
1837 if (nfs_mount_gone(nmp
)) {
1838 SET(bp
->nb_flags
, NB_ERROR
);
1839 bp
->nb_error
= error
= ENXIO
;
1841 if (error
|| ISSET(bp
->nb_flags
, NB_ERROR
)) {
1843 nfs_request_async_cancel(req
);
1847 nfsvers
= nmp
->nm_vers
;
1848 offset
= cb
.rcb_args
.offset
;
1849 rlen
= length
= cb
.rcb_args
.length
;
1851 auio
= uio_createwithbuffer(1, NBOFF(bp
) + offset
, UIO_SYSSPACE
,
1852 UIO_READ
, &uio_buf
, sizeof(uio_buf
));
1853 uio_addiov(auio
, CAST_USER_ADDR_T(bp
->nb_data
+ offset
), length
);
1855 /* finish the RPC */
1856 error
= nmp
->nm_funcs
->nf_read_rpc_async_finish(np
, req
, auio
, &rlen
, &eof
);
1857 if ((error
== EINPROGRESS
) && cb
.rcb_func
) {
1858 /* async request restarted */
1860 nfs_request_rele(req
);
1862 if (IS_VALID_CRED(cred
)) {
1863 kauth_cred_unref(&cred
);
1868 if ((nmp
->nm_vers
>= NFS_VER4
) && nfs_mount_state_error_should_restart(error
) && !ISSET(bp
->nb_flags
, NB_ERROR
)) {
1869 lck_mtx_lock(&nmp
->nm_lock
);
1870 if ((error
!= NFSERR_OLD_STATEID
) && (error
!= NFSERR_GRACE
) && (cb
.rcb_args
.stategenid
== nmp
->nm_stategenid
)) {
1871 NP(np
, "nfs_buf_read_rpc_finish: error %d @ 0x%llx, 0x%x 0x%x, initiating recovery",
1872 error
, NBOFF(bp
) + offset
, cb
.rcb_args
.stategenid
, nmp
->nm_stategenid
);
1873 nfs_need_recover(nmp
, error
);
1875 lck_mtx_unlock(&nmp
->nm_lock
);
1876 if (np
->n_flag
& NREVOKE
) {
1879 if (error
== NFSERR_GRACE
) {
1882 * For an async I/O request, handle a grace delay just like
1883 * jukebox errors. Set the resend time and queue it up.
1886 if (req
->r_nmrep
.nmc_mhead
) {
1887 mbuf_freem(req
->r_nmrep
.nmc_mhead
);
1888 req
->r_nmrep
.nmc_mhead
= NULL
;
1892 lck_mtx_lock(&req
->r_mtx
);
1893 req
->r_resendtime
= now
.tv_sec
+ 2;
1894 req
->r_xid
= 0; // get a new XID
1895 req
->r_flags
|= R_RESTART
;
1897 nfs_asyncio_resend(req
);
1898 lck_mtx_unlock(&req
->r_mtx
);
1899 if (IS_VALID_CRED(cred
)) {
1900 kauth_cred_unref(&cred
);
1902 /* Note: nfsreq reference taken will be dropped later when finished */
1905 /* otherwise, just pause a couple seconds and retry */
1906 tsleep(&nmp
->nm_state
, (PZERO
- 1), "nfsgrace", 2 * hz
);
1908 if (!(error
= nfs_mount_state_wait_for_recovery(nmp
))) {
1916 SET(bp
->nb_flags
, NB_ERROR
);
1917 bp
->nb_error
= error
;
1921 if ((rlen
> 0) && (bp
->nb_endio
< (offset
+ (int)rlen
))) {
1922 bp
->nb_endio
= offset
+ rlen
;
1925 if ((nfsvers
== NFS_VER2
) || eof
|| (rlen
== 0)) {
1926 /* zero out the remaining data (up to EOF) */
1927 off_t rpcrem
, eofrem
, rem
;
1928 rpcrem
= (length
- rlen
);
1929 eofrem
= np
->n_size
- (NBOFF(bp
) + offset
+ rlen
);
1930 rem
= (rpcrem
< eofrem
) ? rpcrem
: eofrem
;
1932 NFS_BZERO(bp
->nb_data
+ offset
+ rlen
, rem
);
1934 } else if ((rlen
< length
) && !ISSET(bp
->nb_flags
, NB_ERROR
)) {
1938 * We haven't hit EOF and we didn't get all the data
1939 * requested, so we need to issue another read for the rest.
1940 * (Don't bother if the buffer already hit an error.)
1947 cb
.rcb_args
.offset
= offset
;
1948 cb
.rcb_args
.length
= length
;
1950 if (nmp
->nm_vers
>= NFS_VER4
) {
1951 cb
.rcb_args
.stategenid
= nmp
->nm_stategenid
;
1954 error
= nmp
->nm_funcs
->nf_read_rpc_async(np
, NBOFF(bp
) + offset
, length
, thd
, cred
, &cb
, &rreq
);
1956 if (IS_VALID_CRED(cred
)) {
1957 kauth_cred_unref(&cred
);
1960 /* if !async we'll need to wait for this RPC to finish */
1965 nfs_request_rele(req
);
1968 * Outstanding RPC count is unchanged.
1969 * Callback will be called when RPC is done.
1973 SET(bp
->nb_flags
, NB_ERROR
);
1974 bp
->nb_error
= error
;
1979 nfs_request_rele(req
);
1981 if (IS_VALID_CRED(cred
)) {
1982 kauth_cred_unref(&cred
);
1986 * Decrement outstanding RPC count on buffer
1987 * and call nfs_buf_read_finish on last RPC.
1989 * (Note: when there are multiple async RPCs issued for a
1990 * buffer we need nfs_buffer_mutex to avoid problems when
1991 * aborting a partially-initiated set of RPCs)
1994 multasyncrpc
= ISSET(bp
->nb_flags
, NB_MULTASYNCRPC
);
1996 lck_mtx_lock(nfs_buf_mutex
);
2000 finished
= (bp
->nb_rpcs
== 0);
2003 lck_mtx_unlock(nfs_buf_mutex
);
2008 wakeme
= &bp
->nb_rpcs
;
2010 nfs_buf_read_finish(bp
);
2018 * Do buffer readahead.
2019 * Initiate async I/O to read buffers not in cache.
2022 nfs_buf_readahead(nfsnode_t np
, int ioflag
, daddr64_t
*rabnp
, daddr64_t lastrabn
, thread_t thd
, kauth_cred_t cred
)
2024 struct nfsmount
*nmp
= NFSTONMP(np
);
2029 if (nfs_mount_gone(nmp
)) {
2032 if (nmp
->nm_readahead
<= 0) {
2035 if (*rabnp
> lastrabn
) {
2039 for (nra
= 0; (nra
< nmp
->nm_readahead
) && (*rabnp
<= lastrabn
); nra
++, *rabnp
= *rabnp
+ 1) {
2040 /* check if block exists and is valid. */
2041 if ((*rabnp
* nmp
->nm_biosize
) >= (off_t
)np
->n_size
) {
2042 /* stop reading ahead if we're beyond EOF */
2046 error
= nfs_buf_get(np
, *rabnp
, nmp
->nm_biosize
, thd
, NBLK_READ
| NBLK_NOWAIT
, &bp
);
2050 nfs_node_lock_force(np
);
2051 np
->n_lastrahead
= *rabnp
;
2052 nfs_node_unlock(np
);
2056 if ((ioflag
& IO_NOCACHE
) && ISSET(bp
->nb_flags
, NB_CACHE
) &&
2057 !nfs_buf_pgs_is_set(&bp
->nb_dirty
) && !ISSET(bp
->nb_flags
, (NB_DELWRI
| NB_NCRDAHEAD
))) {
2058 CLR(bp
->nb_flags
, NB_CACHE
);
2059 NBPGS_ERASE(&bp
->nb_valid
);
2060 bp
->nb_validoff
= bp
->nb_validend
= -1;
2062 if ((bp
->nb_dirtyend
<= 0) && !nfs_buf_pgs_is_set(&bp
->nb_dirty
) &&
2063 !ISSET(bp
->nb_flags
, (NB_CACHE
| NB_DELWRI
))) {
2064 SET(bp
->nb_flags
, (NB_READ
| NB_ASYNC
));
2065 if (ioflag
& IO_NOCACHE
) {
2066 SET(bp
->nb_flags
, NB_NCRDAHEAD
);
2068 if (!IS_VALID_CRED(bp
->nb_rcred
) && IS_VALID_CRED(cred
)) {
2069 kauth_cred_ref(cred
);
2070 bp
->nb_rcred
= cred
;
2072 if ((error
= nfs_buf_read(bp
))) {
2077 nfs_buf_release(bp
, 1);
2083 * NFS buffer I/O for reading files.
2086 nfs_bioread(nfsnode_t np
, uio_t uio
, int ioflag
, vfs_context_t ctx
)
2088 vnode_t vp
= NFSTOV(np
);
2089 struct nfsbuf
*bp
= NULL
;
2090 struct nfsmount
*nmp
= VTONMP(vp
);
2091 daddr64_t lbn
, rabn
= 0, lastrabn
, maxrabn
= -1;
2092 off_t diff
, on
= 0, n
= 0;
2094 int nfsvers
, biosize
, modified
, readaheads
= 0;
2099 FSDBG_TOP(514, np
, uio_offset(uio
), uio_resid(uio
), ioflag
);
2101 nfsvers
= nmp
->nm_vers
;
2102 biosize
= nmp
->nm_biosize
;
2103 thd
= vfs_context_thread(ctx
);
2104 cred
= vfs_context_ucred(ctx
);
2106 if (vnode_vtype(vp
) != VREG
) {
2107 printf("nfs_bioread: type %x unexpected\n", vnode_vtype(vp
));
2108 FSDBG_BOT(514, np
, 0xd1e0016, 0, EINVAL
);
2113 * For NFS, cache consistency can only be maintained approximately.
2114 * Although RFC1094 does not specify the criteria, the following is
2115 * believed to be compatible with the reference port.
2117 * If the file has changed since the last read RPC or you have
2118 * written to the file, you may have lost data cache consistency
2119 * with the server. So, check for a change, and flush all of the
2120 * file's data out of the cache.
2121 * NB: This implies that cache data can be read when up to
2122 * NFS_MAXATTRTIMO seconds out of date. If you find that you
2123 * need current attributes, nfs_getattr() can be forced to fetch
2124 * new attributes (via NATTRINVALIDATE() or NGA_UNCACHED).
2127 if (ISSET(np
->n_flag
, NUPDATESIZE
)) {
2128 nfs_data_update_size(np
, 0);
2131 if ((error
= nfs_node_lock(np
))) {
2132 FSDBG_BOT(514, np
, 0xd1e0222, 0, error
);
2136 if (np
->n_flag
& NNEEDINVALIDATE
) {
2137 np
->n_flag
&= ~NNEEDINVALIDATE
;
2138 nfs_node_unlock(np
);
2139 error
= nfs_vinvalbuf(vp
, V_SAVE
| V_IGNORE_WRITEERR
, ctx
, 1);
2141 error
= nfs_node_lock(np
);
2144 FSDBG_BOT(514, np
, 0xd1e0322, 0, error
);
2149 modified
= (np
->n_flag
& NMODIFIED
);
2150 nfs_node_unlock(np
);
2151 /* nfs_getattr() will check changed and purge caches */
2152 error
= nfs_getattr(np
, NULL
, ctx
, modified
? NGA_UNCACHED
: NGA_CACHED
);
2154 FSDBG_BOT(514, np
, 0xd1e0004, 0, error
);
2158 if (uio_resid(uio
) == 0) {
2159 FSDBG_BOT(514, np
, 0xd1e0001, 0, 0);
2162 if (uio_offset(uio
) < 0) {
2163 FSDBG_BOT(514, np
, 0xd1e0002, 0, EINVAL
);
2168 * set up readahead - which may be limited by:
2169 * + current request length (for IO_NOCACHE)
2170 * + readahead setting
2173 if (nmp
->nm_readahead
> 0) {
2174 off_t end
= uio_offset(uio
) + uio_resid(uio
);
2175 if (end
> (off_t
)np
->n_size
) {
2178 rabn
= uio_offset(uio
) / biosize
;
2179 maxrabn
= (end
- 1) / biosize
;
	nfs_node_lock_force(np);
	if (!(ioflag & IO_NOCACHE) &&
	    (!rabn || (rabn == np->n_lastread) || (rabn == (np->n_lastread + 1)))) {
	maxrabn += nmp->nm_readahead;
	if ((maxrabn * biosize) >= (off_t)np->n_size) {
	maxrabn = ((off_t)np->n_size - 1) / biosize;
	if (maxrabn < np->n_lastrahead) {
	np->n_lastrahead = -1;
	if (rabn < np->n_lastrahead) {
	rabn = np->n_lastrahead + 1;
	nfs_node_unlock(np);

	nfs_data_lock(np, NFS_DATA_LOCK_SHARED);
	lbn = uio_offset(uio) / biosize;

	/*
	 * Copy directly from any cached pages without grabbing the bufs.
	 * (If we are NOCACHE and we've issued readahead requests, we need
	 * to grab the NB_NCRDAHEAD bufs to drop them.)
	 */
	if ((!(ioflag & IO_NOCACHE) || !readaheads) &&
	    ((uio->uio_segflg == UIO_USERSPACE32 ||
	    uio->uio_segflg == UIO_USERSPACE64 ||
	    uio->uio_segflg == UIO_USERSPACE))) {
	io_resid = uio_resid(uio);
	diff = np->n_size - uio_offset(uio);
	if (diff < io_resid) {
	int count = (io_resid > INT_MAX) ? INT_MAX : (int)io_resid;
	error = cluster_copy_ubc_data(vp, uio, &count, 0);
	nfs_data_unlock(np);
	FSDBG_BOT(514, np, uio_offset(uio), 0xcacefeed, error);
	/* count any biocache reads that we just copied directly */
	if (lbn != (uio_offset(uio) / biosize)) {
	OSAddAtomic64(NFS_ROUND_BLOCK(uio_offset(uio), biosize) - lbn, &nfsstats.biocache_reads);
	FSDBG(514, np, 0xcacefeed, uio_offset(uio), error);

	lbn = uio_offset(uio) / biosize;
	on = uio_offset(uio) % biosize;
	nfs_node_lock_force(np);
	np->n_lastread = (uio_offset(uio) - 1) / biosize;
	nfs_node_unlock(np);

	if ((uio_resid(uio) <= 0) || (uio_offset(uio) >= (off_t)np->n_size)) {
	nfs_data_unlock(np);
	FSDBG_BOT(514, np, uio_offset(uio), uio_resid(uio), 0xaaaaaaaa);

	/* adjust readahead block number, if necessary */
	lastrabn = MIN(maxrabn, lbn + nmp->nm_readahead);
	if (rabn <= lastrabn) { /* start readaheads */
	error = nfs_buf_readahead(np, ioflag, &rabn, lastrabn, thd, cred);
	nfs_data_unlock(np);
	FSDBG_BOT(514, np, 0xd1e000b, 1, error);
	OSAddAtomic64(rabn - lbn, &nfsstats.biocache_reads);
	OSAddAtomic64(1, &nfsstats.biocache_reads);

	/*
	 * If the block is in the cache and has the required data
	 * in a valid region, just copy it out.
	 * Otherwise, get the block and write back/read in,
	 */
	io_resid = uio_resid(uio);
	n = (io_resid > (biosize - on)) ? (biosize - on) : io_resid;
	diff = np->n_size - uio_offset(uio);
	error = nfs_buf_get(np, lbn, biosize, thd, NBLK_READ, &bp);
	nfs_data_unlock(np);
	FSDBG_BOT(514, np, 0xd1e000c, 0, error);

	if ((ioflag & IO_NOCACHE) && ISSET(bp->nb_flags, NB_CACHE)) {
	/*
	 * IO_NOCACHE found a cached buffer.
	 * Flush the buffer if it's dirty.
	 * Invalidate the data if it wasn't just read
	 * in as part of a "nocache readahead".
	 */
	if (nfs_buf_pgs_is_set(&bp->nb_dirty) || (bp->nb_dirtyend > 0)) {
	/* so write the buffer out and try again */
	SET(bp->nb_flags, NB_NOCACHE);
	if (ISSET(bp->nb_flags, NB_NCRDAHEAD)) {
	CLR(bp->nb_flags, NB_NCRDAHEAD);
	SET(bp->nb_flags, NB_NOCACHE);

	/* if any pages are valid... */
	if (nfs_buf_pgs_is_set(&bp->nb_valid)) {
	/* ...check for any invalid pages in the read range */
	off_t pg, firstpg, lastpg, dirtypg;
	dirtypg = firstpg = lastpg = -1;
	pg = on / PAGE_SIZE;
	while (pg <= (on + n - 1) / PAGE_SIZE) {
	if (!NBPGVALID(bp, pg)) {
	} else if (firstpg >= 0 && dirtypg < 0 && NBPGDIRTY(bp, pg)) {

	/* if there are no invalid pages, we're all set */
	if (bp->nb_validoff < 0) {
	/* valid range isn't set up, so */
	/* set it to what we know is valid */
	bp->nb_validoff = trunc_page_64(on);
	bp->nb_validend = round_page_64(on + n);
	nfs_buf_normalize_valid_range(np, bp);

	/* there are invalid pages in the read range */
	if (((dirtypg > firstpg) && (dirtypg < lastpg)) ||
	    (((firstpg * PAGE_SIZE) < bp->nb_dirtyend) && (((lastpg + 1) * PAGE_SIZE) > bp->nb_dirtyoff))) {
	/* there are also dirty page(s) (or range) in the read range, */
	/* so write the buffer out and try again */
	CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL));
	SET(bp->nb_flags, NB_ASYNC);
	if (!IS_VALID_CRED(bp->nb_wcred)) {
	kauth_cred_ref(cred);
	bp->nb_wcred = cred;
	error = nfs_buf_write(bp);
	nfs_data_unlock(np);
	FSDBG_BOT(514, np, 0xd1e000d, 0, error);

	if (!nfs_buf_pgs_is_set(&bp->nb_dirty) && bp->nb_dirtyend <= 0 &&
	    (lastpg - firstpg + 1) > (biosize / PAGE_SIZE) / 2) {
	/* we need to read in more than half the buffer and the */
	/* buffer's not dirty, so just fetch the whole buffer */
	NBPGS_ERASE(&bp->nb_valid);
	/* read the page range in */
	char uio_buf[UIO_SIZEOF(1)];
	auio = uio_createwithbuffer(1, (NBOFF(bp) + firstpg * PAGE_SIZE_64),
	    UIO_SYSSPACE, UIO_READ, &uio_buf[0], sizeof(uio_buf));
	NFS_UIO_ADDIOV(auio, CAST_USER_ADDR_T(bp->nb_data + (firstpg * PAGE_SIZE)),
	    ((lastpg - firstpg + 1) * PAGE_SIZE));
	error = nfs_read_rpc(np, auio, ctx);
	if (ioflag & IO_NOCACHE) {
	SET(bp->nb_flags, NB_NOCACHE);
	nfs_buf_release(bp, 1);
	nfs_data_unlock(np);
	FSDBG_BOT(514, np, 0xd1e000e, 0, error);
	/* Make sure that the valid range is set to cover this read. */
	bp->nb_validoff = trunc_page_64(on);
	bp->nb_validend = round_page_64(on + n);
	nfs_buf_normalize_valid_range(np, bp);
	if (uio_resid(auio) > 0) {
	/* if short read, must have hit EOF, */
	/* so zero the rest of the range */
	bzero(CAST_DOWN(caddr_t, uio_curriovbase(auio)), uio_resid(auio));
	/* mark the pages (successfully read) as valid */
	for (pg = firstpg; pg <= lastpg; pg++) {
	NBPGVALID_SET(bp, pg);

	/* if no pages are valid, read the whole block */
	if (!nfs_buf_pgs_is_set(&bp->nb_valid)) {
	if (!IS_VALID_CRED(bp->nb_rcred) && IS_VALID_CRED(cred)) {
	kauth_cred_ref(cred);
	bp->nb_rcred = cred;
	SET(bp->nb_flags, NB_READ);
	CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL));
	error = nfs_buf_read(bp);
	if (ioflag & IO_NOCACHE) {
	SET(bp->nb_flags, NB_NOCACHE);
	nfs_data_unlock(np);
	nfs_buf_release(bp, 1);
	FSDBG_BOT(514, np, 0xd1e000f, 0, error);

	/* validate read range against valid range and clip */
	if (bp->nb_validend > 0) {
	diff = (on >= bp->nb_validend) ? 0 : (bp->nb_validend - on);
	n32 = n > INT_MAX ? INT_MAX : (int)n;
	error = uiomove(bp->nb_data + on, n32, uio);
	if (!error && n > n32) {
	error = uiomove(bp->nb_data + on + n32, (int)(n - n32), uio);

	nfs_buf_release(bp, 1);
	nfs_data_unlock(np);
	nfs_node_lock_force(np);
	np->n_lastread = (uio_offset(uio) - 1) / biosize;
	nfs_node_unlock(np);
	} while (error == 0 && uio_resid(uio) > 0 && n > 0);
	FSDBG_BOT(514, np, uio_offset(uio), uio_resid(uio), error);
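
/*
 * Illustrative sketch (not part of the original source): the read path above
 * converts a byte offset into a logical block number and an offset within
 * that block, then clamps the readahead window to the mount's nm_readahead
 * setting and the current EOF.  The standalone helper below uses hypothetical
 * names and plain user-space arithmetic, assuming a non-zero block size.
 */
#if 0   /* example only -- not built */
#include <sys/types.h>

static void
example_read_window(off_t offset, off_t biosize, off_t file_size,
    int32_t readahead, off_t *lbnp, off_t *onp, off_t *maxrabnp)
{
	off_t lbn = offset / biosize;            /* logical block being read */
	off_t on = offset % biosize;             /* offset within that block */
	off_t maxrabn = lbn + readahead;         /* furthest block to read ahead */

	/* don't read ahead past the last block of the file */
	if ((maxrabn * biosize) >= file_size) {
		maxrabn = (file_size - 1) / biosize;
	}
	*lbnp = lbn;
	*onp = on;
	*maxrabnp = maxrabn;
}
#endif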
/*
 * limit the number of outstanding async I/O writes
 */
nfs_async_write_start(struct nfsmount *nmp)
	int error = 0, slpflag = NMFLAG(nmp, INTR) ? PCATCH : 0;
	struct timespec ts = { .tv_sec = 1, .tv_nsec = 0 };

	if (nfs_max_async_writes <= 0) {
	lck_mtx_lock(&nmp->nm_lock);
	while ((nfs_max_async_writes > 0) && (nmp->nm_asyncwrites >= nfs_max_async_writes)) {
	if ((error = nfs_sigintr(nmp, NULL, current_thread(), 1))) {
	msleep(&nmp->nm_asyncwrites, &nmp->nm_lock, slpflag | (PZERO - 1), "nfsasyncwrites", &ts);
	nmp->nm_asyncwrites++;
	lck_mtx_unlock(&nmp->nm_lock);
nfs_async_write_done(struct nfsmount *nmp)
	if (nmp->nm_asyncwrites <= 0) {
	lck_mtx_lock(&nmp->nm_lock);
	if (nmp->nm_asyncwrites-- >= nfs_max_async_writes) {
	wakeup(&nmp->nm_asyncwrites);
	lck_mtx_unlock(&nmp->nm_lock);
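
/*
 * Illustrative sketch (not from the original source): nfs_async_write_start()
 * and nfs_async_write_done() form a counting gate -- callers sleep while the
 * in-flight count is at the limit and are woken as completions drop it back
 * below the limit.  A minimal user-space analogue using pthreads (hypothetical
 * names; the kernel code above uses lck_mtx/msleep/wakeup instead):
 */
#if 0   /* example only -- not built */
#include <pthread.h>

static pthread_mutex_t gate_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t gate_cv = PTHREAD_COND_INITIALIZER;
static int inflight, inflight_max = 128;

static void
gate_start(void)
{
	pthread_mutex_lock(&gate_lock);
	while (inflight >= inflight_max) {
		pthread_cond_wait(&gate_cv, &gate_lock);   /* like msleep() above */
	}
	inflight++;
	pthread_mutex_unlock(&gate_lock);
}

static void
gate_done(void)
{
	pthread_mutex_lock(&gate_lock);
	if (inflight-- >= inflight_max) {
		pthread_cond_signal(&gate_cv);             /* like wakeup() above */
	}
	pthread_mutex_unlock(&gate_lock);
}
#endif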
/*
 * write (or commit) the given NFS buffer
 *
 * Commit the buffer if we can.
 * Write out any dirty range.
 * If any dirty pages remain, write them out.
 *
 * For async requests, all the work beyond sending the initial
 * write RPC is handled in the RPC callback(s).
 */
nfs_buf_write(struct nfsbuf *bp)
	int error = 0, oldflags, async;
	proc_t p = current_proc();
	off_t doff, dend, firstpg, lastpg;

	FSDBG_TOP(553, bp, NBOFF(bp), bp->nb_flags, 0);

	if (!ISSET(bp->nb_lflags, NBL_BUSY)) {
	panic("nfs_buf_write: buffer is not busy???");

	async = ISSET(bp->nb_flags, NB_ASYNC);
	oldflags = bp->nb_flags;

	CLR(bp->nb_flags, (NB_READ | NB_DONE | NB_ERROR | NB_DELWRI));
	if (ISSET(oldflags, NB_DELWRI)) {
	lck_mtx_lock(nfs_buf_mutex);
	lck_mtx_unlock(nfs_buf_mutex);
	wakeup(&nfs_nbdwrite);

	/* move to clean list */
	if (ISSET(oldflags, (NB_ASYNC | NB_DELWRI))) {
	lck_mtx_lock(nfs_buf_mutex);
	if (bp->nb_vnbufs.le_next != NFSNOLIST) {
	LIST_REMOVE(bp, nb_vnbufs);
	LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
	lck_mtx_unlock(nfs_buf_mutex);
	nfs_node_lock_force(np);
	nfs_node_unlock(np);
	vnode_startwrite(NFSTOV(np));

	if (p && p->p_stats) {
	OSIncrementAtomicLong(&p->p_stats->p_ru.ru_oublock);

	cred = bp->nb_wcred;
	if (!IS_VALID_CRED(cred) && ISSET(bp->nb_flags, NB_READ)) {
	cred = bp->nb_rcred;    /* shouldn't really happen, but... */
	if (IS_VALID_CRED(cred)) {
	kauth_cred_ref(cred);
	thd = async ? NULL : current_thread();

	/* We need to make sure the pages are locked before doing I/O. */
	if (!ISSET(bp->nb_flags, NB_META)) {
	if (UBCINFOEXISTS(NFSTOV(np))) {
	if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
	error = nfs_buf_upl_setup(bp);
	printf("nfs_buf_write: upl create failed %d\n", error);
	SET(bp->nb_flags, NB_ERROR);
	bp->nb_error = error = EIO;
	nfs_buf_upl_check(bp);
	/* We should never be in nfs_buf_write() with no UBCINFO. */
	printf("nfs_buf_write: ubcinfo already gone\n");
	SET(bp->nb_flags, NB_ERROR);
	bp->nb_error = error = EIO;

	/* If NB_NEEDCOMMIT is set, a commit RPC may do the trick. */
	if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
	nfs_buf_check_write_verifier(np, bp);
	if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
	struct nfsmount *nmp = NFSTONMP(np);
	if (nfs_mount_gone(nmp)) {
	SET(bp->nb_flags, NB_ERROR);
	bp->nb_error = error = EIO;
	SET(bp->nb_flags, NB_WRITEINPROG);
	error = nmp->nm_funcs->nf_commit_rpc(np, NBOFF(bp) + bp->nb_dirtyoff,
	    bp->nb_dirtyend - bp->nb_dirtyoff, bp->nb_wcred, bp->nb_verf);
	CLR(bp->nb_flags, NB_WRITEINPROG);
	if (error != NFSERR_STALEWRITEVERF) {
	SET(bp->nb_flags, NB_ERROR);
	bp->nb_error = error;
	bp->nb_dirtyoff = bp->nb_dirtyend = 0;
	CLR(bp->nb_flags, NB_NEEDCOMMIT);
	nfs_node_lock_force(np);
	np->n_needcommitcnt--;
	CHECK_NEEDCOMMITCNT(np);
	nfs_node_unlock(np);

	if (!error && (bp->nb_dirtyend > 0)) {
	/* sanity check the dirty range */
	if (NBOFF(bp) + bp->nb_dirtyend > (off_t) np->n_size) {
	bp->nb_dirtyend = np->n_size - NBOFF(bp);
	if (bp->nb_dirtyoff >= bp->nb_dirtyend) {
	bp->nb_dirtyoff = bp->nb_dirtyend = 0;

	if (!error && (bp->nb_dirtyend > 0)) {
	/* there's a dirty range that needs to be written out */
	nfsbufpgs pagemask, pagemaskand;

	doff = bp->nb_dirtyoff;
	dend = bp->nb_dirtyend;

	/* if doff page is dirty, move doff to start of page */
	if (NBPGDIRTY(bp, doff / PAGE_SIZE)) {
	doff -= doff & PAGE_MASK;
	/* try to expand write range to include preceding dirty pages */
	if (!(doff & PAGE_MASK)) {
	while ((doff > 0) && NBPGDIRTY(bp, (doff - 1) / PAGE_SIZE)) {
	/* if dend page is dirty, move dend to start of next page */
	if ((dend & PAGE_MASK) && NBPGDIRTY(bp, dend / PAGE_SIZE)) {
	dend = round_page_64(dend);
	/* try to expand write range to include trailing dirty pages */
	if (!(dend & PAGE_MASK)) {
	while ((dend < (int)bp->nb_bufsize) && NBPGDIRTY(bp, dend / PAGE_SIZE)) {
	/* make sure to keep dend clipped to EOF */
	if ((NBOFF(bp) + dend) > (off_t) np->n_size) {
	dend = np->n_size - NBOFF(bp);

	/* calculate range of complete pages being written */
	firstpg = doff / PAGE_SIZE;
	lastpg = (dend - 1) / PAGE_SIZE;
	/* calculate mask for that page range */
	nfs_buf_pgs_set_pages_between(&pagemask, firstpg, lastpg + 1);
	NBPGS_ERASE(&pagemask);

	/*
	 * compare page mask to nb_dirty; if there are other dirty pages
	 * then write FILESYNC; otherwise, write UNSTABLE if async and
	 * not needcommit/stable; otherwise write FILESYNC
	 */
	nfs_buf_pgs_bit_not(&pagemask);
	nfs_buf_pgs_bit_and(&bp->nb_dirty, &pagemask, &pagemaskand);
	if (nfs_buf_pgs_is_set(&pagemaskand)) {
	iomode = NFS_WRITE_FILESYNC;
	} else if ((bp->nb_flags & (NB_ASYNC | NB_NEEDCOMMIT | NB_STABLE)) == NB_ASYNC) {
	iomode = NFS_WRITE_UNSTABLE;
	iomode = NFS_WRITE_FILESYNC;

	/* write the whole contiguous dirty range */
	bp->nb_offio = doff;
	bp->nb_endio = dend;

	OSAddAtomic64(1, &nfsstats.write_bios);

	SET(bp->nb_flags, NB_WRITEINPROG);
	error = nfs_buf_write_rpc(bp, iomode, thd, cred);
	/*
	 * For async I/O, the callbacks will finish up the
	 * write and push out any dirty pages. Otherwise,
	 * the write has already been finished and any dirty
	 */
	if (!error && nfs_buf_pgs_is_set(&bp->nb_dirty)) { /* write out any dirty pages */
	error = nfs_buf_write_dirty_pages(bp, thd, cred);

	/* note: bp is still valid only for !async case */
	error = nfs_buf_iowait(bp);
	/* move to clean list */
	if (oldflags & NB_DELWRI) {
	lck_mtx_lock(nfs_buf_mutex);
	if (bp->nb_vnbufs.le_next != NFSNOLIST) {
	LIST_REMOVE(bp, nb_vnbufs);
	LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
	lck_mtx_unlock(nfs_buf_mutex);

	FSDBG_BOT(553, bp, NBOFF(bp), bp->nb_flags, error);
	nfs_buf_release(bp, 1);
	/* check if we need to invalidate (and we can) */
	if ((np->n_flag & NNEEDINVALIDATE) &&
	    !(np->n_bflag & (NBINVALINPROG | NBFLUSHINPROG))) {
	nfs_node_lock_force(np);
	if (np->n_flag & NNEEDINVALIDATE) {
	np->n_flag &= ~NNEEDINVALIDATE;
	nfs_node_unlock(np);
	/*
	 * There was a write error and we need to
	 * invalidate attrs and flush buffers in
	 * order to sync up with the server.
	 * (if this write was extending the file,
	 * we may no longer know the correct size)
	 *
	 * But we couldn't call vinvalbuf while holding
	 * the buffer busy.  So we call vinvalbuf() after
	 * releasing the buffer.
	 */
	nfs_vinvalbuf2(NFSTOV(np), V_SAVE | V_IGNORE_WRITEERR, thd, cred, 1);

	if (IS_VALID_CRED(cred)) {
	kauth_cred_unref(&cred);
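
/*
 * Illustrative sketch (not from the original source): nfs_buf_write() expands
 * the dirty byte range [doff, dend) to page boundaries when the containing
 * pages are themselves dirty, so whole dirty pages go out in one contiguous
 * write.  The helper below (hypothetical name) shows just the rounding step,
 * assuming a 4KB page size.
 */
#if 0   /* example only -- not built */
#include <stdint.h>

#define EX_PAGE_SIZE 4096
#define EX_PAGE_MASK (EX_PAGE_SIZE - 1)

static void
example_round_dirty_range(int64_t *doffp, int64_t *dendp, int64_t bufsize)
{
	int64_t doff = *doffp, dend = *dendp;

	doff -= doff & EX_PAGE_MASK;                              /* round start down to a page */
	dend = (dend + EX_PAGE_MASK) & ~(int64_t)EX_PAGE_MASK;    /* round end up to a page */
	if (dend > bufsize) {
		dend = bufsize;                                   /* never past the buffer */
	}
	*doffp = doff;
	*dendp = dend;
}
#endif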
/*
 * finish the writing of a buffer
 */
nfs_buf_write_finish(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred)
	nfsnode_t np = bp->nb_np;
	int error = (bp->nb_flags & NB_ERROR) ? bp->nb_error : 0;
	off_t firstpg, lastpg;

	if ((error == EINTR) || (error == ERESTART)) {
	CLR(bp->nb_flags, NB_ERROR);
	SET(bp->nb_flags, NB_EINTR);

	/* calculate range of complete pages being written */
	if (bp->nb_endio > bp->nb_offio) {
	firstpg = bp->nb_offio / PAGE_SIZE;
	lastpg = (bp->nb_endio - 1) / PAGE_SIZE;
	/* calculate mask for that page range written */
	nfs_buf_pgs_set_pages_between(&pagemask, firstpg, lastpg + 1);
	NBPGS_ERASE(&pagemask);
	/* clear dirty bits for pages we've written */
	nfs_buf_pgs_bit_not(&pagemask);
	nfs_buf_pgs_bit_and(&bp->nb_dirty, &pagemask, &bp->nb_dirty);

	/* manage needcommit state */
	if (!error && (bp->nb_commitlevel == NFS_WRITE_UNSTABLE)) {
	if (!ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
	nfs_node_lock_force(np);
	np->n_needcommitcnt++;
	nfs_node_unlock(np);
	SET(bp->nb_flags, NB_NEEDCOMMIT);
	/* make sure nb_dirtyoff/nb_dirtyend reflect actual range written */
	bp->nb_dirtyoff = bp->nb_offio;
	bp->nb_dirtyend = bp->nb_endio;
	} else if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
	nfs_node_lock_force(np);
	np->n_needcommitcnt--;
	CHECK_NEEDCOMMITCNT(np);
	nfs_node_unlock(np);
	CLR(bp->nb_flags, NB_NEEDCOMMIT);

	CLR(bp->nb_flags, NB_WRITEINPROG);

	/*
	 * For an unstable write, the buffer is still treated as dirty until
	 * a commit (or stable (re)write) is performed.  Buffers needing only
	 * a commit are marked with the NB_DELWRI and NB_NEEDCOMMIT flags.
	 *
	 * If the write was interrupted we set NB_EINTR.  Don't set NB_ERROR
	 * because that would cause the buffer to be dropped.  The buffer is
	 * still valid and simply needs to be written again.
	 */
	if ((error == EINTR) || (error == ERESTART) || (!error && (bp->nb_flags & NB_NEEDCOMMIT))) {
	CLR(bp->nb_flags, NB_INVAL);
	if (!ISSET(bp->nb_flags, NB_DELWRI)) {
	SET(bp->nb_flags, NB_DELWRI);
	lck_mtx_lock(nfs_buf_mutex);
	lck_mtx_unlock(nfs_buf_mutex);
	/*
	 * Since for the NB_ASYNC case, we've reassigned the buffer to the
	 * clean list, we have to reassign it back to the dirty one. Ugh.
	 */
	if (ISSET(bp->nb_flags, NB_ASYNC)) {
	/* move to dirty list */
	lck_mtx_lock(nfs_buf_mutex);
	if (bp->nb_vnbufs.le_next != NFSNOLIST) {
	LIST_REMOVE(bp, nb_vnbufs);
	LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
	lck_mtx_unlock(nfs_buf_mutex);
	/* either there's an error or we don't need to commit */
	/*
	 * There was a write error and we need to invalidate
	 * attrs and flush buffers in order to sync up with the
	 * server.  (if this write was extending the file, we
	 * may no longer know the correct size)
	 *
	 * But we can't call vinvalbuf while holding this
	 * buffer busy.  Set a flag to do it after releasing
	 */
	nfs_node_lock_force(np);
	np->n_error = error;
	np->n_flag |= (NWRITEERR | NNEEDINVALIDATE);
	NATTRINVALIDATE(np);
	nfs_node_unlock(np);
	/* clear the dirty range */
	bp->nb_dirtyoff = bp->nb_dirtyend = 0;

	if (!error && nfs_buf_pgs_is_set(&bp->nb_dirty)) {
	nfs_buf_write_dirty_pages(bp, thd, cred);
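
/*
 * Illustrative sketch (not from the original source): nfs_buf_write_finish()
 * builds a bitmask of the pages covered by the completed write and clears
 * those bits from nb_dirty with a not/and.  With a buffer small enough that
 * one 64-bit word covers every page, the same operation reduces to the
 * following (hypothetical helper, plain uint64_t instead of nfsbufpgs):
 */
#if 0   /* example only -- not built */
#include <stdint.h>

static uint64_t
example_clear_written_pages(uint64_t dirty, int firstpg, int lastpg)
{
	int width = lastpg - firstpg + 1;
	/* set bits [firstpg, lastpg] */
	uint64_t pagemask = (width >= 64) ? ~0ULL : (((1ULL << width) - 1) << firstpg);

	return dirty & ~pagemask;   /* nfs_buf_pgs_bit_not + nfs_buf_pgs_bit_and */
}
#endif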
/*
 * write out any pages marked dirty in a buffer
 *
 * We do use unstable writes and follow up with a commit.
 * If we catch the write verifier changing we'll restart
 * and do the writes filesync.
 */
nfs_buf_write_dirty_pages(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred)
	nfsnode_t np = bp->nb_np;
	struct nfsmount *nmp = NFSTONMP(np);
	int error = 0, commit, iomode, iomode2, len, pg, count, npages, off;
	char uio_buf[UIO_SIZEOF(1)];

	if (!nfs_buf_pgs_is_set(&bp->nb_dirty)) {

	/* there are pages marked dirty that need to be written out */
	OSAddAtomic64(1, &nfsstats.write_bios);
	SET(bp->nb_flags, NB_WRITEINPROG);
	npages = bp->nb_bufsize / PAGE_SIZE;
	iomode = NFS_WRITE_UNSTABLE;

	auio = uio_createwithbuffer(1, 0, UIO_SYSSPACE, UIO_WRITE,
	    &uio_buf, sizeof(uio_buf));

	NBPGS_COPY(&dirty, &bp->nb_dirty);
	wverf = bp->nb_verf;
	commit = NFS_WRITE_FILESYNC;
	for (pg = 0; pg < npages; pg++) {
	if (!NBPGDIRTY(bp, pg)) {
	while (((pg + count) < npages) && NBPGDIRTY(bp, pg + count)) {
	/* write count pages starting with page pg */
	off = pg * PAGE_SIZE;
	len = count * PAGE_SIZE;
	/* clip writes to EOF */
	if (NBOFF(bp) + off + len > (off_t) np->n_size) {
	len -= (NBOFF(bp) + off + len) - np->n_size;
	uio_reset(auio, NBOFF(bp) + off, UIO_SYSSPACE, UIO_WRITE);
	uio_addiov(auio, CAST_USER_ADDR_T(bp->nb_data + off), len);
	error = nfs_write_rpc2(np, auio, thd, cred, &iomode2, &bp->nb_verf);
	if (iomode2 < commit) { /* Retain the lowest commitment level returned. */
	if ((commit != NFS_WRITE_FILESYNC) && (wverf != bp->nb_verf)) {
	/* verifier changed, redo all the writes filesync */
	iomode = NFS_WRITE_FILESYNC;
	/* clear dirty bits */
	NBPGS_UNSET(&dirty, pg);
	if (count) { /* leave pg on last page */

	CLR(bp->nb_flags, NB_WRITEINPROG);

	if (!error && (commit != NFS_WRITE_FILESYNC)) {
	error = nmp->nm_funcs->nf_commit_rpc(np, NBOFF(bp), bp->nb_bufsize, cred, wverf);
	if (error == NFSERR_STALEWRITEVERF) {
	/* verifier changed, so we need to restart all the writes */
	iomode = NFS_WRITE_FILESYNC;

	NBPGS_COPY(&bp->nb_dirty, &dirty);
	SET(bp->nb_flags, NB_ERROR);
	bp->nb_error = error;
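
/*
 * Illustrative sketch (not from the original source): nfs_buf_write_dirty_pages()
 * walks the page bitmap looking for runs of consecutive dirty pages and issues
 * one write per run.  The loop shape, using a plain bitmask as the dirty map
 * (hypothetical helper; the real code uses NBPGDIRTY() on an nfsbufpgs):
 */
#if 0   /* example only -- not built */
#include <stdint.h>
#include <stdio.h>

static void
example_write_dirty_runs(uint64_t dirty, int npages)
{
	for (int pg = 0; pg < npages; pg++) {
		if (!(dirty & (1ULL << pg))) {
			continue;               /* page clean, keep scanning */
		}
		int count = 1;
		while (((pg + count) < npages) && (dirty & (1ULL << (pg + count)))) {
			count++;                /* extend the run */
		}
		printf("write pages %d..%d\n", pg, pg + count - 1);
		pg += count - 1;                /* leave pg on the last page of the run */
	}
}
#endif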
/*
 * initiate the NFS WRITE RPC(s) for a buffer
 */
nfs_buf_write_rpc(struct nfsbuf *bp, int iomode, thread_t thd, kauth_cred_t cred)
	struct nfsmount *nmp;
	nfsnode_t np = bp->nb_np;
	int error = 0, nfsvers, async;
	struct nfsreq_cbinfo cb;
	char uio_buf[UIO_SIZEOF(1)];
	off_t offset, length;

	if (nfs_mount_gone(nmp)) {
	bp->nb_error = error = ENXIO;
	SET(bp->nb_flags, NB_ERROR);
	nfsvers = nmp->nm_vers;
	nmwsize = nmp->nm_wsize;

	offset = bp->nb_offio;
	length = bp->nb_endio - bp->nb_offio;

	/* Note: Can only do async I/O if nfsiods are configured. */
	async = (bp->nb_flags & NB_ASYNC) && (NFSIOD_MAX > 0);
	bp->nb_commitlevel = NFS_WRITE_FILESYNC;
	cb.rcb_func = async ? nfs_buf_write_rpc_finish : NULL;

	if ((nfsvers == NFS_VER2) && ((NBOFF(bp) + bp->nb_endio) > 0xffffffffLL)) {
	bp->nb_error = error = EFBIG;
	SET(bp->nb_flags, NB_ERROR);

	auio = uio_createwithbuffer(1, NBOFF(bp) + offset, UIO_SYSSPACE,
	    UIO_WRITE, &uio_buf, sizeof(uio_buf));
	NFS_UIO_ADDIOV(auio, CAST_USER_ADDR_T(bp->nb_data + offset), length);

	bp->nb_rpcs = nrpcs = (length + nmwsize - 1) / nmwsize;
	if (async && (nrpcs > 1)) {
	SET(bp->nb_flags, NB_MULTASYNCRPC);
	CLR(bp->nb_flags, NB_MULTASYNCRPC);

	while (length > 0) {
	if (ISSET(bp->nb_flags, NB_ERROR)) {
	error = bp->nb_error;
	len = (length > nmwsize) ? nmwsize : (uint32_t)length;
	cb.rcb_args.offset = offset;
	cb.rcb_args.length = len;
	if (nmp->nm_vers >= NFS_VER4) {
	cb.rcb_args.stategenid = nmp->nm_stategenid;
	if (async && ((error = nfs_async_write_start(nmp)))) {
	error = nmp->nm_funcs->nf_write_rpc_async(np, auio, len, thd, cred,
	nfs_async_write_done(nmp);
	nfs_buf_write_rpc_finish(req);

	/*
	 * Something bad happened while trying to send the RPCs.
	 * Wait for any outstanding requests to complete.
	 */
	bp->nb_error = error;
	SET(bp->nb_flags, NB_ERROR);
	if (ISSET(bp->nb_flags, NB_MULTASYNCRPC)) {
	nrpcs = (length + nmwsize - 1) / nmwsize;
	lck_mtx_lock(nfs_buf_mutex);
	bp->nb_rpcs -= nrpcs;
	if (bp->nb_rpcs == 0) {
	/* No RPCs left, so the buffer's done */
	lck_mtx_unlock(nfs_buf_mutex);
	nfs_buf_write_finish(bp, thd, cred);
	/* wait for the last RPC to mark it done */
	while (bp->nb_rpcs > 0) {
	msleep(&bp->nb_rpcs, nfs_buf_mutex, 0,
	    "nfs_buf_write_rpc_cancel", NULL);
	lck_mtx_unlock(nfs_buf_mutex);
	nfs_buf_write_finish(bp, thd, cred);
	/* It may have just been an interrupt... that's OK */
	if (!ISSET(bp->nb_flags, NB_ERROR)) {
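
/*
 * Illustrative sketch (not from the original source): nfs_buf_write_rpc()
 * splits the dirty range into nm_wsize-sized chunks and issues one WRITE RPC
 * per chunk, recording the expected RPC count up front so the async callbacks
 * can tell when the last one finishes.  The chunking arithmetic on its own
 * (hypothetical helper):
 */
#if 0   /* example only -- not built */
#include <stdint.h>
#include <stdio.h>

static void
example_split_write(uint64_t offset, uint64_t length, uint32_t wsize)
{
	uint64_t nrpcs = (length + wsize - 1) / wsize;   /* round up */

	printf("%llu RPC(s)\n", (unsigned long long)nrpcs);
	while (length > 0) {
		uint32_t len = (length > wsize) ? wsize : (uint32_t)length;
		printf("WRITE offset=%llu len=%u\n", (unsigned long long)offset, len);
		offset += len;
		length -= len;
	}
}
#endif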
/*
 * finish up an NFS WRITE RPC on a buffer
 */
nfs_buf_write_rpc_finish(struct nfsreq *req)
	int error = 0, nfsvers, multasyncrpc, finished;
	int committed = NFS_WRITE_FILESYNC;
	size_t rlen, length;
	void *wakeme = NULL;
	struct nfsreq_cbinfo cb;
	struct nfsreq *wreq = NULL;
	struct nfsmount *nmp;
	char uio_buf[UIO_SIZEOF(1)];

	thd = req->r_thread;
	if (IS_VALID_CRED(cred)) {
	kauth_cred_ref(cred);
	cb = req->r_callback;
	if (cb.rcb_func) { /* take an extra reference on the nfsreq in case we want to resend it later due to grace error */
	nfs_request_ref(req, 0);

	if (nfs_mount_gone(nmp)) {
	SET(bp->nb_flags, NB_ERROR);
	bp->nb_error = error = ENXIO;
	if (error || ISSET(bp->nb_flags, NB_ERROR)) {
	nfs_request_async_cancel(req);

	nfsvers = nmp->nm_vers;
	offset = cb.rcb_args.offset;
	rlen = length = cb.rcb_args.length;

	/* finish the RPC */
	error = nmp->nm_funcs->nf_write_rpc_async_finish(np, req, &committed, &rlen, &wverf);
	if ((error == EINPROGRESS) && cb.rcb_func) {
	/* async request restarted */
	nfs_request_rele(req);
	if (IS_VALID_CRED(cred)) {
	kauth_cred_unref(&cred);

	if ((nmp->nm_vers >= NFS_VER4) && nfs_mount_state_error_should_restart(error) && !ISSET(bp->nb_flags, NB_ERROR)) {
	lck_mtx_lock(&nmp->nm_lock);
	if ((error != NFSERR_OLD_STATEID) && (error != NFSERR_GRACE) && (cb.rcb_args.stategenid == nmp->nm_stategenid)) {
	NP(np, "nfs_buf_write_rpc_finish: error %d @ 0x%llx, 0x%x 0x%x, initiating recovery",
	    error, NBOFF(bp) + offset, cb.rcb_args.stategenid, nmp->nm_stategenid);
	nfs_need_recover(nmp, error);
	lck_mtx_unlock(&nmp->nm_lock);
	if (np->n_flag & NREVOKE) {
	if (error == NFSERR_GRACE) {
	/*
	 * For an async I/O request, handle a grace delay just like
	 * jukebox errors.  Set the resend time and queue it up.
	 */
	if (req->r_nmrep.nmc_mhead) {
	mbuf_freem(req->r_nmrep.nmc_mhead);
	req->r_nmrep.nmc_mhead = NULL;
	lck_mtx_lock(&req->r_mtx);
	req->r_resendtime = now.tv_sec + 2;
	req->r_xid = 0;                  // get a new XID
	req->r_flags |= R_RESTART;
	nfs_asyncio_resend(req);
	lck_mtx_unlock(&req->r_mtx);
	if (IS_VALID_CRED(cred)) {
	kauth_cred_unref(&cred);
	/* Note: nfsreq reference taken will be dropped later when finished */
	/* otherwise, just pause a couple seconds and retry */
	tsleep(&nmp->nm_state, (PZERO - 1), "nfsgrace", 2 * hz);
	if (!(error = nfs_mount_state_wait_for_recovery(nmp))) {
	SET(bp->nb_flags, NB_ERROR);
	bp->nb_error = error;

	if (error || (nfsvers == NFS_VER2)) {
	SET(bp->nb_flags, NB_ERROR);
	bp->nb_error = error = EIO;

	/* save lowest commit level returned */
	if (committed < bp->nb_commitlevel) {
	bp->nb_commitlevel = committed;

	/* check the write verifier */
	bp->nb_verf = wverf;
	} else if (bp->nb_verf != wverf) {
	/* verifier changed, so buffer will need to be rewritten */
	bp->nb_flags |= NB_STALEWVERF;
	bp->nb_commitlevel = NFS_WRITE_UNSTABLE;
	bp->nb_verf = wverf;

	if ((rlen > 0) && (bp->nb_offio < (offset + (int)rlen))) {
	bp->nb_offio = offset + rlen;

	/*
	 * check for a short write
	 *
	 * If the server didn't write all the data, then we
	 * need to issue another write for the rest of it.
	 * (Don't bother if the buffer hit an error or stale wverf.)
	 */
	if ((rlen < length) && !(bp->nb_flags & (NB_STALEWVERF | NB_ERROR))) {
	auio = uio_createwithbuffer(1, NBOFF(bp) + offset, UIO_SYSSPACE,
	    UIO_WRITE, &uio_buf, sizeof(uio_buf));
	uio_addiov(auio, CAST_USER_ADDR_T(bp->nb_data + offset), length);
	cb.rcb_args.offset = offset;
	cb.rcb_args.length = length;
	if (nmp->nm_vers >= NFS_VER4) {
	cb.rcb_args.stategenid = nmp->nm_stategenid;
	// XXX iomode should really match the original request
	error = nmp->nm_funcs->nf_write_rpc_async(np, auio, length, thd, cred,
	    NFS_WRITE_FILESYNC, &cb, &wreq);
	if (IS_VALID_CRED(cred)) {
	kauth_cred_unref(&cred);
	/* if !async we'll need to wait for this RPC to finish */
	nfs_request_rele(req);
	/*
	 * Outstanding RPC count is unchanged.
	 * Callback will be called when RPC is done.
	 */
	SET(bp->nb_flags, NB_ERROR);
	bp->nb_error = error;

	nfs_async_write_done(nmp);
	nfs_request_rele(req);
	/*
	 * Decrement outstanding RPC count on buffer
	 * and call nfs_buf_write_finish on last RPC.
	 *
	 * (Note: when there are multiple async RPCs issued for a
	 * buffer we need nfs_buffer_mutex to avoid problems when
	 * aborting a partially-initiated set of RPCs)
	 */
	multasyncrpc = ISSET(bp->nb_flags, NB_MULTASYNCRPC);
	lck_mtx_lock(nfs_buf_mutex);
	finished = (bp->nb_rpcs == 0);
	lck_mtx_unlock(nfs_buf_mutex);
	wakeme = &bp->nb_rpcs;
	nfs_buf_write_finish(bp, thd, cred);

	if (IS_VALID_CRED(cred)) {
	kauth_cred_unref(&cred);

	if (cb.rcb_func && np->n_needcommitcnt >= NFS_A_LOT_OF_NEEDCOMMITS) {
	nfs_flushcommits(np, 1);
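
/*
 * Illustrative sketch (not from the original source): when the server reports
 * a short write (rlen < length), nfs_buf_write_rpc_finish() advances the
 * offset by what was written and issues another WRITE for the remainder.
 * The bookkeeping alone (hypothetical helper):
 */
#if 0   /* example only -- not built */
#include <stddef.h>

/* Returns the length still to be written after a reply covering rlen bytes. */
static size_t
example_short_write_remainder(size_t *offsetp, size_t length, size_t rlen)
{
	if (rlen >= length) {
		return 0;               /* server wrote everything we asked for */
	}
	*offsetp += rlen;               /* resume where the server stopped */
	return length - rlen;           /* issue another write for this much */
}
#endif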
/*
 * Send commit(s) for the given node's "needcommit" buffers
 */
nfs_flushcommits(nfsnode_t np, int nowait)
	struct nfsmount *nmp;
	struct nfsbuf *bp, *prevlbp, *lbp;
	struct nfsbuflists blist, commitlist;
	int error = 0, retv, wcred_set, flags;
	u_quad_t off, endoff, toff;
	uint64_t wverf, count;
	kauth_cred_t wcred = NULL;

	FSDBG_TOP(557, np, 0, 0, 0);

	/*
	 * A nb_flags == (NB_DELWRI | NB_NEEDCOMMIT) block has been written to the
	 * server, but has not been committed to stable storage on the server
	 * yet. The byte range is worked out for as many nfsbufs as we can handle
	 * and the commit rpc is done.
	 */
	if (!LIST_EMPTY(&np->n_dirtyblkhd)) {
	error = nfs_node_lock(np);
	np->n_flag |= NMODIFIED;
	nfs_node_unlock(np);

	LIST_INIT(&commitlist);

	if (nfs_mount_gone(nmp)) {
	if (nmp->nm_vers == NFS_VER2) {
	flags |= NBI_NOWAIT;
	lck_mtx_lock(nfs_buf_mutex);
	wverf = nmp->nm_verf;
	if (!nfs_buf_iterprepare(np, &blist, flags)) {
	while ((bp = LIST_FIRST(&blist))) {
	LIST_REMOVE(bp, nb_vnbufs);
	LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
	error = nfs_buf_acquire(bp, NBAC_NOWAIT, 0, 0);
	if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
	nfs_buf_check_write_verifier(np, bp);
	if (((bp->nb_flags & (NB_DELWRI | NB_NEEDCOMMIT)) != (NB_DELWRI | NB_NEEDCOMMIT)) ||
	    (bp->nb_verf != wverf)) {
	nfs_buf_remfree(bp);

	/* buffer UPLs will be grabbed *in order* below */

	FSDBG(557, bp, bp->nb_flags, bp->nb_valid, bp->nb_dirty);
	FSDBG(557, bp->nb_validoff, bp->nb_validend,
	    bp->nb_dirtyoff, bp->nb_dirtyend);

	/*
	 * Work out if all buffers are using the same cred
	 * so we can deal with them all with one commit.
	 *
	 * Note: creds in bp's must be obtained by kauth_cred_ref
	 * on the same original cred in order for them to be equal.
	 */
	if (wcred_set == 0) {
	wcred = bp->nb_wcred;
	if (!IS_VALID_CRED(wcred)) {
	panic("nfs: needcommit w/out wcred");
	} else if ((wcred_set == 1) && wcred != bp->nb_wcred) {
	SET(bp->nb_flags, NB_WRITEINPROG);

	/*
	 * Add this buffer to the list of buffers we are committing.
	 * Buffers are inserted into the list in ascending order so that
	 * we can take the UPLs in order after the list is complete.
	 */
	LIST_FOREACH(lbp, &commitlist, nb_vnbufs) {
	if (bp->nb_lblkno < lbp->nb_lblkno) {
	LIST_REMOVE(bp, nb_vnbufs);
	LIST_INSERT_AFTER(prevlbp, bp, nb_vnbufs);
	LIST_INSERT_HEAD(&commitlist, bp, nb_vnbufs);

	/* update commit range start, end */
	toff = NBOFF(bp) + bp->nb_dirtyoff;
	toff += (u_quad_t)(bp->nb_dirtyend - bp->nb_dirtyoff);
	if (toff > endoff) {
	nfs_buf_itercomplete(np, &blist, NBI_DIRTY);
	lck_mtx_unlock(nfs_buf_mutex);

	if (LIST_EMPTY(&commitlist)) {

	/*
	 * We need a UPL to prevent others from accessing the buffers during
	 * our commit RPC(s).
	 *
	 * We used to also check for dirty pages here; if there were any we'd
	 * abort the commit and force the entire buffer to be written again.
	 * Instead of doing that, we just go ahead and commit the dirty range,
	 * and then leave the buffer around with dirty pages that will be
	 * written out later.
	 */
	LIST_FOREACH(bp, &commitlist, nb_vnbufs) {
	if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
	retv = nfs_buf_upl_setup(bp);
	/* Unable to create the UPL, the VM object probably no longer exists. */
	printf("nfs_flushcommits: upl create failed %d\n", retv);
	NBPGS_ERASE(&bp->nb_valid);
	NBPGS_ERASE(&bp->nb_dirty);
	nfs_buf_upl_check(bp);

	/*
	 * Commit data on the server, as required.
	 * If all bufs are using the same wcred, then use that with
	 * one call for all of them, otherwise commit each one
	 */
	if (wcred_set == 1) {
	/*
	 * Note, it's possible the commit range could be >2^32-1.
	 * If it is, we'll send one commit that covers the whole file.
	 */
	if ((endoff - off) > 0xffffffff) {
	count = (endoff - off);
	retv = nmp->nm_funcs->nf_commit_rpc(np, off, count, wcred, wverf);
	LIST_FOREACH(bp, &commitlist, nb_vnbufs) {
	toff = NBOFF(bp) + bp->nb_dirtyoff;
	count = bp->nb_dirtyend - bp->nb_dirtyoff;
	retv = nmp->nm_funcs->nf_commit_rpc(np, toff, count, bp->nb_wcred, wverf);

	/*
	 * Now, either mark the blocks I/O done or mark the
	 * blocks dirty, depending on whether the commit
	 */
	while ((bp = LIST_FIRST(&commitlist))) {
	LIST_REMOVE(bp, nb_vnbufs);
	FSDBG(557, bp, retv, bp->nb_flags, bp->nb_dirty);
	nfs_node_lock_force(np);
	CLR(bp->nb_flags, (NB_NEEDCOMMIT | NB_WRITEINPROG));
	np->n_needcommitcnt--;
	CHECK_NEEDCOMMITCNT(np);
	nfs_node_unlock(np);

	/* move back to dirty list */
	lck_mtx_lock(nfs_buf_mutex);
	LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
	lck_mtx_unlock(nfs_buf_mutex);
	nfs_buf_release(bp, 1);

	nfs_node_lock_force(np);
	nfs_node_unlock(np);
	vnode_startwrite(NFSTOV(np));
	if (ISSET(bp->nb_flags, NB_DELWRI)) {
	lck_mtx_lock(nfs_buf_mutex);
	lck_mtx_unlock(nfs_buf_mutex);
	wakeup(&nfs_nbdwrite);
	CLR(bp->nb_flags, (NB_READ | NB_DONE | NB_ERROR | NB_DELWRI));
	/* if block still has dirty pages, we don't want it to */
	/* be released in nfs_buf_iodone().  So, don't set NB_ASYNC. */
	NBPGS_COPY(&dirty, &bp->nb_dirty);
	if (!nfs_buf_pgs_is_set(&dirty)) {
	SET(bp->nb_flags, NB_ASYNC);
	CLR(bp->nb_flags, NB_ASYNC);

	/* move to clean list */
	lck_mtx_lock(nfs_buf_mutex);
	LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
	lck_mtx_unlock(nfs_buf_mutex);

	bp->nb_dirtyoff = bp->nb_dirtyend = 0;

	if (nfs_buf_pgs_is_set(&dirty)) {
	/* throw it back in as a delayed write buffer */
	CLR(bp->nb_flags, NB_DONE);
	nfs_buf_write_delayed(bp);

	FSDBG_BOT(557, np, 0, 0, error);
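
/*
 * Illustrative sketch (not from the original source): nfs_flushcommits()
 * accumulates one byte range [off, endoff) covering every buffer's dirty
 * range and sends a single COMMIT when all buffers share a credential.
 * Because the COMMIT count field is 32 bits, a range wider than 2^32-1 is
 * sent with count 0, which in the COMMIT protocol means "from offset to end
 * of file".  A sketch of that decision (hypothetical helper):
 */
#if 0   /* example only -- not built */
#include <stdint.h>

static void
example_commit_args(uint64_t off, uint64_t endoff,
    uint64_t *commit_off, uint64_t *commit_count)
{
	*commit_off = off;
	if ((endoff - off) > 0xffffffffULL) {
		*commit_count = 0;              /* count 0: commit to end of file */
	} else {
		*commit_count = endoff - off;   /* commit exactly the dirty span */
	}
}
#endif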
/*
 * Flush all the blocks associated with a vnode.
 * Walk through the buffer pool and push any dirty pages
 * associated with the vnode.
 */
nfs_flush(nfsnode_t np, int waitfor, thread_t thd, int ignore_writeerr)
	struct nfsbuflists blist;
	struct nfsmount *nmp = NFSTONMP(np);
	int error = 0, error2, slptimeo = 0, slpflag = 0;
	int nfsvers, flags, passone = 1;

	FSDBG_TOP(517, np, waitfor, ignore_writeerr, 0);

	if (nfs_mount_gone(nmp)) {
	nfsvers = nmp->nm_vers;
	if (NMFLAG(nmp, INTR)) {

	if (!LIST_EMPTY(&np->n_dirtyblkhd)) {
	nfs_node_lock_force(np);
	np->n_flag |= NMODIFIED;
	nfs_node_unlock(np);

	lck_mtx_lock(nfs_buf_mutex);
	while (np->n_bflag & NBFLUSHINPROG) {
	np->n_bflag |= NBFLUSHWANT;
	error = msleep(&np->n_bflag, nfs_buf_mutex, slpflag, "nfs_flush", NULL);
	if ((error && (error != EWOULDBLOCK)) ||
	    ((error = nfs_sigintr(NFSTONMP(np), NULL, thd, 0)))) {
	lck_mtx_unlock(nfs_buf_mutex);
	np->n_bflag |= NBFLUSHINPROG;

	/*
	 * On the first pass, start async/unstable writes on all
	 * delayed write buffers.  Then wait for all writes to complete
	 * and call nfs_flushcommits() to commit any uncommitted buffers.
	 * On all subsequent passes, start STABLE writes on any remaining
	 * dirty buffers.  Then wait for all writes to complete.
	 */
	FSDBG(518, LIST_FIRST(&np->n_dirtyblkhd), np->n_flag, 0, 0);
	if (!NFSTONMP(np)) {
	lck_mtx_unlock(nfs_buf_mutex);

	/* Start/do any write(s) that are required. */
	if (!nfs_buf_iterprepare(np, &blist, NBI_DIRTY)) {
	while ((bp = LIST_FIRST(&blist))) {
	LIST_REMOVE(bp, nb_vnbufs);
	LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
	flags = (passone || !(waitfor == MNT_WAIT || waitfor == MNT_DWAIT)) ? NBAC_NOWAIT : 0;
	if (flags != NBAC_NOWAIT) {
	while ((error = nfs_buf_acquire(bp, flags, slpflag, slptimeo))) {
	FSDBG(524, bp, flags, bp->nb_lflags, bp->nb_flags);
	if (error == EBUSY) {
	error2 = nfs_sigintr(NFSTONMP(np), NULL, thd, 0);
	if (flags != NBAC_NOWAIT) {
	nfs_buf_refrele(bp);
	nfs_buf_itercomplete(np, &blist, NBI_DIRTY);
	lck_mtx_unlock(nfs_buf_mutex);
	if (slpflag == PCATCH) {
	if (flags != NBAC_NOWAIT) {
	nfs_buf_refrele(bp);
	if (error == EBUSY) {
	/* buffer is no longer valid */
	if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
	nfs_buf_check_write_verifier(np, bp);
	if (!ISSET(bp->nb_flags, NB_DELWRI)) {
	/* buffer is no longer dirty */
	FSDBG(525, bp, passone, bp->nb_lflags, bp->nb_flags);
	if ((passone || !(waitfor == MNT_WAIT || waitfor == MNT_DWAIT)) &&
	    ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
	nfs_buf_remfree(bp);
	lck_mtx_unlock(nfs_buf_mutex);
	if (ISSET(bp->nb_flags, NB_ERROR)) {
	nfs_node_lock_force(np);
	np->n_error = bp->nb_error ? bp->nb_error : EIO;
	np->n_flag |= NWRITEERR;
	nfs_node_unlock(np);
	nfs_buf_release(bp, 1);
	lck_mtx_lock(nfs_buf_mutex);
	SET(bp->nb_flags, NB_ASYNC);
	/* NB_STABLE forces this to be written FILESYNC */
	SET(bp->nb_flags, NB_STABLE);
	lck_mtx_lock(nfs_buf_mutex);
	nfs_buf_itercomplete(np, &blist, NBI_DIRTY);
	lck_mtx_unlock(nfs_buf_mutex);

	if (waitfor == MNT_WAIT || waitfor == MNT_DWAIT) {
	while ((error = vnode_waitforwrites(NFSTOV(np), 0, slpflag, slptimeo, "nfsflush"))) {
	error2 = nfs_sigintr(NFSTONMP(np), NULL, thd, 0);
	if (slpflag == PCATCH) {

	if (nfsvers != NFS_VER2) {
	/* loop while it looks like there are still buffers to be */
	/* committed and nfs_flushcommits() seems to be handling them. */
	while (np->n_needcommitcnt) {
	if (nfs_flushcommits(np, 0)) {

	if (!LIST_EMPTY(&np->n_dirtyblkhd)) {
	nfs_node_lock_force(np);
	np->n_flag |= NMODIFIED;
	nfs_node_unlock(np);
	lck_mtx_lock(nfs_buf_mutex);

	if (waitfor == MNT_WAIT || waitfor == MNT_DWAIT) {
	if (!LIST_EMPTY(&np->n_dirtyblkhd)) {
	nfs_node_lock_force(np);
	np->n_flag |= NMODIFIED;
	nfs_node_unlock(np);
	lck_mtx_lock(nfs_buf_mutex);
	if (!LIST_EMPTY(&np->n_dirtyblkhd)) {
	lck_mtx_unlock(nfs_buf_mutex);
	nfs_node_lock_force(np);
	/*
	 * OK, it looks like there are no dirty blocks.  If we have no
	 * writes in flight and no one in the write code, we can clear
	 * the modified flag.  In order to make sure we see the latest
	 * attributes and size, we also invalidate the attributes and
	 * advance the attribute cache XID to guarantee that attributes
	 * newer than our clearing of NMODIFIED will get loaded next.
	 * (If we don't do this, it's possible for the flush's final
	 * write/commit (xid1) to be executed in parallel with a subsequent
	 * getattr request (xid2).  The getattr could return attributes
	 * from *before* the write/commit completed but the stale attributes
	 * would be preferred because of the xid ordering.)
	 */
	if (!np->n_wrbusy && !np->n_numoutput) {
	np->n_flag &= ~NMODIFIED;
	NATTRINVALIDATE(np);
	nfs_get_xid(&np->n_xid);
	nfs_node_lock_force(np);

	FSDBG(526, np->n_flag, np->n_error, 0, 0);
	if (!ignore_writeerr && (np->n_flag & NWRITEERR)) {
	error = np->n_error;
	np->n_flag &= ~NWRITEERR;
	nfs_node_unlock(np);

	lck_mtx_lock(nfs_buf_mutex);
	flags = np->n_bflag;
	np->n_bflag &= ~(NBFLUSHINPROG | NBFLUSHWANT);
	lck_mtx_unlock(nfs_buf_mutex);
	if (flags & NBFLUSHWANT) {
	wakeup(&np->n_bflag);

	FSDBG_BOT(517, np, error, ignore_writeerr, 0);
/*
 * Flush out and invalidate all buffers associated with a vnode.
 * Called with the underlying object locked.
 */
nfs_vinvalbuf_internal(
	struct nfsbuflists blist;
	int list, error = 0;

	if (flags & V_SAVE) {
	if ((error = nfs_flush(np, MNT_WAIT, thd, (flags & V_IGNORE_WRITEERR)))) {

	lck_mtx_lock(nfs_buf_mutex);
	if (nfs_buf_iterprepare(np, &blist, list)) {
	if (nfs_buf_iterprepare(np, &blist, list)) {
	while ((bp = LIST_FIRST(&blist))) {
	LIST_REMOVE(bp, nb_vnbufs);
	if (list == NBI_CLEAN) {
	LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
	LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
	while ((error = nfs_buf_acquire(bp, NBAC_REMOVE, slpflag, slptimeo))) {
	FSDBG(556, np, bp, NBOFF(bp), bp->nb_flags);
	if (error != EAGAIN) {
	FSDBG(554, np, bp, -1, error);
	nfs_buf_refrele(bp);
	nfs_buf_itercomplete(np, &blist, list);
	lck_mtx_unlock(nfs_buf_mutex);
	nfs_buf_refrele(bp);
	FSDBG(554, np, bp, NBOFF(bp), bp->nb_flags);
	lck_mtx_unlock(nfs_buf_mutex);
	if ((flags & V_SAVE) && UBCINFOEXISTS(NFSTOV(np)) && bp->nb_np &&
	    (NBOFF(bp) < (off_t)np->n_size)) {
	/* extra paranoia: make sure we're not */
	/* somehow leaving any dirty data around */
	off_t end = (NBOFF(bp) + bp->nb_bufsize > (off_t)np->n_size) ?
	    (np->n_size - NBOFF(bp)) : bp->nb_bufsize;
	if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
	error = nfs_buf_upl_setup(bp);
	if (error == EINVAL) {
	/* vm object must no longer exist */
	/* hopefully we don't need to do */
	/* anything for this buffer */
	printf("nfs_vinvalbuf: upl setup failed %d\n", error);
	NBPGS_ERASE(&bp->nb_valid);
	NBPGS_ERASE(&bp->nb_dirty);
	nfs_buf_upl_check(bp);
	/* check for any dirty data before the EOF */
	if ((bp->nb_dirtyend > 0) && (bp->nb_dirtyoff < end)) {
	/* clip dirty range to EOF */
	if (bp->nb_dirtyend > end) {
	bp->nb_dirtyend = end;
	if (bp->nb_dirtyoff >= bp->nb_dirtyend) {
	bp->nb_dirtyoff = bp->nb_dirtyend = 0;
	if ((bp->nb_dirtyend > 0) && (bp->nb_dirtyoff < end)) {
	nfs_buf_pgs_get_page_mask(&pagemask, round_page_64(end) / PAGE_SIZE);
	nfs_buf_pgs_bit_and(&bp->nb_dirty, &pagemask, &bp->nb_dirty);
	if (nfs_buf_pgs_is_set(&bp->nb_dirty)) {
	/* also make sure we'll have a credential to do the write */
	if (mustwrite && !IS_VALID_CRED(bp->nb_wcred) && !IS_VALID_CRED(cred)) {
	printf("nfs_vinvalbuf: found dirty buffer with no write creds\n");
	FSDBG(554, np, bp, 0xd00dee, bp->nb_flags);
	if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
	panic("nfs_vinvalbuf: dirty buffer without upl");
	/* gotta write out dirty data before invalidating */
	/* (NB_STABLE indicates that data writes should be FILESYNC) */
	/* (NB_NOCACHE indicates buffer should be discarded) */
	CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL | NB_ASYNC));
	SET(bp->nb_flags, NB_STABLE | NB_NOCACHE);
	if (!IS_VALID_CRED(bp->nb_wcred)) {
	kauth_cred_ref(cred);
	bp->nb_wcred = cred;
	error = nfs_buf_write(bp);
	// Note: bp has been released
	FSDBG(554, bp, 0xd00dee, 0xbad, error);
	nfs_node_lock_force(np);
	if ((error != EINTR) && (error != ERESTART)) {
	np->n_error = error;
	np->n_flag |= NWRITEERR;
	/*
	 * There was a write error and we need to
	 * invalidate attrs to sync with server.
	 * (if this write was extending the file,
	 * we may no longer know the correct size)
	 */
	NATTRINVALIDATE(np);
	nfs_node_unlock(np);
	if ((error == EINTR) || (error == ERESTART)) {
	/*
	 * Abort on EINTR.  If we don't, we could
	 * be stuck in this loop forever because
	 * the buffer will continue to stay dirty.
	 */
	lck_mtx_lock(nfs_buf_mutex);
	nfs_buf_itercomplete(np, &blist, list);
	lck_mtx_unlock(nfs_buf_mutex);
	lck_mtx_lock(nfs_buf_mutex);
	SET(bp->nb_flags, NB_INVAL);
	// hold off on FREEUPs until we're done here
	nfs_buf_release(bp, 0);
	lck_mtx_lock(nfs_buf_mutex);
	nfs_buf_itercomplete(np, &blist, list);

	if (!LIST_EMPTY(&(np)->n_dirtyblkhd) || !LIST_EMPTY(&(np)->n_cleanblkhd)) {
	panic("nfs_vinvalbuf: flush/inval failed");
	lck_mtx_unlock(nfs_buf_mutex);
	nfs_node_lock_force(np);
	if (!(flags & V_SAVE)) {
	np->n_flag &= ~NMODIFIED;
	if (vnode_vtype(NFSTOV(np)) == VREG) {
	np->n_lastrahead = -1;
	nfs_node_unlock(np);
/*
 * Flush and invalidate all dirty buffers. If another process is already
 * doing the flush, just wait for completion.
 */
nfs_vinvalbuf(vnode_t vp, int flags, vfs_context_t ctx, int intrflg)
	return nfs_vinvalbuf2(vp, flags, vfs_context_thread(ctx), vfs_context_ucred(ctx), intrflg);
nfs_vinvalbuf2(vnode_t vp, int flags, thread_t thd, kauth_cred_t cred, int intrflg)
	nfsnode_t np = VTONFS(vp);
	struct nfsmount *nmp = VTONMP(vp);
	int error, slpflag, slptimeo, nflags, retry = 0;
	int ubcflags = UBC_PUSHALL | UBC_SYNC | UBC_INVALIDATE;
	struct timespec ts = { .tv_sec = 2, .tv_nsec = 0 };

	FSDBG_TOP(554, np, flags, intrflg, 0);

	/*
	 * If the mount is gone, there is no sense in trying to write
	 * anything and hanging while trying to do I/O.
	 */
	if (nfs_mount_gone(nmp)) {
	ubcflags &= ~UBC_PUSHALL;
	if (nmp && !NMFLAG(nmp, INTR)) {

	/* First wait for any other process doing a flush to complete. */
	lck_mtx_lock(nfs_buf_mutex);
	while (np->n_bflag & NBINVALINPROG) {
	np->n_bflag |= NBINVALWANT;
	msleep(&np->n_bflag, nfs_buf_mutex, slpflag, "nfs_vinvalbuf", &ts);
	if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0))) {
	lck_mtx_unlock(nfs_buf_mutex);
	if (np->n_bflag & NBINVALINPROG) {
	np->n_bflag |= NBINVALINPROG;
	lck_mtx_unlock(nfs_buf_mutex);

	/* Now, flush as required. */
	error = nfs_vinvalbuf_internal(np, flags, thd, cred, slpflag, 0);
	FSDBG(554, np, 0, 0, error);
	if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0))) {
	error = nfs_vinvalbuf_internal(np, flags, thd, cred, 0, slptimeo);

	/* get the pages out of vm also */
	if (UBCINFOEXISTS(vp) && (size = ubc_getsize(vp))) {
	if ((error = ubc_msync(vp, 0, size, NULL, ubcflags))) {
	if (error == EINVAL) {
	panic("nfs_vinvalbuf(): ubc_msync failed!, error %d", error);
	if (retry++ < 10) { /* retry invalidating a few times */
	if (retry > 1 || error == ENXIO) {
	ubcflags &= ~UBC_PUSHALL;
	printf("nfs_vinvalbuf(): ubc_msync failed!, error %d\n", error);

	lck_mtx_lock(nfs_buf_mutex);
	nflags = np->n_bflag;
	np->n_bflag &= ~(NBINVALINPROG | NBINVALWANT);
	lck_mtx_unlock(nfs_buf_mutex);
	if (nflags & NBINVALWANT) {
	wakeup(&np->n_bflag);

	FSDBG_BOT(554, np, flags, intrflg, error);
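
/*
 * Illustrative sketch (not from the original source): the ubc_msync() retry
 * loop above tries a handful of times and, after the first failure (or on
 * ENXIO), drops UBC_PUSHALL so later attempts only invalidate instead of also
 * pushing dirty pages.  The control flow in isolation (hypothetical helper and
 * flag parameters, standing in for the real ubc_msync() call):
 */
#if 0   /* example only -- not built */
static int
example_msync_retry(int (*do_msync)(int flags), int flags, int push_flag)
{
	int error = 0;

	for (int retry = 0; retry < 10; retry++) {
		error = do_msync(flags);
		if (!error) {
			return 0;
		}
		if (retry > 0) {
			flags &= ~push_flag;    /* stop pushing, just invalidate */
		}
	}
	return error;
}
#endif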
/*
 * Wait for any busy buffers to complete.
 */
nfs_wait_bufs(nfsnode_t np)
	struct nfsbuflists blist;

	lck_mtx_lock(nfs_buf_mutex);
	if (!nfs_buf_iterprepare(np, &blist, NBI_CLEAN)) {
	while ((bp = LIST_FIRST(&blist))) {
	LIST_REMOVE(bp, nb_vnbufs);
	LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
	while ((error = nfs_buf_acquire(bp, 0, 0, 0))) {
	if (error != EAGAIN) {
	nfs_buf_refrele(bp);
	nfs_buf_itercomplete(np, &blist, NBI_CLEAN);
	lck_mtx_unlock(nfs_buf_mutex);
	nfs_buf_refrele(bp);
	nfs_buf_itercomplete(np, &blist, NBI_CLEAN);
	if (!nfs_buf_iterprepare(np, &blist, NBI_DIRTY)) {
	while ((bp = LIST_FIRST(&blist))) {
	LIST_REMOVE(bp, nb_vnbufs);
	LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
	while ((error = nfs_buf_acquire(bp, 0, 0, 0))) {
	if (error != EAGAIN) {
	nfs_buf_refrele(bp);
	nfs_buf_itercomplete(np, &blist, NBI_DIRTY);
	lck_mtx_unlock(nfs_buf_mutex);
	nfs_buf_refrele(bp);
	nfs_buf_itercomplete(np, &blist, NBI_DIRTY);
	lck_mtx_unlock(nfs_buf_mutex);
/*
 * Add an async I/O request to the mount's async I/O queue and make
 * sure that an nfsiod will service it.
 */
nfs_asyncio_finish(struct nfsreq *req)
	struct nfsmount *nmp;
	struct nfsiod *niod;

	FSDBG_TOP(552, nmp, 0, 0, 0);

	lck_mtx_lock(nfsiod_mutex);
	niod = nmp->nm_niod;

	/* grab an nfsiod if we don't have one already */
	niod = TAILQ_FIRST(&nfsiodfree);
	TAILQ_REMOVE(&nfsiodfree, niod, niod_link);
	TAILQ_INSERT_TAIL(&nfsiodwork, niod, niod_link);
	niod->niod_nmp = nmp;
	} else if (((nfsiod_thread_count < NFSIOD_MAX) || (nfsiod_thread_count <= 0)) && (started < 4)) {
	/*
	 * Try starting a new thread.
	 * We may try a couple times if other callers
	 * get the new threads before we do.
	 */
	lck_mtx_unlock(nfsiod_mutex);
	if (!nfsiod_start()) {
	lck_mtx_lock(nfsiod_mutex);

	/*
	 * If we got here while being on the resendq we need to get off. This
	 * happens when the timer fires and errors out requests from nfs_sigintr
	 * or we receive a reply (UDP case) while being on the resend queue so
	 * we're just finishing up and are not going to be resent.
	 */
	lck_mtx_lock(&req->r_mtx);
	if (req->r_flags & R_RESENDQ) {
	lck_mtx_lock(&nmp->nm_lock);
	if ((req->r_flags & R_RESENDQ) && req->r_rchain.tqe_next != NFSREQNOLIST) {
	NFS_BIO_DBG("Processing async request on resendq. Removing");
	TAILQ_REMOVE(&nmp->nm_resendq, req, r_rchain);
	req->r_flags &= ~R_RESENDQ;
	req->r_rchain.tqe_next = NFSREQNOLIST;
	assert(req->r_refs > 1);
	/* Remove resendq reference */
	lck_mtx_unlock(&nmp->nm_lock);
	lck_mtx_unlock(&req->r_mtx);

	if (req->r_achain.tqe_next == NFSREQNOLIST) {
	TAILQ_INSERT_TAIL(&nmp->nm_iodq, req, r_achain);

	/* If this mount doesn't already have an nfsiod working on it... */
	if (!nmp->nm_niod) {
	if (niod) { /* give it the nfsiod we just grabbed */
	nmp->nm_niod = niod;
	lck_mtx_unlock(nfsiod_mutex);
	} else if (nfsiod_thread_count > 0) {
	/* just queue it up on nfsiod mounts queue if needed */
	if (nmp->nm_iodlink.tqe_next == NFSNOLIST) {
	TAILQ_INSERT_TAIL(&nfsiodmounts, nmp, nm_iodlink);
	lck_mtx_unlock(nfsiod_mutex);
	printf("nfs_asyncio(): no nfsiods? %d %d (%d)\n", nfsiod_thread_count, NFSIOD_MAX, started);
	lck_mtx_unlock(nfsiod_mutex);
	/* we have no other option but to be persistent */
	lck_mtx_unlock(nfsiod_mutex);

	FSDBG_BOT(552, nmp, 0, 0, 0);
/*
 * queue up async I/O request for resend
 * Must be called with req->r_mtx locked.
 */
nfs_asyncio_resend(struct nfsreq *req)
	struct nfsmount *nmp = req->r_nmp;

	if (nfs_mount_gone(nmp)) {
	nfs_gss_clnt_rpcdone(req);
	lck_mtx_lock(&nmp->nm_lock);
	if (!(req->r_flags & R_RESENDQ)) {
	TAILQ_INSERT_TAIL(&nmp->nm_resendq, req, r_rchain);
	req->r_flags |= R_RESENDQ;
	/*
	 * We take a reference on this request so that it can't be
	 * destroyed while a resend is queued or in progress.
	 */
	nfs_request_ref(req, 1);
	nfs_mount_sock_thread_wake(nmp);
	lck_mtx_unlock(&nmp->nm_lock);
/*
 * Read directory data into a buffer.
 *
 * Buffer will be filled (unless EOF is hit).
 * Buffers after this one may also be completely/partially filled.
 */
nfs_buf_readdir(struct nfsbuf *bp, vfs_context_t ctx)
	nfsnode_t np = bp->nb_np;
	struct nfsmount *nmp = NFSTONMP(np);

	if (nfs_mount_gone(nmp)) {
	if (nmp->nm_vers < NFS_VER4) {
	error = nfs3_readdir_rpc(np, bp, ctx);
	error = nfs4_readdir_rpc(np, bp, ctx);
	if (error && (error != NFSERR_DIRBUFDROPPED)) {
	SET(bp->nb_flags, NB_ERROR);
	bp->nb_error = error;

#endif /* CONFIG_NFS_CLIENT */