/*
 * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)nfs_bio.c	8.9 (Berkeley) 3/30/95
 * FreeBSD-Id: nfs_bio.c,v 1.44 1997/09/10 19:52:25 phk Exp $
 */
#include <nfs/nfs_conf.h>

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/proc_internal.h>
#include <sys/kauth.h>
#include <sys/malloc.h>
#include <sys/vnode.h>
#include <sys/dirent.h>
#include <sys/mount_internal.h>
#include <sys/kernel.h>
#include <sys/ubc_internal.h>
#include <sys/uio_internal.h>
#include <sys/kpi_mbuf.h>

#include <sys/vmparam.h>

#include <kern/clock.h>
#include <libkern/OSAtomic.h>
#include <kern/kalloc.h>
#include <kern/thread_call.h>

#include <nfs/rpcv2.h>
#include <nfs/nfsproto.h>
#include <nfs/nfs_gss.h>
#include <nfs/nfsmount.h>
#include <nfs/nfsnode.h>
#include <sys/buf_internal.h>
#include <libkern/OSAtomic.h>
#include <os/refcnt.h>
#define NFS_BIO_DBG(...) NFS_DBG(NFS_FAC_BIO, 7, ## __VA_ARGS__)

kern_return_t thread_terminate(thread_t); /* XXX */

#define NFSBUFHASH(np, lbn)	\
	(&nfsbufhashtbl[((long)(np) / sizeof(*(np)) + (int)(lbn)) & nfsbufhash])
LIST_HEAD(nfsbufhashhead, nfsbuf) * nfsbufhashtbl;
struct nfsbuffreehead nfsbuffree, nfsbuffreemeta, nfsbufdelwri;

int nfsbufcnt, nfsbufmin, nfsbufmax, nfsbufmetacnt, nfsbufmetamax;
int nfsbuffreecnt, nfsbuffreemetacnt, nfsbufdelwricnt, nfsneedbuffer;

int nfs_buf_timer_on = 0;
thread_t nfsbufdelwrithd = NULL;

ZONE_DECLARE(nfsbuf_zone, "NFS bio", sizeof(struct nfsbuf), ZC_NONE);

lck_grp_t *nfs_buf_lck_grp;
lck_mtx_t *nfs_buf_mutex;

#define NFSBUF_FREE_PERIOD	30	/* seconds */
#define NFSBUF_LRU_STALE	120
#define NFSBUF_META_STALE	240

/* number of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffree list */
#define LRU_TO_FREEUP			6
/* number of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffreemeta list */
#define META_TO_FREEUP			3
/* total number of nfsbufs nfs_buf_freeup() should attempt to free */
#define TOTAL_TO_FREEUP			(LRU_TO_FREEUP+META_TO_FREEUP)
/* fraction of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffree list when called from timer */
#define LRU_FREEUP_FRAC_ON_TIMER	8
/* fraction of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffreemeta list when called from timer */
#define META_FREEUP_FRAC_ON_TIMER	16
/* fraction of total nfsbufs that nfsbuffreecnt should exceed before bothering to call nfs_buf_freeup() */
#define LRU_FREEUP_MIN_FRAC		4
/* fraction of total nfsbufs that nfsbuffreemetacnt should exceed before bothering to call nfs_buf_freeup() */
#define META_FREEUP_MIN_FRAC		2

#define NFS_ROUND_BLOCK(p, blksize) ((((uint64_t)(p) + blksize - 1) & ~((uint64_t)blksize - 1)) / blksize)

#define NFS_BUF_FREEUP() \
	do { \
	/* only call nfs_buf_freeup() if it has work to do: */ \
		if (((nfsbuffreecnt > nfsbufcnt/LRU_FREEUP_MIN_FRAC) || \
		    (nfsbuffreemetacnt > nfsbufcnt/META_FREEUP_MIN_FRAC)) && \
		    ((nfsbufcnt - TOTAL_TO_FREEUP) > nfsbufmin)) { \
			nfs_buf_freeup(0); \
		} \
	} while (0)
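
/*
 * Example of the thresholds above: with nfsbufcnt at 1024 (and nfsbufmin
 * comfortably below that), NFS_BUF_FREEUP() only bothers calling
 * nfs_buf_freeup() once the regular free list holds more than
 * 1024/LRU_FREEUP_MIN_FRAC = 256 buffers, or the meta free list holds
 * more than 1024/META_FREEUP_MIN_FRAC = 512 buffers.
 */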
void
nfs_buf_pgs_get_page_mask(nfsbufpgs *nfsbp, off_t page)
{
	off_t page_pos = page / NBPGS_ELEMENT_PAGES;
	off_t max_page = NBPGS_STRUCT_SIZE * 8;
	NBPGS_ERASE(nfsbp);

	if (page >= max_page) {
		nfs_buf_pgs_bit_not(nfsbp);
		return;
	}

	NBPGS_SET(nfsbp, page);
	nfsbp->pages[page_pos]--;
	for (off_t i = page_pos - 1; i >= 0; i--) {
		nfsbp->pages[i] = ~0;
	}
}

void
nfs_buf_pgs_bit_not(nfsbufpgs *nfsbp)
{
	for (uint32_t i = 0; i < NBPGS_ELEMENTS; i++) {
		nfsbp->pages[i] = ~nfsbp->pages[i];
	}
}

void
nfs_buf_pgs_bit_and(nfsbufpgs *nfsbp_src1, nfsbufpgs *nfsbp_src2, nfsbufpgs *nfsbp_dst)
{
	for (uint32_t i = 0; i < NBPGS_ELEMENTS; i++) {
		nfsbp_dst->pages[i] = nfsbp_src1->pages[i] & nfsbp_src2->pages[i];
	}
}

void
nfs_buf_pgs_set_pages_between(nfsbufpgs *nfsbp, off_t firstpg, off_t lastpg)
{
	nfsbufpgs pagemaskfirst, pagemasklast;

	nfs_buf_pgs_get_page_mask(&pagemasklast, lastpg);
	nfs_buf_pgs_get_page_mask(&pagemaskfirst, firstpg);
	nfs_buf_pgs_bit_not(&pagemaskfirst);
	nfs_buf_pgs_bit_and(&pagemaskfirst, &pagemasklast, nfsbp);
}
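
/*
 * The composition above works because nfs_buf_pgs_get_page_mask() produces a
 * mask with every bit below the given page set: ~mask(firstpg) keeps pages
 * at or above firstpg, mask(lastpg) keeps pages below lastpg, so the AND
 * leaves exactly the pages in [firstpg, lastpg) set in *nfsbp.
 */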
int
nfs_buf_pgs_is_set(nfsbufpgs *nfsbp)
{
	for (uint32_t i = 0; i < NBPGS_ELEMENTS; i++) {
		if (nfsbp->pages[i] != 0) {
			return 1;
		}
	}
	return 0;
}
/*
 * Initialize nfsbuf lists
 */
void
nfs_nbinit(void)
{
	nfs_buf_lck_grp = lck_grp_alloc_init("nfs_buf", LCK_GRP_ATTR_NULL);
	nfs_buf_mutex = lck_mtx_alloc_init(nfs_buf_lck_grp, LCK_ATTR_NULL);

	nfsbufcnt = nfsbufmetacnt =
	    nfsbuffreecnt = nfsbuffreemetacnt = nfsbufdelwricnt = 0;
	/* size nfsbufmax to cover at most half sane_size (w/default buf size) */
	nfsbufmax = (int)(sane_size >> PAGE_SHIFT) / (2 * (NFS_RWSIZE >> PAGE_SHIFT));
	nfsbufmetamax = nfsbufmax / 4;

	nfsbufhashtbl = hashinit(nfsbufmax / 4, M_NFSBIO, &nfsbufhash);
	TAILQ_INIT(&nfsbuffree);
	TAILQ_INIT(&nfsbuffreemeta);
	TAILQ_INIT(&nfsbufdelwri);
}
/*
 * Check periodically for stale/unused nfs bufs
 */
void
nfs_buf_timer(__unused void *param0, __unused void *param1)
{
	nfs_buf_freeup(1);

	lck_mtx_lock(nfs_buf_mutex);
	if (nfsbufcnt <= nfsbufmin) {
		nfs_buf_timer_on = 0;
		lck_mtx_unlock(nfs_buf_mutex);
		return;
	}
	lck_mtx_unlock(nfs_buf_mutex);

	nfs_interval_timer_start(nfs_buf_timer_call,
	    NFSBUF_FREE_PERIOD * 1000);
}
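
/*
 * nfs_buf_freeup() is reached two ways: with timer == 0 from NFS_BUF_FREEUP()
 * (free a small fixed number of buffers, LRU_TO_FREEUP/META_TO_FREEUP) and
 * with timer != 0 from the periodic timer above (free a fraction of each free
 * list, per the *_FREEUP_FRAC_ON_TIMER values).
 */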
/*
 * try to free up some excess, unused nfsbufs
 */
void
nfs_buf_freeup(int timer)
{
	struct nfsbuf *fbp;
	struct timeval now;
	int count;
	struct nfsbuffreehead nfsbuffreeup;

	TAILQ_INIT(&nfsbuffreeup);

	lck_mtx_lock(nfs_buf_mutex);

	microuptime(&now);

	FSDBG(320, nfsbufcnt, nfsbuffreecnt, nfsbuffreemetacnt, 0);

	count = timer ? nfsbuffreecnt / LRU_FREEUP_FRAC_ON_TIMER : LRU_TO_FREEUP;
	while ((nfsbufcnt > nfsbufmin) && (count-- > 0)) {
		fbp = TAILQ_FIRST(&nfsbuffree);
		if (!fbp) {
			break;
		}
		if (os_ref_get_count(&fbp->nb_refs) > 1) {
			break;
		}
		if (NBUFSTAMPVALID(fbp) &&
		    (fbp->nb_timestamp + (2 * NFSBUF_LRU_STALE)) > now.tv_sec) {
			break;
		}
		nfs_buf_remfree(fbp);
		/* disassociate buffer from any nfsnode */
		if (fbp->nb_vnbufs.le_next != NFSNOLIST) {
			LIST_REMOVE(fbp, nb_vnbufs);
			fbp->nb_vnbufs.le_next = NFSNOLIST;
		}
		LIST_REMOVE(fbp, nb_hash);
		TAILQ_INSERT_TAIL(&nfsbuffreeup, fbp, nb_free);
		nfsbufcnt--;
	}

	count = timer ? nfsbuffreemetacnt / META_FREEUP_FRAC_ON_TIMER : META_TO_FREEUP;
	while ((nfsbufcnt > nfsbufmin) && (count-- > 0)) {
		fbp = TAILQ_FIRST(&nfsbuffreemeta);
		if (!fbp) {
			break;
		}
		if (os_ref_get_count(&fbp->nb_refs) > 1) {
			break;
		}
		if (NBUFSTAMPVALID(fbp) &&
		    (fbp->nb_timestamp + (2 * NFSBUF_META_STALE)) > now.tv_sec) {
			break;
		}
		nfs_buf_remfree(fbp);
		/* disassociate buffer from any nfsnode */
		if (fbp->nb_vnbufs.le_next != NFSNOLIST) {
			LIST_REMOVE(fbp, nb_vnbufs);
			fbp->nb_vnbufs.le_next = NFSNOLIST;
		}
		LIST_REMOVE(fbp, nb_hash);
		TAILQ_INSERT_TAIL(&nfsbuffreeup, fbp, nb_free);
		nfsbufcnt--;
		nfsbufmetacnt--;
	}

	FSDBG(320, nfsbufcnt, nfsbuffreecnt, nfsbuffreemetacnt, 0);

	lck_mtx_unlock(nfs_buf_mutex);

	while ((fbp = TAILQ_FIRST(&nfsbuffreeup))) {
		TAILQ_REMOVE(&nfsbuffreeup, fbp, nb_free);
		if (IS_VALID_CRED(fbp->nb_rcred)) {
			kauth_cred_unref(&fbp->nb_rcred);
		}
		if (IS_VALID_CRED(fbp->nb_wcred)) {
			kauth_cred_unref(&fbp->nb_wcred);
		}
		/* if buf was NB_META, dump buffer */
		if (ISSET(fbp->nb_flags, NB_META) && fbp->nb_data) {
			kheap_free(KHEAP_DATA_BUFFERS, fbp->nb_data, fbp->nb_bufsize);
		}
		NFS_ZFREE(nfsbuf_zone, fbp);
	}
}
/*
 * remove a buffer from the freelist
 * (must be called with nfs_buf_mutex held)
 */
void
nfs_buf_remfree(struct nfsbuf *bp)
{
	if (bp->nb_free.tqe_next == NFSNOLIST) {
		panic("nfsbuf not on free list");
	}
	if (ISSET(bp->nb_flags, NB_DELWRI)) {
		nfsbufdelwricnt--;
		TAILQ_REMOVE(&nfsbufdelwri, bp, nb_free);
	} else if (ISSET(bp->nb_flags, NB_META)) {
		nfsbuffreemetacnt--;
		TAILQ_REMOVE(&nfsbuffreemeta, bp, nb_free);
	} else {
		nfsbuffreecnt--;
		TAILQ_REMOVE(&nfsbuffree, bp, nb_free);
	}
	bp->nb_free.tqe_next = NFSNOLIST;
}
/*
 * check for existence of nfsbuf in cache
 */
boolean_t
nfs_buf_is_incore(nfsnode_t np, daddr64_t blkno)
{
	boolean_t rv;

	lck_mtx_lock(nfs_buf_mutex);
	if (nfs_buf_incore(np, blkno)) {
		rv = TRUE;
	} else {
		rv = FALSE;
	}
	lck_mtx_unlock(nfs_buf_mutex);
	return rv;
}
/*
 * return incore buffer (must be called with nfs_buf_mutex held)
 */
struct nfsbuf *
nfs_buf_incore(nfsnode_t np, daddr64_t blkno)
{
	/* Search hash chain */
	struct nfsbuf * bp = NFSBUFHASH(np, blkno)->lh_first;
	for (; bp != NULL; bp = bp->nb_hash.le_next) {
		if ((bp->nb_lblkno == blkno) && (bp->nb_np == np)) {
			if (!ISSET(bp->nb_flags, NB_INVAL)) {
				FSDBG(547, bp, blkno, bp->nb_flags, bp->nb_np);
				return bp;
			}
		}
	}
	return NULL;
}
/*
 * Check if it's OK to drop a page.
 *
 * Called by vnode_pager() on pageout request of non-dirty page.
 * We need to make sure that it's not part of a delayed write.
 * If it is, we can't let the VM drop it because we may need it
 * later when/if we need to write the data (again).
 */
int
nfs_buf_page_inval(vnode_t vp, off_t offset)
{
	struct nfsmount *nmp = VTONMP(vp);
	struct nfsbuf *bp;
	int error = 0;

	if (nfs_mount_gone(nmp)) {
		return ENXIO;
	}

	lck_mtx_lock(nfs_buf_mutex);
	bp = nfs_buf_incore(VTONFS(vp), (daddr64_t)(offset / nmp->nm_biosize));
	if (!bp) {
		goto out;
	}
	FSDBG(325, bp, bp->nb_flags, bp->nb_dirtyoff, bp->nb_dirtyend);
	if (ISSET(bp->nb_lflags, NBL_BUSY)) {
		error = EBUSY;
		goto out;
	}
	/*
	 * If there's a dirty range in the buffer, check to
	 * see if this page intersects with the dirty range.
	 * If it does, we can't let the pager drop the page.
	 */
	if (bp->nb_dirtyend > 0) {
		off_t start = offset - NBOFF(bp);
		if ((bp->nb_dirtyend > start) &&
		    (bp->nb_dirtyoff < (start + PAGE_SIZE))) {
			/*
			 * Before returning the bad news, move the
			 * buffer to the start of the delwri list and
			 * give the list a push to try to flush the
			 * buffer out.
			 */
			error = EBUSY;
			nfs_buf_remfree(bp);
			TAILQ_INSERT_HEAD(&nfsbufdelwri, bp, nb_free);
			nfsbufdelwricnt++;
			nfs_buf_delwri_push(1);
		}
	}
out:
	lck_mtx_unlock(nfs_buf_mutex);
	return error;
}
/*
 * set up the UPL for a buffer
 * (must NOT be called with nfs_buf_mutex held)
 */
int
nfs_buf_upl_setup(struct nfsbuf *bp)
{
	kern_return_t kret;
	upl_t upl;
	int upl_flags;

	if (ISSET(bp->nb_flags, NB_PAGELIST)) {
		return 0;
	}

	upl_flags = UPL_PRECIOUS;
	if (!ISSET(bp->nb_flags, NB_READ)) {
		/*
		 * We're doing a "write", so we intend to modify
		 * the pages we're gathering.
		 */
		upl_flags |= UPL_WILL_MODIFY;
	}
	kret = ubc_create_upl_kernel(NFSTOV(bp->nb_np), NBOFF(bp), bp->nb_bufsize,
	    &upl, NULL, upl_flags, VM_KERN_MEMORY_FILE);
	if (kret == KERN_INVALID_ARGUMENT) {
		/* vm object probably doesn't exist any more */
		bp->nb_pagelist = NULL;
		return EINVAL;
	}
	if (kret != KERN_SUCCESS) {
		printf("nfs_buf_upl_setup(): failed to get pagelist %d\n", kret);
		bp->nb_pagelist = NULL;
		return EIO;
	}

	FSDBG(538, bp, NBOFF(bp), bp->nb_bufsize, bp->nb_np);

	bp->nb_pagelist = upl;
	SET(bp->nb_flags, NB_PAGELIST);
	return 0;
}
/*
 * update buffer's valid/dirty info from UBC
 * (must NOT be called with nfs_buf_mutex held)
 */
void
nfs_buf_upl_check(struct nfsbuf *bp)
{
	upl_page_info_t *pl;
	off_t filesize, fileoffset;
	int i, npages;

	if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
		return;
	}

	npages = round_page_32(bp->nb_bufsize) / PAGE_SIZE;
	filesize = ubc_getsize(NFSTOV(bp->nb_np));
	fileoffset = NBOFF(bp);
	if (fileoffset < filesize) {
		SET(bp->nb_flags, NB_CACHE);
	} else {
		CLR(bp->nb_flags, NB_CACHE);
	}

	pl = ubc_upl_pageinfo(bp->nb_pagelist);
	NBPGS_ERASE(&bp->nb_valid);
	NBPGS_ERASE(&bp->nb_dirty);

	for (i = 0; i < npages; i++, fileoffset += PAGE_SIZE_64) {
		/* anything beyond the end of the file is not valid or dirty */
		if (fileoffset >= filesize) {
			break;
		}
		if (!upl_valid_page(pl, i)) {
			CLR(bp->nb_flags, NB_CACHE);
			continue;
		}
		NBPGVALID_SET(bp, i);
		if (upl_dirty_page(pl, i)) {
			NBPGDIRTY_SET(bp, i);
		}
	}
	fileoffset = NBOFF(bp);
	if (ISSET(bp->nb_flags, NB_CACHE)) {
		bp->nb_validoff = 0;
		bp->nb_validend = bp->nb_bufsize;
		if (fileoffset + bp->nb_validend > filesize) {
			bp->nb_validend = filesize - fileoffset;
		}
	} else {
		bp->nb_validoff = bp->nb_validend = -1;
	}
	FSDBG(539, bp, fileoffset, bp->nb_valid, bp->nb_dirty);
	FSDBG(539, bp->nb_validoff, bp->nb_validend, bp->nb_dirtyoff, bp->nb_dirtyend);
}
/*
 * make sure that a buffer is mapped
 * (must NOT be called with nfs_buf_mutex held)
 */
int
nfs_buf_map(struct nfsbuf *bp)
{
	kern_return_t kret;

	if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
		return EINVAL;
	}

	kret = ubc_upl_map(bp->nb_pagelist, (vm_offset_t *)&(bp->nb_data));
	if (kret != KERN_SUCCESS) {
		panic("nfs_buf_map: ubc_upl_map() failed with (%d)", kret);
	}
	if (bp->nb_data == 0) {
		panic("ubc_upl_map mapped 0");
	}
	FSDBG(540, bp, bp->nb_flags, NBOFF(bp), bp->nb_data);
	return 0;
}
/*
 * normalize an nfsbuf's valid range
 *
 * the read/write code guarantees that we'll always have a valid
 * region that is an integral number of pages.  If either end
 * of the valid range isn't page-aligned, it gets corrected
 * here as we extend the valid range through all of the
 * contiguous valid pages.
 */
void
nfs_buf_normalize_valid_range(nfsnode_t np, struct nfsbuf *bp)
{
	off_t pg, npg;
	/* pull validoff back to start of contiguous valid page range */
	pg = bp->nb_validoff / PAGE_SIZE;
	while (pg >= 0 && NBPGVALID(bp, pg)) {
		pg--;
	}
	bp->nb_validoff = (pg + 1) * PAGE_SIZE;
	/* push validend forward to end of contiguous valid page range */
	npg = bp->nb_bufsize / PAGE_SIZE;
	pg = bp->nb_validend / PAGE_SIZE;
	while (pg < npg && NBPGVALID(bp, pg)) {
		pg++;
	}
	bp->nb_validend = pg * PAGE_SIZE;
	/* clip the valid range to EOF */
	if (NBOFF(bp) + bp->nb_validend > (off_t)np->n_size) {
		bp->nb_validend = np->n_size % bp->nb_bufsize;
	}
}
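
/*
 * Example of the normalization above (4K pages): if only the pages covering
 * [4096, 12288) are marked valid and the valid range comes in as
 * [6000, 12000), validoff is pulled back to 4096 and validend is pushed out
 * to 12288, after which the range is clipped if it extends past n_size.
 */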
/*
 * process some entries on the delayed write queue
 * (must be called with nfs_buf_mutex held)
 */
void
nfs_buf_delwri_service(void)
{
	struct nfsbuf *bp;
	nfsnode_t np;
	int error, i = 0;

	while (i < 8 && (bp = TAILQ_FIRST(&nfsbufdelwri)) != NULL) {
		np = bp->nb_np;
		nfs_buf_remfree(bp);
		nfs_buf_refget(bp);
		while ((error = nfs_buf_acquire(bp, 0, 0, 0)) == EAGAIN) {
			;
		}
		nfs_buf_refrele(bp);
		if (error) {
			break;
		}
		if (!bp->nb_np) {
			/* buffer is no longer valid */
			nfs_buf_drop(bp);
			continue;
		}
		if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
			nfs_buf_check_write_verifier(np, bp);
		}
		if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
			/* put buffer at end of delwri list */
			TAILQ_INSERT_TAIL(&nfsbufdelwri, bp, nb_free);
			nfsbufdelwricnt++;
			nfs_buf_drop(bp);
			lck_mtx_unlock(nfs_buf_mutex);
			nfs_flushcommits(np, 1);
		} else {
			SET(bp->nb_flags, NB_ASYNC);
			lck_mtx_unlock(nfs_buf_mutex);
			nfs_buf_write(bp);
		}
		i++;

		lck_mtx_lock(nfs_buf_mutex);
	}
}

/*
 * thread to service the delayed write queue when asked
 */
void
nfs_buf_delwri_thread(__unused void *arg, __unused wait_result_t wr)
{
	struct timespec ts = { .tv_sec = 30, .tv_nsec = 0 };
	int error = 0;

	lck_mtx_lock(nfs_buf_mutex);
	while (!error) {
		nfs_buf_delwri_service();
		error = msleep(&nfsbufdelwrithd, nfs_buf_mutex, 0, "nfsbufdelwri", &ts);
	}
	nfsbufdelwrithd = NULL;
	lck_mtx_unlock(nfs_buf_mutex);
	thread_terminate(nfsbufdelwrithd);
}
/*
 * try to push out some delayed/uncommitted writes
 * ("locked" indicates whether nfs_buf_mutex is already held)
 */
void
nfs_buf_delwri_push(int locked)
{
	if (TAILQ_EMPTY(&nfsbufdelwri)) {
		return;
	}
	if (!locked) {
		lck_mtx_lock(nfs_buf_mutex);
	}
	/* wake up the delayed write service thread */
	if (nfsbufdelwrithd) {
		wakeup(&nfsbufdelwrithd);
	} else if (kernel_thread_start(nfs_buf_delwri_thread, NULL, &nfsbufdelwrithd) == KERN_SUCCESS) {
		thread_deallocate(nfsbufdelwrithd);
	}
	/* otherwise, try to do some of the work ourselves */
	if (!nfsbufdelwrithd) {
		nfs_buf_delwri_service();
	}
	if (!locked) {
		lck_mtx_unlock(nfs_buf_mutex);
	}
}
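
/*
 * Note on nfs_buf_delwri_push(): the cases above fall through in order --
 * wake an already-running service thread, otherwise try to start one, and
 * if neither leaves a thread behind, service the delayed-write queue
 * directly on the caller's thread before dropping the mutex.
 */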
723 * Returns errno on error, 0 otherwise.
724 * Any buffer is returned in *bpp.
726 * If NBLK_ONLYVALID is set, only return buffer if found in cache.
727 * If NBLK_NOWAIT is set, don't wait for the buffer if it's marked BUSY.
729 * Check for existence of buffer in cache.
730 * Or attempt to reuse a buffer from one of the free lists.
731 * Or allocate a new buffer if we haven't already hit max allocation.
732 * Or wait for a free buffer.
734 * If available buffer found, prepare it, and return it.
736 * If the calling process is interrupted by a signal for
737 * an interruptible mount point, return EINTR.
748 vnode_t vp
= NFSTOV(np
);
749 struct nfsmount
*nmp
= VTONMP(vp
);
752 int slpflag
= PCATCH
;
753 int operation
= (flags
& NBLK_OPMASK
);
757 FSDBG_TOP(541, np
, blkno
, size
, flags
);
761 if (bufsize
> NFS_MAXBSIZE
) {
762 panic("nfs_buf_get: buffer larger than NFS_MAXBSIZE requested");
765 if (nfs_mount_gone(nmp
)) {
766 FSDBG_BOT(541, np
, blkno
, 0, ENXIO
);
770 if (!UBCINFOEXISTS(vp
)) {
771 operation
= NBLK_META
;
772 } else if (bufsize
< (uint32_t)nmp
->nm_biosize
) {
773 /* reg files should always have biosize blocks */
774 bufsize
= nmp
->nm_biosize
;
777 /* if NBLK_WRITE, check for too many delayed/uncommitted writes */
778 if ((operation
== NBLK_WRITE
) && (nfs_nbdwrite
> NFS_A_LOT_OF_DELAYED_WRITES
)) {
779 FSDBG_TOP(542, np
, blkno
, nfs_nbdwrite
, NFS_A_LOT_OF_DELAYED_WRITES
);
781 /* poke the delwri list */
782 nfs_buf_delwri_push(0);
784 /* sleep to let other threads run... */
785 tsleep(&nfs_nbdwrite
, PCATCH
, "nfs_nbdwrite", 1);
786 FSDBG_BOT(542, np
, blkno
, nfs_nbdwrite
, NFS_A_LOT_OF_DELAYED_WRITES
);
790 lck_mtx_lock(nfs_buf_mutex
);
792 /* wait for any buffer invalidation/flushing to complete */
793 while (np
->n_bflag
& NBINVALINPROG
) {
794 np
->n_bflag
|= NBINVALWANT
;
797 msleep(&np
->n_bflag
, nfs_buf_mutex
, slpflag
, "nfs_buf_get_invalwait", &ts
);
798 if ((error
= nfs_sigintr(VTONMP(vp
), NULL
, thd
, 0))) {
799 lck_mtx_unlock(nfs_buf_mutex
);
800 FSDBG_BOT(541, np
, blkno
, 0, error
);
803 if (np
->n_bflag
& NBINVALINPROG
) {
808 /* check for existence of nfsbuf in cache */
809 if ((bp
= nfs_buf_incore(np
, blkno
))) {
810 /* if busy, set wanted and wait */
811 if (ISSET(bp
->nb_lflags
, NBL_BUSY
)) {
812 if (flags
& NBLK_NOWAIT
) {
813 lck_mtx_unlock(nfs_buf_mutex
);
814 FSDBG_BOT(541, np
, blkno
, bp
, 0xbcbcbcbc);
817 FSDBG_TOP(543, np
, blkno
, bp
, bp
->nb_flags
);
818 SET(bp
->nb_lflags
, NBL_WANTED
);
822 msleep(bp
, nfs_buf_mutex
, slpflag
| (PRIBIO
+ 1) | PDROP
,
823 "nfsbufget", (slpflag
== PCATCH
) ? NULL
: &ts
);
825 FSDBG_BOT(543, np
, blkno
, bp
, bp
->nb_flags
);
826 if ((error
= nfs_sigintr(VTONMP(vp
), NULL
, thd
, 0))) {
827 FSDBG_BOT(541, np
, blkno
, 0, error
);
832 if (bp
->nb_bufsize
!= bufsize
) {
833 panic("nfsbuf size mismatch");
835 SET(bp
->nb_lflags
, NBL_BUSY
);
836 SET(bp
->nb_flags
, NB_CACHE
);
838 /* additional paranoia: */
839 if (ISSET(bp
->nb_flags
, NB_PAGELIST
)) {
840 panic("pagelist buffer was not busy");
845 if (flags
& NBLK_ONLYVALID
) {
846 lck_mtx_unlock(nfs_buf_mutex
);
847 FSDBG_BOT(541, np
, blkno
, 0, 0x0000cace);
852 * where to get a free buffer:
853 * - if meta and maxmeta reached, must reuse meta
854 * - alloc new if we haven't reached min bufs
855 * - if free lists are NOT empty
856 * - if free list is stale, use it
857 * - else if freemeta list is stale, use it
858 * - else if max bufs allocated, use least-time-to-stale
859 * - alloc new if we haven't reached max allowed
860 * - start clearing out delwri list and try again
863 if ((operation
== NBLK_META
) && (nfsbufmetacnt
>= nfsbufmetamax
)) {
864 /* if we've hit max meta buffers, must reuse a meta buffer */
865 bp
= TAILQ_FIRST(&nfsbuffreemeta
);
866 } else if ((nfsbufcnt
> nfsbufmin
) &&
867 (!TAILQ_EMPTY(&nfsbuffree
) || !TAILQ_EMPTY(&nfsbuffreemeta
))) {
868 /* try to pull an nfsbuf off a free list */
869 struct nfsbuf
*lrubp
, *metabp
;
873 /* if the next LRU or META buffer is invalid or stale, use it */
874 lrubp
= TAILQ_FIRST(&nfsbuffree
);
875 if (lrubp
&& (!NBUFSTAMPVALID(lrubp
) ||
876 ((lrubp
->nb_timestamp
+ NFSBUF_LRU_STALE
) < now
.tv_sec
))) {
879 metabp
= TAILQ_FIRST(&nfsbuffreemeta
);
880 if (!bp
&& metabp
&& (!NBUFSTAMPVALID(metabp
) ||
881 ((metabp
->nb_timestamp
+ NFSBUF_META_STALE
) < now
.tv_sec
))) {
885 if (!bp
&& (nfsbufcnt
>= nfsbufmax
)) {
886 /* we've already allocated all bufs, so */
887 /* choose the buffer that'll go stale first */
893 time_t lru_stale_time
, meta_stale_time
;
894 lru_stale_time
= lrubp
->nb_timestamp
+ NFSBUF_LRU_STALE
;
895 meta_stale_time
= metabp
->nb_timestamp
+ NFSBUF_META_STALE
;
896 if (lru_stale_time
<= meta_stale_time
) {
906 /* we have a buffer to reuse */
907 FSDBG(544, np
, blkno
, bp
, bp
->nb_flags
);
909 if (ISSET(bp
->nb_flags
, NB_DELWRI
)) {
910 panic("nfs_buf_get: delwri");
912 SET(bp
->nb_lflags
, NBL_BUSY
);
913 /* disassociate buffer from previous nfsnode */
915 if (bp
->nb_vnbufs
.le_next
!= NFSNOLIST
) {
916 LIST_REMOVE(bp
, nb_vnbufs
);
917 bp
->nb_vnbufs
.le_next
= NFSNOLIST
;
921 LIST_REMOVE(bp
, nb_hash
);
922 /* nuke any creds we're holding */
923 if (IS_VALID_CRED(bp
->nb_rcred
)) {
924 kauth_cred_unref(&bp
->nb_rcred
);
926 if (IS_VALID_CRED(bp
->nb_wcred
)) {
927 kauth_cred_unref(&bp
->nb_wcred
);
929 /* if buf will no longer be NB_META, dump old buffer */
930 if (operation
== NBLK_META
) {
931 if (!ISSET(bp
->nb_flags
, NB_META
)) {
934 } else if (ISSET(bp
->nb_flags
, NB_META
)) {
936 kheap_free(KHEAP_DATA_BUFFERS
, bp
->nb_data
, bp
->nb_bufsize
);
941 /* re-init buf fields */
943 bp
->nb_validoff
= bp
->nb_validend
= -1;
944 bp
->nb_dirtyoff
= bp
->nb_dirtyend
= 0;
945 NBPGS_ERASE(&bp
->nb_valid
);
946 NBPGS_ERASE(&bp
->nb_dirty
);
949 /* no buffer to reuse */
950 if ((nfsbufcnt
< nfsbufmax
) &&
951 ((operation
!= NBLK_META
) || (nfsbufmetacnt
< nfsbufmetamax
))) {
952 /* just alloc a new one */
953 bp
= zalloc(nfsbuf_zone
);
957 * If any excess bufs, make sure the timer
958 * is running to free them up later.
960 if (nfsbufcnt
> nfsbufmin
&& !nfs_buf_timer_on
) {
961 nfs_buf_timer_on
= 1;
962 nfs_interval_timer_start(nfs_buf_timer_call
,
963 NFSBUF_FREE_PERIOD
* 1000);
966 if (operation
== NBLK_META
) {
971 bzero(bp
, sizeof(*bp
));
972 os_ref_init(&bp
->nb_refs
, NULL
);
974 bp
->nb_free
.tqe_next
= NFSNOLIST
;
975 bp
->nb_validoff
= bp
->nb_validend
= -1;
976 FSDBG(545, np
, blkno
, bp
, 0);
978 /* too many bufs... wait for buffers to free up */
979 FSDBG_TOP(546, np
, blkno
, nfsbufcnt
, nfsbufmax
);
981 /* poke the delwri list */
982 nfs_buf_delwri_push(1);
985 msleep(&nfsneedbuffer
, nfs_buf_mutex
, PCATCH
| PDROP
, "nfsbufget", NULL
);
986 FSDBG_BOT(546, np
, blkno
, nfsbufcnt
, nfsbufmax
);
987 if ((error
= nfs_sigintr(VTONMP(vp
), NULL
, thd
, 0))) {
988 FSDBG_BOT(541, np
, blkno
, 0, error
);
996 SET(bp
->nb_lflags
, NBL_BUSY
);
998 bp
->nb_lblkno
= blkno
;
999 /* insert buf in hash */
1000 LIST_INSERT_HEAD(NFSBUFHASH(np
, blkno
), bp
, nb_hash
);
1001 /* associate buffer with new nfsnode */
1003 LIST_INSERT_HEAD(&np
->n_cleanblkhd
, bp
, nb_vnbufs
);
1008 lck_mtx_unlock(nfs_buf_mutex
);
1010 switch (operation
) {
1012 SET(bp
->nb_flags
, NB_META
);
1013 if ((bp
->nb_bufsize
!= bufsize
) && bp
->nb_data
) {
1014 kheap_free(KHEAP_DATA_BUFFERS
, bp
->nb_data
, bp
->nb_bufsize
);
1016 bp
->nb_validoff
= bp
->nb_validend
= -1;
1017 bp
->nb_dirtyoff
= bp
->nb_dirtyend
= 0;
1018 NBPGS_ERASE(&bp
->nb_valid
);
1019 NBPGS_ERASE(&bp
->nb_dirty
);
1020 CLR(bp
->nb_flags
, NB_CACHE
);
1023 bp
->nb_data
= kheap_alloc(KHEAP_DATA_BUFFERS
,
1027 /* Ack! couldn't allocate the data buffer! */
1028 /* clean up buffer and return error */
1029 lck_mtx_lock(nfs_buf_mutex
);
1030 LIST_REMOVE(bp
, nb_vnbufs
);
1031 bp
->nb_vnbufs
.le_next
= NFSNOLIST
;
1033 /* invalidate usage timestamp to allow immediate freeing */
1034 NBUFSTAMPINVALIDATE(bp
);
1035 if (bp
->nb_free
.tqe_next
!= NFSNOLIST
) {
1036 panic("nfsbuf on freelist");
1038 TAILQ_INSERT_HEAD(&nfsbuffree
, bp
, nb_free
);
1040 lck_mtx_unlock(nfs_buf_mutex
);
1041 FSDBG_BOT(541, np
, blkno
, 0xb00, ENOMEM
);
1044 bp
->nb_bufsize
= bufsize
;
1050 * Set or clear NB_READ now to let the UPL subsystem know
1051 * if we intend to modify the pages or not.
1053 if (operation
== NBLK_READ
) {
1054 SET(bp
->nb_flags
, NB_READ
);
1056 CLR(bp
->nb_flags
, NB_READ
);
1058 if (bufsize
< PAGE_SIZE
) {
1059 bufsize
= PAGE_SIZE
;
1061 bp
->nb_bufsize
= bufsize
;
1062 bp
->nb_validoff
= bp
->nb_validend
= -1;
1064 if (UBCINFOEXISTS(vp
)) {
1066 if (nfs_buf_upl_setup(bp
)) {
1067 /* unable to create upl */
1068 /* vm object must no longer exist */
1069 /* clean up buffer and return error */
1070 lck_mtx_lock(nfs_buf_mutex
);
1071 LIST_REMOVE(bp
, nb_vnbufs
);
1072 bp
->nb_vnbufs
.le_next
= NFSNOLIST
;
1074 /* invalidate usage timestamp to allow immediate freeing */
1075 NBUFSTAMPINVALIDATE(bp
);
1076 if (bp
->nb_free
.tqe_next
!= NFSNOLIST
) {
1077 panic("nfsbuf on freelist");
1079 TAILQ_INSERT_HEAD(&nfsbuffree
, bp
, nb_free
);
1081 lck_mtx_unlock(nfs_buf_mutex
);
1082 FSDBG_BOT(541, np
, blkno
, 0x2bc, EIO
);
1085 nfs_buf_upl_check(bp
);
1090 panic("nfs_buf_get: %d unknown operation", operation
);
1095 FSDBG_BOT(541, np
, blkno
, bp
, bp
->nb_flags
);
1101 nfs_buf_release(struct nfsbuf
*bp
, int freeup
)
1103 nfsnode_t np
= bp
->nb_np
;
1106 int wakeup_needbuffer
, wakeup_buffer
, wakeup_nbdwrite
;
1108 FSDBG_TOP(548, bp
, NBOFF(bp
), bp
->nb_flags
, bp
->nb_data
);
1109 FSDBG(548, bp
->nb_validoff
, bp
->nb_validend
, bp
->nb_dirtyoff
, bp
->nb_dirtyend
);
1110 FSDBG(548, bp
->nb_valid
, 0, bp
->nb_dirty
, 0);
1112 vp
= np
? NFSTOV(np
) : NULL
;
1113 if (vp
&& UBCINFOEXISTS(vp
) && bp
->nb_bufsize
) {
1118 if (!ISSET(bp
->nb_flags
, NB_PAGELIST
) && !ISSET(bp
->nb_flags
, NB_INVAL
)) {
1119 rv
= nfs_buf_upl_setup(bp
);
1121 printf("nfs_buf_release: upl create failed %d\n", rv
);
1123 nfs_buf_upl_check(bp
);
1126 upl
= bp
->nb_pagelist
;
1128 goto pagelist_cleanup_done
;
1131 if (ubc_upl_unmap(upl
) != KERN_SUCCESS
) {
1132 panic("ubc_upl_unmap failed");
1137 * Abort the pages on error or: if this is an invalid or
1138 * non-needcommit nocache buffer AND no pages are dirty.
1140 if (ISSET(bp
->nb_flags
, NB_ERROR
) || (!nfs_buf_pgs_is_set(&bp
->nb_dirty
) && (ISSET(bp
->nb_flags
, NB_INVAL
) ||
1141 (ISSET(bp
->nb_flags
, NB_NOCACHE
) && !ISSET(bp
->nb_flags
, (NB_NEEDCOMMIT
| NB_DELWRI
)))))) {
1142 if (ISSET(bp
->nb_flags
, (NB_READ
| NB_INVAL
| NB_NOCACHE
))) {
1143 upl_flags
= UPL_ABORT_DUMP_PAGES
;
1147 ubc_upl_abort(upl
, upl_flags
);
1148 goto pagelist_cleanup_done
;
1150 for (i
= 0; i
<= (bp
->nb_bufsize
- 1) / PAGE_SIZE
; i
++) {
1151 if (!NBPGVALID(bp
, i
)) {
1152 ubc_upl_abort_range(upl
,
1153 i
* PAGE_SIZE
, PAGE_SIZE
,
1154 UPL_ABORT_DUMP_PAGES
|
1155 UPL_ABORT_FREE_ON_EMPTY
);
1157 if (NBPGDIRTY(bp
, i
)) {
1158 upl_flags
= UPL_COMMIT_SET_DIRTY
;
1160 upl_flags
= UPL_COMMIT_CLEAR_DIRTY
;
1163 if (!ISSET(bp
->nb_flags
, (NB_NEEDCOMMIT
| NB_DELWRI
))) {
1164 upl_flags
|= UPL_COMMIT_CLEAR_PRECIOUS
;
1167 ubc_upl_commit_range(upl
,
1168 i
* PAGE_SIZE
, PAGE_SIZE
,
1170 UPL_COMMIT_INACTIVATE
|
1171 UPL_COMMIT_FREE_ON_EMPTY
);
1174 pagelist_cleanup_done
:
1175 /* invalidate any pages past EOF */
1176 if (NBOFF(bp
) + bp
->nb_bufsize
> (off_t
)(np
->n_size
)) {
1178 start
= trunc_page_64(np
->n_size
) + PAGE_SIZE_64
;
1179 end
= trunc_page_64(NBOFF(bp
) + bp
->nb_bufsize
);
1180 if (start
< NBOFF(bp
)) {
1184 if ((rv
= ubc_msync(vp
, start
, end
, NULL
, UBC_INVALIDATE
))) {
1185 printf("nfs_buf_release(): ubc_msync failed!, error %d\n", rv
);
1189 CLR(bp
->nb_flags
, NB_PAGELIST
);
1190 bp
->nb_pagelist
= NULL
;
1193 lck_mtx_lock(nfs_buf_mutex
);
1195 wakeup_needbuffer
= wakeup_buffer
= wakeup_nbdwrite
= 0;
1197 /* Wake up any processes waiting for any buffer to become free. */
1198 if (nfsneedbuffer
) {
1200 wakeup_needbuffer
= 1;
1202 /* Wake up any processes waiting for _this_ buffer to become free. */
1203 if (ISSET(bp
->nb_lflags
, NBL_WANTED
)) {
1204 CLR(bp
->nb_lflags
, NBL_WANTED
);
1208 /* If it's non-needcommit nocache, or an error, mark it invalid. */
1209 if (ISSET(bp
->nb_flags
, NB_ERROR
) ||
1210 (ISSET(bp
->nb_flags
, NB_NOCACHE
) && !ISSET(bp
->nb_flags
, (NB_NEEDCOMMIT
| NB_DELWRI
)))) {
1211 SET(bp
->nb_flags
, NB_INVAL
);
1214 if ((bp
->nb_bufsize
<= 0) || ISSET(bp
->nb_flags
, NB_INVAL
)) {
1215 /* If it's invalid or empty, dissociate it from its nfsnode */
1216 if (bp
->nb_vnbufs
.le_next
!= NFSNOLIST
) {
1217 LIST_REMOVE(bp
, nb_vnbufs
);
1218 bp
->nb_vnbufs
.le_next
= NFSNOLIST
;
1221 /* if this was a delayed write, wakeup anyone */
1222 /* waiting for delayed writes to complete */
1223 if (ISSET(bp
->nb_flags
, NB_DELWRI
)) {
1224 CLR(bp
->nb_flags
, NB_DELWRI
);
1227 wakeup_nbdwrite
= 1;
1229 /* invalidate usage timestamp to allow immediate freeing */
1230 NBUFSTAMPINVALIDATE(bp
);
1231 /* put buffer at head of free list */
1232 if (bp
->nb_free
.tqe_next
!= NFSNOLIST
) {
1233 panic("nfsbuf on freelist");
1235 SET(bp
->nb_flags
, NB_INVAL
);
1236 if (ISSET(bp
->nb_flags
, NB_META
)) {
1237 TAILQ_INSERT_HEAD(&nfsbuffreemeta
, bp
, nb_free
);
1238 nfsbuffreemetacnt
++;
1240 TAILQ_INSERT_HEAD(&nfsbuffree
, bp
, nb_free
);
1243 } else if (ISSET(bp
->nb_flags
, NB_DELWRI
)) {
1244 /* put buffer at end of delwri list */
1245 if (bp
->nb_free
.tqe_next
!= NFSNOLIST
) {
1246 panic("nfsbuf on freelist");
1248 TAILQ_INSERT_TAIL(&nfsbufdelwri
, bp
, nb_free
);
1252 /* update usage timestamp */
1254 bp
->nb_timestamp
= now
.tv_sec
;
1255 /* put buffer at end of free list */
1256 if (bp
->nb_free
.tqe_next
!= NFSNOLIST
) {
1257 panic("nfsbuf on freelist");
1259 if (ISSET(bp
->nb_flags
, NB_META
)) {
1260 TAILQ_INSERT_TAIL(&nfsbuffreemeta
, bp
, nb_free
);
1261 nfsbuffreemetacnt
++;
1263 TAILQ_INSERT_TAIL(&nfsbuffree
, bp
, nb_free
);
1270 /* Unlock the buffer. */
1271 CLR(bp
->nb_flags
, (NB_ASYNC
| NB_STABLE
));
1272 CLR(bp
->nb_lflags
, NBL_BUSY
);
1274 FSDBG_BOT(548, bp
, NBOFF(bp
), bp
->nb_flags
, bp
->nb_data
);
1276 lck_mtx_unlock(nfs_buf_mutex
);
1278 if (wakeup_needbuffer
) {
1279 wakeup(&nfsneedbuffer
);
1281 if (wakeup_buffer
) {
1284 if (wakeup_nbdwrite
) {
1285 wakeup(&nfs_nbdwrite
);
1293 * Wait for operations on the buffer to complete.
1294 * When they do, extract and return the I/O's error value.
1297 nfs_buf_iowait(struct nfsbuf
*bp
)
1299 FSDBG_TOP(549, bp
, NBOFF(bp
), bp
->nb_flags
, bp
->nb_error
);
1301 lck_mtx_lock(nfs_buf_mutex
);
1303 while (!ISSET(bp
->nb_flags
, NB_DONE
)) {
1304 msleep(bp
, nfs_buf_mutex
, PRIBIO
+ 1, "nfs_buf_iowait", NULL
);
1307 lck_mtx_unlock(nfs_buf_mutex
);
1309 FSDBG_BOT(549, bp
, NBOFF(bp
), bp
->nb_flags
, bp
->nb_error
);
1311 /* check for interruption of I/O, then errors. */
1312 if (ISSET(bp
->nb_flags
, NB_EINTR
)) {
1313 CLR(bp
->nb_flags
, NB_EINTR
);
1315 } else if (ISSET(bp
->nb_flags
, NB_ERROR
)) {
1316 return bp
->nb_error
? bp
->nb_error
: EIO
;
1322 * Mark I/O complete on a buffer.
1325 nfs_buf_iodone(struct nfsbuf
*bp
)
1327 FSDBG_TOP(550, bp
, NBOFF(bp
), bp
->nb_flags
, bp
->nb_error
);
1329 if (ISSET(bp
->nb_flags
, NB_DONE
)) {
1330 panic("nfs_buf_iodone already");
1333 if (!ISSET(bp
->nb_flags
, NB_READ
)) {
1334 CLR(bp
->nb_flags
, NB_WRITEINPROG
);
1336 * vnode_writedone() takes care of waking up
1337 * any throttled write operations
1339 vnode_writedone(NFSTOV(bp
->nb_np
));
1340 nfs_node_lock_force(bp
->nb_np
);
1341 bp
->nb_np
->n_numoutput
--;
1342 nfs_node_unlock(bp
->nb_np
);
1344 if (ISSET(bp
->nb_flags
, NB_ASYNC
)) { /* if async, release it */
1345 SET(bp
->nb_flags
, NB_DONE
); /* note that it's done */
1346 nfs_buf_release(bp
, 1);
1347 } else { /* or just wakeup the buffer */
1348 lck_mtx_lock(nfs_buf_mutex
);
1349 SET(bp
->nb_flags
, NB_DONE
); /* note that it's done */
1350 CLR(bp
->nb_lflags
, NBL_WANTED
);
1351 lck_mtx_unlock(nfs_buf_mutex
);
1355 FSDBG_BOT(550, bp
, NBOFF(bp
), bp
->nb_flags
, bp
->nb_error
);
1359 nfs_buf_write_delayed(struct nfsbuf
*bp
)
1361 nfsnode_t np
= bp
->nb_np
;
1363 FSDBG_TOP(551, bp
, NBOFF(bp
), bp
->nb_flags
, 0);
1364 FSDBG(551, bp
, bp
->nb_dirtyoff
, bp
->nb_dirtyend
, bp
->nb_dirty
);
1367 * If the block hasn't been seen before:
1368 * (1) Mark it as having been seen,
1369 * (2) Make sure it's on its node's correct block list,
1371 if (!ISSET(bp
->nb_flags
, NB_DELWRI
)) {
1372 SET(bp
->nb_flags
, NB_DELWRI
);
1373 /* move to dirty list */
1374 lck_mtx_lock(nfs_buf_mutex
);
1377 if (bp
->nb_vnbufs
.le_next
!= NFSNOLIST
) {
1378 LIST_REMOVE(bp
, nb_vnbufs
);
1380 LIST_INSERT_HEAD(&np
->n_dirtyblkhd
, bp
, nb_vnbufs
);
1381 lck_mtx_unlock(nfs_buf_mutex
);
1385 * If the vnode has "too many" write operations in progress
1386 * wait for them to finish the IO
1388 vnode_waitforwrites(NFSTOV(np
), VNODE_ASYNC_THROTTLE
, 0, 0, "nfs_buf_write_delayed");
1390 /* the file is in a modified state, so make sure the flag's set */
1391 nfs_node_lock_force(np
);
1392 np
->n_flag
|= NMODIFIED
;
1393 nfs_node_unlock(np
);
1396 * If we have too many delayed write buffers,
1397 * just fall back to doing the async write.
1399 if (nfs_nbdwrite
< 0) {
1400 panic("nfs_buf_write_delayed: Negative nfs_nbdwrite");
1402 if (nfs_nbdwrite
> NFS_A_LOT_OF_DELAYED_WRITES
) {
1403 /* issue async write */
1404 SET(bp
->nb_flags
, NB_ASYNC
);
1406 FSDBG_BOT(551, bp
, NBOFF(bp
), bp
->nb_flags
, bp
->nb_error
);
1410 /* Otherwise, the "write" is done, so mark and release the buffer. */
1411 SET(bp
->nb_flags
, NB_DONE
);
1412 nfs_buf_release(bp
, 1);
1413 FSDBG_BOT(551, bp
, NBOFF(bp
), bp
->nb_flags
, 0);
1418 * Check that a "needcommit" buffer can still be committed.
1419 * If the write verifier has changed, we need to clear the
1420 * the needcommit flag.
1423 nfs_buf_check_write_verifier(nfsnode_t np
, struct nfsbuf
*bp
)
1425 struct nfsmount
*nmp
;
1427 if (!ISSET(bp
->nb_flags
, NB_NEEDCOMMIT
)) {
1432 if (nfs_mount_gone(nmp
)) {
1435 if (!ISSET(bp
->nb_flags
, NB_STALEWVERF
) && (bp
->nb_verf
== nmp
->nm_verf
)) {
1439 /* write verifier changed, clear commit/wverf flags */
1440 CLR(bp
->nb_flags
, (NB_NEEDCOMMIT
| NB_STALEWVERF
));
1442 nfs_node_lock_force(np
);
1443 np
->n_needcommitcnt
--;
1444 CHECK_NEEDCOMMITCNT(np
);
1445 nfs_node_unlock(np
);
1449 * add a reference to a buffer so it doesn't disappear while being used
1450 * (must be called with nfs_buf_mutex held)
1453 nfs_buf_refget(struct nfsbuf
*bp
)
1455 os_ref_retain_locked(&bp
->nb_refs
);
1458 * release a reference on a buffer
1459 * (must be called with nfs_buf_mutex held)
1462 nfs_buf_refrele(struct nfsbuf
*bp
)
1464 (void) os_ref_release_locked(&bp
->nb_refs
);
1468 * mark a particular buffer as BUSY
1469 * (must be called with nfs_buf_mutex held)
1472 nfs_buf_acquire(struct nfsbuf
*bp
, int flags
, int slpflag
, int slptimeo
)
1477 if (ISSET(bp
->nb_lflags
, NBL_BUSY
)) {
1479 * since the lck_mtx_lock may block, the buffer
1480 * may become BUSY, so we need to recheck for
1483 if (flags
& NBAC_NOWAIT
) {
1486 SET(bp
->nb_lflags
, NBL_WANTED
);
1488 ts
.tv_sec
= (slptimeo
/ 100);
1489 /* the hz value is 100; which leads to 10ms */
1490 ts
.tv_nsec
= (slptimeo
% 100) * 10 * NSEC_PER_USEC
* 1000;
1492 error
= msleep(bp
, nfs_buf_mutex
, slpflag
| (PRIBIO
+ 1),
1493 "nfs_buf_acquire", &ts
);
1499 if (flags
& NBAC_REMOVE
) {
1500 nfs_buf_remfree(bp
);
1502 SET(bp
->nb_lflags
, NBL_BUSY
);
1508 * simply drop the BUSY status of a buffer
1509 * (must be called with nfs_buf_mutex held)
1512 nfs_buf_drop(struct nfsbuf
*bp
)
1514 int need_wakeup
= 0;
1516 if (!ISSET(bp
->nb_lflags
, NBL_BUSY
)) {
1517 panic("nfs_buf_drop: buffer not busy!");
1519 if (ISSET(bp
->nb_lflags
, NBL_WANTED
)) {
1520 /* delay the actual wakeup until after we clear NBL_BUSY */
1523 /* Unlock the buffer. */
1524 CLR(bp
->nb_lflags
, (NBL_BUSY
| NBL_WANTED
));
1532 * prepare for iterating over an nfsnode's buffer list
1533 * this lock protects the queue manipulation
1534 * (must be called with nfs_buf_mutex held)
1537 nfs_buf_iterprepare(nfsnode_t np
, struct nfsbuflists
*iterheadp
, int flags
)
1539 struct nfsbuflists
*listheadp
;
1541 if (flags
& NBI_DIRTY
) {
1542 listheadp
= &np
->n_dirtyblkhd
;
1544 listheadp
= &np
->n_cleanblkhd
;
1547 if ((flags
& NBI_NOWAIT
) && (np
->n_bufiterflags
& NBI_ITER
)) {
1548 LIST_INIT(iterheadp
);
1552 while (np
->n_bufiterflags
& NBI_ITER
) {
1553 np
->n_bufiterflags
|= NBI_ITERWANT
;
1554 msleep(&np
->n_bufiterflags
, nfs_buf_mutex
, 0, "nfs_buf_iterprepare", NULL
);
1556 if (LIST_EMPTY(listheadp
)) {
1557 LIST_INIT(iterheadp
);
1560 np
->n_bufiterflags
|= NBI_ITER
;
1562 iterheadp
->lh_first
= listheadp
->lh_first
;
1563 listheadp
->lh_first
->nb_vnbufs
.le_prev
= &iterheadp
->lh_first
;
1564 LIST_INIT(listheadp
);
1570 * clean up after iterating over an nfsnode's buffer list
1571 * this lock protects the queue manipulation
1572 * (must be called with nfs_buf_mutex held)
1575 nfs_buf_itercomplete(nfsnode_t np
, struct nfsbuflists
*iterheadp
, int flags
)
1577 struct nfsbuflists
* listheadp
;
1580 if (flags
& NBI_DIRTY
) {
1581 listheadp
= &np
->n_dirtyblkhd
;
1583 listheadp
= &np
->n_cleanblkhd
;
1586 while (!LIST_EMPTY(iterheadp
)) {
1587 bp
= LIST_FIRST(iterheadp
);
1588 LIST_REMOVE(bp
, nb_vnbufs
);
1589 LIST_INSERT_HEAD(listheadp
, bp
, nb_vnbufs
);
1592 np
->n_bufiterflags
&= ~NBI_ITER
;
1593 if (np
->n_bufiterflags
& NBI_ITERWANT
) {
1594 np
->n_bufiterflags
&= ~NBI_ITERWANT
;
1595 wakeup(&np
->n_bufiterflags
);
1601 * Read an NFS buffer for a file.
1604 nfs_buf_read(struct nfsbuf
*bp
)
1612 cred
= bp
->nb_rcred
;
1613 if (IS_VALID_CRED(cred
)) {
1614 kauth_cred_ref(cred
);
1616 thd
= ISSET(bp
->nb_flags
, NB_ASYNC
) ? NULL
: current_thread();
1619 if (!ISSET(bp
->nb_flags
, NB_READ
)) {
1620 panic("nfs_buf_read: !NB_READ");
1622 if (ISSET(bp
->nb_flags
, NB_DONE
)) {
1623 CLR(bp
->nb_flags
, NB_DONE
);
1628 OSAddAtomic64(1, &nfsstats
.read_bios
);
1630 error
= nfs_buf_read_rpc(bp
, thd
, cred
);
1632 * For async I/O, the callbacks will finish up the
1633 * read. Otherwise, the read has already been finished.
1636 if (IS_VALID_CRED(cred
)) {
1637 kauth_cred_unref(&cred
);
1643 * finish the reading of a buffer
1646 nfs_buf_read_finish(struct nfsbuf
*bp
)
1648 nfsnode_t np
= bp
->nb_np
;
1649 struct nfsmount
*nmp
;
1651 if (!ISSET(bp
->nb_flags
, NB_ERROR
)) {
1652 /* update valid range */
1653 bp
->nb_validoff
= 0;
1654 bp
->nb_validend
= bp
->nb_endio
;
1655 if (bp
->nb_endio
< bp
->nb_bufsize
) {
1657 * The read may be short because we have unflushed writes
1658 * that are extending the file size and the reads hit the
1659 * (old) EOF on the server. So, just make sure nb_validend
1660 * correctly tracks EOF.
1661 * Note that the missing data should have already been zeroed
1662 * in nfs_buf_read_rpc_finish().
1664 off_t boff
= NBOFF(bp
);
1665 if ((off_t
)np
->n_size
>= (boff
+ bp
->nb_bufsize
)) {
1666 bp
->nb_validend
= bp
->nb_bufsize
;
1667 } else if ((off_t
)np
->n_size
>= boff
) {
1668 bp
->nb_validend
= np
->n_size
- boff
;
1670 bp
->nb_validend
= 0;
1673 if ((nmp
= NFSTONMP(np
)) && (nmp
->nm_vers
== NFS_VER2
) &&
1674 ((NBOFF(bp
) + bp
->nb_validend
) > 0x100000000LL
)) {
1675 bp
->nb_validend
= 0x100000000LL
- NBOFF(bp
);
1677 nfs_buf_pgs_get_page_mask(&bp
->nb_valid
, round_page_64(bp
->nb_validend
) / PAGE_SIZE
);
1678 if (bp
->nb_validend
& PAGE_MASK
) {
1679 /* zero-fill remainder of last page */
1680 bzero(bp
->nb_data
+ bp
->nb_validend
, PAGE_SIZE
- (bp
->nb_validend
& PAGE_MASK
));
1687 * initiate the NFS READ RPC(s) for a buffer
1690 nfs_buf_read_rpc(struct nfsbuf
*bp
, thread_t thd
, kauth_cred_t cred
)
1692 struct nfsmount
*nmp
;
1693 nfsnode_t np
= bp
->nb_np
;
1694 int error
= 0, nfsvers
, async
;
1696 uint64_t length
, nrpcs
;
1701 struct nfsreq_cbinfo cb
;
1704 if (nfs_mount_gone(nmp
)) {
1705 bp
->nb_error
= error
= ENXIO
;
1706 SET(bp
->nb_flags
, NB_ERROR
);
1710 nfsvers
= nmp
->nm_vers
;
1711 nmrsize
= nmp
->nm_rsize
;
1715 length
= bp
->nb_bufsize
;
1717 if (nfsvers
== NFS_VER2
) {
1718 if (boff
> 0xffffffffLL
) {
1719 bp
->nb_error
= error
= EFBIG
;
1720 SET(bp
->nb_flags
, NB_ERROR
);
1724 if ((boff
+ length
- 1) > 0xffffffffLL
) {
1725 length
= 0x100000000LL
- boff
;
1729 /* Note: Can only do async I/O if nfsiods are configured. */
1730 async
= (bp
->nb_flags
& NB_ASYNC
);
1731 cb
.rcb_func
= async
? nfs_buf_read_rpc_finish
: NULL
;
1734 bp
->nb_offio
= bp
->nb_endio
= 0;
1735 bp
->nb_rpcs
= nrpcs
= (length
+ nmrsize
- 1) / nmrsize
;
1736 if (async
&& (nrpcs
> 1)) {
1737 SET(bp
->nb_flags
, NB_MULTASYNCRPC
);
1739 CLR(bp
->nb_flags
, NB_MULTASYNCRPC
);
1742 while (length
> 0) {
1743 if (ISSET(bp
->nb_flags
, NB_ERROR
)) {
1744 error
= bp
->nb_error
;
1747 len
= (length
> nmrsize
) ? nmrsize
: (uint32_t)length
;
1748 cb
.rcb_args
.offset
= offset
;
1749 cb
.rcb_args
.length
= len
;
1751 if (nmp
->nm_vers
>= NFS_VER4
) {
1752 cb
.rcb_args
.stategenid
= nmp
->nm_stategenid
;
1756 error
= nmp
->nm_funcs
->nf_read_rpc_async(np
, boff
+ offset
, len
, thd
, cred
, &cb
, &req
);
1765 nfs_buf_read_rpc_finish(req
);
1766 if (ISSET(bp
->nb_flags
, NB_ERROR
)) {
1767 error
= bp
->nb_error
;
1774 * Something bad happened while trying to send the RPC(s).
1775 * Wait for any outstanding requests to complete.
1777 bp
->nb_error
= error
;
1778 SET(bp
->nb_flags
, NB_ERROR
);
1779 if (ISSET(bp
->nb_flags
, NB_MULTASYNCRPC
)) {
1780 nrpcs
= (length
+ nmrsize
- 1) / nmrsize
;
1781 lck_mtx_lock(nfs_buf_mutex
);
1782 bp
->nb_rpcs
-= nrpcs
;
1783 if (bp
->nb_rpcs
== 0) {
1784 /* No RPCs left, so the buffer's done */
1785 lck_mtx_unlock(nfs_buf_mutex
);
1788 /* wait for the last RPC to mark it done */
1789 while (bp
->nb_rpcs
> 0) {
1790 msleep(&bp
->nb_rpcs
, nfs_buf_mutex
, 0,
1791 "nfs_buf_read_rpc_cancel", NULL
);
1793 lck_mtx_unlock(nfs_buf_mutex
);
1804 * finish up an NFS READ RPC on a buffer
1807 nfs_buf_read_rpc_finish(struct nfsreq
*req
)
1809 struct nfsmount
*nmp
;
1810 size_t rlen
, length
;
1811 struct nfsreq_cbinfo cb
;
1813 int error
= 0, nfsvers
, eof
= 0, multasyncrpc
, finished
;
1815 void *wakeme
= NULL
;
1816 struct nfsreq
*rreq
= NULL
;
1821 char uio_buf
[UIO_SIZEOF(1)];
1825 thd
= req
->r_thread
;
1827 if (IS_VALID_CRED(cred
)) {
1828 kauth_cred_ref(cred
);
1830 cb
= req
->r_callback
;
1832 if (cb
.rcb_func
) { /* take an extra reference on the nfsreq in case we want to resend it later due to grace error */
1833 nfs_request_ref(req
, 0);
1837 if (nfs_mount_gone(nmp
)) {
1838 SET(bp
->nb_flags
, NB_ERROR
);
1839 bp
->nb_error
= error
= ENXIO
;
1841 if (error
|| ISSET(bp
->nb_flags
, NB_ERROR
)) {
1843 nfs_request_async_cancel(req
);
1847 nfsvers
= nmp
->nm_vers
;
1848 offset
= cb
.rcb_args
.offset
;
1849 rlen
= length
= cb
.rcb_args
.length
;
1851 auio
= uio_createwithbuffer(1, NBOFF(bp
) + offset
, UIO_SYSSPACE
,
1852 UIO_READ
, &uio_buf
, sizeof(uio_buf
));
1853 uio_addiov(auio
, CAST_USER_ADDR_T(bp
->nb_data
+ offset
), length
);
1855 /* finish the RPC */
1856 error
= nmp
->nm_funcs
->nf_read_rpc_async_finish(np
, req
, auio
, &rlen
, &eof
);
1857 if ((error
== EINPROGRESS
) && cb
.rcb_func
) {
1858 /* async request restarted */
1860 nfs_request_rele(req
);
1862 if (IS_VALID_CRED(cred
)) {
1863 kauth_cred_unref(&cred
);
1868 if ((nmp
->nm_vers
>= NFS_VER4
) && nfs_mount_state_error_should_restart(error
) && !ISSET(bp
->nb_flags
, NB_ERROR
)) {
1869 lck_mtx_lock(&nmp
->nm_lock
);
1870 if ((error
!= NFSERR_OLD_STATEID
) && (error
!= NFSERR_GRACE
) && (cb
.rcb_args
.stategenid
== nmp
->nm_stategenid
)) {
1871 NP(np
, "nfs_buf_read_rpc_finish: error %d @ 0x%llx, 0x%x 0x%x, initiating recovery",
1872 error
, NBOFF(bp
) + offset
, cb
.rcb_args
.stategenid
, nmp
->nm_stategenid
);
1873 nfs_need_recover(nmp
, error
);
1875 lck_mtx_unlock(&nmp
->nm_lock
);
1876 if (np
->n_flag
& NREVOKE
) {
1879 if (error
== NFSERR_GRACE
) {
1882 * For an async I/O request, handle a grace delay just like
1883 * jukebox errors. Set the resend time and queue it up.
1886 if (req
->r_nmrep
.nmc_mhead
) {
1887 mbuf_freem(req
->r_nmrep
.nmc_mhead
);
1888 req
->r_nmrep
.nmc_mhead
= NULL
;
1892 lck_mtx_lock(&req
->r_mtx
);
1893 req
->r_resendtime
= now
.tv_sec
+ 2;
1894 req
->r_xid
= 0; // get a new XID
1895 req
->r_flags
|= R_RESTART
;
1897 nfs_asyncio_resend(req
);
1898 lck_mtx_unlock(&req
->r_mtx
);
1899 if (IS_VALID_CRED(cred
)) {
1900 kauth_cred_unref(&cred
);
1902 /* Note: nfsreq reference taken will be dropped later when finished */
1905 /* otherwise, just pause a couple seconds and retry */
1906 tsleep(&nmp
->nm_state
, (PZERO
- 1), "nfsgrace", 2 * hz
);
1908 if (!(error
= nfs_mount_state_wait_for_recovery(nmp
))) {
1916 SET(bp
->nb_flags
, NB_ERROR
);
1917 bp
->nb_error
= error
;
1921 if ((rlen
> 0) && (bp
->nb_endio
< (offset
+ (int)rlen
))) {
1922 bp
->nb_endio
= offset
+ rlen
;
1925 if ((nfsvers
== NFS_VER2
) || eof
|| (rlen
== 0)) {
1926 /* zero out the remaining data (up to EOF) */
1927 off_t rpcrem
, eofrem
, rem
;
1928 rpcrem
= (length
- rlen
);
1929 eofrem
= np
->n_size
- (NBOFF(bp
) + offset
+ rlen
);
1930 rem
= (rpcrem
< eofrem
) ? rpcrem
: eofrem
;
1932 NFS_BZERO(bp
->nb_data
+ offset
+ rlen
, rem
);
1934 } else if ((rlen
< length
) && !ISSET(bp
->nb_flags
, NB_ERROR
)) {
1938 * We haven't hit EOF and we didn't get all the data
1939 * requested, so we need to issue another read for the rest.
1940 * (Don't bother if the buffer already hit an error.)
1947 cb
.rcb_args
.offset
= offset
;
1948 cb
.rcb_args
.length
= length
;
1950 if (nmp
->nm_vers
>= NFS_VER4
) {
1951 cb
.rcb_args
.stategenid
= nmp
->nm_stategenid
;
1954 error
= nmp
->nm_funcs
->nf_read_rpc_async(np
, NBOFF(bp
) + offset
, length
, thd
, cred
, &cb
, &rreq
);
1956 if (IS_VALID_CRED(cred
)) {
1957 kauth_cred_unref(&cred
);
1960 /* if !async we'll need to wait for this RPC to finish */
1965 nfs_request_rele(req
);
1968 * Outstanding RPC count is unchanged.
1969 * Callback will be called when RPC is done.
1973 SET(bp
->nb_flags
, NB_ERROR
);
1974 bp
->nb_error
= error
;
1979 nfs_request_rele(req
);
1981 if (IS_VALID_CRED(cred
)) {
1982 kauth_cred_unref(&cred
);
1986 * Decrement outstanding RPC count on buffer
1987 * and call nfs_buf_read_finish on last RPC.
1989 * (Note: when there are multiple async RPCs issued for a
1990 * buffer we need nfs_buffer_mutex to avoid problems when
1991 * aborting a partially-initiated set of RPCs)
1994 multasyncrpc
= ISSET(bp
->nb_flags
, NB_MULTASYNCRPC
);
1996 lck_mtx_lock(nfs_buf_mutex
);
2000 finished
= (bp
->nb_rpcs
== 0);
2003 lck_mtx_unlock(nfs_buf_mutex
);
2008 wakeme
= &bp
->nb_rpcs
;
2010 nfs_buf_read_finish(bp
);
2018 * Do buffer readahead.
2019 * Initiate async I/O to read buffers not in cache.
2022 nfs_buf_readahead(nfsnode_t np
, int ioflag
, daddr64_t
*rabnp
, daddr64_t lastrabn
, thread_t thd
, kauth_cred_t cred
)
2024 struct nfsmount
*nmp
= NFSTONMP(np
);
2029 if (nfs_mount_gone(nmp
)) {
2032 if (nmp
->nm_readahead
<= 0) {
2035 if (*rabnp
> lastrabn
) {
2039 for (nra
= 0; (nra
< nmp
->nm_readahead
) && (*rabnp
<= lastrabn
); nra
++, *rabnp
= *rabnp
+ 1) {
2040 /* check if block exists and is valid. */
2041 if ((*rabnp
* nmp
->nm_biosize
) >= (off_t
)np
->n_size
) {
2042 /* stop reading ahead if we're beyond EOF */
2046 error
= nfs_buf_get(np
, *rabnp
, nmp
->nm_biosize
, thd
, NBLK_READ
| NBLK_NOWAIT
, &bp
);
2050 nfs_node_lock_force(np
);
2051 np
->n_lastrahead
= *rabnp
;
2052 nfs_node_unlock(np
);
2056 if ((ioflag
& IO_NOCACHE
) && ISSET(bp
->nb_flags
, NB_CACHE
) &&
2057 !nfs_buf_pgs_is_set(&bp
->nb_dirty
) && !ISSET(bp
->nb_flags
, (NB_DELWRI
| NB_NCRDAHEAD
))) {
2058 CLR(bp
->nb_flags
, NB_CACHE
);
2059 NBPGS_ERASE(&bp
->nb_valid
);
2060 bp
->nb_validoff
= bp
->nb_validend
= -1;
2062 if ((bp
->nb_dirtyend
<= 0) && !nfs_buf_pgs_is_set(&bp
->nb_dirty
) &&
2063 !ISSET(bp
->nb_flags
, (NB_CACHE
| NB_DELWRI
))) {
2064 SET(bp
->nb_flags
, (NB_READ
| NB_ASYNC
));
2065 if (ioflag
& IO_NOCACHE
) {
2066 SET(bp
->nb_flags
, NB_NCRDAHEAD
);
2068 if (!IS_VALID_CRED(bp
->nb_rcred
) && IS_VALID_CRED(cred
)) {
2069 kauth_cred_ref(cred
);
2070 bp
->nb_rcred
= cred
;
2072 if ((error
= nfs_buf_read(bp
))) {
2077 nfs_buf_release(bp
, 1);
2083 * NFS buffer I/O for reading files.
2086 nfs_bioread(nfsnode_t np
, uio_t uio
, int ioflag
, vfs_context_t ctx
)
2088 vnode_t vp
= NFSTOV(np
);
2089 struct nfsbuf
*bp
= NULL
;
2090 struct nfsmount
*nmp
= VTONMP(vp
);
2091 daddr64_t lbn
, rabn
= 0, lastrabn
, maxrabn
= -1;
2092 off_t diff
, on
= 0, n
= 0;
2094 int nfsvers
, biosize
, modified
, readaheads
= 0;
2099 FSDBG_TOP(514, np
, uio_offset(uio
), uio_resid(uio
), ioflag
);
2101 nfsvers
= nmp
->nm_vers
;
2102 biosize
= nmp
->nm_biosize
;
2103 thd
= vfs_context_thread(ctx
);
2104 cred
= vfs_context_ucred(ctx
);
2106 if (vnode_vtype(vp
) != VREG
) {
2107 printf("nfs_bioread: type %x unexpected\n", vnode_vtype(vp
));
2108 FSDBG_BOT(514, np
, 0xd1e0016, 0, EINVAL
);
2113 * For NFS, cache consistency can only be maintained approximately.
2114 * Although RFC1094 does not specify the criteria, the following is
2115 * believed to be compatible with the reference port.
2117 * If the file has changed since the last read RPC or you have
2118 * written to the file, you may have lost data cache consistency
2119 * with the server. So, check for a change, and flush all of the
2120 * file's data out of the cache.
2121 * NB: This implies that cache data can be read when up to
2122 * NFS_MAXATTRTIMO seconds out of date. If you find that you
2123 * need current attributes, nfs_getattr() can be forced to fetch
2124 * new attributes (via NATTRINVALIDATE() or NGA_UNCACHED).
2127 if (ISSET(np
->n_flag
, NUPDATESIZE
)) {
2128 nfs_data_update_size(np
, 0);
2131 if ((error
= nfs_node_lock(np
))) {
2132 FSDBG_BOT(514, np
, 0xd1e0222, 0, error
);
2136 if (np
->n_flag
& NNEEDINVALIDATE
) {
2137 np
->n_flag
&= ~NNEEDINVALIDATE
;
2138 nfs_node_unlock(np
);
2139 error
= nfs_vinvalbuf(vp
, V_SAVE
| V_IGNORE_WRITEERR
, ctx
, 1);
2141 error
= nfs_node_lock(np
);
2144 FSDBG_BOT(514, np
, 0xd1e0322, 0, error
);
2149 modified
= (np
->n_flag
& NMODIFIED
);
2150 nfs_node_unlock(np
);
2151 /* nfs_getattr() will check changed and purge caches */
2152 error
= nfs_getattr(np
, NULL
, ctx
, modified
? NGA_UNCACHED
: NGA_CACHED
);
2154 FSDBG_BOT(514, np
, 0xd1e0004, 0, error
);
2158 if (uio_resid(uio
) == 0) {
2159 FSDBG_BOT(514, np
, 0xd1e0001, 0, 0);
2162 if (uio_offset(uio
) < 0) {
2163 FSDBG_BOT(514, np
, 0xd1e0002, 0, EINVAL
);
2168 * set up readahead - which may be limited by:
2169 * + current request length (for IO_NOCACHE)
2170 * + readahead setting
2173 if (nmp
->nm_readahead
> 0) {
2174 off_t end
= uio_offset(uio
) + uio_resid(uio
);
2175 if (end
> (off_t
)np
->n_size
) {
2178 rabn
= uio_offset(uio
) / biosize
;
2179 maxrabn
= (end
- 1) / biosize
;
2180 nfs_node_lock_force(np
);
2181 if (!(ioflag
& IO_NOCACHE
) &&
2182 (!rabn
|| (rabn
== np
->n_lastread
) || (rabn
== (np
->n_lastread
+ 1)))) {
2183 maxrabn
+= nmp
->nm_readahead
;
2184 if ((maxrabn
* biosize
) >= (off_t
)np
->n_size
) {
2185 maxrabn
= ((off_t
)np
->n_size
- 1) / biosize
;
2188 if (maxrabn
< np
->n_lastrahead
) {
2189 np
->n_lastrahead
= -1;
2191 if (rabn
< np
->n_lastrahead
) {
2192 rabn
= np
->n_lastrahead
+ 1;
2194 nfs_node_unlock(np
);
2200 nfs_data_lock(np
, NFS_DATA_LOCK_SHARED
);
2201 lbn
= uio_offset(uio
) / biosize
;
2204 * Copy directly from any cached pages without grabbing the bufs.
2205 * (If we are NOCACHE and we've issued readahead requests, we need
2206 * to grab the NB_NCRDAHEAD bufs to drop them.)
2208 if ((!(ioflag
& IO_NOCACHE
) || !readaheads
) &&
2209 ((uio
->uio_segflg
== UIO_USERSPACE32
||
2210 uio
->uio_segflg
== UIO_USERSPACE64
||
2211 uio
->uio_segflg
== UIO_USERSPACE
))) {
2212 io_resid
= uio_resid(uio
);
2213 diff
= np
->n_size
- uio_offset(uio
);
2214 if (diff
< io_resid
) {
2218 int count
= (io_resid
> INT_MAX
) ? INT_MAX
: (int)io_resid
;
2219 error
= cluster_copy_ubc_data(vp
, uio
, &count
, 0);
2221 nfs_data_unlock(np
);
2222 FSDBG_BOT(514, np
, uio_offset(uio
), 0xcacefeed, error
);
2226 /* count any biocache reads that we just copied directly */
2227 if (lbn
!= (uio_offset(uio
) / biosize
)) {
2228 OSAddAtomic64(NFS_ROUND_BLOCK(uio_offset(uio
), biosize
) - lbn
, &nfsstats
.biocache_reads
);
2229 FSDBG(514, np
, 0xcacefeed, uio_offset(uio
), error
);
2233 lbn
= uio_offset(uio
) / biosize
;
2234 on
= uio_offset(uio
) % biosize
;
2235 nfs_node_lock_force(np
);
2236 np
->n_lastread
= (uio_offset(uio
) - 1) / biosize
;
2237 nfs_node_unlock(np
);
2239 if ((uio_resid(uio
) <= 0) || (uio_offset(uio
) >= (off_t
)np
->n_size
)) {
2240 nfs_data_unlock(np
);
2241 FSDBG_BOT(514, np
, uio_offset(uio
), uio_resid(uio
), 0xaaaaaaaa);
2245 /* adjust readahead block number, if necessary */
2249 lastrabn
= MIN(maxrabn
, lbn
+ nmp
->nm_readahead
);
2250 if (rabn
<= lastrabn
) { /* start readaheads */
2251 error
= nfs_buf_readahead(np
, ioflag
, &rabn
, lastrabn
, thd
, cred
);
2253 nfs_data_unlock(np
);
2254 FSDBG_BOT(514, np
, 0xd1e000b, 1, error
);
2258 OSAddAtomic64(rabn
- lbn
, &nfsstats
.biocache_reads
);
2260 OSAddAtomic64(1, &nfsstats
.biocache_reads
);
2264 * If the block is in the cache and has the required data
2265 * in a valid region, just copy it out.
2266 * Otherwise, get the block and write back/read in,
2270 io_resid
= uio_resid(uio
);
2271 n
= (io_resid
> (biosize
- on
)) ? (biosize
- on
) : io_resid
;
2272 diff
= np
->n_size
- uio_offset(uio
);
2277 error
= nfs_buf_get(np
, lbn
, biosize
, thd
, NBLK_READ
, &bp
);
2279 nfs_data_unlock(np
);
2280 FSDBG_BOT(514, np
, 0xd1e000c, 0, error
);
2284 if ((ioflag
& IO_NOCACHE
) && ISSET(bp
->nb_flags
, NB_CACHE
)) {
2286 * IO_NOCACHE found a cached buffer.
2287 * Flush the buffer if it's dirty.
2288 * Invalidate the data if it wasn't just read
2289 * in as part of a "nocache readahead".
2291 if (nfs_buf_pgs_is_set(&bp
->nb_dirty
) || (bp
->nb_dirtyend
> 0)) {
2292 /* so write the buffer out and try again */
2293 SET(bp
->nb_flags
, NB_NOCACHE
);
2296 if (ISSET(bp
->nb_flags
, NB_NCRDAHEAD
)) {
2297 CLR(bp
->nb_flags
, NB_NCRDAHEAD
);
2298 SET(bp
->nb_flags
, NB_NOCACHE
);
        /* if any pages are valid... */
        if (nfs_buf_pgs_is_set(&bp->nb_valid)) {
            /* ...check for any invalid pages in the read range */
            off_t pg, firstpg, lastpg, dirtypg;
            dirtypg = firstpg = lastpg = -1;
            pg = on / PAGE_SIZE;
            while (pg <= (on + n - 1) / PAGE_SIZE) {
                if (!NBPGVALID(bp, pg)) {
                } else if (firstpg >= 0 && dirtypg < 0 && NBPGDIRTY(bp, pg)) {

            /* if there are no invalid pages, we're all set */
            if (bp->nb_validoff < 0) {
                /* valid range isn't set up, so */
                /* set it to what we know is valid */
                bp->nb_validoff = trunc_page_64(on);
                bp->nb_validend = round_page_64(on + n);
                nfs_buf_normalize_valid_range(np, bp);

            /* there are invalid pages in the read range */
            if (((dirtypg > firstpg) && (dirtypg < lastpg)) ||
                (((firstpg * PAGE_SIZE) < bp->nb_dirtyend) && (((lastpg + 1) * PAGE_SIZE) > bp->nb_dirtyoff))) {
                /* there are also dirty page(s) (or range) in the read range, */
                /* so write the buffer out and try again */
                CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL));
                SET(bp->nb_flags, NB_ASYNC);
                if (!IS_VALID_CRED(bp->nb_wcred)) {
                    kauth_cred_ref(cred);
                    bp->nb_wcred = cred;
                error = nfs_buf_write(bp);
                    nfs_data_unlock(np);
                    FSDBG_BOT(514, np, 0xd1e000d, 0, error);

            if (!nfs_buf_pgs_is_set(&bp->nb_dirty) && bp->nb_dirtyend <= 0 &&
                (lastpg - firstpg + 1) > (biosize / PAGE_SIZE) / 2) {
                /* we need to read in more than half the buffer and the */
                /* buffer's not dirty, so just fetch the whole buffer */
                NBPGS_ERASE(&bp->nb_valid);
                /* read the page range in */
                char uio_buf[UIO_SIZEOF(1)];
                auio = uio_createwithbuffer(1, (NBOFF(bp) + firstpg * PAGE_SIZE_64),
                    UIO_SYSSPACE, UIO_READ, &uio_buf[0], sizeof(uio_buf));
                NFS_UIO_ADDIOV(auio, CAST_USER_ADDR_T(bp->nb_data + (firstpg * PAGE_SIZE)),
                    ((lastpg - firstpg + 1) * PAGE_SIZE));
                error = nfs_read_rpc(np, auio, ctx);
                    if (ioflag & IO_NOCACHE) {
                        SET(bp->nb_flags, NB_NOCACHE);
                    nfs_buf_release(bp, 1);
                    nfs_data_unlock(np);
                    FSDBG_BOT(514, np, 0xd1e000e, 0, error);
                /* Make sure that the valid range is set to cover this read. */
                bp->nb_validoff = trunc_page_64(on);
                bp->nb_validend = round_page_64(on + n);
                nfs_buf_normalize_valid_range(np, bp);
                if (uio_resid(auio) > 0) {
                    /* if short read, must have hit EOF, */
                    /* so zero the rest of the range */
                    bzero(CAST_DOWN(caddr_t, uio_curriovbase(auio)), uio_resid(auio));
                /* mark the pages (successfully read) as valid */
                for (pg = firstpg; pg <= lastpg; pg++) {
                    NBPGVALID_SET(bp, pg);

        /* if no pages are valid, read the whole block */
        if (!nfs_buf_pgs_is_set(&bp->nb_valid)) {
            if (!IS_VALID_CRED(bp->nb_rcred) && IS_VALID_CRED(cred)) {
                kauth_cred_ref(cred);
                bp->nb_rcred = cred;
            SET(bp->nb_flags, NB_READ);
            CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL));
            error = nfs_buf_read(bp);
                if (ioflag & IO_NOCACHE) {
                    SET(bp->nb_flags, NB_NOCACHE);
                nfs_data_unlock(np);
                nfs_buf_release(bp, 1);
                FSDBG_BOT(514, np, 0xd1e000f, 0, error);

        /* validate read range against valid range and clip */
        if (bp->nb_validend > 0) {
            diff = (on >= bp->nb_validend) ? 0 : (bp->nb_validend - on);

            n32 = n > INT_MAX ? INT_MAX : (int)n;
            error = uiomove(bp->nb_data + on, n32, uio);
            if (!error && n > n32) {
                error = uiomove(bp->nb_data + on + n32, (int)(n - n32), uio);

        nfs_buf_release(bp, 1);
        nfs_data_unlock(np);
        nfs_node_lock_force(np);
        np->n_lastread = (uio_offset(uio) - 1) / biosize;
        nfs_node_unlock(np);
    } while (error == 0 && uio_resid(uio) > 0 && n > 0);
    FSDBG_BOT(514, np, uio_offset(uio), uio_resid(uio), error);
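/*
 * Note: the loop above keeps np->n_lastread at the block containing the last byte
 * copied out, which is what the readahead logic at the top of the loop appears to
 * use to detect sequential reads and decide how far ahead (up to nm_readahead
 * blocks) to issue readaheads.
 */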
/*
 * limit the number of outstanding async I/O writes
 */
nfs_async_write_start(struct nfsmount *nmp)

    int error = 0, slpflag = NMFLAG(nmp, INTR) ? PCATCH : 0;
    struct timespec ts = { .tv_sec = 1, .tv_nsec = 0 };

    if (nfs_max_async_writes <= 0) {
    lck_mtx_lock(&nmp->nm_lock);
    while ((nfs_max_async_writes > 0) && (nmp->nm_asyncwrites >= nfs_max_async_writes)) {
        if ((error = nfs_sigintr(nmp, NULL, current_thread(), 1))) {
        msleep(&nmp->nm_asyncwrites, &nmp->nm_lock, slpflag | (PZERO - 1), "nfsasyncwrites", &ts);
    nmp->nm_asyncwrites++;
    lck_mtx_unlock(&nmp->nm_lock);

nfs_async_write_done(struct nfsmount *nmp)

    if (nmp->nm_asyncwrites <= 0) {
    lck_mtx_lock(&nmp->nm_lock);
    if (nmp->nm_asyncwrites-- >= nfs_max_async_writes) {
        wakeup(&nmp->nm_asyncwrites);
    lck_mtx_unlock(&nmp->nm_lock);
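/*
 * Note: the two routines above implement a simple throttle: writers bump
 * nm_asyncwrites under nm_lock, sleeping while the count is at the
 * nfs_max_async_writes limit, and completions decrement the count and wake
 * the sleepers.
 */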
/*
 * write (or commit) the given NFS buffer
 * Commit the buffer if we can.
 * Write out any dirty range.
 * If any dirty pages remain, write them out.
 * For async requests, all the work beyond sending the initial
 * write RPC is handled in the RPC callback(s).
 */
nfs_buf_write(struct nfsbuf *bp)

    int error = 0, oldflags, async;
    proc_t p = current_proc();
    off_t doff, dend, firstpg, lastpg;

    FSDBG_TOP(553, bp, NBOFF(bp), bp->nb_flags, 0);

    if (!ISSET(bp->nb_lflags, NBL_BUSY)) {
        panic("nfs_buf_write: buffer is not busy???");

    async = ISSET(bp->nb_flags, NB_ASYNC);
    oldflags = bp->nb_flags;

    CLR(bp->nb_flags, (NB_READ | NB_DONE | NB_ERROR | NB_DELWRI));
    if (ISSET(oldflags, NB_DELWRI)) {
        lck_mtx_lock(nfs_buf_mutex);
        lck_mtx_unlock(nfs_buf_mutex);
        wakeup(&nfs_nbdwrite);

    /* move to clean list */
    if (ISSET(oldflags, (NB_ASYNC | NB_DELWRI))) {
        lck_mtx_lock(nfs_buf_mutex);
        if (bp->nb_vnbufs.le_next != NFSNOLIST) {
            LIST_REMOVE(bp, nb_vnbufs);
        LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
        lck_mtx_unlock(nfs_buf_mutex);
    nfs_node_lock_force(np);
    nfs_node_unlock(np);
    vnode_startwrite(NFSTOV(np));

    if (p && p->p_stats) {
        OSIncrementAtomicLong(&p->p_stats->p_ru.ru_oublock);

    cred = bp->nb_wcred;
    if (!IS_VALID_CRED(cred) && ISSET(bp->nb_flags, NB_READ)) {
        cred = bp->nb_rcred; /* shouldn't really happen, but... */
    if (IS_VALID_CRED(cred)) {
        kauth_cred_ref(cred);

    thd = async ? NULL : current_thread();

    /* We need to make sure the pages are locked before doing I/O. */
    if (!ISSET(bp->nb_flags, NB_META)) {
        if (UBCINFOEXISTS(NFSTOV(np))) {
            if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
                error = nfs_buf_upl_setup(bp);
                    printf("nfs_buf_write: upl create failed %d\n", error);
                    SET(bp->nb_flags, NB_ERROR);
                    bp->nb_error = error = EIO;
                nfs_buf_upl_check(bp);
            /* We should never be in nfs_buf_write() with no UBCINFO. */
            printf("nfs_buf_write: ubcinfo already gone\n");
            SET(bp->nb_flags, NB_ERROR);
            bp->nb_error = error = EIO;

    /* If NB_NEEDCOMMIT is set, a commit RPC may do the trick. */
    if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
        nfs_buf_check_write_verifier(np, bp);
    if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
        struct nfsmount *nmp = NFSTONMP(np);
        if (nfs_mount_gone(nmp)) {
            SET(bp->nb_flags, NB_ERROR);
            bp->nb_error = error = EIO;
        SET(bp->nb_flags, NB_WRITEINPROG);
        error = nmp->nm_funcs->nf_commit_rpc(np, NBOFF(bp) + bp->nb_dirtyoff,
            bp->nb_dirtyend - bp->nb_dirtyoff, bp->nb_wcred, bp->nb_verf);
        CLR(bp->nb_flags, NB_WRITEINPROG);
        if (error != NFSERR_STALEWRITEVERF) {
            SET(bp->nb_flags, NB_ERROR);
            bp->nb_error = error;
        bp->nb_dirtyoff = bp->nb_dirtyend = 0;
        CLR(bp->nb_flags, NB_NEEDCOMMIT);
        nfs_node_lock_force(np);
        np->n_needcommitcnt--;
        CHECK_NEEDCOMMITCNT(np);
        nfs_node_unlock(np);
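    /*
     * Note: a buffer that only needs a commit (its data was already sent with an
     * UNSTABLE write) can be retired here with a single COMMIT RPC.  A stale write
     * verifier (NFSERR_STALEWRITEVERF) is not treated as a failure; it simply means
     * the data has to be written again below.
     */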
    if (!error && (bp->nb_dirtyend > 0)) {
        /* sanity check the dirty range */
        if (NBOFF(bp) + bp->nb_dirtyend > (off_t)np->n_size) {
            bp->nb_dirtyend = np->n_size - NBOFF(bp);
            if (bp->nb_dirtyoff >= bp->nb_dirtyend) {
                bp->nb_dirtyoff = bp->nb_dirtyend = 0;
    if (!error && (bp->nb_dirtyend > 0)) {
        /* there's a dirty range that needs to be written out */
        nfsbufpgs pagemask, pagemaskand;

        doff = bp->nb_dirtyoff;
        dend = bp->nb_dirtyend;

        /* if doff page is dirty, move doff to start of page */
        if (NBPGDIRTY(bp, doff / PAGE_SIZE)) {
            doff -= doff & PAGE_MASK;
        /* try to expand write range to include preceding dirty pages */
        if (!(doff & PAGE_MASK)) {
            while ((doff > 0) && NBPGDIRTY(bp, (doff - 1) / PAGE_SIZE)) {
        /* if dend page is dirty, move dend to start of next page */
        if ((dend & PAGE_MASK) && NBPGDIRTY(bp, dend / PAGE_SIZE)) {
            dend = round_page_64(dend);
        /* try to expand write range to include trailing dirty pages */
        if (!(dend & PAGE_MASK)) {
            while ((dend < (int)bp->nb_bufsize) && NBPGDIRTY(bp, dend / PAGE_SIZE)) {
        /* make sure to keep dend clipped to EOF */
        if ((NBOFF(bp) + dend) > (off_t)np->n_size) {
            dend = np->n_size - NBOFF(bp);
        /* calculate range of complete pages being written */
        firstpg = doff / PAGE_SIZE;
        lastpg = (dend - 1) / PAGE_SIZE;
        /* calculate mask for that page range */
        nfs_buf_pgs_set_pages_between(&pagemask, firstpg, lastpg + 1);
        NBPGS_ERASE(&pagemask);

        /*
         * compare page mask to nb_dirty; if there are other dirty pages
         * then write FILESYNC; otherwise, write UNSTABLE if async and
         * not needcommit/stable; otherwise write FILESYNC
         */
        nfs_buf_pgs_bit_not(&pagemask);
        nfs_buf_pgs_bit_and(&bp->nb_dirty, &pagemask, &pagemaskand);
        if (nfs_buf_pgs_is_set(&pagemaskand)) {
            iomode = NFS_WRITE_FILESYNC;
        } else if ((bp->nb_flags & (NB_ASYNC | NB_NEEDCOMMIT | NB_STABLE)) == NB_ASYNC) {
            iomode = NFS_WRITE_UNSTABLE;
            iomode = NFS_WRITE_FILESYNC;

        /* write the whole contiguous dirty range */
        bp->nb_offio = doff;
        bp->nb_endio = dend;

        OSAddAtomic64(1, &nfsstats.write_bios);

        SET(bp->nb_flags, NB_WRITEINPROG);
        error = nfs_buf_write_rpc(bp, iomode, thd, cred);
        /*
         * For async I/O, the callbacks will finish up the
         * write and push out any dirty pages. Otherwise,
         * the write has already been finished and any dirty
         */
        if (!error && nfs_buf_pgs_is_set(&bp->nb_dirty)) { /* write out any dirty pages */
            error = nfs_buf_write_dirty_pages(bp, thd, cred);

    /* note: bp is still valid only for !async case */
        error = nfs_buf_iowait(bp);
        /* move to clean list */
        if (oldflags & NB_DELWRI) {
            lck_mtx_lock(nfs_buf_mutex);
            if (bp->nb_vnbufs.le_next != NFSNOLIST) {
                LIST_REMOVE(bp, nb_vnbufs);
            LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
            lck_mtx_unlock(nfs_buf_mutex);
        FSDBG_BOT(553, bp, NBOFF(bp), bp->nb_flags, error);
        nfs_buf_release(bp, 1);
        /* check if we need to invalidate (and we can) */
        if ((np->n_flag & NNEEDINVALIDATE) &&
            !(np->n_bflag & (NBINVALINPROG | NBFLUSHINPROG))) {
            nfs_node_lock_force(np);
            if (np->n_flag & NNEEDINVALIDATE) {
                np->n_flag &= ~NNEEDINVALIDATE;
            nfs_node_unlock(np);
            /*
             * There was a write error and we need to
             * invalidate attrs and flush buffers in
             * order to sync up with the server.
             * (if this write was extending the file,
             * we may no longer know the correct size)
             * But we couldn't call vinvalbuf while holding
             * the buffer busy. So we call vinvalbuf() after
             * releasing the buffer.
             */
            nfs_vinvalbuf2(NFSTOV(np), V_SAVE | V_IGNORE_WRITEERR, thd, cred, 1);

    if (IS_VALID_CRED(cred)) {
        kauth_cred_unref(&cred);
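    /*
     * Note: for a synchronous write the code above waits for the I/O and handles
     * any deferred invalidation itself; for an async write those steps are left to
     * nfs_buf_write_rpc_finish()/nfs_buf_write_finish() via the RPC callback.
     */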
/*
 * finish the writing of a buffer
 */
nfs_buf_write_finish(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred)

    nfsnode_t np = bp->nb_np;
    int error = (bp->nb_flags & NB_ERROR) ? bp->nb_error : 0;
    off_t firstpg, lastpg;

    if ((error == EINTR) || (error == ERESTART)) {
        CLR(bp->nb_flags, NB_ERROR);
        SET(bp->nb_flags, NB_EINTR);

    /* calculate range of complete pages being written */
    if (bp->nb_endio > bp->nb_offio) {
        firstpg = bp->nb_offio / PAGE_SIZE;
        lastpg = (bp->nb_endio - 1) / PAGE_SIZE;
        /* calculate mask for that page range written */
        nfs_buf_pgs_set_pages_between(&pagemask, firstpg, lastpg + 1);
        NBPGS_ERASE(&pagemask);
    /* clear dirty bits for pages we've written */
    nfs_buf_pgs_bit_not(&pagemask);
    nfs_buf_pgs_bit_and(&bp->nb_dirty, &pagemask, &bp->nb_dirty);

    /* manage needcommit state */
    if (!error && (bp->nb_commitlevel == NFS_WRITE_UNSTABLE)) {
        if (!ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
            nfs_node_lock_force(np);
            np->n_needcommitcnt++;
            nfs_node_unlock(np);
            SET(bp->nb_flags, NB_NEEDCOMMIT);
        /* make sure nb_dirtyoff/nb_dirtyend reflect actual range written */
        bp->nb_dirtyoff = bp->nb_offio;
        bp->nb_dirtyend = bp->nb_endio;
    } else if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
        nfs_node_lock_force(np);
        np->n_needcommitcnt--;
        CHECK_NEEDCOMMITCNT(np);
        nfs_node_unlock(np);
        CLR(bp->nb_flags, NB_NEEDCOMMIT);

    CLR(bp->nb_flags, NB_WRITEINPROG);

    /*
     * For an unstable write, the buffer is still treated as dirty until
     * a commit (or stable (re)write) is performed.  Buffers needing only
     * a commit are marked with the NB_DELWRI and NB_NEEDCOMMIT flags.
     * If the write was interrupted we set NB_EINTR.  Don't set NB_ERROR
     * because that would cause the buffer to be dropped.  The buffer is
     * still valid and simply needs to be written again.
     */
    if ((error == EINTR) || (error == ERESTART) || (!error && (bp->nb_flags & NB_NEEDCOMMIT))) {
        CLR(bp->nb_flags, NB_INVAL);
        if (!ISSET(bp->nb_flags, NB_DELWRI)) {
            SET(bp->nb_flags, NB_DELWRI);
            lck_mtx_lock(nfs_buf_mutex);
            lck_mtx_unlock(nfs_buf_mutex);
        /*
         * Since for the NB_ASYNC case, we've reassigned the buffer to the
         * clean list, we have to reassign it back to the dirty one. Ugh.
         */
        if (ISSET(bp->nb_flags, NB_ASYNC)) {
            /* move to dirty list */
            lck_mtx_lock(nfs_buf_mutex);
            if (bp->nb_vnbufs.le_next != NFSNOLIST) {
                LIST_REMOVE(bp, nb_vnbufs);
            LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
            lck_mtx_unlock(nfs_buf_mutex);
        /* either there's an error or we don't need to commit */
            /*
             * There was a write error and we need to invalidate
             * attrs and flush buffers in order to sync up with the
             * server. (if this write was extending the file, we
             * may no longer know the correct size)
             * But we can't call vinvalbuf while holding this
             * buffer busy. Set a flag to do it after releasing
             */
            nfs_node_lock_force(np);
            np->n_error = error;
            np->n_flag |= (NWRITEERR | NNEEDINVALIDATE);
            NATTRINVALIDATE(np);
            nfs_node_unlock(np);
        /* clear the dirty range */
        bp->nb_dirtyoff = bp->nb_dirtyend = 0;

    if (!error && nfs_buf_pgs_is_set(&bp->nb_dirty)) {
        nfs_buf_write_dirty_pages(bp, thd, cred);
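/*
 * Note: the pagemask computed above covers the pages that were completely written;
 * inverting it and ANDing with nb_dirty clears just those pages' dirty bits while
 * leaving any remaining dirty pages for nfs_buf_write_dirty_pages() to pick up.
 */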
/*
 * write out any pages marked dirty in a buffer
 * We use unstable writes and follow up with a commit.
 * If we catch the write verifier changing, we restart and
 * redo the writes FILESYNC.
 */
nfs_buf_write_dirty_pages(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred)

    nfsnode_t np = bp->nb_np;
    struct nfsmount *nmp = NFSTONMP(np);
    int error = 0, commit, iomode, iomode2, len, pg, count, npages, off;
    char uio_buf[UIO_SIZEOF(1)];

    if (!nfs_buf_pgs_is_set(&bp->nb_dirty)) {

    /* there are pages marked dirty that need to be written out */
    OSAddAtomic64(1, &nfsstats.write_bios);
    SET(bp->nb_flags, NB_WRITEINPROG);
    npages = bp->nb_bufsize / PAGE_SIZE;
    iomode = NFS_WRITE_UNSTABLE;

    auio = uio_createwithbuffer(1, 0, UIO_SYSSPACE, UIO_WRITE,
        &uio_buf, sizeof(uio_buf));

    NBPGS_COPY(&dirty, &bp->nb_dirty);
    wverf = bp->nb_verf;
    commit = NFS_WRITE_FILESYNC;
    for (pg = 0; pg < npages; pg++) {
        if (!NBPGDIRTY(bp, pg)) {
        while (((pg + count) < npages) && NBPGDIRTY(bp, pg + count)) {
        /* write count pages starting with page pg */
        off = pg * PAGE_SIZE;
        len = count * PAGE_SIZE;
        /* clip writes to EOF */
        if (NBOFF(bp) + off + len > (off_t)np->n_size) {
            len -= (NBOFF(bp) + off + len) - np->n_size;
        uio_reset(auio, NBOFF(bp) + off, UIO_SYSSPACE, UIO_WRITE);
        uio_addiov(auio, CAST_USER_ADDR_T(bp->nb_data + off), len);
        error = nfs_write_rpc2(np, auio, thd, cred, &iomode2, &bp->nb_verf);
        if (iomode2 < commit) { /* Retain the lowest commitment level returned. */
        if ((commit != NFS_WRITE_FILESYNC) && (wverf != bp->nb_verf)) {
            /* verifier changed, redo all the writes filesync */
            iomode = NFS_WRITE_FILESYNC;
        /* clear dirty bits */
            NBPGS_UNSET(&dirty, pg);
            if (count) { /* leave pg on last page */
    CLR(bp->nb_flags, NB_WRITEINPROG);

    if (!error && (commit != NFS_WRITE_FILESYNC)) {
        error = nmp->nm_funcs->nf_commit_rpc(np, NBOFF(bp), bp->nb_bufsize, cred, wverf);
        if (error == NFSERR_STALEWRITEVERF) {
            /* verifier changed, so we need to restart all the writes */
            iomode = NFS_WRITE_FILESYNC;

    NBPGS_COPY(&bp->nb_dirty, &dirty);
        SET(bp->nb_flags, NB_ERROR);
        bp->nb_error = error;
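/*
 * Note: the loop above gathers runs of contiguous dirty pages and writes each run
 * with one RPC, remembering the weakest commit level returned.  If anything was
 * written UNSTABLE, a single COMMIT over the whole buffer follows; a changed write
 * verifier at any point forces the writes to be redone FILESYNC.
 */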
/*
 * initiate the NFS WRITE RPC(s) for a buffer
 */
nfs_buf_write_rpc(struct nfsbuf *bp, int iomode, thread_t thd, kauth_cred_t cred)

    struct nfsmount *nmp;
    nfsnode_t np = bp->nb_np;
    int error = 0, nfsvers, async;
    struct nfsreq_cbinfo cb;
    char uio_buf[UIO_SIZEOF(1)];
    off_t offset, length;

    if (nfs_mount_gone(nmp)) {
        bp->nb_error = error = ENXIO;
        SET(bp->nb_flags, NB_ERROR);

    nfsvers = nmp->nm_vers;
    nmwsize = nmp->nm_wsize;

    offset = bp->nb_offio;
    length = bp->nb_endio - bp->nb_offio;

    /* Note: Can only do async I/O if nfsiods are configured. */
    async = (bp->nb_flags & NB_ASYNC) && (NFSIOD_MAX > 0);
    bp->nb_commitlevel = NFS_WRITE_FILESYNC;
    cb.rcb_func = async ? nfs_buf_write_rpc_finish : NULL;

    if ((nfsvers == NFS_VER2) && ((NBOFF(bp) + bp->nb_endio) > 0xffffffffLL)) {
        bp->nb_error = error = EFBIG;
        SET(bp->nb_flags, NB_ERROR);

        /* We should never get here */
        printf("nfs_buf_write_rpc: Got request with zero length. np %p, bp %p, offset %lld\n", np, bp, offset);
        printf("nfs_buf_write_rpc: Got request with zero length.\n");
#endif /* DEVELOPMENT */

    auio = uio_createwithbuffer(1, NBOFF(bp) + offset, UIO_SYSSPACE,
        UIO_WRITE, &uio_buf, sizeof(uio_buf));
    NFS_UIO_ADDIOV(auio, CAST_USER_ADDR_T(bp->nb_data + offset), length);

    bp->nb_rpcs = nrpcs = (length + nmwsize - 1) / nmwsize;
    if (async && (nrpcs > 1)) {
        SET(bp->nb_flags, NB_MULTASYNCRPC);
        CLR(bp->nb_flags, NB_MULTASYNCRPC);

    while (length > 0) {
        if (ISSET(bp->nb_flags, NB_ERROR)) {
            error = bp->nb_error;
        len = (length > nmwsize) ? nmwsize : (uint32_t)length;
        cb.rcb_args.offset = offset;
        cb.rcb_args.length = len;
        if (nmp->nm_vers >= NFS_VER4) {
            cb.rcb_args.stategenid = nmp->nm_stategenid;
        if (async && ((error = nfs_async_write_start(nmp)))) {
        error = nmp->nm_funcs->nf_write_rpc_async(np, auio, len, thd, cred,
            nfs_async_write_done(nmp);

        nfs_buf_write_rpc_finish(req);

        /*
         * Something bad happened while trying to send the RPCs.
         * Wait for any outstanding requests to complete.
         */
        bp->nb_error = error;
        SET(bp->nb_flags, NB_ERROR);
        if (ISSET(bp->nb_flags, NB_MULTASYNCRPC)) {
            nrpcs = (length + nmwsize - 1) / nmwsize;
            lck_mtx_lock(nfs_buf_mutex);
            bp->nb_rpcs -= nrpcs;
            if (bp->nb_rpcs == 0) {
                /* No RPCs left, so the buffer's done */
                lck_mtx_unlock(nfs_buf_mutex);
                nfs_buf_write_finish(bp, thd, cred);
                /* wait for the last RPC to mark it done */
                while (bp->nb_rpcs > 0) {
                    msleep(&bp->nb_rpcs, nfs_buf_mutex, 0,
                        "nfs_buf_write_rpc_cancel", NULL);
                lck_mtx_unlock(nfs_buf_mutex);
            nfs_buf_write_finish(bp, thd, cred);
        /* It may have just been an interrupt... that's OK */
        if (!ISSET(bp->nb_flags, NB_ERROR)) {
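/*
 * Note: the dirty range is split into nm_wsize-sized chunks, one WRITE RPC per
 * chunk.  NB_MULTASYNCRPC is set when more than one async RPC is issued so the
 * completion code knows nb_rpcs must be adjusted under nfs_buf_mutex.
 */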
/*
 * finish up an NFS WRITE RPC on a buffer
 */
nfs_buf_write_rpc_finish(struct nfsreq *req)

    int error = 0, nfsvers, multasyncrpc, finished;
    int committed = NFS_WRITE_FILESYNC;
    size_t rlen, length;
    void *wakeme = NULL;
    struct nfsreq_cbinfo cb;
    struct nfsreq *wreq = NULL;
    struct nfsmount *nmp;
    char uio_buf[UIO_SIZEOF(1)];

    thd = req->r_thread;
    if (IS_VALID_CRED(cred)) {
        kauth_cred_ref(cred);
    cb = req->r_callback;
    if (cb.rcb_func) { /* take an extra reference on the nfsreq in case we want to resend it later due to grace error */
        nfs_request_ref(req, 0);

    if (nfs_mount_gone(nmp)) {
        SET(bp->nb_flags, NB_ERROR);
        bp->nb_error = error = ENXIO;
    if (error || ISSET(bp->nb_flags, NB_ERROR)) {
        nfs_request_async_cancel(req);

    nfsvers = nmp->nm_vers;

    offset = cb.rcb_args.offset;
    rlen = length = cb.rcb_args.length;

    /* finish the RPC */
    error = nmp->nm_funcs->nf_write_rpc_async_finish(np, req, &committed, &rlen, &wverf);
    if ((error == EINPROGRESS) && cb.rcb_func) {
        /* async request restarted */
        nfs_request_rele(req);
        if (IS_VALID_CRED(cred)) {
            kauth_cred_unref(&cred);

    if ((nmp->nm_vers >= NFS_VER4) && nfs_mount_state_error_should_restart(error) && !ISSET(bp->nb_flags, NB_ERROR)) {
        lck_mtx_lock(&nmp->nm_lock);
        if ((error != NFSERR_OLD_STATEID) && (error != NFSERR_GRACE) && (cb.rcb_args.stategenid == nmp->nm_stategenid)) {
            NP(np, "nfs_buf_write_rpc_finish: error %d @ 0x%llx, 0x%x 0x%x, initiating recovery",
                error, NBOFF(bp) + offset, cb.rcb_args.stategenid, nmp->nm_stategenid);
            nfs_need_recover(nmp, error);
        lck_mtx_unlock(&nmp->nm_lock);
        if (np->n_flag & NREVOKE) {
        if (error == NFSERR_GRACE) {
            /*
             * For an async I/O request, handle a grace delay just like
             * jukebox errors.  Set the resend time and queue it up.
             */
            if (req->r_nmrep.nmc_mhead) {
                mbuf_freem(req->r_nmrep.nmc_mhead);
                req->r_nmrep.nmc_mhead = NULL;
            lck_mtx_lock(&req->r_mtx);
            req->r_resendtime = now.tv_sec + 2;
            req->r_xid = 0; // get a new XID
            req->r_flags |= R_RESTART;
            nfs_asyncio_resend(req);
            lck_mtx_unlock(&req->r_mtx);
            if (IS_VALID_CRED(cred)) {
                kauth_cred_unref(&cred);
            /* Note: nfsreq reference taken will be dropped later when finished */
            /* otherwise, just pause a couple seconds and retry */
            tsleep(&nmp->nm_state, (PZERO - 1), "nfsgrace", 2 * hz);
        if (!(error = nfs_mount_state_wait_for_recovery(nmp))) {
        SET(bp->nb_flags, NB_ERROR);
        bp->nb_error = error;

    if (error || (nfsvers == NFS_VER2)) {
        SET(bp->nb_flags, NB_ERROR);
        bp->nb_error = error = EIO;

    /* save lowest commit level returned */
    if (committed < bp->nb_commitlevel) {
        bp->nb_commitlevel = committed;

    /* check the write verifier */
        bp->nb_verf = wverf;
    } else if (bp->nb_verf != wverf) {
        /* verifier changed, so buffer will need to be rewritten */
        bp->nb_flags |= NB_STALEWVERF;
        bp->nb_commitlevel = NFS_WRITE_UNSTABLE;
        bp->nb_verf = wverf;

    if (!ISSET(bp->nb_flags, NB_STALEWVERF) && rlen > 0 && (bp->nb_offio < (offset + (int)rlen))) {
        bp->nb_offio = offset + rlen;

    /*
     * check for a short write
     * If the server didn't write all the data, then we
     * need to issue another write for the rest of it.
     * (Don't bother if the buffer hit an error or stale wverf.)
     */
    if ((rlen < length) && !(bp->nb_flags & (NB_STALEWVERF | NB_ERROR))) {
        auio = uio_createwithbuffer(1, NBOFF(bp) + offset, UIO_SYSSPACE,
            UIO_WRITE, &uio_buf, sizeof(uio_buf));
        uio_addiov(auio, CAST_USER_ADDR_T(bp->nb_data + offset), length);

        cb.rcb_args.offset = offset;
        cb.rcb_args.length = length;
        if (nmp->nm_vers >= NFS_VER4) {
            cb.rcb_args.stategenid = nmp->nm_stategenid;

        // XXX iomode should really match the original request
        error = nmp->nm_funcs->nf_write_rpc_async(np, auio, length, thd, cred,
            NFS_WRITE_FILESYNC, &cb, &wreq);
            if (IS_VALID_CRED(cred)) {
                kauth_cred_unref(&cred);
            /* if !async we'll need to wait for this RPC to finish */
            nfs_request_rele(req);
            /*
             * Outstanding RPC count is unchanged.
             * Callback will be called when RPC is done.
             */
        SET(bp->nb_flags, NB_ERROR);
        bp->nb_error = error;

    nfs_async_write_done(nmp);
    nfs_request_rele(req);
    /*
     * Decrement outstanding RPC count on buffer
     * and call nfs_buf_write_finish on last RPC.
     *
     * (Note: when there are multiple async RPCs issued for a
     * buffer we need nfs_buffer_mutex to avoid problems when
     * aborting a partially-initiated set of RPCs)
     */
    multasyncrpc = ISSET(bp->nb_flags, NB_MULTASYNCRPC);
    lck_mtx_lock(nfs_buf_mutex);
    finished = (bp->nb_rpcs == 0);
    lck_mtx_unlock(nfs_buf_mutex);

        wakeme = &bp->nb_rpcs;
        nfs_buf_write_finish(bp, thd, cred);

    if (IS_VALID_CRED(cred)) {
        kauth_cred_unref(&cred);

    if (cb.rcb_func && np->n_needcommitcnt >= NFS_A_LOT_OF_NEEDCOMMITS) {
        nfs_flushcommits(np, 1);
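/*
 * Note: each completing RPC trims the verified range (advancing nb_offio),
 * reissues any short write, and the last one to finish (nb_rpcs reaching zero)
 * calls nfs_buf_write_finish() and wakes any canceller sleeping on nb_rpcs
 * (see nfs_buf_write_rpc above).
 */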
/*
 * Send commit(s) for the given node's "needcommit" buffers
 */
nfs_flushcommits(nfsnode_t np, int nowait)

    struct nfsmount *nmp;
    struct nfsbuf *bp, *prevlbp, *lbp;
    struct nfsbuflists blist, commitlist;
    int error = 0, retv, wcred_set, flags;
    u_quad_t off, endoff, toff;
    uint64_t wverf, count;
    kauth_cred_t wcred = NULL;

    FSDBG_TOP(557, np, 0, 0, 0);
    /*
     * A nb_flags == (NB_DELWRI | NB_NEEDCOMMIT) block has been written to the
     * server, but has not been committed to stable storage on the server
     * yet.  The byte range is worked out for as many nfsbufs as we can handle
     * and the commit RPC is done.
     */
    if (!LIST_EMPTY(&np->n_dirtyblkhd)) {
        error = nfs_node_lock(np);
        np->n_flag |= NMODIFIED;
        nfs_node_unlock(np);

    LIST_INIT(&commitlist);

    if (nfs_mount_gone(nmp)) {
    if (nmp->nm_vers == NFS_VER2) {

        flags |= NBI_NOWAIT;
    lck_mtx_lock(nfs_buf_mutex);
    wverf = nmp->nm_verf;
    if (!nfs_buf_iterprepare(np, &blist, flags)) {
        while ((bp = LIST_FIRST(&blist))) {
            LIST_REMOVE(bp, nb_vnbufs);
            LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
            error = nfs_buf_acquire(bp, NBAC_NOWAIT, 0, 0);
            if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
                nfs_buf_check_write_verifier(np, bp);
            if (((bp->nb_flags & (NB_DELWRI | NB_NEEDCOMMIT)) != (NB_DELWRI | NB_NEEDCOMMIT)) ||
                (bp->nb_verf != wverf)) {
            nfs_buf_remfree(bp);

            /* buffer UPLs will be grabbed *in order* below */

            FSDBG(557, bp, bp->nb_flags, bp->nb_valid, bp->nb_dirty);
            FSDBG(557, bp->nb_validoff, bp->nb_validend,
                bp->nb_dirtyoff, bp->nb_dirtyend);

            /*
             * Work out if all buffers are using the same cred
             * so we can deal with them all with one commit.
             * Note: creds in bp's must be obtained by kauth_cred_ref
             * on the same original cred in order for them to be equal.
             */
            if (wcred_set == 0) {
                wcred = bp->nb_wcred;
                if (!IS_VALID_CRED(wcred)) {
                    panic("nfs: needcommit w/out wcred");
            } else if ((wcred_set == 1) && wcred != bp->nb_wcred) {
            SET(bp->nb_flags, NB_WRITEINPROG);

            /*
             * Add this buffer to the list of buffers we are committing.
             * Buffers are inserted into the list in ascending order so that
             * we can take the UPLs in order after the list is complete.
             */
            LIST_FOREACH(lbp, &commitlist, nb_vnbufs) {
                if (bp->nb_lblkno < lbp->nb_lblkno) {
            LIST_REMOVE(bp, nb_vnbufs);
                LIST_INSERT_AFTER(prevlbp, bp, nb_vnbufs);
                LIST_INSERT_HEAD(&commitlist, bp, nb_vnbufs);

            /* update commit range start, end */
            toff = NBOFF(bp) + bp->nb_dirtyoff;
            toff += (u_quad_t)(bp->nb_dirtyend - bp->nb_dirtyoff);
            if (toff > endoff) {
        nfs_buf_itercomplete(np, &blist, NBI_DIRTY);
    lck_mtx_unlock(nfs_buf_mutex);

    if (LIST_EMPTY(&commitlist)) {

    /*
     * We need a UPL to prevent others from accessing the buffers during
     * our commit RPC(s).
     * We used to also check for dirty pages here; if there were any we'd
     * abort the commit and force the entire buffer to be written again.
     * Instead of doing that, we just go ahead and commit the dirty range,
     * and then leave the buffer around with dirty pages that will be
     * written out later.
     */
    LIST_FOREACH(bp, &commitlist, nb_vnbufs) {
        if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
            retv = nfs_buf_upl_setup(bp);
                /* Unable to create the UPL, the VM object probably no longer exists. */
                printf("nfs_flushcommits: upl create failed %d\n", retv);
                NBPGS_ERASE(&bp->nb_valid);
                NBPGS_ERASE(&bp->nb_dirty);
            nfs_buf_upl_check(bp);

    /*
     * Commit data on the server, as required.
     * If all bufs are using the same wcred, then use that with
     * one call for all of them, otherwise commit each one
     */
    if (wcred_set == 1) {
        /*
         * Note, it's possible the commit range could be >2^32-1.
         * If it is, we'll send one commit that covers the whole file.
         */
        if ((endoff - off) > 0xffffffff) {
            count = (endoff - off);
        retv = nmp->nm_funcs->nf_commit_rpc(np, off, count, wcred, wverf);
        LIST_FOREACH(bp, &commitlist, nb_vnbufs) {
            toff = NBOFF(bp) + bp->nb_dirtyoff;
            count = bp->nb_dirtyend - bp->nb_dirtyoff;
            retv = nmp->nm_funcs->nf_commit_rpc(np, toff, count, bp->nb_wcred, wverf);

    /*
     * Now, either mark the blocks I/O done or mark the
     * blocks dirty, depending on whether the commit
     */
    while ((bp = LIST_FIRST(&commitlist))) {
        LIST_REMOVE(bp, nb_vnbufs);
        FSDBG(557, bp, retv, bp->nb_flags, bp->nb_dirty);
        nfs_node_lock_force(np);
        CLR(bp->nb_flags, (NB_NEEDCOMMIT | NB_WRITEINPROG));
        np->n_needcommitcnt--;
        CHECK_NEEDCOMMITCNT(np);
        nfs_node_unlock(np);

            /* move back to dirty list */
            lck_mtx_lock(nfs_buf_mutex);
            LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
            lck_mtx_unlock(nfs_buf_mutex);
            nfs_buf_release(bp, 1);

        nfs_node_lock_force(np);
        nfs_node_unlock(np);
        vnode_startwrite(NFSTOV(np));
        if (ISSET(bp->nb_flags, NB_DELWRI)) {
            lck_mtx_lock(nfs_buf_mutex);
            lck_mtx_unlock(nfs_buf_mutex);
            wakeup(&nfs_nbdwrite);
        CLR(bp->nb_flags, (NB_READ | NB_DONE | NB_ERROR | NB_DELWRI));
        /* if block still has dirty pages, we don't want it to */
        /* be released in nfs_buf_iodone().  So, don't set NB_ASYNC. */
        NBPGS_COPY(&dirty, &bp->nb_dirty);
        if (!nfs_buf_pgs_is_set(&dirty)) {
            SET(bp->nb_flags, NB_ASYNC);
            CLR(bp->nb_flags, NB_ASYNC);

        /* move to clean list */
        lck_mtx_lock(nfs_buf_mutex);
        LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
        lck_mtx_unlock(nfs_buf_mutex);

        bp->nb_dirtyoff = bp->nb_dirtyend = 0;

        if (nfs_buf_pgs_is_set(&dirty)) {
            /* throw it back in as a delayed write buffer */
            CLR(bp->nb_flags, NB_DONE);
            nfs_buf_write_delayed(bp);

    FSDBG_BOT(557, np, 0, 0, error);
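/*
 * Note: when every buffer on the commit list shares one write credential
 * (wcred_set == 1), a single COMMIT covering the combined range is sent;
 * otherwise each buffer gets its own COMMIT for just its dirty range.
 */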
/*
 * Flush all the blocks associated with a vnode.
 * Walk through the buffer pool and push any dirty pages
 * associated with the vnode.
 */
nfs_flush(nfsnode_t np, int waitfor, thread_t thd, int ignore_writeerr)

    struct nfsbuflists blist;
    struct nfsmount *nmp = NFSTONMP(np);
    int error = 0, error2, slptimeo = 0, slpflag = 0;
    int nfsvers, flags, passone = 1;

    FSDBG_TOP(517, np, waitfor, ignore_writeerr, 0);

    if (nfs_mount_gone(nmp)) {
    nfsvers = nmp->nm_vers;
    if (NMFLAG(nmp, INTR)) {

    if (!LIST_EMPTY(&np->n_dirtyblkhd)) {
        nfs_node_lock_force(np);
        np->n_flag |= NMODIFIED;
        nfs_node_unlock(np);

    lck_mtx_lock(nfs_buf_mutex);
    while (np->n_bflag & NBFLUSHINPROG) {
        np->n_bflag |= NBFLUSHWANT;
        error = msleep(&np->n_bflag, nfs_buf_mutex, slpflag, "nfs_flush", NULL);
        if ((error && (error != EWOULDBLOCK)) ||
            ((error = nfs_sigintr(NFSTONMP(np), NULL, thd, 0)))) {
            lck_mtx_unlock(nfs_buf_mutex);
    np->n_bflag |= NBFLUSHINPROG;

    /*
     * On the first pass, start async/unstable writes on all
     * delayed write buffers.  Then wait for all writes to complete
     * and call nfs_flushcommits() to commit any uncommitted buffers.
     * On all subsequent passes, start STABLE writes on any remaining
     * dirty buffers.  Then wait for all writes to complete.
     */
    FSDBG(518, LIST_FIRST(&np->n_dirtyblkhd), np->n_flag, 0, 0);
    if (!NFSTONMP(np)) {
        lck_mtx_unlock(nfs_buf_mutex);

    /* Start/do any write(s) that are required. */
    if (!nfs_buf_iterprepare(np, &blist, NBI_DIRTY)) {
        while ((bp = LIST_FIRST(&blist))) {
            LIST_REMOVE(bp, nb_vnbufs);
            LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
            flags = (passone || !(waitfor == MNT_WAIT || waitfor == MNT_DWAIT)) ? NBAC_NOWAIT : 0;
            if (flags != NBAC_NOWAIT) {
            while ((error = nfs_buf_acquire(bp, flags, slpflag, slptimeo))) {
                FSDBG(524, bp, flags, bp->nb_lflags, bp->nb_flags);
                if (error == EBUSY) {
                    error2 = nfs_sigintr(NFSTONMP(np), NULL, thd, 0);
                    if (flags != NBAC_NOWAIT) {
                        nfs_buf_refrele(bp);
                    nfs_buf_itercomplete(np, &blist, NBI_DIRTY);
                    lck_mtx_unlock(nfs_buf_mutex);
                if (slpflag == PCATCH) {
            if (flags != NBAC_NOWAIT) {
                nfs_buf_refrele(bp);
            if (error == EBUSY) {
                /* buffer is no longer valid */
            if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
                nfs_buf_check_write_verifier(np, bp);
            if (!ISSET(bp->nb_flags, NB_DELWRI)) {
                /* buffer is no longer dirty */
            FSDBG(525, bp, passone, bp->nb_lflags, bp->nb_flags);
            if ((passone || !(waitfor == MNT_WAIT || waitfor == MNT_DWAIT)) &&
                ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
            nfs_buf_remfree(bp);
            lck_mtx_unlock(nfs_buf_mutex);
            if (ISSET(bp->nb_flags, NB_ERROR)) {
                nfs_node_lock_force(np);
                np->n_error = bp->nb_error ? bp->nb_error : EIO;
                np->n_flag |= NWRITEERR;
                nfs_node_unlock(np);
                nfs_buf_release(bp, 1);
                lck_mtx_lock(nfs_buf_mutex);
            SET(bp->nb_flags, NB_ASYNC);
                /* NB_STABLE forces this to be written FILESYNC */
                SET(bp->nb_flags, NB_STABLE);
            lck_mtx_lock(nfs_buf_mutex);
        nfs_buf_itercomplete(np, &blist, NBI_DIRTY);
    lck_mtx_unlock(nfs_buf_mutex);

    if (waitfor == MNT_WAIT || waitfor == MNT_DWAIT) {
        while ((error = vnode_waitforwrites(NFSTOV(np), 0, slpflag, slptimeo, "nfsflush"))) {
            error2 = nfs_sigintr(NFSTONMP(np), NULL, thd, 0);
            if (slpflag == PCATCH) {

    if (nfsvers != NFS_VER2) {
        /* loop while it looks like there are still buffers to be */
        /* committed and nfs_flushcommits() seems to be handling them. */
        while (np->n_needcommitcnt) {
            if (nfs_flushcommits(np, 0)) {

    if (!LIST_EMPTY(&np->n_dirtyblkhd)) {
        nfs_node_lock_force(np);
        np->n_flag |= NMODIFIED;
        nfs_node_unlock(np);
    lck_mtx_lock(nfs_buf_mutex);

    if (waitfor == MNT_WAIT || waitfor == MNT_DWAIT) {
        if (!LIST_EMPTY(&np->n_dirtyblkhd)) {
            nfs_node_lock_force(np);
            np->n_flag |= NMODIFIED;
            nfs_node_unlock(np);
        lck_mtx_lock(nfs_buf_mutex);
        if (!LIST_EMPTY(&np->n_dirtyblkhd)) {
        lck_mtx_unlock(nfs_buf_mutex);
        nfs_node_lock_force(np);
        /*
         * OK, it looks like there are no dirty blocks.  If we have no
         * writes in flight and no one in the write code, we can clear
         * the modified flag.  In order to make sure we see the latest
         * attributes and size, we also invalidate the attributes and
         * advance the attribute cache XID to guarantee that attributes
         * newer than our clearing of NMODIFIED will get loaded next.
         * (If we don't do this, it's possible for the flush's final
         * write/commit (xid1) to be executed in parallel with a subsequent
         * getattr request (xid2).  The getattr could return attributes
         * from *before* the write/commit completed but the stale attributes
         * would be preferred because of the xid ordering.)
         */
        if (!np->n_wrbusy && !np->n_numoutput) {
            np->n_flag &= ~NMODIFIED;
            NATTRINVALIDATE(np);
            nfs_get_xid(&np->n_xid);
        nfs_node_lock_force(np);

    FSDBG(526, np->n_flag, np->n_error, 0, 0);
    if (!ignore_writeerr && (np->n_flag & NWRITEERR)) {
        error = np->n_error;
        np->n_flag &= ~NWRITEERR;
    nfs_node_unlock(np);

    lck_mtx_lock(nfs_buf_mutex);
    flags = np->n_bflag;
    np->n_bflag &= ~(NBFLUSHINPROG | NBFLUSHWANT);
    lck_mtx_unlock(nfs_buf_mutex);
    if (flags & NBFLUSHWANT) {
        wakeup(&np->n_bflag);

    FSDBG_BOT(517, np, error, ignore_writeerr, 0);
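/*
 * Note: a write error previously recorded on the node (NWRITEERR/n_error) is
 * returned to the caller here unless ignore_writeerr was passed, and the flag
 * is cleared once it has been reported.
 */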
/*
 * Flush out and invalidate all buffers associated with a vnode.
 * Called with the underlying object locked.
 */
nfs_vinvalbuf_internal(

    struct nfsbuflists blist;
    int list, error = 0;

    if (flags & V_SAVE) {
        if ((error = nfs_flush(np, MNT_WAIT, thd, (flags & V_IGNORE_WRITEERR)))) {

    lck_mtx_lock(nfs_buf_mutex);
    if (nfs_buf_iterprepare(np, &blist, list)) {
        if (nfs_buf_iterprepare(np, &blist, list)) {
    while ((bp = LIST_FIRST(&blist))) {
        LIST_REMOVE(bp, nb_vnbufs);
        if (list == NBI_CLEAN) {
            LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
            LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
        while ((error = nfs_buf_acquire(bp, NBAC_REMOVE, slpflag, slptimeo))) {
            FSDBG(556, np, bp, NBOFF(bp), bp->nb_flags);
            if (error != EAGAIN) {
                FSDBG(554, np, bp, -1, error);
                nfs_buf_refrele(bp);
                nfs_buf_itercomplete(np, &blist, list);
                lck_mtx_unlock(nfs_buf_mutex);
        nfs_buf_refrele(bp);
        FSDBG(554, np, bp, NBOFF(bp), bp->nb_flags);
        lck_mtx_unlock(nfs_buf_mutex);
        if ((flags & V_SAVE) && UBCINFOEXISTS(NFSTOV(np)) && bp->nb_np &&
            (NBOFF(bp) < (off_t)np->n_size)) {
            /* extra paranoia: make sure we're not */
            /* somehow leaving any dirty data around */
            off_t end = (NBOFF(bp) + bp->nb_bufsize > (off_t)np->n_size) ?
                (np->n_size - NBOFF(bp)) : bp->nb_bufsize;
            if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
                error = nfs_buf_upl_setup(bp);
                if (error == EINVAL) {
                    /* vm object must no longer exist */
                    /* hopefully we don't need to do */
                    /* anything for this buffer */
                    printf("nfs_vinvalbuf: upl setup failed %d\n", error);
                NBPGS_ERASE(&bp->nb_valid);
                NBPGS_ERASE(&bp->nb_dirty);
            nfs_buf_upl_check(bp);
            /* check for any dirty data before the EOF */
            if ((bp->nb_dirtyend > 0) && (bp->nb_dirtyoff < end)) {
                /* clip dirty range to EOF */
                if (bp->nb_dirtyend > end) {
                    bp->nb_dirtyend = end;
                    if (bp->nb_dirtyoff >= bp->nb_dirtyend) {
                        bp->nb_dirtyoff = bp->nb_dirtyend = 0;
                if ((bp->nb_dirtyend > 0) && (bp->nb_dirtyoff < end)) {
            nfs_buf_pgs_get_page_mask(&pagemask, round_page_64(end) / PAGE_SIZE);
            nfs_buf_pgs_bit_and(&bp->nb_dirty, &pagemask, &bp->nb_dirty);
            if (nfs_buf_pgs_is_set(&bp->nb_dirty)) {
            /* also make sure we'll have a credential to do the write */
            if (mustwrite && !IS_VALID_CRED(bp->nb_wcred) && !IS_VALID_CRED(cred)) {
                printf("nfs_vinvalbuf: found dirty buffer with no write creds\n");
            FSDBG(554, np, bp, 0xd00dee, bp->nb_flags);
            if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
                panic("nfs_vinvalbuf: dirty buffer without upl");
            /* gotta write out dirty data before invalidating */
            /* (NB_STABLE indicates that data writes should be FILESYNC) */
            /* (NB_NOCACHE indicates buffer should be discarded) */
            CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL | NB_ASYNC));
            SET(bp->nb_flags, NB_STABLE | NB_NOCACHE);
            if (!IS_VALID_CRED(bp->nb_wcred)) {
                kauth_cred_ref(cred);
                bp->nb_wcred = cred;
            error = nfs_buf_write(bp);
            // Note: bp has been released
                FSDBG(554, bp, 0xd00dee, 0xbad, error);
                nfs_node_lock_force(np);
                if ((error != EINTR) && (error != ERESTART)) {
                    np->n_error = error;
                    np->n_flag |= NWRITEERR;
                    /*
                     * There was a write error and we need to
                     * invalidate attrs to sync with server.
                     * (if this write was extending the file,
                     * we may no longer know the correct size)
                     */
                    NATTRINVALIDATE(np);
                nfs_node_unlock(np);
                if ((error == EINTR) || (error == ERESTART)) {
                    /*
                     * Abort on EINTR.  If we don't, we could
                     * be stuck in this loop forever because
                     * the buffer will continue to stay dirty.
                     */
                    lck_mtx_lock(nfs_buf_mutex);
                    nfs_buf_itercomplete(np, &blist, list);
                    lck_mtx_unlock(nfs_buf_mutex);
                lck_mtx_lock(nfs_buf_mutex);
        SET(bp->nb_flags, NB_INVAL);
        // hold off on FREEUPs until we're done here
        nfs_buf_release(bp, 0);
        lck_mtx_lock(nfs_buf_mutex);
    nfs_buf_itercomplete(np, &blist, list);
    if (!LIST_EMPTY(&(np)->n_dirtyblkhd) || !LIST_EMPTY(&(np)->n_cleanblkhd)) {
        panic("nfs_vinvalbuf: flush/inval failed");
    lck_mtx_unlock(nfs_buf_mutex);
    nfs_node_lock_force(np);
    if (!(flags & V_SAVE)) {
        np->n_flag &= ~NMODIFIED;
    if (vnode_vtype(NFSTOV(np)) == VREG) {
        np->n_lastrahead = -1;
    nfs_node_unlock(np);
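/*
 * Note: with V_SAVE the node is flushed first, and any buffer still holding
 * dirty data below EOF is written out FILESYNC (NB_STABLE) with NB_NOCACHE set
 * so it is discarded on completion; only then is everything marked NB_INVAL
 * and released.
 */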
/*
 * Flush and invalidate all dirty buffers.  If another process is already
 * doing the flush, just wait for completion.
 */
nfs_vinvalbuf(vnode_t vp, int flags, vfs_context_t ctx, int intrflg)

    return nfs_vinvalbuf2(vp, flags, vfs_context_thread(ctx), vfs_context_ucred(ctx), intrflg);

nfs_vinvalbuf2(vnode_t vp, int flags, thread_t thd, kauth_cred_t cred, int intrflg)

    nfsnode_t np = VTONFS(vp);
    struct nfsmount *nmp = VTONMP(vp);
    int error, slpflag, slptimeo, nflags, retry = 0;
    int ubcflags = UBC_PUSHALL | UBC_SYNC | UBC_INVALIDATE;
    struct timespec ts = { .tv_sec = 2, .tv_nsec = 0 };

    FSDBG_TOP(554, np, flags, intrflg, 0);
    /*
     * If the mount is gone, there's no sense trying to write anything
     * and hanging while trying to do I/O.
     */
    if (nfs_mount_gone(nmp)) {
        ubcflags &= ~UBC_PUSHALL;

    if (nmp && !NMFLAG(nmp, INTR)) {

    /* First wait for any other process doing a flush to complete. */
    lck_mtx_lock(nfs_buf_mutex);
    while (np->n_bflag & NBINVALINPROG) {
        np->n_bflag |= NBINVALWANT;
        msleep(&np->n_bflag, nfs_buf_mutex, slpflag, "nfs_vinvalbuf", &ts);
        if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0))) {
            lck_mtx_unlock(nfs_buf_mutex);
        if (np->n_bflag & NBINVALINPROG) {
    np->n_bflag |= NBINVALINPROG;
    lck_mtx_unlock(nfs_buf_mutex);

    /* Now, flush as required. */
    error = nfs_vinvalbuf_internal(np, flags, thd, cred, slpflag, 0);
        FSDBG(554, np, 0, 0, error);
        if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0))) {
        error = nfs_vinvalbuf_internal(np, flags, thd, cred, 0, slptimeo);

    /* get the pages out of vm also */
    if (UBCINFOEXISTS(vp) && (size = ubc_getsize(vp))) {
        if ((error = ubc_msync(vp, 0, size, NULL, ubcflags))) {
            if (error == EINVAL) {
                panic("nfs_vinvalbuf(): ubc_msync failed!, error %d", error);
            if (retry++ < 10) { /* retry invalidating a few times */
                if (retry > 1 || error == ENXIO) {
                    ubcflags &= ~UBC_PUSHALL;
            printf("nfs_vinvalbuf(): ubc_msync failed!, error %d\n", error);

    lck_mtx_lock(nfs_buf_mutex);
    nflags = np->n_bflag;
    np->n_bflag &= ~(NBINVALINPROG | NBINVALWANT);
    lck_mtx_unlock(nfs_buf_mutex);
    if (nflags & NBINVALWANT) {
        wakeup(&np->n_bflag);

    FSDBG_BOT(554, np, flags, intrflg, error);
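/*
 * Note: if ubc_msync() fails it is retried up to 10 times, dropping UBC_PUSHALL
 * after the first retry (or immediately on ENXIO) so that the remaining attempts
 * only invalidate pages rather than try to push dirty ones.
 */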
/*
 * Wait for any busy buffers to complete.
 */
nfs_wait_bufs(nfsnode_t np)

    struct nfsbuflists blist;

    lck_mtx_lock(nfs_buf_mutex);
    if (!nfs_buf_iterprepare(np, &blist, NBI_CLEAN)) {
        while ((bp = LIST_FIRST(&blist))) {
            LIST_REMOVE(bp, nb_vnbufs);
            LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
            while ((error = nfs_buf_acquire(bp, 0, 0, 0))) {
                if (error != EAGAIN) {
                    nfs_buf_refrele(bp);
                    nfs_buf_itercomplete(np, &blist, NBI_CLEAN);
                    lck_mtx_unlock(nfs_buf_mutex);
            nfs_buf_refrele(bp);
        nfs_buf_itercomplete(np, &blist, NBI_CLEAN);
    if (!nfs_buf_iterprepare(np, &blist, NBI_DIRTY)) {
        while ((bp = LIST_FIRST(&blist))) {
            LIST_REMOVE(bp, nb_vnbufs);
            LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
            while ((error = nfs_buf_acquire(bp, 0, 0, 0))) {
                if (error != EAGAIN) {
                    nfs_buf_refrele(bp);
                    nfs_buf_itercomplete(np, &blist, NBI_DIRTY);
                    lck_mtx_unlock(nfs_buf_mutex);
            nfs_buf_refrele(bp);
        nfs_buf_itercomplete(np, &blist, NBI_DIRTY);
    lck_mtx_unlock(nfs_buf_mutex);
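/*
 * Note: each clean and dirty buffer is acquired here with no special flags,
 * which effectively waits for any buffer that is currently busy to complete
 * before this routine returns.
 */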
/*
 * Add an async I/O request to the mount's async I/O queue and make
 * sure that an nfsiod will service it.
 */
nfs_asyncio_finish(struct nfsreq *req)

    struct nfsmount *nmp;
    struct nfsiod *niod;

    FSDBG_TOP(552, nmp, 0, 0, 0);

    lck_mtx_lock(nfsiod_mutex);
    niod = nmp->nm_niod;

    /* grab an nfsiod if we don't have one already */
        niod = TAILQ_FIRST(&nfsiodfree);
        TAILQ_REMOVE(&nfsiodfree, niod, niod_link);
        TAILQ_INSERT_TAIL(&nfsiodwork, niod, niod_link);
        niod->niod_nmp = nmp;
    } else if (((nfsiod_thread_count < NFSIOD_MAX) || (nfsiod_thread_count <= 0)) && (started < 4)) {
        /*
         * Try starting a new thread.
         * We may try a couple times if other callers
         * get the new threads before we do.
         */
        lck_mtx_unlock(nfsiod_mutex);
        if (!nfsiod_start()) {
        lck_mtx_lock(nfsiod_mutex);
    /*
     * If we got here while on the resendq, we need to get off it.  This
     * happens when the timer fires and errors out requests via nfs_sigintr,
     * or when we receive a reply (UDP case) while still on the resend queue,
     * so we're just finishing up and are not going to be resent.
     */
    lck_mtx_lock(&req->r_mtx);
    if (req->r_flags & R_RESENDQ) {
        lck_mtx_lock(&nmp->nm_lock);
        if ((req->r_flags & R_RESENDQ) && req->r_rchain.tqe_next != NFSREQNOLIST) {
            NFS_BIO_DBG("Processing async request on resendq. Removing");
            TAILQ_REMOVE(&nmp->nm_resendq, req, r_rchain);
            req->r_flags &= ~R_RESENDQ;
            req->r_rchain.tqe_next = NFSREQNOLIST;
            assert(req->r_refs > 1);
            /* Remove resendq reference */
        lck_mtx_unlock(&nmp->nm_lock);
    lck_mtx_unlock(&req->r_mtx);

    if (req->r_achain.tqe_next == NFSREQNOLIST) {
        TAILQ_INSERT_TAIL(&nmp->nm_iodq, req, r_achain);

    /* If this mount doesn't already have an nfsiod working on it... */
    if (!nmp->nm_niod) {
        if (niod) { /* give it the nfsiod we just grabbed */
            nmp->nm_niod = niod;
            lck_mtx_unlock(nfsiod_mutex);
        } else if (nfsiod_thread_count > 0) {
            /* just queue it up on nfsiod mounts queue if needed */
            if (nmp->nm_iodlink.tqe_next == NFSNOLIST) {
                TAILQ_INSERT_TAIL(&nfsiodmounts, nmp, nm_iodlink);
            lck_mtx_unlock(nfsiod_mutex);
            printf("nfs_asyncio(): no nfsiods? %d %d (%d)\n", nfsiod_thread_count, NFSIOD_MAX, started);
            lck_mtx_unlock(nfsiod_mutex);
            /* we have no other option but to be persistent */
        lck_mtx_unlock(nfsiod_mutex);

    FSDBG_BOT(552, nmp, 0, 0, 0);
/*
 * queue up async I/O request for resend
 * Must be called with req->r_mtx locked.
 */
nfs_asyncio_resend(struct nfsreq *req)

    struct nfsmount *nmp = req->r_nmp;

    if (nfs_mount_gone(nmp)) {

    nfs_gss_clnt_rpcdone(req);
    lck_mtx_lock(&nmp->nm_lock);
    if (!(req->r_flags & R_RESENDQ)) {
        TAILQ_INSERT_TAIL(&nmp->nm_resendq, req, r_rchain);
        req->r_flags |= R_RESENDQ;
        /*
         * We take a reference on this request so that it can't be
         * destroyed while a resend is queued or in progress.
         */
        nfs_request_ref(req, 1);
    nfs_mount_sock_thread_wake(nmp);
    lck_mtx_unlock(&nmp->nm_lock);
/*
 * Read directory data into a buffer.
 *
 * Buffer will be filled (unless EOF is hit).
 * Buffers after this one may also be completely/partially filled.
 */
nfs_buf_readdir(struct nfsbuf *bp, vfs_context_t ctx)

    nfsnode_t np = bp->nb_np;
    struct nfsmount *nmp = NFSTONMP(np);

    if (nfs_mount_gone(nmp)) {
    if (nmp->nm_vers < NFS_VER4) {
        error = nfs3_readdir_rpc(np, bp, ctx);
        error = nfs4_readdir_rpc(np, bp, ctx);
    if (error && (error != NFSERR_DIRBUFDROPPED)) {
        SET(bp->nb_flags, NB_ERROR);
        bp->nb_error = error;

#endif /* CONFIG_NFS_CLIENT */