/*
 * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)nfs_bio.c	8.9 (Berkeley) 3/30/95
 * FreeBSD-Id: nfs_bio.c,v 1.44 1997/09/10 19:52:25 phk Exp $
 */
#include <nfs/nfs_conf.h>

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/proc_internal.h>
#include <sys/kauth.h>
#include <sys/malloc.h>
#include <sys/vnode.h>
#include <sys/dirent.h>
#include <sys/mount_internal.h>
#include <sys/kernel.h>
#include <sys/ubc_internal.h>
#include <sys/uio_internal.h>
#include <sys/kpi_mbuf.h>

#include <sys/vmparam.h>

#include <kern/clock.h>
#include <libkern/OSAtomic.h>
#include <kern/kalloc.h>
#include <kern/thread_call.h>

#include <nfs/rpcv2.h>
#include <nfs/nfsproto.h>
#include <nfs/nfs_gss.h>
#include <nfs/nfsmount.h>
#include <nfs/nfsnode.h>
#include <sys/buf_internal.h>
#include <libkern/OSAtomic.h>
#include <os/refcnt.h>
#define NFS_BIO_DBG(...) NFS_DBG(NFS_FAC_BIO, 7, ## __VA_ARGS__)

kern_return_t thread_terminate(thread_t); /* XXX */

#define NFSBUFHASH(np, lbn)	\
	(&nfsbufhashtbl[((long)(np) / sizeof(*(np)) + (int)(lbn)) & nfsbufhash])
LIST_HEAD(nfsbufhashhead, nfsbuf) * nfsbufhashtbl;
struct nfsbuffreehead nfsbuffree, nfsbuffreemeta, nfsbufdelwri;
u_long nfsbufhash;
int nfsbufcnt, nfsbufmin, nfsbufmax, nfsbufmetacnt, nfsbufmetamax;
int nfsbuffreecnt, nfsbuffreemetacnt, nfsbufdelwricnt, nfsneedbuffer;

int nfs_buf_timer_on = 0;
thread_t nfsbufdelwrithd = NULL;

static ZONE_DECLARE(nfsbuf_zone, "NFS bio", sizeof(struct nfsbuf), ZC_NONE);

static LCK_GRP_DECLARE(nfs_buf_lck_grp, "nfs buf");
LCK_MTX_DECLARE(nfs_buf_mutex, &nfs_buf_lck_grp);
#define NFSBUF_FREE_PERIOD	30	/* seconds */
#define NFSBUF_LRU_STALE	120
#define NFSBUF_META_STALE	240

/* number of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffree list */
#define LRU_TO_FREEUP			6
/* number of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffreemeta list */
#define META_TO_FREEUP			3
/* total number of nfsbufs nfs_buf_freeup() should attempt to free */
#define TOTAL_TO_FREEUP			(LRU_TO_FREEUP+META_TO_FREEUP)
/* fraction of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffree list when called from timer */
#define LRU_FREEUP_FRAC_ON_TIMER	8
/* fraction of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffreemeta list when called from timer */
#define META_FREEUP_FRAC_ON_TIMER	16
/* fraction of total nfsbufs that nfsbuffreecnt should exceed before bothering to call nfs_buf_freeup() */
#define LRU_FREEUP_MIN_FRAC		4
/* fraction of total nfsbufs that nfsbuffreemetacnt should exceed before bothering to call nfs_buf_freeup() */
#define META_FREEUP_MIN_FRAC		2

#define NFS_ROUND_BLOCK(p, blksize) ((((uint64_t)(p) + blksize - 1) & ~((uint64_t)blksize - 1)) / blksize)

#define NFS_BUF_FREEUP() \
	do { \
	/* only call nfs_buf_freeup() if it has work to do: */ \
	        if (((nfsbuffreecnt > nfsbufcnt/LRU_FREEUP_MIN_FRAC) || \
	            (nfsbuffreemetacnt > nfsbufcnt/META_FREEUP_MIN_FRAC)) && \
	            ((nfsbufcnt - TOTAL_TO_FREEUP) > nfsbufmin)) { \
	                nfs_buf_freeup(0); \
	        } \
	} while (0)
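
/*
 * Illustration only (not part of the original source): a minimal sketch of the
 * policy NFS_BUF_FREEUP() encodes, written as a helper so the threshold
 * arithmetic is easier to read.  The helper name nfs_buf_freeup_needed is
 * hypothetical; it assumes only the counters and macros defined above.
 */
static inline int
nfs_buf_freeup_needed(void)
{
	/* enough free (or free meta) buffers to be worth a sweep... */
	int lots_free = (nfsbuffreecnt > nfsbufcnt / LRU_FREEUP_MIN_FRAC) ||
	    (nfsbuffreemetacnt > nfsbufcnt / META_FREEUP_MIN_FRAC);
	/* ...and freeing TOTAL_TO_FREEUP of them still leaves us above the floor */
	int above_floor = (nfsbufcnt - TOTAL_TO_FREEUP) > nfsbufmin;

	return lots_free && above_floor;
}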
void
nfs_buf_pgs_get_page_mask(nfsbufpgs *nfsbp, off_t page)
{
	off_t page_pos = page / NBPGS_ELEMENT_PAGES;
	off_t max_page = NBPGS_STRUCT_SIZE * 8;
	NBPGS_ERASE(nfsbp);

	if (page >= max_page) {
		nfs_buf_pgs_bit_not(nfsbp);
		return;
	}

	NBPGS_SET(nfsbp, page);
	nfsbp->pages[page_pos]--;
	for (off_t i = page_pos - 1; i >= 0; i--) {
		nfsbp->pages[i] = ~0;
	}
}
void
nfs_buf_pgs_bit_not(nfsbufpgs *nfsbp)
{
	for (uint32_t i = 0; i < NBPGS_ELEMENTS; i++) {
		nfsbp->pages[i] = ~nfsbp->pages[i];
	}
}
void
nfs_buf_pgs_bit_and(nfsbufpgs *nfsbp_src1, nfsbufpgs *nfsbp_src2, nfsbufpgs *nfsbp_dst)
{
	for (uint32_t i = 0; i < NBPGS_ELEMENTS; i++) {
		nfsbp_dst->pages[i] = nfsbp_src1->pages[i] & nfsbp_src2->pages[i];
	}
}
void
nfs_buf_pgs_set_pages_between(nfsbufpgs *nfsbp, off_t firstpg, off_t lastpg)
{
	nfsbufpgs pagemaskfirst, pagemasklast;

	nfs_buf_pgs_get_page_mask(&pagemasklast, lastpg);
	nfs_buf_pgs_get_page_mask(&pagemaskfirst, firstpg);
	nfs_buf_pgs_bit_not(&pagemaskfirst);
	nfs_buf_pgs_bit_and(&pagemaskfirst, &pagemasklast, nfsbp);
}
int
nfs_buf_pgs_is_set(nfsbufpgs *nfsbp)
{
	for (uint32_t i = 0; i < NBPGS_ELEMENTS; i++) {
		if (nfsbp->pages[i] != 0) {
			return 1;
		}
	}
	return 0;
}
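
/*
 * Illustration only (not part of the original source): a minimal sketch of how
 * the bitmap helpers above compose.  nfs_buf_pgs_set_pages_between(&pgs, first,
 * last) leaves exactly the page bits in [first, last) set: mask(last) has bits
 * [0, last) set, ~mask(first) clears bits [0, first), and the AND of the two is
 * stored in the destination.  The function below is hypothetical and exists
 * only to show the call sequence.
 */
static inline int
nfs_buf_pgs_example(off_t firstpg, off_t lastpg)
{
	nfsbufpgs pgs;

	nfs_buf_pgs_set_pages_between(&pgs, firstpg, lastpg);

	/* a non-empty range leaves at least one page bit set */
	return nfs_buf_pgs_is_set(&pgs);
}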
/*
 * Initialize nfsbuf lists
 */
void
nfs_nbinit(void)
{
	nfsbufcnt = nfsbufmetacnt =
	    nfsbuffreecnt = nfsbuffreemetacnt = nfsbufdelwricnt = 0;

	/* size nfsbufmax to cover at most half sane_size (w/default buf size) */
	nfsbufmax = (int)(sane_size >> PAGE_SHIFT) / (2 * (NFS_RWSIZE >> PAGE_SHIFT));
	nfsbufmetamax = nfsbufmax / 4;

	nfsbufhashtbl = hashinit(nfsbufmax / 4, M_NFSBIO, &nfsbufhash);
	TAILQ_INIT(&nfsbuffree);
	TAILQ_INIT(&nfsbuffreemeta);
	TAILQ_INIT(&nfsbufdelwri);
}
/*
 * Check periodically for stale/unused nfs bufs
 */
void
nfs_buf_timer(__unused void *param0, __unused void *param1)
{
	nfs_buf_freeup(1);

	lck_mtx_lock(&nfs_buf_mutex);
	if (nfsbufcnt <= nfsbufmin) {
		nfs_buf_timer_on = 0;
		lck_mtx_unlock(&nfs_buf_mutex);
		return;
	}
	lck_mtx_unlock(&nfs_buf_mutex);

	nfs_interval_timer_start(nfs_buf_timer_call,
	    NFSBUF_FREE_PERIOD * 1000);
}
/*
 * try to free up some excess, unused nfsbufs
 */
void
nfs_buf_freeup(int timer)
{
	struct nfsbuf *fbp;
	struct timeval now;
	int count;
	struct nfsbuffreehead nfsbuffreeup;

	TAILQ_INIT(&nfsbuffreeup);

	lck_mtx_lock(&nfs_buf_mutex);

	microuptime(&now);

	FSDBG(320, nfsbufcnt, nfsbuffreecnt, nfsbuffreemetacnt, 0);

	count = timer ? nfsbuffreecnt / LRU_FREEUP_FRAC_ON_TIMER : LRU_TO_FREEUP;
	while ((nfsbufcnt > nfsbufmin) && (count-- > 0)) {
		fbp = TAILQ_FIRST(&nfsbuffree);
		if (!fbp) {
			break;
		}
		if (os_ref_get_count(&fbp->nb_refs) > 1) {
			break;
		}
		if (NBUFSTAMPVALID(fbp) &&
		    (fbp->nb_timestamp + (2 * NFSBUF_LRU_STALE)) > now.tv_sec) {
			break;
		}
		nfs_buf_remfree(fbp);
		/* disassociate buffer from any nfsnode */
		if (fbp->nb_np) {
			if (fbp->nb_vnbufs.le_next != NFSNOLIST) {
				LIST_REMOVE(fbp, nb_vnbufs);
				fbp->nb_vnbufs.le_next = NFSNOLIST;
			}
			fbp->nb_np = NULL;
		}
		LIST_REMOVE(fbp, nb_hash);
		TAILQ_INSERT_TAIL(&nfsbuffreeup, fbp, nb_free);
		nfsbufcnt--;
	}

	count = timer ? nfsbuffreemetacnt / META_FREEUP_FRAC_ON_TIMER : META_TO_FREEUP;
	while ((nfsbufcnt > nfsbufmin) && (count-- > 0)) {
		fbp = TAILQ_FIRST(&nfsbuffreemeta);
		if (!fbp) {
			break;
		}
		if (os_ref_get_count(&fbp->nb_refs) > 1) {
			break;
		}
		if (NBUFSTAMPVALID(fbp) &&
		    (fbp->nb_timestamp + (2 * NFSBUF_META_STALE)) > now.tv_sec) {
			break;
		}
		nfs_buf_remfree(fbp);
		/* disassociate buffer from any nfsnode */
		if (fbp->nb_np) {
			if (fbp->nb_vnbufs.le_next != NFSNOLIST) {
				LIST_REMOVE(fbp, nb_vnbufs);
				fbp->nb_vnbufs.le_next = NFSNOLIST;
			}
			fbp->nb_np = NULL;
		}
		LIST_REMOVE(fbp, nb_hash);
		TAILQ_INSERT_TAIL(&nfsbuffreeup, fbp, nb_free);
		nfsbufcnt--;
		nfsbufmetacnt--;
	}

	FSDBG(320, nfsbufcnt, nfsbuffreecnt, nfsbuffreemetacnt, 0);

	lck_mtx_unlock(&nfs_buf_mutex);

	while ((fbp = TAILQ_FIRST(&nfsbuffreeup))) {
		TAILQ_REMOVE(&nfsbuffreeup, fbp, nb_free);
		/* nuke any creds */
		if (IS_VALID_CRED(fbp->nb_rcred)) {
			kauth_cred_unref(&fbp->nb_rcred);
		}
		if (IS_VALID_CRED(fbp->nb_wcred)) {
			kauth_cred_unref(&fbp->nb_wcred);
		}
		/* if buf was NB_META, dump buffer */
		if (ISSET(fbp->nb_flags, NB_META) && fbp->nb_data) {
			kheap_free(KHEAP_DATA_BUFFERS, fbp->nb_data, fbp->nb_bufsize);
		}
		NFS_ZFREE(nfsbuf_zone, fbp);
	}
}
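
/*
 * Illustration only (not part of the original source): the staleness test the
 * two reap loops above apply, factored into a helper for readability.  The
 * helper name nfs_buf_is_stale is hypothetical; it assumes only the fields and
 * macros already used above.
 */
static inline int
nfs_buf_is_stale(struct nfsbuf *fbp, struct timeval *nowp, int stale_secs)
{
	/* an invalid timestamp means the buffer may be reaped immediately */
	if (!NBUFSTAMPVALID(fbp)) {
		return 1;
	}
	/* otherwise the buffer must have sat unused for 2 * stale_secs */
	return (fbp->nb_timestamp + (2 * stale_secs)) <= nowp->tv_sec;
}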
/*
 * remove a buffer from the freelist
 * (must be called with nfs_buf_mutex held)
 */
void
nfs_buf_remfree(struct nfsbuf *bp)
{
	if (bp->nb_free.tqe_next == NFSNOLIST) {
		panic("nfsbuf not on free list");
	}
	if (ISSET(bp->nb_flags, NB_DELWRI)) {
		nfsbufdelwricnt--;
		TAILQ_REMOVE(&nfsbufdelwri, bp, nb_free);
	} else if (ISSET(bp->nb_flags, NB_META)) {
		nfsbuffreemetacnt--;
		TAILQ_REMOVE(&nfsbuffreemeta, bp, nb_free);
	} else {
		nfsbuffreecnt--;
		TAILQ_REMOVE(&nfsbuffree, bp, nb_free);
	}
	bp->nb_free.tqe_next = NFSNOLIST;
}
/*
 * check for existence of nfsbuf in cache
 */
boolean_t
nfs_buf_is_incore(nfsnode_t np, daddr64_t blkno)
{
	boolean_t rv;
	lck_mtx_lock(&nfs_buf_mutex);
	if (nfs_buf_incore(np, blkno)) {
		rv = TRUE;
	} else {
		rv = FALSE;
	}
	lck_mtx_unlock(&nfs_buf_mutex);
	return rv;
}
/*
 * return incore buffer (must be called with nfs_buf_mutex held)
 */
struct nfsbuf *
nfs_buf_incore(nfsnode_t np, daddr64_t blkno)
{
	/* Search hash chain */
	struct nfsbuf *bp = NFSBUFHASH(np, blkno)->lh_first;
	for (; bp != NULL; bp = bp->nb_hash.le_next) {
		if ((bp->nb_lblkno == blkno) && (bp->nb_np == np)) {
			if (!ISSET(bp->nb_flags, NB_INVAL)) {
				FSDBG(547, bp, blkno, bp->nb_flags, bp->nb_np);
				return bp;
			}
		}
	}
	return NULL;
}
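
/*
 * Illustration only (not part of the original source): how NFSBUFHASH keys a
 * lookup.  The hash mixes the nfsnode pointer with the logical block number,
 * so a given (np, blkno) pair always lands on the same chain and a cached
 * block can be found without scanning every nfsbuf.  The wrapper below is
 * hypothetical and simply names the calling convention used throughout this
 * file: take nfs_buf_mutex, probe the cache, drop the mutex.
 */
static inline struct nfsbuf *
nfs_buf_lookup_example(nfsnode_t np, daddr64_t blkno)
{
	struct nfsbuf *bp;

	lck_mtx_lock(&nfs_buf_mutex);	/* nfs_buf_incore() requires the mutex */
	bp = nfs_buf_incore(np, blkno);
	lck_mtx_unlock(&nfs_buf_mutex);
	return bp;	/* may go stale the moment the mutex is dropped */
}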
410 * Check if it's OK to drop a page.
412 * Called by vnode_pager() on pageout request of non-dirty page.
413 * We need to make sure that it's not part of a delayed write.
414 * If it is, we can't let the VM drop it because we may need it
415 * later when/if we need to write the data (again).
418 nfs_buf_page_inval(vnode_t vp
, off_t offset
)
420 struct nfsmount
*nmp
= VTONMP(vp
);
424 if (nfs_mount_gone(nmp
)) {
428 lck_mtx_lock(&nfs_buf_mutex
);
429 bp
= nfs_buf_incore(VTONFS(vp
), (daddr64_t
)(offset
/ nmp
->nm_biosize
));
433 FSDBG(325, bp
, bp
->nb_flags
, bp
->nb_dirtyoff
, bp
->nb_dirtyend
);
434 if (ISSET(bp
->nb_lflags
, NBL_BUSY
)) {
439 * If there's a dirty range in the buffer, check to
440 * see if this page intersects with the dirty range.
441 * If it does, we can't let the pager drop the page.
443 if (bp
->nb_dirtyend
> 0) {
444 off_t start
= offset
- NBOFF(bp
);
445 if ((bp
->nb_dirtyend
> start
) &&
446 (bp
->nb_dirtyoff
< (start
+ PAGE_SIZE
))) {
448 * Before returning the bad news, move the
449 * buffer to the start of the delwri list and
450 * give the list a push to try to flush the
455 TAILQ_INSERT_HEAD(&nfsbufdelwri
, bp
, nb_free
);
457 nfs_buf_delwri_push(1);
461 lck_mtx_unlock(&nfs_buf_mutex
);
466 * set up the UPL for a buffer
467 * (must NOT be called with nfs_buf_mutex held)
470 nfs_buf_upl_setup(struct nfsbuf
*bp
)
476 if (ISSET(bp
->nb_flags
, NB_PAGELIST
)) {
480 upl_flags
= UPL_PRECIOUS
;
481 if (!ISSET(bp
->nb_flags
, NB_READ
)) {
483 * We're doing a "write", so we intend to modify
484 * the pages we're gathering.
486 upl_flags
|= UPL_WILL_MODIFY
;
488 kret
= ubc_create_upl_kernel(NFSTOV(bp
->nb_np
), NBOFF(bp
), bp
->nb_bufsize
,
489 &upl
, NULL
, upl_flags
, VM_KERN_MEMORY_FILE
);
490 if (kret
== KERN_INVALID_ARGUMENT
) {
491 /* vm object probably doesn't exist any more */
492 bp
->nb_pagelist
= NULL
;
495 if (kret
!= KERN_SUCCESS
) {
496 printf("nfs_buf_upl_setup(): failed to get pagelist %d\n", kret
);
497 bp
->nb_pagelist
= NULL
;
501 FSDBG(538, bp
, NBOFF(bp
), bp
->nb_bufsize
, bp
->nb_np
);
503 bp
->nb_pagelist
= upl
;
504 SET(bp
->nb_flags
, NB_PAGELIST
);
509 * update buffer's valid/dirty info from UBC
510 * (must NOT be called with nfs_buf_mutex held)
513 nfs_buf_upl_check(struct nfsbuf
*bp
)
516 off_t filesize
, fileoffset
;
519 if (!ISSET(bp
->nb_flags
, NB_PAGELIST
)) {
523 npages
= round_page_32(bp
->nb_bufsize
) / PAGE_SIZE
;
524 filesize
= ubc_getsize(NFSTOV(bp
->nb_np
));
525 fileoffset
= NBOFF(bp
);
526 if (fileoffset
< filesize
) {
527 SET(bp
->nb_flags
, NB_CACHE
);
529 CLR(bp
->nb_flags
, NB_CACHE
);
532 pl
= ubc_upl_pageinfo(bp
->nb_pagelist
);
533 NBPGS_ERASE(&bp
->nb_valid
);
534 NBPGS_ERASE(&bp
->nb_dirty
);
536 for (i
= 0; i
< npages
; i
++, fileoffset
+= PAGE_SIZE_64
) {
537 /* anything beyond the end of the file is not valid or dirty */
538 if (fileoffset
>= filesize
) {
541 if (!upl_valid_page(pl
, i
)) {
542 CLR(bp
->nb_flags
, NB_CACHE
);
545 NBPGVALID_SET(bp
, i
);
546 if (upl_dirty_page(pl
, i
)) {
547 NBPGDIRTY_SET(bp
, i
);
550 fileoffset
= NBOFF(bp
);
551 if (ISSET(bp
->nb_flags
, NB_CACHE
)) {
553 bp
->nb_validend
= bp
->nb_bufsize
;
554 if (fileoffset
+ bp
->nb_validend
> filesize
) {
555 bp
->nb_validend
= filesize
- fileoffset
;
558 bp
->nb_validoff
= bp
->nb_validend
= -1;
560 FSDBG(539, bp
, fileoffset
, bp
->nb_valid
, bp
->nb_dirty
);
561 FSDBG(539, bp
->nb_validoff
, bp
->nb_validend
, bp
->nb_dirtyoff
, bp
->nb_dirtyend
);
565 * make sure that a buffer is mapped
566 * (must NOT be called with nfs_buf_mutex held)
569 nfs_buf_map(struct nfsbuf
*bp
)
576 if (!ISSET(bp
->nb_flags
, NB_PAGELIST
)) {
580 kret
= ubc_upl_map(bp
->nb_pagelist
, (vm_offset_t
*)&(bp
->nb_data
));
581 if (kret
!= KERN_SUCCESS
) {
582 panic("nfs_buf_map: ubc_upl_map() failed with (%d)", kret
);
584 if (bp
->nb_data
== 0) {
585 panic("ubc_upl_map mapped 0");
587 FSDBG(540, bp
, bp
->nb_flags
, NBOFF(bp
), bp
->nb_data
);
592 * normalize an nfsbuf's valid range
594 * the read/write code guarantees that we'll always have a valid
595 * region that is an integral number of pages. If either end
596 * of the valid range isn't page-aligned, it gets corrected
597 * here as we extend the valid range through all of the
598 * contiguous valid pages.
601 nfs_buf_normalize_valid_range(nfsnode_t np
, struct nfsbuf
*bp
)
604 /* pull validoff back to start of contiguous valid page range */
605 pg
= bp
->nb_validoff
/ PAGE_SIZE
;
606 while (pg
>= 0 && NBPGVALID(bp
, pg
)) {
609 bp
->nb_validoff
= (pg
+ 1) * PAGE_SIZE
;
610 /* push validend forward to end of contiguous valid page range */
611 npg
= bp
->nb_bufsize
/ PAGE_SIZE
;
612 pg
= bp
->nb_validend
/ PAGE_SIZE
;
613 while (pg
< npg
&& NBPGVALID(bp
, pg
)) {
616 bp
->nb_validend
= pg
* PAGE_SIZE
;
618 if (NBOFF(bp
) + bp
->nb_validend
> (off_t
)np
->n_size
) {
619 bp
->nb_validend
= np
->n_size
% bp
->nb_bufsize
;
624 * process some entries on the delayed write queue
625 * (must be called with nfs_buf_mutex held)
628 nfs_buf_delwri_service(void)
634 while (i
< 8 && (bp
= TAILQ_FIRST(&nfsbufdelwri
)) != NULL
) {
638 while ((error
= nfs_buf_acquire(bp
, 0, 0, 0)) == EAGAIN
) {
646 /* buffer is no longer valid */
650 if (ISSET(bp
->nb_flags
, NB_NEEDCOMMIT
)) {
651 nfs_buf_check_write_verifier(np
, bp
);
653 if (ISSET(bp
->nb_flags
, NB_NEEDCOMMIT
)) {
654 /* put buffer at end of delwri list */
655 TAILQ_INSERT_TAIL(&nfsbufdelwri
, bp
, nb_free
);
658 lck_mtx_unlock(&nfs_buf_mutex
);
659 nfs_flushcommits(np
, 1);
661 SET(bp
->nb_flags
, NB_ASYNC
);
662 lck_mtx_unlock(&nfs_buf_mutex
);
666 lck_mtx_lock(&nfs_buf_mutex
);
671 * thread to service the delayed write queue when asked
674 nfs_buf_delwri_thread(__unused
void *arg
, __unused wait_result_t wr
)
676 struct timespec ts
= { .tv_sec
= 30, .tv_nsec
= 0 };
679 lck_mtx_lock(&nfs_buf_mutex
);
681 nfs_buf_delwri_service();
682 error
= msleep(&nfsbufdelwrithd
, &nfs_buf_mutex
, 0, "nfsbufdelwri", &ts
);
684 nfsbufdelwrithd
= NULL
;
685 lck_mtx_unlock(&nfs_buf_mutex
);
686 thread_terminate(nfsbufdelwrithd
);
690 * try to push out some delayed/uncommitted writes
691 * ("locked" indicates whether nfs_buf_mutex is already held)
694 nfs_buf_delwri_push(int locked
)
696 if (TAILQ_EMPTY(&nfsbufdelwri
)) {
700 lck_mtx_lock(&nfs_buf_mutex
);
702 /* wake up the delayed write service thread */
703 if (nfsbufdelwrithd
) {
704 wakeup(&nfsbufdelwrithd
);
705 } else if (kernel_thread_start(nfs_buf_delwri_thread
, NULL
, &nfsbufdelwrithd
) == KERN_SUCCESS
) {
706 thread_deallocate(nfsbufdelwrithd
);
708 /* otherwise, try to do some of the work ourselves */
709 if (!nfsbufdelwrithd
) {
710 nfs_buf_delwri_service();
713 lck_mtx_unlock(&nfs_buf_mutex
);
/*
 * Returns errno on error, 0 otherwise.
 * Any buffer is returned in *bpp.
 *
 * If NBLK_ONLYVALID is set, only return buffer if found in cache.
 * If NBLK_NOWAIT is set, don't wait for the buffer if it's marked BUSY.
 *
 * Check for existence of buffer in cache.
 * Or attempt to reuse a buffer from one of the free lists.
 * Or allocate a new buffer if we haven't already hit max allocation.
 * Or wait for a free buffer.
 *
 * If available buffer found, prepare it, and return it.
 *
 * If the calling process is interrupted by a signal for
 * an interruptible mount point, return EINTR.
 */
745 vnode_t vp
= NFSTOV(np
);
746 struct nfsmount
*nmp
= VTONMP(vp
);
749 int slpflag
= PCATCH
;
750 int operation
= (flags
& NBLK_OPMASK
);
754 FSDBG_TOP(541, np
, blkno
, size
, flags
);
758 if (bufsize
> NFS_MAXBSIZE
) {
759 panic("nfs_buf_get: buffer larger than NFS_MAXBSIZE requested");
762 if (nfs_mount_gone(nmp
)) {
763 FSDBG_BOT(541, np
, blkno
, 0, ENXIO
);
767 if (!UBCINFOEXISTS(vp
)) {
768 operation
= NBLK_META
;
769 } else if (bufsize
< (uint32_t)nmp
->nm_biosize
) {
770 /* reg files should always have biosize blocks */
771 bufsize
= nmp
->nm_biosize
;
774 /* if NBLK_WRITE, check for too many delayed/uncommitted writes */
775 if ((operation
== NBLK_WRITE
) && (nfs_nbdwrite
> NFS_A_LOT_OF_DELAYED_WRITES
)) {
776 FSDBG_TOP(542, np
, blkno
, nfs_nbdwrite
, NFS_A_LOT_OF_DELAYED_WRITES
);
778 /* poke the delwri list */
779 nfs_buf_delwri_push(0);
781 /* sleep to let other threads run... */
782 tsleep(&nfs_nbdwrite
, PCATCH
, "nfs_nbdwrite", 1);
783 FSDBG_BOT(542, np
, blkno
, nfs_nbdwrite
, NFS_A_LOT_OF_DELAYED_WRITES
);
787 lck_mtx_lock(&nfs_buf_mutex
);
789 /* wait for any buffer invalidation/flushing to complete */
790 while (np
->n_bflag
& NBINVALINPROG
) {
791 np
->n_bflag
|= NBINVALWANT
;
794 msleep(&np
->n_bflag
, &nfs_buf_mutex
, slpflag
, "nfs_buf_get_invalwait", &ts
);
795 if ((error
= nfs_sigintr(VTONMP(vp
), NULL
, thd
, 0))) {
796 lck_mtx_unlock(&nfs_buf_mutex
);
797 FSDBG_BOT(541, np
, blkno
, 0, error
);
800 if (np
->n_bflag
& NBINVALINPROG
) {
805 /* check for existence of nfsbuf in cache */
806 if ((bp
= nfs_buf_incore(np
, blkno
))) {
807 /* if busy, set wanted and wait */
808 if (ISSET(bp
->nb_lflags
, NBL_BUSY
)) {
809 if (flags
& NBLK_NOWAIT
) {
810 lck_mtx_unlock(&nfs_buf_mutex
);
811 FSDBG_BOT(541, np
, blkno
, bp
, 0xbcbcbcbc);
814 FSDBG_TOP(543, np
, blkno
, bp
, bp
->nb_flags
);
815 SET(bp
->nb_lflags
, NBL_WANTED
);
819 msleep(bp
, &nfs_buf_mutex
, slpflag
| (PRIBIO
+ 1) | PDROP
,
820 "nfsbufget", (slpflag
== PCATCH
) ? NULL
: &ts
);
822 FSDBG_BOT(543, np
, blkno
, bp
, bp
->nb_flags
);
823 if ((error
= nfs_sigintr(VTONMP(vp
), NULL
, thd
, 0))) {
824 FSDBG_BOT(541, np
, blkno
, 0, error
);
829 if (bp
->nb_bufsize
!= bufsize
) {
830 panic("nfsbuf size mismatch");
832 SET(bp
->nb_lflags
, NBL_BUSY
);
833 SET(bp
->nb_flags
, NB_CACHE
);
835 /* additional paranoia: */
836 if (ISSET(bp
->nb_flags
, NB_PAGELIST
)) {
837 panic("pagelist buffer was not busy");
842 if (flags
& NBLK_ONLYVALID
) {
843 lck_mtx_unlock(&nfs_buf_mutex
);
844 FSDBG_BOT(541, np
, blkno
, 0, 0x0000cace);
	/*
	 * where to get a free buffer:
	 * - if meta and maxmeta reached, must reuse meta
	 * - alloc new if we haven't reached min bufs
	 * - if free lists are NOT empty
	 *   - if free list is stale, use it
	 *   - else if freemeta list is stale, use it
	 *   - else if max bufs allocated, use least-time-to-stale
	 * - alloc new if we haven't reached max allowed
	 * - start clearing out delwri list and try again
	 */
860 if ((operation
== NBLK_META
) && (nfsbufmetacnt
>= nfsbufmetamax
)) {
861 /* if we've hit max meta buffers, must reuse a meta buffer */
862 bp
= TAILQ_FIRST(&nfsbuffreemeta
);
863 } else if ((nfsbufcnt
> nfsbufmin
) &&
864 (!TAILQ_EMPTY(&nfsbuffree
) || !TAILQ_EMPTY(&nfsbuffreemeta
))) {
865 /* try to pull an nfsbuf off a free list */
866 struct nfsbuf
*lrubp
, *metabp
;
870 /* if the next LRU or META buffer is invalid or stale, use it */
871 lrubp
= TAILQ_FIRST(&nfsbuffree
);
872 if (lrubp
&& (!NBUFSTAMPVALID(lrubp
) ||
873 ((lrubp
->nb_timestamp
+ NFSBUF_LRU_STALE
) < now
.tv_sec
))) {
876 metabp
= TAILQ_FIRST(&nfsbuffreemeta
);
877 if (!bp
&& metabp
&& (!NBUFSTAMPVALID(metabp
) ||
878 ((metabp
->nb_timestamp
+ NFSBUF_META_STALE
) < now
.tv_sec
))) {
882 if (!bp
&& (nfsbufcnt
>= nfsbufmax
)) {
883 /* we've already allocated all bufs, so */
884 /* choose the buffer that'll go stale first */
890 time_t lru_stale_time
, meta_stale_time
;
891 lru_stale_time
= lrubp
->nb_timestamp
+ NFSBUF_LRU_STALE
;
892 meta_stale_time
= metabp
->nb_timestamp
+ NFSBUF_META_STALE
;
893 if (lru_stale_time
<= meta_stale_time
) {
903 /* we have a buffer to reuse */
904 FSDBG(544, np
, blkno
, bp
, bp
->nb_flags
);
906 if (ISSET(bp
->nb_flags
, NB_DELWRI
)) {
907 panic("nfs_buf_get: delwri");
909 SET(bp
->nb_lflags
, NBL_BUSY
);
910 /* disassociate buffer from previous nfsnode */
912 if (bp
->nb_vnbufs
.le_next
!= NFSNOLIST
) {
913 LIST_REMOVE(bp
, nb_vnbufs
);
914 bp
->nb_vnbufs
.le_next
= NFSNOLIST
;
918 LIST_REMOVE(bp
, nb_hash
);
919 /* nuke any creds we're holding */
920 if (IS_VALID_CRED(bp
->nb_rcred
)) {
921 kauth_cred_unref(&bp
->nb_rcred
);
923 if (IS_VALID_CRED(bp
->nb_wcred
)) {
924 kauth_cred_unref(&bp
->nb_wcred
);
926 /* if buf will no longer be NB_META, dump old buffer */
927 if (operation
== NBLK_META
) {
928 if (!ISSET(bp
->nb_flags
, NB_META
)) {
931 } else if (ISSET(bp
->nb_flags
, NB_META
)) {
933 kheap_free(KHEAP_DATA_BUFFERS
, bp
->nb_data
, bp
->nb_bufsize
);
938 /* re-init buf fields */
940 bp
->nb_validoff
= bp
->nb_validend
= -1;
941 bp
->nb_dirtyoff
= bp
->nb_dirtyend
= 0;
942 NBPGS_ERASE(&bp
->nb_valid
);
943 NBPGS_ERASE(&bp
->nb_dirty
);
946 /* no buffer to reuse */
947 if ((nfsbufcnt
< nfsbufmax
) &&
948 ((operation
!= NBLK_META
) || (nfsbufmetacnt
< nfsbufmetamax
))) {
949 /* just alloc a new one */
950 bp
= zalloc(nfsbuf_zone
);
954 * If any excess bufs, make sure the timer
955 * is running to free them up later.
957 if (nfsbufcnt
> nfsbufmin
&& !nfs_buf_timer_on
) {
958 nfs_buf_timer_on
= 1;
959 nfs_interval_timer_start(nfs_buf_timer_call
,
960 NFSBUF_FREE_PERIOD
* 1000);
963 if (operation
== NBLK_META
) {
968 bzero(bp
, sizeof(*bp
));
969 os_ref_init(&bp
->nb_refs
, NULL
);
971 bp
->nb_free
.tqe_next
= NFSNOLIST
;
972 bp
->nb_validoff
= bp
->nb_validend
= -1;
973 FSDBG(545, np
, blkno
, bp
, 0);
975 /* too many bufs... wait for buffers to free up */
976 FSDBG_TOP(546, np
, blkno
, nfsbufcnt
, nfsbufmax
);
978 /* poke the delwri list */
979 nfs_buf_delwri_push(1);
982 msleep(&nfsneedbuffer
, &nfs_buf_mutex
, PCATCH
| PDROP
, "nfsbufget", NULL
);
983 FSDBG_BOT(546, np
, blkno
, nfsbufcnt
, nfsbufmax
);
984 if ((error
= nfs_sigintr(VTONMP(vp
), NULL
, thd
, 0))) {
985 FSDBG_BOT(541, np
, blkno
, 0, error
);
993 SET(bp
->nb_lflags
, NBL_BUSY
);
995 bp
->nb_lblkno
= blkno
;
996 /* insert buf in hash */
997 LIST_INSERT_HEAD(NFSBUFHASH(np
, blkno
), bp
, nb_hash
);
998 /* associate buffer with new nfsnode */
1000 LIST_INSERT_HEAD(&np
->n_cleanblkhd
, bp
, nb_vnbufs
);
1005 lck_mtx_unlock(&nfs_buf_mutex
);
1007 switch (operation
) {
1009 SET(bp
->nb_flags
, NB_META
);
1010 if ((bp
->nb_bufsize
!= bufsize
) && bp
->nb_data
) {
1011 kheap_free(KHEAP_DATA_BUFFERS
, bp
->nb_data
, bp
->nb_bufsize
);
1013 bp
->nb_validoff
= bp
->nb_validend
= -1;
1014 bp
->nb_dirtyoff
= bp
->nb_dirtyend
= 0;
1015 NBPGS_ERASE(&bp
->nb_valid
);
1016 NBPGS_ERASE(&bp
->nb_dirty
);
1017 CLR(bp
->nb_flags
, NB_CACHE
);
1020 bp
->nb_data
= kheap_alloc(KHEAP_DATA_BUFFERS
,
1024 /* Ack! couldn't allocate the data buffer! */
1025 /* clean up buffer and return error */
1026 lck_mtx_lock(&nfs_buf_mutex
);
1027 LIST_REMOVE(bp
, nb_vnbufs
);
1028 bp
->nb_vnbufs
.le_next
= NFSNOLIST
;
1030 /* invalidate usage timestamp to allow immediate freeing */
1031 NBUFSTAMPINVALIDATE(bp
);
1032 if (bp
->nb_free
.tqe_next
!= NFSNOLIST
) {
1033 panic("nfsbuf on freelist");
1035 TAILQ_INSERT_HEAD(&nfsbuffree
, bp
, nb_free
);
1037 lck_mtx_unlock(&nfs_buf_mutex
);
1038 FSDBG_BOT(541, np
, blkno
, 0xb00, ENOMEM
);
1041 bp
->nb_bufsize
= bufsize
;
1047 * Set or clear NB_READ now to let the UPL subsystem know
1048 * if we intend to modify the pages or not.
1050 if (operation
== NBLK_READ
) {
1051 SET(bp
->nb_flags
, NB_READ
);
1053 CLR(bp
->nb_flags
, NB_READ
);
1055 if (bufsize
< PAGE_SIZE
) {
1056 bufsize
= PAGE_SIZE
;
1058 bp
->nb_bufsize
= bufsize
;
1059 bp
->nb_validoff
= bp
->nb_validend
= -1;
1061 if (UBCINFOEXISTS(vp
)) {
1063 if (nfs_buf_upl_setup(bp
)) {
1064 /* unable to create upl */
1065 /* vm object must no longer exist */
1066 /* clean up buffer and return error */
1067 lck_mtx_lock(&nfs_buf_mutex
);
1068 LIST_REMOVE(bp
, nb_vnbufs
);
1069 bp
->nb_vnbufs
.le_next
= NFSNOLIST
;
1071 /* invalidate usage timestamp to allow immediate freeing */
1072 NBUFSTAMPINVALIDATE(bp
);
1073 if (bp
->nb_free
.tqe_next
!= NFSNOLIST
) {
1074 panic("nfsbuf on freelist");
1076 TAILQ_INSERT_HEAD(&nfsbuffree
, bp
, nb_free
);
1078 lck_mtx_unlock(&nfs_buf_mutex
);
1079 FSDBG_BOT(541, np
, blkno
, 0x2bc, EIO
);
1082 nfs_buf_upl_check(bp
);
1087 panic("nfs_buf_get: %d unknown operation", operation
);
1092 FSDBG_BOT(541, np
, blkno
, bp
, bp
->nb_flags
);
1098 nfs_buf_release(struct nfsbuf
*bp
, int freeup
)
1100 nfsnode_t np
= bp
->nb_np
;
1103 int wakeup_needbuffer
, wakeup_buffer
, wakeup_nbdwrite
;
1105 FSDBG_TOP(548, bp
, NBOFF(bp
), bp
->nb_flags
, bp
->nb_data
);
1106 FSDBG(548, bp
->nb_validoff
, bp
->nb_validend
, bp
->nb_dirtyoff
, bp
->nb_dirtyend
);
1107 FSDBG(548, bp
->nb_valid
, 0, bp
->nb_dirty
, 0);
1109 vp
= np
? NFSTOV(np
) : NULL
;
1110 if (vp
&& UBCINFOEXISTS(vp
) && bp
->nb_bufsize
) {
1115 if (!ISSET(bp
->nb_flags
, NB_PAGELIST
) && !ISSET(bp
->nb_flags
, NB_INVAL
)) {
1116 rv
= nfs_buf_upl_setup(bp
);
1118 printf("nfs_buf_release: upl create failed %d\n", rv
);
1120 nfs_buf_upl_check(bp
);
1123 upl
= bp
->nb_pagelist
;
1125 goto pagelist_cleanup_done
;
1128 if (ubc_upl_unmap(upl
) != KERN_SUCCESS
) {
1129 panic("ubc_upl_unmap failed");
1134 * Abort the pages on error or: if this is an invalid or
1135 * non-needcommit nocache buffer AND no pages are dirty.
1137 if (ISSET(bp
->nb_flags
, NB_ERROR
) || (!nfs_buf_pgs_is_set(&bp
->nb_dirty
) && (ISSET(bp
->nb_flags
, NB_INVAL
) ||
1138 (ISSET(bp
->nb_flags
, NB_NOCACHE
) && !ISSET(bp
->nb_flags
, (NB_NEEDCOMMIT
| NB_DELWRI
)))))) {
1139 if (ISSET(bp
->nb_flags
, (NB_READ
| NB_INVAL
| NB_NOCACHE
))) {
1140 upl_flags
= UPL_ABORT_DUMP_PAGES
;
1144 ubc_upl_abort(upl
, upl_flags
);
1145 goto pagelist_cleanup_done
;
1147 for (i
= 0; i
<= (bp
->nb_bufsize
- 1) / PAGE_SIZE
; i
++) {
1148 if (!NBPGVALID(bp
, i
)) {
1149 ubc_upl_abort_range(upl
,
1150 i
* PAGE_SIZE
, PAGE_SIZE
,
1151 UPL_ABORT_DUMP_PAGES
|
1152 UPL_ABORT_FREE_ON_EMPTY
);
1154 if (NBPGDIRTY(bp
, i
)) {
1155 upl_flags
= UPL_COMMIT_SET_DIRTY
;
1157 upl_flags
= UPL_COMMIT_CLEAR_DIRTY
;
1160 if (!ISSET(bp
->nb_flags
, (NB_NEEDCOMMIT
| NB_DELWRI
))) {
1161 upl_flags
|= UPL_COMMIT_CLEAR_PRECIOUS
;
1164 ubc_upl_commit_range(upl
,
1165 i
* PAGE_SIZE
, PAGE_SIZE
,
1167 UPL_COMMIT_INACTIVATE
|
1168 UPL_COMMIT_FREE_ON_EMPTY
);
1171 pagelist_cleanup_done
:
1172 /* invalidate any pages past EOF */
1173 if (NBOFF(bp
) + bp
->nb_bufsize
> (off_t
)(np
->n_size
)) {
1175 start
= trunc_page_64(np
->n_size
) + PAGE_SIZE_64
;
1176 end
= trunc_page_64(NBOFF(bp
) + bp
->nb_bufsize
);
1177 if (start
< NBOFF(bp
)) {
1181 if ((rv
= ubc_msync(vp
, start
, end
, NULL
, UBC_INVALIDATE
))) {
1182 printf("nfs_buf_release(): ubc_msync failed!, error %d\n", rv
);
1186 CLR(bp
->nb_flags
, NB_PAGELIST
);
1187 bp
->nb_pagelist
= NULL
;
1190 lck_mtx_lock(&nfs_buf_mutex
);
1192 wakeup_needbuffer
= wakeup_buffer
= wakeup_nbdwrite
= 0;
1194 /* Wake up any processes waiting for any buffer to become free. */
1195 if (nfsneedbuffer
) {
1197 wakeup_needbuffer
= 1;
1199 /* Wake up any processes waiting for _this_ buffer to become free. */
1200 if (ISSET(bp
->nb_lflags
, NBL_WANTED
)) {
1201 CLR(bp
->nb_lflags
, NBL_WANTED
);
1205 /* If it's non-needcommit nocache, or an error, mark it invalid. */
1206 if (ISSET(bp
->nb_flags
, NB_ERROR
) ||
1207 (ISSET(bp
->nb_flags
, NB_NOCACHE
) && !ISSET(bp
->nb_flags
, (NB_NEEDCOMMIT
| NB_DELWRI
)))) {
1208 SET(bp
->nb_flags
, NB_INVAL
);
1211 if ((bp
->nb_bufsize
<= 0) || ISSET(bp
->nb_flags
, NB_INVAL
)) {
1212 /* If it's invalid or empty, dissociate it from its nfsnode */
1213 if (bp
->nb_vnbufs
.le_next
!= NFSNOLIST
) {
1214 LIST_REMOVE(bp
, nb_vnbufs
);
1215 bp
->nb_vnbufs
.le_next
= NFSNOLIST
;
1218 /* if this was a delayed write, wakeup anyone */
1219 /* waiting for delayed writes to complete */
1220 if (ISSET(bp
->nb_flags
, NB_DELWRI
)) {
1221 CLR(bp
->nb_flags
, NB_DELWRI
);
1224 wakeup_nbdwrite
= 1;
1226 /* invalidate usage timestamp to allow immediate freeing */
1227 NBUFSTAMPINVALIDATE(bp
);
1228 /* put buffer at head of free list */
1229 if (bp
->nb_free
.tqe_next
!= NFSNOLIST
) {
1230 panic("nfsbuf on freelist");
1232 SET(bp
->nb_flags
, NB_INVAL
);
1233 if (ISSET(bp
->nb_flags
, NB_META
)) {
1234 TAILQ_INSERT_HEAD(&nfsbuffreemeta
, bp
, nb_free
);
1235 nfsbuffreemetacnt
++;
1237 TAILQ_INSERT_HEAD(&nfsbuffree
, bp
, nb_free
);
1240 } else if (ISSET(bp
->nb_flags
, NB_DELWRI
)) {
1241 /* put buffer at end of delwri list */
1242 if (bp
->nb_free
.tqe_next
!= NFSNOLIST
) {
1243 panic("nfsbuf on freelist");
1245 TAILQ_INSERT_TAIL(&nfsbufdelwri
, bp
, nb_free
);
1249 /* update usage timestamp */
1251 bp
->nb_timestamp
= now
.tv_sec
;
1252 /* put buffer at end of free list */
1253 if (bp
->nb_free
.tqe_next
!= NFSNOLIST
) {
1254 panic("nfsbuf on freelist");
1256 if (ISSET(bp
->nb_flags
, NB_META
)) {
1257 TAILQ_INSERT_TAIL(&nfsbuffreemeta
, bp
, nb_free
);
1258 nfsbuffreemetacnt
++;
1260 TAILQ_INSERT_TAIL(&nfsbuffree
, bp
, nb_free
);
1267 /* Unlock the buffer. */
1268 CLR(bp
->nb_flags
, (NB_ASYNC
| NB_STABLE
));
1269 CLR(bp
->nb_lflags
, NBL_BUSY
);
1271 FSDBG_BOT(548, bp
, NBOFF(bp
), bp
->nb_flags
, bp
->nb_data
);
1273 lck_mtx_unlock(&nfs_buf_mutex
);
1275 if (wakeup_needbuffer
) {
1276 wakeup(&nfsneedbuffer
);
1278 if (wakeup_buffer
) {
1281 if (wakeup_nbdwrite
) {
1282 wakeup(&nfs_nbdwrite
);
1290 * Wait for operations on the buffer to complete.
1291 * When they do, extract and return the I/O's error value.
1294 nfs_buf_iowait(struct nfsbuf
*bp
)
1296 FSDBG_TOP(549, bp
, NBOFF(bp
), bp
->nb_flags
, bp
->nb_error
);
1298 lck_mtx_lock(&nfs_buf_mutex
);
1300 while (!ISSET(bp
->nb_flags
, NB_DONE
)) {
1301 msleep(bp
, &nfs_buf_mutex
, PRIBIO
+ 1, "nfs_buf_iowait", NULL
);
1304 lck_mtx_unlock(&nfs_buf_mutex
);
1306 FSDBG_BOT(549, bp
, NBOFF(bp
), bp
->nb_flags
, bp
->nb_error
);
1308 /* check for interruption of I/O, then errors. */
1309 if (ISSET(bp
->nb_flags
, NB_EINTR
)) {
1310 CLR(bp
->nb_flags
, NB_EINTR
);
1312 } else if (ISSET(bp
->nb_flags
, NB_ERROR
)) {
1313 return bp
->nb_error
? bp
->nb_error
: EIO
;
1319 * Mark I/O complete on a buffer.
1322 nfs_buf_iodone(struct nfsbuf
*bp
)
1324 FSDBG_TOP(550, bp
, NBOFF(bp
), bp
->nb_flags
, bp
->nb_error
);
1326 if (ISSET(bp
->nb_flags
, NB_DONE
)) {
1327 panic("nfs_buf_iodone already");
1330 if (!ISSET(bp
->nb_flags
, NB_READ
)) {
1331 CLR(bp
->nb_flags
, NB_WRITEINPROG
);
1333 * vnode_writedone() takes care of waking up
1334 * any throttled write operations
1336 vnode_writedone(NFSTOV(bp
->nb_np
));
1337 nfs_node_lock_force(bp
->nb_np
);
1338 bp
->nb_np
->n_numoutput
--;
1339 nfs_node_unlock(bp
->nb_np
);
1341 if (ISSET(bp
->nb_flags
, NB_ASYNC
)) { /* if async, release it */
1342 SET(bp
->nb_flags
, NB_DONE
); /* note that it's done */
1343 nfs_buf_release(bp
, 1);
1344 } else { /* or just wakeup the buffer */
1345 lck_mtx_lock(&nfs_buf_mutex
);
1346 SET(bp
->nb_flags
, NB_DONE
); /* note that it's done */
1347 CLR(bp
->nb_lflags
, NBL_WANTED
);
1348 lck_mtx_unlock(&nfs_buf_mutex
);
1352 FSDBG_BOT(550, bp
, NBOFF(bp
), bp
->nb_flags
, bp
->nb_error
);
1356 nfs_buf_write_delayed(struct nfsbuf
*bp
)
1358 nfsnode_t np
= bp
->nb_np
;
1360 FSDBG_TOP(551, bp
, NBOFF(bp
), bp
->nb_flags
, 0);
1361 FSDBG(551, bp
, bp
->nb_dirtyoff
, bp
->nb_dirtyend
, bp
->nb_dirty
);
1364 * If the block hasn't been seen before:
1365 * (1) Mark it as having been seen,
1366 * (2) Make sure it's on its node's correct block list,
1368 if (!ISSET(bp
->nb_flags
, NB_DELWRI
)) {
1369 SET(bp
->nb_flags
, NB_DELWRI
);
1370 /* move to dirty list */
1371 lck_mtx_lock(&nfs_buf_mutex
);
1374 if (bp
->nb_vnbufs
.le_next
!= NFSNOLIST
) {
1375 LIST_REMOVE(bp
, nb_vnbufs
);
1377 LIST_INSERT_HEAD(&np
->n_dirtyblkhd
, bp
, nb_vnbufs
);
1378 lck_mtx_unlock(&nfs_buf_mutex
);
1382 * If the vnode has "too many" write operations in progress
1383 * wait for them to finish the IO
1385 vnode_waitforwrites(NFSTOV(np
), VNODE_ASYNC_THROTTLE
, 0, 0, "nfs_buf_write_delayed");
1387 /* the file is in a modified state, so make sure the flag's set */
1388 nfs_node_lock_force(np
);
1389 np
->n_flag
|= NMODIFIED
;
1390 nfs_node_unlock(np
);
1393 * If we have too many delayed write buffers,
1394 * just fall back to doing the async write.
1396 if (nfs_nbdwrite
< 0) {
1397 panic("nfs_buf_write_delayed: Negative nfs_nbdwrite");
1399 if (nfs_nbdwrite
> NFS_A_LOT_OF_DELAYED_WRITES
) {
1400 /* issue async write */
1401 SET(bp
->nb_flags
, NB_ASYNC
);
1403 FSDBG_BOT(551, bp
, NBOFF(bp
), bp
->nb_flags
, bp
->nb_error
);
1407 /* Otherwise, the "write" is done, so mark and release the buffer. */
1408 SET(bp
->nb_flags
, NB_DONE
);
1409 nfs_buf_release(bp
, 1);
1410 FSDBG_BOT(551, bp
, NBOFF(bp
), bp
->nb_flags
, 0);
1415 * Check that a "needcommit" buffer can still be committed.
1416 * If the write verifier has changed, we need to clear the
1417 * the needcommit flag.
1420 nfs_buf_check_write_verifier(nfsnode_t np
, struct nfsbuf
*bp
)
1422 struct nfsmount
*nmp
;
1424 if (!ISSET(bp
->nb_flags
, NB_NEEDCOMMIT
)) {
1429 if (nfs_mount_gone(nmp
)) {
1432 if (!ISSET(bp
->nb_flags
, NB_STALEWVERF
) && (bp
->nb_verf
== nmp
->nm_verf
)) {
1436 /* write verifier changed, clear commit/wverf flags */
1437 CLR(bp
->nb_flags
, (NB_NEEDCOMMIT
| NB_STALEWVERF
));
1439 nfs_node_lock_force(np
);
1440 np
->n_needcommitcnt
--;
1441 CHECK_NEEDCOMMITCNT(np
);
1442 nfs_node_unlock(np
);
1446 * add a reference to a buffer so it doesn't disappear while being used
1447 * (must be called with nfs_buf_mutex held)
1450 nfs_buf_refget(struct nfsbuf
*bp
)
1452 os_ref_retain_locked(&bp
->nb_refs
);
1455 * release a reference on a buffer
1456 * (must be called with nfs_buf_mutex held)
1459 nfs_buf_refrele(struct nfsbuf
*bp
)
1461 (void) os_ref_release_locked(&bp
->nb_refs
);
1465 * mark a particular buffer as BUSY
1466 * (must be called with nfs_buf_mutex held)
1469 nfs_buf_acquire(struct nfsbuf
*bp
, int flags
, int slpflag
, int slptimeo
)
1474 if (ISSET(bp
->nb_lflags
, NBL_BUSY
)) {
1476 * since the lck_mtx_lock may block, the buffer
1477 * may become BUSY, so we need to recheck for
1480 if (flags
& NBAC_NOWAIT
) {
1483 SET(bp
->nb_lflags
, NBL_WANTED
);
1485 ts
.tv_sec
= (slptimeo
/ 100);
1486 /* the hz value is 100; which leads to 10ms */
1487 ts
.tv_nsec
= (slptimeo
% 100) * 10 * NSEC_PER_USEC
* 1000;
1489 error
= msleep(bp
, &nfs_buf_mutex
, slpflag
| (PRIBIO
+ 1),
1490 "nfs_buf_acquire", &ts
);
1496 if (flags
& NBAC_REMOVE
) {
1497 nfs_buf_remfree(bp
);
1499 SET(bp
->nb_lflags
, NBL_BUSY
);
1505 * simply drop the BUSY status of a buffer
1506 * (must be called with nfs_buf_mutex held)
1509 nfs_buf_drop(struct nfsbuf
*bp
)
1511 int need_wakeup
= 0;
1513 if (!ISSET(bp
->nb_lflags
, NBL_BUSY
)) {
1514 panic("nfs_buf_drop: buffer not busy!");
1516 if (ISSET(bp
->nb_lflags
, NBL_WANTED
)) {
1517 /* delay the actual wakeup until after we clear NBL_BUSY */
1520 /* Unlock the buffer. */
1521 CLR(bp
->nb_lflags
, (NBL_BUSY
| NBL_WANTED
));
1529 * prepare for iterating over an nfsnode's buffer list
1530 * this lock protects the queue manipulation
1531 * (must be called with nfs_buf_mutex held)
1534 nfs_buf_iterprepare(nfsnode_t np
, struct nfsbuflists
*iterheadp
, int flags
)
1536 struct nfsbuflists
*listheadp
;
1538 if (flags
& NBI_DIRTY
) {
1539 listheadp
= &np
->n_dirtyblkhd
;
1541 listheadp
= &np
->n_cleanblkhd
;
1544 if ((flags
& NBI_NOWAIT
) && (np
->n_bufiterflags
& NBI_ITER
)) {
1545 LIST_INIT(iterheadp
);
1549 while (np
->n_bufiterflags
& NBI_ITER
) {
1550 np
->n_bufiterflags
|= NBI_ITERWANT
;
1551 msleep(&np
->n_bufiterflags
, &nfs_buf_mutex
, 0, "nfs_buf_iterprepare", NULL
);
1553 if (LIST_EMPTY(listheadp
)) {
1554 LIST_INIT(iterheadp
);
1557 np
->n_bufiterflags
|= NBI_ITER
;
1559 iterheadp
->lh_first
= listheadp
->lh_first
;
1560 listheadp
->lh_first
->nb_vnbufs
.le_prev
= &iterheadp
->lh_first
;
1561 LIST_INIT(listheadp
);
1567 * clean up after iterating over an nfsnode's buffer list
1568 * this lock protects the queue manipulation
1569 * (must be called with nfs_buf_mutex held)
1572 nfs_buf_itercomplete(nfsnode_t np
, struct nfsbuflists
*iterheadp
, int flags
)
1574 struct nfsbuflists
* listheadp
;
1577 if (flags
& NBI_DIRTY
) {
1578 listheadp
= &np
->n_dirtyblkhd
;
1580 listheadp
= &np
->n_cleanblkhd
;
1583 while (!LIST_EMPTY(iterheadp
)) {
1584 bp
= LIST_FIRST(iterheadp
);
1585 LIST_REMOVE(bp
, nb_vnbufs
);
1586 LIST_INSERT_HEAD(listheadp
, bp
, nb_vnbufs
);
1589 np
->n_bufiterflags
&= ~NBI_ITER
;
1590 if (np
->n_bufiterflags
& NBI_ITERWANT
) {
1591 np
->n_bufiterflags
&= ~NBI_ITERWANT
;
1592 wakeup(&np
->n_bufiterflags
);
1598 * Read an NFS buffer for a file.
1601 nfs_buf_read(struct nfsbuf
*bp
)
1609 cred
= bp
->nb_rcred
;
1610 if (IS_VALID_CRED(cred
)) {
1611 kauth_cred_ref(cred
);
1613 thd
= ISSET(bp
->nb_flags
, NB_ASYNC
) ? NULL
: current_thread();
1616 if (!ISSET(bp
->nb_flags
, NB_READ
)) {
1617 panic("nfs_buf_read: !NB_READ");
1619 if (ISSET(bp
->nb_flags
, NB_DONE
)) {
1620 CLR(bp
->nb_flags
, NB_DONE
);
1625 OSAddAtomic64(1, &nfsstats
.read_bios
);
1627 error
= nfs_buf_read_rpc(bp
, thd
, cred
);
1629 * For async I/O, the callbacks will finish up the
1630 * read. Otherwise, the read has already been finished.
1633 if (IS_VALID_CRED(cred
)) {
1634 kauth_cred_unref(&cred
);
1640 * finish the reading of a buffer
1643 nfs_buf_read_finish(struct nfsbuf
*bp
)
1645 nfsnode_t np
= bp
->nb_np
;
1646 struct nfsmount
*nmp
;
1648 if (!ISSET(bp
->nb_flags
, NB_ERROR
)) {
1649 /* update valid range */
1650 bp
->nb_validoff
= 0;
1651 bp
->nb_validend
= bp
->nb_endio
;
1652 if (bp
->nb_endio
< bp
->nb_bufsize
) {
1654 * The read may be short because we have unflushed writes
1655 * that are extending the file size and the reads hit the
1656 * (old) EOF on the server. So, just make sure nb_validend
1657 * correctly tracks EOF.
1658 * Note that the missing data should have already been zeroed
1659 * in nfs_buf_read_rpc_finish().
1661 off_t boff
= NBOFF(bp
);
1662 if ((off_t
)np
->n_size
>= (boff
+ bp
->nb_bufsize
)) {
1663 bp
->nb_validend
= bp
->nb_bufsize
;
1664 } else if ((off_t
)np
->n_size
>= boff
) {
1665 bp
->nb_validend
= np
->n_size
- boff
;
1667 bp
->nb_validend
= 0;
1670 if ((nmp
= NFSTONMP(np
)) && (nmp
->nm_vers
== NFS_VER2
) &&
1671 ((NBOFF(bp
) + bp
->nb_validend
) > 0x100000000LL
)) {
1672 bp
->nb_validend
= 0x100000000LL
- NBOFF(bp
);
1674 nfs_buf_pgs_get_page_mask(&bp
->nb_valid
, round_page_64(bp
->nb_validend
) / PAGE_SIZE
);
1675 if (bp
->nb_validend
& PAGE_MASK
) {
1676 /* zero-fill remainder of last page */
1677 bzero(bp
->nb_data
+ bp
->nb_validend
, PAGE_SIZE
- (bp
->nb_validend
& PAGE_MASK
));
1684 * initiate the NFS READ RPC(s) for a buffer
1687 nfs_buf_read_rpc(struct nfsbuf
*bp
, thread_t thd
, kauth_cred_t cred
)
1689 struct nfsmount
*nmp
;
1690 nfsnode_t np
= bp
->nb_np
;
1691 int error
= 0, nfsvers
, async
;
1693 uint64_t length
, nrpcs
;
1698 struct nfsreq_cbinfo cb
;
1701 if (nfs_mount_gone(nmp
)) {
1702 bp
->nb_error
= error
= ENXIO
;
1703 SET(bp
->nb_flags
, NB_ERROR
);
1707 nfsvers
= nmp
->nm_vers
;
1708 nmrsize
= nmp
->nm_rsize
;
1712 length
= bp
->nb_bufsize
;
1714 if (nfsvers
== NFS_VER2
) {
1715 if (boff
> 0xffffffffLL
) {
1716 bp
->nb_error
= error
= EFBIG
;
1717 SET(bp
->nb_flags
, NB_ERROR
);
1721 if ((boff
+ length
- 1) > 0xffffffffLL
) {
1722 length
= 0x100000000LL
- boff
;
1726 /* Note: Can only do async I/O if nfsiods are configured. */
1727 async
= (bp
->nb_flags
& NB_ASYNC
);
1728 cb
.rcb_func
= async
? nfs_buf_read_rpc_finish
: NULL
;
1731 bp
->nb_offio
= bp
->nb_endio
= 0;
1732 bp
->nb_rpcs
= nrpcs
= (length
+ nmrsize
- 1) / nmrsize
;
1733 if (async
&& (nrpcs
> 1)) {
1734 SET(bp
->nb_flags
, NB_MULTASYNCRPC
);
1736 CLR(bp
->nb_flags
, NB_MULTASYNCRPC
);
1739 while (length
> 0) {
1740 if (ISSET(bp
->nb_flags
, NB_ERROR
)) {
1741 error
= bp
->nb_error
;
1744 len
= (length
> nmrsize
) ? nmrsize
: (uint32_t)length
;
1745 cb
.rcb_args
.offset
= offset
;
1746 cb
.rcb_args
.length
= len
;
1748 if (nmp
->nm_vers
>= NFS_VER4
) {
1749 cb
.rcb_args
.stategenid
= nmp
->nm_stategenid
;
1753 error
= nmp
->nm_funcs
->nf_read_rpc_async(np
, boff
+ offset
, len
, thd
, cred
, &cb
, &req
);
1762 nfs_buf_read_rpc_finish(req
);
1763 if (ISSET(bp
->nb_flags
, NB_ERROR
)) {
1764 error
= bp
->nb_error
;
1771 * Something bad happened while trying to send the RPC(s).
1772 * Wait for any outstanding requests to complete.
1774 bp
->nb_error
= error
;
1775 SET(bp
->nb_flags
, NB_ERROR
);
1776 if (ISSET(bp
->nb_flags
, NB_MULTASYNCRPC
)) {
1777 nrpcs
= (length
+ nmrsize
- 1) / nmrsize
;
1778 lck_mtx_lock(&nfs_buf_mutex
);
1779 bp
->nb_rpcs
-= nrpcs
;
1780 if (bp
->nb_rpcs
== 0) {
1781 /* No RPCs left, so the buffer's done */
1782 lck_mtx_unlock(&nfs_buf_mutex
);
1785 /* wait for the last RPC to mark it done */
1786 while (bp
->nb_rpcs
> 0) {
1787 msleep(&bp
->nb_rpcs
, &nfs_buf_mutex
, 0,
1788 "nfs_buf_read_rpc_cancel", NULL
);
1790 lck_mtx_unlock(&nfs_buf_mutex
);
1801 * finish up an NFS READ RPC on a buffer
1804 nfs_buf_read_rpc_finish(struct nfsreq
*req
)
1806 struct nfsmount
*nmp
;
1807 size_t rlen
, length
;
1808 struct nfsreq_cbinfo cb
;
1810 int error
= 0, nfsvers
, eof
= 0, multasyncrpc
, finished
;
1812 void *wakeme
= NULL
;
1813 struct nfsreq
*rreq
= NULL
;
1818 char uio_buf
[UIO_SIZEOF(1)];
1822 thd
= req
->r_thread
;
1824 if (IS_VALID_CRED(cred
)) {
1825 kauth_cred_ref(cred
);
1827 cb
= req
->r_callback
;
1829 if (cb
.rcb_func
) { /* take an extra reference on the nfsreq in case we want to resend it later due to grace error */
1830 nfs_request_ref(req
, 0);
1834 if (nfs_mount_gone(nmp
)) {
1835 SET(bp
->nb_flags
, NB_ERROR
);
1836 bp
->nb_error
= error
= ENXIO
;
1838 if (error
|| ISSET(bp
->nb_flags
, NB_ERROR
)) {
1840 nfs_request_async_cancel(req
);
1844 nfsvers
= nmp
->nm_vers
;
1845 offset
= cb
.rcb_args
.offset
;
1846 rlen
= length
= cb
.rcb_args
.length
;
1848 auio
= uio_createwithbuffer(1, NBOFF(bp
) + offset
, UIO_SYSSPACE
,
1849 UIO_READ
, &uio_buf
, sizeof(uio_buf
));
1850 uio_addiov(auio
, CAST_USER_ADDR_T(bp
->nb_data
+ offset
), length
);
1852 /* finish the RPC */
1853 error
= nmp
->nm_funcs
->nf_read_rpc_async_finish(np
, req
, auio
, &rlen
, &eof
);
1854 if ((error
== EINPROGRESS
) && cb
.rcb_func
) {
1855 /* async request restarted */
1857 nfs_request_rele(req
);
1859 if (IS_VALID_CRED(cred
)) {
1860 kauth_cred_unref(&cred
);
1865 if ((nmp
->nm_vers
>= NFS_VER4
) && nfs_mount_state_error_should_restart(error
) && !ISSET(bp
->nb_flags
, NB_ERROR
)) {
1866 lck_mtx_lock(&nmp
->nm_lock
);
1867 if ((error
!= NFSERR_OLD_STATEID
) && (error
!= NFSERR_GRACE
) && (cb
.rcb_args
.stategenid
== nmp
->nm_stategenid
)) {
1868 NP(np
, "nfs_buf_read_rpc_finish: error %d @ 0x%llx, 0x%x 0x%x, initiating recovery",
1869 error
, NBOFF(bp
) + offset
, cb
.rcb_args
.stategenid
, nmp
->nm_stategenid
);
1870 nfs_need_recover(nmp
, error
);
1872 lck_mtx_unlock(&nmp
->nm_lock
);
1873 if (np
->n_flag
& NREVOKE
) {
1876 if (error
== NFSERR_GRACE
) {
1879 * For an async I/O request, handle a grace delay just like
1880 * jukebox errors. Set the resend time and queue it up.
1883 if (req
->r_nmrep
.nmc_mhead
) {
1884 mbuf_freem(req
->r_nmrep
.nmc_mhead
);
1885 req
->r_nmrep
.nmc_mhead
= NULL
;
1889 lck_mtx_lock(&req
->r_mtx
);
1890 req
->r_resendtime
= now
.tv_sec
+ 2;
1891 req
->r_xid
= 0; // get a new XID
1892 req
->r_flags
|= R_RESTART
;
1894 nfs_asyncio_resend(req
);
1895 lck_mtx_unlock(&req
->r_mtx
);
1896 if (IS_VALID_CRED(cred
)) {
1897 kauth_cred_unref(&cred
);
1899 /* Note: nfsreq reference taken will be dropped later when finished */
1902 /* otherwise, just pause a couple seconds and retry */
1903 tsleep(&nmp
->nm_state
, (PZERO
- 1), "nfsgrace", 2 * hz
);
1905 if (!(error
= nfs_mount_state_wait_for_recovery(nmp
))) {
1913 SET(bp
->nb_flags
, NB_ERROR
);
1914 bp
->nb_error
= error
;
1918 if ((rlen
> 0) && (bp
->nb_endio
< (offset
+ (int)rlen
))) {
1919 bp
->nb_endio
= offset
+ rlen
;
1922 if ((nfsvers
== NFS_VER2
) || eof
|| (rlen
== 0)) {
1923 /* zero out the remaining data (up to EOF) */
1924 off_t rpcrem
, eofrem
, rem
;
1925 rpcrem
= (length
- rlen
);
1926 eofrem
= np
->n_size
- (NBOFF(bp
) + offset
+ rlen
);
1927 rem
= (rpcrem
< eofrem
) ? rpcrem
: eofrem
;
1929 NFS_BZERO(bp
->nb_data
+ offset
+ rlen
, rem
);
1931 } else if ((rlen
< length
) && !ISSET(bp
->nb_flags
, NB_ERROR
)) {
1935 * We haven't hit EOF and we didn't get all the data
1936 * requested, so we need to issue another read for the rest.
1937 * (Don't bother if the buffer already hit an error.)
1944 cb
.rcb_args
.offset
= offset
;
1945 cb
.rcb_args
.length
= length
;
1947 if (nmp
->nm_vers
>= NFS_VER4
) {
1948 cb
.rcb_args
.stategenid
= nmp
->nm_stategenid
;
1951 error
= nmp
->nm_funcs
->nf_read_rpc_async(np
, NBOFF(bp
) + offset
, length
, thd
, cred
, &cb
, &rreq
);
1953 if (IS_VALID_CRED(cred
)) {
1954 kauth_cred_unref(&cred
);
1957 /* if !async we'll need to wait for this RPC to finish */
1962 nfs_request_rele(req
);
1965 * Outstanding RPC count is unchanged.
1966 * Callback will be called when RPC is done.
1970 SET(bp
->nb_flags
, NB_ERROR
);
1971 bp
->nb_error
= error
;
1976 nfs_request_rele(req
);
1978 if (IS_VALID_CRED(cred
)) {
1979 kauth_cred_unref(&cred
);
1983 * Decrement outstanding RPC count on buffer
1984 * and call nfs_buf_read_finish on last RPC.
1986 * (Note: when there are multiple async RPCs issued for a
1987 * buffer we need nfs_buffer_mutex to avoid problems when
1988 * aborting a partially-initiated set of RPCs)
1991 multasyncrpc
= ISSET(bp
->nb_flags
, NB_MULTASYNCRPC
);
1993 lck_mtx_lock(&nfs_buf_mutex
);
1997 finished
= (bp
->nb_rpcs
== 0);
2000 lck_mtx_unlock(&nfs_buf_mutex
);
2005 wakeme
= &bp
->nb_rpcs
;
2007 nfs_buf_read_finish(bp
);
2015 * Do buffer readahead.
2016 * Initiate async I/O to read buffers not in cache.
2019 nfs_buf_readahead(nfsnode_t np
, int ioflag
, daddr64_t
*rabnp
, daddr64_t lastrabn
, thread_t thd
, kauth_cred_t cred
)
2021 struct nfsmount
*nmp
= NFSTONMP(np
);
2026 if (nfs_mount_gone(nmp
)) {
2029 if (nmp
->nm_readahead
<= 0) {
2032 if (*rabnp
> lastrabn
) {
2036 for (nra
= 0; (nra
< nmp
->nm_readahead
) && (*rabnp
<= lastrabn
); nra
++, *rabnp
= *rabnp
+ 1) {
2037 /* check if block exists and is valid. */
2038 if ((*rabnp
* nmp
->nm_biosize
) >= (off_t
)np
->n_size
) {
2039 /* stop reading ahead if we're beyond EOF */
2043 error
= nfs_buf_get(np
, *rabnp
, nmp
->nm_biosize
, thd
, NBLK_READ
| NBLK_NOWAIT
, &bp
);
2047 nfs_node_lock_force(np
);
2048 np
->n_lastrahead
= *rabnp
;
2049 nfs_node_unlock(np
);
2053 if ((ioflag
& IO_NOCACHE
) && ISSET(bp
->nb_flags
, NB_CACHE
) &&
2054 !nfs_buf_pgs_is_set(&bp
->nb_dirty
) && !ISSET(bp
->nb_flags
, (NB_DELWRI
| NB_NCRDAHEAD
))) {
2055 CLR(bp
->nb_flags
, NB_CACHE
);
2056 NBPGS_ERASE(&bp
->nb_valid
);
2057 bp
->nb_validoff
= bp
->nb_validend
= -1;
2059 if ((bp
->nb_dirtyend
<= 0) && !nfs_buf_pgs_is_set(&bp
->nb_dirty
) &&
2060 !ISSET(bp
->nb_flags
, (NB_CACHE
| NB_DELWRI
))) {
2061 SET(bp
->nb_flags
, (NB_READ
| NB_ASYNC
));
2062 if (ioflag
& IO_NOCACHE
) {
2063 SET(bp
->nb_flags
, NB_NCRDAHEAD
);
2065 if (!IS_VALID_CRED(bp
->nb_rcred
) && IS_VALID_CRED(cred
)) {
2066 kauth_cred_ref(cred
);
2067 bp
->nb_rcred
= cred
;
2069 if ((error
= nfs_buf_read(bp
))) {
2074 nfs_buf_release(bp
, 1);
2080 * NFS buffer I/O for reading files.
2083 nfs_bioread(nfsnode_t np
, uio_t uio
, int ioflag
, vfs_context_t ctx
)
2085 vnode_t vp
= NFSTOV(np
);
2086 struct nfsbuf
*bp
= NULL
;
2087 struct nfsmount
*nmp
= VTONMP(vp
);
2088 daddr64_t lbn
, rabn
= 0, lastrabn
, maxrabn
= -1;
2089 off_t diff
, on
= 0, n
= 0;
2091 int nfsvers
, biosize
, modified
, readaheads
= 0;
2096 FSDBG_TOP(514, np
, uio_offset(uio
), uio_resid(uio
), ioflag
);
2098 nfsvers
= nmp
->nm_vers
;
2099 biosize
= nmp
->nm_biosize
;
2100 thd
= vfs_context_thread(ctx
);
2101 cred
= vfs_context_ucred(ctx
);
2103 if (vnode_vtype(vp
) != VREG
) {
2104 printf("nfs_bioread: type %x unexpected\n", vnode_vtype(vp
));
2105 FSDBG_BOT(514, np
, 0xd1e0016, 0, EINVAL
);
	/*
	 * For NFS, cache consistency can only be maintained approximately.
	 * Although RFC1094 does not specify the criteria, the following is
	 * believed to be compatible with the reference port.
	 *
	 * If the file has changed since the last read RPC or you have
	 * written to the file, you may have lost data cache consistency
	 * with the server. So, check for a change, and flush all of the
	 * file's data out of the cache.
	 * NB: This implies that cache data can be read when up to
	 * NFS_MAXATTRTIMO seconds out of date. If you find that you
	 * need current attributes, nfs_getattr() can be forced to fetch
	 * new attributes (via NATTRINVALIDATE() or NGA_UNCACHED).
	 */
2124 if (ISSET(np
->n_flag
, NUPDATESIZE
)) {
2125 nfs_data_update_size(np
, 0);
2128 if ((error
= nfs_node_lock(np
))) {
2129 FSDBG_BOT(514, np
, 0xd1e0222, 0, error
);
2133 if (np
->n_flag
& NNEEDINVALIDATE
) {
2134 np
->n_flag
&= ~NNEEDINVALIDATE
;
2135 nfs_node_unlock(np
);
2136 error
= nfs_vinvalbuf(vp
, V_SAVE
| V_IGNORE_WRITEERR
, ctx
, 1);
2138 error
= nfs_node_lock(np
);
2141 FSDBG_BOT(514, np
, 0xd1e0322, 0, error
);
2146 modified
= (np
->n_flag
& NMODIFIED
);
2147 nfs_node_unlock(np
);
2148 /* nfs_getattr() will check changed and purge caches */
2149 error
= nfs_getattr(np
, NULL
, ctx
, modified
? NGA_UNCACHED
: NGA_CACHED
);
2151 FSDBG_BOT(514, np
, 0xd1e0004, 0, error
);
2155 if (uio_resid(uio
) == 0) {
2156 FSDBG_BOT(514, np
, 0xd1e0001, 0, 0);
2159 if (uio_offset(uio
) < 0) {
2160 FSDBG_BOT(514, np
, 0xd1e0002, 0, EINVAL
);
2165 * set up readahead - which may be limited by:
2166 * + current request length (for IO_NOCACHE)
2167 * + readahead setting
2170 if (nmp
->nm_readahead
> 0) {
2171 off_t end
= uio_offset(uio
) + uio_resid(uio
);
2172 if (end
> (off_t
)np
->n_size
) {
2175 rabn
= uio_offset(uio
) / biosize
;
2176 maxrabn
= (end
- 1) / biosize
;
2177 nfs_node_lock_force(np
);
2178 if (!(ioflag
& IO_NOCACHE
) &&
	    (!rabn || (rabn == np->n_lastread) || (rabn == (np->n_lastread + 1)))) {
		maxrabn += nmp->nm_readahead;
		if ((maxrabn * biosize) >= (off_t)np->n_size) {
			maxrabn = ((off_t)np->n_size - 1) / biosize;
		}
		if (maxrabn < np->n_lastrahead) {
			np->n_lastrahead = -1;
		}
		if (rabn < np->n_lastrahead) {
			rabn = np->n_lastrahead + 1;
		}
	}
	nfs_node_unlock(np);

	do {
		nfs_data_lock(np, NFS_DATA_LOCK_SHARED);
		lbn = uio_offset(uio) / biosize;

		/*
		 * Copy directly from any cached pages without grabbing the bufs.
		 * (If we are NOCACHE and we've issued readahead requests, we need
		 * to grab the NB_NCRDAHEAD bufs to drop them.)
		 */
		if ((!(ioflag & IO_NOCACHE) || !readaheads) &&
		    ((uio->uio_segflg == UIO_USERSPACE32 ||
		    uio->uio_segflg == UIO_USERSPACE64 ||
		    uio->uio_segflg == UIO_USERSPACE))) {
			io_resid = uio_resid(uio);
			diff = np->n_size - uio_offset(uio);
			if (diff < io_resid) {
				io_resid = diff;
			}
			if (io_resid > 0) {
				int count = (io_resid > INT_MAX) ? INT_MAX : (int)io_resid;
				error = cluster_copy_ubc_data(vp, uio, &count, 0);
				if (error) {
					nfs_data_unlock(np);
					FSDBG_BOT(514, np, uio_offset(uio), 0xcacefeed, error);
					return error;
				}
			}
			/* count any biocache reads that we just copied directly */
			if (lbn != (uio_offset(uio) / biosize)) {
				OSAddAtomic64(NFS_ROUND_BLOCK(uio_offset(uio), biosize) - lbn, &nfsstats.biocache_reads);
				FSDBG(514, np, 0xcacefeed, uio_offset(uio), error);
			}
		}

		lbn = uio_offset(uio) / biosize;
		on = uio_offset(uio) % biosize;
		nfs_node_lock_force(np);
		np->n_lastread = (uio_offset(uio) - 1) / biosize;
		nfs_node_unlock(np);

		if ((uio_resid(uio) <= 0) || (uio_offset(uio) >= (off_t)np->n_size)) {
			nfs_data_unlock(np);
			FSDBG_BOT(514, np, uio_offset(uio), uio_resid(uio), 0xaaaaaaaa);
			return 0;
		}

		/* adjust readahead block number, if necessary */
		lastrabn = MIN(maxrabn, lbn + nmp->nm_readahead);
		if (rabn <= lastrabn) { /* start readaheads */
			error = nfs_buf_readahead(np, ioflag, &rabn, lastrabn, thd, cred);
			if (error) {
				nfs_data_unlock(np);
				FSDBG_BOT(514, np, 0xd1e000b, 1, error);
				return error;
			}
			readaheads = 1;
			OSAddAtomic64(rabn - lbn, &nfsstats.biocache_reads);
		} else {
			OSAddAtomic64(1, &nfsstats.biocache_reads);
		}

		/*
		 * If the block is in the cache and has the required data
		 * in a valid region, just copy it out.
		 * Otherwise, get the block and write back/read in,
		 * as required.
		 */
		io_resid = uio_resid(uio);
		n = (io_resid > (biosize - on)) ? (biosize - on) : io_resid;
		diff = np->n_size - uio_offset(uio);
		if (diff < n) {
			n = diff;
		}

		error = nfs_buf_get(np, lbn, biosize, thd, NBLK_READ, &bp);
		if (error) {
			nfs_data_unlock(np);
			FSDBG_BOT(514, np, 0xd1e000c, 0, error);
			return error;
		}

		if ((ioflag & IO_NOCACHE) && ISSET(bp->nb_flags, NB_CACHE)) {
			/*
			 * IO_NOCACHE found a cached buffer.
			 * Flush the buffer if it's dirty.
			 * Invalidate the data if it wasn't just read
			 * in as part of a "nocache readahead".
			 */
			if (nfs_buf_pgs_is_set(&bp->nb_dirty) || (bp->nb_dirtyend > 0)) {
				/* so write the buffer out and try again */
				SET(bp->nb_flags, NB_NOCACHE);
			}
			if (ISSET(bp->nb_flags, NB_NCRDAHEAD)) {
				CLR(bp->nb_flags, NB_NCRDAHEAD);
				SET(bp->nb_flags, NB_NOCACHE);
			}
		}

		/* if any pages are valid... */
		if (nfs_buf_pgs_is_set(&bp->nb_valid)) {
			/* ...check for any invalid pages in the read range */
			off_t pg, firstpg, lastpg, dirtypg;
			dirtypg = firstpg = lastpg = -1;
			pg = on / PAGE_SIZE;
			while (pg <= (on + n - 1) / PAGE_SIZE) {
				if (!NBPGVALID(bp, pg)) {
					if (firstpg < 0) {
						firstpg = pg;
					}
					lastpg = pg;
				} else if (firstpg >= 0 && dirtypg < 0 && NBPGDIRTY(bp, pg)) {
					dirtypg = pg;
				}
				pg++;
			}

			/* if there are no invalid pages, we're all set */
			if (firstpg < 0) {
				if (bp->nb_validoff < 0) {
					/* valid range isn't set up, so */
					/* set it to what we know is valid */
					bp->nb_validoff = trunc_page_64(on);
					bp->nb_validend = round_page_64(on + n);
					nfs_buf_normalize_valid_range(np, bp);
				}
			} else {
				/* there are invalid pages in the read range */
				if (((dirtypg > firstpg) && (dirtypg < lastpg)) ||
				    (((firstpg * PAGE_SIZE) < bp->nb_dirtyend) && (((lastpg + 1) * PAGE_SIZE) > bp->nb_dirtyoff))) {
					/* there are also dirty page(s) (or range) in the read range, */
					/* so write the buffer out and try again */
					CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL));
					SET(bp->nb_flags, NB_ASYNC);
					if (!IS_VALID_CRED(bp->nb_wcred)) {
						kauth_cred_ref(cred);
						bp->nb_wcred = cred;
					}
					error = nfs_buf_write(bp);
					if (error) {
						nfs_data_unlock(np);
						FSDBG_BOT(514, np, 0xd1e000d, 0, error);
						return error;
					}
				} else if (!nfs_buf_pgs_is_set(&bp->nb_dirty) && bp->nb_dirtyend <= 0 &&
				    (lastpg - firstpg + 1) > (biosize / PAGE_SIZE) / 2) {
					/* we need to read in more than half the buffer and the */
					/* buffer's not dirty, so just fetch the whole buffer */
					NBPGS_ERASE(&bp->nb_valid);
				} else {
					/* read the page range in */
					uio_t auio;
					char uio_buf[UIO_SIZEOF(1)];
					auio = uio_createwithbuffer(1, (NBOFF(bp) + firstpg * PAGE_SIZE_64),
					    UIO_SYSSPACE, UIO_READ, &uio_buf[0], sizeof(uio_buf));
					NFS_UIO_ADDIOV(auio, CAST_USER_ADDR_T(bp->nb_data + (firstpg * PAGE_SIZE)),
					    ((lastpg - firstpg + 1) * PAGE_SIZE));
					error = nfs_read_rpc(np, auio, ctx);
					if (error) {
						if (ioflag & IO_NOCACHE) {
							SET(bp->nb_flags, NB_NOCACHE);
						}
						nfs_buf_release(bp, 1);
						nfs_data_unlock(np);
						FSDBG_BOT(514, np, 0xd1e000e, 0, error);
						return error;
					}
					/* Make sure that the valid range is set to cover this read. */
					bp->nb_validoff = trunc_page_64(on);
					bp->nb_validend = round_page_64(on + n);
					nfs_buf_normalize_valid_range(np, bp);
					if (uio_resid(auio) > 0) {
						/* if short read, must have hit EOF, */
						/* so zero the rest of the range */
						bzero(CAST_DOWN(caddr_t, uio_curriovbase(auio)), uio_resid(auio));
					}
					/* mark the pages (successfully read) as valid */
					for (pg = firstpg; pg <= lastpg; pg++) {
						NBPGVALID_SET(bp, pg);
					}
				}
			}
		}
		/* if no pages are valid, read the whole block */
		if (!nfs_buf_pgs_is_set(&bp->nb_valid)) {
			if (!IS_VALID_CRED(bp->nb_rcred) && IS_VALID_CRED(cred)) {
				kauth_cred_ref(cred);
				bp->nb_rcred = cred;
			}
			SET(bp->nb_flags, NB_READ);
			CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL));
			error = nfs_buf_read(bp);
			if (ioflag & IO_NOCACHE) {
				SET(bp->nb_flags, NB_NOCACHE);
			}
			if (error) {
				nfs_data_unlock(np);
				nfs_buf_release(bp, 1);
				FSDBG_BOT(514, np, 0xd1e000f, 0, error);
				return error;
			}
		}

		/* validate read range against valid range and clip */
		if (bp->nb_validend > 0) {
			diff = (on >= bp->nb_validend) ? 0 : (bp->nb_validend - on);
			if (diff < n) {
				n = diff;
			}
		}
		if (n > 0) {
			n32 = n > INT_MAX ? INT_MAX : (int)n;
			error = uiomove(bp->nb_data + on, n32, uio);
			if (!error && n > n32) {
				error = uiomove(bp->nb_data + on + n32, (int)(n - n32), uio);
			}
		}

		nfs_buf_release(bp, 1);
		nfs_data_unlock(np);
		nfs_node_lock_force(np);
		np->n_lastread = (uio_offset(uio) - 1) / biosize;
		nfs_node_unlock(np);
	} while (error == 0 && uio_resid(uio) > 0 && n > 0);
	FSDBG_BOT(514, np, uio_offset(uio), uio_resid(uio), error);
	return error;
}
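
/*
 * Aside: the readahead window arithmetic above can be read as one small pure
 * function.  The sketch below is only an illustration under that reading; the
 * function name and parameters (last_readahead_block, file_size, readahead)
 * are hypothetical stand-ins, not symbols from this file.
 *
 *	#include <stdint.h>
 *
 *	// Last block worth reading ahead: at most 'readahead' blocks past the
 *	// block currently being read, never past the cached readahead limit,
 *	// and never past the last block of the file.
 *	static int64_t
 *	last_readahead_block(int64_t lbn, int64_t maxrabn, int64_t readahead,
 *	    int64_t file_size, int64_t biosize)
 *	{
 *		int64_t last = lbn + readahead;
 *		if (last > maxrabn) {
 *			last = maxrabn;
 *		}
 *		if ((last * biosize) >= file_size) {
 *			last = (file_size - 1) / biosize;   // clamp to EOF
 *		}
 *		return last;
 *	}
 */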
/*
 * limit the number of outstanding async I/O writes
 */
int
nfs_async_write_start(struct nfsmount *nmp)
{
	int error = 0, slpflag = NMFLAG(nmp, INTR) ? PCATCH : 0;
	struct timespec ts = { .tv_sec = 1, .tv_nsec = 0 };

	if (nfs_max_async_writes <= 0) {
		return 0;
	}
	lck_mtx_lock(&nmp->nm_lock);
	while ((nfs_max_async_writes > 0) && (nmp->nm_asyncwrites >= nfs_max_async_writes)) {
		if ((error = nfs_sigintr(nmp, NULL, current_thread(), 1))) {
			break;
		}
		msleep(&nmp->nm_asyncwrites, &nmp->nm_lock, slpflag | (PZERO - 1), "nfsasyncwrites", &ts);
		slpflag = 0;
	}
	if (!error) {
		nmp->nm_asyncwrites++;
	}
	lck_mtx_unlock(&nmp->nm_lock);
	return error;
}

void
nfs_async_write_done(struct nfsmount *nmp)
{
	if (nmp->nm_asyncwrites <= 0) {
		return;
	}
	lck_mtx_lock(&nmp->nm_lock);
	if (nmp->nm_asyncwrites-- >= nfs_max_async_writes) {
		wakeup(&nmp->nm_asyncwrites);
	}
	lck_mtx_unlock(&nmp->nm_lock);
}
/*
 * write (or commit) the given NFS buffer
 *
 * Commit the buffer if we can.
 * Write out any dirty range.
 * If any dirty pages remain, write them out.
 *
 * For async requests, all the work beyond sending the initial
 * write RPC is handled in the RPC callback(s).
 */
int
nfs_buf_write(struct nfsbuf *bp)
{
	int error = 0, oldflags, async;
	nfsnode_t np;
	thread_t thd;
	kauth_cred_t cred;
	proc_t p = current_proc();
	int iomode;
	off_t doff, dend, firstpg, lastpg;

	FSDBG_TOP(553, bp, NBOFF(bp), bp->nb_flags, 0);

	if (!ISSET(bp->nb_lflags, NBL_BUSY)) {
		panic("nfs_buf_write: buffer is not busy???");
	}

	np = bp->nb_np;
	async = ISSET(bp->nb_flags, NB_ASYNC);
	oldflags = bp->nb_flags;

	CLR(bp->nb_flags, (NB_READ | NB_DONE | NB_ERROR | NB_DELWRI));
	if (ISSET(oldflags, NB_DELWRI)) {
		lck_mtx_lock(&nfs_buf_mutex);
		nfs_nbdwrite--;
		NFSBUFCNTCHK();
		lck_mtx_unlock(&nfs_buf_mutex);
		wakeup(&nfs_nbdwrite);
	}

	/* move to clean list */
	if (ISSET(oldflags, (NB_ASYNC | NB_DELWRI))) {
		lck_mtx_lock(&nfs_buf_mutex);
		if (bp->nb_vnbufs.le_next != NFSNOLIST) {
			LIST_REMOVE(bp, nb_vnbufs);
		}
		LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
		lck_mtx_unlock(&nfs_buf_mutex);
	}
	nfs_node_lock_force(np);
	np->n_numoutput++;
	nfs_node_unlock(np);
	vnode_startwrite(NFSTOV(np));

	if (p && p->p_stats) {
		OSIncrementAtomicLong(&p->p_stats->p_ru.ru_oublock);
	}

	cred = bp->nb_wcred;
	if (!IS_VALID_CRED(cred) && ISSET(bp->nb_flags, NB_READ)) {
		cred = bp->nb_rcred;  /* shouldn't really happen, but... */
	}
	if (IS_VALID_CRED(cred)) {
		kauth_cred_ref(cred);
	}
	thd = async ? NULL : current_thread();

	/* We need to make sure the pages are locked before doing I/O.  */
	if (!ISSET(bp->nb_flags, NB_META)) {
		if (UBCINFOEXISTS(NFSTOV(np))) {
			if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
				error = nfs_buf_upl_setup(bp);
				if (error) {
					printf("nfs_buf_write: upl create failed %d\n", error);
					SET(bp->nb_flags, NB_ERROR);
					bp->nb_error = error = EIO;
					nfs_buf_iodone(bp);
					goto out;
				}
				nfs_buf_upl_check(bp);
			}
		} else {
			/* We should never be in nfs_buf_write() with no UBCINFO. */
			printf("nfs_buf_write: ubcinfo already gone\n");
			SET(bp->nb_flags, NB_ERROR);
			bp->nb_error = error = EIO;
			nfs_buf_iodone(bp);
			goto out;
		}
	}

	/* If NB_NEEDCOMMIT is set, a commit RPC may do the trick. */
	if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
		nfs_buf_check_write_verifier(np, bp);
	}
	if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
		struct nfsmount *nmp = NFSTONMP(np);
		if (nfs_mount_gone(nmp)) {
			SET(bp->nb_flags, NB_ERROR);
			bp->nb_error = error = EIO;
			nfs_buf_iodone(bp);
			goto out;
		}
		SET(bp->nb_flags, NB_WRITEINPROG);
		error = nmp->nm_funcs->nf_commit_rpc(np, NBOFF(bp) + bp->nb_dirtyoff,
		    bp->nb_dirtyend - bp->nb_dirtyoff, bp->nb_wcred, bp->nb_verf);
		CLR(bp->nb_flags, NB_WRITEINPROG);
		if (error) {
			if (error != NFSERR_STALEWRITEVERF) {
				SET(bp->nb_flags, NB_ERROR);
				bp->nb_error = error;
			}
			nfs_buf_iodone(bp);
			goto out;
		}
		bp->nb_dirtyoff = bp->nb_dirtyend = 0;
		CLR(bp->nb_flags, NB_NEEDCOMMIT);
		nfs_node_lock_force(np);
		np->n_needcommitcnt--;
		CHECK_NEEDCOMMITCNT(np);
		nfs_node_unlock(np);
	}
	if (!error && (bp->nb_dirtyend > 0)) {
		/* sanity check the dirty range */
		if (NBOFF(bp) + bp->nb_dirtyend > (off_t) np->n_size) {
			bp->nb_dirtyend = np->n_size - NBOFF(bp);
			if (bp->nb_dirtyoff >= bp->nb_dirtyend) {
				bp->nb_dirtyoff = bp->nb_dirtyend = 0;
			}
		}
	}
	if (!error && (bp->nb_dirtyend > 0)) {
		/* there's a dirty range that needs to be written out */
		nfsbufpgs pagemask, pagemaskand;
		NFS_BUF_MAP(bp);

		doff = bp->nb_dirtyoff;
		dend = bp->nb_dirtyend;

		/* if doff page is dirty, move doff to start of page */
		if (NBPGDIRTY(bp, doff / PAGE_SIZE)) {
			doff -= doff & PAGE_MASK;
		}
		/* try to expand write range to include preceding dirty pages */
		if (!(doff & PAGE_MASK)) {
			while ((doff > 0) && NBPGDIRTY(bp, (doff - 1) / PAGE_SIZE)) {
				doff -= PAGE_SIZE;
			}
		}
		/* if dend page is dirty, move dend to start of next page */
		if ((dend & PAGE_MASK) && NBPGDIRTY(bp, dend / PAGE_SIZE)) {
			dend = round_page_64(dend);
		}
		/* try to expand write range to include trailing dirty pages */
		if (!(dend & PAGE_MASK)) {
			while ((dend < (int)bp->nb_bufsize) && NBPGDIRTY(bp, dend / PAGE_SIZE)) {
				dend += PAGE_SIZE;
			}
		}
		/* make sure to keep dend clipped to EOF */
		if ((NBOFF(bp) + dend) > (off_t) np->n_size) {
			dend = np->n_size - NBOFF(bp);
		}
		/* calculate range of complete pages being written */
		if (dend > doff) {
			firstpg = doff / PAGE_SIZE;
			lastpg = (dend - 1) / PAGE_SIZE;
			/* calculate mask for that page range */
			nfs_buf_pgs_set_pages_between(&pagemask, firstpg, lastpg + 1);
		} else {
			NBPGS_ERASE(&pagemask);
		}

		/*
		 * compare page mask to nb_dirty; if there are other dirty pages
		 * then write FILESYNC; otherwise, write UNSTABLE if async and
		 * not needcommit/stable; otherwise write FILESYNC
		 */
		nfs_buf_pgs_bit_not(&pagemask);
		nfs_buf_pgs_bit_and(&bp->nb_dirty, &pagemask, &pagemaskand);
		if (nfs_buf_pgs_is_set(&pagemaskand)) {
			iomode = NFS_WRITE_FILESYNC;
		} else if ((bp->nb_flags & (NB_ASYNC | NB_NEEDCOMMIT | NB_STABLE)) == NB_ASYNC) {
			iomode = NFS_WRITE_UNSTABLE;
		} else {
			iomode = NFS_WRITE_FILESYNC;
		}

		/* write the whole contiguous dirty range */
		bp->nb_offio = doff;
		bp->nb_endio = dend;

		OSAddAtomic64(1, &nfsstats.write_bios);

		SET(bp->nb_flags, NB_WRITEINPROG);
		error = nfs_buf_write_rpc(bp, iomode, thd, cred);
		/*
		 * For async I/O, the callbacks will finish up the
		 * write and push out any dirty pages.  Otherwise,
		 * the write has already been finished and any dirty
		 * pages pushed out.
		 */
	} else {
		if (!error && nfs_buf_pgs_is_set(&bp->nb_dirty)) { /* write out any dirty pages */
			error = nfs_buf_write_dirty_pages(bp, thd, cred);
		}
		nfs_buf_iodone(bp);
	}
	/* note: bp is still valid only for !async case */
out:
	if (!error && !async) {
		error = nfs_buf_iowait(bp);
		/* move to clean list */
		if (oldflags & NB_DELWRI) {
			lck_mtx_lock(&nfs_buf_mutex);
			if (bp->nb_vnbufs.le_next != NFSNOLIST) {
				LIST_REMOVE(bp, nb_vnbufs);
			}
			LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
			lck_mtx_unlock(&nfs_buf_mutex);
		}
		FSDBG_BOT(553, bp, NBOFF(bp), bp->nb_flags, error);
		nfs_buf_release(bp, 1);
		/* check if we need to invalidate (and we can) */
		if ((np->n_flag & NNEEDINVALIDATE) &&
		    !(np->n_bflag & (NBINVALINPROG | NBFLUSHINPROG))) {
			int invalidate = 0;
			nfs_node_lock_force(np);
			if (np->n_flag & NNEEDINVALIDATE) {
				invalidate = 1;
				np->n_flag &= ~NNEEDINVALIDATE;
			}
			nfs_node_unlock(np);
			if (invalidate) {
				/*
				 * There was a write error and we need to
				 * invalidate attrs and flush buffers in
				 * order to sync up with the server.
				 * (if this write was extending the file,
				 * we may no longer know the correct size)
				 *
				 * But we couldn't call vinvalbuf while holding
				 * the buffer busy.  So we call vinvalbuf() after
				 * releasing the buffer.
				 */
				nfs_vinvalbuf2(NFSTOV(np), V_SAVE | V_IGNORE_WRITEERR, thd, cred, 1);
			}
		}
	}

	if (IS_VALID_CRED(cred)) {
		kauth_cred_unref(&cred);
	}
	return error;
}
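
/*
 * Aside: the stable-vs-unstable decision above reduces to a small predicate.
 * If dirty pages exist outside the contiguous range being written, or the
 * buffer is synchronous, needs a commit, or is marked stable, write FILESYNC;
 * an async buffer whose only dirty data is the range being written may go out
 * UNSTABLE and be committed later.  The sketch below is illustrative only,
 * with hypothetical, simplified names and types:
 *
 *	enum wmode { WRITE_UNSTABLE, WRITE_FILESYNC };
 *
 *	static enum wmode
 *	pick_write_mode(int other_dirty_pages, int is_async, int needcommit, int stable)
 *	{
 *		if (other_dirty_pages) {
 *			return WRITE_FILESYNC;    // stray dirty pages: don't risk losing them
 *		}
 *		if (is_async && !needcommit && !stable) {
 *			return WRITE_UNSTABLE;    // commit can be batched up later
 *		}
 *		return WRITE_FILESYNC;
 *	}
 */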
/*
 * finish the writing of a buffer
 */
void
nfs_buf_write_finish(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred)
{
	nfsnode_t np = bp->nb_np;
	int error = (bp->nb_flags & NB_ERROR) ? bp->nb_error : 0;
	off_t firstpg, lastpg;
	nfsbufpgs pagemask;

	if ((error == EINTR) || (error == ERESTART)) {
		CLR(bp->nb_flags, NB_ERROR);
		SET(bp->nb_flags, NB_EINTR);
	}

	if (!error) {
		/* calculate range of complete pages being written */
		if (bp->nb_endio > bp->nb_offio) {
			firstpg = bp->nb_offio / PAGE_SIZE;
			lastpg = (bp->nb_endio - 1) / PAGE_SIZE;
			/* calculate mask for that page range written */
			nfs_buf_pgs_set_pages_between(&pagemask, firstpg, lastpg + 1);
		} else {
			NBPGS_ERASE(&pagemask);
		}
		/* clear dirty bits for pages we've written */
		nfs_buf_pgs_bit_not(&pagemask);
		nfs_buf_pgs_bit_and(&bp->nb_dirty, &pagemask, &bp->nb_dirty);
	}

	/* manage needcommit state */
	if (!error && (bp->nb_commitlevel == NFS_WRITE_UNSTABLE)) {
		if (!ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
			nfs_node_lock_force(np);
			np->n_needcommitcnt++;
			nfs_node_unlock(np);
			SET(bp->nb_flags, NB_NEEDCOMMIT);
		}
		/* make sure nb_dirtyoff/nb_dirtyend reflect actual range written */
		bp->nb_dirtyoff = bp->nb_offio;
		bp->nb_dirtyend = bp->nb_endio;
	} else if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
		nfs_node_lock_force(np);
		np->n_needcommitcnt--;
		CHECK_NEEDCOMMITCNT(np);
		nfs_node_unlock(np);
		CLR(bp->nb_flags, NB_NEEDCOMMIT);
	}

	CLR(bp->nb_flags, NB_WRITEINPROG);

	/*
	 * For an unstable write, the buffer is still treated as dirty until
	 * a commit (or stable (re)write) is performed.  Buffers needing only
	 * a commit are marked with the NB_DELWRI and NB_NEEDCOMMIT flags.
	 *
	 * If the write was interrupted we set NB_EINTR.  Don't set NB_ERROR
	 * because that would cause the buffer to be dropped.  The buffer is
	 * still valid and simply needs to be written again.
	 */
	if ((error == EINTR) || (error == ERESTART) || (!error && (bp->nb_flags & NB_NEEDCOMMIT))) {
		CLR(bp->nb_flags, NB_INVAL);
		if (!ISSET(bp->nb_flags, NB_DELWRI)) {
			SET(bp->nb_flags, NB_DELWRI);
			lck_mtx_lock(&nfs_buf_mutex);
			nfs_nbdwrite++;
			NFSBUFCNTCHK();
			lck_mtx_unlock(&nfs_buf_mutex);
		}
		/*
		 * Since for the NB_ASYNC case, we've reassigned the buffer to the
		 * clean list, we have to reassign it back to the dirty one. Ugh.
		 */
		if (ISSET(bp->nb_flags, NB_ASYNC)) {
			/* move to dirty list */
			lck_mtx_lock(&nfs_buf_mutex);
			if (bp->nb_vnbufs.le_next != NFSNOLIST) {
				LIST_REMOVE(bp, nb_vnbufs);
			}
			LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
			lck_mtx_unlock(&nfs_buf_mutex);
		}
	} else {
		/* either there's an error or we don't need to commit */
		if (error) {
			/*
			 * There was a write error and we need to invalidate
			 * attrs and flush buffers in order to sync up with the
			 * server.  (if this write was extending the file, we
			 * may no longer know the correct size)
			 *
			 * But we can't call vinvalbuf while holding this
			 * buffer busy.  Set a flag to do it after releasing
			 * the buffer.
			 */
			nfs_node_lock_force(np);
			np->n_error = error;
			np->n_flag |= (NWRITEERR | NNEEDINVALIDATE);
			NATTRINVALIDATE(np);
			nfs_node_unlock(np);
		}
		/* clear the dirty range */
		bp->nb_dirtyoff = bp->nb_dirtyend = 0;
	}

	if (!error && nfs_buf_pgs_is_set(&bp->nb_dirty)) {
		nfs_buf_write_dirty_pages(bp, thd, cred);
	}
	nfs_buf_iodone(bp);
}
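
/*
 * Aside: clearing the dirty bits for the pages just written is simply
 * "dirty &= ~mask(first..last)".  The sketch below illustrates this with a
 * plain 64-bit page bitmap (the real nfsbufpgs type may span several words);
 * the names are hypothetical and the block is not part of this driver.
 *
 *	#include <stdint.h>
 *
 *	static uint64_t
 *	page_mask(int firstpg, int lastpg)          // inclusive page range
 *	{
 *		uint64_t m = 0;
 *		for (int pg = firstpg; pg <= lastpg; pg++) {
 *			m |= 1ULL << pg;
 *		}
 *		return m;
 *	}
 *
 *	// after a successful write covering pages [firstpg, lastpg]:
 *	//	dirty &= ~page_mask(firstpg, lastpg);
 */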
/*
 * write out any pages marked dirty in a buffer
 *
 * We do use unstable writes and follow up with a commit.
 * If we catch the write verifier changing, we'll restart and
 * do the writes filesync.
 */
int
nfs_buf_write_dirty_pages(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred)
{
	nfsnode_t np = bp->nb_np;
	struct nfsmount *nmp = NFSTONMP(np);
	int error = 0, commit, iomode, iomode2, len, pg, count, npages, off;
	nfsbufpgs dirty;
	uint64_t wverf;
	uio_t auio;
	char uio_buf[UIO_SIZEOF(1)];

	if (!nfs_buf_pgs_is_set(&bp->nb_dirty)) {
		return 0;
	}

	/* there are pages marked dirty that need to be written out */
	OSAddAtomic64(1, &nfsstats.write_bios);
	NFS_BUF_MAP(bp);
	SET(bp->nb_flags, NB_WRITEINPROG);
	npages = bp->nb_bufsize / PAGE_SIZE;
	iomode = NFS_WRITE_UNSTABLE;

	auio = uio_createwithbuffer(1, 0, UIO_SYSSPACE, UIO_WRITE,
	    &uio_buf, sizeof(uio_buf));

again:
	NBPGS_COPY(&dirty, &bp->nb_dirty);
	wverf = bp->nb_verf;
	commit = NFS_WRITE_FILESYNC;
	for (pg = 0; pg < npages; pg++) {
		if (!NBPGDIRTY(bp, pg)) {
			continue;
		}
		count = 1;
		while (((pg + count) < npages) && NBPGDIRTY(bp, pg + count)) {
			count++;
		}
		/* write count pages starting with page pg */
		off = pg * PAGE_SIZE;
		len = count * PAGE_SIZE;
		/* clip writes to EOF */
		if (NBOFF(bp) + off + len > (off_t) np->n_size) {
			len -= (NBOFF(bp) + off + len) - np->n_size;
		}
		if (len > 0) {
			iomode2 = iomode;
			uio_reset(auio, NBOFF(bp) + off, UIO_SYSSPACE, UIO_WRITE);
			uio_addiov(auio, CAST_USER_ADDR_T(bp->nb_data + off), len);
			error = nfs_write_rpc2(np, auio, thd, cred, &iomode2, &bp->nb_verf);
			if (error) {
				break;
			}
			if (iomode2 < commit) { /* Retain the lowest commitment level returned. */
				commit = iomode2;
			}
			if ((commit != NFS_WRITE_FILESYNC) && (wverf != bp->nb_verf)) {
				/* verifier changed, redo all the writes filesync */
				iomode = NFS_WRITE_FILESYNC;
				goto again;
			}
		}
		/* clear dirty bits */
		while (count--) {
			NBPGS_UNSET(&dirty, pg);
			if (count) { /* leave pg on last page */
				pg++;
			}
		}
	}
	CLR(bp->nb_flags, NB_WRITEINPROG);

	if (!error && (commit != NFS_WRITE_FILESYNC)) {
		error = nmp->nm_funcs->nf_commit_rpc(np, NBOFF(bp), bp->nb_bufsize, cred, wverf);
		if (error == NFSERR_STALEWRITEVERF) {
			/* verifier changed, so we need to restart all the writes */
			iomode = NFS_WRITE_FILESYNC;
			goto again;
		}
	}
	if (!error) {
		NBPGS_COPY(&bp->nb_dirty, &dirty);
	} else {
		SET(bp->nb_flags, NB_ERROR);
		bp->nb_error = error;
	}
	return error;
}
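
/*
 * Aside: the loop above is a run-length scan of the dirty-page bitmap; find a
 * dirty page, extend the run while the following pages are also dirty, issue
 * one write for the run, then resume after it.  The sketch below shows just
 * that scan; write_range() and the flat uint64_t bitmap are hypothetical
 * stand-ins for the uio setup plus nfs_write_rpc2() call in the real code.
 *
 *	#include <stdint.h>
 *	#include <stddef.h>
 *
 *	#define PG_SIZE 4096
 *
 *	static void
 *	write_dirty_runs(uint64_t dirty, int npages,
 *	    void (*write_range)(size_t off, size_t len))
 *	{
 *		for (int pg = 0; pg < npages; pg++) {
 *			if (!(dirty & (1ULL << pg))) {
 *				continue;
 *			}
 *			int count = 1;
 *			while ((pg + count) < npages && (dirty & (1ULL << (pg + count)))) {
 *				count++;
 *			}
 *			write_range((size_t)pg * PG_SIZE, (size_t)count * PG_SIZE);
 *			pg += count - 1;        // resume scanning after the run
 *		}
 *	}
 */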
/*
 * initiate the NFS WRITE RPC(s) for a buffer
 */
int
nfs_buf_write_rpc(struct nfsbuf *bp, int iomode, thread_t thd, kauth_cred_t cred)
{
	struct nfsmount *nmp;
	nfsnode_t np = bp->nb_np;
	int error = 0, nfsvers, async;
	int64_t nrpcs;
	size_t len;
	uint32_t nmwsize;
	struct nfsreq *req;
	struct nfsreq_cbinfo cb;
	uio_t auio;
	char uio_buf[UIO_SIZEOF(1)];
	off_t offset, length;

	nmp = NFSTONMP(np);
	if (nfs_mount_gone(nmp)) {
		bp->nb_error = error = ENXIO;
		SET(bp->nb_flags, NB_ERROR);
		nfs_buf_iodone(bp);
		return error;
	}
	nfsvers = nmp->nm_vers;
	nmwsize = nmp->nm_wsize;

	offset = bp->nb_offio;
	length = bp->nb_endio - bp->nb_offio;

	/* Note: Can only do async I/O if nfsiods are configured. */
	async = (bp->nb_flags & NB_ASYNC) && (NFSIOD_MAX > 0);
	bp->nb_commitlevel = NFS_WRITE_FILESYNC;
	cb.rcb_func = async ? nfs_buf_write_rpc_finish : NULL;
	cb.rcb_bp = bp;

	if ((nfsvers == NFS_VER2) && ((NBOFF(bp) + bp->nb_endio) > 0xffffffffLL)) {
		bp->nb_error = error = EFBIG;
		SET(bp->nb_flags, NB_ERROR);
		nfs_buf_iodone(bp);
		return error;
	}

	if (length == 0) {
		/* We should never get here */
#if DEVELOPMENT
		printf("nfs_buf_write_rpc: Got request with zero length. np %p, bp %p, offset %lld\n", np, bp, offset);
#else
		printf("nfs_buf_write_rpc: Got request with zero length.\n");
#endif /* DEVELOPMENT */
		nfs_buf_iodone(bp);
		return 0;
	}

	auio = uio_createwithbuffer(1, NBOFF(bp) + offset, UIO_SYSSPACE,
	    UIO_WRITE, &uio_buf, sizeof(uio_buf));
	NFS_UIO_ADDIOV(auio, CAST_USER_ADDR_T(bp->nb_data + offset), length);

	bp->nb_rpcs = nrpcs = (length + nmwsize - 1) / nmwsize;
	if (async && (nrpcs > 1)) {
		SET(bp->nb_flags, NB_MULTASYNCRPC);
	} else {
		CLR(bp->nb_flags, NB_MULTASYNCRPC);
	}

	while (length > 0) {
		if (ISSET(bp->nb_flags, NB_ERROR)) {
			error = bp->nb_error;
			break;
		}
		len = (length > nmwsize) ? nmwsize : (uint32_t)length;
		cb.rcb_args.offset = offset;
		cb.rcb_args.length = len;
		if (nmp->nm_vers >= NFS_VER4) {
			cb.rcb_args.stategenid = nmp->nm_stategenid;
		}
		if (async && ((error = nfs_async_write_start(nmp)))) {
			break;
		}
		req = NULL;
		error = nmp->nm_funcs->nf_write_rpc_async(np, auio, len, thd, cred,
		    iomode, &cb, &req);
		if (error) {
			if (async) {
				nfs_async_write_done(nmp);
			}
			break;
		}
		offset += len;
		length -= len;
		if (async) {
			continue;
		}
		nfs_buf_write_rpc_finish(req);
	}

	if (length > 0) {
		/*
		 * Something bad happened while trying to send the RPCs.
		 * Wait for any outstanding requests to complete.
		 */
		bp->nb_error = error;
		SET(bp->nb_flags, NB_ERROR);
		if (ISSET(bp->nb_flags, NB_MULTASYNCRPC)) {
			nrpcs = (length + nmwsize - 1) / nmwsize;
			lck_mtx_lock(&nfs_buf_mutex);
			bp->nb_rpcs -= nrpcs;
			if (bp->nb_rpcs == 0) {
				/* No RPCs left, so the buffer's done */
				lck_mtx_unlock(&nfs_buf_mutex);
				nfs_buf_write_finish(bp, thd, cred);
			} else {
				/* wait for the last RPC to mark it done */
				while (bp->nb_rpcs > 0) {
					msleep(&bp->nb_rpcs, &nfs_buf_mutex, 0,
					    "nfs_buf_write_rpc_cancel", NULL);
				}
				lck_mtx_unlock(&nfs_buf_mutex);
			}
		} else {
			nfs_buf_write_finish(bp, thd, cred);
		}
		/* It may have just been an interrupt... that's OK */
		if (!ISSET(bp->nb_flags, NB_ERROR)) {
			error = 0;
		}
	}

	return error;
}
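
/*
 * Aside: the dirty range is split into ceil(length / wsize) RPCs, each at
 * most the mount's write size.  The chunking itself is just the loop below;
 * this sketch is illustrative only, and write_chunks() is a hypothetical
 * name, not a function in this file.
 *
 *	#include <stdint.h>
 *
 *	static uint32_t
 *	write_chunks(int64_t offset, int64_t length, uint32_t wsize)
 *	{
 *		uint32_t nrpcs = (uint32_t)((length + wsize - 1) / wsize);  // ceil
 *		while (length > 0) {
 *			uint32_t len = (length > wsize) ? wsize : (uint32_t)length;
 *			// one WRITE RPC would cover [offset, offset + len) here
 *			offset += len;
 *			length -= len;
 *		}
 *		return nrpcs;
 *	}
 */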
/*
 * finish up an NFS WRITE RPC on a buffer
 */
void
nfs_buf_write_rpc_finish(struct nfsreq *req)
{
	int error = 0, nfsvers, multasyncrpc, finished;
	int committed = NFS_WRITE_FILESYNC;
	uint64_t wverf = 0;
	off_t offset;
	size_t rlen, length;
	void *wakeme = NULL;
	struct nfsreq_cbinfo cb;
	struct nfsreq *wreq = NULL;
	struct nfsbuf *bp;
	struct nfsmount *nmp;
	nfsnode_t np;
	thread_t thd;
	kauth_cred_t cred;
	uio_t auio;
	char uio_buf[UIO_SIZEOF(1)];

finish:
	np = req->r_np;
	thd = req->r_thread;
	cred = req->r_cred;
	if (IS_VALID_CRED(cred)) {
		kauth_cred_ref(cred);
	}
	cb = req->r_callback;
	bp = cb.rcb_bp;
	if (cb.rcb_func) { /* take an extra reference on the nfsreq in case we want to resend it later due to grace error */
		nfs_request_ref(req, 0);
	}

	nmp = NFSTONMP(np);
	if (nfs_mount_gone(nmp)) {
		SET(bp->nb_flags, NB_ERROR);
		bp->nb_error = error = ENXIO;
	}
	if (error || ISSET(bp->nb_flags, NB_ERROR)) {
		/* just drop it */
		nfs_request_async_cancel(req);
		goto out;
	}
	nfsvers = nmp->nm_vers;

	offset = cb.rcb_args.offset;
	rlen = length = cb.rcb_args.length;

	/* finish the RPC */
	error = nmp->nm_funcs->nf_write_rpc_async_finish(np, req, &committed, &rlen, &wverf);
	if ((error == EINPROGRESS) && cb.rcb_func) {
		/* async request restarted */
		if (cb.rcb_func) {
			nfs_request_rele(req);
		}
		if (IS_VALID_CRED(cred)) {
			kauth_cred_unref(&cred);
		}
		return;
	}

	if ((nmp->nm_vers >= NFS_VER4) && nfs_mount_state_error_should_restart(error) && !ISSET(bp->nb_flags, NB_ERROR)) {
		lck_mtx_lock(&nmp->nm_lock);
		if ((error != NFSERR_OLD_STATEID) && (error != NFSERR_GRACE) && (cb.rcb_args.stategenid == nmp->nm_stategenid)) {
			NP(np, "nfs_buf_write_rpc_finish: error %d @ 0x%llx, 0x%x 0x%x, initiating recovery",
			    error, NBOFF(bp) + offset, cb.rcb_args.stategenid, nmp->nm_stategenid);
			nfs_need_recover(nmp, error);
		}
		lck_mtx_unlock(&nmp->nm_lock);
		if (np->n_flag & NREVOKE) {
			error = EIO;
		} else {
			if (error == NFSERR_GRACE) {
				if (cb.rcb_func) {
					/*
					 * For an async I/O request, handle a grace delay just like
					 * jukebox errors.  Set the resend time and queue it up.
					 */
					struct timeval now;
					if (req->r_nmrep.nmc_mhead) {
						mbuf_freem(req->r_nmrep.nmc_mhead);
						req->r_nmrep.nmc_mhead = NULL;
					}
					microuptime(&now);
					lck_mtx_lock(&req->r_mtx);
					req->r_resendtime = now.tv_sec + 2;
					req->r_xid = 0;                 // get a new XID
					req->r_flags |= R_RESTART;
					nfs_asyncio_resend(req);
					lck_mtx_unlock(&req->r_mtx);
					if (IS_VALID_CRED(cred)) {
						kauth_cred_unref(&cred);
					}
					/* Note: nfsreq reference taken will be dropped later when finished */
					return;
				}
				/* otherwise, just pause a couple seconds and retry */
				tsleep(&nmp->nm_state, (PZERO - 1), "nfsgrace", 2 * hz);
			}
			if (!(error = nfs_mount_state_wait_for_recovery(nmp))) {
				rlen = 0;
				goto writeagain;
			}
		}
		SET(bp->nb_flags, NB_ERROR);
		bp->nb_error = error;
	}
	if (error || (nfsvers == NFS_VER2)) {
		goto out;
	}
	if (rlen <= 0) {
		SET(bp->nb_flags, NB_ERROR);
		bp->nb_error = error = EIO;
		goto out;
	}

	/* save lowest commit level returned */
	if (committed < bp->nb_commitlevel) {
		bp->nb_commitlevel = committed;
	}

	/* check the write verifier */
	if (!bp->nb_verf) {
		bp->nb_verf = wverf;
	} else if (bp->nb_verf != wverf) {
		/* verifier changed, so buffer will need to be rewritten */
		bp->nb_flags |= NB_STALEWVERF;
		bp->nb_commitlevel = NFS_WRITE_UNSTABLE;
		bp->nb_verf = wverf;
	}
	if (!ISSET(bp->nb_flags, NB_STALEWVERF) && rlen > 0 && (bp->nb_offio < (offset + (int)rlen))) {
		bp->nb_offio = offset + rlen;
	}

	/*
	 * check for a short write
	 *
	 * If the server didn't write all the data, then we
	 * need to issue another write for the rest of it.
	 * (Don't bother if the buffer hit an error or stale wverf.)
	 */
	if ((rlen < length) && !(bp->nb_flags & (NB_STALEWVERF | NB_ERROR))) {
writeagain:
		offset += rlen;
		length -= rlen;

		auio = uio_createwithbuffer(1, NBOFF(bp) + offset, UIO_SYSSPACE,
		    UIO_WRITE, &uio_buf, sizeof(uio_buf));
		uio_addiov(auio, CAST_USER_ADDR_T(bp->nb_data + offset), length);

		cb.rcb_args.offset = offset;
		cb.rcb_args.length = length;
		if (nmp->nm_vers >= NFS_VER4) {
			cb.rcb_args.stategenid = nmp->nm_stategenid;
		}

		// XXX iomode should really match the original request
		error = nmp->nm_funcs->nf_write_rpc_async(np, auio, length, thd, cred,
		    NFS_WRITE_FILESYNC, &cb, &wreq);
		if (!error) {
			if (IS_VALID_CRED(cred)) {
				kauth_cred_unref(&cred);
			}
			if (!cb.rcb_func) {
				/* if !async we'll need to wait for this RPC to finish */
				req = wreq;
				wreq = NULL;
				goto finish;
			}
			nfs_request_rele(req);
			/*
			 * We're done here.
			 * Outstanding RPC count is unchanged.
			 * Callback will be called when RPC is done.
			 */
			return;
		}
		SET(bp->nb_flags, NB_ERROR);
		bp->nb_error = error;
	}

out:
	if (cb.rcb_func) {
		nfs_async_write_done(nmp);
		nfs_request_rele(req);
	}
	/*
	 * Decrement outstanding RPC count on buffer
	 * and call nfs_buf_write_finish on last RPC.
	 *
	 * (Note: when there are multiple async RPCs issued for a
	 * buffer we need nfs_buffer_mutex to avoid problems when
	 * aborting a partially-initiated set of RPCs)
	 */
	multasyncrpc = ISSET(bp->nb_flags, NB_MULTASYNCRPC);
	if (multasyncrpc) {
		lck_mtx_lock(&nfs_buf_mutex);
	}

	bp->nb_rpcs--;
	finished = (bp->nb_rpcs == 0);

	if (multasyncrpc) {
		lck_mtx_unlock(&nfs_buf_mutex);
	}

	if (finished) {
		if (multasyncrpc) {
			wakeme = &bp->nb_rpcs;
		}
		nfs_buf_write_finish(bp, thd, cred);
		if (wakeme) {
			wakeup(wakeme);
		}
	}

	if (IS_VALID_CRED(cred)) {
		kauth_cred_unref(&cred);
	}

	if (cb.rcb_func && np->n_needcommitcnt >= NFS_A_LOT_OF_NEEDCOMMITS) {
		nfs_flushcommits(np, 1);
	}
}
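
/*
 * Aside: the short-write handling above amounts to "advance by what the
 * server accepted and issue another write for the remainder".  The sketch
 * below shows only that pattern; write_some() and write_all() are
 * hypothetical stand-ins for one WRITE RPC and for the retry loop, and the
 * real code additionally rechecks the write verifier on every reply and
 * rewrites the whole buffer if it changes.
 *
 *	#include <stddef.h>
 *
 *	// write_some() may legally write fewer bytes than asked; returns 0 on error.
 *	static int
 *	write_all(size_t offset, size_t length, size_t (*write_some)(size_t, size_t))
 *	{
 *		while (length > 0) {
 *			size_t rlen = write_some(offset, length);
 *			if (rlen == 0) {
 *				return -1;      // error path: don't spin forever
 *			}
 *			offset += rlen;         // advance past what the server accepted
 *			length -= rlen;         // and send another write for the rest
 *		}
 *		return 0;
 *	}
 */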
/*
 * Send commit(s) for the given node's "needcommit" buffers
 */
int
nfs_flushcommits(nfsnode_t np, int nowait)
{
	struct nfsmount *nmp;
	struct nfsbuf *bp, *prevlbp, *lbp;
	struct nfsbuflists blist, commitlist;
	int error = 0, retv, wcred_set, flags;
	u_quad_t off, endoff, toff;
	uint64_t wverf, count;
	kauth_cred_t wcred = NULL;
	nfsbufpgs dirty;

	FSDBG_TOP(557, np, 0, 0, 0);

	/*
	 * A nb_flags == (NB_DELWRI | NB_NEEDCOMMIT) block has been written to the
	 * server, but has not been committed to stable storage on the server
	 * yet. The byte range is worked out for as many nfsbufs as we can handle
	 * and the commit rpc is done.
	 */
	if (!LIST_EMPTY(&np->n_dirtyblkhd)) {
		error = nfs_node_lock(np);
		if (error) {
			goto done;
		}
		np->n_flag |= NMODIFIED;
		nfs_node_unlock(np);
	}

	off = (u_quad_t)-1;
	endoff = 0;
	wcred_set = 0;
	LIST_INIT(&commitlist);

	nmp = NFSTONMP(np);
	if (nfs_mount_gone(nmp)) {
		error = ENXIO;
		goto done;
	}
	if (nmp->nm_vers == NFS_VER2) {
		error = EINVAL;
		goto done;
	}

	flags = NBI_DIRTY;
	if (nowait) {
		flags |= NBI_NOWAIT;
	}
	lck_mtx_lock(&nfs_buf_mutex);
	wverf = nmp->nm_verf;
	if (!nfs_buf_iterprepare(np, &blist, flags)) {
		while ((bp = LIST_FIRST(&blist))) {
			LIST_REMOVE(bp, nb_vnbufs);
			LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
			error = nfs_buf_acquire(bp, NBAC_NOWAIT, 0, 0);
			if (error) {
				continue;
			}
			if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
				nfs_buf_check_write_verifier(np, bp);
			}
			if (((bp->nb_flags & (NB_DELWRI | NB_NEEDCOMMIT)) != (NB_DELWRI | NB_NEEDCOMMIT)) ||
			    (bp->nb_verf != wverf)) {
				nfs_buf_drop(bp);
				continue;
			}
			nfs_buf_remfree(bp);

			/* buffer UPLs will be grabbed *in order* below */

			FSDBG(557, bp, bp->nb_flags, bp->nb_valid, bp->nb_dirty);
			FSDBG(557, bp->nb_validoff, bp->nb_validend,
			    bp->nb_dirtyoff, bp->nb_dirtyend);

			/*
			 * Work out if all buffers are using the same cred
			 * so we can deal with them all with one commit.
			 *
			 * Note: creds in bp's must be obtained by kauth_cred_ref
			 * on the same original cred in order for them to be equal.
			 */
			if (wcred_set == 0) {
				wcred = bp->nb_wcred;
				if (!IS_VALID_CRED(wcred)) {
					panic("nfs: needcommit w/out wcred");
				}
				wcred_set = 1;
			} else if ((wcred_set == 1) && wcred != bp->nb_wcred) {
				wcred_set = -1;
			}
			SET(bp->nb_flags, NB_WRITEINPROG);

			/*
			 * Add this buffer to the list of buffers we are committing.
			 * Buffers are inserted into the list in ascending order so that
			 * we can take the UPLs in order after the list is complete.
			 */
			prevlbp = NULL;
			LIST_FOREACH(lbp, &commitlist, nb_vnbufs) {
				if (bp->nb_lblkno < lbp->nb_lblkno) {
					break;
				}
				prevlbp = lbp;
			}
			LIST_REMOVE(bp, nb_vnbufs);
			if (prevlbp) {
				LIST_INSERT_AFTER(prevlbp, bp, nb_vnbufs);
			} else {
				LIST_INSERT_HEAD(&commitlist, bp, nb_vnbufs);
			}

			/* update commit range start, end */
			toff = NBOFF(bp) + bp->nb_dirtyoff;
			if (toff < off) {
				off = toff;
			}
			toff += (u_quad_t)(bp->nb_dirtyend - bp->nb_dirtyoff);
			if (toff > endoff) {
				endoff = toff;
			}
		}
		nfs_buf_itercomplete(np, &blist, NBI_DIRTY);
	}
	lck_mtx_unlock(&nfs_buf_mutex);

	if (LIST_EMPTY(&commitlist)) {
		error = ENOBUFS;
		goto done;
	}

	/*
	 * We need a UPL to prevent others from accessing the buffers during
	 * our commit RPC(s).
	 *
	 * We used to also check for dirty pages here; if there were any we'd
	 * abort the commit and force the entire buffer to be written again.
	 * Instead of doing that, we just go ahead and commit the dirty range,
	 * and then leave the buffer around with dirty pages that will be
	 * written out later.
	 */
	LIST_FOREACH(bp, &commitlist, nb_vnbufs) {
		if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
			retv = nfs_buf_upl_setup(bp);
			if (retv) {
				/* Unable to create the UPL, the VM object probably no longer exists. */
				printf("nfs_flushcommits: upl create failed %d\n", retv);
				NBPGS_ERASE(&bp->nb_valid);
				NBPGS_ERASE(&bp->nb_dirty);
			}
		}
		nfs_buf_upl_check(bp);
	}

	/*
	 * Commit data on the server, as required.
	 * If all bufs are using the same wcred, then use that with
	 * one call for all of them, otherwise commit each one
	 * separately.
	 */
	if (wcred_set == 1) {
		/*
		 * Note, it's possible the commit range could be >2^32-1.
		 * If it is, we'll send one commit that covers the whole file.
		 */
		if ((endoff - off) > 0xffffffff) {
			count = 0;
		} else {
			count = (endoff - off);
		}
		retv = nmp->nm_funcs->nf_commit_rpc(np, off, count, wcred, wverf);
	} else {
		retv = 0;
		LIST_FOREACH(bp, &commitlist, nb_vnbufs) {
			toff = NBOFF(bp) + bp->nb_dirtyoff;
			count = bp->nb_dirtyend - bp->nb_dirtyoff;
			retv = nmp->nm_funcs->nf_commit_rpc(np, toff, count, bp->nb_wcred, wverf);
			if (retv) {
				break;
			}
		}
	}

	/*
	 * Now, either mark the blocks I/O done or mark the
	 * blocks dirty, depending on whether the commit
	 * succeeded.
	 */
	while ((bp = LIST_FIRST(&commitlist))) {
		LIST_REMOVE(bp, nb_vnbufs);
		FSDBG(557, bp, retv, bp->nb_flags, bp->nb_dirty);
		nfs_node_lock_force(np);
		CLR(bp->nb_flags, (NB_NEEDCOMMIT | NB_WRITEINPROG));
		np->n_needcommitcnt--;
		CHECK_NEEDCOMMITCNT(np);
		nfs_node_unlock(np);

		if (retv) {
			/* move back to dirty list */
			lck_mtx_lock(&nfs_buf_mutex);
			LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
			lck_mtx_unlock(&nfs_buf_mutex);
			nfs_buf_release(bp, 1);
			continue;
		}

		nfs_node_lock_force(np);
		np->n_numoutput++;
		nfs_node_unlock(np);
		vnode_startwrite(NFSTOV(np));
		if (ISSET(bp->nb_flags, NB_DELWRI)) {
			lck_mtx_lock(&nfs_buf_mutex);
			nfs_nbdwrite--;
			NFSBUFCNTCHK();
			lck_mtx_unlock(&nfs_buf_mutex);
			wakeup(&nfs_nbdwrite);
		}
		CLR(bp->nb_flags, (NB_READ | NB_DONE | NB_ERROR | NB_DELWRI));
		/* if block still has dirty pages, we don't want it to */
		/* be released in nfs_buf_iodone().  So, don't set NB_ASYNC. */
		NBPGS_COPY(&dirty, &bp->nb_dirty);
		if (!nfs_buf_pgs_is_set(&dirty)) {
			SET(bp->nb_flags, NB_ASYNC);
		} else {
			CLR(bp->nb_flags, NB_ASYNC);
		}

		/* move to clean list */
		lck_mtx_lock(&nfs_buf_mutex);
		LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
		lck_mtx_unlock(&nfs_buf_mutex);

		bp->nb_dirtyoff = bp->nb_dirtyend = 0;

		nfs_buf_iodone(bp);
		if (nfs_buf_pgs_is_set(&dirty)) {
			/* throw it back in as a delayed write buffer */
			CLR(bp->nb_flags, NB_DONE);
			nfs_buf_write_delayed(bp);
		}
	}

done:
	FSDBG_BOT(557, np, 0, 0, error);
	return error;
}
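
/*
 * Aside: when every buffer shares one credential, the commit range sent to
 * the server is just the union of the buffers' dirty ranges.  The sketch
 * below illustrates that computation only; the struct and function names are
 * hypothetical and the block is not part of this driver.
 *
 *	#include <stdint.h>
 *
 *	struct range { uint64_t start, end; };
 *
 *	static struct range
 *	commit_range(const struct range *bufs, int n)
 *	{
 *		struct range r = { UINT64_MAX, 0 };
 *		for (int i = 0; i < n; i++) {
 *			if (bufs[i].start < r.start) {
 *				r.start = bufs[i].start;
 *			}
 *			if (bufs[i].end > r.end) {
 *				r.end = bufs[i].end;
 *			}
 *		}
 *		return r;       // commit [start, end); if the span exceeds 2^32-1
 *	}                       // the code above commits the whole file instead
 */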
/*
 * Flush all the blocks associated with a vnode.
 * Walk through the buffer pool and push any dirty pages
 * associated with the vnode.
 */
int
nfs_flush(nfsnode_t np, int waitfor, thread_t thd, int ignore_writeerr)
{
	struct nfsbuf *bp;
	struct nfsbuflists blist;
	struct nfsmount *nmp = NFSTONMP(np);
	int error = 0, error2, slptimeo = 0, slpflag = 0;
	int nfsvers, flags, passone = 1;

	FSDBG_TOP(517, np, waitfor, ignore_writeerr, 0);

	if (nfs_mount_gone(nmp)) {
		error = ENXIO;
		goto out;
	}
	nfsvers = nmp->nm_vers;
	if (NMFLAG(nmp, INTR)) {
		slpflag = PCATCH;
	}

	if (!LIST_EMPTY(&np->n_dirtyblkhd)) {
		nfs_node_lock_force(np);
		np->n_flag |= NMODIFIED;
		nfs_node_unlock(np);
	}

	lck_mtx_lock(&nfs_buf_mutex);
	while (np->n_bflag & NBFLUSHINPROG) {
		np->n_bflag |= NBFLUSHWANT;
		error = msleep(&np->n_bflag, &nfs_buf_mutex, slpflag, "nfs_flush", NULL);
		if ((error && (error != EWOULDBLOCK)) ||
		    ((error = nfs_sigintr(NFSTONMP(np), NULL, thd, 0)))) {
			lck_mtx_unlock(&nfs_buf_mutex);
			goto out;
		}
	}
	np->n_bflag |= NBFLUSHINPROG;

	/*
	 * On the first pass, start async/unstable writes on all
	 * delayed write buffers.  Then wait for all writes to complete
	 * and call nfs_flushcommits() to commit any uncommitted buffers.
	 * On all subsequent passes, start STABLE writes on any remaining
	 * dirty buffers.  Then wait for all writes to complete.
	 */
again:
	FSDBG(518, LIST_FIRST(&np->n_dirtyblkhd), np->n_flag, 0, 0);
	if (!NFSTONMP(np)) {
		lck_mtx_unlock(&nfs_buf_mutex);
		error = ENXIO;
		goto done;
	}

	/* Start/do any write(s) that are required. */
	if (!nfs_buf_iterprepare(np, &blist, NBI_DIRTY)) {
		while ((bp = LIST_FIRST(&blist))) {
			LIST_REMOVE(bp, nb_vnbufs);
			LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
			flags = (passone || !(waitfor == MNT_WAIT || waitfor == MNT_DWAIT)) ? NBAC_NOWAIT : 0;
			if (flags != NBAC_NOWAIT) {
				nfs_buf_refget(bp);
			}
			while ((error = nfs_buf_acquire(bp, flags, slpflag, slptimeo))) {
				FSDBG(524, bp, flags, bp->nb_lflags, bp->nb_flags);
				if (error == EBUSY) {
					break;
				}
				if (error) {
					error2 = nfs_sigintr(NFSTONMP(np), NULL, thd, 0);
					if (error2) {
						if (flags != NBAC_NOWAIT) {
							nfs_buf_refrele(bp);
						}
						nfs_buf_itercomplete(np, &blist, NBI_DIRTY);
						lck_mtx_unlock(&nfs_buf_mutex);
						error = error2;
						goto done;
					}
					if (slpflag == PCATCH) {
						slpflag = 0;
						slptimeo = 2 * hz;
					}
				}
			}
			if (flags != NBAC_NOWAIT) {
				nfs_buf_refrele(bp);
			}
			if (error == EBUSY) {
				continue;
			}
			if (!bp->nb_np) {
				/* buffer is no longer valid */
				nfs_buf_drop(bp);
				continue;
			}
			if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
				nfs_buf_check_write_verifier(np, bp);
			}
			if (!ISSET(bp->nb_flags, NB_DELWRI)) {
				/* buffer is no longer dirty */
				nfs_buf_drop(bp);
				continue;
			}
			FSDBG(525, bp, passone, bp->nb_lflags, bp->nb_flags);
			if ((passone || !(waitfor == MNT_WAIT || waitfor == MNT_DWAIT)) &&
			    ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
				nfs_buf_drop(bp);
				continue;
			}
			nfs_buf_remfree(bp);
			lck_mtx_unlock(&nfs_buf_mutex);
			if (ISSET(bp->nb_flags, NB_ERROR)) {
				nfs_node_lock_force(np);
				np->n_error = bp->nb_error ? bp->nb_error : EIO;
				np->n_flag |= NWRITEERR;
				nfs_node_unlock(np);
				nfs_buf_release(bp, 1);
				lck_mtx_lock(&nfs_buf_mutex);
				continue;
			}
			SET(bp->nb_flags, NB_ASYNC);
			if (!passone) {
				/* NB_STABLE forces this to be written FILESYNC */
				SET(bp->nb_flags, NB_STABLE);
			}
			nfs_buf_write(bp);
			lck_mtx_lock(&nfs_buf_mutex);
		}
		nfs_buf_itercomplete(np, &blist, NBI_DIRTY);
	}
	lck_mtx_unlock(&nfs_buf_mutex);

	if (waitfor == MNT_WAIT || waitfor == MNT_DWAIT) {
		while ((error = vnode_waitforwrites(NFSTOV(np), 0, slpflag, slptimeo, "nfsflush"))) {
			error2 = nfs_sigintr(NFSTONMP(np), NULL, thd, 0);
			if (error2) {
				error = error2;
				goto done;
			}
			if (slpflag == PCATCH) {
				slpflag = 0;
				slptimeo = 2 * hz;
			}
		}
	}

	if (nfsvers != NFS_VER2) {
		/* loop while it looks like there are still buffers to be */
		/* committed and nfs_flushcommits() seems to be handling them. */
		while (np->n_needcommitcnt) {
			if (nfs_flushcommits(np, 0)) {
				break;
			}
		}
	}

	if (passone) {
		passone = 0;
		if (!LIST_EMPTY(&np->n_dirtyblkhd)) {
			nfs_node_lock_force(np);
			np->n_flag |= NMODIFIED;
			nfs_node_unlock(np);
		}
		lck_mtx_lock(&nfs_buf_mutex);
		goto again;
	}

	if (waitfor == MNT_WAIT || waitfor == MNT_DWAIT) {
		if (!LIST_EMPTY(&np->n_dirtyblkhd)) {
			nfs_node_lock_force(np);
			np->n_flag |= NMODIFIED;
			nfs_node_unlock(np);
		}
		lck_mtx_lock(&nfs_buf_mutex);
		if (!LIST_EMPTY(&np->n_dirtyblkhd)) {
			goto again;
		}
		lck_mtx_unlock(&nfs_buf_mutex);
		nfs_node_lock_force(np);
		/*
		 * OK, it looks like there are no dirty blocks.  If we have no
		 * writes in flight and no one in the write code, we can clear
		 * the modified flag.  In order to make sure we see the latest
		 * attributes and size, we also invalidate the attributes and
		 * advance the attribute cache XID to guarantee that attributes
		 * newer than our clearing of NMODIFIED will get loaded next.
		 * (If we don't do this, it's possible for the flush's final
		 * write/commit (xid1) to be executed in parallel with a subsequent
		 * getattr request (xid2).  The getattr could return attributes
		 * from *before* the write/commit completed but the stale attributes
		 * would be preferred because of the xid ordering.)
		 */
		if (!np->n_wrbusy && !np->n_numoutput) {
			np->n_flag &= ~NMODIFIED;
			NATTRINVALIDATE(np);
			nfs_get_xid(&np->n_xid);
		}
	} else {
		nfs_node_lock_force(np);
	}

	FSDBG(526, np->n_flag, np->n_error, 0, 0);
	if (!ignore_writeerr && (np->n_flag & NWRITEERR)) {
		error = np->n_error;
		np->n_flag &= ~NWRITEERR;
	}
	nfs_node_unlock(np);
done:
	lck_mtx_lock(&nfs_buf_mutex);
	flags = np->n_bflag;
	np->n_bflag &= ~(NBFLUSHINPROG | NBFLUSHWANT);
	lck_mtx_unlock(&nfs_buf_mutex);
	if (flags & NBFLUSHWANT) {
		wakeup(&np->n_bflag);
	}
out:
	FSDBG_BOT(517, np, error, ignore_writeerr, 0);
	return error;
}
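
/*
 * Aside: the pass structure above can be summarized as the sketch below.
 * It is illustrative only; every helper named here (start_writes,
 * wait_for_writes, needcommit_count, flushcommits_made_progress,
 * still_dirty) is a hypothetical stand-in for the corresponding steps in
 * nfs_flush(), not a function in this file.
 *
 *	// One round of the flush; returns nonzero if another pass is needed.
 *	static int
 *	flush_pass(int passone, int blocking)
 *	{
 *		start_writes(passone ? ASYNC_UNSTABLE : STABLE);
 *		if (blocking) {
 *			wait_for_writes();
 *		}
 *		while (needcommit_count() > 0 && flushcommits_made_progress()) {
 *			;               // commit whatever the unstable writes left behind
 *		}
 *		return passone || (blocking && still_dirty());
 *	}
 */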
/*
 * Flush out and invalidate all buffers associated with a vnode.
 * Called with the underlying object locked.
 */
int
nfs_vinvalbuf_internal(
	nfsnode_t np,
	int flags,
	thread_t thd,
	kauth_cred_t cred,
	int slpflag,
	int slptimeo)
{
	struct nfsbuf *bp;
	struct nfsbuflists blist;
	int list, error = 0;

	if (flags & V_SAVE) {
		if ((error = nfs_flush(np, MNT_WAIT, thd, (flags & V_IGNORE_WRITEERR)))) {
			return error;
		}
	}

	lck_mtx_lock(&nfs_buf_mutex);
	for (;;) {
		list = NBI_CLEAN;
		if (nfs_buf_iterprepare(np, &blist, list)) {
			list = NBI_DIRTY;
			if (nfs_buf_iterprepare(np, &blist, list)) {
				break;
			}
		}
		while ((bp = LIST_FIRST(&blist))) {
			LIST_REMOVE(bp, nb_vnbufs);
			if (list == NBI_CLEAN) {
				LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
			} else {
				LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
			}
			nfs_buf_refget(bp);
			while ((error = nfs_buf_acquire(bp, NBAC_REMOVE, slpflag, slptimeo))) {
				FSDBG(556, np, bp, NBOFF(bp), bp->nb_flags);
				if (error != EAGAIN) {
					FSDBG(554, np, bp, -1, error);
					nfs_buf_refrele(bp);
					nfs_buf_itercomplete(np, &blist, list);
					lck_mtx_unlock(&nfs_buf_mutex);
					return error;
				}
			}
			nfs_buf_refrele(bp);
			FSDBG(554, np, bp, NBOFF(bp), bp->nb_flags);
			lck_mtx_unlock(&nfs_buf_mutex);
			if ((flags & V_SAVE) && UBCINFOEXISTS(NFSTOV(np)) && bp->nb_np &&
			    (NBOFF(bp) < (off_t)np->n_size)) {
				/* extra paranoia: make sure we're not */
				/* somehow leaving any dirty data around */
				nfsbufpgs pagemask;
				int mustwrite = 0;
				off_t end = (NBOFF(bp) + bp->nb_bufsize > (off_t)np->n_size) ?
				    (np->n_size - NBOFF(bp)) : bp->nb_bufsize;
				if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
					error = nfs_buf_upl_setup(bp);
					if (error == EINVAL) {
						/* vm object must no longer exist */
						/* hopefully we don't need to do */
						/* anything for this buffer */
					} else if (error) {
						printf("nfs_vinvalbuf: upl setup failed %d\n", error);
					}
					NBPGS_ERASE(&bp->nb_valid);
					NBPGS_ERASE(&bp->nb_dirty);
				}
				nfs_buf_upl_check(bp);
				/* check for any dirty data before the EOF */
				if ((bp->nb_dirtyend > 0) && (bp->nb_dirtyoff < end)) {
					/* clip dirty range to EOF */
					if (bp->nb_dirtyend > end) {
						bp->nb_dirtyend = end;
						if (bp->nb_dirtyoff >= bp->nb_dirtyend) {
							bp->nb_dirtyoff = bp->nb_dirtyend = 0;
						}
					}
					if ((bp->nb_dirtyend > 0) && (bp->nb_dirtyoff < end)) {
						mustwrite++;
					}
				}
				nfs_buf_pgs_get_page_mask(&pagemask, round_page_64(end) / PAGE_SIZE);
				nfs_buf_pgs_bit_and(&bp->nb_dirty, &pagemask, &bp->nb_dirty);
				if (nfs_buf_pgs_is_set(&bp->nb_dirty)) {
					mustwrite++;
				}
				/* also make sure we'll have a credential to do the write */
				if (mustwrite && !IS_VALID_CRED(bp->nb_wcred) && !IS_VALID_CRED(cred)) {
					printf("nfs_vinvalbuf: found dirty buffer with no write creds\n");
					mustwrite = 0;
				}
				if (mustwrite) {
					FSDBG(554, np, bp, 0xd00dee, bp->nb_flags);
					if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
						panic("nfs_vinvalbuf: dirty buffer without upl");
					}
					/* gotta write out dirty data before invalidating */
					/* (NB_STABLE indicates that data writes should be FILESYNC) */
					/* (NB_NOCACHE indicates buffer should be discarded) */
					CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL | NB_ASYNC));
					SET(bp->nb_flags, NB_STABLE | NB_NOCACHE);
					if (!IS_VALID_CRED(bp->nb_wcred)) {
						kauth_cred_ref(cred);
						bp->nb_wcred = cred;
					}
					error = nfs_buf_write(bp);
					// Note: bp has been released
					if (error) {
						FSDBG(554, bp, 0xd00dee, 0xbad, error);
						nfs_node_lock_force(np);
						if ((error != EINTR) && (error != ERESTART)) {
							np->n_error = error;
							np->n_flag |= NWRITEERR;
							/*
							 * There was a write error and we need to
							 * invalidate attrs to sync with server.
							 * (if this write was extending the file,
							 * we may no longer know the correct size)
							 */
							NATTRINVALIDATE(np);
						}
						nfs_node_unlock(np);
						if ((error == EINTR) || (error == ERESTART)) {
							/*
							 * Abort on EINTR.  If we don't, we could
							 * be stuck in this loop forever because
							 * the buffer will continue to stay dirty.
							 */
							lck_mtx_lock(&nfs_buf_mutex);
							nfs_buf_itercomplete(np, &blist, list);
							lck_mtx_unlock(&nfs_buf_mutex);
							return error;
						}
						error = 0;
					}
					lck_mtx_lock(&nfs_buf_mutex);
					continue;
				}
			}
			SET(bp->nb_flags, NB_INVAL);
			// hold off on FREEUPs until we're done here
			nfs_buf_release(bp, 0);
			lck_mtx_lock(&nfs_buf_mutex);
		}
		nfs_buf_itercomplete(np, &blist, list);
	}
	if (!LIST_EMPTY(&(np)->n_dirtyblkhd) || !LIST_EMPTY(&(np)->n_cleanblkhd)) {
		panic("nfs_vinvalbuf: flush/inval failed");
	}
	lck_mtx_unlock(&nfs_buf_mutex);
	nfs_node_lock_force(np);
	if (!(flags & V_SAVE)) {
		np->n_flag &= ~NMODIFIED;
	}
	if (vnode_vtype(NFSTOV(np)) == VREG) {
		np->n_lastrahead = -1;
	}
	nfs_node_unlock(np);
	NFS_BUF_FREEUP();
	return 0;
}
/*
 * Flush and invalidate all dirty buffers. If another process is already
 * doing the flush, just wait for completion.
 */
int
nfs_vinvalbuf(vnode_t vp, int flags, vfs_context_t ctx, int intrflg)
{
	return nfs_vinvalbuf2(vp, flags, vfs_context_thread(ctx), vfs_context_ucred(ctx), intrflg);
}

int
nfs_vinvalbuf2(vnode_t vp, int flags, thread_t thd, kauth_cred_t cred, int intrflg)
{
	nfsnode_t np = VTONFS(vp);
	struct nfsmount *nmp = VTONMP(vp);
	int error, slpflag, slptimeo, nflags, retry = 0;
	int ubcflags = UBC_PUSHALL | UBC_SYNC | UBC_INVALIDATE;
	struct timespec ts = { .tv_sec = 2, .tv_nsec = 0 };
	off_t size;

	FSDBG_TOP(554, np, flags, intrflg, 0);

	if (nmp && !NMFLAG(nmp, INTR)) {
		intrflg = 0;
	}
	if (intrflg) {
		slpflag = PCATCH;
		slptimeo = 2 * hz;
	} else {
		slpflag = 0;
		slptimeo = 0;
	}

	/* First wait for any other process doing a flush to complete. */
	lck_mtx_lock(&nfs_buf_mutex);
	while (np->n_bflag & NBINVALINPROG) {
		np->n_bflag |= NBINVALWANT;
		msleep(&np->n_bflag, &nfs_buf_mutex, slpflag, "nfs_vinvalbuf", &ts);
		if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0))) {
			lck_mtx_unlock(&nfs_buf_mutex);
			return error;
		}
		if (np->n_bflag & NBINVALINPROG) {
			slpflag = 0;
		}
	}
	np->n_bflag |= NBINVALINPROG;
	lck_mtx_unlock(&nfs_buf_mutex);

	/* Now, flush as required. */
again:
	/* If the mount is gone, there's no sense in trying to write anything and hanging on the I/O. */
	if (nfs_mount_gone(nmp)) {
		flags &= ~V_SAVE;
	}
	error = nfs_vinvalbuf_internal(np, flags, thd, cred, slpflag, 0);
	while (error) {
		FSDBG(554, np, 0, 0, error);
		if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0))) {
			goto done;
		}
		error = nfs_vinvalbuf_internal(np, flags, thd, cred, 0, slptimeo);
	}

	/* If the mount is gone, there's no sense in trying to push anything and hanging on the I/O. */
	if (nfs_mount_gone(nmp)) {
		ubcflags &= ~UBC_PUSHALL;
	}

	/* get the pages out of vm also */
	if (UBCINFOEXISTS(vp) && (size = ubc_getsize(vp))) {
		if ((error = ubc_msync(vp, 0, size, NULL, ubcflags))) {
			if (error == EINVAL) {
				panic("nfs_vinvalbuf(): ubc_msync failed!, error %d", error);
			}
			if (retry++ < 10) { /* retry invalidating a few times */
				if (retry > 1 || error == ENXIO) {
					ubcflags &= ~UBC_PUSHALL;
				}
				goto again;
			}
			printf("nfs_vinvalbuf(): ubc_msync failed!, error %d\n", error);
		}
	}
done:
	lck_mtx_lock(&nfs_buf_mutex);
	nflags = np->n_bflag;
	np->n_bflag &= ~(NBINVALINPROG | NBINVALWANT);
	lck_mtx_unlock(&nfs_buf_mutex);
	if (nflags & NBINVALWANT) {
		wakeup(&np->n_bflag);
	}

	FSDBG_BOT(554, np, flags, intrflg, error);
	return error;
}
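
/*
 * Aside: the ubc_msync() retry policy above is "try to push and invalidate;
 * on failure retry a bounded number of times, and after the first failure
 * (or if the server is unreachable) stop pushing dirty pages and just
 * invalidate".  The sketch below captures that policy only; the flag and
 * function names (PUSHALL, msync_pages) are hypothetical, and the real code
 * re-runs the whole buffer flush before each retry rather than looping on
 * the msync call alone.
 *
 *	static int
 *	invalidate_pages(int server_gone, int (*msync_pages)(int flags))
 *	{
 *		int flags = PUSHALL | SYNC | INVALIDATE;
 *		int error = 0;
 *		for (int retry = 0; retry < 10; retry++) {
 *			error = msync_pages(flags);
 *			if (error == 0) {
 *				break;                  // success
 *			}
 *			if (retry > 0 || server_gone) {
 *				flags &= ~PUSHALL;      // give up on pushing, just invalidate
 *			}
 *		}
 *		return error;
 *	}
 */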
/*
 * Wait for any busy buffers to complete.
 */
void
nfs_wait_bufs(nfsnode_t np)
{
	struct nfsbuf *bp;
	struct nfsbuflists blist;
	int error = 0;

	lck_mtx_lock(&nfs_buf_mutex);
	if (!nfs_buf_iterprepare(np, &blist, NBI_CLEAN)) {
		while ((bp = LIST_FIRST(&blist))) {
			LIST_REMOVE(bp, nb_vnbufs);
			LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
			nfs_buf_refget(bp);
			while ((error = nfs_buf_acquire(bp, 0, 0, 0))) {
				if (error != EAGAIN) {
					nfs_buf_refrele(bp);
					nfs_buf_itercomplete(np, &blist, NBI_CLEAN);
					lck_mtx_unlock(&nfs_buf_mutex);
					return;
				}
			}
			nfs_buf_refrele(bp);
			nfs_buf_drop(bp);
		}
		nfs_buf_itercomplete(np, &blist, NBI_CLEAN);
	}
	if (!nfs_buf_iterprepare(np, &blist, NBI_DIRTY)) {
		while ((bp = LIST_FIRST(&blist))) {
			LIST_REMOVE(bp, nb_vnbufs);
			LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
			nfs_buf_refget(bp);
			while ((error = nfs_buf_acquire(bp, 0, 0, 0))) {
				if (error != EAGAIN) {
					nfs_buf_refrele(bp);
					nfs_buf_itercomplete(np, &blist, NBI_DIRTY);
					lck_mtx_unlock(&nfs_buf_mutex);
					return;
				}
			}
			nfs_buf_refrele(bp);
			nfs_buf_drop(bp);
		}
		nfs_buf_itercomplete(np, &blist, NBI_DIRTY);
	}
	lck_mtx_unlock(&nfs_buf_mutex);
}
/*
 * Add an async I/O request to the mount's async I/O queue and make
 * sure that an nfsiod will service it.
 */
void
nfs_asyncio_finish(struct nfsreq *req)
{
	struct nfsmount *nmp;
	struct nfsiod *niod;
	int started = 0;

	FSDBG_TOP(552, nmp, 0, 0, 0);
again:
	nmp = req->r_nmp;
	if (nmp == NULL) {
		return;
	}

	lck_mtx_lock(&nfsiod_mutex);
	niod = nmp->nm_niod;

	/* grab an nfsiod if we don't have one already */
	if (!niod) {
		niod = TAILQ_FIRST(&nfsiodfree);
		if (niod) {
			TAILQ_REMOVE(&nfsiodfree, niod, niod_link);
			TAILQ_INSERT_TAIL(&nfsiodwork, niod, niod_link);
			niod->niod_nmp = nmp;
		} else if (((nfsiod_thread_count < NFSIOD_MAX) || (nfsiod_thread_count <= 0)) && (started < 4)) {
			/*
			 * Try starting a new thread.
			 * We may try a couple times if other callers
			 * get the new threads before we do.
			 */
			lck_mtx_unlock(&nfsiod_mutex);
			started++;
			if (!nfsiod_start()) {
				goto again;
			}
			lck_mtx_lock(&nfsiod_mutex);
		}
	}

	/*
	 * If we got here while being on the resendq we need to get off. This
	 * happens when the timer fires and errors out requests from nfs_sigintr
	 * or we receive a reply (UDP case) while being on the resend queue so
	 * we're just finishing up and are not going to be resent.
	 */
	lck_mtx_lock(&req->r_mtx);
	if (req->r_flags & R_RESENDQ) {
		lck_mtx_lock(&nmp->nm_lock);
		if ((req->r_flags & R_RESENDQ) && req->r_rchain.tqe_next != NFSREQNOLIST) {
			NFS_BIO_DBG("Processing async request on resendq. Removing");
			TAILQ_REMOVE(&nmp->nm_resendq, req, r_rchain);
			req->r_flags &= ~R_RESENDQ;
			req->r_rchain.tqe_next = NFSREQNOLIST;
			assert(req->r_refs > 1);
			/* Remove resendq reference */
			nfs_request_rele(req);
		}
		lck_mtx_unlock(&nmp->nm_lock);
	}
	lck_mtx_unlock(&req->r_mtx);

	if (req->r_achain.tqe_next == NFSREQNOLIST) {
		TAILQ_INSERT_TAIL(&nmp->nm_iodq, req, r_achain);
	}

	/* If this mount doesn't already have an nfsiod working on it... */
	if (!nmp->nm_niod) {
		if (niod) { /* give it the nfsiod we just grabbed */
			nmp->nm_niod = niod;
			lck_mtx_unlock(&nfsiod_mutex);
			wakeup(niod);
		} else if (nfsiod_thread_count > 0) {
			/* just queue it up on nfsiod mounts queue if needed */
			if (nmp->nm_iodlink.tqe_next == NFSNOLIST) {
				TAILQ_INSERT_TAIL(&nfsiodmounts, nmp, nm_iodlink);
			}
			lck_mtx_unlock(&nfsiod_mutex);
		} else {
			printf("nfs_asyncio(): no nfsiods? %d %d (%d)\n", nfsiod_thread_count, NFSIOD_MAX, started);
			lck_mtx_unlock(&nfsiod_mutex);
			/* we have no other option but to be persistent */
			started = 0;
			goto again;
		}
	} else {
		lck_mtx_unlock(&nfsiod_mutex);
	}

	FSDBG_BOT(552, nmp, 0, 0, 0);
}
/*
 * queue up async I/O request for resend
 * Must be called with req->r_mtx locked.
 */
void
nfs_asyncio_resend(struct nfsreq *req)
{
	struct nfsmount *nmp = req->r_nmp;

	if (nfs_mount_gone(nmp)) {
		return;
	}

	nfs_gss_clnt_rpcdone(req);
	lck_mtx_lock(&nmp->nm_lock);
	if (!(req->r_flags & R_RESENDQ)) {
		TAILQ_INSERT_TAIL(&nmp->nm_resendq, req, r_rchain);
		req->r_flags |= R_RESENDQ;
		/*
		 * We take a reference on this request so that it can't be
		 * destroyed while a resend is queued or in progress.
		 */
		nfs_request_ref(req, 1);
	}
	nfs_mount_sock_thread_wake(nmp);
	lck_mtx_unlock(&nmp->nm_lock);
}
/*
 * Read directory data into a buffer.
 *
 * Buffer will be filled (unless EOF is hit).
 * Buffers after this one may also be completely/partially filled.
 */
int
nfs_buf_readdir(struct nfsbuf *bp, vfs_context_t ctx)
{
	nfsnode_t np = bp->nb_np;
	struct nfsmount *nmp = NFSTONMP(np);
	int error = 0;

	if (nfs_mount_gone(nmp)) {
		return ENXIO;
	}

	if (nmp->nm_vers < NFS_VER4) {
		error = nfs3_readdir_rpc(np, bp, ctx);
	} else {
		error = nfs4_readdir_rpc(np, bp, ctx);
	}

	if (error && (error != NFSERR_DIRBUFDROPPED)) {
		SET(bp->nb_flags, NB_ERROR);
		bp->nb_error = error;
	}
	return error;
}

#endif /* CONFIG_NFS_CLIENT */